The Importance of the Bayes Factor
Model Comparison as a Single Hierarchical Model
The posterior probability of model \(m\): \[ p(m \mid D) = \frac{ p(D \mid m)\, p(m) }{ \sum_{m'}{ p(D \mid m')\, p(m') } } \]
Consequently, Bayesian model comparison can be highly sensitive to the choice of prior.
Conventionally, a BF above 3.0 is taken as “substantial evidence” for model 1, and a BF below 1/3 as substantial evidence for model 2 (Jeffreys, 1961; Kass & Raftery, 1995; Wetzels et al., 2011).
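For two models, the Bayes factor (BF) is the ratio of the marginal likelihoods, and it converts prior odds into posterior odds:
\[ \frac{ p(m=1 \mid D) }{ p(m=2 \mid D) } = \underbrace{ \frac{ p(D \mid m=1) }{ p(D \mid m=2) } }_{ \mathrm{BF} } \times \frac{ p(m=1) }{ p(m=2) } \]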
A Factory That Mints Coins
Q: A coin is flipped 9 times and comes up heads 6 times. How do we compute the posterior probability that the coin came from a __factory that mints head-biased coins__ versus from a __factory that mints tail-biased coins__?
To compute \(p(D \mid m)\), we can reuse the beta-Bernoulli marginal likelihood: \[ p(z, N) = \frac{ B(z+a,\; N-z+b) }{ B(a, b) } \]
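This follows directly from integrating the Bernoulli likelihood against the \(\mathrm{beta}(a, b)\) prior:
\[ p(z, N) = \int_0^1 \theta^{z} (1-\theta)^{N-z}\, \frac{ \theta^{a-1} (1-\theta)^{b-1} }{ B(a, b) }\, d\theta = \frac{ B(z+a,\; N-z+b) }{ B(a, b) } \]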
Computing this in R:
# the straightforward version (beta() can underflow for large counts)
pD <- function(z,N,a,b){ beta(z+a,N-z+b) / beta(a,b) }
# more robust: compute on the log scale with lbeta(), the log beta function
pD <- function(z,N,a,b){ exp(lbeta(z+a,N-z+b) - lbeta(a,b)) }
pD1 <- pD(z=6,N=9,a=3.5,b=8.5)  # model 1: tail-biased factory (omega=.25, kappa=12)
pD2 <- pD(z=6,N=9,a=8.5,b=3.5)  # model 2: head-biased factory (omega=.75, kappa=12)
pD1/pD2                         # Bayes factor for model 1 relative to model 2
## [1] 0.2135266
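A minimal follow-up sketch (not in the original script): with equal prior odds, \(p(m=1) = p(m=2) = .5\), the BF converts directly into posterior model probabilities.
BF <- pD1/pD2          # about 0.214, from above
BF / (1 + BF)          # p(m=1|D), about 0.18
1 / (1 + BF)           # p(m=2|D), about 0.82
These values agree, up to MC error, with the model-index run below.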
3.1.1 Using JAGS
source("Jags-Ydich-Xnom1subj-Mbernbeta_ver1.R")
##
## *********************************************************************
## Kruschke, J. K. (2015). Doing Bayesian Data Analysis, Second Edition:
## A Tutorial with R, JAGS, and Stan. Academic Press / Elsevier.
## *********************************************************************
## Loading required package: coda
## Linked to JAGS 3.4.0
## Loaded modules: basemod,bugs
modelString = "
model {
  for (i in 1:Ntotal) {
    y[i] ~ dbern(theta)
  }
  theta ~ dbeta( 0.75*(12-2)+1 , (1-0.75)*(12-2)+1 )
}
"
myData = c(rep(0,9-6), rep(1,6))
mcmcCoda = genMCMC(data=myData, numSavedSteps=10000)
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph Size: 21
##
## Initializing model
##
## Burning in the MCMC chain...
## Sampling final MCMC chain...
theta = as.matrix(mcmcCoda)[,"theta"]
meanTheta = mean(theta)
sdTheta = sd(theta)
# Fit a beta(aPost,bPost) density h(theta) to the posterior sample by moment matching;
# a good h(theta) keeps the ratio inside the mean numerically stable.
aPost = meanTheta * (meanTheta*(1-meanTheta)/sdTheta^2 - 1)
bPost = (1 - meanTheta) * (meanTheta*(1-meanTheta)/sdTheta^2 - 1)
# Estimate 1/p(D) as the posterior mean of h(theta) / ( p(D|theta)*p(theta) ):
oneOverPD = mean( dbeta(theta, aPost, bPost) /
                    ( theta^6 * (1-theta)^(9-6) *
                        dbeta(theta, 0.75*(12-2)+1, (1-0.75)*(12-2)+1) ) )
PD = 1/oneOverPD
show(PD)
## [1] 0.002337925
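As a sanity check (a sketch reusing the pD() helper defined earlier), the same marginal likelihood is available in closed form and should closely match the MCMC estimate:
pD( z=6 , N=9 , a=0.75*(12-2)+1 , b=(1-0.75)*(12-2)+1 )  # about 0.00234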
Next, the two factory hypotheses are combined into a single hierarchical model by adding a categorical model index m, so the chain samples m and theta jointly.
#------------------------------------------------------------------------------
# THE DATA.
N = 9
z = 6
y = c( rep(0,N-z) , rep(1,z) )
dataList = list(
  y = y ,
  N = N
)
#------------------------------------------------------------------------------
modelString = "
model {
  for ( i in 1:N ) {
    y[i] ~ dbern( theta )
  }
  theta ~ dbeta( omega[m]*(kappa-2)+1 , (1-omega[m])*(kappa-2)+1 )
  omega[1] <- .25
  omega[2] <- .75
  kappa <- 12
  m ~ dcat( mPriorProb[] )
  mPriorProb[1] <- .5
  mPriorProb[2] <- .5
}
"
writeLines( modelString , con="TEMPmodel.txt" )
# RUN THE CHAINS.
parameters = c("theta","m")
adaptSteps = 1000 # Number of steps to "tune" the samplers.
burnInSteps = 1000 # Number of steps to "burn-in" the samplers.
nChains = 4 # Number of chains to run.
numSavedSteps=50000 # Total number of steps in chains to save.
thinSteps=1 # Number of steps to "thin" (1=keep every step).
nPerChain = ceiling( ( numSavedSteps * thinSteps ) / nChains ) # Steps per chain.
# Create, initialize, and adapt the model:
jagsModel = jags.model( "TEMPmodel.txt" , data=dataList , # inits=initsList ,
n.chains=nChains , n.adapt=adaptSteps )
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph Size: 27
##
## Initializing model
# Burn-in:
cat( "Burning in the MCMC chain...\n" )
## Burning in the MCMC chain...
update( jagsModel , n.iter=burnInSteps )
# The saved MCMC chain:
cat( "Sampling final MCMC chain...\n" )
## Sampling final MCMC chain...
codaSamples = coda.samples( jagsModel , variable.names=parameters ,
n.iter=nPerChain , thin=thinSteps )
# resulting codaSamples object has these indices:
# codaSamples[[ chainIdx ]][ stepIdx , paramIdx ]
# NOTE: fileNameRoot (a filename prefix) is assumed to be defined earlier in the full script.
save( codaSamples , file=paste0(fileNameRoot,"Mcmc.Rdata") )
# Display diagnostics of chain:
parameterNames = varnames(codaSamples) # get all parameter names
for ( parName in parameterNames ) {
diagMCMC( codaSamples , parName=parName ,
saveName=fileNameRoot , saveType="eps" )
}
# Convert coda-object codaSamples to matrix object for easier handling.
mcmcMat = as.matrix( codaSamples , chains=TRUE )
m = mcmcMat[,"m"]
theta = mcmcMat[,"theta"]
# Compute the proportion of m at each index value:
pM1 = sum( m == 1 ) / length( m )
pM2 = 1 - pM1
# Extract theta values for each model index:
thetaM1 = theta[ m == 1 ]
thetaM2 = theta[ m == 2 ]
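A small sketch using pM1 and pM2 from above: since the prior model probabilities were set to .5/.5, the posterior odds of the model index are themselves an estimate of the Bayes factor.
( pM1 / pM2 ) / ( .5 / .5 )  # posterior odds divided by prior odds = BF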
# Plot histograms of sampled theta values for each model,
# with pM displayed.
openGraph(width=7,height=5)
par( mar=0.5+c(3,1,2,1) , mgp=c(2.0,0.7,0) )
layout( matrix(c(1,1,2,3),nrow=2,byrow=FALSE) , widths=c(1,2) )
plotPost( m , breaks=seq(0.9,2.1,0.2) , cenTend="mean" , xlab="m" , main="Model Index" )
## ESS mean median mode hdiMass hdiLow hdiHigh compVal
## m 14152.98 1.8264 2 2.001472 0.95 1 2 NA
## pGtCompVal ROPElow ROPEhigh pLtROPE pInROPE pGtROPE
## m NA NA NA NA NA NA
plotPost( thetaM1 ,
main=bquote( theta*" when m=1" * " ; p(m=1|D)" == .(signif(pM1,3)) ) ,
cex.main=1.75 , xlab=bquote(theta) , xlim=c(0,1) )
## ESS mean median mode hdiMass hdiLow hdiHigh
## theta 7908.974 0.4519298 0.4510206 0.4554968 0.95 0.2561034 0.6635806
## compVal pGtCompVal ROPElow ROPEhigh pLtROPE pInROPE pGtROPE
## theta NA NA NA NA NA NA NA
plotPost( thetaM2 ,
main=bquote( theta*" when m=2" * " ; p(m=2|D)" == .(signif(pM2,3)) ) ,
cex.main=1.75 , xlab=bquote(theta) , xlim=c(0,1) )
## ESS mean median mode hdiMass hdiLow hdiHigh
## theta 41320 0.6901628 0.6966432 0.7027959 0.95 0.4986758 0.8764728
## compVal pGtCompVal ROPElow ROPEhigh pLtROPE pInROPE pGtROPE
## theta NA NA NA NA NA NA NA
saveGraph( file=paste0(fileNameRoot,"Post") , type="eps" )
## quartz_off_screen
## 2
3.2.1 Reducing Autocorrelation with Pseudo-Priors
At each step of the chain, only one parameter explains the data: \({\theta}_{1}\) when \(m=1\), \({\theta}_{2}\) when \(m=2\). The unused parameter is constrained only by its prior and simply drifts around, untouched by the data.
The conventional MCMC formulation:
modelString = "
model {
  for (i in 1:N){
    y[i] ~ dbern(theta)
  }
  theta <- equals(m,1)*theta1 + equals(m,2)*theta2
  theta1 ~ dbeta(omega1*(kappa1-2)+1, (1-omega1)*(kappa1-2)+1)
  omega1 <- .25
  kappa1 <- 12
  theta2 ~ dbeta(omega2*(kappa2-2)+1, (1-omega2)*(kappa2-2)+1)
  omega2 <- .75
  kappa2 <- 12
  m ~ dcat(mPriorProb[])
  mPriorProb[1] <- .5
  mPriorProb[2] <- .5
}
"
The chain for the model index can become highly autocorrelated: it may linger at one model for many steps before switching to the other.
See “Jags-Ydich-Xnom1subj-MbernBetaModelCompPseudoPrior.R”.
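Before the pseudo-prior fix, a quick mixing check (a sketch; codaSamples as produced by the model-index run above):
m_chain <- as.matrix( codaSamples )[ , "m" ]
coda::effectiveSize( m_chain )  # ESS far below the number of saved steps signals autocorrelation
acf( m_chain )                  # slowly decaying lags show the chain lingering at one model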
# THE DATA.
N = 30
z = ceiling(.55*N)
y = c( rep(0,N-z) , rep(1,z) )
dataList = list(
  y = y ,
  N = N
)
modelString = "
model {
  for ( i in 1:N ) {
    y[i] ~ dbern( theta )
  }
  theta <- equals(m,1)*theta1 + equals(m,2)*theta2
  theta1 ~ dbeta( omega1[m]*(kappa1[m]-2)+1 , (1-omega1[m])*(kappa1[m]-2)+1 )
  omega1[1] <- .10 # true prior value
  omega1[2] <- .40 # pseudo prior value
  kappa1[1] <- 20 # true prior value
  kappa1[2] <- 50 # pseudo prior value
  theta2 ~ dbeta( omega2[m]*(kappa2[m]-2)+1 , (1-omega2[m])*(kappa2[m]-2)+1 )
  omega2[1] <- .70 # pseudo prior value
  omega2[2] <- .90 # true prior value
  kappa2[1] <- 50 # pseudo prior value
  kappa2[2] <- 20 # true prior value
  m ~ dcat( mPriorProb[] )
  mPriorProb[1] <- .5
  mPriorProb[2] <- .5
}
"
The pseudo-prior values are chosen to approximate each parameter's posterior (compare omega1[2] = .40 with theta1's posterior mode near .40 in the output below), so the currently unused parameter stays in a credible region and the model index switches more freely.
writeLines( modelString , con="TEMPmodel.txt" )
# RUN THE CHAINS.
parameters = c("m","theta1","theta2")
adaptSteps = 1000 # Number of steps to "tune" the samplers.
burnInSteps = 1000 # Number of steps to "burn-in" the samplers.
nChains = 4 # Number of chains to run.
numSavedSteps=10000 # Total number of steps in chains to save.
thinSteps=1 # Number of steps to "thin" (1=keep every step).
nPerChain = ceiling( ( numSavedSteps * thinSteps ) / nChains ) # Steps per chain.
# Create, initialize, and adapt the model:
jagsModel = jags.model( "TEMPmodel.txt" , data=dataList , # inits=initsList ,
n.chains=nChains , n.adapt=adaptSteps )
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph Size: 68
##
## Initializing model
# Burn-in:
cat( "Burning in the MCMC chain...\n" )
## Burning in the MCMC chain...
update( jagsModel , n.iter=burnInSteps )
# The saved MCMC chain:
cat( "Sampling final MCMC chain...\n" )
## Sampling final MCMC chain...
codaSamples = coda.samples( jagsModel , variable.names=parameters ,
n.iter=nPerChain , thin=thinSteps )
# resulting codaSamples object has these indices:
# codaSamples[[ chainIdx ]][ stepIdx , paramIdx ]
save( codaSamples , file=paste0(fileNameRoot,"Mcmc.Rdata") )
# EXAMINE THE RESULTS.
# Convert coda-object codaSamples to matrix object for easier handling.
mcmcMat = as.matrix( codaSamples , chains=TRUE )
m = mcmcMat[,"m"]
# Compute the proportion of m at each index value:
pM1 = sum( m == 1 ) / length( m )
pM2 = 1 - pM1
# Extract theta values for each model index:
theta1M1 = mcmcMat[,"theta1"][ m == 1 ] # true theta1
theta1M2 = mcmcMat[,"theta1"][ m == 2 ] # pseudo theta1
theta2M1 = mcmcMat[,"theta2"][ m == 1 ] # pseudo theta2
theta2M2 = mcmcMat[,"theta2"][ m == 2 ] # true theta2
# Plot histograms of sampled theta values for each model,
# with pM displayed.
openGraph(width=7,height=7)
par( mar=0.5+c(3,1,2,1) , mgp=c(2.0,0.7,0) )
layout( matrix(c(1,1,2,3,4,5),nrow=3,byrow=TRUE) )
plotPost( m , breaks=seq(0.95,2.05,0.1) , xlim=c(0.75,2.25) ,
cenTend="mean" , xlab="m" , cex.main=1.75 ,
main=bquote( "Model Index." *
" p(m=1|D) =" * .(signif(pM1,3)) *
", p(m=2|D) =" * .(signif(pM2,3)) ) )
## ESS mean median mode hdiMass hdiLow hdiHigh compVal pGtCompVal
## m 10000 1.9195 2 2.000622 0.95 1 2 NA NA
## ROPElow ROPEhigh pLtROPE pInROPE pGtROPE
## m NA NA NA NA NA
plotPost( theta1M1 ,
main=bquote( theta[1]*" when m=1 (using true prior)" ) ,
cex.main=1.75 , xlab=bquote(theta[1]) , xlim=c(0,1) )
## ESS mean median mode hdiMass hdiLow hdiHigh
## theta[1] 805 0.3977709 0.3964047 0.3884233 0.95 0.2662324 0.5169071
## compVal pGtCompVal ROPElow ROPEhigh pLtROPE pInROPE pGtROPE
## theta[1] NA NA NA NA NA NA NA
plotPost( theta2M1 ,
main=bquote( theta[2]*" when m=1; pseudo-prior" ) ,
cex.main=1.75 , xlab=bquote(theta[2]) , xlim=c(0,1) )
## ESS mean median mode hdiMass hdiLow
## theta[2] 708.2775 0.6900603 0.6920265 0.7013501 0.95 0.5528739
## hdiHigh compVal pGtCompVal ROPElow ROPEhigh pLtROPE pInROPE
## theta[2] 0.7953801 NA NA NA NA NA NA
## pGtROPE
## theta[2] NA
plotPost( theta1M2 ,
main=bquote( theta[1]*" when m=2; pseudo-prior" ) ,
cex.main=1.75 , xlab=bquote(theta[1]) , xlim=c(0,1) )
## ESS mean median mode hdiMass hdiLow
## theta[1] 6059.944 0.4041653 0.4018718 0.3921399 0.95 0.2768463
## hdiHigh compVal pGtCompVal ROPElow ROPEhigh pLtROPE pInROPE
## theta[1] 0.5427986 NA NA NA NA NA NA
## pGtROPE
## theta[1] NA
plotPost( theta2M2 ,
main=bquote( theta[2]*" when m=2 (using true prior)" ) ,
cex.main=1.75 , xlab=bquote(theta[2]) , xlim=c(0,1) )
## ESS mean median mode hdiMass hdiLow
## theta[2] 5976.625 0.6846584 0.6867314 0.6884611 0.95 0.5587032
## hdiHigh compVal pGtCompVal ROPElow ROPEhigh pLtROPE pInROPE
## theta[2] 0.8133595 NA NA NA NA NA NA
## pGtROPE
## theta[2] NA
saveGraph( file=paste0(fileNameRoot,"Post") , type="eps" )
## quartz_off_screen
## 2
When the two models use different likelihood (“noise”) functions that JAGS has no built-in distribution for, the “ones trick” lets an arbitrary likelihood be scored. Schematic pseudocode follows; pdf1, pdf2, parameters1, parameters2, dprior1, and dprior2 are placeholders, not real JAGS functions.
#data
# C <- 10000  # a constant large enough that every spy[i] stays below 1
# for (i in 1:N) {
#   ones[i] <- 1
# }
#model
modelString = "
model {
  for (i in 1:N) {
    spy1[i] <- pdf1( y[i] , parameters1 ) / C
    spy2[i] <- pdf2( y[i] , parameters2 ) / C
    spy[i] <- equals(m,1)*spy1[i] + equals(m,2)*spy2[i]
    ones[i] ~ dbern( spy[i] )
  }
  parameters1 ~ dprior1...
  parameters2 ~ dprior2...
  m ~ dcat( mPriorProb[] )
  mPriorProb[1] <- .5
  mPriorProb[2] <- .5
}
"
Rather than selecting a single winning model, predictions can also be averaged over the models:
\[ p(\hat{y} \mid D) = \sum_m p(\hat{y} \mid D, m)\, p(m \mid D) = \sum_m \int p_m(\hat{y} \mid \theta_m, m)\, p_m(\theta_m \mid D, m)\, p(m \mid D)\; d\theta_m \]
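A minimal sketch (reusing m and mcmcMat from the pseudo-prior run above): because the chain already samples the model index, averaging theta across all steps integrates over both \(\theta_m\) and \(m\).
# at each step, take the theta belonging to the currently sampled model
thetaMix <- ifelse( m == 1 , mcmcMat[,"theta1"] , mcmcMat[,"theta2"] )
mean( thetaMix )  # model-averaged p(next flip = heads | D)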
The Bayes factor can be extremely sensitive to the prior. Using the pD() helper, compare a “must-be-fair” prior, beta(500,500), against a uniform prior, beta(1,1), on the same data:
z=15; N=20; pD(z,N,a=500,b=500)/pD(z,N,a=1,b=1)
z=11; N=20; pD(z,N,a=500,b=500)/pD(z,N,a=1,b=1)
The winning model changes with this seemingly innocuous choice of prior. This differs from ordinary Bayesian inference on a continuous parameter, where the posterior is typically robust to uncertainty in the prior.
Alternative: build the prior from a small representative sample of the data and apply it identically to every model, as below.
z=65; N=100; pD(z,N,a=500,b=500)/pD(z,N,a=1,b=1)        # vs. a uniform "default" prior
z=65; N=100; pD(z,N,a=500,b=500)/pD(z,N,a=0.01,b=0.01)  # vs. a Haldane-style "default" prior
# Instead, update both priors with a small representative sample (10 flips, 6 heads)
# and evaluate only the remaining 90 flips; the BF is now much less prior-sensitive:
z=65-6; N=100-10; pD(z,N,a=500+6,b=500+10-6)/pD(z,N,a=1+6,b=1+10-6)
z=65-6; N=100-10; pD(z,N,a=500+6,b=500+10-6)/pD(z,N,a=0.01+6,b=0.01+10-6)
<End>