Bayesian Statistical Methods

Partial solutions

Chapter 3: Computational approaches

Jump to problem: 1, 3, 5, 7, 9, 11, 13, 15, 17

(1) MAP: Fast but doesn't quantify uncertainty

Numerical integration: Accurate but fails in high dimensions

CLT: Fast but requires a large sample size

Gibbs: Accurate but requires conjugate priors

M-H: Flexible but requires tuning

(3a) The conjugate prior is \(\lambda\sim\mbox{Gamma}(a,b)\) and the posterior is \[\begin{align} p(\lambda|Y) &\propto \left[\prod_{i=1}^nf(Y_i|\lambda)\right]\pi(\lambda)\\ &\propto \left[\prod_{i=1}^n\lambda^{Y_i}\exp(-N_i\lambda)\right]\cdot[\lambda^{a-1}\exp(-\lambda b)]\\ &\propto \lambda^{(\sum_{i=1}^nY_i + a)-1}\exp\left[-\lambda\left(\sum_{i=1}^nN_i + b\right)\right] \end{align}\] and so \(\lambda|Y\sim\mbox{Gamma}(\sum_{i=1}^nY_i + a,\sum_{i=1}^nN_i + b)\).
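
A quick numerical sanity check of this conjugacy result is to compare the closed-form Gamma posterior to a grid approximation. The sketch below uses the data from part (c) and assumes \(a=b=0.1\) for the prior (these values are illustrative, not specified in the problem).

  # Sketch: closed-form Gamma posterior vs. a grid approximation
  Y3 <- c(12,25); N3 <- c(50,100)    # data from part (c)
  a  <- 0.1;      b  <- 0.1          # assumed prior parameters (illustrative only)
  lambda <- seq(0.01,0.6,0.001)
  grid   <- dgamma(lambda,a,b)*dpois(Y3[1],N3[1]*lambda)*dpois(Y3[2],N3[2]*lambda)
  grid   <- grid/sum(grid)
  exact  <- dgamma(lambda,sum(Y3)+a,sum(N3)+b)
  exact  <- exact/sum(exact)
  max(abs(grid-exact))               # should be numerically negligible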

(3b) For \(\lambda\in(0,20)\) the posterior is \(p(\lambda|Y)\propto \prod_{i=1}^n f(Y_i|\lambda)\). Therefore the log posterior is a constant plus \[l(\lambda) = \sum_{i=1}^n[-N_i\lambda + Y_i\log(\lambda)].\] Taking the derivative and setting to zero gives \[ l'(\lambda)= -\sum_{i=1}^nN_i + [\sum_{i=1}^nY_i]/\lambda=0\] and so the MAP estimate is \({\hat \lambda} = [\sum_{i=1}^nY_i]/[\sum_{i=1}^nN_i]\) (unless this is greater than 20, in which case the MAP is 20).
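
As a check on this closed-form MAP, the log posterior can also be maximized numerically, e.g., with optimize(); the sketch below again uses the data from part (c).

  # Sketch: numerical maximization of the log posterior over (0,20)
  logpost <- function(lambda) sum(-c(50,100)*lambda + c(12,25)*log(lambda))
  optimize(logpost, interval=c(0,20), maximum=TRUE)$maximum   # numerical MAP
  (12+25)/(50+100)                                            # closed-form MAP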

(3c)

  lambda <- seq(0.1,0.5,0.001)
  post   <- dpois(12,50*lambda)*dpois(25,100*lambda)   # unnormalized posterior (flat prior on (0,20))
  MAP    <- (12+25)/(50+100)
  MAP
## [1] 0.2466667
  plot(lambda,post,type="l",ylab="Posterior")
  abline(v=MAP,col=2)

[Figure (p3c): posterior of lambda with the MAP estimate marked]

(3d) Taking the second derivative gives \[ \frac{d^2l(\lambda)}{d\lambda^2} = - [\sum_{i=1}^nY_i]/\lambda^2\] and so the approximate variance is \({\hat \lambda}^2/[\sum_{i=1}^nY_i]= [\sum_{i=1}^nY_i]/[\sum_{i=1}^nN_i]^2\).

  lambda <- seq(0.1,0.5,0.001)
  MAP    <- (12+25)/(50+100)
  VAR    <- MAP*MAP/(12+25)
  clt    <- dnorm(lambda,MAP,sqrt(VAR))
  plot(lambda,post/sum(post),type="l")
  lines(lambda,clt/sum(clt),col=2)
  legend("topright",c("Exact","CLT"),lty=1,col=1:2,bty="n")

[Figure (p3d): exact posterior and the CLT (normal) approximation]

(5a) Perhaps \(Y_i\) is the number of crimes in year \(i\), and a new police policy is put in place after year \(n\); the objective is then to test whether the new policy affects the mean number of crimes.

(5b) Let \(\tau=1/\sigma^2\). \[\mu|\mbox{rest} \sim \mbox{Normal}\left(\frac{\tau\sum_{i=1}^nY_i + \tau\sum_{i=n+1}^{n+m}(Y_i-\delta)}{\tau(n+m)+1/100^2},\frac{1}{\tau (n+m)+1/100^2}\right).\] Similarly, \[\delta|\mbox{rest} \sim \mbox{Normal}\left(\frac{\tau\sum_{i=n+1}^{n+m}(Y_i-\mu)}{\tau m+1/100^2},\frac{1}{\tau m+1/100^2}\right).\] Finally, \[\sigma^2|\mbox{rest} \sim \mbox{InvGamma}\left(a+(n+m)/2,b+\sum_{i=1}^{n}(Y_i-\mu)^2/2+\sum_{i=n+1}^{n+m}(Y_i-\mu-\delta)^2/2\right).\]

(5c)

  # Generate data
  n          <- 50
  m          <- 50
  mu_true    <- 10
  delta_true <- 1
  sigma_true <- 2
  set.seed(919)
  Y1         <- rnorm(n,mu_true,sigma_true)
  Y2         <- rnorm(m,mu_true+delta_true,sigma_true)
  Y          <- c(Y1,Y2)

  # Prep for Gibbs sampling
  S       <- 10000   # number of iterations
  pri_var <- 100^2   # priors
  eps     <- 0.01        
  keep    <- matrix(0,S,3)
  mu      <- mean(Y) # initial values
  delta   <- 0
  sigma2  <- var(Y)

  # Go!
  for(iter in 1:S){
    # mu 
    prec  <- (n+m)/sigma2 + 1/pri_var     
    mn    <- sum(Y[1:n])/sigma2 + sum(Y[1:m+n]-delta)/sigma2   # note: Y[1:m+n] = Y[(n+1):(n+m)], the second sample
    mu    <- rnorm(1,mn/prec,1/sqrt(prec))

    # delta
    prec  <- m/sigma2 + 1/pri_var    
    mn    <- sum(Y[1:m+n]-mu)/sigma2
    delta <- rnorm(1,mn/prec,1/sqrt(prec))

    # sigma2 
    SS     <- sum((Y[1:n]-mu)^2) + 
              sum((Y[1:m+n]-mu-delta)^2)
    sigma2 <- 1/rgamma(1,eps+(n+m)/2,SS/2+eps)

    keep[iter,] <- c(mu,delta,sigma2)
  }

  plot(keep[,1],type="l",ylab="mu")
  abline(mu_true,0,lwd=2,col=2)

[Figure (p5c): trace plot of mu with the true value marked]

  plot(keep[,2],type="l",ylab="delta")
  abline(delta_true,0,lwd=2,col=2)

[Figure (p5c): trace plot of delta with the true value marked]

  plot(keep[,3],type="l",ylab="sigma^2")
  abline(sigma_true^2,0,lwd=2,col=2)

[Figure (p5c): trace plot of sigma^2 with the true value marked]

(5d)

  library(rjags)
  data <- list(n=n,m=m,Y1=Y[1:n],Y2=Y[n+1:m])

  model_string <- textConnection("model{

   # Likelihood
   for(i in 1:n){
     Y1[i] ~ dnorm(mu,tau)
   }
   for(i in 1:m){
     Y2[i] ~ dnorm(mu+delta,tau)
   }

   # Priors
   mu    ~  dnorm(0, 0.0001)
   delta ~  dnorm(0, 0.0001)
   tau   ~  dgamma(0.1, 0.1)
   sigma <- 1/sqrt(tau)
 }")

 inits <- list(mu=mean(Y),delta=0,tau=1/var(Y))
 model <- jags.model(model_string,data = data, inits=inits, n.chains=2,quiet=TRUE)
 update(model, 10000, progress.bar="none")
 params  <- c("mu","delta","sigma")
 samples <- coda.samples(model, 
            variable.names=params, 
            n.iter=10000, progress.bar="none")
 plot(samples)

[Figure (p5d): trace plots and posterior densities for mu, delta, and sigma]

Convergence looks great and the posterior approximations in parts c and d are very similar.

(7a)

 library(MASS)
 Y<-galaxies
 n<-length(Y)
 hist(Y,breaks=25)

[Figure (p7a): histogram of the galaxy data]

 mean(Y);var(Y)
## [1] 20828.17
## [1] 20827887

One reasonable choice of initial values is the approximately Gaussian model with \(k=30\) and the mean and variance set to the sample mean and variance (above).

(7b)

  library(rjags)
  data <- list(n=n,Y=Y)

  model_string <- textConnection("model{

   # Likelihood
   for(i in 1:n){
     Y[i] ~ dt(mu,tau,k)
   }

   # Priors
   mu    ~  dnorm(0, 0.0000000001)
   tau   ~  dgamma(0.01, 0.01)
   k     ~  dunif(1, 30)
   sigma <- 1/sqrt(tau)
 }")

 inits <- list(mu=mean(Y),tau=1/var(Y),k=30)
 model <- jags.model(model_string,data = data, inits=inits, n.chains=2,quiet=TRUE)
 update(model, 10000, progress.bar="none")
 params  <- c("mu","sigma","k")
 samples <- coda.samples(model, 
            variable.names=params, 
            n.iter=10000, progress.bar="none")
 plot(samples)

[Figure (p7b): trace plots and posterior densities for mu, sigma, and k]

 summary(samples)
## 
## Iterations = 11001:21000
## Thinning interval = 1 
## Number of chains = 2 
## Sample size per chain = 10000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##            Mean      SD Naive SE Time-series SE
## k         2.158   0.682 0.004823         0.0111
## mu    21140.856 323.627 2.288385         2.9208
## sigma  2262.255 359.784 2.544059         4.9864
## 
## 2. Quantiles for each variable:
## 
##            2.5%       25%       50%       75%     97.5%
## k         1.247     1.701     2.034     2.462     3.792
## mu    20515.753 20925.426 21133.193 21357.048 21788.130
## sigma  1645.335  2008.107  2230.688  2480.372  3053.795
 mn <- summary(samples)$statistics[,1]
 mn 
##            k           mu        sigma 
##     2.158073 21140.856281  2262.254839

Convergence looks good.

(7c) The code below compares the fitted t PDF with the kernel density estimate (an alternative to the histogram) of the data.

  kde <- density(Y)
  pdf <- dt((kde$x-mn[2])/mn[3],df=mn[1])
  pdf <- sum(kde$y)*pdf/sum(pdf)  # This makes both plots on the same scale

  plot(kde,xlab="y",ylab="Density",main="")
  lines(kde$x,pdf,col=2)
  legend("topright",c("Data","Fitted t"),lty=1,col=1:2,bty="n")

[Figure (p7c): kernel density estimate of the data with the fitted t density]

The t density misses the humps on the left and right and isn't a great fit.

(9)

  library(rjags)
  data <- list(N1=2820,Y1=563,N2=27,Y2=10)

  model_string <- textConnection("model{

   # Likelihood
    Y1 ~ dpois(N1*lambda1)
    Y2 ~ dpois(N2*lambda2)

   # Priors
    lambda1 ~  dunif(0, 10)
    lambda2 ~  dunif(0, 10)
    r       <- lambda2/lambda1
 }")

 inits <- list(lambda1=0.1,lambda2=0.2)
 model <- jags.model(model_string,data = data, inits=inits, n.chains=2,quiet=TRUE)
 update(model, 10000, progress.bar="none")
 params  <- c("lambda1","lambda2","r")
 samples <- coda.samples(model, 
            variable.names=params, 
            n.iter=10000, progress.bar="none")
 plot(samples)

[Figure (p9): trace plots and posterior densities for lambda1, lambda2, and r]

 summary(samples)
## 
## Iterations = 11001:21000
## Thinning interval = 1 
## Number of chains = 2 
## Sample size per chain = 10000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##           Mean       SD  Naive SE Time-series SE
## lambda1 0.2000 0.008387 5.931e-05      7.531e-05
## lambda2 0.4079 0.122202 8.641e-04      1.148e-03
## r       2.0428 0.617442 4.366e-03      5.730e-03
## 
## 2. Quantiles for each variable:
## 
##           2.5%    25%    50%    75%  97.5%
## lambda1 0.1839 0.1943 0.1998 0.2057 0.2166
## lambda2 0.2044 0.3200 0.3953 0.4816 0.6819
## r       1.0171 1.6016 1.9809 2.4141 3.4344
 effectiveSize(samples)
##  lambda1  lambda2        r 
## 12402.53 11332.00 11627.71
 gelman.diag(samples)
## Potential scale reduction factors:
## 
##         Point est. Upper C.I.
## lambda1          1          1
## lambda2          1          1
## r                1          1
## 
## Multivariate psrf
## 
## 1

The trace plots look great, the effective sample sizes are all large (over 10,000), and the Gelman-Rubin statistics are 1.0. Therefore, the chains have clearly converged.

(11) The code below uses Gaussian candidate distributions with standard deviations 0.2 and 0.05 (selected by trial and error to give an acceptance probability of around 0.4 for each parameter).

  Y      <- c(64,13,33,18,30,20)
  t      <- 1:6

  S      <- 25000
  beta   <- c(2,0)
  cansd  <- c(0.2,0.05)
  keep   <- matrix(0,S,2)

  post   <- function(Y,t,beta,pri.sd=10){   # unnormalized posterior, computed on the raw (not log) scale
      mn <- exp(beta[1] + t*beta[2])
      l  <- prod(dpois(Y,mn))
      p  <- prod(dnorm(beta,0,pri.sd))
  return(l*p)}

  for(iter in 1:S){
     for(j in 1:2){
       can    <- beta
       can[j] <- rnorm(1,beta[j],cansd[j])
       R      <- post(Y,t,can)/post(Y,t,beta)
       if(runif(1)<R){
          beta <- can
       }
     }
     keep[iter,] <- beta
  }

  plot(keep[,1],type="l",ylab=expression(beta[1]),xlab="Iteration")

[Figure (p11): trace plot of beta[1]]

  plot(keep[,2],type="l",ylab=expression(beta[2]),xlab="Iteration")

[Figure (p11): trace plot of beta[2]]

  acc_rate <- colMeans(keep[-1,]!=keep[-S,])   # proportion of iterations where each parameter moved
  acc_rate
## [1] 0.4033761 0.4568983

The trace plots look great and the acceptance rates are around 0.4, which is acceptable.
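
The candidate standard deviations can also be tuned automatically rather than by hand. The sketch below (an illustration, not part of the original solution) rescales cansd every 100 attempts during a burn-in run to push each acceptance rate toward roughly 0.4; draws generated while adapting would then be discarded.

  # Sketch: crude adaptive tuning of the candidate sd during burn-in
  # (reuses Y, t, and post() from above; adapted draws should be discarded)
  cansd <- c(1,1)                 # deliberately poor starting values
  acc   <- att <- c(0,0)
  beta  <- c(2,0)
  for(iter in 1:5000){
    for(j in 1:2){
      att[j] <- att[j] + 1
      can    <- beta
      can[j] <- rnorm(1,beta[j],cansd[j])
      R      <- post(Y,t,can)/post(Y,t,beta)
      if(runif(1)<R){beta <- can; acc[j] <- acc[j] + 1}
      if(att[j]==100){            # every 100 attempts, nudge cansd toward a 0.4 rate
        if(acc[j]/att[j] < 0.3){cansd[j] <- 0.8*cansd[j]}
        if(acc[j]/att[j] > 0.5){cansd[j] <- 1.2*cansd[j]}
        acc[j] <- att[j] <- 0
      }
    }
  }
  cansd                           # should end up near the hand-tuned values above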

(13a) The joint distribution of \(B\) and \(Y\) is \(f(Y,B)=g(Y|B)h(B)\), where \(g(Y|B)=\phi(Y-B\theta)\) is the normal PDF and \(h\) is the Bernoulli(1/2) PMF. The marginal distribution of \(Y\) is then \[f(Y) = \sum_{b=0}^1g(Y|B=b)h(b) = \phi(Y)\frac{1}{2} + \phi(Y-\theta)\frac{1}{2}\] as desired.
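
This marginal can be checked by simulation. The sketch below (with \(\theta=4\) assumed, as in part (b)) overlays the mixture density on a histogram of a large simulated sample.

  # Sketch: Monte Carlo check of the mixture marginal (theta = 4 assumed)
  theta <- 4
  B     <- rbinom(100000,1,0.5)
  Ysim  <- rnorm(100000,B*theta,1)
  hist(Ysim,breaks=50,freq=FALSE,main="",xlab="y")
  y     <- seq(-4,8,0.1)
  lines(y,0.5*dnorm(y,0,1)+0.5*dnorm(y,theta,1),col=2,lwd=2)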

(13b)

 set.seed(27695)
 theta_true <- 4
 n          <- 30
 B          <- rbinom(n,1,0.5)
 Y          <- rnorm(n,B*theta_true,1)
 hist(Y,breaks=25)

[Figure (p13b): histogram of the simulated data]

 y          <- seq(-3,10,0.1)
 plot(y,0.5*dnorm(y,0,1) + 0.5*dnorm(y,2,1),type="l",ylab="PDF")
 lines(y,0.5*dnorm(y,0,1) + 0.5*dnorm(y,4,1),col=2)
 lines(y,0.5*dnorm(y,0,1) + 0.5*dnorm(y,6,1),col=3)
 legend("topright",c("theta=2","theta=4","theta=6"),
                  lty=1,col=1:3,bty="n")

[Figure (p13b): mixture PDFs for theta = 2, 4, and 6]

(13c)

 library(stats4)
 nlp <- function(theta,Y){
    like <- 0.5*dnorm(Y,0,1)+
            0.5*dnorm(Y,theta,1)
    prior <- dnorm(theta,0,10)
    neg_log_post <- -sum(log(like))-log(prior)  
 return(neg_log_post)}

 map_est <- mle(nlp,start=list(theta=1),
                    fixed=list(Y=Y))
 sd      <- sqrt(vcov(map_est))
 map_est; sd
## 
## Call:
## mle(minuslogl = nlp, start = list(theta = 1), fixed = list(Y = Y))
## 
## Coefficients:
##       theta          Y1          Y2          Y3          Y4          Y5 
##  4.17919603 -1.77844991  1.60305490  0.63529137  4.67157916 -0.43544347 
##          Y6          Y7          Y8          Y9         Y10         Y11 
## -0.12584690 -0.38585384  4.66977750  1.12469067  1.79842288  3.85616261 
##         Y12         Y13         Y14         Y15         Y16         Y17 
##  3.12904919 -1.38070620  4.07364670  3.35284203  0.52304572 -0.75516014 
##         Y18         Y19         Y20         Y21         Y22         Y23 
## -0.52569447  0.76633658  5.04733582  0.11272642  4.78594654  1.69632947 
##         Y24         Y25         Y26         Y27         Y28         Y29 
##  4.05891960  0.58024782  5.23736037  4.44583893 -0.09848674 -0.36530670 
##         Y30 
## -0.80020385
##           theta
## theta 0.3389466
 map     <- 4.17919603

(13d)

 posterior <- function(theta,Y,k){
    post <- dnorm(theta,0,sqrt(10^k))
    for(i in 1:length(Y)){
       post<-post*(0.5*dnorm(Y[i],0,1)+
                   0.5*dnorm(Y[i],theta,1))
    }
 return(post/sum(post))}

 theta <- seq(2,6,0.01)
 map   <- dnorm(theta,map,sd)   # Gaussian (MAP/CLT) approximation centered at the MAP
 plot(theta,map/sum(map),type="l",ylab="Posterior")
 lines(theta,posterior(theta,Y,0),col=2)
 lines(theta,posterior(theta,Y,1),col=3)
 lines(theta,posterior(theta,Y,2),col=4)
 lines(theta,posterior(theta,Y,3),col=5)
 legend("topright",c("MAP","k=0","k=1","k=2","k=3"),
        col=1:5,lty=1,bty="n")

[Figure (p13d): Gaussian (MAP) approximation and grid posteriors for k = 0, 1, 2, 3]

Even though an improper prior would be problematic in this case, the normal prior with large variance performs similarly to the Gaussian approximation.

(13e)

  library(rjags)
  data <- list(n=n,Y=Y)

  model_string <- textConnection("model{

   # Likelihood
   for(i in 1:n){
     Y[i] ~ dnorm(B[i]*theta,1)
     B[i] ~ dbern(0.5)
   }

   # Priors
   theta ~  dnorm(0, 0.01)
 }")

 inits <- list(theta=1)
 model <- jags.model(model_string,data = data, inits=inits, n.chains=2,quiet=TRUE)
 update(model, 10000, progress.bar="none")
 params  <- c("theta")
 samples <- coda.samples(model, 
            variable.names=params, 
            n.iter=10000, progress.bar="none")
 plot(samples) 

[Figure (p13e): trace plot and posterior density of theta]

Convergence looks good and the results are similar to part d.

(15a)

  library(rjags)
 mass <- c(29.9, 1761, 1807, 2984, 3230, 5040, 5654)
 age  <- c(2, 15, 14, 16, 18, 22, 28)
 n    <- length(age)
 data <- list(mass=mass,age=age,n=n)  

 model_string <- textConnection("model{

   # Likelihood
    for(i in 1:n){
      mass[i]  ~ dnorm(mu[i],tau)
      mu[i]   <- theta[1] + theta[2]*pow(age[i],theta[3])
    }

   # Priors
    theta[1] ~ dnorm(0, 0.00001)
    theta[2] ~ dunif(0, 10000)
    theta[3] ~ dnorm(0,1)
    tau      ~ dgamma(0.01,0.01)
 }")

 model <- jags.model(model_string,data = data, n.chains=2,quiet=TRUE)
 update(model, 10000, progress.bar="none")
 params  <- c("mu","theta")
 samples <- coda.samples(model, 
            variable.names=params, 
            n.iter=10000, progress.bar="none")

 q <- summary(samples)$quantiles

 plot(age,mass,pch=19)
 lines(age,q[1:n,3])        # Posterior medians
 lines(age,q[1:n,1],lty=2)  # Posterior 95% interval
 lines(age,q[1:n,5],lty=2)  
 legend("topleft",c("Data","Median","95% interval"),
        lty=c(NA,1,2),pch=c(19,NA,NA),bty="n")

[Figure (p15a): data with the posterior median and 95% interval of the fitted curve]

(15b)

 plot(samples)

[Figure (p15b): trace plots and posterior densities for mu and theta]

 summary(samples)
## 
## Iterations = 11001:21000
## Thinning interval = 1 
## Number of chains = 2 
## Sample size per chain = 10000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##              Mean       SD Naive SE Time-series SE
## mu[1]     256.082 435.2737 3.077850       33.62855
## mu[2]    2639.986 371.5540 2.627284       30.46857
## mu[3]    2437.227 388.9419 2.750234       36.94359
## mu[4]    2845.571 357.2971 2.526472       24.12871
## mu[5]    3264.930 345.8699 2.445669       10.73452
## mu[6]    4134.656 430.8688 3.046702       13.72068
## mu[7]    5510.568 794.6578 5.619079       70.75311
## theta[1] -121.491 287.4275 2.032419        4.98096
## theta[2]  215.479 292.6183 2.069124       26.57504
## theta[3]    1.139   0.3305 0.002337        0.04447
## 
## 2. Quantiles for each variable:
## 
##               2.5%       25%      50%      75%   97.5%
## mu[1]    -363.8349  -23.5302  178.593  432.571 1386.68
## mu[2]    2041.2965 2397.3442 2590.581 2832.836 3521.57
## mu[3]    1829.0203 2176.9567 2377.629 2639.582 3372.65
## mu[4]    2257.2552 2618.8173 2806.125 3030.927 3669.95
## mu[5]    2622.9779 3069.7937 3246.819 3447.838 4012.44
## mu[6]    3138.4372 3942.2632 4173.945 4379.307 4875.10
## mu[7]    3591.0847 5116.0648 5635.422 6032.957 6754.45
## theta[1] -686.4429 -316.9708 -121.804   73.388  437.62
## theta[2]   26.0520   59.4941  108.357  237.609 1083.00
## theta[3]    0.4038    0.9362    1.196    1.381    1.64
 effectiveSize(samples)
##      mu[1]      mu[2]      mu[3]      mu[4]      mu[5]      mu[6] 
##  172.45780  156.28386  115.72326  234.80936 1095.44094 1043.56345 
##      mu[7]   theta[1]   theta[2]   theta[3] 
##  125.61708 3515.13968  126.57898   55.19019
 gelman.diag(samples,multivariate=F)
## Potential scale reduction factors:
## 
##          Point est. Upper C.I.
## mu[1]          1.01       1.03
## mu[2]          1.01       1.05
## mu[3]          1.01       1.06
## mu[4]          1.01       1.04
## mu[5]          1.00       1.02
## mu[6]          1.00       1.00
## mu[7]          1.01       1.03
## theta[1]       1.00       1.00
## theta[2]       1.01       1.04
## theta[3]       1.02       1.07

Convergence is fine for \(\theta_1\), but poor for \(\theta_2\) and \(\theta_3\), which have effective sample sizes of only about 127 and 55, respectively.

(15c) One approach to improving convergence is simply to run the chains longer. This will likely work here because convergence isn't hopeless and the code is fast. A second approach is to simplify the model; the log-linear model \[\log(\mbox{mass}) = \beta_0 + \beta_1\log(\mbox{age}) + \epsilon\] is one possibility (sketched below). Finally, \(\theta_2\) and \(\theta_3\) have posterior correlation -0.88, and so updating them in a block might improve convergence.
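
A minimal sketch of the log-linear alternative in JAGS is below (an illustration only; the object names are new and the vague priors are assumed, not taken from the problem).

  # Sketch: the log-linear model as a possibly better-behaved alternative
  logmod <- textConnection("model{
    for(i in 1:n){
      logmass[i] ~ dnorm(beta[1] + beta[2]*logage[i], tau)
    }
    beta[1] ~ dnorm(0, 0.0001)
    beta[2] ~ dnorm(0, 0.0001)
    tau     ~ dgamma(0.01, 0.01)
  }")
  logdata  <- list(logmass=log(mass), logage=log(age), n=n)
  logfit   <- jags.model(logmod, data=logdata, n.chains=2, quiet=TRUE)
  update(logfit, 10000, progress.bar="none")
  logsamps <- coda.samples(logfit, variable.names="beta",
                           n.iter=10000, progress.bar="none")
  effectiveSize(logsamps)   # expect much larger effective sample sizes than above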

(17a) Convergence could be slow because there are more parameters than observations and so it is likely that not all parameters are identifiable.

(17b)

 library(rjags)
 mod <- "model{

   # Likelihood
    for(i in 1:n){
      Y[i]   ~ dnorm(mu[i],tau[i])
      mu[i]  ~ dnorm(0,theta[1])
      tau[i] ~ dgamma(theta[2],theta[3])
    }

   # Priors
    theta[1] ~ dgamma(eps, eps)
    theta[2] ~ dgamma(eps, eps)
    theta[3] ~ dgamma(eps, eps)
 }"

 params  <- c("theta")

 model_string <- textConnection(mod)
 data  <- list(Y=1:5,n=5,eps=0.1)  
 model <- jags.model(model_string,data = data, n.chains=2,quiet=TRUE)
 update(model, 10000, progress.bar="none")
 s <- coda.samples(model, 
       variable.names=params, 
       n.iter=20000, progress.bar="none")
 ESS1 <- effectiveSize(s)


 model_string <- textConnection(mod)
 data  <- list(Y=1:25,n=25,eps=0.1)  
 model <- jags.model(model_string,data = data, n.chains=2,quiet=TRUE)
 update(model, 10000, progress.bar="none")
 s <- coda.samples(model, 
       variable.names=params, 
       n.iter=20000, progress.bar="none")
 ESS2 <- effectiveSize(s)


 model_string <- textConnection(mod)
 data  <- list(Y=1:5,n=5,eps=10)  
 model <- jags.model(model_string,data = data, n.chains=2,quiet=TRUE)
 update(model, 10000, progress.bar="none")
 s <- coda.samples(model, 
       variable.names=params, 
       n.iter=20000, progress.bar="none")
 ESS3 <- effectiveSize(s)



 model_string <- textConnection(mod)
 data  <- list(Y=1:25,n=25,eps=10)  
 model <- jags.model(model_string,data = data, n.chains=2,quiet=TRUE)
 update(model, 10000, progress.bar="none")
 s <- coda.samples(model, 
       variable.names=params, 
       n.iter=20000, progress.bar="none")
 ESS4 <- effectiveSize(s)

 ESS1 # n = 5  and eps = 0.1
##  theta[1]  theta[2]  theta[3] 
## 1399.1240  268.1794  625.9386
 ESS2 # n = 25 and eps = 0.1
##   theta[1]   theta[2]   theta[3] 
## 30802.3032    57.2700   195.5023
 ESS3 # n = 5  and eps = 10
## theta[1] theta[2] theta[3] 
## 16901.53 12979.49 27441.12
 ESS4 # n = 25 and eps = 10
## theta[1] theta[2] theta[3] 
## 10533.82 18625.49 29523.03

In the first case, with a small sample size and an uninformative prior, convergence isn't great. In the second case the sample size increases, but so does the number of parameters, so convergence remains problematic. However, increasing the prior information improves convergence for the last two fits.