Multiple linear regression prediction

Let \(Y_i\) be the percent increase in GOP support from 2012 to 2016 in county \(i=1,…,n\). We model \[Y_i|\beta,\sigma^2\sim \mbox{Normal}(\alpha+X_{1i}\beta_1+…+X_{pi}\beta_p,\sigma^2)\] where \(X_{ji}\) is the \(j^{th}\) covariate for county \(i\). The covariates are centered and scaled. We select the priors \(\sigma^2\sim\mbox{InvGamma}(0.01,0.01)\) and \(\alpha\sim\mbox{Normal}(0,100)\) for the error variance and intercept, and compare different priors for the regression coefficients.
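
The comparison below rests on the distinction between the posterior predictive distribution (PPD) and a plug-in prediction. For a test county with covariate vector \(x_p\), the PPD averages the likelihood over the posterior of all parameters given the observed data \(\mathbf{Y}_o\), while the plug-in approach fixes the parameters at point estimates (here, posterior means \(\hat\alpha,\hat\beta,\hat\sigma\)):

\[p(Y_p\mid \mathbf{Y}_o)=\int p(Y_p\mid\alpha,\beta,\sigma^2)\,p(\alpha,\beta,\sigma^2\mid \mathbf{Y}_o)\,d\alpha\,d\beta\,d\sigma^2 \quad\mbox{versus}\quad Y_p\sim\mbox{Normal}(\hat\alpha+x_p^{\top}\hat\beta,\hat\sigma^2).\]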

Load and standardize the election data

# Load the data

 load("S:\\Documents\\www\\ABA\\code\\election_2008_2016.RData")

 junk <- is.na(Y+rowSums(X))   # flag counties with missing Y or X
 Y    <- Y[!junk]
 X    <- X[!junk,]
 n    <- length(Y)
 p    <- ncol(X)

 n
## [1] 3111
 p
## [1] 15
 X    <- scale(X)
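
If the election_2008_2016.RData file is not available, the following minimal stand-in generates hypothetical data with the same dimensions so that the remaining code can be run; the simulated Y and X have none of the structure of the real election data and are for illustration only.

 # Hypothetical stand-in data (only if the .RData file is unavailable)
 n <- 3111
 p <- 15
 X <- matrix(rnorm(n*p), n, p)
 Y <- as.vector(5 + X %*% rnorm(p, 0, 2) + rnorm(n, 0, 8))
 X <- scale(X)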

# Fit the model to a training set of size 100 and make predictions for the remaining observations

 set.seed(0820)
 test  <- order(runif(n))>100    # randomly select 100 counties for training
 table(test)
## test
## FALSE  TRUE 
##   100  3011
 Yo    <- Y[!test]    # Observed data
 Xo    <- X[!test,]

 Yp    <- Y[test]     # Counties set aside for prediction
 Xp    <- X[test,]

 no    <- length(Yo)
 np    <- length(Yp)
 p     <- ncol(Xo)

Fit the linear regression model with Gaussian priors

library(rjags)


model_string <- "model{

  # Likelihood
  for(i in 1:no){
    Yo[i]   ~ dnorm(muo[i],inv.var)
    muo[i] <- alpha + inprod(Xo[i,],beta[])
  }

  # Prediction
  for(i in 1:np){
    Yp[i]  ~ dnorm(mup[i],inv.var)
    mup[i] <- alpha + inprod(Xp[i,],beta[])
  }

  # Priors
  for(j in 1:p){
    beta[j] ~ dnorm(0,0.0001)
  }
  alpha     ~ dnorm(0, 0.01)
  inv.var   ~ dgamma(0.01, 0.01)
  sigma     <- 1/sqrt(inv.var)
}"

Compile the model in JAGS

# NOTE: Yp is not sent to JAGS! Because Yp is not supplied as data,
# each Yp[i] is an unobserved stochastic node and its samples are
# draws from the posterior predictive distribution.
model <- jags.model(textConnection(model_string), 
                    data = list(Yo=Yo,no=no,np=np,p=p,Xo=Xo,Xp=Xp))
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 100
##    Unobserved stochastic nodes: 3028
##    Total graph size: 59168
## 
## Initializing model
update(model, 10000, progress.bar="none")

samp <- coda.samples(model, 
        variable.names=c("beta","sigma","Yp","alpha"), 
        n.iter=20000, progress.bar="none")

summary(samp[,-c(1:np)])
## 
## Iterations = 10001:30000
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 20000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##              Mean     SD Naive SE Time-series SE
## alpha     6.66347 0.9036 0.006389       0.007707
## beta[1]  -1.14874 1.3233 0.009357       0.020275
## beta[2]   3.66577 1.3388 0.009467       0.025716
## beta[3]  -3.66106 1.2904 0.009125       0.017778
## beta[4]  -4.61727 1.2163 0.008601       0.020422
## beta[5]  -2.63137 2.0526 0.014514       0.048175
## beta[6]  -5.85205 1.9437 0.013744       0.044012
## beta[7]  -2.13849 1.3481 0.009532       0.023377
## beta[8]  -4.15797 1.9742 0.013959       0.051740
## beta[9]   7.36413 2.3361 0.016519       0.067424
## beta[10]  4.44952 1.9359 0.013689       0.044266
## beta[11] -0.04178 1.1907 0.008420       0.013372
## beta[12]  0.65231 1.2856 0.009090       0.014209
## beta[13]  0.94857 1.2799 0.009050       0.017670
## beta[14] -0.07319 1.2760 0.009023       0.011539
## beta[15]  0.73091 1.0178 0.007197       0.014483
## sigma     8.50855 0.6678 0.004722       0.006463
## 
## 2. Quantiles for each variable:
## 
##             2.5%      25%      50%     75%   97.5%
## alpha     4.8893  6.05413  6.65756  7.2695  8.4493
## beta[1]  -3.7855 -2.03236 -1.15498 -0.2498  1.4282
## beta[2]   1.0642  2.76689  3.65430  4.5495  6.3116
## beta[3]  -6.2272 -4.51449 -3.65273 -2.7943 -1.1287
## beta[4]  -7.0121 -5.42143 -4.61353 -3.8334 -2.2085
## beta[5]  -6.7311 -3.98181 -2.60851 -1.2648  1.3619
## beta[6]  -9.7423 -7.13720 -5.83365 -4.5388 -2.0844
## beta[7]  -4.7651 -3.02933 -2.14558 -1.2515  0.5311
## beta[8]  -8.0832 -5.46725 -4.15128 -2.8322 -0.3308
## beta[9]   2.9812  5.77380  7.31290  8.9017 12.0458
## beta[10]  0.5786  3.16749  4.46162  5.7237  8.2357
## beta[11] -2.4046 -0.83210 -0.03928  0.7605  2.2785
## beta[12] -1.8918 -0.20479  0.66262  1.5161  3.1585
## beta[13] -1.5888  0.09883  0.95035  1.8004  3.4629
## beta[14] -2.6058 -0.91635 -0.06833  0.7824  2.4343
## beta[15] -1.2462  0.04627  0.72937  1.4039  2.7500
## sigma     7.3276  8.04283  8.47000  8.9267  9.9413
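
The time-series standard errors above are noticeably larger than the naive standard errors, which indicates autocorrelation in the chain. An optional check, not part of the original output, is to compute effective sample sizes with coda (loaded along with rjags):

 effectiveSize(samp[,-c(1:np)])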

Plot samples from the posterior predictive distribution (PPD) and the plug-in distribution

#Extract the samples for each parameter

 samps       <- samp[[1]]
 Yp.samps    <- samps[,1:np]        # columns are stored as Yp[1..np],
 alpha.samps <- samps[,np+1]        # then alpha, beta[1..p], sigma
 beta.samps  <- samps[,np+1+1:p]
 sigma.samps <- samps[,ncol(samps)]

# Compute the posterior mean for the plug-in predictions  

 beta.mn  <- colMeans(beta.samps)
 sigma.mn <- mean(sigma.samps)
 alpha.mn <- mean(alpha.samps) 
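
Because the Gaussian priors on the coefficients are very vague, the posterior means should be close to the least-squares estimates based on the 100 training counties. The following optional check, not part of the original analysis, lines the two up:

 # Optional: compare posterior means to the least-squares fit
 ols <- lm(Yo ~ Xo)
 round(cbind(OLS = coef(ols), Bayes = c(alpha.mn, beta.mn)), 2)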


# Plot the PPD and plug-in

 for(j in 1:5){

    # Plug-in
    mu <- alpha.mn+sum(Xp[j,]*beta.mn)
    y  <- rnorm(20000,mu,sigma.mn)
    plot(density(y),col=2,xlab="Y",main="PPD")

    # PPD
    lines(density(Yp.samps[,j]))

    # Truth
    abline(v=Yp[j],col=3,lwd=2)

    legend("topright",c("PPD","Plug-in","Truth"),col=1:3,lty=1,inset=0.05)
 }

(Figures: PPD and plug-in predictive densities for the first five test counties, with the true value marked in green.)

 # plug-in 95% intervals
  low1   <- alpha.mn+Xp%*%beta.mn - 1.96*sigma.mn
  high1  <- alpha.mn+Xp%*%beta.mn + 1.96*sigma.mn
  cover1 <- mean(Yp>low1 & Yp<high1)
  mean(cover1)
## [1] 0.9452009
 # PPD 95% intervals
  low2   <- apply(Yp.samps,2,quantile,0.025)
  high2  <- apply(Yp.samps,2,quantile,0.975)
  cover2 <- mean(Yp>low2 & Yp<high2)
  mean(cover2)
## [1] 0.9634673
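
Coverage alone does not show how much wider the PPD intervals are. Comparing the average interval widths, an optional addition, makes the difference explicit:

 # Average interval widths: plug-in vs PPD
 mean(high1 - low1)   # constant width 2*1.96*sigma.mn for every county
 mean(high2 - low2)   # PPD widths vary by county and tend to be slightly larger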

Notice how the PPD densities are slightly wider than the plug-in densities. This is the effect of accounting for uncertainty in \(\alpha\), \(\beta\), and \(\sigma\), and it explains the slightly lower coverage of the plug-in intervals. For these data, however, the plug-in coverage is still close to the nominal 95% level.