# Multiple linear regression prediction

Let $$Y_i$$ be the percent increase in GOP support from 2012 to 2016 in county $$i=1,\dots,n$$. We model $$Y_i|\beta,\sigma^2\sim \mbox{Normal}(\alpha+X_{1i}\beta_1+\dots+X_{pi}\beta_p,\sigma^2)$$ where $$X_{ji}$$ is the $$j^{th}$$ covariate for county $$i$$. All variables are centered and scaled. We select priors $$\sigma^2\sim\mbox{InvGamma}(0.01,0.01)$$ and $$\alpha\sim\mbox{Normal}(0,100)$$ for the error variance and intercept, and compare different priors for the regression coefficients.

## Load and standardize the election data

# Load the data

# Drop counties with a missing response or a missing value in any covariate
# (rowSums(X) is NA whenever any entry of that row is NA)
junk <- is.na(Y+rowSums(X))
Y    <- Y[!junk]
X    <- X[!junk,]
n    <- length(Y)   # number of counties retained
p    <- ncol(X)     # number of covariates

n

## [1] 3111

 p

## [1] 15

# Center and scale each covariate column (mean 0, sd 1).
# NOTE(review): scaling happens before the train/test split below, so the
# held-out counties contribute to the centering/scaling constants.
 X    <- scale(X)

# Fit the model to a training set of size 100 and make prediction for the remaining observations

set.seed(0820)
# Random 100/3011 split: order(runif(n)) is a random permutation, so exactly
# 100 positions get FALSE (training) and the rest TRUE (held out)
test  <- order(runif(n))>100
table(test)

## test
## FALSE  TRUE
##   100  3011

 Yo    <- Y[!test]    # Observed data
Xo    <- X[!test,]

Yp    <- Y[test]     # Counties set aside for prediction
Xp    <- X[test,]

no    <- length(Yo)   # training size (100)
np    <- length(Yp)   # number of counties to predict (3011)
p     <- ncol(Xo)


## Fit the linear regression model with Gaussian priors

library(rjags)

# JAGS model: Normal likelihood with vague Normal priors on the intercept
# and coefficients and an InvGamma(0.01, 0.01) prior on the error variance
# (dnorm/dgamma are parameterized by precision inv.var = 1/sigma^2).
# The Yp[i] nodes are declared stochastic but never given data, so JAGS
# samples them -- these draws are the posterior predictive distribution.
model_string <- "model{

# Likelihood
for(i in 1:no){
Yo[i]   ~ dnorm(muo[i],inv.var)
muo[i] <- alpha + inprod(Xo[i,],beta[])
}

# Prediction
for(i in 1:np){
Yp[i]  ~ dnorm(mup[i],inv.var)
mup[i] <- alpha + inprod(Xp[i,],beta[])
}

# Priors
for(j in 1:p){
beta[j] ~ dnorm(0,0.0001)
}
alpha     ~ dnorm(0, 0.01)
inv.var   ~ dgamma(0.01, 0.01)
sigma     <- 1/sqrt(inv.var)
}"


## Compile the model in JAGS

# NOTE: Yp is not sent to JAGS!
# Because Yp is omitted from the data list, JAGS treats each Yp[i] as an
# unobserved stochastic node and samples it along with the parameters,
# giving draws from the posterior predictive distribution directly.
model <- jags.model(textConnection(model_string),
data = list(Yo=Yo,no=no,np=np,p=p,Xo=Xo,Xp=Xp))

## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 100
##    Unobserved stochastic nodes: 3028
##    Total graph size: 59168
##
## Initializing model

# Burn-in: discard the first 10000 iterations
update(model, 10000, progress.bar="none")

# Keep 20000 post-burn-in samples of the predictions and parameters
samp <- coda.samples(model,
variable.names=c("beta","sigma","Yp","alpha"),
n.iter=20000, progress.bar="none")

# Summarize the parameters only: the first np columns of the mcmc object
# are the Yp predictions, so drop them from the summary
summary(samp[,-c(1:np)])

##
## Iterations = 10001:30000
## Thinning interval = 1
## Number of chains = 1
## Sample size per chain = 20000
##
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
##
##              Mean     SD Naive SE Time-series SE
## alpha     6.66347 0.9036 0.006389       0.007707
## beta[1]  -1.14874 1.3233 0.009357       0.020275
## beta[2]   3.66577 1.3388 0.009467       0.025716
## beta[3]  -3.66106 1.2904 0.009125       0.017778
## beta[4]  -4.61727 1.2163 0.008601       0.020422
## beta[5]  -2.63137 2.0526 0.014514       0.048175
## beta[6]  -5.85205 1.9437 0.013744       0.044012
## beta[7]  -2.13849 1.3481 0.009532       0.023377
## beta[8]  -4.15797 1.9742 0.013959       0.051740
## beta[9]   7.36413 2.3361 0.016519       0.067424
## beta[10]  4.44952 1.9359 0.013689       0.044266
## beta[11] -0.04178 1.1907 0.008420       0.013372
## beta[12]  0.65231 1.2856 0.009090       0.014209
## beta[13]  0.94857 1.2799 0.009050       0.017670
## beta[14] -0.07319 1.2760 0.009023       0.011539
## beta[15]  0.73091 1.0178 0.007197       0.014483
## sigma     8.50855 0.6678 0.004722       0.006463
##
## 2. Quantiles for each variable:
##
##             2.5%      25%      50%     75%   97.5%
## alpha     4.8893  6.05413  6.65756  7.2695  8.4493
## beta[1]  -3.7855 -2.03236 -1.15498 -0.2498  1.4282
## beta[2]   1.0642  2.76689  3.65430  4.5495  6.3116
## beta[3]  -6.2272 -4.51449 -3.65273 -2.7943 -1.1287
## beta[4]  -7.0121 -5.42143 -4.61353 -3.8334 -2.2085
## beta[5]  -6.7311 -3.98181 -2.60851 -1.2648  1.3619
## beta[6]  -9.7423 -7.13720 -5.83365 -4.5388 -2.0844
## beta[7]  -4.7651 -3.02933 -2.14558 -1.2515  0.5311
## beta[8]  -8.0832 -5.46725 -4.15128 -2.8322 -0.3308
## beta[9]   2.9812  5.77380  7.31290  8.9017 12.0458
## beta[10]  0.5786  3.16749  4.46162  5.7237  8.2357
## beta[11] -2.4046 -0.83210 -0.03928  0.7605  2.2785
## beta[12] -1.8918 -0.20479  0.66262  1.5161  3.1585
## beta[13] -1.5888  0.09883  0.95035  1.8004  3.4629
## beta[14] -2.6058 -0.91635 -0.06833  0.7824  2.4343
## beta[15] -1.2462  0.04627  0.72937  1.4039  2.7500
## sigma     7.3276  8.04283  8.47000  8.9267  9.9413


## Plot samples from the posterior predictive distribution (PPD) and plug-in distribution

# Extract the posterior draws for each quantity from the single chain.
# Column layout of the mcmc matrix: Yp[1..np], alpha, beta[1..p], sigma.

samps       <- samp[[1]]
Yp.samps    <- samps[, seq_len(np)]            # posterior predictive draws
alpha.samps <- samps[, np + 1]                 # intercept
beta.samps  <- samps[, (np + 2):(np + 1 + p)]  # regression coefficients
sigma.samps <- samps[, ncol(samps)]            # error SD (last column)

# Posterior means, used below as the plug-in parameter estimates

alpha.mn <- mean(alpha.samps)
beta.mn  <- colMeans(beta.samps)
sigma.mn <- mean(sigma.samps)

# For the first five held-out counties, overlay the posterior predictive
# density (PPD) on the plug-in predictive density and mark the truth.

for (j in seq_len(5)) {

  # Plug-in: Normal(alpha.mn + x'beta.mn, sigma.mn), which ignores the
  # posterior uncertainty in alpha, beta, and sigma
  mu.hat <- alpha.mn + sum(Xp[j, ] * beta.mn)
  y.plug <- rnorm(20000, mu.hat, sigma.mn)
  plot(density(y.plug), col = 2, xlab = "Y", main = "PPD")

  # PPD: kernel density of the Yp draws sampled by JAGS
  lines(density(Yp.samps[, j]))

  # Truth: the observed value for this county
  abline(v = Yp[j], col = 3, lwd = 2)

  legend("topright", c("PPD", "Plug-in", "Truth"), col = 1:3, lty = 1,
         inset = 0.05)
}


 # Plug-in 95% intervals: posterior-mean fit +/- 1.96 * posterior-mean SD,
 # ignoring parameter uncertainty
low1   <- alpha.mn+Xp%*%beta.mn - 1.96*sigma.mn
high1  <- alpha.mn+Xp%*%beta.mn + 1.96*sigma.mn
cover1 <- mean(Yp>low1 & Yp<high1)   # empirical coverage (already a scalar)
mean(cover1)                         # mean() of a scalar; prints cover1 itself

## [1] 0.9452009

 # PPD 95% intervals: equal-tailed quantiles of the posterior predictive draws
low2   <- apply(Yp.samps,2,quantile,0.025)
high2  <- apply(Yp.samps,2,quantile,0.975)
cover2 <- mean(Yp>low2 & Yp<high2)   # PPD intervals widen with parameter uncertainty
mean(cover2)

## [1] 0.9634673


Notice how the PPD densities are slightly wider than the plug-in densities. This is the effect of accounting for uncertainty in $$\beta$$ and $$\sigma$$, and it explains the slightly lower coverage for the plug-in predictions. However, for these data coverage is still OK for the plug-in predictions.