Analysis of the 2016 US Presidential Election

Chapter 5.5: Model selection criteria

The data for this analysis come from Tony McGovern. The response variable, \(Y_i\), is the percent change in Republican (GOP) support from 2012 to 2016, \[Y_i = 100\left(\frac{\mbox{GOP \% in 2016}}{\mbox{GOP \% in 2012}}-1\right),\] for county \(i=1,\ldots,n\).
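For example (a minimal sketch; gop2012 and gop2016 are hypothetical vote-share vectors, not objects from the data loaded below):

 # Hypothetical GOP vote shares (%) for three counties
 gop2012   <- c(52.1, 48.3, 60.5)
 gop2016   <- c(58.4, 47.0, 66.2)
 Y_example <- 100*(gop2016/gop2012 - 1)
 round(Y_example, 1)  # 12.1 -2.7 9.4 (percent change in GOP support)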

The \(p=10\) covariates \(X_{ij}\) are county-level census variables obtained from Kaggle:

  1. Population change
  2. Percent aged 65+
  3. Percent African American
  4. Percent Hispanic
  5. Percent high-school graduates
  6. Percent with a bachelor's degree
  7. Homeownership rate
  8. Home value
  9. Median income
  10. Poverty rate

For a county in state \(s\), we assume the linear model \[Y_i=\beta_{0s}+\sum_{j=1}^p X_{ij}\beta_{js} + \varepsilon_i,\] where \(\beta_{js}\) is the effect of covariate \(j\) in state \(s\). We compare three models for the \(\beta_{js}\):

  1. Constant slopes: \(\beta_{js} \equiv \beta_j\) for all states \(s\)
  2. Varying slopes with uninformative priors: \(\beta_{js} \sim\mbox{Normal}(0,100)\)
  3. Varying slopes with informative priors: \(\beta_{js} \sim\mbox{Normal}(\mu_j,\sigma_j^2)\)

In the third model, the means \(\mu_j\) and variances \(\sigma_j^2\) are given priors and estimated from the data, so information is pooled across states via the prior. The three models are compared using DIC and WAIC, and the results of the preferred model are then explored.
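For reference, and matching the computations in the code below, the two criteria are \[\mbox{DIC}=\bar{D}+p_D \qquad\mbox{and}\qquad \mbox{WAIC}=-2\sum_{i=1}^n\log\bar{f}_i+2p_W,\] where \(\bar{D}\) is the posterior mean deviance, \(p_D\) is its effective-parameter penalty, \(\bar{f}_i\) is the posterior mean of the likelihood of observation \(i\), and \(p_W=\sum_{i=1}^n\mbox{Var}\left[\log f_i\right]\) is the sum of the posterior variances of the log-likelihood terms. Smaller values are better for both criteria.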

Load the data

 load("election_2008_2016.RData")
 X     <- scale(X)    # Standardize the covariates
 X     <- cbind(1,X)
 short <- c("Intercept", "Pop change", "65+","African American",
            "Hispanic","HS grad","Bachelor's",
            "Homeownership rate","Home value",
            "Median income", "Poverty")
 names <- c("Intercept", as.character(names[1:11,2]))
 colnames(X) <- short

 # Define a function to make county maps
 county_plot <- function(fips,Y,main="",units=""){
   library(choroplethr)
   temp <- data.frame(region=fips, value=Y)
   suppressWarnings(county_choropleth(temp,title=main,legend=units))
 }
 county_plot(fips,Y,main="Percent increase in GOP support",units="")

[Figure: county map of the percent increase in GOP support]

 county_plot(fips,X[,7],main=names[7],units="")

[Figure: county map of the bachelor's-degree covariate]

 # Remove AK, HI and DC due to missing data
 state <- as.character(all_dat[,3])
 AKHI  <- state=="AK" | state=="HI" | state=="DC"
 fips  <- fips[!AKHI]
 Y     <- Y[!AKHI]
 X     <- X[!AKHI,]
 state <- state[!AKHI]

 # Assign a numeric id to the counties in each state
 st    <- unique(state)
 id    <- rep(NA,length(Y))
 for(j in 1:length(st)){
   id[state==st[j]] <- j
 }

 n <- length(Y) # number of counties
 N <- 48        # number of states
 p <- ncol(X)   # number of covariates

 set.seed(0820)
 iters <- 50000
 burn  <- 10000
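Before fitting, a quick sanity check (a sketch, not part of the original analysis) confirms that every county received a state id:

 stopifnot(!any(is.na(id)))  # fails if any county's state was unmatched
 length(unique(id))          # should equal N = 48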

(1) Constant slopes

 model1_string <- "model{

   # Likelihood
   for(i in 1:n){
      Y[i]   ~ dnorm(mu[i],taue)
      mu[i] <- inprod(X[i,],beta[])
   }
   # Priors
   for(j in 1:p){beta[j] ~ dnorm(0,0.01)}  # vague: mean 0, precision 0.01 (variance 100)
   taue ~ dgamma(0.1,0.1)                  # error precision
   sig <- 1/sqrt(taue)                     # error standard deviation

   # WAIC calculations
   for(i in 1:n){
     like[i]    <- dnorm(Y[i],mu[i],taue)
   }
 }"

 library(rjags)

 # Load the model
 dat    <- list(Y=Y,n=n,X=X,p=p)
 init   <- list(beta=rep(0,p))
 model1 <- jags.model(textConnection(model1_string),n.chains=2,
                      inits=init,data = dat,quiet=TRUE)
 # Generate samples
 update(model1, burn, progress.bar="none")
 samp1   <- coda.samples(model1, 
            variable.names=c("beta"), 
            n.iter=iters, progress.bar="none")

 # Compile results
 ESS1    <- effectiveSize(samp1)
 out1    <- summary(samp1)$quantiles
 rownames(out1)<-short

 # Compute DIC
 dic1    <- dic.samples(model1,n.iter=iters,progress.bar="none")

 # Compute WAIC
 waic1   <- coda.samples(model1, 
            variable.names=c("like"), 
            n.iter=iters, progress.bar="none")
 like1   <- as.matrix(waic1)  # combine draws from both chains
 fbar1   <- colMeans(like1)
 P1      <- sum(apply(log(like1),2,var))
 WAIC1   <- -2*sum(log(fbar1))+2*P1

(2) Slopes as fixed effects

 model2_string <- "model{

   # Likelihood
   for(i in 1:n){
      Y[i]    ~ dnorm(mnY[i],taue)
      mnY[i] <- inprod(X[i,],beta[id[i],])
   }

   # Slopes: independent vague priors for each state (no pooling)
   for(j in 1:p){for(i in 1:N){
       beta[i,j] ~ dnorm(0,0.01)
   }}

   # Priors
   taue ~ dgamma(0.1,0.1)


   # WAIC calculations
   for(i in 1:n){
     like[i]    <- dnorm(Y[i],mnY[i],taue)
   }
  }"


  library(rjags)

  # Load the model
  dat    <- list(Y=Y,n=n,N=N,X=X,p=p,id=id)
  init   <- list(beta=matrix(0,N,p))
  model2 <- jags.model(textConnection(model2_string),n.chains=2,
                       inits=init,data = dat,quiet=TRUE)

  # Generate samples
  update(model2, burn, progress.bar="none")
  samp2  <- coda.samples(model2, 
            variable.names=c("beta"), 
            n.iter=iters, progress.bar="none")

  # Compile results  
  ESS2     <- effectiveSize(samp2)
  stats2   <- summary(samp2)$statistics
  post_mn2 <- matrix(stats2[,1],N,p)
  post_sd2 <- matrix(stats2[,2],N,p)

  # Compute DIC
  dic2   <- dic.samples(model2,n.iter=iters,progress.bar="none")

  # Compute WAIC
  waic2   <- coda.samples(model2, 
             variable.names=c("like"), 
             n.iter=iters, progress.bar="none")
  like2   <- as.matrix(waic2)  # combine draws from both chains
  fbar2   <- colMeans(like2)
  P2      <- sum(apply(log(like2),2,var))
  WAIC2   <- -2*sum(log(fbar2))+2*P2

(3) Slopes as random effects

 model3_string <- "model{

   # Likelihood
   for(i in 1:n){
      Y[i]    ~ dnorm(mnY[i],taue)
      mnY[i] <- inprod(X[i,],beta[id[i],])
   }

   # Random slopes: hierarchical prior pools information across states
   for(j in 1:p){
     for(i in 1:N){
       beta[i,j] ~ dnorm(mu[j],taub[j])
     }
     mu[j]    ~ dnorm(0,0.01)    # overall mean slope for covariate j
     taub[j]  ~ dgamma(0.1,0.1)  # across-state precision for covariate j
   }

   # Priors
   taue ~ dgamma(0.1,0.1)


   # WAIC calculations
   for(i in 1:n){
     like[i]    <- dnorm(Y[i],mnY[i],taue)
   }
  }"


  library(rjags)

  # Load the model
  dat    <- list(Y=Y,n=n,N=N,X=X,p=p,id=id)
  init   <- list(beta=matrix(0,N,p))
  model3 <- jags.model(textConnection(model3_string),n.chains=2,
                       inits=init,data = dat,quiet=TRUE)

  # Generate samples
  update(model3, burn, progress.bar="none")
  samp3  <- coda.samples(model3, 
            variable.names=c("beta"), 
            n.iter=iters, progress.bar="none")

  # Compile results
  ESS3     <- effectiveSize(samp3)
  stats3   <- summary(samp3)$statistics
  post_mn3 <- matrix(stats3[,1],N,p)
  post_sd3 <- matrix(stats3[,2],N,p)

  # Compute DIC
  dic3    <- dic.samples(model3,n.iter=iters,progress.bar="none")

  # Compute WAIC
  waic3   <- coda.samples(model3, 
             variable.names=c("like"), 
             n.iter=iters, progress.bar="none")
  like3   <- as.matrix(waic3)  # combine draws from both chains
  fbar3   <- colMeans(like3)
  P3      <- sum(apply(log(like3),2,var))
  WAIC3   <- -2*sum(log(fbar3))+2*P3

Check convergence

 ESS1
##   beta[1]   beta[2]   beta[3]   beta[4]   beta[5]   beta[6]   beta[7] 
## 99338.223 53652.963 18626.686 41660.287 30666.972 16547.198 15945.338 
##   beta[8]   beta[9]  beta[10]  beta[11] 
## 24268.764 17264.099  8718.938 12147.265
 hist(ESS2)

[Figure: histogram of effective sample sizes for Model 2]

 hist(ESS3)

[Figure: histogram of effective sample sizes for Model 3]

Summary: The effective sample sizes are large for all parameters in all three models, indicating that the chains mix well and showing no evidence against convergence.
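Because two chains were run for each model, the Gelman-Rubin diagnostic offers an additional check beyond the original output (a sketch; values near 1 support convergence):

 # Potential scale reduction factors for the Model 1 coefficients
 gelman.diag(samp1, multivariate=FALSE)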

Summarize the constant-slopes model (Model 1)

 kable(round(out1,2))
|                    |  2.5% |   25% |   50% |   75% | 97.5% |
|--------------------|------:|------:|------:|------:|------:|
| Intercept          |  6.41 |  6.58 |  6.67 |  6.76 |  6.93 |
| Pop change         | -1.46 | -1.25 | -1.14 | -1.03 | -0.82 |
| 65+                |  0.53 |  0.79 |  0.93 |  1.06 |  1.32 |
| African American   | -1.89 | -1.67 | -1.56 | -1.45 | -1.23 |
| Hispanic           | -2.40 | -2.18 | -2.06 | -1.95 | -1.72 |
| HS grad            |  1.24 |  1.57 |  1.75 |  1.92 |  2.25 |
| Bachelor's         | -6.71 | -6.37 | -6.19 | -6.01 | -5.66 |
| Homeownership rate | -0.38 | -0.13 |  0.01 |  0.15 |  0.41 |
| Home value         | -1.98 | -1.68 | -1.52 | -1.36 | -1.05 |
| Median income      |  1.14 |  1.62 |  1.87 |  2.12 |  2.60 |
| Poverty            |  0.91 |  1.28 |  1.47 |  1.66 |  2.03 |

Summary: All covariates except homeownership rate have 95% intervals that exclude zero. GOP support tended to increase in counties with older populations, more high-school graduates, higher median incomes, and higher poverty rates, and tended to decrease in counties with growing populations, larger African American and Hispanic populations, more bachelor's-degree holders, and higher home values.

Compare models with DIC

 dic1
## Mean deviance:  21300 
## penalty 11.98 
## Penalized deviance: 21312
 dic2
## Mean deviance:  18484 
## penalty 454.9 
## Penalized deviance: 18938
 dic3
## Mean deviance:  18604 
## penalty 237.5 
## Penalized deviance: 18842

Summary: The first model with constant slopes is the simplest but fits the data poorly and thus has the highest DIC. The second model with separate slopes in each state has the best fit (smallest mean deviance) but is heavily penalized for complexity (large \(p_D\)). The third model has a fairly small mean deviance and a moderate \(p_D\), and thus balances fit and complexity to give the smallest DIC.
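For a more formal pairwise comparison, rjags also provides diffdic, which estimates the difference in penalized deviance between two fitted models (a sketch; output not shown, and positive values here would favor Model 3):

 # Paired difference in penalized deviance, Model 2 minus Model 3
 diffdic(dic2, dic3)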

Compare models with WAIC

 WAIC1; P1
## [1] 21334.86
## [1] 20.02579
 WAIC2; P2
## [1] 18970.65
## [1] 405.3173
 WAIC3; P3
## [1] 18910.91
## [1] 259.5478

Summary: WAIC agrees with DIC. Both prefer Model 3 with the regression coefficients treated as random effects.
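The two criteria can also be collected side by side (a minimal sketch using objects computed above):

 # Model comparison table: complexity penalties and WAIC
 data.frame(model = c("Constant slopes","Fixed effects","Random effects"),
            pW    = round(c(P1,P2,P3),1),
            WAIC  = round(c(WAIC1,WAIC2,WAIC3),1))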

Explore the results of the final model

 boxplot(post_mn3, names=short, las=2)  # one box per covariate, values across states

[Figure: boxplots of the posterior-mean slopes across states, one box per covariate]

Summary: The effect of the proportion of college graduates varies the most across states.
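The across-state spread of the posterior-mean slopes quantifies this (a quick sketch):

 # Across-state SD of the posterior mean slope for each covariate
 round(apply(post_mn3, 2, sd), 2)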

Explore the state-level estimates of the effect of college graduates

 # Posterior mean
  county_plot(fips,post_mn3[id,7],
              main="Proportion of college graduates - posterior mean")

[Figure: county map of the posterior mean of the college-graduate effect]

 # Posterior sd
  county_plot(fips,post_sd3[id,7],
  main="Proportion of college graduates - posterior SD")

[Figure: county map of the posterior SD of the college-graduate effect]

Summary: The effect of the proportion of college graduates is strongest (most negative) in the Midwest.