Missing data analysis of 2016 Boston marathon data

Chapter 6: Case studies using hierarchical modelling

Let \(Y_{i}\) be the speed in mile 26 for runner \(i\) and \(X_{ij}\) be the speed for runner \(i\) in file \(j\). We fit the linear regression model \[Y_{i} = \alpha + \sum_{j=1}^{25}X_{ij}\beta_j +\epsilon_{i}\] where \(\epsilon_{i}\sim\mbox{Normal}(0,\sigma^2)\). In these data there are missing \(X_{ij}\). In our Bayesian analysis we specify a first-order autoregressive prior for the missing (standardized) covariates \[X_{i1}\sim\mbox{Normal}(0,\sigma^2_1) \mbox{ and } X_{ij+1}|X_{ij} \sim \mbox{Normal}\left(\rho X_{ij},\sigma^2_2\right).\] The remaining hyperparameters have uninformative priors.

Load and plot the data

 load("BostonMarathon2016.Rdata")

 Y  <- Marathon_female$SPEED_mile[,26]
 X  <- Marathon_female$SPEED_mile[,-26]
 X  <- scale(X)
 n  <- length(Y)
 p  <- ncol(X)

 image(1:p,1:n,is.na(t(X)),col=0:1,
       xlab="Mile",ylab="Runner",
       main="Missing data pattern")
 for(j in 0:p){
   abline(v=j+0.5,col=gray(0.5))
 }
 for(j in 0:n){
   abline(j+0.5,0,col=gray(0.5))
 }

plot of chunk load

 matplot(t(X),type="l",lty=1,xlab="Mile",ylab="Standardized speed")

plot of chunk load

Define the model in JAGS

 library(rjags)
## Loading required package: coda
## Linked to JAGS 4.2.0
## Loaded modules: basemod,bugs
 model_string <- textConnection("model{

   # Likelihood
    for(i in 1:n){
      Y[i] ~ dnorm(alpha + inprod(X[i,],beta[]),taue)
    }

   # Missing data model
    for(i in 1:n){
      X[i,1]  ~ dnorm(0,tau1)
      for(j in 2:p){
        X[i,j] ~ dnorm(rho*X[i,j-1],tau2)
      }
    }

   # Priors
    alpha ~ dnorm(0,0.01)
    for(j in 1:p){
      beta[j] ~ dnorm(0,0.01)
    }
    taue  ~  dgamma(0.1, 0.1)
    tau1  ~  dgamma(0.1, 0.1)
    tau2  ~  dgamma(0.1, 0.1)
    rho   ~  dnorm(0, 0.01)
 }")

Fit the model

 inits <- list(rho=rnorm(1))
 data  <- list(Y=Y,X=X,n=n,p=p)
 model <- jags.model(model_string,data = data, quiet=TRUE,inits=inits, n.chains=2)

 update(model, 10000, progress.bar="none")
 params  <- c("alpha","beta","X[3,6]","X[3,7]",
              "X[149,19]","X[149,20]","X[149,21]","rho")
 samples <- coda.samples(model, 
            variable.names=params, 
            n.iter=20000, progress.bar="none")

 plot(samples)

plot of chunk fitplot of chunk fitplot of chunk fitplot of chunk fitplot of chunk fitplot of chunk fitplot of chunk fitplot of chunk fit

Plot the imputed covariates for two runners

 X_miss <- rbind(samples[[1]],samples[[2]])[,1:5]
 r      <- range(na.omit(as.vector(X)))

 # Runner 3
 plot(X[3,],ylim=r,
            xlab="Mile",ylab="Speed (minute/mile)",
            main="Imputed X for two runners")
 boxplot(X_miss[,4],at=6,outline=FALSE,add=TRUE)
 boxplot(X_miss[,5],at=7,outline=FALSE,add=TRUE)

 # Runner 149
 points(X[149,],pch=19)
 boxplot(X_miss[,1],at=19,col=1,outline=FALSE,add=TRUE)
 boxplot(X_miss[,2],at=20,col=1,outline=FALSE,add=TRUE)
 boxplot(X_miss[,3],at=21,col=1,outline=FALSE,add=TRUE)

 legend("topleft",c("Runner 3","Runner 149"),pch=c(1,19),bty="n",cex=1.5)

plot of chunk plot1

Summary: The posterior of the missing \(X_{ij}\) is similar to adjacent speeds.

Plot the posterior of beta

 beta <- rbind(samples[[1]],samples[[2]])[,1:p+6]

 boxplot(beta,xlab="Mile",ylab="Posterior",
         main="Posterior of beta",
         ylim=1.5*c(-1,1),outline=FALSE,axes=FALSE)
 axis(1)
 axis(2)

 abline(0,0)

plot of chunk beta

Summary: Only the speed for miles 24 and 25 are significant predictors of the speed in the final mile.