############################################################ # # Simulation to compare sampling properties of three # different estimators for the mean of a distribution # based on an iid sample of size n: # # - sample mean # # - 20% trimmed mean (may work better with heavy tails...) # # - median (may not work well for asymmetric distributions...) # # S = total number of simulated data sets # ############################################################ # function to view the first k lines of a data frame view <- function(dat,k){ message <- paste("First",k,"rows") krows <- dat[1:k,] cat(message,"\n","\n") print(krows) } # function to calculate summary statistics across the 1000 # data sets simsum <- function(dat,trueval){ S <- nrow(dat) MCmean <- apply(dat,2,mean) MCbias <- MCmean-trueval MCrelbias <- MCbias/trueval MCstddev <- sqrt(apply(dat,2,var)) MCMSE <- apply((dat-trueval)^2,2,mean) # MCMSE <- MCbias^2 + MCstddev^2 # alternative lazy calculation MCRE <- MCMSE[1]/MCMSE sumdat <- rbind(rep(trueval,3),S,MCmean,MCbias,MCrelbias,MCstddev,MCMSE, MCRE) names <- c("true value","# sims","MC mean","MC bias","MC relative bias", "MC standard deviation","MC MSE","MC relative efficiency") ests <- c("Sample mean","Trimmed mean","Median") dimnames(sumdat) <- list(names,ests) round(sumdat,5) } # function to generate S data sets of size n from normal # distribution with mean mu and variance sigma^2 generate.normal <- function(S,n,mu,sigma){ dat <- matrix(rnorm(n*S,mu,sigma),ncol=n,byrow=T) # Note: for this very simple data generation, we can get the data # in one step like this, which requires no looping. In more complex # statistical models, looping is often required to set up each # data set, because the scenario is much more complicated. Here is # a loop to get the same data as above; try running the program and see # how much longer it takes! # dat <- NULL # # for (i in 1:S){ # # Y <- rnorm(n,mu,sigma) # dat <- rbind(dat,Y) # # } out <- list(dat=dat) return(out) } # function to generate S data sets of size n from gamma # distribution with mean mu, variance sigma^2 mu^2 generate.gamma <- function(S,n,mu,sigma){ a <- 1/(sigma^2) s <- mu/a dat <- matrix(rgamma(n*S,shape=a,scale=s),ncol=n,byrow=T) # Alternative loop # dat <- NULL # # for (i in 1:S){ # # Y <- rgamma(n,shape=a,scale=s) # dat <- rbind(dat,Y) # # } out <- list(dat=dat) return(out) } # function to generate S data sets of size n from a t distribution # with df degrees of freedom centered at the value mu (a t distribution # has mean 0 and variance df/(df-2) for df>2) generate.t <- function(S,n,mu,df){ dat <- matrix(mu + rt(n*S,df),ncol=n,byrow=T) # Alternative loop # dat <- NULL # # for (i in 1:S){ # # Y <- mu + rt(n,df) # dat <- rbind(dat,Y) # # } out <- list(dat=dat) return(out) } # function to compute the 20% trimmed mean trimmean <- function(Y){mean(Y,0.2)} # set the seed for the simulation set.seed(3) # set number of simulated data sets and sample size S <- 1000 n <- 15 # generate data --Distribution choices are normal with mu,sigma # (rnorm), gamma (rgamma) and student t (rt) # a possible "fair" comparison would be to generate data from each # of these distributions with the same mean and variance and see how # the three methods perform on a relative basis under each condition mu <- 1 sigma <- sqrt(5/3) # out <- generate.normal(S,n,mu,sigma) # generate normal samples # out <- generate.gamma(S,n,mu,sigma) # generate gamma samples out <- generate.t(S,n,mu,5) # generate t_5 samples # get the sample means from each data set outsampmean <- apply(out$dat,1,mean) # get the sample trimmed mean from each data set outtrimmean <- apply(out$dat,1,trimmean) # get the sample median from each data set outmedian <- apply(out$dat,1,median) # now we can summarize -- remember, mu is the true value of the mean # for the true distribution of the data # get the 1000 estimates for each method summary.sim <- data.frame(mean=outsampmean,trim=outtrimmean, median=outmedian) # print out the first 5 of them (round to 4 decimal places) #view(round(summary.sim,4),5) # get summary statistics for each estimator results <- simsum(summary.sim,mu) ############################################################# # Some additional calculation for the sample mean # average of estimated standard errors for sample mean # usual standard error for sample mean from each data set sampmean.ses <- sqrt(apply(out$dat,1,var)/n) # take the average ave.sampmeanses <- mean(sampmean.ses) # coverage of usual confidence interval based on sample mean t05 <- qt(0.975,n-1) coverage <- sum((outsampmean-t05*sampmean.ses <= mu) & (outsampmean+t05*sampmean.ses >= mu))/S