nams2<-function(x,y,B=500,seed=100,largep=F,r2g=.5,print=T,digits=4){ # Code by Dennis Boos, latest version 4/27/07 # Estimated alpha for forward selection, uses library(leaps) # Uses alphahat.simple to select from alpha by slope plot # does not allow full model or any alpha past min of plot # be sure to look at plot for local maxima # x=data frame or matrix of predictors, y=responses # B= number of bootstrap resamples # Allows use of m=ncol(x) > n=length(y) # largep=T forces tau^2=(1-r2g)*var(y), r2g=guess of R^2 # print=T allows printing of p-values # Based on Luo, Stefanski, and Boos (2006, Technometrics) except # this version uses same noise variables for each alpha and lambda require(leaps) set.seed(seed) ok<-complete.cases(x,y) x<-x[ok,] # get rid of na's y<-y[ok] # since regsubsets can't handle na's x<-as.matrix(x) # in case x is a data frame m<-ncol(x) n<-nrow(x) if(m >= n) largep<-T # if they forget if(m >= n) m1 <- n-5 else m1<-m # to get rid of NA's in pv vm<-1:m1 pvm<-rep(0,m1) lam<-c(0.5,1.0,1.5,2.0) # lambda grid sqlam<-sqrt(lam) # first we call regsubsets to get the grid of p-values and alpha values out.x<-regsubsets(x,y,method="forward") # calculate the p-values of the entering variables pv.orig<-1-pf((out.x$rss[vm]-out.x$rss[vm+1])*(out.x$nn-(vm+1))/out.x$rss[vm+1],1,out.x$nn-(vm+1)) # next we get the max of the p-values up to the ith step for (i in 1:m1){pvm[i]<-max(pv.orig[1:i])} # sequential max of pvalues real.seq<-data.frame(var=(out.x$vorder-1)[2:(m1+1)],pval=pv.orig, pvmax=pvm,Rsq=round(1-out.x$rss[2:(m1+1)]/out.x$rss[1],4)) if(print)print(round(real.seq,digits)) if(largep){tau<-sqrt((1-r2g)*var(y))} else {tau<-sqrt(out.x$rss[m+1]/(n-m-1))} # calculate grid of alphas according to paper, p. 169 a<-c(0,unique(pvm)) # unique throws out duplicate pvm's ma<-length(a) alpha<-sort(c(0,(a[2:ma]+5*a[1:(ma-1)])/6,(2*a[2:ma]+4*a[1:(ma-1)])/6,(3*a[2:ma]+3*a[1:(ma-1)])/6, (4*a[2:ma]+2*a[1:(ma-1)])/6,(5*a[2:ma]+a[1:(ma-1)])/6,(1+2*max(a))/3,(2+max(a))/3,1)) ng<-length(alpha) # length of grid # now loop through the 4 lambda values to create the data and get mse's mse.a<-array(rep(0,B*ng*4),dim=c(B,ng,4)) av<-matrix(rep(0,4*ng),nrow=4) # to hold mean mse's ym<-matrix(rep(y,B),nrow=B,byrow=T) # B rows of y - may not need to do this holder<-rnorm(n*B) # to be used for all lambdas for (i in 1:4){ ystar<-ym+matrix(tau*sqlam[i]*holder,nrow=B,byrow=TRUE) mse.a[,,i]<-t(apply(ystar,1,mse.alpha,x=x,alpha=alpha)) # gives 4 matrices of mse's } for (i in 1:4){av[i,]<-mean(as.data.frame(mse.a[,,i]))} # averages the 4 matrices cbind(av[1,],av[2,],av[3,],av[4,])->mse.av # ng by 4 of average mse apply(mse.av,1,slope,x=lam)->slope.alpha # computes the slopes alphahat<-alphahat.simple(alpha,slope.alpha,largep,tau) # comment out the next 4 lines if you don't want the plot plot.l<-min(slope.alpha) if(largep) comp<-tau^2 else comp<-slope.alpha[ng] plot(slope.alpha,ylim=c(plot.l,2*comp-plot.l)) abline(h=comp) title("Slope vs. Alpha Index") # now get the selected model, size1=size of model including intercept if(alphahat>0) size1<-sum(pvm<=alphahat)+1 else size1<-1 colnames(x)<-colnames(x,do.NULL=F,prefix="") # corrects for no colnames x<-x[,colnames(x)[(out.x$vorder-1)[2:size1]]] if (size1==1) {mod <- lm(y~1)} else {mod <- lm(y~x)} print(res<-data.frame(m,n,B,seed,tau,ng,alphahat,alpha.index=which(alpha==alphahat))) cat("",fill=T) cat("Variables Selected =",colnames(x),fill=T) return(list(alpha=alpha,slope=slope.alpha,mse.av=mse.av,res=res,pv.orig=pv.orig, pvm=pvm,mod=mod,size=size1-1,alphahat=alphahat)) } #This is the end of the main function # auxiliary functions needed, the first is the ls slope est. slope<-function(x,y){sum((y-mean(y))*x)/sum((x-mean(x))^2)} farenough<-function(x,y,tol=.00001){(2*abs(x-y)/(abs(x)+abs(y)))>=tol} mse.alpha<-function(x,y,alpha){ # computes vector of mse estimates for forward(alpha) # assumes that all NA are gone and x is a matrix m<-ncol(x) n<-nrow(x) ng<-length(alpha) if(m >= n) m1 <- n-5 else m1<-m # to get rid of NA's in pv vm<-1:m1 ind<-rep(0,ng) # will contain size of each model(alpha) pvmax<-rep(0,m1) # sequential max of pvalues err.df<- n-(1:(m+1)) # error df regsubsets(x,y,method="forward")->out pv<-1-pf((out$rss[vm]-out$rss[vm+1])*(out$nn-(vm+1))/out$rss[vm+1],1,out$nn-(vm+1)) for (i in 1:m1){pvmax[i]<-max(pv[1:i])} # sequential max of pvalues mse<-out$rss/err.df # mse for each step for (ia in 1:ng){ind[ia] <- sum(pvmax<=alpha[ia])} ind[1]<-0 # force ind[1] to 0 mse.alp<-mse[ind+1] mse.alp } alphahat.simple<-function(x,y,largep,tau){ # finds alphahat where x is the alpha grid and y are the slopes # simple version, no local max, and alphahat=1 not possible ng<-length(x) if(largep) slope.end<-tau^2 else slope.end<-y[ng] xmin<-which.min(y) # index of min slope x<-x[1:xmin] # get rid of to the right of min y<-y[1:xmin] above<-(max(y)>slope.end) if(above) m2<-max((1:xmin)[y>slope.end]) else m2<-which.max(y)-1 alphahat<-x[m2+1] alphahat }