/* ******************************************************************************* * This SAS macro implements the NAMS method for variable selection in forward * * selection found in Luo, Stefanski and Boos (2006), Technometrics, May 2006, * * Vol. 48, p. 165-175. Basically, noise is added to the response variable in * * known amounts, and forward selection is run with a variety of "alpha-to-enter" * * values. The procedure uses the resulting mse averaged over bootstrap reps * * to estimate "alpha-to-enter" so that forward selection does not underfit or * * overfit. * * * * Main Macro name : NAMS. Version 2/17/07 by Xiaohui Luo. * * The response variable should be named y. * * The candidate explanatory variables are x1, x2, ... , * * * * macro parameters: * * * * idn = input dataset name (required) * * p = the first p x's, x1-xp, will be used (required) * * bb = number of bootstrap reps (default value is 4000), minimum value 500. * * We recommend 4000 in routine applications. * * (This version asks for 5*bb more replications if a local max occurs.) * * noint: indicator to include intercept or not in the forward selection. * * The default is to include the intercept, o.w. use noint=noint. * * seed = the random seed (default is 27606, we highly recommend changing) * * mses = the dataset name to store the average of mean square errors * * (default is mses) * * check = the dataset name to store the slope of the regression of the * average mean square errors on the labmda (default is slopes) * * result= the dataset name to store the selected model and the fitting * * results (default is NAMS) * * All dataset names can be two-level, i.e., starting with a libname. * * contamination = the option whether to use the same set of added noise * * for each lambda or different sets * * contamination = 1 (default, uses same set) * * contamination = 2 (uses different sets) * * two examples: * * contamination with the same N(0,1) rv's for different lambda,the default option; *%NAMS(idn=pollute,p=15,bb=4000,seed=123; * contamination with different N(0,1) r.v. for different lambda: *%NAMS(idn=pollute,p=15,bb=4000,seed=123,result=nextalpha,contamination=2); ***********************************************************************************/ * to claim if two values are practically different ; %macro farenough(var1,var2,diff=.00001); (abs(&var1-&var2)*2/(abs(&var1)+abs(&var2))>=&diff) %mend farenough; %macro forward_candidates(idn=,p=,noint=,odn=); ods listing close; ods output selectionsummary=details (keep=varentered probf); proc reg data=&idn ; model y = x1-x&p / &noint selection = forward slentry = .9999999999999 details=summary; run; quit; %let lll = %sysevalf(&p*4); data details; retain p step 0 ; length list $&lll.; retain list ""; set details end=eof; if _n_ = 1 then list = varentered; else list = left(trim(list)) || " " || varentered ; if _n_ = 1 then p = probf - .1; if probf > p then do; p = probf; step + 1; end; keep p list step; rename p = probf; run; data &odn.; set details; by step; if last.step; drop step; run; data zero; length list $&lll.; probf=0; list=""; run; data &odn.; set zero &odn.; rename probf=lower; run; %mend forward_candidates; %macro forwards(idn=,p=,noint=,b=,odn=); ods listing close; %if &b <= 100 %then %do; ods output selectionsummary=details (keep=probf modelrsquare numberin rename=(numberin=_p_)); proc reg data=&idn ; model y1-y&b = x1-x&p / &noint selection = forward slentry = .9999999999999999 details=steps; run; quit; %end; %else %do; %* to accommodate the needs to run a huge job in several pieces; %let tot = %sysfunc(ceil(&b/100)); %do cut = 1 %to &tot; %let left = %sysevalf((&cut-1)*100+1); %let right = %sysfunc(min(&cut*100,&b)); ods output selectionsummary=details&cut (keep=probf modelrsquare numberin rename=(numberin=_p_)); proc reg data=&idn ; model y&left-y&right = x1-x&p / &noint selection = forward slentry = .9999999999999 details=steps ; run; quit; %end; data details; set %do cut = 1 %to &tot; details&cut %end;; run; %end; ods output close; ods listing; %if "&noint" = "noint" %then %do; %* no intercept in the model: R square defined differently; proc means data=&idn uss n noprint ; var y1-y&b ; output out = out0 (drop=_type_ _freq_) uss= n=n; run; proc transpose data=out0(drop=n) out=mse0(keep=col1 rename=(col1=_mse_)); run; data mse0; if _n_ = 1 then set out0(keep=n); set mse0; sst = _mse_; _mse_ = _mse_ / n; drop n; dep = _n_; _p_ = 0; probf = 0; run; %end; %else %do; %* intercept in the model ; proc means data=&idn css n noprint ; var y1-y&b ; output out = out0 (drop=_type_ _freq_) css= n=n; run; proc transpose data=out0(drop=n) out=mse0(keep=col1 rename=(col1=_mse_)); run; data mse0; if _n_ = 1 then set out0(keep=n); set mse0; sst = _mse_; _mse_ = _mse_ / (n-1); drop n; dep = _n_; _p_ = 1; probf = 0; run; %end; data details; retain dep 1; set details; p = lag(_p_); if _p_ <= p then dep + 1; drop p; run; data details; retain p step 0 ; set details; by dep _p_; if first.dep then do; p = probf - .1; step = 0; end; if probf > p then do; p = probf; step + 1; end; if "&noint" = "" then _p_ = _p_ + 1; keep p step _p_ dep modelrsquare; rename p = probf; run; data details; set details; by dep step ; if last.step; drop step; run; data test; set mse0 details; by dep _p_; run; data &odn; %* calculate residual mean squares based on the total corrected sum squares and R square ; retain sst0; set test; by dep _p_; if first.dep then sst0 = sst; if _mse_ = . then _mse_ = sst0*(1-modelrsquare)/(&size - _p_); drop sst sst0 modelrsquare; label _mse_ = "Mean Square Error" _p_ = "Number of Parameters"; run; ods output close; ods listing; %mend forwards; %macro same_lambda(idn=,lambda=,b=,p=,out=,noint=); %forwards(idn=&idn,p=&p,noint=&noint,b=&b,odn=forwards); %* obtain the selected model of forward selection for each alpha ; data out; retain mse0-mse&alpha_n 0; set forwards; by dep _p_; %do j = 0 %to &alpha_n; if probf <= &&alpha&j then do; mse&j = _mse_; end; %end; if last.dep then output; keep mse0-mse&alpha_n; run; %* create the output data set by merging the results for different alpha values ; proc means data=out mean noprint; var mse0-mse&alpha_n; output out=&out(drop=_type_ _freq_) mean=; run; data &out; lambda = λ set &out; run; %mend same_lambda; %macro forward_tuning(idn=,p=,noint=,mses=); %* for each lambda value, the forward selection will be executed ; %do lj = 1 %to &lambda_n ; %let b = &bb; data current ; %* obtain the contaminated data for the lambda ; set &idn (drop=y); if lambda = &&lambda&lj ; drop lambda ; run; %* Call the same_lambda macro by passing some parameter values. * %* The input data set name is passed through "idn", * %* p is the number of predictors involved in the forward regression * %* the output data set name is passed through "out" * ; %same_lambda(idn=current,lambda=&&lambda&lj,b=&b,p=&p,out=temp&lj,noint=&noint); %end; %* assemble the result data sets ; data &mses; set %do k = 1 %to &lambda_n ; temp&k %end ; ; run; %mend forward_tuning; %macro NAMS0; data initialps; seed=&seed.; B=&bb.; run; %global alphahat1 list localmax; /* create the contamined data set */ %if &contamination=1 %then %do; data model; b = &bb; retain seed &seed.; array z{&bb} y1-y&bb. ; array tt{&bb} tt1-tt&bb. ; if _n_ = 1 then set sigma; set &idn end=eof; do i = 1 to b; call rannor(seed,tt{i}); end; %do j = 1 %to &lambda_n; lambda = &&lambda&j ; do i = 1 to b; z{i} = y + sqrt(lambda)* tt{i} * _rmse_; end; output; %end; if eof then call symput("seed",seed); drop seed b i _rmse_ tt1-tt&bb.; run; %end; %else %if &contamination ne 1 %then %do; data model; b = &bb; retain seed &seed.; array z{&bb} y1-y&bb ; if _n_ = 1 then set sigma; set &idn end=eof; %do j = 1 %to &lambda_n; lambda = &&lambda&j; do i = 1 to b; call rannor(seed,tt); z{i} = y + sqrt(lambda)* tt * _rmse_; end; output; %end; if eof then call symput("seed",seed); drop seed b i _rmse_ tt; run; %end; %forward_candidates(idn=&idn,p=&p,noint=&noint,odn=can00); /* add 1 to the end of the "p-values" of the forward selection */ data details; set can00(keep=lower rename=(lower=probf) firstobs=2) end=eof; output; if eof then do; probf = 1; output; end; keep probf; run; %* Set the alphas and keep the number of alphas into alpha_N. * %* They are global macro variables such that they can be accessed * %* within any other macro that is called by this macro * ; data alpha; %* creat the alpha grid for tuning procedure ; set details end=eof; p = lag(probf); if _n_ = 1 then do; alpha = 0 ; p = 0; output; end; if not eof then do; alpha = (5*p + probf)/6; n = 5 *_n_ - 4; call symput(compress("alpha" || n),alpha); output; alpha = (4*p + 2*probf)/6; n = 5 *_n_ - 3 ; call symput(compress("alpha" || n),alpha); output; alpha = (3*p + 3*probf)/6; n = 5 *_n_ - 2; call symput(compress("alpha" || n),alpha); output; alpha = (2*p + 4*probf)/6; n = 5 *_n_ - 1 ; call symput(compress("alpha" || n),alpha); output; alpha = (p + 5*probf)/6; n = 5 *_n_ ; call symput(compress("alpha" || n),alpha); output; end; else do; alpha = (2*p + probf)/3; n = 5*_n_ - 4; call symput(compress("alpha" || n),alpha); output; alpha = (p + 2*probf)/3; n = 5*_n_ - 3; call symput(compress("alpha" || n),alpha); output; alpha = 1; n = 5 *_n_ - 2; call symput(compress("alpha" || n),1); output; call symput("alpha_N",trimn(left(put(n,8.)))); end; keep alpha; run; %let alpha0 = 0; %forward_tuning(idn=model,p=&p,noint=&noint,mses=&mses.); data &mses.; if _n_ = 1 then set initialps; set &mses.; run; proc reg data=&mses outest=&check (keep=lambda rename=(lambda=slope)) noprint; model mse0 mse1-mse&alpha_N=lambda ; run; quit; %* Merge the "alpha" data set and the "slope" data set (excluding the first obs, * %* which corresponding the true model) together to be prepared for the regression * %* Also, a log transformation is applied to the alpha values * ; data ✓ if _n_ =1 then set initialps; merge &check alpha ; run; proc sort data=&check(keep=slope alpha) out=bestfit; by descending alpha; run; %* remove those alphas that are too close to their neighbours ; data bestfit; set bestfit; s = lag(slope); if _n_ <= 3 or %farenough(s,slope) ; drop s; run; %let alphahat1 = 0; %let localmax=0; data _null_; retain baseline -1; set bestfit end=eof; if _n_ = 1 then baseline=slope; slope1=lag(slope); slope2=lag2(slope); slope3=lag3(slope); slope4=lag4(slope); alpha1=lag(alpha); alpha2=lag2(alpha); alpha3=lag3(alpha); if _n_ = 3 and slope>=baseline and slope1>=baseline then do; call symput("alphahat1",1); stop; end; else if _n_ > 3 and slope>=baseline then do; call symput("alphahat1",alpha1); stop; end; else if _n_ > 4 and slope2>=slope3 and slope2>=slope4 and slope2>=slope1 and slope2>=slope then do; call symput("alphahat1",alpha3); call symput("localmax",left(1)); stop; end; if eof then do; if slope1>slope2 and slope1>slope3 and slope1>slope then do; call symput("alphahat1",alpha2); call symput("localmax",left(1)); stop; end; else if slope>slope1 and slope>slope2 then do; if alpha = 0 then do; call symput("alphahat1",0); stop; end; else do; call symput("alphahat1",trimn(left(alpha1))); stop; end; end; end; run; %* obtain the model ; data alphamodel; set can00 (where=(lower<=&alphahat1)) end=eof; if eof; call symput(compress("list"),list); alphahat=&alphahat1; keep alphahat list; rename list=model; run; proc reg data = &idn. (keep=x1-x&p y) outest=_result noprint; model y = x1-x&p. / &noint selection = forward slentry = &alphahat1. edf sse mse; run; quit; data &result.; retain model ; retain _in_ alphahat _rsq_ _rmse_ 0 intercept . x1-x&p. _sse_ _mse_ 0; merge _result alphamodel initialps; keep model _in_ alphahat _rsq_ _rmse_ intercept x1-x&p. _sse_ _mse_ seed B; run; %mend NAMS0; * this is the main macro ; %macro NAMS(idn=,p=,bb=4000,noint=,seed=27606,mses=mses,check=slopes,result=NAMS,contamination=1) ; %put NAMS (Noise Addition Model Selection) by Luo, Stefanski and Boos 2006 Technometrics paper. ; %if &bb. < 500 %then %let bb=500; %let seed0=&seed.; %let bb0=&bb.;%let mses0=&mses.; %let check0=&check.; %let result0=&result.; options nonotes nosource cleanup; %let lambda_n = 4; %* 4 different lambdas are used in NAMS; %let lambda1 = .5; %let lambda2 = 1; %let lambda3 = 1.5; %let lambda4 = 2; %* to remove the records with missing data ; data __completecase; set &idn.; if nmiss(of x1-x&p.) = 0 and y ne .; run; %let idn=__completecase; %* use the data set with complete cases for variable selection ; data _null_; %* obtain the sample size of the data ; point=1; set &idn point=point nobs=nobs; call symput("size",nobs); stop; run; %* obtain the residual mean squares from the full model on the original data; proc reg data = &idn outest=sigma (keep=_rmse_) noprint; model y = x1-x&p/&noint mse; run; quit; %NAMS0; %let runcounter=0; %if &localmax.=1 %then %do; %let runcounter=%eval(&runcounter.+1); %let bb=%sysevalf(&bb0.*5); %let seed=%eval(&seed0.+&runcounter.); %let mses=_mse&runcounter.; %let check=_check&runcounter.; %let result=_result&runcounter.; %NAMS0; data &mses0.; retain ExtraRun seed B 0 ; set &mses0. (in=in0) &mses.; if in0 then ExtraRun=0; else ExtraRun = &runcounter.; run; data &check0.; retain ExtraRun seed B 0 ; set &check0. (in=in0) &check.; if in0 then ExtraRun=0; else ExtraRun = &runcounter.; run; data &result0.; retain ExtraRun seed B 0 ; set &result0. (in=in0) &result.; if in0 then ExtraRun=0; else ExtraRun = &runcounter.; run; %end; proc sort data=&check0. out=_check; by alpha; %if &runcounter. ne 0 %then %do; where extrarun=&runcounter.; %end; run; data _check; set _check; alphaIndex=_n_; call symput("last",slope); run; proc gplot ; plot slope*alphaIndex/vref=&last; title "Slope vs. Alpha Index"; run; quit; Title; Goptions reset=all; options notes source cleanup; %put ; %put The selected model is saved in the dataset &result0..; %put The details of slopes and alphas are saved in the dataset &check0..; %if &runcounter ne 0 %then %do; %let bb=%sysevalf(&bb0.*5); %let seed=%eval(&seed0.+&runcounter.); %put Based on the seed and the number of bootstrapping specififed in the call,; %put the model would have been selected by a local max.; %put ; %put The program uses seed=&seed. (i.e., the original seed + &runcounter.) ; %put and BB=&bb. (i.e., the original BB times 5) instead to select a more reliable model.; %if &localmax=1 %then %do; %put ; %put But the model is still selected by a local max. ; %put Please confirm the selected model by checking the slope by alpha index plot.; %end; %end; %put ; %put The alphahat = &alphahat1. ; %put; %put The selected model is ; %put &list. ; %mend NAMS;