/*
   The macro "add_phonies" is used to append uninformative predictor variables
   to an embedded-image data set, thereby creating a data set useful for
   variable selection exercises.  The macro reads in an embedded-image data
   set and adds a user-specified number of uninformative predictor variables
   to the data set.  It also places the original informative predictor
   variables in user-specified positions.

   I've commented the program so that you should be able to see where to make
   the changes necessary to use a different starting data set and control
   features of the output data set.

   The difficulty of the variable selection problem is controlled by:
     1) true model R^2 (.05 and .50) of the input data set;
     2) p_phony = number of phony predictors added;
     3) cm = multiplier controlling correlation between real and phony
        predictors.

   ********************
   Credits and Disclaimer: Sandy Donaghy wrote the original version of the
   macro below.  If there are any bugs in it, I probably put them there.
   ********************
*/
run; quit;

/* You've got to get this line correct if you are reading from my website.
   Change "haiku_files" to read from a different directory. */
filename mypath url 'http://www4.stat.ncsu.edu/~stefanski/NSF_Supported/Hidden_Images/haiku_files/';

/* For this example I'm using a data set with a haiku embedded in it.  The
   data set has 4 informative predictor variables.  I used an "easy" case
   where R^2 = .50 so that forward selection finds the correct model.  For a
   harder case change _50_ to _5_ in the data set name in the mypath()
   argument in the data step below.  Change the data set name in mypath() to
   use a different data set.
Of course, if you've already downloaded the data sets, change the data step
   accordingly. */
run; quit;
run; quit;

/* Read the response y and the informative predictors x1-x4 from the haiku
   file reached through the mypath fileref.
   Try changing _50_ to _5_ in the file name. */
data indata;
    infile mypath(model_haiku_Lin_4p_50_flat.txt);
    input y x1-x4;
run; quit;
run; quit;

/* You shouldn't have to tinker with the macro unless you want to adapt it to
   handle a data set with greater than 20 real (informative) predictors.
   Here's a sample call to add_phonies():

       %add_phonies(indataset=indata,
                    outdataset=outdata,
                    p_real=&p_real,
                    p_phony=&p_phony,
                    cm=&cm,
                    seed=32150,
                    col1=1, col2=2, col3=3,
                    col4=4, col5=5, col6=6,
                    col7=7, col8=8, col9=9);
*/

/* Input variables for the macro that adds phony variables:
     1. indataset:  input data set name with cols y x1-xp_real.
     2. outdataset: output data set name with cols y w1,...,wp_total, where
                    p_total = p_real + p_phony is computed inside the macro.
     3. p_real:     number of real predictors in the input data set.
     4. p_phony:    number of phony predictors to add.
     5. cm:         controls correlation between phony and real predictors.
                    cm=0 <=> zero correlation, correlation increases with
                    increasing |cm|.
     6. seed:       controls the seed of the normal random variable
                    generation of the phonies.
     7. colj:       colj are the columns (wj) in which the "real" predictors
                    are placed.
*/
run; quit;

/*
   add_phonies: copy y from &indataset into &outdataset and build predictors
   w1-w(p_real + p_phony).  The p_real input predictors x1-x(p_real) are
   rescaled and written to the output columns named by col1-col(p_real);
   every remaining output column is filled with a generated "phony" predictor.

   Parameters
     indataset   input data set containing y and x1-x(p_real)
     outdataset  output data set; only y and w1-w(p_real + p_phony) are kept
     p_real      number of real predictors in the input data set
     p_phony     number of phony predictors to generate
     cm          multiplier applied to a real predictor that is added to each
                 phony predictor (cm=0 adds nothing)
     seed        seed handed to the RANNOR function (default 0)
     col1-col20  output column index for real predictor 1..p_real; the first
                 p_real of them must be supplied
*/
%macro add_phonies(indataset=, outdataset=, p_real=, p_phony=, cm=, seed=0,
                   col1=,  col2=,  col3=,  col4=,  col5=,
                   col6=,  col7=,  col8=,  col9=,  col10=,
                   col11=, col12=, col13=, col14=, col15=,
                   col16=, col17=, col18=, col19=, col20=);

    /* Keep the loop index and the derived total out of the caller scope. */
    %local i p_total;

    /* Total number of predictors in the output data set. */
    %let p_total = %eval(&p_real + &p_phony);

    /* Fail early with a readable message instead of generating an invalid
       assignment statement when a needed column position is absent. */
    %do i = 1 %to &p_real;
        %if %symexist(col&i) = 0 %then %do;
            %put ERROR: add_phonies: no col&i parameter exists - the macro supports at most 20 real predictors.;
            %return;
        %end;
        %if %length(&&col&i) = 0 %then %do;
            %put ERROR: add_phonies: col&i must be supplied when p_real=&p_real..;
            %return;
        %end;
    %end;

    data &outdataset;
        set &indataset;

        array w{&p_total};                       /* output predictors        */
        array z{&p_total};                       /* generated phony values   */
        array x{&p_real};                        /* real predictors (input)  */
        array column{&p_real} col1-col&p_real;   /* target column of each x  */

        /* Record the requested output position of every real predictor. */
        %do i = 1 %to &p_real;
            column{&i} = &&col&i;
        %end;

        /* Mark every output column as unfilled (missing, not zero). */
        do i = 1 to &p_total;
            w{i} = .;
        end;

        /* Generate the phony variables, one generated code section per
           phony column. */
        %do i = 1 %to &p_phony;
            /* NOTE(review): mean_seed is computed from the data step
               variable i BEFORE i is set to the current column index, so it
               uses the value left by the previous section (p_total + 1 on
               the first section, from the loop above).  Left unchanged so
               that data sets generated earlier stay reproducible. */
            mean_seed = floor(sqrt(i)) + i + 247;
            call rannor(mean_seed, mean);
            stddev = (mean*mean + 3)/4;
            mean = 4*abs(mean) + 1;
            mean = 0;                 /* overrides the line above: phonies are centred at 0 */
            i = &i;
            z{i} = mean + rannor(&seed)*stddev;
            /* cm controls the correlation between phony and real predictors:
               the phony columns are split into p_real consecutive groups and
               each group gets cm times one real predictor added. */
            z{i} = z{i} + &cm*x{floor((&p_real)*(i-1)/(&p_phony)) + 1};
            z{i} = round(z{i}, .00001);
        %end;

        /* Insert the original variables, each rescaled by a factor derived
           from a draw seeded by its index, into the requested columns. */
        do jj = 1 to &p_real;
            mean_seed2 = 2*jj*jj*jj + 5*jj*jj + 6*jj + 233;
            call rannor(mean_seed2, mean2);
            stddev2 = (mean2*mean2 + 5)/6;
            j = column{jj};
            w{j} = x{jj}*stddev2;
        end;

        /* Fill every column that is still missing with the next phony
           variable, in order. */
        j = 0;
        do jj = 1 to &p_total;
            if w{jj} = . then do;
                j + 1;
                w{jj} = z{j};
            end;
        end;

        keep y w1-w&p_total;
    run;
%mend add_phonies;

/* This code calls the macro that builds the output data set.
   p_real and p_phony don't have to be assigned outside of the macro inputs,
   but doing so makes them easier to alter and lets the total number of
   predictors be recomputed from them automatically. */
%let p_real=4;      /* p_real value must match input data set */
%let p_phony=500;   /* try different p_phony */
%let cm=0;          /* try different cm > 0 */

%add_phonies(indataset=indata,
             outdataset=outdata,
             p_real=&p_real,
             p_phony=&p_phony,
             cm=&cm,
             seed=34650,
             col1=13, col2=23, col3=33, col4=43);

* Note: because p_real=4, col1,...,col4 need to be assigned. These are the
  locations (columns) of the "real" variables in the output data set. The
  values should be unique and between 1 and p_real + p_phony. So for this
  example the real variables will be columns 13, 23, 33.
and 43. ;

/* Total number of predictors in the generated data set; used in the
   variable list of the MODEL statement below. */
%let p_total=%eval(&p_real + &p_phony);

/* Take a look at the generated data set, then run forward selection and
   plot the residuals against the predicted values. */
run; quit;

/* Summary statistics for the generated data set (named explicitly rather
   than relying on the most recently created data set). */
proc means data=outdata;
run; quit;

/* Forward selection over all p_total predictors; predicted values (p) and
   residuals (r) are written to regout. */
proc reg data=outdata;
    model y = w1-w&p_total / selection=forward slentry=.005;
    output out=regout p=p r=r;
run; quit;

/* Residuals versus predicted values from the selected model. */
proc gplot data=regout;
    symbol v=dot height=.2;
    plot r*p;
run; quit;