/*
The macro "add_phonies" is used to append uninformative predictor variables
to an embedded-image data set, thereby creating a data set useful for
variable selection exercises.
The macro reads in an embedded-image data set and adds a user-specified
number of uninformative predictor variables to the data set. It also
places the original informative predictor variables in user-specified positions.
I've commented the program so that you should be able to see where to make
the changes necessary to use a different starting data set and control
features of the output data set.
The difficulty of the variable selection problem is controlled by:
1) true model R^2 (.05 and .50) of the input data set;
2) p_phony = number of phony predictors added;
3) cm = multiplier controlling correlation between real and phony predictors.
********************
Credits and Disclaimer: Sandy Donaghy wrote the original version
of the macro below. If there are any bugs in it, I probably put them
there.
********************
*/
run; quit;

/* Fileref for the web directory that holds the data sets. This line must be
   correct when reading from the website. Change "haiku_files" to read from a
   different directory. */
filename mypath url
    'http://www4.stat.ncsu.edu/~stefanski/NSF_Supported/Hidden_Images/haiku_files/';

/* This example uses a data set with a haiku embedded in it. The data set has
   4 informative predictor variables. It is an "easy" case where R^2 = .50, so
   that forward selection finds the correct model. For a harder case change
   _50_ to _5_ in the data set name in the mypath() argument of the data step
   below. Change the data set name in mypath() to use a different data set.
   If the data sets have already been downloaded, change the data step
   accordingly. */
run; quit;

data indata;
    /* try changing _50_ to _5_ */
    infile mypath(model_haiku_Lin_4p_50_flat.txt);
    /* response y followed by the four real predictors x1-x4 */
    input y x1-x4;
run; quit;
* You shouldn't have to tinker with the macro unless you want to adapt it to
handle a data set with greater than 20 real (informative) predictors. Here's
a sample call to add_phonies()
/* %add_phonies(indataset=indata, */
/* outdataset=outdata, */
/* p_real=&p_real, */
/* p_phony=&p_phony, */
/* cm=&cm, */
/* seed=32150, */
/* col1=1, col2=2, col3=3, */
/* col4=4, col5=5, col6=6, */
/* col7=7, col8=8, col9=9); */;
/*
input variables for macro that adds phony variables:
1. indataset: input data set name with cols y x1-xp_real.
2. outdataset: output data set name with cols y w1,...,wp_total.
3. p_real: number of real predictors in the input (and output) data set.
4. p_phony: number of phony predictors to add. The macro computes
p_total = p_real + p_phony, the number of predictors in the output data set.
5. cm: controls correlation between phony and real predictors.
cm=0 <=> zero correlation, correlation increases with increasing |cm|.
6. seed: controls the seed of the normal random variable generation of phonies.
7. colj: colj are columns (wj) in which the "real" predictors are placed.
*/
run; quit;
%macro add_phonies(indataset=,outdataset=,p_real=,p_phony=,cm=,seed=0,
col1=,col2=,col3=,col4=,col5=,col6=,col7=,col8=,col9=,col10=,
col11=,col12=,col13=,col14=,col15=,col16=,col17=,col18=,col19=,col20=,);
/*
  add_phonies: read indataset (columns y, x1-x[p_real]) and write outdataset
  with columns y, w1-w[p_total], where p_total = p_real + p_phony.

  Parameters
    indataset   input data set name.
    outdataset  output data set name.
    p_real      number of real predictors x1-x[p_real] in the input (1 to 20,
                because only col1-col20 exist as parameters).
    p_phony     number of phony predictors to generate (0 or more).
    cm          multiplier of the real predictor that is added to each phony
                predictor. cm=0 adds nothing.
    seed        seed passed to rannor() for the phony predictors.
    col1-col20  output column index (1 to p_total) for each real predictor.
                col1 through col[p_real] must be supplied.

  Notes
    - p_total is assigned with a plain %let and is not declared local, so an
      existing global p_total is overwritten by the macro.
    - On invalid arguments an ERROR line is written to the log and no data
      step is generated.
*/
%local i;

%if %length(&p_real) = 0 or %length(&p_phony) = 0 %then %do;
    %put ERROR: add_phonies requires both p_real and p_phony.;
    %goto finish;
%end;
%if &p_real < 1 or &p_real > 20 or &p_phony < 0 %then %do;
    %put ERROR: add_phonies needs p_real in 1-20 and p_phony of 0 or more.;
    %goto finish;
%end;

%let p_total=%eval(&p_real + &p_phony); /* total number of predictors */

%do i=1 %to &p_real;
    %if %length(&&col&i) = 0 %then %do;
        %put ERROR: add_phonies requires col&i when p_real=&p_real..;
        %goto finish;
    %end;
    %if &&col&i < 1 or &&col&i > &p_total %then %do;
        %put ERROR: add_phonies col&i=&&col&i is outside 1-&p_total..;
        %goto finish;
    %end;
%end;

data &outdataset;
    set &indataset;
    array w{&p_total};
    array z{&p_total};
    array x{&p_real};
    array column{&p_real} col1-col&p_real;
    /* 1 for output columns that hold a real predictor, 0 otherwise. A flag is
       used instead of testing w{jj}=. so that a missing real value is not
       mistaken for an unfilled slot. */
    array is_real{&p_total} _temporary_;

    /* copy the macro-level column positions into the data step */
    %do i=1 %to &p_real;
        column{&i}=&&col&i;
    %end;

    /* start every output column as missing and unclaimed */
    do i=1 to &p_total;
        w{i}=.;
        is_real{i}=0;
    end;

    /* generate the phony variables */
    do i=1 to &p_phony;
        /* mean_seed is rebuilt from the column index i on every observation,
           so stddev depends on the column only */
        mean_seed=floor(sqrt(i)) + i + 247;
        call rannor(mean_seed,mean);
        stddev=( mean*mean + 3)/4;
        /* phony values are centred at 0 */
        z{i}=rannor(&seed)*stddev;
        /* &cm controls correlation between phony and real predictors. The
           subscript maps i=1..p_phony onto 1..p_real in equal-sized runs. */
        z{i}=z{i} + &cm*x{ floor((&p_real)*(i-1)/(&p_phony))+1 };
        z{i}=round(z{i},.00001);
    end;

    /* insert the original variables, each rescaled by a per-variable factor */
    do jj=1 to &p_real;
        mean_seed2=2*jj*jj*jj + 5*jj*jj + 6*jj + 233;
        call rannor(mean_seed2,mean2);
        stddev2=( mean2*mean2 + 5)/6;
        j=column{jj};
        w{j}=x{jj}*stddev2;
        is_real{j}=1;
    end;

    /* fill the remaining columns, in order, with the phony variables */
    j=0;
    do jj=1 to &p_total;
        if not is_real{jj} then do;
            j+1;
            w{jj}=z{j};
        end;
    end;

    keep y w1-w&p_total ;
run;

%finish:
%mend add_phonies;
/* Call the macro that builds the output data set.
   p_real, p_phony and cm do not have to be assigned outside of the macro
   inputs, but doing so makes them easier to alter and lets later code derive
   the total number of predictors from them. */
%let p_real=4;    /* p_real value must match input data set */
%let p_phony=500; /* try different p_phony */
%let cm=0;        /* try different cm > 0 */
%add_phonies(indataset=indata,
             outdataset=outdata,
             p_real=&p_real,
             p_phony=&p_phony,
             cm=&cm,
             seed=34650,
             col1=13, col2=23, col3=33, col4=43);
* Note: because p_real=4, col1,...,col4 need to be assigned. These are the
locations (columns) of the "real" variables in the output data set. The
values should be unique and between 1 and p_real + p_phony. So for this
example the real variables will be columns 13, 23, 33, and 43.
;
%let p_total=%eval(&p_real + &p_phony);

/* Take a look at the generated data set, then run forward selection
   and plot residuals against predicted values. */
run; quit;

proc means data=outdata;
run; quit;

proc reg data=outdata;
    model y = w1-w&p_total / selection=forward slentry=.005;
    output out=regout p=p r=r;
run; quit;

proc gplot data=regout;
    symbol v=dot height=.2;
    plot r*p;
run; quit;