*options nonotes nosource cleanup ps=50; /************************************************************************************ Macro name : FSR. Purpose : Implement the FSR method for variable selection in linear regression. Parameters: ind -- input the original data set. The response variable should be named as y. The explanatory variables should be named as x_1, ... , x_KT KT : The number of original explanatory variables, i.e., the x variables. fsr : The specified target false selection rate. amax : The specified maximum alpha-to-enter value in forward selection. B : The number of bootstrap repetitions for generating pseudo-variables. Useage: call macro FSR as: %FSR(ind = , KT =, fsr = , amax=, B = ). For example, %FSR(ind = NCAA, KT = 10, fsr = 0.05, amax = 0.3, B= 500), where NCAA is a SAS data set; Reference: Wu, Y., Boos, D.D., and Stefanski, L.A. (2007). Controlling Variable Selection by the Addition of Pseudo-Variables. Journal of American Statistical Association, 102, p 235-243. *************************************************************************************/ %MACRO FSR(ind = , KT =, fsr = ,amax =, B = ); options nonotes nosource cleanup; *Run forward selection on the real data; ODS OUTPUT selectionsummary=XSeq (KEEP=varentered ProbF); PROC REG DATA=&ind; MODEL y = x1-x&kt. / SELECTION = FORWARD SLENTRY = 0.99 Details = Summary; RUN; /* The data set Xseq has two variables: Xent -- the sequence of the variables entered into the model; Xent_a -- the corresponding p-values */ DATA Xseq; SET Xseq; RENAME varentered = Xent ProbF = Xent_a; RUN; DATA FSALL; SET _NULL_; RUN; ODS LISTING CLOSE; %DO loop = 1 %TO &B.; /*Generate the pseduo-variables by regression-residual permutation method*/ /*generate the initial pseudo-variables by permutation.*/ DATA Phoney; SET &ind (drop=y); u = RANUNI(0); RUN; PROC SORT DATA = Phoney; BY u; RUN; DATA Phoney; SET Phoney (keep=x1-x&kt.); RENAME %DO k = 1 %TO &kt.; x&k = zz&k %END;; RUN; /* Replace the intial pseudo-variables with the regression residuals */ DATA ALL; MERGE &ind Phoney; RUN; PROC REG DATA=ALL; MODEL zz1-zz&kt. = x1 - x&kt.; OUTPUT OUT = Phoney(Keep = z1-z&kt.) R = z1-z&kt.; RUN; DATA Current; /* merge X's with Z's */ MERGE &ind Phoney; RUN; /*Run forward selection on the augmented data set*/ ODS OUTPUT selectionsummary=FSone (KEEP=varentered ProbF); PROC REG DATA=Current; MODEL y = x1-x&kt. z1-z&kt. / SELECTION = FORWARD SLENTRY = 0.99 details = summary; RUN; DATA FSone; SET FSone; Repeat = &loop.; RUN; DATA FSALL; SET FSALL FSone; RUN; %END; ODS LISTING; PROC IML; USE FSALL; READ ALL VAR{varentered ProbF Repeat}; USE Xseq; READ ALL VAR{Xent Xent_a}; /**Define the alpha values, taking values in {0.001, 0.002, ..., &amax}**/; alpha = do(0.001,&amax.,0.001); n = NCOL(alpha); Sp=j(&B.,n,0); *The number of total selected variables (real+pseudo); Up=j(&B.,n,0); *Then number of pseudo variables selected; EKu = j(1,n,0); /*The number of excluded real variables from the model given a alpha-to-enter when runing FS only on the real data */ DO i = 1 TO &B.; Varone = varentered[loc(Repeat=i)]; PV = ProbF[loc(Repeat=i)]; Do k = 1 To n; IF ANY(PV > alpha[k]) THEN Sp[i,k] = loc(PV > alpha[k])[1]; *include intercept; ELSE Sp[i,k] = NROW(PV) + 1; IF Sp[i,k] > 1 THEN Up[i,k] = SUM(UPCASE(SUBSTR(Varone[1:Sp[i,k]-1],1,1))="Z"); END; END; Do k = 1 To n; IF ANY(Xent_a > alpha[k]) THEN EKu[k] = &KT. - loc(Xent_a > alpha[k])[1] + 1; ELSE EKu[k] = NROW(Xent_a); END; Sp_ave = Sp[+,]/&B.; Up_ave = Up[+,]/&B.; *Method based on estimating rRE by ratio of expectation; rRE = (EKu#Up_ave/&KT.)/(Sp_ave-Up_ave); pos = loc(rRE<=&fsr.); alpha_RE = alpha[pos[NCOL(pos)]]; IF ANY(Xent_a > alpha_RE) THEN KI_RE = loc(Xent_a > alpha_RE)[1] - 1; ELSE KI_RE = NROW(Xent_a); IF KI_RE > 0 THEN Variable_Select = Xent[1:KI_RE]; ELSE Variable_Select = "NULL"; Print "The Selection Results By the FSR Method"; print "Method Based on Estimating rRE"; print alpha_RE KI_RE Variable_Select; *Method based on estimating rER by expectation of ratio; rER = (EKu#Up_ave/&KT.)/(1+ &KT. - EKu); pos = loc(rER<=&fsr.); alpha_ER = alpha[pos[NCOL(pos)]]; IF ANY(Xent_a > alpha_ER) THEN KI_ER = loc(Xent_a > alpha_ER)[1] - 1; ELSE KI_ER = NROW(Xent_a); IF KI_ER > 0 THEN Variable_Select = Xent[1:KI_ER]; ELSE Variable_Select = "NULL"; print "Method Based on Estimating rER"; print alpha_ER KI_ER Variable_Select; outp = t(alpha)||t(rRE)||t(rER); CREATE fsr from outp; APPEND FROM outp; quit; DATA fsr; set fsr; RENAME COL1 = alpha COL2=rRE COL3 = rER; RUN; *Draw the plots of rRE and rER; symbol1 i=j v=none c=red l=1 width=1; symbol2 i=j v=none c=blue l=20 width=1; axis1 label=( f=swissb h=1.1 j=c c=black a=0 r=0 'alpha') offset=(0,0) value=(h=1) major=(h=1); axis2 label=(f=swissb h=1.1 j=c c=black a =90 "The Estimated FSR") offset = (0,0) major=(h=1) value=(h=1); legend across=1 offset=(3,-2) label=none value=(h=1.3 f=swiss ) frame position=(top inside left) mode=share; proc gplot data=fsr; plot rRE*alpha rER*alpha/overlay haxis=axis1 vaxis=axis2 legend=legend; title ' '; run; quit; options notes source; %MEND FSR;