General Statistics Examples |
This example performs regression with variable selection. Some of the methods used in this example are also used in the REG procedure. Here is the program.
/* Initialization */ /* C,CSAVE the crossproducts matrix */ /* N number of observations */ /* K total number of variables to consider */ /* L number of variables currently in model */ /* IN a 0-1 vector of whether variable is in */ /* B print collects results (L MSE RSQ BETAS ) */ start initial; n=nrow(x); k=ncol(x); k1=k+1; ik=1:k; bnames={nparm mse rsquare} ||varnames; /* Correct by mean, adjust out intercept parameter */ y=y-y[+,]/n; /* correct y by mean */ x=x-repeat(x[+,]/n,n,1); /* correct x by mean */ xpy=x`*y; /* crossproducts */ ypy=y`*y; xpx=x`*x; free x y; /* no longer need the data */ /* Save a copy of crossproducts matrix */ csave=(xpx || xpy) // (xpy`|| ypy); finish initial;
/* Forward method */ start forward; print / "FORWARD SELECTION METHOD"; free bprint; c=csave; in=repeat(0,k,1); l=0; /* No variables are in */ dfe=n-1; mse=ypy/dfe; sprob=0; do while(sprob<.15 & l<k); indx=loc(^in); /* where are the variables not in? */ cd=vecdiag(c)[indx,]; /* xpx diagonals */ cb=c[indx,k1]; /* adjusted xpy */ tsqr=cb#cb/(cd#mse); /* squares of t tests */ imax=tsqr[<:>,]; /* location of maximum in indx */ sprob=(1-probt(sqrt(tsqr[imax,]),dfe))*2; if sprob<.15 then do; /* if t test significant */ ii=indx[,imax]; /* pick most significant */ run swp; /* routine to sweep */ run bpr; /* routine to collect results */ end; end; print bprint[colname=bnames] ; finish forward; /* Backward method */ start backward; print / "BACKWARD ELIMINATION "; free bprint; c=csave; in=repeat(0,k,1); ii=1:k; run swp; run bpr; /* start with all variables in */ sprob=1; do while(sprob>.15 & l>0); indx=loc(in); /* where are the variables in? */ cd=vecdiag(c)[indx,]; /* xpx diagonals */ cb=c[indx,k1]; /* bvalues */ tsqr=cb#cb/(cd#mse); /* squares of t tests */ imin=tsqr[>:<,]; /* location of minimum in indx */ sprob=(1-probt(sqrt(tsqr[imin,]),dfe))*2; if sprob>.15 then do; /* if t test nonsignificant */ ii=indx[,imin]; /* pick least significant */ run swp; /* routine to sweep in variable */ run bpr; /* routine to collect results */ end; end; print bprint[colname=bnames] ; finish backward;
/* Stepwise method */ start stepwise; print /"STEPWISE METHOD"; free bprint; c=csave; in=repeat(0,k,1); l=0; dfe=n-1; mse=ypy/dfe; sprob=0; do while(sprob<.15 & l<k); indx=loc(^in); /* where are the variables not in? */ nindx=loc(in); /* where are the variables in? */ cd=vecdiag(c)[indx,]; /* xpx diagonals */ cb=c[indx,k1]; /* adjusted xpy */ tsqr=cb#cb/cd/mse; /* squares of t tests */ imax=tsqr[<:>,]; /* location of maximum in indx */ sprob=(1-probt(sqrt(tsqr[imax,]),dfe))*2; if sprob<.15 then do; /* if t test significant */ ii=indx[,imax]; /* find index into c */ run swp; /* routine to sweep */ run backstep; /* check if remove any terms */ run bpr; /* routine to collect results */ end; end; print bprint[colname=bnames] ; finish stepwise; /* Routine to backwards-eliminate for stepwise */ start backstep; if nrow(nindx)=0 then return; bprob=1; do while(bprob>.15 & l<k); cd=vecdiag(c)[nindx,]; /* xpx diagonals */ cb=c[nindx,k1]; /* bvalues */ tsqr=cb#cb/(cd#mse); /* squares of t tests */ imin=tsqr[>:<,]; /* location of minimum in nindx */ bprob=(1-probt(sqrt(tsqr[imin,]),dfe))*2; if bprob>.15 then do; ii=nindx[,imin]; run swp; run bpr; end; end; finish backstep;
/* Search all possible models */ start all; /* Use method of Schatzoff et al. for search technique */ betak=repeat(0,k,k); /* rec. ests. for best l-param model */ msek=repeat(1e50,k,1); /* record best mse per # parms */ rsqk=repeat(0,k,1); /* record best rsquare */ ink=repeat(0,k,k); /* record best set per # parms */ limit=2##k-1; /* number of models to examine */ c=csave; in=repeat(0,k,1); /* start out with no variables in model */ do kk=1 to limit; run ztrail; /* find which one to sweep */ run swp; /* sweep it in */ bb=bb//(l||mse||rsq||(c[ik,k1]#in)`); if mse<msek[l,] then do; /* was this best for l parms? */ msek[l,]=mse; /* record mse */ rsqk[l,]=rsq; /* record rsquare */ ink[,l]=in; /* record which parms in model */ betak[l,]=(c[ik,k1]#in)`; /* record estimates */ end; end; print / "ALL POSSIBLE MODELS" " IN SEARCH ORDER"; print bb[colname=bnames]; free bb; bprint=ik`||msek||rsqk||betak; print ,"THE BEST MODEL FOR EACH NUMBER OF PARAMETERS"; print bprint[colname=bnames]; /* Mallows CP plot */ cp=msek#(n-ik`-1)/min(msek)-(n-2#ik`); cp=ik`||cp; cpname={"nparm" "cp"}; /* output cp out=cp colname=cpname; */ finish all; /* Subroutine to find number of */ /* trailing zeros in binary number */ /* on entry: kk is the number to examine */ /* on exit: ii has the result */ start ztrail; ii=1; zz=kk; do while(mod(zz,2)=0); ii=ii+1; zz=zz/2; end; finish ztrail;
/* Subroutine to sweep in a pivot */ /* on entry: ii has the position(s) to pivot */ /* on exit: in, l, dfe, mse, rsq recalculated */ start swp; if abs(c[ii,ii])<1e-9 then do; print , "FAILURE", c; stop; end; c=sweep(c,ii); in[ii,]=^in[ii,]; l=sum(in); dfe=n-1-l; sse=c[k1,k1]; mse=sse/dfe; rsq=1-sse/ypy; finish swp; /* Subroutine to collect bprint results */ /* on entry: l,mse,rsq, and c set up to collect */ /* on exit: bprint has another row */ start bpr; bprint=bprint//(l||mse||rsq||(c [ik,k1]#in)`); finish bpr; /* Stepwise Methods */ /* After a run to the initial routine, which sets up */ /* the data, four different routines can be called */ /* to do four different model-selection methods. */ start seq; run initial; /* initialization */ run all; /* all possible models */ run forward; /* foreward selection method */ run backward; /* backward elimination method */ run stepwise; /* stepwise method */ finish seq;
/* Data on physical fitness */ /* These measurements were made on men involved in a physical*/ /* fitness course at N.C. State Univ. The variables are */ /* age (years), weight (kg), oxygen uptake rate (ml per kg */ /* body weight per minute), time to run 1.5 miles (minutes), */ /* heart rate while resting, heart rate while running (same */ /* time oxygen rate measured), and maximum heart rate record-*/ /* ed while running. Certain values of maxpulse were modified*/ /* for consistency. Data courtesy Dr. A. C. Linnerud */ data= { 44 89.47 44.609 11.37 62 178 182 , 40 75.07 45.313 10.07 62 185 185 , 44 85.84 54.297 8.65 45 156 168 , 42 68.15 59.571 8.17 40 166 172 , 38 89.02 49.874 9.22 55 178 180 , 47 77.45 44.811 11.63 58 176 176 , 40 75.98 45.681 11.95 70 176 180 , 43 81.19 49.091 10.85 64 162 170 , 44 81.42 39.442 13.08 63 174 176 , 38 81.87 60.055 8.63 48 170 186 , 44 73.03 50.541 10.13 45 168 168 , 45 87.66 37.388 14.03 56 186 192 , 45 66.45 44.754 11.12 51 176 176 , 47 79.15 47.273 10.60 47 162 164 , 54 83.12 51.855 10.33 50 166 170 , 49 81.42 49.156 8.95 44 180 185 , 51 69.63 40.836 10.95 57 168 172 , 51 77.91 46.672 10.00 48 162 168 , 48 91.63 46.774 10.25 48 162 164 , 49 73.37 50.388 10.08 67 168 168 , 57 73.37 39.407 12.63 58 174 176 , 54 79.38 46.080 11.17 62 156 165 , 52 76.32 45.441 9.63 48 164 166 , 50 70.87 54.625 8.92 48 146 155 , 51 67.25 45.118 11.08 48 172 172 , 54 91.63 39.203 12.88 44 168 172 , 51 73.71 45.790 10.47 59 186 188 , 57 59.08 50.545 9.93 49 148 155 , 49 76.32 48.673 9.40 56 186 188 , 48 61.24 47.920 11.50 52 170 176 , 52 82.78 47.467 10.50 53 170 172 }; x=data[,{1 2 4 5 6 7 }]; y=data[,3]; free data; varnames={age weight runtime rstpulse runpulse maxpulse}; reset fw=8 linesize=90; run seq;The results are shown in Output 8.6.1. Output 8.6.1: Model Selection: Results
Output 8.6.1: (continued)
|
THE BEST MODEL FOR EACH NUMBER OF PARAMETERS | ||||||||
NPARM | MSE | RSQUARE | AGE | WEIGHT | RUNTIME | RSTPULSE | RUNPULSE | MAXPULSE |
1 | 7.533843 | 0.74338 | 0 | 0 | -3.31056 | 0 | 0 | 0 |
2 | 7.168422 | 0.764247 | -0.15037 | 0 | -3.20395 | 0 | 0 | 0 |
3 | 5.956692 | 0.811094 | -0.2564 | 0 | -2.82538 | 0 | -0.13091 | 0 |
4 | 5.343462 | 0.836818 | -0.19773 | 0 | -2.76758 | 0 | -0.34811 | 0.270513 |
5 | 5.176338 | 0.848002 | -0.21962 | -0.0723 | -2.68252 | 0 | -0.3734 | 0.304908 |
6 | 5.368247 | 0.848672 | -0.22697 | -0.07418 | -2.62865 | -0.02153 | -0.36963 | 0.303217 |
FORWARD SELECTION METHOD | ||||||||
NPARM | MSE | RSQUARE | AGE | WEIGHT | RUNTIME | RSTPULSE | RUNPULSE | MAXPULSE |
1 | 7.533843 | 0.74338 | 0 | 0 | -3.31056 | 0 | 0 | 0 |
2 | 7.168422 | 0.764247 | -0.15037 | 0 | -3.20395 | 0 | 0 | 0 |
3 | 5.956692 | 0.811094 | -0.2564 | 0 | -2.82538 | 0 | -0.13091 | 0 |
4 | 5.343462 | 0.836818 | -0.19773 | 0 | -2.76758 | 0 | -0.34811 | 0.270513 |
BACKWARD ELIMINATION | ||||||||
NPARM | MSE | RSQUARE | AGE | WEIGHT | RUNTIME | RSTPULSE | RUNPULSE | MAXPULSE |
6 | 5.368247 | 0.848672 | -0.22697 | -0.07418 | -2.62865 | -0.02153 | -0.36963 | 0.303217 |
5 | 5.176338 | 0.848002 | -0.21962 | -0.0723 | -2.68252 | 0 | -0.3734 | 0.304908 |
4 | 5.343462 | 0.836818 | -0.19773 | 0 | -2.76758 | 0 | -0.34811 | 0.270513 |
STEPWISE METHOD | ||||||||
NPARM | MSE | RSQUARE | AGE | WEIGHT | RUNTIME | RSTPULSE | RUNPULSE | MAXPULSE |
1 | 7.533843 | 0.74338 | 0 | 0 | -3.31056 | 0 | 0 | 0 |
2 | 7.168422 | 0.764247 | -0.15037 | 0 | -3.20395 | 0 | 0 | 0 |
3 | 5.956692 | 0.811094 | -0.2564 | 0 | -2.82538 | 0 | -0.13091 | 0 |
4 | 5.343462 | 0.836818 | -0.19773 | 0 | -2.76758 | 0 | -0.34811 | 0.270513 |
Copyright © 2009 by SAS Institute Inc., Cary, NC, USA. All rights reserved.