This example performs regression with variable selection. Some of the methods used in this example are also used in the REG procedure. Here is the program.
proc iml;
/*-------Initialization-------------------------------*
| c,csave the crossproducts matrix |
| n number of observations |
| k total number of variables to consider |
| l number of variables currently in model |
| in a 0-1 vector of whether variable is in |
| b print collects results (L MSE RSQ BETAS ) |
*----------------------------------------------------*/
start initial;
n=nrow(x); k=ncol(x); k1=k+1; ik=1:k;
bnames={nparm mse rsquare} ||varnames;
/*---correct by mean, adjust out intercept parameter---*/
y=y-y[+,]/n; /* correct y by mean */
x=x-repeat(x[+,]/n,n,1); /* correct x by mean */
xpy=x`*y; /* crossproducts */
ypy=y`*y;
xpx=x`*x;
free x y; /* no longer need the data*/
csave=(xpx || xpy) //
(xpy`|| ypy); /* save copy of crossproducts*/
finish;
/*-----forward method------------------------------------------*/
start forward;
print "FORWARD SELECTION METHOD";
free bprint;
c=csave; in=repeat(0,k,1); L=0; /* no variables are in */
dfe=n-1; mse=ypy/dfe;
sprob=0;
do while(sprob<.15 & l<k);
indx=loc(^in); /* where are the variables not in?*/
cd=vecdiag(c)[indx,]; /* xpx diagonals */
cb=c[indx,k1]; /* adjusted xpy */
tsqr=cb#cb/(cd#mse); /* squares of t tests */
imax=tsqr[<:>,]; /* location of maximum in indx */
sprob=(1-probt(sqrt(tsqr[imax,]),dfe))*2;
if sprob<.15 then do; /* if t-test significant */
ii=indx[,imax]; /* pick most significant */
run swp; /* routine to sweep */
run bpr; /* routine to collect results */
end;
end;
print bprint[colname=bnames] ;
finish;
/*-----backward method----------------------------------------*/
start backward;
print "BACKWARD ELIMINATION ";
free bprint;
c=csave; in=repeat(0,k,1);
ii=1:k; run swp; run bpr; /* start with all variables in*/
sprob=1;
do while(sprob>.15 & L>0);
indx=loc(in); /* where are the variables in? */
cd=vecdiag(c)[indx,]; /* xpx diagonals */
cb=c[indx,k1]; /* bvalues */
tsqr=cb#cb/(cd#mse); /* squares of t tests */
imin=tsqr[>:<,]; /* location of minimum in indx */
sprob=(1-probt(sqrt(tsqr[imin,]),dfe))*2;
if sprob>.15 then do; /* if t-test nonsignificant */
ii=indx[,imin]; /* pick least significant */
run swp; /* routine to sweep in variable*/
run bpr; /* routine to collect results */
end;
end;
print bprint[colname=bnames] ;
finish;
/*-----stepwise method-----------------------------------------*/
start stepwise;
print "STEPWISE METHOD";
free bprint;
c=csave; in=repeat(0,k,1); L=0;
dfe=n-1; mse=ypy/dfe;
sprob=0;
do while(sprob<.15 & L<k);
indx=loc(^in); /* where are the variables not in?*/
nindx=loc(in); /* where are the variables in? */
cd=vecdiag(c)[indx,]; /* xpx diagonals */
cb=c[indx,k1]; /* adjusted xpy */
tsqr=cb#cb/cd/mse; /* squares of t tests */
imax=tsqr[<:>,]; /* location of maximum in indx */
sprob=(1-probt(sqrt(tsqr[imax,]),dfe))*2;
if sprob<.15 then do; /* if t-test significant */
ii=indx[,imax]; /* find index into c */
run swp; /* routine to sweep */
run backstep; /* check if remove any terms */
run bpr; /* routine to collect results */
end;
end;
print bprint[colname=bnames] ;
finish;
/*----routine to backwards-eliminate for stepwise--*/
start backstep;
if nrow(nindx)=0 then return;
bprob=1;
do while(bprob>.15 & L<k);
cd=vecdiag(c)[nindx,]; /* xpx diagonals */
cb=c[nindx,k1]; /* bvalues */
tsqr=cb#cb/(cd#mse); /* squares of t tests */
imin=tsqr[>:<,]; /* location of minimum in nindx*/
bprob=(1-probt(sqrt(tsqr[imin,]),dfe))*2;
if bprob>.15 then do;
ii=nindx[,imin];
run swp;
run bpr;
end;
end;
finish;
/*-----search all possible models----------------------------*/
start all;
/*---use method of schatzoff et al. for search technique--*/
betak=repeat(0,k,k); /* record estimates for best l-param model*/
msek=repeat(1e50,k,1);/* record best mse per # parms */
rsqk=repeat(0,k,1); /* record best rsquare */
ink=repeat(0,k,k); /* record best set per # parms */
limit=2##k-1; /* number of models to examine */
c=csave; in=repeat(0,k,1);/* start out with no variables in model*/
do kk=1 to limit;
run ztrail; /* find which one to sweep */
run swp; /* sweep it in */
bb=bb//(L||mse||rsq||(c[ik,k1]#in)`);
if mse<msek[L,] then do; /* was this best for L parms? */
msek[L,]=mse; /* record mse */
rsqk[L,]=rsq; /* record rsquare */
ink[,L]=in; /* record which parms in model*/
betak[L,]=(c[ik,k1]#in)`;/* record estimates */
end;
end;
print "ALL POSSIBLE MODELS IN SEARCH ORDER";
print bb[colname=bnames]; free bb;
bprint=ik`||msek||rsqk||betak;
print "THE BEST MODEL FOR EACH NUMBER OF PARAMETERS";
print bprint[colname=bnames];
finish;
/*-subroutine to find number of trailing zeros in binary number*/
/* on entry: kk is the number to examine */
/* on exit: ii has the result */
/*-------------------------------------------------------------*/
start ztrail;
ii=1; zz=kk;
do while(mod(zz,2)=0); ii=ii+1; zz=zz/2; end;
finish;
/*-----subroutine to sweep in a pivot--------------------------*/
/* on entry: ii has the position(s) to pivot */
/* on exit: in, L, dfe, mse, rsq recalculated */
/*-------------------------------------------------------------*/
start swp;
if abs(c[ii,ii])<1e-9 then do; print "failure", c;stop;end;
c=sweep(c,ii);
in[ii,]=^in[ii,];
L=sum(in); dfe=n-1-L;
sse=c[k1,k1];
mse=sse/dfe;
rsq=1-sse/ypy;
finish;
/*-----subroutine to collect bprint results--------------------*/
/* on entry: L,mse,rsq, and c set up to collect */
/* on exit: bprint has another row */
/*-------------------------------------------------------------*/
start bpr;
bprint=bprint//(L||mse||rsq||(c[ik,k1]#in)`);
finish;
/*--------------stepwise methods---------------------*/
/* after a call to the initial routine, which sets up*/
/* the data, four different routines can be called */
/* to do four different model-selection methods. */
/*---------------------------------------------------*/
start seq;
run initial; /* initialization */
run all; /* all possible models */
run forward; /* foreward selection method */
run backward; /* backward elimination method*/
run stepwise; /* stepwise method */
finish;
/*------------------------data on physical fitness--------------*
| These measurements were made on men involved in a physical |
| fitness course at N.C.State Univ. The variables are age(years)|
| weight(kg), oxygen uptake rate(ml per kg body weight per |
| minute), time to run 1.5 miles(minutes), heart rate while |
| resting, heart rate while running (same time oxygen rate |
| measured), and maximum heart rate recorded while running. |
| Certain values of maxpulse were modified for consistency. |
| Data courtesy DR. A.C. Linnerud |
*---------------------------------------------------------------*/
data =
{ 44 89.47 44.609 11.37 62 178 182 ,
40 75.07 45.313 10.07 62 185 185 ,
44 85.84 54.297 8.65 45 156 168 ,
42 68.15 59.571 8.17 40 166 172 ,
38 89.02 49.874 9.22 55 178 180 ,
47 77.45 44.811 11.63 58 176 176 ,
40 75.98 45.681 11.95 70 176 180 ,
43 81.19 49.091 10.85 64 162 170 ,
44 81.42 39.442 13.08 63 174 176 ,
38 81.87 60.055 8.63 48 170 186 ,
44 73.03 50.541 10.13 45 168 168 ,
45 87.66 37.388 14.03 56 186 192 ,
45 66.45 44.754 11.12 51 176 176 ,
47 79.15 47.273 10.60 47 162 164 ,
54 83.12 51.855 10.33 50 166 170 ,
49 81.42 49.156 8.95 44 180 185 ,
51 69.63 40.836 10.95 57 168 172 ,
51 77.91 46.672 10.00 48 162 168 ,
48 91.63 46.774 10.25 48 162 164 ,
49 73.37 50.388 10.08 67 168 168 ,
57 73.37 39.407 12.63 58 174 176 ,
54 79.38 46.080 11.17 62 156 165 ,
52 76.32 45.441 9.63 48 164 166 ,
50 70.87 54.625 8.92 48 146 155 ,
51 67.25 45.118 11.08 48 172 172 ,
54 91.63 39.203 12.88 44 168 172 ,
51 73.71 45.790 10.47 59 186 188 ,
57 59.08 50.545 9.93 49 148 155 ,
49 76.32 48.673 9.40 56 186 188 ,
48 61.24 47.920 11.50 52 170 176 ,
52 82.78 47.467 10.50 53 170 172 };
x=data[,{1 2 4 5 6 7 }];
y=data[,3];
free data;
varnames={age weight runtime rstpuls runpuls maxpuls};
reset fw=6 linesize=87;
run seq;