| General Statistics Examples |
This example performs regression with variable selection. Some of the methods used in this example are also used in the REG procedure. Here is the program.
/* Initialization */
/* C,CSAVE the crossproducts matrix */
/* N number of observations */
/* K total number of variables to consider */
/* L number of variables currently in model */
/* IN a 0-1 vector of whether variable is in */
/* B print collects results (L MSE RSQ BETAS ) */
start initial;
n=nrow(x);
k=ncol(x);
k1=k+1;
ik=1:k;
bnames={nparm mse rsquare} ||varnames;
/* Correct by mean, adjust out intercept parameter */
y=y-y[+,]/n; /* correct y by mean */
x=x-repeat(x[+,]/n,n,1); /* correct x by mean */
xpy=x`*y; /* crossproducts */
ypy=y`*y;
xpx=x`*x;
free x y; /* no longer need the data */
/* Save a copy of crossproducts matrix */
csave=(xpx || xpy) // (xpy`|| ypy);
finish initial;
/* Forward method */
start forward;
print / "FORWARD SELECTION METHOD";
free bprint;
c=csave;
in=repeat(0,k,1);
l=0; /* No variables are in */
dfe=n-1;
mse=ypy/dfe;
sprob=0;
do while(sprob<.15 & l<k);
indx=loc(^in); /* where are the variables not in? */
cd=vecdiag(c)[indx,]; /* xpx diagonals */
cb=c[indx,k1]; /* adjusted xpy */
tsqr=cb#cb/(cd#mse); /* squares of t tests */
imax=tsqr[<:>,]; /* location of maximum in indx */
sprob=(1-probt(sqrt(tsqr[imax,]),dfe))*2;
if sprob<.15 then
do; /* if t test significant */
ii=indx[,imax]; /* pick most significant */
run swp; /* routine to sweep */
run bpr; /* routine to collect results */
end;
end;
print bprint[colname=bnames] ;
finish forward;
/* Backward method */
start backward;
print / "BACKWARD ELIMINATION ";
free bprint;
c=csave;
in=repeat(0,k,1);
ii=1:k;
run swp;
run bpr; /* start with all variables in */
sprob=1;
do while(sprob>.15 & l>0);
indx=loc(in); /* where are the variables in? */
cd=vecdiag(c)[indx,]; /* xpx diagonals */
cb=c[indx,k1]; /* bvalues */
tsqr=cb#cb/(cd#mse); /* squares of t tests */
imin=tsqr[>:<,]; /* location of minimum in indx */
sprob=(1-probt(sqrt(tsqr[imin,]),dfe))*2;
if sprob>.15 then
do; /* if t test nonsignificant */
ii=indx[,imin]; /* pick least significant */
run swp; /* routine to sweep in variable */
run bpr; /* routine to collect results */
end;
end;
print bprint[colname=bnames] ;
finish backward;
/* Stepwise method */
start stepwise;
print /"STEPWISE METHOD";
free bprint;
c=csave;
in=repeat(0,k,1);
l=0;
dfe=n-1;
mse=ypy/dfe;
sprob=0;
do while(sprob<.15 & l<k);
indx=loc(^in); /* where are the variables not in? */
nindx=loc(in); /* where are the variables in? */
cd=vecdiag(c)[indx,]; /* xpx diagonals */
cb=c[indx,k1]; /* adjusted xpy */
tsqr=cb#cb/cd/mse; /* squares of t tests */
imax=tsqr[<:>,]; /* location of maximum in indx */
sprob=(1-probt(sqrt(tsqr[imax,]),dfe))*2;
if sprob<.15 then
do; /* if t test significant */
ii=indx[,imax]; /* find index into c */
run swp; /* routine to sweep */
run backstep; /* check if remove any terms */
run bpr; /* routine to collect results */
end;
end;
print bprint[colname=bnames] ;
finish stepwise;
/* Routine to backwards-eliminate for stepwise */
start backstep;
if nrow(nindx)=0 then return;
bprob=1;
do while(bprob>.15 & l<k);
cd=vecdiag(c)[nindx,]; /* xpx diagonals */
cb=c[nindx,k1]; /* bvalues */
tsqr=cb#cb/(cd#mse); /* squares of t tests */
imin=tsqr[>:<,]; /* location of minimum in nindx */
bprob=(1-probt(sqrt(tsqr[imin,]),dfe))*2;
if bprob>.15 then
do;
ii=nindx[,imin];
run swp;
run bpr;
end;
end;
finish backstep;
/* Search all possible models */
start all;
/* Use method of Schatzoff et al. for search technique */
betak=repeat(0,k,k); /* rec. ests. for best l-param model */
msek=repeat(1e50,k,1); /* record best mse per # parms */
rsqk=repeat(0,k,1); /* record best rsquare */
ink=repeat(0,k,k); /* record best set per # parms */
limit=2##k-1; /* number of models to examine */
c=csave;
in=repeat(0,k,1); /* start out with no variables in model */
do kk=1 to limit;
run ztrail; /* find which one to sweep */
run swp; /* sweep it in */
bb=bb//(l||mse||rsq||(c[ik,k1]#in)`);
if mse<msek[l,] then
do; /* was this best for l parms? */
msek[l,]=mse; /* record mse */
rsqk[l,]=rsq; /* record rsquare */
ink[,l]=in; /* record which parms in model */
betak[l,]=(c[ik,k1]#in)`; /* record estimates */
end;
end;
print / "ALL POSSIBLE MODELS" " IN SEARCH ORDER";
print bb[colname=bnames]; free bb;
bprint=ik`||msek||rsqk||betak;
print ,"THE BEST MODEL FOR EACH NUMBER OF PARAMETERS";
print bprint[colname=bnames];
/* Mallows CP plot */
cp=msek#(n-ik`-1)/min(msek)-(n-2#ik`);
cp=ik`||cp;
cpname={"nparm" "cp"};
/* output cp out=cp colname=cpname; */
finish all;
/* Subroutine to find number of */
/* trailing zeros in binary number */
/* on entry: kk is the number to examine */
/* on exit: ii has the result */
start ztrail;
ii=1;
zz=kk;
do while(mod(zz,2)=0);
ii=ii+1;
zz=zz/2;
end;
finish ztrail;
/* Subroutine to sweep in a pivot */
/* on entry: ii has the position(s) to pivot */
/* on exit: in, l, dfe, mse, rsq recalculated */
start swp;
if abs(c[ii,ii])<1e-9 then
do;
print , "FAILURE", c;
stop;
end;
c=sweep(c,ii);
in[ii,]=^in[ii,];
l=sum(in);
dfe=n-1-l;
sse=c[k1,k1];
mse=sse/dfe;
rsq=1-sse/ypy;
finish swp;
/* Subroutine to collect bprint results */
/* on entry: l,mse,rsq, and c set up to collect */
/* on exit: bprint has another row */
start bpr;
bprint=bprint//(l||mse||rsq||(c [ik,k1]#in)`);
finish bpr;
/* Stepwise Methods */
/* After a run to the initial routine, which sets up */
/* the data, four different routines can be called */
/* to do four different model-selection methods. */
start seq;
run initial; /* initialization */
run all; /* all possible models */
run forward; /* foreward selection method */
run backward; /* backward elimination method */
run stepwise; /* stepwise method */
finish seq;
/* Data on physical fitness */
/* These measurements were made on men involved in a physical*/
/* fitness course at N.C. State Univ. The variables are */
/* age (years), weight (kg), oxygen uptake rate (ml per kg */
/* body weight per minute), time to run 1.5 miles (minutes), */
/* heart rate while resting, heart rate while running (same */
/* time oxygen rate measured), and maximum heart rate record-*/
/* ed while running. Certain values of maxpulse were modified*/
/* for consistency. Data courtesy Dr. A. C. Linnerud */
data=
{ 44 89.47 44.609 11.37 62 178 182 ,
40 75.07 45.313 10.07 62 185 185 ,
44 85.84 54.297 8.65 45 156 168 ,
42 68.15 59.571 8.17 40 166 172 ,
38 89.02 49.874 9.22 55 178 180 ,
47 77.45 44.811 11.63 58 176 176 ,
40 75.98 45.681 11.95 70 176 180 ,
43 81.19 49.091 10.85 64 162 170 ,
44 81.42 39.442 13.08 63 174 176 ,
38 81.87 60.055 8.63 48 170 186 ,
44 73.03 50.541 10.13 45 168 168 ,
45 87.66 37.388 14.03 56 186 192 ,
45 66.45 44.754 11.12 51 176 176 ,
47 79.15 47.273 10.60 47 162 164 ,
54 83.12 51.855 10.33 50 166 170 ,
49 81.42 49.156 8.95 44 180 185 ,
51 69.63 40.836 10.95 57 168 172 ,
51 77.91 46.672 10.00 48 162 168 ,
48 91.63 46.774 10.25 48 162 164 ,
49 73.37 50.388 10.08 67 168 168 ,
57 73.37 39.407 12.63 58 174 176 ,
54 79.38 46.080 11.17 62 156 165 ,
52 76.32 45.441 9.63 48 164 166 ,
50 70.87 54.625 8.92 48 146 155 ,
51 67.25 45.118 11.08 48 172 172 ,
54 91.63 39.203 12.88 44 168 172 ,
51 73.71 45.790 10.47 59 186 188 ,
57 59.08 50.545 9.93 49 148 155 ,
49 76.32 48.673 9.40 56 186 188 ,
48 61.24 47.920 11.50 52 170 176 ,
52 82.78 47.467 10.50 53 170 172
};
x=data[,{1 2 4 5 6 7 }];
y=data[,3];
free data;
varnames={age weight runtime rstpulse runpulse maxpulse};
reset fw=8 linesize=90;
run seq;
The results are shown in Output 8.6.1.
Output 8.6.1: Model Selection: Results
Output 8.6.1: (continued)
|
| THE BEST MODEL FOR EACH NUMBER OF PARAMETERS | ||||||||
| NPARM | MSE | RSQUARE | AGE | WEIGHT | RUNTIME | RSTPULSE | RUNPULSE | MAXPULSE |
| 1 | 7.533843 | 0.74338 | 0 | 0 | -3.31056 | 0 | 0 | 0 |
| 2 | 7.168422 | 0.764247 | -0.15037 | 0 | -3.20395 | 0 | 0 | 0 |
| 3 | 5.956692 | 0.811094 | -0.2564 | 0 | -2.82538 | 0 | -0.13091 | 0 |
| 4 | 5.343462 | 0.836818 | -0.19773 | 0 | -2.76758 | 0 | -0.34811 | 0.270513 |
| 5 | 5.176338 | 0.848002 | -0.21962 | -0.0723 | -2.68252 | 0 | -0.3734 | 0.304908 |
| 6 | 5.368247 | 0.848672 | -0.22697 | -0.07418 | -2.62865 | -0.02153 | -0.36963 | 0.303217 |
| FORWARD SELECTION METHOD | ||||||||
| NPARM | MSE | RSQUARE | AGE | WEIGHT | RUNTIME | RSTPULSE | RUNPULSE | MAXPULSE |
| 1 | 7.533843 | 0.74338 | 0 | 0 | -3.31056 | 0 | 0 | 0 |
| 2 | 7.168422 | 0.764247 | -0.15037 | 0 | -3.20395 | 0 | 0 | 0 |
| 3 | 5.956692 | 0.811094 | -0.2564 | 0 | -2.82538 | 0 | -0.13091 | 0 |
| 4 | 5.343462 | 0.836818 | -0.19773 | 0 | -2.76758 | 0 | -0.34811 | 0.270513 |
| BACKWARD ELIMINATION | ||||||||
| NPARM | MSE | RSQUARE | AGE | WEIGHT | RUNTIME | RSTPULSE | RUNPULSE | MAXPULSE |
| 6 | 5.368247 | 0.848672 | -0.22697 | -0.07418 | -2.62865 | -0.02153 | -0.36963 | 0.303217 |
| 5 | 5.176338 | 0.848002 | -0.21962 | -0.0723 | -2.68252 | 0 | -0.3734 | 0.304908 |
| 4 | 5.343462 | 0.836818 | -0.19773 | 0 | -2.76758 | 0 | -0.34811 | 0.270513 |
| STEPWISE METHOD | ||||||||
| NPARM | MSE | RSQUARE | AGE | WEIGHT | RUNTIME | RSTPULSE | RUNPULSE | MAXPULSE |
| 1 | 7.533843 | 0.74338 | 0 | 0 | -3.31056 | 0 | 0 | 0 |
| 2 | 7.168422 | 0.764247 | -0.15037 | 0 | -3.20395 | 0 | 0 | 0 |
| 3 | 5.956692 | 0.811094 | -0.2564 | 0 | -2.82538 | 0 | -0.13091 | 0 |
| 4 | 5.343462 | 0.836818 | -0.19773 | 0 | -2.76758 | 0 | -0.34811 | 0.270513 |
Copyright © 2009 by SAS Institute Inc., Cary, NC, USA. All rights reserved.