Example 9.6 Regression of Subsets of Variables

This example performs regression with variable selection. Some of the methods used in this example are also used in the REG procedure. Here is the program.

proc iml;

 /*-------Initialization-------------------------------*
 | c,csave the crossproducts matrix                   |
 | n       number of observations                     |
 | k       total number of variables to consider      |
 | l       number of variables currently in model     |
 | in      a 0-1 vector of whether variable is in     |
 | b       print collects results (L MSE RSQ BETAS )  |
 *----------------------------------------------------*/
start initial;
  n=nrow(x); k=ncol(x); k1=k+1; ik=1:k;
  bnames={nparm mse rsquare} ||varnames;

  /*---correct by mean, adjust out intercept parameter---*/
  y=y-y[+,]/n;                   /* correct y by mean */
  x=x-repeat(x[+,]/n,n,1);       /* correct x by mean */
  xpy=x`*y;                      /* crossproducts     */
  ypy=y`*y;
  xpx=x`*x;
  free x y;                      /* no longer need the data*/
  csave=(xpx || xpy) //
        (xpy`|| ypy);            /* save copy of crossproducts*/
finish;


 /*-----forward method------------------------------------------*/
start forward;
  print  "FORWARD SELECTION METHOD";
  free bprint;
  c=csave; in=repeat(0,k,1); L=0;      /* no variables are in */
  dfe=n-1; mse=ypy/dfe;
  sprob=0;

  do while(sprob<.15 & l<k);
     indx=loc(^in);           /* where are the variables not in?*/
     cd=vecdiag(c)[indx,];    /* xpx diagonals                  */
     cb=c[indx,k1];           /* adjusted xpy                   */
     tsqr=cb#cb/(cd#mse);     /* squares of t tests             */
     imax=tsqr[<:>,];         /* location of maximum in indx    */
     sprob=(1-probt(sqrt(tsqr[imax,]),dfe))*2;
     if sprob<.15 then do;    /* if t-test significant          */
        ii=indx[,imax];       /* pick most significant          */
        run swp;              /* routine to sweep               */
        run bpr;              /* routine to collect results     */
     end;
  end;

  print  bprint[colname=bnames] ;
  finish;



 /*-----backward method----------------------------------------*/
start backward;
  print  "BACKWARD ELIMINATION ";
  free bprint;
  c=csave; in=repeat(0,k,1);
  ii=1:k; run swp; run bpr;       /* start with all variables in*/
  sprob=1;

  do while(sprob>.15 & L>0);
     indx=loc(in);                /* where are the variables in? */
     cd=vecdiag(c)[indx,];        /* xpx diagonals               */
     cb=c[indx,k1];               /* bvalues                     */
     tsqr=cb#cb/(cd#mse);         /* squares of t tests          */
     imin=tsqr[>:<,];             /* location of minimum in indx */
     sprob=(1-probt(sqrt(tsqr[imin,]),dfe))*2;
     if sprob>.15 then do;        /* if t-test nonsignificant    */
        ii=indx[,imin];           /* pick least significant      */
        run swp;                  /* routine to sweep in variable*/
        run bpr;                  /* routine to collect results  */
     end;
  end;

  print  bprint[colname=bnames] ;
  finish;



 /*-----stepwise method-----------------------------------------*/
start stepwise;
  print "STEPWISE METHOD";
  free bprint;
  c=csave; in=repeat(0,k,1); L=0;
  dfe=n-1; mse=ypy/dfe;
  sprob=0;

  do while(sprob<.15 & L<k);
     indx=loc(^in);           /* where are the variables not in?*/
     nindx=loc(in);           /* where are the variables in?    */
     cd=vecdiag(c)[indx,];    /* xpx diagonals                 */
     cb=c[indx,k1];           /* adjusted xpy                  */
     tsqr=cb#cb/cd/mse;       /* squares of t tests            */
     imax=tsqr[<:>,];         /* location of maximum in indx   */
     sprob=(1-probt(sqrt(tsqr[imax,]),dfe))*2;
     if sprob<.15 then do;    /* if t-test significant         */
        ii=indx[,imax];       /* find index into c             */
        run swp;              /* routine to sweep              */
        run backstep;         /* check if remove any terms     */
        run bpr;              /* routine to collect results    */
     end;
  end;

  print  bprint[colname=bnames] ;
  finish;


 /*----routine to backwards-eliminate for stepwise--*/
start backstep;
  if nrow(nindx)=0 then return;
  bprob=1;
  do while(bprob>.15 & L<k);
     cd=vecdiag(c)[nindx,];      /* xpx diagonals         */
     cb=c[nindx,k1];             /* bvalues               */
     tsqr=cb#cb/(cd#mse);        /* squares of t tests    */
     imin=tsqr[>:<,];            /* location of minimum in nindx*/
     bprob=(1-probt(sqrt(tsqr[imin,]),dfe))*2;
     if bprob>.15 then do;
        ii=nindx[,imin];
        run swp;
        run bpr;
     end;
  end;
finish;


 /*-----search all possible models----------------------------*/
start all;
     /*---use method of schatzoff et al. for search technique--*/
  betak=repeat(0,k,k); /* record estimates for best l-param model*/
  msek=repeat(1e50,k,1);/* record best mse per # parms            */
  rsqk=repeat(0,k,1);  /* record best rsquare                    */
  ink=repeat(0,k,k);   /* record best set per # parms            */
  limit=2##k-1;        /* number of models to examine            */

c=csave; in=repeat(0,k,1);/* start out with no variables in model*/

     do kk=1 to limit;
        run ztrail;              /* find which one to sweep    */
        run swp;                 /* sweep it in                */
        bb=bb//(L||mse||rsq||(c[ik,k1]#in)`);
        if mse<msek[L,] then do; /* was this best for L parms? */
           msek[L,]=mse;         /* record mse                 */
           rsqk[L,]=rsq;         /* record rsquare             */
           ink[,L]=in;           /* record which parms in model*/
           betak[L,]=(c[ik,k1]#in)`;/* record estimates        */
        end;
     end;

     print "ALL POSSIBLE MODELS IN SEARCH ORDER";
     print bb[colname=bnames]; free bb;

     bprint=ik`||msek||rsqk||betak;
     print "THE BEST MODEL FOR EACH NUMBER OF PARAMETERS";
     print bprint[colname=bnames];
finish;


 /*-subroutine to find number of trailing zeros in binary number*/
 /* on entry: kk is the number to examine                       */
 /* on exit:  ii has the result                                 */
 /*-------------------------------------------------------------*/
start ztrail;
    ii=1; zz=kk;
    do while(mod(zz,2)=0); ii=ii+1; zz=zz/2; end;
finish;


 /*-----subroutine to sweep in a pivot--------------------------*/
 /* on entry: ii has the position(s) to pivot                   */
 /* on exit:  in, L, dfe, mse, rsq recalculated                 */
 /*-------------------------------------------------------------*/
start swp;
     if abs(c[ii,ii])<1e-9 then do; print  "failure", c;stop;end;
     c=sweep(c,ii);
     in[ii,]=^in[ii,];
     L=sum(in); dfe=n-1-L;
     sse=c[k1,k1];
     mse=sse/dfe;
     rsq=1-sse/ypy;
finish;


 /*-----subroutine to collect bprint results--------------------*/
 /* on entry: L,mse,rsq, and c set up to collect                */
 /* on exit:  bprint has another row                            */
 /*-------------------------------------------------------------*/
start bpr;
   bprint=bprint//(L||mse||rsq||(c[ik,k1]#in)`);
finish;



 /*--------------stepwise methods---------------------*/
 /* after a call to the initial routine, which sets up*/
 /* the data, four different routines can be called   */
 /* to do four different model-selection methods.     */
 /*---------------------------------------------------*/
start seq;
  run initial;          /* initialization             */
  run all;              /* all possible models        */
  run forward;          /* foreward selection method  */
  run backward;         /* backward elimination method*/
  run stepwise;         /* stepwise method            */
finish;


 /*------------------------data on physical fitness--------------*
| These measurements were made on men involved in a physical     |
| fitness course at N.C.State Univ.  The variables are age(years)|
| weight(kg), oxygen uptake rate(ml per kg body weight per       |
| minute), time to run 1.5 miles(minutes), heart rate while      |
| resting, heart rate while running (same time oxygen rate       |
| measured), and maximum heart rate recorded while running.      |
| Certain values of maxpulse were modified for consistency.      |
|   Data courtesy DR. A.C. Linnerud                              |
*---------------------------------------------------------------*/
data =
   { 44 89.47  44.609 11.37 62 178 182  ,
     40 75.07  45.313 10.07 62 185 185  ,
     44 85.84  54.297  8.65 45 156 168  ,
     42 68.15  59.571  8.17 40 166 172  ,
     38 89.02  49.874  9.22 55 178 180  ,
     47 77.45  44.811 11.63 58 176 176  ,
     40 75.98  45.681 11.95 70 176 180  ,
     43 81.19  49.091 10.85 64 162 170  ,
     44 81.42  39.442 13.08 63 174 176  ,
     38 81.87  60.055  8.63 48 170 186  ,
     44 73.03  50.541 10.13 45 168 168  ,
     45 87.66  37.388 14.03 56 186 192  ,
     45 66.45  44.754 11.12 51 176 176  ,
     47 79.15  47.273 10.60 47 162 164  ,
     54 83.12  51.855 10.33 50 166 170  ,
     49 81.42  49.156  8.95 44 180 185  ,
     51 69.63  40.836 10.95 57 168 172  ,
     51 77.91  46.672 10.00 48 162 168  ,
     48 91.63  46.774 10.25 48 162 164  ,
     49 73.37  50.388 10.08 67 168 168  ,
     57 73.37  39.407 12.63 58 174 176  ,
     54 79.38  46.080 11.17 62 156 165  ,
     52 76.32  45.441  9.63 48 164 166  ,
     50 70.87  54.625  8.92 48 146 155  ,
     51 67.25  45.118 11.08 48 172 172  ,
     54 91.63  39.203 12.88 44 168 172  ,
     51 73.71  45.790 10.47 59 186 188  ,
     57 59.08  50.545  9.93 49 148 155  ,
     49 76.32  48.673  9.40 56 186 188  ,
     48 61.24  47.920 11.50 52 170 176  ,
     52 82.78  47.467 10.50 53 170 172  };

x=data[,{1 2 4 5 6 7 }];
y=data[,3];
free data;
varnames={age weight runtime rstpuls runpuls maxpuls};
reset fw=6 linesize=87;
run seq;

The results are shown in Output 9.6.1.

Output 9.6.1: Model Selection: Results

ALL POSSIBLE MODELS IN SEARCH ORDER

bb
NPARM MSE RSQUARE AGE WEIGHT RUNTIME RSTPULS RUNPULS MAXPULS
1 26.634 0.0928 -0.311 0 0 0 0 0
2 25.826 0.1506 -0.37 -0.158 0 0 0 0
1 28.58 0.0265 0 -0.104 0 0 0 0
2 7.7556 0.7449 0 -0.025 -3.289 0 0 0
3 7.2263 0.7708 -0.174 -0.054 -3.14 0 0 0
2 7.1684 0.7642 -0.15 0 -3.204 0 0 0
1 7.5338 0.7434 0 0 -3.311 0 0 0
2 7.7983 0.7435 0 0 -3.287 -0.01 0 0
3 7.3361 0.7673 -0.168 0 -3.079 -0.045 0 0
4 7.3666 0.775 -0.196 -0.059 -2.989 -0.053 0 0
3 8.0373 0.7451 0 -0.026 -3.263 -0.01 0 0
2 24.915 0.1806 0 -0.093 0 -0.275 0 0
3 20.28 0.3568 -0.447 -0.156 0 -0.322 0 0
2 21.276 0.3003 -0.389 0 0 -0.323 0 0
1 24.676 0.1595 0 0 0 -0.279 0 0

...<rows skipped>...

THE BEST MODEL FOR EACH NUMBER OF PARAMETERS

bprint
NPARM MSE RSQUARE AGE WEIGHT RUNTIME RSTPULS RUNPULS MAXPULS
1 7.5338 0.7434 0 0 -3.311 0 0 0
2 7.1684 0.7642 -0.15 0 -3.204 0 0 0
3 5.9567 0.8111 -0.256 0 -2.825 0 -0.131 0
4 5.3435 0.8368 -0.198 0 -2.768 0 -0.348 0.2705
5 5.1763 0.848 -0.22 -0.072 -2.683 0 -0.373 0.3049
6 5.3682 0.8487 -0.227 -0.074 -2.629 -0.022 -0.37 0.3032

FORWARD SELECTION METHOD

bprint
NPARM MSE RSQUARE AGE WEIGHT RUNTIME RSTPULS RUNPULS MAXPULS
1 7.5338 0.7434 0 0 -3.311 0 0 0
2 7.1684 0.7642 -0.15 0 -3.204 0 0 0
3 5.9567 0.8111 -0.256 0 -2.825 0 -0.131 0
4 5.3435 0.8368 -0.198 0 -2.768 0 -0.348 0.2705

BACKWARD ELIMINATION

bprint
NPARM MSE RSQUARE AGE WEIGHT RUNTIME RSTPULS RUNPULS MAXPULS
6 5.3682 0.8487 -0.227 -0.074 -2.629 -0.022 -0.37 0.3032
5 5.1763 0.848 -0.22 -0.072 -2.683 0 -0.373 0.3049
4 5.3435 0.8368 -0.198 0 -2.768 0 -0.348 0.2705

STEPWISE METHOD

bprint
NPARM MSE RSQUARE AGE WEIGHT RUNTIME RSTPULS RUNPULS MAXPULS
1 7.5338 0.7434 0 0 -3.311 0 0 0
2 7.1684 0.7642 -0.15 0 -3.204 0 0 0
3 5.9567 0.8111 -0.256 0 -2.825 0 -0.131 0
4 5.3435 0.8368 -0.198 0 -2.768 0 -0.348 0.2705