Example 9.6 Regression of Subsets of Variables

This example performs regression along with variable selection. Some of the methods used in this example are also used in the REG procedure in SAS/STAT software. The GLMSELECT procedure implements these and other variable selection techniques.

To simplify communication between modules, the modules in this example do not take any arguments. This means that the modules do not use a local symbol table: all variables are defined at the main scope of the program. In this programming technique, modules are used to organize the algorithm and, potentially, to enable code reuse.

proc iml;
 /*-------Initialization-------------------------------*
 | c,csave the crossproducts matrix                   |
 | n       number of observations                     |
 | k       total number of variables to consider      |
 | l       number of variables currently in model     |
 | in      0-1 vector of whether variable is in model |
 | b       print collects results (L MSE RSQ BETAS )  |
 *----------------------------------------------------*/
start initial;
  n=nrow(x); k=ncol(x); k1=k+1; ik=1:k;
  bnames={nparm mse rsquare} ||varnames;

  /*---correct by mean, adjust out intercept parameter---*/
  y=y-y[+,]/n;                   /* correct y by mean */
  x=x-repeat(x[+,]/n,n,1);       /* correct x by mean */
  xpy=x`*y;                      /* crossproducts     */
  ypy=y`*y;
  xpx=x`*x;
  free x y;                      /* no longer need the data*/
  csave=(xpx || xpy) //
        (xpy`|| ypy);            /* save copy of crossproducts*/
finish;

/*-----forward method------------------------------------------*/
start forward;
  print  "FORWARD SELECTION METHOD";
  free bprint;
  c=csave; in=repeat(0,k,1); L=0;      /* no variables are in */
  dfe=n-1; mse=ypy/dfe;
  sprob=0;

  do while(sprob<.15 & l<k);
     indx=loc(^in);           /* where are the variables not in?*/
     cd=vecdiag(c)[indx,];    /* xpx diagonals                  */
     cb=c[indx,k1];           /* adjusted xpy                   */
     tsqr=cb#cb/(cd#mse);     /* squares of t tests             */
     imax=tsqr[<:>,];         /* location of maximum in indx    */
     sprob=(1-probt(sqrt(tsqr[imax,]),dfe))*2;
     if sprob<.15 then do;    /* if t-test significant          */
        ii=indx[,imax];       /* pick most significant          */
        run swp;              /* routine to sweep               */
        run bpr;              /* routine to collect results     */
     end;
  end;
  print  bprint[colname=bnames] ;
finish;

/*-----backward method----------------------------------------*/
start backward;
  print  "BACKWARD ELIMINATION ";
  free bprint;
  c=csave; in=repeat(0,k,1);
  ii=1:k; run swp; run bpr;       /* start with all variables in*/
  sprob=1;

  do while(sprob>.15 & L>0);
     indx=loc(in);                /* where are the variables in? */
     cd=vecdiag(c)[indx,];        /* xpx diagonals               */
     cb=c[indx,k1];               /* bvalues                     */
     tsqr=cb#cb/(cd#mse);         /* squares of t tests          */
     imin=tsqr[>:<,];             /* location of minimum in indx */
     sprob=(1-probt(sqrt(tsqr[imin,]),dfe))*2;
     if sprob>.15 then do;        /* if t-test nonsignificant    */
        ii=indx[,imin];           /* pick least significant      */
        run swp;                  /* routine to sweep in variable*/
        run bpr;                  /* routine to collect results  */
     end;
  end;
  print  bprint[colname=bnames] ;
finish;

/*-----stepwise method-----------------------------------------*/
start stepwise;
  print "STEPWISE METHOD";
  free bprint;
  c=csave; in=repeat(0,k,1); L=0;
  dfe=n-1; mse=ypy/dfe;
  sprob=0;

  do while(sprob<.15 & L<k);
     indx=loc(^in);           /* where are the variables not in?*/
     nindx=loc(in);           /* where are the variables in?    */
     cd=vecdiag(c)[indx,];    /* xpx diagonals                 */
     cb=c[indx,k1];           /* adjusted xpy                  */
     tsqr=cb#cb/cd/mse;       /* squares of t tests            */
     imax=tsqr[<:>,];         /* location of maximum in indx   */
     sprob=(1-probt(sqrt(tsqr[imax,]),dfe))*2;
     if sprob<.15 then do;    /* if t-test significant         */
        ii=indx[,imax];       /* find index into c             */
        run swp;              /* routine to sweep              */
        run backstep;         /* check if remove any terms     */
        run bpr;              /* routine to collect results    */
     end;
  end;
  print  bprint[colname=bnames] ;
finish;

/*----routine to backwards-eliminate for stepwise--*/
start backstep;
  if nrow(nindx)=0 then return;
  bprob=1;
  do while(bprob>.15 & L<k);
     cd=vecdiag(c)[nindx,];      /* xpx diagonals         */
     cb=c[nindx,k1];             /* bvalues               */
     tsqr=cb#cb/(cd#mse);        /* squares of t tests    */
     imin=tsqr[>:<,];            /* location of minimum in nindx*/
     bprob=(1-probt(sqrt(tsqr[imin,]),dfe))*2;
     if bprob>.15 then do;
        ii=nindx[,imin];
        run swp;
        run bpr;
     end;
  end;
finish;

/*-----search all possible models----------------------------*/
start all;
   /*---use method of Schatzoff et al. for search technique--*/
   betak=repeat(0,k,k); /* record estimates for best l-param model*/
   msek=repeat(1e50,k,1);/* record best mse per # parms           */
   rsqk=repeat(0,k,1);  /* record best rsquare                    */
   ink=repeat(0,k,k);   /* record best set per # parms            */
   limit=2##k-1;        /* number of models to examine            */
   c=csave; in=repeat(0,k,1);  /* start with no variables in model*/

   do kk=1 to limit;
      run ztrail;                  /* find which one to sweep    */
      run swp;                     /* sweep it in                */
      bb=bb//(L||mse||rsq||(c[ik,k1]#in)`);
      if mse<msek[L,] then do;     /* was this best for L parms? */
         msek[L,]=mse;             /* record mse                 */
         rsqk[L,]=rsq;             /* record rsquare             */
         ink[,L]=in;               /* record which parms in model*/
         betak[L,]=(c[ik,k1]#in)`; /* record estimates           */
      end;
   end;

   print "ALL POSSIBLE MODELS IN SEARCH ORDER";
   print bb[colname=bnames]; free bb;

   bprint=ik`||msek||rsqk||betak;
   print "THE BEST MODEL FOR EACH NUMBER OF PARAMETERS";
   print bprint[colname=bnames];
finish;

/*-subroutine to find number of trailing zeros in binary number*/
/* on entry: kk is the number to examine                       */
/* on exit:  ii has the result                                 */
/*-------------------------------------------------------------*/
start ztrail;
   ii=1; zz=kk;
   do while(mod(zz,2)=0); ii=ii+1; zz=zz/2; end;
finish;

/*-----subroutine to sweep in a pivot--------------------------*/
/* on entry: ii has the position(s) to pivot                   */
/* on exit:  in, L, dfe, mse, rsq recalculated                 */
/*-------------------------------------------------------------*/
start swp;
   if abs(c[ii,ii])<1e-9 then do; print  "failure", c;stop;end;
   c=sweep(c,ii);
   in[ii,]=^in[ii,];
   L=sum(in); dfe=n-1-L;
   sse=c[k1,k1];
   mse=sse/dfe;
   rsq=1-sse/ypy;
finish;

/*-----subroutine to collect bprint results--------------------*/
/* on entry: L,mse,rsq, and c set up to collect                */
/* on exit:  bprint has another row                            */
/*-------------------------------------------------------------*/
start bpr;
   bprint=bprint//(L||mse||rsq||(c[ik,k1]#in)`);
finish;

/*--------------stepwise methods---------------------*/
/* after a call to the initial routine, which sets up*/
/* the data, four different routines can be called   */
/* to do four different model-selection methods.     */
/*---------------------------------------------------*/
start seq;
   run initial;          /* initialization             */
   run all;              /* all possible models        */
   run forward;          /* forward selection method   */
   run backward;         /* backward elimination method*/
   run stepwise;         /* stepwise method            */
finish;

The following statements call the SEQ module, which in turn calls modules that perform all-subset regression, forward selection, backward selection, and stepwise selection. The results are shown in Output 9.6.1.


/*------------------------data on physical fitness--------------*
| These measurements were made on men involved in a physical     |
| fitness course at N.C.State Univ.  The variables are age(years)|
| weight(kg), oxygen uptake rate(ml per kg body weight per       |
| minute), time to run 1.5 miles(minutes), heart rate while      |
| resting, heart rate while running (same time oxygen rate       |
| measured), and maximum heart rate recorded while running.      |
| Certain values of maxpulse were modified for consistency.      |
|   Data courtesy Dr. A.C. Linnerud                              |
*---------------------------------------------------------------*/
data =
   { 44 89.47  44.609 11.37 62 178 182  ,
     40 75.07  45.313 10.07 62 185 185  ,
     44 85.84  54.297  8.65 45 156 168  ,
     42 68.15  59.571  8.17 40 166 172  ,
     38 89.02  49.874  9.22 55 178 180  ,
     47 77.45  44.811 11.63 58 176 176  ,
     40 75.98  45.681 11.95 70 176 180  ,
     43 81.19  49.091 10.85 64 162 170  ,
     44 81.42  39.442 13.08 63 174 176  ,
     38 81.87  60.055  8.63 48 170 186  ,
     44 73.03  50.541 10.13 45 168 168  ,
     45 87.66  37.388 14.03 56 186 192  ,
     45 66.45  44.754 11.12 51 176 176  ,
     47 79.15  47.273 10.60 47 162 164  ,
     54 83.12  51.855 10.33 50 166 170  ,
     49 81.42  49.156  8.95 44 180 185  ,
     51 69.63  40.836 10.95 57 168 172  ,
     51 77.91  46.672 10.00 48 162 168  ,
     48 91.63  46.774 10.25 48 162 164  ,
     49 73.37  50.388 10.08 67 168 168  ,
     57 73.37  39.407 12.63 58 174 176  ,
     54 79.38  46.080 11.17 62 156 165  ,
     52 76.32  45.441  9.63 48 164 166  ,
     50 70.87  54.625  8.92 48 146 155  ,
     51 67.25  45.118 11.08 48 172 172  ,
     54 91.63  39.203 12.88 44 168 172  ,
     51 73.71  45.790 10.47 59 186 188  ,
     57 59.08  50.545  9.93 49 148 155  ,
     49 76.32  48.673  9.40 56 186 188  ,
     48 61.24  47.920 11.50 52 170 176  ,
     52 82.78  47.467 10.50 53 170 172  };
y=data[,3];
x=data[,{1 2 4 5 6 7 }];
free data;
varnames={age weight runtime rstpuls runpuls maxpuls};
reset fw=6 linesize=87;
run seq;

Output 9.6.1: Model Selection: Results

Initial Probability Estimates
0.7837838 0.1486486 0.0675676
0.7425743 0.1881188 0.0693069
0.6712329 0.1917808 0.1369863
0.6987952 0.2048193 0.0963855
0.5 0.2727273 0.2272727
0.5844156 0.2857143 0.1298701
0.3488372 0.3023256 0.3488372
0.4936709 0.278481 0.2278481
0.1212121 0.3636364 0.5151515
0.1785714 0.5357143 0.2857143

GSK Estimates
beta stderr
0.9454429 0.1290925
0.4003259 0.1284867
-0.277777 0.1164699
-0.278472 0.1255916
1.4146936 0.267351
0.474136 0.294943
0.8464701 0.2362639
0.1526095 0.2633051
0.1952395 0.2214436
0.0723489 0.2366597
-0.514488 0.2171995
-0.400831 0.2285779

pr
0.7402867 0.1674472
0.7704057 0.1745023
0.6624811 0.1917744
0.7061615 0.2047033
0.516981 0.2648871
0.5697446 0.2923278
0.3988695 0.2589096
0.4667924 0.3034204
0.1320359 0.3958019
0.1651907 0.4958784

ML Estimates
beta stderr
0.9533597 0.1286179
0.4069338 0.1284592
-0.279081 0.1156222
-0.280699 0.1252816
1.4423195 0.2669357
0.4993123 0.2943437
0.8411595 0.2363089
0.1485875 0.2635159
0.1883383 0.2202755
0.0667313 0.236031
-0.527163 0.216581
-0.414965 0.2299618

pr
0.7431759 0.1673155
0.7723266 0.1744421
0.6627266 0.1916645
0.7062766 0.2049216
0.5170782 0.2646857
0.5697771 0.292607
0.3984205 0.2576653
0.4666825 0.3027898
0.1323243 0.3963114
0.165475 0.4972044

Iterations Criterion
3 0.0004092