General Statistics Examples

Example 8.6: Regression of Subsets of Variables

This example performs regression with variable selection. Some of the methods used in this example are also used in the REG procedure. Here is the program.

  
       /*       Initialization                               */ 
       /* C,CSAVE the crossproducts matrix                   */ 
       /* N       number of observations                     */ 
       /* K       total number of variables to consider      */ 
       /* L       number of variables currently in model     */ 
       /* IN      a 0-1 vector of whether variable is in     */ 
       /* B       print collects results (L MSE RSQ BETAS )  */ 
  
    start initial; 
       n=nrow(x); 
       k=ncol(x); 
       k1=k+1; 
       ik=1:k; 
       bnames={nparm mse rsquare} ||varnames; 
  
          /* Correct by mean, adjust out intercept parameter */ 
       y=y-y[+,]/n;                         /* correct y by mean */ 
       x=x-repeat(x[+,]/n,n,1);             /* correct x by mean */ 
       xpy=x`*y;                                /* crossproducts */ 
       ypy=y`*y; 
       xpx=x`*x; 
       free x y;                      /* no longer need the data */ 
  
          /* Save a copy of crossproducts matrix */ 
       csave=(xpx || xpy) // (xpy`|| ypy); 
    finish initial;
 

  
       /* Forward method */ 
    start forward; 
      print / "FORWARD SELECTION METHOD"; 
      free bprint; 
      c=csave; 
      in=repeat(0,k,1); 
      l=0;                                /* No variables are in */ 
      dfe=n-1; 
      mse=ypy/dfe; 
      sprob=0; 
      do while(sprob<.15 & l<k); 
         indx=loc(^in);       /* where are the variables not in? */ 
         cd=vecdiag(c)[indx,];                  /* xpx diagonals */ 
         cb=c[indx,k1];                          /* adjusted xpy */ 
         tsqr=cb#cb/(cd#mse);              /* squares of t tests */ 
         imax=tsqr[<:>,];         /* location of maximum in indx */ 
         sprob=(1-probt(sqrt(tsqr[imax,]),dfe))*2; 
         if sprob<.15 then 
  
         do;                            /* if t test significant */ 
            ii=indx[,imax];             /* pick most significant */ 
            run swp;                         /* routine to sweep */ 
            run bpr;               /* routine to collect results */ 
         end; 
      end; 
      print  bprint[colname=bnames] ; 
    finish forward; 
  
       /* Backward method */ 
    start backward; 
       print / "BACKWARD ELIMINATION "; 
       free bprint; 
       c=csave; 
       in=repeat(0,k,1); 
       ii=1:k; 
       run swp; 
       run bpr;                   /* start with all variables in */ 
       sprob=1; 
       do while(sprob>.15 & l>0); 
         indx=loc(in);            /* where are the variables in? */ 
         cd=vecdiag(c)[indx,];                  /* xpx diagonals */ 
         cb=c[indx,k1];                               /* bvalues */ 
         tsqr=cb#cb/(cd#mse);              /* squares of t tests */ 
         imin=tsqr[>:<,];         /* location of minimum in indx */ 
         sprob=(1-probt(sqrt(tsqr[imin,]),dfe))*2; 
         if sprob>.15 then 
         do;                         /* if t test nonsignificant */ 
            ii=indx[,imin];            /* pick least significant */ 
            run swp;             /* routine to sweep in variable */ 
            run bpr;               /* routine to collect results */ 
         end; 
       end; 
       print  bprint[colname=bnames] ; 
    finish backward;
 

  
       /* Stepwise method */ 
    start stepwise; 
       print /"STEPWISE METHOD"; 
       free bprint; 
       c=csave; 
       in=repeat(0,k,1); 
       l=0; 
       dfe=n-1; 
       mse=ypy/dfe; 
       sprob=0; 
  
       do while(sprob<.15 & l<k); 
         indx=loc(^in);       /* where are the variables not in? */ 
         nindx=loc(in);           /* where are the variables in? */ 
         cd=vecdiag(c)[indx,];                  /* xpx diagonals */ 
         cb=c[indx,k1];                          /* adjusted xpy */ 
         tsqr=cb#cb/cd/mse;                /* squares of t tests */ 
         imax=tsqr[<:>,];         /* location of maximum in indx */ 
         sprob=(1-probt(sqrt(tsqr[imax,]),dfe))*2; 
         if sprob<.15 then 
         do;                            /* if t test significant */ 
            ii=indx[,imax];                 /* find index into c */ 
            run swp;                         /* routine to sweep */ 
            run backstep;           /* check if remove any terms */ 
            run bpr;               /* routine to collect results */ 
            end; 
       end; 
       print  bprint[colname=bnames] ; 
    finish stepwise; 
  
       /* Routine to backwards-eliminate for stepwise  */ 
    start backstep; 
       if nrow(nindx)=0 then return; 
       bprob=1; 
       do while(bprob>.15 & l<k); 
          cd=vecdiag(c)[nindx,];                /* xpx diagonals */ 
          cb=c[nindx,k1];                             /* bvalues */ 
          tsqr=cb#cb/(cd#mse);             /* squares of t tests */ 
          imin=tsqr[>:<,];       /* location of minimum in nindx */ 
          bprob=(1-probt(sqrt(tsqr[imin,]),dfe))*2; 
          if bprob>.15 then 
          do; 
             ii=nindx[,imin]; 
             run swp; 
             run bpr; 
          end; 
       end; 
    finish backstep;
 

  
       /* Search all possible models */ 
    start all; 
  
       /* Use method of Schatzoff et al. for search technique */ 
       betak=repeat(0,k,k); /* rec. ests. for best l-param model */ 
       msek=repeat(1e50,k,1);     /* record best mse per # parms */ 
       rsqk=repeat(0,k,1);                /* record best rsquare */ 
       ink=repeat(0,k,k);         /* record best set per # parms */ 
       limit=2##k-1;              /* number of models to examine */ 
       c=csave; 
       in=repeat(0,k,1); /* start out with no variables in model */ 
  
       do kk=1 to limit; 
          run ztrail;                 /* find which one to sweep */ 
          run swp;                                /* sweep it in */ 
          bb=bb//(l||mse||rsq||(c[ik,k1]#in)`); 
          if mse<msek[l,] then 
          do;                      /* was this best for l parms? */ 
               msek[l,]=mse;                       /* record mse */ 
               rsqk[l,]=rsq;                   /* record rsquare */ 
               ink[,l]=in;        /* record which parms in model */ 
               betak[l,]=(c[ik,k1]#in)`;     /* record estimates */ 
          end; 
       end; 
       print / "ALL POSSIBLE MODELS" " IN SEARCH ORDER"; 
       print bb[colname=bnames]; free bb; 
       bprint=ik`||msek||rsqk||betak; 
       print ,"THE BEST MODEL FOR EACH NUMBER OF PARAMETERS"; 
       print bprint[colname=bnames]; 
          /* Mallows CP plot */ 
       cp=msek#(n-ik`-1)/min(msek)-(n-2#ik`); 
       cp=ik`||cp; 
       cpname={"nparm" "cp"}; 
          /* output cp out=cp colname=cpname; */ 
    finish all; 
  
       /*  Subroutine to find number of           */ 
       /*  trailing zeros in binary number        */ 
       /*  on entry: kk is the number to examine  */ 
       /*  on exit:  ii has the result            */ 
    start ztrail; 
        ii=1; 
        zz=kk; 
        do while(mod(zz,2)=0); 
           ii=ii+1; 
           zz=zz/2; 
        end; 
    finish ztrail;
 

  
       /* Subroutine to sweep in a pivot              */ 
       /* on entry: ii has the position(s) to pivot   */ 
       /* on exit:  in, l, dfe, mse, rsq recalculated */ 
    start swp; 
       if abs(c[ii,ii])<1e-9 then 
       do; 
          print , "FAILURE", c; 
          stop; 
       end; 
       c=sweep(c,ii); 
       in[ii,]=^in[ii,]; 
       l=sum(in); 
       dfe=n-1-l; 
       sse=c[k1,k1]; 
       mse=sse/dfe; 
       rsq=1-sse/ypy; 
    finish swp; 
  
  
       /* Subroutine to collect bprint results                   */ 
       /* on entry: l,mse,rsq, and c set up to collect           */ 
       /* on exit:  bprint has another row                       */ 
  
    start bpr; 
       bprint=bprint//(l||mse||rsq||(c [ik,k1]#in)`); 
    finish bpr; 
  
       /*              Stepwise Methods                          */ 
       /* After a run to the initial routine, which sets up      */ 
       /* the data, four different routines can be called        */ 
       /* to do four different model-selection methods.          */ 
  
    start seq; 
       run initial;          /* initialization                   */ 
       run all;              /* all possible models              */ 
       run forward;          /* foreward selection method        */ 
       run backward;         /* backward elimination method      */ 
       run stepwise;         /* stepwise method                  */ 
    finish seq;
 

  
  
    /*              Data on physical fitness                     */ 
    /* These measurements were made on men involved in a physical*/ 
    /* fitness course at N.C. State Univ.  The variables are     */ 
    /* age (years), weight (kg), oxygen uptake rate (ml per kg   */ 
    /* body weight per minute), time to run 1.5 miles (minutes), */ 
    /* heart rate while resting, heart rate while running (same  */ 
    /* time oxygen rate measured), and maximum heart rate record-*/ 
    /* ed while running. Certain values of maxpulse were modified*/ 
    /* for consistency.        Data courtesy Dr. A. C. Linnerud  */ 
  
    data= 
       { 44 89.47  44.609 11.37 62 178 182  , 
         40 75.07  45.313 10.07 62 185 185  , 
         44 85.84  54.297  8.65 45 156 168  , 
         42 68.15  59.571  8.17 40 166 172  , 
         38 89.02  49.874  9.22 55 178 180  , 
         47 77.45  44.811 11.63 58 176 176  , 
         40 75.98  45.681 11.95 70 176 180  , 
         43 81.19  49.091 10.85 64 162 170  , 
         44 81.42  39.442 13.08 63 174 176  , 
         38 81.87  60.055  8.63 48 170 186  , 
         44 73.03  50.541 10.13 45 168 168  , 
         45 87.66  37.388 14.03 56 186 192  , 
         45 66.45  44.754 11.12 51 176 176  , 
         47 79.15  47.273 10.60 47 162 164  , 
         54 83.12  51.855 10.33 50 166 170  , 
         49 81.42  49.156  8.95 44 180 185  , 
         51 69.63  40.836 10.95 57 168 172  , 
         51 77.91  46.672 10.00 48 162 168  , 
         48 91.63  46.774 10.25 48 162 164  , 
         49 73.37  50.388 10.08 67 168 168  , 
         57 73.37  39.407 12.63 58 174 176  , 
         54 79.38  46.080 11.17 62 156 165  , 
         52 76.32  45.441  9.63 48 164 166  , 
         50 70.87  54.625  8.92 48 146 155  , 
         51 67.25  45.118 11.08 48 172 172  , 
         54 91.63  39.203 12.88 44 168 172  , 
         51 73.71  45.790 10.47 59 186 188  , 
         57 59.08  50.545  9.93 49 148 155  , 
         49 76.32  48.673  9.40 56 186 188  , 
         48 61.24  47.920 11.50 52 170 176  , 
         52 82.78  47.467 10.50 53 170 172 
       }; 
  
    x=data[,{1 2 4 5 6 7 }]; 
    y=data[,3]; 
    free data; 
    varnames={age weight runtime rstpulse runpulse maxpulse}; 
    reset fw=8 linesize=90; 
    run seq;
 
The results are shown in Output 8.6.1.

Output 8.6.1: Model Selection: Results

ALL POSSIBLE MODELS IN SEARCH ORDER
NPARM MSE RSQUARE AGE WEIGHT RUNTIME RSTPULSE RUNPULSE MAXPULSE
1 26.63425 0.092777 -0.31136 0 0 0 0 0
2 25.82619 0.150635 -0.37042 -0.15823 0 0 0 0
1 28.58034 0.026488 0 -0.1041 0 0 0 0
2 7.755636 0.744935 0 -0.02548 -3.2886 0 0 0
3 7.226318 0.770831 -0.17388 -0.05444 -3.14039 0 0 0
2 7.168422 0.764247 -0.15037 0 -3.20395 0 0 0
1 7.533843 0.74338 0 0 -3.31056 0 0 0
2 7.798261 0.743533 0 0 -3.28661 -0.00968 0 0
3 7.336089 0.767349 -0.16755 0 -3.07925 -0.04549 0 0
4 7.366649 0.775033 -0.19603 -0.05915 -2.9889 -0.05326 0 0
3 8.037314 0.745111 0 -0.02569 -3.26268 -0.01041 0 0
2 24.91487 0.180607 0 -0.09305 0 -0.27474 0 0
3 20.28031 0.356847 -0.44698 -0.15647 0 -0.32186 0 0
2 21.27632 0.30027 -0.38882 0 0 -0.32286 0 0
1 24.67582 0.159485 0 0 0 -0.27921 0 0
2 23.26003 0.235031 0 0 0 -0.20684 -0.15262 0
3 16.81799 0.466648 -0.52338 0 0 -0.22524 -0.23769 0
4 16.26146 0.503398 -0.56317 -0.12697 0 -0.22981 -0.2246 0
3 23.81815 0.244651 0 -0.06381 0 -0.20843 -0.14279 0
4 7.785151 0.762252 0 -0.01231 -3.16759 0.016669 -0.0749 0
5 6.213174 0.817556 -0.28528 -0.05184 -2.70392 -0.02711 -0.12628 0
4 6.166944 0.81167 -0.26213 0 -2.77733 -0.01981 -0.12874 0
3 7.507972 0.761898 0 0 -3.17665 0.017616 -0.07658 0
2 7.254263 0.761424 0 0 -3.14019 0 -0.07351 0
3 5.956692 0.811094 -0.2564 0 -2.82538 0 -0.13091 0
4 6.009033 0.816493 -0.27642 -0.04932 -2.77237 0 -0.12932 0
3 7.510162 0.761829 0 -0.01315 -3.13261 0 -0.07189 0
2 25.333 0.166855 0 -0.05987 0 0 -0.19797 0
3 18.63184 0.409126 -0.54408 -0.12049 0 0 -0.28248 0
2 18.97378 0.375995 -0.50665 0 0 0 -0.29382 0
1 24.70817 0.158383 0 0 0 0 -0.2068 0
2 21.60626 0.289419 0 0 0 0 -0.6818 0.571538
3 18.21725 0.422273 -0.4214 0 0 0 -0.57966 0.361557
4 17.29877 0.47172 -0.45243 -0.14944 0 0 -0.61723 0.426862
3 21.41763 0.320779 0 -0.11815 0 0 -0.71745 0.635395
4 6.030105 0.815849 0 -0.05159 -2.9255 0 -0.39529 0.38537
5 5.176338 0.848002 -0.21962 -0.0723 -2.68252 0 -0.3734 0.304908
4 5.343462 0.836818 -0.19773 0 -2.76758 0 -0.34811 0.270513
3 5.991568 0.809988 0 0 -2.97019 0 -0.37511 0.354219
4 6.208523 0.8104 0 0 -3.00426 0.016412 -0.37778 0.353998
5 5.549941 0.837031 -0.20154 0 -2.7386 -0.01208 -0.34562 0.269064
6 5.368247 0.848672 -0.22697 -0.07418 -2.62865 -0.02153 -0.36963 0.303217
5 6.263348 0.816083 0 -0.05091 -2.95182 0.01239 -0.39704 0.384793
4 20.11235 0.385797 0 -0.1194 0 -0.19092 -0.64584 0.609632
5 15.1864 0.554066 -0.47923 -0.1527 0 -0.21555 -0.53045 0.385424
4 16.29247 0.502451 -0.44717 0 0 -0.21266 -0.49323 0.319267
3 20.37729 0.353772 0 0 0 -0.18993 -0.61019 0.545236
2 25.11456 0.174039 0 0 0 -0.25219 0 -0.07364
3 19.2347 0.390007 -0.52736 0 0 -0.26492 0 -0.20024
4 18.80875 0.425607 -0.55881 -0.12604 0 -0.27056 0 -0.17799
3 25.59719 0.188232 0 -0.07874 0 -0.25524 0 -0.05502
4 8.311496 0.746179 0 -0.02053 -3.25232 -0.00393 0 -0.02064
5 7.19584 0.788701 -0.25795 -0.04936 -2.86147 -0.04121 0 -0.08153
4 7.091611 0.783432 -0.23928 0 -2.92597 -0.0339 0 -0.08777
3 8.033673 0.745227 0 0 -3.26805 -0.00193 0 -0.02526
2 7.746932 0.745221 0 0 -3.27232 0 0 -0.02561
3 6.882626 0.78173 -0.22923 0 -3.01222 0 0 -0.09094
4 7.00018 0.786224 -0.24436 -0.04525 -2.97011 0 0 -0.08585
3 8.00441 0.746155 0 -0.02027 -3.26114 0 0 -0.02139
2 28.35356 0.067516 0 -0.07074 0 0 0 -0.12159
3 22.38148 0.290212 -0.54076 -0.11605 0 0 0 -0.24445
2 22.50135 0.259982 -0.5121 0 0 0 0 -0.2637
1 27.71259 0.056046 0 0 0 0 0 -0.13762



Output 8.6.1: (continued)

THE BEST MODEL FOR EACH NUMBER OF PARAMETERS
NPARM MSE RSQUARE AGE WEIGHT RUNTIME RSTPULSE RUNPULSE MAXPULSE
1 7.533843 0.74338 0 0 -3.31056 0 0 0
2 7.168422 0.764247 -0.15037 0 -3.20395 0 0 0
3 5.956692 0.811094 -0.2564 0 -2.82538 0 -0.13091 0
4 5.343462 0.836818 -0.19773 0 -2.76758 0 -0.34811 0.270513
5 5.176338 0.848002 -0.21962 -0.0723 -2.68252 0 -0.3734 0.304908
6 5.368247 0.848672 -0.22697 -0.07418 -2.62865 -0.02153 -0.36963 0.303217

FORWARD SELECTION METHOD
NPARM MSE RSQUARE AGE WEIGHT RUNTIME RSTPULSE RUNPULSE MAXPULSE
1 7.533843 0.74338 0 0 -3.31056 0 0 0
2 7.168422 0.764247 -0.15037 0 -3.20395 0 0 0
3 5.956692 0.811094 -0.2564 0 -2.82538 0 -0.13091 0
4 5.343462 0.836818 -0.19773 0 -2.76758 0 -0.34811 0.270513

BACKWARD ELIMINATION
NPARM MSE RSQUARE AGE WEIGHT RUNTIME RSTPULSE RUNPULSE MAXPULSE
6 5.368247 0.848672 -0.22697 -0.07418 -2.62865 -0.02153 -0.36963 0.303217
5 5.176338 0.848002 -0.21962 -0.0723 -2.68252 0 -0.3734 0.304908
4 5.343462 0.836818 -0.19773 0 -2.76758 0 -0.34811 0.270513

STEPWISE METHOD
NPARM MSE RSQUARE AGE WEIGHT RUNTIME RSTPULSE RUNPULSE MAXPULSE
1 7.533843 0.74338 0 0 -3.31056 0 0 0
2 7.168422 0.764247 -0.15037 0 -3.20395 0 0 0
3 5.956692 0.811094 -0.2564 0 -2.82538 0 -0.13091 0
4 5.343462 0.836818 -0.19773 0 -2.76758 0 -0.34811 0.270513



Previous Page | Next Page | Top of Page