Regression of Subsets of Variables

/****************************************************************/
/*          S A S   S A M P L E   L I B R A R Y                 */
/*                                                              */
/*    NAME: REGDEMO2                                            */
/*   TITLE: Regression of Subsets of Variables                  */
/* PRODUCT: IML                                                 */
/*  SYSTEM: ALL                                                 */
/*    KEYS: MATRIX  REGR    SUGI6                               */
/*   PROCS: IML                                                 */
/*    DATA:                                                     */
/*                                                              */
/* SUPPORT: Rick Wicklin                UPDATE: SEP2013         */
/*     REF:                                                     */
/*    MISC:                                                     */
/*                                                              */
/****************************************************************/


proc iml;
 /*-------Initialization-------------------------------*
 | c,csave the crossproducts matrix                   |
 | n       number of observations                     |
 | k       total number of variables to consider      |
 | l       number of variables currently in model     |
 | in      0-1 vector of whether variable is in model |
 | b       print collects results (L MSE RSQ BETAS )  |
 *----------------------------------------------------*/
start initial;
  n=nrow(x); k=ncol(x); k1=k+1; ik=1:k;
  bnames={nparm mse rsquare} ||varnames;

  /*---correct by mean, adjust out intercept parameter---*/
  y=y-y[+,]/n;                   /* correct y by mean */
  x=x-repeat(x[+,]/n,n,1);       /* correct x by mean */
  xpy=x`*y;                      /* crossproducts     */
  ypy=y`*y;
  xpx=x`*x;
  free x y;                      /* no longer need the data*/
  csave=(xpx || xpy) //
        (xpy`|| ypy);            /* save copy of crossproducts*/
finish;

/*-----forward method------------------------------------------*/
start forward;
  print  "FORWARD SELECTION METHOD";
  free bprint;
  c=csave; in=repeat(0,k,1); L=0;      /* no variables are in */
  dfe=n-1; mse=ypy/dfe;
  sprob=0;

  do while(sprob<.15 & l<k);
     indx=loc(^in);           /* where are the variables not in?*/
     cd=vecdiag(c)[indx,];    /* xpx diagonals                  */
     cb=c[indx,k1];           /* adjusted xpy                   */
     tsqr=cb#cb/(cd#mse);     /* squares of t tests             */
     imax=tsqr[<:>,];         /* location of maximum in indx    */
     sprob=(1-probt(sqrt(tsqr[imax,]),dfe))*2;
     if sprob<.15 then do;    /* if t-test significant          */
        ii=indx[,imax];       /* pick most significant          */
        run swp;              /* routine to sweep               */
        run bpr;              /* routine to collect results     */
     end;
  end;
  print  bprint[colname=bnames] ;
finish;

/*-----backward method----------------------------------------*/
start backward;
  print  "BACKWARD ELIMINATION ";
  free bprint;
  c=csave; in=repeat(0,k,1);
  ii=1:k; run swp; run bpr;       /* start with all variables in*/
  sprob=1;

  do while(sprob>.15 & L>0);
     indx=loc(in);                /* where are the variables in? */
     cd=vecdiag(c)[indx,];        /* xpx diagonals               */
     cb=c[indx,k1];               /* bvalues                     */
     tsqr=cb#cb/(cd#mse);         /* squares of t tests          */
     imin=tsqr[>:<,];             /* location of minimum in indx */
     sprob=(1-probt(sqrt(tsqr[imin,]),dfe))*2;
     if sprob>.15 then do;        /* if t-test nonsignificant    */
        ii=indx[,imin];           /* pick least significant      */
        run swp;                  /* routine to sweep in variable*/
        run bpr;                  /* routine to collect results  */
     end;
  end;
  print  bprint[colname=bnames] ;
finish;

/*-----stepwise method-----------------------------------------*/
start stepwise;
  print "STEPWISE METHOD";
  free bprint;
  c=csave; in=repeat(0,k,1); L=0;
  dfe=n-1; mse=ypy/dfe;
  sprob=0;

  do while(sprob<.15 & L<k);
     indx=loc(^in);           /* where are the variables not in?*/
     nindx=loc(in);           /* where are the variables in?    */
     cd=vecdiag(c)[indx,];    /* xpx diagonals                 */
     cb=c[indx,k1];           /* adjusted xpy                  */
     tsqr=cb#cb/cd/mse;       /* squares of t tests            */
     imax=tsqr[<:>,];         /* location of maximum in indx   */
     sprob=(1-probt(sqrt(tsqr[imax,]),dfe))*2;
     if sprob<.15 then do;    /* if t-test significant         */
        ii=indx[,imax];       /* find index into c             */
        run swp;              /* routine to sweep              */
        run backstep;         /* check if remove any terms     */
        run bpr;              /* routine to collect results    */
     end;
  end;
  print  bprint[colname=bnames] ;
finish;

/*----routine to backwards-eliminate for stepwise--*/
start backstep;
  if nrow(nindx)=0 then return;
  bprob=1;
  do while(bprob>.15 & L<k);
     cd=vecdiag(c)[nindx,];      /* xpx diagonals         */
     cb=c[nindx,k1];             /* bvalues               */
     tsqr=cb#cb/(cd#mse);        /* squares of t tests    */
     imin=tsqr[>:<,];            /* location of minimum in nindx*/
     bprob=(1-probt(sqrt(tsqr[imin,]),dfe))*2;
     if bprob>.15 then do;
        ii=nindx[,imin];
        run swp;
        run bpr;
     end;
  end;
finish;

/*-----search all possible models----------------------------*/
start all;
   /*---use method of Schatzoff et al. for search technique--*/
   betak=repeat(0,k,k); /* record estimates for best l-param model*/
   msek=repeat(1e50,k,1);/* record best mse per # parms           */
   rsqk=repeat(0,k,1);  /* record best rsquare                    */
   ink=repeat(0,k,k);   /* record best set per # parms            */
   limit=2##k-1;        /* number of models to examine            */
   c=csave; in=repeat(0,k,1);  /* start with no variables in model*/

   do kk=1 to limit;
      run ztrail;                  /* find which one to sweep    */
      run swp;                     /* sweep it in                */
      bb=bb//(L||mse||rsq||(c[ik,k1]#in)`);
      if mse<msek[L,] then do;     /* was this best for L parms? */
         msek[L,]=mse;             /* record mse                 */
         rsqk[L,]=rsq;             /* record rsquare             */
         ink[,L]=in;               /* record which parms in model*/
         betak[L,]=(c[ik,k1]#in)`; /* record estimates           */
      end;
   end;

   print "ALL POSSIBLE MODELS IN SEARCH ORDER";
   print bb[colname=bnames]; free bb;

   bprint=ik`||msek||rsqk||betak;
   print "THE BEST MODEL FOR EACH NUMBER OF PARAMETERS";
   print bprint[colname=bnames];
finish;

/*-subroutine to find number of trailing zeros in binary number*/
/* on entry: kk is the number to examine                       */
/* on exit:  ii has the result                                 */
/*-------------------------------------------------------------*/
start ztrail;
   ii=1; zz=kk;
   do while(mod(zz,2)=0); ii=ii+1; zz=zz/2; end;
finish;

/*-----subroutine to sweep in a pivot--------------------------*/
/* on entry: ii has the position(s) to pivot                   */
/* on exit:  in, L, dfe, mse, rsq recalculated                 */
/*-------------------------------------------------------------*/
start swp;
   if abs(c[ii,ii])<1e-9 then do; print  "failure", c;stop;end;
   c=sweep(c,ii);
   in[ii,]=^in[ii,];
   L=sum(in); dfe=n-1-L;
   sse=c[k1,k1];
   mse=sse/dfe;
   rsq=1-sse/ypy;
finish;

/*-----subroutine to collect bprint results--------------------*/
/* on entry: L,mse,rsq, and c set up to collect                */
/* on exit:  bprint has another row                            */
/*-------------------------------------------------------------*/
start bpr;
   bprint=bprint//(L||mse||rsq||(c[ik,k1]#in)`);
finish;

/*--------------stepwise methods---------------------*/
/* after a call to the initial routine, which sets up*/
/* the data, four different routines can be called   */
/* to do four different model-selection methods.     */
/*---------------------------------------------------*/
start seq;
   run initial;          /* initialization             */
   run all;              /* all possible models        */
   run forward;          /* forward selection method   */
   run backward;         /* backward elimination method*/
   run stepwise;         /* stepwise method            */
finish;


/*------------------------data on physical fitness--------------*
| These measurements were made on men involved in a physical     |
| fitness course at N.C.State Univ.  The variables are age(years)|
| weight(kg), oxygen uptake rate(ml per kg body weight per       |
| minute), time to run 1.5 miles(minutes), heart rate while      |
| resting, heart rate while running (same time oxygen rate       |
| measured), and maximum heart rate recorded while running.      |
| Certain values of maxpulse were modified for consistency.      |
|   Data courtesy Dr. A.C. Linnerud                              |
*---------------------------------------------------------------*/
data =
   { 44 89.47  44.609 11.37 62 178 182  ,
     40 75.07  45.313 10.07 62 185 185  ,
     44 85.84  54.297  8.65 45 156 168  ,
     42 68.15  59.571  8.17 40 166 172  ,
     38 89.02  49.874  9.22 55 178 180  ,
     47 77.45  44.811 11.63 58 176 176  ,
     40 75.98  45.681 11.95 70 176 180  ,
     43 81.19  49.091 10.85 64 162 170  ,
     44 81.42  39.442 13.08 63 174 176  ,
     38 81.87  60.055  8.63 48 170 186  ,
     44 73.03  50.541 10.13 45 168 168  ,
     45 87.66  37.388 14.03 56 186 192  ,
     45 66.45  44.754 11.12 51 176 176  ,
     47 79.15  47.273 10.60 47 162 164  ,
     54 83.12  51.855 10.33 50 166 170  ,
     49 81.42  49.156  8.95 44 180 185  ,
     51 69.63  40.836 10.95 57 168 172  ,
     51 77.91  46.672 10.00 48 162 168  ,
     48 91.63  46.774 10.25 48 162 164  ,
     49 73.37  50.388 10.08 67 168 168  ,
     57 73.37  39.407 12.63 58 174 176  ,
     54 79.38  46.080 11.17 62 156 165  ,
     52 76.32  45.441  9.63 48 164 166  ,
     50 70.87  54.625  8.92 48 146 155  ,
     51 67.25  45.118 11.08 48 172 172  ,
     54 91.63  39.203 12.88 44 168 172  ,
     51 73.71  45.790 10.47 59 186 188  ,
     57 59.08  50.545  9.93 49 148 155  ,
     49 76.32  48.673  9.40 56 186 188  ,
     48 61.24  47.920 11.50 52 170 176  ,
     52 82.78  47.467 10.50 53 170 172  };
y=data[,3];
x=data[,{1 2 4 5 6 7 }];
free data;
varnames={age weight runtime rstpuls runpuls maxpuls};
reset fw=6 linesize=87;
run seq;

proc iml;
/*          Quadratic Response Surface Regression            */
/* This matrix routine reads in the factor variables and     */
/* the response, forms the quadratic regression model and    */
/* estimates the parameters, and then solves for the optimal */
/* response, prints the optimal factors and response, and    */
/* displays the eigenvalues and eigenvectors of the          */
/* matrix of quadratic parameter estimates to determine if   */
/* the solution is a maximum or minimum, or saddlepoint, and */
/* which direction has the steepest and gentlest slopes.     */
/*                                                           */
/* Given:                                                    */
/* d contains the factor variables                           */
/* y contains the response variable                          */
/*                                                           */

start rsm(d, y);
   n=nrow(d);
   k=ncol(d);                                  /* dimensions */
   x=j(n,1,1) || d;                  /* set up design matrix */
   do i=1 to k;                     /* add quadratic effects */
      x = x || d[,i] #d[,1:i];
   end;
   beta=solve(x`*x, x`*y);            /* estimate parameters */
   names = "b0":("b"+strip(char(nrow(beta)-1)));
   print beta[rowname=names label="Parameter Estimates"];

   c=beta[1];                          /* intercept estimate */
   b=beta[2:(k+1)];                      /* linear estimates */
   a=j(k,k,0);
   L=k+1;                     /* form quadratics into matrix */
   do i=1 to k;
      do j=1 to i;
         L=L+1;
         a[i,j]=beta [L,];
      end;
   end;
   a=(a+a`)/2;                                 /* symmetrize */
   xx = -0.5*solve(a,b);         /* solve for critical value */
   print xx[label="Critical Factor Values"];

   /* Compute response at critical value */
   yopt=c + b`*xx + xx`*a*xx;
   print yopt[label="Response at Critical Value"];

   call eigen(eval,evec,a);
   if min(eval)>0 then print "Solution Is a Minimum";
   if max(eval)<0 then print "Solution Is a Maximum";
finish rsm;

/* Sample Problem with Two Factors */
d = {-1 -1, -1  0, -1  1,
      0 -1,  0  0,  0  1,
      1 -1,  1  0,  1  1};
y = {71.7, 75.2, 76.3, 79.2, 81.5, 80.2, 80.1, 79.1, 75.8};
run rsm(d,y);