## Regression of Subsets of Variables

```/****************************************************************/
/*          S A S   S A M P L E   L I B R A R Y                 */
/*                                                              */
/*    NAME: REGDEMO2                                            */
/*   TITLE: Regression of Subsets of Variables                  */
/* PRODUCT: IML                                                 */
/*  SYSTEM: ALL                                                 */
/*    KEYS: MATRIX  REGR    SUGI6                               */
/*   PROCS: IML                                                 */
/*    DATA:                                                     */
/*                                                              */
/* SUPPORT: Rick Wicklin                UPDATE: SEP2013         */
/*     REF:                                                     */
/*    MISC:                                                     */
/*                                                              */
/****************************************************************/

proc iml;
/*-------Initialization-------------------------------*
| c,csave the crossproducts matrix                   |
| n       number of observations                     |
| k       total number of variables to consider      |
| l       number of variables currently in model     |
| in      0-1 vector of whether variable is in model |
| b       print collects results (L MSE RSQ BETAS )  |
*----------------------------------------------------*/
start initial;
n=nrow(x); k=ncol(x); k1=k+1; ik=1:k;
bnames={nparm mse rsquare} ||varnames;

/*---correct by mean, adjust out intercept parameter---*/
y=y-y[+,]/n;                   /* correct y by mean */
x=x-repeat(x[+,]/n,n,1);       /* correct x by mean */
xpy=x`*y;                      /* crossproducts     */
ypy=y`*y;
xpx=x`*x;
free x y;                      /* no longer need the data*/
csave=(xpx || xpy) //
(xpy`|| ypy);            /* save copy of crossproducts*/
finish;

/*-----forward method------------------------------------------*/
start forward;
print  "FORWARD SELECTION METHOD";
free bprint;
c=csave; in=repeat(0,k,1); L=0;      /* no variables are in */
dfe=n-1; mse=ypy/dfe;
sprob=0;

do while(sprob<.15 & l<k);
indx=loc(^in);           /* where are the variables not in?*/
cd=vecdiag(c)[indx,];    /* xpx diagonals                  */
tsqr=cb#cb/(cd#mse);     /* squares of t tests             */
imax=tsqr[<:>,];         /* location of maximum in indx    */
sprob=(1-probt(sqrt(tsqr[imax,]),dfe))*2;
if sprob<.15 then do;    /* if t-test significant          */
ii=indx[,imax];       /* pick most significant          */
run swp;              /* routine to sweep               */
run bpr;              /* routine to collect results     */
end;
end;
print  bprint[colname=bnames] ;
finish;

/*-----backward method----------------------------------------*/
start backward;
print  "BACKWARD ELIMINATION ";
free bprint;
c=csave; in=repeat(0,k,1);
sprob=1;

do while(sprob>.15 & L>0);
indx=loc(in);                /* where are the variables in? */
cd=vecdiag(c)[indx,];        /* xpx diagonals               */
cb=c[indx,k1];               /* bvalues                     */
tsqr=cb#cb/(cd#mse);         /* squares of t tests          */
imin=tsqr[>:<,];             /* location of minimum in indx */
sprob=(1-probt(sqrt(tsqr[imin,]),dfe))*2;
if sprob>.15 then do;        /* if t-test nonsignificant    */
ii=indx[,imin];           /* pick least significant      */
run swp;                  /* routine to sweep in variable*/
run bpr;                  /* routine to collect results  */
end;
end;
print  bprint[colname=bnames] ;
finish;

/*-----stepwise method-----------------------------------------*/
start stepwise;
print "STEPWISE METHOD";
free bprint;
c=csave; in=repeat(0,k,1); L=0;
dfe=n-1; mse=ypy/dfe;
sprob=0;

do while(sprob<.15 & L<k);
indx=loc(^in);           /* where are the variables not in?*/
nindx=loc(in);           /* where are the variables in?    */
cd=vecdiag(c)[indx,];    /* xpx diagonals                 */
tsqr=cb#cb/cd/mse;       /* squares of t tests            */
imax=tsqr[<:>,];         /* location of maximum in indx   */
sprob=(1-probt(sqrt(tsqr[imax,]),dfe))*2;
if sprob<.15 then do;    /* if t-test significant         */
ii=indx[,imax];       /* find index into c             */
run swp;              /* routine to sweep              */
run backstep;         /* check if remove any terms     */
run bpr;              /* routine to collect results    */
end;
end;
print  bprint[colname=bnames] ;
finish;

/*----routine to backwards-eliminate for stepwise--*/
start backstep;
if nrow(nindx)=0 then return;
bprob=1;
do while(bprob>.15 & L<k);
cd=vecdiag(c)[nindx,];      /* xpx diagonals         */
cb=c[nindx,k1];             /* bvalues               */
tsqr=cb#cb/(cd#mse);        /* squares of t tests    */
imin=tsqr[>:<,];            /* location of minimum in nindx*/
bprob=(1-probt(sqrt(tsqr[imin,]),dfe))*2;
if bprob>.15 then do;
ii=nindx[,imin];
run swp;
run bpr;
end;
end;
finish;

/*-----search all possible models----------------------------*/
start all;
/*---use method of Schatzoff et al. for search technique--*/
betak=repeat(0,k,k); /* record estimates for best l-param model*/
msek=repeat(1e50,k,1);/* record best mse per # parms           */
rsqk=repeat(0,k,1);  /* record best rsquare                    */
ink=repeat(0,k,k);   /* record best set per # parms            */
limit=2##k-1;        /* number of models to examine            */

do kk=1 to limit;
run ztrail;                  /* find which one to sweep    */
run swp;                     /* sweep it in                */
bb=bb//(L||mse||rsq||(c[ik,k1]#in)`);
if mse<msek[L,] then do;     /* was this best for L parms? */
msek[L,]=mse;             /* record mse                 */
rsqk[L,]=rsq;             /* record rsquare             */
ink[,L]=in;               /* record which parms in model*/
betak[L,]=(c[ik,k1]#in)`; /* record estimates           */
end;
end;

print "ALL POSSIBLE MODELS IN SEARCH ORDER";
print bb[colname=bnames]; free bb;

bprint=ik`||msek||rsqk||betak;
print "THE BEST MODEL FOR EACH NUMBER OF PARAMETERS";
print bprint[colname=bnames];
finish;

/*-subroutine to find number of trailing zeros in binary number*/
/* on entry: kk is the number to examine                       */
/* on exit:  ii has the result                                 */
/*-------------------------------------------------------------*/
start ztrail;
ii=1; zz=kk;
do while(mod(zz,2)=0); ii=ii+1; zz=zz/2; end;
finish;

/*-----subroutine to sweep in a pivot--------------------------*/
/* on entry: ii has the position(s) to pivot                   */
/* on exit:  in, L, dfe, mse, rsq recalculated                 */
/*-------------------------------------------------------------*/
start swp;
if abs(c[ii,ii])<1e-9 then do; print  "failure", c;stop;end;
c=sweep(c,ii);
in[ii,]=^in[ii,];
L=sum(in); dfe=n-1-L;
sse=c[k1,k1];
mse=sse/dfe;
rsq=1-sse/ypy;
finish;

/*-----subroutine to collect bprint results--------------------*/
/* on entry: L,mse,rsq, and c set up to collect                */
/* on exit:  bprint has another row                            */
/*-------------------------------------------------------------*/
start bpr;
bprint=bprint//(L||mse||rsq||(c[ik,k1]#in)`);
finish;

/*--------------stepwise methods---------------------*/
/* after a call to the initial routine, which sets up*/
/* the data, four different routines can be called   */
/* to do four different model-selection methods.     */
/*---------------------------------------------------*/
start seq;
run initial;          /* initialization             */
run all;              /* all possible models        */
run forward;          /* forward selection method   */
run backward;         /* backward elimination method*/
run stepwise;         /* stepwise method            */
finish;

/*------------------------data on physical fitness--------------*
| These measurements were made on men involved in a physical     |
| fitness course at N.C.State Univ.  The variables are age(years)|
| weight(kg), oxygen uptake rate(ml per kg body weight per       |
| minute), time to run 1.5 miles(minutes), heart rate while      |
| resting, heart rate while running (same time oxygen rate       |
| measured), and maximum heart rate recorded while running.      |
| Certain values of maxpulse were modified for consistency.      |
|   Data courtesy Dr. A.C. Linnerud                              |
*---------------------------------------------------------------*/
data =
{ 44 89.47  44.609 11.37 62 178 182  ,
40 75.07  45.313 10.07 62 185 185  ,
44 85.84  54.297  8.65 45 156 168  ,
42 68.15  59.571  8.17 40 166 172  ,
38 89.02  49.874  9.22 55 178 180  ,
47 77.45  44.811 11.63 58 176 176  ,
40 75.98  45.681 11.95 70 176 180  ,
43 81.19  49.091 10.85 64 162 170  ,
44 81.42  39.442 13.08 63 174 176  ,
38 81.87  60.055  8.63 48 170 186  ,
44 73.03  50.541 10.13 45 168 168  ,
45 87.66  37.388 14.03 56 186 192  ,
45 66.45  44.754 11.12 51 176 176  ,
47 79.15  47.273 10.60 47 162 164  ,
54 83.12  51.855 10.33 50 166 170  ,
49 81.42  49.156  8.95 44 180 185  ,
51 69.63  40.836 10.95 57 168 172  ,
51 77.91  46.672 10.00 48 162 168  ,
48 91.63  46.774 10.25 48 162 164  ,
49 73.37  50.388 10.08 67 168 168  ,
57 73.37  39.407 12.63 58 174 176  ,
54 79.38  46.080 11.17 62 156 165  ,
52 76.32  45.441  9.63 48 164 166  ,
50 70.87  54.625  8.92 48 146 155  ,
51 67.25  45.118 11.08 48 172 172  ,
54 91.63  39.203 12.88 44 168 172  ,
51 73.71  45.790 10.47 59 186 188  ,
57 59.08  50.545  9.93 49 148 155  ,
49 76.32  48.673  9.40 56 186 188  ,
48 61.24  47.920 11.50 52 170 176  ,
52 82.78  47.467 10.50 53 170 172  };
y=data[,3];
x=data[,{1 2 4 5 6 7 }];
free data;
varnames={age weight runtime rstpuls runpuls maxpuls};
reset fw=6 linesize=87;
run seq;

proc iml;
/*          Quadratic Response Surface Regression            */
/* This matrix routine reads in the factor variables and     */
/* the response, forms the quadratic regression model and    */
/* estimates the parameters, and then solves for the optimal */
/* response, prints the optimal factors and response, and    */
/* displays the eigenvalues and eigenvectors of the          */
/* matrix of quadratic parameter estimates to determine if   */
/* the solution is a maximum or minimum, or saddlepoint, and */
/* which direction has the steepest and gentlest slopes.     */
/*                                                           */
/* Given:                                                    */
/* d contains the factor variables                           */
/* y contains the response variable                          */
/*                                                           */

start rsm(d, y);
n=nrow(d);
k=ncol(d);                                  /* dimensions */
x=j(n,1,1) || d;                  /* set up design matrix */
x = x || d[,i] #d[,1:i];
end;
beta=solve(x`*x, x`*y);            /* estimate parameters */
names = "b0":("b"+strip(char(nrow(beta)-1)));
print beta[rowname=names label="Parameter Estimates"];

c=beta[1];                          /* intercept estimate */
b=beta[2:(k+1)];                      /* linear estimates */
a=j(k,k,0);
L=k+1;                     /* form quadratics into matrix */
do i=1 to k;
do j=1 to i;
L=L+1;
a[i,j]=beta [L,];
end;
end;
a=(a+a`)/2;                                 /* symmetrize */
xx = -0.5*solve(a,b);         /* solve for critical value */
print xx[label="Critical Factor Values"];

/* Compute response at critical value */
yopt=c + b`*xx + xx`*a*xx;
print yopt[label="Response at Critical Value"];

call eigen(eval,evec,a);
if min(eval)>0 then print "Solution Is a Minimum";
if max(eval)<0 then print "Solution Is a Maximum";
finish rsm;

/* Sample Problem with Two Factors */
d = {-1 -1, -1  0, -1  1,
0 -1,  0  0,  0  1,
1 -1,  1  0,  1  1};
y = {71.7, 75.2, 76.3, 79.2, 81.5, 80.2, 80.1, 79.1, 75.8};
run rsm(d,y);

```