Multivariate Linear Regression of Fitness Data

 /****************************************************************/
 /*          S A S   S A M P L E   L I B R A R Y                 */
 /*                                                              */
 /*    NAME: REGTEST2                                            */
 /*   TITLE: Multivariate Linear Regression of Fitness Data      */
 /* PRODUCT: IML                                                 */
 /*  SYSTEM: ALL                                                 */
 /*    KEYS: MATRIX REGR                                         */
 /*   PROCS: IML  REG                                            */
 /*    DATA:                                                     */
 /*                                                              */
 /* SUPPORT: RHD                         UPDATE: 12MAY1989       */
 /*     REF:                                                     */
 /*    MISC:                                                     */
 /*                                                              */
 /****************************************************************/

 /* The first IML session contains a suite of routines for doing
 * regression analysis on a set of data.  It defines
 * the modules and stores them in a SAS/IML storage file, 'reg'.
 * The second IML session shows typical uses of the routines.
 *
 * Note that these routines are also defined by and used in
 * IML sample member REGTEST1.
 *
 * WARNING: THESE ROUTINES ARE FOR DEMONSTRATION ONLY. THEY
 *          DO NOT SUPPORT MISSING VALUES, NOR DO THEY HANDLE
 *          LINEAR DEPENDENCIES AMONG REGRESSORS!
 *
 * REGEST: basic estimation, parameter estimates
 * REGTEST: test a linear combination
 * REGCOL:  collinearity diagnostics on X`X
 * REGCOLX:  collinearity diagnostics on X
 * REGRESID: analyze residuals
 * REGPLOTR: plot residuals
 * REGPLOTP: plot predicted values
 */

proc iml ;

 /*-----REGEST: Regression Parameter Estimation-----
 *arguments:
 * x    the regressors, design matrix
 * y    the response, dependent variable
 * names the names of the regressors
 */

start regest;
  n=nrow(x);                  /* number of observations   */
  k=ncol(x);                  /* number of variables      */
  xpx=x`*x;                   /* cross-products           */
  xpy=x`*y;
  xpxi=inv(xpx);              /* inverse crossproducts    */
  beta=xpxi*xpy;              /* parameter estimates      */
  sse = y`*y-xpy`*beta;       /* sum of squares error     */
  dfe = n-k;                  /* degrees of freedom error */
  mse = sse/dfe;              /* mean square error      */
  rmse = sqrt(mse);           /* root mean square error */
  rsquare = 1-sse/((y-y[:])[##]);

  print ({'Regression Analysis','Residual Error'}),
         sse dfe mse rmse rsquare;

  stderr = sqrt(vecdiag(xpxi)#mse); /* std error of estimates  */
  tratio = beta/stderr;             /* test for parameter=0    */
  probt=1-probf(tratio##2,1,dfe);   /* significance probability */
  print 'Regression Parameter Estimates ',
           names beta stderr tratio probt;

  covb=xpxi#mse;              /* covariance of estimates  */
  s=1/stderr;
  corrb=s#covb#s`;            /* correlation of estimates */
  print "Covariance of estimates",  covb[r=names c=names],
        "Correlation of estimates",corrb[r=names c=names];

finish;




 /*---routine to test a linear combination of the estimates---
 * given l, this routine tests hypothesis that l b = 0.
 * THIS ROUTINE DOES NOT CHECK ESTIMABILITY
 *arguments:
 * l    the hypothesis to test
 * xpxi inverse crossproducts of regressors
 * mse  mean square error
 * dfe  degrees of freedom for error
 */
start regtest;
  dfn=nrow(l);
  lb=l*beta;
  vlb=l*xpxi*l`;
  q=lb`*inv(vlb)*lb /dfn;
  f=q/mse;
  prob=1-probf(f,dfn,dfe);
  print 'Testing' l,f dfn dfe prob;
finish;






 /* this stream defines and stores two modules that perform
 *  regression collinearity diagnostics. The routine REGCOL
 *  does it from a crossproducts matrix, while the routine REGCOLX
 *  does it from a full matrix of regressors.
 *
 *REGCOL
 * arguments
 *    XPX   the crossproducts matrix
 *    NAMES the names of the rows and columns of XPX
 * returns
 *    EIGVAL the eigenvalues of XPX
 *    CONDIT the condition numbers
 *    PI     the variance decomposition
 *
 *REGCOL X
 * arguments
 *    X     the crossproducts matrix
 *    NAMES the names of the columns of X
 * returns
 *    EIGVAL the eigenvalues of XPX
 *    CONDIT the condition numbers
 *    PI     the variance decomposition
 */



 /*---collinearity diagnostics: from the full X matrix---*/
start regcolx;
   k=ncol(x);
   s=1/sqrt(x[##,]);
   xs=x#s;
   call svd(u, q, v, xs);
    r=k+1-rank(q);
    t=u; u[,r]=t;
    t=q; q[r,]=t;
    t=v; v[,r]=t;
   eigval=q#q;
   condit=q[1]/q;
   phi=v#v#(1/eigval`);
   phik=phi[,+];
   pi=phi/ (phik*j(1,k));
   pi = pi`;
   print ,,'Collinearity Diagnostics, Variance Decomposition',,
          eigval condit pi [c=names];
finish;

 /*---collinearity diagnostics: from the crossproducts XPX---*/
start regcol;
   k=ncol(xpx);
   s = sqrt(1/vecdiag(xpx));
   xpxs = s`#xpx#s;
   call eigen(eigval,v,xpxs);
   q = sqrt(eigval);
   condit=q[1]/q;
   phi=v#v#(1/eigval`);
   phik=phi[,+];
   pi=phi/ (phik*j(1,k));
   pi=pi`;
   print ,,'Collinearity Diagnostics, Variance Decomposition',,
          eigval condit pi [c=names];
finish;


 /*--Residual and Predicted Analysis, with Influence Diagnostics--
 *arguments:
 *  xpxi  contains inverse regressor crossproducts
 *  beta  contains parameter estimates
 *  X     contains regressor data
 *  y     contains response
 *  mse   contains mean square error
 *  dfe   contains degrees of freedom for residual error
 *  names contains regressor names
 *  n     contains number of observations
 *  k     contains number of columns in x
 */
start regresid;

  pred=x*beta;                /* predicted values         */
  resid=y-pred;               /* residuals                */

 /*---compute leverage---*/
 c = x*xpxi;
 h = (c#x)[,+];    /* leverage; faster than vecdiag(x*xpxi*x`) */

 /*---compute standard errors---*/
 stdm = sqrt(h*mse);      /* std error for expected value */
 stdp = sqrt(h*mse+mse);  /* std error for individual value */
 stdr = sqrt(mse#(1-h));  /* std error for residual */
 stures=resid/stdr;       /* ordinary studentized residual */

 /*---compute confidence limits---*/
 tval = tinv(.975,dfe);
 lower95m=pred-tval#stdm; /* lower confidence limit for mean */
 upper95m=pred+tval#stdm; /* upper */
 lower95 =pred-tval#stdp; /* lower confidence limit for indiv */
 upper95 =pred+tval#stdp; /* upper */
 print ,,'Predicted and Residual Values',,y pred resid stures stdr
                                   lower95m upper95m stdm
                                   lower95  upper95  stdp;

 /*---compute influence statistics---*/
 si2 =  (mse#dfe - resid#resid / (1-h) )  / (dfe - 1);
 si = sqrt(si2);    /* root mse for dropping each observation */
 rstudent = resid/(si#sqrt(1-h));
 covratio = 1/((((n-k-1)+rstudent##2)/(n-k))##k # (1-h));
 dfbetas = (resid/(si#(1-h))) # c # (1/sqrt(c[##,])) ;
 dffits = rstudent # sqrt(h/(1-h));
 ii = (-4<>rstudent><4)+5.5 + (0:n-1)`#9; /* scale from 1 to 9 */
 rstuplot = cshape( repeat('|   |   |',n,1),n#9,1,1);
 rstuplot[ii]='*';  rstuplot = cshape(rstuplot,n,1,9);

 print ,,'Influence Diagnostics',,resid h rstudent rstuplot
                                  dffits dfbetas[c=names] covratio;

finish;



 /*-----Residual Plots-----
 *arguments:
 *  X     contains regressor data
 *  pred  contains predicted values
 *  resid contains residuals
 *  names contains regressor names
 */
start regplotr;
 reset linesize=80 pagesize=30;
 resname="Residual Value";
 call pgraf(pred||resid,'*','Predicted Value',resname);
 do i=2 to ncol(x);
    call pgraf(x[,i]||resid,'*',names[i],resname);
    end;
finish;


 /*-----Predicted Plots-----
 *arguments
 * xxx contains the regressor to plot on the x-axis
 * n   contains number of observations
 * plus all the variables from module regresid.
 */
start regplotp;
 reset linesize=120 pagesize=50;
 title1='Legend: a=actual p=pred l=lower95% u=upper95%';
 xxxx = xxx//xxx//xxx//xxx;
 yyyy = y // pred // lower95 // upper95;
 id = repeat('a',n,1)//repeat('p',n,1)//repeat('l',n,1)
           //repeat('u',n,1);
 call pgraf(xxxx||yyyy,id,'Regressor','Prediction',title1);
finish;




reset storage='reg';
store module=(regest regtest regcol regcolx regresid regplotr
                  regplotp);
quit;


 /*NB: THIS SAMPLE REQUIRES the product SAS/STAT in order to run,*/
 /*    although one could edit out the PROC REG to avoid SAS/STAT*/
 /*This sample runs a regression example using both PROC REG, and*/
 /*the IML modules created in the previous IML session.          */

options pagesize=80 linesize=120;

data fitness;
 input  age weight oxy runtime rstpulse runpulse maxpulse;
cards;
     44 89.47  44.609 11.37 62 178 182
     40 75.07  45.313 10.07 62 185 185
     44 85.84  54.297  8.65 45 156 168
     42 68.15  59.571  8.17 40 166 172
     38 89.02  49.874  9.22 55 178 180
     47 77.45  44.811 11.63 58 176 176
     40 75.98  45.681 11.95 70 176 180
     43 81.19  49.091 10.85 64 162 170
     44 81.42  39.442 13.08 63 174 176
     38 81.87  60.055  8.63 48 170 186
     44 73.03  50.541 10.13 45 168 168
     45 87.66  37.388 14.03 56 186 192
     45 66.45  44.754 11.12 51 176 176
     47 79.15  47.273 10.60 47 162 164
     54 83.12  51.855 10.33 50 166 170
     49 81.42  49.156  8.95 44 180 185
     51 69.63  40.836 10.95 57 168 172
     51 77.91  46.672 10.00 48 162 168
     48 91.63  46.774 10.25 48 162 164
     49 73.37  50.388 10.08 67 168 168
     57 73.37  39.407 12.63 58 174 176
     54 79.38  46.080 11.17 62 156 165
     52 76.32  45.441  9.63 48 164 166
     50 70.87  54.625  8.92 48 146 155
     51 67.25  45.118 11.08 48 172 172
     54 91.63  39.203 12.88 44 168 172
     51 73.71  45.790 10.47 59 186 188
     57 59.08  50.545  9.93 49 148 155
     49 76.32  48.673  9.40 56 186 188
     48 61.24  47.920 11.50 52 170 176
     52 82.78  47.467 10.50 53 170 172
run;
proc reg;
     model oxy=runtime weight age runpulse maxpulse
           /collinoint xpx;
     run;
     model oxy=runtime weight age runpulse maxpulse/collin xpx ;
     model oxy=runtime weight age runpulse maxpulse/
                influence r;
           test maxpulse=runpulse;
     run;

proc iml ;
 reset nolog linesize=125 fw=8;
 reset storage='reg';
 load module=(regest regtest regcol regresid regplotr regplotp);

 use fitness;
 read all;
 n=nrow(runtime);
 x = repeat(1,n,1)||runtime||weight||age||runpulse||maxpulse;
 names = { intercep, runtime, weight, age, runpulse, maxpulse};
 y = oxy;

 run regest;                     /* estimate the model */
 l={0 0 0 0 1 -1}; run regtest;  /* test an hypothesis */

 print ,,'Collinearity: Version with intercept',;
 run regcol;

 run regresid;                   /* analyze residuals */
 run regplotr;                   /* plot residuals */
 xxx = runpulse;  run regplotp;  /* plot predicted values */

 print ,,'Collinearity: Version WITHOUT intercept',;
 xx = x[,2:6];
 x=xx-j(n,1,1)*(j(1,n,1)*xx)/n;
 xpx = x`*x;
 names = names[2:6,];
 run regcol;
 quit;
 title1;