Multivariate Linear Regression of Fitness Data
/****************************************************************/
/* S A S S A M P L E L I B R A R Y */
/* */
/* NAME: REGTEST2 */
/* TITLE: Multivariate Linear Regression of Fitness Data */
/* PRODUCT: IML */
/* SYSTEM: ALL */
/* KEYS: MATRIX REGR */
/* PROCS: IML REG */
/* DATA: */
/* */
/* SUPPORT: RHD UPDATE: 12MAY1989 */
/* REF: */
/* MISC: */
/* */
/****************************************************************/
/* The first IML session contains a suite of routines for doing
* regression analysis on a set of data. It defines
* the modules and stores them in a SAS/IML storage file, 'reg'.
* The second IML session shows typical uses of the routines.
*
* Note that these routines are also defined by and used in
* IML sample member REGTEST1.
*
* WARNING: THESE ROUTINES ARE FOR DEMONSTRATION ONLY. THEY
* DO NOT SUPPORT MISSING VALUES, NOR DO THEY HANDLE
* LINEAR DEPENDENCIES AMONG REGRESSORS!
*
* REGEST: basic estimation, parameter estimates
* REGTEST: test a linear combination
* REGCOL: collinearity diagnostics on X`X
* REGCOLX: collinearity diagnostics on X
* REGRESID: analyze residuals
* REGPLOTR: plot residuals
* REGPLOTP: plot predicted values
*/
proc iml ;
/*-----REGEST: Regression Parameter Estimation-----
*arguments:
* x the regressors, design matrix
* y the response, dependent variable
* names the names of the regressors
*/
start regest;
n=nrow(x); /* number of observations */
k=ncol(x); /* number of variables */
xpx=x`*x; /* cross-products */
xpy=x`*y;
xpxi=inv(xpx); /* inverse crossproducts */
beta=xpxi*xpy; /* parameter estimates */
sse = y`*y-xpy`*beta; /* sum of squares error */
dfe = n-k; /* degrees of freedom error */
mse = sse/dfe; /* mean square error */
rmse = sqrt(mse); /* root mean square error */
rsquare = 1-sse/((y-y[:])[##]);
print ({'Regression Analysis','Residual Error'}),
sse dfe mse rmse rsquare;
stderr = sqrt(vecdiag(xpxi)#mse); /* std error of estimates */
tratio = beta/stderr; /* test for parameter=0 */
probt=1-probf(tratio##2,1,dfe); /* significance probability */
print 'Regression Parameter Estimates ',
names beta stderr tratio probt;
covb=xpxi#mse; /* covariance of estimates */
s=1/stderr;
corrb=s#covb#s`; /* correlation of estimates */
print "Covariance of estimates", covb[r=names c=names],
"Correlation of estimates",corrb[r=names c=names];
finish;
/*---routine to test a linear combination of the estimates---
* given l, this routine tests hypothesis that l b = 0.
* THIS ROUTINE DOES NOT CHECK ESTIMABILITY
*arguments:
* l the hypothesis to test
* xpxi inverse crossproducts of regressors
* mse mean square error
* dfe degrees of freedom for error
*/
start regtest;
dfn=nrow(l);
lb=l*beta;
vlb=l*xpxi*l`;
q=lb`*inv(vlb)*lb /dfn;
f=q/mse;
prob=1-probf(f,dfn,dfe);
print 'Testing' l,f dfn dfe prob;
finish;
/* this stream defines and stores two modules that perform
* regression collinearity diagnostics. The routine REGCOL
* does it from a crossproducts matrix, while the routine REGCOLX
* does it from a full matrix of regressors.
*
*REGCOL
* arguments
* XPX the crossproducts matrix
* NAMES the names of the rows and columns of XPX
* returns
* EIGVAL the eigenvalues of XPX
* CONDIT the condition numbers
* PI the variance decomposition
*
*REGCOL X
* arguments
* X the crossproducts matrix
* NAMES the names of the columns of X
* returns
* EIGVAL the eigenvalues of XPX
* CONDIT the condition numbers
* PI the variance decomposition
*/
/*---collinearity diagnostics: from the full X matrix---*/
start regcolx;
k=ncol(x);
s=1/sqrt(x[##,]);
xs=x#s;
call svd(u, q, v, xs);
r=k+1-rank(q);
t=u; u[,r]=t;
t=q; q[r,]=t;
t=v; v[,r]=t;
eigval=q#q;
condit=q[1]/q;
phi=v#v#(1/eigval`);
phik=phi[,+];
pi=phi/ (phik*j(1,k));
pi = pi`;
print ,,'Collinearity Diagnostics, Variance Decomposition',,
eigval condit pi [c=names];
finish;
/*---collinearity diagnostics: from the crossproducts XPX---*/
start regcol;
k=ncol(xpx);
s = sqrt(1/vecdiag(xpx));
xpxs = s`#xpx#s;
call eigen(eigval,v,xpxs);
q = sqrt(eigval);
condit=q[1]/q;
phi=v#v#(1/eigval`);
phik=phi[,+];
pi=phi/ (phik*j(1,k));
pi=pi`;
print ,,'Collinearity Diagnostics, Variance Decomposition',,
eigval condit pi [c=names];
finish;
/*--Residual and Predicted Analysis, with Influence Diagnostics--
*arguments:
* xpxi contains inverse regressor crossproducts
* beta contains parameter estimates
* X contains regressor data
* y contains response
* mse contains mean square error
* dfe contains degrees of freedom for residual error
* names contains regressor names
* n contains number of observations
* k contains number of columns in x
*/
start regresid;
pred=x*beta; /* predicted values */
resid=y-pred; /* residuals */
/*---compute leverage---*/
c = x*xpxi;
h = (c#x)[,+]; /* leverage; faster than vecdiag(x*xpxi*x`) */
/*---compute standard errors---*/
stdm = sqrt(h*mse); /* std error for expected value */
stdp = sqrt(h*mse+mse); /* std error for individual value */
stdr = sqrt(mse#(1-h)); /* std error for residual */
stures=resid/stdr; /* ordinary studentized residual */
/*---compute confidence limits---*/
tval = tinv(.975,dfe);
lower95m=pred-tval#stdm; /* lower confidence limit for mean */
upper95m=pred+tval#stdm; /* upper */
lower95 =pred-tval#stdp; /* lower confidence limit for indiv */
upper95 =pred+tval#stdp; /* upper */
print ,,'Predicted and Residual Values',,y pred resid stures stdr
lower95m upper95m stdm
lower95 upper95 stdp;
/*---compute influence statistics---*/
si2 = (mse#dfe - resid#resid / (1-h) ) / (dfe - 1);
si = sqrt(si2); /* root mse for dropping each observation */
rstudent = resid/(si#sqrt(1-h));
covratio = 1/((((n-k-1)+rstudent##2)/(n-k))##k # (1-h));
dfbetas = (resid/(si#(1-h))) # c # (1/sqrt(c[##,])) ;
dffits = rstudent # sqrt(h/(1-h));
ii = (-4<>rstudent><4)+5.5 + (0:n-1)`#9; /* scale from 1 to 9 */
rstuplot = cshape( repeat('| | |',n,1),n#9,1,1);
rstuplot[ii]='*'; rstuplot = cshape(rstuplot,n,1,9);
print ,,'Influence Diagnostics',,resid h rstudent rstuplot
dffits dfbetas[c=names] covratio;
finish;
/*-----Residual Plots-----
*arguments:
* X contains regressor data
* pred contains predicted values
* resid contains residuals
* names contains regressor names
*/
start regplotr;
reset linesize=80 pagesize=30;
resname="Residual Value";
call pgraf(pred||resid,'*','Predicted Value',resname);
do i=2 to ncol(x);
call pgraf(x[,i]||resid,'*',names[i],resname);
end;
finish;
/*-----Predicted Plots-----
*arguments
* xxx contains the regressor to plot on the x-axis
* n contains number of observations
* plus all the variables from module regresid.
*/
start regplotp;
reset linesize=120 pagesize=50;
title1='Legend: a=actual p=pred l=lower95% u=upper95%';
xxxx = xxx//xxx//xxx//xxx;
yyyy = y // pred // lower95 // upper95;
id = repeat('a',n,1)//repeat('p',n,1)//repeat('l',n,1)
//repeat('u',n,1);
call pgraf(xxxx||yyyy,id,'Regressor','Prediction',title1);
finish;
reset storage='reg';
store module=(regest regtest regcol regcolx regresid regplotr
regplotp);
quit;
/*NB: THIS SAMPLE REQUIRES the product SAS/STAT in order to run,*/
/* although one could edit out the PROC REG to avoid SAS/STAT*/
/*This sample runs a regression example using both PROC REG, and*/
/*the IML modules created in the previous IML session. */
options pagesize=80 linesize=120;
data fitness;
input age weight oxy runtime rstpulse runpulse maxpulse;
cards;
44 89.47 44.609 11.37 62 178 182
40 75.07 45.313 10.07 62 185 185
44 85.84 54.297 8.65 45 156 168
42 68.15 59.571 8.17 40 166 172
38 89.02 49.874 9.22 55 178 180
47 77.45 44.811 11.63 58 176 176
40 75.98 45.681 11.95 70 176 180
43 81.19 49.091 10.85 64 162 170
44 81.42 39.442 13.08 63 174 176
38 81.87 60.055 8.63 48 170 186
44 73.03 50.541 10.13 45 168 168
45 87.66 37.388 14.03 56 186 192
45 66.45 44.754 11.12 51 176 176
47 79.15 47.273 10.60 47 162 164
54 83.12 51.855 10.33 50 166 170
49 81.42 49.156 8.95 44 180 185
51 69.63 40.836 10.95 57 168 172
51 77.91 46.672 10.00 48 162 168
48 91.63 46.774 10.25 48 162 164
49 73.37 50.388 10.08 67 168 168
57 73.37 39.407 12.63 58 174 176
54 79.38 46.080 11.17 62 156 165
52 76.32 45.441 9.63 48 164 166
50 70.87 54.625 8.92 48 146 155
51 67.25 45.118 11.08 48 172 172
54 91.63 39.203 12.88 44 168 172
51 73.71 45.790 10.47 59 186 188
57 59.08 50.545 9.93 49 148 155
49 76.32 48.673 9.40 56 186 188
48 61.24 47.920 11.50 52 170 176
52 82.78 47.467 10.50 53 170 172
run;
proc reg;
model oxy=runtime weight age runpulse maxpulse
/collinoint xpx;
run;
model oxy=runtime weight age runpulse maxpulse/collin xpx ;
model oxy=runtime weight age runpulse maxpulse/
influence r;
test maxpulse=runpulse;
run;
proc iml ;
reset nolog linesize=125 fw=8;
reset storage='reg';
load module=(regest regtest regcol regresid regplotr regplotp);
use fitness;
read all;
n=nrow(runtime);
x = repeat(1,n,1)||runtime||weight||age||runpulse||maxpulse;
names = { intercep, runtime, weight, age, runpulse, maxpulse};
y = oxy;
run regest; /* estimate the model */
l={0 0 0 0 1 -1}; run regtest; /* test an hypothesis */
print ,,'Collinearity: Version with intercept',;
run regcol;
run regresid; /* analyze residuals */
run regplotr; /* plot residuals */
xxx = runpulse; run regplotp; /* plot predicted values */
print ,,'Collinearity: Version WITHOUT intercept',;
xx = x[,2:6];
x=xx-j(n,1,1)*(j(1,n,1)*xx)/n;
xpx = x`*x;
names = names[2:6,];
run regcol;
quit;
title1;