Quadratic Regression of US Population
/****************************************************************/
/* S A S S A M P L E L I B R A R Y */
/* */
/* NAME: REGTEST1 */
/* TITLE: Quadratic Regression of US Population */
/* PRODUCT: IML */
/* SYSTEM: ALL */
/* KEYS: MATRIX REGR */
/* PROCS: IML REG */
/* DATA: */
/* */
/* SUPPORT: RHD UPDATE: 12MAY1989 */
/* REF: */
/* MISC: */
/* */
/****************************************************************/
/* The first IML session contains a suite of routines for doing
* regression analysis on a set of data. It defines
* the modules and stores them in a SAS/IML storage file, 'reg'.
* The second IML session shows typical uses of the routines.
*
* Note that these routines are also defined by and used in
* IML sample member REGTEST2.
*
* WARNING: THESE ROUTINES ARE FOR DEMONSTRATION ONLY. THEY
* DO NOT SUPPORT MISSING VALUES, NOR DO THEY HANDLE
* LINEAR DEPENDENCIES AMONG REGRESSORS!
*
* REGEST: basic estimation, parameter estimates
* REGTEST: test a linear combination
* REGCOL: collinearity diagnostics on X`X
* REGCOLX: collinearity diagnostics on X
* REGRESID: analyze residuals
* REGPLOTR: plot residuals
* REGPLOTP: plot predicted values
*/
proc iml ;
/*-----REGEST: Regression Parameter Estimation-----
*arguments:
* x the regressors, design matrix
* y the response, dependent variable
* names the names of the regressors
*/
start regest;
n=nrow(x); /* number of observations */
k=ncol(x); /* number of variables */
xpx=x`*x; /* cross-products */
xpy=x`*y;
xpxi=inv(xpx); /* inverse crossproducts */
beta=xpxi*xpy; /* parameter estimates */
sse = y`*y-xpy`*beta; /* sum of squares error */
dfe = n-k; /* degrees of freedom error */
mse = sse/dfe; /* mean square error */
rmse = sqrt(mse); /* root mean square error */
rsquare = 1-sse/((y-y[:])[##]);
print ,,'Regression Analysis',,'Residual Error:'
sse dfe mse rmse rsquare;
stderr = sqrt(vecdiag(xpxi)#mse); /* std error of estimates */
tratio = beta/stderr; /* test for parameter=0 */
probt=1-probf(tratio##2,1,dfe); /* significance probability */
print ,,'Regression Parameter Estimates ',,
names beta stderr tratio probt;
covb=xpxi#mse; /* covariance of estimates */
s=1/stderr;
corrb=s#covb#s`; /* correlation of estimates */
print ,"Covariance of estimates", covb[r=names c=names],
"Correlation of estimates",corrb[r=names c=names];
finish;
/*---routine to test a linear combination of the estimates---
* given l, this routine tests hypothesis that l b = 0.
* THIS ROUTINE DOES NOT CHECK ESTIMABILITY
*arguments:
* l the hypothesis to test
* xpxi inverse crossproducts of regressors
* mse mean square error
* dfe degrees of freedom for error
*/
start regtest;
dfn=nrow(l);
lb=l*beta;
vlb=l*xpxi*l`;
q=lb`*inv(vlb)*lb /dfn;
f=q/mse;
prob=1-probf(f,dfn,dfe);
print 'Testing' l,f dfn dfe prob;
finish;
/* this stream defines and stores two modules that perform
* regression collinearity diagnostics. The routine REGCOL
* does it from a crossproducts matrix, while the routine REGCOLX
* does it from a full matrix of regressors.
*
*REGCOL
* arguments
* XPX the crossproducts matrix
* NAMES the names of the rows and columns of XPX
* returns
* EIGVAL the eigenvalues of XPX
* CONDIT the condition numbers
* PI the variance decomposition
*
*REGCOL X
* arguments
* X the crossproducts matrix
* NAMES the names of the columns of X
* returns
* EIGVAL the eigenvalues of XPX
* CONDIT the condition numbers
* PI the variance decomposition
*/
/*---collinearity diagnostics: from the full X matrix---*/
start regcolx;
k=ncol(x);
s=1/sqrt(x[##,]);
xs=x#s;
call svd(u, q, v, xs);
r=k+1-rank(q);
t=u; u[,r]=t;
t=q; q[r,]=t;
t=v; v[,r]=t;
eigval=q#q;
condit=q[1]/q;
phi=v#v#(1/eigval`);
phik=phi[,+];
pi=phi/ (phik*j(1,k));
pi = pi`;
print ,,'Collinearity Diagnostics, Variance Decomposition',,
eigval condit pi [c=names];
finish;
/*---collinearity diagnostics: from the crossproducts XPX---*/
start regcol;
k=ncol(xpx);
s = sqrt(1/vecdiag(xpx));
xpxs = s`#xpx#s;
call eigen(eigval,v,xpxs);
q = sqrt(eigval);
condit=q[1]/q;
phi=v#v#(1/eigval`);
phik=phi[,+];
pi=phi/ (phik*j(1,k));
pi=pi`;
print ,,'Collinearity Diagnostics, Variance Decomposition',,
eigval condit pi [c=names];
finish;
/*--Residual and Predicted Analysis, with Influence Diagnostics--
*arguments:
* xpxi contains inverse regressor crossproducts
* beta contains parameter estimates
* X contains regressor data
* y contains response
* mse contains mean square error
* dfe contains degrees of freedom for residual error
* names contains regressor names
* n contains number of observations
* k contains number of columns in x
*/
start regresid;
pred=x*beta; /* predicted values */
resid=y-pred; /* residuals */
/*---compute leverage---*/
c = x*xpxi;
h = (c#x)[,+]; /* leverage; faster than vecdiag(x*xpxi*x`) */
/*---compute standard errors---*/
stdm = sqrt(h*mse); /* std error for expected value */
stdp = sqrt(h*mse+mse); /* std error for individual value */
stdr = sqrt(mse#(1-h)); /* std error for residual */
stures=resid/stdr; /* ordinary studentized residual */
/*---compute confidence limits---*/
tval = tinv(.975,dfe);
lower95m=pred-tval#stdm; /* lower confidence limit for mean */
upper95m=pred+tval#stdm; /* upper */
lower95 =pred-tval#stdp; /* lower confidence limit for indiv */
upper95 =pred+tval#stdp; /* upper */
print ,,'Predicted and Residual Values',,y pred resid stures stdr
lower95m upper95m stdm
lower95 upper95 stdp;
/*---compute influence statistics---*/
si2 = (mse#dfe - resid#resid / (1-h) ) / (dfe - 1);
si = sqrt(si2); /* root mse for dropping each observation */
rstudent = resid/(si#sqrt(1-h));
covratio = 1/((((n-k-1)+rstudent##2)/(n-k))##k # (1-h));
dfbetas = (resid/(si#(1-h))) # c # (1/sqrt(c[##,])) ;
dffits = rstudent # sqrt(h/(1-h));
ii = (-4<>rstudent><4)+5.5 + (0:n-1)`#9; /* scale from 1 to 9 */
rstuplot = cshape( repeat('| | |',n,1),n#9,1,1);
rstuplot[ii]='*'; rstuplot = cshape(rstuplot,n,1,9);
print ,,'Influence Diagnostics',,resid h rstudent rstuplot
dffits dfbetas[c=names] covratio;
finish;
/*-----Residual Plots-----
*arguments:
* X contains regressor data
* pred contains predicted values
* resid contains residuals
* names contains regressor names
*/
start regplotr;
reset linesize=80 pagesize=30;
resname="Residual Value";
call pgraf(pred||resid,'*','Predicted Value',resname);
do i=2 to ncol(x);
call pgraf(x[,i]||resid,'*',names[i],resname);
end;
finish;
/*-----Predicted Plots-----
*arguments
* xxx contains the regressor to plot on the x-axis
* n contains number of observations
* plus all the variables from module regresid.
*/
start regplotp;
reset linesize=120 pagesize=50;
title='Legend: a=actual p=pred l=lower95% u=upper95%';
xxxx = xxx//xxx//xxx//xxx;
yyyy = y // pred // lower95 // upper95;
id = repeat('a',n,1)//repeat('p',n,1)//repeat('l',n,1)
//repeat('u',n,1);
call pgraf(xxxx||yyyy,id,'Regressor','Prediction',title);
finish;
reset storage='reg';
store module=(regest regtest regcol regcolx regresid regplotr regplotp);
quit;
/*NB: THIS SAMPLE REQUIRES the product SAS/STAT in order to run,*/
/* although one could edit out the PROC REG to avoid SAS/STAT*/
/*This sample runs a regression example using both PROC REG, and*/
/*the IML modules created in the previous IML session. */
options pagesize=80 linesize=120 ;
data uspop;
input pop @@; pop=pop/1000;
retain year 1780; year+10; yearsq=year*year;
cards;
3929 5308 7239 9638 12866 17069 23191 31443
39818 50155 62947 75994 91972 105710 122775 131669
151325 179323 203211
run;
proc reg;
model pop = year yearsq/ r collin influence covb corrb;
test year - 2*yearsq;
run;
proc iml;
reset storage='reg';
load module=(regest regtest regcol regcolx regresid
regplotr regplotp);
reset nolog linesize=120 pagesize=60 fw=8;
use uspop;
read all;
n=nrow(year);
x = j(n,1,1)||year||yearsq;
names = { intercep, year, yearsq};
y=pop;
run regest; /* estimate the model */
l={0 1 -2}; run regtest; /* test an hypothesis */
run regcol; /* collinearity diagnostics */
run regcolx; /* collinearity diagnostics again */
run regresid; /* residual analysis */
run regplotr; /* plot residuals */
xxx = year; run regplotp; /* plot predicteds, confidence limits*/
quit;
title;