Regression Example
/****************************************************************/
/* S A S S A M P L E L I B R A R Y */
/* */
/* NAME: REGDEMO */
/* TITLE: Regression Example */
/* PRODUCT: IML */
/* SYSTEM: ALL */
/* KEYS: MATRIX REGR SUGI6 */
/* PROCS: IML */
/* DATA: */
/* */
/* SUPPORT: Rick Wicklin UPDATE: SEP2013 */
/* REF: */
/* MISC: */
/* */
/****************************************************************/
proc iml;
start regress( x, y, name, tval=, l1=, l2=, l3= );
n = nrow(x); /* number of observations */
k = ncol(x); /* number of variables */
xpx = x` * x; /* cross-products */
xpy = x` * y;
xpxi = inv(xpx); /* inverse crossproducts */
b = xpxi * xpy; /* parameter estimates */
yhat = x * b; /* predicted values */
resid = y - yhat; /* residuals */
sse = resid` * resid; /* sum of squared errors */
dfe = n - k; /* degrees of freedom error */
mse = sse / dfe; /* mean squared error */
rmse = sqrt(mse); /* root mean squared error */
covb = xpxi # mse; /* covariance of estimates */
stdb = sqrt(vecdiag(covb)); /* standard errors */
t = b / stdb; /* ttest for estimates=0 */
probt = 1 - cdf("F",t#t,1,dfe); /* significance probability */
paramest = b || stdb || t || probt;
print paramest[c={"Estimate" "StdErr" "t" "Pr>|t|"} r=name
l="Parameter Estimates" f=Best6.];
s = diag(1/stdb);
corrb = s * covb * s; /* correlation of estimates */
reset fw=6 spaces=3; /* for proper formatting */
print covb[r=name c=name l="Covariance of Estimates"],
corrb[r=name c=name l="Correlation of Estimates"];
if nrow(tval) = 0 then return; /* is a t-value specified? */
projx = x * xpxi * x`; /* hat matrix */
vresid = (i(n) - projx) * mse; /* covariance of residuals */
vpred = projx # mse; /* covariance of pred vals */
h = vecdiag(projx); /* hat leverage values */
lowerm = yhat - tval # sqrt(h*mse); /* lower conf limit for mean*/
upperm = yhat + tval # sqrt(h*mse); /* upper CL for mean */
lower = yhat - tval # sqrt(h*mse+mse);/* lower CL for individual */
upper = yhat + tval # sqrt(h*mse+mse);/* upper CL */
R = y || yhat || resid || h || lowerm || upperm || lower || upper;
labels = {"y" "yhat" "resid" "h" "lowerm" "upperm" "lower" "upper"};
reset fw=6 spaces=1;
print R[c=labels label="Predicted values, Residuals, and Limits"];
/* test hypoth that L*b = 0, where L is linear comb of estimates */
do i = 1 to 3;
L = value("L"+ strip(char(i))); /* get L matrix for L1, L2, and L3 */
if nrow(L) = 0 then return;
dfn = nrow(L);
Lb = L * b;
vLb = L * xpxi * L`;
q = Lb` * inv(vLb) * Lb / dfn;
f = q / mse;
prob = 1 - cdf("F", f,dfn,dfe);
test = dfn || dfe || f || prob;
print L, test[c={"dfn" "dfe" "F" "Pr>F"} f=Best6.
l="Test Hypothesis that L*b = 0"];
end;
finish;
/* Quadratic regression on US population for decades beginning 1790 */
decade = T(1:8);
name={"Intercept", "Decade", "Decade**2" };
x= decade##0 || decade || decade##2;
y= {3.929,5.308,7.239,9.638,12.866,17.069,23.191,31.443};
/* n-p=5 dof at 0.025 level to get 95% confidence interval */
tval = quantile("T", 1-0.025, nrow(x)-ncol(x));
L1 = { 0 1 0 }; /* test hypothesis Lb=0 for linear coef */
L2 = { 0 1 0, /* test hypothesis Lb=0 for linear,quad */
0 0 1 };
L3 = { 0 1 1 }; /* test hypothesis Lb=0 for linear+quad */
option linesize=100;
run regress( x, y, name, tval, L1, L2, L3 );