Regression Example

/****************************************************************/
/*          S A S   S A M P L E   L I B R A R Y                 */
/*                                                              */
/*    NAME: REGDEMO                                             */
/*   TITLE: Regression Example                                  */
/* PRODUCT: IML                                                 */
/*  SYSTEM: ALL                                                 */
/*    KEYS: MATRIX  REGR    SUGI6                               */
/*   PROCS: IML                                                 */
/*    DATA:                                                     */
/*                                                              */
/* SUPPORT: Rick Wicklin                UPDATE: SEP2013         */
/*     REF:                                                     */
/*    MISC:                                                     */
/*                                                              */
/****************************************************************/


proc iml;
start regress( x, y, name, tval=, l1=, l2=, l3= );
   n = nrow(x);                          /* number of observations   */
   k = ncol(x);                          /* number of variables      */
   xpx = x` * x;                         /* cross-products           */
   xpy = x` * y;
   xpxi = inv(xpx);                      /* inverse crossproducts    */

   b = xpxi * xpy;                       /* parameter estimates      */
   yhat = x * b;                         /* predicted values         */
   resid = y - yhat;                     /* residuals                */
   sse = resid` * resid;                 /* sum of squared errors    */
   dfe = n - k;                          /* degrees of freedom error */
   mse = sse / dfe;                      /* mean squared error       */
   rmse = sqrt(mse);                     /* root mean squared error  */

   covb = xpxi # mse;                    /* covariance of estimates  */
   stdb = sqrt(vecdiag(covb));           /* standard errors          */
   t = b / stdb;                         /* ttest for estimates=0    */
   probt = 1 - cdf("F",t#t,1,dfe);       /* significance probability */
   paramest = b || stdb || t || probt;
   print paramest[c={"Estimate" "StdErr" "t" "Pr>|t|"} r=name
                  l="Parameter Estimates" f=Best6.];

   s = diag(1/stdb);
   corrb = s * covb * s;                 /* correlation of estimates */
   reset fw=6 spaces=3;                  /* for proper formatting    */
   print covb[r=name c=name l="Covariance of Estimates"],
         corrb[r=name c=name l="Correlation of Estimates"];

   if nrow(tval) = 0 then return;        /* is a t-value specified?  */
   projx = x * xpxi * x`;                /* hat matrix               */
   vresid = (i(n) - projx) * mse;        /* covariance of residuals  */
   vpred = projx # mse;                  /* covariance of pred vals  */
   h = vecdiag(projx);                   /* hat leverage values      */
   lowerm = yhat - tval # sqrt(h*mse);   /* lower conf limit for mean*/
   upperm = yhat + tval # sqrt(h*mse);   /* upper CL for mean        */
   lower = yhat - tval # sqrt(h*mse+mse);/* lower CL for individual  */
   upper = yhat + tval # sqrt(h*mse+mse);/* upper CL                 */

   R = y || yhat || resid || h || lowerm || upperm || lower || upper;
   labels = {"y" "yhat" "resid" "h" "lowerm" "upperm" "lower" "upper"};
   reset fw=6 spaces=1;
   print R[c=labels label="Predicted values, Residuals, and Limits"];

   /* test hypoth that L*b = 0, where L is linear comb of estimates  */
   do i = 1 to 3;
      L = value("L"+ strip(char(i)));   /* get L matrix for L1, L2, and L3 */
      if nrow(L) = 0 then return;
      dfn = nrow(L);
      Lb = L * b;
      vLb = L * xpxi * L`;
      q = Lb` * inv(vLb) * Lb / dfn;
      f = q / mse;
      prob = 1 - cdf("F", f,dfn,dfe);
      test = dfn || dfe || f || prob;
      print L, test[c={"dfn" "dfe" "F" "Pr>F"} f=Best6.
                    l="Test Hypothesis that L*b = 0"];
   end;
finish;

/* Quadratic regression on US population for decades beginning 1790 */
decade = T(1:8);
name={"Intercept", "Decade", "Decade**2" };
x= decade##0 || decade || decade##2;
y= {3.929,5.308,7.239,9.638,12.866,17.069,23.191,31.443};

/* n-p=5 dof at 0.025 level to get 95% confidence interval */
tval = quantile("T", 1-0.025, nrow(x)-ncol(x));
L1 = { 0 1 0 };   /* test hypothesis Lb=0 for linear coef */
L2 = { 0 1 0,     /* test hypothesis Lb=0 for linear,quad */
       0 0 1 };
L3 = { 0 1 1 };   /* test hypothesis Lb=0 for linear+quad */
option linesize=100;
run regress( x, y, name, tval, L1, L2, L3 );