Quadratic Regression of US Population

 /****************************************************************/
 /*          S A S   S A M P L E   L I B R A R Y                 */
 /*                                                              */
 /*    NAME: REGTEST1                                            */
 /*   TITLE: Quadratic Regression of US Population               */
 /* PRODUCT: IML                                                 */
 /*  SYSTEM: ALL                                                 */
 /*    KEYS: MATRIX REGR                                         */
 /*   PROCS: IML  REG                                            */
 /*    DATA:                                                     */
 /*                                                              */
 /* SUPPORT: RHD                         UPDATE:  12MAY1989      */
 /*     REF:                                                     */
 /*    MISC:                                                     */
 /*                                                              */
 /****************************************************************/

 /* The first IML session contains a suite of routines for doing
 * regression analysis on a set of data.  It defines
 * the modules and stores them in a SAS/IML storage file, 'reg'.
 * The second IML session shows typical uses of the routines.
 *
 * Note that these routines are also defined by and used in
 * IML sample member REGTEST2.
 *
 * WARNING: THESE ROUTINES ARE FOR DEMONSTRATION ONLY. THEY
 *          DO NOT SUPPORT MISSING VALUES, NOR DO THEY HANDLE
 *          LINEAR DEPENDENCIES AMONG REGRESSORS!
 *
 * REGEST: basic estimation, parameter estimates
 * REGTEST: test a linear combination
 * REGCOL:  collinearity diagnostics on X`X
 * REGCOLX:  collinearity diagnostics on X
 * REGRESID: analyze residuals
 * REGPLOTR: plot residuals
 * REGPLOTP: plot predicted values
 */


proc iml ;

 /*-----REGEST: Regression Parameter Estimation-----
 *arguments:
 * x    the regressors, design matrix
 * y    the response, dependent variable
 * names the names of the regressors
 */

start regest;
  n=nrow(x);                  /* number of observations   */
  k=ncol(x);                  /* number of variables      */
  xpx=x`*x;                   /* cross-products           */
  xpy=x`*y;
  xpxi=inv(xpx);              /* inverse crossproducts    */
  beta=xpxi*xpy;              /* parameter estimates      */
  sse = y`*y-xpy`*beta;       /* sum of squares error     */
  dfe = n-k;                  /* degrees of freedom error */
  mse = sse/dfe;              /* mean square error      */
  rmse = sqrt(mse);           /* root mean square error */
  rsquare = 1-sse/((y-y[:])[##]);

  print ,,'Regression Analysis',,'Residual Error:'
      sse dfe mse rmse rsquare;

  stderr = sqrt(vecdiag(xpxi)#mse); /* std error of estimates  */
  tratio = beta/stderr;             /* test for parameter=0    */
  probt=1-probf(tratio##2,1,dfe);   /* significance probability */
  print ,,'Regression Parameter Estimates ',,
           names beta stderr tratio probt;

  covb=xpxi#mse;              /* covariance of estimates  */
  s=1/stderr;
  corrb=s#covb#s`;            /* correlation of estimates */
  print ,"Covariance of estimates",  covb[r=names c=names],
         "Correlation of estimates",corrb[r=names c=names];

finish;




 /*---routine to test a linear combination of the estimates---
 * given l, this routine tests hypothesis that l b = 0.
 * THIS ROUTINE DOES NOT CHECK ESTIMABILITY
 *arguments:
 * l    the hypothesis to test
 * xpxi inverse crossproducts of regressors
 * mse  mean square error
 * dfe  degrees of freedom for error
 */
start regtest;
  dfn=nrow(l);
  lb=l*beta;
  vlb=l*xpxi*l`;
  q=lb`*inv(vlb)*lb /dfn;
  f=q/mse;
  prob=1-probf(f,dfn,dfe);
  print 'Testing' l,f dfn dfe prob;
finish;






 /* this stream defines and stores two modules that perform
 *  regression collinearity diagnostics. The routine REGCOL
 *  does it from a crossproducts matrix, while the routine REGCOLX
 *  does it from a full matrix of regressors.
 *
 *REGCOL
 * arguments
 *    XPX   the crossproducts matrix
 *    NAMES the names of the rows and columns of XPX
 * returns
 *    EIGVAL the eigenvalues of XPX
 *    CONDIT the condition numbers
 *    PI     the variance decomposition
 *
 *REGCOL X
 * arguments
 *    X     the crossproducts matrix
 *    NAMES the names of the columns of X
 * returns
 *    EIGVAL the eigenvalues of XPX
 *    CONDIT the condition numbers
 *    PI     the variance decomposition
 */



 /*---collinearity diagnostics: from the full X matrix---*/
start regcolx;
   k=ncol(x);
   s=1/sqrt(x[##,]);
   xs=x#s;
   call svd(u, q, v, xs);
    r=k+1-rank(q);
    t=u; u[,r]=t;
    t=q; q[r,]=t;
    t=v; v[,r]=t;
   eigval=q#q;
   condit=q[1]/q;
   phi=v#v#(1/eigval`);
   phik=phi[,+];
   pi=phi/ (phik*j(1,k));
   pi = pi`;
   print ,,'Collinearity Diagnostics, Variance Decomposition',,
          eigval condit pi [c=names];
finish;

 /*---collinearity diagnostics: from the crossproducts XPX---*/
start regcol;
   k=ncol(xpx);
   s = sqrt(1/vecdiag(xpx));
   xpxs = s`#xpx#s;
   call eigen(eigval,v,xpxs);
   q = sqrt(eigval);
   condit=q[1]/q;
   phi=v#v#(1/eigval`);
   phik=phi[,+];
   pi=phi/ (phik*j(1,k));
   pi=pi`;
   print ,,'Collinearity Diagnostics, Variance Decomposition',,
          eigval condit pi [c=names];
finish;


 /*--Residual and Predicted Analysis, with Influence Diagnostics--
 *arguments:
 *  xpxi  contains inverse regressor crossproducts
 *  beta  contains parameter estimates
 *  X     contains regressor data
 *  y     contains response
 *  mse   contains mean square error
 *  dfe   contains degrees of freedom for residual error
 *  names contains regressor names
 *  n     contains number of observations
 *  k     contains number of columns in x
 */
start regresid;

  pred=x*beta;                /* predicted values         */
  resid=y-pred;               /* residuals                */

 /*---compute leverage---*/
 c = x*xpxi;
 h = (c#x)[,+];    /* leverage; faster than vecdiag(x*xpxi*x`) */

 /*---compute standard errors---*/
 stdm = sqrt(h*mse);      /* std error for expected value */
 stdp = sqrt(h*mse+mse);  /* std error for individual value */
 stdr = sqrt(mse#(1-h));  /* std error for residual */
 stures=resid/stdr;       /* ordinary studentized residual */

 /*---compute confidence limits---*/
 tval = tinv(.975,dfe);
 lower95m=pred-tval#stdm; /* lower confidence limit for mean */
 upper95m=pred+tval#stdm; /* upper */
 lower95 =pred-tval#stdp; /* lower confidence limit for indiv */
 upper95 =pred+tval#stdp; /* upper */
 print ,,'Predicted and Residual Values',,y pred resid stures stdr
                                   lower95m upper95m stdm
                                   lower95  upper95  stdp;

 /*---compute influence statistics---*/
 si2 =  (mse#dfe - resid#resid / (1-h) )  / (dfe - 1);
 si = sqrt(si2);     /* root mse for dropping each observation */
 rstudent = resid/(si#sqrt(1-h));
 covratio = 1/((((n-k-1)+rstudent##2)/(n-k))##k # (1-h));
 dfbetas = (resid/(si#(1-h))) # c # (1/sqrt(c[##,])) ;
 dffits = rstudent # sqrt(h/(1-h));
 ii = (-4<>rstudent><4)+5.5 + (0:n-1)`#9; /* scale from 1 to 9 */
 rstuplot = cshape( repeat('|   |   |',n,1),n#9,1,1);
 rstuplot[ii]='*';  rstuplot = cshape(rstuplot,n,1,9);

 print ,,'Influence Diagnostics',,resid h rstudent rstuplot
                                dffits dfbetas[c=names] covratio;

finish;



 /*-----Residual Plots-----
 *arguments:
 *  X     contains regressor data
 *  pred  contains predicted values
 *  resid contains residuals
 *  names contains regressor names
 */
start regplotr;
 reset linesize=80 pagesize=30;
 resname="Residual Value";
 call pgraf(pred||resid,'*','Predicted Value',resname);
 do i=2 to ncol(x);
    call pgraf(x[,i]||resid,'*',names[i],resname);
    end;
finish;


 /*-----Predicted Plots-----
 *arguments
 * xxx contains the regressor to plot on the x-axis
 * n   contains number of observations
 * plus all the variables from module regresid.
 */
start regplotp;
 reset linesize=120 pagesize=50;
 title='Legend: a=actual p=pred l=lower95% u=upper95%';
 xxxx = xxx//xxx//xxx//xxx;
 yyyy = y // pred // lower95 // upper95;
 id = repeat('a',n,1)//repeat('p',n,1)//repeat('l',n,1)
     //repeat('u',n,1);
 call pgraf(xxxx||yyyy,id,'Regressor','Prediction',title);
finish;




reset storage='reg';
store module=(regest regtest regcol regcolx regresid regplotr regplotp);
quit;

 /*NB: THIS SAMPLE REQUIRES the product SAS/STAT in order to run,*/
 /*    although one could edit out the PROC REG to avoid SAS/STAT*/
 /*This sample runs a regression example using both PROC REG, and*/
 /*the IML modules created in the previous IML session.          */

options pagesize=80 linesize=120 ;
data uspop;
 input pop @@; pop=pop/1000;
 retain year 1780; year+10; yearsq=year*year;
 cards;
3929  5308  7239  9638  12866  17069  23191  31443
39818  50155  62947  75994  91972  105710  122775  131669
151325  179323  203211
run;
proc reg;
     model pop = year yearsq/ r collin influence covb corrb;
     test year - 2*yearsq;
     run;
proc iml;
 reset storage='reg';
 load module=(regest regtest regcol regcolx regresid
    regplotr regplotp);
 reset nolog linesize=120 pagesize=60 fw=8;
 use uspop;
 read all;
 n=nrow(year);
 x = j(n,1,1)||year||yearsq;
 names = { intercep, year, yearsq};
 y=pop;

 run regest;               /* estimate the model */
 l={0 1 -2}; run regtest;  /* test an hypothesis */
 run regcol;               /* collinearity diagnostics */
 run regcolx;              /* collinearity diagnostics again */
 run regresid;             /* residual analysis */
 run regplotr;             /* plot residuals */
 xxx = year;  run regplotp; /* plot predicteds, confidence limits*/

 quit;
 title;