Modules for Plotting Robust Regression Results

/****************************************************************/
/*          S A S   S A M P L E   L I B R A R Y                 */
/*                                                              */
/*    NAME: ROBUSTMC                                            */
/*   TITLE: Modules for Plotting Robust Regression Results      */
/* PRODUCT: IML                                                 */
/*  SYSTEM: ALL                                                 */
/*                                                              */
/* SUPPORT: Rick Wicklin                UPDATE: Jun 2016        */
/*     REF:                                                     */
/*    MISC:                                                     */
/* Modules: MVESCATTER - PLOTTING MVE RESULTS                   */
/*          MCDSCATTER - PLOTTING MCD RESULTS                   */
/*          RDPLOT - PLOTTING LMS AND LMT RESULTS               */
/*          CalcEllipsesFromCov - supports RESET NOEIGEN93      */
/****************************************************************/

proc iml;

/*------------------------------------------------------------*/
start CalcEllipsesFromCov(ellipse, mAxes, ConfLevel, numObs,
                                   mCenter, mCov, options );
/* This module computes confidence ellipses for mean or prediction
   from a 2x2 covariance matrix.
   Output matrices:
   ellipse    a numEllipsePts x (2*ncol(ConfLevel)) matrix that
              contains the ellipses in parametric form.  There
              are numEllipsePts in the parameterization.
              Each pair of columns ellipse[,2*i-1:2*i] gives the
              X and Y coordinates of the ellipse for confidence
              level ConfLevel[i].
   mAxes      a 4x(2*ncol(ConfLevel)) matrix containing the major
              and minor axes of the ellipses.  Each pair of columns
              m = mAxes[,2*i-1:2*i] describes the axes for that
              ellipse. The major axis goes from m[1,] to m[2,] and
              the minor axis goes from m[3,] to m[4,].

   Input parameters:
   ConfLevel  a 1xn vector of (1-alpha) confidence levels that will
              determine the ellipses. Each entry is
              0 < ConfLevel[i] < 1.  For example, 0.95 produces the
              95% confidence ellipse for alpha=0.05.
   numObs     The number of nonmissing observations in the data
              underlying the covariance matrix.  Require numObs>2.
   mCenter    a 1x2 vector containing an estimate of the (X,Y)
              center of the data. This can be the mean of X and Y,
              or any robust estimate.
   mCov       a 2x2 symmetric covariance matrix. This can be the
              standard covariance matrix or a robust estimate of
              the covariance.
   options    options[1]=1  ==> compute prediction ellipse
              otherwise     ==> compute confidence ellipse for mean
*/

numEllipsePts = 100;

/* Eigenvectors stored in columns. Symmetry==>pos real eigenvalues */
call eigen(eval,evec,mCov);

/* Standardize eigenvectors: put e1 in half space y>0 */
if evec[2,1] < 0 then
   evec[,1] = -evec[,1];
/* Put e2 so that it is in clockwise direction from e1 */
if det(evec) < 0 then
   evec[,2] = -evec[,2];

/* handle numeric zero */
if eval[1] <= 0 then da = 0.;
   else da = sqrt(eval[1]);
if eval[2] <= 0 then db = 0.;
   else db = sqrt(eval[2]);

/* Get orthogonal rotation matrix.
   Since length of eigenvectors is 1:
   evec[,1] = <cos(alpha), sin(alpha)> where alpha is direction angle
   rot = [cos(alpha)   -sin(alpha)]
         [sin(alpha)    cos(alpha)]
   Then rot*{x,y} is rotation of {x,y} by alpha.  If {x y} is row vector,
   use {x y}*rot`
   Because evec is in standard position, rot = evec */
rot = evec;

RX = da * T(evec[,1]); /* row vector: lambda1*evec[,1] */
RY = db * T(evec[,2]); /* row vector: lambda2*evec[,2] */
axes = j(4,2);
axes[1,] = -RX; /* negative semi-major axis */
axes[2,] =  RX; /* positive semi-major axis */
axes[3,] = -RY; /* negative semi-minor axis */
axes[4,] =  RY; /* positive semi-minor axis */

/* see documentation for CORR */
c = 2*(numObs-1) / (numObs*(numObs-2)); /* mean */
if options=1 then
   c = c * (numObs+1);                  /* predictions */
/* note ConfLevel = 1-alpha */
F = sqrt( c * quantile( "F", ConfLevel, 2, numObs-2 ) );

/* Points of ellipsoid */
twoPi = 2*constant("pi");
phi = twoPi * T(0:numEllipsePts-1) / (numEllipsePts-1);
x = cos(phi);
y = sin(phi);

p = rowvec(ConfLevel);
ellipse = j(numEllipsePts, 2*ncol(p));
mAxes = j(4, 2*ncol(p));
ellipseCenter = j(numEllipsePts,1,1) * mCenter;
axesCenter = j(4,1,1) * mCenter;
do i = 1 to ncol(p);
   ellipse[,(2*i-1):2*i] = F[i] # (da*x || db*y) * rot` + ellipseCenter;
   mAxes[,(2*i-1):2*i] = F[i] # axes + axesCenter;
end;
finish;

/*------------------------------------------------------------*/

start MVEScatter(a, optn, prob=0.9, vnam=,
                 titl="Estimates of Location and Scale");
if IsSkipped(vnam) then
   varnames = "x1":("x"+strip(char(ncol(a))));
else varnames = vnam;
call RobustScatter("MVE", a, optn, prob, varnames, titl);
finish;

/*------------------------------------------------------------*/

start MCDScatter(a, optn, prob=0.9, vnam=,
                 titl="Estimates of Location and Scale");
if IsSkipped(vnam) then
   varnames = "x1":("x"+strip(char(ncol(a))));
else varnames = vnam;
call RobustScatter("MCD", a, optn, prob, varnames, titl);
finish;

/*------------------------------------------------------------*/

start RobustScatter(type, a, optn, prob=0.9, varnames,
                    titl="Estimates of Location and Scale");
p = ncol(a);
numObs = nrow(a);
muc = mean(a); /* classical estimates */
covc= cov(a);

if upcase(type)="MVE" then
   CALL MVE(sc,xm,dist,optn,a);
else
   CALL MCD(sc,xm,dist,optn,a);
mur = xm[1,]; /* robust estimates */
covr = xm[3:2+p,];
wgts = dist[3,];

do j = 2 to p;
   do i = 1 to j-1;
      x=a[,i]; y=a[,j];
      mu2 = mur[i] || mur[j];
      cov2 = ( covr[i,i] || covr[i,j] ) //
             ( covr[j,i] || covr[j,j] );
      run CalcEllipsesFromCov(ellipse, mAxes, prob,
                              numObs, mu2, cov2, 1);
      ex = ellipse[,1];      ey = ellipse[,2];
      create _dta var {"x" "y" "wgts" "ex" "ey"};
      append;
      close _dta;
      labx = varnames[i]; laby = varnames[j];
      submit labx laby titl;
         data _dta;
         set _dta;
         label x="&labx" y="&laby";
         if wgts=0 then do;
            Outlier="Yes";  ID=_N_;
         end;
         else do;
            Outlier="No";  ID=.;
         end;
         run;

         title "&titl";
         proc sgplot data=_dta;
         styleattrs datacolors=(Blue Red)
                    datasymbols=(CircleFilled X);
         scatter x=x y=y / markerattrs=(size=10)
                    datalabel=ID
                    group=Outlier nomissinggroup name="scat";
         ellipse x=x y=y / alpha=0.1 lineattrs=(pattern=dash)
                    name="classic"
                    legendlabel="Classical 90% Prediction";
         series x=ex y=ey /name="robust"
                    legendlabel="Robust 90% Prediction";
         keylegend "scat" /location=inside position=bottomright
                    across=1 title="Outlier";
         keylegend "classic" "robust";
         run;
         title;
      endsubmit;
   end;
end;
call delete(_dta);
finish RobustScatter;

/*------------------------------------------------------------*/

start RDPlot(method,LMSOpt,MCDOpt,y,x);
call mcd(sc,xmve,dist,MCDOpt,x);
if upcase(method)="LMS" then
   call lms(sc, coef, wgt, LMSOpt, y, x);
else
   call lts(sc, coef, wgt, LMSOpt, y, x);

RobResid = wgt[2,]; /* robust residuals */
MD = dist[1,];      /* Mahalanobis distance (MD) */
RD = dist[2,];      /* Robust distance (RD) */

Outlier = (abs(RobResid)>2.5);
cutoff = sqrt(quantile("chisq",0.975, ncol(x)));
Leverage = ^dist[3,]; /* indicator variable for leverage pts */
/* classify each point:
   0 = regular point
   1 = outlier
   2 = leverage
   3 = outlier & leverage */
Both = Outlier # Leverage;
Type = choose(Leverage, 2, Outlier);
Type = choose(Both, 3, Type);
create _rdplot0 var {"RobResid" MD RD "Type"};
append;
close _rdplot0;

submit cutoff;
proc format;
value TypeFmt 0 = "Regular"
              1 = "Outlier"
              2 = "Leverage"
              3 = "Outlier and Leverage";
run;

data _Attrs;
length Value $20 MarkerColor $20;
ID = "RDAttr";
Value= putn(0,"TypeFmt."); MarkerColor = "Black       "; output;
Value= putn(1,"TypeFmt."); MarkerColor = "DarkRed     "; output;
Value= putn(2,"TypeFmt."); MarkerColor = "DarkBlue    "; output;
Value= putn(3,"TypeFmt."); MarkerColor = "DarkGreen   "; output;
run;

data _rdplot;
label RobResid="Standardized Robust Residual"
      MD="Mahalanobis Distance"
      RD="Robust Distance";
format Type TypeFmt.;
set _rdplot0;
if Type>0 then ID=_N_;
run;

title "Outlier and Leverage Diagnostics";
proc sgplot data=_rdplot DATTRMAP=_Attrs;
   scatter x=RD y=RobResid / datalabel=ID
           markerattrs=(size=10 symbol=CircleFilled)
           transparency=0.3 group=Type
           attrid=RDAttr GROUPORDER=ascending;
   refline &cutoff / axis=x;
   refline -2.5 2.5 / axis=y;
run;
title;
endsubmit;

call delete({_Attrs _rdplot0});
finish RDPlot;

/*------------------------------------------------------------*/

store module=_all_;
quit;