Modules for Plotting Robust Regression Results
/****************************************************************/
/* S A S S A M P L E L I B R A R Y */
/* */
/* NAME: ROBUSTMC */
/* TITLE: Modules for Plotting Robust Regression Results */
/* PRODUCT: IML */
/* SYSTEM: ALL */
/* */
/* SUPPORT: Rick Wicklin UPDATE: Jun 2016 */
/* REF: */
/* MISC: */
/* Modules: MVESCATTER - PLOTTING MVE RESULTS */
/* MCDSCATTER - PLOTTING MCD RESULTS */
/* RDPLOT - PLOTTING LMS AND LMT RESULTS */
/* CalcEllipsesFromCov - supports RESET NOEIGEN93 */
/****************************************************************/
proc iml;
/*------------------------------------------------------------*/
start CalcEllipsesFromCov(ellipse, mAxes, ConfLevel, numObs,
mCenter, mCov, options );
/* This module computes confidence ellipses for mean or prediction
from a 2x2 covariance matrix.
Output matrices:
ellipse a numEllipsePts x (2*ncol(ConfLevel)) matrix that
contains the ellipses in parametric form. There
are numEllipsePts in the parameterization.
Each pair of columns ellipse[,2*i-1:2*i] gives the
X and Y coordinates of the ellipse for confidence
level ConfLevel[i].
mAxes a 4x(2*ncol(ConfLevel)) matrix containing the major
and minor axes of the ellipses. Each pair of columns
m = mAxes[,2*i-1:2*i] describes the axes for that
ellipse. The major axis goes from m[1,] to m[2,] and
the minor axis goes from m[3,] to m[4,].
Input parameters:
ConfLevel a 1xn vector of (1-alpha) confidence levels that will
determine the ellipses. Each entry is
0 < ConfLevel[i] < 1. For example, 0.95 produces the
95% confidence ellipse for alpha=0.05.
numObs The number of nonmissing observations in the data
underlying the covariance matrix. Require numObs>2.
mCenter a 1x2 vector containing an estimate of the (X,Y)
center of the data. This can be the mean of X and Y,
or any robust estimate.
mCov a 2x2 symmetric covariance matrix. This can be the
standard covariance matrix or a robust estimate of
the covariance.
options options[1]=1 ==> compute prediction ellipse
otherwise ==> compute confidence ellipse for mean
*/
numEllipsePts = 100;
/* Eigenvectors stored in columns. Symmetry==>pos real eigenvalues */
call eigen(eval,evec,mCov);
/* Standardize eigenvectors: put e1 in half space y>0 */
if evec[2,1] < 0 then
evec[,1] = -evec[,1];
/* Put e2 so that it is in clockwise direction from e1 */
if det(evec) < 0 then
evec[,2] = -evec[,2];
/* handle numeric zero */
if eval[1] <= 0 then da = 0.;
else da = sqrt(eval[1]);
if eval[2] <= 0 then db = 0.;
else db = sqrt(eval[2]);
/* Get orthogonal rotation matrix.
Since length of eigenvectors is 1:
evec[,1] = <cos(alpha), sin(alpha)> where alpha is direction angle
rot = [cos(alpha) -sin(alpha)]
[sin(alpha) cos(alpha)]
Then rot*{x,y} is rotation of {x,y} by alpha. If {x y} is row vector,
use {x y}*rot`
Because evec is in standard position, rot = evec */
rot = evec;
RX = da * T(evec[,1]); /* row vector: lambda1*evec[,1] */
RY = db * T(evec[,2]); /* row vector: lambda2*evec[,2] */
axes = j(4,2);
axes[1,] = -RX; /* negative semi-major axis */
axes[2,] = RX; /* positive semi-major axis */
axes[3,] = -RY; /* negative semi-minor axis */
axes[4,] = RY; /* positive semi-minor axis */
/* see documentation for CORR */
c = 2*(numObs-1) / (numObs*(numObs-2)); /* mean */
if options=1 then
c = c * (numObs+1); /* predictions */
/* note ConfLevel = 1-alpha */
F = sqrt( c * quantile( "F", ConfLevel, 2, numObs-2 ) );
/* Points of ellipsoid */
twoPi = 2*constant("pi");
phi = twoPi * T(0:numEllipsePts-1) / (numEllipsePts-1);
x = cos(phi);
y = sin(phi);
p = rowvec(ConfLevel);
ellipse = j(numEllipsePts, 2*ncol(p));
mAxes = j(4, 2*ncol(p));
ellipseCenter = j(numEllipsePts,1,1) * mCenter;
axesCenter = j(4,1,1) * mCenter;
do i = 1 to ncol(p);
ellipse[,(2*i-1):2*i] = F[i] # (da*x || db*y) * rot` + ellipseCenter;
mAxes[,(2*i-1):2*i] = F[i] # axes + axesCenter;
end;
finish;
/*------------------------------------------------------------*/
start MVEScatter(a, optn, prob=0.9, vnam=,
titl="Estimates of Location and Scale");
if IsSkipped(vnam) then
varnames = "x1":("x"+strip(char(ncol(a))));
else varnames = vnam;
call RobustScatter("MVE", a, optn, prob, varnames, titl);
finish;
/*------------------------------------------------------------*/
start MCDScatter(a, optn, prob=0.9, vnam=,
titl="Estimates of Location and Scale");
if IsSkipped(vnam) then
varnames = "x1":("x"+strip(char(ncol(a))));
else varnames = vnam;
call RobustScatter("MCD", a, optn, prob, varnames, titl);
finish;
/*------------------------------------------------------------*/
start RobustScatter(type, a, optn, prob=0.9, varnames,
titl="Estimates of Location and Scale");
p = ncol(a);
numObs = nrow(a);
muc = mean(a); /* classical estimates */
covc= cov(a);
if upcase(type)="MVE" then
CALL MVE(sc,xm,dist,optn,a);
else
CALL MCD(sc,xm,dist,optn,a);
mur = xm[1,]; /* robust estimates */
covr = xm[3:2+p,];
wgts = dist[3,];
do j = 2 to p;
do i = 1 to j-1;
x=a[,i]; y=a[,j];
mu2 = mur[i] || mur[j];
cov2 = ( covr[i,i] || covr[i,j] ) //
( covr[j,i] || covr[j,j] );
run CalcEllipsesFromCov(ellipse, mAxes, prob,
numObs, mu2, cov2, 1);
ex = ellipse[,1]; ey = ellipse[,2];
create _dta var {"x" "y" "wgts" "ex" "ey"};
append;
close _dta;
labx = varnames[i]; laby = varnames[j];
submit labx laby titl;
data _dta;
set _dta;
label x="&labx" y="&laby";
if wgts=0 then do;
Outlier="Yes"; ID=_N_;
end;
else do;
Outlier="No"; ID=.;
end;
run;
title "&titl";
proc sgplot data=_dta;
styleattrs datacolors=(Blue Red)
datasymbols=(CircleFilled X);
scatter x=x y=y / markerattrs=(size=10)
datalabel=ID
group=Outlier nomissinggroup name="scat";
ellipse x=x y=y / alpha=0.1 lineattrs=(pattern=dash)
name="classic"
legendlabel="Classical 90% Prediction";
series x=ex y=ey /name="robust"
legendlabel="Robust 90% Prediction";
keylegend "scat" /location=inside position=bottomright
across=1 title="Outlier";
keylegend "classic" "robust";
run;
endsubmit;
end;
end;
call delete(_dta);
finish RobustScatter;
/*------------------------------------------------------------*/
start RDPlot(method,LMSOpt,MCDOpt,y,x);
call mcd(sc,xmve,dist,MCDOpt,x);
if upcase(method)="LMS" then
call lms(sc, coef, wgt, LMSOpt, y, x);
else
call lts(sc, coef, wgt, LMSOpt, y, x);
RobResid = wgt[2,]; /* robust residuals */
MD = dist[1,]; /* Mahalanobis distance (MD) */
RD = dist[2,]; /* Robust distance (RD) */
Outlier = (abs(RobResid)>2.5);
cutoff = sqrt(quantile("chisq",0.975, ncol(x)));
Leverage = ^dist[3,]; /* indicator variable for leverage pts */
/* classify each point:
0 = regular point
1 = outlier
2 = leverage
3 = outlier & leverage */
Both = Outlier # Leverage;
Type = choose(Leverage, 2, Outlier);
Type = choose(Both, 3, Type);
create _rdplot0 var {"RobResid" MD RD "Type"};
append;
close _rdplot0;
submit cutoff;
proc format;
value TypeFmt 0 = "Regular"
1 = "Outlier"
2 = "Leverage"
3 = "Outlier and Leverage";
run;
data _Attrs;
length Value $20 MarkerColor $20;
ID = "RDAttr";
Value= putn(0,"TypeFmt."); MarkerColor = "Black "; output;
Value= putn(1,"TypeFmt."); MarkerColor = "DarkRed "; output;
Value= putn(2,"TypeFmt."); MarkerColor = "DarkBlue "; output;
Value= putn(3,"TypeFmt."); MarkerColor = "DarkGreen "; output;
run;
data _rdplot;
label RobResid="Standardized Robust Residual"
MD="Mahalanobis Distance"
RD="Robust Distance";
format Type TypeFmt.;
set _rdplot0;
if Type>0 then ID=_N_;
run;
title "Outlier and Leverage Diagnostics";
proc sgplot data=_rdplot DATTRMAP=_Attrs;
scatter x=RD y=RobResid / datalabel=ID
markerattrs=(size=10 symbol=CircleFilled)
transparency=0.3 group=Type
attrid=RDAttr GROUPORDER=ascending;
refline &cutoff / axis=x;
refline -2.5 2.5 / axis=y;
run;
endsubmit;
call delete({_Attrs _rdplot0});
finish RDPlot;
/*------------------------------------------------------------*/
store module=_all_;
quit;