/* Documentation Example 6 for PROC HPSPLIT */
/****************************************************************/
/* S A S S A M P L E L I B R A R Y */
/* */
/* NAME: HPSPLEX6 */
/* TITLE: Documentation Example 6 for PROC HPSPLIT */
/* DESC: */
/* REF: None */
/* */
/* PRODUCT: HPSTAT */
/* SYSTEM: ALL */
/* KEYS: */
/* PROCS: HPSPLIT, SORT, SGPLOT */
/* */
/* SUPPORT: saswfk */
/****************************************************************/
/* Base URL of the remote directory that holds the data files. */
%let url = http://archive.ics.uci.edu/ml/machine-learning-databases;

/* Read the comma-delimited wine file through the URL access method.   */
/* All fourteen fields are read as numeric; Cultivar is the first one. */
data Wine;
   infile "&url/wine/wine.data" url dlm=',';

   input Cultivar
         Alcohol Malic Ash Alkan Mg TotPhen Flav
         NFPhen Cyanins Color Hue ODRatio Proline;

   /* Descriptive labels used in procedure output and plots */
   label
      Cultivar = "Cultivar"
      Alcohol  = "Alcohol"
      Malic    = "Malic Acid"
      Ash      = "Ash"
      Alkan    = "Alkalinity of Ash"
      Mg       = "Magnesium"
      TotPhen  = "Total Phenols"
      Flav     = "Flavonoids"
      NFPhen   = "Nonflavonoid Phenols"
      Cyanins  = "Proanthocyanins"
      Color    = "Color Intensity"
      Hue      = "Hue"
      ODRatio  = "OD280/OD315 of Diluted Wines"
      Proline  = "Proline"
   ;
run;
/* ODS Graphics must be on for the cross validation plot to be produced. */
ods graphics on;

/* Grow an entropy-based classification tree for Cultivar and run the    */
/* cost-complexity cross validation requested by the CVCC option.        */
/* Only the cross validation table and plot are displayed; the table is  */
/* also saved to data set p for the 1-SE computation in later steps.     */
proc hpsplit data=Wine seed=15531 cvcc;
   ods output CrossValidationValues=p;
   ods select CrossValidationValues CrossValidationASEPlot;

   class Cultivar;
   model Cultivar = Alcohol Malic Ash Alkan Mg TotPhen Flav
                    NFPhen Cyanins Color Hue ODRatio Proline;

   grow  entropy;
   prune costcomplexity;
run;
/* Order the cross validation results from the largest tree down to the */
/* smallest. Within rows that share nLeaves, the lowest MiscAverage is  */
/* placed first so the next step can keep it with FIRST.nleaves.        */
proc sort data=p;
   by descending nleaves
      MiscAverage;
run;
/* Keep one row per tree size, add error bar limits, and publish the     */
/* location of the minimum average misclassification rate as macro       */
/* variables: xref (its nLeaves) and yref (its MiscAverage + 1 std err). */
data plot;
   set p;
   by descending nleaves;

   retain yval 1e10;                       /* Smallest MiscAverage seen so far */

   if not first.nleaves then delete;       /* Drop duplicate nLeaves rows      */

   MiscMax = MiscAverage + MISCStdErr;     /* Upper end of the error bar       */
   MiscMin = MiscAverage - MISCStdErr;     /* Lower end of the error bar       */

   /* Strict comparison: on a tie the earlier (larger) tree stays selected */
   if yval > MiscAverage then do;
      call symputx('xref', nleaves);       /* nLeaves reference line           */
      call symputx('yref', MiscMax);       /* 1-SE reference line              */
      yval = MiscAverage;                  /* New running minimum              */
   end;
run;
/* Find the tree size chosen by the 1-SE rule. Rows arrive in descending */
/* nLeaves order and every qualifying row overwrites the macro variable, */
/* so the value left at the end is the smallest nLeaves whose            */
/* MiscAverage does not exceed the 1-SE reference value.                 */
data plot;
   set plot;
   if &yref >= MiscAverage then do;
      call symputx('nleaves', nleaves);
   end;
run;
/* Add coordinates for the two highlighted markers. They are nonmissing  */
/* only on the row of the 1-SE selection (xse, yse) and on the row of    */
/* the minimum misclassification rate (xmin, ymin).                      */
data plot;
   set plot;

   format pruningparameter best6.;         /* Compact X axis tick values */

   /* Row selected by the 1-SE rule */
   if nleaves eq &nleaves then do;
      xse = nleaves;
      yse = MiscAverage;
   end;

   /* Row with the minimum average misclassification rate */
   if nleaves eq &xref then do;
      xmin = nleaves;
      ymin = MiscAverage;
   end;
run;
/* Plot the average misclassification rate against the cost-complexity   */
/* parameter (lower X axis) and the number of leaves (upper X axis),     */
/* with error bars, the 1-SE reference line, and markers for the minimum */
/* and the 1-SE selection.                                               */
/* DATA=plot is named explicitly rather than relying on _LAST_, and the  */
/* title names the Wine data that this program actually analyzes.        */
proc sgplot data=plot noautolegend;
   title 'Cost-Complexity Analysis for Wine Using Cross Validation';

   /* Reference lines: nLeaves at the minimum, and the 1-SE threshold */
   refline &xref / axis=x2 lineattrs=(pattern=shortdash);
   refline &yref / axis=y name='a' legendlabel='1-SE';

   /* Misclassification curve on the nLeaves axis, error bars on the */
   /* pruning parameter axis                                          */
   series y=MiscAverage x=nleaves / x2axis lineattrs=graphdata1;
   scatter y=MiscAverage x=pruningparameter / yerrorlower=MiscMin
      yerrorupper=MiscMax errorbarattrs=graphdata1;

   /* Highlighted points */
   scatter y=ymin x=xmin / markerattrs=GraphData2(symbol=circlefilled size=9px)
      x2axis name='b' legendlabel='Min Misc Rate';
   scatter y=yse x=xse / markerattrs=GraphData3(symbol=circlefilled size=9px)
      x2axis name='c' legendlabel='1-SE Selection';

   xaxis type=discrete label='Cost-Complexity Parameter' reverse;
   x2axis type=discrete label='Number of Leaves';
   yaxis label='Average Misclassification Rate' min=0 max=1;
   keylegend 'a' 'b' 'c' / location=inside across=1 noborder;
run;

/* Clear the plot title so it does not carry over to later output */
title;
/* Fit the final classification tree, pruned by cost complexity to the   */
/* number of leaves held in the nLeaves macro variable (set by the       */
/* earlier 1-SE data step). The growth criterion is stated explicitly    */
/* so this run grows the tree the same way as the cross validation run.  */
proc hpsplit data=Wine seed=15531;
   class Cultivar;
   model Cultivar = Alcohol Malic Ash Alkan Mg TotPhen Flav
                    NFPhen Cyanins Color Hue ODRatio Proline;
   grow entropy;
   prune costcomplexity(leaves=&nLeaves);
run;