FOCUS AREAS

Assessing the Accuracy of Cluster Allocations Obtained from Finite Mixture Models


Contents | Back to Example

data ncsids;
   input County $ 1-12 Births SIDS;
   Rate=SIDS/Births;
   Logrisk=log(Births);
   datalines;
Alamance        4672     13
Alexander       1333      0
Alleghany        487      0
Anson           1570     15
Ashe            1091      1
Avery            781      0
Beaufort        2692      7
Bertie          1324      6
Bladen          1782      8
Brunswick       2181      5
Buncombe        7515      9
Burke           3573      5
Cabarrus        4099      3
Caldwell        3609      6
Camden           286      0
Carteret        2414      5
Caswell         1035      2
Catawba         5754      5
Chatham         1646      2
Cherokee        1027      2
Chowan           751      1
Clay             284      0
Cleveland       4866     10
Columbus        3350     15
Craven          5868     13
Cumberland     20366     38
Currituck        508      1
Dare             521      0
Davidson        5509      8
Davie           1207      1
Duplin          2483      4
Durham          7970     16
Edgecombe       3657     10
Forsyth        11858     10
Franklin        1399      2
Gaston          9014     11
Gates            420      0
Graham           415      0
Granville       1671      4
Greene           870      4
Guilford       16184     23
Halifax         3608     18
Harnett         3776      6
Haywood         2110      2
Henderson       2574      5
Hertford        1452      7
Hoke            1494      7
Hyde             338      0
Iredell         4139      4
Jackson         1143      2
Johnston        3999      6
Jones            578      1
Lee             2252      5
Lenoir          3589     10
Lincoln         2216      8
McDowell        1946      5
Macon            797      0
Madison          765      2
Martin          1549      2
Mecklenburg    21588     44
Mitchell         671      0
Montgomery      1258      3
Moore           2648      5
Nash            4021      8
New Hanover     5526     12
Northampton     1421      9
Onslow         11158     29
Orange          3164      4
Pamlico          542      1
Pasquotank      1638      3
Pender          1228      4
Perquimans       484      1
Person          1556      4
Pitt            5094     14
Polk             533      1
Randolph        4456      7
Richmond        2756      4
Robeson         7889     31
Rockingham      4449     16
Rowan           4606      3
Rutherford      2992     12
Sampson         3025      4
Scotland        2255      8
Stanly          2356      5
Stokes          1612      1
Surry           3188      5
Swain            675      3
Transylvania    1173      3
Tyrell           248      0
Union           3915      4
Vance           2180      4
Wake           14484     16
Warren           968      4
Washington       990      5
Watauga         1323      1
Wayne           6638     18
Wilkes          3146      4
Wilson          3702     11
Yadkin          1269      1
Yancey           770      0
;
proc fmm data=ncsids;
   model SIDS = / dist=poisson k=2 offset=Logrisk;
   output out=model class maxpost prior;
   ods output ParameterEstimates=Parameters
              NObs=NObs(where=(label="Number of Observations Used") keep=label N)
              MixingProbs=MixingProbs(keep=prob);
run;
proc rank data=model out=model descending;
   var rate;
   ranks order;
run;   
proc format;
   value group 1 = 'Normal'
               2 = 'High';
run;
proc sgplot data=model;
   title "Ordered Incidence Rates";
   format class group.;
   scatter x=order y=rate / 
           markerattrs=(symbol=CircleFilled size=4px)
           group=class;
   yaxis values=(.001 .002 .003 .004 .005 .006 .007 .008 .009 .01);
run;
proc rank data=model out=model descending;
   var maxpost;
   ranks postorder;
run;
title "Ordered Maximum Posterior Probabilities";
proc sgplot data=model;
   scatter y=maxpost x=postorder / markerattrs=(symbol=CircleFilled size=4px) 
                                   group=class;
run;
title;
title "Maximum Posterior Probabilities by Incidence Rates";
proc sgplot data=model;
   scatter y=maxpost x=rate / markerattrs=(symbol=CircleFilled size=4px) 
                              group=class;
run;
title;
proc sort data=model out=model;
   by Class;
run;

proc means data=model sum;
   by Class;
   var Maxpost;
   output out=ModelPost(drop=_TYPE_ _FREQ_) 
          sum(Maxpost)=Maxpost;
run;   

proc transpose data=ModelPost prefix=MaxPost
               out=ModelPost(drop=_LABEL_ _NAME_);
   var Maxpost;
   id Class;
run;
data ModelPost;
   merge ModelPost NObs(drop=label) MixingProbs;
   T=(MaxPost1 + MaxPost2)/N;
   T1=MaxPost1/(N*Prob);
   T2=MaxPost2/(N*(1-Prob));
   label T=Estimated Overall Correct Allocation Rate
         T1=Estimated Component 1 Correct Allocation Rate
         T2=Estimated Component 2 Correct Allocation Rate;
run;
proc print data=ModelPost noobs label;
   var T T1 T2;
run;
proc surveyselect data=model(keep=county sids logrisk) 
                  out=bootstrap
                  seed=872398 
                  method=srs 
                  samprate=1 
                  rep=200; 
run;
data parameters;
   set parameters;
   if component=1 then call symput('theta1',estimate);
   if component=2 then call symput('theta2',estimate);
run;

data MixingProbs;
   set MixingProbs;
   call symput('pi1',prob);
run;
data bootstrap; 
   set bootstrap;
   call streaminit(987346598);
   Group=rand('TABLE',&pi1);
   mu1=exp(&theta1+logrisk);
   mu2=exp(&theta2+logrisk);
   SIDS=ifn(group=1,rand('POISSON',mu1),rand('POISSON',mu2));
run;
proc sort data=bootstrap out=bootstrap;
   by Replicate;
run;
ods select none;
proc fmm data=bootstrap;
   by Replicate;
   model SIDS = / dist=poisson k=2 offset=Logrisk;
   output out=ml class maxpost prior;
   ods output ParameterEstimates=BootParameters(keep=replicate component estimate)
              ConvergenceStatus=Converge(keep=replicate status);
run;
proc transpose data=BootParameters 
               out=BootParameters(drop=_NAME_) 
               prefix=Estimate;
  by Replicate;
  id Component;
run;

proc sort data=BootParameters out=BootParameters;
   by Replicate;
run;

proc sort data=Converge;
   by Replicate;
run;
data ml;
   merge ml BootParameters Converge;
   by Replicate;
run;
data ml(drop=Prior1 Prior2 Lambda1 Lambda2 Class2 Status);
   set ml(where=(Status=0));
   Prior1=Prior_1;
   Prior2=Prior_2;
   Post1=Post_1;
   Post2=Post_2;
   Lambda1=Estimate1;
   Lambda2=Estimate2;
   Class2=Class;
   if Estimate1>Estimate2 then do;
      Prior_1=Prior2;
      Prior_2=Prior1;
      if Class2=1 then Class=2;
      if Class2=2 then Class=1;
   end;
run;
data ml;
   set ml;
   z1=ifn(Group=1,1,0);
   zhat1=ifn(Class=1,1,0);
   d1=ifn(z1=zhat1,1,0);
   z2=ifn(Group=2,1,0);
   zhat2=ifn(Class=2,1,0);
   d2=ifn(z2=zhat2,1,0);
   A1=z1*d1;
   A2=z2*d2;
run;
proc sort data=ml out=ml;
   by Replicate Class;
run;

proc means data=ml(keep= replicate A1 A2 z1 z2) sum;
   by Replicate;
   var A1 A2 z1 z2;
   output out=allocation(drop=_TYPE_ _FREQ_) 
          sum(A1)=A1
          sum(A2)=A2
          sum(z1)=n1
          sum(z2)=n2;
run;
data allocation;
   set allocation;
   A1=A1/n1;
   A2=A2/n2;
   n=n1+n2;
   A=(n1*A1 + n2*A2)/n;
run;
proc means data=ml sum;
   by Replicate Class;
   var maxpost;
   output out=PostSums(drop=_TYPE_ _FREQ_) 
          sum(maxpost)=Maxpost;
run;

proc transpose data=PostSums prefix=SumMaxPost
               out=PostSums(drop=_LABEL_ _NAME_);
   var Maxpost;
   by Replicate;
   id Class;
run;

proc sort data=allocation out=allocation;
   by Replicate;
run;

proc sort data=PostSums out=PostSums;
   by Replicate;
run;
data prior;
   set ml(keep=Replicate Prior_1 Prior_2);
   by Replicate;
   if First.Replicate;
run;
proc sort data=prior out=prior;
   by Replicate;
run;
data allocation;
   merge allocation PostSums prior;
   by Replicate;
run;

data allocation;
   set allocation;
   T=(SumMaxPost1 + SumMaxPost2)/n;
   T1=SumMaxPost1/(n*Prior_1);
   T2=SumMaxPost2/(n*Prior_2);
run;
data allocation;
   set allocation;
   B=T-A;
   Bsqr=B**2;
   B1=T1-A1;
   B1sqr=B1**2;
   B2=T2-A2;
   B2sqr=B2**2;
run;
proc means data=allocation mean;
   var B B1 B2 Bsqr B1sqr B2sqr;
   output out=bias 
          mean(B)=B
          var(B)=varB
          mean(B1)=B1
   var(B1)=varB1
          mean(B2)=B2
   var(B2)=varB2
          mean(Bsqr)=Bsqr
          mean(B1sqr)=B1sqr
          mean(B2sqr)=B2sqr
          N=K;
run;
data bias;
   set bias;
   se_B=sqrt(varB/K);
   se_B1=sqrt(varB1/K);
   se_B2=sqrt(varB2/K);
   RMSE_T1=sqrt(B1sqr);
   RMSE_T2=sqrt(B2sqr);
   RMSE_T=sqrt(Bsqr);
run;
data BiasAdjusted;
   merge bias(drop = _TYPE_ _FREQ_) ModelPost;
   BiasAdjT=T-B;
   BiasAdjT1=T1-B1;
   BiasAdjT2=T2-B2;
run;
data T;
   set BiasAdjusted(keep=T B SE_B BiasAdjT RMSE_T);
   length Label $ 12;
   label='Overall';
   rename T=Estimate
          B=Bias
          SE_B=SE
          BiasAdjT=Adjusted
          RMSE_T=RMSE;
run;

data T1;
   set BiasAdjusted(keep=T1 B1 SE_B1 BiasAdjT1 RMSE_T1);
   length Label $ 12;
   Label='Normal Risk';
   rename T1=Estimate
          B1=Bias
          SE_B1=SE
          BiasAdjT1=Adjusted
          RMSE_T1=RMSE;
run;

data T2;
   set BiasAdjusted(keep=T2 B2 SE_B2 BiasAdjT2 RMSE_T2);
   length Label $ 12;
   Label='High Risk';
   rename T2=Estimate
          B2=Bias
          SE_B2=SE
          BiasAdjT2=Adjusted
          RMSE_T2=RMSE;
run;
proc datasets;
   delete results;
run;
proc append base=results data=T;
run;
proc append base=results data=T1;
run;
proc append base=results data=T2;
run;
data results;
   set results;
   ratio=bias/rmse;
   label Label=Population
         Estimate=Estimated Correct Allocation Rate
         Bias=Estimate of Bias
         SE=Standard Error of Bias Estimate
         Adjusted=Bias-Adjusted Correct Allocation Rate
         RMSE=RMSE of Correct Allocation Rate
         ratio=ratio of Bias to RMSE; 
run;
ods select all;
proc print data=results noobs label;
   var Label Estimate RMSE Bias SE ratio Adjusted;
   title 'Estimated Correct Allocation Rates';
   title2 'Parametric Bootstrap Method';
run;
title;
title2;