Automatic Model Selection in Contingency Tables

 /****************************************************************/
 /*          S A S   S A M P L E   L I B R A R Y                 */
 /*                                                              */
 /*    NAME: LOGLIN                                              */
 /*   TITLE: Automatic Model Selection in Contingency Tables     */
 /* PRODUCT: IML                                                 */
 /*  SYSTEM: ALL                                                 */
 /*    KEYS: SSTAT MATRIX                                        */
 /*   PROCS: IML                                                 */
 /*    DATA:                                                     */
 /*                                                              */
 /* SUPPORT: LWB                         UPDATE:                 */
 /*     REF: REFERENCES:                                         */
 /*          GOODMAN, L.A. (1971). The Analysis of               */
 /*          Multidimensional Contingency Tables: Stepwise       */
 /*          Procedures and Direct Estimation Methods for Model  */
 /*          Building for Multiple Classification.               */
 /*          Technometrics, 13, 33-61                            */
 /*          LIN, S.P. (1982). Algorithm AS 185. Automatic Model */
 /*          Selection in Contingency Tables.                    */
 /*          APPLIED STATISTICS, 31, 317-326                     */
 /*    MISC: CONVERTED FROM MATRIX TO IML USING MATIML           */
 /*                                                              */
 /****************************************************************/

PROC IML;
  RESET AUTONAME  FW=8;
  TITLE1  'AUTOMATIC MODEL SELECTION IN CONTINGENCY TABLES';
  TITLE2 'BACKWARD ELIMINATION PROCEDURE';

 *--------------------------------------------------------------;
 *-----FIRST DEFINE IML MODULES TO BE USED BY MAIN PROGRAM------;
 *--------------------------------------------------------------;

 *-----ROUTINE TO PERFORM ITERATIVE BACKWARD ELIMINATION--------;
 START BWELM;
     DO WHILE (IM^=0);
        NOIT=NOIT+1;
        PP=0;
        IM=0;
        PRINT " ITERATION NO:";
        PRINT NOIT;
        IF ( NOIT=1) THEN IF ( INIT=1)
        THEN PRINT " INITIAL FIT GIVEN BY USER IS:";
        ELSE PRINT " INITIAL FIT SELECTED BY THE PROGRAM IS:";
        ELSE PRINT "BEST SIMPLIFIED MODEL FROM THE PREVIOUS ITERATION IS:";
        STAT=LRI || DFI || PI;
        RR=R1;
        CONFIG=CONFGI;

        RUN CONWRT;
        PRINT " SIMPLIFIED MODELS IN THIS ITERATION ARE:";
        DO II=1 TO NCONI;
           RUN SUBMOD;
           CALL IPF(FIT,STATUS,DIM,TABLE,CONFIG);
           RUN LRCHSQ;

           RUN DEGREF;
           P=1- PROBCHI(LR,DF);
           LRD=LR-LRI;
           DFD=DF-DFI;
           PD=1- PROBCHI(LRD, DFD);
           RR=R2;
           STAT=(LR || DF || P) // (LRD || DFD || PD);

           RUN CONWRT;
           IF ( P>=AP1 & PD >=AP3) THEN IF ( PD>=PP) THEN  DO;
              RUN COPYB;
              PP=PD;
              IM=IM+1;
              END;
           END;
        IF ( IM^=0) THEN
          RUN COPYT;
        END;
     STAT=LRI || DFI || PI;
     RR=R1;
     PRINT
     " ALL SIMPLIFIED MODELS IN THE ABOVE ITERATION ARE UNACCEPTABLE";
     PRINT " THE BEST FINAL FIT IS";
     CONFIG=CONFGI;

     RUN CONWRT;
     ABORT;
  FINISH;

  *--ROUTINE TO OBTAIN TOTAL NUMBER OF K-SUBSETS OF A N-SET----;
  START NCK;
     M=1;
     DO KI=1 TO K;
        M=M#(N-KI+1)/KI;
     END;
  FINISH;

* ROUTINE TO GET NEXT K-SUBSET OF AN N-SET, IN LEXICOGRAPHIC ORDER;
  START NXKSB;
     IF ( MM=1) THEN  DO;
        H=K;
        C=0;
     END;
     ELSE  DO;
        H=1;
        DO WHILE ( A[K+1-H,1]=N+1-H);
           H=H+1;
        END;
        C= A[K+1-H,1];
     END;
     DO J=1 TO H;
        A[K+J-H,1]=C+J;
     END;
  FINISH;

  *---ROUTINE TO COMPUTE TWICE THE LIKELIHOOD RATIO STATISTIC--;
  START LRCHSQ;
      SUMM=0;
      DO I=1 TO NTAB;
         IF ( TABLE[1,I]^=0) THEN
            SUMM=SUMM+ TABLE[1,I]* LOG( TABLE[1,I]/FIT[1,I]);
      END;
      LR=2#SUMM;
  FINISH;

  *-ROUTINE TO COMPUTE DEGREE OF FREEDOM OF A LOG-LINEAR MODEL-;
  START DEGREF;
     NP=0;
     HASH=A1;
     DO NN=1 TO NCON;
        N= NCOL( LOC( CONFIG[,NN]));
        DO K=1 TO N;
           RUN NCK;
           A=A0;
           DO MM=1 TO M;
              KK=0;
              LL=1;
              RUN NXKSB;
              DO KI=1 TO K;
                 TEMP= CONFIG[ A[KI,1],NN];
                 KK=KK+2##(TEMP-1);
                 LL=LL#( DIM[1,TEMP]-1);
                 END;
              KK= ROUND(KK);
              IF ( HASH[1,KK]=0) THEN  DO;
                 HASH[1,KK]=1;
                 NP=NP+LL;
                 END;
              END;
           END;
        END;
     DF=NTAB-NP-1;
  FINISH;

  *-ROUTINE TO CONSTRUCT SIMPLIFIED MODELS BY DELETING AN EFFECT-;
  START SUBMOD;
     N= NCOL( LOC( CONFGI[,II]));
     K=N-1;
     RUN NCK;
     A=A0;
     NLOW=0;
     IF ( NCONI>1) THEN  DO;
        IDX= J(1,NCONI-1,0);
        ICON=1:NCONI;
        DO J=1 TO NCONI;
           IF ( J<II) THEN  IDX[1,J]= ICON[1,J];
           IF ( J>II) THEN  IDX[1,J-1]= ICON[1,J];
           END;
        CONFG1= CONFGI[,IDX];
        END;
     DO MM=1 TO M;
        RUN NXKSB;
        KS= NCOL( LOC(A));
        ILOW=1;
        B=A0;
        B[1:KS,1]= CONFGI[ A[1:KS,1],II];
    IF ( NCONI>1) THEN  DO I1=1 TO NCONI-1 WHILE (ILOW=1);
           I3=0;
           IF ( NCOL( LOC( CONFG1[,I1]))>KS) THEN  DO;
              DO I2=1 TO KS;
                 I3=I3+( SUM( CONFG1[,I1]= B[I2,1])=1);
                 END;
              IF ( I3=KS) THEN  ILOW=0;
              END;
           END;
        NLOW=NLOW+ILOW;
        IF ( ILOW=1) THEN IF ( NLOW>1) THEN  CONFG2=CONFG2 || B;
         ELSE  CONFG2=B;
        END;
     IF ( NCONI=1) THEN  CONFIG=CONFG2;
     ELSE IF ( NLOW>=1) THEN  CONFIG=CONFG1 || CONFG2;
     ELSE  CONFIG=CONFG1;
     NCON= NCOL(CONFIG);
  FINISH;

  *-----ROUTINE TO PRINT MODEL CONFIGURATION & STATISTICS------;
  START CONWRT;
     MODEL= J(1, NCOL(CONFIG),0);
     DO I=1 TO NVAR;
        MODEL[1,]= MODEL[1,]+ CONFIG[I,]#(10##(NVAR-I));
        END;
     PRINT MODEL;
     PRINT STAT[ FORMAT=8.3][ ROWNAME=RR][ COLNAME=CC];
  FINISH;

  *-------------------EXIT AND COPY ROUTINES-------------------;
  START COPYI;
     NCONI=NCON;
     CONFGI=CONFIG;
     LRI=LR;
     DFI=DF;
     PI=P;
  FINISH;

  START COPYB;
     NCONB=NCON;
     CONFGB=CONFIG;
     LRB=LR;
     DFB=DF;
     PB=P;
  FINISH;

  START COPYT;
     NCONI=NCONB;
     CONFGI=CONFGB;
     LRI=LRB;
     DFI=DFB;
     PI=PB;
  FINISH;

  START FAULT;
     PRINT " ERROR IN NVAR, DIM OR TABLE";
     ABORT;
  FINISH;

  START SATUR;
     PRINT " THE BEST FINAL FIT IS THE SATURATED MODEL";
     ABORT;
  FINISH;

  *------------------------------------------------------------;
  *--NOW THAT IML MODULES ARE DEFINED, BEGIN THE MAIN PROGRAM--;
  *------------------------------------------------------------;
  START MAIN;
     *--------------------USER-PROVIDED INFORMATION------------;
     NVAR=3;
     DIM={2 2 2};
     AP1=0.05;
     AP2=0.10;
     AP3=0.10;
     INIT=1;
     TABLE={156 84 84 156 107 133 31 209};
                  *DATA FROM BISHOP ET AL. (1975);
     CONFIG={1 1 2,2 3 3,0 0 0};
                  *USER-SPECIFIED INITIAL FIT IF INIT=1;

     *------------ERROR CHECKING AND INITILIZATION-------------;
     IF ( NCOL(DIM)^=NVAR | NCOL(TABLE)^= DIM[,#]) THEN
      RUN FAULT;
     A0= J(NVAR,1,0);
     A1= J(1,2##NVAR-1,0);
     IFIT=0;
     NOIT=0;
     IM=1;
     NTAB= DIM[,#];
     R1='FIT';
     R2={'FIT' 'DIFF'};
     CC={'LR' 'DF' 'P'};

     *-------------FIT USER-SPECIFIED INITIAL FIT--------------;
     IF ( INIT=1) THEN  DO;
        NCON= NCOL(CONFIG);
        IF ( NCON=1 & NROW(CONFIG)=NVAR) THEN  DO;
           LR=0;
           DF=0;
           P=1;
           END;
         ELSE  DO;
           CALL IPF(FIT,STATUS,DIM,TABLE,CONFIG);
           RUN LRCHSQ;
           RUN DEGREF;
           P=1- PROBCHI(LR,DF);
           END;

         RUN COPYI;
         RUN BWELM;
         END;

     *--------------SEARCH FOR A BEST INITIAL FIT---------------;
     PRINT " SELECTION OF A BEST INITIAL FIT:";
     DO NWAY=1 TO NVAR;
        IF ( NWAY=NVAR) THEN
        RUN SATUR;
        N=NVAR;
        K=NWAY;

        RUN NCK;
        NCON=M;
        A=A0;
        DO MM=1 TO NCON;
           RUN NXKSB;
           IF ( MM=1) THEN  CONFIG=A;
            ELSE  CONFIG=CONFIG || A;
           END;
        CALL IPF(FIT,STATUS,DIM,TABLE,CONFIG);

        RUN LRCHSQ;
        RUN DEGREF;
        P=1- PROBCHI(LR,DF);
        STAT=LR || DF || P;
        RR=R1;

        RUN CONWRT;
        IF ( P>=AP1) THEN  DO;
           IFIT=IFIT+1;
  IF ( IFIT>1) THEN IF ( 1- PROBCHI(LRI-LR,DFI-DF)>AP2) THEN
            RUN BWELM;

        RUN COPYI;
           IF ( NWAY=NVAR-1) THEN
            RUN BWELM;
           END;
        END;
  FINISH;
  RUN MAIN;   *------RUN MAIN MODULE-------;
title; title2;