%macro multnorm (version, data=_last_ , /* Input data set name */ var=_numeric_, /* Variables for test. Can not be a */ /* variable list e.g. var1-var10 */ plot=mult , /* Optional assessment plots */ hires=yes /* High- or low-resolution plots */ ); %let time = %sysfunc(datetime()); %let _version=2.0; %if &version ne %then %put NOTE: &sysmacroname macro Version &_version; /* Define macro to compute Royston H ======================================================================*/ %macro RoystonH(data=, var=); proc corr data=&data outp=_outp noprint; var &var; run; data _corr; set _outp(where=(_TYPE_="CORR")); keep &var; if _n_>1; run; proc univariate data=&data noprint; var &var; output out=_norm normal=&var; run; data _roy; set _norm; _n=&_n; _p=&_p; array _w (&_p) &var; array _z (&_p); if 4<=_n<=11 then do; _u = _n; _gamma = -2.273 + 0.459*_u; _mu = 0.5440 - 0.39978*_u + 0.025054*_u**2 - 0.0006714*_u**3; _sigma = exp(1.3822 - 0.77857*_u + 0.062767*_u**2 - 0.0020322*_u**3); do _i = 1 to _p; _z(_i) = (-log(_gamma - log(1 - _w(_i))) - _mu)/_sigma; end; end; else if 12<=_n<=5000 then do; _u = log(_n); _mu = -1.5861 - 0.31082*_u - 0.083751*_u**2 + 0.0038915*_u**3; _sigma = exp(-0.4803 -0.082676*_u + 0.0030302*_u**2); do _i = 1 to _p; _z(_i) = ((log(1 - _w(_i))) - _mu)/_sigma; end; end; do _i = 1 to _p; _ksum+ quantile("normal", cdf("normal", -_z(_i) )/2 )**2; end; _kmn=_ksum/_p; _mu = 0.715; _lambda = 5; _nu = 0.21364 + 0.015124*(log(_n))**2 - 0.0018034*(log(_n))**3; /* transform corrs and sum over lower triangle */ array _r (&_p) &var; do _i=1 to _p-1; set _corr; do _j=1 to _i; /* Royston (1983) equation 4 c_ij = g(rho_ij,n) */ _g+ (min(_r(_j),1)**_lambda) * (1 - (_mu*(1-min(_r(_j),1))**_mu)/_nu); end; end; _gmn=_g/((_p**2-_p)/2); *average of transformed corrs, c_ij; DF=_p/(1+(_p-1)*_gmn); *equivalent df, e; TEST="Royston H"; STAT=_kmn*df; PROB=1-probchi(stat,df); format prob pvalue6.4; keep test stat df prob; run; %exitroy: %mend RoystonH; /* Define macro to compute Henze-Zirler T ======================================================================*/ %macro hz(data=, var=); proc princomp data=&data out=_prins(keep=prin:) std vardef=n noprint; var &var; run; data _hz2; retain _p betasq; set _prins end=eof nobs=nobs; array p (*) prin:; if _n_=1 then do; _p=dim(p); betasq = (2**-0.5 * ((2*_p+1)/4)**(1/(_p+4)) * nobs**(1/(_p+4)))**2; call symput("betasq",betasq); end; Tsum2+ exp((-betasq/(2*(1+betasq)))*(uss(of prin:))); if eof then do; Tpart2 = -2*((1+betasq)**(-_p/2))*Tsum2; Tpart3 = nobs*(1+2*betasq)**(-_p/2); keep _p betasq Tpart2 Tpart3; output; end; run; proc distance data=_prins out=_mahdist method=sqeuclid; var interval(prin:); run; data _hz; set _mahdist nobs=nobs end=eof; array dist{*} dist:; do k=1 to dim(dist); if dist(k) ne . then Tpart1+ exp(-(&betasq/2)*dist{k}); end; if eof then do; set _hz2; Tpart1 = (2*Tpart1-nobs)/nobs; Stat = Tpart1 + Tpart2 + Tpart3; E_t=1-(1+2*betasq)**(-_p/2)* (1+_p*betasq/(1+2*betasq)+_p*(_p+2)*betasq**2/(2*(1+2*betasq)**2)); _w=(1+betasq)*(1+3*betasq); V_t=2*(1+4*betasq)**(-_p/2) + 2*((1+2*betasq)**-_p)* (1+ 2*_p*betasq**2/(1+2*betasq)**2 + 3*_p*(_p+2)*betasq**4/(4*(1+2*betasq)**4)) - 4*_w**(-_p/2)*(1 + 3*_p*betasq**2/(2*_w) + _p*(_p+2)*betasq**4/(2*_w**2)); theta=log(E_t**2/sqrt(E_t**2 + V_t)); *mean of T, E(T), in paper; lambda=sqrt(log(1 + V_t/E_t**2)); *std dev of T, sqrt(V(T)), in paper; Prob=1-cdf("lognormal", stat, theta, max(1e-8,lambda)); Test="Henze-Zirkler T"; keep test stat prob; output; end; run; %mend; /* Define macro to compute Doornik-Hansen test ======================================================================*/ %macro DH(data=, var=); ods exclude all; proc princomp data=&data vardef=n; var &var; ods output eigenvalues=_val(keep=eigenvalue) eigenvectors=_H(keep=prin:); run; %if %index(&version,DEBUG) %then ods select all;; data _L; set _val; eigenvalue=1/sqrt(eigenvalue); run; proc summary data=&data vardef=n; var &var; output out=_var var=; run; proc transpose data=_var out=_tvar(keep=col1); var &var; run; data _V; set _tvar; col1=1/sqrt(col1); run; proc standard data=&data out=_xc m=0; var &var; run; data _xc; retain &var; set _xc; run; %mxmult(_xc,_V,diag=n y,out=_xV,prefix=xV) %mxmult(_xV,_H,out=_xVH,prefix=xVH) %mxmult(_xVH,_L,diag=n y,out=_xVHL,prefix=xVHL) %mxmult(_xVHL,_H,t=n y,out=_y,prefix=y) proc summary data=_y vardef=n; var y:; output out=_B1(drop=_:) skew=; output out=_B2(drop=_:) kurt=; run; data _B2; set _B2; array k (*) y:; do i=1 to dim(k); k(i)=k(i)+3; end; drop i; run; data _B1Z1; set _B1; beta=3*(&_n**2 + 27*&_n - 70)*(&_n+1)*(&_n+3) / ((&_n-2)*(&_n+5)*(&_n+7)*(&_n+9)); om2=-1+sqrt(2*(beta-1)); dels=1/sqrt(log(sqrt(om2))); array y (&_p); array b1 (&_p); array yt (&_p); array z1 (&_p); do i=1 to &_p; b1(i)=y(i); yt(i)=b1(i)*sqrt( (om2-1)/2 * ((&_n+1)*(&_n+3))/(6*(&_n-2)) ); z1(i)=dels*log(yt(i) + sqrt(yt(i)**2 + 1)); end; keep b1: z1:; run; data _DH; merge _B2 _B1Z1; delk=(&_n-3)*(&_n+1)*(&_n**2+15*&_n-4); a=(&_n-2)*(&_n+5)*(&_n+7)*(&_n**2+27*&_n-70) / (6*delk); c=(&_n-7)*(&_n+5)*(&_n+7)*(&_n**2+2*&_n-5) / (6*delk); k=(&_n+5)*(&_n+7)*(&_n**3+37*&_n**2+11*&_n-313) / (12*delk); array y (&_p); *B2; array b1 (&_p); array alph (&_p); array z2 (&_p); do i=1 to &_p; alph(i)=a+c*b1(i)**2; z2(i)=( ((y(i)-1-b1(i)**2)*(2*k)/(2*alph(i)))**(1/3) - 1 + (9*alph(i))**-1 ) * sqrt(9*alph(i)); end; Stat=uss(of z1:)+uss(of z2:); DF=2*&_p; Prob=1-probchi(Stat,df); format prob pvalue6.4; Test="Doornik-Hansen"; keep Test Stat DF Prob; run; %mend; /* Define macro to do matrix multiplication ======================================================================*/ %macro mxmult(a,b,t=N N,inv=N N,diag=N N,out=AxB,prefix=Prod); %let t=%upcase(&t); %let inv=%upcase(&inv); %let diag=%upcase(&diag); %if %eval(%scan(&t,1)=Y)+%eval(%scan(&inv,1)=Y)+%eval(%scan(&diag,1)=Y)>1 or %eval(%scan(&t,2)=Y)+%eval(%scan(&inv,2)=Y)+%eval(%scan(&diag,2)=Y)>1 %then %do; %put ERROR: Only one of T=, INV=, or DIAG= is allowed for each matrix.; %goto exit; %end; %let dsid=%sysfunc(open(&a)); %if &dsid=0 %then %do; %put ERROR: Data set &a not found.; %goto exit; %end; %let anrowr=%sysfunc(attrn(&dsid,nobs)); %let ancolr=%sysfunc(attrn(&dsid,nvars)); %let rc=%sysfunc(close(&dsid)); %let dsid=%sysfunc(open(&b)); %if &dsid=0 %then %do; %put ERROR: Data set &a not found.; %goto exit; %end; %let bnrowr=%sysfunc(attrn(&dsid,nobs)); %let bncolr=%sysfunc(attrn(&dsid,nvars)); %let rc=%sysfunc(close(&dsid)); data _anorm; set &a; keep _numeric_; if nmiss(of _numeric_)=0; run; data _bnorm; set &b; keep _numeric_; if nmiss(of _numeric_)=0; run; %let dsid=%sysfunc(open(_anorm)); %let anrow=%sysfunc(attrn(&dsid,nobs)); %let ancol=%sysfunc(attrn(&dsid,nvars)); %let rc=%sysfunc(close(&dsid)); %let dsid=%sysfunc(open(_bnorm)); %let bnrow=%sysfunc(attrn(&dsid,nobs)); %let bncol=%sysfunc(attrn(&dsid,nvars)); %let rc=%sysfunc(close(&dsid)); %if &anrow<2 or &bnrow<2 %then %do; %put ERROR: Matrices must have 2 or more observations.; %goto exit; %end; %put NOTE: Matrix &a: &anrowr rows and &ancolr cols read, &anrow rows and &ancol cols used.; %put NOTE: Matrix &b: &bnrowr rows and &bncolr cols read, &bnrow rows and &bncol cols used.; %let a=_anorm; %let b=_bnorm; %if %scan(&diag,1)=Y %then %do; %if %sysfunc(min(&anrow,&ancol))=1 %then %do; %if &ancol=1 %then %do; proc transpose data=&a out=_adiag(keep=col:); run; %end; %else %do; data _adiag; set &a; run; %end; %let adim=%sysfunc(max(&anrow,&ancol)); data _adiag; set %do _i=1 %to &adim; _adiag %end; ; run; %let anrow=&adim; %let ancol=&adim; %let a=_adiag; %end; %else %do; %put ERROR: For DIAG=&diag, the left matrix must have only 1 row or column.; %goto exit; %end; %end; %if %scan(&diag,2)=Y %then %do; %if %sysfunc(min(&bnrow,&bncol))=1 %then %do; %if &bncol=1 %then %do; proc transpose data=&b out=_bdiag(keep=col:); run; %end; %else %do; data _bdiag; set &b; run; %end; %let bdim=%sysfunc(max(&bnrow,&bncol)); data _bdiag; set %do _i=1 %to &bdim; _bdiag %end; ; run; %let bnrow=&bdim; %let bncol=&bdim; %let b=_bdiag; %end; %else %do; %put ERROR: For DIAG=&diag, the right matrix must have only 1 row or column.; %goto exit; %end; %end; proc fcmp; array a[&anrow,&ancol]/nosymbols; rc=read_array("&a",a); %let a=a; %if %scan(&t,1)=Y %then %do; %let tmp=&anrow; %let anrow=&ancol; %let ancol=&tmp; %let a=at; array at[&anrow,&ancol]/nosymbols; call transpose(a,at); %end; %else %if %scan(&inv,1)=Y %then %do; %let a=ainv; array ainv[&anrow,&ancol]/nosymbols; call inv(a,ainv); %end; %else %if %scan(&diag,1)=Y %then %do; array id[&adim,&adim]/nosymbols; call identity(id); array adiag[&adim,&adim]/nosymbols; call elemmult(a,id,adiag); %let a=adiag; %end; array b[&bnrow,&bncol]/nosymbols; rc=read_array("&b",b); %let b=b; %if %scan(&t,2)=Y %then %do; %let tmp=&bnrow; %let bnrow=&bncol; %let bncol=&tmp; %let b=bt; array bt[&bnrow,&bncol]/nosymbols; call transpose(b,bt); %end; %else %if %scan(&inv,2)=Y %then %do; %let b=binv; array binv[&bnrow,&bncol]/nosymbols; call inv(b,binv); %end; %else %if %scan(&diag,2)=Y %then %do; array id[&bdim,&bdim]/nosymbols; call identity(id); array bdiag[&bdim,&bdim]/nosymbols; call elemmult(b,id,bdiag); %let b=bdiag; %end; %if &ancol ne &bnrow %then %do; %put ERROR: Matrices to be multiplied are not compatible.; %put ERROR- A(&anrow,&ancol) x B(&bnrow,&bncol); %goto exit; %end; %else %do; array &prefix[&anrow,&bncol]/nosymbols; call mult(&a,&b,&prefix); rc=write_array("&out",&prefix); %end; run; quit; %if %sysfunc(exist(&out)) %then %put NOTE: The data set &out has &anrow observations and &bncol variables.; %exit: proc datasets nolist nowarn; delete _anorm _bnorm _adiag _bdiag; run; quit; %mend; /* Set options ======================================================================*/ %let opts = %sysfunc(getoption(notes)) _last_=%sysfunc(getoption(_last_)); %if &data=_last_ %then %let data=&syslast; %let newchk=1; %let version=%upcase(&version); %if %index(&version,DEBUG) %then %do; options notes mprint %if %index(&version,DEBUG2) %then mlogic symbolgen; ; ods select all; %put _user_; %end; %else %do; %if %index(&version,NOCHK) %then %let newchk=0; options nonotes nomprint nomlogic nosymbolgen; ods exclude all; %end; title "MULTNORM macro"; /* Check for newer version ======================================================================*/ %if &newchk %then %do; %let _notfound=0; filename _ver url 'http://ftp.sas.com/techsup/download/stat/versions.dat' termstr=crlf; data _null_; infile _ver end=_eof; input name:$15. ver; if upcase(name)="&sysmacroname" then do; call symput("_newver",ver); stop; end; if _eof then call symput("_notfound",1); run; options notes; %if &syserr ne 0 or &_notfound=1 or &_newver=0 %then %put NOTE: Unable to check for newer version of &sysmacroname macro.; %else %if %sysevalf(&_newver > &_version) %then %do; %put NOTE: A newer version of the &sysmacroname macro is available at; %put NOTE- this location: http://support.sas.com/ ; %end; %if %index(&version,DEBUG)=0 %then options nonotes;; %end; /* Check inputs, clean data, check for singularity ======================================================================*/ /* DATA= must be specified and data set must exist */ %if &data= or %sysfunc(exist(&data)) ne 1 %then %do; %put ERROR: DATA= data set not specified or not found.; %goto exit; %end; /* Verify that VAR= option is specified */ %if &var= %then %do; %put ERROR: Specify test variables in VAR=.; %goto exit; %end; %let plot=%upcase(%substr(&plot,1,1)); %let hires=%upcase(%substr(&hires,1,1)); /* Check existence and get number of VAR= variables */ %let _p=0; %let dserr=0; %let dsid=%sysfunc(open(&data)); %if &dsid %then %do; %if %upcase(&var) ne _NUMERIC_ %then %do; %let _i=1; %let _token=%scan(&var,&_i); %do %while ( &_token ne %str() ); %let _vnum=%sysfunc(varnum(&dsid,&_token)); %if &_vnum ne 0 %then %do; %if %sysfunc(vartype(&dsid,&_vnum))=N %then %let _p=%eval(&_p+1); %else %do; %put ERROR: Variable %upcase(&_token) is not numeric.; %let dserr=1; %end; %end; %else %do; %put ERROR: Variable %upcase(&_token) not found.; %let dserr=1; %end; %let _i=%eval(&_i+1); %let _token=%scan(&var,&_i); %end; %end; %else %do; %let _nvar=%sysfunc(attrn(&dsid,nvars)); %let var=; %do _i=1 %to &_nvar; %let _vn=%sysfunc(varname(&dsid,&_i)); %if %sysfunc(vartype(&dsid,&_i))=N %then %do; %let _p=%eval(&_p+1); %let var=&var &_vn; %end; %end; %end; %let rc=%sysfunc(close(&dsid)); %end; %else %do; %put ERROR: Could not open DATA= data set.; %let dserr=1; %end; %if &dserr %then %goto exit; /* Remove observations with missing values, keep only the var= variables */ data _nomiss; set &data end=eof; keep &var; if nmiss(of &var)=0 then do; output; _n+1; end; if eof then call symput("_n",_n); run; %if &_n=1 or &_p=1 %then %do; %put ERROR: There must be more than one observation and variable.; %goto exit; %end; /* Check if correlation matrix is singular */ %let singular=0; proc princomp data=_nomiss outstat=_evals out=_prins(keep=prin:) std vardef=n noprint; var &var ; run; %if &syserr=3000 %then %do; %put WARNING: PROC PRINCOMP required for singularity check.; %put WARNING- Only the Royston test can be performed.; %let singular=1; %end; %else %do; data _null_; set _evals; where _TYPE_='EIGENVAL'; if round(min(of &var ),1e-8)<=0 then do; put 'WARNING: Correlation matrix is singular.'; put 'WARNING- Only the Royston test can be performed.'; call symput('singular','1'); end; run; %end; %if &_n>=4 and &_n<=2000 %then %RoystonH(data=_nomiss, var=&var); %else %do; options notes; %put NOTE: The Royston test requires between 4 and 2000 observations.; %if %index(&version,DEBUG)=0 %then options nonotes;; %end; %if &singular=0 %then %do; /* Do Henze-Zirkler test */ %HZ(data=_nomiss, var=&var) /* Do Doornik-Hansen test */ %if &_n>=8 %then %DH(data=_nomiss, var=&var); %else %do; options notes; %put NOTE: The Doornik-Hansen test requires 8 or more observations.; %if %index(&version,DEBUG)=0 %then options nonotes;; %end; /* Mardia tests ======================================================================*/ proc transpose data=_prins out=_tprins; run; proc reg data=_tprins(keep=_numeric_) outsscp=_Mangle(where=(_TYPE_="SSCP")) noprint; var _numeric_; run; quit; %let _sscpexst=%sysfunc(exist(_mangle)); %let _nvsscp=0; %if &_sscpexst %then %do; %let dsid=%sysfunc(open(_mangle)); %if &dsid %then %let _nvsscp=%sysfunc(attrn(&dsid,nvars)); %let rc=%sysfunc(close(&dsid)); %end; %if &_sscpexst=0 or &_nvsscp=0 %then %do; %put ERROR: The Mardia multivariate skewness and kurtosis can not be done.; %end; %else %do; data _mardia; retain nparm; keep TEST VALUE: Expected STAT: PROB: DF; set _Mangle end=eof nobs=nobs; array col{*} col:; if _n_=1 then nparm=Intercept; else do; mk+(col{_n_-1})**2; do k=1 to dim(col); ms+col{k}**3; end; end; if eof then do; TEST="Mardia Skewness"; N=nobs-1; Value=ms/N**2; Expected=nparm*(nparm+2)*((N+1)*(nparm+1)-6) / ((N+1)*(N+3)); ValueCtr=Value-Expected; small_sample_correction=(nparm+1)*(N+1)*(N+3)/(N*((N+1)*(nparm+1)-6)); STAT=VALUE*N*small_sample_correction/6; StatUncorr=VALUE*N/6; DF=nparm*(nparm+1)*(nparm+2)/6; PROB=1-probchi(STAT,DF); ProbUncorr=1-probchi(StatUncorr,DF); format ProbUncorr pvalue6.4; output; TEST="Mardia Kurtosis"; Value=mk/N; Expected=nparm*(nparm+2)*(N-1)/(N+1); ValueCtr=Value-Expected; STAT=(VALUE-nparm*(nparm+2))/sqrt(8*nparm*(nparm+2)/N); df=.; PROB=2*(1-probnorm(abs(STAT))); StatUncorr=.; ProbUncorr=.; output; end; run; %end; %end; /* Univariate normality tests ======================================================================*/ proc univariate data=_nomiss; var &var; output out=_stat normal=&var ; output out=_prob probn=&var ; run; data _univ; set _stat _prob; run; proc transpose data=_univ name=variable out=_tuniv(rename=(col1=stat col2=prob)); var &var ; run; /* Collect and print results ======================================================================*/ ods select all; data _stats; length test $15.; set _tuniv %if %sysfunc(exist(_mardia)) %then _mardia; %if %sysfunc(exist(_roy)) %then _roy; %if %sysfunc(exist(_hz)) %then _hz; %if %sysfunc(exist(_dh)) %then _dh; ; N=&_n; df=round(df,.0001); if test='' then if n<=2000 then test='Shapiro-Wilk'; else test='Kolmogorov'; drop _label_; run; proc print data=_stats noobs split='/'; id test; var variable N %if %sysfunc(exist(_mardia)) %then valuectr; stat df prob; format prob pvalue6.4 df best10.4; title2 "Univariate and Multivariate Normality Tests"; label variable="Variable" test="Test" %if %sysfunc(exist(_mardia)) %then ValueCtr="Centered/Skewness &/Kurtosis"; stat="Test/Statistic/Value" prob="Prob" N="N"; run; /* Univariate (histograms) and multivariate (Chi-square Q-Q) plot ======================================================================*/ %if (&plot=U or &plot=B) and &hires=Y %then %do; ods select histogram goodnessoffit; proc univariate data=_nomiss; hist &var / normal; title2 "Univariate Plots and Normality Tests"; run; %end; %if &singular %then %do; options notes; %put NOTE: The chi-square Q-Q plot requires a nonsingular correlation matrix.; %if %index(&version,DEBUG)=0 %then options nonotes;; %end; %else %if (&plot=M or &plot=B) %then %do; /* compute values for chi-square Q-Q plot */ data _chiplot; set _prins; mahdist=uss(of prin1-prin&_p ); keep mahdist; run; proc rank data=_chiplot out=_chiplot; var mahdist; ranks rdist; run; data _chiplot; set _chiplot nobs=_n; chisq=cinv((rdist-.5)/_n,&_p); keep mahdist chisq; run; /* Create a chi-square Q-Q plot NOTE: Very large sample size is required for chi-square asymptotics unless the number of variables is very small. */ %if &hires=Y %then %do; proc sgplot; run; %if &syserr ne 3000 %then %do; proc sgplot aspect=1 data=_chiplot noautolegend; scatter x=mahdist y=chisq; lineparm x=0 y=0 slope=1 / transparency=.7; %end; %else %do; %if %sysprod(graph)=1 %then %str(proc gplot data=_chiplot; plot mahdist*chisq;); %else %do; %put Note: High resolution graphics procedures not found.; %put Note- PROC PLOT will be used instead.; proc plot data=_chiplot; plot mahdist*chisq; %end; %end; %end; %else %str(proc plot data=_chiplot; plot mahdist*chisq;); label mahdist="Squared Distance" chisq="Chi-square quantile"; title "MULTNORM macro"; title2 "Chi-square Q-Q plot"; run; quit; %end; /* Clean up ======================================================================*/ %exit: %if %index(&version,DEBUG)=0 %then %do; options nonotes; proc datasets nolist nowarn; delete _nomiss _evals _prins _tprins _Mangle _stat _prob _univ _tuniv _chiplot _mardia _norm _corr _outp _roy _mahdist _hz _hz2 _b1: _b2 _h _l _tvar _v _val _var _xc _xv: _y _z: _dh; run; quit; %end; %if %index(&version,DEBUG) %then %do; options nomprint nomlogic nosymbolgen; %put _user_; %end; ods select all; options &opts; title; %let time = %sysfunc(round(%sysevalf(%sysfunc(datetime()) - &time), 0.01)); %put NOTE: The &sysmacroname macro used &time seconds.; %mend;