/* Each analysis from a particular section of the book should be viewed as a distinct analysis */
/* here. In particular, I have put in new proc import statements each time a data set is read */
/* into SAS. I have also not paid attention to details like titles being carried along from */
/* analysis to analysis. */
/* I have not tried to construct most of the plots and many of the more unusual analyses and */
/* tests from the book. */

/* Chapter 2 */

/* Section 2.5 */

/* Input data from comma-delimited file, reading variable names from the first row */
proc import out=tornado datafile="tornado.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;

/* Scatter plots of the target and predictors */
title "Figure 2.3";
proc gplot;
   plot deaths*killer_tornadoes;
   plot deaths*tornadoes;
run;
title;

/* Multiple regression, with a plot of residuals versus fitted values */
title "Figure 2.4";
proc reg data=tornado;
   model deaths=tornadoes killer_tornadoes;
   plot residual.*predicted.;
run;
title;

/* Simple regression */
proc reg data=tornado;
   model deaths=killer_tornadoes;
run;

/* Chapter 3 */

/* Section 3.3 */

proc import out=tornado datafile="tornado.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;

/* Side-by-side boxplots: Figure 3.2 */
/* In order to construct boxplots, the data must first be sorted */
title "Figure 3.2";
proc sort data=tornado out=tornado2;
   by year;
run;
proc sort data=tornado out=tornado3;
   by month_number;
run;
proc boxplot data=tornado2;
   plot deaths*year;
run;
proc boxplot data=tornado3;
   plot deaths*month_number;
run;

/* The ANCOVA model is fit using proc glm; partial F-tests are given under Type III SS. */
/* Standardized residuals, leverage values, and Cook's distances are output. The solution */
/* option prints out parameter estimates, but they are based on indicator variables, not */
/* effect codings. */
proc glm data=tornado;
   class year month;
   model deaths = killer_tornadoes tornadoes year month / solution;
   output out=result h=lev student=sres cookd=cook;
run;

/* This creates a variable that is the observation number, so index plots can be created */
data result;
   set result;
   obs=_N_;
run;
proc print data=result;
   var year month sres lev cook;
run;
proc gplot data=result;
   plot sres*obs;
   plot lev*obs;
   plot cook*obs;
run;

/* The following code manually creates effect coding variables. First indicator variables are */
/* formed, and then the effect codings are created. */
data tornado;
   set tornado;
   /* The following line defines 12 dummy (indicator) variables, giving each 3 bytes of */
   /* storage; the loop then initializes the variables to 0. */
   array dummys {*} 3 Month_1 - Month_12;
   do i=1 to 12;
      dummys(i) = 0;
   end;
   /* This sets the appropriate indicator variable to 1. */
   dummys(Month_number) = 1;
run;
data tornado;
   set tornado;
   array dummys {*} 3 Year_1 - Year_4;
   do i=1 to 4;
      dummys(i) = 0;
   end;
   dummys(Year - 1995) = 1;
run;

/* Effect codings are created by subtracting the reference category indicator from the others */
data tornado4;
   set tornado;
   January = Month_1 - Month_12;
   February = Month_2 - Month_12;
   March = Month_3 - Month_12;
   April = Month_4 - Month_12;
   May = Month_5 - Month_12;
   June = Month_6 - Month_12;
   July = Month_7 - Month_12;
   August = Month_8 - Month_12;
   September = Month_9 - Month_12;
   October = Month_10 - Month_12;
   November = Month_11 - Month_12;
   Year96 = Year_1 - Year_4;
   Year97 = Year_2 - Year_4;
   Year98 = Year_3 - Year_4;
run;
/* This fits the model using the effect codings. The test statements construct the partial */
/* F-tests. In order to get the missing coefficients (Year99 and December in this case), a */
/* different reference category must be used. */
proc reg data=tornado4;
   model deaths=tornadoes killer_tornadoes Year96 Year97 Year98 January February March
                April May June July August September October November;
   year: test Year96=0, Year97=0, Year98=0;
   month: test January=0, February=0, March=0, April=0, May=0, June=0, July=0,
               August=0, September=0, October=0, November=0;
run;

/* We can also get the effect coding fits by specifying contrasts to estimate and test. */
/* The estimate statement does this. Say there are k levels to the predictor; the i-th */
/* contrast is specified to have value (k-1)/k for the i-th category, and -1/k for all of */
/* the others. This method has the great advantage of not requiring an additional fit for */
/* the reference category. Note that we have to fudge things slightly for the month effect, */
/* since 11/12 and 1/12 don't have exact decimal representations. */
proc glm data=tornado;
   class year month_number;
   model deaths = killer_tornadoes tornadoes year month_number;
   estimate 'Year96' year .75 -.25 -.25 -.25;
   estimate 'Year97' year -.25 .75 -.25 -.25;
   estimate 'Year98' year -.25 -.25 .75 -.25;
   estimate 'Year99' year -.25 -.25 -.25 .75;
   estimate 'January' month_number .9167 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'February' month_number -.0833 .9167 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'March' month_number -.0833 -.0834 .9167 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'April' month_number -.0833 -.0834 -.0833 .9167 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'May' month_number -.0833 -.0834 -.0833 -.0833 .9167 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'June' month_number -.0833 -.0834 -.0833 -.0833 -.0834 .9167 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'July' month_number -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 .9167 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'August' month_number -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 .9167 -.0834 -.0833 -.0833 -.0834;
   estimate 'September' month_number -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 .9167 -.0833 -.0833 -.0834;
   estimate 'October' month_number -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 .9167 -.0833 -.0834;
   estimate 'November' month_number -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 .9167 -.0834;
   estimate 'December' month_number -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 .9167;
run;

/* Section 3.4 */

/* The selection = rsquare aic option orders the models by the R-squared value and gives the */
/* AIC. It does not give AICc, and cannot treat class variables (factors) as a group. */
proc reg data=tornado4;
   model deaths=tornadoes killer_tornadoes Year96 Year97 Year98 January February March
                April May June July August September October November / selection = rsquare aic;
run;
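/* AICc is not printed by proc reg, but it can be computed from the reported AIC. A minimal */
/* sketch, assuming the usual small-sample correction AICc = AIC + 2k(k+1)/(n-k-1) of */
/* Hurvich and Tsai (1989); the values of n, k, and aic below are placeholders to be */
/* filled in from the proc reg output. */
data _null_;
   n   = 60;    /* sample size (placeholder) */
   k   = 17;    /* number of estimated parameters (placeholder) */
   aic = 100;   /* AIC value reported by proc reg (placeholder) */
   aicc = aic + 2*k*(k+1)/(n-k-1);
   put aicc=;
run;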
/* The genmodsummary macro takes as input a set of models, and returns as output the AIC and */
/* AICc values (including the deviation of AIC and AICc from their minimal values). Note */
/* that it is written with a maximum of 20 models. See macro files. */
/* The macro uses the genmod procedure, which is the appropriate one for most categorical */
/* data analysis, but it works for least squares models also. The distribution=n setting */
/* says that the errors are normally distributed ("n"), and the link=identity setting says */
/* that it is a linear model that is being fit. */

/* Read in macro file */
%include 'genmodsummary.sas';

/* The models must be specified in numerical order (as model1, model2, etc.) */
%genmodsummary(data=tornado, class=month year, yvariable=deaths,
   model1=, model2=killer_tornadoes, model3=tornadoes, model4=year, model5=month,
   model6=killer_tornadoes tornadoes, model7=killer_tornadoes year,
   model8=killer_tornadoes month, model9=tornadoes year, model10=tornadoes month,
   model11=year month, model12=killer_tornadoes tornadoes year,
   model13=killer_tornadoes tornadoes month, model14=killer_tornadoes year month,
   model15=tornadoes year month, model16=killer_tornadoes tornadoes year month,
   distribution=n, link=identity, number_of_models=16);

/* Part (b) of Table 3.2. Note how the model with an interaction is specified. */
%genmodsummary(data=tornado, class=month year, yvariable=deaths,
   model1=killer_tornadoes, model2=killer_tornadoes|month,
   distribution=n, link=identity, number_of_models=2);

/* Part (c) of Table 3.2 */
%genmodsummary(data=tornado, class=month year tornado_season, yvariable=deaths,
   model1=killer_tornadoes, model2=killer_tornadoes tornado_season,
   model3=killer_tornadoes|tornado_season,
   distribution=n, link=identity, number_of_models=3);

/* Chapter 4 */

/* Please note that I have not programmed many of the more specialized methods from this */
/* chapter in SAS. The S-PLUS/R code provides a model for the sorts of things that can be */
/* done. The books listed on the SAS page of the book's web site also provide guidance */
/* and/or code for some of these analyses. */

/* Section 4.1 */

/* The standard binomial confidence interval is given in proc freq. */
/* The $ tells SAS that the variable is a character variable. */
data water;
   input response$ count;
   cards;
yes 958
no 1860
;
run;

/* Using count as a weight variable tells SAS that it is the count to be analyzed. */
/* The order option tells SAS that the first entry (yes) is the one to focus on. */
proc freq data=water order=data;
   table response / binomial;
   weight count;
run;

/* The standard binomial hypothesis test is given in proc freq. */
data needle;
   input infected$ count;
   cards;
yes 50
no 2450
;
run;
proc freq data=needle order=data;
   tables infected / binomial(p=.04) alpha=.05;
   weight count;
run;

/* This uses the SAS module UnifyPow by Ralph O'Brien; see http://www.bio.ri.ccf.org/power.html */
/* It uses the exact binomial distribution, rather than the approximate normal. */
%let UnifyPow = UnifyPow020817a.sas;
%include "&UnifyPow";
datalines4;
proportion .03
null .04
alpha .025
power .7
sides 1
;;;;
%tables
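/* Power for the exact binomial test can also be computed directly in more recent SAS */
/* releases using proc power. A minimal sketch, assuming SAS 9.1 or later (where the */
/* onesamplefreq statement supports test=exact); the ntotal value is a placeholder. */
proc power;
   onesamplefreq test=exact
      nullproportion = .04
      proportion     = .03
      sides          = 1
      alpha          = .025
      ntotal         = 2500
      power          = .;
run;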
/* proc freq also gives exact binomial tests and confidence intervals */
data lyme;
   input result$ sensitivityISA specificityISA sensitivitySER specificitySER reproducibility;
   cards;
yes 8 19 10 22 10
no 5 6 3 3 0
;
run;
proc freq data=lyme order=data;
   tables result / exact binomial alpha=.05;
   weight sensitivityISA;
run;
proc freq data=lyme order=data;
   tables result / exact binomial alpha=.05;
   weight specificityISA;
run;
proc freq data=lyme order=data;
   tables result / exact binomial alpha=.05;
   weight sensitivitySER;
run;
proc freq data=lyme order=data;
   tables result / exact binomial alpha=.05;
   weight specificitySER;
run;
proc freq data=lyme order=data;
   tables result / exact binomial alpha=.05;
   weight reproducibility;
run;

/* Section 4.3 */

/* A Poisson distribution can be fit to data using Poisson regression in proc genmod. */
/* No predictors are specified, and an identity link function is used. */
proc import out=devils datafile="devils.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;
proc genmod data=devils;
   model Goals = / dist=p link=identity;
run;
proc genmod data=devils;
   model Goals_against = / dist=p link=identity;
run;

/* Section 4.4 */

/* proc freq gives the Pearson chi-squared goodness-of-fit statistic, but the goodfit macro */
/* by Michael Friendly also gives the likelihood ratio statistic; see */
/* http://www.math.yorku.ca/SCS/vcd/index.html */
%include "goodfit.sas";

/* Testing goodness of fit -- Benford's Law */
proc import out=benford datafile="benford.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;

/* var is the variable being analyzed; dist=DISCRETE says that a specified discrete */
/* distribution with probability vector given by parm is being tested. */
/* The goodfit macro cannot handle variable names longer than 8 characters in the file, so a */
/* data step renames some variables and drops the others. */
data benford;
   set benford;
   large=large_return;
   small=small_return;
   largeth=large_return_thirty;
   smallth=small_return_thirty;
   keep Digit Return large small largeth smallth;
run;

/* The out file is included so that the index of dissimilarity can be calculated. I */
/* illustrate this for the overall return data. */
%goodfit(var=digit, freq=return, dist=DISCRETE,
   parm=0.30103 0.176091 0.124939 0.09691 0.079181 0.066947 0.057992 0.051153 0.045757,
   data=benford, out=fitreturn);

/* These statements produce running totals of the numerator (totaldev) and denominator */
/* (totalret) of the index of dissimilarity, which then produce the index after division. */
data dissim(keep=totalret totaldev);
   set fitreturn end=lastobs;
   totalret+return;
   totaldev+abs(Return-exp)/2;
   if lastobs;
run;
data dissim;
   set dissim;
   dissim=totaldev/totalret;
   keep dissim;
run;
proc print data=dissim;
run;

%goodfit(var=digit, freq=large, dist=DISCRETE,
   parm=0.30103 0.176091 0.124939 0.09691 0.079181 0.066947 0.057992 0.051153 0.045757,
   data=benford);
%goodfit(var=digit, freq=small, dist=DISCRETE,
   parm=0.30103 0.176091 0.124939 0.09691 0.079181 0.066947 0.057992 0.051153 0.045757,
   data=benford);
%goodfit(var=digit, freq=small, dist=DISCRETE,
   parm=.11111 .11111 .11111 .11111 .11111 .11111 .11111 .11111 .11111,
   data=benford);
%goodfit(var=digit, freq=largeth, dist=DISCRETE,
   parm=0.30103 0.176091 0.124939 0.09691 0.079181 0.066947 0.057992 0.051153 0.045757,
   data=benford);
/* Exact goodness-of-fit testing is available through the exact statement in proc freq, but */
/* not if there are cells with zero counts, as is the case here. SAS ignores the cells with */
/* zero counts, but if a small positive number (say 1.e-20) is used for the zero counts, */
/* the program calculates the correct statistic. Unfortunately, in that case the exact */
/* option will not work. */
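/* A minimal sketch of that workaround; the count variable used here (small) is a */
/* placeholder, and the testp values are the Benford probabilities. The chisq option then */
/* gives the correct Pearson statistic, although the exact option would still fail. */
data benford2;
   set benford;
   if small = 0 then small = 1e-20;   /* replace zero counts by a tiny positive number */
run;
proc freq data=benford2;
   weight small;
   tables digit / chisq testp=(0.30103 0.176091 0.124939 0.09691 0.079181
                               0.066947 0.057992 0.051153 0.045757);
run;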
/* Section 4.5 */

/* A negative binomial model can be fit treating the problem as a negative binomial */
/* regression without any predictors using proc genmod. The estimate of mu is then */
/* exp(INTERCEPT), and the dispersion parameter is the estimate of alpha. */
data soccer;
   input Goals Count;
   cards;
0 225
1 293
2 224
3 114
4 41
5 15
6 9
7 1
8 2
;
run;
proc genmod data=soccer;
   model Goals = / dist = nb;
   freq Count;
run;
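/* For example, the fitted mean can be recovered from the intercept in the genmod output; */
/* the intercept value below is a placeholder to be replaced by the printed estimate. */
data _null_;
   intercept = 1.38;          /* INTERCEPT estimate from the genmod output (placeholder) */
   muhat = exp(intercept);    /* estimated negative binomial mean mu */
   put muhat=;
run;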
/* We will use proc iml to get the appropriate goodness-of-fit tests for univariate */
/* binomial and beta-binomial data. */
proc import out=broadway datafile="broadway.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;
data broadway1;
   set broadway (keep=Tony_awards Tony_nominations);
   where Tony_nominations>0;
run;

/* The binomial model can be fit using proc logistic, fitting a model with no predictors. */
/* The scale=none option gives the goodness-of-fit statistics. */
proc logistic data=broadway1;
   model Tony_awards/Tony_nominations = /scale=none;
run;

/* This uses proc iml to calculate the test for overdispersion */
data broadway2;
   set broadway (keep=Tony_awards Tony_nominations);
   p=.2545455;
   awards=Tony_awards;
   nomin=Tony_nominations;
   drop Tony_awards Tony_nominations;
   if awards=. | nomin=. then delete;
   expect=nomin*p;
   dev=(awards-expect)**2/(p*(1-p));
   denom=nomin*(nomin-1);
run;
proc iml;
   use broadway2;
   read all var {dev nomin denom} into m;
   z=(sum(m[,1])-sum(m[,2]))/(2*sum(m[,3]))##.5;
   title "test of overdispersion";
   print z;

/* This code uses proc iml to get the MLEs of the beta-binomial distribution */
proc iml;
   use broadway;
   read all var {Tony_awards Tony_nominations} into m;
   b=1.5;
   start bbdloglike(a) global(m,b);
      y=-sum(lgamma(a+m[,1])+lgamma(b+m[,2]-m[,1])+lgamma(a+b)-lgamma(a+b+m[,2])-lgamma(a)-lgamma(b));
      return (y);
   finish bbdloglike;
   loglike=bbdloglike(1);
   x0=1;
   do b=1.5 to 2 by .01;
      call nlpnra(rc,xr,"bbdloglike", x0);
      temp=bbdloglike(xr);
      if (temp<loglike & rc>0) then do;
         loglike=temp;
         beta=b;
      end;
   end;
   b=beta;
   call nlpnra(rc, alpha, "bbdloglike",1);

   /* This calculates the Pearson goodness-of-fit test for the beta-binomial fit */
   title "beta_binomial fit";
   print alpha beta loglike;
   pi=alpha/(alpha+beta);
   theta=1/(alpha+beta+1);
   x2=sum(((m[,1]-m[,2]#pi)##2)/(m[,2]#pi#(1-pi)#(1+(m[,2]-1)#theta)));
   print x2 pi theta;

   /* Figure 4.6 */
   start beta(p) global(alpha,beta);
      y=(p##(alpha-1))#((1-p)##(beta-1))#gamma(alpha+beta)/(gamma(alpha)#gamma(beta));
      return (y);
   finish beta;
   p=do(.001,.999,.001);
   n=ncol(p);
   p=p`;
   pdf=J(n,1,0);
   do i=1 to n;
      pdf[i]=beta(p[i]);
   end;
   x=p||pdf;
   create temp from x;
   append from x;

data temp;
   set temp;
   rename col1=p col2=pdf;
run;
symbol v=none i=join;
title "Figure 4.6";
proc gplot data=temp;
   plot pdf*p;
run;

/* Section 4.6 */

proc import out=runshoes datafile="runshoes.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;

/* The macro zero_trun_p fits a univariate zero-truncated Poisson model. See macro files. */
%include 'zero_trun_p.sas';
%zero_trun_p(indata=runshoes,y=shoes);

/* Chapter 5 */

/* Section 5.2 */

/* Poisson regression is performed using proc genmod. The genmodsummary macro can be used */
/* here to make AIC and AICc comparisons of models. Note that the "p" distribution */
/* (Poisson) and log link are chosen. The other count variable distribution option is */
/* negative binomial (nb). */
%include 'genmodsummary.sas';
proc import out=tornado datafile="tornado.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;
%genmodsummary(data=tornado, class=month year, yvariable=deaths,
   model1=, model2=killer_tornadoes, model3=tornadoes, model4=year, model5=month,
   model6=killer_tornadoes tornadoes, model7=killer_tornadoes year,
   model8=killer_tornadoes month, model9=tornadoes year, model10=tornadoes month,
   model11=year month, model12=killer_tornadoes tornadoes year,
   model13=killer_tornadoes tornadoes month, model14=killer_tornadoes year month,
   model15=tornadoes year month, model16=killer_tornadoes tornadoes year month,
   distribution=p, link=log, number_of_models=16);
%genmodsummary(data=tornado, class=month year, yvariable=deaths,
   model1=killer_tornadoes month, model2=killer_tornadoes|month,
   distribution=p, link=log, number_of_models=2);

/* This fits the Poisson regression model. The effect codings are estimated and tested */
/* using the contrasts approach used earlier in the least squares fitting (the resultant */
/* Wald tests are the squares of the z-tests). */
proc genmod data=tornado;
   class month_number;
   model deaths = killer_tornadoes month_number / dist=p link=log;
   estimate 'January' month_number .9167 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'February' month_number -.0833 .9167 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'March' month_number -.0833 -.0834 .9167 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'April' month_number -.0833 -.0834 -.0833 .9167 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'May' month_number -.0833 -.0834 -.0833 -.0833 .9167 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'June' month_number -.0833 -.0834 -.0833 -.0833 -.0834 .9167 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'July' month_number -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 .9167 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'August' month_number -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 .9167 -.0834 -.0833 -.0833 -.0834;
   estimate 'September' month_number -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 .9167 -.0833 -.0833 -.0834;
   estimate 'October' month_number -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 .9167 -.0833 -.0834;
   estimate 'November' month_number -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 .9167 -.0834;
   estimate 'December' month_number -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 .9167;
run;

/* Different slopes model. The scaled deviance residuals are output to a file (Pearson */
/* residuals are obtained using reschi, and standardized Pearson residuals are obtained */
/* using stdreschi). */
proc genmod data=tornado;
   class month_number;
   model deaths = killer_tornadoes|month_number / dist=p link=log;
   output out=tornpoisres stdresdev=stdres;
run;
data tornpoisres;
   set tornpoisres;
   obs=_N_;
run;
proc print data=tornpoisres;
   var year month stdres;
run;
proc gplot data=tornpoisres;
   plot stdres*obs;
run;

/* The inflglim macro by Michael Friendly calculates and plots various regression */
/* diagnostics; see http://www.math.yorku.ca/SCS/vcd/inflglim.html. Note that the gskip */
/* macro from the same source must also be loaded. */
%include 'inflglim.sas';
%include 'gskip.sas';

/* Variable names must be shortened */
data tornado2;
   set tornado;
   monthnum=month_number;
   killtorn=killer_tornadoes;
   keep monthnum killtorn deaths;
run;
%inflglim(data=tornado2, class=monthnum, resp=deaths, model=killtorn|monthnum,
   dist=poisson, out=torninf)
data torninf;
   set torninf;
   obs=_N_;
run;

/* The symbol interpol=join statement tells SAS to create a plot with the points connected */
proc gplot data=torninf;
   plot stresdev*obs hat*obs cookd*obs;
   symbol interpol=join;
run;

/* The UK train accident data shows how a model with an offset is fit. */
proc import out=uktrainacc datafile="uktrainacc.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;
data uktrainacc;
   set uktrainacc;
   logtrain=log(train_km);
   logtraingroup=log(train_km_grouped);
run;
proc genmod data=uktrainacc;
   model accidents=year/link=log dist=p offset=logtrain;
run;
proc genmod data=uktrainacc;
   model spad_preventable=year/link=log dist=p offset=logtrain;
run;
proc genmod data=uktrainacc;
   model other_preventable=year/link=log dist=p offset=logtrain;
run;
proc genmod data=uktrainacc;
   model non_preventable=year/link=log dist=p offset=logtrain;
run;
proc genmod data=uktrainacc;
   model spad_grouped=year_grouped/link=log dist=p offset=logtraingroup;
run;
proc genmod data=uktrainacc;
   model other_grouped=year_grouped/link=log dist=p offset=logtraingroup;
run;
proc genmod data=uktrainacc;
   model non_grouped=year_grouped/link=log dist=p offset=logtraingroup;
run;

/* Section 5.3 */

proc import out=wildcat datafile="wildcat.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;

/* Poisson regression fit */
proc genmod data=wildcat;
   model wildcat_strikes= Grievances Rotate Union log_workforce /link=log dist=p;
   output out=result pred=muhat;
run;

/* Two tests for overdispersion */
data result;
   set result;
   u=((wildcat_strikes-muhat)**2-wildcat_strikes)/muhat;
run;
proc reg data=result;
   model u = ;
   model u = muhat / noint;
run;

/* Quasi-likelihood is performed using the scale=pearson option */
proc genmod data=wildcat;
   model wildcat_strikes= Grievances Rotate Union log_workforce /link=log dist=p scale=pearson;
run;

/* Negative binomial analysis of Florida election data */
proc import out=election2000 datafile="election2000.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;
data election2000;
   set election2000;
   Bush_rate=Bush00/total00;
   Nader_rate=Nader00/total00;
   Perot_rate=Perot96/total96;
run;

/* This sets Buchanan's Palm Beach value to missing, so that predictions can be made easily */
data election;
   set election2000;
   if county='Palm Beach' then Buchanan00=.;
   logtotal=log(total00);
run;

/* Models based on the Bush percentage only */
proc genmod data=election;
   model Buchanan00=Bush_rate/link=log dist=p offset=logtotal;
   output out=rslt_elect_p pred=predict_B;
run;

/* Print out predicted value for Palm Beach */
data rslt_elect_p;
   set rslt_elect_p;
   where (county='Palm Beach');
   Buchanan00=3407;
run;
proc print data=rslt_elect_p;
   var county Buchanan00 predict_B;
run;

/* The expected option tells SAS to use the expected information matrix to calculate Wald */
/* statistics, while the nb distribution is negative binomial. */
proc genmod data=election;
   model Buchanan00=Bush_rate/link=log dist=nb offset=logtotal expected;
   output out=rslt_elect_nb pred=predict_B;
run;
data rslt_elect_nb;
   set rslt_elect_nb;
   where (county='Palm Beach');
   Buchanan00=3407;
run;
proc print data=rslt_elect_nb;
   var county Buchanan00 predict_B;
run;

/* Models based on the Bush, Nader, and Perot percentages */
proc genmod data=election;
   model Buchanan00=Bush_rate Nader_rate Perot_rate/link=log dist=p offset=logtotal;
   output out=rslt_elect_p2 pred=predict;
run;
data rslt_elect_p2;
   set rslt_elect_p2;
   where (county='Palm Beach');
   Buchanan00=3407;
run;
proc print data=rslt_elect_p2;
   var county Buchanan00 predict;
run;
proc genmod data=election;
   model Buchanan00=Bush_rate Nader_rate Perot_rate/link=log dist=nb offset=logtotal expected;
   output out=rslt_elect_nb2 pred=predict_BNP;
run;
data rslt_elect_nb2;
   set rslt_elect_nb2;
   where (county='Palm Beach');
   Buchanan00=3407;
run;
proc print data=rslt_elect_nb2;
   var county Buchanan00 predict_BNP;
run;

/* The genmodsummary macro also works for negative binomial models */
%include 'genmodsummary.sas';
%genmodsummary(data=election, yvariable=Buchanan00,
   model1=Bush_rate, model2=Bush_rate Nader_rate Perot_rate, model3=Nader_rate Perot_rate,
   distribution=nb, link=log, offset=logtotal, number_of_models=3);

/* Section 5.5 */

proc import out=homerun datafile="homerun.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;

/* Starting in version 8.2, proc gam constructs generalized additive models. The automatic */
/* smoothing parameter selector used is generalized cross validation (GCV). GCV tends to */
/* lead to less smoothing than does AICc. The "p" option in the output statement saves the */
/* estimated values into a variable P_xxx, where xxx is the target variable name. */
data mcgwire;
   set homerun;
   if (McGwire_out = 1) then delete;
   keep StL_season_day McGwire_HR;
run;
proc gam data=mcgwire;
   model McGwire_HR = loess(StL_season_day) / dist=poisson method=gcv;
   output out=estimate p;
run;
data estimate;
   set estimate;
   pred = exp(P_McGwire_HR);
run;
proc gplot data=estimate;
   plot pred*StL_season_day;
   symbol interpol=join;
run;
data sosa;
   set homerun;
   if (Sosa_out = 1) then delete;
   keep Chi_season_day Sosa_HR;
run;
proc gam data=sosa;
   model Sosa_HR = loess(Chi_season_day) / dist=poisson method=gcv;
   output out=estimate p;
run;
data estimate;
   set estimate;
   pred = exp(P_Sosa_HR);
run;
proc gplot data=estimate;
   plot pred*Chi_season_day;
   symbol interpol=join;
run;

/* Smoothing a multinomial vector */
data texas;
   input Number Count;
   cards;
0 18
1 17
2 16
3 18
4 20
5 15
6 15
7 11
8 9
9 11
;
run;
proc gam data=texas;
   model Count = loess(Number) / dist=poisson;
   output out=estimate p;
run;
data estimate;
   set estimate;
   obs = Count/150;
   pred = exp(P_Count)/150;
run;
proc gplot data=estimate;
   plot pred*Number obs*Number / overlay;
   symbol interpol=join;
run;

/* Generalized additive model */
proc import out=olympic datafile="olympic2000.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;
data olympic;
   set olympic;
   GDPpc = GDP/Population;
run;
proc gam data=olympic;
   model Total2000 = loess(Log_population) loess(GDPpc) loess(Total1996) / dist=poisson method=gcv;
   output out=estimate p;
run;

/* Plotting each of the smoothing components, which are identified by P_xxx, where xxx is */
/* the predictor */
proc sort data=estimate;
   by Log_population;
run;
proc gplot data=estimate;
   plot P_Log_population*Log_population;
   symbol interpol=join;
run;
proc sort data=estimate;
   by GDPpc;
run;
proc gplot data=estimate;
   plot P_GDPpc*GDPpc;
   symbol interpol=join;
run;
proc sort data=estimate;
   by Total1996;
run;
proc gplot data=estimate;
   plot P_Total1996*Total1996;
   symbol interpol=join;
run;

/* Chapter 6 */

/* Section 6.1 */

/* Use proc freq to get both the confidence interval for the difference in probabilities */
/* and chi-squared tests of independence. The riskdiff option gives the confidence interval. */
data leukemia;
   input PCR_status$ Followup_status$ count;
   cards;
Traces_of_cancer Relapse 30
Traces_of_cancer No_relapse 45
Cancer_free Relapse 8
Cancer_free No_relapse 95
;
run;
proc freq data=leukemia order=data;
   weight count;
   tables PCR_status*Followup_status/expected chisq norow nocol nopercent riskdiff;
run;

/* Section 6.2 */

/* Note that more than one observation can be put on the same data line with the @@ option */
data Major_marketing;
   input Time$ Marketing$ count @@;
   cards;
High_school Yes 16 High_school No 41 High_school Undecided 8
Freshman Yes 21 Freshman No 29 Freshman Undecided 8
Sophomore Yes 38 Sophomore No 63 Sophomore Undecided 12
Junior Yes 34 Junior No 30 Junior Undecided 8
Senior Yes 9 Senior No 12 Senior Undecided 10
;
run;

/* proc catmod is used to fit loglinear models. The terms on the left side of the model */
/* statement define the table, with the use of the special keyword _response_. The model is */
/* then specified in the loglin statement. Various details of fitting are suppressed here. */
/* This gives tests for the significance of each effect, and an overall likelihood ratio */
/* (deviance) goodness-of-fit test. */
proc catmod data=Major_marketing;
   weight count;
   model Time*Marketing=_response_ / noiter noparm noresponse noprofile;
   loglin Time Marketing;
run;

/* Tests of independence are better performed as above in proc freq */
proc freq data=Major_marketing order=data;
   weight count;
   tables Time*Marketing / expected chisq norow nopercent;
run;

/* Mosaics are constructed using Michael Friendly's macros and IML code; see */
/* http://www.math.yorku.ca/SCS/mosaics.html for instructions */
%include 'mosaic.sas';
%include 'mosaicm.sas';
%mosaicm;
goptions vsize=7in hsize=7in;  *-- square plot environment;
proc iml;
   start major_marketing;  *-- time marketing data;
      table = { /* 'High_school' 'Freshman' 'Sophomore' 'Junior' 'Senior' */
                16 41 8
                21 29 8
                38 63 12
                34 30 8
                9 12 10 };
      levels = { 3 5 };
      vnames = {'marketing' 'time' };                              /* Variable names */
      lnames = {                                                   /* Category names */
         'Yes' 'No' 'Undecided' ' ' ' ',                           /* marketing */
         'High_school' 'Freshman' 'Sophomore' 'Junior' 'Senior'};  /* time */
      title = 'Marketing-Time';
   finish;
   run major_marketing;
   reset storage=mosaic.mosaic;
   load module=_all_;
   *-- Fit models of joint independence (fittype='JOINT');
   plots = 2;
   htext=1.6;
   run mosaic(levels, table, vnames, lnames, plots, title);

/* Adjusted residuals are standardized Pearson residuals, which can be obtained from proc genmod */
data antibiotic;
   input Diagnosis$ Time$ freq;
   cards;
Bronchitis 1-3/96 113
Bronchitis 4-6/96 58
Bronchitis 7-9/96 40
Bronchitis 10-12/96 108
Bronchitis 1-2/97 100
Sinusitis 1-3/96 99
Sinusitis 4-6/96 37
Sinusitis 7-9/96 23
Sinusitis 10-12/96 50
Sinusitis 1-2/97 32
URI 1-3/96 410
URI 4-6/96 228
URI 7-9/96 125
URI 10-12/96 366
URI 1-2/97 304
Pneumonia 1-3/96 60
Pneumonia 4-6/96 43
Pneumonia 7-9/96 30
Pneumonia 10-12/96 56
Pneumonia 1-2/97 45
;
run;
proc genmod data=antibiotic;
   class Diagnosis Time;
   model freq = Diagnosis Time / dist=p obstats;
run;

/* Section 6.3 */

/* Fisher's exact test is given in proc freq, but the mid-p-value is not given. */
/* The same fisher option works for I x J tables. */
data thermal_imaging;
   input True_grp$ Classified_grp$ count;
   cards;
Guilty Guilty 6
Guilty Innocent 2
Innocent Guilty 1
Innocent Innocent 11
;
run;
proc freq data=thermal_imaging;
   weight count;
   tables True_grp*Classified_grp / fisher;
run;
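/* The mid-p-value can be obtained by hand from the exact point probability of the observed */
/* table: mid-p = exact p-value - (point probability)/2. A minimal sketch, assuming the */
/* point option of the exact statement (which requests exact point probabilities): */
proc freq data=thermal_imaging;
   weight count;
   tables True_grp*Classified_grp;
   exact fisher / point;
run;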
/* Section 6.4 */

/* Structural zeroes do not appear in the file of counts, so there is nothing special */
/* about fitting a model without them. */
proc import out=cancer datafile="cancerrate.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;
proc genmod data=cancer;
   class type gender;
   model ny_count=type gender/link=log dist=p obstats;
run;

/* Section 6.5 */

/* Note how a table can be input */
data artifacts;
   input Artifact$ @;
   Distance='vicinity'; input count @; output;
   Distance='<.25';     input count @; output;
   Distance='.25-.5';   input count @; output;
   Distance='.5-1';     input count @; output;
   cards;
Drills 2 10 4 2
Pots 3 8 4 6
Grinding_stones 13 5 3 9
Point_fragments 20 36 19 20
;
run;

/* Here's how to convert a table to case-by-case form using proc freq. */
/* The sparse option guarantees that zero counts are carried along. This means that if all */
/* of the zeroes in a table are sampling zeroes, sparse should be used, while if all of */
/* them are structural zeroes, it should not be used. If there are some of both, sparse */
/* is used, and then the rows for structural zeroes are deleted in a data step. */
proc freq data=artifacts order=data;
   weight count;
   tables Artifact*Distance/ sparse out=artifact_data noprint;
run;
proc genmod data=artifact_data;
   class Artifact Distance;
   model count=Artifact Distance /link=log dist=p obstats;
run;

/* Another way to set an observation as a structural zero is to set its count to be */
/* missing, rather than delete it. This has the advantage of making it possible to get */
/* deleted residuals. Note that the Grinding_stones category name has been truncated to */
/* the eight-character version Grinding. */
data artifact2_data;
   set artifact_data;
   if Artifact = 'Grinding' and Distance = 'vicinity' then count = .;
   if Artifact = 'Grinding' and Distance = '<.25' then count = .;
run;
proc genmod data=artifact2_data;
   class Artifact Distance;
   model count=Artifact Distance /link=log dist=p obstats;
   output out=out_art p = pred;
run;
data out_art;
   set out_art;
   if Artifact = 'Grinding' and Distance = 'vicinity' then delres = (13 - pred)/sqrt(pred);
   if Artifact = 'Grinding' and Distance = '<.25' then delres = (5 - pred)/sqrt(pred);
run;
proc print data=out_art;
   var delres;
run;
data artifact3_data;
   set artifact_data;
   if Artifact = 'Grinding' and Distance = 'vicinity' then count = .;
run;
proc genmod data=artifact3_data;
   class Artifact Distance;
   model count=Artifact Distance /link=log dist=p;
   output out=out_art p = pred;
run;
data out_art;
   set out_art;
   if Artifact = 'Grinding' and Distance = 'vicinity' then delres = (13 - pred)/sqrt(pred);
run;
proc print data=out_art;
   var delres;
run;

/* Chapter 7 */

/* Section 7.1 */

/* Uniform association model */
data fiscal;
   input Rating$ Nonprofits Count @@;
   cards;
BBB 0 1 BBB 1 0 BBB 2 1 BBB 3 2 BBB 4 0 BBB 5 2
A 0 1 A 1 1 A 2 3 A 3 5 A 4 2 A 5 0
AA 0 5 AA 1 12 AA 2 11 AA 3 4 AA 4 1 AA 5 0
AAA 0 0 AAA 1 4 AAA 2 2 AAA 3 0 AAA 4 0 AAA 5 0
;
run;
data fiscal;
   set fiscal;
   if Rating='BBB' then row=1;
   else if Rating='A' then row=2;
   else if Rating='AA' then row=3;
   else if Rating='AAA' then row=4;
   col=Nonprofits+1;
   assoc=row*col;
run;
proc genmod data=fiscal;
   class Rating Nonprofits;
   model Count=Rating Nonprofits / dist=p link=log;
run;
proc genmod data=fiscal;
   class Rating Nonprofits;
   model Count=Rating Nonprofits assoc / dist=p link=log residuals;
run;
data fiscal2;
   set fiscal;
   if Rating='BBB' and Nonprofits=0 then delete;
run;
proc genmod data=fiscal2;
   class Rating Nonprofits;
   model Count=Rating Nonprofits / dist=p link=log;
run;
proc genmod data=fiscal2;
   class Rating Nonprofits;
   model Count=Rating Nonprofits assoc / dist=p link=log;
run;

/* Column effects model */
data assess;
   input Response$ Type$ Count @@;
   cards;
Nothing Structured 0 Nothing Open 0 Nothing Article 1 Nothing Output 0
Little Structured 3 Little Open 1 Little Article 6 Little Output 4
Enough Structured 8 Enough Open 7 Enough Article 4 Enough Output 8
Lots Structured 3 Lots Open 6 Lots Article 2 Lots Output 2
;
run;
data assess;
   set assess;
   if Response='Nothing' then row=1;
   else if Response='Little' then row=2;
   else if Response='Enough' then row=3;
   else if Response='Lots' then row=4;
run;
proc genmod data=assess;
   class Response Type;
   model Count=Response Type / dist=p link=log;
run;
/* The type3 option gives partial tests for each effect, including the Type*row effect, */
/* which corresponds to the test of independence given the column effects model. */
proc genmod data=assess;
   class Response Type;
   model Count=Response Type Type*row / dist=p link=log type3;
run;

/* genmodsummary is used to calculate AICc values */
%include 'genmodsummary.sas';
%genmodsummary(data=assess, class=Response Type, yvariable=Count,
   model1=Response Type, model2=Response Type Type*row,
   distribution=p, link=log, number_of_models=2);

/* Row + column effects model */
data myopia;
   input Refraction$ Lighting$ Count @@;
   cards;
High_hyperopia Darkness 2 High_hyperopia Night_light 1 High_hyperopia Room_light 1
Hyperopia Darkness 21 Hyperopia Night_light 16 Hyperopia Room_light 15
Emmetropia Darkness 66 Emmetropia Night_light 50 Emmetropia Room_light 29
Myopia Darkness 9 Myopia Night_light 31 Myopia Room_light 48
High_myopia Darkness 1 High_myopia Night_light 3 High_myopia Room_light 7
;
run;
data myopia;
   set myopia;
   if Refraction='High_hyp' then row=1;
   else if Refraction='Hyperopi' then row=2;
   else if Refraction='Emmetrop' then row=3;
   else if Refraction='Myopia' then row=4;
   else if Refraction='High_myo' then row=5;
   if Lighting='Darkness' then col=1;
   else if Lighting='Night_li' then col=2;
   else if Lighting='Room_lig' then col=3;
   if Refraction='Myopia' then myopiaeff=1;
   else if Refraction='High_myo' then myopiaeff=1;
   else myopiaeff=0;
   if refraction='Myopia' then my_high=1;
   else if refraction='High_myo' then my_high=2;
   else my_high=0;
   l=Lighting;
   r=Refraction;
   m=myopiaeff;
   mh=my_high;
   assoc=row*col;
run;
%genmodsummary(data=myopia, class=l r m mh, yvariable=count,
   model1=l r, model2=l r assoc, model3=l r r*col, model4=l r l*row,
   model5=l r r*col l*row, model6=l r m*col, model7=l r mh*col,
   distribution=p, link=log, number_of_models=7);

/* Here is the row effects fit. Note that the type3 option will give a proper test for */
/* (I|R), but not (U|R), since the uniform association model is not a simple term drop */
/* from row effects (but it can be obtained from the G^2 values given by genmodsummary). */
/* The contrast estimates correspond to the row effects parameters. */
proc genmod data=myopia order=data;
   class Refraction Lighting;
   model Count=Refraction Lighting Refraction*col / link=log dist=p type3;
   estimate 'High_hyp' Refraction*col .8 -.2 -.2 -.2 -.2;
   estimate 'Hyperopia' Refraction*col -.2 .8 -.2 -.2 -.2;
   estimate 'Emmetropia' Refraction*col -.2 -.2 .8 -.2 -.2;
   estimate 'Myopia' Refraction*col -.2 -.2 -.2 .8 -.2;
   estimate 'High_myopia' Refraction*col -.2 -.2 -.2 -.2 .8;
run;
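/* The (U|R) test is the difference in G^2 between the uniform association and row effects */
/* fits. A minimal sketch of the computation, with placeholder deviance values to be read */
/* from the genmodsummary/genmod output; for this 5 x 3 table the row effects model has */
/* four free association parameters and uniform association has one, so there are 3 df. */
data _null_;
   g2_u = 10;   /* G^2 for the uniform association model (placeholder) */
   g2_r = 5;    /* G^2 for the row effects model (placeholder) */
   p = 1 - probchi(g2_u - g2_r, 3);
   put p=;
run;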
/* Fitting bivariate Poisson distributions */
proc import out=gviolence datafile="gviolence.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;
proc freq data=gviolence;
   tables Good_neutral_injuries*Bad_injuries / sparse out=gviolence_table nopercent nocol norow;
run;
data gviolence_table;
   set gviolence_table;
   col=Bad_injuries;
   row=Good_neutral_injuries;
   assoc=row*col;
   offs=-log(fact(col))-log(fact(row));
run;

/* Determine AICc values for independence and association models */
%include 'genmodsummary.sas';
%genmodsummary(data=gviolence_table, class=Good_neutral_injuries Bad_injuries, yvariable=count,
   model1=Good_neutral_injuries Bad_injuries,
   model2=Good_neutral_injuries Bad_injuries assoc,
   distribution=p, link=log, number_of_models=2);

/* Determine AICc values for bivariate Poisson models. Note that these couldn't be done */
/* with the other models, since they require the offset. Note also that the differences in */
/* AICc values are not right here, since we are not comparing all of the models at once; */
/* use the actual AICc values for comparisons. */
%genmodsummary(data=gviolence_table, yvariable=count,
   model1=row col, model2=row col row*col,
   distribution=p, link=log, offset=offs, number_of_models=2);

/* Omit the outlier. Association models are fit as usual. The code below shows how to fit */
/* the equal-mean independent and correlated Poisson models. */
data gviolence_table2;
   set gviolence_table;
   if (row=8) then delete;
   rowpcol=row+col;
run;

/* An alternative to the Wald test for equality of means of two independent Poissons is */
/* the likelihood ratio test obtained by the difference in deviance values for the model */
/* with unequal means and the model with equal means. */
%genmodsummary(data=gviolence_table2, yvariable=count,
   model1=row col, model2=rowpcol,
   distribution=p, link=log, offset=offs, number_of_models=2);

/* Unequal-mean and equal-mean correlated Poisson models */
%genmodsummary(data=gviolence_table2, yvariable=count,
   model1=row col row*col, model2=rowpcol row*col,
   distribution=p, link=log, offset=offs, number_of_models=2);

/* Section 7.2 */

data rating;
   input Rating97$ Rating98$ Count @@;
   cards;
AAA AAA 111 AAA AA 5 AAA A 0 AAA BBB 0 AAA Spec 0
AA AAA 6 AA AA 396 AA A 19 AA BBB 2 AA Spec 2
A AAA 0 A AA 18 A A 938 A BBB 41 A Spec 8
BBB AAA 0 BBB AA 2 BBB A 27 BBB BBB 671 BBB Spec 29
Spec AAA 0 Spec AA 0 Spec A 4 Spec BBB 48 Spec Spec 930
;
run;

/* The macro factors creates the appropriate codings for many different symmetry models. */
/* It is given in the paper "Generating Factor Variables for Asymmetry, Non-independence */
/* and Skew-symmetry Models in Square Contingency Tables Using SAS" by H.B. Lawal and */
/* R.A. Sundheim, Journal of Statistical Software, 7(8) [2002]; see */
/* http://www.jstatsoft.org/index.php?vol=7# */
%include 'factors.sas';

/* This creates the factors for a 5 x 5 table */
%Factors(5, FactorDSN=factors);

/* This keeps the Q (diagonal quasi-independence) and S (symmetry) factors */
data rating;
   merge rating factors;
   keep Rating97 Rating98 Q S Count;
run;
%include 'genmodsummary.sas';
%genmodsummary(data=rating, class=Rating97 Rating98 Q S, yvariable=count,
   model1=Rating97 Rating98, model2=Rating97 Rating98 Q, model3=Q S,
   distribution=p, link=log, number_of_models=3);

/* Quasi-symmetry model */
data germanmob;
   input job1848$ initial$ count;
   cards;
Justice Justice 117
Justice Administration 11
Justice Education 4
Justice Military 0
Justice Church 0
Justice Farmer 0
Justice Lawyer 8
Justice Self-employed 0
Administration Justice 83
Administration Administration 37
Administration Education 3
Administration Military 5
Administration Church 1
Administration Farmer 4
Administration Lawyer 2
Administration Self-employed 7
Education Justice 19
Education Administration 6
Education Education 67
Education Military 1
Education Church 11
Education Farmer 0
Education Lawyer 1
Education Self-employed 20
Military Justice 0
Military Administration 0
Military Education 1
Military Military 16
Military Church 0
Military Farmer 0
Military Lawyer 0
Military Self-employed 0
Church Justice 1
Church Administration 0
Church Education 4
Church Military 0
Church Church 26
Church Farmer 0
Church Lawyer 0
Church Self-employed 4
Farmer Justice 17
Farmer Administration 7
Farmer Education 3
Farmer Military 8
Farmer Church 0
Farmer Farmer 19
Farmer Lawyer 0
Farmer Self-employed 1
Lawyer Justice 76
Lawyer Administration 7
Lawyer Education 5
Lawyer Military 1
Lawyer Church 1
Lawyer Farmer 1
Lawyer Lawyer 22
Lawyer Self-employed 0
Self-employed Justice 6
Self-employed Administration 6
Self-employed Education 13
Self-employed Military 0
Self-employed Church 0
Self-employed Farmer 0
Self-employed Lawyer 1
Self-employed Self-employed 37
;
run;
%include 'factors.sas';
%include 'genmodsummary.sas';
%Factors(8, FactorDSN=factors);
data germanmob;
   merge germanmob factors;
   keep initial job1848 Q S Count;
run;
%genmodsummary(data=germanmob, class=initial job1848 Q S, yvariable=count,
   model1=initial job1848, model2=initial job1848 Q, model3=Q S, model4=initial job1848 S,
   distribution=p, link=log, number_of_models=4);

/* Scaled differences between quasi-symmetry and quasi-independence */
proc genmod data=germanmob order=data;
   class initial job1848 Q;
   model Count= initial job1848 Q / link=log dist=p;
   output out=german_qi p=predqi;
run;
proc genmod data=germanmob order=data;
   class initial job1848 S;
   model Count= initial job1848 S / link=log dist=p;
   output out=german_qs p=predqs;
run;
data scalediff;
   merge german_qi german_qs;
   diff=(predqs-predqi)/sqrt(predqi);
   keep initial job1848 diff;
run;
proc print data=scalediff;
run;

/* Ordinal square table */
data britishmob;
   input son father count @@;
   cards;
1 1 50 1 2 16 1 3 12 1 4 11 1 5 14 1 6 0 1 7 0
2 1 19 2 2 40 2 3 35 2 4 20 2 5 36 2 6 6 2 7 3
3 1 26 3 2 34 3 3 65 3 4 58 3 5 114 3 6 19 3 7 14
4 1 8 4 2 18 4 3 66 4 4 110 4 5 185 4 6 40 4 7 32
5 1 18 5 2 31 5 3 123 5 4 223 5 5 714 5 6 179 5 7 141
6 1 6 6 2 8 6 3 23 6 4 64 6 5 258 6 6 143 6 7 91
7 1 2 7 2 3 7 3 21 7 4 32 7 5 189 7 6 71 7 7 106
;
run;
%include 'factors.sas';
%include 'genmodsummary.sas';
%Factors(7, FactorDSN=factors);
data britishmob;
   merge britishmob factors;
   rownum=father;
   colnum=son;
   keep father son rownum colnum Q S Count;
run;
%genmodsummary(data=britishmob, class=father son Q S, yvariable=count,
   model1=father son, model2=father son Q, model3=Q S, model4=father son S,
   model5=Q S colnum, model6=father son rownum*colnum, model7=father son Q rownum*colnum,
   distribution=p, link=log, number_of_models=7);

/* McNemar's test is in proc freq */
data diabetic;
   input pre$ post$ count;
   cards;
Not_satisfactory Not_satisfactory 1
Not_satisfactory Satisfactory 12
Satisfactory Not_satisfactory 0
Satisfactory Satisfactory 4
;
run;
proc freq data=diabetic order=data;
   weight count;
   tables pre*post / agree nopercent norow nocol;
run;
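/* McNemar's statistic depends only on the discordant cells: M = (b - c)^2/(b + c), where */
/* b and c are the two off-diagonal counts. A minimal check for the diabetic table (b=12, */
/* c=0), using the usual chi-squared reference distribution with 1 df: */
data _null_;
   b = 12;
   c = 0;
   m = (b-c)**2/(b+c);
   p = 1 - probchi(m, 1);
   put m= p=;
run;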
/* Kappa coefficient is in proc freq */
data thermometer;
   input rectal$ pacifier$ count;
   cards;
No_fever No_fever 49
No_fever Fever 1
Fever No_fever 14
Fever Fever 36
;
run;
proc freq data=thermometer order=data;
   weight count;
   tables rectal*pacifier / agree nopercent norow nocol;
run;

/* Weighted kappa coefficient is in proc freq */
data socktest;
   input therapist1$ therapist2$ count @@;
   cards;
0 0 5 0 1 0 0 2 0 0 3 0
1 0 3 1 1 5 1 2 0 1 3 0
2 0 0 2 1 2 2 2 3 2 3 0
3 0 0 3 1 0 3 2 0 3 3 3
;
run;

/* To get quadratic weights, use the (wt=fc) option in the agree setting. */
proc freq data=socktest order=data;
   weight count;
   tables therapist1*therapist2 / agree(wt=fc) nopercent norow nocol;
run;
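/* The Fleiss-Cohen weights requested by (wt=fc) are quadratic: with the default integer */
/* category scores, w(i,j) = 1 - (i-j)^2/(R-1)^2 for an R-level table. A minimal */
/* illustration for one pair of levels of the 4-level socktest table: */
data _null_;
   R = 4;          /* number of rating categories */
   i = 1; j = 3;   /* an example pair of category scores */
   w = 1 - (i-j)**2/(R-1)**2;
   put w=;
run;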
/* The default weighted kappa uses absolute weights. */
proc freq data=socktest order=data;
   weight count;
   tables therapist1*therapist2 / agree nopercent norow nocol;
run;

/* Exact analysis of McNemar's test */
data nutrition;
   input WHO$ Ehrenberg$ count;
   cards;
Malnourished Malnourished 20
Malnourished Normal 0
Normal Malnourished 4
Normal Normal 237
;
run;
proc freq data=nutrition order=data;
   weight count;
   tables WHO*Ehrenberg / nopercent norow nocol;
   exact mcnem;
run;

/* Chapter 8 */

/* Section 8.1 */

/* In an I x J x K table, put the variable with K levels as the first variable in the */
/* tables statement in proc freq to get separate analyses conditioning on that variable. */
/* The chisq option gives separate chi-squared tests for each subtable. */
data freezer;
   input Ownership$ Social_group$ Freezer$ count @@;
   cards;
Owner I-II Yes 304 Owner I-II No 38 Owner III Yes 666 Owner III No 85
Owner IV Yes 894 Owner IV No 93 Owner V Yes 720 Owner V No 84
Renter I-II Yes 92 Renter I-II No 64 Renter III Yes 174 Renter III No 113
Renter IV Yes 379 Renter IV No 321 Renter V Yes 433 Renter V No 297
;
run;
proc freq data=freezer order=data;
   weight count;
   tables Ownership*Social_group*Freezer / nocol norow nopercent chisq;
run;

/* Tests of conditional association are given in proc freq using the cmh option; it gives */
/* the CMH and BD tests, and the Mantel-Haenszel estimate of the common odds ratio, which */
/* is given under "Odds Ratio". */
data whickham;
   input Age$ Smoking$ Mortality$ count @@;
   cards;
18-24 Smoker Alive 53 18-24 Smoker Dead 2
18-24 Nonsmoker Alive 61 18-24 Nonsmoker Dead 1
25-34 Smoker Alive 121 25-34 Smoker Dead 3
25-34 Nonsmoker Alive 152 25-34 Nonsmoker Dead 5
35-44 Smoker Alive 95 35-44 Smoker Dead 14
35-44 Nonsmoker Alive 114 35-44 Nonsmoker Dead 7
45-54 Smoker Alive 103 45-54 Smoker Dead 27
45-54 Nonsmoker Alive 66 45-54 Nonsmoker Dead 12
55-64 Smoker Alive 64 55-64 Smoker Dead 51
55-64 Nonsmoker Alive 81 55-64 Nonsmoker Dead 40
65-74 Smoker Alive 7 65-74 Smoker Dead 29
65-74 Nonsmoker Alive 28 65-74 Nonsmoker Dead 101
75+ Smoker Alive 0 75+ Smoker Dead 13
75+ Nonsmoker Alive 0 75+ Nonsmoker Dead 64
;
run;
proc freq data=whickham order=data;
   weight count;
   tables Age*Smoking*Mortality / nocol norow nopercent cmh;
run;

/* Section 8.2 */

/* Loglinear models can be fit using either proc genmod or proc catmod. I will use */
/* genmodsummary (and hence proc genmod) to get AICc values, but I will first show how to */
/* fit a loglinear model using proc catmod. */
data whickham;
   input Age$ Smoking$ Mortality$ count @@;
   cards;
18-24 Smoker Alive 53 18-24 Smoker Dead 2
18-24 Nonsmoker Alive 61 18-24 Nonsmoker Dead 1
25-34 Smoker Alive 121 25-34 Smoker Dead 3
25-34 Nonsmoker Alive 152 25-34 Nonsmoker Dead 5
35-44 Smoker Alive 95 35-44 Smoker Dead 14
35-44 Nonsmoker Alive 114 35-44 Nonsmoker Dead 7
45-54 Smoker Alive 103 45-54 Smoker Dead 27
45-54 Nonsmoker Alive 66 45-54 Nonsmoker Dead 12
55-64 Smoker Alive 64 55-64 Smoker Dead 51
55-64 Nonsmoker Alive 81 55-64 Nonsmoker Dead 40
65-74 Smoker Alive 7 65-74 Smoker Dead 29
65-74 Nonsmoker Alive 28 65-74 Nonsmoker Dead 101
75+ Smoker Alive 0 75+ Smoker Dead 13
75+ Nonsmoker Alive 0 75+ Nonsmoker Dead 64
;
run;

/* This fits the (S, MA) model. Note the use of | to specify an interaction in the */
/* sufficient marginal. */
proc catmod data=whickham order=data;
   weight count;
   model Age*Smoking*Mortality = _response_ / noiter noparm noresponse noprofile;
   loglin Smoking Age|Mortality;
run;

/* All of the models can be fit and compared using the genmodsummary macro */
%include 'genmodsummary.sas';
%genmodsummary(data=whickham, class=Age Smoking Mortality, yvariable=count,
   model1=Age*Smoking Age*Mortality Mortality*Smoking,
   model2=Age*Mortality Mortality*Smoking,
   model3=Age*Smoking Mortality*Smoking,
   model4=Age*Smoking Age*Mortality,
   model5=Smoking Age*Mortality,
   model6=Mortality Age*Smoking,
   distribution=p, link=log, number_of_models=6);

/* Here is the proc genmod fit of the homogeneous association model, with the fitted */
/* values printed out. Note that the type3 option gives the likelihood ratio tests for */
/* AS, MA, and MS given the model. */
proc genmod data=whickham order=data;
   class Age Smoking Mortality;
   model count=Age*Smoking Age*Mortality Smoking*Mortality / dist=p link=log pred type3;
run;

/* Military death data */
data mildeath;
   input Gender$ Manner$ Service$ count;
   cards;
Male Accident Army 158
Male Illness Army 56
Male Homicide Army 11
Male Self-inflicted Army 51
Male Unknown Army 24
Female Accident Army 15
Female Illness Army 5
Female Homicide Army 5
Female Self-inflicted Army 3
Female Unknown Army 1
Male Accident Navy 96
Male Illness Navy 30
Male Homicide Navy 7
Male Self-inflicted Navy 32
Male Unknown Navy 9
Female Accident Navy 9
Female Illness Navy 4
Female Homicide Navy 2
Female Self-inflicted Navy 3
Female Unknown Navy 2
Male Accident Air 54
Male Illness Air 21
Male Homicide Air 2
Male Self-inflicted Air 13
Male Unknown Air 13
Female Accident Air 7
Female Illness Air 4
Female Homicide Air 0
Female Self-inflicted Air 1
Female Unknown Air 2
Male Accident Marine 69
Male Illness Marine 6
Male Homicide Marine 6
Male Self-inflicted Marine 7
Male Unknown Marine 27
Female Accident Marine 3
Female Illness Marine 0
Female Homicide Marine 1
Female Self-inflicted Marine 0
Female Unknown Marine 2
;
run;
%include 'genmodsummary.sas';
%genmodsummary(data=mildeath, class=Gender Manner Service, yvariable=count,
   model1=Service*Manner Service*Gender Manner*Gender,
   model2=Service*Manner Service*Gender,
   model3=Service*Manner Manner*Gender,
   model4=Service*Gender Manner*Gender,
   model5=Manner Service*Gender,
   model6=Gender Service*Manner,
   model7=Service Manner*Gender,
   distribution=p, link=log, number_of_models=7);
proc genmod data=mildeath order=data;
   class Gender Manner Service;
   model count=Gender Service*Manner / dist=p link=log;
   estimate 'Male' Gender .5 -.5;
   estimate 'Female' Gender -.5 .5;
run;

/* Figure 8.3 - three-dimensional mosaic plot */
%include 'mosaic.sas';
%include 'mosaicm.sas';
%mosaicm;
goptions vsize=8in hsize=8in;  *-- square plot environment;
proc iml;
   start service;  *-- service data;
      table = { 158 56 11 51 24
                15 5 5 3 1
                96 30 7 32 9
                9 4 2 3 2
                54 21 2 13 13
                7 4 0 1 2
                69 6 6 7 27
                3 0 1 0 2 };
      levels = {5 8};
      vnames = { 'Manner' 'Gender_Service'};   /* Variable names */
      lnames = {                               /* Category names */
         'Accident' 'Illness' 'Homicide' 'Self-inflicted' 'Unknown' '' '' '',
         'Army_Male' 'Army_Female' 'Navy_Male' 'Navy_Female' 'Air_Male' 'Air_Female'
         'Marine_Male' 'Marine_Female' };
      title = 'Service';
   finish;
   run service;
   reset storage=mosaic.mosaic;
   load module=_all_;
   *-- Fit models of joint independence (fittype='JOINT');
   plots = 2;
   htext=0.6;
   run mosaic(levels, table, vnames, lnames, plots, title);
quit;
%include 'mosaic.sas';
%include 'mosaicm.sas';
%mosaicm;
goptions vsize=8in hsize=8in;  *-- square plot environment;
proc iml;
   start service;  *-- service data;
      table = { 157.31406 55.46912 14.549277 49.103811 22.733246
                15.68594 5.5308804 1.4507227 4.8961892 2.2667543
                95.479632 30.917214 8.1839685 31.826544 10.002628
                9.5203679 3.0827858 0.8160315 3.173456 0.9973719
                55.46912 22.733246 1.8186597 12.730618 13.639947
                5.5308804 2.2667543 0.1813403 1.2693824 1.3600526
                65.471748 5.455979 6.3653088 6.3653088 26.370565
                6.5282523 0.544021 0.6346912 0.6346912 2.629435 };
      levels = {5 8};
      vnames = { 'Manner' 'Gender_Service'};   /* Variable names */
      lnames = {                               /* Category names */
         'Accident' 'Illness' 'Homicide' 'Self-inflicted' 'Unknown' '' '' '',
         'Army_Male' 'Army_Female' 'Navy_Male' 'Navy_Female' 'Air_Male' 'Air_Female'
         'Marine_Male' 'Marine_Female' };
      title = 'Fitted Counts';
   finish;
   run service;
   reset storage=mosaic.mosaic;
   load module=_all_;
   *-- Fit models of joint independence (fittype='JOINT');
   plots = 2;
   htext=0.6;
   run mosaic(levels, table, vnames, lnames, plots, title);
quit;

/* Now fitting models on death rates */
data miltotal;
   input total @@;
   cards;
407398 407398 407398 407398 407398 72028 72028 72028 72028 72028
321424 321424 321424 321424 321424 51622 51622 51622 51622 51622
294117 294117 294117 294117 294117 66473 66473 66473 66473 66473
162477 162477 162477 162477 162477 10164 10164 10164 10164 10164
;
run;
data mildeath;
   merge mildeath miltotal;
   logtotal=log(total);
run;
%genmodsummary(data=mildeath, class=Gender Manner Service, yvariable=count,
   model1=Service*Manner Service*Gender Manner*Gender,
   model2=Service*Manner Service*Gender,
   model3=Service*Manner Manner*Gender,
   model4=Service*Gender Manner*Gender,
   model5=Manner Service*Gender,
   model6=Gender Service*Manner,
   model7=Service Manner*Gender,
   distribution=p, link=log, offset=logtotal, number_of_models=7);
proc genmod data=mildeath order=data;
   class Gender Manner Service;
   model count=Gender Service*Manner / dist=p link=log offset=logtotal;
   estimate 'Male' Gender .5 -.5;
   estimate 'Female' Gender -.5 .5;
run;

/* Section 8.3 */

data bstudents;
   input When_major_chosen$ Employment$ Salary$ count;
   cards;
High_school Very_unimportant Very_unimportant 2
High_school Unimportant Very_unimportant 0
High_school Neutral Very_unimportant 0
High_school Important Very_unimportant 0
High_school Very_important Very_unimportant 1
Freshman Very_unimportant Very_unimportant 0
Freshman Unimportant Very_unimportant 0
Freshman Neutral Very_unimportant 0
Freshman Important Very_unimportant 0
Freshman Very_important Very_unimportant 0
Sophomore Very_unimportant Very_unimportant 2
Sophomore Unimportant Very_unimportant 0
Sophomore Neutral Very_unimportant 0
Sophomore Important Very_unimportant 0
Sophomore Very_important Very_unimportant 0
Junior Very_unimportant Very_unimportant 0
Junior Unimportant Very_unimportant 0
Junior Neutral Very_unimportant 0
Junior Important Very_unimportant 0
Junior Very_important Very_unimportant 0
Senior Very_unimportant Very_unimportant 2
Senior Unimportant Very_unimportant 0
Senior Neutral Very_unimportant 0
Senior Important Very_unimportant 0
Senior Very_important Very_unimportant 0
High_school Very_unimportant Unimportant 0
High_school Unimportant Unimportant 0
High_school Neutral Unimportant 1
High_school Important Unimportant 0
High_school Very_important Unimportant 0
Freshman Very_unimportant Unimportant 0
Freshman Unimportant Unimportant 0
Freshman Neutral Unimportant 0
Freshman Important Unimportant 1
Freshman Very_important Unimportant 0
Sophomore Very_unimportant Unimportant 0
Sophomore Unimportant Unimportant 1
Sophomore Neutral Unimportant 1
Sophomore Important Unimportant 0
Sophomore Very_important Unimportant 0
Junior Very_unimportant Unimportant 0
Junior Unimportant Unimportant 1
Junior Neutral Unimportant 0
Junior Important Unimportant 1
Junior Very_important Unimportant 1
Senior Very_unimportant Unimportant 0
Senior Unimportant Unimportant 1
Senior Neutral Unimportant 0
Senior Important Unimportant 0
Senior Very_important Unimportant 0
High_school Very_unimportant Neutral 0
High_school Unimportant Neutral 3
High_school Neutral Neutral 3
High_school Important Neutral 1
High_school Very_important Neutral 2
Freshman Very_unimportant Neutral 0
Freshman Unimportant Neutral 0
Freshman Neutral Neutral 2
Freshman Important Neutral 4
Freshman Very_important Neutral 2
Sophomore Very_unimportant Neutral 0
Sophomore Unimportant Neutral 0
Sophomore Neutral Neutral 4
Sophomore Important Neutral 7
Sophomore Very_important Neutral 5
Junior Very_unimportant Neutral 0
Junior Unimportant Neutral 0
Junior Neutral Neutral 1
Junior Important Neutral 4
Junior Very_important Neutral 2
Senior Very_unimportant Neutral 1
Senior Unimportant Neutral 0
Senior Neutral Neutral 1
Senior Important Neutral 0
Senior Very_important Neutral 0
High_school Very_unimportant Important 0
High_school Unimportant Important 0
High_school Neutral Important 0
High_school Important Important 8
High_school Very_important Important 10
Freshman Very_unimportant Important 0
Freshman Unimportant Important 0
Freshman Neutral Important 1
Freshman Important Important 6
Freshman Very_important Important 15
Sophomore Very_unimportant Important 0
Sophomore Unimportant Important 1
Sophomore Neutral Important 3
Sophomore Important Important 19
Sophomore Very_important Important 22
Junior Very_unimportant Important 0
Junior Unimportant Important 1
Junior Neutral Important 1
Junior Important Important 15
Junior Very_important Important 14
Senior Very_unimportant Important 0
Senior Unimportant Important 0
Senior Neutral Important 2
Senior Important Important 8
Senior Very_important Important 6
High_school Very_unimportant Very_important 1
High_school Unimportant Very_important 1
High_school Neutral Very_important 0
High_school Important Very_important 8
High_school Very_important Very_important 25
Freshman Very_unimportant Very_important 0
Freshman Unimportant Very_important 0
Freshman Neutral Very_important 1
Freshman Important Very_important 6
Freshman Very_important Very_important 21
Sophomore Very_unimportant Very_important 0
Sophomore Unimportant Very_important 1
Sophomore Neutral Very_important 1
Sophomore Important Very_important 10
Sophomore Very_important Very_important 38
Junior Very_unimportant Very_important 0
Junior Unimportant Very_important 0
Junior Neutral Very_important 3
Junior Important Very_important 3
Junior Very_important Very_important 24
Senior Very_unimportant Very_important 1
Senior Unimportant Very_important 0
Senior Neutral Very_important 2
Senior Important Very_important 2
Senior Very_important Very_important 4
;
run;
data bstudents;
   set bstudents;
   m=When_major_chosen;
   e=Employment;
   s=Salary;
run;
%genmodsummary(data=bstudents, class=m e s, yvariable=count,
   model1=s*e s*m e*m, model2=s*e s*m, model3=s*e e*m, model4=s*m e*m,
   model5=s e*m, model6=s*e m, model7=s e m,
   distribution=p, link=log, number_of_models=7);

/* The row and column scores are coded explicitly here */
data bstudents;
   set bstudents;
   if s='Very_uni' then col=1;
   else if s='Unimport' then col=2;
   else if s='Neutral' then col=3;
   else if s='Importan' then col=4;
   else if s='Very_imp' then col=5;
   if e='Very_uni' then row=1;
   else if e='Unimport' then row=2;
   else if e='Neutral' then row=3;
   else if e='Importan' then row=4;
   else if e='Very_imp' then row=5;
   /* This codes the diagonal and off-diagonal cells together in a class */
   /* variable that corresponds to both quasi-independence and symmetry */
   if row=col then symmetry=row;
   else if row>col then symmetry=row*10+col;
   else if row<col then symmetry=col*10+row;
run;