/* Each analysis from a particular section of the book should be viewed as a distinct analysis */
/* here. In particular, I have put in new proc import statements each time a data set is read */
/* into SAS. I have also not paid attention to details like titles being carried along from */
/* analysis to analysis. */
/* I have not tried to construct most of the plots and many of the more unusual analyses and */
/* tests from the book. */

/* Chapter 2 */

/* Section 2.5 */

/* Input data from comma-delimited file, reading variable names from the first row */
proc import out=tornado datafile="tornado.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;

/* Scatter plots of the target and predictors */
title "Figure 2.3";
proc gplot;
   plot deaths*killer_tornadoes;
   plot deaths*tornadoes;
run;
title;

/* Multiple regression, with a plot of residuals versus fitted values */
title "Figure 2.4";
proc reg data=tornado;
   model deaths=tornadoes killer_tornadoes;
   plot residual.*predicted.;
run;
title;

/* Simple regression */
proc reg data=tornado;
   model deaths=killer_tornadoes;
run;

/* Chapter 3 */

/* Section 3.3 */

proc import out=tornado datafile="tornado.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;

/* Side-by-side boxplots: Figure 3.2 */
/* In order to construct boxplots, the data must first be sorted */
title "Figure 3.2";
proc sort data=tornado out=tornado2;
   by year;
run;
proc sort data=tornado out=tornado3;
   by month_number;
run;
proc boxplot data=tornado2;
   plot deaths*year;
run;
proc boxplot data=tornado3;
   plot deaths*month_number;
run;

/* The ANCOVA model is fit using proc glm; partial F-tests are given under Type III SS. */
/* Standardized residuals, leverage values, and Cook's distances are output. The solution */
/* option prints out parameter estimates, but they are based on indicator variables, not */
/* effect codings. */
proc glm data=tornado;
   class year month;
   model deaths = killer_tornadoes tornadoes year month / solution;
   output out=result h=lev student=sres cookd=cook;
run;

/* This creates a variable that is the observation number, so index plots can be created */
data result;
   set result;
   obs=_N_;
run;
proc print data=result;
   var year month sres lev cook;
run;
proc gplot data=result;
   plot sres*obs;
   plot lev*obs;
   plot cook*obs;
run;

/* The following code manually creates effect coding variables. First indicator variables are */
/* formed, and then the effect codings are created. */
data tornado;
   set tornado;
   /* The following line defines 12 dummy (indicator) variables, giving each 3 bytes of */
   /* storage; the loop then initializes the variables to 0. */
   array dummys {*} 3 Month_1 - Month_12;
   do i=1 to 12;
      dummys(i) = 0;
   end;
   /* This sets the appropriate indicator variable to 1. */
   dummys(Month_number) = 1;
run;
data tornado;
   set tornado;
   array dummys {*} 3 Year_1 - Year_4;
   do i=1 to 4;
      dummys(i) = 0;
   end;
   dummys(Year - 1995) = 1;
run;

/* Effect codings are created by subtracting the reference category indicator from the others */
data tornado4;
   set tornado;
   January = Month_1 - Month_12;
   February = Month_2 - Month_12;
   March = Month_3 - Month_12;
   April = Month_4 - Month_12;
   May = Month_5 - Month_12;
   June = Month_6 - Month_12;
   July = Month_7 - Month_12;
   August = Month_8 - Month_12;
   September = Month_9 - Month_12;
   October = Month_10 - Month_12;
   November = Month_11 - Month_12;
   Year96 = Year_1 - Year_4;
   Year97 = Year_2 - Year_4;
   Year98 = Year_3 - Year_4;
run;
/* This fits the model using the effect codings. The test statements construct the partial */
/* F-tests. In order to get the missing coefficients (Year99 and December in this case), a */
/* different reference category must be used. */
proc reg data=tornado4;
   model deaths=tornadoes killer_tornadoes Year96 Year97 Year98 January February March
                April May June July August September October November;
   year: test Year96=0, Year97=0, Year98=0;
   month: test January=0, February=0, March=0, April=0, May=0, June=0, July=0,
               August=0, September=0, October=0, November=0;
run;

/* We can also get the effect coding fits by specifying contrasts to estimate and test. */
/* The estimate statement does this. Say there are k levels to the predictor; the i-th */
/* contrast is specified to have value (k-1)/k for the i-th category, and -1/k for all of */
/* the others. This method has the great advantage of not requiring an additional fit for */
/* the reference category. Note that we have to fudge things slightly for the month effect, */
/* since 11/12 and 1/12 don't have exact decimal representations. */
proc glm data=tornado;
   class year month_number;
   model deaths = killer_tornadoes tornadoes year month_number;
   estimate 'Year96' year .75 -.25 -.25 -.25;
   estimate 'Year97' year -.25 .75 -.25 -.25;
   estimate 'Year98' year -.25 -.25 .75 -.25;
   estimate 'Year99' year -.25 -.25 -.25 .75;
   estimate 'January' month_number .9167 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'February' month_number -.0833 .9167 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'March' month_number -.0833 -.0834 .9167 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'April' month_number -.0833 -.0834 -.0833 .9167 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'May' month_number -.0833 -.0834 -.0833 -.0833 .9167 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'June' month_number -.0833 -.0834 -.0833 -.0833 -.0834 .9167 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'July' month_number -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 .9167 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'August' month_number -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 .9167 -.0834 -.0833 -.0833 -.0834;
   estimate 'September' month_number -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 .9167 -.0833 -.0833 -.0834;
   estimate 'October' month_number -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 .9167 -.0833 -.0834;
   estimate 'November' month_number -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 .9167 -.0834;
   estimate 'December' month_number -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 .9167;
run;

/* Section 3.4 */

/* The selection = rsquare aic option orders the models by the R-squared value and gives the */
/* AIC. It does not give AICc, and cannot treat class variables (factors) as a group. */
proc reg data=tornado4;
   model deaths=tornadoes killer_tornadoes Year96 Year97 Year98 January February March
                April May June July August September October November / selection = rsquare aic;
run;
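/* AICc is not printed by proc reg, but it can be computed from the reported AIC. A minimal */
/* sketch, assuming the usual small-sample correction AICc = AIC + 2k(k+1)/(n-k-1) of */
/* Hurvich and Tsai (1989); the values of n, k, and aic below are placeholders to be */
/* filled in from the proc reg output. */
data _null_;
   n   = 60;    /* sample size (placeholder) */
   k   = 17;    /* number of estimated parameters (placeholder) */
   aic = 100;   /* AIC value reported by proc reg (placeholder) */
   aicc = aic + 2*k*(k+1)/(n-k-1);
   put aicc=;
run;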
/* The genmodsummary macro takes as input a set of models, and returns as output the AIC and */
/* AICc values (including the deviation of AIC and AICc from their minimal values). Note */
/* that it is written with a maximum of 20 models. See macro files. */
/* The macro uses the genmod procedure, which is the appropriate one for most categorical */
/* data analysis, but it works for least squares models also. The distribution=n setting */
/* says that the errors are normally distributed ("n"), and the link=identity setting says */
/* that it is a linear model that is being fit. */

/* Read in macro file */
%include 'genmodsummary.sas';

/* The models must be specified in numerical order (as model1, model2, etc.) */
%genmodsummary(data=tornado, class=month year, yvariable=deaths,
   model1=, model2=killer_tornadoes, model3=tornadoes, model4=year, model5=month,
   model6=killer_tornadoes tornadoes, model7=killer_tornadoes year,
   model8=killer_tornadoes month, model9=tornadoes year, model10=tornadoes month,
   model11=year month, model12=killer_tornadoes tornadoes year,
   model13=killer_tornadoes tornadoes month, model14=killer_tornadoes year month,
   model15=tornadoes year month, model16=killer_tornadoes tornadoes year month,
   distribution=n, link=identity, number_of_models=16);

/* Part (b) of Table 3.2. Note how the model with an interaction is specified. */
%genmodsummary(data=tornado, class=month year, yvariable=deaths,
   model1=killer_tornadoes, model2=killer_tornadoes|month,
   distribution=n, link=identity, number_of_models=2);

/* Part (c) of Table 3.2 */
%genmodsummary(data=tornado, class=month year tornado_season, yvariable=deaths,
   model1=killer_tornadoes, model2=killer_tornadoes tornado_season,
   model3=killer_tornadoes|tornado_season,
   distribution=n, link=identity, number_of_models=3);

/* Chapter 4 */

/* Please note that I have not programmed many of the more specialized methods from this */
/* chapter in SAS. The S-PLUS/R code provides a model for the sorts of things that can be */
/* done. The books listed on the SAS page of the book's web site also provide guidance */
/* and/or code for some of these analyses. */

/* Section 4.1 */

/* The standard binomial confidence interval is given in proc freq. */
/* The $ tells SAS that the variable is a character variable. */
data water;
   input response$ count;
   cards;
yes 958
no 1860
;
run;

/* Using count as a weight variable tells SAS that it is the count to be analyzed. */
/* The order option tells SAS that the first entry (yes) is the one to focus on. */
proc freq data=water order=data;
   table response / binomial;
   weight count;
run;

/* The standard binomial hypothesis test is given in proc freq. */
data needle;
   input infected$ count;
   cards;
yes 50
no 2450
;
run;
proc freq data=needle order=data;
   tables infected / binomial(p=.04) alpha=.05;
   weight count;
run;

/* This uses the SAS module UnifyPow by Ralph O'Brien; see http://www.bio.ri.ccf.org/power.html */
/* It uses the exact binomial distribution, rather than the approximate normal. */
%let UnifyPow = UnifyPow020817a.sas;
%include "&UnifyPow";
datalines4;
proportion .03
null .04
alpha .025
power .7
sides 1
;;;;
%tables
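/* Power for the exact binomial test can also be computed directly in more recent SAS */
/* releases using proc power. A minimal sketch, assuming SAS 9.1 or later (where the */
/* onesamplefreq statement supports test=exact); the ntotal value is a placeholder. */
proc power;
   onesamplefreq test=exact
      nullproportion = .04
      proportion     = .03
      sides          = 1
      alpha          = .025
      ntotal         = 2500
      power          = .;
run;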
/* proc freq also gives exact binomial tests and confidence intervals */
data lyme;
   input result$ sensitivityISA specificityISA sensitivitySER specificitySER reproducibility;
   cards;
yes 8 19 10 22 10
no 5 6 3 3 0
;
run;
proc freq data=lyme order=data;
   tables result / exact binomial alpha=.05;
   weight sensitivityISA;
run;
proc freq data=lyme order=data;
   tables result / exact binomial alpha=.05;
   weight specificityISA;
run;
proc freq data=lyme order=data;
   tables result / exact binomial alpha=.05;
   weight sensitivitySER;
run;
proc freq data=lyme order=data;
   tables result / exact binomial alpha=.05;
   weight specificitySER;
run;
proc freq data=lyme order=data;
   tables result / exact binomial alpha=.05;
   weight reproducibility;
run;

/* Section 4.3 */

/* A Poisson distribution can be fit to data using Poisson regression in proc genmod. */
/* No predictors are specified, and an identity link function is used. */
proc import out=devils datafile="devils.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;
proc genmod data=devils;
   model Goals = / dist=p link=identity;
run;
proc genmod data=devils;
   model Goals_against = / dist=p link=identity;
run;

/* Section 4.4 */

/* proc freq gives the Pearson chi-squared goodness-of-fit statistic, but the goodfit macro */
/* by Michael Friendly also gives the likelihood ratio statistic; see */
/* http://www.math.yorku.ca/SCS/vcd/index.html */
%include "goodfit.sas";

/* Testing goodness of fit -- Benford's Law */
proc import out=benford datafile="benford.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;

/* var is the variable being analyzed; dist=DISCRETE says that a specified discrete */
/* distribution with probability vector given by parm is being tested. */
/* The goodfit macro cannot handle variable names longer than 8 characters in the file, so a */
/* data step renames some variables and drops the others. */
data benford;
   set benford;
   large=large_return;
   small=small_return;
   largeth=large_return_thirty;
   smallth=small_return_thirty;
   keep Digit Return large small largeth smallth;
run;

/* The out file is included so that the index of dissimilarity can be calculated. I */
/* illustrate this for the overall return data. */
%goodfit(var=digit, freq=return, dist=DISCRETE,
   parm=0.30103 0.176091 0.124939 0.09691 0.079181 0.066947 0.057992 0.051153 0.045757,
   data=benford, out=fitreturn);

/* These statements produce running totals of the numerator (totaldev) and denominator */
/* (totalret) of the index of dissimilarity, which then produce the index after division. */
data dissim(keep=totalret totaldev);
   set fitreturn end=lastobs;
   totalret+return;
   totaldev+abs(Return-exp)/2;
   if lastobs;
run;
data dissim;
   set dissim;
   dissim=totaldev/totalret;
   keep dissim;
run;
proc print data=dissim;
run;

%goodfit(var=digit, freq=large, dist=DISCRETE,
   parm=0.30103 0.176091 0.124939 0.09691 0.079181 0.066947 0.057992 0.051153 0.045757,
   data=benford);
%goodfit(var=digit, freq=small, dist=DISCRETE,
   parm=0.30103 0.176091 0.124939 0.09691 0.079181 0.066947 0.057992 0.051153 0.045757,
   data=benford);
%goodfit(var=digit, freq=small, dist=DISCRETE,
   parm=.11111 .11111 .11111 .11111 .11111 .11111 .11111 .11111 .11111,
   data=benford);
%goodfit(var=digit, freq=largeth, dist=DISCRETE,
   parm=0.30103 0.176091 0.124939 0.09691 0.079181 0.066947 0.057992 0.051153 0.045757,
   data=benford);
/* Exact goodness-of-fit testing is available through the exact statement in proc freq, but */
/* not if there are cells with zero counts, as is the case here. SAS ignores the cells with */
/* zero counts, but if a small positive number (say 1.e-20) is used for the zero counts, */
/* the program calculates the correct statistic. Unfortunately, in that case the exact */
/* option will not work. */
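/* A minimal sketch of that workaround; the count variable used here (small) is a */
/* placeholder, and the testp values are the Benford probabilities. The chisq option then */
/* gives the correct Pearson statistic, although the exact option would still fail. */
data benford2;
   set benford;
   if small = 0 then small = 1e-20;   /* replace zero counts by a tiny positive number */
run;
proc freq data=benford2;
   weight small;
   tables digit / chisq testp=(0.30103 0.176091 0.124939 0.09691 0.079181
                               0.066947 0.057992 0.051153 0.045757);
run;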
/* Section 4.5 */

/* A negative binomial model can be fit treating the problem as a negative binomial */
/* regression without any predictors using proc genmod. The estimate of mu is then */
/* exp(INTERCEPT), and the dispersion parameter is the estimate of alpha. */
data soccer;
   input Goals Count;
   cards;
0 225
1 293
2 224
3 114
4 41
5 15
6 9
7 1
8 2
;
run;
proc genmod data=soccer;
   model Goals = / dist = nb;
   freq Count;
run;
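/* For example, the fitted mean can be recovered from the intercept in the genmod output; */
/* the intercept value below is a placeholder to be replaced by the printed estimate. */
data _null_;
   intercept = 1.38;          /* INTERCEPT estimate from the genmod output (placeholder) */
   muhat = exp(intercept);    /* estimated negative binomial mean mu */
   put muhat=;
run;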
/* We will use proc iml to get the appropriate goodness-of-fit tests for univariate */
/* binomial and beta-binomial data. */
proc import out=broadway datafile="broadway.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;
data broadway1;
   set broadway (keep=Tony_awards Tony_nominations);
   where Tony_nominations>0;
run;

/* The binomial model can be fit using proc logistic, fitting a model with no predictors. */
/* The scale=none option gives the goodness-of-fit statistics. */
proc logistic data=broadway1;
   model Tony_awards/Tony_nominations = /scale=none;
run;

/* This uses proc iml to calculate the test for overdispersion */
data broadway2;
   set broadway (keep=Tony_awards Tony_nominations);
   p=.2545455;
   awards=Tony_awards;
   nomin=Tony_nominations;
   drop Tony_awards Tony_nominations;
   if awards=. | nomin=. then delete;
   expect=nomin*p;
   dev=(awards-expect)**2/(p*(1-p));
   denom=nomin*(nomin-1);
run;
proc iml;
   use broadway2;
   read all var {dev nomin denom} into m;
   z=(sum(m[,1])-sum(m[,2]))/(2*sum(m[,3]))##.5;
   title "test of overdispersion";
   print z;

/* This code uses proc iml to get the MLEs of the beta-binomial distribution */
proc iml;
   use broadway;
   read all var {Tony_awards Tony_nominations} into m;
   b=1.5;
   start bbdloglike(a) global(m,b);
      y=-sum(lgamma(a+m[,1])+lgamma(b+m[,2]-m[,1])+lgamma(a+b)-lgamma(a+b+m[,2])-lgamma(a)-lgamma(b));
      return (y);
   finish bbdloglike;
   loglike=bbdloglike(1);
   x0=1;
   do b=1.5 to 2 by .01;
      call nlpnra(rc,xr,"bbdloglike", x0);
      temp=bbdloglike(xr);
      if (temp<loglike & rc>0) then do;
         loglike=temp;
         beta=b;
      end;
   end;
   b=beta;
   call nlpnra(rc, alpha, "bbdloglike",1);

   /* This calculates the Pearson goodness-of-fit test for the beta-binomial fit */
   title "beta_binomial fit";
   print alpha beta loglike;
   pi=alpha/(alpha+beta);
   theta=1/(alpha+beta+1);
   x2=sum(((m[,1]-m[,2]#pi)##2)/(m[,2]#pi#(1-pi)#(1+(m[,2]-1)#theta)));
   print x2 pi theta;

   /* Figure 4.6 */
   start beta(p) global(alpha,beta);
      y=(p##(alpha-1))#((1-p)##(beta-1))#gamma(alpha+beta)/(gamma(alpha)#gamma(beta));
      return (y);
   finish beta;
   p=do(.001,.999,.001);
   n=ncol(p);
   p=p`;
   pdf=J(n,1,0);
   do i=1 to n;
      pdf[i]=beta(p[i]);
   end;
   x=p||pdf;
   create temp from x;
   append from x;

data temp;
   set temp;
   rename col1=p col2=pdf;
run;
symbol v=none i=join;
title "Figure 4.6";
proc gplot data=temp;
   plot pdf*p;
run;

/* Section 4.6 */

proc import out=runshoes datafile="runshoes.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;

/* The macro zero_trun_p fits a univariate zero-truncated Poisson model. See macro files. */
%include 'zero_trun_p.sas';
%zero_trun_p(indata=runshoes,y=shoes);

/* Chapter 5 */

/* Section 5.2 */

/* Poisson regression is performed using proc genmod. The genmodsummary macro can be used */
/* here to make AIC and AICc comparisons of models. Note that the "p" distribution */
/* (Poisson) and log link are chosen. The other count variable distribution option is */
/* negative binomial (nb). */
%include 'genmodsummary.sas';
proc import out=tornado datafile="tornado.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;
%genmodsummary(data=tornado, class=month year, yvariable=deaths,
   model1=, model2=killer_tornadoes, model3=tornadoes, model4=year, model5=month,
   model6=killer_tornadoes tornadoes, model7=killer_tornadoes year,
   model8=killer_tornadoes month, model9=tornadoes year, model10=tornadoes month,
   model11=year month, model12=killer_tornadoes tornadoes year,
   model13=killer_tornadoes tornadoes month, model14=killer_tornadoes year month,
   model15=tornadoes year month, model16=killer_tornadoes tornadoes year month,
   distribution=p, link=log, number_of_models=16);
%genmodsummary(data=tornado, class=month year, yvariable=deaths,
   model1=killer_tornadoes month, model2=killer_tornadoes|month,
   distribution=p, link=log, number_of_models=2);

/* This fits the Poisson regression model. The effect codings are estimated and tested */
/* using the contrasts approach used earlier in the least squares fitting (the resultant */
/* Wald tests are the squares of the z-tests). */
proc genmod data=tornado;
   class month_number;
   model deaths = killer_tornadoes month_number / dist=p link=log;
   estimate 'January' month_number .9167 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'February' month_number -.0833 .9167 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'March' month_number -.0833 -.0834 .9167 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'April' month_number -.0833 -.0834 -.0833 .9167 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'May' month_number -.0833 -.0834 -.0833 -.0833 .9167 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'June' month_number -.0833 -.0834 -.0833 -.0833 -.0834 .9167 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'July' month_number -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 .9167 -.0833 -.0834 -.0833 -.0833 -.0834;
   estimate 'August' month_number -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 .9167 -.0834 -.0833 -.0833 -.0834;
   estimate 'September' month_number -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 .9167 -.0833 -.0833 -.0834;
   estimate 'October' month_number -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 .9167 -.0833 -.0834;
   estimate 'November' month_number -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 .9167 -.0834;
   estimate 'December' month_number -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 -.0833 -.0833 -.0834 .9167;
run;

/* Different slopes model. The scaled deviance residuals are output to a file (Pearson */
/* residuals are obtained using reschi, and standardized Pearson residuals are obtained */
/* using stdreschi). */
proc genmod data=tornado;
   class month_number;
   model deaths = killer_tornadoes|month_number / dist=p link=log;
   output out=tornpoisres stdresdev=stdres;
run;
data tornpoisres;
   set tornpoisres;
   obs=_N_;
run;
proc print data=tornpoisres;
   var year month stdres;
run;
proc gplot data=tornpoisres;
   plot stdres*obs;
run;

/* The inflglim macro by Michael Friendly calculates and plots various regression */
/* diagnostics; see http://www.math.yorku.ca/SCS/vcd/inflglim.html. Note that the gskip */
/* macro from the same source must also be loaded. */
%include 'inflglim.sas';
%include 'gskip.sas';

/* Variable names must be shortened */
data tornado2;
   set tornado;
   monthnum=month_number;
   killtorn=killer_tornadoes;
   keep monthnum killtorn deaths;
run;
%inflglim(data=tornado2, class=monthnum, resp=deaths, model=killtorn|monthnum,
   dist=poisson, out=torninf)
data torninf;
   set torninf;
   obs=_N_;
run;

/* The symbol interpol=join statement tells SAS to create a plot with the points connected */
proc gplot data=torninf;
   plot stresdev*obs hat*obs cookd*obs;
   symbol interpol=join;
run;

/* The UK train accident data shows how a model with an offset is fit. */
proc import out=uktrainacc datafile="uktrainacc.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;
data uktrainacc;
   set uktrainacc;
   logtrain=log(train_km);
   logtraingroup=log(train_km_grouped);
run;
proc genmod data=uktrainacc;
   model accidents=year/link=log dist=p offset=logtrain;
run;
proc genmod data=uktrainacc;
   model spad_preventable=year/link=log dist=p offset=logtrain;
run;
proc genmod data=uktrainacc;
   model other_preventable=year/link=log dist=p offset=logtrain;
run;
proc genmod data=uktrainacc;
   model non_preventable=year/link=log dist=p offset=logtrain;
run;
proc genmod data=uktrainacc;
   model spad_grouped=year_grouped/link=log dist=p offset=logtraingroup;
run;
proc genmod data=uktrainacc;
   model other_grouped=year_grouped/link=log dist=p offset=logtraingroup;
run;
proc genmod data=uktrainacc;
   model non_grouped=year_grouped/link=log dist=p offset=logtraingroup;
run;

/* Section 5.3 */

proc import out=wildcat datafile="wildcat.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;

/* Poisson regression fit */
proc genmod data=wildcat;
   model wildcat_strikes= Grievances Rotate Union log_workforce /link=log dist=p;
   output out=result pred=muhat;
run;

/* Two tests for overdispersion */
data result;
   set result;
   u=((wildcat_strikes-muhat)**2-wildcat_strikes)/muhat;
run;
proc reg data=result;
   model u = ;
   model u = muhat / noint;
run;

/* Quasi-likelihood is performed using the scale=pearson option */
proc genmod data=wildcat;
   model wildcat_strikes= Grievances Rotate Union log_workforce /link=log dist=p scale=pearson;
run;

/* Negative binomial analysis of Florida election data */
proc import out=election2000 datafile="election2000.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;
data election2000;
   set election2000;
   Bush_rate=Bush00/total00;
   Nader_rate=Nader00/total00;
   Perot_rate=Perot96/total96;
run;

/* This sets Buchanan's Palm Beach value to missing, so that predictions can be made easily */
data election;
   set election2000;
   if county='Palm Beach' then Buchanan00=.;
   logtotal=log(total00);
run;

/* Models based on the Bush percentage only */
proc genmod data=election;
   model Buchanan00=Bush_rate/link=log dist=p offset=logtotal;
   output out=rslt_elect_p pred=predict_B;
run;

/* Print out predicted value for Palm Beach */
data rslt_elect_p;
   set rslt_elect_p;
   where (county='Palm Beach');
   Buchanan00=3407;
run;
proc print data=rslt_elect_p;
   var county Buchanan00 predict_B;
run;

/* The expected option tells SAS to use the expected information matrix to calculate Wald */
/* statistics, while the nb distribution is negative binomial. */
proc genmod data=election;
   model Buchanan00=Bush_rate/link=log dist=nb offset=logtotal expected;
   output out=rslt_elect_nb pred=predict_B;
run;
data rslt_elect_nb;
   set rslt_elect_nb;
   where (county='Palm Beach');
   Buchanan00=3407;
run;
proc print data=rslt_elect_nb;
   var county Buchanan00 predict_B;
run;

/* Models based on the Bush, Nader, and Perot percentages */
proc genmod data=election;
   model Buchanan00=Bush_rate Nader_rate Perot_rate/link=log dist=p offset=logtotal;
   output out=rslt_elect_p2 pred=predict;
run;
data rslt_elect_p2;
   set rslt_elect_p2;
   where (county='Palm Beach');
   Buchanan00=3407;
run;
proc print data=rslt_elect_p2;
   var county Buchanan00 predict;
run;
proc genmod data=election;
   model Buchanan00=Bush_rate Nader_rate Perot_rate/link=log dist=nb offset=logtotal expected;
   output out=rslt_elect_nb2 pred=predict_BNP;
run;
data rslt_elect_nb2;
   set rslt_elect_nb2;
   where (county='Palm Beach');
   Buchanan00=3407;
run;
proc print data=rslt_elect_nb2;
   var county Buchanan00 predict_BNP;
run;

/* The genmodsummary macro also works for negative binomial models */
%include 'genmodsummary.sas';
%genmodsummary(data=election, yvariable=Buchanan00,
   model1=Bush_rate, model2=Bush_rate Nader_rate Perot_rate, model3=Nader_rate Perot_rate,
   distribution=nb, link=log, offset=logtotal, number_of_models=3);

/* Section 5.5 */

proc import out=homerun datafile="homerun.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;

/* Starting in version 8.2, proc gam constructs generalized additive models. The automatic */
/* smoothing parameter selector used is generalized cross validation (GCV). GCV tends to */
/* lead to less smoothing than does AICc. The "p" option in the output statement saves the */
/* estimated values into a variable P_xxx, where xxx is the target variable name. */
data mcgwire;
   set homerun;
   if (McGwire_out = 1) then delete;
   keep StL_season_day McGwire_HR;
run;
proc gam data=mcgwire;
   model McGwire_HR = loess(StL_season_day) / dist=poisson method=gcv;
   output out=estimate p;
run;
data estimate;
   set estimate;
   pred = exp(P_McGwire_HR);
run;
proc gplot data=estimate;
   plot pred*StL_season_day;
   symbol interpol=join;
run;
data sosa;
   set homerun;
   if (Sosa_out = 1) then delete;
   keep Chi_season_day Sosa_HR;
run;
proc gam data=sosa;
   model Sosa_HR = loess(Chi_season_day) / dist=poisson method=gcv;
   output out=estimate p;
run;
data estimate;
   set estimate;
   pred = exp(P_Sosa_HR);
run;
proc gplot data=estimate;
   plot pred*Chi_season_day;
   symbol interpol=join;
run;

/* Smoothing a multinomial vector */
data texas;
   input Number Count;
   cards;
0 18
1 17
2 16
3 18
4 20
5 15
6 15
7 11
8 9
9 11
;
run;
proc gam data=texas;
   model Count = loess(Number) / dist=poisson;
   output out=estimate p;
run;
data estimate;
   set estimate;
   obs = Count/150;
   pred = exp(P_Count)/150;
run;
proc gplot data=estimate;
   plot pred*Number obs*Number / overlay;
   symbol interpol=join;
run;

/* Generalized additive model */
proc import out=olympic datafile="olympic2000.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;
data olympic;
   set olympic;
   GDPpc = GDP/Population;
run;
proc gam data=olympic;
   model Total2000 = loess(Log_population) loess(GDPpc) loess(Total1996) / dist=poisson method=gcv;
   output out=estimate p;
run;

/* Plotting each of the smoothing components, which are identified by P_xxx, where xxx is */
/* the predictor */
proc sort data=estimate;
   by Log_population;
run;
proc gplot data=estimate;
   plot P_Log_population*Log_population;
   symbol interpol=join;
run;
proc sort data=estimate;
   by GDPpc;
run;
proc gplot data=estimate;
   plot P_GDPpc*GDPpc;
   symbol interpol=join;
run;
proc sort data=estimate;
   by Total1996;
run;
proc gplot data=estimate;
   plot P_Total1996*Total1996;
   symbol interpol=join;
run;

/* Chapter 6 */

/* Section 6.1 */

/* Use proc freq to get both the confidence interval for the difference in probabilities */
/* and chi-squared tests of independence. The riskdiff option gives the confidence interval. */
data leukemia;
   input PCR_status$ Followup_status$ count;
   cards;
Traces_of_cancer Relapse 30
Traces_of_cancer No_relapse 45
Cancer_free Relapse 8
Cancer_free No_relapse 95
;
run;
proc freq data=leukemia order=data;
   weight count;
   tables PCR_status*Followup_status/expected chisq norow nocol nopercent riskdiff;
run;

/* Section 6.2 */

/* Note that more than one observation can be put on the same data line with the @@ option */
data Major_marketing;
   input Time$ Marketing$ count @@;
   cards;
High_school Yes 16 High_school No 41 High_school Undecided 8
Freshman Yes 21 Freshman No 29 Freshman Undecided 8
Sophomore Yes 38 Sophomore No 63 Sophomore Undecided 12
Junior Yes 34 Junior No 30 Junior Undecided 8
Senior Yes 9 Senior No 12 Senior Undecided 10
;
run;

/* proc catmod is used to fit loglinear models. The terms on the left side of the model */
/* statement define the table, with the use of the special keyword _response_. The model is */
/* then specified in the loglin statement. Various details of fitting are suppressed here. */
/* This gives tests for the significance of each effect, and an overall likelihood ratio */
/* (deviance) goodness-of-fit test. */
proc catmod data=Major_marketing;
   weight count;
   model Time*Marketing=_response_ / noiter noparm noresponse noprofile;
   loglin Time Marketing;
run;

/* Tests of independence are better performed as above in proc freq */
proc freq data=Major_marketing order=data;
   weight count;
   tables Time*Marketing / expected chisq norow nopercent;
run;

/* Mosaics are constructed using Michael Friendly's macros and IML code; see */
/* http://www.math.yorku.ca/SCS/mosaics.html for instructions */
%include 'mosaic.sas';
%include 'mosaicm.sas';
%mosaicm;
goptions vsize=7in hsize=7in;  *-- square plot environment;
proc iml;
   start major_marketing;  *-- time marketing data;
      table = { /* 'High_school' 'Freshman' 'Sophomore' 'Junior' 'Senior' */
                16 41 8
                21 29 8
                38 63 12
                34 30 8
                9 12 10 };
      levels = { 3 5 };
      vnames = {'marketing' 'time' };                              /* Variable names */
      lnames = {                                                   /* Category names */
         'Yes' 'No' 'Undecided' ' ' ' ',                           /* marketing */
         'High_school' 'Freshman' 'Sophomore' 'Junior' 'Senior'};  /* time */
      title = 'Marketing-Time';
   finish;
   run major_marketing;
   reset storage=mosaic.mosaic;
   load module=_all_;
   *-- Fit models of joint independence (fittype='JOINT');
   plots = 2;
   htext=1.6;
   run mosaic(levels, table, vnames, lnames, plots, title);

/* Adjusted residuals are standardized Pearson residuals, which can be obtained from proc genmod */
data antibiotic;
   input Diagnosis$ Time$ freq;
   cards;
Bronchitis 1-3/96 113
Bronchitis 4-6/96 58
Bronchitis 7-9/96 40
Bronchitis 10-12/96 108
Bronchitis 1-2/97 100
Sinusitis 1-3/96 99
Sinusitis 4-6/96 37
Sinusitis 7-9/96 23
Sinusitis 10-12/96 50
Sinusitis 1-2/97 32
URI 1-3/96 410
URI 4-6/96 228
URI 7-9/96 125
URI 10-12/96 366
URI 1-2/97 304
Pneumonia 1-3/96 60
Pneumonia 4-6/96 43
Pneumonia 7-9/96 30
Pneumonia 10-12/96 56
Pneumonia 1-2/97 45
;
run;
proc genmod data=antibiotic;
   class Diagnosis Time;
   model freq = Diagnosis Time / dist=p obstats;
run;

/* Section 6.3 */

/* Fisher's exact test is given in proc freq, but the mid-p-value is not given. */
/* The same fisher option works for I x J tables. */
data thermal_imaging;
   input True_grp$ Classified_grp$ count;
   cards;
Guilty Guilty 6
Guilty Innocent 2
Innocent Guilty 1
Innocent Innocent 11
;
run;
proc freq data=thermal_imaging;
   weight count;
   tables True_grp*Classified_grp / fisher;
run;
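/* The mid-p-value can be obtained by hand from the exact point probability of the observed */
/* table: mid-p = exact p-value - (point probability)/2. A minimal sketch, assuming the */
/* point option of the exact statement (which requests exact point probabilities): */
proc freq data=thermal_imaging;
   weight count;
   tables True_grp*Classified_grp;
   exact fisher / point;
run;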
/* Section 6.4 */

/* Structural zeroes do not appear in the file of counts, so there is nothing special */
/* about fitting a model without them. */
proc import out=cancer datafile="cancerrate.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;
proc genmod data=cancer;
   class type gender;
   model ny_count=type gender/link=log dist=p obstats;
run;

/* Section 6.5 */

/* Note how a table can be input */
data artifacts;
   input Artifact$ @;
   Distance='vicinity'; input count @; output;
   Distance='<.25';     input count @; output;
   Distance='.25-.5';   input count @; output;
   Distance='.5-1';     input count @; output;
   cards;
Drills 2 10 4 2
Pots 3 8 4 6
Grinding_stones 13 5 3 9
Point_fragments 20 36 19 20
;
run;

/* Here's how to convert a table to case-by-case form using proc freq. */
/* The sparse option guarantees that zero counts are carried along. This means that if all */
/* of the zeroes in a table are sampling zeroes, sparse should be used, while if all of */
/* them are structural zeroes, it should not be used. If there are some of both, sparse */
/* is used, and then the rows for structural zeroes are deleted in a data step. */
proc freq data=artifacts order=data;
   weight count;
   tables Artifact*Distance/ sparse out=artifact_data noprint;
run;
proc genmod data=artifact_data;
   class Artifact Distance;
   model count=Artifact Distance /link=log dist=p obstats;
run;

/* Another way to set an observation as a structural zero is to set its count to be */
/* missing, rather than delete it. This has the advantage of making it possible to get */
/* deleted residuals. Note that the Grinding_stones category name has been truncated to */
/* the eight-character version Grinding. */
data artifact2_data;
   set artifact_data;
   if Artifact = 'Grinding' and Distance = 'vicinity' then count = .;
   if Artifact = 'Grinding' and Distance = '<.25' then count = .;
run;
proc genmod data=artifact2_data;
   class Artifact Distance;
   model count=Artifact Distance /link=log dist=p obstats;
   output out=out_art p = pred;
run;
data out_art;
   set out_art;
   if Artifact = 'Grinding' and Distance = 'vicinity' then delres = (13 - pred)/sqrt(pred);
   if Artifact = 'Grinding' and Distance = '<.25' then delres = (5 - pred)/sqrt(pred);
run;
proc print data=out_art;
   var delres;
run;
data artifact3_data;
   set artifact_data;
   if Artifact = 'Grinding' and Distance = 'vicinity' then count = .;
run;
proc genmod data=artifact3_data;
   class Artifact Distance;
   model count=Artifact Distance /link=log dist=p;
   output out=out_art p = pred;
run;
data out_art;
   set out_art;
   if Artifact = 'Grinding' and Distance = 'vicinity' then delres = (13 - pred)/sqrt(pred);
run;
proc print data=out_art;
   var delres;
run;

/* Chapter 7 */

/* Section 7.1 */

/* Uniform association model */
data fiscal;
   input Rating$ Nonprofits Count @@;
   cards;
BBB 0 1 BBB 1 0 BBB 2 1 BBB 3 2 BBB 4 0 BBB 5 2
A 0 1 A 1 1 A 2 3 A 3 5 A 4 2 A 5 0
AA 0 5 AA 1 12 AA 2 11 AA 3 4 AA 4 1 AA 5 0
AAA 0 0 AAA 1 4 AAA 2 2 AAA 3 0 AAA 4 0 AAA 5 0
;
run;
data fiscal;
   set fiscal;
   if Rating='BBB' then row=1;
   else if Rating='A' then row=2;
   else if Rating='AA' then row=3;
   else if Rating='AAA' then row=4;
   col=Nonprofits+1;
   assoc=row*col;
run;
proc genmod data=fiscal;
   class Rating Nonprofits;
   model Count=Rating Nonprofits / dist=p link=log;
run;
proc genmod data=fiscal;
   class Rating Nonprofits;
   model Count=Rating Nonprofits assoc / dist=p link=log residuals;
run;
data fiscal2;
   set fiscal;
   if Rating='BBB' and Nonprofits=0 then delete;
run;
proc genmod data=fiscal2;
   class Rating Nonprofits;
   model Count=Rating Nonprofits / dist=p link=log;
run;
proc genmod data=fiscal2;
   class Rating Nonprofits;
   model Count=Rating Nonprofits assoc / dist=p link=log;
run;

/* Column effects model */
data assess;
   input Response$ Type$ Count @@;
   cards;
Nothing Structured 0 Nothing Open 0 Nothing Article 1 Nothing Output 0
Little Structured 3 Little Open 1 Little Article 6 Little Output 4
Enough Structured 8 Enough Open 7 Enough Article 4 Enough Output 8
Lots Structured 3 Lots Open 6 Lots Article 2 Lots Output 2
;
run;
data assess;
   set assess;
   if Response='Nothing' then row=1;
   else if Response='Little' then row=2;
   else if Response='Enough' then row=3;
   else if Response='Lots' then row=4;
run;
proc genmod data=assess;
   class Response Type;
   model Count=Response Type / dist=p link=log;
run;
/* The type3 option gives partial tests for each effect, including the Type*row effect, */
/* which corresponds to the test of independence given the column effects model. */
proc genmod data=assess;
   class Response Type;
   model Count=Response Type Type*row / dist=p link=log type3;
run;

/* genmodsummary is used to calculate AICc values */
%include 'genmodsummary.sas';
%genmodsummary(data=assess, class=Response Type, yvariable=Count,
   model1=Response Type, model2=Response Type Type*row,
   distribution=p, link=log, number_of_models=2);

/* Row + column effects model */
data myopia;
   input Refraction$ Lighting$ Count @@;
   cards;
High_hyperopia Darkness 2 High_hyperopia Night_light 1 High_hyperopia Room_light 1
Hyperopia Darkness 21 Hyperopia Night_light 16 Hyperopia Room_light 15
Emmetropia Darkness 66 Emmetropia Night_light 50 Emmetropia Room_light 29
Myopia Darkness 9 Myopia Night_light 31 Myopia Room_light 48
High_myopia Darkness 1 High_myopia Night_light 3 High_myopia Room_light 7
;
run;
data myopia;
   set myopia;
   if Refraction='High_hyp' then row=1;
   else if Refraction='Hyperopi' then row=2;
   else if Refraction='Emmetrop' then row=3;
   else if Refraction='Myopia' then row=4;
   else if Refraction='High_myo' then row=5;
   if Lighting='Darkness' then col=1;
   else if Lighting='Night_li' then col=2;
   else if Lighting='Room_lig' then col=3;
   if Refraction='Myopia' then myopiaeff=1;
   else if Refraction='High_myo' then myopiaeff=1;
   else myopiaeff=0;
   if refraction='Myopia' then my_high=1;
   else if refraction='High_myo' then my_high=2;
   else my_high=0;
   l=Lighting;
   r=Refraction;
   m=myopiaeff;
   mh=my_high;
   assoc=row*col;
run;
%genmodsummary(data=myopia, class=l r m mh, yvariable=count,
   model1=l r, model2=l r assoc, model3=l r r*col, model4=l r l*row,
   model5=l r r*col l*row, model6=l r m*col, model7=l r mh*col,
   distribution=p, link=log, number_of_models=7);

/* Here is the row effects fit. Note that the type3 option will give a proper test for */
/* (I|R), but not (U|R), since the uniform association model is not a simple term drop */
/* from row effects (but it can be obtained from the G^2 values given by genmodsummary). */
/* The contrast estimates correspond to the row effects parameters. */
proc genmod data=myopia order=data;
   class Refraction Lighting;
   model Count=Refraction Lighting Refraction*col / link=log dist=p type3;
   estimate 'High_hyp' Refraction*col .8 -.2 -.2 -.2 -.2;
   estimate 'Hyperopia' Refraction*col -.2 .8 -.2 -.2 -.2;
   estimate 'Emmetropia' Refraction*col -.2 -.2 .8 -.2 -.2;
   estimate 'Myopia' Refraction*col -.2 -.2 -.2 .8 -.2;
   estimate 'High_myopia' Refraction*col -.2 -.2 -.2 -.2 .8;
run;
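/* The (U|R) test is the difference in G^2 between the uniform association and row effects */
/* fits. A minimal sketch of the computation, with placeholder deviance values to be read */
/* from the genmodsummary/genmod output; for this 5 x 3 table the row effects model has */
/* four free association parameters and uniform association has one, so there are 3 df. */
data _null_;
   g2_u = 10;   /* G^2 for the uniform association model (placeholder) */
   g2_r = 5;    /* G^2 for the row effects model (placeholder) */
   p = 1 - probchi(g2_u - g2_r, 3);
   put p=;
run;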
/* Fitting bivariate Poisson distributions */
proc import out=gviolence datafile="gviolence.csv" dbms=csv replace;
   getnames=yes;
   datarow=2;
run;
proc freq data=gviolence;
   tables Good_neutral_injuries*Bad_injuries / sparse out=gviolence_table nopercent nocol norow;
run;
data gviolence_table;
   set gviolence_table;
   col=Bad_injuries;
   row=Good_neutral_injuries;
   assoc=row*col;
   offs=-log(fact(col))-log(fact(row));
run;

/* Determine AICc values for independence and association models */
%include 'genmodsummary.sas';
%genmodsummary(data=gviolence_table, class=Good_neutral_injuries Bad_injuries, yvariable=count,
   model1=Good_neutral_injuries Bad_injuries,
   model2=Good_neutral_injuries Bad_injuries assoc,
   distribution=p, link=log, number_of_models=2);

/* Determine AICc values for bivariate Poisson models. Note that these couldn't be done */
/* with the other models, since they require the offset. Note also that the differences in */
/* AICc values are not right here, since we are not comparing all of the models at once; */
/* use the actual AICc values for comparisons. */
%genmodsummary(data=gviolence_table, yvariable=count,
   model1=row col, model2=row col row*col,
   distribution=p, link=log, offset=offs, number_of_models=2);

/* Omit the outlier. Association models are fit as usual. The code below shows how to fit */
/* the equal-mean independent and correlated Poisson models. */
data gviolence_table2;
   set gviolence_table;
   if (row=8) then delete;
   rowpcol=row+col;
run;

/* An alternative to the Wald test for equality of means of two independent Poissons is */
/* the likelihood ratio test obtained by the difference in deviance values for the model */
/* with unequal means and the model with equal means. */
%genmodsummary(data=gviolence_table2, yvariable=count,
   model1=row col, model2=rowpcol,
   distribution=p, link=log, offset=offs, number_of_models=2);

/* Unequal-mean and equal-mean correlated Poisson models */
%genmodsummary(data=gviolence_table2, yvariable=count,
   model1=row col row*col, model2=rowpcol row*col,
   distribution=p, link=log, offset=offs, number_of_models=2);

/* Section 7.2 */

data rating;
   input Rating97$ Rating98$ Count @@;
   cards;
AAA AAA 111 AAA AA 5 AAA A 0 AAA BBB 0 AAA Spec 0
AA AAA 6 AA AA 396 AA A 19 AA BBB 2 AA Spec 2
A AAA 0 A AA 18 A A 938 A BBB 41 A Spec 8
BBB AAA 0 BBB AA 2 BBB A 27 BBB BBB 671 BBB Spec 29
Spec AAA 0 Spec AA 0 Spec A 4 Spec BBB 48 Spec Spec 930
;
run;

/* The macro factors creates the appropriate codings for many different symmetry models. */
/* It is given in the paper "Generating Factor Variables for Asymmetry, Non-independence */
/* and Skew-symmetry Models in Square Contingency Tables Using SAS" by H.B. Lawal and */
/* R.A. Sundheim, Journal of Statistical Software, 7(8) [2002]; see */
/* http://www.jstatsoft.org/index.php?vol=7# */
%include 'factors.sas';

/* This creates the factors for a 5 x 5 table */
%Factors(5, FactorDSN=factors);

/* This keeps the Q (diagonal quasi-independence) and S (symmetry) factors */
data rating;
   merge rating factors;
   keep Rating97 Rating98 Q S Count;
run;
%include 'genmodsummary.sas';
%genmodsummary(data=rating, class=Rating97 Rating98 Q S, yvariable=count,
   model1=Rating97 Rating98, model2=Rating97 Rating98 Q, model3=Q S,
   distribution=p, link=log, number_of_models=3);

/* Quasi-symmetry model */
data germanmob;
   input job1848$ initial$ count;
   cards;
Justice Justice 117
Justice Administration 11
Justice Education 4
Justice Military 0
Justice Church 0
Justice Farmer 0
Justice Lawyer 8
Justice Self-employed 0
Administration Justice 83
Administration Administration 37
Administration Education 3
Administration Military 5
Administration Church 1
Administration Farmer 4
Administration Lawyer 2
Administration Self-employed 7
Education Justice 19
Education Administration 6
Education Education 67
Education Military 1
Education Church 11
Education Farmer 0
Education Lawyer 1
Education Self-employed 20
Military Justice 0
Military Administration 0
Military Education 1
Military Military 16
Military Church 0
Military Farmer 0
Military Lawyer 0
Military Self-employed 0
Church Justice 1
Church Administration 0
Church Education 4
Church Military 0
Church Church 26
Church Farmer 0
Church Lawyer 0
Church Self-employed 4
Farmer Justice 17
Farmer Administration 7
Farmer Education 3
Farmer Military 8
Farmer Church 0
Farmer Farmer 19
Farmer Lawyer 0
Farmer Self-employed 1
Lawyer Justice 76
Lawyer Administration 7
Lawyer Education 5
Lawyer Military 1
Lawyer Church 1
Lawyer Farmer 1
Lawyer Lawyer 22
Lawyer Self-employed 0
Self-employed Justice 6
Self-employed Administration 6
Self-employed Education 13
Self-employed Military 0
Self-employed Church 0
Self-employed Farmer 0
Self-employed Lawyer 1
Self-employed Self-employed 37
;
run;
%include 'factors.sas';
%include 'genmodsummary.sas';
%Factors(8, FactorDSN=factors);
data germanmob;
   merge germanmob factors;
   keep initial job1848 Q S Count;
run;
%genmodsummary(data=germanmob, class=initial job1848 Q S, yvariable=count,
   model1=initial job1848, model2=initial job1848 Q, model3=Q S, model4=initial job1848 S,
   distribution=p, link=log, number_of_models=4);

/* Scaled differences between quasi-symmetry and quasi-independence */
proc genmod data=germanmob order=data;
   class initial job1848 Q;
   model Count= initial job1848 Q / link=log dist=p;
   output out=german_qi p=predqi;
run;
proc genmod data=germanmob order=data;
   class initial job1848 S;
   model Count= initial job1848 S / link=log dist=p;
   output out=german_qs p=predqs;
run;
data scalediff;
   merge german_qi german_qs;
   diff=(predqs-predqi)/sqrt(predqi);
   keep initial job1848 diff;
run;
proc print data=scalediff;
run;

/* Ordinal square table */
data britishmob;
   input son father count @@;
   cards;
1 1 50 1 2 16 1 3 12 1 4 11 1 5 14 1 6 0 1 7 0
2 1 19 2 2 40 2 3 35 2 4 20 2 5 36 2 6 6 2 7 3
3 1 26 3 2 34 3 3 65 3 4 58 3 5 114 3 6 19 3 7 14
4 1 8 4 2 18 4 3 66 4 4 110 4 5 185 4 6 40 4 7 32
5 1 18 5 2 31 5 3 123 5 4 223 5 5 714 5 6 179 5 7 141
6 1 6 6 2 8 6 3 23 6 4 64 6 5 258 6 6 143 6 7 91
7 1 2 7 2 3 7 3 21 7 4 32 7 5 189 7 6 71 7 7 106
;
run;
%include 'factors.sas';
%include 'genmodsummary.sas';
%Factors(7, FactorDSN=factors);
data britishmob;
   merge britishmob factors;
   rownum=father;
   colnum=son;
   keep father son rownum colnum Q S Count;
run;
%genmodsummary(data=britishmob, class=father son Q S, yvariable=count,
   model1=father son, model2=father son Q, model3=Q S, model4=father son S,
   model5=Q S colnum, model6=father son rownum*colnum, model7=father son Q rownum*colnum,
   distribution=p, link=log, number_of_models=7);

/* McNemar's test is in proc freq */
data diabetic;
   input pre$ post$ count;
   cards;
Not_satisfactory Not_satisfactory 1
Not_satisfactory Satisfactory 12
Satisfactory Not_satisfactory 0
Satisfactory Satisfactory 4
;
run;
proc freq data=diabetic order=data;
   weight count;
   tables pre*post / agree nopercent norow nocol;
run;
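/* McNemar's statistic depends only on the discordant cells: M = (b - c)^2/(b + c), where */
/* b and c are the two off-diagonal counts. A minimal check for the diabetic table (b=12, */
/* c=0), using the usual chi-squared reference distribution with 1 df: */
data _null_;
   b = 12;
   c = 0;
   m = (b-c)**2/(b+c);
   p = 1 - probchi(m, 1);
   put m= p=;
run;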
/* Kappa coefficient is in proc freq */
data thermometer;
   input rectal$ pacifier$ count;
   cards;
No_fever No_fever 49
No_fever Fever 1
Fever No_fever 14
Fever Fever 36
;
run;
proc freq data=thermometer order=data;
   weight count;
   tables rectal*pacifier / agree nopercent norow nocol;
run;

/* Weighted kappa coefficient is in proc freq */
data socktest;
   input therapist1$ therapist2$ count @@;
   cards;
0 0 5 0 1 0 0 2 0 0 3 0
1 0 3 1 1 5 1 2 0 1 3 0
2 0 0 2 1 2 2 2 3 2 3 0
3 0 0 3 1 0 3 2 0 3 3 3
;
run;

/* To get quadratic weights, use the (wt=fc) option in the agree setting. */
proc freq data=socktest order=data;
   weight count;
   tables therapist1*therapist2 / agree(wt=fc) nopercent norow nocol;
run;
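/* The Fleiss-Cohen weights requested by (wt=fc) are quadratic: with the default integer */
/* category scores, w(i,j) = 1 - (i-j)^2/(R-1)^2 for an R-level table. A minimal */
/* illustration for one pair of levels of the 4-level socktest table: */
data _null_;
   R = 4;          /* number of rating categories */
   i = 1; j = 3;   /* an example pair of category scores */
   w = 1 - (i-j)**2/(R-1)**2;
   put w=;
run;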
/* The default weighted kappa uses absolute weights. */
proc freq data=socktest order=data;
   weight count;
   tables therapist1*therapist2 / agree nopercent norow nocol;
run;

/* Exact analysis of McNemar's test */
data nutrition;
   input WHO$ Ehrenberg$ count;
   cards;
Malnourished Malnourished 20
Malnourished Normal 0
Normal Malnourished 4
Normal Normal 237
;
run;
proc freq data=nutrition order=data;
   weight count;
   tables WHO*Ehrenberg / nopercent norow nocol;
   exact mcnem;
run;

/* Chapter 8 */

/* Section 8.1 */

/* In an I x J x K table, put the variable with K levels as the first variable in the */
/* tables statement in proc freq to get separate analyses conditioning on that variable. */
/* The chisq option gives separate chi-squared tests for each subtable. */
data freezer;
   input Ownership$ Social_group$ Freezer$ count @@;
   cards;
Owner I-II Yes 304 Owner I-II No 38 Owner III Yes 666 Owner III No 85
Owner IV Yes 894 Owner IV No 93 Owner V Yes 720 Owner V No 84
Renter I-II Yes 92 Renter I-II No 64 Renter III Yes 174 Renter III No 113
Renter IV Yes 379 Renter IV No 321 Renter V Yes 433 Renter V No 297
;
run;
proc freq data=freezer order=data;
   weight count;
   tables Ownership*Social_group*Freezer / nocol norow nopercent chisq;
run;

/* Tests of conditional association are given in proc freq using the cmh option; it gives */
/* the CMH and BD tests, and the Mantel-Haenszel estimate of the common odds ratio, which */
/* is given under "Odds Ratio". */
data whickham;
   input Age$ Smoking$ Mortality$ count @@;
   cards;
18-24 Smoker Alive 53 18-24 Smoker Dead 2
18-24 Nonsmoker Alive 61 18-24 Nonsmoker Dead 1
25-34 Smoker Alive 121 25-34 Smoker Dead 3
25-34 Nonsmoker Alive 152 25-34 Nonsmoker Dead 5
35-44 Smoker Alive 95 35-44 Smoker Dead 14
35-44 Nonsmoker Alive 114 35-44 Nonsmoker Dead 7
45-54 Smoker Alive 103 45-54 Smoker Dead 27
45-54 Nonsmoker Alive 66 45-54 Nonsmoker Dead 12
55-64 Smoker Alive 64 55-64 Smoker Dead 51
55-64 Nonsmoker Alive 81 55-64 Nonsmoker Dead 40
65-74 Smoker Alive 7 65-74 Smoker Dead 29
65-74 Nonsmoker Alive 28 65-74 Nonsmoker Dead 101
75+ Smoker Alive 0 75+ Smoker Dead 13
75+ Nonsmoker Alive 0 75+ Nonsmoker Dead 64
;
run;
proc freq data=whickham order=data;
   weight count;
   tables Age*Smoking*Mortality / nocol norow nopercent cmh;
run;

/* Section 8.2 */

/* Loglinear models can be fit using either proc genmod or proc catmod. I will use */
/* genmodsummary (and hence proc genmod) to get AICc values, but I will first show how to */
/* fit a loglinear model using proc catmod. */
data whickham;
   input Age$ Smoking$ Mortality$ count @@;
   cards;
18-24 Smoker Alive 53 18-24 Smoker Dead 2
18-24 Nonsmoker Alive 61 18-24 Nonsmoker Dead 1
25-34 Smoker Alive 121 25-34 Smoker Dead 3
25-34 Nonsmoker Alive 152 25-34 Nonsmoker Dead 5
35-44 Smoker Alive 95 35-44 Smoker Dead 14
35-44 Nonsmoker Alive 114 35-44 Nonsmoker Dead 7
45-54 Smoker Alive 103 45-54 Smoker Dead 27
45-54 Nonsmoker Alive 66 45-54 Nonsmoker Dead 12
55-64 Smoker Alive 64 55-64 Smoker Dead 51
55-64 Nonsmoker Alive 81 55-64 Nonsmoker Dead 40
65-74 Smoker Alive 7 65-74 Smoker Dead 29
65-74 Nonsmoker Alive 28 65-74 Nonsmoker Dead 101
75+ Smoker Alive 0 75+ Smoker Dead 13
75+ Nonsmoker Alive 0 75+ Nonsmoker Dead 64
;
run;

/* This fits the (S, MA) model. Note the use of | to specify an interaction in the */
/* sufficient marginal. */
proc catmod data=whickham order=data;
   weight count;
   model Age*Smoking*Mortality = _response_ / noiter noparm noresponse noprofile;
   loglin Smoking Age|Mortality;
run;

/* All of the models can be fit and compared using the genmodsummary macro */
%include 'genmodsummary.sas';
%genmodsummary(data=whickham, class=Age Smoking Mortality, yvariable=count,
   model1=Age*Smoking Age*Mortality Mortality*Smoking,
   model2=Age*Mortality Mortality*Smoking,
   model3=Age*Smoking Mortality*Smoking,
   model4=Age*Smoking Age*Mortality,
   model5=Smoking Age*Mortality,
   model6=Mortality Age*Smoking,
   distribution=p, link=log, number_of_models=6);

/* Here is the proc genmod fit of the homogeneous association model, with the fitted */
/* values printed out. Note that the type3 option gives the likelihood ratio tests for */
/* AS, MA, and MS given the model. */
proc genmod data=whickham order=data;
   class Age Smoking Mortality;
   model count=Age*Smoking Age*Mortality Smoking*Mortality / dist=p link=log pred type3;
run;

/* Military death data */
data mildeath;
   input Gender$ Manner$ Service$ count;
   cards;
Male Accident Army 158
Male Illness Army 56
Male Homicide Army 11
Male Self-inflicted Army 51
Male Unknown Army 24
Female Accident Army 15
Female Illness Army 5
Female Homicide Army 5
Female Self-inflicted Army 3
Female Unknown Army 1
Male Accident Navy 96
Male Illness Navy 30
Male Homicide Navy 7
Male Self-inflicted Navy 32
Male Unknown Navy 9
Female Accident Navy 9
Female Illness Navy 4
Female Homicide Navy 2
Female Self-inflicted Navy 3
Female Unknown Navy 2
Male Accident Air 54
Male Illness Air 21
Male Homicide Air 2
Male Self-inflicted Air 13
Male Unknown Air 13
Female Accident Air 7
Female Illness Air 4
Female Homicide Air 0
Female Self-inflicted Air 1
Female Unknown Air 2
Male Accident Marine 69
Male Illness Marine 6
Male Homicide Marine 6
Male Self-inflicted Marine 7
Male Unknown Marine 27
Female Accident Marine 3
Female Illness Marine 0
Female Homicide Marine 1
Female Self-inflicted Marine 0
Female Unknown Marine 2
;
run;
%include 'genmodsummary.sas';
%genmodsummary(data=mildeath, class=Gender Manner Service, yvariable=count,
   model1=Service*Manner Service*Gender Manner*Gender,
   model2=Service*Manner Service*Gender,
   model3=Service*Manner Manner*Gender,
   model4=Service*Gender Manner*Gender,
   model5=Manner Service*Gender,
   model6=Gender Service*Manner,
   model7=Service Manner*Gender,
   distribution=p, link=log, number_of_models=7);
proc genmod data=mildeath order=data;
   class Gender Manner Service;
   model count=Gender Service*Manner / dist=p link=log;
   estimate 'Male' Gender .5 -.5;
   estimate 'Female' Gender -.5 .5;
run;

/* Figure 8.3 - three-dimensional mosaic plot */
%include 'mosaic.sas';
%include 'mosaicm.sas';
%mosaicm;
goptions vsize=8in hsize=8in;  *-- square plot environment;
proc iml;
   start service;  *-- service data;
      table = { 158 56 11 51 24
                15 5 5 3 1
                96 30 7 32 9
                9 4 2 3 2
                54 21 2 13 13
                7 4 0 1 2
                69 6 6 7 27
                3 0 1 0 2 };
      levels = {5 8};
      vnames = { 'Manner' 'Gender_Service'};   /* Variable names */
      lnames = {                               /* Category names */
         'Accident' 'Illness' 'Homicide' 'Self-inflicted' 'Unknown' '' '' '',
         'Army_Male' 'Army_Female' 'Navy_Male' 'Navy_Female' 'Air_Male' 'Air_Female'
         'Marine_Male' 'Marine_Female' };
      title = 'Service';
   finish;
   run service;
   reset storage=mosaic.mosaic;
   load module=_all_;
   *-- Fit models of joint independence (fittype='JOINT');
   plots = 2;
   htext=0.6;
   run mosaic(levels, table, vnames, lnames, plots, title);
quit;
%include 'mosaic.sas';
%include 'mosaicm.sas';
%mosaicm;
goptions vsize=8in hsize=8in;  *-- square plot environment;
proc iml;
   start service;  *-- service data;
      table = { 157.31406 55.46912 14.549277 49.103811 22.733246
                15.68594 5.5308804 1.4507227 4.8961892 2.2667543
                95.479632 30.917214 8.1839685 31.826544 10.002628
                9.5203679 3.0827858 0.8160315 3.173456 0.9973719
                55.46912 22.733246 1.8186597 12.730618 13.639947
                5.5308804 2.2667543 0.1813403 1.2693824 1.3600526
                65.471748 5.455979 6.3653088 6.3653088 26.370565
                6.5282523 0.544021 0.6346912 0.6346912 2.629435 };
      levels = {5 8};
      vnames = { 'Manner' 'Gender_Service'};   /* Variable names */
      lnames = {                               /* Category names */
         'Accident' 'Illness' 'Homicide' 'Self-inflicted' 'Unknown' '' '' '',
         'Army_Male' 'Army_Female' 'Navy_Male' 'Navy_Female' 'Air_Male' 'Air_Female'
         'Marine_Male' 'Marine_Female' };
      title = 'Fitted Counts';
   finish;
   run service;
   reset storage=mosaic.mosaic;
   load module=_all_;
   *-- Fit models of joint independence (fittype='JOINT');
   plots = 2;
   htext=0.6;
   run mosaic(levels, table, vnames, lnames, plots, title);
quit;

/* Now fitting models on death rates */
data miltotal;
   input total @@;
   cards;
407398 407398 407398 407398 407398 72028 72028 72028 72028 72028
321424 321424 321424 321424 321424 51622 51622 51622 51622 51622
294117 294117 294117 294117 294117 66473 66473 66473 66473 66473
162477 162477 162477 162477 162477 10164 10164 10164 10164 10164
;
run;
data mildeath;
   merge mildeath miltotal;
   logtotal=log(total);
run;
%genmodsummary(data=mildeath, class=Gender Manner Service, yvariable=count,
   model1=Service*Manner Service*Gender Manner*Gender,
   model2=Service*Manner Service*Gender,
   model3=Service*Manner Manner*Gender,
   model4=Service*Gender Manner*Gender,
   model5=Manner Service*Gender,
   model6=Gender Service*Manner,
   model7=Service Manner*Gender,
   distribution=p, link=log, offset=logtotal, number_of_models=7);
proc genmod data=mildeath order=data;
   class Gender Manner Service;
   model count=Gender Service*Manner / dist=p link=log offset=logtotal;
   estimate 'Male' Gender .5 -.5;
   estimate 'Female' Gender -.5 .5;
run;

/* Section 8.3 */

data bstudents;
   input When_major_chosen$ Employment$ Salary$ count;
   cards;
High_school Very_unimportant Very_unimportant 2
High_school Unimportant Very_unimportant 0
High_school Neutral Very_unimportant 0
High_school Important Very_unimportant 0
High_school Very_important Very_unimportant 1
Freshman Very_unimportant Very_unimportant 0
Freshman Unimportant Very_unimportant 0
Freshman Neutral Very_unimportant 0
Freshman Important Very_unimportant 0
Freshman Very_important Very_unimportant 0
Sophomore Very_unimportant Very_unimportant 2
Sophomore Unimportant Very_unimportant 0
Sophomore Neutral Very_unimportant 0
Sophomore Important Very_unimportant 0
Sophomore Very_important Very_unimportant 0
Junior Very_unimportant Very_unimportant 0
Junior Unimportant Very_unimportant 0
Junior Neutral Very_unimportant 0
Junior Important Very_unimportant 0
Junior Very_important Very_unimportant 0
Senior Very_unimportant Very_unimportant 2
Senior Unimportant Very_unimportant 0
Senior Neutral Very_unimportant 0
Senior Important Very_unimportant 0
Senior Very_important Very_unimportant 0
High_school Very_unimportant Unimportant 0
High_school Unimportant Unimportant 0
High_school Neutral Unimportant 1
High_school Important Unimportant 0
High_school Very_important Unimportant 0
Freshman Very_unimportant Unimportant 0
Freshman Unimportant Unimportant 0
Freshman Neutral Unimportant 0
Freshman Important Unimportant 1
Freshman Very_important Unimportant 0
Sophomore Very_unimportant Unimportant 0
Sophomore Unimportant Unimportant 1
Sophomore Neutral Unimportant 1
Sophomore Important Unimportant 0
Sophomore Very_important Unimportant 0
Junior Very_unimportant Unimportant 0
Junior Unimportant Unimportant 1
Junior Neutral Unimportant 0
Junior Important Unimportant 1
Junior Very_important Unimportant 1
Senior Very_unimportant Unimportant 0
Senior Unimportant Unimportant 1
Senior Neutral Unimportant 0
Senior Important Unimportant 0
Senior Very_important Unimportant 0
High_school Very_unimportant Neutral 0
High_school Unimportant Neutral 3
High_school Neutral Neutral 3
High_school Important Neutral 1
High_school Very_important Neutral 2
Freshman Very_unimportant Neutral 0
Freshman Unimportant Neutral 0
Freshman Neutral Neutral 2
Freshman Important Neutral 4
Freshman Very_important Neutral 2
Sophomore Very_unimportant Neutral 0
Sophomore Unimportant Neutral 0
Sophomore Neutral Neutral 4
Sophomore Important Neutral 7
Sophomore Very_important Neutral 5
Junior Very_unimportant Neutral 0
Junior Unimportant Neutral 0
Junior Neutral Neutral 1
Junior Important Neutral 4
Junior Very_important Neutral 2
Senior Very_unimportant Neutral 1
Senior Unimportant Neutral 0
Senior Neutral Neutral 1
Senior Important Neutral 0
Senior Very_important Neutral 0
High_school Very_unimportant Important 0
High_school Unimportant Important 0
High_school Neutral Important 0
High_school Important Important 8
High_school Very_important Important 10
Freshman Very_unimportant Important 0
Freshman Unimportant Important 0
Freshman Neutral Important 1
Freshman Important Important 6
Freshman Very_important Important 15
Sophomore Very_unimportant Important 0
Sophomore Unimportant Important 1
Sophomore Neutral Important 3
Sophomore Important Important 19
Sophomore Very_important Important 22
Junior Very_unimportant Important 0
Junior Unimportant Important 1
Junior Neutral Important 1
Junior Important Important 15
Junior Very_important Important 14
Senior Very_unimportant Important 0
Senior Unimportant Important 0
Senior Neutral Important 2
Senior Important Important 8
Senior Very_important Important 6
High_school Very_unimportant Very_important 1
High_school Unimportant Very_important 1
High_school Neutral Very_important 0
High_school Important Very_important 8
High_school Very_important Very_important 25
Freshman Very_unimportant Very_important 0
Freshman Unimportant Very_important 0
Freshman Neutral Very_important 1
Freshman Important Very_important 6
Freshman Very_important Very_important 21
Sophomore Very_unimportant Very_important 0
Sophomore Unimportant Very_important 1
Sophomore Neutral Very_important 1
Sophomore Important Very_important 10
Sophomore Very_important Very_important 38
Junior Very_unimportant Very_important 0
Junior Unimportant Very_important 0
Junior Neutral Very_important 3
Junior Important Very_important 3
Junior Very_important Very_important 24
Senior Very_unimportant Very_important 1
Senior Unimportant Very_important 0
Senior Neutral Very_important 2
Senior Important Very_important 2
Senior Very_important Very_important 4
;
run;
data bstudents;
   set bstudents;
   m=When_major_chosen;
   e=Employment;
   s=Salary;
run;
%genmodsummary(data=bstudents, class=m e s, yvariable=count,
   model1=s*e s*m e*m, model2=s*e s*m, model3=s*e e*m, model4=s*m e*m,
   model5=s e*m, model6=s*e m, model7=s e m,
   distribution=p, link=log, number_of_models=7);

/* The row and column scores are coded explicitly here */
data bstudents;
   set bstudents;
   if s='Very_uni' then col=1;
   else if s='Unimport' then col=2;
   else if s='Neutral' then col=3;
   else if s='Importan' then col=4;
   else if s='Very_imp' then col=5;
   if e='Very_uni' then row=1;
   else if e='Unimport' then row=2;
   else if e='Neutral' then row=3;
   else if e='Importan' then row=4;
   else if e='Very_imp' then row=5;
   /* This codes the diagonal and off-diagonal cells together in a class */
   /* variable that corresponds to both quasi-independence and symmetry */
   if row=col then symmetry=row;
   else if row>col then symmetry=row*10+col;
   else if row<col then symmetry=col*10+row;
run;