# # This makes factors fit using effect codings. NOTE: some libraries change this setting # when they are loaded! # # NOTE: Be careful about created variables masking ones connected with attached data frames. # I have not made this explicit in this code. # .First <- function(){options(contrasts = c(factor = "contr.sum", ordered = "contr.poly"))} # # Chapter 2 # # Section 2.5 # # Data frames attached here are first read in from the provided data files # attach(tornado) # # Scatter plots # par(mfrow=c(2,1)) plot(Tornadoes, Deaths, xlab="Number of tornadoes", ylab="Number of tornado-related deaths", main="Number of deaths versus number of tornadoes") plot(Killer.tornadoes, Deaths, xlab="Number of killer tornadoes", ylab="Number of tornado-related deaths", main="Number of deaths versus number of killer tornadoes") # # lm() function is used to fit least squares regression # tornlm1 <- lm(Deaths ~ Killer.tornadoes + Tornadoes) summary(tornlm1) # # Figure 2.4 # predtorn1 <- predict(tornlm1) restorn1 <- resid(tornlm1) plot(predtorn1, restorn1, xlab="Fitted value", ylab="Residual") # # Median-based regressygon # lines(apply(matrix(predtorn1[order(predtorn1)], ncol=12), 2, median), apply(matrix(restorn1[order(predtorn1)], ncol=12), 2, median), lty=2) # # Chapter 3 # # Section 3.3 # # Side-by side boxplots # par(mfrow=c(2,1)) # # Note: omit style.bxp setting in R # boxplot(split(Deaths,Year), style.bxp="old", xlab="Year", ylab="Number of tornado-related deaths", main="Number of deaths by year") boxplot(split(Deaths,Month.number), style.bxp="old", xlab="Month", ylab="Number of tornado-related deaths", main="Number of deaths by month", names=c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec")) tornlm2 <- lm(Deaths ~ Killer.tornadoes + Tornadoes + factor(Year) + Month) # # To get the effect coding coefficient and t-statistic for the missing category, # change one of the category names so that the missing category is no last # alphabetically # summary(tornlm2) # # The Anova() function comes from the car library; see http://www.socsci.mcmaster.ca/jfox/Books/companion/ # library(car) Anova(tornlm2, type=c("III")) # # Direct calculation of regression diagnostics # tornhat2 <- lm.influence(tornlm2)$hat tornsres2 <- stdres(tornlm2) torncook2 <- tornsres2^2*tornhat2/(tornlm2$rank*(1 - tornhat2)) # # Figure 3.3 # par(mfrow=c(3,1)) plot(c(1:48), tornsres2, type="l", xlab="Observation", ylab="Standardized residual", main="Index plot of standardized residuals") abline(h=2.5, lty=2) abline(h=-2.5, lty=2) plot(c(1:48), tornhat2, type="l", xlab="Observation", ylab="Leverage value", main="Index plot of leverage values") plot(c(1:48), torncook2, type="l", xlab="Observation", ylab="Cook's distance", main="Index plot of Cook's distances") # # Section 3.4 # # The extractAIC() function comes from the MASS package; see http://www.stats.ox.ac.uk/pub/MASS4/ # library(MASS) # # AIC values; values in the book subtract the AIC value of the best model # extractAIC(lm(Deaths ~ 1)) extractAIC(lm(Deaths ~ Killer.tornadoes)) extractAIC(lm(Deaths ~ Tornadoes)) extractAIC(lm(Deaths ~ factor(Year))) extractAIC(lm(Deaths ~ Month)) extractAIC(lm(Deaths ~ Killer.tornadoes + Tornadoes)) extractAIC(lm(Deaths ~ Killer.tornadoes + factor(Year))) extractAIC(lm(Deaths ~ Killer.tornadoes + Month)) extractAIC(lm(Deaths ~ Tornadoes + factor(Year))) extractAIC(lm(Deaths ~ Tornadoes + Month)) extractAIC(lm(Deaths ~ factor(Year) + Month)) extractAIC(lm(Deaths ~ Killer.tornadoes + Tornadoes + factor(Year))) extractAIC(lm(Deaths ~ Killer.tornadoes + Tornadoes + Month)) extractAIC(lm(Deaths ~ Killer.tornadoes + factor(Year) + Month)) extractAIC(lm(Deaths ~ Tornadoes + factor(Year) + Month)) extractAIC(lm(Deaths ~ Killer.tornadoes + Tornadoes + factor(Year) + Month)) # # AICc values # n <- length(Deaths) extractAIC(lm(Deaths ~ 1)) + 2*2*3/(n-3) extractAIC(lm(Deaths ~ Killer.tornadoes)) + 2*3*4/(n-4) extractAIC(lm(Deaths ~ Tornadoes)) + 2*3*4/(n-4) extractAIC(lm(Deaths ~ factor(Year))) + 2*5*6/(n-6) extractAIC(lm(Deaths ~ Month)) + 2*13*14/(n-14) extractAIC(lm(Deaths ~ Killer.tornadoes + Tornadoes)) + 2*4*5/(n-5) extractAIC(lm(Deaths ~ Killer.tornadoes + factor(Year))) + 2*6*7/(n-7) extractAIC(lm(Deaths ~ Killer.tornadoes + Month)) + 2*14*15/(n-15) extractAIC(lm(Deaths ~ Tornadoes + factor(Year))) + 2*6*7/(n-7) extractAIC(lm(Deaths ~ Tornadoes + Month)) + 2*14*15/(n-15) extractAIC(lm(Deaths ~ factor(Year) + Month)) + 2*16*17/(n-17) extractAIC(lm(Deaths ~ Killer.tornadoes + Tornadoes + factor(Year))) + 2*7*8/(n-8) extractAIC(lm(Deaths ~ Killer.tornadoes + Tornadoes + Month)) + 2*15*16/(n-16) extractAIC(lm(Deaths ~ Killer.tornadoes + factor(Year) + Month)) + 2*17*18/(n-18) extractAIC(lm(Deaths ~ Tornadoes + factor(Year) + Month)) + 2*17*18/(n-18) extractAIC(lm(Deaths ~ Killer.tornadoes + Tornadoes + factor(Year) + Month)) + 2*18*19/(n-19) extractAIC(lm(Deaths ~ Killer.tornadoes)) extractAIC(lm(Deaths ~ Killer.tornadoes + Month + Killer.tornadoes*Month)) extractAIC(lm(Deaths ~ Killer.tornadoes)) + 2*3*4/(n-4) extractAIC(lm(Deaths ~ Killer.tornadoes + Month + Killer.tornadoes*Month)) + 2*25*26/(n-26) extractAIC(lm(Deaths ~ Killer.tornadoes)) extractAIC(lm(Deaths ~ Killer.tornadoes + factor(Tornado.season))) extractAIC(lm(Deaths ~ Killer.tornadoes + factor(Tornado.season) + Killer.tornadoes*factor(Tornado.season))) extractAIC(lm(Deaths ~ Killer.tornadoes)) + 2*3*4/(n-4) extractAIC(lm(Deaths ~ Killer.tornadoes + factor(Tornado.season))) + 2*8*9/(n-9) extractAIC(lm(Deaths ~ Killer.tornadoes + factor(Tornado.season) + Killer.tornadoes*factor(Tornado.season))) + 2*13*14/(n-14) # # Chapter 4 # # Section 4.1 # # The binconf() function comes from the Hmisc library; see http://hesweb1.med.virginia.edu/biostat/s/Hmisc.html # library(Hmisc) binconf(958, 2818, method="asymptotic") binconf(958, 2818, method="wilson") # # Calculations for hypothesis test and p-value for needle echange programs # z <- (.02 - .04)/sqrt(.04*.96/2500) (1 - pnorm(abs(z)))*2 # # S-PLUS function to determine binomial sample size # binomial.sample.size(p=.04,p.alt=.03, power=.7,alternative="less",alpha=.025,correct=F) # # This function is not in R, so here is a direct calculation # ((1.96*sqrt(.04*.96) + .5244*sqrt(.03*.97))/.01)^2 # # The binom.test() function gives exact binomial hypothesis tests, but does not give # mid-p-values. It uses the Wilson-Sterne rule for p-values of two-tailed tests # binom.test(8, 10, .4, alternative="greater") binom.test(8, 10, .4) # # The binconf() function also gives exact confidence intervals # binconf(8, 13, method="all") binconf(19, 25, method="all") binconf(10, 13, method="all") binconf(22, 25, method="all") binconf(10, 10, method="all") # # Section 4.2 # # bonfmult() is a function written to construct Bonferroni-based simultaneous # multinomial confidence intervals; see file functions.s # draws <- c(206,200,206,223,176,215,202,223,213,204) bonfmult(draws) # # Section 4.3 # # Asymptotic confidence interval for Poisson random variable # attach(devils) c(mean(Goals) - 1.96*sqrt(mean(Goals)/length(Goals)), mean(Goals) + 1.96*sqrt(mean(Goals)/length(Goals))) c(mean(Goals.against) - 1.96*sqrt(mean(Goals.against)/length(Goals.against)), mean(Goals.against) + 1.96*sqrt(mean(Goals.against)/length(Goals.against))) # # scoreintpois() is a function written to construct the score interval for a Poisson # random variable; see file functions.s # scoreintpois(Goals) scoreintpois(Goals.against) # # exactpoisci() is a function written to construct the exact interval for a Poisson # random variable; see file functions.s # hurr45 <- c(1,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0) c(mean(hurr45) - 2.576*sqrt(mean(hurr45)/length(hurr45)),mean(hurr45) + 2.576*sqrt(mean(hurr45)/length(hurr45))) scoreintpois(hurr45, .01) exactpoisci(sum(hurr45), length(hurr45), .01) # # Section 4.4 # # powdiv() is a function written to construct the Cressie-Read power divergence goodness-of-fit # statistics; see file functions.s # attach(benford) # # This gives the expected counts under Benford's distribution # expreturn <- sum(Return)*log10(c(2:10)/c(1:9)) # # Pearson, likelihood ratio, and Cress-Read goodness-of-fit statistics # powdiv(Return, expreturn, 0, 1) powdiv(Return, expreturn, 0, 0) powdiv(Return, expreturn, 0, 2/3) # # Index of dissimilarity # sum(abs(Return - expreturn))/(2*sum(Return)) expreturn <- sum(Large.return)*log10(c(2:10)/c(1:9)) powdiv(Large.return, expreturn, 0, 1) powdiv(Large.return, expreturn, 0, 0) powdiv(Large.return, expreturn, 0, 2/3) sum(abs(Large.return - expreturn))/(2*sum(Large.return)) expreturn <- rep(sum(Small.return)/9, 9) powdiv(Small.return, expreturn, 0, 1) powdiv(Small.return, expreturn, 0, 0) powdiv(Small.return, expreturn, 0, 2/3) sum(abs(Small.return - expreturn))/(2*sum(Small.return)) expreturn <- sum(Large.return.thirty)*log10(c(2:10)/c(1:9)) powdiv(Large.return.thirty, expreturn, 0, 1) powdiv(Large.return.thirty, expreturn, 0, 0) powdiv(Large.return.thirty, expreturn, 0, 2/3) sum(abs(Large.return.thirty - expreturn))/(2*sum(Large.return.thirty)) # # partitionX2() is a function written to construct partitions of Pearson's X^2 # statistic; see file functions.s # texas.lottery <- c(18,17,16,18,20,15,15,11,9,11) partitionX2(texas.lottery, rep(15,10), 0) powdiv(texas.lottery, rep(15,10), 0, 0) powdiv(texas.lottery, rep(15,10), 0, 2/3) sum(abs(texas.lottery - rep(15,10)))/300 # # exactX2() is a function to tabulate the exact distribution of chi-squared # goodness-of-fit statistics; see file functions.s # # South Carolina hurricane strikes # powdiv(c(0,0,0,3,1),.8,0,1) powdiv(c(0,0,0,3,1),.8,0,0) distribchisq <- exactX2(5,4) distribchisq # # Section 4.5 # attach(dmft) # # The table() function returns a table of the observed values given # Begin <- table(DMFT.Begin) # # Set up Poisson probability distribution, recognizing that top cell is open-ended # prbegpois <- dpois(c(0:8), mean(DMFT.Begin)) prbegpois[9] <- 1 - ppois(7, mean(DMFT.Begin)) n <- sum(Begin) n*prbegpois # # Calculating log-likelihood function based on multinomial representation, in order # to cacluate AIC and AICc # loglikebegpois <- sum(Begin*log(prbegpois)) aicbegpois <- -2*loglikebegpois + 2 aiccbegpois <- aicbegpois + 2*2/(n - 2) c(aicbegpois, aiccbegpois) powdiv(Begin,n*prbegpois, 1, 1) powdiv(Begin,n*prbegpois, 1, 0) sum(abs(Begin - n*prbegpois))/(2*n) # # zipmle() is a function to calculate MLEs for univariate ZIP model; see file functions.s # beginmle <- zipmle(DMFT.Begin) # # Get probability function for ZIP model # prbegzip <- beginmle$p*dpois(c(0:8), beginmle$mu) prbegzip[9] <- beginmle$p*(1 - ppois(7, beginmle$mu)) prbegzip[1] <- 1-beginmle$p + beginmle$p*exp(-beginmle$mu) n*prbegzip loglikebegzip <- sum(Begin*log(prbegzip)) aicbegzip <- -2*loglikebegzip + 4 aiccbegzip <- aicbegzip + 2*2*3/(n - 3) c(aicbegzip, aiccbegzip) powdiv(Begin,n*prbegzip, 2, 1) powdiv(Begin,n*prbegzip, 2, 0) sum(abs(Begin - n*prbegzip))/(2*n) # soccer <- c(rep(0,225), rep(1,293), rep(2,224), rep(3,114), rep(4,41), rep(5,15), rep(6,9), 7, 8, 8) n <- sum(table(soccer)) soccmean <- mean(soccer) soccer.poisfit <- dpois(c(0:6), soccmean) soccer.poisfit <- n*c(soccer.poisfit, 1-sum(soccer.poisfit)) socc.table <- table(c(soccer[1:922], 7, 7)) powdiv(socc.table,soccer.poisfit, 1, 1) partitionX2(socc.table,soccer.poisfit, 1) powdiv(socc.table,soccer.poisfit, 1, 0) # # Test for negative binomial versus Poisson # z <- (var(soccer)/mean(soccer) - 1)*sqrt((n - 1)/2) pz <- (1 - pnorm(z))*2 c(z,pz) # # The glm.nb() function fits a negative binomial regression and is from the MASS package. Here # a regression on only the constant corresponds to univariate fitting of a negative binomial # random variable. # library(MASS) soccnb <- glm.nb(soccer ~ 1, maxit=100) # # dnegbinom() gives the probability function for a negative binomial random variable; # see file functions.s # soccer.nbfit <- dnegbinom(c(0:6), exp(coef(soccnb)[1]), soccnb$theta) soccer.nbfit <- n*c(soccer.nbfit, 1 - sum(soccer.nbfit)) powdiv(socc.table, soccer.nbfit, 2, 1) partitionX2(socc.table, soccer.nbfit, 2) powdiv(socc.table, soccer.nbfit, 2, 0) # product.use <- c(rep(0,85), rep(1,35), rep(2,78), rep(3,93),rep(4,68), rep(5,25),rep(6,18),rep(7,106),rep(8,7),rep(9,4),rep(10,4),rep(11,2), rep(12,2),rep(14,6)) prod.table <- c(85,35,78,93,68,25,18,106,7,4,4,2,2,0,6) n <- sum(prod.table) library(MASS) prodnb <- glm.nb(product.use ~ 1, maxit=100) prod.nbfit <- dnegbinom(c(0:13),exp(coef(prodnb)[1]),prodnb$theta) prod.nbfit <- n*c(prod.nbfit,1-sum(prod.nbfit)) gsq <- powdiv(prod.table,prod.nbfit,2,0) # # EM algorithm fit of zero-, seven-, and fourteen-inflated negative binomial. # Negative binomial is starting setting. # oldlr <- gsq$stat prod.fill <- prod.table ptilde <- prod.fill/n phat <- prod.nbfit/n qhat <- rep(0,15) q <- ((ptilde[1] + ptilde[8] + ptilde[15]) - (phat[1] + phat[8] + phat[15]))/ (1-(phat[1] + phat[8] + phat[15])) qhat[1] <- ptilde[1] - (1 - q)*phat[1] qhat[8] <- ptilde[8] - (1 - q)*phat[8] qhat[15] <- ptilde[15] - (1 - q)*phat[15] lrchange <- 10 # # EM iterations # while (lrchange > .1) { # # E-step # prod.fill[1] <- round(prod.table[1] - n*qhat[1]) prod.fill[8] <- round(prod.table[8] - n*qhat[8]) prod.fill[15] <- round(prod.table[15] - n*qhat[15]) nfill <- sum(prod.fill) product.use.fill <- c(rep(0,prod.fill[1]), rep(1,35), rep(2,78), rep(3,93), rep(4,68), rep(5,25), rep(6,18), rep(7,prod.fill[8]), rep(8,7), rep(9,4), rep(10,4), rep(11,2), rep(12,2), rep(14,prod.fill[15])) # # M-step # prodnbfill <- glm.nb(product.use.fill ~ 1, maxit=100) prod.nbfit.fill <- dnegbinom(c(0:13), exp(coef(prodnbfill)[1]), prodnbfill$theta) prod.nbfit.fill <- nfill*c(prod.nbfit.fill, 1 - sum(prod.nbfit.fill)) phat <- prod.nbfit.fill/nfill q <- ((ptilde[1] + ptilde[8] + ptilde[15]) - (phat[1] + phat[8] + phat[15]))/ (1 - (phat[1] + phat[8] + phat[15])) qhat[1] <- ptilde[1] - (1 - q)*phat[1] qhat[8] <- ptilde[8] - (1 - q)*phat[8] qhat[15] <- ptilde[15] - (1 - q)*phat[15] prod.mixnbfit <- n*qhat + n*(1 - q)*phat newlr <- powdiv(prod.table, prod.mixnbfit, 5, 0) lrchange <- abs(oldlr - newlr$stat) oldlr <- newlr$stat} powdiv(prod.table, prod.mixnbfit, 5, 1) # attach(broadway) broadway1 <- broadway[Tony.nominations > 0,] attach(broadway1) stem(Tony.awards/Tony.nominations*100) tonyp <- sum(Tony.awards)/sum(Tony.nominations) # # binomgof() is a function to give chi-squared statistics for a sample from the # binomial distribution; see file functions.s # binomgof(Tony.awards,Tony.nominations) # # Test for beta-binomial versus binomial # z <- (sum((Tony.awards - Tony.nominations*tonyp)^2)/(tonyp*(1 - tonyp))-sum(Tony.nominations))/ sqrt(2*sum(Tony.nominations*(Tony.nominations - 1))) # # bbdmle1() is an S-PLUS function to calculate the MLEs for a sample from the beta-binomial distribution; # see file functions.s # tony1 <- bbdmle1(Tony.awards, Tony.nominations, c(1,1)) c(tony1$alpha, tony1$beta) # # bbdmle2() is a function to calculate the MLEs for a sample from the beta-binomial distribution; # see file functions.s. The MASS library must be loaded before it is called in S-PLUS. # tony2 <- bbdmle2(Tony.awards, Tony.nominations, c(1,1)) c(tony2$alpha, tony2$beta) # # bbdgof() is a function to give Pearson statistic for a sample from the # beta-binomial distribution; see file functions.s # bbdgof(Tony.awards,Tony.nominations,.48,1.68) # # Section 4.6 # attach(runshoes) # # truncpoismle() is a function to calculate the MLE of a Poisson random variable # truncated from below at zero; see file functions.s # n <- length(Shoes) shoemu <- truncpoismle(Shoes)$mu obsshoe <- c(0,table(Shoes)[1:4],sum(table(Shoes)[5:7])) obstrpois <- obsshoe[2:6] exptrpois <- c(dpois(c(1:4), shoemu), 1 - ppois(4,shoemu))*n/(1 - exp(-shoemu)) powdiv(obstrpois,exptrpois, 2, 1) powdiv(obstrpois,exptrpois, 2, 0) # # Section 4.7 # polio <- c(0,1,0,0,1,3,0,2,3,5,3,5,2,2,0,1,0,1,3,3,2,1,1,5,0,3,1,0,1,4,0,0,1,6,14,1,1,0,0,1,1,1,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,2) # # tableform() is a function that converts a vector of inteeger values into a table of counts; # see file functions.s # poliotab <- tableform(polio, low=0, high=15) nonparest <- poliotab$table/sum(poliotab$table) mean(polio) mean(polio[polio < 10]) # # poishellest() is the calling function to give the Hellinger criterion to be minimized; # see file functions.s # optimize(poishellest, lower=1, upper=10, low=0, high=15, minfunc=hellinger, nonparest=nonparest) # # Chapter 5 # # Section 5.2 # attach(tornado) # # The glm() function with family=poisson fits a Poisson regression # # The extractAIC() function works on glm objects as well # library(MASS) n <- sum(Deaths) extractAIC(glm(Deaths ~ 1, family=poisson)) + 2*1*2/(n-2) extractAIC(glm(Deaths ~ Killer.tornadoes, family=poisson)) + 2*2*3/(n-3) extractAIC(glm(Deaths ~ Tornadoes, family=poisson)) + 2*2*3/(n-3) extractAIC(glm(Deaths ~ factor(Year), family=poisson)) + 2*4*5/(n-5) extractAIC(glm(Deaths ~ Month, family=poisson)) + 2*12*13/(n-13) extractAIC(glm(Deaths ~ Killer.tornadoes + Tornadoes, family=poisson)) + 2*3*4/(n-4) extractAIC(glm(Deaths ~ Killer.tornadoes + factor(Year), family=poisson)) + 2*5*6/(n-6) extractAIC(glm(Deaths ~ Killer.tornadoes + Month, family=poisson)) + 2*13*14/(n-14) extractAIC(glm(Deaths ~ Tornadoes + factor(Year), family=poisson)) + 2*5*6/(n-6) extractAIC(glm(Deaths ~ Tornadoes + Month, family=poisson)) + 2*13*14/(n-14) extractAIC(glm(Deaths ~ factor(Year) + Month, family=poisson)) + 2*15*16/(n-16) extractAIC(glm(Deaths ~ Killer.tornadoes + Tornadoes + factor(Year), family=poisson)) + 2*6*7/(n-7) extractAIC(glm(Deaths ~ Killer.tornadoes + Tornadoes + Month, family=poisson)) + 2*14*15/(n-15) extractAIC(glm(Deaths ~ Killer.tornadoes + factor(Year) + Month, family=poisson)) + 2*16*17/(n-17) extractAIC(glm(Deaths ~ Tornadoes + factor(Year) + Month, family=poisson)) + 2*16*17/(n-17) extractAIC(glm(Deaths ~ Killer.tornadoes + Tornadoes + factor(Year) + Month, family=poisson)) + 2*17*18/(n-18) extractAIC(glm(Deaths ~ Killer.tornadoes + Month + Killer.tornadoes*Month, family=poisson)) extractAIC(glm(Deaths ~ Killer.tornadoes + Month + Killer.tornadoes*Month, family=poisson)) + 2*24*25/(n-25) extractAIC(glm(Deaths ~ Killer.tornadoes + Month + Killer.tornadoes*Month + Tornadoes, family=poisson)) extractAIC(glm(Deaths ~ Killer.tornadoes + Month + Killer.tornadoes*Month + Tornadoes, family=poisson)) + 2*25*26/(n-26) extractAIC(glm(Deaths ~ Killer.tornadoes + Month + Tornadoes*Month + Tornadoes, family=poisson)) extractAIC(glm(Deaths ~ Killer.tornadoes + Month + Tornadoes*Month + Tornadoes, family=poisson)) + 2*25*26/(n-26) extractAIC(glm(Deaths ~ Killer.tornadoes + Month + Tornadoes*Month + Tornadoes + Killer.tornadoes*Month, family=poisson)) extractAIC(glm(Deaths ~ Killer.tornadoes + Month + Tornadoes*Month + Tornadoes + Killer.tornadoes*Month, family=poisson)) + 2*36*37/(n-37) extractAIC(glm(Deaths ~ Killer.tornadoes + factor(Tornado.season), family=poisson)) extractAIC(glm(Deaths ~ Killer.tornadoes + Tornadoes + factor(Tornado.season), family=poisson)) extractAIC(glm(Deaths ~ Killer.tornadoes + factor(Tornado.season), family=poisson)) + 2*7*8/(n-8) extractAIC(glm(Deaths ~ Killer.tornadoes + Tornadoes + factor(Tornado.season), family=poisson)) + 2*8*9/(n-9) # # Note: the "x=T" option is used so that diagnostics can be obtained using the glmdiag() function # tornglm1 <- glm(Deaths ~ Killer.tornadoes + Month, family=poisson, x=T) summary(tornglm1) # # glmdiag() is a function that calculates regression diagnostics for a glm object; # see file functions.s # tornglm1.diag <- glmdiag(tornglm1) # # Figure 5.2 # January <- exp(.418 + .254*c(0:90)/10) February <- exp(1.936 + .254*c(0:40)/10) March <- exp(1.185 + .254*c(0:90)/10) April <- exp(.836 + .254*c(0:130)/10) May <- exp(1.55 + .254*c(0:100)/10) June <- exp(.186 + .254*c(0:30)/10) July <- exp(-.192 + .254*c(0:40)/10) August <- exp(-.829 + .254*c(0:10)/10) September <- exp(-.502 + .254*c(0:20)/10) October <- exp(-.847 + .254*c(0:20)/10) November <- exp(-.847 + .254*c(0:20)/10) December <- exp(-.423 + .254*c(0:10)/10) plot(c(0:90)/10, January, xlab="Killer tornadoes", ylab="Estimated deaths", xlim=c(0,13), ylim=c(0,65), type="l") lines(c(0:40)/10, February) lines(c(0:90)/10, March) lines(c(0:130)/10, April) lines(c(0:100)/10, May) lines(c(0:30)/10, June, lty=2) lines(c(0:40)/10, July, lty=2) lines(c(0:10)/10, August, lty=2) lines(c(0:20)/10, September, lty=2) lines(c(0:20)/10, October, lty=2) lines(c(0:20)/10, November, lty=2) lines(c(0:10)/10, December, lty=2) text(9, 12, "January", adj=0, cex=.8) text(4, 21, "February", adj=1, cex=.8) text(9, 35, "March", adj=0, cex=.8) text(12.5, 60, "April", adj=1, cex=.8) text(9.5, 60, "May", adj=1, cex=.8) # # The Pearson statistic can be obtained as the sum of squared Pearson residuals # sum(resid(tornglm1,type="pearson")^2) tornglm2 <- glm(Deaths ~ Killer.tornadoes + Month + Killer.tornadoes*Month, family=poisson, x=T) tornglm2.diag <- glmdiag(tornglm2) # # Omitting the outliers # tornadoa <- tornado[-26,] attach(tornadoa) n <- sum(Deaths) extractAIC(glm(Deaths ~ 1, family=poisson)) + 2*1*2/(n-2) extractAIC(glm(Deaths ~ Killer.tornadoes, family=poisson)) + 2*2*3/(n-3) extractAIC(glm(Deaths ~ Tornadoes, family=poisson)) + 2*2*3/(n-3) extractAIC(glm(Deaths ~ factor(Year), family=poisson)) + 2*4*5/(n-5) extractAIC(glm(Deaths ~ Month, family=poisson)) + 2*12*13/(n-13) extractAIC(glm(Deaths ~ Killer.tornadoes + Tornadoes, family=poisson)) + 2*3*4/(n-4) extractAIC(glm(Deaths ~ Killer.tornadoes + factor(Year), family=poisson)) + 2*5*6/(n-6) extractAIC(glm(Deaths ~ Killer.tornadoes + Month, family=poisson)) + 2*13*14/(n-14) extractAIC(glm(Deaths ~ Tornadoes + factor(Year), family=poisson)) + 2*5*6/(n-6) extractAIC(glm(Deaths ~ Tornadoes + Month, family=poisson)) + 2*13*14/(n-14) extractAIC(glm(Deaths ~ factor(Year) + Month, family=poisson)) + 2*15*16/(n-16) extractAIC(glm(Deaths ~ Killer.tornadoes + Tornadoes + factor(Year), family=poisson)) + 2*6*7/(n-7) extractAIC(glm(Deaths ~ Killer.tornadoes + Tornadoes + Month, family=poisson)) + 2*14*15/(n-15) extractAIC(glm(Deaths ~ Killer.tornadoes + factor(Year) + Month, family=poisson)) + 2*16*17/(n-17) extractAIC(glm(Deaths ~ Tornadoes + factor(Year) + Month, family=poisson)) + 2*16*17/(n-17) extractAIC(glm(Deaths ~ Killer.tornadoes + Tornadoes + factor(Year) + Month, family=poisson)) + 2*17*18/(n-18) extractAIC(glm(Deaths ~ Killer.tornadoes + Month + Killer.tornadoes*Month, family=poisson)) + 2*24*25/(n-25) extractAIC(glm(Deaths ~ Killer.tornadoes + Month + Killer.tornadoes*Month + Tornadoes, family=poisson)) + 2*25*26/(n-26) extractAIC(glm(Deaths ~ Killer.tornadoes + Month + Tornadoes*Month + Tornadoes, family=poisson)) + 2*25*26/(n-26) extractAIC(glm(Deaths ~ Killer.tornadoes + Tornado.season, family=poisson)) + 2*7*8/(n-8) extractAIC(glm(Deaths ~ Killer.tornadoes + Tornadoes + Tornado.season, family=poisson)) + 2*8*9/(n-9) tornglm3 <- glm(Deaths ~ Killer.tornadoes + Month, family=poisson, x=T) tornglm3.diag <- glmdiag(tornglm3) tornadob <- tornado[-c(26,37),] attach(tornadob) n <- sum(Deaths) extractAIC(glm(Deaths ~ 1, family=poisson)) + 2*1*2/(n-2) extractAIC(glm(Deaths ~ Killer.tornadoes, family=poisson)) + 2*2*3/(n-3) extractAIC(glm(Deaths ~ Tornadoes, family=poisson)) + 2*2*3/(n-3) extractAIC(glm(Deaths ~ factor(Year), family=poisson)) + 2*4*5/(n-5) extractAIC(glm(Deaths ~ Month, family=poisson)) + 2*12*13/(n-13) extractAIC(glm(Deaths ~ Killer.tornadoes + Tornadoes, family=poisson)) + 2*3*4/(n-4) extractAIC(glm(Deaths ~ Killer.tornadoes + factor(Year), family=poisson)) + 2*5*6/(n-6) extractAIC(glm(Deaths ~ Killer.tornadoes + Month, family=poisson)) + 2*13*14/(n-14) extractAIC(glm(Deaths ~ Tornadoes + factor(Year), family=poisson)) + 2*5*6/(n-6) extractAIC(glm(Deaths ~ Tornadoes + Month, family=poisson)) + 2*13*14/(n-14) extractAIC(glm(Deaths ~ factor(Year) + Month, family=poisson)) + 2*15*16/(n-16) extractAIC(glm(Deaths ~ Killer.tornadoes + Tornadoes + factor(Year), family=poisson)) + 2*6*7/(n-7) extractAIC(glm(Deaths ~ Killer.tornadoes + Tornadoes + Month, family=poisson)) + 2*14*15/(n-15) extractAIC(glm(Deaths ~ Killer.tornadoes + factor(Year) + Month, family=poisson)) + 2*16*17/(n-17) extractAIC(glm(Deaths ~ Tornadoes + factor(Year) + Month, family=poisson)) + 2*16*17/(n-17) extractAIC(glm(Deaths ~ Killer.tornadoes + Tornadoes + factor(Year) + Month, family=poisson)) + 2*17*18/(n-18) extractAIC(glm(Deaths ~ Killer.tornadoes + Month + Killer.tornadoes*Month, family=poisson)) + 2*24*25/(n-25) extractAIC(glm(Deaths ~ Killer.tornadoes + Month + Killer.tornadoes*Month + Tornadoes, family=poisson)) + 2*25*26/(n-26) extractAIC(glm(Deaths ~ Killer.tornadoes + Month + Tornadoes*Month + Tornadoes, family=poisson)) + 2*25*26/(n-26) extractAIC(glm(Deaths ~ Killer.tornadoes + Tornado.season, family=poisson)) + 2*7*8/(n-8) extractAIC(glm(Deaths ~ Killer.tornadoes + Tornadoes + Tornado.season, family=poisson)) + 2*8*9/(n-9) tornglm4 <- glm(Deaths ~ Killer.tornadoes + Month, family=poisson, x=T) tornglm4.diag <- glmdiag(tornglm4) tornadoc <- tornado[-c(17,26,37),] attach(tornadoc) n <- sum(Deaths) extractAIC(glm(Deaths ~ 1, family=poisson)) + 2*1*2/(n-2) extractAIC(glm(Deaths ~ Killer.tornadoes, family=poisson)) + 2*2*3/(n-3) extractAIC(glm(Deaths ~ Tornadoes, family=poisson)) + 2*2*3/(n-3) extractAIC(glm(Deaths ~ factor(Year), family=poisson)) + 2*4*5/(n-5) extractAIC(glm(Deaths ~ Month, family=poisson)) + 2*12*13/(n-13) extractAIC(glm(Deaths ~ Killer.tornadoes + Tornadoes, family=poisson)) + 2*3*4/(n-4) extractAIC(glm(Deaths ~ Killer.tornadoes + factor(Year), family=poisson)) + 2*5*6/(n-6) extractAIC(glm(Deaths ~ Killer.tornadoes + Month, family=poisson)) + 2*13*14/(n-14) extractAIC(glm(Deaths ~ Tornadoes + factor(Year), family=poisson)) + 2*5*6/(n-6) extractAIC(glm(Deaths ~ Tornadoes + Month, family=poisson)) + 2*13*14/(n-14) extractAIC(glm(Deaths ~ factor(Year) + Month, family=poisson)) + 2*15*16/(n-16) extractAIC(glm(Deaths ~ Killer.tornadoes + Tornadoes + factor(Year), family=poisson)) + 2*6*7/(n-7) extractAIC(glm(Deaths ~ Killer.tornadoes + Tornadoes + Month, family=poisson)) + 2*14*15/(n-15) extractAIC(glm(Deaths ~ Killer.tornadoes + factor(Year) + Month, family=poisson)) + 2*16*17/(n-17) extractAIC(glm(Deaths ~ Tornadoes + factor(Year) + Month, family=poisson)) + 2*16*17/(n-17) extractAIC(glm(Deaths ~ Killer.tornadoes + Tornadoes + factor(Year) + Month, family=poisson)) + 2*17*18/(n-18) extractAIC(glm(Deaths ~ Killer.tornadoes + Month + Killer.tornadoes*Month, family=poisson)) + 2*24*25/(n-25) extractAIC(glm(Deaths ~ Killer.tornadoes + Month + Killer.tornadoes*Month + Tornadoes, family=poisson)) + 2*25*26/(n-26) extractAIC(glm(Deaths ~ Killer.tornadoes + Month + Tornadoes*Month + Tornadoes, family=poisson)) + 2*25*26/(n-26) extractAIC(glm(Deaths ~ Killer.tornadoes + Tornado.season, family=poisson)) + 2*7*8/(n-8) extractAIC(glm(Deaths ~ Killer.tornadoes + Tornadoes + Tornado.season, family=poisson)) + 2*8*9/(n-9) tornglm5 <- glm(Deaths ~ Killer.tornadoes + Month, family=poisson, x=T) tornglm5.diag <- glmdiag(tornglm5) tornadod <- tornado[tornado$Killer.tornadoes > 0,] attach(tornadod) tornglm6 <- glm(Deaths ~ log(Killer.tornadoes) + Month, family=poisson) # attach(uktrainacc) # # This illustrates the use of an offset # accglm1 <- glm(Accidents ~ Year + offset(log(Train.km)), family=poisson) accglm2 <- glm(SPAD.preventable ~ Year + offset(log(Train.km)), family=poisson) accglm3 <- glm(Other.preventable ~ Year + offset(log(Train.km)), family=poisson) accglm4 <- glm(Non.preventable ~ Year + offset(log(Train.km)), family=poisson) accglm5 <- glm(Accidents.grouped[1:6] ~ Year.grouped[1:6] + offset(log(Train.km.grouped[1:6])), family=poisson) accglm6 <- glm(SPAD.grouped[1:6] ~ Year.grouped[1:6] + offset(log(Train.km.grouped[1:6])), family=poisson) accglm7 <- glm(Other.grouped[1:6] ~ Year.grouped[1:6] + offset(log(Train.km.grouped[1:6])), family=poisson) accglm8 <- glm(Non.grouped[1:6] ~ Year.grouped[1:6] + offset(log(Train.km.grouped[1:6])), family=poisson) # # Section 5.3 # attach(wildcat) wildpois1 <- glm(Wildcat.strikes ~ Grievances + Rotate + Union + Log.workforce, family=poisson, x=T) summary(wildpois1) wildpois1.diag <- glmdiag(wildpois1) n <- length(Wildcat.strikes) # # z1 test for overdispersion # temp <- ((Wildcat.strikes - fitted(wildpois1))^2 - Wildcat.strikes)/(fitted(wildpois1)) t.test(temp) # # z2 test for overdispersion # summary(lm(temp ~ -1 + fitted(wildpois1))) # # Robust sandwich covariance estimator # n <- length(Wildcat.strikes) W1 <- diag(fitted(wildpois1)) X <- cbind(rep(1,n), Grievances, Rotate, Union, Log.workforce) W2 <- diag(resid(wildpois1, type="response")^2) robust <- solve(t(X)%*%W1%*%X)%*%(t(X)%*%W2%*%X)%*%solve(t(X)%*%W1%*%X) # # These are estimated standard errors and adjusted Wald statistics # sqrt(diag(robust)) coef(wildpois1)/sqrt(diag(robust)) 2*(1-pnorm(abs(coef(wildpois1)/sqrt(diag(robust))))) # # Getting correction factor for quasi-likelihood # sqrt(sum(residuals(wildpois1,type="pearson")^2)/(n-5)) # # This gives the quasi-likelihood output directly. In R, the family=quasi(link="log", var="mu") input # can be replaced with family=quasipoisson # summary(glm(Wildcat.strikes ~ Grievances + Rotate + Union + Log.workforce, family=quasi(link="log", var="mu"))) # # Section 5.4 # electclean <- election2000[-50,] attach(electclean) Bushpct <- Bush00/Total00 Gorepct <- Gore00/Total00 Naderpct <- Nader00/Total00 Perotpct <- Perot96/Total96 # # Poisson fit # electpois1 <- glm(Buchanan00 ~ offset(log(Total00)) + Bushpct, family=poisson, x=T) summary(electpois1) n <- sum(Buchanan00) # # Tests for overdispersion # temp <- ((Buchanan00-fitted(electpois1))^2-Buchanan00)/(fitted(electpois1)) t.test(temp) summary(lm(temp ~ -1 + fitted(electpois1))) # # Negative binomial fit # library(MASS) electnb1 <- glm.nb(Buchanan00 ~ offset(log(Total00)) + Bushpct, x=T) summary(electnb1) electnb2 <- glm.nb(Buchanan00 ~ offset(log(Total00)) + Bushpct + Naderpct + Perotpct, x=T) summary(electnb2) # # AICc values calculated directly from log-likelihood # -2*sum(log(dnegbinom(Buchanan00,fitted(electnb1),3.482))) + 2*3 + 2*3*4/(n-4) -2*sum(log(dnegbinom(Buchanan00,fitted(electnb2),6.56))) + 2*5 + 2*5*6/(n-6) electnb2.diag <- glmdiag(electnb2) electnb3 <- glm.nb(Buchanan00 ~ offset(log(Total00)) + Naderpct + Perotpct, x=T) summary(electnb3) -2*sum(log(dnegbinom(Buchanan00,fitted(electnb3),6.01))) + 2*4 + 2*4*5/(n-5) electnb3.diag <- glmdiag(electnb3) sum(resid(electnb3,type="pearson")^2) # # Table 5.5 # cumsum(dnegbinom(c(0:3407),1038.68,3.482))>.95 # at 2092 cumsum(dnegbinom(c(0:3407),1038.68,3.482))>.99 # at 2750 cumsum(dnegbinom(c(0:3407),948.25,6.56))>.95 # at 1630 cumsum(dnegbinom(c(0:3407),948.25,6.56))>.99 # at 2018 cumsum(dnegbinom(c(0:3407),1253.02,6.01))>.95 # at 2197 cumsum(dnegbinom(c(0:3407),1253.02,6.01))>.99 # at 2740 # # Section 5.5 # # The smoothing functions used here only work in S-PLUS; R's gam() function (from the mgcv # package) is distinctly different in functionality. The vgam() function in the package VGAM # fits generalized additive models, but I have been unable to get it to run on my machine # on these data. # attach(homerun) # # Don't count games where McGwire or Sosa didn't play # hrmcgwire <- homerun[McGwire.out==0,] hrsosa <- homerun[Sosa.out==0,] # # pois.crit.loess() is an S-PLUS function that gives the local likelihood AICc criterion # function; see file functions.s # # A grid search over different spans is performed here to find the AICc minimizer # attach(hrmcgwire) for (i in 40:100){ print(c(i/100,pois.crit.loess(i/100, StL.season.day, McGwire.HR, degree=1)$aicc))} macsmooth<-gam(McGwire.HR ~ lo(StL.season.day, degree=1, span=.78), family=poisson) macpred <- predict(macsmooth,type="response") attach(hrsosa) for (i in 30:70){ print(c(i/100,pois.crit.loess(i/100, Chi.season.day, Sosa.HR, degree=1)$aicc))} samsmooth<-gam(Sosa.HR ~ lo(Chi.season.day, degree=1, span=.46), family=poisson) sampred <- predict(samsmooth,type="response") # # Figure 5.17 # par(mfrow=c(2,1)) plot(hrmcgwire$StL.season.day, macpred, type="l", xlab="Calendar date of season", ylab="Home runs per game", ylim=c(.15,.65), xaxt="n", main="Mark McGwire home run rate") axis(1, at=c(16,47,77,108,139,169), labels=c("April","May","June","July","August","Sept."), ticks=F) axis(1, at=c(2,32,63,93,124,155,182), labels=F) rug(StL.season.day[McGwire.HR > 0], lwd=1) plot(hrsosa$Chi.season.day, sampred, type="l", xlab="Calendar date of season", ylab="Home runs per game", ylim=c(.15,.65), xaxt="n", main="Sammy Sosa home run rate") axis(1, at=c(16,47,77,108,139,169), labels=c("April","May","June","July","August","Sept."), ticks=F) axis(1, at=c(2,32,63,93,124,155,182), labels=F) rug(Chi.season.day[Sosa.HR > 0], lwd=1) # # Smoothing Texas Lottery data # for (i in 40:150){ print(c(i/100,cat.crit.loess(i/100, texas.lottery, degree=1)$aicc))} lotterysmooth <- cat.crit.loess(.95, texas.lottery, degree=1) # # Figure 5.18 # barplot(texas.lottery/150,col=0,names=as.character(c(0:9)),xlab="Digit chosen", ylab="Probability") lines(1.2*c(0:9) + .7,lotterysmooth$prob,type="b",pch="o") # attach(olympic2000) GDPpc <- GDP/Population olymp1 <- glm(Total2000 ~ offset(Log.athletes) + Log.population + GDPpc + Total1996, family=poisson) # # AICc value for Poisson regression # # Search for smoothing parameters than minimize AICc # crit.aicc.lm(olymp1) minaicc <- Inf mini <- NA minj <- NA mink <- NA for (i in 15:20){ for (j in 60:80){ for (k in 90:110){ gtemp <- gam(Total2000 ~ offset(Log.athletes) + lo(Log.population,i/10) + lo(GDPpc,j/100) + lo(Total1996,k/100), family=poisson) curaicc <- crit.aicc.gam(gtemp) print(c(i/10,j/100,k/100,curaicc,mini,minj,mink,minaicc)) if (curaicc < minaicc) { minaicc <- curaicc mini <- i/10 minj <- j/100 mink <- k/100} }}} olymp2 <- gam(Total2000 ~ offset(Log.athletes) + lo(Log.population,2) + lo(GDPpc,.69) + lo(Total1996,.97), family=poisson) crit.aicc.gam(olymp2) # # Determining smoothing parameter for GDPpc, assuming a model linear in logged population and logged 1996 medals # minaicc <- Inf minj <- NA for (j in 60:80){ gtemp <- gam(Total2000 ~ offset(Log.athletes) + Log.population + lo(GDPpc,j/100) + Log1996, family=poisson) curaicc <- crit.aicc.gam(gtemp) print(c(j/100,curaicc,minj,minaicc)) if (curaicc < minaicc) { minaicc <- curaicc minj <- j/100 }} olymp3 <- gam(Total2000 ~ offset(Log.athletes) + Log.population + lo(GDPpc,.72) + Log1996, family=poisson) # # Chapter 6 # # Section 6.1 # relapse <- c(30, 8) children <- c(75, 103) # # prop.test() gives equivalent chi-squared test to z-test comparing independent proportions # prop.test(relapse, children, correct=F) leukemia <- matrix(c(30,8,45,95), nrow=2) # # chisq.test() gives the Pearson chi-squared test of independence # chisq.test(leukemia, correct=F) # # powdivind() is a function to calculate the power-divergence tests of independence; # see file functions.s. The function also gives fitted values and residuals. # powdivind(leukemia) powdivind(leukemia, 0) powdivind(leukemia, 1) # # Section 6.2 # attach(marketing) # # The category() command attaches names to the numerical levels of the variables # major<-factor(X5, label=c("Marketing","Nonmarketing","Undecided"), exclude=NA) time<-factor(X9, label=c("High school","Freshman","Sophomore","Junior", "Senior"), exclude=NA) # # The table() function returns a cross-classification when more than one variable is input # marketmajor<-table(time, major) powdivind(marketmajor, 1) powdivind(marketmajor, 0) # # Printing out columns percentages # for (j in 1:3){ print(marketmajor[,j]/sum(marketmajor[,j]))} # # mosaic() is a function written by Jay Emerson to produce mosaic displays; see # file functions.s # mosaic(t(marketmajor)) # respdiag <- matrix(c(113,99,410,60,58,37,228,43,40,23,125,30,108,50,366,56,100,32,304,45), nrow=4, dimnames=list(c("Bronchitis","Sinusitis","URI","Pneumonia"), c("1/96-3/96","4/96-6/96","7/96-9/96","10/96-12/96","1/97-2/97"))) # # Getting adjusted residuals from Pearson residuals # respchi <- powdivind(respdiag,1) rowprob <- rowSums(respdiag)/sum(respdiag) colprob <- colSums(respdiag)/sum(respdiag) adjresid <- respchi$resid/sqrt(outer(1-rowprob,1-colprob)) adjresid # # Section 6.3 # thermal <- matrix(c(6,1,2,11), nrow=2, dimnames=list(c("Guilty","Innocent"), c("Guilty","Innocent"))) powdivind(thermal,1) powdivind(thermal,0) # # The fisher.test() function performs Fisher's exact test. It only gives the exact p-value, not # the mid-p-value # fisher.test(thermal) # # Section 6.4 # attach(cancerrate) # # This fits quasi-independence as a Poisson loglinear model. The structural zero # cells do not appear in the data, which are given in "case" form (one line for # cell in the table, and a Count variable that gives the number of observations # in the cell). # nycancer.glm <- glm(NY.count ~ factor(Type) + factor(Gender), family=poisson) resid(nycancer.glm, type="pearson") sum(resid(nycancer.glm, type="pearson")^2) # # This fits quasi-independnce as a loglinear model. The structural zero cells # are given as zeroes in the table, and a "start" design table identifies them # with zeroes (all other cells have the value 1 in the design table). The # "list(1,2)" entry tells the program to fit the loglinear model with only the # two marginal effects. # cancer.table <- matrix(c(7355,4831,1104,964,0,1563,9986,0,1014,618), byrow=T, nrow=5) cancer.design <- matrix(c(1,1,1,1,0,1,1,0,1,1), byrow=T, nrow=5) nycancer.loglin <- loglin(cancer.table, list(1,2), start=cancer.design, fit=T) c(nycancer.loglin$lrt, nycancer.loglin$pearson) # # Pearson residuals # (cancer.table - nycancer.loglin$fit)/sqrt(nycancer.loglin$fit) # # Section 6.5 # artifact <- matrix(c(2,3,13,20,10,8,5,36,4,4,3,19,2,6,9,20),nrow=4) # # table2case() is a function that converts a two-way contingency table into a data # frame that is in "case" form; see file functions.s # # The R function as.data.frame.table() has similar functionality # artcase <- table2case(artifact,rowname="Type",colname="Dist") # # Omitting outlier cells # typeout <- artcase$Type[-c(3,7)] distout <- artcase$Dist[-c(3,7)] artcountout <- artcase$Count[-c(3,7)] artclean.glm <- glm(artcountout ~ factor(typeout) + factor(distout), family=poisson) # # Forming a data frame of the predictors for the outlier cells, so that the fitted # values for the cells can be calculated using predict() # outcells <- data.frame(typeout=c(3,3), distout=c(1,2)) outfit <- predict(artclean.glm, outcells, type="response") # # Calculating standardized deleted residuals # c((artcase$Count[3]-outfit[1])/sqrt(outfit[1]), (artcase$Count[7]-outfit[2])/sqrt(outfit[2])) # # The S-PLUS function twoway() constructs a median polish fit. The corresponding R function # is medpolish(), which is available in the eda package. # artpol <- twoway(log(artifact)) # # Median polish fitted values are obtained using the residuals, which can then be used to # get Pearson residuals # artfit <- exp(log(artifact)-artpol$resid) pearres <- (artifact-artfit)/sqrt(artfit) pearres # # Chapter 7 # nonprofit <- matrix(c(1,0,1,2,0,2,1,1,3,5,2,0,5,12,11,4,1,0,0,4,2,0,0,0),byrow=T,nrow=4) nonprofit.data <- table2case(nonprofit) attach(nonprofit.data) # # Independence model # nonprofi <- glm(Count ~ factor(Rows) + factor(Cols), family=poisson) sum(resid(nonprofi,type="pearson")^2) # # Creating the uniform association score vector # unifscore <- (Rows-mean(Rows))*(Cols-mean(Cols)) # # Uniform association model # nonprofu <- glm(Count ~ factor(Rows) + factor(Cols) + unifscore, family=poisson) sum(resid(nonprofu,type="pearson")^2) matrix(resid(nonprofu,type="pearson"),nrow=4) # # Getting equivalent correlation coefficient. The S-PLUS function stdev() should # be replaced with the function sd() in R. # scaletheta <- coef(nonprofu)[10]*stdev(c(rep(1,6),rep(2,12),rep(3,33),rep(4,6)))* stdev(c(rep(1,7),rep(2,17),rep(3,17),rep(4,11),rep(5,3),rep(6,2))) (sqrt(1 + 4.*scaletheta*scaletheta) - 1)/(2.*scaletheta) # assessment <- matrix(c(0,3,8,3,0,1,7,6,1,6,4,2,0,4,8,2),nrow=4) assessment.data <- table2case(assessment) attach(assessment.data) n <- sum(Count) assessi <- glm(Count ~ factor(Rows) + factor(Cols), family=poisson) # # The value.stack.dim setting is not needed in R # fisher.test(assessment, value.stack.dim=50000) # # AICc determined using deviance # deviance(assessi) + 2*7*n/(n - 7 - 1) # # Column effects model # rowscore <- Rows - mean(Rows) assessc <- glm(Count ~ factor(Rows) + factor(Cols) + factor(Cols)*rowscore, family=poisson) deviance(assessc) + 2*10*n/(n - 10 - 1) # myopia <- matrix(c(2,21,66,9,1,1,16,50,31,3,1,15,29,48,7),nrow=5) dimnames(myopia) <- list(c("","Hyperopia","Emmetropia","Myopia",""), c("Darkness","Night light","Room light")) n <- sum(myopia) myopia.data <- table2case(myopia) attach(myopia.data) myopiai <- glm(Count ~ factor(Rows) + factor(Cols), family=poisson) deviance(myopiai) + 2*7*n/(n - 7 - 1) colscore <- Cols - mean(Cols) rowscore <- Rows - mean(Rows) unifscore <- rowscore*colscore myopiau <- glm(Count ~ factor(Rows) + factor(Cols) + unifscore, family=poisson) deviance(myopiau) + 2*8*n/(n - 8 - 1) # # Row effects model # myopiar <- glm(Count ~ factor(Rows) + factor(Cols) + factor(Rows)*colscore, family=poisson) deviance(myopiar) + 2*11*n/(n - 11 - 1) myopiac <- glm(Count ~ factor(Rows) + factor(Cols) + factor(Cols)*rowscore, family=poisson) deviance(myopiac) + 2*9*n/(n - 9 - 1) # # Row + Column effects model # myopiarpc <- glm(Count ~ factor(Rows) + factor(Cols) + factor(Rows)*colscore + factor(Cols)*rowscore, family=poisson) deviance(myopiarpc) + 2*12*n/(n - 12 - 1) # # Myopia effect models # myop <- as.numeric(Rows>3) - as.numeric(Rows < 4) myopiarm1 <- glm(Count ~ factor(Rows) + factor(Cols) + myop*colscore, family=poisson) deviance(myopiarm1) + 2*8*n/(n - 8 - 1) myop <- as.numeric(Rows==4) - as.numeric(Rows<=3) highmyop <- as.numeric(Rows==5) - as.numeric(Rows<=3) myopiarm2 <- glm(Count ~ factor(Rows) + factor(Cols) + myop*colscore + highmyop*colscore, family=poisson) deviance(myopiarm2) + 2*9*n/(n - 9 - 1) # # Second New York Public Library book deterioration table # nypl2 <- matrix(c(27,1,2,0,50,6,3,34,7,0,1,30,676,22,13,0),byrow=T,nrow=4) # # Loglinear association models are fit as above. The RC model can be fit using the grc() function # of the VGAM library; see http://www.stat.auckland.ac.nz/~yee/VGAM. Please note that the # formulation given is different from that given in the book, but the fitted values and deviance # are available for use. The values given here are slightly different from those obtained using # the package ANOAS, which are given in the book. # library(VGAM) nypl2rc <- grc(nypl2) summary(nypl2rc) # # Data are in file gviolence # gviolence.data <- table2case(table(gviolence$Good.neutral.injuries,gviolence$Bad.injuries)) # # Setting up data to fit bivariate distribution # gviolence.data$Rows <- gviolence.data$Rows-1 gviolence.data$Cols <- gviolence.data$Cols-1 gviolence.data$Rows[gviolence.data$Rows==5] <- 8 attach(gviolence.data) n <- sum(Count) gviolencei <- glm(Count ~ factor(Rows) + factor(Cols), family=poisson) deviance(gviolencei) + 2*10*n/(n - 10 - 1) colscore <- Cols-mean(Cols) rowscore <- Rows-mean(Rows) unifscore <- rowscore*colscore gviolenceu <- glm(Count ~ factor(Rows) + factor(Cols) + unifscore, family=poisson) deviance(gviolenceu) + 2*11*n/(n - 11 - 1) # # The factorial function is in S-PLUS, but not R; see file functions.s for it # # Fitting independent bivariate Poissons # gviolencebpi <- glm(Count ~ offset(-log(factorial(Rows))-log(factorial(Cols))) + Rows + Cols, family=poisson) deviance(gviolencebpi) + 2*3*n/(n - 3 - 1) # # Fitting correlated bivariate Poissons # gviolencebp <- glm(Count ~ offset(-log(factorial(Rows))-log(factorial(Cols))) + Rows + Cols + Rows*Cols, family=poisson) deviance(gviolencebp) + 2*4*n/(n - 4 - 1) # # Removing outlier # gviolence2.data <- gviolence.data[gviolence.data$Rows!=8,] attach(gviolence2.data) n <- sum(Count) gviolence2bp <- glm(Count ~ offset(-log(factorial(Rows))-log(factorial(Cols))) + Rows + Cols + Rows*Cols, family=poisson) deviance(gviolence2bp) + 2*4*n/(n - 4 - 1) # # Testing equal means of independent Poissons # gviolunpaired <- matrix(c(rep(0,73), rep(1,73), gviolence$Good.neutral.injuries[-52], gviolence$Bad.injuries[-52]), ncol=2) summary(glm(gviolunpaired[,2] ~ gviolunpaired[,1], family=poisson)) # # Fitting equal-mean correlated bivariate Poissons # rowpcol <- Rows + Cols rowtcol <- Rows * Cols gviolence2bpeq <- glm(Count ~ offset(-log(factorial(Rows))-log(factorial(Cols))) + rowpcol + rowtcol, family=poisson) deviance(gviolence2bpeq) + 2*3*n/(n - 3 - 1) # # Test for equal means of correlated Poissons # deviance(gviolence2bpeq) - deviance(gviolence2bp) # # Section 7.2 # ratetrans <- matrix(c(111,6,0,0,0,5,396,18,2,0,0,19,938,27,4,0,2,41,671,48,0,2,8,29,930), nrow=5) n <- sum(ratetrans) ratetrans.data <- table2case(ratetrans) attach(ratetrans.data) # # Independence model # ratei <- glm(Count ~ factor(Rows) + factor(Cols), family=poisson) deviance(ratei) + 2*9*n/(n - 9 - 1) # # makediags() is a function that creates a factor defining the diagonal cells; see file # functions.s # diags <- makediags(ratetrans.data) # # Quasi-independence model # rateqi <- glm(Count ~ diags + factor(Rows) + factor(Cols), family=poisson) deviance(rateqi) + 2*14*n/(n - 14 - 1) # # makeoffdiags() is a function that creates a factor defining the symmetric off-diagonal cells; # see file functions.s # symm <- makeoffdiags(ratetrans.data) rates <- glm(Count ~ diags + symm, family=poisson) deviance(rates) + 2*15*n/(n-15-1) # jobparl <- matrix(c(117,11,4,0,0,0,8,0,83,37,3,5,1,4,2,7,19,6,67,1,11,0,1,20,0,0,1,16,0,0,0,0,1,0,4, 0,26,0,0,4,17,7,3,8,0,19,0,1,76,7,5,1,1,1,22,0,6,6,13,0,0,0,1,37), nrow=8) n <- sum(jobparl) jobparl.data <- table2case(jobparl) attach(jobparl.data) jobparli <- glm(Count ~ factor(Rows) + factor(Cols), family=poisson) deviance(jobparli) + 2*15*n/(n-15-1) diags <- makediags(jobparl.data) jobparlqi <- glm(Count ~ diags + factor(Rows) + factor(Cols), family=poisson) deviance(jobparlqi) + 2*23*n/(n-23-1) symm <- makeoffdiags(jobparl.data) jobparls <- glm(Count ~ diags + symm, family=poisson) deviance(jobparls) + 2*36*n/(n-36-1) # # Fitting quasi-symmetry model # jobparlqs <- glm(Count ~ factor(Rows) + factor(Cols) + symm, family=poisson) deviance(jobparlqs) + 2*43*n/(n-43-1) # # Test of marginal homogeneity # deviance(jobparls) - deviance(jobparlqs) # occupmob <- matrix(c(50,19,26,8,18,6,2,16,40,34,18,31,8,3,12,35,65,66,123,23,21,11,20,58,110,223,64,32, 14,36,114,185,714,258,189,0,6,19,40,179,143,71,0,3,14,32,141,91,106), byrow=T, nrow=7) n <- sum(occupmob) occupmob.data <- table2case(occupmob) attach(occupmob.data) diags <- makediags(occupmob.data) symm <- makeoffdiags(occupmob.data) occmobqs <- glm(Count ~ factor(Rows) + factor(Cols) + symm, family=poisson) deviance(occmobqs) + 2*34*n/(n - 34 - 1) # # Fitting ordinal quasi-symmetry model # colscore <- Cols - mean(Cols) rowscore <- Rows - mean(Rows) occmobqso <- glm(Count ~ diags + symm + colscore, family=poisson) deviance(occmobqso) + 2*29*n/(n - 29 - 1) # # Fitting quasi-uniform association model # unifscore <- rowscore*colscore occmobqu <- glm(Count ~ factor(Rows) + factor(Cols) + diags + unifscore, family=poisson) deviance(occmobqu) + 2*21*n/(n - 21 - 1) # diabsens <- matrix(c(1,0,12,4),nrow=2) # # mcnemar.test() gives McNemar's test # mcnemar.test(diabsens,corr=F) diabspec <- matrix(c(3,1,10,3),nrow=2) mcnemar.test(diabspec,corr=F) # # depend.prop.ci() is a function that gives the confidence interval for the difference between # dependent proportions; see file functions.s # # There is a mistake in the formulas for the confidence intervals on page 281, # resulting in the intervals given being incorrect # depend.prop.ci(diabsens) depend.prop.ci(diabspec) # cataract <- matrix(c(0,1,3,7,0,2,0,9,1,1,0,5,4,3,3,29),nrow=4) n <- sum(cataract) cataract.data <- table2case(cataract) attach(cataract.data) colscore <- Cols - mean(Cols) diags <- makediags(cataract.data) symm <- makeoffdiags(cataract.data) cataractqso <- glm(Count ~ diags + symm + colscore, family=poisson) # # Getting marginal probabilities from the ordinal quasi-symmetry model rowSums(matrix(fitted(cataractqso),nrow=4)/n) colSums(matrix(fitted(cataractqso),nrow=4)/n) # paciftherm <- matrix(c(49,14,1,36), nrow=2) mcnemar.test(paciftherm, corr=F) # # kappa.counts() calculates unweighted and weighted kappa values; see file functions.s # kappa.counts(paciftherm) pacifthermadj <- matrix(c(38,4,12,46), nrow=2) mcnemar.test(pacifthermadj, corr=F) kappa.counts(pacifthermadj) # socktest <- matrix(c(5,3,0,0,0,5,2,0,0,0,3,0,0,0,0,3), nrow=4) # # Creating quadratic weight matrix for weighted kappa calculation # wtsock <- matrix(NA, nrow=4, ncol=4) for (i in 1:4){ for (j in 1:4){ wtsock[i,j] = 1 - (i - j)^2/(4 - 1)^2}} kappa.counts(socktest,wtsock) # # Creating absolute weight matrix for weighted kappa calculation # for (i in 1:4){ for (j in 1:4){ wtsock[i,j] = 1 - abs(i - j)/(4 - 1)}} kappa.counts(socktest,wtsock) # malnutrboy <- matrix(c(20,4,0,237), nrow=2) mcnemar.test(malnutrboy, corr=F) # # Exact p-value is obtained through the binomial probability function. The following # command gives the entire distribution, where the number of trials is n11 + n22 and # p=.5 # dbinom(c(0:4),4,.5) malnutrgirl <- matrix(c(9,14,0,229),nrow=2) mcnemar.test(malnutrgirl,corr=F) dbinom(c(0:14),14,.5) # # Chapter 8 # # Section 8.1 # # Figure 8.1 (Simpson's paradox plot) # plot(c(0,100),c(30,80),ylim=c(0,100),type="l",xlab="Percent from Los Angeles",ylab="Recall percentage") lines(c(0,100),c(40,90),lty=3) points(c(0,80,100),c(30,70,80),pch=16) points(c(0,20,100),c(40,50,90),pch=1) text(40,40,"Week 1") text(50,75,"Week 2") lines(c(80,80),c(3,67),lty=2) points(80,0,pch=6) lines(c(20,20),c(3,47),lty=2) points(20,0,pch=6) text(92,20,"Week 1 survey was",cex=.55) text(92,15,"80% Los Angelinos",cex=.55) text(8,20,"Week 2 survey was",cex=.55) text(8,15,"80% New Yorkers",cex=.55) # # Fitting independence to partial freezer ownership tables and marginal table # freezown <- matrix(c(304,666,894,720,38,85,93,84), nrow=4) freezrent <- matrix(c(92,174,379,433,64,113,321,297), nrow=4) freezmarg <- freezown + freezrent freezown.data <- table2case(freezown) attach(freezown.data) glm(Count ~ factor(Rows) + factor(Cols), family=poisson) freezrent.data <- table2case(freezrent) attach(freezrent.data) glm(Count ~ factor(Rows) + factor(Cols), family=poisson) freezmarg.data <- table2case(freezmarg) attach(freezmarg.data) glm(Count ~ factor(Rows) + factor(Cols), family=poisson) # smoking<-array(c(53,61,2,1,121,152,3,5,95,114,14,7,103,66,27,12, 64,81,51,40,7,28,29,101,0,0,13,64), dim=c(2,2,7), dimnames=list(c("Smoker","Nonsmoker"), c("Alive","Dead"), c("18-24","25-34","35-44","45-54","55-64","65-74","75 + "))) # # mantelhaen.test() function constructs Cochran-Mantel-Haenszel test # mantelhaen.test(smoking,cor=F) # # OR.MH() is a function to give the Mantel-Haenszel estimate of common odds ratio (with CI); # see file functions.s. These are given by the mantelhaen.test() function in R, but not in S-PLUS # OR.MH(smoking) # # OR.BD() is a function to give the Breslow-Day and Tarone adjusted Breslow-Day statistics; see # file functions.s # OR.BD(smoking,0.6545231) # # Section 8.2 # # # Loglinear models for smoking data are fit using loglin(). The model is fit by specifying the sufficient marginals. # n <- sum(smoking) # # Model MS, MA, AS # ms.ma.as.model <- loglin(smoking, list(c(1,2),c(1,3),c(2,3)), eps=.01) # # Getting number of parameters in the model for AICc calcualation # npar <- prod(dim(smoking)) - ms.ma.as.model$df ms.ma.as.model$lrt + 2*npar*(n/(n - npar - 1)) # # Model MS, MA # ms.ma.model <- loglin(smoking, list(c(1,2),c(2,3))) npar <- prod(dim(smoking)) - ms.ma.model$df ms.ma.model$lrt + 2*npar*(n/(n - npar - 1)) # # Model MS, AS # ms.as.model <- loglin(smoking, list(c(1,2),c(1,3))) npar <- prod(dim(smoking)) - ms.as.model$df ms.as.model$lrt + 2*npar*(n/(n - npar - 1)) # # Model MA, AS # ma.as.model <- loglin(smoking, list(c(1,3),c(2,3))) npar <- prod(dim(smoking)) - ma.as.model$df ma.as.model$lrt + 2*npar*(n/(n - npar - 1)) # # Model S, MA # s.ma.model <- loglin(smoking, list(c(2,3),1)) npar <- prod(dim(smoking)) - s.ma.model$df s.ma.model$lrt + 2*npar*(n/(n - npar - 1)) # # Model M, AS # m.as.model <- loglin(smoking, list(c(1,3),2)) npar <- prod(dim(smoking)) - m.as.model$df m.as.model$lrt + 2*npar*(n/(n - npar - 1)) # # Test for MS=0 given (MS, MA, AS) # ma.as.model$lrt - ms.ma.as.model$lrt # mildeath <- array(c(158,56,11,51,24,96,30,7,32,9,54,21,2,13,13,69,6,6,7,27,15,5,5,3,1,9,4,2,3,2,7,4,0,1,2,3,0,1,0,2), dim=c(5,4,2), dimnames=list(c("Accident","Illness","Homicide","Self-infl.","Unknown"),c("Army","Navy","Air Force", "Marine Corps"),c("Male","Female"))) # # Loglinear models on observed counts are fit as above using the loglin() function # g.ms.model <- loglin(mildeath, list(c(1,2),3),fit=T) # # Collapsed tables # apply(mildeath, c(1,2), sum) apply(mildeath, 3, sum) # # Mosaic plots for three-dimensional tables # par(mfrow=c(2,1), mex=.5) mosaic(aperm(mildeath, c(2,1,3)), main="Observed counts") mosaic(aperm(g.ms.model$fit, c(2,1,3)), main="Fitted counts") # # multtable2case() is a function that conversts multidimensional contingency tables in array form # into case form; see file functions.s # # The R function as.data.frame.table() has similar functionality # # Death rate models are fit by converting to case form, so that the glm() function can be used # with logged service membership as an offset # totalmil <- c(407398,321424,294117,162477,72028,51622,66473,10164) miltotal <- rep(totalmil,each=5) mildeath.data <- multtable2case(mildeath, vnames=c("Manner","Service","Gender")) attach(mildeath.data) ms.mg.rate.model <- glm(Count ~ offset(log(miltotal)) + factor(Manner)*factor(Service) + factor(Manner)*factor(Gender), family=poisson) deviance(ms.mg.rate.model) + 2*25*n/(n-25-1) g.ms.rate.model <- glm(Count ~ offset(log(miltotal)) + factor(Manner)*factor(Service) + factor(Gender), family=poisson) deviance(g.ms.rate.model) + 2*21*n/(n-21-1) # # Get death rates for collapsed table # apply(mildeath,c(1,2),sum)/matrix(c(totalmil[1] + totalmil[5],totalmil[2] + totalmil[6], totalmil[3] + totalmil[7],totalmil[4] + totalmil[8]),nrow=5,ncol=4,byrow=T)*100000 # # Section 8.3 # bizimport <- array(c(2,0,0,0,1,0,0,1,0,0,0,3,3,1,2,0,0,0,8,10,1,1,0,8,25,0,0,0,0,0,0,0,0,1,0,0,0,2,4,2, 0,0,1,6,15,0,0,1,6,21,2,0,0,0,0,0,1,1,0,0,0,0,4,7,5,0,1,3,19,22,0,1,1,10,38,0,0,0,0,0,0,1,0,1,1,0,0,1,4,2, 0,1,1,15,14,0,0,3,3,24,2,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,2,8,6,1,0,2,2,4), dim=c(5,5,5), dimnames=list(c("VU","U","N","I","VI"),c("VU","U","N","I","VI"),c("HS","F","So","J","Se"))) bizimport.data <- multtable2case(bizimport,vnames=c("Employment","Salary","Major")) attach(bizimport.data) n <- sum(Count) # # Models ignoring ordering of categories # es.em.sm.model <- glm(Count ~ factor(Employment)*factor(Salary) + factor(Employment)*factor(Major) + factor(Salary)*factor(Major), family=poisson, maxit=200) deviance(es.em.sm.model) + 2*61*n/(n - 61 - 1) es.em.model <- glm(Count ~ factor(Employment)*factor(Salary) + factor(Employment)*factor(Major), family=poisson) deviance(es.em.model) + 2*45*n/(n - 45 - 1) es.sm.model <- glm(Count ~ factor(Employment)*factor(Salary) + factor(Salary)*factor(Major), family=poisson) deviance(es.sm.model) + 2*45*n/(n - 45 - 1) em.sm.model <- glm(Count ~ factor(Employment)*factor(Major) + factor(Salary)*factor(Major), family=poisson) deviance(em.sm.model) + 2*45*n/(n - 45 - 1) s.em.model <- glm(Count ~ factor(Salary) + factor(Employment)*factor(Major), family=poisson) deviance(s.em.model) + 2*29*n/(n - 29 - 1) m.se.model <- glm(Count ~ factor(Employment)*factor(Salary) + factor(Major), family=poisson) deviance(m.se.model) + 2*29*n/(n - 29 - 1) m.s.e.model <- glm(Count ~ factor(Employment) + factor(Salary) + factor(Major), family=poisson) deviance(m.s.e.model) + 2*13*n/(n - 13 - 1) # # Association models # Rows <- Employment - mean(Employment) Cols <- Salary - mean(Salary) Levels <- Major - mean(Major) jointind.assoc.model <- glm(Count ~ factor(Employment) + factor(Salary) + factor(Major) + Rows*Cols, family=poisson) deviance(jointind.assoc.model) + 2*14*n/(n - 14 - 1) jointind.row.model <- glm(Count ~ factor(Employment) + factor(Salary) + factor(Major) + factor(Rows)*Cols, family=poisson) deviance(jointind.row.model) + 2*17*n/(n - 17 - 1) jointind.col.model <- glm(Count ~ factor(Employment) + factor(Salary) + factor(Major) + Rows*factor(Cols), family=poisson) deviance(jointind.col.model) + 2*17*n/(n - 17 - 1) jointind.rpc.model <- glm(Count ~ factor(Employment) + factor(Salary) + factor(Major) + Rows*factor(Cols) + Cols*factor(Rows), family=poisson) deviance(jointind.rpc.model) + 2*20*n/(n - 20 - 1) # # Symmetry models # Rows <- Employment Cols <- Salary # # The makediags() and makeoffdiags() functions can be used in this case by making the (Rows, Cols) variables # a data frame # diags <- makediags(as.data.frame(cbind(Rows,Cols))) symm <- makeoffdiags(as.data.frame(cbind(Rows,Cols))) jointind.symm.model <- glm(Count ~ factor(Major) + diags + symm, family=poisson) deviance(jointind.symm.model) + 2*19*n/(n - 19 - 1) jointind.quasisymm.model <- glm(Count ~ factor(Employment) + factor(Salary) + factor(Major) + symm, family=poisson) deviance(jointind.quasisymm.model) + 2*23*n/(n - 23 - 1) colscore <- Cols - mean(Cols) jointind.ordquasisymm.model <- glm(Count ~ factor(Major) + diags + symm + colscore, family=poisson) deviance(jointind.ordquasisymm.model) + 2*20*n/(n - 20 - 1) # # Print out estimated probabilities # print(array(fitted(jointind.quasisymm.model), dim=c(5,5,5))[,,1]/66, digit=4) # # Section 8.4 # stillbirth <- array(c(22,21,12,4,7,16,42,73,387,3934,171,109,95,112,169,121,358,944,5155,101776, 17,13,10,10,13,16,19,76,451,3729,167,100,78,92,209,107,314,727,4224,96077), dim=c(5,2,2,2), dimnames=list(c("le 24","25-28","29-32","33-36","37-41"),c("Stillbirth","Live birth"), c("Aborigines","Whites"),c("Males","Females"))) n <- sum(stillbirth) # # loglin() function is used for higher-dimensional tables as well, by specifying the sufficient marginals # a.b.r.g.model <- loglin(stillbirth, list(1,2,3,4), eps=.01) npar <- prod(dim(stillbirth)) - a.b.r.g.model$df a.b.r.g.model$lrt + 2*npar*(n/(n - npar - 1)) ab.ar.ag.br.bg.rg.model <- loglin(stillbirth, list(c(1,2),c(1,3),c(1,4),c(2,3),c(2,4),c(3,4)),eps=.01) npar <- prod(dim(stillbirth)) - ab.ar.ag.br.bg.rg.model$df ab.ar.ag.br.bg.rg.model$lrt + 2*npar*(n/(n - npar - 1)) abr.abg.agr.brg.model <- loglin(stillbirth, list(c(1,2,3),c(1,2,4),c(1,3,4),c(2,3,4)),eps=.01) npar <- prod(dim(stillbirth)) - abr.abg.agr.brg.model$df abr.abg.agr.brg.model$lrt + 2*npar*(n/(n - npar - 1)) abg.agr.brg.model <- loglin(stillbirth, list(c(1,2,4),c(1,3,4),c(2,3,4)),eps=.01) npar <- prod(dim(stillbirth)) - abg.agr.brg.model$df abg.agr.brg.model$lrt + 2*npar*(n/(n - npar - 1)) abr.agr.brg.model <- loglin(stillbirth, list(c(1,2,3),c(1,3,4),c(2,3,4)),eps=.01) npar <- prod(dim(stillbirth)) - abr.agr.brg.model$df abr.agr.brg.model$lrt + 2*npar*(n/(n - npar - 1)) abr.abg.brg.model <- loglin(stillbirth, list(c(1,2,3),c(1,2,4),c(2,3,4)),eps=.01) npar <- prod(dim(stillbirth)) - abr.abg.brg.model$df abr.abg.brg.model$lrt + 2*npar*(n/(n - npar - 1)) abr.abg.agr.model <- loglin(stillbirth, list(c(1,2,3),c(1,2,4),c(1,3,4)),eps=.01) npar <- prod(dim(stillbirth)) - abr.abg.agr.model$df abr.abg.agr.model$lrt + 2*npar*(n/(n - npar - 1)) abr.agr.bg.model <- loglin(stillbirth, list(c(1,2,3),c(1,3,4),c(2,4)),eps=.01) npar <- prod(dim(stillbirth)) - abr.agr.bg.model$df abr.agr.bg.model$lrt + 2*npar*(n/(n - npar - 1)) ab.agr.br.bg.model <- loglin(stillbirth, list(c(1,2),c(1,3,4),c(2,3),c(2,4)),eps=.01) npar <- prod(dim(stillbirth)) - ab.agr.br.bg.model$df ab.agr.br.bg.model$lrt + 2*npar*(n/(n - npar - 1)) abr.agr.model <- loglin(stillbirth, list(c(1,2,3),c(1,3,4)),eps=.01) npar <- prod(dim(stillbirth)) - abr.agr.model$df abr.agr.model$lrt + 2*npar*(n/(n - npar - 1)) br.agr.bg.model <- loglin(stillbirth, list(c(2,3),c(1,3,4),c(2,4)),eps=.01) npar <- prod(dim(stillbirth)) - br.agr.bg.model$df br.agr.bg.model$lrt + 2*npar*(n/(n - npar - 1)) ab.agr.bg.model <- loglin(stillbirth, list(c(1,2),c(1,3,4),c(2,4)),eps=.01) npar <- prod(dim(stillbirth)) - ab.agr.bg.model$df ab.agr.bg.model$lrt + 2*npar*(n/(n - npar - 1)) # # Collapsed tables # aperm(apply(stillbirth,c(1,3,4),sum),c(3,2,1)) apply(stillbirth,c(1,2,4),sum) aperm(apply(stillbirth,c(1,2,4),sum),c(3,2,1)) # # Chapter 9 # # Section 9.1 # attach(challenger) n <- sum(O.rings) # # The glm() function with binomial family gives logistic regression # # In this formulation, Damaged is the number of successes, and O.rings is the number of trials. # This is coded into the function with the target entered as Success variable/Number of trials # variable, with the number of trials variable given as a weight function # challog1 <- glm(Damaged/O.rings ~ Temperature + Pressure, weights=O.rings, family=binomial) summary(challog1) deviance(challog1) + 2*3*(n/(n - 3 - 1)) # # This is the 0/1 form of the data. Note that while the maximum likelihood estimates are identical # either way, the deviance statistics are different - see Section 9.1.3. # Temp01 <- rep(Temperature, each=6) Press01 <- rep(Pressure, each=6) Dam01 <- rep(0, n) for (i in 1:23) Dam01[(6*(i-1) + 1):(6*i)] <- c(rep(1,Damaged[i]), rep(0,6-Damaged[i])) chall01 <- data.frame(cbind(Temp01,Press01,Dam01)) challog2 <- glm(Dam01 ~ Temp01 + Press01, family=binomial) # # Simplified model # challog3 <- glm(Damaged/O.rings ~ Temperature, weights=O.rings, family=binomial, x=T) summary(challog3) deviance(challog3) + 2*2*(n/(n - 2 - 1)) # # Omitting outlier # chall2 <- challenger[-21,] attach(chall2) challog4 <- glm(Damaged/O.rings ~ Temperature + Pressure, weights=O.rings, family=binomial) challog5 <- glm(Damaged/O.rings ~ Temperature, weights=O.rings, family=binomial) # # Figure 9.5 # plot(challenger$Temperature, challenger$Damaged/challenger$O.rings, xlab="Temperature", ylab="Probability of O-ring damage", xlim=c(30,81), ylim=c(0,1)) # # Get the fitted curve by creating a fine grid of temperature values as a data frame, and then using the predict() # function with type="response" # new.data <- data.frame(Temperature=c(300:810)/10) lines(new.data$Temperature, predict(challog5,new.data,type="response")) # # Section 9.2 # attach(bankruptcy) # # Logistic regressions with a binary (0/1) target variable # bank1 <- glm(Bankrupt ~ WC.TA + RE.TA + EBIT.TA + S.TA + BVE.BVL, family=binomial, maxit=500) bank2 <- glm(Bankrupt ~ WC.TA + RE.TA + EBIT.TA + BVE.BVL, family=binomial, maxit=500) bankruptcy2 <- bankruptcy[-1,] attach(bankruptcy2) bank3 <- glm(Bankrupt ~ WC.TA + RE.TA + EBIT.TA + S.TA + BVE.BVL, family=binomial, maxit=500) bank4 <- glm(Bankrupt ~ WC.TA + RE.TA + EBIT.TA + BVE.BVL, family=binomial, maxit=500) bank5 <- glm(Bankrupt ~ RE.TA + EBIT.TA + BVE.BVL, family=binomial, maxit=500, x=T) summary(bank5) bank6 <- glm(Bankrupt ~ WC.TA + EBIT.TA + BVE.BVL, family=binomial, maxit=500) bank7 <- glm(Bankrupt ~ WC.TA + RE.TA + BVE.BVL, family=binomial, maxit=500) bank8 <- glm(Bankrupt ~ WC.TA + RE.TA + EBIT.TA, family=binomial, maxit=500) glm(Bankrupt ~ EBIT.TA + BVE.BVL, family=binomial, maxit=500) glm(Bankrupt ~ RE.TA + BVE.BVL, family=binomial, maxit=500) glm(Bankrupt ~ RE.TA + EBIT.TA, family=binomial, maxit=500) # # Diagnostics can be obtained using the glmdiag() function # bank5.diag <- glmdiag(bank5) # # Classification table # bankrupt.predict <- as.numeric(fitted(bank5) > .5) table(Bankrupt, bankrupt.predict) # # To get prospective probability estimates, adjust the retrospective logits and the transform back. # The retrospective logits are obtained from the predict() function # prosplogit <- predict(bank5) + log((.1*25)/(.9*24)) prospprob <- exp(prosplogit)/(1 + exp(prosplogit)) # # Section 9.3 # attach(titanic) # # The summary.formula() function is part of the hmisc library, and allows for sums # spearated by groups defined by a formula # library(hmisc) summary(Survived ~ Economic.status, fun=sum)/summary(At.risk ~ Economic.status, fun=sum) summary(Survived ~ Age.group, fun=sum)/summary(At.risk ~ Age.group, fun=sum) summary(Survived ~ Gender, fun=sum)/summary(At.risk ~ Gender, fun=sum) n <- sum(At.risk) titanic1 <- glm(Survived/At.risk ~ Economic.status, family=binomial, weights=At.risk) npar <- 14 - titanic1$df.residual deviance(titanic1) + 2*npar*n/(n - npar - 1) titanic2 <- glm(Survived/At.risk ~ Age.group, family=binomial, weights=At.risk) npar <- 14 - titanic2$df.residual deviance(titanic2) + 2*npar*n/(n - npar - 1) titanic3 <- glm(Survived/At.risk ~ Gender, family=binomial, weights=At.risk) npar <- 14 - titanic3$df.residual deviance(titanic3) + 2*npar*n/(n - npar - 1) titanic4 <- glm(Survived/At.risk ~ Economic.status + Age.group, family=binomial, weights=At.risk) npar <- 14 - titanic4$df.residual deviance(titanic4) + 2*npar*n/(n - npar - 1) titanic5 <- glm(Survived/At.risk ~ Economic.status + Gender, family=binomial, weights=At.risk) npar <- 14 - titanic5$df.residual deviance(titanic5) + 2*npar*n/(n - npar - 1) titanic6 <- glm(Survived/At.risk ~ Age.group + Gender, family=binomial, weights=At.risk) npar <- 14 - titanic6$df.residual deviance(titanic6) + 2*npar*n/(n - npar - 1) titanic7 <- glm(Survived/At.risk ~ Economic.status + Age.group + Gender, family=binomial, weights=At.risk) npar <- 14 - titanic7$df.residual deviance(titanic7) + 2*npar*n/(n - npar - 1) # # Interaction effects fit as was done for Poisson regression # titanic8 <- glm(Survived/At.risk ~ Economic.status*Age.group, family=binomial, weights=At.risk) npar <- 14 - titanic8$df.residual deviance(titanic8) + 2*npar*n/(n - npar - 1) titanic9 <- glm(Survived/At.risk ~ Economic.status*Gender, family=binomial, weights=At.risk) npar <- 14 - titanic9$df.residual deviance(titanic9) + 2*npar*n/(n - npar - 1) titanic10 <- glm(Survived/At.risk ~ Age.group*Gender, family=binomial, weights=At.risk) npar <- 14 - titanic10$df.residual deviance(titanic10) + 2*npar*n/(n - npar - 1) titanic11 <- glm(Survived/At.risk ~ Economic.status*Age.group + Gender, family=binomial, weights=At.risk) npar <- 14 - titanic11$df.residual deviance(titanic11) + 2*npar*n/(n - npar - 1) titanic12 <- glm(Survived/At.risk ~ Economic.status*Gender + Age.group, family=binomial, weights=At.risk) npar <- 14 - titanic12$df.residual deviance(titanic12) + 2*npar*n/(n - npar - 1) titanic13 <- glm(Survived/At.risk ~ Age.group*Gender + Economic.status, family=binomial, weights=At.risk) npar <- 14 - titanic13$df.residual deviance(titanic13) + 2*npar*n/(n - npar - 1) titanic14 <- glm(Survived/At.risk ~ Economic.status*Age.group + Economic.status*Gender, family=binomial, weights=At.risk, maxit=100) npar <- 14 - titanic14$df.residual deviance(titanic14) + 2*npar*n/(n - npar - 1) titanic15 <- glm(Survived/At.risk ~ Economic.status*Age.group + Age.group*Gender, family=binomial, weights=At.risk) npar <- 14 - titanic15$df.residual deviance(titanic15) + 2*npar*n/(n - npar - 1) titanic16 <- glm(Survived/At.risk ~ Economic.status*Gender + Age.group*Gender, family=binomial, weights=At.risk) npar <- 14 - titanic16$df.residual deviance(titanic16) + 2*npar*n/(n - npar - 1) titanic17 <- glm(Survived/At.risk ~ Economic.status*Gender + Age.group*Gender + Economic.status*Age.group, family=binomial, weights=At.risk, maxit=100) npar <- 14 - titanic17$df.residual deviance(titanic17) + 2*npar*n/(n - npar - 1) summary(Survived ~ Economic.status*Gender, method="cross", fun=sum) summary(At.risk ~ Economic.status*Gender, method="cross", fun=sum) summary(Survived ~ Economic.status*Age.group, method="cross", fun=sum) summary(At.risk ~ Economic.status*Age.group, method="cross", fun=sum) # # Section 9.4 # attach(beetles) beetlog <- glm(Killed/Total ~ Log.dose, family=binomial, weights=Total) # # Probit regression # beetprob <- glm(Killed/Total ~ Log.dose, family=binomial(probit), weights=Total) # # Cumulative log-log regression # beetcloglog <- glm(Killed/Total ~ Log.dose, family=binomial(cloglog), weights=Total) # # Figure 9.12 # # dgumbelmin() is the density function for the minimum value version of the extreme value distribution; # see file functions.s. That file also includes a function for the cumulative distribution function, and # corresponding functions for the maximum value version of the extreme value distribution. # plot(c(1500:1900)/1000, dgumbelmin(c(1500:1900)/1000, location=-coef(beetcloglog)[1]/coef(beetcloglog)[2], scale=1/coef(beetcloglog)[2]), type="l", xlab="Log dose of carbon disulphide", ylab="Density") # # Location alpha for extreme value distribution # -coef(beetcloglog)[1]/coef(beetcloglog)[2] # # Scale gamma for extreme value distribution # 1/coef(beetcloglog)[2] # # Median lethal dose # -coef(beetcloglog)[1]/coef(beetcloglog)[2] + log(log(2))/coef(beetcloglog)[2] # # Mean tolerance # -coef(beetcloglog)[1]/coef(beetcloglog)[2] + digamma(1)/coef(beetcloglog)[2] # # Section 9.5 # # Week 1 attendance not used here; shows with other missing data are omitted # broadway2 <- na.omit(broadway[,c(1:7)]) attach(broadway2) Nomavail <- rep(6, 92) Musical <- as.numeric(Type=="Musical") - as.numeric(Type=="Play") Musical.revue <- as.numeric(Type=="Musical revue") - as.numeric(Type=="Play") bwaylog <- glm(Tony.nominations/Nomavail ~ Musical + Musical.revue + NYT.rating + DN.rating, family=binomial, weights=Nomavail) summary(bwaylog) # # Overdispersion inflation factor # sum(residuals(bwaylog,type="pearson")^2)/bwaylog$df.residual # # Quasi-likelihood output in S-PLUS # summary(glm(Tony.nominations/Nomavail ~ Musical + Musical.revue + NYT.rating + DN.rating, family=quasi(link="logit", var="mu(1-mu)"), weights=Nomavail)) # # Quasi-likelihood output in R (note the special quasibinomial family) # summary(glm(Tony.nominations/Nomavail ~ Musical + Musical.revue + NYT.rating + DN.rating, family=quasibinomial, weights=Nomavail)) # # Robust sandwich estimator # W1 <- diag(Nomavail*fitted(bwaylog)*(1 - fitted(bwaylog))) X <- cbind(rep(1,92), Musical, Musical.revue, NYT.rating, DN.rating) W2 <- diag((Nomavail*resid(bwaylog, type="response"))^2) robust <- solve(t(X)%*%W1%*%X)%*%(t(X)%*%W2%*%X)%*%solve(t(X)%*%W1%*%X) sqrt(diag(robust)) coef(bwaylog)/sqrt(diag(robust)) 2*(1-pnorm(abs(coef(bwaylog)/sqrt(diag(robust))))) # # The gnlr() function fits generalized nonlinear regression models using various distributions, # including the beta-binomial. It is available in the gnlm package; see http://www.luc.ac.be/~jlindsey/rcode.html. # Note that this package is ONLY available in R. # library(gnlm) # # The response function is defined separately using the logistic form # bwaymu <- function(beta){exp(beta[1]+beta[2]*Musical+beta[3]*Musical.revue+beta[4]*NYT.rating+beta[5]*DN.rating)/ (1+exp(beta[1]+beta[2]*Musical+beta[3]*Musical.revue+beta[4]*NYT.rating+beta[5]*DN.rating))} # # The response is a matrix of (Successes, Failures), pmu is a vector of initial values for the betas, and pshape # is an initial value for the overdispersion parameter. # gnlr(y=cbind(Tony.nominations, Nomavail-Tony.nominations), distribution="beta binomial", mu=bwaymu, pmu=c(-3,1,-.5,.5,.5), pshape=1) # # Section 9.6 # # This can only be done in S-PLUS, not R # # Form of smoking data to allow numerical fits with age group # smoklogit <- data.frame(rbind(t(smoking[1,,]),t(smoking[2,,])), Smoker=rep(c(1,0),each=7), Age=rep(c(21,30,40,50,60,70,80),2)) attach(smoklogit) n <- sum(Alive + Dead) smoklog1 <- glm(cbind(Alive,Dead) ~ Smoker + factor(Age), family=binomial, maxit=500) deviance(smoklog1) + 2*(14 - smoklog1$df.residual)*n/(n - 15 + smoklog1$df.residual) smoklog2 <- glm(cbind(Alive,Dead) ~ Smoker + Age, family=binomial, maxit=500) deviance(smoklog2) + 2*(14 - smoklog2$df.residual)*n/(n - 15 + smoklog2$df.residual) # # Determining AICc value for generalized additive model # for (i in 50:100){ temp <- gam(cbind(Alive,Dead) ~ Smoker + lo(Age,span=i/100,degree=1), family=binomial, maxit=500) print (c(i/100,deviance(temp) + 2*(14 - temp$df.residual)*n/(n - 15 + temp$df.residual)))} smoklog3 <- gam(cbind(Alive,Dead) ~ Smoker + lo(Age,span=.65,degree=1), family=binomial, maxit=500) coef(smoklog3) Agesq <- Age*Age smoklog4 <- glm(cbind(Alive,Dead) ~ Smoker + Age + Agesq, family=binomial, maxit=500) deviance(smoklog4) + 2*(14 - smoklog4$df.residual)*n/(n - 15 + smoklog4$df.residual) Pre60 <- as.numeric(Age < 60) - as.numeric(Age >= 60) smoklog5 <- glm(cbind(Alive,Dead) ~ Smoker + Age + Pre60 + Pre60*Age, family=binomial, maxit=500) deviance(smoklog5) + 2*(14 - smoklog5$df.residual)*n/(n - 15 + smoklog5$df.residual) # # Chapter 10 # # Section 10.1 # attach(authorship) # # Function multinom() fits a nominal logistic regression. It requires loading the MASS and nnet libraries; # see http://www.stats.ox.ac.uk/pub/MASS4/ # # The response for each observation is a multinomial vector. This can be represented in two ways: # (1) If there is a single trial for each observation, the actual value observed can be the # the response. The responses together form a factor. # (2) If there are multiple trials for observations, each observation has a set of K counts # corresponding to the number of outcomes at each level. The responses together form a # matrix. # library(nnet) library(MASS) # # Making Shakespeare first alphabetically makes it the baseline category # Auth <- as.vector(Author) Auth[Auth=="Shakespeare"] <- "AShakes" Auth <- as.factor(Auth) authnom1 <- multinom(Auth ~ be + been + had + it + may + not + on + the + upon + was + which, data=authorship, maxit=200) summary(authnom1) authnom2 <- multinom(Auth ~ be + had + it + not + on + the + upon + was + which, data=authorship, maxit=200) summary(authnom2) # # Classification table # table(Auth,predict(authnom2)) # # Figure 10.2 # authnom3 <- multinom(Auth ~ been, data=authorship, maxit=200) beendata <- cbind(1,c(0:270)/10) # # Getting logits, and then probabilities, based on a grid of values for "been" and fitted coefficients # austenlogit <- beendata %*% coef(authnom3)[1,] londonlogit <- beendata %*% coef(authnom3)[2,] miltonlogit <- beendata %*% coef(authnom3)[3,] austenprob <- exp(austenlogit)/(exp(austenlogit) + exp(londonlogit) + exp(miltonlogit) + 1) londonprob <- exp(londonlogit)/(exp(austenlogit) + exp(londonlogit) + exp(miltonlogit) + 1) miltonprob <- exp(miltonlogit)/(exp(austenlogit) + exp(londonlogit) + exp(miltonlogit) + 1) shakespearprob <- 1 - austenprob - londonprob - miltonprob plot(beendata[,2], austenprob, xlab="Count of 'been'", ylab="Probability", ylim=c(0,1), type="l") lines(beendata[,2], londonprob, lty=2) lines(beendata[,2], miltonprob, lty=3) lines(beendata[,2], shakespearprob, lty=4) # # Section 10.2 # attach(asbestos) # # Function lrm() fits a proportional odds model. The design and hmisc libraries must be loaded first. # The polr() function in the MASS library also fits the proportional odds model. The Exposure # variable is set up to be in the right order, but factors can be set to be ordered using the # ordered() function. # library(design) library(hmisc) asb1 <- lrm(Exposure ~ Duration + Task + Ventilation) asb2 <- lrm(Exposure ~ Task + Ventilation) asb3 <- lrm(Exposure ~ Task + Ventilation + Task*Ventilation) # # Form grid of input values to get estimated probabilties # new.data <- expand.grid(Task=c("Insulation","Tile"),Ventilation=c("General","Negative pressure")) probs <- predict.lrm(asb2,new.data,type="fitted.ind") cbind(new.data,probs) # # Chi-squared goodness-of-fit tests # expasb <- array(probs * as.vector(table(Task,Ventilation)),c(2,2,3)) obsasb <- table(Task,Ventilation,Exposure) pearson <- sum((obsasb-expasb)^2/expasb) 1-pchisq(pearson,4) dev <- 2*sum(obsasb*log(obsasb/expasb)) 1-pchisq(dev,4) # # The vglm() function in the VGAM library will fit general cumulative probability models # library(VGAM) # Proportional odds model # vglm(Exposure ~ Task + Ventilation, cumulative(link=logit, parallel=T)) # # Cumulative logit model without the constant slopes assumption # vglm(Exposure ~ Task + Ventilation, cumulative(link=logit)) # # Cumulative probit model # vglm(Exposure ~ Task + Ventilation, cumulative(link=probit, parallel=T)) # # Proportional hazards model # vglm(Exposure ~ Task + Ventilation, cumulative(link=cloglog, parallel=T)) # # vglm() will also fit other multinomial response models. # # Multinomial logistic regression for authorship data # vglm(Auth ~ be + been + had + it + may + not + on + the + upon + was + which, family=multinomial, authorship) # # Adjacent categories model # vglm(Exposure ~ Task + Ventilation, acat(link=loge, parallel=T)) # # Continuation ratio logit model; VGAM calls these "stop ratios". # The design library includes a function cr.setup() that sets up the data so # that the continuation ratio logit model can be fit using lrm() # vglm(Exposure ~ Task + Ventilation, sratio(link=logit, parallel=T)) # # Continuation ratio probit model # vglm(Exposure ~ Task + Ventilation, sratio(link=probit, parallel=T)) # # Continuation ratio model with cumulative log-log link is equivalent to the proportional hazards model # vglm(Exposure ~ Task + Ventilation, sratio(link=cloglog, parallel=T)) # # It is possible to use glm() directly to fit the continuation ratio logit model; see the material prepared # by Laura Thompson at http://math.cl.uh.edu/~thompsonla#CDA for details on this, as well as many hints # for using S-PLUS/R to fit categorical data models. Brett Presnell has prepared material related to using # R for categorical data analysis that is available at http://www.stat.ufl.edu/~presnell/Teaching/sta4504-2000sp/R/. #