#
# This is the code used to construct the figures and output in the book "Handbook of Regression
# Analysis with Applications in R, Second Edition" by Samprit Chatterjee and Jeffrey S. Simonoff,
# published by John Wiley and Sons in 2020 as part of their Series in Probability and Statistics.
# Note that only the code used to produce the output and figures in the book is included here; this
# should not be taken to necessarily represent all of the analyses that would be done in a complete
# analysis of the data. Although the code has been tested, I cannot guarantee that it is error-free;
# it should be viewed as being available at the user's risk.
#
# All packages used are available on CRAN (https://cran.r-project.org/). The following packages are
# used: bestglm, boot, car, caret, data.table, doBy, eha, FAdist, fivethirtyeight, gam, gee, ggplot2,
# glmnet, lattice, leaps, lme4, lmtest, locfit, LTRCtrees, MASS, multcomp, nnet, party, partykit,
# REEMtree, ResourceSelection, rms, robustHD, rpart, scatterplot3d, survival, tseries. These packages
# might require the installation of other packages to install and/or run correctly; if this is the
# case the code below will return an error of the form "there is no package called 'xxxxx'".
#
# Once loaded, a package is assumed to remain loaded for all analyses in that chapter; if it is
# needed in a later chapter, a new library() command is given.
#
# All analyses were performed using R version 3.6.2, although virtually all should give identical
# results in earlier versions of the software.
#
# Chapter 1
#
# Figure 1.1
#
set.seed(100)
x = runif(30,0,10)
x[28] = 8.5
y = 10 + 2*x + rnorm(30)*3
#
# pch = 1 open circle, 2 triangle, 3 plus, 4 x, 5 diamond, 6 inverted triangle; 19 solid circle
#
plot(x, y, pch=19)
abline(a=10, b=2)
text(5.8, 18, expression(E(y) == beta[0] + beta[1]*x), pos=4)
arrows(6, 18.8, 5.9, 21.8, length=.15)
segments(x, y, x, 10+2*x, lty=3)
dev.off()
#
# Figure 1.2
#
plot(x, y, pch=19)
abline(a=10, b=2, col="grey")
regxy = lm(y~x)
abline(regxy)
text(6.6, 18, expression(hat(E)(y) == hat(beta)[0] + hat(beta)[1]*x), pos=4)
arrows(7.4, 18.8, 7.4, coef(regxy)%*%c(1,7.4), length=.15)
segments(x, y, x, fitted(regxy), lty=3)
dev.off()
#
# Figure 1.3
#
library(scatterplot3d)
set.seed(100)
x1 = runif(30,0,10)
x2 = runif(30,0,10)
y = 10 + 2*x1 + 2*x2 + rnorm(30)*3
lin3d = scatterplot3d(x1, x2, y, pch = 19, xlab=expression(x[1]), ylab=expression(x[2]), box=F)
regx1x2y = lm(y~x1+x2)
lin3d$plane3d(regx1x2y, lty="solid")
orig = lin3d$xyz.convert(x1, x2, y)
plane = lin3d$xyz.convert(x1, x2, fitted(regx1x2y))
i.negpos = 1 + (resid(regx1x2y) > 0)
segments(orig$x, orig$y, plane$x, plane$y, lty = (2:1)[i.negpos])
dev.off()
#
# Figure 1.4
#
# Once a data file is read into R it is assumed to be available in the workspace in all further analyses
#
# Levittown home prices analyses
#
homeprices = read.csv("homeprices.csv")
par(mfrow=c(3,2))
plot(homeprices$Bedrooms, homeprices$Sale.price, xlab="Number of bedrooms", ylab="Sale price", pch = 19)
plot(homeprices$Bathrooms, homeprices$Sale.price, xlab="Number of bathrooms", ylab="Sale price", pch = 19)
plot(homeprices$Living.area, homeprices$Sale.price, xlab="Living area", ylab="Sale price", pch = 19)
plot(homeprices$Lot.size, homeprices$Sale.price, xlab="Lot size", ylab="Sale price", pch = 19)
plot(homeprices$Year.built, homeprices$Sale.price, xlab="Year built", ylab="Sale price", pch = 19)
plot(homeprices$Property.tax, homeprices$Sale.price, xlab="Property taxes", ylab="Sale price", pch = 19)
dev.off()
#
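# (Not from the book) As a quick check on what lm() computes, the least squares estimates can also be
# obtained directly from the normal equations, betahat = (X'X)^(-1)X'y; a minimal sketch using the
# simulated data from Figure 1.3, which should reproduce coef(regx1x2y):
#    X = cbind(1, x1, x2)
#    solve(t(X) %*% X, t(X) %*% y)
#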
# Fitted regression
#
# Once an object is created by R (such as when using the lm() command) it is assumed to be available in
# the workspace in all further analyses
#
homeprice.lm = lm(Sale.price ~ Bedrooms+Bathrooms+Living.area+Lot.size+Year.built+Property.tax,
   data=homeprices)
summary(homeprice.lm)
#
# Confidence interval for conditional mean and prediction interval
#
new.homes = data.frame(Bedrooms=homeprices$Bedrooms.new[1:20], Bathrooms=homeprices$Bathrooms.new[1:20],
   Living.area=homeprices$Living.area.new[1:20], Lot.size=homeprices$Lot.size.new[1:20],
   Year.built=homeprices$Year.built.new[1:20], Property.tax=homeprices$Property.tax.new[1:20],
   Sale.price=homeprices$Sale.price.new[1:20])
predict(homeprice.lm, new.homes[1,], interval=c("p"))
predict(homeprice.lm, new.homes[1,], interval=c("c"))
predict(homeprice.lm, new.homes[1,], interval=c("p"), level=.5)
#
# Figure 1.5
#
par(mfrow=c(1,2))
plot(fitted(homeprice.lm), resid(homeprice.lm), xlab="Fitted values", ylab="Residuals", main="(a)", pch = 19)
qqnorm(resid(homeprice.lm), main="(b)", pch = 19)
dev.off()
#
# Figure 1.6
#
par(mfrow=c(3,2))
plot(homeprices$Bedrooms, resid(homeprice.lm), xlab="Number of bedrooms", ylab="Residuals", pch = 19)
plot(homeprices$Bathrooms, resid(homeprice.lm), xlab="Number of bathrooms", ylab="Residuals", pch = 19)
plot(homeprices$Living.area, resid(homeprice.lm), xlab="Living area", ylab="Residuals", pch = 19)
plot(homeprices$Lot.size, resid(homeprice.lm), xlab="Lot size", ylab="Residuals", pch = 19)
plot(homeprices$Year.built, resid(homeprice.lm), xlab="Year built", ylab="Residuals", pch = 19)
plot(homeprices$Property.tax, resid(homeprice.lm), xlab="Property taxes", ylab="Residuals", pch = 19)
dev.off()
#
# Chapter 2
#
# Figure 2.1
#
library(scatterplot3d)
par(mfrow=c(2,1))
set.seed(101)
x1 = rnorm(30)
x2 = rnorm(30)
x2 = .999*x1 + sqrt(1 - .999^2)*x2
x1 = 5 + 2.5*x1
x2 = 5 + 2.5*x2
y = 10 + 2*x1 + 2*x2 + rnorm(30)*3
x115 = x1[15]
x1[15] = 5
lin3d = scatterplot3d(x1, x2, y, pch = 19, xlab=expression(x[1]), ylab=expression(x[2]), type="h", box=F,
   color=c(rep(1,14),5,rep(1,15)))
collinx1x2y = lm(y~x1+x2)
collinx1x2y
lin3d$plane3d(collinx1x2y, lty="solid")
x1[15] = x115
x2[15] = 5
lin3d = scatterplot3d(x1, x2, y, pch = 19, xlab=expression(x[1]), ylab=expression(x[2]), type="h", box=F,
   color=c(rep(1,14),5,rep(1,15)))
collinx1x2y = lm(y~x1+x2)
collinx1x2y
lin3d$plane3d(collinx1x2y, lty="solid")
dev.off()
#
# Variance inflation factors. Output from R is edited to add the values. The vif() function in the HH
# package has similar functionality.
#
library(car)
vif(homeprice.lm)
#
# Best subsets regression. Output from R is used to construct the output given in the book.
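#
# (Not from the book) leaps() returns its results as a list whose $which component flags the predictors
# in each subset and whose $Cp component gives Mallows' Cp (its default criterion), so the subsets can
# also be tabulated and ranked directly, e.g.
#    best.sub = leaps(homeprices[,1:6], homeprices[,7])
#    cbind(best.sub$which, Cp=best.sub$Cp)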
#
library(leaps)
leaps(homeprices[,1:6], homeprices[,7])
sigmahat = c(52576,54932,61828,48091,51397,51870,47092,47635,48346,46885,47304,47380,47094,47162,47599,47381)
npred = c(rep(c(1:5),each=3),6)
nobs = 85
aic = nobs*log(sigmahat^2) + nobs*log((nobs-npred-1)/nobs) + 2.*npred + 2
aic
homeprice.lm4 = lm(Sale.price ~ Bedrooms+Bathrooms+Living.area+Year.built, data=homeprices)
summary(homeprice.lm4)
vif(homeprice.lm4)
homeprice.lm3 = lm(Sale.price ~ Bathrooms+Living.area+Year.built, data=homeprices)
summary(homeprice.lm3)
vif(homeprice.lm3)
#
# Figure 2.2
#
par(mfrow=c(1,2))
plot(fitted(homeprice.lm3), resid(homeprice.lm3), xlab="Fitted values", ylab="Residuals", main="(a)", pch = 19)
qqnorm(resid(homeprice.lm3), main="(b)", pch = 19)
dev.off()
homeprice.pred = predict(homeprice.lm3, new.homes, interval=c("p"))
#
# Figure 2.3
#
plot(homeprice.pred[,1], new.homes$Sale.price, ylim=c(100000,650000), xlab="Predicted sale price",
   ylab="Observed sale price", pch=19)
abline(0, 1, lty="dotted")
points(homeprice.pred[,1], homeprice.pred[,2], pch="_")
points(homeprice.pred[,1], homeprice.pred[,3], pch="_")
dev.off()
new.homes$Sale.price-homeprice.pred[,1]
mean(new.homes$Sale.price)
mean(homeprice.pred[,1]-new.homes$Sale.price)
mean(abs(homeprice.pred[,1]-new.homes$Sale.price))
sd(homeprice.pred[,1]-new.homes$Sale.price)
mean((homeprice.pred[,1]-new.homes$Sale.price)[-c(13,16)])
mean(abs((homeprice.pred[,1]-new.homes$Sale.price)[-c(13,16)]))
sd((homeprice.pred[,1]-new.homes$Sale.price)[-c(13,16)])
#
# Electronic voting in the 2004 presidential election analyses
#
election2004 = read.csv("election2004.csv")
#
# Figure 2.4
#
par(mfrow=c(1,2))
plot(election2004$Bush.pct.2000, election2004$Change.in.Bush.pct, xlab="2000 Bush pct.",
   ylab="Change in voting pct.", pch = 19, main="(a)")
boxplot(election2004$Change.in.Bush.pct~election2004$e.Voting, xlab="2004 electronic voting",
   ylab="Change in voting pct.", main="(b)")
dev.off()
election.pooled = lm(Change.in.Bush.pct~Bush.pct.2000, data=election2004)
summary(election.pooled)
election.cshift = lm(Change.in.Bush.pct~Bush.pct.2000+e.Voting, data=election2004)
summary(election.cshift)
vif(election.cshift)
Bush.2000Xe.Voting = election2004$Bush.pct.2000 * election2004$e.Voting
election2004 = data.frame(election2004, Bush.2000Xe.Voting)
election.full = lm(Change.in.Bush.pct~Bush.pct.2000+e.Voting+Bush.2000Xe.Voting, data=election2004)
summary(election.full)
vif(election.full)
#
# Partial F-test for full versus pooled model
#
anova(election.pooled, election.full)
#
# Figure 2.5
#
plot(election2004$Bush.pct.2000[election2004$e.Voting==0],
   election2004$Change.in.Bush.pct[election2004$e.Voting==0], xlab="2000 Bush percentage",
   ylab="Change in voting percentage", pch = 3)
points(election2004$Bush.pct.2000[election2004$e.Voting==1],
   election2004$Change.in.Bush.pct[election2004$e.Voting==1])
abline(-5.239, .162, lty="dotted")
abline(4.434, -.038, lty="dashed")
legend(35, 10.5, c("e-Voting", "No e-Voting"), pch=c(1, 3), lty=c("dashed","dotted"))
dev.off()
#
# Figure 2.6
#
par(mfrow=c(2,2))
plot(election2004$Voter.turnout.2000, election2004$Change.in.Bush.pct, xlab="2000 Turnout",
   ylab="Change in voting pct.", pch = 19, main="(a)")
plot(election2004$Voter.turnout.2004, election2004$Change.in.Bush.pct, xlab="2004 Turnout",
   ylab="Change in voting pct.", pch = 19, main="(b)")
plot(election2004$Median.income, election2004$Change.in.Bush.pct, xlab="Median income",
   ylab="Change in voting pct.", pch = 19, main="(c)")
plot(election2004$Hispanic.population.pct, election2004$Change.in.Bush.pct, xlab="Hispanic pct.",
   ylab="Change in voting pct.", pch = 19, main="(d)")
dev.off()
election.full.control = lm(Change.in.Bush.pct~Bush.pct.2000+e.Voting+Bush.2000Xe.Voting+Voter.turnout.2000+
   Voter.turnout.2004+Median.income+Hispanic.population.pct, data=election2004)
summary(election.full.control)
vif(election.full.control)
election2004 = data.frame(election2004, Change.in.turnout=election2004$Voter.turnout.2004-
   election2004$Voter.turnout.2000)
election.full.control2 = lm(Change.in.Bush.pct~Bush.pct.2000+e.Voting+Bush.2000Xe.Voting+Change.in.turnout+
   Median.income+Hispanic.population.pct, data=election2004)
summary(election.full.control2)
vif(election.full.control2)
anova(election.full.control2, election.full.control)
#
# Figure 2.7
#
par(mfrow=c(3,3))
plot(fitted(election.full.control2), resid(election.full.control2), xlab="Fitted values", ylab="Residuals",
   pch = 19)
qqnorm(resid(election.full.control2), pch = 19)
plot(election2004$Bush.pct.2000, resid(election.full.control2), xlab="2000 Bush pct.", ylab="Residuals",
   pch = 19)
boxplot(resid(election.full.control2)~election2004$e.Voting, xlab="2004 electronic voting", ylab="Residuals")
plot(election2004$Change.in.turnout, resid(election.full.control2), xlab="Change in turnout",
   ylab="Residuals", pch = 19)
plot(election2004$Median.income, resid(election.full.control2), xlab="Median income", ylab="Residuals",
   pch = 19)
plot(election2004$Hispanic.population.pct, resid(election.full.control2), xlab="Hispanic pct.",
   ylab="Residuals", pch = 19)
dev.off()
#
# Chapter 3
#
# Figure 3.1
#
set.seed(103)
x = rnorm(99)
z = rnorm(99)
y = .8*x + sqrt(.36)*z
x = x+5
y = y+5
x1 = c(x, 15)
y1 = c(y, 3.5)
plot(x1, y1, pch = 19, xlab="x", ylab="y")
abline(lsfit(x1, y1))
abline(lsfit(x, y), lty=2)
text(14.6,3.5,"A")
dev.off()
#
# Figure 3.2
#
par(mfrow=c(2,1))
x1 = c(x,15)
y1 = c(y, 1 + .85*15)
plot(x1, y1, pch = 19, xlab="x", ylab="y")
abline(lsfit(x1, y1))
abline(lsfit(x, y), lty=2)
x1 = c(x,5)
y1 = c(y, 8.5 + .85*5)
plot(x1, y1, pch = 19, xlab="x", ylab="y")
abline(lsfit(x1, y1))
abline(lsfit(x, y), lty=2)
dev.off()
#
# Figure 3.3
#
sresprice1 = rstandard(homeprice.lm3)
hatprice1 = hatvalues(homeprice.lm3)
cookprice1 = cooks.distance(homeprice.lm3)
par(mfrow=c(3,1))
plot(c(1:85), sresprice1, ylim=c(-2.5,2.5), xlab="Index", ylab="Std. residuals", pch=19,
   main="Standardized residuals")
abline(h=2.5, lty=3)
abline(h=-2.5, lty=3)
plot(c(1:85), hatprice1, xlab="Index", ylab="Leverage", pch=19, main="Diagonal elements of the hat matrix")
abline(h=2.5*4/85, lty=3)
plot(c(1:85), cookprice1, xlab="Index", ylab="Cook's D", pch=19, main="Cook's distances")
dev.off()
#
# Figure 3.4
#
plot(hatprice1, cookprice1, pch = 19, xlab="Leverage values", ylab="Cook's distances")
dev.off()
#
# Model selection and regression fit with unusual observations omitted
#
homeprices2 = homeprices[-c(4,11,29,48,71),]
library(leaps)
leaps(homeprices2[,1:6], homeprices2[,7])
homeprice.lm3.out = lm(Sale.price ~ Bathrooms+Living.area+Year.built, data=homeprices2)
summary(homeprice.lm3.out)
library(car)
vif(homeprice.lm3.out)
homeprice.lm2.out = lm(Sale.price ~ Bathrooms+Living.area, data=homeprices2)
summary(homeprice.lm2.out)
vif(homeprice.lm2.out)
sresprice2 = rstandard(homeprice.lm2.out)
hatprice2 = hatvalues(homeprice.lm2.out)
cookprice2 = cooks.distance(homeprice.lm2.out)
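#
# (Not from the book) The observations flagged in these index plots can also be listed directly; a
# minimal sketch using the guideline cutoffs drawn in Figure 3.5 (2.5 for standardized residuals and
# 2.5*4/80 for leverage):
#    which(abs(sresprice2) > 2.5)
#    which(hatprice2 > 2.5*4/80)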
residuals", pch=19, main="Standardized residuals") abline(h=2.5, lty=3) abline(h=-2.5, lty=3) plot(c(1:80), hatprice2, ylim=c(0,.15), xlab="Index", ylab="Leverage", pch=19, main="Diagonal elements of the hat matrix") abline(h=2.5*4/80, lty=3) plot(c(1:80), cookprice2, xlab="Index", ylab="Cook's D", pch=19, main="Cook's distances") dev.off() # # Chapter 4 # # Predicting movie grosses analyses # movies2009 = read.csv("movies2009.csv") # # Figure 4.1 # par(mfrow=c(2,2)) plot(movies2009$Opening, movies2009$Total.Gross, xlab="Opening weekend gross", ylab="Total domestic gross", pch = 19) plot(movies2009$Screens, movies2009$Total.Gross, xlab="Opening screens", ylab="Total domestic gross", pch = 19) plot(movies2009$Budget, movies2009$Total.Gross, xlab="Production budget", ylab="Total domestic gross", pch = 19) plot(movies2009$RT, movies2009$Total.Gross, xlab="Rotten Tomatoes rating", ylab="Total domestic gross", pch = 19) dev.off() Log.domestic.gross = log10(movies2009$Total.Gross) Log.opening.gross = log10(movies2009$Opening) Log.budget = log10(movies2009$Budget) movies2009 = data.frame(movies2009, Log.domestic.gross, Log.opening.gross, Log.budget) # # Figure 4.2 # par(mfrow=c(2,2)) plot(movies2009$Log.opening.gross, movies2009$Log.domestic.gross, xlab="Logged opening weekend gross", ylab="Logged total domestic gross", pch = 19) plot(movies2009$Screens, movies2009$Log.domestic.gross, xlab="Opening screens", ylab="Logged total domestic gross", pch = 19) plot(movies2009$Log.budget, movies2009$Log.domestic.gross, xlab="Logged production budget", ylab="Logged total domestic gross", pch = 19) plot(movies2009$RT, movies2009$Log.domestic.gross, xlab="Rotten Tomatoes rating", ylab="Logged total domestic gross", pch = 19) dev.off() # # Regression fit, model selection, and diagnostics # movies.lm1 = lm(Log.domestic.gross ~ Log.opening.gross + Screens + Log.budget + RT, data=movies2009) summary(movies.lm1) library(car) vif(movies.lm1) library(leaps) # # Budget values are missing for 9 observations, and leaps() does not allow missing values (as that would result in # different models being based on different sets of observations), so those observations are omitted in the # call. # leaps(movies2009[-c(9,21,36,43,58,78,102,108,117),c(12,4,13,5)], movies2009[-c(9,21,36,43,58,78,102,108,117),11]) movies.lm2 = lm(Log.domestic.gross ~ Log.opening.gross + Log.budget + RT, data=movies2009) summary(movies.lm2) vif(movies.lm2) sresmovie2 = rstandard(movies.lm2) hatmovie2 = hatvalues(movies.lm2) cookmovie2 = cooks.distance(movies.lm2) # # Figure 4.3 # par(mfrow=c(3,1)) plot(c(1:120), sresmovie2, ylim=c(-2.5,2.5), xlab="Index", ylab="Std. residuals", pch=19, main="Standardized residuals") abline(h=2.5, lty=3) abline(h=-2.5, lty=3) plot(c(1:120), hatmovie2, ylim=c(0,.45), xlab="Index", ylab="Leverage", pch=19, main="Diagonal elements of the hat matrix") abline(h=2.5*4/120, lty=3) plot(c(1:120), cookmovie2, xlab="Index", ylab="Cook's D", pch=19, main="Cook's distances") dev.off() # # Omitting "The Last House on the Left" # leaps(movies2009[-c(9,21,36,43,58,78,102,106,108,117),c(12,4,13,5)], movies2009[-c(9,21,36,43,58,78,102,106,108,117),11]) movies.lm3 = lm(Log.domestic.gross ~ Log.opening.gross + Log.budget + RT, data=movies2009[-106,]) summary(movies.lm3) vif(movies.lm3) sresmovie3 = rstandard(movies.lm3) hatmovie3 = hatvalues(movies.lm3) cookmovie3 = cooks.distance(movies.lm3) # # Figure 4.4 # par(mfrow=c(3,1)) plot(c(1:119), sresmovie3, ylim=c(-2.5,2.5), xlab="Index", ylab="Std. 
residuals", pch=19, main="Standardized residuals") abline(h=2.5, lty=3) abline(h=-2.5, lty=3) plot(c(1:119), hatmovie3, xlab="Index", ylab="Leverage", pch=19, main="Diagonal elements of the hat matrix") abline(h=2.5*4/119, lty=3) plot(c(1:119), cookmovie3, xlab="Index", ylab="Cook's D", pch=19, main="Cook's distances") dev.off() # # Figure 4.5 # par(mfrow=c(3,2)) plot(fitted(movies.lm3), sresmovie3, xlab="Fitted values", ylab="Std. residuals", pch = 19) qqnorm(sresmovie3, pch = 19) plot(movies2009$Log.opening.gross[-c(9,21,36,43,58,78,102,106,108,117)], sresmovie3, xlab="Logged opening weekend gross", ylab="Std. residuals", pch = 19) plot(movies2009$Log.budget[-c(9,21,36,43,58,78,102,106,108,117)], sresmovie3, xlab="Logged production budget", ylab="Std. residuals", pch = 19) plot(movies2009$RT[-c(9,21,36,43,58,78,102,106,108,117)], sresmovie3, xlab="Rotten Tomatoes rating", ylab="Std. residuals", pch = 19) dev.off() # # Validation on 2010 movies, producing Table 4.1 # movies2010 = read.csv("movies2010.csv") Log.domestic.gross = log10(movies2010$Total.Gross) Log.opening.gross = log10(movies2010$Opening) Log.budget = log10(movies2010$Budget) movies2010 = data.frame(movies2010, Log.domestic.gross, Log.opening.gross, Log.budget) movievalidate = predict(movies.lm3, movies2010, interval=c("p")) data.frame(movies2010$Movie,10^movievalidate[,2],movies2010$Total.Gross,10^movievalidate[,3]) # # Chapter 5 # # Figure 5.1 # library(lattice) rho = rep(c(-90:90)/100, 6) lambda = rep(c(0,1,3,5,7,9)/10, each=181) efficiency = (1+rho*lambda)*(1+rho^2-2*rho*lambda)/((1-rho^2)*(1-rho*lambda)) xyplot(efficiency ~ rho | lambda, type="l", col=1, xlab=as.expression(substitute(paste(rho))), strip = strip.custom(var.name = as.expression(substitute(paste(lambda))), strip.levels=c(TRUE,TRUE)), panel=function(...){ panel.xyplot(...) panel.abline(v=0) panel.abline(h=1)}, ylab="Inefficiency of OLS estimates") dev.off() # # Figure 5.2 # bias = ((1-rho*lambda)/(1+rho*lambda) - 1)*100 xyplot(bias ~ rho | lambda, type="l", col=1, xlab=as.expression(substitute(paste(rho))), strip = strip.custom(var.name = as.expression(substitute(paste(lambda))), strip.levels=c(TRUE,TRUE)), panel=function(...){ panel.xyplot(...) panel.abline(v=0) panel.abline(h=0) panel.abline(h=-100,lty=2)}, ylab="Percentage bias of estimated variance") dev.off() # # e-Commerce retails sales analyses # ecommerce = read.csv("ecommerce.csv") # # Figure 5.3 # Year = seq(1999.9167, 2011.1667, .25) plot(Year, ecommerce$e.Commerce.sales, type="l", ylab="e-Commerce retail sales") dev.off() # # Figure 5.4 # plot(ecommerce$Total.sales, ecommerce$e.Commerce.sales, ylab="e-Commerce retail sales", xlab="Total retail sales", pch=19) dev.off() # # Regression fit and diagnostic tests and plots # ecommerce.lm1 = lm(e.Commerce.sales ~ Total.sales, data=ecommerce) summary(ecommerce.lm1) # # Durbin-Watson statistic (not reported in the text) # library(lmtest) dwtest(ecommerce.lm1) # # The acf() function plots the ACF function by default, but includes the value at lag 0, which # for the autocorrelation is by definition equal to 1. This provides no information and can distort # the plot if observed positive correlations are not close to 1. Setting the first value to NA and # plotting the ACF object separately addresses this problem. 
#
resid.acf = acf(resid(ecommerce.lm1), plot=FALSE)
resid.acf$acf[1] = NA
rdiag = ls.diag(ecommerce.lm1)
#
# Figure 5.5
#
par(mfrow=c(1,2))
plot(Year, rdiag$std.res, ylab="Standardized residuals", main="(a)", pch = 19)
plot(resid.acf, main="(b)")
dev.off()
#
# Adding quarterly indicators and then recession indicator
#
Quarter = c("Q4",rep(c("Q1","Q2","Q3","Q4"),11),"Q1")
#
# Figure 5.6
#
boxplot(rdiag$std.res~Quarter, ylab="Standardized residuals")
dev.off()
ecommerce.lm2 = lm(e.Commerce.sales ~ Total.sales+Quarter.1+Quarter.2+Quarter.3, data=ecommerce)
summary(ecommerce.lm2)
anova(ecommerce.lm1, ecommerce.lm2)
dwtest(ecommerce.lm2)
resid.acf = acf(resid(ecommerce.lm2), plot=FALSE)
resid.acf$acf[1] = NA
rdiag = ls.diag(ecommerce.lm2)
#
# Figure 5.7
#
par(mfrow=c(1,2))
plot(Year, rdiag$std.res, ylab="Standardized residuals", main="(a)", pch = 19)
plot(resid.acf, main="(b)")
dev.off()
ecommerce.lm3 = lm(e.Commerce.sales ~ Total.sales+Quarter.1+Quarter.2+Quarter.3+Recession, data=ecommerce)
summary(ecommerce.lm3)
dwtest(ecommerce.lm3)
resid.acf = acf(resid(ecommerce.lm3), plot=FALSE)
resid.acf$acf[1] = NA
rdiag = ls.diag(ecommerce.lm3)
#
# Figure 5.8
#
par(mfrow=c(3,2))
plot(fitted(ecommerce.lm3), rdiag$std.res, xlab="Fitted values", ylab="Standardized residuals", main="(a)",
   pch = 19)
plot(Year, rdiag$std.res, ylab="Standardized residuals", main="(b)", pch = 19)
qqnorm(rdiag$std.res, main="(c)", pch = 19)
plot(resid.acf, main="(d)")
boxplot(rdiag$std.res~Quarter, ylab="Standardized residuals", main="(e)")
boxplot(rdiag$std.res~ecommerce$Recession, xlab="Recession", ylab="Standardized residuals", main="(f)")
dev.off()
library(tseries)
runs.test(factor(sign(resid(ecommerce.lm3))))
#
# S&P indices, with lagged and differenced (log return) analyses
#
sandpindices = read.csv("sandpindices.csv")
#
# Figure 5.9
#
par(mfrow=c(1,2))
plot(sandpindices$S.P.Small.cap, sandpindices$S.P.Large.cap, xlab="S&P Small-cap Index",
   ylab="S&P Large-cap index", main="(a)", pch = 19)
plot(sandpindices$S.P.Mid.cap, sandpindices$S.P.Large.cap, xlab="S&P Mid-cap Index",
   ylab="S&P Large-cap index", main="(b)", pch = 19)
dev.off()
log.S.P.Small.cap = log(sandpindices$S.P.Small.cap)
log.S.P.Mid.cap = log(sandpindices$S.P.Mid.cap)
log.S.P.Large.cap = log(sandpindices$S.P.Large.cap)
#
# Figure 5.10
#
par(mfrow=c(1,2))
plot(log.S.P.Small.cap, log.S.P.Large.cap, xlab="Logged S&P Small-cap Index",
   ylab="Logged S&P Large-cap index", main="(a)", pch = 19)
plot(log.S.P.Mid.cap, log.S.P.Large.cap, xlab="Logged S&P Mid-cap Index",
   ylab="Logged S&P Large-cap index", main="(b)", pch = 19)
dev.off()
sandpindices.lm1 = lm(log.S.P.Large.cap ~ log.S.P.Small.cap + log.S.P.Mid.cap)
summary(sandpindices.lm1)
dwtest(sandpindices.lm1)
resid.acf = acf(resid(sandpindices.lm1), plot=FALSE)
resid.acf$acf[1] = NA
rdiag = ls.diag(sandpindices.lm1)
Year = 2006 + c(1:1259)*5/1260
#
# Figure 5.11
#
par(mfrow=c(1,2))
plot(Year, rdiag$std.res, ylab="Standardized residuals", main="(a)", pch = 19)
plot(resid.acf, main="(b)")
dev.off()
lag.log.S.P.Large.cap = c(NA, log.S.P.Large.cap[1:(length(log.S.P.Large.cap)-1)])
sandpindices.lm2 = lm(log.S.P.Large.cap ~ lag.log.S.P.Large.cap)
summary(sandpindices.lm2)
resid.acf = acf(resid(sandpindices.lm2), plot=FALSE)
resid.acf$acf[1] = NA
rdiag = ls.diag(sandpindices.lm2)
#
# Figure 5.12
#
par(mfrow=c(2,2))
plot(Year[2:1259], rdiag$std.res, xlab="Year", ylab="Standardized residuals", main="(a)", pch = 19)
plot(resid.acf, main="(b)")
qqnorm(rdiag$std.res, main="(c)", pch = 19)
dev.off()
#
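# (Not from the book) The manual lag construction used above, c(NA, x[1:(length(x)-1)]), can be wrapped
# in a one-line helper if several series need to be lagged; a minimal sketch:
#    lag1 = function(x) c(NA, x[-length(x)])
#    identical(lag1(log.S.P.Large.cap), lag.log.S.P.Large.cap)
#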
# Regressions using log returns (defined as the differenced natural logged prices)
#
S.P.Large.cap.return = log.S.P.Large.cap-lag.log.S.P.Large.cap
S.P.Mid.cap.return = log.S.P.Mid.cap-c(NA,log.S.P.Mid.cap[1:(length(log.S.P.Mid.cap)-1)])
S.P.Small.cap.return = log.S.P.Small.cap-c(NA,log.S.P.Small.cap[1:(length(log.S.P.Small.cap)-1)])
sandpindices.lm3 = lm(S.P.Large.cap.return ~ S.P.Small.cap.return + S.P.Mid.cap.return)
summary(sandpindices.lm3)
dwtest(sandpindices.lm3)
resid.acf = acf(resid(sandpindices.lm3), plot=FALSE)
resid.acf$acf[1] = NA
rdiag = ls.diag(sandpindices.lm3)
#
# Figure 5.13
#
par(mfrow=c(3,2))
plot(fitted(sandpindices.lm3), rdiag$std.res, xlab="Fitted values", ylab="Standardized residuals",
   main="(a)", pch = 19)
plot(Year[-1], rdiag$std.res, xlab="Year", ylab="Standardized residuals", main="(b)", pch = 19)
qqnorm(rdiag$std.res, main="(c)", pch = 19)
plot(resid.acf, main="(d)")
plot(S.P.Small.cap.return[-1], rdiag$std.res, xlab="Small-cap log return", ylab="Standardized residuals",
   main="(e)", pch = 19)
plot(S.P.Mid.cap.return[-1], rdiag$std.res, xlab="Mid-cap log return", ylab="Standardized residuals",
   main="(f)", pch = 19)
dev.off()
runs.test(factor(sign(resid(sandpindices.lm3))))
#
# Old Faithful Geyser intereruption time analyses
#
oldfaithful = read.csv("oldfaithful.csv")
#
# Figure 5.14
#
plot(oldfaithful$Duration, oldfaithful$Interval, xlab="Duration of previous eruption",
   ylab="Time to next eruption", pch = 19)
dev.off()
#
# Regression fit and diagnostics, and Cochrane-Orcutt fit
#
oldfaithful.lm1 = lm(Interval ~ Duration, data=oldfaithful)
summary(oldfaithful.lm1)
dwtest(oldfaithful.lm1, alternative="two.sided")
resid.acf <- acf(resid(oldfaithful.lm1), plot=FALSE)
resid.acf$acf[1] = NA
rdiag = ls.diag(oldfaithful.lm1)
#
# Figure 5.15
#
par(mfrow=c(2,2))
plot(fitted(oldfaithful.lm1), rdiag$std.res, xlab="Fitted values", ylab="Standardized residuals",
   main="(a)", pch = 19)
plot(c(1:222), rdiag$std.res, xlab="Eruption number", ylab="Standardized residuals", main="(b)", pch = 19)
plot(resid.acf, main="(c)")
qqnorm(rdiag$std.res, main="(d)", pch = 19)
dev.off()
rho = resid.acf$acf[2]
Durationstar = oldfaithful$Duration - rho*c(NA, oldfaithful$Duration[1:(nrow(oldfaithful)-1)])
Intervalstar = oldfaithful$Interval - rho*c(NA, oldfaithful$Interval[1:(nrow(oldfaithful)-1)])
oldfaithful.lm2 = lm(Intervalstar ~ Durationstar)
summary(oldfaithful.lm2)
dwtest(oldfaithful.lm2, alternative="two.sided")
resid.acf <- acf(resid(oldfaithful.lm2), plot=FALSE)
resid.acf$acf[1] = NA
rdiag = ls.diag(oldfaithful.lm2)
#
# Figure 5.16
#
par(mfrow=c(2,2))
plot(fitted(oldfaithful.lm2), rdiag$std.res, xlab="Fitted values", ylab="Standardized residuals",
   main="(a)", pch = 19)
plot(c(2:222), rdiag$std.res, xlab="Eruption number", ylab="Standardized residuals", main="(b)", pch = 19)
plot(resid.acf, main="(c)")
qqnorm(rdiag$std.res, main="(d)", pch = 19)
dev.off()
runs.test(factor(sign(resid(oldfaithful.lm2))))
rdiag$hat
c(max(rdiag$hat),2.5*2/221)
rdiag$cooks
max(rdiag$cooks)
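#
# (Not from the book) In the Cochrane-Orcutt fit above the slope applies directly to the original
# variables, but the transformed intercept estimates beta0*(1-rho); a minimal sketch of recovering the
# intercept on the original scale:
#    coef(oldfaithful.lm2)[1]/(1-rho)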
3"),each=3),Response=c(20,40,30,20,10,0,50,40,60)) par(mfrow=c(2,1)) interaction.plot(intplot1$Rows, intplot1$Columns, intplot1$Response, xlab="Rows", ylab="Mean response", trace.label="Columns", fixed=T, lty=c(1,2,4), main="Without an interaction effect") interaction.plot(intplot2$Rows, intplot2$Columns, intplot2$Response, xlab="Rows", ylab="Mean response", trace.label="Columns", fixed=T, lty=c(1,2,4), main="With an interaction effect") dev.off() # # ANOVA fits for DVD sales of movie data # Log.dvd = log10(movies2009$DVD) movies2009 = data.frame(movies2009, Log.dvd) movies2009.subset = movies2009[movies2009$Rating!="G" & movies2009$Genre!="Documentary" & movies2009$Genre!="Musical",] movies2009.subset = na.omit(data.frame(Rating=movies2009.subset$Rating, Genre=movies2009.subset$Genre, Log.dvd=movies2009.subset$Log.dvd)) library(car) movies2009.subset$Genre = recode(movies2009.subset$Genre,"c('Action','Adventure')='Action/Adventure'; c('Horror','Romance','Thriller')='Drama'") levels(movies2009.subset$Rating) = c(NA,"PG","PG-13","R") # # Figure 6.2 # par(mfrow=c(1,2)) boxplot(movies2009.subset$Log.dvd~movies2009.subset$Rating, xlab="MPAA rating", ylab="Logged DVD sales") boxplot(movies2009.subset$Log.dvd~movies2009.subset$Genre, xlab="Genre", ylab="Logged DVD sales", xaxt="n") axis(1, at=c(1:3), labels=c("Action/Adventure", "Comedy", "Drama"), cex.axis=.6) dev.off() # # Assigns effect coding contrasts to define factor variables # contrasts(movies2009.subset$Rating) = contr.sum(3) contrasts(movies2009.subset$Genre) = contr.sum(3) dvd.aov1 = aov(Log.dvd ~ Rating + Genre + Rating*Genre, data=movies2009.subset) Anova(dvd.aov1, type="III") dvd.lm1 = lm(Log.dvd ~ Rating + Genre + Rating*Genre, data=movies2009.subset) summary(dvd.lm1) sresdvd1 = rstandard(dvd.lm1) # # Figure 6.3 # par(mfrow=c(1,2)) boxplot(sresdvd1~movies2009.subset$Rating, xlab="MPAA rating", ylab="Standardized residuals") boxplot(sresdvd1~movies2009.subset$Genre, xlab="Genre", ylab="Standardized residuals", xaxt="n") axis(1, at=c(1:3), labels=c("Action/Adventure", "Comedy", "Drama"), cex.axis=.6) dev.off() # # Levene's test and WLS # Abs.resid = abs(sresdvd1) dvd.levene1 = aov(Abs.resid ~ Rating + Genre, data=movies2009.subset) Anova(dvd.levene1, type="III") genrevar = by(sresdvd1, movies2009.subset$Genre, var) wt = recode(movies2009.subset$Genre, "'Action/Adventure'=1/genrevar[1]; 'Comedy'=1/genrevar[2]; 'Drama'=1/genrevar[3]", as.factor=FALSE) dvd.aov1.wls = aov(Log.dvd ~ Rating + Genre + Rating*Genre, weights=wt, data=movies2009.subset) Anova(dvd.aov1.wls, type="III") dvd.lm1.wls = lm(Log.dvd ~ Rating + Genre + Rating*Genre, weights=wt, data=movies2009.subset) summary(dvd.lm1.wls) sresdvd1.wls = rstandard(dvd.lm1.wls) # # Figure 6.4 # par(mfrow=c(1,2)) boxplot(sresdvd1.wls~movies2009.subset$Rating, xlab="MPAA rating", ylab="Standardized residuals") boxplot(sresdvd1.wls~movies2009.subset$Genre, xlab="Genre", ylab="Standardized residuals", xaxt="n") axis(1, at=c(1:3), labels=c("Action/Adventure", "Comedy", "Drama"), cex.axis=.6) dev.off() Abs.resid = abs(sresdvd1.wls) dvd.levene1.wls = aov(Abs.resid ~ Rating + Genre, data=movies2009.subset) Anova(dvd.levene1.wls, type="III") # # Figure 6.5 # interaction.plot(movies2009.subset$Rating, movies2009.subset$Genre, movies2009.subset$Log.dvd, xlab="MPAA Rating", ylab="Mean logged sales", trace.label="Genre", fixed=T, lty=c(1,2,4), xpd=F) dev.off() dvd.aov2.wls = aov(Log.dvd ~ Rating + Genre, weights=wt, data=movies2009.subset) Anova(dvd.aov2.wls) dvd.lm2.wls = lm(Log.dvd ~ Rating + 
dvd.lm2.wls = lm(Log.dvd ~ Rating + Genre, weights=wt, data=movies2009.subset)
summary(dvd.lm2.wls)
dvd.aov3.wls = aov(Log.dvd ~ Genre, weights=wt, data=movies2009.subset)
Anova(dvd.aov3.wls)
dvd.lm3.wls = lm(Log.dvd ~ Genre, weights=wt, data=movies2009.subset)
summary(dvd.lm3.wls)
library(multcomp)
summary(glht(dvd.aov3.wls, linfct=mcp(Genre="Tukey")))
sresdvd3.wls = rstandard(dvd.lm3.wls)
hatdvd3.wls = hatvalues(dvd.lm3.wls)
cookdvd3.wls = cooks.distance(dvd.lm3.wls)
#
# Figure 6.6
#
par(mfrow=c(3,2))
plot(fitted(dvd.lm3.wls), sresdvd3.wls, xlab="Fitted values", ylab="Std. residuals", pch = 19)
qqnorm(sresdvd3.wls, pch = 19)
boxplot(sresdvd3.wls~movies2009.subset$Genre, xlab="Genre", ylab="Std. residuals", xaxt="n")
axis(1, at=c(1:3), labels=c("Action/Adventure", "Comedy", "Drama"), cex.axis=.6)
plot(c(1:118), sresdvd3.wls, ylim=c(-2.8,3), xlab="Index", ylab="Std. residuals", pch=19)
abline(h=2.5, lty=3)
abline(h=-2.5, lty=3)
plot(c(1:118), hatdvd3.wls, ylim=c(0, .065), xlab="Index", ylab="Leverage", pch=19)
abline(h=2.5*3/118, lty=3)
plot(c(1:118), cookdvd3.wls, xlab="Index", ylab="Cook's D", ylim=c(0, .08), pch=19)
dev.off()
#
# Prediction intervals for new observations
#
Log.dvd = log10(movies2010$DVD)
movies2010 = data.frame(movies2010, Log.dvd)
movies2010$Genre = recode(movies2010$Genre,
   "c('Action','Adventure')='Action/Adventure'; c('Horror','Romance','Thriller')='Drama'")
wt2010 = recode(movies2010$Genre,
   "'Action/Adventure'=1/genrevar[1]; 'Comedy'=1/genrevar[2]; 'Drama'=1/genrevar[3]", as.factor=FALSE)
dvdvalidate = predict(dvd.lm3.wls, movies2010, weights=wt2010, interval=c("p"))
data.frame(movies2010$Movie,dvdvalidate[,2],movies2010$Log.dvd,dvdvalidate[,3])
data.frame(movies2010$Movie,10^dvdvalidate[,2],movies2010$DVD,10^dvdvalidate[,3])
#
# Chapter 7
#
# International movie grosses analyses
#
Log.international.gross = log10(movies2009$International)
movies2009 = data.frame(movies2009, Log.international.gross)
#
# Figure 7.1
#
par(mfrow=c(1,2))
plot(movies2009$Log.domestic.gross, movies2009$Log.international.gross, xlab="Logged domestic grosses",
   ylab="Logged international grosses", main="(a)", pch=19)
boxplot(movies2009$Log.international~movies2009$Rating, xlab="MPAA rating",
   ylab="Logged international grosses", main="(b)")
dev.off()
#
# Effect coding contrasts for factor variable
#
contrasts(movies2009$Rating) = contr.sum(4)
#
# ANCOVA fits
#
intl.lm1 = lm(Log.international.gross ~ Log.domestic.gross + Rating, data=movies2009)
summary(intl.lm1)
intl.aov1 = aov(Log.international.gross ~ Log.domestic.gross + Rating, data=movies2009)
library(car)
Anova(intl.lm1, type="III")
library(multcomp)
summary(glht(intl.aov1, linfct=mcp(Rating="Tukey")))
intl.lm2 = lm(Log.international.gross ~ Log.domestic.gross + Rating + Log.domestic.gross*Rating,
   data=movies2009)
summary(intl.lm2)
Anova(intl.lm2, type="III")
intl.aov2 = aov(Log.international.gross ~ Log.domestic.gross + Rating + Log.domestic.gross*Rating,
   data=movies2009)
#
# Figure 7.2
#
par(mfrow=c(1,2))
plot(movies2009$Log.domestic.gross, movies2009$Log.international.gross, xlab="Logged domestic grosses",
   ylab="Logged international grosses", pch=".", main="(a) Constant slope")
abline(a=coef(intl.lm1)[1]+coef(intl.lm1)[3], b=coef(intl.lm1)[2], lty=1)
abline(a=coef(intl.lm1)[1]+coef(intl.lm1)[4], b=coef(intl.lm1)[2], lty=2)
abline(a=coef(intl.lm1)[1]+coef(intl.lm1)[5], b=coef(intl.lm1)[2], lty=4)
abline(a=coef(intl.lm1)[1]-coef(intl.lm1)[3]-coef(intl.lm1)[4]-coef(intl.lm1)[5], b=coef(intl.lm1)[2], lty=5)
plot(movies2009$Log.domestic.gross, movies2009$Log.international.gross, xlab="Logged domestic grosses",
   ylab="Logged international grosses", pch=".", main="(b) Varying slopes")
abline(a=coef(intl.lm2)[1]+coef(intl.lm2)[3], b=coef(intl.lm2)[2]+coef(intl.lm2)[6], lty=1)
abline(a=coef(intl.lm2)[1]+coef(intl.lm2)[4], b=coef(intl.lm2)[2]+coef(intl.lm2)[7], lty=2)
abline(a=coef(intl.lm2)[1]+coef(intl.lm2)[5], b=coef(intl.lm2)[2]+coef(intl.lm2)[8], lty=4)
abline(a=coef(intl.lm2)[1]-coef(intl.lm2)[3]-coef(intl.lm2)[4]-coef(intl.lm2)[5],
   b=coef(intl.lm2)[2]-coef(intl.lm2)[6]-coef(intl.lm2)[7]-coef(intl.lm2)[8], lty=5)
dev.off()
rdiag = ls.diag(intl.lm2)
#
# Note that two movies do not have international grosses given
#
# Figure 7.3
#
par(mfrow=c(2,2))
plot(fitted(intl.lm2), rdiag$std.res, xlab="Fitted values", ylab="Standardized residuals", main="(a)",
   pch = 19)
plot(movies2009$Log.domestic.gross[-c(119,120)], rdiag$std.res, xlab="Logged domestic grosses",
   ylab="Standardized residuals", main="(b)", pch = 19)
boxplot(rdiag$std.res~movies2009$Rating[-c(119,120)], xlab="MPAA rating", ylab="Standardized residuals",
   main="(c)")
qqnorm(rdiag$std.res, main="(d)", pch = 19)
dev.off()
rdiag$hat
c(max(rdiag$hat),2.5*9/128)
rdiag$cooks
max(rdiag$cooks)
#
# Chapter 8
#
# Figure 8.1
#
x = c(-500:500)/10
prob1 = exp(x)/(1+exp(x))
prob2 = exp(1+.15*x)/(1+exp(1+.15*x))
prob3 = exp(1-.2*x)/(1+exp(1-.2*x))
prob4 = exp(2-.05*x)/(1+exp(2-.05*x))
plot(x, prob1, type="l", xaxt="n", xlab="X", ylab="Probability")
lines(x, prob2, lty=2)
lines(x, prob3, lty=3)
lines(x, prob4, lty=4)
dev.off()
#
# Figure 8.2
#
par(mfrow=c(1,2))
x = c(0:50)
prob1 = c(rep(.05,15), rep(.95,15), rep(.05,21))
prob2 = rep(.5,51)
plot(x, rbinom(51, 50, prob1)/50, xlab="X", ylab="Proportion of successes", ylim=c(0,1), pch = 19)
plot(x, rbinom(51, 50, prob2)/50, xlab="X", ylab="Proportion of successes", ylim=c(0,1), pch = 19)
dev.off()
#
# Figure 8.3
#
# Smoking and mortality survey analyses
#
smoking = read.csv("smoking.csv")
prop = smoking$Survived/smoking$At.risk
logit = log(prop/(1-prop))
par(mfrow=c(1,2))
plot(smoking$Age.group[smoking$Smoker==0], prop[smoking$Smoker==0], xlab="Age", ylab="Proportion survived",
   main="(a)")
points(smoking$Age.group[smoking$Smoker==1], prop[smoking$Smoker==1], pch=3)
legend(25, .3, c("Smoker", "Nonsmoker"), pch=c(3, 1), cex=.6)
plot(smoking$Age.group[smoking$Smoker==0], logit[smoking$Smoker==0], xlab="Age", ylab="Empirical logit",
   main="(b)")
points(smoking$Age.group[smoking$Smoker==1], logit[smoking$Smoker==1], pch=3)
legend(25, .5, c("Smoker", "Nonsmoker"), pch=c(3, 1), cex=.6)
dev.off()
#
# Logistic regression fits, tests, goodness-of-fit tests
#
# Likelihood ratio tests for the coefficients can be obtained using the drop1() command; for example,
#    drop1(smoking1.logit, test="LRT")
#
smoking1.logit = glm(Survived/At.risk ~ Age.group + Smoker, weights=At.risk, family=binomial, data=smoking)
summary(smoking1.logit)
lrtest1 = smoking1.logit$null.deviance - deviance(smoking1.logit)
c(lrtest1, 1-pchisq(lrtest1,2))
exp(coef(smoking1.logit)[2:3])
c(deviance(smoking1.logit), 1-pchisq(deviance(smoking1.logit),11))
pearres1 = residuals(smoking1.logit, type="pearson")
pearson = sum(pearres1^2)
c(pearson, 1-pchisq(pearson,11))
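#
# (Not from the book) The degrees of freedom hard-coded in the goodness-of-fit tests here (11, 10, and 6)
# are the residual degrees of freedom of the corresponding fits, so they can also be taken from the
# fitted object itself; a minimal sketch:
#    c(deviance(smoking1.logit), 1-pchisq(deviance(smoking1.logit), smoking1.logit$df.residual))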
Age.group.sq = smoking$Age.group^2
smoking2.logit = glm(Survived/At.risk ~ Age.group + Age.group.sq + Smoker, weights=At.risk, family=binomial,
   data=smoking)
summary(smoking2.logit)
c(deviance(smoking2.logit), 1-pchisq(deviance(smoking2.logit),10))
pearres2 = residuals(smoking2.logit, type="pearson")
pearson = sum(pearres2^2)
c(pearson, 1-pchisq(pearson,10))
c(deviance(smoking1.logit)-deviance(smoking2.logit),
   1-pchisq(deviance(smoking1.logit)-deviance(smoking2.logit),1))
exp(coef(smoking2.logit)[2:4])
smoking3.logit = glm(Survived/At.risk ~ factor(Age.group) + Smoker, weights=At.risk, family=binomial,
   data=smoking)
summary(smoking3.logit)
c(deviance(smoking3.logit), 1-pchisq(deviance(smoking3.logit),6))
pearres3 = residuals(smoking3.logit, type="pearson")
pearson = sum(pearres3^2)
c(pearson, 1-pchisq(pearson,6))
c(deviance(smoking1.logit)-deviance(smoking3.logit),
   1-pchisq(deviance(smoking1.logit)-deviance(smoking3.logit),5))
c(deviance(smoking2.logit)-deviance(smoking3.logit),
   1-pchisq(deviance(smoking2.logit)-deviance(smoking3.logit),4))
exp(coef(smoking3.logit)[8])
#
# Bankruptcy analyses
#
bankruptcy = read.csv("bankruptcy.csv")
#
# Figure 8.4
#
par(mfrow=c(3,2))
boxplot(split(bankruptcy$WC.TA,bankruptcy$Bankrupt), xlab="Bankrupt", ylab="Working Capital / Total Assets")
boxplot(split(bankruptcy$RE.TA,bankruptcy$Bankrupt), xlab="Bankrupt", ylab="Retained Earnings / Total Assets")
boxplot(split(bankruptcy$EBIT.TA,bankruptcy$Bankrupt), xlab="Bankrupt", ylab="Earnings / Total Assets")
boxplot(split(bankruptcy$S.TA,bankruptcy$Bankrupt), xlab="Bankrupt", ylab="Sales / Total Assets")
boxplot(split(bankruptcy$BVE.BVL,bankruptcy$Bankrupt), xlab="Bankrupt", ylab="Equity / Liabilities")
dev.off()
bankrupt1.logit = glm(Bankrupt ~ WC.TA + RE.TA + EBIT.TA + S.TA + BVE.BVL, family=binomial, data=bankruptcy)
summary(bankrupt1.logit)
library(boot)
bankrupt1.diag = glm.diag(bankrupt1.logit)
#
# Figure 8.5
#
plot(c(1:50), bankrupt1.diag$rp, xlab="Index", ylab="Standardized residuals", pch=19, type="b")
dev.off()
bankruptcy.subset = bankruptcy[-1,]
bankrupt2.logit = glm(Bankrupt ~ WC.TA + RE.TA + EBIT.TA + S.TA + BVE.BVL, family=binomial,
   data=bankruptcy.subset)
summary(bankrupt2.logit)
#
# Calculations necessary to produce Table 8.2. Note that AIC values reported by bestglm are all 2 lower
# than those in Table 8.2, which does not affect model selection.
#
library(bestglm)
bankrupt.best = bestglm(bankruptcy.subset[,-1], family=binomial, IC="AIC")
bankrupt.best$Subsets
#
somersD = function(probsuccesses, probfailures){
#
# Function to calculate Somers' D. This function requires the data to be given in the form of two vectors
# of estimated probabilities, one for individuals that were successes and one for individuals that were
# failures. Data that are in the form of {# successes, # trials} must be converted to 0/1 response form.
# The function takes advantage of the connection between Somers' D and the Wilcoxon-Mann-Whitney statistic.
#
   ns = length(probsuccesses)
   nf = length(probfailures)
   wmw = as.numeric(wilcox.test(probsuccesses, probfailures, correct=F)$statistic)
   list(D=2.*wmw/(ns*nf)-1)}
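#
# (Not from the book) The somers2() function in the Hmisc package computes the same quantity (reported
# as Dxy, along with the c index) and can serve as a check on somersD(); a minimal sketch, assuming
# Hmisc is installed and using the bankrupt4.logit fit below:
#    Hmisc::somers2(fitted(bankrupt4.logit), bankruptcy.subset$Bankrupt)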
#
bankrupt3.logit = glm(Bankrupt ~ 1, family=binomial, data=bankruptcy.subset)
summary(bankrupt3.logit)
bankrupt4.logit = glm(Bankrupt ~ RE.TA, family=binomial, data=bankruptcy.subset)
summary(bankrupt4.logit)
library(ResourceSelection)
hoslem.test(bankruptcy.subset$Bankrupt, fitted(bankrupt4.logit))
somersD(fitted(bankrupt4.logit)[bankruptcy.subset$Bankrupt==1],
   fitted(bankrupt4.logit)[bankruptcy.subset$Bankrupt==0])
bankrupt5.logit = glm(Bankrupt ~ RE.TA + BVE.BVL, family=binomial, data=bankruptcy.subset)
summary(bankrupt5.logit)
hoslem.test(bankruptcy.subset$Bankrupt, fitted(bankrupt5.logit))
somersD(fitted(bankrupt5.logit)[bankruptcy.subset$Bankrupt==1],
   fitted(bankrupt5.logit)[bankruptcy.subset$Bankrupt==0])
bankrupt6.logit = glm(Bankrupt ~ RE.TA + EBIT.TA + BVE.BVL, family=binomial, data=bankruptcy.subset)
summary(bankrupt6.logit)
hoslem.test(bankruptcy.subset$Bankrupt, fitted(bankrupt6.logit))
somersD(fitted(bankrupt6.logit)[bankruptcy.subset$Bankrupt==1],
   fitted(bankrupt6.logit)[bankruptcy.subset$Bankrupt==0])
bankrupt7.logit = glm(Bankrupt ~ WC.TA + RE.TA + EBIT.TA + BVE.BVL, family=binomial, data=bankruptcy.subset)
summary(bankrupt7.logit)
hoslem.test(bankruptcy.subset$Bankrupt, fitted(bankrupt7.logit))
bankrupt6.diag = glm.diag(bankrupt6.logit)
#
# Figure 8.6
#
par(mfrow=c(2,2))
plot(fitted(bankrupt6.logit), bankrupt6.diag$rp, xlab="Estimated probability", ylab="Standardized residuals",
   pch=19)
plot(c(2:50), bankrupt6.diag$rp, xlab="Index", ylab="Standardized residuals", pch=19, type="b")
plot(c(2:50), bankrupt6.diag$h, xlab="Index", ylab="Leverage", pch=19, type="b")
plot(c(2:50), bankrupt6.diag$cook, xlab="Index", ylab="Cook's D", pch=19, type="b")
dev.off()
#
# Classification table; the table in the book takes into account that the omitted outlier was misclassified
#
bankrupt.predict <- as.numeric(fitted(bankrupt6.logit) > .5)
table(bankruptcy.subset$Bankrupt, bankrupt.predict)
#
# Chapter 9
#
# Figure 9.1
#
# A plot created from the code below will look slightly different from that in Figure 9.1, since no
# set.seed() command is included
#
plot(c(10,10),c(-5,60),type="l",xlim=c(0,60),xaxt="n", yaxt="n",xlab="x",ylab=expression(y^"*"))
den <- dlogis(c(-1500:1500)/100,0,4)
lines(10+40*den,10+c(-1500:1500)/100)
lines(c(30,30),c(-5,60))
lines(30+40*den,30+c(-1500:1500)/100)
lines(c(45,45),c(-5,60))
lines(45+40*den,45+c(-1500:1500)/100)
lines(c(0,60),c(0,60))
axis(1,at=c(10,30,45),labels=c(expression(x[1]),expression(x[2]),expression(x[3])))
axis(2,at=c(0,5,30,45),labels=c(expression(alpha[1]),expression(alpha[2]),expression(alpha[3]),
   expression(alpha[4])))
x <- runif(100)*60
y <- x+rlogis(100)*4
points(x,y,pch=".")
abline(h=0,lty=2)
abline(h=5,lty=2)
abline(h=30,lty=2)
abline(h=45,lty=2)
dev.off()
#
# City bond ratings analyses
#
bondrate = read.csv("bondrate.csv")
bondrate$Rating = ordered(bondrate$Rating, levels=c("(4) BBB","(3) A","(2) AA","(1) AAA"),
   labels=c("BBB","A","AA","AAA"))
Logged.population = log(bondrate$Population)
bondrate = data.frame(bondrate, Logged.population=Logged.population)
#
# Figure 9.2
#
par(mfrow=c(2,2))
boxplot(split(bondrate$Logged.population,bondrate$Rating),xlab="Rating",ylab="Logged population")
boxplot(split(bondrate$Household.income,bondrate$Rating),xlab="Rating",ylab="Household income")
boxplot(split(bondrate$Nonprofits.in.top.10,bondrate$Rating),xlab="Rating",ylab="Nonprofits in top 10")
boxplot(split(bondrate$For.profits.in.top.10,bondrate$Rating),xlab="Rating",ylab="For profits in top 10")
dev.off()
#
# The multinom() function fits the nominal regression model. It gives incorrect results for estimated
# standard errors when Household.income is given in its original scale, so the variable is converted to
# income in tens of thousands of dollars in the fits below.
#
bondrate$Household.income = bondrate$Household.income/10000
library(MASS)
library(nnet)
#
# The form of the output and exact coefficients and standard errors are slightly different in the text.
#
# The afex package in combination with the car package can be used to obtain overall likelihood ratio
# tests for the predictors, which are generally better behaved than the Wald tests. The steps are as
# follows:
# (1) Set the response variable to be coded using effect codings:
#        contrasts(bondrate$Rating) = contr.sum(4)
# (2) Create bondnom1 (say) as below
# (3) Call the necessary libraries:
#        library(afex)
#        library(car)
# (4) Obtain the likelihood ratio tests for overall significance (over all three logistic regression
#     relationships being fit):
#        Anova(bondnom1, type="III")
#
bondnom0 = multinom(Rating ~ 1, data=bondrate, maxit=200)
bondnom1 = multinom(Rating ~ Logged.population+Household.income+Nonprofits.in.top.10+For.profits.in.top.10,
   data=bondrate, maxit=200)
bondnom1$AIC+2*(bondnom1$edf+1)*(bondnom1$edf+2)/(56-bondnom1$edf-2)
bondnom2 = multinom(Rating ~ Logged.population, data=bondrate, maxit=200)
bondnom2$AIC+2*(bondnom2$edf+1)*(bondnom2$edf+2)/(56-bondnom2$edf-2)
bondnom3 = multinom(Rating ~ Household.income, data=bondrate, maxit=200)
bondnom3$AIC+2*(bondnom3$edf+1)*(bondnom3$edf+2)/(56-bondnom3$edf-2)
bondnom4 = multinom(Rating ~ Nonprofits.in.top.10, data=bondrate, maxit=200)
bondnom4$AIC+2*(bondnom4$edf+1)*(bondnom4$edf+2)/(56-bondnom4$edf-2)
bondnom5 = multinom(Rating ~ For.profits.in.top.10, data=bondrate, maxit=200)
bondnom5$AIC+2*(bondnom5$edf+1)*(bondnom5$edf+2)/(56-bondnom5$edf-2)
bondnom6 = multinom(Rating ~ Logged.population+Household.income, data=bondrate, maxit=200)
bondnom6$AIC+2*(bondnom6$edf+1)*(bondnom6$edf+2)/(56-bondnom6$edf-2)
bondnom7 = multinom(Rating ~ Logged.population+Nonprofits.in.top.10, data=bondrate, maxit=200)
bondnom7$AIC+2*(bondnom7$edf+1)*(bondnom7$edf+2)/(56-bondnom7$edf-2)
bondnom8 = multinom(Rating ~ Logged.population+For.profits.in.top.10, data=bondrate, maxit=200)
bondnom8$AIC+2*(bondnom8$edf+1)*(bondnom8$edf+2)/(56-bondnom8$edf-2)
bondnom9 = multinom(Rating ~ Household.income+Nonprofits.in.top.10, data=bondrate, maxit=200)
bondnom9$AIC+2*(bondnom9$edf+1)*(bondnom9$edf+2)/(56-bondnom9$edf-2)
bondnom10 = multinom(Rating ~ Household.income+For.profits.in.top.10, data=bondrate, maxit=200)
bondnom10$AIC+2*(bondnom10$edf+1)*(bondnom10$edf+2)/(56-bondnom10$edf-2)
bondnom11 = multinom(Rating ~ Nonprofits.in.top.10+For.profits.in.top.10, data=bondrate, maxit=200)
bondnom11$AIC+2*(bondnom11$edf+1)*(bondnom11$edf+2)/(56-bondnom11$edf-2)
bondnom12 = multinom(Rating ~ Logged.population+Household.income+Nonprofits.in.top.10, data=bondrate,
   maxit=200)
bondnom12$AIC+2*(bondnom12$edf+1)*(bondnom12$edf+2)/(56-bondnom12$edf-2)
bondnom13 = multinom(Rating ~ Logged.population+Household.income+For.profits.in.top.10, data=bondrate,
   maxit=200)
bondnom13$AIC+2*(bondnom13$edf+1)*(bondnom13$edf+2)/(56-bondnom13$edf-2)
bondnom14 = multinom(Rating ~ Logged.population+Nonprofits.in.top.10+For.profits.in.top.10, data=bondrate,
   maxit=200)
bondnom14$AIC+2*(bondnom14$edf+1)*(bondnom14$edf+2)/(56-bondnom14$edf-2)
bondnom15 = multinom(Rating ~ Household.income+Nonprofits.in.top.10+For.profits.in.top.10, data=bondrate,
   maxit=200)
bondnom15$AIC+2*(bondnom15$edf+1)*(bondnom15$edf+2)/(56-bondnom15$edf-2)
summary(bondnom1)
c(deviance(bondnom4)-deviance(bondnom7), 1-pchisq(deviance(bondnom4)-deviance(bondnom7), 3))
coef(bondnom1)/summary(bondnom1)$standard.errors
2.*(1.-pnorm(abs(coef(bondnom1))/summary(bondnom1)$standard.errors))
summary(bondnom7)
c(deviance(bondnom0)-deviance(bondnom7), 1-pchisq(deviance(bondnom0)-deviance(bondnom7), 6))
coef(bondnom7)/summary(bondnom7)$standard.errors
2.*(1.-pnorm(abs(coef(bondnom7))/summary(bondnom7)$standard.errors))
#
# Ordinal logistic regression fitting using the lrm() function
#
library(rms)
bondord1 = lrm(Rating ~ Logged.population+Household.income+Nonprofits.in.top.10+For.profits.in.top.10,
   data=bondrate)
bondord1$deviance[2]+2*length(bondord1$coefficients)+2*(length(bondord1$coefficients))*
   (length(bondord1$coefficients)+2)/(56-length(bondord1$coefficients)-2)
bondord2 = lrm(Rating ~ Logged.population, data=bondrate)
bondord2$deviance[2]+2*length(bondord2$coefficients)+2*(length(bondord2$coefficients))*
   (length(bondord2$coefficients)+2)/(56-length(bondord2$coefficients)-2)
bondord3 = lrm(Rating ~ Household.income, data=bondrate)
bondord3$deviance[2]+2*length(bondord3$coefficients)+2*(length(bondord3$coefficients))*
   (length(bondord3$coefficients)+2)/(56-length(bondord3$coefficients)-2)
bondord4 = lrm(Rating ~ Nonprofits.in.top.10, data=bondrate)
bondord4$deviance[2]+2*length(bondord4$coefficients)+2*(length(bondord4$coefficients))*
   (length(bondord4$coefficients)+2)/(56-length(bondord4$coefficients)-2)
bondord5 = lrm(Rating ~ For.profits.in.top.10, data=bondrate)
bondord5$deviance[2]+2*length(bondord5$coefficients)+2*(length(bondord5$coefficients))*
   (length(bondord5$coefficients)+2)/(56-length(bondord5$coefficients)-2)
bondord6 = lrm(Rating ~ Logged.population+Household.income, data=bondrate)
bondord6$deviance[2]+2*length(bondord6$coefficients)+2*(length(bondord6$coefficients))*
   (length(bondord6$coefficients)+2)/(56-length(bondord6$coefficients)-2)
bondord7 = lrm(Rating ~ Logged.population+Nonprofits.in.top.10, data=bondrate)
bondord7$deviance[2]+2*length(bondord7$coefficients)+2*(length(bondord7$coefficients))*
   (length(bondord7$coefficients)+2)/(56-length(bondord7$coefficients)-2)
bondord8 = lrm(Rating ~ Logged.population+For.profits.in.top.10, data=bondrate)
bondord8$deviance[2]+2*length(bondord8$coefficients)+2*(length(bondord8$coefficients))*
   (length(bondord8$coefficients)+2)/(56-length(bondord8$coefficients)-2)
bondord9 = lrm(Rating ~ Household.income+Nonprofits.in.top.10, data=bondrate)
bondord9$deviance[2]+2*length(bondord9$coefficients)+2*(length(bondord9$coefficients))*
   (length(bondord9$coefficients)+2)/(56-length(bondord9$coefficients)-2)
bondord10 = lrm(Rating ~ Household.income+For.profits.in.top.10, data=bondrate)
bondord10$deviance[2]+2*length(bondord10$coefficients)+2*(length(bondord10$coefficients))*
   (length(bondord10$coefficients)+2)/(56-length(bondord10$coefficients)-2)
bondord11 = lrm(Rating ~ Nonprofits.in.top.10+For.profits.in.top.10, data=bondrate)
bondord11$deviance[2]+2*length(bondord11$coefficients)+2*(length(bondord11$coefficients))*
   (length(bondord11$coefficients)+2)/(56-length(bondord11$coefficients)-2)
bondord12 = lrm(Rating ~ Logged.population+Household.income+Nonprofits.in.top.10, data=bondrate)
bondord12$deviance[2]+2*length(bondord12$coefficients)+2*(length(bondord12$coefficients))*
   (length(bondord12$coefficients)+2)/(56-length(bondord12$coefficients)-2)
bondord13 = lrm(Rating ~ Logged.population+Household.income+For.profits.in.top.10, data=bondrate)
bondord13$deviance[2]+2*length(bondord13$coefficients)+2*(length(bondord13$coefficients))*
   (length(bondord13$coefficients)+2)/(56-length(bondord13$coefficients)-2)
bondord14 = lrm(Rating ~ Logged.population+Nonprofits.in.top.10+For.profits.in.top.10, data=bondrate)
bondord14$deviance[2]+2*length(bondord14$coefficients)+2*(length(bondord14$coefficients))*
   (length(bondord14$coefficients)+2)/(56-length(bondord14$coefficients)-2)
bondord15 = lrm(Rating ~ Household.income+Nonprofits.in.top.10+For.profits.in.top.10, data=bondrate)
bondord15$deviance[2]+2*length(bondord15$coefficients)+2*(length(bondord15$coefficients))*
   (length(bondord15$coefficients)+2)/(56-length(bondord15$coefficients)-2)
bondord1
c(bondord7$deviance[2]-bondnom7$deviance, 1-pchisq(bondord7$deviance[2]-bondnom7$deviance, 4))
bondord7
table(bondrate$Rating,predict(bondnom7))
predord7 = apply(predict(bondord7, type="fitted.ind"),1,which.max)
predord7 = ordered(predord7, levels=c(1:4), labels=c("BBB","A","AA","AAA"))
table(bondrate$Rating, predord7)
data.frame(bondrate$Rating,predict(bondord7, type="fitted.ind"))
Nonprofits.sq = bondrate$Nonprofits.in.top.10*bondrate$Nonprofits.in.top.10
bondord16 = lrm(Rating ~ Logged.population+Nonprofits.in.top.10+Nonprofits.sq, data=bondrate)
bondord16$deviance[2]+2*length(bondord16$coefficients)+2*(length(bondord16$coefficients))*
   (length(bondord16$coefficients)+2)/(56-length(bondord16$coefficients)-2)
predord16 = apply(predict(bondord16, type="fitted.ind"),1,which.max)
predord16 = ordered(predord16, levels=c(1:4), labels=c("BBB","A","AA","AAA"))
table(bondrate$Rating, predord16)
#
# Chapter 10
#
# Florida shark attack analyses
#
floridashark = read.csv("floridashark.csv")
#
# Figure 10.1
#
par(mfrow=c(1,2))
plot(floridashark$Year, floridashark$Attacks, xlab="Year", ylab="Attacks", pch=19)
plot(floridashark$Population, floridashark$Attacks, xlab="Population", ylab="Attacks", pch=19)
dev.off()
#
# Poisson regression fits and diagnostics
#
shark1.pois = glm(Attacks ~ Year + Population, family=poisson, data=floridashark)
summary(shark1.pois)
#
# Figure 10.2
#
plot(floridashark$Year, floridashark$Attacks/floridashark$Population, xlab="Year", ylab="Attack rate",
   pch=19)
dev.off()
shark2.pois = glm(Attacks ~ offset(log(Population))+Year, family=poisson, data=floridashark)
summary(shark2.pois)
shark3.pois = glm(Attacks ~ offset(log(Population))+Year+Population, family=poisson, data=floridashark)
summary(shark3.pois)
library(boot)
shark2.diag = glm.diag(shark2.pois)
#
# Figure 10.3
#
par(mfrow=c(2,2))
plot(fitted(shark2.pois), shark2.diag$rp, xlab="Estimated attacks", ylab="Standardized residuals", pch=19)
plot(c(1:66), shark2.diag$rp, xlab="Index", ylab="Standardized residuals", pch=19, type="b")
plot(c(1:66), shark2.diag$h, xlab="Index", ylab="Leverage", pch=19, type="b")
plot(c(1:66), shark2.diag$cook, xlab="Index", ylab="Cook's D", pch=19, type="b")
dev.off()
Post.911 = as.numeric(floridashark$Year >= 2002)
shark4.pois = glm(Attacks ~ offset(log(Population))+Year+Post.911+Post.911*Year, family=poisson,
   data=floridashark)
summary(shark4.pois)
#
# Pearson goodness-of-fit statistic
#
shark4.pear = sum(residuals(shark4.pois, type = "pearson")^2)
shark4.diag = glm.diag(shark4.pois)
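#
# (Not from the book) The Pearson statistic divided by the residual degrees of freedom is the moment
# estimate of the overdispersion (scale) factor that the quasi-likelihood z-statistics below adjust by;
# a minimal sketch:
#    shark4.pear/shark4.pois$df.residual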
#
# Figure 10.4
#
par(mfrow=c(2,2))
plot(fitted(shark4.pois), shark4.diag$rp, xlab="Estimated attacks", ylab="Standardized residuals", pch=19)
plot(c(1:66), shark4.diag$rp, xlab="Index", ylab="Standardized residuals", pch=19, type="b")
plot(c(1:66), shark4.diag$h, xlab="Index", ylab="Leverage", pch=19, type="b")
plot(c(1:66), shark4.diag$cook, xlab="Index", ylab="Cook's D", pch=19, type="b")
dev.off()
#
# Quasi-likelihood tests and p-values
#
zQL = (summary(shark4.pois)$coefficients[,1]/summary(shark4.pois)$coefficients[,2])/
   sqrt(shark4.pear/summary(shark4.pois)$df.residual)
data.frame(z = zQL, p = 2.*(1.-pnorm(abs(zQL))))
#
# Negative binomial regression fit and diagnostics
#
library(MASS)
shark5.nb = glm.nb(Attacks ~ offset(log(Population))+Year+Post.911+Post.911*Year, data=floridashark)
summary(shark5.nb)
sum(residuals(shark5.nb, type = "pearson")^2)
shark5.diag = glm.diag(shark5.nb)
#
# Figure 10.5
#
par(mfrow=c(2,2))
plot(fitted(shark5.nb), shark5.diag$rp, xlab="Estimated attacks", ylab="Standardized residuals", pch=19)
plot(c(1:66), shark5.diag$rp, xlab="Index", ylab="Standardized residuals", pch=19, type="b")
plot(c(1:66), shark5.diag$h, xlab="Index", ylab="Leverage", pch=19, type="b")
plot(c(1:66), shark5.diag$cook, xlab="Index", ylab="Cook's D", pch=19, type="b")
dev.off()
#
# Least squares fit of logged attack rates
#
summary(lm(log(Attacks/Population) ~ Year+Post.911[c(11:27,29:66)]+Post.911[c(11:27,29:66)]*Year,
   data=floridashark[c(11:27,29:66),]))
#
# Poisson regression fits of fatality rates
#
shark6.pois = glm(Fatalities ~ offset(log(Population))+Year+Attacks+Post.911+Post.911*Year,
   family=poisson, data=floridashark)
summary(shark6.pois)
shark7.pois = glm(Fatalities ~ offset(log(Population))+Year+Attacks+Post.911, family=poisson,
   data=floridashark)
summary(shark7.pois)
shark8.pois = glm(Fatalities ~ offset(log(Population))+Year+Attacks, family=poisson, data=floridashark)
summary(shark8.pois)
shark8.diag = glm.diag(shark8.pois)
#
# Figure 10.6
#
par(mfrow=c(2,2))
plot(fitted(shark8.pois), shark8.diag$rp, xlab="Estimated fatalities", ylab="Standardized residuals", pch=19)
plot(c(1:66), shark8.diag$rp, xlab="Index", ylab="Standardized residuals", pch=19, type="b")
plot(c(1:66), shark8.diag$h, xlab="Index", ylab="Leverage", pch=19, type="b")
plot(c(1:66), shark8.diag$cook, xlab="Index", ylab="Cook's D", pch=19, type="b")
dev.off()
#
# Omitting 1976
#
shark9.pois = glm(Fatalities ~ offset(log(Population))+Year+Attacks, family=poisson, data=floridashark[-31,])
summary(shark9.pois)
#
# Using Poisson regression to estimate weights for WLS for international movie grosses analyses
#
Log.international.gross = log10(movies2010$International)
movies2010 = data.frame(movies2010, Log.international.gross)
rdiag = ls.diag(intl.lm2)
intl.wt = glm(rdiag$std.res^2 ~ Log.domestic.gross, family=poisson, data=movies2009[-c(119,120),])
intl.wt
wt.intl = 1/fitted(intl.wt)
#
# Model comparison and identifying "Avatar" as a leverage point
#
intl.lmwls1 = lm(Log.international.gross ~ Log.domestic.gross + Rating + Log.domestic.gross*Rating,
   weights=wt.intl, data=movies2009[-c(119,120),])
summary(intl.lmwls1)
intl.aovwls1 = aov(Log.international.gross ~ Log.domestic.gross + Rating + Log.domestic.gross*Rating,
   weights=wt.intl, data=movies2009[-c(119,120),])
library(car)
Anova(intl.aovwls1, type="III")
intl.lmwls2 = lm(Log.international.gross ~ Log.domestic.gross + Rating, weights=wt.intl,
   data=movies2009[-c(119,120),])
summary(intl.lmwls2)
intl.aovwls2 = aov(Log.international.gross ~ Log.domestic.gross + Rating, weights=wt.intl,
   data=movies2009[-c(119,120),])
Anova(intl.aovwls2, type="III") intl.lmwls3 = lm(Log.international.gross ~ Log.domestic.gross, weights=wt.intl, data=movies2009[-c(119,120),]) summary(intl.lmwls3) par(mfrow=c(2,2)) plot(fitted(intl.lmwls3), rstudent(intl.lmwls3), xlab="Fitted values", ylab="Standardized residuals", pch = 19) qqnorm(rstudent(intl.lmwls3), pch = 19) plot(c(1:127), hatvalues(intl.lmwls3), xlab="Index", ylab="Leverage", pch = 19) plot(c(1:127), cooks.distance(intl.lmwls3), xlab="Index", ylab="Cook's D", pch = 19) # # Omitting "Avatar" # intl.lmwls4 = lm(Log.international.gross ~ Log.domestic.gross + Rating + Log.domestic.gross*Rating, weights=wt.intl[-15], data=movies2009[-c(15,119,120),]) summary(intl.lmwls4) intl.aovwls4 = aov(Log.international.gross ~ Log.domestic.gross + Rating + Log.domestic.gross*Rating, weights=wt.intl[-15], data=movies2009[-c(15,119,120),]) Anova(intl.aovwls4, type="III") AIC(intl.lmwls4)+2*(intl.lmwls4$rank+1)*(intl.lmwls4$rank+2)/(intl.lmwls4$df.residual-1) intl.lmwls5 = lm(Log.international.gross ~ Log.domestic.gross + Rating, weights=wt.intl[-15], data=movies2009[-c(15,119,120),]) summary(intl.lmwls5) intl.aovwls5 = aov(Log.international.gross ~ Log.domestic.gross + Rating, weights=wt.intl[-15], data=movies2009[-c(15,119,120),]) Anova(intl.aovwls5, type="III") AIC(intl.lmwls5)+2*(intl.lmwls5$rank+1)*(intl.lmwls5$rank+2)/(intl.lmwls5$df.residual-1) intl.lmwls6 = lm(Log.international.gross ~ Log.domestic.gross, weights=wt.intl[-15], data=movies2009[-c(15,119,120),]) summary(intl.lmwls6) AIC(intl.lmwls6)+2*(intl.lmwls6$rank+1)*(intl.lmwls6$rank+2)/(intl.lmwls6$df.residual-1) # # Figure 10.7 # par(mfrow=c(2,2)) plot(fitted(intl.lmwls6), rstudent(intl.lmwls6), xlab="Fitted values", ylab="Standardized residuals", pch = 19) qqnorm(rstudent(intl.lmwls6), pch = 19) plot(c(1:126), hatvalues(intl.lmwls6), xlab="Index", ylab="Leverage", pch=19) plot(c(1:126), cooks.distance(intl.lmwls6), xlab="Index", ylab="Cook's D", pch=19) dev.off() # # OLS and WLS prediction intervals # 10^predict(intl.lm2, movies2010, interval="prediction") wt.2010 = 1/predict(intl.wt, movies2010, type="response") 10^predict(intl.lmwls6, movies2010, weights=wt.2010, interval="prediction") # # Chapter 11 # # Figure 11.1 # library(FAdist) x = c(-500:500)/100 par(mfrow=c(1,2)) plot(x, dnorm(x), type="l", ylab="Density of logged survival", main="(a) Normal and Logistic") lines(x,dlogis(x), lty=2) plot(x,exp(x-exp(x)), ylim=c(0,.6), type="l", ylab="Density of logged survival", main="(b) Extreme value") lines(x,1.5*exp(1.5*x-exp(1.5*x)), lty=2) lines(x,exp(x/2.-exp(x/2.))/2., lty=4) dev.off() # # Figure 11.2 # x = c(0:5000)/100 par(mfrow=c(1,2)) plot(x, dlnorm(x)/(1-plnorm(x)), type="l", ylab="Hazard of survival", main="(a) Lognormal and Log-logistic") lines(x, dllog(x)/(1-pllog(x)), lty=2) x = c(0:500)/100 plot(c(0,5), c(1,1), ylim=c(0,5.5), type="l", xlab= "x", ylab="Hazard of survival", main="(b) Weibull") lines(x, 1.5*sqrt(x), lty=2) lines(x, .5*x^(-.5), lty=4) dev.off() # # Broadway show longevity analyses # library(survival) broadway = read.csv("broadway.csv") bwaypostawards = broadway[((broadway$LimitedRun == 0)&(broadway$Open.at.Tonys==1)),] # # Kaplan-Meier curves # # # Figure 11.3 # par(mfrow=c(2,2)) plot(survfit(Surv(Post.award.performances, Closed) ~ Musical, data=bwaypostawards, conf.type="none"), mark.time=TRUE, lty=c(1,2), main="(a) Musical") legend(1800, .9, c("Not musical", "Musical"), lty = 1:2) plot(survfit(Surv(Post.award.performances, Closed) ~ Revival, data=bwaypostawards, conf.type="none"), 
mark.time=TRUE, lty=c(1,2), main="(b) Revival") legend(1900, .9, c("Not revival", "Revival"), lty = 1:2) plot(survfit(Surv(Post.award.performances, Closed) ~ Tony.awards, data=bwaypostawards, conf.type="none"), mark.time=TRUE, lty=c(1:6), main="(c) Tony Awards") legend(2800, .9, c("0", "1", "2", "3", "4", "5"), lty = 1:6) plot(survfit(Surv(Post.award.performances, Closed) ~ Losing.nominations, data=bwaypostawards, conf.type="none"), mark.time=TRUE, lty=c("solid", "dashed", "dotted", "dotdash", "longdash", "twodash", "1F", "5F"), main="(d) Losing nominations") legend(2800, .99, c("0", "1", "2", "3", "4", "5", "6", "7"), lty = c("solid", "dashed", "dotted", "dotdash", "longdash", "twodash", "1F", "5F")) dev.off() # # Log-rank tests # survdiff(Surv(Post.award.performances, Closed) ~ Musical, data=bwaypostawards) survdiff(Surv(Post.award.performances, Closed) ~ Revival, data=bwaypostawards) survdiff(Surv(Post.award.performances, Closed) ~ Tony.awards, data=bwaypostawards) survdiff(Surv(Post.award.performances, Closed) ~ Losing.nominations, data=bwaypostawards) # # Logged cumulative hazard (Nelson-Aalen) plots # # Figure 11.4 # par(mfrow=c(2,2)) plot(survfit(Surv(Post.award.performances, Closed) ~ Musical, data=bwaypostawards, conf.type="none"), mark.time=TRUE, lty=c(1,2), main="(a) Musical", fun="cloglog") legend(50, -2.5, c("Not musical", "Musical"), lty = 1:2) plot(survfit(Surv(Post.award.performances, Closed) ~ Revival, data=bwaypostawards, conf.type="none"), mark.time=TRUE, lty=c(1,2), main="(b) Revival", fun="cloglog") legend(50, -2.5, c("Not revival", "Revival"), lty = 1:2) plot(survfit(Surv(Post.award.performances, Closed) ~ Tony.awards, data=bwaypostawards, conf.type="none"), mark.time=TRUE, lty=c(1:6), main="(c) Tony Awards", fun="cloglog") legend(700, -.8, c("0", "1", "2", "3", "4", "5"), lty = 1:6) plot(survfit(Surv(Post.award.performances, Closed) ~ Losing.nominations, data=bwaypostawards, conf.type="none"), mark.time=TRUE, lty=c("solid", "dashed", "dotted", "dotdash", "longdash", "twodash", "1F", "5F"), main="(d) Losing nominations", fun="cloglog") legend(800, -.3, c("0", "1", "2", "3", "4", "5", "6", "7"), lty = c("solid", "dashed", "dotted", "dotdash", "longdash", "twodash", "1F", "5F"), cex=.8) dev.off() # # Figure 11.5 # par(mfrow=c(2,2)) plot(bwaypostawards$Att.post.award[bwaypostawards$Closed==1], log(bwaypostawards$Post.award.performances[bwaypostawards$Closed==1]), pch=1, xlab="First week attendance", ylab="Logged performances", main="(a)", ylim=c(1.5, 8.5)) points(bwaypostawards$Att.post.award[bwaypostawards$Closed==0], log(bwaypostawards$Post.award.performances[bwaypostawards$Closed==0]), pch="x") # # The ".3" rating variables are the ones used in the test # plot(bwaypostawards$NYT.rating.3[bwaypostawards$Closed==1], log(bwaypostawards$Post.award.performances[bwaypostawards$Closed==1]), pch=1, xlab="NY Times rating", ylab="Logged performances", main="(b)", ylim=c(1.5, 8.5)) points(bwaypostawards$NYT.rating.3[bwaypostawards$Closed==0], log(bwaypostawards$Post.award.performances[bwaypostawards$Closed==0]), pch="x") plot(bwaypostawards$DN.rating.3[bwaypostawards$Closed==1], log(bwaypostawards$Post.award.performances[bwaypostawards$Closed==1]), pch=1, xlab="Daily News rating", ylab="Logged performances", main="(c)", ylim=c(1.5, 8.5)) points(bwaypostawards$DN.rating.3[bwaypostawards$Closed==0], log(bwaypostawards$Post.award.performances[bwaypostawards$Closed==0]), pch="x") plot(bwaypostawards$USA.rating.3[bwaypostawards$Closed==1], 
log(bwaypostawards$Post.award.performances[bwaypostawards$Closed==1]), pch=1, xlab="USA Today rating", ylab="Logged performances", main="(d)", ylim=c(1.5, 8.5))
points(bwaypostawards$USA.rating.3[bwaypostawards$Closed==0], log(bwaypostawards$Post.award.performances[bwaypostawards$Closed==0]), pch="x")
dev.off()
#
# Weibull regression fits, including best subsets
#
ff1 = formula(Surv(Post.award.performances, Closed) ~ Musical + Revival + Tony.awards + Losing.nominations + Att.post.award + NYT.rating.3 + DN.rating.3 + USA.rating.3)
weibullfit.postawards.1 = survreg(ff1, data = bwaypostawards, dist="weibull")
summary(weibullfit.postawards.1)
#
# Likelihood ratio tests
#
drop1(weibullfit.postawards.1, test="Chisq")
#
# Exhaustively do best subsets
#
# Based on http://rpubs.com/kaz_yos/exhaustive
#
## Create vectors for outcome and predictors
#
outcome = "Surv(Post.award.performances, Closed)"
predictors = c("Musical", "Revival", "Att.post.award", "Tony.awards", "Losing.nominations", "NYT.rating.3", "DN.rating.3", "USA.rating.3")
#
## Create list of models
#
list.of.models = lapply(seq_along(predictors), function(n) {
    left.hand.side = outcome
    right.hand.side = apply(X = combn(predictors, n), MARGIN = 2, paste, collapse = " + ")
    paste(left.hand.side, right.hand.side, sep = " ~ ")
})
#
## Convert to a vector
#
vector.of.models <- unlist(list.of.models)
#
## Fit survreg to all models
#
list.of.fits = lapply(vector.of.models, function(x) {
    formula = as.formula(x)
    fit = survreg(formula, dist="weibull", data = bwaypostawards)
    result.AIC = extractAIC(fit)
    data.frame(num.predictors = result.AIC[1], AIC = result.AIC[2], model = x)
})
#
## Collapse to a data frame
#
result = do.call(rbind, list.of.fits)
#
## Sort and print
#
library(doBy)
orderBy(~ AIC, result)
weibullfit.postawards.2 = survreg(Surv(Post.award.performances, Closed) ~ Musical + Revival + Att.post.award + USA.rating.3, data = bwaypostawards, dist="weibull")
summary(weibullfit.postawards.2)
drop1(weibullfit.postawards.2, test="Chisq")
#
# Figure 11.6
#
qqnorm(residuals(weibullfit.postawards.2, type="deviance"), main="", pch=19)
dev.off()
#
# Mamma Mia! outlier - omit it, and rerun best subsets
#
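# (A sketch, not book output: the outlying show can be located directly from the
# deviance residuals of the chosen model; this should point to observation 16,
# which is the one omitted below.)
which.max(abs(residuals(weibullfit.postawards.2, type="deviance")))
#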
## Fit survreg to all models
#
list.of.fits = lapply(vector.of.models, function(x) {
    formula = as.formula(x)
    fit = survreg(formula, dist="weibull", data = bwaypostawards[-16,])
    result.AIC = extractAIC(fit)
    data.frame(num.predictors = result.AIC[1], AIC = result.AIC[2], model = x)
})
#
## Collapse to a data frame
#
result = do.call(rbind, list.of.fits)
#
## Sort and print
#
orderBy(~ AIC, result)
#
weibullfit.postawards.3 = survreg(Surv(Post.award.performances, Closed) ~ Musical + Revival + Att.post.award + Tony.awards + Losing.nominations + USA.rating.3, data = bwaypostawards[-16,], dist="weibull")
summary(weibullfit.postawards.3)
#
# Proportional hazards interpretations of coefficients
#
library(eha)
weibullfit.alt.postawards.3 = weibreg(Surv(Post.award.performances, Closed) ~ Musical + Revival + Att.post.award + Tony.awards + Losing.nominations + USA.rating.3, data = bwaypostawards[-16,])
summary(weibullfit.alt.postawards.3)
drop1(weibullfit.postawards.3, test="Chisq")
extractAIC(weibullfit.postawards.3)
#
# Exponential regression fit
#
expofit.postawards.3 = survreg(Surv(Post.award.performances, Closed) ~ Musical + Revival + Att.post.award + Tony.awards + Losing.nominations + USA.rating.3, data = bwaypostawards[-16,], dist="weibull", scale=1)
extractAIC(expofit.postawards.3)
#
# Cox proportional hazards fits
#
coxfit.postawards.1 = coxph(ff1, iter.max=100, na.action = na.omit, x = T, data = bwaypostawards)
summary(coxfit.postawards.1)
#
## Fit coxph to all models
#
list.of.fits = lapply(vector.of.models, function(x) {
    formula = as.formula(x)
    fit = coxph(formula, data = bwaypostawards)
    result.AIC = extractAIC(fit)
    data.frame(num.predictors = result.AIC[1], AIC = result.AIC[2], model = x)
})
#
## Collapse to a data frame
#
result = do.call(rbind, list.of.fits)
#
## Sort and print
#
orderBy(~ AIC, result)
coxfit.postawards.2 = coxph(Surv(Post.award.performances, Closed) ~ Musical + Revival + Att.post.award + Tony.awards + Losing.nominations, iter.max=100, na.action = na.omit, x = T, data = bwaypostawards)
summary(coxfit.postawards.2)
drop1(coxfit.postawards.2, test="Chisq")
cox.resD = residuals(coxfit.postawards.2, type="deviance")
cox.resD = (cox.resD-mean(cox.resD)) / sd(cox.resD)
#
# Figure 11.7
#
qqnorm(cox.resD, main="", pch=19)
dev.off()
#
# Omit Mamma Mia!
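#
# The exhaustive-search template above is reused just below for coxph (and again
# later in the chapter). As a hedged illustration, the repeated block could be
# wrapped in a helper (hypothetical name, not part of the book code):
#
fit.all.models = function(models, fitter, ...) {
    do.call(rbind, lapply(models, function(x) {
        result.AIC = extractAIC(fitter(as.formula(x), ...))
        data.frame(num.predictors = result.AIC[1], AIC = result.AIC[2], model = x)
    }))
}
# e.g. orderBy(~ AIC, fit.all.models(vector.of.models, coxph, data = bwaypostawards[-16,]))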
# ## Fit coxph to all models # list.of.fits = lapply(vector.of.models, function(x) { formula = as.formula(x) fit = coxph(formula, data = bwaypostawards[-16,]) result.AIC = extractAIC(fit) data.frame(num.predictors = result.AIC[1], AIC = result.AIC[2], model = x) }) # ## Collapse to a data frame # result = do.call(rbind, list.of.fits) # ## Sort and print # orderBy(~ AIC, result) coxfit.postawards.3 = coxph(Surv(Post.award.performances, Closed) ~ Musical + Revival + Att.post.award + Tony.awards + Losing.nominations + USA.rating.3, iter.max=100, na.action = na.omit, x = T, data = bwaypostawards[-16,]) summary(coxfit.postawards.3) drop1(coxfit.postawards.3, test="Chisq") # # Lognormal fit # lognormalfit.postawards.3 = survreg(Surv(Post.award.performances, Closed) ~ Musical + Revival + Att.post.award + Tony.awards + Losing.nominations + USA.rating.3, data = bwaypostawards[-16,], dist="lognormal") summary(lognormalfit.postawards.3) drop1(lognormalfit.postawards.3, test="Chisq") residuals(lognormalfit.postawards.3, type="deviance") plot(1:129, residuals(lognormalfit.postawards.3, type="deviance"), type="b") # # Buckley-James regression fit # library(rms) bjfit.postawards.3 = bj(Surv(Post.award.performances, Closed) ~ Musical + Revival + Att.post.award + Tony.awards + Losing.nominations + USA.rating.3, data = bwaypostawards[-16,], x = TRUE, y=TRUE) bjfit.postawards.3 exp(bjfit.postawards.3$coefficients) bjres = residuals(bjfit.postawards.3, type="censored.normalized")[,1] bjres # # AIC for lognormal higher than for same weibull model by 11 # extractAIC(lognormalfit.postawards.3) coxphtest3 = cox.zph(coxfit.postawards.3, transform="log") coxphtest3 # # Figure 11.8 # par(mfrow=c(3,2)) plot(coxphtest3[1], main="(a) Musical") plot(coxphtest3[2], main="(b) Revival") plot(coxphtest3[3], ylab="Beta(t) for Post-Awards attendance", main="(c) First week attendance") plot(coxphtest3[4], ylab="Beta(t) for Tony Awards", main="(d) Tony Awards") plot(coxphtest3[5], ylab="Beta(t) for Losing Tony nominations", main="(e) Losing Tony nominations") plot(coxphtest3[6], ylab="Beta(t) for USA Today rating", main="(f) USA Today rating") dev.off() # # Modeling total performances as LTRC with number of pre-award performances as the left truncation variable # bwaypostawards$Pre.award.performances = bwaypostawards$Total.performances - bwaypostawards$Post.award.performances ff1ltrc = formula(Surv(Pre.award.performances, Total.performances, Closed) ~ Musical + Revival + Att.post.award + Tony.awards + Losing.nominations + NYT.rating.3 + DN.rating.3 + USA.rating.3) coxfit.postawards.ltrc.1 = coxph(ff1ltrc, iter.max=100, na.action = na.omit, x = T, data = bwaypostawards) summary(coxfit.postawards.ltrc.1) # # Exhaustively do best subsets # # Based on http://rpubs.com/kaz_yos/exhaustive # ## Create vectors for outcome and predictors outcome = "Surv(Pre.award.performances, Total.performances, Closed)" predictors = c("Musical", "Revival", "Att.post.award", "Tony.awards", "Losing.nominations", "NYT.rating.3", "DN.rating.3", "USA.rating.3") # ## Create list of models # list.of.models = lapply(seq_along((predictors)), function(n) { left.hand.side = outcome right.hand.side = apply(X = combn(predictors, n), MARGIN = 2, paste, collapse = " + ") paste(left.hand.side, right.hand.side, sep = " ~ ") }) # ## Convert to a vector # vector.of.models <- unlist(list.of.models) # ## Fit survreg to all models # list.of.fits = lapply(vector.of.models, function(x) { formula = as.formula(x) fit = coxph(formula, data = bwaypostawards) result.AIC = 
extractAIC(fit) data.frame(num.predictors = result.AIC[1], AIC = result.AIC[2], model = x) }) # ## Collapse to a data frame # result = do.call(rbind, list.of.fits) # ## Sort and print # orderBy(~ AIC, result) coxfit.postawards.ltrc.2 = coxph(Surv(Pre.award.performances, Total.performances, Closed) ~ Musical + Revival + Att.post.award + Tony.awards + Losing.nominations, iter.max=100, na.action = na.omit, x = T, data = bwaypostawards) summary(coxfit.postawards.ltrc.2) drop1(coxfit.postawards.ltrc.2, test="Chisq") cox.zph(coxfit.postawards.ltrc.2, transform="log") cox.resD = residuals(coxfit.postawards.ltrc.2, type="deviance") cox.resD = (cox.resD-mean(cox.resD)) / sd(cox.resD) cox.resD coxfit.postawards.ltrc.3 = coxph(Surv(Pre.award.performances, Total.performances, Closed) ~ Musical + Revival + Att.post.award + Tony.awards + Losing.nominations + USA.rating.3, iter.max=100, na.action = na.omit, x = T, data = bwaypostawards) summary(coxfit.postawards.ltrc.3) drop1(coxfit.postawards.ltrc.3, test="Chisq") cox.zph(coxfit.postawards.ltrc.3, transform="log") cox.resD = residuals(coxfit.postawards.ltrc.3, type="deviance") cox.resD = (cox.resD-mean(cox.resD)) / sd(cox.resD) cox.resD # # Female heads of governments analyses, involving time-varying covariates # femalehead = read.csv("femalehead.csv") femalehead$Adj.start = femalehead$Start - 1959 femalehead$Adj.end = femalehead$End - 1959 femalehead$Log.GDP.pc = log(femalehead$GDP.pc) # # Figure 11.9 # par(mfrow=c(1,2)) plot(survfit(Surv(Adj.start, Adj.end, Female.head.govt) ~ Continent, data = femalehead, conf.type="none"), lty=c(1:6), main="Continent") legend(5, .6, c("Africa", "Asia", "Europe", "North America", "Oceania", "South America"), lty = 1:6, cex=.5) plot(survfit(Surv(Adj.start, Adj.end, Female.head.govt) ~ Religion.3, data=femalehead, conf.type="none"), lty=c(1:3), main="Religion") legend(10, .5, c("Christianity", "Islam", "Other"), lty = 1:3, cex=.5) dev.off() data.frame(femalehead$Country[femalehead$Female.head.govt==1], femalehead$Religion.3[femalehead$Female.head.govt==1], femalehead$End[femalehead$Female.head.govt==1]) # # Figure 11.10 # par(mfrow=c(3,2)) boxplot(femalehead$Birth.rate~femalehead$Female.head.govt, xlab="Female head of government", ylab="Birth rate", main="(a) Birth rate") boxplot(femalehead$Death.rate~femalehead$Female.head.govt, xlab="Female head of government", ylab="Death rate", main="(b) Death rate") boxplot(femalehead$Fertility.rate~femalehead$Female.head.govt, xlab="Female head of government", ylab="Fertility rate", main="(c) Fertility rate") boxplot(femalehead$Log.GDP.pc~femalehead$Female.head.govt, xlab="Female head of government", ylab="Logged GDP p/c", main="(d) Logged per capita GDP") boxplot(femalehead$Infant.mortality.rate~femalehead$Female.head.govt, xlab="Female head of government", ylab="Infant mortality rate", main="(e) Infant mortality rate") boxplot(femalehead$Female.pct~femalehead$Female.head.govt, xlab="Female head of government", ylab="Percentage female", main="(f) Percentage population female") dev.off() ffhead = formula(Surv(Adj.start, Adj.end, Female.head.govt) ~ Birth.rate + Death.rate + Fertility.rate + Log.GDP.pc + Infant.mortality.rate + Female.pct + Continent + Religion.3) cox.female.head.1 = coxph(ffhead, iter.max=100, na.action = na.omit, x = T, data = femalehead) summary(cox.female.head.1) # # Exhaustively do best subsets # Based on http://rpubs.com/kaz_yos/exhaustive # ## Create vectors for outcome and predictors # outcome = "Surv(Adj.start, Adj.end, Female.head.govt)" predictors = 
c("Birth.rate", "Death.rate", "Fertility.rate", "Log.GDP.pc", "Infant.mortality.rate", "Female.pct", "Continent", "Religion.3") # ## Create list of models # list.of.models = lapply(seq_along((predictors)), function(n) { left.hand.side = outcome right.hand.side = apply(X = combn(predictors, n), MARGIN = 2, paste, collapse = " + ") paste(left.hand.side, right.hand.side, sep = " ~ ") }) # ## Convert to a vector # vector.of.models <- unlist(list.of.models) # ## Fit survreg to all models # list.of.fits = lapply(vector.of.models, function(x) { formula = as.formula(x) fit = coxph(formula, data = femalehead) result.AIC = extractAIC(fit) data.frame(num.predictors = result.AIC[1], AIC = result.AIC[2], model = x) }) # ## Collapse to a data frame # result = do.call(rbind, list.of.fits) # ## Sort and print # orderBy(~ AIC, result) contrasts(femalehead$Religion.3) = contr.sum(3) cox.female.head.2 = coxph(Surv(Adj.start, Adj.end, Female.head.govt) ~ Birth.rate + Log.GDP.pc + Infant.mortality.rate + Religion.3, iter.max=100, na.action = na.omit, x = T, data = femalehead) summary(cox.female.head.2) # # To get line in output for "Other" category # library(car) Religion.4 = femalehead$Religion.3 Religion.4 = recode(Religion.4,"'Other'='AOther'") contrasts(Religion.4) = contr.sum(3) cox.female.head.3 = coxph(Surv(Adj.start, Adj.end, Female.head.govt) ~ Birth.rate + Log.GDP.pc + Infant.mortality.rate + Religion.4, iter.max=100, na.action = na.omit, x = T, data = femalehead) summary(cox.female.head.3) # # Chapter 12 # # Michaelis-Menten kinetics analyses # enzyme = read.csv("enzyme.csv") puromycin = enzyme[enzyme$Treated==1,] # # Figure 12.1 # invconc = 1/puromycin$Concentration invveloc = 1/puromycin$Velocity puro.lm = lm(invveloc ~ invconc) par(mfrow=c(3,1)) plot(invconc, invveloc, xlab="1 / Concentration", ylab="1 / Velocity", pch=19, main="(a)") abline(puro.lm) # # Determining initial values for nonlinear least squares fitting # theta1.lm = as.vector(1/coef(puro.lm)[1]) theta2.lm = as.vector(coef(puro.lm)[2]/coef(puro.lm)[1]) c(theta1.lm, theta2.lm) plot(puromycin$Concentration, puromycin$Velocity, xlab="Concentration", ylab="Velocity", pch=19, main="(b)") concgrid = c(0:115)/100 # # Nonlinear least squares fit # michment = function(x, theta1, theta2){theta1*x/(theta2+x)} lines(concgrid, michment(concgrid,theta1.lm,theta2.lm)) puro1.nls = nls(Velocity~theta1*Concentration/(theta2+Concentration), start=list(theta1=theta1.lm, theta2=theta2.lm), data=puromycin) puro2.nls = nls(Velocity~theta1*Concentration/(theta2+Concentration), start=list(theta1=theta1.lm, theta2=theta2.lm), data=puromycin[-1,]) plot(puromycin$Concentration, puromycin$Velocity, xlab="Concentration", ylab="Velocity", pch=19, main="(c)") lines(concgrid, michment(concgrid,coef(puro1.nls)[1],coef(puro1.nls)[2])) lines(concgrid, michment(concgrid,coef(puro2.nls)[1],coef(puro2.nls)[2]), lty=2) dev.off() summary(puro1.nls) # # Exploring the effect of the initial values # nls(Velocity~theta1*Concentration/(theta2+Concentration), start=list(theta1=theta1.lm/2, theta2=theta2.lm/2), data=puromycin) nls(Velocity~theta1*Concentration/(theta2+Concentration), start=list(theta1=theta1.lm/10, theta2=theta2.lm/10), data=puromycin) nls(Velocity~theta1*Concentration/(theta2+Concentration), start=list(theta1=theta1.lm/100, theta2=theta2.lm/100), data=puromycin) nls(Velocity~theta1*Concentration/(theta2+Concentration), start=list(theta1=theta1.lm*2, theta2=theta2.lm*2), data=puromycin) nls(Velocity~theta1*Concentration/(theta2+Concentration), 
start=list(theta1=theta1.lm*10, theta2=theta2.lm*10), data=puromycin) nls(Velocity~theta1*Concentration/(theta2+Concentration), start=list(theta1=theta1.lm*100, theta2=theta2.lm*100), data=puromycin) # # Figure 12.2 # par(mfrow=c(1,2)) plot(fitted(puro1.nls), resid(puro1.nls), xlab="Fitted values", ylab="Residuals", main="(a)", pch = 19) qqnorm(resid(puro1.nls), main="(b)", pch = 19) dev.off() # # Exploring pooled / constant shift / full model versions of models # puro3.nls = nls(Velocity~theta1*Concentration/(theta2+Concentration), start=list(theta1=theta1.lm, theta2=theta2.lm), data=enzyme[-1,]) puro4.nls = nls(Velocity~theta1*Concentration/(theta2+Concentration), start=list(theta1=theta1.lm, theta2=theta2.lm), data=enzyme) summary(puro4.nls) puro5.nls = nls(Velocity~(theta1+theta3*Treated)*Concentration/((theta2+theta4*Treated)+Concentration), start=list(theta1=theta1.lm, theta2=theta2.lm, theta3=0.0, theta4=0.0), data=enzyme) anova(puro4.nls, puro5.nls) summary(puro5.nls) puro6.nls = nls(Velocity~(theta1+theta3*Treated)*Concentration/(theta2+Concentration), start=list(theta1=theta1.lm, theta2=theta2.lm, theta3=0.0), data=enzyme) summary(puro6.nls) anova(puro4.nls, puro6.nls) anova(puro6.nls, puro5.nls) # # Identifying and omitting potential outliers # plot(fitted(puro6.nls), resid(puro6.nls), xlab="Fitted values", ylab="Residuals", pch = 19) qqnorm(resid(puro6.nls), pch = 19) puro7.nls = nls(Velocity~theta1*Concentration/(theta2+Concentration), start=list(theta1=theta1.lm, theta2=theta2.lm), data=enzyme[-c(1,13),]) summary(puro7.nls) puro8.nls = nls(Velocity~(theta1+theta3*Treated)*Concentration/((theta2+theta4*Treated)+Concentration), start=list(theta1=theta1.lm, theta2=theta2.lm, theta3=0.0, theta4=0.0), data=enzyme[-c(1,13),]) anova(puro7.nls, puro8.nls) summary(puro8.nls) puro9.nls = nls(Velocity~(theta1+theta3*Treated)*Concentration/(theta2+Concentration), start=list(theta1=theta1.lm, theta2=theta2.lm, theta3=0.0), data=enzyme[-c(1,13),]) summary(puro9.nls) anova(puro9.nls, puro7.nls) anova(puro9.nls, puro8.nls) # # Figure 12.3 # par(mfrow=c(1,2)) plot(puromycin$Concentration, puromycin$Velocity, xlab="Concentration", ylab="Velocity", pch=19, main="Complete data") points(enzyme$Concentration[enzyme$Treated==0], enzyme$Velocity[enzyme$Treated==0], pch="x") lines(concgrid, michment(concgrid,coef(puro5.nls)[1],coef(puro5.nls)[2])) lines(concgrid, michment(concgrid,coef(puro5.nls)[1]+coef(puro5.nls)[3],coef(puro5.nls)[2]+coef(puro5.nls)[4])) lines(concgrid, michment(concgrid,coef(puro6.nls)[1],coef(puro6.nls)[2]),lty=2) lines(concgrid, michment(concgrid,coef(puro6.nls)[1]+coef(puro6.nls)[3],coef(puro6.nls)[2]),lty=2) plot(puromycin$Concentration[-c(1,13)], puromycin$Velocity[-c(1,13)], xlab="Concentration", ylab="Velocity", pch=19, main="Omitting outliers") points(enzyme$Concentration[enzyme$Treated==0][-1], enzyme$Velocity[enzyme$Treated==0][-1], pch="x") lines(concgrid, michment(concgrid,coef(puro8.nls)[1],coef(puro8.nls)[2])) lines(concgrid, michment(concgrid,coef(puro8.nls)[1]+coef(puro8.nls)[3],coef(puro8.nls)[2]+coef(puro8.nls)[4])) lines(concgrid, michment(concgrid,coef(puro9.nls)[1],coef(puro9.nls)[2]),lty=2) lines(concgrid, michment(concgrid,coef(puro9.nls)[1]+coef(puro9.nls)[3],coef(puro9.nls)[2]),lty=2) dev.off() # # Chapter 13 # # Cancer tumor growth analyses # tumorsize = read.csv("tumorsize.csv") tumorsize$Drug = factor(tumorsize$Drug, order=T, levels=c("No drug", "Drug")) tumorsize$Radiation = factor(tumorsize$Radiation, order=T, levels=c("No radiation", "Radiation")) 
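#
# (Aside, a sketch: contr.treatment(2), assigned to the two treatment factors just
# below, codes a two-level factor as a 0/1 dummy with the first level as baseline,
# so Drug and Radiation enter the models as indicators.)
#
contr.treatment(2)
#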
contrasts(tumorsize$Drug) = contr.treatment(2) contrasts(tumorsize$Radiation) = contr.treatment(2) tumorsize$Treatment = factor(tumorsize$Treatment, order=T, levels=c("Control", "Drug", "Radiation", "Drug + Radiation")) tumorsize$Logged.tumor.size = log10(tumorsize$Tumor.size) tumorsize.complete = na.omit(tumorsize) library(ggplot2) # # Figure 13.1 # p = ggplot(data = tumorsize.complete, aes(x = Day, y = Tumor.size, group = Mouse)) p + geom_line() + facet_grid(vars(Drug), vars(Radiation)) + ylab("Tumor size") dev.off() # # Figure 13.2 # p + geom_line() + facet_grid(vars(Drug), vars(Radiation)) + scale_y_log10() + ylab("Tumor size") + stat_summary(aes(group = 1), geom = "line", fun = mean, size = 1.2) dev.off() # # Linear mixed effects model fit # library(lme4) Day.squared = tumorsize$Day^2 tumorsize = data.frame(tumorsize, Day.squared) tumor.lmm = lmer(Logged.tumor.size ~ Drug + Radiation + Drug*Radiation + Day + Day.squared + Drug*Radiation*Day + Drug*Radiation*Day.squared + (1 | Mouse), data=tumorsize) summary(tumor.lmm) # # Figure 13.3 # p = ggplot(data = tumorsize.complete, aes(x = Day, y = residuals(tumor.lmm), group = Mouse)) p + geom_line() + facet_grid(vars(Drug), vars(Radiation)) + ylab("Residuals") dev.off() # # Figure 13.4 # pf = ggplot(data = tumorsize.complete, aes(x = Day, y = 10^(fitted(tumor.lmm)), group = Mouse)) pf + geom_line(linetype="dashed") + facet_grid(vars(Drug), vars(Radiation)) + scale_y_log10() + ylab("Tumor size") + geom_line(aes(y = 10^(predict(tumor.lmm, re.form=NA))), size=1.1) dev.off() tumor.lmm.coef = summary(tumor.lmm)$coefficients tumor.lmm.coef[c(1,4,5)] tumor.lmm.coef[c(1,4,5)] + tumor.lmm.coef[c(2,7,9)] tumor.lmm.coef[c(1,4,5)] + tumor.lmm.coef[c(3,8,10)] tumor.lmm.coef[c(1,4,5)] + tumor.lmm.coef[c(2,7,9)] + tumor.lmm.coef[c(3,8,10)] + tumor.lmm.coef[c(6,11,12)] # # Table 13.1 # tumor.est = data.frame(Drug = c("No drug","No drug","No drug","Drug","Drug","Drug","No drug","No drug","No drug","Drug","Drug","Drug"), Radiation = c("No radiation", "No radiation", "No radiation", "No radiation", "No radiation", "No radiation", "Radiation", "Radiation", "Radiation", "Radiation", "Radiation", "Radiation"), Day = rep(c(5, 15, 25), 4), Day.squared = rep(c(25, 225, 625), 4)) predict(tumor.lmm, newdata=tumor.est, re.form=NA) tumor.lmm.coef[4] + 2.*tumor.lmm.coef[5]*c(5,15,25) tumor.lmm.coef[4] + tumor.lmm.coef[7] + 2.*(tumor.lmm.coef[5]+tumor.lmm.coef[9])*c(5,15,25) tumor.lmm.coef[4] + tumor.lmm.coef[8] + 2.*(tumor.lmm.coef[5]+tumor.lmm.coef[10])*c(5,15,25) tumor.lmm.coef[4] + tumor.lmm.coef[7] + tumor.lmm.coef[8] + tumor.lmm.coef[11] + 2.*(tumor.lmm.coef[5]+tumor.lmm.coef[9]+tumor.lmm.coef[10]+tumor.lmm.coef[12])*c(5,15,25) # # Nationwide shark attack longitudinal analyses # nationwide.shark.attacks = read.csv("nationwide.shark.attacks.csv") contrasts(nationwide.shark.attacks$Location) = contr.sum(3) nationwide.shark.attacks$attack.rate = nationwide.shark.attacks$Total.attacks/nationwide.shark.attacks$Population # # Figure 13.5 # library(lattice) xyplot(attack.rate ~ Year | State, type="p", col=1, data=nationwide.shark.attacks, ylab="Attack rate") dev.off() # # Figure 13.6 # par(mfrow=c(2,2)) plot(nationwide.shark.attacks$Shoreline[nationwide.shark.attacks$State=="Alabama"], nationwide.shark.attacks$attack.rate[nationwide.shark.attacks$State=="Alabama"], xlab="Miles of shoreline", ylab="Shark attack rate", ylim=c(0, .0000125), xlim=c(0, 1350), pch="A", main="(a)") points(nationwide.shark.attacks$Shoreline[nationwide.shark.attacks$State=="California"], 
nationwide.shark.attacks$attack.rate[nationwide.shark.attacks$State=="California"], pch="C") points(nationwide.shark.attacks$Shoreline[nationwide.shark.attacks$State=="Florida"], nationwide.shark.attacks$attack.rate[nationwide.shark.attacks$State=="Florida"], pch="F") points(nationwide.shark.attacks$Shoreline[nationwide.shark.attacks$State=="Hawaii"], nationwide.shark.attacks$attack.rate[nationwide.shark.attacks$State=="Hawaii"], pch="H") points(nationwide.shark.attacks$Shoreline[nationwide.shark.attacks$State=="North.Carolina"], nationwide.shark.attacks$attack.rate[nationwide.shark.attacks$State=="North.Carolina"], pch="N") points(nationwide.shark.attacks$Shoreline[nationwide.shark.attacks$State=="Oregon"], nationwide.shark.attacks$attack.rate[nationwide.shark.attacks$State=="Oregon"], pch="O") points(nationwide.shark.attacks$Shoreline[nationwide.shark.attacks$State=="South.Carolina"], nationwide.shark.attacks$attack.rate[nationwide.shark.attacks$State=="South.Carolina"], pch="S") points(nationwide.shark.attacks$Shoreline[nationwide.shark.attacks$State=="Texas"], nationwide.shark.attacks$attack.rate[nationwide.shark.attacks$State=="Texas"], pch="T") plot(nationwide.shark.attacks$Northern.latitude[nationwide.shark.attacks$State=="Alabama"], nationwide.shark.attacks$attack.rate[nationwide.shark.attacks$State=="Alabama"], xlab="Northernmost latitude", ylab="Shark attack rate", ylim=c(0, .0000125), xlim=c(20, 50), pch="A", main="(b)") points(nationwide.shark.attacks$Northern.latitude[nationwide.shark.attacks$State=="California"], nationwide.shark.attacks$attack.rate[nationwide.shark.attacks$State=="California"], pch="C") points(nationwide.shark.attacks$Northern.latitude[nationwide.shark.attacks$State=="Florida"], nationwide.shark.attacks$attack.rate[nationwide.shark.attacks$State=="Florida"], pch="F") points(nationwide.shark.attacks$Northern.latitude[nationwide.shark.attacks$State=="Hawaii"], nationwide.shark.attacks$attack.rate[nationwide.shark.attacks$State=="Hawaii"], pch="H") points(nationwide.shark.attacks$Northern.latitude[nationwide.shark.attacks$State=="North.Carolina"], nationwide.shark.attacks$attack.rate[nationwide.shark.attacks$State=="North.Carolina"], pch="N") points(nationwide.shark.attacks$Northern.latitude[nationwide.shark.attacks$State=="Oregon"], nationwide.shark.attacks$attack.rate[nationwide.shark.attacks$State=="Oregon"], pch="O") points(nationwide.shark.attacks$Northern.latitude[nationwide.shark.attacks$State=="South.Carolina"], nationwide.shark.attacks$attack.rate[nationwide.shark.attacks$State=="South.Carolina"], pch="S") points(nationwide.shark.attacks$Northern.latitude[nationwide.shark.attacks$State=="Texas"], nationwide.shark.attacks$attack.rate[nationwide.shark.attacks$State=="Texas"], pch="T") plot(nationwide.shark.attacks$Southern.latitude[nationwide.shark.attacks$State=="Alabama"], nationwide.shark.attacks$attack.rate[nationwide.shark.attacks$State=="Alabama"], xlab="Southernmost latitude", ylab="Shark attack rate", ylim=c(0, .0000125), xlim=c(15, 45), pch="A", main="(c)") points(nationwide.shark.attacks$Southern.latitude[nationwide.shark.attacks$State=="California"], nationwide.shark.attacks$attack.rate[nationwide.shark.attacks$State=="California"], pch="C") points(nationwide.shark.attacks$Southern.latitude[nationwide.shark.attacks$State=="Florida"], nationwide.shark.attacks$attack.rate[nationwide.shark.attacks$State=="Florida"], pch="F") points(nationwide.shark.attacks$Southern.latitude[nationwide.shark.attacks$State=="Hawaii"], 
nationwide.shark.attacks$attack.rate[nationwide.shark.attacks$State=="Hawaii"], pch="H") points(nationwide.shark.attacks$Southern.latitude[nationwide.shark.attacks$State=="North.Carolina"], nationwide.shark.attacks$attack.rate[nationwide.shark.attacks$State=="North.Carolina"], pch="N") points(nationwide.shark.attacks$Southern.latitude[nationwide.shark.attacks$State=="Oregon"], nationwide.shark.attacks$attack.rate[nationwide.shark.attacks$State=="Oregon"], pch="O") points(nationwide.shark.attacks$Southern.latitude[nationwide.shark.attacks$State=="South.Carolina"], nationwide.shark.attacks$attack.rate[nationwide.shark.attacks$State=="South.Carolina"], pch="S") points(nationwide.shark.attacks$Southern.latitude[nationwide.shark.attacks$State=="Texas"], nationwide.shark.attacks$attack.rate[nationwide.shark.attacks$State=="Texas"], pch="T") boxplot(nationwide.shark.attacks$attack.rate~ nationwide.shark.attacks$Location, ylab="Attack rate", xlab="Location", main="(d)") dev.off() # # Generalized linear (Poisson) mixed model fits # Years.since.2001 = nationwide.shark.attacks$Year - 2001 nationalshark1.pois = glmer(Total.attacks ~ offset(log(Population)) + Years.since.2001 + Northern.latitude + Southern.latitude + Location + (1 | State), family=poisson, data=nationwide.shark.attacks) summary(nationalshark1.pois) nationalshark2.pois = glmer(Total.attacks ~ offset(log(Population)) + Years.since.2001 + Northern.latitude + Location + (1 | State), family=poisson, data=nationwide.shark.attacks) summary(nationalshark2.pois) nationalshark3.pois = glmer(Total.attacks ~ offset(log(Population)) + Northern.latitude + Location + (1 | State), family=poisson, data=nationwide.shark.attacks) summary(nationalshark3.pois) cor(fitted(nationalshark1.pois, type="pearson"), fitted(nationalshark3.pois, type="pearson")) nationalshark4.pois = glmer(Total.attacks ~ offset(log(Population)) + Northern.latitude + Ocean + (1 | State), family=poisson, data=nationwide.shark.attacks) summary(nationalshark4.pois) # # Negative binomial mixed model fits # nationalshark1.nb = glmer.nb(Total.attacks ~ offset(log(Population)) + Years.since.2001 + Northern.latitude + Southern.latitude + Location + (1 | State), data=nationwide.shark.attacks) summary(nationalshark1.nb) nationalshark2.nb = glmer.nb(Total.attacks ~ offset(log(Population)) + Years.since.2001 + Northern.latitude + Location + (1 | State), data=nationwide.shark.attacks) summary(nationalshark2.nb) nationalshark3.nb = glmer.nb(Total.attacks ~ offset(log(Population)) + Northern.latitude + Location + (1 | State), data=nationwide.shark.attacks) summary(nationalshark3.nb) nationalshark4.nb = glmer.nb(Total.attacks ~ offset(log(Population)) + Northern.latitude + Ocean + (1 | State), data=nationwide.shark.attacks) summary(nationalshark4.nb) # # Figure 13.7 # par(mfrow=c(1,2)) boxplot(residuals(nationalshark4.pois, type="pearson") ~ nationwide.shark.attacks$State, ylab="Pearson residuals", xlab="State", names=c("A","C","F","H","N","O","S","T"), main="Poisson", cex.axis=.9) boxplot(residuals(nationalshark4.nb, type="pearson") ~ nationwide.shark.attacks$State, ylab="Pearson residuals", xlab="State", names=c("A","C","F","H","N","O","S","T"), main="Negative binomial", cex.axis=.9) dev.off() # # Generalized estimating equation (GEE) fits # library(gee) nationalshark1.gee = gee(Total.attacks ~ offset(log(Population)) + Years.since.2001 + Northern.latitude + Southern.latitude + Location, State, family=poisson, data=nationwide.shark.attacks) summary(nationalshark1.gee) nationalshark4.gee = 
gee(Total.attacks ~ offset(log(Population)) + Northern.latitude + Ocean, State, family=poisson, data=nationwide.shark.attacks) summary(nationalshark4.gee) # # Chapter 14 # library(glmnet) price.var = cbind(homeprices$Bedrooms/sd(homeprices$Bedrooms), homeprices$Bathrooms/sd(homeprices$Bathrooms), homeprices$Living.area/sd(homeprices$Living.area), homeprices$Lot.size/sd(homeprices$Lot.size), homeprices$Year.built/sd(homeprices$Year.built), homeprices$Property.tax/sd(homeprices$Property.tax)) price.resp = homeprices$Sale.price/sd(homeprices$Sale.price) # # Lasso fits on Levittown home prices data # homeprice.lasso = glmnet(price.var, price.resp) # # Figure 14.1 # plot(homeprice.lasso$lambda,coef(homeprice.lasso)[2,], type="l", ylim=c(-.2,.5), xlab=expression(paste(lambda)), ylab="Slopes", lty=3) lines(homeprice.lasso$lambda,coef(homeprice.lasso)[3,], lty=2) lines(homeprice.lasso$lambda,coef(homeprice.lasso)[4,]) lines(homeprice.lasso$lambda,coef(homeprice.lasso)[5,], lty=4) lines(homeprice.lasso$lambda,coef(homeprice.lasso)[6,], lty=5) lines(homeprice.lasso$lambda,coef(homeprice.lasso)[7,], lty=6) legend(.35, .5, lty=c(3:1,4:6), c("Bedrooms", "Bathrooms", "Living area", "Lot size", "Year built", "Property tax")) dev.off() # # This data set is based on the Kaggle data set at www.kaggle.com/undp/human-development. It was # made available by the United Nations Development Program under license CC BY-SA 4.0 (Attribution- # ShareAlike 4.0 International). It has been modified to rename several variables and only include # observations without missing values for the variables being used. # hdi = read.csv("hdi.complete.csv") library(robustHD) hdi[,2:30] = standardize(hdi[,2:30]) # # OLS on all predictors # hdi.ols = lm(HDI ~ .-Country, data=hdi) summary(hdi.ols) library(car) vif(hdi.ols) # # OLS on predictors with p < .05 # hdi.signif = lm(data.frame(hdi[,c(30,4,6,13,26:28)])) summary(hdi.signif) coef(hdi.signif) # # Lasso with 10-fold CV # hdi.var = as.matrix(hdi[,2:29]) hdi.resp = hdi[,30] hdi.lasso = glmnet(hdi.var, hdi.resp) set.seed(1234) hdi.cv.lasso = cv.glmnet(hdi.var, hdi.resp) hdi.cv.lasso hdi.lasso$beta[,42] # # Lasso-plus-OLS # hdi.lasso.plus.ols = lm(data.frame(hdi[,c(30,6:10,12,13,15,17,18,23,24,26:28)])) summary(hdi.lasso.plus.ols) coef(hdi.lasso.plus.ols) coef(hdi.lasso.plus.ols)[-1]/hdi.lasso$beta[c(5:9,11,12,14,16,17,22,23,25:27),42] # # Best subsets. The bestglm package provides 10-fold CV for best subsets. 
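# As a rough illustration of the criterion being minimized (a sketch, not book
# output), the 10-fold CV error of any single candidate model can be computed directly:
set.seed(1234)
fold = sample(rep(1:10, length.out = nrow(hdi)))
mean(sapply(1:10, function(k) {
    fit = lm(HDI ~ Infant.mortality.rate + Internet.user.pct, data = hdi[fold != k,])
    mean((hdi$HDI[fold == k] - predict(fit, hdi[fold == k,]))^2)
}))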
# library(bestglm) Xy = data.frame(hdi.var, hdi.resp) set.seed(1234) hdi.bestsubsets = bestglm(Xy, IC="CV", CVArgs=list(Method="HTF", K=10, REP=1)) hdi.bestsubsets$Subsets hdi.bestsubsets.ols = lm(HDI ~ Pop.ave.annual.growth.pct.2000.2005+Urban.pop.pct+Aged.dependency.ratio+Fertility.rate.2000.2005+Infant.mortality.rate+Unemployment.rate+Old.age.pension.recipient.pct+Internet.user.pct +Internet.user.pct.increase.2010.2015, data=hdi) summary(hdi.bestsubsets.ols) coef(hdi.bestsubsets.ols) vif(hdi.bestsubsets.ols) # # Forward stepwise # library(caret) set.seed(1234) train.control = trainControl(method = "cv", number = 10) hdi.forward = train(HDI ~ .-Country, method = "leapForward", trControl = train.control, tuneGrid = data.frame(nvmax = 1:28), data=hdi) hdi.forward coef(hdi.forward$finalModel, 9) summary(lm(HDI ~ Fertility.rate.2000.2005+Infant.mortality.rate+Internet.user.pct, data=hdi)) # # Chapter 15 # # Figure 15.1 # f = function(x){16.25-5*x+x*x-12*exp(-2*x)} x = (0:50)/10 set.seed(100) y1 = f(x)+rnorm(51)*2 library(locfit) par(mfrow=c(1,2)) plot(x, y1, type="l", ylim=c(0,20), xlab="x", ylab="y", lty=4) points(x, y1, pch=".") abline(lsfit(x,y1)) lines(locfit(y1 ~ lp(x, nn=.4)), lty=2, lwd=2) y2 = f(x)+rnorm(51)*2 plot(x, y2, type="l", ylim=c(0,20), xlab="x", ylab="y", lty=4) points(x, y2, pch=".") abline(lsfit(x,y2)) lines(locfit(y2 ~ lp(x, nn=.4)), lty=2, lwd=2) dev.off() # # Figure 15.2 # xsq = x*x lml11 = lm(y1[1:11] ~ x[1:11]) lmq11 = lm(y1[1:11] ~ x[1:11] + xsq[1:11]) lml21 = lm(y1[21:31] ~ x[21:31]) lmq21 = lm(y1[21:31] ~ x[21:31] + xsq[21:31]) lml31 = lm(y1[36:46] ~ x[36:46]) lmq31 = lm(y1[36:46] ~ x[36:46] + xsq[36:46]) lml12 = lm(y2[1:11] ~ x[1:11]) lmq12 = lm(y2[1:11] ~ x[1:11] + xsq[1:11]) lml22 = lm(y2[21:31] ~ x[21:31]) lmq22 = lm(y2[21:31] ~ x[21:31] + xsq[21:31]) lml32 = lm(y2[36:46] ~ x[36:46]) lmq32 = lm(y2[36:46] ~ x[36:46] + xsq[36:46]) par(mfrow=c(1,2)) plot(x, y1, pch=".", ylim=c(0,20), xlab="x", ylab="y") curve(coef(lml11)[1]+coef(lml11)[2]*x, 0, 1, add=T) curve(coef(lmq11)[1]+coef(lmq11)[2]*x+coef(lmq11)[3]*x*x, 0, 1, add=T, lty=2) abline(v=.5) curve(coef(lml21)[1]+coef(lml21)[2]*x, 2, 3, add=T) curve(coef(lmq21)[1]+coef(lmq21)[2]*x+coef(lmq21)[3]*x*x, 2, 3, add=T, lty=2) abline(v=2.5) curve(coef(lml31)[1]+coef(lml31)[2]*x, 3.5, 4.5, add=T) curve(coef(lmq31)[1]+coef(lmq31)[2]*x+coef(lmq31)[3]*x*x, 3.5, 4.5, add=T, lty=2) abline(v=4) plot(x, y2, pch=".", ylim=c(0,20), xlab="x", ylab="y") curve(coef(lml12)[1]+coef(lml12)[2]*x, 0, 1, add=T) curve(coef(lmq12)[1]+coef(lmq12)[2]*x+coef(lmq12)[3]*x*x, 0, 1, add=T, lty=2) abline(v=.5) curve(coef(lml22)[1]+coef(lml22)[2]*x, 2, 3, add=T) curve(coef(lmq22)[1]+coef(lmq22)[2]*x+coef(lmq22)[3]*x*x, 2, 3, add=T, lty=2) abline(v=2.5) curve(coef(lml32)[1]+coef(lml32)[2]*x, 3.5, 4.5, add=T) curve(coef(lmq32)[1]+coef(lmq32)[2]*x+coef(lmq32)[3]*x*x, 3.5, 4.5, add=T, lty=2) abline(v=4) dev.off() # # Processing of used auto price data originally from Kaggle database at https://www.kaggle.com/orgesleka/used-cars-database # These data are in the public domain. 
#
auto = read.csv("autos.csv")
colnames(auto)[2] = "Name"
colnames(auto)[5] = "Price"
colnames(auto)[8] = "Year.of.Registration"
colnames(auto)[9] = "Gearbox"
colnames(auto)[12] = "Kilometers"
colnames(auto)[14] = "Fuel.type"
aa = auto[((auto$Year.of.Registration>=1970)&(auto$Year.of.Registration<2016)&(auto$Price<4000000)&(auto$Gearbox!="")&(auto$Price>1)&((auto$Fuel.type=="benzin")|(auto$Fuel.type=="diesel"))),]
aa$Fuel.type = droplevels(aa$Fuel.type)
aa$Gearbox = droplevels(aa$Gearbox)
aa$Log.price = log10(aa$Price)
aa$Diesel = as.numeric(aa$Fuel.type=="diesel")
aa$Manual = as.numeric(aa$Gearbox=="manuell")
#
# Creating March 5, 2016 and March 6, 2016 data
#
auto.March5 = aa[as.Date(aa$dateCrawled)=="2016-03-05",c("Name","Year.of.Registration","Manual","Kilometers","Diesel","Log.price")]
auto.March6 = aa[as.Date(aa$dateCrawled)=="2016-03-06",]
#
# The site provides these data frames as files
#
# auto.March5 = read.csv("autoMarch5.csv")
# auto.March6 = read.csv("autoMarch6.csv")
#
# Figure 15.3
#
par(mfrow=c(2,2))
plot(auto.March5$Year.of.Registration, auto.March5$Log.price, xlab="Year of Registration", ylab="Logged price", pch = 19)
plot(auto.March5$Kilometers, auto.March5$Log.price, xlab="Kilometers driven", ylab="Logged price", pch = 19)
boxplot(auto.March5$Log.price~auto.March5$Manual, xlab="Manual transmission", ylab="Logged price")
boxplot(auto.March5$Log.price~auto.March5$Diesel, xlab="Diesel", ylab="Logged price")
dev.off()
#
# Choosing smoothing parameter for local quadratic additive model
#
# AICc choice for Year of registration too small (.15), for Kilometers around .42
#
library(gam)
grid1 = seq(.15, .53, .02)
grid2 = seq(.2, .77, .03)
autopriceaicc = data.frame(Year.of.Registration=rep(NA,400), Kilometers=rep(NA,400), AICC=rep(NA,400))
n = nrow(auto.March5)
ii = 1
for (i in (1:20)){
    for (j in (1:20)){
        gg = gam(Log.price ~ lo(Year.of.Registration, span=grid1[i]) + lo(Kilometers, span=grid2[j]) + Manual + Diesel, data=auto.March5)
        autopriceaicc$Year.of.Registration[ii] = grid1[i]
        autopriceaicc$Kilometers[ii] = grid2[j]
        df = sum(gg$nl.df) + 3
        autopriceaicc$AICC[ii] = gg$aic + 2 * (df + 1) * (df + 2) / (n - df - 2)
        ii = ii + 1}}
autopriceaicc
autoprice.March5.gam1 = gam(Log.price ~ lo(Year.of.Registration, span=.15) + lo(Kilometers, span=.42) + Manual + Diesel, data=auto.March5)
#
# Figure 15.4
#
par(mfrow=c(2,2))
# Reply 1, 2, 0 to the command below
plot(autoprice.March5.gam1, rugplot=F, ylab="", main="AICc choice", ask=T)
summary(autoprice.March5.gam1)
coef(autoprice.March5.gam1)
autoprice.March5.gam2 = gam(Log.price ~ lo(Year.of.Registration, span=.3) + lo(Kilometers, span=.5) + Manual + Diesel, data=auto.March5)
# Reply 1, 2, 0 to the command below
plot(autoprice.March5.gam2, rugplot=F, ylab="", main="Smoother choice", ask=T)
dev.off()
summary(autoprice.March5.gam2)
coef(autoprice.March5.gam2)
autoprice.March5.lm = lm(Log.price ~ Year.of.Registration + Kilometers + Manual + Diesel, data=auto.March5)
pred.March6.gam1 = predict(autoprice.March5.gam1, auto.March6)
pred.March6.gam2 = predict(autoprice.March5.gam2, auto.March6)
pred.March6.lm = predict(autoprice.March5.lm, auto.March6)
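#
# (A sketch, not book output: the March 6 holdout comparisons just below can be
# collected with a small helper; the name is hypothetical.)
#
holdout.summary = function(pred, actual = auto.March6$Log.price)
    c(cor = cor(pred, actual), mean.error = mean(actual - pred),
      sd.error = sd(actual - pred), rmse = sqrt(mean((actual - pred)^2)))
# e.g. holdout.summary(pred.March6.gam1)
#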
cor(pred.March6.gam1, auto.March6$Log.price)
cor(pred.March6.gam2, auto.March6$Log.price)
cor(pred.March6.lm, auto.March6$Log.price)
mean(auto.March6$Log.price - pred.March6.gam1)
mean(auto.March6$Log.price - pred.March6.gam2)
mean(auto.March6$Log.price - pred.March6.lm)
sd(auto.March6$Log.price - pred.March6.gam1)
sd(auto.March6$Log.price - pred.March6.gam2)
sd(auto.March6$Log.price - pred.March6.lm)
sqrt(mean((auto.March6$Log.price - pred.March6.gam1)^2))
sqrt(mean((auto.March6$Log.price - pred.March6.gam2)^2))
sqrt(mean((auto.March6$Log.price - pred.March6.lm)^2))
mean(abs(auto.March6$Log.price - pred.March6.gam1) < abs(auto.March6$Log.price - pred.March6.gam2))
mean(abs(auto.March6$Log.price - pred.March6.gam1) < abs(auto.March6$Log.price - pred.March6.lm))
mean(abs(auto.March6$Log.price - pred.March6.gam2) < abs(auto.March6$Log.price - pred.March6.lm))
#
# Bechdel test analyses
#
library(fivethirtyeight)
data(package="fivethirtyeight")
bechdel$Pass = as.numeric(bechdel$binary=="PASS")
bechdel$Log.budget = log(bechdel$budget_2013)
library(data.table)
setnames(bechdel, "year", "Year")
#
# Figure 15.5
#
par(mfrow=c(1,2))
boxplot(split(bechdel$Year,bechdel$Pass), xlab="Pass Bechdel test", ylab="Year")
boxplot(split(bechdel$Log.budget,bechdel$Pass), xlab="Pass Bechdel test", ylab="Logged budget")
dev.off()
bechdel.glm = glm(Pass ~ Year + Log.budget, family=binomial, data=bechdel)
summary(bechdel.glm)
library(boot)
bechdel.diag = glm.diag(bechdel.glm)
#
# Figure 15.6
#
par(mfrow=c(1,2))
scatter.smooth(bechdel$Year, bechdel.diag$rp, xlab="Year", ylab="Standardized residuals", pch=19)
scatter.smooth(bechdel$Log.budget, bechdel.diag$rp, xlab="Logged budget", ylab="Standardized residuals", pch=19)
dev.off()
#
# Choosing smoothing parameter for local quadratic generalized additive model
#
grid1 = seq(.5, .88, .02)
grid2 = seq(.5, .88, .02)
bechdelaicc = data.frame(Year=rep(NA,400), Log.budget=rep(NA,400), AICC=rep(NA,400))
n = nrow(bechdel)
ii = 1
for (i in (1:20)){
    for (j in (1:20)){
        gg = gam(Pass ~ lo(Year, span=grid1[i]) + lo(Log.budget, span=grid2[j]), family=binomial, data=bechdel)
        bechdelaicc$Year[ii] = grid1[i]
        bechdelaicc$Log.budget[ii] = grid2[j]
        df = sum(gg$nl.df) + 3
        bechdelaicc$AICC[ii] = gg$aic + 2 * (df + 1) * (df + 2) / (n - df - 2)
        ii = ii + 1}}
bechdelaicc
bechdel.gam = gam(Pass ~ lo(Year, span=.74) + lo(Log.budget, span=.74), family=binomial, data=bechdel)
#
# Figure 15.7
#
par(mfrow=c(1,2))
# Reply 1, 2, 0 to command below
plot(bechdel.gam, rugplot=F, ylab="", ask=T)
dev.off()
summary(bechdel.gam)
coef(bechdel.gam)
#
# Levittown home prices data
#
grid = seq(.1, 1.09, .01)
homepriceaicc = data.frame(span=rep(NA,100), AICC=rep(NA,100))
n = nrow(homeprices)
for (i in (1:100)){
    gg = gam(Sale.price ~ Bathrooms+lo(Living.area, Year.built, span=grid[i]), data=homeprices)
    homepriceaicc$span[i] = grid[i]
    df = sum(gg$nl.df) + 2
    homepriceaicc$AICC[i] = gg$aic + 2 * (df + 1) * (df + 2) / (n - df - 2)}
homepriceaicc
homeprice.gam = gam(Sale.price ~ Bathrooms+lo(Living.area, Year.built, span=.57), data=homeprices)
summary(homeprice.gam)
pred.homprice.gam = predict(homeprice.gam)
adjusted.pred.homprice.gam = predict(homeprice.gam) - coef(homeprice.gam)[2]*homeprices$Bathrooms
homeprice.plot.frame = data.frame(Living.area=homeprices$Living.area, Year.built=homeprices$Year.built, pred=pred.homprice.gam, adjusted.pred=adjusted.pred.homprice.gam)
library(lattice)
#
# Figure 15.8
#
# The 10 panels correspond to years 1948 1949 1950 1951 1952 1953 1954 1955 1961 1962
#
xyplot(adjusted.pred ~ Living.area | Year.built, ylab="Adjusted estimated price", xlab="Living area", homeprice.plot.frame, col=1, pch=19)
dev.off()
#
# Chapter 16
#
set.seed(1601)
x1 = runif(100,0,50)
x2 = runif(100,0,50)
x2[4] = 20.01
ytrue = rep(NA,100)
errors = rnorm(100)*7
ytrue[x1 < 25] = 50
ytrue[(x1 >= 25)&(x2 < 20)] = 30
ytrue[(x1 >= 25)&(x2 >= 20)] = 70
y = ytrue + errors
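#
# (A sketch, not book output: the simulated true surface is piecewise constant,
# with mean 50 for x1 < 25, mean 30 for x1 >= 25 and x2 < 20, and mean 70 otherwise.)
#
tapply(ytrue, interaction(x1 < 25, x2 < 20), unique)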
#
# Figure 16.1
#
par(mfrow=c(1,2))
plot(x1, y, xlab=expression(x[1]), pch=19)
plot(x2, y, xlab=expression(x[2]), pch=19)
dev.off()
summary(lm(y~x1+x2))
#
# Figure 16.2
#
symbols(x1,x2,circles=sqrt(y)/10, xlab=expression(x[1]), ylab=expression(x[2]), inches=F)
abline(v=25, lty=2)
lines(c(25,50), c(20,20), lty=2)
dev.off()
#
# Figure 16.3 was not created using R
#
ind1 = as.numeric(x1 < 25)
ind2 = as.numeric((x1 >= 25)&(x2 >= 20))
summary(lm(y~ind1+ind2))
#
# Levittown home price regression tree analyses
#
library(rpart)
library(partykit)
#
# CART tree pruned using 10-fold CV and the one-SE rule
#
set.seed(203)
price.rpart = rpart(Sale.price ~ Bedrooms+Bathrooms+Living.area+Lot.size+Year.built+Property.tax, data=homeprices)
#
# Finding the cost complexity corresponding to the one-SE rule
#
cventry = which.min(price.rpart$cptable[, "xerror"])
xerrorcv = price.rpart$cptable[cventry, "xerror"]
sexerrorcv = xerrorcv + price.rpart$cptable[cventry, "xstd"]
cpcvse = price.rpart$cptable[which.max(price.rpart$cptable[, "xerror"] <= sexerrorcv), "CP"]
price.rpart.prune = prune(price.rpart, cp = cpcvse)
#
# Figure 16.4a
#
plot(as.party(price.rpart), gp=gpar(fontsize = 6), main="Unpruned CART tree")
dev.off()
#
# Figure 16.4b
#
plot(as.party(price.rpart.prune), main="Pruned CART tree")
dev.off()
#
# Conditional inference tree
#
price.ctree = ctree(Sale.price ~ Bedrooms+Bathrooms+Living.area+Lot.size+Year.built+Property.tax, data=homeprices)
#
# Figure 16.5
#
plot(price.ctree, inner_panel=node_inner(price.ctree, pval = FALSE))
dev.off()
#
# Airplane seat reclining classification tree analyses
#
library(fivethirtyeight)
data(flying)
flying$Obligation = as.factor(ifelse(flying$recline_obligation, 'Yes', 'No'))
names(flying)[c(3,9,12,16,18)] = c("Age","Frequency","Rude","Wake.up.bathroom","Baby")
flying.full = na.omit(flying[,c(2,3,4,6:12,16:19,25,28)])
set.seed(301)
recline.rpart = rpart(recline_frequency~Rude+gender+Age+height+household_income+education+location+Frequency+Obligation+Baby+unruly_child+Wake.up.bathroom+wake_up_walk+get_up, data=flying.full)
#
# Finding the cost complexity corresponding to the one-SE rule
#
cventry = which.min(recline.rpart$cptable[, "xerror"])
xerrorcv = recline.rpart$cptable[cventry, "xerror"]
sexerrorcv = xerrorcv + recline.rpart$cptable[cventry, "xstd"]
cpcvse = recline.rpart$cptable[which.max(recline.rpart$cptable[, "xerror"] <= sexerrorcv), "CP"]
recline.rpart.prune = prune(recline.rpart, cp = cpcvse)
recline.ctree = ctree(recline_frequency~Rude+factor(gender)+Age+height+household_income+education+factor(location)+Frequency+Obligation+Baby+unruly_child+Wake.up.bathroom+wake_up_walk+get_up, data=flying.full)
#
# Figure 16.6a
#
frame()
title(main="Pruned CART tree", cex.main=.7)
plot(as.party(recline.rpart.prune), gp=gpar(fontsize = 8), newpage=FALSE)
dev.off()
#
# Figure 16.6b
#
frame()
title(main="Conditional inference tree", cex.main=.7)
plot(recline.ctree, gp=gpar(fontsize=8), newpage=FALSE, inner_panel=node_inner, ip_args=list(pval = FALSE))
dev.off()
set.seed(350)
rude.rpart = rpart(Rude~gender+Age+height+household_income+education+location+Frequency+Obligation+Baby+unruly_child+Wake.up.bathroom+wake_up_walk+get_up, data=flying.full)
#
# Finding the cost complexity corresponding to the one-SE rule
#
cventry = which.min(rude.rpart$cptable[, "xerror"])
xerrorcv = rude.rpart$cptable[cventry, "xerror"]
sexerrorcv = xerrorcv + rude.rpart$cptable[cventry, "xstd"]
cpcvse = rude.rpart$cptable[which.max(rude.rpart$cptable[, "xerror"] <= sexerrorcv), "CP"]
rude.rpart.prune =
prune(rude.rpart, cp = cpcvse) rude.rpart.prune rude.ctree = ctree(Rude~factor(gender)+Age+height+household_income+education+factor(location)+Frequency+Obligation+Baby+unruly_child+Wake.up.bathroom+wake_up_walk+get_up, data=flying.full) # # Figure 16.7 # plot(rude.ctree, gp=gpar(fontsize=6), inner_panel=node_inner, ip_args=list(pval = FALSE)) dev.off() # # The conditional inference REEM tree builds off the CART version available in the REEMtree package. # Load the following packages first, and copy and paste the REEMctree function into the workspace. # See http://people.stern.nyu.edu/jsimonof/unbiasedREEM/. # library(REEMtree) library(party) # #-------------------------start of main function---------------------------------------------------- REEMctree <- function (formula, data, random, subset = NULL, initialRandomEffects = rep(0, TotalObs), ErrorTolerance = 0.001, MaxIterations = 1000, verbose = FALSE, lme.control = lmeControl(returnObject = TRUE), ctree.control = party::ctree_control(mincriterion = 0.95), method = "REML", correlation = NULL) { TotalObs <- dim(data)[1] if (identical(subset, NULL)) { subs <- rep(TRUE, dim(data)[1]) } else { subs <- subset } Predictors <- paste(attr(terms(formula), "term.labels"), collapse = "+") TargetName <- formula[[2]] if (length(TargetName) > 1) TargetName <- TargetName[3] if (verbose) print(paste("Target variable: ", TargetName)) Target <- data[, toString(TargetName)] ContinueCondition <- TRUE iterations <- 0 AdjustedTarget <- Target - initialRandomEffects oldlik <- -Inf newdata <- data newdata[, "SubsetVector"] <- subs while (ContinueCondition) { newdata[, "AdjustedTarget"] <- AdjustedTarget iterations <- iterations + 1 tree <- party::ctree(formula(paste(c("AdjustedTarget", Predictors), collapse = "~")), data = newdata, subset = subs, controls = ctree.control) if (verbose) print(tree) newdata[, "nodeInd"] <- 0 newdata[subs, "nodeInd"] <- where(tree) if (min(where(tree)) == max(where(tree))) { #it doesn't split on root lmefit <- lme(formula(paste(c(toString(TargetName), 1), collapse = "~")), data = newdata, random = random, subset = SubsetVector, method = method, control = lme.control, correlation = correlation) } else { lmefit <- lme(formula(paste(c(toString(TargetName), "as.factor(nodeInd)"), collapse = "~")), data = newdata, random = random, subset = SubsetVector, method = method, control = lme.control, correlation = correlation) } if (verbose) { print(lmefit) print(paste("Estimated Error Variance = ", lmefit$sigma)) print("Estimated Random Effects Variance = ") print(as.matrix(lmefit$modelStruct$reStruct[[1]]) * lmefit$sigma^2) } newlik <- logLik(lmefit) if (verbose) print(paste("Log likelihood: ", newlik)) ContinueCondition <- (newlik - oldlik > ErrorTolerance & iterations < MaxIterations) oldlik <- newlik AllEffects <- lmefit$residuals[, 1] - lmefit$residuals[, dim(lmefit$residuals)[2]] AdjustedTarget[subs] <- Target[subs] - AllEffects } residuals <- rep(NA, length = length(Target)) residuals[subs] <- Target[subs] - predict(lmefit) attr(residuals, "label") <- NULL result <- list(Tree = tree, EffectModel = lmefit, RandomEffects = ranef(lmefit), BetweenMatrix = as.matrix(lmefit$modelStruct$reStruct[[1]]) * lmefit$sigma^2, ErrorVariance = lmefit$sigma^2, data = data, logLik = newlik, IterationsUsed = iterations, Formula = formula, Random = random, Subset = subs, ErrorTolerance = ErrorTolerance, correlation = correlation, residuals = residuals, method = method, lme.control = lme.control, ctree.control = ctree.control) class(result) <- "REEMctree" 
return(result) } #-------------------------end of main function---------------------------------------------------- #-------------------------------------------------------------------------------------------------- # # Wages of young people longitudinal tree analysis # wages = read.csv("wages.csv") wages.ctree = REEMctree(Logged.wages ~ Exper + GED + H.G.C. + Unemp.rate + Race, random = ~1|ID, data=wages) # # Figure 16.8 # plot(wages.ctree$Tree, type="simple", ip_args=list(pval = FALSE), terminal_panel=node_terminal(wages.ctree$Tree, fill = c("white"), id = FALSE)) dev.off() # # Survival trees for Broadway survival analyses # library(survival) # # CART-based survival tree # set.seed(401) broadway.rpart = rpart(Surv(Post.award.performances, Closed)~Musical+Revival+Att.post.award+Tony.awards+Losing.nominations+NYT.rating.3+DN.rating.3+USA.rating.3, data=bwaypostawards) # # Finding the cost complexity corresponding to the one-SE rule # cventry = which.min(broadway.rpart$cptable[, "xerror"]) xerrorcv = broadway.rpart$cptable[cventry, "xerror"] sexerrorcv = xerrorcv + broadway.rpart$cptable[cventry, "xstd"] cpcvse = broadway.rpart$cptable[which.max(broadway.rpart$cptable[, "xerror"] <= sexerrorcv), "CP"] broadway.rpart.prune = prune(broadway.rpart, cp = cpcvse) # # Conditional inference survival tree # broadway.ctree = ctree(Surv(Post.award.performances, Closed)~Musical+Revival+Att.post.award+Tony.awards+Losing.nominations+NYT.rating.3+DN.rating.3+USA.rating.3, data=bwaypostawards) # # Figure 16.9a # frame() title(main="Pruned CART tree", cex.main=.7) plot(as.party(broadway.rpart.prune), gp=gpar(fontsize = 8), newpage=FALSE) dev.off() # # Figure 16.9b # frame() title(main="Conditional inference tree", cex.main=.7) plot(broadway.ctree, gp=gpar(fontsize=8), newpage=FALSE, inner_panel=node_inner, ip_args=list(pval = FALSE)) dev.off() # # Female heads of government time-varying covariate tree analyses # femalehead.edit = femalehead library(car) femalehead.edit$Continent = recode(femalehead.edit$Continent,"'North America'='N America'; 'South America'='S America'") library(LTRCtrees) set.seed(450) female.head.rpart = LTRCART(ffhead, no.SE=1, data=femalehead.edit) female.head.rpart female.head.ctree = LTRCIT(ffhead, data=femalehead.edit) # # You have to detach the party package for this plotting command to work (it is based on the newer partykit package) # detach(package:party) # # Figure 16.10 # plot(female.head.ctree, gp=gpar(fontsize=6), inner_panel=node_inner, ip_args=list(pval = FALSE)) dev.off()
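#
# The one-SE cost-complexity pruning block is repeated verbatim for each CART tree
# above; as a convenience it could be wrapped in a helper (a sketch with a
# hypothetical name, not part of the book code):
#
prune.one.se = function(tree) {
    cventry = which.min(tree$cptable[, "xerror"])
    sexerrorcv = tree$cptable[cventry, "xerror"] + tree$cptable[cventry, "xstd"]
    cpcvse = tree$cptable[which.max(tree$cptable[, "xerror"] <= sexerrorcv), "CP"]
    prune(tree, cp = cpcvse)
}
# e.g. prune.one.se(price.rpart) reproduces price.rpart.prune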