bonfmult <- function(count, alpha=.05){ # # Routine to construct Bonferroni-based simultaneous # multinomial confidence intervals. Deafult is alpha=.05, or a # 95% confidence level. # plow <- rep(NA,length(count)) phigh <- rep(NA,length(count)) n <- sum(count) goodalpha <- alpha/length(count) for (i in 1:length(count)){ int <- scoreintbinom(count[i],n,alpha=goodalpha) plow[i] <- int$plow phigh[i] <- int$phigh} list(plow=plow, phigh=phigh)} # scoreintpois <- function (x, alpha=.05){ # # Routine to construct score interval for Poisson sample. The variable # x contains the observed Poisson data. Default is alpha=.05, or a 95% confidence level. # Missing values are allowed. # z <- qnorm(1 - alpha/2) lhat <- mean(x,na.rm=T) n <- length(na.omit(x)) center <- lhat + z*z/(2*n) adj <- z*sqrt(lhat + z*z/(4*n))/sqrt(n) plow <- center-adj phigh <- center + adj list(plow=plow, phigh=phigh)} # exactpoisci <- function(x, t, alpha=.05, tol=.Machine$double.eps^.25){ # # Routine to construct the exact Clopper-Pearson confidence interval # of mean rate per time period for a Poisson sample with x events in # an interval of time t. Default is alpha=.05, or a 95% confidence level. # if (x==0) plow <- 0 else plow <- qchisq(alpha/2,2*x)/(2*t) phigh <- qchisq(1-alpha/2,2*x + 2)/(2*t) list(plow=plow,phigh=phigh)} # powdiv <- function(obs, exp, npar, lambda, statonly = F) # # Function to calculate Cressie-Read power divergence goodness-of-fit statistic # # Input parameters # # obs - vector of observed counts # exp - vector of expected counts # npar - number of parameters estimated in null model # lambda - power for statistic # lambda = 1 : Pearson statistic # lambda = 2/3 : Cressie-Read statistic # lambda = 0 : likelihood ratio statistic # lambda = -1/2 : Freeman-Tukey statistic # lambda = -1 : modified likelihood ratio statistic # lambda = -2 : Neyman-modified statistic # statonly - set equal to T if want only statistic output # # Output parameters # # stat - statistic # df - degrees of freedom for statistic # pval - p-value for statistic based on a chi-squared distribution with # degrees of freedom equal to df # # Note: For all statistics except the Pearson statistic, cells with observed counts # equal to zero do not contribute to the test statistic (effectively this # means that 0 * log(0) and 0 * Infinity are taken to be zero). This can lead to # strange values of the statistic (such as negative values) for lambda less # than zero. # { if(lambda == 0) { stat <- 2 * sum(na.omit(obs * log(obs/exp))) } else { if(lambda == -1) { stat <- 2 * sum((exp * log(exp/obs))[ is.finite(exp * log(exp/obs))]) } else { stat <- 2/(lambda * (lambda + 1)) * sum( na.omit(obs * ((obs/exp)^lambda - 1 ))) } } df <- length(obs) - npar - 1 pval <- 1 - pchisq(stat, df) if (statonly) return(stat) else return(list(stat = stat, df = df, pval = pval)) } # partitionX2 <- function(obs, exp, npar){ # # Routine to calculate linear, quadratic, and residual partitions of Pearson # X^2 statistic # # obs - vector of observed counts # exp - vector of expected counts # npar - number of parameters estimated in null model # # Output # pearson - Pearson statistic # p.pearson - p-value for Pearson statistic # linear - linear portion of lack of fit # p.linear - p-value for linear portion # linear.resid - residual from linear portion # p.linear.resid - p-value for residual from linear portion # quadratic - quadratic portion of lack of fit # p.quadratic - p-value for quadratic portion # quadratic.resid - residual from quadratic portion # p.quadratic.resid - p-value for residual from quadratic portion # pearson <- sum(((obs-exp)^2)/exp) n <- sum(obs) prob <- exp/n k <- length(obs) mu <- sum(c(1:k)*prob) mu2 <- sum((c(1:k)-mu)^2*prob) mu3 <- sum((c(1:k)-mu)^3*prob) mu4 <- sum((c(1:k)-mu)^4*prob) a <- 1/sqrt(mu4-mu3*mu3/mu2-mu2*mu2) g1 <- (c(1:k)-mu)/sqrt(mu2) g2 <- a*((c(1:k)-mu)^2-mu3*(c(1:k)-mu)/mu2-mu2) v1 <- sum(obs*g1)/sqrt(n) v2 <- sum(obs*g2)/sqrt(n) p.pearson <- 1-pchisq(pearson,k-1-npar) linear <- v1*v1 p.linear <- 1-pchisq(linear,1) linear.resid <- pearson - linear p.linear.resid <- 1-pchisq(linear.resid,k-2-npar) quadratic <- v2*v2 p.quadratic <- 1-pchisq(quadratic,1) quadratic.resid <- linear.resid - quadratic p.quadratic.resid <- 1-pchisq(quadratic.resid,k-3-npar) list(pearson=pearson,p.pearson=p.pearson,linear=linear,p.linear=p.linear, linear.resid=linear.resid,p.linear.resid=p.linear.resid,quadratic=quadratic, p.quadratic=p.quadratic,quadratic.resid=quadratic.resid, p.quadratic.resid=p.quadratic.resid)} # # Functions used to tabulate exact distribution of X^2 and G^2 # enummulti <- function(k, n) { # # Written by Bill Venables # k = number of categories # n = total count # if(n == 0) return(matrix(0, 1, k)) if(k == 1) return(n) E <- NULL for(i in 0:n) E <- rbind(E, cbind(i, Recall(k-1, n-i))) E} # dmultinom <- function(x,p){ # # Written by Albyn Jones # n <- sum(x) exp(lgamma(n + 1)-sum(lgamma(x + 1)) + sum(x*log(p)))} # exactX2 <- function(k, n, p = rep(1/k,k)) { # # Function to tabulate exact distribution of X^2 and G^2 # Adapted from code by Bill Venables # Y <- enummulti(k, n) E <- matrix(n * p,nrow=dim(Y)[1],ncol=k,byrow=T) u <- rep(1, k) PX2 <- ((Y - E)^2/E) %*% u LR <- 2 * (Y * log(pmax(1, Y)/E)) %*% u Pr <- apply(Y, 1, dmultinom, p=p) o1 <- order(PX2) o2 <- order(LR) structure(list(Pearson = cbind(x = PX2[o1], y = cumsum(Pr[o1])), Lik.Ratio = cbind(x = LR[o2], y = cumsum(Pr[o2])), n = n, k = k), class = "exactX2") } # # Functions for univariate zero-inflated Poisson (ZIP) estimation # scorezip <- function(mu, y){ # # Score equation for ZIP model # n <- length(y) ybar <- mean(y) n0 <- sum(y==0) ybar*(1-exp(-mu))/(1-n0/n)-mu} # zipmle <- function(y){ # # Function to determine MLE's from ZIP model, along with asymptotic # standard errors. Function assumes that probability of coming from # Poisson part is at least .001. # if (scorezip(mean(y), y) < 0) { return(p=1, mu = mean(y), se = c(0, sqrt(mean(y)/length(y))))} else { mu <- uniroot(scorezip, lower=mean(y), upper=1000*mean(y), y=y)$root p <- mean(y)/mu con <- 1 - p + p*exp(-mu) n0 <- sum(y==0) i11 <- n0*(exp(-mu)-1)^2/con i12 <- n0*exp(-mu)/con i21 <- i12 i22 <- length(y)*mean(y)/(mu*mu) - n0*p*(1-p)*exp(-mu)/con var <- solve(matrix(c(i11,i12,i21,i22),nrow=2)) list(p=p, mu=mu, se = sqrt(c(var[1,1],var[2,2])))}} # dnegbinom <- function(x, mu, nu){ # # Negative binomial probability function # p <- nu/(nu + mu) exp(lgamma(x + nu)-lgamma(x + 1)-lgamma(nu) + nu*log(p) + x*log(1-p))} # binomgof <- function(x,n,p = sum(x)/sum(n), est.p = T){ # Pearson and LR goodness-of-fit tests for a sample of binomial data # Default null probability is the observed sample proportion, but # a fixed value can be input as p, and by setting est.p to F. # pearson <- sum(((x - n * p)^2)/(n * p * (1 - p))) lr <- 2 * (sum(x * log(x/(n * p)), na.rm = T) + sum((n - x) * (log( (n - x)/(n * (1 - p)))), na.rm = T)) if(est.p) df <- length(x) - 1 else df <- length(x) p.pearson <- 1 - pchisq(pearson, df) p.lr <- 1 - pchisq(lr, df) list(pearson = pearson, p.pearson = p.pearson, lr = lr, p.lr = p.lr, df = df)} # bbdgof <- function(x,n,alpha,beta,est.ab=T){ # # Pearson goodness-of-fit test for a sample of beta-binomial data # Values of alpha and beta are given, and are assumed to be MLEs; # if they are fixed values, set est.ab to F. # p <- alpha/(alpha + beta) theta <- 1/(alpha + beta + 1) pearson <- sum(((x-n*p)^2)/(n*p*(1-p)*(1 + (n-1)*theta))) if (est.ab) df <- length(x) - 2 else df <- length(x) p.pearson <- 1-pchisq(pearson,df) list(pearson=pearson, p.pearson=p.pearson, df=df)} # bbdmle1 <- function(succ,trial,start){ # # S-PLUS routine to get MLE from a sample from the beta-binomial distribution # Note that this functions does NOT work in R # temp <- nlminb(start=start,obj=bbdloglike,succ=succ,trial=trial) list(alpha = temp$parameters[1], beta = temp$parameters[2], nlminb.obj = temp)} # bbdmle2 <- function(succ,trial,start){ # # S-PLUS/R routine to get MLE from a sample from the beta-binomial distribution # This function requires than the MASS library be loaded first in S-PLUS # temp <- optim(start, bbdloglike, hessian = T, succ=succ, trial=trial) list(alpha = temp$par[1], beta = temp$par[2], hessian = temp)} # bbdloglike <- function(ab,succ,trial){ # # Negative log-likelihood for beta-binomial distribution # -sum(log(choose(trial,succ)) + lgamma(ab[1] + succ) + lgamma(ab[2] + trial-succ) + lgamma(ab[1] + ab[2]) -lgamma(ab[1] + ab[2] + trial)-lgamma(ab[1])-lgamma(ab[2]))} # probbbd <- function(x,n,alpha,beta){ # # Probability function for beta-binomial distribution # exp(log(choose(n,x)) + lgamma(alpha + x) + lgamma(beta + n-x) + lgamma(alpha + beta)-lgamma(alpha + beta + n)- lgamma(alpha)-lgamma(beta))} # truncpoismle <- function(x){ # # Routine to get MLE from a sample from the Poisson r.v. truncated at zero # low <- mean(x)/10 while (truncpoisscore(low,x) > 0) low <- low/10 mu <- uniroot(truncpoisscore, lower=low, upper=mean(x), x=x)$root list(mu=mu)} # truncpoisscore <- function(mu,x){ length(x)*mu-sum(x)*(1-exp(-mu))} # tableform <- function(data, low=min(data),high=max(data)){ # # Function to convert integer data vector into one-dimensional table of counts # outtable <- rep(0,high-low + 1) nonneg <- as.numeric(unlist(dimnames(table(data)))) index <- nonneg - low + 1 if (any(index < 1) | any(index > length(outtable))) stop("Data values fall outside constructed table") for (i in 1:length(nonneg)) {outtable[index[i]] <- table(data)[i]} list(table=outtable,low=low,high=high)} hellinger <- function(nonparest, parest){ # # Function to give Hellinger distance between nonparametric estimate # and parametric estimate for categorical data # sqrt(sum((sqrt(nonparest)-sqrt(parest))^2))} penhellinger <- function(nonparest, parest){ # # Function to give penalized Hellinger distance between nonparametric estimate # and parametric estimate for categorical data; see exercise 4.45 # sqrt(sum((sqrt(nonparest)-sqrt(parest))^2)-sum(parest[nonparest==0])/2)} # poishellest <- function(mu,nonparest,low,high,minfunc){ parest <- dpois(c(low:high),mu) minfunc(nonparest,parest)} # glmdiag <- function(glm.obj){ # # Function to get diagnostics for a generalized linear model # NOTE: the glm must have been called using the "x=T" option # # Input : glm object # Output: scaled.dev.resid - scaled deviance residuals # leverage - leverage values # cooksd - Cook's distance # X <- glm.obj$x W <- diag(glm.obj$weights) h <- diag(sqrt(W)%*%X%*%solve(t(X)%*%W%*%X)%*%t(X)%*%sqrt(W)) scaled.dev.resid <- resid(glm.obj,type="deviance")/sqrt(1-h) cook <- h*(scaled.dev.resid^2)/(sum(h)*(1-h)) list(scaled.dev.resid = scaled.dev.resid, leverage = h, cooksd = cook)} # pois.crit.loess <- function(h, x, y, degree = 2){ # # S-PLUS function ONLY - R's version of gam does not work # # AICC criterion function for loess Poisson one-predictor gam estimator. y contains observed # counts, x is the predictor, and h is the span (roughly speaking, # the proportion of bins covered by the kernel). # # The AICC loess likelihood estimator is determined by minimizing this # function over h. In my experience, a grid search is necessary to find the # optimal value (the function nlminb doesn't seem to work). The output list # consists of the associated AICC value ($aicc) and the gam object # probabilities ($gam). # # Note that this function uses the convention that the number of observations is # NOT the sum of the individual Poisson values, but rather the number of them. # This is consistent with viewing the problem as a regression smoothing problem, rather # than a contingency table model building one. # k <- length(y) g <- gam(y ~ lo(x, degree = degree, h), family = "poisson") df <- g$nl.df + 2 gfit <- predict(g, type = "response") if(df >= k - 2) list(aicc = Inf, gam = g) else list(aicc = log(g$deviance) + (1 + df/k)/(1 - df/k - 2/k), gam = g)} # cat.crit.loess <- function(h, y, degree = 2){ # # S-PLUS program ONLY - R's version of gam does not work # # AICC criterion function for loess likelihood estimator. y contains observed # relative frequencies or counts, while h is the span (roughly speaking, # the proportion of bins covered by the kernel). See Simonoff # (1998, Int. Statist. Rev.) for discussion. # # Note that this function uses the convention that the number of observations is # NOT the sum of the individual counts, but rather the number of cells. # This is consistent with viewing the problem as a regression smoothing problem, rather # than a contingency table model building one. # # The AICC loess likelihood estimator is determined by minimizing this # function over h. In my experience, a grid search is necessary to find the # optimal value (the function nlminb doesn't seem to work). The output list # consists of the associated AICC value ($aicc) and the estimated cell # probabilities ($prob). # k <- length(y) x <- c(1:k) g <- gam(y ~ lo(x, degree = degree, h), family = "poisson") df <- g$nl.df + 2 gfit <- predict(g, type = "response") if(df >= k - 2) list(aicc = Inf, prob = gfit/sum(gfit)) else list(aicc = log(g$deviance) + (1 + df/k)/(1 - df/k - 2/k), prob = gfit/sum(gfit))} # crit.aicc.lm <- function(lm.obj){ # # Function to calculate the AICC value for a linear model object lm.obj. For discussion see # Hurvich and Tsai (1989, Biometrika). This function also works for glm objects, and gam # objects that do not contain any smoothing-based terms. # n <- lm.obj$rank + lm.obj$df.residual p <- lm.obj$rank log(deviance(lm.obj)/n) + (1 + p/n)/(1 - p/n - 2/n) } crit.aicc.gam <- function(gam.obj,alpha=1,estalpha=F){ # # S-PLUS program ONLY - R's version of gam does not work # # # Function to calculate the AICC value for a gam object gam.obj. The gam object # must include at least one smoothing-based term (otherwise, use crit.aicc.lm). # For discussion see Simonoff and Tsai (1999, JCGS). # # This function allows for a dispersion parameter alpha, in case a quasi-likelihood # model is being fit, and QAICC is desired (alpha is the multiplicative factor of # the variance, typically estimated by X^2/df). # # Note that this function uses the convention that the number of observations is # NOT the sum of the individual counts, but rather the number of cells. # This is consistent with viewing the problem as a regression smoothing problem, rather # than a contingency table model building one. # n <- length(gam.obj$y) if (gam.obj$df.residual < 0) stop("Model overparameterized") else p <- n - gam.obj$df.residual if(estalpha) p <- p + 1 as.numeric(log(deviance(gam.obj)/n)/alpha + (1 + p/n)/(1 - p/n - 2/n))} # powdivind <- function(obs, lambda = 2/3, statonly = F) { # # Function to calculate Cressie-Read power divergence goodness-of-fit statistic for independence in # two-dimensional table # # Input parameters # # obs - two-dimensional matrix of observed counts # lambda - power for statistic # lambda = 1 : Pearson statistic # lambda = 2/3 : Cressie-Read statistic # lambda = 0 : likelihood ratio statistic # lambda = -1/2 : Freeman-Tukey statistic # lambda = -2 : Neyman-modified statistic # statonly - set equal to T if want only statistic output # # Output parameters # # stat - statistic # df - degrees of freedom for statistic # pval - p-value for statistic based on a chi-squared distribution with # degrees of freedom equal to df # exp - expected counts under independence # resid - matrix of residuals derived from test statistic # # Note: For all statistics except the Pearson statistic, cells with observed counts # equal to zero do not contribute to the test statistic (effectively this # means that 0 * log(0) and 0 * Infinity are taken to be zero). This can lead to # strange values of the statistic (such as negative values) for lambda less # than zero. # rowmarg <- apply(obs, 1, sum) colmarg <- apply(obs, 2, sum) n <- sum(obs) exp <- outer(rowmarg, colmarg)/n if(lambda == 0) { resid <- sqrt(2 * (obs * log(ifelse((obs > 0 & exp > 0), obs/exp, 1)) + exp - obs)) * sign(obs - exp) } else { resid <- sqrt(2/(lambda * (lambda + 1)) * (obs * ((obs/exp)^lambda - 1) + lambda * (exp - obs))) * sign(obs - exp) } stat <- sum(resid^2) df <- cumprod(dim(obs) - c(1, 1))[2] pval <- 1 - pchisq(stat, df) if(statonly) return(stat) else return(list(stat = stat, df = df, pval = pval, exp = exp, resid = resid))} # mosaic <- function(X, main=NA, sort=NA, off=NA, dir=NA, color=FALSE) { ###################################################################### # SPLUS Mosaic Graphical Procedure # # Jay Emerson, Yale University, February 1998 # ###################################################################### # See "Mosaic Displays in S-PLUS: A General Implementation and a # # Case Study," by J.W. Emerson, _Statistical Computing and # # Statistical Graphics Newsletter_, 9(1), 17-23. # ###################################################################### # Plot Mosaic # # # # DESCRIPTION: # # Plots a mosaic on the current graphics device. # # # # USAGE: # # mosaic(X, main=NA, sort=NA, off=NA, dir=NA, color=F) # # # # REQUIRED ARGUMENTS: # # X: a contingency table, with optional category labels # # specified in the dimnames(X) attribute. The table is best # # created by the table() command, which produces an object # # of type array. # # # # OPTIONAL ARGUMENTS: # # main: character string for the mosaic title. # # sort: vector ordering of the variables, containing a permutation # # of the integers 1:length(dim(X)) (the default). # # off: vector of offsets to determine percentage spacing at each # # level of the mosaic (appropriate values are between 0 and # # 20, and the default is 10 at each level). There should be # # one offset for each dimension of the contingency table. # # dir: vector of split directions ("v"=vertical and # # "h"=horizontal) for each level of the mosaic, one # # direction for each dimension of the contingency table. # # The default consists of alternating directions, beginning # # with a vertical split. # # color: (TRUE or vector of integer colors) for color shading or # # (FALSE, the default) for empty boxes with no shading. # ###################################################################### ###################################################################### # NOTES: # # 1. Use of the par(fin) environment variable can be helpful # # when the desired mosaic is not square. # # 2. When using the student version of S-PLUS for Windows, # # limitations on the version may prevent drawing # # high-dimensional mosaics. # # 3. This file should be run (or source("mosaic.code")) with # # each new .Data directory. # ###################################################################### frame() par(usr=c(1,1000,1,1000)) if (is.vector(X)) { X <- array(X) } dimd <- length(dim(X)) if (!is.null(dimnames(X))) { label <- dimnames(X) } else { label <- NA } if (dimd>1) { Ind <- rep(1:(dim(X)[1]), prod(dim(X)[2:dimd])) for (i in 2:dimd) { Ind <- cbind(Ind, c(matrix(1:(dim(X)[i]), byrow=T, prod(dim(X)[1:(i-1)]), prod(dim(X)[i:dimd])))) } } else { Ind <- 1:(dim(X)[1]) } Ind <- cbind(Ind, c(X)) if (!is.na(main)) { title(main) } # Make the title. if ((is.na(off[1]))||(length(off)!=dimd)) { # Initialize spacing. off <- rep(10,50)[1:dimd] } if (is.na(dir[1])||(length(dir)!=dimd)) { # Initialize directions. dir <- rep(c("v","h"),50)[1:dimd] } if ((!is.na(sort[1]))&&(length(sort)==dimd)) { # Sort columns. Ind <- Ind[,c(sort,dimd+1)] off <- off[sort] dir <- dir[sort] label <- label[sort] } ncolors <- length(tabulate(Ind[,dimd])) if (is.na(color[1])) { color <- rep(0, ncolors) } else { if (length(color) != ncolors) { if (!color[1]) { color <- rep(0, ncolors) } else { color <- 2:(ncolors+1) } } } mosaic.cell(Ind, 50, 5, 950, 950, off/100, dir, color, 2, 2, apply(as.matrix(Ind[,1:dimd]), 2, max), 1, label) } mosaic.cell <- function(X, x1, y1, x2, y2, off, dir, color, lablevx, lablevy, maxdim, currlev, label) { if (dir[1] == "v") { # split here on the X-axis. xdim <- maxdim[1] XP <- rep(0, xdim) for (i in 1:xdim) { XP[i] <- sum(X[X[,1]==i,ncol(X)]) / sum(X[,ncol(X)]) } white <- off[1] * (x2 - x1) / (max(1, xdim-1)) x.l <- x1 x.r <- x1 + (1 - off[1]) * XP[1] * (x2 - x1) if (xdim > 1) { for (i in 2:xdim) { x.l <- c(x.l, x.r[i-1] + white) x.r <- c(x.r, x.r[i-1] + white + (1 - off[1]) * XP[i] * (x2 - x1)) } } if (lablevx > 0) { if (is.na(label[[1]][1])) { this.lab <- paste(rep(as.character(currlev), length(currlev)), as.character(1:xdim), sep=".") } else { this.lab <- label[[1]] } text(x=(x.l + (x.r - x.l) / 2), y=(965 + 22 * (lablevx - 1)), srt=0,adj=.5, cex=.5, this.lab) } if (ncol(X) > 2) { # recursive call. for (i in 1:xdim) { if (XP[i] > 0) { mosaic.cell(as.matrix(X[X[,1]==i,2:ncol(X)]), x.l[i], y1, x.r[i], y2, off[2:length(off)], dir[2:length(dir)], color, lablevx-1, (i==1)*lablevy, maxdim[2:length(maxdim)], currlev+1, label[2:ncol(X)]) } else { segments(rep(x.l[i],3), y1+(y2-y1)*c(0,2,4)/5, rep(x.l[i],3), y1+(y2-y1)*c(1,3,5)/5) } } } else { for (i in 1:xdim) { if (XP[i] > 0) { polygon(c(x.l[i], x.r[i], x.r[i], x.l[i]), c(y1, y1, y2, y2), col=color[i]) segments(c(rep(x.l[i],3),x.r[i]), c(y1,y1,y2,y2), c(x.r[i],x.l[i],x.r[i],x.r[i]), c(y1,y2,y2,y1)) } else { segments(rep(x.l[i],3), y1+(y2-y1)*c(0,2,4)/5, rep(x.l[i],3), y1+(y2-y1)*c(1,3,5)/5) } } } } else { # split here on the Y-axis. ydim <- maxdim[1] YP <- rep(0, ydim) for (j in 1:ydim) { YP[j] <- sum(X[X[,1]==j,ncol(X)]) / sum(X[,ncol(X)]) } white <- off[1] * (y2 - y1) / (max(1, ydim - 1)) y.b <- y2 - (1 - off[1]) * YP[1] * (y2 - y1) y.t <- y2 if (ydim > 1) { for (j in 2:ydim) { y.b <- c(y.b, y.b[j-1] - white - (1 - off[1]) * YP[j] * (y2 - y1)) y.t <- c(y.t, y.b[j-1] - white) } } if (lablevy > 0) { if (is.na(label[[1]][1])) { this.lab <- paste(rep(as.character(currlev), length(currlev)), as.character(1:ydim), sep=".") } else { this.lab <- label[[1]] } text(x=(35 - 20 * (lablevy - 1)), y=(y.b + (y.t - y.b) / 2), srt=90, adj=.5, cex=.5, this.lab) } if (ncol(X) > 2) { # recursive call. for (j in 1:ydim) { if (YP[j] > 0) { mosaic.cell(as.matrix(X[X[,1]==j,2:ncol(X)]), x1, y.b[j], x2, y.t[j], off[2:length(off)], dir[2:length(dir)], color, (j==1)*lablevx, lablevy-1, maxdim[2:length(maxdim)], currlev+1, label[2:ncol(X)]) } else { segments(x1+(x2-x1)*c(0,2,4)/5, rep(y.b[j],3), x1+(x2-x1)*c(1,3,5)/5, rep(y.b[j],3)) } } } else{ # final split polygon and segments. for (j in 1:ydim) { if (YP[j] > 0) { polygon(c(x1,x2,x2,x1), c(y.b[j],y.b[j],y.t[j],y.t[j]), col=color[j]) segments(c(x1,x1,x1,x2), c(y.b[j],y.b[j],y.t[j],y.t[j]), c(x2,x1,x2,x2), c(y.b[j],y.t[j],y.t[j],y.b[j])) } else { segments(x1+(x2-x1)*c(0,2,4)/5, rep(y.b[j],3), x1+(x2-x1)*c(1,3,5)/5, rep(y.b[j],3)) } } } } } # end(function mosaic.cell) # table2case <- function(intable, rowname="Rows", colname="Cols"){ # # Convert two-way contingency table in table to a case data form data frame # # intable - input contingency table as matrix # # outdata - output data frame of table in glm format # rows <- rep(c(1:nrow(intable)),ncol(intable)) cols <- rep(c(1:ncol(intable)),each=nrow(intable)) count <- as.vector(intable) outdata <- data.frame(cbind(rows,cols,count)) names(outdata) <- c(rowname,colname,"Count") outdata} # factorial <- function(n){ # # S-PLUS factorial function, which is not in R # result <- gamma(n + 1) wasInteger <- !(n %% 1) & !is.na(n) result[wasInteger] <- round(result[wasInteger]) result } # makediags <- function(casedata, rowname = "Rows", colname = "Cols"){ # # Function to create a factor identifying diagonal cells of a square # contingency table that is in case data form # numrow <- max(casedata[rowname]) diagfactor <- rep(0, length(casedata[rowname])) for(i in 1:numrow) diagfactor <- diagfactor + as.numeric(casedata[rowname] == i & casedata[colname] == i) * i diagfactor <- as.factor(diagfactor) contrasts(diagfactor) <- contr.treatment(numrow + 1) diagfactor} # makeoffdiags <- function(casedata, rowname = "Rows", colname = "Cols"){ # # Function to create a factor identifying symmetric off-diagonal cells of a square # contingency table that is in case data form. Factor is coded based on indicator variables. # numrow <- max(casedata[rowname]) offdiagfactor <- rep(0, length(casedata[rowname])) ind <- 1 for(i in 1:(numrow - 1)) { for(j in (i + 1):numrow) { offdiagfactor <- offdiagfactor + (as.numeric(casedata[rowname] == i & casedata[colname] == j) + as.numeric(casedata[rowname] == j & casedata[colname] == i)) * ind ind <- ind + 1}} offdiagfactor <- as.factor(offdiagfactor) contrasts(offdiagfactor) <- contr.treatment(ind) offdiagfactor} # depend.prop.ci <- function(X, succ.index=1, alpha=.05){ # # Function to produce confidence interval for the difference between dependent proportions. # Data are input as a 2 x 2 matrix; by default, the success category corresponds to the # first row and column # # Default level is 95% confidence # i <- succ.index z <- qnorm(1 - alpha/2) n <- sum(X) propmatrix <- X/n v <- sqrt((sum(propmatrix[i,])*(1 - sum(propmatrix[i,])) + sum(propmatrix[,i])*(1 - sum(propmatrix[,i])) - 2.*(propmatrix[1,1]*propmatrix[2,2] - propmatrix[1,2]*propmatrix[2,1]))/n) est <- sum(propmatrix[i,]) - sum(propmatrix[,i]) low <- est - z*v high <- est + z*v list(est=est, low=low, high=high)} # kappa.counts<-function(counts,wt=diag(nrow(counts)),print=T){ # # Function to calculate kappa values # # Original written by James L. Pratt # Changed to take weight matrix as input, with identity (unweighted) as default # ##counts is a k-by-k contingency table n.tot<-sum(counts) k<-dim(counts)[1] P<-counts/n.tot #k-by-k matrix of proportions Pij Pi<-rowSums(P) #k-by-1 vector of marginal proportions Pi. Pj<-colSums(P) #k-by-1 vector of marginal proportions P.j #observed proportion of agreement (weighted if needed) po<-sum(P*wt) # * is element-wise multiplication, %*% is linear algebra multiplication # t() is transpose function # expected proportion of agreement (weighted if needed) pe<-sum((Pj%*%t(Pi))*wt) kappa<-(po-pe)/(1-pe) #outer(x,y," + ") creates outer-sum of two matrices x and y, similar to a #cross-product, adding elemtents rather than multiplying them # part 1 of numerator for variance estimate var1<-sum(P*(wt-matrix(outer(Pj%*%t(wt),Pi%*%wt, FUN="+"),ncol=k,byrow=F)*(1-kappa))^2) # part 2 of numerator for variance estimate var2<-(kappa-pe*(1-kappa))^2 #denominator for variance estiamte den<-(n.tot*(1-pe)^2) #asymptotic se ase<-sqrt((var1-var2)/den) # part 1 of numerator for variance estimate under null hyp var1.0<-sum((Pj%*%t(Pi))*(wt-matrix(outer(Pj%*%t(wt),Pi%*%wt, FUN="+"),ncol=k,byrow=T))^2) #asymptotic se under null hyp ase.0<-sqrt((var1.0-pe^2)/den) if (print) print(counts) cbind(po,pe,kappa,var1,var2,den,ase,z.value=kappa/ase, p.value=2*(1-pnorm(abs(kappa/ase))),ase.0,z.value.0=kappa/ase.0, p.value.0=2*(1-pnorm(abs(kappa/ase.0)))) } # OR.MH <- function(x,alpha=.05){ # # Function to produce Mantel-Haenszel common odds ratio estimate and asymptotic CI # Function based on Thompson (1999) # # These are given by the mantelhaen.test function in R # # Input : x - 2 x 2 x K array of counts # # Output: theta - MH estimate # ci - 100 x (1-alpha)% confidence interval for theta # n11k <- x[1,1,] n22k <- x[2,2,] n12k <- x[1,2,] n21k <- x[2,1,] nk <- n11k + n22k + n12k + n21k theta <- sum(n11k*n22k/nk)/sum(n12k*n21k/nk) varlogtheta <- sum((n11k + n22k)*n11k*n22k/(nk^2))/(2*(sum(n11k*n22k/nk))^2) + sum(((n11k + n22k)*n12k*n21k + (n12k + n21k)*n11k*n22k)/(nk^2))/(2*sum(n11k*n22k/nk)*sum(n12k*n21k/nk)) + sum((n12k + n21k)*n12k*n21k/(nk^2))/(2*(sum(n12k*n21k/nk))^2) zalpha <- qnorm(1-alpha/2) ci <- c(exp(log(theta)-zalpha*sqrt(varlogtheta)),exp(log(theta) + zalpha*sqrt(varlogtheta))) list(theta=theta,ci=ci,alpha=alpha) } # OR.BD <- function(x,theta){ # # Function to produce Breslow-Day statistic and adjusted Tarone statistic # # Input : x - 2 x 2 x K array of counts # theta - Mantel-Haenszel estimate of common odds ratio # # Output: bd - Breslow-Day statistic # pbd - p-value for Breslow-Day statistic # tarone - adjusted (Tarone) Breslow-Day statistic # ptarone - p-value for Tarone statistic # df - degrees of freedom of test (K - 1 - # tables with zero row or column sum) # n11k <- x[1,1,] n22k <- x[2,2,] n12k <- x[1,2,] n21k <- x[2,1,] n1dk <- n11k + n12k n2dk <- n21k + n22k nd1k <- n11k + n21k K <- dim(x)[3] ek <- rep(NA,K) aa <- 1-theta for (i in 1:K){ bb <- n2dk[i] + theta*n1dk[i] + (theta-1)*nd1k[i] cc <- -theta*n1dk[i]*nd1k[i] ek[i] <- (-bb + sqrt(bb^2-4*aa*cc))/(2*aa)} vk <- 1/(1/ek + 1/(n1dk-ek) + 1/(nd1k-ek) + 1/(n2dk-nd1k + ek)) df <- K - 1 - sum(vk==0) bd <- sum(((n11k-ek)^2)/vk,na.rm=T) pbd <- 1-pchisq(bd,df) tarone <- bd - (sum(n11k-ek)^2)/sum(vk) ptarone <- 1-pchisq(tarone,df) list(bd=bd,pbd=pbd,tarone=tarone,ptarone=ptarone,df=df) } # multtable2case <- function(intable,vnames=NULL){ # # Function to convert multidimensional contingency table in array form to case form # # Input : intable - input array # vnames - character vector of names of indices for array in order of fastest change # (optional for S-PLUS; if omitted, names are X1.1, X1.2, etc.; required # for R) # # Output : data matrix of case data form of table. The last column in the table is the count # variable, and is named "Counts" # d <- dim(intable) n <- prod(d) p <- length(d) dd <- c(1,d) res <- data.frame(matrix(NA,n,p + 1)) for (j in 1:(p-1)){ res[,j] <- rep(c(1:d[j]),cumprod(rev(dd))[p-j],each=cumprod(dd)[j])} res[,p] <- rep(c(1:d[p]),each=cumprod(dd)[p]) res[,p + 1] <- as.vector(intable) names(res)[1:p] <- vnames names(res)[p + 1] <- "Count" res} # dgumbelmin <- function(x,location=0, scale=1){ # # Gumbel (extreme value) minimum density function # y <- (x - location)/scale exp(y-exp(y))/scale} # dgumbelmax <- function(x,location=0, scale=1){ # # Gumbel (extreme value) maximum density function # y <- (x - location)/scale exp(-y-exp(-y))/scale} # pgumbelmin <- function(x,location=0, scale=1){ # # Gumbel (extreme value) minimum cumulative probability function # y <- (x - location)/scale 1-exp(-exp(y))} # pgumbelmax <- function(x,location=0, scale=1){ # # Gumbel (extreme value) maximum cumulative probability function # y <- (x - location)/scale exp(-exp(-y))} #