#===================================================================== # Illustration of overdispersion and GLMs for overdispersed count data # Example: Horseshoe crabs #===================================================================== # Read crab data from the web data="http://www.stat.ufl.edu/~aa/glm/data/Crabs.dat" Crabs=read.table(data,header=T) head(Crabs) #=============================================================== # Make plots and summaries of data and check for oversdispersion #=============================================================== # Histogram of the number of satellites: hist(Crabs$y,breaks=(0:16)-0.5,xlab="Number of satellites") # The number of satellites depends on weight and color: plot(Crabs$weight,Crabs$y,xlab="Weight",ylab="Number of satellites") boxplot(y~color,data=Crabs,ylab="Number of satellites", names=c("medium light","medium","medium dark","dark")) # Summaries mean(Crabs$y) var(Crabs$y) var(Crabs$y)/mean(Crabs$y) # The variance is much larger than the mean (overdispersion) # There is also overdispersion within groups of colours and weight: for (i in 1:4){ ind=(Crabs$weight<2.5)&(Crabs$color==i) cat("color= ",i," Weight < 2.5kg"," #crabs =",sum(ind), " var/mean=",var(Crabs$y[ind])/mean(Crabs$y[ind]),"\n") ind=(Crabs$weight>=2.5)&(Crabs$color==i) cat("color =",i," Weight >= 2.5kg"," #crabs =",sum(ind), " var/mean=",var(Crabs$y[ind])/mean(Crabs$y[ind]),"\n") } #=================================== # Poisson and negative binomial GLMs #=================================== # Fit Poisson model fit.pois=glm(y~weight+factor(color),family=poisson,data=Crabs) summary(fit.pois) # Fit negative binomial model library(MASS) fit.negbin=glm.nb(y~weight+factor(color),data=Crabs) summary(fit.negbin) # Here it is clear that the negative binomial distribution gives a better fit. # For a formal test, we need to take into account that gamma=1/theta=1/k equal # to zero is on the boundary of the parameter space (cf. section 7.3.4). # This is done by dividing "the usual P-value" by 2. DifflogLik=-as.numeric(2*(logLik(fit.pois)-logLik(fit.negbin))) DifflogLik (1-pchisq(DifflogLik,1))/2 #=================== # Zero inflated GLMs #=================== # Fit zero-inflated poisson/logistic model library(pscl) fit.zeroinf1=zeroinfl(y~weight+factor(color)|weight+factor(color),data=Crabs) summary(fit.zeroinf1) AIC(fit.zeroinf1) #Compare to AIC for the negative-bbinomial: AIC(fit.negbin) # Fit zero-inflated negative binomial/logistic model fit.zeroinf2=zeroinfl(y~weight+factor(color)|weight+factor(color),dist="negbin",data=Crabs) summary(fit.zeroinf2) AIC(fit.zeroinf2) #Test of gamma=0 vs gamma>0 (i.e. zi-Poisson or zi-NegBin) DifflogLik.zi=-as.numeric(2*(logLik(fit.zeroinf1)-logLik(fit.zeroinf2))) DifflogLik.zi (1-pchisq(DifflogLik.zi,1))/2 #================================================ # Quasi-likelihood methods and variance inflation #================================================ # Fit Poisson model fit.pois=glm(y~weight+factor(color),family=poisson,data=Crabs) summary(fit.pois) # Compute overdispersion parameter phi: n=dim(Crabs)[1] p=length(fit.pois$coefficients) X2=sum(residuals(fit.pois,type="pearson")^2) phi.hat=X2/(n-p) phi.hat # For the quasi likelihood approach, the standard errors from # the Poisson models should be multiplied by sqrt(phi) # The computations may be done directly using the quasi-family: fit.quasipois=glm(y~weight+factor(color), family=quasi(link="log",variance="mu"),data=Crabs) summary(fit.quasipois)