#================================================== # Normal linear models and gamma GLMs # Illustrated by house price data # (see sections 3.4 and 4.7 in the book by Agresti) #================================================== # Read data from the web data="http://www.stat.ufl.edu/~aa/glm/data/Houses.dat" houses = read.table(data, header = T) head(houses) # First we make some plots of the data: # Make matrix plot of prices, size and taxes: plot(houses[,c("price","size","taxes")]) # Make box plots of prices for number of bedrooms, baths and if new or not par(mfrow=c(1,3)) boxplot(price~beds,data=houses,main="Bedrooms") boxplot(price~baths,data=houses,main="Bathrooms") boxplot(price~new,data=houses,main="Old or new") par(mfrow=c(1,1)) #============================================================ # We first fit linear normal models (section 4.7.1) # (as in section 4.7.1 in the book, we do not consider taxes) #============================================================ # Fit a model with all main effects (considered numeric) fit1=lm(price~size+new+baths+beds, data=houses) summary(fit1) AIC(fit1) # Fit a model with main effects and second order interactions fit2=lm(price~(size+new+baths+beds)^2, data=houses) summary(fit2) AIC(fit2) # Test the effect of the interactions (jointly) anova(fit1,fit2) # Remove non-significant interactions (at 1% level) by backwards elimination: fit3=update(fit2,.~.-baths:beds) summary(fit3) AIC(fit3) fit4=update(fit3,.~.-size:baths) summary(fit4) AIC(fit4) fit5=update(fit4,.~.-new:beds) summary(fit5) AIC(fit5) fit6=update(fit5,.~.-new:baths) summary(fit6) AIC(fit6) # We may now also drop bath form the model # (since it is not in any interaction) fit7=update(fit6,.~.-baths) summary(fit7) AIC(fit7) #===================================== # Check of residuals for model "fit7" #===================================== # Plot of residuals par(mfrow=c(1,2)) plot(fit7,1) plot(fit7,3) par(mfrow=c(1,1)) # Normal probability plot plot(fit7,2) #======================================= # We then fit gamma GLMs (section 4.7.2) #======================================= # Fit gamma GLM corresponding to the model "fit7" above: fitgam1=glm(price~size+new+beds+size:new+size:beds,family=Gamma(link=identity),data=houses) summary(fitgam1) # Remove some non-significant terms: fitgam2=update(fitgam1,.~.-size:beds) summary(fitgam2) fitgam3=update(fitgam2,.~.-beds) summary(fitgam3) #Using the approximate F-test shown in the lecture anova(fitgam3,fitgam1,test="F") anova(fitgam3,fitgam2,test="F") AIC(fitgam3) #The AIC is lower for fit3gam than for the normal linear model with #same expl. var.: AIC(lm(price~size+new+size:new,data=houses)) #, and also the lowest AIC we achieved above with #the normal linear mode (fit5: AIC=1070.565) plot(fitgam3) # Now there are not very influential observations and the variance assumption # is better, though not fitting too well for small predicted values. # (The QQ-norm is really irrelevant since we assume a gamma-distribution)