#=========================================================== # Model selection for GLMs # Illustrated by logistic regression for low birthweight data # (additional exercise 12, with the addition of forward and stepwise) #============================================================ # Read data: data="http://www.uio.no/studier/emner/matnat/math/STK3100/data/lowbirthweight.txt" lowbirthweight=read.table(data,header=T) head(lowbirthweight) # Data description: # The dataset consists of the following 10 variables: #Response vaiable: # low: indicator of birth weight less than 2.5kg #Potential covariates: # age: mother's age in years # lwt: mother's weight in pounds at last menstrual period # race: mothers 'race' ("white", "black", "other") # smoke: smoking status during pregnancy # ht: history of hypertension # ui: presence of uterine irritability # ftv: number of physician visits during the first trimester # ptl: number of previous premature labours #====================== # Backward elimination #====================== # Start with the "full model": fit.full=glm(low~age+lwt+race+smoke+ht+ui+ftv+ptl,family=binomial,data=lowbirthweight) summary(fit.full) # drop one term drop1(fit.full,scope=~age+lwt+race+smoke+ht+ui+ftv+ptl,family=binomial,data=lowbirthweight,test="LRT") # ?? is least significant and is dropped fit.back.1 <- glm(low~age+lwt+race+smoke+ht+ui+ptl,family=binomial,data=lowbirthweight) summary(fit.back.1) # drop one term drop1(fit.back.1,scope=~age+lwt+race+smoke+ht+ui+ptl,family=binomial,data=lowbirthweight,test="LRT") # ?? is least significant and is dropped fit.back.2 <- glm(low~lwt+race+smoke+ht+ui+ptl,family=binomial,data=lowbirthweight) summary(fit.back.2) # drop one term drop1(fit.back.2,scope=~lwt+race+smoke+ht+ui+ptl,family=binomial,data=lowbirthweight,test="LRT") # ?? is least significant and is dropped fit.back.3 <- glm(low~lwt+race+smoke+ht+ptl,family=binomial,data=lowbirthweight) summary(fit.back.3) # drop one term drop1(fit.back.3,scope=~lwt+race+smoke+ht+ptl,family=binomial,data=lowbirthweight,test="LRT") # All remaining covariates are significant, # so model "lwt+race+smoke+ht+ptl" is the one found by backward elimination # Result for this model summary(fit.back.3) # Automatic backward elimination (the procedure uses AIC as a stopping criterion) step(fit.full,scope=~1,direction="backward",family=binomial,data=lowbirthweight,test="LRT") #================== # Forward selection #================== # Start with "null model": fit0=glm(low~1,family=binomial,data=lowbirthweight) summary(fit0) # Include one term: add1(fit0,scope=~age+lwt+race+smoke+ht+ui+ftv+ptl,family=binomial,data=lowbirthweight,test="LRT") # ?? is most significant covariate and is included # Fit model with "ptl" and include one more covariate fit1=glm(low~ptl,family=binomial,data=lowbirthweight) add1(fit1,scope=~age+lwt+race+smoke+ht+ui+ftv+ptl,family=binomial,data=lowbirthweight,test="LRT") # ?? is the most significant new covariate and is included # Fit model with "ptl" and "age" and include one more covariate fit2=glm(low~ptl+age,family=binomial,data=lowbirthweight) add1(fit2,scope=~age+lwt+race+smoke+ht+ui+ftv+ptl,family=binomial,data=lowbirthweight,test="LRT") # ?? is the most significant new covariate and is included # Fit model with "ptl", "age" and "ht" and include one more covariate fit3=glm(low~ptl+age+ht,family=binomial,data=lowbirthweight) add1(fit3,scope=~age+lwt+race+smoke+ht+ui+ftv+ptl,family=binomial,data=lowbirthweight,test="LRT") # ?? is the most significant new covariate and is included # Fit model with "ptl", "age", "ht" and "lwt" and include one more covariate fit4=glm(low~ptl+age+ht+lwt,family=binomial,data=lowbirthweight) add1(fit4,scope=~age+lwt+race+smoke+ht+ui+ftv+ptl,family=binomial,data=lowbirthweight,test="LRT") # No more covariates are significant, so model "ptl+age+ht+lwt" is the one found by forward selection. # Result for this model (which differs from the one found by backward selection) summary(fit4) # Automatic forward selection (the procedure uses AIC as a stopping criterion) step(fit0,scope=~age+lwt+race+smoke+ht+ui+ftv+ptl,direction="forward",family=binomial,data=lowbirthweight,test="LRT") #================================================================ # Stepwise regression (the procedure uses AIC as a stopping criterion) #================================================================ # Starting from model with all main effects step(fit.full,scope=~age+lwt+race+smoke+ht+ui+ftv+ptl,direction="both",family=binomial,data=lowbirthweight,test="LRT")