auto <- read.table("http://azzalini.stat.unipd.it/Book-DM/auto.dat", header = TRUE) summary(auto) attach(auto) # sample size n <- nrow(auto) # create a dummy variable for fuel: diesel = FALSE, gasoline = TRUE fuel1 <- factor(fuel, levels = c("gas", "diesel")) ### example backward elimination ### full <- lm(log(city.distance) ~ log(engine.size) + fuel1 + log(curb.weight) + aspiration + log(wheel.base) + log(length) + log(width) + log(height) + log(compression.ratio) + log(HP)) summary(full) mod1b <- lm(log(city.distance) ~ log(engine.size) + fuel1 + log(curb.weight) + aspiration + log(length) + log(width) + log(height) + log(compression.ratio) + log(HP)) summary(mod1b) mod2b <- lm(log(city.distance) ~ log(engine.size) + fuel1 + log(curb.weight) + aspiration + log(length) + log(height) + log(compression.ratio) + log(HP)) summary(mod2b) mod3b <- lm(log(city.distance) ~ log(engine.size) + fuel1 + log(curb.weight) + aspiration + log(height) + log(compression.ratio) + log(HP)) summary(mod3b) mod4b <- lm(log(city.distance) ~ log(engine.size) + fuel1 + log(curb.weight) + aspiration + log(compression.ratio) + log(HP)) summary(mod4b) ### example forward selection ### null <- lm(log(city.distance) ~ 1) summary(null) add1(null, full, test = 'F') mod1f <- lm(log(city.distance) ~ log(HP)) summary(mod1f) add1(mod1f, full, test = 'F') mod2f <- lm(log(city.distance) ~ log(HP) + log(curb.weight)) summary(mod2f) add1(mod2f, full, test = 'F') mod3f <- lm(log(city.distance) ~ log(HP) + log(curb.weight) + log(compression.ratio)) summary(mod3f) add1(mod3f, full, test = 'F') mod4f <- lm(log(city.distance) ~ log(HP) + log(curb.weight) + log(compression.ratio) + log(engine.size)) summary(mod4f) add1(mod4f, full, test = 'F') mod5f <- lm(log(city.distance) ~ log(HP) + log(curb.weight) + log(compression.ratio) + log(engine.size) + aspiration) summary(mod5f) add1(mod5f, full, test = 'F') mod6f <- lm(log(city.distance) ~ log(HP) + log(curb.weight) + log(compression.ratio) + log(engine.size) + aspiration + fuel1) summary(mod6f) add1(mod6f, full, test = 'F') ### example stepwise selection ### mod1f <- lm(log(city.distance) ~ log(HP)) summary(mod1f) add1(mod1f, full, test = 'F') mod2s <- lm(log(city.distance) ~ log(HP) + log(length)) summary(mod2s) add1(mod2s, full, test = 'F') mod3s <- lm(log(city.distance) ~ log(HP) + log(length) + log(compression.ratio)) summary(mod3s) add1(mod3s, full, test = 'F') mod4s <- lm(log(city.distance) ~ log(HP) + log(length) + log(compression.ratio) + log(curb.weight)) summary(mod4s) mod5s <- lm(log(city.distance) ~ log(HP) + log(compression.ratio) + log(curb.weight)) summary(mod5s) add1(mod5s, full, test = 'F') mod6s <- lm(log(city.distance) ~ log(HP) + log(compression.ratio) + log(curb.weight) + log(engine.size)) summary(mod6s) add1(mod6s, full, test = 'F') mod7s <- lm(log(city.distance) ~ log(HP) + log(compression.ratio) + log(curb.weight) + log(engine.size) + aspiration) summary(mod7s) add1(mod7s, full, test = 'F') mod8s <- lm(log(city.distance) ~ log(HP) + log(compression.ratio) + log(curb.weight) + log(engine.size) + aspiration + fuel1) summary(mod8s) add1(mod8s, full, test = 'F') ### example best-subset ### allCombinations <- sapply(1:11, function(m) combn(x = 1:11, m = m)) transf <- cbind(log(city.distance), log(engine.size), fuel1, log(curb.weight), aspiration, log(wheel.base), log(length), log(width), log(height), log(compression.ratio), log(HP)) null.model <- lm(log(city.distance) ~ 1) result.RSS <- cbind(1, deviance(null.model)) for (i in 1:11) # number of variables in the model { for (j in 1:ncol(allCombinations[[i]])) # all models with that number of variables { model<-lm(log(city.distance) ~ ., data = as.data.frame(transf[, allCombinations[[i]][, j]])) result.RSS <- rbind(result.RSS, cbind(length(allCombinations[[i]][, j]), deviance(model))) } } plot(result.RSS[, 1], result.RSS[, 2], main = 'Plot', xlab = 'Subset size', ylab = 'Residual sum of squares') # using a package # library(leaps) mod.bestSubset <- regsubsets(log(city.distance) ~ log(engine.size) + fuel1 + log(curb.weight) + aspiration + log(wheel.base) + log(length) + log(width) + log(height) + log(compression.ratio) + log(HP), data = auto, nbest = 1, nvmax = 11) summary(mod.bestSubset) # the difference is in the penalty, so given the number of variables, detach(auto)