## STK2100. Spring 2024.
## Exercise set 9

## ISLR 8.7 (Random forest, Boston data) -----
## See section 8.3.3 for code examples

library(ISLR2)
library(randomForest)
help("randomForest")  # documentation

# load data
attach(Boston)
head(Boston)

# divide into train and test set
set.seed(1)
train <- sample(1:nrow(Boston), nrow(Boston) / 2)
boston.test <- Boston[-train, "medv"]

# fit random forest models over a grid of mtry and ntree values
# (code adapted from ISLR 8.3.3)
set.seed(1)
p <- ncol(Boston) - 1
par <- list(mtry = round(c(sqrt(p), p / 3, p / 2, p)),
            ntree = seq(1, 501, by = 25))
err <- array(dim = lengths(par), dimnames = par)
for (i in seq_along(par$mtry)) {
  for (j in seq_along(par$ntree)) {
    m <- par$mtry[i]
    n <- par$ntree[j]
    cat("\nmtry:", m, "ntree:", n)
    # fit model
    rf.boston <- randomForest(medv ~ ., data = Boston, subset = train,
                              mtry = m, ntree = n)
    # predict test-data outcomes
    yhat.rf <- predict(rf.boston, newdata = Boston[-train, ])
    # compute test error
    err[i, j] <- mean((yhat.rf - boston.test)^2)
  }
}
matplot(par$ntree, t(err), type = "l",
        xlab = "Number of trees", ylab = "Test error")
legend("topright", legend = paste0("m = ", par$mtry),
       lty = seq_along(par$mtry), col = seq_along(par$mtry))
# Adding trees reduces the test error up to a certain number of trees,
# after which the test error flattens out. We obtain a lower test error
# when we restrict the number of candidate variables per split: here the
# lowest test error is typically attained with m = round(sqrt(p)) = 3,
# followed by the default regression rule m = p/3.

## ISLR 8.8 (Regression trees, Carseats data) -----
## Solutions taken from the spring 2019 exercise set 10.
## See section 8.3.1 for more code examples

rm(list = ls())
err <- list()  # for collecting test errors of the different models

## a)
library(ISLR)
attach(Carseats)
set.seed(1234)
idx_train <- sample(dim(Carseats)[1], 200)
train <- Carseats[idx_train, ]
test <- Carseats[-idx_train, ]

## b)
library(tree)
?tree
?tree.control  # use control = tree.control() inside the tree() call

# fit regression tree
fit <- tree(Sales ~ ., data = train)
summary(fit)
plot(fit)
text(fit)

# compute test error
preds <- predict(fit, test)
err$tree <- mean((preds - test$Sales)^2)

## c)
?cv.tree
?prune.tree

# run cross-validation over a sequence of subtrees
cvfit <- cv.tree(fit, FUN = prune.tree)
plot(cvfit$size, cvfit$dev,
     xlab = "Number of terminal nodes (size)", ylab = "Deviance")
plot(cvfit$k, cvfit$dev,
     xlab = "Cost-complexity parameter, k", ylab = "Deviance")

# fit the pruned tree with the lowest CV error
idx_best <- which.min(cvfit$dev)
idx_best
best_size <- cvfit$size[idx_best]
best_size
fit_best <- prune.tree(fit, best = best_size)
preds <- predict(fit_best, test)
err$tree_cv <- mean((preds - test$Sales)^2)
err
# No, here pruning does not improve the test error.

## d) bagging
# fit model - use m = p for bagging
fit <- randomForest(Sales ~ ., data = train,
                    mtry = ncol(Carseats) - 1, importance = TRUE)

# compute test error
preds <- predict(fit, newdata = test)
err$bag <- mean((preds - test$Sales)^2)

# variable importance
importance(fit)
varImpPlot(fit)

## e) random forest
# fit model - use the default m = p/3
fit <- randomForest(Sales ~ ., data = train, importance = TRUE)

# compute test error
preds <- predict(fit, newdata = test)
err$rf <- mean((preds - test$Sales)^2)
# lower test error compared to bagging: reducing m reduces the test error
# here (see also the sketch on the effect of m below).

# variable importance
importance(fit)
varImpPlot(fit)
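# Supplementary sketch (not part of the original solution): we can make the
# effect of m explicit by refitting the forest over the full range of mtry
# values and recording the test error. The grid and variable names (m_grid,
# err_m) are our own choices; ntree = 500 is the randomForest default.
p <- ncol(Carseats) - 1
m_grid <- 1:p
err_m <- numeric(length(m_grid))
set.seed(1234)
for (k in seq_along(m_grid)) {
  rf <- randomForest(Sales ~ ., data = train, mtry = m_grid[k], ntree = 500)
  err_m[k] <- mean((predict(rf, newdata = test) - test$Sales)^2)
}
plot(m_grid, err_m, type = "b", xlab = "mtry (m)", ylab = "Test error")
# Typically the test error drops quickly as m grows from 1 and then levels
# off (or creeps up) as m approaches p, where the forest reduces to bagging.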
## ISLR 8.12 (Boosting, bagging, Random forest) -----
## Here we use the Carseats data as in Ex. 8.8, where we have already
## applied bagging and random forests.
## See ISLR 8.2.5 for a summary of the tree ensemble methods.

## Boosting - see ISLR 8.3.4 for code examples
library(gbm)
set.seed(1)
fit <- gbm(Sales ~ ., data = train, distribution = "gaussian",
           n.trees = 5000, interaction.depth = 4)
summary(fit)
plot(fit, i = "Price")  # partial dependence plot illustrates the marginal effect of Price

# compute test error (n.trees should match the number of fitted trees)
preds <- predict(fit, newdata = test, n.trees = 5000)
err$boost <- mean((preds - test$Sales)^2)

## Best subset selection - see section 6.5.1 for code examples
library(leaps)
# note: by default regsubsets() considers models with up to nvmax = 8 variables
fit <- regsubsets(Sales ~ ., data = train, method = "exhaustive")
fit_summary <- summary(fit)

# regsubsets has no predict method, so we define one (as in the ISLR lab)
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])
  mat <- model.matrix(form, newdata)
  coefi <- coef(object, id = id)
  mat[, names(coefi)] %*% coefi
}
preds <- predict(fit, newdata = test, id = which.min(fit_summary$bic))
err$bs <- mean((preds - test$Sales)^2)

## Linear regression
fit <- lm(Sales ~ ., data = train)
preds <- predict(fit, newdata = test)
err$lm <- mean((preds - test$Sales)^2)

## Compare test errors
unlist(err)
# On these data, the simpler linear regression methods obtain the best test
# error; in particular, the best subset selection procedure.
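# Supplementary sketch (our own addition): the boosting fit above uses a
# fixed n.trees = 5000 and the default shrinkage; gbm can instead select the
# number of trees by cross-validation via cv.folds and gbm.perf(). The
# shrinkage value 0.01 below is an assumed choice for illustration.
set.seed(1)
fit_cv <- gbm(Sales ~ ., data = train, distribution = "gaussian",
              n.trees = 5000, interaction.depth = 4,
              shrinkage = 0.01, cv.folds = 5)
best_trees <- gbm.perf(fit_cv, method = "cv")  # CV-optimal number of trees
preds <- predict(fit_cv, newdata = test, n.trees = best_trees)
mean((preds - test$Sales)^2)
# With a small shrinkage and a CV-chosen number of trees, boosting is often
# less prone to overfitting than the fixed-n.trees fit above.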