## STK2100. Spring 2024.
## Exercise set 9

## ISLR 8.7 (Random forest, Boston data) -----
## See section 8.3.3 for code examples

library(ISLR2)
library(randomForest)
help("randomForest")  # documentation

# load data
attach(Boston)
head(Boston)

# divide into train and test set
set.seed(1)
train <- sample(1:nrow(Boston), nrow(Boston) / 2)
boston.test <- Boston[-train, "medv"]

# fit random forest models over a grid of mtry and ntree values
# (code adapted from ISLR 8.3.3)
set.seed(1)
p <- ncol(Boston) - 1
par <- list(mtry = round(c(sqrt(p), p / 3, p / 2, p)),
            ntree = seq(1, 501, by = 25))
err <- array(dim = lengths(par), dimnames = par)
for (i in seq_along(par$mtry)) {
  for (j in seq_along(par$ntree)) {
    m <- par$mtry[i]
    n <- par$ntree[j]
    cat("\nmtry:", m, "ntree:", n)
    # fit model
    rf.boston <- randomForest(medv ~ ., data = Boston, subset = train,
                              mtry = m, ntree = n)
    # predict test-data outcomes
    yhat.rf <- predict(rf.boston, newdata = Boston[-train, ])
    # compute test error
    err[i, j] <- mean((yhat.rf - boston.test)^2)
  }
}
matplot(par$ntree, t(err), type = "l",
        xlab = "Number of trees", ylab = "Test error")
legend("topright", legend = paste0("m = ", par$mtry),
       lty = seq_along(par$mtry), col = seq_along(par$mtry))
# Adding trees reduces the test error up to a certain number of trees,
# after which the test error flattens out. We obtain a lower test error
# when we restrict the number of candidate variables per split: here the
# lowest test error is typically attained with m = round(sqrt(p)) = 3,
# followed by the default regression rule m = p/3.

## ISLR 8.8 (Regression trees, Carseats data) -----
## Solutions taken from the spring 2019 exercise set 10.
## See section 8.3.1 for more code examples

rm(list = ls())
err <- list()  # for collecting test errors of the different models

## a)
library(ISLR)
attach(Carseats)
set.seed(1234)
idx_train <- sample(dim(Carseats)[1], 200)
train <- Carseats[idx_train, ]
test <- Carseats[-idx_train, ]

## b)
library(tree)
?tree
?tree.control  # use control = tree.control() inside the tree() call

# fit regression tree
fit <- tree(Sales ~ ., data = train)
summary(fit)
plot(fit)
text(fit)

# compute test error
preds <- predict(fit, test)
err$tree <- mean((preds - test$Sales)^2)

## c)
?cv.tree
?prune.tree

# run cross-validation over a sequence of subtrees
cvfit <- cv.tree(fit, FUN = prune.tree)
plot(cvfit$size, cvfit$dev,
     xlab = "Number of terminal nodes (size)", ylab = "Deviance")
plot(cvfit$k, cvfit$dev,
     xlab = "Cost-complexity parameter, k", ylab = "Deviance")

# fit the pruned tree with the lowest CV error
idx_best <- which.min(cvfit$dev)
idx_best
best_size <- cvfit$size[idx_best]
best_size
fit_best <- prune.tree(fit, best = best_size)
preds <- predict(fit_best, test)
err$tree_cv <- mean((preds - test$Sales)^2)
err
# No, here pruning does not improve the test error.

## d) bagging
# fit model - use m = p for bagging
fit <- randomForest(Sales ~ ., data = train,
                    mtry = ncol(Carseats) - 1, importance = TRUE)

# compute test error
preds <- predict(fit, newdata = test)
err$bag <- mean((preds - test$Sales)^2)

# variable importance
importance(fit)
varImpPlot(fit)

## e) random forest
# fit model - use the default m = p/3
fit <- randomForest(Sales ~ ., data = train, importance = TRUE)

# compute test error
preds <- predict(fit, newdata = test)
err$rf <- mean((preds - test$Sales)^2)
# lower test error compared to bagging: reducing m reduces the test error
# here (see also the sketch on the effect of m below).

# variable importance
importance(fit)
varImpPlot(fit)
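# Supplementary sketch (not part of the original solution): we can make the
# effect of m explicit by refitting the forest over the full range of mtry
# values and recording the test error. The grid and variable names (m_grid,
# err_m) are our own choices; ntree = 500 is the randomForest default.
p <- ncol(Carseats) - 1
m_grid <- 1:p
err_m <- numeric(length(m_grid))
set.seed(1234)
for (k in seq_along(m_grid)) {
  rf <- randomForest(Sales ~ ., data = train, mtry = m_grid[k], ntree = 500)
  err_m[k] <- mean((predict(rf, newdata = test) - test$Sales)^2)
}
plot(m_grid, err_m, type = "b", xlab = "mtry (m)", ylab = "Test error")
# Typically the test error drops quickly as m grows from 1 and then levels
# off (or creeps up) as m approaches p, where the forest reduces to bagging.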
## ISLR 8.12 (Boosting, bagging, Random forest) -----
## Here we use the Carseats data as in Ex. 8.8, where we have already
## applied bagging and random forests.
## See ISLR 8.2.5 for a summary of the tree ensemble methods.

## Boosting - see ISLR 8.3.4 for code examples
library(gbm)
set.seed(1)
fit <- gbm(Sales ~ ., data = train, distribution = "gaussian",
           n.trees = 5000, interaction.depth = 4)
summary(fit)
plot(fit, i = "Price")  # partial dependence plot illustrates the marginal effect of Price

# compute test error (n.trees should match the number of fitted trees)
preds <- predict(fit, newdata = test, n.trees = 5000)
err$boost <- mean((preds - test$Sales)^2)

## Best subset selection - see section 6.5.1 for code examples
library(leaps)
# note: by default regsubsets() considers models with up to nvmax = 8 variables
fit <- regsubsets(Sales ~ ., data = train, method = "exhaustive")
fit_summary <- summary(fit)

# regsubsets has no predict method, so we define one (as in the ISLR lab)
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])
  mat <- model.matrix(form, newdata)
  coefi <- coef(object, id = id)
  mat[, names(coefi)] %*% coefi
}
preds <- predict(fit, newdata = test, id = which.min(fit_summary$bic))
err$bs <- mean((preds - test$Sales)^2)

## Linear regression
fit <- lm(Sales ~ ., data = train)
preds <- predict(fit, newdata = test)
err$lm <- mean((preds - test$Sales)^2)

## Compare test errors
unlist(err)
# On these data, the simpler linear regression methods obtain the best test
# error; in particular, the best subset selection procedure.
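# Supplementary sketch (our own addition): the boosting fit above uses a
# fixed n.trees = 5000 and the default shrinkage; gbm can instead select the
# number of trees by cross-validation via cv.folds and gbm.perf(). The
# shrinkage value 0.01 below is an assumed choice for illustration.
set.seed(1)
fit_cv <- gbm(Sales ~ ., data = train, distribution = "gaussian",
              n.trees = 5000, interaction.depth = 4,
              shrinkage = 0.01, cv.folds = 5)
best_trees <- gbm.perf(fit_cv, method = "cv")  # CV-optimal number of trees
preds <- predict(fit_cv, newdata = test, n.trees = best_trees)
mean((preds - test$Sales)^2)
# With a small shrinkage and a CV-chosen number of trees, boosting is often
# less prone to overfitting than the fixed-n.trees fit above.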