# STK2100 Spring 2024.
# Exercise set 7.
# These solutions are based on those given in 2020.

# -------------------------------
# 7.9 from ISL
# -------------------------------
set.seed(1)
library(MASS)
attach(Boston)

# -------------------------------
# a)
# -------------------------------
# Fit a third-degree polynomial.
lm.fit = lm(nox ~ poly(dis, 3), data = Boston)
summary(lm.fit)

dislim = range(dis)
dis.grid = seq(from = dislim[1], to = dislim[2], by = 0.1)
lm.pred = predict(lm.fit, list(dis = dis.grid))
plot(nox ~ dis, data = Boston, col = "darkgrey")
lines(dis.grid, lm.pred, col = "red", lwd = 2)
# The summary shows that all polynomial terms are significant when predicting
# nox from dis. The plot shows a smooth curve that fits the data fairly well.

# -------------------------------
# b)
# -------------------------------
# We fit polynomials of degrees 1 to 10 and save the train RSS.
all.rss = rep(NA, 10)
for (i in 1:10) {
  lm.fit = lm(nox ~ poly(dis, i), data = Boston)
  all.rss[i] = sum(lm.fit$residuals^2)
}
all.rss
# As expected, the train RSS decreases monotonically with the degree of
# the polynomial.

# -------------------------------
# c)
# -------------------------------
# We use 10-fold cross-validation to pick the best polynomial degree.
library(boot)
all.deltas = rep(NA, 10)
for (i in seq_along(all.deltas)) {
  glm.fit = glm(nox ~ poly(dis, i), data = Boston)
  all.deltas[i] = cv.glm(Boston, glm.fit, K = 10)$delta[2]
}
plot(seq_along(all.deltas), all.deltas, xlab = "Degree", ylab = "CV error",
     type = "l", pch = 20, lwd = 2)
# The 10-fold CV shows that the CV error decreases as the degree increases
# from 1 to 3, stays almost constant until degree 5, and then starts
# increasing for higher degrees. We pick 4 as the best polynomial degree.

# -------------------------------
# d)
# -------------------------------
# dis ranges from about 1 to 13. We split this range into 4 roughly equal
# intervals and place knots at (4, 7, 11). Note: bs() expects either a df
# or a knots argument. If both are specified, the knots take precedence and
# df is ignored.
library(splines)
sp.fit = lm(nox ~ bs(dis, df = 4, knots = c(4, 7, 11)), data = Boston)
summary(sp.fit)
sp.pred = predict(sp.fit, list(dis = dis.grid))
plot(nox ~ dis, data = Boston, col = "darkgrey")
lines(dis.grid, sp.pred, col = "red", lwd = 2)
# The summary shows that all terms in the spline fit are significant. The
# plot shows that the spline fits the data well except at extreme values
# of dis (especially dis > 10).

# -------------------------------
# e)
# -------------------------------
# We fit regression splines with df between 3 and 16.
dev.off()  # close the current plotting device
par(mfrow = c(2, 2))  # 2-by-2 grid of plots
all.rss = rep(NA, 16)
for (i in 3:16) {
  lm.fit = lm(nox ~ bs(dis, df = i), data = Boston)
  all.rss[i] = sum(lm.fit$residuals^2)
  # plot the fitted regression curve for selected dfs
  if (i %% 4 == 0) {
    plot(nox ~ dis, data = Boston, col = "darkgrey", main = paste0("df = ", i))
    lines(dis.grid, predict(lm.fit, list(dis = dis.grid)), col = "red", lwd = 2)
  }
}
all.rss[-c(1, 2)]
par(mfrow = c(1, 1))  # reset
plot(3:16, all.rss[-c(1, 2)], lwd = 2, type = "l",
     xlab = "df", ylab = "train RSS")
# The train RSS decreases until df = 14 and then increases slightly for
# df = 15 and df = 16. The fitted curves become more flexible for higher
# dfs, but there are no large differences between the fits for df > 8,
# which reflects the small changes in the train RSS.
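# Aside (not part of the original solution): when only df is supplied, bs()
# places the interior knots at quantiles of dis. A minimal sketch to inspect
# the knots it chooses; df = 8 is an arbitrary choice for illustration.
basis = bs(dis, df = 8)
attr(basis, "knots")           # interior knots, at quantiles of dis
attr(basis, "Boundary.knots")  # boundary knots, at range(dis)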
# -------------------------------
# f)
# -------------------------------
# Finally, we use 10-fold cross-validation to find the best df. We try all
# integer values of df between 3 and 16.
all.cv = rep(NA, 16)
for (i in 3:16) {
  glm.fit = glm(nox ~ bs(dis, df = i), data = Boston)
  all.cv[i] = cv.glm(Boston, glm.fit, K = 10)$delta[2]
}
plot(3:16, all.cv[-c(1, 2)], lwd = 2, type = "l", xlab = "df", ylab = "CV error")
# The CV error is more jumpy in this case, but attains its minimum at
# df = 10, so we pick 10 as the optimal degrees of freedom. PS: The chosen
# df will vary from run to run, because cv.glm uses different random fold
# splits each time.
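# Aside (not part of the original solution): one way to reduce the run-to-run
# variability noted above is to repeat the 10-fold CV several times and
# average the errors before picking df. A minimal sketch; the number of
# repeats (n.rep = 20) is an arbitrary choice.
n.rep = 20
cv.mat = matrix(NA, nrow = n.rep, ncol = 16)
for (r in 1:n.rep) {
  for (i in 3:16) {
    glm.fit = glm(nox ~ bs(dis, df = i), data = Boston)
    cv.mat[r, i] = cv.glm(Boston, glm.fit, K = 10)$delta[2]
  }
}
mean.cv = colMeans(cv.mat)[3:16]  # average CV error for each df
plot(3:16, mean.cv, lwd = 2, type = "l",
     xlab = "df", ylab = "mean CV error")
which.min(mean.cv) + 2  # df with the smallest averaged CV error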