## STK2100. Spring 2024.
## Exercise set 8
## These solutions are mostly taken from the spring 2020 solutions (week 10).

## ISLR 7.10 (College data) -----

# -------------------------------
# 7.10 from ISL
# -------------------------------

# a)
set.seed(1)
library(ISLR)
library(leaps)
attach(College)
train = sample(length(Outstate), length(Outstate)/2)
test = -train
College.train = College[train, ]
College.test = College[test, ]
reg.fit = regsubsets(Outstate ~ ., data = College.train, nvmax = 17, method = "forward")
reg.summary = summary(reg.fit)
plot(reg.summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "l")
min.bic = min(reg.summary$bic)
std.bic = sd(reg.summary$bic)
abline(h = min.bic + 0.2 * std.bic, col = "red", lty = 2)
abline(h = min.bic - 0.2 * std.bic, col = "red", lty = 2)
# The BIC scores show that 6 is the smallest subset size for which the score is
# within 0.2 standard deviations of the optimum. We pick 6 as the best subset size
# and find the best 6 variables using the training data.
reg.fit = regsubsets(Outstate ~ ., data = College.train, method = "forward")
coefi = coef(reg.fit, id = 6)
names(coefi)

# b)
# install.packages('gam')
library(gam)
gam.fit = gam(Outstate ~ Private + s(Room.Board, df = 2) + s(PhD, df = 2) +
                s(perc.alumni, df = 2) + s(Expend, df = 5) + s(Grad.Rate, df = 2),
              data = College.train)
par(mfrow = c(2, 3))
plot(gam.fit, se = T, col = "blue")
# The plots show the smoothing spline fit for each covariate; the response is
# mostly increasing in all of these inputs.

# c)
# Test error for the GAM
gam.pred = predict(gam.fit, College.test)
gam.err = mean((College.test$Outstate - gam.pred)^2)
gam.err
# Test error for the linear regression model
predict.regsubsets = function(object, newdata, id, ...) {
  form = as.formula(object$call[[2]])
  mat = model.matrix(form, newdata)
  coefi = coef(object, id = id)
  mat[, names(coefi)] %*% coefi
}
reg.pred = predict(reg.fit, newdata = College.test, id = 6)
reg.err = mean((College.test$Outstate - reg.pred)^2)
reg.err
# Compare in terms of unexplained variance
tss = mean((College.test$Outstate - mean(College.test$Outstate))^2)
1 - gam.err/tss
1 - reg.err/tss
# We obtain a test R-squared of 0.77 using the GAM with 6 predictors. This is a
# slight improvement over the test R-squared of 0.74 obtained using OLS.

# d)
summary(gam.fit)
# The non-parametric ANOVA test shows strong evidence of a non-linear relationship
# between the response and Expend.

# -------------------------------
# 7.11 from ISL
# -------------------------------

# a)
# We generate data according to the equation Y = -2.1 + 1.3*X1 + 0.54*X2 + eps.
set.seed(1)
X1 = rnorm(100)
X2 = rnorm(100)
eps = rnorm(100, sd = 0.1)
Y = -2.1 + 1.3 * X1 + 0.54 * X2 + eps

# b)
# Create vectors of length 1000 for beta0, beta1 and beta2.
# Initialize the first beta1 value to 10.
beta0 = rep(NA, 1000)
beta1 = rep(NA, 1000)
beta2 = rep(NA, 1000)
beta1[1] = 10

# c, d and e)
# Accumulate the results of 1000 backfitting iterations in the beta vectors.
for (i in 1:1000) {
  a = Y - beta1[i] * X1
  beta2[i] = lm(a ~ X2)$coef[2]
  a = Y - beta2[i] * X2
  lm.fit = lm(a ~ X1)
  if (i < 1000) {
    beta1[i + 1] = lm.fit$coef[2]
  }
  beta0[i] = lm.fit$coef[1]
}
plot(1:1000, beta0, col = "green", type = "l", xlab = "iteration", ylab = "betas",
     ylim = c(-4, 1.6))
lines(1:1000, beta1, col = "red")
lines(1:1000, beta2, col = "blue")
# legend("topright", c("beta0", "beta1", "beta2"), lty = 1,
#        col = c("green", "red", "blue"))
# The coefficients quickly attain their least squares values.
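# A small additional check (not part of the original solution): we can quantify how
# quickly the backfitting estimates stabilise by finding the first iteration at which
# beta1 changes by less than a small tolerance. The tolerance 1e-8 is an arbitrary
# illustrative choice.
which(abs(diff(beta1)) < 1e-8)[1]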
# f)
lm.fit = lm(Y ~ X1 + X2)
abline(h = lm.fit$coef[1], lty = "dashed", lwd = 3, col = rgb(0, 0, 0, alpha = 0.4))
abline(h = lm.fit$coef[2], lty = "dashed", lwd = 3, col = rgb(0, 0, 0, alpha = 0.4))
abline(h = lm.fit$coef[3], lty = "dashed", lwd = 3, col = rgb(0, 0, 0, alpha = 0.4))
# legend("center", c("beta0", "beta1", "beta2", "multiple regression"),
#        lty = c(1, 1, 1, 2), col = c("green", "red", "blue", "black"))
# The dashed lines show that the multiple regression coefficients match the
# coefficients obtained by backfitting.

# g)
# When the relationship between Y and the X's is linear, one iteration is sufficient
# to attain a good approximation of the true regression coefficients.
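# As an illustrative check of the statement in g) (assuming the objects from the
# backfitting loop above are still in the workspace), we can compare the estimates
# after the first few backfitting iterations with the multiple regression
# coefficients. Note that beta1 is shifted by one index because beta1[i + 1] stores
# the value produced in iteration i.
cbind(beta0 = beta0[1:3], beta1 = beta1[2:4], beta2 = beta2[1:3])
coef(lm(Y ~ X1 + X2))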