## STK2100. Spring 2024.
## Exercise set 8
## These solutions are mostly taken from the spring 2020 solutions (week 10).

## ISLR 7.10 (College data) -----

# -------------------------------
# 7.10 from ISL
# -------------------------------

# a)
set.seed(1)
library(ISLR)
library(leaps)
attach(College)
train = sample(length(Outstate), length(Outstate)/2)
test = -train
College.train = College[train, ]
College.test = College[test, ]
reg.fit = regsubsets(Outstate ~ ., data = College.train, nvmax = 17, method = "forward")
reg.summary = summary(reg.fit)
plot(reg.summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "l")
min.bic = min(reg.summary$bic)
std.bic = sd(reg.summary$bic)
abline(h = min.bic + 0.2 * std.bic, col = "red", lty = 2)
abline(h = min.bic - 0.2 * std.bic, col = "red", lty = 2)
# The BIC scores show that 6 is the smallest subset size for which the score is
# within 0.2 standard deviations of the optimum. We pick 6 as the best subset size
# and find the best 6 variables using the training data.
reg.fit = regsubsets(Outstate ~ ., data = College.train, method = "forward")
coefi = coef(reg.fit, id = 6)
names(coefi)

# b)
# install.packages('gam')
library(gam)
gam.fit = gam(Outstate ~ Private + s(Room.Board, df = 2) + s(PhD, df = 2) +
                s(perc.alumni, df = 2) + s(Expend, df = 5) + s(Grad.Rate, df = 2),
              data = College.train)
par(mfrow = c(2, 3))
plot(gam.fit, se = T, col = "blue")
# The plots show the smoothing spline fit for each covariate; the response is
# mostly increasing in all of these inputs.

# c)
# Test error for the GAM
gam.pred = predict(gam.fit, College.test)
gam.err = mean((College.test$Outstate - gam.pred)^2)
gam.err
# Test error for the linear regression model
predict.regsubsets = function(object, newdata, id, ...) {
  form = as.formula(object$call[[2]])
  mat = model.matrix(form, newdata)
  coefi = coef(object, id = id)
  mat[, names(coefi)] %*% coefi
}
reg.pred = predict(reg.fit, newdata = College.test, id = 6)
reg.err = mean((College.test$Outstate - reg.pred)^2)
reg.err
# Compare in terms of unexplained variance
tss = mean((College.test$Outstate - mean(College.test$Outstate))^2)
1 - gam.err/tss
1 - reg.err/tss
# We obtain a test R-squared of 0.77 using the GAM with 6 predictors. This is a
# slight improvement over the test R-squared of 0.74 obtained using OLS.

# d)
summary(gam.fit)
# The non-parametric ANOVA test shows strong evidence of a non-linear relationship
# between the response and Expend.

# -------------------------------
# 7.11 from ISL
# -------------------------------

# a)
# We generate data according to the equation Y = -2.1 + 1.3*X1 + 0.54*X2 + eps.
set.seed(1)
X1 = rnorm(100)
X2 = rnorm(100)
eps = rnorm(100, sd = 0.1)
Y = -2.1 + 1.3 * X1 + 0.54 * X2 + eps

# b)
# Create vectors of length 1000 for beta0, beta1 and beta2.
# Initialize the first beta1 value to 10.
beta0 = rep(NA, 1000)
beta1 = rep(NA, 1000)
beta2 = rep(NA, 1000)
beta1[1] = 10

# c, d and e)
# Accumulate the results of 1000 backfitting iterations in the beta vectors.
for (i in 1:1000) {
  a = Y - beta1[i] * X1
  beta2[i] = lm(a ~ X2)$coef[2]
  a = Y - beta2[i] * X2
  lm.fit = lm(a ~ X1)
  if (i < 1000) {
    beta1[i + 1] = lm.fit$coef[2]
  }
  beta0[i] = lm.fit$coef[1]
}
plot(1:1000, beta0, col = "green", type = "l", xlab = "iteration", ylab = "betas",
     ylim = c(-4, 1.6))
lines(1:1000, beta1, col = "red")
lines(1:1000, beta2, col = "blue")
# legend("topright", c("beta0", "beta1", "beta2"), lty = 1,
#        col = c("green", "red", "blue"))
# The coefficients quickly attain their least squares values.
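# A small additional check (not part of the original solution): we can quantify how
# quickly the backfitting estimates stabilise by finding the first iteration at which
# beta1 changes by less than a small tolerance. The tolerance 1e-8 is an arbitrary
# illustrative choice.
which(abs(diff(beta1)) < 1e-8)[1]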
# f)
lm.fit = lm(Y ~ X1 + X2)
abline(h = lm.fit$coef[1], lty = "dashed", lwd = 3, col = rgb(0, 0, 0, alpha = 0.4))
abline(h = lm.fit$coef[2], lty = "dashed", lwd = 3, col = rgb(0, 0, 0, alpha = 0.4))
abline(h = lm.fit$coef[3], lty = "dashed", lwd = 3, col = rgb(0, 0, 0, alpha = 0.4))
# legend("center", c("beta0", "beta1", "beta2", "multiple regression"),
#        lty = c(1, 1, 1, 2), col = c("green", "red", "blue", "black"))
# The dashed lines show that the multiple regression coefficients match the
# coefficients obtained by backfitting.

# g)
# When the relationship between Y and the X's is linear, one iteration is sufficient
# to attain a good approximation of the true regression coefficients.
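# As an illustrative check of the statement in g) (assuming the objects from the
# backfitting loop above are still in the workspace), we can compare the estimates
# after the first few backfitting iterations with the multiple regression
# coefficients. Note that beta1 is shifted by one index because beta1[i + 1] stores
# the value produced in iteration i.
cbind(beta0 = beta0[1:3], beta1 = beta1[2:4], beta2 = beta2[1:3])
coef(lm(Y ~ X1 + X2))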