# Proposed solution to ISLR Ex 4.13 (4.10 in first edition) # Note that Chapter 4.7 includes code-examples where these # classification methods is applied. #install.packages('ISLR2') # The install.packages only needs to be done once,to install the data from the book ISLR on your computer. library(ISLR2) data(Weekly) head(Weekly) help(Weekly) # load documentation of data set # ------------------------------- # a) summarise data to explore any patterns ---- summary(Weekly) pairs(Weekly[, c("Year", "Volume", "Lag2", "Today", "Direction")]) corrplot::corrplot(cor(Weekly[, -9])) # Volume and Year seems to be correlated, but it's hard to find any other patterns. # marginal counts table(Weekly$Direction)/nrow(Weekly) # marginal counts by year tmp <- aggregate(Weekly$Direction == "Up", by = Weekly["Year"], mean) plot(tmp$Year, tmp$x, type = "l") abline(h = mean(Weekly$Direction == "Up"), col = "red") # ------------------------------- # b) logistic regression ---- help(glm) head(Weekly) fit = glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, family = 'binomial', data = Weekly) summary(fit) # Lag2 is significant at a 5% level. The others do not. # ------------------------------- # c) compute confusion matrix ---- # compute predicted probs and class help(predict.glm) probs = predict(fit, type = "response") preds = ifelse(probs > .5, "Up", "Down") # compute confusion matrix tab <- table(preds, true = Weekly$Direction) tab cat("\nOverall accuracy:") mean(preds == Weekly$Direction) cat("\nRate of true negative and positives:") diag(tab)/colSums(tab) # The predictions seems to heavily favor predicting Up. # Hence it is almost always correct when the market goes up, and # and almost always wrong when the market goes down. # define function for computing classification error, to be used in (d)-(f) compute_class_error <- function(yhat, y) { tab <- table(yhat, y) res <- c(sum(diag(tab))/length(y), diag(tab)/colSums(tab)) names(res) <- c("Acc", "TNR", "TPR") return(res) } # d) Fit logistic regression using Lag2 as only covariate, on data up to 2008 ---- train <- Weekly$Year < 2009 fit = glm(Direction ~ Lag2, family = 'binomial', data = Weekly[train, ]) summary(fit) probs <- predict(fit, newdata = Weekly[!train, ], type = "response") preds = rep("Down", length(probs)) preds[probs > 0.5] = "Up" # compute classification accuracy err <- list() # init list for storing res err$logistic = compute_class_error(preds, Weekly[!train, ]$Direction) err$logistic # Again, most predictions are positive. # e) Repeat (d) using linear discriminant analysis (LDA) ---- # See code-lab in ISLR2 4.7.3 for LDA-applications library(MASS) ?lda fit <- lda(Direction~Lag2 , data = Weekly[train, ]) fit # predict test-set outcomes using predict.lda ?predict.lda tmp <- predict(fit, newdata = Weekly[!train, ]) names(tmp) preds <- tmp$class # compute classification accuracy err$LDA <- compute_class_error(preds, Weekly[!train, ]$Direction) err$LDA # compare LDA to logistic regression plot(tmp$posterior[, 2]-probs, main = "Difference in predictions P(Up|x)") all((tmp$posterior[, 2] > .5) == (probs > .5)) # LDA and logistic regression in this case assign the exact same class labels. # The estimates of the conditional probs P(Up|x) are similar, but not exactly the same. # See ESL ch 4.X.X for a comparison of the two methods. # f) Repeat (d) using linear discriminant analysis QDA ---- library(MASS) ?qda fit <- qda(Direction~Lag2 , data = Weekly[train, ]) tmp <- predict(fit, newdata = Weekly[!train, ]) names(tmp) preds <- tmp$class # compute classification accuracy err$QDA <- compute_class_error(preds, Weekly[!train, ]$Direction) err$QDA # f) naive Bayes ---- library(e1071) ?naiveBayes fit <- naiveBayes(Direction~Lag2 , data = Weekly[train, ]) tmp <- predict(fit, newdata = Weekly[!train, ]) names(tmp) # compute classification accuracy err$naiveBayes <- compute_class_error(preds, Weekly[!train, ]$Direction) err$naiveBayes # i) Compare methods ---- do.call(rbind, err) # logistic reg and LDA has the highest overall-accuracy in this case. # all methods too often predict "Up" (QDA and naiveBayes only produce positive predictions.