# Proposed solution to ISLR Ex 4.13 (4.10 in first edition)
# Note that Chapter 4.7 includes code-examples where these
# classification methods is applied.

#install.packages('ISLR2')
# The install.packages only needs to be done once,to install the data from the book ISLR on your computer.

library(ISLR2)
data(Weekly)
head(Weekly)
help(Weekly)       # load documentation of data set

# -------------------------------
# a) summarise data to explore any patterns ----

summary(Weekly)
pairs(Weekly[, c("Year", "Volume", "Lag2", "Today", "Direction")])
corrplot::corrplot(cor(Weekly[, -9]))

# Volume and Year seems to be correlated, but it's hard to find any other patterns.

# marginal counts
table(Weekly$Direction)/nrow(Weekly)

# marginal counts by year
tmp <- aggregate(Weekly$Direction == "Up", by = Weekly["Year"], mean)
plot(tmp$Year, tmp$x, type = "l")
abline(h = mean(Weekly$Direction == "Up"), col = "red")

# -------------------------------
# b) logistic regression ----

help(glm)
head(Weekly)
fit = glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, family = 'binomial', data =  Weekly)
summary(fit)

# Lag2 is significant at a 5% level. The others do not.

# -------------------------------
# c) compute confusion matrix ----

# compute predicted probs and class
help(predict.glm)
probs = predict(fit, type = "response")
preds = ifelse(probs > .5, "Up", "Down")

# compute confusion matrix
tab <- table(preds, true = Weekly$Direction)
tab

cat("\nOverall accuracy:")
mean(preds == Weekly$Direction)

cat("\nRate of true negative and positives:")
diag(tab)/colSums(tab)


# The predictions seems to heavily favor predicting Up.
# Hence it is almost always correct when the market goes up, and
# and almost always wrong when the market goes down.


# define function for computing classification error, to be used in (d)-(f)
compute_class_error <- function(yhat, y) {
  tab <- table(yhat, y)
  res <- c(sum(diag(tab))/length(y),
           diag(tab)/colSums(tab))
  names(res) <- c("Acc", "TNR", "TPR")
  return(res)
}


# d)  Fit logistic regression using Lag2 as only covariate, on data up to 2008 ----

train <- Weekly$Year < 2009
fit = glm(Direction ~ Lag2, family = 'binomial', data =  Weekly[train, ])
summary(fit)

probs <- predict(fit, newdata = Weekly[!train, ], type = "response")
preds = rep("Down", length(probs))
preds[probs > 0.5] = "Up"


# compute classification accuracy
err <- list() # init list for storing res
err$logistic = compute_class_error(preds, Weekly[!train, ]$Direction)
err$logistic

# Again, most predictions are positive.

# e) Repeat (d) using linear discriminant analysis (LDA) ----
#    See code-lab in ISLR2 4.7.3 for LDA-applications
library(MASS)
?lda
fit <- lda(Direction~Lag2 , data = Weekly[train, ])
fit

# predict test-set outcomes using predict.lda
?predict.lda
tmp <- predict(fit, newdata = Weekly[!train, ])
names(tmp)
preds <- tmp$class

# compute classification accuracy
err$LDA <- compute_class_error(preds, Weekly[!train, ]$Direction)
err$LDA

# compare LDA to logistic regression
plot(tmp$posterior[, 2]-probs, main = "Difference in predictions P(Up|x)")
all((tmp$posterior[, 2] > .5) == (probs > .5))

# LDA and logistic regression in this case assign the exact same class labels.
# The estimates of the conditional probs P(Up|x) are similar, but not exactly the same.
# See ESL ch 4.X.X for a comparison of the two methods.

# f) Repeat (d) using linear discriminant analysis QDA ----
library(MASS)
?qda
fit <- qda(Direction~Lag2 , data = Weekly[train, ])
tmp <- predict(fit, newdata = Weekly[!train, ])
names(tmp)
preds <- tmp$class

# compute classification accuracy
err$QDA <- compute_class_error(preds, Weekly[!train, ]$Direction)
err$QDA

# f) naive Bayes ----
library(e1071)
?naiveBayes
fit <- naiveBayes(Direction~Lag2 , data = Weekly[train, ])
tmp <- predict(fit, newdata = Weekly[!train, ])
names(tmp)

# compute classification accuracy
err$naiveBayes <- compute_class_error(preds, Weekly[!train, ]$Direction)
err$naiveBayes

# i) Compare methods ----
do.call(rbind, err)

# logistic reg and LDA has the highest overall-accuracy in this case.
# all methods too often predict "Up" (QDA and naiveBayes only produce positive predictions.