# Exercise 4.10 from An Introduction to Statistical Learning (ISLR).

#install.packages('ISLR')
# install.packages() only needs to be run once; it installs the ISLR package (the data sets from the book ISLR) on your computer.

library(ISLR)
# Load the Weekly data set from the attached ISLR package, list its column
# names, and open its help page for the variable descriptions.
data("Weekly")
names(Weekly)
?Weekly

# -------------------------------
# a)
# Numerical summary of every column.
summary(Weekly)

# Scatterplot matrix of a subset of the variables.
pairs(Weekly[, c("Year", "Volume", "Lag2", "Today", "Direction")])

# Correlation matrix of the numeric columns. Columns are selected by type
# rather than by position, so non-numeric columns (here Direction) are
# excluded regardless of the column order.
is_num <- vapply(Weekly, is.numeric, logical(1))
cor_mat <- cor(Weekly[, is_num])
if (requireNamespace("corrplot", quietly = TRUE)) {
  corrplot::corrplot(cor_mat)
} else {
  # corrplot is an optional extra; fall back to printing the matrix.
  print(round(cor_mat, 2))
}

# Volume and Year seem to be correlated, but it's hard to find any other patterns.

# marginal proportions of Down and Up
prop.table(table(Weekly$Direction))

# proportion of Up by year, with the overall proportion as a red reference line
up_by_year <- aggregate(Weekly$Direction == "Up", by = Weekly["Year"], mean)
plot(up_by_year$Year, up_by_year$x, type = "l",
     xlab = "Year", ylab = "Proportion of Up")
abline(h = mean(Weekly$Direction == "Up"), col = "red")

# -------------------------------
# b)

?glm
head(Weekly)

# Logistic regression of Direction on Lag1-Lag5 and Volume, using all rows.
fit <- glm(
  Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
  family = "binomial",
  data = Weekly
)
summary(fit)

# Lag2 is significant at the 5% level. The others are not.

# -------------------------------
# c)
help(predict.glm)

# Fitted probabilities on the data the model was trained on. For a binomial
# glm with a factor response, the first level counts as "failure", so these
# are the probabilities of the other level (Up, given levels Down/Up).
probs <- predict(fit, type = "response")

# Classify with a 0.5 threshold. Storing the labels as a factor with the same
# levels as Direction guarantees a square confusion matrix even if one class
# is never predicted; with plain character labels the table would lose a row
# and diag(tab) / colSums(tab) would silently pair the wrong cells.
preds <- factor(
  ifelse(probs > 0.5, "Up", "Down"),
  levels = levels(Weekly$Direction)
)
tab <- table(preds, Weekly$Direction)
tab
sum(Weekly$Direction == "Down")
sum(Weekly$Direction == "Up")

cat("\nOverall accuracy:")
mean(preds == Weekly$Direction)

# Columns of tab are the true classes, so diagonal / column totals gives the
# share of true Down and of true Up observations that were classified
# correctly.
cat("\nRate of true negative and positives:")
diag(tab) / colSums(tab)


# The predictions seem to heavily favor predicting Up.
# Hence the model is almost always correct when the market goes up,
# and almost always wrong when the market goes down.



# d) Fit logistic regression using Lag2 as only covariate, on data up to 2008
train <- Weekly$Year < 2009
fit <- glm(Direction ~ Lag2, family = "binomial", data = Weekly[train, ])
summary(fit)

# Predicted probabilities for the held-out rows (Year >= 2009). As for any
# binomial glm with a factor response, these are the probabilities of the
# non-first level (Up, given levels Down/Up).
probs <- predict(fit, newdata = Weekly[!train, ], type = "response")

# Threshold at 0.5 and keep both class levels, so the confusion matrix stays
# 2 x 2 (and diag / colSums stays aligned) even if one class is never
# predicted.
preds <- factor(
  ifelse(probs > 0.5, "Up", "Down"),
  levels = levels(Weekly$Direction)
)

# compute classification accuracy
tab <- table(preds, Weekly$Direction[!train])
tab
cat("\nOverall accuracy:")
mean(preds == Weekly$Direction[!train])

# columns of tab are the true classes: share of each true class predicted
# correctly
cat("\nRate of true negative and positives:")
diag(tab) / colSums(tab)

# Again, most predictions are positive.

# e) Repeat (d) using linear discriminant analysis (LDA)
#    See code-lab in ISLR2 4.7.3 for LDA-applications
library(MASS)
?lda
fit <- lda(Direction ~ Lag2, data = Weekly[train, ])
fit

# predict test-set outcomes using predict.lda
?predict.lda
tmp <- predict(fit, newdata = Weekly[!train, ])
names(tmp)
# predict.lda returns the predicted class as a factor in $class
preds <- tmp$class

# compute classification accuracy
tab <- table(preds, Weekly$Direction[!train])
tab
cat("\nOverall accuracy:")
mean(preds == Weekly$Direction[!train])

cat("\nRate of true negative and positives:")
diag(tab) / colSums(tab)

# compare LDA to logistic regression
# `probs` still holds the test-set probabilities from the logistic fit in (d).
# The posterior column is picked by class name rather than by position, so
# the comparison cannot silently use the wrong class.
lda_up <- tmp$posterior[, "Up"]
plot(lda_up - probs, main = "Difference in predictions P(Up|x)",
     ylab = "LDA - logistic regression")
all((lda_up > .5) == (probs > .5))

# LDA and logistic regression in this case assign the exact same class labels.
# The estimates of the conditional probs P(Up|x) are similar, but not exactly the same.
# See ESL section 4.4.5 ("Logistic Regression or LDA?") for a comparison of the two methods.