# Exercise 4.10 from An Introduction to Statistical Learning (ISLR),
# using the `Weekly` stock-market data set shipped with the ISLR package.

# install.packages("ISLR")
# install.packages() only needs to be run once, to install the data from the
# ISLR book on your computer. Part a) also uses the corrplot package.
library(ISLR)
data(Weekly)
colnames(Weekly)
help(Weekly)

# -------------------------------
# a) Numerical and graphical summaries
summary(Weekly)
pairs(Weekly[, c("Year", "Volume", "Lag2", "Today", "Direction")])

# Direction is a factor, so drop it by name before computing correlations.
numeric_cols <- setdiff(names(Weekly), "Direction")
corrplot::corrplot(cor(Weekly[, numeric_cols]))
# Volume and Year seem to be correlated, but it is hard to find any other
# patterns.

# Marginal proportions of Down/Up weeks
table(Weekly$Direction) / nrow(Weekly)

# Proportion of Up weeks per year, with the overall proportion in red
up_by_year <- aggregate(Weekly$Direction == "Up", by = Weekly["Year"], mean)
plot(up_by_year$Year, up_by_year$x, type = "l")
abline(h = mean(Weekly$Direction == "Up"), col = "red")

# -------------------------------
# b) Logistic regression on the five lags plus Volume, using all the data
help(glm)
head(Weekly)
fit <- glm(
  Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
  family = "binomial",
  data = Weekly
)
summary(fit)
# Lag2 is significant at the 5% level. The other predictors are not.

# -------------------------------
# c) Confusion matrix and accuracy on the training data
help(predict.glm)
# For a factor response, glm() treats the first level ("Down") as failure, so
# the fitted probabilities are P(Direction == "Up").
probs <- predict(fit, type = "response")

# Build the predictions as a factor with the same levels as Direction so the
# confusion table is always square, even if only one class is ever predicted
# (otherwise diag(tab) / colSums(tab) would silently give wrong rates).
preds <- factor(
  ifelse(probs > 0.5, "Up", "Down"),
  levels = levels(Weekly$Direction)
)
tab <- table(preds, Weekly$Direction)
tab
sum(Weekly$Direction == "Down")
sum(Weekly$Direction == "Up")

cat("\nOverall accuracy:")
mean(preds == Weekly$Direction)

# Columns of `tab` are the true classes, so this is the per-class hit rate.
cat("\nRate of true negative and positives:")
diag(tab) / colSums(tab)
# The predictions heavily favor Up. Hence the model is almost always correct
# when the market goes up, and almost always wrong when the market goes down.
# -------------------------------
# d) Fit logistic regression using Lag2 as the only covariate, on data up to
#    and including 2008, and evaluate it on the held-out years (2009 onwards).
#    Relies on `Weekly` having been loaded earlier in the script.
train <- Weekly$Year < 2009
test_direction <- Weekly$Direction[!train]

fit_logit <- glm(Direction ~ Lag2, family = "binomial", data = Weekly[train, ])
summary(fit_logit)

# Fitted probabilities are P(Direction == "Up"): glm() treats the first factor
# level ("Down") as failure.
probs <- predict(fit_logit, newdata = Weekly[!train, ], type = "response")

# Predictions as a factor with the levels of Direction, so the confusion table
# stays square even if only one class is predicted on the test set.
preds <- factor(
  ifelse(probs > 0.5, "Up", "Down"),
  levels = levels(Weekly$Direction)
)

# Compute classification accuracy
tab <- table(preds, test_direction)
cat("\nOverall accuracy:")
mean(preds == test_direction)
cat("\nRate of true negative and positives:")
diag(tab) / colSums(tab)
# Again, most predictions are positive.

# -------------------------------
# e) Repeat (d) using linear discriminant analysis (LDA)
# See the code lab in ISLR2 4.7.3 for LDA applications.
library(MASS)
?lda
fit_lda <- lda(Direction ~ Lag2, data = Weekly[train, ])
fit_lda

# Predict test-set outcomes using predict.lda
?predict.lda
lda_pred <- predict(fit_lda, newdata = Weekly[!train, ])
names(lda_pred)
lda_class <- lda_pred$class

# Compute classification accuracy
tab <- table(lda_class, test_direction)
cat("\nOverall accuracy:")
mean(lda_class == test_direction)
cat("\nRate of true negative and positives:")
diag(tab) / colSums(tab)

# Compare LDA to logistic regression. The posterior matrix has one column per
# class; select "Up" by name rather than by position.
lda_prob_up <- lda_pred$posterior[, "Up"]
plot(lda_prob_up - probs, main = "Difference in predictions P(Up|x)")
all((lda_prob_up > 0.5) == (probs > 0.5))
# LDA and logistic regression in this case assign the exact same class labels.
# The estimates of the conditional probabilities P(Up|x) are similar, but not
# exactly the same.
# See ESL section 4.4.5 ("Logistic Regression or LDA?") for a comparison of
# the two methods.