# Ex. 2.8
# "Compare the classification performance of linear regression and k-
# nearest neighbor classification on the zipcode data. In particular, consider
# only the 2's and 3's, and k = 1, 3, 5, 7 and 15. Show both the training and
# test error for each choice. The zipcode data are available from the book
# website www-stat.stanford.edu/ElemStatLearn."

# Info about the data (from the website):
# The data are in two gzipped files, and each line consists of the digit
# id (0-9) followed by the 256 grayscale values.
# There are 7291 training observations and 2007 test observations [...]

library(class)   # for the knn() function
library(ggplot2)
library(dplyr)

# import data ----
if (!file.exists("./data/zip.train.gz")) {
  options(timeout = max(300, getOption("timeout")))
  filepath <- "https://hastie.su.domains/ElemStatLearn/datasets/zip.train.gz"
  download.file(filepath, "./data/zip.train.gz")
  filepath <- "https://hastie.su.domains/ElemStatLearn/datasets/zip.test.gz"
  download.file(filepath, "./data/zip.test.gz")
  options(timeout = 60)
}

dfs <- list(train = read.table("./data/zip.train.gz"),
            test  = read.table("./data/zip.test.gz"))

# sample frequencies of each digit
lapply(dfs, function(df) addmargins(table(df$V1)))

# prep data ----
# - remove all observations that are not a 2 or a 3.
# - redefine the outcome as a 0-1 variable: 1 if the digit is 3, 0 if it is 2.
dfs <- lapply(dfs, function(df) df %>%
                filter(V1 %in% c(2, 3)) %>%
                rename(y = V1) %>%
                mutate(y = y - 2))

# sample frequencies of the new outcome variable
tab <- lapply(dfs, function(df) addmargins(table(df$y)))
tab  # matches the counts for 2 and 3 above

# fit classification models and compute errors ----

## linear regression ----
# fit the model
fit <- lm(y ~ ., data = dfs$train)
pred <- predict(fit)

# plot in-sample predictions: 3's (y = 1) in red, decision threshold at 0.5
plot(pred)
y <- dfs$train$y
points(which(y == 1), pred[y == 1], col = "red")
abline(a = 0.5, b = 0)

# compute misclassification errors:
# i)   false positive rate (FPR, share of 2's classified as 3's)
# ii)  false negative rate (FNR, share of 3's classified as 2's)
# iii) overall misclassification rate (TOT)
err_ls <- numeric(6)
names(err_ls) <- paste(c("FPR", "FNR", "TOT"),
                       rep(c("train", "test"), each = 3), sep = "_")

# predict outcomes in the training set
yhat <- fitted(fit) > .5
# count the number of correct predictions in each class
correct <- diag(table(yhat, dfs$train[, 1]))
# compute the rate of misclassified samples in each class and overall
err_ls[1:3] <- 1 - c(correct, sum(correct)) / tab$train

# repeat for the test set
yhat <- predict(fit, newdata = dfs$test[, -1]) > .5
correct <- diag(table(yhat, dfs$test[, 1]))
err_ls[4:6] <- 1 - c(correct, sum(correct)) / tab$test

# print as percentages
round(err_ls * 100, 2)

## kNN ----
k <- c(1, 3, 5, 7, 15)  # numbers of neighbors

# init matrix for storing errors
err_knn <- matrix(0, nrow = length(k), ncol = 2 * 3)
colnames(err_knn) <- names(err_ls)
rownames(err_knn) <- paste0("k = ", k)

for (i in seq_along(k)) {
  # predict outcomes in the training set
  yhat <- knn(dfs$train[, -1], dfs$train[, -1], cl = dfs$train[, 1], k = k[i])
  correct <- diag(table(yhat, dfs$train[, 1]))
  err_knn[i, 1:3] <- 1 - c(correct, sum(correct)) / tab$train

  # predict outcomes in the test set
  yhat <- knn(dfs$train[, -1], dfs$test[, -1], cl = dfs$train[, 1], k = k[i])
  correct <- diag(table(yhat, dfs$test[, 1]))
  err_knn[i, 4:6] <- 1 - c(correct, sum(correct)) / tab$test
}

# results ----
cat("Misclassification errors (%):\n")
err <- rbind(LS = err_ls, err_knn)
round(err * 100, 2)
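
# Cross-check (not part of the original exercise text): the overall error
# rates can also be computed directly as the share of mismatched labels,
# which should reproduce TOT_train and TOT_test above. A minimal sketch,
# assuming `fit` and `dfs` as defined earlier:
c(TOT_train = mean((fitted(fit) > .5) != dfs$train$y),
  TOT_test  = mean((predict(fit, newdata = dfs$test[, -1]) > .5) != dfs$test$y))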

# plot: training vs. test error per model, faceted by model family
dplyr::as_tibble(err * 100, rownames = "model") %>%
  tidyr::pivot_longer(-model) %>%
  mutate(model = factor(model, unique(model)),
         class = ifelse(model == "LS", "LS", "kNN")) %>%
  tidyr::separate(name, into = c("error", "sample")) %>%
  ggplot(aes(model, value, fill = error, color = error,
             linetype = sample, shape = sample)) +
  facet_grid(. ~ class, scales = "free_x") +
  geom_line(aes(group = interaction(error, sample))) +
  geom_point()
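
# Follow-up (beyond what the exercise asks for): report which k attains the
# lowest overall test error in the table above. A small sketch, assuming
# `err_knn` and `k` from earlier in the script.
best <- which.min(err_knn[, "TOT_test"])
cat("Lowest overall test error: k =", k[best], "with",
    round(100 * err_knn[best, "TOT_test"], 2), "%\n")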