# Ex. 2.8
# "Compare the classification performance of linear regression and k-
# nearest neighbor classification on the zipcode data. In particular, consider
# only the 2's and 3's, and k = 1, 3, 5, 7 and 15. Show both the training and
# test error for each choice. The zipcode data are available from the book
# website www-stat.stanford.edu/ElemStatLearn."

# Info about the data (from the website):
# The data are in two gzipped files, and each line consists of the digit
# id (0-9) followed by the 256 grayscale values.
# There are 7291 training observations and 2007 test observations [...]

library(class)   # for the knn() function
library(ggplot2)
library(dplyr)

# import data ----
if (!file.exists("./data/zip.train.gz")) {
  options(timeout = max(300, getOption("timeout")))
  filepath <- "https://hastie.su.domains/ElemStatLearn/datasets/zip.train.gz"
  download.file(filepath, "./data/zip.train.gz")
  filepath <- "https://hastie.su.domains/ElemStatLearn/datasets/zip.test.gz"
  download.file(filepath, "./data/zip.test.gz")
  options(timeout = 60)
}

dfs <- list(train = read.table("./data/zip.train.gz"),
            test  = read.table("./data/zip.test.gz"))

# sample frequencies of each digit
lapply(dfs, function(df) addmargins(table(df$V1)))

# prep data ----
# - remove all observations that are not a 2 or a 3.
# - redefine the outcome as a 0-1 variable: 1 if the digit is 3, 0 if it is 2.
dfs <- lapply(dfs, function(df) df %>%
                filter(V1 %in% c(2, 3)) %>%
                rename(y = V1) %>%
                mutate(y = y - 2))

# sample frequencies of the new outcome variable
tab <- lapply(dfs, function(df) addmargins(table(df$y)))
tab  # matches the counts for 2 and 3 above

# fit classification models and compute errors ----

## linear regression ----
# fit the model
fit <- lm(y ~ ., data = dfs$train)
pred <- predict(fit)

# plot in-sample predictions: 3's (y = 1) in red, decision threshold at 0.5
plot(pred)
y <- dfs$train$y
points(which(y == 1), pred[y == 1], col = "red")
abline(a = 0.5, b = 0)

# compute misclassification errors:
# i)   false positive rate (FPR, share of 2's classified as 3's)
# ii)  false negative rate (FNR, share of 3's classified as 2's)
# iii) overall misclassification rate (TOT)
err_ls <- numeric(6)
names(err_ls) <- paste(c("FPR", "FNR", "TOT"),
                       rep(c("train", "test"), each = 3), sep = "_")

# predict outcomes in the training set
yhat <- fitted(fit) > .5
# count the number of correct predictions in each class
correct <- diag(table(yhat, dfs$train[, 1]))
# compute the rate of misclassified samples in each class and overall
err_ls[1:3] <- 1 - c(correct, sum(correct)) / tab$train

# repeat for the test set
yhat <- predict(fit, newdata = dfs$test[, -1]) > .5
correct <- diag(table(yhat, dfs$test[, 1]))
err_ls[4:6] <- 1 - c(correct, sum(correct)) / tab$test

# print as percentages
round(err_ls * 100, 2)

## kNN ----
k <- c(1, 3, 5, 7, 15)  # numbers of neighbors

# init matrix for storing errors
err_knn <- matrix(0, nrow = length(k), ncol = 2 * 3)
colnames(err_knn) <- names(err_ls)
rownames(err_knn) <- paste0("k = ", k)

for (i in seq_along(k)) {
  # predict outcomes in the training set
  yhat <- knn(dfs$train[, -1], dfs$train[, -1], cl = dfs$train[, 1], k = k[i])
  correct <- diag(table(yhat, dfs$train[, 1]))
  err_knn[i, 1:3] <- 1 - c(correct, sum(correct)) / tab$train

  # predict outcomes in the test set
  yhat <- knn(dfs$train[, -1], dfs$test[, -1], cl = dfs$train[, 1], k = k[i])
  correct <- diag(table(yhat, dfs$test[, 1]))
  err_knn[i, 4:6] <- 1 - c(correct, sum(correct)) / tab$test
}

# results ----
cat("Misclassification errors (%):\n")
err <- rbind(LS = err_ls, err_knn)
round(err * 100, 2)
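
# Cross-check (not part of the original exercise text): the overall error
# rates can also be computed directly as the share of mismatched labels,
# which should reproduce TOT_train and TOT_test above. A minimal sketch,
# assuming `fit` and `dfs` as defined earlier:
c(TOT_train = mean((fitted(fit) > .5) != dfs$train$y),
  TOT_test  = mean((predict(fit, newdata = dfs$test[, -1]) > .5) != dfs$test$y))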

# plot: training vs. test error per model, faceted by model family
dplyr::as_tibble(err * 100, rownames = "model") %>%
  tidyr::pivot_longer(-model) %>%
  mutate(model = factor(model, unique(model)),
         class = ifelse(model == "LS", "LS", "kNN")) %>%
  tidyr::separate(name, into = c("error", "sample")) %>%
  ggplot(aes(model, value, fill = error, color = error,
             linetype = sample, shape = sample)) +
  facet_grid(. ~ class, scales = "free_x") +
  geom_line(aes(group = interaction(error, sample))) +
  geom_point()
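
# Follow-up (beyond what the exercise asks for): report which k attains the
# lowest overall test error in the table above. A small sketch, assuming
# `err_knn` and `k` from earlier in the script.
best <- which.min(err_knn[, "TOT_test"])
cat("Lowest overall test error: k =", k[best], "with",
    round(100 * err_knn[best, "TOT_test"], 2), "%\n")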