# code based on the files "Code regarding section 3.2 (copyright 2003, 2004, 2012 A.Azzalini and B.Scarpa)" # and "Code regarding section 3.3-3.5" (copyright 2003, 2004, 2012 A.Azzalini and B.Scarpa) # read data dataset <- read.table("http://azzalini.stat.unipd.it/Book-DM/yesterday.dat", header = TRUE) attach(dataset) n <- nrow(dataset) y.range <- range(c(y.yesterday, y.tomorrow)) # define f(x) as polynomials polynomial <- function(p) lm(y.yesterday ~ poly(x,p)) polynomials <- apply(cbind(1:23), 1, polynomial) error.train <- sapply(polynomials, deviance) rss <- function(p) sum((y.tomorrow - predict(polynomials[[p]]))^2) error.test <- apply(cbind(1:23), 1, rss) out <- cbind(1:23, error.train, error.test) colnames(out) <- c('complexity', 'train.error', 'test.error') out # bias-variance trade-off rss60<- function(p) sum((y.tomorrow - predict(polynomials[[p]]))^2)+ sum((y.yesterday - predict(polynomials[[p]]))^2) Error <- apply(cbind(1:23), 1, rss) source("../lecture_4/f_true.R") polo.f.true <- function(p) lm(f.true ~ poly(x,p)) poli.f.true <- apply(cbind(1:23), 1, polo.f.true) bias2 <- function(p) mean((fitted(poli.f.true[[p]]) - f.true)^2) bias2.oss <- apply(cbind(1:23), 1, bias2) var.oss <- sqm.true^2 * (1:23)/30 err <- bias2.oss + var.oss # figure 3.6 plot(1:23, err, type = "b", cex = 2, pch = 16, xlab = "Model complexity", ylab = "Error") # figure 3.7 plot(bias2.oss, type = "b", col = 4, ylab = "Error", xlab = "Model complexity", cex = 2, pch = 16) lines(var.oss, type = "b", col = 2, cex = 2, pch = 17) lines(err, type = "b", cex = 2, pch = 18) legend('topright', legend = c("Bias^2", "Variance", "Total"), col = c(4, 2, 1), pch = c(16, 17, 18)) # figure 3.8 plot(error.train, type = "b", col = 2, xlab = "Model complexity", ylab = "Deviance", cex = 2, pch = 16) lines(error.test, type = "b",col = 4, cex = 2, pch = 17) text(x = c(19, 19), y = c(0.0015, 0.0065), c("Yesterday data", "Tomorrow data")) ### leave-one-out cross-validation ### # it only works until polynomial of degree 23 (up to 29) # figure 3.9 y <- c(y.yesterday, y.tomorrow) xx <- c(x, x) RSS <- numeric(23) for (p in 0:22) { m <- lm(y ~ poly(xx, p+1), x = TRUE) X <- m$x P <- X %*% solve(t(X)%*% X) %*% t(X) # projection matrix p.ii <- diag(P) RSS.pp <- sum(residuals(m)^2 / (1 - p.ii)^2) RSS.p <- 0 RSS[p + 1] <- RSS.pp } plot(0:22, RSS, type = "b", xlab = "Model complexity", pch = 16, ylab = "Error", cex = 2, lwd = 3) # additional figure (please note it is not 100% correct, but it is to illustrate the point) plot(error.test, type = "b", col = 4, xlab = "Model complexity", ylab = "Deviance", cex = 2, pch = 17) lines(0:22, RSS, type = "b",col = 6, cex = 2, pch = 16) ### K-fold cross-validation ### set.seed(20200219) # set seed, useful for data replication # set the number of folds K <- 5 # split the data in approximately equisize folds index <- sample(rep(1:K, ceiling(n / K))[1:n]) # empty matrix which will contain the errors cv.err.k <- matrix(NA, nrow = K, ncol = 11) # from degree 0 to degree 10 -> 11 for (k in 1:K) { # in turn, for each k, the k-th fold forms the test set temp.test.data <- data.frame(y[index == k], xx[index == k]) # the remaining K -1 folds form the training set temp.train.data <- data.frame(y[index != k], xx[index != k]) colnames(temp.test.data) <- colnames(temp.train.data) <- c('y', 'x') # fit the model on the training set for each polynomial polynoms <- sapply(0:10, function(p) if(p == 0) lm(y ~ 1, data = temp.train.data) else lm(y ~ poly(x, p), data = temp.train.data)) # compute the error computeError <- function(model, newdata) sum((newdata[ , 'y'] - predict(model, newdata = newdata))^2) # save the error in the matrix initialized above cv.err.k[k, ] <- unlist(lapply(polynoms, computeError, newdata = temp.test.data)) } # compute the average error for the K folds, for each degree of the polynomial cv.err.5 <- apply(cv.err.k, 2, mean) # select the degree with smallest cross-validation error which.min(cv.err.5) # plot plot(0:10, cv.err.5, type = "b", xlab = "Model complexity", pch = 16, ylab = "Error", cex = 2, lwd = 3) # repeated cross-validation D <- 15 cv.err.d <- matrix(NA, nrow = D, ncol = 11) # from degree 0 to degree 10 -> 11 for(d in 1:D) { set.seed(d) # set seed, useful for data replication # set the number of folds K <- 5 # split the data in approximately equisize folds index <- sample(rep(1:K, ceiling(n / K))[1:n]) # empty matrix which will contain the errors cv.err.k <- matrix(NA, nrow = K, ncol = 11) # from degree 0 to degree 10 -> 11 for (k in 1:K) { # in turn, for each k, the k-th fold forms the test set temp.test.data <- data.frame(y[index == k], xx[index == k]) # the remaining K -1 folds form the training set temp.train.data <- data.frame(y[index != k], xx[index != k]) colnames(temp.test.data) <- colnames(temp.train.data) <- c('y', 'x') # fit the model on the training set for each polynomial polynoms <- sapply(0:10, function(p) if(p == 0) lm(y ~ 1, data = temp.train.data) else lm(y ~ poly(x, p), data = temp.train.data)) # compute the error computeError <- function(model, newdata) sum((newdata[ , 'y'] - predict(model, newdata = newdata))^2) # save the error in the matrix initialized above cv.err.k[k, ] <- unlist(lapply(polynoms, computeError, newdata = temp.test.data)) } # average over the K splits cv.err.d[d, ] <- apply(cv.err.k, 2, mean) print(d) } # average over the D replications cv.err.5.rep <- apply(cv.err.d, 2, mean) # select the degree with smallest cross-validation error which.min(cv.err.5.rep) # plot plot(0:8, cv.err.5.rep[1:9], type = "b", xlab = "Model complexity", pch = 16, ylab = "Error", cex = 2, lwd = 3) # install.packages('class') # use the R routine to perform kNN form the library 'class' library(class) # generate training data set.seed(20200831) X <- matrix(rnorm(25e4), ncol = 5000) y <- rbinom(n = 50, size = 1, prob = 0.5) # CV error with 5-fold cross-validation cv.error <- 0 for(i in 1:50) { # generate the training data set.seed(i) X <- matrix(rnorm(25e4), ncol = 5000) y <- rbinom(n = 50, size = 1, prob = 0.5) # select the 100 features most correlated with the outcome univariate.correlations <- apply(X, 2, function(x,y) cor(x,y), y=y) best100 <- order(abs(univariate.correlations), decreasing = TRUE)[1:100] X.red <- X[, best100] # 5-fold cross-validation set.seed(i) index <- sample(rep(1:5, 10)) cv.err <- cv.err.red <- NULL for (k in 1:5) cv.err[k] <- sum(y[index == k] != knn1(train = X.red[index != k, ], test = X.red[index == k, ], cl = y[index != k])) cv.error <- cv.error + sum(cv.err) / nrow(X) print(i) } # average over 50 simulations cv.error / 50 ################### ### CORRECT WAY ### ################### # CV error with 5-fold cross-validation cv.error.val <- 0 for(i in 1:50) { # generate the training data set.seed(2 * i) X <- matrix(rnorm(25e4), ncol = 5000) y <- rbinom(n = 50, size = 1, prob = 0.5) # 5-fold cross-validation index <- sample(rep(1:5, 10)) cv.err.val <- NULL for (k in 1:5) { # select the 100 features most correlated with the outcome univariate.correlations <- apply(X[index != k, ], 2, function(x,y) cor(x,y), y = y[index != k]) best100 <- order(abs(univariate.correlations), decreasing = TRUE)[1:100] X.red <- X[index != k, best100] cv.err.val[k] <- sum(y[index == k] != knn1(train = X.red, test = X[index == k, best100], cl = y[index != k])) } cv.error.val <- cv.error.val + sum(cv.err.val) / nrow(X) print(i) } # average over 50 simulations cv.error.val / 50 detach(dataset)