# code based on the files "Code regarding section 3.2 (copyright 2003, 2004, 2012 A.Azzalini and B.Scarpa)"
# and "Code regarding section 3.3-3.5" (copyright 2003, 2004, 2012 A.Azzalini and B.Scarpa)

# read data 
dataset <- read.table("http://azzalini.stat.unipd.it/Book-DM/yesterday.dat", header = TRUE)
attach(dataset)
n <- nrow(dataset)
y.range <- range(c(y.yesterday, y.tomorrow))

# define f(x) as polynomials
polynomial <- function(p) lm(y.yesterday ~ poly(x,p))
polynomials <- apply(cbind(1:23), 1, polynomial)
error.train <- sapply(polynomials, deviance)

rss <- function(p) sum((y.tomorrow - predict(polynomials[[p]]))^2)
error.test <- apply(cbind(1:23), 1, rss)

out <- cbind(1:23, error.train, error.test)
colnames(out) <- c('complexity', 'train.error', 'test.error')
out


# bias-variance trade-off
rss60<- function(p) sum((y.tomorrow - predict(polynomials[[p]]))^2)+
  sum((y.yesterday - predict(polynomials[[p]]))^2)
Error <- apply(cbind(1:23), 1, rss)

source("../lecture_4/f_true.R")

polo.f.true <- function(p) lm(f.true ~ poly(x,p))
poli.f.true <- apply(cbind(1:23), 1, polo.f.true)

bias2 <- function(p) mean((fitted(poli.f.true[[p]]) - f.true)^2)
bias2.oss <- apply(cbind(1:23), 1, bias2)
var.oss <- sqm.true^2 * (1:23)/30

err <- bias2.oss + var.oss


# figure 3.6
plot(1:23, err, type = "b", cex = 2, pch = 16,
     xlab = "Model complexity", ylab = "Error")

# figure 3.7
plot(bias2.oss, type = "b", col = 4, ylab = "Error",
     xlab = "Model complexity", cex = 2, pch = 16)
lines(var.oss, type = "b", col = 2, cex = 2, pch = 17)
lines(err, type = "b", cex = 2, pch = 18)
legend('topright', legend = c("Bias^2", "Variance", "Total"), 
       col = c(4, 2, 1), pch = c(16, 17, 18))

# figure 3.8
plot(error.train, type = "b", col = 2, 
     xlab = "Model complexity", ylab = "Deviance", cex = 2, pch = 16)
lines(error.test, type = "b",col = 4, cex = 2, pch = 17)
text(x = c(19, 19), y = c(0.0015, 0.0065), c("Yesterday data", "Tomorrow data"))


### leave-one-out cross-validation ###
# it only works until polynomial of degree 23 (up to 29)
# figure 3.9
y <- c(y.yesterday, y.tomorrow)
xx <- c(x, x)

RSS <- numeric(23) 
for (p in 0:22) {
  m <- lm(y ~ poly(xx, p+1), x = TRUE)
  X <- m$x
  P <- X %*% solve(t(X)%*% X) %*% t(X) # projection matrix
  p.ii <- diag(P)
  RSS.pp <- sum(residuals(m)^2 / (1 - p.ii)^2)
  RSS.p <- 0
  RSS[p + 1] <- RSS.pp
}
plot(0:22, RSS, type = "b", xlab = "Model complexity", pch = 16,
     ylab = "Error", cex = 2, lwd = 3)

# additional figure (please note it is not 100% correct, but it is to illustrate the point)
plot(error.test, type = "b", col = 4, 
     xlab = "Model complexity", ylab = "Deviance", cex = 2, pch = 17)
lines(0:22, RSS, type = "b",col = 6, cex = 2, pch = 16)

### K-fold cross-validation ###
set.seed(20200219) # set seed, useful for data replication
# set the number of folds
K <- 5
# split the data in approximately equisize folds
index <- sample(rep(1:K, ceiling(n / K))[1:n])
# empty matrix which will contain the errors
cv.err.k <- matrix(NA, nrow = K, ncol = 11) # from degree 0 to degree 10 -> 11
for (k in 1:K)
{
  # in turn, for each k, the k-th fold forms the test set
  temp.test.data <- data.frame(y[index == k], xx[index == k])
  # the remaining K -1 folds form the training set
  temp.train.data <- data.frame(y[index != k], xx[index != k])
  colnames(temp.test.data) <- colnames(temp.train.data) <- c('y', 'x')
  # fit the model on the training set for each polynomial
  polynoms <- sapply(0:10,
                     function(p)
                       if(p == 0) lm(y ~ 1, data = temp.train.data) else
                         lm(y ~ poly(x, p), data = temp.train.data))
  # compute the error
  computeError <- function(model, newdata)
    sum((newdata[ , 'y'] - predict(model, newdata = newdata))^2)
  # save the error in the matrix initialized above
  cv.err.k[k, ] <- unlist(lapply(polynoms, computeError, newdata = temp.test.data))
}
# compute the average error for the K folds, for each degree of the polynomial
cv.err.5 <- apply(cv.err.k, 2, mean)
# select the degree with smallest cross-validation error
which.min(cv.err.5)
# plot
plot(0:10, cv.err.5, type = "b", xlab = "Model complexity", pch = 16,
     ylab = "Error", cex = 2, lwd = 3)

# repeated cross-validation
D <- 15
cv.err.d <- matrix(NA, nrow = D, ncol = 11) # from degree 0 to degree 10 -> 11
for(d in 1:D)
{
  set.seed(d) # set seed, useful for data replication
  # set the number of folds
  K <- 5
  # split the data in approximately equisize folds
  index <- sample(rep(1:K, ceiling(n / K))[1:n])
  # empty matrix which will contain the errors
  cv.err.k <- matrix(NA, nrow = K, ncol = 11) # from degree 0 to degree 10 -> 11
  for (k in 1:K)
  {
    # in turn, for each k, the k-th fold forms the test set
    temp.test.data <- data.frame(y[index == k], xx[index == k])
    # the remaining K -1 folds form the training set
    temp.train.data <- data.frame(y[index != k], xx[index != k])
    colnames(temp.test.data) <- colnames(temp.train.data) <- c('y', 'x')
    # fit the model on the training set for each polynomial
    polynoms <- sapply(0:10,
                       function(p)
                         if(p == 0) lm(y ~ 1, data = temp.train.data) else
                           lm(y ~ poly(x, p), data = temp.train.data))
    # compute the error
    computeError <- function(model, newdata)
      sum((newdata[ , 'y'] - predict(model, newdata = newdata))^2)
    # save the error in the matrix initialized above
    cv.err.k[k, ] <- unlist(lapply(polynoms, computeError, newdata = temp.test.data))
  }
  # average over the K splits
  cv.err.d[d, ] <- apply(cv.err.k, 2, mean)
  print(d)
}
# average over the D replications
cv.err.5.rep <- apply(cv.err.d, 2, mean)
# select the degree with smallest cross-validation error
which.min(cv.err.5.rep)
# plot
plot(0:8, cv.err.5.rep[1:9], type = "b", xlab = "Model complexity", pch = 16,
     ylab = "Error", cex = 2, lwd = 3)


# install.packages('class')
# use the R routine to perform kNN form the library 'class'
library(class)

# generate training data
set.seed(20200831)
X <- matrix(rnorm(25e4), ncol = 5000)
y <- rbinom(n = 50, size = 1, prob = 0.5)


# CV error with 5-fold cross-validation
cv.error <- 0
for(i in 1:50)
{
  # generate the training data
  set.seed(i)
  X <- matrix(rnorm(25e4), ncol = 5000)
  y <- rbinom(n = 50, size = 1, prob = 0.5)
  
  # select the 100 features most correlated with the outcome
  univariate.correlations <- apply(X, 2, function(x,y) cor(x,y), y=y)
  best100 <- order(abs(univariate.correlations), decreasing = TRUE)[1:100]
  X.red <- X[, best100]
  
  # 5-fold cross-validation
  set.seed(i)
  index <- sample(rep(1:5, 10))
  cv.err <- cv.err.red <- NULL
  for (k in 1:5)
    cv.err[k] <- sum(y[index == k] != knn1(train = X.red[index != k, ],
                                           test = X.red[index == k, ],
                                           cl = y[index != k]))
  cv.error <- cv.error + sum(cv.err) / nrow(X)
  
  print(i)
}
# average over 50 simulations
cv.error / 50


###################
### CORRECT WAY ###
###################

# CV error with 5-fold cross-validation
cv.error.val <- 0
for(i in 1:50)
{
  # generate the training data
  set.seed(2 * i)
  X <- matrix(rnorm(25e4), ncol = 5000)
  y <- rbinom(n = 50, size = 1, prob = 0.5)
  
  # 5-fold cross-validation
  index <- sample(rep(1:5, 10))
  
  cv.err.val <- NULL
  for (k in 1:5)
  {
    # select the 100 features most correlated with the outcome
    univariate.correlations <- apply(X[index != k, ], 2, function(x,y) cor(x,y), y = y[index != k])
    best100 <- order(abs(univariate.correlations), decreasing = TRUE)[1:100]
    X.red <- X[index != k, best100]
    
    cv.err.val[k] <- sum(y[index == k] != knn1(train = X.red,
                                               test = X[index == k, best100],
                                               cl = y[index != k]))
  }
  cv.error.val <- cv.error.val + sum(cv.err.val) / nrow(X)
  print(i)
}

# average over 50 simulations
cv.error.val / 50


detach(dataset)