# code based on files " Code regarding the first part of section 5.6",
# "Code regarding section 4.8", and "Code regarding section 5.7"
# (copyright 2003, 2004, 2012 A.Azzalini and B.Scarpa)

# help functions
pred.square <- function(model, x1, x2, grid = 50, Z.flag = FALSE)
{
  u <- pretty(x1, n = 10)
  x <- seq(u[1], u[10], length = grid)
  u <- pretty(x2, n = 10)
  y <- seq(u[1], u[10], length = grid)
  nx <- length(x)
  ny <- length(y)
  xoy <- cbind(rep(x, ny), as.vector(matrix(y, nx, ny, byrow = TRUE)))
  X <- matrix(xoy, nx * ny, 2, byrow = FALSE)
  if(Z.flag) pred <- predict(model, newdata = data.frame(z1 = X[ , 1], z2 = X[ , 2]))
  else  pred <- predict(model, newdata = data.frame(x1 = X[ , 1], x2 = X[ , 2]))
  list(pred = pred, x = x, y = y, X = X)
}

# function to procude the ROC curve
lift.roc <- function(previsti, g, type = "bin", plot.it = TRUE)
{
  library(sm)
  if(!is.numeric(g)) stop("g not numeric")
  ind <- rev(order(previsti))
  n <- length(g)
  x1 <- (1:n)/n
  x2 <- cumsum(g[ind]) / (mean(g) * (1:n))
  if(type == "crude" & plot.it) 
    plot(x1, x2, type = "l", col = 2,
         xlab = "fraction of predicted subjects", ylab = "lift")
  if(type == "sm") {
    a<- sm.regression(x1, x2, h = 0.1, display = "none")
    if(plot.it)
      plot(a$eval, a$estimate, type = "l", xlim=c(0,1), col = 2,
           xlab = "fraction of predicted subjects", ylab = "lift")
  }
  if(type == "bin") {
    b <-  binning(x1, x2, breaks = (-0.001:10) / 9.999)
    x <- c(0, seq(0.05, 0.95, by = 0.1), 1)
    if (plot.it) plot(x, c(x2[1], b$means, 1), type = "b", xlim = c(0, 1),
                      ylim = c(1, max(x2)), cex = 0.75, col = 2,
                      xlab = "fraction of predicted subjects",
                      ylab = "improvement factor")
    x1 <- x
    x2 <- c(x2[1], b$means,1)
  }
  if(plot.it) pause("press enter")
  u1 <- cumsum(1 - g[ind]) / sum(1 - g)
  u2 <- cumsum(g[ind]) / sum(g)
  if(type == "crude" & plot.it)
    plot(u1, u2, type = "l", xlim = c(0, 1), ylim = c(0, 1), col = 2,
         xlab = "1-specificity", ylab = "sensibility")
  if(type == "sm"){
    # browser()
    eps <- 0.00001
    a <- sm.regression(u1, log((u2 + eps) / (1 - u2 + 2 * eps)), h = 0.1, display = "none")
    q <- exp(a$estimate) / (1 + exp(a$estimate))
    if(plot.it) plot(a$eval, q, type = "l", xlim = c(0, 1), ylim = c(0, 1),
                     xlab = "1-specificity", ylab = "sensibility", col = 2)
  }
  if(type == "bin"){
    b <- binning(u1, u2, breaks = (-0.001:10) / 9.999)
    x <- c(0, seq(0.05, 0.95, by = 0.1),1)
    y<- c(0, b$means,1)
    if(plot.it)
      plot(x, y, type = "b", xlim = c(0,1),
           ylim = c(0, 1),cex = 0.75, xlab = "1-specificity",
           ylab = "sensibility", col = 2)
    u1 <- x
    u2 <- y
  }                      
  if(plot.it) {
    abline(0, 1, lty = 2, col = 3)
  }
  invisible(list(x1, x2, u1, u2))
}

# function to compute the confusion matrix
confusion.matrix <- function(predicted, observed){
  a <-  table(predicted, observed)
  err.tot <- 1 - sum(diag(a)) / sum(a)
  fn <- a[1, 2] / (a[1, 2] + a[2, 2])
  fp <- a[2, 1] / (a[1, 1] + a[2, 1])
  print(a)
  cat("total error: ", format(err.tot),"\n")
  cat("false positives/negatives: ",format(c(fp, fn)),"\n")
  invisible(a)
}

dataset <- read.table("http://azzalini.stat.unipd.it/Book-DM/classes.dat", header = TRUE, nrows = 200)
x1 <- dataset[ , 1]
x2 <- dataset[ , 2]
gr <- (2 - dataset[ , 3]) # groups are indicated by 0 and 1
K <- 2
n <- nrow(dataset)
plot(dataset[ , 1:2], type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2]))
for(k in 1:K)
{
  g <- (dataset[ , 3] == k)
  points(dataset[g, 1:2], pch = 15 + k, col = 1 + k, cex = 1.5)
}

### k-nearest-neighbors ###
library(class)

x <- seq(min(x1), max(x1), length = 100)
y <- seq(min(x2), max(x2), length = 100)
nx <- length(x)
ny <- length(y)
X <- cbind(rep(x, ny), as.vector(matrix(y, nx, ny, byrow = TRUE)))
#
knn1 <- knn(cbind(x1, x2), data.frame(x1 = X[ , 1], x2 = X[ , 2]), gr, k = 1)
pred <- matrix(knn1, nx, ny)
# figure 5.14a
plot(dataset[ , 1:2], type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2]))
for(k in 1:K)
{
  g <- (dataset[ , 3] == k)
  points(dataset[g, 1:2], pch = 15 + k, col = 1 + k, cex = 1.5)
}
contour(x, y, pred, levels = 0.5, add = TRUE, drawlabels = FALSE, lty = 1, ldw = 3)

#
knn50 <- knn(cbind(x1, x2), data.frame(x1 = X[ , 1], x2 = X[ , 2]), gr, k = 50)
pred <- matrix(knn50, nx, ny)
# figure 5.14b
plot(dataset[ , 1:2], type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2]))
for(k in 1:K)
{
  g <- (dataset[ , 3] == k)
  points(dataset[g, 1:2], pch = 15 + k, col = 1 + k, cex = 1.5)
}
contour(x, y, pred, levels = 0.5, add = TRUE, drawlabels = FALSE, lty = 1, ldw = 3)

B <- 10 # number of folds for cross-validation
K <- 50 # max number neighbours
set.seed(1)
index <- sample(rep(1:B, n/B))
cv.err <- matrix(NA, nrow = B, ncol = K)
for (b in 1:B)
{
  for(k in 1:K)
  {
    knnK <- knn(train = cbind(x1[index != b], x2[index != b]),
                test = cbind(x1[index == b], x2[index == b]),
                cl = gr[index != b], k = k)
    cv.err[b, k] <- mean(gr[index == b] != knnK)
  }
}
error <- apply(cv.err, 2, mean)
plot(error, type = 'l') 
which.min(error)  

knn7 <- knn(cbind(x1, x2), data.frame(x1 = X[ , 1], x2 = X[ , 2]), gr, k = 7)
pred <- matrix(knn7, nx, ny)
# figure
plot(dataset[ , 1:2], type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2]))
for(k in 1:K)
{
  g <- (dataset[ , 3] == k)
  points(dataset[g, 1:2], pch = 15 + k, col = 1 + k, cex = 1.5)
}
contour(x, y, pred, levels = 0.5, add = TRUE, drawlabels = FALSE, lty = 1, ldw = 3)


### classification trees ###
library(tree)

x<- seq(0.5, 3, length = 100)
y <-c(
  1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
  0, 0, 1, 1)
# figure 5.18
plot(x, y, type = "n", ylim = c(0, 1))
points(x, y, col = 2, pch = 16)
abline(v = 0.866, lty = 3, col = 4, lwd = 2)
abline(v = 2.811, lty = 3, col = 4, lwd = 2)

t1<- tree(factor(y) ~ x)
# figure 5.19
plot(t1)
text(t1, digits = 2, pretty = 1)

x0 <- seq(0.5, 3, length = 500)
pr1<- predict(t1, newdata = data.frame(x = x0))
# figure 5.19b
plot(x, y, type = "n", ylim = c(0, 1))
points(x, y, col = 2, pch = 16)
lines(x0, pr1[ , 2], lwd = 2)

#-----------------------------------
juice <- read.table("http://azzalini.stat.unipd.it/Book-DM/juice.data", header = TRUE) 
juice[ , "store"] <- factor(juice[ , "store"])
v <- c(1, 3, 6:9, 13, 21)
juice <- juice[ , v]
#--select training and test
set.seed(123)
n <- nrow(juice)
n1 <- round(n * 0.75)
n2 <- n - n1
permutation <- sample(1:n,n)
training <- sort(permutation[1:n1])
juice1 <- juice[training, ]
juice2 <- juice[-training, ]

set.seed(123)
part1 <- sort(sample(training, 600))
part2 <- setdiff(training, part1)
f1 <- as.formula(paste("as.factor(choice) ~ ", paste(names(juice1)[-1], collapse = "+"),
                        collapse = NULL))
t1 <- tree(f1, data = juice[part1, ])
t2 <- prune.tree(t1, newdata = juice[part2, ])
# figure 5.20a
plot(t2)

J <- t2$size[t2$dev == min(t2$dev)]
t3 <- prune.tree(t1, best = J)
# figure 5.20b
plot(t3)
text(t3)

p3 <- predict(t3, newdata = juice2, type = "class")
# table 5.11
confusion.matrix(p3, juice2[ , "choice"])
#
p3 <-  predict(t3, newdata = juice2, type = "vector")[ , 2]
a <- lift.roc(p3, as.numeric(juice2[ , "choice"] == "MM"), type = "bin", plot.it = FALSE)
# figure 5.21b
plot(a[[3]], a[[4]], type = "b", xlim = c(0, 1), pch = 16,
     ylim = c(0, 1), cex = 2, xlab = "1-specificity",
     ylab = "Sensibility", col = 2)


dataset <- read.table("http://azzalini.stat.unipd.it/Book-DM/classes.dat", header = TRUE)
x1 <- dataset[ , 1]
x2 <- dataset[ , 2]
gr <- as.factor(dataset[ , 3])
K <- 3
t3 <- tree(gr ~ x1 + x2, control = tree.control(nobs = nrow(dataset), mindev = 0, minsize = 2))
# figure 5.22a
plot(t3)

set.seed(5)
t4 <- cv.tree(t3)
# figure 5.22b
plot(t4)

t5 <- prune.tree(t3, k = 10)
# figure 5.22c
plot(t5)
text(t5)

n.grid <- 250
p <- pred.square(t5, x1, x2, n.grid)
pred <- array(p$pred, c(n.grid, n.grid,3))
ind <- apply(pred, c(1, 2), order)[3, , ] 
# figure 5.22d
plot(dataset[ , 1:2], type = "n", xlab = expression(italic(x)[1]),  
     ylab = expression(italic(x)[2]))
for(k in 1:3)
  points(dataset[gr == k, 1:2], pch = 15 + k, col = 1 + k, cex = 1.5)
contour(p$x, p$y, ind, add = TRUE, drawlabels = FALSE, nlevels = 2, lty = 1, lwd = 2)
text(-4, 1, labels = 1, cex = 2)
text(3, 0, labels = 2, cex = 2)
text(2, -4, labels = 3, cex = 2)


### boosting ###


### SVM ###
library(class)
library(e1071)

dataset <- read.table("http://azzalini.stat.unipd.it/Book-DM/classes.dat", header = TRUE, nrows = 200)
gr <- (2 - dataset[ , 3]) # groups are indicated by 0 and 1
K <- 2
n <- nrow(dataset)
attach(dataset)
n.grid <- 800
plot(dataset[ , 1:2], type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2]))
for(k in 1:K)
{
  g <- (group == k)
  points(dataset[g, 1:2], pch = 15 + k, col = 1 + k, cex = 1.5)
}

s3 <- svm(factor(group) ~ x1 + x2, data = dataset, kernel = "polynomial", degree = 3, cost = 1)
# figure 5.26a
plot(dataset[ , 1:2], type = "n", xlab = expression(italic(x)[1]), 
     ylab = expression(italic(x)[2]))
for(k in 1:K)
{
  g <- (group == k)
  points(dataset[g, 1:2], pch = 15 + k, col = 1 + k, cex = 1.5)
}
p <- pred.square(s3, x1, x2, grid = n.grid)
pred<- matrix(as.numeric(p$pred), n.grid, n.grid)
contour(p$x, p$y, pred, add = TRUE, levels = 1.5, lty = 2, drawlabels = FALSE)

#--------
s2 <- svm(factor(group) ~ x1 + x2, data = dataset, kernel = "radial", cost = 1)
# figure 5.26b
plot(dataset[ , 1:2], type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2]))
for(k in 1:K)
{
  g <- (group == k)
  points(dataset[g, 1:2], pch = 15 + k, col = 1 + k, cex = 1.5)
}
p <- pred.square(s2, x1, x2, grid = n.grid)
pred <- matrix(as.numeric(p$pred), n.grid, n.grid)
contour(p$x, p$y, pred, add = TRUE, levels = 1.5, lty = 2, drawlabels = FALSE)

s0P <- tune.svm(factor(group) ~ x1 + x2, data = dataset, kernel = "polynomial",
                cost = exp(seq(-1, 6, length = 25)))
# figure 5.26c
plot(dataset[ , 1:2], type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2]))
for(k in 1:K)
{
  g <- (group == k)
  points(dataset[g, 1:2], pch = 15 + k, col = 1 + k, cex = 1.5)
}
p <- pred.square(s0P$best.model, x1, x2, grid = n.grid)
pred <- matrix(as.numeric(p$pred), n.grid, n.grid)
contour(p$x, p$y, pred, add = TRUE, levels = 1.5, lty = 2, drawlabels = FALSE)

s0R <- tune.svm(factor(group) ~ x1 + x2, data = dataset, kernel = "radial",
                cost = exp(seq(-2, 4, length = 25)))
# figure 5.26d
plot(dataset[ , 1:2], type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2]))
for(k in 1:K)
{
  g <- (group == k)
  points(dataset[g, 1:2], pch = 15 + k, col = 1 + k, cex = 1.5)
}
p <- pred.square(s0R$best.model, x1, x2, grid = n.grid)
pred <- matrix(as.numeric(p$pred), n.grid, n.grid)
contour(p$x, p$y, pred, add = TRUE, levels = 1.5, lty = 2, drawlabels = FALSE)

detach.all()