# code based on the files "Code regarding the first part of section 5.6",
# "Code regarding section 4.8" and "Code regarding section 5.7"
# (copyright 2003, 2004, 2012 A. Azzalini and B. Scarpa)

# helper function: evaluate a fitted model over a square grid of
# (x1, x2) values, for drawing classification boundaries
pred.square <- function(model, x1, x2, grid = 50, Z.flag = FALSE) {
  u <- pretty(x1, n = 10)
  x <- seq(u[1], u[length(u)], length = grid)  # u[length(u)]: pretty() need not return exactly 10 values
  u <- pretty(x2, n = 10)
  y <- seq(u[1], u[length(u)], length = grid)
  nx <- length(x)
  ny <- length(y)
  xoy <- cbind(rep(x, ny), as.vector(matrix(y, nx, ny, byrow = TRUE)))
  X <- matrix(xoy, nx * ny, 2, byrow = FALSE)
  if(Z.flag)
    pred <- predict(model, newdata = data.frame(z1 = X[ , 1], z2 = X[ , 2]))
  else
    pred <- predict(model, newdata = data.frame(x1 = X[ , 1], x2 = X[ , 2]))
  list(pred = pred, x = x, y = y, X = X)
}

# helper function: produce the lift and ROC curves
# ('previsti' holds the predicted values, g the observed 0/1 response)
lift.roc <- function(previsti, g, type = "bin", plot.it = TRUE) {
  library(sm)
  if(!is.numeric(g)) stop("g not numeric")
  ind <- rev(order(previsti))
  n <- length(g)
  x1 <- (1:n) / n
  x2 <- cumsum(g[ind]) / (mean(g) * (1:n))
  if(type == "crude" & plot.it)
    plot(x1, x2, type = "l", col = 2,
         xlab = "fraction of predicted subjects", ylab = "lift")
  if(type == "sm") {
    a <- sm.regression(x1, x2, h = 0.1, display = "none")
    if(plot.it)
      plot(a$eval, a$estimate, type = "l", xlim = c(0, 1), col = 2,
           xlab = "fraction of predicted subjects", ylab = "lift")
  }
  if(type == "bin") {
    b <- binning(x1, x2, breaks = (-0.001:10) / 9.999)
    x <- c(0, seq(0.05, 0.95, by = 0.1), 1)
    if(plot.it)
      plot(x, c(x2[1], b$means, 1), type = "b", xlim = c(0, 1),
           ylim = c(1, max(x2)), cex = 0.75, col = 2,
           xlab = "fraction of predicted subjects",
           ylab = "improvement factor")
    x1 <- x
    x2 <- c(x2[1], b$means, 1)
  }
  if(plot.it) readline("press enter")
  u1 <- cumsum(1 - g[ind]) / sum(1 - g)
  u2 <- cumsum(g[ind]) / sum(g)
  if(type == "crude" & plot.it)
    plot(u1, u2, type = "l", xlim = c(0, 1), ylim = c(0, 1), col = 2,
         xlab = "1-specificity", ylab = "sensitivity")
  if(type == "sm") {
    eps <- 0.00001
    a <- sm.regression(u1, log((u2 + eps) / (1 - u2 + 2 * eps)),
                       h = 0.1, display = "none")
    q <- exp(a$estimate) / (1 + exp(a$estimate))
    if(plot.it)
      plot(a$eval, q, type = "l", xlim = c(0, 1), ylim = c(0, 1),
           xlab = "1-specificity", ylab = "sensitivity", col = 2)
  }
  if(type == "bin") {
    b <- binning(u1, u2, breaks = (-0.001:10) / 9.999)
    x <- c(0, seq(0.05, 0.95, by = 0.1), 1)
    y <- c(0, b$means, 1)
    if(plot.it)
      plot(x, y, type = "b", xlim = c(0, 1), ylim = c(0, 1), cex = 0.75,
           xlab = "1-specificity", ylab = "sensitivity", col = 2)
    u1 <- x
    u2 <- y
  }
  if(plot.it) abline(0, 1, lty = 2, col = 3)
  invisible(list(x1, x2, u1, u2))
}

# helper function: compute and print the confusion matrix
confusion.matrix <- function(predicted, observed) {
  a <- table(predicted, observed)
  err.tot <- 1 - sum(diag(a)) / sum(a)
  fn <- a[1, 2] / (a[1, 2] + a[2, 2])
  fp <- a[2, 1] / (a[1, 1] + a[2, 1])
  print(a)
  cat("total error: ", format(err.tot), "\n")
  cat("false positives/negatives: ", format(c(fp, fn)), "\n")
  invisible(a)
}

dataset <- read.table("http://azzalini.stat.unipd.it/Book-DM/classes.dat",
                      header = TRUE, nrows = 200)
x1 <- dataset[ , 1]
x2 <- dataset[ , 2]
gr <- (2 - dataset[ , 3])   # groups are recoded as 0 and 1
K <- 2
n <- nrow(dataset)

plot(dataset[ , 1:2], type = "n",
     xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2]))
for(k in 1:K) {
  g <- (dataset[ , 3] == k)
  points(dataset[g, 1:2], pch = 15 + k, col = 1 + k, cex = 1.5)
}

### k-nearest-neighbors ###
library(class)

# prediction grid over the observed range of (x1, x2)
x <- seq(min(x1), max(x1), length = 100)
y <- seq(min(x2), max(x2), length = 100)
nx <- length(x)
ny <- length(y)
X <- cbind(rep(x, ny), as.vector(matrix(y, nx, ny, byrow = TRUE)))
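# Aside (illustrative, not part of the book code): the two columns of X list
# the grid points with x varying fastest, which is exactly the ordering that
# expand.grid() produces; a minimal sanity check of that equivalence:
stopifnot(isTRUE(all.equal(X, unname(as.matrix(expand.grid(x, y))))))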
knn1 <- knn(cbind(x1, x2), data.frame(x1 = X[ , 1], x2 = X[ , 2]), gr, k = 1)
# knn() returns a factor with levels "0" and "1"; convert it back to numeric
# 0/1 so that contour() can draw the boundary at level 0.5
pred <- matrix(as.numeric(as.character(knn1)), nx, ny)

# figure 5.14a
plot(dataset[ , 1:2], type = "n",
     xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2]))
for(k in 1:K) {
  g <- (dataset[ , 3] == k)
  points(dataset[g, 1:2], pch = 15 + k, col = 1 + k, cex = 1.5)
}
contour(x, y, pred, levels = 0.5, add = TRUE, drawlabels = FALSE,
        lty = 1, lwd = 3)

knn50 <- knn(cbind(x1, x2), data.frame(x1 = X[ , 1], x2 = X[ , 2]), gr, k = 50)
pred <- matrix(as.numeric(as.character(knn50)), nx, ny)

# figure 5.14b
plot(dataset[ , 1:2], type = "n",
     xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2]))
for(k in 1:K) {
  g <- (dataset[ , 3] == k)
  points(dataset[g, 1:2], pch = 15 + k, col = 1 + k, cex = 1.5)
}
contour(x, y, pred, levels = 0.5, add = TRUE, drawlabels = FALSE,
        lty = 1, lwd = 3)

# choice of the number of neighbours by cross-validation
B <- 10       # number of folds for cross-validation
K.max <- 50   # maximum number of neighbours to try (kept distinct from K, the number of groups)
set.seed(1)
index <- sample(rep(1:B, n / B))
cv.err <- matrix(NA, nrow = B, ncol = K.max)
for(b in 1:B) {
  for(k in 1:K.max) {
    knnK <- knn(train = cbind(x1[index != b], x2[index != b]),
                test  = cbind(x1[index == b], x2[index == b]),
                cl = gr[index != b], k = k)
    cv.err[b, k] <- mean(gr[index == b] != knnK)
  }
}
error <- apply(cv.err, 2, mean)
plot(error, type = "l")
which.min(error)

# figure: decision boundary for k = 7, the value chosen by cross-validation
knn7 <- knn(cbind(x1, x2), data.frame(x1 = X[ , 1], x2 = X[ , 2]), gr, k = 7)
pred <- matrix(as.numeric(as.character(knn7)), nx, ny)
plot(dataset[ , 1:2], type = "n",
     xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2]))
for(k in 1:K) {
  g <- (dataset[ , 3] == k)
  points(dataset[g, 1:2], pch = 15 + k, col = 1 + k, cex = 1.5)
}
contour(x, y, pred, levels = 0.5, add = TRUE, drawlabels = FALSE,
        lty = 1, lwd = 3)

### classification trees ###
library(tree)

# illustrative one-dimensional example
x <- seq(0.5, 3, length = 100)
y <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1)

# figure 5.18
plot(x, y, type = "n", ylim = c(0, 1))
points(x, y, col = 2, pch = 16)
abline(v = 0.866, lty = 3, col = 4, lwd = 2)
abline(v = 2.811, lty = 3, col = 4, lwd = 2)

t1 <- tree(factor(y) ~ x)

# figure 5.19a
plot(t1)
text(t1, digits = 2, pretty = 1)

x0 <- seq(0.5, 3, length = 500)
pr1 <- predict(t1, newdata = data.frame(x = x0))

# figure 5.19b
plot(x, y, type = "n", ylim = c(0, 1))
points(x, y, col = 2, pch = 16)
lines(x0, pr1[ , 2], lwd = 2)

#-----------------------------------
juice <- read.table("http://azzalini.stat.unipd.it/Book-DM/juice.data",
                    header = TRUE)
juice[ , "store"] <- factor(juice[ , "store"])
v <- c(1, 3, 6:9, 13, 21)
juice <- juice[ , v]

#-- select the training and test sets
set.seed(123)
n <- nrow(juice)
n1 <- round(n * 0.75)
n2 <- n - n1
permutation <- sample(1:n, n)
training <- sort(permutation[1:n1])
juice1 <- juice[training, ]
juice2 <- juice[-training, ]

#-- split the training set into a growing and a pruning portion
set.seed(123)
part1 <- sort(sample(training, 600))
part2 <- setdiff(training, part1)

f1 <- as.formula(paste("as.factor(choice) ~ ",
                       paste(names(juice1)[-1], collapse = "+")))
t1 <- tree(f1, data = juice[part1, ])
t2 <- prune.tree(t1, newdata = juice[part2, ])

# figure 5.20a
plot(t2)

J <- t2$size[t2$dev == min(t2$dev)]   # size of the subtree with minimum deviance on the pruning set
t3 <- prune.tree(t1, best = J)

# figure 5.20b
plot(t3)
text(t3)

p3 <- predict(t3, newdata = juice2, type = "class")

# table 5.11
confusion.matrix(p3, juice2[ , "choice"])

p3 <- predict(t3, newdata = juice2, type = "vector")[ , 2]
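# Aside (illustrative, not part of the book code): the area under the ROC
# curve can be computed directly from the predicted probabilities via the
# Mann-Whitney rank statistic, with no need to draw the curve first; a
# minimal sketch using the objects defined above ('y01', 'n.pos', 'n.neg'
# and 'auc' are new names introduced here):
y01 <- as.numeric(juice2[ , "choice"] == "MM")   # observed 0/1 response
n.pos <- sum(y01)
n.neg <- sum(1 - y01)
auc <- (sum(rank(p3)[y01 == 1]) - n.pos * (n.pos + 1) / 2) / (n.pos * n.neg)
auc   # values near 1 indicate good discrimination between the two choices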
"vector")[ , 2] a <- lift.roc(p3, as.numeric(juice2[ , "choice"] == "MM"), type = "bin", plot.it = FALSE) # figure 5.21b plot(a[[3]], a[[4]], type = "b", xlim = c(0, 1), pch = 16, ylim = c(0, 1), cex = 2, xlab = "1-specificity", ylab = "Sensibility", col = 2) dataset <- read.table("http://azzalini.stat.unipd.it/Book-DM/classes.dat", header = TRUE) x1 <- dataset[ , 1] x2 <- dataset[ , 2] gr <- as.factor(dataset[ , 3]) K <- 3 t3 <- tree(gr ~ x1 + x2, control = tree.control(nobs = nrow(dataset), mindev = 0, minsize = 2)) # figure 5.22a plot(t3) set.seed(5) t4 <- cv.tree(t3) # figure 5.22b plot(t4) t5 <- prune.tree(t3, k = 10) # figure 5.22c plot(t5) text(t5) n.grid <- 250 p <- pred.square(t5, x1, x2, n.grid) pred <- array(p$pred, c(n.grid, n.grid,3)) ind <- apply(pred, c(1, 2), order)[3, , ] # figure 5.22d plot(dataset[ , 1:2], type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2])) for(k in 1:3) points(dataset[gr == k, 1:2], pch = 15 + k, col = 1 + k, cex = 1.5) contour(p$x, p$y, ind, add = TRUE, drawlabels = FALSE, nlevels = 2, lty = 1, lwd = 2) text(-4, 1, labels = 1, cex = 2) text(3, 0, labels = 2, cex = 2) text(2, -4, labels = 3, cex = 2) ### boosting ### ### SVM ### library(class) library(e1071) dataset <- read.table("http://azzalini.stat.unipd.it/Book-DM/classes.dat", header = TRUE, nrows = 200) gr <- (2 - dataset[ , 3]) # groups are indicated by 0 and 1 K <- 2 n <- nrow(dataset) attach(dataset) n.grid <- 800 plot(dataset[ , 1:2], type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2])) for(k in 1:K) { g <- (group == k) points(dataset[g, 1:2], pch = 15 + k, col = 1 + k, cex = 1.5) } s3 <- svm(factor(group) ~ x1 + x2, data = dataset, kernel = "polynomial", degree = 3, cost = 1) # figure 5.26a plot(dataset[ , 1:2], type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2])) for(k in 1:K) { g <- (group == k) points(dataset[g, 1:2], pch = 15 + k, col = 1 + k, cex = 1.5) } p <- pred.square(s3, x1, x2, grid = n.grid) pred<- matrix(as.numeric(p$pred), n.grid, n.grid) contour(p$x, p$y, pred, add = TRUE, levels = 1.5, lty = 2, drawlabels = FALSE) #-------- s2 <- svm(factor(group) ~ x1 + x2, data = dataset, kernel = "radial", cost = 1) # figure 5.26b plot(dataset[ , 1:2], type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2])) for(k in 1:K) { g <- (group == k) points(dataset[g, 1:2], pch = 15 + k, col = 1 + k, cex = 1.5) } p <- pred.square(s2, x1, x2, grid = n.grid) pred <- matrix(as.numeric(p$pred), n.grid, n.grid) contour(p$x, p$y, pred, add = TRUE, levels = 1.5, lty = 2, drawlabels = FALSE) s0P <- tune.svm(factor(group) ~ x1 + x2, data = dataset, kernel = "polynomial", cost = exp(seq(-1, 6, length = 25))) # figure 5.26c plot(dataset[ , 1:2], type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2])) for(k in 1:K) { g <- (group == k) points(dataset[g, 1:2], pch = 15 + k, col = 1 + k, cex = 1.5) } p <- pred.square(s0P$best.model, x1, x2, grid = n.grid) pred <- matrix(as.numeric(p$pred), n.grid, n.grid) contour(p$x, p$y, pred, add = TRUE, levels = 1.5, lty = 2, drawlabels = FALSE) s0R <- tune.svm(factor(group) ~ x1 + x2, data = dataset, kernel = "radial", cost = exp(seq(-2, 4, length = 25))) # figure 5.26d plot(dataset[ , 1:2], type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2])) for(k in 1:K) { g <- (group == k) points(dataset[g, 1:2], pch = 15 + k, col = 1 + k, cex = 1.5) } p <- pred.square(s0R$best.model, x1, x2, grid = n.grid) pred <- matrix(as.numeric(p$pred), 
detach(dataset)
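# Aside (illustrative, not part of the book code): the confusion.matrix()
# helper defined at the top of this file applies to any of the classifiers
# above; for instance, the apparent (training-set, hence optimistic) error
# of the tuned radial-kernel SVM:
confusion.matrix(predict(s0R$best.model, dataset), factor(dataset$group))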