# code based on the file "Code regarding section 5.2 (copyright 2003, 2004, 2012 A.Azzalini and B.Scarpa)" # function to procude the ROC curve lift.roc <- function(previsti, g, type = "bin", plot.it = TRUE) { library(sm) if(!is.numeric(g)) stop("g not numeric") ind <- rev(order(previsti)) n <- length(g) x1 <- (1:n)/n x2 <- cumsum(g[ind]) / (mean(g) * (1:n)) if(type == "crude" & plot.it) plot(x1, x2, type = "l", col = 2, xlab = "fraction of predicted subjects", ylab = "lift") if(type == "sm") { a<- sm.regression(x1, x2, h = 0.1, display = "none") if(plot.it) plot(a$eval, a$estimate, type = "l", xlim=c(0,1), col = 2, xlab = "fraction of predicted subjects", ylab = "lift") } if(type == "bin") { b <- binning(x1, x2, breaks = (-0.001:10) / 9.999) x <- c(0, seq(0.05, 0.95, by = 0.1), 1) if (plot.it) plot(x, c(x2[1], b$means, 1), type = "b", xlim = c(0, 1), ylim = c(1, max(x2)), cex = 0.75, col = 2, xlab = "fraction of predicted subjects", ylab = "improvement factor") x1 <- x x2 <- c(x2[1], b$means,1) } if(plot.it) pause("press enter") u1 <- cumsum(1 - g[ind]) / sum(1 - g) u2 <- cumsum(g[ind]) / sum(g) if(type == "crude" & plot.it) plot(u1, u2, type = "l", xlim = c(0, 1), ylim = c(0, 1), col = 2, xlab = "1-specificity", ylab = "sensibility") if(type == "sm"){ # browser() eps <- 0.00001 a <- sm.regression(u1, log((u2 + eps) / (1 - u2 + 2 * eps)), h = 0.1, display = "none") q <- exp(a$estimate) / (1 + exp(a$estimate)) if(plot.it) plot(a$eval, q, type = "l", xlim = c(0, 1), ylim = c(0, 1), xlab = "1-specificity", ylab = "sensibility", col = 2) } if(type == "bin"){ b <- binning(u1, u2, breaks = (-0.001:10) / 9.999) x <- c(0, seq(0.05, 0.95, by = 0.1),1) y<- c(0, b$means,1) if(plot.it) plot(x, y, type = "b", xlim = c(0,1), ylim = c(0, 1),cex = 0.75, xlab = "1-specificity", ylab = "sensibility", col = 2) u1 <- x u2 <- y } if(plot.it) { abline(0, 1, lty = 2, col = 3) } invisible(list(x1, x2, u1, u2)) } # function to compute the confusion matrix confusion.matrix <- function(predicted, observed){ a <- table(predicted, observed) err.tot <- 1 - sum(diag(a)) / sum(a) fn <- a[1, 2] / (a[1, 2] + a[2, 2]) fp <- a[2, 1] / (a[1, 1] + a[2, 1]) print(a) cat("total error: ", format(err.tot),"\n") cat("false positives/negatives: ",format(c(fp, fn)),"\n") invisible(a) } # read data and select the variables used in the book juice <- read.table("http://azzalini.stat.unipd.it/Book-DM/juice.data", header = TRUE) juice[ , "store"] <- factor(juice[ , "store"]) v <- c(1, 3, 6:9, 13, 21) juice <- juice[ , v] # select training and test set.seed(123) n <- nrow(juice) n1 <- round(n * 0.75) n2 <- n - n1 permuta <- sample(1:n, n) training <- sort(permuta[1:n1]) juice1 <- juice[training, ] # training set juice2 <- juice[-training, ] # test set attach(juice1) # figure 5.1a-f par(mfrow = c(2, 3)) K <- length(v) for(k in 2:(K-1)) boxplot(juice1[ , k] ~ choice, col = 5, ylab = names(juice1)[k]) par(mfrow = c(1, 1)) # figure 5.1g freq <- table(choice, store) freq <- 100 * freq / outer(rep(1, 2), apply(freq, 2, sum)) barplot(freq[2, ], width = 1, space = 1.25, col = 5, ylim = c(0, 100), ylab = "Percentage of choosing MM", xlab = "Shop") abline(h = 100 * 318 / nrow(juice1), lty = 3, col = 2) f1 <- as.formula(paste("as.factor(choice) ~ ", paste(names(juice1)[-1], collapse = " + "), collapse = NULL)) m1 <- glm(f1, data = juice1, family = binomial(link = "logit")) m2 <- update(m1, . ~ . - week, data = juice1) # test error p2 <- predict(m2, newdata = juice2, type = "response") # table 5.2 confusion.matrix(p2 > 0.5, juice2[ , 1]) addmargins(confusion.matrix(p2 > 0.5, juice2[ , 1])) # figure 5.2a g <- as.numeric(juice2[, 1] == "MM") a <- lift.roc(p2, g, type = "crude", plot.it = FALSE) p0 <- 0.5 plot(a[[3]], a[[4]], type = "l", xlab = "1-specificity", ylab = "Sensibility") abline(0, 1, col = 4, lty = 3) i0 <- order(abs(p2 - p0))[1] q0 <- a[[3]][sum(p2 > p2[i0])] abline(v = q0, col = 3, lty = 3) text(q0, 0, paste("Threshold=",format(p0), sep = "", collapse = NULL)) # figure 5.2b a <- lift.roc(p2, g, type = "bin", plot.it = FALSE) plot(a[[3]], a[[4]], type = "b", xlab = "1-specificity", ylab = "Sensibility") abline(0, 1, col = 4, lty = 3) i0 <- order(abs(p2 - p0))[1] q0 <- a[[3]][sum(p2 > p2[i0])] abline(v = q0, col = 3, lty = 3) text(q0, 0, paste("Threshold=", format(p0), sep = "", collapse = NULL)) detach(juice1) ######################################### dataset <- read.table("http://azzalini.stat.unipd.it/Book-DM/classes.dat", header = TRUE, nrows = 200) x1 <- dataset[,1] x2 <- dataset[,2] gr <- (2 - dataset[ , 3]) # the groups are indicated by 0 and 1 K <- 2 n <- nrow(dataset) # figure 5.8a plot(dataset[ , 1:2], type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2])) for(k in 1:K) { g <- (dataset[ , 3] == k) points(dataset[g, 1:2], pch = 15 + k, col = 1 + k, cex = 1.5) } m1 <- lm(gr ~ x1 + x2) beta <- coef(m1) # we look for the line such that: b0 + b1*x1 + b2*x2 = 0.5 a0 <- (0.5 - beta[1])/beta[3] a1<- -beta[2] / beta[3] abline(a0, a1, lty = 2, lwd = 2) m2 <- lm(gr ~ x1 + x2 + I(x1^2) + I(x1*x2) + I(x2^2)) summary(m2) # coeff of I(x1^2) not significant m2 <- lm(gr ~ x1 + x2 + I(x1*x2) + I(x2^2)) beta <- c(coef(m2)[1:3], 0, coef(m2)[4:5]) # figure 5.8b plot(dataset[ , 1:2], type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2])) for(k in 1:K) { g <- (dataset[ , 3] == k) points(dataset[g, 1:2], pch = 15 + k, col = 1 + k, cex = 1.5) } aq <- beta[6] x.seq <- seq(min(x1), max(x1), length = 500) x0 <- y1 <- y2 <- numeric(0) for(x in x.seq) { bq <- beta[5] * x + beta[3] cq <- beta[1] + beta[2] * x + beta[4] * x^2 - 0.5 delta2 <- bq^2 - 4 * aq * cq if(delta2 >= 0) { x0 <- c(x0, x) y1 <- c(y1, (-bq + sqrt(delta2)) / (2 * aq)) y2 <- c(y2, (-bq - sqrt(delta2)) / (2 * aq)) } } lines(x0, y1, lwd = 2) lines(x0, y2, lwd = 2) lines(rep(x0[1], 2), c(y1[1], y2[1]), lwd = 2)