### logistic regression ### bank <- read.csv("http://azzalini.stat.unipd.it/Book-DM/brazil.csv", header = TRUE) summary(bank) y <- ifelse(bank$satisfaction < 2.5, 'low', 'high') x <- ifelse(bank$age <= 45, 'young', 'old') table(x, y) # transform y in a 0-1 variable y <- ifelse(bank$satisfaction < 2.5, 0, 1) # fit the logistic model mod <- glm(y ~ x, family = 'binomial') summary(mod) beta <- mod$coefficients # recover the probability of high satisfaction pi.old = exp(beta[1]) / (1 + exp(beta[1])) pi.young = exp(beta[1] + beta[2]) / (1 + exp(beta[1] + beta[2])) # compute manually coefficients pi.old.hat <- sum(y[x == 'old']) / sum(x == 'old') pi.young.hat <- sum(y[x == 'young']) / sum(x == 'young') # odds odd.old <- pi.old.hat / (1 - pi.old.hat) log(odd.old) # baseline odds in the model odd.young <- pi.young.hat / (1 - pi.young.hat) log(odd.young) # log odds ratio log(odd.young / odd.old) # fit the model with age as a continuous variable mod_tab25up <- glm(y ~ bank$age + I(bank$age^2), family = 'binomial') summary(mod_tab25up) mod_tab25down <- glm(y ~ bank$age, family = 'binomial') summary(mod_tab25down) # difference in deviance diff_deviance <- mod_tab25down$deviance - mod_tab25up$deviance 1 - pchisq(q = diff_deviance, df = 4 - 3) # not significative at level 0.05 # code based on files "Code regarding section 4.2" (copyright 2003, 2004, 2012 A.Azzalini and B.Scarpa), # "Code regarding section 5.4" (copyright 2003, 2004, 2012 A.Azzalini and B.Scarpa), # and "Code regarding section 5.5" (copyright 2003, 2004, 2012 A.Azzalini and B.Scarpa) pred.square <- function(model, x1, x2, grid = 50, Z.flag = FALSE) { u <- pretty(x1, n = 10) x <- seq(u[1], u[10], length = grid) u <- pretty(x2, n = 10) y <- seq(u[1], u[10], length = grid) nx <- length(x) ny <- length(y) xoy <- cbind(rep(x, ny), as.vector(matrix(y, nx, ny, byrow = TRUE))) X <- matrix(xoy, nx * ny, 2, byrow = FALSE) if(Z.flag) pred <- predict(model, newdata = data.frame(z1 = X[ , 1], z2 = X[ , 2])) else pred <- predict(model, newdata = data.frame(x1 = X[ , 1], x2 = X[ , 2])) list(pred = pred, x = x, y = y, X = X) } # function to procude the ROC curve lift.roc <- function(previsti, g, type = "bin", plot.it = TRUE) { library(sm) if(!is.numeric(g)) stop("g not numeric") ind <- rev(order(previsti)) n <- length(g) x1 <- (1:n)/n x2 <- cumsum(g[ind]) / (mean(g) * (1:n)) if(type == "crude" & plot.it) plot(x1, x2, type = "l", col = 2, xlab = "fraction of predicted subjects", ylab = "lift") if(type == "sm") { a<- sm.regression(x1, x2, h = 0.1, display = "none") if(plot.it) plot(a$eval, a$estimate, type = "l", xlim=c(0,1), col = 2, xlab = "fraction of predicted subjects", ylab = "lift") } if(type == "bin") { b <- binning(x1, x2, breaks = (-0.001:10) / 9.999) x <- c(0, seq(0.05, 0.95, by = 0.1), 1) if (plot.it) plot(x, c(x2[1], b$means, 1), type = "b", xlim = c(0, 1), ylim = c(1, max(x2)), cex = 0.75, col = 2, xlab = "fraction of predicted subjects", ylab = "improvement factor") x1 <- x x2 <- c(x2[1], b$means,1) } if(plot.it) pause("press enter") u1 <- cumsum(1 - g[ind]) / sum(1 - g) u2 <- cumsum(g[ind]) / sum(g) if(type == "crude" & plot.it) plot(u1, u2, type = "l", xlim = c(0, 1), ylim = c(0, 1), col = 2, xlab = "1-specificity", ylab = "sensibility") if(type == "sm"){ # browser() eps <- 0.00001 a <- sm.regression(u1, log((u2 + eps) / (1 - u2 + 2 * eps)), h = 0.1, display = "none") q <- exp(a$estimate) / (1 + exp(a$estimate)) if(plot.it) plot(a$eval, q, type = "l", xlim = c(0, 1), ylim = c(0, 1), xlab = "1-specificity", ylab = "sensibility", col = 2) } if(type == "bin"){ b <- binning(u1, u2, breaks = (-0.001:10) / 9.999) x <- c(0, seq(0.05, 0.95, by = 0.1),1) y<- c(0, b$means,1) if(plot.it) plot(x, y, type = "b", xlim = c(0,1), ylim = c(0, 1),cex = 0.75, xlab = "1-specificity", ylab = "sensibility", col = 2) u1 <- x u2 <- y } if(plot.it) { abline(0, 1, lty = 2, col = 3) } invisible(list(x1, x2, u1, u2)) } # function to compute the confusion matrix confusion.matrix <- function(predicted, observed){ a <- table(predicted, observed) err.tot <- 1 - sum(diag(a)) / sum(a) fn <- a[1, 2] / (a[1, 2] + a[2, 2]) fp <- a[2, 1] / (a[1, 1] + a[2, 1]) print(a) cat("total error: ", format(err.tot),"\n") cat("false positives/negatives: ",format(c(fp, fn)),"\n") invisible(a) } dataset <- read.table("http://azzalini.stat.unipd.it/Book-DM/classes.dat", header = TRUE, nrows = 200) x1 <- dataset[,1] x2 <- dataset[,2] gr <- (2 - dataset[ , 3]) # the groups are indicated by 0 and 1 K <- 2 n <- nrow(dataset) # figure 5.8a plot(dataset[ , 1:2], type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2])) for(k in 1:K) { g <- (dataset[ , 3] == k) points(dataset[g, 1:2], pch = 15 + k, col = 1 + k, cex = 1.5) } m1 <- lm(gr ~ x1 + x2) beta <- coef(m1) # we look for the line such that: b0 + b1*x1 + b2*x2 = 0.5 a0 <- (0.5 - beta[1])/beta[3] a1<- -beta[2] / beta[3] abline(a0, a1, lty = 2, lwd = 2) m2 <- lm(gr ~ x1 + x2 + I(x1^2) + I(x1*x2) + I(x2^2)) summary(m2) # coeff of I(x1^2) not significant m2 <- lm(gr ~ x1 + x2 + I(x1*x2) + I(x2^2)) beta <- c(coef(m2)[1:3], 0, coef(m2)[4:5]) # figure 5.8b plot(dataset[ , 1:2], type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2])) for(k in 1:K) { g <- (dataset[ , 3] == k) points(dataset[g, 1:2], pch = 15 + k, col = 1 + k, cex = 1.5) } aq <- beta[6] x.seq <- seq(min(x1), max(x1), length = 500) x0 <- y1 <- y2 <- numeric(0) for(x in x.seq) { bq <- beta[5] * x + beta[3] cq <- beta[1] + beta[2] * x + beta[4] * x^2 - 0.5 delta2 <- bq^2 - 4 * aq * cq if(delta2 >= 0) { x0 <- c(x0, x) y1 <- c(y1, (-bq + sqrt(delta2)) / (2 * aq)) y2 <- c(y2, (-bq - sqrt(delta2)) / (2 * aq)) } } lines(x0, y1, lwd = 2) lines(x0, y2, lwd = 2) lines(rep(x0[1], 2), c(y1[1], y2[1]), lwd = 2) ################# juice <- read.table("http://azzalini.stat.unipd.it/Book-DM/juice.data", header = TRUE) juice[ , "store"] <- factor(juice[ , "store"]) v <- c(1, 3, 6:9, 13, 21) juice <- juice[ , v] #--select training and test set.seed(123) n <- nrow(juice) n1 <- round(n * 0.75) n2 <- n - n1 permuta <- sample(1:n, n) training <- sort(permuta[1:n1]) juice1 <- juice[training, ] juice2 <- juice[-training, ] f0 <- as.formula(paste("as.numeric(as.factor(choice))-1 ~", paste(names(juice1)[-1], collapse = "+"), collapse = NULL)) lm1 <- lm(f0, data = juice1) lm2 <- update(lm1, "~.-week") p3 <- predict(lm2, newdata = juice2) m1 <- glm(f0, data = juice1, family = binomial) m2 <- update(m1, "~.-week") p2 <- predict(m2, newdata = juice2, type = "response") # figure 5.9 plot(log(p2 / (1 - p2)), p3, xlab = "Logit(probability) predicted by the logistic model", ylab = "Predicted values with the linear model", cex = 1.5) abline(h = 0.5, col = 2, lty = 2) abline(v = 0, col = 2, lty = 2) # use again class.dat x1 <- dataset[ , 1] x2 <- dataset[ , 2] gr <- (2 - dataset[ , 3]) K <- 3 n <- nrow(dataset) x1 <- z1 <- dataset[ , 1] x2 <- z2 <- dataset[ , 2] gr <- dataset[ , 3] group <- factor(gr) n.grid <- 800 Y <- model.matrix( ~ group - 1) m1 <- lm(Y ~ x1 + x2) p <- pred.square(m1, x1, x2, n.grid) pred <- array(p$pred, dim = c(n.grid, n.grid, 3)) ind <- apply(pred, c(1, 2), order)[3, , ] # figure 5.10a plot(dataset[ , 1:2], type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2])) for(k in 1:K) points(dataset[gr == k, 1:2], pch = 15 + k, col= 1 + k) contour(p$x, p$y, ind, add = TRUE, drawlabels = FALSE, nlevels = 2, lty = 2) m2 <- lm(Y ~ x1 + x2+ I(x1^2) + I(x1 * x2)+ I (x2^2)) p <- pred.square(m2, x1, x2, n.grid) pred <- array(p$pred, dim = c(n.grid, n.grid, 3)) ind <- apply(pred, c(1, 2), order)[3, , ] # figure 5.10b plot(dataset[ , 1:2], type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2])) for(k in 1:3) points(dataset[gr == k, 1:2], pch = 15 + k, col = 1 + k) contour(p$x, p$y, ind, add = TRUE, drawlabels = FALSE, nlevels = 2, lty = 2) dataset <- read.table("http://azzalini.stat.unipd.it/Book-DM/classes.dat", header = TRUE) x1 <- dataset[ , 1] x2 <- dataset[ , 2] gr <- (2 - dataset[ , 3]) K <- 3 n <- nrow(dataset) x1 <- z1 <- dataset[ , 1] x2 <- z2 <- dataset[ , 2] gr <- dataset[,3] group <- factor(gr) n.grid <- 800 library(MASS) d1 <- lda(group ~ x1 + x2) p <- pred.square(d1, x1, x2, n.grid) pred <- array(p$pred$posterior, c(n.grid, n.grid, 3)) ind <- apply(pred, c(1, 2), order)[3, , ] # figure 5.11a plot(dataset[ , 1:2], type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2])) for(k in 1:3) points(dataset[gr == k, 1:2], pch = 15 + k, col = 1 + k) contour(p$x, p$y, ind, add = TRUE, drawlabels = FALSE, nlevels = 2, lty = 2) d2 <- lda(group ~ x1 + x2 + I(x1^2) + I(x1*x2) + I(x2^2)) p <- pred.square(d2, x1, x2, n.grid) pred <- array(p$pred$posterior, c(n.grid, n.grid, 3)) ind <- apply(pred, c(1, 2), order)[3, , ] # figure 5.11b plot(dataset[, 1:2], type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2])) for(k in 1:3) points(dataset[gr == k, 1:2], pch = 15 + k, col = 1 + k) contour(p$x, p$y, ind, add = TRUE, drawlabels = FALSE, nlevels = 2, lty = 2) ### QDA ### d3 <- qda(group ~ x1 + x2) p <- pred.square(d3, x1, x2, n.grid) pred <- array(p$pred$posterior, c(n.grid, n.grid, 3)) ind <- apply(pred, c(1, 2), order)[3, , ] # figure 5.12a plot(dataset[, 1:2], type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2])) for(k in 1:3) points(dataset[gr == k, 1:2], pch = 15 + k, col = 1 + k) contour(p$x, p$y, ind, add = TRUE, drawlabels = FALSE, nlevels = 2) d4 <- qda(group ~ x1 + x2 + I(x1^2) + I(x1*x2) + I(x2^2)) p <- pred.square(d4, x1, x2, n.grid) pred <- array(p$pred$posterior, c(n.grid, n.grid, 3)) ind <- apply(pred, c(1, 2), order)[3, , ] # figure 5.12b plot(dataset[ , 1:2], type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2])) for(k in 1:3) points(dataset[gr == k, 1:2], pch = 15 + k, col = 1 + k) contour(p$x, p$y, ind, add = TRUE, drawlabels = FALSE, nlevels = 2, lty = 2) f1 <- as.formula(paste("choice ~ ", paste(names(juice1)[-1], collapse = "+"), collapse = NULL)) d1 <- lda(f1, data = juice1) d2 <- qda(f1, data = juice1) p1 <- predict(d1, newdata = juice2) p2 <- predict(d2, newdata = juice2) # table 5.7 confusion.matrix(p1$class, juice2[ , "choice"]) confusion.matrix(p2$class, juice2[ , "choice"]) g <- as.numeric(juice2[ , 1] == "MM") lr1 <- lift.roc(p1$post[ , 2], g, type = "bin", plot.it = FALSE) lr2 <- lift.roc(p2$post[ , 2], g, type = "bin", plot.it = FALSE) # figure 5.13b plot(lr1[[3]], lr1[[4]], type = "b", xlim = c(0,1), pch = 16, ylim = c(0, 1), cex = 1.5, xlab = "1-specificity", ylab = "Sensibility", col = 2) lines(lr2[[3]], lr2[[4]], type = "b", cex = 1.5, col = 3, pch = 17) abline(0, 1, lty = 2, col = 4) legend('bottomright', legend = c("LDA", "QDA"), col = 2:3, pch = 16:17)