library(rpart)

##### Exercise 10.4 #####

### Point (a) ###

myAdaBoost <- function(X, y, M) {
  # Step 1: Initialization
  N <- nrow(X)
  f <- list()                          # fitted stumps G_m
  w <- rep(1/N, N)                     # observation weights, initially uniform
  b <- matrix(NA, ncol = N, nrow = M)  # in-sample predictions of each stump (-1/1)
  alpha <- rep(NA, M)                  # stump weights
  X <- as.data.frame(X)

  # Step 2: Boosting iterations
  for (m in 1:M) {
    # Fit a weighted classification stump (tree of depth 1)
    f[[m]] <- rpart(as.factor(y) ~ ., data = X, weights = w,
                    control = rpart.control(maxdepth = 1))
    # Transform the factor predictions back to -1/1 numeric values
    b[m, ] <- 2 * (as.numeric(predict(f[[m]], newdata = X, type = 'class')) - 1.5)
    # Weighted misclassification error and stump weight alpha_m
    err_m <- (w %*% (b[m, ] != y)) / sum(w)
    alpha[m] <- log((1 - err_m) / err_m)
    # Update and renormalize the observation weights
    w <- w * exp(alpha[m] * as.numeric(b[m, ] != y))
    # w <- w * exp(alpha[m] * as.numeric(b[m, ] != y) - alpha[m]/2) # if one wants to take the constant into account
    w <- w / sum(w)
    print(m)
  }

  # Step 3: Predictions of the final classifier (weighted majority vote)
  pred_train <- sign(colSums(matrix(rep(alpha, N), ncol = N) * b))

  return(list(prediction_train = pred_train, alpha = alpha, b = b, f = f))
}

### Point (b) ###
M <- 400

# Generate training data
set.seed(1)
N_train <- 2000
p <- 10
X_train <- matrix(rnorm(N_train * p), ncol = p)
y_train <- apply(X_train^2, 1, sum)
y_train[y_train <= qchisq(p = 0.5, df = 10)] <- -1
y_train[y_train > qchisq(p = 0.5, df = 10)] <- 1
table(y_train)

# Generate test data
N_test <- 10000
X_test <- matrix(rnorm(N_test * p), ncol = p)
y_test <- apply(X_test^2, 1, sum)
y_test[y_test <= qchisq(p = 0.5, df = 10)] <- -1
y_test[y_test > qchisq(p = 0.5, df = 10)] <- 1
table(y_test)

# Fit the algorithm
model <- myAdaBoost(X_train, y_train, M = M)

# Training error at each iteration
train_error <- rep(NA, M)
for (i in 1:M) {
  print(i)
  yhat <- sign(colSums(matrix(rep(model$alpha[1:i], N_train), ncol = N_train) * model$b[1:i, ]))
  train_error[i] <- mean(yhat != y_train)
}
plot(1:M, train_error, type = 'l', col = "blue",
     ylab = "error", xlab = "Boosting iteration")

# Test error at each iteration
b_test <- t(sapply(model$f, function(x) 2 * (as.numeric(predict(x, newdata = as.data.frame(X_test), type = 'class')) - 1.5)))
test_error <- rep(NA, M)
for (i in 1:M) {
  print(i)
  yhat <- sign(colSums(matrix(rep(model$alpha[1:i], N_test), ncol = N_test) * b_test[1:i, ]))
  test_error[i] <- mean(yhat != y_test)
}
lines(1:M, test_error, col = "red")
legend(x = "topright", legend = c("Training error", "Test error"),
       col = c("blue", "red"), lty = c(1, 1), cex = 1)

### Point (c) ###
# Rerun point (b) with M = 1000; still no overfitting is observed
M <- 1000

### Point (d) ###
M <- 400

# Generate training data: class 1 is standard Gaussian, class -1 is drawn
# from the same Gaussian conditional on sum(x^2) > 12
set.seed(1)
N_train <- 2000
p <- 10
y_train <- c(rep(1, 1000), rep(-1, 1000))
X_train <- matrix(NaN, nrow = N_train, ncol = p)
X_train[1:1000, ] <- rnorm(n = N_train * p / 2)
for (i in 1001:2000) {
  cont <- TRUE
  while (cont) {
    tmp <- rnorm(n = 10)
    if (sum(tmp^2) > 12) {
      X_train[i, ] <- tmp
      cont <- FALSE
    }
  }
}

# Generate test data (same mechanism)
N_test <- 10000
p <- 10
y_test <- c(rep(1, 5000), rep(-1, 5000))
X_test <- matrix(NaN, nrow = N_test, ncol = p)
X_test[1:5000, ] <- rnorm(n = N_test * p / 2)
for (i in 5001:10000) {
  cont <- TRUE
  while (cont) {
    tmp <- rnorm(n = 10)
    if (sum(tmp^2) > 12) {
      X_test[i, ] <- tmp
      cont <- FALSE
    }
  }
}

# Fit the algorithm
model <- myAdaBoost(X_train, y_train, M = M)

# Training error at each iteration
train_error <- vector("numeric", M)
for (i in 1:M) {
  print(i)
  yhat <- sign(colSums(matrix(rep(model$alpha[1:i], N_train), ncol = N_train) * model$b[1:i, ]))
  train_error[i] <- mean(yhat != y_train)
}
plot(1:M, train_error, type = 'l', col = "blue",
     ylab = "error", xlab = "Boosting iteration")

# Test error at each iteration
b_test <- t(sapply(model$f, function(x) 2 * (as.numeric(predict(x, newdata = as.data.frame(X_test), type = 'class')) - 1.5)))
test_error <- vector("numeric", M)
for (i in 1:M) {
  print(i)
  yhat <- sign(colSums(matrix(rep(model$alpha[1:i], N_test), ncol = N_test) * b_test[1:i, ]))
  test_error[i] <- mean(yhat != y_test)
}
lines(1:M, test_error, col = "red")
legend(x = "topright", legend = c("Training error", "Test error"),
       col = c("blue", "red"), lty = c(1, 1), cex = 1)
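
# Optional numeric summary of the point (d) error curves computed above.
# This only reuses the train_error/test_error vectors already in the
# workspace; no additional model fitting is performed.
cat("Final training error after", M, "iterations:", train_error[M], "\n")
cat("Final test error after", M, "iterations:", test_error[M], "\n")
cat("Minimum test error:", min(test_error),
    "reached at iteration", which.min(test_error), "\n")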