# read data download.file('http://azzalini.stat.unipd.it/Book-DM/telekom-www.zip', 'dataTelekom.zip') train.data <- read.table(unz('dataTelekom.zip', 'telekom-www/telekom.training.dat'), header = TRUE) test.data <- read.table(unz('dataTelekom.zip', 'telekom-www/telekom.test.dat'), header = TRUE) system('rm dataTelekom.zip') # data prepraration head(train.data) response <- ifelse(train.data$q10.out.dur.peak + train.data$q10.out.dur.offpeak > 0, 1, 0) table(response) X.train <- train.data[, -c(1:2, 101:110)] X.train$tariff.plan <- as.factor(X.train$tariff.plan) X.train$activ.area <- as.factor(X.train$activ.area) X.train$activ.chan <- as.factor(X.train$activ.chan) response.test <- ifelse(test.data$q10.out.dur.peak + test.data$q10.out.dur.offpeak > 0, 1, 0) table(response.test) X.test <- test.data[, -c(1:2, 101:110)] X.test$tariff.plan <- as.factor(X.test$tariff.plan) X.test$activ.area <- as.factor(X.test$activ.area) X.test$activ.chan <- as.factor(X.test$activ.chan) # linear model mod.linear <- lm(response ~ ., data = X.train) summary(mod.linear) mod.null <- lm(response ~ 1, data = X.train) library(MASS) mod.linear.red <- lm(response ~ tariff.plan + payment.method + gender + age + vas1 + q01.out.dur.peak + q01.in.ch.tot + q09.out.ch.peak + q09.out.dur.peak + q09.out.val.peak + q09.in.ch.tot + q09.in.dur.tot + q09.ch.cc, data = X.train) summary(mod.linear.red) y.hat.linear <- ifelse(predict(mod.linear, newdata = X.test) > 0.5, 1, 0) y.hat.linear.red <- ifelse(predict(mod.linear.red, newdata = X.test) > 0.5, 1, 0) test.error <- list() test.error$lin.model <- mean((y.hat.linear - response.test)^2) test.error$lin.model.red <- mean((y.hat.linear.red - response.test)^2) unlist(test.error) # logistic model mod.logistic <- glm(response ~ ., data = X.train, family = binomial()) summary(mod.logistic) mod.tmp <- glm(response ~ 1, data = X.train, family = binomial()) mod.logistic.red <- glm(response ~ tariff.plan + payment.method + gender + age + vas1 + q01.out.dur.peak + q09.out.ch.peak + q09.out.ch.offpeak + q09.in.ch.tot + q09.ch.cc, data = X.train, family = binomial) y.hat.logistic <- ifelse(predict(mod.logistic, newdata = X.test, type = 'response') > 0.5, 1, 0) y.hat.logistic.red <- ifelse(predict(mod.logistic.red, newdata = X.test, type = 'response') > 0.5, 1, 0) test.error$logistic <- mean((y.hat.logistic - response.test)^2) test.error$logistic.red <- mean((y.hat.logistic.red - response.test)^2) # LDA require(MASS) mod.lda <- lda(as.factor(response) ~ ., data = X.train) y.hat.lda <- predict(mod.lda, newdata = X.test)$class test.error$lda <- mean((as.numeric(y.hat.lda) - 1 - response.test)^2) unlist(test.error) # tree library(tree) large.tree <- tree(as.factor(response) ~ ., data = X.train, control = tree.control(nobs = length(response), mindev = 1e-3)) plot(large.tree) text(large.tree) cv.err <- cv.tree(large.tree, method = "misclass") plot(cv.err) mod.tree <- prune.tree(large.tree, best = 5) plot(mod.tree) text(mod.tree) y.hat.large.tree <- predict(large.tree, newdata = X.test, type = 'class') y.hat.tree <- predict(mod.tree, newdata = X.test, type = 'class') test.error$large.tree <- mean((as.numeric(y.hat.large.tree) - 1 - response.test)^2) test.error$tree <- mean((as.numeric(y.hat.tree) - 1 - response.test)^2) unlist(test.error) # bagging B <- 20 boostrap.tree <- list() for (i in 1:B) { bootstrap.sample <- sample(1:length(response), replace = T) boostrap.tree[[i]] <- tree(as.factor(response)[bootstrap.sample] ~ ., data = X.train[bootstrap.sample, ], control = tree.control(nobs = length(response), mindev = 1e-3)) print(i) } consensus <- NULL probability <- matrix(0, ncol = 2, nrow = length(response.test)) for (i in 1:B) { consensus <- cbind(consensus, predict(boostrap.tree[[i]], newdata = X.test, type = 'class')) probability <- probability + predict(boostrap.tree[[i]], newdata = X.test) } y.hat.bagging.consensus <- ifelse(apply(consensus - 1, 1, mean) > 0.5, 1, 0) y.hat.bagging.probability <- apply(probability, 1, which.max) - 1 test.error$bagging.c <- mean((y.hat.bagging.consensus - response.test)^2) test.error$bagging.p <- mean((y.hat.bagging.probability - response.test)^2) # random forest library(ranger) mod.RF <- ranger(as.factor(response) ~ ., data = X.train, mtry = sqrt(ncol(X.train)), importance = 'permutation') y.hat.RF <- predict(mod.RF, data = X.test, type = 'response')$predictions # if we want to tune mtry, use library(tuneRanger) test.error$RF <- mean((as.numeric(y.hat.RF) - 1 - response.test)^2)