# read data
download.file('http://azzalini.stat.unipd.it/Book-DM/telekom-www.zip', 'dataTelekom.zip')
train.data <- read.table(unz('dataTelekom.zip', 'telekom-www/telekom.training.dat'),
                         header = TRUE)
test.data <- read.table(unz('dataTelekom.zip', 'telekom-www/telekom.test.dat'),
                        header = TRUE)
system('rm dataTelekom.zip')

# data prepraration
head(train.data)
response <- ifelse(train.data$q10.out.dur.peak + train.data$q10.out.dur.offpeak > 0, 1, 0)
table(response)
X.train <- train.data[, -c(1:2, 101:110)]
X.train$tariff.plan <- as.factor(X.train$tariff.plan)
X.train$activ.area <- as.factor(X.train$activ.area)
X.train$activ.chan <- as.factor(X.train$activ.chan)

response.test <- ifelse(test.data$q10.out.dur.peak + test.data$q10.out.dur.offpeak > 0, 1, 0)
table(response.test)
X.test <- test.data[, -c(1:2, 101:110)]
X.test$tariff.plan <- as.factor(X.test$tariff.plan)
X.test$activ.area <- as.factor(X.test$activ.area)
X.test$activ.chan <- as.factor(X.test$activ.chan)


# linear model
mod.linear <- lm(response ~ ., data = X.train)
summary(mod.linear)
mod.null <- lm(response ~ 1, data = X.train)
library(MASS)
mod.linear.red <- lm(response ~ tariff.plan + payment.method + gender + age +
                       vas1 + q01.out.dur.peak + q01.in.ch.tot + q09.out.ch.peak +
                       q09.out.dur.peak + q09.out.val.peak + q09.in.ch.tot + q09.in.dur.tot +
                       q09.ch.cc, data = X.train)
summary(mod.linear.red)

y.hat.linear <- ifelse(predict(mod.linear, newdata = X.test) > 0.5, 1, 0)
y.hat.linear.red <- ifelse(predict(mod.linear.red, newdata = X.test) > 0.5, 1, 0)

test.error <- list()
test.error$lin.model <- mean((y.hat.linear - response.test)^2)
test.error$lin.model.red <- mean((y.hat.linear.red - response.test)^2)

unlist(test.error)

# logistic model
mod.logistic <- glm(response ~ ., data = X.train, family = binomial())
summary(mod.logistic)
mod.tmp <- glm(response ~ 1, data = X.train, family = binomial())
mod.logistic.red <- glm(response ~ tariff.plan + payment.method + gender + age +
                       vas1 + q01.out.dur.peak + q09.out.ch.peak + q09.out.ch.offpeak + q09.in.ch.tot +
                       q09.ch.cc, data = X.train, family = binomial)

y.hat.logistic <- ifelse(predict(mod.logistic, newdata = X.test, type = 'response') > 0.5, 1, 0)
y.hat.logistic.red <- ifelse(predict(mod.logistic.red, newdata = X.test, type = 'response') > 0.5, 1, 0)

test.error$logistic <- mean((y.hat.logistic - response.test)^2)
test.error$logistic.red <- mean((y.hat.logistic.red - response.test)^2)

# LDA
require(MASS)
mod.lda <- lda(as.factor(response) ~ ., data = X.train)
y.hat.lda <- predict(mod.lda, newdata = X.test)$class

test.error$lda <- mean((as.numeric(y.hat.lda) - 1 - response.test)^2)
unlist(test.error)


# tree
library(tree)
large.tree <- tree(as.factor(response) ~ ., data = X.train,
                   control = tree.control(nobs = length(response), mindev = 1e-3))
plot(large.tree)
text(large.tree)
cv.err <- cv.tree(large.tree, method = "misclass")
plot(cv.err)
mod.tree <- prune.tree(large.tree, best = 5)
plot(mod.tree)
text(mod.tree)

y.hat.large.tree <- predict(large.tree, newdata = X.test, type = 'class')
y.hat.tree <- predict(mod.tree, newdata = X.test, type = 'class')

test.error$large.tree <- mean((as.numeric(y.hat.large.tree) - 1 - response.test)^2)
test.error$tree <- mean((as.numeric(y.hat.tree) - 1 - response.test)^2)

unlist(test.error)

# bagging
B <- 20
boostrap.tree <- list()
for (i in 1:B)
{
  bootstrap.sample <- sample(1:length(response), replace = T)
  boostrap.tree[[i]] <- tree(as.factor(response)[bootstrap.sample] ~ .,
                        data = X.train[bootstrap.sample, ],
                        control = tree.control(nobs = length(response), mindev = 1e-3))
  print(i)
}
  
consensus <- NULL
probability <- matrix(0, ncol = 2, nrow = length(response.test))
for (i in 1:B)
{
  consensus <- cbind(consensus, predict(boostrap.tree[[i]], newdata = X.test, type = 'class'))
  probability <- probability + predict(boostrap.tree[[i]], newdata = X.test)
}
y.hat.bagging.consensus <- ifelse(apply(consensus - 1, 1, mean) > 0.5, 1, 0)
y.hat.bagging.probability <- apply(probability, 1, which.max) - 1
  
test.error$bagging.c <- mean((y.hat.bagging.consensus - response.test)^2)
test.error$bagging.p <- mean((y.hat.bagging.probability - response.test)^2)
  
# random forest
library(ranger)
mod.RF <- ranger(as.factor(response) ~ ., data = X.train, mtry = sqrt(ncol(X.train)),
                 importance = 'permutation')
y.hat.RF <- predict(mod.RF, data = X.test, type = 'response')$predictions
# if we want to tune mtry, use library(tuneRanger)
test.error$RF <- mean((as.numeric(y.hat.RF) - 1 - response.test)^2)