# read data
download.file('http://azzalini.stat.unipd.it/Book-DM/telekom-www.zip', 'dataTelekom.zip')
train.data <- read.table(unz('dataTelekom.zip','telekom-www/telekom.training.dat'), header = TRUE)
test.data <- read.table(unz('dataTelekom.zip','telekom-www/telekom.test.dat'), header = TRUE)
system('rm dataTelekom.zip')

head(train.data)
response <- train.data$q10.out.dur.peak + train.data$q10.out.dur.offpeak
hist(response, breaks = 1e3, xlim = c(0, 6e3), freq = FALSE)
selection <- train.data$q10.out.dur.peak + train.data$q10.out.dur.offpeak > 0
response <- response[selection]
X.train <- train.data[ , -c(1:2, 101:110)]
summary(X.train)
X.train$tariff.plan <- as.factor(X.train$tariff.plan)
X.train$activ.area <- as.factor(X.train$activ.area)
X.train$activ.chan <- as.factor(X.train$activ.chan)
X.train <- X.train[selection, ]
selection <- test.data$q10.out.dur.peak + test.data$q10.out.dur.offpeak > 0
response.test <- test.data$q10.out.dur.peak + test.data$q10.out.dur.offpeak
response.test <- response.test[selection]
X.test <- test.data[ , -c(1:2, 101:110)]
X.test$tariff.plan <- as.factor(X.test$tariff.plan)
X.test$activ.area <- as.factor(X.test$activ.area)
X.test$activ.chan <- as.factor(X.test$activ.chan)
X.test <- X.test[selection, ]

# linear model #
mod.linear <- lm(response ~ ., data = X.train)
summary(mod.linear)
hist(mod.linear$residuals, breaks = 1e3, xlim = c(-2e4, 2e4), freq = FALSE)
curve(dnorm(x, 0, sd(mod.linear$residuals)), add = TRUE, col = 2)
library(MASS)
mod.linear.reduced <- stepAIC(object = mod.linear, scope = lm(response ~ 1, data = X.train))
summary(mod.linear.reduced)

hist(mod.linear.reduced$residuals, breaks = 1e3, xlim = c(-2e4, 2e4), freq = FALSE)
curve(dnorm(x, 0, sd(mod.linear.reduced$residuals)), add = TRUE, col = 2)

test.error <- list()
test.error$mod.linear <- mean((response.test - predict(mod.linear, newdata = X.test))^2)
test.error$mod.linear.red <- mean((response.test - predict(mod.linear.reduced, newdata = X.test))^2)

plot(mod.linear)
plot(mod.linear.reduced)

mod.linear.log <- lm(log(response) ~ ., data = X.train)
summary(mod.linear.log)
hist(mod.linear.log$residuals, breaks = 1e2, freq = FALSE)
curve(dnorm(x, 0, sd(mod.linear.log$residuals)), add = TRUE, col = 2)
mod.linear.log.reduced <- stepAIC(mod.linear.log, lm(log(response) ~ 1, data = X.train))
summary(mod.linear.log.reduced)
hist(mod.linear.log.reduced$residuals, breaks = 1e2, freq = FALSE)
curve(dnorm(x, 0, sd(mod.linear.log.reduced$residuals)), add = TRUE, col = 2)

plot(mod.linear.log)
plot(mod.linear.log.reduced)

test.error$mod.linear.log <- mean((response.test - exp(predict(mod.linear.log, newdata = X.test)))^2)
test.error$mod.linear.log.red <- mean((response.test - exp(predict(mod.linear.log.reduced, newdata = X.test)))^2)

y.hat <- predict(mod.linear, newdata = X.test)
summary(y.hat)
y.hat[y.hat < 0] <- 0.5
y.hat.red <- predict(mod.linear.reduced, newdata = X.test)
y.hat.red[y.hat.red < 0] <- 0.5
test.error.ls <- list()
test.error.ls$mod.linear <- mean((log(response.test) - log(y.hat))^2)
test.error.ls$mod.linear.red <- mean((log(response.test) - log(y.hat.red))^2)
test.error.ls$mod.linear.lin.log <- mean((log(response.test) - predict(mod.linear.log, newdata = X.test))^2)
test.error.ls$mod.linear.lin.log.red <- mean((log(response.test) - 
                                                predict(mod.linear.log.reduced, newdata = X.test))^2)

library(glmnet)
library(fastDummies)
X.train.penReg <- cbind(dummy_columns(X.train[, c(1:3, 5:8)]), X.train[, -c(1:3, 5:8)])[, -c(1:7, 8, 13, 16, 19, 24, 32, 34)]
X.test.penReg <- cbind(dummy_columns(X.test[, c(1:3, 5:8)]), X.test[, -c(1:3, 5:8)])[, -c(1:7, 8, 13, 16, 23, 31, 33)]


# gam #
library(gam)
formulaX <- paste0('+ s(',names(X.train[9:98]), ', 4)')
formula <- 'response ~ tariff.plan + payment.method + gender + s(age, 4) + activ.area + activ.chan + vas1 + vas2'
for (i in 1:length(formulaX)) formula <- paste0(formula, formulaX[i])
mod.gam <- gam(as.formula(formula), data = X.train)
summary(mod.gam)$parametric.anova

X.train.red <- X.train[ , summary(mod.gam)$parametric.anova[-c(1, 99), 5] < 0.05]
formulaX2 <- paste0('+ s(',names(X.train.red[-c(1:7)]), ', 4)')
formula <- 'response ~ tariff.plan + gender + s(age,4) + activ.area + activ.chan + vas1 + vas2'
for (i in 1:length(formulaX2)) formula <- paste0(formula, formulaX2[i])
mod.gam.reduced <- gam(as.formula(formula), data = X.train)
summary(mod.gam.reduced)

test.error$gam <- mean((response.test - predict(mod.gam, X.test))^2)
test.error$gam.red <- mean((response.test - predict(mod.gam.reduced, X.test))^2)

formula <- 'log(response) ~ tariff.plan + payment.method + gender + s(age,4) + activ.area + activ.chan +
            vas1 + vas2'
for (i in 1:length(formulaX)) formula <- paste0(formula, formulaX[i])
mod.gam.log <- gam(as.formula(formula), data = X.train)
summary(mod.gam.log)$parametric.anova
X.train.red <- X.train[ , summary(mod.gam.log)$parametric.anova[-c(1, 99), 5] < 0.05]
formulaX <- paste0('+s(',names(X.train.red[-c(1:8)]), ',4)')
formula <- 'log(response) ~ tariff.plan + payment.method + gender + s(age,4) + activ.area + activ.chan +
            vas1 + vas2'
for (i in 1:length(formulaX)) formula <- paste0(formula, formulaX[i])
mod.gam.log.reduced <- gam(as.formula(formula), data = X.train)

plot(mod.gam.log.reduced, terms = 's(age, 4)')
plot(mod.gam.log.reduced, terms = 's(age, 4)', se = TRUE)


test.error$gam.log <- mean((response.test - exp(predict(mod.gam.log, X.test)))^2)
test.error$gam.log.red <- mean((response.test - exp(predict(mod.gam.log.reduced, X.test)))^2)
test.error.ls$gam.log <- mean((log(response.test) - predict(mod.gam.log, X.test))^2)
test.error.ls$gam.log.red <- mean((log(response.test) - predict(mod.gam.log.reduced, X.test))^2)


# neural network #
library(nnet)
mod.neuralNetwork <- nnet(response ~ ., data = X.train, size = 3, maxit = 300, decay = 0.01, trace = FALSE)
test.error$nnet <- mean((response.test - predict(mod.neuralNetwork, newdata = X.test))^2)

mod.neuralNetwork.log <- nnet(log(response) ~ ., data = X.train, size = 3,
                              maxit = 300, decay = 0.01, trace = FALSE)
test.error$nnet.log <- mean((response.test - exp(predict(mod.neuralNetwork.log, newdata = X.test)))^2)
test.error.ls$nnet <- mean((log(response.test) - log(predict(mod.neuralNetwork, newdata = X.test)))^2)
test.error.ls$nnet.log <- mean((log(response.test) - predict(mod.neuralNetwork.log, newdata = X.test))^2)


# tree #
library(tree)
large.tree <- tree(response ~ ., data = X.train, control = tree.control(nobs = length(response), mindev = 0.001))
plot(large.tree)

cv.err <- cv.tree(large.tree, K = 10, method = 'deviance')
J <- cv.err$size[which.min(cv.err$dev)]
mod.tree.cv <- prune.tree(large.tree, best = J)
plot(mod.tree.cv)
text(mod.tree.cv)

plot(cv.err)
mod.tree <- prune.tree(large.tree, best = 8)
plot(mod.tree)
text(mod.tree)

test.error$tree <- mean((response.test - predict(mod.tree.cv, newdata = X.test))^2)
test.error$tree.red <- mean((response.test - predict(mod.tree, newdata = X.test))^2)


# ridge
lambda.r <- cv.glmnet(y = response, x = as.matrix(X.train.penReg), alpha = 0)
mod.ridge <- glmnet(y = response, x = as.matrix(X.train.penReg), alpha = 0,
                    lambda = lambda.r$lambda.1se)
test.error$ridge <- mean((response.test - predict(mod.ridge, newx = as.matrix(X.test.penReg)))^2)
lambda.r.log <- cv.glmnet(y = log(response), x = as.matrix(X.train.penReg), alpha = 0)
mod.ridge.log <- glmnet(y = log(response), x = as.matrix(X.train.penReg), alpha = 0,
                        lambda = lambda.r.log$lambda.1se)
test.error$ridge.log <- mean((response.test - exp(predict(mod.ridge.log, newx = as.matrix(X.test.penReg))))^2)

library(glmnet)
library(fastDummies)
X.train.penReg <- cbind(dummy_columns(X.train[, c(1:3, 5:8)]), X.train[, -c(1:3, 5:8)])[, -c(1:7, 8, 13, 16, 19, 24, 32, 34)]
X.test.penReg <- cbind(dummy_columns(X.test[, c(1:3, 5:8)]), X.test[, -c(1:3, 5:8)])[, -c(1:7, 8, 13, 16, 23, 31, 33)]


# lasso
lambda.l <- cv.glmnet(y = response, x = as.matrix(X.train.penReg), alpha = 1)
mod.lasso <- glmnet(y = response, x = as.matrix(X.train.penReg), alpha = 1, lambda = lambda.l$lambda.1se)
test.error$lasso <- mean((response.test - predict(mod.lasso, newx = as.matrix(X.test.penReg)))^2)
lambda.l.log <- cv.glmnet(y = log(response), x = as.matrix(X.train.penReg), alpha = 1)
mod.lasso.log <- glmnet(y = log(response), x = as.matrix(X.train.penReg), alpha = 1,
                    lambda = lambda.l.log$lambda.1se)
test.error$lasso <- mean((response.test - exp(predict(mod.lasso.log, newx = as.matrix(X.test.penReg))))^2)

#test.error.ls$ridge <- mean((log(response.test) - log(predict(mod.ridge, newx = as.matrix(X.test.penReg))))^2)
test.error.ls$ridge.log <- mean((log(response.test) - predict(mod.ridge.log, newx = as.matrix(X.test.penReg)))^2)
test.error.ls$lasso <- mean((log(response.test) - log(predict(mod.lasso, newx = as.matrix(X.test.penReg))))^2)
test.error.ls$lasso.log <- mean((log(response.test) - predict(mod.lasso.log, newx = as.matrix(X.test.penReg)))^2)


# boosting
library(mboost)

mod.boost <- gamboost(response ~ ., data = X.train, baselearner = 'bbs')
# mstop <- cvrisk(mod.boost) # commented out for time constraint
# mstop <- cvrisk(mod.boost, folds = cv(model.weights(mod.boost), type = "kfold", B = 5))
colnames(X.train.penReg)[5:6]
colnames(X.train.penReg)[5:6] <- c("payment.method_credit_card", "payment.method_post_account")
colnames(X.test.penReg)[5:6]
colnames(X.test.penReg)[5:6] <- c("payment.method_credit_card", "payment.method_post_account")
mod.boost.t <- gamboost(response ~ ., data = X.train.penReg, baselearner = 'btree')
# mstop.t <- cvrisk(mod.boost.t) # commented out for time constraint

test.error$boost <- mean((response.test - predict(mod.boost, newdata = X.test))^2)
test.error$boost.t <- mean((response.test - predict(mod.boost.t, newdata = X.test.penReg))^2)


mod.boost.log <- gamboost(log(response) ~ ., data = X.train, baselearner = 'bbs')
mod.boost.t.log <- gamboost(log(response) ~ ., data = X.train.penReg, baselearner = 'btree')

test.error.ls$boost <- mean((log(response.test) - predict(mod.boost.log, newdata = X.test))^2)
test.error.ls$boost.t <- mean((log(response.test) - predict(mod.boost.t.log, newdata = X.test.penReg))^2)

# random forests
library(ranger)
mod.rf <- ranger(dependent.variable.name = 'response', data = cbind(response, X.train.penReg))
test.error$rf <- mean((response.test - predict(mod.rf, data = X.test.penReg)$predictions)^2)
mod.rf.log <- ranger(dependent.variable.name = 'log(response)', data = cbind(log(response), X.train.penReg))
test.error.ls$rf <- mean((log(response.test) - predict(mod.rf.log, data = X.test.penReg)$predictions)^2)

library(tuneRanger)
task <- makeRegrTask(data = cbind(response, X.train.penReg), target = 'response')
mod.tuned <- tuneRanger(task)
test.error$rf.tuned <- mean((y.test - unlist(predict(mod.tuned$model, newdata = X.test.penReg)$data))^2)


unlist(test.error)/1e7
unlist(test.error.ls)