# read data download.file('http://azzalini.stat.unipd.it/Book-DM/telekom-www.zip', 'dataTelekom.zip') train.data <- read.table(unz('dataTelekom.zip','telekom-www/telekom.training.dat'), header = TRUE) test.data <- read.table(unz('dataTelekom.zip','telekom-www/telekom.test.dat'), header = TRUE) system('rm dataTelekom.zip') head(train.data) response <- train.data$q10.out.dur.peak + train.data$q10.out.dur.offpeak hist(response, breaks = 1e3, xlim = c(0, 6e3), freq = FALSE) selection <- train.data$q10.out.dur.peak + train.data$q10.out.dur.offpeak > 0 response <- response[selection] X.train <- train.data[ , -c(1:2, 101:110)] summary(X.train) X.train$tariff.plan <- as.factor(X.train$tariff.plan) X.train$activ.area <- as.factor(X.train$activ.area) X.train$activ.chan <- as.factor(X.train$activ.chan) X.train <- X.train[selection, ] selection <- test.data$q10.out.dur.peak + test.data$q10.out.dur.offpeak > 0 response.test <- test.data$q10.out.dur.peak + test.data$q10.out.dur.offpeak response.test <- response.test[selection] X.test <- test.data[ , -c(1:2, 101:110)] X.test$tariff.plan <- as.factor(X.test$tariff.plan) X.test$activ.area <- as.factor(X.test$activ.area) X.test$activ.chan <- as.factor(X.test$activ.chan) X.test <- X.test[selection, ] # linear model # mod.linear <- lm(response ~ ., data = X.train) summary(mod.linear) hist(mod.linear$residuals, breaks = 1e3, xlim = c(-2e4, 2e4), freq = FALSE) curve(dnorm(x, 0, sd(mod.linear$residuals)), add = TRUE, col = 2) library(MASS) mod.linear.reduced <- stepAIC(object = mod.linear, scope = lm(response ~ 1, data = X.train)) summary(mod.linear.reduced) hist(mod.linear.reduced$residuals, breaks = 1e3, xlim = c(-2e4, 2e4), freq = FALSE) curve(dnorm(x, 0, sd(mod.linear.reduced$residuals)), add = TRUE, col = 2) test.error <- list() test.error$mod.linear <- mean((response.test - predict(mod.linear, newdata = X.test))^2) test.error$mod.linear.red <- mean((response.test - predict(mod.linear.reduced, newdata = X.test))^2) plot(mod.linear) plot(mod.linear.reduced) mod.linear.log <- lm(log(response) ~ ., data = X.train) summary(mod.linear.log) hist(mod.linear.log$residuals, breaks = 1e2, freq = FALSE) curve(dnorm(x, 0, sd(mod.linear.log$residuals)), add = TRUE, col = 2) mod.linear.log.reduced <- stepAIC(mod.linear.log, lm(log(response) ~ 1, data = X.train)) summary(mod.linear.log.reduced) hist(mod.linear.log.reduced$residuals, breaks = 1e2, freq = FALSE) curve(dnorm(x, 0, sd(mod.linear.log.reduced$residuals)), add = TRUE, col = 2) plot(mod.linear.log) plot(mod.linear.log.reduced) test.error$mod.linear.log <- mean((response.test - exp(predict(mod.linear.log, newdata = X.test)))^2) test.error$mod.linear.log.red <- mean((response.test - exp(predict(mod.linear.log.reduced, newdata = X.test)))^2) y.hat <- predict(mod.linear, newdata = X.test) summary(y.hat) y.hat[y.hat < 0] <- 0.5 y.hat.red <- predict(mod.linear.reduced, newdata = X.test) y.hat.red[y.hat.red < 0] <- 0.5 test.error.ls <- list() test.error.ls$mod.linear <- mean((log(response.test) - log(y.hat))^2) test.error.ls$mod.linear.red <- mean((log(response.test) - log(y.hat.red))^2) test.error.ls$mod.linear.lin.log <- mean((log(response.test) - predict(mod.linear.log, newdata = X.test))^2) test.error.ls$mod.linear.lin.log.red <- mean((log(response.test) - predict(mod.linear.log.reduced, newdata = X.test))^2) library(glmnet) library(fastDummies) X.train.penReg <- cbind(dummy_columns(X.train[, c(1:3, 5:8)]), X.train[, -c(1:3, 5:8)])[, -c(1:7, 8, 13, 16, 19, 24, 32, 34)] X.test.penReg <- cbind(dummy_columns(X.test[, c(1:3, 5:8)]), X.test[, -c(1:3, 5:8)])[, -c(1:7, 8, 13, 16, 23, 31, 33)] # gam # library(gam) formulaX <- paste0('+ s(',names(X.train[9:98]), ', 4)') formula <- 'response ~ tariff.plan + payment.method + gender + s(age, 4) + activ.area + activ.chan + vas1 + vas2' for (i in 1:length(formulaX)) formula <- paste0(formula, formulaX[i]) mod.gam <- gam(as.formula(formula), data = X.train) summary(mod.gam)$parametric.anova X.train.red <- X.train[ , summary(mod.gam)$parametric.anova[-c(1, 99), 5] < 0.05] formulaX2 <- paste0('+ s(',names(X.train.red[-c(1:7)]), ', 4)') formula <- 'response ~ tariff.plan + gender + s(age,4) + activ.area + activ.chan + vas1 + vas2' for (i in 1:length(formulaX2)) formula <- paste0(formula, formulaX2[i]) mod.gam.reduced <- gam(as.formula(formula), data = X.train) summary(mod.gam.reduced) test.error$gam <- mean((response.test - predict(mod.gam, X.test))^2) test.error$gam.red <- mean((response.test - predict(mod.gam.reduced, X.test))^2) formula <- 'log(response) ~ tariff.plan + payment.method + gender + s(age,4) + activ.area + activ.chan + vas1 + vas2' for (i in 1:length(formulaX)) formula <- paste0(formula, formulaX[i]) mod.gam.log <- gam(as.formula(formula), data = X.train) summary(mod.gam.log)$parametric.anova X.train.red <- X.train[ , summary(mod.gam.log)$parametric.anova[-c(1, 99), 5] < 0.05] formulaX <- paste0('+s(',names(X.train.red[-c(1:8)]), ',4)') formula <- 'log(response) ~ tariff.plan + payment.method + gender + s(age,4) + activ.area + activ.chan + vas1 + vas2' for (i in 1:length(formulaX)) formula <- paste0(formula, formulaX[i]) mod.gam.log.reduced <- gam(as.formula(formula), data = X.train) plot(mod.gam.log.reduced, terms = 's(age, 4)') plot(mod.gam.log.reduced, terms = 's(age, 4)', se = TRUE) test.error$gam.log <- mean((response.test - exp(predict(mod.gam.log, X.test)))^2) test.error$gam.log.red <- mean((response.test - exp(predict(mod.gam.log.reduced, X.test)))^2) test.error.ls$gam.log <- mean((log(response.test) - predict(mod.gam.log, X.test))^2) test.error.ls$gam.log.red <- mean((log(response.test) - predict(mod.gam.log.reduced, X.test))^2) # neural network # library(nnet) mod.neuralNetwork <- nnet(response ~ ., data = X.train, size = 3, maxit = 300, decay = 0.01, trace = FALSE) test.error$nnet <- mean((response.test - predict(mod.neuralNetwork, newdata = X.test))^2) mod.neuralNetwork.log <- nnet(log(response) ~ ., data = X.train, size = 3, maxit = 300, decay = 0.01, trace = FALSE) test.error$nnet.log <- mean((response.test - exp(predict(mod.neuralNetwork.log, newdata = X.test)))^2) test.error.ls$nnet <- mean((log(response.test) - log(predict(mod.neuralNetwork, newdata = X.test)))^2) test.error.ls$nnet.log <- mean((log(response.test) - predict(mod.neuralNetwork.log, newdata = X.test))^2) # tree # library(tree) large.tree <- tree(response ~ ., data = X.train, control = tree.control(nobs = length(response), mindev = 0.001)) plot(large.tree) cv.err <- cv.tree(large.tree, K = 10, method = 'deviance') J <- cv.err$size[which.min(cv.err$dev)] mod.tree.cv <- prune.tree(large.tree, best = J) plot(mod.tree.cv) text(mod.tree.cv) plot(cv.err) mod.tree <- prune.tree(large.tree, best = 8) plot(mod.tree) text(mod.tree) test.error$tree <- mean((response.test - predict(mod.tree.cv, newdata = X.test))^2) test.error$tree.red <- mean((response.test - predict(mod.tree, newdata = X.test))^2) # ridge lambda.r <- cv.glmnet(y = response, x = as.matrix(X.train.penReg), alpha = 0) mod.ridge <- glmnet(y = response, x = as.matrix(X.train.penReg), alpha = 0, lambda = lambda.r$lambda.1se) test.error$ridge <- mean((response.test - predict(mod.ridge, newx = as.matrix(X.test.penReg)))^2) lambda.r.log <- cv.glmnet(y = log(response), x = as.matrix(X.train.penReg), alpha = 0) mod.ridge.log <- glmnet(y = log(response), x = as.matrix(X.train.penReg), alpha = 0, lambda = lambda.r.log$lambda.1se) test.error$ridge.log <- mean((response.test - exp(predict(mod.ridge.log, newx = as.matrix(X.test.penReg))))^2) library(glmnet) library(fastDummies) X.train.penReg <- cbind(dummy_columns(X.train[, c(1:3, 5:8)]), X.train[, -c(1:3, 5:8)])[, -c(1:7, 8, 13, 16, 19, 24, 32, 34)] X.test.penReg <- cbind(dummy_columns(X.test[, c(1:3, 5:8)]), X.test[, -c(1:3, 5:8)])[, -c(1:7, 8, 13, 16, 23, 31, 33)] # lasso lambda.l <- cv.glmnet(y = response, x = as.matrix(X.train.penReg), alpha = 1) mod.lasso <- glmnet(y = response, x = as.matrix(X.train.penReg), alpha = 1, lambda = lambda.l$lambda.1se) test.error$lasso <- mean((response.test - predict(mod.lasso, newx = as.matrix(X.test.penReg)))^2) lambda.l.log <- cv.glmnet(y = log(response), x = as.matrix(X.train.penReg), alpha = 1) mod.lasso.log <- glmnet(y = log(response), x = as.matrix(X.train.penReg), alpha = 1, lambda = lambda.l.log$lambda.1se) test.error$lasso <- mean((response.test - exp(predict(mod.lasso.log, newx = as.matrix(X.test.penReg))))^2) #test.error.ls$ridge <- mean((log(response.test) - log(predict(mod.ridge, newx = as.matrix(X.test.penReg))))^2) test.error.ls$ridge.log <- mean((log(response.test) - predict(mod.ridge.log, newx = as.matrix(X.test.penReg)))^2) test.error.ls$lasso <- mean((log(response.test) - log(predict(mod.lasso, newx = as.matrix(X.test.penReg))))^2) test.error.ls$lasso.log <- mean((log(response.test) - predict(mod.lasso.log, newx = as.matrix(X.test.penReg)))^2) # boosting library(mboost) mod.boost <- gamboost(response ~ ., data = X.train, baselearner = 'bbs') # mstop <- cvrisk(mod.boost) # commented out for time constraint # mstop <- cvrisk(mod.boost, folds = cv(model.weights(mod.boost), type = "kfold", B = 5)) colnames(X.train.penReg)[5:6] colnames(X.train.penReg)[5:6] <- c("payment.method_credit_card", "payment.method_post_account") colnames(X.test.penReg)[5:6] colnames(X.test.penReg)[5:6] <- c("payment.method_credit_card", "payment.method_post_account") mod.boost.t <- gamboost(response ~ ., data = X.train.penReg, baselearner = 'btree') # mstop.t <- cvrisk(mod.boost.t) # commented out for time constraint test.error$boost <- mean((response.test - predict(mod.boost, newdata = X.test))^2) test.error$boost.t <- mean((response.test - predict(mod.boost.t, newdata = X.test.penReg))^2) mod.boost.log <- gamboost(log(response) ~ ., data = X.train, baselearner = 'bbs') mod.boost.t.log <- gamboost(log(response) ~ ., data = X.train.penReg, baselearner = 'btree') test.error.ls$boost <- mean((log(response.test) - predict(mod.boost.log, newdata = X.test))^2) test.error.ls$boost.t <- mean((log(response.test) - predict(mod.boost.t.log, newdata = X.test.penReg))^2) # random forests library(ranger) mod.rf <- ranger(dependent.variable.name = 'response', data = cbind(response, X.train.penReg)) test.error$rf <- mean((response.test - predict(mod.rf, data = X.test.penReg)$predictions)^2) mod.rf.log <- ranger(dependent.variable.name = 'log(response)', data = cbind(log(response), X.train.penReg)) test.error.ls$rf <- mean((log(response.test) - predict(mod.rf.log, data = X.test.penReg)$predictions)^2) library(tuneRanger) task <- makeRegrTask(data = cbind(response, X.train.penReg), target = 'response') mod.tuned <- tuneRanger(task) test.error$rf.tuned <- mean((y.test - unlist(predict(mod.tuned$model, newdata = X.test.penReg)$data))^2) unlist(test.error)/1e7 unlist(test.error.ls)