# read data
download.file('http://azzalini.stat.unipd.it/Book-DM/telekom-www.zip', 'dataTelekom.zip')
train.data <- read.table(unz('dataTelekom.zip', 'telekom-www/telekom.training.dat'),
                         header = TRUE)
test.data <- read.table(unz('dataTelekom.zip', 'telekom-www/telekom.test.dat'),
                        header = TRUE)
file.remove('dataTelekom.zip')   # portable replacement for system('rm ...')

# prepare training data: the response is the total outgoing call duration
# (peak + off-peak) in month 10
head(train.data)
response <- train.data$q10.out.dur.peak + train.data$q10.out.dur.offpeak
hist(response, breaks = 1000, xlim = c(0, 5000))
# keep only customers with positive traffic, so that log(response) is well defined
selection <- response > 0
table(selection)
response <- response[selection]
summary(train.data)
# drop the first two columns and the ten q10.* columns used to build the response
X.train <- train.data[, -c(1:2, 101:110)]
X.train$tariff.plan <- as.factor(X.train$tariff.plan)
X.train$activ.area <- as.factor(X.train$activ.area)
X.train$activ.chan <- as.factor(X.train$activ.chan)
X.train <- X.train[selection, ]

# prepare test data in the same way
response.test <- test.data$q10.out.dur.peak + test.data$q10.out.dur.offpeak
selection <- response.test > 0
table(selection)
response.test <- response.test[selection]
X.test <- test.data[, -c(1:2, 101:110)]
X.test$tariff.plan <- as.factor(X.test$tariff.plan)
X.test$activ.area <- as.factor(X.test$activ.area)
X.test$activ.chan <- as.factor(X.test$activ.chan)
X.test <- X.test[selection, ]

# linear model
model.linear <- lm(response ~ ., data = X.train)
summary(model.linear)
plot(model.linear)

# backward elimination via AIC (scope must be a formula or a list of formulas,
# not a fitted model)
library(MASS)
mod.reduced <- stepAIC(model.linear, scope = list(lower = ~ 1), direction = 'backward')
summary(mod.reduced)

test.error <- list()
test.error$mod.linear <- mean((response.test - predict(model.linear, newdata = X.test))^2)
test.error$mod.linear.red <- mean((response.test - predict(mod.reduced, newdata = X.test))^2)

# logarithm transformation of the response
model.linear.log <- lm(log(response) ~ ., data = X.train)
summary(model.linear.log)
mod.reduced.log <- stepAIC(model.linear.log, scope = list(lower = ~ 1),
                           direction = 'backward')
summary(mod.reduced.log)
# test errors on the original scale: back-transform the predictions with exp()
test.error$mod.linear.log <- mean((response.test -
  exp(predict(model.linear.log, newdata = X.test)))^2)
test.error$mod.linear.red.log <- mean((response.test -
  exp(predict(mod.reduced.log, newdata = X.test)))^2)
test.error

# test errors on the log scale
test.error.ls <- list()
test.error.ls$mod.linear.log <- mean((log(response.test) -
  predict(model.linear.log, newdata = X.test))^2)
test.error.ls$mod.linear.red.log <- mean((log(response.test) -
  predict(mod.reduced.log, newdata = X.test))^2)
# the untransformed models can predict negative values; truncate at 0.5
# so the logarithm is defined
y.hat <- predict(model.linear, newdata = X.test)
y.hat[y.hat < 0] <- 0.5
y.hat.red <- predict(mod.reduced, newdata = X.test)
y.hat.red[y.hat.red < 0] <- 0.5
test.error.ls$mod.linear <- mean((log(response.test) - log(y.hat))^2)
test.error.ls$mod.linear.red <- mean((log(response.test) - log(y.hat.red))^2)
test.error.ls
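
# aside (not part of the original analysis): exp() of a log-scale prediction
# estimates the conditional median rather than the mean, so it tends to
# underestimate E[Y | x] on the original scale. A minimal sketch of the usual
# lognormal correction, assuming roughly normal log-scale residuals: multiply
# the back-transformed prediction by exp(sigma^2 / 2).
sigma2.hat <- summary(model.linear.log)$sigma^2
y.hat.corr <- exp(predict(model.linear.log, newdata = X.test) + sigma2.hat / 2)
test.error$mod.linear.log.corr <- mean((response.test - y.hat.corr)^2)
test.error$mod.linear.log.corr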
# additive model
library(gam)
# build the formula programmatically: a 4-df smoothing spline for age and for each
# of the 90 traffic variables in columns 9:98, linear terms for the rest
formula <- paste('response ~ tariff.plan + payment.method + gender + s(age, 4) +',
                 'activ.area + activ.chan + vas1 + vas2')
formula <- paste0(formula, paste0(' + s(', names(X.train)[9:98], ', 4)', collapse = ''))
mod.gam <- gam(as.formula(formula), data = X.train)
summary(mod.gam)
plot(mod.gam, terms = 's(age, 4)')
plot(mod.gam, terms = 's(q09.out.dur.offpeak, 4)', se = TRUE)
summary(mod.gam)$parametric.anova
# reduced model: keep the predictors significant at the 5% level
X.train.red <- X.train[, summary(mod.gam)$parametric.anova[-c(1, 99), 5] < 0.05]
formula <- paste('response ~ payment.method + gender + s(age, 4) +',
                 'activ.area + activ.chan + vas1 + vas2')
formula <- paste0(formula, paste0(' + s(', names(X.train.red)[8:63], ', 4)', collapse = ''))
mod.gam.red <- gam(as.formula(formula), data = X.train.red)
summary(mod.gam.red)
test.error$mod.gam <- mean((response.test - predict(mod.gam, X.test))^2)
test.error$mod.gam.red <- mean((response.test - predict(mod.gam.red, X.test))^2)
test.error

# additive model for log(response)
formula <- paste('log(response) ~ tariff.plan + payment.method + gender + s(age, 4) +',
                 'activ.area + activ.chan + vas1 + vas2')
formula <- paste0(formula, paste0(' + s(', names(X.train)[9:98], ', 4)', collapse = ''))
mod.gam.log <- gam(as.formula(formula), data = X.train)
summary(mod.gam.log)
X.train.red <- X.train[, summary(mod.gam.log)$parametric.anova[-c(1, 99), 5] < 0.05]
formula <- paste('log(response) ~ tariff.plan + payment.method + gender + s(age, 4) +',
                 'activ.area + activ.chan + vas1 + vas2')
formula <- paste0(formula, paste0(' + s(', names(X.train.red)[9:58], ', 4)', collapse = ''))
mod.gam.log.red <- gam(as.formula(formula), data = X.train.red)
summary(mod.gam.log.red)
plot(mod.gam.log.red, terms = 's(age, 4)')
test.error.ls$mod.gam.log <- mean((log(response.test) - predict(mod.gam.log, X.test))^2)
test.error.ls$mod.gam.log.red <- mean((log(response.test) -
  predict(mod.gam.log.red, X.test))^2)
test.error.ls

# neural network
library(nnet)
# linout = TRUE is required for regression: the default logistic output unit
# would constrain predictions to (0, 1)
mod.nn <- nnet(response ~ ., data = X.train, size = 3, maxit = 300, decay = 0.001,
               linout = TRUE, trace = FALSE)
test.error$nnet <- mean((response.test - predict(mod.nn, X.test))^2)
mod.nn.log <- nnet(log(response) ~ ., data = X.train, size = 3, maxit = 300,
                   decay = 0.001, linout = TRUE, trace = FALSE)
# truncate negative predictions (as for the linear models) before taking logs
y.hat.nn <- predict(mod.nn, X.test)
y.hat.nn[y.hat.nn < 0] <- 0.5
test.error.ls$nnet <- mean((log(response.test) - log(y.hat.nn))^2)
test.error$nnet.log <- mean((response.test - exp(predict(mod.nn.log, X.test)))^2)
test.error.ls$nnet.log <- mean((log(response.test) - predict(mod.nn.log, X.test))^2)
test.error
test.error.ls
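
# aside (not part of the original analysis): nnet() starts from random weights,
# so a single fit may land in a poor local optimum. A minimal sketch of a common
# remedy: refit a few times and keep the run with the smallest fitting criterion
# ($value, the residual sum of squares plus the decay penalty).
set.seed(1)
nn.runs <- lapply(1:5, function(i)
  nnet(log(response) ~ ., data = X.train, size = 3, maxit = 300, decay = 0.001,
       linout = TRUE, trace = FALSE))
mod.nn.log.best <- nn.runs[[which.min(sapply(nn.runs, `[[`, 'value'))]]
test.error.ls$nnet.log.best <- mean((log(response.test) -
  predict(mod.nn.log.best, X.test))^2)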
# regression tree: grow a deliberately large tree, then prune it
library(tree)
large.tree <- tree(response ~ ., data = X.train,
                   control = tree.control(nobs = length(response), mindev = 0.001))
plot(large.tree)
# choose the tree size by 10-fold cross-validation on the deviance
cv.err <- cv.tree(large.tree, K = 10, method = 'deviance')
cv.err
J <- cv.err$size[which.min(cv.err$dev)]
plot(cv.err)
mod.tree.cv <- prune.tree(large.tree, best = J)
plot(mod.tree.cv)
text(mod.tree.cv)
# alternative: choose the size at the elbow of the CV curve
mod.tree.elbow <- prune.tree(large.tree, best = 8)
plot(mod.tree.elbow)
text(mod.tree.elbow)
test.error$tree.cv <- mean((response.test - predict(mod.tree.cv, X.test))^2)
test.error$tree.elbow <- mean((response.test - predict(mod.tree.elbow, X.test))^2)
test.error$tree.large <- mean((response.test - predict(large.tree, X.test))^2)
test.error.ls$tree.cv <- mean((log(response.test) - log(predict(mod.tree.cv, X.test)))^2)
test.error.ls$tree.elbow <- mean((log(response.test) -
  log(predict(mod.tree.elbow, X.test)))^2)
test.error.ls$tree.large <- mean((log(response.test) -
  log(predict(large.tree, X.test)))^2)
test.error.ls

# lasso
library(glmnet)
library(fastDummies)
# glmnet needs a numeric matrix: expand the categorical predictors (columns 1:3 and
# 5:8) into dummies, then drop the original columns plus one reference level per
# factor; the dropped indices differ between train and test, presumably because
# some factor levels do not occur in the test set
X.train.pr <- cbind(dummy_columns(X.train[, c(1:3, 5:8)]), X.train[, -c(1:3, 5:8)])
X.train.pr <- X.train.pr[, -c(1:7, 8, 13, 16, 19, 24, 32, 34)]
X.test.pr <- cbind(dummy_columns(X.test[, c(1:3, 5:8)]), X.test[, -c(1:3, 5:8)])
X.test.pr <- X.test.pr[, -c(1:7, 8, 13, 16, 23, 31, 33)]
# alpha = 1 gives the lasso penalty; lambda is chosen by cross-validation
lambda.l <- cv.glmnet(x = as.matrix(X.train.pr), y = response, alpha = 1)
mod.lasso <- glmnet(x = as.matrix(X.train.pr), y = response, alpha = 1,
                    lambda = lambda.l$lambda.1se)
lambda.l.log <- cv.glmnet(x = as.matrix(X.train.pr), y = log(response), alpha = 1)
mod.lasso.log <- glmnet(x = as.matrix(X.train.pr), y = log(response), alpha = 1,
                        lambda = lambda.l.log$lambda.1se)
mod.lasso.log.min <- glmnet(x = as.matrix(X.train.pr), y = log(response), alpha = 1,
                            lambda = lambda.l.log$lambda.min)
test.error$lasso <- mean((response.test -
  predict(mod.lasso, newx = as.matrix(X.test.pr)))^2)
test.error.ls$lasso <- mean((log(response.test) -
  predict(mod.lasso.log, newx = as.matrix(X.test.pr)))^2)
test.error.ls$lasso.min <- mean((log(response.test) -
  predict(mod.lasso.log.min, newx = as.matrix(X.test.pr)))^2)
test.error
test.error.ls
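
# aside (not part of the original analysis): dropping dummy columns by hard-coded
# index is fragile. A minimal sketch of a more robust encoding: align the test
# factor levels with the training levels, then let model.matrix() build both
# design matrices from the same formula, so the columns match by construction.
X.train.mm <- X.train
X.test.mm <- X.test
for (v in names(X.train.mm)[sapply(X.train.mm,
                                   function(x) is.factor(x) || is.character(x))]) {
  X.train.mm[[v]] <- factor(X.train.mm[[v]])
  # test levels unseen in training would become NA here
  X.test.mm[[v]] <- factor(X.test.mm[[v]], levels = levels(X.train.mm[[v]]))
}
x.tr <- model.matrix(~ ., data = X.train.mm)[, -1]   # drop the intercept column
x.te <- model.matrix(~ ., data = X.test.mm)[, -1]
cv.lasso.mm <- cv.glmnet(x = x.tr, y = log(response), alpha = 1)
test.error.ls$lasso.mm <- mean((log(response.test) -
  predict(cv.lasso.mm, newx = x.te, s = 'lambda.1se'))^2)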