# STK2100 / Proposed solution
# Extra exercise 12
# Vera Kvisgaard 06.04.25
#
# Resources:
# - Exercise text: /studier/emner/matnat/math/STK2100/v22/exer.pdf
# - Information about the spam data /studier/emner/matnat/math/STK2100/data/spam_info.txt
# - the zip_nn.R file is available at the spring 2022 website,
#   /studier/emner/matnat/math/STK2100/v22/r-scripts/

library(ISLR)
library(nnet)
library(RSNNS)

# a) import data -----

# read the raw spam data from the course server path
url <- "/studier/emner/matnat/math/STK2100/data/spam.data"
df <- read.table(url)
dim(df)   # 57 continuous predictors + 1 binary class label (spam / non-spam)
head(df)

# standardize every predictor column (zero mean, unit variance);
# the last column is the class label and is left untouched
df[-ncol(df)] <- data.frame(lapply(df[-ncol(df)], scale))

# give the response a proper name and make it a factor
names(df)[ncol(df)] <- "cl"
df$cl <- factor(df$cl)
table(df$cl) / nrow(df)   # ~40 percent of the cases are spam

# read the predefined train/test indicator (one 0/1 value per row)
url <- "/studier/emner/matnat/math/STK2100/data/spam.traintest"
train <- read.table(url)$V1 == 1
table(train) / length(train)   # 2/3 allocated to test, might be a typo?
# check class balance within each split
table(train, cl = df$cl) / as.numeric(table(train))

# define separate data frames for train and test data
# (`train` is already logical, so index with it directly)
df_train <- df[train, ]
df_test <- df[!train, ]

# b) one-layer network with nnet() ----

# specify different combinations of size / decay values
params <- expand.grid(size = c(5, 10, 15), decay = c(0, 0.1, 0.3))

# iterate over all combinations, fit a single-hidden-layer network
# and compute the misclassification rate on the test set
err <- numeric(nrow(params))
set.seed(007)
for (i in seq_len(nrow(params))) {
  size <- params$size[[i]]
  decay <- params$decay[[i]]
  cat("iter:", i, "size:", size, "decay:", decay, "\n")
  fit <- nnet(cl ~ ., data = df_train, size = size, decay = decay,
              MaxNWts = 10000, maxit = 1000)
  pred <- predict(fit, df_test, type = "class")
  err[[i]] <- sum(pred != df_test$cl) / length(pred)
}

# store results in a list for later comparison in f)
res <- list()
res$nnet <- data.frame(params, error = err)
res$nnet
# use the full column name `error` ($err relied on partial matching)
res$nnet[which.min(res$nnet$error), ]
# Among the chosen settings, size = 5, decay = 0.1 gives minimum error
# The differences are rather small - and the optimal setting varies with seed -
# but a large decay value seems reasonable

# c) one-layer network with mlp() -----

# extract predictor/response objects for mlp()
X <- model.matrix(cl ~ . - 1, data = df)
X_train <- X[train, ]
X_train <- scale(X_train)
X_test <- X[!train, ]
X_test <- scale(X_test)
# NOTE(review): X_test is scaled with its own means/sds; strictly the
# training-set statistics should be reused -- kept as-is to match the
# original solution, but worth confirming with the exercise text
y_train <- df$cl[train]
y_test <- df$cl[!train]

sizes <- list(5, 10, 15)
err <- numeric(length(sizes))
set.seed(007)
for (i in seq_along(sizes)) {
  size <- sizes[[i]]
  cat("iter:", i, "size:", size, "\n")
  fit <- mlp(x = X_train,
             y = decodeClassLabels(y_train),  # convert factor to two-column matrix
             size = size,
             learnFunc = "Std_Backpropagation",
             learnFuncParams = c(0.3),
             maxit = 300)
  probs <- predict(fit, X_test)  # predicted probabilities for each class
  pred <- probs[, 2] > 0.5       # classify as spam when P(class "1") > 0.5
  err[[i]] <- sum(pred != (y_test == "1")) / length(pred)
}
res$mlp1 <- data.frame(size = as.character(sizes), error = err)

# d) multi-layer network ----

# one, two and three hidden layers of 10 units each
sizes <- list(c(10), c(10, 10), c(10, 10, 10))
err <- numeric(length(sizes))
set.seed(007)
for (i in seq_along(sizes)) {
  size <- sizes[[i]]
  cat("iter:", i, "size:", size, "\n")
  fit <- mlp(x = X_train,
             y = decodeClassLabels(y_train),  # convert factor to two-column matrix
             size = size,
             learnFunc = "Std_Backpropagation",
             learnFuncParams = c(0.3),
             maxit = 300)
  probs <- predict(fit, X_test)  # predicted probabilities for each class
  pred <- probs[, 2] > 0.5
  err[[i]] <- sum(pred != (y_test == "1")) / length(pred)
}
res$mlp <- data.frame(size = as.character(sizes), error = err)

# e) combined with stepAIC ----

# fit a full logistic regression model
full_model <- glm(cl ~ ., data = df_train, family = "binomial")

# perform stepwise model selection based on AIC
step_model <- MASS::stepAIC(full_model, direction = "both")

# print the summary of the final model selected by AIC
summary(step_model)

# fit a network on the AIC-selected predictors.
# BUG FIX: `decay = decay` previously reused a leftover loop variable from
# b) (whose final value was 0.3); set the decay explicitly to the value
# found optimal in b)
fit <- nnet(step_model$formula, data = df_train, size = 10, decay = 0.1,
            MaxNWts = 10000, maxit = 300)
pred <- predict(fit, df_test, type = "class")
res$step_nnet <- sum(pred != df_test$cl) / length(pred)

# f) ----

# collected test errors for all fitted models
res