---
title: 'STK2100'
subtitle: |
  | Machine Learning and Statistical Methods for Prediction and Classification
  | Mandatory Assignment 2
author: "Lars H. B. Olsen"
date: "`r format(Sys.time(), '%d %B %Y')`"
output:
  pdf_document:
    fig_caption: true
    extra_dependencies:
      caption: labelfont={bf}
      amsmath: null
urlcolor: blue
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# Exercise 1

```{r Ex1, cache = TRUE}
# Read the spam data from the course webpage
fil = "http://www.uio.no/studier/emner/matnat/math/STK2100/data/spam_data.txt"
spam = read.table(fil, header = T)
```

## 1a)

```{r Ex1a, cache = TRUE}
# Fit a standard logistic regression model to the training data. Wrap the call
# in suppressWarnings() to silence "Warning: glm.fit: fitted probabilities
# numerically 0 or 1 occurred".
fit = suppressWarnings(
  glm(y ~ . - train, data = spam, subset = spam$train, family = binomial))

# Predict the response class of the test data
pred = predict(fit, spam[!spam$train, ], type = "response") > 0.5
```

## 1b)

```{r Ex1b_chunk1, cache = TRUE}
# Compute the principal components of the explanatory variables, which we
# scale to unit variance before the transformation.
x.prcomp = prcomp(spam[ , 1:57], scale = TRUE)

# Insert the principal components, the response, and the indicator
# for training data into a data frame.
d = data.frame(x.prcomp$x, y = spam$y, train = spam$train)
```

```{r Ex1b_chunk2, cache = TRUE}
# Fit a logistic model based on the first 2 principal components to the training data
fit.pc = glm(y ~ . - train, data = d[ , c(1:2, 58, 59)],
             family = binomial, subset = d$train)

# Predict the response class of the test data
pred.pc = predict(fit.pc, d[!d$train, ], type = "response") > 0.5
```

## 1c)

## 1d)

```{r Ex1d, cache = TRUE}
par(mfrow = c(1, 3))
plot.ts(x.prcomp$rotation[ , 1],
        ylab = bquote("Weights of the variables in the first principal component"),
        xlab = bquote("Variable X"[j]), ylim = c(-0.25, 0.45))
plot.ts(x.prcomp$rotation[ , 2],
        ylab = bquote("Weights of the variables in the second principal component"),
        xlab = bquote("Variable X"[j]), ylim = c(-0.25, 0.45))
plot.ts(x.prcomp$rotation[ , 3],
        ylab = bquote("Weights of the variables in the third principal component"),
        xlab = bquote("Variable X"[j]), ylim = c(-0.25, 0.45))
```

## 1e)

```{r Ex1e, cache = TRUE}
# Import the necessary libraries
library(splines)
library(foreach)
library(gam)

# Fit a generalized additive model based on the first
# three explanatory variables to the training data.
fit.gam = gam(y ~ s(x1) + s(x2) + s(x3), data = spam,
              subset = spam$train, family = binomial)

# Predict the response class of the test data
pred.gam = predict(fit.gam, spam[!spam$train, ], type = "response") > 0.5

# Look at the summary of the fitted GAM and the significance
# levels of the nonparametric effects.
summary(fit.gam)

# Plot the three estimated non-linear effect functions, both with and without
# pointwise twice-standard-error bands, together with a rug plot of the
# observed values along the base of each panel.
par(mfrow = c(2, 3))
plot(fit.gam, se = T)
plot(fit.gam, se = F)
```
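As a quick check on the three classifiers fitted so far, the following sketch computes their test misclassification rates from the predictions above. It is an addition of mine, not part of the assignment text; the helper name `test.error` is introduced here for convenience.

```{r Ex1e_errors, cache = TRUE}
# Helper (the name test.error is introduced here, not from the assignment) that
# computes the misclassification rate of logical class predictions on the test
# data. At this point spam$y is still numeric 0/1, so we compare on the logical
# scale to be safe.
test.error = function(pred.class) {
  mean(pred.class != (spam$y[!spam$train] == 1))
}

# Test error rates of the full logistic regression (1a), the logistic
# regression on two principal components (1b), and the GAM (1e).
c(logistic = test.error(pred),
  logistic.pc = test.error(pred.pc),
  gam = test.error(pred.gam))
```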
## 1f)

```{r Ex1f, cache = TRUE}
# Fit a generalized additive model based on the first
# three principal components to the training data.
fit.pc.gam = gam(y ~ s(PC1) + s(PC2) + s(PC3), data = d,
                 subset = d$train, family = binomial)

# Look at the summary of the fitted GAM and the significance
# levels of the nonparametric effects.
summary(fit.pc.gam)

# Predict the response class of the test data
pred.pc.gam = predict(fit.pc.gam, d[!d$train, ], type = "response") > 0.5

# Make the same type of plots as above
par(mfrow = c(2, 3))
plot(fit.pc.gam, se = T)
plot(fit.pc.gam, se = F)
```

## 1g)

```{r Ex1g, cache = TRUE}
# Extract the names of the principal components
nam = names(d)[1:57]

# Set the number of principal components to include in the model
k = 20

# Create a formula with the given number of principal components
formula = as.formula(
  paste("y", paste(paste("s(", nam[1:k], ")", sep = ""), collapse = "+"), sep = "~"))

# Visually verify that the formula is correct
print(formula)

# Fit a generalized additive model based on the first
# k principal components to the training data.
fit.pc.gam = gam(formula, data = d, subset = d$train, family = binomial)

# Predict the response class of the test data
pred.pc.gam = predict(fit.pc.gam, d[!d$train, ], type = "response") > 0.5
```

## 1h)

## 1i)

# Exercise 2

In this exercise we continue with the spam data set, but now consider the use of neural networks.

## 2a)

```{r Ex2a, cache = TRUE}
# Import the necessary libraries
library(MASS)
library(nnet)
library(class)

# Convert y to a factor
spam$y = as.factor(spam$y)

# Fit a neural network with 10 hidden nodes and decay parameter 0.3 to the training data
spam.nnet = nnet(y ~ . - train, data = spam[spam$train == TRUE, ], size = 10,
                 decay = 0.3, MaxNWts = 10000, maxit = 500, trace = FALSE)

# Use the fitted neural network to predict the classes of the test data
pred = predict(spam.nnet, spam[spam$train == FALSE, ], type = "class")

# Compare the predicted classes with the true classes of the test data
show(table(spam$y[spam$train == FALSE], pred))
```

## 2b)

## 2c)

```{r Ex2c, cache = TRUE}
# Import the necessary libraries
library(Rcpp)
library(RSNNS)

# Extract the training and test targets as 0/1-encoded class labels
spamTargets = decodeClassLabels(spam$y)
spamTargets.train = spamTargets[spam$train == TRUE, ]
spamTargets.test = spamTargets[spam$train == FALSE, ]

# Fit a deep neural network with 3 hidden layers (6, 10, and 6 neurons)
# and learning rate 0.3 to the training data.
spam.dnet = mlp(as.matrix(spam[spam$train == TRUE, 1:57]), spamTargets.train,
                size = c(6, 10, 6), learnFuncParams = c(0.3), maxit = 300)

# Compare the predicted classes with the true classes of the test data
pred.dnet = predict(spam.dnet, spam[spam$train == FALSE, 1:57])
pred.dnet = apply(pred.dnet, 1, which.is.max) - 1
pred.dnet = pred.dnet == 1

# Compare on the logical scale: comparing the factor y directly with the
# logical pred.dnet would coerce the logical to "TRUE"/"FALSE", which never
# matches the levels "0"/"1" and would give an error rate of 1.
err.dnet = mean((spam$y[spam$train == FALSE] == 1) != pred.dnet)
err.dnet
```

## 2d)

## 2e)
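To put the two networks on a common footing, the following sketch (an addition of mine; the name `err.nnet` is not from the assignment) computes the test misclassification rate of the single-hidden-layer network from 2a) and sets it against `err.dnet` from 2c).

```{r Ex2_errors, cache = TRUE}
# Test misclassification rate of the single-hidden-layer network from 2a).
# `pred` holds its "0"/"1" class predictions as character strings, so a direct
# comparison with the factor y works. The name err.nnet is introduced here.
err.nnet = mean(spam$y[spam$train == FALSE] != pred)

# Side-by-side comparison with the deep network from 2c)
c(nnet = err.nnet, dnet = err.dnet)
```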