# STK 2100: Extra exercise 13 (Neural network on Wage data) ---------------------------------
# /studier/emner/matnat/math/STK2100/v23/exer.pdf
# Lars H. B. Olsen, Spring 2023.

# Clean up the memory before we start.
rm(list = ls(all = TRUE))

# Import the necessary libraries
library(ISLR)
library(nnet)
library(RSNNS)

# Look at the data
str(Wage)
head(Wage)
summary(Wage)

# a) ------------------------------------------------------------------------------------------
# Set seed for reproducibility
set.seed(1234)

# Scale the year and age variables to have mean 0 and sd 1.
# I.e., NOT "scaled to be between zero and one" as the exercise text states.
Wage$year = scale(Wage$year)
Wage$age = scale(Wage$age)
mean(Wage$year) # 0
sd(Wage$year)   # 1

# Get the number of observations in the Wage data set
n = nrow(Wage)
n

# Split the data into training and test data at a 50-50 ratio
train = sample(n, n/2)
Wage_train = Wage[train, ]
Wage_test = Wage[-train, ]

# b) ------------------------------------------------------------------------------------------
# Neural network on the wage data.

# Number of nodes in the hidden layer
m = 10

# Look at the arguments to the 'nnet' function
?nnet

# Set seed for reproducibility. Sensitive to seed value.
set.seed(1234)

# Fit a neural network with one hidden layer with m nodes
wage_nnet = nnet(
  wage ~ year + age + maritl + race + education + jobclass + health + health_ins,
  data = Wage_train, # Data frame from which variables specified in the formula are preferentially to be taken.
  size = m,          # Number of units in the hidden layer.
  decay = 0.1,       # Parameter for weight decay. Default 0.
  linout = TRUE,     # Switch for linear output units. Default logistic output units (FALSE).
  MaxNWts = 10000,   # The maximum allowable number of weights.
  maxit = 300,       # Maximum number of iterations. Default 100.
  trace = FALSE      # Switch for tracing optimization. Default TRUE.
)

# Predict the response of the test data using the fitted network
pred_nnet = predict(wage_nnet, Wage_test)

# Compute the test mean squared error
err_nnet = mean((Wage_test$wage - pred_nnet)^2)
err_nnet # 1368.638

# c) ------------------------------------------------------------------------------------------
# Scale the response to be between zero and one.
maxWage = max(Wage$wage)
Wage$rwage = Wage$wage/maxWage
hist(Wage$rwage)

# Create the updated training and test data
Wage_train = Wage[train, ]
Wage_test = Wage[-train, ]

# Number of nodes in the hidden layer
m = 10

# Set seed for reproducibility. NOT sensitive to seed value.
set.seed(1234)

# Fit a neural network with one hidden layer with m nodes where rwage is the response.
wage_rnnet = nnet(
  rwage ~ year + age + maritl + race + education + jobclass + health + health_ins,
  data = Wage_train,
  size = m,
  decay = 0.1,
  linout = TRUE,
  MaxNWts = 10000,
  maxit = 300,
  trace = FALSE
)

# Predict the response of the test data using the fitted network.
# Remember to scale the response back to the original scale.
pred_rnnet = maxWage*predict(wage_rnnet, Wage_test)
par(mfrow = c(2, 1))
hist(predict(wage_rnnet, Wage_test), breaks = 40)
hist(pred_rnnet, breaks = 50)

# Compute the test mean squared error
err_rnnet = mean((Wage_test$wage - pred_rnnet)^2)
err_rnnet # 1247.765

# We get much better results. This is reasonable: a response variable with a large spread
# of values may produce large error gradients, causing the weights to change dramatically
# and making the learning process unstable.
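# Side note (a sketch, not part of the original solution): RSNNS ships its own scaling
# helpers. normalizeData() with type "0_1" performs min-max scaling, (x - min)/(max - min),
# which is similar in spirit to (but not identical with) dividing by maxWage above, and it
# stores the scaling parameters so that denormalizeData() can map predictions back afterwards.
rwage_norm = normalizeData(as.matrix(Wage$wage), type = "0_1")
range(rwage_norm) # 0 and 1 by construction.
head(denormalizeData(rwage_norm, getNormParameters(rwage_norm))) # Recovers the original wages.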
# d) ------------------------------------------------------------------------------------------
# We now turn to deep neural networks, fitted with the 'mlp' function.

# Transform categorical variables into dummy variables.
# Note that X now includes an intercept.
X = model.matrix(~ year + age + maritl + race + education + jobclass + health + health_ins,
                 data = Wage)

# Remove whitespace in the variable names, as mlp does not accept whitespace and gives warnings.
colnames(X)
colnames(X) = gsub(" ", "", colnames(X))
colnames(X)

# Split the data into training and test data
X_train = X[train, ]
X_test = X[-train, ]

# Look at the arguments to the 'mlp' function
?mlp

# List of hidden-layer sizes we want to test
sizes = list(c(10), c(10, 10), c(10, 10, 10), c(10, 10, 10, 10), c(10, 6, 3), c(8, 4))

# Array to store the test MSE
err_dnet_array = rep(NA, length(sizes))

# Iterate over the different networks
for (size_idx in seq_along(sizes)) {
  # Get the current size.
  # Note the double square brackets to get the element and not a list.
  size = sizes[[size_idx]]

  # Small printout
  cat(sprintf("Working on size %d of %d: ", size_idx, length(sizes)))

  # Set seed for reproducibility. Somewhat sensitive to seed value.
  set.seed(1234)

  # Fit a deep neural network to the Wage data.
  # Note that the argument is 'linOut' (capital O); a lowercase 'linout' would be
  # silently absorbed by '...' and the default logistic output units used instead.
  wage_dnet = mlp(
    X_train,                  # A matrix with training inputs for the network.
    Wage_train$rwage,         # The corresponding target values.
    size = size,              # Number of units in the hidden layer(s).
    linOut = TRUE,            # Sets the activation function of the output units to linear (instead of logistic).
    learnFuncParams = c(0.3), # The parameters for the learning function. Default c(0.2, 0).
    maxit = 300               # Maximum number of iterations to learn.
  )

  # Predict the response of the test data using the fitted network.
  # Remember to scale the response back to the original scale.
  pred_dnet = maxWage*predict(wage_dnet, X_test)

  # Compute the test mean squared error
  err_dnet_array[size_idx] = mean((Wage_test$wage - pred_dnet)^2)

  # Small printout to the user.
  cat(sprintf("Test MSE = %.2f for network [%s].\n",
              err_dnet_array[size_idx], paste(size, collapse = ", ")))
}

# e) ------------------------------------------------------------------------------------------
# Get the best size.
# Note the double square brackets.
best_size = sizes[[which.min(err_dnet_array)]]
best_size

# Number of repetitions
N_repeat = 10

# Array to store the test MSE for each repetition
err_dnet_best_array = rep(NA, N_repeat)

# Set seed for reproducibility.
# Note that we set it outside the loop so that we do not get the same results in each iteration.
set.seed(1234)

# List to store the fitted networks, to be used in f).
wage_dnet_list = list()

# Iterate over the repetitions
for (repetition_idx in seq(N_repeat)) {
  # Small printout
  cat(sprintf("Working on repetition %d of %d: ", repetition_idx, N_repeat))

  # Fit a deep neural network to the Wage data
  wage_dnet = mlp(
    X_train,
    Wage_train$rwage,
    size = best_size,
    linOut = TRUE,
    learnFuncParams = c(0.3),
    maxit = 300
  )

  # Predict the response of the test data using the fitted network.
  # Remember to scale the response back to the original scale.
  pred_dnet = maxWage*predict(wage_dnet, X_test)

  # Compute the test mean squared error
  err_dnet_best_array[repetition_idx] = mean((Wage_test$wage - pred_dnet)^2)

  # Small printout to the user.
  cat(sprintf("Test MSE = %.2f.\n", err_dnet_best_array[repetition_idx]))

  # Store the fitted NN. Used in f), and note the double square brackets.
  wage_dnet_list[[repetition_idx]] = wage_dnet
}

# Get the average test MSE
err_dnet_best_avg = mean(err_dnet_best_array)
err_dnet_best_avg # 1285.657

# We see variation because the weights are randomly initialized in each repetition.
# Sometimes we are lucky with the initial weights and sometimes we are unlucky.

# f) ------------------------------------------------------------------------------------------
# Fit several networks and then take the average of the predictions we obtain.

# Number of models (all fitted and stored in e))
N_models = 10

# Compute the predictions of the test data for all N_models networks in wage_dnet_list
pred_dnet_matrix = sapply(wage_dnet_list, function(NN_model) maxWage*predict(NN_model, X_test))
pred_dnet_matrix

# Take the means of the rows to get the average prediction
pred_dnet_avg = rowMeans(pred_dnet_matrix)

# Compute the test MSE
err_dnet_avg = mean((Wage_test$wage - pred_dnet_avg)^2)
err_dnet_avg # 1247.637

# g) ------------------------------------------------------------------------------------------
# Compare the results with the previous prediction results on the Wage data.

# Combine the results from d)
err_dnet_array_aux = as.matrix(err_dnet_array)
rownames(err_dnet_array_aux) = sapply(sizes, paste, collapse = "-")

# Combine all results into a matrix
results_matrix = rbind(
  err_nnet,
  err_rnnet,
  err_dnet_array_aux,
  err_dnet_best_avg,
  err_dnet_avg
)
colnames(results_matrix) = "Test MSE"

# Look at the results
results_matrix

# Get the best model. We have to set drop = FALSE to get
# a matrix with the row and column names.
results_matrix[which.min(results_matrix), , drop = FALSE]
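
# Side note (a sketch, not part of the exercise text): part f) beating the average in e)
# is guaranteed by Jensen's inequality: for the convex squared-error loss, the MSE of the
# averaged prediction is never larger than the average of the individual MSEs. We can
# quantify the repetition-to-repetition spread and the gain from averaging directly.
sd(err_dnet_best_array)          # Spread of the test MSE over the N_repeat repetitions.
err_dnet_best_avg - err_dnet_avg # Reduction in test MSE from averaging the predictions.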