# STK 2100: Extra exercise 13 (Neural network on Wage data) ---------------------------------
# /studier/emner/matnat/math/STK2100/v23/exer.pdf
# Lars H. B. Olsen, Spring 2023.

# Clean up the memory before we start.
rm(list = ls(all = TRUE))

# Import the necessary libraries
library(ISLR)
library(nnet)
library(RSNNS)

# Look at the data
str(Wage)
head(Wage)
summary(Wage)

# a) ------------------------------------------------------------------------------------------
# Set seed for reproducibility
set.seed(1234)

# Scale the year and age variables to have mean 0 and sd 1.
# I.e., NOT "scaled to be between zero and one" as the exercise text states.
Wage$year = scale(Wage$year)
Wage$age = scale(Wage$age)
mean(Wage$year) # 0
sd(Wage$year)   # 1

# Get the number of observations in the Wage data set
n = nrow(Wage)
n

# Split the data into training and test data at a 50-50 ratio
train = sample(n, n/2)
Wage_train = Wage[train, ]
Wage_test = Wage[-train, ]

# b) ------------------------------------------------------------------------------------------
# Neural network on the wage data.

# Number of nodes in the hidden layer
m = 10

# Look at the arguments to the 'nnet' function
?nnet

# Set seed for reproducibility. Sensitive to seed value.
set.seed(1234)

# Fit a neural network with one hidden layer with m nodes
wage_nnet = nnet(
  wage ~ year + age + maritl + race + education + jobclass + health + health_ins,
  data = Wage_train, # Data frame from which variables specified in the formula are preferentially to be taken.
  size = m,          # Number of units in the hidden layer.
  decay = 0.1,       # Parameter for weight decay. Default 0.
  linout = TRUE,     # Switch for linear output units. Default logistic output units (FALSE).
  MaxNWts = 10000,   # The maximum allowable number of weights.
  maxit = 300,       # Maximum number of iterations. Default 100.
  trace = FALSE      # Switch for tracing optimization. Default TRUE.
)

# Predict the response of the test data using the fitted network
pred_nnet = predict(wage_nnet, Wage_test)

# Compute the test mean squared error
err_nnet = mean((Wage_test$wage - pred_nnet)^2)
err_nnet # 1368.638

# c) ------------------------------------------------------------------------------------------
# Scale the response to be between zero and one.
maxWage = max(Wage$wage)
Wage$rwage = Wage$wage/maxWage
hist(Wage$rwage)

# Create the updated training and test data
Wage_train = Wage[train, ]
Wage_test = Wage[-train, ]

# Number of nodes in the hidden layer
m = 10

# Set seed for reproducibility. NOT sensitive to seed value.
set.seed(1234)

# Fit a neural network with one hidden layer with m nodes where rwage is the response.
wage_rnnet = nnet(
  rwage ~ year + age + maritl + race + education + jobclass + health + health_ins,
  data = Wage_train,
  size = m,
  decay = 0.1,
  linout = TRUE,
  MaxNWts = 10000,
  maxit = 300,
  trace = FALSE
)

# Predict the response of the test data using the fitted network.
# Remember to scale the response back to the original scale.
pred_rnnet = maxWage*predict(wage_rnnet, Wage_test)
par(mfrow = c(2, 1))
hist(predict(wage_rnnet, Wage_test), breaks = 40)
hist(pred_rnnet, breaks = 50)

# Compute the test mean squared error
err_rnnet = mean((Wage_test$wage - pred_rnnet)^2)
err_rnnet # 1247.765

# We get much better results. This is reasonable: a response variable with a large spread
# of values may produce large error gradients, causing the weights to change dramatically
# and making the learning process unstable.
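# Side note (a sketch, not part of the original solution): RSNNS ships its own scaling
# helpers. normalizeData() with type "0_1" performs min-max scaling, (x - min)/(max - min),
# which is similar in spirit to (but not identical with) dividing by maxWage above, and it
# stores the scaling parameters so that denormalizeData() can map predictions back afterwards.
rwage_norm = normalizeData(as.matrix(Wage$wage), type = "0_1")
range(rwage_norm) # 0 and 1 by construction.
head(denormalizeData(rwage_norm, getNormParameters(rwage_norm))) # Recovers the original wages.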
# d) ------------------------------------------------------------------------------------------
# We now turn to deep neural networks, fitted with the 'mlp' function.

# Transform categorical variables into dummy variables.
# Note that X now includes an intercept.
X = model.matrix(~ year + age + maritl + race + education + jobclass + health + health_ins,
                 data = Wage)

# Remove whitespace in the variable names, as mlp does not accept whitespace and gives warnings.
colnames(X)
colnames(X) = gsub(" ", "", colnames(X))
colnames(X)

# Split the data into training and test data
X_train = X[train, ]
X_test = X[-train, ]

# Look at the arguments to the 'mlp' function
?mlp

# List of hidden-layer sizes we want to test
sizes = list(c(10), c(10, 10), c(10, 10, 10), c(10, 10, 10, 10), c(10, 6, 3), c(8, 4))

# Array to store the test MSE
err_dnet_array = rep(NA, length(sizes))

# Iterate over the different networks
for (size_idx in seq_along(sizes)) {
  # Get the current size.
  # Note the double square brackets to get the element and not a list.
  size = sizes[[size_idx]]

  # Small printout
  cat(sprintf("Working on size %d of %d: ", size_idx, length(sizes)))

  # Set seed for reproducibility. Somewhat sensitive to seed value.
  set.seed(1234)

  # Fit a deep neural network to the Wage data.
  # Note that the argument is 'linOut' (capital O); a lowercase 'linout' would be
  # silently absorbed by '...' and the default logistic output units used instead.
  wage_dnet = mlp(
    X_train,                  # A matrix with training inputs for the network.
    Wage_train$rwage,         # The corresponding target values.
    size = size,              # Number of units in the hidden layer(s).
    linOut = TRUE,            # Sets the activation function of the output units to linear (instead of logistic).
    learnFuncParams = c(0.3), # The parameters for the learning function. Default c(0.2, 0).
    maxit = 300               # Maximum number of iterations to learn.
  )

  # Predict the response of the test data using the fitted network.
  # Remember to scale the response back to the original scale.
  pred_dnet = maxWage*predict(wage_dnet, X_test)

  # Compute the test mean squared error
  err_dnet_array[size_idx] = mean((Wage_test$wage - pred_dnet)^2)

  # Small printout to the user.
  cat(sprintf("Test MSE = %.2f for network [%s].\n",
              err_dnet_array[size_idx], paste(size, collapse = ", ")))
}

# e) ------------------------------------------------------------------------------------------
# Get the best size.
# Note the double square brackets.
best_size = sizes[[which.min(err_dnet_array)]]
best_size

# Number of repetitions
N_repeat = 10

# Array to store the test MSE for each repetition
err_dnet_best_array = rep(NA, N_repeat)

# Set seed for reproducibility.
# Note that we set it outside the loop so that we do not get the same results in each iteration.
set.seed(1234)

# List to store the fitted networks, to be used in f).
wage_dnet_list = list()

# Iterate over the repetitions
for (repetition_idx in seq(N_repeat)) {
  # Small printout
  cat(sprintf("Working on repetition %d of %d: ", repetition_idx, N_repeat))

  # Fit a deep neural network to the Wage data
  wage_dnet = mlp(
    X_train,
    Wage_train$rwage,
    size = best_size,
    linOut = TRUE,
    learnFuncParams = c(0.3),
    maxit = 300
  )

  # Predict the response of the test data using the fitted network.
  # Remember to scale the response back to the original scale.
  pred_dnet = maxWage*predict(wage_dnet, X_test)

  # Compute the test mean squared error
  err_dnet_best_array[repetition_idx] = mean((Wage_test$wage - pred_dnet)^2)

  # Small printout to the user.
  cat(sprintf("Test MSE = %.2f.\n", err_dnet_best_array[repetition_idx]))

  # Store the fitted NN. Used in f), and note the double square brackets.
  wage_dnet_list[[repetition_idx]] = wage_dnet
}

# Get the average test MSE
err_dnet_best_avg = mean(err_dnet_best_array)
err_dnet_best_avg # 1285.657

# We see variation because the weights are randomly initialized in each repetition.
# Sometimes we are lucky with the initial weights and sometimes we are unlucky.

# f) ------------------------------------------------------------------------------------------
# Fit several networks and then take the average of the predictions we obtain.

# Number of models (all fitted and stored in e))
N_models = 10

# Compute the predictions of the test data for all N_models networks in wage_dnet_list
pred_dnet_matrix = sapply(wage_dnet_list, function(NN_model) maxWage*predict(NN_model, X_test))
pred_dnet_matrix

# Take the means of the rows to get the average prediction
pred_dnet_avg = rowMeans(pred_dnet_matrix)

# Compute the test MSE
err_dnet_avg = mean((Wage_test$wage - pred_dnet_avg)^2)
err_dnet_avg # 1247.637

# g) ------------------------------------------------------------------------------------------
# Compare the results with the previous prediction results on the Wage data.

# Combine the results from d)
err_dnet_array_aux = as.matrix(err_dnet_array)
rownames(err_dnet_array_aux) = sapply(sizes, paste, collapse = "-")

# Combine all results into a matrix
results_matrix = rbind(
  err_nnet,
  err_rnnet,
  err_dnet_array_aux,
  err_dnet_best_avg,
  err_dnet_avg
)
colnames(results_matrix) = "Test MSE"

# Look at the results
results_matrix

# Get the best model. We have to set drop = FALSE to get
# a matrix with the row and column names.
results_matrix[which.min(results_matrix), , drop = FALSE]
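
# Side note (a sketch, not part of the exercise text): part f) beating the average in e)
# is guaranteed by Jensen's inequality: for the convex squared-error loss, the MSE of the
# averaged prediction is never larger than the average of the individual MSEs. We can
# quantify the repetition-to-repetition spread and the gain from averaging directly.
sd(err_dnet_best_array)          # Spread of the test MSE over the N_repeat repetitions.
err_dnet_best_avg - err_dnet_avg # Reduction in test MSE from averaging the predictions.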