# Ex 3.2 simulation experiment
# Author: Lars Henry Berge Olsen
# Date: 17.02.23

# Set seed for reproducibility
set.seed(3)

# Set the significance level
alpha = 0.05
level = 1 - alpha

# The number of x values to consider
N = 100

# Generate N x values in [-2, 2]: sorted uniform draws
# (an evenly spaced grid via seq(-2, 2, length.out = N) would also work).
x_values = sort(runif(N, min = -2, max = 2))

# Generate the design matrix
X = matrix(c(x_values^0, x_values, x_values^2, x_values^3), nrow = N, ncol = 4)

# Define the true beta values
beta = matrix(c(1, -1.5, -0.5, 1), nrow = 4, ncol = 1)

# Number of explanatory variables (the intercept does not count)
p = nrow(beta) - 1

# Generate the response without noise
y_without_noise = X %*% beta

# The standard deviation of the noise
sigma_noise = 1

# Generate some noise
noise = rnorm(N, mean = 0, sd = sigma_noise)

# Add noise to the response
y_with_noise = y_without_noise + noise

# Compare the two
{
  matplot(cbind(x_values, x_values), cbind(y_without_noise, y_with_noise),
          type = c("l", "p"), pch = 16, xlab = "x", ylab = "f(x)")
  legend("topleft", legend = c("True", "Generated"), col = c(1, 2),
         lty = c(1, NA), pch = c(NA, 16), cex = 1.1)
}

# Fit a linear model to the data
lm_data = data.frame("y" = y_with_noise, "X0" = X[, 1], "X1" = X[, 2], "X2" = X[, 3], "X3" = X[, 4])
lm_model = lm(y ~ . - 1, data = lm_data) # -1 as X0 already provides the intercept
summary(lm_model)

# Compare the true coefficients with the estimated ones
rbind(t(beta), lm_model$coefficients) # Reasonably close

# Estimated sigma
sigma_hat = sqrt(sum(lm_model$residuals^2) / (N - p - 1))

# Compare with what the lm function got
rbind(summary(lm_model)$sigma, sigma_hat)

# Estimated beta coefficients
beta_hat = solve(t(X) %*% X) %*% t(X) %*% y_with_noise

# Compare with what the lm function got
rbind(summary(lm_model)$coeff[, 1], t(beta_hat))

# Predict the response
y_hat = X %*% beta_hat

# Compare the three
{
  matplot(cbind(x_values, x_values, x_values), cbind(y_without_noise, y_with_noise, y_hat),
          type = c("l", "p", "l"), pch = 16, lty = 1, col = c(1, 2, 4), xlab = "x", ylab = "f(x)")
  legend("topleft", legend = c("True", "Generated", "Predicted"), col = c(1, 2, 4),
         lty = c(1, NA, 1), pch = c(NA, 16, NA), cex = 1.1)
}

# Define the x0 values
x0_values = seq(-2, 2, by = 0.025)
X0 = matrix(c(x0_values^0, x0_values, x0_values^2, x0_values^3), nrow = length(x0_values), ncol = 4)

# Approach 1 is just what the lm function does by default.
lm_data_new = data.frame("X0" = X0[, 1], "X1" = X0[, 2], "X2" = X0[, 3], "X3" = X0[, 4])
predict(lm_model, newdata = lm_data_new, interval = "confidence")

# Arrays to store the fitted values and the confidence bands
{
  fit = rep(NA, length(x0_values))
  lower_plus_minus_1.96 = rep(NA, length(x0_values))
  upper_plus_minus_1.96 = rep(NA, length(x0_values))
  lower_plus_minus_2 = rep(NA, length(x0_values))
  upper_plus_minus_2 = rep(NA, length(x0_values))
  lower_plus_minus_qt = rep(NA, length(x0_values))
  upper_plus_minus_qt = rep(NA, length(x0_values))
}
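# Side note (a sketch, not part of the original computation): the per-point loop
# below can also be vectorized. The names XtX_inv and sd_terms are illustrative;
# diag(X0 %*% XtX_inv %*% t(X0)) holds x_0^T (X^T X)^{-1} x_0 for every row of X0,
# so sd_terms[i] should equal the sd_term computed in iteration i of the loop.
XtX_inv = solve(t(X) %*% X)
sd_terms = sqrt(sigma_hat^2 * diag(X0 %*% XtX_inv %*% t(X0)))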
# Iterate over the x0 values and compute the predicted values and the lower and upper confidence bands.
for (x0_values_index in seq_along(x0_values)) {
  # Extract the current x0 value.
  # drop = FALSE to still get a matrix, transpose to get a column vector.
  x0_value = t(X0[x0_values_index, , drop = FALSE])

  # Get the predicted response
  fit[x0_values_index] = t(x0_value) %*% beta_hat

  # Compute sqrt(Var(x_0^T \hat{beta}))
  sd_term = sqrt(sigma_hat^2 * t(x0_value) %*% solve(t(X) %*% X) %*% x0_value)

  # Get the lower and upper bounds (Gaussian error assumption)
  lower_plus_minus_1.96[x0_values_index] = fit[x0_values_index] - 1.96 * sd_term
  upper_plus_minus_1.96[x0_values_index] = fit[x0_values_index] + 1.96 * sd_term

  # It is also standard practice to use 2: even if the Gaussian error assumption does not hold,
  # this interval is approximately correct, with coverage approaching the nominal 1 - alpha level as N -> infinity.
  lower_plus_minus_2[x0_values_index] = fit[x0_values_index] - 2 * sd_term
  upper_plus_minus_2[x0_values_index] = fit[x0_values_index] + 2 * sd_term

  # Get the lower and upper confidence bounds using the t distribution, since sigma is estimated.
  lower_plus_minus_qt[x0_values_index] = fit[x0_values_index] - qt((1 - level) / 2, N - p - 1, lower.tail = FALSE) * sd_term
  upper_plus_minus_qt[x0_values_index] = fit[x0_values_index] + qt((1 - level) / 2, N - p - 1, lower.tail = FALSE) * sd_term
}

# Plot the results
{
  matplot(x_values, cbind(y_without_noise, y_with_noise, y_hat),
          type = c("l", "p", "l"), pch = 16, lty = 1, lwd = 2, col = c(1, 2, 4), xlab = "x", ylab = "f(x)")
  legend("topleft", legend = c("True", "Generated", "Predicted"), col = c(1, 2, 4),
         lty = c(1, NA, 1), pch = c(NA, 16, NA), cex = 1.1)
  matlines(x0_values, cbind(lower_plus_minus_1.96, upper_plus_minus_1.96), type = "l", lty = 2, col = 7, lwd = 2)
  matlines(x0_values, cbind(lower_plus_minus_2, upper_plus_minus_2), type = "l", lty = 2, col = 4, lwd = 2)
  matlines(x0_values, cbind(lower_plus_minus_qt, upper_plus_minus_qt), type = "l", lty = 3, col = 4, lwd = 2)
  legend("bottomright", title = "Confidence Bands", legend = c("Approach 1"), col = 4, lty = 2, cex = 1.1, lwd = 2)
}

# Approach 2 using equation 3.15 --------------------------------------------------------------
# The idea is to generate a lot of betas and keep only the ones that satisfy equation 3.15.

# The number of betas to generate
n_beta_sample = 1000000

# Sample "random" betas around beta_hat.
# The plus-minus 1 range is chosen by hand so that we do not generate too many unlikely values.
beta_samples = rbind(
  runif(n_beta_sample, beta_hat[1] - 1, beta_hat[1] + 1),
  runif(n_beta_sample, beta_hat[2] - 1, beta_hat[2] + 1),
  runif(n_beta_sample, beta_hat[3] - 1, beta_hat[3] + 1),
  runif(n_beta_sample, beta_hat[4] - 1, beta_hat[4] + 1))

# Array to keep track of which betas satisfy (3.15)
beta_indices_to_keep = rep(NA, n_beta_sample)

# Iterate over the betas
for (beta_sample_index in seq_len(n_beta_sample)) {
  # Extract the current beta as a column vector
  beta_sample = beta_samples[, beta_sample_index, drop = FALSE]

  # Check if the current beta satisfies eq (3.15)
  beta_indices_to_keep[beta_sample_index] =
    (t(beta_hat - beta_sample) %*% t(X) %*% X %*% (beta_hat - beta_sample))[1, 1] <=
    sigma_hat^2 * qchisq(level, df = p + 1)
}

# Extract the relevant betas
beta_keep = beta_samples[, beta_indices_to_keep, drop = FALSE]

# Compute the predicted responses for these lines
predicted_responses = X0 %*% beta_keep
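# Side note (a sketch, not part of the original computation): the rejection loop
# above can be vectorized, which is much faster for a million candidates. The names
# XtX, diffs, qf_vals and keep_vectorized are illustrative. The quadratic form
# d^T X^T X d for each column d of diffs is the column sum of diffs * (XtX %*% diffs).
XtX = t(X) %*% X
diffs = as.vector(beta_hat) - beta_samples  # 4 x n_beta_sample; the length-4 vector recycles column-wise
qf_vals = colSums(diffs * (XtX %*% diffs))
keep_vectorized = qf_vals <= sigma_hat^2 * qchisq(level, df = p + 1)
# keep_vectorized should agree with beta_indices_to_keep, up to floating-point
# rounding for candidates exactly on the boundary.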
# Plot all of them to form the confidence bands using approach 2.
{
  matplot(x0_values, predicted_responses, type = "l", lty = 1, col = 5, lwd = 2, xlab = "x", ylab = "f(x)")
  matlines(x_values, cbind(y_without_noise, y_with_noise, y_hat),
           type = c("l", "p", "l"), pch = 16, lty = 1, lwd = 2, col = c(1, 2, 4))
  matlines(x0_values, cbind(lower_plus_minus_qt, upper_plus_minus_qt), type = "l", lty = 2, col = 4, lwd = 2)
  legend("topleft", legend = c("True", "Generated", "Predicted"), col = c(1, 2, 4),
         lty = c(1, NA, 1), pch = c(NA, 16, NA), cex = 1.1)
  legend("bottomright", title = "Confidence Bands", legend = c("Approach 1", "Approach 2"),
         col = c(4, 5), lty = c(2, 1), cex = 1.1, lwd = 2)
}

# Instead of simulating the betas, one can derive explicit expressions.
{
  lower_math_stackexchange = rep(NA, length(x0_values))
  upper_math_stackexchange = rep(NA, length(x0_values))
  lower_wisconsin = rep(NA, length(x0_values))
  upper_wisconsin = rep(NA, length(x0_values))
}

# Iterate over the x0 values and compute the predicted values and the lower and upper confidence bands.
for (x0_values_index in seq_along(x0_values)) {
  # Extract the current x0 value.
  # drop = FALSE to still get a matrix, transpose to get a column vector.
  x0_value = t(X0[x0_values_index, , drop = FALSE])

  # Get the predicted response
  fit[x0_values_index] = t(x0_value) %*% beta_hat

  # Compute sqrt(Var(x_0^T \hat{beta}))
  sd_term = sqrt(sigma_hat^2 * t(x0_value) %*% solve(t(X) %*% X) %*% x0_value)

  # Approach 2, explicit expressions found online.
  # Lower and upper confidence bounds based on
  # https://math.stackexchange.com/questions/3331735/on-confidence-sets-and-linear-regression (may have misunderstood df)
  lower_math_stackexchange[x0_values_index] = fit[x0_values_index] -
    sigma_hat * sqrt(qchisq(0.975, p + 1) * t(x0_value) %*% solve(t(X) %*% X) %*% x0_value)
  upper_math_stackexchange[x0_values_index] = fit[x0_values_index] +
    sigma_hat * sqrt(qchisq(0.975, p + 1) * t(x0_value) %*% solve(t(X) %*% X) %*% x0_value)

  # Lower and upper confidence bounds based on equation 5.9 in
  # https://pages.stat.wisc.edu/~st849-1/lectures/Ch05.pdf (may have misunderstood df)
  lower_wisconsin[x0_values_index] = fit[x0_values_index] -
    sigma_hat * sqrt((p + 1) * qf(0.95, p + 1, N - p - 1) * t(x0_value) %*% solve(t(X) %*% X) %*% x0_value)
  upper_wisconsin[x0_values_index] = fit[x0_values_index] +
    sigma_hat * sqrt((p + 1) * qf(0.95, p + 1, N - p - 1) * t(x0_value) %*% solve(t(X) %*% X) %*% x0_value)
}

# Plot all of them to compare the confidence bands from approach 2.
{
  matplot(x0_values, predicted_responses, type = "l", lty = 1, col = 5, lwd = 2, xlab = "x", ylab = "f(x)")
  matlines(x_values, cbind(y_without_noise, y_with_noise, y_hat),
           type = c("l", "p", "l"), pch = 16, lty = 1, lwd = 2, col = c(1, 2, 4))
  matlines(x0_values, cbind(lower_plus_minus_qt, upper_plus_minus_qt), type = "l", lty = 2, col = 4, lwd = 2)
  matlines(x0_values, cbind(lower_math_stackexchange, upper_math_stackexchange), type = "l", lty = 3, col = 6, lwd = 3)
  matlines(x0_values, cbind(lower_wisconsin, upper_wisconsin), type = "l", lty = 4, col = 7, lwd = 3)
  legend("topleft", legend = c("True", "Generated", "Predicted"), col = c(1, 2, 4),
         lty = c(1, NA, 1), pch = c(NA, 16, NA), cex = 1.1)
  legend("bottomright", title = "Confidence Bands",
         legend = c("Approach 1", "Approach 2", "Approach 2 (math)", "Approach 2 (wisconsin)"),
         col = c(4, 5, 6, 7), lty = c(2, 1, 3, 4), cex = 1.1, lwd = 2)
}

# We see that "Approach 2", "Approach 2 (math)", and "Approach 2 (wisconsin)" all overlap,
# and that they are wider than "Approach 1".
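# Side note (a sketch, not part of the original computation): comparing the scaling
# constants explains the picture above. For large N - p - 1, the quantity
# (p + 1) * F_{p+1, N-p-1}(1 - alpha) is close to chisq_{p+1}(1 - alpha), so the
# Approach 2 variants nearly coincide; and all of them use a multiplier around 3,
# versus the pointwise t quantile of about 2 in Approach 1, hence the wider bands.
sqrt(qchisq(level, df = p + 1))                     # roughly 3.08
sqrt((p + 1) * qf(level, p + 1, N - p - 1))         # roughly 3.14
qt((1 - level) / 2, N - p - 1, lower.tail = FALSE)  # roughly 1.98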
# People who discuss the exercise or related material:
# https://waxworksmath.com/Authors/G_M/Hastie/WriteUp/Weatherwax_Epstein_Hastie_Solution_Manual.pdf (p. 33; unclear how they generate the lines from the confidence set)
# https://stats.stackexchange.com/questions/18322/confidence-set-for-parameter-vector-in-linear-regression
# https://math.stackexchange.com/questions/3331735/on-confidence-sets-and-linear-regression
# https://pages.stat.wisc.edu/~st849-1/lectures/Ch05.pdf (eq. 5.9)
# https://www.stat.cmu.edu/~cshalizi/mreg/15/lectures/18/lecture-18.pdf (section 3; thorough)
# https://getd.libs.uga.edu/pdfs/ma_james_c_201412_ms.p (pp. 44; possibly wrong df?)