# Ex 3.2 simulation experiment
# Author: Lars Henry Berge Olsen
# Date: 17.02.23

# Set seed for reproducibility
set.seed(3)

# Set the significance level
alpha = 0.05
level = 1 - alpha

# The number of x values to consider
N = 100

# Generate N x values in [-2, 2]: sorted uniform draws
# (an evenly spaced grid via seq(-2, 2, length.out = N) would also work).
x_values = sort(runif(N, min = -2, max = 2))

# Generate the design matrix
X = matrix(c(x_values^0, x_values, x_values^2, x_values^3), nrow = N, ncol = 4)

# Define the true beta values
beta = matrix(c(1, -1.5, -0.5, 1), nrow = 4, ncol = 1)

# Number of explanatory variables (the intercept does not count)
p = nrow(beta) - 1

# Generate the response without noise
y_without_noise = X %*% beta

# The standard deviation of the noise
sigma_noise = 1

# Generate some noise
noise = rnorm(N, mean = 0, sd = sigma_noise)

# Add noise to the response
y_with_noise = y_without_noise + noise

# Compare the two
{
  matplot(cbind(x_values, x_values), cbind(y_without_noise, y_with_noise),
          type = c("l", "p"), pch = 16, xlab = "x", ylab = "f(x)")
  legend("topleft", legend = c("True", "Generated"), col = c(1, 2),
         lty = c(1, NA), pch = c(NA, 16), cex = 1.1)
}

# Fit a linear model to the data
lm_data = data.frame("y" = y_with_noise, "X0" = X[, 1], "X1" = X[, 2], "X2" = X[, 3], "X3" = X[, 4])
lm_model = lm(y ~ . - 1, data = lm_data) # -1 as X0 already provides the intercept
summary(lm_model)

# Compare the true coefficients with the estimated ones
rbind(t(beta), lm_model$coefficients) # Reasonably close

# Estimated sigma
sigma_hat = sqrt(sum(lm_model$residuals^2) / (N - p - 1))

# Compare with what the lm function got
rbind(summary(lm_model)$sigma, sigma_hat)

# Estimated beta coefficients
beta_hat = solve(t(X) %*% X) %*% t(X) %*% y_with_noise

# Compare with what the lm function got
rbind(summary(lm_model)$coeff[, 1], t(beta_hat))

# Predict the response
y_hat = X %*% beta_hat

# Compare the three
{
  matplot(cbind(x_values, x_values, x_values), cbind(y_without_noise, y_with_noise, y_hat),
          type = c("l", "p", "l"), pch = 16, lty = 1, col = c(1, 2, 4), xlab = "x", ylab = "f(x)")
  legend("topleft", legend = c("True", "Generated", "Predicted"), col = c(1, 2, 4),
         lty = c(1, NA, 1), pch = c(NA, 16, NA), cex = 1.1)
}

# Define the x0 values
x0_values = seq(-2, 2, by = 0.025)
X0 = matrix(c(x0_values^0, x0_values, x0_values^2, x0_values^3), nrow = length(x0_values), ncol = 4)

# Approach 1 is just what the lm function does by default.
lm_data_new = data.frame("X0" = X0[, 1], "X1" = X0[, 2], "X2" = X0[, 3], "X3" = X0[, 4])
predict(lm_model, newdata = lm_data_new, interval = "confidence")

# Arrays to store the fitted values and the confidence bands
{
  fit = rep(NA, length(x0_values))
  lower_plus_minus_1.96 = rep(NA, length(x0_values))
  upper_plus_minus_1.96 = rep(NA, length(x0_values))
  lower_plus_minus_2 = rep(NA, length(x0_values))
  upper_plus_minus_2 = rep(NA, length(x0_values))
  lower_plus_minus_qt = rep(NA, length(x0_values))
  upper_plus_minus_qt = rep(NA, length(x0_values))
}
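# Side note (a sketch, not part of the original computation): the per-point loop
# below can also be vectorized. The names XtX_inv and sd_terms are illustrative;
# diag(X0 %*% XtX_inv %*% t(X0)) holds x_0^T (X^T X)^{-1} x_0 for every row of X0,
# so sd_terms[i] should equal the sd_term computed in iteration i of the loop.
XtX_inv = solve(t(X) %*% X)
sd_terms = sqrt(sigma_hat^2 * diag(X0 %*% XtX_inv %*% t(X0)))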
# Iterate over the x0 values and compute the predicted values and the lower and upper confidence bands.
for (x0_values_index in seq_along(x0_values)) {
  # Extract the current x0 value.
  # drop = FALSE to still get a matrix, transpose to get a column vector.
  x0_value = t(X0[x0_values_index, , drop = FALSE])

  # Get the predicted response
  fit[x0_values_index] = t(x0_value) %*% beta_hat

  # Compute sqrt(Var(x_0^T \hat{beta}))
  sd_term = sqrt(sigma_hat^2 * t(x0_value) %*% solve(t(X) %*% X) %*% x0_value)

  # Get the lower and upper bounds (Gaussian error assumption)
  lower_plus_minus_1.96[x0_values_index] = fit[x0_values_index] - 1.96 * sd_term
  upper_plus_minus_1.96[x0_values_index] = fit[x0_values_index] + 1.96 * sd_term

  # It is also standard practice to use 2: even if the Gaussian error assumption does not hold,
  # this interval is approximately correct, with coverage approaching the nominal 1 - alpha level as N -> infinity.
  lower_plus_minus_2[x0_values_index] = fit[x0_values_index] - 2 * sd_term
  upper_plus_minus_2[x0_values_index] = fit[x0_values_index] + 2 * sd_term

  # Get the lower and upper confidence bounds using the t distribution, since sigma is estimated.
  lower_plus_minus_qt[x0_values_index] = fit[x0_values_index] - qt((1 - level) / 2, N - p - 1, lower.tail = FALSE) * sd_term
  upper_plus_minus_qt[x0_values_index] = fit[x0_values_index] + qt((1 - level) / 2, N - p - 1, lower.tail = FALSE) * sd_term
}

# Plot the results
{
  matplot(x_values, cbind(y_without_noise, y_with_noise, y_hat),
          type = c("l", "p", "l"), pch = 16, lty = 1, lwd = 2, col = c(1, 2, 4), xlab = "x", ylab = "f(x)")
  legend("topleft", legend = c("True", "Generated", "Predicted"), col = c(1, 2, 4),
         lty = c(1, NA, 1), pch = c(NA, 16, NA), cex = 1.1)
  matlines(x0_values, cbind(lower_plus_minus_1.96, upper_plus_minus_1.96), type = "l", lty = 2, col = 7, lwd = 2)
  matlines(x0_values, cbind(lower_plus_minus_2, upper_plus_minus_2), type = "l", lty = 2, col = 4, lwd = 2)
  matlines(x0_values, cbind(lower_plus_minus_qt, upper_plus_minus_qt), type = "l", lty = 3, col = 4, lwd = 2)
  legend("bottomright", title = "Confidence Bands", legend = c("Approach 1"), col = 4, lty = 2, cex = 1.1, lwd = 2)
}

# Approach 2 using equation 3.15 --------------------------------------------------------------
# The idea is to generate a lot of betas and keep only the ones that satisfy equation 3.15.

# The number of betas to generate
n_beta_sample = 1000000

# Sample "random" betas around beta_hat.
# The plus-minus 1 range is chosen by hand so that we do not generate too many unlikely values.
beta_samples = rbind(
  runif(n_beta_sample, beta_hat[1] - 1, beta_hat[1] + 1),
  runif(n_beta_sample, beta_hat[2] - 1, beta_hat[2] + 1),
  runif(n_beta_sample, beta_hat[3] - 1, beta_hat[3] + 1),
  runif(n_beta_sample, beta_hat[4] - 1, beta_hat[4] + 1))

# Array to keep track of which betas satisfy (3.15)
beta_indices_to_keep = rep(NA, n_beta_sample)

# Iterate over the betas
for (beta_sample_index in seq_len(n_beta_sample)) {
  # Extract the current beta as a column vector
  beta_sample = beta_samples[, beta_sample_index, drop = FALSE]

  # Check if the current beta satisfies eq (3.15)
  beta_indices_to_keep[beta_sample_index] =
    (t(beta_hat - beta_sample) %*% t(X) %*% X %*% (beta_hat - beta_sample))[1, 1] <=
    sigma_hat^2 * qchisq(level, df = p + 1)
}

# Extract the relevant betas
beta_keep = beta_samples[, beta_indices_to_keep, drop = FALSE]

# Compute the predicted responses for these lines
predicted_responses = X0 %*% beta_keep
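# Side note (a sketch, not part of the original computation): the rejection loop
# above can be vectorized, which is much faster for a million candidates. The names
# XtX, diffs, qf_vals and keep_vectorized are illustrative. The quadratic form
# d^T X^T X d for each column d of diffs is the column sum of diffs * (XtX %*% diffs).
XtX = t(X) %*% X
diffs = as.vector(beta_hat) - beta_samples  # 4 x n_beta_sample; the length-4 vector recycles column-wise
qf_vals = colSums(diffs * (XtX %*% diffs))
keep_vectorized = qf_vals <= sigma_hat^2 * qchisq(level, df = p + 1)
# keep_vectorized should agree with beta_indices_to_keep, up to floating-point
# rounding for candidates exactly on the boundary.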
# Plot all of them to form the confidence bands using approach 2.
{
  matplot(x0_values, predicted_responses, type = "l", lty = 1, col = 5, lwd = 2, xlab = "x", ylab = "f(x)")
  matlines(x_values, cbind(y_without_noise, y_with_noise, y_hat),
           type = c("l", "p", "l"), pch = 16, lty = 1, lwd = 2, col = c(1, 2, 4))
  matlines(x0_values, cbind(lower_plus_minus_qt, upper_plus_minus_qt), type = "l", lty = 2, col = 4, lwd = 2)
  legend("topleft", legend = c("True", "Generated", "Predicted"), col = c(1, 2, 4),
         lty = c(1, NA, 1), pch = c(NA, 16, NA), cex = 1.1)
  legend("bottomright", title = "Confidence Bands", legend = c("Approach 1", "Approach 2"),
         col = c(4, 5), lty = c(2, 1), cex = 1.1, lwd = 2)
}

# Instead of simulating the betas, one can derive explicit expressions.
{
  lower_math_stackexchange = rep(NA, length(x0_values))
  upper_math_stackexchange = rep(NA, length(x0_values))
  lower_wisconsin = rep(NA, length(x0_values))
  upper_wisconsin = rep(NA, length(x0_values))
}

# Iterate over the x0 values and compute the predicted values and the lower and upper confidence bands.
for (x0_values_index in seq_along(x0_values)) {
  # Extract the current x0 value.
  # drop = FALSE to still get a matrix, transpose to get a column vector.
  x0_value = t(X0[x0_values_index, , drop = FALSE])

  # Get the predicted response
  fit[x0_values_index] = t(x0_value) %*% beta_hat

  # Compute sqrt(Var(x_0^T \hat{beta}))
  sd_term = sqrt(sigma_hat^2 * t(x0_value) %*% solve(t(X) %*% X) %*% x0_value)

  # Approach 2, explicit expressions found online.
  # Lower and upper confidence bounds based on
  # https://math.stackexchange.com/questions/3331735/on-confidence-sets-and-linear-regression (may have misunderstood df)
  lower_math_stackexchange[x0_values_index] = fit[x0_values_index] -
    sigma_hat * sqrt(qchisq(0.975, p + 1) * t(x0_value) %*% solve(t(X) %*% X) %*% x0_value)
  upper_math_stackexchange[x0_values_index] = fit[x0_values_index] +
    sigma_hat * sqrt(qchisq(0.975, p + 1) * t(x0_value) %*% solve(t(X) %*% X) %*% x0_value)

  # Lower and upper confidence bounds based on equation 5.9 in
  # https://pages.stat.wisc.edu/~st849-1/lectures/Ch05.pdf (may have misunderstood df)
  lower_wisconsin[x0_values_index] = fit[x0_values_index] -
    sigma_hat * sqrt((p + 1) * qf(0.95, p + 1, N - p - 1) * t(x0_value) %*% solve(t(X) %*% X) %*% x0_value)
  upper_wisconsin[x0_values_index] = fit[x0_values_index] +
    sigma_hat * sqrt((p + 1) * qf(0.95, p + 1, N - p - 1) * t(x0_value) %*% solve(t(X) %*% X) %*% x0_value)
}

# Plot all of them to compare the confidence bands from approach 2.
{
  matplot(x0_values, predicted_responses, type = "l", lty = 1, col = 5, lwd = 2, xlab = "x", ylab = "f(x)")
  matlines(x_values, cbind(y_without_noise, y_with_noise, y_hat),
           type = c("l", "p", "l"), pch = 16, lty = 1, lwd = 2, col = c(1, 2, 4))
  matlines(x0_values, cbind(lower_plus_minus_qt, upper_plus_minus_qt), type = "l", lty = 2, col = 4, lwd = 2)
  matlines(x0_values, cbind(lower_math_stackexchange, upper_math_stackexchange), type = "l", lty = 3, col = 6, lwd = 3)
  matlines(x0_values, cbind(lower_wisconsin, upper_wisconsin), type = "l", lty = 4, col = 7, lwd = 3)
  legend("topleft", legend = c("True", "Generated", "Predicted"), col = c(1, 2, 4),
         lty = c(1, NA, 1), pch = c(NA, 16, NA), cex = 1.1)
  legend("bottomright", title = "Confidence Bands",
         legend = c("Approach 1", "Approach 2", "Approach 2 (math)", "Approach 2 (wisconsin)"),
         col = c(4, 5, 6, 7), lty = c(2, 1, 3, 4), cex = 1.1, lwd = 2)
}

# We see that "Approach 2", "Approach 2 (math)", and "Approach 2 (wisconsin)" all overlap,
# and that they are wider than "Approach 1".
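# Side note (a sketch, not part of the original computation): comparing the scaling
# constants explains the picture above. For large N - p - 1, the quantity
# (p + 1) * F_{p+1, N-p-1}(1 - alpha) is close to chisq_{p+1}(1 - alpha), so the
# Approach 2 variants nearly coincide; and all of them use a multiplier around 3,
# versus the pointwise t quantile of about 2 in Approach 1, hence the wider bands.
sqrt(qchisq(level, df = p + 1))                     # roughly 3.08
sqrt((p + 1) * qf(level, p + 1, N - p - 1))         # roughly 3.14
qt((1 - level) / 2, N - p - 1, lower.tail = FALSE)  # roughly 1.98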
# People who discuss the exercise or related material:
# https://waxworksmath.com/Authors/G_M/Hastie/WriteUp/Weatherwax_Epstein_Hastie_Solution_Manual.pdf (p. 33; unclear how they generate the lines from the confidence set)
# https://stats.stackexchange.com/questions/18322/confidence-set-for-parameter-vector-in-linear-regression
# https://math.stackexchange.com/questions/3331735/on-confidence-sets-and-linear-regression
# https://pages.stat.wisc.edu/~st849-1/lectures/Ch05.pdf (eq. 5.9)
# https://www.stat.cmu.edu/~cshalizi/mreg/15/lectures/18/lecture-18.pdf (section 3; thorough)
# https://getd.libs.uga.edu/pdfs/ma_james_c_201412_ms.p (pp. 44; possibly wrong df?)