# STK 4900
# Exercise 14

# Clean up the workspace before we start.
rm(list=ls(all=TRUE))

# a)

# Read the data.
rock.data = read.table(file="http://www.uio.no/studier/emner/matnat/math/STK4900/data/rock.dat",
                       col.names=c("area","peri","shape","perm"), header=FALSE)

# Take a look at the data.
rock.data

# Descriptive statistics, histogram and boxplot of "perm":
summary(rock.data)
par(mfrow=c(1,2))
hist(rock.data[,"perm"])
boxplot(rock.data[,"perm"])
par(mfrow=c(1,1))

# How is the distribution of "perm"? Symmetric or skewed?

# Try a log-transform:
rock.data[,"perm.log"] = log(rock.data[,"perm"])
summary(rock.data[,"perm.log"])
par(mfrow=c(1,2))
hist(rock.data[,"perm.log"])
boxplot(rock.data[,"perm.log"])
par(mfrow=c(1,1))

# How is the distribution of "perm.log"? Symmetric or skewed?
# (We choose to use "perm.log" in question c).)

# b)

# Compute the correlations between the three covariates.
cor(rock.data[,1:3])

# Scatter plot matrix of the covariates.
plot(rock.data[,1:3])

# Is there a strong correlation between some of the covariates?
# How could this affect the regression analysis?

# c)

# Create scatter plots of 'perm.log' against each of the covariates,
# in a 1 by 3 grid of plots.
par(mfrow=c(1,3))
# Scatter plot: 'perm.log' vs 'area'
plot(x=rock.data[,"area"], y=rock.data[,"perm.log"], main="")
# Scatter plot: 'perm.log' vs 'peri'
plot(x=rock.data[,"peri"], y=rock.data[,"perm.log"], main="")
# Scatter plot: 'perm.log' vs 'shape'
plot(x=rock.data[,"shape"], y=rock.data[,"perm.log"], main="")
par(mfrow=c(1,1))

# Alternative: correlation matrix (column 5 is "perm.log").
cor(rock.data[,c(1,2,3,5)])

# Which of the covariates seem to be of importance?

# Linear regression of "perm.log" on all three covariates.
# (The options "x=TRUE" and "y=TRUE" are included because of the computations
# in question d); they store the design matrix x and the response y in the lm object.)
lm.obj.1 = lm(perm.log~area+peri+shape, data=rock.data, x=TRUE, y=TRUE)
summary(lm.obj.1)
names(lm.obj.1)

# Which of the covariates seem to be of importance?
# Is this in agreement with what you could see from the scatterplots?

# Simple regressions with one covariate at a time
# (shape is entered as a second-degree polynomial to allow for a curved effect):
lm.area <- lm(perm.log~area, data=rock.data)
summary(lm.area)
lm.peri <- lm(perm.log~peri, data=rock.data)
summary(lm.peri)
lm.shape <- lm(perm.log~poly(shape, 2), data=rock.data)
summary(lm.shape)

# d)

# According to their t-values the order of the covariates is: peri > area > shape.

# Fit simpler regression models (dropping covariates according to this order).
lm.obj.2 = lm(perm.log~area+peri, data=rock.data, x=TRUE, y=TRUE)
lm.obj.3 = lm(perm.log~peri, data=rock.data, x=TRUE, y=TRUE)

# A function that computes leave-one-out cross-validated R-squared.
cv.R2.func = function(lm.obj)
{
  # Safety checks: the model must have been fitted with x=TRUE and y=TRUE.
  if (!("x" %in% names(lm.obj))) {stop("The given 'lm.obj' doesn't contain component 'x'.")}
  if (!("y" %in% names(lm.obj))) {stop("The given 'lm.obj' doesn't contain component 'y'.")}

  # Extract the design matrix 'x' and the response 'y' from the given 'lm.obj'.
  x = lm.obj$x
  y = lm.obj$y

  # Compute the leverages v = diag(H), where H = x(x'x)^{-1}x' is the hat matrix;
  # (x'x)^{-1} is obtained from the eigendecomposition of x'x.
  a = t(x)%*%x
  d = diag(1/eigen(a)$values)
  e = eigen(a)$vectors
  inv.a = e%*%d%*%t(e)
  v = diag(x%*%inv.a%*%t(x))

  # The leave-one-out residuals are res/(1-v), giving the PRESS sum of squares.
  SSkryss = sum((lm.obj$res/(1-v))^2)
  SStot = sum((y-mean(y))^2)
  cross.validated.R2 = 1 - SSkryss/SStot

  # Return the cross-validated R-squared.
  return(cross.validated.R2)
}

# Compute the cross-validated R2:
cv.R2.func(lm.obj.1)
cv.R2.func(lm.obj.2)
cv.R2.func(lm.obj.3)

# Which of the three models gives the best prediction?
# Compare the cross-validated R2 with the R2 from question c). What do you see?
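# Optional cross-check (not part of the original exercise): the calculation
# above is the standard "PRESS" shortcut for leave-one-out CV, and the
# leverages it computes are also returned by R's built-in hatvalues().
# A minimal sketch verifying cv.R2.func against the built-in
# (cv.R2.check is our own helper name):
cv.R2.check = function(lm.obj)
{
  v = hatvalues(lm.obj)                        # leverages diag(H)
  SSkryss = sum((residuals(lm.obj)/(1-v))^2)   # PRESS sum of squares
  SStot = sum((lm.obj$y-mean(lm.obj$y))^2)
  return(1 - SSkryss/SStot)
}
cv.R2.check(lm.obj.1)   # should agree with cv.R2.func(lm.obj.1)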
# e)

# Fit a model with second order terms and interactions:
lm.obj.4 = lm(perm.log~area+peri+shape+I(area^2)+I(peri^2)+I(shape^2)
              +area:peri+area:shape+peri:shape, data=rock.data, x=TRUE, y=TRUE)
summary(lm.obj.4)

# Which variables seem to be most important?
# According to their t-values the variables are ordered as:
# area:peri > area^2 > peri > area > peri^2 > area:shape > peri:shape > shape^2 > shape

# Fit models according to this ordering, dropping the least important term each time:
lm.obj.5 = lm(perm.log~area+peri+I(area^2)+I(peri^2)+I(shape^2)+area:peri+area:shape+peri:shape,
              data=rock.data, x=TRUE, y=TRUE)
lm.obj.6 = lm(perm.log~area+peri+I(area^2)+I(peri^2)+area:peri+area:shape+peri:shape,
              data=rock.data, x=TRUE, y=TRUE)
lm.obj.7 = lm(perm.log~area+peri+I(area^2)+I(peri^2)+area:peri+area:shape,
              data=rock.data, x=TRUE, y=TRUE)
lm.obj.8 = lm(perm.log~area+peri+I(area^2)+I(peri^2)+area:peri,
              data=rock.data, x=TRUE, y=TRUE)
lm.obj.9 = lm(perm.log~area+peri+I(area^2)+area:peri, data=rock.data, x=TRUE, y=TRUE)
lm.obj.10 = lm(perm.log~peri+I(area^2)+area:peri, data=rock.data, x=TRUE, y=TRUE)
lm.obj.11 = lm(perm.log~I(area^2)+area:peri, data=rock.data, x=TRUE, y=TRUE)
lm.obj.12 = lm(perm.log~area:peri, data=rock.data, x=TRUE, y=TRUE)

# Compute the cross-validated R2 for the models:
cv.R2.func(lm.obj.5)
cv.R2.func(lm.obj.6)
cv.R2.func(lm.obj.7)
cv.R2.func(lm.obj.8)
cv.R2.func(lm.obj.9)
cv.R2.func(lm.obj.10)
cv.R2.func(lm.obj.11)
cv.R2.func(lm.obj.12)

# Which model gives the best prediction?

# f)

# Use the same "recipe" as in question e), but now with K-fold cross-validation.
# Which model gives the best prediction? Is it the same one as in question e)?

# K-fold CV function: returns the average test MSE over the K folds.
kFoldCV <- function(lm.formula, K=5, seed=123)
{
  set.seed(seed)   # fix the seed so the random fold assignment is reproducible
  n <- nrow(rock.data)
  # Assign each observation to one of the K folds at random.
  kFolds <- rep(1:K, length.out=n)
  kFolds <- sample(kFolds, size=n, replace=FALSE)
  MSEs <- numeric(K)
  for (k in 1:K)
  {
    ids <- which(kFolds==k)
    train.data <- rock.data[-ids, ]
    test.data <- rock.data[ids, ]
    lm.cv <- lm(lm.formula, data=train.data)
    yhat <- predict(lm.cv, newdata=test.data)
    MSEs[k] <- mean((test.data$perm.log-yhat)^2)
  }
  return(mean(MSEs))
}

# Collect the model formulas from question e) in a list, so that all models
# can be evaluated with a single sapply() call.
formulas.e <- list(
  perm.log~area+peri+I(area^2)+I(peri^2)+I(shape^2)+area:peri+area:shape+peri:shape,
  perm.log~area+peri+I(area^2)+I(peri^2)+area:peri+area:shape+peri:shape,
  perm.log~area+peri+I(area^2)+I(peri^2)+area:peri+area:shape,
  perm.log~area+peri+I(area^2)+I(peri^2)+area:peri,
  perm.log~area+peri+I(area^2)+area:peri,
  perm.log~peri+I(area^2)+area:peri,
  perm.log~I(area^2)+area:peri,
  perm.log~area:peri
)

# 5-fold CV for each of the models from question e):
sapply(formulas.e, kFoldCV)

# Now repeat the CV 10 times with different seeds and average the results,
# to reduce the Monte Carlo variability of the random fold assignment:
sapply(formulas.e, function(f) mean(sapply(1:10, function(i) kFoldCV(f, seed=i))))

# g)

# Make various plots of the residuals.
# See the previous exercise for help with the R commands.
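# A minimal sketch of standard residual diagnostics, here illustrated on
# lm.obj.9 (the choice of model is ours; use whichever model you settled
# on in questions e)-f)). plot() on an lm object gives the four standard
# diagnostic plots:
par(mfrow=c(2,2))
plot(lm.obj.9)   # residuals vs fitted, QQ-plot, scale-location, residuals vs leverage
par(mfrow=c(1,1))

# Plotting the residuals against each covariate can also be instructive:
par(mfrow=c(1,3))
plot(rock.data$area, residuals(lm.obj.9), ylab="residuals")
plot(rock.data$peri, residuals(lm.obj.9), ylab="residuals")
plot(rock.data$shape, residuals(lm.obj.9), ylab="residuals")
par(mfrow=c(1,1))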