### penalized regression ### download.file('https://www.uniklinik-freiburg.de/fileadmin/mediapool/08_institute/biometrie-statistik/Dateien/Studium_und_Lehre/Lehrbuecher/Multivariable_Model-building/edu_bodyfat_both.zip', 'dataEdu_BodyFat.zip') data <- read.csv(unz('dataEdu_BodyFat.zip', 'edu_bodyfat_both/edu_bodyfat/edu_bodyfat.csv'), header = TRUE, row.names = 1) system('rm dataEdu_BodyFat.zip') # check data summary(data) n <- nrow(data) # split in training and test sets # set.seed(20200304) train <- sample(n, 2*n/3, replace = FALSE) X.train <- data[train, -14] y.train <- data[train, 14] X.test <- data[-train, -14] y.test <- data[-train, 14] # manual standardization # p <- ncol(X.train) X.train.mean <- colMeans(X.train) X.train.sd <- apply(X.train, 2, sd) X.train.std <- scale(X.train, center = TRUE, scale = TRUE) X.test.std <- sapply(1:p, function(i, data, m, s) (data[,i] - m[i])/s[i], data = X.test, m = X.train.mean, s = X.train.sd) colnames(X.test.std) <- colnames(X.test) # ridge library(glmnet) mod.ridge <- glmnet(y = y.train, x = X.train.std, alpha = 0, standardize = FALSE) lambda.r <- cv.glmnet(y = y.train, x = X.train.std, alpha = 0, standardize = FALSE) c(mod.ridge$a0[mod.ridge$lambda == lambda.r$lambda.min], mod.ridge$beta[, mod.ridge$lambda == lambda.r$lambda.min]) # lasso mod.lasso <- glmnet(y = y.train, x = X.train.std, alpha = 1, standardize = FALSE) lambda.l <- cv.glmnet(y = y.train, x = X.train.std, alpha = 1, standardize = FALSE) plot(lambda.l) c(mod.lasso$a0[mod.lasso$lambda == lambda.l$lambda.min], mod.lasso$beta[, mod.lasso$lambda == lambda.l$lambda.min]) plot(mod.lasso) c(mod.lasso$a0[mod.lasso$lambda == lambda.l$lambda.1se], mod.lasso$beta[, mod.lasso$lambda == lambda.l$lambda.1se]) plot(lambda.r) # authomatic standardization # # ridge mod.ridge.a <- glmnet(y = y.train, x = as.matrix(X.train), alpha = 0, standardize = TRUE) lambda.r.a <- cv.glmnet(y = y.train, x = as.matrix(X.train), alpha = 0, standardize = TRUE) c(mod.ridge.a$a0[mod.ridge.a$lambda == lambda.r.a$lambda.min], mod.ridge.a$beta[, mod.ridge.a$lambda == lambda.r.a$lambda.min]) # lasso mod.lasso.a <- glmnet(y = y.train, x = as.matrix(X.train), alpha = 1, standardize = TRUE) lambda.l.a <- cv.glmnet(y = y.train, x = as.matrix(X.train), alpha = 1, standardize = TRUE) c(mod.lasso.a$a0[mod.lasso.a$lambda == lambda.l.a$lambda.min], mod.lasso.a$beta[, mod.lasso.a$lambda == lambda.l.a$lambda.min]) plot(lambda.l.a) plot(mod.lasso.a) # pcr library(pls) mod.pcr <- pcr(y.train ~ ., data = as.data.frame(X.train.std), validation = 'CV') summary(mod.pcr) # elastic.net findLambda <- function(x, y, a) { tmp <- cv.glmnet(x, y, alpha = a, standardize = TRUE) which.min(tmp$cvm) c(min(tmp$cvm), tmp$lambda.min) } cv <- sapply(seq(0, 1, by = 0.05), findLambda, y = y.train, x = as.matrix(X.train)) par <- list(alpha = NULL, lambda = NULL, choice = NULL) par$choice <- which.min(cv[1, ]) par$alpha <- seq(0, 1, by = 0.05)[par$choice] par$lambda <- cv[2, par$choice] mod.elnet <- glmnet(y = y.train, x = as.matrix(X.train), alpha = par$alpha, standardize = TRUE, lambda = par$lambda) round(rbind(mean(y.train), model.elnet$beta), 3) # train and test error # train.error <- list() train.error$ridge <- mean((y.train - predict(mod.ridge, newx = X.train.std))^2) train.error$lasso <- mean((y.train - predict(mod.lasso, newx = X.train.std))^2) train.error$pcr <- mean((y.train - predict(mod.pcr, newdata = as.data.frame(X.train.std), ncomp = 4))^2) train.error$elnet <- mean((y.train - predict(mod.elnet, newx = as.matrix(X.train)))^2) train.error$lasso.1se <- mean((y.train - predict(mod.lasso, newx = X.train.std, s = lambda.l$lambda.1se))^2) train.error$lasso.a <- mean((y.train - predict(mod.lasso.a, newx = as.matrix(X.train)))^2) test.error <- list() test.error$ridge <- mean((y.test - predict(mod.ridge, newx = X.test.std))^2) test.error$lasso <- mean((y.test - predict(mod.lasso, newx = X.test.std))^2) test.error$pcr <- mean((y.test - predict(mod.pcr, newdata = as.data.frame(X.test.std), ncomp = 4))^2) test.error$elnet <- mean((y.test - predict(mod.elnet, newx = as.matrix(X.test)))^2) test.error$lasso.1se <- mean((y.test - predict(mod.lasso, newx = X.test.std, s = lambda.l$lambda.1se))^2) test.error$lasso.a <- mean((y.test - predict(mod.lasso.a, newx = as.matrix(X.test)))^2) cbind(train.error, test.error)