# Solution proposal ESL 9.6 # Based on the solutions from STK-IN4300 found here: /studier/emner/matnat/math/STK-IN4300/h24/exercises.html library(MASS) library(class) library(gam) library(rpart) rm(list = ls()) fil= "https://hastie.su.domains/ElemStatLearn/datasets/ozone.data" ozonedata = read.table(fil,header=T) summary(ozonedata) str(ozonedata) #111 observations pairs(ozonedata) head(ozonedata) sum(is.na(ozonedata)) # Description # Data frame with components: ozone, radiation, temperature, and wind. # Measurements of daily ozone concentration (ppb), wind speed (mph), # daily maximum temperature (degrees F), and solar radiation # (langleys) on 111 days from May to September 1973 in New York. This # data frame is similar to air in SPLUS (or library(data) in S), but # has a different definition for ozone (air contains cube-roots of # ozone). train.set=sample(1:nrow(ozonedata),size=floor(nrow(ozonedata)*0.6),replace=F) test.set=(1:nrow(ozonedata))[!(1:nrow(ozonedata)%in%train.set)] #Note that gam applies 4 degrees of freedom by default ozonedata$y<-ozonedata$ozone^(1/3) fit.gam = gam(y~s(temperature)+s(radiation)+s(wind),data=ozonedata[train.set,],family=gaussian) summary(fit.gam) # plot marginal effect of radiation, for comparison with figure 6.9 # - as in figure 6.9 the ozone concentration is an increasing function for lower levels of radiation. # - note that this plot, in contrast to 6.9, is not conditioned on the wind/temperature level. plot(fit.gam, se = T, terms = c("s(radiation)")) # plot marginal effects of other covariates plot(fit.gam, se = T, terms = c("s(temperature)")) plot(fit.gam, se = T, terms = c("s(wind)")) # re-fit the gam-model with a spline with 3 degrees of freedom for temperature fit.gam2 = gam(y~s(temperature,df=3)+s(radiation)+s(wind),data=ozonedata[train.set,],family=gaussian) summary(fit.gam2) AIC(fit.gam) AIC(fit.gam2) BIC(fit.gam) BIC(fit.gam2) # plot predicted against the true values plot(fit.gam2$y, fit.gam2$fitted.values, xlab="y", ylab="y_pred", main="Predictions vs response", col="blue") lines(c(min(fit.gam2$y),max(fit.gam2$y)), c(min(fit.gam2$y),max(fit.gam2$y))) points(ozonedata[test.set,"y"], predict(fit.gam2,newdata = ozonedata[test.set,]), col="darkgreen", pch=7) legend("bottomright", legend=c("Training","Test"), pch=c(1,7), col=c("blue","darkgreen")) # (b) fit (a single) regression tree, PRIM and MARS # regression tree ---- ?rpart fit.tree <- rpart(y ~ radiation+temperature+wind, data = ozonedata[train.set,], method = 'anova') plot(fit.tree, uniform = TRUE, margin = 0.1) # Draws the tree structure text(fit.tree) #