#-------------------------------------------------------------------------------------------------------------
# 2.1 Remove cubic term ----
auto <- read.table("http://azzalini.stat.unipd.it/Book-DM/auto.dat", header = TRUE)
summary(auto)
head(auto)
bodystyle = auto$bodystyle
bodystyle
attach(auto) # make each column available as a variable in the global environment

# convert fuel from string to factor
fuel1 <- factor(fuel, levels = c("gas", "diesel"))

# help(functionname) or ?functionname opens the documentation
help(lm)

# fit the full model (cubic polynomial in engine size, plus fuel type)
fit_orig <- lm(city.distance ~ engine.size + I(engine.size^2) + I(engine.size^3) + fuel1)
summary(fit_orig)

# fit the model without the cubic term
fit_new <- lm(city.distance ~ engine.size + I(engine.size^2) + fuel1)
print(summary(fit_new))

# Both the estimate and the standard error of the (engine size)^2 coefficient decrease,
# and its p-value becomes smaller. This indicates that the quadratic and cubic terms
# were largely carrying the same information.

#-------------------------------------------------------------------------------------------------------------
# 2.2 Extrapolate -----
x = seq(1, 7, 0.1)
x
idx = fuel1 == 'gas'
plot(engine.size[idx], city.distance[idx], xlim = c(1, 7), ylim = c(0, 22))
beta <- coef(fit_orig)
lines(x, beta[1] + beta[2]*x + beta[3]*x^2 + beta[4]*x^3, col = 2, lty = 1, lwd = 2)
# The extrapolation doesn't seem hopeless, though it's hard to say whether it is reasonable.
# As explained in the book, the two-cylinder cars behave differently. We also have some
# problems estimating at low engine sizes.

## two-cylinder cars are outliers
points(engine.size[idx & n.cylinders == 2], city.distance[idx & n.cylinders == 2], col = "red")

## plot the regression line of the smaller model
beta <- coef(fit_new)
lines(x, beta[1] + beta[2]*x + beta[3]*x^2, col = "blue", lty = 1, lwd = 2)

# --- another way to get predictions -----
help(predict.lm)
only_gas = factor(rep('gas', length(x)), levels = levels(fuel1))
new_x = data.frame(x, only_gas)
colnames(new_x) = c("engine.size", "fuel1")
head(new_x)
preds = predict(fit_orig, new_x)
beta_orig <- coef(fit_orig) # beta currently holds coef(fit_new), so refetch the cubic fit's coefficients
preds2 = beta_orig[1] + beta_orig[2]*x + beta_orig[3]*x^2 + beta_orig[4]*x^3
all.equal(unname(preds), preds2) # equal up to floating-point rounding

#-------------------------------------------------------------------------------------------------------------
# 2.3 R-squared and transformations ----
y = 1/city.distance # transform distance per litre into litres per unit distance (fuel consumption)
fit_consumption <- lm(y ~ engine.size + fuel1)
summary(fit_consumption) # R-squared: 0.64

# compute R-squared on the original scale
r2 <- 1 - sum((city.distance - 1/fitted(fit_consumption))^2) / ((length(city.distance)-1)*var(city.distance))
r2 # R-squared: 0.56

# R-squared of the cubic model
summary(fit_orig)$r.squared
# R-squared of the second-order polynomial
summary(fit_new)$r.squared

# We are applying a non-linear transformation to the response, so the R^2 values are not
# directly comparable. To compare the models we have to compute R^2 on the same scale
# (that of the original response); doing so shows that the untransformed (polynomial)
# models still fit better. (A small helper for this comparison is sketched at the end
# of this section.)

# add the regression line of the transformed model to the plot
beta <- coefficients(fit_consumption)
pred <- 1/(beta[1] + beta[2]*x) # distance per litre for gas cars
lines(x, pred, col = "green")
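# As noted above, a minimal sketch of the same-scale comparison: a helper that computes
# R^2 on the original city.distance scale for each fit. This assumes fit_orig, fit_new and
# fit_consumption from above are still in the workspace; the name r2_original and the
# inverse argument are illustrative choices, not anything defined in the book.
r2_original <- function(fit, inverse = FALSE) {
  yhat <- if (inverse) 1/fitted(fit) else fitted(fit) # back-transform fits made on 1/y
  1 - sum((city.distance - yhat)^2) / sum((city.distance - mean(city.distance))^2)
}
r2_original(fit_orig)                        # cubic polynomial
r2_original(fit_new)                         # quadratic polynomial
r2_original(fit_consumption, inverse = TRUE) # consumption model, back-transformed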
# ------------------------------------------------------------------------------
# 2.4 Regression or classification? ----
"(a) regression. inference. quantitative output of CEO salary based on firm characteristics.
    n - 500 firms in the US
    p - profit, number of employees, industry
(b) classification. prediction. predicting a new product's success or failure.
    n - 20 similar products previously launched
    p - price charged, marketing budget, competition price, ten other variables
(c) regression. prediction. quantitative output of the % change in the USD/Euro exchange rate.
    n - 52 weeks of weekly data for 2012
    p - % change in US market, % change in British market, % change in German market"

#-------------------------------------------------------------------------------------------------------------
# 2.5 College data ----
college <- read.csv("https://www.statlearning.com/s/College.csv", stringsAsFactors = T)

# (b)
fix(college)
rownames(college) = college[, 1]
college = college[, -1]
fix(college)

# (c)
# i.
summary(college)

# ii.
pairs(college[, 1:10])

# iii.
plot(college$Private, college$Outstate)

# iv.
Elite = rep("No", nrow(college))
Elite[college$Top10perc > 50] = "Yes"
Elite = as.factor(Elite)
college = data.frame(college, Elite)
summary(college$Elite)
plot(college$Elite, college$Outstate)

# v.
par(mfrow = c(2, 2))
hist(college$Apps)
hist(college$perc.alumni, col = 2)
hist(college$S.F.Ratio, col = 3, breaks = 10)
hist(college$Expend, breaks = 100)

# vi.
par(mfrow = c(1, 1))
plot(college$Outstate, college$Grad.Rate)
# High tuition correlates with a high graduation rate.
plot(college$Accept / college$Apps, college$S.F.Ratio)
# Colleges with a low acceptance rate tend to have a low student-to-faculty ratio.
plot(college$Top10perc, college$Grad.Rate)
# Colleges with the most students from the top 10% of their class don't necessarily have
# the highest graduation rate. Also, a graduation rate above 100% is clearly a data error!
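# To put rough numbers behind the plot-based observations above, one could also look at the
# correlations directly. A minimal sketch, using only the College.csv columns already loaded;
# the sign of each correlation is what supports (or refutes) the corresponding remark.
cor(college$Outstate, college$Grad.Rate)              # positive: higher out-of-state tuition goes with higher graduation rates
cor(college$Accept / college$Apps, college$S.F.Ratio) # a positive value means more selective schools tend to have lower S:F ratios
sum(college$Grad.Rate > 100)                          # count the erroneous graduation rates above 100%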