#-------------------------------------------------------------------------------------------------------------
# 2.1 Remove cubic term ----
auto <- read.table("http://azzalini.stat.unipd.it/Book-DM/auto.dat", header = TRUE)
summary(auto)
head(auto)
bodystyle = auto$bodystyle
bodystyle
attach(auto) # make each column available as a variable in the global environment

# convert fuel from string to factor
fuel1 <- factor(fuel, levels = c("gas", "diesel"))

# help(functionname) or ?functionname opens the documentation
help(lm)

# fit the full model (cubic polynomial in engine size, plus fuel type)
fit_orig <- lm(city.distance ~ engine.size + I(engine.size^2) + I(engine.size^3) + fuel1)
summary(fit_orig)

# fit the model without the cubic term
fit_new <- lm(city.distance ~ engine.size + I(engine.size^2) + fuel1)
print(summary(fit_new))

# Both the estimate and the standard error of the (engine size)^2 coefficient decrease,
# and its p-value becomes smaller. This indicates that the quadratic and cubic terms
# were largely carrying the same information.

#-------------------------------------------------------------------------------------------------------------
# 2.2 Extrapolate -----
x = seq(1, 7, 0.1)
x
idx = fuel1 == 'gas'
plot(engine.size[idx], city.distance[idx], xlim = c(1, 7), ylim = c(0, 22))
beta <- coef(fit_orig)
lines(x, beta[1] + beta[2]*x + beta[3]*x^2 + beta[4]*x^3, col = 2, lty = 1, lwd = 2)
# The extrapolation doesn't seem hopeless, though it's hard to say whether it is reasonable.
# As explained in the book, the two-cylinder cars behave differently. We also have some
# problems estimating at low engine sizes.

## two-cylinder cars are outliers
points(engine.size[idx & n.cylinders == 2], city.distance[idx & n.cylinders == 2], col = "red")

## plot the regression line of the smaller model
beta <- coef(fit_new)
lines(x, beta[1] + beta[2]*x + beta[3]*x^2, col = "blue", lty = 1, lwd = 2)

# --- another way to get predictions -----
help(predict.lm)
only_gas = factor(rep('gas', length(x)), levels = levels(fuel1))
new_x = data.frame(x, only_gas)
colnames(new_x) = c("engine.size", "fuel1")
head(new_x)
preds = predict(fit_orig, new_x)
beta_orig <- coef(fit_orig) # beta currently holds coef(fit_new), so refetch the cubic fit's coefficients
preds2 = beta_orig[1] + beta_orig[2]*x + beta_orig[3]*x^2 + beta_orig[4]*x^3
all.equal(unname(preds), preds2) # equal up to floating-point rounding

#-------------------------------------------------------------------------------------------------------------
# 2.3 R-squared and transformations ----
y = 1/city.distance # transform distance per litre into litres per unit distance (fuel consumption)
fit_consumption <- lm(y ~ engine.size + fuel1)
summary(fit_consumption) # R-squared: 0.64

# compute R-squared on the original scale
r2 <- 1 - sum((city.distance - 1/fitted(fit_consumption))^2) / ((length(city.distance)-1)*var(city.distance))
r2 # R-squared: 0.56

# R-squared of the cubic model
summary(fit_orig)$r.squared
# R-squared of the second-order polynomial
summary(fit_new)$r.squared

# We are applying a non-linear transformation to the response, so the R^2 values are not
# directly comparable. To compare the models we have to compute R^2 on the same scale
# (that of the original response); doing so shows that the untransformed (polynomial)
# models still fit better. (A small helper for this comparison is sketched at the end
# of this section.)

# add the regression line of the transformed model to the plot
beta <- coefficients(fit_consumption)
pred <- 1/(beta[1] + beta[2]*x) # distance per litre for gas cars
lines(x, pred, col = "green")
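# As noted above, a minimal sketch of the same-scale comparison: a helper that computes
# R^2 on the original city.distance scale for each fit. This assumes fit_orig, fit_new and
# fit_consumption from above are still in the workspace; the name r2_original and the
# inverse argument are illustrative choices, not anything defined in the book.
r2_original <- function(fit, inverse = FALSE) {
  yhat <- if (inverse) 1/fitted(fit) else fitted(fit) # back-transform fits made on 1/y
  1 - sum((city.distance - yhat)^2) / sum((city.distance - mean(city.distance))^2)
}
r2_original(fit_orig)                        # cubic polynomial
r2_original(fit_new)                         # quadratic polynomial
r2_original(fit_consumption, inverse = TRUE) # consumption model, back-transformed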
# ------------------------------------------------------------------------------
# 2.4 Regression or classification? ----
"(a) regression. inference. quantitative output of CEO salary based on firm characteristics.
    n - 500 firms in the US
    p - profit, number of employees, industry
(b) classification. prediction. predicting a new product's success or failure.
    n - 20 similar products previously launched
    p - price charged, marketing budget, competition price, ten other variables
(c) regression. prediction. quantitative output of the % change in the USD/Euro exchange rate.
    n - 52 weeks of weekly data for 2012
    p - % change in US market, % change in British market, % change in German market"

#-------------------------------------------------------------------------------------------------------------
# 2.5 College data ----
college <- read.csv("https://www.statlearning.com/s/College.csv", stringsAsFactors = T)

# (b)
fix(college)
rownames(college) = college[, 1]
college = college[, -1]
fix(college)

# (c)
# i.
summary(college)

# ii.
pairs(college[, 1:10])

# iii.
plot(college$Private, college$Outstate)

# iv.
Elite = rep("No", nrow(college))
Elite[college$Top10perc > 50] = "Yes"
Elite = as.factor(Elite)
college = data.frame(college, Elite)
summary(college$Elite)
plot(college$Elite, college$Outstate)

# v.
par(mfrow = c(2, 2))
hist(college$Apps)
hist(college$perc.alumni, col = 2)
hist(college$S.F.Ratio, col = 3, breaks = 10)
hist(college$Expend, breaks = 100)

# vi.
par(mfrow = c(1, 1))
plot(college$Outstate, college$Grad.Rate)
# High tuition correlates with a high graduation rate.
plot(college$Accept / college$Apps, college$S.F.Ratio)
# Colleges with a low acceptance rate tend to have a low student-to-faculty ratio.
plot(college$Top10perc, college$Grad.Rate)
# Colleges with the most students from the top 10% of their class don't necessarily have
# the highest graduation rate. Also, a graduation rate above 100% is clearly a data error!
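# To put rough numbers behind the plot-based observations above, one could also look at the
# correlations directly. A minimal sketch, using only the College.csv columns already loaded;
# the sign of each correlation is what supports (or refutes) the corresponding remark.
cor(college$Outstate, college$Grad.Rate)              # positive: higher out-of-state tuition goes with higher graduation rates
cor(college$Accept / college$Apps, college$S.F.Ratio) # a positive value means more selective schools tend to have lower S:F ratios
sum(college$Grad.Rate > 100)                          # count the erroneous graduation rates above 100%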