#-------------------------------------------------------------------------------------------------------------
# 2.7
# Predict highway.distance
# (a): Using variables from chapter: engine.size, fuel, cylinders, curb.weight, city.distance
# (b): Using variables listed in Appendix B.2
# Note: This is an exercise where you should experiment a bit, so there is not one correct solution. This is
# just one way to do it.

auto <- read.table("http://azzalini.stat.unipd.it/Book-DM/auto.dat", header = TRUE)
summary(auto)
head(auto)

bodystyle <- auto$bodystyle
bodystyle

attach(auto)
fuel1 <- factor(fuel, levels = c("gas", "diesel"))
cylinders2 <- factor(n.cylinders == 2)

fit_a <- lm(log(highway.distance) ~ log(engine.size) + fuel1 + cylinders2 + log(curb.weight))
print(summary(fit_a))  # R-squared = 0.86

fit_b <- lm(log(highway.distance) ~ log(engine.size) + fuel1 + cylinders2 + log(curb.weight) +
              factor(brand) + factor(aspiration) + factor(bodystyle) + factor(drive.wheels) +
              factor(engine.location) + wheel.base + length + width + height +
              compression.ratio + log(HP) + peak.rot)
print(summary(fit_b))  # R-squared = 0.92

# t-statistic ------------------------------------------------------------------

# simulate data
set.seed(20)
x <- rnorm(100)
y <- 2 * x + rnorm(100)

## a) regress y on x (without an intercept)
fit <- lm(y ~ x + 0)
summary(fit)
# betahat = 1.99
# sd(betahat) = 0.11
# t-value = 18.73
# Comment:
# The estimated value is close to the true value (2) and highly significant (p < .001).

## b) regress x on y (without an intercept)
fit <- lm(x ~ y + 0)
summary(fit)
# betahat = 0.39
# sd(betahat) = 0.02
# t-value = 18.73
# Comment:
# The estimated coefficient is low compared to 1/2, the value implied by inverting y = 2x,
# but highly significant (p < .001).
# The observed t-statistic is the same as above.

## c)
# The error terms in the two regressions are dependent: inverting y = 2x + e gives
# x = y/2 - e/2, so when regressing x on y the error is correlated with the regressor y.
# This causes a downward bias in the estimated coefficient.
# The t-statistic is nevertheless identical in the two regressions: for regression
# through the origin it can be written as
#   t = sqrt(n - 1) * sum(x * y) / sqrt(sum(x^2) * sum(y^2) - (sum(x * y))^2),
# which is symmetric in x and y.

## f) include an intercept in the model and show that the t-value is still the same in both regressions
fit <- lm(y ~ x)  # the model now includes an intercept
summary(fit)      # t-value of beta1 = 18.56

fit <- lm(x ~ y)  # the model now includes an intercept
summary(fit)      # t-value of beta1 = 18.56

# Collinearity ---------------------------------------------------------------------

## a) simulate data
set.seed(1)
x1 <- runif(100)
x2 <- 0.5 * x1 + rnorm(100) / 10
y <- 2 + 2 * x1 + 0.3 * x2 + rnorm(100)
# regression model: y = beta0 + beta1*x1 + beta2*x2 + e,
# where beta0 = 2, beta1 = 2, beta2 = .3, and e is normal with mean zero

## b)
cor(x1, x2)   # compute the correlation
plot(x1, x2)  # scatter plot

## c)
fit <- lm(y ~ x1 + x2)
summary(fit)
# Comment:
# The estimated coefficients deviate quite a bit from their true values.
# In particular, beta1 is underestimated and beta2 overestimated.
# We reject the null of beta1 = 0 at the 5% significance level (p-value is 0.049).
# We cannot reject the null of beta2 = 0 at any conventional level (p-value is 0.38).
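
# Not part of the original answer: a minimal sketch quantifying the collinearity in (c)
# via the variance inflation factor, VIF = 1 / (1 - R^2), where R^2 comes from regressing
# one predictor on the other (with two predictors the VIF is the same for x1 and x2).
# With a correlation of roughly 0.8 (see (b)), R^2 is about 0.7, so the coefficient
# standard errors are inflated by roughly sqrt(3) relative to uncorrelated predictors.
r2_x2_on_x1 <- summary(lm(x2 ~ x1))$r.squared  # R^2 of x2 regressed on x1
1 / (1 - r2_x2_on_x1)                          # variance inflation factor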
## d)
fit <- lm(y ~ x1)  # fit model including intercept and x1 only
summary(fit)
# Comment:
# The estimated coefficient is now closer to its true value and its p-value is lower.
# This is consistent with x2 being a (noisy) function of x1:
# E[Y|x1] = 2 + 2*x1 + .3*E[X2|x1] = 2 + (2 + .3*.5)*x1 = 2 + 2.15*x1

## e)
fit <- lm(y ~ x2)  # fit model including intercept and x2 only
summary(fit)
# Comment:
# The estimated coefficient is strongly overestimated (it also picks up the effect of
# the omitted x1), but it is significant.

## f)
# Yes. The p-value of beta_1 drops when we exclude x2, and beta_2 becomes
# significant when we exclude x1.
# Including highly correlated variables drives up the estimated variance of the
# coefficients, while leaving one of them out leads to omitted-variable bias.

## g) add an additional observation
x1 <- c(x1, 0.1)
x2 <- c(x2, 0.8)
y <- c(y, 6)

cor(x1, x2)
plot(x1, y)
points(.1, 6, col = "red")

# re-fit the models
fit <- lm(y ~ x1 + x2)
summary(fit)

fit <- lm(y ~ x1)
summary(fit)

fit <- lm(y ~ x2)
summary(fit)
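
# Not part of the original answer: a short diagnostic sketch for (g), checking whether the
# added observation (index 101) is an outlier and/or a high-leverage point in the model with
# both predictors. hatvalues() gives the leverages and rstudent() the studentized residuals;
# a leverage well above the average (p + 1)/n = 3/101 suggests high leverage, and a
# studentized residual beyond roughly +/- 3 suggests an outlier.
fit_both <- lm(y ~ x1 + x2)  # refit with both predictors ("fit_both" is just an illustrative name)
hatvalues(fit_both)[101]     # leverage of the added point
rstudent(fit_both)[101]      # studentized residual of the added point
plot(hatvalues(fit_both), rstudent(fit_both),
     xlab = "leverage", ylab = "studentized residual")
points(hatvalues(fit_both)[101], rstudent(fit_both)[101], col = "red")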