#-------------------------------------------------------------------------------------------------------------
# 2.7
# Predict highway.distance
# (a): Using variables from chapter: engine.size, fuel, cylinders, curb.weight, city.distance
# (b): Using variables listed in Appendix B.2
# Note: This is an exercise where you should experiment a bit, so there is not one correct solution. This is
# just one way to do it.

auto <- read.table("http://azzalini.stat.unipd.it/Book-DM/auto.dat", header = TRUE)
summary(auto)
head(auto)

bodystyle <- auto$bodystyle
bodystyle

attach(auto)
fuel1 <- factor(fuel, levels = c("gas", "diesel"))
cylinders2 <- factor(n.cylinders == 2)

fit_a <- lm(log(highway.distance) ~ log(engine.size) + fuel1 + cylinders2 + log(curb.weight))
print(summary(fit_a))  # R-squared = 0.86

fit_b <- lm(log(highway.distance) ~ log(engine.size) + fuel1 + cylinders2 + log(curb.weight) +
              factor(brand) + factor(aspiration) + factor(bodystyle) + factor(drive.wheels) +
              factor(engine.location) + wheel.base + length + width + height +
              compression.ratio + log(HP) + peak.rot)
print(summary(fit_b))  # R-squared = 0.92

# t-statistic ------------------------------------------------------------------

# simulate data
set.seed(20)
x <- rnorm(100)
y <- 2 * x + rnorm(100)

## a) regress y on x (without an intercept)
fit <- lm(y ~ x + 0)
summary(fit)
# betahat = 1.99
# sd(betahat) = 0.11
# t-value = 18.73
# Comment:
# The estimated value is close to the true value (2) and highly significant (p < .001).

## b) regress x on y (without an intercept)
fit <- lm(x ~ y + 0)
summary(fit)
# betahat = 0.39
# sd(betahat) = 0.02
# t-value = 18.73
# Comment:
# The estimated coefficient is low compared to 1/2, the value implied by inverting y = 2x,
# but highly significant (p < .001).
# The observed t-statistic is the same as above.

## c)
# The error terms in the two regressions are dependent: inverting y = 2x + e gives
# x = y/2 - e/2, so when regressing x on y the error is correlated with the regressor y.
# This causes a downward bias in the estimated coefficient.
# The t-statistic is nevertheless identical in the two regressions: for regression
# through the origin it can be written as
#   t = sqrt(n - 1) * sum(x * y) / sqrt(sum(x^2) * sum(y^2) - (sum(x * y))^2),
# which is symmetric in x and y.

## f) include an intercept in the model and show that the t-value is still the same in both regressions
fit <- lm(y ~ x)  # the model now includes an intercept
summary(fit)      # t-value of beta1 = 18.56

fit <- lm(x ~ y)  # the model now includes an intercept
summary(fit)      # t-value of beta1 = 18.56

# Collinearity ---------------------------------------------------------------------

## a) simulate data
set.seed(1)
x1 <- runif(100)
x2 <- 0.5 * x1 + rnorm(100) / 10
y <- 2 + 2 * x1 + 0.3 * x2 + rnorm(100)
# regression model: y = beta0 + beta1*x1 + beta2*x2 + e,
# where beta0 = 2, beta1 = 2, beta2 = .3, and e is normal with mean zero

## b)
cor(x1, x2)   # compute the correlation
plot(x1, x2)  # scatter plot

## c)
fit <- lm(y ~ x1 + x2)
summary(fit)
# Comment:
# The estimated coefficients deviate quite a bit from their true values.
# In particular, beta1 is underestimated and beta2 overestimated.
# We reject the null of beta1 = 0 at the 5% significance level (p-value is 0.049).
# We cannot reject the null of beta2 = 0 at any conventional level (p-value is 0.38).
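
# Not part of the original answer: a minimal sketch quantifying the collinearity in (c)
# via the variance inflation factor, VIF = 1 / (1 - R^2), where R^2 comes from regressing
# one predictor on the other (with two predictors the VIF is the same for x1 and x2).
# With a correlation of roughly 0.8 (see (b)), R^2 is about 0.7, so the coefficient
# standard errors are inflated by roughly sqrt(3) relative to uncorrelated predictors.
r2_x2_on_x1 <- summary(lm(x2 ~ x1))$r.squared  # R^2 of x2 regressed on x1
1 / (1 - r2_x2_on_x1)                          # variance inflation factor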
## d)
fit <- lm(y ~ x1)  # fit model including intercept and x1 only
summary(fit)
# Comment:
# The estimated coefficient is now closer to its true value and its p-value is lower.
# This is consistent with x2 being a (noisy) function of x1:
# E[Y|x1] = 2 + 2*x1 + .3*E[X2|x1] = 2 + (2 + .3*.5)*x1 = 2 + 2.15*x1

## e)
fit <- lm(y ~ x2)  # fit model including intercept and x2 only
summary(fit)
# Comment:
# The estimated coefficient is strongly overestimated (it also picks up the effect of
# the omitted x1), but it is significant.

## f)
# Yes. The p-value of beta_1 drops when we exclude x2, and beta_2 becomes
# significant when we exclude x1.
# Including highly correlated variables drives up the estimated variance of the
# coefficients, while leaving one of them out leads to omitted-variable bias.

## g) add an additional observation
x1 <- c(x1, 0.1)
x2 <- c(x2, 0.8)
y <- c(y, 6)

cor(x1, x2)
plot(x1, y)
points(.1, 6, col = "red")

# re-fit the models
fit <- lm(y ~ x1 + x2)
summary(fit)

fit <- lm(y ~ x1)
summary(fit)

fit <- lm(y ~ x2)
summary(fit)
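
# Not part of the original answer: a short diagnostic sketch for (g), checking whether the
# added observation (index 101) is an outlier and/or a high-leverage point in the model with
# both predictors. hatvalues() gives the leverages and rstudent() the studentized residuals;
# a leverage well above the average (p + 1)/n = 3/101 suggests high leverage, and a
# studentized residual beyond roughly +/- 3 suggests an outlier.
fit_both <- lm(y ~ x1 + x2)  # refit with both predictors ("fit_both" is just an illustrative name)
hatvalues(fit_both)[101]     # leverage of the added point
rstudent(fit_both)[101]      # studentized residual of the added point
plot(hatvalues(fit_both), rstudent(fit_both),
     xlab = "leverage", ylab = "studentized residual")
points(hatvalues(fit_both)[101], rstudent(fit_both)[101], col = "red")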