# STK 4900 # Exercise 13 # Clean up the memory before we start. rm(list=ls(all=TRUE)) # a) # Read data. salinity.data = read.table(file="http://www.uio.no/studier/emner/matnat/math/STK4900/data/exer5.dat", header=FALSE) colnames(salinity.data)=c("salt","saltprev","trend","discharge") # Take a look at the data. salinity.data # Get an overview of the data: summary(salinity.data) # Create a scatter plot matrix. plot(salinity.data) # Make sure that you understand what the summary measures tell you! # What do you see from the scatter plots? # Fit multiple linear regression with all three covariates. lm.obj = lm(salt~saltprev+trend+discharge, data=salinity.data) summary(lm.obj) # How important are each of the covariates? How does this agree with the scatter plots? # b) # Extract fitted values and residuals, and give a summary of the residuals: y.hat = lm.obj$fit residual.val = lm.obj$res # Check salinity.data[,"salt"] - (y.hat + residual.val) # Why are these 0? # c) # We will make various plots of the residuals # (i) Checking normality par(mfrow=c(1,2)) # Histogram and Q-Q plot (make one plot at a time) hist(residual.val) # Q-Q plot qqnorm(residual.val) qqline(residual.val) par(mfrow=c(1,1)) plot(lm.obj) # What do the plots tell you? # (ii) Checking homoscedasticy # Plot of residuals versus fitted values: plot(y.hat, residual.val, xlab="Fitted values", ylab="Residuals") # What does the plot tell you? # (iii) Checking linearity # We will make CPR plots for each of the predictors # We use the "car" library (package) library(car) crPlots(lm.obj, terms=~.) # What do the plots tell you? Are there indications of deviation from linearity? # (Note that the purple line in the plots may be influenced quite a lot by a few observations, # so one should be careful not to overinterpret the plots) # (iv) Studying influence # We will make boxplots of the standardized dfbetas to check if some # observations have a large influence of the estimates of the fitted model. boxplot(dfbetas(lm.obj)[,-1]) # You will find that one observation is very influential for the estimated effect of "discharge". # A simple way of identifying this observations is to just list # the observations together with the standardized defbetas # for "discharge" (which is the 4th parameter when we also count the intercept) cbind(salinity.data, dfbetas(lm.obj)[,4]) # Which observation is the influential one? # A more "advanced" way to identify the influential observation, is to use # the identify-command (cf. the slides from the lectures). # If time allows you may try the following commands for identifying the observation # that is influential for the estimate of "discharge" db = dfbeta(lm.obj) #Select column discharge j = 4 boxplot(db[,j]) which.max(db[,j]) # Or we can sort the dataframes with values in decreasing order! # d) # Try fitting a model with a second degree term for discharge lm.obj.2 = lm(salt~saltprev+trend+discharge+I(discharge^2), data=salinity.data) summary(lm.obj.2) # Check the model assumptions for this model. Is it improved comare to the model without a second degree term? crPlots(lm.obj.2) # Try yourself other models (e.g. with interactions and/or more second order terms) # Which model would you suggest to use for predicting salinity? lm.obj.3 = lm(salt~saltprev+trend+discharge+I(discharge^2), data=salinity.data[-16,]) summary(lm.obj.3) # Check the model assumptions for this model. Is it improved comare to the model without a second degree term? crPlots(lm.obj.3)