# STK 4900
# Week 1, Exercise 9

# Clean up the workspace before we start. This removes every object in the
# global environment, including hidden ones whose names start with a dot.
# The argument is spelled out as 'all.names' instead of relying on partial
# argument matching.
rm(list = ls(all.names = TRUE))

# Read the data into a data frame. The file is read without a header row, so
# the variables are given names explicitly afterwards.
insurance.data <- read.table(
  "http://www.uio.no/studier/emner/matnat/math/STK4900/data/exer3_2.dat",
  header = FALSE
)
names(insurance.data) <- c("income", "risk.avs", "amount")

# Take a look at the data. print() is called explicitly so that the output
# also appears when the script is run with source(), which does not auto-print
# top-level expressions by default.
print(insurance.data)
print(summary(insurance.data))
# Make sure that you understand what the summary measures tell you!

# Create scatter plots between 'amount' and other variables.

# Switch to a 1 by 2 grid of plots. par() returns the previous setting, which
# is saved so the layout can be restored afterwards instead of assuming that
# it was 1 by 1.
old.par <- par(mfrow = c(1, 2))
# Scatter plot: 'amount' vs 'income'
plot(
  x = insurance.data[["income"]],
  y = insurance.data[["amount"]],
  xlab = "Annual income",
  ylab = "Amount of life insurance",
  main = ""
)
# Scatter plot: 'amount' vs 'risk.avs'
plot(
  x = insurance.data[["risk.avs"]],
  y = insurance.data[["amount"]],
  xlab = "Risk aversion score",
  ylab = "Amount of life insurance",
  main = ""
)
# Restore the plot layout that was active before this section.
par(old.par)

# Alternative: scatter plot matrix of all pairs of variables.
plot(insurance.data)


# What do the plots tell you?


# Compute the pairwise correlations between the variables. print() is called
# explicitly so the result is also shown when the script is run with source().
print(cor(insurance.data))
# Do the correlations agree with what you saw from the plots?

# Carry out univariate regression analysis of 'amount' versus each of the
# other two variables. The summaries are printed explicitly so they are also
# shown when the script is run with source().

# Model 1: 'amount' explained by 'income' alone.
lm.obj.1 <- lm(amount ~ income, data = insurance.data)
print(summary(lm.obj.1))

# Model 2: 'amount' explained by 'risk.avs' alone.
lm.obj.2 <- lm(amount ~ risk.avs, data = insurance.data)
print(summary(lm.obj.2))

# Which of the two variables, income or risk aversion, is more important for explaining the variation in the amount of life insurance carried?
# Does either variable (alone) have a significant effect?

# Carry out a regression analysis including both income and risk aversion as
# covariates. The summary is printed explicitly so it is also shown when the
# script is run with source().
lm.obj.3 <- lm(amount ~ income + risk.avs, data = insurance.data)
print(summary(lm.obj.3))

# What does this model tell you? Does it look better than the best of the two models with only one covariate?
# Try fitting models with second-order terms for income and/or risk aversion
# yourself. I() is needed so that '^' is treated as arithmetic squaring rather
# than as a formula operator. The summaries are printed explicitly so they are
# also shown when the script is run with source().

# Quadratic model in 'income'.
lm.obj.1.poly <- lm(amount ~ income + I(income^2), data = insurance.data)
print(summary(lm.obj.1.poly))

# Quadratic model in 'risk.avs'.
lm.obj.2.poly <- lm(amount ~ risk.avs + I(risk.avs^2), data = insurance.data)
print(summary(lm.obj.2.poly))

# Evaluate the quadratic risk-aversion model on a fine grid of 1000 points
# that extends 1 beyond the observed range of 'risk.avs' on both sides, so
# that the fitted curve can be drawn as a smooth line.
x.grid <- seq(
  min(insurance.data$risk.avs) - 1,
  max(insurance.data$risk.avs) + 1,
  length.out = 1000
)
# Call the predict() generic rather than the predict.lm method directly; it
# dispatches to the lm method for this model object.
y.predicted <- predict(lm.obj.2.poly, newdata = data.frame(risk.avs = x.grid))

# Switch to a 1 by 2 grid of plots, saving the previous layout so it can be
# restored afterwards instead of assuming that it was 1 by 1.
old.par <- par(mfrow = c(1, 2))
# Scatter plot: 'amount' vs 'income', with the fitted straight line from the
# simple linear model in red.
plot(
  x = insurance.data[["income"]],
  y = insurance.data[["amount"]],
  xlab = "Annual income",
  ylab = "Amount of life insurance",
  main = ""
)
abline(lm.obj.1, col = "red")
# Scatter plot: 'amount' vs 'risk.avs', with the fitted quadratic curve in
# blue.
plot(
  x = insurance.data[["risk.avs"]],
  y = insurance.data[["amount"]],
  xlab = "Risk aversion score",
  ylab = "Amount of life insurance",
  main = ""
)
lines(
  x = x.grid,
  y = y.predicted,
  col = "blue"
)
# Restore the plot layout that was active before this section.
par(old.par)