---
title: "Day 2 exercises"
author: "Alfonso Diz-Lois"
date: "2025-02-12"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```


## Exercise 4:  one way ANOVA

### a

Read and inspect the data:

```{r}

solvents=read.table("http://www.uio.no/studier/emner/matnat/math/STK4900/data/solvents.txt",header=T)

head(solvents)
#Show variables and types
str(solvents)
#Are there NAs?
sum(is.na(solvents))
```


### b
```{r}
boxplot(rate~type,data=solvents)
```
At first glance, type 3 looks quite different from the other two.

### c

We want to test the null hypothesis that all the means are equal, against the alternative hypothesis that they are not.

```{r}
group1=solvents[solvents$type==1,"rate"]
group2=solvents[solvents$type==2,"rate"]
group3=solvents[solvents$type==3,"rate"]


TSS=var(solvents$rate)*(nrow(solvents)-1)
MSS=length(group1)*(mean(group1)-mean(solvents$rate))^2+length(group2)*(mean(group2)-mean(solvents$rate))^2+
  length(group3)*(mean(group3)-mean(solvents$rate))^2
RSS=var(group1)*(length(group1)-1)+var(group2)*(length(group2)-1)+var(group3)*(length(group3)-1)

print(paste0("MSS + RSS: ",MSS+RSS))
print(paste0("TSS: ",TSS))
```
We build our statistic to compare against the reference value. In this case:

$$
F=\frac{MSS/(K-1)}{RSS/(n-K)}
$$
We get:

```{r}
fstat=(MSS/(3-1))/(RSS/(nrow(solvents)-3))
fstat

```
Now, what is the probability that $F>fstat$:

```{r}
print(1-pf(fstat,2,nrow(solvents)-3))
plot(seq(0.01,30,0.1),df(seq(0.01,30,0.1),2,nrow(solvents)-3),type="l")
abline(v=fstat,col="red",lty=4)

```
Alternatively...

```{r}
#Do not forget to turn the group variable into a factor
solvents[,"type"] = factor(solvents[,"type"])
aov.solv=aov(rate~type,data=solvents)
summary(aov.solv)
```

## Exercise 5:  Correlation and regression

### a Read and plot the data

```{r}
# Clean up the memory before we start.
rm(list=ls(all=TRUE))

# a) Read the data into R and make a plot of the data.

# Enter the data.
pef.data = data.frame(pef = c(494,395,516,434,476,413,442,433),
                      minipef = c(512,430,520,428,500,364,380,445))

summary(pef.data) #You may want to check for NAs too (not in this case but in general): sum(is.na(pef.data))
plot(pef.data)
```
### b Compute the Pearson correlation coefficient.

$$
r=\frac{\sum^n_{i=1}(x_i-\bar{x})(y_i-\bar{y})/(n-1)}{s_xs_y}
$$


```{r}
#Manually (formula)
print(sum((pef.data$pef-mean(pef.data$pef))*(pef.data$minipef-mean(pef.data$minipef))/(nrow(pef.data)-1))/(sd(pef.data$pef)*sd(pef.data$minipef)))
#Manually (from covariance)
print(cov(pef.data$pef,pef.data$minipef)/(sd(pef.data$pef)*sd(pef.data$minipef)))
#Using R
print(cor(pef.data$pef,pef.data$minipef))


```

### c Find 95% confidence interval for the (theoretical) correlation coefficient.

```{r}
cor.test(pef.data$pef,pef.data$minipef)

```

### d  Fit a linear regression model with "minipef" as the outcome and "pef" as the predictor. Give an interpretation of the estimated slope, i.e. the least square estimate for "pef"

```{r}
# Fit a linear model.
lm.obj = lm(minipef~pef, data=pef.data)
# Print the result.
summary(lm.obj)

```

Two very important points:
-   Understand the output of the summary function
-   Be able to interpret the confidence intervals around the parameters


```{r}
# Extract estimated parameters
beta.0.hat = lm.obj$coefficients[1]
names(beta.0.hat) = NULL
beta.1.hat = lm.obj$coefficients[2]
names(beta.1.hat) = NULL


# Create a scatter plot.
plot(x=pef.data[,"pef"],
     y=pef.data[,"minipef"],
     xlab="pef",
     ylab="minipef")
# Plot the fitted linear model.
abline(a=beta.0.hat, b=beta.1.hat, col="red")
```

Testing whether the $\beta_1=0$ is equivalent to testing that $\rho=0$ for bivariate data. 

### e Multiply the estimated slope from question d with the ratio of the empirical standard deviations of "pef" and "minipef". Compare the result with the Pearson correlation from question b. What do you see?

Given that $\beta_1=r\frac{s_y}{s_x}$, we expect to get the same value. Let' s check:

```{r}

print(paste(beta.1.hat*sd(pef.data$pef)/sd(pef.data$minipef),cor(pef.data$pef,pef.data$minipef),sep=" ; "))


```

##Exercise 6:  simple linear regression

###a Read the data into R and inspect the data. Make a plot of the systolic blood pressure versus age:

```{r}

HERS.data=read.table("http://www.uio.no/studier/emner/matnat/math/STK4900/v17/hers.sample.txt",header=T)

summary(HERS.data)
print(sum(is.na(HERS.data)))
print(dim(HERS.data))
head(HERS.data)

```

```{r}
plot(HERS.data)
```

### b Fit a linear regression model to the data, using systolic blood pressure as the outcome and age as the predictor:

```{r}


# Fit a linear model.
lm.obj.1 = lm(sbp~age, data=HERS.data)
# Show the result.
summary(lm.obj.1)

# Extract estimated parameters
beta.0.hat = lm.obj.1$coefficients[1]
names(beta.0.hat) = NULL
beta.1.hat = lm.obj.1$coefficients[2]
names(beta.1.hat) = NULL

print(beta.0.hat)
print(beta.1.hat)

# Create a scatter plot with fitted regression line
plot(x=HERS.data[,"age"],
     y=HERS.data[,"sbp"],
     xlab="age",
     ylab="sbp")
# Plot the fitted linear model.
abline(a=beta.0.hat, b=beta.1.hat, col="blue")
```

### c In order to get a more meaningful interpretation of the intercept, it may be useful to "center" the predictor by subtraction of its mean (or a quantity close to its mean).  Fit a linear regression using "age - 67" as predictor:


```{r}
# Fit a linear regression with "age - 67" as predictor.
lm.obj.2 = lm(sbp~I(age-67), data=HERS.data)
# Show the result.
summary(lm.obj.2)

# Summary of the result.
result.table = data.frame(lm.1 = lm.obj.1$coefficients,
                          lm.2 = lm.obj.2$coefficients)
rownames(result.table) = c("beta.0.hat","beta.1.hat")
result.table

# Pay attention when you produce a plot *after* having transformed data.
plot(sbp~I(age-67), data=HERS.data)
abline(lm.obj.2, col="red")
abline(v=0, lty=3)


```


### d Sometimes it may be useful to use another unit for measuring a predictor than the one given in the data file. To illustrate this point, we will here fit a linear regression model where age is measured in units of then years:

```{r}

# Fit a linear regression with age/10 as predictor.
lm.obj.3 = lm(sbp~I(age/10), data=HERS.data)
# Show the result.
summary(lm.obj.3)

# Summary of the result.
result.table = cbind(result.table, lm.3 = lm.obj.3$coefficients)
result.table

# In this case, the scatterplot will be
plot(sbp~I(age/10), data=HERS.data)
abline(lm.obj.3, col="darkorange")

# Compare the least squares estimates with the ones you got in question b).
# How can you now interpret the slope?

```


## Exercise 7:  Understanding correlation

### a  Generate 25 observations (x,y) from the bivariate normal distribution with correlation 0.30 (see slide 24 from the lectures for explanation of the bivariat normal. Actual commands for generating bivariat normal data are given below). Compute the Pearson correlation coefficient and plot the observations:

```{r}
library(MASS)
n=25
rho=0.30
m=matrix(c(0,0),nrow=2)
S=matrix(c(1,rho,rho,1),nrow=2)
set.seed(1231)
obs=mvrnorm(n,m,S)
x=obs[,1]
y=obs[,2]
cor(x,y)
plot(x,y)
```

```{r}
plot(x=x,
     y=y,
     xlab="x",
     ylab="y",
     main = paste("Correlation = ", round(x=cor(x,y), digits=2)))

```

###b Repeat a) for correlation 0.60 and correlation 0.90.


```{r}
simulate.and.plot.func = function(n, rho=0, seed) {
  
  # The true mean vector.
  mu.vec = matrix(data=c(0,0), nrow=2, ncol=1)
  # The true covariance matrix.
  cov.mat = matrix(data=c(1,rho,rho,1), nrow=2, ncol=2)
  
  # Simulate data
  # Use set.seed() for replicability.
  set.seed(seed)
  sim.data = mvrnorm(n=n, mu=mu.vec, Sigma=cov.mat)
  
  # Compute empirical correlation.
  sample.cor = cor(sim.data[,1], sim.data[,2])
  
  # Create a scatter plot.
  plot(x=sim.data[,1],
       y=sim.data[,2],
       xlab="x",
       ylab="y",
       main=paste("Sample cor = ", round(x=sample.cor, digits=2), "\n", "n = ", n, ", True cor = ", rho),
       xlim=c(-3, 3),
       ylim=c(-3, 3))
  
} # End of function.

# Now, we can simply use the function we wrote.
# Create a 1 by 3 grid of plots.
par(mfrow=c(1,3))
simulate.and.plot.func(n=25, rho=.3, seed=1)
# Repeat a) for rho = 0.6
simulate.and.plot.func(n=25, rho=.6, seed=1)
# Repeat a) for rho = 0.9
simulate.and.plot.func(n=25, rho=.9, seed=1)
par(mfrow=c(1,1))

```

###c Repeat a) and b) for n=100 and n=500


```{r}

par(mfrow=c(2,3))

# Simulation with n = 100, rho = 0.3
simulate.and.plot.func(n=100, rho=0.3, seed=1)
# Simulation with n = 100, rho = 0.6
simulate.and.plot.func(n=100, rho=0.6, seed=1)
# Simulation with n = 100, rho = 0.9
simulate.and.plot.func(n=100, rho=0.9, seed=1)

# Simulation with n = 500, rho = 0.3
simulate.and.plot.func(n=500, rho=0.3, seed=1)
# Simulation with n = 500, rho = 0.6
simulate.and.plot.func(n=500, rho=0.6, seed=1)
# Simulation with n = 500, rho = 0.9
simulate.and.plot.func(n=500, rho=0.9, seed=1)

par(mfrow=c(1,1))
```