# STK 4900 # Week 1, Exercise 7 # Clean up the memory before we start. rm(list=ls(all=TRUE)) # Check and change working directory. # In this exercise we will perform some simulations that may help to get intuition on the concept # of correlation and the performance of the Pearson correlation coefficient. # In order to keep it simple, we will assume throughout the exercise that the expected values # are 0 and the standard deviations are 1. # Before you start doing the computations, you have to load the "MASS" library into R by the command. library(MASS) # a) # Generate 25 observations (x,y) from the bivariate normal distribution with correlation 0.30 # (See slide 24 from the lectures for explanation of the bivariat normal. # Actual commands for generating bivariat normal data are given below.) # Compute the Pearson correlation coefficient and plot the observations. # Number of data points to simulate. n = 25 # Correlation value that we want to use for simulation. rho = 0.30 # The true mean vector. mu = matrix(data=c(0,0), nrow=2, ncol=1) # The true covariance matrix. S = matrix(data=c(1,rho,rho,1), nrow=2, ncol=2) # The non-diagonal elements of "S" should be covariance. # However, we used Pearson correlation coefficient (rho) in the non-diagonal elements of "S". # Why is this ok in this case? # Simulate data # Use set.seed() for replicability. set.seed(1) sim.data = mvrnorm(n=n, mu=mu, Sigma=S) # Display the simulated data head(sim.data) tail(sim.data) summary(sim.data) # Compute empirical correlation. sample.cor = cor(sim.data[,1], sim.data[,2]) sample.cor # Repeat the commands a number of times. Note how the Pearson correlation coefficient and the plot vary. # Create a scatter plot # Open a pdf device. #pdf("./Exercise_7_a.pdf", width = 6, height = 6) # Create a scatter plot. plot(x=sim.data[,1], y=sim.data[,2], xlab="x", ylab="y", main = paste("Correlation = ", round(x=sample.cor, digits=2))) # Turn off the device. #dev.off() # More extensive plot of the result. # Create a 2 by 1 grid of plots. par(mfrow=c(2,1)) # Histogram of x hist(sim.data[,1], xlab="x", main="") # Display sample mean. abline(v=mean(sim.data[,1]), col="blue", lty=2) # Display true mean. abline(v=mean(mu[1]), col="blue", lty=1) # Display legend. legend("topright", legend=c("sample mean", "true mean"), col=c("blue", "blue"), lty=c(2,1), cex=0.8) # Histogram of y hist(sim.data[,2], xlab="y", main="") # Display sample mean. abline(v=mean(sim.data[,2]), col="blue", lty=2) # Display true mean. abline(v=mean(mu[2]), col="blue", lty=1) # Display legend. legend("topleft", legend=c("sample mean", "true mean"), col=c("blue", "blue"), lty=c(2,1), cex=0.8) par(mfrow=c(1,1)) # Create a scatter plot. plot(x=sim.data[,1], y=sim.data[,2], xlab="x", ylab="y", main=paste("Correlation = ", round(x=sample.cor, digits=2))) # b) Repeat a) for correlation 0.60 and correlation 0.90. # Note how the plots look like when the correlation is 0.60 and 0.90. # Since we have to repeat the simulation multiple times, we rewrite the simulation as a function. # This function simulates 'n' data points from bivariate normal # distribution (with mean = 0 variance = 0 and rho, which is to be specified by the user). # Then, it creates a plot. # This function takes following argumets as its inputs: # n: Number of data points to simulate # rho: Correlation value that we want to use for simulation # seed: Seed for replicability. simulate.and.plot.func = function(n, rho=0, seed) { # The true mean vector. mu.vec = matrix(data=c(0,0), nrow=2, ncol=1) # The true covariance matrix. cov.mat = matrix(data=c(1,rho,rho,1), nrow=2, ncol=2) # Simulate data # Use set.seed() for replicability. set.seed(seed) sim.data = mvrnorm(n=n, mu=mu.vec, Sigma=cov.mat) # Compute empirical correlation. sample.cor = cor(sim.data[,1], sim.data[,2]) # Create a scatter plot. plot(x=sim.data[,1], y=sim.data[,2], xlab="x", ylab="y", main=paste("Sample cor = ", round(x=sample.cor, digits=2), "\n", "n = ", n, ", True cor = ", rho), xlim=c(-3, 3), ylim=c(-3, 3)) } # End of function. # Now, we can simply use the function we wrote. # Create a 1 by 3 grid of plots. par(mfrow=c(1,3)) simulate.and.plot.func(n=25, rho=.3, seed=1) # Repeat a) for rho = 0.6 simulate.and.plot.func(n=25, rho=.6, seed=1) # Repeat a) for rho = 0.9 simulate.and.plot.func(n=25, rho=.9, seed=1) par(mfrow=c(1,1)) # c) # Repeat a) and b) for n=100 and n=500 # Note how the variation in the Pearson correlation coefficient depends on the sample size. # Open a pdf device. #pdf("./Exercise_7_c.pdf", width = 12, height = 8) # Create a 1 by 2 grid of plots. par(mfrow=c(2,3)) # Simulation with n = 100, rho = 0.3 simulate.and.plot.func(n=100, rho=0.3, seed=1) # Simulation with n = 100, rho = 0.6 simulate.and.plot.func(n=100, rho=0.6, seed=1) # Simulation with n = 100, rho = 0.9 simulate.and.plot.func(n=100, rho=0.9, seed=1) # Simulation with n = 500, rho = 0.3 simulate.and.plot.func(n=500, rho=0.3, seed=1) # Simulation with n = 500, rho = 0.6 simulate.and.plot.func(n=500, rho=0.6, seed=1) # Simulation with n = 500, rho = 0.9 simulate.and.plot.func(n=500, rho=0.9, seed=1) par(mfrow=c(1,1))