# Exercises week 14 STK2100 Spring 2024
# These solutions are taken from Spring 2020

# -------------------------------
# 10.3 from ISL
# -------------------------------

# Fix the RNG seed so the random initial labels drawn in b) are reproducible.
set.seed(1)
# Six observations (rows) measured on two variables (columns).
x = cbind(c(1, 1, 0, 5, 6, 4), c(4, 3, 4, 1, 2, 0))
x

# a)

# Scatter plot of the raw observations: first column against second column.
plot(x[,1], x[,2])

# b)

# Randomly give every observation the label 1 or 2.
# TRUE is spelled out: T is an ordinary variable that can be reassigned,
# whereas TRUE is a reserved word.
labels = sample(2, nrow(x), replace=TRUE)
labels

# c)

# A centroid is the coordinate-wise mean of the observations that currently
# carry the corresponding label.
centroid1 = c(mean(x[, 1][labels == 1]), mean(x[, 2][labels == 1]))
centroid2 = c(mean(x[, 1][labels == 2]), mean(x[, 2][labels == 2]))
centroid1
centroid2

# Observations are drawn as open circles coloured by label (label 1 -> colour 2,
# label 2 -> colour 3); each centroid is a cross in the colour of its cluster.
plot(x[, 1], x[, 2], cex = 2, pch = 1, col = labels + 1)
points(x = centroid1[1], y = centroid1[2], pch = 4, col = 2)
points(x = centroid2[1], y = centroid2[2], pch = 4, col = 3)

# d)

# Euclidean distance between two points.
#
# a, b: numeric vectors of equal length holding the coordinates of each point.
# Returns the straight-line distance between a and b as a single number.
#
# Every coordinate contributes to the distance, so the function works in any
# dimension and not only for the two-dimensional points of this exercise.
euclid = function(a, b) {
  # Guard against silent recycling when the two points differ in dimension.
  stopifnot(length(a) == length(b))
  sqrt(sum((a - b)^2))
}
# Assign every observation to the nearer of two centroids.
#
# x:         numeric matrix with one observation per row.
# centroid1: coordinates of the centroid of cluster 1.
# centroid2: coordinates of the centroid of cluster 2.
# Returns a numeric vector with one label (1 or 2) per row of x. An
# observation that is equally far from both centroids gets label 2, because
# the comparison below is strict.
assign_labels = function(x, centroid1, centroid2) {
  labels = rep(NA_real_, nrow(x))
  # seq_len() gives an empty sequence for a zero-row matrix, whereas
  # 1:nrow(x) would count down (1, 0) and index rows that do not exist.
  for (i in seq_len(nrow(x))) {
    if (euclid(x[i,], centroid1) < euclid(x[i,], centroid2)) {
      labels[i] = 1
    } else {
      labels[i] = 2
    }
  }
  return(labels)
}
# Reassign each observation to its closest centroid; the surrounding
# parentheses make the freshly assigned labels print as well.
(labels = assign_labels(x, centroid1, centroid2))

#  e)

# Repeat the centroid update (c) and the reassignment (d) until the labels
# stop changing.
#
# last_labels starts at -1, which is never a valid label, so the loop body is
# entered at least once. Its length follows labels instead of a hard-coded 6,
# so the comparison stays element-by-element for any number of observations.
last_labels = rep(-1, length(labels))
while (!all(last_labels == labels)) {
  # Remember the assignment this iteration started from.
  last_labels = labels
  # Recompute each centroid as the mean of the observations in its cluster.
  centroid1 = c(mean(x[labels==1, 1]), mean(x[labels==1, 2]))
  centroid2 = c(mean(x[labels==2, 1]), mean(x[labels==2, 2]))
  print(centroid1)
  print(centroid2)
  # Move every observation to its nearest (updated) centroid.
  labels = assign_labels(x, centroid1, centroid2)
}

# Final cluster assignment.
labels

# f)

# Final picture: observations as large open circles coloured by their cluster
# label, with each centroid drawn as a cross in the matching colour.
plot(x[, 1], x[, 2], cex = 3, pch = 1, col = labels + 1)
points(x = centroid1[1], y = centroid1[2], pch = 4, col = 2)
points(x = centroid2[1], y = centroid2[2], pch = 4, col = 3)


# -------------------------------
# 10.9 from ISL
# -------------------------------

library(ISLR)
# Fix the RNG seed.
# NOTE(review): no call visible in this section appears to draw random
# numbers, so the seed may be unnecessary -- confirm before removing.
set.seed(2)
# Show the first rows of the data set to see its columns.
head(USArrests)

# a)

# Open the help pages for the two functions used below.
?hclust
?dist
# Hierarchical clustering with complete linkage on the pairwise distances
# between the rows of the raw (unscaled) data.
hc.complete = hclust(dist(USArrests), method="complete")
# Draw the resulting dendrogram.
plot(hc.complete)

# b)
# Open the help page for cutree.
?cutree
# Cut the unscaled-data tree into 3 clusters and print the cluster number
# assigned to each observation.
cutree(hc.complete, 3)
# Plot the cluster number of each observation against its index.
plot(cutree(hc.complete, 3))
# Count how many observations fall in each of the 3 clusters.
table(cutree(hc.complete, 3))

# c)

# Standardize the columns of the data with scale() before clustering.
dsc = scale(USArrests)
# Same complete-linkage hierarchical clustering as in a), now on the
# distances between the rows of the scaled data.
hc.s.complete = hclust(dist(dsc), method="complete")
# Draw the dendrogram for the scaled data.
plot(hc.s.complete)


# d)

# Cluster number of each observation when the scaled-data tree is cut into
# 3 clusters.
cutree(hc.s.complete, 3)

# Number of observations in each of the 3 scaled-data clusters.
table(cutree(hc.s.complete, 3))

# Cross-tabulate the two clusterings. The dimensions are named so the printed
# table shows which clustering is on the rows (scaled) and which is on the
# columns (unscaled); without names the two are indistinguishable.
table(scaled = cutree(hc.s.complete, 3), unscaled = cutree(hc.complete, 3))

# Scaling the variables affects the max height of the dendrogram obtained from
# hierarchical clustering. From a cursory glance, it doesn't affect the bushiness
# of the tree obtained. However, it does affect the clusters obtained from cutting
# the dendrogram into 3 clusters. In my opinion, for this data set the data should be
# standardized because the variables are measured in different units (UrbanPop
# compared to the other three columns).