# Exercises week 14 STK2100 Spring 2024
# These solutions are taken from Spring 2020

# -------------------------------
# 10.3 from ISL
# -------------------------------
# Manual K-means with K = 2 on six two-dimensional observations.

set.seed(1)
x <- cbind(c(1, 1, 0, 5, 6, 4), c(4, 3, 4, 1, 2, 0))
x

# a) Plot the observations.
plot(x[, 1], x[, 2])

# b) Randomly assign a cluster label (1 or 2) to each observation.
labels <- sample(2, nrow(x), replace = TRUE)
labels

# c) Compute the centroid of each cluster.

# Column-wise mean of the rows of `x` whose label equals `k`.
# `drop = FALSE` keeps a one-member cluster as a matrix so colMeans() works.
# An empty cluster yields NaN coordinates.
cluster_centroid <- function(x, labels, k) {
  colMeans(x[labels == k, , drop = FALSE])
}

centroid1 <- cluster_centroid(x, labels, 1)
centroid2 <- cluster_centroid(x, labels, 2)
centroid1
centroid2

# Observations are coloured by cluster (col 2 / col 3); centroids are crosses.
plot(x[, 1], x[, 2], col = (labels + 1), pch = 1, cex = 2)
points(centroid1[1], centroid1[2], col = 2, pch = 4)
points(centroid2[1], centroid2[2], col = 3, pch = 4)

# d) Assign each observation to the centroid it is closest to.

# Euclidean distance between two numeric vectors of equal length.
# Works for any number of coordinates, not only two.
euclid <- function(a, b) {
  sqrt(sum((a - b)^2))
}

# Label each row of `x` with 1 if it is strictly closer to `centroid1` than to
# `centroid2`, and with 2 otherwise (ties go to cluster 2).
# Returns a vector with one label per row; a zero-row `x` gives an empty vector.
assign_labels <- function(x, centroid1, centroid2) {
  labels <- rep(NA, nrow(x))
  # seq_len() avoids the 1:0 trap when `x` has no rows.
  for (i in seq_len(nrow(x))) {
    if (euclid(x[i, ], centroid1) < euclid(x[i, ], centroid2)) {
      labels[i] <- 1
    } else {
      labels[i] <- 2
    }
  }
  labels
}

labels <- assign_labels(x, centroid1, centroid2)
labels

# e) Repeat c) and d) until the labels stop changing.
# -1 is never a valid label, so the loop body always runs at least once.
last_labels <- rep(-1, nrow(x))
while (!all(last_labels == labels)) {
  last_labels <- labels
  centroid1 <- cluster_centroid(x, labels, 1)
  centroid2 <- cluster_centroid(x, labels, 2)
  print(centroid1)
  print(centroid2)
  labels <- assign_labels(x, centroid1, centroid2)
}
labels

# f) Colour the observations according to the final cluster labels.
plot(x[, 1], x[, 2], col = (labels + 1), pch = 1, cex = 3)
points(centroid1[1], centroid1[2], col = 2, pch = 4)
points(centroid2[1], centroid2[2], col = 3, pch = 4)

# -------------------------------
# 10.9 from ISL
# -------------------------------
# Hierarchical clustering of the states in USArrests.

library(ISLR)
set.seed(2)
head(USArrests)

# a) Complete-linkage clustering on the distances between the raw observations.
?hclust
?dist
hc.complete <- hclust(dist(USArrests), method = "complete")
plot(hc.complete)

# b) Cut the dendrogram so that it gives three clusters.
?cutree
cutree(hc.complete, 3)
plot(cutree(hc.complete, 3))
table(cutree(hc.complete, 3))

# c) Same clustering after scaling the variables with scale().
dsc <- scale(USArrests)
hc.s.complete <- hclust(dist(dsc), method = "complete")
plot(hc.s.complete)

# d) Compare the three-cluster solutions with and without scaling.
cutree(hc.s.complete, 3)
table(cutree(hc.s.complete, 3))
table(cutree(hc.s.complete, 3), cutree(hc.complete, 3))

# Scaling the variables affects the max height of the dendrogram obtained from
# hierarchical clustering. From a cursory glance, it doesn't affect the
# bushiness of the tree obtained. However, it does affect the clusters obtained
# from cutting the dendrogram into 3 clusters. In my opinion, for this data set
# the data should be standardized because the data measured has different units
# (UrbanPop compared to other three columns).