# code based on "Code regarding section 6.1" # (copyright 2003, 2004, 2012 A.Azzalini and B.Scarpa) dataset <- read.table("http://azzalini.stat.unipd.it/Book-DM/C1.dat", header = TRUE)[ , 1:2] attach(dataset) # k-means set.seed(123) angle <- runif(1, 0, 2 * pi) x1.m <- mean(x1) x2.m <- mean(x2) r <- sqrt(var(x1) + var(x2)) / 2 # figure 6.1 # K = 3 K <- 3 centroid <- matrix(NA, K, 2) plot(dataset, xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2]), cex = 1.5) for(k in 1:K){ centroid[k, 1]<- x1.m + r*cos(angle + 2 * pi * k / K) centroid[k, 2]<- x2.m + r*sin(angle + 2 * pi * k / K) points(centroid[k, 1], centroid[k, 2], col = 2, pch = 15, cex = 2) } k1 <- kmeans(dataset, centroid) points(dataset, col = k1$cluster + 2, cex = 1.5, pch = 16) for(k in 1:K) { lines(t(cbind(centroid[k, ], k1$centers[k, ])), lwd = 3, col = 2) points(k1$centers[k, 1], k1$centers[k, 2], col = 2, cex = 1.5, pch = 16) } # new centroids angle <- angle + 2 * pi / 9 plot(dataset, xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2]), cex = 1.5) for(k in 1:K){ centroid[k, 1]<- x1.m + r*cos(angle + 2 * pi * k / K) centroid[k, 2]<- x2.m + r*sin(angle + 2 * pi * k / K) points(centroid[k, 1], centroid[k, 2], col = 2, pch = 15, cex = 2) } k1 <- kmeans(dataset, centroid) points(dataset, col = k1$cluster + 2, cex = 1.5, pch = 16) for(k in 1:K) { lines(t(cbind(centroid[k, ], k1$centers[k, ])), lwd = 3, col = 2) points(k1$centers[k, 1], k1$centers[k, 2], col = 2, cex = 1.5, pch = 16) } # K = 4 K <- 4 centroid <- matrix(NA, K, 2) plot(dataset, xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2]), cex = 1.5) for(k in 1:K){ centroid[k, 1]<- x1.m + r*cos(angle + 2 * pi * k / K) centroid[k, 2]<- x2.m + r*sin(angle + 2 * pi * k / K) points(centroid[k, 1], centroid[k, 2], col = 2, pch = 15, cex = 2) } k1 <- kmeans(dataset, centroid) points(dataset, col = k1$cluster + 2, cex = 1.5, pch = 16) for(k in 1:K) { lines(t(cbind(centroid[k, ], k1$centers[k, ])), lwd = 3, col = 2) points(k1$centers[k, 1], k1$centers[k, 2], col = 2, cex = 1.5, pch = 16) } detach(dataset) # different data dataset2 <- read.table("http://azzalini.stat.unipd.it/Book-DM/C2.dat", header = TRUE)[ , 1:2] attach(dataset2) set.seed(123) angle <- runif(1, 0, 2 * pi) + 2 * pi / 9 x1.m <- mean(x1) x2.m <- mean(x2) r <- sqrt(var(x1)+var(x2))/2 K <- 3 centroid <- matrix(NA, K, 2) plot(dataset2, xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2]), cex = 1.5) for(k in 1:K){ centroid[k, 1] <- x1.m + r * cos(angle + 2 * pi * k / K) centroid[k, 2] <- x2.m + r * sin(angle + 2 * pi * k / K) points(centroid[k, 1], centroid[k, 2], col = 2, pch = 15, cex = 2) } k1 <- kmeans(dataset2, centroid) points(dataset2, col = k1$cluster + 2, cex = 1.5, pch = 16) for(k in 1:K) { lines(t(cbind(centroid[k, ], k1$centers[k, ])), lwd = 3, col = 2) points(k1$centers[k, 1], k1$centers[k, 2], col = 2, cex = 1.5, pch = 16) } detach(dataset2) dataset <- read.table("http://azzalini.stat.unipd.it/Book-DM/C1.dat", header = TRUE)[ , 1:2] # figure 6.2 d0 <- dist(dataset[1:30, ]) h1 <- hclust(d0, method = "complete") plot(h1, xlab = "", ylab = "", axes = FALSE, main = "", sub = "", lwd = 2) abline(h = 2, lty = 2, lwd = 2) abline(h = 1.1, lty = 2, lwd = 2) abline(h = 1.5, lty = 2, lwd = 2, col = 2) # figure 6.3 d <- dist(dataset) K <- 3 c3 <- hclust(d, method = "single") # figure 6.3a plot(c3, labels = FALSE, xlab = "", ylab = "", main = "", sub = "") a <- rect.hclust(c3, k = K) # figure 6.3b plot(dataset, type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2])) for(k in 1:K) points(dataset[a[[k]], ], pch = 15 + k, col = 1 + k, cex = 1.5) # figure 6.3c c1 <- hclust(d, method = "complete") plot(c1, labels = FALSE, xlab = "", ylab = "", main = "", sub = "") a <- rect.hclust(c1, k = K) # figure 6.3d plot(dataset, type="n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2])) for(k in 1:K) points(dataset[a[[k]], ], pch = 15 + k, col = 1 + k, cex = 2) # figure 6.3e c2 <- hclust(d, method = "average") plot(c2, labels = FALSE, xlab = "", ylab = "", main = "", sub = "") a <- rect.hclust(c2, k = K) # figure 6.3f plot(dataset, type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2])) for(k in 1:K) points(dataset[a[[k]], ], pch = 15 + k, col = 1 + k, cex = 2) # different data dataset2 <- read.table("http://azzalini.stat.unipd.it/Book-DM/C2.dat", header = TRUE)[ , 1:2] # figure 6.4 d <- dist(dataset2) K <- 3 # figure 6.4a c3 <- hclust(d, method = "single") plot(c3, labels = FALSE, xlab = "", ylab = "", main = "", sub = "") a <- rect.hclust(c3, k = K) # figure 6.4.b plot(dataset2, type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2])) for(k in 1:K) points(dataset2[a[[k]], ], pch = 15 + k, col = 1 + k, cex = 2) # figure 6.4c c1 <- hclust(d, method = "complete") plot(c1, labels = FALSE, xlab = "", ylab = "", main = "", sub = "") a <- rect.hclust(c1, k = K) # figure 6.4d plot(dataset2, type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2])) for(k in 1:K) points(dataset2[a[[k]], ], pch = 15 + k, col = 1 + k, cex = 2) # figure 6.3e c2 <- hclust(d, method = "average") plot(c2, labels = FALSE, xlab = "", ylab = "", main = "", sub = "") a <- rect.hclust(c2, k = K) # figure 6.4f plot(dataset2, type = "n", xlab = expression(italic(x)[1]), ylab = expression(italic(x)[2])) for(k in 1:K) points(dataset2[a[[k]], ], pch = 15 + k, col = 1 + k, cex = 2)