# Case study, clustering: Web Usage Mining
#
# From ch 6.3 in "Data analysis and data mining" by A. Azzalini and B. Scarpa,
# Oxford University Press, 2012 (ISBN 978-0-19-976710-6).
#
# - Profiling website visitors' hit patterns is a simple way of identifying
#   differences in interests between potential customers, and appropriate
#   marketing actions.
# - The data set analyzed contains information on 26,157 anonymous hits on a
#   consulting company's website.
# - There are 231 pages on the website (grouped into eight categories) and a
#   total of 47,387 page views.
# - Because some of the pages have similar content, they were grouped into
#   eight categories (home, contacts, communications, events, company,
#   white papers, business units, consulting).
#
# The data is found at: http://www.azzalini.it/Book-DM/

# load data ----
# mode = 'wb' keeps the zip archive intact on every platform.
download.file('http://www.azzalini.it/Book-DM/webdata.zip', 'webData.zip',
              mode = 'wb')
data <- read.csv(unz('webData.zip', 'webdata.csv'), sep = ';', header = TRUE,
                 row.names = 1)
# remove the downloaded archive (portable replacement for a shell `rm`)
unlink('webData.zip')

dim(data[data$sessionID == 499, ])
head(data)

## Table 6.2 -----
## Percentage of single-page sessions for each area

# count number of pages visited per session
nPages <- table(data$sessionID)

# count single-page sessions by area
table_onePage <- table(data$area[data$sessionID %in% names(nPages)[nPages == 1]])

# compute percentages
round(100 * table_onePage / sum(table_onePage), digits = 2)

## Table 6.3 ----
## Summary stats for hits to each area

# construct data set with multi-page visitors only
multiplePages <- data[data$sessionID %in% names(nPages)[nPages > 1], ]
summary(multiplePages)

# for each session, compute number of visits (hits) to each area
aggregateMultiplePages <- table(multiplePages$sessionID, multiplePages$area)
aggregateMultiplePages <- as.matrix(aggregateMultiplePages)
head(aggregateMultiplePages)
dim(aggregateMultiplePages)

# summary stats: one row per area, columns mean / median / upper quantiles
t(apply(aggregateMultiplePages, 2, function(x) {
  c(mean = mean(x), median = median(x),
    quantile(x, probs = c(0.75, 0.9, 0.99)))
}))

## Fig 6.7 ----
## Make a dendrogram clustering the visitors based on the number of hits to
## each area.
head(aggregateMultiplePages)

# transform data
transfData <- ceiling(log2(aggregateMultiplePages + 1))
plot(aggregateMultiplePages[, 1], transfData[, 1])

# apply hierarchical clustering with complete linkage
?dist
# NOTE: the method name must be a prefix of "euclidean"; the misspelling
# "euclidian" is rejected by dist() with "invalid distance method".
dissimil <- dist(transfData, method = "euclidean")
h1 <- hclust(dissimil, method = 'complete')
plot(h1, xlab = '', ylab = '', main = '', sub = '', lwd = 2)
abline(h = 9.2, lty = 2, lwd = 2)
?rect.hclust
# draws the cluster rectangles and returns, per cluster, the row indices of
# the sessions assigned to it
a <- rect.hclust(h1, k = 4)
names(a) <- paste('Cluster', LETTERS[1:4], sep = ' ')

## Table 6.4 ----
## Characterize the four clusters by the average number of visits to each of
## the eight areas.

# compute mean and sd of visits to each area by cluster
# (drop = FALSE keeps a matrix even if a cluster holds a single session)
means <- lapply(a, function(idx) {
  apply(aggregateMultiplePages[idx, , drop = FALSE], 2, mean)
})
sds <- lapply(a, function(idx) {
  apply(aggregateMultiplePages[idx, , drop = FALSE], 2, sd)
})

# put results in a table; each cell reads "mean (sd)"
makeCells <- function(mean, sd) paste0(round(mean, 2), ' (', round(sd, 2), ')')
out <- matrix(NA_character_, nrow = ncol(aggregateMultiplePages),
              ncol = length(a))
colnames(out) <- names(a)
rownames(out) <- colnames(aggregateMultiplePages)
# makeCells is vectorized over areas, so one pass per cluster is enough
for (j in seq_along(a)) {
  out[, j] <- makeCells(means[[j]], sds[[j]])
}

# add overall mean and sd
means <- apply(aggregateMultiplePages, 2, mean)
sds <- apply(aggregateMultiplePages, 2, sd)
out <- cbind(out, Overall = makeCells(means, sds))

# add counts
out <- rbind(out,
             "Number of visitors" = c(lengths(a), nrow(aggregateMultiplePages)))
out

## Figure 6.8 ----
## Apply k-means algorithm
?kmeans
# fit <- kmeans(x = aggregateMultiplePages, centers = 4, nstart = 10) # fails

# Run k-means from several random starts and keep the best fit.
#
# Arguments:
#   data    numeric matrix (rows = observations) to cluster.
#   k       number of clusters.
#   nstart  number of random initializations to try (default 1).
#
# For each start, k rows of `data` are sampled as initial centers and slightly
# jittered so that duplicated rows do not yield identical centers. Starts for
# which kmeans() raises an error are reported and skipped.
#
# Returns the kmeans fit with the lowest total within-cluster sum of squares;
# stops with an error if no start produced a fit.
kmeans_rand_start <- function(data, k, nstart = 1) {
  n <- nrow(data)
  best <- Inf
  best_fit <- NULL
  for (i in seq_len(nstart)) {
    centers <- data[sample(n, k), , drop = FALSE]
    centers <- centers + runif(length(centers)) / 100 # make centers unique
    fit <- tryCatch(
      kmeans(x = data, centers = centers, iter.max = 100),
      error = function(e) e
    )
    if (inherits(fit, "error")) {
      cat("failed to find cluster at iteration ", i, ": ",
          conditionMessage(fit), "\n", sep = "")
    } else if (fit$tot.withinss < best) {
      best <- fit$tot.withinss
      best_fit <- fit
    }
  }
  if (is.null(best_fit)) {
    stop("kmeans failed for all ", nstart, " random starts", call. = FALSE)
  }
  best_fit
}

# example with 4 clusters, starting from randomly chosen centers
K.means <- kmeans_rand_start(aggregateMultiplePages, 4)
str(K.means)

# run for different number of clusters
D.within <- numeric(15)
for (k in seq_along(D.within)) {
  cat("\nnumber of clusters:", k)
  K.means <- kmeans_rand_start(aggregateMultiplePages, k, nstart = 15)
  D.within[k] <- K.means$tot.withinss
}
# close the current graphics device, if one is open, before the next plot
if (dev.cur() > 1) {
  dev.off()
}
plot(seq_along(D.within), D.within, type = 'l',
     ylab = "Within-group sum of squares")

# compute gap-statistic for selecting number of clusters
set.seed(007)
#install.packages('cluster')
library(cluster)
# note limited number of bootstrap samples (default is 100)
gap.statistics <- clusGap(aggregateMultiplePages, kmeans_rand_start,
                          K.max = 10, B = 25)
gap.statistics
# estimated expected within-ss
plot(1:10, gap.statistics$Tab[, "E.logW"], ylim = c(8, 12), type = "l")
# observed
points(1:10, gap.statistics$Tab[, "logW"])
plot(gap.statistics)

# run kmeans with 4 clusters as in book
k1 <- kmeans_rand_start(aggregateMultiplePages, 4, nstart = 15)
# per-cluster mean hits to each area (rows = areas, columns = clusters)
means.k <- vapply(
  1:4,
  function(x, data) apply(data[k1$cluster == x, , drop = FALSE], 2, mean),
  FUN.VALUE = numeric(ncol(aggregateMultiplePages)),
  data = aggregateMultiplePages
)
means.k
colnames(means.k) <- paste('cluster', 1:4, sep = ' ')

# plot
library(beeswarm)
par(mar = c(5.1, 8.1, 4.1, 2.1))
beeswarm(as.data.frame(t(log2(means.k + 1))), las = 2, vertical = FALSE,
         pwpch = rep(15:18, 8), cex = 1.5, pwcol = rep(2:5, 8), xaxt = 'n')
axis(1)
legend('bottomright', paste('cluster', 1:4, sep = ' '), pch = 15:18, col = 2:5)

# compare with hierarchical clustering clusters
# one label per session; sessions not covered by any cluster stay NA
hierClust <- rep(NA_character_, nrow(aggregateMultiplePages))
hierClust[a[[1]]] <- 'A'
hierClust[a[[2]]] <- 'B'
hierClust[a[[3]]] <- 'C'
hierClust[a[[4]]] <- 'D'
table(hierClust)
table(k1$cluster)
table(hierClust, k1$cluster)