# Case study, clustering: Web Usage Mining
#
# From ch. 6.3 in "Data Analysis and Data Mining" by A. Azzalini and B. Scarpa,
# Oxford University Press, 2012 (ISBN 978-0-19-976710-6).
#
# - Profiling website visitors' hit patterns is a simple way of identifying differences in interests between potential customers,
#   and of choosing appropriate marketing actions.
# - The data set analyzed contains information on 26,157 anonymous hits on a consulting company's website
# - There are 231 pages on the website (grouped into eight categories) and a total of 47,387 page views.
# - Because some of the pages have similar content, they were grouped into eight categories (home, contacts, communications, events, company, white papers, business units, consulting)
#
# The data is found at: http://www.azzalini.it/Book-DM/


# load data ----
# Download the zipped data set to a temporary file, read the semicolon-separated
# CSV stored inside it (first column is used as row names), then delete the
# archive again.
zip_path <- tempfile(fileext = ".zip")
download.file('http://www.azzalini.it/Book-DM/webdata.zip', zip_path,
              mode = "wb")  # binary mode so the archive is not corrupted
data <- read.csv(unz(zip_path, 'webdata.csv'), sep = ';', header = TRUE,
                 row.names = 1)
# unlink() is portable; a shell `rm` is not available on every platform
unlink(zip_path)

# quick look: dimensions of the rows belonging to session 499, and first rows
dim(data[data$sessionID == 499, ])

head(data)


## Table 6.2 -----
## Share (in percent) of the single-page sessions falling in each area

# number of pages visited in every session
nPages <- table(data$sessionID)

# pick out the sessions with exactly one page and tabulate the area visited
singlePageIds <- names(nPages)[nPages == 1]
isSinglePage  <- data$sessionID %in% singlePageIds
table_onePage <- table(data$area[isSinglePage])

# express the counts as percentages of all single-page sessions
round(100 * table_onePage / sum(table_onePage), digits = 2)

## Table 6.3 ----
## Summary statistics of the hits to each area

# keep only the sessions in which more than one page was visited
multiPageIds  <- names(nPages)[nPages > 1]
multiplePages <- data[data$sessionID %in% multiPageIds, ]
summary(multiplePages)

# cross-tabulate sessions (rows) against areas (columns): each cell holds the
# number of visits (hits) of that session to that area
aggregateMultiplePages <- as.matrix(table(multiplePages$sessionID,
                                          multiplePages$area))
head(aggregateMultiplePages)
dim(aggregateMultiplePages)

# per-area summary: mean, median and the 75%, 90% and 99% quantiles
summariseHits <- function(hits) {
  c(mean = mean(hits),
    median = median(hits),
    quantile(hits, probs = c(0.75, 0.9, 0.99)))
}
t(apply(aggregateMultiplePages, 2, summariseHits))

## Fig 6.7 ----
## Make a dendrogram clustering the visitors based on the number of hits to
## each area.

head(aggregateMultiplePages)

# transform data: ceiling(log2(count + 1)) maps 0 -> 0, 1 -> 1, 2-3 -> 2,
# 4-7 -> 3, ..., which compresses large hit counts
transfData <- ceiling(log2(aggregateMultiplePages + 1))
plot(aggregateMultiplePages[, 1], transfData[, 1])

# apply hierarchical clustering with complete linkage on Euclidean distances
?dist
dissimil <- dist(transfData, method = "euclidean")
h1 <- hclust(dissimil, method = 'complete')
plot(h1, xlab = '', ylab = '',  main = '', sub = '', lwd = 2)
abline(h = 9.2, lty = 2, lwd = 2)  # dashed horizontal line at height 9.2
?rect.hclust
# rect.hclust() draws a box around each of the k = 4 clusters and returns a
# list with, for each cluster, the indices of its members
a <- rect.hclust(h1, k = 4)
# label the clusters 'Cluster A', 'Cluster B', ... (one label per list element)
names(a) <- paste('Cluster', LETTERS[seq_along(a)], sep = ' ')

## Table 6.4 ----
## Characterize the four clusters by the average number of visits to each of
## the eight areas.

# format a mean and a standard deviation as "mean (sd)", rounded to 2 digits;
# works element-wise on vectors
makeCells <- function(mean, sd) paste0(round(mean, 2), ' (', round(sd, 2), ')')

# "mean (sd)" of the visits to each area for the sessions indexed by `rows`.
# drop = FALSE keeps the subset a matrix even when a cluster holds a single
# session; without it apply() would fail on the resulting plain vector.
clusterCells <- function(rows) {
  hits <- aggregateMultiplePages[rows, , drop = FALSE]
  makeCells(apply(hits, 2, mean), apply(hits, 2, sd))
}

# put results in a table: one row per area, one column per cluster
out <- matrix(unlist(lapply(a, clusterCells), use.names = FALSE),
              nrow = ncol(aggregateMultiplePages),
              dimnames = list(colnames(aggregateMultiplePages), names(a)))

# add overall mean and sd
means <- apply(aggregateMultiplePages, 2, mean)
sds   <- apply(aggregateMultiplePages, 2, sd)
out   <- cbind(out, Overall = makeCells(means, sds))

# add counts: cluster sizes followed by the total number of sessions
out <- rbind(out,
             "Number of visitors" = c(lengths(a), nrow(aggregateMultiplePages)))
out


## Figure 6.8 ----
## Apply k-means algorithm

?kmeans
# The built-in random restarts of kmeans() fail on this data.
# NOTE(review): presumably because identical rows get drawn as initial
# centers -- confirm. The call is kept for reference only:
# fit <- kmeans(x = aggregateMultiplePages, centers = 4, nstart = 10) # fails


# k-means with several random starts.
#
# Runs kmeans() `nstart` times, each time starting from `k` randomly chosen
# rows of `data` (slightly jittered so the initial centers are distinct), and
# keeps the fit with the lowest total within-cluster sum of squares. Starts for
# which kmeans() raises an error are reported and skipped.
#
# Arguments:
#   data    numeric matrix, one observation per row
#   k       number of clusters
#   nstart  number of random starts to try
# Value:
#   the best kmeans fit found; an error is raised if every start failed
kmeans_rand_start <- function(data, k, nstart = 1) {
  n <- nrow(data)  # loop-invariant, computed once
  best <- Inf
  best_fit <- NULL
  for (i in seq_len(nstart)) {
    centers <- data[sample(n, k), , drop = FALSE]
    # make centers unique: kmeans() rejects duplicated initial centers
    centers <- centers + runif(length(centers)) / 100
    fit <- tryCatch(
      kmeans(x = data, centers = centers, iter.max = 100),
      error = function(e) e
    )
    if (inherits(fit, "error")) {
      cat("failed to find cluster at iteration ", i, ": ",
          conditionMessage(fit), "\n", sep = "")
    } else if (fit$tot.withinss < best) {
      best <- fit$tot.withinss
      best_fit <- fit
    }
  }
  if (is.null(best_fit)) {
    # without this guard the caller would only see "object not found"
    stop("kmeans failed for all ", nstart, " random start(s)", call. = FALSE)
  }
  best_fit
}

# example with 4 clusters, starting from 4 randomly chosen sessions
K.means <- kmeans_rand_start(aggregateMultiplePages, 4)
str(K.means)

# run for different numbers of clusters and record, for each, the total
# within-cluster sum of squares of the best of 15 random starts
maxClusters <- 15
D.within <- numeric(maxClusters)  # preallocated rather than grown from NULL
for (k in seq_len(maxClusters)) {
  cat("\nnumber of clusters:", k)
  K.means <- kmeans_rand_start(aggregateMultiplePages, k, nstart = 15)
  D.within[k] <- K.means$tot.withinss
}
# close the current graphics device so the next plot starts afresh; guarded
# because dev.off() raises an error when only the null device is open
if (dev.cur() > 1) dev.off()
plot(seq_along(D.within), D.within, type = 'l',
     ylab = "Within-group sum of squares")



# gap statistic as a guide for selecting the number of clusters
set.seed(007)
# install.packages('cluster')
library(cluster)
# note: only B = 25 bootstrap samples are drawn here (the default is 100)
gap.statistics <- clusGap(x = aggregateMultiplePages,
                          FUNcluster = kmeans_rand_start,
                          K.max = 10,
                          B = 25)
gap.statistics

# line: estimated expected within-ss ("E.logW" column)
plot(1:10, gap.statistics$Tab[, "E.logW"], ylim = c(8, 12), type = "l")
# points: observed values ("logW" column)
points(1:10, gap.statistics$Tab[, "logW"])
plot(gap.statistics)



# run kmeans with 4 clusters as in book
clusterIds <- 1:4
clusterPch <- 15:18  # one plotting symbol per cluster
clusterCol <- 2:5    # one colour per cluster
k1 <- kmeans_rand_start(aggregateMultiplePages, length(clusterIds),
                        nstart = 15)

# mean number of hits per area (rows) within each k-means cluster (columns);
# drop = FALSE keeps a one-session cluster a matrix so apply() still works
means.k <- vapply(clusterIds,
                  function(cl) {
                    hits <- aggregateMultiplePages[k1$cluster == cl, ,
                                                   drop = FALSE]
                    apply(hits, 2, mean)
                  },
                  numeric(ncol(aggregateMultiplePages)))
colnames(means.k) <- paste('cluster', clusterIds, sep = ' ')
means.k

# plot
library(beeswarm)
nAreas <- nrow(means.k)  # number of areas, instead of a hard-coded 8
# wider left margin (second element of mar); restored after plotting
oldPar <- par(mar = c(5.1, 8.1, 4.1, 2.1))
# one group per area, one point per cluster, on the log2(mean + 1) scale
beeswarm(as.data.frame(t(log2(means.k + 1))), las = 2, vertical = FALSE,
         pwpch = rep(clusterPch, nAreas), cex = 1.5,
         pwcol = rep(clusterCol, nAreas), xaxt = 'n')
axis(1)
legend('bottomright', paste('cluster', clusterIds, sep = ' '),
       pch = clusterPch, col = clusterCol)
par(oldPar)

# compare with the hierarchical clustering clusters
# one label per session, filled in from the member indices stored in `a`;
# rep() takes the length as its second argument (it has no `nrow` argument)
hierClust <- rep(NA_character_, nrow(aggregateMultiplePages))
for (j in seq_along(a)) {
  hierClust[a[[j]]] <- LETTERS[j]
}

# cluster sizes under each method, then their cross-tabulation
table(hierClust)
table(k1$cluster)
table(hierClust, k1$cluster)