mtt_haum/code/07_item-clustering.R

# 07_item-clustering.R
#
# content: (1) Read data
#           (1.1) Read log event data
#           (1.2) Read infos for PM for infos
#           (1.3) Extract additional infos for clustering
#          (2) Clustering
#          (3) Visualization with pictures
#
# input:  results/haum/eventlogs_pre-corona_cleaned.RData
#         results/haum/pn_infos_items.csv
# output: results/haum/eventlogs_pre-corona_item-clusters.csv
#         ../../thesis/figures/data/clustering_items.RData"
#
# last mod: 2024-03-21

# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")

source("R_helpers.R")

#--------------- (1) Read data ---------------

#--------------- (1.1) Read log event data ---------------

load("results/haum/eventlogs_pre-corona_cleaned.RData")

#--------------- (1.2) Read infos for PM for items ---------------

datitem <- read.table("results/haum/pn_infos_items.csv", header = TRUE,
                      sep = ";", row.names = 1)

#--------------- (1.3) Extract additional infos for clustering ---------------

# Get average duration per path
dat_split <- split(dat, ~ path)
dat_list <- pbapply::pblapply(dat_split, time_minmax_ms)
dat_minmax <- dplyr::bind_rows(dat_list)

datpath <- aggregate(duration ~ item + path, dat, mean, na.action = NULL)
datpath$min_time <- aggregate(min_time ~ path, dat_minmax, unique, na.action = NULL)$min_time
datpath$max_time <- aggregate(max_time ~ path, dat_minmax, unique, na.action = NULL)$max_time
datpath$duration <- datpath$max_time - datpath$min_time

datitem$duration <- aggregate(duration ~ item, datpath, mean)$duration
datitem$distance <- aggregate(distance ~ item, dat, mean)$distance
datitem$scaleSize <- aggregate(scaleSize ~ item, dat, mean)$scaleSize
datitem$rotationDegree <- aggregate(rotationDegree ~ item, dat, mean)$rotationDegree
datitem$npaths <- aggregate(path ~ item, dat, function(x) length(unique(x)))$path
datitem$ncases <- aggregate(case ~ item, dat, function(x) length(unique(x)))$case
datitem$nmoves <- aggregate(event ~ item, dat, table)$event[,"move"]
datitem$nflipCard <- aggregate(event ~ item, dat, table)$event[,"flipCard"]
datitem$nopenTopic <- aggregate(event ~ item, dat, table)$event[,"openTopic"]
datitem$nopenPopup <- aggregate(event ~ item, dat, table)$event[,"openPopup"]

rm(datpath)

#--------------- (2) Clustering ---------------

df <- datitem[, c("precision", "generalizability", "nvariants", "duration",
                  "distance", "scaleSize", "rotationDegree", "npaths",
                  "ncases", "nmoves", "nflipCard", "nopenTopic",
                  "nopenPopup")] |>
      scale()

dist_mat <- dist(df)

heatmap(as.matrix(dist_mat))

# Choosing best linkage method
method <- c(average = "average", single = "single", complete = "complete",
            ward = "ward")

hcs <- lapply(method, function(x) cluster::agnes(dist_mat, method = x))
acs <- sapply(hcs, function(x) x$ac)

# Dendograms
par(mfrow=c(4,2))
for (hc in hcs) plot(hc,  main = "")

hc <- hcs$ward

factoextra::fviz_nbclust(df, FUNcluster = factoextra::hcut, method = "wss")
factoextra::fviz_nbclust(df, FUNcluster = factoextra::hcut, method = "silhouette")

gap_stat <- cluster::clusGap(df, FUNcluster = factoextra::hcut,
                             hc_func = "agnes", hc_method = "ward",
                             K.max = 15)
factoextra::fviz_gap_stat(gap_stat)

k <- 6    # number of clusters

mycols <- c("#434F4F", "#78004B", "#FF6900", "#3CB4DC", "#91C86E", "Black")

cluster <- cutree(hc, k = k)

factoextra::fviz_dend(hc, k = k,
                      cex = 0.5,
                      k_colors = mycols,
                      #type = "phylogenic",
                      rect = TRUE,
                      main = "",
                      ylab = ""
                      #ggtheme = ggplot2::theme_bw()
)

factoextra::fviz_cluster(list(data = df, cluster = cluster),
                         palette = mycols,
                         ellipse.type = "convex",
                         repel = TRUE,
                         show.clust.cent = FALSE,
                         main = "",
                         ggtheme = ggplot2::theme_bw())

aggregate(cbind(precision, generalizability, nvariants, duration, distance,
                scaleSize , rotationDegree, npaths, ncases, nmoves,
                nflipCard, nopenTopic, nopenPopup) ~ cluster, datitem,
          mean)

aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths,
                ncases, nmoves, nflipCard, nopenTopic, nopenPopup) ~ cluster,
          datitem, max)

item <- sprintf("%03d", as.numeric(gsub("item_([0-9]{3})", "\\1",
                                        row.names(datitem))))

res <- merge(dat, data.frame(item, cluster), by = "item", all.x = TRUE)
res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ]

# Look at clusters
par(mfrow = c(2,2))
vioplot::vioplot(duration ~ cluster, res)
vioplot::vioplot(distance ~ cluster, res)
vioplot::vioplot(scaleSize ~ cluster, res)
vioplot::vioplot(rotationDegree ~ cluster, res)

write.table(res,
            file = "results/haum/eventlogs_pre-corona_item-clusters.csv",
            sep = ";",
            quote = FALSE,
            row.names = FALSE)

# Save data for plots and tables

save(hc, k, res, dist_mat, datitem, df,
     file = "../../thesis/figures/data/clustering_items.RData")