# 07_item-clustering.R # # content: (1) Read data # (1.1) Read log event data # (1.2) Read infos for PM for infos # (1.3) Extract additional infos for clustering # (2) Clustering # (3) Visualization with pictures # # input: results/haum/eventlogs_pre-corona_cleaned.RData # results/haum/pn_infos_items.csv # output: results/haum/eventlogs_pre-corona_item-clusters.csv # results/figures/dendrogram_items.pdf # results/figures/clustering_items.pdf # results/figures/clustering_artworks.pdf # results/figures/clustering_artworks.png # # last mod: 2024-03-08 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") source("R_helpers.R") #--------------- (1) Read data --------------- #--------------- (1.1) Read log event data --------------- load("results/haum/eventlogs_pre-corona_cleaned.RData") #--------------- (1.2) Read infos for PM for items --------------- datitem <- read.table("results/haum/pn_infos_items.csv", header = TRUE, sep = ";", row.names = 1) #--------------- (1.3) Extract additional infos for clustering --------------- # Get average duration per path dat_split <- split(dat, ~ path) dat_list <- pbapply::pblapply(dat_split, time_minmax_ms) dat_minmax <- dplyr::bind_rows(dat_list) datpath <- aggregate(duration ~ item + path, dat, mean, na.action = NULL) datpath$min_time <- aggregate(min_time ~ path, dat_minmax, unique, na.action = NULL)$min_time datpath$max_time <- aggregate(max_time ~ path, dat_minmax, unique, na.action = NULL)$max_time datpath$duration <- datpath$max_time - datpath$min_time datitem$duration <- aggregate(duration ~ item, datpath, mean)$duration datitem$distance <- aggregate(distance ~ item, dat, mean)$distance datitem$scaleSize <- aggregate(scaleSize ~ item, dat, mean)$scaleSize datitem$rotationDegree <- aggregate(rotationDegree ~ item, dat, mean)$rotationDegree datitem$npaths <- aggregate(path ~ item, dat, function(x) length(unique(x)))$path datitem$ncases <- aggregate(case ~ item, dat, function(x) length(unique(x)))$case datitem$nmoves <- aggregate(event ~ item, dat, table)$event[,"move"] datitem$nflipCard <- aggregate(event ~ item, dat, table)$event[,"flipCard"] datitem$nopenTopic <- aggregate(event ~ item, dat, table)$event[,"openTopic"] datitem$nopenPopup <- aggregate(event ~ item, dat, table)$event[,"openPopup"] rm(datpath) #--------------- (2) Clustering --------------- df <- datitem[, c("precision", "generalizability", "nvariants", "duration", "distance", "scaleSize", "rotationDegree", "npaths", "ncases", "nmoves", "nflipCard", "nopenTopic", "nopenPopup")] |> scale() dist_mat <- dist(df) heatmap(as.matrix(dist_mat)) # Choosing best linkage method method <- c(average = "average", single = "single", complete = "complete", ward = "ward") hcs <- lapply(method, function(x) cluster::agnes(dist_mat, method = x)) acs <- sapply(hcs, function(x) x$ac) # Dendograms par(mfrow=c(4,2)) for (hc in hcs) plot(hc, main = "") hc <- hcs$ward factoextra::fviz_nbclust(df, FUNcluster = factoextra::hcut, method = "wss") factoextra::fviz_nbclust(df, FUNcluster = factoextra::hcut, method = "silhouette") gap_stat <- cluster::clusGap(df, FUNcluster = factoextra::hcut, hc_func = "agnes", hc_method = "ward", K.max = 15) factoextra::fviz_gap_stat(gap_stat) k <- 6 # number of clusters mycols <- c("#434F4F", "#78004B", "#FF6900", "#3CB4DC", "#91C86E", "Black") cluster <- cutree(hc, k = k) pdf("results/figures/dendrogram_items.pdf", width = 6.5, height = 5.5, pointsize = 10) # TODO: Move code for plots to /thesis/ factoextra::fviz_dend(hc, k = k, cex = 0.5, k_colors = mycols, #type = "phylogenic", rect = TRUE, main = "", ylab = "" #ggtheme = ggplot2::theme_bw() ) dev.off() pdf("results/figures/clustering_items.pdf", width = 6.5, height = 5.5, pointsize = 10) factoextra::fviz_cluster(list(data = df, cluster = cluster), palette = mycols, ellipse.type = "convex", repel = TRUE, show.clust.cent = FALSE, main = "", ggtheme = ggplot2::theme_bw()) dev.off() aggregate(cbind(precision, generalizability, nvariants, duration, distance, scaleSize , rotationDegree, npaths, ncases, nmoves, nflipCard, nopenTopic, nopenPopup) ~ cluster, datitem, mean) aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths, ncases, nmoves, nflipCard, nopenTopic, nopenPopup) ~ cluster, datitem, max) item <- sprintf("%03d", as.numeric(gsub("item_([0-9]{3})", "\\1", row.names(datitem)))) res <- merge(dat, data.frame(item, cluster), by = "item", all.x = TRUE) res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ] # Look at clusters par(mfrow = c(2,2)) vioplot::vioplot(duration ~ cluster, res) vioplot::vioplot(distance ~ cluster, res) vioplot::vioplot(scaleSize ~ cluster, res) vioplot::vioplot(rotationDegree ~ cluster, res) write.table(res, file = "results/haum/eventlogs_pre-corona_item-clusters.csv", sep = ";", quote = FALSE, row.names = FALSE) # DFGs for clusters res$start <- res$date.start res$complete <- res$date.stop for (clst in sort(unique(res$cluster))) { alog <- bupaR::activitylog(res[res$cluster == clst, ], case_id = "path", activity_id = "event", resource_id = "item", timestamps = c("start", "complete")) dfg <- processmapR::process_map(alog, type_nodes = processmapR::frequency("relative", color_scale = "Greys"), sec_nodes = processmapR::frequency("absolute"), type_edges = processmapR::frequency("relative", color_edges = "#FF6900"), sec_edges = processmapR::frequency("absolute"), rankdir = "LR", render = FALSE) processmapR::export_map(dfg, file_name = paste0("results/processmaps/dfg_items_cluster", clst, "_R.pdf"), file_type = "pdf", title = paste("Cluster", clst)) } #--------------- (3) Visualization with pictures --------------- coor_2d <- cmdscale(dist_mat, k = 2) items <- sprintf("%03d", as.numeric(rownames(datitem))) pdf("results/figures/clustering_artworks.pdf", height = 8, width = 8, pointsize = 16) #png("results/figures/clustering_artworks.png", units = "in", height = 8, width = 8, pointsize = 16, res = 300) par(mai = c(.6,.6,.1,.1), mgp = c(2.4, 1, 0)) plot(coor_2d, type = "n", ylim = c(-3.7, 2.6), xlim = c(-5, 10.5), xlab = "", ylab = "") for (item in items) { if (item == "125") { pic <- jpeg::readJPEG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/", item, "/", item, ".jpg")) } else { pic <- png::readPNG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/", item, "/", item, ".png")) } img <- as.raster(pic[,,1:3]) x <- coor_2d[items == item, 1] y <- coor_2d[items == item, 2] points(x, y, col = mycols[cluster[items == item]], cex = 6, pch = 15) rasterImage(img, xleft = x - .45, xright = x + .45, ybottom = y - .2, ytop = y + .2) } legend("topright", paste("Cluster", 1:k), col = mycols, pch = 15, bty = "n") dev.off()