From 26ba7265f56b1a948574bcecf6f831069fd8fd21 Mon Sep 17 00:00:00 2001 From: nwickel Date: Fri, 8 Mar 2024 18:05:40 +0100 Subject: [PATCH] Cleaned up item clustering; works well now --- code/07_item-clustering.R | 144 ++++++++++++++++++-------------------- 1 file changed, 70 insertions(+), 74 deletions(-) diff --git a/code/07_item-clustering.R b/code/07_item-clustering.R index 563bfd6..8a3282d 100644 --- a/code/07_item-clustering.R +++ b/code/07_item-clustering.R @@ -10,13 +10,14 @@ # input: results/haum/eventlogs_pre-corona_cleaned.RData # results/haum/pn_infos_items.csv # output: results/haum/eventlogs_pre-corona_item-clusters.csv +# results/figures/clustering_artworks.pdf +# results/figures/clustering_artworks.png # -# last mod: 2024-03-06 +# last mod: 2024-03-08 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") -library(bupaverse) -library(factoextra) +source("R_helpers.R") #--------------- (1) Read data --------------- @@ -31,17 +32,6 @@ datitem <- read.table("results/haum/pn_infos_items.csv", header = TRUE, #--------------- (1.3) Extract additional infos for clustering --------------- -time_minmax_ms <- function(subdata) { - subdata$min_time <- min(subdata$timeMs.start) - if (all(is.na(subdata$timeMs.stop))) { - subdata$max_time <- NA - } else { - subdata$max_time <- max(subdata$timeMs.stop, na.rm = TRUE) - } - subdata -} -# TODO: Move to helper file - # Get average duration per path dat_split <- split(dat, ~ path) dat_list <- pbapply::pblapply(dat_split, time_minmax_ms) @@ -63,11 +53,14 @@ datitem$nflipCard <- aggregate(event ~ item, dat, table)$event[,"flipCard"] datitem$nopenTopic <- aggregate(event ~ item, dat, table)$event[,"openTopic"] datitem$nopenPopup <- aggregate(event ~ item, dat, table)$event[,"openPopup"] +rm(datpath) + #--------------- (2) Clustering --------------- df <- datitem[, c("precision", "generalizability", "nvariants", "duration", "distance", "scaleSize", "rotationDegree", "npaths", - "ncases", "nmoves", "nopenTopic", "nopenPopup")] |> + "ncases", "nmoves", "nflipCard", "nopenTopic", + "nopenPopup")] |> scale() dist_mat <- dist(df) @@ -87,52 +80,54 @@ for (hc in hcs) plot(hc, main = "") hc <- hcs$ward -k <- 4 # number of clusters +factoextra::fviz_nbclust(df, FUNcluster = factoextra::hcut, method = "wss") +factoextra::fviz_nbclust(df, FUNcluster = factoextra::hcut, method = "silhouette") -mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E") +gap_stat <- cluster::clusGap(df, FUNcluster = factoextra::hcut, + hc_func = "agnes", hc_method = "ward", + K.max = 10) +factoextra::fviz_gap_stat(gap_stat) -grp <- cutree(hc, k = k) -datitem$grp <- grp +k <- 6 # number of clusters -fviz_dend(hc, k = k, - cex = 0.5, - k_colors = mycols, - #type = "phylogenic", - rect = TRUE +mycols <- c("#434F4F", "#78004B", "#FF6900", "#3CB4DC", "#91C86E", "Black") + +cluster <- cutree(hc, k = k) + +factoextra::fviz_dend(hc, k = k, + cex = 0.5, + k_colors = mycols, + #type = "phylogenic", + rect = TRUE ) -p <- fviz_cluster(list(data = df, cluster = grp), - palette = mycols, - ellipse.type = "convex", - repel = TRUE, - show.clust.cent = FALSE, ggtheme = theme_bw()) -p +factoextra::fviz_cluster(list(data = df, cluster = cluster), + palette = mycols, + ellipse.type = "convex", + repel = TRUE, + show.clust.cent = FALSE, + ggtheme = ggplot2::theme_bw()) aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths, - ncases, nmoves, nflipCard, nopenTopic, nopenPopup) ~ grp, + ncases, nmoves, nflipCard, nopenTopic, nopenPopup) ~ cluster, datitem, mean) - aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths, - ncases, nmoves, nflipCard, nopenTopic, nopenPopup) ~ grp, + ncases, nmoves, nflipCard, nopenTopic, nopenPopup) ~ cluster, datitem, max) +item <- sprintf("%03d", as.numeric(gsub("item_([0-9]{3})", "\\1", + row.names(datitem)))) -# Something like a scree plot (??) -plot(rev(hc$height), type = "b", pch = 16, cex = .5) - -datitem$item <- sprintf("%03d", - as.numeric(gsub("item_([0-9]{3})", "\\1", row.names(datitem)))) - -res <- merge(dat, datitem[, c("item", "grp")], by = "item", all.x = TRUE) +res <- merge(dat, data.frame(item, cluster), by = "item", all.x = TRUE) res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ] # Look at clusters par(mfrow = c(2,2)) -vioplot::vioplot(duration ~ grp, res) -vioplot::vioplot(distance ~ grp, res) -vioplot::vioplot(scaleSize ~ grp, res) -vioplot::vioplot(rotationDegree ~ grp, res) +vioplot::vioplot(duration ~ cluster, res) +vioplot::vioplot(distance ~ cluster, res) +vioplot::vioplot(scaleSize ~ cluster, res) +vioplot::vioplot(rotationDegree ~ cluster, res) write.table(res, file = "results/haum/eventlogs_pre-corona_item-clusters.csv", @@ -144,59 +139,60 @@ write.table(res, res$start <- res$date.start res$complete <- res$date.stop -for (cluster in sort(unique(res$grp))) { +for (clst in sort(unique(res$cluster))) { - alog <- activitylog(res[res$grp == cluster, ], - case_id = "path", - activity_id = "event", - resource_id = "item", - timestamps = c("start", "complete")) + alog <- bupaR::activitylog(res[res$cluster == clst, ], + case_id = "path", + activity_id = "event", + resource_id = "item", + timestamps = c("start", "complete")) - dfg <- process_map(alog, - type_nodes = frequency("relative", color_scale = "Greys"), - sec_nodes = frequency("absolute"), - type_edges = frequency("relative", color_edges = "#FF6900"), - sec_edges = frequency("absolute"), - rankdir = "LR", - render = FALSE) - export_map(dfg, - file_name = paste0("results/processmaps/dfg_cluster", cluster, "_R.pdf"), - file_type = "pdf", - title = paste("DFG Cluster", cluster)) + dfg <- processmapR::process_map(alog, + type_nodes = processmapR::frequency("relative", color_scale = "Greys"), + sec_nodes = processmapR::frequency("absolute"), + type_edges = processmapR::frequency("relative", color_edges = "#FF6900"), + sec_edges = processmapR::frequency("absolute"), + rankdir = "LR", + render = FALSE) + + processmapR::export_map(dfg, + file_name = paste0("results/processmaps/dfg_cluster", clst, "_R.pdf"), + file_type = "pdf", + title = paste("DFG Cluster", clst)) } #--------------- (3) Visualization with pictures --------------- -library(png) -library(jpeg) -library(grid) +coor_2d <- cmdscale(dist_mat, k = 2) -pdf("results/figures/clustering_artworks.pdf", height = 8, width = 8, pointsize = 10) -#png("results/figures/clustering_artworks.png", units = "in", height = 8, width = 8, pointsize = 10, res = 300) +items <- sprintf("%03d", as.numeric(rownames(datitem))) + +#pdf("results/figures/clustering_artworks.pdf", height = 8, width = 8, pointsize = 10) +png("results/figures/clustering_artworks.png", units = "in", height = 8, width = 8, pointsize = 10, res = 300) par(mai = c(.4,.4,.1,.1), mgp = c(2.4, 1, 0)) -plot(y ~ x, p$data, type = "n", ylim = c(-3.5, 2.8), xlim = c(-5, 10), +plot(coor_2d, type = "n", ylim = c(-3.7, 2.6), xlim = c(-5, 10.5), xlab = "", ylab = "") -for (item in sprintf("%03d", as.numeric(rownames(p$data)))) { +for (item in items) { if (item == "125") { - pic <- readJPEG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/", + pic <- jpeg::readJPEG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/", item, "/", item, ".jpg")) } else { - pic <- readPNG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/", - item, "/", item, ".png")) + pic <- png::readPNG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/", + item, "/", item, ".png")) } img <- as.raster(pic[,,1:3]) - x <- p$data$x[sprintf("%03d", as.numeric(rownames(p$data))) == item] - y <- p$data$y[sprintf("%03d", as.numeric(rownames(p$data))) == item] + x <- coor_2d[items == item, 1] + y <- coor_2d[items == item, 2] points(x, y, - col = mycols[p$data$cluster[sprintf("%03d", as.numeric(rownames(p$data))) == item]], + col = mycols[cluster[items == item]], cex = 9, pch = 15)