Cleaned up item clustering; works well now

2024-03-08 18:05:40 +01:00 · 2024-03-08 18:05:40 +01:00 · 26ba7265f5
commit 26ba7265f5
parent 66fab4fa18
1 changed files with 70 additions and 74 deletions
--- a/code/07_item-clustering.R
+++ b/code/07_item-clustering.R
@ -10,13 +10,14 @@
 # input:  results/haum/eventlogs_pre-corona_cleaned.RData
 #         results/haum/pn_infos_items.csv
 # output: results/haum/eventlogs_pre-corona_item-clusters.csv
+#         results/figures/clustering_artworks.pdf
+#         results/figures/clustering_artworks.png
 #
-# last mod: 2024-03-06
+# last mod: 2024-03-08

 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")

-library(bupaverse)
-library(factoextra)
+source("R_helpers.R")

 #--------------- (1) Read data ---------------

@ -31,17 +32,6 @@ datitem <- read.table("results/haum/pn_infos_items.csv", header = TRUE,

 #--------------- (1.3) Extract additional infos for clustering ---------------

-time_minmax_ms <- function(subdata) {
-  subdata$min_time <- min(subdata$timeMs.start)
-    if (all(is.na(subdata$timeMs.stop))) {
-      subdata$max_time <- NA
-    } else {
-       subdata$max_time <- max(subdata$timeMs.stop, na.rm = TRUE)
-    }
-  subdata
-}
-# TODO: Move to helper file
-
 # Get average duration per path
 dat_split <- split(dat, ~ path)
 dat_list <- pbapply::pblapply(dat_split, time_minmax_ms)
@ -63,11 +53,14 @@ datitem$nflipCard <- aggregate(event ~ item, dat, table)$event[,"flipCard"]
 datitem$nopenTopic <- aggregate(event ~ item, dat, table)$event[,"openTopic"]
 datitem$nopenPopup <- aggregate(event ~ item, dat, table)$event[,"openPopup"]

+rm(datpath)
+
 #--------------- (2) Clustering ---------------

 df <- datitem[, c("precision", "generalizability", "nvariants", "duration",
                  "distance", "scaleSize", "rotationDegree", "npaths",
-                  "ncases", "nmoves", "nopenTopic", "nopenPopup")] |>
+                  "ncases", "nmoves", "nflipCard", "nopenTopic",
+                  "nopenPopup")] |>
      scale()

 dist_mat <- dist(df)
@ -87,52 +80,54 @@ for (hc in hcs) plot(hc,  main = "")

 hc <- hcs$ward

-k <- 4    # number of clusters
+factoextra::fviz_nbclust(df, FUNcluster = factoextra::hcut, method = "wss")
+factoextra::fviz_nbclust(df, FUNcluster = factoextra::hcut, method = "silhouette")

-mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
+gap_stat <- cluster::clusGap(df, FUNcluster = factoextra::hcut,
+                             hc_func = "agnes", hc_method = "ward",
+                             K.max = 10)
+factoextra::fviz_gap_stat(gap_stat)

-grp <- cutree(hc, k = k)
-datitem$grp <- grp
+k <- 6    # number of clusters

-fviz_dend(hc, k = k,
+mycols <- c("#434F4F", "#78004B", "#FF6900", "#3CB4DC", "#91C86E", "Black")
+
+cluster <- cutree(hc, k = k)
+
+factoextra::fviz_dend(hc, k = k,
                      cex = 0.5,
                      k_colors = mycols,
                      #type = "phylogenic",
                      rect = TRUE
 )

-p <- fviz_cluster(list(data = df, cluster = grp),
+factoextra::fviz_cluster(list(data = df, cluster = cluster),
                         palette = mycols,
                         ellipse.type = "convex",
                         repel = TRUE,
-                  show.clust.cent = FALSE, ggtheme = theme_bw())
-p
+                         show.clust.cent = FALSE,
+                         ggtheme = ggplot2::theme_bw())

 aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths,
-                ncases, nmoves, nflipCard, nopenTopic, nopenPopup) ~ grp,
+                ncases, nmoves, nflipCard, nopenTopic, nopenPopup) ~ cluster,
          datitem, mean)

-
 aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths,
-                ncases, nmoves, nflipCard, nopenTopic, nopenPopup) ~ grp,
+                ncases, nmoves, nflipCard, nopenTopic, nopenPopup) ~ cluster,
          datitem, max)

+item <- sprintf("%03d", as.numeric(gsub("item_([0-9]{3})", "\\1",
+                                        row.names(datitem))))

-# Something like a scree plot (??)
-plot(rev(hc$height), type = "b", pch = 16, cex = .5)
-
-datitem$item <- sprintf("%03d",
-                        as.numeric(gsub("item_([0-9]{3})", "\\1", row.names(datitem))))
-
-res <- merge(dat, datitem[, c("item", "grp")], by = "item", all.x = TRUE)
+res <- merge(dat, data.frame(item, cluster), by = "item", all.x = TRUE)
 res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ]

 # Look at clusters
 par(mfrow = c(2,2))
-vioplot::vioplot(duration ~ grp, res)
-vioplot::vioplot(distance ~ grp, res)
-vioplot::vioplot(scaleSize ~ grp, res)
-vioplot::vioplot(rotationDegree ~ grp, res)
+vioplot::vioplot(duration ~ cluster, res)
+vioplot::vioplot(distance ~ cluster, res)
+vioplot::vioplot(scaleSize ~ cluster, res)
+vioplot::vioplot(rotationDegree ~ cluster, res)

 write.table(res,
            file = "results/haum/eventlogs_pre-corona_item-clusters.csv",
@ -144,59 +139,60 @@ write.table(res,
 res$start <- res$date.start
 res$complete <- res$date.stop

-for (cluster in sort(unique(res$grp))) {
+for (clst in sort(unique(res$cluster))) {

-  alog <- activitylog(res[res$grp == cluster, ],
+  alog <- bupaR::activitylog(res[res$cluster == clst, ],
    case_id     = "path",
    activity_id = "event",
    resource_id = "item",
    timestamps  = c("start", "complete"))
  
-  dfg <- process_map(alog,
-                     type_nodes = frequency("relative", color_scale = "Greys"),
-                     sec_nodes  = frequency("absolute"),
-                     type_edges = frequency("relative", color_edges = "#FF6900"),
-                     sec_edges  = frequency("absolute"),
+  dfg <- processmapR::process_map(alog,
+    type_nodes = processmapR::frequency("relative", color_scale = "Greys"),
+    sec_nodes  = processmapR::frequency("absolute"),
+    type_edges = processmapR::frequency("relative", color_edges = "#FF6900"),
+    sec_edges  = processmapR::frequency("absolute"),
    rankdir    = "LR",
    render     = FALSE)
-  export_map(dfg,
-             file_name = paste0("results/processmaps/dfg_cluster", cluster, "_R.pdf"),
+
+  processmapR::export_map(dfg,
+    file_name = paste0("results/processmaps/dfg_cluster", clst, "_R.pdf"),
    file_type = "pdf",
-             title     = paste("DFG Cluster", cluster))
+    title     = paste("DFG Cluster", clst))
 }

 #--------------- (3) Visualization with pictures ---------------

-library(png)
-library(jpeg)
-library(grid)
+coor_2d <- cmdscale(dist_mat, k = 2)

-pdf("results/figures/clustering_artworks.pdf", height = 8, width = 8, pointsize = 10)
-#png("results/figures/clustering_artworks.png", units = "in", height = 8, width = 8, pointsize = 10, res = 300)
+items <- sprintf("%03d", as.numeric(rownames(datitem)))
+
+#pdf("results/figures/clustering_artworks.pdf", height = 8, width = 8, pointsize = 10)
+png("results/figures/clustering_artworks.png", units = "in", height = 8, width = 8, pointsize = 10, res = 300)

 par(mai = c(.4,.4,.1,.1), mgp = c(2.4, 1, 0))

-plot(y ~ x, p$data, type = "n", ylim = c(-3.5, 2.8), xlim = c(-5, 10),
+plot(coor_2d, type = "n", ylim = c(-3.7, 2.6), xlim = c(-5, 10.5),
     xlab = "", ylab = "")

-for (item in sprintf("%03d", as.numeric(rownames(p$data)))) {
+for (item in items) {

  if (item == "125") {

-    pic <- readJPEG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
+    pic <- jpeg::readJPEG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
                          item, "/", item, ".jpg"))
  } else {
-    pic <- readPNG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
+    pic <- png::readPNG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
                        item, "/", item, ".png"))
  }

  img <- as.raster(pic[,,1:3])

-  x <- p$data$x[sprintf("%03d", as.numeric(rownames(p$data))) == item]
-  y <- p$data$y[sprintf("%03d", as.numeric(rownames(p$data))) == item]
+  x <- coor_2d[items == item, 1]
+  y <- coor_2d[items == item, 2]

  points(x, y,
-         col = mycols[p$data$cluster[sprintf("%03d", as.numeric(rownames(p$data))) == item]],
+         col = mycols[cluster[items == item]],
         cex = 9,
         pch = 15)