# 08_item-clustering.R # # content: (1) Read data # (1.1) Read log event data # (1.2) Read infos for PM for infos # (1.3) Extract additional infos for clustering # (2) Clustering # (3) Visualization with pictures # # input: results/haum/event_logfiles_2024-01-18_09-58-52.csv # results/haum/pn_infos_items.csv # output: results/haum/event_logfiles_pre-corona_with-clusters.csv # # last mod: 2024-01-30 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") library(bupaverse) library(factoextra) #--------------- (1) Read data --------------- #--------------- (1.1) Read log event data --------------- dat0 <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv", colClasses = c("character", "character", "POSIXct", "POSIXct", "character", "integer", "numeric", "character", "character", rep("numeric", 3), "character", "character", rep("numeric", 11), "character", "character"), sep = ";", header = TRUE) dat0$event <- factor(dat0$event, levels = c("move", "flipCard", "openTopic", "openPopup")) # TODO: Maybe look at this with complete data? # Select data pre Corona dat <- dat0[as.Date(dat0$date.start) < "2020-03-13", ] dat <- dat[dat$path != 106098, ] #--------------- (1.2) Read infos for PM for items --------------- datitem <- read.table("results/haum/pn_infos_items.csv", header = TRUE, sep = ";", row.names = 1) #--------------- (1.3) Extract additional infos for clustering --------------- datitem$duration <- aggregate(duration ~ item, dat, mean)$duration datitem$distance <- aggregate(distance ~ item, dat, mean)$distance datitem$scaleSize <- aggregate(scaleSize ~ item, dat, mean)$scaleSize datitem$rotationDegree <- aggregate(rotationDegree ~ item, dat, mean)$rotationDegree datitem$npaths <- aggregate(path ~ item, dat, function(x) length(unique(x)))$path datitem$ncases <- aggregate(case ~ item, dat, function(x) length(unique(x)))$case datitem$ntopics <- aggregate(topic ~ item, dat, function(x) length(unique(x)))$topic datitem$mostfreq_num <- as.numeric(gsub(".*: (.*)}", "\\1", datitem$mostfreq)) #--------------- (2) Clustering --------------- df <- datitem[, c("precision", "generalizability", "nvariants", "mostfreq_num", "duration", "distance", "scaleSize", "rotationDegree", "npaths", "ncases", "ntopics")] |> scale() mat <- dist(df) heatmap(as.matrix(mat)) # Choosing best linkage method h1 <- hclust(mat, method = "average") h2 <- hclust(mat, method = "complete") h3 <- hclust(mat, method = "ward.D") h4 <- hclust(mat, method = "ward.D2") h5 <- hclust(mat, method = "single") # Cophenetic Distances, for each linkage c1 <- cophenetic(h1) c2 <- cophenetic(h2) c3 <- cophenetic(h3) c4 <- cophenetic(h4) c5 <- cophenetic(h5) # Correlations cor(mat, c1) cor(mat, c2) cor(mat, c3) cor(mat, c4) cor(mat, c5) # https://en.wikipedia.org/wiki/Cophenetic_correlation # Dendograms par(mfrow=c(3,2)) plot(h1, main = "Average Linkage") plot(h2, main = "Complete Linkage") plot(h3, main = "Ward Linkage") plot(h4, main = "Ward 2 Linkage") plot(h5, main = "Single Linkage") hc <- h1 # Note that ‘agnes(*, method="ward")’ corresponds to ‘hclust(*, "ward.D2")’ k <- 7 # number of clusters grp <- cutree(hc, k = k) datitem$grp <- grp fviz_dend(hc, k = k, cex = 0.5, k_colors = c("#78004B", "#000000", "#3CB4DC", "#91C86E", "#FF6900", "gold", "#434F4F"), #type = "phylogenic", rect = TRUE ) plot(hc) rect.hclust(hc, k=8, border="red") rect.hclust(hc, k=7, border="blue") rect.hclust(hc, k=6, border="green") p <- fviz_cluster(list(data = df, cluster = grp), palette = c("#78004B", "#000000", "#3CB4DC", "#91C86E", "#FF6900", "#434F4F", "gold"), ellipse.type = "convex", repel = TRUE, show.clust.cent = FALSE, ggtheme = theme_bw()) p aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths, ncases, ntopics) ~ grp, datitem, median) # Something like a scree plot (??) plot(rev(seq_along(hc$height)), hc$height, type = "l") points(rev(seq_along(hc$height)), hc$height, pch = 16, cex = .5) datitem$item <- sprintf("%03d", as.numeric(gsub("item_([0-9]{3})", "\\1", row.names(datitem)))) res <- merge(dat, datitem[, c("item", "grp")], by = "item", all.x = TRUE) res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ] # Look at clusters par(mfrow = c(2,2)) vioplot::vioplot(duration ~ grp, res) vioplot::vioplot(distance ~ grp, res) vioplot::vioplot(scaleSize ~ grp, res) vioplot::vioplot(rotationDegree ~ grp, res) write.table(res, file = "results/haum/event_logfiles_pre-corona_with-clusters.csv", sep = ";", quote = FALSE, row.names = FALSE) # DFGs for clusters res$start <- res$date.start res$complete <- res$date.stop for (cluster in sort(unique(res$grp))) { alog <- activitylog(res[res$grp == cluster, ], case_id = "path", activity_id = "event", resource_id = "item", timestamps = c("start", "complete")) dfg <- process_map(alog, type_nodes = frequency("relative", color_scale = "Greys"), sec_nodes = frequency("absolute"), type_edges = frequency("relative", color_edges = "#FF6900"), sec_edges = frequency("absolute"), rankdir = "LR", render = FALSE) export_map(dfg, file_name = paste0("results/processmaps/dfg_cluster", cluster, "_R.pdf"), file_type = "pdf", title = paste("DFG Cluster", cluster)) } #--------------- (3) Visualization with pictures --------------- library(png) library(jpeg) library(grid) colors <- c("#78004B", "#000000", "#3CB4DC", "#91C86E", "#FF6900", "#434F4F") #pdf("results/haum/figures/clustering_artworks.pdf", height = 8, width = 8, pointsize = 10) png("results/haum/figures/clustering_artworks.png", units = "in", height = 8, width = 8, pointsize = 10, res = 300) par(mai = c(.6,.6,.1,.1), mgp = c(2.4, 1, 0)) plot(y ~ x, p$data, type = "n", ylim = c(-3.2, 3), xlim = c(-4.7, 6.4)) for (item in sprintf("%03d", as.numeric(rownames(p$data)))) { if (item == "125") { pic <- readJPEG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/", item, "/", item, ".jpg")) } else { pic <- readPNG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/", item, "/", item, ".png")) } img <- as.raster(pic[,,1:3]) x <- p$data$x[sprintf("%03d", as.numeric(rownames(p$data))) == item] y <- p$data$y[sprintf("%03d", as.numeric(rownames(p$data))) == item] points(x, y, col = colors[p$data$cluster[sprintf("%03d", as.numeric(rownames(p$data))) == item]], cex = 9, pch = 15) rasterImage(img, xleft = x - .4, xright = x + .4, ybottom = y - .2, ytop = y + .2) } dev.off()