# 01_clustering.R # # content: (1) Read evaluation data # (2) Clustering # (3) Visualization with pictures # (4) Read event logs # (5) Frequency plot for clusters # (6) DFGs for clusters # # input: results/eval_heuristics_artworks.csv # results/eval_all-miners_complete.csv # results/haum/event_logfiles_glossar_2023-11-03_17-46-28.csv # output: ../figures/clustering_heuristics.pdf # ../figures/clustering_heuristics.png # ../figures/processmaps/dfg_complete_R.pdf # ../figures/processmaps/dfg_complete_R.png # ../figures/processmaps/dfg_cluster1_R.pdf # ../figures/processmaps/dfg_cluster2_R.pdf # ../figures/processmaps/dfg_cluster3_R.pdf # ../figures/processmaps/dfg_cluster4_R.pdf # # last mod: 2023-12-21, NW # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code") #--------------- (1) Read evaluation data --------------- eval_heuristics <- read.table("results/eval_artworks_heuristics.csv", header = TRUE, sep = ";", row.names = 1) eval_inductive <- read.table("results/eval_artworks_inductive.csv", header = TRUE, sep = ";", row.names = 1) eval_alpha <- read.table("results/eval_artworks_alpha.csv", header = TRUE, sep = ";", row.names = 1) eval_ilp <- read.table("results/eval_artworks_ilp.csv", header = TRUE, sep = ";", row.names = 1) #--------------- (2) Clustering --------------- set.seed(1607) # Heuristics Miner k1 <- kmeans(eval_heuristics, 4) colors <- c("#3CB4DC", "#78004B", "#91C86E", "#FF6900") plot(generalizability ~ precision, eval_heuristics, pch = 16, col = colors[k1$cluster]) ## Scree plot ks <- 1:10 sse <- NULL for (k in ks) sse <- c(sse, kmeans(eval_heuristics, k)$tot.withinss) plot(sse ~ ks, type = "l") # Inductive Miner k2 <- kmeans(eval_inductive, 4) plot(generalizability ~ precision, eval_inductive, pch = 16, col = colors[k2$cluster]) ## Scree plot ks <- 1:10 sse <- NULL for (k in ks) sse <- c(sse, kmeans(eval_inductive, k)$tot.withinss) plot(sse ~ ks, type = "l") # Alpha Miner k3 <- kmeans(eval_alpha, 4) par(mfrow = c(2, 2)) plot(generalizability ~ precision, eval_alpha, pch = 16, col = colors[k3$cluster]) plot(fitness ~ precision, eval_alpha, pch = 16, col = colors[k3$cluster]) plot(fitness ~ generalizability, eval_alpha, pch = 16, col = colors[k3$cluster]) ## Scree plot ks <- 1:10 sse <- NULL for (k in ks) sse <- c(sse, kmeans(eval_alpha, k)$tot.withinss) plot(sse ~ ks, type = "l") # ILP Miner k4 <- kmeans(eval_ilp, 4) plot(generalizability ~ precision, eval_ilp, pch = 16, col = colors[k4$cluster]) ## Scree plot ks <- 1:10 sse <- NULL for (k in ks) sse <- c(sse, kmeans(eval_ilp, k)$tot.withinss) plot(sse ~ ks, type = "l") #--------------- (3) Visualization with pictures --------------- library(png) library(jpeg) library(grid) ## Heuristics Miner #pdf("../figures/clustering_heuristics.pdf", height = 8, width = 8, pointsize = 10) png("../figures/clustering_heuristics.png", units = "in", height = 8, width = 8, pointsize = 10, res = 300) par(mai = c(.6,.6,.1,.1), mgp = c(2.4, 1, 0)) plot(generalizability ~ precision, eval_heuristics, type = "n", ylim = c(0.845, 0.98)) for (art in as.numeric(rownames(eval_heuristics))) { art_string <- sprintf("%03d", art) if (art == 125) { pic <- readJPEG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/", art_string, "/", art_string, ".jpg")) } else { pic <- readPNG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/", art_string, "/", art_string, ".png")) } img <- as.raster(pic[,,1:3]) x <- eval_heuristics[rownames(eval_heuristics) == art, "precision"] y <- eval_heuristics[rownames(eval_heuristics) == art, "generalizability"] points(x, y, col = colors[k1$cluster[as.character(art)]], cex = 8, pch = 15) rasterImage(img, xleft = x - .002, xright = x + .002, ybottom = y - .004, ytop = y + .004) } dev.off() ## Inductive Miner plot(generalizability ~ precision, eval_inductive, col = colors[k2$cluster], cex = 8, pch = 15) for (art in as.numeric(rownames(eval_inductive))) { art_string <- sprintf("%03d", art) if (art == 125) { pic <- readJPEG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/", art_string, "/", art_string, ".jpg")) } else { pic <- readPNG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/", art_string, "/", art_string, ".png")) } img <- as.raster(pic[,,1:3]) x <- eval_inductive[rownames(eval_inductive) == art, "precision"] y <- eval_inductive[rownames(eval_inductive) == art, "generalizability"] rasterImage(img, xleft = x - .001, xright = x + .001, ybottom = y - .002, ytop = y + .002) } #--------------- (4) Read event logs --------------- dat <- read.table("results/haum/event_logfiles_glossar_2023-11-03_17-46-28.csv", sep = ";", header = TRUE) dat$date <- as.POSIXct(dat$date) dat$date.start <- as.POSIXct(dat$date.start) dat$date.stop <- as.POSIXct(dat$date.stop) dat$artwork <- sprintf("%03d", dat$artwork) dat$event <- factor(dat$event, levels = c("move", "flipCard", "openTopic", "openPopup")) dat$weekdays <- factor(weekdays(dat$date.start), levels = c("Montag", "Dienstag", "Mittwoch", "Donnerstag", "Freitag", "Samstag", "Sonntag"), labels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")) #--------------- (5) Frequency plot for clusters --------------- # Only pre Corona dat <- dat[dat$date < "2020-03-13",] counts_artwork <- table(dat$artwork) dat_count <- as.data.frame(counts_artwork) names(dat_count) <- c("artwork", "freq") dat_count$cluster <- k1$cluster[order(as.numeric(names(k1$cluster)))] dat_count$cluster <- factor(dat_count$cluster, levels = c(4, 2, 1, 3), labels = 4:1) dat_count <- dat_count[order(dat_count$cluster, dat_count$freq, decreasing = TRUE), ] dat_count$artwork <- factor(dat_count$artwork, levels = unique(dat_count$artwork)) png("../figures/counts_artworks_clusters.png", units = "in", height = 3.375, width = 12, pointsize = 10, res = 300) par(mai = c(.6,.6,.1,.1), mgp = c(2.4, 1, 0)) barplot(freq ~ artwork, dat_count, las = 2, ylim = c(0, 60000), border = "white", ylab = "", col = c("#FF6900", "#78004B", "#3CB4DC", "#91C86E" )[dat_count$cluster]) dev.off() # compare to clusters png("../figures/pm_heuristics_clusters.png", units = "in", height = 3.375, width = 3.375, pointsize = 10, res = 300) par(mai = c(.6,.6,.1,.1), mgp = c(2.4, 1, 0)) plot(generalizability ~ precision, eval_heuristics, type = "n", ylim = c(0.845, 0.98)) with(eval_heuristics, text(precision, generalizability, rownames(eval_heuristics), col = colors[k1$cluster])) dev.off() #--------------- (6) DFGs for clusters --------------- library(bupaverse) dat$start <- dat$date.start dat$complete <- dat$date.stop alog <- activitylog(dat, case_id = "trace", activity_id = "event", resource_id = "artwork", timestamps = c("start", "complete")) alog_c1 <- filter_case_condition(alog, artwork %in% dat_count[dat_count$cluster == 1, "artwork"]) alog_c2 <- filter_case_condition(alog, artwork %in% dat_count[dat_count$cluster == 2, "artwork"]) alog_c3 <- filter_case_condition(alog, artwork %in% dat_count[dat_count$cluster == 3, "artwork"]) alog_c4 <- filter_case_condition(alog, artwork %in% dat_count[dat_count$cluster == 4, "artwork"]) dfg_complete <- process_map(alog, type_nodes = frequency("absolute", color_scale = "Greys"), sec_nodes = frequency("relative"), type_edges = frequency("absolute", color_edges = "#FF6900"), sec_edges = frequency("relative"), #rankdir = "TB", render = FALSE) export_map(dfg_complete, file_name = "../figures/processmaps/dfg_complete_R.pdf", file_type = "pdf", title = "DFG complete") export_map(dfg_complete, file_name = "../figures/processmaps/dfg_complete_R.png", file_type = "png") dfg_c1 <- process_map(alog_c1, type_nodes = frequency("absolute", color_scale = "Greys"), sec_nodes = frequency("relative"), type_edges = frequency("absolute", color_edges = "#FF6900"), sec_edges = frequency("relative"), rankdir = "TB", render = FALSE) export_map(dfg_c1, file_name = "../figures/processmaps/dfg_cluster1_R.pdf", file_type = "pdf", title = "DFG Cluster 1") dfg_c2 <- process_map(alog_c2, type_nodes = frequency("absolute", color_scale = "Greys"), sec_nodes = frequency("relative"), type_edges = frequency("absolute", color_edges = "#FF6900"), sec_edges = frequency("relative"), rankdir = "TB", render = FALSE) export_map(dfg_c2, file_name = "../figures/processmaps/dfg_cluster2_R.pdf", file_type = "pdf", title = "DFG Cluster 2") dfg_c3 <- process_map(alog_c3, type_nodes = frequency("absolute", color_scale = "Greys"), sec_nodes = frequency("relative"), type_edges = frequency("absolute", color_edges = "#FF6900"), sec_edges = frequency("relative"), rankdir = "TB", render = FALSE) export_map(dfg_c3, file_name = "../figures/processmaps/dfg_cluster3_R.pdf", file_type = "pdf", title = "DFG Cluster 3") dfg_c4 <- process_map(alog_c4, type_nodes = frequency("absolute", color_scale = "Greys"), sec_nodes = frequency("relative"), type_edges = frequency("absolute", color_edges = "#FF6900"), sec_edges = frequency("relative"), rankdir = "TB", render = FALSE) export_map(dfg_c4, file_name = "../figures/processmaps/dfg_cluster4_R.pdf", file_type = "pdf", title = "DFG Cluster 4")