mtt_haum/code/01_clustering.R

315 lines
10 KiB
R

# 01_clustering.R
#
# content: (1) Read evaluation data
# (2) Clustering
# (3) Visualization with pictures
# (4) Read event logs
# (5) Frequency plot for clusters
# (6) DFGs for clusters
#
# input: results/eval_heuristics_artworks.csv
# results/eval_all-miners_complete.csv
# results/haum/event_logfiles_glossar_2023-11-03_17-46-28.csv
# output: ../figures/clustering_heuristics.pdf
# ../figures/clustering_heuristics.png
# ../figures/processmaps/dfg_complete_R.pdf
# ../figures/processmaps/dfg_complete_R.png
# ../figures/processmaps/dfg_cluster1_R.pdf
# ../figures/processmaps/dfg_cluster2_R.pdf
# ../figures/processmaps/dfg_cluster3_R.pdf
# ../figures/processmaps/dfg_cluster4_R.pdf
#
# last mod: 2023-12-21, NW
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
#--------------- (1) Read evaluation data ---------------
eval_heuristics <- read.table("results/eval_artworks_heuristics.csv", header = TRUE,
sep = ";", row.names = 1)
eval_inductive <- read.table("results/eval_artworks_inductive.csv", header = TRUE,
sep = ";", row.names = 1)
eval_alpha <- read.table("results/eval_artworks_alpha.csv", header = TRUE,
sep = ";", row.names = 1)
eval_ilp <- read.table("results/eval_artworks_ilp.csv", header = TRUE,
sep = ";", row.names = 1)
#--------------- (2) Clustering ---------------
set.seed(1607)
# Heuristics Miner
k1 <- kmeans(eval_heuristics, 4)
colors <- c("#3CB4DC", "#78004B", "#91C86E", "#FF6900")
plot(generalizability ~ precision, eval_heuristics, pch = 16, col = colors[k1$cluster])
## Scree plot
ks <- 1:10
sse <- NULL
for (k in ks) sse <- c(sse, kmeans(eval_heuristics, k)$tot.withinss)
plot(sse ~ ks, type = "l")
# Inductive Miner
k2 <- kmeans(eval_inductive, 4)
plot(generalizability ~ precision, eval_inductive, pch = 16, col = colors[k2$cluster])
## Scree plot
ks <- 1:10
sse <- NULL
for (k in ks) sse <- c(sse, kmeans(eval_inductive, k)$tot.withinss)
plot(sse ~ ks, type = "l")
# Alpha Miner
k3 <- kmeans(eval_alpha, 4)
par(mfrow = c(2, 2))
plot(generalizability ~ precision, eval_alpha, pch = 16, col = colors[k3$cluster])
plot(fitness ~ precision, eval_alpha, pch = 16, col = colors[k3$cluster])
plot(fitness ~ generalizability, eval_alpha, pch = 16, col = colors[k3$cluster])
## Scree plot
ks <- 1:10
sse <- NULL
for (k in ks) sse <- c(sse, kmeans(eval_alpha, k)$tot.withinss)
plot(sse ~ ks, type = "l")
# ILP Miner
k4 <- kmeans(eval_ilp, 4)
plot(generalizability ~ precision, eval_ilp, pch = 16, col = colors[k4$cluster])
## Scree plot
ks <- 1:10
sse <- NULL
for (k in ks) sse <- c(sse, kmeans(eval_ilp, k)$tot.withinss)
plot(sse ~ ks, type = "l")
#--------------- (3) Visualization with pictures ---------------
library(png)
library(jpeg)
library(grid)
## Heuristics Miner
#pdf("../figures/clustering_heuristics.pdf", height = 8, width = 8, pointsize = 10)
png("../figures/clustering_heuristics.png", units = "in", height = 8, width = 8, pointsize = 10, res = 300)
par(mai = c(.6,.6,.1,.1), mgp = c(2.4, 1, 0))
plot(generalizability ~ precision, eval_heuristics, type = "n", ylim = c(0.845, 0.98))
for (art in as.numeric(rownames(eval_heuristics))) {
art_string <- sprintf("%03d", art)
if (art == 125) {
pic <- readJPEG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
art_string, "/", art_string, ".jpg"))
} else {
pic <- readPNG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
art_string, "/", art_string, ".png"))
}
img <- as.raster(pic[,,1:3])
x <- eval_heuristics[rownames(eval_heuristics) == art, "precision"]
y <- eval_heuristics[rownames(eval_heuristics) == art, "generalizability"]
points(x, y, col = colors[k1$cluster[as.character(art)]], cex = 8, pch = 15)
rasterImage(img,
xleft = x - .002,
xright = x + .002,
ybottom = y - .004,
ytop = y + .004)
}
dev.off()
## Inductive Miner
plot(generalizability ~ precision, eval_inductive, col = colors[k2$cluster],
cex = 8, pch = 15)
for (art in as.numeric(rownames(eval_inductive))) {
art_string <- sprintf("%03d", art)
if (art == 125) {
pic <- readJPEG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
art_string, "/", art_string, ".jpg"))
} else {
pic <- readPNG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
art_string, "/", art_string, ".png"))
}
img <- as.raster(pic[,,1:3])
x <- eval_inductive[rownames(eval_inductive) == art, "precision"]
y <- eval_inductive[rownames(eval_inductive) == art, "generalizability"]
rasterImage(img,
xleft = x - .001,
xright = x + .001,
ybottom = y - .002,
ytop = y + .002)
}
#--------------- (4) Read event logs ---------------
dat <- read.table("results/haum/event_logfiles_glossar_2023-11-03_17-46-28.csv",
sep = ";", header = TRUE)
dat$date <- as.POSIXct(dat$date)
dat$date.start <- as.POSIXct(dat$date.start)
dat$date.stop <- as.POSIXct(dat$date.stop)
dat$artwork <- sprintf("%03d", dat$artwork)
dat$event <- factor(dat$event, levels = c("move", "flipCard", "openTopic", "openPopup"))
dat$weekdays <- factor(weekdays(dat$date.start),
levels = c("Montag", "Dienstag", "Mittwoch",
"Donnerstag", "Freitag", "Samstag",
"Sonntag"),
labels = c("Monday", "Tuesday", "Wednesday",
"Thursday", "Friday", "Saturday",
"Sunday"))
#--------------- (5) Frequency plot for clusters ---------------
# Only pre Corona
dat <- dat[dat$date < "2020-03-13",]
counts_artwork <- table(dat$artwork)
dat_count <- as.data.frame(counts_artwork)
names(dat_count) <- c("artwork", "freq")
dat_count$cluster <- k1$cluster[order(as.numeric(names(k1$cluster)))]
dat_count$cluster <- factor(dat_count$cluster, levels = c(4, 2, 1, 3), labels = 4:1)
dat_count <- dat_count[order(dat_count$cluster, dat_count$freq, decreasing = TRUE), ]
dat_count$artwork <- factor(dat_count$artwork, levels = unique(dat_count$artwork))
png("../figures/counts_artworks_clusters.png", units = "in", height = 3.375, width = 12, pointsize = 10, res = 300)
par(mai = c(.6,.6,.1,.1), mgp = c(2.4, 1, 0))
barplot(freq ~ artwork, dat_count, las = 2, ylim = c(0, 60000),
border = "white", ylab = "",
col = c("#FF6900", "#78004B", "#3CB4DC", "#91C86E" )[dat_count$cluster])
dev.off()
# compare to clusters
png("../figures/pm_heuristics_clusters.png", units = "in", height = 3.375, width = 3.375, pointsize = 10, res = 300)
par(mai = c(.6,.6,.1,.1), mgp = c(2.4, 1, 0))
plot(generalizability ~ precision, eval_heuristics, type = "n", ylim = c(0.845, 0.98))
with(eval_heuristics, text(precision, generalizability,
rownames(eval_heuristics),
col = colors[k1$cluster]))
dev.off()
#--------------- (6) DFGs for clusters ---------------
library(bupaverse)
dat$start <- dat$date.start
dat$complete <- dat$date.stop
alog <- activitylog(dat,
case_id = "trace",
activity_id = "event",
resource_id = "artwork",
timestamps = c("start", "complete"))
alog_c1 <- filter_case_condition(alog,
artwork %in% dat_count[dat_count$cluster == 1, "artwork"])
alog_c2 <- filter_case_condition(alog,
artwork %in% dat_count[dat_count$cluster == 2, "artwork"])
alog_c3 <- filter_case_condition(alog,
artwork %in% dat_count[dat_count$cluster == 3, "artwork"])
alog_c4 <- filter_case_condition(alog,
artwork %in% dat_count[dat_count$cluster == 4, "artwork"])
dfg_complete <- process_map(alog,
type_nodes = frequency("absolute", color_scale = "Greys"),
sec_nodes = frequency("relative"),
type_edges = frequency("absolute", color_edges = "#FF6900"),
sec_edges = frequency("relative"),
#rankdir = "TB",
render = FALSE)
export_map(dfg_complete,
file_name = "../figures/processmaps/dfg_complete_R.pdf",
file_type = "pdf",
title = "DFG complete")
export_map(dfg_complete,
file_name = "../figures/processmaps/dfg_complete_R.png",
file_type = "png")
dfg_c1 <- process_map(alog_c1,
type_nodes = frequency("absolute", color_scale = "Greys"),
sec_nodes = frequency("relative"),
type_edges = frequency("absolute", color_edges = "#FF6900"),
sec_edges = frequency("relative"),
rankdir = "TB",
render = FALSE)
export_map(dfg_c1,
file_name = "../figures/processmaps/dfg_cluster1_R.pdf",
file_type = "pdf",
title = "DFG Cluster 1")
dfg_c2 <- process_map(alog_c2,
type_nodes = frequency("absolute", color_scale = "Greys"),
sec_nodes = frequency("relative"),
type_edges = frequency("absolute", color_edges = "#FF6900"),
sec_edges = frequency("relative"),
rankdir = "TB",
render = FALSE)
export_map(dfg_c2,
file_name = "../figures/processmaps/dfg_cluster2_R.pdf",
file_type = "pdf",
title = "DFG Cluster 2")
dfg_c3 <- process_map(alog_c3,
type_nodes = frequency("absolute", color_scale = "Greys"),
sec_nodes = frequency("relative"),
type_edges = frequency("absolute", color_edges = "#FF6900"),
sec_edges = frequency("relative"),
rankdir = "TB",
render = FALSE)
export_map(dfg_c3,
file_name = "../figures/processmaps/dfg_cluster3_R.pdf",
file_type = "pdf",
title = "DFG Cluster 3")
dfg_c4 <- process_map(alog_c4,
type_nodes = frequency("absolute", color_scale = "Greys"),
sec_nodes = frequency("relative"),
type_edges = frequency("absolute", color_edges = "#FF6900"),
sec_edges = frequency("relative"),
rankdir = "TB",
render = FALSE)
export_map(dfg_c4,
file_name = "../figures/processmaps/dfg_cluster4_R.pdf",
file_type = "pdf",
title = "DFG Cluster 4")