2024-02-27 09:08:20 +01:00
|
|
|
# 07_item-clustering.R
|
2024-01-30 11:48:48 +01:00
|
|
|
#
|
|
|
|
# content: (1) Read data
|
|
|
|
# (1.1) Read log event data
|
|
|
|
# (1.2) Read infos for PM for infos
|
|
|
|
# (1.3) Extract additional infos for clustering
|
|
|
|
# (2) Clustering
|
|
|
|
# (3) Visualization with pictures
|
|
|
|
#
|
2024-03-22 12:07:45 +01:00
|
|
|
# input: results/eventlogs_pre-corona_cleaned.RData
|
|
|
|
# results/pn_infos_items.csv
|
|
|
|
# output: results/eventlogs_pre-corona_item-clusters.csv
|
2024-03-21 17:19:47 +01:00
|
|
|
# ../../thesis/figures/data/clustering_items.RData"
|
2024-01-30 11:48:48 +01:00
|
|
|
#
|
2024-03-22 12:07:45 +01:00
|
|
|
# last mod: 2024-03-22
|
2024-01-30 11:48:48 +01:00
|
|
|
|
2024-01-25 17:21:18 +01:00
|
|
|
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
|
|
|
|
|
2024-03-08 18:05:40 +01:00
|
|
|
source("R_helpers.R")
|
2024-01-30 11:48:48 +01:00
|
|
|
|
2024-01-25 17:21:18 +01:00
|
|
|
#--------------- (1) Read data ---------------
|
|
|
|
|
|
|
|
#--------------- (1.1) Read log event data ---------------
|
|
|
|
|
2024-03-22 12:07:45 +01:00
|
|
|
load("results/eventlogs_pre-corona_cleaned.RData")
|
2024-01-25 17:21:18 +01:00
|
|
|
|
|
|
|
#--------------- (1.2) Read infos for PM for items ---------------
|
|
|
|
|
2024-03-22 12:07:45 +01:00
|
|
|
datitem <- read.table("results/pn_infos_items.csv", header = TRUE,
|
2024-01-25 17:21:18 +01:00
|
|
|
sep = ";", row.names = 1)
|
|
|
|
|
|
|
|
#--------------- (1.3) Extract additional infos for clustering ---------------
|
|
|
|
|
2024-03-06 17:59:22 +01:00
|
|
|
# Get average duration per path
|
|
|
|
dat_split <- split(dat, ~ path)
|
|
|
|
dat_list <- pbapply::pblapply(dat_split, time_minmax_ms)
|
2024-02-27 09:08:20 +01:00
|
|
|
dat_minmax <- dplyr::bind_rows(dat_list)
|
|
|
|
|
|
|
|
datpath <- aggregate(duration ~ item + path, dat, mean, na.action = NULL)
|
|
|
|
datpath$min_time <- aggregate(min_time ~ path, dat_minmax, unique, na.action = NULL)$min_time
|
|
|
|
datpath$max_time <- aggregate(max_time ~ path, dat_minmax, unique, na.action = NULL)$max_time
|
2024-03-06 17:59:22 +01:00
|
|
|
datpath$duration <- datpath$max_time - datpath$min_time
|
2024-02-27 09:08:20 +01:00
|
|
|
|
|
|
|
datitem$duration <- aggregate(duration ~ item, datpath, mean)$duration
|
2024-01-25 17:21:18 +01:00
|
|
|
datitem$distance <- aggregate(distance ~ item, dat, mean)$distance
|
|
|
|
datitem$scaleSize <- aggregate(scaleSize ~ item, dat, mean)$scaleSize
|
|
|
|
datitem$rotationDegree <- aggregate(rotationDegree ~ item, dat, mean)$rotationDegree
|
|
|
|
datitem$npaths <- aggregate(path ~ item, dat, function(x) length(unique(x)))$path
|
|
|
|
datitem$ncases <- aggregate(case ~ item, dat, function(x) length(unique(x)))$case
|
2024-02-27 09:08:20 +01:00
|
|
|
datitem$nmoves <- aggregate(event ~ item, dat, table)$event[,"move"]
|
|
|
|
datitem$nflipCard <- aggregate(event ~ item, dat, table)$event[,"flipCard"]
|
|
|
|
datitem$nopenTopic <- aggregate(event ~ item, dat, table)$event[,"openTopic"]
|
|
|
|
datitem$nopenPopup <- aggregate(event ~ item, dat, table)$event[,"openPopup"]
|
2024-01-25 17:21:18 +01:00
|
|
|
|
2024-03-08 18:05:40 +01:00
|
|
|
rm(datpath)
|
|
|
|
|
2024-01-25 17:21:18 +01:00
|
|
|
#--------------- (2) Clustering ---------------
|
|
|
|
|
2024-02-27 09:08:20 +01:00
|
|
|
df <- datitem[, c("precision", "generalizability", "nvariants", "duration",
|
|
|
|
"distance", "scaleSize", "rotationDegree", "npaths",
|
2024-03-08 18:05:40 +01:00
|
|
|
"ncases", "nmoves", "nflipCard", "nopenTopic",
|
|
|
|
"nopenPopup")] |>
|
2024-02-27 09:08:20 +01:00
|
|
|
scale()
|
|
|
|
|
2024-03-06 17:59:22 +01:00
|
|
|
dist_mat <- dist(df)
|
2024-01-25 17:21:18 +01:00
|
|
|
|
2024-03-06 17:59:22 +01:00
|
|
|
heatmap(as.matrix(dist_mat))
|
2024-02-07 18:00:25 +01:00
|
|
|
|
|
|
|
# Choosing best linkage method
|
2024-03-06 17:59:22 +01:00
|
|
|
method <- c(average = "average", single = "single", complete = "complete",
|
|
|
|
ward = "ward")
|
2024-02-07 18:00:25 +01:00
|
|
|
|
2024-03-06 17:59:22 +01:00
|
|
|
hcs <- lapply(method, function(x) cluster::agnes(dist_mat, method = x))
|
|
|
|
acs <- sapply(hcs, function(x) x$ac)
|
2024-02-07 18:00:25 +01:00
|
|
|
|
2024-03-06 17:59:22 +01:00
|
|
|
# Dendograms
|
|
|
|
par(mfrow=c(4,2))
|
|
|
|
for (hc in hcs) plot(hc, main = "")
|
2024-02-07 18:00:25 +01:00
|
|
|
|
2024-03-06 17:59:22 +01:00
|
|
|
hc <- hcs$ward
|
2024-02-07 18:00:25 +01:00
|
|
|
|
2024-03-08 18:05:40 +01:00
|
|
|
factoextra::fviz_nbclust(df, FUNcluster = factoextra::hcut, method = "wss")
|
|
|
|
factoextra::fviz_nbclust(df, FUNcluster = factoextra::hcut, method = "silhouette")
|
|
|
|
|
|
|
|
gap_stat <- cluster::clusGap(df, FUNcluster = factoextra::hcut,
|
|
|
|
hc_func = "agnes", hc_method = "ward",
|
2024-03-09 17:22:46 +01:00
|
|
|
K.max = 15)
|
2024-03-08 18:05:40 +01:00
|
|
|
factoextra::fviz_gap_stat(gap_stat)
|
|
|
|
|
|
|
|
k <- 6 # number of clusters
|
2024-02-07 18:00:25 +01:00
|
|
|
|
2024-03-08 18:05:40 +01:00
|
|
|
mycols <- c("#434F4F", "#78004B", "#FF6900", "#3CB4DC", "#91C86E", "Black")
|
2024-03-06 17:59:22 +01:00
|
|
|
|
2024-03-08 18:05:40 +01:00
|
|
|
cluster <- cutree(hc, k = k)
|
2024-01-25 17:21:18 +01:00
|
|
|
|
2024-03-08 18:05:40 +01:00
|
|
|
factoextra::fviz_dend(hc, k = k,
|
|
|
|
cex = 0.5,
|
|
|
|
k_colors = mycols,
|
|
|
|
#type = "phylogenic",
|
2024-03-09 17:22:46 +01:00
|
|
|
rect = TRUE,
|
|
|
|
main = "",
|
|
|
|
ylab = ""
|
|
|
|
#ggtheme = ggplot2::theme_bw()
|
2024-01-25 17:21:18 +01:00
|
|
|
)
|
|
|
|
|
2024-03-08 18:05:40 +01:00
|
|
|
factoextra::fviz_cluster(list(data = df, cluster = cluster),
|
|
|
|
palette = mycols,
|
|
|
|
ellipse.type = "convex",
|
|
|
|
repel = TRUE,
|
|
|
|
show.clust.cent = FALSE,
|
2024-03-09 17:22:46 +01:00
|
|
|
main = "",
|
2024-03-08 18:05:40 +01:00
|
|
|
ggtheme = ggplot2::theme_bw())
|
2024-01-25 17:21:18 +01:00
|
|
|
|
2024-03-09 17:22:46 +01:00
|
|
|
aggregate(cbind(precision, generalizability, nvariants, duration, distance,
|
|
|
|
scaleSize , rotationDegree, npaths, ncases, nmoves,
|
|
|
|
nflipCard, nopenTopic, nopenPopup) ~ cluster, datitem,
|
|
|
|
mean)
|
2024-02-07 18:00:25 +01:00
|
|
|
|
2024-03-06 17:59:22 +01:00
|
|
|
aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths,
|
2024-03-08 18:05:40 +01:00
|
|
|
ncases, nmoves, nflipCard, nopenTopic, nopenPopup) ~ cluster,
|
2024-03-06 17:59:22 +01:00
|
|
|
datitem, max)
|
2024-02-07 18:00:25 +01:00
|
|
|
|
2024-03-08 18:05:40 +01:00
|
|
|
item <- sprintf("%03d", as.numeric(gsub("item_([0-9]{3})", "\\1",
|
|
|
|
row.names(datitem))))
|
2024-02-07 18:00:25 +01:00
|
|
|
|
2024-03-08 18:05:40 +01:00
|
|
|
res <- merge(dat, data.frame(item, cluster), by = "item", all.x = TRUE)
|
2024-01-25 17:21:18 +01:00
|
|
|
res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ]
|
|
|
|
|
2024-03-22 12:07:45 +01:00
|
|
|
|
|
|
|
# DFGs for clusters
|
|
|
|
res$start <- res$date.start
|
|
|
|
res$complete <- res$date.stop
|
|
|
|
|
|
|
|
for (clst in sort(unique(res$cluster))) {
|
|
|
|
|
|
|
|
alog <- bupaR::activitylog(res[res$cluster == clst, ],
|
|
|
|
case_id = "path",
|
|
|
|
activity_id = "event",
|
|
|
|
resource_id = "item",
|
|
|
|
timestamps = c("start", "complete"))
|
|
|
|
|
|
|
|
processmapR::process_map(alog,
|
|
|
|
type_nodes = processmapR::frequency("relative", color_scale = "Greys"),
|
|
|
|
sec_nodes = processmapR::frequency("absolute"),
|
|
|
|
type_edges = processmapR::frequency("relative", color_edges = "#FF6900"),
|
|
|
|
sec_edges = processmapR::frequency("absolute"),
|
|
|
|
rankdir = "LR")
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2024-01-30 11:48:48 +01:00
|
|
|
# Look at clusters
|
2024-02-07 18:00:25 +01:00
|
|
|
par(mfrow = c(2,2))
|
2024-03-08 18:05:40 +01:00
|
|
|
vioplot::vioplot(duration ~ cluster, res)
|
|
|
|
vioplot::vioplot(distance ~ cluster, res)
|
|
|
|
vioplot::vioplot(scaleSize ~ cluster, res)
|
|
|
|
vioplot::vioplot(rotationDegree ~ cluster, res)
|
2024-01-30 11:48:48 +01:00
|
|
|
|
2024-01-25 17:21:18 +01:00
|
|
|
write.table(res,
|
2024-03-22 12:07:45 +01:00
|
|
|
file = "results/eventlogs_pre-corona_item-clusters.csv",
|
2024-01-25 17:21:18 +01:00
|
|
|
sep = ";",
|
|
|
|
quote = FALSE,
|
|
|
|
row.names = FALSE)
|
|
|
|
|
2024-03-21 17:19:47 +01:00
|
|
|
# Save data for plots and tables
|
2024-01-25 17:21:18 +01:00
|
|
|
|
2024-03-21 17:19:47 +01:00
|
|
|
save(hc, k, res, dist_mat, datitem, df,
|
|
|
|
file = "../../thesis/figures/data/clustering_items.RData")
|
2024-01-25 17:21:18 +01:00
|
|
|
|