mtt_haum/code/07_item-clustering.R

214 lines
6.9 KiB
R
Raw Normal View History

# 07_item-clustering.R
#
# content: (1) Read data
# (1.1) Read log event data
# (1.2) Read infos for PM for infos
# (1.3) Extract additional infos for clustering
# (2) Clustering
# (3) Visualization with pictures
#
# input: results/haum/eventlogs_pre-corona_cleaned.RData
# results/haum/pn_infos_items.csv
# output: results/haum/eventlogs_pre-corona_item-clusters.csv
#
# last mod: 2024-03-06
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
library(bupaverse)
library(factoextra)
#--------------- (1) Read data ---------------
#--------------- (1.1) Read log event data ---------------
load("results/haum/eventlogs_pre-corona_cleaned.RData")
#--------------- (1.2) Read infos for PM for items ---------------
datitem <- read.table("results/haum/pn_infos_items.csv", header = TRUE,
sep = ";", row.names = 1)
#--------------- (1.3) Extract additional infos for clustering ---------------
time_minmax_ms <- function(subdata) {
subdata$min_time <- min(subdata$timeMs.start)
if (all(is.na(subdata$timeMs.stop))) {
subdata$max_time <- NA
} else {
subdata$max_time <- max(subdata$timeMs.stop, na.rm = TRUE)
}
subdata
}
# TODO: Move to helper file
# Get average duration per path
dat_split <- split(dat, ~ path)
dat_list <- pbapply::pblapply(dat_split, time_minmax_ms)
dat_minmax <- dplyr::bind_rows(dat_list)
datpath <- aggregate(duration ~ item + path, dat, mean, na.action = NULL)
datpath$min_time <- aggregate(min_time ~ path, dat_minmax, unique, na.action = NULL)$min_time
datpath$max_time <- aggregate(max_time ~ path, dat_minmax, unique, na.action = NULL)$max_time
datpath$duration <- datpath$max_time - datpath$min_time
datitem$duration <- aggregate(duration ~ item, datpath, mean)$duration
datitem$distance <- aggregate(distance ~ item, dat, mean)$distance
datitem$scaleSize <- aggregate(scaleSize ~ item, dat, mean)$scaleSize
datitem$rotationDegree <- aggregate(rotationDegree ~ item, dat, mean)$rotationDegree
datitem$npaths <- aggregate(path ~ item, dat, function(x) length(unique(x)))$path
datitem$ncases <- aggregate(case ~ item, dat, function(x) length(unique(x)))$case
datitem$nmoves <- aggregate(event ~ item, dat, table)$event[,"move"]
datitem$nflipCard <- aggregate(event ~ item, dat, table)$event[,"flipCard"]
datitem$nopenTopic <- aggregate(event ~ item, dat, table)$event[,"openTopic"]
datitem$nopenPopup <- aggregate(event ~ item, dat, table)$event[,"openPopup"]
#--------------- (2) Clustering ---------------
df <- datitem[, c("precision", "generalizability", "nvariants", "duration",
"distance", "scaleSize", "rotationDegree", "npaths",
"ncases", "nmoves", "nopenTopic", "nopenPopup")] |>
scale()
dist_mat <- dist(df)
heatmap(as.matrix(dist_mat))
2024-02-07 18:00:25 +01:00
# Choosing best linkage method
method <- c(average = "average", single = "single", complete = "complete",
ward = "ward")
2024-02-07 18:00:25 +01:00
hcs <- lapply(method, function(x) cluster::agnes(dist_mat, method = x))
acs <- sapply(hcs, function(x) x$ac)
2024-02-07 18:00:25 +01:00
# Dendograms
par(mfrow=c(4,2))
for (hc in hcs) plot(hc, main = "")
2024-02-07 18:00:25 +01:00
hc <- hcs$ward
2024-02-07 18:00:25 +01:00
k <- 4 # number of clusters
2024-02-07 18:00:25 +01:00
mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
2024-02-07 18:00:25 +01:00
grp <- cutree(hc, k = k)
datitem$grp <- grp
2024-02-07 18:00:25 +01:00
fviz_dend(hc, k = k,
cex = 0.5,
k_colors = mycols,
#type = "phylogenic",
rect = TRUE
)
p <- fviz_cluster(list(data = df, cluster = grp),
palette = mycols,
ellipse.type = "convex",
repel = TRUE,
show.clust.cent = FALSE, ggtheme = theme_bw())
p
aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths,
ncases, nmoves, nflipCard, nopenTopic, nopenPopup) ~ grp,
datitem, mean)
2024-02-07 18:00:25 +01:00
aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths,
ncases, nmoves, nflipCard, nopenTopic, nopenPopup) ~ grp,
datitem, max)
2024-02-07 18:00:25 +01:00
# Something like a scree plot (??)
plot(rev(hc$height), type = "b", pch = 16, cex = .5)
datitem$item <- sprintf("%03d",
as.numeric(gsub("item_([0-9]{3})", "\\1", row.names(datitem))))
res <- merge(dat, datitem[, c("item", "grp")], by = "item", all.x = TRUE)
res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ]
# Look at clusters
2024-02-07 18:00:25 +01:00
par(mfrow = c(2,2))
vioplot::vioplot(duration ~ grp, res)
vioplot::vioplot(distance ~ grp, res)
vioplot::vioplot(scaleSize ~ grp, res)
vioplot::vioplot(rotationDegree ~ grp, res)
write.table(res,
file = "results/haum/eventlogs_pre-corona_item-clusters.csv",
sep = ";",
quote = FALSE,
row.names = FALSE)
# DFGs for clusters
res$start <- res$date.start
res$complete <- res$date.stop
for (cluster in sort(unique(res$grp))) {
alog <- activitylog(res[res$grp == cluster, ],
case_id = "path",
activity_id = "event",
resource_id = "item",
timestamps = c("start", "complete"))
dfg <- process_map(alog,
type_nodes = frequency("relative", color_scale = "Greys"),
sec_nodes = frequency("absolute"),
type_edges = frequency("relative", color_edges = "#FF6900"),
sec_edges = frequency("absolute"),
rankdir = "LR",
render = FALSE)
export_map(dfg,
file_name = paste0("results/processmaps/dfg_cluster", cluster, "_R.pdf"),
file_type = "pdf",
title = paste("DFG Cluster", cluster))
}
#--------------- (3) Visualization with pictures ---------------
library(png)
library(jpeg)
library(grid)
pdf("results/figures/clustering_artworks.pdf", height = 8, width = 8, pointsize = 10)
#png("results/figures/clustering_artworks.png", units = "in", height = 8, width = 8, pointsize = 10, res = 300)
par(mai = c(.4,.4,.1,.1), mgp = c(2.4, 1, 0))
plot(y ~ x, p$data, type = "n", ylim = c(-3.5, 2.8), xlim = c(-5, 10),
xlab = "", ylab = "")
for (item in sprintf("%03d", as.numeric(rownames(p$data)))) {
if (item == "125") {
pic <- readJPEG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
item, "/", item, ".jpg"))
} else {
pic <- readPNG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
item, "/", item, ".png"))
}
img <- as.raster(pic[,,1:3])
x <- p$data$x[sprintf("%03d", as.numeric(rownames(p$data))) == item]
y <- p$data$y[sprintf("%03d", as.numeric(rownames(p$data))) == item]
points(x, y,
col = mycols[p$data$cluster[sprintf("%03d", as.numeric(rownames(p$data))) == item]],
cex = 9,
pch = 15)
rasterImage(img,
xleft = x - .45,
xright = x + .45,
ybottom = y - .2,
ytop = y + .2)
}
legend("topright", paste("Cluster", 1:k), col = mycols, pch = 15, bty = "n")
dev.off()