2024-02-27 09:08:20 +01:00
|
|
|
|
# 07_item-clustering.R
|
2024-01-30 11:48:48 +01:00
|
|
|
|
#
|
|
|
|
|
# content: (1) Read data
|
|
|
|
|
# (1.1) Read log event data
|
|
|
|
|
# (1.2) Read infos for PM for infos
|
|
|
|
|
# (1.3) Extract additional infos for clustering
|
|
|
|
|
# (2) Clustering
|
|
|
|
|
# (3) Visualization with pictures
|
|
|
|
|
#
|
2024-02-27 09:08:20 +01:00
|
|
|
|
# input: results/haum/event_logfiles_2024-02-21_16-07-33.csv
|
2024-01-30 11:48:48 +01:00
|
|
|
|
# results/haum/pn_infos_items.csv
|
|
|
|
|
# output: results/haum/event_logfiles_pre-corona_with-clusters.csv
|
|
|
|
|
#
|
2024-02-27 09:08:20 +01:00
|
|
|
|
# last mod: 2024-02-23
|
2024-01-30 11:48:48 +01:00
|
|
|
|
|
|
|
|
|
|
2024-01-25 17:21:18 +01:00
|
|
|
|
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
|
|
|
|
|
|
2024-01-30 11:48:48 +01:00
|
|
|
|
library(bupaverse)
|
|
|
|
|
library(factoextra)
|
|
|
|
|
|
2024-01-25 17:21:18 +01:00
|
|
|
|
#--------------- (1) Read data ---------------
|
|
|
|
|
|
|
|
|
|
#--------------- (1.1) Read log event data ---------------
|
|
|
|
|
|
2024-02-27 09:08:20 +01:00
|
|
|
|
dat0 <- read.table("results/haum/event_logfiles_2024-02-21_16-07-33.csv",
|
2024-01-25 17:21:18 +01:00
|
|
|
|
colClasses = c("character", "character", "POSIXct",
|
|
|
|
|
"POSIXct", "character", "integer",
|
|
|
|
|
"numeric", "character", "character",
|
|
|
|
|
rep("numeric", 3), "character",
|
|
|
|
|
"character", rep("numeric", 11),
|
|
|
|
|
"character", "character"),
|
|
|
|
|
sep = ";", header = TRUE)
|
|
|
|
|
dat0$event <- factor(dat0$event, levels = c("move", "flipCard", "openTopic",
|
|
|
|
|
"openPopup"))
|
|
|
|
|
|
2024-02-07 18:00:25 +01:00
|
|
|
|
# TODO: Maybe look at this with complete data?
|
|
|
|
|
|
2024-01-25 17:21:18 +01:00
|
|
|
|
# Select data pre Corona
|
|
|
|
|
dat <- dat0[as.Date(dat0$date.start) < "2020-03-13", ]
|
|
|
|
|
dat <- dat[dat$path != 106098, ]
|
|
|
|
|
|
|
|
|
|
#--------------- (1.2) Read infos for PM for items ---------------
|
|
|
|
|
|
|
|
|
|
datitem <- read.table("results/haum/pn_infos_items.csv", header = TRUE,
|
|
|
|
|
sep = ";", row.names = 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#--------------- (1.3) Extract additional infos for clustering ---------------
|
|
|
|
|
|
2024-02-27 09:08:20 +01:00
|
|
|
|
dat_split <- split(dat, ~ path)
|
|
|
|
|
|
|
|
|
|
time_minmax <- function(subdata) {
|
|
|
|
|
subdata$min_time <- min(subdata$timeMs.start)
|
|
|
|
|
if (all(is.na(subdata$timeMs.stop))) {
|
|
|
|
|
subdata$max_time <- NA
|
|
|
|
|
} else {
|
|
|
|
|
subdata$max_time <- max(subdata$timeMs.stop, na.rm = TRUE)
|
|
|
|
|
}
|
|
|
|
|
subdata
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
dat_list <- pbapply::pblapply(dat_split, time_minmax)
|
|
|
|
|
dat_minmax <- dplyr::bind_rows(dat_list)
|
|
|
|
|
|
|
|
|
|
datpath <- aggregate(duration ~ item + path, dat, mean, na.action = NULL)
|
|
|
|
|
|
|
|
|
|
datpath$min_time <- aggregate(min_time ~ path, dat_minmax, unique, na.action = NULL)$min_time
|
|
|
|
|
datpath$max_time <- aggregate(max_time ~ path, dat_minmax, unique, na.action = NULL)$max_time
|
|
|
|
|
|
|
|
|
|
datpath$duration_path <- datpath$max_time - datpath$min_time
|
|
|
|
|
|
|
|
|
|
# average duration per path
|
|
|
|
|
datitem$duration <- aggregate(duration ~ item, datpath, mean)$duration
|
2024-01-25 17:21:18 +01:00
|
|
|
|
datitem$distance <- aggregate(distance ~ item, dat, mean)$distance
|
|
|
|
|
datitem$scaleSize <- aggregate(scaleSize ~ item, dat, mean)$scaleSize
|
|
|
|
|
datitem$rotationDegree <- aggregate(rotationDegree ~ item, dat, mean)$rotationDegree
|
|
|
|
|
datitem$npaths <- aggregate(path ~ item, dat, function(x) length(unique(x)))$path
|
|
|
|
|
datitem$ncases <- aggregate(case ~ item, dat, function(x) length(unique(x)))$case
|
2024-02-27 09:08:20 +01:00
|
|
|
|
datitem$nmoves <- aggregate(event ~ item, dat, table)$event[,"move"]
|
|
|
|
|
datitem$nflipCard <- aggregate(event ~ item, dat, table)$event[,"flipCard"]
|
|
|
|
|
datitem$nopenTopic <- aggregate(event ~ item, dat, table)$event[,"openTopic"]
|
|
|
|
|
datitem$nopenPopup <- aggregate(event ~ item, dat, table)$event[,"openPopup"]
|
2024-01-25 17:21:18 +01:00
|
|
|
|
|
|
|
|
|
#--------------- (2) Clustering ---------------
|
|
|
|
|
|
2024-02-27 09:08:20 +01:00
|
|
|
|
df <- datitem[, c("precision", "generalizability", "nvariants", "duration",
|
|
|
|
|
"distance", "scaleSize", "rotationDegree", "npaths",
|
|
|
|
|
"ncases", "nmoves", "nopenTopic", "nopenPopup")] |>
|
|
|
|
|
scale()
|
|
|
|
|
|
2024-01-25 17:21:18 +01:00
|
|
|
|
mat <- dist(df)
|
|
|
|
|
|
2024-02-07 18:00:25 +01:00
|
|
|
|
heatmap(as.matrix(mat))
|
|
|
|
|
|
|
|
|
|
# Choosing best linkage method
|
|
|
|
|
h1 <- hclust(mat, method = "average")
|
|
|
|
|
h2 <- hclust(mat, method = "complete")
|
|
|
|
|
h3 <- hclust(mat, method = "ward.D")
|
|
|
|
|
h4 <- hclust(mat, method = "ward.D2")
|
|
|
|
|
h5 <- hclust(mat, method = "single")
|
|
|
|
|
|
|
|
|
|
# Cophenetic Distances, for each linkage
|
|
|
|
|
c1 <- cophenetic(h1)
|
|
|
|
|
c2 <- cophenetic(h2)
|
|
|
|
|
c3 <- cophenetic(h3)
|
|
|
|
|
c4 <- cophenetic(h4)
|
|
|
|
|
c5 <- cophenetic(h5)
|
|
|
|
|
|
|
|
|
|
# Correlations
|
|
|
|
|
cor(mat, c1)
|
|
|
|
|
cor(mat, c2)
|
|
|
|
|
cor(mat, c3)
|
|
|
|
|
cor(mat, c4)
|
|
|
|
|
cor(mat, c5)
|
|
|
|
|
# https://en.wikipedia.org/wiki/Cophenetic_correlation
|
2024-02-27 09:08:20 +01:00
|
|
|
|
# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
|
2024-02-07 18:00:25 +01:00
|
|
|
|
|
|
|
|
|
# Dendograms
|
|
|
|
|
par(mfrow=c(3,2))
|
|
|
|
|
plot(h1, main = "Average Linkage")
|
|
|
|
|
plot(h2, main = "Complete Linkage")
|
|
|
|
|
plot(h3, main = "Ward Linkage")
|
|
|
|
|
plot(h4, main = "Ward 2 Linkage")
|
|
|
|
|
plot(h5, main = "Single Linkage")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
hc <- h1
|
|
|
|
|
# Note that ‘agnes(*, method="ward")’ corresponds to ‘hclust(*, "ward.D2")’
|
|
|
|
|
|
2024-02-27 09:08:20 +01:00
|
|
|
|
k <- 4 # number of clusters
|
2024-02-07 18:00:25 +01:00
|
|
|
|
|
|
|
|
|
grp <- cutree(hc, k = k)
|
2024-01-30 11:48:48 +01:00
|
|
|
|
datitem$grp <- grp
|
2024-01-25 17:21:18 +01:00
|
|
|
|
|
2024-02-07 18:00:25 +01:00
|
|
|
|
fviz_dend(hc, k = k,
|
2024-01-25 17:21:18 +01:00
|
|
|
|
cex = 0.5,
|
2024-02-27 09:08:20 +01:00
|
|
|
|
k_colors = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E",
|
|
|
|
|
"#000000", "gold", "#434F4F"),
|
2024-01-25 17:21:18 +01:00
|
|
|
|
#type = "phylogenic",
|
|
|
|
|
rect = TRUE
|
|
|
|
|
)
|
|
|
|
|
|
2024-02-07 18:00:25 +01:00
|
|
|
|
plot(hc)
|
|
|
|
|
rect.hclust(hc, k=8, border="red")
|
|
|
|
|
rect.hclust(hc, k=7, border="blue")
|
|
|
|
|
rect.hclust(hc, k=6, border="green")
|
|
|
|
|
|
2024-01-25 17:21:18 +01:00
|
|
|
|
p <- fviz_cluster(list(data = df, cluster = grp),
|
2024-02-27 09:08:20 +01:00
|
|
|
|
palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E",
|
|
|
|
|
"#000000", "#434F4F", "gold"),
|
2024-01-25 17:21:18 +01:00
|
|
|
|
ellipse.type = "convex",
|
|
|
|
|
repel = TRUE,
|
|
|
|
|
show.clust.cent = FALSE, ggtheme = theme_bw())
|
|
|
|
|
p
|
|
|
|
|
|
|
|
|
|
aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths,
|
2024-02-27 09:08:20 +01:00
|
|
|
|
ncases, nmoves, nflipCard, nopenTopic, nopenPopup) ~ grp,
|
|
|
|
|
datitem, median)
|
2024-02-07 18:00:25 +01:00
|
|
|
|
|
|
|
|
|
# Something like a scree plot (??)
|
|
|
|
|
plot(rev(seq_along(hc$height)), hc$height, type = "l")
|
|
|
|
|
points(rev(seq_along(hc$height)), hc$height, pch = 16, cex = .5)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2024-01-25 17:21:18 +01:00
|
|
|
|
|
2024-01-30 11:48:48 +01:00
|
|
|
|
datitem$item <- sprintf("%03d",
|
|
|
|
|
as.numeric(gsub("item_([0-9]{3})", "\\1", row.names(datitem))))
|
2024-01-25 17:21:18 +01:00
|
|
|
|
|
|
|
|
|
res <- merge(dat, datitem[, c("item", "grp")], by = "item", all.x = TRUE)
|
|
|
|
|
res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ]
|
|
|
|
|
|
2024-01-30 11:48:48 +01:00
|
|
|
|
# Look at clusters
|
2024-02-07 18:00:25 +01:00
|
|
|
|
par(mfrow = c(2,2))
|
2024-01-30 11:48:48 +01:00
|
|
|
|
vioplot::vioplot(duration ~ grp, res)
|
|
|
|
|
vioplot::vioplot(distance ~ grp, res)
|
|
|
|
|
vioplot::vioplot(scaleSize ~ grp, res)
|
|
|
|
|
vioplot::vioplot(rotationDegree ~ grp, res)
|
|
|
|
|
|
2024-01-25 17:21:18 +01:00
|
|
|
|
write.table(res,
|
|
|
|
|
file = "results/haum/event_logfiles_pre-corona_with-clusters.csv",
|
|
|
|
|
sep = ";",
|
|
|
|
|
quote = FALSE,
|
|
|
|
|
row.names = FALSE)
|
|
|
|
|
|
2024-01-30 11:48:48 +01:00
|
|
|
|
# DFGs for clusters
|
2024-01-25 17:21:18 +01:00
|
|
|
|
res$start <- res$date.start
|
|
|
|
|
res$complete <- res$date.stop
|
|
|
|
|
|
|
|
|
|
for (cluster in sort(unique(res$grp))) {
|
|
|
|
|
|
|
|
|
|
alog <- activitylog(res[res$grp == cluster, ],
|
|
|
|
|
case_id = "path",
|
|
|
|
|
activity_id = "event",
|
|
|
|
|
resource_id = "item",
|
|
|
|
|
timestamps = c("start", "complete"))
|
|
|
|
|
|
|
|
|
|
dfg <- process_map(alog,
|
2024-01-30 11:48:48 +01:00
|
|
|
|
type_nodes = frequency("relative", color_scale = "Greys"),
|
2024-01-25 17:21:18 +01:00
|
|
|
|
sec_nodes = frequency("absolute"),
|
2024-01-30 11:48:48 +01:00
|
|
|
|
type_edges = frequency("relative", color_edges = "#FF6900"),
|
2024-01-25 17:21:18 +01:00
|
|
|
|
sec_edges = frequency("absolute"),
|
|
|
|
|
rankdir = "LR",
|
|
|
|
|
render = FALSE)
|
|
|
|
|
export_map(dfg,
|
|
|
|
|
file_name = paste0("results/processmaps/dfg_cluster", cluster, "_R.pdf"),
|
|
|
|
|
file_type = "pdf",
|
|
|
|
|
title = paste("DFG Cluster", cluster))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#--------------- (3) Visualization with pictures ---------------
|
|
|
|
|
|
|
|
|
|
library(png)
|
|
|
|
|
library(jpeg)
|
|
|
|
|
library(grid)
|
|
|
|
|
|
2024-02-27 09:08:20 +01:00
|
|
|
|
colors <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
|
2024-01-25 17:21:18 +01:00
|
|
|
|
|
2024-02-27 09:08:20 +01:00
|
|
|
|
pdf("results/figures/clustering_artworks.pdf", height = 8, width = 8, pointsize = 10)
|
|
|
|
|
#png("results/figures/clustering_artworks.png", units = "in", height = 8, width = 8, pointsize = 10, res = 300)
|
2024-01-25 17:21:18 +01:00
|
|
|
|
|
2024-02-27 09:08:20 +01:00
|
|
|
|
par(mai = c(.4,.4,.1,.1), mgp = c(2.4, 1, 0))
|
2024-01-25 17:21:18 +01:00
|
|
|
|
|
2024-02-27 09:08:20 +01:00
|
|
|
|
plot(y ~ x, p$data, type = "n", ylim = c(-3.5, 2.8), xlim = c(-5, 10),
|
|
|
|
|
xlab = "", ylab = "")
|
2024-01-25 17:21:18 +01:00
|
|
|
|
|
|
|
|
|
for (item in sprintf("%03d", as.numeric(rownames(p$data)))) {
|
|
|
|
|
|
|
|
|
|
if (item == "125") {
|
|
|
|
|
|
|
|
|
|
pic <- readJPEG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
|
|
|
|
|
item, "/", item, ".jpg"))
|
|
|
|
|
} else {
|
|
|
|
|
pic <- readPNG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
|
|
|
|
|
item, "/", item, ".png"))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
img <- as.raster(pic[,,1:3])
|
|
|
|
|
|
|
|
|
|
x <- p$data$x[sprintf("%03d", as.numeric(rownames(p$data))) == item]
|
|
|
|
|
y <- p$data$y[sprintf("%03d", as.numeric(rownames(p$data))) == item]
|
|
|
|
|
|
|
|
|
|
points(x, y,
|
|
|
|
|
col = colors[p$data$cluster[sprintf("%03d", as.numeric(rownames(p$data))) == item]],
|
|
|
|
|
cex = 9,
|
|
|
|
|
pch = 15)
|
|
|
|
|
|
|
|
|
|
rasterImage(img,
|
2024-02-27 09:08:20 +01:00
|
|
|
|
xleft = x - .45,
|
|
|
|
|
xright = x + .45,
|
2024-01-25 17:21:18 +01:00
|
|
|
|
ybottom = y - .2,
|
|
|
|
|
ytop = y + .2)
|
|
|
|
|
|
|
|
|
|
}
|
2024-02-27 09:08:20 +01:00
|
|
|
|
legend("topright", paste("Cluster", 1:k), col = colors, pch = 15, bty = "n")
|
2024-01-25 17:21:18 +01:00
|
|
|
|
|
|
|
|
|
dev.off()
|
|
|
|
|
|