mtt_haum/code/07_item-clustering.R

262 lines
8.4 KiB
R
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# 07_item-clustering.R
#
# content: (1) Read data
# (1.1) Read log event data
# (1.2) Read infos for PM for infos
# (1.3) Extract additional infos for clustering
# (2) Clustering
# (3) Visualization with pictures
#
# input: results/haum/event_logfiles_2024-02-21_16-07-33.csv
# results/haum/pn_infos_items.csv
# output: results/haum/event_logfiles_pre-corona_with-clusters.csv
#
# last mod: 2024-02-23
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
library(bupaverse)
library(factoextra)
#--------------- (1) Read data ---------------
#--------------- (1.1) Read log event data ---------------
dat0 <- read.table("results/haum/event_logfiles_2024-02-21_16-07-33.csv",
colClasses = c("character", "character", "POSIXct",
"POSIXct", "character", "integer",
"numeric", "character", "character",
rep("numeric", 3), "character",
"character", rep("numeric", 11),
"character", "character"),
sep = ";", header = TRUE)
dat0$event <- factor(dat0$event, levels = c("move", "flipCard", "openTopic",
"openPopup"))
# TODO: Maybe look at this with complete data?
# Select data pre Corona
dat <- dat0[as.Date(dat0$date.start) < "2020-03-13", ]
dat <- dat[dat$path != 106098, ]
#--------------- (1.2) Read infos for PM for items ---------------
datitem <- read.table("results/haum/pn_infos_items.csv", header = TRUE,
sep = ";", row.names = 1)
#--------------- (1.3) Extract additional infos for clustering ---------------
dat_split <- split(dat, ~ path)
time_minmax <- function(subdata) {
subdata$min_time <- min(subdata$timeMs.start)
if (all(is.na(subdata$timeMs.stop))) {
subdata$max_time <- NA
} else {
subdata$max_time <- max(subdata$timeMs.stop, na.rm = TRUE)
}
subdata
}
dat_list <- pbapply::pblapply(dat_split, time_minmax)
dat_minmax <- dplyr::bind_rows(dat_list)
datpath <- aggregate(duration ~ item + path, dat, mean, na.action = NULL)
datpath$min_time <- aggregate(min_time ~ path, dat_minmax, unique, na.action = NULL)$min_time
datpath$max_time <- aggregate(max_time ~ path, dat_minmax, unique, na.action = NULL)$max_time
datpath$duration_path <- datpath$max_time - datpath$min_time
# average duration per path
datitem$duration <- aggregate(duration ~ item, datpath, mean)$duration
datitem$distance <- aggregate(distance ~ item, dat, mean)$distance
datitem$scaleSize <- aggregate(scaleSize ~ item, dat, mean)$scaleSize
datitem$rotationDegree <- aggregate(rotationDegree ~ item, dat, mean)$rotationDegree
datitem$npaths <- aggregate(path ~ item, dat, function(x) length(unique(x)))$path
datitem$ncases <- aggregate(case ~ item, dat, function(x) length(unique(x)))$case
datitem$nmoves <- aggregate(event ~ item, dat, table)$event[,"move"]
datitem$nflipCard <- aggregate(event ~ item, dat, table)$event[,"flipCard"]
datitem$nopenTopic <- aggregate(event ~ item, dat, table)$event[,"openTopic"]
datitem$nopenPopup <- aggregate(event ~ item, dat, table)$event[,"openPopup"]
#--------------- (2) Clustering ---------------
df <- datitem[, c("precision", "generalizability", "nvariants", "duration",
"distance", "scaleSize", "rotationDegree", "npaths",
"ncases", "nmoves", "nopenTopic", "nopenPopup")] |>
scale()
mat <- dist(df)
heatmap(as.matrix(mat))
# Choosing best linkage method
h1 <- hclust(mat, method = "average")
h2 <- hclust(mat, method = "complete")
h3 <- hclust(mat, method = "ward.D")
h4 <- hclust(mat, method = "ward.D2")
h5 <- hclust(mat, method = "single")
# Cophenetic Distances, for each linkage
c1 <- cophenetic(h1)
c2 <- cophenetic(h2)
c3 <- cophenetic(h3)
c4 <- cophenetic(h4)
c5 <- cophenetic(h5)
# Correlations
cor(mat, c1)
cor(mat, c2)
cor(mat, c3)
cor(mat, c4)
cor(mat, c5)
# https://en.wikipedia.org/wiki/Cophenetic_correlation
# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
# Dendograms
par(mfrow=c(3,2))
plot(h1, main = "Average Linkage")
plot(h2, main = "Complete Linkage")
plot(h3, main = "Ward Linkage")
plot(h4, main = "Ward 2 Linkage")
plot(h5, main = "Single Linkage")
hc <- h1
# Note that agnes(*, method="ward") corresponds to hclust(*, "ward.D2")
k <- 4 # number of clusters
grp <- cutree(hc, k = k)
datitem$grp <- grp
fviz_dend(hc, k = k,
cex = 0.5,
k_colors = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E",
"#000000", "gold", "#434F4F"),
#type = "phylogenic",
rect = TRUE
)
plot(hc)
rect.hclust(hc, k=8, border="red")
rect.hclust(hc, k=7, border="blue")
rect.hclust(hc, k=6, border="green")
p <- fviz_cluster(list(data = df, cluster = grp),
palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E",
"#000000", "#434F4F", "gold"),
ellipse.type = "convex",
repel = TRUE,
show.clust.cent = FALSE, ggtheme = theme_bw())
p
aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths,
ncases, nmoves, nflipCard, nopenTopic, nopenPopup) ~ grp,
datitem, median)
# Something like a scree plot (??)
plot(rev(seq_along(hc$height)), hc$height, type = "l")
points(rev(seq_along(hc$height)), hc$height, pch = 16, cex = .5)
datitem$item <- sprintf("%03d",
as.numeric(gsub("item_([0-9]{3})", "\\1", row.names(datitem))))
res <- merge(dat, datitem[, c("item", "grp")], by = "item", all.x = TRUE)
res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ]
# Look at clusters
par(mfrow = c(2,2))
vioplot::vioplot(duration ~ grp, res)
vioplot::vioplot(distance ~ grp, res)
vioplot::vioplot(scaleSize ~ grp, res)
vioplot::vioplot(rotationDegree ~ grp, res)
write.table(res,
file = "results/haum/event_logfiles_pre-corona_with-clusters.csv",
sep = ";",
quote = FALSE,
row.names = FALSE)
# DFGs for clusters
res$start <- res$date.start
res$complete <- res$date.stop
for (cluster in sort(unique(res$grp))) {
alog <- activitylog(res[res$grp == cluster, ],
case_id = "path",
activity_id = "event",
resource_id = "item",
timestamps = c("start", "complete"))
dfg <- process_map(alog,
type_nodes = frequency("relative", color_scale = "Greys"),
sec_nodes = frequency("absolute"),
type_edges = frequency("relative", color_edges = "#FF6900"),
sec_edges = frequency("absolute"),
rankdir = "LR",
render = FALSE)
export_map(dfg,
file_name = paste0("results/processmaps/dfg_cluster", cluster, "_R.pdf"),
file_type = "pdf",
title = paste("DFG Cluster", cluster))
}
#--------------- (3) Visualization with pictures ---------------
library(png)
library(jpeg)
library(grid)
colors <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
pdf("results/figures/clustering_artworks.pdf", height = 8, width = 8, pointsize = 10)
#png("results/figures/clustering_artworks.png", units = "in", height = 8, width = 8, pointsize = 10, res = 300)
par(mai = c(.4,.4,.1,.1), mgp = c(2.4, 1, 0))
plot(y ~ x, p$data, type = "n", ylim = c(-3.5, 2.8), xlim = c(-5, 10),
xlab = "", ylab = "")
for (item in sprintf("%03d", as.numeric(rownames(p$data)))) {
if (item == "125") {
pic <- readJPEG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
item, "/", item, ".jpg"))
} else {
pic <- readPNG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
item, "/", item, ".png"))
}
img <- as.raster(pic[,,1:3])
x <- p$data$x[sprintf("%03d", as.numeric(rownames(p$data))) == item]
y <- p$data$y[sprintf("%03d", as.numeric(rownames(p$data))) == item]
points(x, y,
col = colors[p$data$cluster[sprintf("%03d", as.numeric(rownames(p$data))) == item]],
cex = 9,
pch = 15)
rasterImage(img,
xleft = x - .45,
xright = x + .45,
ybottom = y - .2,
ytop = y + .2)
}
legend("topright", paste("Cluster", 1:k), col = colors, pch = 15, bty = "n")
dev.off()