Cleaned up item clustering; works well now
This commit is contained in:
parent
66fab4fa18
commit
26ba7265f5
@ -10,13 +10,14 @@
|
|||||||
# input: results/haum/eventlogs_pre-corona_cleaned.RData
|
# input: results/haum/eventlogs_pre-corona_cleaned.RData
|
||||||
# results/haum/pn_infos_items.csv
|
# results/haum/pn_infos_items.csv
|
||||||
# output: results/haum/eventlogs_pre-corona_item-clusters.csv
|
# output: results/haum/eventlogs_pre-corona_item-clusters.csv
|
||||||
|
# results/figures/clustering_artworks.pdf
|
||||||
|
# results/figures/clustering_artworks.png
|
||||||
#
|
#
|
||||||
# last mod: 2024-03-06
|
# last mod: 2024-03-08
|
||||||
|
|
||||||
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
|
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
|
||||||
|
|
||||||
library(bupaverse)
|
source("R_helpers.R")
|
||||||
library(factoextra)
|
|
||||||
|
|
||||||
#--------------- (1) Read data ---------------
|
#--------------- (1) Read data ---------------
|
||||||
|
|
||||||
@ -31,17 +32,6 @@ datitem <- read.table("results/haum/pn_infos_items.csv", header = TRUE,
|
|||||||
|
|
||||||
#--------------- (1.3) Extract additional infos for clustering ---------------
|
#--------------- (1.3) Extract additional infos for clustering ---------------
|
||||||
|
|
||||||
time_minmax_ms <- function(subdata) {
|
|
||||||
subdata$min_time <- min(subdata$timeMs.start)
|
|
||||||
if (all(is.na(subdata$timeMs.stop))) {
|
|
||||||
subdata$max_time <- NA
|
|
||||||
} else {
|
|
||||||
subdata$max_time <- max(subdata$timeMs.stop, na.rm = TRUE)
|
|
||||||
}
|
|
||||||
subdata
|
|
||||||
}
|
|
||||||
# TODO: Move to helper file
|
|
||||||
|
|
||||||
# Get average duration per path
|
# Get average duration per path
|
||||||
dat_split <- split(dat, ~ path)
|
dat_split <- split(dat, ~ path)
|
||||||
dat_list <- pbapply::pblapply(dat_split, time_minmax_ms)
|
dat_list <- pbapply::pblapply(dat_split, time_minmax_ms)
|
||||||
@ -63,11 +53,14 @@ datitem$nflipCard <- aggregate(event ~ item, dat, table)$event[,"flipCard"]
|
|||||||
datitem$nopenTopic <- aggregate(event ~ item, dat, table)$event[,"openTopic"]
|
datitem$nopenTopic <- aggregate(event ~ item, dat, table)$event[,"openTopic"]
|
||||||
datitem$nopenPopup <- aggregate(event ~ item, dat, table)$event[,"openPopup"]
|
datitem$nopenPopup <- aggregate(event ~ item, dat, table)$event[,"openPopup"]
|
||||||
|
|
||||||
|
rm(datpath)
|
||||||
|
|
||||||
#--------------- (2) Clustering ---------------
|
#--------------- (2) Clustering ---------------
|
||||||
|
|
||||||
df <- datitem[, c("precision", "generalizability", "nvariants", "duration",
|
df <- datitem[, c("precision", "generalizability", "nvariants", "duration",
|
||||||
"distance", "scaleSize", "rotationDegree", "npaths",
|
"distance", "scaleSize", "rotationDegree", "npaths",
|
||||||
"ncases", "nmoves", "nopenTopic", "nopenPopup")] |>
|
"ncases", "nmoves", "nflipCard", "nopenTopic",
|
||||||
|
"nopenPopup")] |>
|
||||||
scale()
|
scale()
|
||||||
|
|
||||||
dist_mat <- dist(df)
|
dist_mat <- dist(df)
|
||||||
@ -87,52 +80,54 @@ for (hc in hcs) plot(hc, main = "")
|
|||||||
|
|
||||||
hc <- hcs$ward
|
hc <- hcs$ward
|
||||||
|
|
||||||
k <- 4 # number of clusters
|
factoextra::fviz_nbclust(df, FUNcluster = factoextra::hcut, method = "wss")
|
||||||
|
factoextra::fviz_nbclust(df, FUNcluster = factoextra::hcut, method = "silhouette")
|
||||||
|
|
||||||
mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
|
gap_stat <- cluster::clusGap(df, FUNcluster = factoextra::hcut,
|
||||||
|
hc_func = "agnes", hc_method = "ward",
|
||||||
|
K.max = 10)
|
||||||
|
factoextra::fviz_gap_stat(gap_stat)
|
||||||
|
|
||||||
grp <- cutree(hc, k = k)
|
k <- 6 # number of clusters
|
||||||
datitem$grp <- grp
|
|
||||||
|
|
||||||
fviz_dend(hc, k = k,
|
mycols <- c("#434F4F", "#78004B", "#FF6900", "#3CB4DC", "#91C86E", "Black")
|
||||||
|
|
||||||
|
cluster <- cutree(hc, k = k)
|
||||||
|
|
||||||
|
factoextra::fviz_dend(hc, k = k,
|
||||||
cex = 0.5,
|
cex = 0.5,
|
||||||
k_colors = mycols,
|
k_colors = mycols,
|
||||||
#type = "phylogenic",
|
#type = "phylogenic",
|
||||||
rect = TRUE
|
rect = TRUE
|
||||||
)
|
)
|
||||||
|
|
||||||
p <- fviz_cluster(list(data = df, cluster = grp),
|
factoextra::fviz_cluster(list(data = df, cluster = cluster),
|
||||||
palette = mycols,
|
palette = mycols,
|
||||||
ellipse.type = "convex",
|
ellipse.type = "convex",
|
||||||
repel = TRUE,
|
repel = TRUE,
|
||||||
show.clust.cent = FALSE, ggtheme = theme_bw())
|
show.clust.cent = FALSE,
|
||||||
p
|
ggtheme = ggplot2::theme_bw())
|
||||||
|
|
||||||
aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths,
|
aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths,
|
||||||
ncases, nmoves, nflipCard, nopenTopic, nopenPopup) ~ grp,
|
ncases, nmoves, nflipCard, nopenTopic, nopenPopup) ~ cluster,
|
||||||
datitem, mean)
|
datitem, mean)
|
||||||
|
|
||||||
|
|
||||||
aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths,
|
aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths,
|
||||||
ncases, nmoves, nflipCard, nopenTopic, nopenPopup) ~ grp,
|
ncases, nmoves, nflipCard, nopenTopic, nopenPopup) ~ cluster,
|
||||||
datitem, max)
|
datitem, max)
|
||||||
|
|
||||||
|
item <- sprintf("%03d", as.numeric(gsub("item_([0-9]{3})", "\\1",
|
||||||
|
row.names(datitem))))
|
||||||
|
|
||||||
# Something like a scree plot (??)
|
res <- merge(dat, data.frame(item, cluster), by = "item", all.x = TRUE)
|
||||||
plot(rev(hc$height), type = "b", pch = 16, cex = .5)
|
|
||||||
|
|
||||||
datitem$item <- sprintf("%03d",
|
|
||||||
as.numeric(gsub("item_([0-9]{3})", "\\1", row.names(datitem))))
|
|
||||||
|
|
||||||
res <- merge(dat, datitem[, c("item", "grp")], by = "item", all.x = TRUE)
|
|
||||||
res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ]
|
res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ]
|
||||||
|
|
||||||
# Look at clusters
|
# Look at clusters
|
||||||
par(mfrow = c(2,2))
|
par(mfrow = c(2,2))
|
||||||
vioplot::vioplot(duration ~ grp, res)
|
vioplot::vioplot(duration ~ cluster, res)
|
||||||
vioplot::vioplot(distance ~ grp, res)
|
vioplot::vioplot(distance ~ cluster, res)
|
||||||
vioplot::vioplot(scaleSize ~ grp, res)
|
vioplot::vioplot(scaleSize ~ cluster, res)
|
||||||
vioplot::vioplot(rotationDegree ~ grp, res)
|
vioplot::vioplot(rotationDegree ~ cluster, res)
|
||||||
|
|
||||||
write.table(res,
|
write.table(res,
|
||||||
file = "results/haum/eventlogs_pre-corona_item-clusters.csv",
|
file = "results/haum/eventlogs_pre-corona_item-clusters.csv",
|
||||||
@ -144,59 +139,60 @@ write.table(res,
|
|||||||
res$start <- res$date.start
|
res$start <- res$date.start
|
||||||
res$complete <- res$date.stop
|
res$complete <- res$date.stop
|
||||||
|
|
||||||
for (cluster in sort(unique(res$grp))) {
|
for (clst in sort(unique(res$cluster))) {
|
||||||
|
|
||||||
alog <- activitylog(res[res$grp == cluster, ],
|
alog <- bupaR::activitylog(res[res$cluster == clst, ],
|
||||||
case_id = "path",
|
case_id = "path",
|
||||||
activity_id = "event",
|
activity_id = "event",
|
||||||
resource_id = "item",
|
resource_id = "item",
|
||||||
timestamps = c("start", "complete"))
|
timestamps = c("start", "complete"))
|
||||||
|
|
||||||
dfg <- process_map(alog,
|
dfg <- processmapR::process_map(alog,
|
||||||
type_nodes = frequency("relative", color_scale = "Greys"),
|
type_nodes = processmapR::frequency("relative", color_scale = "Greys"),
|
||||||
sec_nodes = frequency("absolute"),
|
sec_nodes = processmapR::frequency("absolute"),
|
||||||
type_edges = frequency("relative", color_edges = "#FF6900"),
|
type_edges = processmapR::frequency("relative", color_edges = "#FF6900"),
|
||||||
sec_edges = frequency("absolute"),
|
sec_edges = processmapR::frequency("absolute"),
|
||||||
rankdir = "LR",
|
rankdir = "LR",
|
||||||
render = FALSE)
|
render = FALSE)
|
||||||
export_map(dfg,
|
|
||||||
file_name = paste0("results/processmaps/dfg_cluster", cluster, "_R.pdf"),
|
processmapR::export_map(dfg,
|
||||||
|
file_name = paste0("results/processmaps/dfg_cluster", clst, "_R.pdf"),
|
||||||
file_type = "pdf",
|
file_type = "pdf",
|
||||||
title = paste("DFG Cluster", cluster))
|
title = paste("DFG Cluster", clst))
|
||||||
}
|
}
|
||||||
|
|
||||||
#--------------- (3) Visualization with pictures ---------------
|
#--------------- (3) Visualization with pictures ---------------
|
||||||
|
|
||||||
library(png)
|
coor_2d <- cmdscale(dist_mat, k = 2)
|
||||||
library(jpeg)
|
|
||||||
library(grid)
|
|
||||||
|
|
||||||
pdf("results/figures/clustering_artworks.pdf", height = 8, width = 8, pointsize = 10)
|
items <- sprintf("%03d", as.numeric(rownames(datitem)))
|
||||||
#png("results/figures/clustering_artworks.png", units = "in", height = 8, width = 8, pointsize = 10, res = 300)
|
|
||||||
|
#pdf("results/figures/clustering_artworks.pdf", height = 8, width = 8, pointsize = 10)
|
||||||
|
png("results/figures/clustering_artworks.png", units = "in", height = 8, width = 8, pointsize = 10, res = 300)
|
||||||
|
|
||||||
par(mai = c(.4,.4,.1,.1), mgp = c(2.4, 1, 0))
|
par(mai = c(.4,.4,.1,.1), mgp = c(2.4, 1, 0))
|
||||||
|
|
||||||
plot(y ~ x, p$data, type = "n", ylim = c(-3.5, 2.8), xlim = c(-5, 10),
|
plot(coor_2d, type = "n", ylim = c(-3.7, 2.6), xlim = c(-5, 10.5),
|
||||||
xlab = "", ylab = "")
|
xlab = "", ylab = "")
|
||||||
|
|
||||||
for (item in sprintf("%03d", as.numeric(rownames(p$data)))) {
|
for (item in items) {
|
||||||
|
|
||||||
if (item == "125") {
|
if (item == "125") {
|
||||||
|
|
||||||
pic <- readJPEG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
|
pic <- jpeg::readJPEG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
|
||||||
item, "/", item, ".jpg"))
|
item, "/", item, ".jpg"))
|
||||||
} else {
|
} else {
|
||||||
pic <- readPNG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
|
pic <- png::readPNG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
|
||||||
item, "/", item, ".png"))
|
item, "/", item, ".png"))
|
||||||
}
|
}
|
||||||
|
|
||||||
img <- as.raster(pic[,,1:3])
|
img <- as.raster(pic[,,1:3])
|
||||||
|
|
||||||
x <- p$data$x[sprintf("%03d", as.numeric(rownames(p$data))) == item]
|
x <- coor_2d[items == item, 1]
|
||||||
y <- p$data$y[sprintf("%03d", as.numeric(rownames(p$data))) == item]
|
y <- coor_2d[items == item, 2]
|
||||||
|
|
||||||
points(x, y,
|
points(x, y,
|
||||||
col = mycols[p$data$cluster[sprintf("%03d", as.numeric(rownames(p$data))) == item]],
|
col = mycols[cluster[items == item]],
|
||||||
cex = 9,
|
cex = 9,
|
||||||
pch = 15)
|
pch = 15)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user