2024-03-08 11:52:55 +01:00
|
|
|
# 10_user-navigation.R
|
|
|
|
#
|
|
|
|
# content: (1) Load data
|
|
|
|
# (2) Clustering
|
|
|
|
# (3) Fit tree
|
|
|
|
#
|
2024-03-15 16:29:50 +01:00
|
|
|
# input: results/haum/dataframes_case_2019.RData
|
|
|
|
# output: results/haum/eventlogs_2019_case-clusters.csv
|
|
|
|
# results/haum/tmp_user-navigation.RData
|
|
|
|
# ../../thesis/figures/data/clustering_cases.RData
|
2024-03-08 11:52:55 +01:00
|
|
|
#
|
2024-03-15 16:29:50 +01:00
|
|
|
# last mod: 2024-03-15
|
2024-03-08 11:52:55 +01:00
|
|
|
|
|
|
|
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
|
|
|
|
|
|
|
|
#--------------- (1) Load data ---------------
|
|
|
|
|
|
|
|
load("results/haum/dataframes_case_2019.RData")
|
|
|
|
|
|
|
|
#--------------- (2) Clustering ---------------
|
|
|
|
|
2024-03-08 18:06:37 +01:00
|
|
|
df <- dattree[, c("PropItems", "SearchInfo", "PropMoves", "AvDurItemNorm",
|
|
|
|
"Pattern", "InfocardOnly")]
|
2024-03-08 11:52:55 +01:00
|
|
|
|
|
|
|
summary(df)
|
|
|
|
|
2024-03-08 18:06:37 +01:00
|
|
|
#dist_mat <- cluster::daisy(df, metric = "euclidean")
|
2024-03-08 11:52:55 +01:00
|
|
|
dist_mat <- cluster::daisy(df, metric = "gower")
|
|
|
|
|
|
|
|
# "Flatten" with MDS
|
2024-03-08 18:06:37 +01:00
|
|
|
# coor_3d <- as.data.frame(cmdscale(dist_mat, k = 3))
|
|
|
|
# coor_3d <- prcomp(df)$x[, 1:3]
|
|
|
|
|
2024-03-09 17:22:46 +01:00
|
|
|
coor_3d <- smacof::mds(dist_mat, ndim = 3, type = "ordinal")$conf
|
|
|
|
coor_2d <- coor_3d[, 1:2]
|
2024-03-08 11:52:55 +01:00
|
|
|
|
|
|
|
plot(coor_2d)
|
|
|
|
rgl::plot3d(coor_3d)
|
|
|
|
|
2024-03-13 18:14:57 +01:00
|
|
|
# pm <- cluster::pam(dist_mat, k = k)
|
|
|
|
# cluster <- pm$clustering
|
|
|
|
# --> Does not look as good as the hierarchical clustring
|
|
|
|
|
2024-03-09 17:22:46 +01:00
|
|
|
method <- c(average = "average", single = "single", complete = "complete",
|
|
|
|
ward = "ward")
|
|
|
|
hcs <- pbapply::pblapply(method, function(x) cluster::agnes(dist_mat, method = x))
|
|
|
|
acs <- pbapply::pbsapply(hcs, function(x) x$ac)
|
|
|
|
# average single complete ward
|
|
|
|
# 0.9881224 0.9725661 0.9937669 0.9994267
|
|
|
|
hc <- hcs$ward
|
2024-03-08 11:52:55 +01:00
|
|
|
|
2024-03-09 17:22:46 +01:00
|
|
|
#hc <- cluster::agnes(dist_mat, method = "ward")
|
2024-03-08 11:52:55 +01:00
|
|
|
|
2024-03-08 18:06:37 +01:00
|
|
|
k <- 5
|
2024-03-08 11:52:55 +01:00
|
|
|
|
2024-03-14 17:28:10 +01:00
|
|
|
mycols <- c("#3CB4DC", "#FF6900", "#78004B", "#91C86E", "#434F4F")
|
2024-03-08 11:52:55 +01:00
|
|
|
|
|
|
|
cluster <- cutree(as.hclust(hc), k = k)
|
|
|
|
|
|
|
|
table(cluster)
|
|
|
|
|
2024-03-14 17:28:10 +01:00
|
|
|
plot(coor_2d, col = mycols[cluster], pch = 16)
|
|
|
|
#legend("topleft", paste("Cl", 1:5), col = mycols, pch = 21)
|
|
|
|
legend("topleft", c("Scanning", "Exploring", "Flitting", "Searching", "Info"),
|
|
|
|
col = mycols, bty = "n", pch = 16)
|
2024-03-08 11:52:55 +01:00
|
|
|
rgl::plot3d(coor_3d, col = mycols[cluster])
|
|
|
|
|
2024-03-14 17:28:10 +01:00
|
|
|
print(ftable(xtabs( ~ InfocardOnly + Pattern + cluster, dattree)), zero = "-")
|
2024-03-08 11:52:55 +01:00
|
|
|
|
|
|
|
aggregate(. ~ cluster, df, mean)
|
|
|
|
|
2024-03-08 18:06:37 +01:00
|
|
|
aggregate(cbind(duration, distance, scaleSize, rotationDegree, length, nitems,
|
2024-03-08 11:52:55 +01:00
|
|
|
nmove, nflipCard, nopenTopic, nopenPopup) ~ cluster, datcase,
|
|
|
|
mean)
|
|
|
|
|
|
|
|
### Look at selected cases ###########################################
|
|
|
|
tmp <- dat
|
|
|
|
tmp$start <- tmp$date.start
|
|
|
|
tmp$complete <- tmp$date.stop
|
|
|
|
|
2024-03-13 18:14:57 +01:00
|
|
|
# Examples:
|
|
|
|
## Scholar: 29679
|
|
|
|
## Star: 24456
|
|
|
|
## Dispersion: 26000
|
|
|
|
|
|
|
|
## only info cards: 24299
|
|
|
|
## not only info cards: 24013
|
|
|
|
|
|
|
|
#head(dattree[dattree$Pattern == "Dispersion" & datcase$nitems == 10, ])
|
|
|
|
#head(dattree[dattree$InfocardOnly == "yes" & datcase$nitems == 3, ])
|
|
|
|
|
|
|
|
alog <- bupaR::activitylog(tmp[tmp$case == 24013, ],
|
2024-03-08 18:06:37 +01:00
|
|
|
case_id = "case",
|
|
|
|
activity_id = "item",
|
|
|
|
resource_id = "path",
|
|
|
|
timestamps = c("start", "complete"))
|
2024-03-08 11:52:55 +01:00
|
|
|
|
2024-03-08 18:06:37 +01:00
|
|
|
processmapR::process_map(alog)
|
2024-03-08 11:52:55 +01:00
|
|
|
|
|
|
|
rm(tmp)
|
|
|
|
|
|
|
|
######################################################################
|
|
|
|
|
|
|
|
res <- merge(dat, data.frame(case = dattree$case, cluster),
|
|
|
|
by = "case", all.x = TRUE)
|
|
|
|
res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ]
|
|
|
|
|
|
|
|
xtabs( ~ item + cluster, res)
|
|
|
|
aggregate(event ~ cluster, res, table)
|
|
|
|
|
|
|
|
# Look at clusters
|
|
|
|
par(mfrow = c(2, 2))
|
|
|
|
vioplot::vioplot(duration ~ cluster, res)
|
|
|
|
vioplot::vioplot(distance ~ cluster, res)
|
|
|
|
vioplot::vioplot(scaleSize ~ cluster, res)
|
|
|
|
vioplot::vioplot(rotationDegree ~ cluster, res)
|
|
|
|
|
|
|
|
aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ cluster, res, mean)
|
|
|
|
aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ cluster, res, median)
|
|
|
|
|
|
|
|
write.table(res,
|
|
|
|
file = "results/haum/eventlogs_2019_case-clusters.csv",
|
|
|
|
sep = ";",
|
|
|
|
quote = FALSE,
|
|
|
|
row.names = FALSE)
|
|
|
|
|
2024-03-08 18:06:37 +01:00
|
|
|
save(res, dist_mat, hcs, acs, coor_2d, coor_3d,
|
2024-03-08 11:52:55 +01:00
|
|
|
file = "results/haum/tmp_user-navigation.RData")
|
|
|
|
|
2024-03-15 16:29:50 +01:00
|
|
|
save(coor_2d, coor_3d, cluster, dattree,
|
2024-03-14 17:28:10 +01:00
|
|
|
file = "../../thesis/figures/data/clustering_cases.RData")
|
|
|
|
|
|
|
|
|
2024-03-08 11:52:55 +01:00
|
|
|
#--------------- (3) Fit tree ---------------
|
|
|
|
|
2024-03-08 18:06:37 +01:00
|
|
|
c1 <- rpart::rpart(as.factor(cluster) ~ ., data = dattree[, c("PropMoves",
|
|
|
|
"PropItems",
|
|
|
|
"SearchInfo",
|
|
|
|
"AvDurItem",
|
|
|
|
"Pattern",
|
|
|
|
"InfocardOnly")],
|
|
|
|
method = "class")
|
|
|
|
|
2024-03-14 17:28:10 +01:00
|
|
|
plot(partykit::as.party(c1), tp_args = list(fill = mycols, col = mycols))
|
2024-03-08 11:52:55 +01:00
|
|
|
|
|
|
|
# with conditional tree
|
2024-03-08 18:06:37 +01:00
|
|
|
c2 <- partykit::ctree(as.factor(cluster) ~ ., data = dattree[, c("PropMoves",
|
|
|
|
"PropItems",
|
|
|
|
"SearchInfo",
|
|
|
|
"AvDurItem",
|
|
|
|
"Pattern",
|
|
|
|
"InfocardOnly")],
|
|
|
|
alpha = 0.001)
|
2024-03-09 17:22:46 +01:00
|
|
|
|
2024-03-14 17:28:10 +01:00
|
|
|
plot(c2, tp_args = list(fill = mycols, col = mycols))
|
2024-03-08 11:52:55 +01:00
|
|
|
|
2024-03-14 17:28:10 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
factoextra::fviz_dend(as.hclust(hc), k = k,
|
|
|
|
cex = 0.5,
|
|
|
|
k_colors = mycols,
|
2024-03-15 16:29:50 +01:00
|
|
|
type = "phylogenic",
|
2024-03-14 17:28:10 +01:00
|
|
|
rect = TRUE,
|
|
|
|
main = "",
|
|
|
|
ylab = ""
|
|
|
|
#ggtheme = ggplot2::theme_bw()
|
|
|
|
)
|
|
|
|
|