# 10_user-navigation.R # # content: (1) Load data # (2) Clustering # (3) Fit tree # (4) Investigate variants # # input: results/haum/event_logfiles_2024-02-21_16-07-33.csv # output: results/haum/eventlogs_pre-corona_case-clusters.csv # # last mod: 2024-03-08 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") library(bupaverse) library(factoextra) library(rpart) library(partykit) #--------------- (1) Load data --------------- load("results/haum/dataframes_case_2019.RData") #--------------- (2) Clustering --------------- df <- dattree[, -1] summary(df) # Look at collinearity cor_mat <- cor(df) diag(cor_mat) <- NA heatmap(cor_mat) #--------------- (2.2) Hierarchical clustering --------------- dist_mat <- cluster::daisy(df, metric = "gower") # "Flatten" with MDS coor_2d <- as.data.frame(cmdscale(dist_mat, k = 2)) coor_3d <- as.data.frame(cmdscale(dist_mat, k = 3)) # TODO: Better use MASS::isoMDS() since I am not using Euclidean distances? plot(coor_2d) rgl::plot3d(coor_3d) method <- c(average = "average", single = "single", complete = "complete", ward = "ward") method <- "ward" hcs <- pbapply::pblapply(method, function(x) cluster::agnes(dist_mat, method = x)) acs <- pbapply::pbsapply(hcs, function(x) x$ac) hc <- hcs$ward # Something like a scree plot (??) plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5) k <- 4 mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E") cluster <- cutree(as.hclust(hc), k = k) table(cluster) plot(coor_2d, col = mycols[cluster]) legend("topleft", paste("Cl", 1:4), col = mycols, pch = 21) rgl::plot3d(coor_3d, col = mycols[cluster]) table(dattree[cluster == 1, "Pattern"]) table(dattree[cluster == 2, "Pattern"]) table(dattree[cluster == 3, "Pattern"]) table(dattree[cluster == 4, "Pattern"]) table(dattree[cluster == 1, "InfocardOnly"]) table(dattree[cluster == 2, "InfocardOnly"]) table(dattree[cluster == 3, "InfocardOnly"]) table(dattree[cluster == 4, "InfocardOnly"]) aggregate(. ~ cluster, df, mean) aggregate(cbind(duration, distance, scaleSize, rotationDegree, length, nmove, nflipCard, nopenTopic, nopenPopup) ~ cluster, datcase, mean) ### Look at selected cases ########################################### tmp <- dat tmp$start <- tmp$date.start tmp$complete <- tmp$date.stop alog <- activitylog(tmp[tmp$case == 24016, ], case_id = "case", activity_id = "item", resource_id = "path", timestamps = c("start", "complete")) process_map(alog) rm(tmp) ###################################################################### res <- merge(dat, data.frame(case = dattree$case, cluster), by = "case", all.x = TRUE) res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ] xtabs( ~ item + cluster, res) aggregate(event ~ cluster, res, table) # Look at clusters par(mfrow = c(2, 2)) vioplot::vioplot(duration ~ cluster, res) vioplot::vioplot(distance ~ cluster, res) vioplot::vioplot(scaleSize ~ cluster, res) vioplot::vioplot(rotationDegree ~ cluster, res) aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ cluster, res, mean) aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ cluster, res, median) write.table(res, file = "results/haum/eventlogs_2019_case-clusters.csv", sep = ";", quote = FALSE, row.names = FALSE) save(res, dist_mat, hcs, acs, datcase, dattree, coor_2d, coor_3d, file = "results/haum/tmp_user-navigation.RData") #--------------- (3) Fit tree --------------- c1 <- rpart(as.factor(cluster) ~ ., data = dattree[, -1], method = "class") plot(as.party(c1)) # with conditional tree c2 <- ctree(as.factor(cluster) ~ ., data = dattree[, -1], alpha = 0) plot(c2)