From 29ac6c15d48d63e7072ed822aa78d507abcc6c3b Mon Sep 17 00:00:00 2001 From: nwickel Date: Fri, 8 Mar 2024 18:06:37 +0100 Subject: [PATCH] More work on case clustering --- code/10_user-navigation.R | 90 +++++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 42 deletions(-) diff --git a/code/10_user-navigation.R b/code/10_user-navigation.R index 29ff354..72cfb35 100644 --- a/code/10_user-navigation.R +++ b/code/10_user-navigation.R @@ -10,57 +10,46 @@ # # last mod: 2024-03-08 - # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") -library(bupaverse) -library(factoextra) -library(rpart) -library(partykit) - #--------------- (1) Load data --------------- load("results/haum/dataframes_case_2019.RData") #--------------- (2) Clustering --------------- -df <- dattree[, -1] +df <- dattree[, c("PropItems", "SearchInfo", "PropMoves", "AvDurItemNorm", + "Pattern", "InfocardOnly")] summary(df) -# Look at collinearity -cor_mat <- cor(df) -diag(cor_mat) <- NA -heatmap(cor_mat) - -#--------------- (2.2) Hierarchical clustering --------------- - +#dist_mat <- cluster::daisy(df, metric = "euclidean") dist_mat <- cluster::daisy(df, metric = "gower") # "Flatten" with MDS -coor_2d <- as.data.frame(cmdscale(dist_mat, k = 2)) -coor_3d <- as.data.frame(cmdscale(dist_mat, k = 3)) -# TODO: Better use MASS::isoMDS() since I am not using Euclidean distances? +# coor_2d <- as.data.frame(cmdscale(dist_mat, k = 2)) +# coor_3d <- as.data.frame(cmdscale(dist_mat, k = 3)) + +# coor_2d <- prcomp(df)$x[, 1:2] +# coor_3d <- prcomp(df)$x[, 1:3] + +coor_2d <- smacof::mds(dist_mat, ndim = 2, type = "ordinal")$conf +coor_3d <- smacof::mds(dist_mat, ndim = 2, type = "ordinal")$conf plot(coor_2d) rgl::plot3d(coor_3d) -method <- c(average = "average", single = "single", complete = "complete", - ward = "ward") +# method <- c(average = "average", single = "single", complete = "complete", +# ward = "ward") +# hcs <- pbapply::pblapply(method, function(x) cluster::agnes(dist_mat, method = x)) +# acs <- pbapply::pbsapply(hcs, function(x) x$ac) +# hc <- hcs$ward -method <- "ward" +hc <- cluster::agnes(dist_mat, method = "ward") -hcs <- pbapply::pblapply(method, function(x) cluster::agnes(dist_mat, method = x)) -acs <- pbapply::pbsapply(hcs, function(x) x$ac) +k <- 5 -hc <- hcs$ward - -# Something like a scree plot (??) -plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5) - -k <- 4 - -mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E") +mycols <- c("#434F4F", "#78004B", "#FF6900", "#3CB4DC", "#91C86E") cluster <- cutree(as.hclust(hc), k = k) @@ -74,30 +63,35 @@ table(dattree[cluster == 1, "Pattern"]) table(dattree[cluster == 2, "Pattern"]) table(dattree[cluster == 3, "Pattern"]) table(dattree[cluster == 4, "Pattern"]) +table(dattree[cluster == 5, "Pattern"]) table(dattree[cluster == 1, "InfocardOnly"]) table(dattree[cluster == 2, "InfocardOnly"]) table(dattree[cluster == 3, "InfocardOnly"]) table(dattree[cluster == 4, "InfocardOnly"]) +table(dattree[cluster == 5, "InfocardOnly"]) aggregate(. ~ cluster, df, mean) -aggregate(cbind(duration, distance, scaleSize, rotationDegree, length, +aggregate(cbind(duration, distance, scaleSize, rotationDegree, length, nitems, nmove, nflipCard, nopenTopic, nopenPopup) ~ cluster, datcase, mean) ### Look at selected cases ########################################### + +load("") + tmp <- dat tmp$start <- tmp$date.start tmp$complete <- tmp$date.stop -alog <- activitylog(tmp[tmp$case == 24016, ], - case_id = "case", - activity_id = "item", - resource_id = "path", - timestamps = c("start", "complete")) +alog <- bupaR::activitylog(tmp[tmp$case == 24016, ], + case_id = "case", + activity_id = "item", + resource_id = "path", + timestamps = c("start", "complete")) -process_map(alog) +processmapR::process_map(alog) rm(tmp) @@ -126,16 +120,28 @@ write.table(res, quote = FALSE, row.names = FALSE) -save(res, dist_mat, hcs, acs, datcase, dattree, coor_2d, coor_3d, +save(res, dist_mat, hcs, acs, coor_2d, coor_3d, file = "results/haum/tmp_user-navigation.RData") #--------------- (3) Fit tree --------------- -c1 <- rpart(as.factor(cluster) ~ ., data = dattree[, -1], method = "class") -plot(as.party(c1)) +c1 <- rpart::rpart(as.factor(cluster) ~ ., data = dattree[, c("PropMoves", + "PropItems", + "SearchInfo", + "AvDurItem", + "Pattern", + "InfocardOnly")], + method = "class") + +plot(partykit::as.party(c1)) # with conditional tree -c2 <- ctree(as.factor(cluster) ~ ., data = dattree[, -1], alpha = 0) +c2 <- partykit::ctree(as.factor(cluster) ~ ., data = dattree[, c("PropMoves", + "PropItems", + "SearchInfo", + "AvDurItem", + "Pattern", + "InfocardOnly")], + alpha = 0.001) plot(c2) -