More work on case clustering

This commit is contained in:
Nora Wickelmaier 2024-03-08 18:06:37 +01:00
parent 26ba7265f5
commit 29ac6c15d4

View File

@ -10,57 +10,46 @@
# #
# last mod: 2024-03-08 # last mod: 2024-03-08
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
library(bupaverse)
library(factoextra)
library(rpart)
library(partykit)
#--------------- (1) Load data --------------- #--------------- (1) Load data ---------------
load("results/haum/dataframes_case_2019.RData") load("results/haum/dataframes_case_2019.RData")
#--------------- (2) Clustering --------------- #--------------- (2) Clustering ---------------
df <- dattree[, -1] df <- dattree[, c("PropItems", "SearchInfo", "PropMoves", "AvDurItemNorm",
"Pattern", "InfocardOnly")]
summary(df) summary(df)
# Look at collinearity #dist_mat <- cluster::daisy(df, metric = "euclidean")
cor_mat <- cor(df)
diag(cor_mat) <- NA
heatmap(cor_mat)
#--------------- (2.2) Hierarchical clustering ---------------
dist_mat <- cluster::daisy(df, metric = "gower") dist_mat <- cluster::daisy(df, metric = "gower")
# "Flatten" with MDS # "Flatten" with MDS
coor_2d <- as.data.frame(cmdscale(dist_mat, k = 2)) # coor_2d <- as.data.frame(cmdscale(dist_mat, k = 2))
coor_3d <- as.data.frame(cmdscale(dist_mat, k = 3)) # coor_3d <- as.data.frame(cmdscale(dist_mat, k = 3))
# TODO: Better use MASS::isoMDS() since I am not using Euclidean distances?
# coor_2d <- prcomp(df)$x[, 1:2]
# coor_3d <- prcomp(df)$x[, 1:3]
coor_2d <- smacof::mds(dist_mat, ndim = 2, type = "ordinal")$conf
coor_3d <- smacof::mds(dist_mat, ndim = 2, type = "ordinal")$conf
plot(coor_2d) plot(coor_2d)
rgl::plot3d(coor_3d) rgl::plot3d(coor_3d)
method <- c(average = "average", single = "single", complete = "complete", # method <- c(average = "average", single = "single", complete = "complete",
ward = "ward") # ward = "ward")
# hcs <- pbapply::pblapply(method, function(x) cluster::agnes(dist_mat, method = x))
# acs <- pbapply::pbsapply(hcs, function(x) x$ac)
# hc <- hcs$ward
method <- "ward" hc <- cluster::agnes(dist_mat, method = "ward")
hcs <- pbapply::pblapply(method, function(x) cluster::agnes(dist_mat, method = x)) k <- 5
acs <- pbapply::pbsapply(hcs, function(x) x$ac)
hc <- hcs$ward mycols <- c("#434F4F", "#78004B", "#FF6900", "#3CB4DC", "#91C86E")
# Something like a scree plot (??)
plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
k <- 4
mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
cluster <- cutree(as.hclust(hc), k = k) cluster <- cutree(as.hclust(hc), k = k)
@ -74,30 +63,35 @@ table(dattree[cluster == 1, "Pattern"])
table(dattree[cluster == 2, "Pattern"]) table(dattree[cluster == 2, "Pattern"])
table(dattree[cluster == 3, "Pattern"]) table(dattree[cluster == 3, "Pattern"])
table(dattree[cluster == 4, "Pattern"]) table(dattree[cluster == 4, "Pattern"])
table(dattree[cluster == 5, "Pattern"])
table(dattree[cluster == 1, "InfocardOnly"]) table(dattree[cluster == 1, "InfocardOnly"])
table(dattree[cluster == 2, "InfocardOnly"]) table(dattree[cluster == 2, "InfocardOnly"])
table(dattree[cluster == 3, "InfocardOnly"]) table(dattree[cluster == 3, "InfocardOnly"])
table(dattree[cluster == 4, "InfocardOnly"]) table(dattree[cluster == 4, "InfocardOnly"])
table(dattree[cluster == 5, "InfocardOnly"])
aggregate(. ~ cluster, df, mean) aggregate(. ~ cluster, df, mean)
aggregate(cbind(duration, distance, scaleSize, rotationDegree, length, aggregate(cbind(duration, distance, scaleSize, rotationDegree, length, nitems,
nmove, nflipCard, nopenTopic, nopenPopup) ~ cluster, datcase, nmove, nflipCard, nopenTopic, nopenPopup) ~ cluster, datcase,
mean) mean)
### Look at selected cases ########################################### ### Look at selected cases ###########################################
load("")
tmp <- dat tmp <- dat
tmp$start <- tmp$date.start tmp$start <- tmp$date.start
tmp$complete <- tmp$date.stop tmp$complete <- tmp$date.stop
alog <- activitylog(tmp[tmp$case == 24016, ], alog <- bupaR::activitylog(tmp[tmp$case == 24016, ],
case_id = "case", case_id = "case",
activity_id = "item", activity_id = "item",
resource_id = "path", resource_id = "path",
timestamps = c("start", "complete")) timestamps = c("start", "complete"))
process_map(alog) processmapR::process_map(alog)
rm(tmp) rm(tmp)
@ -126,16 +120,28 @@ write.table(res,
quote = FALSE, quote = FALSE,
row.names = FALSE) row.names = FALSE)
save(res, dist_mat, hcs, acs, datcase, dattree, coor_2d, coor_3d, save(res, dist_mat, hcs, acs, coor_2d, coor_3d,
file = "results/haum/tmp_user-navigation.RData") file = "results/haum/tmp_user-navigation.RData")
#--------------- (3) Fit tree --------------- #--------------- (3) Fit tree ---------------
c1 <- rpart(as.factor(cluster) ~ ., data = dattree[, -1], method = "class") c1 <- rpart::rpart(as.factor(cluster) ~ ., data = dattree[, c("PropMoves",
plot(as.party(c1)) "PropItems",
"SearchInfo",
"AvDurItem",
"Pattern",
"InfocardOnly")],
method = "class")
plot(partykit::as.party(c1))
# with conditional tree # with conditional tree
c2 <- ctree(as.factor(cluster) ~ ., data = dattree[, -1], alpha = 0) c2 <- partykit::ctree(as.factor(cluster) ~ ., data = dattree[, c("PropMoves",
"PropItems",
"SearchInfo",
"AvDurItem",
"Pattern",
"InfocardOnly")],
alpha = 0.001)
plot(c2) plot(c2)