From b3bc81ccbc21bddbfa751980592168850514bd6b Mon Sep 17 00:00:00 2001 From: nwickel Date: Fri, 1 Mar 2024 17:39:07 +0100 Subject: [PATCH] Played around with clustering --- code/09_user-navigation.R | 209 ++++++++++++++++++++++++++++++-------- 1 file changed, 164 insertions(+), 45 deletions(-) diff --git a/code/09_user-navigation.R b/code/09_user-navigation.R index 8752ce9..dc66bff 100644 --- a/code/09_user-navigation.R +++ b/code/09_user-navigation.R @@ -49,6 +49,8 @@ dat0$weekdays <- factor(weekdays(dat0$date.start), dat <- dat0[as.Date(dat0$date.start) < "2020-03-13", ] dat <- dat[dat$path != 106098, ] +rm(dat0) + #--------------- (1.2) Extract additional infos for clustering --------------- datcase <- aggregate(cbind(distance, scaleSize, rotationDegree) ~ @@ -112,6 +114,7 @@ time_minmax <- function(subdata) { } subdata } +# TODO: Export from package mtt dat_list <- pbapply::pblapply(dat_split, time_minmax) dat_minmax <- dplyr::bind_rows(dat_list) @@ -180,19 +183,24 @@ heatmap(cor_mat) dattree <- data.frame(case = datcase$case, Duration = datcase$duration, PropItems = datcase$nitems / length(unique(dat$item)), - SearchInfo = - 2*(((datcase$nopenPopup / datcase$nopenTopic) * - (datcase$nopenTopic / datcase$nflipCard)) / - ((datcase$nopenPopup / datcase$nopenTopic) + - (datcase$nopenTopic / datcase$nflipCard)) - ), + PropTopic = datcase$nopenTopic / datcase$nflipCard, + PropPopup = datcase$nopenPopup / datcase$nopenTopic, + # SearchInfo = + # 2*(((datcase$nopenPopup / datcase$nopenTopic) * + # (datcase$nopenTopic / datcase$nflipCard)) / + # ((datcase$nopenPopup / datcase$nopenTopic) + + # (datcase$nopenTopic / datcase$nflipCard)) + # ), PropMoves = datcase$nmove / datcase$length, PathLinearity = datcase$nitems / datcase$npaths, Singularity = datcase$npaths / datcase$length ) -dattree$SearchInfo <- ifelse(dattree$SearchInfo %in% 0, 0.1, dattree$SearchInfo) -dattree$SearchInfo <- ifelse(is.na(dattree$SearchInfo), 0, dattree$SearchInfo) +#dattree$SearchInfo <- ifelse(dattree$SearchInfo %in% 0, 0.1, dattree$SearchInfo) +#dattree$SearchInfo <- ifelse(is.na(dattree$SearchInfo), 0, dattree$SearchInfo) + +dattree$PropTopic <- ifelse(is.na(dattree$PropTopic), 0, dattree$PropTopic) +dattree$PropPopup <- ifelse(is.na(dattree$PropPopup), 0, dattree$PropPopup) get_centrality <- function(case, data) { @@ -223,6 +231,26 @@ dattree$centr_degree <- centrality[, 1] dattree$centr_degree_loops <- centrality[, 2] dattree$centr_between <- centrality[, 3] +## Add average duration per item + +dat_split <- split(dat[, c("item", "case", "path", "timeMs.start", "timeMs.stop")], ~ path) + +dat_list <- pbapply::pblapply(dat_split, time_minmax) +dat_minmax <- dplyr::bind_rows(dat_list) + +tmp <- aggregate(min_time ~ path, dat_minmax, unique) +tmp$max_time <- aggregate(max_time ~ path, dat_minmax, unique, na.action = NULL)$max_time +tmp$duration <- tmp$max_time - tmp$min_time +tmp$case <- aggregate(case ~ path, dat_minmax, unique)$case + +dattree$AvDurItem <- aggregate(duration ~ case, tmp, mean)$duration + +rm(tmp) + + + + + par(mfrow = c(3,3)) hist(dattree$Duration, breaks = 50, main = "") hist(dattree$SearchInfo, breaks = 50, main = "") @@ -256,15 +284,21 @@ write.table(dattree, #--------------- (2) Clustering --------------- -df <- dattree[, c("Duration", "PropItems", "SearchInfo", "PropMoves")] +df <- dattree[, c("AvDurItem", "PropItems", "PropTopic", "PropPopup", "PropMoves")] +#df <- dattree[, c("AvDurItem", "PropItems", "SearchInfo", "PropMoves")] # TODO: With or without duration? Why is it relevant? + df$Scholar <- ifelse(dattree$Pattern == "Scholar", 1, 0) df$Star <- ifelse(dattree$Pattern == "Star", 1, 0) df$Dispersion <- ifelse(dattree$Pattern == "Dispersion", 1, 0) # scale Duration and min/max SearchInfo -df$Duration <- as.numeric(scale(df$Duration)) -df$SearchInfo <- (df$SearchInfo - min(df$SearchInfo)) / - (max(df$SearchInfo) - min(df$SearchInfo)) +df$AvDurItem <- as.numeric(scale(df$AvDurItem)) +#df$SearchInfo <- (df$SearchInfo - min(df$SearchInfo)) / +# (max(df$SearchInfo) - min(df$SearchInfo)) +df$PropTopic <- (df$PropTopic - min(df$PropTopic, na.rm = TRUE)) / + (max(df$PropTopic, na.rm = TRUE) - min(df$PropTopic, na.rm = TRUE)) +df$PropPopup <- (df$PropPopup - min(df$PropPopup, na.rm = TRUE)) / + (max(df$PropPopup, na.rm = TRUE) - min(df$PropPopup, na.rm = TRUE)) mat <- dist(df) # TODO: Do I need to scale all variables? @@ -284,15 +318,10 @@ c5 <- cophenetic(h5) # Correlations cor(mat, c1) -# 0.8854558 cor(mat, c2) -# 0.883313 cor(mat, c3) -# 0.5368663 cor(mat, c4) -# 0.725247 cor(mat, c5) -# 0.3895215 # https://en.wikipedia.org/wiki/Cophenetic_correlation # https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering @@ -304,59 +333,99 @@ plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5) k <- 4 -grp <- cutree(hc, k = k) -df$grp <- grp +grp_hclust <- cutree(hc, k = k) -table(grp) +table(grp_hclust) -fviz_cluster(list(data = df, cluster = grp), +fviz_cluster(list(data = df, cluster = grp_hclust), palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", "black"), ellipse.type = "convex", show.clust.cent = FALSE, ggtheme = theme_bw()) +table(dattree[grp_hclust == 1, "Pattern"]) +table(dattree[grp_hclust == 2, "Pattern"]) +table(dattree[grp_hclust == 3, "Pattern"]) +table(dattree[grp_hclust == 4, "Pattern"]) + # Look at 3d plot to see if clusters are actually separate pc <- prcomp(df) coor <- as.data.frame(pc$x[, c(1, 2, 3)]) -coor$grp <- df$grp -rgl::plot3d(coor, col = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", "black")[coor$grp]) +rgl::plot3d(coor, col = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")[grp_hclust]) - - - -dattree$grp <- grp -table(dattree[dattree$grp == 1, "Pattern"]) -table(dattree[dattree$grp == 2, "Pattern"]) -table(dattree[dattree$grp == 3, "Pattern"]) -table(dattree[dattree$grp == 4, "Pattern"]) - - aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, PathLinearity, Singularity, centr_degree, centr_degree_loops, - centr_between) ~ grp, dattree, mean) + centr_between) ~ grp_hclust, dattree, mean) -aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, Dispersion, - Scholar, Star) ~ grp, df, mean) +aggregate(cbind(duration, distance, scaleSize, rotationDegree, length, + nmove, nflipCard, nopenTopic, nopenPopup) ~ grp_hclust, datcase, + mean) + +### DBSCAN clustering + +library(dbscan) +d1 <- dbscan(df, eps = .5, minPts = 9) +hullplot(df, d1) + +grp_db <- d1$cluster +table(grp_db) + +kNNdistplot(df, k = 6) +abline(h = 0.5, col = "red") + +fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]), + palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E"), + ellipse.type = "convex", + show.clust.cent = FALSE, + ggtheme = theme_bw()) + +rgl::plot3d(coor, col = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")[grp_db + 1]) + +aggregate(. ~ grp_db, df, mean) + +table(dattree[grp_db == 0, "Pattern"]) +table(dattree[grp_db == 1, "Pattern"]) +table(dattree[grp_db == 2, "Pattern"]) +table(dattree[grp_db == 3, "Pattern"]) + + +### K-Means clustering + +k1 <- kmeans(df, 4) + +grp_km <- k1$cluster +table(grp_km) + +fviz_cluster(list(data = df, cluster = grp_km), + palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E"), + ellipse.type = "convex", + show.clust.cent = FALSE, + ggtheme = theme_bw()) + +rgl::plot3d(coor, col = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")[grp_km]) ### Look at selected cases ########################################### -tmp <- dat +tmp <- res tmp$start <- tmp$date.start tmp$complete <- tmp$date.stop -alog <- activitylog(tmp[tmp$case == 3448, ], +alog <- activitylog(tmp[tmp$case == 30855, ], case_id = "case", activity_id = "item", resource_id = "path", timestamps = c("start", "complete")) process_map(alog) + ###################################################################### res <- merge(dat, dattree[, c("case", "grp")], by = "case", all.x = TRUE) res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ] +rm(dat) + xtabs( ~ item + grp, res) aggregate(event ~ grp, res, table) @@ -376,24 +445,74 @@ write.table(res, quote = FALSE, row.names = FALSE) +save(res, mat, h1, h2, h3, h4, h5, c1, c2, c3, c4, c5, datcase, dattree, df, + file = "results/haum/tmp_user-navigation.RData") + #--------------- (3) Fit tree --------------- library(rpart) library(partykit) -dattree$Duration_scaled <- scale(dattree$Duration) -dattree$grp <- factor(dattree$grp) -dattree$Pattern <- factor(dattree$Pattern) +## dbscan -c1 <- rpart(grp ~ Duration + PropItems + SearchInfo + PropMoves + +dattree_db <- dattree[grp_db != 0, ] + +dattree_db$grp <- factor(grp_db[grp_db != 0]) +dattree_db$Pattern <- factor(dattree_db$Pattern) + +c1 <- rpart(grp ~ AvDurItem + PropItems + SearchInfo + PropMoves + + Pattern, data = dattree_db, method = "class") + +c1 <- rpart(grp_db ~ AvDurItem + PropItems + PropTopic + PropPopup + PropMoves + + Pattern, data = dattree, method = "class") + + +plot(as.party(c1)) + + +c1a <- rpart(grp_db ~ AvDurItem + PropItems + SearchInfo + PropMoves + + Pattern, data = dattree, method = "class") + +plot(as.party(c1a)) + + +c2 <- rpart(grp ~ PropItems + SearchInfo + PropMoves + Pattern, + data = dattree_db, method = "class") + +plot(as.party(c2)) + +# with conditional tree function +c3 <- ctree(as.factor(grp_db) ~ AvDurItem + PropItems + PropTopic + PropPopup + + PropMoves + as.factor(Pattern), data = dattree, alpha = 1) +plot(c3) + +cluster <- as.factor(grp_db[grp_db != 0]) + +c4 <- ctree(cluster ~ nmove + nflipCard + nopenTopic + nopenPopup, + data = datcase[grp_db != 0, ], alpha = .001) +plot(c4) + + +c5 <- ctree(cluster ~ duration, + data = datcase[grp_db != 0, ], alpha = .001) +plot(c5) + +## hclust + +c1 <- rpart(as.factor(grp_hclust) ~ AvDurItem + PropItems + SearchInfo + PropMoves + Pattern, data = dattree, method = "class") plot(as.party(c1)) -c2 <- rpart(grp ~ PropItems + SearchInfo + PropMoves + Pattern, - data = dattree, method = "class") +c3 <- ctree(as.factor(grp_hclust) ~ AvDurItem + PropItems + SearchInfo + + PropMoves + as.factor(Pattern), data = dattree, alpha = 0) +plot(c3) + +c4 <- ctree(as.factor(grp_hclust) ~ nmove + nflipCard + nopenTopic + nopenPopup, + data = datcase, alpha = .001) +plot(c4) + -plot(as.party(c2)) #--------------- (4) Investigate variants ---------------