Played around with clustering

2024-03-01 17:39:07 +01:00 · 2024-03-01 17:39:07 +01:00 · b3bc81ccbc
commit b3bc81ccbc
parent e7eb2cb784
1 changed files with 164 additions and 45 deletions
--- a/code/09_user-navigation.R
+++ b/code/09_user-navigation.R
@ -49,6 +49,8 @@ dat0$weekdays <- factor(weekdays(dat0$date.start),
 dat <- dat0[as.Date(dat0$date.start) < "2020-03-13", ]
 dat <- dat[dat$path != 106098, ]

+rm(dat0)
+
 #--------------- (1.2) Extract additional infos for clustering ---------------

 datcase <- aggregate(cbind(distance, scaleSize, rotationDegree) ~
@ -112,6 +114,7 @@ time_minmax <- function(subdata) {
    }
  subdata
 }
+# TODO: Export from package mtt

 dat_list <- pbapply::pblapply(dat_split, time_minmax)
 dat_minmax <- dplyr::bind_rows(dat_list)
@ -180,19 +183,24 @@ heatmap(cor_mat)
 dattree <- data.frame(case = datcase$case,
                      Duration = datcase$duration,
                      PropItems = datcase$nitems / length(unique(dat$item)),
-                      SearchInfo = 
-                        2*(((datcase$nopenPopup / datcase$nopenTopic) *
-                            (datcase$nopenTopic / datcase$nflipCard)) /
-                      ((datcase$nopenPopup / datcase$nopenTopic) +
-                       (datcase$nopenTopic / datcase$nflipCard))
-                      ),
+                      PropTopic = datcase$nopenTopic / datcase$nflipCard,
+                      PropPopup = datcase$nopenPopup / datcase$nopenTopic,
+                      # SearchInfo = 
+                      #   2*(((datcase$nopenPopup / datcase$nopenTopic) *
+                      #       (datcase$nopenTopic / datcase$nflipCard)) /
+                      # ((datcase$nopenPopup / datcase$nopenTopic) +
+                      #  (datcase$nopenTopic / datcase$nflipCard))
+                      # ),
                      PropMoves = datcase$nmove / datcase$length,
                      PathLinearity = datcase$nitems / datcase$npaths,
                      Singularity = datcase$npaths / datcase$length
 )

-dattree$SearchInfo <- ifelse(dattree$SearchInfo %in% 0, 0.1, dattree$SearchInfo)
-dattree$SearchInfo <- ifelse(is.na(dattree$SearchInfo), 0, dattree$SearchInfo)
+#dattree$SearchInfo <- ifelse(dattree$SearchInfo %in% 0, 0.1, dattree$SearchInfo)
+#dattree$SearchInfo <- ifelse(is.na(dattree$SearchInfo), 0, dattree$SearchInfo)
+
+dattree$PropTopic <- ifelse(is.na(dattree$PropTopic), 0, dattree$PropTopic)
+dattree$PropPopup <- ifelse(is.na(dattree$PropPopup), 0, dattree$PropPopup)

 get_centrality <- function(case, data) {

@ -223,6 +231,26 @@ dattree$centr_degree <- centrality[, 1]
 dattree$centr_degree_loops <- centrality[, 2]
 dattree$centr_between <- centrality[, 3]

+## Add average duration per item
+
+dat_split <- split(dat[, c("item", "case", "path", "timeMs.start", "timeMs.stop")], ~ path)
+
+dat_list <- pbapply::pblapply(dat_split, time_minmax)
+dat_minmax <- dplyr::bind_rows(dat_list)
+
+tmp <- aggregate(min_time ~ path, dat_minmax, unique)
+tmp$max_time <- aggregate(max_time ~ path, dat_minmax, unique, na.action = NULL)$max_time
+tmp$duration <- tmp$max_time - tmp$min_time
+tmp$case <- aggregate(case ~ path, dat_minmax, unique)$case
+
+dattree$AvDurItem <- aggregate(duration ~ case, tmp, mean)$duration
+
+rm(tmp)
+
+
+
+
+
 par(mfrow = c(3,3))
 hist(dattree$Duration, breaks = 50, main = "")
 hist(dattree$SearchInfo, breaks = 50, main = "")
@ -256,15 +284,21 @@ write.table(dattree,

 #--------------- (2) Clustering ---------------

-df <- dattree[, c("Duration", "PropItems", "SearchInfo", "PropMoves")]
+df <- dattree[, c("AvDurItem", "PropItems", "PropTopic", "PropPopup", "PropMoves")]
+#df <- dattree[, c("AvDurItem", "PropItems", "SearchInfo", "PropMoves")]
 # TODO: With or without duration? Why is it relevant?
+
 df$Scholar <- ifelse(dattree$Pattern == "Scholar", 1, 0)
 df$Star <- ifelse(dattree$Pattern == "Star", 1, 0)
 df$Dispersion <- ifelse(dattree$Pattern == "Dispersion", 1, 0)
 # scale Duration and min/max SearchInfo
-df$Duration <- as.numeric(scale(df$Duration))
-df$SearchInfo <- (df$SearchInfo - min(df$SearchInfo)) /
-  (max(df$SearchInfo) - min(df$SearchInfo))
+df$AvDurItem <- as.numeric(scale(df$AvDurItem))
+#df$SearchInfo <- (df$SearchInfo - min(df$SearchInfo)) /
+#  (max(df$SearchInfo) - min(df$SearchInfo))
+df$PropTopic <- (df$PropTopic - min(df$PropTopic, na.rm = TRUE)) /
+  (max(df$PropTopic, na.rm = TRUE) - min(df$PropTopic, na.rm = TRUE))
+df$PropPopup <- (df$PropPopup - min(df$PropPopup, na.rm = TRUE)) /
+  (max(df$PropPopup, na.rm = TRUE) - min(df$PropPopup, na.rm = TRUE))

 mat <- dist(df)
 # TODO: Do I need to scale all variables?
@ -284,15 +318,10 @@ c5 <- cophenetic(h5)

 # Correlations
 cor(mat, c1)
-# 0.8854558
 cor(mat, c2)
-# 0.883313
 cor(mat, c3)
-# 0.5368663
 cor(mat, c4)
-# 0.725247
 cor(mat, c5)
-# 0.3895215

 # https://en.wikipedia.org/wiki/Cophenetic_correlation
 # https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
@ -304,59 +333,99 @@ plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)

 k <- 4

-grp <- cutree(hc, k = k)
-df$grp <- grp
+grp_hclust <- cutree(hc, k = k)

-table(grp)
+table(grp_hclust)

-fviz_cluster(list(data = df, cluster = grp),
+fviz_cluster(list(data = df, cluster = grp_hclust),
             palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", "black"),
             ellipse.type = "convex",
             show.clust.cent = FALSE,
             ggtheme = theme_bw())

+table(dattree[grp_hclust == 1, "Pattern"])
+table(dattree[grp_hclust == 2, "Pattern"])
+table(dattree[grp_hclust == 3, "Pattern"])
+table(dattree[grp_hclust == 4, "Pattern"])
+
 # Look at 3d plot to see if clusters are actually separate
 pc <- prcomp(df)
 coor <- as.data.frame(pc$x[, c(1, 2, 3)])
-coor$grp <- df$grp
-rgl::plot3d(coor, col = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", "black")[coor$grp])
+rgl::plot3d(coor, col = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")[grp_hclust])



-
-
-
-dattree$grp <- grp
-table(dattree[dattree$grp == 1, "Pattern"])
-table(dattree[dattree$grp == 2, "Pattern"])
-table(dattree[dattree$grp == 3, "Pattern"])
-table(dattree[dattree$grp == 4, "Pattern"])
-
-
 aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, PathLinearity,
                Singularity, centr_degree, centr_degree_loops,
-                centr_between) ~ grp, dattree, mean)
+                centr_between) ~ grp_hclust, dattree, mean)

-aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, Dispersion,
-                Scholar, Star) ~ grp, df, mean)
+aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
+                nmove, nflipCard, nopenTopic, nopenPopup) ~ grp_hclust, datcase,
+          mean)
+
+### DBSCAN clustering
+
+library(dbscan)
+d1 <- dbscan(df, eps = .5, minPts = 9)
+hullplot(df, d1)
+
+grp_db <- d1$cluster
+table(grp_db)
+
+kNNdistplot(df, k = 6)
+abline(h = 0.5, col = "red")
+
+fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]),
+             palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E"),
+             ellipse.type = "convex",
+             show.clust.cent = FALSE,
+             ggtheme = theme_bw())
+
+rgl::plot3d(coor, col = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")[grp_db + 1])
+
+aggregate(. ~ grp_db, df, mean)
+
+table(dattree[grp_db == 0, "Pattern"])
+table(dattree[grp_db == 1, "Pattern"])
+table(dattree[grp_db == 2, "Pattern"])
+table(dattree[grp_db == 3, "Pattern"])
+
+
+### K-Means clustering
+
+k1 <- kmeans(df, 4)
+
+grp_km <- k1$cluster
+table(grp_km)
+
+fviz_cluster(list(data = df, cluster = grp_km),
+             palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E"),
+             ellipse.type = "convex",
+             show.clust.cent = FALSE,
+             ggtheme = theme_bw())
+
+rgl::plot3d(coor, col = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")[grp_km])

 ### Look at selected cases ###########################################
-tmp <- dat
+tmp <- res
 tmp$start <- tmp$date.start
 tmp$complete <- tmp$date.stop

-alog <- activitylog(tmp[tmp$case == 3448, ],
+alog <- activitylog(tmp[tmp$case == 30855, ],
                    case_id     = "case",
                    activity_id = "item",
                    resource_id = "path",
                    timestamps  = c("start", "complete"))

 process_map(alog)
+
 ######################################################################

 res <- merge(dat, dattree[, c("case", "grp")], by = "case", all.x = TRUE)
 res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ]

+rm(dat)
+
 xtabs( ~ item + grp, res)
 aggregate(event ~ grp, res, table)

@ -376,24 +445,74 @@ write.table(res,
            quote = FALSE,
            row.names = FALSE)

+save(res, mat, h1, h2, h3, h4, h5, c1, c2, c3, c4, c5, datcase, dattree, df,
+     file = "results/haum/tmp_user-navigation.RData")
+
 #--------------- (3) Fit tree ---------------

 library(rpart)
 library(partykit)

-dattree$Duration_scaled <- scale(dattree$Duration)
-dattree$grp <- factor(dattree$grp)
-dattree$Pattern <- factor(dattree$Pattern)
+## dbscan

-c1 <- rpart(grp ~ Duration + PropItems + SearchInfo + PropMoves +
+dattree_db <- dattree[grp_db != 0, ]
+
+dattree_db$grp <- factor(grp_db[grp_db != 0])
+dattree_db$Pattern <- factor(dattree_db$Pattern)
+
+c1 <- rpart(grp ~ AvDurItem + PropItems + SearchInfo + PropMoves +
+              Pattern, data = dattree_db, method = "class")
+
+c1 <- rpart(grp_db ~ AvDurItem + PropItems + PropTopic + PropPopup + PropMoves +
+              Pattern, data = dattree, method = "class")
+
+
+plot(as.party(c1))
+
+
+c1a <- rpart(grp_db ~ AvDurItem + PropItems + SearchInfo + PropMoves +
+              Pattern, data = dattree, method = "class")
+
+plot(as.party(c1a))
+
+
+c2 <- rpart(grp ~ PropItems + SearchInfo + PropMoves + Pattern,
+            data = dattree_db, method = "class")
+
+plot(as.party(c2))
+
+# with conditional tree function
+c3 <- ctree(as.factor(grp_db) ~ AvDurItem + PropItems + PropTopic + PropPopup +
+            PropMoves + as.factor(Pattern), data = dattree, alpha = 1)
+plot(c3)
+
+cluster <- as.factor(grp_db[grp_db != 0])
+
+c4 <- ctree(cluster ~ nmove + nflipCard + nopenTopic + nopenPopup,
+            data = datcase[grp_db != 0, ], alpha = .001)
+plot(c4)
+
+
+c5 <- ctree(cluster ~ duration,
+            data = datcase[grp_db != 0, ], alpha = .001)
+plot(c5)
+
+## hclust
+
+c1 <- rpart(as.factor(grp_hclust) ~ AvDurItem + PropItems + SearchInfo + PropMoves +
              Pattern, data = dattree, method = "class")

 plot(as.party(c1))

-c2 <- rpart(grp ~ PropItems + SearchInfo + PropMoves + Pattern,
-            data = dattree, method = "class")
+c3 <- ctree(as.factor(grp_hclust) ~ AvDurItem + PropItems + SearchInfo +
+            PropMoves + as.factor(Pattern), data = dattree, alpha = 0)
+plot(c3)
+
+c4 <- ctree(as.factor(grp_hclust) ~ nmove + nflipCard + nopenTopic + nopenPopup,
+            data = datcase, alpha = .001)
+plot(c4)
+

-plot(as.party(c2))

 #--------------- (4) Investigate variants ---------------