Cleaned up, tried out some stuff with clustering; Pattern variable probably has to go

2024-03-04 15:21:22 +01:00 · 2024-03-04 15:21:22 +01:00 · 72d2b6b799
commit 72d2b6b799
parent b3bc81ccbc
1 changed files with 78 additions and 99 deletions
--- a/code/09_user-navigation.R
+++ b/code/09_user-navigation.R
@ -247,14 +247,14 @@ dattree$AvDurItem <- aggregate(duration ~ case, tmp, mean)$duration

 rm(tmp)

+plot(dattree)

-
-
-
-par(mfrow = c(3,3))
+par(mfrow = c(3,4))
 hist(dattree$Duration, breaks = 50, main = "")
-hist(dattree$SearchInfo, breaks = 50, main = "")
+hist(dattree$AvDurItem, breaks = 50, main = "")
 hist(dattree$PropItems, breaks = 50, main = "")
+hist(dattree$PropTopic, breaks = 50, main = "")
+hist(dattree$PropPopup, breaks = 50, main = "")
 hist(dattree$PropMoves, breaks = 50, main = "")
 hist(dattree$PathLinearity, breaks = 50, main = "")
 hist(dattree$Singularity, breaks = 50, main = "")
@ -286,13 +286,14 @@ write.table(dattree,

 df <- dattree[, c("AvDurItem", "PropItems", "PropTopic", "PropPopup", "PropMoves")]
 #df <- dattree[, c("AvDurItem", "PropItems", "SearchInfo", "PropMoves")]
-# TODO: With or without duration? Why is it relevant?
-
 df$Scholar <- ifelse(dattree$Pattern == "Scholar", 1, 0)
 df$Star <- ifelse(dattree$Pattern == "Star", 1, 0)
 df$Dispersion <- ifelse(dattree$Pattern == "Dispersion", 1, 0)
+
 # scale Duration and min/max SearchInfo
-df$AvDurItem <- as.numeric(scale(df$AvDurItem))
+#df$AvDurItem <- as.numeric(scale(df$AvDurItem))
+df$AvDurItem <- (df$AvDurItem - min(df$AvDurItem, na.rm = TRUE)) /
+  (max(df$AvDurItem, na.rm = TRUE) - min(df$AvDurItem, na.rm = TRUE))
 #df$SearchInfo <- (df$SearchInfo - min(df$SearchInfo)) /
 #  (max(df$SearchInfo) - min(df$SearchInfo))
 df$PropTopic <- (df$PropTopic - min(df$PropTopic, na.rm = TRUE)) /
@ -300,6 +301,33 @@ df$PropTopic <- (df$PropTopic - min(df$PropTopic, na.rm = TRUE)) /
 df$PropPopup <- (df$PropPopup - min(df$PropPopup, na.rm = TRUE)) /
  (max(df$PropPopup, na.rm = TRUE) - min(df$PropPopup, na.rm = TRUE))

+
+# "Flatten" with PCA
+pc <- prcomp(df)
+coor_2d <- as.data.frame(pc$x[, c(1, 2)])
+coor_3d <- as.data.frame(pc$x[, c(1, 2, 3)])
+
+#--------------- (2.1) K-Means clustering ---------------
+
+mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
+
+k1 <- kmeans(df, 4)
+
+grp_km <- k1$cluster
+table(grp_km)
+
+fviz_cluster(list(data = df, cluster = grp_km),
+             palette = mycols,
+             ellipse.type = "convex",
+             show.clust.cent = FALSE,
+             ggtheme = theme_bw())
+
+plot(coor_2d, col = mycols[grp_km])
+
+rgl::plot3d(coor_3d, col = mycols[grp_km])
+
+#--------------- (2.2) Hierarchical clustering ---------------
+
 mat <- dist(df)
 # TODO: Do I need to scale all variables?

@ -326,7 +354,7 @@ cor(mat, c5)
 # https://en.wikipedia.org/wiki/Cophenetic_correlation
 # https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering

-hc <- h4
+hc <- h1

 # Something like a scree plot (??)
 plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
@ -338,50 +366,46 @@ grp_hclust <- cutree(hc, k = k)
 table(grp_hclust)

 fviz_cluster(list(data = df, cluster = grp_hclust),
-             palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", "black"),
+             palette = mycols,
             ellipse.type = "convex",
             show.clust.cent = FALSE,
             ggtheme = theme_bw())

+plot(coor_2d, col = mycols[grp_hclust])
+rgl::plot3d(coor_3d, col = mycols[grp_hclust])
+
 table(dattree[grp_hclust == 1, "Pattern"])
 table(dattree[grp_hclust == 2, "Pattern"])
 table(dattree[grp_hclust == 3, "Pattern"])
 table(dattree[grp_hclust == 4, "Pattern"])

-# Look at 3d plot to see if clusters are actually separate
-pc <- prcomp(df)
-coor <- as.data.frame(pc$x[, c(1, 2, 3)])
-rgl::plot3d(coor, col = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")[grp_hclust])

+aggregate(. ~ grp_hclust, df, mean)


-aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, PathLinearity,
-                Singularity, centr_degree, centr_degree_loops,
-                centr_between) ~ grp_hclust, dattree, mean)
-
 aggregate(cbind(duration, distance, scaleSize, rotationDegree, length,
                nmove, nflipCard, nopenTopic, nopenPopup) ~ grp_hclust, datcase,
          mean)

-### DBSCAN clustering
+#--------------- (2.3) DBSCAN clustering ---------------

 library(dbscan)
-d1 <- dbscan(df, eps = .5, minPts = 9)
+d1 <- dbscan(df, eps = .2, minPts = 9)
 hullplot(df, d1)

 grp_db <- d1$cluster
 table(grp_db)

-kNNdistplot(df, k = 6)
-abline(h = 0.5, col = "red")
+kNNdistplot(df, k = ncol(df))
+abline(h = 0.2, col = "red")

 fviz_cluster(list(data = df[grp_db != 0, ], cluster = grp_db[grp_db != 0]),
-             palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E"),
+             palette = mycols,
             ellipse.type = "convex",
             show.clust.cent = FALSE,
             ggtheme = theme_bw())

-rgl::plot3d(coor, col = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")[grp_db + 1])
+rgl::plot3d(coor_3d, col = mycols[grp_db + 1])

 aggregate(. ~ grp_db, df, mean)

@ -390,28 +414,12 @@ table(dattree[grp_db == 1, "Pattern"])
 table(dattree[grp_db == 2, "Pattern"])
 table(dattree[grp_db == 3, "Pattern"])

-
-### K-Means clustering
-
-k1 <- kmeans(df, 4)
-
-grp_km <- k1$cluster
-table(grp_km)
-
-fviz_cluster(list(data = df, cluster = grp_km),
-             palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E"),
-             ellipse.type = "convex",
-             show.clust.cent = FALSE,
-             ggtheme = theme_bw())
-
-rgl::plot3d(coor, col = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")[grp_km])
-
 ### Look at selected cases ###########################################
-tmp <- res
+tmp <- dat
 tmp$start <- tmp$date.start
 tmp$complete <- tmp$date.stop

-alog <- activitylog(tmp[tmp$case == 30855, ],
+alog <- activitylog(tmp[tmp$case == 30418, ],
                    case_id     = "case",
                    activity_id = "item",
                    resource_id = "path",
@ -419,25 +427,26 @@ alog <- activitylog(tmp[tmp$case == 30855, ],

 process_map(alog)

+rm(tmp)
+
 ######################################################################

-res <- merge(dat, dattree[, c("case", "grp")], by = "case", all.x = TRUE)
+res <- merge(dat, data.frame(case = dattree$case, grp_km, grp_hclust, grp_db),
+             by = "case", all.x = TRUE)
 res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ]

-rm(dat)
-
-xtabs( ~ item + grp, res)
-aggregate(event ~ grp, res, table)
+xtabs( ~ item + grp_db, res)
+aggregate(event ~ grp_db, res, table)

 # Look at clusters
 par(mfrow = c(2, 2))
-vioplot::vioplot(duration ~ grp, res)
-vioplot::vioplot(distance ~ grp, res)
-vioplot::vioplot(scaleSize ~ grp, res)
-vioplot::vioplot(rotationDegree ~ grp, res)
+vioplot::vioplot(duration ~ grp_db, res)
+vioplot::vioplot(distance ~ grp_db, res)
+vioplot::vioplot(scaleSize ~ grp_db, res)
+vioplot::vioplot(rotationDegree ~ grp_db, res)

-aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp, res, mean)
-aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp, res, median)
+aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp_db, res, mean)
+aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp_db, res, median)

 write.table(res,
            file = "results/haum/event_logfiles_pre-corona_with-clusters_cases.csv",
@ -453,66 +462,36 @@ save(res, mat, h1, h2, h3, h4, h5, c1, c2, c3, c4, c5, datcase, dattree, df,
 library(rpart)
 library(partykit)

-## dbscan
-
 dattree_db <- dattree[grp_db != 0, ]
-
 dattree_db$grp <- factor(grp_db[grp_db != 0])
 dattree_db$Pattern <- factor(dattree_db$Pattern)

-c1 <- rpart(grp ~ AvDurItem + PropItems + SearchInfo + PropMoves +
+c1 <- rpart(grp ~ AvDurItem + PropItems + PropTopic + PropPopup + PropMoves +
              Pattern, data = dattree_db, method = "class")
-
-c1 <- rpart(grp_db ~ AvDurItem + PropItems + PropTopic + PropPopup + PropMoves +
-              Pattern, data = dattree, method = "class")
-
-
 plot(as.party(c1))

+# with conditional tree
+c2 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
+            PropMoves + Pattern, data = dattree_db, alpha = 0.5)
+plot(c2)

-c1a <- rpart(grp_db ~ AvDurItem + PropItems + SearchInfo + PropMoves +
-              Pattern, data = dattree, method = "class")
-
-plot(as.party(c1a))
-
-
-c2 <- rpart(grp ~ PropItems + SearchInfo + PropMoves + Pattern,
-            data = dattree_db, method = "class")
-
-plot(as.party(c2))
-
-# with conditional tree function
-c3 <- ctree(as.factor(grp_db) ~ AvDurItem + PropItems + PropTopic + PropPopup +
-            PropMoves + as.factor(Pattern), data = dattree, alpha = 1)
+c3 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
+            PropMoves + Pattern, data = dattree_db, alpha = 0)
 plot(c3)

-cluster <- as.factor(grp_db[grp_db != 0])
-
-c4 <- ctree(cluster ~ nmove + nflipCard + nopenTopic + nopenPopup,
-            data = datcase[grp_db != 0, ], alpha = .001)
+c4 <- ctree(grp ~ AvDurItem + PropItems + PropTopic + PropPopup +
+            PropMoves + Pattern, data = dattree_db, alpha = 1)
 plot(c4)

-
-c5 <- ctree(cluster ~ duration,
-            data = datcase[grp_db != 0, ], alpha = .001)
+# with excluded points
+c5 <- ctree(factor(grp_db) ~ AvDurItem + PropItems + PropTopic + PropPopup +
+            PropMoves + factor(Pattern), data = dattree, alpha = 1)
 plot(c5)

-## hclust
-
-c1 <- rpart(as.factor(grp_hclust) ~ AvDurItem + PropItems + SearchInfo + PropMoves +
-              Pattern, data = dattree, method = "class")
-
-plot(as.party(c1))
-
-c3 <- ctree(as.factor(grp_hclust) ~ AvDurItem + PropItems + SearchInfo +
-            PropMoves + as.factor(Pattern), data = dattree, alpha = 0)
-plot(c3)
-
-c4 <- ctree(as.factor(grp_hclust) ~ nmove + nflipCard + nopenTopic + nopenPopup,
-            data = datcase, alpha = .001)
-plot(c4)
-
-
+# with excluded points
+c6 <- ctree(factor(grp_db) ~ ., data = df, alpha = 1)
+plot(c6)
+# --> just checking

 #--------------- (4) Investigate variants ---------------