From e7eb2cb7848c85da91a4618c2862ae04c703ad6d Mon Sep 17 00:00:00 2001
From: nwickel <n.wickelmaier@iwm-tuebingen.de>
Date: Tue, 27 Feb 2024 17:28:01 +0100
Subject: [PATCH] Added partitioning to user navigation script

---
 code/09_user-navigation.R | 235 +++++++++++++++-----------------------
 1 file changed, 91 insertions(+), 144 deletions(-)

diff --git a/code/09_user-navigation.R b/code/09_user-navigation.R
index bf33425..8752ce9 100644
--- a/code/09_user-navigation.R
+++ b/code/09_user-navigation.R
@@ -4,13 +4,14 @@
 #           (1.1) Read log event data
 #           (1.2) Extract additional infos for clustering
 #          (2) Clustering
+#          (3) Fit tree
 #          (3) Investigate variants
 #
 # input:  results/haum/event_logfiles_2024-02-21_16-07-33.csv
 # output: results/haum/event_logfiles_pre-corona_with-clusters_cases.csv
 #         results/haum/dattree.csv
 #
-# last mod: 2024-02-23
+# last mod: 2024-02-27
 
 
 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
@@ -176,10 +177,6 @@ heatmap(cor_mat)
 #   datcase$duration / mean(datcase$duration) or median(datcase$duration)
 #   * Duration per artwork is low: "ave_duration_item" / datcase$duration
 
-# TODO: Come up with relevant features for navigation behavior
-
-
-
 dattree <- data.frame(case = datcase$case,
                       Duration = datcase$duration,
                       PropItems = datcase$nitems / length(unique(dat$item)),
@@ -257,47 +254,15 @@ write.table(dattree,
             quote = FALSE,
             row.names = FALSE)
 
-
-tmp <- dat
-tmp$start <- tmp$date.start
-tmp$complete <- tmp$date.stop
-
-
-alog <- activitylog(tmp[tmp$case == 3448, ],
-                    case_id     = "case",
-                    activity_id = "item",
-                    resource_id = "path",
-                    timestamps  = c("start", "complete"))
-
-process_map(alog)
-
-net <- process_map(alog, render = FALSE)
-#DiagrammeR::get_node_df(net)
-
-DiagrammeR::get_node_info(net)
-
-DiagrammeR::get_degree_distribution(net)
-
-DiagrammeR::get_degree_in(net)
-DiagrammeR::get_degree_out(net)
-DiagrammeR::get_degree_total(net)
-
-
-N <- DiagrammeR::count_nodes(net) - 2   # Do not count start and stop nodes
-
-dc <- DiagrammeR::get_degree_total(net)[1:N, "total_degree"] / (N - 1)
-
-inet <- DiagrammeR::to_igraph(net)
-igraph::centr_degree(inet, loops = FALSE)
-igraph::centr_betw(inet)
-igraph::centr_clo(inet)
+#--------------- (2) Clustering ---------------
 
 df <- dattree[, c("Duration", "PropItems", "SearchInfo", "PropMoves")]
+# TODO: With or without duration? Why is it relevant?
 df$Scholar <- ifelse(dattree$Pattern == "Scholar", 1, 0)
 df$Star <- ifelse(dattree$Pattern == "Star", 1, 0)
 df$Dispersion <- ifelse(dattree$Pattern == "Dispersion", 1, 0)
 # scale Duration and min/max SearchInfo
-df$Duration <- scale(df$Duration)
+df$Duration <- as.numeric(scale(df$Duration))
 df$SearchInfo <- (df$SearchInfo - min(df$SearchInfo)) /
   (max(df$SearchInfo) - min(df$SearchInfo))
 
@@ -319,20 +284,20 @@ c5 <- cophenetic(h5)
 
 # Correlations
 cor(mat, c1)
-# 0.9029232
+# 0.8854558
 cor(mat, c2)
-# 0.8879478
+# 0.883313
 cor(mat, c3)
-# 0.5747296
+# 0.5368663
 cor(mat, c4)
-# 0.5994121
+# 0.725247
 cor(mat, c5)
-# 0.5292353
+# 0.3895215
 
 # https://en.wikipedia.org/wiki/Cophenetic_correlation
 # https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
 
-hc <- h1
+hc <- h4
 
 # Something like a scree plot (??)
 plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
@@ -345,118 +310,51 @@ df$grp <- grp
 table(grp)
 
 fviz_cluster(list(data = df, cluster = grp),
-             #palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E",
-             #            "#000000", "#434F4F"),
+             palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", "black"),
              ellipse.type = "convex",
-             show.clust.cent = FALSE, ggtheme = theme_bw())
+             show.clust.cent = FALSE,
+             ggtheme = theme_bw())
 
+# Look at 3d plot to see if clusters are actually separate
+pc <- prcomp(df)
+coor <- as.data.frame(pc$x[, c(1, 2, 3)])
+coor$grp <- df$grp
+rgl::plot3d(coor, col = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", "black")[coor$grp])
+
+
+
+
+
+
+dattree$grp <- grp
+table(dattree[dattree$grp == 1, "Pattern"])
+table(dattree[dattree$grp == 2, "Pattern"])
 table(dattree[dattree$grp == 3, "Pattern"])
+table(dattree[dattree$grp == 4, "Pattern"])
 
 
 aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, PathLinearity,
                 Singularity, centr_degree, centr_degree_loops,
                 centr_between) ~ grp, dattree, mean)
 
-
 aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, Dispersion,
                 Scholar, Star) ~ grp, df, mean)
 
+### Look at selected cases ###########################################
+tmp <- dat
+tmp$start <- tmp$date.start
+tmp$complete <- tmp$date.stop
 
-# "We first extract the graph sub-sequences corresponding to the four
-# patterns of Canter et al. (1985). We also identified the number of nodes
-# to which the learner often goes back to (Fig. 4). These nodes are called
-# “central nodes”. If the number of central nodes is lower than or equal to
-# half of the sub-sequences, the browsing pattern indicator takes on the
-# value “Star”." Bousbia et al. (2010)
+alog <- activitylog(tmp[tmp$case == 3448, ],
+                    case_id     = "case",
+                    activity_id = "item",
+                    resource_id = "path",
+                    timestamps  = c("start", "complete"))
 
-# I do not know how they got the sub-sequences. I am taking the ratio of
-# strongly connected nodes to weakly connected nodes. If the number of
-# weakly connected nodes is twice as high, the pattern is classified as a
-# star, i.e., NodeConnect <= 0.5.
-# TODO: This does not make sense, smallest and most frequent number is 3!
-# (and I do not understand it...)
+process_map(alog)
+######################################################################
 
-
-# count_asymmetric_node_pairs  Get the number of asymmetrically-connected node pairs
-# count_edges                  Get a count of all edges
-# count_loop_edges             Get count of all loop edges
-# count_mutual_node_pairs      Get the number of mutually-connected node pairs
-# count_unconnected_node_pairs Get the number of unconnected node pairs
-# count_unconnected_nodes      Get count of all unconnected nodes
-
-
-# TODO: Read up on centrality measures
-# https://www.r-bloggers.com/2018/12/network-centrality-in-r-an-introduction/
-# https://www.datacamp.com/tutorial/centrality-network-analysis-R
-# http://davidrajuh.net/reggie/publications/publications-filer/rd114-2018-Network-Centrality.pdf
-# https://link.springer.com/article/10.1007/s10618-024-01003-4
-
-#--------------- (2) Clustering ---------------
-
-df <- na.omit(datcase[, c("duration", "distance", "scaleSize",
-                          "rotationDegree", "length", "nmove",
-                          "nitems", "npaths")])
-
-#df <- cbind(df, datcase[, c("vacation", "holiday", "weekend", "morning")])
-mat <- dist(scale(df))
-#mat <- dist(df)
-
-h1 <- hclust(mat, method = "average")
-h2 <- hclust(mat, method = "complete")
-h3 <- hclust(mat, method = "ward.D")
-h4 <- hclust(mat, method = "ward.D2")
-h5 <- hclust(mat, method = "single")
-
-# Cophenetic Distances, for each linkage (runs quite some time!)
-c1 <- cophenetic(h1)
-c2 <- cophenetic(h2)
-c3 <- cophenetic(h3)
-c4 <- cophenetic(h4)
-c5 <- cophenetic(h5)
-
-# Correlations
-cor(mat, c1)
-cor(mat, c2)
-cor(mat, c3)
-cor(mat, c4)
-cor(mat, c5)
-# https://en.wikipedia.org/wiki/Cophenetic_correlation
-# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
-
-hc <- h1
-
-# Something like a scree plot (??)
-plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
-
-
-# TODO: Something is wrong
-
-k <- 4
-
-grp <- cutree(hc, k = k)
-df$grp <- grp
-
-table(grp)
-
-fviz_cluster(list(data = df, cluster = grp),
-             #palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E",
-             #            "#000000", "#434F4F"),
-             ellipse.type = "convex",
-             show.clust.cent = FALSE, ggtheme = theme_bw())
-
-aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove,
-                nitems, npaths) ~ grp, df, mean)
-aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove,
-                nitems, npaths) ~ grp, df, median)
-aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove,
-                nitems, npaths) ~ grp, df, max)
-
-
-df$case <- na.omit(datcase[, c("case", "duration", "distance", "scaleSize",
-                          "rotationDegree", "length", "nmove",
-                          "nitems", "npaths")])$case
-
-res <- merge(dat, df[, c("case", "grp")], by = "case", all.x = TRUE)
+res <- merge(dat, dattree[, c("case", "grp")], by = "case", all.x = TRUE)
 res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ]
 
 xtabs( ~ item + grp, res)
@@ -469,13 +367,35 @@ vioplot::vioplot(distance ~ grp, res)
 vioplot::vioplot(scaleSize ~ grp, res)
 vioplot::vioplot(rotationDegree ~ grp, res)
 
+aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp, res, mean)
+aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp, res, median)
+
 write.table(res,
             file = "results/haum/event_logfiles_pre-corona_with-clusters_cases.csv",
             sep = ";",
             quote = FALSE,
             row.names = FALSE)
 
-#--------------- (3) Investigate variants ---------------
+#--------------- (3) Fit tree ---------------
+
+library(rpart)
+library(partykit)
+
+dattree$Duration_scaled <- scale(dattree$Duration)
+dattree$grp <- factor(dattree$grp)
+dattree$Pattern <- factor(dattree$Pattern)
+
+c1 <- rpart(grp ~ Duration + PropItems + SearchInfo + PropMoves +
+              Pattern, data = dattree, method = "class")
+
+plot(as.party(c1))
+
+c2 <- rpart(grp ~ PropItems + SearchInfo + PropMoves + Pattern,
+            data = dattree, method = "class")
+
+plot(as.party(c2))
+
+#--------------- (4) Investigate variants ---------------
 
 res$start <- res$date.start
 res$complete <- res$date.stop
@@ -558,3 +478,30 @@ for (case in cases) {
 
 }
 
+
+
+
+########################### TODO: Still need it?
+
+
+net <- process_map(alog, render = FALSE)
+#DiagrammeR::get_node_df(net)
+
+DiagrammeR::get_node_info(net)
+
+DiagrammeR::get_degree_distribution(net)
+
+DiagrammeR::get_degree_in(net)
+DiagrammeR::get_degree_out(net)
+DiagrammeR::get_degree_total(net)
+
+
+N <- DiagrammeR::count_nodes(net) - 2   # Do not count start and stop nodes
+
+dc <- DiagrammeR::get_degree_total(net)[1:N, "total_degree"] / (N - 1)
+
+inet <- DiagrammeR::to_igraph(net)
+igraph::centr_degree(inet, loops = FALSE)
+igraph::centr_betw(inet)
+igraph::centr_clo(inet)
+