Added partitioning to user navigation script

2024-02-27 17:28:01 +01:00 · 2024-02-27 17:28:01 +01:00 · e7eb2cb784
commit e7eb2cb784
parent 42f12b9256
1 changed files with 91 additions and 144 deletions
--- a/code/09_user-navigation.R
+++ b/code/09_user-navigation.R
@ -4,13 +4,14 @@
 #           (1.1) Read log event data
 #           (1.2) Extract additional infos for clustering
 #          (2) Clustering
 #          (3) Fit tree
 #          (3) Investigate variants
 #
 # input:  results/haum/event_logfiles_2024-02-21_16-07-33.csv
 # output: results/haum/event_logfiles_pre-corona_with-clusters_cases.csv
 #         results/haum/dattree.csv
 #
-# last mod: 2024-02-23
+# last mod: 2024-02-27
 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
@ -176,10 +177,6 @@ heatmap(cor_mat)
 #   datcase$duration / mean(datcase$duration) or median(datcase$duration)
 #   * Duration per artwork is low: "ave_duration_item" / datcase$duration
 # TODO: Come up with relevant features for navigation behavior
 dattree <- data.frame(case = datcase$case,
                      Duration = datcase$duration,
                      PropItems = datcase$nitems / length(unique(dat$item)),
@ -257,47 +254,15 @@ write.table(dattree,
            quote = FALSE,
            row.names = FALSE)
-
+#--------------- (2) Clustering ---------------
 tmp <- dat
 tmp$start <- tmp$date.start
 tmp$complete <- tmp$date.stop
 alog <- activitylog(tmp[tmp$case == 3448, ],
                    case_id     = "case",
                    activity_id = "item",
                    resource_id = "path",
                    timestamps  = c("start", "complete"))
 process_map(alog)
 net <- process_map(alog, render = FALSE)
 #DiagrammeR::get_node_df(net)
 DiagrammeR::get_node_info(net)
 DiagrammeR::get_degree_distribution(net)
 DiagrammeR::get_degree_in(net)
 DiagrammeR::get_degree_out(net)
 DiagrammeR::get_degree_total(net)
 N <- DiagrammeR::count_nodes(net) - 2   # Do not count start and stop nodes
 dc <- DiagrammeR::get_degree_total(net)[1:N, "total_degree"] / (N - 1)
 inet <- DiagrammeR::to_igraph(net)
 igraph::centr_degree(inet, loops = FALSE)
 igraph::centr_betw(inet)
 igraph::centr_clo(inet)
 df <- dattree[, c("Duration", "PropItems", "SearchInfo", "PropMoves")]
 # TODO: With or without duration? Why is it relevant?
 df$Scholar <- ifelse(dattree$Pattern == "Scholar", 1, 0)
 df$Star <- ifelse(dattree$Pattern == "Star", 1, 0)
 df$Dispersion <- ifelse(dattree$Pattern == "Dispersion", 1, 0)
 # scale Duration and min/max SearchInfo
-df$Duration <- scale(df$Duration)
+df$Duration <- as.numeric(scale(df$Duration))
 df$SearchInfo <- (df$SearchInfo - min(df$SearchInfo)) /
  (max(df$SearchInfo) - min(df$SearchInfo))
@ -319,20 +284,20 @@ c5 <- cophenetic(h5)
 # Correlations
 cor(mat, c1)
-# 0.9029232
+# 0.8854558
 cor(mat, c2)
-# 0.8879478
+# 0.883313
 cor(mat, c3)
-# 0.5747296
+# 0.5368663
 cor(mat, c4)
-# 0.5994121
+# 0.725247
 cor(mat, c5)
-# 0.5292353
+# 0.3895215
 # https://en.wikipedia.org/wiki/Cophenetic_correlation
 # https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
-hc <- h1
+hc <- h4
 # Something like a scree plot (??)
 plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
@ -345,118 +310,51 @@ df$grp <- grp
 table(grp)
 fviz_cluster(list(data = df, cluster = grp),
-             #palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E",
+             palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", "black"),
             #            "#000000", "#434F4F"),
             ellipse.type = "convex",
-             show.clust.cent = FALSE, ggtheme = theme_bw())
+             show.clust.cent = FALSE,
             ggtheme = theme_bw())
 # Look at 3d plot to see if clusters are actually separate
 pc <- prcomp(df)
 coor <- as.data.frame(pc$x[, c(1, 2, 3)])
 coor$grp <- df$grp
 rgl::plot3d(coor, col = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", "black")[coor$grp])
 dattree$grp <- grp
 table(dattree[dattree$grp == 1, "Pattern"])
 table(dattree[dattree$grp == 2, "Pattern"])
 table(dattree[dattree$grp == 3, "Pattern"])
 table(dattree[dattree$grp == 4, "Pattern"])
 aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, PathLinearity,
                Singularity, centr_degree, centr_degree_loops,
                centr_between) ~ grp, dattree, mean)
 aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, Dispersion,
                Scholar, Star) ~ grp, df, mean)
 ### Look at selected cases ###########################################
 tmp <- dat
 tmp$start <- tmp$date.start
 tmp$complete <- tmp$date.stop
-# "We first extract the graph sub-sequences corresponding to the four
+alog <- activitylog(tmp[tmp$case == 3448, ],
-# patterns of Canter et al. (1985). We also identified the number of nodes
+                    case_id     = "case",
-# to which the learner often goes back to (Fig. 4). These nodes are called
+                    activity_id = "item",
-# “central nodes”. If the number of central nodes is lower than or equal to
+                    resource_id = "path",
-# half of the sub-sequences, the browsing pattern indicator takes on the
+                    timestamps  = c("start", "complete"))
 # value “Star”." Bousbia et al. (2010)
-# I do not know how they got the sub-sequences. I am taking the ratio of
+process_map(alog)
-# strongly connected nodes to weakly connected nodes. If the number of
+######################################################################
 # weakly connected nodes is twice as high, the pattern is classified as a
 # star, i.e., NodeConnect <= 0.5.
 # TODO: This does not make sense, smallest and most frequent number is 3!
 # (and I do not understand it...)
-
+res <- merge(dat, dattree[, c("case", "grp")], by = "case", all.x = TRUE)
 # count_asymmetric_node_pairs  Get the number of asymmetrically-connected node pairs
 # count_edges                  Get a count of all edges
 # count_loop_edges             Get count of all loop edges
 # count_mutual_node_pairs      Get the number of mutually-connected node pairs
 # count_unconnected_node_pairs Get the number of unconnected node pairs
 # count_unconnected_nodes      Get count of all unconnected nodes
 # TODO: Read up on centrality measures
 # https://www.r-bloggers.com/2018/12/network-centrality-in-r-an-introduction/
 # https://www.datacamp.com/tutorial/centrality-network-analysis-R
 # http://davidrajuh.net/reggie/publications/publications-filer/rd114-2018-Network-Centrality.pdf
 # https://link.springer.com/article/10.1007/s10618-024-01003-4
 #--------------- (2) Clustering ---------------
 df <- na.omit(datcase[, c("duration", "distance", "scaleSize",
                          "rotationDegree", "length", "nmove",
                          "nitems", "npaths")])
 #df <- cbind(df, datcase[, c("vacation", "holiday", "weekend", "morning")])
 mat <- dist(scale(df))
 #mat <- dist(df)
 h1 <- hclust(mat, method = "average")
 h2 <- hclust(mat, method = "complete")
 h3 <- hclust(mat, method = "ward.D")
 h4 <- hclust(mat, method = "ward.D2")
 h5 <- hclust(mat, method = "single")
 # Cophenetic Distances, for each linkage (runs quite some time!)
 c1 <- cophenetic(h1)
 c2 <- cophenetic(h2)
 c3 <- cophenetic(h3)
 c4 <- cophenetic(h4)
 c5 <- cophenetic(h5)
 # Correlations
 cor(mat, c1)
 cor(mat, c2)
 cor(mat, c3)
 cor(mat, c4)
 cor(mat, c5)
 # https://en.wikipedia.org/wiki/Cophenetic_correlation
 # https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
 hc <- h1
 # Something like a scree plot (??)
 plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5)
 # TODO: Something is wrong
 k <- 4
 grp <- cutree(hc, k = k)
 df$grp <- grp
 table(grp)
 fviz_cluster(list(data = df, cluster = grp),
             #palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E",
             #            "#000000", "#434F4F"),
             ellipse.type = "convex",
             show.clust.cent = FALSE, ggtheme = theme_bw())
 aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove,
                nitems, npaths) ~ grp, df, mean)
 aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove,
                nitems, npaths) ~ grp, df, median)
 aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove,
                nitems, npaths) ~ grp, df, max)
 df$case <- na.omit(datcase[, c("case", "duration", "distance", "scaleSize",
                          "rotationDegree", "length", "nmove",
                          "nitems", "npaths")])$case
 res <- merge(dat, df[, c("case", "grp")], by = "case", all.x = TRUE)
 res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ]
 xtabs( ~ item + grp, res)
@ -469,13 +367,35 @@ vioplot::vioplot(distance ~ grp, res)
 vioplot::vioplot(scaleSize ~ grp, res)
 vioplot::vioplot(rotationDegree ~ grp, res)
 aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp, res, mean)
 aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp, res, median)
 write.table(res,
            file = "results/haum/event_logfiles_pre-corona_with-clusters_cases.csv",
            sep = ";",
            quote = FALSE,
            row.names = FALSE)
-#--------------- (3) Investigate variants ---------------
+#--------------- (3) Fit tree ---------------
 library(rpart)
 library(partykit)
 dattree$Duration_scaled <- scale(dattree$Duration)
 dattree$grp <- factor(dattree$grp)
 dattree$Pattern <- factor(dattree$Pattern)
 c1 <- rpart(grp ~ Duration + PropItems + SearchInfo + PropMoves +
              Pattern, data = dattree, method = "class")
 plot(as.party(c1))
 c2 <- rpart(grp ~ PropItems + SearchInfo + PropMoves + Pattern,
            data = dattree, method = "class")
 plot(as.party(c2))
 #--------------- (4) Investigate variants ---------------
 res$start <- res$date.start
 res$complete <- res$date.stop
@ -558,3 +478,30 @@ for (case in cases) {
 }
 ########################### TODO: Still need it?
 net <- process_map(alog, render = FALSE)
 #DiagrammeR::get_node_df(net)
 DiagrammeR::get_node_info(net)
 DiagrammeR::get_degree_distribution(net)
 DiagrammeR::get_degree_in(net)
 DiagrammeR::get_degree_out(net)
 DiagrammeR::get_degree_total(net)
 N <- DiagrammeR::count_nodes(net) - 2   # Do not count start and stop nodes
 dc <- DiagrammeR::get_degree_total(net)[1:N, "total_degree"] / (N - 1)
 inet <- DiagrammeR::to_igraph(net)
 igraph::centr_degree(inet, loops = FALSE)
 igraph::centr_betw(inet)
 igraph::centr_clo(inet)