diff --git a/code/09_user-navigation.R b/code/09_user-navigation.R index bf33425..8752ce9 100644 --- a/code/09_user-navigation.R +++ b/code/09_user-navigation.R @@ -4,13 +4,14 @@ # (1.1) Read log event data # (1.2) Extract additional infos for clustering # (2) Clustering +# (3) Fit tree # (3) Investigate variants # # input: results/haum/event_logfiles_2024-02-21_16-07-33.csv # output: results/haum/event_logfiles_pre-corona_with-clusters_cases.csv # results/haum/dattree.csv # -# last mod: 2024-02-23 +# last mod: 2024-02-27 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") @@ -176,10 +177,6 @@ heatmap(cor_mat) # datcase$duration / mean(datcase$duration) or median(datcase$duration) # * Duration per artwork is low: "ave_duration_item" / datcase$duration -# TODO: Come up with relevant features for navigation behavior - - - dattree <- data.frame(case = datcase$case, Duration = datcase$duration, PropItems = datcase$nitems / length(unique(dat$item)), @@ -257,47 +254,15 @@ write.table(dattree, quote = FALSE, row.names = FALSE) - -tmp <- dat -tmp$start <- tmp$date.start -tmp$complete <- tmp$date.stop - - -alog <- activitylog(tmp[tmp$case == 3448, ], - case_id = "case", - activity_id = "item", - resource_id = "path", - timestamps = c("start", "complete")) - -process_map(alog) - -net <- process_map(alog, render = FALSE) -#DiagrammeR::get_node_df(net) - -DiagrammeR::get_node_info(net) - -DiagrammeR::get_degree_distribution(net) - -DiagrammeR::get_degree_in(net) -DiagrammeR::get_degree_out(net) -DiagrammeR::get_degree_total(net) - - -N <- DiagrammeR::count_nodes(net) - 2 # Do not count start and stop nodes - -dc <- DiagrammeR::get_degree_total(net)[1:N, "total_degree"] / (N - 1) - -inet <- DiagrammeR::to_igraph(net) -igraph::centr_degree(inet, loops = FALSE) -igraph::centr_betw(inet) -igraph::centr_clo(inet) +#--------------- (2) Clustering --------------- df <- dattree[, c("Duration", "PropItems", "SearchInfo", "PropMoves")] +# TODO: With or without duration? Why is it relevant? df$Scholar <- ifelse(dattree$Pattern == "Scholar", 1, 0) df$Star <- ifelse(dattree$Pattern == "Star", 1, 0) df$Dispersion <- ifelse(dattree$Pattern == "Dispersion", 1, 0) # scale Duration and min/max SearchInfo -df$Duration <- scale(df$Duration) +df$Duration <- as.numeric(scale(df$Duration)) df$SearchInfo <- (df$SearchInfo - min(df$SearchInfo)) / (max(df$SearchInfo) - min(df$SearchInfo)) @@ -319,20 +284,20 @@ c5 <- cophenetic(h5) # Correlations cor(mat, c1) -# 0.9029232 +# 0.8854558 cor(mat, c2) -# 0.8879478 +# 0.883313 cor(mat, c3) -# 0.5747296 +# 0.5368663 cor(mat, c4) -# 0.5994121 +# 0.725247 cor(mat, c5) -# 0.5292353 +# 0.3895215 # https://en.wikipedia.org/wiki/Cophenetic_correlation # https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering -hc <- h1 +hc <- h4 # Something like a scree plot (??) plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5) @@ -345,118 +310,51 @@ df$grp <- grp table(grp) fviz_cluster(list(data = df, cluster = grp), - #palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", - # "#000000", "#434F4F"), + palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", "black"), ellipse.type = "convex", - show.clust.cent = FALSE, ggtheme = theme_bw()) + show.clust.cent = FALSE, + ggtheme = theme_bw()) +# Look at 3d plot to see if clusters are actually separate +pc <- prcomp(df) +coor <- as.data.frame(pc$x[, c(1, 2, 3)]) +coor$grp <- df$grp +rgl::plot3d(coor, col = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", "black")[coor$grp]) + + + + + + +dattree$grp <- grp +table(dattree[dattree$grp == 1, "Pattern"]) +table(dattree[dattree$grp == 2, "Pattern"]) table(dattree[dattree$grp == 3, "Pattern"]) +table(dattree[dattree$grp == 4, "Pattern"]) aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, PathLinearity, Singularity, centr_degree, centr_degree_loops, centr_between) ~ grp, dattree, mean) - aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, Dispersion, Scholar, Star) ~ grp, df, mean) +### Look at selected cases ########################################### +tmp <- dat +tmp$start <- tmp$date.start +tmp$complete <- tmp$date.stop -# "We first extract the graph sub-sequences corresponding to the four -# patterns of Canter et al. (1985). We also identified the number of nodes -# to which the learner often goes back to (Fig. 4). These nodes are called -# “central nodes”. If the number of central nodes is lower than or equal to -# half of the sub-sequences, the browsing pattern indicator takes on the -# value “Star”." Bousbia et al. (2010) +alog <- activitylog(tmp[tmp$case == 3448, ], + case_id = "case", + activity_id = "item", + resource_id = "path", + timestamps = c("start", "complete")) -# I do not know how they got the sub-sequences. I am taking the ratio of -# strongly connected nodes to weakly connected nodes. If the number of -# weakly connected nodes is twice as high, the pattern is classified as a -# star, i.e., NodeConnect <= 0.5. -# TODO: This does not make sense, smallest and most frequent number is 3! -# (and I do not understand it...) +process_map(alog) +###################################################################### - -# count_asymmetric_node_pairs Get the number of asymmetrically-connected node pairs -# count_edges Get a count of all edges -# count_loop_edges Get count of all loop edges -# count_mutual_node_pairs Get the number of mutually-connected node pairs -# count_unconnected_node_pairs Get the number of unconnected node pairs -# count_unconnected_nodes Get count of all unconnected nodes - - -# TODO: Read up on centrality measures -# https://www.r-bloggers.com/2018/12/network-centrality-in-r-an-introduction/ -# https://www.datacamp.com/tutorial/centrality-network-analysis-R -# http://davidrajuh.net/reggie/publications/publications-filer/rd114-2018-Network-Centrality.pdf -# https://link.springer.com/article/10.1007/s10618-024-01003-4 - -#--------------- (2) Clustering --------------- - -df <- na.omit(datcase[, c("duration", "distance", "scaleSize", - "rotationDegree", "length", "nmove", - "nitems", "npaths")]) - -#df <- cbind(df, datcase[, c("vacation", "holiday", "weekend", "morning")]) -mat <- dist(scale(df)) -#mat <- dist(df) - -h1 <- hclust(mat, method = "average") -h2 <- hclust(mat, method = "complete") -h3 <- hclust(mat, method = "ward.D") -h4 <- hclust(mat, method = "ward.D2") -h5 <- hclust(mat, method = "single") - -# Cophenetic Distances, for each linkage (runs quite some time!) -c1 <- cophenetic(h1) -c2 <- cophenetic(h2) -c3 <- cophenetic(h3) -c4 <- cophenetic(h4) -c5 <- cophenetic(h5) - -# Correlations -cor(mat, c1) -cor(mat, c2) -cor(mat, c3) -cor(mat, c4) -cor(mat, c5) -# https://en.wikipedia.org/wiki/Cophenetic_correlation -# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering - -hc <- h1 - -# Something like a scree plot (??) -plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5) - - -# TODO: Something is wrong - -k <- 4 - -grp <- cutree(hc, k = k) -df$grp <- grp - -table(grp) - -fviz_cluster(list(data = df, cluster = grp), - #palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", - # "#000000", "#434F4F"), - ellipse.type = "convex", - show.clust.cent = FALSE, ggtheme = theme_bw()) - -aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove, - nitems, npaths) ~ grp, df, mean) -aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove, - nitems, npaths) ~ grp, df, median) -aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove, - nitems, npaths) ~ grp, df, max) - - -df$case <- na.omit(datcase[, c("case", "duration", "distance", "scaleSize", - "rotationDegree", "length", "nmove", - "nitems", "npaths")])$case - -res <- merge(dat, df[, c("case", "grp")], by = "case", all.x = TRUE) +res <- merge(dat, dattree[, c("case", "grp")], by = "case", all.x = TRUE) res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ] xtabs( ~ item + grp, res) @@ -469,13 +367,35 @@ vioplot::vioplot(distance ~ grp, res) vioplot::vioplot(scaleSize ~ grp, res) vioplot::vioplot(rotationDegree ~ grp, res) +aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp, res, mean) +aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp, res, median) + write.table(res, file = "results/haum/event_logfiles_pre-corona_with-clusters_cases.csv", sep = ";", quote = FALSE, row.names = FALSE) -#--------------- (3) Investigate variants --------------- +#--------------- (3) Fit tree --------------- + +library(rpart) +library(partykit) + +dattree$Duration_scaled <- scale(dattree$Duration) +dattree$grp <- factor(dattree$grp) +dattree$Pattern <- factor(dattree$Pattern) + +c1 <- rpart(grp ~ Duration + PropItems + SearchInfo + PropMoves + + Pattern, data = dattree, method = "class") + +plot(as.party(c1)) + +c2 <- rpart(grp ~ PropItems + SearchInfo + PropMoves + Pattern, + data = dattree, method = "class") + +plot(as.party(c2)) + +#--------------- (4) Investigate variants --------------- res$start <- res$date.start res$complete <- res$date.stop @@ -558,3 +478,30 @@ for (case in cases) { } + + + +########################### TODO: Still need it? + + +net <- process_map(alog, render = FALSE) +#DiagrammeR::get_node_df(net) + +DiagrammeR::get_node_info(net) + +DiagrammeR::get_degree_distribution(net) + +DiagrammeR::get_degree_in(net) +DiagrammeR::get_degree_out(net) +DiagrammeR::get_degree_total(net) + + +N <- DiagrammeR::count_nodes(net) - 2 # Do not count start and stop nodes + +dc <- DiagrammeR::get_degree_total(net)[1:N, "total_degree"] / (N - 1) + +inet <- DiagrammeR::to_igraph(net) +igraph::centr_degree(inet, loops = FALSE) +igraph::centr_betw(inet) +igraph::centr_clo(inet) +