Added partitioning to user navigation script
This commit is contained in:
		
							parent
							
								
									42f12b9256
								
							
						
					
					
						commit
						e7eb2cb784
					
				| @ -4,13 +4,14 @@ | |||||||
| #           (1.1) Read log event data | #           (1.1) Read log event data | ||||||
| #           (1.2) Extract additional infos for clustering | #           (1.2) Extract additional infos for clustering | ||||||
| #          (2) Clustering | #          (2) Clustering | ||||||
|  | #          (3) Fit tree | ||||||
| #          (3) Investigate variants | #          (3) Investigate variants | ||||||
| # | # | ||||||
| # input:  results/haum/event_logfiles_2024-02-21_16-07-33.csv | # input:  results/haum/event_logfiles_2024-02-21_16-07-33.csv | ||||||
| # output: results/haum/event_logfiles_pre-corona_with-clusters_cases.csv | # output: results/haum/event_logfiles_pre-corona_with-clusters_cases.csv | ||||||
| #         results/haum/dattree.csv | #         results/haum/dattree.csv | ||||||
| # | # | ||||||
| # last mod: 2024-02-23 | # last mod: 2024-02-27 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") | # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") | ||||||
| @ -176,10 +177,6 @@ heatmap(cor_mat) | |||||||
| #   datcase$duration / mean(datcase$duration) or median(datcase$duration) | #   datcase$duration / mean(datcase$duration) or median(datcase$duration) | ||||||
| #   * Duration per artwork is low: "ave_duration_item" / datcase$duration | #   * Duration per artwork is low: "ave_duration_item" / datcase$duration | ||||||
| 
 | 
 | ||||||
| # TODO: Come up with relevant features for navigation behavior |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| dattree <- data.frame(case = datcase$case, | dattree <- data.frame(case = datcase$case, | ||||||
|                       Duration = datcase$duration, |                       Duration = datcase$duration, | ||||||
|                       PropItems = datcase$nitems / length(unique(dat$item)), |                       PropItems = datcase$nitems / length(unique(dat$item)), | ||||||
| @ -257,47 +254,15 @@ write.table(dattree, | |||||||
|             quote = FALSE, |             quote = FALSE, | ||||||
|             row.names = FALSE) |             row.names = FALSE) | ||||||
| 
 | 
 | ||||||
| 
 | #--------------- (2) Clustering --------------- | ||||||
| tmp <- dat |  | ||||||
| tmp$start <- tmp$date.start |  | ||||||
| tmp$complete <- tmp$date.stop |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| alog <- activitylog(tmp[tmp$case == 3448, ], |  | ||||||
|                     case_id     = "case", |  | ||||||
|                     activity_id = "item", |  | ||||||
|                     resource_id = "path", |  | ||||||
|                     timestamps  = c("start", "complete")) |  | ||||||
| 
 |  | ||||||
| process_map(alog) |  | ||||||
| 
 |  | ||||||
| net <- process_map(alog, render = FALSE) |  | ||||||
| #DiagrammeR::get_node_df(net) |  | ||||||
| 
 |  | ||||||
| DiagrammeR::get_node_info(net) |  | ||||||
| 
 |  | ||||||
| DiagrammeR::get_degree_distribution(net) |  | ||||||
| 
 |  | ||||||
| DiagrammeR::get_degree_in(net) |  | ||||||
| DiagrammeR::get_degree_out(net) |  | ||||||
| DiagrammeR::get_degree_total(net) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| N <- DiagrammeR::count_nodes(net) - 2   # Do not count start and stop nodes |  | ||||||
| 
 |  | ||||||
| dc <- DiagrammeR::get_degree_total(net)[1:N, "total_degree"] / (N - 1) |  | ||||||
| 
 |  | ||||||
| inet <- DiagrammeR::to_igraph(net) |  | ||||||
| igraph::centr_degree(inet, loops = FALSE) |  | ||||||
| igraph::centr_betw(inet) |  | ||||||
| igraph::centr_clo(inet) |  | ||||||
| 
 | 
 | ||||||
| df <- dattree[, c("Duration", "PropItems", "SearchInfo", "PropMoves")] | df <- dattree[, c("Duration", "PropItems", "SearchInfo", "PropMoves")] | ||||||
|  | # TODO: With or without duration? Why is it relevant? | ||||||
| df$Scholar <- ifelse(dattree$Pattern == "Scholar", 1, 0) | df$Scholar <- ifelse(dattree$Pattern == "Scholar", 1, 0) | ||||||
| df$Star <- ifelse(dattree$Pattern == "Star", 1, 0) | df$Star <- ifelse(dattree$Pattern == "Star", 1, 0) | ||||||
| df$Dispersion <- ifelse(dattree$Pattern == "Dispersion", 1, 0) | df$Dispersion <- ifelse(dattree$Pattern == "Dispersion", 1, 0) | ||||||
| # scale Duration and min/max SearchInfo | # scale Duration and min/max SearchInfo | ||||||
| df$Duration <- scale(df$Duration) | df$Duration <- as.numeric(scale(df$Duration)) | ||||||
| df$SearchInfo <- (df$SearchInfo - min(df$SearchInfo)) / | df$SearchInfo <- (df$SearchInfo - min(df$SearchInfo)) / | ||||||
|   (max(df$SearchInfo) - min(df$SearchInfo)) |   (max(df$SearchInfo) - min(df$SearchInfo)) | ||||||
| 
 | 
 | ||||||
| @ -319,20 +284,20 @@ c5 <- cophenetic(h5) | |||||||
| 
 | 
 | ||||||
| # Correlations | # Correlations | ||||||
| cor(mat, c1) | cor(mat, c1) | ||||||
| # 0.9029232 | # 0.8854558 | ||||||
| cor(mat, c2) | cor(mat, c2) | ||||||
| # 0.8879478 | # 0.883313 | ||||||
| cor(mat, c3) | cor(mat, c3) | ||||||
| # 0.5747296 | # 0.5368663 | ||||||
| cor(mat, c4) | cor(mat, c4) | ||||||
| # 0.5994121 | # 0.725247 | ||||||
| cor(mat, c5) | cor(mat, c5) | ||||||
| # 0.5292353 | # 0.3895215 | ||||||
| 
 | 
 | ||||||
| # https://en.wikipedia.org/wiki/Cophenetic_correlation | # https://en.wikipedia.org/wiki/Cophenetic_correlation | ||||||
| # https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering | # https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering | ||||||
| 
 | 
 | ||||||
| hc <- h1 | hc <- h4 | ||||||
| 
 | 
 | ||||||
| # Something like a scree plot (??) | # Something like a scree plot (??) | ||||||
| plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5) | plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5) | ||||||
| @ -345,118 +310,51 @@ df$grp <- grp | |||||||
| table(grp) | table(grp) | ||||||
| 
 | 
 | ||||||
| fviz_cluster(list(data = df, cluster = grp), | fviz_cluster(list(data = df, cluster = grp), | ||||||
|              #palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", |              palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", "black"), | ||||||
|              #            "#000000", "#434F4F"), |  | ||||||
|              ellipse.type = "convex", |              ellipse.type = "convex", | ||||||
|              show.clust.cent = FALSE, ggtheme = theme_bw()) |              show.clust.cent = FALSE, | ||||||
|  |              ggtheme = theme_bw()) | ||||||
| 
 | 
 | ||||||
|  | # Look at 3d plot to see if clusters are actually separate | ||||||
|  | pc <- prcomp(df) | ||||||
|  | coor <- as.data.frame(pc$x[, c(1, 2, 3)]) | ||||||
|  | coor$grp <- df$grp | ||||||
|  | rgl::plot3d(coor, col = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", "black")[coor$grp]) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | dattree$grp <- grp | ||||||
|  | table(dattree[dattree$grp == 1, "Pattern"]) | ||||||
|  | table(dattree[dattree$grp == 2, "Pattern"]) | ||||||
| table(dattree[dattree$grp == 3, "Pattern"]) | table(dattree[dattree$grp == 3, "Pattern"]) | ||||||
|  | table(dattree[dattree$grp == 4, "Pattern"]) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, PathLinearity, | aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, PathLinearity, | ||||||
|                 Singularity, centr_degree, centr_degree_loops, |                 Singularity, centr_degree, centr_degree_loops, | ||||||
|                 centr_between) ~ grp, dattree, mean) |                 centr_between) ~ grp, dattree, mean) | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
| aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, Dispersion, | aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, Dispersion, | ||||||
|                 Scholar, Star) ~ grp, df, mean) |                 Scholar, Star) ~ grp, df, mean) | ||||||
| 
 | 
 | ||||||
|  | ### Look at selected cases ########################################### | ||||||
|  | tmp <- dat | ||||||
|  | tmp$start <- tmp$date.start | ||||||
|  | tmp$complete <- tmp$date.stop | ||||||
| 
 | 
 | ||||||
| # "We first extract the graph sub-sequences corresponding to the four | alog <- activitylog(tmp[tmp$case == 3448, ], | ||||||
| # patterns of Canter et al. (1985). We also identified the number of nodes |                     case_id     = "case", | ||||||
| # to which the learner often goes back to (Fig. 4). These nodes are called |                     activity_id = "item", | ||||||
| # “central nodes”. If the number of central nodes is lower than or equal to |                     resource_id = "path", | ||||||
| # half of the sub-sequences, the browsing pattern indicator takes on the |                     timestamps  = c("start", "complete")) | ||||||
| # value “Star”." Bousbia et al. (2010) |  | ||||||
| 
 | 
 | ||||||
| # I do not know how they got the sub-sequences. I am taking the ratio of | process_map(alog) | ||||||
| # strongly connected nodes to weakly connected nodes. If the number of | ###################################################################### | ||||||
| # weakly connected nodes is twice as high, the pattern is classified as a |  | ||||||
| # star, i.e., NodeConnect <= 0.5. |  | ||||||
| # TODO: This does not make sense, smallest and most frequent number is 3! |  | ||||||
| # (and I do not understand it...) |  | ||||||
| 
 | 
 | ||||||
| 
 | res <- merge(dat, dattree[, c("case", "grp")], by = "case", all.x = TRUE) | ||||||
| # count_asymmetric_node_pairs  Get the number of asymmetrically-connected node pairs |  | ||||||
| # count_edges                  Get a count of all edges |  | ||||||
| # count_loop_edges             Get count of all loop edges |  | ||||||
| # count_mutual_node_pairs      Get the number of mutually-connected node pairs |  | ||||||
| # count_unconnected_node_pairs Get the number of unconnected node pairs |  | ||||||
| # count_unconnected_nodes      Get count of all unconnected nodes |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # TODO: Read up on centrality measures |  | ||||||
| # https://www.r-bloggers.com/2018/12/network-centrality-in-r-an-introduction/ |  | ||||||
| # https://www.datacamp.com/tutorial/centrality-network-analysis-R |  | ||||||
| # http://davidrajuh.net/reggie/publications/publications-filer/rd114-2018-Network-Centrality.pdf |  | ||||||
| # https://link.springer.com/article/10.1007/s10618-024-01003-4 |  | ||||||
| 
 |  | ||||||
| #--------------- (2) Clustering --------------- |  | ||||||
| 
 |  | ||||||
| df <- na.omit(datcase[, c("duration", "distance", "scaleSize", |  | ||||||
|                           "rotationDegree", "length", "nmove", |  | ||||||
|                           "nitems", "npaths")]) |  | ||||||
| 
 |  | ||||||
| #df <- cbind(df, datcase[, c("vacation", "holiday", "weekend", "morning")]) |  | ||||||
| mat <- dist(scale(df)) |  | ||||||
| #mat <- dist(df) |  | ||||||
| 
 |  | ||||||
| h1 <- hclust(mat, method = "average") |  | ||||||
| h2 <- hclust(mat, method = "complete") |  | ||||||
| h3 <- hclust(mat, method = "ward.D") |  | ||||||
| h4 <- hclust(mat, method = "ward.D2") |  | ||||||
| h5 <- hclust(mat, method = "single") |  | ||||||
| 
 |  | ||||||
| # Cophenetic Distances, for each linkage (runs quite some time!) |  | ||||||
| c1 <- cophenetic(h1) |  | ||||||
| c2 <- cophenetic(h2) |  | ||||||
| c3 <- cophenetic(h3) |  | ||||||
| c4 <- cophenetic(h4) |  | ||||||
| c5 <- cophenetic(h5) |  | ||||||
| 
 |  | ||||||
| # Correlations |  | ||||||
| cor(mat, c1) |  | ||||||
| cor(mat, c2) |  | ||||||
| cor(mat, c3) |  | ||||||
| cor(mat, c4) |  | ||||||
| cor(mat, c5) |  | ||||||
| # https://en.wikipedia.org/wiki/Cophenetic_correlation |  | ||||||
| # https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering |  | ||||||
| 
 |  | ||||||
| hc <- h1 |  | ||||||
| 
 |  | ||||||
| # Something like a scree plot (??) |  | ||||||
| plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # TODO: Something is wrong |  | ||||||
| 
 |  | ||||||
| k <- 4 |  | ||||||
| 
 |  | ||||||
| grp <- cutree(hc, k = k) |  | ||||||
| df$grp <- grp |  | ||||||
| 
 |  | ||||||
| table(grp) |  | ||||||
| 
 |  | ||||||
| fviz_cluster(list(data = df, cluster = grp), |  | ||||||
|              #palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", |  | ||||||
|              #            "#000000", "#434F4F"), |  | ||||||
|              ellipse.type = "convex", |  | ||||||
|              show.clust.cent = FALSE, ggtheme = theme_bw()) |  | ||||||
| 
 |  | ||||||
| aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove, |  | ||||||
|                 nitems, npaths) ~ grp, df, mean) |  | ||||||
| aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove, |  | ||||||
|                 nitems, npaths) ~ grp, df, median) |  | ||||||
| aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove, |  | ||||||
|                 nitems, npaths) ~ grp, df, max) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| df$case <- na.omit(datcase[, c("case", "duration", "distance", "scaleSize", |  | ||||||
|                           "rotationDegree", "length", "nmove", |  | ||||||
|                           "nitems", "npaths")])$case |  | ||||||
| 
 |  | ||||||
| res <- merge(dat, df[, c("case", "grp")], by = "case", all.x = TRUE) |  | ||||||
| res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ] | res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ] | ||||||
| 
 | 
 | ||||||
| xtabs( ~ item + grp, res) | xtabs( ~ item + grp, res) | ||||||
| @ -469,13 +367,35 @@ vioplot::vioplot(distance ~ grp, res) | |||||||
| vioplot::vioplot(scaleSize ~ grp, res) | vioplot::vioplot(scaleSize ~ grp, res) | ||||||
| vioplot::vioplot(rotationDegree ~ grp, res) | vioplot::vioplot(rotationDegree ~ grp, res) | ||||||
| 
 | 
 | ||||||
|  | aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp, res, mean) | ||||||
|  | aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ grp, res, median) | ||||||
|  | 
 | ||||||
| write.table(res, | write.table(res, | ||||||
|             file = "results/haum/event_logfiles_pre-corona_with-clusters_cases.csv", |             file = "results/haum/event_logfiles_pre-corona_with-clusters_cases.csv", | ||||||
|             sep = ";", |             sep = ";", | ||||||
|             quote = FALSE, |             quote = FALSE, | ||||||
|             row.names = FALSE) |             row.names = FALSE) | ||||||
| 
 | 
 | ||||||
| #--------------- (3) Investigate variants --------------- | #--------------- (3) Fit tree --------------- | ||||||
|  | 
 | ||||||
|  | library(rpart) | ||||||
|  | library(partykit) | ||||||
|  | 
 | ||||||
|  | dattree$Duration_scaled <- scale(dattree$Duration) | ||||||
|  | dattree$grp <- factor(dattree$grp) | ||||||
|  | dattree$Pattern <- factor(dattree$Pattern) | ||||||
|  | 
 | ||||||
|  | c1 <- rpart(grp ~ Duration + PropItems + SearchInfo + PropMoves + | ||||||
|  |               Pattern, data = dattree, method = "class") | ||||||
|  | 
 | ||||||
|  | plot(as.party(c1)) | ||||||
|  | 
 | ||||||
|  | c2 <- rpart(grp ~ PropItems + SearchInfo + PropMoves + Pattern, | ||||||
|  |             data = dattree, method = "class") | ||||||
|  | 
 | ||||||
|  | plot(as.party(c2)) | ||||||
|  | 
 | ||||||
|  | #--------------- (4) Investigate variants --------------- | ||||||
| 
 | 
 | ||||||
| res$start <- res$date.start | res$start <- res$date.start | ||||||
| res$complete <- res$date.stop | res$complete <- res$date.stop | ||||||
| @ -558,3 +478,30 @@ for (case in cases) { | |||||||
| 
 | 
 | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | ########################### TODO: Still need it? | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | net <- process_map(alog, render = FALSE) | ||||||
|  | #DiagrammeR::get_node_df(net) | ||||||
|  | 
 | ||||||
|  | DiagrammeR::get_node_info(net) | ||||||
|  | 
 | ||||||
|  | DiagrammeR::get_degree_distribution(net) | ||||||
|  | 
 | ||||||
|  | DiagrammeR::get_degree_in(net) | ||||||
|  | DiagrammeR::get_degree_out(net) | ||||||
|  | DiagrammeR::get_degree_total(net) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | N <- DiagrammeR::count_nodes(net) - 2   # Do not count start and stop nodes | ||||||
|  | 
 | ||||||
|  | dc <- DiagrammeR::get_degree_total(net)[1:N, "total_degree"] / (N - 1) | ||||||
|  | 
 | ||||||
|  | inet <- DiagrammeR::to_igraph(net) | ||||||
|  | igraph::centr_degree(inet, loops = FALSE) | ||||||
|  | igraph::centr_betw(inet) | ||||||
|  | igraph::centr_clo(inet) | ||||||
|  | 
 | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user