From 42f12b925633440ab9d42651a1af76d5cc835d15 Mon Sep 17 00:00:00 2001 From: nwickel Date: Tue, 27 Feb 2024 09:08:20 +0100 Subject: [PATCH] Re-ran preprocessing and updated files; worked on user navigation behavior -- intermediate step --- code/01_preprocessing.R | 2 + code/02_descriptives.R | 4 +- code/04_conformance-checking.py | 3 +- code/05_check-traces.R | 4 +- code/{07_infos-items.py => 06_infos-items.py} | 2 +- ...item-clustering.R => 07_item-clustering.R} | 79 ++-- ...infos-clusters.py => 08_infos-clusters.py} | 0 code/09_user-navigation.R | 421 ++++++++++++++++-- 8 files changed, 451 insertions(+), 64 deletions(-) rename code/{07_infos-items.py => 06_infos-items.py} (91%) rename code/{08_item-clustering.R => 07_item-clustering.R} (70%) rename code/{06_infos-clusters.py => 08_infos-clusters.py} (100%) diff --git a/code/01_preprocessing.R b/code/01_preprocessing.R index 3750883..7d2d4a3 100644 --- a/code/01_preprocessing.R +++ b/code/01_preprocessing.R @@ -126,6 +126,8 @@ dat1 <- merge(datlogs, hd, by.x = "date", by.y = "date", all.x = TRUE) dat2 <- merge(dat1, sfdat, by.x = "date", by.y = "date", all.x = TRUE) dat2$date <- NULL +dat2 <- dat2[order(dat2$fileId.start, dat2$date.start, dat2$timeMs.start), ] + ## Export data diff --git a/code/02_descriptives.R b/code/02_descriptives.R index 7c54e88..a925389 100644 --- a/code/02_descriptives.R +++ b/code/02_descriptives.R @@ -9,7 +9,7 @@ # (3.4) Artwork sequences # (3.5) Topics # -# input: results/haum/event_logfiles_glossar_2023-12-28_09-49-43.csv +# input: results/haum/event_logfiles_2024-02-21_16-07-33.csv # output: # # last mod: 2023-11-15, NW @@ -27,7 +27,7 @@ library(bupaverse) #--------------- (1) Read data --------------- -datlogs <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv", +datlogs <- read.table("results/haum/event_logfiles_2024-02-21_16-07-33.csv", colClasses = c("character", "character", "POSIXct", "POSIXct", "character", "integer", "numeric", "character", "character", diff --git a/code/04_conformance-checking.py b/code/04_conformance-checking.py index f8401d5..9ffff09 100644 --- a/code/04_conformance-checking.py +++ b/code/04_conformance-checking.py @@ -5,7 +5,7 @@ from python_helpers import eval_pm, pn_infos_miner ###### Load data and create event logs ###### -dat = pd.read_csv("results/haum/event_logfiles_2024-01-18_09-58-52.csv", sep = ";") +dat = pd.read_csv("results/haum/event_logfiles_2024-02-21_16-07-33.csv", sep = ";") event_log = pm4py.format_dataframe(dat, case_id = "path", activity_key = "event", @@ -53,6 +53,7 @@ for i in range(len(replayed_traces)): set(l1) x1 = np.array(l1) index_broken = np.where(x1 == 1)[0].tolist() +len(index_broken) set(l3) l4.count([]) diff --git a/code/05_check-traces.R b/code/05_check-traces.R index d640c14..b9431ca 100644 --- a/code/05_check-traces.R +++ b/code/05_check-traces.R @@ -2,11 +2,11 @@ #--------------- (1) Look at broken trace --------------- -datraw <- read.table("results/haum/raw_logfiles_2024-01-18_09-58-52.csv", +datraw <- read.table("results/haum/raw_logfiles_2024-02-21_16-07-33.csv", header = TRUE, sep = ";") -datlogs <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv", +datlogs <- read.table("results/haum/event_logfiles_2024-02-21_16-07-33.csv", colClasses = c("character", "character", "POSIXct", "POSIXct", "character", "integer", "numeric", "character", "character", diff --git a/code/07_infos-items.py b/code/06_infos-items.py similarity index 91% rename from code/07_infos-items.py rename to code/06_infos-items.py index 
901167d..d43e217 100644 --- a/code/07_infos-items.py +++ b/code/06_infos-items.py @@ -6,7 +6,7 @@ from python_helpers import eval_pm, pn_infos ###### Load data and create event logs ###### -dat = pd.read_csv("results/haum/event_logfiles_2024-01-18_09-58-52.csv", sep = ";") +dat = pd.read_csv("results/haum/event_logfiles_2024-02-21_16-07-33.csv", sep = ";") dat = dat[dat["date.start"] < "2020-03-13"] # --> only pre corona (before artworks were updated) dat = dat[dat["path"] != 106098] diff --git a/code/08_item-clustering.R b/code/07_item-clustering.R similarity index 70% rename from code/08_item-clustering.R rename to code/07_item-clustering.R index 941b61d..a3651a8 100644 --- a/code/08_item-clustering.R +++ b/code/07_item-clustering.R @@ -1,4 +1,4 @@ -# 08_item-clustering.R +# 07_item-clustering.R # # content: (1) Read data # (1.1) Read log event data @@ -7,11 +7,11 @@ # (2) Clustering # (3) Visualization with pictures # -# input: results/haum/event_logfiles_2024-01-18_09-58-52.csv +# input: results/haum/event_logfiles_2024-02-21_16-07-33.csv # results/haum/pn_infos_items.csv # output: results/haum/event_logfiles_pre-corona_with-clusters.csv # -# last mod: 2024-01-30 +# last mod: 2024-02-23 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") @@ -23,7 +23,7 @@ library(factoextra) #--------------- (1.1) Read log event data --------------- -dat0 <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv", +dat0 <- read.table("results/haum/event_logfiles_2024-02-21_16-07-33.csv", colClasses = c("character", "character", "POSIXct", "POSIXct", "character", "integer", "numeric", "character", "character", @@ -48,21 +48,47 @@ datitem <- read.table("results/haum/pn_infos_items.csv", header = TRUE, #--------------- (1.3) Extract additional infos for clustering --------------- -datitem$duration <- aggregate(duration ~ item, dat, mean)$duration +dat_split <- split(dat, ~ path) + +time_minmax <- function(subdata) { + subdata$min_time <- min(subdata$timeMs.start) + if (all(is.na(subdata$timeMs.stop))) { + subdata$max_time <- NA + } else { + subdata$max_time <- max(subdata$timeMs.stop, na.rm = TRUE) + } + subdata +} + +dat_list <- pbapply::pblapply(dat_split, time_minmax) +dat_minmax <- dplyr::bind_rows(dat_list) + +datpath <- aggregate(duration ~ item + path, dat, mean, na.action = NULL) + +datpath$min_time <- aggregate(min_time ~ path, dat_minmax, unique, na.action = NULL)$min_time +datpath$max_time <- aggregate(max_time ~ path, dat_minmax, unique, na.action = NULL)$max_time + +datpath$duration_path <- datpath$max_time - datpath$min_time + +# average duration per path +datitem$duration <- aggregate(duration ~ item, datpath, mean)$duration datitem$distance <- aggregate(distance ~ item, dat, mean)$distance datitem$scaleSize <- aggregate(scaleSize ~ item, dat, mean)$scaleSize datitem$rotationDegree <- aggregate(rotationDegree ~ item, dat, mean)$rotationDegree datitem$npaths <- aggregate(path ~ item, dat, function(x) length(unique(x)))$path datitem$ncases <- aggregate(case ~ item, dat, function(x) length(unique(x)))$case -datitem$ntopics <- aggregate(topic ~ item, dat, function(x) length(unique(x)))$topic -datitem$mostfreq_num <- as.numeric(gsub(".*: (.*)}", "\\1", datitem$mostfreq)) +datitem$nmoves <- aggregate(event ~ item, dat, table)$event[,"move"] +datitem$nflipCard <- aggregate(event ~ item, dat, table)$event[,"flipCard"] +datitem$nopenTopic <- aggregate(event ~ item, dat, table)$event[,"openTopic"] +datitem$nopenPopup <- aggregate(event ~ item, 
dat, table)$event[,"openPopup"] #--------------- (2) Clustering --------------- -df <- datitem[, c("precision", "generalizability", "nvariants", - "mostfreq_num", "duration", "distance", "scaleSize", - "rotationDegree", "npaths", "ncases", "ntopics")] |> - scale() +df <- datitem[, c("precision", "generalizability", "nvariants", "duration", + "distance", "scaleSize", "rotationDegree", "npaths", + "ncases", "nmoves", "nopenTopic", "nopenPopup")] |> + scale() + mat <- dist(df) heatmap(as.matrix(mat)) @@ -88,6 +114,7 @@ cor(mat, c3) cor(mat, c4) cor(mat, c5) # https://en.wikipedia.org/wiki/Cophenetic_correlation +# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering # Dendograms par(mfrow=c(3,2)) @@ -101,15 +128,15 @@ plot(h5, main = "Single Linkage") hc <- h1 # Note that ‘agnes(*, method="ward")’ corresponds to ‘hclust(*, "ward.D2")’ -k <- 7 # number of clusters +k <- 4 # number of clusters grp <- cutree(hc, k = k) datitem$grp <- grp fviz_dend(hc, k = k, cex = 0.5, - k_colors = c("#78004B", "#000000", "#3CB4DC", "#91C86E", - "#FF6900", "gold", "#434F4F"), + k_colors = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", + "#000000", "gold", "#434F4F"), #type = "phylogenic", rect = TRUE ) @@ -120,15 +147,16 @@ rect.hclust(hc, k=7, border="blue") rect.hclust(hc, k=6, border="green") p <- fviz_cluster(list(data = df, cluster = grp), - palette = c("#78004B", "#000000", "#3CB4DC", "#91C86E", - "#FF6900", "#434F4F", "gold"), + palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", + "#000000", "#434F4F", "gold"), ellipse.type = "convex", repel = TRUE, show.clust.cent = FALSE, ggtheme = theme_bw()) p aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths, - ncases, ntopics) ~ grp, datitem, median) + ncases, nmoves, nflipCard, nopenTopic, nopenPopup) ~ grp, + datitem, median) # Something like a scree plot (??) 
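# Hedged cross-check on the number of clusters (a sketch, not part of
# the original pipeline): average silhouette widths for candidate k,
# reusing `mat` and `hc` from above. Assumes the cluster package is
# installed.
ks <- 2:10
asw <- sapply(ks, function(k) {
  mean(cluster::silhouette(cutree(hc, k = k), mat)[, "sil_width"])
})
plot(ks, asw, type = "b", xlab = "k", ylab = "Average silhouette width")

# The merge heights plotted below serve a similar purpose: an elbow in
# the curve indicates a candidate number of clusters.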
plot(rev(seq_along(hc$height)), hc$height, type = "l") @@ -189,15 +217,15 @@ library(png) library(jpeg) library(grid) -colors <- c("#78004B", "#000000", "#3CB4DC", "#91C86E", "#FF6900", - "#434F4F") +colors <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E") -#pdf("results/haum/figures/clustering_artworks.pdf", height = 8, width = 8, pointsize = 10) -png("results/haum/figures/clustering_artworks.png", units = "in", height = 8, width = 8, pointsize = 10, res = 300) +pdf("results/figures/clustering_artworks.pdf", height = 8, width = 8, pointsize = 10) +#png("results/figures/clustering_artworks.png", units = "in", height = 8, width = 8, pointsize = 10, res = 300) -par(mai = c(.6,.6,.1,.1), mgp = c(2.4, 1, 0)) +par(mai = c(.4,.4,.1,.1), mgp = c(2.4, 1, 0)) -plot(y ~ x, p$data, type = "n", ylim = c(-3.2, 3), xlim = c(-4.7, 6.4)) +plot(y ~ x, p$data, type = "n", ylim = c(-3.5, 2.8), xlim = c(-5, 10), + xlab = "", ylab = "") for (item in sprintf("%03d", as.numeric(rownames(p$data)))) { @@ -221,12 +249,13 @@ for (item in sprintf("%03d", as.numeric(rownames(p$data)))) { pch = 15) rasterImage(img, - xleft = x - .4, - xright = x + .4, + xleft = x - .45, + xright = x + .45, ybottom = y - .2, ytop = y + .2) } +legend("topright", paste("Cluster", 1:k), col = colors, pch = 15, bty = "n") dev.off() diff --git a/code/06_infos-clusters.py b/code/08_infos-clusters.py similarity index 100% rename from code/06_infos-clusters.py rename to code/08_infos-clusters.py diff --git a/code/09_user-navigation.R b/code/09_user-navigation.R index ca206bb..bf33425 100644 --- a/code/09_user-navigation.R +++ b/code/09_user-navigation.R @@ -6,10 +6,11 @@ # (2) Clustering # (3) Investigate variants # -# input: results/haum/event_logfiles_2024-01-18_09-58-52.csv +# input: results/haum/event_logfiles_2024-02-21_16-07-33.csv # output: results/haum/event_logfiles_pre-corona_with-clusters_cases.csv +# results/haum/dattree.csv # -# last mod: 2024-02-07 +# last mod: 2024-02-23 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") @@ -21,17 +22,18 @@ library(factoextra) #--------------- (1.1) Read log event data --------------- -dat0 <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv", - colClasses = c("character", "character", "POSIXct", - "POSIXct", "character", "integer", - "numeric", "character", "character", - rep("numeric", 3), "character", - "character", rep("numeric", 11), - "character", "character"), - sep = ";", header = TRUE) +dat0 <- read.table("results/haum/event_logfiles_2024-02-21_16-07-33.csv", + colClasses = c("character", "character", "POSIXct", + "POSIXct", "character", "integer", + "numeric", "character", "character", + rep("numeric", 3), "character", + "character", rep("numeric", 11), + "character", "character"), + sep = ";", header = TRUE) dat0$event <- factor(dat0$event, levels = c("move", "flipCard", "openTopic", "openPopup")) +dat0$topic <- factor(dat0$topic) dat0$weekdays <- factor(weekdays(dat0$date.start), levels = c("Montag", "Dienstag", "Mittwoch", @@ -48,18 +50,43 @@ dat <- dat[dat$path != 106098, ] #--------------- (1.2) Extract additional infos for clustering --------------- -datcase <- aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ +datcase <- aggregate(cbind(distance, scaleSize, rotationDegree) ~ case, dat, function(x) mean(x, na.rm = TRUE), na.action = NULL) datcase$length <- aggregate(item ~ case, dat, length)$item + +eventtab <- aggregate(event ~ case, dat, table)["case"] +eventtab$nmove <- aggregate(event ~ case, dat, 
table)$event[, "move"] +eventtab$nflipCard <- aggregate(event ~ case, dat, table)$event[, "flipCard"] +eventtab$nopenTopic <- aggregate(event ~ case, dat, table)$event[, "openTopic"] +eventtab$nopenPopup <- aggregate(event ~ case, dat, table)$event[, "openPopup"] + +topictab <- aggregate(topic ~ case, dat, table)["case"] +topictab$artist <- aggregate(topic ~ case, dat, table)$topic[, 1] +topictab$details <- aggregate(topic ~ case, dat, table)$topic[, 2] +topictab$extra_info <- aggregate(topic ~ case, dat, table)$topic[, 3] +topictab$komposition <- aggregate(topic ~ case, dat, table)$topic[, 4] +topictab$leben_des_kunstwerks <- aggregate(topic ~ case, dat, table)$topic[, 5] +topictab$licht_und_farbe <- aggregate(topic ~ case, dat, table)$topic[, 6] +topictab$technik <- aggregate(topic ~ case, dat, table)$topic[, 7] +topictab$thema <- aggregate(topic ~ case, dat, table)$topic[, 8] + +datcase <- datcase |> + merge(eventtab, by = "case", all = TRUE) |> + merge(topictab, by = "case", all = TRUE) + +datcase$ntopiccards <- aggregate(topic ~ case, dat, + function(x) ifelse(all(is.na(x)), NA, + length(na.omit(x))), na.action = + NULL)$topic +datcase$ntopics <- aggregate(topic ~ case, dat, + function(x) ifelse(all(is.na(x)), NA, + length(unique(na.omit(x)))), na.action = + NULL)$topic datcase$nitems <- aggregate(item ~ case, dat, function(x) length(unique(x)), na.action = NULL)$item datcase$npaths <- aggregate(path ~ case, dat, function(x) length(unique(x)), na.action = NULL)$path -# datcase$ntopics <- aggregate(topic ~ case, dat, -# function(x) ifelse(all(is.na(x)), NA, -# length(unique(na.omit(x)))), na.action = -# NULL)$topic datcase$vacation <- aggregate(vacation ~ case, dat, function(x) ifelse(all(is.na(x)), 0, 1), na.action = NULL)$vacation @@ -73,42 +100,370 @@ datcase$morning <- aggregate(date.start ~ case, dat, function(x) ifelse(lubridate::hour(x[1]) > 13, 0, 1), na.action = NULL)$date.start -datcase <- na.omit(datcase) +dat_split <- split(dat, ~ case) -#--------------- (2) Clustering --------------- +time_minmax <- function(subdata) { + subdata$min_time <- min(subdata$timeMs.start) + if (all(is.na(subdata$timeMs.stop))) { + subdata$max_time <- NA + } else { + subdata$max_time <- max(subdata$timeMs.stop, na.rm = TRUE) + } + subdata +} + +dat_list <- pbapply::pblapply(dat_split, time_minmax) +dat_minmax <- dplyr::bind_rows(dat_list) + +datcase$min_time <- aggregate(min_time ~ case, dat_minmax, unique)$min_time +datcase$max_time <- aggregate(max_time ~ case, dat_minmax, unique)$max_time + +datcase$duration <- datcase$max_time - datcase$min_time +datcase$min_time <- NULL +datcase$max_time <- NULL + +cor_mat <- cor(datcase[, -1], use = "pairwise") +diag(cor_mat) <- NA +heatmap(cor_mat) + +# TODO: Add info if all items of a case are information cards?? + +# Navigation types by Bousbia et al. (2010): +# - Overviewing: this value is close to the Canter “scanning” value. It +# implies that the learner is covering a large proportion of course pages. +# Through this phase of fast-reading, the user seeks to acquire an +# overall view of the course. +# - Flitting: close to “wandering”. It reflects a browsing activity without a +# strategy or a particular goal. The main difference with the overviewing +# type is the lack of focus on the course. +# - Studying: corresponds to a partial or complete reading of the course +# pages where the learner spends time on each page. 
+# - Deepening: This describes a learner who spends relatively long time on a +# course, checking details, and seeking Web documents related to the course +# topics. The main difference with studying is the Web search part that the +# learner uses to obtain a deeper understanding of the course. + +# Taxonomy defined by Canter et al. (1985): +# - Scanning: seeking an overview of a theme (i.e. subpart of the hypermedia) +# by requesting an important proportion of its pages but without spending +# much time on them. +# - Browsing: going wherever the data leads the navigator until catching an +# interest. +# - Exploring: reading the viewed pages thoroughly. +# - Searching: seeking for a particular document or information. +# - Wandering: navigating in an unstructured fashion without any particular +# goal or strategy. + +# Features for navigation types for MTT: +# - Scanning / Overviewing: +# * Proportion of artworks looked at is high: datcase$nitems / 70 +# * Duration per artwork is low: "ave_duration_item" / datcase$duration +# - Exploring: +# * Looking at additional information for most items touched (high value): +# harmonic mean of datcase$nopenTopic / datcase$nflipCard and +# datcase$nopenPopup / datcase$nopenTopic +# - Searching / Studying: +# * Looking only at a few items +# datcase$nitems / 70 is low +# * Opening few cards +# datcase$nflipCard / mean(datcase$nflipCard) or median(datcase$nflipCard) is low +# * but for most cards popups are opened: +# datcase$nopenPopup / datcase$nflipCard is high +# - Wandering / Flitting: +# * Items are mostly just moved: +# datcase$nmove / datcase$length is high +# * Duration per case is low: +# datcase$duration / mean(datcase$duration) or median(datcase$duration) +# * Duration per artwork is low: "ave_duration_item" / datcase$duration + +# TODO: Come up with relevant features for navigation behavior + + + +dattree <- data.frame(case = datcase$case, + Duration = datcase$duration, + PropItems = datcase$nitems / length(unique(dat$item)), + SearchInfo = + 2*(((datcase$nopenPopup / datcase$nopenTopic) * + (datcase$nopenTopic / datcase$nflipCard)) / + ((datcase$nopenPopup / datcase$nopenTopic) + + (datcase$nopenTopic / datcase$nflipCard)) + ), + PropMoves = datcase$nmove / datcase$length, + PathLinearity = datcase$nitems / datcase$npaths, + Singularity = datcase$npaths / datcase$length +) + +dattree$SearchInfo <- ifelse(dattree$SearchInfo %in% 0, 0.1, dattree$SearchInfo) +dattree$SearchInfo <- ifelse(is.na(dattree$SearchInfo), 0, dattree$SearchInfo) + +get_centrality <- function(case, data) { + + data$start <- data$date.start + data$complete <- data$date.stop + + alog <- activitylog(data[data$case == case, ], + case_id = "case", + activity_id = "item", + resource_id = "path", + timestamps = c("start", "complete")) + + net <- process_map(alog, render = FALSE) + inet <- DiagrammeR::to_igraph(net) + + c(igraph::centr_degree(inet, loops = FALSE)$centralization, + igraph::centr_degree(inet, loops = TRUE)$centralization, + igraph::centr_betw(inet)$centralization) +} + + +centrality <- lapply(dattree$case, get_centrality, data = dat) +centrality <- do.call(rbind, centrality) + +save(centrality, file = "results/haum/tmp_centrality.RData") + +dattree$centr_degree <- centrality[, 1] +dattree$centr_degree_loops <- centrality[, 2] +dattree$centr_between <- centrality[, 3] + +par(mfrow = c(3,3)) +hist(dattree$Duration, breaks = 50, main = "") +hist(dattree$SearchInfo, breaks = 50, main = "") +hist(dattree$PropItems, breaks = 50, main = "") +hist(dattree$PropMoves, 
breaks = 50, main = "") +hist(dattree$PathLinearity, breaks = 50, main = "") +hist(dattree$Singularity, breaks = 50, main = "") +hist(dattree$centr_degree, breaks = 50, main = "") +hist(dattree$centr_degree_loops, breaks = 50, main = "") +hist(dattree$centr_between, breaks = 50, main = "") + + +cor_mat <- cor(dattree[, -1], use = "pairwise") +diag(cor_mat) <- NA +heatmap(cor_mat) + + +dattree$Pattern <- "Dispersion" +dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8 & + dattree$Singularity > 0.8, "Scholar", + dattree$Pattern) +dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 & + dattree$centr_between > 0.5, "Star", + dattree$Pattern) + +write.table(dattree, + file = "results/haum/dattree.csv", + sep = ";", + quote = FALSE, + row.names = FALSE) + + +tmp <- dat +tmp$start <- tmp$date.start +tmp$complete <- tmp$date.stop + + +alog <- activitylog(tmp[tmp$case == 3448, ], + case_id = "case", + activity_id = "item", + resource_id = "path", + timestamps = c("start", "complete")) + +process_map(alog) + +net <- process_map(alog, render = FALSE) +#DiagrammeR::get_node_df(net) + +DiagrammeR::get_node_info(net) + +DiagrammeR::get_degree_distribution(net) + +DiagrammeR::get_degree_in(net) +DiagrammeR::get_degree_out(net) +DiagrammeR::get_degree_total(net) + + +N <- DiagrammeR::count_nodes(net) - 2 # Do not count start and stop nodes + +dc <- DiagrammeR::get_degree_total(net)[1:N, "total_degree"] / (N - 1) + +inet <- DiagrammeR::to_igraph(net) +igraph::centr_degree(inet, loops = FALSE) +igraph::centr_betw(inet) +igraph::centr_clo(inet) + +df <- dattree[, c("Duration", "PropItems", "SearchInfo", "PropMoves")] +df$Scholar <- ifelse(dattree$Pattern == "Scholar", 1, 0) +df$Star <- ifelse(dattree$Pattern == "Star", 1, 0) +df$Dispersion <- ifelse(dattree$Pattern == "Dispersion", 1, 0) +# scale Duration and min/max SearchInfo +df$Duration <- scale(df$Duration) +df$SearchInfo <- (df$SearchInfo - min(df$SearchInfo)) / + (max(df$SearchInfo) - min(df$SearchInfo)) -df <- datcase[, c("duration", "distance", "scaleSize", "rotationDegree", - "length", "nitems", "npaths")] |> - scale() -#df <- cbind(df, datcase[, c("vacation", "holiday", "weekend", "morning")]) mat <- dist(df) +# TODO: Do I need to scale all variables? -hc <- hclust(mat, method = "ward.D2") +h1 <- hclust(mat, method = "average") +h2 <- hclust(mat, method = "complete") +h3 <- hclust(mat, method = "ward.D") +h4 <- hclust(mat, method = "ward.D2") +h5 <- hclust(mat, method = "single") -grp <- cutree(hc, k = 6) -datcase$grp <- grp +# Cophenetic Distances, for each linkage (runs quite some time!) +c1 <- cophenetic(h1) +c2 <- cophenetic(h2) +c3 <- cophenetic(h3) +c4 <- cophenetic(h4) +c5 <- cophenetic(h5) + +# Correlations +cor(mat, c1) +# 0.9029232 +cor(mat, c2) +# 0.8879478 +cor(mat, c3) +# 0.5747296 +cor(mat, c4) +# 0.5994121 +cor(mat, c5) +# 0.5292353 + +# https://en.wikipedia.org/wiki/Cophenetic_correlation +# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering + +hc <- h1 + +# Something like a scree plot (??) 
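# The plot below shows the 100 largest dendrogram merge heights in
# decreasing order; a sharp drop marks a candidate cut point. A rough
# numeric version of the same idea (heuristic sketch only):
drops <- -diff(rev(hc$height)[1:20])  # gaps between consecutive sorted heights
which.max(drops) + 1                  # candidate number of clusters at the largest gap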
+plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5) + +k <- 4 + +grp <- cutree(hc, k = k) +df$grp <- grp table(grp) fviz_cluster(list(data = df, cluster = grp), - palette = c("#78004B", "#000000", "#3CB4DC", "#91C86E", - "#FF6900", "#434F4F"), + #palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", + # "#000000", "#434F4F"), ellipse.type = "convex", show.clust.cent = FALSE, ggtheme = theme_bw()) -aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, - nitems, npaths) ~ grp, datcase, mean) -aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, - nitems, npaths) ~ grp, datcase, median) -aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, - nitems, npaths) ~ grp, datcase, max) +table(dattree[dattree$grp == 3, "Pattern"]) -res <- merge(dat, datcase[, c("case", "grp")], by = "case", all.x = TRUE) + +aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, PathLinearity, + Singularity, centr_degree, centr_degree_loops, + centr_between) ~ grp, dattree, mean) + + +aggregate(cbind(Duration, PropItems, SearchInfo, PropMoves, Dispersion, + Scholar, Star) ~ grp, df, mean) + + +# "We first extract the graph sub-sequences corresponding to the four +# patterns of Canter et al. (1985). We also identified the number of nodes +# to which the learner often goes back to (Fig. 4). These nodes are called +# “central nodes”. If the number of central nodes is lower than or equal to +# half of the sub-sequences, the browsing pattern indicator takes on the +# value “Star”." Bousbia et al. (2010) + +# I do not know how they got the sub-sequences. I am taking the ratio of +# strongly connected nodes to weakly connected nodes. If the number of +# weakly connected nodes is twice as high, the pattern is classified as a +# star, i.e., NodeConnect <= 0.5. +# TODO: This does not make sense, smallest and most frequent number is 3! +# (and I do not understand it...) + + +# count_asymmetric_node_pairs Get the number of asymmetrically-connected node pairs +# count_edges Get a count of all edges +# count_loop_edges Get count of all loop edges +# count_mutual_node_pairs Get the number of mutually-connected node pairs +# count_unconnected_node_pairs Get the number of unconnected node pairs +# count_unconnected_nodes Get count of all unconnected nodes + + +# TODO: Read up on centrality measures +# https://www.r-bloggers.com/2018/12/network-centrality-in-r-an-introduction/ +# https://www.datacamp.com/tutorial/centrality-network-analysis-R +# http://davidrajuh.net/reggie/publications/publications-filer/rd114-2018-Network-Centrality.pdf +# https://link.springer.com/article/10.1007/s10618-024-01003-4 + +#--------------- (2) Clustering --------------- + +df <- na.omit(datcase[, c("duration", "distance", "scaleSize", + "rotationDegree", "length", "nmove", + "nitems", "npaths")]) + +#df <- cbind(df, datcase[, c("vacation", "holiday", "weekend", "morning")]) +mat <- dist(scale(df)) +#mat <- dist(df) + +h1 <- hclust(mat, method = "average") +h2 <- hclust(mat, method = "complete") +h3 <- hclust(mat, method = "ward.D") +h4 <- hclust(mat, method = "ward.D2") +h5 <- hclust(mat, method = "single") + +# Cophenetic Distances, for each linkage (runs quite some time!) 
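# Compact, equivalent way to screen the five linkages (hedged sketch;
# it recomputes each clustering from `mat`, so it also runs for a
# while). The explicit h1..h5 / c1..c5 blocks below are kept as is.
linkages <- c("average", "complete", "ward.D", "ward.D2", "single")
sapply(linkages, function(m) cor(mat, cophenetic(hclust(mat, method = m))))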
+c1 <- cophenetic(h1) +c2 <- cophenetic(h2) +c3 <- cophenetic(h3) +c4 <- cophenetic(h4) +c5 <- cophenetic(h5) + +# Correlations +cor(mat, c1) +cor(mat, c2) +cor(mat, c3) +cor(mat, c4) +cor(mat, c5) +# https://en.wikipedia.org/wiki/Cophenetic_correlation +# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering + +hc <- h1 + +# Something like a scree plot (??) +plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5) + + +# TODO: Something is wrong + +k <- 4 + +grp <- cutree(hc, k = k) +df$grp <- grp + +table(grp) + +fviz_cluster(list(data = df, cluster = grp), + #palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E", + # "#000000", "#434F4F"), + ellipse.type = "convex", + show.clust.cent = FALSE, ggtheme = theme_bw()) + +aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove, + nitems, npaths) ~ grp, df, mean) +aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove, + nitems, npaths) ~ grp, df, median) +aggregate(cbind(duration, distance, scaleSize , rotationDegree, length, nmove, + nitems, npaths) ~ grp, df, max) + + +df$case <- na.omit(datcase[, c("case", "duration", "distance", "scaleSize", + "rotationDegree", "length", "nmove", + "nitems", "npaths")])$case + +res <- merge(dat, df[, c("case", "grp")], by = "case", all.x = TRUE) res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ] xtabs( ~ item + grp, res) +aggregate(event ~ grp, res, table) # Look at clusters +par(mfrow = c(2, 2)) vioplot::vioplot(duration ~ grp, res) vioplot::vioplot(distance ~ grp, res) vioplot::vioplot(scaleSize ~ grp, res)
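# Hedged follow-up sketch (not part of the original analysis steps):
# cross-tabulate the case clusters from section (2) with the
# Canter/Bousbia-style Pattern labels derived in dattree, one row per
# case. Cases dropped from the clustering (NA grp) fall out of the
# table.
res_case <- unique(res[, c("case", "grp")])
pattab <- merge(res_case, dattree[, c("case", "Pattern")], by = "case")
round(prop.table(xtabs(~ grp + Pattern, pattab), margin = 1), 2)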
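# Compact alternative (sketch) to the repeated aggregate() calls that
# build eventtab above: a single case-by-event contingency table yields
# the same per-case counts, since event is a factor with the four
# levels move, flipCard, openTopic, openPopup.
evtab <- as.data.frame.matrix(table(dat$case, dat$event))
names(evtab) <- paste0("n", names(evtab))  # nmove, nflipCard, nopenTopic, nopenPopup
evtab$case <- rownames(evtab)              # note: case comes back as character here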
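# Worked illustration of the SearchInfo feature defined above: it is
# the harmonic mean 2ab / (a + b) of the two follow-through ratios
# a = nopenPopup / nopenTopic and b = nopenTopic / nflipCard, so it
# only gets large when a case pursues both steps of the chain
# flipCard -> openTopic -> openPopup.
hmean <- function(a, b) 2 * a * b / (a + b)
hmean(1, 1)    # 1: every flipped card led to a topic, every topic to a popup
hmean(1, 0.1)  # ~0.18: dominated by the weaker of the two ratios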