From 66fab4fa184df99fc33035fa8786a7ef37cdc7ce Mon Sep 17 00:00:00 2001 From: nwickel Date: Fri, 8 Mar 2024 11:52:55 +0100 Subject: [PATCH] Cleaned up scripts; separated case data frame, clustering, and trace analysis into separate files --- code/09_case-characteristics.R | 187 +++++++++++++ code/09_user-navigation.R | 481 --------------------------------- code/10_user-navigation.R | 141 ++++++++++ code/11_investigate-variants.R | 101 +++++++ code/R_helpers.R | 45 +++ 5 files changed, 474 insertions(+), 481 deletions(-) create mode 100644 code/09_case-characteristics.R delete mode 100644 code/09_user-navigation.R create mode 100644 code/10_user-navigation.R create mode 100644 code/11_investigate-variants.R create mode 100644 code/R_helpers.R diff --git a/code/09_case-characteristics.R b/code/09_case-characteristics.R new file mode 100644 index 0000000..7f84ee6 --- /dev/null +++ b/code/09_case-characteristics.R @@ -0,0 +1,187 @@ +# 09_user-navigation.R +# +# content: (1) Read data +# (2) Extract characteristics for cases +# (3) Select features for navigation behavior +# (4) Export data frames +# +# input: results/haum/event_logfiles_2024-02-21_16-07-33.csv +# output: results/haum/eventlogs_pre-corona_case-clusters.csv +# +# last mod: 2024-03-08 + +# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") + +source("R_helpers.R") + +#--------------- (1) Read data --------------- + +load("results/haum/eventlogs_pre-corona_cleaned.RData") + +# Select one year to handle number of cases +dat <- dat[as.Date(dat$date.start) > "2018-12-31" & + as.Date(dat$date.start) < "2020-01-01", ] + +#--------------- (2) Extract characteristics for cases --------------- + +datcase <- aggregate(cbind(distance, scaleSize, rotationDegree) ~ + case, dat, function(x) mean(x, na.rm = TRUE), na.action = NULL) + +datcase$length <- aggregate(item ~ case, dat, length)$item + +eventtab <- aggregate(event ~ case, dat, table)["case"] +eventtab$nmove <- aggregate(event ~ case, dat, table)$event[, "move"] +eventtab$nflipCard <- aggregate(event ~ case, dat, table)$event[, "flipCard"] +eventtab$nopenTopic <- aggregate(event ~ case, dat, table)$event[, "openTopic"] +eventtab$nopenPopup <- aggregate(event ~ case, dat, table)$event[, "openPopup"] + +topictab <- aggregate(topic ~ case, dat, table)["case"] +topictab$artist <- aggregate(topic ~ case, dat, table)$topic[, 1] +topictab$details <- aggregate(topic ~ case, dat, table)$topic[, 2] +topictab$extra_info <- aggregate(topic ~ case, dat, table)$topic[, 3] +topictab$komposition <- aggregate(topic ~ case, dat, table)$topic[, 4] +topictab$leben_des_kunstwerks <- aggregate(topic ~ case, dat, table)$topic[, 5] +topictab$licht_und_farbe <- aggregate(topic ~ case, dat, table)$topic[, 6] +topictab$technik <- aggregate(topic ~ case, dat, table)$topic[, 7] +topictab$thema <- aggregate(topic ~ case, dat, table)$topic[, 8] + +datcase <- datcase |> + merge(eventtab, by = "case", all = TRUE) |> + merge(topictab, by = "case", all = TRUE) + +rm(eventtab, topictab) + +datcase$ntopiccards <- aggregate(topic ~ case, dat, + function(x) ifelse(all(is.na(x)), NA, + length(na.omit(x))), na.action = + NULL)$topic +datcase$ntopics <- aggregate(topic ~ case, dat, + function(x) ifelse(all(is.na(x)), NA, + length(unique(na.omit(x)))), na.action = + NULL)$topic +datcase$nitems <- aggregate(item ~ case, dat, function(x) + length(unique(x)), na.action = NULL)$item +datcase$npaths <- aggregate(path ~ case, dat, function(x) + length(unique(x)), na.action = NULL)$path +datcase$vacation <- aggregate(vacation ~ case, dat, + function(x) ifelse(all(is.na(x)), 0, 1), + na.action = NULL)$vacation +datcase$holiday <- aggregate(holiday ~ case, dat, + function(x) ifelse(all(is.na(x)), 0, 1), + na.action = NULL)$holiday +datcase$weekend <- aggregate(weekdays ~ case, dat, + function(x) ifelse(any(x %in% c("Saturday", "Sunday")), 1, 0), + na.action = NULL)$weekdays +datcase$morning <- aggregate(date.start ~ case, dat, + function(x) ifelse(lubridate::hour(x[1]) > 13, 0, 1), + na.action = NULL)$date.start + +dat_split <- split(dat, ~ case) +dat_list <- pbapply::pblapply(dat_split, time_minmax_ms) +dat_minmax <- dplyr::bind_rows(dat_list) + +datcase$min_time <- aggregate(min_time ~ case, dat_minmax, unique)$min_time +datcase$max_time <- aggregate(max_time ~ case, dat_minmax, unique)$max_time + +datcase$duration <- datcase$max_time - datcase$min_time +datcase$min_time <- NULL +datcase$max_time <- NULL + +artworks <- unique(dat$item)[!unique(dat$item) %in% c("501", "502", "503")] +datcase$infocardOnly <- pbapply::pbsapply(dat_split, check_infocards, artworks = artworks) + +# Clean up NAs +datcase$distance <- ifelse(is.na(datcase$distance), 0, datcase$distance) +datcase$scaleSize <- ifelse(is.na(datcase$scaleSize), 1, datcase$scaleSize) +datcase$rotationDegree <- ifelse(is.na(datcase$rotationDegree), 0, datcase$rotationDegree) +datcase$artist <- ifelse(is.na(datcase$artist), 0, datcase$artist) +datcase$details <- ifelse(is.na(datcase$details), 0, datcase$details) +datcase$extra_info <- ifelse(is.na(datcase$extra_info), 0, datcase$extra_info) +datcase$komposition <- ifelse(is.na(datcase$komposition), 0, datcase$komposition) +datcase$leben_des_kunstwerks <- ifelse(is.na(datcase$leben_des_kunstwerks), 0, datcase$leben_des_kunstwerks) +datcase$licht_und_farbe <- ifelse(is.na(datcase$licht_und_farbe), 0, datcase$licht_und_farbe) +datcase$technik <- ifelse(is.na(datcase$technik), 0, datcase$technik) +datcase$thema <- ifelse(is.na(datcase$thema), 0, datcase$thema) +datcase$ntopics <- ifelse(is.na(datcase$ntopics), 0, datcase$ntopics) +datcase$ntopiccards <- ifelse(is.na(datcase$ntopiccards), 0, datcase$ntopiccards) + +#--------------- (3) Select features for navigation behavior --------------- + +# Features for navigation types for MTT: +# - Scanning / Overviewing: +# * Proportion of artworks looked at is high +# * Duration per artwork is low: "ave_duration_item" / datcase$duration +# - Exploring: +# * Looking at additional information is high +# - Searching / Studying: +# * Proportion of artworks looked at is low +# * Opening few cards +# datcase$nflipCard / mean(datcase$nflipCard) or median(datcase$nflipCard) is low +# * but for most cards popups are opened: +# datcase$nopenPopup / datcase$nflipCard is high +# - Wandering / Flitting: +# * Proportion of moves is high +# * Duration per case is low: +# datcase$duration / mean(datcase$duration) or median(datcase$duration) +# * Duration per artwork is low: "ave_duration_item" / datcase$duration + +dattree <- data.frame(case = datcase$case, + PropItems = datcase$nitems / length(unique(dat$item)), + SearchInfo = (datcase$nopenTopic + + datcase$nopenPopup) / datcase$length, + PropMoves = datcase$nmove / datcase$length, + PathLinearity = datcase$nitems / datcase$npaths, + Singularity = datcase$npaths / datcase$length +) + +# centrality <- pbapply::pbsapply(dattree$case, get_centrality, data = dat) +# save(centrality, file = "results/haum/tmp_centrality.RData") +load("results/haum/tmp_centrality.RData") + +dattree$BetweenCentrality <- centrality + +# Average duration per item +dat_split <- split(dat[, c("item", "case", "path", "timeMs.start", "timeMs.stop")], ~ path) +dat_list <- pbapply::pblapply(dat_split, time_minmax_ms) +dat_minmax <- dplyr::bind_rows(dat_list) + +tmp <- aggregate(min_time ~ path, dat_minmax, unique) +tmp$max_time <- aggregate(max_time ~ path, dat_minmax, unique, na.action = NULL)$max_time +tmp$duration <- tmp$max_time - tmp$min_time +tmp$case <- aggregate(case ~ path, dat_minmax, unique)$case + +dattree$AvDurItem <- aggregate(duration ~ case, tmp, mean)$duration + +rm(tmp) + +# Indicator variable if table was used as info terminal only +dattree$InfocardOnly <- factor(datcase$infocardOnly, levels = 0:1, + labels = c("no", "yes")) + +# Add pattern to datcase; loosely based on Bousbia et al. (2009) +dattree$Pattern <- "Dispersion" +dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8, "Scholar", + dattree$Pattern) +dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 & + dattree$BetweenCentrality >= 0.5, "Star", + dattree$Pattern) +dattree$Pattern <- factor(dattree$Pattern) + +dattree$AvDurItemNorm <- normalize(dattree$AvDurItem) + +#--------------- (4) Export data frames --------------- + +save(datcase, dattree, file = "results/haum/dataframes_case_2019.RData") + +write.table(datcase, + file = "results/haum/datcase.csv", + sep = ";", + quote = FALSE, + row.names = FALSE) + +write.table(datcase, + file = "results/haum/dattree.csv", + sep = ";", + quote = FALSE, + row.names = FALSE) + diff --git a/code/09_user-navigation.R b/code/09_user-navigation.R deleted file mode 100644 index b6f5b02..0000000 --- a/code/09_user-navigation.R +++ /dev/null @@ -1,481 +0,0 @@ -# 09_user-navigation.R -# -# content: (1) Read data -# (1.1) Read log event data -# (1.2) Extract additional infos for clustering -# (2) Clustering -# (3) Fit tree -# (4) Investigate variants -# -# input: results/haum/event_logfiles_2024-02-21_16-07-33.csv -# output: results/haum/eventlogs_pre-corona_case-clusters.csv -# -# last mod: 2024-03-06 - - -# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") - -library(bupaverse) -library(factoextra) - -#--------------- (1) Read data --------------- - -#--------------- (1.1) Read log event data --------------- - -load("results/haum/eventlogs_pre-corona_cleaned.RData") - -# Select one year to handle number of cases -dat <- dat[as.Date(dat$date.start) > "2018-12-31" & - as.Date(dat$date.start) < "2020-01-01", ] - -#--------------- (1.2) Extract additional infos for clustering --------------- - -datcase <- aggregate(cbind(distance, scaleSize, rotationDegree) ~ - case, dat, function(x) mean(x, na.rm = TRUE), na.action = NULL) - -datcase$length <- aggregate(item ~ case, dat, length)$item - -eventtab <- aggregate(event ~ case, dat, table)["case"] -eventtab$nmove <- aggregate(event ~ case, dat, table)$event[, "move"] -eventtab$nflipCard <- aggregate(event ~ case, dat, table)$event[, "flipCard"] -eventtab$nopenTopic <- aggregate(event ~ case, dat, table)$event[, "openTopic"] -eventtab$nopenPopup <- aggregate(event ~ case, dat, table)$event[, "openPopup"] - -topictab <- aggregate(topic ~ case, dat, table)["case"] -topictab$artist <- aggregate(topic ~ case, dat, table)$topic[, 1] -topictab$details <- aggregate(topic ~ case, dat, table)$topic[, 2] -topictab$extra_info <- aggregate(topic ~ case, dat, table)$topic[, 3] -topictab$komposition <- aggregate(topic ~ case, dat, table)$topic[, 4] -topictab$leben_des_kunstwerks <- aggregate(topic ~ case, dat, table)$topic[, 5] -topictab$licht_und_farbe <- aggregate(topic ~ case, dat, table)$topic[, 6] -topictab$technik <- aggregate(topic ~ case, dat, table)$topic[, 7] -topictab$thema <- aggregate(topic ~ case, dat, table)$topic[, 8] - -datcase <- datcase |> - merge(eventtab, by = "case", all = TRUE) |> - merge(topictab, by = "case", all = TRUE) - -rm(eventtab, topictab) - -datcase$ntopiccards <- aggregate(topic ~ case, dat, - function(x) ifelse(all(is.na(x)), NA, - length(na.omit(x))), na.action = - NULL)$topic -datcase$ntopics <- aggregate(topic ~ case, dat, - function(x) ifelse(all(is.na(x)), NA, - length(unique(na.omit(x)))), na.action = - NULL)$topic -datcase$nitems <- aggregate(item ~ case, dat, function(x) - length(unique(x)), na.action = NULL)$item -datcase$npaths <- aggregate(path ~ case, dat, function(x) - length(unique(x)), na.action = NULL)$path -datcase$vacation <- aggregate(vacation ~ case, dat, - function(x) ifelse(all(is.na(x)), 0, 1), - na.action = NULL)$vacation -datcase$holiday <- aggregate(holiday ~ case, dat, - function(x) ifelse(all(is.na(x)), 0, 1), - na.action = NULL)$holiday -datcase$weekend <- aggregate(weekdays ~ case, dat, - function(x) ifelse(any(x %in% c("Saturday", "Sunday")), 1, 0), - na.action = NULL)$weekdays -datcase$morning <- aggregate(date.start ~ case, dat, - function(x) ifelse(lubridate::hour(x[1]) > 13, 0, 1), - na.action = NULL)$date.start - -dat_split <- split(dat, ~ case) - -time_minmax_ms <- function(subdata) { - subdata$min_time <- min(subdata$timeMs.start) - if (all(is.na(subdata$timeMs.stop))) { - subdata$max_time <- NA - } else { - subdata$max_time <- max(subdata$timeMs.stop, na.rm = TRUE) - } - subdata -} -# TODO: Move to helper file - -dat_list <- pbapply::pblapply(dat_split, time_minmax_ms) -dat_minmax <- dplyr::bind_rows(dat_list) - -datcase$min_time <- aggregate(min_time ~ case, dat_minmax, unique)$min_time -datcase$max_time <- aggregate(max_time ~ case, dat_minmax, unique)$max_time - -datcase$duration <- datcase$max_time - datcase$min_time -datcase$min_time <- NULL -datcase$max_time <- NULL - - -check_infocards <- function(subdata, artworks) { - infocard_only <- NULL - if(any(unique(subdata$item) %in% artworks)) { - infocard_only <- FALSE - } else { - infocard_only <- TRUE - } - as.numeric(infocard_only) -} -# TODO: Move to helper file - -artworks <- unique(dat$item)[!unique(dat$item) %in% c("501", "502", "503")] - -datcase$infocardOnly <- pbapply::pbsapply(dat_split, check_infocards, artworks = artworks) - - -# Clean up NAs -datcase$distance <- ifelse(is.na(datcase$distance), 0, datcase$distance) -datcase$scaleSize <- ifelse(is.na(datcase$scaleSize), 1, datcase$scaleSize) -datcase$rotationDegree <- ifelse(is.na(datcase$rotationDegree), 0, datcase$rotationDegree) -datcase$artist <- ifelse(is.na(datcase$artist), 0, datcase$artist) -datcase$details <- ifelse(is.na(datcase$details), 0, datcase$details) -datcase$extra_info <- ifelse(is.na(datcase$extra_info), 0, datcase$extra_info) -datcase$komposition <- ifelse(is.na(datcase$komposition), 0, datcase$komposition) -datcase$leben_des_kunstwerks <- ifelse(is.na(datcase$leben_des_kunstwerks), 0, datcase$leben_des_kunstwerks) -datcase$licht_und_farbe <- ifelse(is.na(datcase$licht_und_farbe), 0, datcase$licht_und_farbe) -datcase$technik <- ifelse(is.na(datcase$technik), 0, datcase$technik) -datcase$thema <- ifelse(is.na(datcase$thema), 0, datcase$thema) -datcase$ntopics <- ifelse(is.na(datcase$ntopics), 0, datcase$ntopics) -datcase$ntopiccards <- ifelse(is.na(datcase$ntopiccards), 0, datcase$ntopiccards) - -cor_mat <- cor(datcase[, -1], use = "pairwise") -diag(cor_mat) <- NA -heatmap(cor_mat) - - -normalize <- function(x) { - (x - min(x)) / (max(x) - min(x)) -} -# TODO: Move to helper file - -# Features for navigation types for MTT: -# - Scanning / Overviewing: -# * Proportion of artworks looked at is high -# * Duration per artwork is low: "ave_duration_item" / datcase$duration -# - Exploring: -# * Looking at additional information is high -# - Searching / Studying: -# * Proportion of artworks looked at is low -# * Opening few cards -# datcase$nflipCard / mean(datcase$nflipCard) or median(datcase$nflipCard) is low -# * but for most cards popups are opened: -# datcase$nopenPopup / datcase$nflipCard is high -# - Wandering / Flitting: -# * Proportion of moves is high -# * Duration per case is low: -# datcase$duration / mean(datcase$duration) or median(datcase$duration) -# * Duration per artwork is low: "ave_duration_item" / datcase$duration - -dattree <- data.frame(case = datcase$case, - PropItems = datcase$nitems / length(unique(dat$item)), - SearchInfo = datcase$nopenTopic + datcase$nopenPopup, - PropMoves = datcase$nmove / datcase$length, - PathLinearity = datcase$nitems / datcase$npaths, - Singularity = datcase$npaths / datcase$length -) - -dattree$SearchInfo <- ifelse(is.na(dattree$NumTopic), 0, dattree$NumTopic) - -get_centrality <- function(case, data) { - - data$start <- data$date.start - data$complete <- data$date.stop - - alog <- activitylog(data[data$case == case, ], - case_id = "case", - activity_id = "item", - resource_id = "path", - timestamps = c("start", "complete")) - - net <- process_map(alog, render = FALSE) - inet <- DiagrammeR::to_igraph(net) - - #c(igraph::centr_degree(inet, loops = FALSE)$centralization, - # igraph::centr_degree(inet, loops = TRUE)$centralization, - # igraph::centr_betw(inet)$centralization) - igraph::centr_betw(inet)$centralization -} -# TODO: Move to helper file - -centrality <- pbapply::pblapply(dattree$case, get_centrality, data = dat) -centrality <- do.call(rbind, centrality) - -# save(centrality, file = "results/haum/tmp_centrality.RData") -#load("results/haum/tmp_centrality.RData") - -dattree$BetweenCentrality <- unlist(centrality) - -# Average duration per item -dat_split <- split(dat[, c("item", "case", "path", "timeMs.start", "timeMs.stop")], ~ path) -dat_list <- pbapply::pblapply(dat_split, time_minmax_ms) -dat_minmax <- dplyr::bind_rows(dat_list) - -tmp <- aggregate(min_time ~ path, dat_minmax, unique) -tmp$max_time <- aggregate(max_time ~ path, dat_minmax, unique, na.action = NULL)$max_time -tmp$duration <- tmp$max_time - tmp$min_time -tmp$case <- aggregate(case ~ path, dat_minmax, unique)$case - -dattree$AvDurItem <- aggregate(duration ~ case, tmp, mean)$duration - -rm(tmp) - -# Indicator variable if table was used as info terminal only -dattree$InfocardOnly <- factor(datcase$infocardOnly, levels = 0:1, - labels = c("no", "yes")) - -# Add pattern to datcase; loosely based on Bousbia et al. (2009) -dattree$Pattern <- "Dispersion" -dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8 & dattree$Singularity > 0.8, "Scholar", - dattree$Pattern) -dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 & - dattree$BetweenCentrality > 0.5, "Star", - dattree$Pattern) -dattree$Pattern <- factor(dattree$Pattern) -# TODO: Get rid of PathLinearity and Singularity as features when I am -# using Pattern? - -dattree$PathLinearity <- NULL -dattree$Singularity <- NULL -dattree$BetweenCentrality <- NULL - -summary(dattree) - -plot(dattree[, -1], pch = ".") - -par(mfrow = c(2,4)) -hist(dattree$AvDurItem, breaks = 50, main = "") -hist(dattree$NumItems, breaks = 50, main = "") -hist(dattree$NumTopic, breaks = 50, main = "") -hist(dattree$NumPopup, breaks = 50, main = "") -hist(dattree$PropMoves, breaks = 50, main = "") -hist(dattree$PathLinearity, breaks = 50, main = "") -hist(dattree$Singularity, breaks = 50, main = "") -hist(dattree$BetweenCentrality, breaks = 50, main = "") - -#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -# Remove cases with extreme outliers -# TODO: Do I want this??? - -quantile(datcase$nopenTopic, 0.999) -quantile(datcase$nopenPopup, 0.999) - -dattree <- dattree[!(dattree$NumTopic > 40 | dattree$NumPopup > 40), ] - -plot(dattree[, -1], pch = ".") - -par(mfrow = c(2,4)) -hist(dattree$AvDurItem, breaks = 50, main = "") -hist(dattree$NumItems, breaks = 50, main = "") -hist(dattree$NumTopic, breaks = 50, main = "") -hist(dattree$NumPopup, breaks = 50, main = "") -hist(dattree$PropMoves, breaks = 50, main = "") -hist(dattree$PathLinearity, breaks = 50, main = "") -hist(dattree$Singularity, breaks = 50, main = "") -hist(dattree$BetweenCentrality, breaks = 50, main = "") -#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -#--------------- (2) Clustering --------------- - -df <- dattree[, -1] # remove case variable - -# Normalize Duration and SearchInfo -df$AvDurItem <- normalize(df$AvDurItem) -df$SearchInfo <- normalize(df$SearchInfo) - -summary(df) - -# Look at collinearity -cor_mat <- cor(df) -diag(cor_mat) <- NA -heatmap(cor_mat) - -#--------------- (2.2) Hierarchical clustering --------------- - -dist_mat <- cluster::daisy(df, metric = "gower") - -# "Flatten" with MDS -coor_2d <- as.data.frame(cmdscale(dist_mat, k = 2)) -coor_3d <- as.data.frame(cmdscale(dist_mat, k = 3)) -# TODO: Better use MASS::isoMDS() since I am not using Euclidean distances? - -plot(coor_2d) -rgl::plot3d(coor_3d) - -method <- c(average = "average", single = "single", complete = "complete", - ward = "ward") - -method <- "ward" - -hcs <- pbapply::pblapply(method, function(x) cluster::agnes(dist_mat, method = x)) -acs <- pbapply::pbsapply(hcs, function(x) x$ac) - -hc <- hcs$ward - -# Something like a scree plot (??) -plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5) - -k <- 4 - -mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E") - -cluster <- cutree(as.hclust(hc), k = k) - -table(cluster) - -plot(coor_2d, col = mycols[cluster]) -legend("topleft", paste("Cl", 1:4), col = mycols, pch = 21) -rgl::plot3d(coor_3d, col = mycols[cluster]) - -table(dattree[cluster == 1, "Pattern"]) -table(dattree[cluster == 2, "Pattern"]) -table(dattree[cluster == 3, "Pattern"]) -table(dattree[cluster == 4, "Pattern"]) - -table(dattree[cluster == 1, "InfocardOnly"]) -table(dattree[cluster == 2, "InfocardOnly"]) -table(dattree[cluster == 3, "InfocardOnly"]) -table(dattree[cluster == 4, "InfocardOnly"]) - -aggregate(. ~ cluster, df, mean) - -aggregate(cbind(duration, distance, scaleSize, rotationDegree, length, - nmove, nflipCard, nopenTopic, nopenPopup) ~ cluster, datcase, - mean) - -### Look at selected cases ########################################### -tmp <- dat -tmp$start <- tmp$date.start -tmp$complete <- tmp$date.stop - -alog <- activitylog(tmp[tmp$case == 24016, ], - case_id = "case", - activity_id = "item", - resource_id = "path", - timestamps = c("start", "complete")) - -process_map(alog) - -rm(tmp) - -###################################################################### - -res <- merge(dat, data.frame(case = dattree$case, cluster), - by = "case", all.x = TRUE) -res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ] - -xtabs( ~ item + cluster, res) -aggregate(event ~ cluster, res, table) - -# Look at clusters -par(mfrow = c(2, 2)) -vioplot::vioplot(duration ~ cluster, res) -vioplot::vioplot(distance ~ cluster, res) -vioplot::vioplot(scaleSize ~ cluster, res) -vioplot::vioplot(rotationDegree ~ cluster, res) - -aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ cluster, res, mean) -aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ cluster, res, median) - -write.table(res, - file = "results/haum/eventlogs_2019_case-clusters.csv", - sep = ";", - quote = FALSE, - row.names = FALSE) - -save(res, dist_mat, hcs, acs, datcase, dattree, coor_2d, coor_3d, - file = "results/haum/tmp_user-navigation.RData") - -#--------------- (3) Fit tree --------------- - -library(rpart) -library(partykit) - -c1 <- rpart(as.factor(cluster) ~ ., data = dattree[, -1], method = "class") -plot(as.party(c1)) - -# with conditional tree -c2 <- ctree(as.factor(cluster) ~ ., data = dattree[, -1], alpha = 0) -plot(c2) - -#--------------- (4) Investigate variants --------------- - -res$start <- res$date.start -res$complete <- res$date.stop - -alog <- activitylog(res, - case_id = "case", - activity_id = "item", - resource_id = "path", - timestamps = c("start", "complete")) - -trace_explorer(alog, n_traces = 25) -# --> sequences of artworks are just too rare - -tr <- traces(alog) -trace_length <- pbapply::pbsapply(strsplit(tr$trace, ","), length) -tr[trace_length > 10, ] - -trace_varied <- pbapply::pbsapply(strsplit(tr$trace, ","), function(x) length(unique(x))) -tr[trace_varied > 1, ] -table(tr[trace_varied > 2, "absolute_frequency"]) -table(tr[trace_varied > 3, "absolute_frequency"]) - -summary(tr$absolute_frequency) -vioplot::vioplot(tr$absolute_frequency) - -# Power law for frequencies of traces -tab <- table(tr$absolute_frequency) -x <- as.numeric(tab) -y <- as.numeric(names(tab)) - -plot(x, y, log = "xy") -p1 <- lm(log(y) ~ log(x)) -pre <- exp(coef(p1)[1]) * x^coef(p1)[2] -lines(x, pre) - - -# Look at individual traces as examples -tr[trace_varied == 5 & trace_length > 50, ] -# --> every variant exists only once, of course -datcase[datcase$nitems == 5 & datcase$length > 50,] - -pbapply::pbsapply(datcase[, -c(1, 9)], median) - -#ex <- datcase[datcase$nitems == 4 & datcase$length == 15,] -ex <- datcase[datcase$nitems == 5,] -ex <- ex[sample(1:nrow(ex), 20), ] -# --> pretty randomly chosen... TODO: - -case_ids <- NULL - -for (case in ex$case) { - if ("080" %in% res$item[res$case == case] | "503" %in% res$item[res$case == case]) { - case_ids <- c(case_ids, TRUE) - } else { - case_ids <- c(case_ids, FALSE) - } -} - -cases <- ex$case[case_ids] - - -for (case in cases) { - - alog <- activitylog(res[res$case == case, ], - case_id = "case", - activity_id = "item", - resource_id = "path", - timestamps = c("start", "complete")) - - dfg <- process_map(alog, - type_nodes = frequency("absolute", color_scale = "Greys"), - type_edges = frequency("absolute", color_edges = "#FF6900"), - rankdir = "LR", - render = FALSE) - export_map(dfg, - file_name = paste0("results/processmaps/dfg_example_cases_", case, "_R.pdf"), - file_type = "pdf", - title = paste("Case", case)) - - -} - diff --git a/code/10_user-navigation.R b/code/10_user-navigation.R new file mode 100644 index 0000000..29ff354 --- /dev/null +++ b/code/10_user-navigation.R @@ -0,0 +1,141 @@ +# 10_user-navigation.R +# +# content: (1) Load data +# (2) Clustering +# (3) Fit tree +# (4) Investigate variants +# +# input: results/haum/event_logfiles_2024-02-21_16-07-33.csv +# output: results/haum/eventlogs_pre-corona_case-clusters.csv +# +# last mod: 2024-03-08 + + +# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") + +library(bupaverse) +library(factoextra) +library(rpart) +library(partykit) + +#--------------- (1) Load data --------------- + +load("results/haum/dataframes_case_2019.RData") + +#--------------- (2) Clustering --------------- + +df <- dattree[, -1] + +summary(df) + +# Look at collinearity +cor_mat <- cor(df) +diag(cor_mat) <- NA +heatmap(cor_mat) + +#--------------- (2.2) Hierarchical clustering --------------- + +dist_mat <- cluster::daisy(df, metric = "gower") + +# "Flatten" with MDS +coor_2d <- as.data.frame(cmdscale(dist_mat, k = 2)) +coor_3d <- as.data.frame(cmdscale(dist_mat, k = 3)) +# TODO: Better use MASS::isoMDS() since I am not using Euclidean distances? + +plot(coor_2d) +rgl::plot3d(coor_3d) + +method <- c(average = "average", single = "single", complete = "complete", + ward = "ward") + +method <- "ward" + +hcs <- pbapply::pblapply(method, function(x) cluster::agnes(dist_mat, method = x)) +acs <- pbapply::pbsapply(hcs, function(x) x$ac) + +hc <- hcs$ward + +# Something like a scree plot (??) +plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5) + +k <- 4 + +mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E") + +cluster <- cutree(as.hclust(hc), k = k) + +table(cluster) + +plot(coor_2d, col = mycols[cluster]) +legend("topleft", paste("Cl", 1:4), col = mycols, pch = 21) +rgl::plot3d(coor_3d, col = mycols[cluster]) + +table(dattree[cluster == 1, "Pattern"]) +table(dattree[cluster == 2, "Pattern"]) +table(dattree[cluster == 3, "Pattern"]) +table(dattree[cluster == 4, "Pattern"]) + +table(dattree[cluster == 1, "InfocardOnly"]) +table(dattree[cluster == 2, "InfocardOnly"]) +table(dattree[cluster == 3, "InfocardOnly"]) +table(dattree[cluster == 4, "InfocardOnly"]) + +aggregate(. ~ cluster, df, mean) + +aggregate(cbind(duration, distance, scaleSize, rotationDegree, length, + nmove, nflipCard, nopenTopic, nopenPopup) ~ cluster, datcase, + mean) + +### Look at selected cases ########################################### +tmp <- dat +tmp$start <- tmp$date.start +tmp$complete <- tmp$date.stop + +alog <- activitylog(tmp[tmp$case == 24016, ], + case_id = "case", + activity_id = "item", + resource_id = "path", + timestamps = c("start", "complete")) + +process_map(alog) + +rm(tmp) + +###################################################################### + +res <- merge(dat, data.frame(case = dattree$case, cluster), + by = "case", all.x = TRUE) +res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ] + +xtabs( ~ item + cluster, res) +aggregate(event ~ cluster, res, table) + +# Look at clusters +par(mfrow = c(2, 2)) +vioplot::vioplot(duration ~ cluster, res) +vioplot::vioplot(distance ~ cluster, res) +vioplot::vioplot(scaleSize ~ cluster, res) +vioplot::vioplot(rotationDegree ~ cluster, res) + +aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ cluster, res, mean) +aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ cluster, res, median) + +write.table(res, + file = "results/haum/eventlogs_2019_case-clusters.csv", + sep = ";", + quote = FALSE, + row.names = FALSE) + +save(res, dist_mat, hcs, acs, datcase, dattree, coor_2d, coor_3d, + file = "results/haum/tmp_user-navigation.RData") + +#--------------- (3) Fit tree --------------- + +c1 <- rpart(as.factor(cluster) ~ ., data = dattree[, -1], method = "class") +plot(as.party(c1)) + +# with conditional tree +c2 <- ctree(as.factor(cluster) ~ ., data = dattree[, -1], alpha = 0) +plot(c2) + + diff --git a/code/11_investigate-variants.R b/code/11_investigate-variants.R new file mode 100644 index 0000000..5fae799 --- /dev/null +++ b/code/11_investigate-variants.R @@ -0,0 +1,101 @@ +# 11_investigate-variants.R +# +# content: (1) Read data +# (2) Extract characteristics for cases +# (3) Select features for navigation behavior +# (4) Export data frames +# +# input: results/haum/event_logfiles_2024-02-21_16-07-33.csv +# output: results/haum/eventlogs_pre-corona_case-clusters.csv +# +# last mod: 2024-03-08 + +# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") + +#--------------- (1) Read data --------------- + +load("results/haum/eventlogs_pre-corona_cleaned.RData") + +#--------------- (4) Investigate variants --------------- + +res$start <- res$date.start +res$complete <- res$date.stop + +alog <- activitylog(res, + case_id = "case", + activity_id = "item", + resource_id = "path", + timestamps = c("start", "complete")) + +trace_explorer(alog, n_traces = 25) +# --> sequences of artworks are just too rare + +tr <- traces(alog) +trace_length <- pbapply::pbsapply(strsplit(tr$trace, ","), length) +tr[trace_length > 10, ] + +trace_varied <- pbapply::pbsapply(strsplit(tr$trace, ","), function(x) length(unique(x))) +tr[trace_varied > 1, ] +table(tr[trace_varied > 2, "absolute_frequency"]) +table(tr[trace_varied > 3, "absolute_frequency"]) + +summary(tr$absolute_frequency) +vioplot::vioplot(tr$absolute_frequency) + +# Power law for frequencies of traces +tab <- table(tr$absolute_frequency) +x <- as.numeric(tab) +y <- as.numeric(names(tab)) + +plot(x, y, log = "xy") +p1 <- lm(log(y) ~ log(x)) +pre <- exp(coef(p1)[1]) * x^coef(p1)[2] +lines(x, pre) + + +# Look at individual traces as examples +tr[trace_varied == 5 & trace_length > 50, ] +# --> every variant exists only once, of course +datcase[datcase$nitems == 5 & datcase$length > 50,] + +pbapply::pbsapply(datcase[, -c(1, 9)], median) + +#ex <- datcase[datcase$nitems == 4 & datcase$length == 15,] +ex <- datcase[datcase$nitems == 5,] +ex <- ex[sample(1:nrow(ex), 20), ] +# --> pretty randomly chosen... TODO: + +case_ids <- NULL + +for (case in ex$case) { + if ("080" %in% res$item[res$case == case] | "503" %in% res$item[res$case == case]) { + case_ids <- c(case_ids, TRUE) + } else { + case_ids <- c(case_ids, FALSE) + } +} + +cases <- ex$case[case_ids] + + +for (case in cases) { + + alog <- activitylog(res[res$case == case, ], + case_id = "case", + activity_id = "item", + resource_id = "path", + timestamps = c("start", "complete")) + + dfg <- process_map(alog, + type_nodes = frequency("absolute", color_scale = "Greys"), + type_edges = frequency("absolute", color_edges = "#FF6900"), + rankdir = "LR", + render = FALSE) + export_map(dfg, + file_name = paste0("results/processmaps/dfg_example_cases_", case, "_R.pdf"), + file_type = "pdf", + title = paste("Case", case)) + + +} + diff --git a/code/R_helpers.R b/code/R_helpers.R new file mode 100644 index 0000000..f543910 --- /dev/null +++ b/code/R_helpers.R @@ -0,0 +1,45 @@ +###################################################################### +time_minmax_ms <- function(subdata) { + subdata$min_time <- min(subdata$timeMs.start) + if (all(is.na(subdata$timeMs.stop))) { + subdata$max_time <- NA + } else { + subdata$max_time <- max(subdata$timeMs.stop, na.rm = TRUE) + } + subdata +} + +###################################################################### +check_infocards <- function(subdata, artworks) { + infocard_only <- NULL + if(any(unique(subdata$item) %in% artworks)) { + infocard_only <- FALSE + } else { + infocard_only <- TRUE + } + as.numeric(infocard_only) +} + +###################################################################### +normalize <- function(x) { + (x - min(x)) / (max(x) - min(x)) +} + +###################################################################### +get_centrality <- function(case, data) { + + data$start <- data$date.start + data$complete <- data$date.stop + + alog <- bupaR::activitylog(data[data$case == case, ], + case_id = "case", + activity_id = "item", + resource_id = "path", + timestamps = c("start", "complete")) + + net <- processmapR::process_map(alog, render = FALSE) + inet <- DiagrammeR::to_igraph(net) + + igraph::centr_betw(inet)$centralization +} +