# 11_validation.R # # content: (1) Load data # (2) Extract characteristics for cases # (3) Select features for navigation behavior # (4) Clustering # (5) Fit tree # # input: results/event_logfiles_2024-02-21_16-07-33.csv # output: -- # # last mod: 2024-03-22 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") source("R_helpers.R") #--------------- (1) Read data --------------- load("results/eventlogs_pre-corona_cleaned.RData") # Select one year to handle number of cases dat <- dat[as.Date(dat$date.start) > "2017-12-31" & as.Date(dat$date.start) < "2019-01-01", ] #--------------- (2) Extract characteristics for cases --------------- datcase18 <- aggregate(cbind(distance, scaleSize, rotationDegree) ~ case, dat, function(x) mean(x, na.rm = TRUE), na.action = NULL) datcase18$length <- aggregate(item ~ case, dat, length)$item eventtab <- aggregate(event ~ case, dat, table)["case"] eventtab$nmove <- aggregate(event ~ case, dat, table)$event[, "move"] eventtab$nflipCard <- aggregate(event ~ case, dat, table)$event[, "flipCard"] eventtab$nopenTopic <- aggregate(event ~ case, dat, table)$event[, "openTopic"] eventtab$nopenPopup <- aggregate(event ~ case, dat, table)$event[, "openPopup"] datcase18 <- datcase18 |> merge(eventtab, by = "case", all = TRUE) rm(eventtab) datcase18$nitems <- aggregate(item ~ case, dat, function(x) length(unique(x)), na.action = NULL)$item datcase18$npaths <- aggregate(path ~ case, dat, function(x) length(unique(x)), na.action = NULL)$path dat_split <- split(dat, ~ case) dat_list <- pbapply::pblapply(dat_split, time_minmax_ms) dat_minmax <- dplyr::bind_rows(dat_list) datcase18$min_time <- aggregate(min_time ~ case, dat_minmax, unique)$min_time datcase18$max_time <- aggregate(max_time ~ case, dat_minmax, unique)$max_time datcase18$duration <- datcase18$max_time - datcase18$min_time datcase18$min_time <- NULL datcase18$max_time <- NULL artworks <- unique(dat$item)[!unique(dat$item) %in% c("501", "502", "503")] datcase18$infocardOnly <- pbapply::pbsapply(dat_split, check_infocards, artworks = artworks) # Clean up NAs datcase18$distance <- ifelse(is.na(datcase18$distance), 0, datcase18$distance) datcase18$scaleSize <- ifelse(is.na(datcase18$scaleSize), 1, datcase18$scaleSize) datcase18$rotationDegree <- ifelse(is.na(datcase18$rotationDegree), 0, datcase18$rotationDegree) #--------------- (3) Select features for navigation behavior --------------- dattree18 <- data.frame(case = datcase18$case, PropItems = datcase18$nitems / length(unique(dat$item)), SearchInfo = (datcase18$nopenTopic + datcase18$nopenPopup) / datcase18$length, PropMoves = datcase18$nmove / datcase18$length, PathLinearity = datcase18$nitems / datcase18$npaths, Singularity = datcase18$npaths / datcase18$length ) # centrality <- pbapply::pbsapply(dattree18$case, get_centrality, data = dat) # save(centrality, file = "results/centrality_2018.RData") load("results/centrality_2018.RData") dattree18$BetweenCentrality <- centrality # Average duration per item dat_split <- split(dat[, c("item", "case", "path", "timeMs.start", "timeMs.stop")], ~ path) dat_list <- pbapply::pblapply(dat_split, time_minmax_ms) dat_minmax <- dplyr::bind_rows(dat_list) tmp <- aggregate(min_time ~ path, dat_minmax, unique) tmp$max_time <- aggregate(max_time ~ path, dat_minmax, unique, na.action = NULL)$max_time tmp$duration <- tmp$max_time - tmp$min_time tmp$case <- aggregate(case ~ path, dat_minmax, unique)$case dattree18$AvDurItem <- aggregate(duration ~ case, tmp, mean)$duration rm(tmp) # Indicator variable if table was used as info terminal only dattree18$InfocardOnly <- factor(datcase18$infocardOnly, levels = 0:1, labels = c("no", "yes")) # Add pattern dattree18$Pattern <- "Dispersion" dattree18$Pattern <- ifelse(dattree18$PathLinearity > 0.8, "Scholar", dattree18$Pattern) dattree18$Pattern <- ifelse(dattree18$PathLinearity <= 0.8 & dattree18$BetweenCentrality >= 0.5, "Star", dattree18$Pattern) dattree18$Pattern <- factor(dattree18$Pattern) dattree18$AvDurItemNorm <- normalize(dattree18$AvDurItem) #--------------- (4) Clustering --------------- df <- dattree18[, c("PropItems", "SearchInfo", "PropMoves", "AvDurItemNorm", "Pattern", "InfocardOnly")] dist_mat18 <- cluster::daisy(df, metric = "gower") coor_3d_18 <- smacof::mds(dist_mat18, ndim = 3, type = "ordinal")$conf coor_2d_18 <- coor_3d_18[, 1:2] plot(coor_2d_18) rgl::plot3d(coor_3d_18) hc18 <- cluster::agnes(dist_mat18, method = "ward") k <- 5 mycols <- c("#91C86E", "#FF6900", "#3CB4DC", "#78004B", "#434F4F") cluster18 <- cutree(as.hclust(hc18), k = k) table(cluster18) plot(coor_2d_18, col = mycols[cluster18], pch = 16) legend("topleft", c("Searching", "Exploring", "Scanning", "Flitting", "Info"), col = mycols, bty = "n", pch = 16) rgl::plot3d(coor_3d_18, col = mycols[cluster18]) print(ftable(xtabs( ~ InfocardOnly + Pattern + cluster18, dattree18)), zero = "-") aggregate(. ~ cluster18, df, mean) aggregate(. ~ cluster18, dattree18[, -1], mean) save(coor_2d_18, coor_3d_18, cluster18, dattree18, dist_mat18, hc18, file = "../../thesis/figures/data/clustering_cases_2018.RData") #--------------- (5) Fit tree --------------- c1 <- rpart::rpart(as.factor(cluster18) ~ ., data = dattree18[, c("PropMoves", "PropItems", "SearchInfo", "AvDurItem", "Pattern", "InfocardOnly")], method = "class") plot(partykit::as.party(c1), tp_args = list(fill = mycols, col = mycols)) ## Load data load("../../thesis/figures/data/clustering_cases.RData") c19 <- rpart::rpart(as.factor(cluster) ~ ., data = dattree[, c("PropMoves", "PropItems", "SearchInfo", "AvDurItem", "Pattern", "InfocardOnly")], method = "class") cl18 <- rpart:::predict.rpart(c1, type = "class", newdata = dattree18) cl18 <- factor(cl18, labels = c("Searching", "Exploring", "Scanning", "Flitting", "Info")) cl19 <- rpart:::predict.rpart(c19, type = "class", newdata = dattree18) cl19 <- factor(cl19, labels = c("Scanning", "Exploring", "Flitting", "Searching", "Info")) xtabs( ~ cl18 + cl19)