Cleaned up scripts; separated case data frame, clustering, and trace analysis into separate files
This commit is contained in:
		
							parent
							
								
									3cf6c4c51d
								
							
						
					
					
						commit
						66fab4fa18
					
				
							
								
								
									
										187
									
								
								code/09_case-characteristics.R
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										187
									
								
								code/09_case-characteristics.R
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,187 @@ | |||||||
|  | # 09_user-navigation.R | ||||||
|  | # | ||||||
|  | # content: (1) Read data | ||||||
|  | #          (2) Extract characteristics for cases | ||||||
|  | #          (3) Select features for navigation behavior | ||||||
|  | #          (4) Export data frames | ||||||
|  | # | ||||||
|  | # input:  results/haum/event_logfiles_2024-02-21_16-07-33.csv | ||||||
|  | # output: results/haum/eventlogs_pre-corona_case-clusters.csv | ||||||
|  | # | ||||||
|  | # last mod: 2024-03-08 | ||||||
|  | 
 | ||||||
|  | # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") | ||||||
|  | 
 | ||||||
|  | source("R_helpers.R") | ||||||
|  | 
 | ||||||
|  | #--------------- (1) Read data --------------- | ||||||
|  | 
 | ||||||
|  | load("results/haum/eventlogs_pre-corona_cleaned.RData") | ||||||
|  | 
 | ||||||
|  | # Select one year to handle number of cases | ||||||
|  | dat <- dat[as.Date(dat$date.start) > "2018-12-31" & | ||||||
|  |            as.Date(dat$date.start) < "2020-01-01", ] | ||||||
|  | 
 | ||||||
|  | #--------------- (2) Extract characteristics for cases --------------- | ||||||
|  | 
 | ||||||
|  | datcase <- aggregate(cbind(distance, scaleSize, rotationDegree) ~ | ||||||
|  |                      case, dat, function(x) mean(x, na.rm = TRUE), na.action = NULL) | ||||||
|  | 
 | ||||||
|  | datcase$length <- aggregate(item ~ case, dat, length)$item | ||||||
|  | 
 | ||||||
|  | eventtab <- aggregate(event ~ case, dat, table)["case"] | ||||||
|  | eventtab$nmove <- aggregate(event ~ case, dat, table)$event[, "move"] | ||||||
|  | eventtab$nflipCard <- aggregate(event ~ case, dat, table)$event[, "flipCard"] | ||||||
|  | eventtab$nopenTopic <- aggregate(event ~ case, dat, table)$event[, "openTopic"] | ||||||
|  | eventtab$nopenPopup <- aggregate(event ~ case, dat, table)$event[, "openPopup"] | ||||||
|  | 
 | ||||||
|  | topictab <- aggregate(topic ~ case, dat, table)["case"] | ||||||
|  | topictab$artist <- aggregate(topic ~ case, dat, table)$topic[, 1] | ||||||
|  | topictab$details <- aggregate(topic ~ case, dat, table)$topic[, 2] | ||||||
|  | topictab$extra_info <- aggregate(topic ~ case, dat, table)$topic[, 3] | ||||||
|  | topictab$komposition <- aggregate(topic ~ case, dat, table)$topic[, 4] | ||||||
|  | topictab$leben_des_kunstwerks <- aggregate(topic ~ case, dat, table)$topic[, 5] | ||||||
|  | topictab$licht_und_farbe <- aggregate(topic ~ case, dat, table)$topic[, 6] | ||||||
|  | topictab$technik <- aggregate(topic ~ case, dat, table)$topic[, 7] | ||||||
|  | topictab$thema <- aggregate(topic ~ case, dat, table)$topic[, 8] | ||||||
|  | 
 | ||||||
|  | datcase <- datcase |> | ||||||
|  |   merge(eventtab, by = "case", all = TRUE) |> | ||||||
|  |   merge(topictab, by = "case", all = TRUE) | ||||||
|  | 
 | ||||||
|  | rm(eventtab, topictab) | ||||||
|  | 
 | ||||||
|  | datcase$ntopiccards <- aggregate(topic ~ case, dat, | ||||||
|  |                              function(x) ifelse(all(is.na(x)), NA, | ||||||
|  |                              length(na.omit(x))), na.action = | ||||||
|  |                              NULL)$topic | ||||||
|  | datcase$ntopics <- aggregate(topic ~ case, dat, | ||||||
|  |                              function(x) ifelse(all(is.na(x)), NA, | ||||||
|  |                              length(unique(na.omit(x)))), na.action = | ||||||
|  |                              NULL)$topic | ||||||
|  | datcase$nitems <- aggregate(item ~ case, dat, function(x) | ||||||
|  |                             length(unique(x)), na.action = NULL)$item | ||||||
|  | datcase$npaths <- aggregate(path ~ case, dat, function(x) | ||||||
|  |                             length(unique(x)), na.action = NULL)$path | ||||||
|  | datcase$vacation <- aggregate(vacation ~ case, dat, | ||||||
|  |                               function(x) ifelse(all(is.na(x)), 0, 1), | ||||||
|  |                               na.action = NULL)$vacation | ||||||
|  | datcase$holiday <- aggregate(holiday ~ case, dat, | ||||||
|  |                              function(x) ifelse(all(is.na(x)), 0, 1), | ||||||
|  |                              na.action = NULL)$holiday | ||||||
|  | datcase$weekend <- aggregate(weekdays ~ case, dat, | ||||||
|  |                              function(x) ifelse(any(x %in% c("Saturday", "Sunday")), 1, 0), | ||||||
|  |                              na.action = NULL)$weekdays | ||||||
|  | datcase$morning <- aggregate(date.start ~ case, dat, | ||||||
|  |                              function(x) ifelse(lubridate::hour(x[1]) > 13, 0, 1), | ||||||
|  |                              na.action = NULL)$date.start | ||||||
|  | 
 | ||||||
|  | dat_split <- split(dat, ~ case) | ||||||
|  | dat_list <- pbapply::pblapply(dat_split, time_minmax_ms) | ||||||
|  | dat_minmax <- dplyr::bind_rows(dat_list) | ||||||
|  | 
 | ||||||
|  | datcase$min_time <- aggregate(min_time ~ case, dat_minmax, unique)$min_time | ||||||
|  | datcase$max_time <- aggregate(max_time ~ case, dat_minmax, unique)$max_time | ||||||
|  | 
 | ||||||
|  | datcase$duration <- datcase$max_time - datcase$min_time | ||||||
|  | datcase$min_time <- NULL | ||||||
|  | datcase$max_time <- NULL | ||||||
|  | 
 | ||||||
|  | artworks <- unique(dat$item)[!unique(dat$item) %in% c("501", "502", "503")] | ||||||
|  | datcase$infocardOnly <- pbapply::pbsapply(dat_split, check_infocards, artworks = artworks) | ||||||
|  | 
 | ||||||
|  | # Clean up NAs | ||||||
|  | datcase$distance <- ifelse(is.na(datcase$distance), 0, datcase$distance) | ||||||
|  | datcase$scaleSize <- ifelse(is.na(datcase$scaleSize), 1, datcase$scaleSize) | ||||||
|  | datcase$rotationDegree <- ifelse(is.na(datcase$rotationDegree), 0, datcase$rotationDegree) | ||||||
|  | datcase$artist <- ifelse(is.na(datcase$artist), 0, datcase$artist) | ||||||
|  | datcase$details <- ifelse(is.na(datcase$details), 0, datcase$details) | ||||||
|  | datcase$extra_info <- ifelse(is.na(datcase$extra_info), 0, datcase$extra_info) | ||||||
|  | datcase$komposition <- ifelse(is.na(datcase$komposition), 0, datcase$komposition) | ||||||
|  | datcase$leben_des_kunstwerks <- ifelse(is.na(datcase$leben_des_kunstwerks), 0, datcase$leben_des_kunstwerks) | ||||||
|  | datcase$licht_und_farbe <- ifelse(is.na(datcase$licht_und_farbe), 0, datcase$licht_und_farbe) | ||||||
|  | datcase$technik <- ifelse(is.na(datcase$technik), 0, datcase$technik) | ||||||
|  | datcase$thema <- ifelse(is.na(datcase$thema), 0, datcase$thema) | ||||||
|  | datcase$ntopics <- ifelse(is.na(datcase$ntopics), 0, datcase$ntopics) | ||||||
|  | datcase$ntopiccards <- ifelse(is.na(datcase$ntopiccards), 0, datcase$ntopiccards) | ||||||
|  | 
 | ||||||
|  | #--------------- (3) Select features for navigation behavior --------------- | ||||||
|  | 
 | ||||||
|  | # Features for navigation types for MTT: | ||||||
|  | # - Scanning / Overviewing: | ||||||
|  | #   * Proportion of artworks looked at is high | ||||||
|  | #   * Duration per artwork is low: "ave_duration_item" / datcase$duration | ||||||
|  | # - Exploring: | ||||||
|  | #   * Looking at additional information is high | ||||||
|  | # - Searching / Studying: | ||||||
|  | #   * Proportion of artworks looked at is low | ||||||
|  | #   * Opening few cards | ||||||
|  | #   datcase$nflipCard / mean(datcase$nflipCard) or median(datcase$nflipCard) is low | ||||||
|  | #   * but for most cards popups are opened: | ||||||
|  | #   datcase$nopenPopup / datcase$nflipCard is high | ||||||
|  | # - Wandering / Flitting: | ||||||
|  | #   * Proportion of moves is high | ||||||
|  | #   * Duration per case is low: | ||||||
|  | #   datcase$duration / mean(datcase$duration) or median(datcase$duration) | ||||||
|  | #   * Duration per artwork is low: "ave_duration_item" / datcase$duration | ||||||
|  | 
 | ||||||
|  | dattree <- data.frame(case = datcase$case, | ||||||
|  |                       PropItems = datcase$nitems / length(unique(dat$item)), | ||||||
|  |                       SearchInfo = (datcase$nopenTopic + | ||||||
|  |                                     datcase$nopenPopup) / datcase$length, | ||||||
|  |                       PropMoves = datcase$nmove / datcase$length, | ||||||
|  |                       PathLinearity = datcase$nitems / datcase$npaths, | ||||||
|  |                       Singularity = datcase$npaths / datcase$length | ||||||
|  | ) | ||||||
|  | 
 | ||||||
|  | # centrality <- pbapply::pbsapply(dattree$case, get_centrality, data = dat) | ||||||
|  | # save(centrality, file = "results/haum/tmp_centrality.RData") | ||||||
|  | load("results/haum/tmp_centrality.RData") | ||||||
|  | 
 | ||||||
|  | dattree$BetweenCentrality <- centrality | ||||||
|  | 
 | ||||||
|  | # Average duration per item | ||||||
|  | dat_split <- split(dat[, c("item", "case", "path", "timeMs.start", "timeMs.stop")], ~ path) | ||||||
|  | dat_list <- pbapply::pblapply(dat_split, time_minmax_ms) | ||||||
|  | dat_minmax <- dplyr::bind_rows(dat_list) | ||||||
|  | 
 | ||||||
|  | tmp <- aggregate(min_time ~ path, dat_minmax, unique) | ||||||
|  | tmp$max_time <- aggregate(max_time ~ path, dat_minmax, unique, na.action = NULL)$max_time | ||||||
|  | tmp$duration <- tmp$max_time - tmp$min_time | ||||||
|  | tmp$case <- aggregate(case ~ path, dat_minmax, unique)$case | ||||||
|  | 
 | ||||||
|  | dattree$AvDurItem <- aggregate(duration ~ case, tmp, mean)$duration | ||||||
|  | 
 | ||||||
|  | rm(tmp) | ||||||
|  | 
 | ||||||
|  | # Indicator variable if table was used as info terminal only | ||||||
|  | dattree$InfocardOnly <- factor(datcase$infocardOnly, levels = 0:1, | ||||||
|  |                                labels = c("no", "yes")) | ||||||
|  | 
 | ||||||
|  | # Add pattern to datcase; loosely based on Bousbia et al. (2009) | ||||||
|  | dattree$Pattern <- "Dispersion" | ||||||
|  | dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8, "Scholar", | ||||||
|  |                           dattree$Pattern) | ||||||
|  | dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 & | ||||||
|  |                           dattree$BetweenCentrality >= 0.5, "Star", | ||||||
|  |                           dattree$Pattern) | ||||||
|  | dattree$Pattern <- factor(dattree$Pattern) | ||||||
|  | 
 | ||||||
|  | dattree$AvDurItemNorm <- normalize(dattree$AvDurItem) | ||||||
|  | 
 | ||||||
|  | #--------------- (4) Export data frames --------------- | ||||||
|  | 
 | ||||||
|  | save(datcase, dattree, file = "results/haum/dataframes_case_2019.RData") | ||||||
|  | 
 | ||||||
|  | write.table(datcase, | ||||||
|  |             file = "results/haum/datcase.csv", | ||||||
|  |             sep = ";", | ||||||
|  |             quote = FALSE, | ||||||
|  |             row.names = FALSE) | ||||||
|  | 
 | ||||||
|  | write.table(datcase, | ||||||
|  |             file = "results/haum/dattree.csv", | ||||||
|  |             sep = ";", | ||||||
|  |             quote = FALSE, | ||||||
|  |             row.names = FALSE) | ||||||
|  | 
 | ||||||
| @ -1,481 +0,0 @@ | |||||||
| # 09_user-navigation.R |  | ||||||
| # |  | ||||||
| # content: (1) Read data |  | ||||||
| #           (1.1) Read log event data |  | ||||||
| #           (1.2) Extract additional infos for clustering |  | ||||||
| #          (2) Clustering |  | ||||||
| #          (3) Fit tree |  | ||||||
| #          (4) Investigate variants |  | ||||||
| # |  | ||||||
| # input:  results/haum/event_logfiles_2024-02-21_16-07-33.csv |  | ||||||
| # output: results/haum/eventlogs_pre-corona_case-clusters.csv |  | ||||||
| # |  | ||||||
| # last mod: 2024-03-06 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") |  | ||||||
| 
 |  | ||||||
| library(bupaverse) |  | ||||||
| library(factoextra) |  | ||||||
| 
 |  | ||||||
| #--------------- (1) Read data --------------- |  | ||||||
| 
 |  | ||||||
| #--------------- (1.1) Read log event data --------------- |  | ||||||
| 
 |  | ||||||
| load("results/haum/eventlogs_pre-corona_cleaned.RData") |  | ||||||
| 
 |  | ||||||
| # Select one year to handle number of cases |  | ||||||
| dat <- dat[as.Date(dat$date.start) > "2018-12-31" & |  | ||||||
|            as.Date(dat$date.start) < "2020-01-01", ] |  | ||||||
| 
 |  | ||||||
| #--------------- (1.2) Extract additional infos for clustering --------------- |  | ||||||
| 
 |  | ||||||
| datcase <- aggregate(cbind(distance, scaleSize, rotationDegree) ~ |  | ||||||
|                      case, dat, function(x) mean(x, na.rm = TRUE), na.action = NULL) |  | ||||||
| 
 |  | ||||||
| datcase$length <- aggregate(item ~ case, dat, length)$item |  | ||||||
| 
 |  | ||||||
| eventtab <- aggregate(event ~ case, dat, table)["case"] |  | ||||||
| eventtab$nmove <- aggregate(event ~ case, dat, table)$event[, "move"] |  | ||||||
| eventtab$nflipCard <- aggregate(event ~ case, dat, table)$event[, "flipCard"] |  | ||||||
| eventtab$nopenTopic <- aggregate(event ~ case, dat, table)$event[, "openTopic"] |  | ||||||
| eventtab$nopenPopup <- aggregate(event ~ case, dat, table)$event[, "openPopup"] |  | ||||||
| 
 |  | ||||||
| topictab <- aggregate(topic ~ case, dat, table)["case"] |  | ||||||
| topictab$artist <- aggregate(topic ~ case, dat, table)$topic[, 1] |  | ||||||
| topictab$details <- aggregate(topic ~ case, dat, table)$topic[, 2] |  | ||||||
| topictab$extra_info <- aggregate(topic ~ case, dat, table)$topic[, 3] |  | ||||||
| topictab$komposition <- aggregate(topic ~ case, dat, table)$topic[, 4] |  | ||||||
| topictab$leben_des_kunstwerks <- aggregate(topic ~ case, dat, table)$topic[, 5] |  | ||||||
| topictab$licht_und_farbe <- aggregate(topic ~ case, dat, table)$topic[, 6] |  | ||||||
| topictab$technik <- aggregate(topic ~ case, dat, table)$topic[, 7] |  | ||||||
| topictab$thema <- aggregate(topic ~ case, dat, table)$topic[, 8] |  | ||||||
| 
 |  | ||||||
| datcase <- datcase |> |  | ||||||
|   merge(eventtab, by = "case", all = TRUE) |> |  | ||||||
|   merge(topictab, by = "case", all = TRUE) |  | ||||||
| 
 |  | ||||||
| rm(eventtab, topictab) |  | ||||||
| 
 |  | ||||||
| datcase$ntopiccards <- aggregate(topic ~ case, dat, |  | ||||||
|                              function(x) ifelse(all(is.na(x)), NA, |  | ||||||
|                              length(na.omit(x))), na.action = |  | ||||||
|                              NULL)$topic |  | ||||||
| datcase$ntopics <- aggregate(topic ~ case, dat, |  | ||||||
|                              function(x) ifelse(all(is.na(x)), NA, |  | ||||||
|                              length(unique(na.omit(x)))), na.action = |  | ||||||
|                              NULL)$topic |  | ||||||
| datcase$nitems <- aggregate(item ~ case, dat, function(x) |  | ||||||
|                             length(unique(x)), na.action = NULL)$item |  | ||||||
| datcase$npaths <- aggregate(path ~ case, dat, function(x) |  | ||||||
|                             length(unique(x)), na.action = NULL)$path |  | ||||||
| datcase$vacation <- aggregate(vacation ~ case, dat, |  | ||||||
|                               function(x) ifelse(all(is.na(x)), 0, 1), |  | ||||||
|                               na.action = NULL)$vacation |  | ||||||
| datcase$holiday <- aggregate(holiday ~ case, dat, |  | ||||||
|                              function(x) ifelse(all(is.na(x)), 0, 1), |  | ||||||
|                              na.action = NULL)$holiday |  | ||||||
| datcase$weekend <- aggregate(weekdays ~ case, dat, |  | ||||||
|                              function(x) ifelse(any(x %in% c("Saturday", "Sunday")), 1, 0), |  | ||||||
|                              na.action = NULL)$weekdays |  | ||||||
| datcase$morning <- aggregate(date.start ~ case, dat, |  | ||||||
|                              function(x) ifelse(lubridate::hour(x[1]) > 13, 0, 1), |  | ||||||
|                              na.action = NULL)$date.start |  | ||||||
| 
 |  | ||||||
| dat_split <- split(dat, ~ case) |  | ||||||
| 
 |  | ||||||
| time_minmax_ms <- function(subdata) { |  | ||||||
|   subdata$min_time <- min(subdata$timeMs.start) |  | ||||||
|     if (all(is.na(subdata$timeMs.stop))) { |  | ||||||
|       subdata$max_time <- NA |  | ||||||
|     } else { |  | ||||||
|        subdata$max_time <- max(subdata$timeMs.stop, na.rm = TRUE) |  | ||||||
|     } |  | ||||||
|   subdata |  | ||||||
| } |  | ||||||
| # TODO: Move to helper file |  | ||||||
| 
 |  | ||||||
| dat_list <- pbapply::pblapply(dat_split, time_minmax_ms) |  | ||||||
| dat_minmax <- dplyr::bind_rows(dat_list) |  | ||||||
| 
 |  | ||||||
| datcase$min_time <- aggregate(min_time ~ case, dat_minmax, unique)$min_time |  | ||||||
| datcase$max_time <- aggregate(max_time ~ case, dat_minmax, unique)$max_time |  | ||||||
| 
 |  | ||||||
| datcase$duration <- datcase$max_time - datcase$min_time |  | ||||||
| datcase$min_time <- NULL |  | ||||||
| datcase$max_time <- NULL |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| check_infocards <- function(subdata, artworks) { |  | ||||||
|   infocard_only <- NULL |  | ||||||
|   if(any(unique(subdata$item) %in% artworks)) { |  | ||||||
|     infocard_only <- FALSE |  | ||||||
|   } else { |  | ||||||
|     infocard_only <- TRUE |  | ||||||
|   } |  | ||||||
|   as.numeric(infocard_only) |  | ||||||
| } |  | ||||||
| # TODO: Move to helper file |  | ||||||
| 
 |  | ||||||
| artworks <- unique(dat$item)[!unique(dat$item) %in% c("501", "502", "503")] |  | ||||||
| 
 |  | ||||||
| datcase$infocardOnly <- pbapply::pbsapply(dat_split, check_infocards, artworks = artworks) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Clean up NAs |  | ||||||
| datcase$distance <- ifelse(is.na(datcase$distance), 0, datcase$distance) |  | ||||||
| datcase$scaleSize <- ifelse(is.na(datcase$scaleSize), 1, datcase$scaleSize) |  | ||||||
| datcase$rotationDegree <- ifelse(is.na(datcase$rotationDegree), 0, datcase$rotationDegree) |  | ||||||
| datcase$artist <- ifelse(is.na(datcase$artist), 0, datcase$artist) |  | ||||||
| datcase$details <- ifelse(is.na(datcase$details), 0, datcase$details) |  | ||||||
| datcase$extra_info <- ifelse(is.na(datcase$extra_info), 0, datcase$extra_info) |  | ||||||
| datcase$komposition <- ifelse(is.na(datcase$komposition), 0, datcase$komposition) |  | ||||||
| datcase$leben_des_kunstwerks <- ifelse(is.na(datcase$leben_des_kunstwerks), 0, datcase$leben_des_kunstwerks) |  | ||||||
| datcase$licht_und_farbe <- ifelse(is.na(datcase$licht_und_farbe), 0, datcase$licht_und_farbe) |  | ||||||
| datcase$technik <- ifelse(is.na(datcase$technik), 0, datcase$technik) |  | ||||||
| datcase$thema <- ifelse(is.na(datcase$thema), 0, datcase$thema) |  | ||||||
| datcase$ntopics <- ifelse(is.na(datcase$ntopics), 0, datcase$ntopics) |  | ||||||
| datcase$ntopiccards <- ifelse(is.na(datcase$ntopiccards), 0, datcase$ntopiccards) |  | ||||||
| 
 |  | ||||||
| cor_mat <- cor(datcase[, -1], use = "pairwise") |  | ||||||
| diag(cor_mat) <- NA |  | ||||||
| heatmap(cor_mat) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| normalize <- function(x) { |  | ||||||
|   (x - min(x)) / (max(x) - min(x)) |  | ||||||
| } |  | ||||||
| # TODO: Move to helper file |  | ||||||
| 
 |  | ||||||
| # Features for navigation types for MTT: |  | ||||||
| # - Scanning / Overviewing: |  | ||||||
| #   * Proportion of artworks looked at is high |  | ||||||
| #   * Duration per artwork is low: "ave_duration_item" / datcase$duration |  | ||||||
| # - Exploring: |  | ||||||
| #   * Looking at additional information is high |  | ||||||
| # - Searching / Studying: |  | ||||||
| #   * Proportion of artworks looked at is low |  | ||||||
| #   * Opening few cards |  | ||||||
| #   datcase$nflipCard / mean(datcase$nflipCard) or median(datcase$nflipCard) is low |  | ||||||
| #   * but for most cards popups are opened: |  | ||||||
| #   datcase$nopenPopup / datcase$nflipCard is high |  | ||||||
| # - Wandering / Flitting: |  | ||||||
| #   * Proportion of moves is high |  | ||||||
| #   * Duration per case is low: |  | ||||||
| #   datcase$duration / mean(datcase$duration) or median(datcase$duration) |  | ||||||
| #   * Duration per artwork is low: "ave_duration_item" / datcase$duration |  | ||||||
| 
 |  | ||||||
| dattree <- data.frame(case = datcase$case, |  | ||||||
|                       PropItems = datcase$nitems / length(unique(dat$item)), |  | ||||||
|                       SearchInfo = datcase$nopenTopic + datcase$nopenPopup, |  | ||||||
|                       PropMoves = datcase$nmove / datcase$length, |  | ||||||
|                       PathLinearity = datcase$nitems / datcase$npaths, |  | ||||||
|                       Singularity = datcase$npaths / datcase$length |  | ||||||
| ) |  | ||||||
| 
 |  | ||||||
| dattree$SearchInfo <- ifelse(is.na(dattree$NumTopic), 0, dattree$NumTopic) |  | ||||||
| 
 |  | ||||||
| get_centrality <- function(case, data) { |  | ||||||
| 
 |  | ||||||
|   data$start <- data$date.start |  | ||||||
|   data$complete <- data$date.stop |  | ||||||
| 
 |  | ||||||
|   alog <- activitylog(data[data$case == case, ], |  | ||||||
|                       case_id     = "case", |  | ||||||
|                       activity_id = "item", |  | ||||||
|                       resource_id = "path", |  | ||||||
|                       timestamps  = c("start", "complete")) |  | ||||||
| 
 |  | ||||||
|   net <- process_map(alog, render = FALSE) |  | ||||||
|   inet <- DiagrammeR::to_igraph(net) |  | ||||||
| 
 |  | ||||||
|   #c(igraph::centr_degree(inet, loops = FALSE)$centralization, |  | ||||||
|   #  igraph::centr_degree(inet, loops = TRUE)$centralization, |  | ||||||
|   #  igraph::centr_betw(inet)$centralization) |  | ||||||
|   igraph::centr_betw(inet)$centralization |  | ||||||
| } |  | ||||||
| # TODO: Move to helper file |  | ||||||
| 
 |  | ||||||
| centrality <- pbapply::pblapply(dattree$case, get_centrality, data = dat) |  | ||||||
| centrality <- do.call(rbind, centrality) |  | ||||||
| 
 |  | ||||||
| # save(centrality, file = "results/haum/tmp_centrality.RData") |  | ||||||
| #load("results/haum/tmp_centrality.RData") |  | ||||||
| 
 |  | ||||||
| dattree$BetweenCentrality <- unlist(centrality) |  | ||||||
| 
 |  | ||||||
| # Average duration per item |  | ||||||
| dat_split <- split(dat[, c("item", "case", "path", "timeMs.start", "timeMs.stop")], ~ path) |  | ||||||
| dat_list <- pbapply::pblapply(dat_split, time_minmax_ms) |  | ||||||
| dat_minmax <- dplyr::bind_rows(dat_list) |  | ||||||
| 
 |  | ||||||
| tmp <- aggregate(min_time ~ path, dat_minmax, unique) |  | ||||||
| tmp$max_time <- aggregate(max_time ~ path, dat_minmax, unique, na.action = NULL)$max_time |  | ||||||
| tmp$duration <- tmp$max_time - tmp$min_time |  | ||||||
| tmp$case <- aggregate(case ~ path, dat_minmax, unique)$case |  | ||||||
| 
 |  | ||||||
| dattree$AvDurItem <- aggregate(duration ~ case, tmp, mean)$duration |  | ||||||
| 
 |  | ||||||
| rm(tmp) |  | ||||||
| 
 |  | ||||||
| # Indicator variable if table was used as info terminal only |  | ||||||
| dattree$InfocardOnly <- factor(datcase$infocardOnly, levels = 0:1, |  | ||||||
|                                labels = c("no", "yes")) |  | ||||||
| 
 |  | ||||||
| # Add pattern to datcase; loosely based on Bousbia et al. (2009) |  | ||||||
| dattree$Pattern <- "Dispersion" |  | ||||||
| dattree$Pattern <- ifelse(dattree$PathLinearity > 0.8 & dattree$Singularity > 0.8, "Scholar", |  | ||||||
|                           dattree$Pattern) |  | ||||||
| dattree$Pattern <- ifelse(dattree$PathLinearity <= 0.8 & |  | ||||||
|                           dattree$BetweenCentrality > 0.5, "Star", |  | ||||||
|                           dattree$Pattern) |  | ||||||
| dattree$Pattern <- factor(dattree$Pattern) |  | ||||||
| # TODO: Get rid of PathLinearity and Singularity as features when I am |  | ||||||
| # using Pattern? |  | ||||||
| 
 |  | ||||||
| dattree$PathLinearity <- NULL |  | ||||||
| dattree$Singularity <- NULL |  | ||||||
| dattree$BetweenCentrality <- NULL |  | ||||||
| 
 |  | ||||||
| summary(dattree) |  | ||||||
| 
 |  | ||||||
| plot(dattree[, -1], pch = ".") |  | ||||||
| 
 |  | ||||||
| par(mfrow = c(2,4)) |  | ||||||
| hist(dattree$AvDurItem, breaks = 50, main = "") |  | ||||||
| hist(dattree$NumItems, breaks = 50, main = "") |  | ||||||
| hist(dattree$NumTopic, breaks = 50, main = "") |  | ||||||
| hist(dattree$NumPopup, breaks = 50, main = "") |  | ||||||
| hist(dattree$PropMoves, breaks = 50, main = "") |  | ||||||
| hist(dattree$PathLinearity, breaks = 50, main = "") |  | ||||||
| hist(dattree$Singularity, breaks = 50, main = "") |  | ||||||
| hist(dattree$BetweenCentrality, breaks = 50, main = "") |  | ||||||
| 
 |  | ||||||
| #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |  | ||||||
| # Remove cases with extreme outliers |  | ||||||
| # TODO: Do I want this??? |  | ||||||
| 
 |  | ||||||
| quantile(datcase$nopenTopic, 0.999) |  | ||||||
| quantile(datcase$nopenPopup, 0.999) |  | ||||||
| 
 |  | ||||||
| dattree <- dattree[!(dattree$NumTopic > 40 | dattree$NumPopup > 40), ] |  | ||||||
| 
 |  | ||||||
| plot(dattree[, -1], pch = ".") |  | ||||||
| 
 |  | ||||||
| par(mfrow = c(2,4)) |  | ||||||
| hist(dattree$AvDurItem, breaks = 50, main = "") |  | ||||||
| hist(dattree$NumItems, breaks = 50, main = "") |  | ||||||
| hist(dattree$NumTopic, breaks = 50, main = "") |  | ||||||
| hist(dattree$NumPopup, breaks = 50, main = "") |  | ||||||
| hist(dattree$PropMoves, breaks = 50, main = "") |  | ||||||
| hist(dattree$PathLinearity, breaks = 50, main = "") |  | ||||||
| hist(dattree$Singularity, breaks = 50, main = "") |  | ||||||
| hist(dattree$BetweenCentrality, breaks = 50, main = "") |  | ||||||
| #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |  | ||||||
| 
 |  | ||||||
| #--------------- (2) Clustering --------------- |  | ||||||
| 
 |  | ||||||
| df <- dattree[, -1] # remove case variable |  | ||||||
| 
 |  | ||||||
| # Normalize Duration and SearchInfo |  | ||||||
| df$AvDurItem <- normalize(df$AvDurItem) |  | ||||||
| df$SearchInfo  <- normalize(df$SearchInfo) |  | ||||||
| 
 |  | ||||||
| summary(df) |  | ||||||
| 
 |  | ||||||
| # Look at collinearity |  | ||||||
| cor_mat <- cor(df) |  | ||||||
| diag(cor_mat) <- NA |  | ||||||
| heatmap(cor_mat) |  | ||||||
| 
 |  | ||||||
| #--------------- (2.2) Hierarchical clustering --------------- |  | ||||||
| 
 |  | ||||||
| dist_mat <- cluster::daisy(df, metric = "gower") |  | ||||||
| 
 |  | ||||||
| # "Flatten" with MDS |  | ||||||
| coor_2d <- as.data.frame(cmdscale(dist_mat, k = 2)) |  | ||||||
| coor_3d <- as.data.frame(cmdscale(dist_mat, k = 3)) |  | ||||||
| # TODO: Better use MASS::isoMDS() since I am not using Euclidean distances? |  | ||||||
| 
 |  | ||||||
| plot(coor_2d) |  | ||||||
| rgl::plot3d(coor_3d) |  | ||||||
| 
 |  | ||||||
| method <- c(average = "average", single = "single", complete = "complete", |  | ||||||
|             ward = "ward") |  | ||||||
| 
 |  | ||||||
| method <- "ward" |  | ||||||
| 
 |  | ||||||
| hcs <- pbapply::pblapply(method, function(x) cluster::agnes(dist_mat, method = x)) |  | ||||||
| acs <- pbapply::pbsapply(hcs, function(x) x$ac) |  | ||||||
| 
 |  | ||||||
| hc <- hcs$ward |  | ||||||
| 
 |  | ||||||
| # Something like a scree plot (??) |  | ||||||
| plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5) |  | ||||||
| 
 |  | ||||||
| k <- 4 |  | ||||||
| 
 |  | ||||||
| mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E") |  | ||||||
| 
 |  | ||||||
| cluster <- cutree(as.hclust(hc), k = k) |  | ||||||
| 
 |  | ||||||
| table(cluster) |  | ||||||
| 
 |  | ||||||
| plot(coor_2d, col = mycols[cluster]) |  | ||||||
| legend("topleft", paste("Cl", 1:4), col = mycols, pch = 21) |  | ||||||
| rgl::plot3d(coor_3d, col = mycols[cluster]) |  | ||||||
| 
 |  | ||||||
| table(dattree[cluster == 1, "Pattern"]) |  | ||||||
| table(dattree[cluster == 2, "Pattern"]) |  | ||||||
| table(dattree[cluster == 3, "Pattern"]) |  | ||||||
| table(dattree[cluster == 4, "Pattern"]) |  | ||||||
| 
 |  | ||||||
| table(dattree[cluster == 1, "InfocardOnly"]) |  | ||||||
| table(dattree[cluster == 2, "InfocardOnly"]) |  | ||||||
| table(dattree[cluster == 3, "InfocardOnly"]) |  | ||||||
| table(dattree[cluster == 4, "InfocardOnly"]) |  | ||||||
| 
 |  | ||||||
| aggregate(. ~ cluster, df, mean) |  | ||||||
| 
 |  | ||||||
| aggregate(cbind(duration, distance, scaleSize, rotationDegree, length, |  | ||||||
|                 nmove, nflipCard, nopenTopic, nopenPopup) ~ cluster, datcase, |  | ||||||
|           mean) |  | ||||||
| 
 |  | ||||||
| ### Look at selected cases ########################################### |  | ||||||
| tmp <- dat |  | ||||||
| tmp$start <- tmp$date.start |  | ||||||
| tmp$complete <- tmp$date.stop |  | ||||||
| 
 |  | ||||||
| alog <- activitylog(tmp[tmp$case == 24016, ], |  | ||||||
|                     case_id     = "case", |  | ||||||
|                     activity_id = "item", |  | ||||||
|                     resource_id = "path", |  | ||||||
|                     timestamps  = c("start", "complete")) |  | ||||||
| 
 |  | ||||||
| process_map(alog) |  | ||||||
| 
 |  | ||||||
| rm(tmp) |  | ||||||
| 
 |  | ||||||
| ###################################################################### |  | ||||||
| 
 |  | ||||||
| res <- merge(dat, data.frame(case = dattree$case, cluster), |  | ||||||
|              by = "case", all.x = TRUE) |  | ||||||
| res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ] |  | ||||||
| 
 |  | ||||||
| xtabs( ~ item + cluster, res) |  | ||||||
| aggregate(event ~ cluster, res, table) |  | ||||||
| 
 |  | ||||||
| # Look at clusters |  | ||||||
| par(mfrow = c(2, 2)) |  | ||||||
| vioplot::vioplot(duration ~ cluster, res) |  | ||||||
| vioplot::vioplot(distance ~ cluster, res) |  | ||||||
| vioplot::vioplot(scaleSize ~ cluster, res) |  | ||||||
| vioplot::vioplot(rotationDegree ~ cluster, res) |  | ||||||
| 
 |  | ||||||
| aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ cluster, res, mean) |  | ||||||
| aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ cluster, res, median) |  | ||||||
| 
 |  | ||||||
| write.table(res, |  | ||||||
|             file = "results/haum/eventlogs_2019_case-clusters.csv", |  | ||||||
|             sep = ";", |  | ||||||
|             quote = FALSE, |  | ||||||
|             row.names = FALSE) |  | ||||||
| 
 |  | ||||||
| save(res, dist_mat, hcs, acs, datcase, dattree, coor_2d, coor_3d, |  | ||||||
|      file = "results/haum/tmp_user-navigation.RData") |  | ||||||
| 
 |  | ||||||
| #--------------- (3) Fit tree --------------- |  | ||||||
| 
 |  | ||||||
| library(rpart) |  | ||||||
| library(partykit) |  | ||||||
| 
 |  | ||||||
| c1 <- rpart(as.factor(cluster) ~ ., data = dattree[, -1], method = "class") |  | ||||||
| plot(as.party(c1)) |  | ||||||
| 
 |  | ||||||
| # with conditional tree |  | ||||||
| c2 <- ctree(as.factor(cluster) ~ ., data = dattree[, -1], alpha = 0) |  | ||||||
| plot(c2) |  | ||||||
| 
 |  | ||||||
| #--------------- (4) Investigate variants --------------- |  | ||||||
| 
 |  | ||||||
| res$start <- res$date.start |  | ||||||
| res$complete <- res$date.stop |  | ||||||
| 
 |  | ||||||
| alog <- activitylog(res, |  | ||||||
|                     case_id     = "case", |  | ||||||
|                     activity_id = "item", |  | ||||||
|                     resource_id = "path", |  | ||||||
|                     timestamps  = c("start", "complete")) |  | ||||||
| 
 |  | ||||||
| trace_explorer(alog, n_traces = 25) |  | ||||||
| # --> sequences of artworks are just too rare |  | ||||||
| 
 |  | ||||||
| tr <- traces(alog) |  | ||||||
| trace_length <- pbapply::pbsapply(strsplit(tr$trace, ","), length) |  | ||||||
| tr[trace_length > 10, ] |  | ||||||
| 
 |  | ||||||
| trace_varied <- pbapply::pbsapply(strsplit(tr$trace, ","), function(x) length(unique(x))) |  | ||||||
| tr[trace_varied > 1, ] |  | ||||||
| table(tr[trace_varied > 2, "absolute_frequency"]) |  | ||||||
| table(tr[trace_varied > 3, "absolute_frequency"]) |  | ||||||
| 
 |  | ||||||
| summary(tr$absolute_frequency) |  | ||||||
| vioplot::vioplot(tr$absolute_frequency) |  | ||||||
| 
 |  | ||||||
| # Power law for frequencies of traces |  | ||||||
| tab <- table(tr$absolute_frequency) |  | ||||||
| x <- as.numeric(tab) |  | ||||||
| y <- as.numeric(names(tab)) |  | ||||||
| 
 |  | ||||||
| plot(x, y, log = "xy") |  | ||||||
| p1 <- lm(log(y) ~ log(x)) |  | ||||||
| pre <- exp(coef(p1)[1]) * x^coef(p1)[2] |  | ||||||
| lines(x, pre) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # Look at individual traces as examples |  | ||||||
| tr[trace_varied == 5 & trace_length > 50, ] |  | ||||||
| # --> every variant exists only once, of course |  | ||||||
| datcase[datcase$nitems == 5 & datcase$length > 50,] |  | ||||||
| 
 |  | ||||||
| pbapply::pbsapply(datcase[, -c(1, 9)], median) |  | ||||||
| 
 |  | ||||||
| #ex <- datcase[datcase$nitems == 4 & datcase$length == 15,] |  | ||||||
| ex <- datcase[datcase$nitems == 5,] |  | ||||||
| ex <- ex[sample(1:nrow(ex), 20), ] |  | ||||||
| # --> pretty randomly chosen... TODO: |  | ||||||
| 
 |  | ||||||
| case_ids <- NULL |  | ||||||
| 
 |  | ||||||
| for (case in ex$case) { |  | ||||||
|   if ("080" %in% res$item[res$case == case] | "503" %in% res$item[res$case == case]) { |  | ||||||
|     case_ids <- c(case_ids, TRUE) |  | ||||||
|   } else { |  | ||||||
|     case_ids <- c(case_ids, FALSE) |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| cases <- ex$case[case_ids] |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| for (case in cases) { |  | ||||||
| 
 |  | ||||||
|   alog <- activitylog(res[res$case == case, ], |  | ||||||
|                       case_id     = "case", |  | ||||||
|                       activity_id = "item", |  | ||||||
|                       resource_id = "path", |  | ||||||
|                       timestamps  = c("start", "complete")) |  | ||||||
| 
 |  | ||||||
|   dfg <- process_map(alog, |  | ||||||
|                      type_nodes = frequency("absolute", color_scale = "Greys"), |  | ||||||
|                      type_edges = frequency("absolute", color_edges = "#FF6900"), |  | ||||||
|                      rankdir    = "LR", |  | ||||||
|                      render     = FALSE) |  | ||||||
|   export_map(dfg, |  | ||||||
|              file_name = paste0("results/processmaps/dfg_example_cases_", case, "_R.pdf"), |  | ||||||
|              file_type = "pdf", |  | ||||||
|              title     = paste("Case", case)) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
							
								
								
									
										141
									
								
								code/10_user-navigation.R
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										141
									
								
								code/10_user-navigation.R
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,141 @@ | |||||||
|  | # 10_user-navigation.R | ||||||
|  | # | ||||||
|  | # content: (1) Load data | ||||||
|  | #          (2) Clustering | ||||||
|  | #          (3) Fit tree | ||||||
|  | #          (4) Investigate variants | ||||||
|  | # | ||||||
|  | # input:  results/haum/event_logfiles_2024-02-21_16-07-33.csv | ||||||
|  | # output: results/haum/eventlogs_pre-corona_case-clusters.csv | ||||||
|  | # | ||||||
|  | # last mod: 2024-03-08 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") | ||||||
|  | 
 | ||||||
|  | library(bupaverse) | ||||||
|  | library(factoextra) | ||||||
|  | library(rpart) | ||||||
|  | library(partykit) | ||||||
|  | 
 | ||||||
|  | #--------------- (1) Load data --------------- | ||||||
|  | 
 | ||||||
|  | load("results/haum/dataframes_case_2019.RData") | ||||||
|  | 
 | ||||||
|  | #--------------- (2) Clustering --------------- | ||||||
|  | 
 | ||||||
|  | df <- dattree[, -1] | ||||||
|  | 
 | ||||||
|  | summary(df) | ||||||
|  | 
 | ||||||
|  | # Look at collinearity | ||||||
|  | cor_mat <- cor(df) | ||||||
|  | diag(cor_mat) <- NA | ||||||
|  | heatmap(cor_mat) | ||||||
|  | 
 | ||||||
|  | #--------------- (2.2) Hierarchical clustering --------------- | ||||||
|  | 
 | ||||||
|  | dist_mat <- cluster::daisy(df, metric = "gower") | ||||||
|  | 
 | ||||||
|  | # "Flatten" with MDS | ||||||
|  | coor_2d <- as.data.frame(cmdscale(dist_mat, k = 2)) | ||||||
|  | coor_3d <- as.data.frame(cmdscale(dist_mat, k = 3)) | ||||||
|  | # TODO: Better use MASS::isoMDS() since I am not using Euclidean distances? | ||||||
|  | 
 | ||||||
|  | plot(coor_2d) | ||||||
|  | rgl::plot3d(coor_3d) | ||||||
|  | 
 | ||||||
|  | method <- c(average = "average", single = "single", complete = "complete", | ||||||
|  |             ward = "ward") | ||||||
|  | 
 | ||||||
|  | method <- "ward" | ||||||
|  | 
 | ||||||
|  | hcs <- pbapply::pblapply(method, function(x) cluster::agnes(dist_mat, method = x)) | ||||||
|  | acs <- pbapply::pbsapply(hcs, function(x) x$ac) | ||||||
|  | 
 | ||||||
|  | hc <- hcs$ward | ||||||
|  | 
 | ||||||
|  | # Something like a scree plot (??) | ||||||
|  | plot(rev(hc$height)[1:100], type = "b", pch = 16, cex = .5) | ||||||
|  | 
 | ||||||
|  | k <- 4 | ||||||
|  | 
 | ||||||
|  | mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E") | ||||||
|  | 
 | ||||||
|  | cluster <- cutree(as.hclust(hc), k = k) | ||||||
|  | 
 | ||||||
|  | table(cluster) | ||||||
|  | 
 | ||||||
|  | plot(coor_2d, col = mycols[cluster]) | ||||||
|  | legend("topleft", paste("Cl", 1:4), col = mycols, pch = 21) | ||||||
|  | rgl::plot3d(coor_3d, col = mycols[cluster]) | ||||||
|  | 
 | ||||||
|  | table(dattree[cluster == 1, "Pattern"]) | ||||||
|  | table(dattree[cluster == 2, "Pattern"]) | ||||||
|  | table(dattree[cluster == 3, "Pattern"]) | ||||||
|  | table(dattree[cluster == 4, "Pattern"]) | ||||||
|  | 
 | ||||||
|  | table(dattree[cluster == 1, "InfocardOnly"]) | ||||||
|  | table(dattree[cluster == 2, "InfocardOnly"]) | ||||||
|  | table(dattree[cluster == 3, "InfocardOnly"]) | ||||||
|  | table(dattree[cluster == 4, "InfocardOnly"]) | ||||||
|  | 
 | ||||||
|  | aggregate(. ~ cluster, df, mean) | ||||||
|  | 
 | ||||||
|  | aggregate(cbind(duration, distance, scaleSize, rotationDegree, length, | ||||||
|  |                 nmove, nflipCard, nopenTopic, nopenPopup) ~ cluster, datcase, | ||||||
|  |           mean) | ||||||
|  | 
 | ||||||
|  | ### Look at selected cases ########################################### | ||||||
|  | tmp <- dat | ||||||
|  | tmp$start <- tmp$date.start | ||||||
|  | tmp$complete <- tmp$date.stop | ||||||
|  | 
 | ||||||
|  | alog <- activitylog(tmp[tmp$case == 24016, ], | ||||||
|  |                     case_id     = "case", | ||||||
|  |                     activity_id = "item", | ||||||
|  |                     resource_id = "path", | ||||||
|  |                     timestamps  = c("start", "complete")) | ||||||
|  | 
 | ||||||
|  | process_map(alog) | ||||||
|  | 
 | ||||||
|  | rm(tmp) | ||||||
|  | 
 | ||||||
|  | ###################################################################### | ||||||
|  | 
 | ||||||
|  | res <- merge(dat, data.frame(case = dattree$case, cluster), | ||||||
|  |              by = "case", all.x = TRUE) | ||||||
|  | res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ] | ||||||
|  | 
 | ||||||
|  | xtabs( ~ item + cluster, res) | ||||||
|  | aggregate(event ~ cluster, res, table) | ||||||
|  | 
 | ||||||
|  | # Look at clusters | ||||||
|  | par(mfrow = c(2, 2)) | ||||||
|  | vioplot::vioplot(duration ~ cluster, res) | ||||||
|  | vioplot::vioplot(distance ~ cluster, res) | ||||||
|  | vioplot::vioplot(scaleSize ~ cluster, res) | ||||||
|  | vioplot::vioplot(rotationDegree ~ cluster, res) | ||||||
|  | 
 | ||||||
|  | aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ cluster, res, mean) | ||||||
|  | aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ cluster, res, median) | ||||||
|  | 
 | ||||||
|  | write.table(res, | ||||||
|  |             file = "results/haum/eventlogs_2019_case-clusters.csv", | ||||||
|  |             sep = ";", | ||||||
|  |             quote = FALSE, | ||||||
|  |             row.names = FALSE) | ||||||
|  | 
 | ||||||
|  | save(res, dist_mat, hcs, acs, datcase, dattree, coor_2d, coor_3d, | ||||||
|  |      file = "results/haum/tmp_user-navigation.RData") | ||||||
|  | 
 | ||||||
|  | #--------------- (3) Fit tree --------------- | ||||||
|  | 
 | ||||||
|  | c1 <- rpart(as.factor(cluster) ~ ., data = dattree[, -1], method = "class") | ||||||
|  | plot(as.party(c1)) | ||||||
|  | 
 | ||||||
|  | # with conditional tree | ||||||
|  | c2 <- ctree(as.factor(cluster) ~ ., data = dattree[, -1], alpha = 0) | ||||||
|  | plot(c2) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
							
								
								
									
										101
									
								
								code/11_investigate-variants.R
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										101
									
								
								code/11_investigate-variants.R
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,101 @@ | |||||||
|  | # 11_investigate-variants.R | ||||||
|  | # | ||||||
|  | # content: (1) Read data | ||||||
|  | #          (2) Extract characteristics for cases | ||||||
|  | #          (3) Select features for navigation behavior | ||||||
|  | #          (4) Export data frames | ||||||
|  | # | ||||||
|  | # input:  results/haum/event_logfiles_2024-02-21_16-07-33.csv | ||||||
|  | # output: results/haum/eventlogs_pre-corona_case-clusters.csv | ||||||
|  | # | ||||||
|  | # last mod: 2024-03-08 | ||||||
|  | 
 | ||||||
|  | # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") | ||||||
|  | 
 | ||||||
|  | #--------------- (1) Read data --------------- | ||||||
|  | 
 | ||||||
|  | load("results/haum/eventlogs_pre-corona_cleaned.RData") | ||||||
|  | 
 | ||||||
|  | #--------------- (4) Investigate variants --------------- | ||||||
|  | 
 | ||||||
|  | res$start <- res$date.start | ||||||
|  | res$complete <- res$date.stop | ||||||
|  | 
 | ||||||
|  | alog <- activitylog(res, | ||||||
|  |                     case_id     = "case", | ||||||
|  |                     activity_id = "item", | ||||||
|  |                     resource_id = "path", | ||||||
|  |                     timestamps  = c("start", "complete")) | ||||||
|  | 
 | ||||||
|  | trace_explorer(alog, n_traces = 25) | ||||||
|  | # --> sequences of artworks are just too rare | ||||||
|  | 
 | ||||||
|  | tr <- traces(alog) | ||||||
|  | trace_length <- pbapply::pbsapply(strsplit(tr$trace, ","), length) | ||||||
|  | tr[trace_length > 10, ] | ||||||
|  | 
 | ||||||
|  | trace_varied <- pbapply::pbsapply(strsplit(tr$trace, ","), function(x) length(unique(x))) | ||||||
|  | tr[trace_varied > 1, ] | ||||||
|  | table(tr[trace_varied > 2, "absolute_frequency"]) | ||||||
|  | table(tr[trace_varied > 3, "absolute_frequency"]) | ||||||
|  | 
 | ||||||
|  | summary(tr$absolute_frequency) | ||||||
|  | vioplot::vioplot(tr$absolute_frequency) | ||||||
|  | 
 | ||||||
|  | # Power law for frequencies of traces | ||||||
|  | tab <- table(tr$absolute_frequency) | ||||||
|  | x <- as.numeric(tab) | ||||||
|  | y <- as.numeric(names(tab)) | ||||||
|  | 
 | ||||||
|  | plot(x, y, log = "xy") | ||||||
|  | p1 <- lm(log(y) ~ log(x)) | ||||||
|  | pre <- exp(coef(p1)[1]) * x^coef(p1)[2] | ||||||
|  | lines(x, pre) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Look at individual traces as examples | ||||||
|  | tr[trace_varied == 5 & trace_length > 50, ] | ||||||
|  | # --> every variant exists only once, of course | ||||||
|  | datcase[datcase$nitems == 5 & datcase$length > 50,] | ||||||
|  | 
 | ||||||
|  | pbapply::pbsapply(datcase[, -c(1, 9)], median) | ||||||
|  | 
 | ||||||
|  | #ex <- datcase[datcase$nitems == 4 & datcase$length == 15,] | ||||||
|  | ex <- datcase[datcase$nitems == 5,] | ||||||
|  | ex <- ex[sample(1:nrow(ex), 20), ] | ||||||
|  | # --> pretty randomly chosen... TODO: | ||||||
|  | 
 | ||||||
|  | case_ids <- NULL | ||||||
|  | 
 | ||||||
|  | for (case in ex$case) { | ||||||
|  |   if ("080" %in% res$item[res$case == case] | "503" %in% res$item[res$case == case]) { | ||||||
|  |     case_ids <- c(case_ids, TRUE) | ||||||
|  |   } else { | ||||||
|  |     case_ids <- c(case_ids, FALSE) | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | cases <- ex$case[case_ids] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | for (case in cases) { | ||||||
|  | 
 | ||||||
|  |   alog <- activitylog(res[res$case == case, ], | ||||||
|  |                       case_id     = "case", | ||||||
|  |                       activity_id = "item", | ||||||
|  |                       resource_id = "path", | ||||||
|  |                       timestamps  = c("start", "complete")) | ||||||
|  | 
 | ||||||
|  |   dfg <- process_map(alog, | ||||||
|  |                      type_nodes = frequency("absolute", color_scale = "Greys"), | ||||||
|  |                      type_edges = frequency("absolute", color_edges = "#FF6900"), | ||||||
|  |                      rankdir    = "LR", | ||||||
|  |                      render     = FALSE) | ||||||
|  |   export_map(dfg, | ||||||
|  |              file_name = paste0("results/processmaps/dfg_example_cases_", case, "_R.pdf"), | ||||||
|  |              file_type = "pdf", | ||||||
|  |              title     = paste("Case", case)) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | } | ||||||
|  | 
 | ||||||
							
								
								
									
										45
									
								
								code/R_helpers.R
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										45
									
								
								code/R_helpers.R
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,45 @@ | |||||||
|  | ###################################################################### | ||||||
|  | time_minmax_ms <- function(subdata) { | ||||||
|  |   subdata$min_time <- min(subdata$timeMs.start) | ||||||
|  |     if (all(is.na(subdata$timeMs.stop))) { | ||||||
|  |       subdata$max_time <- NA | ||||||
|  |     } else { | ||||||
|  |        subdata$max_time <- max(subdata$timeMs.stop, na.rm = TRUE) | ||||||
|  |     } | ||||||
|  |   subdata | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | ###################################################################### | ||||||
|  | check_infocards <- function(subdata, artworks) { | ||||||
|  |   infocard_only <- NULL | ||||||
|  |   if(any(unique(subdata$item) %in% artworks)) { | ||||||
|  |     infocard_only <- FALSE | ||||||
|  |   } else { | ||||||
|  |     infocard_only <- TRUE | ||||||
|  |   } | ||||||
|  |   as.numeric(infocard_only) | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | ###################################################################### | ||||||
|  | normalize <- function(x) { | ||||||
|  |   (x - min(x)) / (max(x) - min(x)) | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | ###################################################################### | ||||||
|  | get_centrality <- function(case, data) { | ||||||
|  | 
 | ||||||
|  |   data$start <- data$date.start | ||||||
|  |   data$complete <- data$date.stop | ||||||
|  | 
 | ||||||
|  |   alog <- bupaR::activitylog(data[data$case == case, ], | ||||||
|  |                              case_id     = "case", | ||||||
|  |                              activity_id = "item", | ||||||
|  |                              resource_id = "path", | ||||||
|  |                              timestamps  = c("start", "complete")) | ||||||
|  | 
 | ||||||
|  |   net <- processmapR::process_map(alog, render = FALSE) | ||||||
|  |   inet <- DiagrammeR::to_igraph(net) | ||||||
|  | 
 | ||||||
|  |   igraph::centr_betw(inet)$centralization | ||||||
|  | } | ||||||
|  | 
 | ||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user