#' ---
#' title: "Open Questions -- Card indices"
#' author: "Nora Wickelmaier"
#' date: "`r Sys.Date()`"
#' output:
#'   html_document:
#'     number_sections: true
#'     toc: true
#' ---

#+ include = FALSE
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")

# Read the semicolon-separated event log; paths are relative to the
# directory the script is run from.
dat <- read.table("../data/event_logfiles.csv", sep = ";", header = TRUE)
dat$date.start <- as.POSIXct(dat$date.start)
dat$date.stop <- as.POSIXct(dat$date.stop)
# Zero-pad the artwork number to three characters; the padded form is used
# below to build directory names and file name patterns.
dat$artwork <- sprintf("%03d", dat$artwork)

#' The following table shows an overview of the card indices. The indices
#' should have values between 0 and 5. It is unclear what the numbers mean.

table(dat$card)

#' Number of cards for each artwork in the data set (subset from 2016)

artworks <- sort(unique(dat$artwork))

# Number of distinct card indices logged for artwork `x`
# (table() leaves out missing values by default).
count <- function(x) {
  length(table(dat$card[which(dat$artwork == x)]))
}

# Largest card index logged for artwork `x`, ignoring missing values.
# max() yields -Inf (with a warning) when no non-missing value is left.
max_index <- function(x) {
  max(dat$card[which(dat$artwork == x)], na.rm = TRUE)
}

# vapply() pins the result type; the artwork ids are kept as names and end up
# as the row names of the `cards` table below.
num_cards <- vapply(artworks, count, integer(1))
highest_index <- vapply(artworks, max_index, numeric(1))

#' Check how many XML-files for cards are present

path <- "../data/ContentEyevisit/eyevisit_cards_light"

# One sub-directory per artwork; count the files whose name contains
# "<artwork>_" (`pattern` is a regular expression).
num_files <- vapply(
  artworks,
  function(artwork) {
    length(list.files(file.path(path, artwork), pattern = paste0(artwork, "_")))
  },
  integer(1),
  USE.NAMES = FALSE
)

#' The table shows that each artwork has at most 6 cards (as expected).
#' This is a subset of the data, so not all cards have been opened.

cards <- data.frame(
  artwork = artworks,
  num_cards,
  highest_index,
  num_files,
  diff = num_files - highest_index
)
cards

#' There are 8 or more files for a couple of artworks:

subset(cards, num_files >= 8)

#' It might be possible that the number indicates the index of the file
#' and not the actual card that was displayed. BUT: In many cases, there
#' are only 6 (or fewer) files, but a higher index is present...

subset(cards, diff < 0)