#' ---
#' title: "Open Questions -- Card indices"
#' author: "Nora Wickelmaier"
#' date: "`r Sys.Date()`"
#' output: 
#'   html_document:
#'     number_sections: true
#'     toc: true
#' ---

#+ include = FALSE
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
# Read the semicolon-separated event log; the first row holds the column names.
dat <- read.table("../data/event_logfiles.csv", header = TRUE, sep = ";")

# Convert the start and stop time stamps to POSIXct.
dat[["date.start"]] <- as.POSIXct(dat[["date.start"]])
dat[["date.stop"]] <- as.POSIXct(dat[["date.stop"]])

# Store artwork ids as strings, zero-padded to at least three digits.
dat[["artwork"]] <- sprintf("%03d", dat[["artwork"]])

#' The following table shows an overview of the card indices. The indices
#' should have values between 0 and 5. It is unclear what the numbers mean.

# Frequency of every card index that occurs in the log.
table(dat[["card"]])

#' Number of cards for each artwork in the data set (subset from 2016)

# Sorted ids of all artworks that occur in the log.
artworks <- sort(unique(dat$artwork))

# Number of distinct card indices logged for artwork `x`.
# Missing indices are not counted because table() drops NA by default.
count <- function(x) {
  length(table(dat[which(dat$artwork == x), "card"]))
}

# Highest card index logged for artwork `x`.
# Returns NA when the artwork has no non-missing index; plain
# max(..., na.rm = TRUE) would warn and return -Inf in that case.
max_index <- function(x) {
  idx <- dat[which(dat$artwork == x), "card"]
  idx <- idx[!is.na(idx)]
  if (length(idx) == 0L) {
    return(NA_real_)
  }
  max(idx)
}

# vapply() fixes the result type (also for zero artworks) and, like sapply(),
# names the result by artwork id.
num_cards <- vapply(artworks, count, integer(1))
highest_index <- vapply(artworks, max_index, numeric(1))

#' Check how many XML-files for cards are present

# Root directory that holds one sub-directory per artwork.
path <- "../data/ContentEyevisit/eyevisit_cards_light"

# Number of files in each artwork's directory whose name matches
# "<artwork>_". A directory that does not exist yields 0.
# NOTE(review): the pattern is an unanchored regular expression, so it matches
# the id anywhere in the file name and is not restricted to XML files --
# confirm this is intended.
# vapply() replaces growing the vector with c() inside a loop and returns
# integer(0) instead of NULL when there are no artworks.
num_files <- vapply(
  artworks,
  function(artwork) {
    length(list.files(path = file.path(path, artwork),
                      pattern = paste0(artwork, "_")))
  },
  integer(1),
  USE.NAMES = FALSE
)

#' The table shows that each artwork has at most 6 cards (as expected).
#' This is a subset of the data, so not all cards have been opened.

# One row per artwork: distinct card indices seen, highest index seen and
# number of card files found on disk.
cards <- data.frame(
  artwork = artworks,
  num_cards = num_cards,
  highest_index = highest_index,
  num_files = num_files
)
# Files on disk minus highest logged index; negative values mean the log
# contains an index that is larger than the number of files.
cards$diff <- cards$num_files - cards$highest_index
cards

#' There are 8 or more files for a couple of artworks:

# Artworks with at least 8 card files on disk.
subset(cards, num_files >= 8)

#' It might be possible that the number indicates the index of the file
#' and not the actual card that was displayed. BUT: In many cases, there
#' are only 6 (or fewer) files, but a higher index is present...

# Artworks whose highest logged index exceeds the number of files on disk.
subset(cards, diff < 0)