mtt_haum/code/questions/questions_number-of-cards.R

59 lines
1.9 KiB
R

#' ---
#' title: "Open Questions -- Card indices"
#' author: "Nora Wickelmaier"
#' date: "`r Sys.Date()`"
#' output:
#'   html_document:
#'     number_sections: true
#'     toc: true
#' ---
#+ include = FALSE
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
# Read the semicolon-separated event log; the first row holds the column names.
dat <- read.table(file = "../data/event_logfiles.csv", header = TRUE, sep = ";")
# Convert the start and stop timestamps to POSIXct in one step.
dat[c("date.start", "date.stop")] <-
  lapply(dat[c("date.start", "date.stop")], as.POSIXct)
# Store artwork ids as zero-padded three-character strings (e.g. 7 -> "007").
dat[["artwork"]] <- sprintf(fmt = "%03d", dat[["artwork"]])
#' The following table shows an overview of the card indices. The indices
#' should have values between 0 and 5. It is unclear what the numbers mean.
# Frequency of every card index in the log. useNA = "ifany" adds an NA column
# when card indices are missing, so they are not silently dropped from the
# overview; without missing values the output is unchanged.
table(dat$card, useNA = "ifany")
#' Number of cards for each artwork in the data set (subset from 2016)
# Sorted vector of the distinct artwork ids that occur in the event log.
artworks <- sort(unique(dat[["artwork"]]))
# Number of distinct, non-missing card indices recorded for artwork `x`.
# `data` must provide the columns `artwork` and `card`; it defaults to the
# global event log `dat`, so existing calls `count(x)` keep working.
count <- function(x, data = dat) {
  cards <- data[which(data$artwork == x), "card"]
  # table() ignores NA, so only index values that were actually observed count
  length(table(cards))
}

# Highest card index recorded for artwork `x`; missing values are ignored.
# Returns NA when the artwork has no non-missing card index at all, instead
# of the -Inf (plus warning) that max() produces on an empty vector.
# `data` must provide the columns `artwork` and `card` (default: global `dat`).
max_index <- function(x, data = dat) {
  cards <- data[which(data$artwork == x), "card"]
  cards <- cards[!is.na(cards)]
  if (length(cards) == 0L) {
    return(NA_real_)
  }
  max(cards)
}
# Apply both helpers to every artwork. vapply() fixes the result type (one
# integer / one number per artwork) and, unlike sapply(), still returns an
# atomic vector when `artworks` is empty. Results are named by artwork id.
num_cards <- vapply(artworks, count, integer(1))
highest_index <- vapply(artworks, max_index, numeric(1))
#' Check how many XML-files for cards are present
path <- "../data/ContentEyevisit/eyevisit_cards_light"
# For every artwork, count the files in <path>/<artwork> whose name matches
# "<artwork>_". The result is built in one pass with vapply() instead of
# growing a vector with c() inside a loop; it is an unnamed integer vector in
# the order of `artworks`.
# NOTE(review): the pattern is an unanchored regular expression and is not
# restricted to the .xml extension -- confirm this matches the file naming.
num_files <- vapply(
  artworks,
  function(artwork) {
    length(list.files(
      path = file.path(path, artwork),
      pattern = paste0(artwork, "_")
    ))
  },
  integer(1),
  USE.NAMES = FALSE
)
#' The table shows that each artwork has at most 6 cards (as expected).
#' This is a subset of the data, so not all cards have been opened.
# One row per artwork: distinct card indices seen, highest index seen, and
# number of card files found on disk.
cards <- data.frame(
  artwork = artworks,
  num_cards = num_cards,
  highest_index = highest_index,
  num_files = num_files
)
# Negative values mean the log contains an index larger than the file count.
cards$diff <- cards$num_files - cards$highest_index
cards
#' There are 8 or more files for a couple of artworks:
# Artworks with at least 8 card files on disk.
subset(cards, num_files >= 8)
#' It might be possible that the number indicates the index of the file
#' and not the actual card that was displayed. BUT: In many cases, there
#' are only 6 (or fewer) files, but a higher index is present...
# Artworks whose highest logged card index exceeds the number of card files.
subset(cards, diff < 0)