diff --git a/code/questions/questions_data-inconsistencies.R b/code/questions/questions_data-inconsistencies.R deleted file mode 100644 index 6ebdf04..0000000 --- a/code/questions/questions_data-inconsistencies.R +++ /dev/null @@ -1,125 +0,0 @@ -#' --- -#' title: "Open Questions" -#' author: "Nora Wickelmaier" -#' date: "`r Sys.Date()`" -#' output: -#' html_document: -#' number_sections: true -#' toc: true -#' --- - -#+ include = FALSE -# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code") -dat <- read.table("../data/event_logfiles.csv", sep = ";", header = TRUE) -dat$date.start <- as.POSIXct(dat$date.start) -dat$date.stop <- as.POSIXct(dat$date.stop) - -#' This is what the data look like after preprocessing right now - -#+ include = FALSE -mat <- as.data.frame(t(sapply(dat, range, na.rm = TRUE))) -names(mat) <- c("min", "max") -mat$min <- round(as.numeric(mat$min), 1) -mat$max <- round(as.numeric(mat$max), 1) -mat$mean <- round(sapply(dat, function(x) mean(x, na.rm = TRUE)), 1) -mat$missings <- sapply(dat, function(x) sum(is.na(x))) -mat <- mat[!(rownames(mat) %in% c("eventid", "case", "trace", "event", "artwork", "card", "popup", "date.start", "date.stop")), ] - -#+ echo = FALSE -knitr::kable(mat) - -#' This is only the data for 2016! So only about 2 weeks in December. - -# Date ranges -range(dat$date.start) -range(dat$date.stop, na.rm = TRUE) - -#' # Units of x and y -#' I assume that x and y are pixel $\to$ correct? - -#' But they look weird, when plotted. Is it possible that there are -#' outliers? If yes, how? Do we have the true ranges of the display? - -par(mfrow = c(1, 2)) -plot(y.start ~ x.start, dat) -abline(v = c(0, 3800), h = c(0, 2150), col = "blue", lwd = 2) -plot(y.stop ~ x.stop, dat) -abline(v = c(0, 3800), h = c(0, 2150), col = "blue", lwd = 2) - -aggregate(cbind(x.start, x.stop, y.start, y.stop) ~ 1, dat, mean) - -#' Looks like the range should be something like $x = [0, 3800]$ and -#' $y = [0, 2150]$. Do we have the starting coordinates for each artwork? -#' - -#' # Unit of scale - -summary(dat$scaleSize) - -#' I thought it would be some kind of scaling factor, but then I would -#' have expected that `scale.start` is always 1 or something. -#' - -#' # Unit of rotation - -summary(dat$rotationDegree) - -#' This looks pretty clear. Should be degree. Anything else to consider -#' here? I am assuming negative means left, but maybe not? -#' - -#' # Meaningful unit for "case" - -#' I pretty randomly chose `20 sec` based on this plot. I would love a -#' second opinion. `:)` - -timediff <- as.numeric(diff(c(dat$date.start[1], dat$date.start))) -hist(timediff[timediff < 40], breaks = 50) -abline(v = 20, col = "red", lwd = 2) - -#' This actually works pretty well and lets me assign `trace` values to the -#' moves. But maybe there are other ideas on how to define this? - -dat[1:40, c("date.start", "case", "trace", "event", "artwork")] - - -#' # Problems with `time_ms` - -#' What exactly happens, when `time_ms` goes down again? Why does it not go -#' down to 0? - -par(mfrow = c(1, 2)) - -plot(dat$time_ms.start[1:100], type = "b", ylab = "time_ms", xlab = "") -points(dat$time_ms.stop[1:100], type = "b", col = rgb(1, 0, 0, .5)) -legend("topleft", c("start", "stop"), lty = 1, col = c("black", "red")) - -plot(dat$time_ms.stop[1:100] - dat$time_ms.start[1:100], type = "b", - ylab = "duration", col = rgb(0, 0, 1, .5)) -abline(h = 0, lty = 2) - -#' For the regular timestamps everything looks fine. - -par(mfrow = c(1, 2)) - -plot(dat$date.stop[1:100], type = "b", ylab = "timestamp", xlab = "", - col = rgb(1, 0, 0, .5)) -points(dat$date.start[1:100], type = "b") -legend("topleft", c("start", "stop"), lty = 1, col = c("black", "red")) - -plot(dat$date.stop[1:100] - dat$date.start[1:100], type = "b", - ylab = "duration", col = rgb(0, 0, 1, .5)) -abline(h = 0, lty = 2) - -#+ -plot(time_ms.start ~ date.start, dat[1:1000, ], type = "b") -points(time_ms.stop ~ date.stop, dat[1:1000, ], type = "b", col = rgb(1, 0, 0, .3)) - -#' For `time_ms.stop` this looks even weirder. -#' - -#' # After which time interval does the table reset? - -#' I cannot see this in the data at all. Or can I? Has this something to do -#' with the weird behavior of `time_ms`? - diff --git a/code/questions/questions_number-of-cards.R b/code/questions/questions_number-of-cards.R deleted file mode 100644 index 19f7a2b..0000000 --- a/code/questions/questions_number-of-cards.R +++ /dev/null @@ -1,58 +0,0 @@ -#' --- -#' title: "Open Questions -- Card indices" -#' author: "Nora Wickelmaier" -#' date: "`r Sys.Date()`" -#' output: -#' html_document: -#' number_sections: true -#' toc: true -#' --- - -#+ include = FALSE -# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code") -dat <- read.table("../data/event_logfiles.csv", sep = ";", header = TRUE) -dat$date.start <- as.POSIXct(dat$date.start) -dat$date.stop <- as.POSIXct(dat$date.stop) -dat$artwork <- sprintf("%03d", dat$artwork) - -#' The following table shows an overview of the card indices. The indices -#' should have values between 0 and 5. It is unclear what the numbers mean. - -table(dat$card) - -#' Number of cards for each artwork in the data set (subset from 2016) - -artworks <- sort(unique(dat$artwork)) - -count <- function(x) length(table(dat[which(dat$artwork == x), "card"])) -max_index <- function(x) max(dat[which(dat$artwork == x), "card"], na.rm = TRUE) -num_cards <- sapply(artworks, count) -highest_index <- sapply(artworks, max_index) - -#' Check how many XML-files for cards are present - -path <- "../data/ContentEyevisit/eyevisit_cards_light" - -num_files <- NULL -for (artwork in artworks) { - fnames <- dir(pattern = paste0(artwork, "_"), path = paste(path, artwork, sep = "/")) - num_files <- c(num_files, length(fnames)) -} - -#' The table shows that each artwork has 6 cards the most (as expected). -#' This is a subset of the data, so not all cards have been opened. - -cards <- data.frame(artwork = artworks, num_cards, highest_index, - num_files, diff = num_files - highest_index) -cards - -#' There are more than 8 files for a couple of artworks: - -subset(cards, cards$num_files >= 8) - -#' It might be possible, that the number indicates the index of the file -#' and not the actual card that was displayed. BUT: In many cases, there -#' are only 6 (or less) files, but a higher index is present... - -subset(cards, cards$diff < 0) - diff --git a/code/questions/questions_programming-input.R b/code/questions/questions_programming-input.R deleted file mode 100644 index 9fc5afb..0000000 --- a/code/questions/questions_programming-input.R +++ /dev/null @@ -1,105 +0,0 @@ -#' --- -#' title: "Programming input" -#' author: "Nora Wickelmaier" -#' date: "`r Sys.Date()`" -#' output: html_document -#' --- - -#+ include = FALSE -# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code") - -#+ -dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";", header = TRUE) -dat0$date <- as.POSIXct(dat0$date) # create date object - -# Remove irrelevant events -dat <- subset(dat0, !(dat0$event %in% c("Start Application", "Show Application"))) -str(dat) - -# make data better manageable -tmp <- dat[!dat$event %in% c("Transform start", "Transform stop"), ] -rownames(tmp) <- NULL - -#' # Add `trace` variable for closing events - -tmp$trace <- NA -last_event <- tmp$event[1] -aws <- unique(tmp$artwork)[unique(tmp$artwork) != "glossar"] - -for (art in aws) { # select artwork - - for (i in 1:nrow(tmp)) { # go through rows - - if (last_event == "Show Info" & tmp$artwork[i] == art) { - tmp$trace[i] <- i - j <- i - - } else if (last_event == "Show Front" & tmp$artwork[i] == art) { - tmp$trace[i] <- j - - } else if (!(last_event %in% c("Show Info", "Show Front")) & - tmp$artwork[i] == art) { - tmp$trace[i] <- j - } - - if (i <= nrow(tmp)) { - last_event <- tmp$event[i + 1] - } - } -} - -head(tmp[, c("artwork", "event", "trace")], 50) - -#' # Find artwork for glossar entry - -glossar_files <- unique(tmp[tmp$artwork == "glossar", "popup"]) - -# Load lookup table for artworks and glossar files -load("../data/glossar_dict.RData") -lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ] - -# Fill in trace variable based on last `Show Info` -for (file in lut$glossar_file) { - - artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"]) - - for (i in seq_len(nrow(tmp))) { - - if (tmp$event[i] == "Show Info") { - - current_artwork <- tmp[i, "artwork"] - j <- i - k <- i - - } else { - - current_artwork <- current_artwork - - } - - if (tmp$event[i] == "Show Front" & tmp$artwork[i] == current_artwork) { - # make sure artwork has not been closed, yet! - k <- i - } - - if (tmp$artwork[i] == "glossar" & - (current_artwork %in% artwork_list) & - tmp$popup[i] == file & (j-k == 0)) { - - tmp[i, "trace"] <- tmp[j, "trace"] - - } - } -} - -tmp[tmp$artwork == "glossar", c("artwork", "event", "popup", "trace")] - -proportions(table(is.na(tmp$trace[tmp$artwork == "glossar"]))) -# --> finds about half of the glossar entries for small data set... - -# REMEMBER: It can never be 100% correct, since it is always possible that -# several cards are open and that they link to the same glossar entry - -# How many glossar_files are only associated with one artwork? -lut[sapply(lut$artwork, length) == 1, "glossar_file"] -