Removed questions, since it is all outdated

2024-03-22 13:38:37 +01:00 · 2024-03-22 13:38:37 +01:00 · 37e67bfa69
commit 37e67bfa69
parent cba441f08b
3 changed files with 0 additions and 288 deletions
--- a/code/questions/questions_data-inconsistencies.R
+++ b/code/questions/questions_data-inconsistencies.R
@ -1,125 +0,0 @@
 #' ---
 #' title: "Open Questions"
 #' author: "Nora Wickelmaier"
 #' date: "`r Sys.Date()`"
 #' output: 
 #'   html_document:
 #'     number_sections: true
 #'     toc: true
 #' ---
 #+ include = FALSE
 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
 dat <- read.table("../data/event_logfiles.csv", sep = ";", header = TRUE)
 dat$date.start <- as.POSIXct(dat$date.start)
 dat$date.stop <- as.POSIXct(dat$date.stop)
 #' This is what the data look like after preprocessing right now
 #+ include = FALSE
 mat <- as.data.frame(t(sapply(dat, range, na.rm = TRUE)))
 names(mat) <- c("min", "max")
 mat$min <- round(as.numeric(mat$min), 1)
 mat$max <- round(as.numeric(mat$max), 1)
 mat$mean <- round(sapply(dat, function(x) mean(x, na.rm = TRUE)), 1)
 mat$missings <- sapply(dat, function(x) sum(is.na(x)))
 mat <- mat[!(rownames(mat) %in% c("eventid", "case", "trace", "event", "artwork", "card", "popup", "date.start", "date.stop")), ]
 #+ echo = FALSE
 knitr::kable(mat)
 #' This is only the data for 2016! So only about 2 weeks in December.
 # Date ranges
 range(dat$date.start)
 range(dat$date.stop, na.rm = TRUE)
 #' # Units of x and y
 #' I assume that x and y are pixel $\to$ correct?
 #' But they look weird, when plotted. Is it possible that there are
 #' outliers? If yes, how? Do we have the true ranges of the display?
 par(mfrow = c(1, 2))
 plot(y.start ~ x.start, dat)
 abline(v = c(0, 3800), h = c(0, 2150), col = "blue", lwd = 2)
 plot(y.stop ~ x.stop, dat)
 abline(v = c(0, 3800), h = c(0, 2150), col = "blue", lwd = 2)
 aggregate(cbind(x.start, x.stop, y.start, y.stop) ~ 1, dat, mean)
 #' Looks like the range should be something like $x = [0, 3800]$ and
 #' $y = [0, 2150]$. Do we have the starting coordinates for each artwork?
 #'
 #' # Unit of scale
 summary(dat$scaleSize)
 #' I thought it would be some kind of scaling factor, but then I would
 #' have expected that `scale.start` is always 1 or something.
 #'
 #' # Unit of rotation
 summary(dat$rotationDegree)
 #' This looks pretty clear. Should be degree. Anything else to consider
 #' here? I am assuming negative means left, but maybe not?
 #'
 #' # Meaningful unit for "case"
 #' I pretty randomly chose `20 sec` based on this plot. I would love a
 #' second opinion. `:)`
 timediff <- as.numeric(diff(c(dat$date.start[1], dat$date.start)))
 hist(timediff[timediff < 40], breaks = 50)
 abline(v = 20, col = "red", lwd = 2)
 #' This actually works pretty well and lets me assign `trace` values to the
 #' moves. But maybe there are other ideas on how to define this?
 dat[1:40, c("date.start", "case", "trace", "event", "artwork")]
 #' # Problems with `time_ms`
 #' What exactly happens, when `time_ms` goes down again? Why does it not go
 #' down to 0?
 par(mfrow = c(1, 2))
 plot(dat$time_ms.start[1:100], type = "b", ylab = "time_ms", xlab = "")
 points(dat$time_ms.stop[1:100], type = "b", col = rgb(1, 0, 0, .5))
 legend("topleft", c("start", "stop"), lty = 1, col = c("black", "red"))
 plot(dat$time_ms.stop[1:100] - dat$time_ms.start[1:100], type = "b",
     ylab = "duration", col = rgb(0, 0, 1, .5))
 abline(h = 0, lty = 2)
 #' For the regular timestamps everything looks fine.
 par(mfrow = c(1, 2))
 plot(dat$date.stop[1:100], type = "b", ylab = "timestamp", xlab = "",
     col = rgb(1, 0, 0, .5))
 points(dat$date.start[1:100], type = "b")
 legend("topleft", c("start", "stop"), lty = 1, col = c("black", "red"))
 plot(dat$date.stop[1:100] - dat$date.start[1:100], type = "b",
     ylab = "duration", col = rgb(0, 0, 1, .5))
 abline(h = 0, lty = 2)
 #+
 plot(time_ms.start ~ date.start, dat[1:1000, ], type = "b")
 points(time_ms.stop ~ date.stop, dat[1:1000, ], type = "b", col = rgb(1, 0, 0, .3))
 #' For `time_ms.stop` this looks even weirder.
 #'
 #' # After which time interval does the table reset?
 #' I cannot see this in the data at all. Or can I? Has this something to do
 #' with the weird behavior of `time_ms`?
--- a/code/questions/questions_number-of-cards.R
+++ b/code/questions/questions_number-of-cards.R
@ -1,58 +0,0 @@
 #' ---
 #' title: "Open Questions -- Card indices"
 #' author: "Nora Wickelmaier"
 #' date: "`r Sys.Date()`"
 #' output: 
 #'   html_document:
 #'     number_sections: true
 #'     toc: true
 #' ---
 #+ include = FALSE
 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
 dat <- read.table("../data/event_logfiles.csv", sep = ";", header = TRUE)
 dat$date.start <- as.POSIXct(dat$date.start)
 dat$date.stop <- as.POSIXct(dat$date.stop)
 dat$artwork <- sprintf("%03d", dat$artwork)
 #' The following table shows an overview of the card indices. The indices
 #' should have values between 0 and 5. It is unclear what the numbers mean.
 table(dat$card)
 #' Number of cards for each artwork in the data set (subset from 2016)
 artworks <- sort(unique(dat$artwork))
 count <- function(x) length(table(dat[which(dat$artwork == x), "card"]))
 max_index <- function(x) max(dat[which(dat$artwork == x), "card"], na.rm = TRUE)
 num_cards <- sapply(artworks, count)
 highest_index <- sapply(artworks, max_index)
 #' Check how many XML-files for cards are present
 path <- "../data/ContentEyevisit/eyevisit_cards_light"
 num_files <- NULL
 for (artwork in artworks) {
  fnames <- dir(pattern = paste0(artwork, "_"), path = paste(path, artwork, sep = "/"))
  num_files <- c(num_files, length(fnames))
 }
 #' The table shows that each artwork has 6 cards the most (as expected).
 #' This is a subset of the data, so not all cards have been opened.
 cards <- data.frame(artwork = artworks, num_cards, highest_index,
                    num_files, diff = num_files - highest_index)
 cards
 #' There are more than 8 files for a couple of artworks:
 subset(cards, cards$num_files >= 8)
 #' It might be possible, that the number indicates the index of the file
 #' and not the actual card that was displayed. BUT: In many cases, there
 #' are only 6 (or less) files, but a higher index is present...
 subset(cards, cards$diff < 0)
--- a/code/questions/questions_programming-input.R
+++ b/code/questions/questions_programming-input.R
@ -1,105 +0,0 @@
 #' ---
 #' title: "Programming input"
 #' author: "Nora Wickelmaier"
 #' date: "`r Sys.Date()`"
 #' output: html_document
 #' ---
 #+ include = FALSE
 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
 #+
 dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";", header = TRUE)
 dat0$date <- as.POSIXct(dat0$date)  # create date object
 # Remove irrelevant events
 dat <- subset(dat0, !(dat0$event %in% c("Start Application", "Show Application")))
 str(dat)
 # make data better manageable
 tmp <- dat[!dat$event %in% c("Transform start", "Transform stop"), ]
 rownames(tmp) <- NULL
 #' # Add `trace` variable for closing events
 tmp$trace <- NA
 last_event <- tmp$event[1]
 aws <- unique(tmp$artwork)[unique(tmp$artwork) != "glossar"]
 for (art in aws) {              # select artwork
  for (i in 1:nrow(tmp)) {      # go through rows
    if (last_event == "Show Info" & tmp$artwork[i] == art) {
      tmp$trace[i] <- i
      j <- i
    } else if (last_event == "Show Front" & tmp$artwork[i] == art) {
      tmp$trace[i] <- j
    } else if (!(last_event %in% c("Show Info", "Show Front")) &
               tmp$artwork[i] == art) {
      tmp$trace[i] <- j
    }
    if (i <= nrow(tmp)) {
      last_event <- tmp$event[i + 1]
    }
  }
 }
 head(tmp[, c("artwork", "event", "trace")], 50)
 #' # Find artwork for glossar entry
 glossar_files <- unique(tmp[tmp$artwork == "glossar", "popup"])
 # Load lookup table for artworks and glossar files
 load("../data/glossar_dict.RData")
 lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ]
 # Fill in trace variable based on last `Show Info`
 for (file in lut$glossar_file) {
  artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])
  for (i in seq_len(nrow(tmp))) {
    if (tmp$event[i] == "Show Info") {
      current_artwork <- tmp[i, "artwork"]
      j <- i
      k <- i
    } else {
      current_artwork <- current_artwork
    }
    if (tmp$event[i] == "Show Front" & tmp$artwork[i] == current_artwork) {
    # make sure artwork has not been closed, yet!
      k <- i
    }
    if (tmp$artwork[i] == "glossar" &
        (current_artwork %in% artwork_list) &
        tmp$popup[i] == file & (j-k == 0)) {
      tmp[i, "trace"] <- tmp[j, "trace"]
    }
  }
 }
 tmp[tmp$artwork == "glossar", c("artwork", "event", "popup", "trace")]
 proportions(table(is.na(tmp$trace[tmp$artwork == "glossar"])))
 # --> finds about half of the glossar entries for small data set...
 # REMEMBER: It can never be 100% correct, since it is always possible that
 # several cards are open and that they link to the same glossar entry
 # How many glossar_files are only associated with one artwork?
 lut[sapply(lut$artwork, length) == 1, "glossar_file"]