Removed questions, since it is all outdated

This commit is contained in:
Nora Wickelmaier 2024-03-22 13:38:37 +01:00
parent cba441f08b
commit 37e67bfa69
3 changed files with 0 additions and 288 deletions

View File

@ -1,125 +0,0 @@
#' ---
#' title: "Open Questions"
#' author: "Nora Wickelmaier"
#' date: "`r Sys.Date()`"
#' output:
#' html_document:
#' number_sections: true
#' toc: true
#' ---
#+ include = FALSE
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
dat <- read.table("../data/event_logfiles.csv", sep = ";", header = TRUE)
dat$date.start <- as.POSIXct(dat$date.start)
dat$date.stop <- as.POSIXct(dat$date.stop)
#' This is what the data look like after preprocessing right now
#+ include = FALSE
mat <- as.data.frame(t(sapply(dat, range, na.rm = TRUE)))
names(mat) <- c("min", "max")
mat$min <- round(as.numeric(mat$min), 1)
mat$max <- round(as.numeric(mat$max), 1)
mat$mean <- round(sapply(dat, function(x) mean(x, na.rm = TRUE)), 1)
mat$missings <- sapply(dat, function(x) sum(is.na(x)))
mat <- mat[!(rownames(mat) %in% c("eventid", "case", "trace", "event", "artwork", "card", "popup", "date.start", "date.stop")), ]
#+ echo = FALSE
knitr::kable(mat)
#' This is only the data for 2016! So only about 2 weeks in December.
# Date ranges
range(dat$date.start)
range(dat$date.stop, na.rm = TRUE)
#' # Units of x and y
#' I assume that x and y are pixel $\to$ correct?
#' But they look weird, when plotted. Is it possible that there are
#' outliers? If yes, how? Do we have the true ranges of the display?
par(mfrow = c(1, 2))
plot(y.start ~ x.start, dat)
abline(v = c(0, 3800), h = c(0, 2150), col = "blue", lwd = 2)
plot(y.stop ~ x.stop, dat)
abline(v = c(0, 3800), h = c(0, 2150), col = "blue", lwd = 2)
aggregate(cbind(x.start, x.stop, y.start, y.stop) ~ 1, dat, mean)
#' Looks like the range should be something like $x = [0, 3800]$ and
#' $y = [0, 2150]$. Do we have the starting coordinates for each artwork?
#'
#' # Unit of scale
summary(dat$scaleSize)
#' I thought it would be some kind of scaling factor, but then I would
#' have expected that `scale.start` is always 1 or something.
#'
#' # Unit of rotation
summary(dat$rotationDegree)
#' This looks pretty clear. Should be degree. Anything else to consider
#' here? I am assuming negative means left, but maybe not?
#'
#' # Meaningful unit for "case"
#' I pretty randomly chose `20 sec` based on this plot. I would love a
#' second opinion. `:)`
timediff <- as.numeric(diff(c(dat$date.start[1], dat$date.start)))
hist(timediff[timediff < 40], breaks = 50)
abline(v = 20, col = "red", lwd = 2)
#' This actually works pretty well and lets me assign `trace` values to the
#' moves. But maybe there are other ideas on how to define this?
dat[1:40, c("date.start", "case", "trace", "event", "artwork")]
#' # Problems with `time_ms`
#' What exactly happens, when `time_ms` goes down again? Why does it not go
#' down to 0?
par(mfrow = c(1, 2))
plot(dat$time_ms.start[1:100], type = "b", ylab = "time_ms", xlab = "")
points(dat$time_ms.stop[1:100], type = "b", col = rgb(1, 0, 0, .5))
legend("topleft", c("start", "stop"), lty = 1, col = c("black", "red"))
plot(dat$time_ms.stop[1:100] - dat$time_ms.start[1:100], type = "b",
ylab = "duration", col = rgb(0, 0, 1, .5))
abline(h = 0, lty = 2)
#' For the regular timestamps everything looks fine.
par(mfrow = c(1, 2))
plot(dat$date.stop[1:100], type = "b", ylab = "timestamp", xlab = "",
col = rgb(1, 0, 0, .5))
points(dat$date.start[1:100], type = "b")
legend("topleft", c("start", "stop"), lty = 1, col = c("black", "red"))
plot(dat$date.stop[1:100] - dat$date.start[1:100], type = "b",
ylab = "duration", col = rgb(0, 0, 1, .5))
abline(h = 0, lty = 2)
#+
plot(time_ms.start ~ date.start, dat[1:1000, ], type = "b")
points(time_ms.stop ~ date.stop, dat[1:1000, ], type = "b", col = rgb(1, 0, 0, .3))
#' For `time_ms.stop` this looks even weirder.
#'
#' # After which time interval does the table reset?
#' I cannot see this in the data at all. Or can I? Has this something to do
#' with the weird behavior of `time_ms`?

View File

@ -1,58 +0,0 @@
#' ---
#' title: "Open Questions -- Card indices"
#' author: "Nora Wickelmaier"
#' date: "`r Sys.Date()`"
#' output:
#' html_document:
#' number_sections: true
#' toc: true
#' ---
#+ include = FALSE
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
dat <- read.table("../data/event_logfiles.csv", sep = ";", header = TRUE)
dat$date.start <- as.POSIXct(dat$date.start)
dat$date.stop <- as.POSIXct(dat$date.stop)
dat$artwork <- sprintf("%03d", dat$artwork)
#' The following table shows an overview of the card indices. The indices
#' should have values between 0 and 5. It is unclear what the numbers mean.
table(dat$card)
#' Number of cards for each artwork in the data set (subset from 2016)
artworks <- sort(unique(dat$artwork))
count <- function(x) length(table(dat[which(dat$artwork == x), "card"]))
max_index <- function(x) max(dat[which(dat$artwork == x), "card"], na.rm = TRUE)
num_cards <- sapply(artworks, count)
highest_index <- sapply(artworks, max_index)
#' Check how many XML-files for cards are present
path <- "../data/ContentEyevisit/eyevisit_cards_light"
num_files <- NULL
for (artwork in artworks) {
fnames <- dir(pattern = paste0(artwork, "_"), path = paste(path, artwork, sep = "/"))
num_files <- c(num_files, length(fnames))
}
#' The table shows that each artwork has 6 cards the most (as expected).
#' This is a subset of the data, so not all cards have been opened.
cards <- data.frame(artwork = artworks, num_cards, highest_index,
num_files, diff = num_files - highest_index)
cards
#' There are more than 8 files for a couple of artworks:
subset(cards, cards$num_files >= 8)
#' It might be possible, that the number indicates the index of the file
#' and not the actual card that was displayed. BUT: In many cases, there
#' are only 6 (or less) files, but a higher index is present...
subset(cards, cards$diff < 0)

View File

@ -1,105 +0,0 @@
#' ---
#' title: "Programming input"
#' author: "Nora Wickelmaier"
#' date: "`r Sys.Date()`"
#' output: html_document
#' ---
#+ include = FALSE
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
#+
dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";", header = TRUE)
dat0$date <- as.POSIXct(dat0$date) # create date object
# Remove irrelevant events
dat <- subset(dat0, !(dat0$event %in% c("Start Application", "Show Application")))
str(dat)
# make data better manageable
tmp <- dat[!dat$event %in% c("Transform start", "Transform stop"), ]
rownames(tmp) <- NULL
#' # Add `trace` variable for closing events
tmp$trace <- NA
last_event <- tmp$event[1]
aws <- unique(tmp$artwork)[unique(tmp$artwork) != "glossar"]
for (art in aws) { # select artwork
for (i in 1:nrow(tmp)) { # go through rows
if (last_event == "Show Info" & tmp$artwork[i] == art) {
tmp$trace[i] <- i
j <- i
} else if (last_event == "Show Front" & tmp$artwork[i] == art) {
tmp$trace[i] <- j
} else if (!(last_event %in% c("Show Info", "Show Front")) &
tmp$artwork[i] == art) {
tmp$trace[i] <- j
}
if (i <= nrow(tmp)) {
last_event <- tmp$event[i + 1]
}
}
}
head(tmp[, c("artwork", "event", "trace")], 50)
#' # Find artwork for glossar entry
glossar_files <- unique(tmp[tmp$artwork == "glossar", "popup"])
# Load lookup table for artworks and glossar files
load("../data/glossar_dict.RData")
lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ]
# Fill in trace variable based on last `Show Info`
for (file in lut$glossar_file) {
artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])
for (i in seq_len(nrow(tmp))) {
if (tmp$event[i] == "Show Info") {
current_artwork <- tmp[i, "artwork"]
j <- i
k <- i
} else {
current_artwork <- current_artwork
}
if (tmp$event[i] == "Show Front" & tmp$artwork[i] == current_artwork) {
# make sure artwork has not been closed, yet!
k <- i
}
if (tmp$artwork[i] == "glossar" &
(current_artwork %in% artwork_list) &
tmp$popup[i] == file & (j-k == 0)) {
tmp[i, "trace"] <- tmp[j, "trace"]
}
}
}
tmp[tmp$artwork == "glossar", c("artwork", "event", "popup", "trace")]
proportions(table(is.na(tmp$trace[tmp$artwork == "glossar"])))
# --> finds about half of the glossar entries for small data set...
# REMEMBER: It can never be 100% correct, since it is always possible that
# several cards are open and that they link to the same glossar entry
# How many glossar_files are only associated with one artwork?
lut[sapply(lut$artwork, length) == 1, "glossar_file"]