Removed questions, since it is all outdated
This commit is contained in:
parent
cba441f08b
commit
37e67bfa69
@ -1,125 +0,0 @@
|
|||||||
#' ---
|
|
||||||
#' title: "Open Questions"
|
|
||||||
#' author: "Nora Wickelmaier"
|
|
||||||
#' date: "`r Sys.Date()`"
|
|
||||||
#' output:
|
|
||||||
#' html_document:
|
|
||||||
#' number_sections: true
|
|
||||||
#' toc: true
|
|
||||||
#' ---
|
|
||||||
|
|
||||||
#+ include = FALSE
|
|
||||||
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
|
|
||||||
dat <- read.table("../data/event_logfiles.csv", sep = ";", header = TRUE)
|
|
||||||
dat$date.start <- as.POSIXct(dat$date.start)
|
|
||||||
dat$date.stop <- as.POSIXct(dat$date.stop)
|
|
||||||
|
|
||||||
#' This is what the data look like after preprocessing right now
|
|
||||||
|
|
||||||
#+ include = FALSE
|
|
||||||
mat <- as.data.frame(t(sapply(dat, range, na.rm = TRUE)))
|
|
||||||
names(mat) <- c("min", "max")
|
|
||||||
mat$min <- round(as.numeric(mat$min), 1)
|
|
||||||
mat$max <- round(as.numeric(mat$max), 1)
|
|
||||||
mat$mean <- round(sapply(dat, function(x) mean(x, na.rm = TRUE)), 1)
|
|
||||||
mat$missings <- sapply(dat, function(x) sum(is.na(x)))
|
|
||||||
mat <- mat[!(rownames(mat) %in% c("eventid", "case", "trace", "event", "artwork", "card", "popup", "date.start", "date.stop")), ]
|
|
||||||
|
|
||||||
#+ echo = FALSE
|
|
||||||
knitr::kable(mat)
|
|
||||||
|
|
||||||
#' This is only the data for 2016! So only about 2 weeks in December.
|
|
||||||
|
|
||||||
# Date ranges
|
|
||||||
range(dat$date.start)
|
|
||||||
range(dat$date.stop, na.rm = TRUE)
|
|
||||||
|
|
||||||
#' # Units of x and y
|
|
||||||
#' I assume that x and y are pixel $\to$ correct?
|
|
||||||
|
|
||||||
#' But they look weird, when plotted. Is it possible that there are
|
|
||||||
#' outliers? If yes, how? Do we have the true ranges of the display?
|
|
||||||
|
|
||||||
par(mfrow = c(1, 2))
|
|
||||||
plot(y.start ~ x.start, dat)
|
|
||||||
abline(v = c(0, 3800), h = c(0, 2150), col = "blue", lwd = 2)
|
|
||||||
plot(y.stop ~ x.stop, dat)
|
|
||||||
abline(v = c(0, 3800), h = c(0, 2150), col = "blue", lwd = 2)
|
|
||||||
|
|
||||||
aggregate(cbind(x.start, x.stop, y.start, y.stop) ~ 1, dat, mean)
|
|
||||||
|
|
||||||
#' Looks like the range should be something like $x = [0, 3800]$ and
|
|
||||||
#' $y = [0, 2150]$. Do we have the starting coordinates for each artwork?
|
|
||||||
#'
|
|
||||||
|
|
||||||
#' # Unit of scale
|
|
||||||
|
|
||||||
summary(dat$scaleSize)
|
|
||||||
|
|
||||||
#' I thought it would be some kind of scaling factor, but then I would
|
|
||||||
#' have expected that `scale.start` is always 1 or something.
|
|
||||||
#'
|
|
||||||
|
|
||||||
#' # Unit of rotation
|
|
||||||
|
|
||||||
summary(dat$rotationDegree)
|
|
||||||
|
|
||||||
#' This looks pretty clear. Should be degree. Anything else to consider
|
|
||||||
#' here? I am assuming negative means left, but maybe not?
|
|
||||||
#'
|
|
||||||
|
|
||||||
#' # Meaningful unit for "case"
|
|
||||||
|
|
||||||
#' I pretty randomly chose `20 sec` based on this plot. I would love a
|
|
||||||
#' second opinion. `:)`
|
|
||||||
|
|
||||||
timediff <- as.numeric(diff(c(dat$date.start[1], dat$date.start)))
|
|
||||||
hist(timediff[timediff < 40], breaks = 50)
|
|
||||||
abline(v = 20, col = "red", lwd = 2)
|
|
||||||
|
|
||||||
#' This actually works pretty well and lets me assign `trace` values to the
|
|
||||||
#' moves. But maybe there are other ideas on how to define this?
|
|
||||||
|
|
||||||
dat[1:40, c("date.start", "case", "trace", "event", "artwork")]
|
|
||||||
|
|
||||||
|
|
||||||
#' # Problems with `time_ms`
|
|
||||||
|
|
||||||
#' What exactly happens, when `time_ms` goes down again? Why does it not go
|
|
||||||
#' down to 0?
|
|
||||||
|
|
||||||
par(mfrow = c(1, 2))
|
|
||||||
|
|
||||||
plot(dat$time_ms.start[1:100], type = "b", ylab = "time_ms", xlab = "")
|
|
||||||
points(dat$time_ms.stop[1:100], type = "b", col = rgb(1, 0, 0, .5))
|
|
||||||
legend("topleft", c("start", "stop"), lty = 1, col = c("black", "red"))
|
|
||||||
|
|
||||||
plot(dat$time_ms.stop[1:100] - dat$time_ms.start[1:100], type = "b",
|
|
||||||
ylab = "duration", col = rgb(0, 0, 1, .5))
|
|
||||||
abline(h = 0, lty = 2)
|
|
||||||
|
|
||||||
#' For the regular timestamps everything looks fine.
|
|
||||||
|
|
||||||
par(mfrow = c(1, 2))
|
|
||||||
|
|
||||||
plot(dat$date.stop[1:100], type = "b", ylab = "timestamp", xlab = "",
|
|
||||||
col = rgb(1, 0, 0, .5))
|
|
||||||
points(dat$date.start[1:100], type = "b")
|
|
||||||
legend("topleft", c("start", "stop"), lty = 1, col = c("black", "red"))
|
|
||||||
|
|
||||||
plot(dat$date.stop[1:100] - dat$date.start[1:100], type = "b",
|
|
||||||
ylab = "duration", col = rgb(0, 0, 1, .5))
|
|
||||||
abline(h = 0, lty = 2)
|
|
||||||
|
|
||||||
#+
|
|
||||||
plot(time_ms.start ~ date.start, dat[1:1000, ], type = "b")
|
|
||||||
points(time_ms.stop ~ date.stop, dat[1:1000, ], type = "b", col = rgb(1, 0, 0, .3))
|
|
||||||
|
|
||||||
#' For `time_ms.stop` this looks even weirder.
|
|
||||||
#'
|
|
||||||
|
|
||||||
#' # After which time interval does the table reset?
|
|
||||||
|
|
||||||
#' I cannot see this in the data at all. Or can I? Has this something to do
|
|
||||||
#' with the weird behavior of `time_ms`?
|
|
||||||
|
|
@ -1,58 +0,0 @@
|
|||||||
#' ---
|
|
||||||
#' title: "Open Questions -- Card indices"
|
|
||||||
#' author: "Nora Wickelmaier"
|
|
||||||
#' date: "`r Sys.Date()`"
|
|
||||||
#' output:
|
|
||||||
#' html_document:
|
|
||||||
#' number_sections: true
|
|
||||||
#' toc: true
|
|
||||||
#' ---
|
|
||||||
|
|
||||||
#+ include = FALSE
|
|
||||||
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
|
|
||||||
dat <- read.table("../data/event_logfiles.csv", sep = ";", header = TRUE)
|
|
||||||
dat$date.start <- as.POSIXct(dat$date.start)
|
|
||||||
dat$date.stop <- as.POSIXct(dat$date.stop)
|
|
||||||
dat$artwork <- sprintf("%03d", dat$artwork)
|
|
||||||
|
|
||||||
#' The following table shows an overview of the card indices. The indices
|
|
||||||
#' should have values between 0 and 5. It is unclear what the numbers mean.
|
|
||||||
|
|
||||||
table(dat$card)
|
|
||||||
|
|
||||||
#' Number of cards for each artwork in the data set (subset from 2016)
|
|
||||||
|
|
||||||
artworks <- sort(unique(dat$artwork))
|
|
||||||
|
|
||||||
count <- function(x) length(table(dat[which(dat$artwork == x), "card"]))
|
|
||||||
max_index <- function(x) max(dat[which(dat$artwork == x), "card"], na.rm = TRUE)
|
|
||||||
num_cards <- sapply(artworks, count)
|
|
||||||
highest_index <- sapply(artworks, max_index)
|
|
||||||
|
|
||||||
#' Check how many XML-files for cards are present
|
|
||||||
|
|
||||||
path <- "../data/ContentEyevisit/eyevisit_cards_light"
|
|
||||||
|
|
||||||
num_files <- NULL
|
|
||||||
for (artwork in artworks) {
|
|
||||||
fnames <- dir(pattern = paste0(artwork, "_"), path = paste(path, artwork, sep = "/"))
|
|
||||||
num_files <- c(num_files, length(fnames))
|
|
||||||
}
|
|
||||||
|
|
||||||
#' The table shows that each artwork has 6 cards the most (as expected).
|
|
||||||
#' This is a subset of the data, so not all cards have been opened.
|
|
||||||
|
|
||||||
cards <- data.frame(artwork = artworks, num_cards, highest_index,
|
|
||||||
num_files, diff = num_files - highest_index)
|
|
||||||
cards
|
|
||||||
|
|
||||||
#' There are more than 8 files for a couple of artworks:
|
|
||||||
|
|
||||||
subset(cards, cards$num_files >= 8)
|
|
||||||
|
|
||||||
#' It might be possible, that the number indicates the index of the file
|
|
||||||
#' and not the actual card that was displayed. BUT: In many cases, there
|
|
||||||
#' are only 6 (or less) files, but a higher index is present...
|
|
||||||
|
|
||||||
subset(cards, cards$diff < 0)
|
|
||||||
|
|
@ -1,105 +0,0 @@
|
|||||||
#' ---
|
|
||||||
#' title: "Programming input"
|
|
||||||
#' author: "Nora Wickelmaier"
|
|
||||||
#' date: "`r Sys.Date()`"
|
|
||||||
#' output: html_document
|
|
||||||
#' ---
|
|
||||||
|
|
||||||
#+ include = FALSE
|
|
||||||
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
|
|
||||||
|
|
||||||
#+
|
|
||||||
dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";", header = TRUE)
|
|
||||||
dat0$date <- as.POSIXct(dat0$date) # create date object
|
|
||||||
|
|
||||||
# Remove irrelevant events
|
|
||||||
dat <- subset(dat0, !(dat0$event %in% c("Start Application", "Show Application")))
|
|
||||||
str(dat)
|
|
||||||
|
|
||||||
# make data better manageable
|
|
||||||
tmp <- dat[!dat$event %in% c("Transform start", "Transform stop"), ]
|
|
||||||
rownames(tmp) <- NULL
|
|
||||||
|
|
||||||
#' # Add `trace` variable for closing events
|
|
||||||
|
|
||||||
tmp$trace <- NA
|
|
||||||
last_event <- tmp$event[1]
|
|
||||||
aws <- unique(tmp$artwork)[unique(tmp$artwork) != "glossar"]
|
|
||||||
|
|
||||||
for (art in aws) { # select artwork
|
|
||||||
|
|
||||||
for (i in 1:nrow(tmp)) { # go through rows
|
|
||||||
|
|
||||||
if (last_event == "Show Info" & tmp$artwork[i] == art) {
|
|
||||||
tmp$trace[i] <- i
|
|
||||||
j <- i
|
|
||||||
|
|
||||||
} else if (last_event == "Show Front" & tmp$artwork[i] == art) {
|
|
||||||
tmp$trace[i] <- j
|
|
||||||
|
|
||||||
} else if (!(last_event %in% c("Show Info", "Show Front")) &
|
|
||||||
tmp$artwork[i] == art) {
|
|
||||||
tmp$trace[i] <- j
|
|
||||||
}
|
|
||||||
|
|
||||||
if (i <= nrow(tmp)) {
|
|
||||||
last_event <- tmp$event[i + 1]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
head(tmp[, c("artwork", "event", "trace")], 50)
|
|
||||||
|
|
||||||
#' # Find artwork for glossar entry
|
|
||||||
|
|
||||||
glossar_files <- unique(tmp[tmp$artwork == "glossar", "popup"])
|
|
||||||
|
|
||||||
# Load lookup table for artworks and glossar files
|
|
||||||
load("../data/glossar_dict.RData")
|
|
||||||
lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ]
|
|
||||||
|
|
||||||
# Fill in trace variable based on last `Show Info`
|
|
||||||
for (file in lut$glossar_file) {
|
|
||||||
|
|
||||||
artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])
|
|
||||||
|
|
||||||
for (i in seq_len(nrow(tmp))) {
|
|
||||||
|
|
||||||
if (tmp$event[i] == "Show Info") {
|
|
||||||
|
|
||||||
current_artwork <- tmp[i, "artwork"]
|
|
||||||
j <- i
|
|
||||||
k <- i
|
|
||||||
|
|
||||||
} else {
|
|
||||||
|
|
||||||
current_artwork <- current_artwork
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
if (tmp$event[i] == "Show Front" & tmp$artwork[i] == current_artwork) {
|
|
||||||
# make sure artwork has not been closed, yet!
|
|
||||||
k <- i
|
|
||||||
}
|
|
||||||
|
|
||||||
if (tmp$artwork[i] == "glossar" &
|
|
||||||
(current_artwork %in% artwork_list) &
|
|
||||||
tmp$popup[i] == file & (j-k == 0)) {
|
|
||||||
|
|
||||||
tmp[i, "trace"] <- tmp[j, "trace"]
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
tmp[tmp$artwork == "glossar", c("artwork", "event", "popup", "trace")]
|
|
||||||
|
|
||||||
proportions(table(is.na(tmp$trace[tmp$artwork == "glossar"])))
|
|
||||||
# --> finds about half of the glossar entries for small data set...
|
|
||||||
|
|
||||||
# REMEMBER: It can never be 100% correct, since it is always possible that
|
|
||||||
# several cards are open and that they link to the same glossar entry
|
|
||||||
|
|
||||||
# How many glossar_files are only associated with one artwork?
|
|
||||||
lut[sapply(lut$artwork, length) == 1, "glossar_file"]
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user