mtt_haum/code/questions/questions_data-inconsistenc...

126 lines
3.8 KiB
R

#' ---
#' title: "Open Questions"
#' author: "Nora Wickelmaier"
#' date: "`r Sys.Date()`"
#' output:
#'   html_document:
#'     number_sections: true
#'     toc: true
#' ---
#+ include = FALSE
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
# Read the event log (semicolon-separated, first row holds the column names).
dat <- read.table("../data/event_logfiles.csv", sep = ";", header = TRUE)
# Convert both timestamp columns to POSIXct in one go.
stamp_cols <- c("date.start", "date.stop")
dat[stamp_cols] <- lapply(dat[stamp_cols], as.POSIXct)
#' This is what the data look like after preprocessing right now:
#+ include = FALSE
# Summary table with min, max, mean and number of NAs per column.
# The columns listed in `skip_cols` are not reported, so they are removed
# *before* any statistic is computed; nothing is calculated (and no coercion
# warning is raised) for columns that never make it into the table.
skip_cols <- c("eventid", "case", "trace", "event", "artwork", "card",
               "popup", "date.start", "date.stop")
measures <- dat[!(names(dat) %in% skip_cols)]
# vapply() pins the result type for every column; as.numeric() turns the
# range of a non-numeric column into NA instead of forcing the whole result
# to character.
col_range <- vapply(measures,
                    function(x) as.numeric(range(x, na.rm = TRUE)),
                    numeric(2))
mat <- data.frame(
  min = round(col_range[1, ], 1),
  max = round(col_range[2, ], 1),
  mean = round(vapply(measures, function(x) mean(x, na.rm = TRUE),
                      numeric(1)), 1),
  missings = vapply(measures, function(x) sum(is.na(x)), integer(1)),
  row.names = names(measures)
)
#+ echo = FALSE
knitr::kable(mat)
#' This is only the data for 2016! So only about 2 weeks in December.
# Earliest and latest timestamp of each date column
# (NAs are only removed for date.stop)
with(dat, range(date.start))
with(dat, range(date.stop, na.rm = TRUE))
#' # Units of x and y
#' I assume that x and y are in pixels $\to$ correct?
#' But they look weird when plotted. Is it possible that there are
#' outliers? If yes, how? Do we have the true ranges of the display?
# Start and stop coordinates side by side. The blue lines mark the
# rectangle x = 0..3800, y = 0..2150 in both panels.
mark_frame <- function() {
  abline(v = c(0, 3800), h = c(0, 2150), col = "blue", lwd = 2)
}
par(mfrow = c(1, 2))
plot(y.start ~ x.start, data = dat)
mark_frame()
plot(y.stop ~ x.stop, data = dat)
mark_frame()
# Column means of the four coordinates; the formula interface of aggregate()
# drops rows with an NA in any of them by default.
aggregate(cbind(x.start, x.stop, y.start, y.stop) ~ 1, data = dat, FUN = mean)
#' Looks like the range should be something like $x \in [0, 3800]$ and
#' $y \in [0, 2150]$. Do we have the starting coordinates for each artwork?
#'
#' # Unit of scale
# Five-number summary (plus mean) of the scaleSize column
summary(dat[["scaleSize"]])
#' I thought it would be some kind of scaling factor, but then I would
#' have expected that `scale.start` is always 1 or something.
#'
#' # Unit of rotation
# Five-number summary (plus mean) of the rotationDegree column
summary(dat[["rotationDegree"]])
#' This looks pretty clear. Should be degrees. Anything else to consider
#' here? I am assuming negative means left, but maybe not?
#'
#' # Meaningful unit for "case"
#' I pretty randomly chose `20 sec` based on this plot. I would love a
#' second opinion. `:)`
# Gap between the start of each event and the start of the previous one.
# The first timestamp is prepended so that there is one value per row
# (the first gap is therefore 0).
# The unit is requested explicitly: a difftime otherwise chooses its unit
# from the data, while the thresholds below (40 and 20) are meant as seconds.
timediff <- as.numeric(diff(c(dat$date.start[1], dat$date.start)),
                       units = "secs")
# Only gaps below 40 s are shown; the red line marks the 20 s cut-off.
hist(timediff[timediff < 40], breaks = 50)
abline(v = 20, col = "red", lwd = 2)
#' This actually works pretty well and lets me assign `trace` values to the
#' moves. But maybe there are other ideas on how to define this?
# First 40 rows, restricted to the start timestamp and the
# case / trace / event / artwork columns
preview_cols <- c("date.start", "case", "trace", "event", "artwork")
dat[seq_len(40), preview_cols]
#' # Problems with `time_ms`
#' What exactly happens when `time_ms` goes down again? Why does it not go
#' down to 0?
# time_ms for the first 100 rows: raw start/stop values in the left panel,
# stop minus start in the right panel (dashed line at 0).
ms_start <- dat$time_ms.start[1:100]
ms_stop <- dat$time_ms.stop[1:100]
par(mfrow = c(1, 2))
plot(ms_start, type = "b", ylab = "time_ms", xlab = "")
points(ms_stop, type = "b", col = rgb(1, 0, 0, .5))
legend("topleft", c("start", "stop"), lty = 1, col = c("black", "red"))
plot(ms_stop - ms_start, type = "b", ylab = "duration",
     col = rgb(0, 0, 1, .5))
abline(h = 0, lty = 2)
#' For the regular timestamps everything looks fine.
# Same two panels as for time_ms, but based on the POSIXct timestamps of the
# first 100 rows: raw start/stop values on the left, stop minus start on the
# right (dashed line at 0).
ts_start <- dat$date.start[1:100]
ts_stop <- dat$date.stop[1:100]
par(mfrow = c(1, 2))
plot(ts_stop, type = "b", ylab = "timestamp", xlab = "",
     col = rgb(1, 0, 0, .5))
points(ts_start, type = "b")
legend("topleft", c("start", "stop"), lty = 1, col = c("black", "red"))
# difftime() with a fixed unit: plain subtraction of POSIXct values lets R
# pick secs/mins/hours from the data, which would silently change the y axis.
plot(difftime(ts_stop, ts_start, units = "secs"), type = "b",
     ylab = "duration", col = rgb(0, 0, 1, .5))
abline(h = 0, lty = 2)
#+
# time_ms against the timestamp for the first 1000 rows
# (start values in black, stop values in transparent red)
first_1000 <- dat[1:1000, ]
plot(time_ms.start ~ date.start, data = first_1000, type = "b")
points(time_ms.stop ~ date.stop, data = first_1000, type = "b",
       col = rgb(1, 0, 0, .3))
#' For `time_ms.stop` this looks even weirder.
#'
#' # After which time interval does the table reset?
#' I cannot see this in the data at all. Or can I? Has this something to do
#' with the weird behavior of `time_ms`?