2023-06-26 10:30:07 +02:00
|
|
|
#' ---
|
|
|
|
#' title: "Preprocessing log files"
|
|
|
|
#' author: "Nora Wickelmaier"
|
|
|
|
#' date: "`r Sys.Date()`"
|
2023-07-20 17:06:28 +02:00
|
|
|
#' output:
|
2023-06-26 10:30:07 +02:00
|
|
|
#' html_document:
|
|
|
|
#' toc: true
|
|
|
|
#' toc_float: true
|
|
|
|
#' pdf_document:
|
|
|
|
#' toc: true
|
|
|
|
#' number_sections: true
|
|
|
|
#' geometry: margin = 2.5cm
|
|
|
|
#' ---
|
|
|
|
|
|
|
|
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
|
|
|
|
|
|
|
|
#' # Read data
|
|
|
|
|
2023-07-20 17:06:28 +02:00
|
|
|
dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";",
|
|
|
|
header = TRUE)
|
|
|
|
dat0$date <- as.POSIXct(dat0$date) # create date object
|
2023-08-28 17:29:56 +02:00
|
|
|
dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
|
2023-07-20 17:06:28 +02:00
|
|
|
|
2023-06-26 10:30:07 +02:00
|
|
|
#' # Remove irrelevant events
|
|
|
|
|
|
|
|
#' ## Remove Start Application and Show Application
|
|
|
|
|
2023-07-20 17:06:28 +02:00
|
|
|
dat <- subset(dat0, !(dat0$event %in% c("Start Application",
|
|
|
|
"Show Application")))
|
2023-08-28 17:29:56 +02:00
|
|
|
rownames(dat) <- NULL
|
2023-07-20 17:06:28 +02:00
|
|
|
|
|
|
|
#' # Close events
|
|
|
|
|
2023-08-02 18:24:16 +02:00
|
|
|
########
|
|
|
|
|
|
|
|
#' Do it for Transform events first
|
2023-08-18 13:42:18 +02:00
|
|
|
dat1 <- dat[dat$event %in% c("Transform start", "Transform stop"), ]
|
|
|
|
dat1 <- dat1[order(dat1$artwork, dat1$date), ]
|
|
|
|
rownames(dat1) <- NULL
|
2023-07-20 17:06:28 +02:00
|
|
|
|
2023-08-02 18:24:16 +02:00
|
|
|
# Create event ID for closing events
|
2023-08-18 13:42:18 +02:00
|
|
|
num_start <- diff(c(0, which(dat1$event == "Transform stop")))
|
|
|
|
dat1$eventid <- rep(seq_along(num_start), num_start)
|
|
|
|
head(dat1[, c("event", "eventid")], 25)
|
2023-07-20 17:06:28 +02:00
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
table(table(dat1$eventid))
|
2023-07-20 17:06:28 +02:00
|
|
|
# 1 2 3 4 5 6 7 8 10 11
|
2023-08-28 17:29:56 +02:00
|
|
|
# 70 78435 5153 842 222 66 18 14 3 1
|
2023-07-20 17:06:28 +02:00
|
|
|
# --> compare to table(num_start)!
|
|
|
|
|
|
|
|
# Find out how often "Transform stop" follows each other
|
2023-08-18 13:42:18 +02:00
|
|
|
num_stop <- c(diff(c(0, which(dat1$event == "Transform start"))))
|
2023-07-20 17:06:28 +02:00
|
|
|
table(num_stop)
|
|
|
|
|
2023-08-31 16:12:34 +02:00
|
|
|
# TODO: Do I still need this?
|
2023-08-18 13:42:18 +02:00
|
|
|
dat1$eventrep <- rep(num_start, num_start)
|
|
|
|
dat1$dupl <- duplicated(dat1[, c("event", "eventid")]) # keep first
|
|
|
|
dat1$dupl <- duplicated(dat1[, c("event", "eventid")], fromLast = TRUE) # keep last
|
|
|
|
dat1[dat1$eventrep == 10, ]
|
|
|
|
dat1$dupl <- NULL
|
|
|
|
dat1$eventrep <- NULL
|
2023-07-20 17:06:28 +02:00
|
|
|
|
|
|
|
|
|
|
|
# remove duplicated "Transform start" events
|
2023-08-18 13:42:18 +02:00
|
|
|
dat1 <- dat1[!duplicated(dat1[, c("event", "eventid")]), ]
|
2023-07-20 17:06:28 +02:00
|
|
|
|
|
|
|
# remove duplicated "Transform stop" events
|
2023-08-18 13:42:18 +02:00
|
|
|
id_stop <- which(dat1$event == "Transform stop")
|
2023-07-20 17:06:28 +02:00
|
|
|
id_rm_stop <- id_stop[diff(id_stop) == 1]
|
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
dat1 <- dat1[-(id_rm_stop + 1), ]
|
2023-07-20 17:06:28 +02:00
|
|
|
|
|
|
|
# transform to wide data format
|
2023-08-28 17:29:56 +02:00
|
|
|
dat1$time <- ifelse(dat1$event == "Transform start", "start", "stop")
|
2023-07-20 17:06:28 +02:00
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
trans_wide <- reshape(dat1, direction = "wide",
|
2023-08-28 17:29:56 +02:00
|
|
|
idvar = c("eventid", "artwork", "glossar"),
|
|
|
|
timevar = "time",
|
|
|
|
drop = c("popup", "card", "event")
|
2023-07-20 17:06:28 +02:00
|
|
|
)
|
2023-08-11 08:35:41 +02:00
|
|
|
# TODO: This runs for quite some time
|
|
|
|
# --> Is this more efficient with tidyr::pivot_wider?
|
2023-07-20 17:06:28 +02:00
|
|
|
|
2023-08-28 17:29:56 +02:00
|
|
|
# --> when fileid is part of the reshape, it does not work correctly, since
|
|
|
|
# we sometimes have a start - stop event that is recorded in two separate
|
|
|
|
# log files, BUT: after finding out, that `time_ms` changes for each log
|
|
|
|
# file, I want to exclude those cases, so `fileid` has to be included!!!
|
|
|
|
|
|
|
|
# check how often an eventid is associated with two fileids
|
|
|
|
nrow(subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop))
|
|
|
|
|
|
|
|
# exclude from data set ??
|
|
|
|
# trans_wide <- subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop)
|
|
|
|
|
2023-07-20 17:06:28 +02:00
|
|
|
# which(is.na(trans_wide$date.start))
|
|
|
|
|
2023-08-02 18:24:16 +02:00
|
|
|
trans_wide$event <- "move"
|
|
|
|
trans_wide$eventid <- NULL
|
|
|
|
|
|
|
|
rownames(trans_wide) <- NULL
|
|
|
|
|
2023-07-20 17:06:28 +02:00
|
|
|
trans_wide$duration <- trans_wide$time_ms.stop - trans_wide$time_ms.start
|
2023-08-18 13:42:18 +02:00
|
|
|
#trans_wide$duration <- trans_wide$date.stop - trans_wide$date.start
|
2023-08-02 18:24:16 +02:00
|
|
|
# only seconds - not fine grained enough
|
|
|
|
trans_wide$distance <- apply(
|
|
|
|
trans_wide[, c("x.start", "y.start", "x.stop", "y.stop")], 1,
|
|
|
|
function(x) dist(matrix(x, 2, 2, byrow = TRUE)))
|
|
|
|
trans_wide$rotationDegree <- trans_wide$rotation.stop -
|
|
|
|
trans_wide$rotation.start
|
2023-08-18 13:42:18 +02:00
|
|
|
trans_wide$scaleSize <- trans_wide$scale.stop / trans_wide$scale.start
|
2023-07-20 17:06:28 +02:00
|
|
|
|
2023-08-14 16:57:03 +02:00
|
|
|
trans_wide$trace <- NA
|
|
|
|
trans_wide$card <- NA
|
|
|
|
trans_wide$popup <- NA
|
|
|
|
|
|
|
|
dat_trans <- trans_wide[trans_wide$distance != 0 &
|
2023-08-02 18:24:16 +02:00
|
|
|
trans_wide$rotationDegree != 0 &
|
2023-08-18 13:42:18 +02:00
|
|
|
trans_wide$scaleSize != 1,
|
2023-08-28 17:29:56 +02:00
|
|
|
c("fileid.start", "fileid.stop", "event", "artwork",
|
|
|
|
"trace", "glossar", "date.start", "date.stop",
|
2023-08-11 08:35:41 +02:00
|
|
|
"time_ms.start", "time_ms.stop", "duration",
|
2023-08-28 17:29:56 +02:00
|
|
|
"card", "popup", "x.start", "y.start", "x.stop",
|
|
|
|
"y.stop", "distance", "scale.start", "scale.stop",
|
2023-08-11 08:35:41 +02:00
|
|
|
"scaleSize", "rotation.start", "rotation.stop",
|
|
|
|
"rotationDegree")]
|
2023-08-28 17:29:56 +02:00
|
|
|
1 - nrow(dat_trans) / nrow(trans_wide)
|
2023-07-20 17:06:28 +02:00
|
|
|
# removes almost 2/3 of the data (for small data set)
|
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
rm(id_rm_stop, id_stop, trans_wide, num_start, num_stop)
|
2023-07-20 17:06:28 +02:00
|
|
|
|
2023-08-02 18:24:16 +02:00
|
|
|
summary(dat_trans)
|
2023-07-20 17:06:28 +02:00
|
|
|
|
|
|
|
|
2023-08-02 18:24:16 +02:00
|
|
|
#' # Close other events
|
2023-07-20 17:06:28 +02:00
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
dat2 <- dat[!dat$event %in% c("Transform start", "Transform stop"), ]
|
|
|
|
rownames(dat2) <- NULL
|
2023-07-20 17:06:28 +02:00
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
dat2$trace <- NA
|
|
|
|
last_event <- dat2$event[1]
|
|
|
|
aws <- unique(dat2$artwork)[unique(dat2$artwork) != "glossar"]
|
2023-08-02 18:24:16 +02:00
|
|
|
#
|
2023-08-31 16:12:34 +02:00
|
|
|
for (art in aws) { # select artwork
|
2023-08-02 18:24:16 +02:00
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
for (i in 1:nrow(dat2)) { # go through rows
|
2023-07-20 17:06:28 +02:00
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
if (last_event == "Show Info" & dat2$artwork[i] == art) {
|
|
|
|
dat2$trace[i] <- i
|
2023-08-02 18:24:16 +02:00
|
|
|
j <- i
|
2023-07-20 17:06:28 +02:00
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
} else if (last_event == "Show Front" & dat2$artwork[i] == art) {
|
|
|
|
dat2$trace[i] <- j
|
2023-07-20 17:06:28 +02:00
|
|
|
|
2023-08-02 18:24:16 +02:00
|
|
|
} else if (!(last_event %in% c("Show Info", "Show Front")) &
|
2023-08-18 13:42:18 +02:00
|
|
|
dat2$artwork[i] == art) {
|
|
|
|
dat2$trace[i] <- j
|
2023-08-02 18:24:16 +02:00
|
|
|
}
|
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
if (i <= nrow(dat2)) {
|
|
|
|
last_event <- dat2$event[i + 1]
|
2023-07-20 17:06:28 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
head(dat2[, c("artwork", "event", "trace")], 50)
|
|
|
|
tail(dat2[, c("artwork", "event", "trace")], 50)
|
2023-08-02 18:24:16 +02:00
|
|
|
|
2023-08-11 08:35:41 +02:00
|
|
|
rm(aws, i, j, last_event, art)
|
2023-08-02 18:24:16 +02:00
|
|
|
|
2023-08-31 16:12:34 +02:00
|
|
|
#' ## Fix glossar entries (find corresponding artworks)
|
2023-08-02 18:24:16 +02:00
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
glossar_files <- unique(dat2[dat2$artwork == "glossar", "popup"])
|
2023-08-02 18:24:16 +02:00
|
|
|
|
|
|
|
# load lookup table for artworks and glossar files
|
|
|
|
load("../data/glossar_dict.RData")
|
|
|
|
lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ]
|
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
head(dat2[, c("artwork", "event", "popup", "trace")], 20)
|
2023-08-02 18:24:16 +02:00
|
|
|
|
2023-07-20 17:06:28 +02:00
|
|
|
|
2023-08-02 18:24:16 +02:00
|
|
|
for (file in lut$glossar_file) {
|
|
|
|
|
|
|
|
artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])
|
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
for (i in seq_len(nrow(dat2))) {
|
2023-08-02 18:24:16 +02:00
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
if (dat2$event[i] == "Show Info") {
|
2023-08-02 18:24:16 +02:00
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
current_artwork <- dat2[i, "artwork"]
|
2023-08-02 18:24:16 +02:00
|
|
|
j <- i
|
|
|
|
k <- i
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
current_artwork <- current_artwork
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
if (dat2$event[i] == "Show Front" & dat2$artwork[i] == current_artwork) {
|
2023-08-02 18:24:16 +02:00
|
|
|
# make sure artwork has not been closed, yet!
|
|
|
|
k <- i
|
|
|
|
}
|
2023-07-20 17:06:28 +02:00
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
if (dat2$artwork[i] == "glossar" &
|
2023-08-02 18:24:16 +02:00
|
|
|
(current_artwork %in% artwork_list) &
|
2023-08-18 13:42:18 +02:00
|
|
|
dat2$popup[i] == file & (j-k == 0)) {
|
2023-08-02 18:24:16 +02:00
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
dat2[i, "trace"] <- dat2[j, "trace"]
|
2023-08-28 17:29:56 +02:00
|
|
|
dat2[i, "artwork"] <- current_artwork
|
2023-08-02 18:24:16 +02:00
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-08-28 17:29:56 +02:00
|
|
|
# --> finds about half of the glossar entries for the small data set...
|
|
|
|
table(is.na(dat2[dat2$glossar == 1, "trace"]))
|
2023-08-02 18:24:16 +02:00
|
|
|
|
|
|
|
# REMEMBER: It can never bo 100% correct, since it is always possible that
|
2023-08-11 08:35:41 +02:00
|
|
|
# several cards are open and that they link to the same glossar entry
|
2023-08-02 18:24:16 +02:00
|
|
|
|
|
|
|
# How many glossar_files are only associated with one artwork?
|
|
|
|
lut[sapply(lut$artwork, length) == 1, "glossar_file"]
|
|
|
|
|
2023-08-28 17:29:56 +02:00
|
|
|
single <- lut[sapply(lut$artwork, length) == 1, "glossar_file"]
|
|
|
|
tmp <- subset(dat2, is.na(dat2$trace))$popup
|
|
|
|
inside <- unique(tmp[tmp %in% lut[sapply(lut$artwork, length) == 1, "glossar_file"]])
|
|
|
|
single_art <- unlist(lut[lut$glossar_file %in% inside, "artwork"])
|
|
|
|
tmp_lut <- data.frame(glossar_file = sort(inside), artwork = single_art)
|
2023-08-02 18:24:16 +02:00
|
|
|
|
|
|
|
|
2023-08-28 17:29:56 +02:00
|
|
|
dat2[dat2$glossar == 1, c("artwork", "popup", "glossar", "trace")]
|
|
|
|
|
|
|
|
for (file in tmp_lut$glossar_file) {
|
|
|
|
|
|
|
|
for (i in seq_len(nrow(dat2))) {
|
|
|
|
|
|
|
|
if (dat2$event[i] == "Artwork/OpenCard" & dat2$artwork[i] %in% tmp_lut$artwork) {
|
|
|
|
|
|
|
|
current_artwork <- dat2[i, "artwork"]
|
|
|
|
j <- i
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
if (dat2$artwork[i] == "glossar" &
|
|
|
|
dat2$popup[i] == file) {
|
|
|
|
|
|
|
|
dat2[i, "trace"] <- dat2[j, "trace"]
|
|
|
|
dat2[i, "artwork"] <- current_artwork
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
dat2[14110:14130, ]
|
|
|
|
|
2023-08-31 16:12:34 +02:00
|
|
|
# TODO: Integrate for-loop into for-loop above
|
2023-08-02 18:24:16 +02:00
|
|
|
|
|
|
|
# TODO: For now: Exclude not matched glossar entries
|
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
df <- subset(dat2, !is.na(dat2$trace))
|
2023-08-02 18:24:16 +02:00
|
|
|
df <- df[order(df$trace), ]
|
|
|
|
rownames(df) <- NULL
|
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
rm(lut, current_artwork, file, glossar_dict, i, j, k, artwork_list,
|
2023-08-11 08:35:41 +02:00
|
|
|
glossar_files)
|
2023-08-02 18:24:16 +02:00
|
|
|
|
|
|
|
#' ## Close flipCard
|
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
dat3 <- subset(df, df$event %in% c("Show Info", "Show Front"))
|
2023-08-02 18:24:16 +02:00
|
|
|
|
2023-08-28 17:29:56 +02:00
|
|
|
dat3$time <- ifelse(dat3$event == "Show Info", "start", "stop")
|
2023-08-02 18:24:16 +02:00
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
flipCard_wide <- reshape(dat3, direction = "wide",
|
2023-08-28 17:29:56 +02:00
|
|
|
idvar = c("trace", "artwork", "glossar"),
|
|
|
|
timevar = "time",
|
|
|
|
drop = c("popup", "card"))
|
2023-08-11 08:35:41 +02:00
|
|
|
flipCard_wide$event <- "flipCard"
|
|
|
|
flipCard_wide$duration <- flipCard_wide$time_ms.stop -
|
|
|
|
flipCard_wide$time_ms.start
|
|
|
|
|
2023-08-31 16:12:34 +02:00
|
|
|
# TODO: Check if I still need to enter all of these variables
|
|
|
|
# --> x, y, scale, rotation?
|
2023-08-14 16:57:03 +02:00
|
|
|
flipCard_wide$card <- NA
|
|
|
|
flipCard_wide$popup <- NA
|
|
|
|
flipCard_wide$x.start <- NA
|
|
|
|
flipCard_wide$x.stop <- NA
|
|
|
|
flipCard_wide$y.start <- NA
|
|
|
|
flipCard_wide$y.stop <- NA
|
|
|
|
flipCard_wide$distance <- NA
|
|
|
|
flipCard_wide$scale.start <- NA
|
|
|
|
flipCard_wide$scale.stop <- NA
|
|
|
|
flipCard_wide$scaleSize <- NA
|
|
|
|
flipCard_wide$rotation.start <- NA
|
|
|
|
flipCard_wide$rotation.stop <- NA
|
|
|
|
flipCard_wide$rotationDegree <- NA
|
|
|
|
|
2023-08-28 17:29:56 +02:00
|
|
|
dat_flipCard <- flipCard_wide[, c("fileid.start", "fileid.stop", "event",
|
|
|
|
"artwork", "trace", "glossar",
|
|
|
|
"date.start", "date.stop",
|
|
|
|
"time_ms.start", "time_ms.stop",
|
|
|
|
"duration", "card", "popup", "x.start",
|
|
|
|
"y.start", "x.stop", "y.stop",
|
|
|
|
"distance", "scale.start", "scale.stop",
|
|
|
|
"scaleSize", "rotation.start",
|
|
|
|
"rotation.stop", "rotationDegree")]
|
2023-08-02 18:24:16 +02:00
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
rm(flipCard_wide)
|
2023-08-02 18:24:16 +02:00
|
|
|
|
|
|
|
#' ## Close openTopic
|
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
dat4 <- subset(df, df$event %in% c("Artwork/OpenCard", "Artwork/CloseCard"))
|
|
|
|
dat4 <- dat4[order(dat4$artwork, dat4$date), ]
|
|
|
|
rownames(dat4) <- NULL
|
2023-08-02 18:24:16 +02:00
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
num_start <- diff(c(0, which(dat4$event == "Artwork/CloseCard")))
|
|
|
|
dat4$eventid <- rep(seq_along(num_start), num_start)
|
2023-08-02 18:24:16 +02:00
|
|
|
|
2023-08-28 17:29:56 +02:00
|
|
|
dat4$time <- ifelse(dat4$event == "Artwork/OpenCard", "start", "stop")
|
2023-08-02 18:24:16 +02:00
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
openTopic_wide <- reshape(dat4, direction = "wide",
|
2023-08-28 17:29:56 +02:00
|
|
|
idvar = c("eventid", "trace", "glossar", "artwork", "card"),
|
|
|
|
timevar = "time", drop = "popup")
|
2023-08-11 08:35:41 +02:00
|
|
|
openTopic_wide$event <- "openTopic"
|
|
|
|
openTopic_wide$duration <- openTopic_wide$time_ms.stop -
|
|
|
|
openTopic_wide$time_ms.start
|
2023-08-02 18:24:16 +02:00
|
|
|
|
|
|
|
|
2023-08-14 16:57:03 +02:00
|
|
|
openTopic_wide$popup <- NA
|
|
|
|
openTopic_wide$x.start <- NA
|
|
|
|
openTopic_wide$x.stop <- NA
|
|
|
|
openTopic_wide$y.start <- NA
|
|
|
|
openTopic_wide$y.stop <- NA
|
|
|
|
openTopic_wide$distance <- NA
|
|
|
|
openTopic_wide$scale.start <- NA
|
|
|
|
openTopic_wide$scale.stop <- NA
|
|
|
|
openTopic_wide$scaleSize <- NA
|
|
|
|
openTopic_wide$rotation.start <- NA
|
|
|
|
openTopic_wide$rotation.stop <- NA
|
|
|
|
openTopic_wide$rotationDegree <- NA
|
|
|
|
|
2023-08-28 17:29:56 +02:00
|
|
|
dat_openTopic <- openTopic_wide[, c("fileid.start", "fileid.stop", "event",
|
|
|
|
"artwork", "trace", "glossar",
|
|
|
|
"date.start", "date.stop",
|
|
|
|
"time_ms.start", "time_ms.stop",
|
|
|
|
"duration", "card", "popup", "x.start",
|
|
|
|
"y.start", "x.stop", "y.stop",
|
|
|
|
"distance", "scale.start",
|
|
|
|
"scale.stop", "scaleSize",
|
|
|
|
"rotation.start", "rotation.stop",
|
|
|
|
"rotationDegree")]
|
2023-08-11 08:35:41 +02:00
|
|
|
# TODO: card should have a unique identifier for each artwork
|
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
rm(openTopic_wide, num_start)
|
2023-08-02 18:24:16 +02:00
|
|
|
|
|
|
|
#' ## close openPopup
|
2023-08-31 16:12:34 +02:00
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
dat5 <- subset(df, df$event %in% c("ShowPopup", "HidePopup"))
|
2023-08-28 17:29:56 +02:00
|
|
|
dat5 <- dat5[order(dat5$artwork, dat5$popup, dat5$date), ]
|
2023-08-18 13:42:18 +02:00
|
|
|
rownames(dat5) <- NULL
|
2023-08-02 18:24:16 +02:00
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
num_start <- diff(c(0, which(dat5$event == "HidePopup")))
|
2023-08-28 17:29:56 +02:00
|
|
|
# last event is "ShowPopup"! Needs to be fixed
|
|
|
|
# num_start <- c(num_start, 1)
|
|
|
|
# TODO: Needs to be caught in a function --> not anymore - still relevant???
|
2023-08-02 18:24:16 +02:00
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
dat5$eventid <- rep(seq_along(num_start), num_start)
|
2023-07-20 17:06:28 +02:00
|
|
|
|
2023-08-28 17:29:56 +02:00
|
|
|
dat5$time <- ifelse(dat5$event == "ShowPopup", "start", "stop")
|
2023-07-20 17:06:28 +02:00
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
openPopup_wide <- reshape(dat5, direction = "wide",
|
2023-08-28 17:29:56 +02:00
|
|
|
idvar = c("eventid", "trace", "glossar", "artwork", "popup"),
|
|
|
|
timevar = "time", drop = "card")
|
2023-08-02 18:24:16 +02:00
|
|
|
# there is a pathological entry which gets deleted...
|
|
|
|
# df[df$trace == 4595, ]
|
2023-08-28 17:29:56 +02:00
|
|
|
# --> artwork 046 popup selene.xml gets opened twice
|
2023-07-20 17:06:28 +02:00
|
|
|
|
|
|
|
|
2023-08-11 08:35:41 +02:00
|
|
|
openPopup_wide$event <- "openPopup"
|
|
|
|
openPopup_wide$duration <- openPopup_wide$time_ms.stop -
|
|
|
|
openPopup_wide$time_ms.start
|
2023-07-20 17:06:28 +02:00
|
|
|
|
2023-08-14 16:57:03 +02:00
|
|
|
openPopup_wide$card <- NA
|
|
|
|
openPopup_wide$x.start <- NA
|
|
|
|
openPopup_wide$x.stop <- NA
|
|
|
|
openPopup_wide$y.start <- NA
|
|
|
|
openPopup_wide$y.stop <- NA
|
|
|
|
openPopup_wide$distance <- NA
|
|
|
|
openPopup_wide$scale.start <- NA
|
|
|
|
openPopup_wide$scale.stop <- NA
|
|
|
|
openPopup_wide$scaleSize <- NA
|
|
|
|
openPopup_wide$rotation.start <- NA
|
|
|
|
openPopup_wide$rotation.stop <- NA
|
|
|
|
openPopup_wide$rotationDegree <- NA
|
|
|
|
|
2023-08-28 17:29:56 +02:00
|
|
|
dat_openPopup <- openPopup_wide[, c("fileid.start", "fileid.stop", "event",
|
|
|
|
"artwork", "trace", "glossar",
|
|
|
|
"date.start", "date.stop",
|
|
|
|
"time_ms.start", "time_ms.stop",
|
|
|
|
"duration", "card", "popup", "x.start",
|
|
|
|
"y.start", "x.stop", "y.stop",
|
|
|
|
"distance", "scale.start",
|
|
|
|
"scale.stop", "scaleSize",
|
|
|
|
"rotation.start", "rotation.stop",
|
|
|
|
"rotationDegree")]
|
2023-08-18 13:42:18 +02:00
|
|
|
rm(num_start, openPopup_wide)
|
2023-08-11 08:35:41 +02:00
|
|
|
|
2023-07-20 17:06:28 +02:00
|
|
|
|
2023-08-11 08:35:41 +02:00
|
|
|
# TODO: Should card maybe also be filled in for "openPopup"?
|
2023-06-26 10:30:07 +02:00
|
|
|
|
2023-08-31 16:12:34 +02:00
|
|
|
#' ## Merge data sets for different events
|
2023-08-14 16:57:03 +02:00
|
|
|
|
|
|
|
dat_all <- rbind(dat_trans, dat_flipCard, dat_openTopic, dat_openPopup)
|
|
|
|
|
|
|
|
# check
|
|
|
|
nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) +
|
|
|
|
nrow(dat_openTopic) + nrow(dat_openPopup))
|
|
|
|
|
2023-08-31 16:12:34 +02:00
|
|
|
#' ## Remove all events that do not have a `date.start`
|
|
|
|
|
2023-08-28 17:29:56 +02:00
|
|
|
dim(dat_all[is.na(dat_all$date.start), ])
|
2023-08-14 16:57:03 +02:00
|
|
|
dat_all <- dat_all[!is.na(dat_all$date.start), ]
|
2023-08-28 17:29:56 +02:00
|
|
|
# There is only a `date.stop`, when event is not properly closed, see here:
|
|
|
|
df[df$trace == 1843, ]
|
|
|
|
dat_openPopup[dat_openPopup$trace == 1843, ]
|
|
|
|
## --> still 50 (small data set) left, and some really do not seem to be
|
|
|
|
## opened! Must be a log error
|
|
|
|
# --> others should be closed!
|
|
|
|
dat[31000:31019,] # this one e.g.
|
|
|
|
# --> Actually NOT! card gets flipped before! Again - log error!
|
|
|
|
|
|
|
|
|
|
|
|
# Remove durations when event spans more than one log file, since they are
|
|
|
|
# not interpretable
|
|
|
|
dat_all[which(dat_all$fileid.start != dat_all$fileid.stop), "duration"] <- NA
|
|
|
|
|
2023-08-14 16:57:03 +02:00
|
|
|
# sort by `start.date`
|
|
|
|
dat_all <- dat_all[order(dat_all$date.start), ]
|
|
|
|
rownames(dat_all) <- NULL
|
|
|
|
|
|
|
|
ind <- rowSums(is.na(dat_all)) == ncol(dat_all)
|
|
|
|
any(ind)
|
|
|
|
dat_all[ind, ]
|
|
|
|
# --> No rows with only NA, as it should be.
|
2023-06-26 10:30:07 +02:00
|
|
|
|
2023-08-14 16:57:03 +02:00
|
|
|
summary(dat_all) # OK, this actually makes a lot of sense :)
|
2023-06-26 10:30:07 +02:00
|
|
|
|
2023-08-14 16:57:03 +02:00
|
|
|
#' ## Create case variable
|
2023-06-26 10:30:07 +02:00
|
|
|
|
2023-08-14 16:57:03 +02:00
|
|
|
dat_all$timediff <- as.numeric(diff(c(dat_all$date.start[1], dat_all$date.start)))
|
2023-06-26 10:30:07 +02:00
|
|
|
|
2023-08-14 16:57:03 +02:00
|
|
|
hist(dat_all$timediff[dat_all$timediff < 40], breaks = 50)
|
2023-06-26 10:30:07 +02:00
|
|
|
|
2023-08-14 16:57:03 +02:00
|
|
|
|
|
|
|
# TODO: What is the best choice for the cutoff here? I took 20 secs for now
|
|
|
|
dat_all$case <- NA
|
|
|
|
j <- 1
|
|
|
|
|
|
|
|
for (i in seq_len(nrow(dat_all))) {
|
|
|
|
if (dat_all$timediff[i] < 21) {
|
|
|
|
dat_all$case[i] <- j
|
|
|
|
} else {
|
|
|
|
j <- j + 1
|
|
|
|
dat_all$case[i] <- j
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
head(dat_all[, c("event", "artwork", "trace", "date.start", "timediff", "case")], 100)
|
|
|
|
|
|
|
|
#' ## Add event ID
|
|
|
|
|
|
|
|
dat_all$eventid <- seq_len(nrow(dat_all))
|
|
|
|
|
2023-08-28 17:29:56 +02:00
|
|
|
dat_all <- dat_all[, c("fileid.start", "fileid.stop", "eventid", "case",
|
|
|
|
"trace", "glossar", "event", "artwork",
|
2023-08-14 16:57:03 +02:00
|
|
|
"date.start", "date.stop", "time_ms.start",
|
|
|
|
"time_ms.stop", "duration", "card", "popup",
|
|
|
|
"x.start", "y.start", "x.stop", "y.stop",
|
|
|
|
"distance", "scale.start", "scale.stop",
|
|
|
|
"scaleSize", "rotation.start", "rotation.stop",
|
|
|
|
"rotationDegree")]
|
|
|
|
|
|
|
|
#' ## Add `trace` numbers for `move` events
|
|
|
|
|
|
|
|
cases <- unique(dat_all$case)
|
|
|
|
aws <- unique(dat_all$artwork)[unique(dat_all$artwork) != "glossar"]
|
|
|
|
max_trace <- max(dat_all$trace, na.rm = TRUE) + 1
|
|
|
|
out <- NULL
|
|
|
|
|
|
|
|
for (case in cases) {
|
|
|
|
for (art in aws) {
|
|
|
|
tmp <- dat_all[dat_all$case == case & dat_all$artwork == art, ]
|
|
|
|
if (nrow(tmp) != 0) {
|
|
|
|
|
|
|
|
if (length(na.omit(unique(tmp$trace))) == 1) {
|
|
|
|
tmp[tmp$event == "move", "trace"] <- na.omit(unique(tmp$trace))
|
|
|
|
} else if (length(na.omit(unique(tmp$trace))) > 1) {
|
|
|
|
for (i in 1:nrow(tmp)) {
|
|
|
|
if (tmp$event[i] == "move") {
|
|
|
|
if (i == 1) {
|
|
|
|
tmp$trace[i] <- na.omit(unique(tmp$trace))[1]
|
|
|
|
} else {
|
|
|
|
tmp$trace[i] <- tmp$trace[i - 1]
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (all(is.na(tmp$trace))) {
|
|
|
|
for (i in 1:nrow(tmp)) {
|
|
|
|
if (tmp$event[i] == "move") {
|
|
|
|
tmp$trace[i] <- max_trace
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
max_trace <- max_trace + 1
|
|
|
|
}
|
|
|
|
if (nrow(tmp) > 0) {
|
|
|
|
out <- rbind(out, tmp)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2023-08-28 17:29:56 +02:00
|
|
|
|
2023-08-14 16:57:03 +02:00
|
|
|
# TODO: Get rid of the loops
|
|
|
|
# --> This takes forever...
|
|
|
|
|
2023-08-31 16:12:34 +02:00
|
|
|
# put glossar events back in --> not relevant anymore
|
2023-08-14 16:57:03 +02:00
|
|
|
|
2023-08-28 17:29:56 +02:00
|
|
|
#dat_all <- rbind(out, dat_all[dat_all$artwork == "glossar", ])
|
|
|
|
out <- out[order(out$date.start), ]
|
|
|
|
rownames(out) <- NULL
|
2023-08-14 16:57:03 +02:00
|
|
|
|
|
|
|
# Make `trace` a consecutive number
|
2023-08-28 17:29:56 +02:00
|
|
|
out$trace2 <- as.numeric(factor(out$trace, levels = unique(out$trace)))
|
2023-08-18 13:42:18 +02:00
|
|
|
|
2023-08-14 16:57:03 +02:00
|
|
|
#' # Export data
|
|
|
|
|
2023-08-28 17:29:56 +02:00
|
|
|
write.table(out, "../data/event_logfiles.csv",
|
2023-08-14 16:57:03 +02:00
|
|
|
sep = ";", quote = FALSE, row.names = FALSE)
|
2023-06-26 10:30:07 +02:00
|
|
|
|
2023-08-11 08:35:41 +02:00
|
|
|
# TODO: Write function for closing events
|
2023-08-14 16:57:03 +02:00
|
|
|
|