Started moving preprocessing steps to functions; intermediate version

This commit is contained in:
Nora Wickelmaier 2023-09-11 18:24:21 +02:00
parent 6c6778f80f
commit c133792285
3 changed files with 381 additions and 140 deletions

View File

@ -33,8 +33,8 @@ knitr::opts_chunk$set(warning = FALSE, message = FALSE)
#' Choose which folders with raw log files should be included: #' Choose which folders with raw log files should be included:
#folders <- "all" folders <- "all"
folders <- "_2016b" #folders <- "_2016b"
dirpaths <- paste0("../data/haum_logs_2016-2023/", folders) dirpaths <- paste0("../data/haum_logs_2016-2023/", folders)
@ -64,7 +64,8 @@ leftpad_fnames <- function(x) {
e6 <- sprintf("%02d", as.numeric(gsub(".log", "", y[5]))) e6 <- sprintf("%02d", as.numeric(gsub(".log", "", y[5])))
e6 <- sprintf("%02d", as.numeric(gsub(".log", "", y[5]))) e6 <- sprintf("%02d", as.numeric(gsub(".log", "", y[5])))
res <- c(res, paste0(e1, "_", e2, "_", e3, "-", e4, "_", e5, "_", e6, ".log")) res <- c(res,
paste0(e1, "_", e2, "_", e3, "-", e4, "_", e5, "_", e6, ".log"))
} }
res res
} }
@ -72,7 +73,8 @@ leftpad_fnames <- function(x) {
logs <- lapply(fnames, readLines) logs <- lapply(fnames, readLines)
nlog <- sapply(logs, length) nlog <- sapply(logs, length)
dat <- data.frame(fileid = rep(leftpad_fnames(fnames), nlog), logs = unlist(logs)) dat <- data.frame(fileId = rep(leftpad_fnames(fnames), nlog),
logs = unlist(logs))
head(dat$logs) head(dat$logs)
#' Remove corrupted lines #' Remove corrupted lines
@ -95,7 +97,8 @@ d1 <- dim(dat)[1]
dat <- subset(dat, dat$logs != "") dat <- subset(dat, dat$logs != "")
d2 <- dim(dat)[1] d2 <- dim(dat)[1]
#' The files contain `r d1-d2` corrupt lines that were remooved from the data. #' The files contain `r d1-d2` corrupt lines that were remooved from the
#' data.
#' #'
#' ### Extract relevant infos #' ### Extract relevant infos
@ -139,16 +142,16 @@ time_ms <- as.numeric(sapply(ts_elements, function(x) x[4])) +
# TODO: Maybe change to simple gsub()... # TODO: Maybe change to simple gsub()...
# --> This is theoretically sound but a lot of lines for just removing ":" # --> This is theoretically sound but a lot of lines for just removing ":"
dat$date <- lubridate::parse_date_time(date, "bdyHMSOp") dat$date <- lubridate::parse_date_time(date, "bdyHMSOp")
dat$time_ms <- time_ms dat$timeMs <- time_ms
dat$event <- events dat$event <- events
dat$artwork <- trimws(sapply(strsplit(topics, "/"), function(x) x[1])) dat$artwork <- trimws(sapply(strsplit(topics, "/"), function(x) x[1]))
dat$popup <- sapply(strsplit(topics, "/"), function(x) x[2]) dat$popup <- sapply(strsplit(topics, "/"), function(x) x[2])
dat$card <- card dat$topicNumber <- card
dat$x <- moves[,1] dat$x <- moves[,1]
dat$y <- moves[,2] dat$y <- moves[,2]
dat$scale <- moves[,3] dat$scale <- moves[,3]
dat$rotation <- moves[,4] dat$rotation <- moves[,4]
dat$logs <- NULL dat$logs <- NULL
# remove original log files from data so file becomes smaller # remove original log files from data so file becomes smaller
@ -157,14 +160,14 @@ str(dat)
head(dat, 20) head(dat, 20)
# sort by fileid, since reading in by file names does not make sense because of # sort by fileId, since reading in by file names does not make sense because of
# missing left zero padding # missing left zero padding
dat <- dat[order(dat$fileid, dat$date, dat$time_ms), ] dat <- dat[order(dat$fileId, dat$date, dat$timeMs), ]
## TODO: Replace artwork and popup numbers with informative strings ## TODO: Replace artwork and popup numbers with informative strings
#' ### Save data frame #' ### Save data frame
write.table(dat, "../data/rawdata_logfiles_small.csv", write.table(dat, "../data/rawdata_logfiles.csv",
sep = ";", quote = FALSE, row.names = FALSE) sep = ";", quote = FALSE, row.names = FALSE)

View File

@ -40,10 +40,10 @@ rownames(dat1) <- NULL
# Create event ID for closing events # Create event ID for closing events
num_start <- diff(c(0, which(dat1$event == "Transform stop"))) num_start <- diff(c(0, which(dat1$event == "Transform stop")))
dat1$eventid <- rep(seq_along(num_start), num_start) dat1$eventId <- rep(seq_along(num_start), num_start)
head(dat1[, c("event", "eventid")], 25) head(dat1[, c("event", "eventId")], 25)
table(table(dat1$eventid)) table(table(dat1$eventId))
# 1 2 3 4 5 6 7 8 10 11 # 1 2 3 4 5 6 7 8 10 11
# 70 78435 5153 842 222 66 18 14 3 1 # 70 78435 5153 842 222 66 18 14 3 1
# --> compare to table(num_start)! # --> compare to table(num_start)!
@ -52,17 +52,8 @@ table(table(dat1$eventid))
num_stop <- c(diff(c(0, which(dat1$event == "Transform start")))) num_stop <- c(diff(c(0, which(dat1$event == "Transform start"))))
table(num_stop) table(num_stop)
# TODO: Do I still need this?
# dat1$eventrep <- rep(num_start, num_start)
# dat1$dupl <- duplicated(dat1[, c("event", "eventid")]) # keep first
# dat1$dupl <- duplicated(dat1[, c("event", "eventid")], fromLast = TRUE) # keep last
# dat1[dat1$eventrep == 10, ]
# dat1$dupl <- NULL
# dat1$eventrep <- NULL
# remove duplicated "Transform start" events # remove duplicated "Transform start" events
dat1 <- dat1[!duplicated(dat1[, c("event", "eventid")]), ] dat1 <- dat1[!duplicated(dat1[, c("event", "eventId")]), ]
# remove duplicated "Transform stop" events # remove duplicated "Transform stop" events
id_stop <- which(dat1$event == "Transform stop") id_stop <- which(dat1$event == "Transform stop")
@ -74,36 +65,30 @@ dat1 <- dat1[-(id_rm_stop + 1), ]
dat1$time <- ifelse(dat1$event == "Transform start", "start", "stop") dat1$time <- ifelse(dat1$event == "Transform start", "start", "stop")
trans_wide <- reshape(dat1, direction = "wide", trans_wide <- reshape(dat1, direction = "wide",
idvar = c("eventid", "artwork", "glossar"), idvar = c("eventId", "artwork", "glossar"),
timevar = "time", timevar = "time",
drop = c("popup", "card", "event") drop = c("popup", "topicNumber", "event")
) )
# TODO: Should `card` remain? Or maybe rather topic?
# --> Rethink when you add topics, maybe card -> topicNumber?
# TODO: This runs for quite some time # TODO: This runs for quite some time
# --> Is this more efficient with tidyr::pivot_wider? # --> Is this more efficient with tidyr::pivot_wider?
# --> when fileid is part of the reshape, it does not work correctly, since # --> when fileId is part of the reshape, it does not work correctly, since
# we sometimes have a start - stop event that is recorded in two separate # we sometimes have a start - stop event that is recorded in two separate
# log files, BUT: after finding out, that `time_ms` changes for each log # log files, BUT: after finding out, that `timeMs` changes for each log
# file, I want to exclude those cases, so `fileid` has to be included!!! # file, I want to exclude those cases, so `fileId` has to be included!!!
# check how often an eventid is associated with two fileids # check how often an eventId is associated with two fileIds
nrow(subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop)) nrow(subset(trans_wide, trans_wide$fileId.start != trans_wide$fileId.stop))
# TODO: exclude from data set ??
# trans_wide <- subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop)
# which(is.na(trans_wide$date.start)) # which(is.na(trans_wide$date.start))
trans_wide$event <- "move" trans_wide$event <- "move"
trans_wide$eventid <- NULL trans_wide$eventId <- NULL
rownames(trans_wide) <- NULL rownames(trans_wide) <- NULL
trans_wide$duration <- trans_wide$time_ms.stop - trans_wide$time_ms.start trans_wide$duration <- trans_wide$timeMs.stop - trans_wide$timeMs.start
#trans_wide$duration <- trans_wide$date.stop - trans_wide$date.start #trans_wide$duration <- trans_wide$date.stop - trans_wide$date.start
# only seconds - not fine grained enough # only seconds - not fine grained enough
trans_wide$distance <- apply( trans_wide$distance <- apply(
@ -114,19 +99,19 @@ trans_wide$rotationDegree <- trans_wide$rotation.stop -
trans_wide$scaleSize <- trans_wide$scale.stop / trans_wide$scale.start trans_wide$scaleSize <- trans_wide$scale.stop / trans_wide$scale.start
trans_wide$trace <- NA trans_wide$trace <- NA
trans_wide$card <- NA trans_wide$topicNumber <- NA
trans_wide$popup <- NA trans_wide$popup <- NA
dat_trans <- trans_wide[trans_wide$distance != 0 & dat_trans <- trans_wide[trans_wide$distance != 0 &
trans_wide$rotationDegree != 0 & trans_wide$rotationDegree != 0 &
trans_wide$scaleSize != 1, trans_wide$scaleSize != 1,
c("fileid.start", "fileid.stop", "event", "artwork", c("fileId.start", "fileId.stop", "event", "artwork",
"trace", "glossar", "date.start", "date.stop", "trace", "glossar", "date.start", "date.stop",
"time_ms.start", "time_ms.stop", "duration", "timeMs.start", "timeMs.stop", "duration",
"card", "popup", "x.start", "y.start", "x.stop", "topicNumber", "popup", "x.start", "y.start",
"y.stop", "distance", "scale.start", "scale.stop", "x.stop", "y.stop", "distance", "scale.start",
"scaleSize", "rotation.start", "rotation.stop", "scale.stop", "scaleSize", "rotation.start",
"rotationDegree")] "rotation.stop", "rotationDegree")]
1 - nrow(dat_trans) / nrow(trans_wide) 1 - nrow(dat_trans) / nrow(trans_wide)
# removes almost 2/3 of the data (for small data set) # removes almost 2/3 of the data (for small data set)
@ -166,9 +151,6 @@ for (art in aws) { # select artwork
} }
} }
head(dat2[, c("artwork", "event", "trace")], 50)
tail(dat2[, c("artwork", "event", "trace")], 50)
rm(aws, i, j, last_event, art) rm(aws, i, j, last_event, art)
#' ## Fix glossar entries (find corresponding artworks and fill in trace) #' ## Fix glossar entries (find corresponding artworks and fill in trace)
@ -181,6 +163,11 @@ lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ]
head(dat2[, c("artwork", "event", "popup", "trace")], 20) head(dat2[, c("artwork", "event", "popup", "trace")], 20)
inside <- glossar_files[glossar_files %in%
lut[sapply(lut$artwork, length) == 1,
"glossar_file"]]
single_art <- unlist(lut[lut$glossar_file %in% inside, "artwork"])
for (file in lut$glossar_file) { for (file in lut$glossar_file) {
@ -188,7 +175,9 @@ for (file in lut$glossar_file) {
for (i in seq_len(nrow(dat2))) { for (i in seq_len(nrow(dat2))) {
if (dat2$event[i] == "Show Info") { if (dat2$event[i] == "Show Info" |
(dat2$event[i] == "Artwork/OpenCard" &
dat2$artwork[i] %in% single_art)) {
current_artwork <- dat2[i, "artwork"] current_artwork <- dat2[i, "artwork"]
j <- i j <- i
@ -207,67 +196,32 @@ for (file in lut$glossar_file) {
if (dat2$artwork[i] == "glossar" & if (dat2$artwork[i] == "glossar" &
(current_artwork %in% artwork_list) & (current_artwork %in% artwork_list) &
dat2$popup[i] == file & (j-k == 0)) { dat2$popup[i] == file & (j - k == 0)) {
dat2[i, "trace"] <- dat2[j, "trace"] dat2[i, "trace"] <- dat2[j, "trace"]
dat2[i, "artwork"] <- current_artwork dat2[i, "artwork"] <- current_artwork
} }
} }
} }
# --> finds about half of the glossar entries for the small data set... # --> finds a bit more than half of the glossar entries for the small data
table(is.na(dat2[dat2$glossar == 1, "trace"])) # set...
proportions(table(is.na(dat2[dat2$glossar == 1, "trace"])))
# REMEMBER: It can never bo 100% correct, since it is always possible that # REMEMBER: It can never be 100% correct, since it is always possible that
# several cards are open and that they link to the same glossar entry # several cards are open and that they link to the same glossar entry
# How many glossar_files are only associated with one artwork?
lut[sapply(lut$artwork, length) == 1, "glossar_file"]
single <- lut[sapply(lut$artwork, length) == 1, "glossar_file"]
tmp <- subset(dat2, is.na(dat2$trace))$popup
inside <- unique(tmp[tmp %in% lut[sapply(lut$artwork, length) == 1, "glossar_file"]])
single_art <- unlist(lut[lut$glossar_file %in% inside, "artwork"])
tmp_lut <- data.frame(glossar_file = sort(inside), artwork = single_art)
dat2[dat2$glossar == 1, c("artwork", "popup", "glossar", "trace")]
for (file in tmp_lut$glossar_file) {
for (i in seq_len(nrow(dat2))) {
if (dat2$event[i] == "Artwork/OpenCard" & dat2$artwork[i] %in% tmp_lut$artwork) {
current_artwork <- dat2[i, "artwork"]
j <- i
}
if (dat2$artwork[i] == "glossar" &
dat2$popup[i] == file) {
dat2[i, "trace"] <- dat2[j, "trace"]
dat2[i, "artwork"] <- current_artwork
}
}
}
dat2[14110:14130, ] dat2[14110:14130, ]
dat2[dat2$glossar == 1, ]
# TODO: Integrate for-loop into for-loop above # Exclude not matched glossar entries
df <- subset(dat2, !is.na(dat2$trace)) df <- subset(dat2, !is.na(dat2$trace))
# TODO: For now: Exclude not matched glossar entries
df <- df[order(df$trace), ] df <- df[order(df$trace), ]
rownames(df) <- NULL rownames(df) <- NULL
rm(lut, current_artwork, file, glossar_dict, i, j, k, artwork_list, rm(lut, current_artwork, file, glossar_dict, i, j, k, artwork_list,
glossar_files) glossar_files, inside, single_art)
#' ## Close flipCard #' ## Close flipCard
@ -278,23 +232,23 @@ dat3$time <- ifelse(dat3$event == "Show Info", "start", "stop")
flipCard_wide <- reshape(dat3, direction = "wide", flipCard_wide <- reshape(dat3, direction = "wide",
idvar = c("trace", "artwork", "glossar"), idvar = c("trace", "artwork", "glossar"),
timevar = "time", timevar = "time",
drop = c("popup", "card")) drop = c("popup", "topicNumber"))
flipCard_wide$event <- "flipCard" flipCard_wide$event <- "flipCard"
flipCard_wide$duration <- flipCard_wide$time_ms.stop - flipCard_wide$duration <- flipCard_wide$timeMs.stop -
flipCard_wide$time_ms.start flipCard_wide$timeMs.start
flipCard_wide$card <- NA flipCard_wide$topicNumber <- NA
flipCard_wide$popup <- NA flipCard_wide$popup <- NA
flipCard_wide$distance <- NA flipCard_wide$distance <- NA
flipCard_wide$scaleSize <- NA flipCard_wide$scaleSize <- NA
flipCard_wide$rotationDegree <- NA flipCard_wide$rotationDegree <- NA
dat_flipCard <- flipCard_wide[, c("fileid.start", "fileid.stop", "event", dat_flipCard <- flipCard_wide[, c("fileId.start", "fileId.stop", "event",
"artwork", "trace", "glossar", "artwork", "trace", "glossar",
"date.start", "date.stop", "date.start", "date.stop",
"time_ms.start", "time_ms.stop", "timeMs.start", "timeMs.stop",
"duration", "card", "popup", "x.start", "duration", "topicNumber", "popup",
"y.start", "x.stop", "y.stop", "x.start", "y.start", "x.stop", "y.stop",
"distance", "scale.start", "scale.stop", "distance", "scale.start", "scale.stop",
"scaleSize", "rotation.start", "scaleSize", "rotation.start",
"rotation.stop", "rotationDegree")] "rotation.stop", "rotationDegree")]
@ -308,33 +262,33 @@ dat4 <- dat4[order(dat4$artwork, dat4$date), ]
rownames(dat4) <- NULL rownames(dat4) <- NULL
num_start <- diff(c(0, which(dat4$event == "Artwork/CloseCard"))) num_start <- diff(c(0, which(dat4$event == "Artwork/CloseCard")))
dat4$eventid <- rep(seq_along(num_start), num_start) dat4$eventId <- rep(seq_along(num_start), num_start)
dat4$time <- ifelse(dat4$event == "Artwork/OpenCard", "start", "stop") dat4$time <- ifelse(dat4$event == "Artwork/OpenCard", "start", "stop")
openTopic_wide <- reshape(dat4, direction = "wide", openTopic_wide <- reshape(dat4, direction = "wide",
idvar = c("eventid", "trace", "glossar", "artwork", "card"), idvar = c("eventId", "trace", "glossar", "artwork", "topicNumber"),
timevar = "time", drop = "popup") timevar = "time", drop = "popup")
openTopic_wide$event <- "openTopic" openTopic_wide$event <- "openTopic"
openTopic_wide$duration <- openTopic_wide$time_ms.stop - openTopic_wide$duration <- openTopic_wide$timeMs.stop -
openTopic_wide$time_ms.start openTopic_wide$timeMs.start
openTopic_wide$popup <- NA openTopic_wide$popup <- NA
openTopic_wide$distance <- NA openTopic_wide$distance <- NA
openTopic_wide$scaleSize <- NA openTopic_wide$scaleSize <- NA
openTopic_wide$rotationDegree <- NA openTopic_wide$rotationDegree <- NA
dat_openTopic <- openTopic_wide[, c("fileid.start", "fileid.stop", "event", dat_openTopic <- openTopic_wide[, c("fileId.start", "fileId.stop", "event",
"artwork", "trace", "glossar", "artwork", "trace", "glossar",
"date.start", "date.stop", "date.start", "date.stop",
"time_ms.start", "time_ms.stop", "timeMs.start", "timeMs.stop",
"duration", "card", "popup", "x.start", "duration", "topicNumber", "popup",
"y.start", "x.stop", "y.stop", "x.start", "y.start", "x.stop",
"distance", "scale.start", "y.stop", "distance", "scale.start",
"scale.stop", "scaleSize", "scale.stop", "scaleSize",
"rotation.start", "rotation.stop", "rotation.start", "rotation.stop",
"rotationDegree")] "rotationDegree")]
# TODO: card should have a unique identifier for each artwork # TODO: topicNumber should have a unique identifier for each artwork
rm(openTopic_wide, num_start) rm(openTopic_wide, num_start)
@ -347,42 +301,45 @@ rownames(dat5) <- NULL
num_start <- diff(c(0, which(dat5$event == "HidePopup"))) num_start <- diff(c(0, which(dat5$event == "HidePopup")))
# last event is "ShowPopup"! Needs to be fixed # last event is "ShowPopup"! Needs to be fixed
# num_start <- c(num_start, 1) # num_start <- c(num_start, 1)
# TODO: Needs to be caught in a function --> not anymore - still relevant??? # TODO: Needs to be caught in a function
# --> not anymore - still relevant???
dat5$eventid <- rep(seq_along(num_start), num_start) dat5$eventId <- rep(seq_along(num_start), num_start)
dat5$time <- ifelse(dat5$event == "ShowPopup", "start", "stop") dat5$time <- ifelse(dat5$event == "ShowPopup", "start", "stop")
openPopup_wide <- reshape(dat5, direction = "wide", openPopup_wide <- reshape(dat5, direction = "wide",
idvar = c("eventid", "trace", "glossar", "artwork", "popup"), idvar = c("eventId", "trace", "glossar",
timevar = "time", drop = "card") "artwork", "popup"),
timevar = "time",
drop = "topicNumber")
# there is a pathological entry which gets deleted... # there is a pathological entry which gets deleted...
# df[df$trace == 4595, ] # df[df$trace == 4595, ]
# --> artwork 046 popup selene.xml gets opened twice # --> artwork 046 popup selene.xml gets opened twice
openPopup_wide$event <- "openPopup" openPopup_wide$event <- "openPopup"
openPopup_wide$duration <- openPopup_wide$time_ms.stop - openPopup_wide$duration <- openPopup_wide$timeMs.stop -
openPopup_wide$time_ms.start openPopup_wide$timeMs.start
openPopup_wide$card <- NA openPopup_wide$topicNumber <- NA
openPopup_wide$distance <- NA openPopup_wide$distance <- NA
openPopup_wide$scaleSize <- NA openPopup_wide$scaleSize <- NA
openPopup_wide$rotationDegree <- NA openPopup_wide$rotationDegree <- NA
dat_openPopup <- openPopup_wide[, c("fileid.start", "fileid.stop", "event", dat_openPopup <- openPopup_wide[, c("fileId.start", "fileId.stop", "event",
"artwork", "trace", "glossar", "artwork", "trace", "glossar",
"date.start", "date.stop", "date.start", "date.stop",
"time_ms.start", "time_ms.stop", "timeMs.start", "timeMs.stop",
"duration", "card", "popup", "x.start", "duration", "topicNumber", "popup",
"y.start", "x.stop", "y.stop", "x.start", "y.start", "x.stop",
"distance", "scale.start", "y.stop", "distance", "scale.start",
"scale.stop", "scaleSize", "scale.stop", "scaleSize",
"rotation.start", "rotation.stop", "rotation.start", "rotation.stop",
"rotationDegree")] "rotationDegree")]
rm(num_start, openPopup_wide) rm(num_start, openPopup_wide)
# TODO: Should card maybe also be filled in for "openPopup"? # TODO: Should topicNumber maybe also be filled in for "openPopup"?
#' ## Merge data sets for different events #' ## Merge data sets for different events
@ -408,7 +365,7 @@ dat[31000:31019,] # this one e.g.
# Remove durations when event spans more than one log file, since they are # Remove durations when event spans more than one log file, since they are
# not interpretable # not interpretable
dat_all[which(dat_all$fileid.start != dat_all$fileid.stop), "duration"] <- NA dat_all[which(dat_all$fileId.start != dat_all$fileId.stop), "duration"] <- NA
# sort by `start.date` # sort by `start.date`
dat_all <- dat_all[order(dat_all$date.start), ] dat_all <- dat_all[order(dat_all$date.start), ]
@ -445,12 +402,12 @@ head(dat_all[, c("event", "artwork", "trace", "date.start", "timediff", "case")]
#' ## Add event ID #' ## Add event ID
dat_all$eventid <- seq_len(nrow(dat_all)) dat_all$eventId <- seq_len(nrow(dat_all))
dat_all <- dat_all[, c("fileid.start", "fileid.stop", "eventid", "case", dat_all <- dat_all[, c("fileId.start", "fileId.stop", "eventId", "case",
"trace", "glossar", "event", "artwork", "trace", "glossar", "event", "artwork",
"date.start", "date.stop", "time_ms.start", "date.start", "date.stop", "timeMs.start",
"time_ms.stop", "duration", "card", "popup", "timeMs.stop", "duration", "topicNumber", "popup",
"x.start", "y.start", "x.stop", "y.stop", "x.start", "y.start", "x.stop", "y.stop",
"distance", "scale.start", "scale.stop", "distance", "scale.start", "scale.stop",
"scaleSize", "rotation.start", "rotation.stop", "scaleSize", "rotation.start", "rotation.stop",
@ -510,12 +467,10 @@ out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace)))
#' # Fill in topics #' # Fill in topics
topics <- read.table("../data/topics.csv", sep = ";", header = TRUE) topics <- read.table("../data/topics.csv", sep = ";", header = TRUE)
# TODO: # TODO: Add topics to data frame
#' # Export data #' # Export data
write.table(out, "../data/event_logfiles.csv", sep = ";", write.table(out, "../data/event_logfiles.csv", sep = ";",
row.names = FALSE) row.names = FALSE)
# TODO: Write function for closing events

283
code/functions.R Normal file
View File

@ -0,0 +1,283 @@
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
# Read data
dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";",
header = TRUE)
dat0$date <- as.POSIXct(dat0$date)
dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
# remove irrelevant events
dat <- subset(dat0, !(dat0$event %in% c("Start Application",
"Show Application")))
# Close move events
close_moves <- function(data) {
# close move events
dat1 <- data[data$event %in% c("Transform start", "Transform stop"), ]
dat1 <- dat1[order(dat1$artwork, dat1$date), ]
num_start <- diff(c(0, which(dat1$event == "Transform stop")))
dat1$eventId <- rep(seq_along(num_start), num_start)
# remove duplicated "Transform start" events
dat1 <- dat1[!duplicated(dat1[, c("event", "eventId")]), ]
# remove duplicated "Transform stop" events
id_stop <- which(dat1$event == "Transform stop")
id_rm_stop <- id_stop[diff(id_stop) == 1]
dat1 <- dat1[-(id_rm_stop + 1), ]
# transform to wide data format
dat1$time <- ifelse(dat1$event == "Transform start", "start", "stop")
trans_wide <- reshape(dat1, direction = "wide",
idvar = c("eventId", "artwork", "glossar"),
timevar = "time",
drop = c("popup", "topicNumber", "event")
)
trans_wide$event <- "move"
trans_wide$eventId <- NULL
trans_wide$duration <- trans_wide$timeMs.stop - trans_wide$timeMs.start
trans_wide$distance <- apply(
trans_wide[, c("x.start", "y.start", "x.stop", "y.stop")], 1,
function(x) dist(matrix(x, 2, 2, byrow = TRUE)))
trans_wide$rotationDegree <- trans_wide$rotation.stop -
trans_wide$rotation.start
trans_wide$scaleSize <- trans_wide$scale.stop / trans_wide$scale.start
trans_wide$trace <- NA
trans_wide$topicNumber <- NA
trans_wide$popup <- NA
dat_trans <- trans_wide[trans_wide$distance != 0 &
trans_wide$rotationDegree != 0 &
trans_wide$scaleSize != 1,
c("fileId.start", "fileId.stop", "event", "artwork",
"trace", "glossar", "date.start", "date.stop",
"timeMs.start", "timeMs.stop", "duration",
"topicNumber", "popup", "x.start", "y.start",
"x.stop", "y.stop", "distance", "scale.start",
"scale.stop", "scaleSize", "rotation.start",
"rotation.stop", "rotationDegree")]
rownames(dat_trans) <- NULL
cat(paste("INFORMATION:", nrow(trans_wide) - nrow(dat_trans),
"lines containing move events were removed since they did",
"\nnot contain any change"), fill = TRUE)
dat_trans
}
dat1 <- close_moves(dat)
# TODO: Integrate this function into close_events?
###########################################################################
# Add trace variable
add_trace <- function(data) {
dat2 <- data[!data$event %in% c("Transform start", "Transform stop"), ]
dat2$trace <- NA
last_event <- dat2$event[1]
aws <- unique(dat2$artwork)[unique(dat2$artwork) != "glossar"]
#
for (art in aws) { # select artwork
for (i in 1:nrow(dat2)) { # go through rows
if (last_event == "Show Info" & dat2$artwork[i] == art) {
dat2$trace[i] <- i
j <- i
} else if (last_event == "Show Front" & dat2$artwork[i] == art) {
dat2$trace[i] <- j
} else if (!(last_event %in% c("Show Info", "Show Front")) &
dat2$artwork[i] == art) {
dat2$trace[i] <- j
}
if (i <= nrow(dat2)) {
last_event <- dat2$event[i + 1]
}
}
}
dat2
}
add_trace2 <- function(data, glossar_dict = "../data/glossar_dict.RData") {
data$trace <- NA
dat1 <- data[data$event %in% c("Transform start", "Transform stop"), ]
dat2 <- data[!data$event %in% c("Transform start", "Transform stop"), ]
last_event <- dat2$event[1]
aws <- unique(dat2$artwork)[unique(dat2$artwork) != "glossar"]
#
for (art in aws) { # select artwork
for (i in 1:nrow(dat2)) { # go through rows
if (last_event == "Show Info" & dat2$artwork[i] == art) {
dat2$trace[i] <- i
j <- i
} else if (last_event == "Show Front" & dat2$artwork[i] == art) {
dat2$trace[i] <- j
} else if (!(last_event %in% c("Show Info", "Show Front")) &
dat2$artwork[i] == art) {
dat2$trace[i] <- j
}
if (i <= nrow(dat2)) {
last_event <- dat2$event[i + 1]
}
}
}
glossar_files <- unique(dat2[dat2$artwork == "glossar", "popup"])
# load lookup table for artworks and glossar files
load(glossar_dict)
lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ]
head(dat2[, c("artwork", "event", "popup", "trace")], 20)
inside <- glossar_files[glossar_files %in%
lut[sapply(lut$artwork, length) == 1,
"glossar_file"]]
single_art <- unlist(lut[lut$glossar_file %in% inside, "artwork"])
for (file in lut$glossar_file) {
artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])
for (i in seq_len(nrow(dat2))) {
if (dat2$event[i] == "Show Info" |
(dat2$event[i] == "Artwork/OpenCard" &
dat2$artwork[i] %in% single_art)) {
current_artwork <- dat2[i, "artwork"]
j <- i
k <- i
} else {
current_artwork <- current_artwork
}
if (dat2$event[i] == "Show Front" & dat2$artwork[i] == current_artwork) {
# make sure artwork has not been closed, yet!
k <- i
}
if (dat2$artwork[i] == "glossar" &
(current_artwork %in% artwork_list) &
dat2$popup[i] == file & (j - k == 0)) {
dat2[i, "trace"] <- dat2[j, "trace"]
dat2[i, "artwork"] <- current_artwork
}
}
}
cat(proportions(table(is.na(dat2[dat2$glossar == 1, "trace"]))), fill = TRUE)
out <- rbind(dat1, dat2)
out <- out[order(out$fileId, out$date, out$timeMs), ]
out
}
tmp <- add_trace2(dat)
###########################################################################
close_events <- function(data, event = c("flipCard", "openTopic", "openPopup")) {
if (event == "flipCard") {
subdata <- subset(data, data$event %in% c("Show Info", "Show Front"))
subdata$time <- ifelse(subdata$event == "Show Info", "start", "stop")
subdata$eventId <- NA
idvar <- c("trace", "artwork", "glossar")
drop <- c("popup", "topicNumber")
} else if (event == "openTopic") {
subdata <- subset(data, data$event %in% c("Artwork/OpenCard", "Artwork/CloseCard"))
subdata$time <- ifelse(subdata$event == "Artwork/OpenCard", "start", "stop")
num_start <- diff(c(0, which(subdata$event == "Artwork/CloseCard")))
subdata$eventId <- rep(seq_along(num_start), num_start)
idvar <- c("eventId", "trace", "glossar", "artwork", "topicNumber")
drop <- "popup"
} else if (event == "openPopup") {
subdata <- subset(data, data$event %in% c("ShowPopup", "HidePopup"))
subdata$time <- ifelse(subdata$event == "ShowPopup", "start", "stop")
num_start <- diff(c(0, which(subdata$event == "HidePopup")))
subdata$eventId <- rep(seq_along(num_start), num_start)
idvar <- c("eventId", "trace", "glossar", "artwork", "popup")
drop <- "topicNumber"
}
data_wide <- reshape(subdata, direction = "wide",
idvar = idvar,
timevar = "time",
drop = drop)
data_wide$event <- event
data_wide$duration <- data_wide$timeMs.stop - data_wide$timeMs.start
for (d in drop) data_wide[d] <- NA
data_wide$distance <- NA
data_wide$scaleSize <- NA
data_wide$rotationDegree <- NA
out <- data_wide[, c("fileId.start", "fileId.stop", "event", "artwork",
"trace", "glossar", "date.start", "date.stop",
"timeMs.start", "timeMs.stop", "duration",
"topicNumber", "popup", "x.start", "y.start",
"x.stop", "y.stop", "distance", "scale.start",
"scale.stop", "scaleSize", "rotation.start",
"rotation.stop", "rotationDegree")]
out
# TODO: Suppress warnings?
}
tmp <- rbind(close_moves(dat),
close_events(df, "flipCard"),
close_events(df, "openTopic"),
close_events(df, "openPopup"))
# 'data.frame': 38607 obs. of 24 variables:
# $ fileId.start : chr "2016_11_15-12_32_57.log" "2016_11_15-14_42_57.log" "2016_11_15-14_42_57.log" "2016_11_16-12_31_32.log" ...
# $ fileId.stop : chr "2016_11_15-12_32_57.log" "2016_11_15-14_42_57.log" "2016_11_15-14_42_57.log" "2016_11_16-12_31_32.log" ...
# $ event : chr "move" "move" "move" "move" ...
# $ artwork : chr "001" "001" "001" "001" ...
# $ trace : int NA NA NA NA NA NA NA NA NA NA ...
# $ glossar : num 0 0 0 0 0 0 0 0 0 0 ...
# $ date.start : POSIXct, format: "2016-12-15 12:39:49" "2016-12-15 14:49:37" ...
# $ date.stop : POSIXct, format: "2016-12-15 12:39:49" "2016-12-15 14:49:40" ...
# $ timeMs.start : int 412141 400777 554506 384312 406277 533864 548467 549396 158632 194982 ...
# $ timeMs.stop : int 412474 403784 556633 388313 407994 538185 549088 551116 160343 197099 ...
# $ duration : int 333 3007 2127 4001 1717 4321 621 1720 1711 2117 ...
# $ topicNumber : int NA NA NA NA NA NA NA NA NA NA ...
# $ popup : chr NA NA NA NA ...
# $ x.start : num 531 235 470 326 326 ...
# $ y.start : num 1221 734 2090 747 747 ...
# $ x.stop : num 513 360 1492 256 2459 ...
# $ y.stop : num 1212 809 1687 643 1430 ...
# $ distance : num 19.8 146.6 1098.5 125.2 2239.4 ...
# $ scale.start : num 0.8 0.301 0.8 0.301 0.301 ...
# $ scale.stop : num 0.8 0.331 0.822 0.391 0.397 ...
# $ scaleSize : num 1 1.1 1.03 1.3 1.32 ...
# $ rotation.start: num 116 116 90 116 116 ...
# $ rotation.stop : num 116.3 89.6 2.8 86.1 125.8 ...
# $ rotationDegree: num 0.00245 -26.72397 -87.19711 -30.14456 9.49951 ...