2023-06-26 10:30:07 +02:00
|
|
|
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
|
|
|
|
|
2023-09-13 14:20:08 +02:00
|
|
|
###### HELPER ######
|
2023-06-26 10:30:07 +02:00
|
|
|
|
2023-08-18 13:42:18 +02:00
|
|
|
# Need to left pad file names. If I do not do this, the sorting of the
|
|
|
|
# timestamps will be off and I get negative durations later on since the
|
|
|
|
# wrong events get closed.
|
|
|
|
|
|
|
|
|
|
|
|
leftpad_fnames <- function(x) {
|
|
|
|
|
|
|
|
z <- gsub(paste0(dirpaths, "/"), "\\1", x)
|
|
|
|
ys <- strsplit(z, "_")
|
|
|
|
|
|
|
|
res <- NULL
|
|
|
|
|
|
|
|
for (y in ys) {
|
|
|
|
y2 <- unlist(strsplit(y[3], "-"))
|
|
|
|
e1 <- y[1]
|
|
|
|
e2 <- sprintf("%02d", as.numeric(y[2]))
|
|
|
|
e3 <- sprintf("%02d", as.numeric(y2[1]))
|
|
|
|
e4 <- sprintf("%02d", as.numeric(y2[2]))
|
|
|
|
e5 <- sprintf("%02d", as.numeric(y[4]))
|
|
|
|
e6 <- sprintf("%02d", as.numeric(gsub(".log", "", y[5])))
|
|
|
|
e6 <- sprintf("%02d", as.numeric(gsub(".log", "", y[5])))
|
|
|
|
|
2023-09-11 18:24:21 +02:00
|
|
|
res <- c(res,
|
|
|
|
paste0(e1, "_", e2, "_", e3, "-", e4, "_", e5, "_", e6, ".log"))
|
2023-08-18 13:42:18 +02:00
|
|
|
}
|
|
|
|
res
|
|
|
|
}
|
|
|
|
|
2023-09-13 14:20:08 +02:00
|
|
|
##### CONTENT ######
|
|
|
|
|
|
|
|
# Choose which folders with raw log files should be included
|
|
|
|
|
|
|
|
folders <- "all"
|
|
|
|
#folders <- "_2016b"
|
|
|
|
|
|
|
|
dirpaths <- paste0("../data/haum_logs_2016-2023/", folders)
|
|
|
|
|
|
|
|
fnames <- dir(dirpaths, pattern = "*.log", full.names = TRUE)
|
|
|
|
length(fnames)
|
|
|
|
head(fnames)
|
2023-08-18 13:42:18 +02:00
|
|
|
|
2023-06-26 10:30:07 +02:00
|
|
|
logs <- lapply(fnames, readLines)
|
|
|
|
nlog <- sapply(logs, length)
|
2023-09-11 18:24:21 +02:00
|
|
|
dat <- data.frame(fileId = rep(leftpad_fnames(fnames), nlog),
|
|
|
|
logs = unlist(logs))
|
2023-06-26 10:30:07 +02:00
|
|
|
head(dat$logs)
|
|
|
|
|
2023-09-13 14:20:08 +02:00
|
|
|
# Remove corrupted lines
|
2023-06-26 10:30:07 +02:00
|
|
|
|
|
|
|
|
|
|
|
# corrupt lines are "" and need to be removed
|
|
|
|
d1 <- dim(dat)[1]
|
|
|
|
dat <- subset(dat, dat$logs != "")
|
|
|
|
d2 <- dim(dat)[1]
|
|
|
|
|
2023-09-13 14:20:08 +02:00
|
|
|
# TODO: Catch this in a function and give back a meaningful warning
|
|
|
|
# The files contain `r d1-d2` corrupt lines that were removed from the
|
|
|
|
# data.
|
2023-07-07 17:18:09 +02:00
|
|
|
|
2023-09-13 14:20:08 +02:00
|
|
|
# Extract relevant infos
|
2023-06-26 10:30:07 +02:00
|
|
|
|
|
|
|
date <- sapply(dat$logs, gsub,
|
2023-08-18 13:42:18 +02:00
|
|
|
pattern = "^\\[(.*)\\], \\[.*$",
|
2023-06-26 10:30:07 +02:00
|
|
|
replacement = "\\1",
|
|
|
|
USE.NAMES = FALSE)
|
|
|
|
|
|
|
|
timestamp <- sapply(dat$logs, gsub,
|
2023-08-18 13:42:18 +02:00
|
|
|
pattern = "^\\[.*\\], \\[(.*)\\].*$",
|
2023-06-26 10:30:07 +02:00
|
|
|
replacement = "\\1",
|
|
|
|
USE.NAMES = FALSE)
|
|
|
|
|
|
|
|
action <- sapply(dat$logs, gsub,
|
|
|
|
pattern = "^.*EyeVisit, (.*):*.*$",
|
|
|
|
replacement = "\\1",
|
|
|
|
USE.NAMES = FALSE)
|
|
|
|
|
|
|
|
events <- sapply(strsplit(action, ":"), function(x) x[1])
|
|
|
|
|
|
|
|
topics <- sapply(strsplit(action, ":"), function(x) x[2])
|
|
|
|
|
|
|
|
moves <- apply(do.call(rbind,
|
|
|
|
strsplit(sapply(strsplit(action, ":"), function(x) x[3]),
|
|
|
|
",")),
|
|
|
|
2, as.numeric)
|
|
|
|
# ATTENTION: as.numeric() forces NAs for "OpenCard" and "CloseCard"
|
|
|
|
|
|
|
|
card_action <- trimws(sapply(strsplit(action, ":"),
|
|
|
|
function(x) x[3])[grep("Artwork", events)])
|
|
|
|
|
|
|
|
card <- as.numeric(sapply(strsplit(action, ":"), function(x) x[4]))
|
|
|
|
|
|
|
|
events[grep("Artwork", events)] <- paste("Artwork", card_action, sep = "/")
|
|
|
|
|
|
|
|
ts_elements <- strsplit(timestamp, ":")
|
|
|
|
time_ms <- as.numeric(sapply(ts_elements, function(x) x[4])) +
|
|
|
|
as.numeric(sapply(ts_elements, function(x) x[3])) * 1000 +
|
|
|
|
as.numeric(sapply(ts_elements, function(x) x[2])) * 1000 * 60
|
|
|
|
|
2023-09-11 18:24:21 +02:00
|
|
|
dat$date <- lubridate::parse_date_time(date, "bdyHMSOp")
|
|
|
|
dat$timeMs <- time_ms
|
|
|
|
dat$event <- events
|
|
|
|
dat$artwork <- trimws(sapply(strsplit(topics, "/"), function(x) x[1]))
|
|
|
|
dat$popup <- sapply(strsplit(topics, "/"), function(x) x[2])
|
|
|
|
dat$topicNumber <- card
|
|
|
|
dat$x <- moves[,1]
|
|
|
|
dat$y <- moves[,2]
|
|
|
|
dat$scale <- moves[,3]
|
|
|
|
dat$rotation <- moves[,4]
|
2023-06-26 10:30:07 +02:00
|
|
|
|
2023-07-07 17:18:09 +02:00
|
|
|
dat$logs <- NULL
|
|
|
|
# remove original log files from data so file becomes smaller
|
|
|
|
|
2023-09-13 14:20:08 +02:00
|
|
|
# sort by fileId, since reading in by file names does not make sense
|
|
|
|
# because of missing left zero padding
|
2023-09-11 18:24:21 +02:00
|
|
|
dat <- dat[order(dat$fileId, dat$date, dat$timeMs), ]
|
2023-07-20 17:06:28 +02:00
|
|
|
|
2023-09-13 14:20:08 +02:00
|
|
|
# Export data
|
2023-08-14 16:57:03 +02:00
|
|
|
|
2023-09-11 18:24:21 +02:00
|
|
|
write.table(dat, "../data/rawdata_logfiles.csv",
|
2023-06-26 10:30:07 +02:00
|
|
|
sep = ";", quote = FALSE, row.names = FALSE)
|
|
|
|
|