mtt_haum/code/01_parse-logfiles.R

126 lines
3.7 KiB
R

# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
###### HELPER ######
# Need to left pad file names. If I do not do this, the sorting of the
# timestamps will be off and I get negative durations later on since the
# wrong events get closed.
leftpad_fnames <- function(x) {
z <- gsub(paste0(dirpaths, "/"), "\\1", x)
ys <- strsplit(z, "_")
res <- NULL
for (y in ys) {
y2 <- unlist(strsplit(y[3], "-"))
e1 <- y[1]
e2 <- sprintf("%02d", as.numeric(y[2]))
e3 <- sprintf("%02d", as.numeric(y2[1]))
e4 <- sprintf("%02d", as.numeric(y2[2]))
e5 <- sprintf("%02d", as.numeric(y[4]))
e6 <- sprintf("%02d", as.numeric(gsub(".log", "", y[5])))
e6 <- sprintf("%02d", as.numeric(gsub(".log", "", y[5])))
res <- c(res,
paste0(e1, "_", e2, "_", e3, "-", e4, "_", e5, "_", e6, ".log"))
}
res
}
##### CONTENT ######
# Choose which folders with raw log files should be included
folders <- "all"
#folders <- "_2016b"
dirpaths <- paste0("../data/haum_logs_2016-2023/", folders)
fnames <- dir(dirpaths, pattern = "*.log", full.names = TRUE)
length(fnames)
head(fnames)
logs <- lapply(fnames, readLines)
nlog <- sapply(logs, length)
dat <- data.frame(fileId = rep(leftpad_fnames(fnames), nlog),
logs = unlist(logs))
head(dat$logs)
# Remove corrupted lines
# corrupt lines are "" and need to be removed
d1 <- dim(dat)[1]
dat <- subset(dat, dat$logs != "")
d2 <- dim(dat)[1]
# TODO: Catch this in a function and give back a meaningful warning
# The files contain `r d1-d2` corrupt lines that were removed from the
# data.
# Extract relevant infos
date <- sapply(dat$logs, gsub,
pattern = "^\\[(.*)\\], \\[.*$",
replacement = "\\1",
USE.NAMES = FALSE)
timestamp <- sapply(dat$logs, gsub,
pattern = "^\\[.*\\], \\[(.*)\\].*$",
replacement = "\\1",
USE.NAMES = FALSE)
action <- sapply(dat$logs, gsub,
pattern = "^.*EyeVisit, (.*):*.*$",
replacement = "\\1",
USE.NAMES = FALSE)
events <- sapply(strsplit(action, ":"), function(x) x[1])
topics <- sapply(strsplit(action, ":"), function(x) x[2])
moves <- apply(do.call(rbind,
strsplit(sapply(strsplit(action, ":"), function(x) x[3]),
",")),
2, as.numeric)
# ATTENTION: as.numeric() forces NAs for "OpenCard" and "CloseCard"
card_action <- trimws(sapply(strsplit(action, ":"),
function(x) x[3])[grep("Artwork", events)])
card <- as.numeric(sapply(strsplit(action, ":"), function(x) x[4]))
events[grep("Artwork", events)] <- paste("Artwork", card_action, sep = "/")
ts_elements <- strsplit(timestamp, ":")
time_ms <- as.numeric(sapply(ts_elements, function(x) x[4])) +
as.numeric(sapply(ts_elements, function(x) x[3])) * 1000 +
as.numeric(sapply(ts_elements, function(x) x[2])) * 1000 * 60
dat$date <- lubridate::parse_date_time(date, "bdyHMSOp")
dat$timeMs <- time_ms
dat$event <- events
dat$artwork <- trimws(sapply(strsplit(topics, "/"), function(x) x[1]))
dat$popup <- sapply(strsplit(topics, "/"), function(x) x[2])
dat$topicNumber <- card
dat$x <- moves[,1]
dat$y <- moves[,2]
dat$scale <- moves[,3]
dat$rotation <- moves[,4]
dat$logs <- NULL
# remove original log files from data so file becomes smaller
# sort by fileId, since reading in by file names does not make sense
# because of missing left zero padding
dat <- dat[order(dat$fileId, dat$date, dat$timeMs), ]
# Export data
write.table(dat, "../data/rawdata_logfiles.csv",
sep = ";", quote = FALSE, row.names = FALSE)