171 lines
5.1 KiB
R
171 lines
5.1 KiB
R
#' ---
#' title: "Preprocessing raw log files"
#' author: "Nora Wickelmaier"
#' date: "`r Sys.Date()`"
#' output:
#'   html_document:
#'     default
#'   pdf_document:
#'     toc: true
#'     number_sections: true
#' geometry: margin = 2.5cm
#' ---
|
|
|
|
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
|
|
|
|
#+ setup, include = FALSE
# Suppress warnings and messages in all chunks of the rendered report.
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
|
|
|
|
#' The following events can be extracted from the log files:
#'
#' ```
#' LogEntry classes:
#' TRANSFORM_START: "Transform start" --> "Transformation Start" in Tool
#' TRANSFORM_STOP: "Transform stop"
#' START_APPLICATION: "Start Application"
#' SHOW_APPLICATION: "Show Application"
#' SHOW_INFO: "Show Info" --> "Flip Card" in Tool
#' SHOW_FRONT: "Show Front"
#' SHOW_POPUP: "ShowPopup" --> "Show Popup" in Tool
#' HIDE_POPUP: "HidePopup"
#' ARTWORK: "Artwork" --> "Show Topic" in Tool
#' ```
|
|
|
|
#' Choose which folders with raw log files should be included:
|
|
|
|
# Folder(s) below the raw-data directory whose log files are read.
#folders <- "all"
folders <- "_2016b"

dirpaths <- paste0("../data/haum_logs_2016-2023/", folders)

# `pattern` is a regular expression, not a shell glob: match the literal
# ".log" extension at the end of the file name.
fnames <- dir(dirpaths, pattern = "\\.log$", full.names = TRUE)

# Quick sanity check of what was found.
length(fnames)
head(fnames)
|
|
|
|
# Need to left pad file names. If I do not do this, the sorting of the
|
|
# timestamps will be off and I get negative durations later on since the
|
|
# wrong events get closed.
|
|
|
|
|
|
#' Zero-pad the numeric parts of log file names.
#'
#' The directory part of each path is dropped and the remaining file name,
#' e.g. `2016_11_18-11_31_0.log`, is split at `_` and `-`. The first component
#' is kept as is; every following numeric component is padded to at least two
#' digits, giving `2016_11_18-11_31_00.log`. Padded names sort in the same
#' order as the numbers they contain, which unpadded names do not.
#'
#' @param x Character vector of log file paths or file names.
#' @return Character vector of the same length as `x` holding the padded file
#'   names without their directory (`character(0)` for empty input).
leftpad_fnames <- function(x) {
  # basename() removes the directory without relying on a global variable and
  # without interpreting the path as a regular expression.
  name_parts <- strsplit(basename(x), "_", fixed = TRUE)

  pad2 <- function(v) sprintf("%02d", as.numeric(v))

  vapply(name_parts, function(p) {
    # The third component joins two numbers with a hyphen, e.g. "18-11".
    mid <- strsplit(p[3], "-", fixed = TRUE)[[1]]
    # The last component still carries the file extension, e.g. "0.log".
    last <- sub("\\.log$", "", p[5])

    paste0(p[1], "_", pad2(p[2]), "_", pad2(mid[1]), "-", pad2(mid[2]), "_",
           pad2(p[4]), "_", pad2(last), ".log")
  }, character(1))
}
|
|
|
|
|
|
# Read every log file; each list element holds the lines of one file.
logs <- lapply(fnames, readLines)

# Number of lines per file. lengths() always returns an integer vector, also
# when the file list is empty.
nlog <- lengths(logs)

# One row per log line, labelled with the zero-padded name of its source file.
dat <- data.frame(fileid = rep(leftpad_fnames(fnames), nlog),
                  logs = unlist(logs))
head(dat$logs)
|
|
|
|
#' Remove corrupted lines

# Warning messages:
#   incomplete final line found on '_2016/2016_11_18-11_31_0.log'
#   incomplete final line found on '_2016/2016_11_18-11_38_30.log'
#   incomplete final line found on '_2016/2016_11_18-11_40_36.log'
#   ...

## --> files have a last line that looks like a binary entry??

# From LogEntry.as:
#   //pm: inserted this check to account for some broken logfiles
#   if (metaData[1] == null){
#     trace("corrupt line... still do not know how these came to happen.");

# corrupt lines are "" and need to be removed
d1 <- nrow(dat)
dat <- subset(dat, logs != "")
d2 <- nrow(dat)

#' The files contain `r d1-d2` corrupt lines that were removed from the data.
|
|
#'
|
|
|
|
#' ### Extract relevant infos
|
|
|
|
# Pull the individual fields out of each raw log line. The patterns assume a
# line that starts with "[<date>], [<timestamp>]" and contains "EyeVisit, "
# followed by the action string. sub() is vectorised over the lines, and every
# pattern is anchored at "^", so there is at most one match per line.
date <- sub("^\\[(.*)\\], \\[.*$", "\\1", dat$logs)
timestamp <- sub("^\\[.*\\], \\[(.*)\\].*$", "\\1", dat$logs)
action <- sub("^.*EyeVisit, (.*):*.*$", "\\1", dat$logs)

# Split the action string once at ":" and pick fields by position; a field
# that does not exist yields NA.
action_parts <- strsplit(action, ":")
nth_field <- function(parts, i) {
  vapply(parts, function(p) p[i], character(1))
}

events <- nth_field(action_parts, 1)
topics <- nth_field(action_parts, 2)
third_field <- nth_field(action_parts, 3)

# The third field holds comma-separated numbers for move events.
# ATTENTION: as.numeric() forces NAs for "OpenCard" and "CloseCard"
move_chr <- do.call(rbind, strsplit(third_field, ","))
# Convert the whole character matrix at once; this keeps the matrix shape
# even when there is only a single row.
moves <- matrix(as.numeric(move_chr), nrow = nrow(move_chr))

# For "Artwork" events the third field is the card action instead, and it is
# appended to the event name.
artwork_rows <- grep("Artwork", events)
card_action <- trimws(third_field[artwork_rows])

card <- as.numeric(nth_field(action_parts, 4))

events[artwork_rows] <- paste("Artwork", card_action, sep = "/")
|
|
|
|
# Combine the ":"-separated timestamp fields into a single number:
# field 4 + field 3 * 1000 + field 2 * 60000.
# NOTE(review): field 1 of the timestamp is not used -- confirm this is
# intended.
ts_elements <- strsplit(timestamp, ":")
ts_field <- function(i) {
  as.numeric(vapply(ts_elements, function(p) p[i], character(1)))
}

time_ms <- ts_field(4) +
  ts_field(3) * 1000 +
  ts_field(2) * 1000 * 60
# TODO: Maybe change to simple gsub()...
# --> This is theoretically sound but a lot of lines for just removing ":"
|
|
|
|
# Add the extracted fields as columns of the data frame.
dat$date <- lubridate::parse_date_time(date, "bdyHMSOp")
dat$time_ms <- time_ms
dat$event <- events

# The topic has the form "<artwork>/<popup>"; the popup part is NA when it is
# missing.
topic_parts <- strsplit(topics, "/")
dat$artwork <- trimws(vapply(topic_parts, function(p) p[1], character(1)))
dat$popup <- vapply(topic_parts, function(p) p[2], character(1))

dat$card <- card
dat$x <- moves[, 1]
dat$y <- moves[, 2]
dat$scale <- moves[, 3]
dat$rotation <- moves[, 4]

# remove original log files from data so file becomes smaller
dat$logs <- NULL

str(dat)

head(dat, 20)
|
|
|
|
# Sort by the zero-padded fileid (then date and time within a file); the order
# in which the files were read follows the unpadded file names and is
# therefore not chronological.
dat <- dat[order(dat$fileid, dat$date, dat$time_ms), ]

## TODO: Replace artwork and popup numbers with informative strings

#' ### Save data frame

# Semicolon-separated text file without quoting and without row names.
write.table(dat, "../data/rawdata_logfiles_small.csv",
            sep = ";", quote = FALSE, row.names = FALSE)
|
|
|