mtt_haum/code/01_parse-logfiles.R

142 lines
4.3 KiB
R

#' ---
#' title: "Preprocessing raw log files"
#' author: "Nora Wickelmaier"
#' date: "`r Sys.Date()`"
#' output:
#'   html_document:
#'     toc: true
#'     toc_float: true
#'   pdf_document:
#'     toc: true
#'     number_sections: true
#' geometry: margin = 2.5cm
#' ---
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
#+ setup, include = FALSE
# Chunk defaults for the whole document: do not show warnings or messages
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
#' # Preprocessing raw log files into data frame
#' The following events can be extracted from the log files:
#'
#' ```
#' LogEntry classes:
#' TRANSFORM_START: "Transform start" --> "Transformation Start" in Tool
#' TRANSFORM_STOP: "Transform stop"
#' START_APPLICATION: "Start Application"
#' SHOW_APPLICATION: "Show Application"
#' SHOW_INFO: "Show Info" --> "Flip Card" in Tool
#' SHOW_FRONT: "Show Front"
#' SHOW_POPUP: "ShowPopup" --> "Show Popup" in Tool
#' HIDE_POPUP: "HidePopup"
#' ARTWORK: "Artwork" --> "Show Topic" in Tool
#' ```
#' Choose which folders with raw log files should be included:
# Folder(s) below ../data/haum_logs_2016-2023/ whose log files are read.
folders <- "all"
#folders <- "_2016b"
dirpaths <- paste0("../data/haum_logs_2016-2023/", folders)

# `pattern` is a regular expression, not a shell glob: anchor on the literal
# ".log" extension so that names merely containing "log" are not picked up.
fnames <- dir(dirpaths, pattern = "\\.log$", full.names = TRUE)
if (length(fnames) == 0) {
  # Fail early with a clear message instead of an obscure error further down.
  stop("No .log files found in: ", paste(dirpaths, collapse = ", "),
       call. = FALSE)
}
length(fnames)
head(fnames)

# One character vector of lines per file.
logs <- lapply(fnames, readLines)
# Number of lines per file; lengths() always returns an integer vector.
nlog <- lengths(logs)

# Long format: one row per log line, tagged with the file it came from.
dat <- data.frame(fileid = rep(fnames, nlog),
                  logs = unlist(logs),
                  stringsAsFactors = FALSE)
head(dat$logs)
#' Remove corrupted lines
# Warning messages:
# incomplete final line found on '_2016/2016_11_18-11_31_0.log'
# incomplete final line found on '_2016/2016_11_18-11_38_30.log'
# incomplete final line found on '_2016/2016_11_18-11_40_36.log'
# ...
## --> files have a last line that looks like a binary entry??
# From LogEntry.as:
# //pm: inserted this check to account for some broken logfiles
# if (metaData[1] == null){
# trace("corrupt line... still do not know how these came to happen.");
# corrupt lines are "" and need to be removed
# Row count before dropping the empty ("") log lines
d1 <- nrow(dat)
# Keep rows whose log line is a non-empty string; NA counts as "drop"
is_valid <- !is.na(dat$logs) & dat$logs != ""
dat <- dat[is_valid, , drop = FALSE]
# Row count afterwards; d1 - d2 is the number of removed lines
d2 <- nrow(dat)
#' The files contain `r d1-d2` corrupt lines that were removed from the data.
#' ### Extract relevant infos
# Pull three pieces out of every raw log line:
#   date      - content of the first bracketed field
#   timestamp - content of the second bracketed field
#   action    - everything after "EyeVisit, "
# gsub() is vectorised over its input, so each piece is extracted with one
# call for the whole column instead of one call per log line.
date <- gsub(pattern = "^\\[(.*)\\], \\[.*$",
             replacement = "\\1",
             x = dat$logs)
timestamp <- gsub(pattern = "^\\[.*\\], \\[(.*)\\].*$",
                  replacement = "\\1",
                  x = dat$logs)
action <- gsub(pattern = "^.*EyeVisit, (.*):*.*$",
               replacement = "\\1",
               x = dat$logs)

# Return the i-th piece of every element of a strsplit() result
# (NA where an element has fewer than i pieces).
nth_field <- function(parts, i) {
  vapply(parts, function(p) p[i], character(1))
}

# Split the action on ":" once and reuse the pieces.
action_parts <- strsplit(action, ":")
events <- nth_field(action_parts, 1)
topics <- nth_field(action_parts, 2)
third_field <- nth_field(action_parts, 3)

# The third field is split on "," and its first four values become the four
# columns of `moves`. Indexing with p[1:4] pads shorter entries with NA, so
# the result always has exactly four columns and does not depend on rbind()
# recycling.
# ATTENTION: as.numeric() forces NAs for "OpenCard" and "CloseCard"
move_chr <- vapply(strsplit(third_field, ","),
                   function(p) p[1:4],
                   character(4))
moves <- matrix(as.numeric(move_chr), ncol = 4, byrow = TRUE)

# For "Artwork" events the third field holds the card action and the fourth
# field the card number; the event label becomes "Artwork/<card action>".
artwork_idx <- grep("Artwork", events)
card_action <- trimws(third_field[artwork_idx])
card <- as.numeric(nth_field(action_parts, 4))
events[artwork_idx] <- paste("Artwork", card_action, sep = "/")

# Convert the ":"-separated timestamp into milliseconds.
# NOTE(review): only fields 2-4 enter the sum (weights 60000, 1000, 1); the
# first field is ignored -- confirm that this is intended.
ts_elements <- strsplit(timestamp, ":")
ts_num <- function(i) as.numeric(nth_field(ts_elements, i))
time_ms <- ts_num(4) +
  ts_num(3) * 1000 +
  ts_num(2) * 1000 * 60

# The topic field is "<artwork>/<popup>"; the popup part may be missing (NA).
topic_parts <- strsplit(topics, "/")

dat$date <- lubridate::parse_date_time(date, "bdyHMSOp")
dat$time_ms <- time_ms
dat$event <- events
dat$artwork <- trimws(nth_field(topic_parts, 1))
dat$popup <- nth_field(topic_parts, 2)
dat$card <- card
dat$x <- moves[, 1]
dat$y <- moves[, 2]
dat$scale <- moves[, 3]
dat$rotation <- moves[, 4]

# remove original log lines from data so file becomes smaller
dat$logs <- NULL
# Structure of the parsed data
str(dat)
# First 20 rows, leaving out the first column
head(dat[, -1], 20)

# Order rows by date; ordering by file name does not make sense because the
# names lack left zero padding
date_order <- order(dat$date)
dat <- dat[date_order, ]

## TODO: Replace artwork and popup numbers with informative strings

# Semicolon-separated export without quoting and without row names
write.table(dat,
            file = "../data/rawdata_logfiles.csv",
            row.names = FALSE,
            quote = FALSE,
            sep = ";")