Updated README and tried to document all decisions I made so far

This commit is contained in:
2023-09-13 14:20:08 +02:00
parent 9f15ea1b62
commit 498b487338
5 changed files with 503 additions and 403 deletions
+21 -69
View File
@@ -1,46 +1,6 @@
#' ---
#' title: "Preprocessing raw log files"
#' author: "Nora Wickelmaier"
#' date: "`r Sys.Date()`"
#' output:
#' html_document:
#' default
#' pdf_document:
#' toc: true
#' number_sections: true
#' geometry: margin = 2.5cm
#' ---
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
#+ setup, include = FALSE
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
#' The following events can be extracted from the log files:
#'
#' ```
#' LogEntry classes:
#' TRANSFORM_START: "Transform start" --> "Transformation Start" in Tool
#' TRANSFORM_STOP: "Transform stop"
#' START_APPLICATION: "Start Application"
#' SHOW_APPLICATION: "Show Application"
#' SHOW_INFO: "Show Info" --> "Flip Card" in Tool
#' SHOW_FRONT: "Show Front"
#' SHOW_POPUP: "ShowPopup" --> "Show Popup" in Tool
#' HIDE_POPUP: "HidePopup"
#' ARTWORK: "Artwork" --> "Show Topic" in Tool
#' ```
#' Choose which folders with raw log files should be included:
folders <- "all"
#folders <- "_2016b"
dirpaths <- paste0("../data/haum_logs_2016-2023/", folders)
fnames <- dir(dirpaths, pattern = "*.log", full.names = TRUE)
length(fnames)
head(fnames)
###### HELPER ######
# Need to left pad file names. If I do not do this, the sorting of the
# timestamps will be off and I get negative durations later on since the
@@ -70,6 +30,18 @@ leftpad_fnames <- function(x) {
res
}
##### CONTENT ######
# Choose which folders with raw log files should be included
folders <- "all"
#folders <- "_2016b"
dirpaths <- paste0("../data/haum_logs_2016-2023/", folders)
fnames <- dir(dirpaths, pattern = "*.log", full.names = TRUE)
length(fnames)
head(fnames)
logs <- lapply(fnames, readLines)
nlog <- sapply(logs, length)
@@ -77,31 +49,19 @@ dat <- data.frame(fileId = rep(leftpad_fnames(fnames), nlog),
logs = unlist(logs))
head(dat$logs)
#' Remove corrupted lines
# Remove corrupted lines
# Warning messages:
# incomplete final line found on '_2016/2016_11_18-11_31_0.log'
# incomplete final line found on '_2016/2016_11_18-11_38_30.log'
# incomplete final line found on '_2016/2016_11_18-11_40_36.log'
# ...
## --> files have a last line that looks like a binary entry??
# From LogEntry.as:
# //pm: inserted this check to account for some broken logfiles
# if (metaData[1] == null){
# trace("corrupt line... still do not know how these came to happen.");
# corrupt lines are "" and need to be removed
d1 <- dim(dat)[1]
dat <- subset(dat, dat$logs != "")
d2 <- dim(dat)[1]
#' The files contain `r d1-d2` corrupt lines that were removed from the
#' data.
#'
# TODO: Catch this in a function and give back a meaningful warning
# The files contain `r d1-d2` corrupt lines that were removed from the
# data.
#' ### Extract relevant infos
# Extract relevant information
date <- sapply(dat$logs, gsub,
pattern = "^\\[(.*)\\], \\[.*$",
@@ -139,8 +99,6 @@ ts_elements <- strsplit(timestamp, ":")
time_ms <- as.numeric(sapply(ts_elements, function(x) x[4])) +
as.numeric(sapply(ts_elements, function(x) x[3])) * 1000 +
as.numeric(sapply(ts_elements, function(x) x[2])) * 1000 * 60
# TODO: Maybe change to simple gsub()...
# --> This is theoretically sound but a lot of lines for just removing ":"
dat$date <- lubridate::parse_date_time(date, "bdyHMSOp")
dat$timeMs <- time_ms
@@ -156,17 +114,11 @@ dat$rotation <- moves[,4]
dat$logs <- NULL
# remove the raw log lines from the data so the exported file becomes smaller
str(dat)
head(dat, 20)
# sort by fileId, since reading in by file names does not make sense because of
# missing left zero padding
# sort by the left-padded fileId; sorting by the raw file names would give
# the wrong order because they lack left zero padding
dat <- dat[order(dat$fileId, dat$date, dat$timeMs), ]
## TODO: Replace artwork and popup numbers with informative strings
#' ### Save data frame
# Export data
write.table(dat, "../data/rawdata_logfiles.csv",
sep = ";", quote = FALSE, row.names = FALSE)
+12 -10
View File
@@ -2,8 +2,7 @@
source("functions.R")
# Read data
# Read data ##############################################################
dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";",
header = TRUE)
dat0$date <- as.POSIXct(dat0$date)
@@ -13,7 +12,7 @@ dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
dat <- subset(dat0, !(dat0$event %in% c("Start Application",
"Show Application")))
# Add trace variable
# Add trace variable #####################################################
dat1 <- add_trace(dat)
# Close events
@@ -21,7 +20,9 @@ dat2 <- rbind(close_events(dat1, "move"),
close_events(dat1, "flipCard"),
close_events(dat1, "openTopic"),
close_events(dat1, "openPopup"))
dat2 <- dat2[order(dat2$date.start, dat2$fileId.start), ]
# Remove durations when event spans more than one log file, since they are
# not interpretable
dat2[which(dat2$fileId.start != dat2$fileId.stop), "duration"] <- NA
@@ -29,13 +30,12 @@ dat2[which(dat2$fileId.start != dat2$fileId.stop), "duration"] <- NA
# Remove all events that do not have a `date.start`
dat2 <- dat2[!is.na(dat2$date.start), ]
rownames(dat2) <- NULL
# TODO: Throw warning about this
#summary(dat2)
# Add case variable
# Add case variable ######################################################
dat3 <- add_case(dat2)
# Add event ID
# Add event ID ###########################################################
dat3$eventId <- seq_len(nrow(dat3))
dat3 <- dat3[, c("fileId.start", "fileId.stop", "eventId", "case",
"trace", "glossar", "event", "artwork",
@@ -46,17 +46,19 @@ dat3 <- dat3[, c("fileId.start", "fileId.stop", "eventId", "case",
"scaleSize", "rotation.start", "rotation.stop",
"rotationDegree")]
# Add trace for move events
# Add trace for move events ##############################################
dat4 <- add_trace_moves(dat3)
# Add topics: file names and topics
# Add topics: file names and topics ######################################
artworks <- unique(dat4$artwork)
topics <- extract_topics(artworks, pattern = paste0(artworks, ".xml"),
path = "../data/ContentEyevisit/eyevisit_cards_light/")
dat5 <- add_topic(dat4, topics = topics)
# Export data
# TODO: Replace artwork with informative strings
# Export data ############################################################
write.table(dat5, "../data/event_logfiles.csv", sep = ";",
row.names = FALSE)
-3
View File
@@ -40,14 +40,11 @@ add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") {
load(glossar_dict)
lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ]
head(subdata2[, c("artwork", "event", "popup", "trace")], 20)
inside <- glossar_files[glossar_files %in%
lut[sapply(lut$artwork, length) == 1,
"glossar_file"]]
single_art <- unlist(lut[lut$glossar_file %in% inside, "artwork"])
for (file in lut$glossar_file) {
artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])