Updated README and tried to document all decisions I made so far
This commit is contained in:
+21
-69
@@ -1,46 +1,6 @@
|
||||
#' ---
|
||||
#' title: "Preprocessing raw log files"
|
||||
#' author: "Nora Wickelmaier"
|
||||
#' date: "`r Sys.Date()`"
|
||||
#' output:
|
||||
#' html_document:
|
||||
#' default
|
||||
#' pdf_document:
|
||||
#' toc: true
|
||||
#' number_sections: true
|
||||
#' geometry: margin = 2.5cm
|
||||
#' ---
|
||||
|
||||
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
|
||||
|
||||
#+ setup, include = FALSE
|
||||
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
|
||||
|
||||
#' The following events can be extracted from the log files:
|
||||
#'
|
||||
#' ```
|
||||
#' LogEntry classes:
|
||||
#' TRANSFORM_START: "Transform start" --> "Transformation Start" in Tool
|
||||
#' TRANSFORM_STOP: "Transform stop"
|
||||
#' START_APPLICATION: "Start Application"
|
||||
#' SHOW_APPLICATION: "Show Application"
|
||||
#' SHOW_INFO: "Show Info" --> "Flip Card" in Tool
|
||||
#' SHOW_FRONT: "Show Front"
|
||||
#' SHOW_POPUP: "ShowPopup" --> "Show Popup" in Tool
|
||||
#' HIDE_POPUP: "HidePopup"
|
||||
#' ARTWORK: "Artwork" --> "Show Topic" in Tool
|
||||
#' ```
|
||||
|
||||
#' Choose which folders with raw log files should be included:
|
||||
|
||||
folders <- "all"
|
||||
#folders <- "_2016b"
|
||||
|
||||
dirpaths <- paste0("../data/haum_logs_2016-2023/", folders)
|
||||
|
||||
# `pattern` is a regular expression, not a glob: keep only file names that
# end in ".log" (the former "*.log" also matched names merely containing "log")
fnames <- dir(dirpaths, pattern = "\\.log$", full.names = TRUE)
|
||||
length(fnames)
|
||||
head(fnames)
|
||||
###### HELPER ######
|
||||
|
||||
# Need to left pad file names. If I do not do this, the sorting of the
|
||||
# timestamps will be off and I get negative durations later on since the
|
||||
@@ -70,6 +30,18 @@ leftpad_fnames <- function(x) {
|
||||
res
|
||||
}
|
||||
|
||||
##### CONTENT ######
|
||||
|
||||
# Choose which folders with raw log files should be included
|
||||
|
||||
folders <- "all"
|
||||
#folders <- "_2016b"
|
||||
|
||||
dirpaths <- paste0("../data/haum_logs_2016-2023/", folders)
|
||||
|
||||
# `pattern` is a regular expression, not a glob: keep only file names that
# end in ".log" (the former "*.log" also matched names merely containing "log")
fnames <- dir(dirpaths, pattern = "\\.log$", full.names = TRUE)
|
||||
length(fnames)
|
||||
head(fnames)
|
||||
|
||||
logs <- lapply(fnames, readLines)
|
||||
nlog <- sapply(logs, length)
|
||||
@@ -77,31 +49,19 @@ dat <- data.frame(fileId = rep(leftpad_fnames(fnames), nlog),
|
||||
logs = unlist(logs))
|
||||
head(dat$logs)
|
||||
|
||||
#' Remove corrupted lines
|
||||
# Remove corrupted lines
|
||||
|
||||
# Warning messages:
|
||||
# incomplete final line found on '_2016/2016_11_18-11_31_0.log'
|
||||
# incomplete final line found on '_2016/2016_11_18-11_38_30.log'
|
||||
# incomplete final line found on '_2016/2016_11_18-11_40_36.log'
|
||||
# ...
|
||||
|
||||
## --> files have a last line that looks like a binary entry??
|
||||
|
||||
# From LogEntry.as:
|
||||
# //pm: inserted this check to account for some broken logfiles
|
||||
# if (metaData[1] == null){
|
||||
# trace("corrupt line... still do not know how these came to happen.");
|
||||
|
||||
# corrupt lines are "" and need to be removed
|
||||
d1 <- dim(dat)[1]
|
||||
dat <- subset(dat, dat$logs != "")
|
||||
d2 <- dim(dat)[1]
|
||||
|
||||
#' The files contain `r d1-d2` corrupt lines that were removed from the
|
||||
#' data.
|
||||
#'
|
||||
# TODO: Catch this in a function and give back a meaningful warning
|
||||
# The files contain `r d1-d2` corrupt lines that were removed from the
|
||||
# data.
|
||||
|
||||
#' ### Extract relevant information
|
||||
# Extract relevant information
|
||||
|
||||
date <- sapply(dat$logs, gsub,
|
||||
pattern = "^\\[(.*)\\], \\[.*$",
|
||||
@@ -139,8 +99,6 @@ ts_elements <- strsplit(timestamp, ":")
|
||||
time_ms <- as.numeric(sapply(ts_elements, function(x) x[4])) +
|
||||
as.numeric(sapply(ts_elements, function(x) x[3])) * 1000 +
|
||||
as.numeric(sapply(ts_elements, function(x) x[2])) * 1000 * 60
|
||||
# TODO: Maybe change to simple gsub()...
|
||||
# --> This is theoretically sound but a lot of lines for just removing ":"
|
||||
|
||||
dat$date <- lubridate::parse_date_time(date, "bdyHMSOp")
|
||||
dat$timeMs <- time_ms
|
||||
@@ -156,17 +114,11 @@ dat$rotation <- moves[,4]
|
||||
dat$logs <- NULL
|
||||
# remove original log files from data so file becomes smaller
|
||||
|
||||
str(dat)
|
||||
|
||||
head(dat, 20)
|
||||
|
||||
# sort by fileId, since reading in by file names does not make sense because of
|
||||
# missing left zero padding
|
||||
# sort by fileId, since reading in by file names does not make sense
|
||||
# because of missing left zero padding
|
||||
dat <- dat[order(dat$fileId, dat$date, dat$timeMs), ]
|
||||
|
||||
## TODO: Replace artwork and popup numbers with informative strings
|
||||
|
||||
#' ### Save data frame
|
||||
# Export data
|
||||
|
||||
write.table(dat, "../data/rawdata_logfiles.csv",
|
||||
sep = ";", quote = FALSE, row.names = FALSE)
|
||||
|
||||
+12
-10
@@ -2,8 +2,7 @@
|
||||
|
||||
source("functions.R")
|
||||
|
||||
# Read data
|
||||
|
||||
# Read data ##############################################################
|
||||
dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";",
|
||||
header = TRUE)
|
||||
dat0$date <- as.POSIXct(dat0$date)
|
||||
@@ -13,7 +12,7 @@ dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
|
||||
dat <- subset(dat0, !(dat0$event %in% c("Start Application",
|
||||
"Show Application")))
|
||||
|
||||
# Add trace variable
|
||||
# Add trace variable #####################################################
|
||||
dat1 <- add_trace(dat)
|
||||
|
||||
# Close events
|
||||
@@ -21,7 +20,9 @@ dat2 <- rbind(close_events(dat1, "move"),
|
||||
close_events(dat1, "flipCard"),
|
||||
close_events(dat1, "openTopic"),
|
||||
close_events(dat1, "openPopup"))
|
||||
|
||||
dat2 <- dat2[order(dat2$date.start, dat2$fileId.start), ]
|
||||
|
||||
# Remove durations when event spans more than one log file, since they are
|
||||
# not interpretable
|
||||
dat2[which(dat2$fileId.start != dat2$fileId.stop), "duration"] <- NA
|
||||
@@ -29,13 +30,12 @@ dat2[which(dat2$fileId.start != dat2$fileId.stop), "duration"] <- NA
|
||||
# Remove all events that do not have a `date.start`
|
||||
dat2 <- dat2[!is.na(dat2$date.start), ]
|
||||
rownames(dat2) <- NULL
|
||||
# TODO: Throw warning about this
|
||||
|
||||
#summary(dat2)
|
||||
|
||||
# Add case variable
|
||||
# Add case variable ######################################################
|
||||
dat3 <- add_case(dat2)
|
||||
|
||||
# Add event ID
|
||||
# Add event ID ###########################################################
|
||||
dat3$eventId <- seq_len(nrow(dat3))
|
||||
dat3 <- dat3[, c("fileId.start", "fileId.stop", "eventId", "case",
|
||||
"trace", "glossar", "event", "artwork",
|
||||
@@ -46,17 +46,19 @@ dat3 <- dat3[, c("fileId.start", "fileId.stop", "eventId", "case",
|
||||
"scaleSize", "rotation.start", "rotation.stop",
|
||||
"rotationDegree")]
|
||||
|
||||
# Add trace for move events
|
||||
# Add trace for move events ##############################################
|
||||
dat4 <- add_trace_moves(dat3)
|
||||
|
||||
# Add topics: file names and topics
|
||||
# Add topics: file names and topics ######################################
|
||||
artworks <- unique(dat4$artwork)
|
||||
topics <- extract_topics(artworks, pattern = paste0(artworks, ".xml"),
|
||||
path = "../data/ContentEyevisit/eyevisit_cards_light/")
|
||||
|
||||
dat5 <- add_topic(dat4, topics = topics)
|
||||
|
||||
# Export data
|
||||
# TODO: Replace artwork with informative strings
|
||||
|
||||
# Export data ############################################################
|
||||
write.table(dat5, "../data/event_logfiles.csv", sep = ";",
|
||||
row.names = FALSE)
|
||||
|
||||
|
||||
@@ -40,14 +40,11 @@ add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") {
|
||||
load(glossar_dict)
|
||||
lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ]
|
||||
|
||||
head(subdata2[, c("artwork", "event", "popup", "trace")], 20)
|
||||
|
||||
inside <- glossar_files[glossar_files %in%
|
||||
lut[sapply(lut$artwork, length) == 1,
|
||||
"glossar_file"]]
|
||||
single_art <- unlist(lut[lut$glossar_file %in% inside, "artwork"])
|
||||
|
||||
|
||||
for (file in lut$glossar_file) {
|
||||
|
||||
artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])
|
||||
|
||||
Reference in New Issue
Block a user