#' ---
#' title: "Preprocessing raw log files"
#' author: "Nora Wickelmaier"
#' date: "`r Sys.Date()`"
#' output:
#'   html_document:
#'     default
#'   pdf_document:
#'     toc: true
#'     number_sections: true
#' geometry: margin = 2.5cm
#' ---

# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")

#+ setup, include = FALSE
knitr::opts_chunk$set(warning = FALSE, message = FALSE)

#' The following events can be extracted from the log files:
#'
#' ```
#' LogEntry classes:
#' TRANSFORM_START: "Transform start" --> "Transformation Start" in Tool
#' TRANSFORM_STOP: "Transform stop"
#' START_APPLICATION: "Start Application"
#' SHOW_APPLICATION: "Show Application"
#' SHOW_INFO: "Show Info" --> "Flip Card" in Tool
#' SHOW_FRONT: "Show Front"
#' SHOW_POPUP: "ShowPopup" --> "Show Popup" in Tool
#' HIDE_POPUP: "HidePopup"
#' ARTWORK: "Artwork" --> "Show Topic" in Tool
#' ```

#' Choose which folders with raw log files should be included:

#folders <- "all"
folders <- "_2016b"

dirpaths <- paste0("../data/haum_logs_2016-2023/", folders)

# `pattern` is a regular expression, not a shell glob: escape the dot and
# anchor at the end so that only files ending in ".log" are picked up.
fnames <- dir(dirpaths, pattern = "\\.log$", full.names = TRUE)
length(fnames)
head(fnames)

# Need to left pad file names. If I do not do this, the sorting of the
# timestamps will be off and I get negative durations later on since the
# wrong events get closed.
# Left-pad the numeric fields of log file names with zeros.
#
# File names look like "2016_11_18-11_31_0.log" (presumably
# year_month_day-hour_minute_second -- confirm against the logging tool).
# Every field after the first is padded to two digits so that sorting the
# names alphabetically is the same as sorting them by their numeric fields.
#
# x:       character vector of log file paths or file names
# returns: character vector of the same length as `x` holding the padded
#          file names without any directory part (character(0) for empty `x`)
leftpad_fnames <- function(x) {
  pad2 <- function(v) sprintf("%02d", as.numeric(v))

  # basename() drops the directory regardless of which folder a file is in,
  # so the function no longer depends on the global `dirpaths`.
  stems <- sub("\\.log$", "", basename(x))
  fields <- strsplit(stems, "_", fixed = TRUE)

  vapply(fields, function(f) {
    # the third "_"-separated field holds two numbers joined by "-"
    mid <- strsplit(f[3], "-", fixed = TRUE)[[1]]
    paste0(f[1], "_", pad2(f[2]), "_", pad2(mid[1]), "-", pad2(mid[2]), "_",
           pad2(f[4]), "_", pad2(f[5]), ".log")
  }, character(1), USE.NAMES = FALSE)
}

logs <- lapply(fnames, readLines)
# lengths() always returns an integer vector, also when no files were found
nlog <- lengths(logs)
dat <- data.frame(fileid = rep(leftpad_fnames(fnames), nlog),
                  logs = as.character(unlist(logs)),
                  stringsAsFactors = FALSE)
head(dat$logs)

#' Remove corrupted lines

# Warning messages:
# incomplete final line found on '_2016/2016_11_18-11_31_0.log'
# incomplete final line found on '_2016/2016_11_18-11_38_30.log'
# incomplete final line found on '_2016/2016_11_18-11_40_36.log'
# ...
## --> files have a last line that looks like a binary entry??
# From LogEntry.as:
# //pm: inserted this check to account for some broken logfiles
# if (metaData[1] == null){
#   trace("corrupt line... still do not know how these came to happen.");

# corrupt lines are "" and need to be removed
d1 <- nrow(dat)
dat <- subset(dat, dat$logs != "")
d2 <- nrow(dat)

#' The files contain `r d1-d2` corrupt lines that were removed from the data.
#'
#' ### Extract relevant infos

# gsub() is vectorised over its input, so each field is pulled out of all log
# lines with a single call instead of one call per line.
date <- gsub("^\\[(.*)\\], \\[.*$", "\\1", dat$logs)
timestamp <- gsub("^\\[.*\\], \\[(.*)\\].*$", "\\1", dat$logs)
action <- gsub("^.*EyeVisit, (.*):*.*$", "\\1", dat$logs)

# Split the action string once and pick the i-th ":"-separated field of every
# entry; entries with fewer fields yield NA.
action_parts <- strsplit(action, ":")
nth_field <- function(parts, i) vapply(parts, `[`, character(1), i)

events <- nth_field(action_parts, 1)
topics <- nth_field(action_parts, 2)
third <- nth_field(action_parts, 3)

# Always build an n x 4 matrix (one row per log line, missing fields are NA)
# so that column indexing further down also works for a single log line and
# short entries are not filled up by recycling.
move_chr <- vapply(strsplit(third, ","), `[`, character(4), 1:4)
moves <- matrix(as.numeric(t(move_chr)), ncol = 4)
# ATTENTION: as.numeric() forces NAs for "OpenCard" and "CloseCard"

artwork_idx <- grep("Artwork", events)
card_action <- trimws(third[artwork_idx])
card <- as.numeric(nth_field(action_parts, 4))
events[artwork_idx] <- paste("Artwork", card_action, sep = "/")

ts_elements <- strsplit(timestamp, ":")
ts_field <- function(i) as.numeric(nth_field(ts_elements, i))
# NOTE(review): only fields 2-4 of the timestamp enter time_ms; field 1 is
# not used -- confirm that this is intended.
time_ms <- ts_field(4) + ts_field(3) * 1000 + ts_field(2) * 1000 * 60
# TODO: Maybe change to simple gsub()...
# --> This is theoretically sound but a lot of lines for just removing ":"

# Attach the extracted fields as columns of the data frame.
dat[["date"]] <- lubridate::parse_date_time(date, "bdyHMSOp")
dat[["time_ms"]] <- time_ms
dat[["event"]] <- events

# topics have the form "<artwork>/<popup>"; split them once and reuse
topic_parts <- strsplit(topics, "/")
dat[["artwork"]] <- trimws(sapply(topic_parts, `[`, 1))
dat[["popup"]] <- sapply(topic_parts, `[`, 2)

dat[["card"]] <- card
dat[["x"]] <- moves[, 1]
dat[["y"]] <- moves[, 2]
dat[["scale"]] <- moves[, 3]
dat[["rotation"]] <- moves[, 4]

# remove original log files from data so file becomes smaller
dat[["logs"]] <- NULL

str(dat)
head(dat, 20)

# sort by fileid, since reading in by file names does not make sense because of
# missing left zero padding
row_order <- with(dat, order(fileid, date, time_ms))
dat <- dat[row_order, ]

## TODO: Replace artwork and popup numbers with informative strings

#' ### Save data frame

write.table(
  dat,
  file = "../data/rawdata_logfiles_small.csv",
  sep = ";",
  quote = FALSE,
  row.names = FALSE
)