From c6076f3d3477103317dec5bc6a4482957fac1c08 Mon Sep 17 00:00:00 2001
From: nwickel
Date: Mon, 26 Jun 2023 10:30:07 +0200
Subject: [PATCH] Initialized repository

---
 code/01_parse-logfiles.R | 135 ++++++++++++++++++++++++++++++++
 code/02_preprocessing.R  | 164 +++++++++++++++++++++++++++++++++++++++
 code/03_specs.R          |  99 +++++++++++++++++++++++
 3 files changed, 398 insertions(+)
 create mode 100644 code/01_parse-logfiles.R
 create mode 100644 code/02_preprocessing.R
 create mode 100644 code/03_specs.R

diff --git a/code/01_parse-logfiles.R b/code/01_parse-logfiles.R
new file mode 100644
index 0000000..84dab52
--- /dev/null
+++ b/code/01_parse-logfiles.R
@@ -0,0 +1,135 @@
+#' ---
+#' title: "Preprocessing raw log files"
+#' author: "Nora Wickelmaier"
+#' date: "`r Sys.Date()`"
+#' output:
+#'   html_document:
+#'     toc: true
+#'     toc_float: true
+#'   pdf_document:
+#'     toc: true
+#'     number_sections: true
+#' geometry: margin = 2.5cm
+#' ---
+
+# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
+
+
+#' # Preprocessing raw log files into data frame
+
+#' The following events can be extracted from the log files:
+#'
+#' ```
+#' LogEntry classes:
+#' TRANSFORM_START: "Transform start" --> "Transformation Start" in Tool
+#' TRANSFORM_STOP: "Transform stop"
+#' START_APPLICATION: "Start Application"
+#' SHOW_APPLICATION: "Show Application"
+#' SHOW_INFO: "Show Info" --> "Flip Card" in Tool
+#' SHOW_FRONT: "Show Front"
+#' SHOW_POPUP: "ShowPopup" --> "Show Popup" in Tool
+#' HIDE_POPUP: "HidePopup"
+#' ARTWORK: "Artwork" --> "Show Topic" in Tool
+#' ```
+
+#' Choose which folders with raw log files should be included:
+
+folders <- c(
+    "_2016"
+  , "_2017a"
+  , "_2017b"
+  , "_2018"
+  )
+
+dirpaths <- paste0("../data/HAUM Logs/", folders)
+
+fnames <- dir(dirpaths, pattern = "\\.log$", full.names = TRUE)  # regex, not a glob
+# TODO: Enter all data
+length(fnames)
+head(fnames)
+
+logs <- lapply(fnames, readLines)  # one character vector of lines per file
+nlog <- sapply(logs, length)       # number of lines in each file
+dat <- data.frame(fileid = rep(fnames, nlog), logs = unlist(logs))
+head(dat$logs)
+
+#' Remove corrupted lines
+
+# Warning messages:
+# incomplete final line found on '_2016/2016_11_18-11_31_0.log'
+# incomplete final line found on '_2016/2016_11_18-11_38_30.log'
+# incomplete final line found on '_2016/2016_11_18-11_40_36.log'
+
+## --> files have a last line that looks like a binary entry??
+
+# From LogEntry.as:
+# //pm: inserted this check to account for some broken logfiles
+# if (metaData[1] == null){
+# trace("corrupt line... still do not know how these came to happen.");
+
+# corrupt lines are "" and need to be removed
+d1 <- nrow(dat)
+dat <- subset(dat, dat$logs != "")
+d2 <- nrow(dat)
+
+#' The files contain `r d1-d2` corrupt lines that were removed from the data.
+
+#' ### Extract relevant infos
+
+date <- gsub(  # gsub() is vectorised over x, no per-line sapply() needed
+  pattern = "^\\[(.*)\\], \\[.*$",
+  replacement = "\\1",
+  x = dat$logs)
+
+timestamp <- gsub(
+  pattern = "^\\[.*\\], \\[(.*)\\].*$",
+  replacement = "\\1",
+  x = dat$logs)
+
+action <- gsub(
+  pattern = "^.*EyeVisit, (.*):*.*$",
+  replacement = "\\1",
+  x = dat$logs)
+
+events <- sapply(strsplit(action, ":"), function(x) x[1])  # 1st ":"-separated field
+
+topics <- sapply(strsplit(action, ":"), function(x) x[2])  # 2nd ":"-separated field
+
+moves <- apply(do.call(rbind,
+                       strsplit(sapply(strsplit(action, ":"), function(x) x[3]),
+                                ",")),
+               2, as.numeric)
+# ATTENTION: as.numeric() forces NAs for "OpenCard" and "CloseCard"
+
+card_action <- trimws(sapply(strsplit(action, ":"),
+                             function(x) x[3])[grep("Artwork", events)])
+
+card <- as.numeric(sapply(strsplit(action, ":"), function(x) x[4]))
+
+events[grep("Artwork", events)] <- paste("Artwork", card_action, sep = "/")
+
+ts_elements <- strsplit(timestamp, ":")  # NOTE(review): element 1 is never used below - confirm
+time_ms <- as.numeric(sapply(ts_elements, function(x) x[4])) +
+  as.numeric(sapply(ts_elements, function(x) x[3])) * 1000 +
+  as.numeric(sapply(ts_elements, function(x) x[2])) * 1000 * 60
+
+dat$date <- lubridate::parse_date_time(date, "bdyHMSOp")
+dat$time_ms <- time_ms
+dat$event <- events
+dat$artwork <- trimws(sapply(strsplit(topics, "/"), function(x) x[1]))
+dat$popup <- sapply(strsplit(topics, "/"), function(x) x[2])
+dat$card <- card
+dat$x <- moves[,1]
+dat$y <- moves[,2]
+dat$scale <- moves[,3]
+dat$rotation <- moves[,4]
+
+str(dat)
+
+head(dat[, 3:ncol(dat)], 20)
+
+## TODO: Replace artwork and popup numbers with informative strings
+
+write.table(dat, "../data/rawdata_logfiles.csv",
+            sep = ";", quote = FALSE, row.names = FALSE)
+
diff --git a/code/02_preprocessing.R b/code/02_preprocessing.R
new file mode 100644
index 0000000..888ff86
--- /dev/null
+++ b/code/02_preprocessing.R
@@ -0,0 +1,164 @@
+#' ---
+#' title: "Preprocessing log files"
+#' author: "Nora Wickelmaier"
+#' date: "`r Sys.Date()`"
+#' output:
+#'   html_document:
+#'     toc: true
+#'     toc_float: true
+#'   pdf_document:
+#'     toc: true
+#'     number_sections: true
+#' geometry: margin = 2.5cm
+#' ---
+
+# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
+
+# LogEntry classes:
+# TRANSFORM_START: "Transform start" --> "Transformation Start" in Tool
+# TRANSFORM_STOP: "Transform stop"
+# START_APPLICATION: "Start Application"
+# SHOW_APPLICATION: "Show Application"
+# SHOW_INFO: "Show Info" --> "Flip Card" in Tool
+# SHOW_FRONT: "Show Front"
+# SHOW_POPUP: "ShowPopup" --> "Show Popup" in Tool
+# HIDE_POPUP: "HidePopup"
+# ARTWORK: "Artwork" --> "Show Topic" in Tool
+
+#' # Read data
+
+dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";", header = TRUE)
+
+#' # Remove irrelevant events
+
+#' ## Remove Start Application and Show Application
+
+dat <- subset(dat0, !(dat0$event %in% c("Start Application", "Show Application")))
+dat$logs <- NULL # do not need original log files
+dat$date <- as.POSIXct(dat$date) # create date object
+
+#' ## Remove "button presses"
+
+# Sort data frame by artwork and date
+dat <- dat[order(dat$artwork, dat$date), ]
+
+# remove "Transform start" and "Transform stop" following directly each
+# other, since I do not know how to interpret them as events
+id_start <- which(dat$event == "Transform start")
+id_stop <- which(dat$event == "Transform stop")
+
+# diff() returns one element less than its input: pad the mask with FALSE
+# so that it is not silently recycled onto the last index
+id_rm_start <- id_start[c(diff(id_start) == 1, FALSE)]
+id_rm_stop <- id_stop[c(diff(id_stop) == 1, FALSE)]
+dat <- dat[!(seq_len(nrow(dat)) %in% c(id_rm_start, id_rm_stop)), ]  # keeps all rows if nothing is flagged
+rownames(dat) <- NULL
+
+id_start2 <- which(dat$event == "Transform start")
+id_stop2 <- which(dat$event == "Transform stop")
+
+length(id_start2) - length(id_stop2)
+# 340 --> "starts too many"
+
+# remove "Transform start" and "Transform stop" following directly each
+# other (but with events in between!)
+id_start_new <- id_start2
+id_stop_new <- id_stop2
+
+for (i in 2:length(id_start_new)) {  # NOTE(review): both vectors can shrink while i runs to the initial length - confirm
+  if (id_start_new[i-1] < id_stop_new[i-1] & id_start_new[i] < id_stop_new[i-1]) {
+    id_start_new <- id_start_new[-(i-1)]
+  } else if (id_start_new[i-1] > id_stop_new[i-1] & id_start_new[i] > id_stop_new[i-1]) {
+    id_stop_new <- id_stop_new[-(i-1)]
+  }
+}
+
+length(id_start2) - length(id_start_new)
+length(id_stop2) - length(id_stop_new)
+
+ids <- data.frame(start = id_start_new, stop = id_stop_new)
+ids$diff <- ids$stop - ids$start
+
+table(ids$diff)
+
+# remove "Transform start" and "Transform stop" around other events
+
+id_rm_start2 <- id_start2[!(id_start2 %in% id_start_new)]
+id_rm_stop2 <- id_stop2[!(id_stop2 %in% id_stop_new)]
+
+# TODO: It still does not work correctly:
+dat[64764:64769,]
+#       time_ms           event artwork   popup       x       y     scale   rotation
+# 64764  473081 Transform start     052 052.xml 1958.65 1505.75 0.8234455 -0.1351998
+# 64765  474226       Show Info     052 052.xml      NA      NA        NA         NA
+# 64766  475735 Transform start     052 052.xml 1988.25 1625.25 0.9927645  2.4527958
+# 64767  475739  Transform stop     052 052.xml 1988.25 1625.25 0.9927645  2.4527958
+# 64768  479326         Artwork     052 052.xml      NA      NA        NA         NA
+# 64769  479751  Transform stop     052 052.xml 1660.90 1883.20 0.8074586 29.0875534
+
+# --> but no idea how to find these cases in an automated way...
+
+# logical mask: unlike dat[-c(...), ] it keeps all rows if nothing is flagged
+dat <- dat[!(seq_len(nrow(dat)) %in% c(id_rm_start2, id_rm_stop2)), ]
+# --> Every start ends with a stop now (but not necessarily the correct one!)
+
+dat1 <- dat[order(dat$date, dat$time_ms), ]
+dat1$time_diff <- c(NA, diff(dat1$time_ms))  # lag-1 difference over all rows in this order
+
+boxplot(time_diff ~ as.Date(date), dat1[dat1$time_diff > 1000 & dat1$time_diff < 4000, ])
+
+boxplot(time_ms ~ event, dat1)
+
+
+#' ## Plots
+
+counts <- table(as.Date(dat$date), dat$event)  # rows: calendar days, columns: events
+lattice::barchart(counts, auto.key = TRUE)
+
+# same chart, restricted to the events listed in start_events
+start_events <- c("Transform start", "Show Info", "ShowPopup", "Artwork/OpenCard")
+
+counts <- table(as.Date(dat$date[dat$event %in% start_events]),
+                dat$event[dat$event %in% start_events])
+lattice::barchart(counts, auto.key = TRUE)
+
+
+# TODO: Do I want to "collapse" the data frame in a way, that I only have
+# one event for each "set", meaning
+#
+# * Transform start + Transform stop --> Transform
+# * Artwork/OpenCard + Artwork/CloseCard --> Show Subcard
+# * ShowPopup + HidePopup --> Show Popup
+# * Show Info + Show Front --> Flip Card
+# (see above ;))
+#
+# Then I would have meaningful variables like duration, distance, degree of
+# rotation, size of scaling, selection of Subcard etc.
+# This means that I would have to delete all "unclosed" events.
+
+# Create a data frame with
+# case event attributes (can differ for different events)
+# ??
+# Is `artwork` my case? Or `artwork` per day? Or `artwork` per some other
+# unit??? Maybe look at differences between timestamps separately for
+# `artwork`? And identify "new observational unit" this way?
+#
+# Definition: (???)
+# 1. Touching a new `artwork` corresponds to "observational unit change"
+# 2. Time interval of XX min within one `artwork` on the same day
+# corresponds to "observational unit change"
+
+# id activity timestamp
+
+# Split data frame in list of data frame which all correspond to one
+# artwork
+# dat_art <- split(dat, dat$artwork)
+
+## --> Maybe need it at some point?
+
+#' # Problems
+
+#' * Opening and closing of events cannot be identified unambiguously; it
+#' can happen that the wrong tags have been put together (e.g., Transform
+#' start and Transform stop); therefore, durations etc. are only heuristic
+
diff --git a/code/03_specs.R b/code/03_specs.R
new file mode 100644
index 0000000..2a0a293
--- /dev/null
+++ b/code/03_specs.R
@@ -0,0 +1,99 @@
+# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
+
+library(lubridate)
+
+dat <- read.table("../data/rawdata_logfiles.csv", sep = ";", header = TRUE)
+dat$event <- factor(dat$event,
+  levels = c(
+    "Start Application", "Show Application",
+    "Transform start", "Transform stop",
+    "Show Info", "Show Front",
+    "Artwork/OpenCard", "Artwork/CloseCard",
+    "ShowPopup", "HidePopup"
+  )
+)
+
+
+#dat$logs <- NULL # do not need original log files
+dat$date <- as.POSIXct(dat$date) # turn the date column into a date-time object
+
+str(dat)
+head(dat)
+
+#' Log files between December 2016 and December 2018
+
+range(dat$date)
+
+#' Number of log files per year
+
+table(year(dat$date))
+
+#' Number of events total and per year
+
+table(dat$event)
+#  Start Application   Show Application
+#               1679               1656
+#    Transform start     Transform stop
+#            2119815            1944618
+#          Show Info         Show Front
+#              71955              71043
+#   Artwork/OpenCard  Artwork/CloseCard
+#              64990              56750
+#          ShowPopup          HidePopup
+#              44070              43813
+# --> more "openups" than "closes" (not surprisingly)
+
+table(dat$event, year(dat$date))
+
+#' Number of log files per week day
+
+table(weekdays(dat$date))
+
+#' Number of days
+
+length(unique(as.Date(dat$date)))
+
+#' Which artworks are looked at most often
+
+table(dat$artwork)
+lattice::barchart(table(dat$artwork))
+
+
+
+#' ## Plots from Visualization Tool
+
+dat16 <- dat[year(dat$date) == 2016, ]
+
+counts <- table(as.Date(dat16$date), dat16$event)
+lattice::barchart(counts, auto.key = TRUE)
+
+
+start_events <- c("Transform start", "Show Info", "ShowPopup", "Artwork/OpenCard")
+
+counts <- table(as.Date(dat16$date[dat16$event %in% start_events]),
+                dat16$event[dat16$event %in% start_events])
+counts
+lattice::barchart(counts, auto.key = TRUE)
+
+
+
+### Example for log file in order to show structure
+
+write.table(dat[240:660, 3:12], "set.txt", quote = FALSE)
+# the written file is then edited by hand to have all possible events...
+
+# hand-picked row numbers; consecutive rows are written as ranges
+select <- c(
+  243:246, 253:257, 259, 260, 262,
+  265:271, 287:289, 294:296,
+  303:307, 318:326, 357:363,
+  408:418, 439:441, 516, 518, 519,
+  587:595, 652:657
+)
+
+writeLines(dat[select, "logs"], "set_logs.txt")
+
+
+tab <- table(diff(dat$date)) # uninformative on raw log data!
+plot(tab[tab > 2000])
+