Initialized repository

This commit is contained in:
Nora Wickelmaier 2023-06-26 10:30:07 +02:00
commit c6076f3d34
3 changed files with 398 additions and 0 deletions

135
code/01_parse-logfiles.R Normal file
View File

@ -0,0 +1,135 @@
#' ---
#' title: "Preprocessing raw log files"
#' author: "Nora Wickelmaier"
#' date: "`r Sys.Date()`"
#' output:
#' html_document:
#' toc: true
#' toc_float: true
#' pdf_document:
#' toc: true
#' number_sections: true
#' geometry: margin = 2.5cm
#' ---
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
#' # Preprocessing raw log files into data frame
#' The following events can be extracted from the log files:
#'
#' ```
#' LogEntry classes:
#' TRANSFORM_START: "Transform start" --> "Transformation Start" in Tool
#' TRANSFORM_STOP: "Transform stop"
#' START_APPLICATION: "Start Application"
#' SHOW_APPLICATION: "Show Application"
#' SHOW_INFO: "Show Info" --> "Flip Card" in Tool
#' SHOW_FRONT: "Show Front"
#' SHOW_POPUP: "ShowPopup" --> "Show Popup" in Tool
#' HIDE_POPUP: "HidePopup"
#' ARTWORK: "Artwork" --> "Show Topic" in Tool
#' ```
#' Choose which folders with raw log files should be included:
folders <- c(
  "_2016",
  "_2017a",
  "_2017b",
  "_2018"
)
dirpaths <- paste0("../data/HAUM Logs/", folders)
# `pattern` is a regular expression, not a shell glob: select exactly those
# files whose name ends in ".log"
fnames <- dir(dirpaths, pattern = "\\.log$", full.names = TRUE)
# Fail early with a clear message (e.g., when run from the wrong working
# directory) instead of with an obscure error further down
if (length(fnames) == 0) {
  stop("No log files found in: ", paste(dirpaths, collapse = ", "),
       call. = FALSE)
}
# TODO: Enter all data
length(fnames)
head(fnames)
# Read every file into a character vector with one element per log line
logs <- lapply(fnames, readLines)
# Number of lines per file, used to repeat the file name for each of its lines
nlog <- lengths(logs)
# One row per log line: source file and raw line
dat <- data.frame(fileid = rep(fnames, nlog), logs = unlist(logs),
                  stringsAsFactors = FALSE)
head(dat$logs)
#' Remove corrupted lines
# Reading the raw files produced these warnings:
#   incomplete final line found on '_2016/2016_11_18-11_31_0.log'
#   incomplete final line found on '_2016/2016_11_18-11_38_30.log'
#   incomplete final line found on '_2016/2016_11_18-11_40_36.log'
## --> these files have a last line that looks like a binary entry??
# Related check quoted from LogEntry.as:
#   //pm: inserted this check to account for some broken logfiles
#   if (metaData[1] == null){
#   trace("corrupt line... still do not know how these came to happen.");
# Corrupt lines show up as empty strings ("") and are dropped here; d1 and d2
# are the row counts before and after and are used in the report text below.
d1 <- nrow(dat)
is_nonempty <- dat$logs != ""
dat <- dat[which(is_nonempty), , drop = FALSE]
d2 <- nrow(dat)
#' The files contain `r d1-d2` corrupt lines that were removed from the data.
#' ### Extract relevant infos
# The regular expressions below pick three pieces out of every raw line:
#   * date:      content of the first "[...]" group
#   * timestamp: content of the second "[...]" group
#   * action:    everything after "EyeVisit, "
# gsub() is vectorized, so it is applied to the whole column in one call.
date <- gsub("^\\[(.*)\\], \\[.*$", "\\1", dat$logs)
timestamp <- gsub("^\\[.*\\], \\[(.*)\\].*$", "\\1", dat$logs)
action <- gsub("^.*EyeVisit, (.*):*.*$", "\\1", dat$logs)

# `action` is a ":"-separated record; split it once and reuse the pieces.
action_fields <- strsplit(action, ":")

# Return the i-th ":"-separated field of every action (NA where it is missing).
action_field <- function(i) {
  vapply(action_fields, function(x) x[i], character(1))
}

events <- action_field(1)
topics <- action_field(2)
# Third field: either comma-separated move values or a card action
details <- action_field(3)
card <- as.numeric(action_field(4))

# Move values: the first four comma-separated entries of the third field,
# converted to an n x 4 numeric matrix. Rows with fewer than four entries are
# padded with NA (instead of being filled up by recycling their own values).
# ATTENTION: as.numeric() forces NAs for "OpenCard" and "CloseCard"
move_fields <- vapply(strsplit(details, ","), function(x) x[1:4],
                      character(4))
moves <- matrix(as.numeric(t(move_fields)), ncol = 4)

# For "Artwork" events the third field holds the card action; append it to the
# event name
is_artwork <- grepl("Artwork", events)
card_action <- trimws(details[is_artwork])
events[is_artwork] <- paste("Artwork", card_action, sep = "/")
# Convert the ":"-separated timestamp into one number: element 4 is added as
# is, element 3 is multiplied by 1000 and element 2 by 1000 * 60.
# NOTE(review): the first timestamp element is never used -- confirm that
# dropping it is intended.
ts_elements <- strsplit(timestamp, ":")
ts_element <- function(k) {
  as.numeric(vapply(ts_elements, function(p) p[k], character(1)))
}
time_ms <- ts_element(4) +
  ts_element(3) * 1000 +
  ts_element(2) * 1000 * 60

# Add the parsed variables to the data frame
dat$date <- lubridate::parse_date_time(date, "bdyHMSOp")
dat$time_ms <- time_ms
dat$event <- events

# `topics` has the form "<artwork>/<popup>"
topic_parts <- strsplit(topics, "/")
dat$artwork <- trimws(vapply(topic_parts, function(p) p[1], character(1)))
dat$popup <- vapply(topic_parts, function(p) p[2], character(1))
dat$card <- card

# The four columns of `moves`, in this order
move_cols <- c("x", "y", "scale", "rotation")
for (j in seq_along(move_cols)) {
  dat[[move_cols[j]]] <- moves[, j]
}
# Inspect the result; the first two columns (file name and raw log line) are
# left out of the preview
str(dat)
head(dat[, -(1:2)], n = 20)
## TODO: Replace artwork and popup numbers with informative strings
# Store the parsed data as a semicolon-separated file without quoting
write.table(dat,
            file = "../data/rawdata_logfiles.csv",
            sep = ";",
            quote = FALSE,
            row.names = FALSE)

164
code/02_preprocessing.R Normal file
View File

@ -0,0 +1,164 @@
#' ---
#' title: "Preprocessing log files"
#' author: "Nora Wickelmaier"
#' date: "`r Sys.Date()`"
#' output:
#' html_document:
#' toc: true
#' toc_float: true
#' pdf_document:
#' toc: true
#' number_sections: true
#' geometry: margin = 2.5cm
#' ---
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
# LogEntry classes:
# TRANSFORM_START: "Transform start" --> "Transformation Start" in Tool
# TRANSFORM_STOP: "Transform stop"
# START_APPLICATION: "Start Application"
# SHOW_APPLICATION: "Show Application"
# SHOW_INFO: "Show Info" --> "Flip Card" in Tool
# SHOW_FRONT: "Show Front"
# SHOW_POPUP: "ShowPopup" --> "Show Popup" in Tool
# HIDE_POPUP: "HidePopup"
# ARTWORK: "Artwork" --> "Show Topic" in Tool
#' # Read data
# The file is written by 01_parse-logfiles.R with `quote = FALSE`, so quoting
# and comment handling are switched off here as well; otherwise a quote or "#"
# character inside a raw log line would be interpreted instead of read.
dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";", header = TRUE,
                   quote = "", comment.char = "")
#' # Remove irrelevant events
#' ## Remove Start Application and Show Application
dat <- dat0[!(dat0$event %in% c("Start Application", "Show Application")), ]
dat$logs <- NULL # do not need original log files
# create date object; the dates were parsed with lubridate::parse_date_time(),
# which uses UTC by default, and the file stores no time zone, so read them
# back as UTC instead of the local time zone of the current machine
dat$date <- as.POSIXct(dat$date, tz = "UTC")
#' ## Remove "button presses"
# Sort data frame by artwork and date
dat <- dat[order(dat$artwork, dat$date), ]
# remove "Transform start" and "Transform stop" following directly each
# other, since I do not know how to interpret them as events
# What the code does: a "Transform start" is dropped when the very next row is
# another "Transform start"; likewise for "Transform stop".
# NOTE(review): confirm that this (rather than a start directly followed by a
# stop) is the intended definition of a "button press".
id_start <- which(dat$event == "Transform start")
id_stop <- which(dat$event == "Transform stop")
# diff() returns one element less than its input. which() turns the comparison
# into positions, so the shorter logical vector is not recycled onto the last
# index.
id_rm_start <- id_start[which(diff(id_start) == 1)]
id_rm_stop <- id_stop[which(diff(id_stop) == 1)]
# Negative indexing with an empty vector would select zero rows, so only
# subset when there is something to remove
id_rm <- c(id_rm_start, id_rm_stop)
if (length(id_rm) > 0) {
  dat <- dat[-id_rm, ]
}
rownames(dat) <- NULL
# Row positions of the remaining "Transform start" / "Transform stop" events
id_start2 <- which(dat$event == "Transform start")
id_stop2 <- which(dat$event == "Transform stop")
# Surplus of starts over stops
length(id_start2) - length(id_stop2)
# 340 --> "starts too many"
# remove "Transform start" and "Transform stop" following directly each
# other (but with events in between!)
# Work on copies so that the removed positions can be recovered below by
# comparing against id_start2 / id_stop2
id_start_new <- id_start2
id_stop_new <- id_stop2
# For each i the (i-1)-th start and stop are compared:
#   * start i-1 and start i both lie before stop i-1 --> drop start i-1
#   * start i-1 and start i both lie after stop i-1  --> drop stop i-1
# NOTE(review): the loop range is fixed from the initial length while both
# vectors can shrink inside the loop, so later iterations may index past the
# end (NA), and the element that moves into position i-1 after a deletion is
# not examined again -- verify.
for (i in 2:length(id_start_new)) {
if (id_start_new[i-1] < id_stop_new[i-1] & id_start_new[i] < id_stop_new[i-1]) {
id_start_new <- id_start_new[-(i-1)]
} else if (id_start_new[i-1] > id_stop_new[i-1] & id_start_new[i] > id_stop_new[i-1]) {
id_stop_new <- id_stop_new[-(i-1)]
}
}
# Number of starts and stops dropped by the loop
length(id_start2) - length(id_start_new)
length(id_stop2) - length(id_stop_new)
# Pair the remaining starts and stops by position; data.frame() requires both
# vectors to have the same length here
ids <- data.frame(start = id_start_new, stop = id_stop_new)
# Number of rows between a start and the stop it is paired with
ids$diff <- ids$stop - ids$start
table(ids$diff)
# remove "Transform start" and "Transform stop" around other events
# (the positions that were dropped in the loop above)
id_rm_start2 <- id_start2[!(id_start2 %in% id_start_new)]
id_rm_stop2 <- id_stop2[!(id_stop2 %in% id_stop_new)]
# TODO: It still does not work correctly:
# Hard-coded rows kept as a worked example of the problem (recorded output
# below)
dat[64764:64769,]
# time_ms event artwork popup x y scale rotation
# 64764 473081 Transform start 052 052.xml 1958.65 1505.75 0.8234455 -0.1351998
# 64765 474226 Show Info 052 052.xml NA NA NA NA
# 64766 475735 Transform start 052 052.xml 1988.25 1625.25 0.9927645 2.4527958
# 64767 475739 Transform stop 052 052.xml 1988.25 1625.25 0.9927645 2.4527958
# 64768 479326 Artwork 052 052.xml NA NA NA NA
# 64769 479751 Transform stop 052 052.xml 1660.90 1883.20 0.8074586 29.0875534
# --> but no idea how to find these cases in an automated way...
# Drop the unpaired starts and stops. Negative indexing with an empty vector
# would select zero rows, so only subset when there is something to remove.
id_rm2 <- c(id_rm_start2, id_rm_stop2)
if (length(id_rm2) > 0) {
  dat <- dat[-id_rm2, ]
}
# --> Every start ends with a stop now (but not necessarily the correct one!)
# Chronological copy of the data with the difference in time_ms between
# consecutive rows (NA for the first row)
chrono <- order(dat$date, dat$time_ms)
dat1 <- dat[chrono, ]
dat1$time_diff <- c(NA, diff(dat1$time_ms))
# Differences between 1000 and 4000 (exclusive), per day
in_window <- dat1$time_diff > 1000 & dat1$time_diff < 4000
boxplot(time_diff ~ as.Date(date), data = dat1[in_window, ])
# Distribution of time_ms for each event type
boxplot(time_ms ~ event, data = dat1)
#' ## Plots
# Number of events per day, for all event types
counts <- table(as.Date(dat$date), dat$event)
lattice::barchart(counts, auto.key = TRUE)
# The same, restricted to the events listed in `start_events`
start_events <- c("Transform start", "Show Info", "ShowPopup", "Artwork/OpenCard")
is_start <- dat$event %in% start_events
counts <- table(as.Date(dat$date[is_start]), dat$event[is_start])
lattice::barchart(counts, auto.key = TRUE)
# TODO: Do I want to "collapse" the data frame in a way, that I only have
# one event for each "set", meaning
#
# * Transform start + Transform stop --> Transform
# * Artwork/OpenCard + Artwork/CloseCard --> Show Subcard
# * ShowPopup + HidePopup --> Show Popup
# * Show Info + Show Front --> Flip Card
# (see above ;))
#
# Then I would have meaningful variables like duration, distance, degree of
# rotation, size of scaling, selection of Subcard etc.
# This means that I would have to delete all "unclosed" events.
# Create a data frame with
# case event attributes (can differ for different events)
# ??
# Is `artwork` my case? Or `artwork` per day? Or `artwork` per some other
# unit??? Maybe look at differences between timestamps separately for
# `artwork`? And identify "new observational unit" this way?
#
# Definition: (???)
# 1. Touching a new `artwork` corresponds to "observational unit change"
# 2. Time interval of XX min within one `artwork` on the same day
# corresponds to "observational unit change"
# id activity timestamp
# Split data frame in list of data frame which all correspond to one
# artwork
# dat_art <- split(dat, dat$artwork)
## --> Maybe need it at some point?
#' # Problems
#' * Opening and closing of events cannot be identified unambiguously; it
#' can happen that the wrong tags have been put together (e.g., Transform
#' start and Transform stop); therefore, durations etc. are only heuristic

99
code/03_specs.R Normal file
View File

@ -0,0 +1,99 @@
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
library(lubridate)
# The file is written by 01_parse-logfiles.R with `quote = FALSE`, so quoting
# and comment handling are switched off here as well; otherwise a quote or "#"
# character inside a raw log line would be interpreted instead of read.
dat <- read.table("../data/rawdata_logfiles.csv", header = TRUE, sep = ";",
                  quote = "", comment.char = "")
# Fix the order of the event types for tables and plots
event_levels <- c("Start Application",
                  "Show Application",
                  "Transform start",
                  "Transform stop",
                  "Show Info",
                  "Show Front",
                  "Artwork/OpenCard",
                  "Artwork/CloseCard",
                  "ShowPopup", "HidePopup")
dat$event <- factor(dat$event, levels = event_levels)
#dat$logs <- NULL # do not need original log files
# create date object; the dates were parsed with lubridate::parse_date_time(),
# which uses UTC by default, and the file stores no time zone, so read them
# back as UTC instead of the local time zone of the current machine
dat$date <- as.POSIXct(dat$date, tz = "UTC")
str(dat)
head(dat)
#' Log files between December 2016 and December 2018
event_time <- dat[["date"]]
range(event_time)
#' Number of log entries per year
table(year(event_time))
#' Number of events total and per year
table(dat[["event"]])
# Recorded output:
# Start Application Show Application
# 1679 1656
# Transform start Transform stop
# 2119815 1944618
# Show Info Show Front
# 71955 71043
# Artwork/OpenCard Artwork/CloseCard
# 64990 56750
# ShowPopup HidePopup
# 44070 43813
# --> more "openups" than "closes" (not surprisingly)
table(dat[["event"]], year(event_time))
#' Number of log entries per week day
table(weekdays(event_time))
#' Number of days
length(unique(as.Date(event_time)))
#' Which artworks are looked at most often
artwork_freq <- table(dat[["artwork"]])
artwork_freq
lattice::barchart(artwork_freq)
#' ## Plots from Visualization Tool
# Restrict the data to the year 2016
in_2016 <- year(dat$date) == 2016
dat16 <- dat[in_2016, ]
# Number of events per day, for all event types
counts <- table(as.Date(dat16$date), dat16$event)
lattice::barchart(counts, auto.key = TRUE)
# The same, restricted to the events listed in `start_events`
start_events <- c("Transform start", "Show Info", "ShowPopup", "Artwork/OpenCard")
is_start <- dat16$event %in% start_events
counts <- table(as.Date(dat16$date[is_start]), dat16$event[is_start])
counts
lattice::barchart(counts, auto.key = TRUE)
### Example for log file in order to show structure
# Write rows 240 to 660 and columns 3 to 12 to a text file
example_rows <- 240:660
example_cols <- 3:12
write.table(dat[example_rows, example_cols], file = "set.txt", quote = FALSE)
# is then edited by hand to have all possible events...
# Rows whose raw log lines are written out
select <- c(243:246, 253:257, 259, 260, 262, 265:271, 287:289, 294:296,
            303:307, 318:326, 357:363, 408:418, 439:441, 516, 518, 519,
            587:595, 652:657)
writeLines(dat$logs[select], con = "set_logs.txt")
# Frequency of each difference between consecutive dates; only differences
# that occur more than 2000 times are plotted
tab <- table(diff(dat$date)) # uninformative on raw log data!
plot(tab[tab > 2000])