Initialized repository
This commit is contained in:
commit
c6076f3d34
135
code/01_parse-logfiles.R
Normal file
135
code/01_parse-logfiles.R
Normal file
@ -0,0 +1,135 @@
|
||||
#' ---
|
||||
#' title: "Preprocessing raw log files"
|
||||
#' author: "Nora Wickelmaier"
|
||||
#' date: "`r Sys.Date()`"
|
||||
#' output:
|
||||
#' html_document:
|
||||
#' toc: true
|
||||
#' toc_float: true
|
||||
#' pdf_document:
|
||||
#' toc: true
|
||||
#' number_sections: true
|
||||
#' geometry: margin = 2.5cm
|
||||
#' ---
|
||||
|
||||
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
|
||||
|
||||
|
||||
#' # Preprocessing raw log files into data frame
|
||||
|
||||
#' The following events can be extracted from the log files:
|
||||
#'
|
||||
#' ```
|
||||
#' LogEntry classes:
|
||||
#' TRANSFORM_START: "Transform start" --> "Transformation Start" in Tool
|
||||
#' TRANSFORM_STOP: "Transform stop"
|
||||
#' START_APPLICATION: "Start Application"
|
||||
#' SHOW_APPLICATION: "Show Application"
|
||||
#' SHOW_INFO: "Show Info" --> "Flip Card" in Tool
|
||||
#' SHOW_FRONT: "Show Front"
|
||||
#' SHOW_POPUP: "ShowPopup" --> "Show Popup" in Tool
|
||||
#' HIDE_POPUP: "HidePopup"
|
||||
#' ARTWORK: "Artwork" --> "Show Topic" in Tool
|
||||
#' ```
|
||||
|
||||
#' Choose which folders with raw log files should be included:
|
||||
|
||||
folders <- c(
|
||||
"_2016"
|
||||
, "_2017a"
|
||||
, "_2017b"
|
||||
, "_2018"
|
||||
)
|
||||
|
||||
dirpaths <- paste0("../data/HAUM Logs/", folders)
|
||||
|
||||
# `pattern` is a regular expression, not a shell glob: match file names that
# end in ".log" (the glob-style "*.log" also matched names such as "x_log").
fnames <- dir(dirpaths, pattern = "\\.log$", full.names = TRUE)
|
||||
# TODO: Enter all data
|
||||
length(fnames)
|
||||
head(fnames)
|
||||
|
||||
logs <- lapply(fnames, readLines)
|
||||
# Number of lines read from each file; lengths() always returns an integer
# vector, also when `logs` is empty (sapply() would return an empty list).
nlog <- lengths(logs)
|
||||
dat <- data.frame(fileid = rep(fnames, nlog), logs = unlist(logs))
|
||||
head(dat$logs)
|
||||
|
||||
#' Remove corrupted lines
|
||||
|
||||
# Warning messages:
|
||||
# incomplete final line found on '_2016/2016_11_18-11_31_0.log'
|
||||
# incomplete final line found on '_2016/2016_11_18-11_38_30.log'
|
||||
# incomplete final line found on '_2016/2016_11_18-11_40_36.log'
|
||||
|
||||
## --> files have a last line that looks like a binary entry??
|
||||
|
||||
# From LogEntry.as:
|
||||
# //pm: inserted this check to account for some broken logfiles
|
||||
# if (metaData[1] == null){
|
||||
# trace("corrupt line... still do not know how these came to happen.");
|
||||
|
||||
# corrupt lines are "" and need to be removed
|
||||
d1 <- dim(dat)[1]
|
||||
dat <- subset(dat, dat$logs != "")
|
||||
d2 <- dim(dat)[1]
|
||||
|
||||
#' The files contain `r d1-d2` corrupt lines that were removed from the data.
|
||||
|
||||
#' ### Extract relevant infos
|
||||
|
||||
# Keep only the capture group: the text between the opening "[" of a log line
# and the "], [" separator. gsub() is vectorised over `x`, so the whole column
# is processed in one call instead of one call per line via sapply().
date <- gsub(pattern = "^\\[(.*)\\], \\[.*$",
             replacement = "\\1",
             x = dat$logs)
|
||||
|
||||
# Keep only the capture group: the content of the bracketed field that
# follows the "], [" separator. gsub() is vectorised over `x`, so no
# per-line sapply() is needed.
timestamp <- gsub(pattern = "^\\[.*\\], \\[(.*)\\].*$",
                  replacement = "\\1",
                  x = dat$logs)
|
||||
|
||||
# Keep only the capture group: everything that follows "EyeVisit, " in a log
# line. gsub() is vectorised over `x`, so no per-line sapply() is needed.
action <- gsub(pattern = "^.*EyeVisit, (.*):*.*$",
               replacement = "\\1",
               x = dat$logs)
|
||||
|
||||
# Split each action string on ":" once and reuse the pieces. As used below,
# piece 1 is the event name, piece 2 the topic, piece 3 either the
# comma-separated move values or the card action, and piece 4 the card.
action_parts <- strsplit(action, ":")

# i-th piece of every action string (NA where a string has fewer pieces);
# vapply() guarantees a character vector even for empty input.
nth_part <- function(i) {
  vapply(action_parts, function(x) x[i], character(1))
}

events <- nth_part(1)

topics <- nth_part(2)

third_part <- nth_part(3)

moves <- apply(do.call(rbind, strsplit(third_part, ",")), 2, as.numeric)
# ATTENTION: as.numeric() forces NAs for "OpenCard" and "CloseCard"

# Positions of events whose name contains "Artwork"; computed before `events`
# is relabelled below.
is_artwork <- grep("Artwork", events)

card_action <- trimws(third_part[is_artwork])

card <- as.numeric(nth_part(4))

# Relabel artwork events as "Artwork/<card action>"
events[is_artwork] <- paste("Artwork", card_action, sep = "/")
|
||||
|
||||
ts_elements <- strsplit(timestamp, ":")
|
||||
time_ms <- as.numeric(sapply(ts_elements, function(x) x[4])) +
|
||||
as.numeric(sapply(ts_elements, function(x) x[3])) * 1000 +
|
||||
as.numeric(sapply(ts_elements, function(x) x[2])) * 1000 * 60
|
||||
|
||||
dat$date <- lubridate::parse_date_time(date, "bdyHMSOp")
|
||||
dat$time_ms <- time_ms
|
||||
dat$event <- events
|
||||
dat$artwork <- trimws(sapply(strsplit(topics, "/"), function(x) x[1]))
|
||||
dat$popup <- sapply(strsplit(topics, "/"), function(x) x[2])
|
||||
dat$card <- card
|
||||
dat$x <- moves[,1]
|
||||
dat$y <- moves[,2]
|
||||
dat$scale <- moves[,3]
|
||||
dat$rotation <- moves[,4]
|
||||
|
||||
str(dat)
|
||||
|
||||
head(dat[, 3:ncol(dat)], 20)
|
||||
|
||||
## TODO: Replace artwork and popup numbers with informative strings
|
||||
|
||||
write.table(dat, "../data/rawdata_logfiles.csv",
|
||||
sep = ";", quote = FALSE, row.names = FALSE)
|
||||
|
164
code/02_preprocessing.R
Normal file
164
code/02_preprocessing.R
Normal file
@ -0,0 +1,164 @@
|
||||
#' ---
|
||||
#' title: "Preprocessing log files"
|
||||
#' author: "Nora Wickelmaier"
|
||||
#' date: "`r Sys.Date()`"
|
||||
#' output:
|
||||
#' html_document:
|
||||
#' toc: true
|
||||
#' toc_float: true
|
||||
#' pdf_document:
|
||||
#' toc: true
|
||||
#' number_sections: true
|
||||
#' geometry: margin = 2.5cm
|
||||
#' ---
|
||||
|
||||
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
|
||||
|
||||
# LogEntry classes:
|
||||
# TRANSFORM_START: "Transform start" --> "Transformation Start" in Tool
|
||||
# TRANSFORM_STOP: "Transform stop"
|
||||
# START_APPLICATION: "Start Application"
|
||||
# SHOW_APPLICATION: "Show Application"
|
||||
# SHOW_INFO: "Show Info" --> "Flip Card" in Tool
|
||||
# SHOW_FRONT: "Show Front"
|
||||
# SHOW_POPUP: "ShowPopup" --> "Show Popup" in Tool
|
||||
# HIDE_POPUP: "HidePopup"
|
||||
# ARTWORK: "Artwork" --> "Show Topic" in Tool
|
||||
|
||||
#' # Read data
|
||||
|
||||
dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";", header = TRUE)
|
||||
|
||||
#' # Remove irrelevant events
|
||||
|
||||
#' ## Remove Start Application and Show Application
|
||||
|
||||
dat <- subset(dat0, !(dat0$event %in% c("Start Application", "Show Application")))
|
||||
dat$logs <- NULL # do not need original log files
|
||||
dat$date <- as.POSIXct(dat$date) # create date object
|
||||
|
||||
#' ## Remove "button presses"
|
||||
|
||||
# Sort data frame by artwork and date
|
||||
dat <- dat[order(dat$artwork, dat$date), ]
|
||||
|
||||
# remove "Transform start" and "Transform stop" following directly each
|
||||
# other, since I do not know how to interpret them as events
|
||||
id_start <- which(dat$event == "Transform start")
|
||||
id_stop <- which(dat$event == "Transform stop")
|
||||
|
||||
# Positions whose successor of the same event type sits in the very next row.
# diff() is one element shorter than its input; a logical index of that
# length would be recycled, so the last element would be selected based on
# the first comparison. which() turns the mask into positions and avoids
# the recycling.
id_rm_start <- id_start[which(diff(id_start) == 1)]

id_rm_stop <- id_stop[which(diff(id_stop) == 1)]
|
||||
|
||||
# Drop the flagged rows by keeping the complement of their positions; unlike
# dat[-idx, ], this keeps all rows when nothing was flagged (a negative
# index of length zero selects no rows at all).
dat <- dat[setdiff(seq_len(nrow(dat)), c(id_rm_start, id_rm_stop)), ]
|
||||
rownames(dat) <- NULL
|
||||
|
||||
|
||||
id_start2 <- which(dat$event == "Transform start")
|
||||
id_stop2 <- which(dat$event == "Transform stop")
|
||||
|
||||
length(id_start2) - length(id_stop2)
|
||||
# 340 --> "starts too many"
|
||||
|
||||
# remove "Transform start" and "Transform stop" following directly each
|
||||
# other (but with events in between!)
|
||||
id_start_new <- id_start2
|
||||
id_stop_new <- id_stop2
|
||||
|
||||
for (i in 2:length(id_start_new)) {
|
||||
if (id_start_new[i-1] < id_stop_new[i-1] & id_start_new[i] < id_stop_new[i-1]) {
|
||||
id_start_new <- id_start_new[-(i-1)]
|
||||
} else if (id_start_new[i-1] > id_stop_new[i-1] & id_start_new[i] > id_stop_new[i-1]) {
|
||||
id_stop_new <- id_stop_new[-(i-1)]
|
||||
}
|
||||
}
|
||||
|
||||
length(id_start2) - length(id_start_new)
|
||||
length(id_stop2) - length(id_stop_new)
|
||||
|
||||
ids <- data.frame(start = id_start_new, stop = id_stop_new)
|
||||
ids$diff <- ids$stop - ids$start
|
||||
|
||||
table(ids$diff)
|
||||
|
||||
# remove "Transform start" and "Transform stop" around other events
|
||||
|
||||
id_rm_start2 <- id_start2[!(id_start2 %in% id_start_new)]
|
||||
id_rm_stop2 <- id_stop2[!(id_stop2 %in% id_stop_new)]
|
||||
|
||||
# TODO: It still does not work correctly:
|
||||
dat[64764:64769,]
|
||||
# time_ms event artwork popup x y scale rotation
|
||||
# 64764 473081 Transform start 052 052.xml 1958.65 1505.75 0.8234455 -0.1351998
|
||||
# 64765 474226 Show Info 052 052.xml NA NA NA NA
|
||||
# 64766 475735 Transform start 052 052.xml 1988.25 1625.25 0.9927645 2.4527958
|
||||
# 64767 475739 Transform stop 052 052.xml 1988.25 1625.25 0.9927645 2.4527958
|
||||
# 64768 479326 Artwork 052 052.xml NA NA NA NA
|
||||
# 64769 479751 Transform stop 052 052.xml 1660.90 1883.20 0.8074586 29.0875534
|
||||
|
||||
# --> but no idea how to find these cases in an automated way...
|
||||
|
||||
# Drop the unmatched start/stop rows by keeping the complement of their
# positions; unlike dat[-idx, ], this keeps all rows when nothing was
# flagged (a negative index of length zero selects no rows at all).
dat <- dat[setdiff(seq_len(nrow(dat)), c(id_rm_start2, id_rm_stop2)), ]
|
||||
# --> Every start ends with a stop now (but not necessarily the correct one!)
|
||||
|
||||
|
||||
dat1 <- dat[order(dat$date, dat$time_ms), ]
|
||||
dat1$time_diff <- c(NA, diff(dat1$time_ms))
|
||||
|
||||
boxplot(time_diff ~ as.Date(date), dat1[dat1$time_diff > 1000 & dat1$time_diff < 4000, ])
|
||||
|
||||
boxplot(time_ms ~ event, dat1)
|
||||
|
||||
|
||||
#' ## Plots
|
||||
|
||||
counts <- table(as.Date(dat$date), dat$event)
|
||||
lattice::barchart(counts, auto.key = TRUE)
|
||||
|
||||
|
||||
start_events <- c("Transform start", "Show Info", "ShowPopup", "Artwork/OpenCard")
|
||||
|
||||
counts <- table(as.Date(dat$date[dat$event %in% start_events]),
|
||||
dat$event[dat$event %in% start_events])
|
||||
lattice::barchart(counts, auto.key = TRUE)
|
||||
|
||||
|
||||
# TODO: Do I want to "collapse" the data frame in a way, that I only have
|
||||
# one event for each "set", meaning
|
||||
#
|
||||
# * Transform start + Transform stop --> Transform
|
||||
# * Artwork/OpenCard + Artwork/CloseCard --> Show Subcard
|
||||
# * ShowPopup + HidePopup --> Show Popup
|
||||
# * Show Info + Show Front --> Flip Card
|
||||
# (s.o. ;))
|
||||
#
|
||||
# Then I would have meaningful variables like duration, distance, degree of
|
||||
# rotation, size of scaling, selection of Subcard etc.
|
||||
# This means that I would have to delete all "unclosed" events.
|
||||
|
||||
# Create a data frame with
|
||||
# case event attributes (can differ for different events)
|
||||
# ??
|
||||
# Is `artwork` my case? Or `artwork` per day? Or `artwork` per some other
|
||||
# unit??? Maybe look at differences between timestamps separately for
|
||||
# `artwork`? And identify "new observational unit" this way?
|
||||
#
|
||||
# Definition: (???)
|
||||
# 1. Touching a new `artwork` corresponds to "observational unit change"
|
||||
# 2. Time interval of XX min within one `artwork` on the same day
|
||||
# corresponds to "observational unit change"
|
||||
|
||||
# id activity timestamp
|
||||
|
||||
# Split data frame in list of data frame which all correspond to one
|
||||
# artwork
|
||||
# dat_art <- split(dat, dat$artwork)
|
||||
|
||||
## --> Maybe need it at some point?
|
||||
|
||||
#' # Problems
|
||||
|
||||
#' * Opening and closing of events cannot be identified unambiguously; it
|
||||
#' can happen that the wrong tags have been put together (e.g., Transform
|
||||
#' start and Transform stop); therefore, durations etc. are only heuristic
|
||||
|
99
code/03_specs.R
Normal file
99
code/03_specs.R
Normal file
@ -0,0 +1,99 @@
|
||||
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
|
||||
|
||||
library(lubridate)
|
||||
|
||||
dat <- read.table("../data/rawdata_logfiles.csv", header = TRUE, sep = ";")
|
||||
dat$event <- factor(dat$event, levels = c("Start Application",
|
||||
"Show Application",
|
||||
"Transform start",
|
||||
"Transform stop",
|
||||
"Show Info",
|
||||
"Show Front",
|
||||
"Artwork/OpenCard",
|
||||
"Artwork/CloseCard",
|
||||
"ShowPopup", "HidePopup"))
|
||||
|
||||
|
||||
#dat$logs <- NULL # do not need original log files
|
||||
dat$date <- as.POSIXct(dat$date) # create date object
|
||||
|
||||
str(dat)
|
||||
head(dat)
|
||||
|
||||
#' Log files between December 2016 and December 2018
|
||||
|
||||
range(dat$date)
|
||||
|
||||
#' Number of log entries per year
|
||||
|
||||
table(year(dat$date))
|
||||
|
||||
#' Number of events total and per year
|
||||
|
||||
table(dat$event)
|
||||
# Start Application Show Application
|
||||
# 1679 1656
|
||||
# Transform start Transform stop
|
||||
# 2119815 1944618
|
||||
# Show Info Show Front
|
||||
# 71955 71043
|
||||
# Artwork/OpenCard Artwork/CloseCard
|
||||
# 64990 56750
|
||||
# ShowPopup HidePopup
|
||||
# 44070 43813
|
||||
# --> more "openups" than "closes" (not surprisingly)
|
||||
|
||||
table(dat$event, year(dat$date))
|
||||
|
||||
#' Number of log entries per weekday
|
||||
|
||||
table(weekdays(dat$date))
|
||||
|
||||
#' Number of days
|
||||
|
||||
length(unique(as.Date(dat$date)))
|
||||
|
||||
#' Which artworks are looked at most often
|
||||
|
||||
table(dat$artwork)
|
||||
lattice::barchart(table(dat$artwork))
|
||||
|
||||
|
||||
|
||||
#' ## Plots from Visualization Tool
|
||||
|
||||
dat16 <- dat[year(dat$date) == 2016, ]
|
||||
|
||||
counts <- table(as.Date(dat16$date), dat16$event)
|
||||
lattice::barchart(counts, auto.key = TRUE)
|
||||
|
||||
|
||||
start_events <- c("Transform start", "Show Info", "ShowPopup", "Artwork/OpenCard")
|
||||
|
||||
counts <- table(as.Date(dat16$date[dat16$event %in% start_events]),
|
||||
dat16$event[dat16$event %in% start_events])
|
||||
counts
|
||||
lattice::barchart(counts, auto.key = TRUE)
|
||||
|
||||
|
||||
|
||||
### Example for log file in order to show structure
|
||||
|
||||
write.table(dat[240:660, 3:12], "set.txt", quote = FALSE)
|
||||
# is then edited by hand to have all possible events...
|
||||
|
||||
|
||||
select <- c(243, 244, 245, 246, 253, 254, 255, 256, 257, 259, 260, 262,
|
||||
265, 266, 267, 268, 269, 270, 271, 287, 288, 289, 294, 295,
|
||||
296, 303, 304, 305, 306, 307, 318, 319, 320, 321, 322, 323,
|
||||
324, 325, 326, 357, 358, 359, 360, 361, 362, 363, 408, 409,
|
||||
410, 411, 412, 413, 414, 415, 416, 417, 418, 439, 440, 441,
|
||||
516, 518, 519, 587, 588, 589, 590, 591, 592, 593, 594, 595,
|
||||
652, 653, 654, 655, 656, 657)
|
||||
|
||||
writeLines(dat[select, "logs"], "set_logs.txt")
|
||||
|
||||
|
||||
tab <- table(diff(dat$date)) # uninformative on raw log data!
|
||||
plot(tab[tab > 2000])
|
||||
|
Loading…
Reference in New Issue
Block a user