From e88981e3b96ea77a75e6e79001c273f0837e7111 Mon Sep 17 00:00:00 2001
From: nwickel <n.wickelmaier@iwm-tuebingen.de>
Date: Thu, 20 Jul 2023 17:06:28 +0200
Subject: [PATCH] Started seriously working on data preprocessing; very
 intermediate version though

---
 code/01_parse-logfiles.R |   5 +
 code/01b_investigate.R   | 224 +++++++++++++++++++++++++++++++++++++++
 code/02_preprocessing.R  | 135 ++++++++++++++++++++++-
 3 files changed, 359 insertions(+), 5 deletions(-)
 create mode 100644 code/01b_investigate.R

diff --git a/code/01_parse-logfiles.R b/code/01_parse-logfiles.R
index 9084333..24d7b11 100644
--- a/code/01_parse-logfiles.R
+++ b/code/01_parse-logfiles.R
@@ -37,6 +37,7 @@ knitr::opts_chunk$set(warning = FALSE, message = FALSE)
 #' Choose which folders with raw log files should be included:
 
 folders <- "all"
+#folders <- "_2016b"
 
 dirpaths <- paste0("../data/haum_logs_2016-2023/", folders)
 
@@ -129,6 +130,10 @@ str(dat)
 
 head(dat[, 2:ncol(dat)], 20)
 
+# sort by date, since sorting by file names does not make sense because of
+# missing left zero padding
+dat <- dat[order(dat$date), ]
+
 ## TODO: Replace artwork and popup numbers with informative strings
 
 write.table(dat, "../data/rawdata_logfiles.csv",
diff --git a/code/01b_investigate.R b/code/01b_investigate.R
new file mode 100644
index 0000000..166fbc3
--- /dev/null
+++ b/code/01b_investigate.R
@@ -0,0 +1,224 @@
+#' ---
+#' title: "Preprocessing log files"
+#' author: "Nora Wickelmaier"
+#' date: "`r Sys.Date()`"
+#' output:
+#'   html_document:
+#'     toc: true
+#'     toc_float: true
+#'   pdf_document:
+#'     toc: true
+#'     number_sections: true
+#' geometry: margin = 2.5cm
+#' ---
+
+# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
+
+# LogEntry classes:
+#   TRANSFORM_START:    "Transform start" --> "Transformation Start" in Tool
+#   TRANSFORM_STOP:     "Transform stop"
+#   START_APPLICATION:  "Start Application"
+#   SHOW_APPLICATION:   "Show Application"
+#   SHOW_INFO:          "Show Info"       --> "Flip Card" in Tool
+#   SHOW_FRONT:         "Show Front"
+#   SHOW_POPUP:         "ShowPopup"       --> "Show Popup" in Tool
+#   HIDE_POPUP:         "HidePopup"
+#   ARTWORK:            "Artwork"         --> "Show Topic" in Tool
+
+#' # Read data
+
+dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";", header = TRUE)
+dat0$date <- as.POSIXct(dat0$date)  # create date object
+
+plot(dat0$time_ms[1:3000], type = "l")
+
+# what happens here? Why does `time_ms` go down, but not to 0?
+plot(dat0$time_ms[2500:3000], type = "l")
+plot(dat0$time_ms[2755:2765], type = "l")  # "zoom in"
+dat0[2755:2765, ]
+# --> overall time stamp keeps going up...
+
+# TODO: How to create a plot that gives the same information based on
+# `time_ms` and `date`??
+plot(time_ms ~ date, dat0[1:5000, ], type = "b")
+abline(h = 0, col = "red", lty = 3)
+# Visualize night
+plot(time_ms ~ date, dat0[1:10000, ], type = "b")
+
+
+# Not all `Start Application` have `time_ms = 0` - why??
+
+dat0[125537:125542, ]
+dat0[6673501:6673510, ]
+# --> What's happening here?
+
+table(dat0[dat0$event %in% "Start Application", c("event", "date", "time_ms")]$time_ms)
+#    0    1   15   16  296 2819 2914 3191 5316 6535
+# 3131    4   21   48    1    1    1    1    1    1
+# --> ???
+dat0[dat0$event == "Start Application" & dat0$time_ms == 6535, ]
+dat0[989313:989317, ]
+
+dat0[dat0$event == "Start Application" & dat0$time_ms == 5316, ]
+dat0[2071078:2071082, ]
+
+dat0[dat0$event == "Start Application" & dat0$time_ms == 3191, ]
+dat0[2851863:2851867, ]
+
+dat0[dat0$event == "Start Application" & dat0$time_ms == 16, ]
+dat0[156382:156386, ]
+dat0[5566940:5566947, ]
+# --> pattern is *not* consistent
+
+dat0[dat0$event == "Start Application" & dat0$time_ms == 1, ]
+dat0[125537:125542, ]
+
+
+xtabs( ~ event + as.Date(date), dat0[1:1000, ])
+
+# How many days do we have with up to 8 "Start Applications"
+table(xtabs( ~ event + as.Date(date), dat0[dat0$event == "Start Application", ]))
+#   1   2   3   4   5   6   7   8
+# 381 657 272  86  37  14  10   2
+# --> 8 days without any "Start Application"
+length(unique(as.Date(dat0$date))) -
+  length(xtabs( ~ event + as.Date(date), dat0[dat0$event == "Start Application", ]))
+
+# But only 6 files with 2 "Start Applications"
+table(xtabs( ~ event + fileid, dat0[dat0$event == "Start Application", ]))
+#    1    2
+# 3198    6
+# --> That means we have 36,563 file ids without any "Start Application"
+
+
+#' # Remove irrelevant events
+
+#' ## Remove Start Application and Show Application
+
+dat <- subset(dat0, !(dat0$event %in% c("Start Application", "Show Application")))
+
+#' ## Remove "button presses"
+
+# Sort data frame by artwork and date
+dat <- dat[order(dat$artwork, dat$date), ]
+
+# Drop the first row of each directly adjacent start/start or stop/stop
+# pair; which() avoids index recycling, setdiff() survives "nothing to drop"
+id_start <- which(dat$event == "Transform start")
+id_stop  <- which(dat$event == "Transform stop")
+
+id_rm_start <- id_start[which(diff(id_start) == 1)]  # which(): no recycling
+id_rm_stop <- id_stop[which(diff(id_stop) == 1)]
+
+dat <- dat[setdiff(seq_len(nrow(dat)), c(id_rm_start, id_rm_stop)), ]
+rownames(dat) <- NULL
+
+
+id_start2 <- which(dat$event == "Transform start")
+id_stop2  <- which(dat$event == "Transform stop")
+
+length(id_start2) - length(id_stop2)
+# 340 --> 340 more starts than stops
+
+# Align the i-th start with the i-th stop: drop a start that is followed by
+# another start before its stop, or a stop lying before both adjacent starts
+id_start_new <- id_start2
+id_stop_new <- id_stop2
+
+for (i in 2:length(id_start_new)) {  # NOTE(review): range is fixed up front, vectors shrink below
+  if (id_start_new[i-1] < id_stop_new[i-1] & id_start_new[i] < id_stop_new[i-1]) {
+    id_start_new <- id_start_new[-(i-1)]  # two starts precede stop i-1
+  } else if (id_start_new[i-1] > id_stop_new[i-1] & id_start_new[i] > id_stop_new[i-1]) {
+    id_stop_new <- id_stop_new[-(i-1)]    # stop i-1 precedes both starts
+  }
+}
+
+length(id_start2) - length(id_start_new)
+length(id_stop2) - length(id_stop_new)
+
+ids <- data.frame(start = id_start_new, stop = id_stop_new)
+ids$diff <- ids$stop - ids$start
+
+table(ids$diff)
+
+# remove "Transform start" and "Transform stop" around other events
+
+id_rm_start2 <- id_start2[!(id_start2 %in% id_start_new)]
+id_rm_stop2 <- id_stop2[!(id_stop2 %in% id_stop_new)]
+
+# TODO: It still does not work correctly:
+dat[64764:64769,]
+#        time_ms           event artwork   popup       x       y     scale   rotation
+# 64764   473081 Transform start     052 052.xml 1958.65 1505.75 0.8234455 -0.1351998
+# 64765   474226       Show Info     052 052.xml      NA      NA        NA         NA
+# 64766   475735 Transform start     052 052.xml 1988.25 1625.25 0.9927645  2.4527958
+# 64767   475739  Transform stop     052 052.xml 1988.25 1625.25 0.9927645  2.4527958
+# 64768   479326         Artwork     052 052.xml      NA      NA        NA         NA
+# 64769   479751  Transform stop     052 052.xml 1660.90 1883.20 0.8074586 29.0875534
+
+# --> but no idea how to find these cases in an automated way...
+
+dat <- dat[setdiff(seq_len(nrow(dat)), c(id_rm_start2, id_rm_stop2)), ]  # ok if none
+# --> Every start ends with a stop now (but not necessarily the correct one!)
+
+
+dat1 <- dat[order(dat$date, dat$time_ms), ]
+dat1$time_diff <- c(NA, diff(dat1$time_ms))
+
+boxplot(time_diff ~ as.Date(date), dat1[dat1$time_diff > 1000 & dat1$time_diff < 4000, ])
+
+boxplot(time_ms ~ event, dat1)
+
+
+#' ## Plots
+
+counts <- table(as.Date(dat$date), dat$event)
+lattice::barchart(counts, auto.key = TRUE)
+
+
+start_events <- c("Transform start", "Show Info", "ShowPopup", "Artwork/OpenCard")
+
+counts <- table(as.Date(dat$date[dat$event %in% start_events]),
+                dat$event[dat$event %in% start_events])
+lattice::barchart(counts, auto.key = TRUE)
+
+
+# TODO: Do I want to "collapse" the data frame in a way, that I only have
+# one event for each "set", meaning
+#
+# * Transform start   + Transform stop     --> Transform
+# * Artwork/OpenCard  + Artwork/CloseCard  --> Show Subcard
+# * ShowPopup         + HidePopup          --> Show Popup
+# * Show Info         + Show Front         --> Flip Card
+# (see above ;))
+#
+# Then I would have meaningful variables like duration, distance, degree of
+# rotation, size of scaling, selection of Subcard etc.
+# This means that I would have to delete all "unclosed" events.
+
+# Create a data frame with
+# case    event     attributes (can differ for different events)
+# ??
+# Is `artwork` my case? Or `artwork` per day? Or `artwork` per some other
+# unit??? Maybe look at differences between timestamps separately for
+# `artwork`? And identify "new observational unit" this way?
+#
+# Definition: (???)
+# 1. Touching a new `artwork` corresponds to "observational unit change"
+# 2. Time interval of XX min within one `artwork` on the same day
+#    corresponds to "observational unit change"
+
+# id    activity    timestamp
+
+# Split data frame in list of data frame which all correspond to one
+# artwork
+# dat_art <- split(dat, dat$artwork)
+
+## --> Maybe need it at some point?
+
+#' # Problems
+
+#' * Opening and closing of events cannot be identified unambiguously; it
+#'   can happen that the wrong tags have been put together (e.g., Transform
+#'   start and Transform stop); therefore, durations etc. are only heuristic
+
diff --git a/code/02_preprocessing.R b/code/02_preprocessing.R
index 888ff86..6264eb6 100644
--- a/code/02_preprocessing.R
+++ b/code/02_preprocessing.R
@@ -2,7 +2,7 @@
 #' title: "Preprocessing log files"
 #' author: "Nora Wickelmaier"
 #' date: "`r Sys.Date()`"
-#' output: 
+#' output:
 #'   html_document:
 #'     toc: true
 #'     toc_float: true
@@ -27,15 +27,140 @@
 
 #' # Read data
 
-dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";", header = TRUE)
+dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";",
+                   header = TRUE)
+dat0$date <- as.POSIXct(dat0$date)  # create date object
+
+# TODO: Add a case identifier based on timestamps -- needs to be done on
+# "raw data". Is it possible? Something seems seriously wrong with
+# `time_ms`
 
 #' # Remove irrelevant events
 
 #' ## Remove Start Application and Show Application
 
-dat <- subset(dat0, !(dat0$event %in% c("Start Application", "Show Application")))
-dat$logs <- NULL                  # do not need original log files
-dat$date <- as.POSIXct(dat$date)  # create date object
+dat <- subset(dat0, !(dat0$event %in% c("Start Application",
+                                        "Show Application")))
+
+#' # Close events
+
+#' Do it for Transform events first
+tmp <- dat[dat$event %in% c("Transform start", "Transform stop"), ]
+tmp <- tmp[order(tmp$artwork, tmp$date), ]
+rownames(tmp) <- NULL
+
+# Count rows per group: all rows up to and including each "Transform stop"
+num_start <- diff(c(0, which(tmp$event == "Transform stop")))
+tmp$eventid <- rep(seq_along(num_start), num_start)
+head(tmp[, c("event", "eventid")], 25)
+
+table(table(tmp$eventid))
+#   1     2     3     4     5     6     7     8    10    11
+#  73 78429  5156   842   222    66    18    14     3     1
+# --> compare to table(num_start)!
+
+# Count rows per group: all rows up to and including each "Transform start"
+num_stop <- c(diff(c(0, which(tmp$event == "Transform start"))))
+table(num_stop)
+
+tmp$eventrep <- rep(num_start, num_start)
+tmp$dupl <- duplicated(tmp[, c("event", "eventid")])                    # keep first
+tmp$dupl <- duplicated(tmp[, c("event", "eventid")], fromLast = TRUE)   # keep last
+tmp[tmp$eventrep == 10, ]
+
+tmp$dupl <- NULL
+tmp$eventrep <- NULL
+
+
+# remove duplicated "Transform start" events
+tmp <- tmp[!duplicated(tmp[, c("event", "eventid")]), ]
+
+# remove duplicated "Transform stop" events (second of each adjacent pair)
+id_stop  <- which(tmp$event == "Transform stop")
+id_rm_stop <- id_stop[which(diff(id_stop) == 1)]  # which(): no recycling
+# setdiff() keeps all rows when nothing is flagged (`-numeric(0)` keeps none)
+tmp <- tmp[setdiff(seq_len(nrow(tmp)), id_rm_stop + 1), ]
+
+# transform to wide data format
+tmp$event <- ifelse(tmp$event == "Transform start", "start", "stop")
+
+trans_wide <- reshape(tmp, direction = "wide",
+                      idvar = c("eventid", "artwork"),
+                      timevar = "event", drop = c("fileid", "popup", "card")
+)
+
+rownames(trans_wide) <- NULL
+# --> when fileid is part of the reshape, it does not work correctly, since
+# we sometimes have a start - stop event that is recorded in two separate
+# log files
+# TODO: This runs for quite some time. Is this more efficient with dplyr?
+
+# which(is.na(trans_wide$date.start))
+
+trans_wide$duration <- trans_wide$time_ms.stop - trans_wide$time_ms.start
+trans_wide$distance <- sqrt((trans_wide$x.stop - trans_wide$x.start)^2 +
+  (trans_wide$y.stop - trans_wide$y.start)^2)  # Euclidean, start --> stop
+trans_wide$rotationDegree <- trans_wide$rotation.stop - trans_wide$rotation.start
+trans_wide$scaleSize <- trans_wide$scale.stop - trans_wide$scale.start
+
+trans_wide <- trans_wide[trans_wide$distance != 0 &        # keeps only rows
+                         trans_wide$rotationDegree != 0 &  # where all three
+                         trans_wide$scaleSize != 0, ]      # changes are != 0
+# removes almost 2/3 of the data (for small data set); TODO confirm `&` vs `|`
+
+
+# TODO: How do I handle popups from glossar???
+
+
+
+
+
+
+# Should every "Show front" be the beginning of a new trace?
+# Should Transform events be handled separately and then be "added" again
+# by timestamp?
+
+########
+tmp <- dat[!dat$event %in% c("Transform start", "Transform stop"), ]
+rownames(tmp) <- NULL
+
+# Label trace boundaries. For every row the former nested loop compared
+# the row's own event (carried in `last_event`) while visiting each
+# artwork in turn, so its net effect was a plain recode of `event`:
+#   "Show Info"  --> "start"
+#   "Show Front" --> "stop"
+# The recode is done directly here; this avoids the artworks x rows
+# iteration and the NA that `tmp$event[i + 1]` produced past the last row
+# (which made `if` fail when the first row belonged to "glossar").
+tmp$trace <- NA
+last_event <- tmp$event[nrow(tmp) + 1]  # NA, as the former loop left it
+
+tmp$trace[tmp$event %in% "Show Info"] <- "start"
+tmp$trace[tmp$event %in% "Show Front"] <- "stop"
+
+# NOTE: rows whose `artwork` is NA are recoded as well; the loop stopped
+# with an error on them.
+
+
+head(tmp[4:ncol(tmp)], 50)
+# TODO: This only renames "Show Info" / "Show Front" to "start" / "stop";
+# real trace detection across artworks (and "glossar" popups) is still
+# missing.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 
 #' ## Remove "button presses"