Worked on preprocessing; added trace variable; tried out modeling; prepared questions for PM

2023-08-14 16:57:03 +02:00
parent 9e3783cf1f
commit c63c0a8206
7 changed files with 683 additions and 49 deletions
@@ -0,0 +1,103 @@
+# Offene Fragen
+
+## Datenverständnis
+
+* Welche Einheit haben x und y? Pixel?
+* Welche Einheit hat scale?
+* rotation wirklich degree?
+* Nach welchem Zeitintervall resettet sich der Tisch wieder in die
+  Ausgangskonfiguration?
+
+## Tisch-Software
+
+* Gibt es Doku für die Bilder, die über die xml files hinausgeht? Sowas wie
+  ein Manual oder ähnliches?
+* Gibt es evtl. irgendwo noch ein Tablet mit der Anwendung drauf?
+* Was bedeuten die Farben der Topic Cards?
+
+## Event Logs
+
+* Wie gehen wir mit "nicht geschlossenen" Events um? Einfach rauslöschen?
+  - für Transform tendiere ich zu ja, weil sonst total uninteressant
+  - bei flipCard bin ich nicht so sicher... Aber man kann dann keine
+    duration berechnen, wäre NA
+* Moves/scales/rotations ohne Veränderung würde ich auf jeden Fall
+  rauslöschen
+* Es ist nicht möglich (bzw. ich weiß nicht wie) zusammengehörige Events
+  eineindeutig zu identifizieren
+  - nach Heuristik vorgehen? Doppelte Transformation start und stop einfach
+    raus?
+  - Daten sind nicht "fehlerfrei"; es gibt z.B. Transformation-Events wo
+    das Ende nicht geloggt wurde
+* Wie identifiziere ich eine "Interaktionseinheit"?
+  - Was ist ein "case"?
+  - Eher grob über Zeitintervalle?
+  - Noch irgendeine andere Idee?
+* Herausfinden, ob mehr als eine Person am Tisch steht?
+  - Sliding window, in der Anzahl von Artworks gezählt wird? Oder wie weit
+    angefasste Artworks voneinander entfernt sind?
+  - Man kann sowas schon "sehen" in den Logs - aber wie kann ich es
+    automatisiert rausziehen? Was ist meine Definition von
+    "Interaktionsboost"?
+  - Egal wie wir es machen, geht es auf den "Event-Log-Daten"?
+* Anreicherung der Log-Daten mit weiteren Metadaten? Was wäre interessant?
+  - Metadata on artworks like, name, artist, type of artwork, epoch, etc.
+ - School vacations and holidays
+ - Special exhibits at the museum
+ - Number of visitors per day
+ - Age structure of visitors per day?
+  - ... ????
+
+## HAUM
+
+* Bei Sven noch mal nachhaken wegen Besucherzahlen?
+
+
+
+
+# Problems and how I handled them
+
+This lists some problems with the log data that required decisions. These
+decisions influence the outcome and maybe even the data quality. Hence, I
+tried to document how I handled these problems and explain the decisions I
+made.
+
+## Weird behavior of `time_ms` and neg. `duration`values
+
+
+## Events that only close (`date.start` is NA)
+
+
+## Timestamps repeat
+
+
+
+## Popups from glossar cannot be assigned to a specific artwork
+
+
+## Assign a case variable based on "time heuristic"
+
+## A `move`event does not record any change
+
+## Add moves to `trace` variable
+
+
+
+
+# Reading list
+
+* @Arizmendi2022 [$-$]
+* @Bannert2014 [x]
+* @Bousbia2010 [$-$]
+* @Cerezo2020
+* @GerjetsSchwan2021 [x]
+* @Goldhammer2020
+* @Guenther2007
+* @HuberBannert2023 [x]
+* @Kroehne2018
+* @SchwanGerjets2021 [x]
+* @vanderAalst2016 [Chap. 2, x]
+* @vanderAalst2016 [Chap. 3]
+* @vanderAalst2016 [Chap. 5, x]
+* @Wang2019
+
@@ -4,8 +4,7 @@
 #' date: "`r Sys.Date()`"
 #' output: 
 #'   html_document:
-#'     toc: true
-#'     toc_float: true
+#'     default
 #'   pdf_document:
 #'     toc: true
 #'     number_sections: true
@@ -17,8 +16,6 @@
 #+ setup, include = FALSE
 knitr::opts_chunk$set(warning = FALSE, message = FALSE) 

-#' # Preprocessing raw log files into data frame
-
 #' The following events can be extracted from the log files:
 #' 
 #' ```
@@ -36,8 +33,8 @@ knitr::opts_chunk$set(warning = FALSE, message = FALSE)

 #' Choose which folders with raw log files should be included:

-folders <- "all"
-#folders <- "_2016b"
+#folders <- "all"
+folders <- "_2016b"

 dirpaths <- paste0("../data/haum_logs_2016-2023/", folders)

@@ -71,7 +68,7 @@ dat <- subset(dat, dat$logs != "")
 d2 <- dim(dat)[1]

 #' The files contain `r d1-d2` corrupt lines that were remooved from the data.
-
+#'

 #' ### Extract relevant infos

@@ -136,6 +133,8 @@ dat <- dat[order(dat$date), ]

 ## TODO: Replace artwork and popup numbers with informative strings

-write.table(dat, "../data/rawdata_logfiles.csv",
+#' ### Save data frame
+
+write.table(dat, "../data/rawdata_logfiles_small.csv",
            sep = ";", quote = FALSE, row.names = FALSE)

@@ -222,3 +222,18 @@ lattice::barchart(counts, auto.key = TRUE)
 #'   can happen that the wrong tags have been put together (e.g., Transform
 #'   start and Transform stop); therefore, durations etc. are only heuristic

+
+
+#' ## Plots
+
+counts <- table(as.Date(dat$date), dat$event)
+lattice::barchart(counts, auto.key = TRUE)
+
+
+start_events <- c("Transform start", "Show Info", "ShowPopup", "Artwork/OpenCard")
+
+counts <- table(as.Date(dat$date[dat$event %in% start_events]),
+                dat$event[dat$event %in% start_events])
+lattice::barchart(counts, auto.key = TRUE)
+
+
@@ -109,11 +109,16 @@ trans_wide$rotationDegree <- trans_wide$rotation.stop -
  trans_wide$rotation.start
 trans_wide$scaleSize <- trans_wide$scale.stop - trans_wide$scale.start

-dat_trans <- dat_trans[trans_wide$distance != 0 &
+trans_wide$trace <- NA
+trans_wide$card  <- NA
+trans_wide$popup <- NA
+
+dat_trans <- trans_wide[trans_wide$distance != 0 &
                        trans_wide$rotationDegree != 0 &
                        trans_wide$scaleSize != 0,
-                      c("event", "artwork", "date.start", "date.stop",
+                      c("event", "artwork", "trace", "date.start", "date.stop",
                        "time_ms.start", "time_ms.stop", "duration",
+                        "card", "popup",
                        "x.start", "y.start", "x.stop", "y.stop",
                        "distance", "scale.start", "scale.stop",
                        "scaleSize", "rotation.start", "rotation.stop",
@@ -128,7 +133,8 @@ summary(dat_trans)
 # --> Hat er eine Erklärung dafür?

 #plot(time_ms.stop ~ time_ms.start, dat_trans, type = "b")
-plot(time_ms.stop ~ time_ms.start, dat_trans, col = rgb(red = 0, green = 0, blue = 0, alpha = 0.2))
+plot(time_ms.stop ~ time_ms.start, dat_trans, 
+     col = rgb(red = 0, green = 0, blue = 0, alpha = 0.2))

 plot(date.stop ~ date.start, dat_trans[1:1000,], type = "b")

@@ -248,7 +254,7 @@ lut[sapply(lut$artwork, length) == 1, "glossar_file"]
 # TODO: Fill in the ones that are associated with one artwork
 # --> Can't come up with something -- maybe ask AK???

-# TODO: How to check if one of the former "Show Infos" is correct on
+# TODO: How to check if one of the former "Show Infos" is correct
 # --> Can't come up with something -- maybe ask AK???

 # for (file in lut$glossar_file) {
@@ -285,6 +291,8 @@ lut[sapply(lut$artwork, length) == 1, "glossar_file"]
 # correct: 17940
 # incorrect: 17963

+# TODO: "glossar" entry should be changed to the corresponding artwork
+# TODO: Add additional variable `glossar` with 0/1 or similar instead

 # TODO: For now: Exclude not matched glossar entries

@@ -314,10 +322,28 @@ flipCard_wide$duration <- flipCard_wide$time_ms.stop -
 flipCard_wide$duration <- ifelse(flipCard_wide$duration < 0,
                                 NA, flipCard_wide$duration)

+flipCard_wide$card            <- NA
+flipCard_wide$popup           <- NA
+flipCard_wide$x.start         <- NA
+flipCard_wide$x.stop          <- NA
+flipCard_wide$y.start         <- NA
+flipCard_wide$y.stop          <- NA
+flipCard_wide$distance        <- NA
+flipCard_wide$scale.start     <- NA
+flipCard_wide$scale.stop      <- NA
+flipCard_wide$scaleSize       <- NA
+flipCard_wide$rotation.start  <- NA
+flipCard_wide$rotation.stop   <- NA
+flipCard_wide$rotationDegree  <- NA
+
 dat_flipCard <- flipCard_wide[, c("event", "artwork", "trace",
                                 "date.start", "date.stop",
                                 "time_ms.start", "time_ms.stop",
-                                 "duration")]
+                                 "duration", "card", "popup",
+                                 "x.start", "y.start", "x.stop", "y.stop",
+                                 "distance", "scale.start", "scale.stop",
+                                 "scaleSize", "rotation.start",
+                                 "rotation.stop", "rotationDegree")]

 rm(tmp, flipCard_wide)

@@ -344,10 +370,27 @@ openTopic_wide$duration <- ifelse(openTopic_wide$duration < 0,
 # TODO: How to handle duration < 0
 # --> Replace with NA for now...

-dat_openTopic <- openTopic_wide[, c("event", "artwork", "card", "trace",
+openTopic_wide$popup           <- NA
+openTopic_wide$x.start         <- NA
+openTopic_wide$x.stop          <- NA
+openTopic_wide$y.start         <- NA
+openTopic_wide$y.stop          <- NA
+openTopic_wide$distance        <- NA
+openTopic_wide$scale.start     <- NA
+openTopic_wide$scale.stop      <- NA
+openTopic_wide$scaleSize       <- NA
+openTopic_wide$rotation.start  <- NA
+openTopic_wide$rotation.stop   <- NA
+openTopic_wide$rotationDegree  <- NA
+
+dat_openTopic <- openTopic_wide[, c("event", "artwork", "trace",
                                   "date.start", "date.stop",
                                   "time_ms.start", "time_ms.stop",
-                                   "duration")]
+                                   "duration", "card", "popup", "x.start",
+                                   "y.start", "x.stop", "y.stop",
+                                   "distance", "scale.start", "scale.stop",
+                                   "scaleSize", "rotation.start",
+                                   "rotation.stop", "rotationDegree")]
 # TODO: card should have a unique identifier for each artwork

 rm(openTopic_wide, num_start, tmp)
@@ -385,29 +428,47 @@ openPopup_wide$duration <- ifelse(openPopup_wide$duration < 0,
 # TODO: How to handle duration < 0
 # --> Replace with NA for now...

-dat_openPopup <- openPopup_wide[, c("event", "artwork", "popup", "trace",
+openPopup_wide$card            <- NA
+openPopup_wide$x.start         <- NA
+openPopup_wide$x.stop          <- NA
+openPopup_wide$y.start         <- NA
+openPopup_wide$y.stop          <- NA
+openPopup_wide$distance        <- NA
+openPopup_wide$scale.start     <- NA
+openPopup_wide$scale.stop      <- NA
+openPopup_wide$scaleSize       <- NA
+openPopup_wide$rotation.start  <- NA
+openPopup_wide$rotation.stop   <- NA
+openPopup_wide$rotationDegree  <- NA
+
+dat_openPopup <- openPopup_wide[, c("event", "artwork", "trace",
                                   "date.start", "date.stop",
                                   "time_ms.start", "time_ms.stop",
-                                   "duration")]
+                                   "duration", "card", "popup", "x.start",
+                                   "y.start", "x.stop", "y.stop",
+                                   "distance", "scale.start", "scale.stop",
+                                   "scaleSize", "rotation.start",
+                                   "rotation.stop", "rotationDegree")]
 rm(num_start, openPopup_wide, tmp)


 # Merge all
-system.time({
-dat_all <- merge(dat_trans, dat_flipCard, all = TRUE)
-dat_all <- merge(dat_all, dat_openTopic, all = TRUE)
-dat_all <- merge(dat_all, dat_openPopup, all = TRUE)
-})
-
-# check
-nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) +
-                  nrow(dat_openTopic) + nrow(dat_openPopup))
-
-dat_all <- dat_all[order(dat_all$date.start), ]
-rownames(dat_all) <- NULL
+# system.time({
+# dat_all <- merge(dat_trans, dat_flipCard, all = TRUE)
+# dat_all <- merge(dat_all, dat_openTopic, all = TRUE)
+# dat_all <- merge(dat_all, dat_openPopup, all = TRUE)
+# })
+# 
+# # check
+# nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) +
+#                   nrow(dat_openTopic) + nrow(dat_openPopup))
+# 
+# dat_all <- dat_all[order(dat_all$date.start), ]
+# rownames(dat_all) <- NULL
+# 

 # TODO: from here on NA... WHY??
-dat_all[19426:19435, ]
+# dat_all[19426:19435, ]

 # TODO: Should card maybe also be filled in for "openPopup"?

@@ -423,19 +484,135 @@ dat_all[19426:19435, ]
 # TODO: --> same result - but faster. Need it?
 # --> Would hate to depend on dplyr...

+#' ## Use `rbind()` instead...
+# --> unbeatable in terms of time!
+
+dat_all <- rbind(dat_trans, dat_flipCard, dat_openTopic, dat_openPopup)
+
+# check
+nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) +
+                  nrow(dat_openTopic) + nrow(dat_openPopup))
+
+# remove all events that do not have a `date.start`
+dat_all <- dat_all[!is.na(dat_all$date.start), ]
+# TODO: Find out how it can be that there is only a `date.stop`
+
+# sort by `start.date`
+dat_all <- dat_all[order(dat_all$date.start), ]
+rownames(dat_all) <- NULL
+
+ind <- rowSums(is.na(dat_all)) == ncol(dat_all)
+any(ind)
+dat_all[ind, ]
+# --> No rows with only NA, as it should be.
+
+summary(dat_all)    # OK, this actually makes a lot of sense :)
+
+#' ## Create case variable
+
+#dat_all$timediff <- as.numeric(dat_all$date.stop - dat_all$date.start)
+
+dat_all$timediff <- as.numeric(diff(c(dat_all$date.start[1], dat_all$date.start)))
+
+hist(dat_all$timediff[dat_all$timediff < 40], breaks = 50)


-#' ## Plots
+# TODO: What is the best choice for the cutoff here? I took 20 secs for now
+dat_all$case <- NA
+j <- 1

-counts <- table(as.Date(dat$date), dat$event)
-lattice::barchart(counts, auto.key = TRUE)
+for (i in seq_len(nrow(dat_all))) {
+  if (dat_all$timediff[i] < 21) {
+    dat_all$case[i] <- j
+  } else {
+    j <- j + 1
+    dat_all$case[i] <- j
+  }
+}

+head(dat_all[, c("event", "artwork", "trace", "date.start", "timediff", "case")], 100)

-start_events <- c("Transform start", "Show Info", "ShowPopup", "Artwork/OpenCard")
+#' ## Add event ID

-counts <- table(as.Date(dat$date[dat$event %in% start_events]),
-                dat$event[dat$event %in% start_events])
-lattice::barchart(counts, auto.key = TRUE)
+dat_all$eventid <- seq_len(nrow(dat_all))
+
+dat_all <- dat_all[, c("eventid", "case", "trace", "event", "artwork",
+                       "date.start", "date.stop", "time_ms.start",
+                       "time_ms.stop", "duration", "card", "popup",
+                       "x.start", "y.start", "x.stop", "y.stop",
+                       "distance", "scale.start", "scale.stop",
+                       "scaleSize", "rotation.start", "rotation.stop",
+                       "rotationDegree")]
+
+#' ## Add `trace` numbers for `move` events
+
+# when case and artwork are identical and there is only 1 trace value
+# --> assign it to all `move` events for that case and artwork
+# when case and artwork are identical and there is more than 1 trace value
+# --> assign the `trace` value that was right before this `move` event
+# (could, of course, also be after)
+
+cases <- unique(dat_all$case)
+aws <- unique(dat_all$artwork)[unique(dat_all$artwork) != "glossar"]
+max_trace <- max(dat_all$trace, na.rm = TRUE) + 1
+out <- NULL
+
+for (case in cases) {
+  for (art in aws) {
+    tmp <- dat_all[dat_all$case == case & dat_all$artwork == art, ]
+    if (nrow(tmp) != 0) {
+
+      if (length(na.omit(unique(tmp$trace))) == 1) {
+        tmp[tmp$event == "move", "trace"] <- na.omit(unique(tmp$trace))
+      } else if (length(na.omit(unique(tmp$trace))) > 1) {
+        for (i in 1:nrow(tmp)) {
+          if (tmp$event[i] == "move") {
+            if (i == 1) {
+              tmp$trace[i] <- na.omit(unique(tmp$trace))[1]
+            } else {
+              tmp$trace[i] <- tmp$trace[i - 1]
+            }
+          }
+        }
+      } else if (all(is.na(tmp$trace))) {
+        for (i in 1:nrow(tmp)) {
+          if (tmp$event[i] == "move") {
+            tmp$trace[i] <- max_trace
+          }
+        }
+      }
+      max_trace <- max_trace + 1
+    }
+    if (nrow(tmp) > 0) {
+      #print(tmp[, c("case", "event", "trace", "artwork")])
+      out <- rbind(out, tmp)
+    }
+  }
+}
+# TODO: Get rid of the loops
+# --> This takes forever...
+
+#head(out[, c("time_ms.start", "case", "trace", "event", "artwork")], 55)
+
+#head(dat_all[dat_all$artwork %in% "501", c("time_ms.start", "case", "trace", "event", "artwork")], 50)
+
+# identical(dat_all[which(!dat_all$eventid %in% out$eventid), ],
+#           dat_all[dat_all$artwork == "glossar", ])
+# --> TRUE
+
+# put glossar events back in
+
+dat_all <- rbind(out, dat_all[dat_all$artwork == "glossar", ])
+dat_all <- dat_all[order(dat_all$date.start), ]
+rownames(dat_all) <- NULL
+
+# Make `trace` a consecutive number
+dat_all$trace <- as.numeric(as.factor(dat_all$trace))
+
+#' # Export data
+
+write.table(dat_all, "../data/event_logfiles.csv",
+            sep = ";", quote = FALSE, row.names = FALSE)


 # Is `artwork` my case? Or `artwork` per day? Or `artwork` per some other
@@ -451,16 +628,5 @@ lattice::barchart(counts, auto.key = TRUE)
 # artwork
 # dat_art <- split(dat, dat$artwork)

-## --> Maybe need it at some point?
-
-#' # Problems
-
-#' * Opening and closing of events cannot be identified unambiguously; it
-#'   can happen that the wrong tags have been put together (e.g., Transform
-#'   start and Transform stop); therefore, durations etc. are only heuristic
-
-# TODO: Add a case identifier based on timestamps
-# --> needs to be done on "raw data". Is it possible? Something seems
-# seriously wrong with `time_ms`
-
 # TODO: Write function for closing events
+
@@ -0,0 +1,115 @@
+#' ---
+#' title: "Modelling log files with Process Mining"
+#' author: "Nora Wickelmaier"
+#' date: "`r Sys.Date()`"
+#' output:
+#'   html_document:
+#'     toc: true
+#'     toc_float: true
+#'   pdf_document:
+#'     toc: true
+#'     number_sections: true
+#' geometry: margin = 2.5cm
+#' ---
+
+# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
+
+#' # Read data
+
+dat <- read.table("../data/event_logfiles.csv", sep = ";", header = TRUE)
+dat$date.start <- as.POSIXct(dat$date.start)
+dat$date.stop <- as.POSIXct(dat$date.stop)
+
+#' # Creating event logs
+
+library(bupaverse)
+
+names(dat)[6:7] <- c("start", "complete")
+
+table(table(dat$start))
+# --> hmm...
+
+summary(aggregate(duration ~ trace, dat, mean))
+
+
+alog <- activitylog(dat,
+                    case_id = "trace",
+                    activity_id = "event",
+                    #resource_id = "case",
+                    resource_id = "artwork",
+                    timestamps = c("start", "complete"))
+
+# --> have not understood, yet, which ist what...
+
+process_map(alog)
+
+process_map(alog, frequency("relative"))
+process_map(alog, frequency("relative_consequent"))
+
+library(processanimateR)
+
+animate_process(to_eventlog(alog))
+
+col_vector <- c("#7FC97F", "#BEAED4", "#FDC086", "#FFFF99", "#386CB0",
+                "#F0027F", "#BF5B17", "#666666", "#1B9E77", "#D95F02",
+                "#7570B3", "#E7298A", "#66A61E", "#E6AB02", "#A6761D",
+                "#666666", "#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C",
+                "#FB9A99", "#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6",
+                "#6A3D9A", "#FFFF99", "#B15928", "#FBB4AE", "#B3CDE3",
+                "#CCEBC5", "#DECBE4", "#FED9A6", "#FFFFCC", "#E5D8BD",
+                "#FDDAEC", "#F2F2F2", "#B3E2CD", "#FDCDAC", "#CBD5E8",
+                "#F4CAE4", "#E6F5C9", "#FFF2AE", "#F1E2CC", "#CCCCCC",
+                "#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00",
+                "#FFFF33", "#A65628", "#F781BF", "#999999", "#66C2A5",
+                "#FC8D62", "#8DA0CB", "#E78AC3", "#A6D854", "#FFD92F",
+                "#E5C494", "#B3B3B3", "#8DD3C7", "#FFFFB3", "#BEBADA",
+                "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5",
+                "#D9D9D9")
+
+animate_process(to_eventlog(alog), mode = "relative", jitter = 10, legend = "color",
+  mapping = token_aes(color = token_scale("artwork", 
+    scale = "ordinal", 
+    range = col_vector)))
+
+elog <- to_eventlog(alog)
+animate_process(elog[elog$artwork == "054", ])
+animate_process(elog[elog$artwork == "080", ])
+animate_process(elog[elog$artwork == "501", ])
+
+process_map(alog[alog$artwork == "054", ])
+
+animate_process(elog[elog$artwork %in% c("080", "054"), ],
+                mode = "relative", jitter = 10, legend = "color",
+                mapping = token_aes(color = token_scale("artwork",
+                                                scale = "ordinal",
+                                                range = c("black", "gray"))))
+# --> not sure, yet, how to interpret this...
+
+
+
+
+alog080 <- activitylog(dat[dat$artwork %in% "080", ],
+                    #case_id = "case",
+                    case_id = "trace",
+                    activity_id = "event",
+                    #resource_id = "trace",
+                    resource_id = "case",
+                    timestamps = c("start", "complete"))
+
+process_map(alog080, frequency("relative"))
+
+
+
+alog054 <- activitylog(dat[dat$artwork %in% "054", ],
+                    #case_id = "case",
+                    case_id = "trace",
+                    activity_id = "event",
+                    #resource_id = "trace",
+                    resource_id = "case",
+                    timestamps = c("start", "complete"))
+
+process_map(alog054, frequency("relative"))
+
+
+
+
@@ -0,0 +1,125 @@
+#' ---
+#' title: "Open Questions"
+#' author: "Nora Wickelmaier"
+#' date: "`r Sys.Date()`"
+#' output: 
+#'   html_document:
+#'     number_sections: true
+#'     toc: true
+#' ---
+
+#+ include = FALSE
+# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
+dat <- read.table("../data/event_logfiles.csv", sep = ";", header = TRUE)
+dat$date.start <- as.POSIXct(dat$date.start)
+dat$date.stop <- as.POSIXct(dat$date.stop)
+
+#' This is what the data look like after preprocessing right now
+
+#+ include = FALSE
+mat <- as.data.frame(t(sapply(dat, range, na.rm = TRUE)))
+names(mat) <- c("min", "max")
+mat$min <- round(as.numeric(mat$min), 1)
+mat$max <- round(as.numeric(mat$max), 1)
+mat$mean <- round(sapply(dat, function(x) mean(x, na.rm = TRUE)), 1)
+mat$missings <- sapply(dat, function(x) sum(is.na(x)))
+mat <- mat[!(rownames(mat) %in% c("eventid", "case", "trace", "event", "artwork", "card", "popup", "date.start", "date.stop")), ]
+
+#+ echo = FALSE
+knitr::kable(mat)
+
+#' This is only the data for 2016! So only about 2 weeks in December.
+
+# Date ranges
+range(dat$date.start)
+range(dat$date.stop, na.rm = TRUE)
+
+#' # Units of x and y
+#' I assume that x and y are pixel $\to$ correct?
+
+#' But they look weird, when plotted. Is it possible that there are
+#' outliers? If yes, how? Do we have the true ranges of the display?
+
+par(mfrow = c(1, 2))
+plot(y.start ~ x.start, dat)
+abline(v = c(0, 3800), h = c(0, 2150), col = "blue", lwd = 2)
+plot(y.stop ~ x.stop, dat)
+abline(v = c(0, 3800), h = c(0, 2150), col = "blue", lwd = 2)
+
+aggregate(cbind(x.start, x.stop, y.start, y.stop) ~ 1, dat, mean)
+
+#' Looks like the range should be something like $x = [0, 3800]$ and
+#' $y = [0, 2150]$. Do we have the starting coordinates for each artwork?
+#'
+
+#' # Unit of scale
+
+summary(dat$scaleSize)
+
+#' I thought it would be some kind of scaling factor, but then I would
+#' have expected that `scale.start` is always 1 or something.
+#'
+
+#' # Unit of rotation
+
+summary(dat$rotationDegree)
+
+#' This looks pretty clear. Should be degree. Anything else to consider
+#' here? I am assuming negative means left, but maybe not?
+#'
+
+#' # Meaningful unit for "case"
+
+#' I pretty randomly chose `20 sec` based on this plot. I would love a
+#' second opinion. `:)`
+
+timediff <- as.numeric(diff(c(dat$date.start[1], dat$date.start)))
+hist(timediff[timediff < 40], breaks = 50)
+abline(v = 20, col = "red", lwd = 2)
+
+#' This actually works pretty well and lets me assign `trace` values to the
+#' moves. But maybe there are other ideas on how to define this?
+
+dat[1:40, c("date.start", "case", "trace", "event", "artwork")]
+
+
+#' # Problems with `time_ms`
+
+#' What exactly happens, when `time_ms` goes down again? Why does it not go
+#' down to 0?
+
+par(mfrow = c(1, 2))
+
+plot(dat$time_ms.start[1:100], type = "b", ylab = "time_ms", xlab = "")
+points(dat$time_ms.stop[1:100], type = "b", col = rgb(1, 0, 0, .5))
+legend("topleft", c("start", "stop"), lty = 1, col = c("black", "red"))
+
+plot(dat$time_ms.stop[1:100] - dat$time_ms.start[1:100], type = "b",
+     ylab = "duration", col = rgb(0, 0, 1, .5))
+abline(h = 0, lty = 2)
+
+#' For the regular timestamps everything looks fine.
+
+par(mfrow = c(1, 2))
+
+plot(dat$date.stop[1:100], type = "b", ylab = "timestamp", xlab = "",
+     col = rgb(1, 0, 0, .5))
+points(dat$date.start[1:100], type = "b")
+legend("topleft", c("start", "stop"), lty = 1, col = c("black", "red"))
+
+plot(dat$date.stop[1:100] - dat$date.start[1:100], type = "b",
+     ylab = "duration", col = rgb(0, 0, 1, .5))
+abline(h = 0, lty = 2)
+
+#+
+plot(time_ms.start ~ date.start, dat[1:1000, ], type = "b")
+points(time_ms.stop ~ date.stop, dat[1:1000, ], type = "b", col = rgb(1, 0, 0, .3))
+
+#' For `time_ms.stop` this looks even weirder.
+#'
+
+#' # After which time interval does the table reset?
+
+#' I cannot see this in the data at all. Or can I? Has this something to do
+#' with the weird behavior of `time_ms`?
+
@@ -0,0 +1,111 @@
+#' ---
+#' title: "Programming input"
+#' author: "Nora Wickelmaier"
+#' date: "`r Sys.Date()`"
+#' output: html_document
+#' ---
+
+#+ include = FALSE
+# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
+
+#+
+dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";", header = TRUE)
+dat0$date <- as.POSIXct(dat0$date)  # create date object
+
+# Remove irrelevant events
+dat <- subset(dat0, !(dat0$event %in% c("Start Application", "Show Application")))
+str(dat)
+
+# make data better manageable
+tmp <- dat[!dat$event %in% c("Transform start", "Transform stop"), ]
+rownames(tmp) <- NULL
+
+#' # Add `trace` variable for closing events
+
+tmp$trace <- NA
+last_event <- tmp$event[1]
+aws <- unique(tmp$artwork)[unique(tmp$artwork) != "glossar"]
+
+for (art in aws) {              # select artwork
+
+  for (i in 1:nrow(tmp)) {      # go through rows
+
+    if (last_event == "Show Info" & tmp$artwork[i] == art) {
+      tmp$trace[i] <- i
+      j <- i
+
+    } else if (last_event == "Show Front" & tmp$artwork[i] == art) {
+      tmp$trace[i] <- j
+
+    } else if (!(last_event %in% c("Show Info", "Show Front")) &
+               tmp$artwork[i] == art) {
+      tmp$trace[i] <- j
+    }
+
+    if (i <= nrow(tmp)) {
+      last_event <- tmp$event[i + 1]
+    }
+  }
+}
+
+head(tmp[, c("artwork", "event", "trace")], 50)
+
+#' # Find artwork for glossar entry
+
+glossar_files <- unique(tmp[tmp$artwork == "glossar", "popup"])
+
+# Load lookup table for artworks and glossar files
+load("../data/glossar_dict.RData")
+lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ]
+
+# Fill in trace variable based on last `Show Info`
+for (file in lut$glossar_file) {
+
+  artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])
+
+  for (i in seq_len(nrow(tmp))) {
+
+    if (tmp$event[i] == "Show Info") {
+
+      current_artwork <- tmp[i, "artwork"]
+      j <- i
+      k <- i
+
+    } else {
+
+      current_artwork <- current_artwork
+
+    }
+
+    if (tmp$event[i] == "Show Front" & tmp$artwork[i] == current_artwork) {
+    # make sure artwork has not been closed, yet!
+      k <- i
+    }
+
+    if (tmp$artwork[i] == "glossar" &
+        (current_artwork %in% artwork_list) &
+        tmp$popup[i] == file & (j-k == 0)) {
+
+      tmp[i, "trace"] <- tmp[j, "trace"]
+
+    }
+  }
+}
+
+tmp[tmp$artwork == "glossar", c("artwork", "event", "popup", "trace")]
+
+proportions(table(is.na(tmp$trace[tmp$artwork == "glossar"])))
+# --> finds about half of the glossar entries for small data set...
+
+# REMEMBER: It can never be 100% correct, since it is always possible that
+# several cards are open and that they link to the same glossar entry
+
+# How many glossar_files are only associated with one artwork?
+lut[sapply(lut$artwork, length) == 1, "glossar_file"]
+
+# TODO: Fill in the ones that are associated with one artwork
+# --> Can't come up with something -- maybe ask Philipp???
+
+# TODO: How to check if one of the former "Show Infos" is correct
+# --> Can't come up with something -- maybe ask Philipp???
+