From 495665a659392d3622ed1ba37bf0437d3d1099fc Mon Sep 17 00:00:00 2001
From: nwickel <n.wickelmaier@iwm-tuebingen.de>
Date: Thu, 31 Aug 2023 16:12:34 +0200
Subject: [PATCH] Cleaned out some commented-out code that I do not need anymore

---
 code/02_preprocessing.R | 77 ++++++-----------------------------------
 1 file changed, 11 insertions(+), 66 deletions(-)

diff --git a/code/02_preprocessing.R b/code/02_preprocessing.R
index e66394d..54b37ec 100644
--- a/code/02_preprocessing.R
+++ b/code/02_preprocessing.R
@@ -14,17 +14,6 @@
 
 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
 
-# LogEntry classes:
-#   TRANSFORM_START:    "Transform start" --> "Transformation Start" in Tool
-#   TRANSFORM_STOP:     "Transform stop"
-#   START_APPLICATION:  "Start Application"
-#   SHOW_APPLICATION:   "Show Application"
-#   SHOW_INFO:          "Show Info"       --> "Flip Card" in Tool
-#   SHOW_FRONT:         "Show Front"
-#   SHOW_POPUP:         "ShowPopup"       --> "Show Popup" in Tool
-#   HIDE_POPUP:         "HidePopup"
-#   ARTWORK:            "Artwork"         --> "Show Topic" in Tool
-
 #' # Read data
 
 dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";",
@@ -63,11 +52,11 @@ table(table(dat1$eventid))
 num_stop <- c(diff(c(0, which(dat1$event == "Transform start"))))
 table(num_stop)
 
+# TODO: Do I still need this?
 dat1$eventrep <- rep(num_start, num_start)
 dat1$dupl <- duplicated(dat1[, c("event", "eventid")])                    # keep first
 dat1$dupl <- duplicated(dat1[, c("event", "eventid")], fromLast = TRUE)   # keep last
 dat1[dat1$eventrep == 10, ]
-
 dat1$dupl <- NULL
 dat1$eventrep <- NULL
 
@@ -145,24 +134,13 @@ summary(dat_trans)
 #' # Close other events
 
 dat2 <- dat[!dat$event %in% c("Transform start", "Transform stop"), ]
-# dat2$x <- NULL
-# dat2$y <- NULL
-# dat2$scale <- NULL
-# dat2$rotation <- NULL
 rownames(dat2) <- NULL
 
-# Create event ID for closing events
-# num_start <- diff(c(0, which(dat2$event == "Show Front")))
-# dat2$trace <- rep(seq_along(num_start), num_start)
-# head(dat2[, c("artwork", "event", "trace")], 50)
-# --> does not work because of glossar entries... can't sort by artwork
-
-
 dat2$trace <- NA
 last_event <- dat2$event[1]
 aws <- unique(dat2$artwork)[unique(dat2$artwork) != "glossar"]
 #
-for (art in aws) {              # select artwork
+for (art in aws) {               # select artwork
 
   for (i in 1:nrow(dat2)) {      # go through rows
 
@@ -189,9 +167,7 @@ tail(dat2[, c("artwork", "event", "trace")], 50)
 
 rm(aws, i, j, last_event, art)
 
-## Fix glossar entries
-
-### Find artwork for glossar entry
+#' ## Fix glossar entries (find corresponding artworks)
 
 glossar_files <- unique(dat2[dat2$artwork == "glossar", "popup"])
 
@@ -278,7 +254,7 @@ for (file in tmp_lut$glossar_file) {
 
 dat2[14110:14130, ]
 
-# TODO: Integrate for loop into for loop above
+# TODO: Integrate for-loop into for-loop above
 
 # TODO: For now: Exclude not matched glossar entries
 
@@ -303,7 +279,8 @@ flipCard_wide$event <- "flipCard"
 flipCard_wide$duration <- flipCard_wide$time_ms.stop -
   flipCard_wide$time_ms.start
 
-
+# TODO: Check if I still need to enter all of these variables
+# --> x, y, scale, rotation?
 flipCard_wide$card            <- NA
 flipCard_wide$popup           <- NA
 flipCard_wide$x.start         <- NA
@@ -377,6 +354,7 @@ dat_openTopic <- openTopic_wide[, c("fileid.start", "fileid.stop", "event",
 rm(openTopic_wide, num_start)
 
 #' ## close openPopup
+
 dat5 <- subset(df, df$event %in% c("ShowPopup", "HidePopup"))
 dat5 <- dat5[order(dat5$artwork, dat5$popup, dat5$date), ]
 rownames(dat5) <- NULL
@@ -430,8 +408,7 @@ rm(num_start, openPopup_wide)
 
 # TODO: Should card maybe also be filled in for "openPopup"?
 
-#' ## Use `rbind()` instead...
-# --> unbeatable in terms of time!
+#' ## Merge data sets for different events
 
 dat_all <- rbind(dat_trans, dat_flipCard, dat_openTopic, dat_openPopup)
 
@@ -439,7 +416,8 @@ dat_all <- rbind(dat_trans, dat_flipCard, dat_openTopic, dat_openPopup)
 nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) +
                   nrow(dat_openTopic) + nrow(dat_openPopup))
 
-# remove all events that do not have a `date.start`
+#' ## Remove all events that do not have a `date.start`
+
 dim(dat_all[is.na(dat_all$date.start), ])
 dat_all <- dat_all[!is.na(dat_all$date.start), ]
 # There is only a `date.stop`, when event is not properly closed, see here:
@@ -456,7 +434,6 @@ dat[31000:31019,]     # this one e.g.
 # not interpretable
 dat_all[which(dat_all$fileid.start != dat_all$fileid.stop), "duration"] <- NA
 
-
 # sort by `start.date`
 dat_all <- dat_all[order(dat_all$date.start), ]
 rownames(dat_all) <- NULL
@@ -470,8 +447,6 @@ summary(dat_all)    # OK, this actually makes a lot of sense :)
 
 #' ## Create case variable
 
-#dat_all$timediff <- as.numeric(dat_all$date.stop - dat_all$date.start)
-
 dat_all$timediff <- as.numeric(diff(c(dat_all$date.start[1], dat_all$date.start)))
 
 hist(dat_all$timediff[dat_all$timediff < 40], breaks = 50)
@@ -507,12 +482,6 @@ dat_all <- dat_all[, c("fileid.start", "fileid.stop", "eventid", "case",
 
 #' ## Add `trace` numbers for `move` events
 
-# when case and artwork are identical and there is only 1 trace value
-# --> assign it to all `move` events for that case and artwork
-# when case and artwork are identical and there is more than 1 trace value
-# --> assign the `trace` value that was right before this `move` event
-# (could, of course, also be after)
-
 cases <- unique(dat_all$case)
 aws <- unique(dat_all$artwork)[unique(dat_all$artwork) != "glossar"]
 max_trace <- max(dat_all$trace, na.rm = TRUE) + 1
@@ -545,7 +514,6 @@ for (case in cases) {
       max_trace <- max_trace + 1
     }
     if (nrow(tmp) > 0) {
-      #print(tmp[, c("case", "event", "trace", "artwork")])
       out <- rbind(out, tmp)
     }
   }
@@ -554,15 +522,7 @@ for (case in cases) {
 # TODO: Get rid of the loops
 # --> This takes forever...
 
-#head(out[, c("time_ms.start", "case", "trace", "event", "artwork")], 55)
-
-#head(dat_all[dat_all$artwork %in% "501", c("time_ms.start", "case", "trace", "event", "artwork")], 50)
-
-# identical(dat_all[which(!dat_all$eventid %in% out$eventid), ],
-#           dat_all[dat_all$artwork == "glossar", ])
-# --> TRUE
-
-# put glossar events back in
+# put glossar events back in --> not relevant anymore
 
 #dat_all <- rbind(out, dat_all[dat_all$artwork == "glossar", ])
 out <- out[order(out$date.start), ]
@@ -571,25 +531,10 @@ rownames(out) <- NULL
 # Make `trace` a consecutive number
 out$trace2 <- as.numeric(factor(out$trace, levels = unique(out$trace)))
 
-#head(out[, c("trace", "trace2")], 50)
-
 #' # Export data
 
 write.table(out, "../data/event_logfiles.csv",
             sep = ";", quote = FALSE, row.names = FALSE)
 
-# Is `artwork` my case? Or `artwork` per day? Or `artwork` per some other
-# unit??? Maybe look at differences between timestamps separately for
-# `artwork`? And identify "new observational unit" this way?
-#
-# Definition: (???)
-# 1. Touching a new `artwork` corresponds to "observational unit change"
-# 2. Time interval of XX min within one `artwork` on the same day
-#    corresponds to "observational unit change"
-
-# Split data frame in list of data frame which all correspond to one
-# artwork
-# dat_art <- split(dat, dat$artwork)
-
 # TODO: Write function for closing events