From 2a2eab4b9f8ee5bfa924643a4cc1c01bccb36fc1 Mon Sep 17 00:00:00 2001
From: nwickel <n.wickelmaier@iwm-tuebingen.de>
Date: Fri, 15 Sep 2023 16:22:21 +0200
Subject: [PATCH] Started trying out how preprocessing behaves when done on
 full data set

---
 README.Rmd              | 42 +++++++++++++++++++
 code/02_preprocessing.R | 21 +++++++++-
 code/functions.R        | 90 +++++++++++++++++++++++++----------------
 3 files changed, 118 insertions(+), 35 deletions(-)
diff --git a/README.Rmd b/README.Rmd
index 9b7be92..7a15a5d 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -427,6 +427,48 @@ out), but `topicNumber` for this artwork has 6 different entries. I will
 therefore extract the topics from `<artwork>.html`. (This seems also better
 compatible with other data sets like 8o8m.)
 
+## New artworks "504" and "505" starting October 2022
+
+When I read in the complete data frame for the first time, all of the
+sudden there were 72 instead of 70 artworks. It seems like these two
+artworks appear on October 21, 2022.
+
+```{r}
+dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";", header = TRUE)
+dat0$date <- as.POSIXct(dat0$date)
+dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
+
+# Remove irrelevant events
+dat <- subset(dat0, !(dat0$event %in% c("Start Application",
+                                        "Show Application")))
+
+summary(dat[dat$artwork %in% c("504", "505"), ])
+```
+
+The artworks seem to be have updated in general after October 21, 2022.
+
+```{r}
+art_before_oct2022 <- sort(unique(dat[dat$date >= "2022-10-21", "artwork"]))
+art_before_oct2022 <- sort(unique(dat[dat$date <= "2022-10-21", "artwork"]))
+# Removed artworks
+art_before_oct2022[!art_before_oct2022 %in% art_after_oct2022]
+# Additional artworks
+art_after_oct2022[!art_after_oct2022 %in% art_before_oct2022]
+```
+
+The following table shows which artworks were presented in which years.
+
+```{r}
+xtabs(~ artwork + lubridate::year(date), dat)
+```
+
+It strongly suggests that the artworks haven been updated after the Corona
+pandemic. I think, the table was also moved to a different location at that
+point. (Check with PG to make sure.)
+
+I need to get the XML files for "504" and "505" from PM in order to extract
+information on them for the metadata.
+
 # Reading list
 
 * @Arizmendi2022 [--]
diff --git a/code/02_preprocessing.R b/code/02_preprocessing.R
index 693c1b5..843bd95 100644
--- a/code/02_preprocessing.R
+++ b/code/02_preprocessing.R
@@ -3,7 +3,8 @@
 source("functions.R")
 
 # Read data ##############################################################
-dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";",
+cat("########## Reading in data... ##########", "\n")
+dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";",
                    header = TRUE)
 dat0$date <- as.POSIXct(dat0$date)
 dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
@@ -12,10 +13,16 @@ dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
 dat <- subset(dat0, !(dat0$event %in% c("Start Application",
                                         "Show Application")))
 
+save(dat, file = "tmp/dat.RData")
+
 # Add trace variable #####################################################
+cat("########## Adding trace variable... ##########", "\n")
 dat1 <- add_trace(dat)
 
+save(dat1, file = "tmp/dat1.RData")
+
 # Close events
+cat("########## Closing events...")
 dat2 <- rbind(close_events(dat1, "move"),
               close_events(dat1, "flipCard"),
               close_events(dat1, "openTopic"),
@@ -32,7 +39,10 @@ dat2 <- dat2[!is.na(dat2$date.start), ]
 rownames(dat2) <- NULL
 # TODO: Throw warning about this
 
+save(dat2, file = "tmp/dat2.RData")
+
 # Add case variable ######################################################
+cat("########## Adding case and eventId variables... ##########", "\n")
 dat3 <- add_case(dat2)
 
 # Add event ID ###########################################################
@@ -46,19 +56,28 @@ dat3 <- dat3[, c("fileId.start", "fileId.stop", "eventId", "case",
                  "scaleSize", "rotation.start", "rotation.stop",
                  "rotationDegree")]
 
+save(dat3, file = "tmp/dat3.RData")
+
 # Add trace for move events ##############################################
+cat("########## Adding trace variable for move events... ##########", "\n")
 dat4 <- add_trace_moves(dat3)
 
+save(dat4, file = "tmp/dat4.RData")
+
 # Add topics: file names and topics ######################################
+cat("########## Adding information about topics... ##########", "\n")
 artworks <- unique(dat4$artwork)
 topics <- extract_topics(artworks, pattern = paste0(artworks, ".xml"),
                          path = "../data/ContentEyevisit/eyevisit_cards_light/")
 
 dat5 <- add_topic(dat4, topics = topics)
 
+save(dat5, file = "tmp/dat5.RData")
+
 # TODO: Replace artwork with informative strings
 
 # Export data ############################################################
+cat("########## Exporting data frame with event logs... ##########", "\n")
 write.table(dat5, "../data/event_logfiles.csv", sep = ";",
             row.names = FALSE)
 
diff --git a/code/functions.R b/code/functions.R
index 857adec..1f8bfb4 100644
--- a/code/functions.R
+++ b/code/functions.R
@@ -9,28 +9,36 @@ add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") {
   subdata2 <- data[!data$event %in% c("Transform start", "Transform stop"), ]
 
   last_event <- subdata2$event[1]
-  aws <- unique(subdata2$artwork)[unique(subdata2$artwork) != "glossar"]
+  artworks <- unique(subdata2$artwork)[unique(subdata2$artwork) != "glossar"]
+  n <- 1    # count artworks for progress
 
-  for (art in aws) {
+  pb <- txtProgressBar(min = 0, max = nrow(subdata2), style = 3)
+
+  for (artwork in artworks) {
+
+    cat("\n\nAdding trace variable for artwork", artwork,
+               paste0("(", n, "/", length(artworks), ")"), "\n")
 
     for (i in 1:nrow(subdata2)) {
 
-      if (last_event == "Show Info" & subdata2$artwork[i] == art) {
+      if (last_event == "Show Info" & subdata2$artwork[i] == artwork) {
         subdata2$trace[i] <- i
         j <- i
 
-      } else if (last_event == "Show Front" & subdata2$artwork[i] == art) {
+      } else if (last_event == "Show Front" & subdata2$artwork[i] == artwork) {
         subdata2$trace[i] <- j
 
       } else if (!(last_event %in% c("Show Info", "Show Front")) &
-                 subdata2$artwork[i] == art) {
+                 subdata2$artwork[i] == artwork) {
         subdata2$trace[i] <- j
       }
 
       if (i <= nrow(subdata2)) {
         last_event <- subdata2$event[i + 1]
       }
+      setTxtProgressBar(pb, i)
     }
+    n <- n + 1
   }
 
   # Fix glossar entries (find corresponding artworks and fill in trace)
@@ -45,8 +53,13 @@ add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") {
                                "glossar_file"]]
   single_art  <- unlist(lut[lut$glossar_file %in% inside, "artwork"])
 
+  m <- 1
+
   for (file in lut$glossar_file) {
 
+    cat("\n\nAdding trace variable for glossar entry", file,
+               paste0("(", m, "/", length(lut$glossar_file), ")"), "\n")
+
     artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])
 
     for (i in seq_len(nrow(subdata2))) {
@@ -78,11 +91,13 @@ add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") {
         subdata2[i, "artwork"] <- current_artwork
 
       }
+      setTxtProgressBar(pb, i)
     }
+    m <- m + 1
   }
 
   # Exclude not matched glossar entries
-  cat("INFORMATION: glossar entries that are not matched will be removed:",
+  cat("\n\nINFORMATION: glossar entries that are not matched will be removed:",
       sum(is.na(subdata2[subdata2$glossar == 1, "trace"])), "entries",
       #proportions(table(is.na(subdata2[subdata2$glossar == 1, "trace"]))),
       fill = TRUE)
@@ -229,40 +244,52 @@ add_case <- function(data, cutoff = 20) {
 add_trace_moves <- function(data) {
 
   cases <- unique(data$case)
-  aws <- unique(data$artwork)[unique(data$artwork) != "glossar"]
+  artworks <- unique(data$artwork)[unique(data$artwork) != "glossar"]
   max_trace <- max(data$trace, na.rm = TRUE) + 1
   out <- NULL
 
-  for (case in cases) {
-    for (art in aws) {
-      tmp <- data[data$case == case & data$artwork == art, ]
-      if (nrow(tmp) != 0) {
+  pb <- txtProgressBar(min = 0, max = length(artworks), style = 3)
 
-        if (length(na.omit(unique(tmp$trace))) == 1) {
-          tmp[tmp$event == "move", "trace"] <- na.omit(unique(tmp$trace))
-        } else if (length(na.omit(unique(tmp$trace))) > 1) {
-          for (i in 1:nrow(tmp)) {
-            if (tmp$event[i] == "move") {
+  n <- 1    # count cases for progress
+
+  for (case in cases) {
+
+    cat("\n\nAdding trace variable for move events per case",
+               paste0("(", n, "/", length(cases), ")"), "\n")
+    j <- 1
+
+    for (artwork in artworks) {
+      subdata <- data[data$case == case & data$artwork == artwork, ]
+      if (nrow(subdata) != 0) {
+
+        if (length(na.omit(unique(subdata$trace))) == 1) {
+          subdata[subdata$event == "move", "trace"] <- na.omit(unique(subdata$trace))
+        } else if (length(na.omit(unique(subdata$trace))) > 1) {
+          for (i in 1:nrow(subdata)) {
+            if (subdata$event[i] == "move") {
               if (i == 1) {
-                tmp$trace[i] <- na.omit(unique(tmp$trace))[1]
+                subdata$trace[i] <- na.omit(unique(subdata$trace))[1]
               } else {
-                tmp$trace[i] <- tmp$trace[i - 1]
+                subdata$trace[i] <- subdata$trace[i - 1]
               }
             }
           }
-        } else if (all(is.na(tmp$trace))) {
-          for (i in 1:nrow(tmp)) {
-            if (tmp$event[i] == "move") {
-              tmp$trace[i] <- max_trace
+        } else if (all(is.na(subdata$trace))) {
+          for (i in 1:nrow(subdata)) {
+            if (subdata$event[i] == "move") {
+              subdata$trace[i] <- max_trace
             }
           }
         }
         max_trace <- max_trace + 1
       }
-      if (nrow(tmp) > 0) {
-        out <- rbind(out, tmp)
+      if (nrow(subdata) > 0) {
+        out <- rbind(out, subdata)
       }
+      setTxtProgressBar(pb, j)
+      j <- j + 1
     }
+    n <- n + 1
   }
   out <- out[order(out$date.start, out$fileId.start), ]
   rownames(out) <- NULL
@@ -340,19 +367,19 @@ add_topic <- function(data, topics) {
   dat_split <- split(data, data$artwork)
 
   set_label <- function(x) {
-    art <- unique(x$artwork)
-    x$topicIndex <- factor(x$topicNumber, labels = tab_index[[art]])
+    artwork <- unique(x$artwork)
+    x$topicIndex <- factor(x$topicNumber, labels = tab_index[[artwork]])
     x
   }
 
   dat_label <- lapply(dat_split, set_label)
 
   set_topic <- function(x) {
-    art <- unique(x$artwork)
-    labels_file <- topics[topics$artwork == art,
+    artwork <- unique(x$artwork)
+    labels_file <- topics[topics$artwork == artwork,
                           "file_name"][as.numeric(levels(x$topicIndex))]
     x$topicFile <- as.character(factor(x$topicIndex, labels = labels_file))
-    labels_topic <- topics[topics$artwork == art,
+    labels_topic <- topics[topics$artwork == artwork,
                           "topic"][as.numeric(levels(x$topicIndex))]
     x$topic <- as.character(factor(x$topicIndex, labels = labels_topic))
     x
@@ -360,11 +387,6 @@ add_topic <- function(data, topics) {
 
   dat_topic <- lapply(dat_label, set_topic)
 
-  # table(dat_topic[["501"]]$topicNumber)
-  # table(dat_topic[["501"]]$topicIndex)
-  # table(dat_topic[["501"]]$topicFile)
-  # table(dat_topic[["501"]]$topic)
-
   out <- do.call(rbind, dat_topic)
   out <- out[order(out$date.start, out$fileId.start), ]
   rownames(out) <- NULL