From bfc5c1d93065edecb9e364796ebabd906499c155 Mon Sep 17 00:00:00 2001
From: nwickel <n.wickelmaier@iwm-tuebingen.de>
Date: Fri, 1 Sep 2023 15:01:54 +0200
Subject: [PATCH] Worked on extracting topics for cards

---
 README.md                        |  4 +++
 code/02_preprocessing.R          | 57 ++++++++++---------------------
 code/03_topic-cards.R            | 42 +++++++++++++++++++++++
 code/questions_number-of-cards.R | 58 ++++++++++++++++++++++++++++++++
 4 files changed, 121 insertions(+), 40 deletions(-)
 create mode 100644 code/03_topic-cards.R
 create mode 100644 code/questions_number-of-cards.R

diff --git a/README.md b/README.md
index 28bdeab..7e78064 100644
--- a/README.md
+++ b/README.md
@@ -242,6 +242,10 @@ Will probably just get rid of them!
 Think about if you want give warning messages about these deletions in the
 functions.
 
+## Card indices go from 0 to 7 (instead of 0 to 5 as expected)
+
+See `questions_number-of-cards.R` for details.
+
 # Reading list
 
 * @Arizmendi2022 [$-$]
diff --git a/code/02_preprocessing.R b/code/02_preprocessing.R
index 54b37ec..6127d13 100644
--- a/code/02_preprocessing.R
+++ b/code/02_preprocessing.R
@@ -53,12 +53,12 @@ num_stop <- c(diff(c(0, which(dat1$event == "Transform start"))))
 table(num_stop)
 
 # TODO: Do I still need this?
-dat1$eventrep <- rep(num_start, num_start)
-dat1$dupl <- duplicated(dat1[, c("event", "eventid")])                    # keep first
-dat1$dupl <- duplicated(dat1[, c("event", "eventid")], fromLast = TRUE)   # keep last
-dat1[dat1$eventrep == 10, ]
-dat1$dupl <- NULL
-dat1$eventrep <- NULL
+# dat1$eventrep <- rep(num_start, num_start)
+# dat1$dupl <- duplicated(dat1[, c("event", "eventid")])                    # keep first
+# dat1$dupl <- duplicated(dat1[, c("event", "eventid")], fromLast = TRUE)   # keep last
+# dat1[dat1$eventrep == 10, ]
+# dat1$dupl <- NULL
+# dat1$eventrep <- NULL
 
 
 # remove duplicated "Transform start" events
@@ -89,7 +89,7 @@ trans_wide <- reshape(dat1, direction = "wide",
 # check how often an eventid is associated with two fileids
 nrow(subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop))
 
-# exclude from data set ??
+# TODO: exclude from data set ??
 # trans_wide <- subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop)
 
 # which(is.na(trans_wide$date.start))
@@ -167,7 +167,7 @@ tail(dat2[, c("artwork", "event", "trace")], 50)
 
 rm(aws, i, j, last_event, art)
 
-#' ## Fix glossar entries (find corresponding artworks)
+#' ## Fix glossar entries (find corresponding artworks and fill in trace)
 
 glossar_files <- unique(dat2[dat2$artwork == "glossar", "popup"])
 
@@ -256,9 +256,9 @@ dat2[14110:14130, ]
 
 # TODO: Integrate for-loop into for-loop above
 
-# TODO: For now: Exclude not matched glossar entries
 
 df <- subset(dat2, !is.na(dat2$trace))
+# TODO: For now: Exclude not matched glossar entries
 df <- df[order(df$trace), ]
 rownames(df) <- NULL
 
@@ -279,20 +279,10 @@ flipCard_wide$event <- "flipCard"
 flipCard_wide$duration <- flipCard_wide$time_ms.stop -
   flipCard_wide$time_ms.start
 
-# TODO: Check if I still need to enter all of these variables
-# --> x, y, scale, rotation?
 flipCard_wide$card            <- NA
 flipCard_wide$popup           <- NA
-flipCard_wide$x.start         <- NA
-flipCard_wide$x.stop          <- NA
-flipCard_wide$y.start         <- NA
-flipCard_wide$y.stop          <- NA
 flipCard_wide$distance        <- NA
-flipCard_wide$scale.start     <- NA
-flipCard_wide$scale.stop      <- NA
 flipCard_wide$scaleSize       <- NA
-flipCard_wide$rotation.start  <- NA
-flipCard_wide$rotation.stop   <- NA
 flipCard_wide$rotationDegree  <- NA
 
 dat_flipCard <- flipCard_wide[, c("fileid.start", "fileid.stop", "event",
@@ -325,18 +315,9 @@ openTopic_wide$event <- "openTopic"
 openTopic_wide$duration <- openTopic_wide$time_ms.stop -
   openTopic_wide$time_ms.start
 
-
 openTopic_wide$popup           <- NA
-openTopic_wide$x.start         <- NA
-openTopic_wide$x.stop          <- NA
-openTopic_wide$y.start         <- NA
-openTopic_wide$y.stop          <- NA
 openTopic_wide$distance        <- NA
-openTopic_wide$scale.start     <- NA
-openTopic_wide$scale.stop      <- NA
 openTopic_wide$scaleSize       <- NA
-openTopic_wide$rotation.start  <- NA
-openTopic_wide$rotation.stop   <- NA
 openTopic_wide$rotationDegree  <- NA
 
 dat_openTopic <- openTopic_wide[, c("fileid.start", "fileid.stop", "event",
@@ -375,22 +356,13 @@ openPopup_wide <- reshape(dat5, direction = "wide",
 # df[df$trace == 4595, ]
 # --> artwork 046 popup selene.xml gets opened twice
 
-
 openPopup_wide$event <- "openPopup"
 openPopup_wide$duration <- openPopup_wide$time_ms.stop -
   openPopup_wide$time_ms.start
 
 openPopup_wide$card            <- NA
-openPopup_wide$x.start         <- NA
-openPopup_wide$x.stop          <- NA
-openPopup_wide$y.start         <- NA
-openPopup_wide$y.stop          <- NA
 openPopup_wide$distance        <- NA
-openPopup_wide$scale.start     <- NA
-openPopup_wide$scale.stop      <- NA
 openPopup_wide$scaleSize       <- NA
-openPopup_wide$rotation.start  <- NA
-openPopup_wide$rotation.stop   <- NA
 openPopup_wide$rotationDegree  <- NA
 
 dat_openPopup <- openPopup_wide[, c("fileid.start", "fileid.stop", "event",
@@ -529,12 +501,17 @@ out <- out[order(out$date.start), ]
 rownames(out) <- NULL
 
 # Make `trace` a consecutive number
-out$trace2 <- as.numeric(factor(out$trace, levels = unique(out$trace)))
+out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace)))
+
+#' # Fill in topics
+
+topics <- read.table("../data/topics.csv", sep = ";", header = TRUE)
+# TODO:
 
 #' # Export data
 
-write.table(out, "../data/event_logfiles.csv",
-            sep = ";", quote = FALSE, row.names = FALSE)
+write.table(out, "../data/event_logfiles.csv", sep = ";",
+            row.names = FALSE)
 
 # TODO: Write function for closing events
 
diff --git a/code/03_topic-cards.R b/code/03_topic-cards.R
new file mode 100644
index 0000000..3227967
--- /dev/null
+++ b/code/03_topic-cards.R
@@ -0,0 +1,42 @@
+# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light")
+rm(list=ls())
+
+dat0 <- read.table("../../event_logfiles.csv", sep = ";", header = TRUE)
+dat0$artwork <- sprintf("%03d", dat0$artwork)
+
+# artwork names
+artworks <- sort(unique(dat0$artwork))
+
+# create data frame with file names and topics for each artwork
+
+dat <- NULL
+file_order <- NULL
+
+for (artwork in artworks) {
+  fnames <- dir(pattern = paste0(artwork, "_"), path = artwork, full.names = TRUE)
+  topic <- NULL
+  for (fname in fnames) {
+    topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
+                           grep("^<card type=", trimws(readLines(fname)), value = T)))
+    
+  }
+  index <- paste(artwork, "index.xml", sep = "/")
+  file_order <- c(file_order, gsub("^<card src=.(.*)./>$", "\\1",
+                       grep("^<card src=", trimws(readLines(index)), value = TRUE)))
+  in_index <- fnames %in% file_order
+  dat <- rbind(dat, data.frame(artwork, file_name = fnames, in_index, topic))
+}
+
+table(dat$artwork)
+table(dat$topic)
+
+# take only the ones that are actually displayed and sort in the same order
+# as indicated in index.html
+
+dat2 <- dat[dat$in_index, -3]
+dat2 <- dat2[order(file_order, dat2$file_name), ]
+
+dat2$index <- unlist(sapply(table(dat2$artwork), seq_len))
+
+write.table(dat2, file = "../../topics.csv", sep = ";", row.names = FALSE)
+
diff --git a/code/questions_number-of-cards.R b/code/questions_number-of-cards.R
new file mode 100644
index 0000000..19f7a2b
--- /dev/null
+++ b/code/questions_number-of-cards.R
@@ -0,0 +1,58 @@
+#' ---
+#' title: "Open Questions -- Card indices"
+#' author: "Nora Wickelmaier"
+#' date: "`r Sys.Date()`"
+#' output: 
+#'   html_document:
+#'     number_sections: true
+#'     toc: true
+#' ---
+
+#+ include = FALSE
+# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
+dat <- read.table("../data/event_logfiles.csv", sep = ";", header = TRUE)
+dat$date.start <- as.POSIXct(dat$date.start)
+dat$date.stop <- as.POSIXct(dat$date.stop)
+dat$artwork <- sprintf("%03d", dat$artwork)
+
+#' The following table shows an overview of the card indices. The indices
+#' should have values between 0 and 5. It is unclear what the numbers mean.
+
+table(dat$card)
+
+#' Number of cards for each artwork in the data set (subset from 2016)
+
+artworks <- sort(unique(dat$artwork))
+
+count <- function(x) length(table(dat[which(dat$artwork == x), "card"]))
+max_index <- function(x) max(dat[which(dat$artwork == x), "card"], na.rm = TRUE)
+num_cards <- sapply(artworks, count)
+highest_index <- sapply(artworks, max_index)
+
+#' Check how many XML-files for cards are present
+
+path <- "../data/ContentEyevisit/eyevisit_cards_light"
+
+num_files <- NULL
+for (artwork in artworks) {
+  fnames <- dir(pattern = paste0(artwork, "_"), path = paste(path, artwork, sep = "/"))
+  num_files <- c(num_files, length(fnames))
+}
+
+#' The table shows that each artwork has 6 cards the most (as expected).
+#' This is a subset of the data, so not all cards have been opened.
+
+cards <- data.frame(artwork = artworks, num_cards, highest_index,
+                    num_files, diff = num_files - highest_index)
+cards
+
+#' There are more than 8 files for a couple of artworks:
+
+subset(cards, cards$num_files >= 8)
+
+#' It might be possible, that the number indicates the index of the file
+#' and not the actual card that was displayed. BUT: In many cases, there
+#' are only 6 (or less) files, but a higher index is present...
+
+subset(cards, cards$diff < 0)
+