Worked on extracting topics for cards

2023-09-01 15:01:54 +02:00 · 2023-09-01 15:01:54 +02:00 · bfc5c1d930
commit bfc5c1d930
parent 495665a659
4 changed files with 121 additions and 40 deletions
--- a/README.md
+++ b/README.md
@ -242,6 +242,10 @@ Will probably just get rid of them!
 Think about if you want give warning messages about these deletions in the
 functions.
 ## Card indices go from 0 to 7 (instead of 0 to 5 as expected)
 See `questions_number-of-cards.R` for details.
 # Reading list
 * @Arizmendi2022 [$-$]
--- a/code/02_preprocessing.R
+++ b/code/02_preprocessing.R
@ -53,12 +53,12 @@ num_stop <- c(diff(c(0, which(dat1$event == "Transform start"))))
 table(num_stop)
 # TODO: Do I still need this?
-dat1$eventrep <- rep(num_start, num_start)
+# dat1$eventrep <- rep(num_start, num_start)
-dat1$dupl <- duplicated(dat1[, c("event", "eventid")])                    # keep first
+# dat1$dupl <- duplicated(dat1[, c("event", "eventid")])                    # keep first
-dat1$dupl <- duplicated(dat1[, c("event", "eventid")], fromLast = TRUE)   # keep last
+# dat1$dupl <- duplicated(dat1[, c("event", "eventid")], fromLast = TRUE)   # keep last
-dat1[dat1$eventrep == 10, ]
+# dat1[dat1$eventrep == 10, ]
-dat1$dupl <- NULL
+# dat1$dupl <- NULL
-dat1$eventrep <- NULL
+# dat1$eventrep <- NULL
 # remove duplicated "Transform start" events
@ -89,7 +89,7 @@ trans_wide <- reshape(dat1, direction = "wide",
 # check how often an eventid is associated with two fileids
 nrow(subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop))
-# exclude from data set ??
+# TODO: exclude from data set ??
 # trans_wide <- subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop)
 # which(is.na(trans_wide$date.start))
@ -167,7 +167,7 @@ tail(dat2[, c("artwork", "event", "trace")], 50)
 rm(aws, i, j, last_event, art)
-#' ## Fix glossar entries (find corresponding artworks)
+#' ## Fix glossar entries (find corresponding artworks and fill in trace)
 glossar_files <- unique(dat2[dat2$artwork == "glossar", "popup"])
@ -256,9 +256,9 @@ dat2[14110:14130, ]
 # TODO: Integrate for-loop into for-loop above
 # TODO: For now: Exclude not matched glossar entries
 df <- subset(dat2, !is.na(dat2$trace))
 # TODO: For now: Exclude not matched glossar entries
 df <- df[order(df$trace), ]
 rownames(df) <- NULL
@ -279,20 +279,10 @@ flipCard_wide$event <- "flipCard"
 flipCard_wide$duration <- flipCard_wide$time_ms.stop -
  flipCard_wide$time_ms.start
 # TODO: Check if I still need to enter all of these variables
 # --> x, y, scale, rotation?
 flipCard_wide$card            <- NA
 flipCard_wide$popup           <- NA
 flipCard_wide$x.start         <- NA
 flipCard_wide$x.stop          <- NA
 flipCard_wide$y.start         <- NA
 flipCard_wide$y.stop          <- NA
 flipCard_wide$distance        <- NA
 flipCard_wide$scale.start     <- NA
 flipCard_wide$scale.stop      <- NA
 flipCard_wide$scaleSize       <- NA
 flipCard_wide$rotation.start  <- NA
 flipCard_wide$rotation.stop   <- NA
 flipCard_wide$rotationDegree  <- NA
 dat_flipCard <- flipCard_wide[, c("fileid.start", "fileid.stop", "event",
@ -325,18 +315,9 @@ openTopic_wide$event <- "openTopic"
 openTopic_wide$duration <- openTopic_wide$time_ms.stop -
  openTopic_wide$time_ms.start
 openTopic_wide$popup           <- NA
 openTopic_wide$x.start         <- NA
 openTopic_wide$x.stop          <- NA
 openTopic_wide$y.start         <- NA
 openTopic_wide$y.stop          <- NA
 openTopic_wide$distance        <- NA
 openTopic_wide$scale.start     <- NA
 openTopic_wide$scale.stop      <- NA
 openTopic_wide$scaleSize       <- NA
 openTopic_wide$rotation.start  <- NA
 openTopic_wide$rotation.stop   <- NA
 openTopic_wide$rotationDegree  <- NA
 dat_openTopic <- openTopic_wide[, c("fileid.start", "fileid.stop", "event",
@ -375,22 +356,13 @@ openPopup_wide <- reshape(dat5, direction = "wide",
 # df[df$trace == 4595, ]
 # --> artwork 046 popup selene.xml gets opened twice
 openPopup_wide$event <- "openPopup"
 openPopup_wide$duration <- openPopup_wide$time_ms.stop -
  openPopup_wide$time_ms.start
 openPopup_wide$card            <- NA
 openPopup_wide$x.start         <- NA
 openPopup_wide$x.stop          <- NA
 openPopup_wide$y.start         <- NA
 openPopup_wide$y.stop          <- NA
 openPopup_wide$distance        <- NA
 openPopup_wide$scale.start     <- NA
 openPopup_wide$scale.stop      <- NA
 openPopup_wide$scaleSize       <- NA
 openPopup_wide$rotation.start  <- NA
 openPopup_wide$rotation.stop   <- NA
 openPopup_wide$rotationDegree  <- NA
 dat_openPopup <- openPopup_wide[, c("fileid.start", "fileid.stop", "event",
@ -529,12 +501,17 @@ out <- out[order(out$date.start), ]
 rownames(out) <- NULL
 # Make `trace` a consecutive number
-out$trace2 <- as.numeric(factor(out$trace, levels = unique(out$trace)))
+out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace)))
 #' # Fill in topics
 topics <- read.table("../data/topics.csv", sep = ";", header = TRUE)
 # TODO:
 #' # Export data
-write.table(out, "../data/event_logfiles.csv",
+write.table(out, "../data/event_logfiles.csv", sep = ";",
-            sep = ";", quote = FALSE, row.names = FALSE)
+            row.names = FALSE)
 # TODO: Write function for closing events
--- a/code/03_topic-cards.R
+++ b/code/03_topic-cards.R
@ -0,0 +1,42 @@
 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light")
 rm(list=ls())
 dat0 <- read.table("../../event_logfiles.csv", sep = ";", header = TRUE)
 dat0$artwork <- sprintf("%03d", dat0$artwork)
 # artwork names
 artworks <- sort(unique(dat0$artwork))
 # create data frame with file names and topics for each artwork
 dat <- NULL
 file_order <- NULL
 for (artwork in artworks) {
  fnames <- dir(pattern = paste0(artwork, "_"), path = artwork, full.names = TRUE)
  topic <- NULL
  for (fname in fnames) {
    topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
                           grep("^<card type=", trimws(readLines(fname)), value = T)))
  }
  index <- paste(artwork, "index.xml", sep = "/")
  file_order <- c(file_order, gsub("^<card src=.(.*)./>$", "\\1",
                       grep("^<card src=", trimws(readLines(index)), value = TRUE)))
  in_index <- fnames %in% file_order
  dat <- rbind(dat, data.frame(artwork, file_name = fnames, in_index, topic))
 }
 table(dat$artwork)
 table(dat$topic)
 # take only the ones that are actually displayed and sort in the same order
 # as indicated in index.html
 dat2 <- dat[dat$in_index, -3]
 dat2 <- dat2[order(file_order, dat2$file_name), ]
 dat2$index <- unlist(sapply(table(dat2$artwork), seq_len))
 write.table(dat2, file = "../../topics.csv", sep = ";", row.names = FALSE)
--- a/code/questions_number-of-cards.R
+++ b/code/questions_number-of-cards.R
@ -0,0 +1,58 @@
 #' ---
 #' title: "Open Questions -- Card indices"
 #' author: "Nora Wickelmaier"
 #' date: "`r Sys.Date()`"
 #' output: 
 #'   html_document:
 #'     number_sections: true
 #'     toc: true
 #' ---
 #+ include = FALSE
 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
 dat <- read.table("../data/event_logfiles.csv", sep = ";", header = TRUE)
 dat$date.start <- as.POSIXct(dat$date.start)
 dat$date.stop <- as.POSIXct(dat$date.stop)
 dat$artwork <- sprintf("%03d", dat$artwork)
 #' The following table shows an overview of the card indices. The indices
 #' should have values between 0 and 5. It is unclear what the numbers mean.
 table(dat$card)
 #' Number of cards for each artwork in the data set (subset from 2016)
 artworks <- sort(unique(dat$artwork))
 count <- function(x) length(table(dat[which(dat$artwork == x), "card"]))
 max_index <- function(x) max(dat[which(dat$artwork == x), "card"], na.rm = TRUE)
 num_cards <- sapply(artworks, count)
 highest_index <- sapply(artworks, max_index)
 #' Check how many XML-files for cards are present
 path <- "../data/ContentEyevisit/eyevisit_cards_light"
 num_files <- NULL
 for (artwork in artworks) {
  fnames <- dir(pattern = paste0(artwork, "_"), path = paste(path, artwork, sep = "/"))
  num_files <- c(num_files, length(fnames))
 }
 #' The table shows that each artwork has 6 cards the most (as expected).
 #' This is a subset of the data, so not all cards have been opened.
 cards <- data.frame(artwork = artworks, num_cards, highest_index,
                    num_files, diff = num_files - highest_index)
 cards
 #' There are more than 8 files for a couple of artworks:
 subset(cards, cards$num_files >= 8)
 #' It might be possible, that the number indicates the index of the file
 #' and not the actual card that was displayed. BUT: In many cases, there
 #' are only 6 (or less) files, but a higher index is present...
 subset(cards, cards$diff < 0)