Moved most stuff into package folder mtt; updated README so it works with new code

2023-09-21 16:45:06 +02:00 · 2023-09-21 16:45:06 +02:00 · 3dd13a6c6e
commit 3dd13a6c6e
parent 55adcf03d7
9 changed files with 51 additions and 666 deletions
--- a/README.Rmd
+++ b/README.Rmd
@ -8,6 +8,11 @@ output:
    toc: true
 ---
 ```{r, include = FALSE}
 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis")
 devtools::load_all("../../../software/mtt")
 ```
 # Log data from the Multi-Touch Table at the HAUM
 The Multi-Touch Table at the Herzog-Anton-Ulrich-Museum (HAUM) in
@ -117,11 +122,6 @@ files have been affected.
 # Problems and how I handled them
 ```{r, include = FALSE}
 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis")
 source("code/functions.R")
 ```
 This lists some problems with the log data that required decisions. These
 decisions influence the outcome and maybe even the data quality. Hence, I
 tried to document how I handled these problems and explain the decisions I
@ -136,7 +136,7 @@ continuous within one log file but not over several log files.
 ```{r}
 # Read data
-dat0 <- read.table("data/rawdata_logfiles_small.csv", sep = ";",
+dat0 <- read.table("data/haum/rawdata_logfiles_small.csv", sep = ";",
                   header = TRUE)
 dat0$date <- as.POSIXct(dat0$date)
 dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
@ -146,26 +146,16 @@ dat <- subset(dat0, !(dat0$event %in% c("Start Application",
                                        "Show Application")))
 # Add trace variable
-dat1 <- add_trace(dat, glossar_dict = "data/glossar_dict.RData")
+dat1 <- add_trace(dat, glossar_dict = "data/haum/glossar_dict.RData")
 # Close events
 dat2 <- rbind(close_events(dat1, "move"),
              close_events(dat1, "flipCard"),
              close_events(dat1, "openTopic"),
              close_events(dat1, "openPopup"))
-dat2 <- dat2[order(dat2$date.start, dat2$fileId.start), ]
+dat2 <- dat2[order(dat2$date.start, dat2$fileId), ]
 head(dat2[which(dat2$duration < 0),
     c("fileId.start", "fileId.stop", "event", "artwork", "duration")], 20)
 head(dat2[which(dat2$fileId.start != dat2$fileId.stop),
     c("fileId.start", "fileId.stop", "event", "artwork", "duration")], 20)
 plot(timeMs ~ as.factor(fileId), dat[1:5000,], xlab = "fileId")
 # Remove durations when event spans more than one log file, since they are
 # not interpretable
 #dat2[which(dat2$fileId.start != dat2$fileId.stop), "duration"] <- NA
 ```
 The boxplot shows that we have a continuous range of values within one log
@ -183,6 +173,9 @@ exactly fixed. Unfortunately, only three `move` events were fixed, since it
 only fixed irregularities *within* one log file. See below for more
 details.
 UPDATE: I now remove all events that span more than one log file, which
 improves speed considerably.
 ## Left padding of file IDs
 The file names of the raw log files are automatically generated and contain
@ -196,7 +189,7 @@ will sort these files in the order shown below. In order to preprocess the
 data and close events that belong together, the data need to be sorted by
 events and artworks repeatedly. In order to get them back in the correct
 time order, it is necessary to order them based on three variables:
-`fileId.start`, `date.start` and `timeMs`. The file IDs therefore need to
+`fileId`, `date.start` and `timeMs`. The file IDs therefore need to
 sort in the correct order (again see below for example). I zero left padded
 the log file names within the data frame using it as an identifier. These
 "file names" do not correspond exactly to the original raw log file names.
@ -406,7 +399,7 @@ assign topics and file names to the according pop-ups. This needs to be
 cross checked with the programming, but seems the most plausible approach
 with my current knowledge.
-## Extracting topics from `index.xml` vs. `<artwork_number>.xml
+## Extracting topics from `index.xml` vs. `<artwork_number>.xml`
 When I extract the topics from `index.xml` I get different topics than
 when I get them from `<artwork_number>.xml`. At first glance, it looks like using
@ -414,7 +407,7 @@ when I get them from `<artwork>.html`. At first glance, it looks like using
 ```{r}
 artworks <- unique(dat2$artwork)
-path <- "data/ContentEyevisit/eyevisit_cards_light/"
+path <- "data/haum/ContentEyevisit/eyevisit_cards_light/"
 topics <- extract_topics(artworks, "index.xml", path)
 topics2 <- extract_topics(artworks, paste0(artworks, ".xml"), path)
@ -434,7 +427,7 @@ sudden there were 72 instead of 70 artworks. It seems like these two
 artworks appear on October 21, 2022.
 ```{r}
-dat0 <- read.table("data/rawdata_logfiles.csv", sep = ";", header = TRUE)
+dat0 <- read.table("data/haum/rawdata_logfiles.csv", sep = ";", header = TRUE)
 dat0$date <- as.POSIXct(dat0$date)
 dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
--- a/code/01_parse-logfiles.R
+++ b/code/01_parse-logfiles.R
@ -1,125 +0,0 @@
 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
 ###### HELPER ######
 # Need to left pad file names. If I do not do this, the sorting of the
 # timestamps will be off and I get negative durations later on since the
 # wrong events get closed.
 #
 # x: character vector with paths or names of raw log files of the form
 #    <a>_<b>_<c>-<d>_<e>_<f>.log; a leading directory is dropped
 # Returns a character vector of the same length as `x` in which all
 # numeric parts except the first one are zero padded to two digits.
 leftpad_fnames <- function(x) {
  # basename() drops the folder without depending on the global `dirpaths`
  # and without interpreting the folder name as a regular expression
  parts <- strsplit(basename(x), "_")
  pad <- function(s) sprintf("%02d", as.numeric(s))
  vapply(parts, function(p) {
    mid <- unlist(strsplit(p[3], "-"))
    paste0(p[1], "_", pad(p[2]), "_", pad(mid[1]), "-", pad(mid[2]), "_",
           pad(p[4]), "_", pad(sub("\\.log$", "", p[5])), ".log")
  }, character(1), USE.NAMES = FALSE)
 }
 ##### CONTENT ######
 # Choose which folders with raw log files should be included
 folders <- "all"
 #folders <- "_2016b"
 dirpaths <- paste0("../data/haum_logs_2016-2023/", folders)
 fnames <- dir(dirpaths, pattern = "*.log", full.names = TRUE)
 length(fnames)
 head(fnames)
 logs <- lapply(fnames, readLines)
 nlog <- sapply(logs, length)
 dat <- data.frame(fileId = rep(leftpad_fnames(fnames), nlog),
                  logs = unlist(logs))
 head(dat$logs)
 # Remove corrupted lines
 # corrupt lines are "" and need to be removed
 d1 <- dim(dat)[1]
 dat <- subset(dat, dat$logs != "")
 d2 <- dim(dat)[1]
 # TODO: Catch this in a function and give back a meaningful warning
 # The files contain `r d1-d2` corrupt lines that were removed from the
 # data.
 # Extract relevant infos
 date <- sapply(dat$logs, gsub,
               pattern = "^\\[(.*)\\], \\[.*$",
               replacement = "\\1",
               USE.NAMES = FALSE)
 timestamp <- sapply(dat$logs, gsub,
                    pattern = "^\\[.*\\], \\[(.*)\\].*$",
                    replacement = "\\1",
                    USE.NAMES = FALSE)
 action <- sapply(dat$logs, gsub,
                 pattern = "^.*EyeVisit, (.*):*.*$",
                 replacement = "\\1",
                 USE.NAMES = FALSE)
 events <- sapply(strsplit(action, ":"), function(x) x[1])
 topics <- sapply(strsplit(action, ":"), function(x) x[2])
 moves <- apply(do.call(rbind,
                 strsplit(sapply(strsplit(action, ":"), function(x) x[3]),
                          ",")),
               2, as.numeric)
 # ATTENTION: as.numeric() forces NAs for "OpenCard" and "CloseCard"
 card_action <- trimws(sapply(strsplit(action, ":"),
                             function(x) x[3])[grep("Artwork", events)])
 card <- as.numeric(sapply(strsplit(action, ":"), function(x) x[4]))
 events[grep("Artwork", events)] <- paste("Artwork", card_action, sep = "/")
 ts_elements <- strsplit(timestamp, ":")
 time_ms <- as.numeric(sapply(ts_elements, function(x) x[4])) +
           as.numeric(sapply(ts_elements, function(x) x[3])) * 1000 +
           as.numeric(sapply(ts_elements, function(x) x[2])) * 1000 * 60
 dat$date        <- lubridate::parse_date_time(date, "bdyHMSOp")
 dat$timeMs      <- time_ms
 dat$event       <- events
 dat$artwork     <- trimws(sapply(strsplit(topics, "/"), function(x) x[1]))
 dat$popup       <- sapply(strsplit(topics, "/"), function(x) x[2])
 dat$topicNumber <- card
 dat$x           <- moves[,1]
 dat$y           <- moves[,2]
 dat$scale       <- moves[,3]
 dat$rotation    <- moves[,4]
 dat$logs <- NULL
 # remove original log files from data so file becomes smaller
 # sort by fileId, since reading in by file names does not make sense
 # because of missing left zero padding
 dat <- dat[order(dat$fileId, dat$date, dat$timeMs), ]
 # Export data
 write.table(dat, "../data/rawdata_logfiles.csv",
            sep = ";", quote = FALSE, row.names = FALSE)
--- a/code/02_glossar_artworks.R
+++ b/code/02_glossar_artworks.R
@ -1,37 +0,0 @@
 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light")
 dat0 <- read.table("../../rawdata_logfiles.csv", sep = ";",
                   header = TRUE)
 # artwork names
 artworks <- unique(na.omit(dat0$artwork))[unique(na.omit(dat0$artwork)) != "glossar"]
 dat <- subset(dat0, dat0$artwork == "glossar")
 glossar_files <- unique(dat$popup)
 x <- NULL
 for (glossar_file in glossar_files) {
  for (artwork in artworks) {
    fnames <- dir(pattern = paste0(artwork, "_"), path = artwork)
    for (fname in fnames) {
      lines <- readLines(paste0(artwork, "/", fname))
        if (any(grepl(glossar_file, lines))) {
          x <- rbind(x, data.frame(glossar_file, artwork))
          break
      }
    }
  }
 }
 head(x, 20)
 glossar_dict <- as.data.frame(tapply(x$artwork, x$glossar_file, FUN = c))
 names(glossar_dict) <- "artwork"
 glossar_dict$glossar_file <- rownames(glossar_dict)
 rownames(glossar_dict) <- NULL
 glossar_dict <- glossar_dict[, c("glossar_file", "artwork")]
 save(glossar_dict, file = "../../glossar_dict.RData")
 # TODO: Save in interoperable format
--- a/code/02_preprocessing.R
+++ b/code/02_preprocessing.R
@ -1,6 +1,9 @@
 # TODO: This script is obsolete and needs to be updated!
 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
-source("functions.R")
+#source("functions.R")
 devtools::load_all("../../../../software/mtt")
 small <- TRUE
@ -10,10 +13,10 @@ now <- Sys.time()
 cat("########## Reading in data... ##########", "\n")
 if (small) {
-  dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";",
+  dat0 <- read.table("../data/haum/rawdata_logfiles_small.csv", sep = ";",
                     header = TRUE)
 } else {
-  dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";",
+  dat0 <- read.table("../data/haum/rawdata_logfiles.csv", sep = ";",
                     header = TRUE)
 }
 dat0$date <- as.POSIXct(dat0$date)
@ -54,7 +57,6 @@ dat2 <- dat2[order(dat2$date.start, dat2$fileId.start), ]
 # Remove all events that do not have a `date.start`
 dat2 <- dat2[!is.na(dat2$date.start), ]
 rownames(dat2) <- NULL
 # TODO: Throw warning about this
 save(dat2, file = paste("tmp/dat2", ifelse(small, "small_", "full_"),
                       format(now, "%Y-%m-%d_%H-%M-%S"), ".RData"))
@ -90,7 +92,7 @@ artworks <- unique(dat4$artwork)
 # remove artworks without XML information
 artworks <- artworks[!artworks %in% c("504", "505")]
 topics <- extract_topics(artworks, pattern = paste0(artworks, ".xml"),
-                         path = "../data/ContentEyevisit/eyevisit_cards_light/")
+                         path = "../data/haum/ContentEyevisit/eyevisit_cards_light/")
 dat5 <- add_topic(dat4, topics = topics)
@ -101,6 +103,6 @@ save(dat5, file = paste("tmp/dat5", ifelse(small, "small_", "full_"),
 # Export data ############################################################
 cat("########## Exporting data frame with event logs... ##########", "\n")
-write.table(dat5, "../data/event_logfiles.csv", sep = ";",
+write.table(dat5, "../data/haum/event_logfiles.csv", sep = ";",
            row.names = FALSE)
--- a/code/03_modeling.R
+++ b/code/03_modeling.R
@ -16,7 +16,7 @@
 #' # Read data
-dat <- read.table("../data/event_logfiles.csv", sep = ";", header = TRUE)
+dat <- read.table("../data/haum/event_logfiles.csv", sep = ";", header = TRUE)
 dat$date.start <- as.POSIXct(dat$date.start)
 dat$date.stop <- as.POSIXct(dat$date.stop)
--- a/code/03_topic-cards.R
+++ b/code/03_topic-cards.R
@ -1,16 +0,0 @@
 path <- "C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light"
 setwd(path)
 # artwork names
 dat0 <- read.table("../../event_logfiles.csv", sep = ";", header = TRUE)
 dat0$artwork <- sprintf("%03d", dat0$artwork)
 artworks <- sort(unique(dat0$artwork))
 # extract topics
 topics <- extract_topics(artworks, paste0(artworks, ".xml"), path)
 write.table(topics, file = "../../topics.csv", sep = ";", row.names = FALSE)
 # TODO: Keep this file?
--- a/code/functions.R
+++ b/code/functions.R
@ -1,458 +0,0 @@
 ###########################################################################
 # Add trace variable
 # Adds a column `trace` to the raw log data.
 #
 # data:         data frame with (at least) the columns `event`, `artwork`,
 #               `popup`, `glossar`, `fileId`, `date` and `timeMs`
 # glossar_dict: path to an .RData file that is load()ed below; the code
 #               afterwards uses `glossar_dict` as a data frame with the
 #               columns `glossar_file` and `artwork`, so the file
 #               presumably contains an object of that name, which then
 #               replaces the path -- TODO confirm
 # Returns the data sorted by `fileId`, `date` and `timeMs`. "Transform
 # start"/"Transform stop" rows keep `trace = NA`; all other rows without a
 # trace are removed.
 add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") {
  data$trace <- NA
  # Transform (move) events are set aside and get no trace in this function
  subdata1 <- data[data$event %in% c("Transform start", "Transform stop"), ]
  subdata2 <- data[!data$event %in% c("Transform start", "Transform stop"), ]
  last_event <- subdata2$event[1]
  artworks <- unique(subdata2$artwork)[unique(subdata2$artwork) != "glossar"]
  n <- 1    # count artworks for progress
  pb <- txtProgressBar(min = 0, max = nrow(subdata2), initial = NA,
                       style = 3)
  for (artwork in artworks) {
    cat("\n\nAdding trace variable for artwork", artwork,
               paste0("(", n, "/", length(artworks), ")"), "\n")
    for (i in 1:nrow(subdata2)) {
      # A "Show Info" row of the current artwork starts a new trace (its row
      # number i); all other rows of this artwork get the trace stored in j
      if (last_event == "Show Info" & subdata2$artwork[i] == artwork) {
        subdata2$trace[i] <- i
        j <- i
      } else if (last_event == "Show Front" & subdata2$artwork[i] == artwork) {
        subdata2$trace[i] <- j
      } else if (!(last_event %in% c("Show Info", "Show Front")) &
                 subdata2$artwork[i] == artwork) {
        subdata2$trace[i] <- j
      }
      # NOTE(review): j is only defined after the first "Show Info" row; a
      # matching row before that would fail with "object 'j' not found"
      # NOTE(review): this condition is always TRUE, so in the last
      # iteration i + 1 is out of range and last_event becomes NA for the
      # first row of the next artwork -- `i < nrow(subdata2)` was probably
      # intended
      if (i <= nrow(subdata2)) {
        last_event <- subdata2$event[i + 1]
      }
      setTxtProgressBar(pb, i)
    }
    n <- n + 1
  }
  # Fix glossar entries (find corresponding artworks and fill in trace)
  glossar_files <- unique(subdata2[subdata2$artwork == "glossar", "popup"])
  # load lookup table for artworks and glossar files
  load(glossar_dict)
  lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ]
  # glossar files that are linked to exactly one artwork
  inside  <- glossar_files[glossar_files %in%
                           lut[sapply(lut$artwork, length) == 1,
                               "glossar_file"]]
  single_art  <- unlist(lut[lut$glossar_file %in% inside, "artwork"])
  m <- 1
  for (file in lut$glossar_file) {
    cat("\n\nAdding trace variable for glossar entry", file,
               paste0("(", m, "/", length(lut$glossar_file), ")"), "\n")
    artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])
    for (i in seq_len(nrow(subdata2))) {
      # remember the artwork that was opened last (j) ...
      if (subdata2$event[i] == "Show Info" |
          (subdata2$event[i] == "Artwork/OpenCard" &
           subdata2$artwork[i] %in% single_art)) {
        current_artwork <- subdata2[i, "artwork"]
        j <- i
        k <- i
      } else {
        current_artwork <- current_artwork
      }
      # ... and whether it has been closed again in the meantime (k != j)
      if (subdata2$event[i] == "Show Front" & subdata2$artwork[i] == current_artwork) {
      # make sure artwork has not been closed, yet!
        k <- i
      }
      # a glossar row inherits trace and artwork of the open artwork if the
      # lookup table links this glossar file to that artwork
      if (subdata2$artwork[i] == "glossar" &
          (current_artwork %in% artwork_list) &
          subdata2$popup[i] == file & (j - k == 0)) {
        subdata2[i, "trace"]   <- subdata2[j, "trace"]
        subdata2[i, "artwork"] <- current_artwork
      }
      setTxtProgressBar(pb, i)
    }
    m <- m + 1
  }
  # Exclude not matched glossar entries
  cat("\n\nINFORMATION: glossar entries that are not matched will be removed:",
      sum(is.na(subdata2[subdata2$glossar == 1, "trace"])), "entries",
      #proportions(table(is.na(subdata2[subdata2$glossar == 1, "trace"]))),
      fill = TRUE)
  # this drops every row without a trace, not only glossar rows
  subdata2 <- subset(subdata2, !is.na(subdata2$trace))
  # REMEMBER: It can never be 100% correct, since it is always possible
  # that several cards are open and that they link to the same glossar
  # entry
  # dat2[14110:14130, ]
  # dat2[dat2$glossar == 1, ]
  out <- rbind(subdata1, subdata2)
  out <- out[order(out$fileId, out$date, out$timeMs), ]
  out
 }
 ###########################################################################
 # Close events: combine the start and the stop log entry of an event into
 # one row with a duration.
 #
 # data:  data frame with the raw log data including `trace` (as returned
 #        by add_trace())
 # event: kind of event to close; determines the pair of start/stop
 #        actions, the id variables and the dropped variables (see switch)
 # Returns a data frame with one row per closed event and the columns
 # listed at the end of the function. Events are closed within one log
 # file (`fileId`) only.
 close_events <- function(data, event = c("move", "flipCard", "openTopic", "openPopup")) {
  event <- match.arg(event)
  # n_cols: number of columns of a complete wide data frame (start + stop)
  switch(event,
    "move" = {
      actions <- c("Transform start", "Transform stop")
      idvar   <- c("fileId", "eventId", "artwork", "glossar")
      drop    <- c("popup", "topicNumber", "trace", "event")
      n_cols  <- 16
    },
    "flipCard" = {
      actions <- c("Show Info", "Show Front")
      idvar   <- c("fileId", "trace", "artwork", "glossar")
      drop    <- c("popup", "topicNumber", "eventId", "event")
      n_cols  <- 16
    },
    "openTopic" = {
      actions <- c("Artwork/OpenCard", "Artwork/CloseCard")
      idvar   <- c("fileId", "eventId", "trace", "glossar", "artwork",
                   "topicNumber")
      drop    <- c("popup", "event")
      n_cols  <- 18
    },
    "openPopup" = {
      actions <- c("ShowPopup", "HidePopup")
      idvar   <- c("fileId", "eventId", "trace", "glossar", "artwork", "popup")
      drop    <- c("topicNumber", "event")
      n_cols  <- 18
 #   TODO: Should topicNumber maybe also be filled in for "openPopup"?
    }
  )
  subdata <- subset(data, data$event %in% actions)
  subdata <- subdata[order(subdata$artwork, subdata$popup, subdata$date, subdata$timeMs), ]
  subdata$time <- ifelse(subdata$event == actions[1], "start", "stop")
  # every stop action ends an event: number of rows per event id
  num_start <- diff(c(0, which(subdata$event == actions[2])))
  # rows after the last stop action form one more (open) event; counting
  # them also works for several trailing start rows and for empty data
  num_open <- nrow(subdata) - sum(num_start)
  if (num_open > 0) {
    num_start <- c(num_start, num_open)
  }
  subdata$eventId <- rep(seq_along(num_start), num_start)
  if (event == "move") {
    subdata    <- subdata[!duplicated(subdata[, c("event", "eventId")]), ]
    # remove the second of two directly consecutive stop actions; which()
    # avoids recycling of the (shorter) logical index, and the length check
    # avoids dropping all rows via an empty negative index
    id_stop    <- which(subdata$event == actions[2])
    id_rm_stop <- id_stop[which(diff(id_stop) == 1)] + 1
    if (length(id_rm_stop) > 0) {
      subdata <- subdata[-id_rm_stop, ]
    }
  }
  subdata_split <- split(subdata, ~ fileId)
  pbapply::pboptions(style = 3, char = "=")
  subdata_split_wide <- pbapply::pblapply(subdata_split, reshape,
                               direction = "wide",
                               idvar = idvar,
                               timevar = "time",
                               drop = drop)
 #  suppressWarnings(
 #    data_wide <- reshape(subdata, direction = "wide",
 #                         idvar = idvar,
 #                         timevar = "time",
 #                         drop = drop)
 #  )
  # remove entries with only start or stop events since they do not have
  # all columns
  ids <- which(vapply(subdata_split_wide, ncol, integer(1)) != n_cols)
  if (length(ids) > 0) subdata_split_wide <- subdata_split_wide[-ids]
  data_wide <- dplyr::bind_rows(subdata_split_wide)
  # dropped variables are added again (as NA) so that all event types
  # have the same columns
  for (d in drop) data_wide[d] <- NA
  data_wide$distance        <- NA
  data_wide$scaleSize       <- NA
  data_wide$rotationDegree  <- NA
  data_wide$event <- event
  data_wide$duration <- data_wide$timeMs.stop - data_wide$timeMs.start
  if (event == "move") {
    # Euclidean distance between start and stop position
    data_wide$distance <- apply(
        data_wide[, c("x.start", "y.start", "x.stop", "y.stop")], 1,
        function(x) dist(matrix(x, 2, 2, byrow = TRUE)))
    data_wide$rotationDegree <- data_wide$rotation.stop -
      data_wide$rotation.start
    data_wide$scaleSize <- data_wide$scale.stop / data_wide$scale.start
    # remove moves without any change
    # NOTE(review): this keeps only moves for which position, rotation AND
    # scale changed; a move that only changed position is removed as well.
    # If only completely unchanged moves should go, the conditions need to
    # be combined with `|` -- confirm intended behavior
    move_wide <- data_wide[data_wide$distance != 0 &
                           data_wide$rotationDegree != 0 &
                           data_wide$scaleSize != 1, ]
    cat(paste("INFORMATION:", nrow(data_wide) - nrow(move_wide),
    "lines containing move events were removed since they did",
    "\nnot contain any change"), fill = TRUE)
    data_wide <- move_wide
  }
  out <- data_wide[, c("fileId", "event", "artwork", "trace", "glossar",
                       "date.start", "date.stop", "timeMs.start",
                       "timeMs.stop", "duration", "topicNumber", "popup",
                       "x.start", "y.start", "x.stop", "y.stop",
                       "distance", "scale.start", "scale.stop",
                       "scaleSize", "rotation.start", "rotation.stop",
                       "rotationDegree")]
  rownames(out) <- NULL
  out
 }
 ###########################################################################
 # Add case variable
 #
 # Rows are processed in their current order: a row belongs to the same
 # case as the previous row as long as the difference between their
 # `date.start` values is not larger than `cutoff`; a larger gap starts a
 # new case.
 #
 # data:   data frame with the column `date.start`
 # cutoff: largest gap that is still considered the same case, in the
 #         unit diff() returns for `date.start`
 # Returns `data` with the additional numeric column `case` (1, 2, ...).
 # Stops if `date.start` contains missing values.
 add_case <- function(data, cutoff = 20) {
 # TODO: What is the best choice for the cutoff here?
  # the first row gets a gap of 0 and therefore always opens case 1
  timediff <- as.numeric(diff(c(data$date.start[1], data$date.start)))
  if (anyNA(timediff)) {
    stop("`date.start` must not contain missing values", call. = FALSE)
  }
  # every gap larger than the cutoff increases the case number by one
  data$case <- 1 + cumsum(timediff > cutoff)
  data
 }
 ###########################################################################
 # Add trace for moves
 #
 # Splits the data by case and artwork, lets add_trace_subdata() fill in
 # the trace within every subset and binds the subsets together again.
 #
 # data: data frame with (at least) the columns `case`, `artwork`, `event`,
 #       `trace`, `date.start`, `timeMs.start` and `fileId` (or
 #       `fileId.start`)
 # Returns the data sorted by file id, `date.start` and `timeMs.start`,
 # with `trace` renumbered consecutively in order of appearance.
 add_trace_moves <- function(data) {
  pbapply::pboptions(style = 3, char = "=")
  trace_max <- max(data$trace, na.rm = TRUE)
  subdata_case <- split(data, ~ case)
  # splitting in one step with split(data, ~ artwork + case)
  # --> does not work with complete data set
  cat("Splitting data...", "\n")
  subdata_list <- pbapply::pblapply(subdata_case, split, f = ~artwork)
  subdata_list <- unlist(subdata_list, recursive = FALSE)
  cat("Adding trace...", "\n")
  # the k-th subset gets the fresh trace number trace_max + k (used only
  # if the subset does not contain any trace at all)
  subdata_trace <- pbapply::pblapply(seq_along(subdata_list),
        function(k) {
          add_trace_subdata(subdata_list[[k]], max_trace = trace_max + k)
        }
      )
  out <- dplyr::bind_rows(subdata_trace)
  # close_events() returns the file id as `fileId`; `fileId.start` is still
  # accepted for data created with the older column name
  file_id <- if ("fileId.start" %in% names(out)) {
    out$fileId.start
  } else {
    out$fileId
  }
  out <- out[order(file_id, out$date.start, out$timeMs.start), ]
  rownames(out) <- NULL
  # Make trace a consecutive number
  out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace)))
  out
 }
 # Fill in the trace for the move events of one subset of the data.
 #
 # subdata:   data frame with the columns `event` and `trace`
 # max_trace: trace that is assigned to all rows if the subset does not
 #            contain any trace
 # Returns `subdata` with `trace` filled in:
 #   - exactly one known trace: all move events get this trace
 #   - several known traces: every move event gets the trace of the row
 #     before it (the first known trace if it is the first row)
 #   - no known trace: all rows get `max_trace`
 # An empty `subdata` is returned unchanged with a warning.
 add_trace_subdata <- function(subdata, max_trace) {
  n_rows <- nrow(subdata)
  if (n_rows == 0) {
    warning("`subdata` has nrow = 0")
    return(subdata)
  }
  known <- na.omit(unique(subdata$trace))
  if (length(known) == 1) {
    subdata[subdata$event == "move", "trace"] <- known
  } else if (length(known) > 1) {
    # rows depend on the (already updated) row before, so go row by row
    for (row in seq_len(n_rows)) {
      if (subdata$event[row] != "move") next
      subdata$trace[row] <- if (row == 1) known[1] else subdata$trace[row - 1]
    }
  } else if (all(is.na(subdata$trace))) {
    subdata$trace[seq_len(n_rows)] <- max_trace
  }
  subdata
 }
 ###########################################################################
 # Create data frame with file names and topics for each artwork
 # artworks: vector with artwork names; each artwork has a folder of this
 #           name below `path`
 # pattern:  name of the index file inside the artwork folder; either one
 #           name used for all artworks or one name per artwork
 # path:     folder that contains the artwork folders
 # Returns a data frame with the columns `artwork`, `file_name`, `topic`
 # and `index` for all files that are listed in an index file.
 extract_topics <- function(artworks, pattern, path) {
  dat <- NULL
  file_order <- NULL
  i <- 1
  for (artwork in artworks) {
    if (length(pattern) == 1) {
      index_file <- pattern
    } else {
      index_file <- pattern[i]
    }
    # all files in the artwork folder whose name contains "<artwork>_"
    fnames <- dir(pattern = paste0(artwork, "_"),
                  path = paste(path, artwork, sep = "/"))
    topic <- NULL
    for (fname in fnames) {
      # topic = what is matched between `<card type=.` and `.>` on lines
      # starting with `<card type=`
      # NOTE(review): data.frame() below needs exactly one such line per
      # file -- verify against the XML files
      suppressWarnings(
      topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
        grep("^<card type=",
          trimws(readLines(paste(path, artwork, fname, sep = "/"))),
          value = T)))
      )
    }
    index <- paste(path, artwork, index_file, sep = "/")
    # file names referenced on `<card src=` lines of the index file;
    # file_order accumulates over all artworks
    suppressWarnings(
    file_order <- c(file_order, gsub("^<card src=.*/(.*)./>$", "\\1",
                         grep("^<card src=", trimws(readLines(index)),
                              value = TRUE)))
    )
    in_index <- fnames %in% file_order
    dat <- rbind(dat, data.frame(artwork, file_name = fnames, in_index, topic))
    i <- i + 1
  }
  # take only the ones that are actually displayed and sort in the same order
  # as indicated in index.html
  out <- dat[dat$in_index, -3]
  # NOTE(review): order(file_order) is the permutation that sorts
  # file_order, which is not necessarily the permutation that brings `out`
  # into index order; it also requires file_order and `out` to have the
  # same length -- confirm that this yields the intended order
  out <- out[order(file_order, out$file_name), ]
  rownames(out) <- NULL
  # running number within artwork; presumably relies on `out` being sorted
  # by artwork, since table() sorts its names -- TODO confirm
  out$index <- unlist(sapply(table(out$artwork), seq_len))
  out
 }
 ###########################################################################
 # Add topics: file names and topics
 #
 # Within every artwork, the observed values of `topicNumber` are numbered
 # 1, 2, ... (`topicIndex`); this index selects the row of `topics` for the
 # artwork that provides the file name and the topic.
 #
 # data:   data frame with (at least) the columns `artwork`, `topicNumber`,
 #         `date.start`, `timeMs.start` and `fileId` (or `fileId.start`)
 # topics: data frame with the columns `artwork`, `file_name` and `topic`
 #         (as returned by extract_topics())
 # Returns `data` with the additional columns `topicIndex`, `topicFile`
 # and `topic`, sorted by file id, `date.start` and `timeMs.start`.
 add_topic <- function(data, topics) {
  artworks <- unique(data$artwork)
  # observed topic numbers per artwork
  tab_art <- lapply(artworks,
               function(x) names(table(data$topicNumber[data$artwork == x])))
  names(tab_art) <- artworks
  tab_index <- lapply(tab_art, seq_along)
  dat_split <- split(data, ~ artwork)
  set_label <- function(x) {
    # look up by name: a numeric artwork would otherwise index by position
    artwork <- as.character(unique(x$artwork))
    x$topicIndex <- factor(x$topicNumber, labels = tab_index[[artwork]])
    x
  }
  dat_label <- lapply(dat_split, set_label)
  set_topic <- function(x) {
    artwork <- unique(x$artwork)
    labels_file <- topics[topics$artwork == artwork,
                          "file_name"][as.numeric(levels(x$topicIndex))]
    x$topicFile <- as.character(factor(x$topicIndex, labels = labels_file))
    labels_topic <- topics[topics$artwork == artwork,
                          "topic"][as.numeric(levels(x$topicIndex))]
    x$topic <- as.character(factor(x$topicIndex, labels = labels_topic))
    x
  }
  dat_topic <- lapply(dat_label, set_topic)
  #out <- do.call(rbind, dat_topic)
  out <- dplyr::bind_rows(dat_topic)
  # close_events() returns the file id as `fileId`; `fileId.start` is still
  # accepted for data created with the older column name
  file_id <- if ("fileId.start" %in% names(out)) {
    out$fileId.start
  } else {
    out$fileId
  }
  out <- out[order(file_id, out$date.start, out$timeMs.start), ]
  rownames(out) <- NULL
  out
 }
 ###########################################################################
 # Create data frame with information on artworks
 #
 # artworks: vector with artwork names; each artwork has a folder of this
 #           name below `path`
 # files:    name of the XML file inside the artwork folder; either one
 #           name used for all artworks or one name per artwork
 # path:     folder that contains the artwork folders (required; the
 #           former default `path = path` referred to itself and could
 #           never be evaluated)
 # Returns a data frame with one row per artwork and the columns `artist`,
 # `title`, `misc`, `description` and `artwork` (NULL if `artworks` is
 # empty).
 extract_artworks <- function(artworks, files = paste0(artworks, ".xml"),
                              path) {
  if (missing(path)) {
    stop("`path` to the folder containing the artwork folders is missing",
         call. = FALSE)
  }
  varnames <- c("artist", "title", "misc", "description")
  out <- NULL
  i <- 1
  for (artwork in artworks) {
    if (length(files) == 1) {
      index_file <- files
    } else {
      index_file <- files[i]
    }
    index <- paste(path, artwork, index_file, sep = "/")
    xmllist <- XML::xmlToList(index)$header[varnames]
    is_missing <- vapply(xmllist, is.null, logical(1))
    if (any(is_missing)) {# necessary for missing entries
      names(xmllist) <- varnames
      xmllist[which(is_missing)] <- NA
    }
    # remove ugly quotes
    xmllist <- lapply(xmllist, function(x) gsub("„|“", "", x))
    # remove HTML tags
    xmllist <- lapply(xmllist, function(x) gsub("<br/>", " ", x))
    xmldat <- as.data.frame(xmllist)
    xmldat$artwork <- artwork
    # trim white space from strings
    xmldat$artist      <- trimws(xmldat$artist)
    xmldat$title       <- trimws(xmldat$title)
    xmldat$misc        <- trimws(xmldat$misc)
    xmldat$description <- trimws(xmldat$description)
    out <- rbind(out, xmldat)
    i <- i + 1
  }
  out
 }
--- a/code/03_specs.R
+++ b/code/03_specs.R
@ -2,7 +2,7 @@
 library(lubridate)
-dat <- read.table("../data/rawdata_logfiles.csv", header = TRUE, sep = ";")
+dat <- read.table("../data/haum/rawdata_logfiles.csv", header = TRUE, sep = ";")
 # dat$event <- factor(dat$event, levels = c("Start Application",
 #                                           "Show Application",
 #                                           "Transform start",
--- a/code/visualization.R
+++ b/code/visualization.R
@ -0,0 +1,26 @@
 # Plot the events of artwork "001" at their positions.
 # The relative paths below assume that the working directory is the `code`
 # folder; the machine-specific path is kept for reference only (as in the
 # other scripts).
 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code/")
 devtools::load_all("../../../../software/mtt")
 #library(mtt)
 dat <- parse_logfiles("2016", path = "../data/haum/LogFiles/",
                       save = FALSE)
 datlogs <- create_eventlogs(dat, "../data/haum/ContentEyevisit/eyevisit_cards_light/")
 dat001 <- datlogs[which(datlogs$artwork == "001"), ]
 # one randomly chosen color per trace; colors are only reused if there
 # are more traces than available colors
 index <- as.numeric(as.factor(dat001$trace))
 n_traces <- length(unique(dat001$trace))
 cc <- sample(colors(), n_traces, replace = n_traces > length(colors()))
 # arrows from start to stop position for the first (up to) 200 events
 plot(y.start ~ x.start, dat001, type = "n", xlab = "x", ylab = "y",
      xlim = c(0, 3840), ylim = c(0, 2160))
 with(head(dat001, 200), arrows(x.start, y.start, x.stop, y.stop,
                                length = .07, col = cc[head(index, 200)]))
 # start positions of all events; size of the blue circles = scaleSize
 plot(y.start ~ x.start, dat001, xlab = "x", ylab = "y",
      xlim = c(0, 3840), ylim = c(0, 2160), pch = 16, col = "gray")
 # axis labels and limits are set by plot() above; points() only draws
 points(y.start ~ x.start, dat001, cex = dat001$scaleSize,
      col = "blue")