Moved most stuff into package folder mtt; updated README so it works with new code

2023-09-21 16:45:06 +02:00 · 2023-09-21 16:45:06 +02:00 · 3dd13a6c6e
commit 3dd13a6c6e
parent 55adcf03d7
9 changed files with 51 additions and 666 deletions
--- a/README.Rmd
+++ b/README.Rmd
@ -8,6 +8,11 @@ output:
    toc: true
 ---

+```{r, include = FALSE}
+# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis")
+devtools::load_all("../../../software/mtt")
+```
+
 # Log data from the Multi-Touch Table at the HAUM

 The Multi Touch Table at the Herzog-Anton-Ulrich-Museum (HAUM) in
@ -117,11 +122,6 @@ files have been affected.

 # Problems and how I handled them

-```{r, include = FALSE}
-# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis")
-source("code/functions.R")
-```
-
 This lists some problems with the log data that required decisions. These
 decisions influence the outcome and maybe even the data quality. Hence, I
 tried to document how I handled these problems and explain the decisions I
@ -136,7 +136,7 @@ continuous within one log file but not over several log files.

 ```{r}
 # Read data
-dat0 <- read.table("data/rawdata_logfiles_small.csv", sep = ";",
+dat0 <- read.table("data/haum/rawdata_logfiles_small.csv", sep = ";",
                   header = TRUE)
 dat0$date <- as.POSIXct(dat0$date)
 dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
@ -146,26 +146,16 @@ dat <- subset(dat0, !(dat0$event %in% c("Start Application",
                                        "Show Application")))

 # Add trace variable
-dat1 <- add_trace(dat, glossar_dict = "data/glossar_dict.RData")
+dat1 <- add_trace(dat, glossar_dict = "data/haum/glossar_dict.RData")

 # Close events
 dat2 <- rbind(close_events(dat1, "move"),
              close_events(dat1, "flipCard"),
              close_events(dat1, "openTopic"),
              close_events(dat1, "openPopup"))
-dat2 <- dat2[order(dat2$date.start, dat2$fileId.start), ]
-
-head(dat2[which(dat2$duration < 0),
-     c("fileId.start", "fileId.stop", "event", "artwork", "duration")], 20)
-
-head(dat2[which(dat2$fileId.start != dat2$fileId.stop),
-     c("fileId.start", "fileId.stop", "event", "artwork", "duration")], 20)
+dat2 <- dat2[order(dat2$date.start, dat2$fileId), ]

 plot(timeMs ~ as.factor(fileId), dat[1:5000,], xlab = "fileId")
-
-# Remove durations when event spans more than one log file, since they are
-# not interpretable
-#dat2[which(dat2$fileId.start != dat2$fileId.stop), "duration"] <- NA
 ```

 The boxplot shows that we have a continuous range of values within one log
@ -183,6 +173,9 @@ exactly fixed. Unfortunately, only three `move` events were fixed, since it
 only fixed irregularities *within* one log file. See below for more
 details.

+UPDATE: I now remove all events that span more than one log file. This
+improves speed considerably.
+
 ## Left padding of file IDs

 The file names of the raw log files are automatically generated and contain
@ -196,7 +189,7 @@ will sort these files in the order shown below. In order to preprocess the
 data and close events that belong together, the data need to be sorted by
 events and artworks repeatedly. In order to get them back in the correct
 time order, it is necessary to order them based on three variables:
-`fileId.start`, `date.start` and `timeMs`. The file IDs therefore need to
+`fileId`, `date.start` and `timeMs`. The file IDs therefore need to
 sort in the correct order (again, see below for an example). I left-padded
 the log file names with zeros within the data frame, where they serve as an
 identifier. These
 "file names" do not correspond exactly to the original raw log file names.
@ -406,7 +399,7 @@ assign topics and file names to the according pop-ups. This needs to be
 cross checked with the programming, but seems the most plausible approach
 with my current knowledge.

-## Extracting topics from `index.xml` vs. `<artwork_number>.xml
+## Extracting topics from `index.xml` vs. `<artwork_number>.xml`

 When I extract the topics from `index.xml`, I get different topics than
 when I get them from `<artwork>.xml`. At first glance, it looks like using
@ -414,7 +407,7 @@ when I get them from `<artwork>.html`. At first glance, it looks like using

 ```{r}
 artworks <- unique(dat2$artwork)
-path <- "data/ContentEyevisit/eyevisit_cards_light/"
+path <- "data/haum/ContentEyevisit/eyevisit_cards_light/"
 topics <- extract_topics(artworks, "index.xml", path)
 topics2 <- extract_topics(artworks, paste0(artworks, ".xml"), path)

@ -434,7 +427,7 @@ sudden there were 72 instead of 70 artworks. It seems like these two
 artworks appear on October 21, 2022.

 ```{r}
-dat0 <- read.table("data/rawdata_logfiles.csv", sep = ";", header = TRUE)
+dat0 <- read.table("data/haum/rawdata_logfiles.csv", sep = ";", header = TRUE)
 dat0$date <- as.POSIXct(dat0$date)
 dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)

--- a/code/01_parse-logfiles.R
+++ b/code/01_parse-logfiles.R
@ -1,125 +0,0 @@
-# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
-
-###### HELPER ######
-
-# Need to left pad file names. If I do not do this, the sorting of the
-# timestamps will be off and I get negative durations later on since the
-# wrong events get closed.
-
-
-leftpad_fnames <- function(x) {
-
-  z <- gsub(paste0(dirpaths, "/"), "\\1", x)
-  ys <- strsplit(z, "_")
-
-  res <- NULL
-
-  for (y in ys) {
-    y2 <- unlist(strsplit(y[3], "-"))
-    e1 <- y[1]
-    e2 <- sprintf("%02d", as.numeric(y[2]))
-    e3 <- sprintf("%02d", as.numeric(y2[1]))
-    e4 <- sprintf("%02d", as.numeric(y2[2]))
-    e5 <- sprintf("%02d", as.numeric(y[4]))
-    e6 <- sprintf("%02d", as.numeric(gsub(".log", "", y[5])))
-    e6 <- sprintf("%02d", as.numeric(gsub(".log", "", y[5])))
-
-    res <- c(res,
-             paste0(e1, "_", e2, "_", e3, "-", e4, "_", e5, "_", e6, ".log"))
-  }
-  res
-}
-
-##### CONTENT ######
-
-# Choose which folders with raw log files should be included
-
-folders <- "all"
-#folders <- "_2016b"
-
-dirpaths <- paste0("../data/haum_logs_2016-2023/", folders)
-
-fnames <- dir(dirpaths, pattern = "*.log", full.names = TRUE)
-length(fnames)
-head(fnames)
-
-logs <- lapply(fnames, readLines)
-nlog <- sapply(logs, length)
-dat <- data.frame(fileId = rep(leftpad_fnames(fnames), nlog),
-                  logs = unlist(logs))
-head(dat$logs)
-
-# Remove corrupted lines
-
-
-# corrupt lines are "" and need to be removed
-d1 <- dim(dat)[1]
-dat <- subset(dat, dat$logs != "")
-d2 <- dim(dat)[1]
-
-# TODO: Catch this in a function and give back a meaningful warning
-# The files contain `r d1-d2` corrupt lines that were removed from the
-# data.
-
-# Extract relevant infos
-
-date <- sapply(dat$logs, gsub,
-               pattern = "^\\[(.*)\\], \\[.*$",
-               replacement = "\\1",
-               USE.NAMES = FALSE)
-
-timestamp <- sapply(dat$logs, gsub,
-                    pattern = "^\\[.*\\], \\[(.*)\\].*$",
-                    replacement = "\\1",
-                    USE.NAMES = FALSE)
-
-action <- sapply(dat$logs, gsub,
-                 pattern = "^.*EyeVisit, (.*):*.*$",
-                 replacement = "\\1",
-                 USE.NAMES = FALSE)
-
-events <- sapply(strsplit(action, ":"), function(x) x[1])
-
-topics <- sapply(strsplit(action, ":"), function(x) x[2])
-
-moves <- apply(do.call(rbind,
-                 strsplit(sapply(strsplit(action, ":"), function(x) x[3]),
-                          ",")),
-               2, as.numeric)
-# ATTENTION: as.numeric() forces NAs for "OpenCard" and "CloseCard"
-
-card_action <- trimws(sapply(strsplit(action, ":"),
-                             function(x) x[3])[grep("Artwork", events)])
-
-card <- as.numeric(sapply(strsplit(action, ":"), function(x) x[4]))
-
-events[grep("Artwork", events)] <- paste("Artwork", card_action, sep = "/")
-
-ts_elements <- strsplit(timestamp, ":")
-time_ms <- as.numeric(sapply(ts_elements, function(x) x[4])) +
-           as.numeric(sapply(ts_elements, function(x) x[3])) * 1000 +
-           as.numeric(sapply(ts_elements, function(x) x[2])) * 1000 * 60
-
-dat$date        <- lubridate::parse_date_time(date, "bdyHMSOp")
-dat$timeMs      <- time_ms
-dat$event       <- events
-dat$artwork     <- trimws(sapply(strsplit(topics, "/"), function(x) x[1]))
-dat$popup       <- sapply(strsplit(topics, "/"), function(x) x[2])
-dat$topicNumber <- card
-dat$x           <- moves[,1]
-dat$y           <- moves[,2]
-dat$scale       <- moves[,3]
-dat$rotation    <- moves[,4]
-
-dat$logs <- NULL
-# remove original log files from data so file becomes smaller
-
-# sort by fileId, since reading in by file names does not make sense
-# because of missing left zero padding
-dat <- dat[order(dat$fileId, dat$date, dat$timeMs), ]
-
-# Export data
-
-write.table(dat, "../data/rawdata_logfiles.csv",
-            sep = ";", quote = FALSE, row.names = FALSE)
-
--- a/code/02_glossar_artworks.R
+++ b/code/02_glossar_artworks.R
@ -1,37 +0,0 @@
-# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light")
-
-dat0 <- read.table("../../rawdata_logfiles.csv", sep = ";",
-                   header = TRUE)
-# artwork names
-artworks <- unique(na.omit(dat0$artwork))[unique(na.omit(dat0$artwork)) != "glossar"]
-
-dat <- subset(dat0, dat0$artwork == "glossar")
-
-glossar_files <- unique(dat$popup)
-
-x <- NULL
-
-for (glossar_file in glossar_files) {
-  for (artwork in artworks) {
-    fnames <- dir(pattern = paste0(artwork, "_"), path = artwork)
-    for (fname in fnames) {
-      lines <- readLines(paste0(artwork, "/", fname))
-        if (any(grepl(glossar_file, lines))) {
-          x <- rbind(x, data.frame(glossar_file, artwork))
-          break
-      }
-    }
-  }
-}
-
-head(x, 20)
-
-glossar_dict <- as.data.frame(tapply(x$artwork, x$glossar_file, FUN = c))
-names(glossar_dict) <- "artwork"
-glossar_dict$glossar_file <- rownames(glossar_dict)
-rownames(glossar_dict) <- NULL
-glossar_dict <- glossar_dict[, c("glossar_file", "artwork")]
-
-save(glossar_dict, file = "../../glossar_dict.RData")
-# TODO: Save in interoperable format
-
--- a/code/02_preprocessing.R
+++ b/code/02_preprocessing.R
@ -1,6 +1,9 @@
+# TODO: This script is obsolete and needs to be updated!
+
 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")

-source("functions.R")
+#source("functions.R")
+devtools::load_all("../../../../software/mtt")

 small <- TRUE

@ -10,10 +13,10 @@ now <- Sys.time()
 cat("########## Reading in data... ##########", "\n")

 if (small) {
-  dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";",
+  dat0 <- read.table("../data/haum/rawdata_logfiles_small.csv", sep = ";",
                     header = TRUE)
 } else {
-  dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";",
+  dat0 <- read.table("../data/haum/rawdata_logfiles.csv", sep = ";",
                     header = TRUE)
 }
 dat0$date <- as.POSIXct(dat0$date)
@ -54,7 +57,6 @@ dat2 <- dat2[order(dat2$date.start, dat2$fileId.start), ]
 # Remove all events that do not have a `date.start`
 dat2 <- dat2[!is.na(dat2$date.start), ]
 rownames(dat2) <- NULL
-# TODO: Throw warning about this

 save(dat2, file = paste("tmp/dat2", ifelse(small, "small_", "full_"),
                       format(now, "%Y-%m-%d_%H-%M-%S"), ".RData"))
@ -90,7 +92,7 @@ artworks <- unique(dat4$artwork)
 # remove artworks without XML information
 artworks <- artworks[!artworks %in% c("504", "505")]
 topics <- extract_topics(artworks, pattern = paste0(artworks, ".xml"),
-                         path = "../data/ContentEyevisit/eyevisit_cards_light/")
+                         path = "../data/haum/ContentEyevisit/eyevisit_cards_light/")

 dat5 <- add_topic(dat4, topics = topics)

@ -101,6 +103,6 @@ save(dat5, file = paste("tmp/dat5", ifelse(small, "small_", "full_"),

 # Export data ############################################################
 cat("########## Exporting data frame with event logs... ##########", "\n")
-write.table(dat5, "../data/event_logfiles.csv", sep = ";",
+write.table(dat5, "../data/haum/event_logfiles.csv", sep = ";",
            row.names = FALSE)

--- a/code/03_modeling.R
+++ b/code/03_modeling.R
@ -16,7 +16,7 @@

 #' # Read data

-dat <- read.table("../data/event_logfiles.csv", sep = ";", header = TRUE)
+dat <- read.table("../data/haum/event_logfiles.csv", sep = ";", header = TRUE)
 dat$date.start <- as.POSIXct(dat$date.start)
 dat$date.stop <- as.POSIXct(dat$date.stop)

--- a/code/03_topic-cards.R
+++ b/code/03_topic-cards.R
@ -1,16 +0,0 @@
-path <- "C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light"
-
-setwd(path)
-
-# artwork names
-dat0 <- read.table("../../event_logfiles.csv", sep = ";", header = TRUE)
-dat0$artwork <- sprintf("%03d", dat0$artwork)
-artworks <- sort(unique(dat0$artwork))
-
-# extract topics
-topics <- extract_topics(artworks, paste0(artworks, ".xml"), path)
-
-write.table(topics, file = "../../topics.csv", sep = ";", row.names = FALSE)
-
-# TODO: Keep this file?
-
--- a/code/functions.R
+++ b/code/functions.R
@ -1,458 +0,0 @@
-###########################################################################
-
-# Add trace variable
-
-add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") {
-
-  data$trace <- NA
-  subdata1 <- data[data$event %in% c("Transform start", "Transform stop"), ]
-  subdata2 <- data[!data$event %in% c("Transform start", "Transform stop"), ]
-
-  last_event <- subdata2$event[1]
-  artworks <- unique(subdata2$artwork)[unique(subdata2$artwork) != "glossar"]
-  n <- 1    # count artworks for progress
-
-  pb <- txtProgressBar(min = 0, max = nrow(subdata2), initial = NA,
-                       style = 3)
-
-  for (artwork in artworks) {
-
-    cat("\n\nAdding trace variable for artwork", artwork,
-               paste0("(", n, "/", length(artworks), ")"), "\n")
-
-    for (i in 1:nrow(subdata2)) {
-
-      if (last_event == "Show Info" & subdata2$artwork[i] == artwork) {
-        subdata2$trace[i] <- i
-        j <- i
-
-      } else if (last_event == "Show Front" & subdata2$artwork[i] == artwork) {
-        subdata2$trace[i] <- j
-
-      } else if (!(last_event %in% c("Show Info", "Show Front")) &
-                 subdata2$artwork[i] == artwork) {
-        subdata2$trace[i] <- j
-      }
-
-      if (i <= nrow(subdata2)) {
-        last_event <- subdata2$event[i + 1]
-      }
-      setTxtProgressBar(pb, i)
-    }
-    n <- n + 1
-  }
-
-  # Fix glossar entries (find corresponding artworks and fill in trace)
-  glossar_files <- unique(subdata2[subdata2$artwork == "glossar", "popup"])
-
-  # load lookup table for artworks and glossar files
-  load(glossar_dict)
-  lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ]
-
-  inside  <- glossar_files[glossar_files %in%
-                           lut[sapply(lut$artwork, length) == 1,
-                               "glossar_file"]]
-  single_art  <- unlist(lut[lut$glossar_file %in% inside, "artwork"])
-
-  m <- 1
-
-  for (file in lut$glossar_file) {
-
-    cat("\n\nAdding trace variable for glossar entry", file,
-               paste0("(", m, "/", length(lut$glossar_file), ")"), "\n")
-
-    artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])
-
-    for (i in seq_len(nrow(subdata2))) {
-
-      if (subdata2$event[i] == "Show Info" |
-          (subdata2$event[i] == "Artwork/OpenCard" &
-           subdata2$artwork[i] %in% single_art)) {
-
-        current_artwork <- subdata2[i, "artwork"]
-        j <- i
-        k <- i
-
-      } else {
-
-        current_artwork <- current_artwork
-
-      }
-
-      if (subdata2$event[i] == "Show Front" & subdata2$artwork[i] == current_artwork) {
-      # make sure artwork has not been closed, yet!
-        k <- i
-      }
-
-      if (subdata2$artwork[i] == "glossar" &
-          (current_artwork %in% artwork_list) &
-          subdata2$popup[i] == file & (j - k == 0)) {
-
-        subdata2[i, "trace"]   <- subdata2[j, "trace"]
-        subdata2[i, "artwork"] <- current_artwork
-
-      }
-      setTxtProgressBar(pb, i)
-    }
-    m <- m + 1
-  }
-
-  # Exclude not matched glossar entries
-  cat("\n\nINFORMATION: glossar entries that are not matched will be removed:",
-      sum(is.na(subdata2[subdata2$glossar == 1, "trace"])), "entries",
-      #proportions(table(is.na(subdata2[subdata2$glossar == 1, "trace"]))),
-      fill = TRUE)
-  subdata2 <- subset(subdata2, !is.na(subdata2$trace))
-  # REMEMBER: It can never be 100% correct, since it is always possible
-  # that several cards are open and that they link to the same glossar
-  # entry
-
-  # dat2[14110:14130, ]
-  # dat2[dat2$glossar == 1, ]
-
-  out <- rbind(subdata1, subdata2)
-  out <- out[order(out$fileId, out$date, out$timeMs), ]
-  out
-}
-
-###########################################################################
-
-close_events <- function(data, event = c("move", "flipCard", "openTopic", "openPopup")) {
-
-  event <- match.arg(event)
-
-  switch(event,
-    "move" = {
-      actions <- c("Transform start", "Transform stop")
-      idvar   <- c("fileId", "eventId", "artwork", "glossar")
-      drop    <- c("popup", "topicNumber", "trace", "event")
-      ncol    <- 16
-
-    },
-    "flipCard" = {
-      actions <- c("Show Info", "Show Front")
-      idvar   <- c("fileId", "trace", "artwork", "glossar")
-      drop    <- c("popup", "topicNumber", "eventId", "event")
-      ncol    <- 16
-
-    },
-    "openTopic" = {
-      actions <- c("Artwork/OpenCard", "Artwork/CloseCard")
-      idvar   <- c("fileId", "eventId", "trace", "glossar", "artwork",
-                   "topicNumber")
-      drop    <- c("popup", "event")
-      ncol    <- 18
-
-    },
-    "openPopup" = {
-      actions <- c("ShowPopup", "HidePopup")
-      idvar   <- c("fileId", "eventId", "trace", "glossar", "artwork", "popup")
-      drop    <- c("topicNumber", "event")
-      ncol    <- 18
-#   TODO: Should topicNumber maybe also be filled in for "openPopup"?
-
-    }
-  )
-
-  subdata <- subset(data, data$event %in% actions)
-  subdata <- subdata[order(subdata$artwork, subdata$popup, subdata$date, subdata$timeMs), ]
-  subdata$time <- ifelse(subdata$event == actions[1], "start", "stop")
-  num_start <- diff(c(0, which(subdata$event == actions[2])))
-  if (tail(subdata, 1)$time == "start") {
-    num_start <- c(num_start, 1)
-  }
-  subdata$eventId <- rep(seq_along(num_start), num_start)
-
-  if (event == "move") {
-    subdata    <- subdata[!duplicated(subdata[, c("event", "eventId")]), ]
-    id_stop    <- which(subdata$event == actions[2])
-    id_rm_stop <- id_stop[diff(id_stop) == 1]
-    subdata    <- subdata[-(id_rm_stop + 1), ]
-  }
-
-  subdata_split <- split(subdata, ~ fileId)
-
-  pbapply::pboptions(style = 3, char = "=")
-
-  subdata_split_wide <- pbapply::pblapply(subdata_split, reshape,
-                               direction = "wide",
-                               idvar = idvar,
-                               timevar = "time",
-                               drop = drop)
-#  suppressWarnings(
-#    data_wide <- reshape(subdata, direction = "wide",
-#                         idvar = idvar,
-#                         timevar = "time",
-#                         drop = drop)
-#  )
-
-  # remove entries with only start or stop events since they do not have
-  # all columns
-  ids <- which(sapply(subdata_split_wide, ncol) != ncol)
-  if (length(ids) > 0) subdata_split_wide <- subdata_split_wide[-ids]
-
-  data_wide <- dplyr::bind_rows(subdata_split_wide)
-
-  for (d in drop) data_wide[d] <- NA
-  data_wide$distance        <- NA
-  data_wide$scaleSize       <- NA
-  data_wide$rotationDegree  <- NA
-
-  data_wide$event <- event
-  data_wide$duration <- data_wide$timeMs.stop - data_wide$timeMs.start
-
-  if (event == "move") {
-    data_wide$distance <- apply(
-        data_wide[, c("x.start", "y.start", "x.stop", "y.stop")], 1,
-        function(x) dist(matrix(x, 2, 2, byrow = TRUE)))
-    data_wide$rotationDegree <- data_wide$rotation.stop -
-      data_wide$rotation.start
-    data_wide$scaleSize <- data_wide$scale.stop / data_wide$scale.start
-    # remove moves without any change
-    move_wide <- data_wide[data_wide$distance != 0 &
-                           data_wide$rotationDegree != 0 &
-                           data_wide$scaleSize != 1, ]
-    cat(paste("INFORMATION:", nrow(data_wide) - nrow(move_wide),
-    "lines containing move events were removed since they did",
-    "\nnot contain any change"), fill = TRUE)
-    data_wide <- move_wide
-  }
-
-  out <- data_wide[, c("fileId", "event", "artwork", "trace", "glossar",
-                       "date.start", "date.stop", "timeMs.start",
-                       "timeMs.stop", "duration", "topicNumber", "popup",
-                       "x.start", "y.start", "x.stop", "y.stop",
-                       "distance", "scale.start", "scale.stop",
-                       "scaleSize", "rotation.start", "rotation.stop",
-                       "rotationDegree")]
-  rownames(out) <- NULL
-  out
-}
-
-###########################################################################
-
-# Add case variable
-
-add_case <- function(data, cutoff = 20) {
-# TODO: What is the best choice for the cutoff here?
-
-  data$timediff <- as.numeric(diff(c(data$date.start[1], data$date.start)))
-  data$case <- NA
-  j <- 1
-  pb <- txtProgressBar(min = 0, max = nrow(data), initial = NA, style = 3)
-
-  for (i in seq_len(nrow(data))) {
-    if (data$timediff[i] <= cutoff) {
-      data$case[i] <- j
-    } else {
-      j <- j + 1
-      data$case[i] <- j
-    }
-    setTxtProgressBar(pb, i)
-  }
-  data$timediff <- NULL
-  data
-}
-
-###########################################################################
-
-# Add trace for moves
-
-add_trace_moves <- function(data) {
-
-  pbapply::pboptions(style = 3, char = "=")
-
-  trace_max <- max(data$trace, na.rm = TRUE)
-
-  #subdata_art <- split(data, ~ artwork)
-  subdata_case <- split(data, ~ case)
-
-  #subdata_list <- split(data, ~ artwork + case)
-  # --> does not work with complete data set
-  cat("Splitting data...", "\n")
-  subdata_list <- pbapply::pblapply(subdata_case, split, f = ~artwork)
-  subdata_list <- unlist(subdata_list, recursive = FALSE)
-
-  cat("Adding trace...", "\n")
-  subdata_trace <- pbapply::pblapply(subdata_list,
-        function(x) {
-          trace_max <<- trace_max + 1
-          add_trace_subdata(x, max_trace = trace_max)
-        }
-      )
-
-  out <- dplyr::bind_rows(subdata_trace)
-  out <- out[order(out$fileId.start, out$date.start, out$timeMs.start), ]
-  rownames(out) <- NULL
-
-  # Make trace a consecutive number
-  out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace)))
-  out
-}
-
-
-add_trace_subdata <- function(subdata, max_trace) {
-
-  if (nrow(subdata) != 0) {
-
-    if (length(na.omit(unique(subdata$trace))) == 1) {
-      subdata[subdata$event == "move", "trace"] <- na.omit(unique(subdata$trace))
-    } else if (length(na.omit(unique(subdata$trace))) > 1) {
-      for (i in 1:nrow(subdata)) {
-        if (subdata$event[i] == "move") {
-          if (i == 1) {
-            subdata$trace[i] <- na.omit(unique(subdata$trace))[1]
-          } else {
-            subdata$trace[i] <- subdata$trace[i - 1]
-          }
-        }
-      }
-    } else if (all(is.na(subdata$trace))) {
-      for (i in 1:nrow(subdata)) {
-        subdata$trace[i] <- max_trace
-      }
-    }
-
-  } else {
-    warning("`subdata` has nrow = 0")
-  }
-  subdata
-}
-
-
-###########################################################################
-
-# Create data frame with file names and topics for each artwork
-
-extract_topics <- function(artworks, pattern, path) {
-
-  dat <- NULL
-  file_order <- NULL
-  i <- 1
-
-  for (artwork in artworks) {
-
-    if (length(pattern) == 1) {
-      index_file <- pattern
-    } else {
-      index_file <- pattern[i]
-    }
-
-    fnames <- dir(pattern = paste0(artwork, "_"),
-                  path = paste(path, artwork, sep = "/"))
-    topic <- NULL
-    for (fname in fnames) {
-      suppressWarnings(
-      topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
-        grep("^<card type=",
-          trimws(readLines(paste(path, artwork, fname, sep = "/"))),
-          value = T)))
-      )
-
-    }
-    index <- paste(path, artwork, index_file, sep = "/")
-    suppressWarnings(
-    file_order <- c(file_order, gsub("^<card src=.*/(.*)./>$", "\\1",
-                         grep("^<card src=", trimws(readLines(index)),
-                              value = TRUE)))
-    )
-    in_index <- fnames %in% file_order
-    dat <- rbind(dat, data.frame(artwork, file_name = fnames, in_index, topic))
-    i <- i + 1
-  }
-
-  # take only the ones that are actually displayed and sort in the same order
-  # as indicated in index.html
-  out <- dat[dat$in_index, -3]
-  out <- out[order(file_order, out$file_name), ]
-  rownames(out) <- NULL
-
-  out$index <- unlist(sapply(table(out$artwork), seq_len))
-  out
-}
-
-###########################################################################
-
-# Add topics: file names and topics
-
-add_topic <- function(data, topics) {
-
-  artworks <- unique(data$artwork)
-  tab_art <- lapply(artworks,
-               function(x) names(table(data$topicNumber[data$artwork == x])))
-  names(tab_art) <- artworks
-
-  tab_index <- lapply(tab_art, seq_along)
-
-  dat_split <- split(data, ~ artwork)
-
-  set_label <- function(x) {
-    artwork <- unique(x$artwork)
-    x$topicIndex <- factor(x$topicNumber, labels = tab_index[[artwork]])
-    x
-  }
-
-  dat_label <- lapply(dat_split, set_label)
-
-  set_topic <- function(x) {
-    artwork <- unique(x$artwork)
-    labels_file <- topics[topics$artwork == artwork,
-                          "file_name"][as.numeric(levels(x$topicIndex))]
-    x$topicFile <- as.character(factor(x$topicIndex, labels = labels_file))
-    labels_topic <- topics[topics$artwork == artwork,
-                          "topic"][as.numeric(levels(x$topicIndex))]
-    x$topic <- as.character(factor(x$topicIndex, labels = labels_topic))
-    x
-  }
-
-  dat_topic <- lapply(dat_label, set_topic)
-
-  #out <- do.call(rbind, dat_topic)
-  out <- dplyr::bind_rows(dat_topic)
-  out <- out[order(out$fileId.start, out$date.start, out$timeMs.start), ]
-  rownames(out) <- NULL
-  out
-}
-
-###########################################################################
-
-# Create data frame with information on artworks
-
-extract_artworks <- function(artworks, files = paste0(artworks, ".xml"),
-                             path = path) {
-  out <- NULL
-  i <- 1
-
-  for (artwork in artworks) {
-
-    if (length(files) == 1) {
-      index_file <- files
-    } else {
-      index_file <- files[i]
-    }
-
-    index <- paste(path, artwork, index_file, sep = "/")
-    varnames <- c("artist", "title", "misc", "description")
-    xmllist <- XML::xmlToList(index)$header[varnames]
-
-    if (any(sapply(xmllist, is.null))) {# necessary for missing entries
-      names(xmllist) <- varnames
-      xmllist[which(sapply(xmllist, is.null))] <- NA
-    }
-    # remove ugly quotes
-    xmllist <- lapply(xmllist, function(x) gsub("„|“", "", x))
-    # remove HTML tags
-    xmllist <- lapply(xmllist, function(x) gsub("<br/>", " ", x))
-    xmldat <- as.data.frame(xmllist)
-    xmldat$artwork <- artwork
-    # trim white space from strings
-    xmldat$artist      <- trimws(xmldat$artist)
-    xmldat$title       <- trimws(xmldat$title)
-    xmldat$misc        <- trimws(xmldat$misc)
-    xmldat$description <- trimws(xmldat$description)
-    out <- rbind(out, xmldat)
-    i <- i + 1
-  }
-  out
-}
-
--- a/code/03_specs.R
+++ b/code/03_specs.R
@ -2,7 +2,7 @@

 library(lubridate)

-dat <- read.table("../data/rawdata_logfiles.csv", header = TRUE, sep = ";")
+dat <- read.table("../data/haum/rawdata_logfiles.csv", header = TRUE, sep = ";")
 # dat$event <- factor(dat$event, levels = c("Start Application",
 #                                           "Show Application",
 #                                           "Transform start",
--- a/code/visualization.R
+++ b/code/visualization.R
@ -0,0 +1,26 @@
+setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code/")
+
+devtools::load_all("../../../../software/mtt")
+#library(mtt)
+
+dat <- parse_logfiles("2016", path = "../data/haum/LogFiles/",
+                      save = FALSE)
+datlogs <- create_eventlogs(dat, "../data/haum/ContentEyevisit/eyevisit_cards_light/")
+
+dat001 <- datlogs[which(datlogs$artwork == "001"), ]
+
+index <- as.numeric(as.factor(dat001$trace))
+cc <- sample(colors(), length(unique(dat001$trace)))
+
+plot(y.start ~ x.start, dat001, type = "n", xlab = "x", ylab = "y",
+     xlim = c(0, 3840), ylim = c(0, 2160))
+with(dat001[1:200,], arrows(x.start, y.start, x.stop, y.stop,
+                            length = .07, col = cc[index]))
+
+plot(y.start ~ x.start, dat001, xlab = "x", ylab = "y",
+     xlim = c(0, 3840), ylim = c(0, 2160), pch = 16, col = "gray")
+points(y.start ~ x.start, dat001, xlab = "x", ylab = "y",
+     xlim = c(0, 3840), ylim = c(0, 2160), cex = dat001$scaleSize,
+     col = "blue")
+
+