From 3dd13a6c6e76e591b4a212d6283609613cf15175 Mon Sep 17 00:00:00 2001
From: nwickel <n.wickelmaier@iwm-tuebingen.de>
Date: Thu, 21 Sep 2023 16:45:06 +0200
Subject: [PATCH] Moved most stuff into package folder mtt; updated README so
 it works with new code

---
 README.Rmd                   |  37 ++-
 code/01_parse-logfiles.R     | 125 ----------
 code/02_glossar_artworks.R   |  37 ---
 code/02_preprocessing.R      |  14 +-
 code/03_modeling.R           |   2 +-
 code/03_topic-cards.R        |  16 --
 code/functions.R             | 458 -----------------------------------
 code/{03_specs.R => specs.R} |   2 +-
 code/visualization.R         |  26 ++
 9 files changed, 51 insertions(+), 666 deletions(-)
 delete mode 100644 code/01_parse-logfiles.R
 delete mode 100644 code/02_glossar_artworks.R
 delete mode 100644 code/03_topic-cards.R
 delete mode 100644 code/functions.R
 rename code/{03_specs.R => specs.R} (97%)
 create mode 100644 code/visualization.R

diff --git a/README.Rmd b/README.Rmd
index e7ec837..e730198 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -8,6 +8,11 @@ output:
     toc: true
 ---
 
+```{r, include = FALSE}
+# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis")
+devtools::load_all("../../../software/mtt")
+```
+
 # Log data from the Multi-Touch Table at the HAUM
 
 The Multi Touch Table at the Herzog-Anton-Ulrich-Museum (HAUM) in
@@ -117,11 +122,6 @@ files have been affected.
 
 # Problems and how I handled them
 
-```{r, include = FALSE}
-# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis")
-source("code/functions.R")
-```
-
 This lists some problems with the log data that required decisions. These
 decisions influence the outcome and maybe even the data quality. Hence, I
 tried to document how I handled these problems and explain the decisions I
@@ -136,7 +136,7 @@ continuous within one log file but not over several log files.
 
 ```{r}
 # Read data
-dat0 <- read.table("data/rawdata_logfiles_small.csv", sep = ";",
+dat0 <- read.table("data/haum/rawdata_logfiles_small.csv", sep = ";",
                    header = TRUE)
 dat0$date <- as.POSIXct(dat0$date)
 dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
@@ -146,26 +146,16 @@ dat <- subset(dat0, !(dat0$event %in% c("Start Application",
                                         "Show Application")))
 
 # Add trace variable
-dat1 <- add_trace(dat, glossar_dict = "data/glossar_dict.RData")
+dat1 <- add_trace(dat, glossar_dict = "data/haum/glossar_dict.RData")
 
 # Close events
 dat2 <- rbind(close_events(dat1, "move"),
               close_events(dat1, "flipCard"),
               close_events(dat1, "openTopic"),
               close_events(dat1, "openPopup"))
-dat2 <- dat2[order(dat2$date.start, dat2$fileId.start), ]
-
-head(dat2[which(dat2$duration < 0),
-     c("fileId.start", "fileId.stop", "event", "artwork", "duration")], 20)
-
-head(dat2[which(dat2$fileId.start != dat2$fileId.stop),
-     c("fileId.start", "fileId.stop", "event", "artwork", "duration")], 20)
+dat2 <- dat2[order(dat2$date.start, dat2$fileId), ]
 
 plot(timeMs ~ as.factor(fileId), dat[1:5000,], xlab = "fileId")
-
-# Remove durations when event spans more than one log file, since they are
-# not interpretable
-#dat2[which(dat2$fileId.start != dat2$fileId.stop), "duration"] <- NA
 ```
 
 The boxplot shows that we have a continuous range of values within one log
@@ -183,6 +173,9 @@ exactly fixed. Unfortunately, only three `move` events were fixed, since it
 only fixed irregularities *within* one log file. See below for more
 details.
 
+UPDATE: By now I remove all events that span more than one log file. This
+lets me improve speed considerably.
+
 ## Left padding of file IDs
 
 The file names of the raw log files are automatically generated and contain
@@ -196,7 +189,7 @@ will sort these files in the order shown below. In order to preprocess the
 data and close events that belong together, the data need to be sorted by
 events and artworks repeatedly. In order to get them back in the correct
 time order, it is necessary to order them based on three variables:
-`fileId.start`, `date.start` and `timeMs`. The file IDs therefore need to
+`fileId`, `date.start` and `timeMs`. The file IDs therefore need to
 sort in the correct order (again see below for example). I zero left padded
 the log file names within the data frame using it as an identifier. These
 "file names" do not correspond exactly to the original raw log file names.
@@ -406,7 +399,7 @@ assign topics and file names to the according pop-ups. This needs to be
 cross checked with the programming, but seems the most plausible approach
 with my current knowledge.
 
-## Extracting topics from `index.xml` vs. `<artwork_number>.xml
+## Extracting topics from `index.xml` vs. `<artwork_number>.xml`
 
 When I extract the topics from `index.html` I get different topics, than
 when I get them from `<artwork>.html`. At first glance, it looks like using
@@ -414,7 +407,7 @@ when I get them from `<artwork>.html`. At first glance, it looks like using
 
 ```{r}
 artworks <- unique(dat2$artwork)
-path <- "data/ContentEyevisit/eyevisit_cards_light/"
+path <- "data/haum/ContentEyevisit/eyevisit_cards_light/"
 topics <- extract_topics(artworks, "index.xml", path)
 topics2 <- extract_topics(artworks, paste0(artworks, ".xml"), path)
 
@@ -434,7 +427,7 @@ sudden there were 72 instead of 70 artworks. It seems like these two
 artworks appear on October 21, 2022.
 
 ```{r}
-dat0 <- read.table("data/rawdata_logfiles.csv", sep = ";", header = TRUE)
+dat0 <- read.table("data/haum/rawdata_logfiles.csv", sep = ";", header = TRUE)
 dat0$date <- as.POSIXct(dat0$date)
 dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
 
diff --git a/code/01_parse-logfiles.R b/code/01_parse-logfiles.R
deleted file mode 100644
index 7481f0a..0000000
--- a/code/01_parse-logfiles.R
+++ /dev/null
@@ -1,125 +0,0 @@
-# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
-
-###### HELPER ######
-
-# Need to left pad file names. If I do not do this, the sorting of the
-# timestamps will be off and I get negative durations later on since the
-# wrong events get closed.
-
-
-leftpad_fnames <- function(x) {
-
-  z <- gsub(paste0(dirpaths, "/"), "\\1", x)
-  ys <- strsplit(z, "_")
-
-  res <- NULL
-
-  for (y in ys) {
-    y2 <- unlist(strsplit(y[3], "-"))
-    e1 <- y[1]
-    e2 <- sprintf("%02d", as.numeric(y[2]))
-    e3 <- sprintf("%02d", as.numeric(y2[1]))
-    e4 <- sprintf("%02d", as.numeric(y2[2]))
-    e5 <- sprintf("%02d", as.numeric(y[4]))
-    e6 <- sprintf("%02d", as.numeric(gsub(".log", "", y[5])))
-    e6 <- sprintf("%02d", as.numeric(gsub(".log", "", y[5])))
-
-    res <- c(res,
-             paste0(e1, "_", e2, "_", e3, "-", e4, "_", e5, "_", e6, ".log"))
-  }
-  res
-}
-
-##### CONTENT ######
-
-# Choose which folders with raw log files should be included
-
-folders <- "all"
-#folders <- "_2016b"
-
-dirpaths <- paste0("../data/haum_logs_2016-2023/", folders)
-
-fnames <- dir(dirpaths, pattern = "*.log", full.names = TRUE)
-length(fnames)
-head(fnames)
-
-logs <- lapply(fnames, readLines)
-nlog <- sapply(logs, length)
-dat <- data.frame(fileId = rep(leftpad_fnames(fnames), nlog),
-                  logs = unlist(logs))
-head(dat$logs)
-
-# Remove corrupted lines
-
-
-# corrupt lines are "" and need to be removed
-d1 <- dim(dat)[1]
-dat <- subset(dat, dat$logs != "")
-d2 <- dim(dat)[1]
-
-# TODO: Catch this in a function and give back a meaningful warning
-# The files contain `r d1-d2` corrupt lines that were removed from the
-# data.
-
-# Extract relevant infos
-
-date <- sapply(dat$logs, gsub,
-               pattern = "^\\[(.*)\\], \\[.*$",
-               replacement = "\\1",
-               USE.NAMES = FALSE)
-
-timestamp <- sapply(dat$logs, gsub,
-                    pattern = "^\\[.*\\], \\[(.*)\\].*$",
-                    replacement = "\\1",
-                    USE.NAMES = FALSE)
-
-action <- sapply(dat$logs, gsub,
-                 pattern = "^.*EyeVisit, (.*):*.*$",
-                 replacement = "\\1",
-                 USE.NAMES = FALSE)
-
-events <- sapply(strsplit(action, ":"), function(x) x[1])
-
-topics <- sapply(strsplit(action, ":"), function(x) x[2])
-
-moves <- apply(do.call(rbind,
-                 strsplit(sapply(strsplit(action, ":"), function(x) x[3]),
-                          ",")),
-               2, as.numeric)
-# ATTENTION: as.numeric() forces NAs for "OpenCard" and "CloseCard"
-
-card_action <- trimws(sapply(strsplit(action, ":"),
-                             function(x) x[3])[grep("Artwork", events)])
-
-card <- as.numeric(sapply(strsplit(action, ":"), function(x) x[4]))
-
-events[grep("Artwork", events)] <- paste("Artwork", card_action, sep = "/")
-
-ts_elements <- strsplit(timestamp, ":")
-time_ms <- as.numeric(sapply(ts_elements, function(x) x[4])) +
-           as.numeric(sapply(ts_elements, function(x) x[3])) * 1000 +
-           as.numeric(sapply(ts_elements, function(x) x[2])) * 1000 * 60
-
-dat$date        <- lubridate::parse_date_time(date, "bdyHMSOp")
-dat$timeMs      <- time_ms
-dat$event       <- events
-dat$artwork     <- trimws(sapply(strsplit(topics, "/"), function(x) x[1]))
-dat$popup       <- sapply(strsplit(topics, "/"), function(x) x[2])
-dat$topicNumber <- card
-dat$x           <- moves[,1]
-dat$y           <- moves[,2]
-dat$scale       <- moves[,3]
-dat$rotation    <- moves[,4]
-
-dat$logs <- NULL
-# remove original log files from data so file becomes smaller
-
-# sort by fileId, since reading in by file names does not make sense
-# because of missing left zero padding
-dat <- dat[order(dat$fileId, dat$date, dat$timeMs), ]
-
-# Export data
-
-write.table(dat, "../data/rawdata_logfiles.csv",
-            sep = ";", quote = FALSE, row.names = FALSE)
-
diff --git a/code/02_glossar_artworks.R b/code/02_glossar_artworks.R
deleted file mode 100644
index 1b37e67..0000000
--- a/code/02_glossar_artworks.R
+++ /dev/null
@@ -1,37 +0,0 @@
-# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light")
-
-dat0 <- read.table("../../rawdata_logfiles.csv", sep = ";",
-                   header = TRUE)
-# artwork names
-artworks <- unique(na.omit(dat0$artwork))[unique(na.omit(dat0$artwork)) != "glossar"]
-
-dat <- subset(dat0, dat0$artwork == "glossar")
-
-glossar_files <- unique(dat$popup)
-
-x <- NULL
-
-for (glossar_file in glossar_files) {
-  for (artwork in artworks) {
-    fnames <- dir(pattern = paste0(artwork, "_"), path = artwork)
-    for (fname in fnames) {
-      lines <- readLines(paste0(artwork, "/", fname))
-        if (any(grepl(glossar_file, lines))) {
-          x <- rbind(x, data.frame(glossar_file, artwork))
-          break
-      }
-    }
-  }
-}
-
-head(x, 20)
-
-glossar_dict <- as.data.frame(tapply(x$artwork, x$glossar_file, FUN = c))
-names(glossar_dict) <- "artwork"
-glossar_dict$glossar_file <- rownames(glossar_dict)
-rownames(glossar_dict) <- NULL
-glossar_dict <- glossar_dict[, c("glossar_file", "artwork")]
-
-save(glossar_dict, file = "../../glossar_dict.RData")
-# TODO: Save in interoperable format
-
diff --git a/code/02_preprocessing.R b/code/02_preprocessing.R
index ec85a1e..2c67729 100644
--- a/code/02_preprocessing.R
+++ b/code/02_preprocessing.R
@@ -1,6 +1,9 @@
+# TODO: This script is obsolete and needs to be updated!
+
 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
 
-source("functions.R")
+#source("functions.R")
+devtools::load_all("../../../../software/mtt")
 
 small <- TRUE
 
@@ -10,10 +13,10 @@ now <- Sys.time()
 cat("########## Reading in data... ##########", "\n")
 
 if (small) {
-  dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";",
+  dat0 <- read.table("../data/haum/rawdata_logfiles_small.csv", sep = ";",
                      header = TRUE)
 } else {
-  dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";",
+  dat0 <- read.table("../data/haum/rawdata_logfiles.csv", sep = ";",
                      header = TRUE)
 }
 dat0$date <- as.POSIXct(dat0$date)
@@ -54,7 +57,6 @@ dat2 <- dat2[order(dat2$date.start, dat2$fileId.start), ]
 # Remove all events that do not have a `date.start`
 dat2 <- dat2[!is.na(dat2$date.start), ]
 rownames(dat2) <- NULL
-# TODO: Throw warning about this
 
 save(dat2, file = paste("tmp/dat2", ifelse(small, "small_", "full_"),
                        format(now, "%Y-%m-%d_%H-%M-%S"), ".RData"))
@@ -90,7 +92,7 @@ artworks <- unique(dat4$artwork)
 # remove artworks without XML information
 artworks <- artworks[!artworks %in% c("504", "505")]
 topics <- extract_topics(artworks, pattern = paste0(artworks, ".xml"),
-                         path = "../data/ContentEyevisit/eyevisit_cards_light/")
+                         path = "../data/haum/ContentEyevisit/eyevisit_cards_light/")
 
 dat5 <- add_topic(dat4, topics = topics)
 
@@ -101,6 +103,6 @@ save(dat5, file = paste("tmp/dat5", ifelse(small, "small_", "full_"),
 
 # Export data ############################################################
 cat("########## Exporting data frame with event logs... ##########", "\n")
-write.table(dat5, "../data/event_logfiles.csv", sep = ";",
+write.table(dat5, "../data/haum/event_logfiles.csv", sep = ";",
             row.names = FALSE)
 
diff --git a/code/03_modeling.R b/code/03_modeling.R
index 36d087d..6093c71 100644
--- a/code/03_modeling.R
+++ b/code/03_modeling.R
@@ -16,7 +16,7 @@
 
 #' # Read data
 
-dat <- read.table("../data/event_logfiles.csv", sep = ";", header = TRUE)
+dat <- read.table("../data/haum/event_logfiles.csv", sep = ";", header = TRUE)
 dat$date.start <- as.POSIXct(dat$date.start)
 dat$date.stop <- as.POSIXct(dat$date.stop)
 
diff --git a/code/03_topic-cards.R b/code/03_topic-cards.R
deleted file mode 100644
index 921008e..0000000
--- a/code/03_topic-cards.R
+++ /dev/null
@@ -1,16 +0,0 @@
-path <- "C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light"
-
-setwd(path)
-
-# artwork names
-dat0 <- read.table("../../event_logfiles.csv", sep = ";", header = TRUE)
-dat0$artwork <- sprintf("%03d", dat0$artwork)
-artworks <- sort(unique(dat0$artwork))
-
-# extract topics
-topics <- extract_topics(artworks, paste0(artworks, ".xml"), path)
-
-write.table(topics, file = "../../topics.csv", sep = ";", row.names = FALSE)
-
-# TODO: Keep this file?
-
diff --git a/code/functions.R b/code/functions.R
deleted file mode 100644
index 00f579b..0000000
--- a/code/functions.R
+++ /dev/null
@@ -1,458 +0,0 @@
-###########################################################################
-
-# Add trace variable
-
-add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") {
-
-  data$trace <- NA
-  subdata1 <- data[data$event %in% c("Transform start", "Transform stop"), ]
-  subdata2 <- data[!data$event %in% c("Transform start", "Transform stop"), ]
-
-  last_event <- subdata2$event[1]
-  artworks <- unique(subdata2$artwork)[unique(subdata2$artwork) != "glossar"]
-  n <- 1    # count artworks for progress
-
-  pb <- txtProgressBar(min = 0, max = nrow(subdata2), initial = NA,
-                       style = 3)
-
-  for (artwork in artworks) {
-
-    cat("\n\nAdding trace variable for artwork", artwork,
-               paste0("(", n, "/", length(artworks), ")"), "\n")
-
-    for (i in 1:nrow(subdata2)) {
-
-      if (last_event == "Show Info" & subdata2$artwork[i] == artwork) {
-        subdata2$trace[i] <- i
-        j <- i
-
-      } else if (last_event == "Show Front" & subdata2$artwork[i] == artwork) {
-        subdata2$trace[i] <- j
-
-      } else if (!(last_event %in% c("Show Info", "Show Front")) &
-                 subdata2$artwork[i] == artwork) {
-        subdata2$trace[i] <- j
-      }
-
-      if (i <= nrow(subdata2)) {
-        last_event <- subdata2$event[i + 1]
-      }
-      setTxtProgressBar(pb, i)
-    }
-    n <- n + 1
-  }
-
-  # Fix glossar entries (find corresponding artworks and fill in trace)
-  glossar_files <- unique(subdata2[subdata2$artwork == "glossar", "popup"])
-
-  # load lookup table for artworks and glossar files
-  load(glossar_dict)
-  lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ]
-
-  inside  <- glossar_files[glossar_files %in%
-                           lut[sapply(lut$artwork, length) == 1,
-                               "glossar_file"]]
-  single_art  <- unlist(lut[lut$glossar_file %in% inside, "artwork"])
-
-  m <- 1
-
-  for (file in lut$glossar_file) {
-
-    cat("\n\nAdding trace variable for glossar entry", file,
-               paste0("(", m, "/", length(lut$glossar_file), ")"), "\n")
-
-    artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])
-
-    for (i in seq_len(nrow(subdata2))) {
-
-      if (subdata2$event[i] == "Show Info" |
-          (subdata2$event[i] == "Artwork/OpenCard" &
-           subdata2$artwork[i] %in% single_art)) {
-
-        current_artwork <- subdata2[i, "artwork"]
-        j <- i
-        k <- i
-
-      } else {
-
-        current_artwork <- current_artwork
-
-      }
-
-      if (subdata2$event[i] == "Show Front" & subdata2$artwork[i] == current_artwork) {
-      # make sure artwork has not been closed, yet!
-        k <- i
-      }
-
-      if (subdata2$artwork[i] == "glossar" &
-          (current_artwork %in% artwork_list) &
-          subdata2$popup[i] == file & (j - k == 0)) {
-
-        subdata2[i, "trace"]   <- subdata2[j, "trace"]
-        subdata2[i, "artwork"] <- current_artwork
-
-      }
-      setTxtProgressBar(pb, i)
-    }
-    m <- m + 1
-  }
-
-  # Exclude not matched glossar entries
-  cat("\n\nINFORMATION: glossar entries that are not matched will be removed:",
-      sum(is.na(subdata2[subdata2$glossar == 1, "trace"])), "entries",
-      #proportions(table(is.na(subdata2[subdata2$glossar == 1, "trace"]))),
-      fill = TRUE)
-  subdata2 <- subset(subdata2, !is.na(subdata2$trace))
-  # REMEMBER: It can never be 100% correct, since it is always possible
-  # that several cards are open and that they link to the same glossar
-  # entry
-
-  # dat2[14110:14130, ]
-  # dat2[dat2$glossar == 1, ]
-
-  out <- rbind(subdata1, subdata2)
-  out <- out[order(out$fileId, out$date, out$timeMs), ]
-  out
-}
-
-###########################################################################
-
-close_events <- function(data, event = c("move", "flipCard", "openTopic", "openPopup")) {
-
-  event <- match.arg(event)
-
-  switch(event,
-    "move" = {
-      actions <- c("Transform start", "Transform stop")
-      idvar   <- c("fileId", "eventId", "artwork", "glossar")
-      drop    <- c("popup", "topicNumber", "trace", "event")
-      ncol    <- 16
-
-    },
-    "flipCard" = {
-      actions <- c("Show Info", "Show Front")
-      idvar   <- c("fileId", "trace", "artwork", "glossar")
-      drop    <- c("popup", "topicNumber", "eventId", "event")
-      ncol    <- 16
-
-    },
-    "openTopic" = {
-      actions <- c("Artwork/OpenCard", "Artwork/CloseCard")
-      idvar   <- c("fileId", "eventId", "trace", "glossar", "artwork",
-                   "topicNumber")
-      drop    <- c("popup", "event")
-      ncol    <- 18
-
-    },
-    "openPopup" = {
-      actions <- c("ShowPopup", "HidePopup")
-      idvar   <- c("fileId", "eventId", "trace", "glossar", "artwork", "popup")
-      drop    <- c("topicNumber", "event")
-      ncol    <- 18
-#   TODO: Should topicNumber maybe also be filled in for "openPopup"?
-
-    }
-  )
-
-  subdata <- subset(data, data$event %in% actions)
-  subdata <- subdata[order(subdata$artwork, subdata$popup, subdata$date, subdata$timeMs), ]
-  subdata$time <- ifelse(subdata$event == actions[1], "start", "stop")
-  num_start <- diff(c(0, which(subdata$event == actions[2])))
-  if (tail(subdata, 1)$time == "start") {
-    num_start <- c(num_start, 1)
-  }
-  subdata$eventId <- rep(seq_along(num_start), num_start)
-
-  if (event == "move") {
-    subdata    <- subdata[!duplicated(subdata[, c("event", "eventId")]), ]
-    id_stop    <- which(subdata$event == actions[2])
-    id_rm_stop <- id_stop[diff(id_stop) == 1]
-    subdata    <- subdata[-(id_rm_stop + 1), ]
-  }
-
-  subdata_split <- split(subdata, ~ fileId)
-
-  pbapply::pboptions(style = 3, char = "=")
-
-  subdata_split_wide <- pbapply::pblapply(subdata_split, reshape,
-                               direction = "wide",
-                               idvar = idvar,
-                               timevar = "time",
-                               drop = drop)
-#  suppressWarnings(
-#    data_wide <- reshape(subdata, direction = "wide",
-#                         idvar = idvar,
-#                         timevar = "time",
-#                         drop = drop)
-#  )
-
-  # remove entries with only start or stop events since they do not have
-  # all columns
-  ids <- which(sapply(subdata_split_wide, ncol) != ncol)
-  if (length(ids) > 0) subdata_split_wide <- subdata_split_wide[-ids]
-
-  data_wide <- dplyr::bind_rows(subdata_split_wide)
-
-  for (d in drop) data_wide[d] <- NA
-  data_wide$distance        <- NA
-  data_wide$scaleSize       <- NA
-  data_wide$rotationDegree  <- NA
-
-  data_wide$event <- event
-  data_wide$duration <- data_wide$timeMs.stop - data_wide$timeMs.start
-
-  if (event == "move") {
-    data_wide$distance <- apply(
-        data_wide[, c("x.start", "y.start", "x.stop", "y.stop")], 1,
-        function(x) dist(matrix(x, 2, 2, byrow = TRUE)))
-    data_wide$rotationDegree <- data_wide$rotation.stop -
-      data_wide$rotation.start
-    data_wide$scaleSize <- data_wide$scale.stop / data_wide$scale.start
-    # remove moves without any change
-    move_wide <- data_wide[data_wide$distance != 0 &
-                           data_wide$rotationDegree != 0 &
-                           data_wide$scaleSize != 1, ]
-    cat(paste("INFORMATION:", nrow(data_wide) - nrow(move_wide),
-    "lines containing move events were removed since they did",
-    "\nnot contain any change"), fill = TRUE)
-    data_wide <- move_wide
-  }
-
-  out <- data_wide[, c("fileId", "event", "artwork", "trace", "glossar",
-                       "date.start", "date.stop", "timeMs.start",
-                       "timeMs.stop", "duration", "topicNumber", "popup",
-                       "x.start", "y.start", "x.stop", "y.stop",
-                       "distance", "scale.start", "scale.stop",
-                       "scaleSize", "rotation.start", "rotation.stop",
-                       "rotationDegree")]
-  rownames(out) <- NULL
-  out
-}
-
-###########################################################################
-
-# Add case variable
-
-add_case <- function(data, cutoff = 20) {
-# TODO: What is the best choice for the cutoff here?
-
-  data$timediff <- as.numeric(diff(c(data$date.start[1], data$date.start)))
-  data$case <- NA
-  j <- 1
-  pb <- txtProgressBar(min = 0, max = nrow(data), initial = NA, style = 3)
-
-  for (i in seq_len(nrow(data))) {
-    if (data$timediff[i] <= cutoff) {
-      data$case[i] <- j
-    } else {
-      j <- j + 1
-      data$case[i] <- j
-    }
-    setTxtProgressBar(pb, i)
-  }
-  data$timediff <- NULL
-  data
-}
-
-###########################################################################
-
-# Add trace for moves
-
-add_trace_moves <- function(data) {
-
-  pbapply::pboptions(style = 3, char = "=")
-
-  trace_max <- max(data$trace, na.rm = TRUE)
-
-  #subdata_art <- split(data, ~ artwork)
-  subdata_case <- split(data, ~ case)
-
-  #subdata_list <- split(data, ~ artwork + case)
-  # --> does not work with complete data set
-  cat("Splitting data...", "\n")
-  subdata_list <- pbapply::pblapply(subdata_case, split, f = ~artwork)
-  subdata_list <- unlist(subdata_list, recursive = FALSE)
-
-  cat("Adding trace...", "\n")
-  subdata_trace <- pbapply::pblapply(subdata_list,
-        function(x) {
-          trace_max <<- trace_max + 1
-          add_trace_subdata(x, max_trace = trace_max)
-        }
-      )
-
-  out <- dplyr::bind_rows(subdata_trace)
-  out <- out[order(out$fileId.start, out$date.start, out$timeMs.start), ]
-  rownames(out) <- NULL
-
-  # Make trace a consecutive number
-  out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace)))
-  out
-}
-
-
-add_trace_subdata <- function(subdata, max_trace) {
-
-  if (nrow(subdata) != 0) {
-
-    if (length(na.omit(unique(subdata$trace))) == 1) {
-      subdata[subdata$event == "move", "trace"] <- na.omit(unique(subdata$trace))
-    } else if (length(na.omit(unique(subdata$trace))) > 1) {
-      for (i in 1:nrow(subdata)) {
-        if (subdata$event[i] == "move") {
-          if (i == 1) {
-            subdata$trace[i] <- na.omit(unique(subdata$trace))[1]
-          } else {
-            subdata$trace[i] <- subdata$trace[i - 1]
-          }
-        }
-      }
-    } else if (all(is.na(subdata$trace))) {
-      for (i in 1:nrow(subdata)) {
-        subdata$trace[i] <- max_trace
-      }
-    }
-
-  } else {
-    warning("`subdata` has nrow = 0")
-  }
-  subdata
-}
-
-
-###########################################################################
-
-# Create data frame with file names and topics for each artwork
-
-extract_topics <- function(artworks, pattern, path) {
-
-  dat <- NULL
-  file_order <- NULL
-  i <- 1
-
-  for (artwork in artworks) {
-
-    if (length(pattern) == 1) {
-      index_file <- pattern
-    } else {
-      index_file <- pattern[i]
-    }
-
-    fnames <- dir(pattern = paste0(artwork, "_"),
-                  path = paste(path, artwork, sep = "/"))
-    topic <- NULL
-    for (fname in fnames) {
-      suppressWarnings(
-      topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
-        grep("^<card type=",
-          trimws(readLines(paste(path, artwork, fname, sep = "/"))),
-          value = T)))
-      )
-
-    }
-    index <- paste(path, artwork, index_file, sep = "/")
-    suppressWarnings(
-    file_order <- c(file_order, gsub("^<card src=.*/(.*)./>$", "\\1",
-                         grep("^<card src=", trimws(readLines(index)),
-                              value = TRUE)))
-    )
-    in_index <- fnames %in% file_order
-    dat <- rbind(dat, data.frame(artwork, file_name = fnames, in_index, topic))
-    i <- i + 1
-  }
-
-  # take only the ones that are actually displayed and sort in the same order
-  # as indicated in index.html
-  out <- dat[dat$in_index, -3]
-  out <- out[order(file_order, out$file_name), ]
-  rownames(out) <- NULL
-
-  out$index <- unlist(sapply(table(out$artwork), seq_len))
-  out
-}
-
-###########################################################################
-
-# Add topics: file names and topics
-
-add_topic <- function(data, topics) {
-
-  artworks <- unique(data$artwork)
-  tab_art <- lapply(artworks,
-               function(x) names(table(data$topicNumber[data$artwork == x])))
-  names(tab_art) <- artworks
-
-  tab_index <- lapply(tab_art, seq_along)
-
-  dat_split <- split(data, ~ artwork)
-
-  set_label <- function(x) {
-    artwork <- unique(x$artwork)
-    x$topicIndex <- factor(x$topicNumber, labels = tab_index[[artwork]])
-    x
-  }
-
-  dat_label <- lapply(dat_split, set_label)
-
-  set_topic <- function(x) {
-    artwork <- unique(x$artwork)
-    labels_file <- topics[topics$artwork == artwork,
-                          "file_name"][as.numeric(levels(x$topicIndex))]
-    x$topicFile <- as.character(factor(x$topicIndex, labels = labels_file))
-    labels_topic <- topics[topics$artwork == artwork,
-                          "topic"][as.numeric(levels(x$topicIndex))]
-    x$topic <- as.character(factor(x$topicIndex, labels = labels_topic))
-    x
-  }
-
-  dat_topic <- lapply(dat_label, set_topic)
-
-  #out <- do.call(rbind, dat_topic)
-  out <- dplyr::bind_rows(dat_topic)
-  out <- out[order(out$fileId.start, out$date.start, out$timeMs.start), ]
-  rownames(out) <- NULL
-  out
-}
-
-###########################################################################
-
-# Create data frame with information on artworks
-
-extract_artworks <- function(artworks, files = paste0(artworks, ".xml"),
-                             path = path) {
-  out <- NULL
-  i <- 1
-
-  for (artwork in artworks) {
-
-    if (length(files) == 1) {
-      index_file <- files
-    } else {
-      index_file <- files[i]
-    }
-
-    index <- paste(path, artwork, index_file, sep = "/")
-    varnames <- c("artist", "title", "misc", "description")
-    xmllist <- XML::xmlToList(index)$header[varnames]
-
-    if (any(sapply(xmllist, is.null))) {# necessary for missing entries
-      names(xmllist) <- varnames
-      xmllist[which(sapply(xmllist, is.null))] <- NA
-    }
-    # remove ugly quotes
-    xmllist <- lapply(xmllist, function(x) gsub("„|“", "", x))
-    # remove HTML tags
-    xmllist <- lapply(xmllist, function(x) gsub("<br/>", " ", x))
-    xmldat <- as.data.frame(xmllist)
-    xmldat$artwork <- artwork
-    # trim white space from strings
-    xmldat$artist      <- trimws(xmldat$artist)
-    xmldat$title       <- trimws(xmldat$title)
-    xmldat$misc        <- trimws(xmldat$misc)
-    xmldat$description <- trimws(xmldat$description)
-    out <- rbind(out, xmldat)
-    i <- i + 1
-  }
-  out
-}
-
diff --git a/code/03_specs.R b/code/specs.R
similarity index 97%
rename from code/03_specs.R
rename to code/specs.R
index 79de837..860404a 100644
--- a/code/03_specs.R
+++ b/code/specs.R
@@ -2,7 +2,7 @@
 
 library(lubridate)
 
-dat <- read.table("../data/rawdata_logfiles.csv", header = TRUE, sep = ";")
+dat <- read.table("../data/haum/rawdata_logfiles.csv", header = TRUE, sep = ";")
 # dat$event <- factor(dat$event, levels = c("Start Application",
 #                                           "Show Application",
 #                                           "Transform start",
diff --git a/code/visualization.R b/code/visualization.R
new file mode 100644
index 0000000..255ab27
--- /dev/null
+++ b/code/visualization.R
@@ -0,0 +1,26 @@
+setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code/")
+
+devtools::load_all("../../../../software/mtt")
+#library(mtt)
+
+dat <- parse_logfiles("2016", path = "../data/haum/LogFiles/",
+                      save = FALSE)
+datlogs <- create_eventlogs(dat, "../data/haum/ContentEyevisit/eyevisit_cards_light/")
+
+dat001 <- datlogs[which(datlogs$artwork == "001"), ]
+
+index <- as.numeric(as.factor(dat001$trace))
+cc <- sample(colors(), length(unique(dat001$trace)))
+
+plot(y.start ~ x.start, dat001, type = "n", xlab = "x", ylab = "y",
+     xlim = c(0, 3840), ylim = c(0, 2160))
+with(dat001[1:200,], arrows(x.start, y.start, x.stop, y.stop,
+                            length = .07, col = cc[index]))
+
+plot(y.start ~ x.start, dat001, xlab = "x", ylab = "y",
+     xlim = c(0, 3840), ylim = c(0, 2160), pch = 16, col = "gray")
+points(y.start ~ x.start, dat001, xlab = "x", ylab = "y",
+     xlim = c(0, 3840), ylim = c(0, 2160), cex = dat001$scaleSize,
+     col = "blue")
+
+