mtt_haum/code/02_preprocessing.R

#' ---
#' title: "Preprocessing log files"
#' author: "Nora Wickelmaier"
#' date: "`r Sys.Date()`"
#' output:
#'   html_document:
#'     toc: true
#'     toc_float: true
#'   pdf_document:
#'     toc: true
#'     number_sections: true
#' geometry: margin = 2.5cm
#' ---

# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")

# LogEntry classes:
#   TRANSFORM_START:    "Transform start" --> "Transformation Start" in Tool
#   TRANSFORM_STOP:     "Transform stop"
#   START_APPLICATION:  "Start Application"
#   SHOW_APPLICATION:   "Show Application"
#   SHOW_INFO:          "Show Info"       --> "Flip Card" in Tool
#   SHOW_FRONT:         "Show Front"
#   SHOW_POPUP:         "ShowPopup"       --> "Show Popup" in Tool
#   HIDE_POPUP:         "HidePopup"
#   ARTWORK:            "Artwork"         --> "Show Topic" in Tool

#' # Read data

dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";",
                   header = TRUE)
dat0$date <- as.POSIXct(dat0$date)  # create date object
dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)

#' # Remove irrelevant events

#' ## Remove Start Application and Show Application

dat <- subset(dat0, !(dat0$event %in% c("Start Application",
                                        "Show Application")))
rownames(dat) <- NULL

#' # Close events

########

#' Do it for Transform events first
dat1 <- dat[dat$event %in% c("Transform start", "Transform stop"), ]
dat1 <- dat1[order(dat1$artwork, dat1$date), ]
rownames(dat1) <- NULL

# Create event ID for closing events
num_start <- diff(c(0, which(dat1$event == "Transform stop")))
dat1$eventid <- rep(seq_along(num_start), num_start)
head(dat1[, c("event", "eventid")], 25)

table(table(dat1$eventid))
#   1     2     3     4     5     6     7     8    10    11
#  70 78435  5153   842   222    66    18    14     3     1
# --> compare to table(num_start)!

# Find out how often "Transform stop" follows each other
num_stop <- c(diff(c(0, which(dat1$event == "Transform start"))))
table(num_stop)

dat1$eventrep <- rep(num_start, num_start)
dat1$dupl <- duplicated(dat1[, c("event", "eventid")])                    # keep first
dat1$dupl <- duplicated(dat1[, c("event", "eventid")], fromLast = TRUE)   # keep last
dat1[dat1$eventrep == 10, ]

dat1$dupl <- NULL
dat1$eventrep <- NULL


# remove duplicated "Transform start" events
dat1 <- dat1[!duplicated(dat1[, c("event", "eventid")]), ]

# remove duplicated "Transform stop" events
id_stop  <- which(dat1$event == "Transform stop")
id_rm_stop <- id_stop[diff(id_stop) == 1]

dat1 <- dat1[-(id_rm_stop + 1), ]

# transform to wide data format
dat1$time <- ifelse(dat1$event == "Transform start", "start", "stop")

trans_wide <- reshape(dat1, direction = "wide",
                      idvar = c("eventid", "artwork", "glossar"),
                      timevar = "time",
                      drop = c("popup", "card", "event")
)
# TODO: This runs for quite some time
# --> Is this more efficient with tidyr::pivot_wider?

# --> when fileid is part of the reshape, it does not work correctly, since
# we sometimes have a start - stop event that is recorded in two separate
# log files, BUT: after finding out, that `time_ms` changes for each log
# file, I want to exclude those cases, so `fileid` has to be included!!!

# check how often an eventid is associated with two fileids
nrow(subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop))

# exclude from data set ??
# trans_wide <- subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop)

# which(is.na(trans_wide$date.start))

trans_wide$event <- "move"
trans_wide$eventid <- NULL

rownames(trans_wide) <- NULL

trans_wide$duration <- trans_wide$time_ms.stop - trans_wide$time_ms.start
#trans_wide$duration <- trans_wide$date.stop - trans_wide$date.start
# only seconds - not fine grained enough
trans_wide$distance <- apply(
      trans_wide[, c("x.start", "y.start", "x.stop", "y.stop")], 1,
      function(x) dist(matrix(x, 2, 2, byrow = TRUE)))
trans_wide$rotationDegree <- trans_wide$rotation.stop -
  trans_wide$rotation.start
trans_wide$scaleSize <- trans_wide$scale.stop / trans_wide$scale.start

trans_wide$trace <- NA
trans_wide$card  <- NA
trans_wide$popup <- NA

dat_trans <- trans_wide[trans_wide$distance != 0 &
                        trans_wide$rotationDegree != 0 &
                        trans_wide$scaleSize != 1,
                      c("fileid.start", "fileid.stop", "event", "artwork",
                        "trace", "glossar", "date.start", "date.stop",
                        "time_ms.start", "time_ms.stop", "duration",
                        "card", "popup", "x.start", "y.start", "x.stop",
                        "y.stop", "distance", "scale.start", "scale.stop",
                        "scaleSize", "rotation.start", "rotation.stop",
                        "rotationDegree")]
1 - nrow(dat_trans) / nrow(trans_wide)
# removes almost 2/3 of the data (for small data set)

rm(id_rm_stop, id_stop, trans_wide, num_start, num_stop)

summary(dat_trans)


#' # Close other events

dat2 <- dat[!dat$event %in% c("Transform start", "Transform stop"), ]
# dat2$x <- NULL
# dat2$y <- NULL
# dat2$scale <- NULL
# dat2$rotation <- NULL
rownames(dat2) <- NULL

# Create event ID for closing events
# num_start <- diff(c(0, which(dat2$event == "Show Front")))
# dat2$trace <- rep(seq_along(num_start), num_start)
# head(dat2[, c("artwork", "event", "trace")], 50)
# --> does not work because of glossar entries... can't sort by artwork


dat2$trace <- NA
last_event <- dat2$event[1]
aws <- unique(dat2$artwork)[unique(dat2$artwork) != "glossar"]
#
for (art in aws) {              # select artwork

  for (i in 1:nrow(dat2)) {      # go through rows

    if (last_event == "Show Info" & dat2$artwork[i] == art) {
      dat2$trace[i] <- i
      j <- i

    } else if (last_event == "Show Front" & dat2$artwork[i] == art) {
      dat2$trace[i] <- j

    } else if (!(last_event %in% c("Show Info", "Show Front")) &
               dat2$artwork[i] == art) {
      dat2$trace[i] <- j
    }

    if (i <= nrow(dat2)) {
      last_event <- dat2$event[i + 1]
    }
  }
}

head(dat2[, c("artwork", "event", "trace")], 50)
tail(dat2[, c("artwork", "event", "trace")], 50)
# TODO: How to handle popups from glossar???

rm(aws, i, j, last_event, art)

## Fix glossar entries

### Find artwork for glossar entry

glossar_files <- unique(dat2[dat2$artwork == "glossar", "popup"])

# load lookup table for artworks and glossar files
load("../data/glossar_dict.RData")
lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ]

head(dat2[, c("artwork", "event", "popup", "trace")], 20)


for (file in lut$glossar_file) {

  artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])

  for (i in seq_len(nrow(dat2))) {

    if (dat2$event[i] == "Show Info") {

      current_artwork <- dat2[i, "artwork"]
      j <- i
      k <- i

    } else {

      current_artwork <- current_artwork

    }

    if (dat2$event[i] == "Show Front" & dat2$artwork[i] == current_artwork) {
    # make sure artwork has not been closed, yet!
      k <- i
    }

    if (dat2$artwork[i] == "glossar" &
        (current_artwork %in% artwork_list) &
        dat2$popup[i] == file & (j-k == 0)) {

      dat2[i, "trace"] <- dat2[j, "trace"]
      dat2[i, "artwork"] <- current_artwork

    }
  }
}

# --> finds about half of the glossar entries for the small data set...
table(is.na(dat2[dat2$glossar == 1, "trace"]))

# REMEMBER: It can never bo 100% correct, since it is always possible that
# several cards are open and that they link to the same glossar entry

# How many glossar_files are only associated with one artwork?
lut[sapply(lut$artwork, length) == 1, "glossar_file"]

# TODO: Fill in the ones that are associated with one artwork
# --> Can't come up with something -- maybe ask AK???

single <- lut[sapply(lut$artwork, length) == 1, "glossar_file"]
tmp <- subset(dat2, is.na(dat2$trace))$popup
inside <- unique(tmp[tmp %in% lut[sapply(lut$artwork, length) == 1, "glossar_file"]])
single_art <- unlist(lut[lut$glossar_file %in% inside, "artwork"])
tmp_lut <- data.frame(glossar_file = sort(inside), artwork = single_art)


dat2[dat2$glossar == 1, c("artwork", "popup", "glossar", "trace")]

for (file in tmp_lut$glossar_file) {

  for (i in seq_len(nrow(dat2))) {

    if (dat2$event[i] == "Artwork/OpenCard" & dat2$artwork[i] %in% tmp_lut$artwork) {

      current_artwork <- dat2[i, "artwork"]
      j <- i

    }

    if (dat2$artwork[i] == "glossar" &
        dat2$popup[i] == file) {

      dat2[i, "trace"] <- dat2[j, "trace"]
      dat2[i, "artwork"] <- current_artwork

    }
  }
}


dat2[14110:14130, ]

# TODO: Integrate for loop into for loop above

# TODO: "glossar" entry should be changed to the corresponding artwork

# TODO: For now: Exclude not matched glossar entries

df <- subset(dat2, !is.na(dat2$trace))
df <- df[order(df$trace), ]
rownames(df) <- NULL

rm(lut, current_artwork, file, glossar_dict, i, j, k, artwork_list,
   glossar_files)

#' ## Close flipCard

dat3 <- subset(df, df$event %in% c("Show Info", "Show Front"))

dat3$time <- ifelse(dat3$event == "Show Info", "start", "stop")

flipCard_wide <- reshape(dat3, direction = "wide",
                        idvar = c("trace", "artwork", "glossar"),
                        timevar = "time",
                        drop = c("popup", "card"))
flipCard_wide$event <- "flipCard"
flipCard_wide$duration <- flipCard_wide$time_ms.stop -
  flipCard_wide$time_ms.start


flipCard_wide$card            <- NA
flipCard_wide$popup           <- NA
flipCard_wide$x.start         <- NA
flipCard_wide$x.stop          <- NA
flipCard_wide$y.start         <- NA
flipCard_wide$y.stop          <- NA
flipCard_wide$distance        <- NA
flipCard_wide$scale.start     <- NA
flipCard_wide$scale.stop      <- NA
flipCard_wide$scaleSize       <- NA
flipCard_wide$rotation.start  <- NA
flipCard_wide$rotation.stop   <- NA
flipCard_wide$rotationDegree  <- NA

dat_flipCard <- flipCard_wide[, c("fileid.start", "fileid.stop", "event",
                                  "artwork", "trace", "glossar",
                                  "date.start", "date.stop",
                                  "time_ms.start", "time_ms.stop",
                                  "duration", "card", "popup", "x.start",
                                  "y.start", "x.stop", "y.stop",
                                  "distance", "scale.start", "scale.stop",
                                  "scaleSize", "rotation.start",
                                  "rotation.stop", "rotationDegree")]

rm(flipCard_wide)

#' ## Close openTopic

dat4 <- subset(df, df$event %in% c("Artwork/OpenCard", "Artwork/CloseCard"))
dat4 <- dat4[order(dat4$artwork, dat4$date), ]
rownames(dat4) <- NULL

num_start <- diff(c(0, which(dat4$event == "Artwork/CloseCard")))
dat4$eventid <- rep(seq_along(num_start), num_start)

dat4$time <- ifelse(dat4$event == "Artwork/OpenCard", "start", "stop")

openTopic_wide <- reshape(dat4, direction = "wide",
                         idvar = c("eventid", "trace", "glossar", "artwork", "card"),
                         timevar = "time", drop = "popup")
openTopic_wide$event <- "openTopic"
openTopic_wide$duration <- openTopic_wide$time_ms.stop -
  openTopic_wide$time_ms.start


openTopic_wide$popup           <- NA
openTopic_wide$x.start         <- NA
openTopic_wide$x.stop          <- NA
openTopic_wide$y.start         <- NA
openTopic_wide$y.stop          <- NA
openTopic_wide$distance        <- NA
openTopic_wide$scale.start     <- NA
openTopic_wide$scale.stop      <- NA
openTopic_wide$scaleSize       <- NA
openTopic_wide$rotation.start  <- NA
openTopic_wide$rotation.stop   <- NA
openTopic_wide$rotationDegree  <- NA

dat_openTopic <- openTopic_wide[, c("fileid.start", "fileid.stop", "event",
                                    "artwork", "trace", "glossar",
                                    "date.start", "date.stop",
                                    "time_ms.start", "time_ms.stop",
                                    "duration", "card", "popup", "x.start",
                                    "y.start", "x.stop", "y.stop",
                                    "distance", "scale.start",
                                    "scale.stop", "scaleSize",
                                    "rotation.start", "rotation.stop",
                                    "rotationDegree")]
# TODO: card should have a unique identifier for each artwork

rm(openTopic_wide, num_start)

#' ## close openPopup
dat5 <- subset(df, df$event %in% c("ShowPopup", "HidePopup"))
dat5 <- dat5[order(dat5$artwork, dat5$popup, dat5$date), ]
rownames(dat5) <- NULL

num_start <- diff(c(0, which(dat5$event == "HidePopup")))
# last event is "ShowPopup"! Needs to be fixed
# num_start <- c(num_start, 1)
# TODO: Needs to be caught in a function --> not anymore - still relevant???

dat5$eventid <- rep(seq_along(num_start), num_start)

dat5$time <- ifelse(dat5$event == "ShowPopup", "start", "stop")

openPopup_wide <- reshape(dat5, direction = "wide",
                         idvar = c("eventid", "trace", "glossar", "artwork", "popup"),
                         timevar = "time", drop = "card")
# there is a pathological entry which gets deleted...
# df[df$trace == 4595, ]
# --> artwork 046 popup selene.xml gets opened twice


openPopup_wide$event <- "openPopup"
openPopup_wide$duration <- openPopup_wide$time_ms.stop -
  openPopup_wide$time_ms.start

openPopup_wide$card            <- NA
openPopup_wide$x.start         <- NA
openPopup_wide$x.stop          <- NA
openPopup_wide$y.start         <- NA
openPopup_wide$y.stop          <- NA
openPopup_wide$distance        <- NA
openPopup_wide$scale.start     <- NA
openPopup_wide$scale.stop      <- NA
openPopup_wide$scaleSize       <- NA
openPopup_wide$rotation.start  <- NA
openPopup_wide$rotation.stop   <- NA
openPopup_wide$rotationDegree  <- NA

dat_openPopup <- openPopup_wide[, c("fileid.start", "fileid.stop", "event",
                                    "artwork", "trace", "glossar",
                                    "date.start", "date.stop",
                                    "time_ms.start", "time_ms.stop",
                                    "duration", "card", "popup", "x.start",
                                    "y.start", "x.stop", "y.stop",
                                    "distance", "scale.start",
                                    "scale.stop", "scaleSize",
                                    "rotation.start", "rotation.stop",
                                    "rotationDegree")]
rm(num_start, openPopup_wide)


# Merge all
# system.time({
# dat_all <- merge(dat_trans, dat_flipCard, all = TRUE)
# dat_all <- merge(dat_all, dat_openTopic, all = TRUE)
# dat_all <- merge(dat_all, dat_openPopup, all = TRUE)
# })
#
# # check
# nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) +
#                   nrow(dat_openTopic) + nrow(dat_openPopup))
#
# dat_all <- dat_all[order(dat_all$date.start), ]
# rownames(dat_all) <- NULL
#

# TODO: from here on NA... WHY??
# dat_all[19426:19435, ]

# TODO: Should card maybe also be filled in for "openPopup"?

# dat_all2 <- dplyr::full_join(dat_trans, dat_flipCard)
# dat_all2 <- dplyr::full_join(dat_all, dat_openTopic)
# dat_all2 <- dplyr::full_join(dat_all, dat_openPopup)
#
# nrow(dat_all2) == (nrow(dat_trans) + nrow(dat_flipCard) +
#                   nrow(dat_openTopic) + nrow(dat_openPopup))
#
# dat_all2 <- dat_all2[order(dat_all2$date.start), ]
# rownames(dat_all2) <- NULL
# TODO: --> same result - but faster. Need it?
# --> Would hate to depend on dplyr...

#' ## Use `rbind()` instead...
# --> unbeatable in terms of time!

dat_all <- rbind(dat_trans, dat_flipCard, dat_openTopic, dat_openPopup)

# check
nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) +
                  nrow(dat_openTopic) + nrow(dat_openPopup))

# remove all events that do not have a `date.start`
dim(dat_all[is.na(dat_all$date.start), ])
dat_all <- dat_all[!is.na(dat_all$date.start), ]
# There is only a `date.stop`, when event is not properly closed, see here:
df[df$trace == 1843, ]
dat_openPopup[dat_openPopup$trace == 1843, ]
## --> still 50 (small data set) left, and some really do not seem to be
## opened! Must be a log error
# --> others should be closed!
dat[31000:31019,]     # this one e.g.
# --> Actually NOT! card gets flipped before! Again - log error!


# Remove durations when event spans more than one log file, since they are
# not interpretable
dat_all[which(dat_all$fileid.start != dat_all$fileid.stop), "duration"] <- NA


# sort by `start.date`
dat_all <- dat_all[order(dat_all$date.start), ]
rownames(dat_all) <- NULL

ind <- rowSums(is.na(dat_all)) == ncol(dat_all)
any(ind)
dat_all[ind, ]
# --> No rows with only NA, as it should be.

summary(dat_all)    # OK, this actually makes a lot of sense :)

#' ## Create case variable

#dat_all$timediff <- as.numeric(dat_all$date.stop - dat_all$date.start)

dat_all$timediff <- as.numeric(diff(c(dat_all$date.start[1], dat_all$date.start)))

hist(dat_all$timediff[dat_all$timediff < 40], breaks = 50)


# TODO: What is the best choice for the cutoff here? I took 20 secs for now
dat_all$case <- NA
j <- 1

for (i in seq_len(nrow(dat_all))) {
  if (dat_all$timediff[i] < 21) {
    dat_all$case[i] <- j
  } else {
    j <- j + 1
    dat_all$case[i] <- j
  }
}

head(dat_all[, c("event", "artwork", "trace", "date.start", "timediff", "case")], 100)

#' ## Add event ID

dat_all$eventid <- seq_len(nrow(dat_all))

dat_all <- dat_all[, c("fileid.start", "fileid.stop", "eventid", "case",
                       "trace", "glossar", "event", "artwork",
                       "date.start", "date.stop", "time_ms.start",
                       "time_ms.stop", "duration", "card", "popup",
                       "x.start", "y.start", "x.stop", "y.stop",
                       "distance", "scale.start", "scale.stop",
                       "scaleSize", "rotation.start", "rotation.stop",
                       "rotationDegree")]

#' ## Add `trace` numbers for `move` events

# when case and artwork are identical and there is only 1 trace value
# --> assign it to all `move` events for that case and artwork
# when case and artwork are identical and there is more than 1 trace value
# --> assign the `trace` value that was right before this `move` event
# (could, of course, also be after)

cases <- unique(dat_all$case)
aws <- unique(dat_all$artwork)[unique(dat_all$artwork) != "glossar"]
max_trace <- max(dat_all$trace, na.rm = TRUE) + 1
out <- NULL

for (case in cases) {
  for (art in aws) {
    tmp <- dat_all[dat_all$case == case & dat_all$artwork == art, ]
    if (nrow(tmp) != 0) {

      if (length(na.omit(unique(tmp$trace))) == 1) {
        tmp[tmp$event == "move", "trace"] <- na.omit(unique(tmp$trace))
      } else if (length(na.omit(unique(tmp$trace))) > 1) {
        for (i in 1:nrow(tmp)) {
          if (tmp$event[i] == "move") {
            if (i == 1) {
              tmp$trace[i] <- na.omit(unique(tmp$trace))[1]
            } else {
              tmp$trace[i] <- tmp$trace[i - 1]
            }
          }
        }
      } else if (all(is.na(tmp$trace))) {
        for (i in 1:nrow(tmp)) {
          if (tmp$event[i] == "move") {
            tmp$trace[i] <- max_trace
          }
        }
      }
      max_trace <- max_trace + 1
    }
    if (nrow(tmp) > 0) {
      #print(tmp[, c("case", "event", "trace", "artwork")])
      out <- rbind(out, tmp)
    }
  }
}

# TODO: Get rid of the loops
# --> This takes forever...

#head(out[, c("time_ms.start", "case", "trace", "event", "artwork")], 55)

#head(dat_all[dat_all$artwork %in% "501", c("time_ms.start", "case", "trace", "event", "artwork")], 50)

# identical(dat_all[which(!dat_all$eventid %in% out$eventid), ],
#           dat_all[dat_all$artwork == "glossar", ])
# --> TRUE

# put glossar events back in

#dat_all <- rbind(out, dat_all[dat_all$artwork == "glossar", ])
out <- out[order(out$date.start), ]
rownames(out) <- NULL

# Make `trace` a consecutive number
out$trace2 <- as.numeric(factor(out$trace, levels = unique(out$trace)))

#head(out[, c("trace", "trace2")], 50)

#' # Export data

write.table(out, "../data/event_logfiles.csv",
            sep = ";", quote = FALSE, row.names = FALSE)

# Is `artwork` my case? Or `artwork` per day? Or `artwork` per some other
# unit??? Maybe look at differences between timestamps separately for
# `artwork`? And identify "new observational unit" this way?
#
# Definition: (???)
# 1. Touching a new `artwork` corresponds to "observational unit change"
# 2. Time interval of XX min within one `artwork` on the same day
#    corresponds to "observational unit change"

# Split data frame in list of data frame which all correspond to one
# artwork
# dat_art <- split(dat, dat$artwork)

# TODO: Write function for closing events