Optimized close_events() so it runs on complete data set, uses dplyr at the moment

2023-09-16 16:29:24 +02:00 · 2023-09-16 16:29:24 +02:00 · b242eaddf3
commit b242eaddf3
parent 1f55608ebf
2 changed files with 49 additions and 12 deletions
--- a/README.Rmd
+++ b/README.Rmd
@ -469,6 +469,23 @@ point. (Check with PG to make sure.)
 I need to get the XML files for "504" and "505" from PM in order to extract
 information on them for the metadata.
 # Optimizing resources used by the code
 After I started trying out the functions on the complete data set, it
 became obvious (not surprisingly `:)`) that this will not work --
 especially for the move events. The reshape function cannot take a long
 data frame with over 6 Million entries and convert it into a wide data
 frame (at least not on my laptop). The code is supposed to work "out of the
 box" for researchers, hence it *should* run on a regular (8 core) laptop.
 So, I changed the reshaping so that it is done in batches on subsets of the
 data for every `fileId` separately. This means that events that span over
 two raw log files cannot be closed and will then be removed from the data
 set. The functions warns about this, but it is a random process getting rid
 of these data and seems therefore not like a systematic problem. Another
 reason why this is not bad, is that durations cannot be calculated for
 events across log files, because the time stamps do not increase over
 systematically over log files (see above).
 # Reading list
 * @Arizmendi2022 [--]
--- a/code/functions.R
+++ b/code/functions.R
@ -122,21 +122,25 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
    actions <- c("Transform start", "Transform stop")
    idvar   <- c("eventId", "artwork", "glossar")
    drop    <- c("popup", "topicNumber", "trace", "event")
    ncol    <- 17
  } else if (event == "flipCard") {
    actions <- c("Show Info", "Show Front")
    idvar   <- c("trace", "artwork", "glossar")
    drop    <- c("popup", "topicNumber", "eventId", "event")
    ncol    <- 17
  } else if (event == "openTopic") {
    actions <- c("Artwork/OpenCard", "Artwork/CloseCard")
    idvar   <- c("eventId", "trace", "glossar", "artwork", "topicNumber")
    drop    <- c("popup", "event")
    ncol    <- 19
  } else if (event == "openPopup") {
    actions <- c("ShowPopup", "HidePopup")
    idvar   <- c("eventId", "trace", "glossar", "artwork", "popup")
    drop    <- c("topicNumber", "event")
    ncol    <- 19
 # TODO: Should topicNumber maybe also be filled in for "openPopup"?
  } else {
@ -149,11 +153,10 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
  subdata <- subdata[order(subdata$artwork, subdata$popup, subdata$date, subdata$timeMs), ]
  subdata$time <- ifelse(subdata$event == actions[1], "start", "stop")
  num_start <- diff(c(0, which(subdata$event == actions[2])))
  if (tail(subdata, 1)$time == "start") {
    num_start <- c(num_start, 1)
  }
  subdata$eventId <- rep(seq_along(num_start), num_start)
 # If last event is start event, it needs to be fixed:
 # num_start <- c(num_start, 1)
 # TODO: Needs to be caught in a function
 # --> not anymore - still relevant???
  if (event == "move") {
    subdata    <- subdata[!duplicated(subdata[, c("event", "eventId")]), ]
@ -162,19 +165,36 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
    subdata    <- subdata[-(id_rm_stop + 1), ]
  }
  subdata_split <- split(subdata, subdata$fileId)
  suppressWarnings(
-    data_wide <- reshape(subdata, direction = "wide",
+  subdata_split_wide <- lapply(subdata_split, reshape,
                               direction = "wide",
                               idvar = idvar,
                               timevar = "time",
                               drop = drop)
  )
 #  suppressWarnings(
 #    data_wide <- reshape(subdata, direction = "wide",
 #                         idvar = idvar,
 #                         timevar = "time",
 #                         drop = drop)
 #  )
 # TODO: Suppress warnings? Better with tryCatch()?
 # there is a pathological entry which gets deleted...
 # df[df$trace == 4595, ]
 # --> artwork 046 popup selene.xml gets opened twice
-# TODO: This runs for quite some time
+  # remove entries with only start or stop events since they do not have
-# --> Is this more efficient with tidyr::pivot_wider?
+  # all columns
  subdata_split_wide <-
    subdata_split_wide[-which(sapply(subdata_split_wide, ncol) != ncol)]
  #data_wide <- do.call(rbind, subdata_split_wide)
 # TODO: This runs quite some time
 # --> There is a more efficient function in dplyr, which would also allow
 # to keep the file IDs with only start or stop or a single entry...
  data_wide <- dplyr::bind_rows(subdata_split_wide)
  for (d in drop) data_wide[d] <- NA