Optimized close_events() so it runs on the complete data set; uses dplyr at the moment

Nora Wickelmaier 2023-09-16 16:29:24 +02:00
parent 1f55608ebf
commit b242eaddf3
2 changed files with 49 additions and 12 deletions


@@ -469,6 +469,23 @@ point. (Check with PG to make sure.)
I need to get the XML files for "504" and "505" from PM in order to extract
information on them for the metadata.
# Optimizing resources used by the code
After I started trying out the functions on the complete data set, it
became obvious (not surprisingly `:)`) that this will not work --
especially for the move events. The `reshape()` function cannot take a long
data frame with over 6 million entries and convert it into a wide data
frame (at least not on my laptop). The code is supposed to work "out of the
box" for researchers, hence it *should* run on a regular (8 core) laptop.
So I changed the reshaping so that it is done in batches, i.e., on a subset
of the data for every `fileId` separately. This means that events spanning
two raw log files cannot be closed and are removed from the data set. The
function warns about this, but which events are affected is essentially
random, so this does not look like a systematic problem. Another reason why
this is acceptable is that durations cannot be calculated for events across
log files anyway, because the time stamps do not increase systematically
over log files (see above).
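
A minimal sketch of the batch-wise idea with toy data (the column names and
the final filtering are placeholders, not the real `close_events()` logic):

```r
# Toy long data: two log files; the last event only has a start and can
# therefore not be closed within its file
dat <- data.frame(fileId  = c(1, 1, 1, 1, 2, 2, 2),
                  eventId = c(1, 1, 2, 2, 3, 3, 4),
                  time    = c("start", "stop", "start", "stop",
                              "start", "stop", "start"),
                  timeMs  = c(0, 10, 20, 35, 0, 5, 12))

# Reshape each fileId separately instead of the complete data set at once
dat_split <- split(dat, dat$fileId)

dat_wide <- lapply(dat_split, reshape,
                   direction = "wide",
                   idvar     = c("fileId", "eventId"),
                   timevar   = "time")

# dplyr::bind_rows() glues the per-file pieces back together, filling
# columns that are missing in some pieces with NA
dat_wide <- dplyr::bind_rows(dat_wide)

# Unclosed events show up with NA in the stop column and can be removed
# (with a warning in the real function)
dat_wide[!is.na(dat_wide$timeMs.stop), ]
```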
# Reading list
* @Arizmendi2022 [--]


@@ -122,21 +122,25 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
    actions <- c("Transform start", "Transform stop")
    idvar <- c("eventId", "artwork", "glossar")
    drop <- c("popup", "topicNumber", "trace", "event")
    ncol <- 17
  } else if (event == "flipCard") {
    actions <- c("Show Info", "Show Front")
    idvar <- c("trace", "artwork", "glossar")
    drop <- c("popup", "topicNumber", "eventId", "event")
    ncol <- 17
  } else if (event == "openTopic") {
    actions <- c("Artwork/OpenCard", "Artwork/CloseCard")
    idvar <- c("eventId", "trace", "glossar", "artwork", "topicNumber")
    drop <- c("popup", "event")
    ncol <- 19
  } else if (event == "openPopup") {
    actions <- c("ShowPopup", "HidePopup")
    idvar <- c("eventId", "trace", "glossar", "artwork", "popup")
    drop <- c("topicNumber", "event")
    ncol <- 19
    # TODO: Should topicNumber maybe also be filled in for "openPopup"?
  } else {
@@ -149,11 +153,10 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
  subdata <- subdata[order(subdata$artwork, subdata$popup, subdata$date, subdata$timeMs), ]
  subdata$time <- ifelse(subdata$event == actions[1], "start", "stop")
  # Group sizes: number of rows up to and including each stop event,
  # counted from the previous stop; used to assign a running eventId
  num_start <- diff(c(0, which(subdata$event == actions[2])))
  # If the last event is a start event without a stop, it still needs an id
  if (tail(subdata, 1)$time == "start") {
    num_start <- c(num_start, 1)
  }
  subdata$eventId <- rep(seq_along(num_start), num_start)
  # TODO: Needs to be caught in a function
  # --> not anymore - still relevant???
  if (event == "move") {
    subdata <- subdata[!duplicated(subdata[, c("event", "eventId")]), ]
@@ -162,19 +165,36 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
    subdata <- subdata[-(id_rm_stop + 1), ]
  }

  subdata_split <- split(subdata, subdata$fileId)

  suppressWarnings(
    subdata_split_wide <- lapply(subdata_split, reshape,
                                 direction = "wide",
                                 idvar = idvar,
                                 timevar = "time",
                                 drop = drop)
  )
  # suppressWarnings(
  #   data_wide <- reshape(subdata, direction = "wide",
  #                        idvar = idvar,
  #                        timevar = "time",
  #                        drop = drop)
  # )
  # TODO: Suppress warnings? Better with tryCatch()?
  # there is a pathological entry which gets deleted...
  # df[df$trace == 4595, ]
  # --> artwork 046 popup selene.xml gets opened twice
  # TODO: This runs for quite some time
  # --> Is this more efficient with tidyr::pivot_wider?

  # remove per-file chunks with only start or stop events since they do
  # not have all columns
  subdata_split_wide <-
    subdata_split_wide[sapply(subdata_split_wide, ncol) == ncol]

  #data_wide <- do.call(rbind, subdata_split_wide)
  # TODO: This runs quite some time
  # --> There is a more efficient function in dplyr, which would also allow
  #     to keep the file IDs with only start or stop or a single entry...
  data_wide <- dplyr::bind_rows(subdata_split_wide)

  for (d in drop) data_wide[d] <- NA
@@ -194,8 +214,8 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
  data_wide$scaleSize <- data_wide$scale.stop / data_wide$scale.start

  # remove moves without any change
  move_wide <- data_wide[data_wide$distance != 0 &
                         data_wide$rotationDegree != 0 &
                         data_wide$scaleSize != 1, ]

  cat(paste("INFORMATION:", nrow(data_wide) - nrow(move_wide),
            "lines containing move events were removed since they did",
            "\nnot contain any change"), fill = TRUE)