Optimized close_events() so it runs on complete data set, uses dplyr at the moment

This commit is contained in:
Nora Wickelmaier 2023-09-16 16:29:24 +02:00
parent 1f55608ebf
commit b242eaddf3
2 changed files with 49 additions and 12 deletions

View File

@ -469,6 +469,23 @@ point. (Check with PG to make sure.)
I need to get the XML files for "504" and "505" from PM in order to extract I need to get the XML files for "504" and "505" from PM in order to extract
information on them for the metadata. information on them for the metadata.
# Optimizing resources used by the code
After I started trying out the functions on the complete data set, it
became obvious (not surprisingly `:)`) that this will not work --
especially for the move events. The reshape function cannot take a long
data frame with over 6 Million entries and convert it into a wide data
frame (at least not on my laptop). The code is supposed to work "out of the
box" for researchers, hence it *should* run on a regular (8 core) laptop.
So, I changed the reshaping so that it is done in batches on subsets of the
data for every `fileId` separately. This means that events that span over
two raw log files cannot be closed and will then be removed from the data
set. The functions warns about this, but it is a random process getting rid
of these data and seems therefore not like a systematic problem. Another
reason why this is not bad, is that durations cannot be calculated for
events across log files, because the time stamps do not increase over
systematically over log files (see above).
# Reading list # Reading list
* @Arizmendi2022 [--] * @Arizmendi2022 [--]

View File

@ -122,21 +122,25 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
actions <- c("Transform start", "Transform stop") actions <- c("Transform start", "Transform stop")
idvar <- c("eventId", "artwork", "glossar") idvar <- c("eventId", "artwork", "glossar")
drop <- c("popup", "topicNumber", "trace", "event") drop <- c("popup", "topicNumber", "trace", "event")
ncol <- 17
} else if (event == "flipCard") { } else if (event == "flipCard") {
actions <- c("Show Info", "Show Front") actions <- c("Show Info", "Show Front")
idvar <- c("trace", "artwork", "glossar") idvar <- c("trace", "artwork", "glossar")
drop <- c("popup", "topicNumber", "eventId", "event") drop <- c("popup", "topicNumber", "eventId", "event")
ncol <- 17
} else if (event == "openTopic") { } else if (event == "openTopic") {
actions <- c("Artwork/OpenCard", "Artwork/CloseCard") actions <- c("Artwork/OpenCard", "Artwork/CloseCard")
idvar <- c("eventId", "trace", "glossar", "artwork", "topicNumber") idvar <- c("eventId", "trace", "glossar", "artwork", "topicNumber")
drop <- c("popup", "event") drop <- c("popup", "event")
ncol <- 19
} else if (event == "openPopup") { } else if (event == "openPopup") {
actions <- c("ShowPopup", "HidePopup") actions <- c("ShowPopup", "HidePopup")
idvar <- c("eventId", "trace", "glossar", "artwork", "popup") idvar <- c("eventId", "trace", "glossar", "artwork", "popup")
drop <- c("topicNumber", "event") drop <- c("topicNumber", "event")
ncol <- 19
# TODO: Should topicNumber maybe also be filled in for "openPopup"? # TODO: Should topicNumber maybe also be filled in for "openPopup"?
} else { } else {
@ -149,11 +153,10 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
subdata <- subdata[order(subdata$artwork, subdata$popup, subdata$date, subdata$timeMs), ] subdata <- subdata[order(subdata$artwork, subdata$popup, subdata$date, subdata$timeMs), ]
subdata$time <- ifelse(subdata$event == actions[1], "start", "stop") subdata$time <- ifelse(subdata$event == actions[1], "start", "stop")
num_start <- diff(c(0, which(subdata$event == actions[2]))) num_start <- diff(c(0, which(subdata$event == actions[2])))
if (tail(subdata, 1)$time == "start") {
num_start <- c(num_start, 1)
}
subdata$eventId <- rep(seq_along(num_start), num_start) subdata$eventId <- rep(seq_along(num_start), num_start)
# If last event is start event, it needs to be fixed:
# num_start <- c(num_start, 1)
# TODO: Needs to be caught in a function
# --> not anymore - still relevant???
if (event == "move") { if (event == "move") {
subdata <- subdata[!duplicated(subdata[, c("event", "eventId")]), ] subdata <- subdata[!duplicated(subdata[, c("event", "eventId")]), ]
@ -162,19 +165,36 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
subdata <- subdata[-(id_rm_stop + 1), ] subdata <- subdata[-(id_rm_stop + 1), ]
} }
subdata_split <- split(subdata, subdata$fileId)
suppressWarnings( suppressWarnings(
data_wide <- reshape(subdata, direction = "wide", subdata_split_wide <- lapply(subdata_split, reshape,
direction = "wide",
idvar = idvar, idvar = idvar,
timevar = "time", timevar = "time",
drop = drop) drop = drop)
) )
# suppressWarnings(
# data_wide <- reshape(subdata, direction = "wide",
# idvar = idvar,
# timevar = "time",
# drop = drop)
# )
# TODO: Suppress warnings? Better with tryCatch()? # TODO: Suppress warnings? Better with tryCatch()?
# there is a pathological entry which gets deleted... # there is a pathological entry which gets deleted...
# df[df$trace == 4595, ] # df[df$trace == 4595, ]
# --> artwork 046 popup selene.xml gets opened twice # --> artwork 046 popup selene.xml gets opened twice
# TODO: This runs for quite some time # remove entries with only start or stop events since they do not have
# --> Is this more efficient with tidyr::pivot_wider? # all columns
subdata_split_wide <-
subdata_split_wide[-which(sapply(subdata_split_wide, ncol) != ncol)]
#data_wide <- do.call(rbind, subdata_split_wide)
# TODO: This runs quite some time
# --> There is a more efficient function in dplyr, which would also allow
# to keep the file IDs with only start or stop or a single entry...
data_wide <- dplyr::bind_rows(subdata_split_wide)
for (d in drop) data_wide[d] <- NA for (d in drop) data_wide[d] <- NA