Optimized close_events() so it runs on complete data set, uses dplyr at the moment
This commit is contained in:
parent
1f55608ebf
commit
b242eaddf3
17
README.Rmd
17
README.Rmd
@ -469,6 +469,23 @@ point. (Check with PG to make sure.)
|
|||||||
I need to get the XML files for "504" and "505" from PM in order to extract
|
I need to get the XML files for "504" and "505" from PM in order to extract
|
||||||
information on them for the metadata.
|
information on them for the metadata.
|
||||||
|
|
||||||
|
# Optimizing resources used by the code
|
||||||
|
|
||||||
|
After I started trying out the functions on the complete data set, it
|
||||||
|
became obvious (not surprisingly `:)`) that this will not work --
|
||||||
|
especially for the move events. The reshape function cannot take a long
|
||||||
|
data frame with over 6 Million entries and convert it into a wide data
|
||||||
|
frame (at least not on my laptop). The code is supposed to work "out of the
|
||||||
|
box" for researchers, hence it *should* run on a regular (8 core) laptop.
|
||||||
|
So, I changed the reshaping so that it is done in batches on subsets of the
|
||||||
|
data for every `fileId` separately. This means that events that span over
|
||||||
|
two raw log files cannot be closed and will then be removed from the data
|
||||||
|
set. The functions warns about this, but it is a random process getting rid
|
||||||
|
of these data and seems therefore not like a systematic problem. Another
|
||||||
|
reason why this is not bad, is that durations cannot be calculated for
|
||||||
|
events across log files, because the time stamps do not increase over
|
||||||
|
systematically over log files (see above).
|
||||||
|
|
||||||
# Reading list
|
# Reading list
|
||||||
|
|
||||||
* @Arizmendi2022 [--]
|
* @Arizmendi2022 [--]
|
||||||
|
@ -122,21 +122,25 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
|
|||||||
actions <- c("Transform start", "Transform stop")
|
actions <- c("Transform start", "Transform stop")
|
||||||
idvar <- c("eventId", "artwork", "glossar")
|
idvar <- c("eventId", "artwork", "glossar")
|
||||||
drop <- c("popup", "topicNumber", "trace", "event")
|
drop <- c("popup", "topicNumber", "trace", "event")
|
||||||
|
ncol <- 17
|
||||||
|
|
||||||
} else if (event == "flipCard") {
|
} else if (event == "flipCard") {
|
||||||
actions <- c("Show Info", "Show Front")
|
actions <- c("Show Info", "Show Front")
|
||||||
idvar <- c("trace", "artwork", "glossar")
|
idvar <- c("trace", "artwork", "glossar")
|
||||||
drop <- c("popup", "topicNumber", "eventId", "event")
|
drop <- c("popup", "topicNumber", "eventId", "event")
|
||||||
|
ncol <- 17
|
||||||
|
|
||||||
} else if (event == "openTopic") {
|
} else if (event == "openTopic") {
|
||||||
actions <- c("Artwork/OpenCard", "Artwork/CloseCard")
|
actions <- c("Artwork/OpenCard", "Artwork/CloseCard")
|
||||||
idvar <- c("eventId", "trace", "glossar", "artwork", "topicNumber")
|
idvar <- c("eventId", "trace", "glossar", "artwork", "topicNumber")
|
||||||
drop <- c("popup", "event")
|
drop <- c("popup", "event")
|
||||||
|
ncol <- 19
|
||||||
|
|
||||||
} else if (event == "openPopup") {
|
} else if (event == "openPopup") {
|
||||||
actions <- c("ShowPopup", "HidePopup")
|
actions <- c("ShowPopup", "HidePopup")
|
||||||
idvar <- c("eventId", "trace", "glossar", "artwork", "popup")
|
idvar <- c("eventId", "trace", "glossar", "artwork", "popup")
|
||||||
drop <- c("topicNumber", "event")
|
drop <- c("topicNumber", "event")
|
||||||
|
ncol <- 19
|
||||||
# TODO: Should topicNumber maybe also be filled in for "openPopup"?
|
# TODO: Should topicNumber maybe also be filled in for "openPopup"?
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
@ -149,11 +153,10 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
|
|||||||
subdata <- subdata[order(subdata$artwork, subdata$popup, subdata$date, subdata$timeMs), ]
|
subdata <- subdata[order(subdata$artwork, subdata$popup, subdata$date, subdata$timeMs), ]
|
||||||
subdata$time <- ifelse(subdata$event == actions[1], "start", "stop")
|
subdata$time <- ifelse(subdata$event == actions[1], "start", "stop")
|
||||||
num_start <- diff(c(0, which(subdata$event == actions[2])))
|
num_start <- diff(c(0, which(subdata$event == actions[2])))
|
||||||
|
if (tail(subdata, 1)$time == "start") {
|
||||||
|
num_start <- c(num_start, 1)
|
||||||
|
}
|
||||||
subdata$eventId <- rep(seq_along(num_start), num_start)
|
subdata$eventId <- rep(seq_along(num_start), num_start)
|
||||||
# If last event is start event, it needs to be fixed:
|
|
||||||
# num_start <- c(num_start, 1)
|
|
||||||
# TODO: Needs to be caught in a function
|
|
||||||
# --> not anymore - still relevant???
|
|
||||||
|
|
||||||
if (event == "move") {
|
if (event == "move") {
|
||||||
subdata <- subdata[!duplicated(subdata[, c("event", "eventId")]), ]
|
subdata <- subdata[!duplicated(subdata[, c("event", "eventId")]), ]
|
||||||
@ -162,19 +165,36 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
|
|||||||
subdata <- subdata[-(id_rm_stop + 1), ]
|
subdata <- subdata[-(id_rm_stop + 1), ]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
subdata_split <- split(subdata, subdata$fileId)
|
||||||
|
|
||||||
suppressWarnings(
|
suppressWarnings(
|
||||||
data_wide <- reshape(subdata, direction = "wide",
|
subdata_split_wide <- lapply(subdata_split, reshape,
|
||||||
|
direction = "wide",
|
||||||
idvar = idvar,
|
idvar = idvar,
|
||||||
timevar = "time",
|
timevar = "time",
|
||||||
drop = drop)
|
drop = drop)
|
||||||
)
|
)
|
||||||
|
# suppressWarnings(
|
||||||
|
# data_wide <- reshape(subdata, direction = "wide",
|
||||||
|
# idvar = idvar,
|
||||||
|
# timevar = "time",
|
||||||
|
# drop = drop)
|
||||||
|
# )
|
||||||
# TODO: Suppress warnings? Better with tryCatch()?
|
# TODO: Suppress warnings? Better with tryCatch()?
|
||||||
# there is a pathological entry which gets deleted...
|
# there is a pathological entry which gets deleted...
|
||||||
# df[df$trace == 4595, ]
|
# df[df$trace == 4595, ]
|
||||||
# --> artwork 046 popup selene.xml gets opened twice
|
# --> artwork 046 popup selene.xml gets opened twice
|
||||||
|
|
||||||
# TODO: This runs for quite some time
|
# remove entries with only start or stop events since they do not have
|
||||||
# --> Is this more efficient with tidyr::pivot_wider?
|
# all columns
|
||||||
|
subdata_split_wide <-
|
||||||
|
subdata_split_wide[-which(sapply(subdata_split_wide, ncol) != ncol)]
|
||||||
|
|
||||||
|
#data_wide <- do.call(rbind, subdata_split_wide)
|
||||||
|
# TODO: This runs quite some time
|
||||||
|
# --> There is a more efficient function in dplyr, which would also allow
|
||||||
|
# to keep the file IDs with only start or stop or a single entry...
|
||||||
|
data_wide <- dplyr::bind_rows(subdata_split_wide)
|
||||||
|
|
||||||
|
|
||||||
for (d in drop) data_wide[d] <- NA
|
for (d in drop) data_wide[d] <- NA
|
||||||
|
Loading…
Reference in New Issue
Block a user