diff --git a/README.Rmd b/README.Rmd
index 7a15a5d..31ebdd9 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -469,6 +469,23 @@
 point. (Check with PG to make sure.) I need to get the XML files for
 "504" and "505" from PM in order to extract information on them for the
 metadata.
 
+# Optimizing resources used by the code
+
+After I started trying out the functions on the complete data set, it
+became obvious (not surprisingly `:)`) that this will not work --
+especially for the move events. The reshape function cannot take a long
+data frame with over 6 million entries and convert it into a wide data
+frame (at least not on my laptop). The code is supposed to work "out of
+the box" for researchers, hence it *should* run on a regular (8-core)
+laptop. So, I changed the reshaping so that it is done in batches, i.e.,
+separately on the subset of the data for each `fileId`. This means that
+events spanning two raw log files cannot be closed and are therefore
+removed from the data set. The function warns about this; since the
+affected events are dropped essentially at random, this should not
+introduce a systematic bias. Moreover, durations cannot be calculated
+for events spanning log files anyway, because the time stamps do not
+increase systematically across log files (see above).
+
 # Reading list
 
 * @Arizmendi2022 [--]
diff --git a/code/functions.R b/code/functions.R
index 1f8bfb4..df14910 100644
--- a/code/functions.R
+++ b/code/functions.R
@@ -122,21 +122,25 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
     actions <- c("Transform start", "Transform stop")
     idvar <- c("eventId", "artwork", "glossar")
     drop <- c("popup", "topicNumber", "trace", "event")
+    ncol <- 17
 
   } else if (event == "flipCard") {
     actions <- c("Show Info", "Show Front")
     idvar <- c("trace", "artwork", "glossar")
     drop <- c("popup", "topicNumber", "eventId", "event")
+    ncol <- 17
 
   } else if (event == "openTopic") {
     actions <- c("Artwork/OpenCard", "Artwork/CloseCard")
     idvar <- c("eventId", "trace", "glossar", "artwork", "topicNumber")
     drop <- c("popup", "event")
+    ncol <- 19
 
   } else if (event == "openPopup") {
     actions <- c("ShowPopup", "HidePopup")
     idvar <- c("eventId", "trace", "glossar", "artwork", "popup")
     drop <- c("topicNumber", "event")
+    ncol <- 19
 # TODO: Should topicNumber maybe also be filled in for "openPopup"?
 
   } else {
@@ -149,11 +153,10 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
   subdata <- subdata[order(subdata$artwork, subdata$popup, subdata$date, subdata$timeMs), ]
   subdata$time <- ifelse(subdata$event == actions[1], "start", "stop")
   num_start <- diff(c(0, which(subdata$event == actions[2])))
+  if (tail(subdata, 1)$time == "start") {
+    num_start <- c(num_start, 1)
+  }
   subdata$eventId <- rep(seq_along(num_start), num_start)
-# If last event is start event, it needs to be fixed:
-# num_start <- c(num_start, 1)
-# TODO: Needs to be caught in a function
-# --> not anymore - still relevant???
 
   if (event == "move") {
     subdata <- subdata[!duplicated(subdata[, c("event", "eventId")]), ]
@@ -162,19 +165,36 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
     subdata <- subdata[-(id_rm_stop + 1), ]
   }
 
+  subdata_split <- split(subdata, subdata$fileId)
+
   suppressWarnings(
-    data_wide <- reshape(subdata, direction = "wide",
-                         idvar = idvar,
-                         timevar = "time",
-                         drop = drop)
+    subdata_split_wide <- lapply(subdata_split, reshape,
+                                 direction = "wide",
+                                 idvar = idvar,
+                                 timevar = "time",
+                                 drop = drop)
   )
+# suppressWarnings(
+#   data_wide <- reshape(subdata, direction = "wide",
+#                        idvar = idvar,
+#                        timevar = "time",
+#                        drop = drop)
+# )
 # TODO: Suppress warnings? Better with tryCatch()?
 # there is a pathological entry which gets deleted...
 # df[df$trace == 4595, ]
 # --> artwork 046 popup selene.xml gets opened twice
-# TODO: This runs for quite some time
-# --> Is this more efficient with tidyr::pivot_wider?
+  # remove batches (file IDs) with only start or stop events since they do
+  # not have all columns
+  subdata_split_wide <-
+    subdata_split_wide[sapply(subdata_split_wide, ncol) == ncol]
+
+  #data_wide <- do.call(rbind, subdata_split_wide)
+# TODO: This runs for quite some time
+# --> There is a more efficient function in dplyr, which would also allow
+#     us to keep the file IDs with only start or stop or a single entry...
+  data_wide <- dplyr::bind_rows(subdata_split_wide)
 
   for (d in drop) data_wide[d] <- NA
 
@@ -194,8 +214,8 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
   data_wide$scaleSize <- data_wide$scale.stop / data_wide$scale.start
   # remove moves without any change
   move_wide <- data_wide[data_wide$distance != 0 &
-                         data_wide$rotationDegree != 0 &
-                         data_wide$scaleSize != 1, ]
+                           data_wide$rotationDegree != 0 &
+                           data_wide$scaleSize != 1, ]
   cat(paste("INFORMATION:", nrow(data_wide) - nrow(move_wide),
             "lines containing move events were removed since they did",
             "\nnot contain any change"), fill = TRUE)
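
Side note (not part of the diff): the batched reshape described in the README change above can be illustrated with a minimal, self-contained sketch. The toy data frame, its column names, and the expected column count below are invented for illustration; only the `split()` / `lapply(..., reshape, ...)` / `dplyr::bind_rows()` pattern mirrors the change to `close_events()`.

```r
## Minimal sketch of the per-fileId batched reshape (toy data, illustration only)

toy <- data.frame(
  fileId  = rep(c("log1", "log2"), each = 4),   # one raw log file per batch
  eventId = rep(1:4, each = 2),
  artwork = "001",
  time    = rep(c("start", "stop"), 4),
  timeMs  = c(0, 480, 900, 1400, 100, 350, 700, 1200)
)

## One long data frame per raw log file
batches <- split(toy, toy$fileId)

## Reshape each batch separately: start and stop of an event end up in one row
batches_wide <- lapply(batches, reshape,
                       direction = "wide",
                       idvar     = c("eventId", "artwork"),
                       timevar   = "time")

## Batches that contain only start or only stop events produce fewer columns;
## keep only complete batches (the expected count of 6 is specific to this toy
## example: eventId, artwork, plus fileId/timeMs for start and stop)
expected_ncol <- 6
batches_wide  <- batches_wide[sapply(batches_wide, ncol) == expected_ncol]

## Combine the batches and compute durations within each log file
toy_wide <- dplyr::bind_rows(batches_wide)
toy_wide$duration <- toy_wide$timeMs.stop - toy_wide$timeMs.start
toy_wide
```

`dplyr::bind_rows()` fills columns that are missing in some batches with `NA`, which is what the TODO in the diff alludes to: it would make it possible to keep batches that contain only start or only stop events instead of dropping them.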