From 64241bf5f92aab21907e8578a561ecdfe671fce6 Mon Sep 17 00:00:00 2001 From: nwickel Date: Wed, 25 Oct 2023 14:37:51 +0200 Subject: [PATCH] Debugging and refactoring of code that closes events spanning more than one log file; tests on complete data set --- R/close_events.R | 58 ++++++++++++++++++-------------------------- R/create_eventlogs.R | 19 +++++++++++++-- 2 files changed, 40 insertions(+), 37 deletions(-) diff --git a/R/close_events.R b/R/close_events.R index c5d3644..5ca8da6 100644 --- a/R/close_events.R +++ b/R/close_events.R @@ -73,10 +73,18 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP } # Remove eventIds associated with more than one trace, usually logging - # errors that cannot be resolved - corrupt_eventIds <- names(which(rowSums(stats::xtabs( ~ eventId + trace, - subdata) != 0) != 1)) - subdata <- subdata[!subdata$eventId %in% corrupt_eventIds, ] + # errors that cannot be resolved for openTopic or openPopup + if (event %in% c("openTopic", "openPopup")) { + # corrupt_eventIds <- names(which(rowSums(stats::xtabs( ~ eventId + trace, + # subdata) != 0) != 1)) + # --> does not run on complete data set + + subdata_eid <- split(subdata, ~ eventId) + tmp <- sapply(subdata_eid, function(x) length(stats::xtabs( ~ trace, x))) + corrupt_eventIds <- names(tmp[tmp > 1]) + + subdata <- subdata[!subdata$eventId %in% corrupt_eventIds, ] + } # if (event == "flipCard") { # subdata$eventId <- subdata$trace @@ -96,10 +104,9 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP # Add start and stop variables that get lost because events span more # than one log file - subdata_split_wide <- lapply(subdata_split_wide, add_variables, ncol = ncol, - event = event) - - data_wide <- dplyr::bind_rows(subdata_split_wide) + data_wide <- + lapply(subdata_split_wide, add_variables, ncol = ncol, event = event) |> + dplyr::bind_rows() select <- is.na(data_wide$date.start) | is.na(data_wide$date.stop) correct <- data_wide[!select, ] @@ -108,8 +115,11 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP corrupt$identi <- "corrupt" # Close events spanning more than one log file - fixed <- dplyr::bind_rows(lapply(unique(corrupt$eventId), - close_open_eventIds, df = corrupt)) + cat("\n########## Closing start/stop events spanning several log files...", + "\n") + fixed <- + pbapply::pblapply(unique(corrupt$eventId), close_open_eventIds, df = corrupt) |> + dplyr::bind_rows() data_wide <- rbind(correct, fixed) @@ -136,37 +146,15 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP data_wide$rotationDegree != 0 & data_wide$scaleSize != 1, ] d2 <- nrow(data_wide) - cat(paste("INFORMATION:", d1 - d2, + cat(paste("\nINFORMATION:", d1 - d2, "lines containing move events were removed since they did", "\nnot contain any change"), fill = TRUE) } - data_wide } - data_wide <- data_wide[order(data_wide$fileId.start, + out <- data_wide[order(data_wide$fileId.start, data_wide$date.start, - data_wide$timeMs.start), ] - - # Fix durations that span more than one log file - if (event != "move") { - tab <- colSums(stats::xtabs( ~ fileId + trace, subdata) != 0) - number_logfiles <- data.frame(trace = names(tab), nlogfile = tab) - data_wide <- merge(data_wide, number_logfiles, by = "trace", all.x = TRUE) - data_wide$duration[data_wide$identi == "corrupt"] <- - (data_wide$nlogfile[data_wide$identi == "corrupt"] - 1) * 600000 - - data_wide$timeMs.start[data_wide$identi == "corrupt"] + - data_wide$timeMs.stop[data_wide$identi == "corrupt"] -# TODO: This assumes that no log files are skipped -# --> Is this assumption really valid?? - } else { - data_wide$duration[data_wide$identi == "corrupt"] <- - 600000 - - data_wide$timeMs.start[data_wide$identi == "corrupt"] + - data_wide$timeMs.stop[data_wide$identi == "corrupt"] - } # there should be no movements spanning more than two log files! - - out <- data_wide[# !apply(data_wide, 1, function(x) all(is.na(x))), - # remove all NA rows + data_wide$timeMs.start), c("fileId.start", "fileId.stop", "folder", "event", "artwork", "trace", "glossar", "date.start", "date.stop", "timeMs.start", "timeMs.stop", diff --git a/R/create_eventlogs.R b/R/create_eventlogs.R index 4485daf..239891a 100644 --- a/R/create_eventlogs.R +++ b/R/create_eventlogs.R @@ -85,8 +85,24 @@ create_eventlogs <- function(data, xmlpath, case_cutoff = 20, rm_nochange_moves cat("\n\n########## Adding trace variable for move events... ##########", "\n") dat4 <- add_trace_moves(dat3) + + # Fix durations that span more than one log file ######################### + levels_fId <- sort(unique(c(dat4$fileId.start, dat4$fileId.stop))) + dat4$fIdNum.start <- factor(dat4$fileId.start, levels = levels_fId) + dat4$fIdNum.stop <- factor(dat4$fileId.stop, levels = levels_fId) + dat4$fIdNum.start <- as.numeric(dat4$fIdNum.start) + dat4$fIdNum.stop <- as.numeric(dat4$fIdNum.stop) + + dat4$fIdDiff <- dat4$fIdNum.stop - dat4$fIdNum.start + + # Remove moves where stop is before start + dat4 <- dat4[which(dat4$fIdDiff > 0), ] + + dat4$duration[dat4$fIdDiff > 0] <- dat4$fIdDiff * 600000 - + dat4$timeMs.start + dat4$timeMs.stop + # Remove fragmented traces ############################################### - tab <- xtabs( ~ trace + event, dat4) + tab <- stats::xtabs( ~ trace + event, dat4) fragments <- NULL @@ -100,7 +116,6 @@ create_eventlogs <- function(data, xmlpath, case_cutoff = 20, rm_nochange_moves } } dat5 <- dat4[!dat4$trace %in% fragments, ] - # TODO: Should be tested more thouroughly on complete data set dat5 }