Debugging and refactoring of code that closes events spanning more than one log file; tests on complete data set

This commit is contained in:
Nora Wickelmaier 2023-10-25 14:37:51 +02:00
parent 2340e081ff
commit 64241bf5f9
2 changed files with 40 additions and 37 deletions

View File

@ -73,10 +73,18 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
}
# Remove eventIds associated with more than one trace, usually logging
# errors that cannot be resolved
corrupt_eventIds <- names(which(rowSums(stats::xtabs( ~ eventId + trace,
subdata) != 0) != 1))
subdata <- subdata[!subdata$eventId %in% corrupt_eventIds, ]
# errors that cannot be resolved for openTopic or openPopup
if (event %in% c("openTopic", "openPopup")) {
# corrupt_eventIds <- names(which(rowSums(stats::xtabs( ~ eventId + trace,
# subdata) != 0) != 1))
# --> does not run on complete data set
subdata_eid <- split(subdata, ~ eventId)
tmp <- sapply(subdata_eid, function(x) length(stats::xtabs( ~ trace, x)))
corrupt_eventIds <- names(tmp[tmp > 1])
subdata <- subdata[!subdata$eventId %in% corrupt_eventIds, ]
}
# if (event == "flipCard") {
# subdata$eventId <- subdata$trace
@ -96,10 +104,9 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
# Add start and stop variables that get lost because events span more
# than one log file
subdata_split_wide <- lapply(subdata_split_wide, add_variables, ncol = ncol,
event = event)
data_wide <- dplyr::bind_rows(subdata_split_wide)
data_wide <-
lapply(subdata_split_wide, add_variables, ncol = ncol, event = event) |>
dplyr::bind_rows()
select <- is.na(data_wide$date.start) | is.na(data_wide$date.stop)
correct <- data_wide[!select, ]
@ -108,8 +115,11 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
corrupt$identi <- "corrupt"
# Close events spanning more than one log file
fixed <- dplyr::bind_rows(lapply(unique(corrupt$eventId),
close_open_eventIds, df = corrupt))
cat("\n########## Closing start/stop events spanning several log files...",
"\n")
fixed <-
pbapply::pblapply(unique(corrupt$eventId), close_open_eventIds, df = corrupt) |>
dplyr::bind_rows()
data_wide <- rbind(correct, fixed)
@ -136,37 +146,15 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
data_wide$rotationDegree != 0 &
data_wide$scaleSize != 1, ]
d2 <- nrow(data_wide)
cat(paste("INFORMATION:", d1 - d2,
cat(paste("\nINFORMATION:", d1 - d2,
"lines containing move events were removed since they did",
"\nnot contain any change"), fill = TRUE)
}
data_wide
}
data_wide <- data_wide[order(data_wide$fileId.start,
out <- data_wide[order(data_wide$fileId.start,
data_wide$date.start,
data_wide$timeMs.start), ]
# Fix durations that span more than one log file
if (event != "move") {
tab <- colSums(stats::xtabs( ~ fileId + trace, subdata) != 0)
number_logfiles <- data.frame(trace = names(tab), nlogfile = tab)
data_wide <- merge(data_wide, number_logfiles, by = "trace", all.x = TRUE)
data_wide$duration[data_wide$identi == "corrupt"] <-
(data_wide$nlogfile[data_wide$identi == "corrupt"] - 1) * 600000 -
data_wide$timeMs.start[data_wide$identi == "corrupt"] +
data_wide$timeMs.stop[data_wide$identi == "corrupt"]
# TODO: This assumes that no log files are skipped
# --> Is this assumption really valid??
} else {
data_wide$duration[data_wide$identi == "corrupt"] <-
600000 -
data_wide$timeMs.start[data_wide$identi == "corrupt"] +
data_wide$timeMs.stop[data_wide$identi == "corrupt"]
} # there should be no movements spanning more than two log files!
out <- data_wide[# !apply(data_wide, 1, function(x) all(is.na(x))),
# remove all NA rows
data_wide$timeMs.start),
c("fileId.start", "fileId.stop", "folder", "event",
"artwork", "trace", "glossar", "date.start",
"date.stop", "timeMs.start", "timeMs.stop",

View File

@ -85,8 +85,24 @@ create_eventlogs <- function(data, xmlpath, case_cutoff = 20, rm_nochange_moves
cat("\n\n########## Adding trace variable for move events... ##########", "\n")
dat4 <- add_trace_moves(dat3)
# Fix durations that span more than one log file #########################
levels_fId <- sort(unique(c(dat4$fileId.start, dat4$fileId.stop)))
dat4$fIdNum.start <- factor(dat4$fileId.start, levels = levels_fId)
dat4$fIdNum.stop <- factor(dat4$fileId.stop, levels = levels_fId)
dat4$fIdNum.start <- as.numeric(dat4$fIdNum.start)
dat4$fIdNum.stop <- as.numeric(dat4$fIdNum.stop)
dat4$fIdDiff <- dat4$fIdNum.stop - dat4$fIdNum.start
# Remove moves where stop is before start
dat4 <- dat4[which(dat4$fIdDiff > 0), ]
dat4$duration[dat4$fIdDiff > 0] <- dat4$fIdDiff * 600000 -
dat4$timeMs.start + dat4$timeMs.stop
# Remove fragmented traces ###############################################
tab <- xtabs( ~ trace + event, dat4)
tab <- stats::xtabs( ~ trace + event, dat4)
fragments <- NULL
@ -100,7 +116,6 @@ create_eventlogs <- function(data, xmlpath, case_cutoff = 20, rm_nochange_moves
}
}
dat5 <- dat4[!dat4$trace %in% fragments, ]
# TODO: Should be tested more thouroughly on complete data set
dat5
}