Debugging and refactoring of code that closes events spanning more than one log file; tests on complete data set
This commit is contained in:
parent
2340e081ff
commit
64241bf5f9
@ -73,10 +73,18 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
|
||||
}
|
||||
|
||||
# Remove eventIds associated with more than one trace, usually logging
|
||||
# errors that cannot be resolved
|
||||
corrupt_eventIds <- names(which(rowSums(stats::xtabs( ~ eventId + trace,
|
||||
subdata) != 0) != 1))
|
||||
subdata <- subdata[!subdata$eventId %in% corrupt_eventIds, ]
|
||||
# errors that cannot be resolved for openTopic or openPopup
|
||||
if (event %in% c("openTopic", "openPopup")) {
|
||||
# corrupt_eventIds <- names(which(rowSums(stats::xtabs( ~ eventId + trace,
|
||||
# subdata) != 0) != 1))
|
||||
# --> does not run on complete data set
|
||||
|
||||
subdata_eid <- split(subdata, ~ eventId)
|
||||
tmp <- sapply(subdata_eid, function(x) length(stats::xtabs( ~ trace, x)))
|
||||
corrupt_eventIds <- names(tmp[tmp > 1])
|
||||
|
||||
subdata <- subdata[!subdata$eventId %in% corrupt_eventIds, ]
|
||||
}
|
||||
|
||||
# if (event == "flipCard") {
|
||||
# subdata$eventId <- subdata$trace
|
||||
@ -96,10 +104,9 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
|
||||
|
||||
# Add start and stop variables that get lost because events span more
|
||||
# than one log file
|
||||
subdata_split_wide <- lapply(subdata_split_wide, add_variables, ncol = ncol,
|
||||
event = event)
|
||||
|
||||
data_wide <- dplyr::bind_rows(subdata_split_wide)
|
||||
data_wide <-
|
||||
lapply(subdata_split_wide, add_variables, ncol = ncol, event = event) |>
|
||||
dplyr::bind_rows()
|
||||
|
||||
select <- is.na(data_wide$date.start) | is.na(data_wide$date.stop)
|
||||
correct <- data_wide[!select, ]
|
||||
@ -108,8 +115,11 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
|
||||
corrupt$identi <- "corrupt"
|
||||
|
||||
# Close events spanning more than one log file
|
||||
fixed <- dplyr::bind_rows(lapply(unique(corrupt$eventId),
|
||||
close_open_eventIds, df = corrupt))
|
||||
cat("\n########## Closing start/stop events spanning several log files...",
|
||||
"\n")
|
||||
fixed <-
|
||||
pbapply::pblapply(unique(corrupt$eventId), close_open_eventIds, df = corrupt) |>
|
||||
dplyr::bind_rows()
|
||||
|
||||
data_wide <- rbind(correct, fixed)
|
||||
|
||||
@ -136,37 +146,15 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
|
||||
data_wide$rotationDegree != 0 &
|
||||
data_wide$scaleSize != 1, ]
|
||||
d2 <- nrow(data_wide)
|
||||
cat(paste("INFORMATION:", d1 - d2,
|
||||
cat(paste("\nINFORMATION:", d1 - d2,
|
||||
"lines containing move events were removed since they did",
|
||||
"\nnot contain any change"), fill = TRUE)
|
||||
}
|
||||
data_wide
|
||||
}
|
||||
|
||||
data_wide <- data_wide[order(data_wide$fileId.start,
|
||||
out <- data_wide[order(data_wide$fileId.start,
|
||||
data_wide$date.start,
|
||||
data_wide$timeMs.start), ]
|
||||
|
||||
# Fix durations that span more than one log file
|
||||
if (event != "move") {
|
||||
tab <- colSums(stats::xtabs( ~ fileId + trace, subdata) != 0)
|
||||
number_logfiles <- data.frame(trace = names(tab), nlogfile = tab)
|
||||
data_wide <- merge(data_wide, number_logfiles, by = "trace", all.x = TRUE)
|
||||
data_wide$duration[data_wide$identi == "corrupt"] <-
|
||||
(data_wide$nlogfile[data_wide$identi == "corrupt"] - 1) * 600000 -
|
||||
data_wide$timeMs.start[data_wide$identi == "corrupt"] +
|
||||
data_wide$timeMs.stop[data_wide$identi == "corrupt"]
|
||||
# TODO: This assumes that no log files are skipped
|
||||
# --> Is this assumption really valid??
|
||||
} else {
|
||||
data_wide$duration[data_wide$identi == "corrupt"] <-
|
||||
600000 -
|
||||
data_wide$timeMs.start[data_wide$identi == "corrupt"] +
|
||||
data_wide$timeMs.stop[data_wide$identi == "corrupt"]
|
||||
} # there should be no movements spanning more than two log files!
|
||||
|
||||
out <- data_wide[# !apply(data_wide, 1, function(x) all(is.na(x))),
|
||||
# remove all NA rows
|
||||
data_wide$timeMs.start),
|
||||
c("fileId.start", "fileId.stop", "folder", "event",
|
||||
"artwork", "trace", "glossar", "date.start",
|
||||
"date.stop", "timeMs.start", "timeMs.stop",
|
||||
|
@ -85,8 +85,24 @@ create_eventlogs <- function(data, xmlpath, case_cutoff = 20, rm_nochange_moves
|
||||
cat("\n\n########## Adding trace variable for move events... ##########", "\n")
|
||||
dat4 <- add_trace_moves(dat3)
|
||||
|
||||
|
||||
# Fix durations that span more than one log file #########################
|
||||
levels_fId <- sort(unique(c(dat4$fileId.start, dat4$fileId.stop)))
|
||||
dat4$fIdNum.start <- factor(dat4$fileId.start, levels = levels_fId)
|
||||
dat4$fIdNum.stop <- factor(dat4$fileId.stop, levels = levels_fId)
|
||||
dat4$fIdNum.start <- as.numeric(dat4$fIdNum.start)
|
||||
dat4$fIdNum.stop <- as.numeric(dat4$fIdNum.stop)
|
||||
|
||||
dat4$fIdDiff <- dat4$fIdNum.stop - dat4$fIdNum.start
|
||||
|
||||
# Remove moves where stop is before start
|
||||
dat4 <- dat4[which(dat4$fIdDiff > 0), ]
|
||||
|
||||
dat4$duration[dat4$fIdDiff > 0] <- dat4$fIdDiff * 600000 -
|
||||
dat4$timeMs.start + dat4$timeMs.stop
|
||||
|
||||
# Remove fragmented traces ###############################################
|
||||
tab <- xtabs( ~ trace + event, dat4)
|
||||
tab <- stats::xtabs( ~ trace + event, dat4)
|
||||
|
||||
fragments <- NULL
|
||||
|
||||
@ -100,7 +116,6 @@ create_eventlogs <- function(data, xmlpath, case_cutoff = 20, rm_nochange_moves
|
||||
}
|
||||
}
|
||||
dat5 <- dat4[!dat4$trace %in% fragments, ]
|
||||
# TODO: Should be tested more thouroughly on complete data set
|
||||
|
||||
dat5
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user