Debugging and refactoring of code that closes events spanning more than one log file; tests on complete data set
This commit is contained in:
		
							parent
							
								
									2340e081ff
								
							
						
					
					
						commit
						64241bf5f9
					
				@ -73,10 +73,18 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  # Remove eventIds associated with more than one trace, usually logging
 | 
			
		||||
  # errors that cannot be resolved
 | 
			
		||||
  corrupt_eventIds <- names(which(rowSums(stats::xtabs( ~ eventId + trace,
 | 
			
		||||
                                                       subdata) != 0) != 1))
 | 
			
		||||
  subdata <- subdata[!subdata$eventId %in% corrupt_eventIds, ]
 | 
			
		||||
  # errors that cannot be resolved for openTopic or openPopup
 | 
			
		||||
  if (event %in% c("openTopic", "openPopup")) {
 | 
			
		||||
    # corrupt_eventIds <- names(which(rowSums(stats::xtabs( ~ eventId + trace,
 | 
			
		||||
    #                                                      subdata) != 0) != 1))
 | 
			
		||||
    # --> does not run on complete data set
 | 
			
		||||
 | 
			
		||||
    subdata_eid <- split(subdata, ~ eventId)
 | 
			
		||||
    tmp <- sapply(subdata_eid, function(x) length(stats::xtabs( ~ trace, x)))
 | 
			
		||||
    corrupt_eventIds <- names(tmp[tmp > 1])
 | 
			
		||||
 | 
			
		||||
    subdata <- subdata[!subdata$eventId %in% corrupt_eventIds, ]
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  # if (event == "flipCard") {
 | 
			
		||||
  #   subdata$eventId <- subdata$trace
 | 
			
		||||
@ -96,10 +104,9 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
 | 
			
		||||
 | 
			
		||||
  # Add start and stop variables that get lost because events span more
 | 
			
		||||
  # than one log file
 | 
			
		||||
  subdata_split_wide <- lapply(subdata_split_wide, add_variables, ncol = ncol,
 | 
			
		||||
                               event = event)
 | 
			
		||||
 | 
			
		||||
  data_wide <- dplyr::bind_rows(subdata_split_wide)
 | 
			
		||||
  data_wide <-
 | 
			
		||||
    lapply(subdata_split_wide, add_variables, ncol = ncol, event = event) |>
 | 
			
		||||
    dplyr::bind_rows()
 | 
			
		||||
 | 
			
		||||
  select <- is.na(data_wide$date.start) | is.na(data_wide$date.stop)
 | 
			
		||||
  correct <- data_wide[!select, ]
 | 
			
		||||
@ -108,8 +115,11 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
 | 
			
		||||
  corrupt$identi <- "corrupt"
 | 
			
		||||
 | 
			
		||||
  # Close events spanning more than one log file
 | 
			
		||||
  fixed <- dplyr::bind_rows(lapply(unique(corrupt$eventId),
 | 
			
		||||
                                   close_open_eventIds, df = corrupt))
 | 
			
		||||
  cat("\n########## Closing start/stop events spanning several log files...",
 | 
			
		||||
      "\n")
 | 
			
		||||
  fixed <-
 | 
			
		||||
    pbapply::pblapply(unique(corrupt$eventId), close_open_eventIds, df = corrupt) |>
 | 
			
		||||
    dplyr::bind_rows()
 | 
			
		||||
 | 
			
		||||
  data_wide <- rbind(correct, fixed)
 | 
			
		||||
 | 
			
		||||
@ -136,37 +146,15 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
 | 
			
		||||
                             data_wide$rotationDegree != 0 &
 | 
			
		||||
                             data_wide$scaleSize != 1, ]
 | 
			
		||||
      d2 <- nrow(data_wide)
 | 
			
		||||
      cat(paste("INFORMATION:", d1 - d2,
 | 
			
		||||
      cat(paste("\nINFORMATION:", d1 - d2,
 | 
			
		||||
      "lines containing move events were removed since they did",
 | 
			
		||||
      "\nnot contain any change"), fill = TRUE)
 | 
			
		||||
    }
 | 
			
		||||
    data_wide
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  data_wide <- data_wide[order(data_wide$fileId.start,
 | 
			
		||||
  out <- data_wide[order(data_wide$fileId.start,
 | 
			
		||||
                               data_wide$date.start,
 | 
			
		||||
                               data_wide$timeMs.start), ]
 | 
			
		||||
 | 
			
		||||
  # Fix durations that span more than one log file
 | 
			
		||||
  if (event != "move") {
 | 
			
		||||
    tab <- colSums(stats::xtabs( ~ fileId + trace, subdata) != 0)
 | 
			
		||||
    number_logfiles <- data.frame(trace = names(tab), nlogfile = tab)
 | 
			
		||||
    data_wide <- merge(data_wide, number_logfiles, by = "trace", all.x = TRUE)
 | 
			
		||||
    data_wide$duration[data_wide$identi == "corrupt"] <-
 | 
			
		||||
      (data_wide$nlogfile[data_wide$identi == "corrupt"] - 1) * 600000 -
 | 
			
		||||
      data_wide$timeMs.start[data_wide$identi == "corrupt"] +
 | 
			
		||||
      data_wide$timeMs.stop[data_wide$identi == "corrupt"]
 | 
			
		||||
# TODO: This assumes that no log files are skipped
 | 
			
		||||
# --> Is this assumption really valid??
 | 
			
		||||
  } else {
 | 
			
		||||
    data_wide$duration[data_wide$identi == "corrupt"] <-
 | 
			
		||||
      600000 -
 | 
			
		||||
      data_wide$timeMs.start[data_wide$identi == "corrupt"] +
 | 
			
		||||
      data_wide$timeMs.stop[data_wide$identi == "corrupt"]
 | 
			
		||||
  } # there should be no movements spanning more than two log files!
 | 
			
		||||
 | 
			
		||||
  out <- data_wide[# !apply(data_wide, 1, function(x) all(is.na(x))),
 | 
			
		||||
                   # remove all NA rows
 | 
			
		||||
                               data_wide$timeMs.start),
 | 
			
		||||
                   c("fileId.start", "fileId.stop", "folder", "event",
 | 
			
		||||
                     "artwork", "trace", "glossar", "date.start",
 | 
			
		||||
                     "date.stop", "timeMs.start", "timeMs.stop",
 | 
			
		||||
 | 
			
		||||
@ -85,8 +85,24 @@ create_eventlogs <- function(data, xmlpath, case_cutoff = 20, rm_nochange_moves
 | 
			
		||||
  cat("\n\n########## Adding trace variable for move events... ##########", "\n")
 | 
			
		||||
  dat4 <- add_trace_moves(dat3)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  # Fix durations that span more than one log file #########################
 | 
			
		||||
  levels_fId <- sort(unique(c(dat4$fileId.start, dat4$fileId.stop)))
 | 
			
		||||
  dat4$fIdNum.start <- factor(dat4$fileId.start, levels = levels_fId)
 | 
			
		||||
  dat4$fIdNum.stop  <- factor(dat4$fileId.stop, levels = levels_fId)
 | 
			
		||||
  dat4$fIdNum.start <- as.numeric(dat4$fIdNum.start)
 | 
			
		||||
  dat4$fIdNum.stop  <- as.numeric(dat4$fIdNum.stop)
 | 
			
		||||
 | 
			
		||||
  dat4$fIdDiff      <- dat4$fIdNum.stop - dat4$fIdNum.start
 | 
			
		||||
 | 
			
		||||
  # Remove moves where stop is before start
 | 
			
		||||
  dat4 <- dat4[which(dat4$fIdDiff > 0), ]
 | 
			
		||||
 | 
			
		||||
  dat4$duration[dat4$fIdDiff > 0] <- dat4$fIdDiff * 600000 -
 | 
			
		||||
    dat4$timeMs.start + dat4$timeMs.stop
 | 
			
		||||
 | 
			
		||||
  # Remove fragmented traces ###############################################
 | 
			
		||||
  tab <- xtabs( ~ trace + event, dat4)
 | 
			
		||||
  tab <- stats::xtabs( ~ trace + event, dat4)
 | 
			
		||||
 | 
			
		||||
  fragments <- NULL
 | 
			
		||||
 | 
			
		||||
@ -100,7 +116,6 @@ create_eventlogs <- function(data, xmlpath, case_cutoff = 20, rm_nochange_moves
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  dat5 <- dat4[!dat4$trace %in% fragments, ]
 | 
			
		||||
  # TODO: Should be tested more thouroughly on complete data set
 | 
			
		||||
 | 
			
		||||
  dat5
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user