From 9e3783cf1faf274d8375bbe3da04bd42c2538d17 Mon Sep 17 00:00:00 2001 From: nwickel Date: Fri, 11 Aug 2023 08:35:41 +0200 Subject: [PATCH] Cleaned up TODOs --- code/02_preprocessing.R | 238 +++++++++++++++------------------------- 1 file changed, 90 insertions(+), 148 deletions(-) diff --git a/code/02_preprocessing.R b/code/02_preprocessing.R index 4d6572b..b097784 100644 --- a/code/02_preprocessing.R +++ b/code/02_preprocessing.R @@ -31,10 +31,6 @@ dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";", header = TRUE) dat0$date <- as.POSIXct(dat0$date) # create date object -# TODO: Add a case identifier based on timestamps -- needs to be done on -# "raw data". Is it possible? Something seems seriously wrong with -# `time_ms` - #' # Remove irrelevant events #' ## Remove Start Application and Show Application @@ -44,11 +40,6 @@ dat <- subset(dat0, !(dat0$event %in% c("Start Application", #' # Close events -# TODO: -# Should every "Show front" be the beginning of a new trace? -# Should Transform events be handled separately and then be "added" again -# by timestamp? - ######## #' Do it for Transform events first @@ -98,7 +89,8 @@ trans_wide <- reshape(tmp, direction = "wide", # --> when fileid is part of the reshape, it does not work correctly, since # we sometimes have a start - stop event that is recorded in two separate # log files -# TODO: This runs for quite some time. Is this more efficient with dplyr? +# TODO: This runs for quite some time +# --> Is this more efficient with tidyr::pivot_wider? # which(is.na(trans_wide$date.start)) @@ -117,28 +109,31 @@ trans_wide$rotationDegree <- trans_wide$rotation.stop - trans_wide$rotation.start trans_wide$scaleSize <- trans_wide$scale.stop - trans_wide$scale.start -dat_trans <- trans_wide[trans_wide$distance != 0 & +dat_trans <- dat_trans[trans_wide$distance != 0 & trans_wide$rotationDegree != 0 & - trans_wide$scaleSize != 0, ] + trans_wide$scaleSize != 0, + c("event", "artwork", "date.start", "date.stop", + "time_ms.start", "time_ms.stop", "duration", + "x.start", "y.start", "x.stop", "y.stop", + "distance", "scale.start", "scale.stop", + "scaleSize", "rotation.start", "rotation.stop", + "rotationDegree")] # removes almost 2/3 of the data (for small data set) -dat_trans <- dat_trans[, c("event", "artwork", - "date.start", "date.stop", - "time_ms.start", "time_ms.stop", "duration", - "x.start", "y.start", "x.stop", "y.stop", "distance", - "scale.start", "scale.stop", "scaleSize", - "rotation.start", "rotation.stop", "rotationDegree")] +rm(tmp, id_rm_stop, id_stop, trans_wide, num_start, num_stop) summary(dat_trans) -# TODO: Phillip fragen was mit `time_ms` schief läuft... Hat er eine -# Erklärung dafür? +# TODO: Ask Phillip what is wrong with `time_ms` +# --> Hat er eine Erklärung dafür? + #plot(time_ms.stop ~ time_ms.start, dat_trans, type = "b") plot(time_ms.stop ~ time_ms.start, dat_trans, col = rgb(red = 0, green = 0, blue = 0, alpha = 0.2)) plot(date.stop ~ date.start, dat_trans[1:1000,], type = "b") -# TODO: Wie mit duration < 0 umgehen? Einfach auf NA setzen? For now... +# TODO: How to handle duration < 0 +# --> Replace with NA for now... dat_trans[dat_trans$duration < 0, "duration"] <- NA @@ -156,7 +151,6 @@ rownames(tmp) <- NULL # num_start <- diff(c(0, which(tmp$event == "Show Front"))) # tmp$trace <- rep(seq_along(num_start), num_start) # head(tmp[, c("artwork", "event", "trace")], 50) -# TODO: How do I handle popups from glossar??? # --> does not work because of glossar entries... can't sort by artwork @@ -188,20 +182,19 @@ for (art in aws) { # select artwork head(tmp[, c("artwork", "event", "trace")], 50) tail(tmp[, c("artwork", "event", "trace")], 50) +# TODO: How to handle popups from glossar??? +rm(aws, i, j, last_event, art) ## Fix glossar entries ### Find artwork for glossar entry -tmp2 <- tmp[tmp$artwork == "glossar", ] - -glossar_files <- unique(tmp2$popup) +glossar_files <- unique(tmp[tmp$artwork == "glossar", "popup"]) # load lookup table for artworks and glossar files load("../data/glossar_dict.RData") lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ] -# TODO: Find last "Artwork/OpenCard" that matches possible artwork head(tmp[, c("artwork", "event", "popup", "trace")], 20) @@ -247,17 +240,16 @@ for (file in lut$glossar_file) { # tmp[apply(df[, c("j", "i")], 1, c), c("artwork", "event", "popup", "trace")] # REMEMBER: It can never bo 100% correct, since it is always possible that -# several cards are open and that they link at the same glossar entry +# several cards are open and that they link to the same glossar entry -# Fill in the glossar_files that are only associated with one artwork # How many glossar_files are only associated with one artwork? lut[sapply(lut$artwork, length) == 1, "glossar_file"] -# TODO: Can I fill in the ones that are only associated with one artwork -# easily? I can't come up with something --> maybe ask AK??? +# TODO: Fill in the ones that are associated with one artwork +# --> Can't come up with something -- maybe ask AK??? -# TODO: How can I check if one of the former "Show Infos" is the correct -# one? I can't come up with something --> maybe ask AK??? +# TODO: How to check if one of the former "Show Infos" is correct on +# --> Can't come up with something -- maybe ask AK??? # for (file in lut$glossar_file) { # @@ -300,6 +292,8 @@ df <- subset(tmp, !is.na(tmp$trace)) df <- df[order(df$trace), ] rownames(df) <- NULL +rm(tmp, lut, current_artwork, file, glossar_dict, i, j, k, artwork_list, + glossar_files) #' ## Close flipCard @@ -307,19 +301,25 @@ tmp <- subset(df, df$event %in% c("Show Info", "Show Front")) tmp$event <- ifelse(tmp$event == "Show Info", "start", "stop") -dat_flipCard <- reshape(tmp, direction = "wide", +flipCard_wide <- reshape(tmp, direction = "wide", idvar = c("trace", "artwork"), - timevar = "event", drop = c("fileid", "popup", "card")) -dat_flipCard$event <- "flipCard" -dat_flipCard$duration <- dat_flipCard$time_ms.stop - dat_flipCard$time_ms.start + timevar = "event", + drop = c("fileid", "popup", "card")) +flipCard_wide$event <- "flipCard" +flipCard_wide$duration <- flipCard_wide$time_ms.stop - + flipCard_wide$time_ms.start -dat_flipCard$duration <- ifelse(dat_flipCard$duration < 0, NA, dat_flipCard$duration) -# TODO: Wie mit duration < 0 umgehen? Einfach auf NA setzen? For now... +# TODO: How to handle duration < 0 +# --> Replace with NA for now... +flipCard_wide$duration <- ifelse(flipCard_wide$duration < 0, + NA, flipCard_wide$duration) -dat_flipCard <- dat_flipCard[, c("event", "artwork", "trace", +dat_flipCard <- flipCard_wide[, c("event", "artwork", "trace", "date.start", "date.stop", - "time_ms.start", "time_ms.stop", "duration")] + "time_ms.start", "time_ms.stop", + "duration")] +rm(tmp, flipCard_wide) #' ## Close openTopic @@ -332,19 +332,25 @@ tmp$eventid <- rep(seq_along(num_start), num_start) tmp$event <- ifelse(tmp$event == "Artwork/OpenCard", "start", "stop") -dat_openTopic <- reshape(tmp, direction = "wide", +openTopic_wide <- reshape(tmp, direction = "wide", idvar = c("eventid", "trace", "artwork", "card"), timevar = "event", drop = c("fileid", "popup")) -dat_openTopic$event <- "openTopic" -dat_openTopic$duration <- dat_openTopic$time_ms.stop - dat_openTopic$time_ms.start +openTopic_wide$event <- "openTopic" +openTopic_wide$duration <- openTopic_wide$time_ms.stop - + openTopic_wide$time_ms.start -dat_openTopic$duration <- ifelse(dat_openTopic$duration < 0, NA, dat_openTopic$duration) -# TODO: Wie mit duration < 0 umgehen? Einfach auf NA setzen? For now... +openTopic_wide$duration <- ifelse(openTopic_wide$duration < 0, + NA, openTopic_wide$duration) +# TODO: How to handle duration < 0 +# --> Replace with NA for now... -dat_openTopic <- dat_openTopic[, c("event", "artwork", "card", "trace", +dat_openTopic <- openTopic_wide[, c("event", "artwork", "card", "trace", "date.start", "date.stop", - "time_ms.start", "time_ms.stop", "duration")] -# TODO: card should have a unique identifier for each artwork! + "time_ms.start", "time_ms.stop", + "duration")] +# TODO: card should have a unique identifier for each artwork + +rm(openTopic_wide, num_start, tmp) #' ## close openPopup tmp <- subset(df, df$event %in% c("ShowPopup", "HidePopup")) @@ -352,15 +358,15 @@ tmp <- tmp[order(tmp$artwork, tmp$date), ] rownames(tmp) <- NULL num_start <- diff(c(0, which(tmp$event == "HidePopup"))) -# last event is "ShowPopup"! Needs to be fixed TODO: Needs to be cought in -# a function! +# last event is "ShowPopup"! Needs to be fixed num_start <- c(num_start, 1) +# TODO: Needs to be caught in a function tmp$eventid <- rep(seq_along(num_start), num_start) tmp$event <- ifelse(tmp$event == "ShowPopup", "start", "stop") -dat_openPopup <- reshape(tmp, direction = "wide", +openPopup_wide <- reshape(tmp, direction = "wide", idvar = c("eventid", "trace", "artwork", "popup"), timevar = "event", drop = c("fileid", "card")) # there is a pathological entry which gets deleted... @@ -370,20 +376,28 @@ dat_openPopup <- reshape(tmp, direction = "wide", df[df$trace == 1843, ] # WHY??? -dat_openPopup$event <- "openPopup" -dat_openPopup$duration <- dat_openPopup$time_ms.stop - dat_openPopup$time_ms.start +openPopup_wide$event <- "openPopup" +openPopup_wide$duration <- openPopup_wide$time_ms.stop - + openPopup_wide$time_ms.start -dat_openPopup$duration <- ifelse(dat_openPopup$duration < 0, NA, dat_openPopup$duration) -# TODO: Wie mit duration < 0 umgehen? Einfach auf NA setzen? For now... +openPopup_wide$duration <- ifelse(openPopup_wide$duration < 0, + NA, openPopup_wide$duration) +# TODO: How to handle duration < 0 +# --> Replace with NA for now... -dat_openPopup <- dat_openPopup[, c("event", "artwork", "popup", "trace", +dat_openPopup <- openPopup_wide[, c("event", "artwork", "popup", "trace", "date.start", "date.stop", - "time_ms.start", "time_ms.stop", "duration")] + "time_ms.start", "time_ms.stop", + "duration")] +rm(num_start, openPopup_wide, tmp) + # Merge all +system.time({ dat_all <- merge(dat_trans, dat_flipCard, all = TRUE) dat_all <- merge(dat_all, dat_openTopic, all = TRUE) dat_all <- merge(dat_all, dat_openPopup, all = TRUE) +}) # check nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) + @@ -392,84 +406,25 @@ nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) + dat_all <- dat_all[order(dat_all$date.start), ] rownames(dat_all) <- NULL +# TODO: from here on NA... WHY?? +dat_all[19426:19435, ] + # TODO: Should card maybe also be filled in for "openPopup"? +# dat_all2 <- dplyr::full_join(dat_trans, dat_flipCard) +# dat_all2 <- dplyr::full_join(dat_all, dat_openTopic) +# dat_all2 <- dplyr::full_join(dat_all, dat_openPopup) +# +# nrow(dat_all2) == (nrow(dat_trans) + nrow(dat_flipCard) + +# nrow(dat_openTopic) + nrow(dat_openPopup)) +# +# dat_all2 <- dat_all2[order(dat_all2$date.start), ] +# rownames(dat_all2) <- NULL +# TODO: --> same result - but faster. Need it? +# --> Would hate to depend on dplyr... -#' ## Remove "button presses" - -# Sort data frame by artwork and date -dat <- dat[order(dat$artwork, dat$date), ] - -# remove "Transform start" and "Transform stop" following directly each -# other, since I do not know how to interpret them as events -id_start <- which(dat$event == "Transform start") -id_stop <- which(dat$event == "Transform stop") - -id_rm_start <- id_start[diff(id_start) == 1] -id_rm_stop <- id_stop[diff(id_stop) == 1] - -dat <- dat[-c(id_rm_start, id_rm_stop), ] -rownames(dat) <- NULL - - -id_start2 <- which(dat$event == "Transform start") -id_stop2 <- which(dat$event == "Transform stop") - -length(id_start2) - length(id_stop2) -# 340 --> "starts too many" - -# remove "Transform start" and "Transform stop" following directly each -# other (but with events in between!) -id_start_new <- id_start2 -id_stop_new <- id_stop2 - -for (i in 2:length(id_start_new)) { - if (id_start_new[i-1] < id_stop_new[i-1] & id_start_new[i] < id_stop_new[i-1]) { - id_start_new <- id_start_new[-(i-1)] - } else if (id_start_new[i-1] > id_stop_new[i-1] & id_start_new[i] > id_stop_new[i-1]) { - id_stop_new <- id_stop_new[-(i-1)] - } -} - -length(id_start2) - length(id_start_new) -length(id_stop2) - length(id_stop_new) - -ids <- data.frame(start = id_start_new, stop = id_stop_new) -ids$diff <- ids$stop - ids$start - -table(ids$diff) - -# remove "Transform start" and "Transform stop" around other events - -id_rm_start2 <- id_start2[!(id_start2 %in% id_start_new)] -id_rm_stop2 <- id_stop2[!(id_stop2 %in% id_stop_new)] - -# TODO: It still does not work correctly: -dat[64764:64769,] -# time_ms event artwork popup x y scale rotation -# 64764 473081 Transform start 052 052.xml 1958.65 1505.75 0.8234455 -0.1351998 -# 64765 474226 Show Info 052 052.xml NA NA NA NA -# 64766 475735 Transform start 052 052.xml 1988.25 1625.25 0.9927645 2.4527958 -# 64767 475739 Transform stop 052 052.xml 1988.25 1625.25 0.9927645 2.4527958 -# 64768 479326 Artwork 052 052.xml NA NA NA NA -# 64769 479751 Transform stop 052 052.xml 1660.90 1883.20 0.8074586 29.0875534 - -# --> but no idea how to find these cases in an automated way... - -dat <- dat[-c(id_rm_start2, id_rm_stop2), ] -# --> Every start ends with a stop now (but not necessarily the correct one!) - - -dat1 <- dat[order(dat$date, dat$time_ms), ] -dat1$time_diff <- c(NA, diff(dat1$time_ms)) - -boxplot(time_diff ~ as.Date(date), dat1[dat1$time_diff > 1000 & dat1$time_diff < 4000, ]) - -boxplot(time_ms ~ event, dat1) - - #' ## Plots counts <- table(as.Date(dat$date), dat$event) @@ -483,22 +438,6 @@ counts <- table(as.Date(dat$date[dat$event %in% start_events]), lattice::barchart(counts, auto.key = TRUE) -# TODO: Do I want to "collapse" the data frame in a way, that I only have -# one event for each "set", meaning -# -# * Transform start + Transform stop --> Transform -# * Artwork/OpenCard + Artwork/CloseCard --> Show Subcard -# * ShowPopup + HidePopup --> Show Popup -# * Show Info + Show Front --> Flip Card -# (s.o. ;)) -# -# Then I would have meaningful variables like duration, distance, degree of -# rotation, size of scaling, selection of Subcard etc. -# This means that I would have to delete all "unclosed" events. - -# Create a data frame with -# case event attributes (can differ for different events) -# ?? # Is `artwork` my case? Or `artwork` per day? Or `artwork` per some other # unit??? Maybe look at differences between timestamps separately for # `artwork`? And identify "new observational unit" this way? @@ -508,8 +447,6 @@ lattice::barchart(counts, auto.key = TRUE) # 2. Time interval of XX min within one `artwork` on the same day # corresponds to "observational unit change" -# id activity timestamp - # Split data frame in list of data frame which all correspond to one # artwork # dat_art <- split(dat, dat$artwork) @@ -522,3 +459,8 @@ lattice::barchart(counts, auto.key = TRUE) #' can happen that the wrong tags have been put together (e.g., Transform #' start and Transform stop); therefore, durations etc. are only heuristic +# TODO: Add a case identifier based on timestamps +# --> needs to be done on "raw data". Is it possible? Something seems +# seriously wrong with `time_ms` + +# TODO: Write function for closing events