From 495665a659392d3622ed1ba37bf0437d3d1099fc Mon Sep 17 00:00:00 2001 From: nwickel Date: Thu, 31 Aug 2023 16:12:34 +0200 Subject: [PATCH] Cleaned out some commented code, that I do not need anymore --- code/02_preprocessing.R | 77 ++++++----------------------------------- 1 file changed, 11 insertions(+), 66 deletions(-) diff --git a/code/02_preprocessing.R b/code/02_preprocessing.R index e66394d..54b37ec 100644 --- a/code/02_preprocessing.R +++ b/code/02_preprocessing.R @@ -14,17 +14,6 @@ # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code") -# LogEntry classes: -# TRANSFORM_START: "Transform start" --> "Transformation Start" in Tool -# TRANSFORM_STOP: "Transform stop" -# START_APPLICATION: "Start Application" -# SHOW_APPLICATION: "Show Application" -# SHOW_INFO: "Show Info" --> "Flip Card" in Tool -# SHOW_FRONT: "Show Front" -# SHOW_POPUP: "ShowPopup" --> "Show Popup" in Tool -# HIDE_POPUP: "HidePopup" -# ARTWORK: "Artwork" --> "Show Topic" in Tool - #' # Read data dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";", @@ -63,11 +52,11 @@ table(table(dat1$eventid)) num_stop <- c(diff(c(0, which(dat1$event == "Transform start")))) table(num_stop) +# TODO: Do I still need this? dat1$eventrep <- rep(num_start, num_start) dat1$dupl <- duplicated(dat1[, c("event", "eventid")]) # keep first dat1$dupl <- duplicated(dat1[, c("event", "eventid")], fromLast = TRUE) # keep last dat1[dat1$eventrep == 10, ] - dat1$dupl <- NULL dat1$eventrep <- NULL @@ -145,24 +134,13 @@ summary(dat_trans) #' # Close other events dat2 <- dat[!dat$event %in% c("Transform start", "Transform stop"), ] -# dat2$x <- NULL -# dat2$y <- NULL -# dat2$scale <- NULL -# dat2$rotation <- NULL rownames(dat2) <- NULL -# Create event ID for closing events -# num_start <- diff(c(0, which(dat2$event == "Show Front"))) -# dat2$trace <- rep(seq_along(num_start), num_start) -# head(dat2[, c("artwork", "event", "trace")], 50) -# --> does not work because of glossar entries... can't sort by artwork - - dat2$trace <- NA last_event <- dat2$event[1] aws <- unique(dat2$artwork)[unique(dat2$artwork) != "glossar"] # -for (art in aws) { # select artwork +for (art in aws) { # select artwork for (i in 1:nrow(dat2)) { # go through rows @@ -189,9 +167,7 @@ tail(dat2[, c("artwork", "event", "trace")], 50) rm(aws, i, j, last_event, art) -## Fix glossar entries - -### Find artwork for glossar entry +#' ## Fix glossar entries (find corresponding artworks) glossar_files <- unique(dat2[dat2$artwork == "glossar", "popup"]) @@ -278,7 +254,7 @@ for (file in tmp_lut$glossar_file) { dat2[14110:14130, ] -# TODO: Integrate for loop into for loop above +# TODO: Integrate for-loop into for-loop above # TODO: For now: Exclude not matched glossar entries @@ -303,7 +279,8 @@ flipCard_wide$event <- "flipCard" flipCard_wide$duration <- flipCard_wide$time_ms.stop - flipCard_wide$time_ms.start - +# TODO: Check if I still need to enter all of these variables +# --> x, y, scale, rotation? flipCard_wide$card <- NA flipCard_wide$popup <- NA flipCard_wide$x.start <- NA @@ -377,6 +354,7 @@ dat_openTopic <- openTopic_wide[, c("fileid.start", "fileid.stop", "event", rm(openTopic_wide, num_start) #' ## close openPopup + dat5 <- subset(df, df$event %in% c("ShowPopup", "HidePopup")) dat5 <- dat5[order(dat5$artwork, dat5$popup, dat5$date), ] rownames(dat5) <- NULL @@ -430,8 +408,7 @@ rm(num_start, openPopup_wide) # TODO: Should card maybe also be filled in for "openPopup"? -#' ## Use `rbind()` instead... -# --> unbeatable in terms of time! +#' ## Merge data sets for different events dat_all <- rbind(dat_trans, dat_flipCard, dat_openTopic, dat_openPopup) @@ -439,7 +416,8 @@ dat_all <- rbind(dat_trans, dat_flipCard, dat_openTopic, dat_openPopup) nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) + nrow(dat_openTopic) + nrow(dat_openPopup)) -# remove all events that do not have a `date.start` +#' ## Remove all events that do not have a `date.start` + dim(dat_all[is.na(dat_all$date.start), ]) dat_all <- dat_all[!is.na(dat_all$date.start), ] # There is only a `date.stop`, when event is not properly closed, see here: @@ -456,7 +434,6 @@ dat[31000:31019,] # this one e.g. # not interpretable dat_all[which(dat_all$fileid.start != dat_all$fileid.stop), "duration"] <- NA - # sort by `start.date` dat_all <- dat_all[order(dat_all$date.start), ] rownames(dat_all) <- NULL @@ -470,8 +447,6 @@ summary(dat_all) # OK, this actually makes a lot of sense :) #' ## Create case variable -#dat_all$timediff <- as.numeric(dat_all$date.stop - dat_all$date.start) - dat_all$timediff <- as.numeric(diff(c(dat_all$date.start[1], dat_all$date.start))) hist(dat_all$timediff[dat_all$timediff < 40], breaks = 50) @@ -507,12 +482,6 @@ dat_all <- dat_all[, c("fileid.start", "fileid.stop", "eventid", "case", #' ## Add `trace` numbers for `move` events -# when case and artwork are identical and there is only 1 trace value -# --> assign it to all `move` events for that case and artwork -# when case and artwork are identical and there is more than 1 trace value -# --> assign the `trace` value that was right before this `move` event -# (could, of course, also be after) - cases <- unique(dat_all$case) aws <- unique(dat_all$artwork)[unique(dat_all$artwork) != "glossar"] max_trace <- max(dat_all$trace, na.rm = TRUE) + 1 @@ -545,7 +514,6 @@ for (case in cases) { max_trace <- max_trace + 1 } if (nrow(tmp) > 0) { - #print(tmp[, c("case", "event", "trace", "artwork")]) out <- rbind(out, tmp) } } @@ -554,15 +522,7 @@ for (case in cases) { # TODO: Get rid of the loops # --> This takes forever... -#head(out[, c("time_ms.start", "case", "trace", "event", "artwork")], 55) - -#head(dat_all[dat_all$artwork %in% "501", c("time_ms.start", "case", "trace", "event", "artwork")], 50) - -# identical(dat_all[which(!dat_all$eventid %in% out$eventid), ], -# dat_all[dat_all$artwork == "glossar", ]) -# --> TRUE - -# put glossar events back in +# put glossar events back in --> not relevant anymore #dat_all <- rbind(out, dat_all[dat_all$artwork == "glossar", ]) out <- out[order(out$date.start), ] @@ -571,25 +531,10 @@ rownames(out) <- NULL # Make `trace` a consecutive number out$trace2 <- as.numeric(factor(out$trace, levels = unique(out$trace))) -#head(out[, c("trace", "trace2")], 50) - #' # Export data write.table(out, "../data/event_logfiles.csv", sep = ";", quote = FALSE, row.names = FALSE) -# Is `artwork` my case? Or `artwork` per day? Or `artwork` per some other -# unit??? Maybe look at differences between timestamps separately for -# `artwork`? And identify "new observational unit" this way? -# -# Definition: (???) -# 1. Touching a new `artwork` corresponds to "observational unit change" -# 2. Time interval of XX min within one `artwork` on the same day -# corresponds to "observational unit change" - -# Split data frame in list of data frame which all correspond to one -# artwork -# dat_art <- split(dat, dat$artwork) - # TODO: Write function for closing events