#' --- #' title: "Preprocessing log files" #' author: "Nora Wickelmaier" #' date: "`r Sys.Date()`" #' output: #' html_document: #' toc: true #' toc_float: true #' pdf_document: #' toc: true #' number_sections: true #' geometry: margin = 2.5cm #' --- # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code") # LogEntry classes: # TRANSFORM_START: "Transform start" --> "Transformation Start" in Tool # TRANSFORM_STOP: "Transform stop" # START_APPLICATION: "Start Application" # SHOW_APPLICATION: "Show Application" # SHOW_INFO: "Show Info" --> "Flip Card" in Tool # SHOW_FRONT: "Show Front" # SHOW_POPUP: "ShowPopup" --> "Show Popup" in Tool # HIDE_POPUP: "HidePopup" # ARTWORK: "Artwork" --> "Show Topic" in Tool #' # Read data dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";", header = TRUE) dat0$date <- as.POSIXct(dat0$date) # create date object dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0) #' # Remove irrelevant events #' ## Remove Start Application and Show Application dat <- subset(dat0, !(dat0$event %in% c("Start Application", "Show Application"))) rownames(dat) <- NULL #' # Close events ######## #' Do it for Transform events first dat1 <- dat[dat$event %in% c("Transform start", "Transform stop"), ] dat1 <- dat1[order(dat1$artwork, dat1$date), ] rownames(dat1) <- NULL # Create event ID for closing events num_start <- diff(c(0, which(dat1$event == "Transform stop"))) dat1$eventid <- rep(seq_along(num_start), num_start) head(dat1[, c("event", "eventid")], 25) table(table(dat1$eventid)) # 1 2 3 4 5 6 7 8 10 11 # 70 78435 5153 842 222 66 18 14 3 1 # --> compare to table(num_start)! # Find out how often "Transform stop" follows each other num_stop <- c(diff(c(0, which(dat1$event == "Transform start")))) table(num_stop) dat1$eventrep <- rep(num_start, num_start) dat1$dupl <- duplicated(dat1[, c("event", "eventid")]) # keep first dat1$dupl <- duplicated(dat1[, c("event", "eventid")], fromLast = TRUE) # keep last dat1[dat1$eventrep == 10, ] dat1$dupl <- NULL dat1$eventrep <- NULL # remove duplicated "Transform start" events dat1 <- dat1[!duplicated(dat1[, c("event", "eventid")]), ] # remove duplicated "Transform stop" events id_stop <- which(dat1$event == "Transform stop") id_rm_stop <- id_stop[diff(id_stop) == 1] dat1 <- dat1[-(id_rm_stop + 1), ] # transform to wide data format dat1$time <- ifelse(dat1$event == "Transform start", "start", "stop") trans_wide <- reshape(dat1, direction = "wide", idvar = c("eventid", "artwork", "glossar"), timevar = "time", drop = c("popup", "card", "event") ) # TODO: This runs for quite some time # --> Is this more efficient with tidyr::pivot_wider? # --> when fileid is part of the reshape, it does not work correctly, since # we sometimes have a start - stop event that is recorded in two separate # log files, BUT: after finding out, that `time_ms` changes for each log # file, I want to exclude those cases, so `fileid` has to be included!!! # check how often an eventid is associated with two fileids nrow(subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop)) # exclude from data set ?? # trans_wide <- subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop) # which(is.na(trans_wide$date.start)) trans_wide$event <- "move" trans_wide$eventid <- NULL rownames(trans_wide) <- NULL trans_wide$duration <- trans_wide$time_ms.stop - trans_wide$time_ms.start #trans_wide$duration <- trans_wide$date.stop - trans_wide$date.start # only seconds - not fine grained enough trans_wide$distance <- apply( trans_wide[, c("x.start", "y.start", "x.stop", "y.stop")], 1, function(x) dist(matrix(x, 2, 2, byrow = TRUE))) trans_wide$rotationDegree <- trans_wide$rotation.stop - trans_wide$rotation.start trans_wide$scaleSize <- trans_wide$scale.stop / trans_wide$scale.start trans_wide$trace <- NA trans_wide$card <- NA trans_wide$popup <- NA dat_trans <- trans_wide[trans_wide$distance != 0 & trans_wide$rotationDegree != 0 & trans_wide$scaleSize != 1, c("fileid.start", "fileid.stop", "event", "artwork", "trace", "glossar", "date.start", "date.stop", "time_ms.start", "time_ms.stop", "duration", "card", "popup", "x.start", "y.start", "x.stop", "y.stop", "distance", "scale.start", "scale.stop", "scaleSize", "rotation.start", "rotation.stop", "rotationDegree")] 1 - nrow(dat_trans) / nrow(trans_wide) # removes almost 2/3 of the data (for small data set) rm(id_rm_stop, id_stop, trans_wide, num_start, num_stop) summary(dat_trans) #' # Close other events dat2 <- dat[!dat$event %in% c("Transform start", "Transform stop"), ] # dat2$x <- NULL # dat2$y <- NULL # dat2$scale <- NULL # dat2$rotation <- NULL rownames(dat2) <- NULL # Create event ID for closing events # num_start <- diff(c(0, which(dat2$event == "Show Front"))) # dat2$trace <- rep(seq_along(num_start), num_start) # head(dat2[, c("artwork", "event", "trace")], 50) # --> does not work because of glossar entries... can't sort by artwork dat2$trace <- NA last_event <- dat2$event[1] aws <- unique(dat2$artwork)[unique(dat2$artwork) != "glossar"] # for (art in aws) { # select artwork for (i in 1:nrow(dat2)) { # go through rows if (last_event == "Show Info" & dat2$artwork[i] == art) { dat2$trace[i] <- i j <- i } else if (last_event == "Show Front" & dat2$artwork[i] == art) { dat2$trace[i] <- j } else if (!(last_event %in% c("Show Info", "Show Front")) & dat2$artwork[i] == art) { dat2$trace[i] <- j } if (i <= nrow(dat2)) { last_event <- dat2$event[i + 1] } } } head(dat2[, c("artwork", "event", "trace")], 50) tail(dat2[, c("artwork", "event", "trace")], 50) # TODO: How to handle popups from glossar??? rm(aws, i, j, last_event, art) ## Fix glossar entries ### Find artwork for glossar entry glossar_files <- unique(dat2[dat2$artwork == "glossar", "popup"]) # load lookup table for artworks and glossar files load("../data/glossar_dict.RData") lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ] head(dat2[, c("artwork", "event", "popup", "trace")], 20) for (file in lut$glossar_file) { artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"]) for (i in seq_len(nrow(dat2))) { if (dat2$event[i] == "Show Info") { current_artwork <- dat2[i, "artwork"] j <- i k <- i } else { current_artwork <- current_artwork } if (dat2$event[i] == "Show Front" & dat2$artwork[i] == current_artwork) { # make sure artwork has not been closed, yet! k <- i } if (dat2$artwork[i] == "glossar" & (current_artwork %in% artwork_list) & dat2$popup[i] == file & (j-k == 0)) { dat2[i, "trace"] <- dat2[j, "trace"] dat2[i, "artwork"] <- current_artwork } } } # --> finds about half of the glossar entries for the small data set... table(is.na(dat2[dat2$glossar == 1, "trace"])) # REMEMBER: It can never bo 100% correct, since it is always possible that # several cards are open and that they link to the same glossar entry # How many glossar_files are only associated with one artwork? lut[sapply(lut$artwork, length) == 1, "glossar_file"] # TODO: Fill in the ones that are associated with one artwork # --> Can't come up with something -- maybe ask AK??? single <- lut[sapply(lut$artwork, length) == 1, "glossar_file"] tmp <- subset(dat2, is.na(dat2$trace))$popup inside <- unique(tmp[tmp %in% lut[sapply(lut$artwork, length) == 1, "glossar_file"]]) single_art <- unlist(lut[lut$glossar_file %in% inside, "artwork"]) tmp_lut <- data.frame(glossar_file = sort(inside), artwork = single_art) dat2[dat2$glossar == 1, c("artwork", "popup", "glossar", "trace")] for (file in tmp_lut$glossar_file) { for (i in seq_len(nrow(dat2))) { if (dat2$event[i] == "Artwork/OpenCard" & dat2$artwork[i] %in% tmp_lut$artwork) { current_artwork <- dat2[i, "artwork"] j <- i } if (dat2$artwork[i] == "glossar" & dat2$popup[i] == file) { dat2[i, "trace"] <- dat2[j, "trace"] dat2[i, "artwork"] <- current_artwork } } } dat2[14110:14130, ] # TODO: Integrate for loop into for loop above # TODO: "glossar" entry should be changed to the corresponding artwork # TODO: For now: Exclude not matched glossar entries df <- subset(dat2, !is.na(dat2$trace)) df <- df[order(df$trace), ] rownames(df) <- NULL rm(lut, current_artwork, file, glossar_dict, i, j, k, artwork_list, glossar_files) #' ## Close flipCard dat3 <- subset(df, df$event %in% c("Show Info", "Show Front")) dat3$time <- ifelse(dat3$event == "Show Info", "start", "stop") flipCard_wide <- reshape(dat3, direction = "wide", idvar = c("trace", "artwork", "glossar"), timevar = "time", drop = c("popup", "card")) flipCard_wide$event <- "flipCard" flipCard_wide$duration <- flipCard_wide$time_ms.stop - flipCard_wide$time_ms.start flipCard_wide$card <- NA flipCard_wide$popup <- NA flipCard_wide$x.start <- NA flipCard_wide$x.stop <- NA flipCard_wide$y.start <- NA flipCard_wide$y.stop <- NA flipCard_wide$distance <- NA flipCard_wide$scale.start <- NA flipCard_wide$scale.stop <- NA flipCard_wide$scaleSize <- NA flipCard_wide$rotation.start <- NA flipCard_wide$rotation.stop <- NA flipCard_wide$rotationDegree <- NA dat_flipCard <- flipCard_wide[, c("fileid.start", "fileid.stop", "event", "artwork", "trace", "glossar", "date.start", "date.stop", "time_ms.start", "time_ms.stop", "duration", "card", "popup", "x.start", "y.start", "x.stop", "y.stop", "distance", "scale.start", "scale.stop", "scaleSize", "rotation.start", "rotation.stop", "rotationDegree")] rm(flipCard_wide) #' ## Close openTopic dat4 <- subset(df, df$event %in% c("Artwork/OpenCard", "Artwork/CloseCard")) dat4 <- dat4[order(dat4$artwork, dat4$date), ] rownames(dat4) <- NULL num_start <- diff(c(0, which(dat4$event == "Artwork/CloseCard"))) dat4$eventid <- rep(seq_along(num_start), num_start) dat4$time <- ifelse(dat4$event == "Artwork/OpenCard", "start", "stop") openTopic_wide <- reshape(dat4, direction = "wide", idvar = c("eventid", "trace", "glossar", "artwork", "card"), timevar = "time", drop = "popup") openTopic_wide$event <- "openTopic" openTopic_wide$duration <- openTopic_wide$time_ms.stop - openTopic_wide$time_ms.start openTopic_wide$popup <- NA openTopic_wide$x.start <- NA openTopic_wide$x.stop <- NA openTopic_wide$y.start <- NA openTopic_wide$y.stop <- NA openTopic_wide$distance <- NA openTopic_wide$scale.start <- NA openTopic_wide$scale.stop <- NA openTopic_wide$scaleSize <- NA openTopic_wide$rotation.start <- NA openTopic_wide$rotation.stop <- NA openTopic_wide$rotationDegree <- NA dat_openTopic <- openTopic_wide[, c("fileid.start", "fileid.stop", "event", "artwork", "trace", "glossar", "date.start", "date.stop", "time_ms.start", "time_ms.stop", "duration", "card", "popup", "x.start", "y.start", "x.stop", "y.stop", "distance", "scale.start", "scale.stop", "scaleSize", "rotation.start", "rotation.stop", "rotationDegree")] # TODO: card should have a unique identifier for each artwork rm(openTopic_wide, num_start) #' ## close openPopup dat5 <- subset(df, df$event %in% c("ShowPopup", "HidePopup")) dat5 <- dat5[order(dat5$artwork, dat5$popup, dat5$date), ] rownames(dat5) <- NULL num_start <- diff(c(0, which(dat5$event == "HidePopup"))) # last event is "ShowPopup"! Needs to be fixed # num_start <- c(num_start, 1) # TODO: Needs to be caught in a function --> not anymore - still relevant??? dat5$eventid <- rep(seq_along(num_start), num_start) dat5$time <- ifelse(dat5$event == "ShowPopup", "start", "stop") openPopup_wide <- reshape(dat5, direction = "wide", idvar = c("eventid", "trace", "glossar", "artwork", "popup"), timevar = "time", drop = "card") # there is a pathological entry which gets deleted... # df[df$trace == 4595, ] # --> artwork 046 popup selene.xml gets opened twice openPopup_wide$event <- "openPopup" openPopup_wide$duration <- openPopup_wide$time_ms.stop - openPopup_wide$time_ms.start openPopup_wide$card <- NA openPopup_wide$x.start <- NA openPopup_wide$x.stop <- NA openPopup_wide$y.start <- NA openPopup_wide$y.stop <- NA openPopup_wide$distance <- NA openPopup_wide$scale.start <- NA openPopup_wide$scale.stop <- NA openPopup_wide$scaleSize <- NA openPopup_wide$rotation.start <- NA openPopup_wide$rotation.stop <- NA openPopup_wide$rotationDegree <- NA dat_openPopup <- openPopup_wide[, c("fileid.start", "fileid.stop", "event", "artwork", "trace", "glossar", "date.start", "date.stop", "time_ms.start", "time_ms.stop", "duration", "card", "popup", "x.start", "y.start", "x.stop", "y.stop", "distance", "scale.start", "scale.stop", "scaleSize", "rotation.start", "rotation.stop", "rotationDegree")] rm(num_start, openPopup_wide) # Merge all # system.time({ # dat_all <- merge(dat_trans, dat_flipCard, all = TRUE) # dat_all <- merge(dat_all, dat_openTopic, all = TRUE) # dat_all <- merge(dat_all, dat_openPopup, all = TRUE) # }) # # # check # nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) + # nrow(dat_openTopic) + nrow(dat_openPopup)) # # dat_all <- dat_all[order(dat_all$date.start), ] # rownames(dat_all) <- NULL # # TODO: from here on NA... WHY?? # dat_all[19426:19435, ] # TODO: Should card maybe also be filled in for "openPopup"? # dat_all2 <- dplyr::full_join(dat_trans, dat_flipCard) # dat_all2 <- dplyr::full_join(dat_all, dat_openTopic) # dat_all2 <- dplyr::full_join(dat_all, dat_openPopup) # # nrow(dat_all2) == (nrow(dat_trans) + nrow(dat_flipCard) + # nrow(dat_openTopic) + nrow(dat_openPopup)) # # dat_all2 <- dat_all2[order(dat_all2$date.start), ] # rownames(dat_all2) <- NULL # TODO: --> same result - but faster. Need it? # --> Would hate to depend on dplyr... #' ## Use `rbind()` instead... # --> unbeatable in terms of time! dat_all <- rbind(dat_trans, dat_flipCard, dat_openTopic, dat_openPopup) # check nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) + nrow(dat_openTopic) + nrow(dat_openPopup)) # remove all events that do not have a `date.start` dim(dat_all[is.na(dat_all$date.start), ]) dat_all <- dat_all[!is.na(dat_all$date.start), ] # There is only a `date.stop`, when event is not properly closed, see here: df[df$trace == 1843, ] dat_openPopup[dat_openPopup$trace == 1843, ] ## --> still 50 (small data set) left, and some really do not seem to be ## opened! Must be a log error # --> others should be closed! dat[31000:31019,] # this one e.g. # --> Actually NOT! card gets flipped before! Again - log error! # Remove durations when event spans more than one log file, since they are # not interpretable dat_all[which(dat_all$fileid.start != dat_all$fileid.stop), "duration"] <- NA # sort by `start.date` dat_all <- dat_all[order(dat_all$date.start), ] rownames(dat_all) <- NULL ind <- rowSums(is.na(dat_all)) == ncol(dat_all) any(ind) dat_all[ind, ] # --> No rows with only NA, as it should be. summary(dat_all) # OK, this actually makes a lot of sense :) #' ## Create case variable #dat_all$timediff <- as.numeric(dat_all$date.stop - dat_all$date.start) dat_all$timediff <- as.numeric(diff(c(dat_all$date.start[1], dat_all$date.start))) hist(dat_all$timediff[dat_all$timediff < 40], breaks = 50) # TODO: What is the best choice for the cutoff here? I took 20 secs for now dat_all$case <- NA j <- 1 for (i in seq_len(nrow(dat_all))) { if (dat_all$timediff[i] < 21) { dat_all$case[i] <- j } else { j <- j + 1 dat_all$case[i] <- j } } head(dat_all[, c("event", "artwork", "trace", "date.start", "timediff", "case")], 100) #' ## Add event ID dat_all$eventid <- seq_len(nrow(dat_all)) dat_all <- dat_all[, c("fileid.start", "fileid.stop", "eventid", "case", "trace", "glossar", "event", "artwork", "date.start", "date.stop", "time_ms.start", "time_ms.stop", "duration", "card", "popup", "x.start", "y.start", "x.stop", "y.stop", "distance", "scale.start", "scale.stop", "scaleSize", "rotation.start", "rotation.stop", "rotationDegree")] #' ## Add `trace` numbers for `move` events # when case and artwork are identical and there is only 1 trace value # --> assign it to all `move` events for that case and artwork # when case and artwork are identical and there is more than 1 trace value # --> assign the `trace` value that was right before this `move` event # (could, of course, also be after) cases <- unique(dat_all$case) aws <- unique(dat_all$artwork)[unique(dat_all$artwork) != "glossar"] max_trace <- max(dat_all$trace, na.rm = TRUE) + 1 out <- NULL for (case in cases) { for (art in aws) { tmp <- dat_all[dat_all$case == case & dat_all$artwork == art, ] if (nrow(tmp) != 0) { if (length(na.omit(unique(tmp$trace))) == 1) { tmp[tmp$event == "move", "trace"] <- na.omit(unique(tmp$trace)) } else if (length(na.omit(unique(tmp$trace))) > 1) { for (i in 1:nrow(tmp)) { if (tmp$event[i] == "move") { if (i == 1) { tmp$trace[i] <- na.omit(unique(tmp$trace))[1] } else { tmp$trace[i] <- tmp$trace[i - 1] } } } } else if (all(is.na(tmp$trace))) { for (i in 1:nrow(tmp)) { if (tmp$event[i] == "move") { tmp$trace[i] <- max_trace } } } max_trace <- max_trace + 1 } if (nrow(tmp) > 0) { #print(tmp[, c("case", "event", "trace", "artwork")]) out <- rbind(out, tmp) } } } # TODO: Get rid of the loops # --> This takes forever... #head(out[, c("time_ms.start", "case", "trace", "event", "artwork")], 55) #head(dat_all[dat_all$artwork %in% "501", c("time_ms.start", "case", "trace", "event", "artwork")], 50) # identical(dat_all[which(!dat_all$eventid %in% out$eventid), ], # dat_all[dat_all$artwork == "glossar", ]) # --> TRUE # put glossar events back in #dat_all <- rbind(out, dat_all[dat_all$artwork == "glossar", ]) out <- out[order(out$date.start), ] rownames(out) <- NULL # Make `trace` a consecutive number out$trace2 <- as.numeric(factor(out$trace, levels = unique(out$trace))) #head(out[, c("trace", "trace2")], 50) #' # Export data write.table(out, "../data/event_logfiles.csv", sep = ";", quote = FALSE, row.names = FALSE) # Is `artwork` my case? Or `artwork` per day? Or `artwork` per some other # unit??? Maybe look at differences between timestamps separately for # `artwork`? And identify "new observational unit" this way? # # Definition: (???) # 1. Touching a new `artwork` corresponds to "observational unit change" # 2. Time interval of XX min within one `artwork` on the same day # corresponds to "observational unit change" # Split data frame in list of data frame which all correspond to one # artwork # dat_art <- split(dat, dat$artwork) # TODO: Write function for closing events