#' --- #' title: "Preprocessing log files" #' author: "Nora Wickelmaier" #' date: "`r Sys.Date()`" #' output: #' html_document: #' toc: true #' toc_float: true #' pdf_document: #' toc: true #' number_sections: true #' geometry: margin = 2.5cm #' --- # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code") # LogEntry classes: # TRANSFORM_START: "Transform start" --> "Transformation Start" in Tool # TRANSFORM_STOP: "Transform stop" # START_APPLICATION: "Start Application" # SHOW_APPLICATION: "Show Application" # SHOW_INFO: "Show Info" --> "Flip Card" in Tool # SHOW_FRONT: "Show Front" # SHOW_POPUP: "ShowPopup" --> "Show Popup" in Tool # HIDE_POPUP: "HidePopup" # ARTWORK: "Artwork" --> "Show Topic" in Tool #' # Read data dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";", header = TRUE) dat0$date <- as.POSIXct(dat0$date) # create date object #' # Remove irrelevant events #' ## Remove Start Application and Show Application dat <- subset(dat0, !(dat0$event %in% c("Start Application", "Show Application"))) #' # Close events ######## #' Do it for Transform events first tmp <- dat[dat$event %in% c("Transform start", "Transform stop"), ] tmp <- tmp[order(tmp$artwork, tmp$date), ] rownames(tmp) <- NULL # Create event ID for closing events num_start <- diff(c(0, which(tmp$event == "Transform stop"))) tmp$eventid <- rep(seq_along(num_start), num_start) head(tmp[, c("event", "eventid")], 25) table(table(tmp$eventid)) # 1 2 3 4 5 6 7 8 10 11 # 73 78429 5156 842 222 66 18 14 3 1 # --> compare to table(num_start)! # Find out how often "Transform stop" follows each other num_stop <- c(diff(c(0, which(tmp$event == "Transform start")))) table(num_stop) tmp$eventrep <- rep(num_start, num_start) tmp$dupl <- duplicated(tmp[, c("event", "eventid")]) # keep first tmp$dupl <- duplicated(tmp[, c("event", "eventid")], fromLast = TRUE) # keep last tmp[tmp$eventrep == 10, ] tmp$dupl <- NULL tmp$eventrep <- NULL # remove duplicated "Transform start" events tmp <- tmp[!duplicated(tmp[, c("event", "eventid")]), ] # remove duplicated "Transform stop" events id_stop <- which(tmp$event == "Transform stop") id_rm_stop <- id_stop[diff(id_stop) == 1] tmp <- tmp[-(id_rm_stop + 1), ] # transform to wide data format tmp$event <- ifelse(tmp$event == "Transform start", "start", "stop") trans_wide <- reshape(tmp, direction = "wide", idvar = c("eventid", "artwork"), timevar = "event", drop = c("fileid", "popup", "card") ) # --> when fileid is part of the reshape, it does not work correctly, since # we sometimes have a start - stop event that is recorded in two separate # log files # TODO: This runs for quite some time # --> Is this more efficient with tidyr::pivot_wider? # which(is.na(trans_wide$date.start)) trans_wide$event <- "move" trans_wide$eventid <- NULL rownames(trans_wide) <- NULL trans_wide$duration <- trans_wide$time_ms.stop - trans_wide$time_ms.start #trans_wide$duration2 <- trans_wide$date.stop - trans_wide$date.start # only seconds - not fine grained enough trans_wide$distance <- apply( trans_wide[, c("x.start", "y.start", "x.stop", "y.stop")], 1, function(x) dist(matrix(x, 2, 2, byrow = TRUE))) trans_wide$rotationDegree <- trans_wide$rotation.stop - trans_wide$rotation.start trans_wide$scaleSize <- trans_wide$scale.stop - trans_wide$scale.start dat_trans <- dat_trans[trans_wide$distance != 0 & trans_wide$rotationDegree != 0 & trans_wide$scaleSize != 0, c("event", "artwork", "date.start", "date.stop", "time_ms.start", "time_ms.stop", "duration", "x.start", "y.start", "x.stop", "y.stop", "distance", "scale.start", "scale.stop", "scaleSize", "rotation.start", "rotation.stop", "rotationDegree")] # removes almost 2/3 of the data (for small data set) rm(tmp, id_rm_stop, id_stop, trans_wide, num_start, num_stop) summary(dat_trans) # TODO: Ask Phillip what is wrong with `time_ms` # --> Hat er eine Erklärung dafür? #plot(time_ms.stop ~ time_ms.start, dat_trans, type = "b") plot(time_ms.stop ~ time_ms.start, dat_trans, col = rgb(red = 0, green = 0, blue = 0, alpha = 0.2)) plot(date.stop ~ date.start, dat_trans[1:1000,], type = "b") # TODO: How to handle duration < 0 # --> Replace with NA for now... dat_trans[dat_trans$duration < 0, "duration"] <- NA #' # Close other events tmp <- dat[!dat$event %in% c("Transform start", "Transform stop"), ] tmp$x <- NULL tmp$y <- NULL tmp$scale <- NULL tmp$rotation <- NULL rownames(tmp) <- NULL # Create event ID for closing events # num_start <- diff(c(0, which(tmp$event == "Show Front"))) # tmp$trace <- rep(seq_along(num_start), num_start) # head(tmp[, c("artwork", "event", "trace")], 50) # --> does not work because of glossar entries... can't sort by artwork tmp$trace <- NA last_event <- tmp$event[1] aws <- unique(tmp$artwork)[unique(tmp$artwork) != "glossar"] # for (art in aws) { # select artwork for (i in 1:nrow(tmp)) { # go through rows if (last_event == "Show Info" & tmp$artwork[i] == art) { tmp$trace[i] <- i j <- i } else if (last_event == "Show Front" & tmp$artwork[i] == art) { tmp$trace[i] <- j } else if (!(last_event %in% c("Show Info", "Show Front")) & tmp$artwork[i] == art) { tmp$trace[i] <- j } if (i <= nrow(tmp)) { last_event <- tmp$event[i + 1] } } } head(tmp[, c("artwork", "event", "trace")], 50) tail(tmp[, c("artwork", "event", "trace")], 50) # TODO: How to handle popups from glossar??? rm(aws, i, j, last_event, art) ## Fix glossar entries ### Find artwork for glossar entry glossar_files <- unique(tmp[tmp$artwork == "glossar", "popup"]) # load lookup table for artworks and glossar files load("../data/glossar_dict.RData") lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ] head(tmp[, c("artwork", "event", "popup", "trace")], 20) #df <- NULL for (file in lut$glossar_file) { artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"]) for (i in seq_len(nrow(tmp))) { if (tmp$event[i] == "Show Info") { current_artwork <- tmp[i, "artwork"] j <- i k <- i } else { current_artwork <- current_artwork } if (tmp$event[i] == "Show Front" & tmp$artwork[i] == current_artwork) { # make sure artwork has not been closed, yet! k <- i } if (tmp$artwork[i] == "glossar" & (current_artwork %in% artwork_list) & tmp$popup[i] == file & (j-k == 0)) { #df <- rbind(df, data.frame(file, current_artwork, i, j)) tmp[i, "trace"] <- tmp[j, "trace"] } } } # dim(tmp[is.na(tmp$trace), ]) # --> finds about half of the glossar entries for the small data set... # tmp[apply(df[, c("j", "i")], 1, c), c("artwork", "event", "popup", "trace")] # REMEMBER: It can never bo 100% correct, since it is always possible that # several cards are open and that they link to the same glossar entry # How many glossar_files are only associated with one artwork? lut[sapply(lut$artwork, length) == 1, "glossar_file"] # TODO: Fill in the ones that are associated with one artwork # --> Can't come up with something -- maybe ask AK??? # TODO: How to check if one of the former "Show Infos" is correct on # --> Can't come up with something -- maybe ask AK??? # for (file in lut$glossar_file) { # # artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"]) # # for (i in seq_len(nrow(tmp))) { # # if (tmp$event[i] == "Show Info") { # # artworks <- NULL # current_artwork <- tmp[i, "artwork"] # j <- i # # } else { # # print(current_artwork) # artworks <- c(artworks, tmp[i, "artwork"]) # print(artworks) # # } # # # if (tmp$artwork[i] == "glossar" & # # (current_artwork %in% artwork_list) & # # tmp$popup[i] == file) { # # # # #df <- rbind(df, data.frame(file, current_artwork, i, j)) # # tmp[i, "trace"] <- tmp[j, "trace"] # # # } # } # } # correct: 17940 # incorrect: 17963 # TODO: For now: Exclude not matched glossar entries df <- subset(tmp, !is.na(tmp$trace)) df <- df[order(df$trace), ] rownames(df) <- NULL rm(tmp, lut, current_artwork, file, glossar_dict, i, j, k, artwork_list, glossar_files) #' ## Close flipCard tmp <- subset(df, df$event %in% c("Show Info", "Show Front")) tmp$event <- ifelse(tmp$event == "Show Info", "start", "stop") flipCard_wide <- reshape(tmp, direction = "wide", idvar = c("trace", "artwork"), timevar = "event", drop = c("fileid", "popup", "card")) flipCard_wide$event <- "flipCard" flipCard_wide$duration <- flipCard_wide$time_ms.stop - flipCard_wide$time_ms.start # TODO: How to handle duration < 0 # --> Replace with NA for now... flipCard_wide$duration <- ifelse(flipCard_wide$duration < 0, NA, flipCard_wide$duration) dat_flipCard <- flipCard_wide[, c("event", "artwork", "trace", "date.start", "date.stop", "time_ms.start", "time_ms.stop", "duration")] rm(tmp, flipCard_wide) #' ## Close openTopic tmp <- subset(df, df$event %in% c("Artwork/OpenCard", "Artwork/CloseCard")) tmp <- tmp[order(tmp$artwork, tmp$date), ] rownames(tmp) <- NULL num_start <- diff(c(0, which(tmp$event == "Artwork/CloseCard"))) tmp$eventid <- rep(seq_along(num_start), num_start) tmp$event <- ifelse(tmp$event == "Artwork/OpenCard", "start", "stop") openTopic_wide <- reshape(tmp, direction = "wide", idvar = c("eventid", "trace", "artwork", "card"), timevar = "event", drop = c("fileid", "popup")) openTopic_wide$event <- "openTopic" openTopic_wide$duration <- openTopic_wide$time_ms.stop - openTopic_wide$time_ms.start openTopic_wide$duration <- ifelse(openTopic_wide$duration < 0, NA, openTopic_wide$duration) # TODO: How to handle duration < 0 # --> Replace with NA for now... dat_openTopic <- openTopic_wide[, c("event", "artwork", "card", "trace", "date.start", "date.stop", "time_ms.start", "time_ms.stop", "duration")] # TODO: card should have a unique identifier for each artwork rm(openTopic_wide, num_start, tmp) #' ## close openPopup tmp <- subset(df, df$event %in% c("ShowPopup", "HidePopup")) tmp <- tmp[order(tmp$artwork, tmp$date), ] rownames(tmp) <- NULL num_start <- diff(c(0, which(tmp$event == "HidePopup"))) # last event is "ShowPopup"! Needs to be fixed num_start <- c(num_start, 1) # TODO: Needs to be caught in a function tmp$eventid <- rep(seq_along(num_start), num_start) tmp$event <- ifelse(tmp$event == "ShowPopup", "start", "stop") openPopup_wide <- reshape(tmp, direction = "wide", idvar = c("eventid", "trace", "artwork", "popup"), timevar = "event", drop = c("fileid", "card")) # there is a pathological entry which gets deleted... # df[df$trace == 4595, ] # TODO: Some correct entries are not closed: df[df$trace == 1843, ] # WHY??? openPopup_wide$event <- "openPopup" openPopup_wide$duration <- openPopup_wide$time_ms.stop - openPopup_wide$time_ms.start openPopup_wide$duration <- ifelse(openPopup_wide$duration < 0, NA, openPopup_wide$duration) # TODO: How to handle duration < 0 # --> Replace with NA for now... dat_openPopup <- openPopup_wide[, c("event", "artwork", "popup", "trace", "date.start", "date.stop", "time_ms.start", "time_ms.stop", "duration")] rm(num_start, openPopup_wide, tmp) # Merge all system.time({ dat_all <- merge(dat_trans, dat_flipCard, all = TRUE) dat_all <- merge(dat_all, dat_openTopic, all = TRUE) dat_all <- merge(dat_all, dat_openPopup, all = TRUE) }) # check nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) + nrow(dat_openTopic) + nrow(dat_openPopup)) dat_all <- dat_all[order(dat_all$date.start), ] rownames(dat_all) <- NULL # TODO: from here on NA... WHY?? dat_all[19426:19435, ] # TODO: Should card maybe also be filled in for "openPopup"? # dat_all2 <- dplyr::full_join(dat_trans, dat_flipCard) # dat_all2 <- dplyr::full_join(dat_all, dat_openTopic) # dat_all2 <- dplyr::full_join(dat_all, dat_openPopup) # # nrow(dat_all2) == (nrow(dat_trans) + nrow(dat_flipCard) + # nrow(dat_openTopic) + nrow(dat_openPopup)) # # dat_all2 <- dat_all2[order(dat_all2$date.start), ] # rownames(dat_all2) <- NULL # TODO: --> same result - but faster. Need it? # --> Would hate to depend on dplyr... #' ## Plots counts <- table(as.Date(dat$date), dat$event) lattice::barchart(counts, auto.key = TRUE) start_events <- c("Transform start", "Show Info", "ShowPopup", "Artwork/OpenCard") counts <- table(as.Date(dat$date[dat$event %in% start_events]), dat$event[dat$event %in% start_events]) lattice::barchart(counts, auto.key = TRUE) # Is `artwork` my case? Or `artwork` per day? Or `artwork` per some other # unit??? Maybe look at differences between timestamps separately for # `artwork`? And identify "new observational unit" this way? # # Definition: (???) # 1. Touching a new `artwork` corresponds to "observational unit change" # 2. Time interval of XX min within one `artwork` on the same day # corresponds to "observational unit change" # Split data frame in list of data frame which all correspond to one # artwork # dat_art <- split(dat, dat$artwork) ## --> Maybe need it at some point? #' # Problems #' * Opening and closing of events cannot be identified unambiguously; it #' can happen that the wrong tags have been put together (e.g., Transform #' start and Transform stop); therefore, durations etc. are only heuristic # TODO: Add a case identifier based on timestamps # --> needs to be done on "raw data". Is it possible? Something seems # seriously wrong with `time_ms` # TODO: Write function for closing events