#' ---
#' title: "Preprocessing log files"
#' author: "Nora Wickelmaier"
#' date: "`r Sys.Date()`"
#' output:
#'   html_document:
#'     toc: true
#'     toc_float: true
#'   pdf_document:
#'     toc: true
#'     number_sections: true
#' geometry: margin = 2.5cm
#' ---

# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")

# LogEntry classes:
# TRANSFORM_START:   "Transform start" --> "Transformation Start" in Tool
# TRANSFORM_STOP:    "Transform stop"
# START_APPLICATION: "Start Application"
# SHOW_APPLICATION:  "Show Application"
# SHOW_INFO:         "Show Info"       --> "Flip Card" in Tool
# SHOW_FRONT:        "Show Front"
# SHOW_POPUP:        "ShowPopup"       --> "Show Popup" in Tool
# HIDE_POPUP:        "HidePopup"
# ARTWORK:           "Artwork"         --> "Show Topic" in Tool

#' # Read data

# Raw log export: semicolon-separated text file with a header row.
dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";",
                   header = TRUE)
dat0$date <- as.POSIXct(dat0$date)  # create date object

# TODO: Add a case identifier based on timestamps -- needs to be done on
#       "raw data". Is it possible? Something seems seriously wrong with
#       `time_ms`

#' # Remove irrelevant events

#' ## Remove Start Application and Show Application

dat <- subset(dat0, !(dat0$event %in% c("Start Application",
                                        "Show Application")))

#' # Close events

#' Do it for Transform events first

# Work on the Transform events only, ordered by artwork and then by date.
tmp <- dat[dat$event %in% c("Transform start", "Transform stop"), ]
tmp <- tmp[order(tmp$artwork, tmp$date), ]
rownames(tmp) <- NULL

# Find out how often "Transform start" follows each other:
# num_start[k] is the number of rows up to and including the k-th stop,
# counted from the row after the previous stop.
is_stop <- tmp$event == "Transform stop"
num_start <- diff(c(0, which(is_stop)))

# Every row gets the id "1 + number of stops strictly before it", i.e. a
# stop closes the current id. This equals
# rep(seq_along(num_start), num_start) whenever the last row is a stop, but
# it always has nrow(tmp) elements, so trailing unmatched "Transform start"
# rows no longer make the assignment fail.
tmp$eventid <- cumsum(is_stop) - is_stop + 1L
head(tmp[, c("event", "eventid")], 25)

table(table(tmp$eventid))
#     1     2     3     4     5     6     7     8    10    11
#    73 78429  5156   842   222    66    18    14     3     1
# --> compare to table(num_start)!
# Find out how often "Transform stop" follows each other
num_stop <- diff(c(0, which(tmp$event == "Transform start")))
table(num_stop)

# Inspection only: size of the eventid group each row belongs to, plus the
# two ways of flagging repeated events within a group. Counting from
# `eventid` directly (instead of rep(num_start, num_start)) always yields
# one value per row.
tmp$eventrep <- tabulate(tmp$eventid)[tmp$eventid]
tmp$dupl <- duplicated(tmp[, c("event", "eventid")])  # keep first
tmp$dupl <- duplicated(tmp[, c("event", "eventid")],
                       fromLast = TRUE)               # keep last
tmp[tmp$eventrep == 10, ]

tmp$dupl <- NULL
tmp$eventrep <- NULL

# remove duplicated "Transform start" events (keeps the first per eventid)
tmp <- tmp[!duplicated(tmp[, c("event", "eventid")]), ]

# remove duplicated "Transform stop" events: whenever a stop is directly
# followed by another stop, the second one is dropped.
id_stop <- which(tmp$event == "Transform stop")
# which() keeps the index at length(id_stop) - 1; a bare logical index of
# that length would be recycled and could select the last stop by mistake.
id_rm_stop <- id_stop[which(diff(id_stop) == 1)]
# Guard: x[-integer(0), ] selects *no* rows instead of all rows.
if (length(id_rm_stop) > 0) {
  tmp <- tmp[-(id_rm_stop + 1), ]
}

# transform to wide data format
tmp$event <- ifelse(tmp$event == "Transform start", "start", "stop")

trans_wide <- reshape(tmp,
                      direction = "wide",
                      idvar = c("eventid", "artwork"),
                      timevar = "event",
                      drop = c("fileid", "popup", "card"))
rownames(trans_wide) <- NULL
# --> when fileid is part of the reshape, it does not work correctly, since
# we sometimes have a start - stop event that is recorded in two separate
# log files
# TODO: This runs for quite some time. Is this more efficient with dplyr?

# which(is.na(trans_wide$date.start))

# Derived measures per start/stop pair
trans_wide$duration <- trans_wide$time_ms.stop - trans_wide$time_ms.start
# Euclidean distance between start and stop position, computed column-wise
# instead of calling dist() once per row. If only one coordinate of a
# point is NA this yields NA, whereas dist() would rescale the remaining
# coordinate.
trans_wide$distance <- sqrt((trans_wide$x.stop - trans_wide$x.start)^2 +
                            (trans_wide$y.stop - trans_wide$y.start)^2)
trans_wide$rotationDegree <- trans_wide$rotation.stop -
  trans_wide$rotation.start
trans_wide$scaleSize <- trans_wide$scale.stop - trans_wide$scale.start

# Keep only moves where position, rotation AND scale all changed.
# NOTE(review): pairs with a missing start or stop give NA conditions and
# therefore all-NA rows in the result -- confirm whether they should be
# dropped instead.
trans_wide <- trans_wide[trans_wide$distance != 0 &
                         trans_wide$rotationDegree != 0 &
                         trans_wide$scaleSize != 0, ]
# removes almost 2/3 of the data (for small data set)

# TODO: How do I handle popups from glossar???

# Should every "Show front" be the beginning of a new trace?
# Should Transform events be handled separately and then be "added" again
# by timestamp?
########

# All events except the Transform events
tmp <- dat[!dat$event %in% c("Transform start", "Transform stop"), ]
rownames(tmp) <- NULL

# Label "Show Info" rows as trace "start" and "Show Front" rows as trace
# "stop"; all other rows stay NA. This replaces a nested loop over
# artworks x rows that produced the same labels row by row (and could stop
# with an NA condition once it ran past the last row).
tmp$trace <- NA
tmp$trace[tmp$event %in% "Show Info"] <- "start"
tmp$trace[tmp$event %in% "Show Front"] <- "stop"

head(tmp[4:ncol(tmp)], 50)

# TODO: So far this only renames "Show Info" and "Show Front" to "start"
#       and "stop" ;) -- real traces per artwork still need to be built.

#' ## Remove "button presses"

# Sort data frame by artwork and date
dat <- dat[order(dat$artwork, dat$date), ]

# remove "Transform start" and "Transform stop" following directly each
# other, since I do not know how to interpret them as events
id_start <- which(dat$event == "Transform start")
id_stop <- which(dat$event == "Transform stop")

# First row of every directly repeated start (resp. stop). which() keeps
# the index one shorter than the id vector; a bare logical index of that
# length would be recycled and could select the last element by mistake.
id_rm_start <- id_start[which(diff(id_start) == 1)]
id_rm_stop <- id_stop[which(diff(id_stop) == 1)]

# Guard: dat[-integer(0), ] would select *no* rows instead of all rows.
id_rm <- c(id_rm_start, id_rm_stop)
if (length(id_rm) > 0) {
  dat <- dat[-id_rm, ]
}
rownames(dat) <- NULL

id_start2 <- which(dat$event == "Transform start")
id_stop2 <- which(dat$event == "Transform stop")

length(id_start2) - length(id_stop2)
# 340 --> "starts too many" (figure recorded from an earlier run)

# remove "Transform start" and "Transform stop" following directly each
# other (but with events in between!)
# Heuristic pairing of start and stop row indices: walk along both index
# vectors and drop a start that is followed by another start before the
# current stop, or a stop that lies before two consecutive starts.
id_start_new <- id_start2
id_stop_new <- id_stop2

# The iteration range is fixed before the loop, while both vectors shrink
# inside it.
for (i in seq_along(id_start_new)[-1]) {
  # Stop as soon as an index runs past the end of either (shrunken) vector;
  # the comparisons below would otherwise be NA and abort the script.
  if (i > length(id_start_new) || i - 1 > length(id_stop_new)) {
    break
  }
  if (id_start_new[i - 1] < id_stop_new[i - 1] &&
      id_start_new[i] < id_stop_new[i - 1]) {
    id_start_new <- id_start_new[-(i - 1)]
  } else if (id_start_new[i - 1] > id_stop_new[i - 1] &&
             id_start_new[i] > id_stop_new[i - 1]) {
    id_stop_new <- id_stop_new[-(i - 1)]
  }
}

# Number of starts / stops dropped by the loop
length(id_start2) - length(id_start_new)
length(id_stop2) - length(id_stop_new)

# Row distance between paired start and stop.
# NOTE(review): data.frame() needs both vectors to have the same length;
# nothing above guarantees that -- confirm.
ids <- data.frame(start = id_start_new, stop = id_stop_new)
ids$diff <- ids$stop - ids$start
table(ids$diff)

# remove "Transform start" and "Transform stop" around other events
id_rm_start2 <- id_start2[!(id_start2 %in% id_start_new)]
id_rm_stop2 <- id_stop2[!(id_stop2 %in% id_stop_new)]

# TODO: It still does not work correctly:
dat[64764:64769, ]
#       time_ms           event artwork   popup       x       y     scale   rotation
# 64764  473081 Transform start     052 052.xml 1958.65 1505.75 0.8234455 -0.1351998
# 64765  474226       Show Info     052 052.xml      NA      NA        NA         NA
# 64766  475735 Transform start     052 052.xml 1988.25 1625.25 0.9927645  2.4527958
# 64767  475739  Transform stop     052 052.xml 1988.25 1625.25 0.9927645  2.4527958
# 64768  479326         Artwork     052 052.xml      NA      NA        NA         NA
# 64769  479751  Transform stop     052 052.xml 1660.90 1883.20 0.8074586 29.0875534
# --> but no idea how to find these cases in an automated way...

# Guard: dat[-integer(0), ] would select *no* rows instead of all rows.
id_rm2 <- c(id_rm_start2, id_rm_stop2)
if (length(id_rm2) > 0) {
  dat <- dat[-id_rm2, ]
}
# --> Every start ends with a stop now (but not necessarily the correct one!)
# Chronological copy of the data with the gap (in `time_ms`) to the
# preceding row; the first row has no predecessor and gets NA.
dat1 <- dat[with(dat, order(date, time_ms)), ]
dat1[["time_diff"]] <- c(NA, diff(dat1[["time_ms"]]))

# Gaps between 1000 and 4000 per calendar day, then raw `time_ms` per event
boxplot(
  time_diff ~ as.Date(date),
  data = dat1[with(dat1, time_diff > 1000 & time_diff < 4000), ]
)
boxplot(time_ms ~ event, data = dat1)

#' ## Plots

# Number of events per day, for all event types
counts <- table(as.Date(dat[["date"]]), dat[["event"]])
lattice::barchart(counts, auto.key = TRUE)

# Same chart restricted to the events that open a "set"
start_events <- c("Transform start", "Show Info", "ShowPopup",
                  "Artwork/OpenCard")
counts <- table(
  as.Date(dat[["date"]][dat[["event"]] %in% start_events]),
  dat[["event"]][dat[["event"]] %in% start_events]
)
lattice::barchart(counts, auto.key = TRUE)

# TODO: Do I want to "collapse" the data frame in a way, that I only have
#       one event for each "set", meaning
#
#   * Transform start + Transform stop     --> Transform
#   * Artwork/OpenCard + Artwork/CloseCard --> Show Subcard
#   * ShowPopup + HidePopup                --> Show Popup
#   * Show Info + Show Front               --> Flip Card
#   (see above ;))
#
# Then I would have meaningful variables like duration, distance, degree of
# rotation, size of scaling, selection of Subcard etc.
# This means that I would have to delete all "unclosed" events.

# Create a data frame with
# case    event    attributes (can differ for different events)
# ??

# Is `artwork` my case? Or `artwork` per day? Or `artwork` per some other
# unit??? Maybe look at differences between timestamps separately for
# `artwork`? And identify "new observational unit" this way?
#
# Definition: (???)
# 1. Touching a new `artwork` corresponds to "observational unit change"
# 2. Time interval of XX min within one `artwork` on the same day
#    corresponds to "observational unit change"

# id    activity    timestamp

# Split data frame in list of data frame which all correspond to one
# artwork
# dat_art <- split(dat, dat$artwork)
## --> Maybe need it at some point?

#' # Problems

#' * Opening and closing of events cannot be identified unambiguously; it
#'   can happen that the wrong tags have been put together (e.g., Transform
#'   start and Transform stop); therefore, durations etc. are only heuristic