diff --git a/code/zz_investigate.R b/code/zz_investigate.R deleted file mode 100644 index 3cbb897..0000000 --- a/code/zz_investigate.R +++ /dev/null @@ -1,248 +0,0 @@ -#' --- -#' title: "Preprocessing log files" -#' author: "Nora Wickelmaier" -#' date: "`r Sys.Date()`" -#' output: -#' html_document: -#' toc: true -#' toc_float: true -#' pdf_document: -#' toc: true -#' number_sections: true -#' geometry: margin = 2.5cm -#' --- - -# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code") - -# LogEntry classes: -# TRANSFORM_START: "Transform start" --> "Transformation Start" in Tool -# TRANSFORM_STOP: "Transform stop" -# START_APPLICATION: "Start Application" -# SHOW_APPLICATION: "Show Application" -# SHOW_INFO: "Show Info" --> "Flip Card" in Tool -# SHOW_FRONT: "Show Front" -# SHOW_POPUP: "ShowPopup" --> "Show Popup" in Tool -# HIDE_POPUP: "HidePopup" -# ARTWORK: "Artwork" --> "Show Topic" in Tool - -#' # Read data - -dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";", header = TRUE) -dat0$date <- as.POSIXct(dat0$date) # create date object - -plot(dat0$time_ms[1:3000], type = "l") - -# what happens here? Why does `time_ms` go down, but not to 0? -plot(dat0$time_ms[2500:3000], type = "l") -plot(dat0$time_ms[2755:2765], type = "l") # "zoom in" -dat0[2755:2765, ] -# --> overall time stamp keeps going up... - -# TODO: How to create a plot that gives the same information based on -# `time_ms` und `date`?? -plot(time_ms ~ date, dat0[1:5000, ], type = "b") -abline(h = 0, col = "red", lty = 3) -# Visualize night -plot(time_ms ~ date, dat0[1:10000, ], type = "b") - - -# Not all `Start Application` have `time_ms = 0` - why?? - -dat0[125537:125542, ] -dat0[6673501:6673510, ] -# --> What's happening here? - -table(dat0[dat0$event %in% "Start Application", c("event", "date", "time_ms")]$time_ms) -# 0 1 15 16 296 2819 2914 3191 5316 6535 -# 3131 4 21 48 1 1 1 1 1 1 -# --> ??? -dat0[dat0$event == "Start Application" & dat0$time_ms == 6535, ] -dat0[989313:989317, ] - -dat0[dat0$event == "Start Application" & dat0$time_ms == 5316, ] -dat0[2071078:2071082, ] - -dat0[dat0$event == "Start Application" & dat0$time_ms == 3191, ] -dat0[2851863:2851867, ] - -dat0[dat0$event == "Start Application" & dat0$time_ms == 16, ] -dat0[156382:156386, ] -dat0[5566940:5566947, ] -# --> pattern is *not* consistent - -dat0[dat0$event == "Start Application" & dat0$time_ms == 1, ] -dat0[125537:125542, ] - - -xtabs( ~ event + as.Date(date), dat0[1:1000, ]) - -# How many days do we have with up to 8 "Start Applications" -table(xtabs( ~ event + as.Date(date), dat0[dat0$event == "Start Application", ])) -# 1 2 3 4 5 6 7 8 -# 381 657 272 86 37 14 10 2 -# --> 8 days without any "Start Application" -length(unique(as.Date(dat0$date))) - - length(xtabs( ~ event + as.Date(date), dat0[dat0$event == "Start Application", ])) - -# But only 6 files with 2 "Start Applications" -table(xtabs( ~ event + fileid, dat0[dat0$event == "Start Application", ])) -# 1 2 -# 3198 6 -# --> That means we have 36,563 file ids without any "Start Application" - - -#' # Remove irrelevant events - -#' ## Remove Start Application and Show Application - -dat <- subset(dat0, !(dat0$event %in% c("Start Application", "Show Application"))) - -#' ## Remove "button presses" - -# Sort data frame by artwork and date -dat <- dat[order(dat$artwork, dat$date), ] - -# remove "Transform start" and "Transform stop" following directly each -# other, since I do not know how to interpret them as events -id_start <- which(dat$event == "Transform start") -id_stop <- which(dat$event == "Transform stop") - -id_rm_start <- id_start[diff(id_start) == 1] -id_rm_stop <- id_stop[diff(id_stop) == 1] - -dat <- dat[-c(id_rm_start, id_rm_stop), ] -rownames(dat) <- NULL - - -id_start2 <- which(dat$event == "Transform start") -id_stop2 <- which(dat$event == "Transform stop") - -length(id_start2) - length(id_stop2) -# 340 --> "starts too many" - -# remove "Transform start" and "Transform stop" following directly each -# other (but with events in between!) -id_start_new <- id_start2 -id_stop_new <- id_stop2 - -for (i in 2:length(id_start_new)) { - if (id_start_new[i-1] < id_stop_new[i-1] & id_start_new[i] < id_stop_new[i-1]) { - id_start_new <- id_start_new[-(i-1)] - } else if (id_start_new[i-1] > id_stop_new[i-1] & id_start_new[i] > id_stop_new[i-1]) { - id_stop_new <- id_stop_new[-(i-1)] - } -} - -length(id_start2) - length(id_start_new) -length(id_stop2) - length(id_stop_new) - -ids <- data.frame(start = id_start_new, stop = id_stop_new) -ids$diff <- ids$stop - ids$start - -table(ids$diff) - -# remove "Transform start" and "Transform stop" around other events - -id_rm_start2 <- id_start2[!(id_start2 %in% id_start_new)] -id_rm_stop2 <- id_stop2[!(id_stop2 %in% id_stop_new)] - -# TODO: It still does not work correctly: -dat[64764:64769,] -# time_ms event artwork popup x y scale rotation -# 64764 473081 Transform start 052 052.xml 1958.65 1505.75 0.8234455 -0.1351998 -# 64765 474226 Show Info 052 052.xml NA NA NA NA -# 64766 475735 Transform start 052 052.xml 1988.25 1625.25 0.9927645 2.4527958 -# 64767 475739 Transform stop 052 052.xml 1988.25 1625.25 0.9927645 2.4527958 -# 64768 479326 Artwork 052 052.xml NA NA NA NA -# 64769 479751 Transform stop 052 052.xml 1660.90 1883.20 0.8074586 29.0875534 - -# --> but no idea how to find these cases in an automated way... - -dat <- dat[-c(id_rm_start2, id_rm_stop2), ] -# --> Every start ends with a stop now (but not necessarily the correct one!) - - -dat1 <- dat[order(dat$date, dat$time_ms), ] -dat1$time_diff <- c(NA, diff(dat1$time_ms)) - -boxplot(time_diff ~ as.Date(date), dat1[dat1$time_diff > 1000 & dat1$time_diff < 4000, ]) - -boxplot(time_ms ~ event, dat1) - - -#' ## Plots - -counts <- table(as.Date(dat$date), dat$event) -lattice::barchart(counts, auto.key = TRUE) - - -start_events <- c("Transform start", "Show Info", "ShowPopup", "Artwork/OpenCard") - -counts <- table(as.Date(dat$date[dat$event %in% start_events]), - dat$event[dat$event %in% start_events]) -lattice::barchart(counts, auto.key = TRUE) - - -# TODO: Do I want to "collapse" the data frame in a way, that I only have -# one event for each "set", meaning -# -# * Transform start + Transform stop --> Transform -# * Artwork/OpenCard + Artwork/CloseCard --> Show Subcard -# * ShowPopup + HidePopup --> Show Popup -# * Show Info + Show Front --> Flip Card -# (s.o. ;)) -# -# Then I would have meaningful variables like duration, distance, degree of -# rotation, size of scaling, selection of Subcard etc. -# This means that I would have to delete all "unclosed" events. - -# Create a data frame with -# case event attributes (can differ for different events) -# ?? -# Is `artwork` my case? Or `artwork` per day? Or `artwork` per some other -# unit??? Maybe look at differences between timestamps separately for -# `artwork`? And identify "new observational unit" this way? -# -# Definition: (???) -# 1. Touching a new `artwork` corresponds to "observational unit change" -# 2. Time interval of XX min within one `artwork` on the same day -# corresponds to "observational unit change" - -# id activity timestamp - -# Split data frame in list of data frame which all correspond to one -# artwork -# dat_art <- split(dat, dat$artwork) - -## --> Maybe need it at some point? - -#' # Problems - -#' * Opening and closing of events cannot be identified unambiguously; it -#' can happen that the wrong tags have been put together (e.g., Transform -#' start and Transform stop); therefore, durations etc. are only heuristic - - - -#' ## Plots - -counts <- table(as.Date(dat$date), dat$event) -lattice::barchart(counts, auto.key = TRUE) - - -start_events <- c("Transform start", "Show Info", "ShowPopup", "Artwork/OpenCard") - -counts <- table(as.Date(dat$date[dat$event %in% start_events]), - dat$event[dat$event %in% start_events]) -lattice::barchart(counts, auto.key = TRUE) - - -# TODO: Ask Phillip what is wrong with `time_ms` -# --> Hat er eine Erklärung dafür? - -#plot(time_ms.stop ~ time_ms.start, dat_trans, type = "b") -plot(time_ms.stop ~ time_ms.start, dat_trans, - col = rgb(red = 0, green = 0, blue = 0, alpha = 0.2)) - -plot(date.stop ~ date.start, dat_trans[1:1000,], type = "b") -