From e88981e3b96ea77a75e6e79001c273f0837e7111 Mon Sep 17 00:00:00 2001 From: nwickel Date: Thu, 20 Jul 2023 17:06:28 +0200 Subject: [PATCH] Started seriously working on data preprocessing; very intermediate version though --- code/01_parse-logfiles.R | 5 + code/01b_investigate.R | 224 +++++++++++++++++++++++++++++++++++++++ code/02_preprocessing.R | 135 ++++++++++++++++++++++- 3 files changed, 359 insertions(+), 5 deletions(-) create mode 100644 code/01b_investigate.R diff --git a/code/01_parse-logfiles.R b/code/01_parse-logfiles.R index 9084333..24d7b11 100644 --- a/code/01_parse-logfiles.R +++ b/code/01_parse-logfiles.R @@ -37,6 +37,7 @@ knitr::opts_chunk$set(warning = FALSE, message = FALSE) #' Choose which folders with raw log files should be included: folders <- "all" +#folders <- "_2016b" dirpaths <- paste0("../data/haum_logs_2016-2023/", folders) @@ -129,6 +130,10 @@ str(dat) head(dat[, 2:ncol(dat)], 20) +# sort by date, since sorting by file names does not make sense because of +# missing left zero padding +dat <- dat[order(dat$date), ] + ## TODO: Replace artwork and popup numbers with informative strings write.table(dat, "../data/rawdata_logfiles.csv", diff --git a/code/01b_investigate.R b/code/01b_investigate.R new file mode 100644 index 0000000..166fbc3 --- /dev/null +++ b/code/01b_investigate.R @@ -0,0 +1,224 @@ +#' --- +#' title: "Preprocessing log files" +#' author: "Nora Wickelmaier" +#' date: "`r Sys.Date()`" +#' output: +#' html_document: +#' toc: true +#' toc_float: true +#' pdf_document: +#' toc: true +#' number_sections: true +#' geometry: margin = 2.5cm +#' --- + +# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code") + +# LogEntry classes: +# TRANSFORM_START: "Transform start" --> "Transformation Start" in Tool +# TRANSFORM_STOP: "Transform stop" +# START_APPLICATION: "Start Application" +# SHOW_APPLICATION: "Show Application" +# SHOW_INFO: "Show Info" --> "Flip Card" in Tool +# SHOW_FRONT: "Show Front" +# 
SHOW_POPUP: "ShowPopup" --> "Show Popup" in Tool +# HIDE_POPUP: "HidePopup" +# ARTWORK: "Artwork" --> "Show Topic" in Tool + +#' # Read data + +dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";", header = TRUE) +dat0$date <- as.POSIXct(dat0$date) # create date object + +plot(dat0$time_ms[1:3000], type = "l") + +# what happens here? Why does `time_ms` go down, but not to 0? +plot(dat0$time_ms[2500:3000], type = "l") +plot(dat0$time_ms[2755:2765], type = "l") # "zoom in" +dat0[2755:2765, ] +# --> overall time stamp keeps going up... + +# TODO: How to create a plot that gives the same information based on +# `time_ms` and `date`?? +plot(time_ms ~ date, dat0[1:5000, ], type = "b") +abline(h = 0, col = "red", lty = 3) +# Visualize night +plot(time_ms ~ date, dat0[1:10000, ], type = "b") + + +# Not all `Start Application` have `time_ms = 0` - why?? + +dat0[125537:125542, ] +dat0[6673501:6673510, ] +# --> What's happening here? + +table(dat0[dat0$event %in% "Start Application", c("event", "date", "time_ms")]$time_ms) +# 0 1 15 16 296 2819 2914 3191 5316 6535 +# 3131 4 21 48 1 1 1 1 1 1 +# --> ??? 
+dat0[dat0$event == "Start Application" & dat0$time_ms == 6535, ] +dat0[989313:989317, ] + +dat0[dat0$event == "Start Application" & dat0$time_ms == 5316, ] +dat0[2071078:2071082, ] + +dat0[dat0$event == "Start Application" & dat0$time_ms == 3191, ] +dat0[2851863:2851867, ] + +dat0[dat0$event == "Start Application" & dat0$time_ms == 16, ] +dat0[156382:156386, ] +dat0[5566940:5566947, ] +# --> pattern is *not* consistent + +dat0[dat0$event == "Start Application" & dat0$time_ms == 1, ] +dat0[125537:125542, ] + + +xtabs( ~ event + as.Date(date), dat0[1:1000, ]) + +# How many days do we have with up to 8 "Start Applications" +table(xtabs( ~ event + as.Date(date), dat0[dat0$event == "Start Application", ])) +# 1 2 3 4 5 6 7 8 +# 381 657 272 86 37 14 10 2 +# --> 8 days without any "Start Application" +length(unique(as.Date(dat0$date))) - + length(xtabs( ~ event + as.Date(date), dat0[dat0$event == "Start Application", ])) + +# But only 6 files with 2 "Start Applications" +table(xtabs( ~ event + fileid, dat0[dat0$event == "Start Application", ])) +# 1 2 +# 3198 6 +# --> That means we have 36,563 file ids without any "Start Application" + + +#' # Remove irrelevant events + +#' ## Remove Start Application and Show Application + +dat <- subset(dat0, !(dat0$event %in% c("Start Application", "Show Application"))) + +#' ## Remove "button presses" + +# Sort data frame by artwork and date +dat <- dat[order(dat$artwork, dat$date), ] + +# remove "Transform start" and "Transform stop" following directly each +# other, since I do not know how to interpret them as events +id_start <- which(dat$event == "Transform start") +id_stop <- which(dat$event == "Transform stop") + +id_rm_start <- id_start[diff(id_start) == 1] +id_rm_stop <- id_stop[diff(id_stop) == 1] + +dat <- dat[-c(id_rm_start, id_rm_stop), ] +rownames(dat) <- NULL + + +id_start2 <- which(dat$event == "Transform start") +id_stop2 <- which(dat$event == "Transform stop") + +length(id_start2) - length(id_stop2) +# 340 --> 
"starts too many" + +# remove "Transform start" and "Transform stop" following directly each +# other (but with events in between!) +id_start_new <- id_start2 +id_stop_new <- id_stop2 + +for (i in 2:length(id_start_new)) { + if (id_start_new[i-1] < id_stop_new[i-1] & id_start_new[i] < id_stop_new[i-1]) { + id_start_new <- id_start_new[-(i-1)] + } else if (id_start_new[i-1] > id_stop_new[i-1] & id_start_new[i] > id_stop_new[i-1]) { + id_stop_new <- id_stop_new[-(i-1)] + } +} + +length(id_start2) - length(id_start_new) +length(id_stop2) - length(id_stop_new) + +ids <- data.frame(start = id_start_new, stop = id_stop_new) +ids$diff <- ids$stop - ids$start + +table(ids$diff) + +# remove "Transform start" and "Transform stop" around other events + +id_rm_start2 <- id_start2[!(id_start2 %in% id_start_new)] +id_rm_stop2 <- id_stop2[!(id_stop2 %in% id_stop_new)] + +# TODO: It still does not work correctly: +dat[64764:64769,] +# time_ms event artwork popup x y scale rotation +# 64764 473081 Transform start 052 052.xml 1958.65 1505.75 0.8234455 -0.1351998 +# 64765 474226 Show Info 052 052.xml NA NA NA NA +# 64766 475735 Transform start 052 052.xml 1988.25 1625.25 0.9927645 2.4527958 +# 64767 475739 Transform stop 052 052.xml 1988.25 1625.25 0.9927645 2.4527958 +# 64768 479326 Artwork 052 052.xml NA NA NA NA +# 64769 479751 Transform stop 052 052.xml 1660.90 1883.20 0.8074586 29.0875534 + +# --> but no idea how to find these cases in an automated way... + +dat <- dat[-c(id_rm_start2, id_rm_stop2), ] +# --> Every start ends with a stop now (but not necessarily the correct one!) 
+ + +dat1 <- dat[order(dat$date, dat$time_ms), ] +dat1$time_diff <- c(NA, diff(dat1$time_ms)) + +boxplot(time_diff ~ as.Date(date), dat1[dat1$time_diff > 1000 & dat1$time_diff < 4000, ]) + +boxplot(time_ms ~ event, dat1) + + +#' ## Plots + +counts <- table(as.Date(dat$date), dat$event) +lattice::barchart(counts, auto.key = TRUE) + + +start_events <- c("Transform start", "Show Info", "ShowPopup", "Artwork/OpenCard") + +counts <- table(as.Date(dat$date[dat$event %in% start_events]), + dat$event[dat$event %in% start_events]) +lattice::barchart(counts, auto.key = TRUE) + + +# TODO: Do I want to "collapse" the data frame in such a way that I only have +# one event for each "set", meaning +# +# * Transform start + Transform stop --> Transform +# * Artwork/OpenCard + Artwork/CloseCard --> Show Subcard +# * ShowPopup + HidePopup --> Show Popup +# * Show Info + Show Front --> Flip Card +# (see above ;)) +# +# Then I would have meaningful variables like duration, distance, degree of +# rotation, size of scaling, selection of Subcard etc. +# This means that I would have to delete all "unclosed" events. + +# Create a data frame with +# case event attributes (can differ for different events) +# ?? +# Is `artwork` my case? Or `artwork` per day? Or `artwork` per some other +# unit??? Maybe look at differences between timestamps separately for +# `artwork`? And identify "new observational unit" this way? +# +# Definition: (???) +# 1. Touching a new `artwork` corresponds to "observational unit change" +# 2. Time interval of XX min within one `artwork` on the same day +# corresponds to "observational unit change" + +# id activity timestamp + +# Split data frame into a list of data frames which all correspond to one +# artwork +# dat_art <- split(dat, dat$artwork) + +## --> Maybe need it at some point? 
+ +#' # Problems + +#' * Opening and closing of events cannot be identified unambiguously; it +#' can happen that the wrong tags have been put together (e.g., Transform +#' start and Transform stop); therefore, durations etc. are only heuristic + diff --git a/code/02_preprocessing.R b/code/02_preprocessing.R index 888ff86..6264eb6 100644 --- a/code/02_preprocessing.R +++ b/code/02_preprocessing.R @@ -2,7 +2,7 @@ #' title: "Preprocessing log files" #' author: "Nora Wickelmaier" #' date: "`r Sys.Date()`" -#' output: +#' output: #' html_document: #' toc: true #' toc_float: true @@ -27,15 +27,140 @@ #' # Read data -dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";", header = TRUE) +dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";", + header = TRUE) +dat0$date <- as.POSIXct(dat0$date) # create date object + +# TODO: Add a case identifier based on timestamps -- needs to be done on +# "raw data". Is it possible? Something seems seriously wrong with +# `time_ms` #' # Remove irrelevant events #' ## Remove Start Application and Show Application -dat <- subset(dat0, !(dat0$event %in% c("Start Application", "Show Application"))) -dat$logs <- NULL # do not need original log files -dat$date <- as.POSIXct(dat$date) # create date object +dat <- subset(dat0, !(dat0$event %in% c("Start Application", + "Show Application"))) + +#' # Close events + +#' Do it for Transform events first +tmp <- dat[dat$event %in% c("Transform start", "Transform stop"), ] +tmp <- tmp[order(tmp$artwork, tmp$date), ] +rownames(tmp) <- NULL + +# Find out how often "Transform start" follows each other +num_start <- diff(c(0, which(tmp$event == "Transform stop"))) +tmp$eventid <- rep(seq_along(num_start), num_start) +head(tmp[, c("event", "eventid")], 25) + +table(table(tmp$eventid)) +# 1 2 3 4 5 6 7 8 10 11 +# 73 78429 5156 842 222 66 18 14 3 1 +# --> compare to table(num_start)! 
+ +# Find out how often "Transform stop" follows each other +num_stop <- c(diff(c(0, which(tmp$event == "Transform start")))) +table(num_stop) + +tmp$eventrep <- rep(num_start, num_start) +tmp$dupl <- duplicated(tmp[, c("event", "eventid")]) # keep first +tmp$dupl <- duplicated(tmp[, c("event", "eventid")], fromLast = TRUE) # keep last +tmp[tmp$eventrep == 10, ] + +tmp$dupl <- NULL +tmp$eventrep <- NULL + + +# remove duplicated "Transform start" events +tmp <- tmp[!duplicated(tmp[, c("event", "eventid")]), ] + +# remove duplicated "Transform stop" events +id_stop <- which(tmp$event == "Transform stop") +id_rm_stop <- id_stop[diff(id_stop) == 1] + +tmp <- tmp[-(id_rm_stop + 1), ] + +# transform to wide data format +tmp$event <- ifelse(tmp$event == "Transform start", "start", "stop") + +trans_wide <- reshape(tmp, direction = "wide", + idvar = c("eventid", "artwork"), + timevar = "event", drop = c("fileid", "popup", "card") +) + +rownames(trans_wide) <- NULL +# --> when fileid is part of the reshape, it does not work correctly, since +# we sometimes have a start - stop event that is recorded in two separate +# log files +# TODO: This runs for quite some time. Is this more efficient with dplyr? + +# which(is.na(trans_wide$date.start)) + +trans_wide$duration <- trans_wide$time_ms.stop - trans_wide$time_ms.start +trans_wide$distance <- apply(trans_wide[, c("x.start", "y.start", "x.stop", "y.stop")], + 1, function(x) dist(matrix(x, 2, 2, byrow = TRUE))) +trans_wide$rotationDegree <- trans_wide$rotation.stop - trans_wide$rotation.start +trans_wide$scaleSize <- trans_wide$scale.stop - trans_wide$scale.start + +trans_wide <- trans_wide[trans_wide$distance != 0 & + trans_wide$rotationDegree != 0 & + trans_wide$scaleSize != 0, ] +# removes almost 2/3 of the data (for small data set) + + +# TODO: How do I handle popups from glossar??? + + + + + + +# Should every "Show front" be the beginning of a new trace? 
+# Should Transform events be handled separately and then be "added" again +# by timestamp? + +######## +tmp <- dat[!dat$event %in% c("Transform start", "Transform stop"), ] +rownames(tmp) <- NULL + +tmp$trace <- NA +last_event <- tmp$event[1] + +for (art in unique(tmp$artwork)) { + + for (i in 1:nrow(tmp)) { + + if (last_event == "Show Info" & (tmp$artwork[i] == art | + tmp$artwork[i] == "glossar")) { + tmp$trace[i] <- "start" + } else if (last_event == "Show Front" & (tmp$artwork[i] == art | + tmp$artwork[i] == "glossar")) { + tmp$trace[i] <- "stop" + } + last_event <- tmp$event[i + 1] + } +} + + +head(tmp[4:ncol(tmp)], 50) +# TODO: Great job! You used a for-loop to rename "Show info" and "Show +# front" to "start" and "stop" ;) + + + + + + + + + + + + + + + #' ## Remove "button presses"