165 lines
5.5 KiB
R
165 lines
5.5 KiB
R
|
#' ---
#' title: "Preprocessing log files"
#' author: "Nora Wickelmaier"
#' date: "`r Sys.Date()`"
#' output:
#'   html_document:
#'     toc: true
#'     toc_float: true
#'   pdf_document:
#'     toc: true
#'     number_sections: true
#' geometry: margin = 2.5cm
#' ---
|
||
|
|
||
|
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
|
||
|
|
||
|
# LogEntry classes:
|
||
|
# TRANSFORM_START: "Transform start" --> "Transformation Start" in Tool
|
||
|
# TRANSFORM_STOP: "Transform stop"
|
||
|
# START_APPLICATION: "Start Application"
|
||
|
# SHOW_APPLICATION: "Show Application"
|
||
|
# SHOW_INFO: "Show Info" --> "Flip Card" in Tool
|
||
|
# SHOW_FRONT: "Show Front"
|
||
|
# SHOW_POPUP: "ShowPopup" --> "Show Popup" in Tool
|
||
|
# HIDE_POPUP: "HidePopup"
|
||
|
# ARTWORK: "Artwork" --> "Show Topic" in Tool
|
||
|
|
||
|
#' # Read data
|
||
|
|
||
|
# Raw log export: semicolon-separated, first row holds the column names
dat0 <- read.table(file = "../data/rawdata_logfiles.csv", header = TRUE,
                   sep = ";")

#' # Remove irrelevant events

#' ## Remove Start Application and Show Application

# Application start/show entries are dropped; all other events are kept
app_events <- c("Start Application", "Show Application")
dat <- dat0[!(dat0$event %in% app_events), ]
dat$logs <- NULL                  # original log lines are not needed
dat$date <- as.POSIXct(dat$date)  # turn the date column into a date object
|
||
|
|
||
|
#' ## Remove "button presses"
|
||
|
|
||
|
# Sort data frame by artwork and date
dat <- dat[order(dat$artwork, dat$date), ]

# Remove "Transform start" (and likewise "Transform stop") events that sit
# in directly adjacent rows, since I do not know how to interpret them as
# events. Of two adjacent rows of the same type, the earlier one is dropped.
id_start <- which(dat$event == "Transform start")
id_stop  <- which(dat$event == "Transform stop")

# diff() returns one element fewer than its input. Wrapping the comparison
# in which() yields positions instead of a too-short logical index, which R
# would silently recycle (and thereby possibly select the last element).
id_rm_start <- id_start[which(diff(id_start) == 1)]
id_rm_stop  <- id_stop[which(diff(id_stop) == 1)]

# Keep rows by positive index: dat[-integer(0), ] would return zero rows
# (instead of all rows) when there is nothing to remove
id_keep <- setdiff(seq_len(nrow(dat)), c(id_rm_start, id_rm_stop))
dat <- dat[id_keep, ]
rownames(dat) <- NULL
|
||
|
|
||
|
|
||
|
# Row positions of the transform events in the current `dat`
is_transform_start <- dat$event == "Transform start"
is_transform_stop  <- dat$event == "Transform stop"
id_start2 <- which(is_transform_start)
id_stop2  <- which(is_transform_stop)

# Difference between the number of starts and the number of stops
length(id_start2) - length(id_stop2)
# 340 --> "starts too many"
|
||
|
|
||
|
# Match every "Transform start" with the "Transform stop" that closes it
# (other events may lie in between!). Looking only at the transform events
# in row order:
#   * a start is kept if the next transform event is a stop; when several
#     starts precede a stop, only the last of them survives
#   * a stop is kept if the previous transform event is a start; stops
#     without an open start are discarded
# Every kept start is therefore immediately followed (among the transform
# events) by exactly one kept stop, so both index vectors have the same
# length and are paired by position.
#
# NOTE(review): matching is based on row order only, so a pair could span
# two different artworks -- confirm whether this needs to be guarded.
id_trans <- sort(c(id_start2, id_stop2))
is_start <- id_trans %in% id_start2
n_trans  <- length(id_trans)

# Shifted copies of `is_start`; the ends are padded with FALSE so that a
# trailing start and a leading stop are never kept. Indexing with
# seq_len() keeps this correct when there are no transform events at all.
next_is_stop  <- c(!is_start, FALSE)[seq_len(n_trans) + 1L]
prev_is_start <- c(FALSE, is_start)[seq_len(n_trans)]

id_start_new <- id_trans[is_start & next_is_stop]
id_stop_new  <- id_trans[!is_start & prev_is_start]
|
||
|
|
||
|
# Number of starts and of stops that were discarded by the matching step
length(id_start2) - length(id_start_new)
length(id_stop2) - length(id_stop_new)

# One row per start/stop pair; `diff` is the number of rows between them
ids <- data.frame(start = id_start_new,
                  stop  = id_stop_new,
                  diff  = id_stop_new - id_start_new)

table(ids[["diff"]])
|
||
|
|
||
|
# Remove the "Transform start" and "Transform stop" events that did not end
# up in a start/stop pair (i.e., are not part of id_start_new/id_stop_new)

id_rm_start2 <- setdiff(id_start2, id_start_new)
id_rm_stop2  <- setdiff(id_stop2, id_stop_new)

# TODO: It still does not work correctly:
dat[64764:64769, ]
# time_ms event artwork popup x y scale rotation
# 64764 473081 Transform start 052 052.xml 1958.65 1505.75 0.8234455 -0.1351998
# 64765 474226 Show Info 052 052.xml NA NA NA NA
# 64766 475735 Transform start 052 052.xml 1988.25 1625.25 0.9927645 2.4527958
# 64767 475739 Transform stop 052 052.xml 1988.25 1625.25 0.9927645 2.4527958
# 64768 479326 Artwork 052 052.xml NA NA NA NA
# 64769 479751 Transform stop 052 052.xml 1660.90 1883.20 0.8074586 29.0875534

# --> but no idea how to find these cases in an automated way...

# Keep rows by positive index: dat[-integer(0), ] would return zero rows
# (instead of all rows) when there is nothing to remove
id_keep2 <- setdiff(seq_len(nrow(dat)), c(id_rm_start2, id_rm_stop2))
dat <- dat[id_keep2, ]
# --> Every start ends with a stop now (but not necessarily the correct one!)
|
||
|
|
||
|
|
||
|
# Order by date and time_ms, then take the difference in time_ms between
# consecutive rows (no predecessor for the first row, hence NA)
dat1 <- dat[with(dat, order(date, time_ms)), ]
dat1$time_diff <- c(NA, diff(dat1$time_ms))

# Per-day distribution of the differences strictly between 1000 and 4000
in_range <- dat1$time_diff > 1000 & dat1$time_diff < 4000
boxplot(time_diff ~ as.Date(date), data = dat1[in_range, ])

# Distribution of time_ms for each event type
boxplot(time_ms ~ event, data = dat1)
|
||
|
|
||
|
|
||
|
#' ## Plots
|
||
|
|
||
|
# Number of logged events per day and event type
counts <- table(as.Date(dat$date), dat$event)
lattice::barchart(counts, auto.key = TRUE)

# Same chart, restricted to the events that open an interaction
# NOTE(review): the event list at the top of this file names the topic
# event "Artwork", not "Artwork/OpenCard" -- verify against the data.
start_events <- c(
  "Transform start",
  "Show Info",
  "ShowPopup",
  "Artwork/OpenCard"
)
is_start_event <- dat$event %in% start_events

counts <- table(as.Date(dat$date[is_start_event]),
                dat$event[is_start_event])
lattice::barchart(counts, auto.key = TRUE)
|
||
|
|
||
|
|
||
|
# TODO: Do I want to "collapse" the data frame in such a way that I only
# have one event for each "set", meaning
#
# * Transform start + Transform stop --> Transform
# * Artwork/OpenCard + Artwork/CloseCard --> Show Subcard
# * ShowPopup + HidePopup --> Show Popup
# * Show Info + Show Front --> Flip Card
# (see above ;))
#
# Then I would have meaningful variables like duration, distance, degree of
# rotation, size of scaling, selection of Subcard etc.
# This means that I would have to delete all "unclosed" events.
|
||
|
|
||
|
# Create a data frame with
|
||
|
# case event attributes (can differ for different events)
|
||
|
# ??
|
||
|
# Is `artwork` my case? Or `artwork` per day? Or `artwork` per some other
|
||
|
# unit??? Maybe look at differences between timestamps separately for
|
||
|
# `artwork`? And identify "new observational unit" this way?
|
||
|
#
|
||
|
# Definition: (???)
|
||
|
# 1. Touching a new `artwork` corresponds to "observational unit change"
|
||
|
# 2. Time interval of XX min within one `artwork` on the same day
|
||
|
# corresponds to "observational unit change"
|
||
|
|
||
|
# id activity timestamp
|
||
|
|
||
|
# Split data frame in list of data frame which all correspond to one
|
||
|
# artwork
|
||
|
# dat_art <- split(dat, dat$artwork)
|
||
|
|
||
|
## --> Maybe need it at some point?
|
||
|
|
||
|
#' # Problems
|
||
|
|
||
|
#' * Opening and closing of events cannot be identified unambiguously; it
|
||
|
#' can happen that the wrong tags have been put together (e.g., Transform
|
||
|
#' start and Transform stop); therefore, durations etc. are only heuristic
|
||
|
|