mtt_haum/code/02_preprocessing.R

525 lines
17 KiB
R
Raw Normal View History

2023-06-26 10:30:07 +02:00
#' ---
#' title: "Preprocessing log files"
#' author: "Nora Wickelmaier"
#' date: "`r Sys.Date()`"
#' output:
2023-06-26 10:30:07 +02:00
#' html_document:
#' toc: true
#' toc_float: true
#' pdf_document:
#' toc: true
#' number_sections: true
#' geometry: margin = 2.5cm
#' ---
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
# LogEntry classes:
# TRANSFORM_START: "Transform start" --> "Transformation Start" in Tool
# TRANSFORM_STOP: "Transform stop"
# START_APPLICATION: "Start Application"
# SHOW_APPLICATION: "Show Application"
# SHOW_INFO: "Show Info" --> "Flip Card" in Tool
# SHOW_FRONT: "Show Front"
# SHOW_POPUP: "ShowPopup" --> "Show Popup" in Tool
# HIDE_POPUP: "HidePopup"
# ARTWORK: "Artwork" --> "Show Topic" in Tool
#' # Read data
dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";",
header = TRUE)
dat0$date <- as.POSIXct(dat0$date) # create date object
# TODO: Add a case identifier based on timestamps -- needs to be done on
# "raw data". Is it possible? Something seems seriously wrong with
# `time_ms`
2023-06-26 10:30:07 +02:00
#' # Remove irrelevant events
#' ## Remove Start Application and Show Application
dat <- subset(dat0, !(dat0$event %in% c("Start Application",
"Show Application")))
#' # Close events
# TODO:
# Should every "Show front" be the beginning of a new trace?
# Should Transform events be handled separately and then be "added" again
# by timestamp?
########
#' Do it for Transform events first
tmp <- dat[dat$event %in% c("Transform start", "Transform stop"), ]
tmp <- tmp[order(tmp$artwork, tmp$date), ]
rownames(tmp) <- NULL
# Create event ID for closing events
num_start <- diff(c(0, which(tmp$event == "Transform stop")))
tmp$eventid <- rep(seq_along(num_start), num_start)
head(tmp[, c("event", "eventid")], 25)
table(table(tmp$eventid))
# 1 2 3 4 5 6 7 8 10 11
# 73 78429 5156 842 222 66 18 14 3 1
# --> compare to table(num_start)!
# Find out how often "Transform stop" follows each other
num_stop <- c(diff(c(0, which(tmp$event == "Transform start"))))
table(num_stop)
tmp$eventrep <- rep(num_start, num_start)
tmp$dupl <- duplicated(tmp[, c("event", "eventid")]) # keep first
tmp$dupl <- duplicated(tmp[, c("event", "eventid")], fromLast = TRUE) # keep last
tmp[tmp$eventrep == 10, ]
tmp$dupl <- NULL
tmp$eventrep <- NULL
# remove duplicated "Transform start" events
tmp <- tmp[!duplicated(tmp[, c("event", "eventid")]), ]
# remove duplicated "Transform stop" events
id_stop <- which(tmp$event == "Transform stop")
id_rm_stop <- id_stop[diff(id_stop) == 1]
tmp <- tmp[-(id_rm_stop + 1), ]
# transform to wide data format
tmp$event <- ifelse(tmp$event == "Transform start", "start", "stop")
trans_wide <- reshape(tmp, direction = "wide",
idvar = c("eventid", "artwork"),
timevar = "event", drop = c("fileid", "popup", "card")
)
# --> when fileid is part of the reshape, it does not work correctly, since
# we sometimes have a start - stop event that is recorded in two separate
# log files
# TODO: This runs for quite some time. Is this more efficient with dplyr?
# which(is.na(trans_wide$date.start))
trans_wide$event <- "move"
trans_wide$eventid <- NULL
rownames(trans_wide) <- NULL
trans_wide$duration <- trans_wide$time_ms.stop - trans_wide$time_ms.start
#trans_wide$duration2 <- trans_wide$date.stop - trans_wide$date.start
# only seconds - not fine grained enough
trans_wide$distance <- apply(
trans_wide[, c("x.start", "y.start", "x.stop", "y.stop")], 1,
function(x) dist(matrix(x, 2, 2, byrow = TRUE)))
trans_wide$rotationDegree <- trans_wide$rotation.stop -
trans_wide$rotation.start
trans_wide$scaleSize <- trans_wide$scale.stop - trans_wide$scale.start
dat_trans <- trans_wide[trans_wide$distance != 0 &
trans_wide$rotationDegree != 0 &
trans_wide$scaleSize != 0, ]
# removes almost 2/3 of the data (for small data set)
dat_trans <- dat_trans[, c("event", "artwork",
"date.start", "date.stop",
"time_ms.start", "time_ms.stop", "duration",
"x.start", "y.start", "x.stop", "y.stop", "distance",
"scale.start", "scale.stop", "scaleSize",
"rotation.start", "rotation.stop", "rotationDegree")]
summary(dat_trans)
# TODO: Phillip fragen was mit `time_ms` schief läuft... Hat er eine
# Erklärung dafür?
#plot(time_ms.stop ~ time_ms.start, dat_trans, type = "b")
plot(time_ms.stop ~ time_ms.start, dat_trans, col = rgb(red = 0, green = 0, blue = 0, alpha = 0.2))
plot(date.stop ~ date.start, dat_trans[1:1000,], type = "b")
# TODO: Wie mit duration < 0 umgehen? Einfach auf NA setzen? For now...
dat_trans[dat_trans$duration < 0, "duration"] <- NA
#' # Close other events
tmp <- dat[!dat$event %in% c("Transform start", "Transform stop"), ]
tmp$x <- NULL
tmp$y <- NULL
tmp$scale <- NULL
tmp$rotation <- NULL
rownames(tmp) <- NULL
# Create event ID for closing events
# num_start <- diff(c(0, which(tmp$event == "Show Front")))
# tmp$trace <- rep(seq_along(num_start), num_start)
# head(tmp[, c("artwork", "event", "trace")], 50)
# TODO: How do I handle popups from glossar???
# --> does not work because of glossar entries... can't sort by artwork
tmp$trace <- NA
last_event <- tmp$event[1]
aws <- unique(tmp$artwork)[unique(tmp$artwork) != "glossar"]
#
for (art in aws) { # select artwork
for (i in 1:nrow(tmp)) { # go through rows
if (last_event == "Show Info" & tmp$artwork[i] == art) {
tmp$trace[i] <- i
j <- i
} else if (last_event == "Show Front" & tmp$artwork[i] == art) {
tmp$trace[i] <- j
} else if (!(last_event %in% c("Show Info", "Show Front")) &
tmp$artwork[i] == art) {
tmp$trace[i] <- j
}
if (i <= nrow(tmp)) {
last_event <- tmp$event[i + 1]
}
}
}
head(tmp[, c("artwork", "event", "trace")], 50)
tail(tmp[, c("artwork", "event", "trace")], 50)
## Fix glossar entries
### Find artwork for glossar entry
tmp2 <- tmp[tmp$artwork == "glossar", ]
glossar_files <- unique(tmp2$popup)
# load lookup table for artworks and glossar files
load("../data/glossar_dict.RData")
lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ]
# TODO: Find last "Artwork/OpenCard" that matches possible artwork
head(tmp[, c("artwork", "event", "popup", "trace")], 20)
#df <- NULL
for (file in lut$glossar_file) {
artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])
for (i in seq_len(nrow(tmp))) {
if (tmp$event[i] == "Show Info") {
current_artwork <- tmp[i, "artwork"]
j <- i
k <- i
} else {
current_artwork <- current_artwork
}
if (tmp$event[i] == "Show Front" & tmp$artwork[i] == current_artwork) {
# make sure artwork has not been closed, yet!
k <- i
}
if (tmp$artwork[i] == "glossar" &
(current_artwork %in% artwork_list) &
tmp$popup[i] == file & (j-k == 0)) {
#df <- rbind(df, data.frame(file, current_artwork, i, j))
tmp[i, "trace"] <- tmp[j, "trace"]
}
}
}
# dim(tmp[is.na(tmp$trace), ])
# --> finds about half of the glossar entries for the small data set...
# tmp[apply(df[, c("j", "i")], 1, c), c("artwork", "event", "popup", "trace")]
# REMEMBER: It can never bo 100% correct, since it is always possible that
# several cards are open and that they link at the same glossar entry
# Fill in the glossar_files that are only associated with one artwork
# How many glossar_files are only associated with one artwork?
lut[sapply(lut$artwork, length) == 1, "glossar_file"]
# TODO: Can I fill in the ones that are only associated with one artwork
# easily? I can't come up with something --> maybe ask AK???
# TODO: How can I check if one of the former "Show Infos" is the correct
# one? I can't come up with something --> maybe ask AK???
# for (file in lut$glossar_file) {
#
# artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])
#
# for (i in seq_len(nrow(tmp))) {
#
# if (tmp$event[i] == "Show Info") {
#
# artworks <- NULL
# current_artwork <- tmp[i, "artwork"]
# j <- i
#
# } else {
#
# print(current_artwork)
# artworks <- c(artworks, tmp[i, "artwork"])
# print(artworks)
#
# }
#
# # if (tmp$artwork[i] == "glossar" &
# # (current_artwork %in% artwork_list) &
# # tmp$popup[i] == file) {
# #
# # #df <- rbind(df, data.frame(file, current_artwork, i, j))
# # tmp[i, "trace"] <- tmp[j, "trace"]
#
# # }
# }
# }
# correct: 17940
# incorrect: 17963
# TODO: For now: Exclude not matched glossar entries
df <- subset(tmp, !is.na(tmp$trace))
df <- df[order(df$trace), ]
rownames(df) <- NULL
#' ## Close flipCard
tmp <- subset(df, df$event %in% c("Show Info", "Show Front"))
tmp$event <- ifelse(tmp$event == "Show Info", "start", "stop")
dat_flipCard <- reshape(tmp, direction = "wide",
idvar = c("trace", "artwork"),
timevar = "event", drop = c("fileid", "popup", "card"))
dat_flipCard$event <- "flipCard"
dat_flipCard$duration <- dat_flipCard$time_ms.stop - dat_flipCard$time_ms.start
dat_flipCard$duration <- ifelse(dat_flipCard$duration < 0, NA, dat_flipCard$duration)
# TODO: Wie mit duration < 0 umgehen? Einfach auf NA setzen? For now...
dat_flipCard <- dat_flipCard[, c("event", "artwork", "trace",
"date.start", "date.stop",
"time_ms.start", "time_ms.stop", "duration")]
#' ## Close openTopic
tmp <- subset(df, df$event %in% c("Artwork/OpenCard", "Artwork/CloseCard"))
tmp <- tmp[order(tmp$artwork, tmp$date), ]
rownames(tmp) <- NULL
num_start <- diff(c(0, which(tmp$event == "Artwork/CloseCard")))
tmp$eventid <- rep(seq_along(num_start), num_start)
tmp$event <- ifelse(tmp$event == "Artwork/OpenCard", "start", "stop")
dat_openTopic <- reshape(tmp, direction = "wide",
idvar = c("eventid", "trace", "artwork", "card"),
timevar = "event", drop = c("fileid", "popup"))
dat_openTopic$event <- "openTopic"
dat_openTopic$duration <- dat_openTopic$time_ms.stop - dat_openTopic$time_ms.start
dat_openTopic$duration <- ifelse(dat_openTopic$duration < 0, NA, dat_openTopic$duration)
# TODO: Wie mit duration < 0 umgehen? Einfach auf NA setzen? For now...
dat_openTopic <- dat_openTopic[, c("event", "artwork", "card", "trace",
"date.start", "date.stop",
"time_ms.start", "time_ms.stop", "duration")]
# TODO: card should have a unique identifier for each artwork!
#' ## close openPopup
tmp <- subset(df, df$event %in% c("ShowPopup", "HidePopup"))
tmp <- tmp[order(tmp$artwork, tmp$date), ]
rownames(tmp) <- NULL
num_start <- diff(c(0, which(tmp$event == "HidePopup")))
# last event is "ShowPopup"! Needs to be fixed TODO: Needs to be cought in
# a function!
num_start <- c(num_start, 1)
tmp$eventid <- rep(seq_along(num_start), num_start)
tmp$event <- ifelse(tmp$event == "ShowPopup", "start", "stop")
dat_openPopup <- reshape(tmp, direction = "wide",
idvar = c("eventid", "trace", "artwork", "popup"),
timevar = "event", drop = c("fileid", "card"))
# there is a pathological entry which gets deleted...
# df[df$trace == 4595, ]
# TODO: Some correct entries are not closed:
df[df$trace == 1843, ]
# WHY???
dat_openPopup$event <- "openPopup"
dat_openPopup$duration <- dat_openPopup$time_ms.stop - dat_openPopup$time_ms.start
dat_openPopup$duration <- ifelse(dat_openPopup$duration < 0, NA, dat_openPopup$duration)
# TODO: Wie mit duration < 0 umgehen? Einfach auf NA setzen? For now...
dat_openPopup <- dat_openPopup[, c("event", "artwork", "popup", "trace",
"date.start", "date.stop",
"time_ms.start", "time_ms.stop", "duration")]
# Merge all
dat_all <- merge(dat_trans, dat_flipCard, all = TRUE)
dat_all <- merge(dat_all, dat_openTopic, all = TRUE)
dat_all <- merge(dat_all, dat_openPopup, all = TRUE)
# check
nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) +
nrow(dat_openTopic) + nrow(dat_openPopup))
dat_all <- dat_all[order(dat_all$date.start), ]
rownames(dat_all) <- NULL
# TODO: Should card maybe also be filled in for "openPopup"?
2023-06-26 10:30:07 +02:00
#' ## Remove "button presses"
# Sort data frame by artwork and date
dat <- dat[order(dat$artwork, dat$date), ]
# remove "Transform start" and "Transform stop" following directly each
# other, since I do not know how to interpret them as events
id_start <- which(dat$event == "Transform start")
id_stop <- which(dat$event == "Transform stop")
id_rm_start <- id_start[diff(id_start) == 1]
id_rm_stop <- id_stop[diff(id_stop) == 1]
dat <- dat[-c(id_rm_start, id_rm_stop), ]
rownames(dat) <- NULL
id_start2 <- which(dat$event == "Transform start")
id_stop2 <- which(dat$event == "Transform stop")
length(id_start2) - length(id_stop2)
# 340 --> "starts too many"
# remove "Transform start" and "Transform stop" following directly each
# other (but with events in between!)
id_start_new <- id_start2
id_stop_new <- id_stop2
for (i in 2:length(id_start_new)) {
if (id_start_new[i-1] < id_stop_new[i-1] & id_start_new[i] < id_stop_new[i-1]) {
id_start_new <- id_start_new[-(i-1)]
} else if (id_start_new[i-1] > id_stop_new[i-1] & id_start_new[i] > id_stop_new[i-1]) {
id_stop_new <- id_stop_new[-(i-1)]
}
}
length(id_start2) - length(id_start_new)
length(id_stop2) - length(id_stop_new)
ids <- data.frame(start = id_start_new, stop = id_stop_new)
ids$diff <- ids$stop - ids$start
table(ids$diff)
# remove "Transform start" and "Transform stop" around other events
id_rm_start2 <- id_start2[!(id_start2 %in% id_start_new)]
id_rm_stop2 <- id_stop2[!(id_stop2 %in% id_stop_new)]
# TODO: It still does not work correctly:
dat[64764:64769,]
# time_ms event artwork popup x y scale rotation
# 64764 473081 Transform start 052 052.xml 1958.65 1505.75 0.8234455 -0.1351998
# 64765 474226 Show Info 052 052.xml NA NA NA NA
# 64766 475735 Transform start 052 052.xml 1988.25 1625.25 0.9927645 2.4527958
# 64767 475739 Transform stop 052 052.xml 1988.25 1625.25 0.9927645 2.4527958
# 64768 479326 Artwork 052 052.xml NA NA NA NA
# 64769 479751 Transform stop 052 052.xml 1660.90 1883.20 0.8074586 29.0875534
# --> but no idea how to find these cases in an automated way...
dat <- dat[-c(id_rm_start2, id_rm_stop2), ]
# --> Every start ends with a stop now (but not necessarily the correct one!)
dat1 <- dat[order(dat$date, dat$time_ms), ]
dat1$time_diff <- c(NA, diff(dat1$time_ms))
boxplot(time_diff ~ as.Date(date), dat1[dat1$time_diff > 1000 & dat1$time_diff < 4000, ])
boxplot(time_ms ~ event, dat1)
#' ## Plots
counts <- table(as.Date(dat$date), dat$event)
lattice::barchart(counts, auto.key = TRUE)
start_events <- c("Transform start", "Show Info", "ShowPopup", "Artwork/OpenCard")
counts <- table(as.Date(dat$date[dat$event %in% start_events]),
dat$event[dat$event %in% start_events])
lattice::barchart(counts, auto.key = TRUE)
# TODO: Do I want to "collapse" the data frame in a way, that I only have
# one event for each "set", meaning
#
# * Transform start + Transform stop --> Transform
# * Artwork/OpenCard + Artwork/CloseCard --> Show Subcard
# * ShowPopup + HidePopup --> Show Popup
# * Show Info + Show Front --> Flip Card
# (s.o. ;))
#
# Then I would have meaningful variables like duration, distance, degree of
# rotation, size of scaling, selection of Subcard etc.
# This means that I would have to delete all "unclosed" events.
# Create a data frame with
# case event attributes (can differ for different events)
# ??
# Is `artwork` my case? Or `artwork` per day? Or `artwork` per some other
# unit??? Maybe look at differences between timestamps separately for
# `artwork`? And identify "new observational unit" this way?
#
# Definition: (???)
# 1. Touching a new `artwork` corresponds to "observational unit change"
# 2. Time interval of XX min within one `artwork` on the same day
# corresponds to "observational unit change"
# id activity timestamp
# Split data frame in list of data frame which all correspond to one
# artwork
# dat_art <- split(dat, dat$artwork)
## --> Maybe need it at some point?
#' # Problems
#' * Opening and closing of events cannot be identified unambiguously; it
#' can happen that the wrong tags have been put together (e.g., Transform
#' start and Transform stop); therefore, durations etc. are only heuristic