Reworked structure of preprocessing; moved all steps to separate functions
This commit is contained in:
parent
c133792285
commit
fa730081db
@ -1,410 +1,43 @@
|
||||
#' ---
|
||||
#' title: "Preprocessing log files"
|
||||
#' author: "Nora Wickelmaier"
|
||||
#' date: "`r Sys.Date()`"
|
||||
#' output:
|
||||
#' html_document:
|
||||
#' toc: true
|
||||
#' toc_float: true
|
||||
#' pdf_document:
|
||||
#' toc: true
|
||||
#' number_sections: true
|
||||
#' geometry: margin = 2.5cm
|
||||
#' ---
|
||||
|
||||
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
|
||||
|
||||
#' # Read data
|
||||
source("functions.R")
|
||||
|
||||
# Read data
|
||||
|
||||
dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";",
|
||||
header = TRUE)
|
||||
dat0$date <- as.POSIXct(dat0$date) # create date object
|
||||
dat0$date <- as.POSIXct(dat0$date)
|
||||
dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
|
||||
|
||||
#' # Remove irrelevant events
|
||||
|
||||
#' ## Remove Start Application and Show Application
|
||||
|
||||
# Remove irrelevant events
|
||||
dat <- subset(dat0, !(dat0$event %in% c("Start Application",
|
||||
"Show Application")))
|
||||
rownames(dat) <- NULL
|
||||
|
||||
#' # Close events
|
||||
|
||||
########
|
||||
|
||||
#' Do it for Transform events first
|
||||
dat1 <- dat[dat$event %in% c("Transform start", "Transform stop"), ]
|
||||
dat1 <- dat1[order(dat1$artwork, dat1$date), ]
|
||||
rownames(dat1) <- NULL
|
||||
|
||||
# Create event ID for closing events
|
||||
num_start <- diff(c(0, which(dat1$event == "Transform stop")))
|
||||
dat1$eventId <- rep(seq_along(num_start), num_start)
|
||||
head(dat1[, c("event", "eventId")], 25)
|
||||
|
||||
table(table(dat1$eventId))
|
||||
# 1 2 3 4 5 6 7 8 10 11
|
||||
# 70 78435 5153 842 222 66 18 14 3 1
|
||||
# --> compare to table(num_start)!
|
||||
|
||||
# Find out how often "Transform stop" follows each other
|
||||
num_stop <- c(diff(c(0, which(dat1$event == "Transform start"))))
|
||||
table(num_stop)
|
||||
|
||||
# remove duplicated "Transform start" events
|
||||
dat1 <- dat1[!duplicated(dat1[, c("event", "eventId")]), ]
|
||||
|
||||
# remove duplicated "Transform stop" events
|
||||
id_stop <- which(dat1$event == "Transform stop")
|
||||
id_rm_stop <- id_stop[diff(id_stop) == 1]
|
||||
|
||||
dat1 <- dat1[-(id_rm_stop + 1), ]
|
||||
|
||||
# transform to wide data format
|
||||
dat1$time <- ifelse(dat1$event == "Transform start", "start", "stop")
|
||||
|
||||
trans_wide <- reshape(dat1, direction = "wide",
|
||||
idvar = c("eventId", "artwork", "glossar"),
|
||||
timevar = "time",
|
||||
drop = c("popup", "topicNumber", "event")
|
||||
)
|
||||
|
||||
# TODO: This runs for quite some time
|
||||
# --> Is this more efficient with tidyr::pivot_wider?
|
||||
|
||||
# --> when fileId is part of the reshape, it does not work correctly, since
|
||||
# we sometimes have a start - stop event that is recorded in two separate
|
||||
# log files, BUT: after finding out, that `timeMs` changes for each log
|
||||
# file, I want to exclude those cases, so `fileId` has to be included!!!
|
||||
|
||||
# check how often an eventId is associated with two fileIds
|
||||
nrow(subset(trans_wide, trans_wide$fileId.start != trans_wide$fileId.stop))
|
||||
|
||||
# which(is.na(trans_wide$date.start))
|
||||
|
||||
trans_wide$event <- "move"
|
||||
trans_wide$eventId <- NULL
|
||||
|
||||
rownames(trans_wide) <- NULL
|
||||
|
||||
trans_wide$duration <- trans_wide$timeMs.stop - trans_wide$timeMs.start
|
||||
#trans_wide$duration <- trans_wide$date.stop - trans_wide$date.start
|
||||
# only seconds - not fine grained enough
|
||||
trans_wide$distance <- apply(
|
||||
trans_wide[, c("x.start", "y.start", "x.stop", "y.stop")], 1,
|
||||
function(x) dist(matrix(x, 2, 2, byrow = TRUE)))
|
||||
trans_wide$rotationDegree <- trans_wide$rotation.stop -
|
||||
trans_wide$rotation.start
|
||||
trans_wide$scaleSize <- trans_wide$scale.stop / trans_wide$scale.start
|
||||
|
||||
trans_wide$trace <- NA
|
||||
trans_wide$topicNumber <- NA
|
||||
trans_wide$popup <- NA
|
||||
|
||||
dat_trans <- trans_wide[trans_wide$distance != 0 &
|
||||
trans_wide$rotationDegree != 0 &
|
||||
trans_wide$scaleSize != 1,
|
||||
c("fileId.start", "fileId.stop", "event", "artwork",
|
||||
"trace", "glossar", "date.start", "date.stop",
|
||||
"timeMs.start", "timeMs.stop", "duration",
|
||||
"topicNumber", "popup", "x.start", "y.start",
|
||||
"x.stop", "y.stop", "distance", "scale.start",
|
||||
"scale.stop", "scaleSize", "rotation.start",
|
||||
"rotation.stop", "rotationDegree")]
|
||||
1 - nrow(dat_trans) / nrow(trans_wide)
|
||||
# removes almost 2/3 of the data (for small data set)
|
||||
|
||||
rm(id_rm_stop, id_stop, trans_wide, num_start, num_stop)
|
||||
|
||||
summary(dat_trans)
|
||||
|
||||
|
||||
#' # Close other events
|
||||
|
||||
dat2 <- dat[!dat$event %in% c("Transform start", "Transform stop"), ]
|
||||
rownames(dat2) <- NULL
|
||||
|
||||
dat2$trace <- NA
|
||||
last_event <- dat2$event[1]
|
||||
aws <- unique(dat2$artwork)[unique(dat2$artwork) != "glossar"]
|
||||
#
|
||||
for (art in aws) { # select artwork
|
||||
|
||||
for (i in 1:nrow(dat2)) { # go through rows
|
||||
|
||||
if (last_event == "Show Info" & dat2$artwork[i] == art) {
|
||||
dat2$trace[i] <- i
|
||||
j <- i
|
||||
|
||||
} else if (last_event == "Show Front" & dat2$artwork[i] == art) {
|
||||
dat2$trace[i] <- j
|
||||
|
||||
} else if (!(last_event %in% c("Show Info", "Show Front")) &
|
||||
dat2$artwork[i] == art) {
|
||||
dat2$trace[i] <- j
|
||||
}
|
||||
|
||||
if (i <= nrow(dat2)) {
|
||||
last_event <- dat2$event[i + 1]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
rm(aws, i, j, last_event, art)
|
||||
|
||||
#' ## Fix glossar entries (find corresponding artworks and fill in trace)
|
||||
|
||||
glossar_files <- unique(dat2[dat2$artwork == "glossar", "popup"])
|
||||
|
||||
# load lookup table for artworks and glossar files
|
||||
load("../data/glossar_dict.RData")
|
||||
lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ]
|
||||
|
||||
head(dat2[, c("artwork", "event", "popup", "trace")], 20)
|
||||
|
||||
inside <- glossar_files[glossar_files %in%
|
||||
lut[sapply(lut$artwork, length) == 1,
|
||||
"glossar_file"]]
|
||||
single_art <- unlist(lut[lut$glossar_file %in% inside, "artwork"])
|
||||
|
||||
|
||||
for (file in lut$glossar_file) {
|
||||
|
||||
artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])
|
||||
|
||||
for (i in seq_len(nrow(dat2))) {
|
||||
|
||||
if (dat2$event[i] == "Show Info" |
|
||||
(dat2$event[i] == "Artwork/OpenCard" &
|
||||
dat2$artwork[i] %in% single_art)) {
|
||||
|
||||
current_artwork <- dat2[i, "artwork"]
|
||||
j <- i
|
||||
k <- i
|
||||
|
||||
} else {
|
||||
|
||||
current_artwork <- current_artwork
|
||||
|
||||
}
|
||||
|
||||
if (dat2$event[i] == "Show Front" & dat2$artwork[i] == current_artwork) {
|
||||
# make sure artwork has not been closed, yet!
|
||||
k <- i
|
||||
}
|
||||
|
||||
if (dat2$artwork[i] == "glossar" &
|
||||
(current_artwork %in% artwork_list) &
|
||||
dat2$popup[i] == file & (j - k == 0)) {
|
||||
|
||||
dat2[i, "trace"] <- dat2[j, "trace"]
|
||||
dat2[i, "artwork"] <- current_artwork
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# --> finds a bit more than half of the glossar entries for the small data
|
||||
# set...
|
||||
proportions(table(is.na(dat2[dat2$glossar == 1, "trace"])))
|
||||
|
||||
# REMEMBER: It can never be 100% correct, since it is always possible that
|
||||
# several cards are open and that they link to the same glossar entry
|
||||
|
||||
dat2[14110:14130, ]
|
||||
dat2[dat2$glossar == 1, ]
|
||||
|
||||
# Exclude not matched glossar entries
|
||||
df <- subset(dat2, !is.na(dat2$trace))
|
||||
df <- df[order(df$trace), ]
|
||||
rownames(df) <- NULL
|
||||
|
||||
rm(lut, current_artwork, file, glossar_dict, i, j, k, artwork_list,
|
||||
glossar_files, inside, single_art)
|
||||
|
||||
#' ## Close flipCard
|
||||
|
||||
dat3 <- subset(df, df$event %in% c("Show Info", "Show Front"))
|
||||
|
||||
dat3$time <- ifelse(dat3$event == "Show Info", "start", "stop")
|
||||
|
||||
flipCard_wide <- reshape(dat3, direction = "wide",
|
||||
idvar = c("trace", "artwork", "glossar"),
|
||||
timevar = "time",
|
||||
drop = c("popup", "topicNumber"))
|
||||
flipCard_wide$event <- "flipCard"
|
||||
flipCard_wide$duration <- flipCard_wide$timeMs.stop -
|
||||
flipCard_wide$timeMs.start
|
||||
|
||||
flipCard_wide$topicNumber <- NA
|
||||
flipCard_wide$popup <- NA
|
||||
flipCard_wide$distance <- NA
|
||||
flipCard_wide$scaleSize <- NA
|
||||
flipCard_wide$rotationDegree <- NA
|
||||
|
||||
dat_flipCard <- flipCard_wide[, c("fileId.start", "fileId.stop", "event",
|
||||
"artwork", "trace", "glossar",
|
||||
"date.start", "date.stop",
|
||||
"timeMs.start", "timeMs.stop",
|
||||
"duration", "topicNumber", "popup",
|
||||
"x.start", "y.start", "x.stop", "y.stop",
|
||||
"distance", "scale.start", "scale.stop",
|
||||
"scaleSize", "rotation.start",
|
||||
"rotation.stop", "rotationDegree")]
|
||||
|
||||
rm(flipCard_wide)
|
||||
|
||||
#' ## Close openTopic
|
||||
|
||||
dat4 <- subset(df, df$event %in% c("Artwork/OpenCard", "Artwork/CloseCard"))
|
||||
dat4 <- dat4[order(dat4$artwork, dat4$date), ]
|
||||
rownames(dat4) <- NULL
|
||||
|
||||
num_start <- diff(c(0, which(dat4$event == "Artwork/CloseCard")))
|
||||
dat4$eventId <- rep(seq_along(num_start), num_start)
|
||||
|
||||
dat4$time <- ifelse(dat4$event == "Artwork/OpenCard", "start", "stop")
|
||||
|
||||
openTopic_wide <- reshape(dat4, direction = "wide",
|
||||
idvar = c("eventId", "trace", "glossar", "artwork", "topicNumber"),
|
||||
timevar = "time", drop = "popup")
|
||||
openTopic_wide$event <- "openTopic"
|
||||
openTopic_wide$duration <- openTopic_wide$timeMs.stop -
|
||||
openTopic_wide$timeMs.start
|
||||
|
||||
openTopic_wide$popup <- NA
|
||||
openTopic_wide$distance <- NA
|
||||
openTopic_wide$scaleSize <- NA
|
||||
openTopic_wide$rotationDegree <- NA
|
||||
|
||||
dat_openTopic <- openTopic_wide[, c("fileId.start", "fileId.stop", "event",
|
||||
"artwork", "trace", "glossar",
|
||||
"date.start", "date.stop",
|
||||
"timeMs.start", "timeMs.stop",
|
||||
"duration", "topicNumber", "popup",
|
||||
"x.start", "y.start", "x.stop",
|
||||
"y.stop", "distance", "scale.start",
|
||||
"scale.stop", "scaleSize",
|
||||
"rotation.start", "rotation.stop",
|
||||
"rotationDegree")]
|
||||
# TODO: topicNumber should have a unique identifier for each artwork
|
||||
|
||||
rm(openTopic_wide, num_start)
|
||||
|
||||
#' ## close openPopup
|
||||
|
||||
dat5 <- subset(df, df$event %in% c("ShowPopup", "HidePopup"))
|
||||
dat5 <- dat5[order(dat5$artwork, dat5$popup, dat5$date), ]
|
||||
rownames(dat5) <- NULL
|
||||
|
||||
num_start <- diff(c(0, which(dat5$event == "HidePopup")))
|
||||
# last event is "ShowPopup"! Needs to be fixed
|
||||
# num_start <- c(num_start, 1)
|
||||
# TODO: Needs to be caught in a function
|
||||
# --> not anymore - still relevant???
|
||||
|
||||
dat5$eventId <- rep(seq_along(num_start), num_start)
|
||||
|
||||
dat5$time <- ifelse(dat5$event == "ShowPopup", "start", "stop")
|
||||
|
||||
openPopup_wide <- reshape(dat5, direction = "wide",
|
||||
idvar = c("eventId", "trace", "glossar",
|
||||
"artwork", "popup"),
|
||||
timevar = "time",
|
||||
drop = "topicNumber")
|
||||
# there is a pathological entry which gets deleted...
|
||||
# df[df$trace == 4595, ]
|
||||
# --> artwork 046 popup selene.xml gets opened twice
|
||||
|
||||
openPopup_wide$event <- "openPopup"
|
||||
openPopup_wide$duration <- openPopup_wide$timeMs.stop -
|
||||
openPopup_wide$timeMs.start
|
||||
|
||||
openPopup_wide$topicNumber <- NA
|
||||
openPopup_wide$distance <- NA
|
||||
openPopup_wide$scaleSize <- NA
|
||||
openPopup_wide$rotationDegree <- NA
|
||||
|
||||
dat_openPopup <- openPopup_wide[, c("fileId.start", "fileId.stop", "event",
|
||||
"artwork", "trace", "glossar",
|
||||
"date.start", "date.stop",
|
||||
"timeMs.start", "timeMs.stop",
|
||||
"duration", "topicNumber", "popup",
|
||||
"x.start", "y.start", "x.stop",
|
||||
"y.stop", "distance", "scale.start",
|
||||
"scale.stop", "scaleSize",
|
||||
"rotation.start", "rotation.stop",
|
||||
"rotationDegree")]
|
||||
rm(num_start, openPopup_wide)
|
||||
|
||||
|
||||
# TODO: Should topicNumber maybe also be filled in for "openPopup"?
|
||||
|
||||
#' ## Merge data sets for different events
|
||||
|
||||
dat_all <- rbind(dat_trans, dat_flipCard, dat_openTopic, dat_openPopup)
|
||||
|
||||
# check
|
||||
nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) +
|
||||
nrow(dat_openTopic) + nrow(dat_openPopup))
|
||||
|
||||
#' ## Remove all events that do not have a `date.start`
|
||||
|
||||
dim(dat_all[is.na(dat_all$date.start), ])
|
||||
dat_all <- dat_all[!is.na(dat_all$date.start), ]
|
||||
# There is only a `date.stop`, when event is not properly closed, see here:
|
||||
df[df$trace == 1843, ]
|
||||
dat_openPopup[dat_openPopup$trace == 1843, ]
|
||||
## --> still 50 (small data set) left, and some really do not seem to be
|
||||
## opened! Must be a log error
|
||||
# --> others should be closed!
|
||||
dat[31000:31019,] # this one e.g.
|
||||
# --> Actually NOT! card gets flipped before! Again - log error!
|
||||
|
||||
# Add trace variable
|
||||
dat1 <- add_trace(dat)
|
||||
|
||||
# Close events
|
||||
dat2 <- rbind(close_events(dat1, "move"),
|
||||
close_events(dat1, "flipCard"),
|
||||
close_events(dat1, "openTopic"),
|
||||
close_events(dat1, "openPopup"))
|
||||
dat2 <- dat2[order(dat2$date.start, dat2$fileId.start), ]
|
||||
# Remove durations when event spans more than one log file, since they are
|
||||
# not interpretable
|
||||
dat_all[which(dat_all$fileId.start != dat_all$fileId.stop), "duration"] <- NA
|
||||
dat2[which(dat2$fileId.start != dat2$fileId.stop), "duration"] <- NA
|
||||
|
||||
# sort by `start.date`
|
||||
dat_all <- dat_all[order(dat_all$date.start), ]
|
||||
rownames(dat_all) <- NULL
|
||||
# Remove all events that do not have a `date.start`
|
||||
dat2 <- dat2[!is.na(dat2$date.start), ]
|
||||
rownames(dat2) <- NULL
|
||||
|
||||
ind <- rowSums(is.na(dat_all)) == ncol(dat_all)
|
||||
any(ind)
|
||||
dat_all[ind, ]
|
||||
# --> No rows with only NA, as it should be.
|
||||
#summary(dat2)
|
||||
|
||||
summary(dat_all) # OK, this actually makes a lot of sense :)
|
||||
# Add case variable
|
||||
dat3 <- add_case(dat2)
|
||||
|
||||
#' ## Create case variable
|
||||
|
||||
dat_all$timediff <- as.numeric(diff(c(dat_all$date.start[1], dat_all$date.start)))
|
||||
|
||||
hist(dat_all$timediff[dat_all$timediff < 40], breaks = 50)
|
||||
|
||||
|
||||
# TODO: What is the best choice for the cutoff here? I took 20 secs for now
|
||||
dat_all$case <- NA
|
||||
j <- 1
|
||||
|
||||
for (i in seq_len(nrow(dat_all))) {
|
||||
if (dat_all$timediff[i] < 21) {
|
||||
dat_all$case[i] <- j
|
||||
} else {
|
||||
j <- j + 1
|
||||
dat_all$case[i] <- j
|
||||
}
|
||||
}
|
||||
|
||||
head(dat_all[, c("event", "artwork", "trace", "date.start", "timediff", "case")], 100)
|
||||
|
||||
#' ## Add event ID
|
||||
|
||||
dat_all$eventId <- seq_len(nrow(dat_all))
|
||||
|
||||
dat_all <- dat_all[, c("fileId.start", "fileId.stop", "eventId", "case",
|
||||
# Add event ID
|
||||
dat3$eventId <- seq_len(nrow(dat3))
|
||||
dat3 <- dat3[, c("fileId.start", "fileId.stop", "eventId", "case",
|
||||
"trace", "glossar", "event", "artwork",
|
||||
"date.start", "date.stop", "timeMs.start",
|
||||
"timeMs.stop", "duration", "topicNumber", "popup",
|
||||
@ -413,64 +46,15 @@ dat_all <- dat_all[, c("fileId.start", "fileId.stop", "eventId", "case",
|
||||
"scaleSize", "rotation.start", "rotation.stop",
|
||||
"rotationDegree")]
|
||||
|
||||
#' ## Add `trace` numbers for `move` events
|
||||
# Add trace for move events
|
||||
dat4 <- add_trace_moves(dat3)
|
||||
|
||||
cases <- unique(dat_all$case)
|
||||
aws <- unique(dat_all$artwork)[unique(dat_all$artwork) != "glossar"]
|
||||
max_trace <- max(dat_all$trace, na.rm = TRUE) + 1
|
||||
out <- NULL
|
||||
# Fill in topics
|
||||
|
||||
for (case in cases) {
|
||||
for (art in aws) {
|
||||
tmp <- dat_all[dat_all$case == case & dat_all$artwork == art, ]
|
||||
if (nrow(tmp) != 0) {
|
||||
|
||||
if (length(na.omit(unique(tmp$trace))) == 1) {
|
||||
tmp[tmp$event == "move", "trace"] <- na.omit(unique(tmp$trace))
|
||||
} else if (length(na.omit(unique(tmp$trace))) > 1) {
|
||||
for (i in 1:nrow(tmp)) {
|
||||
if (tmp$event[i] == "move") {
|
||||
if (i == 1) {
|
||||
tmp$trace[i] <- na.omit(unique(tmp$trace))[1]
|
||||
} else {
|
||||
tmp$trace[i] <- tmp$trace[i - 1]
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (all(is.na(tmp$trace))) {
|
||||
for (i in 1:nrow(tmp)) {
|
||||
if (tmp$event[i] == "move") {
|
||||
tmp$trace[i] <- max_trace
|
||||
}
|
||||
}
|
||||
}
|
||||
max_trace <- max_trace + 1
|
||||
}
|
||||
if (nrow(tmp) > 0) {
|
||||
out <- rbind(out, tmp)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# TODO: Get rid of the loops
|
||||
# --> This takes forever...
|
||||
|
||||
# put glossar events back in --> not relevant anymore
|
||||
|
||||
#dat_all <- rbind(out, dat_all[dat_all$artwork == "glossar", ])
|
||||
out <- out[order(out$date.start), ]
|
||||
rownames(out) <- NULL
|
||||
|
||||
# Make `trace` a consecutive number
|
||||
out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace)))
|
||||
|
||||
#' # Fill in topics
|
||||
|
||||
topics <- read.table("../data/topics.csv", sep = ";", header = TRUE)
|
||||
# topics <- read.table("../data/topics.csv", sep = ";", header = TRUE)
|
||||
# TODO: Add topics to data frame
|
||||
|
||||
#' # Export data
|
||||
|
||||
write.table(out, "../data/event_logfiles.csv", sep = ";",
|
||||
# Export data
|
||||
write.table(dat4, "../data/event_logfiles.csv", sep = ";",
|
||||
row.names = FALSE)
|
||||
|
||||
|
373
code/functions.R
373
code/functions.R
@ -1,152 +1,46 @@
|
||||
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
|
||||
|
||||
# Read data
|
||||
|
||||
dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";",
|
||||
header = TRUE)
|
||||
|
||||
dat0$date <- as.POSIXct(dat0$date)
|
||||
dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
|
||||
# remove irrelevant events
|
||||
dat <- subset(dat0, !(dat0$event %in% c("Start Application",
|
||||
"Show Application")))
|
||||
|
||||
# Close move events
|
||||
close_moves <- function(data) {
|
||||
|
||||
# close move events
|
||||
dat1 <- data[data$event %in% c("Transform start", "Transform stop"), ]
|
||||
dat1 <- dat1[order(dat1$artwork, dat1$date), ]
|
||||
num_start <- diff(c(0, which(dat1$event == "Transform stop")))
|
||||
dat1$eventId <- rep(seq_along(num_start), num_start)
|
||||
|
||||
# remove duplicated "Transform start" events
|
||||
dat1 <- dat1[!duplicated(dat1[, c("event", "eventId")]), ]
|
||||
|
||||
# remove duplicated "Transform stop" events
|
||||
id_stop <- which(dat1$event == "Transform stop")
|
||||
id_rm_stop <- id_stop[diff(id_stop) == 1]
|
||||
|
||||
dat1 <- dat1[-(id_rm_stop + 1), ]
|
||||
|
||||
# transform to wide data format
|
||||
dat1$time <- ifelse(dat1$event == "Transform start", "start", "stop")
|
||||
|
||||
trans_wide <- reshape(dat1, direction = "wide",
|
||||
idvar = c("eventId", "artwork", "glossar"),
|
||||
timevar = "time",
|
||||
drop = c("popup", "topicNumber", "event")
|
||||
)
|
||||
trans_wide$event <- "move"
|
||||
trans_wide$eventId <- NULL
|
||||
|
||||
trans_wide$duration <- trans_wide$timeMs.stop - trans_wide$timeMs.start
|
||||
trans_wide$distance <- apply(
|
||||
trans_wide[, c("x.start", "y.start", "x.stop", "y.stop")], 1,
|
||||
function(x) dist(matrix(x, 2, 2, byrow = TRUE)))
|
||||
trans_wide$rotationDegree <- trans_wide$rotation.stop -
|
||||
trans_wide$rotation.start
|
||||
trans_wide$scaleSize <- trans_wide$scale.stop / trans_wide$scale.start
|
||||
|
||||
trans_wide$trace <- NA
|
||||
trans_wide$topicNumber <- NA
|
||||
trans_wide$popup <- NA
|
||||
|
||||
dat_trans <- trans_wide[trans_wide$distance != 0 &
|
||||
trans_wide$rotationDegree != 0 &
|
||||
trans_wide$scaleSize != 1,
|
||||
c("fileId.start", "fileId.stop", "event", "artwork",
|
||||
"trace", "glossar", "date.start", "date.stop",
|
||||
"timeMs.start", "timeMs.stop", "duration",
|
||||
"topicNumber", "popup", "x.start", "y.start",
|
||||
"x.stop", "y.stop", "distance", "scale.start",
|
||||
"scale.stop", "scaleSize", "rotation.start",
|
||||
"rotation.stop", "rotationDegree")]
|
||||
rownames(dat_trans) <- NULL
|
||||
cat(paste("INFORMATION:", nrow(trans_wide) - nrow(dat_trans),
|
||||
"lines containing move events were removed since they did",
|
||||
"\nnot contain any change"), fill = TRUE)
|
||||
dat_trans
|
||||
}
|
||||
|
||||
|
||||
dat1 <- close_moves(dat)
|
||||
# TODO: Integrate this function into close_events?
|
||||
|
||||
###########################################################################
|
||||
|
||||
# Add trace variable
|
||||
add_trace <- function(data) {
|
||||
|
||||
dat2 <- data[!data$event %in% c("Transform start", "Transform stop"), ]
|
||||
|
||||
dat2$trace <- NA
|
||||
last_event <- dat2$event[1]
|
||||
aws <- unique(dat2$artwork)[unique(dat2$artwork) != "glossar"]
|
||||
#
|
||||
for (art in aws) { # select artwork
|
||||
|
||||
for (i in 1:nrow(dat2)) { # go through rows
|
||||
|
||||
if (last_event == "Show Info" & dat2$artwork[i] == art) {
|
||||
dat2$trace[i] <- i
|
||||
j <- i
|
||||
|
||||
} else if (last_event == "Show Front" & dat2$artwork[i] == art) {
|
||||
dat2$trace[i] <- j
|
||||
|
||||
} else if (!(last_event %in% c("Show Info", "Show Front")) &
|
||||
dat2$artwork[i] == art) {
|
||||
dat2$trace[i] <- j
|
||||
}
|
||||
|
||||
if (i <= nrow(dat2)) {
|
||||
last_event <- dat2$event[i + 1]
|
||||
}
|
||||
}
|
||||
}
|
||||
dat2
|
||||
}
|
||||
|
||||
|
||||
add_trace2 <- function(data, glossar_dict = "../data/glossar_dict.RData") {
|
||||
add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") {
|
||||
|
||||
data$trace <- NA
|
||||
dat1 <- data[data$event %in% c("Transform start", "Transform stop"), ]
|
||||
dat2 <- data[!data$event %in% c("Transform start", "Transform stop"), ]
|
||||
subdata1 <- data[data$event %in% c("Transform start", "Transform stop"), ]
|
||||
subdata2 <- data[!data$event %in% c("Transform start", "Transform stop"), ]
|
||||
|
||||
last_event <- dat2$event[1]
|
||||
aws <- unique(dat2$artwork)[unique(dat2$artwork) != "glossar"]
|
||||
#
|
||||
for (art in aws) { # select artwork
|
||||
last_event <- subdata2$event[1]
|
||||
aws <- unique(subdata2$artwork)[unique(subdata2$artwork) != "glossar"]
|
||||
|
||||
for (i in 1:nrow(dat2)) { # go through rows
|
||||
for (art in aws) {
|
||||
|
||||
if (last_event == "Show Info" & dat2$artwork[i] == art) {
|
||||
dat2$trace[i] <- i
|
||||
for (i in 1:nrow(subdata2)) {
|
||||
|
||||
if (last_event == "Show Info" & subdata2$artwork[i] == art) {
|
||||
subdata2$trace[i] <- i
|
||||
j <- i
|
||||
|
||||
} else if (last_event == "Show Front" & dat2$artwork[i] == art) {
|
||||
dat2$trace[i] <- j
|
||||
} else if (last_event == "Show Front" & subdata2$artwork[i] == art) {
|
||||
subdata2$trace[i] <- j
|
||||
|
||||
} else if (!(last_event %in% c("Show Info", "Show Front")) &
|
||||
dat2$artwork[i] == art) {
|
||||
dat2$trace[i] <- j
|
||||
subdata2$artwork[i] == art) {
|
||||
subdata2$trace[i] <- j
|
||||
}
|
||||
|
||||
if (i <= nrow(dat2)) {
|
||||
last_event <- dat2$event[i + 1]
|
||||
if (i <= nrow(subdata2)) {
|
||||
last_event <- subdata2$event[i + 1]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
glossar_files <- unique(dat2[dat2$artwork == "glossar", "popup"])
|
||||
# Fix glossar entries (find corresponding artworks and fill in trace)
|
||||
glossar_files <- unique(subdata2[subdata2$artwork == "glossar", "popup"])
|
||||
|
||||
# load lookup table for artworks and glossar files
|
||||
load(glossar_dict)
|
||||
lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ]
|
||||
|
||||
head(dat2[, c("artwork", "event", "popup", "trace")], 20)
|
||||
head(subdata2[, c("artwork", "event", "popup", "trace")], 20)
|
||||
|
||||
inside <- glossar_files[glossar_files %in%
|
||||
lut[sapply(lut$artwork, length) == 1,
|
||||
@ -158,13 +52,13 @@ add_trace2 <- function(data, glossar_dict = "../data/glossar_dict.RData") {
|
||||
|
||||
artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])
|
||||
|
||||
for (i in seq_len(nrow(dat2))) {
|
||||
for (i in seq_len(nrow(subdata2))) {
|
||||
|
||||
if (dat2$event[i] == "Show Info" |
|
||||
(dat2$event[i] == "Artwork/OpenCard" &
|
||||
dat2$artwork[i] %in% single_art)) {
|
||||
if (subdata2$event[i] == "Show Info" |
|
||||
(subdata2$event[i] == "Artwork/OpenCard" &
|
||||
subdata2$artwork[i] %in% single_art)) {
|
||||
|
||||
current_artwork <- dat2[i, "artwork"]
|
||||
current_artwork <- subdata2[i, "artwork"]
|
||||
j <- i
|
||||
k <- i
|
||||
|
||||
@ -174,72 +68,128 @@ add_trace2 <- function(data, glossar_dict = "../data/glossar_dict.RData") {
|
||||
|
||||
}
|
||||
|
||||
if (dat2$event[i] == "Show Front" & dat2$artwork[i] == current_artwork) {
|
||||
if (subdata2$event[i] == "Show Front" & subdata2$artwork[i] == current_artwork) {
|
||||
# make sure artwork has not been closed, yet!
|
||||
k <- i
|
||||
}
|
||||
|
||||
if (dat2$artwork[i] == "glossar" &
|
||||
if (subdata2$artwork[i] == "glossar" &
|
||||
(current_artwork %in% artwork_list) &
|
||||
dat2$popup[i] == file & (j - k == 0)) {
|
||||
subdata2$popup[i] == file & (j - k == 0)) {
|
||||
|
||||
dat2[i, "trace"] <- dat2[j, "trace"]
|
||||
dat2[i, "artwork"] <- current_artwork
|
||||
subdata2[i, "trace"] <- subdata2[j, "trace"]
|
||||
subdata2[i, "artwork"] <- current_artwork
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cat(proportions(table(is.na(dat2[dat2$glossar == 1, "trace"]))), fill = TRUE)
|
||||
# Exclude not matched glossar entries
|
||||
cat("INFORMATION: glossar entries that are not matched will be removed:",
|
||||
sum(is.na(subdata2[subdata2$glossar == 1, "trace"])), "entries",
|
||||
#proportions(table(is.na(subdata2[subdata2$glossar == 1, "trace"]))),
|
||||
fill = TRUE)
|
||||
subdata2 <- subset(subdata2, !is.na(subdata2$trace))
|
||||
# REMEMBER: It can never be 100% correct, since it is always possible
|
||||
# that several cards are open and that they link to the same glossar
|
||||
# entry
|
||||
|
||||
out <- rbind(dat1, dat2)
|
||||
# dat2[14110:14130, ]
|
||||
# dat2[dat2$glossar == 1, ]
|
||||
|
||||
out <- rbind(subdata1, subdata2)
|
||||
out <- out[order(out$fileId, out$date, out$timeMs), ]
|
||||
out
|
||||
}
|
||||
|
||||
|
||||
tmp <- add_trace2(dat)
|
||||
|
||||
###########################################################################
|
||||
|
||||
close_events <- function(data, event = c("flipCard", "openTopic", "openPopup")) {
|
||||
close_events <- function(data, event = c("move", "flipCard", "openTopic", "openPopup")) {
|
||||
|
||||
if (event == "flipCard") {
|
||||
subdata <- subset(data, data$event %in% c("Show Info", "Show Front"))
|
||||
subdata$time <- ifelse(subdata$event == "Show Info", "start", "stop")
|
||||
subdata$eventId <- NA
|
||||
if (event == "move") {
|
||||
actions <- c("Transform start", "Transform stop")
|
||||
idvar <- c("eventId", "artwork", "glossar")
|
||||
drop <- c("popup", "topicNumber", "trace", "event")
|
||||
|
||||
} else if (event == "flipCard") {
|
||||
actions <- c("Show Info", "Show Front")
|
||||
idvar <- c("trace", "artwork", "glossar")
|
||||
drop <- c("popup", "topicNumber")
|
||||
drop <- c("popup", "topicNumber", "eventId", "event")
|
||||
|
||||
} else if (event == "openTopic") {
|
||||
subdata <- subset(data, data$event %in% c("Artwork/OpenCard", "Artwork/CloseCard"))
|
||||
subdata$time <- ifelse(subdata$event == "Artwork/OpenCard", "start", "stop")
|
||||
num_start <- diff(c(0, which(subdata$event == "Artwork/CloseCard")))
|
||||
subdata$eventId <- rep(seq_along(num_start), num_start)
|
||||
actions <- c("Artwork/OpenCard", "Artwork/CloseCard")
|
||||
idvar <- c("eventId", "trace", "glossar", "artwork", "topicNumber")
|
||||
drop <- "popup"
|
||||
drop <- c("popup", "event")
|
||||
|
||||
} else if (event == "openPopup") {
|
||||
subdata <- subset(data, data$event %in% c("ShowPopup", "HidePopup"))
|
||||
subdata$time <- ifelse(subdata$event == "ShowPopup", "start", "stop")
|
||||
num_start <- diff(c(0, which(subdata$event == "HidePopup")))
|
||||
subdata$eventId <- rep(seq_along(num_start), num_start)
|
||||
actions <- c("ShowPopup", "HidePopup")
|
||||
idvar <- c("eventId", "trace", "glossar", "artwork", "popup")
|
||||
drop <- "topicNumber"
|
||||
drop <- c("topicNumber", "event")
|
||||
# TODO: Should topicNumber maybe also be filled in for "openPopup"?
|
||||
|
||||
} else {
|
||||
stop("`event` must be one of 'move', 'flipCard', 'openTopic',
|
||||
'openPopup'.")
|
||||
}
|
||||
|
||||
subdata <- subset(data, data$event %in% actions)
|
||||
#subdata <- subdata[order(subdata$artwork, subdata$popup, subdata$date), ]
|
||||
subdata <- subdata[order(subdata$artwork, subdata$popup, subdata$date, subdata$timeMs), ]
|
||||
subdata$time <- ifelse(subdata$event == actions[1], "start", "stop")
|
||||
num_start <- diff(c(0, which(subdata$event == actions[2])))
|
||||
subdata$eventId <- rep(seq_along(num_start), num_start)
|
||||
# If last event is start event, it needs to be fixed:
|
||||
# num_start <- c(num_start, 1)
|
||||
# TODO: Needs to be caught in a function
|
||||
# --> not anymore - still relevant???
|
||||
|
||||
if (event == "move") {
|
||||
subdata <- subdata[!duplicated(subdata[, c("event", "eventId")]), ]
|
||||
id_stop <- which(subdata$event == actions[2])
|
||||
id_rm_stop <- id_stop[diff(id_stop) == 1]
|
||||
subdata <- subdata[-(id_rm_stop + 1), ]
|
||||
}
|
||||
|
||||
suppressWarnings(
|
||||
data_wide <- reshape(subdata, direction = "wide",
|
||||
idvar = idvar,
|
||||
timevar = "time",
|
||||
drop = drop)
|
||||
data_wide$event <- event
|
||||
data_wide$duration <- data_wide$timeMs.stop - data_wide$timeMs.start
|
||||
)
|
||||
# TODO: Suppress warnings? Better with tryCatch()?
|
||||
# there is a pathological entry which gets deleted...
|
||||
# df[df$trace == 4595, ]
|
||||
# --> artwork 046 popup selene.xml gets opened twice
|
||||
|
||||
# TODO: This runs for quite some time
|
||||
# --> Is this more efficient with tidyr::pivot_wider?
|
||||
|
||||
|
||||
for (d in drop) data_wide[d] <- NA
|
||||
data_wide$distance <- NA
|
||||
data_wide$scaleSize <- NA
|
||||
data_wide$rotationDegree <- NA
|
||||
|
||||
data_wide$event <- event
|
||||
data_wide$duration <- data_wide$timeMs.stop - data_wide$timeMs.start
|
||||
|
||||
if (event == "move") {
|
||||
data_wide$distance <- apply(
|
||||
data_wide[, c("x.start", "y.start", "x.stop", "y.stop")], 1,
|
||||
function(x) dist(matrix(x, 2, 2, byrow = TRUE)))
|
||||
data_wide$rotationDegree <- data_wide$rotation.stop -
|
||||
data_wide$rotation.start
|
||||
data_wide$scaleSize <- data_wide$scale.stop / data_wide$scale.start
|
||||
# remove moves without any change
|
||||
move_wide <- data_wide[data_wide$distance != 0 &
|
||||
data_wide$rotationDegree != 0 &
|
||||
data_wide$scaleSize != 1, ]
|
||||
cat(paste("INFORMATION:", nrow(data_wide) - nrow(move_wide),
|
||||
"lines containing move events were removed since they did",
|
||||
"\nnot contain any change"), fill = TRUE)
|
||||
data_wide <- move_wide
|
||||
}
|
||||
|
||||
out <- data_wide[, c("fileId.start", "fileId.stop", "event", "artwork",
|
||||
"trace", "glossar", "date.start", "date.stop",
|
||||
"timeMs.start", "timeMs.stop", "duration",
|
||||
@ -247,37 +197,84 @@ close_events <- function(data, event = c("flipCard", "openTopic", "openPopup"))
|
||||
"x.stop", "y.stop", "distance", "scale.start",
|
||||
"scale.stop", "scaleSize", "rotation.start",
|
||||
"rotation.stop", "rotationDegree")]
|
||||
rownames(out) <- NULL
|
||||
out
|
||||
# TODO: Suppress warnings?
|
||||
}
|
||||
|
||||
tmp <- rbind(close_moves(dat),
|
||||
close_events(df, "flipCard"),
|
||||
close_events(df, "openTopic"),
|
||||
close_events(df, "openPopup"))
|
||||
###########################################################################
|
||||
|
||||
# Add case variable
|
||||
|
||||
add_case <- function(data, cutoff = 20) {
|
||||
# TODO: What is the best choice for the cutoff here?
|
||||
|
||||
data$timediff <- as.numeric(diff(c(data$date.start[1], data$date.start)))
|
||||
|
||||
data$case <- NA
|
||||
j <- 1
|
||||
|
||||
for (i in seq_len(nrow(data))) {
|
||||
if (data$timediff[i] <= cutoff) {
|
||||
data$case[i] <- j
|
||||
} else {
|
||||
j <- j + 1
|
||||
data$case[i] <- j
|
||||
}
|
||||
}
|
||||
data$timediff <- NULL
|
||||
data
|
||||
}
|
||||
|
||||
###########################################################################
|
||||
|
||||
# Add trace for moves
|
||||
|
||||
add_trace_moves <- function(data) {
|
||||
|
||||
cases <- unique(data$case)
|
||||
aws <- unique(data$artwork)[unique(data$artwork) != "glossar"]
|
||||
max_trace <- max(data$trace, na.rm = TRUE) + 1
|
||||
out <- NULL
|
||||
|
||||
for (case in cases) {
|
||||
for (art in aws) {
|
||||
tmp <- data[data$case == case & data$artwork == art, ]
|
||||
if (nrow(tmp) != 0) {
|
||||
|
||||
if (length(na.omit(unique(tmp$trace))) == 1) {
|
||||
tmp[tmp$event == "move", "trace"] <- na.omit(unique(tmp$trace))
|
||||
} else if (length(na.omit(unique(tmp$trace))) > 1) {
|
||||
for (i in 1:nrow(tmp)) {
|
||||
if (tmp$event[i] == "move") {
|
||||
if (i == 1) {
|
||||
tmp$trace[i] <- na.omit(unique(tmp$trace))[1]
|
||||
} else {
|
||||
tmp$trace[i] <- tmp$trace[i - 1]
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (all(is.na(tmp$trace))) {
|
||||
for (i in 1:nrow(tmp)) {
|
||||
if (tmp$event[i] == "move") {
|
||||
tmp$trace[i] <- max_trace
|
||||
}
|
||||
}
|
||||
}
|
||||
max_trace <- max_trace + 1
|
||||
}
|
||||
if (nrow(tmp) > 0) {
|
||||
out <- rbind(out, tmp)
|
||||
}
|
||||
}
|
||||
}
|
||||
out <- out[order(out$date.start, out$fileId.start), ]
|
||||
rownames(out) <- NULL
|
||||
|
||||
# Make trace a consecutive number
|
||||
out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace)))
|
||||
out
|
||||
}
|
||||
# TODO: Get rid of the loops
|
||||
# --> This takes forever...
|
||||
|
||||
|
||||
# 'data.frame': 38607 obs. of 24 variables:
|
||||
# $ fileId.start : chr "2016_11_15-12_32_57.log" "2016_11_15-14_42_57.log" "2016_11_15-14_42_57.log" "2016_11_16-12_31_32.log" ...
|
||||
# $ fileId.stop : chr "2016_11_15-12_32_57.log" "2016_11_15-14_42_57.log" "2016_11_15-14_42_57.log" "2016_11_16-12_31_32.log" ...
|
||||
# $ event : chr "move" "move" "move" "move" ...
|
||||
# $ artwork : chr "001" "001" "001" "001" ...
|
||||
# $ trace : int NA NA NA NA NA NA NA NA NA NA ...
|
||||
# $ glossar : num 0 0 0 0 0 0 0 0 0 0 ...
|
||||
# $ date.start : POSIXct, format: "2016-12-15 12:39:49" "2016-12-15 14:49:37" ...
|
||||
# $ date.stop : POSIXct, format: "2016-12-15 12:39:49" "2016-12-15 14:49:40" ...
|
||||
# $ timeMs.start : int 412141 400777 554506 384312 406277 533864 548467 549396 158632 194982 ...
|
||||
# $ timeMs.stop : int 412474 403784 556633 388313 407994 538185 549088 551116 160343 197099 ...
|
||||
# $ duration : int 333 3007 2127 4001 1717 4321 621 1720 1711 2117 ...
|
||||
# $ topicNumber : int NA NA NA NA NA NA NA NA NA NA ...
|
||||
# $ popup : chr NA NA NA NA ...
|
||||
# $ x.start : num 531 235 470 326 326 ...
|
||||
# $ y.start : num 1221 734 2090 747 747 ...
|
||||
# $ x.stop : num 513 360 1492 256 2459 ...
|
||||
# $ y.stop : num 1212 809 1687 643 1430 ...
|
||||
# $ distance : num 19.8 146.6 1098.5 125.2 2239.4 ...
|
||||
# $ scale.start : num 0.8 0.301 0.8 0.301 0.301 ...
|
||||
# $ scale.stop : num 0.8 0.331 0.822 0.391 0.397 ...
|
||||
# $ scaleSize : num 1 1.1 1.03 1.3 1.32 ...
|
||||
# $ rotation.start: num 116 116 90 116 116 ...
|
||||
# $ rotation.stop : num 116.3 89.6 2.8 86.1 125.8 ...
|
||||
# $ rotationDegree: num 0.00245 -26.72397 -87.19711 -30.14456 9.49951 ...
|
||||
|
@ -103,9 +103,3 @@ proportions(table(is.na(tmp$trace[tmp$artwork == "glossar"])))
|
||||
# How many glossar_files are only associated with one artwork?
|
||||
lut[sapply(lut$artwork, length) == 1, "glossar_file"]
|
||||
|
||||
# TODO: Fill in the ones that are associated with one artwork
|
||||
# --> Can't come up with something -- maybe ask Philipp???
|
||||
|
||||
# TODO: How to check if one of the former "Show Infos" is correct
|
||||
# --> Can't come up with something -- maybe ask Philipp???
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user