Worked on TODOs

This commit is contained in:
Nora Wickelmaier 2023-09-19 15:25:30 +02:00
parent fb9db8b908
commit 9645bc62f1
3 changed files with 70 additions and 59 deletions

View File

@ -483,9 +483,16 @@ two raw log files cannot be closed and will then be removed from the data
set. The functions warns about this, but it is a random process getting rid
of these data and seems therefore not like a systematic problem. Another
reason why this is not bad, is that durations cannot be calculated for
events across log files, because the time stamps do not increase over
events across log files anyways, because the time stamps do not increase
systematically over log files (see above).
I meant to put the lists back together with `do.call(rbind, some_list)` but
this can also not handle big data sets. I therefore switched to
`dplyr::bind_rows(some_ist)` which is really fast and was developed
especially for this purpose. It means, that I have to depend on the dplyr
package (which I am not a big fan of, since I meant to keep the package
self-contained).
# Reading list
* @Arizmendi2022 [--]

View File

@ -2,10 +2,18 @@
source("functions.R")
small <- TRUE
# Read data ##############################################################
cat("########## Reading in data... ##########", "\n")
dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";",
header = TRUE)
if (small) {
dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";",
header = TRUE)
} else {
dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";",
header = TRUE)
}
dat0$date <- as.POSIXct(dat0$date)
dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
@ -13,13 +21,15 @@ dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
dat <- subset(dat0, !(dat0$event %in% c("Start Application",
"Show Application")))
save(dat, file = "tmp/dat.RData")
save(dat, file = paste0("tmp/dat_", ifelse(small, "small_", "full_"),
as.numeric(Sys.time()), ".RData"))
# Add trace variable #####################################################
cat("########## Adding trace variable... ##########", "\n")
dat1 <- add_trace(dat)
save(dat1, file = "tmp/dat1.RData")
save(dat1, file = paste("tmp/dat1", ifelse(small, "small_", "full_"),
as.numeric(Sys.time()), ".RData"))
# Close events
cat("########## Closing events... ##########", "\n")
@ -44,7 +54,8 @@ dat2 <- dat2[!is.na(dat2$date.start), ]
rownames(dat2) <- NULL
# TODO: Throw warning about this
save(dat2, file = "tmp/dat2.RData")
save(dat2, file = paste("tmp/dat2", ifelse(small, "small_", "full_"),
as.numeric(Sys.time()), ".RData"))
# Add case variable ######################################################
cat("########## Adding case and eventId variables... ##########", "\n")
@ -61,13 +72,15 @@ dat3 <- dat3[, c("fileId.start", "fileId.stop", "eventId", "case",
"scaleSize", "rotation.start", "rotation.stop",
"rotationDegree")]
save(dat3, file = "tmp/dat3.RData")
save(dat3, file = paste("tmp/dat3", ifelse(small, "small_", "full_"),
as.numeric(Sys.time()), ".RData"))
# Add trace for move events ##############################################
cat("\n########## Adding trace variable for move events... ##########", "\n")
dat4 <- add_trace_moves(dat3)
save(dat4, file = "tmp/dat4.RData")
save(dat4, file = paste("tmp/dat4", ifelse(small, "small_", "full_"),
as.numeric(Sys.time()), ".RData"))
# Add topics: file names and topics ######################################
cat("########## Adding information about topics... ##########", "\n")
@ -79,7 +92,8 @@ topics <- extract_topics(artworks, pattern = paste0(artworks, ".xml"),
dat5 <- add_topic(dat4, topics = topics)
save(dat5, file = "tmp/dat5.RData")
save(dat5, file = paste("tmp/dat5", ifelse(small, "small_", "full_"),
as.numeric(Sys.time()), ".RData"))
# TODO: Replace artwork with informative strings

View File

@ -118,42 +118,43 @@ add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") {
###########################################################################
close_events <- function(data, event = c("move", "flipCard", "openTopic", "openPopup")) {
# TODO: How do I set default vector and partial matching of arguments?
# --> `macht.arg()` and `pmatch()`
if (event == "move") {
actions <- c("Transform start", "Transform stop")
idvar <- c("eventId", "artwork", "glossar")
drop <- c("popup", "topicNumber", "trace", "event")
ncol <- 17
event <- match.arg(event)
} else if (event == "flipCard") {
actions <- c("Show Info", "Show Front")
idvar <- c("trace", "artwork", "glossar")
drop <- c("popup", "topicNumber", "eventId", "event")
ncol <- 17
switch(event,
"move" = {
actions <- c("Transform start", "Transform stop")
idvar <- c("fileId", "eventId", "artwork", "glossar")
drop <- c("popup", "topicNumber", "trace", "event")
ncol <- 16
} else if (event == "openTopic") {
actions <- c("Artwork/OpenCard", "Artwork/CloseCard")
idvar <- c("eventId", "trace", "glossar", "artwork", "topicNumber")
drop <- c("popup", "event")
ncol <- 19
},
"flipCard" = {
actions <- c("Show Info", "Show Front")
idvar <- c("fileId", "trace", "artwork", "glossar")
drop <- c("popup", "topicNumber", "eventId", "event")
ncol <- 16
} else if (event == "openPopup") {
actions <- c("ShowPopup", "HidePopup")
idvar <- c("eventId", "trace", "glossar", "artwork", "popup")
drop <- c("topicNumber", "event")
ncol <- 19
# TODO: Should topicNumber maybe also be filled in for "openPopup"?
},
"openTopic" = {
actions <- c("Artwork/OpenCard", "Artwork/CloseCard")
idvar <- c("fileId", "eventId", "trace", "glossar", "artwork",
"topicNumber")
drop <- c("popup", "event")
ncol <- 18
} else {
stop("`event` must be one of 'move', 'flipCard', 'openTopic',
'openPopup'.")
}
# TODO: `fileId` should now maybe go back into `idvar`
},
"openPopup" = {
actions <- c("ShowPopup", "HidePopup")
idvar <- c("fileId", "eventId", "trace", "glossar", "artwork", "popup")
drop <- c("topicNumber", "event")
ncol <- 18
# TODO: Should topicNumber maybe also be filled in for "openPopup"?
}
)
subdata <- subset(data, data$event %in% actions)
#subdata <- subdata[order(subdata$artwork, subdata$popup, subdata$date), ]
subdata <- subdata[order(subdata$artwork, subdata$popup, subdata$date, subdata$timeMs), ]
subdata$time <- ifelse(subdata$event == actions[1], "start", "stop")
num_start <- diff(c(0, which(subdata$event == actions[2])))
@ -172,36 +173,26 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
subdata_split <- split(subdata, ~ fileId)
pbapply::pboptions(style = 3, char = "=")
suppressWarnings(
subdata_split_wide <- pbapply::pblapply(subdata_split, reshape,
direction = "wide",
idvar = idvar,
timevar = "time",
drop = drop)
)
# suppressWarnings(
# data_wide <- reshape(subdata, direction = "wide",
# idvar = idvar,
# timevar = "time",
# drop = drop)
# )
# TODO: Suppress warnings? Better with tryCatch()?
# there is a pathological entry which gets deleted...
# df[df$trace == 4595, ]
# --> artwork 046 popup selene.xml gets opened twice
# remove entries with only start or stop events since they do not have
# all columns
subdata_split_wide <-
subdata_split_wide[-which(sapply(subdata_split_wide, ncol) != ncol)]
ids <- which(sapply(subdata_split_wide, ncol) != ncol)
if (length(ids) > 0) subdata_split_wide <- subdata_split_wide[-ids]
#data_wide <- do.call(rbind, subdata_split_wide)
# TODO: This runs quite some time
# --> There is a more efficient function in dplyr, which would also allow
# to keep the file IDs with only start or stop or a single entry...
data_wide <- dplyr::bind_rows(subdata_split_wide)
for (d in drop) data_wide[d] <- NA
data_wide$distance <- NA
data_wide$scaleSize <- NA
@ -227,13 +218,13 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
data_wide <- move_wide
}
out <- data_wide[, c("fileId.start", "fileId.stop", "event", "artwork",
"trace", "glossar", "date.start", "date.stop",
"timeMs.start", "timeMs.stop", "duration",
"topicNumber", "popup", "x.start", "y.start",
"x.stop", "y.stop", "distance", "scale.start",
"scale.stop", "scaleSize", "rotation.start",
"rotation.stop", "rotationDegree")]
out <- data_wide[, c("fileId", "event", "artwork", "trace", "glossar",
"date.start", "date.stop", "timeMs.start",
"timeMs.stop", "duration", "topicNumber", "popup",
"x.start", "y.start", "x.stop", "y.stop",
"distance", "scale.start", "scale.stop",
"scaleSize", "rotation.start", "rotation.stop",
"rotationDegree")]
rownames(out) <- NULL
out
}
@ -262,7 +253,6 @@ add_case <- function(data, cutoff = 20) {
data$timediff <- NULL
data
}
# TODO: Is this faster with lapply()?
###########################################################################