diff --git a/README.Rmd b/README.Rmd index b252ea0..e7ec837 100644 --- a/README.Rmd +++ b/README.Rmd @@ -483,9 +483,16 @@ two raw log files cannot be closed and will then be removed from the data set. The functions warns about this, but it is a random process getting rid of these data and seems therefore not like a systematic problem. Another reason why this is not bad, is that durations cannot be calculated for -events across log files, because the time stamps do not increase over +events across log files anyways, because the time stamps do not increase systematically over log files (see above). +I meant to put the lists back together with `do.call(rbind, some_list)` but +this can also not handle big data sets. I therefore switched to +`dplyr::bind_rows(some_ist)` which is really fast and was developed +especially for this purpose. It means, that I have to depend on the dplyr +package (which I am not a big fan of, since I meant to keep the package +self-contained). + # Reading list * @Arizmendi2022 [--] diff --git a/code/02_preprocessing.R b/code/02_preprocessing.R index 534a16d..57b5e84 100644 --- a/code/02_preprocessing.R +++ b/code/02_preprocessing.R @@ -2,10 +2,18 @@ source("functions.R") +small <- TRUE + # Read data ############################################################## cat("########## Reading in data... ##########", "\n") -dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";", - header = TRUE) + +if (small) { + dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";", + header = TRUE) +} else { + dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";", + header = TRUE) +} dat0$date <- as.POSIXct(dat0$date) dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0) @@ -13,13 +21,15 @@ dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0) dat <- subset(dat0, !(dat0$event %in% c("Start Application", "Show Application"))) -save(dat, file = "tmp/dat.RData") +save(dat, file = paste0("tmp/dat_", ifelse(small, "small_", "full_"), + as.numeric(Sys.time()), ".RData")) # Add trace variable ##################################################### cat("########## Adding trace variable... ##########", "\n") dat1 <- add_trace(dat) -save(dat1, file = "tmp/dat1.RData") +save(dat1, file = paste("tmp/dat1", ifelse(small, "small_", "full_"), + as.numeric(Sys.time()), ".RData")) # Close events cat("########## Closing events... ##########", "\n") @@ -44,7 +54,8 @@ dat2 <- dat2[!is.na(dat2$date.start), ] rownames(dat2) <- NULL # TODO: Throw warning about this -save(dat2, file = "tmp/dat2.RData") +save(dat2, file = paste("tmp/dat2", ifelse(small, "small_", "full_"), + as.numeric(Sys.time()), ".RData")) # Add case variable ###################################################### cat("########## Adding case and eventId variables... ##########", "\n") @@ -61,13 +72,15 @@ dat3 <- dat3[, c("fileId.start", "fileId.stop", "eventId", "case", "scaleSize", "rotation.start", "rotation.stop", "rotationDegree")] -save(dat3, file = "tmp/dat3.RData") +save(dat3, file = paste("tmp/dat3", ifelse(small, "small_", "full_"), + as.numeric(Sys.time()), ".RData")) # Add trace for move events ############################################## cat("\n########## Adding trace variable for move events... ##########", "\n") dat4 <- add_trace_moves(dat3) -save(dat4, file = "tmp/dat4.RData") +save(dat4, file = paste("tmp/dat4", ifelse(small, "small_", "full_"), + as.numeric(Sys.time()), ".RData")) # Add topics: file names and topics ###################################### cat("########## Adding information about topics... ##########", "\n") @@ -79,7 +92,8 @@ topics <- extract_topics(artworks, pattern = paste0(artworks, ".xml"), dat5 <- add_topic(dat4, topics = topics) -save(dat5, file = "tmp/dat5.RData") +save(dat5, file = paste("tmp/dat5", ifelse(small, "small_", "full_"), + as.numeric(Sys.time()), ".RData")) # TODO: Replace artwork with informative strings diff --git a/code/functions.R b/code/functions.R index 0ae707a..00f579b 100644 --- a/code/functions.R +++ b/code/functions.R @@ -118,42 +118,43 @@ add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") { ########################################################################### close_events <- function(data, event = c("move", "flipCard", "openTopic", "openPopup")) { -# TODO: How do I set default vector and partial matching of arguments? - # --> `macht.arg()` and `pmatch()` - if (event == "move") { - actions <- c("Transform start", "Transform stop") - idvar <- c("eventId", "artwork", "glossar") - drop <- c("popup", "topicNumber", "trace", "event") - ncol <- 17 + event <- match.arg(event) - } else if (event == "flipCard") { - actions <- c("Show Info", "Show Front") - idvar <- c("trace", "artwork", "glossar") - drop <- c("popup", "topicNumber", "eventId", "event") - ncol <- 17 + switch(event, + "move" = { + actions <- c("Transform start", "Transform stop") + idvar <- c("fileId", "eventId", "artwork", "glossar") + drop <- c("popup", "topicNumber", "trace", "event") + ncol <- 16 - } else if (event == "openTopic") { - actions <- c("Artwork/OpenCard", "Artwork/CloseCard") - idvar <- c("eventId", "trace", "glossar", "artwork", "topicNumber") - drop <- c("popup", "event") - ncol <- 19 + }, + "flipCard" = { + actions <- c("Show Info", "Show Front") + idvar <- c("fileId", "trace", "artwork", "glossar") + drop <- c("popup", "topicNumber", "eventId", "event") + ncol <- 16 - } else if (event == "openPopup") { - actions <- c("ShowPopup", "HidePopup") - idvar <- c("eventId", "trace", "glossar", "artwork", "popup") - drop <- c("topicNumber", "event") - ncol <- 19 -# TODO: Should topicNumber maybe also be filled in for "openPopup"? + }, + "openTopic" = { + actions <- c("Artwork/OpenCard", "Artwork/CloseCard") + idvar <- c("fileId", "eventId", "trace", "glossar", "artwork", + "topicNumber") + drop <- c("popup", "event") + ncol <- 18 - } else { - stop("`event` must be one of 'move', 'flipCard', 'openTopic', - 'openPopup'.") - } -# TODO: `fileId` should now maybe go back into `idvar` + }, + "openPopup" = { + actions <- c("ShowPopup", "HidePopup") + idvar <- c("fileId", "eventId", "trace", "glossar", "artwork", "popup") + drop <- c("topicNumber", "event") + ncol <- 18 +# TODO: Should topicNumber maybe also be filled in for "openPopup"? + + } + ) subdata <- subset(data, data$event %in% actions) - #subdata <- subdata[order(subdata$artwork, subdata$popup, subdata$date), ] subdata <- subdata[order(subdata$artwork, subdata$popup, subdata$date, subdata$timeMs), ] subdata$time <- ifelse(subdata$event == actions[1], "start", "stop") num_start <- diff(c(0, which(subdata$event == actions[2]))) @@ -172,36 +173,26 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP subdata_split <- split(subdata, ~ fileId) pbapply::pboptions(style = 3, char = "=") - suppressWarnings( + subdata_split_wide <- pbapply::pblapply(subdata_split, reshape, direction = "wide", idvar = idvar, timevar = "time", drop = drop) - ) # suppressWarnings( # data_wide <- reshape(subdata, direction = "wide", # idvar = idvar, # timevar = "time", # drop = drop) # ) -# TODO: Suppress warnings? Better with tryCatch()? -# there is a pathological entry which gets deleted... -# df[df$trace == 4595, ] -# --> artwork 046 popup selene.xml gets opened twice # remove entries with only start or stop events since they do not have # all columns - subdata_split_wide <- - subdata_split_wide[-which(sapply(subdata_split_wide, ncol) != ncol)] + ids <- which(sapply(subdata_split_wide, ncol) != ncol) + if (length(ids) > 0) subdata_split_wide <- subdata_split_wide[-ids] - #data_wide <- do.call(rbind, subdata_split_wide) -# TODO: This runs quite some time -# --> There is a more efficient function in dplyr, which would also allow -# to keep the file IDs with only start or stop or a single entry... data_wide <- dplyr::bind_rows(subdata_split_wide) - for (d in drop) data_wide[d] <- NA data_wide$distance <- NA data_wide$scaleSize <- NA @@ -227,13 +218,13 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP data_wide <- move_wide } - out <- data_wide[, c("fileId.start", "fileId.stop", "event", "artwork", - "trace", "glossar", "date.start", "date.stop", - "timeMs.start", "timeMs.stop", "duration", - "topicNumber", "popup", "x.start", "y.start", - "x.stop", "y.stop", "distance", "scale.start", - "scale.stop", "scaleSize", "rotation.start", - "rotation.stop", "rotationDegree")] + out <- data_wide[, c("fileId", "event", "artwork", "trace", "glossar", + "date.start", "date.stop", "timeMs.start", + "timeMs.stop", "duration", "topicNumber", "popup", + "x.start", "y.start", "x.stop", "y.stop", + "distance", "scale.start", "scale.stop", + "scaleSize", "rotation.start", "rotation.stop", + "rotationDegree")] rownames(out) <- NULL out } @@ -262,7 +253,6 @@ add_case <- function(data, cutoff = 20) { data$timediff <- NULL data } -# TODO: Is this faster with lapply()? ###########################################################################