From 0e911bed3f866ec06823e880a2303d78f857d980 Mon Sep 17 00:00:00 2001 From: nwickel Date: Mon, 25 Sep 2023 11:29:35 +0200 Subject: [PATCH] Removed extract_topics() from create_eventlogs(); removed helper.R and put everything in separate files --- R/add_case.R | 25 +++++ R/add_metadata.R | 0 R/add_trace.R | 65 +++++++++++++ R/close_events.R | 8 +- R/create_eventlogs.R | 19 +--- R/extract_artworks.R | 54 +++++++++++ R/extract_topics.R | 92 +++++++++++++++++++ R/helper.R | 214 ------------------------------------------- 8 files changed, 244 insertions(+), 233 deletions(-) create mode 100644 R/add_case.R create mode 100644 R/add_metadata.R create mode 100644 R/extract_artworks.R create mode 100644 R/extract_topics.R delete mode 100644 R/helper.R diff --git a/R/add_case.R b/R/add_case.R new file mode 100644 index 0000000..87b210d --- /dev/null +++ b/R/add_case.R @@ -0,0 +1,25 @@ +########################################################################### + +# Add case variable + +add_case <- function(data, cutoff = 20) { +# TODO: What is the best choice for the cutoff here? + + data$timediff <- as.numeric(diff(c(data$date.start[1], data$date.start))) + data$case <- NA + j <- 1 + pb <- utils::txtProgressBar(min = 0, max = nrow(data), initial = NA, style = 3) + + for (i in seq_len(nrow(data))) { + if (data$timediff[i] <= cutoff) { + data$case[i] <- j + } else { + j <- j + 1 + data$case[i] <- j + } + utils::setTxtProgressBar(pb, i) + } + data$timediff <- NULL + data +} + diff --git a/R/add_metadata.R b/R/add_metadata.R new file mode 100644 index 0000000..e69de29 diff --git a/R/add_trace.R b/R/add_trace.R index 4425373..78937b7 100644 --- a/R/add_trace.R +++ b/R/add_trace.R @@ -107,3 +107,68 @@ add_trace <- function(data, glossar_dict) { out } +########################################################################### + +# Add trace for moves + +add_trace_moves <- function(data) { + + pbapply::pboptions(style = 3, char = "=") + + trace_max <- max(data$trace, na.rm = TRUE) + + #subdata_art <- split(data, ~ artwork) + subdata_case <- split(data, ~ case) + + #subdata_list <- split(data, ~ artwork + case) + # --> does not work with complete data set + cat("Splitting data...", "\n") + subdata_list <- pbapply::pblapply(subdata_case, split, f = ~artwork) + subdata_list <- unlist(subdata_list, recursive = FALSE) + + cat("Adding trace...", "\n") + subdata_trace <- pbapply::pblapply(subdata_list, + function(x) { + trace_max <<- trace_max + 1 + add_trace_subdata(x, max_trace = trace_max) + } + ) + + out <- dplyr::bind_rows(subdata_trace) + out <- out[order(out$fileId, out$date.start, out$timeMs.start), ] + rownames(out) <- NULL + + # Make trace a consecutive number + out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace))) + out +} + + +add_trace_subdata <- function(subdata, max_trace) { + + if (nrow(subdata) != 0) { + + if (length(stats::na.omit(unique(subdata$trace))) == 1) { + subdata[subdata$event == "move", "trace"] <- stats::na.omit(unique(subdata$trace)) + } else if (length(stats::na.omit(unique(subdata$trace))) > 1) { + for (i in 1:nrow(subdata)) { + if (subdata$event[i] == "move") { + if (i == 1) { + subdata$trace[i] <- stats::na.omit(unique(subdata$trace))[1] + } else { + subdata$trace[i] <- subdata$trace[i - 1] + } + } + } + } else if (all(is.na(subdata$trace))) { + for (i in 1:nrow(subdata)) { + subdata$trace[i] <- max_trace + } + } + + } else { + warning("subdata has nrow = 0") + } + subdata +} + diff --git a/R/close_events.R b/R/close_events.R index 38b4cec..87929dc 100644 --- a/R/close_events.R +++ b/R/close_events.R @@ -9,14 +9,14 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP actions <- c("Transform start", "Transform stop") idvar <- c("fileId", "folder", "eventId", "artwork", "glossar") drop <- c("popup", "topicNumber", "trace", "event") - ncol <- 16 + ncol <- 17 }, "flipCard" = { actions <- c("Show Info", "Show Front") idvar <- c("fileId", "folder", "trace", "artwork", "glossar") drop <- c("popup", "topicNumber", "eventId", "event") - ncol <- 16 + ncol <- 17 }, "openTopic" = { @@ -24,7 +24,7 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP idvar <- c("fileId", "folder", "eventId", "trace", "glossar", "artwork", "topicNumber") drop <- c("popup", "event") - ncol <- 18 + ncol <- 19 }, "openPopup" = { @@ -32,7 +32,7 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP idvar <- c("fileId", "folder", "eventId", "trace", "glossar", "artwork", "popup") drop <- c("topicNumber", "event") - ncol <- 18 + ncol <- 19 # TODO: Should topicNumber maybe also be filled in for "openPopup"? } diff --git a/R/create_eventlogs.R b/R/create_eventlogs.R index 72c7578..0a3231f 100644 --- a/R/create_eventlogs.R +++ b/R/create_eventlogs.R @@ -5,14 +5,12 @@ #' #' @param data Data frame of raw log files created with `parse_logfiles()`. #' See `?parse_logfiles` for more details. -#' @param xmlfiles Vector of names of index files, often something like -#' `.xml`. #' @param xmlpath Path to folder where XML definitions of artworks live. #' @return Data frame. #' @export #' @examples #' # tbd -create_eventlogs <- function(data, xmlfiles, xmlpath) { +create_eventlogs <- function(data, xmlpath) { if (!lubridate::is.POSIXt(data$date)){ cat("########## Convertion variable `date` to POSIXct ##########", "\n") @@ -24,11 +22,11 @@ create_eventlogs <- function(data, xmlfiles, xmlpath) { dat <- subset(data, !(data$event %in% c("Start Application", "Show Application"))) - # Create glossar dictionary ############################################## - cat("\n########## Creating glossar dictionary ##########", "\n") artworks <- unique(stats::na.omit(dat$artwork)) + # Create glossar dictionary ############################################## if ("glossar" %in% artworks) { + cat("\n########## Creating glossar dictionary ##########", "\n") artworks <- artworks[artworks != "glossar"] glossar_files <- unique(subset(dat, dat$artwork == "glossar")$popup) glossar_dict <- create_glossardict(artworks, glossar_files, xmlpath = xmlpath) @@ -82,16 +80,7 @@ create_eventlogs <- function(data, xmlfiles, xmlpath) { # Add trace for move events ############################################## cat("\n\n########## Adding trace variable for move events... ##########", "\n") dat4 <- add_trace_moves(dat3) - - # Add topics: file names and topics ###################################### - cat("\n########## Adding information about topics... ##########", "\n\n") - # remove artworks without XML information - #artworks <- artworks[!artworks %in% c("504", "505")] - # TODO: This is hardcoded! Remove it! - topics <- extract_topics(artworks, xmlfiles = xmlfiles, xmlpath = xmlpath) - - dat5 <- add_topic(dat4, topics = topics) - dat5 + dat4 } diff --git a/R/extract_artworks.R b/R/extract_artworks.R new file mode 100644 index 0000000..1500af8 --- /dev/null +++ b/R/extract_artworks.R @@ -0,0 +1,54 @@ +#' Creating data frame with information about artworks +#' +#' Information about artowrks are extracted from XML files and written to a +#' data frame that contains `artist`, `title`, `misc`, and `description`. +#' +#' @param artworks A character vector with names of the artworks. Needs to +#' correspond to the folder names which contain the XML files. +#' @param xmlfiles Vector of names of index files, often something like +#' `.xml`. Need to be in the same order as artworks! +#' @param xmlpath Path to folder where XML definitions of artworks live. +#' @return Data frame. +#' @export +#' @examples +#' # tbd +extract_artworks <- function(artworks, xmlfiles, xmlpath) { + out <- NULL + i <- 1 + + for (artwork in artworks) { + + if (length(xmlfiles) == 1) { + index_file <- xmlfiles + } else { + index_file <- xmlfiles[i] + } + + index <- paste(xmlpath, artwork, index_file, sep = "/") + varnames <- c("artist", "title", "misc", "description") + xmllist <- XML::xmlToList(index)$header[varnames] + + if (any(sapply(xmllist, is.null))) {# necessary for missing entries + names(xmllist) <- varnames + xmllist[which(sapply(xmllist, is.null))] <- NA + } + # remove German quotes + xmllist <- lapply(xmllist, function(x) gsub("\u201e|\u201c", "", x)) + # remove HTML tags + xmllist <- lapply(xmllist, function(x) gsub("
", " ", x)) + xmldat <- as.data.frame(xmllist) + xmldat$artwork <- artwork + # trim white space from strings + xmldat$artist <- trimws(xmldat$artist) + xmldat$title <- trimws(xmldat$title) + xmldat$misc <- trimws(xmldat$misc) + xmldat$description <- trimws(xmldat$description) + out <- rbind(out, xmldat) + i <- i + 1 + } + out +} + +# TODO: Check if artworks all artworks have a folder, catch it and throw +# warning + diff --git a/R/extract_topics.R b/R/extract_topics.R new file mode 100644 index 0000000..7056ebf --- /dev/null +++ b/R/extract_topics.R @@ -0,0 +1,92 @@ +#' Creating data frame with artworks and topics +#' +#' Topics are extracted from XML files and written to a data frame that +#' shows which artworks belong to which topics. +#' +#' @param artworks A character vector with names of the artworks. Needs to +#' correspond to the folder names which contain the XML files. +#' @param xmlfiles Vector of names of index files, often something like +#' `.xml`. Need to be in the same order as artworks! +#' @param xmlpath Path to folder where XML definitions of artworks live. +#' @return Data frame. +#' @export +#' @examples +#' # tbd +extract_topics <- function(artworks, xmlfiles, xmlpath) { + + out <- NULL + i <- 1 + + for (artwork in artworks) { + + index_file <- paste0(xmlpath, artwork, "/", xmlfiles[i]) + suppressWarnings( + fnames <- gsub("^$", "\\1", + grep("^$", "\\1", + grep("^ does not work with complete data set - cat("Splitting data...", "\n") - subdata_list <- pbapply::pblapply(subdata_case, split, f = ~artwork) - subdata_list <- unlist(subdata_list, recursive = FALSE) - - cat("Adding trace...", "\n") - subdata_trace <- pbapply::pblapply(subdata_list, - function(x) { - trace_max <<- trace_max + 1 - add_trace_subdata(x, max_trace = trace_max) - } - ) - - out <- dplyr::bind_rows(subdata_trace) - out <- out[order(out$fileId, out$date.start, out$timeMs.start), ] - rownames(out) <- NULL - - # Make trace a consecutive number - out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace))) - out -} - - -add_trace_subdata <- function(subdata, max_trace) { - - if (nrow(subdata) != 0) { - - if (length(stats::na.omit(unique(subdata$trace))) == 1) { - subdata[subdata$event == "move", "trace"] <- stats::na.omit(unique(subdata$trace)) - } else if (length(stats::na.omit(unique(subdata$trace))) > 1) { - for (i in 1:nrow(subdata)) { - if (subdata$event[i] == "move") { - if (i == 1) { - subdata$trace[i] <- stats::na.omit(unique(subdata$trace))[1] - } else { - subdata$trace[i] <- subdata$trace[i - 1] - } - } - } - } else if (all(is.na(subdata$trace))) { - for (i in 1:nrow(subdata)) { - subdata$trace[i] <- max_trace - } - } - - } else { - warning("subdata has nrow = 0") - } - subdata -} - - -########################################################################### - -# Create data frame with file names and topics for each artwork - -extract_topics <- function(artworks, xmlfiles, xmlpath) { - - out <- NULL - i <- 1 - - for (artwork in artworks) { - - index_file <- paste0(xmlpath, artwork, "/", xmlfiles[i]) - suppressWarnings( - fnames <- gsub("^$", "\\1", - grep("^$", "\\1", - grep("^", " ", x)) - xmldat <- as.data.frame(xmllist) - xmldat$artwork <- artwork - # trim white space from strings - xmldat$artist <- trimws(xmldat$artist) - xmldat$title <- trimws(xmldat$title) - xmldat$misc <- trimws(xmldat$misc) - xmldat$description <- trimws(xmldat$description) - out <- rbind(out, xmldat) - i <- i + 1 - } - out -} -