Removed extract_topics() from create_eventlogs(); removed helper.R and put everything in separate files

This commit is contained in:
Nora Wickelmaier 2023-09-25 11:29:35 +02:00
parent 85d3cfc2fa
commit 0e911bed3f
8 changed files with 244 additions and 233 deletions

25
R/add_case.R Normal file
View File

@ -0,0 +1,25 @@
###########################################################################
# Add case variable
add_case <- function(data, cutoff = 20) {
# TODO: What is the best choice for the cutoff here?
data$timediff <- as.numeric(diff(c(data$date.start[1], data$date.start)))
data$case <- NA
j <- 1
pb <- utils::txtProgressBar(min = 0, max = nrow(data), initial = NA, style = 3)
for (i in seq_len(nrow(data))) {
if (data$timediff[i] <= cutoff) {
data$case[i] <- j
} else {
j <- j + 1
data$case[i] <- j
}
utils::setTxtProgressBar(pb, i)
}
data$timediff <- NULL
data
}

0
R/add_metadata.R Normal file
View File

View File

@ -107,3 +107,68 @@ add_trace <- function(data, glossar_dict) {
out
}
###########################################################################
# Add trace for moves
add_trace_moves <- function(data) {
pbapply::pboptions(style = 3, char = "=")
trace_max <- max(data$trace, na.rm = TRUE)
#subdata_art <- split(data, ~ artwork)
subdata_case <- split(data, ~ case)
#subdata_list <- split(data, ~ artwork + case)
# --> does not work with complete data set
cat("Splitting data...", "\n")
subdata_list <- pbapply::pblapply(subdata_case, split, f = ~artwork)
subdata_list <- unlist(subdata_list, recursive = FALSE)
cat("Adding trace...", "\n")
subdata_trace <- pbapply::pblapply(subdata_list,
function(x) {
trace_max <<- trace_max + 1
add_trace_subdata(x, max_trace = trace_max)
}
)
out <- dplyr::bind_rows(subdata_trace)
out <- out[order(out$fileId, out$date.start, out$timeMs.start), ]
rownames(out) <- NULL
# Make trace a consecutive number
out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace)))
out
}
add_trace_subdata <- function(subdata, max_trace) {
if (nrow(subdata) != 0) {
if (length(stats::na.omit(unique(subdata$trace))) == 1) {
subdata[subdata$event == "move", "trace"] <- stats::na.omit(unique(subdata$trace))
} else if (length(stats::na.omit(unique(subdata$trace))) > 1) {
for (i in 1:nrow(subdata)) {
if (subdata$event[i] == "move") {
if (i == 1) {
subdata$trace[i] <- stats::na.omit(unique(subdata$trace))[1]
} else {
subdata$trace[i] <- subdata$trace[i - 1]
}
}
}
} else if (all(is.na(subdata$trace))) {
for (i in 1:nrow(subdata)) {
subdata$trace[i] <- max_trace
}
}
} else {
warning("subdata has nrow = 0")
}
subdata
}

View File

@ -9,14 +9,14 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
actions <- c("Transform start", "Transform stop")
idvar <- c("fileId", "folder", "eventId", "artwork", "glossar")
drop <- c("popup", "topicNumber", "trace", "event")
ncol <- 16
ncol <- 17
},
"flipCard" = {
actions <- c("Show Info", "Show Front")
idvar <- c("fileId", "folder", "trace", "artwork", "glossar")
drop <- c("popup", "topicNumber", "eventId", "event")
ncol <- 16
ncol <- 17
},
"openTopic" = {
@ -24,7 +24,7 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
idvar <- c("fileId", "folder", "eventId", "trace", "glossar",
"artwork", "topicNumber")
drop <- c("popup", "event")
ncol <- 18
ncol <- 19
},
"openPopup" = {
@ -32,7 +32,7 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
idvar <- c("fileId", "folder", "eventId", "trace", "glossar",
"artwork", "popup")
drop <- c("topicNumber", "event")
ncol <- 18
ncol <- 19
# TODO: Should topicNumber maybe also be filled in for "openPopup"?
}

View File

@ -5,14 +5,12 @@
#'
#' @param data Data frame of raw log files created with `parse_logfiles()`.
#' See `?parse_logfiles` for more details.
#' @param xmlfiles Vector of names of index files, often something like
#' `<artwork>.xml`.
#' @param xmlpath Path to folder where XML definitions of artworks live.
#' @return Data frame.
#' @export
#' @examples
#' # tbd
create_eventlogs <- function(data, xmlfiles, xmlpath) {
create_eventlogs <- function(data, xmlpath) {
if (!lubridate::is.POSIXt(data$date)){
cat("########## Convertion variable `date` to POSIXct ##########", "\n")
@ -24,11 +22,11 @@ create_eventlogs <- function(data, xmlfiles, xmlpath) {
dat <- subset(data, !(data$event %in% c("Start Application",
"Show Application")))
# Create glossar dictionary ##############################################
cat("\n########## Creating glossar dictionary ##########", "\n")
artworks <- unique(stats::na.omit(dat$artwork))
# Create glossar dictionary ##############################################
if ("glossar" %in% artworks) {
cat("\n########## Creating glossar dictionary ##########", "\n")
artworks <- artworks[artworks != "glossar"]
glossar_files <- unique(subset(dat, dat$artwork == "glossar")$popup)
glossar_dict <- create_glossardict(artworks, glossar_files, xmlpath = xmlpath)
@ -82,16 +80,7 @@ create_eventlogs <- function(data, xmlfiles, xmlpath) {
# Add trace for move events ##############################################
cat("\n\n########## Adding trace variable for move events... ##########", "\n")
dat4 <- add_trace_moves(dat3)
# Add topics: file names and topics ######################################
cat("\n########## Adding information about topics... ##########", "\n\n")
# remove artworks without XML information
#artworks <- artworks[!artworks %in% c("504", "505")]
# TODO: This is hardcoded! Remove it!
topics <- extract_topics(artworks, xmlfiles = xmlfiles, xmlpath = xmlpath)
dat5 <- add_topic(dat4, topics = topics)
dat5
dat4
}

54
R/extract_artworks.R Normal file
View File

@ -0,0 +1,54 @@
#' Creating data frame with information about artworks
#'
#' Information about artowrks are extracted from XML files and written to a
#' data frame that contains `artist`, `title`, `misc`, and `description`.
#'
#' @param artworks A character vector with names of the artworks. Needs to
#' correspond to the folder names which contain the XML files.
#' @param xmlfiles Vector of names of index files, often something like
#' `<artwork>.xml`. Need to be in the same order as artworks!
#' @param xmlpath Path to folder where XML definitions of artworks live.
#' @return Data frame.
#' @export
#' @examples
#' # tbd
extract_artworks <- function(artworks, xmlfiles, xmlpath) {
out <- NULL
i <- 1
for (artwork in artworks) {
if (length(xmlfiles) == 1) {
index_file <- xmlfiles
} else {
index_file <- xmlfiles[i]
}
index <- paste(xmlpath, artwork, index_file, sep = "/")
varnames <- c("artist", "title", "misc", "description")
xmllist <- XML::xmlToList(index)$header[varnames]
if (any(sapply(xmllist, is.null))) {# necessary for missing entries
names(xmllist) <- varnames
xmllist[which(sapply(xmllist, is.null))] <- NA
}
# remove German quotes
xmllist <- lapply(xmllist, function(x) gsub("\u201e|\u201c", "", x))
# remove HTML tags
xmllist <- lapply(xmllist, function(x) gsub("<br/>", " ", x))
xmldat <- as.data.frame(xmllist)
xmldat$artwork <- artwork
# trim white space from strings
xmldat$artist <- trimws(xmldat$artist)
xmldat$title <- trimws(xmldat$title)
xmldat$misc <- trimws(xmldat$misc)
xmldat$description <- trimws(xmldat$description)
out <- rbind(out, xmldat)
i <- i + 1
}
out
}
# TODO: Check if artworks all artworks have a folder, catch it and throw
# warning

92
R/extract_topics.R Normal file
View File

@ -0,0 +1,92 @@
#' Creating data frame with artworks and topics
#'
#' Topics are extracted from XML files and written to a data frame that
#' shows which artworks belong to which topics.
#'
#' @param artworks A character vector with names of the artworks. Needs to
#' correspond to the folder names which contain the XML files.
#' @param xmlfiles Vector of names of index files, often something like
#' `<artwork>.xml`. Need to be in the same order as artworks!
#' @param xmlpath Path to folder where XML definitions of artworks live.
#' @return Data frame.
#' @export
#' @examples
#' # tbd
extract_topics <- function(artworks, xmlfiles, xmlpath) {
out <- NULL
i <- 1
for (artwork in artworks) {
index_file <- paste0(xmlpath, artwork, "/", xmlfiles[i])
suppressWarnings(
fnames <- gsub("^<card src=.*/(.*)./>$", "\\1",
grep("^<card src=", trimws(readLines(index_file)),
value = TRUE))
)
topic <- NULL
for (fname in fnames) {
suppressWarnings(
topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
grep("^<card type=",
trimws(readLines(paste(xmlpath, artwork, fname, sep = "/"))),
value = T)))
)
}
out <- rbind(out, data.frame(artwork, file_name = fnames, topic))
i <- i + 1
}
out <- out[order(out$artwork), ]
rownames(out) <- NULL
out$index <- unlist(lapply(table(out$artwork), seq_len))
out
}
###########################################################################
# Add topics: file names and topics
add_topic <- function(data, topics) {
artworks <- unique(data$artwork)
tab_art <- lapply(artworks,
function(x) names(table(data$topicNumber[data$artwork == x])))
names(tab_art) <- artworks
tab_index <- lapply(tab_art, seq_along)
dat_split <- split(data, ~ artwork)
set_label <- function(x) {
artwork <- unique(x$artwork)
x$topicIndex <- factor(x$topicNumber, labels = tab_index[[artwork]])
x
}
dat_label <- lapply(dat_split, set_label)
set_topic <- function(x) {
artwork <- unique(x$artwork)
labels_file <- topics[topics$artwork == artwork,
"file_name"][as.numeric(levels(x$topicIndex))]
x$topicFile <- as.character(factor(x$topicIndex, labels = labels_file))
labels_topic <- topics[topics$artwork == artwork,
"topic"][as.numeric(levels(x$topicIndex))]
x$topic <- as.character(factor(x$topicIndex, labels = labels_topic))
x
}
dat_topic <- lapply(dat_label, set_topic)
#out <- do.call(rbind, dat_topic)
out <- dplyr::bind_rows(dat_topic)
out$topicIndex <- as.numeric(out$topicIndex)
out <- out[order(out$fileId, out$date.start, out$timeMs.start), ]
rownames(out) <- NULL
out
}

View File

@ -1,214 +0,0 @@
###########################################################################
# Add case variable
add_case <- function(data, cutoff = 20) {
# TODO: What is the best choice for the cutoff here?
data$timediff <- as.numeric(diff(c(data$date.start[1], data$date.start)))
data$case <- NA
j <- 1
pb <- utils::txtProgressBar(min = 0, max = nrow(data), initial = NA, style = 3)
for (i in seq_len(nrow(data))) {
if (data$timediff[i] <= cutoff) {
data$case[i] <- j
} else {
j <- j + 1
data$case[i] <- j
}
utils::setTxtProgressBar(pb, i)
}
data$timediff <- NULL
data
}
###########################################################################
# Add trace for moves
add_trace_moves <- function(data) {
pbapply::pboptions(style = 3, char = "=")
trace_max <- max(data$trace, na.rm = TRUE)
#subdata_art <- split(data, ~ artwork)
subdata_case <- split(data, ~ case)
#subdata_list <- split(data, ~ artwork + case)
# --> does not work with complete data set
cat("Splitting data...", "\n")
subdata_list <- pbapply::pblapply(subdata_case, split, f = ~artwork)
subdata_list <- unlist(subdata_list, recursive = FALSE)
cat("Adding trace...", "\n")
subdata_trace <- pbapply::pblapply(subdata_list,
function(x) {
trace_max <<- trace_max + 1
add_trace_subdata(x, max_trace = trace_max)
}
)
out <- dplyr::bind_rows(subdata_trace)
out <- out[order(out$fileId, out$date.start, out$timeMs.start), ]
rownames(out) <- NULL
# Make trace a consecutive number
out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace)))
out
}
add_trace_subdata <- function(subdata, max_trace) {
if (nrow(subdata) != 0) {
if (length(stats::na.omit(unique(subdata$trace))) == 1) {
subdata[subdata$event == "move", "trace"] <- stats::na.omit(unique(subdata$trace))
} else if (length(stats::na.omit(unique(subdata$trace))) > 1) {
for (i in 1:nrow(subdata)) {
if (subdata$event[i] == "move") {
if (i == 1) {
subdata$trace[i] <- stats::na.omit(unique(subdata$trace))[1]
} else {
subdata$trace[i] <- subdata$trace[i - 1]
}
}
}
} else if (all(is.na(subdata$trace))) {
for (i in 1:nrow(subdata)) {
subdata$trace[i] <- max_trace
}
}
} else {
warning("subdata has nrow = 0")
}
subdata
}
###########################################################################
# Create data frame with file names and topics for each artwork
extract_topics <- function(artworks, xmlfiles, xmlpath) {
out <- NULL
i <- 1
for (artwork in artworks) {
index_file <- paste0(xmlpath, artwork, "/", xmlfiles[i])
suppressWarnings(
fnames <- gsub("^<card src=.*/(.*)./>$", "\\1",
grep("^<card src=", trimws(readLines(index_file)),
value = TRUE))
)
topic <- NULL
for (fname in fnames) {
suppressWarnings(
topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
grep("^<card type=",
trimws(readLines(paste(xmlpath, artwork, fname, sep = "/"))),
value = T)))
)
}
out <- rbind(out, data.frame(artwork, file_name = fnames, topic))
i <- i + 1
}
out <- out[order(out$artwork), ]
rownames(out) <- NULL
out$index <- unlist(lapply(table(out$artwork), seq_len))
out
}
###########################################################################
# Add topics: file names and topics
add_topic <- function(data, topics) {
artworks <- unique(data$artwork)
tab_art <- lapply(artworks,
function(x) names(table(data$topicNumber[data$artwork == x])))
names(tab_art) <- artworks
tab_index <- lapply(tab_art, seq_along)
dat_split <- split(data, ~ artwork)
set_label <- function(x) {
artwork <- unique(x$artwork)
x$topicIndex <- factor(x$topicNumber, labels = tab_index[[artwork]])
x
}
dat_label <- lapply(dat_split, set_label)
set_topic <- function(x) {
artwork <- unique(x$artwork)
labels_file <- topics[topics$artwork == artwork,
"file_name"][as.numeric(levels(x$topicIndex))]
x$topicFile <- as.character(factor(x$topicIndex, labels = labels_file))
labels_topic <- topics[topics$artwork == artwork,
"topic"][as.numeric(levels(x$topicIndex))]
x$topic <- as.character(factor(x$topicIndex, labels = labels_topic))
x
}
dat_topic <- lapply(dat_label, set_topic)
#out <- do.call(rbind, dat_topic)
out <- dplyr::bind_rows(dat_topic)
out$topicIndex <- as.numeric(out$topicIndex)
out <- out[order(out$fileId, out$date.start, out$timeMs.start), ]
rownames(out) <- NULL
out
}
###########################################################################
# Create data frame with information on artworks
extract_artworks <- function(artworks, files, xmlpath) {
out <- NULL
i <- 1
for (artwork in artworks) {
if (length(files) == 1) {
index_file <- files
} else {
index_file <- files[i]
}
index <- paste(xmlpath, artwork, index_file, sep = "/")
varnames <- c("artist", "title", "misc", "description")
xmllist <- XML::xmlToList(index)$header[varnames]
if (any(sapply(xmllist, is.null))) {# necessary for missing entries
names(xmllist) <- varnames
xmllist[which(sapply(xmllist, is.null))] <- NA
}
# remove German quotes
xmllist <- lapply(xmllist, function(x) gsub("\u201e|\u201c", "", x))
# remove HTML tags
xmllist <- lapply(xmllist, function(x) gsub("<br/>", " ", x))
xmldat <- as.data.frame(xmllist)
xmldat$artwork <- artwork
# trim white space from strings
xmldat$artist <- trimws(xmldat$artist)
xmldat$title <- trimws(xmldat$title)
xmldat$misc <- trimws(xmldat$misc)
xmldat$description <- trimws(xmldat$description)
out <- rbind(out, xmldat)
i <- i + 1
}
out
}