mtt/R/extract_topics.R

93 lines
2.7 KiB
R

#' Creating data frame with artworks and topics
#'
#' Topics are extracted from XML files and written to a data frame that
#' shows which artworks belong to which topics.
#'
#' @param artworks A character vector with names of the artworks. Needs to
#' correspond to the folder names which contain the XML files.
#' @param xmlfiles Vector of names of index files, often something like
#' `<artwork>.xml`. Need to be in the same order as artworks!
#' @param xmlpath Path to folder where XML definitions of artworks live.
#' @return Data frame.
#' @export
#' @examples
#' # tbd
extract_topics <- function(artworks, xmlfiles, xmlpath) {
out <- NULL
i <- 1
for (artwork in artworks) {
index_file <- paste0(xmlpath, artwork, "/", xmlfiles[i])
suppressWarnings(
fnames <- gsub("^<card src=.*/(.*)./>$", "\\1",
grep("^<card src=", trimws(readLines(index_file)),
value = TRUE))
)
topic <- NULL
for (fname in fnames) {
suppressWarnings(
topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
grep("^<card type=",
trimws(readLines(paste(xmlpath, artwork, fname, sep = "/"))),
value = T)))
)
}
out <- rbind(out, data.frame(artwork, file_name = fnames, topic))
i <- i + 1
}
out <- out[order(out$artwork), ]
rownames(out) <- NULL
out$index <- unlist(lapply(table(out$artwork), seq_len))
out
}
###########################################################################
# Add topics: file names and topics
add_topic <- function(data, topics) {
artworks <- unique(data$artwork)
tab_art <- lapply(artworks,
function(x) names(table(data$topicNumber[data$artwork == x])))
names(tab_art) <- artworks
tab_index <- lapply(tab_art, seq_along)
dat_split <- split(data, ~ artwork)
set_label <- function(x) {
artwork <- unique(x$artwork)
x$topicIndex <- factor(x$topicNumber, labels = tab_index[[artwork]])
x
}
dat_label <- lapply(dat_split, set_label)
set_topic <- function(x) {
artwork <- unique(x$artwork)
labels_file <- topics[topics$artwork == artwork,
"file_name"][as.numeric(levels(x$topicIndex))]
x$topicFile <- as.character(factor(x$topicIndex, labels = labels_file))
labels_topic <- topics[topics$artwork == artwork,
"topic"][as.numeric(levels(x$topicIndex))]
x$topic <- as.character(factor(x$topicIndex, labels = labels_topic))
x
}
dat_topic <- lapply(dat_label, set_topic)
#out <- do.call(rbind, dat_topic)
out <- dplyr::bind_rows(dat_topic)
out$topicIndex <- as.numeric(out$topicIndex)
out <- out[order(out$fileId.start, out$date.start, out$timeMs.start), ]
rownames(out) <- NULL
out
}