# mtt/R/extract_topics.R (47 lines, 1.4 KiB, R)

#' Creating data frame with artworks and topics
#'
#' Topics are extracted from XML files and written to a data frame that
#' shows which artworks belong to which topics.
#'
#' For every artwork the index file is scanned for lines starting with
#' `<card src=`; the file name referenced by each such line is then read from
#' the artwork's folder and scanned for lines starting with `<card type=`,
#' whose value is used as the topic. Leading and trailing whitespace of each
#' line is ignored while scanning.
#'
#' @param artworks A character vector with names of the artworks. Needs to
#'   correspond to the folder names which contain the XML files.
#' @param xmlfiles Vector of names of index files, often something like
#'   `<artwork>.xml`. Need to be in the same order as artworks! Must contain
#'   at least as many entries as `artworks`.
#' @param xmlpath Path to folder where XML definitions of artworks live. May
#'   be given with or without a trailing slash.
#' @return Data frame with the columns `artwork`, `file_name` and `topic`,
#'   sorted by `artwork`. A card file without a `<card type=` line yields a
#'   row with `topic = NA`; a card file with several such lines yields one
#'   row per topic. If nothing is found, a data frame with zero rows is
#'   returned.
#' @export
#' @examples
#' \dontrun{
#' extract_topics(
#'   artworks = c("artwork_a", "artwork_b"),
#'   xmlfiles = c("artwork_a.xml", "artwork_b.xml"),
#'   xmlpath = "path/to/xml"
#' )
#' }
extract_topics <- function(artworks, xmlfiles, xmlpath) {
  # A `for` loop over a factor iterates over its labels; keep accepting
  # factors by converting explicitly.
  artworks <- as.character(artworks)

  if (length(xmlfiles) < length(artworks)) {
    stop("`xmlfiles` must provide one index file for each entry of ",
         "`artworks`.", call. = FALSE)
  }

  if (length(artworks) == 0) {
    return(data.frame(artwork = character(0), file_name = character(0),
                      topic = character(0)))
  }

  # Normalise the base folder so that paths are built the same way for index
  # and card files, no matter whether `xmlpath` ends in a slash.
  root <- if (nzchar(xmlpath)) sub("/+$", "", xmlpath) else "."

  # Read `path`, keep the trimmed lines matching `prefix`, and reduce each of
  # them to the first capture group of `pattern`. `warn = FALSE` silences only
  # the "incomplete final line" warning instead of suppressing all warnings.
  extract_attr <- function(path, prefix, pattern) {
    lines <- trimws(readLines(path, warn = FALSE))
    gsub(pattern, "\\1", grep(prefix, lines, value = TRUE))
  }

  per_artwork <- Map(function(artwork, index_name) {
    folder <- file.path(root, artwork)
    fnames <- extract_attr(file.path(folder, index_name),
                           "^<card src=", "^<card src=.*/(.*)./>$")

    # Collect topics per file so that they always stay aligned with the file
    # they were read from; files without a topic are kept with NA.
    topics <- lapply(fnames, function(fname) {
      found <- extract_attr(file.path(folder, fname),
                            "^<card type=", "^<card type=.(.*).>$")
      if (length(found) > 0) found else NA_character_
    })
    n_topics <- lengths(topics)

    data.frame(
      artwork = rep(artwork, sum(n_topics)),
      file_name = rep(fnames, n_topics),
      topic = as.character(unlist(topics))
    )
  }, artworks, xmlfiles[seq_along(artworks)])

  # unname() keeps artwork names from being matched as arguments of rbind().
  out <- do.call(rbind, unname(per_artwork))
  out <- out[order(out$artwork), ]
  rownames(out) <- NULL
  out
}