93 lines
2.7 KiB
R
93 lines
2.7 KiB
R
#' Creating data frame with artworks and topics
|
|
#'
|
|
#' Topics are extracted from XML files and written to a data frame that
|
|
#' shows which artworks belong to which topics.
|
|
#'
|
|
#' @param artworks A character vector with names of the artworks. Needs to
|
|
#' correspond to the folder names which contain the XML files.
|
|
#' @param xmlfiles Vector of names of index files, often something like
|
|
#' `<artwork>.xml`. Need to be in the same order as artworks!
|
|
#' @param xmlpath Path to folder where XML definitions of artworks live.
|
|
#' @return Data frame.
|
|
#' @export
|
|
#' @examples
|
|
#' # tbd
|
|
extract_topics <- function(artworks, xmlfiles, xmlpath) {
|
|
|
|
out <- NULL
|
|
i <- 1
|
|
|
|
for (artwork in artworks) {
|
|
|
|
index_file <- paste0(xmlpath, artwork, "/", xmlfiles[i])
|
|
suppressWarnings(
|
|
fnames <- gsub("^<card src=.*/(.*)./>$", "\\1",
|
|
grep("^<card src=", trimws(readLines(index_file)),
|
|
value = TRUE))
|
|
)
|
|
|
|
topic <- NULL
|
|
for (fname in fnames) {
|
|
suppressWarnings(
|
|
topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
|
|
grep("^<card type=",
|
|
trimws(readLines(paste(xmlpath, artwork, fname, sep = "/"))),
|
|
value = T)))
|
|
)
|
|
}
|
|
out <- rbind(out, data.frame(artwork, file_name = fnames, topic))
|
|
i <- i + 1
|
|
}
|
|
|
|
out <- out[order(out$artwork), ]
|
|
rownames(out) <- NULL
|
|
|
|
out$index <- unlist(lapply(table(out$artwork), seq_len))
|
|
out
|
|
}
|
|
|
|
###########################################################################
|
|
|
|
# Add topics: file names and topics
|
|
|
|
add_topic <- function(data, topics) {
|
|
|
|
artworks <- unique(data$artwork)
|
|
tab_art <- lapply(artworks,
|
|
function(x) names(table(data$topicNumber[data$artwork == x])))
|
|
names(tab_art) <- artworks
|
|
|
|
tab_index <- lapply(tab_art, seq_along)
|
|
|
|
dat_split <- split(data, ~ artwork)
|
|
|
|
set_label <- function(x) {
|
|
artwork <- unique(x$artwork)
|
|
x$topicIndex <- factor(x$topicNumber, labels = tab_index[[artwork]])
|
|
x
|
|
}
|
|
|
|
dat_label <- lapply(dat_split, set_label)
|
|
|
|
set_topic <- function(x) {
|
|
artwork <- unique(x$artwork)
|
|
labels_file <- topics[topics$artwork == artwork,
|
|
"file_name"][as.numeric(levels(x$topicIndex))]
|
|
x$topicFile <- as.character(factor(x$topicIndex, labels = labels_file))
|
|
labels_topic <- topics[topics$artwork == artwork,
|
|
"topic"][as.numeric(levels(x$topicIndex))]
|
|
x$topic <- as.character(factor(x$topicIndex, labels = labels_topic))
|
|
x
|
|
}
|
|
|
|
dat_topic <- lapply(dat_label, set_topic)
|
|
|
|
#out <- do.call(rbind, dat_topic)
|
|
out <- dplyr::bind_rows(dat_topic)
|
|
out$topicIndex <- as.numeric(out$topicIndex)
|
|
out <- out[order(out$fileId.start, out$date.start, out$timeMs.start), ]
|
|
rownames(out) <- NULL
|
|
out
|
|
}
|
|
|