# mtt/R/extract_artworks.R
#
# 56 lines
# 1.9 KiB
# R

#' Create a data frame with information about artworks
#'
#' Information about artworks is extracted from XML files and written to a
#' data frame that contains `artist`, `title`, `misc`, and `description`,
#' plus an `artwork` column holding the artwork name.
#'
#' For each artwork the index file is read from
#' `<xmlpath>/<artwork>/<index file>` and the four fields are taken from its
#' `header` element. Fields that come back as `NULL` (missing entries) are
#' set to `NA`. German quotation marks (U+201E, U+201C) are removed, literal
#' `<br/>` tags are replaced by a space, and surrounding white space is
#' trimmed.
#'
#' @param artworks A character vector with names of the artworks. Needs to
#'   correspond to the folder names which contain the XML files.
#' @param xmlfiles Vector of names of index files, often something like
#'   `<artwork>.xml`. Either a single name that is used for every artwork, or
#'   one name per artwork in the same order as `artworks`.
#' @param xmlpath Path to folder where XML definitions of artworks live.
#' @return Data frame with the columns `artist`, `title`, `misc`,
#'   `description`, and `artwork`; the rows of all artworks are stacked in the
#'   order of `artworks`. `NULL` if `artworks` is empty.
#' @export
#' @examples
#' # tbd
extract_artworks <- function(artworks, xmlfiles, xmlpath) {
  varnames <- c("artist", "title", "misc", "description")

  # A single index file name is shared by all artworks; otherwise there has to
  # be one per artwork. Fail early instead of silently indexing past the end
  # of `xmlfiles`, which would produce an `NA` file name.
  if (length(xmlfiles) != 1 && length(xmlfiles) < length(artworks)) {
    stop(
      "`xmlfiles` must have length 1 or provide one entry per artwork (",
      length(artworks), "), not length ", length(xmlfiles), ".",
      call. = FALSE
    )
  }

  # Build one data frame per artwork and bind them once at the end, rather
  # than growing the result with rbind() on every iteration.
  rows <- lapply(seq_along(artworks), function(i) {
    artwork <- artworks[[i]]
    index_file <- if (length(xmlfiles) == 1) xmlfiles else xmlfiles[i]
    index <- file.path(xmlpath, artwork, index_file)

    xmllist <- XML::xmlToList(index)$header[varnames]
    is_missing <- vapply(xmllist, is.null, logical(1))
    if (any(is_missing)) {
      # Entries that are absent from the header come back unnamed and NULL;
      # restore the names and use NA so that they survive as columns.
      names(xmllist) <- varnames
      xmllist[is_missing] <- NA
    }

    xmllist <- lapply(xmllist, function(x) {
      # remove German quotes
      x <- gsub("\u201e|\u201c", "", x)
      # remove HTML tags
      gsub("<br/>", " ", x)
    })

    xmldat <- as.data.frame(xmllist)
    xmldat$artwork <- artwork
    # trim white space from strings
    for (varname in varnames) {
      xmldat[[varname]] <- trimws(xmldat[[varname]])
    }
    xmldat
  })

  # do.call(rbind, list()) is NULL, matching the result for empty `artworks`.
  do.call(rbind, rows)
}
# TODO: Check that every artwork has a folder in `xmlpath`; catch missing
# folders and throw a warning.
# TODO: Is this function generic enough for most projects? If yes, adjust the
# variable names; if no, maybe remove it from the package.