55 lines
1.8 KiB
R
55 lines
1.8 KiB
R
#' Creating data frame with information about artworks
#'
#' Information about artworks is extracted from XML files and written to a
#' data frame that contains `artist`, `title`, `misc`, and `description`,
#' plus an `artwork` column holding the artwork name.
#'
#' For every artwork the file `<xmlpath>/<artwork>/<xmlfile>` is parsed and
#' the four fields are read from its `header` element. Fields that are
#' missing or empty become `NA`. German quotation marks are removed,
#' `<br/>` is replaced by a space, and surrounding white space is trimmed.
#'
#' @param artworks A character vector with names of the artworks. Needs to
#'   correspond to the folder names which contain the XML files.
#' @param xmlfiles Vector of names of index files, often something like
#'   `<artwork>.xml`. Need to be in the same order as artworks! A single
#'   file name is reused for every artwork.
#' @param xmlpath Path to folder where XML definitions of artworks live.
#' @return Data frame with one row per artwork and the columns `artist`,
#'   `title`, `misc`, `description`, and `artwork`. If `artworks` is empty,
#'   a data frame with zero rows and the same columns is returned.
#' @export
#' @examples
#' # tbd
extract_artworks <- function(artworks, xmlfiles, xmlpath) {
  varnames <- c("artist", "title", "misc", "description")
  n <- length(artworks)

  # Nothing to read: return an empty data frame with the documented columns.
  if (n == 0L) {
    return(data.frame(
      artist = character(0),
      title = character(0),
      misc = character(0),
      description = character(0),
      artwork = character(0),
      stringsAsFactors = FALSE
    ))
  }

  # A single index file name is shared by all artworks; otherwise the two
  # vectors must line up one-to-one.
  if (length(xmlfiles) == 1L) {
    xmlfiles <- rep(xmlfiles, n)
  } else if (length(xmlfiles) != n) {
    stop(
      "`xmlfiles` must have length 1 or the same length as `artworks` (",
      n, "), not ", length(xmlfiles), ".",
      call. = FALSE
    )
  }

  # Strip German quotes and HTML line breaks, then trim white space.
  clean_field <- function(x) {
    x <- gsub("\u201e|\u201c", "", x)
    x <- gsub("<br/>", " ", x)
    trimws(x)
  }

  # Collect one single-row data frame per artwork and bind them once at the
  # end instead of growing the result inside the loop.
  rows <- vector("list", n)
  for (i in seq_len(n)) {
    index <- file.path(xmlpath, artworks[[i]], xmlfiles[[i]])
    header <- as.list(XML::xmlToList(index)$header)

    # Missing or empty entries come back as NULL; store them as NA so every
    # artwork yields the same four columns.
    fields <- lapply(varnames, function(v) {
      value <- header[[v]]
      if (is.null(value)) NA else value
    })
    names(fields) <- varnames

    row <- as.data.frame(lapply(fields, clean_field), stringsAsFactors = FALSE)
    row$artwork <- artworks[[i]]
    rows[[i]] <- row
  }

  out <- do.call(rbind, rows)
  rownames(out) <- NULL
  out
}
|
|
|
|
# TODO: Check that all artworks have a folder; catch missing ones and throw a
# warning
|
|
|