#' Create a data frame with information about artworks
#'
#' Information about artworks is extracted from XML files and written to a
#' data frame that contains `artist`, `title`, `misc`, and `description`,
#' plus an `artwork` column holding the name of the artwork.
#'
#' For every artwork the file `xmlpath/<artwork>/<xmlfile>` is parsed and the
#' four entries above are read from its `header` element. Entries that are
#' missing are set to `NA`. German quotation marks are removed and leading
#' and trailing white space is trimmed.
#'
#' @param artworks A character vector with names of the artworks. Needs to
#'   correspond to the folder names which contain the XML files.
#' @param xmlfiles Vector of names of index files, often something like
#'   `.xml`. Need to be in the same order as artworks! If a single name is
#'   given, it is used for every artwork.
#' @param xmlpath Path to folder where XML definitions of artworks live.
#' @return Data frame combining the entries of all artworks, or `NULL` if
#'   `artworks` is empty.
#' @export
#' @examples
#' # tbd
extract_artworks <- function(artworks, xmlfiles, xmlpath) {
  varnames <- c("artist", "title", "misc", "description")
  artworks <- as.character(artworks)

  # Fail early with a clear message instead of building a ".../NA" path that
  # only errors later inside the XML parser.
  if (length(xmlfiles) != 1L && length(xmlfiles) < length(artworks)) {
    stop(
      "`xmlfiles` must have length 1 or provide one index file per artwork.",
      call. = FALSE
    )
  }

  # A single index file name is shared by all artworks.
  if (length(xmlfiles) == 1L) {
    index_files <- rep(xmlfiles, length(artworks))
  } else {
    index_files <- xmlfiles
  }

  rows <- lapply(seq_along(artworks), function(i) {
    artwork <- artworks[i]
    index <- file.path(xmlpath, artwork, index_files[i])

    xmllist <- XML::xmlToList(index)$header[varnames]

    is_missing <- vapply(xmllist, is.null, logical(1))
    if (any(is_missing)) {
      # Missing entries lose their name when subsetting, so restore the names
      # before filling the gaps with NA.
      names(xmllist) <- varnames
      xmllist[which(is_missing)] <- NA
    }

    # remove German quotes
    xmllist <- lapply(xmllist, function(x) gsub("\u201e|\u201c", "", x))
    # remove HTML tags
    # NOTE(review): the pattern is a line break in the source as received; it
    # may originally have been an HTML tag lost in extraction -- confirm.
    xmllist <- lapply(xmllist, function(x) gsub("\n", " ", x))

    xmldat <- as.data.frame(xmllist)
    xmldat$artwork <- artwork

    # trim white space from strings
    for (varname in varnames) {
      xmldat[[varname]] <- trimws(xmldat[[varname]])
    }

    xmldat
  })

  # Bind once at the end instead of growing the data frame inside the loop;
  # an empty list yields NULL.
  do.call(rbind, rows)
}

# TODO: Check if all artworks have a folder, catch it and throw warning
# TODO: Is this function generic for most projects? If yes, adjust variable
#   names; if no, maybe remove it from the package?