Removed extract_topics() from create_eventlogs(); removed helper.R and put everything in separate files
This commit is contained in:
parent
85d3cfc2fa
commit
0e911bed3f
25
R/add_case.R
Normal file
25
R/add_case.R
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
###########################################################################
|
||||||
|
|
||||||
|
# Add case variable
|
||||||
|
|
||||||
|
add_case <- function(data, cutoff = 20) {
|
||||||
|
# TODO: What is the best choice for the cutoff here?
|
||||||
|
|
||||||
|
data$timediff <- as.numeric(diff(c(data$date.start[1], data$date.start)))
|
||||||
|
data$case <- NA
|
||||||
|
j <- 1
|
||||||
|
pb <- utils::txtProgressBar(min = 0, max = nrow(data), initial = NA, style = 3)
|
||||||
|
|
||||||
|
for (i in seq_len(nrow(data))) {
|
||||||
|
if (data$timediff[i] <= cutoff) {
|
||||||
|
data$case[i] <- j
|
||||||
|
} else {
|
||||||
|
j <- j + 1
|
||||||
|
data$case[i] <- j
|
||||||
|
}
|
||||||
|
utils::setTxtProgressBar(pb, i)
|
||||||
|
}
|
||||||
|
data$timediff <- NULL
|
||||||
|
data
|
||||||
|
}
|
||||||
|
|
0
R/add_metadata.R
Normal file
0
R/add_metadata.R
Normal file
@ -107,3 +107,68 @@ add_trace <- function(data, glossar_dict) {
|
|||||||
out
|
out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
###########################################################################
|
||||||
|
|
||||||
|
# Add trace for moves
|
||||||
|
|
||||||
|
add_trace_moves <- function(data) {
|
||||||
|
|
||||||
|
pbapply::pboptions(style = 3, char = "=")
|
||||||
|
|
||||||
|
trace_max <- max(data$trace, na.rm = TRUE)
|
||||||
|
|
||||||
|
#subdata_art <- split(data, ~ artwork)
|
||||||
|
subdata_case <- split(data, ~ case)
|
||||||
|
|
||||||
|
#subdata_list <- split(data, ~ artwork + case)
|
||||||
|
# --> does not work with complete data set
|
||||||
|
cat("Splitting data...", "\n")
|
||||||
|
subdata_list <- pbapply::pblapply(subdata_case, split, f = ~artwork)
|
||||||
|
subdata_list <- unlist(subdata_list, recursive = FALSE)
|
||||||
|
|
||||||
|
cat("Adding trace...", "\n")
|
||||||
|
subdata_trace <- pbapply::pblapply(subdata_list,
|
||||||
|
function(x) {
|
||||||
|
trace_max <<- trace_max + 1
|
||||||
|
add_trace_subdata(x, max_trace = trace_max)
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
out <- dplyr::bind_rows(subdata_trace)
|
||||||
|
out <- out[order(out$fileId, out$date.start, out$timeMs.start), ]
|
||||||
|
rownames(out) <- NULL
|
||||||
|
|
||||||
|
# Make trace a consecutive number
|
||||||
|
out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace)))
|
||||||
|
out
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
add_trace_subdata <- function(subdata, max_trace) {
|
||||||
|
|
||||||
|
if (nrow(subdata) != 0) {
|
||||||
|
|
||||||
|
if (length(stats::na.omit(unique(subdata$trace))) == 1) {
|
||||||
|
subdata[subdata$event == "move", "trace"] <- stats::na.omit(unique(subdata$trace))
|
||||||
|
} else if (length(stats::na.omit(unique(subdata$trace))) > 1) {
|
||||||
|
for (i in 1:nrow(subdata)) {
|
||||||
|
if (subdata$event[i] == "move") {
|
||||||
|
if (i == 1) {
|
||||||
|
subdata$trace[i] <- stats::na.omit(unique(subdata$trace))[1]
|
||||||
|
} else {
|
||||||
|
subdata$trace[i] <- subdata$trace[i - 1]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (all(is.na(subdata$trace))) {
|
||||||
|
for (i in 1:nrow(subdata)) {
|
||||||
|
subdata$trace[i] <- max_trace
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
warning("subdata has nrow = 0")
|
||||||
|
}
|
||||||
|
subdata
|
||||||
|
}
|
||||||
|
|
||||||
|
@ -9,14 +9,14 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
|
|||||||
actions <- c("Transform start", "Transform stop")
|
actions <- c("Transform start", "Transform stop")
|
||||||
idvar <- c("fileId", "folder", "eventId", "artwork", "glossar")
|
idvar <- c("fileId", "folder", "eventId", "artwork", "glossar")
|
||||||
drop <- c("popup", "topicNumber", "trace", "event")
|
drop <- c("popup", "topicNumber", "trace", "event")
|
||||||
ncol <- 16
|
ncol <- 17
|
||||||
|
|
||||||
},
|
},
|
||||||
"flipCard" = {
|
"flipCard" = {
|
||||||
actions <- c("Show Info", "Show Front")
|
actions <- c("Show Info", "Show Front")
|
||||||
idvar <- c("fileId", "folder", "trace", "artwork", "glossar")
|
idvar <- c("fileId", "folder", "trace", "artwork", "glossar")
|
||||||
drop <- c("popup", "topicNumber", "eventId", "event")
|
drop <- c("popup", "topicNumber", "eventId", "event")
|
||||||
ncol <- 16
|
ncol <- 17
|
||||||
|
|
||||||
},
|
},
|
||||||
"openTopic" = {
|
"openTopic" = {
|
||||||
@ -24,7 +24,7 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
|
|||||||
idvar <- c("fileId", "folder", "eventId", "trace", "glossar",
|
idvar <- c("fileId", "folder", "eventId", "trace", "glossar",
|
||||||
"artwork", "topicNumber")
|
"artwork", "topicNumber")
|
||||||
drop <- c("popup", "event")
|
drop <- c("popup", "event")
|
||||||
ncol <- 18
|
ncol <- 19
|
||||||
|
|
||||||
},
|
},
|
||||||
"openPopup" = {
|
"openPopup" = {
|
||||||
@ -32,7 +32,7 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
|
|||||||
idvar <- c("fileId", "folder", "eventId", "trace", "glossar",
|
idvar <- c("fileId", "folder", "eventId", "trace", "glossar",
|
||||||
"artwork", "popup")
|
"artwork", "popup")
|
||||||
drop <- c("topicNumber", "event")
|
drop <- c("topicNumber", "event")
|
||||||
ncol <- 18
|
ncol <- 19
|
||||||
# TODO: Should topicNumber maybe also be filled in for "openPopup"?
|
# TODO: Should topicNumber maybe also be filled in for "openPopup"?
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -5,14 +5,12 @@
|
|||||||
#'
|
#'
|
||||||
#' @param data Data frame of raw log files created with `parse_logfiles()`.
|
#' @param data Data frame of raw log files created with `parse_logfiles()`.
|
||||||
#' See `?parse_logfiles` for more details.
|
#' See `?parse_logfiles` for more details.
|
||||||
#' @param xmlfiles Vector of names of index files, often something like
|
|
||||||
#' `<artwork>.xml`.
|
|
||||||
#' @param xmlpath Path to folder where XML definitions of artworks live.
|
#' @param xmlpath Path to folder where XML definitions of artworks live.
|
||||||
#' @return Data frame.
|
#' @return Data frame.
|
||||||
#' @export
|
#' @export
|
||||||
#' @examples
|
#' @examples
|
||||||
#' # tbd
|
#' # tbd
|
||||||
create_eventlogs <- function(data, xmlfiles, xmlpath) {
|
create_eventlogs <- function(data, xmlpath) {
|
||||||
|
|
||||||
if (!lubridate::is.POSIXt(data$date)){
|
if (!lubridate::is.POSIXt(data$date)){
|
||||||
cat("########## Convertion variable `date` to POSIXct ##########", "\n")
|
cat("########## Convertion variable `date` to POSIXct ##########", "\n")
|
||||||
@ -24,11 +22,11 @@ create_eventlogs <- function(data, xmlfiles, xmlpath) {
|
|||||||
dat <- subset(data, !(data$event %in% c("Start Application",
|
dat <- subset(data, !(data$event %in% c("Start Application",
|
||||||
"Show Application")))
|
"Show Application")))
|
||||||
|
|
||||||
# Create glossar dictionary ##############################################
|
|
||||||
cat("\n########## Creating glossar dictionary ##########", "\n")
|
|
||||||
artworks <- unique(stats::na.omit(dat$artwork))
|
artworks <- unique(stats::na.omit(dat$artwork))
|
||||||
|
|
||||||
|
# Create glossar dictionary ##############################################
|
||||||
if ("glossar" %in% artworks) {
|
if ("glossar" %in% artworks) {
|
||||||
|
cat("\n########## Creating glossar dictionary ##########", "\n")
|
||||||
artworks <- artworks[artworks != "glossar"]
|
artworks <- artworks[artworks != "glossar"]
|
||||||
glossar_files <- unique(subset(dat, dat$artwork == "glossar")$popup)
|
glossar_files <- unique(subset(dat, dat$artwork == "glossar")$popup)
|
||||||
glossar_dict <- create_glossardict(artworks, glossar_files, xmlpath = xmlpath)
|
glossar_dict <- create_glossardict(artworks, glossar_files, xmlpath = xmlpath)
|
||||||
@ -82,16 +80,7 @@ create_eventlogs <- function(data, xmlfiles, xmlpath) {
|
|||||||
# Add trace for move events ##############################################
|
# Add trace for move events ##############################################
|
||||||
cat("\n\n########## Adding trace variable for move events... ##########", "\n")
|
cat("\n\n########## Adding trace variable for move events... ##########", "\n")
|
||||||
dat4 <- add_trace_moves(dat3)
|
dat4 <- add_trace_moves(dat3)
|
||||||
|
dat4
|
||||||
# Add topics: file names and topics ######################################
|
|
||||||
cat("\n########## Adding information about topics... ##########", "\n\n")
|
|
||||||
# remove artworks without XML information
|
|
||||||
#artworks <- artworks[!artworks %in% c("504", "505")]
|
|
||||||
# TODO: This is hardcoded! Remove it!
|
|
||||||
topics <- extract_topics(artworks, xmlfiles = xmlfiles, xmlpath = xmlpath)
|
|
||||||
|
|
||||||
dat5 <- add_topic(dat4, topics = topics)
|
|
||||||
dat5
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
54
R/extract_artworks.R
Normal file
54
R/extract_artworks.R
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
#' Creating data frame with information about artworks
|
||||||
|
#'
|
||||||
|
#' Information about artowrks are extracted from XML files and written to a
|
||||||
|
#' data frame that contains `artist`, `title`, `misc`, and `description`.
|
||||||
|
#'
|
||||||
|
#' @param artworks A character vector with names of the artworks. Needs to
|
||||||
|
#' correspond to the folder names which contain the XML files.
|
||||||
|
#' @param xmlfiles Vector of names of index files, often something like
|
||||||
|
#' `<artwork>.xml`. Need to be in the same order as artworks!
|
||||||
|
#' @param xmlpath Path to folder where XML definitions of artworks live.
|
||||||
|
#' @return Data frame.
|
||||||
|
#' @export
|
||||||
|
#' @examples
|
||||||
|
#' # tbd
|
||||||
|
extract_artworks <- function(artworks, xmlfiles, xmlpath) {
|
||||||
|
out <- NULL
|
||||||
|
i <- 1
|
||||||
|
|
||||||
|
for (artwork in artworks) {
|
||||||
|
|
||||||
|
if (length(xmlfiles) == 1) {
|
||||||
|
index_file <- xmlfiles
|
||||||
|
} else {
|
||||||
|
index_file <- xmlfiles[i]
|
||||||
|
}
|
||||||
|
|
||||||
|
index <- paste(xmlpath, artwork, index_file, sep = "/")
|
||||||
|
varnames <- c("artist", "title", "misc", "description")
|
||||||
|
xmllist <- XML::xmlToList(index)$header[varnames]
|
||||||
|
|
||||||
|
if (any(sapply(xmllist, is.null))) {# necessary for missing entries
|
||||||
|
names(xmllist) <- varnames
|
||||||
|
xmllist[which(sapply(xmllist, is.null))] <- NA
|
||||||
|
}
|
||||||
|
# remove German quotes
|
||||||
|
xmllist <- lapply(xmllist, function(x) gsub("\u201e|\u201c", "", x))
|
||||||
|
# remove HTML tags
|
||||||
|
xmllist <- lapply(xmllist, function(x) gsub("<br/>", " ", x))
|
||||||
|
xmldat <- as.data.frame(xmllist)
|
||||||
|
xmldat$artwork <- artwork
|
||||||
|
# trim white space from strings
|
||||||
|
xmldat$artist <- trimws(xmldat$artist)
|
||||||
|
xmldat$title <- trimws(xmldat$title)
|
||||||
|
xmldat$misc <- trimws(xmldat$misc)
|
||||||
|
xmldat$description <- trimws(xmldat$description)
|
||||||
|
out <- rbind(out, xmldat)
|
||||||
|
i <- i + 1
|
||||||
|
}
|
||||||
|
out
|
||||||
|
}
|
||||||
|
|
||||||
|
# TODO: Check if artworks all artworks have a folder, catch it and throw
|
||||||
|
# warning
|
||||||
|
|
92
R/extract_topics.R
Normal file
92
R/extract_topics.R
Normal file
@ -0,0 +1,92 @@
|
|||||||
|
#' Creating data frame with artworks and topics
|
||||||
|
#'
|
||||||
|
#' Topics are extracted from XML files and written to a data frame that
|
||||||
|
#' shows which artworks belong to which topics.
|
||||||
|
#'
|
||||||
|
#' @param artworks A character vector with names of the artworks. Needs to
|
||||||
|
#' correspond to the folder names which contain the XML files.
|
||||||
|
#' @param xmlfiles Vector of names of index files, often something like
|
||||||
|
#' `<artwork>.xml`. Need to be in the same order as artworks!
|
||||||
|
#' @param xmlpath Path to folder where XML definitions of artworks live.
|
||||||
|
#' @return Data frame.
|
||||||
|
#' @export
|
||||||
|
#' @examples
|
||||||
|
#' # tbd
|
||||||
|
extract_topics <- function(artworks, xmlfiles, xmlpath) {
|
||||||
|
|
||||||
|
out <- NULL
|
||||||
|
i <- 1
|
||||||
|
|
||||||
|
for (artwork in artworks) {
|
||||||
|
|
||||||
|
index_file <- paste0(xmlpath, artwork, "/", xmlfiles[i])
|
||||||
|
suppressWarnings(
|
||||||
|
fnames <- gsub("^<card src=.*/(.*)./>$", "\\1",
|
||||||
|
grep("^<card src=", trimws(readLines(index_file)),
|
||||||
|
value = TRUE))
|
||||||
|
)
|
||||||
|
|
||||||
|
topic <- NULL
|
||||||
|
for (fname in fnames) {
|
||||||
|
suppressWarnings(
|
||||||
|
topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
|
||||||
|
grep("^<card type=",
|
||||||
|
trimws(readLines(paste(xmlpath, artwork, fname, sep = "/"))),
|
||||||
|
value = T)))
|
||||||
|
)
|
||||||
|
}
|
||||||
|
out <- rbind(out, data.frame(artwork, file_name = fnames, topic))
|
||||||
|
i <- i + 1
|
||||||
|
}
|
||||||
|
|
||||||
|
out <- out[order(out$artwork), ]
|
||||||
|
rownames(out) <- NULL
|
||||||
|
|
||||||
|
out$index <- unlist(lapply(table(out$artwork), seq_len))
|
||||||
|
out
|
||||||
|
}
|
||||||
|
|
||||||
|
###########################################################################
|
||||||
|
|
||||||
|
# Add topics: file names and topics
|
||||||
|
|
||||||
|
add_topic <- function(data, topics) {
|
||||||
|
|
||||||
|
artworks <- unique(data$artwork)
|
||||||
|
tab_art <- lapply(artworks,
|
||||||
|
function(x) names(table(data$topicNumber[data$artwork == x])))
|
||||||
|
names(tab_art) <- artworks
|
||||||
|
|
||||||
|
tab_index <- lapply(tab_art, seq_along)
|
||||||
|
|
||||||
|
dat_split <- split(data, ~ artwork)
|
||||||
|
|
||||||
|
set_label <- function(x) {
|
||||||
|
artwork <- unique(x$artwork)
|
||||||
|
x$topicIndex <- factor(x$topicNumber, labels = tab_index[[artwork]])
|
||||||
|
x
|
||||||
|
}
|
||||||
|
|
||||||
|
dat_label <- lapply(dat_split, set_label)
|
||||||
|
|
||||||
|
set_topic <- function(x) {
|
||||||
|
artwork <- unique(x$artwork)
|
||||||
|
labels_file <- topics[topics$artwork == artwork,
|
||||||
|
"file_name"][as.numeric(levels(x$topicIndex))]
|
||||||
|
x$topicFile <- as.character(factor(x$topicIndex, labels = labels_file))
|
||||||
|
labels_topic <- topics[topics$artwork == artwork,
|
||||||
|
"topic"][as.numeric(levels(x$topicIndex))]
|
||||||
|
x$topic <- as.character(factor(x$topicIndex, labels = labels_topic))
|
||||||
|
x
|
||||||
|
}
|
||||||
|
|
||||||
|
dat_topic <- lapply(dat_label, set_topic)
|
||||||
|
|
||||||
|
#out <- do.call(rbind, dat_topic)
|
||||||
|
out <- dplyr::bind_rows(dat_topic)
|
||||||
|
out$topicIndex <- as.numeric(out$topicIndex)
|
||||||
|
out <- out[order(out$fileId, out$date.start, out$timeMs.start), ]
|
||||||
|
rownames(out) <- NULL
|
||||||
|
out
|
||||||
|
}
|
||||||
|
|
214
R/helper.R
214
R/helper.R
@ -1,214 +0,0 @@
|
|||||||
###########################################################################
|
|
||||||
|
|
||||||
# Add case variable
|
|
||||||
|
|
||||||
add_case <- function(data, cutoff = 20) {
|
|
||||||
# TODO: What is the best choice for the cutoff here?
|
|
||||||
|
|
||||||
data$timediff <- as.numeric(diff(c(data$date.start[1], data$date.start)))
|
|
||||||
data$case <- NA
|
|
||||||
j <- 1
|
|
||||||
pb <- utils::txtProgressBar(min = 0, max = nrow(data), initial = NA, style = 3)
|
|
||||||
|
|
||||||
for (i in seq_len(nrow(data))) {
|
|
||||||
if (data$timediff[i] <= cutoff) {
|
|
||||||
data$case[i] <- j
|
|
||||||
} else {
|
|
||||||
j <- j + 1
|
|
||||||
data$case[i] <- j
|
|
||||||
}
|
|
||||||
utils::setTxtProgressBar(pb, i)
|
|
||||||
}
|
|
||||||
data$timediff <- NULL
|
|
||||||
data
|
|
||||||
}
|
|
||||||
|
|
||||||
###########################################################################
|
|
||||||
|
|
||||||
# Add trace for moves
|
|
||||||
|
|
||||||
add_trace_moves <- function(data) {
|
|
||||||
|
|
||||||
pbapply::pboptions(style = 3, char = "=")
|
|
||||||
|
|
||||||
trace_max <- max(data$trace, na.rm = TRUE)
|
|
||||||
|
|
||||||
#subdata_art <- split(data, ~ artwork)
|
|
||||||
subdata_case <- split(data, ~ case)
|
|
||||||
|
|
||||||
#subdata_list <- split(data, ~ artwork + case)
|
|
||||||
# --> does not work with complete data set
|
|
||||||
cat("Splitting data...", "\n")
|
|
||||||
subdata_list <- pbapply::pblapply(subdata_case, split, f = ~artwork)
|
|
||||||
subdata_list <- unlist(subdata_list, recursive = FALSE)
|
|
||||||
|
|
||||||
cat("Adding trace...", "\n")
|
|
||||||
subdata_trace <- pbapply::pblapply(subdata_list,
|
|
||||||
function(x) {
|
|
||||||
trace_max <<- trace_max + 1
|
|
||||||
add_trace_subdata(x, max_trace = trace_max)
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
out <- dplyr::bind_rows(subdata_trace)
|
|
||||||
out <- out[order(out$fileId, out$date.start, out$timeMs.start), ]
|
|
||||||
rownames(out) <- NULL
|
|
||||||
|
|
||||||
# Make trace a consecutive number
|
|
||||||
out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace)))
|
|
||||||
out
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
add_trace_subdata <- function(subdata, max_trace) {
|
|
||||||
|
|
||||||
if (nrow(subdata) != 0) {
|
|
||||||
|
|
||||||
if (length(stats::na.omit(unique(subdata$trace))) == 1) {
|
|
||||||
subdata[subdata$event == "move", "trace"] <- stats::na.omit(unique(subdata$trace))
|
|
||||||
} else if (length(stats::na.omit(unique(subdata$trace))) > 1) {
|
|
||||||
for (i in 1:nrow(subdata)) {
|
|
||||||
if (subdata$event[i] == "move") {
|
|
||||||
if (i == 1) {
|
|
||||||
subdata$trace[i] <- stats::na.omit(unique(subdata$trace))[1]
|
|
||||||
} else {
|
|
||||||
subdata$trace[i] <- subdata$trace[i - 1]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (all(is.na(subdata$trace))) {
|
|
||||||
for (i in 1:nrow(subdata)) {
|
|
||||||
subdata$trace[i] <- max_trace
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} else {
|
|
||||||
warning("subdata has nrow = 0")
|
|
||||||
}
|
|
||||||
subdata
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
###########################################################################
|
|
||||||
|
|
||||||
# Create data frame with file names and topics for each artwork
|
|
||||||
|
|
||||||
extract_topics <- function(artworks, xmlfiles, xmlpath) {
|
|
||||||
|
|
||||||
out <- NULL
|
|
||||||
i <- 1
|
|
||||||
|
|
||||||
for (artwork in artworks) {
|
|
||||||
|
|
||||||
index_file <- paste0(xmlpath, artwork, "/", xmlfiles[i])
|
|
||||||
suppressWarnings(
|
|
||||||
fnames <- gsub("^<card src=.*/(.*)./>$", "\\1",
|
|
||||||
grep("^<card src=", trimws(readLines(index_file)),
|
|
||||||
value = TRUE))
|
|
||||||
)
|
|
||||||
|
|
||||||
topic <- NULL
|
|
||||||
for (fname in fnames) {
|
|
||||||
suppressWarnings(
|
|
||||||
topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
|
|
||||||
grep("^<card type=",
|
|
||||||
trimws(readLines(paste(xmlpath, artwork, fname, sep = "/"))),
|
|
||||||
value = T)))
|
|
||||||
)
|
|
||||||
}
|
|
||||||
out <- rbind(out, data.frame(artwork, file_name = fnames, topic))
|
|
||||||
i <- i + 1
|
|
||||||
}
|
|
||||||
|
|
||||||
out <- out[order(out$artwork), ]
|
|
||||||
rownames(out) <- NULL
|
|
||||||
|
|
||||||
out$index <- unlist(lapply(table(out$artwork), seq_len))
|
|
||||||
out
|
|
||||||
}
|
|
||||||
|
|
||||||
###########################################################################
|
|
||||||
|
|
||||||
# Add topics: file names and topics
|
|
||||||
|
|
||||||
add_topic <- function(data, topics) {
|
|
||||||
|
|
||||||
artworks <- unique(data$artwork)
|
|
||||||
tab_art <- lapply(artworks,
|
|
||||||
function(x) names(table(data$topicNumber[data$artwork == x])))
|
|
||||||
names(tab_art) <- artworks
|
|
||||||
|
|
||||||
tab_index <- lapply(tab_art, seq_along)
|
|
||||||
|
|
||||||
dat_split <- split(data, ~ artwork)
|
|
||||||
|
|
||||||
set_label <- function(x) {
|
|
||||||
artwork <- unique(x$artwork)
|
|
||||||
x$topicIndex <- factor(x$topicNumber, labels = tab_index[[artwork]])
|
|
||||||
x
|
|
||||||
}
|
|
||||||
|
|
||||||
dat_label <- lapply(dat_split, set_label)
|
|
||||||
|
|
||||||
set_topic <- function(x) {
|
|
||||||
artwork <- unique(x$artwork)
|
|
||||||
labels_file <- topics[topics$artwork == artwork,
|
|
||||||
"file_name"][as.numeric(levels(x$topicIndex))]
|
|
||||||
x$topicFile <- as.character(factor(x$topicIndex, labels = labels_file))
|
|
||||||
labels_topic <- topics[topics$artwork == artwork,
|
|
||||||
"topic"][as.numeric(levels(x$topicIndex))]
|
|
||||||
x$topic <- as.character(factor(x$topicIndex, labels = labels_topic))
|
|
||||||
x
|
|
||||||
}
|
|
||||||
|
|
||||||
dat_topic <- lapply(dat_label, set_topic)
|
|
||||||
|
|
||||||
#out <- do.call(rbind, dat_topic)
|
|
||||||
out <- dplyr::bind_rows(dat_topic)
|
|
||||||
out$topicIndex <- as.numeric(out$topicIndex)
|
|
||||||
out <- out[order(out$fileId, out$date.start, out$timeMs.start), ]
|
|
||||||
rownames(out) <- NULL
|
|
||||||
out
|
|
||||||
}
|
|
||||||
|
|
||||||
###########################################################################
|
|
||||||
|
|
||||||
# Create data frame with information on artworks
|
|
||||||
|
|
||||||
extract_artworks <- function(artworks, files, xmlpath) {
|
|
||||||
out <- NULL
|
|
||||||
i <- 1
|
|
||||||
|
|
||||||
for (artwork in artworks) {
|
|
||||||
|
|
||||||
if (length(files) == 1) {
|
|
||||||
index_file <- files
|
|
||||||
} else {
|
|
||||||
index_file <- files[i]
|
|
||||||
}
|
|
||||||
|
|
||||||
index <- paste(xmlpath, artwork, index_file, sep = "/")
|
|
||||||
varnames <- c("artist", "title", "misc", "description")
|
|
||||||
xmllist <- XML::xmlToList(index)$header[varnames]
|
|
||||||
|
|
||||||
if (any(sapply(xmllist, is.null))) {# necessary for missing entries
|
|
||||||
names(xmllist) <- varnames
|
|
||||||
xmllist[which(sapply(xmllist, is.null))] <- NA
|
|
||||||
}
|
|
||||||
# remove German quotes
|
|
||||||
xmllist <- lapply(xmllist, function(x) gsub("\u201e|\u201c", "", x))
|
|
||||||
# remove HTML tags
|
|
||||||
xmllist <- lapply(xmllist, function(x) gsub("<br/>", " ", x))
|
|
||||||
xmldat <- as.data.frame(xmllist)
|
|
||||||
xmldat$artwork <- artwork
|
|
||||||
# trim white space from strings
|
|
||||||
xmldat$artist <- trimws(xmldat$artist)
|
|
||||||
xmldat$title <- trimws(xmldat$title)
|
|
||||||
xmldat$misc <- trimws(xmldat$misc)
|
|
||||||
xmldat$description <- trimws(xmldat$description)
|
|
||||||
out <- rbind(out, xmldat)
|
|
||||||
i <- i + 1
|
|
||||||
}
|
|
||||||
out
|
|
||||||
}
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user