Removed extract_topics() from create_eventlogs(); removed helper.R and put everything in separate files
This commit is contained in:
parent
85d3cfc2fa
commit
0e911bed3f
25
R/add_case.R
Normal file
25
R/add_case.R
Normal file
@ -0,0 +1,25 @@
|
||||
###########################################################################
|
||||
|
||||
# Add case variable
|
||||
|
||||
add_case <- function(data, cutoff = 20) {
|
||||
# TODO: What is the best choice for the cutoff here?
|
||||
|
||||
data$timediff <- as.numeric(diff(c(data$date.start[1], data$date.start)))
|
||||
data$case <- NA
|
||||
j <- 1
|
||||
pb <- utils::txtProgressBar(min = 0, max = nrow(data), initial = NA, style = 3)
|
||||
|
||||
for (i in seq_len(nrow(data))) {
|
||||
if (data$timediff[i] <= cutoff) {
|
||||
data$case[i] <- j
|
||||
} else {
|
||||
j <- j + 1
|
||||
data$case[i] <- j
|
||||
}
|
||||
utils::setTxtProgressBar(pb, i)
|
||||
}
|
||||
data$timediff <- NULL
|
||||
data
|
||||
}
|
||||
|
0
R/add_metadata.R
Normal file
0
R/add_metadata.R
Normal file
@ -107,3 +107,68 @@ add_trace <- function(data, glossar_dict) {
|
||||
out
|
||||
}
|
||||
|
||||
###########################################################################
|
||||
|
||||
# Add trace for moves
|
||||
|
||||
add_trace_moves <- function(data) {
|
||||
|
||||
pbapply::pboptions(style = 3, char = "=")
|
||||
|
||||
trace_max <- max(data$trace, na.rm = TRUE)
|
||||
|
||||
#subdata_art <- split(data, ~ artwork)
|
||||
subdata_case <- split(data, ~ case)
|
||||
|
||||
#subdata_list <- split(data, ~ artwork + case)
|
||||
# --> does not work with complete data set
|
||||
cat("Splitting data...", "\n")
|
||||
subdata_list <- pbapply::pblapply(subdata_case, split, f = ~artwork)
|
||||
subdata_list <- unlist(subdata_list, recursive = FALSE)
|
||||
|
||||
cat("Adding trace...", "\n")
|
||||
subdata_trace <- pbapply::pblapply(subdata_list,
|
||||
function(x) {
|
||||
trace_max <<- trace_max + 1
|
||||
add_trace_subdata(x, max_trace = trace_max)
|
||||
}
|
||||
)
|
||||
|
||||
out <- dplyr::bind_rows(subdata_trace)
|
||||
out <- out[order(out$fileId, out$date.start, out$timeMs.start), ]
|
||||
rownames(out) <- NULL
|
||||
|
||||
# Make trace a consecutive number
|
||||
out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace)))
|
||||
out
|
||||
}
|
||||
|
||||
|
||||
add_trace_subdata <- function(subdata, max_trace) {
|
||||
|
||||
if (nrow(subdata) != 0) {
|
||||
|
||||
if (length(stats::na.omit(unique(subdata$trace))) == 1) {
|
||||
subdata[subdata$event == "move", "trace"] <- stats::na.omit(unique(subdata$trace))
|
||||
} else if (length(stats::na.omit(unique(subdata$trace))) > 1) {
|
||||
for (i in 1:nrow(subdata)) {
|
||||
if (subdata$event[i] == "move") {
|
||||
if (i == 1) {
|
||||
subdata$trace[i] <- stats::na.omit(unique(subdata$trace))[1]
|
||||
} else {
|
||||
subdata$trace[i] <- subdata$trace[i - 1]
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (all(is.na(subdata$trace))) {
|
||||
for (i in 1:nrow(subdata)) {
|
||||
subdata$trace[i] <- max_trace
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
warning("subdata has nrow = 0")
|
||||
}
|
||||
subdata
|
||||
}
|
||||
|
||||
|
@ -9,14 +9,14 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
|
||||
actions <- c("Transform start", "Transform stop")
|
||||
idvar <- c("fileId", "folder", "eventId", "artwork", "glossar")
|
||||
drop <- c("popup", "topicNumber", "trace", "event")
|
||||
ncol <- 16
|
||||
ncol <- 17
|
||||
|
||||
},
|
||||
"flipCard" = {
|
||||
actions <- c("Show Info", "Show Front")
|
||||
idvar <- c("fileId", "folder", "trace", "artwork", "glossar")
|
||||
drop <- c("popup", "topicNumber", "eventId", "event")
|
||||
ncol <- 16
|
||||
ncol <- 17
|
||||
|
||||
},
|
||||
"openTopic" = {
|
||||
@ -24,7 +24,7 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
|
||||
idvar <- c("fileId", "folder", "eventId", "trace", "glossar",
|
||||
"artwork", "topicNumber")
|
||||
drop <- c("popup", "event")
|
||||
ncol <- 18
|
||||
ncol <- 19
|
||||
|
||||
},
|
||||
"openPopup" = {
|
||||
@ -32,7 +32,7 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
|
||||
idvar <- c("fileId", "folder", "eventId", "trace", "glossar",
|
||||
"artwork", "popup")
|
||||
drop <- c("topicNumber", "event")
|
||||
ncol <- 18
|
||||
ncol <- 19
|
||||
# TODO: Should topicNumber maybe also be filled in for "openPopup"?
|
||||
|
||||
}
|
||||
|
@ -5,14 +5,12 @@
|
||||
#'
|
||||
#' @param data Data frame of raw log files created with `parse_logfiles()`.
|
||||
#' See `?parse_logfiles` for more details.
|
||||
#' @param xmlfiles Vector of names of index files, often something like
|
||||
#' `<artwork>.xml`.
|
||||
#' @param xmlpath Path to folder where XML definitions of artworks live.
|
||||
#' @return Data frame.
|
||||
#' @export
|
||||
#' @examples
|
||||
#' # tbd
|
||||
create_eventlogs <- function(data, xmlfiles, xmlpath) {
|
||||
create_eventlogs <- function(data, xmlpath) {
|
||||
|
||||
if (!lubridate::is.POSIXt(data$date)){
|
||||
cat("########## Convertion variable `date` to POSIXct ##########", "\n")
|
||||
@ -24,11 +22,11 @@ create_eventlogs <- function(data, xmlfiles, xmlpath) {
|
||||
dat <- subset(data, !(data$event %in% c("Start Application",
|
||||
"Show Application")))
|
||||
|
||||
# Create glossar dictionary ##############################################
|
||||
cat("\n########## Creating glossar dictionary ##########", "\n")
|
||||
artworks <- unique(stats::na.omit(dat$artwork))
|
||||
|
||||
# Create glossar dictionary ##############################################
|
||||
if ("glossar" %in% artworks) {
|
||||
cat("\n########## Creating glossar dictionary ##########", "\n")
|
||||
artworks <- artworks[artworks != "glossar"]
|
||||
glossar_files <- unique(subset(dat, dat$artwork == "glossar")$popup)
|
||||
glossar_dict <- create_glossardict(artworks, glossar_files, xmlpath = xmlpath)
|
||||
@ -82,16 +80,7 @@ create_eventlogs <- function(data, xmlfiles, xmlpath) {
|
||||
# Add trace for move events ##############################################
|
||||
cat("\n\n########## Adding trace variable for move events... ##########", "\n")
|
||||
dat4 <- add_trace_moves(dat3)
|
||||
|
||||
# Add topics: file names and topics ######################################
|
||||
cat("\n########## Adding information about topics... ##########", "\n\n")
|
||||
# remove artworks without XML information
|
||||
#artworks <- artworks[!artworks %in% c("504", "505")]
|
||||
# TODO: This is hardcoded! Remove it!
|
||||
topics <- extract_topics(artworks, xmlfiles = xmlfiles, xmlpath = xmlpath)
|
||||
|
||||
dat5 <- add_topic(dat4, topics = topics)
|
||||
dat5
|
||||
dat4
|
||||
}
|
||||
|
||||
|
||||
|
54
R/extract_artworks.R
Normal file
54
R/extract_artworks.R
Normal file
@ -0,0 +1,54 @@
|
||||
#' Creating data frame with information about artworks
|
||||
#'
|
||||
#' Information about artowrks are extracted from XML files and written to a
|
||||
#' data frame that contains `artist`, `title`, `misc`, and `description`.
|
||||
#'
|
||||
#' @param artworks A character vector with names of the artworks. Needs to
|
||||
#' correspond to the folder names which contain the XML files.
|
||||
#' @param xmlfiles Vector of names of index files, often something like
|
||||
#' `<artwork>.xml`. Need to be in the same order as artworks!
|
||||
#' @param xmlpath Path to folder where XML definitions of artworks live.
|
||||
#' @return Data frame.
|
||||
#' @export
|
||||
#' @examples
|
||||
#' # tbd
|
||||
extract_artworks <- function(artworks, xmlfiles, xmlpath) {
|
||||
out <- NULL
|
||||
i <- 1
|
||||
|
||||
for (artwork in artworks) {
|
||||
|
||||
if (length(xmlfiles) == 1) {
|
||||
index_file <- xmlfiles
|
||||
} else {
|
||||
index_file <- xmlfiles[i]
|
||||
}
|
||||
|
||||
index <- paste(xmlpath, artwork, index_file, sep = "/")
|
||||
varnames <- c("artist", "title", "misc", "description")
|
||||
xmllist <- XML::xmlToList(index)$header[varnames]
|
||||
|
||||
if (any(sapply(xmllist, is.null))) {# necessary for missing entries
|
||||
names(xmllist) <- varnames
|
||||
xmllist[which(sapply(xmllist, is.null))] <- NA
|
||||
}
|
||||
# remove German quotes
|
||||
xmllist <- lapply(xmllist, function(x) gsub("\u201e|\u201c", "", x))
|
||||
# remove HTML tags
|
||||
xmllist <- lapply(xmllist, function(x) gsub("<br/>", " ", x))
|
||||
xmldat <- as.data.frame(xmllist)
|
||||
xmldat$artwork <- artwork
|
||||
# trim white space from strings
|
||||
xmldat$artist <- trimws(xmldat$artist)
|
||||
xmldat$title <- trimws(xmldat$title)
|
||||
xmldat$misc <- trimws(xmldat$misc)
|
||||
xmldat$description <- trimws(xmldat$description)
|
||||
out <- rbind(out, xmldat)
|
||||
i <- i + 1
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
# TODO: Check if artworks all artworks have a folder, catch it and throw
|
||||
# warning
|
||||
|
92
R/extract_topics.R
Normal file
92
R/extract_topics.R
Normal file
@ -0,0 +1,92 @@
|
||||
#' Creating data frame with artworks and topics
|
||||
#'
|
||||
#' Topics are extracted from XML files and written to a data frame that
|
||||
#' shows which artworks belong to which topics.
|
||||
#'
|
||||
#' @param artworks A character vector with names of the artworks. Needs to
|
||||
#' correspond to the folder names which contain the XML files.
|
||||
#' @param xmlfiles Vector of names of index files, often something like
|
||||
#' `<artwork>.xml`. Need to be in the same order as artworks!
|
||||
#' @param xmlpath Path to folder where XML definitions of artworks live.
|
||||
#' @return Data frame.
|
||||
#' @export
|
||||
#' @examples
|
||||
#' # tbd
|
||||
extract_topics <- function(artworks, xmlfiles, xmlpath) {
|
||||
|
||||
out <- NULL
|
||||
i <- 1
|
||||
|
||||
for (artwork in artworks) {
|
||||
|
||||
index_file <- paste0(xmlpath, artwork, "/", xmlfiles[i])
|
||||
suppressWarnings(
|
||||
fnames <- gsub("^<card src=.*/(.*)./>$", "\\1",
|
||||
grep("^<card src=", trimws(readLines(index_file)),
|
||||
value = TRUE))
|
||||
)
|
||||
|
||||
topic <- NULL
|
||||
for (fname in fnames) {
|
||||
suppressWarnings(
|
||||
topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
|
||||
grep("^<card type=",
|
||||
trimws(readLines(paste(xmlpath, artwork, fname, sep = "/"))),
|
||||
value = T)))
|
||||
)
|
||||
}
|
||||
out <- rbind(out, data.frame(artwork, file_name = fnames, topic))
|
||||
i <- i + 1
|
||||
}
|
||||
|
||||
out <- out[order(out$artwork), ]
|
||||
rownames(out) <- NULL
|
||||
|
||||
out$index <- unlist(lapply(table(out$artwork), seq_len))
|
||||
out
|
||||
}
|
||||
|
||||
###########################################################################
|
||||
|
||||
# Add topics: file names and topics
|
||||
|
||||
add_topic <- function(data, topics) {
|
||||
|
||||
artworks <- unique(data$artwork)
|
||||
tab_art <- lapply(artworks,
|
||||
function(x) names(table(data$topicNumber[data$artwork == x])))
|
||||
names(tab_art) <- artworks
|
||||
|
||||
tab_index <- lapply(tab_art, seq_along)
|
||||
|
||||
dat_split <- split(data, ~ artwork)
|
||||
|
||||
set_label <- function(x) {
|
||||
artwork <- unique(x$artwork)
|
||||
x$topicIndex <- factor(x$topicNumber, labels = tab_index[[artwork]])
|
||||
x
|
||||
}
|
||||
|
||||
dat_label <- lapply(dat_split, set_label)
|
||||
|
||||
set_topic <- function(x) {
|
||||
artwork <- unique(x$artwork)
|
||||
labels_file <- topics[topics$artwork == artwork,
|
||||
"file_name"][as.numeric(levels(x$topicIndex))]
|
||||
x$topicFile <- as.character(factor(x$topicIndex, labels = labels_file))
|
||||
labels_topic <- topics[topics$artwork == artwork,
|
||||
"topic"][as.numeric(levels(x$topicIndex))]
|
||||
x$topic <- as.character(factor(x$topicIndex, labels = labels_topic))
|
||||
x
|
||||
}
|
||||
|
||||
dat_topic <- lapply(dat_label, set_topic)
|
||||
|
||||
#out <- do.call(rbind, dat_topic)
|
||||
out <- dplyr::bind_rows(dat_topic)
|
||||
out$topicIndex <- as.numeric(out$topicIndex)
|
||||
out <- out[order(out$fileId, out$date.start, out$timeMs.start), ]
|
||||
rownames(out) <- NULL
|
||||
out
|
||||
}
|
||||
|
214
R/helper.R
214
R/helper.R
@ -1,214 +0,0 @@
|
||||
###########################################################################
|
||||
|
||||
# Add case variable
|
||||
|
||||
add_case <- function(data, cutoff = 20) {
|
||||
# TODO: What is the best choice for the cutoff here?
|
||||
|
||||
data$timediff <- as.numeric(diff(c(data$date.start[1], data$date.start)))
|
||||
data$case <- NA
|
||||
j <- 1
|
||||
pb <- utils::txtProgressBar(min = 0, max = nrow(data), initial = NA, style = 3)
|
||||
|
||||
for (i in seq_len(nrow(data))) {
|
||||
if (data$timediff[i] <= cutoff) {
|
||||
data$case[i] <- j
|
||||
} else {
|
||||
j <- j + 1
|
||||
data$case[i] <- j
|
||||
}
|
||||
utils::setTxtProgressBar(pb, i)
|
||||
}
|
||||
data$timediff <- NULL
|
||||
data
|
||||
}
|
||||
|
||||
###########################################################################
|
||||
|
||||
# Add trace for moves
|
||||
|
||||
add_trace_moves <- function(data) {
|
||||
|
||||
pbapply::pboptions(style = 3, char = "=")
|
||||
|
||||
trace_max <- max(data$trace, na.rm = TRUE)
|
||||
|
||||
#subdata_art <- split(data, ~ artwork)
|
||||
subdata_case <- split(data, ~ case)
|
||||
|
||||
#subdata_list <- split(data, ~ artwork + case)
|
||||
# --> does not work with complete data set
|
||||
cat("Splitting data...", "\n")
|
||||
subdata_list <- pbapply::pblapply(subdata_case, split, f = ~artwork)
|
||||
subdata_list <- unlist(subdata_list, recursive = FALSE)
|
||||
|
||||
cat("Adding trace...", "\n")
|
||||
subdata_trace <- pbapply::pblapply(subdata_list,
|
||||
function(x) {
|
||||
trace_max <<- trace_max + 1
|
||||
add_trace_subdata(x, max_trace = trace_max)
|
||||
}
|
||||
)
|
||||
|
||||
out <- dplyr::bind_rows(subdata_trace)
|
||||
out <- out[order(out$fileId, out$date.start, out$timeMs.start), ]
|
||||
rownames(out) <- NULL
|
||||
|
||||
# Make trace a consecutive number
|
||||
out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace)))
|
||||
out
|
||||
}
|
||||
|
||||
|
||||
add_trace_subdata <- function(subdata, max_trace) {
|
||||
|
||||
if (nrow(subdata) != 0) {
|
||||
|
||||
if (length(stats::na.omit(unique(subdata$trace))) == 1) {
|
||||
subdata[subdata$event == "move", "trace"] <- stats::na.omit(unique(subdata$trace))
|
||||
} else if (length(stats::na.omit(unique(subdata$trace))) > 1) {
|
||||
for (i in 1:nrow(subdata)) {
|
||||
if (subdata$event[i] == "move") {
|
||||
if (i == 1) {
|
||||
subdata$trace[i] <- stats::na.omit(unique(subdata$trace))[1]
|
||||
} else {
|
||||
subdata$trace[i] <- subdata$trace[i - 1]
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (all(is.na(subdata$trace))) {
|
||||
for (i in 1:nrow(subdata)) {
|
||||
subdata$trace[i] <- max_trace
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
warning("subdata has nrow = 0")
|
||||
}
|
||||
subdata
|
||||
}
|
||||
|
||||
|
||||
###########################################################################
|
||||
|
||||
# Create data frame with file names and topics for each artwork
|
||||
|
||||
extract_topics <- function(artworks, xmlfiles, xmlpath) {
|
||||
|
||||
out <- NULL
|
||||
i <- 1
|
||||
|
||||
for (artwork in artworks) {
|
||||
|
||||
index_file <- paste0(xmlpath, artwork, "/", xmlfiles[i])
|
||||
suppressWarnings(
|
||||
fnames <- gsub("^<card src=.*/(.*)./>$", "\\1",
|
||||
grep("^<card src=", trimws(readLines(index_file)),
|
||||
value = TRUE))
|
||||
)
|
||||
|
||||
topic <- NULL
|
||||
for (fname in fnames) {
|
||||
suppressWarnings(
|
||||
topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
|
||||
grep("^<card type=",
|
||||
trimws(readLines(paste(xmlpath, artwork, fname, sep = "/"))),
|
||||
value = T)))
|
||||
)
|
||||
}
|
||||
out <- rbind(out, data.frame(artwork, file_name = fnames, topic))
|
||||
i <- i + 1
|
||||
}
|
||||
|
||||
out <- out[order(out$artwork), ]
|
||||
rownames(out) <- NULL
|
||||
|
||||
out$index <- unlist(lapply(table(out$artwork), seq_len))
|
||||
out
|
||||
}
|
||||
|
||||
###########################################################################
|
||||
|
||||
# Add topics: file names and topics
|
||||
|
||||
add_topic <- function(data, topics) {
|
||||
|
||||
artworks <- unique(data$artwork)
|
||||
tab_art <- lapply(artworks,
|
||||
function(x) names(table(data$topicNumber[data$artwork == x])))
|
||||
names(tab_art) <- artworks
|
||||
|
||||
tab_index <- lapply(tab_art, seq_along)
|
||||
|
||||
dat_split <- split(data, ~ artwork)
|
||||
|
||||
set_label <- function(x) {
|
||||
artwork <- unique(x$artwork)
|
||||
x$topicIndex <- factor(x$topicNumber, labels = tab_index[[artwork]])
|
||||
x
|
||||
}
|
||||
|
||||
dat_label <- lapply(dat_split, set_label)
|
||||
|
||||
set_topic <- function(x) {
|
||||
artwork <- unique(x$artwork)
|
||||
labels_file <- topics[topics$artwork == artwork,
|
||||
"file_name"][as.numeric(levels(x$topicIndex))]
|
||||
x$topicFile <- as.character(factor(x$topicIndex, labels = labels_file))
|
||||
labels_topic <- topics[topics$artwork == artwork,
|
||||
"topic"][as.numeric(levels(x$topicIndex))]
|
||||
x$topic <- as.character(factor(x$topicIndex, labels = labels_topic))
|
||||
x
|
||||
}
|
||||
|
||||
dat_topic <- lapply(dat_label, set_topic)
|
||||
|
||||
#out <- do.call(rbind, dat_topic)
|
||||
out <- dplyr::bind_rows(dat_topic)
|
||||
out$topicIndex <- as.numeric(out$topicIndex)
|
||||
out <- out[order(out$fileId, out$date.start, out$timeMs.start), ]
|
||||
rownames(out) <- NULL
|
||||
out
|
||||
}
|
||||
|
||||
###########################################################################
|
||||
|
||||
# Create data frame with information on artworks
|
||||
|
||||
extract_artworks <- function(artworks, files, xmlpath) {
|
||||
out <- NULL
|
||||
i <- 1
|
||||
|
||||
for (artwork in artworks) {
|
||||
|
||||
if (length(files) == 1) {
|
||||
index_file <- files
|
||||
} else {
|
||||
index_file <- files[i]
|
||||
}
|
||||
|
||||
index <- paste(xmlpath, artwork, index_file, sep = "/")
|
||||
varnames <- c("artist", "title", "misc", "description")
|
||||
xmllist <- XML::xmlToList(index)$header[varnames]
|
||||
|
||||
if (any(sapply(xmllist, is.null))) {# necessary for missing entries
|
||||
names(xmllist) <- varnames
|
||||
xmllist[which(sapply(xmllist, is.null))] <- NA
|
||||
}
|
||||
# remove German quotes
|
||||
xmllist <- lapply(xmllist, function(x) gsub("\u201e|\u201c", "", x))
|
||||
# remove HTML tags
|
||||
xmllist <- lapply(xmllist, function(x) gsub("<br/>", " ", x))
|
||||
xmldat <- as.data.frame(xmllist)
|
||||
xmldat$artwork <- artwork
|
||||
# trim white space from strings
|
||||
xmldat$artist <- trimws(xmldat$artist)
|
||||
xmldat$title <- trimws(xmldat$title)
|
||||
xmldat$misc <- trimws(xmldat$misc)
|
||||
xmldat$description <- trimws(xmldat$description)
|
||||
out <- rbind(out, xmldat)
|
||||
i <- i + 1
|
||||
}
|
||||
out
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user