mtt_haum/code/functions.R
2023-09-19 15:25:30 +02:00

459 lines
14 KiB
R

###########################################################################
# Add trace variable
add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") {
data$trace <- NA
subdata1 <- data[data$event %in% c("Transform start", "Transform stop"), ]
subdata2 <- data[!data$event %in% c("Transform start", "Transform stop"), ]
last_event <- subdata2$event[1]
artworks <- unique(subdata2$artwork)[unique(subdata2$artwork) != "glossar"]
n <- 1 # count artworks for progress
pb <- txtProgressBar(min = 0, max = nrow(subdata2), initial = NA,
style = 3)
for (artwork in artworks) {
cat("\n\nAdding trace variable for artwork", artwork,
paste0("(", n, "/", length(artworks), ")"), "\n")
for (i in 1:nrow(subdata2)) {
if (last_event == "Show Info" & subdata2$artwork[i] == artwork) {
subdata2$trace[i] <- i
j <- i
} else if (last_event == "Show Front" & subdata2$artwork[i] == artwork) {
subdata2$trace[i] <- j
} else if (!(last_event %in% c("Show Info", "Show Front")) &
subdata2$artwork[i] == artwork) {
subdata2$trace[i] <- j
}
if (i <= nrow(subdata2)) {
last_event <- subdata2$event[i + 1]
}
setTxtProgressBar(pb, i)
}
n <- n + 1
}
# Fix glossar entries (find corresponding artworks and fill in trace)
glossar_files <- unique(subdata2[subdata2$artwork == "glossar", "popup"])
# load lookup table for artworks and glossar files
load(glossar_dict)
lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ]
inside <- glossar_files[glossar_files %in%
lut[sapply(lut$artwork, length) == 1,
"glossar_file"]]
single_art <- unlist(lut[lut$glossar_file %in% inside, "artwork"])
m <- 1
for (file in lut$glossar_file) {
cat("\n\nAdding trace variable for glossar entry", file,
paste0("(", m, "/", length(lut$glossar_file), ")"), "\n")
artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])
for (i in seq_len(nrow(subdata2))) {
if (subdata2$event[i] == "Show Info" |
(subdata2$event[i] == "Artwork/OpenCard" &
subdata2$artwork[i] %in% single_art)) {
current_artwork <- subdata2[i, "artwork"]
j <- i
k <- i
} else {
current_artwork <- current_artwork
}
if (subdata2$event[i] == "Show Front" & subdata2$artwork[i] == current_artwork) {
# make sure artwork has not been closed, yet!
k <- i
}
if (subdata2$artwork[i] == "glossar" &
(current_artwork %in% artwork_list) &
subdata2$popup[i] == file & (j - k == 0)) {
subdata2[i, "trace"] <- subdata2[j, "trace"]
subdata2[i, "artwork"] <- current_artwork
}
setTxtProgressBar(pb, i)
}
m <- m + 1
}
# Exclude not matched glossar entries
cat("\n\nINFORMATION: glossar entries that are not matched will be removed:",
sum(is.na(subdata2[subdata2$glossar == 1, "trace"])), "entries",
#proportions(table(is.na(subdata2[subdata2$glossar == 1, "trace"]))),
fill = TRUE)
subdata2 <- subset(subdata2, !is.na(subdata2$trace))
# REMEMBER: It can never be 100% correct, since it is always possible
# that several cards are open and that they link to the same glossar
# entry
# dat2[14110:14130, ]
# dat2[dat2$glossar == 1, ]
out <- rbind(subdata1, subdata2)
out <- out[order(out$fileId, out$date, out$timeMs), ]
out
}
###########################################################################
close_events <- function(data, event = c("move", "flipCard", "openTopic", "openPopup")) {
event <- match.arg(event)
switch(event,
"move" = {
actions <- c("Transform start", "Transform stop")
idvar <- c("fileId", "eventId", "artwork", "glossar")
drop <- c("popup", "topicNumber", "trace", "event")
ncol <- 16
},
"flipCard" = {
actions <- c("Show Info", "Show Front")
idvar <- c("fileId", "trace", "artwork", "glossar")
drop <- c("popup", "topicNumber", "eventId", "event")
ncol <- 16
},
"openTopic" = {
actions <- c("Artwork/OpenCard", "Artwork/CloseCard")
idvar <- c("fileId", "eventId", "trace", "glossar", "artwork",
"topicNumber")
drop <- c("popup", "event")
ncol <- 18
},
"openPopup" = {
actions <- c("ShowPopup", "HidePopup")
idvar <- c("fileId", "eventId", "trace", "glossar", "artwork", "popup")
drop <- c("topicNumber", "event")
ncol <- 18
# TODO: Should topicNumber maybe also be filled in for "openPopup"?
}
)
subdata <- subset(data, data$event %in% actions)
subdata <- subdata[order(subdata$artwork, subdata$popup, subdata$date, subdata$timeMs), ]
subdata$time <- ifelse(subdata$event == actions[1], "start", "stop")
num_start <- diff(c(0, which(subdata$event == actions[2])))
if (tail(subdata, 1)$time == "start") {
num_start <- c(num_start, 1)
}
subdata$eventId <- rep(seq_along(num_start), num_start)
if (event == "move") {
subdata <- subdata[!duplicated(subdata[, c("event", "eventId")]), ]
id_stop <- which(subdata$event == actions[2])
id_rm_stop <- id_stop[diff(id_stop) == 1]
subdata <- subdata[-(id_rm_stop + 1), ]
}
subdata_split <- split(subdata, ~ fileId)
pbapply::pboptions(style = 3, char = "=")
subdata_split_wide <- pbapply::pblapply(subdata_split, reshape,
direction = "wide",
idvar = idvar,
timevar = "time",
drop = drop)
# suppressWarnings(
# data_wide <- reshape(subdata, direction = "wide",
# idvar = idvar,
# timevar = "time",
# drop = drop)
# )
# remove entries with only start or stop events since they do not have
# all columns
ids <- which(sapply(subdata_split_wide, ncol) != ncol)
if (length(ids) > 0) subdata_split_wide <- subdata_split_wide[-ids]
data_wide <- dplyr::bind_rows(subdata_split_wide)
for (d in drop) data_wide[d] <- NA
data_wide$distance <- NA
data_wide$scaleSize <- NA
data_wide$rotationDegree <- NA
data_wide$event <- event
data_wide$duration <- data_wide$timeMs.stop - data_wide$timeMs.start
if (event == "move") {
data_wide$distance <- apply(
data_wide[, c("x.start", "y.start", "x.stop", "y.stop")], 1,
function(x) dist(matrix(x, 2, 2, byrow = TRUE)))
data_wide$rotationDegree <- data_wide$rotation.stop -
data_wide$rotation.start
data_wide$scaleSize <- data_wide$scale.stop / data_wide$scale.start
# remove moves without any change
move_wide <- data_wide[data_wide$distance != 0 &
data_wide$rotationDegree != 0 &
data_wide$scaleSize != 1, ]
cat(paste("INFORMATION:", nrow(data_wide) - nrow(move_wide),
"lines containing move events were removed since they did",
"\nnot contain any change"), fill = TRUE)
data_wide <- move_wide
}
out <- data_wide[, c("fileId", "event", "artwork", "trace", "glossar",
"date.start", "date.stop", "timeMs.start",
"timeMs.stop", "duration", "topicNumber", "popup",
"x.start", "y.start", "x.stop", "y.stop",
"distance", "scale.start", "scale.stop",
"scaleSize", "rotation.start", "rotation.stop",
"rotationDegree")]
rownames(out) <- NULL
out
}
###########################################################################
# Add case variable
add_case <- function(data, cutoff = 20) {
# TODO: What is the best choice for the cutoff here?
data$timediff <- as.numeric(diff(c(data$date.start[1], data$date.start)))
data$case <- NA
j <- 1
pb <- txtProgressBar(min = 0, max = nrow(data), initial = NA, style = 3)
for (i in seq_len(nrow(data))) {
if (data$timediff[i] <= cutoff) {
data$case[i] <- j
} else {
j <- j + 1
data$case[i] <- j
}
setTxtProgressBar(pb, i)
}
data$timediff <- NULL
data
}
###########################################################################
# Add trace for moves
add_trace_moves <- function(data) {
pbapply::pboptions(style = 3, char = "=")
trace_max <- max(data$trace, na.rm = TRUE)
#subdata_art <- split(data, ~ artwork)
subdata_case <- split(data, ~ case)
#subdata_list <- split(data, ~ artwork + case)
# --> does not work with complete data set
cat("Splitting data...", "\n")
subdata_list <- pbapply::pblapply(subdata_case, split, f = ~artwork)
subdata_list <- unlist(subdata_list, recursive = FALSE)
cat("Adding trace...", "\n")
subdata_trace <- pbapply::pblapply(subdata_list,
function(x) {
trace_max <<- trace_max + 1
add_trace_subdata(x, max_trace = trace_max)
}
)
out <- dplyr::bind_rows(subdata_trace)
out <- out[order(out$fileId.start, out$date.start, out$timeMs.start), ]
rownames(out) <- NULL
# Make trace a consecutive number
out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace)))
out
}
add_trace_subdata <- function(subdata, max_trace) {
if (nrow(subdata) != 0) {
if (length(na.omit(unique(subdata$trace))) == 1) {
subdata[subdata$event == "move", "trace"] <- na.omit(unique(subdata$trace))
} else if (length(na.omit(unique(subdata$trace))) > 1) {
for (i in 1:nrow(subdata)) {
if (subdata$event[i] == "move") {
if (i == 1) {
subdata$trace[i] <- na.omit(unique(subdata$trace))[1]
} else {
subdata$trace[i] <- subdata$trace[i - 1]
}
}
}
} else if (all(is.na(subdata$trace))) {
for (i in 1:nrow(subdata)) {
subdata$trace[i] <- max_trace
}
}
} else {
warning("`subdata` has nrow = 0")
}
subdata
}
###########################################################################
# Create data frame with file names and topics for each artwork
extract_topics <- function(artworks, pattern, path) {
dat <- NULL
file_order <- NULL
i <- 1
for (artwork in artworks) {
if (length(pattern) == 1) {
index_file <- pattern
} else {
index_file <- pattern[i]
}
fnames <- dir(pattern = paste0(artwork, "_"),
path = paste(path, artwork, sep = "/"))
topic <- NULL
for (fname in fnames) {
suppressWarnings(
topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
grep("^<card type=",
trimws(readLines(paste(path, artwork, fname, sep = "/"))),
value = T)))
)
}
index <- paste(path, artwork, index_file, sep = "/")
suppressWarnings(
file_order <- c(file_order, gsub("^<card src=.*/(.*)./>$", "\\1",
grep("^<card src=", trimws(readLines(index)),
value = TRUE)))
)
in_index <- fnames %in% file_order
dat <- rbind(dat, data.frame(artwork, file_name = fnames, in_index, topic))
i <- i + 1
}
# take only the ones that are actually displayed and sort in the same order
# as indicated in index.html
out <- dat[dat$in_index, -3]
out <- out[order(file_order, out$file_name), ]
rownames(out) <- NULL
out$index <- unlist(sapply(table(out$artwork), seq_len))
out
}
###########################################################################
# Add topics: file names and topics
add_topic <- function(data, topics) {
artworks <- unique(data$artwork)
tab_art <- lapply(artworks,
function(x) names(table(data$topicNumber[data$artwork == x])))
names(tab_art) <- artworks
tab_index <- lapply(tab_art, seq_along)
dat_split <- split(data, ~ artwork)
set_label <- function(x) {
artwork <- unique(x$artwork)
x$topicIndex <- factor(x$topicNumber, labels = tab_index[[artwork]])
x
}
dat_label <- lapply(dat_split, set_label)
set_topic <- function(x) {
artwork <- unique(x$artwork)
labels_file <- topics[topics$artwork == artwork,
"file_name"][as.numeric(levels(x$topicIndex))]
x$topicFile <- as.character(factor(x$topicIndex, labels = labels_file))
labels_topic <- topics[topics$artwork == artwork,
"topic"][as.numeric(levels(x$topicIndex))]
x$topic <- as.character(factor(x$topicIndex, labels = labels_topic))
x
}
dat_topic <- lapply(dat_label, set_topic)
#out <- do.call(rbind, dat_topic)
out <- dplyr::bind_rows(dat_topic)
out <- out[order(out$fileId.start, out$date.start, out$timeMs.start), ]
rownames(out) <- NULL
out
}
###########################################################################
# Create data frame with information on artworks
extract_artworks <- function(artworks, files = paste0(artworks, ".xml"),
path = path) {
out <- NULL
i <- 1
for (artwork in artworks) {
if (length(files) == 1) {
index_file <- files
} else {
index_file <- files[i]
}
index <- paste(path, artwork, index_file, sep = "/")
varnames <- c("artist", "title", "misc", "description")
xmllist <- XML::xmlToList(index)$header[varnames]
if (any(sapply(xmllist, is.null))) {# necessary for missing entries
names(xmllist) <- varnames
xmllist[which(sapply(xmllist, is.null))] <- NA
}
# remove ugly quotes
xmllist <- lapply(xmllist, function(x) gsub("„|“", "", x))
# remove HTML tags
xmllist <- lapply(xmllist, function(x) gsub("<br/>", " ", x))
xmldat <- as.data.frame(xmllist)
xmldat$artwork <- artwork
# trim white space from strings
xmldat$artist <- trimws(xmldat$artist)
xmldat$title <- trimws(xmldat$title)
xmldat$misc <- trimws(xmldat$misc)
xmldat$description <- trimws(xmldat$description)
out <- rbind(out, xmldat)
i <- i + 1
}
out
}