########################################################################### # Add trace variable add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") { data$trace <- NA subdata1 <- data[data$event %in% c("Transform start", "Transform stop"), ] subdata2 <- data[!data$event %in% c("Transform start", "Transform stop"), ] last_event <- subdata2$event[1] artworks <- unique(subdata2$artwork)[unique(subdata2$artwork) != "glossar"] n <- 1 # count artworks for progress pb <- txtProgressBar(min = 0, max = nrow(subdata2), initial = NA, style = 3) for (artwork in artworks) { cat("\n\nAdding trace variable for artwork", artwork, paste0("(", n, "/", length(artworks), ")"), "\n") for (i in 1:nrow(subdata2)) { if (last_event == "Show Info" & subdata2$artwork[i] == artwork) { subdata2$trace[i] <- i j <- i } else if (last_event == "Show Front" & subdata2$artwork[i] == artwork) { subdata2$trace[i] <- j } else if (!(last_event %in% c("Show Info", "Show Front")) & subdata2$artwork[i] == artwork) { subdata2$trace[i] <- j } if (i <= nrow(subdata2)) { last_event <- subdata2$event[i + 1] } setTxtProgressBar(pb, i) } n <- n + 1 } # Fix glossar entries (find corresponding artworks and fill in trace) glossar_files <- unique(subdata2[subdata2$artwork == "glossar", "popup"]) # load lookup table for artworks and glossar files load(glossar_dict) lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ] inside <- glossar_files[glossar_files %in% lut[sapply(lut$artwork, length) == 1, "glossar_file"]] single_art <- unlist(lut[lut$glossar_file %in% inside, "artwork"]) m <- 1 for (file in lut$glossar_file) { cat("\n\nAdding trace variable for glossar entry", file, paste0("(", m, "/", length(lut$glossar_file), ")"), "\n") artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"]) for (i in seq_len(nrow(subdata2))) { if (subdata2$event[i] == "Show Info" | (subdata2$event[i] == "Artwork/OpenCard" & subdata2$artwork[i] %in% single_art)) { current_artwork <- subdata2[i, "artwork"] j <- i k <- i } else { current_artwork <- current_artwork } if (subdata2$event[i] == "Show Front" & subdata2$artwork[i] == current_artwork) { # make sure artwork has not been closed, yet! k <- i } if (subdata2$artwork[i] == "glossar" & (current_artwork %in% artwork_list) & subdata2$popup[i] == file & (j - k == 0)) { subdata2[i, "trace"] <- subdata2[j, "trace"] subdata2[i, "artwork"] <- current_artwork } setTxtProgressBar(pb, i) } m <- m + 1 } # Exclude not matched glossar entries cat("\n\nINFORMATION: glossar entries that are not matched will be removed:", sum(is.na(subdata2[subdata2$glossar == 1, "trace"])), "entries", #proportions(table(is.na(subdata2[subdata2$glossar == 1, "trace"]))), fill = TRUE) subdata2 <- subset(subdata2, !is.na(subdata2$trace)) # REMEMBER: It can never be 100% correct, since it is always possible # that several cards are open and that they link to the same glossar # entry # dat2[14110:14130, ] # dat2[dat2$glossar == 1, ] out <- rbind(subdata1, subdata2) out <- out[order(out$fileId, out$date, out$timeMs), ] out } ########################################################################### close_events <- function(data, event = c("move", "flipCard", "openTopic", "openPopup")) { # TODO: How do I set default vector and partial matching of arguments? # --> `macht.arg()` and `pmatch()` if (event == "move") { actions <- c("Transform start", "Transform stop") idvar <- c("eventId", "artwork", "glossar") drop <- c("popup", "topicNumber", "trace", "event") ncol <- 17 } else if (event == "flipCard") { actions <- c("Show Info", "Show Front") idvar <- c("trace", "artwork", "glossar") drop <- c("popup", "topicNumber", "eventId", "event") ncol <- 17 } else if (event == "openTopic") { actions <- c("Artwork/OpenCard", "Artwork/CloseCard") idvar <- c("eventId", "trace", "glossar", "artwork", "topicNumber") drop <- c("popup", "event") ncol <- 19 } else if (event == "openPopup") { actions <- c("ShowPopup", "HidePopup") idvar <- c("eventId", "trace", "glossar", "artwork", "popup") drop <- c("topicNumber", "event") ncol <- 19 # TODO: Should topicNumber maybe also be filled in for "openPopup"? } else { stop("`event` must be one of 'move', 'flipCard', 'openTopic', 'openPopup'.") } # TODO: `fileId` should now maybe go back into `idvar` subdata <- subset(data, data$event %in% actions) #subdata <- subdata[order(subdata$artwork, subdata$popup, subdata$date), ] subdata <- subdata[order(subdata$artwork, subdata$popup, subdata$date, subdata$timeMs), ] subdata$time <- ifelse(subdata$event == actions[1], "start", "stop") num_start <- diff(c(0, which(subdata$event == actions[2]))) if (tail(subdata, 1)$time == "start") { num_start <- c(num_start, 1) } subdata$eventId <- rep(seq_along(num_start), num_start) if (event == "move") { subdata <- subdata[!duplicated(subdata[, c("event", "eventId")]), ] id_stop <- which(subdata$event == actions[2]) id_rm_stop <- id_stop[diff(id_stop) == 1] subdata <- subdata[-(id_rm_stop + 1), ] } subdata_split <- split(subdata, ~ fileId) pbapply::pboptions(style = 3, char = "=") suppressWarnings( subdata_split_wide <- pbapply::pblapply(subdata_split, reshape, direction = "wide", idvar = idvar, timevar = "time", drop = drop) ) # suppressWarnings( # data_wide <- reshape(subdata, direction = "wide", # idvar = idvar, # timevar = "time", # drop = drop) # ) # TODO: Suppress warnings? Better with tryCatch()? # there is a pathological entry which gets deleted... # df[df$trace == 4595, ] # --> artwork 046 popup selene.xml gets opened twice # remove entries with only start or stop events since they do not have # all columns subdata_split_wide <- subdata_split_wide[-which(sapply(subdata_split_wide, ncol) != ncol)] #data_wide <- do.call(rbind, subdata_split_wide) # TODO: This runs quite some time # --> There is a more efficient function in dplyr, which would also allow # to keep the file IDs with only start or stop or a single entry... data_wide <- dplyr::bind_rows(subdata_split_wide) for (d in drop) data_wide[d] <- NA data_wide$distance <- NA data_wide$scaleSize <- NA data_wide$rotationDegree <- NA data_wide$event <- event data_wide$duration <- data_wide$timeMs.stop - data_wide$timeMs.start if (event == "move") { data_wide$distance <- apply( data_wide[, c("x.start", "y.start", "x.stop", "y.stop")], 1, function(x) dist(matrix(x, 2, 2, byrow = TRUE))) data_wide$rotationDegree <- data_wide$rotation.stop - data_wide$rotation.start data_wide$scaleSize <- data_wide$scale.stop / data_wide$scale.start # remove moves without any change move_wide <- data_wide[data_wide$distance != 0 & data_wide$rotationDegree != 0 & data_wide$scaleSize != 1, ] cat(paste("INFORMATION:", nrow(data_wide) - nrow(move_wide), "lines containing move events were removed since they did", "\nnot contain any change"), fill = TRUE) data_wide <- move_wide } out <- data_wide[, c("fileId.start", "fileId.stop", "event", "artwork", "trace", "glossar", "date.start", "date.stop", "timeMs.start", "timeMs.stop", "duration", "topicNumber", "popup", "x.start", "y.start", "x.stop", "y.stop", "distance", "scale.start", "scale.stop", "scaleSize", "rotation.start", "rotation.stop", "rotationDegree")] rownames(out) <- NULL out } ########################################################################### # Add case variable add_case <- function(data, cutoff = 20) { # TODO: What is the best choice for the cutoff here? data$timediff <- as.numeric(diff(c(data$date.start[1], data$date.start))) data$case <- NA j <- 1 pb <- txtProgressBar(min = 0, max = nrow(data), initial = NA, style = 3) for (i in seq_len(nrow(data))) { if (data$timediff[i] <= cutoff) { data$case[i] <- j } else { j <- j + 1 data$case[i] <- j } setTxtProgressBar(pb, i) } data$timediff <- NULL data } ########################################################################### # Add trace for moves add_trace_moves <- function(data) { cases <- unique(data$case) artworks <- unique(data$artwork)[unique(data$artwork) != "glossar"] trace_max <- max(data$trace, na.rm = TRUE) subdata_list <- split(data, ~ artwork + case) subdata_list <- subdata_list[which(sapply(subdata_list, nrow) != 0)] pbapply::pboptions(style = 3, char = "=") subdata_trace <- pbapply::pblapply(subdata_list, function(x) { trace_max <<- trace_max + 1 add_trace_subdata(x, max_trace = trace_max) } ) out <- dplyr::bind_rows(subdata_trace) out <- out[order(out$fileId.start, out$date.start, out$timeMs.start), ] rownames(out) <- NULL # Make trace a consecutive number out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace))) out } add_trace_subdata <- function(subdata, max_trace) { if (nrow(subdata) != 0) { if (length(na.omit(unique(subdata$trace))) == 1) { subdata[subdata$event == "move", "trace"] <- na.omit(unique(subdata$trace)) } else if (length(na.omit(unique(subdata$trace))) > 1) { for (i in 1:nrow(subdata)) { if (subdata$event[i] == "move") { if (i == 1) { subdata$trace[i] <- na.omit(unique(subdata$trace))[1] } else { subdata$trace[i] <- subdata$trace[i - 1] } } } } else if (all(is.na(subdata$trace))) { for (i in 1:nrow(subdata)) { subdata$trace[i] <- max_trace } } } else { warning("`subdata` has nrow = 0") } subdata } #system.time(dat4a <- add_trace_moves2(dat3)) #system.time(dat4b <- add_trace_moves(dat3)) ########################################################################### # Create data frame with file names and topics for each artwork extract_topics <- function(artworks, pattern, path) { dat <- NULL file_order <- NULL i <- 1 for (artwork in artworks) { if (length(pattern) == 1) { index_file <- pattern } else { index_file <- pattern[i] } fnames <- dir(pattern = paste0(artwork, "_"), path = paste(path, artwork, sep = "/")) topic <- NULL for (fname in fnames) { suppressWarnings( topic <- c(topic, gsub("^$", "\\1", grep("^$", "\\1", grep("^", " ", x)) xmldat <- as.data.frame(xmllist) xmldat$artwork <- artwork # trim white space from strings xmldat$artist <- trimws(xmldat$artist) xmldat$title <- trimws(xmldat$title) xmldat$misc <- trimws(xmldat$misc) xmldat$description <- trimws(xmldat$description) out <- rbind(out, xmldat) i <- i + 1 } out }