commit 17e8e39cc3238e8244e1da8b60304c23f132f482 Author: nwickel Date: Wed Sep 20 16:16:47 2023 +0200 Created R package mtt diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..762c8c4 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,12 @@ +Package: mtt +Title: Log files from a Multi-Touch-Table +Version: 0.0.0.9000 +Authors@R: + person("Nora", "Wickelmaier", , "n.wickelmaier@iwm-tuebingen.de", role = c("aut", "cre")) +Imports: stats, utils, dplyr, pbapply, XML, lubridate +Description: Creating event logs from raw log files from a Multi-Touch-Table at the IWM +License: `use_mit_license()`, `use_gpl3_license()` or friends to pick a + license +Encoding: UTF-8 +Roxygen: list(markdown = TRUE) +RoxygenNote: 7.2.3 diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..158dff3 --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,4 @@ +# Generated by roxygen2: do not edit by hand + +export(create_eventlogs) +export(parse_logfiles) diff --git a/R/create_eventlogs.R b/R/create_eventlogs.R new file mode 100644 index 0000000..628bc32 --- /dev/null +++ b/R/create_eventlogs.R @@ -0,0 +1,79 @@ +#' Creating log events from raw log files. +#' +#' Creating event logs from a data frame of raw log files from a +#' Multi-Touch-Table at the IWM. +#' +#' @param data Data frame of raw log files created with `parse_logfiles()`. +#' See `?parse_logfiles` for more details. +#' @return Data frame. +#' @export +#' @examples +#' # tbd +create_eventlogs <- function(data) { + + data$date <- as.POSIXct(data$date) + data$glossar <- ifelse(data$artwork == "glossar", 1, 0) + + # Remove irrelevant events + dat <- subset(data, !(data$event %in% c("Start Application", + "Show Application"))) + + # Add trace variable ##################################################### + cat("########## Adding trace variable... ##########", "\n") + dat1 <- add_trace(dat) + + # Close events + cat("########## Closing events... ##########", "\n") + c1 <- close_events(dat1, "move") + cat("## --> move events closed.", "\n") + c2 <- close_events(dat1, "flipCard") + cat("## --> flipCard events closed.", "\n") + c3 <- close_events(dat1, "openTopic") + cat("## --> openTopic events closed.", "\n") + c4 <- close_events(dat1, "openPopup") + cat("## --> openPopup events closed.", "\n") + dat2 <- rbind(c1, c2, c3, c4) + + dat2 <- dat2[order(dat2$date.start, dat2$fileId.start), ] + + # Remove all events that do not have a `date.start` + d1 <- nrow(dat2) + dat2 <- dat2[!is.na(dat2$date.start), ] + d2 <- nrow(dat2) + if(d1 > d2) { + warning(paste0(d1-d2, " lines that do not contain a start event have been removed. This can happen when events span over more than one log file.\n")) + } + + rownames(dat2) <- NULL + + # Add case variable ###################################################### + cat("########## Adding case and eventId variables... ##########", "\n") + dat3 <- add_case(dat2) + + # Add event ID ########################################################### + dat3$eventId <- seq_len(nrow(dat3)) + dat3 <- dat3[, c("fileId.start", "fileId.stop", "eventId", "case", + "trace", "glossar", "event", "artwork", + "date.start", "date.stop", "timeMs.start", + "timeMs.stop", "duration", "topicNumber", "popup", + "x.start", "y.start", "x.stop", "y.stop", + "distance", "scale.start", "scale.stop", + "scaleSize", "rotation.start", "rotation.stop", + "rotationDegree")] + + # Add trace for move events ############################################## + cat("\n########## Adding trace variable for move events... ##########", "\n") + dat4 <- add_trace_moves(dat3) + + # Add topics: file names and topics ###################################### + cat("########## Adding information about topics... ##########", "\n") + artworks <- unique(dat4$artwork) + # remove artworks without XML information + artworks <- artworks[!artworks %in% c("504", "505")] + topics <- extract_topics(artworks, pattern = paste0(artworks, ".xml"), + path = "../data/ContentEyevisit/eyevisit_cards_light/") + + dat5 <- add_topic(dat4, topics = topics) + dat5 +} + diff --git a/R/helper.R b/R/helper.R new file mode 100644 index 0000000..ecf7e8d --- /dev/null +++ b/R/helper.R @@ -0,0 +1,458 @@ +########################################################################### + +# Add trace variable + +add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") { + + data$trace <- NA + subdata1 <- data[data$event %in% c("Transform start", "Transform stop"), ] + subdata2 <- data[!data$event %in% c("Transform start", "Transform stop"), ] + + last_event <- subdata2$event[1] + artworks <- unique(subdata2$artwork)[unique(subdata2$artwork) != "glossar"] + n <- 1 # count artworks for progress + + pb <- utils::txtProgressBar(min = 0, max = nrow(subdata2), initial = NA, + style = 3) + + for (artwork in artworks) { + + cat("\n\nAdding trace variable for artwork", artwork, + paste0("(", n, "/", length(artworks), ")"), "\n") + + for (i in 1:nrow(subdata2)) { + + if (last_event == "Show Info" & subdata2$artwork[i] == artwork) { + subdata2$trace[i] <- i + j <- i + + } else if (last_event == "Show Front" & subdata2$artwork[i] == artwork) { + subdata2$trace[i] <- j + + } else if (!(last_event %in% c("Show Info", "Show Front")) & + subdata2$artwork[i] == artwork) { + subdata2$trace[i] <- j + } + + if (i <= nrow(subdata2)) { + last_event <- subdata2$event[i + 1] + } + utils::setTxtProgressBar(pb, i) + } + n <- n + 1 + } + + # Fix glossar entries (find corresponding artworks and fill in trace) + glossar_files <- unique(subdata2[subdata2$artwork == "glossar", "popup"]) + + # load lookup table for artworks and glossar files + load(glossar_dict) + lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ] + + inside <- glossar_files[glossar_files %in% + lut[sapply(lut$artwork, length) == 1, + "glossar_file"]] + single_art <- unlist(lut[lut$glossar_file %in% inside, "artwork"]) + + m <- 1 + + for (file in lut$glossar_file) { + + cat("\n\nAdding trace variable for glossar entry", file, + paste0("(", m, "/", length(lut$glossar_file), ")"), "\n") + + artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"]) + + for (i in seq_len(nrow(subdata2))) { + + if (subdata2$event[i] == "Show Info" | + (subdata2$event[i] == "Artwork/OpenCard" & + subdata2$artwork[i] %in% single_art)) { + + current_artwork <- subdata2[i, "artwork"] + j <- i + k <- i + + } else { + + current_artwork <- current_artwork + + } + + if (subdata2$event[i] == "Show Front" & subdata2$artwork[i] == current_artwork) { + # make sure artwork has not been closed, yet! + k <- i + } + + if (subdata2$artwork[i] == "glossar" & + (current_artwork %in% artwork_list) & + subdata2$popup[i] == file & (j - k == 0)) { + + subdata2[i, "trace"] <- subdata2[j, "trace"] + subdata2[i, "artwork"] <- current_artwork + + } + utils::setTxtProgressBar(pb, i) + } + m <- m + 1 + } + + # Exclude not matched glossar entries + cat("\n\nINFORMATION: glossar entries that are not matched will be removed:", + sum(is.na(subdata2[subdata2$glossar == 1, "trace"])), "entries", + #proportions(table(is.na(subdata2[subdata2$glossar == 1, "trace"]))), + fill = TRUE) + subdata2 <- subset(subdata2, !is.na(subdata2$trace)) + # REMEMBER: It can never be 100% correct, since it is always possible + # that several cards are open and that they link to the same glossar + # entry + + # dat2[14110:14130, ] + # dat2[dat2$glossar == 1, ] + + out <- rbind(subdata1, subdata2) + out <- out[order(out$fileId, out$date, out$timeMs), ] + out +} + +########################################################################### + +close_events <- function(data, event = c("move", "flipCard", "openTopic", "openPopup")) { + + event <- match.arg(event) + + switch(event, + "move" = { + actions <- c("Transform start", "Transform stop") + idvar <- c("fileId", "eventId", "artwork", "glossar") + drop <- c("popup", "topicNumber", "trace", "event") + ncol <- 16 + + }, + "flipCard" = { + actions <- c("Show Info", "Show Front") + idvar <- c("fileId", "trace", "artwork", "glossar") + drop <- c("popup", "topicNumber", "eventId", "event") + ncol <- 16 + + }, + "openTopic" = { + actions <- c("Artwork/OpenCard", "Artwork/CloseCard") + idvar <- c("fileId", "eventId", "trace", "glossar", "artwork", + "topicNumber") + drop <- c("popup", "event") + ncol <- 18 + + }, + "openPopup" = { + actions <- c("ShowPopup", "HidePopup") + idvar <- c("fileId", "eventId", "trace", "glossar", "artwork", "popup") + drop <- c("topicNumber", "event") + ncol <- 18 +# TODO: Should topicNumber maybe also be filled in for "openPopup"? + + } + ) + + subdata <- subset(data, data$event %in% actions) + subdata <- subdata[order(subdata$artwork, subdata$popup, subdata$date, subdata$timeMs), ] + subdata$time <- ifelse(subdata$event == actions[1], "start", "stop") + num_start <- diff(c(0, which(subdata$event == actions[2]))) + if (utils::tail(subdata, 1)$time == "start") { + num_start <- c(num_start, 1) + } + subdata$eventId <- rep(seq_along(num_start), num_start) + + if (event == "move") { + subdata <- subdata[!duplicated(subdata[, c("event", "eventId")]), ] + id_stop <- which(subdata$event == actions[2]) + id_rm_stop <- id_stop[diff(id_stop) == 1] + subdata <- subdata[-(id_rm_stop + 1), ] + } + + subdata_split <- split(subdata, ~ fileId) + + pbapply::pboptions(style = 3, char = "=") + + subdata_split_wide <- pbapply::pblapply(subdata_split, stats::reshape, + direction = "wide", + idvar = idvar, + timevar = "time", + drop = drop) +# suppressWarnings( +# data_wide <- stats::reshape(subdata, direction = "wide", +# idvar = idvar, +# timevar = "time", +# drop = drop) +# ) + + # remove entries with only start or stop events since they do not have + # all columns + ids <- which(sapply(subdata_split_wide, ncol) != ncol) + if (length(ids) > 0) subdata_split_wide <- subdata_split_wide[-ids] + + data_wide <- dplyr::bind_rows(subdata_split_wide) + + for (d in drop) data_wide[d] <- NA + data_wide$distance <- NA + data_wide$scaleSize <- NA + data_wide$rotationDegree <- NA + + data_wide$event <- event + data_wide$duration <- data_wide$timeMs.stop - data_wide$timeMs.start + + if (event == "move") { + data_wide$distance <- apply( + data_wide[, c("x.start", "y.start", "x.stop", "y.stop")], 1, + function(x) stats::dist(matrix(x, 2, 2, byrow = TRUE))) + data_wide$rotationDegree <- data_wide$rotation.stop - + data_wide$rotation.start + data_wide$scaleSize <- data_wide$scale.stop / data_wide$scale.start + # remove moves without any change + move_wide <- data_wide[data_wide$distance != 0 & + data_wide$rotationDegree != 0 & + data_wide$scaleSize != 1, ] + cat(paste("INFORMATION:", nrow(data_wide) - nrow(move_wide), + "lines containing move events were removed since they did", + "\nnot contain any change"), fill = TRUE) + data_wide <- move_wide + } + + out <- data_wide[, c("fileId", "event", "artwork", "trace", "glossar", + "date.start", "date.stop", "timeMs.start", + "timeMs.stop", "duration", "topicNumber", "popup", + "x.start", "y.start", "x.stop", "y.stop", + "distance", "scale.start", "scale.stop", + "scaleSize", "rotation.start", "rotation.stop", + "rotationDegree")] + rownames(out) <- NULL + out +} + +########################################################################### + +# Add case variable + +add_case <- function(data, cutoff = 20) { +# TODO: What is the best choice for the cutoff here? + + data$timediff <- as.numeric(diff(c(data$date.start[1], data$date.start))) + data$case <- NA + j <- 1 + pb <- utils::txtProgressBar(min = 0, max = nrow(data), initial = NA, style = 3) + + for (i in seq_len(nrow(data))) { + if (data$timediff[i] <= cutoff) { + data$case[i] <- j + } else { + j <- j + 1 + data$case[i] <- j + } + utils::setTxtProgressBar(pb, i) + } + data$timediff <- NULL + data +} + +########################################################################### + +# Add trace for moves + +add_trace_moves <- function(data) { + + pbapply::pboptions(style = 3, char = "=") + + trace_max <- max(data$trace, na.rm = TRUE) + + #subdata_art <- split(data, ~ artwork) + subdata_case <- split(data, ~ case) + + #subdata_list <- split(data, ~ artwork + case) + # --> does not work with complete data set + cat("Splitting data...", "\n") + subdata_list <- pbapply::pblapply(subdata_case, split, f = ~artwork) + subdata_list <- unlist(subdata_list, recursive = FALSE) + + cat("Adding trace...", "\n") + subdata_trace <- pbapply::pblapply(subdata_list, + function(x) { + trace_max <<- trace_max + 1 + add_trace_subdata(x, max_trace = trace_max) + } + ) + + out <- dplyr::bind_rows(subdata_trace) + out <- out[order(out$fileId.start, out$date.start, out$timeMs.start), ] + rownames(out) <- NULL + + # Make trace a consecutive number + out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace))) + out +} + + +add_trace_subdata <- function(subdata, max_trace) { + + if (nrow(subdata) != 0) { + + if (length(stats::na.omit(unique(subdata$trace))) == 1) { + subdata[subdata$event == "move", "trace"] <- stats::na.omit(unique(subdata$trace)) + } else if (length(stats::na.omit(unique(subdata$trace))) > 1) { + for (i in 1:nrow(subdata)) { + if (subdata$event[i] == "move") { + if (i == 1) { + subdata$trace[i] <- stats::na.omit(unique(subdata$trace))[1] + } else { + subdata$trace[i] <- subdata$trace[i - 1] + } + } + } + } else if (all(is.na(subdata$trace))) { + for (i in 1:nrow(subdata)) { + subdata$trace[i] <- max_trace + } + } + + } else { + warning("subdata has nrow = 0") + } + subdata +} + + +########################################################################### + +# Create data frame with file names and topics for each artwork + +extract_topics <- function(artworks, pattern, path) { + + dat <- NULL + file_order <- NULL + i <- 1 + + for (artwork in artworks) { + + if (length(pattern) == 1) { + index_file <- pattern + } else { + index_file <- pattern[i] + } + + fnames <- dir(pattern = paste0(artwork, "_"), + path = paste(path, artwork, sep = "/")) + topic <- NULL + for (fname in fnames) { + suppressWarnings( + topic <- c(topic, gsub("^$", "\\1", + grep("^$", "\\1", + grep("^", " ", x)) + xmldat <- as.data.frame(xmllist) + xmldat$artwork <- artwork + # trim white space from strings + xmldat$artist <- trimws(xmldat$artist) + xmldat$title <- trimws(xmldat$title) + xmldat$misc <- trimws(xmldat$misc) + xmldat$description <- trimws(xmldat$description) + out <- rbind(out, xmldat) + i <- i + 1 + } + out +} + diff --git a/R/parse_logfiles.R b/R/parse_logfiles.R new file mode 100644 index 0000000..9b99bc8 --- /dev/null +++ b/R/parse_logfiles.R @@ -0,0 +1,139 @@ +#' Left padding file names of raw log files from Multi-Touch-Table at the +#' IWM. +#' +#' File names need to be left padded since otherwise the sorting of the +#' timestamps will be off and one will get negative durations later on +#' since the wrong events get closed. +#' +#' @param x file name in the form of `yyyy_mm_dd-hh_mm_ss. +#' @param dirpaths Paths on system where files live that should be renamed. +#' @return Left padded file names. +#' @examples +#' # folders <- "all" +#' # dirpaths <- paste0("../data/haum_logs_2016-2023/", folders) +#' # fnames <- dir(dirpaths, pattern = "*.log", full.names = TRUE) +#' # leftpad_fnames(fnames) +leftpad_fnames <- function(x, dirpaths) { + + z <- gsub(paste0(dirpaths, "/"), "\\1", x) + ys <- strsplit(z, "_") + + res <- NULL + + for (y in ys) { + y2 <- unlist(strsplit(y[3], "-")) + e1 <- y[1] + e2 <- sprintf("%02d", as.numeric(y[2])) + e3 <- sprintf("%02d", as.numeric(y2[1])) + e4 <- sprintf("%02d", as.numeric(y2[2])) + e5 <- sprintf("%02d", as.numeric(y[4])) + e6 <- sprintf("%02d", as.numeric(gsub(".log", "", y[5]))) + e6 <- sprintf("%02d", as.numeric(gsub(".log", "", y[5]))) + + res <- c(res, + paste0(e1, "_", e2, "_", e3, "-", e4, "_", e5, "_", e6, ".log")) + } + res +} + +#' Creating data frame for raw log files. +#' +#' Creates a data frame or CSV file from raw log files from a +#' Multi-Touch-Table at the IWM. +#' +#' @param folders A character vector of folder names that contain the raw +#' log files from the Multi-Touch-Table at the IWM. +#' @param path A path to the folders. +#' @param file Name of the file where parsed log files should be saved. +#' Default is "rawdata_logfiles.csv". +#' @param save Logical. If data frame should be returned by the function or +#' saved. Default is TRUE. +#' @return A data frame or NULL. +#' @export +#' @examples +#' # parse_logfiles("all", path = "../data/haum_logs_2016-2023/") +parse_logfiles <- function(folders, path, file = "rawdata_logfiles.csv", + save = TRUE) { + dirpaths <- paste0(path, folders) + # TODO: This is not very intutitive + fnames <- dir(dirpaths, pattern = "*.log", full.names = TRUE) + + suppressWarnings( + logs <- lapply(fnames, readLines) + ) + nlog <- sapply(logs, length) + dat <- data.frame(fileId = rep(leftpad_fnames(fnames, dirpaths), nlog), + logs = unlist(logs)) + + # Remove corrupt lines + d1 <- nrow(dat) + dat <- subset(dat, dat$logs != "") + d2 <- nrow(dat) + + if(d1 > d2) { + warning(paste0(d1-d2, " corrupt lines have been found and removed from the data set\n")) + } + + # Extract relevant infos + date <- sapply(dat$logs, gsub, + pattern = "^\\[(.*)\\], \\[.*$", + replacement = "\\1", + USE.NAMES = FALSE) + + timestamp <- sapply(dat$logs, gsub, + pattern = "^\\[.*\\], \\[(.*)\\].*$", + replacement = "\\1", + USE.NAMES = FALSE) + + action <- sapply(dat$logs, gsub, + pattern = "^.*EyeVisit, (.*):*.*$", + replacement = "\\1", + USE.NAMES = FALSE) + + events <- sapply(strsplit(action, ":"), function(x) x[1]) + + topics <- sapply(strsplit(action, ":"), function(x) x[2]) + + suppressWarnings( + moves <- apply(do.call(rbind, + strsplit(sapply(strsplit(action, ":"), + function(x) x[3]), ",")), 2, + as.numeric) + ) + # ATTENTION: as.numeric() forces NAs for "OpenCard" and "CloseCard" + + card_action <- trimws(sapply(strsplit(action, ":"), + function(x) x[3])[grep("Artwork", events)]) + + card <- as.numeric(sapply(strsplit(action, ":"), function(x) x[4])) + + events[grep("Artwork", events)] <- paste("Artwork", card_action, sep = "/") + + ts_elements <- strsplit(timestamp, ":") + time_ms <- as.numeric(sapply(ts_elements, function(x) x[4])) + + as.numeric(sapply(ts_elements, function(x) x[3])) * 1000 + + as.numeric(sapply(ts_elements, function(x) x[2])) * 1000 * 60 + + dat$date <- lubridate::parse_date_time(date, "bdyHMSOp") + dat$timeMs <- time_ms + dat$event <- events + dat$artwork <- trimws(sapply(strsplit(topics, "/"), function(x) x[1])) + dat$popup <- sapply(strsplit(topics, "/"), function(x) x[2]) + dat$topicNumber <- card + dat$x <- moves[,1] + dat$y <- moves[,2] + dat$scale <- moves[,3] + dat$rotation <- moves[,4] + dat$logs <- NULL + + dat <- dat[order(dat$fileId, dat$date, dat$timeMs), ] + + # Export data + if (save) { + utils::write.table(dat, file = file, sep = ";", row.names = FALSE) + cat(paste0("INFORMATION: Data file", file, "has been written to ", getwd(), "\n")) + } else { + return(dat) + } +} + diff --git a/man/create_eventlogs.Rd b/man/create_eventlogs.Rd new file mode 100644 index 0000000..65aaccc --- /dev/null +++ b/man/create_eventlogs.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_eventlogs.R +\name{create_eventlogs} +\alias{create_eventlogs} +\title{Creating log events from raw log files.} +\usage{ +create_eventlogs(data) +} +\arguments{ +\item{data}{Data frame of raw log files created with \code{parse_logfiles()}. +See \code{?parse_logfiles} for more details.} +} +\value{ +Data frame. +} +\description{ +Creating event logs from a data frame of raw log files from a +Multi-Touch-Table at the IWM. +} +\examples{ +# tbd +} diff --git a/man/leftpad_fnames.Rd b/man/leftpad_fnames.Rd new file mode 100644 index 0000000..da929e0 --- /dev/null +++ b/man/leftpad_fnames.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/parse_logfiles.R +\name{leftpad_fnames} +\alias{leftpad_fnames} +\title{Left padding file names of raw log files from Multi-Touch-Table at the +IWM.} +\usage{ +leftpad_fnames(x, dirpaths) +} +\arguments{ +\item{x}{file name in the form of `yyyy_mm_dd-hh_mm_ss.} + +\item{dirpaths}{Paths on system where files live that should be renamed.} +} +\value{ +Left padded file names. +} +\description{ +File names need to be left padded since otherwise the sorting of the +timestamps will be off and one will get negative durations later on +since the wrong events get closed. +} +\examples{ +# folders <- "all" +# dirpaths <- paste0("../data/haum_logs_2016-2023/", folders) +# fnames <- dir(dirpaths, pattern = "*.log", full.names = TRUE) +# leftpad_fnames(fnames) +} diff --git a/man/parse_logfiles.Rd b/man/parse_logfiles.Rd new file mode 100644 index 0000000..e1a15f1 --- /dev/null +++ b/man/parse_logfiles.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/parse_logfiles.R +\name{parse_logfiles} +\alias{parse_logfiles} +\title{Creating data frame for raw log files.} +\usage{ +parse_logfiles(folders, path, file = "rawdata_logfiles.csv", save = TRUE) +} +\arguments{ +\item{folders}{A character vector of folder names that contain the raw +log files from the Multi-Touch-Table at the IWM.} + +\item{path}{A path to the folders.} + +\item{file}{Name of the file where parsed log files should be saved. +Default is "rawdata_logfiles.csv".} + +\item{save}{Logical. If data frame should be returned by the function or +saved. Default is TRUE.} +} +\value{ +A data frame or NULL. +} +\description{ +Creates a data frame or CSV file from raw log files from a +Multi-Touch-Table at the IWM. +} +\examples{ +# parse_logfiles("all", path = "../data/haum_logs_2016-2023/") +}