From fa730081db018436ddb1b7bc6a5474f165324e8a Mon Sep 17 00:00:00 2001 From: nwickel Date: Tue, 12 Sep 2023 14:31:36 +0200 Subject: [PATCH] Reworked structure of preprocessing; moved all steps to separate functions --- code/02_preprocessing.R | 488 +++-------------------------- code/functions.R | 431 +++++++++++++------------ code/questions_programming-input.R | 6 - 3 files changed, 250 insertions(+), 675 deletions(-) diff --git a/code/02_preprocessing.R b/code/02_preprocessing.R index 89991ea..d41f538 100644 --- a/code/02_preprocessing.R +++ b/code/02_preprocessing.R @@ -1,476 +1,60 @@ -#' --- -#' title: "Preprocessing log files" -#' author: "Nora Wickelmaier" -#' date: "`r Sys.Date()`" -#' output: -#' html_document: -#' toc: true -#' toc_float: true -#' pdf_document: -#' toc: true -#' number_sections: true -#' geometry: margin = 2.5cm -#' --- - # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code") -#' # Read data +source("functions.R") + +# Read data dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";", header = TRUE) -dat0$date <- as.POSIXct(dat0$date) # create date object +dat0$date <- as.POSIXct(dat0$date) dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0) -#' # Remove irrelevant events - -#' ## Remove Start Application and Show Application - +# Remove irrelevant events dat <- subset(dat0, !(dat0$event %in% c("Start Application", "Show Application"))) -rownames(dat) <- NULL - -#' # Close events - -######## - -#' Do it for Transform events first -dat1 <- dat[dat$event %in% c("Transform start", "Transform stop"), ] -dat1 <- dat1[order(dat1$artwork, dat1$date), ] -rownames(dat1) <- NULL - -# Create event ID for closing events -num_start <- diff(c(0, which(dat1$event == "Transform stop"))) -dat1$eventId <- rep(seq_along(num_start), num_start) -head(dat1[, c("event", "eventId")], 25) - -table(table(dat1$eventId)) -# 1 2 3 4 5 6 7 8 10 11 -# 70 78435 5153 842 222 66 18 14 3 1 -# --> compare to table(num_start)! - -# Find out how often "Transform stop" follows each other -num_stop <- c(diff(c(0, which(dat1$event == "Transform start")))) -table(num_stop) - -# remove duplicated "Transform start" events -dat1 <- dat1[!duplicated(dat1[, c("event", "eventId")]), ] - -# remove duplicated "Transform stop" events -id_stop <- which(dat1$event == "Transform stop") -id_rm_stop <- id_stop[diff(id_stop) == 1] - -dat1 <- dat1[-(id_rm_stop + 1), ] - -# transform to wide data format -dat1$time <- ifelse(dat1$event == "Transform start", "start", "stop") - -trans_wide <- reshape(dat1, direction = "wide", - idvar = c("eventId", "artwork", "glossar"), - timevar = "time", - drop = c("popup", "topicNumber", "event") -) - -# TODO: This runs for quite some time -# --> Is this more efficient with tidyr::pivot_wider? - -# --> when fileId is part of the reshape, it does not work correctly, since -# we sometimes have a start - stop event that is recorded in two separate -# log files, BUT: after finding out, that `timeMs` changes for each log -# file, I want to exclude those cases, so `fileId` has to be included!!! - -# check how often an eventId is associated with two fileIds -nrow(subset(trans_wide, trans_wide$fileId.start != trans_wide$fileId.stop)) - -# which(is.na(trans_wide$date.start)) - -trans_wide$event <- "move" -trans_wide$eventId <- NULL - -rownames(trans_wide) <- NULL - -trans_wide$duration <- trans_wide$timeMs.stop - trans_wide$timeMs.start -#trans_wide$duration <- trans_wide$date.stop - trans_wide$date.start -# only seconds - not fine grained enough -trans_wide$distance <- apply( - trans_wide[, c("x.start", "y.start", "x.stop", "y.stop")], 1, - function(x) dist(matrix(x, 2, 2, byrow = TRUE))) -trans_wide$rotationDegree <- trans_wide$rotation.stop - - trans_wide$rotation.start -trans_wide$scaleSize <- trans_wide$scale.stop / trans_wide$scale.start - -trans_wide$trace <- NA -trans_wide$topicNumber <- NA -trans_wide$popup <- NA - -dat_trans <- trans_wide[trans_wide$distance != 0 & - trans_wide$rotationDegree != 0 & - trans_wide$scaleSize != 1, - c("fileId.start", "fileId.stop", "event", "artwork", - "trace", "glossar", "date.start", "date.stop", - "timeMs.start", "timeMs.stop", "duration", - "topicNumber", "popup", "x.start", "y.start", - "x.stop", "y.stop", "distance", "scale.start", - "scale.stop", "scaleSize", "rotation.start", - "rotation.stop", "rotationDegree")] -1 - nrow(dat_trans) / nrow(trans_wide) -# removes almost 2/3 of the data (for small data set) - -rm(id_rm_stop, id_stop, trans_wide, num_start, num_stop) - -summary(dat_trans) - - -#' # Close other events - -dat2 <- dat[!dat$event %in% c("Transform start", "Transform stop"), ] -rownames(dat2) <- NULL - -dat2$trace <- NA -last_event <- dat2$event[1] -aws <- unique(dat2$artwork)[unique(dat2$artwork) != "glossar"] -# -for (art in aws) { # select artwork - - for (i in 1:nrow(dat2)) { # go through rows - - if (last_event == "Show Info" & dat2$artwork[i] == art) { - dat2$trace[i] <- i - j <- i - - } else if (last_event == "Show Front" & dat2$artwork[i] == art) { - dat2$trace[i] <- j - - } else if (!(last_event %in% c("Show Info", "Show Front")) & - dat2$artwork[i] == art) { - dat2$trace[i] <- j - } - - if (i <= nrow(dat2)) { - last_event <- dat2$event[i + 1] - } - } -} - -rm(aws, i, j, last_event, art) - -#' ## Fix glossar entries (find corresponding artworks and fill in trace) - -glossar_files <- unique(dat2[dat2$artwork == "glossar", "popup"]) - -# load lookup table for artworks and glossar files -load("../data/glossar_dict.RData") -lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ] - -head(dat2[, c("artwork", "event", "popup", "trace")], 20) - -inside <- glossar_files[glossar_files %in% - lut[sapply(lut$artwork, length) == 1, - "glossar_file"]] -single_art <- unlist(lut[lut$glossar_file %in% inside, "artwork"]) - - -for (file in lut$glossar_file) { - - artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"]) - - for (i in seq_len(nrow(dat2))) { - - if (dat2$event[i] == "Show Info" | - (dat2$event[i] == "Artwork/OpenCard" & - dat2$artwork[i] %in% single_art)) { - - current_artwork <- dat2[i, "artwork"] - j <- i - k <- i - - } else { - - current_artwork <- current_artwork - - } - - if (dat2$event[i] == "Show Front" & dat2$artwork[i] == current_artwork) { - # make sure artwork has not been closed, yet! - k <- i - } - - if (dat2$artwork[i] == "glossar" & - (current_artwork %in% artwork_list) & - dat2$popup[i] == file & (j - k == 0)) { - - dat2[i, "trace"] <- dat2[j, "trace"] - dat2[i, "artwork"] <- current_artwork - - } - } -} - -# --> finds a bit more than half of the glossar entries for the small data -# set... -proportions(table(is.na(dat2[dat2$glossar == 1, "trace"]))) - -# REMEMBER: It can never be 100% correct, since it is always possible that -# several cards are open and that they link to the same glossar entry - -dat2[14110:14130, ] -dat2[dat2$glossar == 1, ] - -# Exclude not matched glossar entries -df <- subset(dat2, !is.na(dat2$trace)) -df <- df[order(df$trace), ] -rownames(df) <- NULL - -rm(lut, current_artwork, file, glossar_dict, i, j, k, artwork_list, - glossar_files, inside, single_art) - -#' ## Close flipCard - -dat3 <- subset(df, df$event %in% c("Show Info", "Show Front")) - -dat3$time <- ifelse(dat3$event == "Show Info", "start", "stop") - -flipCard_wide <- reshape(dat3, direction = "wide", - idvar = c("trace", "artwork", "glossar"), - timevar = "time", - drop = c("popup", "topicNumber")) -flipCard_wide$event <- "flipCard" -flipCard_wide$duration <- flipCard_wide$timeMs.stop - - flipCard_wide$timeMs.start - -flipCard_wide$topicNumber <- NA -flipCard_wide$popup <- NA -flipCard_wide$distance <- NA -flipCard_wide$scaleSize <- NA -flipCard_wide$rotationDegree <- NA - -dat_flipCard <- flipCard_wide[, c("fileId.start", "fileId.stop", "event", - "artwork", "trace", "glossar", - "date.start", "date.stop", - "timeMs.start", "timeMs.stop", - "duration", "topicNumber", "popup", - "x.start", "y.start", "x.stop", "y.stop", - "distance", "scale.start", "scale.stop", - "scaleSize", "rotation.start", - "rotation.stop", "rotationDegree")] - -rm(flipCard_wide) - -#' ## Close openTopic - -dat4 <- subset(df, df$event %in% c("Artwork/OpenCard", "Artwork/CloseCard")) -dat4 <- dat4[order(dat4$artwork, dat4$date), ] -rownames(dat4) <- NULL - -num_start <- diff(c(0, which(dat4$event == "Artwork/CloseCard"))) -dat4$eventId <- rep(seq_along(num_start), num_start) - -dat4$time <- ifelse(dat4$event == "Artwork/OpenCard", "start", "stop") - -openTopic_wide <- reshape(dat4, direction = "wide", - idvar = c("eventId", "trace", "glossar", "artwork", "topicNumber"), - timevar = "time", drop = "popup") -openTopic_wide$event <- "openTopic" -openTopic_wide$duration <- openTopic_wide$timeMs.stop - - openTopic_wide$timeMs.start - -openTopic_wide$popup <- NA -openTopic_wide$distance <- NA -openTopic_wide$scaleSize <- NA -openTopic_wide$rotationDegree <- NA - -dat_openTopic <- openTopic_wide[, c("fileId.start", "fileId.stop", "event", - "artwork", "trace", "glossar", - "date.start", "date.stop", - "timeMs.start", "timeMs.stop", - "duration", "topicNumber", "popup", - "x.start", "y.start", "x.stop", - "y.stop", "distance", "scale.start", - "scale.stop", "scaleSize", - "rotation.start", "rotation.stop", - "rotationDegree")] -# TODO: topicNumber should have a unique identifier for each artwork - -rm(openTopic_wide, num_start) - -#' ## close openPopup - -dat5 <- subset(df, df$event %in% c("ShowPopup", "HidePopup")) -dat5 <- dat5[order(dat5$artwork, dat5$popup, dat5$date), ] -rownames(dat5) <- NULL - -num_start <- diff(c(0, which(dat5$event == "HidePopup"))) -# last event is "ShowPopup"! Needs to be fixed -# num_start <- c(num_start, 1) -# TODO: Needs to be caught in a function -# --> not anymore - still relevant??? - -dat5$eventId <- rep(seq_along(num_start), num_start) - -dat5$time <- ifelse(dat5$event == "ShowPopup", "start", "stop") - -openPopup_wide <- reshape(dat5, direction = "wide", - idvar = c("eventId", "trace", "glossar", - "artwork", "popup"), - timevar = "time", - drop = "topicNumber") -# there is a pathological entry which gets deleted... -# df[df$trace == 4595, ] -# --> artwork 046 popup selene.xml gets opened twice - -openPopup_wide$event <- "openPopup" -openPopup_wide$duration <- openPopup_wide$timeMs.stop - - openPopup_wide$timeMs.start - -openPopup_wide$topicNumber <- NA -openPopup_wide$distance <- NA -openPopup_wide$scaleSize <- NA -openPopup_wide$rotationDegree <- NA - -dat_openPopup <- openPopup_wide[, c("fileId.start", "fileId.stop", "event", - "artwork", "trace", "glossar", - "date.start", "date.stop", - "timeMs.start", "timeMs.stop", - "duration", "topicNumber", "popup", - "x.start", "y.start", "x.stop", - "y.stop", "distance", "scale.start", - "scale.stop", "scaleSize", - "rotation.start", "rotation.stop", - "rotationDegree")] -rm(num_start, openPopup_wide) - - -# TODO: Should topicNumber maybe also be filled in for "openPopup"? - -#' ## Merge data sets for different events - -dat_all <- rbind(dat_trans, dat_flipCard, dat_openTopic, dat_openPopup) - -# check -nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) + - nrow(dat_openTopic) + nrow(dat_openPopup)) - -#' ## Remove all events that do not have a `date.start` - -dim(dat_all[is.na(dat_all$date.start), ]) -dat_all <- dat_all[!is.na(dat_all$date.start), ] -# There is only a `date.stop`, when event is not properly closed, see here: -df[df$trace == 1843, ] -dat_openPopup[dat_openPopup$trace == 1843, ] -## --> still 50 (small data set) left, and some really do not seem to be -## opened! Must be a log error -# --> others should be closed! -dat[31000:31019,] # this one e.g. -# --> Actually NOT! card gets flipped before! Again - log error! +# Add trace variable +dat1 <- add_trace(dat) +# Close events +dat2 <- rbind(close_events(dat1, "move"), + close_events(dat1, "flipCard"), + close_events(dat1, "openTopic"), + close_events(dat1, "openPopup")) +dat2 <- dat2[order(dat2$date.start, dat2$fileId.start), ] # Remove durations when event spans more than one log file, since they are # not interpretable -dat_all[which(dat_all$fileId.start != dat_all$fileId.stop), "duration"] <- NA +dat2[which(dat2$fileId.start != dat2$fileId.stop), "duration"] <- NA -# sort by `start.date` -dat_all <- dat_all[order(dat_all$date.start), ] -rownames(dat_all) <- NULL +# Remove all events that do not have a `date.start` +dat2 <- dat2[!is.na(dat2$date.start), ] +rownames(dat2) <- NULL -ind <- rowSums(is.na(dat_all)) == ncol(dat_all) -any(ind) -dat_all[ind, ] -# --> No rows with only NA, as it should be. +#summary(dat2) -summary(dat_all) # OK, this actually makes a lot of sense :) +# Add case variable +dat3 <- add_case(dat2) -#' ## Create case variable +# Add event ID +dat3$eventId <- seq_len(nrow(dat3)) +dat3 <- dat3[, c("fileId.start", "fileId.stop", "eventId", "case", + "trace", "glossar", "event", "artwork", + "date.start", "date.stop", "timeMs.start", + "timeMs.stop", "duration", "topicNumber", "popup", + "x.start", "y.start", "x.stop", "y.stop", + "distance", "scale.start", "scale.stop", + "scaleSize", "rotation.start", "rotation.stop", + "rotationDegree")] -dat_all$timediff <- as.numeric(diff(c(dat_all$date.start[1], dat_all$date.start))) +# Add trace for move events +dat4 <- add_trace_moves(dat3) -hist(dat_all$timediff[dat_all$timediff < 40], breaks = 50) +# Fill in topics - -# TODO: What is the best choice for the cutoff here? I took 20 secs for now -dat_all$case <- NA -j <- 1 - -for (i in seq_len(nrow(dat_all))) { - if (dat_all$timediff[i] < 21) { - dat_all$case[i] <- j - } else { - j <- j + 1 - dat_all$case[i] <- j - } -} - -head(dat_all[, c("event", "artwork", "trace", "date.start", "timediff", "case")], 100) - -#' ## Add event ID - -dat_all$eventId <- seq_len(nrow(dat_all)) - -dat_all <- dat_all[, c("fileId.start", "fileId.stop", "eventId", "case", - "trace", "glossar", "event", "artwork", - "date.start", "date.stop", "timeMs.start", - "timeMs.stop", "duration", "topicNumber", "popup", - "x.start", "y.start", "x.stop", "y.stop", - "distance", "scale.start", "scale.stop", - "scaleSize", "rotation.start", "rotation.stop", - "rotationDegree")] - -#' ## Add `trace` numbers for `move` events - -cases <- unique(dat_all$case) -aws <- unique(dat_all$artwork)[unique(dat_all$artwork) != "glossar"] -max_trace <- max(dat_all$trace, na.rm = TRUE) + 1 -out <- NULL - -for (case in cases) { - for (art in aws) { - tmp <- dat_all[dat_all$case == case & dat_all$artwork == art, ] - if (nrow(tmp) != 0) { - - if (length(na.omit(unique(tmp$trace))) == 1) { - tmp[tmp$event == "move", "trace"] <- na.omit(unique(tmp$trace)) - } else if (length(na.omit(unique(tmp$trace))) > 1) { - for (i in 1:nrow(tmp)) { - if (tmp$event[i] == "move") { - if (i == 1) { - tmp$trace[i] <- na.omit(unique(tmp$trace))[1] - } else { - tmp$trace[i] <- tmp$trace[i - 1] - } - } - } - } else if (all(is.na(tmp$trace))) { - for (i in 1:nrow(tmp)) { - if (tmp$event[i] == "move") { - tmp$trace[i] <- max_trace - } - } - } - max_trace <- max_trace + 1 - } - if (nrow(tmp) > 0) { - out <- rbind(out, tmp) - } - } -} - -# TODO: Get rid of the loops -# --> This takes forever... - -# put glossar events back in --> not relevant anymore - -#dat_all <- rbind(out, dat_all[dat_all$artwork == "glossar", ]) -out <- out[order(out$date.start), ] -rownames(out) <- NULL - -# Make `trace` a consecutive number -out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace))) - -#' # Fill in topics - -topics <- read.table("../data/topics.csv", sep = ";", header = TRUE) +# topics <- read.table("../data/topics.csv", sep = ";", header = TRUE) # TODO: Add topics to data frame -#' # Export data - -write.table(out, "../data/event_logfiles.csv", sep = ";", +# Export data +write.table(dat4, "../data/event_logfiles.csv", sep = ";", row.names = FALSE) diff --git a/code/functions.R b/code/functions.R index 9b0b165..1d96bf1 100644 --- a/code/functions.R +++ b/code/functions.R @@ -1,245 +1,195 @@ -# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code") - -# Read data - -dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";", - header = TRUE) - - dat0$date <- as.POSIXct(dat0$date) - dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0) - # remove irrelevant events - dat <- subset(dat0, !(dat0$event %in% c("Start Application", - "Show Application"))) - -# Close move events -close_moves <- function(data) { - - # close move events - dat1 <- data[data$event %in% c("Transform start", "Transform stop"), ] - dat1 <- dat1[order(dat1$artwork, dat1$date), ] - num_start <- diff(c(0, which(dat1$event == "Transform stop"))) - dat1$eventId <- rep(seq_along(num_start), num_start) - - # remove duplicated "Transform start" events - dat1 <- dat1[!duplicated(dat1[, c("event", "eventId")]), ] - - # remove duplicated "Transform stop" events - id_stop <- which(dat1$event == "Transform stop") - id_rm_stop <- id_stop[diff(id_stop) == 1] - - dat1 <- dat1[-(id_rm_stop + 1), ] - - # transform to wide data format - dat1$time <- ifelse(dat1$event == "Transform start", "start", "stop") - - trans_wide <- reshape(dat1, direction = "wide", - idvar = c("eventId", "artwork", "glossar"), - timevar = "time", - drop = c("popup", "topicNumber", "event") - ) - trans_wide$event <- "move" - trans_wide$eventId <- NULL - - trans_wide$duration <- trans_wide$timeMs.stop - trans_wide$timeMs.start - trans_wide$distance <- apply( - trans_wide[, c("x.start", "y.start", "x.stop", "y.stop")], 1, - function(x) dist(matrix(x, 2, 2, byrow = TRUE))) - trans_wide$rotationDegree <- trans_wide$rotation.stop - - trans_wide$rotation.start - trans_wide$scaleSize <- trans_wide$scale.stop / trans_wide$scale.start - - trans_wide$trace <- NA - trans_wide$topicNumber <- NA - trans_wide$popup <- NA - - dat_trans <- trans_wide[trans_wide$distance != 0 & - trans_wide$rotationDegree != 0 & - trans_wide$scaleSize != 1, - c("fileId.start", "fileId.stop", "event", "artwork", - "trace", "glossar", "date.start", "date.stop", - "timeMs.start", "timeMs.stop", "duration", - "topicNumber", "popup", "x.start", "y.start", - "x.stop", "y.stop", "distance", "scale.start", - "scale.stop", "scaleSize", "rotation.start", - "rotation.stop", "rotationDegree")] - rownames(dat_trans) <- NULL - cat(paste("INFORMATION:", nrow(trans_wide) - nrow(dat_trans), - "lines containing move events were removed since they did", - "\nnot contain any change"), fill = TRUE) - dat_trans -} - - -dat1 <- close_moves(dat) -# TODO: Integrate this function into close_events? - ########################################################################### # Add trace variable -add_trace <- function(data) { - dat2 <- data[!data$event %in% c("Transform start", "Transform stop"), ] - - dat2$trace <- NA - last_event <- dat2$event[1] - aws <- unique(dat2$artwork)[unique(dat2$artwork) != "glossar"] - # - for (art in aws) { # select artwork - - for (i in 1:nrow(dat2)) { # go through rows - - if (last_event == "Show Info" & dat2$artwork[i] == art) { - dat2$trace[i] <- i - j <- i - - } else if (last_event == "Show Front" & dat2$artwork[i] == art) { - dat2$trace[i] <- j - - } else if (!(last_event %in% c("Show Info", "Show Front")) & - dat2$artwork[i] == art) { - dat2$trace[i] <- j - } - - if (i <= nrow(dat2)) { - last_event <- dat2$event[i + 1] - } - } - } - dat2 -} - - -add_trace2 <- function(data, glossar_dict = "../data/glossar_dict.RData") { +add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") { data$trace <- NA - dat1 <- data[data$event %in% c("Transform start", "Transform stop"), ] - dat2 <- data[!data$event %in% c("Transform start", "Transform stop"), ] - - last_event <- dat2$event[1] - aws <- unique(dat2$artwork)[unique(dat2$artwork) != "glossar"] - # - for (art in aws) { # select artwork - - for (i in 1:nrow(dat2)) { # go through rows + subdata1 <- data[data$event %in% c("Transform start", "Transform stop"), ] + subdata2 <- data[!data$event %in% c("Transform start", "Transform stop"), ] - if (last_event == "Show Info" & dat2$artwork[i] == art) { - dat2$trace[i] <- i + last_event <- subdata2$event[1] + aws <- unique(subdata2$artwork)[unique(subdata2$artwork) != "glossar"] + + for (art in aws) { + + for (i in 1:nrow(subdata2)) { + + if (last_event == "Show Info" & subdata2$artwork[i] == art) { + subdata2$trace[i] <- i j <- i - } else if (last_event == "Show Front" & dat2$artwork[i] == art) { - dat2$trace[i] <- j + } else if (last_event == "Show Front" & subdata2$artwork[i] == art) { + subdata2$trace[i] <- j } else if (!(last_event %in% c("Show Info", "Show Front")) & - dat2$artwork[i] == art) { - dat2$trace[i] <- j + subdata2$artwork[i] == art) { + subdata2$trace[i] <- j } - if (i <= nrow(dat2)) { - last_event <- dat2$event[i + 1] + if (i <= nrow(subdata2)) { + last_event <- subdata2$event[i + 1] } } } - glossar_files <- unique(dat2[dat2$artwork == "glossar", "popup"]) - + # Fix glossar entries (find corresponding artworks and fill in trace) + glossar_files <- unique(subdata2[subdata2$artwork == "glossar", "popup"]) + # load lookup table for artworks and glossar files load(glossar_dict) lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ] - - head(dat2[, c("artwork", "event", "popup", "trace")], 20) - - inside <- glossar_files[glossar_files %in% + + head(subdata2[, c("artwork", "event", "popup", "trace")], 20) + + inside <- glossar_files[glossar_files %in% lut[sapply(lut$artwork, length) == 1, "glossar_file"]] single_art <- unlist(lut[lut$glossar_file %in% inside, "artwork"]) - - + + for (file in lut$glossar_file) { - + artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"]) - - for (i in seq_len(nrow(dat2))) { - - if (dat2$event[i] == "Show Info" | - (dat2$event[i] == "Artwork/OpenCard" & - dat2$artwork[i] %in% single_art)) { - - current_artwork <- dat2[i, "artwork"] + + for (i in seq_len(nrow(subdata2))) { + + if (subdata2$event[i] == "Show Info" | + (subdata2$event[i] == "Artwork/OpenCard" & + subdata2$artwork[i] %in% single_art)) { + + current_artwork <- subdata2[i, "artwork"] j <- i k <- i - + } else { - + current_artwork <- current_artwork - + } - - if (dat2$event[i] == "Show Front" & dat2$artwork[i] == current_artwork) { + + if (subdata2$event[i] == "Show Front" & subdata2$artwork[i] == current_artwork) { # make sure artwork has not been closed, yet! k <- i } - - if (dat2$artwork[i] == "glossar" & + + if (subdata2$artwork[i] == "glossar" & (current_artwork %in% artwork_list) & - dat2$popup[i] == file & (j - k == 0)) { - - dat2[i, "trace"] <- dat2[j, "trace"] - dat2[i, "artwork"] <- current_artwork - + subdata2$popup[i] == file & (j - k == 0)) { + + subdata2[i, "trace"] <- subdata2[j, "trace"] + subdata2[i, "artwork"] <- current_artwork + } } } - cat(proportions(table(is.na(dat2[dat2$glossar == 1, "trace"]))), fill = TRUE) + # Exclude not matched glossar entries + cat("INFORMATION: glossar entries that are not matched will be removed:", + sum(is.na(subdata2[subdata2$glossar == 1, "trace"])), "entries", + #proportions(table(is.na(subdata2[subdata2$glossar == 1, "trace"]))), + fill = TRUE) + subdata2 <- subset(subdata2, !is.na(subdata2$trace)) + # REMEMBER: It can never be 100% correct, since it is always possible + # that several cards are open and that they link to the same glossar + # entry - out <- rbind(dat1, dat2) + # dat2[14110:14130, ] + # dat2[dat2$glossar == 1, ] + + out <- rbind(subdata1, subdata2) out <- out[order(out$fileId, out$date, out$timeMs), ] out } - -tmp <- add_trace2(dat) - ########################################################################### -close_events <- function(data, event = c("flipCard", "openTopic", "openPopup")) { +close_events <- function(data, event = c("move", "flipCard", "openTopic", "openPopup")) { + + if (event == "move") { + actions <- c("Transform start", "Transform stop") + idvar <- c("eventId", "artwork", "glossar") + drop <- c("popup", "topicNumber", "trace", "event") + + } else if (event == "flipCard") { + actions <- c("Show Info", "Show Front") + idvar <- c("trace", "artwork", "glossar") + drop <- c("popup", "topicNumber", "eventId", "event") - if (event == "flipCard") { - subdata <- subset(data, data$event %in% c("Show Info", "Show Front")) - subdata$time <- ifelse(subdata$event == "Show Info", "start", "stop") - subdata$eventId <- NA - idvar <- c("trace", "artwork", "glossar") - drop <- c("popup", "topicNumber") - } else if (event == "openTopic") { - subdata <- subset(data, data$event %in% c("Artwork/OpenCard", "Artwork/CloseCard")) - subdata$time <- ifelse(subdata$event == "Artwork/OpenCard", "start", "stop") - num_start <- diff(c(0, which(subdata$event == "Artwork/CloseCard"))) - subdata$eventId <- rep(seq_along(num_start), num_start) - idvar <- c("eventId", "trace", "glossar", "artwork", "topicNumber") - drop <- "popup" + actions <- c("Artwork/OpenCard", "Artwork/CloseCard") + idvar <- c("eventId", "trace", "glossar", "artwork", "topicNumber") + drop <- c("popup", "event") } else if (event == "openPopup") { - subdata <- subset(data, data$event %in% c("ShowPopup", "HidePopup")) - subdata$time <- ifelse(subdata$event == "ShowPopup", "start", "stop") - num_start <- diff(c(0, which(subdata$event == "HidePopup"))) - subdata$eventId <- rep(seq_along(num_start), num_start) - idvar <- c("eventId", "trace", "glossar", "artwork", "popup") - drop <- "topicNumber" + actions <- c("ShowPopup", "HidePopup") + idvar <- c("eventId", "trace", "glossar", "artwork", "popup") + drop <- c("topicNumber", "event") +# TODO: Should topicNumber maybe also be filled in for "openPopup"? + + } else { + stop("`event` must be one of 'move', 'flipCard', 'openTopic', + 'openPopup'.") } - data_wide <- reshape(subdata, direction = "wide", - idvar = idvar, - timevar = "time", - drop = drop) - data_wide$event <- event - data_wide$duration <- data_wide$timeMs.stop - data_wide$timeMs.start - + subdata <- subset(data, data$event %in% actions) + #subdata <- subdata[order(subdata$artwork, subdata$popup, subdata$date), ] + subdata <- subdata[order(subdata$artwork, subdata$popup, subdata$date, subdata$timeMs), ] + subdata$time <- ifelse(subdata$event == actions[1], "start", "stop") + num_start <- diff(c(0, which(subdata$event == actions[2]))) + subdata$eventId <- rep(seq_along(num_start), num_start) +# If last event is start event, it needs to be fixed: +# num_start <- c(num_start, 1) +# TODO: Needs to be caught in a function +# --> not anymore - still relevant??? + + if (event == "move") { + subdata <- subdata[!duplicated(subdata[, c("event", "eventId")]), ] + id_stop <- which(subdata$event == actions[2]) + id_rm_stop <- id_stop[diff(id_stop) == 1] + subdata <- subdata[-(id_rm_stop + 1), ] + } + + suppressWarnings( + data_wide <- reshape(subdata, direction = "wide", + idvar = idvar, + timevar = "time", + drop = drop) + ) +# TODO: Suppress warnings? Better with tryCatch()? +# there is a pathological entry which gets deleted... +# df[df$trace == 4595, ] +# --> artwork 046 popup selene.xml gets opened twice + +# TODO: This runs for quite some time +# --> Is this more efficient with tidyr::pivot_wider? + + for (d in drop) data_wide[d] <- NA data_wide$distance <- NA data_wide$scaleSize <- NA data_wide$rotationDegree <- NA - + + data_wide$event <- event + data_wide$duration <- data_wide$timeMs.stop - data_wide$timeMs.start + + if (event == "move") { + data_wide$distance <- apply( + data_wide[, c("x.start", "y.start", "x.stop", "y.stop")], 1, + function(x) dist(matrix(x, 2, 2, byrow = TRUE))) + data_wide$rotationDegree <- data_wide$rotation.stop - + data_wide$rotation.start + data_wide$scaleSize <- data_wide$scale.stop / data_wide$scale.start + # remove moves without any change + move_wide <- data_wide[data_wide$distance != 0 & + data_wide$rotationDegree != 0 & + data_wide$scaleSize != 1, ] + cat(paste("INFORMATION:", nrow(data_wide) - nrow(move_wide), + "lines containing move events were removed since they did", + "\nnot contain any change"), fill = TRUE) + data_wide <- move_wide + } + out <- data_wide[, c("fileId.start", "fileId.stop", "event", "artwork", "trace", "glossar", "date.start", "date.stop", "timeMs.start", "timeMs.stop", "duration", @@ -247,37 +197,84 @@ close_events <- function(data, event = c("flipCard", "openTopic", "openPopup")) "x.stop", "y.stop", "distance", "scale.start", "scale.stop", "scaleSize", "rotation.start", "rotation.stop", "rotationDegree")] + rownames(out) <- NULL out - # TODO: Suppress warnings? } -tmp <- rbind(close_moves(dat), - close_events(df, "flipCard"), - close_events(df, "openTopic"), - close_events(df, "openPopup")) +########################################################################### + +# Add case variable + +add_case <- function(data, cutoff = 20) { +# TODO: What is the best choice for the cutoff here? + + data$timediff <- as.numeric(diff(c(data$date.start[1], data$date.start))) + + data$case <- NA + j <- 1 + + for (i in seq_len(nrow(data))) { + if (data$timediff[i] <= cutoff) { + data$case[i] <- j + } else { + j <- j + 1 + data$case[i] <- j + } + } + data$timediff <- NULL + data +} + +########################################################################### + +# Add trace for moves + +add_trace_moves <- function(data) { + + cases <- unique(data$case) + aws <- unique(data$artwork)[unique(data$artwork) != "glossar"] + max_trace <- max(data$trace, na.rm = TRUE) + 1 + out <- NULL + + for (case in cases) { + for (art in aws) { + tmp <- data[data$case == case & data$artwork == art, ] + if (nrow(tmp) != 0) { + + if (length(na.omit(unique(tmp$trace))) == 1) { + tmp[tmp$event == "move", "trace"] <- na.omit(unique(tmp$trace)) + } else if (length(na.omit(unique(tmp$trace))) > 1) { + for (i in 1:nrow(tmp)) { + if (tmp$event[i] == "move") { + if (i == 1) { + tmp$trace[i] <- na.omit(unique(tmp$trace))[1] + } else { + tmp$trace[i] <- tmp$trace[i - 1] + } + } + } + } else if (all(is.na(tmp$trace))) { + for (i in 1:nrow(tmp)) { + if (tmp$event[i] == "move") { + tmp$trace[i] <- max_trace + } + } + } + max_trace <- max_trace + 1 + } + if (nrow(tmp) > 0) { + out <- rbind(out, tmp) + } + } + } + out <- out[order(out$date.start, out$fileId.start), ] + rownames(out) <- NULL + + # Make trace a consecutive number + out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace))) + out +} +# TODO: Get rid of the loops +# --> This takes forever... + -# 'data.frame': 38607 obs. of 24 variables: -# $ fileId.start : chr "2016_11_15-12_32_57.log" "2016_11_15-14_42_57.log" "2016_11_15-14_42_57.log" "2016_11_16-12_31_32.log" ... -# $ fileId.stop : chr "2016_11_15-12_32_57.log" "2016_11_15-14_42_57.log" "2016_11_15-14_42_57.log" "2016_11_16-12_31_32.log" ... -# $ event : chr "move" "move" "move" "move" ... -# $ artwork : chr "001" "001" "001" "001" ... -# $ trace : int NA NA NA NA NA NA NA NA NA NA ... -# $ glossar : num 0 0 0 0 0 0 0 0 0 0 ... -# $ date.start : POSIXct, format: "2016-12-15 12:39:49" "2016-12-15 14:49:37" ... -# $ date.stop : POSIXct, format: "2016-12-15 12:39:49" "2016-12-15 14:49:40" ... -# $ timeMs.start : int 412141 400777 554506 384312 406277 533864 548467 549396 158632 194982 ... -# $ timeMs.stop : int 412474 403784 556633 388313 407994 538185 549088 551116 160343 197099 ... -# $ duration : int 333 3007 2127 4001 1717 4321 621 1720 1711 2117 ... -# $ topicNumber : int NA NA NA NA NA NA NA NA NA NA ... -# $ popup : chr NA NA NA NA ... -# $ x.start : num 531 235 470 326 326 ... -# $ y.start : num 1221 734 2090 747 747 ... -# $ x.stop : num 513 360 1492 256 2459 ... -# $ y.stop : num 1212 809 1687 643 1430 ... -# $ distance : num 19.8 146.6 1098.5 125.2 2239.4 ... -# $ scale.start : num 0.8 0.301 0.8 0.301 0.301 ... -# $ scale.stop : num 0.8 0.331 0.822 0.391 0.397 ... -# $ scaleSize : num 1 1.1 1.03 1.3 1.32 ... -# $ rotation.start: num 116 116 90 116 116 ... -# $ rotation.stop : num 116.3 89.6 2.8 86.1 125.8 ... -# $ rotationDegree: num 0.00245 -26.72397 -87.19711 -30.14456 9.49951 ... diff --git a/code/questions_programming-input.R b/code/questions_programming-input.R index dd0e33c..9fc5afb 100644 --- a/code/questions_programming-input.R +++ b/code/questions_programming-input.R @@ -103,9 +103,3 @@ proportions(table(is.na(tmp$trace[tmp$artwork == "glossar"]))) # How many glossar_files are only associated with one artwork? lut[sapply(lut$artwork, length) == 1, "glossar_file"] -# TODO: Fill in the ones that are associated with one artwork -# --> Can't come up with something -- maybe ask Philipp??? - -# TODO: How to check if one of the former "Show Infos" is correct -# --> Can't come up with something -- maybe ask Philipp??? -