diff --git a/code/02_preprocessing.R b/code/02_preprocessing.R index 6264eb6..4d6572b 100644 --- a/code/02_preprocessing.R +++ b/code/02_preprocessing.R @@ -44,12 +44,19 @@ dat <- subset(dat0, !(dat0$event %in% c("Start Application", #' # Close events -#' Do it for Tranform events first +# TODO: +# Should every "Show front" be the beginning of a new trace? +# Should Transform events be handled separately and then be "added" again +# by timestamp? + +######## + +#' Do it for Transform events first tmp <- dat[dat$event %in% c("Transform start", "Transform stop"), ] tmp <- tmp[order(tmp$artwork, tmp$date), ] rownames(tmp) <- NULL -# Find out how often "Transform start" follows each other +# Create event ID for closing events num_start <- diff(c(0, which(tmp$event == "Transform stop"))) tmp$eventid <- rep(seq_along(num_start), num_start) head(tmp[, c("event", "eventid")], 25) @@ -88,8 +95,6 @@ trans_wide <- reshape(tmp, direction = "wide", idvar = c("eventid", "artwork"), timevar = "event", drop = c("fileid", "popup", "card") ) - -rownames(trans_wide) <- NULL # --> when fileid is part of the reshape, it does not work correctly, since # we sometimes have a start - stop event that is recorded in two separate # log files @@ -97,67 +102,297 @@ rownames(trans_wide) <- NULL # which(is.na(trans_wide$date.start)) +trans_wide$event <- "move" +trans_wide$eventid <- NULL + +rownames(trans_wide) <- NULL + trans_wide$duration <- trans_wide$time_ms.stop - trans_wide$time_ms.start -trans_wide$distance <- apply(trans_wide[, c("x.start", "y.start", "x.stop", "y.stop")], - 1, function(x) dist(matrix(x, 2, 2, byrow = TRUE))) -trans_wide$rotationDegree <- trans_wide$rotation.stop - trans_wide$rotation.start +#trans_wide$duration2 <- trans_wide$date.stop - trans_wide$date.start +# only seconds - not fine grained enough +trans_wide$distance <- apply( + trans_wide[, c("x.start", "y.start", "x.stop", "y.stop")], 1, + function(x) dist(matrix(x, 2, 2, byrow = TRUE))) +trans_wide$rotationDegree <- trans_wide$rotation.stop - + trans_wide$rotation.start trans_wide$scaleSize <- trans_wide$scale.stop - trans_wide$scale.start -trans_wide <- trans_wide[trans_wide$distance != 0 & - trans_wide$rotationDegree != 0 & - trans_wide$scaleSize != 0, ] +dat_trans <- trans_wide[trans_wide$distance != 0 & + trans_wide$rotationDegree != 0 & + trans_wide$scaleSize != 0, ] # removes almost 2/3 of the data (for small data set) +dat_trans <- dat_trans[, c("event", "artwork", + "date.start", "date.stop", + "time_ms.start", "time_ms.stop", "duration", + "x.start", "y.start", "x.stop", "y.stop", "distance", + "scale.start", "scale.stop", "scaleSize", + "rotation.start", "rotation.stop", "rotationDegree")] -# TODO: How do I handle popups from glossar??? +summary(dat_trans) + +# TODO: Phillip fragen was mit `time_ms` schief läuft... Hat er eine +# Erklärung dafür? +#plot(time_ms.stop ~ time_ms.start, dat_trans, type = "b") +plot(time_ms.stop ~ time_ms.start, dat_trans, col = rgb(red = 0, green = 0, blue = 0, alpha = 0.2)) + +plot(date.stop ~ date.start, dat_trans[1:1000,], type = "b") + +# TODO: Wie mit duration < 0 umgehen? Einfach auf NA setzen? For now... +dat_trans[dat_trans$duration < 0, "duration"] <- NA +#' # Close other events - - -# Should every "Show front" be the beginning of a new trace? -# Should Transform events be handled separately and then be "added" again -# by timestamp? - -######## tmp <- dat[!dat$event %in% c("Transform start", "Transform stop"), ] +tmp$x <- NULL +tmp$y <- NULL +tmp$scale <- NULL +tmp$rotation <- NULL rownames(tmp) <- NULL +# Create event ID for closing events +# num_start <- diff(c(0, which(tmp$event == "Show Front"))) +# tmp$trace <- rep(seq_along(num_start), num_start) +# head(tmp[, c("artwork", "event", "trace")], 50) +# TODO: How do I handle popups from glossar??? +# --> does not work because of glossar entries... can't sort by artwork + + tmp$trace <- NA last_event <- tmp$event[1] +aws <- unique(tmp$artwork)[unique(tmp$artwork) != "glossar"] +# +for (art in aws) { # select artwork -for (art in unique(tmp$artwork)) { + for (i in 1:nrow(tmp)) { # go through rows - for (i in 1:nrow(tmp)) { + if (last_event == "Show Info" & tmp$artwork[i] == art) { + tmp$trace[i] <- i + j <- i - if (last_event == "Show Info" & (tmp$artwork[i] == art | - tmp$artwork[i] == "glossar")) { - tmp$trace[i] <- "start" - } else if (last_event == "Show Front" & (tmp$artwork[i] == art | - tmp$artwork[i] == "glossar")) { - tmp$trace[i] <- "stop" + } else if (last_event == "Show Front" & tmp$artwork[i] == art) { + tmp$trace[i] <- j + + } else if (!(last_event %in% c("Show Info", "Show Front")) & + tmp$artwork[i] == art) { + tmp$trace[i] <- j + } + + if (i <= nrow(tmp)) { + last_event <- tmp$event[i + 1] } - last_event <- tmp$event[i + 1] } } - -head(tmp[4:ncol(tmp)], 50) -# TODO: Great job! You used a for-loop to rename "Show info" and "Show -# front" to "start" and "stop" ;) +head(tmp[, c("artwork", "event", "trace")], 50) +tail(tmp[, c("artwork", "event", "trace")], 50) +## Fix glossar entries + +### Find artwork for glossar entry + +tmp2 <- tmp[tmp$artwork == "glossar", ] + +glossar_files <- unique(tmp2$popup) + +# load lookup table for artworks and glossar files +load("../data/glossar_dict.RData") +lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ] +# TODO: Find last "Artwork/OpenCard" that matches possible artwork + +head(tmp[, c("artwork", "event", "popup", "trace")], 20) +#df <- NULL + +for (file in lut$glossar_file) { + + artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"]) + + for (i in seq_len(nrow(tmp))) { + + if (tmp$event[i] == "Show Info") { + + current_artwork <- tmp[i, "artwork"] + j <- i + k <- i + + } else { + + current_artwork <- current_artwork + + } + + if (tmp$event[i] == "Show Front" & tmp$artwork[i] == current_artwork) { + # make sure artwork has not been closed, yet! + k <- i + } + + if (tmp$artwork[i] == "glossar" & + (current_artwork %in% artwork_list) & + tmp$popup[i] == file & (j-k == 0)) { + + #df <- rbind(df, data.frame(file, current_artwork, i, j)) + tmp[i, "trace"] <- tmp[j, "trace"] + + } + } +} +# dim(tmp[is.na(tmp$trace), ]) +# --> finds about half of the glossar entries for the small data set... + +# tmp[apply(df[, c("j", "i")], 1, c), c("artwork", "event", "popup", "trace")] + +# REMEMBER: It can never bo 100% correct, since it is always possible that +# several cards are open and that they link at the same glossar entry + +# Fill in the glossar_files that are only associated with one artwork +# How many glossar_files are only associated with one artwork? +lut[sapply(lut$artwork, length) == 1, "glossar_file"] + +# TODO: Can I fill in the ones that are only associated with one artwork +# easily? I can't come up with something --> maybe ask AK??? + +# TODO: How can I check if one of the former "Show Infos" is the correct +# one? I can't come up with something --> maybe ask AK??? + +# for (file in lut$glossar_file) { +# +# artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"]) +# +# for (i in seq_len(nrow(tmp))) { +# +# if (tmp$event[i] == "Show Info") { +# +# artworks <- NULL +# current_artwork <- tmp[i, "artwork"] +# j <- i +# +# } else { +# +# print(current_artwork) +# artworks <- c(artworks, tmp[i, "artwork"]) +# print(artworks) +# +# } +# +# # if (tmp$artwork[i] == "glossar" & +# # (current_artwork %in% artwork_list) & +# # tmp$popup[i] == file) { +# # +# # #df <- rbind(df, data.frame(file, current_artwork, i, j)) +# # tmp[i, "trace"] <- tmp[j, "trace"] +# +# # } +# } +# } + +# correct: 17940 +# incorrect: 17963 +# TODO: For now: Exclude not matched glossar entries + +df <- subset(tmp, !is.na(tmp$trace)) +df <- df[order(df$trace), ] +rownames(df) <- NULL +#' ## Close flipCard + +tmp <- subset(df, df$event %in% c("Show Info", "Show Front")) + +tmp$event <- ifelse(tmp$event == "Show Info", "start", "stop") + +dat_flipCard <- reshape(tmp, direction = "wide", + idvar = c("trace", "artwork"), + timevar = "event", drop = c("fileid", "popup", "card")) +dat_flipCard$event <- "flipCard" +dat_flipCard$duration <- dat_flipCard$time_ms.stop - dat_flipCard$time_ms.start + +dat_flipCard$duration <- ifelse(dat_flipCard$duration < 0, NA, dat_flipCard$duration) +# TODO: Wie mit duration < 0 umgehen? Einfach auf NA setzen? For now... + +dat_flipCard <- dat_flipCard[, c("event", "artwork", "trace", + "date.start", "date.stop", + "time_ms.start", "time_ms.stop", "duration")] +#' ## Close openTopic +tmp <- subset(df, df$event %in% c("Artwork/OpenCard", "Artwork/CloseCard")) +tmp <- tmp[order(tmp$artwork, tmp$date), ] +rownames(tmp) <- NULL +num_start <- diff(c(0, which(tmp$event == "Artwork/CloseCard"))) +tmp$eventid <- rep(seq_along(num_start), num_start) + +tmp$event <- ifelse(tmp$event == "Artwork/OpenCard", "start", "stop") + +dat_openTopic <- reshape(tmp, direction = "wide", + idvar = c("eventid", "trace", "artwork", "card"), + timevar = "event", drop = c("fileid", "popup")) +dat_openTopic$event <- "openTopic" +dat_openTopic$duration <- dat_openTopic$time_ms.stop - dat_openTopic$time_ms.start + +dat_openTopic$duration <- ifelse(dat_openTopic$duration < 0, NA, dat_openTopic$duration) +# TODO: Wie mit duration < 0 umgehen? Einfach auf NA setzen? For now... + +dat_openTopic <- dat_openTopic[, c("event", "artwork", "card", "trace", + "date.start", "date.stop", + "time_ms.start", "time_ms.stop", "duration")] +# TODO: card should have a unique identifier for each artwork! + +#' ## close openPopup +tmp <- subset(df, df$event %in% c("ShowPopup", "HidePopup")) +tmp <- tmp[order(tmp$artwork, tmp$date), ] +rownames(tmp) <- NULL + +num_start <- diff(c(0, which(tmp$event == "HidePopup"))) +# last event is "ShowPopup"! Needs to be fixed TODO: Needs to be cought in +# a function! +num_start <- c(num_start, 1) + +tmp$eventid <- rep(seq_along(num_start), num_start) + +tmp$event <- ifelse(tmp$event == "ShowPopup", "start", "stop") + +dat_openPopup <- reshape(tmp, direction = "wide", + idvar = c("eventid", "trace", "artwork", "popup"), + timevar = "event", drop = c("fileid", "card")) +# there is a pathological entry which gets deleted... +# df[df$trace == 4595, ] + +# TODO: Some correct entries are not closed: +df[df$trace == 1843, ] +# WHY??? + +dat_openPopup$event <- "openPopup" +dat_openPopup$duration <- dat_openPopup$time_ms.stop - dat_openPopup$time_ms.start + +dat_openPopup$duration <- ifelse(dat_openPopup$duration < 0, NA, dat_openPopup$duration) +# TODO: Wie mit duration < 0 umgehen? Einfach auf NA setzen? For now... + +dat_openPopup <- dat_openPopup[, c("event", "artwork", "popup", "trace", + "date.start", "date.stop", + "time_ms.start", "time_ms.stop", "duration")] + +# Merge all +dat_all <- merge(dat_trans, dat_flipCard, all = TRUE) +dat_all <- merge(dat_all, dat_openTopic, all = TRUE) +dat_all <- merge(dat_all, dat_openPopup, all = TRUE) + +# check +nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) + + nrow(dat_openTopic) + nrow(dat_openPopup)) + +dat_all <- dat_all[order(dat_all$date.start), ] +rownames(dat_all) <- NULL + +# TODO: Should card maybe also be filled in for "openPopup"?