Cleaned up TODOs
This commit is contained in:
parent
c09ee933ce
commit
9e3783cf1f
@ -31,10 +31,6 @@ dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";",
|
||||
header = TRUE)
|
||||
dat0$date <- as.POSIXct(dat0$date) # create date object
|
||||
|
||||
# TODO: Add a case identifier based on timestamps -- needs to be done on
|
||||
# "raw data". Is it possible? Something seems seriously wrong with
|
||||
# `time_ms`
|
||||
|
||||
#' # Remove irrelevant events
|
||||
|
||||
#' ## Remove Start Application and Show Application
|
||||
@ -44,11 +40,6 @@ dat <- subset(dat0, !(dat0$event %in% c("Start Application",
|
||||
|
||||
#' # Close events
|
||||
|
||||
# TODO:
|
||||
# Should every "Show front" be the beginning of a new trace?
|
||||
# Should Transform events be handled separately and then be "added" again
|
||||
# by timestamp?
|
||||
|
||||
########
|
||||
|
||||
#' Do it for Transform events first
|
||||
@ -98,7 +89,8 @@ trans_wide <- reshape(tmp, direction = "wide",
|
||||
# --> when fileid is part of the reshape, it does not work correctly, since
|
||||
# we sometimes have a start - stop event that is recorded in two separate
|
||||
# log files
|
||||
# TODO: This runs for quite some time. Is this more efficient with dplyr?
|
||||
# TODO: This runs for quite some time
|
||||
# --> Is this more efficient with tidyr::pivot_wider?
|
||||
|
||||
# which(is.na(trans_wide$date.start))
|
||||
|
||||
@ -117,28 +109,31 @@ trans_wide$rotationDegree <- trans_wide$rotation.stop -
|
||||
trans_wide$rotation.start
|
||||
trans_wide$scaleSize <- trans_wide$scale.stop - trans_wide$scale.start
|
||||
|
||||
dat_trans <- trans_wide[trans_wide$distance != 0 &
|
||||
dat_trans <- dat_trans[trans_wide$distance != 0 &
|
||||
trans_wide$rotationDegree != 0 &
|
||||
trans_wide$scaleSize != 0, ]
|
||||
trans_wide$scaleSize != 0,
|
||||
c("event", "artwork", "date.start", "date.stop",
|
||||
"time_ms.start", "time_ms.stop", "duration",
|
||||
"x.start", "y.start", "x.stop", "y.stop",
|
||||
"distance", "scale.start", "scale.stop",
|
||||
"scaleSize", "rotation.start", "rotation.stop",
|
||||
"rotationDegree")]
|
||||
# removes almost 2/3 of the data (for small data set)
|
||||
|
||||
dat_trans <- dat_trans[, c("event", "artwork",
|
||||
"date.start", "date.stop",
|
||||
"time_ms.start", "time_ms.stop", "duration",
|
||||
"x.start", "y.start", "x.stop", "y.stop", "distance",
|
||||
"scale.start", "scale.stop", "scaleSize",
|
||||
"rotation.start", "rotation.stop", "rotationDegree")]
|
||||
rm(tmp, id_rm_stop, id_stop, trans_wide, num_start, num_stop)
|
||||
|
||||
summary(dat_trans)
|
||||
|
||||
# TODO: Phillip fragen was mit `time_ms` schief läuft... Hat er eine
|
||||
# Erklärung dafür?
|
||||
# TODO: Ask Phillip what is wrong with `time_ms`
|
||||
# --> Hat er eine Erklärung dafür?
|
||||
|
||||
#plot(time_ms.stop ~ time_ms.start, dat_trans, type = "b")
|
||||
plot(time_ms.stop ~ time_ms.start, dat_trans, col = rgb(red = 0, green = 0, blue = 0, alpha = 0.2))
|
||||
|
||||
plot(date.stop ~ date.start, dat_trans[1:1000,], type = "b")
|
||||
|
||||
# TODO: Wie mit duration < 0 umgehen? Einfach auf NA setzen? For now...
|
||||
# TODO: How to handle duration < 0
|
||||
# --> Replace with NA for now...
|
||||
dat_trans[dat_trans$duration < 0, "duration"] <- NA
|
||||
|
||||
|
||||
@ -156,7 +151,6 @@ rownames(tmp) <- NULL
|
||||
# num_start <- diff(c(0, which(tmp$event == "Show Front")))
|
||||
# tmp$trace <- rep(seq_along(num_start), num_start)
|
||||
# head(tmp[, c("artwork", "event", "trace")], 50)
|
||||
# TODO: How do I handle popups from glossar???
|
||||
# --> does not work because of glossar entries... can't sort by artwork
|
||||
|
||||
|
||||
@ -188,20 +182,19 @@ for (art in aws) { # select artwork
|
||||
|
||||
head(tmp[, c("artwork", "event", "trace")], 50)
|
||||
tail(tmp[, c("artwork", "event", "trace")], 50)
|
||||
# TODO: How to handle popups from glossar???
|
||||
|
||||
rm(aws, i, j, last_event, art)
|
||||
|
||||
## Fix glossar entries
|
||||
|
||||
### Find artwork for glossar entry
|
||||
|
||||
tmp2 <- tmp[tmp$artwork == "glossar", ]
|
||||
|
||||
glossar_files <- unique(tmp2$popup)
|
||||
glossar_files <- unique(tmp[tmp$artwork == "glossar", "popup"])
|
||||
|
||||
# load lookup table for artworks and glossar files
|
||||
load("../data/glossar_dict.RData")
|
||||
lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ]
|
||||
# TODO: Find last "Artwork/OpenCard" that matches possible artwork
|
||||
|
||||
head(tmp[, c("artwork", "event", "popup", "trace")], 20)
|
||||
|
||||
@ -247,17 +240,16 @@ for (file in lut$glossar_file) {
|
||||
# tmp[apply(df[, c("j", "i")], 1, c), c("artwork", "event", "popup", "trace")]
|
||||
|
||||
# REMEMBER: It can never bo 100% correct, since it is always possible that
|
||||
# several cards are open and that they link at the same glossar entry
|
||||
# several cards are open and that they link to the same glossar entry
|
||||
|
||||
# Fill in the glossar_files that are only associated with one artwork
|
||||
# How many glossar_files are only associated with one artwork?
|
||||
lut[sapply(lut$artwork, length) == 1, "glossar_file"]
|
||||
|
||||
# TODO: Can I fill in the ones that are only associated with one artwork
|
||||
# easily? I can't come up with something --> maybe ask AK???
|
||||
# TODO: Fill in the ones that are associated with one artwork
|
||||
# --> Can't come up with something -- maybe ask AK???
|
||||
|
||||
# TODO: How can I check if one of the former "Show Infos" is the correct
|
||||
# one? I can't come up with something --> maybe ask AK???
|
||||
# TODO: How to check if one of the former "Show Infos" is correct on
|
||||
# --> Can't come up with something -- maybe ask AK???
|
||||
|
||||
# for (file in lut$glossar_file) {
|
||||
#
|
||||
@ -300,6 +292,8 @@ df <- subset(tmp, !is.na(tmp$trace))
|
||||
df <- df[order(df$trace), ]
|
||||
rownames(df) <- NULL
|
||||
|
||||
rm(tmp, lut, current_artwork, file, glossar_dict, i, j, k, artwork_list,
|
||||
glossar_files)
|
||||
|
||||
#' ## Close flipCard
|
||||
|
||||
@ -307,19 +301,25 @@ tmp <- subset(df, df$event %in% c("Show Info", "Show Front"))
|
||||
|
||||
tmp$event <- ifelse(tmp$event == "Show Info", "start", "stop")
|
||||
|
||||
dat_flipCard <- reshape(tmp, direction = "wide",
|
||||
flipCard_wide <- reshape(tmp, direction = "wide",
|
||||
idvar = c("trace", "artwork"),
|
||||
timevar = "event", drop = c("fileid", "popup", "card"))
|
||||
dat_flipCard$event <- "flipCard"
|
||||
dat_flipCard$duration <- dat_flipCard$time_ms.stop - dat_flipCard$time_ms.start
|
||||
timevar = "event",
|
||||
drop = c("fileid", "popup", "card"))
|
||||
flipCard_wide$event <- "flipCard"
|
||||
flipCard_wide$duration <- flipCard_wide$time_ms.stop -
|
||||
flipCard_wide$time_ms.start
|
||||
|
||||
dat_flipCard$duration <- ifelse(dat_flipCard$duration < 0, NA, dat_flipCard$duration)
|
||||
# TODO: Wie mit duration < 0 umgehen? Einfach auf NA setzen? For now...
|
||||
# TODO: How to handle duration < 0
|
||||
# --> Replace with NA for now...
|
||||
flipCard_wide$duration <- ifelse(flipCard_wide$duration < 0,
|
||||
NA, flipCard_wide$duration)
|
||||
|
||||
dat_flipCard <- dat_flipCard[, c("event", "artwork", "trace",
|
||||
dat_flipCard <- flipCard_wide[, c("event", "artwork", "trace",
|
||||
"date.start", "date.stop",
|
||||
"time_ms.start", "time_ms.stop", "duration")]
|
||||
"time_ms.start", "time_ms.stop",
|
||||
"duration")]
|
||||
|
||||
rm(tmp, flipCard_wide)
|
||||
|
||||
#' ## Close openTopic
|
||||
|
||||
@ -332,19 +332,25 @@ tmp$eventid <- rep(seq_along(num_start), num_start)
|
||||
|
||||
tmp$event <- ifelse(tmp$event == "Artwork/OpenCard", "start", "stop")
|
||||
|
||||
dat_openTopic <- reshape(tmp, direction = "wide",
|
||||
openTopic_wide <- reshape(tmp, direction = "wide",
|
||||
idvar = c("eventid", "trace", "artwork", "card"),
|
||||
timevar = "event", drop = c("fileid", "popup"))
|
||||
dat_openTopic$event <- "openTopic"
|
||||
dat_openTopic$duration <- dat_openTopic$time_ms.stop - dat_openTopic$time_ms.start
|
||||
openTopic_wide$event <- "openTopic"
|
||||
openTopic_wide$duration <- openTopic_wide$time_ms.stop -
|
||||
openTopic_wide$time_ms.start
|
||||
|
||||
dat_openTopic$duration <- ifelse(dat_openTopic$duration < 0, NA, dat_openTopic$duration)
|
||||
# TODO: Wie mit duration < 0 umgehen? Einfach auf NA setzen? For now...
|
||||
openTopic_wide$duration <- ifelse(openTopic_wide$duration < 0,
|
||||
NA, openTopic_wide$duration)
|
||||
# TODO: How to handle duration < 0
|
||||
# --> Replace with NA for now...
|
||||
|
||||
dat_openTopic <- dat_openTopic[, c("event", "artwork", "card", "trace",
|
||||
dat_openTopic <- openTopic_wide[, c("event", "artwork", "card", "trace",
|
||||
"date.start", "date.stop",
|
||||
"time_ms.start", "time_ms.stop", "duration")]
|
||||
# TODO: card should have a unique identifier for each artwork!
|
||||
"time_ms.start", "time_ms.stop",
|
||||
"duration")]
|
||||
# TODO: card should have a unique identifier for each artwork
|
||||
|
||||
rm(openTopic_wide, num_start, tmp)
|
||||
|
||||
#' ## close openPopup
|
||||
tmp <- subset(df, df$event %in% c("ShowPopup", "HidePopup"))
|
||||
@ -352,15 +358,15 @@ tmp <- tmp[order(tmp$artwork, tmp$date), ]
|
||||
rownames(tmp) <- NULL
|
||||
|
||||
num_start <- diff(c(0, which(tmp$event == "HidePopup")))
|
||||
# last event is "ShowPopup"! Needs to be fixed TODO: Needs to be cought in
|
||||
# a function!
|
||||
# last event is "ShowPopup"! Needs to be fixed
|
||||
num_start <- c(num_start, 1)
|
||||
# TODO: Needs to be caught in a function
|
||||
|
||||
tmp$eventid <- rep(seq_along(num_start), num_start)
|
||||
|
||||
tmp$event <- ifelse(tmp$event == "ShowPopup", "start", "stop")
|
||||
|
||||
dat_openPopup <- reshape(tmp, direction = "wide",
|
||||
openPopup_wide <- reshape(tmp, direction = "wide",
|
||||
idvar = c("eventid", "trace", "artwork", "popup"),
|
||||
timevar = "event", drop = c("fileid", "card"))
|
||||
# there is a pathological entry which gets deleted...
|
||||
@ -370,20 +376,28 @@ dat_openPopup <- reshape(tmp, direction = "wide",
|
||||
df[df$trace == 1843, ]
|
||||
# WHY???
|
||||
|
||||
dat_openPopup$event <- "openPopup"
|
||||
dat_openPopup$duration <- dat_openPopup$time_ms.stop - dat_openPopup$time_ms.start
|
||||
openPopup_wide$event <- "openPopup"
|
||||
openPopup_wide$duration <- openPopup_wide$time_ms.stop -
|
||||
openPopup_wide$time_ms.start
|
||||
|
||||
dat_openPopup$duration <- ifelse(dat_openPopup$duration < 0, NA, dat_openPopup$duration)
|
||||
# TODO: Wie mit duration < 0 umgehen? Einfach auf NA setzen? For now...
|
||||
openPopup_wide$duration <- ifelse(openPopup_wide$duration < 0,
|
||||
NA, openPopup_wide$duration)
|
||||
# TODO: How to handle duration < 0
|
||||
# --> Replace with NA for now...
|
||||
|
||||
dat_openPopup <- dat_openPopup[, c("event", "artwork", "popup", "trace",
|
||||
dat_openPopup <- openPopup_wide[, c("event", "artwork", "popup", "trace",
|
||||
"date.start", "date.stop",
|
||||
"time_ms.start", "time_ms.stop", "duration")]
|
||||
"time_ms.start", "time_ms.stop",
|
||||
"duration")]
|
||||
rm(num_start, openPopup_wide, tmp)
|
||||
|
||||
|
||||
# Merge all
|
||||
system.time({
|
||||
dat_all <- merge(dat_trans, dat_flipCard, all = TRUE)
|
||||
dat_all <- merge(dat_all, dat_openTopic, all = TRUE)
|
||||
dat_all <- merge(dat_all, dat_openPopup, all = TRUE)
|
||||
})
|
||||
|
||||
# check
|
||||
nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) +
|
||||
@ -392,84 +406,25 @@ nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) +
|
||||
dat_all <- dat_all[order(dat_all$date.start), ]
|
||||
rownames(dat_all) <- NULL
|
||||
|
||||
# TODO: from here on NA... WHY??
|
||||
dat_all[19426:19435, ]
|
||||
|
||||
# TODO: Should card maybe also be filled in for "openPopup"?
|
||||
|
||||
# dat_all2 <- dplyr::full_join(dat_trans, dat_flipCard)
|
||||
# dat_all2 <- dplyr::full_join(dat_all, dat_openTopic)
|
||||
# dat_all2 <- dplyr::full_join(dat_all, dat_openPopup)
|
||||
#
|
||||
# nrow(dat_all2) == (nrow(dat_trans) + nrow(dat_flipCard) +
|
||||
# nrow(dat_openTopic) + nrow(dat_openPopup))
|
||||
#
|
||||
# dat_all2 <- dat_all2[order(dat_all2$date.start), ]
|
||||
# rownames(dat_all2) <- NULL
|
||||
# TODO: --> same result - but faster. Need it?
|
||||
# --> Would hate to depend on dplyr...
|
||||
|
||||
|
||||
|
||||
#' ## Remove "button presses"
|
||||
|
||||
# Sort data frame by artwork and date
|
||||
dat <- dat[order(dat$artwork, dat$date), ]
|
||||
|
||||
# remove "Transform start" and "Transform stop" following directly each
|
||||
# other, since I do not know how to interpret them as events
|
||||
id_start <- which(dat$event == "Transform start")
|
||||
id_stop <- which(dat$event == "Transform stop")
|
||||
|
||||
id_rm_start <- id_start[diff(id_start) == 1]
|
||||
id_rm_stop <- id_stop[diff(id_stop) == 1]
|
||||
|
||||
dat <- dat[-c(id_rm_start, id_rm_stop), ]
|
||||
rownames(dat) <- NULL
|
||||
|
||||
|
||||
id_start2 <- which(dat$event == "Transform start")
|
||||
id_stop2 <- which(dat$event == "Transform stop")
|
||||
|
||||
length(id_start2) - length(id_stop2)
|
||||
# 340 --> "starts too many"
|
||||
|
||||
# remove "Transform start" and "Transform stop" following directly each
|
||||
# other (but with events in between!)
|
||||
id_start_new <- id_start2
|
||||
id_stop_new <- id_stop2
|
||||
|
||||
for (i in 2:length(id_start_new)) {
|
||||
if (id_start_new[i-1] < id_stop_new[i-1] & id_start_new[i] < id_stop_new[i-1]) {
|
||||
id_start_new <- id_start_new[-(i-1)]
|
||||
} else if (id_start_new[i-1] > id_stop_new[i-1] & id_start_new[i] > id_stop_new[i-1]) {
|
||||
id_stop_new <- id_stop_new[-(i-1)]
|
||||
}
|
||||
}
|
||||
|
||||
length(id_start2) - length(id_start_new)
|
||||
length(id_stop2) - length(id_stop_new)
|
||||
|
||||
ids <- data.frame(start = id_start_new, stop = id_stop_new)
|
||||
ids$diff <- ids$stop - ids$start
|
||||
|
||||
table(ids$diff)
|
||||
|
||||
# remove "Transform start" and "Transform stop" around other events
|
||||
|
||||
id_rm_start2 <- id_start2[!(id_start2 %in% id_start_new)]
|
||||
id_rm_stop2 <- id_stop2[!(id_stop2 %in% id_stop_new)]
|
||||
|
||||
# TODO: It still does not work correctly:
|
||||
dat[64764:64769,]
|
||||
# time_ms event artwork popup x y scale rotation
|
||||
# 64764 473081 Transform start 052 052.xml 1958.65 1505.75 0.8234455 -0.1351998
|
||||
# 64765 474226 Show Info 052 052.xml NA NA NA NA
|
||||
# 64766 475735 Transform start 052 052.xml 1988.25 1625.25 0.9927645 2.4527958
|
||||
# 64767 475739 Transform stop 052 052.xml 1988.25 1625.25 0.9927645 2.4527958
|
||||
# 64768 479326 Artwork 052 052.xml NA NA NA NA
|
||||
# 64769 479751 Transform stop 052 052.xml 1660.90 1883.20 0.8074586 29.0875534
|
||||
|
||||
# --> but no idea how to find these cases in an automated way...
|
||||
|
||||
dat <- dat[-c(id_rm_start2, id_rm_stop2), ]
|
||||
# --> Every start ends with a stop now (but not necessarily the correct one!)
|
||||
|
||||
|
||||
dat1 <- dat[order(dat$date, dat$time_ms), ]
|
||||
dat1$time_diff <- c(NA, diff(dat1$time_ms))
|
||||
|
||||
boxplot(time_diff ~ as.Date(date), dat1[dat1$time_diff > 1000 & dat1$time_diff < 4000, ])
|
||||
|
||||
boxplot(time_ms ~ event, dat1)
|
||||
|
||||
|
||||
#' ## Plots
|
||||
|
||||
counts <- table(as.Date(dat$date), dat$event)
|
||||
@ -483,22 +438,6 @@ counts <- table(as.Date(dat$date[dat$event %in% start_events]),
|
||||
lattice::barchart(counts, auto.key = TRUE)
|
||||
|
||||
|
||||
# TODO: Do I want to "collapse" the data frame in a way, that I only have
|
||||
# one event for each "set", meaning
|
||||
#
|
||||
# * Transform start + Transform stop --> Transform
|
||||
# * Artwork/OpenCard + Artwork/CloseCard --> Show Subcard
|
||||
# * ShowPopup + HidePopup --> Show Popup
|
||||
# * Show Info + Show Front --> Flip Card
|
||||
# (s.o. ;))
|
||||
#
|
||||
# Then I would have meaningful variables like duration, distance, degree of
|
||||
# rotation, size of scaling, selection of Subcard etc.
|
||||
# This means that I would have to delete all "unclosed" events.
|
||||
|
||||
# Create a data frame with
|
||||
# case event attributes (can differ for different events)
|
||||
# ??
|
||||
# Is `artwork` my case? Or `artwork` per day? Or `artwork` per some other
|
||||
# unit??? Maybe look at differences between timestamps separately for
|
||||
# `artwork`? And identify "new observational unit" this way?
|
||||
@ -508,8 +447,6 @@ lattice::barchart(counts, auto.key = TRUE)
|
||||
# 2. Time interval of XX min within one `artwork` on the same day
|
||||
# corresponds to "observational unit change"
|
||||
|
||||
# id activity timestamp
|
||||
|
||||
# Split data frame in list of data frame which all correspond to one
|
||||
# artwork
|
||||
# dat_art <- split(dat, dat$artwork)
|
||||
@ -522,3 +459,8 @@ lattice::barchart(counts, auto.key = TRUE)
|
||||
#' can happen that the wrong tags have been put together (e.g., Transform
|
||||
#' start and Transform stop); therefore, durations etc. are only heuristic
|
||||
|
||||
# TODO: Add a case identifier based on timestamps
|
||||
# --> needs to be done on "raw data". Is it possible? Something seems
|
||||
# seriously wrong with `time_ms`
|
||||
|
||||
# TODO: Write function for closing events
|
||||
|
Loading…
Reference in New Issue
Block a user