Worked on preprocessing TODOS; made progress with glossar entries and durations

This commit is contained in:
Nora Wickelmaier 2023-08-28 17:29:56 +02:00
parent e9120a2e4b
commit 2c4f48531a
3 changed files with 212 additions and 131 deletions

View File

@ -157,7 +157,37 @@ dat_all[735, ]
# 1427 2016_11_15-12_12_57.log 2016-12-15 12:12:57 850 Transform stop 076 076.xml NA 2092.25 2008.00 0.2997107 13.26223362
```
## Events that only close (`date.start` is NA)
`time_ms` does not increase from log file to log file
```{r}
# First logged row of every log file (`duplicated()` flags all but the
# first occurrence of each `fileid`)
tmp1 <- dat[!duplicated(dat$fileid), c("fileid", "time_ms", "event")]
# Last logged row of every log file (`fromLast = TRUE` flags all but the
# last occurrence instead)
tmp2 <- dat[!duplicated(dat$fileid, fromLast = TRUE),
            c("fileid", "time_ms", "event")]
# Stack both and order by `fileid`, so that the first and last `time_ms`
# of each log file end up next to each other
tmp <- rbind(tmp1, tmp2)
tmp <- tmp[order(tmp$fileid), ]
head(tmp, 50)
# `time_ms` per log file for the first 2000 rows of `dat`
plot(time_ms ~ as.factor(fileid), dat[1:2000, ], xlab = "fileid")
```
## x,y-coordinates outside of display range
The display is a 4K display with 3840 x 2160 pixels. When you plot the
start and stop coordinates, the display is clearly distinguishable. However,
a lot of points are outside of the display range. This can happen when the
art objects are scaled and then moved to the very edge of the table. Then
pixels outside of the table are recorded. These are actually valid data
points and I will leave them as is.
```{r}
par(mfrow = c(1, 2))
plot(y.start ~ x.start, dat)
abline(v = c(0, 3840), h = c(0, 2160), col = "blue", lwd = 2)
plot(y.stop ~ x.stop, dat)
abline(v = c(0, 3840), h = c(0, 2160), col = "blue", lwd = 2)
aggregate(cbind(x.start, x.stop, y.start, y.stop) ~ 1, dat, mean)
```
## Timestamps repeat
@ -173,7 +203,44 @@ dat_all[735, ]
## Add moves to `trace` variable
## openPopup does not close correctly
The sorting had to include `popup`; otherwise, nested events could not be
closed correctly.
```{r}
# TODO: Some correct entries are not closed:
df[df$trace == 1843, ]
# WHY???
# --> Wrong eventid!
dat5[dat5$trace == 1843, ]
openPopup_wide[openPopup_wide$trace == 1843, ]
```
## Events that only close (`date.start` is NA)
It looks like there is some kind of log error for the events that do not
have a start (only a stop). I was able to get rid of most of them by sorting
by `popup` for the openPopup events, but there are still some left (50 for
the small data set, which corresponds to 0.2 per mille).
```{r}
# remove all events that do not have a `date.start`
dim(dat_all[is.na(dat_all$date.start), ])
dat_all <- dat_all[!is.na(dat_all$date.start), ]
# TODO: Find out how it can be that there is only a `date.stop`
## --> happens, when event is not properly closed, see here:
df[df$trace == 1843, ]
dat_openPopup[dat_openPopup$trace == 1843, ]
## --> still 50 (small data set) left, and some really do not seem to be
## opened! Must be a log error
# --> others should be closed!
dat[31000:31019,] # this one e.g.
# --> Actually NOT! card gets flipped before! Again - log error!
```
Will probably just get rid of them!
Think about whether you want to give warning messages about these deletions
in the functions.
# Reading list

View File

@ -30,6 +30,7 @@
dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";",
header = TRUE)
dat0$date <- as.POSIXct(dat0$date) # create date object
dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
#' # Remove irrelevant events
@ -37,6 +38,7 @@ dat0$date <- as.POSIXct(dat0$date) # create date object
dat <- subset(dat0, !(dat0$event %in% c("Start Application",
"Show Application")))
rownames(dat) <- NULL
#' # Close events
@ -54,7 +56,7 @@ head(dat1[, c("event", "eventid")], 25)
table(table(dat1$eventid))
# 1 2 3 4 5 6 7 8 10 11
# 73 78429 5156 842 222 66 18 14 3 1
# 70 78435 5153 842 222 66 18 14 3 1
# --> compare to table(num_start)!
# Find out how often "Transform stop" follows each other
@ -80,18 +82,27 @@ id_rm_stop <- id_stop[diff(id_stop) == 1]
dat1 <- dat1[-(id_rm_stop + 1), ]
# transform to wide data format
dat1$event <- ifelse(dat1$event == "Transform start", "start", "stop")
dat1$time <- ifelse(dat1$event == "Transform start", "start", "stop")
trans_wide <- reshape(dat1, direction = "wide",
idvar = c("eventid", "artwork"),
timevar = "event", drop = c("fileid", "popup", "card")
idvar = c("eventid", "artwork", "glossar"),
timevar = "time",
drop = c("popup", "card", "event")
)
# --> when fileid is part of the reshape, it does not work correctly, since
# we sometimes have a start - stop event that is recorded in two separate
# log files
# TODO: This runs for quite some time
# --> Is this more efficient with tidyr::pivot_wider?
# --> when fileid is part of the reshape, it does not work correctly, since
# we sometimes have a start - stop event that is recorded in two separate
# log files, BUT: after finding out, that `time_ms` changes for each log
# file, I want to exclude those cases, so `fileid` has to be included!!!
# check how often an eventid is associated with two fileids
nrow(subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop))
# exclude from data set ??
# trans_wide <- subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop)
# which(is.na(trans_wide$date.start))
trans_wide$event <- "move"
@ -116,38 +127,28 @@ trans_wide$popup <- NA
dat_trans <- trans_wide[trans_wide$distance != 0 &
trans_wide$rotationDegree != 0 &
trans_wide$scaleSize != 1,
c("event", "artwork", "trace", "date.start", "date.stop",
c("fileid.start", "fileid.stop", "event", "artwork",
"trace", "glossar", "date.start", "date.stop",
"time_ms.start", "time_ms.stop", "duration",
"card", "popup",
"x.start", "y.start", "x.stop", "y.stop",
"distance", "scale.start", "scale.stop",
"card", "popup", "x.start", "y.start", "x.stop",
"y.stop", "distance", "scale.start", "scale.stop",
"scaleSize", "rotation.start", "rotation.stop",
"rotationDegree")]
1 - nrow(dat_trans) / nrow(trans_wide)
# removes almost 2/3 of the data (for small data set)
rm(id_rm_stop, id_stop, trans_wide, num_start, num_stop)
summary(dat_trans)
# TODO: Ask Phillip what is wrong with `time_ms`
# --> Hat er eine Erklärung dafür?
#plot(time_ms.stop ~ time_ms.start, dat_trans, type = "b")
plot(time_ms.stop ~ time_ms.start, dat_trans,
col = rgb(red = 0, green = 0, blue = 0, alpha = 0.2))
plot(date.stop ~ date.start, dat_trans[1:1000,], type = "b")
#' # Close other events
dat2 <- dat[!dat$event %in% c("Transform start", "Transform stop"), ]
dat2$x <- NULL
dat2$y <- NULL
dat2$scale <- NULL
dat2$rotation <- NULL
# dat2$x <- NULL
# dat2$y <- NULL
# dat2$scale <- NULL
# dat2$rotation <- NULL
rownames(dat2) <- NULL
# Create event ID for closing events
@ -202,8 +203,6 @@ lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ]
head(dat2[, c("artwork", "event", "popup", "trace")], 20)
#df <- NULL
for (file in lut$glossar_file) {
artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])
@ -231,16 +230,15 @@ for (file in lut$glossar_file) {
(current_artwork %in% artwork_list) &
dat2$popup[i] == file & (j-k == 0)) {
#df <- rbind(df, data.frame(file, current_artwork, i, j))
dat2[i, "trace"] <- dat2[j, "trace"]
dat2[i, "artwork"] <- current_artwork
}
}
}
# dim(dat2[is.na(dat2$trace), ])
# --> finds about half of the glossar entries for the small data set...
# dat2[apply(df[, c("j", "i")], 1, c), c("artwork", "event", "popup", "trace")]
# --> finds about half of the glossar entries for the small data set...
table(is.na(dat2[dat2$glossar == 1, "trace"]))
# REMEMBER: It can never be 100% correct, since it is always possible that
# several cards are open and that they link to the same glossar entry
@ -251,45 +249,42 @@ lut[sapply(lut$artwork, length) == 1, "glossar_file"]
# TODO: Fill in the ones that are associated with one artwork
# --> Can't come up with something -- maybe ask AK???
# TODO: How to check if one of the former "Show Infos" is correct
# --> Can't come up with something -- maybe ask AK???
single <- lut[sapply(lut$artwork, length) == 1, "glossar_file"]
tmp <- subset(dat2, is.na(dat2$trace))$popup
inside <- unique(tmp[tmp %in% lut[sapply(lut$artwork, length) == 1, "glossar_file"]])
single_art <- unlist(lut[lut$glossar_file %in% inside, "artwork"])
tmp_lut <- data.frame(glossar_file = sort(inside), artwork = single_art)
# for (file in lut$glossar_file) {
#
# artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])
#
# for (i in seq_len(nrow(dat2))) {
#
# if (dat2$event[i] == "Show Info") {
#
# artworks <- NULL
# current_artwork <- dat2[i, "artwork"]
# j <- i
#
# } else {
#
# print(current_artwork)
# artworks <- c(artworks, dat2[i, "artwork"])
# print(artworks)
#
# }
#
# # if (dat2$artwork[i] == "glossar" &
# # (current_artwork %in% artwork_list) &
# # dat2$popup[i] == file) {
# #
# # #df <- rbind(df, data.frame(file, current_artwork, i, j))
# # dat2[i, "trace"] <- dat2[j, "trace"]
#
# # }
# }
# }
# correct: 17940
# incorrect: 17963
dat2[dat2$glossar == 1, c("artwork", "popup", "glossar", "trace")]
for (file in tmp_lut$glossar_file) {
for (i in seq_len(nrow(dat2))) {
if (dat2$event[i] == "Artwork/OpenCard" & dat2$artwork[i] %in% tmp_lut$artwork) {
current_artwork <- dat2[i, "artwork"]
j <- i
}
if (dat2$artwork[i] == "glossar" &
dat2$popup[i] == file) {
dat2[i, "trace"] <- dat2[j, "trace"]
dat2[i, "artwork"] <- current_artwork
}
}
}
dat2[14110:14130, ]
# TODO: Integrate for loop into for loop above
# TODO: "glossar" entry should be changed to the corresponding artwork
# TODO: Add additional variable `glossar` with 0/1 or similar instead
# TODO: For now: Exclude not matched glossar entries
@ -304,12 +299,12 @@ rm(lut, current_artwork, file, glossar_dict, i, j, k, artwork_list,
dat3 <- subset(df, df$event %in% c("Show Info", "Show Front"))
dat3$event <- ifelse(dat3$event == "Show Info", "start", "stop")
dat3$time <- ifelse(dat3$event == "Show Info", "start", "stop")
flipCard_wide <- reshape(dat3, direction = "wide",
idvar = c("trace", "artwork"),
timevar = "event",
drop = c("fileid", "popup", "card"))
idvar = c("trace", "artwork", "glossar"),
timevar = "time",
drop = c("popup", "card"))
flipCard_wide$event <- "flipCard"
flipCard_wide$duration <- flipCard_wide$time_ms.stop -
flipCard_wide$time_ms.start
@ -329,14 +324,15 @@ flipCard_wide$rotation.start <- NA
flipCard_wide$rotation.stop <- NA
flipCard_wide$rotationDegree <- NA
dat_flipCard <- flipCard_wide[, c("event", "artwork", "trace",
"date.start", "date.stop",
"time_ms.start", "time_ms.stop",
"duration", "card", "popup",
"x.start", "y.start", "x.stop", "y.stop",
"distance", "scale.start", "scale.stop",
"scaleSize", "rotation.start",
"rotation.stop", "rotationDegree")]
dat_flipCard <- flipCard_wide[, c("fileid.start", "fileid.stop", "event",
"artwork", "trace", "glossar",
"date.start", "date.stop",
"time_ms.start", "time_ms.stop",
"duration", "card", "popup", "x.start",
"y.start", "x.stop", "y.stop",
"distance", "scale.start", "scale.stop",
"scaleSize", "rotation.start",
"rotation.stop", "rotationDegree")]
rm(flipCard_wide)
@ -349,11 +345,11 @@ rownames(dat4) <- NULL
num_start <- diff(c(0, which(dat4$event == "Artwork/CloseCard")))
dat4$eventid <- rep(seq_along(num_start), num_start)
dat4$event <- ifelse(dat4$event == "Artwork/OpenCard", "start", "stop")
dat4$time <- ifelse(dat4$event == "Artwork/OpenCard", "start", "stop")
openTopic_wide <- reshape(dat4, direction = "wide",
idvar = c("eventid", "trace", "artwork", "card"),
timevar = "event", drop = c("fileid", "popup"))
idvar = c("eventid", "trace", "glossar", "artwork", "card"),
timevar = "time", drop = "popup")
openTopic_wide$event <- "openTopic"
openTopic_wide$duration <- openTopic_wide$time_ms.stop -
openTopic_wide$time_ms.start
@ -372,47 +368,46 @@ openTopic_wide$rotation.start <- NA
openTopic_wide$rotation.stop <- NA
openTopic_wide$rotationDegree <- NA
dat_openTopic <- openTopic_wide[, c("event", "artwork", "trace",
"date.start", "date.stop",
"time_ms.start", "time_ms.stop",
"duration", "card", "popup", "x.start",
"y.start", "x.stop", "y.stop",
"distance", "scale.start", "scale.stop",
"scaleSize", "rotation.start",
"rotation.stop", "rotationDegree")]
dat_openTopic <- openTopic_wide[, c("fileid.start", "fileid.stop", "event",
"artwork", "trace", "glossar",
"date.start", "date.stop",
"time_ms.start", "time_ms.stop",
"duration", "card", "popup", "x.start",
"y.start", "x.stop", "y.stop",
"distance", "scale.start",
"scale.stop", "scaleSize",
"rotation.start", "rotation.stop",
"rotationDegree")]
# TODO: card should have a unique identifier for each artwork
rm(openTopic_wide, num_start)
#' ## close openPopup
dat5 <- subset(df, df$event %in% c("ShowPopup", "HidePopup"))
dat5 <- dat5[order(dat5$artwork, dat5$date), ]
dat5 <- dat5[order(dat5$artwork, dat5$popup, dat5$date), ]
rownames(dat5) <- NULL
num_start <- diff(c(0, which(dat5$event == "HidePopup")))
# last event is "ShowPopup"! Needs to be fixed
num_start <- c(num_start, 1)
# TODO: Needs to be caught in a function
# last event is "ShowPopup"! Needs to be fixed
# num_start <- c(num_start, 1)
# TODO: Needs to be caught in a function --> not anymore - still relevant???
dat5$eventid <- rep(seq_along(num_start), num_start)
dat5$event <- ifelse(dat5$event == "ShowPopup", "start", "stop")
dat5$time <- ifelse(dat5$event == "ShowPopup", "start", "stop")
openPopup_wide <- reshape(dat5, direction = "wide",
idvar = c("eventid", "trace", "artwork", "popup"),
timevar = "event", drop = c("fileid", "card"))
idvar = c("eventid", "trace", "glossar", "artwork", "popup"),
timevar = "time", drop = "card")
# there is a pathological entry which gets deleted...
# df[df$trace == 4595, ]
# --> artwork 046 popup selene.xml gets opened twice
# TODO: Some correct entries are not closed:
df[df$trace == 1843, ]
# WHY???
openPopup_wide$event <- "openPopup"
openPopup_wide$duration <- openPopup_wide$time_ms.stop -
openPopup_wide$time_ms.start
openPopup_wide$card <- NA
openPopup_wide$x.start <- NA
openPopup_wide$x.stop <- NA
@ -426,14 +421,16 @@ openPopup_wide$rotation.start <- NA
openPopup_wide$rotation.stop <- NA
openPopup_wide$rotationDegree <- NA
dat_openPopup <- openPopup_wide[, c("event", "artwork", "trace",
"date.start", "date.stop",
"time_ms.start", "time_ms.stop",
"duration", "card", "popup", "x.start",
"y.start", "x.stop", "y.stop",
"distance", "scale.start", "scale.stop",
"scaleSize", "rotation.start",
"rotation.stop", "rotationDegree")]
dat_openPopup <- openPopup_wide[, c("fileid.start", "fileid.stop", "event",
"artwork", "trace", "glossar",
"date.start", "date.stop",
"time_ms.start", "time_ms.stop",
"duration", "card", "popup", "x.start",
"y.start", "x.stop", "y.stop",
"distance", "scale.start",
"scale.stop", "scaleSize",
"rotation.start", "rotation.stop",
"rotationDegree")]
rm(num_start, openPopup_wide)
@ -443,14 +440,14 @@ rm(num_start, openPopup_wide)
# dat_all <- merge(dat_all, dat_openTopic, all = TRUE)
# dat_all <- merge(dat_all, dat_openPopup, all = TRUE)
# })
#
#
# # check
# nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) +
# nrow(dat_openTopic) + nrow(dat_openPopup))
#
#
# dat_all <- dat_all[order(dat_all$date.start), ]
# rownames(dat_all) <- NULL
#
#
# TODO: from here on NA... WHY??
# dat_all[19426:19435, ]
@ -460,10 +457,10 @@ rm(num_start, openPopup_wide)
# dat_all2 <- dplyr::full_join(dat_trans, dat_flipCard)
# dat_all2 <- dplyr::full_join(dat_all, dat_openTopic)
# dat_all2 <- dplyr::full_join(dat_all, dat_openPopup)
#
#
# nrow(dat_all2) == (nrow(dat_trans) + nrow(dat_flipCard) +
# nrow(dat_openTopic) + nrow(dat_openPopup))
#
#
# dat_all2 <- dat_all2[order(dat_all2$date.start), ]
# rownames(dat_all2) <- NULL
# TODO: --> same result - but faster. Need it?
@ -479,8 +476,22 @@ nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) +
nrow(dat_openTopic) + nrow(dat_openPopup))
# remove all events that do not have a `date.start`
dim(dat_all[is.na(dat_all$date.start), ])
dat_all <- dat_all[!is.na(dat_all$date.start), ]
# TODO: Find out how it can be that there is only a `date.stop`
# There is only a `date.stop`, when event is not properly closed, see here:
df[df$trace == 1843, ]
dat_openPopup[dat_openPopup$trace == 1843, ]
## --> still 50 (small data set) left, and some really do not seem to be
## opened! Must be a log error
# --> others should be closed!
dat[31000:31019,] # this one e.g.
# --> Actually NOT! card gets flipped before! Again - log error!
# Remove durations when event spans more than one log file, since they are
# not interpretable
dat_all[which(dat_all$fileid.start != dat_all$fileid.stop), "duration"] <- NA
# sort by `date.start`
dat_all <- dat_all[order(dat_all$date.start), ]
@ -521,7 +532,8 @@ head(dat_all[, c("event", "artwork", "trace", "date.start", "timediff", "case")]
dat_all$eventid <- seq_len(nrow(dat_all))
dat_all <- dat_all[, c("eventid", "case", "trace", "event", "artwork",
dat_all <- dat_all[, c("fileid.start", "fileid.stop", "eventid", "case",
"trace", "glossar", "event", "artwork",
"date.start", "date.stop", "time_ms.start",
"time_ms.stop", "duration", "card", "popup",
"x.start", "y.start", "x.stop", "y.stop",
@ -574,6 +586,7 @@ for (case in cases) {
}
}
}
# TODO: Get rid of the loops
# --> This takes forever...
@ -587,25 +600,20 @@ for (case in cases) {
# put glossar events back in
dat_all <- rbind(out, dat_all[dat_all$artwork == "glossar", ])
dat_all <- dat_all[order(dat_all$date.start), ]
rownames(dat_all) <- NULL
#dat_all <- rbind(out, dat_all[dat_all$artwork == "glossar", ])
out <- out[order(out$date.start), ]
rownames(out) <- NULL
# Make `trace` a consecutive number
dat_all$trace <- as.numeric(as.factor(dat_all$trace))
# TODO: How to handle duration < 0
# --> Replace with NA for now...
dat_all$duration <- ifelse(dat_all$duration < 0, NA, dat_all$duration)
out$trace2 <- as.numeric(factor(out$trace, levels = unique(out$trace)))
#head(out[, c("trace", "trace2")], 50)
#' # Export data
write.table(dat_all, "../data/event_logfiles.csv",
write.table(out, "../data/event_logfiles.csv",
sep = ";", quote = FALSE, row.names = FALSE)
# Is `artwork` my case? Or `artwork` per day? Or `artwork` per some other
# unit??? Maybe look at differences between timestamps separately for
# `artwork`? And identify "new observational unit" this way?
@ -621,6 +629,3 @@ write.table(dat_all, "../data/event_logfiles.csv",
# TODO: Write function for closing events

View File

@ -237,3 +237,12 @@ counts <- table(as.Date(dat$date[dat$event %in% start_events]),
lattice::barchart(counts, auto.key = TRUE)
# TODO: Ask Phillip what is wrong with `time_ms`
# --> Does he have an explanation for this?
#plot(time_ms.stop ~ time_ms.start, dat_trans, type = "b")
plot(time_ms.stop ~ time_ms.start, dat_trans,
col = rgb(red = 0, green = 0, blue = 0, alpha = 0.2))
plot(date.stop ~ date.start, dat_trans[1:1000,], type = "b")