diff --git a/README.md b/README.md index 50f3082..28bdeab 100644 --- a/README.md +++ b/README.md @@ -157,7 +157,37 @@ dat_all[735, ] # 1427 2016_11_15-12_12_57.log 2016-12-15 12:12:57 850 Transform stop 076 076.xml NA 2092.25 2008.00 0.2997107 13.26223362 ``` -## Events that only close (`date.start` is NA) +`time_ms` does not increase from log file to log file + +```{r} +tmp1 <- dat[!duplicated(dat$fileid), c("fileid", "time_ms", "event")] +tmp2 <- dat[!duplicated(dat$fileid, fromLast=T), c("fileid", "time_ms", "event")] +tmp <- rbind(tmp1, tmp2) +tmp <- tmp[order(tmp$fileid), ] +head(tmp, 50) + +plot(time_ms ~ as.factor(fileid), dat[1:2000, ], xlab = "fileid") +``` + +## x,y-coordinates outside of display range + +The display is a 4K-display with 3840 x 2160 pixels. When you plot the +start and stop coordinates, the display is clearly to distinguish. However, +a lot of points are outside of the display range. This can happen, when the +art objects are scaled and then moved to the very edge of the table. Then +it will record pixels outside of the table. These are actually valid data +points and I will leave them as is. + +```{r} +par(mfrow = c(1, 2)) +plot(y.start ~ x.start, dat) +abline(v = c(0, 3840), h = c(0, 2160), col = "blue", lwd = 2) +plot(y.stop ~ x.stop, dat) +abline(v = c(0, 3840), h = c(0, 2160), col = "blue", lwd = 2) + + +aggregate(cbind(x.start, x.stop, y.start, y.stop) ~ 1, dat, mean) +``` ## Timestamps repeat @@ -173,7 +203,44 @@ dat_all[735, ] ## Add moves to `trace` variable +## openPopup does not close correctly +The sorting had to include `popup` otherwise nested events could not be +closed correctly. + + ```{r} +# TODO: Some correct entries are not closed: +df[df$trace == 1843, ] +# WHY??? +# --> Wrong eventid! +dat5[dat5$trace == 1843, ] +openPopup_wide[openPopup_wide$trace == 1843, ] +``` +## Events that only close (`date.start` is NA) + +It looks like there is some kind of log error for the events that do not +have a start stop. I was able to get rid of most by sorting for `popup` for +the openPopup events, but there are still some left (50 for the small data +set, which corresponds to 0.2 per mill). + + ```{r} +# remove all events that do not have a `date.start` +dim(dat_all[is.na(dat_all$date.start), ]) +dat_all <- dat_all[!is.na(dat_all$date.start), ] +# TODO: Find out how it can be that there is only a `date.stop` +## --> happens, when event is not properly closed, see here: +df[df$trace == 1843, ] +dat_openPopup[dat_openPopup$trace == 1843, ] +## --> still 50 (small data set) left, and some really do not seem to be +## opened! Must be a log error +# --> others should be closed! +dat[31000:31019,] # this one e.g. +# --> Actually NOT! card gets flipped before! Again - log error! +``` +Will probably just get rid of them! + +Think about if you want give warning messages about these deletions in the +functions. # Reading list diff --git a/code/02_preprocessing.R b/code/02_preprocessing.R index 04151e3..a6438b6 100644 --- a/code/02_preprocessing.R +++ b/code/02_preprocessing.R @@ -30,6 +30,7 @@ dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";", header = TRUE) dat0$date <- as.POSIXct(dat0$date) # create date object +dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0) #' # Remove irrelevant events @@ -37,6 +38,7 @@ dat0$date <- as.POSIXct(dat0$date) # create date object dat <- subset(dat0, !(dat0$event %in% c("Start Application", "Show Application"))) +rownames(dat) <- NULL #' # Close events @@ -54,7 +56,7 @@ head(dat1[, c("event", "eventid")], 25) table(table(dat1$eventid)) # 1 2 3 4 5 6 7 8 10 11 -# 73 78429 5156 842 222 66 18 14 3 1 +# 70 78435 5153 842 222 66 18 14 3 1 # --> compare to table(num_start)! # Find out how often "Transform stop" follows each other @@ -80,18 +82,27 @@ id_rm_stop <- id_stop[diff(id_stop) == 1] dat1 <- dat1[-(id_rm_stop + 1), ] # transform to wide data format -dat1$event <- ifelse(dat1$event == "Transform start", "start", "stop") +dat1$time <- ifelse(dat1$event == "Transform start", "start", "stop") trans_wide <- reshape(dat1, direction = "wide", - idvar = c("eventid", "artwork"), - timevar = "event", drop = c("fileid", "popup", "card") + idvar = c("eventid", "artwork", "glossar"), + timevar = "time", + drop = c("popup", "card", "event") ) -# --> when fileid is part of the reshape, it does not work correctly, since -# we sometimes have a start - stop event that is recorded in two separate -# log files # TODO: This runs for quite some time # --> Is this more efficient with tidyr::pivot_wider? +# --> when fileid is part of the reshape, it does not work correctly, since +# we sometimes have a start - stop event that is recorded in two separate +# log files, BUT: after finding out, that `time_ms` changes for each log +# file, I want to exclude those cases, so `fileid` has to be included!!! + +# check how often an eventid is associated with two fileids +nrow(subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop)) + +# exclude from data set ?? +# trans_wide <- subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop) + # which(is.na(trans_wide$date.start)) trans_wide$event <- "move" @@ -116,38 +127,28 @@ trans_wide$popup <- NA dat_trans <- trans_wide[trans_wide$distance != 0 & trans_wide$rotationDegree != 0 & trans_wide$scaleSize != 1, - c("event", "artwork", "trace", "date.start", "date.stop", + c("fileid.start", "fileid.stop", "event", "artwork", + "trace", "glossar", "date.start", "date.stop", "time_ms.start", "time_ms.stop", "duration", - "card", "popup", - "x.start", "y.start", "x.stop", "y.stop", - "distance", "scale.start", "scale.stop", + "card", "popup", "x.start", "y.start", "x.stop", + "y.stop", "distance", "scale.start", "scale.stop", "scaleSize", "rotation.start", "rotation.stop", "rotationDegree")] +1 - nrow(dat_trans) / nrow(trans_wide) # removes almost 2/3 of the data (for small data set) rm(id_rm_stop, id_stop, trans_wide, num_start, num_stop) summary(dat_trans) -# TODO: Ask Phillip what is wrong with `time_ms` -# --> Hat er eine Erklärung dafür? - -#plot(time_ms.stop ~ time_ms.start, dat_trans, type = "b") -plot(time_ms.stop ~ time_ms.start, dat_trans, - col = rgb(red = 0, green = 0, blue = 0, alpha = 0.2)) - -plot(date.stop ~ date.start, dat_trans[1:1000,], type = "b") - - - #' # Close other events dat2 <- dat[!dat$event %in% c("Transform start", "Transform stop"), ] -dat2$x <- NULL -dat2$y <- NULL -dat2$scale <- NULL -dat2$rotation <- NULL +# dat2$x <- NULL +# dat2$y <- NULL +# dat2$scale <- NULL +# dat2$rotation <- NULL rownames(dat2) <- NULL # Create event ID for closing events @@ -202,8 +203,6 @@ lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ] head(dat2[, c("artwork", "event", "popup", "trace")], 20) -#df <- NULL - for (file in lut$glossar_file) { artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"]) @@ -231,16 +230,15 @@ for (file in lut$glossar_file) { (current_artwork %in% artwork_list) & dat2$popup[i] == file & (j-k == 0)) { - #df <- rbind(df, data.frame(file, current_artwork, i, j)) dat2[i, "trace"] <- dat2[j, "trace"] + dat2[i, "artwork"] <- current_artwork } } } -# dim(dat2[is.na(dat2$trace), ]) -# --> finds about half of the glossar entries for the small data set... -# dat2[apply(df[, c("j", "i")], 1, c), c("artwork", "event", "popup", "trace")] +# --> finds about half of the glossar entries for the small data set... +table(is.na(dat2[dat2$glossar == 1, "trace"])) # REMEMBER: It can never bo 100% correct, since it is always possible that # several cards are open and that they link to the same glossar entry @@ -251,45 +249,42 @@ lut[sapply(lut$artwork, length) == 1, "glossar_file"] # TODO: Fill in the ones that are associated with one artwork # --> Can't come up with something -- maybe ask AK??? -# TODO: How to check if one of the former "Show Infos" is correct -# --> Can't come up with something -- maybe ask AK??? +single <- lut[sapply(lut$artwork, length) == 1, "glossar_file"] +tmp <- subset(dat2, is.na(dat2$trace))$popup +inside <- unique(tmp[tmp %in% lut[sapply(lut$artwork, length) == 1, "glossar_file"]]) +single_art <- unlist(lut[lut$glossar_file %in% inside, "artwork"]) +tmp_lut <- data.frame(glossar_file = sort(inside), artwork = single_art) -# for (file in lut$glossar_file) { -# -# artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"]) -# -# for (i in seq_len(nrow(dat2))) { -# -# if (dat2$event[i] == "Show Info") { -# -# artworks <- NULL -# current_artwork <- dat2[i, "artwork"] -# j <- i -# -# } else { -# -# print(current_artwork) -# artworks <- c(artworks, dat2[i, "artwork"]) -# print(artworks) -# -# } -# -# # if (dat2$artwork[i] == "glossar" & -# # (current_artwork %in% artwork_list) & -# # dat2$popup[i] == file) { -# # -# # #df <- rbind(df, data.frame(file, current_artwork, i, j)) -# # dat2[i, "trace"] <- dat2[j, "trace"] -# -# # } -# } -# } -# correct: 17940 -# incorrect: 17963 +dat2[dat2$glossar == 1, c("artwork", "popup", "glossar", "trace")] + +for (file in tmp_lut$glossar_file) { + + for (i in seq_len(nrow(dat2))) { + + if (dat2$event[i] == "Artwork/OpenCard" & dat2$artwork[i] %in% tmp_lut$artwork) { + + current_artwork <- dat2[i, "artwork"] + j <- i + + } + + if (dat2$artwork[i] == "glossar" & + dat2$popup[i] == file) { + + dat2[i, "trace"] <- dat2[j, "trace"] + dat2[i, "artwork"] <- current_artwork + + } + } +} + + +dat2[14110:14130, ] + +# TODO: Integrate for loop into for loop above # TODO: "glossar" entry should be changed to the corresponding artwork -# TODO: Add additional variable `glossar` with 0/1 or similar instead # TODO: For now: Exclude not matched glossar entries @@ -304,12 +299,12 @@ rm(lut, current_artwork, file, glossar_dict, i, j, k, artwork_list, dat3 <- subset(df, df$event %in% c("Show Info", "Show Front")) -dat3$event <- ifelse(dat3$event == "Show Info", "start", "stop") +dat3$time <- ifelse(dat3$event == "Show Info", "start", "stop") flipCard_wide <- reshape(dat3, direction = "wide", - idvar = c("trace", "artwork"), - timevar = "event", - drop = c("fileid", "popup", "card")) + idvar = c("trace", "artwork", "glossar"), + timevar = "time", + drop = c("popup", "card")) flipCard_wide$event <- "flipCard" flipCard_wide$duration <- flipCard_wide$time_ms.stop - flipCard_wide$time_ms.start @@ -329,14 +324,15 @@ flipCard_wide$rotation.start <- NA flipCard_wide$rotation.stop <- NA flipCard_wide$rotationDegree <- NA -dat_flipCard <- flipCard_wide[, c("event", "artwork", "trace", - "date.start", "date.stop", - "time_ms.start", "time_ms.stop", - "duration", "card", "popup", - "x.start", "y.start", "x.stop", "y.stop", - "distance", "scale.start", "scale.stop", - "scaleSize", "rotation.start", - "rotation.stop", "rotationDegree")] +dat_flipCard <- flipCard_wide[, c("fileid.start", "fileid.stop", "event", + "artwork", "trace", "glossar", + "date.start", "date.stop", + "time_ms.start", "time_ms.stop", + "duration", "card", "popup", "x.start", + "y.start", "x.stop", "y.stop", + "distance", "scale.start", "scale.stop", + "scaleSize", "rotation.start", + "rotation.stop", "rotationDegree")] rm(flipCard_wide) @@ -349,11 +345,11 @@ rownames(dat4) <- NULL num_start <- diff(c(0, which(dat4$event == "Artwork/CloseCard"))) dat4$eventid <- rep(seq_along(num_start), num_start) -dat4$event <- ifelse(dat4$event == "Artwork/OpenCard", "start", "stop") +dat4$time <- ifelse(dat4$event == "Artwork/OpenCard", "start", "stop") openTopic_wide <- reshape(dat4, direction = "wide", - idvar = c("eventid", "trace", "artwork", "card"), - timevar = "event", drop = c("fileid", "popup")) + idvar = c("eventid", "trace", "glossar", "artwork", "card"), + timevar = "time", drop = "popup") openTopic_wide$event <- "openTopic" openTopic_wide$duration <- openTopic_wide$time_ms.stop - openTopic_wide$time_ms.start @@ -372,47 +368,46 @@ openTopic_wide$rotation.start <- NA openTopic_wide$rotation.stop <- NA openTopic_wide$rotationDegree <- NA -dat_openTopic <- openTopic_wide[, c("event", "artwork", "trace", - "date.start", "date.stop", - "time_ms.start", "time_ms.stop", - "duration", "card", "popup", "x.start", - "y.start", "x.stop", "y.stop", - "distance", "scale.start", "scale.stop", - "scaleSize", "rotation.start", - "rotation.stop", "rotationDegree")] +dat_openTopic <- openTopic_wide[, c("fileid.start", "fileid.stop", "event", + "artwork", "trace", "glossar", + "date.start", "date.stop", + "time_ms.start", "time_ms.stop", + "duration", "card", "popup", "x.start", + "y.start", "x.stop", "y.stop", + "distance", "scale.start", + "scale.stop", "scaleSize", + "rotation.start", "rotation.stop", + "rotationDegree")] # TODO: card should have a unique identifier for each artwork rm(openTopic_wide, num_start) #' ## close openPopup dat5 <- subset(df, df$event %in% c("ShowPopup", "HidePopup")) -dat5 <- dat5[order(dat5$artwork, dat5$date), ] +dat5 <- dat5[order(dat5$artwork, dat5$popup, dat5$date), ] rownames(dat5) <- NULL num_start <- diff(c(0, which(dat5$event == "HidePopup"))) -# last event is "ShowPopup"! Needs to be fixed -num_start <- c(num_start, 1) -# TODO: Needs to be caught in a function +# last event is "ShowPopup"! Needs to be fixed +# num_start <- c(num_start, 1) +# TODO: Needs to be caught in a function --> not anymore - still relevant??? dat5$eventid <- rep(seq_along(num_start), num_start) -dat5$event <- ifelse(dat5$event == "ShowPopup", "start", "stop") +dat5$time <- ifelse(dat5$event == "ShowPopup", "start", "stop") openPopup_wide <- reshape(dat5, direction = "wide", - idvar = c("eventid", "trace", "artwork", "popup"), - timevar = "event", drop = c("fileid", "card")) + idvar = c("eventid", "trace", "glossar", "artwork", "popup"), + timevar = "time", drop = "card") # there is a pathological entry which gets deleted... # df[df$trace == 4595, ] +# --> artwork 046 popup selene.xml gets opened twice -# TODO: Some correct entries are not closed: -df[df$trace == 1843, ] -# WHY??? openPopup_wide$event <- "openPopup" openPopup_wide$duration <- openPopup_wide$time_ms.stop - openPopup_wide$time_ms.start - openPopup_wide$card <- NA openPopup_wide$x.start <- NA openPopup_wide$x.stop <- NA @@ -426,14 +421,16 @@ openPopup_wide$rotation.start <- NA openPopup_wide$rotation.stop <- NA openPopup_wide$rotationDegree <- NA -dat_openPopup <- openPopup_wide[, c("event", "artwork", "trace", - "date.start", "date.stop", - "time_ms.start", "time_ms.stop", - "duration", "card", "popup", "x.start", - "y.start", "x.stop", "y.stop", - "distance", "scale.start", "scale.stop", - "scaleSize", "rotation.start", - "rotation.stop", "rotationDegree")] +dat_openPopup <- openPopup_wide[, c("fileid.start", "fileid.stop", "event", + "artwork", "trace", "glossar", + "date.start", "date.stop", + "time_ms.start", "time_ms.stop", + "duration", "card", "popup", "x.start", + "y.start", "x.stop", "y.stop", + "distance", "scale.start", + "scale.stop", "scaleSize", + "rotation.start", "rotation.stop", + "rotationDegree")] rm(num_start, openPopup_wide) @@ -443,14 +440,14 @@ rm(num_start, openPopup_wide) # dat_all <- merge(dat_all, dat_openTopic, all = TRUE) # dat_all <- merge(dat_all, dat_openPopup, all = TRUE) # }) -# +# # # check # nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) + # nrow(dat_openTopic) + nrow(dat_openPopup)) -# +# # dat_all <- dat_all[order(dat_all$date.start), ] # rownames(dat_all) <- NULL -# +# # TODO: from here on NA... WHY?? # dat_all[19426:19435, ] @@ -460,10 +457,10 @@ rm(num_start, openPopup_wide) # dat_all2 <- dplyr::full_join(dat_trans, dat_flipCard) # dat_all2 <- dplyr::full_join(dat_all, dat_openTopic) # dat_all2 <- dplyr::full_join(dat_all, dat_openPopup) -# +# # nrow(dat_all2) == (nrow(dat_trans) + nrow(dat_flipCard) + # nrow(dat_openTopic) + nrow(dat_openPopup)) -# +# # dat_all2 <- dat_all2[order(dat_all2$date.start), ] # rownames(dat_all2) <- NULL # TODO: --> same result - but faster. Need it? @@ -479,8 +476,22 @@ nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) + nrow(dat_openTopic) + nrow(dat_openPopup)) # remove all events that do not have a `date.start` +dim(dat_all[is.na(dat_all$date.start), ]) dat_all <- dat_all[!is.na(dat_all$date.start), ] -# TODO: Find out how it can be that there is only a `date.stop` +# There is only a `date.stop`, when event is not properly closed, see here: +df[df$trace == 1843, ] +dat_openPopup[dat_openPopup$trace == 1843, ] +## --> still 50 (small data set) left, and some really do not seem to be +## opened! Must be a log error +# --> others should be closed! +dat[31000:31019,] # this one e.g. +# --> Actually NOT! card gets flipped before! Again - log error! + + +# Remove durations when event spans more than one log file, since they are +# not interpretable +dat_all[which(dat_all$fileid.start != dat_all$fileid.stop), "duration"] <- NA + # sort by `start.date` dat_all <- dat_all[order(dat_all$date.start), ] @@ -521,7 +532,8 @@ head(dat_all[, c("event", "artwork", "trace", "date.start", "timediff", "case")] dat_all$eventid <- seq_len(nrow(dat_all)) -dat_all <- dat_all[, c("eventid", "case", "trace", "event", "artwork", +dat_all <- dat_all[, c("fileid.start", "fileid.stop", "eventid", "case", + "trace", "glossar", "event", "artwork", "date.start", "date.stop", "time_ms.start", "time_ms.stop", "duration", "card", "popup", "x.start", "y.start", "x.stop", "y.stop", @@ -574,6 +586,7 @@ for (case in cases) { } } } + # TODO: Get rid of the loops # --> This takes forever... @@ -587,25 +600,20 @@ for (case in cases) { # put glossar events back in -dat_all <- rbind(out, dat_all[dat_all$artwork == "glossar", ]) -dat_all <- dat_all[order(dat_all$date.start), ] -rownames(dat_all) <- NULL +#dat_all <- rbind(out, dat_all[dat_all$artwork == "glossar", ]) +out <- out[order(out$date.start), ] +rownames(out) <- NULL # Make `trace` a consecutive number -dat_all$trace <- as.numeric(as.factor(dat_all$trace)) - - -# TODO: How to handle duration < 0 -# --> Replace with NA for now... -dat_all$duration <- ifelse(dat_all$duration < 0, NA, dat_all$duration) +out$trace2 <- as.numeric(factor(out$trace, levels = unique(out$trace))) +#head(out[, c("trace", "trace2")], 50) #' # Export data -write.table(dat_all, "../data/event_logfiles.csv", +write.table(out, "../data/event_logfiles.csv", sep = ";", quote = FALSE, row.names = FALSE) - # Is `artwork` my case? Or `artwork` per day? Or `artwork` per some other # unit??? Maybe look at differences between timestamps separately for # `artwork`? And identify "new observational unit" this way? @@ -621,6 +629,3 @@ write.table(dat_all, "../data/event_logfiles.csv", # TODO: Write function for closing events - - - diff --git a/code/01b_investigate.R b/code/zz_investigate.R similarity index 96% rename from code/01b_investigate.R rename to code/zz_investigate.R index 870a30d..3cbb897 100644 --- a/code/01b_investigate.R +++ b/code/zz_investigate.R @@ -237,3 +237,12 @@ counts <- table(as.Date(dat$date[dat$event %in% start_events]), lattice::barchart(counts, auto.key = TRUE) +# TODO: Ask Phillip what is wrong with `time_ms` +# --> Hat er eine Erklärung dafür? + +#plot(time_ms.stop ~ time_ms.start, dat_trans, type = "b") +plot(time_ms.stop ~ time_ms.start, dat_trans, + col = rgb(red = 0, green = 0, blue = 0, alpha = 0.2)) + +plot(date.stop ~ date.start, dat_trans[1:1000,], type = "b") +