From c63c0a8206fe1e511478d4e292ca035f418beaad Mon Sep 17 00:00:00 2001 From: nwickel Date: Mon, 14 Aug 2023 16:57:03 +0200 Subject: [PATCH] Worked on preprocessing; added trace variable; tried out modeling; prepared questions for PM --- README.md | 103 +++++++++++ code/01_parse-logfiles.R | 15 +- code/01b_investigate.R | 15 ++ code/02_preprocessing.R | 248 +++++++++++++++++++++----- code/03_modeling.R | 115 ++++++++++++ code/questions_data-inconsistencies.R | 125 +++++++++++++ code/questions_programming-input.R | 111 ++++++++++++ 7 files changed, 683 insertions(+), 49 deletions(-) create mode 100644 README.md create mode 100644 code/03_modeling.R create mode 100644 code/questions_data-inconsistencies.R create mode 100644 code/questions_programming-input.R diff --git a/README.md b/README.md new file mode 100644 index 0000000..030d192 --- /dev/null +++ b/README.md @@ -0,0 +1,103 @@ +# Offene Fragen + +## Datenverständnis + +* Welche Einheit haben x und y? Pixel? +* Welche Einheit hat scale? +* rotation wirklich degree? +* Nach welchem Zeitintervall resettet sich der Tisch wieder in die + Ausgangskonfiguration? + +## Tisch-Software + +* Gibt es Doku für die Bilder, die über die xml files hinausgeht? Sowas wie + ein Manual oder ähnliches? +* Gibt es evtl. irgendwo noch ein Tablet mit der Anwendung drauf? +* Was bedeuten die Farben der Topic Cards? + +## Event Logs + +* Wie gehen wir mit "nicht geschlossenen" Events um? Einfach rauslöschen? + - für Transform tendiere ich zu ja, weil sonst total uninteressant + - bei flipCard bin ich nicht so sicher... Aber man kann dann keine + duration berechnen, wäre NA +* Moves/scales/rotations ohne Veränderung würde ich auf jeden Fall + rauslöschen +* Es ist nicht möglich (bzw. ich weiß nicht wie) zusammengehörige Events + eineindeutig zu identifizieren + - nach Heuristik vorgehen? Doppelte Transformation start und stop einfach + raus? + - Daten sind nicht "fehlerfrei"; es gibt z.B. Transformation-Events wo + das Ende nicht geloggt wurde +* Wie identifiziere ich eine "Interaktionseinheit"? + - Was ist ein "case"? + - Eher grob über Zeitintervalle? + - Noch irgendeine andere Idee? +* Herausfinden, ob mehr als eine Person am Tisch steht? + - Sliding window, in der Anzahl von Artworks gezählt wird? Oder wie weit + angefasste Artworks voneinander entfernt sind? + - Man kann sowas schon "sehen" in den Logs - aber wie kann ich es + automatisiert rausziehen? Was ist meine Definition von + "Interaktionsboost"? + - Egal wie wir es machen, geht es auf den "Event-Log-Daten"? +* Anreicherung der Log-Daten mit weiteren Metadaten? Was wäre interessant? + - Metadata on artworks like, name, artist, type of artwork, epoch, etc. +ˆ - School vacations and holidays +ˆ - Special exhibits at the museum +ˆ - Number of visitors per day +ˆ - Age structure of visitors per day? + - ... ???? + +## HAUM + +* Bei Sven noch mal nachhaken wegen Besucherzahlen? + + + + +# Problems and how I handled them + +This lists some problems with the log data that required decisions. These +decisions influence the outcome and maybe even the data quality. Hence, I +tried to document how I handled these problems and explain the decisions I +made. + +## Weird behavior of `time_ms` and neg. `duration`values + + +## Events that only close (`date.start` is NA) + + +## Timestamps repeat + + + +## Popups from glossar cannot be assigned to a specific artwork + + +## Assign a case variable based on "time heuristic" + +## A `move`event does not record any change + +## Add moves to `trace` variable + + + + +# Reading list + +* @Arizmendi2022 [$-$] +* @Bannert2014 [x] +* @Bousbia2010 [$-$] +* @Cerezo2020 +* @GerjetsSchwan2021 [x] +* @Goldhammer2020 +* @Guenther2007 +* @HuberBannert2023 [x] +* @Kroehne2018 +* @SchwanGerjets2021 [x] +* @vanderAalst2016 [Chap. 2, x] +* @vanderAalst2016 [Chap. 3] +* @vanderAalst2016 [Chap. 5, x] +* @Wang2019 + diff --git a/code/01_parse-logfiles.R b/code/01_parse-logfiles.R index 24d7b11..340e022 100644 --- a/code/01_parse-logfiles.R +++ b/code/01_parse-logfiles.R @@ -4,8 +4,7 @@ #' date: "`r Sys.Date()`" #' output: #' html_document: -#' toc: true -#' toc_float: true +#' default #' pdf_document: #' toc: true #' number_sections: true @@ -17,8 +16,6 @@ #+ setup, include = FALSE knitr::opts_chunk$set(warning = FALSE, message = FALSE) -#' # Preprocessing raw log files into data frame - #' The following events can be extracted from the log files: #' #' ``` @@ -36,8 +33,8 @@ knitr::opts_chunk$set(warning = FALSE, message = FALSE) #' Choose which folders with raw log files should be included: -folders <- "all" -#folders <- "_2016b" +#folders <- "all" +folders <- "_2016b" dirpaths <- paste0("../data/haum_logs_2016-2023/", folders) @@ -71,7 +68,7 @@ dat <- subset(dat, dat$logs != "") d2 <- dim(dat)[1] #' The files contain `r d1-d2` corrupt lines that were remooved from the data. - +#' #' ### Extract relevant infos @@ -136,6 +133,8 @@ dat <- dat[order(dat$date), ] ## TODO: Replace artwork and popup numbers with informative strings -write.table(dat, "../data/rawdata_logfiles.csv", +#' ### Save data frame + +write.table(dat, "../data/rawdata_logfiles_small.csv", sep = ";", quote = FALSE, row.names = FALSE) diff --git a/code/01b_investigate.R b/code/01b_investigate.R index 166fbc3..870a30d 100644 --- a/code/01b_investigate.R +++ b/code/01b_investigate.R @@ -222,3 +222,18 @@ lattice::barchart(counts, auto.key = TRUE) #' can happen that the wrong tags have been put together (e.g., Transform #' start and Transform stop); therefore, durations etc. are only heuristic + + +#' ## Plots + +counts <- table(as.Date(dat$date), dat$event) +lattice::barchart(counts, auto.key = TRUE) + + +start_events <- c("Transform start", "Show Info", "ShowPopup", "Artwork/OpenCard") + +counts <- table(as.Date(dat$date[dat$event %in% start_events]), + dat$event[dat$event %in% start_events]) +lattice::barchart(counts, auto.key = TRUE) + + diff --git a/code/02_preprocessing.R b/code/02_preprocessing.R index b097784..58e2640 100644 --- a/code/02_preprocessing.R +++ b/code/02_preprocessing.R @@ -109,11 +109,16 @@ trans_wide$rotationDegree <- trans_wide$rotation.stop - trans_wide$rotation.start trans_wide$scaleSize <- trans_wide$scale.stop - trans_wide$scale.start -dat_trans <- dat_trans[trans_wide$distance != 0 & +trans_wide$trace <- NA +trans_wide$card <- NA +trans_wide$popup <- NA + +dat_trans <- trans_wide[trans_wide$distance != 0 & trans_wide$rotationDegree != 0 & trans_wide$scaleSize != 0, - c("event", "artwork", "date.start", "date.stop", + c("event", "artwork", "trace", "date.start", "date.stop", "time_ms.start", "time_ms.stop", "duration", + "card", "popup", "x.start", "y.start", "x.stop", "y.stop", "distance", "scale.start", "scale.stop", "scaleSize", "rotation.start", "rotation.stop", @@ -128,7 +133,8 @@ summary(dat_trans) # --> Hat er eine Erklärung dafür? #plot(time_ms.stop ~ time_ms.start, dat_trans, type = "b") -plot(time_ms.stop ~ time_ms.start, dat_trans, col = rgb(red = 0, green = 0, blue = 0, alpha = 0.2)) +plot(time_ms.stop ~ time_ms.start, dat_trans, + col = rgb(red = 0, green = 0, blue = 0, alpha = 0.2)) plot(date.stop ~ date.start, dat_trans[1:1000,], type = "b") @@ -248,7 +254,7 @@ lut[sapply(lut$artwork, length) == 1, "glossar_file"] # TODO: Fill in the ones that are associated with one artwork # --> Can't come up with something -- maybe ask AK??? -# TODO: How to check if one of the former "Show Infos" is correct on +# TODO: How to check if one of the former "Show Infos" is correct # --> Can't come up with something -- maybe ask AK??? # for (file in lut$glossar_file) { @@ -285,6 +291,8 @@ lut[sapply(lut$artwork, length) == 1, "glossar_file"] # correct: 17940 # incorrect: 17963 +# TODO: "glossar" entry should be changed to the corresponding artwork +# TODO: Add additional variable `glossar` with 0/1 or similar instead # TODO: For now: Exclude not matched glossar entries @@ -314,10 +322,28 @@ flipCard_wide$duration <- flipCard_wide$time_ms.stop - flipCard_wide$duration <- ifelse(flipCard_wide$duration < 0, NA, flipCard_wide$duration) +flipCard_wide$card <- NA +flipCard_wide$popup <- NA +flipCard_wide$x.start <- NA +flipCard_wide$x.stop <- NA +flipCard_wide$y.start <- NA +flipCard_wide$y.stop <- NA +flipCard_wide$distance <- NA +flipCard_wide$scale.start <- NA +flipCard_wide$scale.stop <- NA +flipCard_wide$scaleSize <- NA +flipCard_wide$rotation.start <- NA +flipCard_wide$rotation.stop <- NA +flipCard_wide$rotationDegree <- NA + dat_flipCard <- flipCard_wide[, c("event", "artwork", "trace", "date.start", "date.stop", "time_ms.start", "time_ms.stop", - "duration")] + "duration", "card", "popup", + "x.start", "y.start", "x.stop", "y.stop", + "distance", "scale.start", "scale.stop", + "scaleSize", "rotation.start", + "rotation.stop", "rotationDegree")] rm(tmp, flipCard_wide) @@ -344,10 +370,27 @@ openTopic_wide$duration <- ifelse(openTopic_wide$duration < 0, # TODO: How to handle duration < 0 # --> Replace with NA for now... -dat_openTopic <- openTopic_wide[, c("event", "artwork", "card", "trace", +openTopic_wide$popup <- NA +openTopic_wide$x.start <- NA +openTopic_wide$x.stop <- NA +openTopic_wide$y.start <- NA +openTopic_wide$y.stop <- NA +openTopic_wide$distance <- NA +openTopic_wide$scale.start <- NA +openTopic_wide$scale.stop <- NA +openTopic_wide$scaleSize <- NA +openTopic_wide$rotation.start <- NA +openTopic_wide$rotation.stop <- NA +openTopic_wide$rotationDegree <- NA + +dat_openTopic <- openTopic_wide[, c("event", "artwork", "trace", "date.start", "date.stop", "time_ms.start", "time_ms.stop", - "duration")] + "duration", "card", "popup", "x.start", + "y.start", "x.stop", "y.stop", + "distance", "scale.start", "scale.stop", + "scaleSize", "rotation.start", + "rotation.stop", "rotationDegree")] # TODO: card should have a unique identifier for each artwork rm(openTopic_wide, num_start, tmp) @@ -385,29 +428,47 @@ openPopup_wide$duration <- ifelse(openPopup_wide$duration < 0, # TODO: How to handle duration < 0 # --> Replace with NA for now... -dat_openPopup <- openPopup_wide[, c("event", "artwork", "popup", "trace", +openPopup_wide$card <- NA +openPopup_wide$x.start <- NA +openPopup_wide$x.stop <- NA +openPopup_wide$y.start <- NA +openPopup_wide$y.stop <- NA +openPopup_wide$distance <- NA +openPopup_wide$scale.start <- NA +openPopup_wide$scale.stop <- NA +openPopup_wide$scaleSize <- NA +openPopup_wide$rotation.start <- NA +openPopup_wide$rotation.stop <- NA +openPopup_wide$rotationDegree <- NA + +dat_openPopup <- openPopup_wide[, c("event", "artwork", "trace", "date.start", "date.stop", "time_ms.start", "time_ms.stop", - "duration")] + "duration", "card", "popup", "x.start", + "y.start", "x.stop", "y.stop", + "distance", "scale.start", "scale.stop", + "scaleSize", "rotation.start", + "rotation.stop", "rotationDegree")] rm(num_start, openPopup_wide, tmp) # Merge all -system.time({ -dat_all <- merge(dat_trans, dat_flipCard, all = TRUE) -dat_all <- merge(dat_all, dat_openTopic, all = TRUE) -dat_all <- merge(dat_all, dat_openPopup, all = TRUE) -}) - -# check -nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) + - nrow(dat_openTopic) + nrow(dat_openPopup)) - -dat_all <- dat_all[order(dat_all$date.start), ] -rownames(dat_all) <- NULL +# system.time({ +# dat_all <- merge(dat_trans, dat_flipCard, all = TRUE) +# dat_all <- merge(dat_all, dat_openTopic, all = TRUE) +# dat_all <- merge(dat_all, dat_openPopup, all = TRUE) +# }) +# +# # check +# nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) + +# nrow(dat_openTopic) + nrow(dat_openPopup)) +# +# dat_all <- dat_all[order(dat_all$date.start), ] +# rownames(dat_all) <- NULL +# # TODO: from here on NA... WHY?? -dat_all[19426:19435, ] +# dat_all[19426:19435, ] # TODO: Should card maybe also be filled in for "openPopup"? @@ -423,19 +484,135 @@ dat_all[19426:19435, ] # TODO: --> same result - but faster. Need it? # --> Would hate to depend on dplyr... +#' ## Use `rbind()` instead... +# --> unbeatable in terms of time! + +dat_all <- rbind(dat_trans, dat_flipCard, dat_openTopic, dat_openPopup) + +# check +nrow(dat_all) == (nrow(dat_trans) + nrow(dat_flipCard) + + nrow(dat_openTopic) + nrow(dat_openPopup)) + +# remove all events that do not have a `date.start` +dat_all <- dat_all[!is.na(dat_all$date.start), ] +# TODO: Find out how it can be that there is only a `date.stop` + +# sort by `start.date` +dat_all <- dat_all[order(dat_all$date.start), ] +rownames(dat_all) <- NULL + +ind <- rowSums(is.na(dat_all)) == ncol(dat_all) +any(ind) +dat_all[ind, ] +# --> No rows with only NA, as it should be. + +summary(dat_all) # OK, this actually makes a lot of sense :) + +#' ## Create case variable + +#dat_all$timediff <- as.numeric(dat_all$date.stop - dat_all$date.start) + +dat_all$timediff <- as.numeric(diff(c(dat_all$date.start[1], dat_all$date.start))) + +hist(dat_all$timediff[dat_all$timediff < 40], breaks = 50) -#' ## Plots +# TODO: What is the best choice for the cutoff here? I took 20 secs for now +dat_all$case <- NA +j <- 1 -counts <- table(as.Date(dat$date), dat$event) -lattice::barchart(counts, auto.key = TRUE) +for (i in seq_len(nrow(dat_all))) { + if (dat_all$timediff[i] < 21) { + dat_all$case[i] <- j + } else { + j <- j + 1 + dat_all$case[i] <- j + } +} +head(dat_all[, c("event", "artwork", "trace", "date.start", "timediff", "case")], 100) -start_events <- c("Transform start", "Show Info", "ShowPopup", "Artwork/OpenCard") +#' ## Add event ID -counts <- table(as.Date(dat$date[dat$event %in% start_events]), - dat$event[dat$event %in% start_events]) -lattice::barchart(counts, auto.key = TRUE) +dat_all$eventid <- seq_len(nrow(dat_all)) + +dat_all <- dat_all[, c("eventid", "case", "trace", "event", "artwork", + "date.start", "date.stop", "time_ms.start", + "time_ms.stop", "duration", "card", "popup", + "x.start", "y.start", "x.stop", "y.stop", + "distance", "scale.start", "scale.stop", + "scaleSize", "rotation.start", "rotation.stop", + "rotationDegree")] + +#' ## Add `trace` numbers for `move` events + +# when case and artwork are identical and there is only 1 trace value +# --> assign it to all `move` events for that case and artwork +# when case and artwork are identical and there is more than 1 trace value +# --> assign the `trace` value that was right before this `move` event +# (could, of course, also be after) + +cases <- unique(dat_all$case) +aws <- unique(dat_all$artwork)[unique(dat_all$artwork) != "glossar"] +max_trace <- max(dat_all$trace, na.rm = TRUE) + 1 +out <- NULL + +for (case in cases) { + for (art in aws) { + tmp <- dat_all[dat_all$case == case & dat_all$artwork == art, ] + if (nrow(tmp) != 0) { + + if (length(na.omit(unique(tmp$trace))) == 1) { + tmp[tmp$event == "move", "trace"] <- na.omit(unique(tmp$trace)) + } else if (length(na.omit(unique(tmp$trace))) > 1) { + for (i in 1:nrow(tmp)) { + if (tmp$event[i] == "move") { + if (i == 1) { + tmp$trace[i] <- na.omit(unique(tmp$trace))[1] + } else { + tmp$trace[i] <- tmp$trace[i - 1] + } + } + } + } else if (all(is.na(tmp$trace))) { + for (i in 1:nrow(tmp)) { + if (tmp$event[i] == "move") { + tmp$trace[i] <- max_trace + } + } + } + max_trace <- max_trace + 1 + } + if (nrow(tmp) > 0) { + #print(tmp[, c("case", "event", "trace", "artwork")]) + out <- rbind(out, tmp) + } + } +} +# TODO: Get rid of the loops +# --> This takes forever... + +#head(out[, c("time_ms.start", "case", "trace", "event", "artwork")], 55) + +#head(dat_all[dat_all$artwork %in% "501", c("time_ms.start", "case", "trace", "event", "artwork")], 50) + +# identical(dat_all[which(!dat_all$eventid %in% out$eventid), ], +# dat_all[dat_all$artwork == "glossar", ]) +# --> TRUE + +# put glossar events back in + +dat_all <- rbind(out, dat_all[dat_all$artwork == "glossar", ]) +dat_all <- dat_all[order(dat_all$date.start), ] +rownames(dat_all) <- NULL + +# Make `trace` a consecutive number +dat_all$trace <- as.numeric(as.factor(dat_all$trace)) + +#' # Export data + +write.table(dat_all, "../data/event_logfiles.csv", + sep = ";", quote = FALSE, row.names = FALSE) # Is `artwork` my case? Or `artwork` per day? Or `artwork` per some other @@ -451,16 +628,5 @@ lattice::barchart(counts, auto.key = TRUE) # artwork # dat_art <- split(dat, dat$artwork) -## --> Maybe need it at some point? - -#' # Problems - -#' * Opening and closing of events cannot be identified unambiguously; it -#' can happen that the wrong tags have been put together (e.g., Transform -#' start and Transform stop); therefore, durations etc. are only heuristic - -# TODO: Add a case identifier based on timestamps -# --> needs to be done on "raw data". Is it possible? Something seems -# seriously wrong with `time_ms` - # TODO: Write function for closing events + diff --git a/code/03_modeling.R b/code/03_modeling.R new file mode 100644 index 0000000..36d087d --- /dev/null +++ b/code/03_modeling.R @@ -0,0 +1,115 @@ +#' --- +#' title: "Modelling log files with Process Mining" +#' author: "Nora Wickelmaier" +#' date: "`r Sys.Date()`" +#' output: +#' html_document: +#' toc: true +#' toc_float: true +#' pdf_document: +#' toc: true +#' number_sections: true +#' geometry: margin = 2.5cm +#' --- + +# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code") + +#' # Read data + +dat <- read.table("../data/event_logfiles.csv", sep = ";", header = TRUE) +dat$date.start <- as.POSIXct(dat$date.start) +dat$date.stop <- as.POSIXct(dat$date.stop) + +#' # Creating event logs + +library(bupaverse) + +names(dat)[6:7] <- c("start", "complete") + +table(table(dat$start)) +# --> hmm... + +summary(aggregate(duration ~ trace, dat, mean)) + + +alog <- activitylog(dat, + case_id = "trace", + activity_id = "event", + #resource_id = "case", + resource_id = "artwork", + timestamps = c("start", "complete")) + +# --> have not understood, yet, which ist what... + +process_map(alog) + +process_map(alog, frequency("relative")) +process_map(alog, frequency("relative_consequent")) + +library(processanimateR) + +animate_process(to_eventlog(alog)) + +col_vector <- c("#7FC97F", "#BEAED4", "#FDC086", "#FFFF99", "#386CB0", + "#F0027F", "#BF5B17", "#666666", "#1B9E77", "#D95F02", + "#7570B3", "#E7298A", "#66A61E", "#E6AB02", "#A6761D", + "#666666", "#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", + "#FB9A99", "#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", + "#6A3D9A", "#FFFF99", "#B15928", "#FBB4AE", "#B3CDE3", + "#CCEBC5", "#DECBE4", "#FED9A6", "#FFFFCC", "#E5D8BD", + "#FDDAEC", "#F2F2F2", "#B3E2CD", "#FDCDAC", "#CBD5E8", + "#F4CAE4", "#E6F5C9", "#FFF2AE", "#F1E2CC", "#CCCCCC", + "#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", + "#FFFF33", "#A65628", "#F781BF", "#999999", "#66C2A5", + "#FC8D62", "#8DA0CB", "#E78AC3", "#A6D854", "#FFD92F", + "#E5C494", "#B3B3B3", "#8DD3C7", "#FFFFB3", "#BEBADA", + "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", + "#D9D9D9") + +animate_process(to_eventlog(alog), mode = "relative", jitter = 10, legend = "color", + mapping = token_aes(color = token_scale("artwork", + scale = "ordinal", + range = col_vector))) + +elog <- to_eventlog(alog) +animate_process(elog[elog$artwork == "054", ]) +animate_process(elog[elog$artwork == "080", ]) +animate_process(elog[elog$artwork == "501", ]) + +process_map(alog[alog$artwork == "054", ]) + +animate_process(elog[elog$artwork %in% c("080", "054"), ], + mode = "relative", jitter = 10, legend = "color", + mapping = token_aes(color = token_scale("artwork", + scale = "ordinal", + range = c("black", "gray")))) +# --> not sure, yet, how to interpret this... + + + + +alog080 <- activitylog(dat[dat$artwork %in% "080", ], + #case_id = "case", + case_id = "trace", + activity_id = "event", + #resource_id = "trace", + resource_id = "case", + timestamps = c("start", "complete")) + +process_map(alog080, frequency("relative")) + + + +alog054 <- activitylog(dat[dat$artwork %in% "054", ], + #case_id = "case", + case_id = "trace", + activity_id = "event", + #resource_id = "trace", + resource_id = "case", + timestamps = c("start", "complete")) + +process_map(alog054, frequency("relative")) + + + + diff --git a/code/questions_data-inconsistencies.R b/code/questions_data-inconsistencies.R new file mode 100644 index 0000000..6ebdf04 --- /dev/null +++ b/code/questions_data-inconsistencies.R @@ -0,0 +1,125 @@ +#' --- +#' title: "Open Questions" +#' author: "Nora Wickelmaier" +#' date: "`r Sys.Date()`" +#' output: +#' html_document: +#' number_sections: true +#' toc: true +#' --- + +#+ include = FALSE +# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code") +dat <- read.table("../data/event_logfiles.csv", sep = ";", header = TRUE) +dat$date.start <- as.POSIXct(dat$date.start) +dat$date.stop <- as.POSIXct(dat$date.stop) + +#' This is what the data look like after preprocessing right now + +#+ include = FALSE +mat <- as.data.frame(t(sapply(dat, range, na.rm = TRUE))) +names(mat) <- c("min", "max") +mat$min <- round(as.numeric(mat$min), 1) +mat$max <- round(as.numeric(mat$max), 1) +mat$mean <- round(sapply(dat, function(x) mean(x, na.rm = TRUE)), 1) +mat$missings <- sapply(dat, function(x) sum(is.na(x))) +mat <- mat[!(rownames(mat) %in% c("eventid", "case", "trace", "event", "artwork", "card", "popup", "date.start", "date.stop")), ] + +#+ echo = FALSE +knitr::kable(mat) + +#' This is only the data for 2016! So only about 2 weeks in December. + +# Date ranges +range(dat$date.start) +range(dat$date.stop, na.rm = TRUE) + +#' # Units of x and y +#' I assume that x and y are pixel $\to$ correct? + +#' But they look weird, when plotted. Is it possible that there are +#' outliers? If yes, how? Do we have the true ranges of the display? + +par(mfrow = c(1, 2)) +plot(y.start ~ x.start, dat) +abline(v = c(0, 3800), h = c(0, 2150), col = "blue", lwd = 2) +plot(y.stop ~ x.stop, dat) +abline(v = c(0, 3800), h = c(0, 2150), col = "blue", lwd = 2) + +aggregate(cbind(x.start, x.stop, y.start, y.stop) ~ 1, dat, mean) + +#' Looks like the range should be something like $x = [0, 3800]$ and +#' $y = [0, 2150]$. Do we have the starting coordinates for each artwork? +#' + +#' # Unit of scale + +summary(dat$scaleSize) + +#' I thought it would be some kind of scaling factor, but then I would +#' have expected that `scale.start` is always 1 or something. +#' + +#' # Unit of rotation + +summary(dat$rotationDegree) + +#' This looks pretty clear. Should be degree. Anything else to consider +#' here? I am assuming negative means left, but maybe not? +#' + +#' # Meaningful unit for "case" + +#' I pretty randomly chose `20 sec` based on this plot. I would love a +#' second opinion. `:)` + +timediff <- as.numeric(diff(c(dat$date.start[1], dat$date.start))) +hist(timediff[timediff < 40], breaks = 50) +abline(v = 20, col = "red", lwd = 2) + +#' This actually works pretty well and lets me assign `trace` values to the +#' moves. But maybe there are other ideas on how to define this? + +dat[1:40, c("date.start", "case", "trace", "event", "artwork")] + + +#' # Problems with `time_ms` + +#' What exactly happens, when `time_ms` goes down again? Why does it not go +#' down to 0? + +par(mfrow = c(1, 2)) + +plot(dat$time_ms.start[1:100], type = "b", ylab = "time_ms", xlab = "") +points(dat$time_ms.stop[1:100], type = "b", col = rgb(1, 0, 0, .5)) +legend("topleft", c("start", "stop"), lty = 1, col = c("black", "red")) + +plot(dat$time_ms.stop[1:100] - dat$time_ms.start[1:100], type = "b", + ylab = "duration", col = rgb(0, 0, 1, .5)) +abline(h = 0, lty = 2) + +#' For the regular timestamps everything looks fine. + +par(mfrow = c(1, 2)) + +plot(dat$date.stop[1:100], type = "b", ylab = "timestamp", xlab = "", + col = rgb(1, 0, 0, .5)) +points(dat$date.start[1:100], type = "b") +legend("topleft", c("start", "stop"), lty = 1, col = c("black", "red")) + +plot(dat$date.stop[1:100] - dat$date.start[1:100], type = "b", + ylab = "duration", col = rgb(0, 0, 1, .5)) +abline(h = 0, lty = 2) + +#+ +plot(time_ms.start ~ date.start, dat[1:1000, ], type = "b") +points(time_ms.stop ~ date.stop, dat[1:1000, ], type = "b", col = rgb(1, 0, 0, .3)) + +#' For `time_ms.stop` this looks even weirder. +#' + +#' # After which time interval does the table reset? + +#' I cannot see this in the data at all. Or can I? Has this something to do +#' with the weird behavior of `time_ms`? + diff --git a/code/questions_programming-input.R b/code/questions_programming-input.R new file mode 100644 index 0000000..dd0e33c --- /dev/null +++ b/code/questions_programming-input.R @@ -0,0 +1,111 @@ +#' --- +#' title: "Programming input" +#' author: "Nora Wickelmaier" +#' date: "`r Sys.Date()`" +#' output: html_document +#' --- + +#+ include = FALSE +# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code") + +#+ +dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";", header = TRUE) +dat0$date <- as.POSIXct(dat0$date) # create date object + +# Remove irrelevant events +dat <- subset(dat0, !(dat0$event %in% c("Start Application", "Show Application"))) +str(dat) + +# make data better manageable +tmp <- dat[!dat$event %in% c("Transform start", "Transform stop"), ] +rownames(tmp) <- NULL + +#' # Add `trace` variable for closing events + +tmp$trace <- NA +last_event <- tmp$event[1] +aws <- unique(tmp$artwork)[unique(tmp$artwork) != "glossar"] + +for (art in aws) { # select artwork + + for (i in 1:nrow(tmp)) { # go through rows + + if (last_event == "Show Info" & tmp$artwork[i] == art) { + tmp$trace[i] <- i + j <- i + + } else if (last_event == "Show Front" & tmp$artwork[i] == art) { + tmp$trace[i] <- j + + } else if (!(last_event %in% c("Show Info", "Show Front")) & + tmp$artwork[i] == art) { + tmp$trace[i] <- j + } + + if (i <= nrow(tmp)) { + last_event <- tmp$event[i + 1] + } + } +} + +head(tmp[, c("artwork", "event", "trace")], 50) + +#' # Find artwork for glossar entry + +glossar_files <- unique(tmp[tmp$artwork == "glossar", "popup"]) + +# Load lookup table for artworks and glossar files +load("../data/glossar_dict.RData") +lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ] + +# Fill in trace variable based on last `Show Info` +for (file in lut$glossar_file) { + + artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"]) + + for (i in seq_len(nrow(tmp))) { + + if (tmp$event[i] == "Show Info") { + + current_artwork <- tmp[i, "artwork"] + j <- i + k <- i + + } else { + + current_artwork <- current_artwork + + } + + if (tmp$event[i] == "Show Front" & tmp$artwork[i] == current_artwork) { + # make sure artwork has not been closed, yet! + k <- i + } + + if (tmp$artwork[i] == "glossar" & + (current_artwork %in% artwork_list) & + tmp$popup[i] == file & (j-k == 0)) { + + tmp[i, "trace"] <- tmp[j, "trace"] + + } + } +} + +tmp[tmp$artwork == "glossar", c("artwork", "event", "popup", "trace")] + +proportions(table(is.na(tmp$trace[tmp$artwork == "glossar"]))) +# --> finds about half of the glossar entries for small data set... + +# REMEMBER: It can never be 100% correct, since it is always possible that +# several cards are open and that they link to the same glossar entry + +# How many glossar_files are only associated with one artwork? +lut[sapply(lut$artwork, length) == 1, "glossar_file"] + +# TODO: Fill in the ones that are associated with one artwork +# --> Can't come up with something -- maybe ask Philipp??? + +# TODO: How to check if one of the former "Show Infos" is correct +# --> Can't come up with something -- maybe ask Philipp??? +