From 498b4873380a3764fd2b09677e59e96ad91a73dc Mon Sep 17 00:00:00 2001 From: nwickel Date: Wed, 13 Sep 2023 14:20:08 +0200 Subject: [PATCH] Updated README and tried to document all decisions I made so far --- README.Rmd | 470 +++++++++++++++++++++++++++++++++++++++ README.md | 321 -------------------------- code/01_parse-logfiles.R | 90 ++------ code/02_preprocessing.R | 22 +- code/functions.R | 3 - 5 files changed, 503 insertions(+), 403 deletions(-) create mode 100644 README.Rmd delete mode 100644 README.md diff --git a/README.Rmd b/README.Rmd new file mode 100644 index 0000000..9b7be92 --- /dev/null +++ b/README.Rmd @@ -0,0 +1,470 @@ +--- +title: "Background information about MTT data" +author: "Nora Wickelmaier" +date: "`r Sys.Date()`" +output: + html_document: + number_sections: true + toc: true +--- + +# Log data from the Multi-Touch Table at the HAUM + +The Multi Touch Table at the Herzog-Anton-Ulrich-Museum (HAUM) in +Braunschweig gives visitors of the Museum the opportunity to interact with +67 artworks and 3 tiles containing information about the museum and its +layout. The table was installed at the institute in October 2016 and since +November 2016 log files from interactions of visitors of the museum have +been collected. These log files are in an unstructured format and cannot be +easily analyzed. The purpose of the following document is to describe how +the data have been transformed and which decisions have been made along +the way. + +# Data structure + +The log files contain lines that indicate the beginning and end of possible +actions that can be performed when interacting with the artworks on the +table. The layout of the table looks like 70 pictures have been tossed on a +large table. Every artwork is visible at the start configuration. People +can move the pictures on the table, they can be scaled and rotated. +Additionally, the virtual picture cards can be flipped in order to find +more information about the artwork on the "back" of the card. 
One has to press +a little `i` for more information in one of the bottom corners of the card. +On the back of the card two (?) to six information cards can be found with +a teaser text about a certain topic. These topic cards can be opened and a +hypertext with detailed information pops up. Within these hypertexts +certain technical terms can be clicked for lay people to get more +information. This also opens up a pop-up. The events encoded in the raw log +files therefore have the following structure. + +``` +"Start Application" --> Start Application +"Show Application" +"Transform start" --> Move +"Transform stop" +"Show Info" --> Flip Card +"Show Front" +"Artwork/OpenCard" --> Open Topic +"Artwork/CloseCard" +"ShowPopup" --> Open Popup +"HidePopup" +``` + +The right side shows what events can be extracted from these raw lines. The +"Start Application" is not an event in the original sense since it only +indicates if the table was started or maybe reset itself. This is not an +interaction with the table and therefore not interesting in itself. All +"Start Application" and "Show Application" are therefore excluded from the +data when further processed and are only in the raw log files. + +# Parsing the raw log files + +The first step is to parse the raw log files that are stored by the +application as text files in a rather unstructured format to a format that +is better handled. The data are therefore transferred to a spread sheet +format. The following section describes what problems were encountered +while doing this. + +## Corrupt lines + +When reading the files containing the raw logs into R, a warning appears +that says + +``` +Warning messages: + incomplete final line found on '_2016/2016_11_18-11_31_0.log' + incomplete final line found on '_2016/2016_11_18-11_38_30.log' + incomplete final line found on '_2016/2016_11_18-11_40_36.log' + ... +``` + +When you open these files, it looks like the last line contains some binary +content. 
It is unclear why and how this happens. So when reading the data, +these lines were removed. A warning will be given that indicates how many +files have been affected. + +## Units of the variables + +* Welche Einheit haben x und y? Pixel? --> yes +* Welche Einheit hat scale? --> some kind if bit, does not matter, when + calculating a ratio +* rotation wirklich degree? --> yes +* Nach welchem Zeitintervall resettet sich der Tisch wieder in die + Ausgangskonfiguration? --> PM needs to look it up + +## How unclosed events are handled + +## How a case is defined + +* Herausfinden, ob mehr als eine Person am Tisch steht? + - Sliding window, in der Anzahl von Artworks gezählt wird? Oder wie weit + angefasste Artworks voneinander entfernt sind? + - Man kann sowas schon "sehen" in den Logs - aber wie kann ich es + automatisiert rausziehen? Was ist meine Definition von + "Interaktionsboost"? + - Egal wie wir es machen, geht es auf den "Event-Log-Daten"? + +## Additional meta data + +* Anreicherung der Log-Daten mit weiteren Metadaten? Was wäre interessant? + + - Metadata on artworks like, name, artist, type of artwork, epoch, etc. + - School vacations and holidays + - Special exhibits at the museum + - Number of visitors per day (bei Sven noch mal nachhaken?) + - Age structure of visitors per day? + - ... ???? + +# Problems and how I handled them + +```{r, include = FALSE} +# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis") +source("code/functions.R") +``` + +This lists some problems with the log data that required decisions. These +decisions influence the outcome and maybe even the data quality. Hence, I +tried to document how I handled these problems and explain the decisions I +made. + +## Weird behavior of `timeMs` and neg. `duration` values + +I think the negative duration values happen, when an event starts in one +log file and completes in another one. 
The variable `timeMs` seems to be +continuous within one log file but not over several log files. + + +```{r} +# Read data +dat0 <- read.table("data/rawdata_logfiles_small.csv", sep = ";", + header = TRUE) +dat0$date <- as.POSIXct(dat0$date) +dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0) + +# Remove irrelevant events +dat <- subset(dat0, !(dat0$event %in% c("Start Application", + "Show Application"))) + +# Add trace variable +dat1 <- add_trace(dat, glossar_dict = "data/glossar_dict.RData") + +# Close events +dat2 <- rbind(close_events(dat1, "move"), + close_events(dat1, "flipCard"), + close_events(dat1, "openTopic"), + close_events(dat1, "openPopup")) +dat2 <- dat2[order(dat2$date.start, dat2$fileId.start), ] + +head(dat2[which(dat2$duration < 0), + c("fileId.start", "fileId.stop", "event", "artwork", "duration")], 20) + +head(dat2[which(dat2$fileId.start != dat2$fileId.stop), + c("fileId.start", "fileId.stop", "event", "artwork", "duration")], 20) + +plot(timeMs ~ as.factor(fileId), dat[1:5000,], xlab = "fileId") + +# Remove durations when event spans more than one log file, since they are +# not interpretable +#dat2[which(dat2$fileId.start != dat2$fileId.stop), "duration"] <- NA +``` + +The boxplot shows that we have a continuous range of values within one log +file but that `timeMs` does not increase over log files. Since it seems not +possible to fix this in a consistent way, I set all durations to `NA` where +`fileId.start` and `fileId.stop` are not identical. I kept `timeMs.start` +and `timeMs.stop` and also `fileId.start` and `fileId.stop` in the data +frame, so it is clear why there are no durations. The other + +NOTE: Part of this problem was that time stamps that are part of the log +file names are not zero-left-padded and therefore the files were not in the +correct order when read into R. When zero left padding these file IDs and +sorting by them and then by `date.start` within, some of the durations are +exactly fixed. 
Unfortunately, only three `move` events were fixed, since it +only fixed irregularities *within* one log file. See below for more +details. + +## Left padding of file IDs + +The file names of the raw log files are automatically generated and contain +a time stamp. This time stamp is not well formed. First, it contains an +incorrect month. The months go from 0 to 11 which means, that the file name +`2016_11_15-12_12_57.log` was collected on December 15, 2016 at 12:12 pm. +Another problem is that the file names are not zero left padded, e.g., +`2016_11_15-12_2_57.log`. This file was collected on December 15, 2016 at +12:02 pm and therefore before the file above. But most sorting algorithms, +will sort these files in the order shown below. In order to preprocess the +data and close events that belong together, the data need to be sorted by +events and artworks repeatedly. In order to get them back in the correct +time order, it is necessary to order them based on three variables: +`fileId.start`, `date.start` and `timeMs`. The file IDs therefore need to +sort in the correct order (again see below for example). I zero left padded +the log file names within the data frame using it as an identifier. These +"file names" do not correspond exactly to the original raw log file names. +This needs to be kept in mind when doing any kind of matching etc. 
+ +``` +## what it looked like before left padding +# 1422 ../data/haum_logs_2016-2023/_2016b/2016_11_15-12_2_57.log 2016-12-15 12:12:56 599671 Transform start 076 076.xml NA 2092.25 2008.00 0.3000000 13.26874254 +# 1423 ../data/haum_logs_2016-2023/_2016b/2016_11_15-12_12_57.log 2016-12-15 12:12:57 621 Transform start 076 076.xml NA 2092.25 2008.00 0.3000000 13.26523465 +# 1424 ../data/haum_logs_2016-2023/_2016b/2016_11_15-12_12_57.log 2016-12-15 12:12:57 677 Transform stop 076 076.xml NA 2092.25 2008.00 0.2997736 13.26239605 +# 1425 ../data/haum_logs_2016-2023/_2016b/2016_11_15-12_12_57.log 2016-12-15 12:12:57 774 Transform start 076 076.xml NA 2092.25 2008.00 0.2999345 13.26239605 +# 1426 ../data/haum_logs_2016-2023/_2016b/2016_11_15-12_12_57.log 2016-12-15 12:12:57 850 Transform stop 076 076.xml NA 2092.25 2008.00 0.2997107 13.26223362 +# 1427 ../data/haum_logs_2016-2023/_2016b/2016_11_15-12_2_57.log 2016-12-15 12:12:57 599916 Transform stop 076 076.xml NA 2092.25 2008.00 0.2997771 13.26523465 + +## what it looks like now +# 1422 2016_11_15-12_02_57.log 2016-12-15 12:12:56 599671 Transform start 076 076.xml NA 2092.25 2008.00 0.3000000 13.26874254 +# 1423 2016_11_15-12_02_57.log 2016-12-15 12:12:57 599916 Transform stop 076 076.xml NA 2092.25 2008.00 0.2997771 13.26523465 +# 1424 2016_11_15-12_12_57.log 2016-12-15 12:12:57 621 Transform start 076 076.xml NA 2092.25 2008.00 0.3000000 13.26523465 +# 1425 2016_11_15-12_12_57.log 2016-12-15 12:12:57 677 Transform stop 076 076.xml NA 2092.25 2008.00 0.2997736 13.26239605 +# 1426 2016_11_15-12_12_57.log 2016-12-15 12:12:57 774 Transform start 076 076.xml NA 2092.25 2008.00 0.2999345 13.26239605 +# 1427 2016_11_15-12_12_57.log 2016-12-15 12:12:57 850 Transform stop 076 076.xml NA 2092.25 2008.00 0.2997107 13.26223362 +``` + +## Timestamps repeat + +The time stamps in the `date` variable record year, month, day, hour, +minute and seconds. 
Since one second is not a very short time interval for +a move on a touch display, this is not fine grained enough to bring events +into the correct order, meaning there are events from the same log file +having the same time stamp and even events from different log files having +the same time stamp. The log files get written about every 10 minutes +(which can easily be seen when looking at the file names of the raw log +files). So in order to get events in the correct order, it is necessary to +first order by file ID, within file ID then sort by time stamp `date` and +then within these more coarse grained time stamps sort by `timeMs`. But as +explained above, `timeMs` can only be sorted within one file ID, since they +do not increase consistently over log files, but have a new offset for each +raw log file. + +## x,y-coordinates outside of display range + +The display of the Multi-Touch-Table is a 4K-display with 3840 x 2160 +pixels. When you plot the start and stop coordinates, the display is +clearly distinguishable. However, a lot of points are outside of the display +range. This can happen, when the art objects are scaled and then moved to +the very edge of the table. Then it will record pixels outside of the +table. These are actually valid data points and I will leave them as is. + +```{r} +par(mfrow = c(1, 2)) +plot(y.start ~ x.start, dat2) +abline(v = c(0, 3840), h = c(0, 2160), col = "blue", lwd = 2) +plot(y.stop ~ x.stop, dat2) +abline(v = c(0, 3840), h = c(0, 2160), col = "blue", lwd = 2) + +aggregate(cbind(x.start, x.stop, y.start, y.stop) ~ 1, dat2, mean) +``` + +## Pop-ups from glossar cannot be assigned to a specific artwork + +All the information, pictures and texts for the topics and pop-ups are +stored in +`/Logfiles/ContentEyevisit/eyevisit_cards_light/`. Among +other things, each folder contains XML-files with the information about any +technical terms that can be opened from the hypertexts on the topic cards. 
Often this information is artwork dependent and then the corresponding +XML-file is in the folder for this artwork. Sometimes, however, more +general terms can be opened. In order to avoid multiple files containing +the same information, these were stored in a folder called `glossar` and +get accessed from there. The raw log files only contain the path to this +glossar entry and did not record from which artwork it was accessed. I +tried to assign these glossar entries to the correct artworks. The (very +heuristic) approach was this: + +1. Create a lookup table with all XML-file names (possible pop-ups) from +   the glossar folder and what artworks possibly call them. This was stored +   as an `RData` object for easier handling but should maybe be stored in a +   more interoperable format. + +2. I went through all possible pop-ups in this lookup table and stored the +   artworks that are associated with it. + +3. I created a sub data frame without move events (since they can never be +   associated with a pop-up) and went through every line and looked up if +   an artwork and a topic card had been opened. If this was the case and a +   glossar entry came up before the artwork was closed again, I assigned +   this artwork to this glossar entry. + +This is heuristic since it is possible that several topic cards from +different artworks are opened simultaneously and the glossar pop-up could +be opened from either one (it could even be more than two, of course). In +these cases the artwork that was opened closest to the glossar pop-up has +been assigned, but this can never be completely error free. + +And this heuristic only assigns a little more than half of the glossar +entries. Since my heuristic only looks for the last artwork that has been +opened and if this artwork is a possible candidate it misses all glossar +pop-ups where another artwork has been opened in between. This is still an +open TODO to write a more elaborate algorithm. 
+ +All glossar pop-ups that do not get matched with an artwork are removed +from the data set with a warning. + +## Assign a `case` variable based on "time heuristic" + +One thing needed in order to work with the data set and use it for machine +learning algorithms like process mining is a variable that tries to +identify a case. A case variable will structure the data frame in a way +that navigation behavior can actually be investigated. However, we do not +know if several people are standing around the table interacting with it or +just one very active person. The simplest way to define a case variable is +to just use a time limit between events. This means that when the table has +not been interacted with for, e.g., 20 seconds, then it is assumed that a +person moved on and a new person started interacting with the table. This +is the easiest heuristic and implemented at the moment. Process mining +shows that this simple approach works in a way that the correct process +gets extracted by the algorithm. + +In order to investigate user behavior on a more fine grained level, it will +be necessary to come up with a more elaborate approach. A better, still +simple approach could be to use this kind of time limit and additionally +look at the distance between artworks interacted with within one time +window. When artworks are far apart it seems plausible that more than one +person interacted with them. Very short time lapses between events on +different artworks could also be an indicator that more than one person is +interacting with the table. + +## Assign a `trace` variable + +The `trace` variable is supposed to show one interaction trace with one +artwork. Meaning it starts when an artwork is touched or flipped and stops +when it is closed again. It is easy to assign a trace from flipping a card +over opening (maybe several) topics and pop-ups for this artwork card until +closing this card again. 
But one would like to assign the same trace to +move events surrounding this interaction. Again, this is not possible in an +algorithmic way but only heuristically. I used the `case` variable in order +to get meaningful units around the artworks. + +If within one case only a single trace for a single artwork was opened, I +assigned this trace to the moves associated with this artwork. It (quite +often) happens that within one case one artwork is opened and closed +several times, each time starting a new trace. I then assigned all the +following move events to the trace beforehand. This is, of course, +arbitrary and could also be handled the other way around. + +Another possibility is, that an artwork gets moved within one trace without +being flipped. I then assigned a new trace to this move. + +This overall worked very well even though it was based on the very +heuristic approach assigning a case when the table has not been touched for +20 seconds. It should be kept in mind that the trace assignments for the +moves will change when case is defined in a different way. + +## A `move` event does not record any change + +Most of the events in the log files are move events. Additionally, many of +these move events are recorded but they do not indicate any change meaning +the only difference is the time stamp. All other variables indicating moves +like `x.start` and `x.stop`, `rotation.start` and `rotation.stop` etc. do +not show any change. They represent about 2/3 of all move events. These +events are probably short touches of the table without an actual +interaction. They were therefore removed from the data set. + +## Events that only close (`date.start` is NA) + +It looks like there is some kind of log error for the events that do not +have a start. I was able to get rid of most by sorting for `popup` for +the openPopup events, but there are still some left (50 for the small data +set, which corresponds to 0.2 per mill). 
The following example shows that +artwork "501" gets closed (line 31030) while the pop-up `sommerbau.xml` +is still opened (line 31027). Then artwork "501" gets opened again +(line 31035) and after that the pop-up `sommerbau.xml` is closed (line +31040). This should not be possible and therefore (correctly) two events +are assigned: One where the pop-up was opened and then not closed (which is +common) and another one where the pop-up has no start. + +```{r} +dat[31000:31019,] +# Card gets flipped closed before pop-up closes --> log error! +``` + +I did not check all of these cases (for the complete data set this is +simply not possible by hand) but just excluded all events that do not have +a `date.start` since they are hard to interpret. Often they are log errors +but in some cases they might be resolvable. + +```{r} +# remove all events that do not have a `date.start` +dim(dat2[is.na(dat2$date.start), ]) +dat2 <- dat2[!is.na(dat2$date.start), ] +``` + +## Card indices go from 0 to 7 (instead of 0 to 5 as expected) + +See `questions_number-of-cards.R` for more details. + +I wrote a function that for each artwork extracts the file names of the +possible topic cards and then looks up which topics have actually been +displayed on the back of the card. I added an index giving the ordering in +the index files. + +The possible values in the variable `topicNumber` range from 0 to 7, +however, no artwork has more than six different numbers. So I just renamed +those numbers from 1 to the highest number, e.g., $0,1,2,4,5,6$ was changed +to $0\to 1,1\to 2,2\to 3,4\to 4,5\to 5,6\to 6$. Next I used the index to +assign topics and file names to the according pop-ups. This needs to be +cross checked with the programming, but seems the most plausible approach +with my current knowledge. + +## Extracting topics from `index.xml` vs. `<artwork>.xml` + +When I extract the topics from `index.html` I get different topics, than +when I get them from `<artwork>.html`. 
At first glance, it looks like using +`index.html` actually gives the wrong results. + +```{r} +artworks <- unique(dat2$artwork) +path <- "data/ContentEyevisit/eyevisit_cards_light/" +topics <- extract_topics(artworks, "index.xml", path) +topics2 <- extract_topics(artworks, paste0(artworks, ".xml"), path) + +topics[!topics$file_name %in% topics2$file_name, ] +topics2[!topics2$file_name %in% topics$file_name, ] +``` + +For artwork "031", `index.html` only defines 5 cards (the 6th is commented +out), but `topicNumber` for this artwork has 6 different entries. I will +therefore extract the topics from `.html`. (This seems also better +compatible with other data sets like 8o8m.) + +# Reading list + +* @Arizmendi2022 [--] +* @Bannert2014 [x] +* @Bousbia2010 [--] +* @Cerezo2020 +* @GerjetsSchwan2021 [x] +* @Goldhammer2020 +* @Guenther2007 +* @HuberBannert2023 [x] +* @Kroehne2018 +* @SchwanGerjets2021 [x] +* @vanderAalst2016 [Chap. 2, x] +* @vanderAalst2016 [Chap. 3] +* @vanderAalst2016 [Chap. 5, x] +* @Wang2019 + +# Open stuff + +* Angle from which people approach table in Braunschweig? Consider in + rotation variable? +* Time limit for `case` variable different for different events? (openTopic + should be opened the longest) + + $\to$ I think this is not relevant since I am looking at time *between* + events! + +# Stuff AK found interesting + +* Pre/post corona +* Identify school classes +* How many persons are present at the table? + +# Other potential questions + +* "Bursts" +* 1st vs. 2nd half of the day +* Can we identify "types of art"? With clustering or something? +* Possible to estimate how many persons per day? Maybe average of certain + weekdays? ... ? + diff --git a/README.md b/README.md deleted file mode 100644 index 2095388..0000000 --- a/README.md +++ /dev/null @@ -1,321 +0,0 @@ -# Offene Fragen - -## Datenverständnis - -* Welche Einheit haben x und y? Pixel? --> yes -* Welche Einheit hat scale? 
--> some kind if bit, does not matter, when - calculating a ratio -* rotation wirklich degree? --> yes -* Nach welchem Zeitintervall resettet sich der Tisch wieder in die - Ausgangskonfiguration? --> PM needs to look it up - -## Tisch-Software - -* Gibt es Doku für die Bilder, die über die xml files hinausgeht? Sowas wie - ein Manual oder ähnliches? -* Gibt es evtl. irgendwo noch ein Tablet mit der Anwendung drauf? -* Was bedeuten die Farben der Topic Cards? --> sieht man in den xml files - -## Event Logs - -* Wie gehen wir mit "nicht geschlossenen" Events um? Einfach rauslöschen? - - für Transform tendiere ich zu ja, weil sonst total uninteressant - - bei flipCard bin ich nicht so sicher... Aber man kann dann keine - duration berechnen, wäre NA -* Moves/scales/rotations ohne Veränderung würde ich auf jeden Fall - rauslöschen -* Es ist nicht möglich (bzw. ich weiß nicht wie) zusammengehörige Events - eineindeutig zu identifizieren - - nach Heuristik vorgehen? Doppelte Transformation start und stop einfach - raus? - - Daten sind nicht "fehlerfrei"; es gibt z.B. Transformation-Events wo - das Ende nicht geloggt wurde -* Wie identifiziere ich eine "Interaktionseinheit"? - - Was ist ein "case"? - - Eher grob über Zeitintervalle? - - Noch irgendeine andere Idee? -* Herausfinden, ob mehr als eine Person am Tisch steht? - - Sliding window, in der Anzahl von Artworks gezählt wird? Oder wie weit - angefasste Artworks voneinander entfernt sind? - - Man kann sowas schon "sehen" in den Logs - aber wie kann ich es - automatisiert rausziehen? Was ist meine Definition von - "Interaktionsboost"? - - Egal wie wir es machen, geht es auf den "Event-Log-Daten"? -* Anreicherung der Log-Daten mit weiteren Metadaten? Was wäre interessant? - - Metadata on artworks like, name, artist, type of artwork, epoch, etc. -ˆ - School vacations and holidays -ˆ - Special exhibits at the museum -ˆ - Number of visitors per day -ˆ - Age structure of visitors per day? - - ... ???? 
- -## HAUM - -* Bei Sven noch mal nachhaken wegen Besucherzahlen? - - - - -# Problems and how I handled them - -This lists some problems with the log data that required decisions. These -decisions influence the outcome and maybe even the data quality. Hence, I -tried to document how I handled these problems and explain the decisions I -made. - -## Weird behavior of `time_ms` and neg. `duration`values - -I think the negative duration values happen, when an event starts in one -log file and completes in another one. The variable `time_ms` seems to be -continuous within one log file but not over several log files. - -```{r} -dat_all[which(dat_all$duration < 0), ][1:5, 1:10] - -# flipCard -## trace 56 -dat3[dat3$trace == 56,] - -dat[dat$fileid == "2016_11_15-11_12_57.log" & dat$date == "2016-12-15 11:17:26", ] -dat[dat$fileid == "2016_11_15-11_42_57.log" & dat$date == "2016-12-15 11:46:19", ] - -#dat[309:1405, ] - -tmp <- dat[300:1405, ] -tmp[tmp$artwork == "051", ] -## -> was closed correctly, but does it belong together? - - -## trace 61 -dat3[dat3$trace == 61,] - -dat[dat$fileid == "2016_11_15-11_12_57.log" & dat$date == "2016-12-15 11:17:52", ] -dat[dat$fileid == "2016_11_15-11_42_57.log" & dat$date == "2016-12-15 11:46:19", ] - -tmp <- dat[350:1408, ] -tmp[tmp$artwork == "057", ] -## -> was closed correctly, but does it belong together? - - -# openTopic -dat_all[which(dat_all$duration < 0), ][100:105, 1:10] - -# trace 2052 -dat4[dat4$trace == 2052,] - -dat[dat$fileid == "2016_11_17-14_12_10.log" & dat$date == "2016-12-17 14:21:51", ] -dat[dat$fileid == "2016_11_17-14_22_10.log" & dat$date == "2016-12-17 14:22:25", ] - -tmp <- dat[23801:23950, ] -tmp[tmp$artwork == "502", ] - -plot(time_ms ~ as.factor(fileid), dat[1:5000,]) -``` - -The boxplot shows that we have a continuous range of values within one log -file but that `time_ms` does not increase over log files. - -Since it seems not possible to fix this in a consistent way, I will set -negative durations to `NA`. 
I will keep `time_ms.start` and `time_ms.stop` -in the data frame, so it is clear why there are no durations. Maybe it -would also be useful to keep `logfileid.start` and `logfileid.stop` in the -data? Maybe just for proof checking this theory... - -Part of it was that timestamps that are part of the log file names are not -zero-left-padded. But this fixed only three `move` events, since it only -fixed irregularities *within* one log file. - -```{r} -table(dat_all[dat_all$duration < 0, "event"]) - -# flipCard move openPopup openTopic -# 562 100 34 284 - - -dat[dat$event %in% c("Transform start", "Transform stop"), ][1100:1300,] -# --> got fixed by left padding... but only three all together!! - -dat_all[735, ] - -## what it looked like before left padding -# 1422 ../data/haum_logs_2016-2023/_2016b/2016_11_15-12_2_57.log 2016-12-15 12:12:56 599671 Transform start 076 076.xml NA 2092.25 2008.00 0.3000000 13.26874254 -# 1423 ../data/haum_logs_2016-2023/_2016b/2016_11_15-12_12_57.log 2016-12-15 12:12:57 621 Transform start 076 076.xml NA 2092.25 2008.00 0.3000000 13.26523465 -# 1424 ../data/haum_logs_2016-2023/_2016b/2016_11_15-12_12_57.log 2016-12-15 12:12:57 677 Transform stop 076 076.xml NA 2092.25 2008.00 0.2997736 13.26239605 -# 1425 ../data/haum_logs_2016-2023/_2016b/2016_11_15-12_12_57.log 2016-12-15 12:12:57 774 Transform start 076 076.xml NA 2092.25 2008.00 0.2999345 13.26239605 -# 1426 ../data/haum_logs_2016-2023/_2016b/2016_11_15-12_12_57.log 2016-12-15 12:12:57 850 Transform stop 076 076.xml NA 2092.25 2008.00 0.2997107 13.26223362 -# 1427 ../data/haum_logs_2016-2023/_2016b/2016_11_15-12_2_57.log 2016-12-15 12:12:57 599916 Transform stop 076 076.xml NA 2092.25 2008.00 0.2997771 13.26523465 - -## what it looks like now -# 1422 2016_11_15-12_02_57.log 2016-12-15 12:12:56 599671 Transform start 076 076.xml NA 2092.25 2008.00 0.3000000 13.26874254 -# 1423 2016_11_15-12_02_57.log 2016-12-15 12:12:57 599916 Transform stop 076 076.xml NA 2092.25 2008.00 0.2997771 
13.26523465 -# 1424 2016_11_15-12_12_57.log 2016-12-15 12:12:57 621 Transform start 076 076.xml NA 2092.25 2008.00 0.3000000 13.26523465 -# 1425 2016_11_15-12_12_57.log 2016-12-15 12:12:57 677 Transform stop 076 076.xml NA 2092.25 2008.00 0.2997736 13.26239605 -# 1426 2016_11_15-12_12_57.log 2016-12-15 12:12:57 774 Transform start 076 076.xml NA 2092.25 2008.00 0.2999345 13.26239605 -# 1427 2016_11_15-12_12_57.log 2016-12-15 12:12:57 850 Transform stop 076 076.xml NA 2092.25 2008.00 0.2997107 13.26223362 -``` - -`time_ms` does not increase from log file to log file - -```{r} -tmp1 <- dat[!duplicated(dat$fileid), c("fileid", "time_ms", "event")] -tmp2 <- dat[!duplicated(dat$fileid, fromLast=T), c("fileid", "time_ms", "event")] -tmp <- rbind(tmp1, tmp2) -tmp <- tmp[order(tmp$fileid), ] -head(tmp, 50) - -plot(time_ms ~ as.factor(fileid), dat[1:2000, ], xlab = "fileid") -``` - -## x,y-coordinates outside of display range - -The display is a 4K-display with 3840 x 2160 pixels. When you plot the -start and stop coordinates, the display is clearly to distinguish. However, -a lot of points are outside of the display range. This can happen, when the -art objects are scaled and then moved to the very edge of the table. Then -it will record pixels outside of the table. These are actually valid data -points and I will leave them as is. 
- -```{r} -par(mfrow = c(1, 2)) -plot(y.start ~ x.start, dat) -abline(v = c(0, 3840), h = c(0, 2160), col = "blue", lwd = 2) -plot(y.stop ~ x.stop, dat) -abline(v = c(0, 3840), h = c(0, 2160), col = "blue", lwd = 2) - - -aggregate(cbind(x.start, x.stop, y.start, y.stop) ~ 1, dat, mean) -``` - - -## Timestamps repeat - - - -## Popups from glossar cannot be assigned to a specific artwork - - -## Assign a case variable based on "time heuristic" - -## A `move`event does not record any change - -## Add moves to `trace` variable - -## openPopup does not close correctly - -The sorting had to include `popup` otherwise nested events could not be -closed correctly. - - ```{r} -# TODO: Some correct entries are not closed: -df[df$trace == 1843, ] -# WHY??? -# --> Wrong eventid! -dat5[dat5$trace == 1843, ] -openPopup_wide[openPopup_wide$trace == 1843, ] -``` -## Events that only close (`date.start` is NA) - -It looks like there is some kind of log error for the events that do not -have a start stop. I was able to get rid of most by sorting for `popup` for -the openPopup events, but there are still some left (50 for the small data -set, which corresponds to 0.2 per mill). - - ```{r} -# remove all events that do not have a `date.start` -dim(dat_all[is.na(dat_all$date.start), ]) -dat_all <- dat_all[!is.na(dat_all$date.start), ] -# TODO: Find out how it can be that there is only a `date.stop` -## --> happens, when event is not properly closed, see here: -df[df$trace == 1843, ] -dat_openPopup[dat_openPopup$trace == 1843, ] -## --> still 50 (small data set) left, and some really do not seem to be -## opened! Must be a log error -# --> others should be closed! -dat[31000:31019,] # this one e.g. -# --> Actually NOT! card gets flipped before! Again - log error! -``` -Will probably just get rid of them! - -Think about if you want give warning messages about these deletions in the -functions. 
- -## Card indices go from 0 to 7 (instead of 0 to 5 as expected) - -See `questions_number-of-cards.R` for details. - -## Extracting topics - -When I extract the topics from `index.html` I get different topics, than -when I get them from `.html`. At first glance, it looks like using -`index.html` actually gives the wrong results. - -``` -topics <- extract_topics(artworks, "index.xml", path) -topics2 <- extract_topics(artworks, paste0(artworks, ".xml"), path) - -topics[!topics$file_name %in% topics2$file_name, ] -# artwork file_name topic index -# 072 072_artist.xml artist 1 -# 073 073_artist.xml artist 1 -# 110 110_technik.xml technik 2 -topics2[!topics2$file_name %in% topics$file_name, ] -# artwork file_name topic index -# 031 031_vergleich.xml extra info 6 -# 033 033_technik.xml technik 2 -# 055 055_vergleich4.xml extra info 5 -# 063 063_thema3.xml thema 3 -# 063 063_extrainfo1.xml thema 4 -# 072 072_artist2.xml artist 1 -# 073 073_artist2.xml artist 1 -# 099 099_technik.xml technik 2 -# 110 110_technikneu.xml technik 2 -``` - -For artwork 031, `index.html` only defines 5 cards (the 6th is commented -out), but `topicNumber` for this artwork has 6 different entries. I will -therefore extract the topics from `.html`. (This seems also better -compatible with other data sets like 8o8m. - -# Reading list - -* @Arizmendi2022 [$-$] -* @Bannert2014 [x] -* @Bousbia2010 [$-$] -* @Cerezo2020 -* @GerjetsSchwan2021 [x] -* @Goldhammer2020 -* @Guenther2007 -* @HuberBannert2023 [x] -* @Kroehne2018 -* @SchwanGerjets2021 [x] -* @vanderAalst2016 [Chap. 2, x] -* @vanderAalst2016 [Chap. 3] -* @vanderAalst2016 [Chap. 5, x] -* @Wang2019 - -# Open stuff - -* Angle from which people approach table in Braunschweig? Consider in - rotation variable? -* Time limit for `case` variable different for different events? (openTopic - should be opened the longest) - --> I think this is not relevant since I am looking at time *between* - events! 
- -# Stuff AK found interesting - -* Pre/post corona -* Identify school classes -* How many persons are present at the table? - -# Other potential questions - -* "Bursts" -* 1st vs. 2nd half of the day -* Can we identify "types of art"? With clustering or something? -* Possible to estimate how many persons per day? Maybe average of certain - weekdays? ... ? - diff --git a/code/01_parse-logfiles.R b/code/01_parse-logfiles.R index 32b19c1..7481f0a 100644 --- a/code/01_parse-logfiles.R +++ b/code/01_parse-logfiles.R @@ -1,46 +1,6 @@ -#' --- -#' title: "Preprocessing raw log files" -#' author: "Nora Wickelmaier" -#' date: "`r Sys.Date()`" -#' output: -#' html_document: -#' default -#' pdf_document: -#' toc: true -#' number_sections: true -#' geometry: margin = 2.5cm -#' --- - # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code") -#+ setup, include = FALSE -knitr::opts_chunk$set(warning = FALSE, message = FALSE) - -#' The following events can be extracted from the log files: -#' -#' ``` -#' LogEntry classes: -#' TRANSFORM_START: "Transform start" --> "Transformation Start" in Tool -#' TRANSFORM_STOP: "Transform stop" -#' START_APPLICATION: "Start Application" -#' SHOW_APPLICATION: "Show Application" -#' SHOW_INFO: "Show Info" --> "Flip Card" in Tool -#' SHOW_FRONT: "Show Front" -#' SHOW_POPUP: "ShowPopup" --> "Show Popup" in Tool -#' HIDE_POPUP: "HidePopup" -#' ARTWORK: "Artwork" --> "Show Topic" in Tool -#' ``` - -#' Choose which folders with raw log files should be included: - -folders <- "all" -#folders <- "_2016b" - -dirpaths <- paste0("../data/haum_logs_2016-2023/", folders) - -fnames <- dir(dirpaths, pattern = "*.log", full.names = TRUE) -length(fnames) -head(fnames) +###### HELPER ###### # Need to left pad file names. 
If I do not do this, the sorting of the # timestamps will be off and I get negative durations later on since the @@ -70,6 +30,18 @@ leftpad_fnames <- function(x) { res } +###### CONTENT ###### + +# Choose which folders with raw log files should be included + +folders <- "all" +#folders <- "_2016b" + +dirpaths <- paste0("../data/haum_logs_2016-2023/", folders) + +fnames <- dir(dirpaths, pattern = "*.log", full.names = TRUE) +length(fnames) +head(fnames) logs <- lapply(fnames, readLines) nlog <- sapply(logs, length) @@ -77,31 +49,19 @@ dat <- data.frame(fileId = rep(leftpad_fnames(fnames), nlog), logs = unlist(logs)) head(dat$logs) -#' Remove corrupted lines +# Remove corrupted lines -# Warning messages: -# incomplete final line found on '_2016/2016_11_18-11_31_0.log' -# incomplete final line found on '_2016/2016_11_18-11_38_30.log' -# incomplete final line found on '_2016/2016_11_18-11_40_36.log' -# ... - -## --> files have a last line that looks like a binary entry?? - -# From LogEntry.as: -# //pm: inserted this check to account for some broken logfiles -# if (metaData[1] == null){ -# trace("corrupt line... still do not know how these came to happen."); # corrupt lines are "" and need to be removed d1 <- dim(dat)[1] dat <- subset(dat, dat$logs != "") d2 <- dim(dat)[1] -#' The files contain `r d1-d2` corrupt lines that were remooved from the -#' data. -#' +# TODO: Catch this in a function and give back a meaningful warning +# The files contain (d1 - d2) corrupt lines that were removed from the +# data. -#' ### Extract relevant infos +# Extract relevant information date <- sapply(dat$logs, gsub, pattern = "^\\[(.*)\\], \\[.*$", @@ -139,8 +99,6 @@ ts_elements <- strsplit(timestamp, ":") time_ms <- as.numeric(sapply(ts_elements, function(x) x[4])) + as.numeric(sapply(ts_elements, function(x) x[3])) * 1000 + as.numeric(sapply(ts_elements, function(x) x[2])) * 1000 * 60 -# TODO: Maybe change to simple gsub()...
-# --> This is theoretically sound but a lot of lines for just removing ":" dat$date <- lubridate::parse_date_time(date, "bdyHMSOp") dat$timeMs <- time_ms @@ -156,17 +114,11 @@ dat$rotation <- moves[,4] dat$logs <- NULL # remove original log files from data so file becomes smaller -str(dat) - -head(dat, 20) - -# sort by fileId, since reading in by file names does not make sense because of -# missing left zero padding +# sort by fileId, since reading in by file names does not make sense +# because of missing left zero padding dat <- dat[order(dat$fileId, dat$date, dat$timeMs), ] -## TODO: Replace artwork and popup numbers with informative strings - -#' ### Save data frame +# Export data write.table(dat, "../data/rawdata_logfiles.csv", sep = ";", quote = FALSE, row.names = FALSE) diff --git a/code/02_preprocessing.R b/code/02_preprocessing.R index 4bab6c1..693c1b5 100644 --- a/code/02_preprocessing.R +++ b/code/02_preprocessing.R @@ -2,8 +2,7 @@ source("functions.R") -# Read data - +# Read data ############################################################## dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";", header = TRUE) dat0$date <- as.POSIXct(dat0$date) @@ -13,7 +12,7 @@ dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0) dat <- subset(dat0, !(dat0$event %in% c("Start Application", "Show Application"))) -# Add trace variable +# Add trace variable ##################################################### dat1 <- add_trace(dat) # Close events @@ -21,7 +20,9 @@ dat2 <- rbind(close_events(dat1, "move"), close_events(dat1, "flipCard"), close_events(dat1, "openTopic"), close_events(dat1, "openPopup")) + dat2 <- dat2[order(dat2$date.start, dat2$fileId.start), ] + # Remove durations when event spans more than one log file, since they are # not interpretable dat2[which(dat2$fileId.start != dat2$fileId.stop), "duration"] <- NA @@ -29,13 +30,12 @@ dat2[which(dat2$fileId.start != dat2$fileId.stop), "duration"] <- NA # Remove all events that do not have a 
`date.start` dat2 <- dat2[!is.na(dat2$date.start), ] rownames(dat2) <- NULL +# TODO: Throw warning about this -#summary(dat2) - -# Add case variable +# Add case variable ###################################################### dat3 <- add_case(dat2) -# Add event ID +# Add event ID ########################################################### dat3$eventId <- seq_len(nrow(dat3)) dat3 <- dat3[, c("fileId.start", "fileId.stop", "eventId", "case", "trace", "glossar", "event", "artwork", @@ -46,17 +46,19 @@ dat3 <- dat3[, c("fileId.start", "fileId.stop", "eventId", "case", "scaleSize", "rotation.start", "rotation.stop", "rotationDegree")] -# Add trace for move events +# Add trace for move events ############################################## dat4 <- add_trace_moves(dat3) -# Add topics: file names and topics +# Add topics: file names and topics ###################################### artworks <- unique(dat4$artwork) topics <- extract_topics(artworks, pattern = paste0(artworks, ".xml"), path = "../data/ContentEyevisit/eyevisit_cards_light/") dat5 <- add_topic(dat4, topics = topics) -# Export data +# TODO: Replace artwork with informative strings + +# Export data ############################################################ write.table(dat5, "../data/event_logfiles.csv", sep = ";", row.names = FALSE) diff --git a/code/functions.R b/code/functions.R index 491c20e..652c5d4 100644 --- a/code/functions.R +++ b/code/functions.R @@ -40,14 +40,11 @@ add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") { load(glossar_dict) lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ] - head(subdata2[, c("artwork", "event", "popup", "trace")], 20) - inside <- glossar_files[glossar_files %in% lut[sapply(lut$artwork, length) == 1, "glossar_file"]] single_art <- unlist(lut[lut$glossar_file %in% inside, "artwork"]) - for (file in lut$glossar_file) { artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])