diff --git a/README.Rmd b/README.Rmd index 0560b2d..2a836ea 100644 --- a/README.Rmd +++ b/README.Rmd @@ -181,6 +181,20 @@ details. UPDATE: By now I remove all events that span more than one log file. This lets me improve speed considerably. +UPDATE: Infos from Philipp: + +"Bin außerdem gerade den Code von damals durchgegangen. Das Logging läuft +so: Mit Start der Anwendung wird alle 10 Minuten ein neues Logfile +erstellt. Die Startzeit, von der aus die Duration berechnet wird, wird +jeweils neu gesetzt. Duration ist also nicht "Dauer seit Start der +Anwendung" sondern "Dauer seit Restart des Loggers". Deine Vermutung ist +also richtig - es sollte keine Durations >10 Minuten geben. Der erste +Eintrag eines Logfiles kann alles zwischen 0 und 10 Minuten sein (je +nachdem, ob der Tisch zum Zeitpunkt des neuen Logging-Intervalls in +Benutzung war). Wenn ein Case also über 2+ Logs verteilt ist, musst du auf +die Duration jeweils 10 Minuten pro Logfile nach dem ersten addieren, damit +es passt." + ## Left padding of file IDs The file names of the raw log files are automatically generated and contain diff --git a/code/01_preprocessing_haum.R b/code/01_preprocessing_haum.R index 495329e..cf3e3a6 100644 --- a/code/01_preprocessing_haum.R +++ b/code/01_preprocessing_haum.R @@ -11,21 +11,20 @@ path <- "../data/haum/LogFiles/" folders <- dir(path) # parse raw log files -datraw <- parse_logfiles(folders, path) +#datraw <- parse_logfiles(folders, path) +datraw <- read.table("../data/haum/raw_logfiles_small_2023-09-26_13-50-20.csv", + sep = ";", header = TRUE) # export data write.table(datraw, paste0("../data/haum/raw_logfiles_", now, ".csv"), sep = ";", row.names = FALSE) -#save(datraw, file = paste0("../data/haum/datraw_", now, ".RData")) -#load("../data/haum/datraw_2023-09-23_01-31-30.RData") -artworks <- unique(na.omit(datraw$artwork)) # convert to log events datlogs <- create_eventlogs(datraw, xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/") + artworks <- unique(na.omit(datlogs$artwork)) artworks <- artworks[!artworks %in% c("504", "505")] - topics <- extract_topics(artworks, xmlfiles = paste0(artworks, ".xml"), xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/") diff --git a/code/02_metadata_haum.R b/code/02_metadata_haum.R index 201cc7f..9112fad 100644 --- a/code/02_metadata_haum.R +++ b/code/02_metadata_haum.R @@ -47,7 +47,8 @@ for (i in seq_len(nrow(sf))) { # data sets... Not important here, since I only do NI. # load (small) event log data set -dat <- read.table("data/haum/event_logfiles_2023-09-23_01-31-30.csv", +#dat <- read.table("data/haum/event_logfiles_2023-09-23_01-31-30.csv", +dat <- read.table("data/haum/event_logfiles_small_2023-10-15_10-08-43.csv", sep = ";", header = TRUE) dat$date.start <- as.POSIXct(dat$date.start) dat$date.stop <- as.POSIXct(dat$date.stop) @@ -64,7 +65,7 @@ dat2 <- merge(dat1, sfdat, by = "date", all.x = TRUE) ## Export data write.table(dat2, - file = "data/haum/event_logfiles_metadata_2023-09-23_01-31-30.csv", + file = "data/haum/event_logfiles_small_metadata_2023-10-15_10-08-43.csv", sep = ";", row.names = FALSE) # TODO: Maybe add infos about artworks? diff --git a/code/04_modeling_haum.R b/code/04_modeling_haum.R index 51c92d2..8c40744 100644 --- a/code/04_modeling_haum.R +++ b/code/04_modeling_haum.R @@ -4,14 +4,14 @@ # dat0 <- read.table("../data/haum/event_logfiles_metadata_2023-09-23_01-31-30.csv", # sep = ";", header = TRUE) -dat0 <- read.table("../data/haum/event_logfiles_small_metadata_2023-09-25_09-56-34.csv", +dat0 <- read.table("../data/haum/event_logfiles_small_metadata_2023-10-15_10-08-43.csv", sep = ";", header = TRUE) dat0$date <- as.Date(dat0$date) dat0$date.start <- as.POSIXct(dat0$date.start) dat0$date.stop <- as.POSIXct(dat0$date.stop) dat0$artwork <- sprintf("%03d", dat0$artwork) -# TODO: Write a functions that closes events spanning different log files +# TODO: Write a function that closes events spanning different log files # OR: Remove openTopic and OpenPopup events that do not start with a # flipCard (AND openPopup events without openTopic event beforehand) @@ -48,16 +48,46 @@ table(table(dat$start)) summary(aggregate(duration ~ trace, dat, mean)) +# remove fragmented traces +tab <- xtabs( ~ trace + event, dat) -alog <- activitylog(dat, +fragments <- NULL + +for (i in seq_len(nrow(tab))) { + if (tab[i, "openPopup"] != 0 & tab[i, "flipCard"] == 0) { + fragments <- c(fragments, rownames(tab)[i]) + } else if (tab[i, "openTopic"] != 0 & tab[i, "flipCard"] == 0) { + fragments <- c(fragments, rownames(tab)[i]) + } else if (tab[i, "openPopup"] != 0 & tab[i, "openTopic"] == 0) { + fragments <- c(fragments, rownames(tab)[i]) + } +} +datrm <- dat[!dat$trace %in% fragments, ] + +# TODO: Find trace that has flipCard --> openPopup --> openTopic +dato <- datrm[datrm$event != "move", ] +tmp <- lapply(unique(dato$trace), function(x) unique(dato[dato$trace == x, "event"])) +names(tmp) <- unique(dato$trace) + +ids <- sapply(tmp, length) == 3 +do.call(rbind, tmp[ids]) + +# TODO: +# fragmentary traces: for 4591 openTopic for topic 1 is in the raw log files, but gets +# probably removed in close_events(); how can I prevent that? How can I fix +# the traces and eventIds that do not match correctly ??? +ct <- c(4591, 5937, 7080, 8412, 8279) +datrm[datrm$trace %in% ct, 1:10] + +### WHY????? + +alog <- activitylog(datrm, case_id = "trace", activity_id = "event", #resource_id = "case", resource_id = "artwork", timestamps = c("start", "complete")) -# --> have not understood, yet, which ist what... - process_map(alog) process_map(alog, frequency("relative")) diff --git a/code/check_traces.R b/code/check_traces.R index cd7be08..cf26e79 100644 --- a/code/check_traces.R +++ b/code/check_traces.R @@ -2,10 +2,10 @@ # Read data -dat <- read.table("../data/haum/event_logfiles_metadata_2023-09-23_01-31-30.csv", - sep = ";", header = TRUE) -# dat <- read.table("../data/haum/event_logfiles_small_metadata_2023-09-25_09-56-34.csv", -# sep = ";", header = TRUE) +# dat <- read.table("../data/haum/event_logfiles_metadata_2023-09-23_01-31-30.csv", +# sep = ";", header = TRUE) +dat <- read.table("../data/haum/event_logfiles_small_metadata_2023-10-15_10-08-43.csv", + sep = ";", header = TRUE) dat$date <- as.Date(dat$date) dat$date.start <- as.POSIXct(dat$date.start) dat$date.stop <- as.POSIXct(dat$date.stop)