From 2a2eab4b9f8ee5bfa924643a4cc1c01bccb36fc1 Mon Sep 17 00:00:00 2001 From: nwickel Date: Fri, 15 Sep 2023 16:22:21 +0200 Subject: [PATCH] Started trying out how preprocessing behaves when done on full data set --- README.Rmd | 42 +++++++++++++++++++ code/02_preprocessing.R | 21 +++++++++- code/functions.R | 90 +++++++++++++++++++++++++---------------- 3 files changed, 118 insertions(+), 35 deletions(-) diff --git a/README.Rmd b/README.Rmd index 9b7be92..7a15a5d 100644 --- a/README.Rmd +++ b/README.Rmd @@ -427,6 +427,48 @@ out), but `topicNumber` for this artwork has 6 different entries. I will therefore extract the topics from `.html`. (This seems also better compatible with other data sets like 8o8m.) +## New artworks "504" and "505" starting October 2022 + +When I read in the complete data frame for the first time, all of the +sudden there were 72 instead of 70 artworks. It seems like these two +artworks appear on October 21, 2022. + +```{r} +dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";", header = TRUE) +dat0$date <- as.POSIXct(dat0$date) +dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0) + +# Remove irrelevant events +dat <- subset(dat0, !(dat0$event %in% c("Start Application", + "Show Application"))) + +summary(dat[dat$artwork %in% c("504", "505"), ]) +``` + +The artworks seem to be have updated in general after October 21, 2022. + +```{r} +art_before_oct2022 <- sort(unique(dat[dat$date >= "2022-10-21", "artwork"])) +art_before_oct2022 <- sort(unique(dat[dat$date <= "2022-10-21", "artwork"])) +# Removed artworks +art_before_oct2022[!art_before_oct2022 %in% art_after_oct2022] +# Additional artworks +art_after_oct2022[!art_after_oct2022 %in% art_before_oct2022] +``` + +The following table shows which artworks were presented in which years. + +```{r} +xtabs(~ artwork + lubridate::year(date), dat) +``` + +It strongly suggests that the artworks haven been updated after the Corona +pandemic. I think, the table was also moved to a different location at that +point. (Check with PG to make sure.) + +I need to get the XML files for "504" and "505" from PM in order to extract +information on them for the metadata. + # Reading list * @Arizmendi2022 [--] diff --git a/code/02_preprocessing.R b/code/02_preprocessing.R index 693c1b5..843bd95 100644 --- a/code/02_preprocessing.R +++ b/code/02_preprocessing.R @@ -3,7 +3,8 @@ source("functions.R") # Read data ############################################################## -dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";", +cat("########## Reading in data... ##########", "\n") +dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";", header = TRUE) dat0$date <- as.POSIXct(dat0$date) dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0) @@ -12,10 +13,16 @@ dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0) dat <- subset(dat0, !(dat0$event %in% c("Start Application", "Show Application"))) +save(dat, file = "tmp/dat.RData") + # Add trace variable ##################################################### +cat("########## Adding trace variable... ##########", "\n") dat1 <- add_trace(dat) +save(dat1, file = "tmp/dat1.RData") + # Close events +cat("########## Closing events...") dat2 <- rbind(close_events(dat1, "move"), close_events(dat1, "flipCard"), close_events(dat1, "openTopic"), @@ -32,7 +39,10 @@ dat2 <- dat2[!is.na(dat2$date.start), ] rownames(dat2) <- NULL # TODO: Throw warning about this +save(dat2, file = "tmp/dat2.RData") + # Add case variable ###################################################### +cat("########## Adding case and eventId variables... ##########", "\n") dat3 <- add_case(dat2) # Add event ID ########################################################### @@ -46,19 +56,28 @@ dat3 <- dat3[, c("fileId.start", "fileId.stop", "eventId", "case", "scaleSize", "rotation.start", "rotation.stop", "rotationDegree")] +save(dat3, file = "tmp/dat3.RData") + # Add trace for move events ############################################## +cat("########## Adding trace variable for move events... ##########", "\n") dat4 <- add_trace_moves(dat3) +save(dat4, file = "tmp/dat4.RData") + # Add topics: file names and topics ###################################### +cat("########## Adding information about topics... ##########", "\n") artworks <- unique(dat4$artwork) topics <- extract_topics(artworks, pattern = paste0(artworks, ".xml"), path = "../data/ContentEyevisit/eyevisit_cards_light/") dat5 <- add_topic(dat4, topics = topics) +save(dat5, file = "tmp/dat5.RData") + # TODO: Replace artwork with informative strings # Export data ############################################################ +cat("########## Exporting data frame with event logs... ##########", "\n") write.table(dat5, "../data/event_logfiles.csv", sep = ";", row.names = FALSE) diff --git a/code/functions.R b/code/functions.R index 857adec..1f8bfb4 100644 --- a/code/functions.R +++ b/code/functions.R @@ -9,28 +9,36 @@ add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") { subdata2 <- data[!data$event %in% c("Transform start", "Transform stop"), ] last_event <- subdata2$event[1] - aws <- unique(subdata2$artwork)[unique(subdata2$artwork) != "glossar"] + artworks <- unique(subdata2$artwork)[unique(subdata2$artwork) != "glossar"] + n <- 1 # count artworks for progress - for (art in aws) { + pb <- txtProgressBar(min = 0, max = nrow(subdata2), style = 3) + + for (artwork in artworks) { + + cat("\n\nAdding trace variable for artwork", artwork, + paste0("(", n, "/", length(artworks), ")"), "\n") for (i in 1:nrow(subdata2)) { - if (last_event == "Show Info" & subdata2$artwork[i] == art) { + if (last_event == "Show Info" & subdata2$artwork[i] == artwork) { subdata2$trace[i] <- i j <- i - } else if (last_event == "Show Front" & subdata2$artwork[i] == art) { + } else if (last_event == "Show Front" & subdata2$artwork[i] == artwork) { subdata2$trace[i] <- j } else if (!(last_event %in% c("Show Info", "Show Front")) & - subdata2$artwork[i] == art) { + subdata2$artwork[i] == artwork) { subdata2$trace[i] <- j } if (i <= nrow(subdata2)) { last_event <- subdata2$event[i + 1] } + setTxtProgressBar(pb, i) } + n <- n + 1 } # Fix glossar entries (find corresponding artworks and fill in trace) @@ -45,8 +53,13 @@ add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") { "glossar_file"]] single_art <- unlist(lut[lut$glossar_file %in% inside, "artwork"]) + m <- 1 + for (file in lut$glossar_file) { + cat("\n\nAdding trace variable for glossar entry", file, + paste0("(", m, "/", length(lut$glossar_file), ")"), "\n") + artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"]) for (i in seq_len(nrow(subdata2))) { @@ -78,11 +91,13 @@ add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") { subdata2[i, "artwork"] <- current_artwork } + setTxtProgressBar(pb, i) } + m <- m + 1 } # Exclude not matched glossar entries - cat("INFORMATION: glossar entries that are not matched will be removed:", + cat("\n\nINFORMATION: glossar entries that are not matched will be removed:", sum(is.na(subdata2[subdata2$glossar == 1, "trace"])), "entries", #proportions(table(is.na(subdata2[subdata2$glossar == 1, "trace"]))), fill = TRUE) @@ -229,40 +244,52 @@ add_case <- function(data, cutoff = 20) { add_trace_moves <- function(data) { cases <- unique(data$case) - aws <- unique(data$artwork)[unique(data$artwork) != "glossar"] + artworks <- unique(data$artwork)[unique(data$artwork) != "glossar"] max_trace <- max(data$trace, na.rm = TRUE) + 1 out <- NULL - for (case in cases) { - for (art in aws) { - tmp <- data[data$case == case & data$artwork == art, ] - if (nrow(tmp) != 0) { + pb <- txtProgressBar(min = 0, max = length(artworks), style = 3) - if (length(na.omit(unique(tmp$trace))) == 1) { - tmp[tmp$event == "move", "trace"] <- na.omit(unique(tmp$trace)) - } else if (length(na.omit(unique(tmp$trace))) > 1) { - for (i in 1:nrow(tmp)) { - if (tmp$event[i] == "move") { + n <- 1 # count cases for progress + + for (case in cases) { + + cat("\n\nAdding trace variable for move events per case", + paste0("(", n, "/", length(cases), ")"), "\n") + j <- 1 + + for (artwork in artworks) { + subdata <- data[data$case == case & data$artwork == artwork, ] + if (nrow(subdata) != 0) { + + if (length(na.omit(unique(subdata$trace))) == 1) { + subdata[subdata$event == "move", "trace"] <- na.omit(unique(subdata$trace)) + } else if (length(na.omit(unique(subdata$trace))) > 1) { + for (i in 1:nrow(subdata)) { + if (subdata$event[i] == "move") { if (i == 1) { - tmp$trace[i] <- na.omit(unique(tmp$trace))[1] + subdata$trace[i] <- na.omit(unique(subdata$trace))[1] } else { - tmp$trace[i] <- tmp$trace[i - 1] + subdata$trace[i] <- subdata$trace[i - 1] } } } - } else if (all(is.na(tmp$trace))) { - for (i in 1:nrow(tmp)) { - if (tmp$event[i] == "move") { - tmp$trace[i] <- max_trace + } else if (all(is.na(subdata$trace))) { + for (i in 1:nrow(subdata)) { + if (subdata$event[i] == "move") { + subdata$trace[i] <- max_trace } } } max_trace <- max_trace + 1 } - if (nrow(tmp) > 0) { - out <- rbind(out, tmp) + if (nrow(subdata) > 0) { + out <- rbind(out, subdata) } + setTxtProgressBar(pb, j) + j <- j + 1 } + n <- n + 1 } out <- out[order(out$date.start, out$fileId.start), ] rownames(out) <- NULL @@ -340,19 +367,19 @@ add_topic <- function(data, topics) { dat_split <- split(data, data$artwork) set_label <- function(x) { - art <- unique(x$artwork) - x$topicIndex <- factor(x$topicNumber, labels = tab_index[[art]]) + artwork <- unique(x$artwork) + x$topicIndex <- factor(x$topicNumber, labels = tab_index[[artwork]]) x } dat_label <- lapply(dat_split, set_label) set_topic <- function(x) { - art <- unique(x$artwork) - labels_file <- topics[topics$artwork == art, + artwork <- unique(x$artwork) + labels_file <- topics[topics$artwork == artwork, "file_name"][as.numeric(levels(x$topicIndex))] x$topicFile <- as.character(factor(x$topicIndex, labels = labels_file)) - labels_topic <- topics[topics$artwork == art, + labels_topic <- topics[topics$artwork == artwork, "topic"][as.numeric(levels(x$topicIndex))] x$topic <- as.character(factor(x$topicIndex, labels = labels_topic)) x @@ -360,11 +387,6 @@ add_topic <- function(data, topics) { dat_topic <- lapply(dat_label, set_topic) - # table(dat_topic[["501"]]$topicNumber) - # table(dat_topic[["501"]]$topicIndex) - # table(dat_topic[["501"]]$topicFile) - # table(dat_topic[["501"]]$topic) - out <- do.call(rbind, dat_topic) out <- out[order(out$date.start, out$fileId.start), ] rownames(out) <- NULL