Started trying out how preprocessing behaves when done on full data set

This commit is contained in:
Nora Wickelmaier 2023-09-15 16:22:21 +02:00
parent e416974906
commit 2a2eab4b9f
3 changed files with 118 additions and 35 deletions

View File

@ -427,6 +427,48 @@ out), but `topicNumber` for this artwork has 6 different entries. I will
therefore extract the topics from `<artwork>.html`. (This seems also better therefore extract the topics from `<artwork>.html`. (This seems also better
compatible with other data sets like 8o8m.) compatible with other data sets like 8o8m.)
## New artworks "504" and "505" starting October 2022
When I read in the complete data frame for the first time, all of the
sudden there were 72 instead of 70 artworks. It seems like these two
artworks appear on October 21, 2022.
```{r}
dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";", header = TRUE)
dat0$date <- as.POSIXct(dat0$date)
dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
# Remove irrelevant events
dat <- subset(dat0, !(dat0$event %in% c("Start Application",
"Show Application")))
summary(dat[dat$artwork %in% c("504", "505"), ])
```
The artworks seem to be have updated in general after October 21, 2022.
```{r}
art_before_oct2022 <- sort(unique(dat[dat$date >= "2022-10-21", "artwork"]))
art_before_oct2022 <- sort(unique(dat[dat$date <= "2022-10-21", "artwork"]))
# Removed artworks
art_before_oct2022[!art_before_oct2022 %in% art_after_oct2022]
# Additional artworks
art_after_oct2022[!art_after_oct2022 %in% art_before_oct2022]
```
The following table shows which artworks were presented in which years.
```{r}
xtabs(~ artwork + lubridate::year(date), dat)
```
It strongly suggests that the artworks haven been updated after the Corona
pandemic. I think, the table was also moved to a different location at that
point. (Check with PG to make sure.)
I need to get the XML files for "504" and "505" from PM in order to extract
information on them for the metadata.
# Reading list # Reading list
* @Arizmendi2022 [--] * @Arizmendi2022 [--]

View File

@ -3,7 +3,8 @@
source("functions.R") source("functions.R")
# Read data ############################################################## # Read data ##############################################################
dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";", cat("########## Reading in data... ##########", "\n")
dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";",
header = TRUE) header = TRUE)
dat0$date <- as.POSIXct(dat0$date) dat0$date <- as.POSIXct(dat0$date)
dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0) dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
@ -12,10 +13,16 @@ dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
dat <- subset(dat0, !(dat0$event %in% c("Start Application", dat <- subset(dat0, !(dat0$event %in% c("Start Application",
"Show Application"))) "Show Application")))
save(dat, file = "tmp/dat.RData")
# Add trace variable ##################################################### # Add trace variable #####################################################
cat("########## Adding trace variable... ##########", "\n")
dat1 <- add_trace(dat) dat1 <- add_trace(dat)
save(dat1, file = "tmp/dat1.RData")
# Close events # Close events
cat("########## Closing events...")
dat2 <- rbind(close_events(dat1, "move"), dat2 <- rbind(close_events(dat1, "move"),
close_events(dat1, "flipCard"), close_events(dat1, "flipCard"),
close_events(dat1, "openTopic"), close_events(dat1, "openTopic"),
@ -32,7 +39,10 @@ dat2 <- dat2[!is.na(dat2$date.start), ]
rownames(dat2) <- NULL rownames(dat2) <- NULL
# TODO: Throw warning about this # TODO: Throw warning about this
save(dat2, file = "tmp/dat2.RData")
# Add case variable ###################################################### # Add case variable ######################################################
cat("########## Adding case and eventId variables... ##########", "\n")
dat3 <- add_case(dat2) dat3 <- add_case(dat2)
# Add event ID ########################################################### # Add event ID ###########################################################
@ -46,19 +56,28 @@ dat3 <- dat3[, c("fileId.start", "fileId.stop", "eventId", "case",
"scaleSize", "rotation.start", "rotation.stop", "scaleSize", "rotation.start", "rotation.stop",
"rotationDegree")] "rotationDegree")]
save(dat3, file = "tmp/dat3.RData")
# Add trace for move events ############################################## # Add trace for move events ##############################################
cat("########## Adding trace variable for move events... ##########", "\n")
dat4 <- add_trace_moves(dat3) dat4 <- add_trace_moves(dat3)
save(dat4, file = "tmp/dat4.RData")
# Add topics: file names and topics ###################################### # Add topics: file names and topics ######################################
cat("########## Adding information about topics... ##########", "\n")
artworks <- unique(dat4$artwork) artworks <- unique(dat4$artwork)
topics <- extract_topics(artworks, pattern = paste0(artworks, ".xml"), topics <- extract_topics(artworks, pattern = paste0(artworks, ".xml"),
path = "../data/ContentEyevisit/eyevisit_cards_light/") path = "../data/ContentEyevisit/eyevisit_cards_light/")
dat5 <- add_topic(dat4, topics = topics) dat5 <- add_topic(dat4, topics = topics)
save(dat5, file = "tmp/dat5.RData")
# TODO: Replace artwork with informative strings # TODO: Replace artwork with informative strings
# Export data ############################################################ # Export data ############################################################
cat("########## Exporting data frame with event logs... ##########", "\n")
write.table(dat5, "../data/event_logfiles.csv", sep = ";", write.table(dat5, "../data/event_logfiles.csv", sep = ";",
row.names = FALSE) row.names = FALSE)

View File

@ -9,28 +9,36 @@ add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") {
subdata2 <- data[!data$event %in% c("Transform start", "Transform stop"), ] subdata2 <- data[!data$event %in% c("Transform start", "Transform stop"), ]
last_event <- subdata2$event[1] last_event <- subdata2$event[1]
aws <- unique(subdata2$artwork)[unique(subdata2$artwork) != "glossar"] artworks <- unique(subdata2$artwork)[unique(subdata2$artwork) != "glossar"]
n <- 1 # count artworks for progress
for (art in aws) { pb <- txtProgressBar(min = 0, max = nrow(subdata2), style = 3)
for (artwork in artworks) {
cat("\n\nAdding trace variable for artwork", artwork,
paste0("(", n, "/", length(artworks), ")"), "\n")
for (i in 1:nrow(subdata2)) { for (i in 1:nrow(subdata2)) {
if (last_event == "Show Info" & subdata2$artwork[i] == art) { if (last_event == "Show Info" & subdata2$artwork[i] == artwork) {
subdata2$trace[i] <- i subdata2$trace[i] <- i
j <- i j <- i
} else if (last_event == "Show Front" & subdata2$artwork[i] == art) { } else if (last_event == "Show Front" & subdata2$artwork[i] == artwork) {
subdata2$trace[i] <- j subdata2$trace[i] <- j
} else if (!(last_event %in% c("Show Info", "Show Front")) & } else if (!(last_event %in% c("Show Info", "Show Front")) &
subdata2$artwork[i] == art) { subdata2$artwork[i] == artwork) {
subdata2$trace[i] <- j subdata2$trace[i] <- j
} }
if (i <= nrow(subdata2)) { if (i <= nrow(subdata2)) {
last_event <- subdata2$event[i + 1] last_event <- subdata2$event[i + 1]
} }
setTxtProgressBar(pb, i)
} }
n <- n + 1
} }
# Fix glossar entries (find corresponding artworks and fill in trace) # Fix glossar entries (find corresponding artworks and fill in trace)
@ -45,8 +53,13 @@ add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") {
"glossar_file"]] "glossar_file"]]
single_art <- unlist(lut[lut$glossar_file %in% inside, "artwork"]) single_art <- unlist(lut[lut$glossar_file %in% inside, "artwork"])
m <- 1
for (file in lut$glossar_file) { for (file in lut$glossar_file) {
cat("\n\nAdding trace variable for glossar entry", file,
paste0("(", m, "/", length(lut$glossar_file), ")"), "\n")
artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"]) artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])
for (i in seq_len(nrow(subdata2))) { for (i in seq_len(nrow(subdata2))) {
@ -78,11 +91,13 @@ add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") {
subdata2[i, "artwork"] <- current_artwork subdata2[i, "artwork"] <- current_artwork
} }
setTxtProgressBar(pb, i)
} }
m <- m + 1
} }
# Exclude not matched glossar entries # Exclude not matched glossar entries
cat("INFORMATION: glossar entries that are not matched will be removed:", cat("\n\nINFORMATION: glossar entries that are not matched will be removed:",
sum(is.na(subdata2[subdata2$glossar == 1, "trace"])), "entries", sum(is.na(subdata2[subdata2$glossar == 1, "trace"])), "entries",
#proportions(table(is.na(subdata2[subdata2$glossar == 1, "trace"]))), #proportions(table(is.na(subdata2[subdata2$glossar == 1, "trace"]))),
fill = TRUE) fill = TRUE)
@ -229,40 +244,52 @@ add_case <- function(data, cutoff = 20) {
add_trace_moves <- function(data) { add_trace_moves <- function(data) {
cases <- unique(data$case) cases <- unique(data$case)
aws <- unique(data$artwork)[unique(data$artwork) != "glossar"] artworks <- unique(data$artwork)[unique(data$artwork) != "glossar"]
max_trace <- max(data$trace, na.rm = TRUE) + 1 max_trace <- max(data$trace, na.rm = TRUE) + 1
out <- NULL out <- NULL
for (case in cases) { pb <- txtProgressBar(min = 0, max = length(artworks), style = 3)
for (art in aws) {
tmp <- data[data$case == case & data$artwork == art, ]
if (nrow(tmp) != 0) {
if (length(na.omit(unique(tmp$trace))) == 1) { n <- 1 # count cases for progress
tmp[tmp$event == "move", "trace"] <- na.omit(unique(tmp$trace))
} else if (length(na.omit(unique(tmp$trace))) > 1) { for (case in cases) {
for (i in 1:nrow(tmp)) {
if (tmp$event[i] == "move") { cat("\n\nAdding trace variable for move events per case",
paste0("(", n, "/", length(cases), ")"), "\n")
j <- 1
for (artwork in artworks) {
subdata <- data[data$case == case & data$artwork == artwork, ]
if (nrow(subdata) != 0) {
if (length(na.omit(unique(subdata$trace))) == 1) {
subdata[subdata$event == "move", "trace"] <- na.omit(unique(subdata$trace))
} else if (length(na.omit(unique(subdata$trace))) > 1) {
for (i in 1:nrow(subdata)) {
if (subdata$event[i] == "move") {
if (i == 1) { if (i == 1) {
tmp$trace[i] <- na.omit(unique(tmp$trace))[1] subdata$trace[i] <- na.omit(unique(subdata$trace))[1]
} else { } else {
tmp$trace[i] <- tmp$trace[i - 1] subdata$trace[i] <- subdata$trace[i - 1]
} }
} }
} }
} else if (all(is.na(tmp$trace))) { } else if (all(is.na(subdata$trace))) {
for (i in 1:nrow(tmp)) { for (i in 1:nrow(subdata)) {
if (tmp$event[i] == "move") { if (subdata$event[i] == "move") {
tmp$trace[i] <- max_trace subdata$trace[i] <- max_trace
} }
} }
} }
max_trace <- max_trace + 1 max_trace <- max_trace + 1
} }
if (nrow(tmp) > 0) { if (nrow(subdata) > 0) {
out <- rbind(out, tmp) out <- rbind(out, subdata)
} }
setTxtProgressBar(pb, j)
j <- j + 1
} }
n <- n + 1
} }
out <- out[order(out$date.start, out$fileId.start), ] out <- out[order(out$date.start, out$fileId.start), ]
rownames(out) <- NULL rownames(out) <- NULL
@ -340,19 +367,19 @@ add_topic <- function(data, topics) {
dat_split <- split(data, data$artwork) dat_split <- split(data, data$artwork)
set_label <- function(x) { set_label <- function(x) {
art <- unique(x$artwork) artwork <- unique(x$artwork)
x$topicIndex <- factor(x$topicNumber, labels = tab_index[[art]]) x$topicIndex <- factor(x$topicNumber, labels = tab_index[[artwork]])
x x
} }
dat_label <- lapply(dat_split, set_label) dat_label <- lapply(dat_split, set_label)
set_topic <- function(x) { set_topic <- function(x) {
art <- unique(x$artwork) artwork <- unique(x$artwork)
labels_file <- topics[topics$artwork == art, labels_file <- topics[topics$artwork == artwork,
"file_name"][as.numeric(levels(x$topicIndex))] "file_name"][as.numeric(levels(x$topicIndex))]
x$topicFile <- as.character(factor(x$topicIndex, labels = labels_file)) x$topicFile <- as.character(factor(x$topicIndex, labels = labels_file))
labels_topic <- topics[topics$artwork == art, labels_topic <- topics[topics$artwork == artwork,
"topic"][as.numeric(levels(x$topicIndex))] "topic"][as.numeric(levels(x$topicIndex))]
x$topic <- as.character(factor(x$topicIndex, labels = labels_topic)) x$topic <- as.character(factor(x$topicIndex, labels = labels_topic))
x x
@ -360,11 +387,6 @@ add_topic <- function(data, topics) {
dat_topic <- lapply(dat_label, set_topic) dat_topic <- lapply(dat_label, set_topic)
# table(dat_topic[["501"]]$topicNumber)
# table(dat_topic[["501"]]$topicIndex)
# table(dat_topic[["501"]]$topicFile)
# table(dat_topic[["501"]]$topic)
out <- do.call(rbind, dat_topic) out <- do.call(rbind, dat_topic)
out <- out[order(out$date.start, out$fileId.start), ] out <- out[order(out$date.start, out$fileId.start), ]
rownames(out) <- NULL rownames(out) <- NULL