Started trying out how preprocessing behaves when done on full data set
This commit is contained in:
parent
e416974906
commit
2a2eab4b9f
42
README.Rmd
42
README.Rmd
@ -427,6 +427,48 @@ out), but `topicNumber` for this artwork has 6 different entries. I will
|
||||
therefore extract the topics from `<artwork>.html`. (This seems also better
|
||||
compatible with other data sets like 8o8m.)
|
||||
|
||||
## New artworks "504" and "505" starting October 2022
|
||||
|
||||
When I read in the complete data frame for the first time, all of the
|
||||
sudden there were 72 instead of 70 artworks. It seems like these two
|
||||
artworks appear on October 21, 2022.
|
||||
|
||||
```{r}
|
||||
dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";", header = TRUE)
|
||||
dat0$date <- as.POSIXct(dat0$date)
|
||||
dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
|
||||
|
||||
# Remove irrelevant events
|
||||
dat <- subset(dat0, !(dat0$event %in% c("Start Application",
|
||||
"Show Application")))
|
||||
|
||||
summary(dat[dat$artwork %in% c("504", "505"), ])
|
||||
```
|
||||
|
||||
The artworks seem to be have updated in general after October 21, 2022.
|
||||
|
||||
```{r}
|
||||
art_before_oct2022 <- sort(unique(dat[dat$date >= "2022-10-21", "artwork"]))
|
||||
art_before_oct2022 <- sort(unique(dat[dat$date <= "2022-10-21", "artwork"]))
|
||||
# Removed artworks
|
||||
art_before_oct2022[!art_before_oct2022 %in% art_after_oct2022]
|
||||
# Additional artworks
|
||||
art_after_oct2022[!art_after_oct2022 %in% art_before_oct2022]
|
||||
```
|
||||
|
||||
The following table shows which artworks were presented in which years.
|
||||
|
||||
```{r}
|
||||
xtabs(~ artwork + lubridate::year(date), dat)
|
||||
```
|
||||
|
||||
It strongly suggests that the artworks haven been updated after the Corona
|
||||
pandemic. I think, the table was also moved to a different location at that
|
||||
point. (Check with PG to make sure.)
|
||||
|
||||
I need to get the XML files for "504" and "505" from PM in order to extract
|
||||
information on them for the metadata.
|
||||
|
||||
# Reading list
|
||||
|
||||
* @Arizmendi2022 [--]
|
||||
|
@ -3,7 +3,8 @@
|
||||
source("functions.R")
|
||||
|
||||
# Read data ##############################################################
|
||||
dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";",
|
||||
cat("########## Reading in data... ##########", "\n")
|
||||
dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";",
|
||||
header = TRUE)
|
||||
dat0$date <- as.POSIXct(dat0$date)
|
||||
dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
|
||||
@ -12,10 +13,16 @@ dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
|
||||
dat <- subset(dat0, !(dat0$event %in% c("Start Application",
|
||||
"Show Application")))
|
||||
|
||||
save(dat, file = "tmp/dat.RData")
|
||||
|
||||
# Add trace variable #####################################################
|
||||
cat("########## Adding trace variable... ##########", "\n")
|
||||
dat1 <- add_trace(dat)
|
||||
|
||||
save(dat1, file = "tmp/dat1.RData")
|
||||
|
||||
# Close events
|
||||
cat("########## Closing events...")
|
||||
dat2 <- rbind(close_events(dat1, "move"),
|
||||
close_events(dat1, "flipCard"),
|
||||
close_events(dat1, "openTopic"),
|
||||
@ -32,7 +39,10 @@ dat2 <- dat2[!is.na(dat2$date.start), ]
|
||||
rownames(dat2) <- NULL
|
||||
# TODO: Throw warning about this
|
||||
|
||||
save(dat2, file = "tmp/dat2.RData")
|
||||
|
||||
# Add case variable ######################################################
|
||||
cat("########## Adding case and eventId variables... ##########", "\n")
|
||||
dat3 <- add_case(dat2)
|
||||
|
||||
# Add event ID ###########################################################
|
||||
@ -46,19 +56,28 @@ dat3 <- dat3[, c("fileId.start", "fileId.stop", "eventId", "case",
|
||||
"scaleSize", "rotation.start", "rotation.stop",
|
||||
"rotationDegree")]
|
||||
|
||||
save(dat3, file = "tmp/dat3.RData")
|
||||
|
||||
# Add trace for move events ##############################################
|
||||
cat("########## Adding trace variable for move events... ##########", "\n")
|
||||
dat4 <- add_trace_moves(dat3)
|
||||
|
||||
save(dat4, file = "tmp/dat4.RData")
|
||||
|
||||
# Add topics: file names and topics ######################################
|
||||
cat("########## Adding information about topics... ##########", "\n")
|
||||
artworks <- unique(dat4$artwork)
|
||||
topics <- extract_topics(artworks, pattern = paste0(artworks, ".xml"),
|
||||
path = "../data/ContentEyevisit/eyevisit_cards_light/")
|
||||
|
||||
dat5 <- add_topic(dat4, topics = topics)
|
||||
|
||||
save(dat5, file = "tmp/dat5.RData")
|
||||
|
||||
# TODO: Replace artwork with informative strings
|
||||
|
||||
# Export data ############################################################
|
||||
cat("########## Exporting data frame with event logs... ##########", "\n")
|
||||
write.table(dat5, "../data/event_logfiles.csv", sep = ";",
|
||||
row.names = FALSE)
|
||||
|
||||
|
@ -9,28 +9,36 @@ add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") {
|
||||
subdata2 <- data[!data$event %in% c("Transform start", "Transform stop"), ]
|
||||
|
||||
last_event <- subdata2$event[1]
|
||||
aws <- unique(subdata2$artwork)[unique(subdata2$artwork) != "glossar"]
|
||||
artworks <- unique(subdata2$artwork)[unique(subdata2$artwork) != "glossar"]
|
||||
n <- 1 # count artworks for progress
|
||||
|
||||
for (art in aws) {
|
||||
pb <- txtProgressBar(min = 0, max = nrow(subdata2), style = 3)
|
||||
|
||||
for (artwork in artworks) {
|
||||
|
||||
cat("\n\nAdding trace variable for artwork", artwork,
|
||||
paste0("(", n, "/", length(artworks), ")"), "\n")
|
||||
|
||||
for (i in 1:nrow(subdata2)) {
|
||||
|
||||
if (last_event == "Show Info" & subdata2$artwork[i] == art) {
|
||||
if (last_event == "Show Info" & subdata2$artwork[i] == artwork) {
|
||||
subdata2$trace[i] <- i
|
||||
j <- i
|
||||
|
||||
} else if (last_event == "Show Front" & subdata2$artwork[i] == art) {
|
||||
} else if (last_event == "Show Front" & subdata2$artwork[i] == artwork) {
|
||||
subdata2$trace[i] <- j
|
||||
|
||||
} else if (!(last_event %in% c("Show Info", "Show Front")) &
|
||||
subdata2$artwork[i] == art) {
|
||||
subdata2$artwork[i] == artwork) {
|
||||
subdata2$trace[i] <- j
|
||||
}
|
||||
|
||||
if (i <= nrow(subdata2)) {
|
||||
last_event <- subdata2$event[i + 1]
|
||||
}
|
||||
setTxtProgressBar(pb, i)
|
||||
}
|
||||
n <- n + 1
|
||||
}
|
||||
|
||||
# Fix glossar entries (find corresponding artworks and fill in trace)
|
||||
@ -45,8 +53,13 @@ add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") {
|
||||
"glossar_file"]]
|
||||
single_art <- unlist(lut[lut$glossar_file %in% inside, "artwork"])
|
||||
|
||||
m <- 1
|
||||
|
||||
for (file in lut$glossar_file) {
|
||||
|
||||
cat("\n\nAdding trace variable for glossar entry", file,
|
||||
paste0("(", m, "/", length(lut$glossar_file), ")"), "\n")
|
||||
|
||||
artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])
|
||||
|
||||
for (i in seq_len(nrow(subdata2))) {
|
||||
@ -78,11 +91,13 @@ add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") {
|
||||
subdata2[i, "artwork"] <- current_artwork
|
||||
|
||||
}
|
||||
setTxtProgressBar(pb, i)
|
||||
}
|
||||
m <- m + 1
|
||||
}
|
||||
|
||||
# Exclude not matched glossar entries
|
||||
cat("INFORMATION: glossar entries that are not matched will be removed:",
|
||||
cat("\n\nINFORMATION: glossar entries that are not matched will be removed:",
|
||||
sum(is.na(subdata2[subdata2$glossar == 1, "trace"])), "entries",
|
||||
#proportions(table(is.na(subdata2[subdata2$glossar == 1, "trace"]))),
|
||||
fill = TRUE)
|
||||
@ -229,40 +244,52 @@ add_case <- function(data, cutoff = 20) {
|
||||
add_trace_moves <- function(data) {
|
||||
|
||||
cases <- unique(data$case)
|
||||
aws <- unique(data$artwork)[unique(data$artwork) != "glossar"]
|
||||
artworks <- unique(data$artwork)[unique(data$artwork) != "glossar"]
|
||||
max_trace <- max(data$trace, na.rm = TRUE) + 1
|
||||
out <- NULL
|
||||
|
||||
for (case in cases) {
|
||||
for (art in aws) {
|
||||
tmp <- data[data$case == case & data$artwork == art, ]
|
||||
if (nrow(tmp) != 0) {
|
||||
pb <- txtProgressBar(min = 0, max = length(artworks), style = 3)
|
||||
|
||||
if (length(na.omit(unique(tmp$trace))) == 1) {
|
||||
tmp[tmp$event == "move", "trace"] <- na.omit(unique(tmp$trace))
|
||||
} else if (length(na.omit(unique(tmp$trace))) > 1) {
|
||||
for (i in 1:nrow(tmp)) {
|
||||
if (tmp$event[i] == "move") {
|
||||
n <- 1 # count cases for progress
|
||||
|
||||
for (case in cases) {
|
||||
|
||||
cat("\n\nAdding trace variable for move events per case",
|
||||
paste0("(", n, "/", length(cases), ")"), "\n")
|
||||
j <- 1
|
||||
|
||||
for (artwork in artworks) {
|
||||
subdata <- data[data$case == case & data$artwork == artwork, ]
|
||||
if (nrow(subdata) != 0) {
|
||||
|
||||
if (length(na.omit(unique(subdata$trace))) == 1) {
|
||||
subdata[subdata$event == "move", "trace"] <- na.omit(unique(subdata$trace))
|
||||
} else if (length(na.omit(unique(subdata$trace))) > 1) {
|
||||
for (i in 1:nrow(subdata)) {
|
||||
if (subdata$event[i] == "move") {
|
||||
if (i == 1) {
|
||||
tmp$trace[i] <- na.omit(unique(tmp$trace))[1]
|
||||
subdata$trace[i] <- na.omit(unique(subdata$trace))[1]
|
||||
} else {
|
||||
tmp$trace[i] <- tmp$trace[i - 1]
|
||||
subdata$trace[i] <- subdata$trace[i - 1]
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (all(is.na(tmp$trace))) {
|
||||
for (i in 1:nrow(tmp)) {
|
||||
if (tmp$event[i] == "move") {
|
||||
tmp$trace[i] <- max_trace
|
||||
} else if (all(is.na(subdata$trace))) {
|
||||
for (i in 1:nrow(subdata)) {
|
||||
if (subdata$event[i] == "move") {
|
||||
subdata$trace[i] <- max_trace
|
||||
}
|
||||
}
|
||||
}
|
||||
max_trace <- max_trace + 1
|
||||
}
|
||||
if (nrow(tmp) > 0) {
|
||||
out <- rbind(out, tmp)
|
||||
if (nrow(subdata) > 0) {
|
||||
out <- rbind(out, subdata)
|
||||
}
|
||||
setTxtProgressBar(pb, j)
|
||||
j <- j + 1
|
||||
}
|
||||
n <- n + 1
|
||||
}
|
||||
out <- out[order(out$date.start, out$fileId.start), ]
|
||||
rownames(out) <- NULL
|
||||
@ -340,19 +367,19 @@ add_topic <- function(data, topics) {
|
||||
dat_split <- split(data, data$artwork)
|
||||
|
||||
set_label <- function(x) {
|
||||
art <- unique(x$artwork)
|
||||
x$topicIndex <- factor(x$topicNumber, labels = tab_index[[art]])
|
||||
artwork <- unique(x$artwork)
|
||||
x$topicIndex <- factor(x$topicNumber, labels = tab_index[[artwork]])
|
||||
x
|
||||
}
|
||||
|
||||
dat_label <- lapply(dat_split, set_label)
|
||||
|
||||
set_topic <- function(x) {
|
||||
art <- unique(x$artwork)
|
||||
labels_file <- topics[topics$artwork == art,
|
||||
artwork <- unique(x$artwork)
|
||||
labels_file <- topics[topics$artwork == artwork,
|
||||
"file_name"][as.numeric(levels(x$topicIndex))]
|
||||
x$topicFile <- as.character(factor(x$topicIndex, labels = labels_file))
|
||||
labels_topic <- topics[topics$artwork == art,
|
||||
labels_topic <- topics[topics$artwork == artwork,
|
||||
"topic"][as.numeric(levels(x$topicIndex))]
|
||||
x$topic <- as.character(factor(x$topicIndex, labels = labels_topic))
|
||||
x
|
||||
@ -360,11 +387,6 @@ add_topic <- function(data, topics) {
|
||||
|
||||
dat_topic <- lapply(dat_label, set_topic)
|
||||
|
||||
# table(dat_topic[["501"]]$topicNumber)
|
||||
# table(dat_topic[["501"]]$topicIndex)
|
||||
# table(dat_topic[["501"]]$topicFile)
|
||||
# table(dat_topic[["501"]]$topic)
|
||||
|
||||
out <- do.call(rbind, dat_topic)
|
||||
out <- out[order(out$date.start, out$fileId.start), ]
|
||||
rownames(out) <- NULL
|
||||
|
Loading…
Reference in New Issue
Block a user