Started trying out how preprocessing behaves when done on full data set
This commit is contained in:
parent
e416974906
commit
2a2eab4b9f
42
README.Rmd
42
README.Rmd
@ -427,6 +427,48 @@ out), but `topicNumber` for this artwork has 6 different entries. I will
|
|||||||
therefore extract the topics from `<artwork>.html`. (This seems also better
|
therefore extract the topics from `<artwork>.html`. (This seems also better
|
||||||
compatible with other data sets like 8o8m.)
|
compatible with other data sets like 8o8m.)
|
||||||
|
|
||||||
|
## New artworks "504" and "505" starting October 2022
|
||||||
|
|
||||||
|
When I read in the complete data frame for the first time, all of the
|
||||||
|
sudden there were 72 instead of 70 artworks. It seems like these two
|
||||||
|
artworks appear on October 21, 2022.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";", header = TRUE)
|
||||||
|
dat0$date <- as.POSIXct(dat0$date)
|
||||||
|
dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
|
||||||
|
|
||||||
|
# Remove irrelevant events
|
||||||
|
dat <- subset(dat0, !(dat0$event %in% c("Start Application",
|
||||||
|
"Show Application")))
|
||||||
|
|
||||||
|
summary(dat[dat$artwork %in% c("504", "505"), ])
|
||||||
|
```
|
||||||
|
|
||||||
|
The artworks seem to be have updated in general after October 21, 2022.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
art_before_oct2022 <- sort(unique(dat[dat$date >= "2022-10-21", "artwork"]))
|
||||||
|
art_before_oct2022 <- sort(unique(dat[dat$date <= "2022-10-21", "artwork"]))
|
||||||
|
# Removed artworks
|
||||||
|
art_before_oct2022[!art_before_oct2022 %in% art_after_oct2022]
|
||||||
|
# Additional artworks
|
||||||
|
art_after_oct2022[!art_after_oct2022 %in% art_before_oct2022]
|
||||||
|
```
|
||||||
|
|
||||||
|
The following table shows which artworks were presented in which years.
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
xtabs(~ artwork + lubridate::year(date), dat)
|
||||||
|
```
|
||||||
|
|
||||||
|
It strongly suggests that the artworks haven been updated after the Corona
|
||||||
|
pandemic. I think, the table was also moved to a different location at that
|
||||||
|
point. (Check with PG to make sure.)
|
||||||
|
|
||||||
|
I need to get the XML files for "504" and "505" from PM in order to extract
|
||||||
|
information on them for the metadata.
|
||||||
|
|
||||||
# Reading list
|
# Reading list
|
||||||
|
|
||||||
* @Arizmendi2022 [--]
|
* @Arizmendi2022 [--]
|
||||||
|
@ -3,7 +3,8 @@
|
|||||||
source("functions.R")
|
source("functions.R")
|
||||||
|
|
||||||
# Read data ##############################################################
|
# Read data ##############################################################
|
||||||
dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";",
|
cat("########## Reading in data... ##########", "\n")
|
||||||
|
dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";",
|
||||||
header = TRUE)
|
header = TRUE)
|
||||||
dat0$date <- as.POSIXct(dat0$date)
|
dat0$date <- as.POSIXct(dat0$date)
|
||||||
dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
|
dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
|
||||||
@ -12,10 +13,16 @@ dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
|
|||||||
dat <- subset(dat0, !(dat0$event %in% c("Start Application",
|
dat <- subset(dat0, !(dat0$event %in% c("Start Application",
|
||||||
"Show Application")))
|
"Show Application")))
|
||||||
|
|
||||||
|
save(dat, file = "tmp/dat.RData")
|
||||||
|
|
||||||
# Add trace variable #####################################################
|
# Add trace variable #####################################################
|
||||||
|
cat("########## Adding trace variable... ##########", "\n")
|
||||||
dat1 <- add_trace(dat)
|
dat1 <- add_trace(dat)
|
||||||
|
|
||||||
|
save(dat1, file = "tmp/dat1.RData")
|
||||||
|
|
||||||
# Close events
|
# Close events
|
||||||
|
cat("########## Closing events...")
|
||||||
dat2 <- rbind(close_events(dat1, "move"),
|
dat2 <- rbind(close_events(dat1, "move"),
|
||||||
close_events(dat1, "flipCard"),
|
close_events(dat1, "flipCard"),
|
||||||
close_events(dat1, "openTopic"),
|
close_events(dat1, "openTopic"),
|
||||||
@ -32,7 +39,10 @@ dat2 <- dat2[!is.na(dat2$date.start), ]
|
|||||||
rownames(dat2) <- NULL
|
rownames(dat2) <- NULL
|
||||||
# TODO: Throw warning about this
|
# TODO: Throw warning about this
|
||||||
|
|
||||||
|
save(dat2, file = "tmp/dat2.RData")
|
||||||
|
|
||||||
# Add case variable ######################################################
|
# Add case variable ######################################################
|
||||||
|
cat("########## Adding case and eventId variables... ##########", "\n")
|
||||||
dat3 <- add_case(dat2)
|
dat3 <- add_case(dat2)
|
||||||
|
|
||||||
# Add event ID ###########################################################
|
# Add event ID ###########################################################
|
||||||
@ -46,19 +56,28 @@ dat3 <- dat3[, c("fileId.start", "fileId.stop", "eventId", "case",
|
|||||||
"scaleSize", "rotation.start", "rotation.stop",
|
"scaleSize", "rotation.start", "rotation.stop",
|
||||||
"rotationDegree")]
|
"rotationDegree")]
|
||||||
|
|
||||||
|
save(dat3, file = "tmp/dat3.RData")
|
||||||
|
|
||||||
# Add trace for move events ##############################################
|
# Add trace for move events ##############################################
|
||||||
|
cat("########## Adding trace variable for move events... ##########", "\n")
|
||||||
dat4 <- add_trace_moves(dat3)
|
dat4 <- add_trace_moves(dat3)
|
||||||
|
|
||||||
|
save(dat4, file = "tmp/dat4.RData")
|
||||||
|
|
||||||
# Add topics: file names and topics ######################################
|
# Add topics: file names and topics ######################################
|
||||||
|
cat("########## Adding information about topics... ##########", "\n")
|
||||||
artworks <- unique(dat4$artwork)
|
artworks <- unique(dat4$artwork)
|
||||||
topics <- extract_topics(artworks, pattern = paste0(artworks, ".xml"),
|
topics <- extract_topics(artworks, pattern = paste0(artworks, ".xml"),
|
||||||
path = "../data/ContentEyevisit/eyevisit_cards_light/")
|
path = "../data/ContentEyevisit/eyevisit_cards_light/")
|
||||||
|
|
||||||
dat5 <- add_topic(dat4, topics = topics)
|
dat5 <- add_topic(dat4, topics = topics)
|
||||||
|
|
||||||
|
save(dat5, file = "tmp/dat5.RData")
|
||||||
|
|
||||||
# TODO: Replace artwork with informative strings
|
# TODO: Replace artwork with informative strings
|
||||||
|
|
||||||
# Export data ############################################################
|
# Export data ############################################################
|
||||||
|
cat("########## Exporting data frame with event logs... ##########", "\n")
|
||||||
write.table(dat5, "../data/event_logfiles.csv", sep = ";",
|
write.table(dat5, "../data/event_logfiles.csv", sep = ";",
|
||||||
row.names = FALSE)
|
row.names = FALSE)
|
||||||
|
|
||||||
|
@ -9,28 +9,36 @@ add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") {
|
|||||||
subdata2 <- data[!data$event %in% c("Transform start", "Transform stop"), ]
|
subdata2 <- data[!data$event %in% c("Transform start", "Transform stop"), ]
|
||||||
|
|
||||||
last_event <- subdata2$event[1]
|
last_event <- subdata2$event[1]
|
||||||
aws <- unique(subdata2$artwork)[unique(subdata2$artwork) != "glossar"]
|
artworks <- unique(subdata2$artwork)[unique(subdata2$artwork) != "glossar"]
|
||||||
|
n <- 1 # count artworks for progress
|
||||||
|
|
||||||
for (art in aws) {
|
pb <- txtProgressBar(min = 0, max = nrow(subdata2), style = 3)
|
||||||
|
|
||||||
|
for (artwork in artworks) {
|
||||||
|
|
||||||
|
cat("\n\nAdding trace variable for artwork", artwork,
|
||||||
|
paste0("(", n, "/", length(artworks), ")"), "\n")
|
||||||
|
|
||||||
for (i in 1:nrow(subdata2)) {
|
for (i in 1:nrow(subdata2)) {
|
||||||
|
|
||||||
if (last_event == "Show Info" & subdata2$artwork[i] == art) {
|
if (last_event == "Show Info" & subdata2$artwork[i] == artwork) {
|
||||||
subdata2$trace[i] <- i
|
subdata2$trace[i] <- i
|
||||||
j <- i
|
j <- i
|
||||||
|
|
||||||
} else if (last_event == "Show Front" & subdata2$artwork[i] == art) {
|
} else if (last_event == "Show Front" & subdata2$artwork[i] == artwork) {
|
||||||
subdata2$trace[i] <- j
|
subdata2$trace[i] <- j
|
||||||
|
|
||||||
} else if (!(last_event %in% c("Show Info", "Show Front")) &
|
} else if (!(last_event %in% c("Show Info", "Show Front")) &
|
||||||
subdata2$artwork[i] == art) {
|
subdata2$artwork[i] == artwork) {
|
||||||
subdata2$trace[i] <- j
|
subdata2$trace[i] <- j
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i <= nrow(subdata2)) {
|
if (i <= nrow(subdata2)) {
|
||||||
last_event <- subdata2$event[i + 1]
|
last_event <- subdata2$event[i + 1]
|
||||||
}
|
}
|
||||||
|
setTxtProgressBar(pb, i)
|
||||||
}
|
}
|
||||||
|
n <- n + 1
|
||||||
}
|
}
|
||||||
|
|
||||||
# Fix glossar entries (find corresponding artworks and fill in trace)
|
# Fix glossar entries (find corresponding artworks and fill in trace)
|
||||||
@ -45,8 +53,13 @@ add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") {
|
|||||||
"glossar_file"]]
|
"glossar_file"]]
|
||||||
single_art <- unlist(lut[lut$glossar_file %in% inside, "artwork"])
|
single_art <- unlist(lut[lut$glossar_file %in% inside, "artwork"])
|
||||||
|
|
||||||
|
m <- 1
|
||||||
|
|
||||||
for (file in lut$glossar_file) {
|
for (file in lut$glossar_file) {
|
||||||
|
|
||||||
|
cat("\n\nAdding trace variable for glossar entry", file,
|
||||||
|
paste0("(", m, "/", length(lut$glossar_file), ")"), "\n")
|
||||||
|
|
||||||
artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])
|
artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])
|
||||||
|
|
||||||
for (i in seq_len(nrow(subdata2))) {
|
for (i in seq_len(nrow(subdata2))) {
|
||||||
@ -78,11 +91,13 @@ add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") {
|
|||||||
subdata2[i, "artwork"] <- current_artwork
|
subdata2[i, "artwork"] <- current_artwork
|
||||||
|
|
||||||
}
|
}
|
||||||
|
setTxtProgressBar(pb, i)
|
||||||
}
|
}
|
||||||
|
m <- m + 1
|
||||||
}
|
}
|
||||||
|
|
||||||
# Exclude not matched glossar entries
|
# Exclude not matched glossar entries
|
||||||
cat("INFORMATION: glossar entries that are not matched will be removed:",
|
cat("\n\nINFORMATION: glossar entries that are not matched will be removed:",
|
||||||
sum(is.na(subdata2[subdata2$glossar == 1, "trace"])), "entries",
|
sum(is.na(subdata2[subdata2$glossar == 1, "trace"])), "entries",
|
||||||
#proportions(table(is.na(subdata2[subdata2$glossar == 1, "trace"]))),
|
#proportions(table(is.na(subdata2[subdata2$glossar == 1, "trace"]))),
|
||||||
fill = TRUE)
|
fill = TRUE)
|
||||||
@ -229,40 +244,52 @@ add_case <- function(data, cutoff = 20) {
|
|||||||
add_trace_moves <- function(data) {
|
add_trace_moves <- function(data) {
|
||||||
|
|
||||||
cases <- unique(data$case)
|
cases <- unique(data$case)
|
||||||
aws <- unique(data$artwork)[unique(data$artwork) != "glossar"]
|
artworks <- unique(data$artwork)[unique(data$artwork) != "glossar"]
|
||||||
max_trace <- max(data$trace, na.rm = TRUE) + 1
|
max_trace <- max(data$trace, na.rm = TRUE) + 1
|
||||||
out <- NULL
|
out <- NULL
|
||||||
|
|
||||||
for (case in cases) {
|
pb <- txtProgressBar(min = 0, max = length(artworks), style = 3)
|
||||||
for (art in aws) {
|
|
||||||
tmp <- data[data$case == case & data$artwork == art, ]
|
|
||||||
if (nrow(tmp) != 0) {
|
|
||||||
|
|
||||||
if (length(na.omit(unique(tmp$trace))) == 1) {
|
n <- 1 # count cases for progress
|
||||||
tmp[tmp$event == "move", "trace"] <- na.omit(unique(tmp$trace))
|
|
||||||
} else if (length(na.omit(unique(tmp$trace))) > 1) {
|
for (case in cases) {
|
||||||
for (i in 1:nrow(tmp)) {
|
|
||||||
if (tmp$event[i] == "move") {
|
cat("\n\nAdding trace variable for move events per case",
|
||||||
|
paste0("(", n, "/", length(cases), ")"), "\n")
|
||||||
|
j <- 1
|
||||||
|
|
||||||
|
for (artwork in artworks) {
|
||||||
|
subdata <- data[data$case == case & data$artwork == artwork, ]
|
||||||
|
if (nrow(subdata) != 0) {
|
||||||
|
|
||||||
|
if (length(na.omit(unique(subdata$trace))) == 1) {
|
||||||
|
subdata[subdata$event == "move", "trace"] <- na.omit(unique(subdata$trace))
|
||||||
|
} else if (length(na.omit(unique(subdata$trace))) > 1) {
|
||||||
|
for (i in 1:nrow(subdata)) {
|
||||||
|
if (subdata$event[i] == "move") {
|
||||||
if (i == 1) {
|
if (i == 1) {
|
||||||
tmp$trace[i] <- na.omit(unique(tmp$trace))[1]
|
subdata$trace[i] <- na.omit(unique(subdata$trace))[1]
|
||||||
} else {
|
} else {
|
||||||
tmp$trace[i] <- tmp$trace[i - 1]
|
subdata$trace[i] <- subdata$trace[i - 1]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (all(is.na(tmp$trace))) {
|
} else if (all(is.na(subdata$trace))) {
|
||||||
for (i in 1:nrow(tmp)) {
|
for (i in 1:nrow(subdata)) {
|
||||||
if (tmp$event[i] == "move") {
|
if (subdata$event[i] == "move") {
|
||||||
tmp$trace[i] <- max_trace
|
subdata$trace[i] <- max_trace
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
max_trace <- max_trace + 1
|
max_trace <- max_trace + 1
|
||||||
}
|
}
|
||||||
if (nrow(tmp) > 0) {
|
if (nrow(subdata) > 0) {
|
||||||
out <- rbind(out, tmp)
|
out <- rbind(out, subdata)
|
||||||
}
|
}
|
||||||
|
setTxtProgressBar(pb, j)
|
||||||
|
j <- j + 1
|
||||||
}
|
}
|
||||||
|
n <- n + 1
|
||||||
}
|
}
|
||||||
out <- out[order(out$date.start, out$fileId.start), ]
|
out <- out[order(out$date.start, out$fileId.start), ]
|
||||||
rownames(out) <- NULL
|
rownames(out) <- NULL
|
||||||
@ -340,19 +367,19 @@ add_topic <- function(data, topics) {
|
|||||||
dat_split <- split(data, data$artwork)
|
dat_split <- split(data, data$artwork)
|
||||||
|
|
||||||
set_label <- function(x) {
|
set_label <- function(x) {
|
||||||
art <- unique(x$artwork)
|
artwork <- unique(x$artwork)
|
||||||
x$topicIndex <- factor(x$topicNumber, labels = tab_index[[art]])
|
x$topicIndex <- factor(x$topicNumber, labels = tab_index[[artwork]])
|
||||||
x
|
x
|
||||||
}
|
}
|
||||||
|
|
||||||
dat_label <- lapply(dat_split, set_label)
|
dat_label <- lapply(dat_split, set_label)
|
||||||
|
|
||||||
set_topic <- function(x) {
|
set_topic <- function(x) {
|
||||||
art <- unique(x$artwork)
|
artwork <- unique(x$artwork)
|
||||||
labels_file <- topics[topics$artwork == art,
|
labels_file <- topics[topics$artwork == artwork,
|
||||||
"file_name"][as.numeric(levels(x$topicIndex))]
|
"file_name"][as.numeric(levels(x$topicIndex))]
|
||||||
x$topicFile <- as.character(factor(x$topicIndex, labels = labels_file))
|
x$topicFile <- as.character(factor(x$topicIndex, labels = labels_file))
|
||||||
labels_topic <- topics[topics$artwork == art,
|
labels_topic <- topics[topics$artwork == artwork,
|
||||||
"topic"][as.numeric(levels(x$topicIndex))]
|
"topic"][as.numeric(levels(x$topicIndex))]
|
||||||
x$topic <- as.character(factor(x$topicIndex, labels = labels_topic))
|
x$topic <- as.character(factor(x$topicIndex, labels = labels_topic))
|
||||||
x
|
x
|
||||||
@ -360,11 +387,6 @@ add_topic <- function(data, topics) {
|
|||||||
|
|
||||||
dat_topic <- lapply(dat_label, set_topic)
|
dat_topic <- lapply(dat_label, set_topic)
|
||||||
|
|
||||||
# table(dat_topic[["501"]]$topicNumber)
|
|
||||||
# table(dat_topic[["501"]]$topicIndex)
|
|
||||||
# table(dat_topic[["501"]]$topicFile)
|
|
||||||
# table(dat_topic[["501"]]$topic)
|
|
||||||
|
|
||||||
out <- do.call(rbind, dat_topic)
|
out <- do.call(rbind, dat_topic)
|
||||||
out <- out[order(out$date.start, out$fileId.start), ]
|
out <- out[order(out$date.start, out$fileId.start), ]
|
||||||
rownames(out) <- NULL
|
rownames(out) <- NULL
|
||||||
|
Loading…
Reference in New Issue
Block a user