Started trying out how preprocessing behaves when done on full data set

This commit is contained in:
2023-09-15 16:22:21 +02:00
parent e416974906
commit 2a2eab4b9f
3 changed files with 118 additions and 35 deletions
+20 -1
View File
@@ -3,7 +3,8 @@
source("functions.R")
# Read data ##############################################################
dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";",
cat("########## Reading in data... ##########", "\n")
dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";",
header = TRUE)
dat0$date <- as.POSIXct(dat0$date)
dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
@@ -12,10 +13,16 @@ dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
dat <- subset(dat0, !(dat0$event %in% c("Start Application",
"Show Application")))
save(dat, file = "tmp/dat.RData")
# Add trace variable #####################################################
cat("########## Adding trace variable... ##########", "\n")
dat1 <- add_trace(dat)
save(dat1, file = "tmp/dat1.RData")
# Close events
cat("########## Closing events...")
dat2 <- rbind(close_events(dat1, "move"),
close_events(dat1, "flipCard"),
close_events(dat1, "openTopic"),
@@ -32,7 +39,10 @@ dat2 <- dat2[!is.na(dat2$date.start), ]
rownames(dat2) <- NULL
# TODO: Throw warning about this
save(dat2, file = "tmp/dat2.RData")
# Add case variable ######################################################
cat("########## Adding case and eventId variables... ##########", "\n")
dat3 <- add_case(dat2)
# Add event ID ###########################################################
@@ -46,19 +56,28 @@ dat3 <- dat3[, c("fileId.start", "fileId.stop", "eventId", "case",
"scaleSize", "rotation.start", "rotation.stop",
"rotationDegree")]
save(dat3, file = "tmp/dat3.RData")
# Add trace for move events ##############################################
cat("########## Adding trace variable for move events... ##########", "\n")
dat4 <- add_trace_moves(dat3)
save(dat4, file = "tmp/dat4.RData")
# Add topics: file names and topics ######################################
cat("########## Adding information about topics... ##########", "\n")
artworks <- unique(dat4$artwork)
topics <- extract_topics(artworks, pattern = paste0(artworks, ".xml"),
path = "../data/ContentEyevisit/eyevisit_cards_light/")
dat5 <- add_topic(dat4, topics = topics)
save(dat5, file = "tmp/dat5.RData")
# TODO: Replace artwork with informative strings
# Export data ############################################################
cat("########## Exporting data frame with event logs... ##########", "\n")
write.table(dat5, "../data/event_logfiles.csv", sep = ";",
row.names = FALSE)
+56 -34
View File
@@ -9,28 +9,36 @@ add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") {
subdata2 <- data[!data$event %in% c("Transform start", "Transform stop"), ]
last_event <- subdata2$event[1]
aws <- unique(subdata2$artwork)[unique(subdata2$artwork) != "glossar"]
artworks <- unique(subdata2$artwork)[unique(subdata2$artwork) != "glossar"]
n <- 1 # count artworks for progress
for (art in aws) {
pb <- txtProgressBar(min = 0, max = nrow(subdata2), style = 3)
for (artwork in artworks) {
cat("\n\nAdding trace variable for artwork", artwork,
paste0("(", n, "/", length(artworks), ")"), "\n")
for (i in 1:nrow(subdata2)) {
if (last_event == "Show Info" & subdata2$artwork[i] == art) {
if (last_event == "Show Info" & subdata2$artwork[i] == artwork) {
subdata2$trace[i] <- i
j <- i
} else if (last_event == "Show Front" & subdata2$artwork[i] == art) {
} else if (last_event == "Show Front" & subdata2$artwork[i] == artwork) {
subdata2$trace[i] <- j
} else if (!(last_event %in% c("Show Info", "Show Front")) &
subdata2$artwork[i] == art) {
subdata2$artwork[i] == artwork) {
subdata2$trace[i] <- j
}
if (i <= nrow(subdata2)) {
last_event <- subdata2$event[i + 1]
}
setTxtProgressBar(pb, i)
}
n <- n + 1
}
# Fix glossar entries (find corresponding artworks and fill in trace)
@@ -45,8 +53,13 @@ add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") {
"glossar_file"]]
single_art <- unlist(lut[lut$glossar_file %in% inside, "artwork"])
m <- 1
for (file in lut$glossar_file) {
cat("\n\nAdding trace variable for glossar entry", file,
paste0("(", m, "/", length(lut$glossar_file), ")"), "\n")
artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])
for (i in seq_len(nrow(subdata2))) {
@@ -78,11 +91,13 @@ add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") {
subdata2[i, "artwork"] <- current_artwork
}
setTxtProgressBar(pb, i)
}
m <- m + 1
}
# Exclude not matched glossar entries
cat("INFORMATION: glossar entries that are not matched will be removed:",
cat("\n\nINFORMATION: glossar entries that are not matched will be removed:",
sum(is.na(subdata2[subdata2$glossar == 1, "trace"])), "entries",
#proportions(table(is.na(subdata2[subdata2$glossar == 1, "trace"]))),
fill = TRUE)
@@ -229,40 +244,52 @@ add_case <- function(data, cutoff = 20) {
add_trace_moves <- function(data) {
cases <- unique(data$case)
aws <- unique(data$artwork)[unique(data$artwork) != "glossar"]
artworks <- unique(data$artwork)[unique(data$artwork) != "glossar"]
max_trace <- max(data$trace, na.rm = TRUE) + 1
out <- NULL
for (case in cases) {
for (art in aws) {
tmp <- data[data$case == case & data$artwork == art, ]
if (nrow(tmp) != 0) {
pb <- txtProgressBar(min = 0, max = length(artworks), style = 3)
if (length(na.omit(unique(tmp$trace))) == 1) {
tmp[tmp$event == "move", "trace"] <- na.omit(unique(tmp$trace))
} else if (length(na.omit(unique(tmp$trace))) > 1) {
for (i in 1:nrow(tmp)) {
if (tmp$event[i] == "move") {
n <- 1 # count cases for progress
for (case in cases) {
cat("\n\nAdding trace variable for move events per case",
paste0("(", n, "/", length(cases), ")"), "\n")
j <- 1
for (artwork in artworks) {
subdata <- data[data$case == case & data$artwork == artwork, ]
if (nrow(subdata) != 0) {
if (length(na.omit(unique(subdata$trace))) == 1) {
subdata[subdata$event == "move", "trace"] <- na.omit(unique(subdata$trace))
} else if (length(na.omit(unique(subdata$trace))) > 1) {
for (i in 1:nrow(subdata)) {
if (subdata$event[i] == "move") {
if (i == 1) {
tmp$trace[i] <- na.omit(unique(tmp$trace))[1]
subdata$trace[i] <- na.omit(unique(subdata$trace))[1]
} else {
tmp$trace[i] <- tmp$trace[i - 1]
subdata$trace[i] <- subdata$trace[i - 1]
}
}
}
} else if (all(is.na(tmp$trace))) {
for (i in 1:nrow(tmp)) {
if (tmp$event[i] == "move") {
tmp$trace[i] <- max_trace
} else if (all(is.na(subdata$trace))) {
for (i in 1:nrow(subdata)) {
if (subdata$event[i] == "move") {
subdata$trace[i] <- max_trace
}
}
}
max_trace <- max_trace + 1
}
if (nrow(tmp) > 0) {
out <- rbind(out, tmp)
if (nrow(subdata) > 0) {
out <- rbind(out, subdata)
}
setTxtProgressBar(pb, j)
j <- j + 1
}
n <- n + 1
}
out <- out[order(out$date.start, out$fileId.start), ]
rownames(out) <- NULL
@@ -340,19 +367,19 @@ add_topic <- function(data, topics) {
dat_split <- split(data, data$artwork)
set_label <- function(x) {
art <- unique(x$artwork)
x$topicIndex <- factor(x$topicNumber, labels = tab_index[[art]])
artwork <- unique(x$artwork)
x$topicIndex <- factor(x$topicNumber, labels = tab_index[[artwork]])
x
}
dat_label <- lapply(dat_split, set_label)
set_topic <- function(x) {
art <- unique(x$artwork)
labels_file <- topics[topics$artwork == art,
artwork <- unique(x$artwork)
labels_file <- topics[topics$artwork == artwork,
"file_name"][as.numeric(levels(x$topicIndex))]
x$topicFile <- as.character(factor(x$topicIndex, labels = labels_file))
labels_topic <- topics[topics$artwork == art,
labels_topic <- topics[topics$artwork == artwork,
"topic"][as.numeric(levels(x$topicIndex))]
x$topic <- as.character(factor(x$topicIndex, labels = labels_topic))
x
@@ -360,11 +387,6 @@ add_topic <- function(data, topics) {
dat_topic <- lapply(dat_label, set_topic)
# table(dat_topic[["501"]]$topicNumber)
# table(dat_topic[["501"]]$topicIndex)
# table(dat_topic[["501"]]$topicFile)
# table(dat_topic[["501"]]$topic)
out <- do.call(rbind, dat_topic)
out <- out[order(out$date.start, out$fileId.start), ]
rownames(out) <- NULL