Moved most stuff into package folder mtt; updated README so it works with new code

This commit is contained in:
Nora Wickelmaier 2023-09-21 16:45:06 +02:00
parent 55adcf03d7
commit 3dd13a6c6e
9 changed files with 51 additions and 666 deletions

View File

@ -8,6 +8,11 @@ output:
toc: true
---
```{r, include = FALSE}
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis")
devtools::load_all("../../../software/mtt")
```
# Log data from the Multi-Touch Table at the HAUM
The Multi Touch Table at the Herzog-Anton-Ulrich-Museum (HAUM) in
@ -117,11 +122,6 @@ files have been affected.
# Problems and how I handled them
```{r, include = FALSE}
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis")
source("code/functions.R")
```
This lists some problems with the log data that required decisions. These
decisions influence the outcome and maybe even the data quality. Hence, I
tried to document how I handled these problems and explain the decisions I
@ -136,7 +136,7 @@ continuous within one log file but not over several log files.
```{r}
# Read data
dat0 <- read.table("data/rawdata_logfiles_small.csv", sep = ";",
dat0 <- read.table("data/haum/rawdata_logfiles_small.csv", sep = ";",
header = TRUE)
dat0$date <- as.POSIXct(dat0$date)
dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)
@ -146,26 +146,16 @@ dat <- subset(dat0, !(dat0$event %in% c("Start Application",
"Show Application")))
# Add trace variable
dat1 <- add_trace(dat, glossar_dict = "data/glossar_dict.RData")
dat1 <- add_trace(dat, glossar_dict = "data/haum/glossar_dict.RData")
# Close events
dat2 <- rbind(close_events(dat1, "move"),
close_events(dat1, "flipCard"),
close_events(dat1, "openTopic"),
close_events(dat1, "openPopup"))
dat2 <- dat2[order(dat2$date.start, dat2$fileId.start), ]
head(dat2[which(dat2$duration < 0),
c("fileId.start", "fileId.stop", "event", "artwork", "duration")], 20)
head(dat2[which(dat2$fileId.start != dat2$fileId.stop),
c("fileId.start", "fileId.stop", "event", "artwork", "duration")], 20)
dat2 <- dat2[order(dat2$date.start, dat2$fileId), ]
plot(timeMs ~ as.factor(fileId), dat[1:5000,], xlab = "fileId")
# Remove durations when event spans more than one log file, since they are
# not interpretable
#dat2[which(dat2$fileId.start != dat2$fileId.stop), "duration"] <- NA
```
The boxplot shows that we have a continuous range of values within one log
@ -183,6 +173,9 @@ exactly fixed. Unfortunately, only three `move` events were fixed, since it
only fixed irregularities *within* one log file. See below for more
details.
UPDATE: By now I remove all events that span more than one log file. This
lets me improve speed considerably.
## Left padding of file IDs
The file names of the raw log files are automatically generated and contain
@ -196,7 +189,7 @@ will sort these files in the order shown below. In order to preprocess the
data and close events that belong together, the data need to be sorted by
events and artworks repeatedly. In order to get them back in the correct
time order, it is necessary to order them based on three variables:
`fileId.start`, `date.start` and `timeMs`. The file IDs therefore need to
`fileId`, `date.start` and `timeMs`. The file IDs therefore need to
sort in the correct order (again see below for example). I zero left padded
the log file names within the data frame using it as an identifier. These
"file names" do not correspond exactly to the original raw log file names.
@ -406,7 +399,7 @@ assign topics and file names to the according pop-ups. This needs to be
cross checked with the programming, but seems the most plausible approach
with my current knowledge.
## Extracting topics from `index.xml` vs. `<artwork_number>.xml
## Extracting topics from `index.xml` vs. `<artwork_number>.xml`
When I extract the topics from `index.html` I get different topics, than
when I get them from `<artwork>.html`. At first glance, it looks like using
@ -414,7 +407,7 @@ when I get them from `<artwork>.html`. At first glance, it looks like using
```{r}
artworks <- unique(dat2$artwork)
path <- "data/ContentEyevisit/eyevisit_cards_light/"
path <- "data/haum/ContentEyevisit/eyevisit_cards_light/"
topics <- extract_topics(artworks, "index.xml", path)
topics2 <- extract_topics(artworks, paste0(artworks, ".xml"), path)
@ -434,7 +427,7 @@ sudden there were 72 instead of 70 artworks. It seems like these two
artworks appear on October 21, 2022.
```{r}
dat0 <- read.table("data/rawdata_logfiles.csv", sep = ";", header = TRUE)
dat0 <- read.table("data/haum/rawdata_logfiles.csv", sep = ";", header = TRUE)
dat0$date <- as.POSIXct(dat0$date)
dat0$glossar <- ifelse(dat0$artwork == "glossar", 1, 0)

View File

@ -1,125 +0,0 @@
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
###### HELPER ######
# Need to left pad file names. If I do not do this, the sorting of the
# timestamps will be off and I get negative durations later on since the
# wrong events get closed.
leftpad_fnames <- function(x) {
z <- gsub(paste0(dirpaths, "/"), "\\1", x)
ys <- strsplit(z, "_")
res <- NULL
for (y in ys) {
y2 <- unlist(strsplit(y[3], "-"))
e1 <- y[1]
e2 <- sprintf("%02d", as.numeric(y[2]))
e3 <- sprintf("%02d", as.numeric(y2[1]))
e4 <- sprintf("%02d", as.numeric(y2[2]))
e5 <- sprintf("%02d", as.numeric(y[4]))
e6 <- sprintf("%02d", as.numeric(gsub(".log", "", y[5])))
e6 <- sprintf("%02d", as.numeric(gsub(".log", "", y[5])))
res <- c(res,
paste0(e1, "_", e2, "_", e3, "-", e4, "_", e5, "_", e6, ".log"))
}
res
}
##### CONTENT ######
# Choose which folders with raw log files should be included
folders <- "all"
#folders <- "_2016b"
dirpaths <- paste0("../data/haum_logs_2016-2023/", folders)
fnames <- dir(dirpaths, pattern = "*.log", full.names = TRUE)
length(fnames)
head(fnames)
logs <- lapply(fnames, readLines)
nlog <- sapply(logs, length)
dat <- data.frame(fileId = rep(leftpad_fnames(fnames), nlog),
logs = unlist(logs))
head(dat$logs)
# Remove corrupted lines
# corrupt lines are "" and need to be removed
d1 <- dim(dat)[1]
dat <- subset(dat, dat$logs != "")
d2 <- dim(dat)[1]
# TODO: Catch this in a function and give back a meaningful warning
# The files contain `r d1-d2` corrupt lines that were removed from the
# data.
# Extract relevant infos
date <- sapply(dat$logs, gsub,
pattern = "^\\[(.*)\\], \\[.*$",
replacement = "\\1",
USE.NAMES = FALSE)
timestamp <- sapply(dat$logs, gsub,
pattern = "^\\[.*\\], \\[(.*)\\].*$",
replacement = "\\1",
USE.NAMES = FALSE)
action <- sapply(dat$logs, gsub,
pattern = "^.*EyeVisit, (.*):*.*$",
replacement = "\\1",
USE.NAMES = FALSE)
events <- sapply(strsplit(action, ":"), function(x) x[1])
topics <- sapply(strsplit(action, ":"), function(x) x[2])
moves <- apply(do.call(rbind,
strsplit(sapply(strsplit(action, ":"), function(x) x[3]),
",")),
2, as.numeric)
# ATTENTION: as.numeric() forces NAs for "OpenCard" and "CloseCard"
card_action <- trimws(sapply(strsplit(action, ":"),
function(x) x[3])[grep("Artwork", events)])
card <- as.numeric(sapply(strsplit(action, ":"), function(x) x[4]))
events[grep("Artwork", events)] <- paste("Artwork", card_action, sep = "/")
ts_elements <- strsplit(timestamp, ":")
time_ms <- as.numeric(sapply(ts_elements, function(x) x[4])) +
as.numeric(sapply(ts_elements, function(x) x[3])) * 1000 +
as.numeric(sapply(ts_elements, function(x) x[2])) * 1000 * 60
dat$date <- lubridate::parse_date_time(date, "bdyHMSOp")
dat$timeMs <- time_ms
dat$event <- events
dat$artwork <- trimws(sapply(strsplit(topics, "/"), function(x) x[1]))
dat$popup <- sapply(strsplit(topics, "/"), function(x) x[2])
dat$topicNumber <- card
dat$x <- moves[,1]
dat$y <- moves[,2]
dat$scale <- moves[,3]
dat$rotation <- moves[,4]
dat$logs <- NULL
# remove original log files from data so file becomes smaller
# sort by fileId, since reading in by file names does not make sense
# because of missing left zero padding
dat <- dat[order(dat$fileId, dat$date, dat$timeMs), ]
# Export data
write.table(dat, "../data/rawdata_logfiles.csv",
sep = ";", quote = FALSE, row.names = FALSE)

View File

@ -1,37 +0,0 @@
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light")
dat0 <- read.table("../../rawdata_logfiles.csv", sep = ";",
header = TRUE)
# artwork names
artworks <- unique(na.omit(dat0$artwork))[unique(na.omit(dat0$artwork)) != "glossar"]
dat <- subset(dat0, dat0$artwork == "glossar")
glossar_files <- unique(dat$popup)
x <- NULL
for (glossar_file in glossar_files) {
for (artwork in artworks) {
fnames <- dir(pattern = paste0(artwork, "_"), path = artwork)
for (fname in fnames) {
lines <- readLines(paste0(artwork, "/", fname))
if (any(grepl(glossar_file, lines))) {
x <- rbind(x, data.frame(glossar_file, artwork))
break
}
}
}
}
head(x, 20)
glossar_dict <- as.data.frame(tapply(x$artwork, x$glossar_file, FUN = c))
names(glossar_dict) <- "artwork"
glossar_dict$glossar_file <- rownames(glossar_dict)
rownames(glossar_dict) <- NULL
glossar_dict <- glossar_dict[, c("glossar_file", "artwork")]
save(glossar_dict, file = "../../glossar_dict.RData")
# TODO: Save in interoperable format

View File

@ -1,6 +1,9 @@
# TODO: This script is obsolete and needs to be updated!
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
source("functions.R")
#source("functions.R")
devtools::load_all("../../../../software/mtt")
small <- TRUE
@ -10,10 +13,10 @@ now <- Sys.time()
cat("########## Reading in data... ##########", "\n")
if (small) {
dat0 <- read.table("../data/rawdata_logfiles_small.csv", sep = ";",
dat0 <- read.table("../data/haum/rawdata_logfiles_small.csv", sep = ";",
header = TRUE)
} else {
dat0 <- read.table("../data/rawdata_logfiles.csv", sep = ";",
dat0 <- read.table("../data/haum/rawdata_logfiles.csv", sep = ";",
header = TRUE)
}
dat0$date <- as.POSIXct(dat0$date)
@ -54,7 +57,6 @@ dat2 <- dat2[order(dat2$date.start, dat2$fileId.start), ]
# Remove all events that do not have a `date.start`
dat2 <- dat2[!is.na(dat2$date.start), ]
rownames(dat2) <- NULL
# TODO: Throw warning about this
save(dat2, file = paste("tmp/dat2", ifelse(small, "small_", "full_"),
format(now, "%Y-%m-%d_%H-%M-%S"), ".RData"))
@ -90,7 +92,7 @@ artworks <- unique(dat4$artwork)
# remove artworks without XML information
artworks <- artworks[!artworks %in% c("504", "505")]
topics <- extract_topics(artworks, pattern = paste0(artworks, ".xml"),
path = "../data/ContentEyevisit/eyevisit_cards_light/")
path = "../data/haum/ContentEyevisit/eyevisit_cards_light/")
dat5 <- add_topic(dat4, topics = topics)
@ -101,6 +103,6 @@ save(dat5, file = paste("tmp/dat5", ifelse(small, "small_", "full_"),
# Export data ############################################################
cat("########## Exporting data frame with event logs... ##########", "\n")
write.table(dat5, "../data/event_logfiles.csv", sep = ";",
write.table(dat5, "../data/haum/event_logfiles.csv", sep = ";",
row.names = FALSE)

View File

@ -16,7 +16,7 @@
#' # Read data
dat <- read.table("../data/event_logfiles.csv", sep = ";", header = TRUE)
dat <- read.table("../data/haum/event_logfiles.csv", sep = ";", header = TRUE)
dat$date.start <- as.POSIXct(dat$date.start)
dat$date.stop <- as.POSIXct(dat$date.stop)

View File

@ -1,16 +0,0 @@
path <- "C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light"
setwd(path)
# artwork names
dat0 <- read.table("../../event_logfiles.csv", sep = ";", header = TRUE)
dat0$artwork <- sprintf("%03d", dat0$artwork)
artworks <- sort(unique(dat0$artwork))
# extract topics
topics <- extract_topics(artworks, paste0(artworks, ".xml"), path)
write.table(topics, file = "../../topics.csv", sep = ";", row.names = FALSE)
# TODO: Keep this file?

View File

@ -1,458 +0,0 @@
###########################################################################
# Add trace variable
add_trace <- function(data, glossar_dict = "../data/glossar_dict.RData") {
data$trace <- NA
subdata1 <- data[data$event %in% c("Transform start", "Transform stop"), ]
subdata2 <- data[!data$event %in% c("Transform start", "Transform stop"), ]
last_event <- subdata2$event[1]
artworks <- unique(subdata2$artwork)[unique(subdata2$artwork) != "glossar"]
n <- 1 # count artworks for progress
pb <- txtProgressBar(min = 0, max = nrow(subdata2), initial = NA,
style = 3)
for (artwork in artworks) {
cat("\n\nAdding trace variable for artwork", artwork,
paste0("(", n, "/", length(artworks), ")"), "\n")
for (i in 1:nrow(subdata2)) {
if (last_event == "Show Info" & subdata2$artwork[i] == artwork) {
subdata2$trace[i] <- i
j <- i
} else if (last_event == "Show Front" & subdata2$artwork[i] == artwork) {
subdata2$trace[i] <- j
} else if (!(last_event %in% c("Show Info", "Show Front")) &
subdata2$artwork[i] == artwork) {
subdata2$trace[i] <- j
}
if (i <= nrow(subdata2)) {
last_event <- subdata2$event[i + 1]
}
setTxtProgressBar(pb, i)
}
n <- n + 1
}
# Fix glossar entries (find corresponding artworks and fill in trace)
glossar_files <- unique(subdata2[subdata2$artwork == "glossar", "popup"])
# load lookup table for artworks and glossar files
load(glossar_dict)
lut <- glossar_dict[glossar_dict$glossar_file %in% glossar_files, ]
inside <- glossar_files[glossar_files %in%
lut[sapply(lut$artwork, length) == 1,
"glossar_file"]]
single_art <- unlist(lut[lut$glossar_file %in% inside, "artwork"])
m <- 1
for (file in lut$glossar_file) {
cat("\n\nAdding trace variable for glossar entry", file,
paste0("(", m, "/", length(lut$glossar_file), ")"), "\n")
artwork_list <- unlist(lut[lut$glossar_file == file, "artwork"])
for (i in seq_len(nrow(subdata2))) {
if (subdata2$event[i] == "Show Info" |
(subdata2$event[i] == "Artwork/OpenCard" &
subdata2$artwork[i] %in% single_art)) {
current_artwork <- subdata2[i, "artwork"]
j <- i
k <- i
} else {
current_artwork <- current_artwork
}
if (subdata2$event[i] == "Show Front" & subdata2$artwork[i] == current_artwork) {
# make sure artwork has not been closed, yet!
k <- i
}
if (subdata2$artwork[i] == "glossar" &
(current_artwork %in% artwork_list) &
subdata2$popup[i] == file & (j - k == 0)) {
subdata2[i, "trace"] <- subdata2[j, "trace"]
subdata2[i, "artwork"] <- current_artwork
}
setTxtProgressBar(pb, i)
}
m <- m + 1
}
# Exclude not matched glossar entries
cat("\n\nINFORMATION: glossar entries that are not matched will be removed:",
sum(is.na(subdata2[subdata2$glossar == 1, "trace"])), "entries",
#proportions(table(is.na(subdata2[subdata2$glossar == 1, "trace"]))),
fill = TRUE)
subdata2 <- subset(subdata2, !is.na(subdata2$trace))
# REMEMBER: It can never be 100% correct, since it is always possible
# that several cards are open and that they link to the same glossar
# entry
# dat2[14110:14130, ]
# dat2[dat2$glossar == 1, ]
out <- rbind(subdata1, subdata2)
out <- out[order(out$fileId, out$date, out$timeMs), ]
out
}
###########################################################################
close_events <- function(data, event = c("move", "flipCard", "openTopic", "openPopup")) {
event <- match.arg(event)
switch(event,
"move" = {
actions <- c("Transform start", "Transform stop")
idvar <- c("fileId", "eventId", "artwork", "glossar")
drop <- c("popup", "topicNumber", "trace", "event")
ncol <- 16
},
"flipCard" = {
actions <- c("Show Info", "Show Front")
idvar <- c("fileId", "trace", "artwork", "glossar")
drop <- c("popup", "topicNumber", "eventId", "event")
ncol <- 16
},
"openTopic" = {
actions <- c("Artwork/OpenCard", "Artwork/CloseCard")
idvar <- c("fileId", "eventId", "trace", "glossar", "artwork",
"topicNumber")
drop <- c("popup", "event")
ncol <- 18
},
"openPopup" = {
actions <- c("ShowPopup", "HidePopup")
idvar <- c("fileId", "eventId", "trace", "glossar", "artwork", "popup")
drop <- c("topicNumber", "event")
ncol <- 18
# TODO: Should topicNumber maybe also be filled in for "openPopup"?
}
)
subdata <- subset(data, data$event %in% actions)
subdata <- subdata[order(subdata$artwork, subdata$popup, subdata$date, subdata$timeMs), ]
subdata$time <- ifelse(subdata$event == actions[1], "start", "stop")
num_start <- diff(c(0, which(subdata$event == actions[2])))
if (tail(subdata, 1)$time == "start") {
num_start <- c(num_start, 1)
}
subdata$eventId <- rep(seq_along(num_start), num_start)
if (event == "move") {
subdata <- subdata[!duplicated(subdata[, c("event", "eventId")]), ]
id_stop <- which(subdata$event == actions[2])
id_rm_stop <- id_stop[diff(id_stop) == 1]
subdata <- subdata[-(id_rm_stop + 1), ]
}
subdata_split <- split(subdata, ~ fileId)
pbapply::pboptions(style = 3, char = "=")
subdata_split_wide <- pbapply::pblapply(subdata_split, reshape,
direction = "wide",
idvar = idvar,
timevar = "time",
drop = drop)
# suppressWarnings(
# data_wide <- reshape(subdata, direction = "wide",
# idvar = idvar,
# timevar = "time",
# drop = drop)
# )
# remove entries with only start or stop events since they do not have
# all columns
ids <- which(sapply(subdata_split_wide, ncol) != ncol)
if (length(ids) > 0) subdata_split_wide <- subdata_split_wide[-ids]
data_wide <- dplyr::bind_rows(subdata_split_wide)
for (d in drop) data_wide[d] <- NA
data_wide$distance <- NA
data_wide$scaleSize <- NA
data_wide$rotationDegree <- NA
data_wide$event <- event
data_wide$duration <- data_wide$timeMs.stop - data_wide$timeMs.start
if (event == "move") {
data_wide$distance <- apply(
data_wide[, c("x.start", "y.start", "x.stop", "y.stop")], 1,
function(x) dist(matrix(x, 2, 2, byrow = TRUE)))
data_wide$rotationDegree <- data_wide$rotation.stop -
data_wide$rotation.start
data_wide$scaleSize <- data_wide$scale.stop / data_wide$scale.start
# remove moves without any change
move_wide <- data_wide[data_wide$distance != 0 &
data_wide$rotationDegree != 0 &
data_wide$scaleSize != 1, ]
cat(paste("INFORMATION:", nrow(data_wide) - nrow(move_wide),
"lines containing move events were removed since they did",
"\nnot contain any change"), fill = TRUE)
data_wide <- move_wide
}
out <- data_wide[, c("fileId", "event", "artwork", "trace", "glossar",
"date.start", "date.stop", "timeMs.start",
"timeMs.stop", "duration", "topicNumber", "popup",
"x.start", "y.start", "x.stop", "y.stop",
"distance", "scale.start", "scale.stop",
"scaleSize", "rotation.start", "rotation.stop",
"rotationDegree")]
rownames(out) <- NULL
out
}
###########################################################################
# Add case variable
add_case <- function(data, cutoff = 20) {
# TODO: What is the best choice for the cutoff here?
data$timediff <- as.numeric(diff(c(data$date.start[1], data$date.start)))
data$case <- NA
j <- 1
pb <- txtProgressBar(min = 0, max = nrow(data), initial = NA, style = 3)
for (i in seq_len(nrow(data))) {
if (data$timediff[i] <= cutoff) {
data$case[i] <- j
} else {
j <- j + 1
data$case[i] <- j
}
setTxtProgressBar(pb, i)
}
data$timediff <- NULL
data
}
###########################################################################
# Add trace for moves
add_trace_moves <- function(data) {
pbapply::pboptions(style = 3, char = "=")
trace_max <- max(data$trace, na.rm = TRUE)
#subdata_art <- split(data, ~ artwork)
subdata_case <- split(data, ~ case)
#subdata_list <- split(data, ~ artwork + case)
# --> does not work with complete data set
cat("Splitting data...", "\n")
subdata_list <- pbapply::pblapply(subdata_case, split, f = ~artwork)
subdata_list <- unlist(subdata_list, recursive = FALSE)
cat("Adding trace...", "\n")
subdata_trace <- pbapply::pblapply(subdata_list,
function(x) {
trace_max <<- trace_max + 1
add_trace_subdata(x, max_trace = trace_max)
}
)
out <- dplyr::bind_rows(subdata_trace)
out <- out[order(out$fileId.start, out$date.start, out$timeMs.start), ]
rownames(out) <- NULL
# Make trace a consecutive number
out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace)))
out
}
add_trace_subdata <- function(subdata, max_trace) {
if (nrow(subdata) != 0) {
if (length(na.omit(unique(subdata$trace))) == 1) {
subdata[subdata$event == "move", "trace"] <- na.omit(unique(subdata$trace))
} else if (length(na.omit(unique(subdata$trace))) > 1) {
for (i in 1:nrow(subdata)) {
if (subdata$event[i] == "move") {
if (i == 1) {
subdata$trace[i] <- na.omit(unique(subdata$trace))[1]
} else {
subdata$trace[i] <- subdata$trace[i - 1]
}
}
}
} else if (all(is.na(subdata$trace))) {
for (i in 1:nrow(subdata)) {
subdata$trace[i] <- max_trace
}
}
} else {
warning("`subdata` has nrow = 0")
}
subdata
}
###########################################################################
# Create data frame with file names and topics for each artwork
extract_topics <- function(artworks, pattern, path) {
dat <- NULL
file_order <- NULL
i <- 1
for (artwork in artworks) {
if (length(pattern) == 1) {
index_file <- pattern
} else {
index_file <- pattern[i]
}
fnames <- dir(pattern = paste0(artwork, "_"),
path = paste(path, artwork, sep = "/"))
topic <- NULL
for (fname in fnames) {
suppressWarnings(
topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
grep("^<card type=",
trimws(readLines(paste(path, artwork, fname, sep = "/"))),
value = T)))
)
}
index <- paste(path, artwork, index_file, sep = "/")
suppressWarnings(
file_order <- c(file_order, gsub("^<card src=.*/(.*)./>$", "\\1",
grep("^<card src=", trimws(readLines(index)),
value = TRUE)))
)
in_index <- fnames %in% file_order
dat <- rbind(dat, data.frame(artwork, file_name = fnames, in_index, topic))
i <- i + 1
}
# take only the ones that are actually displayed and sort in the same order
# as indicated in index.html
out <- dat[dat$in_index, -3]
out <- out[order(file_order, out$file_name), ]
rownames(out) <- NULL
out$index <- unlist(sapply(table(out$artwork), seq_len))
out
}
###########################################################################
# Add topics: file names and topics
add_topic <- function(data, topics) {
artworks <- unique(data$artwork)
tab_art <- lapply(artworks,
function(x) names(table(data$topicNumber[data$artwork == x])))
names(tab_art) <- artworks
tab_index <- lapply(tab_art, seq_along)
dat_split <- split(data, ~ artwork)
set_label <- function(x) {
artwork <- unique(x$artwork)
x$topicIndex <- factor(x$topicNumber, labels = tab_index[[artwork]])
x
}
dat_label <- lapply(dat_split, set_label)
set_topic <- function(x) {
artwork <- unique(x$artwork)
labels_file <- topics[topics$artwork == artwork,
"file_name"][as.numeric(levels(x$topicIndex))]
x$topicFile <- as.character(factor(x$topicIndex, labels = labels_file))
labels_topic <- topics[topics$artwork == artwork,
"topic"][as.numeric(levels(x$topicIndex))]
x$topic <- as.character(factor(x$topicIndex, labels = labels_topic))
x
}
dat_topic <- lapply(dat_label, set_topic)
#out <- do.call(rbind, dat_topic)
out <- dplyr::bind_rows(dat_topic)
out <- out[order(out$fileId.start, out$date.start, out$timeMs.start), ]
rownames(out) <- NULL
out
}
###########################################################################
# Create data frame with information on artworks
extract_artworks <- function(artworks, files = paste0(artworks, ".xml"),
path = path) {
out <- NULL
i <- 1
for (artwork in artworks) {
if (length(files) == 1) {
index_file <- files
} else {
index_file <- files[i]
}
index <- paste(path, artwork, index_file, sep = "/")
varnames <- c("artist", "title", "misc", "description")
xmllist <- XML::xmlToList(index)$header[varnames]
if (any(sapply(xmllist, is.null))) {# necessary for missing entries
names(xmllist) <- varnames
xmllist[which(sapply(xmllist, is.null))] <- NA
}
# remove ugly quotes
xmllist <- lapply(xmllist, function(x) gsub("„|“", "", x))
# remove HTML tags
xmllist <- lapply(xmllist, function(x) gsub("<br/>", " ", x))
xmldat <- as.data.frame(xmllist)
xmldat$artwork <- artwork
# trim white space from strings
xmldat$artist <- trimws(xmldat$artist)
xmldat$title <- trimws(xmldat$title)
xmldat$misc <- trimws(xmldat$misc)
xmldat$description <- trimws(xmldat$description)
out <- rbind(out, xmldat)
i <- i + 1
}
out
}

View File

@ -2,7 +2,7 @@
library(lubridate)
dat <- read.table("../data/rawdata_logfiles.csv", header = TRUE, sep = ";")
dat <- read.table("../data/haum/rawdata_logfiles.csv", header = TRUE, sep = ";")
# dat$event <- factor(dat$event, levels = c("Start Application",
# "Show Application",
# "Transform start",

26
code/visualization.R Normal file
View File

@ -0,0 +1,26 @@
setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code/")
devtools::load_all("../../../../software/mtt")
#library(mtt)
dat <- parse_logfiles("2016", path = "../data/haum/LogFiles/",
save = FALSE)
datlogs <- create_eventlogs(dat, "../data/haum/ContentEyevisit/eyevisit_cards_light/")
dat001 <- datlogs[which(datlogs$artwork == "001"), ]
index <- as.numeric(as.factor(dat001$trace))
cc <- sample(colors(), length(unique(dat001$trace)))
plot(y.start ~ x.start, dat001, type = "n", xlab = "x", ylab = "y",
xlim = c(0, 3840), ylim = c(0, 2160))
with(dat001[1:200,], arrows(x.start, y.start, x.stop, y.stop,
length = .07, col = cc[index]))
plot(y.start ~ x.start, dat001, xlab = "x", ylab = "y",
xlim = c(0, 3840), ylim = c(0, 2160), pch = 16, col = "gray")
points(y.start ~ x.start, dat001, xlab = "x", ylab = "y",
xlim = c(0, 3840), ylim = c(0, 2160), cex = dat001$scaleSize,
col = "blue")