Worked on extracting topics for cards
This commit is contained in:
parent
495665a659
commit
bfc5c1d930
@ -242,6 +242,10 @@ Will probably just get rid of them!
|
|||||||
Think about if you want give warning messages about these deletions in the
|
Think about if you want give warning messages about these deletions in the
|
||||||
functions.
|
functions.
|
||||||
|
|
||||||
|
## Card indices go from 0 to 7 (instead of 0 to 5 as expected)
|
||||||
|
|
||||||
|
See `questions_number-of-cards.R` for details.
|
||||||
|
|
||||||
# Reading list
|
# Reading list
|
||||||
|
|
||||||
* @Arizmendi2022 [$-$]
|
* @Arizmendi2022 [$-$]
|
||||||
|
@ -53,12 +53,12 @@ num_stop <- c(diff(c(0, which(dat1$event == "Transform start"))))
|
|||||||
table(num_stop)
|
table(num_stop)
|
||||||
|
|
||||||
# TODO: Do I still need this?
|
# TODO: Do I still need this?
|
||||||
dat1$eventrep <- rep(num_start, num_start)
|
# dat1$eventrep <- rep(num_start, num_start)
|
||||||
dat1$dupl <- duplicated(dat1[, c("event", "eventid")]) # keep first
|
# dat1$dupl <- duplicated(dat1[, c("event", "eventid")]) # keep first
|
||||||
dat1$dupl <- duplicated(dat1[, c("event", "eventid")], fromLast = TRUE) # keep last
|
# dat1$dupl <- duplicated(dat1[, c("event", "eventid")], fromLast = TRUE) # keep last
|
||||||
dat1[dat1$eventrep == 10, ]
|
# dat1[dat1$eventrep == 10, ]
|
||||||
dat1$dupl <- NULL
|
# dat1$dupl <- NULL
|
||||||
dat1$eventrep <- NULL
|
# dat1$eventrep <- NULL
|
||||||
|
|
||||||
|
|
||||||
# remove duplicated "Transform start" events
|
# remove duplicated "Transform start" events
|
||||||
@ -89,7 +89,7 @@ trans_wide <- reshape(dat1, direction = "wide",
|
|||||||
# check how often an eventid is associated with two fileids
|
# check how often an eventid is associated with two fileids
|
||||||
nrow(subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop))
|
nrow(subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop))
|
||||||
|
|
||||||
# exclude from data set ??
|
# TODO: exclude from data set ??
|
||||||
# trans_wide <- subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop)
|
# trans_wide <- subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop)
|
||||||
|
|
||||||
# which(is.na(trans_wide$date.start))
|
# which(is.na(trans_wide$date.start))
|
||||||
@ -167,7 +167,7 @@ tail(dat2[, c("artwork", "event", "trace")], 50)
|
|||||||
|
|
||||||
rm(aws, i, j, last_event, art)
|
rm(aws, i, j, last_event, art)
|
||||||
|
|
||||||
#' ## Fix glossar entries (find corresponding artworks)
|
#' ## Fix glossar entries (find corresponding artworks and fill in trace)
|
||||||
|
|
||||||
glossar_files <- unique(dat2[dat2$artwork == "glossar", "popup"])
|
glossar_files <- unique(dat2[dat2$artwork == "glossar", "popup"])
|
||||||
|
|
||||||
@ -256,9 +256,9 @@ dat2[14110:14130, ]
|
|||||||
|
|
||||||
# TODO: Integrate for-loop into for-loop above
|
# TODO: Integrate for-loop into for-loop above
|
||||||
|
|
||||||
# TODO: For now: Exclude not matched glossar entries
|
|
||||||
|
|
||||||
df <- subset(dat2, !is.na(dat2$trace))
|
df <- subset(dat2, !is.na(dat2$trace))
|
||||||
|
# TODO: For now: Exclude not matched glossar entries
|
||||||
df <- df[order(df$trace), ]
|
df <- df[order(df$trace), ]
|
||||||
rownames(df) <- NULL
|
rownames(df) <- NULL
|
||||||
|
|
||||||
@ -279,20 +279,10 @@ flipCard_wide$event <- "flipCard"
|
|||||||
flipCard_wide$duration <- flipCard_wide$time_ms.stop -
|
flipCard_wide$duration <- flipCard_wide$time_ms.stop -
|
||||||
flipCard_wide$time_ms.start
|
flipCard_wide$time_ms.start
|
||||||
|
|
||||||
# TODO: Check if I still need to enter all of these variables
|
|
||||||
# --> x, y, scale, rotation?
|
|
||||||
flipCard_wide$card <- NA
|
flipCard_wide$card <- NA
|
||||||
flipCard_wide$popup <- NA
|
flipCard_wide$popup <- NA
|
||||||
flipCard_wide$x.start <- NA
|
|
||||||
flipCard_wide$x.stop <- NA
|
|
||||||
flipCard_wide$y.start <- NA
|
|
||||||
flipCard_wide$y.stop <- NA
|
|
||||||
flipCard_wide$distance <- NA
|
flipCard_wide$distance <- NA
|
||||||
flipCard_wide$scale.start <- NA
|
|
||||||
flipCard_wide$scale.stop <- NA
|
|
||||||
flipCard_wide$scaleSize <- NA
|
flipCard_wide$scaleSize <- NA
|
||||||
flipCard_wide$rotation.start <- NA
|
|
||||||
flipCard_wide$rotation.stop <- NA
|
|
||||||
flipCard_wide$rotationDegree <- NA
|
flipCard_wide$rotationDegree <- NA
|
||||||
|
|
||||||
dat_flipCard <- flipCard_wide[, c("fileid.start", "fileid.stop", "event",
|
dat_flipCard <- flipCard_wide[, c("fileid.start", "fileid.stop", "event",
|
||||||
@ -325,18 +315,9 @@ openTopic_wide$event <- "openTopic"
|
|||||||
openTopic_wide$duration <- openTopic_wide$time_ms.stop -
|
openTopic_wide$duration <- openTopic_wide$time_ms.stop -
|
||||||
openTopic_wide$time_ms.start
|
openTopic_wide$time_ms.start
|
||||||
|
|
||||||
|
|
||||||
openTopic_wide$popup <- NA
|
openTopic_wide$popup <- NA
|
||||||
openTopic_wide$x.start <- NA
|
|
||||||
openTopic_wide$x.stop <- NA
|
|
||||||
openTopic_wide$y.start <- NA
|
|
||||||
openTopic_wide$y.stop <- NA
|
|
||||||
openTopic_wide$distance <- NA
|
openTopic_wide$distance <- NA
|
||||||
openTopic_wide$scale.start <- NA
|
|
||||||
openTopic_wide$scale.stop <- NA
|
|
||||||
openTopic_wide$scaleSize <- NA
|
openTopic_wide$scaleSize <- NA
|
||||||
openTopic_wide$rotation.start <- NA
|
|
||||||
openTopic_wide$rotation.stop <- NA
|
|
||||||
openTopic_wide$rotationDegree <- NA
|
openTopic_wide$rotationDegree <- NA
|
||||||
|
|
||||||
dat_openTopic <- openTopic_wide[, c("fileid.start", "fileid.stop", "event",
|
dat_openTopic <- openTopic_wide[, c("fileid.start", "fileid.stop", "event",
|
||||||
@ -375,22 +356,13 @@ openPopup_wide <- reshape(dat5, direction = "wide",
|
|||||||
# df[df$trace == 4595, ]
|
# df[df$trace == 4595, ]
|
||||||
# --> artwork 046 popup selene.xml gets opened twice
|
# --> artwork 046 popup selene.xml gets opened twice
|
||||||
|
|
||||||
|
|
||||||
openPopup_wide$event <- "openPopup"
|
openPopup_wide$event <- "openPopup"
|
||||||
openPopup_wide$duration <- openPopup_wide$time_ms.stop -
|
openPopup_wide$duration <- openPopup_wide$time_ms.stop -
|
||||||
openPopup_wide$time_ms.start
|
openPopup_wide$time_ms.start
|
||||||
|
|
||||||
openPopup_wide$card <- NA
|
openPopup_wide$card <- NA
|
||||||
openPopup_wide$x.start <- NA
|
|
||||||
openPopup_wide$x.stop <- NA
|
|
||||||
openPopup_wide$y.start <- NA
|
|
||||||
openPopup_wide$y.stop <- NA
|
|
||||||
openPopup_wide$distance <- NA
|
openPopup_wide$distance <- NA
|
||||||
openPopup_wide$scale.start <- NA
|
|
||||||
openPopup_wide$scale.stop <- NA
|
|
||||||
openPopup_wide$scaleSize <- NA
|
openPopup_wide$scaleSize <- NA
|
||||||
openPopup_wide$rotation.start <- NA
|
|
||||||
openPopup_wide$rotation.stop <- NA
|
|
||||||
openPopup_wide$rotationDegree <- NA
|
openPopup_wide$rotationDegree <- NA
|
||||||
|
|
||||||
dat_openPopup <- openPopup_wide[, c("fileid.start", "fileid.stop", "event",
|
dat_openPopup <- openPopup_wide[, c("fileid.start", "fileid.stop", "event",
|
||||||
@ -529,12 +501,17 @@ out <- out[order(out$date.start), ]
|
|||||||
rownames(out) <- NULL
|
rownames(out) <- NULL
|
||||||
|
|
||||||
# Make `trace` a consecutive number
|
# Make `trace` a consecutive number
|
||||||
out$trace2 <- as.numeric(factor(out$trace, levels = unique(out$trace)))
|
out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace)))
|
||||||
|
|
||||||
|
#' # Fill in topics
|
||||||
|
|
||||||
|
topics <- read.table("../data/topics.csv", sep = ";", header = TRUE)
|
||||||
|
# TODO:
|
||||||
|
|
||||||
#' # Export data
|
#' # Export data
|
||||||
|
|
||||||
write.table(out, "../data/event_logfiles.csv",
|
write.table(out, "../data/event_logfiles.csv", sep = ";",
|
||||||
sep = ";", quote = FALSE, row.names = FALSE)
|
row.names = FALSE)
|
||||||
|
|
||||||
# TODO: Write function for closing events
|
# TODO: Write function for closing events
|
||||||
|
|
||||||
|
42
code/03_topic-cards.R
Normal file
42
code/03_topic-cards.R
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light")
|
||||||
|
rm(list=ls())
|
||||||
|
|
||||||
|
dat0 <- read.table("../../event_logfiles.csv", sep = ";", header = TRUE)
|
||||||
|
dat0$artwork <- sprintf("%03d", dat0$artwork)
|
||||||
|
|
||||||
|
# artwork names
|
||||||
|
artworks <- sort(unique(dat0$artwork))
|
||||||
|
|
||||||
|
# create data frame with file names and topics for each artwork
|
||||||
|
|
||||||
|
dat <- NULL
|
||||||
|
file_order <- NULL
|
||||||
|
|
||||||
|
for (artwork in artworks) {
|
||||||
|
fnames <- dir(pattern = paste0(artwork, "_"), path = artwork, full.names = TRUE)
|
||||||
|
topic <- NULL
|
||||||
|
for (fname in fnames) {
|
||||||
|
topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
|
||||||
|
grep("^<card type=", trimws(readLines(fname)), value = T)))
|
||||||
|
|
||||||
|
}
|
||||||
|
index <- paste(artwork, "index.xml", sep = "/")
|
||||||
|
file_order <- c(file_order, gsub("^<card src=.(.*)./>$", "\\1",
|
||||||
|
grep("^<card src=", trimws(readLines(index)), value = TRUE)))
|
||||||
|
in_index <- fnames %in% file_order
|
||||||
|
dat <- rbind(dat, data.frame(artwork, file_name = fnames, in_index, topic))
|
||||||
|
}
|
||||||
|
|
||||||
|
table(dat$artwork)
|
||||||
|
table(dat$topic)
|
||||||
|
|
||||||
|
# take only the ones that are actually displayed and sort in the same order
|
||||||
|
# as indicated in index.html
|
||||||
|
|
||||||
|
dat2 <- dat[dat$in_index, -3]
|
||||||
|
dat2 <- dat2[order(file_order, dat2$file_name), ]
|
||||||
|
|
||||||
|
dat2$index <- unlist(sapply(table(dat2$artwork), seq_len))
|
||||||
|
|
||||||
|
write.table(dat2, file = "../../topics.csv", sep = ";", row.names = FALSE)
|
||||||
|
|
58
code/questions_number-of-cards.R
Normal file
58
code/questions_number-of-cards.R
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
#' ---
|
||||||
|
#' title: "Open Questions -- Card indices"
|
||||||
|
#' author: "Nora Wickelmaier"
|
||||||
|
#' date: "`r Sys.Date()`"
|
||||||
|
#' output:
|
||||||
|
#' html_document:
|
||||||
|
#' number_sections: true
|
||||||
|
#' toc: true
|
||||||
|
#' ---
|
||||||
|
|
||||||
|
#+ include = FALSE
|
||||||
|
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
|
||||||
|
dat <- read.table("../data/event_logfiles.csv", sep = ";", header = TRUE)
|
||||||
|
dat$date.start <- as.POSIXct(dat$date.start)
|
||||||
|
dat$date.stop <- as.POSIXct(dat$date.stop)
|
||||||
|
dat$artwork <- sprintf("%03d", dat$artwork)
|
||||||
|
|
||||||
|
#' The following table shows an overview of the card indices. The indices
|
||||||
|
#' should have values between 0 and 5. It is unclear what the numbers mean.
|
||||||
|
|
||||||
|
table(dat$card)
|
||||||
|
|
||||||
|
#' Number of cards for each artwork in the data set (subset from 2016)
|
||||||
|
|
||||||
|
artworks <- sort(unique(dat$artwork))
|
||||||
|
|
||||||
|
count <- function(x) length(table(dat[which(dat$artwork == x), "card"]))
|
||||||
|
max_index <- function(x) max(dat[which(dat$artwork == x), "card"], na.rm = TRUE)
|
||||||
|
num_cards <- sapply(artworks, count)
|
||||||
|
highest_index <- sapply(artworks, max_index)
|
||||||
|
|
||||||
|
#' Check how many XML-files for cards are present
|
||||||
|
|
||||||
|
path <- "../data/ContentEyevisit/eyevisit_cards_light"
|
||||||
|
|
||||||
|
num_files <- NULL
|
||||||
|
for (artwork in artworks) {
|
||||||
|
fnames <- dir(pattern = paste0(artwork, "_"), path = paste(path, artwork, sep = "/"))
|
||||||
|
num_files <- c(num_files, length(fnames))
|
||||||
|
}
|
||||||
|
|
||||||
|
#' The table shows that each artwork has 6 cards the most (as expected).
|
||||||
|
#' This is a subset of the data, so not all cards have been opened.
|
||||||
|
|
||||||
|
cards <- data.frame(artwork = artworks, num_cards, highest_index,
|
||||||
|
num_files, diff = num_files - highest_index)
|
||||||
|
cards
|
||||||
|
|
||||||
|
#' There are more than 8 files for a couple of artworks:
|
||||||
|
|
||||||
|
subset(cards, cards$num_files >= 8)
|
||||||
|
|
||||||
|
#' It might be possible, that the number indicates the index of the file
|
||||||
|
#' and not the actual card that was displayed. BUT: In many cases, there
|
||||||
|
#' are only 6 (or less) files, but a higher index is present...
|
||||||
|
|
||||||
|
subset(cards, cards$diff < 0)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user