Worked on extracting topics for cards

This commit is contained in:
Nora Wickelmaier 2023-09-01 15:01:54 +02:00
parent 495665a659
commit bfc5c1d930
4 changed files with 121 additions and 40 deletions

View File

@ -242,6 +242,10 @@ Will probably just get rid of them!
Think about if you want give warning messages about these deletions in the Think about if you want give warning messages about these deletions in the
functions. functions.
## Card indices go from 0 to 7 (instead of 0 to 5 as expected)
See `questions_number-of-cards.R` for details.
# Reading list # Reading list
* @Arizmendi2022 [$-$] * @Arizmendi2022 [$-$]

View File

@ -53,12 +53,12 @@ num_stop <- c(diff(c(0, which(dat1$event == "Transform start"))))
table(num_stop) table(num_stop)
# TODO: Do I still need this? # TODO: Do I still need this?
dat1$eventrep <- rep(num_start, num_start) # dat1$eventrep <- rep(num_start, num_start)
dat1$dupl <- duplicated(dat1[, c("event", "eventid")]) # keep first # dat1$dupl <- duplicated(dat1[, c("event", "eventid")]) # keep first
dat1$dupl <- duplicated(dat1[, c("event", "eventid")], fromLast = TRUE) # keep last # dat1$dupl <- duplicated(dat1[, c("event", "eventid")], fromLast = TRUE) # keep last
dat1[dat1$eventrep == 10, ] # dat1[dat1$eventrep == 10, ]
dat1$dupl <- NULL # dat1$dupl <- NULL
dat1$eventrep <- NULL # dat1$eventrep <- NULL
# remove duplicated "Transform start" events # remove duplicated "Transform start" events
@ -89,7 +89,7 @@ trans_wide <- reshape(dat1, direction = "wide",
# check how often an eventid is associated with two fileids # check how often an eventid is associated with two fileids
nrow(subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop)) nrow(subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop))
# exclude from data set ?? # TODO: exclude from data set ??
# trans_wide <- subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop) # trans_wide <- subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop)
# which(is.na(trans_wide$date.start)) # which(is.na(trans_wide$date.start))
@ -167,7 +167,7 @@ tail(dat2[, c("artwork", "event", "trace")], 50)
rm(aws, i, j, last_event, art) rm(aws, i, j, last_event, art)
#' ## Fix glossar entries (find corresponding artworks) #' ## Fix glossar entries (find corresponding artworks and fill in trace)
glossar_files <- unique(dat2[dat2$artwork == "glossar", "popup"]) glossar_files <- unique(dat2[dat2$artwork == "glossar", "popup"])
@ -256,9 +256,9 @@ dat2[14110:14130, ]
# TODO: Integrate for-loop into for-loop above # TODO: Integrate for-loop into for-loop above
# TODO: For now: Exclude not matched glossar entries
df <- subset(dat2, !is.na(dat2$trace)) df <- subset(dat2, !is.na(dat2$trace))
# TODO: For now: Exclude not matched glossar entries
df <- df[order(df$trace), ] df <- df[order(df$trace), ]
rownames(df) <- NULL rownames(df) <- NULL
@ -279,20 +279,10 @@ flipCard_wide$event <- "flipCard"
flipCard_wide$duration <- flipCard_wide$time_ms.stop - flipCard_wide$duration <- flipCard_wide$time_ms.stop -
flipCard_wide$time_ms.start flipCard_wide$time_ms.start
# TODO: Check if I still need to enter all of these variables
# --> x, y, scale, rotation?
flipCard_wide$card <- NA flipCard_wide$card <- NA
flipCard_wide$popup <- NA flipCard_wide$popup <- NA
flipCard_wide$x.start <- NA
flipCard_wide$x.stop <- NA
flipCard_wide$y.start <- NA
flipCard_wide$y.stop <- NA
flipCard_wide$distance <- NA flipCard_wide$distance <- NA
flipCard_wide$scale.start <- NA
flipCard_wide$scale.stop <- NA
flipCard_wide$scaleSize <- NA flipCard_wide$scaleSize <- NA
flipCard_wide$rotation.start <- NA
flipCard_wide$rotation.stop <- NA
flipCard_wide$rotationDegree <- NA flipCard_wide$rotationDegree <- NA
dat_flipCard <- flipCard_wide[, c("fileid.start", "fileid.stop", "event", dat_flipCard <- flipCard_wide[, c("fileid.start", "fileid.stop", "event",
@ -325,18 +315,9 @@ openTopic_wide$event <- "openTopic"
openTopic_wide$duration <- openTopic_wide$time_ms.stop - openTopic_wide$duration <- openTopic_wide$time_ms.stop -
openTopic_wide$time_ms.start openTopic_wide$time_ms.start
openTopic_wide$popup <- NA openTopic_wide$popup <- NA
openTopic_wide$x.start <- NA
openTopic_wide$x.stop <- NA
openTopic_wide$y.start <- NA
openTopic_wide$y.stop <- NA
openTopic_wide$distance <- NA openTopic_wide$distance <- NA
openTopic_wide$scale.start <- NA
openTopic_wide$scale.stop <- NA
openTopic_wide$scaleSize <- NA openTopic_wide$scaleSize <- NA
openTopic_wide$rotation.start <- NA
openTopic_wide$rotation.stop <- NA
openTopic_wide$rotationDegree <- NA openTopic_wide$rotationDegree <- NA
dat_openTopic <- openTopic_wide[, c("fileid.start", "fileid.stop", "event", dat_openTopic <- openTopic_wide[, c("fileid.start", "fileid.stop", "event",
@ -375,22 +356,13 @@ openPopup_wide <- reshape(dat5, direction = "wide",
# df[df$trace == 4595, ] # df[df$trace == 4595, ]
# --> artwork 046 popup selene.xml gets opened twice # --> artwork 046 popup selene.xml gets opened twice
openPopup_wide$event <- "openPopup" openPopup_wide$event <- "openPopup"
openPopup_wide$duration <- openPopup_wide$time_ms.stop - openPopup_wide$duration <- openPopup_wide$time_ms.stop -
openPopup_wide$time_ms.start openPopup_wide$time_ms.start
openPopup_wide$card <- NA openPopup_wide$card <- NA
openPopup_wide$x.start <- NA
openPopup_wide$x.stop <- NA
openPopup_wide$y.start <- NA
openPopup_wide$y.stop <- NA
openPopup_wide$distance <- NA openPopup_wide$distance <- NA
openPopup_wide$scale.start <- NA
openPopup_wide$scale.stop <- NA
openPopup_wide$scaleSize <- NA openPopup_wide$scaleSize <- NA
openPopup_wide$rotation.start <- NA
openPopup_wide$rotation.stop <- NA
openPopup_wide$rotationDegree <- NA openPopup_wide$rotationDegree <- NA
dat_openPopup <- openPopup_wide[, c("fileid.start", "fileid.stop", "event", dat_openPopup <- openPopup_wide[, c("fileid.start", "fileid.stop", "event",
@ -529,12 +501,17 @@ out <- out[order(out$date.start), ]
rownames(out) <- NULL rownames(out) <- NULL
# Make `trace` a consecutive number # Make `trace` a consecutive number
out$trace2 <- as.numeric(factor(out$trace, levels = unique(out$trace))) out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace)))
#' # Fill in topics
topics <- read.table("../data/topics.csv", sep = ";", header = TRUE)
# TODO:
#' # Export data #' # Export data
write.table(out, "../data/event_logfiles.csv", write.table(out, "../data/event_logfiles.csv", sep = ";",
sep = ";", quote = FALSE, row.names = FALSE) row.names = FALSE)
# TODO: Write function for closing events # TODO: Write function for closing events

42
code/03_topic-cards.R Normal file
View File

@ -0,0 +1,42 @@
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light")
rm(list=ls())
dat0 <- read.table("../../event_logfiles.csv", sep = ";", header = TRUE)
dat0$artwork <- sprintf("%03d", dat0$artwork)
# artwork names
artworks <- sort(unique(dat0$artwork))
# create data frame with file names and topics for each artwork
dat <- NULL
file_order <- NULL
for (artwork in artworks) {
fnames <- dir(pattern = paste0(artwork, "_"), path = artwork, full.names = TRUE)
topic <- NULL
for (fname in fnames) {
topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
grep("^<card type=", trimws(readLines(fname)), value = T)))
}
index <- paste(artwork, "index.xml", sep = "/")
file_order <- c(file_order, gsub("^<card src=.(.*)./>$", "\\1",
grep("^<card src=", trimws(readLines(index)), value = TRUE)))
in_index <- fnames %in% file_order
dat <- rbind(dat, data.frame(artwork, file_name = fnames, in_index, topic))
}
table(dat$artwork)
table(dat$topic)
# take only the ones that are actually displayed and sort in the same order
# as indicated in index.html
dat2 <- dat[dat$in_index, -3]
dat2 <- dat2[order(file_order, dat2$file_name), ]
dat2$index <- unlist(sapply(table(dat2$artwork), seq_len))
write.table(dat2, file = "../../topics.csv", sep = ";", row.names = FALSE)

View File

@ -0,0 +1,58 @@
#' ---
#' title: "Open Questions -- Card indices"
#' author: "Nora Wickelmaier"
#' date: "`r Sys.Date()`"
#' output:
#' html_document:
#' number_sections: true
#' toc: true
#' ---
#+ include = FALSE
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
dat <- read.table("../data/event_logfiles.csv", sep = ";", header = TRUE)
dat$date.start <- as.POSIXct(dat$date.start)
dat$date.stop <- as.POSIXct(dat$date.stop)
dat$artwork <- sprintf("%03d", dat$artwork)
#' The following table shows an overview of the card indices. The indices
#' should have values between 0 and 5. It is unclear what the numbers mean.
table(dat$card)
#' Number of cards for each artwork in the data set (subset from 2016)
artworks <- sort(unique(dat$artwork))
count <- function(x) length(table(dat[which(dat$artwork == x), "card"]))
max_index <- function(x) max(dat[which(dat$artwork == x), "card"], na.rm = TRUE)
num_cards <- sapply(artworks, count)
highest_index <- sapply(artworks, max_index)
#' Check how many XML-files for cards are present
path <- "../data/ContentEyevisit/eyevisit_cards_light"
num_files <- NULL
for (artwork in artworks) {
fnames <- dir(pattern = paste0(artwork, "_"), path = paste(path, artwork, sep = "/"))
num_files <- c(num_files, length(fnames))
}
#' The table shows that each artwork has 6 cards the most (as expected).
#' This is a subset of the data, so not all cards have been opened.
cards <- data.frame(artwork = artworks, num_cards, highest_index,
num_files, diff = num_files - highest_index)
cards
#' There are more than 8 files for a couple of artworks:
subset(cards, cards$num_files >= 8)
#' It might be possible, that the number indicates the index of the file
#' and not the actual card that was displayed. BUT: In many cases, there
#' are only 6 (or less) files, but a higher index is present...
subset(cards, cards$diff < 0)