Worked on extracting topics for cards

This commit is contained in:
Nora Wickelmaier 2023-09-01 15:01:54 +02:00
parent 495665a659
commit bfc5c1d930
4 changed files with 121 additions and 40 deletions

View File

@ -242,6 +242,10 @@ Will probably just get rid of them!
Think about if you want give warning messages about these deletions in the
functions.
## Card indices go from 0 to 7 (instead of 0 to 5 as expected)
See `questions_number-of-cards.R` for details.
# Reading list
* @Arizmendi2022 [$-$]

View File

@ -53,12 +53,12 @@ num_stop <- c(diff(c(0, which(dat1$event == "Transform start"))))
table(num_stop)
# TODO: Do I still need this?
dat1$eventrep <- rep(num_start, num_start)
dat1$dupl <- duplicated(dat1[, c("event", "eventid")]) # keep first
dat1$dupl <- duplicated(dat1[, c("event", "eventid")], fromLast = TRUE) # keep last
dat1[dat1$eventrep == 10, ]
dat1$dupl <- NULL
dat1$eventrep <- NULL
# dat1$eventrep <- rep(num_start, num_start)
# dat1$dupl <- duplicated(dat1[, c("event", "eventid")]) # keep first
# dat1$dupl <- duplicated(dat1[, c("event", "eventid")], fromLast = TRUE) # keep last
# dat1[dat1$eventrep == 10, ]
# dat1$dupl <- NULL
# dat1$eventrep <- NULL
# remove duplicated "Transform start" events
@ -89,7 +89,7 @@ trans_wide <- reshape(dat1, direction = "wide",
# check how often an eventid is associated with two fileids
nrow(subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop))
# exclude from data set ??
# TODO: exclude from data set ??
# trans_wide <- subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop)
# which(is.na(trans_wide$date.start))
@ -167,7 +167,7 @@ tail(dat2[, c("artwork", "event", "trace")], 50)
rm(aws, i, j, last_event, art)
#' ## Fix glossar entries (find corresponding artworks)
#' ## Fix glossar entries (find corresponding artworks and fill in trace)
glossar_files <- unique(dat2[dat2$artwork == "glossar", "popup"])
@ -256,9 +256,9 @@ dat2[14110:14130, ]
# TODO: Integrate for-loop into for-loop above
# TODO: For now: Exclude not matched glossar entries
df <- subset(dat2, !is.na(dat2$trace))
# TODO: For now: Exclude not matched glossar entries
df <- df[order(df$trace), ]
rownames(df) <- NULL
@ -279,20 +279,10 @@ flipCard_wide$event <- "flipCard"
flipCard_wide$duration <- flipCard_wide$time_ms.stop -
flipCard_wide$time_ms.start
# TODO: Check if I still need to enter all of these variables
# --> x, y, scale, rotation?
flipCard_wide$card <- NA
flipCard_wide$popup <- NA
flipCard_wide$x.start <- NA
flipCard_wide$x.stop <- NA
flipCard_wide$y.start <- NA
flipCard_wide$y.stop <- NA
flipCard_wide$distance <- NA
flipCard_wide$scale.start <- NA
flipCard_wide$scale.stop <- NA
flipCard_wide$scaleSize <- NA
flipCard_wide$rotation.start <- NA
flipCard_wide$rotation.stop <- NA
flipCard_wide$rotationDegree <- NA
dat_flipCard <- flipCard_wide[, c("fileid.start", "fileid.stop", "event",
@ -325,18 +315,9 @@ openTopic_wide$event <- "openTopic"
openTopic_wide$duration <- openTopic_wide$time_ms.stop -
openTopic_wide$time_ms.start
openTopic_wide$popup <- NA
openTopic_wide$x.start <- NA
openTopic_wide$x.stop <- NA
openTopic_wide$y.start <- NA
openTopic_wide$y.stop <- NA
openTopic_wide$distance <- NA
openTopic_wide$scale.start <- NA
openTopic_wide$scale.stop <- NA
openTopic_wide$scaleSize <- NA
openTopic_wide$rotation.start <- NA
openTopic_wide$rotation.stop <- NA
openTopic_wide$rotationDegree <- NA
dat_openTopic <- openTopic_wide[, c("fileid.start", "fileid.stop", "event",
@ -375,22 +356,13 @@ openPopup_wide <- reshape(dat5, direction = "wide",
# df[df$trace == 4595, ]
# --> artwork 046 popup selene.xml gets opened twice
openPopup_wide$event <- "openPopup"
openPopup_wide$duration <- openPopup_wide$time_ms.stop -
openPopup_wide$time_ms.start
openPopup_wide$card <- NA
openPopup_wide$x.start <- NA
openPopup_wide$x.stop <- NA
openPopup_wide$y.start <- NA
openPopup_wide$y.stop <- NA
openPopup_wide$distance <- NA
openPopup_wide$scale.start <- NA
openPopup_wide$scale.stop <- NA
openPopup_wide$scaleSize <- NA
openPopup_wide$rotation.start <- NA
openPopup_wide$rotation.stop <- NA
openPopup_wide$rotationDegree <- NA
dat_openPopup <- openPopup_wide[, c("fileid.start", "fileid.stop", "event",
@ -529,12 +501,17 @@ out <- out[order(out$date.start), ]
rownames(out) <- NULL
# Make `trace` a consecutive number
out$trace2 <- as.numeric(factor(out$trace, levels = unique(out$trace)))
out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace)))
#' # Fill in topics
topics <- read.table("../data/topics.csv", sep = ";", header = TRUE)
# TODO:
#' # Export data
write.table(out, "../data/event_logfiles.csv",
sep = ";", quote = FALSE, row.names = FALSE)
write.table(out, "../data/event_logfiles.csv", sep = ";",
row.names = FALSE)
# TODO: Write function for closing events

42
code/03_topic-cards.R Normal file
View File

@ -0,0 +1,42 @@
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light")
rm(list=ls())
dat0 <- read.table("../../event_logfiles.csv", sep = ";", header = TRUE)
dat0$artwork <- sprintf("%03d", dat0$artwork)
# artwork names
artworks <- sort(unique(dat0$artwork))
# create data frame with file names and topics for each artwork
dat <- NULL
file_order <- NULL
for (artwork in artworks) {
fnames <- dir(pattern = paste0(artwork, "_"), path = artwork, full.names = TRUE)
topic <- NULL
for (fname in fnames) {
topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
grep("^<card type=", trimws(readLines(fname)), value = T)))
}
index <- paste(artwork, "index.xml", sep = "/")
file_order <- c(file_order, gsub("^<card src=.(.*)./>$", "\\1",
grep("^<card src=", trimws(readLines(index)), value = TRUE)))
in_index <- fnames %in% file_order
dat <- rbind(dat, data.frame(artwork, file_name = fnames, in_index, topic))
}
table(dat$artwork)
table(dat$topic)
# take only the ones that are actually displayed and sort in the same order
# as indicated in index.html
dat2 <- dat[dat$in_index, -3]
dat2 <- dat2[order(file_order, dat2$file_name), ]
dat2$index <- unlist(sapply(table(dat2$artwork), seq_len))
write.table(dat2, file = "../../topics.csv", sep = ";", row.names = FALSE)

View File

@ -0,0 +1,58 @@
#' ---
#' title: "Open Questions -- Card indices"
#' author: "Nora Wickelmaier"
#' date: "`r Sys.Date()`"
#' output:
#' html_document:
#' number_sections: true
#' toc: true
#' ---
#+ include = FALSE
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
dat <- read.table("../data/event_logfiles.csv", sep = ";", header = TRUE)
dat$date.start <- as.POSIXct(dat$date.start)
dat$date.stop <- as.POSIXct(dat$date.stop)
dat$artwork <- sprintf("%03d", dat$artwork)
#' The following table shows an overview of the card indices. The indices
#' should have values between 0 and 5. It is unclear what the numbers mean.
table(dat$card)
#' Number of cards for each artwork in the data set (subset from 2016)
artworks <- sort(unique(dat$artwork))
count <- function(x) length(table(dat[which(dat$artwork == x), "card"]))
max_index <- function(x) max(dat[which(dat$artwork == x), "card"], na.rm = TRUE)
num_cards <- sapply(artworks, count)
highest_index <- sapply(artworks, max_index)
#' Check how many XML-files for cards are present
path <- "../data/ContentEyevisit/eyevisit_cards_light"
num_files <- NULL
for (artwork in artworks) {
fnames <- dir(pattern = paste0(artwork, "_"), path = paste(path, artwork, sep = "/"))
num_files <- c(num_files, length(fnames))
}
#' The table shows that each artwork has 6 cards the most (as expected).
#' This is a subset of the data, so not all cards have been opened.
cards <- data.frame(artwork = artworks, num_cards, highest_index,
num_files, diff = num_files - highest_index)
cards
#' There are more than 8 files for a couple of artworks:
subset(cards, cards$num_files >= 8)
#' It might be possible, that the number indicates the index of the file
#' and not the actual card that was displayed. BUT: In many cases, there
#' are only 6 (or less) files, but a higher index is present...
subset(cards, cards$diff < 0)