Worked on extracting topics for cards
This commit is contained in:
parent
495665a659
commit
bfc5c1d930
@ -242,6 +242,10 @@ Will probably just get rid of them!
|
||||
Think about whether you want to give warning messages about these deletions in the
|
||||
functions.
|
||||
|
||||
## Card indices go from 0 to 7 (instead of 0 to 5 as expected)
|
||||
|
||||
See `questions_number-of-cards.R` for details.
|
||||
|
||||
# Reading list
|
||||
|
||||
* @Arizmendi2022 [$-$]
|
||||
|
@ -53,12 +53,12 @@ num_stop <- c(diff(c(0, which(dat1$event == "Transform start"))))
|
||||
table(num_stop)
|
||||
|
||||
# TODO: Do I still need this?
|
||||
dat1$eventrep <- rep(num_start, num_start)
|
||||
dat1$dupl <- duplicated(dat1[, c("event", "eventid")]) # keep first
|
||||
dat1$dupl <- duplicated(dat1[, c("event", "eventid")], fromLast = TRUE) # keep last
|
||||
dat1[dat1$eventrep == 10, ]
|
||||
dat1$dupl <- NULL
|
||||
dat1$eventrep <- NULL
|
||||
# dat1$eventrep <- rep(num_start, num_start)
|
||||
# dat1$dupl <- duplicated(dat1[, c("event", "eventid")]) # keep first
|
||||
# dat1$dupl <- duplicated(dat1[, c("event", "eventid")], fromLast = TRUE) # keep last
|
||||
# dat1[dat1$eventrep == 10, ]
|
||||
# dat1$dupl <- NULL
|
||||
# dat1$eventrep <- NULL
|
||||
|
||||
|
||||
# remove duplicated "Transform start" events
|
||||
@ -89,7 +89,7 @@ trans_wide <- reshape(dat1, direction = "wide",
|
||||
# check how often an eventid is associated with two fileids
|
||||
nrow(subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop))
|
||||
|
||||
# exclude from data set ??
|
||||
# TODO: exclude from data set ??
|
||||
# trans_wide <- subset(trans_wide, trans_wide$fileid.start != trans_wide$fileid.stop)
|
||||
|
||||
# which(is.na(trans_wide$date.start))
|
||||
@ -167,7 +167,7 @@ tail(dat2[, c("artwork", "event", "trace")], 50)
|
||||
|
||||
rm(aws, i, j, last_event, art)
|
||||
|
||||
#' ## Fix glossar entries (find corresponding artworks)
|
||||
#' ## Fix glossar entries (find corresponding artworks and fill in trace)
|
||||
|
||||
glossar_files <- unique(dat2[dat2$artwork == "glossar", "popup"])
|
||||
|
||||
@ -256,9 +256,9 @@ dat2[14110:14130, ]
|
||||
|
||||
# TODO: Integrate for-loop into for-loop above
|
||||
|
||||
# TODO: For now: Exclude not matched glossar entries
|
||||
|
||||
df <- subset(dat2, !is.na(dat2$trace))
|
||||
# TODO: For now: Exclude not matched glossar entries
|
||||
df <- df[order(df$trace), ]
|
||||
rownames(df) <- NULL
|
||||
|
||||
@ -279,20 +279,10 @@ flipCard_wide$event <- "flipCard"
|
||||
flipCard_wide$duration <- flipCard_wide$time_ms.stop -
|
||||
flipCard_wide$time_ms.start
|
||||
|
||||
# TODO: Check if I still need to enter all of these variables
|
||||
# --> x, y, scale, rotation?
|
||||
flipCard_wide$card <- NA
|
||||
flipCard_wide$popup <- NA
|
||||
flipCard_wide$x.start <- NA
|
||||
flipCard_wide$x.stop <- NA
|
||||
flipCard_wide$y.start <- NA
|
||||
flipCard_wide$y.stop <- NA
|
||||
flipCard_wide$distance <- NA
|
||||
flipCard_wide$scale.start <- NA
|
||||
flipCard_wide$scale.stop <- NA
|
||||
flipCard_wide$scaleSize <- NA
|
||||
flipCard_wide$rotation.start <- NA
|
||||
flipCard_wide$rotation.stop <- NA
|
||||
flipCard_wide$rotationDegree <- NA
|
||||
|
||||
dat_flipCard <- flipCard_wide[, c("fileid.start", "fileid.stop", "event",
|
||||
@ -325,18 +315,9 @@ openTopic_wide$event <- "openTopic"
|
||||
openTopic_wide$duration <- openTopic_wide$time_ms.stop -
|
||||
openTopic_wide$time_ms.start
|
||||
|
||||
|
||||
openTopic_wide$popup <- NA
|
||||
openTopic_wide$x.start <- NA
|
||||
openTopic_wide$x.stop <- NA
|
||||
openTopic_wide$y.start <- NA
|
||||
openTopic_wide$y.stop <- NA
|
||||
openTopic_wide$distance <- NA
|
||||
openTopic_wide$scale.start <- NA
|
||||
openTopic_wide$scale.stop <- NA
|
||||
openTopic_wide$scaleSize <- NA
|
||||
openTopic_wide$rotation.start <- NA
|
||||
openTopic_wide$rotation.stop <- NA
|
||||
openTopic_wide$rotationDegree <- NA
|
||||
|
||||
dat_openTopic <- openTopic_wide[, c("fileid.start", "fileid.stop", "event",
|
||||
@ -375,22 +356,13 @@ openPopup_wide <- reshape(dat5, direction = "wide",
|
||||
# df[df$trace == 4595, ]
|
||||
# --> artwork 046 popup selene.xml gets opened twice
|
||||
|
||||
|
||||
openPopup_wide$event <- "openPopup"
|
||||
openPopup_wide$duration <- openPopup_wide$time_ms.stop -
|
||||
openPopup_wide$time_ms.start
|
||||
|
||||
openPopup_wide$card <- NA
|
||||
openPopup_wide$x.start <- NA
|
||||
openPopup_wide$x.stop <- NA
|
||||
openPopup_wide$y.start <- NA
|
||||
openPopup_wide$y.stop <- NA
|
||||
openPopup_wide$distance <- NA
|
||||
openPopup_wide$scale.start <- NA
|
||||
openPopup_wide$scale.stop <- NA
|
||||
openPopup_wide$scaleSize <- NA
|
||||
openPopup_wide$rotation.start <- NA
|
||||
openPopup_wide$rotation.stop <- NA
|
||||
openPopup_wide$rotationDegree <- NA
|
||||
|
||||
dat_openPopup <- openPopup_wide[, c("fileid.start", "fileid.stop", "event",
|
||||
@ -529,12 +501,17 @@ out <- out[order(out$date.start), ]
|
||||
rownames(out) <- NULL
|
||||
|
||||
# Make `trace` a consecutive number
|
||||
out$trace2 <- as.numeric(factor(out$trace, levels = unique(out$trace)))
|
||||
out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace)))
|
||||
|
||||
#' # Fill in topics
|
||||
|
||||
topics <- read.table("../data/topics.csv", sep = ";", header = TRUE)
|
||||
# TODO:
|
||||
|
||||
#' # Export data
|
||||
|
||||
write.table(out, "../data/event_logfiles.csv",
|
||||
sep = ";", quote = FALSE, row.names = FALSE)
|
||||
write.table(out, "../data/event_logfiles.csv", sep = ";",
|
||||
row.names = FALSE)
|
||||
|
||||
# TODO: Write function for closing events
|
||||
|
||||
|
42
code/03_topic-cards.R
Normal file
42
code/03_topic-cards.R
Normal file
@ -0,0 +1,42 @@
|
||||
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light")
|
||||
rm(list=ls())
|
||||
|
||||
dat0 <- read.table("../../event_logfiles.csv", sep = ";", header = TRUE)
|
||||
dat0$artwork <- sprintf("%03d", dat0$artwork)
|
||||
|
||||
# artwork names
|
||||
artworks <- sort(unique(dat0$artwork))
|
||||
|
||||
# create data frame with file names and topics for each artwork
|
||||
|
||||
dat <- NULL
|
||||
file_order <- NULL
|
||||
|
||||
for (artwork in artworks) {
|
||||
fnames <- dir(pattern = paste0(artwork, "_"), path = artwork, full.names = TRUE)
|
||||
topic <- NULL
|
||||
for (fname in fnames) {
|
||||
topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
|
||||
grep("^<card type=", trimws(readLines(fname)), value = T)))
|
||||
|
||||
}
|
||||
index <- paste(artwork, "index.xml", sep = "/")
|
||||
file_order <- c(file_order, gsub("^<card src=.(.*)./>$", "\\1",
|
||||
grep("^<card src=", trimws(readLines(index)), value = TRUE)))
|
||||
in_index <- fnames %in% file_order
|
||||
dat <- rbind(dat, data.frame(artwork, file_name = fnames, in_index, topic))
|
||||
}
|
||||
|
||||
table(dat$artwork)
|
||||
table(dat$topic)
|
||||
|
||||
# take only the ones that are actually displayed and sort in the same order
|
||||
# as indicated in index.xml
|
||||
|
||||
dat2 <- dat[dat$in_index, -3]
|
||||
# Reorder the rows so they follow the card order collected in `file_order`
# (the <card src=...> entries read from each artwork's index.xml).
# `match()` gives every file its position in `file_order`; ordering by that
# position also works when `file_order` holds entries absent from `dat2`.
dat2 <- dat2[order(match(dat2$file_name, file_order)), ]
|
||||
|
||||
# Running card index (1, 2, ...) within each artwork, in current row order.
# `ave()` always returns a plain vector of length nrow(dat2); the previous
# `unlist(sapply(...))` returned a matrix when all artworks had the same
# number of cards and relied on the rows being grouped in `table()` order.
dat2$index <- ave(seq_len(nrow(dat2)), dat2$artwork, FUN = seq_along)
|
||||
|
||||
write.table(dat2, file = "../../topics.csv", sep = ";", row.names = FALSE)
|
||||
|
58
code/questions_number-of-cards.R
Normal file
58
code/questions_number-of-cards.R
Normal file
@ -0,0 +1,58 @@
|
||||
#' ---
|
||||
#' title: "Open Questions -- Card indices"
|
||||
#' author: "Nora Wickelmaier"
|
||||
#' date: "`r Sys.Date()`"
|
||||
#' output:
|
||||
#' html_document:
|
||||
#' number_sections: true
|
||||
#' toc: true
|
||||
#' ---
|
||||
|
||||
#+ include = FALSE
|
||||
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
|
||||
dat <- read.table("../data/event_logfiles.csv", sep = ";", header = TRUE)
|
||||
dat$date.start <- as.POSIXct(dat$date.start)
|
||||
dat$date.stop <- as.POSIXct(dat$date.stop)
|
||||
dat$artwork <- sprintf("%03d", dat$artwork)
|
||||
|
||||
#' The following table shows an overview of the card indices. The indices
|
||||
#' should have values between 0 and 5. It is unclear what the numbers mean.
|
||||
|
||||
table(dat$card)
|
||||
|
||||
#' Number of cards for each artwork in the data set (subset from 2016)
|
||||
|
||||
artworks <- sort(unique(dat$artwork))
|
||||
|
||||
# Number of distinct (non-missing) card indices logged for artwork `x`;
# reads the global data frame `dat`.
count <- function(x) {
  rows <- which(dat$artwork == x)
  card_freq <- table(dat[rows, "card"])
  length(card_freq)
}
|
||||
# Highest card index logged for artwork `x`, ignoring missing values;
# reads the global data frame `dat`.
max_index <- function(x) {
  rows <- which(dat$artwork == x)
  card_values <- dat[rows, "card"]
  max(card_values, na.rm = TRUE)
}
|
||||
num_cards <- sapply(artworks, count)
|
||||
highest_index <- sapply(artworks, max_index)
|
||||
|
||||
#' Check how many XML-files for cards are present
|
||||
|
||||
path <- "../data/ContentEyevisit/eyevisit_cards_light"
|
||||
|
||||
num_files <- NULL
|
||||
for (artwork in artworks) {
|
||||
fnames <- dir(pattern = paste0(artwork, "_"), path = paste(path, artwork, sep = "/"))
|
||||
num_files <- c(num_files, length(fnames))
|
||||
}
|
||||
|
||||
#' The table shows that each artwork has at most 6 cards (as expected).
|
||||
#' This is a subset of the data, so not all cards have been opened.
|
||||
|
||||
cards <- data.frame(artwork = artworks, num_cards, highest_index,
|
||||
num_files, diff = num_files - highest_index)
|
||||
cards
|
||||
|
||||
#' There are 8 or more files for a couple of artworks:
|
||||
|
||||
subset(cards, cards$num_files >= 8)
|
||||
|
||||
#' It might be possible that the number indicates the index of the file
|
||||
#' and not the actual card that was displayed. BUT: In many cases, there
|
||||
#' are only 6 (or fewer) files, but a higher index is present...
|
||||
|
||||
subset(cards, cards$diff < 0)
|
||||
|
Loading…
Reference in New Issue
Block a user