Added topics to data frame

This commit is contained in:
Nora Wickelmaier 2023-09-12 17:49:35 +02:00
parent fa730081db
commit 9f15ea1b62
4 changed files with 150 additions and 44 deletions

View File

@ -246,6 +246,39 @@ functions.
See `questions_number-of-cards.R` for details.
## Extracting topics
When I extract the topics from `index.html` I get different topics, than
when I get them from `<artwork>.html`. At first glance, it looks like using
`index.html` actually gives the wrong results.
```
topics <- extract_topics(artworks, "index.xml", path)
topics2 <- extract_topics(artworks, paste0(artworks, ".xml"), path)
topics[!topics$file_name %in% topics2$file_name, ]
# artwork file_name topic index
# 072 072_artist.xml artist 1
# 073 073_artist.xml artist 1
# 110 110_technik.xml technik 2
topics2[!topics2$file_name %in% topics$file_name, ]
# artwork file_name topic index
# 031 031_vergleich.xml extra info 6
# 033 033_technik.xml technik 2
# 055 055_vergleich4.xml extra info 5
# 063 063_thema3.xml thema 3
# 063 063_extrainfo1.xml thema 4
# 072 072_artist2.xml artist 1
# 073 073_artist2.xml artist 1
# 099 099_technik.xml technik 2
# 110 110_technikneu.xml technik 2
```
For artwork 031, `index.html` only defines 5 cards (the 6th is commented
out), but `topicNumber` for this artwork has 6 different entries. I will
therefore extract the topics from `<artwork>.html`. (This seems also better
compatible with other data sets like 8o8m.
# Reading list
* @Arizmendi2022 [$-$]

View File

@ -49,12 +49,14 @@ dat3 <- dat3[, c("fileId.start", "fileId.stop", "eventId", "case",
# Add trace for move events
dat4 <- add_trace_moves(dat3)
# Fill in topics
# Add topics: file names and topics
artworks <- unique(dat4$artwork)
topics <- extract_topics(artworks, pattern = paste0(artworks, ".xml"),
path = "../data/ContentEyevisit/eyevisit_cards_light/")
# topics <- read.table("../data/topics.csv", sep = ";", header = TRUE)
# TODO: Add topics to data frame
dat5 <- add_topic(dat4, topics = topics)
# Export data
write.table(dat4, "../data/event_logfiles.csv", sep = ";",
write.table(dat5, "../data/event_logfiles.csv", sep = ";",
row.names = FALSE)

View File

@ -1,42 +1,16 @@
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light")
rm(list=ls())
path <- "C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light"
dat0 <- read.table("../../event_logfiles.csv", sep = ";", header = TRUE)
dat0$artwork <- sprintf("%03d", dat0$artwork)
setwd(path)
# artwork names
dat0 <- read.table("../../event_logfiles.csv", sep = ";", header = TRUE)
dat0$artwork <- sprintf("%03d", dat0$artwork)
artworks <- sort(unique(dat0$artwork))
# create data frame with file names and topics for each artwork
# extract topics
topics <- extract_topics(artworks, paste0(artworks, ".xml"), path)
dat <- NULL
file_order <- NULL
write.table(topics, file = "../../topics.csv", sep = ";", row.names = FALSE)
for (artwork in artworks) {
fnames <- dir(pattern = paste0(artwork, "_"), path = artwork, full.names = TRUE)
topic <- NULL
for (fname in fnames) {
topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
grep("^<card type=", trimws(readLines(fname)), value = T)))
}
index <- paste(artwork, "index.xml", sep = "/")
file_order <- c(file_order, gsub("^<card src=.(.*)./>$", "\\1",
grep("^<card src=", trimws(readLines(index)), value = TRUE)))
in_index <- fnames %in% file_order
dat <- rbind(dat, data.frame(artwork, file_name = fnames, in_index, topic))
}
table(dat$artwork)
table(dat$topic)
# take only the ones that are actually displayed and sort in the same order
# as indicated in index.html
dat2 <- dat[dat$in_index, -3]
dat2 <- dat2[order(file_order, dat2$file_name), ]
dat2$index <- unlist(sapply(table(dat2$artwork), seq_len))
write.table(dat2, file = "../../topics.csv", sep = ";", row.names = FALSE)
# TODO: Keep this file?

View File

@ -277,4 +277,101 @@ add_trace_moves <- function(data) {
# TODO: Get rid of the loops
# --> This takes forever...
###########################################################################
# Create data frame with file names and topics for each artwork
extract_topics <- function(artworks, pattern, path) {
dat <- NULL
file_order <- NULL
i <- 1
for (artwork in artworks) {
if (length(pattern) == 1) {
index_file <- pattern
} else {
index_file <- pattern[i]
}
fnames <- dir(pattern = paste0(artwork, "_"),
path = paste(path, artwork, sep = "/"))
topic <- NULL
for (fname in fnames) {
suppressWarnings(
topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
grep("^<card type=",
trimws(readLines(paste(path, artwork, fname, sep = "/"))),
value = T)))
)
}
index <- paste(path, artwork, index_file, sep = "/")
suppressWarnings(
file_order <- c(file_order, gsub("^<card src=.*/(.*)./>$", "\\1",
grep("^<card src=", trimws(readLines(index)),
value = TRUE)))
)
in_index <- fnames %in% file_order
dat <- rbind(dat, data.frame(artwork, file_name = fnames, in_index, topic))
i <- i + 1
}
# take only the ones that are actually displayed and sort in the same order
# as indicated in index.html
out <- dat[dat$in_index, -3]
out <- out[order(file_order, out$file_name), ]
rownames(out) <- NULL
out$index <- unlist(sapply(table(out$artwork), seq_len))
out
}
###########################################################################
# Add topics: file names and topics
add_topic <- function(data, topics) {
tab_art <- lapply(artworks,
function(x) names(table(data$topicNumber[data$artwork == x])))
names(tab_art) <- artworks
tab_index <- lapply(tab_art, seq_along)
dat_split <- split(data, data$artwork)
set_label <- function(x) {
art <- unique(x$artwork)
x$topicIndex <- factor(x$topicNumber, labels = tab_index[[art]])
x
}
dat_label <- lapply(dat_split, set_label)
set_topic <- function(x) {
art <- unique(x$artwork)
labels_file <- topics[topics$artwork == art,
"file_name"][as.numeric(levels(x$topicIndex))]
x$topicFile <- as.character(factor(x$topicIndex, labels = labels_file))
labels_topic <- topics[topics$artwork == art,
"topic"][as.numeric(levels(x$topicIndex))]
x$topic <- as.character(factor(x$topicIndex, labels = labels_topic))
x
}
dat_topic <- lapply(dat_label, set_topic)
# table(dat_topic[["501"]]$topicNumber)
# table(dat_topic[["501"]]$topicIndex)
# table(dat_topic[["501"]]$topicFile)
# table(dat_topic[["501"]]$topic)
out <- do.call(rbind, dat_topic)
out <- out[order(out$date.start, out$fileId.start), ]
rownames(out) <- NULL
out
}