Added topics to data frame
This commit is contained in:
parent
fa730081db
commit
9f15ea1b62
33
README.md
33
README.md
@ -246,6 +246,39 @@ functions.
|
||||
|
||||
See `questions_number-of-cards.R` for details.
|
||||
|
||||
## Extracting topics
|
||||
|
||||
When I extract the topics from `index.html` I get different topics, than
|
||||
when I get them from `<artwork>.html`. At first glance, it looks like using
|
||||
`index.html` actually gives the wrong results.
|
||||
|
||||
```
|
||||
topics <- extract_topics(artworks, "index.xml", path)
|
||||
topics2 <- extract_topics(artworks, paste0(artworks, ".xml"), path)
|
||||
|
||||
topics[!topics$file_name %in% topics2$file_name, ]
|
||||
# artwork file_name topic index
|
||||
# 072 072_artist.xml artist 1
|
||||
# 073 073_artist.xml artist 1
|
||||
# 110 110_technik.xml technik 2
|
||||
topics2[!topics2$file_name %in% topics$file_name, ]
|
||||
# artwork file_name topic index
|
||||
# 031 031_vergleich.xml extra info 6
|
||||
# 033 033_technik.xml technik 2
|
||||
# 055 055_vergleich4.xml extra info 5
|
||||
# 063 063_thema3.xml thema 3
|
||||
# 063 063_extrainfo1.xml thema 4
|
||||
# 072 072_artist2.xml artist 1
|
||||
# 073 073_artist2.xml artist 1
|
||||
# 099 099_technik.xml technik 2
|
||||
# 110 110_technikneu.xml technik 2
|
||||
```
|
||||
|
||||
For artwork 031, `index.html` only defines 5 cards (the 6th is commented
|
||||
out), but `topicNumber` for this artwork has 6 different entries. I will
|
||||
therefore extract the topics from `<artwork>.html`. (This seems also better
|
||||
compatible with other data sets like 8o8m.
|
||||
|
||||
# Reading list
|
||||
|
||||
* @Arizmendi2022 [$-$]
|
||||
|
@ -49,12 +49,14 @@ dat3 <- dat3[, c("fileId.start", "fileId.stop", "eventId", "case",
|
||||
# Add trace for move events
|
||||
dat4 <- add_trace_moves(dat3)
|
||||
|
||||
# Fill in topics
|
||||
# Add topics: file names and topics
|
||||
artworks <- unique(dat4$artwork)
|
||||
topics <- extract_topics(artworks, pattern = paste0(artworks, ".xml"),
|
||||
path = "../data/ContentEyevisit/eyevisit_cards_light/")
|
||||
|
||||
# topics <- read.table("../data/topics.csv", sep = ";", header = TRUE)
|
||||
# TODO: Add topics to data frame
|
||||
dat5 <- add_topic(dat4, topics = topics)
|
||||
|
||||
# Export data
|
||||
write.table(dat4, "../data/event_logfiles.csv", sep = ";",
|
||||
write.table(dat5, "../data/event_logfiles.csv", sep = ";",
|
||||
row.names = FALSE)
|
||||
|
||||
|
@ -1,42 +1,16 @@
|
||||
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light")
|
||||
rm(list=ls())
|
||||
path <- "C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light"
|
||||
|
||||
dat0 <- read.table("../../event_logfiles.csv", sep = ";", header = TRUE)
|
||||
dat0$artwork <- sprintf("%03d", dat0$artwork)
|
||||
setwd(path)
|
||||
|
||||
# artwork names
|
||||
dat0 <- read.table("../../event_logfiles.csv", sep = ";", header = TRUE)
|
||||
dat0$artwork <- sprintf("%03d", dat0$artwork)
|
||||
artworks <- sort(unique(dat0$artwork))
|
||||
|
||||
# create data frame with file names and topics for each artwork
|
||||
# extract topics
|
||||
topics <- extract_topics(artworks, paste0(artworks, ".xml"), path)
|
||||
|
||||
dat <- NULL
|
||||
file_order <- NULL
|
||||
write.table(topics, file = "../../topics.csv", sep = ";", row.names = FALSE)
|
||||
|
||||
for (artwork in artworks) {
|
||||
fnames <- dir(pattern = paste0(artwork, "_"), path = artwork, full.names = TRUE)
|
||||
topic <- NULL
|
||||
for (fname in fnames) {
|
||||
topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
|
||||
grep("^<card type=", trimws(readLines(fname)), value = T)))
|
||||
|
||||
}
|
||||
index <- paste(artwork, "index.xml", sep = "/")
|
||||
file_order <- c(file_order, gsub("^<card src=.(.*)./>$", "\\1",
|
||||
grep("^<card src=", trimws(readLines(index)), value = TRUE)))
|
||||
in_index <- fnames %in% file_order
|
||||
dat <- rbind(dat, data.frame(artwork, file_name = fnames, in_index, topic))
|
||||
}
|
||||
|
||||
table(dat$artwork)
|
||||
table(dat$topic)
|
||||
|
||||
# take only the ones that are actually displayed and sort in the same order
|
||||
# as indicated in index.html
|
||||
|
||||
dat2 <- dat[dat$in_index, -3]
|
||||
dat2 <- dat2[order(file_order, dat2$file_name), ]
|
||||
|
||||
dat2$index <- unlist(sapply(table(dat2$artwork), seq_len))
|
||||
|
||||
write.table(dat2, file = "../../topics.csv", sep = ";", row.names = FALSE)
|
||||
# TODO: Keep this file?
|
||||
|
||||
|
@ -277,4 +277,101 @@ add_trace_moves <- function(data) {
|
||||
# TODO: Get rid of the loops
|
||||
# --> This takes forever...
|
||||
|
||||
###########################################################################
|
||||
|
||||
# Create data frame with file names and topics for each artwork
|
||||
|
||||
extract_topics <- function(artworks, pattern, path) {
|
||||
|
||||
dat <- NULL
|
||||
file_order <- NULL
|
||||
i <- 1
|
||||
|
||||
for (artwork in artworks) {
|
||||
|
||||
if (length(pattern) == 1) {
|
||||
index_file <- pattern
|
||||
} else {
|
||||
index_file <- pattern[i]
|
||||
}
|
||||
|
||||
fnames <- dir(pattern = paste0(artwork, "_"),
|
||||
path = paste(path, artwork, sep = "/"))
|
||||
topic <- NULL
|
||||
for (fname in fnames) {
|
||||
suppressWarnings(
|
||||
topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
|
||||
grep("^<card type=",
|
||||
trimws(readLines(paste(path, artwork, fname, sep = "/"))),
|
||||
value = T)))
|
||||
)
|
||||
|
||||
}
|
||||
index <- paste(path, artwork, index_file, sep = "/")
|
||||
suppressWarnings(
|
||||
file_order <- c(file_order, gsub("^<card src=.*/(.*)./>$", "\\1",
|
||||
grep("^<card src=", trimws(readLines(index)),
|
||||
value = TRUE)))
|
||||
)
|
||||
in_index <- fnames %in% file_order
|
||||
dat <- rbind(dat, data.frame(artwork, file_name = fnames, in_index, topic))
|
||||
i <- i + 1
|
||||
}
|
||||
|
||||
# take only the ones that are actually displayed and sort in the same order
|
||||
# as indicated in index.html
|
||||
out <- dat[dat$in_index, -3]
|
||||
out <- out[order(file_order, out$file_name), ]
|
||||
rownames(out) <- NULL
|
||||
|
||||
out$index <- unlist(sapply(table(out$artwork), seq_len))
|
||||
out
|
||||
}
|
||||
|
||||
###########################################################################
|
||||
|
||||
# Add topics: file names and topics
|
||||
|
||||
add_topic <- function(data, topics) {
|
||||
|
||||
tab_art <- lapply(artworks,
|
||||
function(x) names(table(data$topicNumber[data$artwork == x])))
|
||||
names(tab_art) <- artworks
|
||||
|
||||
tab_index <- lapply(tab_art, seq_along)
|
||||
|
||||
dat_split <- split(data, data$artwork)
|
||||
|
||||
set_label <- function(x) {
|
||||
art <- unique(x$artwork)
|
||||
x$topicIndex <- factor(x$topicNumber, labels = tab_index[[art]])
|
||||
x
|
||||
}
|
||||
|
||||
dat_label <- lapply(dat_split, set_label)
|
||||
|
||||
set_topic <- function(x) {
|
||||
art <- unique(x$artwork)
|
||||
labels_file <- topics[topics$artwork == art,
|
||||
"file_name"][as.numeric(levels(x$topicIndex))]
|
||||
x$topicFile <- as.character(factor(x$topicIndex, labels = labels_file))
|
||||
labels_topic <- topics[topics$artwork == art,
|
||||
"topic"][as.numeric(levels(x$topicIndex))]
|
||||
x$topic <- as.character(factor(x$topicIndex, labels = labels_topic))
|
||||
x
|
||||
}
|
||||
|
||||
dat_topic <- lapply(dat_label, set_topic)
|
||||
|
||||
# table(dat_topic[["501"]]$topicNumber)
|
||||
# table(dat_topic[["501"]]$topicIndex)
|
||||
# table(dat_topic[["501"]]$topicFile)
|
||||
# table(dat_topic[["501"]]$topic)
|
||||
|
||||
out <- do.call(rbind, dat_topic)
|
||||
out <- out[order(out$date.start, out$fileId.start), ]
|
||||
rownames(out) <- NULL
|
||||
out
|
||||
}
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user