Added topics to data frame
This commit is contained in:
parent
fa730081db
commit
9f15ea1b62
33
README.md
33
README.md
@ -246,6 +246,39 @@ functions.
|
|||||||
|
|
||||||
See `questions_number-of-cards.R` for details.
|
See `questions_number-of-cards.R` for details.
|
||||||
|
|
||||||
|
## Extracting topics
|
||||||
|
|
||||||
|
When I extract the topics from `index.html` I get different topics, than
|
||||||
|
when I get them from `<artwork>.html`. At first glance, it looks like using
|
||||||
|
`index.html` actually gives the wrong results.
|
||||||
|
|
||||||
|
```
|
||||||
|
topics <- extract_topics(artworks, "index.xml", path)
|
||||||
|
topics2 <- extract_topics(artworks, paste0(artworks, ".xml"), path)
|
||||||
|
|
||||||
|
topics[!topics$file_name %in% topics2$file_name, ]
|
||||||
|
# artwork file_name topic index
|
||||||
|
# 072 072_artist.xml artist 1
|
||||||
|
# 073 073_artist.xml artist 1
|
||||||
|
# 110 110_technik.xml technik 2
|
||||||
|
topics2[!topics2$file_name %in% topics$file_name, ]
|
||||||
|
# artwork file_name topic index
|
||||||
|
# 031 031_vergleich.xml extra info 6
|
||||||
|
# 033 033_technik.xml technik 2
|
||||||
|
# 055 055_vergleich4.xml extra info 5
|
||||||
|
# 063 063_thema3.xml thema 3
|
||||||
|
# 063 063_extrainfo1.xml thema 4
|
||||||
|
# 072 072_artist2.xml artist 1
|
||||||
|
# 073 073_artist2.xml artist 1
|
||||||
|
# 099 099_technik.xml technik 2
|
||||||
|
# 110 110_technikneu.xml technik 2
|
||||||
|
```
|
||||||
|
|
||||||
|
For artwork 031, `index.html` only defines 5 cards (the 6th is commented
|
||||||
|
out), but `topicNumber` for this artwork has 6 different entries. I will
|
||||||
|
therefore extract the topics from `<artwork>.html`. (This seems also better
|
||||||
|
compatible with other data sets like 8o8m.
|
||||||
|
|
||||||
# Reading list
|
# Reading list
|
||||||
|
|
||||||
* @Arizmendi2022 [$-$]
|
* @Arizmendi2022 [$-$]
|
||||||
|
@ -49,12 +49,14 @@ dat3 <- dat3[, c("fileId.start", "fileId.stop", "eventId", "case",
|
|||||||
# Add trace for move events
|
# Add trace for move events
|
||||||
dat4 <- add_trace_moves(dat3)
|
dat4 <- add_trace_moves(dat3)
|
||||||
|
|
||||||
# Fill in topics
|
# Add topics: file names and topics
|
||||||
|
artworks <- unique(dat4$artwork)
|
||||||
|
topics <- extract_topics(artworks, pattern = paste0(artworks, ".xml"),
|
||||||
|
path = "../data/ContentEyevisit/eyevisit_cards_light/")
|
||||||
|
|
||||||
# topics <- read.table("../data/topics.csv", sep = ";", header = TRUE)
|
dat5 <- add_topic(dat4, topics = topics)
|
||||||
# TODO: Add topics to data frame
|
|
||||||
|
|
||||||
# Export data
|
# Export data
|
||||||
write.table(dat4, "../data/event_logfiles.csv", sep = ";",
|
write.table(dat5, "../data/event_logfiles.csv", sep = ";",
|
||||||
row.names = FALSE)
|
row.names = FALSE)
|
||||||
|
|
||||||
|
@ -1,42 +1,16 @@
|
|||||||
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light")
|
path <- "C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light"
|
||||||
rm(list=ls())
|
|
||||||
|
|
||||||
dat0 <- read.table("../../event_logfiles.csv", sep = ";", header = TRUE)
|
setwd(path)
|
||||||
dat0$artwork <- sprintf("%03d", dat0$artwork)
|
|
||||||
|
|
||||||
# artwork names
|
# artwork names
|
||||||
|
dat0 <- read.table("../../event_logfiles.csv", sep = ";", header = TRUE)
|
||||||
|
dat0$artwork <- sprintf("%03d", dat0$artwork)
|
||||||
artworks <- sort(unique(dat0$artwork))
|
artworks <- sort(unique(dat0$artwork))
|
||||||
|
|
||||||
# create data frame with file names and topics for each artwork
|
# extract topics
|
||||||
|
topics <- extract_topics(artworks, paste0(artworks, ".xml"), path)
|
||||||
|
|
||||||
dat <- NULL
|
write.table(topics, file = "../../topics.csv", sep = ";", row.names = FALSE)
|
||||||
file_order <- NULL
|
|
||||||
|
|
||||||
for (artwork in artworks) {
|
# TODO: Keep this file?
|
||||||
fnames <- dir(pattern = paste0(artwork, "_"), path = artwork, full.names = TRUE)
|
|
||||||
topic <- NULL
|
|
||||||
for (fname in fnames) {
|
|
||||||
topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
|
|
||||||
grep("^<card type=", trimws(readLines(fname)), value = T)))
|
|
||||||
|
|
||||||
}
|
|
||||||
index <- paste(artwork, "index.xml", sep = "/")
|
|
||||||
file_order <- c(file_order, gsub("^<card src=.(.*)./>$", "\\1",
|
|
||||||
grep("^<card src=", trimws(readLines(index)), value = TRUE)))
|
|
||||||
in_index <- fnames %in% file_order
|
|
||||||
dat <- rbind(dat, data.frame(artwork, file_name = fnames, in_index, topic))
|
|
||||||
}
|
|
||||||
|
|
||||||
table(dat$artwork)
|
|
||||||
table(dat$topic)
|
|
||||||
|
|
||||||
# take only the ones that are actually displayed and sort in the same order
|
|
||||||
# as indicated in index.html
|
|
||||||
|
|
||||||
dat2 <- dat[dat$in_index, -3]
|
|
||||||
dat2 <- dat2[order(file_order, dat2$file_name), ]
|
|
||||||
|
|
||||||
dat2$index <- unlist(sapply(table(dat2$artwork), seq_len))
|
|
||||||
|
|
||||||
write.table(dat2, file = "../../topics.csv", sep = ";", row.names = FALSE)
|
|
||||||
|
|
||||||
|
@ -277,4 +277,101 @@ add_trace_moves <- function(data) {
|
|||||||
# TODO: Get rid of the loops
|
# TODO: Get rid of the loops
|
||||||
# --> This takes forever...
|
# --> This takes forever...
|
||||||
|
|
||||||
|
###########################################################################
|
||||||
|
|
||||||
|
# Create data frame with file names and topics for each artwork
|
||||||
|
|
||||||
|
extract_topics <- function(artworks, pattern, path) {
|
||||||
|
|
||||||
|
dat <- NULL
|
||||||
|
file_order <- NULL
|
||||||
|
i <- 1
|
||||||
|
|
||||||
|
for (artwork in artworks) {
|
||||||
|
|
||||||
|
if (length(pattern) == 1) {
|
||||||
|
index_file <- pattern
|
||||||
|
} else {
|
||||||
|
index_file <- pattern[i]
|
||||||
|
}
|
||||||
|
|
||||||
|
fnames <- dir(pattern = paste0(artwork, "_"),
|
||||||
|
path = paste(path, artwork, sep = "/"))
|
||||||
|
topic <- NULL
|
||||||
|
for (fname in fnames) {
|
||||||
|
suppressWarnings(
|
||||||
|
topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
|
||||||
|
grep("^<card type=",
|
||||||
|
trimws(readLines(paste(path, artwork, fname, sep = "/"))),
|
||||||
|
value = T)))
|
||||||
|
)
|
||||||
|
|
||||||
|
}
|
||||||
|
index <- paste(path, artwork, index_file, sep = "/")
|
||||||
|
suppressWarnings(
|
||||||
|
file_order <- c(file_order, gsub("^<card src=.*/(.*)./>$", "\\1",
|
||||||
|
grep("^<card src=", trimws(readLines(index)),
|
||||||
|
value = TRUE)))
|
||||||
|
)
|
||||||
|
in_index <- fnames %in% file_order
|
||||||
|
dat <- rbind(dat, data.frame(artwork, file_name = fnames, in_index, topic))
|
||||||
|
i <- i + 1
|
||||||
|
}
|
||||||
|
|
||||||
|
# take only the ones that are actually displayed and sort in the same order
|
||||||
|
# as indicated in index.html
|
||||||
|
out <- dat[dat$in_index, -3]
|
||||||
|
out <- out[order(file_order, out$file_name), ]
|
||||||
|
rownames(out) <- NULL
|
||||||
|
|
||||||
|
out$index <- unlist(sapply(table(out$artwork), seq_len))
|
||||||
|
out
|
||||||
|
}
|
||||||
|
|
||||||
|
###########################################################################
|
||||||
|
|
||||||
|
# Add topics: file names and topics
|
||||||
|
|
||||||
|
add_topic <- function(data, topics) {
|
||||||
|
|
||||||
|
tab_art <- lapply(artworks,
|
||||||
|
function(x) names(table(data$topicNumber[data$artwork == x])))
|
||||||
|
names(tab_art) <- artworks
|
||||||
|
|
||||||
|
tab_index <- lapply(tab_art, seq_along)
|
||||||
|
|
||||||
|
dat_split <- split(data, data$artwork)
|
||||||
|
|
||||||
|
set_label <- function(x) {
|
||||||
|
art <- unique(x$artwork)
|
||||||
|
x$topicIndex <- factor(x$topicNumber, labels = tab_index[[art]])
|
||||||
|
x
|
||||||
|
}
|
||||||
|
|
||||||
|
dat_label <- lapply(dat_split, set_label)
|
||||||
|
|
||||||
|
set_topic <- function(x) {
|
||||||
|
art <- unique(x$artwork)
|
||||||
|
labels_file <- topics[topics$artwork == art,
|
||||||
|
"file_name"][as.numeric(levels(x$topicIndex))]
|
||||||
|
x$topicFile <- as.character(factor(x$topicIndex, labels = labels_file))
|
||||||
|
labels_topic <- topics[topics$artwork == art,
|
||||||
|
"topic"][as.numeric(levels(x$topicIndex))]
|
||||||
|
x$topic <- as.character(factor(x$topicIndex, labels = labels_topic))
|
||||||
|
x
|
||||||
|
}
|
||||||
|
|
||||||
|
dat_topic <- lapply(dat_label, set_topic)
|
||||||
|
|
||||||
|
# table(dat_topic[["501"]]$topicNumber)
|
||||||
|
# table(dat_topic[["501"]]$topicIndex)
|
||||||
|
# table(dat_topic[["501"]]$topicFile)
|
||||||
|
# table(dat_topic[["501"]]$topic)
|
||||||
|
|
||||||
|
out <- do.call(rbind, dat_topic)
|
||||||
|
out <- out[order(out$date.start, out$fileId.start), ]
|
||||||
|
rownames(out) <- NULL
|
||||||
|
out
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user