mtt_haum/code/03_topic-cards.R

43 lines
1.4 KiB
R
Raw Normal View History

2023-09-01 15:01:54 +02:00
# Read the event log and derive the list of artwork ids.
#
# NOTE(review): all relative paths in this script presumably assume that the
# working directory is the card folder from the former setwd() call below --
# confirm before running.
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light")
#
# The workspace is deliberately not cleared (no rm(list = ls())): sourcing
# this script must not delete objects of the calling session.

# event log file; the code in this script only uses its `artwork` column
dat0 <- read.table("../../event_logfiles.csv", sep = ";", header = TRUE)

# zero-pad the artwork number to three digits; the padded id is used further
# down as directory name and as file name prefix
dat0$artwork <- sprintf("%03d", dat0$artwork)

# artwork names
artworks <- sort(unique(dat0$artwork))
# create data frame with file names and topics for each artwork
#
# Result:
#   dat        - one row per card file with the columns artwork, file_name,
#                in_index (is the file listed in an index.xml read so far?)
#                and topic
#   file_order - card sources in the order in which they appear in the
#                index.xml files, artworks concatenated in sorted order

# Return the topic of one card file, i.e. the text captured from its
# `<card type=...>` line. Stops if the file does not contain exactly one such
# line, because topics and file names could then no longer be paired up
# reliably (data.frame() would fail or silently recycle values).
read_topic <- function(fname) {
  lines <- trimws(readLines(fname))
  topic <- gsub("^<card type=.(.*).>$", "\\1",
                grep("^<card type=", lines, value = TRUE))
  if (length(topic) != 1L) {
    stop("Expected exactly one '<card type=' line in ", fname,
         " but found ", length(topic), call. = FALSE)
  }
  topic
}

# Return the `src` values of all `<card src=... />` lines of an index file,
# in the order in which they appear in the file.
read_index <- function(index) {
  lines <- trimws(readLines(index))
  gsub("^<card src=.(.*)./>$", "\\1",
       grep("^<card src=", lines, value = TRUE))
}

# collect the per-artwork pieces in preallocated lists and combine them once
# at the end instead of growing `dat` and `file_order` inside the loop
rows <- vector("list", length(artworks))
orders <- vector("list", length(artworks))

for (i in seq_along(artworks)) {
  artwork <- artworks[i]

  # card files live in a directory named after the artwork and carry the
  # artwork id followed by "_" in their name
  fnames <- dir(pattern = paste0(artwork, "_"), path = artwork,
                full.names = TRUE)
  topic <- vapply(fnames, read_topic, character(1), USE.NAMES = FALSE)

  orders[[i]] <- read_index(file.path(artwork, "index.xml"))

  # a file counts as displayed if it is listed in the index of this or of a
  # previously processed artwork
  in_index <- fnames %in% unlist(orders[seq_len(i)])

  rows[[i]] <- data.frame(artwork, file_name = fnames, in_index, topic)
}

dat <- do.call(rbind, rows)
file_order <- unlist(orders)
# quick overview: number of card files per artwork and per topic
table(dat$artwork)
table(dat$topic)

# take only the ones that are actually displayed and sort in the same order
# as indicated in index.xml
dat2 <- dat[dat$in_index, names(dat) != "in_index"]

# match() looks up, for every entry of the index, the row that holds this
# file, so the rows come out in index order. Indexing the rows with
# order(file_order) instead would apply the inverse permutation and is only
# correct when the index happens to be sorted already.
# unique() and nomatch = 0L make this robust against cards that are listed
# twice and against index entries without a card file.
dat2 <- dat2[match(unique(file_order), dat2$file_name, nomatch = 0L), ]

# position of each card within its artwork (1, 2, ...), following row order
dat2$index <- ave(seq_len(nrow(dat2)), dat2$artwork, FUN = seq_along)

write.table(dat2, file = "../../topics.csv", sep = ";", row.names = FALSE)