From 9f15ea1b6217cc261fb240a80a47b64266e1a738 Mon Sep 17 00:00:00 2001 From: nwickel Date: Tue, 12 Sep 2023 17:49:35 +0200 Subject: [PATCH] Added topics to data frame --- README.md | 33 ++++++++++++ code/02_preprocessing.R | 10 ++-- code/03_topic-cards.R | 42 +++------------- code/functions.R | 109 +++++++++++++++++++++++++++++++++++++--- 4 files changed, 150 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index 5708da5..2095388 100644 --- a/README.md +++ b/README.md @@ -246,6 +246,39 @@ functions. See `questions_number-of-cards.R` for details. +## Extracting topics + +When I extract the topics from `index.xml`, I get different topics than +when I get them from `<artwork>.xml`. At first glance, it looks like using +`index.xml` actually gives the wrong results. + +``` +topics <- extract_topics(artworks, "index.xml", path) +topics2 <- extract_topics(artworks, paste0(artworks, ".xml"), path) + +topics[!topics$file_name %in% topics2$file_name, ] +# artwork file_name topic index +# 072 072_artist.xml artist 1 +# 073 073_artist.xml artist 1 +# 110 110_technik.xml technik 2 +topics2[!topics2$file_name %in% topics$file_name, ] +# artwork file_name topic index +# 031 031_vergleich.xml extra info 6 +# 033 033_technik.xml technik 2 +# 055 055_vergleich4.xml extra info 5 +# 063 063_thema3.xml thema 3 +# 063 063_extrainfo1.xml thema 4 +# 072 072_artist2.xml artist 1 +# 073 073_artist2.xml artist 1 +# 099 099_technik.xml technik 2 +# 110 110_technikneu.xml technik 2 +``` + +For artwork 031, `index.xml` only defines 5 cards (the 6th is commented +out), but `topicNumber` for this artwork has 6 different entries. I will +therefore extract the topics from `<artwork>.xml`. (This also seems more +compatible with other data sets like 8o8m.)
+ # Reading list * @Arizmendi2022 [$-$] diff --git a/code/02_preprocessing.R b/code/02_preprocessing.R index d41f538..4bab6c1 100644 --- a/code/02_preprocessing.R +++ b/code/02_preprocessing.R @@ -49,12 +49,14 @@ dat3 <- dat3[, c("fileId.start", "fileId.stop", "eventId", "case", # Add trace for move events dat4 <- add_trace_moves(dat3) -# Fill in topics +# Add topics: file names and topics +artworks <- unique(dat4$artwork) +topics <- extract_topics(artworks, pattern = paste0(artworks, ".xml"), + path = "../data/ContentEyevisit/eyevisit_cards_light/") -# topics <- read.table("../data/topics.csv", sep = ";", header = TRUE) -# TODO: Add topics to data frame +dat5 <- add_topic(dat4, topics = topics) # Export data -write.table(dat4, "../data/event_logfiles.csv", sep = ";", +write.table(dat5, "../data/event_logfiles.csv", sep = ";", row.names = FALSE) diff --git a/code/03_topic-cards.R b/code/03_topic-cards.R index 3227967..921008e 100644 --- a/code/03_topic-cards.R +++ b/code/03_topic-cards.R @@ -1,42 +1,16 @@ -# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light") -rm(list=ls()) +path <- "C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light" -dat0 <- read.table("../../event_logfiles.csv", sep = ";", header = TRUE) -dat0$artwork <- sprintf("%03d", dat0$artwork) +setwd(path) # artwork names +dat0 <- read.table("../../event_logfiles.csv", sep = ";", header = TRUE) +dat0$artwork <- sprintf("%03d", dat0$artwork) artworks <- sort(unique(dat0$artwork)) -# create data frame with file names and topics for each artwork +# extract topics +topics <- extract_topics(artworks, paste0(artworks, ".xml"), path) -dat <- NULL -file_order <- NULL +write.table(topics, file = "../../topics.csv", sep = ";", row.names = FALSE) -for (artwork in artworks) { - fnames <- dir(pattern = paste0(artwork, "_"), path = artwork, full.names = TRUE) - topic <- NULL - for 
(fname in fnames) { - topic <- c(topic, gsub("^$", "\\1", - grep("^$", "\\1", - grep("^ 1) { @@ -269,7 +269,7 @@ add_trace_moves <- function(data) { } out <- out[order(out$date.start, out$fileId.start), ] rownames(out) <- NULL - + # Make trace a consecutive number out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace))) out @@ -277,4 +277,101 @@ add_trace_moves <- function(data) { # TODO: Get rid of the loops # --> This takes forever... +########################################################################### + +# Create data frame with file names and topics for each artwork + +extract_topics <- function(artworks, pattern, path) { + + dat <- NULL + file_order <- NULL + i <- 1 + + for (artwork in artworks) { + + if (length(pattern) == 1) { + index_file <- pattern + } else { + index_file <- pattern[i] + } + + fnames <- dir(pattern = paste0(artwork, "_"), + path = paste(path, artwork, sep = "/")) + topic <- NULL + for (fname in fnames) { + suppressWarnings( + topic <- c(topic, gsub("^$", "\\1", + grep("^$", "\\1", + grep("^