Added topics to data frame

2023-09-12 17:49:35 +02:00 · 2023-09-12 17:49:35 +02:00 · 9f15ea1b62
commit 9f15ea1b62
parent fa730081db
4 changed files with 150 additions and 44 deletions
--- a/README.md
+++ b/README.md
@ -246,6 +246,39 @@ functions.

 See `questions_number-of-cards.R` for details.

+## Extracting topics
+
+When I extract the topics from `index.html` I get different topics, than
+when I get them from `<artwork>.html`. At first glance, it looks like using
+`index.html` actually gives the wrong results.
+
+```
+topics <- extract_topics(artworks, "index.xml", path)
+topics2 <- extract_topics(artworks, paste0(artworks, ".xml"), path)
+
+topics[!topics$file_name %in% topics2$file_name, ]
+# artwork       file_name   topic index
+#     072  072_artist.xml  artist     1
+#     073  073_artist.xml  artist     1
+#     110 110_technik.xml technik     2
+topics2[!topics2$file_name %in% topics$file_name, ]
+# artwork          file_name      topic index
+#     031  031_vergleich.xml extra info     6
+#     033    033_technik.xml    technik     2
+#     055 055_vergleich4.xml extra info     5
+#     063     063_thema3.xml      thema     3
+#     063 063_extrainfo1.xml      thema     4
+#     072    072_artist2.xml     artist     1
+#     073    073_artist2.xml     artist     1
+#     099    099_technik.xml    technik     2
+#     110 110_technikneu.xml    technik     2
+```
+
+For artwork 031, `index.html` only defines 5 cards (the 6th is commented
+out), but `topicNumber` for this artwork has 6 different entries. I will
+therefore extract the topics from `<artwork>.html`. (This seems also better
+compatible with other data sets like 8o8m.
+
 # Reading list

 * @Arizmendi2022 [$-$]
--- a/code/02_preprocessing.R
+++ b/code/02_preprocessing.R
@ -49,12 +49,14 @@ dat3 <- dat3[, c("fileId.start", "fileId.stop", "eventId", "case",
 # Add trace for move events
 dat4 <- add_trace_moves(dat3)

-# Fill in topics
+# Add topics: file names and topics
+artworks <- unique(dat4$artwork)
+topics <- extract_topics(artworks, pattern = paste0(artworks, ".xml"),
+                         path = "../data/ContentEyevisit/eyevisit_cards_light/")

-# topics <- read.table("../data/topics.csv", sep = ";", header = TRUE)
-# TODO: Add topics to data frame
+dat5 <- add_topic(dat4, topics = topics)

 # Export data
-write.table(dat4, "../data/event_logfiles.csv", sep = ";",
+write.table(dat5, "../data/event_logfiles.csv", sep = ";",
            row.names = FALSE)

--- a/code/03_topic-cards.R
+++ b/code/03_topic-cards.R
@ -1,42 +1,16 @@
-# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light")
-rm(list=ls())
+path <- "C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light"

-dat0 <- read.table("../../event_logfiles.csv", sep = ";", header = TRUE)
-dat0$artwork <- sprintf("%03d", dat0$artwork)
+setwd(path)

 # artwork names
+dat0 <- read.table("../../event_logfiles.csv", sep = ";", header = TRUE)
+dat0$artwork <- sprintf("%03d", dat0$artwork)
 artworks <- sort(unique(dat0$artwork))

-# create data frame with file names and topics for each artwork
+# extract topics
+topics <- extract_topics(artworks, paste0(artworks, ".xml"), path)

-dat <- NULL
-file_order <- NULL
+write.table(topics, file = "../../topics.csv", sep = ";", row.names = FALSE)

-for (artwork in artworks) {
-  fnames <- dir(pattern = paste0(artwork, "_"), path = artwork, full.names = TRUE)
-  topic <- NULL
-  for (fname in fnames) {
-    topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
-                           grep("^<card type=", trimws(readLines(fname)), value = T)))
-    
-  }
-  index <- paste(artwork, "index.xml", sep = "/")
-  file_order <- c(file_order, gsub("^<card src=.(.*)./>$", "\\1",
-                       grep("^<card src=", trimws(readLines(index)), value = TRUE)))
-  in_index <- fnames %in% file_order
-  dat <- rbind(dat, data.frame(artwork, file_name = fnames, in_index, topic))
-}
-
-table(dat$artwork)
-table(dat$topic)
-
-# take only the ones that are actually displayed and sort in the same order
-# as indicated in index.html
-
-dat2 <- dat[dat$in_index, -3]
-dat2 <- dat2[order(file_order, dat2$file_name), ]
-
-dat2$index <- unlist(sapply(table(dat2$artwork), seq_len))
-
-write.table(dat2, file = "../../topics.csv", sep = ";", row.names = FALSE)
+# TODO: Keep this file?

--- a/code/functions.R
+++ b/code/functions.R
@ -184,8 +184,8 @@ close_events <- function(data, event = c("move", "flipCard", "openTopic", "openP
    move_wide <- data_wide[data_wide$distance != 0 &
                            data_wide$rotationDegree != 0 &
                            data_wide$scaleSize != 1, ]
-    cat(paste("INFORMATION:", nrow(data_wide) - nrow(move_wide), 
-    "lines containing move events were removed since they did", 
+    cat(paste("INFORMATION:", nrow(data_wide) - nrow(move_wide),
+    "lines containing move events were removed since they did",
    "\nnot contain any change"), fill = TRUE)
    data_wide <- move_wide
  }
@ -212,7 +212,7 @@ add_case <- function(data, cutoff = 20) {

  data$case <- NA
  j <- 1
-  
+
  for (i in seq_len(nrow(data))) {
    if (data$timediff[i] <= cutoff) {
      data$case[i] <- j
@ -235,12 +235,12 @@ add_trace_moves <- function(data) {
  aws <- unique(data$artwork)[unique(data$artwork) != "glossar"]
  max_trace <- max(data$trace, na.rm = TRUE) + 1
  out <- NULL
-  
+
  for (case in cases) {
    for (art in aws) {
      tmp <- data[data$case == case & data$artwork == art, ]
      if (nrow(tmp) != 0) {
-  
+
        if (length(na.omit(unique(tmp$trace))) == 1) {
          tmp[tmp$event == "move", "trace"] <- na.omit(unique(tmp$trace))
        } else if (length(na.omit(unique(tmp$trace))) > 1) {
@ -269,7 +269,7 @@ add_trace_moves <- function(data) {
  }
  out <- out[order(out$date.start, out$fileId.start), ]
  rownames(out) <- NULL
-  
+
  # Make trace a consecutive number
  out$trace <- as.numeric(factor(out$trace, levels = unique(out$trace)))
  out
@ -277,4 +277,101 @@ add_trace_moves <- function(data) {
 # TODO: Get rid of the loops
 # --> This takes forever...

+###########################################################################
+
+# Create data frame with file names and topics for each artwork
+
+extract_topics <- function(artworks, pattern, path) {
+
+  dat <- NULL
+  file_order <- NULL
+  i <- 1
+
+  for (artwork in artworks) {
+
+    if (length(pattern) == 1) {
+      index_file <- pattern
+    } else {
+      index_file <- pattern[i]
+    }
+
+    fnames <- dir(pattern = paste0(artwork, "_"),
+                  path = paste(path, artwork, sep = "/"))
+    topic <- NULL
+    for (fname in fnames) {
+      suppressWarnings(
+      topic <- c(topic, gsub("^<card type=.(.*).>$", "\\1",
+        grep("^<card type=",
+          trimws(readLines(paste(path, artwork, fname, sep = "/"))),
+          value = T)))
+      )
+
+    }
+    index <- paste(path, artwork, index_file, sep = "/")
+    suppressWarnings(
+    file_order <- c(file_order, gsub("^<card src=.*/(.*)./>$", "\\1",
+                         grep("^<card src=", trimws(readLines(index)),
+                              value = TRUE)))
+    )
+    in_index <- fnames %in% file_order
+    dat <- rbind(dat, data.frame(artwork, file_name = fnames, in_index, topic))
+    i <- i + 1
+  }
+
+  # take only the ones that are actually displayed and sort in the same order
+  # as indicated in index.html
+  out <- dat[dat$in_index, -3]
+  out <- out[order(file_order, out$file_name), ]
+  rownames(out) <- NULL
+
+  out$index <- unlist(sapply(table(out$artwork), seq_len))
+  out
+}
+
+###########################################################################
+
+# Add topics: file names and topics
+
+add_topic <- function(data, topics) {
+
+  tab_art <- lapply(artworks,
+               function(x) names(table(data$topicNumber[data$artwork == x])))
+  names(tab_art) <- artworks
+
+  tab_index <- lapply(tab_art, seq_along)
+
+  dat_split <- split(data, data$artwork)
+
+  set_label <- function(x) {
+    art <- unique(x$artwork)
+    x$topicIndex <- factor(x$topicNumber, labels = tab_index[[art]])
+    x
+  }
+
+  dat_label <- lapply(dat_split, set_label)
+
+  set_topic <- function(x) {
+    art <- unique(x$artwork)
+    labels_file <- topics[topics$artwork == art,
+                          "file_name"][as.numeric(levels(x$topicIndex))]
+    x$topicFile <- as.character(factor(x$topicIndex, labels = labels_file))
+    labels_topic <- topics[topics$artwork == art,
+                          "topic"][as.numeric(levels(x$topicIndex))]
+    x$topic <- as.character(factor(x$topicIndex, labels = labels_topic))
+    x
+  }
+
+  dat_topic <- lapply(dat_label, set_topic)
+
+  # table(dat_topic[["501"]]$topicNumber)
+  # table(dat_topic[["501"]]$topicIndex)
+  # table(dat_topic[["501"]]$topicFile)
+  # table(dat_topic[["501"]]$topic)
+
+  out <- do.call(rbind, dat_topic)
+  out <- out[order(out$date.start, out$fileId.start), ]
+  rownames(out) <- NULL
+  out
+}
+