Cleaned up preprocessing for HAUM

2024-01-02 15:37:37 +01:00
parent 4d17f76fd5
commit 16fa9c4f2c
1 changed files with 43 additions and 17 deletions
@@ -20,35 +20,54 @@
 devtools::load_all("../../../../software/mtt")

 now <- format(Sys.time(), "%Y-%m-%d_%H-%M-%S")
-#now <- "2023-09-23_01-31-30"

 #--------------- (1) Parse raw log files ---------------

-#path <- "../data/haum/LogFiles/"
-#folders <- dir(path)
+path <- "../data/haum/LogFiles/"
+folders <- dir(path)
 #folders <- "2016"

-#datraw <- parse_logfiles(folders, path)
+datraw <- parse_logfiles(folders, path)
+# 91 corrupt lines have been found and removed from the data set

-datraw <- read.table("results/haum/raw_logfiles_2023-10-25_16-20-45.csv",
-                     sep = ";", header = TRUE)
+# datraw <- read.table("results/haum/raw_logfiles_2023-10-25_16-20-45.csv",
+#                      sep = ";", header = TRUE)

 ## Export data

-#write.table(datraw, paste0("results/haum/raw_logfiles_small_", now, ".csv"),
-#            sep = ";", row.names = FALSE)
+write.table(datraw, paste0("results/haum/raw_logfiles_", now, ".csv"),
+            sep = ";", row.names = FALSE)

 #--------------- (2) Create event logs ---------------

 datlogs <- create_eventlogs(datraw,
-           xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/",
-           glossar = TRUE, save = TRUE)
+           #xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/",
+           glossar = FALSE, save = FALSE)

-artworks <- unique(datlogs$artwork)
-topics <- extract_topics(artworks, xmlfiles = paste0(artworks, ".xml"),
+# 6,064 glossar entries that could not be matched have been removed
+
+# 2,136,715 no-change move events have been removed
+
+items <- unique(datlogs$item)
+topics <- extract_topics(items, xmlfiles = paste0(items, ".xml"),
                         xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/")

-datlogs_topics <- add_topic(datlogs, topics = topics)
+# Indices for topics:
+# 0   artist
+# 1   thema
+# 2   komposition
+# 3   leben des kunstwerks
+# 4   details
+# 5   licht und farbe
+# 6   extra info
+# 7   technik
+
+# ATTENTION: Need to know which topic maps onto which index!
+datlogs$topic <- factor(datlogs$topic, levels = 0:7,
+                        labels = c("artist", "thema", "komposition",
+                                   "leben des kunstwerks", "details",
+                                   "licht und farbe", "extra info",
+                                   "technik"))

 #--------------- (3) Add meta data ---------------

@@ -60,6 +79,7 @@ hd0$X.br. <- NULL
 hd <- hd0[hd0$Abkuerzung == "NI", ]
 names(hd) <- c("state", "stateCode", "date", "holiday")
 hd$date <- as.POSIXct(hd$date)
+hd$state <- NULL

 ## Read data for school vacations

@@ -92,7 +112,7 @@ sfdat <- NULL
 for (i in seq_len(nrow(sf))) {
  date <- seq(sf$start[i], sf$end[i], by = 1)
  sfdat <- rbind(sfdat, data.frame(date, vacations = sf$name[i],
-                                   stateCodeVacations = sf$stateCode[i]))
+                                   stateCode = sf$stateCode[i]))
 }

 # TODO: How to handle stateCode? There will be several for certain types of
@@ -100,14 +120,20 @@ for (i in seq_len(nrow(sf))) {

 ## Merge data

-datlogs_topics$date <- as.Date(datlogs_topics$date.start)
+datlogs$date <- as.Date(datlogs$date.start)

-dat1 <- merge(datlogs_topics, hd, by.x = "date", by.y = "date", all.x = TRUE)
+dat1 <- merge(datlogs, hd, by.x = "date", by.y = "date", all.x = TRUE)
 dat2 <- merge(dat1, sfdat, by.x = "date", by.y = "date", all.x = TRUE)
+dat2$stateCode <- dat2$stateCode.y
+dat2$stateCode <- ifelse(!is.na(dat2$stateCode.x),
+                         dat2$stateCode.x, dat2$stateCode.y)
+dat2$stateCode.x <- NULL
+dat2$stateCode.y <- NULL
+dat2$date <- NULL

 ## Export data

-write.table(dat2, paste0("results/haum/event_logfiles_glossar_", now, ".csv"),
+write.table(dat2, paste0("results/haum/event_logfiles_", now, ".csv"),
            sep = ";", row.names = FALSE)

 # TODO: Maybe add info about artworks?