From 16fa9c4f2c2aa9fa74f61354f4e7587bafc0d451 Mon Sep 17 00:00:00 2001
From: nwickel
Date: Tue, 2 Jan 2024 15:37:37 +0100
Subject: [PATCH] Cleaned up preprocessing for HAUM

---
 code/01_preprocessing_haum.R | 60 ++++++++++++++++++++++++++----------
 1 file changed, 43 insertions(+), 17 deletions(-)

diff --git a/code/01_preprocessing_haum.R b/code/01_preprocessing_haum.R
index 8aabb9f..b8bf2cd 100644
--- a/code/01_preprocessing_haum.R
+++ b/code/01_preprocessing_haum.R
@@ -20,35 +20,54 @@ devtools::load_all("../../../../software/mtt")
 
 now <- format(Sys.time(), "%Y-%m-%d_%H-%M-%S")
 
-#now <- "2023-09-23_01-31-30"
 
 #--------------- (1) Parse raw log files ---------------
 
-#path <- "../data/haum/LogFiles/"
-#folders <- dir(path)
+path <- "../data/haum/LogFiles/"
+folders <- dir(path)
 #folders <- "2016"
 
-#datraw <- parse_logfiles(folders, path)
+datraw <- parse_logfiles(folders, path)
+# 91 corrupt lines have been found and removed from the data set
 
-datraw <- read.table("results/haum/raw_logfiles_2023-10-25_16-20-45.csv",
-                     sep = ";", header = TRUE)
+# datraw <- read.table("results/haum/raw_logfiles_2023-10-25_16-20-45.csv",
+#                      sep = ";", header = TRUE)
 
 ## Export data
 
-#write.table(datraw, paste0("results/haum/raw_logfiles_small_", now, ".csv"),
-#            sep = ";", row.names = FALSE)
+write.table(datraw, paste0("results/haum/raw_logfiles_", now, ".csv"),
+            sep = ";", row.names = FALSE)
 
 #--------------- (2) Create event logs ---------------
 
 datlogs <- create_eventlogs(datraw,
-                            xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/",
-                            glossar = TRUE, save = TRUE)
+                            #xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/",
+                            glossar = FALSE, save = FALSE)
 
-artworks <- unique(datlogs$artwork)
-topics <- extract_topics(artworks, xmlfiles = paste0(artworks, ".xml"),
+# 6,064 glossar entries that could not be matched have been removed
+
+# 2,136,715 no-change move events have been removed
+
+items <- unique(datlogs$item)
+topics <- extract_topics(items, xmlfiles = paste0(items, ".xml"),
                          xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/")
 
-datlogs_topics <- add_topic(datlogs, topics = topics)
+# Indices for topics:
+# 0 artist
+# 1 thema
+# 2 komposition
+# 3 leben des kunstwerks
+# 4 details
+# 5 licht und farbe
+# 6 extra info
+# 7 technik
+
+# ATTENTION: Need to know which topic maps onto which index!
+datlogs$topic <- factor(datlogs$topic, levels = 0:7,
+                        labels = c("artist", "thema", "komposition",
+                                   "leben des kunstwerks", "details",
+                                   "licht und farbe", "extra info",
+                                   "technik"))
 
 #--------------- (3) Add meta data ---------------
 
@@ -60,6 +79,7 @@ hd0$X.br. <- NULL
 hd <- hd0[hd0$Abkuerzung == "NI", ]
 names(hd) <- c("state", "stateCode", "date", "holiday")
 hd$date <- as.POSIXct(hd$date)
+hd$state <- NULL
 
 
 ## Read data for school vacations
 
@@ -92,7 +112,7 @@ sfdat <- NULL
 for (i in seq_len(nrow(sf))) {
   date <- seq(sf$start[i], sf$end[i], by = 1)
   sfdat <- rbind(sfdat, data.frame(date, vacations = sf$name[i],
-                                   stateCodeVacations = sf$stateCode[i]))
+                                   stateCode = sf$stateCode[i]))
 }
 
 # TODO: How to handle stateCode? There will be several for certain types of
@@ -100,14 +120,20 @@ for (i in seq_len(nrow(sf))) {
 
 ## Merge data
 
-datlogs_topics$date <- as.Date(datlogs_topics$date.start)
+datlogs$date <- as.Date(datlogs$date.start)
 
-dat1 <- merge(datlogs_topics, hd, by.x = "date", by.y = "date", all.x = TRUE)
+dat1 <- merge(datlogs, hd, by.x = "date", by.y = "date", all.x = TRUE)
 dat2 <- merge(dat1, sfdat, by.x = "date", by.y = "date", all.x = TRUE)
+dat2$stateCode <- dat2$stateCode.y
+dat2$stateCode <- ifelse(!is.na(dat2$stateCode.x),
+                         dat2$stateCode.x, dat2$stateCode.y)
+dat2$stateCode.x <- NULL
+dat2$stateCode.y <- NULL
+dat2$date <- NULL
 
 ## Export data
 
-write.table(dat2, paste0("results/haum/event_logfiles_glossar_", now, ".csv"),
+write.table(dat2, paste0("results/haum/event_logfiles_", now, ".csv"),
             sep = ";", row.names = FALSE)
 
 # TODO: Maybe add infos about artworks?
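
Note on the stateCode handling added in the last hunk: after the two left joins, R's merge() leaves the holiday and vacation state codes in the suffix columns stateCode.x and stateCode.y, and the patch coalesces them with ifelse(). Below is a minimal sketch of that step with made-up toy data; the column names follow the patch, but the values are illustrative only.

# Toy stand-in for the merged data frame; values are illustrative, not real HAUM data
dat2 <- data.frame(stateCode.x = c("NI", NA, NA),  # from the holiday table (hd)
                   stateCode.y = c(NA, "NI", NA))  # from the vacation table (sfdat)

# Prefer the holiday state code, fall back to the vacation state code
dat2$stateCode <- ifelse(!is.na(dat2$stateCode.x),
                         dat2$stateCode.x, dat2$stateCode.y)

# Drop the merge suffix columns, as the patch does
dat2$stateCode.x <- NULL
dat2$stateCode.y <- NULL

dat2
#   stateCode
# 1        NI
# 2        NI
# 3      <NA>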