# 01_preprocessing_haum.R
#
# content: (1) Parse raw log files
#          (2) Create event logs
#          (3) Add meta data
#
# input:  raw log files from ../data/haum/LogFiles/
#         XML files from
#           ../data/haum/ContentEyevisit/eyevisit_cards_light/
#         ../data/metadata/feiertage.csv
#         ../data/metadata/schulferien_2016-2018_NI.csv
#         ../data/metadata/schulferien_2019-2025_NI.csv
# output: ../data/haum/raw_logfiles_<timestamp>.csv
#         ../data/haum/event_logfiles_<timestamp>.csv
#
# last mod: 2023-10-23, NW

# NOTE(review): this absolute path ties the script to one machine; every
# relative path below is resolved against this working directory.
setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")

#library(mtt)
# Load the development version of the mtt package from its source directory;
# it provides parse_logfiles(), create_eventlogs(), extract_topics() and
# add_topic() used below.
devtools::load_all("../../../../software/mtt")

# Timestamp used as suffix for both exported files of this run
now <- format(Sys.time(), "%Y-%m-%d_%H-%M-%S")
#now <- "2023-09-23_01-31-30"

#--------------- (1) Parse raw log files ---------------

path <- "../data/haum/LogFiles/"
folders <- dir(path)

datraw <- parse_logfiles(folders, path)

## Export data

write.table(datraw, paste0("../data/haum/raw_logfiles_", now, ".csv"),
            sep = ";", row.names = FALSE)

#--------------- (2) Create event logs ---------------

# Directory with the XML files; used for creating the event logs and for
# extracting the topics
xmlpath <- "../data/haum/ContentEyevisit/eyevisit_cards_light/"

datlogs <- create_eventlogs(datraw, xmlpath = xmlpath)

# One XML file per distinct, non-missing artwork identifier
artworks <- unique(na.omit(datlogs$artwork))
topics <- extract_topics(artworks, xmlfiles = paste0(artworks, ".xml"),
                         xmlpath = xmlpath)

datlogs_topics <- add_topic(datlogs, topics = topics)

#--------------- (3) Add meta data ---------------

## Read data for holiday

hd0 <- read.table("../data/metadata/feiertage.csv", sep = ";", header = TRUE)
# Drop the column X.br. (presumably an artifact of a "<br>" entry in the
# header line of the CSV file -- TODO confirm)
hd0$X.br. <- NULL

# Keep only the rows with state code "NI"
hd <- hd0[hd0$Abkuerzung == "NI", ]
# Assumes exactly four columns remain, in this order -- verify against file
names(hd) <- c("state", "stateCode", "date", "holiday")
hd$date <- as.POSIXct(hd$date)

## Read data for school vacations

# https://ferien-api.de/#holidaysPerStateAndYear
# Data extracted (on Linux) via:
# curl https://ferien-api.de/api/v1/holidays/NI > schulferien_NI.json

# library(jsonlite)
#
# dat <- read_json("data/metadata/schulferien_NI.json", simplify = TRUE)
# dat$slug <- NULL
#
# dat$name <- paste0(gsub("^(.*).niedersachsen.*", "\\1", dat$name),
#                    gsub("^.*niedersachsen [0-9]{4}(.*)", "\\1",
#                         dat$name))
#
# write.table(dat, "data/metadata/schulferien_2019-2025_NI.csv", sep = ";",
#             row.names = FALSE, quote = FALSE)

sf1 <- read.table("../data/metadata/schulferien_2016-2018_NI.csv", sep = ";",
                  header = TRUE)
sf2 <- read.table("../data/metadata/schulferien_2019-2025_NI.csv", sep = ";",
                  header = TRUE)
sf <- rbind(sf1, sf2)
sf$start <- as.Date(sf$start)
sf$end <- as.Date(sf$end)

# Expand every vacation period (start to end, inclusive) into one row per
# day. The pieces are collected in a list and bound once, instead of growing
# a data frame with rbind() inside a loop.
vacation_days <- lapply(seq_len(nrow(sf)), function(i) {
  data.frame(date = seq(sf$start[i], sf$end[i], by = 1),
             vacations = sf$name[i],
             stateCodeVacations = sf$stateCode[i])
})
sfdat <- do.call(rbind, vacation_days)

# TODO: How to handle stateCode? There will be several for certain types of
# data sets... Not important here, since I only do NI.

## Merge data

# Left joins (all.x = TRUE): every event-log row is kept; rows without a
# matching holiday / vacation day get NA in the added columns.
# NOTE(review): hd$date is POSIXct whereas sfdat$date is Date, yet both are
# matched against date.start -- confirm the class of date.start so that both
# merges actually find their matches.
dat1 <- merge(datlogs_topics, hd, by.x = "date.start", by.y = "date",
              all.x = TRUE)
dat2 <- merge(dat1, sfdat, by.x = "date.start", by.y = "date", all.x = TRUE)

## Export data

write.table(dat2, paste0("../data/haum/event_logfiles_", now, ".csv"),
            sep = ";", row.names = FALSE)

# TODO: Maybe add infos about artworks?