# 01_preprocessing.R
#
# content: (1) Parse raw log files
#          (2) Create event logs
#          (3) Add meta data
#
# input:  raw log files below ../data/haum/LogFiles/
#         ../data/metadata/feiertage.csv
#         ../data/metadata/schulferien_2016-2018_NI.csv
#         ../data/metadata/schulferien_2019-2025_NI.csv
# output: results/raw_logfiles_<timestamp>.csv
#         results/event_logfiles_<timestamp>.csv
#
# last mod: 2024-02-23, NW

# Working directory used during development (kept for reference only):
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")

# The mtt package is loaded from its local source tree rather than from an
# installed version.
#library(mtt)
devtools::load_all("../../../../../software/mtt")

# Time stamp that becomes part of the names of the exported csv files
now <- format(Sys.time(), format = "%Y-%m-%d_%H-%M-%S")

#--------------- (1) Parse raw log files ---------------

# Every entry found in `path` is handed to the parser; the commented line
# below restricts the run to a single entry.
path <- "../data/haum/LogFiles/"
folders <- list.files(path)
#folders <- "2016"

datraw <- parse_logfiles(folders, path)
# 91 corrupt lines have been found and removed from the data set

# A previously exported file can be read back in instead of parsing again:
# datraw <- read.table("results/raw_logfiles_2023-10-25_16-20-45.csv",
#                      sep = ";", header = TRUE)

## Export data

# Semicolon-separated file without row names; the name carries the time
# stamp `now`.
write.table(
  datraw,
  file = paste0("results/raw_logfiles_", now, ".csv"),
  sep = ";",
  row.names = FALSE
)

#--------------- (2) Create event logs ---------------

datlogs <- create_eventlogs(
  datraw,
  #xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/",
  glossar = FALSE,
  save = TRUE
)
# 2,136,694 no change moves removed

# OLD:
# 6,064 glossar entries, that could not be matched, have been removed
# 2,136,715 no change move events have been removed

# items <- unique(datlogs$item)
# topics <- extract_topics(items, xmlfiles = paste0(items, ".xml"),
#                          xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/")

# Indices for topics:
# 0 artist
# 1 thema
# 2 komposition
# 3 leben des kunstwerks
# 4 details
# 5 licht und farbe
# 6 extra info
# 7 technik

# ATTENTION: Need to know which topic maps onto which index!
# The labels are listed in index order: the first label belongs to index 0,
# the last one to index 7.
topic_labels <- c(
  "artist",
  "thema",
  "komposition",
  "leben des kunstwerks",
  "details",
  "licht und farbe",
  "extra info",
  "technik"
)
datlogs$topic <- factor(datlogs$topic, levels = 0:7, labels = topic_labels)

#--------------- (3) Add meta data ---------------

## Read data for holiday

hd0 <- read.table("../data/metadata/feiertage.csv", sep = ";", header = TRUE)
# Drop the `X.br.` column; it is not used in this script
hd0$X.br. <- NULL

# Keep only the rows with the state code "NI". `%in%` is used instead of
# `==` so that rows with a missing code are dropped instead of being turned
# into all-NA rows.
hd <- hd0[hd0$Abkuerzung %in% "NI", ]
names(hd) <- c("state", "stateCode", "date", "holiday")

# The event logs are merged with this table on a column of class `Date`
# (created with as.Date() in the merge step of this script). The key here
# must have the same class: a POSIXct value is stored as seconds, a Date as
# days, so a POSIXct key would never match and every holiday would be NA.
hd$date <- as.Date(hd$date)

# Only the date and the name of the holiday are needed for the merge
hd <- hd[, c("date", "holiday")]

## Read data for school vacations

# https://ferien-api.de/#holidaysPerStateAndYear
# Data extracted (on Linux) via:
# curl https://ferien-api.de/api/v1/holidays/NI > schulferien_NI.json

# library(jsonlite)
#
# dat <- read_json("data/metadata/schulferien_NI.json", simplify = TRUE)
# dat$slug <- NULL
#
# dat$name <- paste0(gsub("^(.*).niedersachsen.*", "\\1", dat$name),
#                    gsub("^.*niedersachsen [0-9]{4}(.*)", "\\1",
#                         dat$name))
#
# write.table(dat, "data/metadata/schulferien_2019-2025_NI.csv", sep = ";",
#             row.names = FALSE, quote = FALSE)

sf1 <- read.table("../data/metadata/schulferien_2016-2018_NI.csv", sep = ";",
                  header = TRUE)
sf2 <- read.table("../data/metadata/schulferien_2019-2025_NI.csv", sep = ";",
                  header = TRUE)
sf <- rbind(sf1, sf2)
sf$start <- as.Date(sf$start)
sf$end <- as.Date(sf$end)

# Expand every vacation period (start to end, both included) into one row
# per calendar day. The pieces are collected in a list and bound together
# once, instead of growing a data frame with rbind() inside a loop.
# NOTE(review): if two periods overlap, a date occurs more than once here and
# the merge on `date` duplicates the matching log rows -- confirm that the
# input files contain no overlapping periods.
vacation_days <- lapply(seq_len(nrow(sf)), function(i) {
  data.frame(date = seq(sf$start[i], sf$end[i], by = 1),
             vacation = sf$name[i])
})
sfdat <- do.call(rbind, vacation_days)

# Without any vacation period there is nothing to bind; fall back to an
# empty table with the expected columns so that the merge still works.
if (is.null(sfdat)) {
  sfdat <- data.frame(date = as.Date(character(0)),
                      vacation = character(0))
}

## Merge data

# Calendar day of every event; serves as key for both merges.
# NOTE(review): if `date.start` is a POSIXct, as.Date() converts in UTC by
# default -- confirm that this yields the intended local calendar day.
datlogs$date <- as.Date(datlogs$date.start)

# Left joins: every event is kept, `holiday` and `vacation` stay NA for days
# without a match.
dat1 <- merge(datlogs, hd, by = "date", all.x = TRUE)
dat2 <- merge(dat1, sfdat, by = "date", all.x = TRUE)

# The helper key is no longer needed. merge() returns the rows sorted by the
# key, so the order by file, date, and time is set explicitly.
dat2$date <- NULL
dat2 <- dat2[with(dat2, order(fileId.start, date.start, timeMs.start)), ]

## Export data

write.table(
  dat2,
  file = paste0("results/event_logfiles_", now, ".csv"),
  sep = ";",
  row.names = FALSE
)