mtt_haum/code/01_preprocessing_haum.R

# 01_preprocessing_haum.R
#
# content: (1) Parse raw log files
#          (2) Create event logs
#          (3) Add meta data
#
# input:  raw log files from ../data/haum/*.log
#         ../data/metadata/feiertage.csv
#         ../data/metadata/schulferien_2016-2018_NI.csv
#         ../data/metadata/schulferien_2019-2025_NI.csv
# output: raw_logfiles_<timestamp>.csv
#         event_logfiles_<timestamp>.csv
#
# last mod: 2024-01-02, NW

# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")

#library(mtt)
devtools::load_all("../../../../software/mtt")

now <- format(Sys.time(), "%Y-%m-%d_%H-%M-%S")

#--------------- (1) Parse raw log files ---------------

path <- "../data/haum/LogFiles/"
folders <- dir(path)
#folders <- "2016"

datraw <- parse_logfiles(folders, path)
# 91 corrupt lines have been found and removed from the data set

# datraw <- read.table("results/haum/raw_logfiles_2023-10-25_16-20-45.csv",
#                      sep = ";", header = TRUE)

## Export data

write.table(datraw, paste0("results/haum/raw_logfiles_", now, ".csv"),
            sep = ";", row.names = FALSE)

#--------------- (2) Create event logs ---------------

datlogs <- create_eventlogs(datraw,
           #xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/",
           glossar = FALSE, save = TRUE)
# 2,136,694 no change moves removed

# OLD:
# 6,064 glossar entries, that could not be matched, have been removed
# 2,136,715 no change move events have been removed

# items <- unique(datlogs$item)
# topics <- extract_topics(items, xmlfiles = paste0(items, ".xml"),
#                          xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/")

# Indices for topics:
# 0   artist
# 1   thema
# 2   komposition
# 3   leben des kunstwerks
# 4   details
# 5   licht und farbe
# 6   extra info
# 7   technik

# ATTENTION: Need to know which topic maps onto which index!
datlogs$topic <- factor(datlogs$topic, levels = 0:7,
                        labels = c("artist", "thema", "komposition",
                                   "leben des kunstwerks", "details",
                                   "licht und farbe", "extra info",
                                   "technik"))

#--------------- (3) Add meta data ---------------

## Read data for holiday

hd0 <- read.table("../data/metadata/feiertage.csv", sep = ";", header = TRUE)
hd0$X.br. <- NULL

hd <- hd0[hd0$Abkuerzung == "NI", ]
names(hd) <- c("state", "stateCode", "date", "holiday")
hd$date <- as.POSIXct(hd$date)
hd$state <- NULL
hd$stateCode <- NULL

## Read data for school vacations

# https://ferien-api.de/#holidaysPerStateAndYear
# Data extracted (on Linux) via:
# curl https://ferien-api.de/api/v1/holidays/NI > schulferien_NI.json

# library(jsonlite)
#
# dat <- read_json("data/metadata/schulferien_NI.json", simplify = TRUE)
# dat$slug <- NULL
#
# dat$name <- paste0(gsub("^(.*).niedersachsen.*", "\\1", dat$name),
#                    gsub("^.*niedersachsen [0-9]{4}(.*)", "\\1",
#                               dat$name))
#
# write.table(dat, "data/metadata/schulferien_2019-2025_NI.csv", sep = ";",
#             row.names = FALSE, quote = FALSE)

sf1 <- read.table("../data/metadata/schulferien_2016-2018_NI.csv", sep = ";",
                  header = TRUE)
sf2 <- read.table("../data/metadata/schulferien_2019-2025_NI.csv", sep = ";",
                  header = TRUE)
sf <- rbind(sf1, sf2)
sf$start <- as.Date(sf$start)
sf$end <- as.Date(sf$end)

sfdat <- NULL

for (i in seq_len(nrow(sf))) {
  date <- seq(sf$start[i], sf$end[i], by = 1)
  sfdat <- rbind(sfdat, data.frame(date, vacation = sf$name[i],
                                   stateCode = sf$stateCode[i]))
}

sfdat$stateCode <- NULL

## Merge data

datlogs$date <- as.Date(datlogs$date.start)

dat1 <- merge(datlogs, hd, by.x = "date", by.y = "date", all.x = TRUE)
dat2 <- merge(dat1, sfdat, by.x = "date", by.y = "date", all.x = TRUE)

dat2$date <- NULL

## Export data

write.table(dat2, paste0("results/haum/event_logfiles_", now, ".csv"),
            sep = ";", row.names = FALSE)
Cleaned up preprocessing file for HAUM 2023-10-25 17:12:22 +02:00			`# 01_preprocessing_haum.R`
			`#`
			`# content: (1) Parse raw log files`
			`# (2) Create event logs`
			`# (3) Add meta data`
			`#`
			`# input: raw log files from ../data/haum/*.log`
			`# ../data/metadata/feiertage.csv`
			`# ../data/metadata/schulferien_2016-2018_NI.csv`
			`# ../data/metadata/schulferien_2019-2025_NI.csv`
			`# output: raw_logfiles_<timestamp>.csv`
			`# event_logfiles_<timestamp>.csv`
			`#`
Updated preprocessing file and let it run after cleaning up mtt code 2024-01-03 09:01:55 +01:00			`# last mod: 2024-01-02, NW`
Cleaned up preprocessing file for HAUM 2023-10-25 17:12:22 +02:00
Updated analysis files for haum and 8o8m 2023-09-22 16:16:20 +02:00			`# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")`

			`#library(mtt)`
			`devtools::load_all("../../../../software/mtt")`

Worked on analysis files, cleaned out and restructured folders 2023-09-26 18:34:59 +02:00			`now <- format(Sys.time(), "%Y-%m-%d_%H-%M-%S")`
Updated analysis files for haum and 8o8m 2023-09-22 16:16:20 +02:00
Cleaned up preprocessing file for HAUM 2023-10-25 17:12:22 +02:00			`#--------------- (1) Parse raw log files ---------------`
Updated analysis files for haum and 8o8m 2023-09-22 16:16:20 +02:00
Cleaned up preprocessing for HAUM 2024-01-02 15:37:37 +01:00			`path <- "../data/haum/LogFiles/"`
			`folders <- dir(path)`
Tried out current state of mtt; parsing for 8o8m still not working correctly 2023-11-01 18:48:14 +01:00			`#folders <- "2016"`
Updated analysis files for haum and 8o8m 2023-09-22 16:16:20 +02:00
Cleaned up preprocessing for HAUM 2024-01-02 15:37:37 +01:00			`datraw <- parse_logfiles(folders, path)`
			`# 91 corrupt lines have been found and removed from the data set`
Tried out current state of mtt; parsing for 8o8m still not working correctly 2023-11-01 18:48:14 +01:00
Cleaned up preprocessing for HAUM 2024-01-02 15:37:37 +01:00			`# datraw <- read.table("results/haum/raw_logfiles_2023-10-25_16-20-45.csv",`
			`# sep = ";", header = TRUE)`
Cleaned up preprocessing file for HAUM 2023-10-25 17:12:22 +02:00
			`## Export data`
Worked on analysis files, cleaned out and restructured folders 2023-09-26 18:34:59 +02:00
Cleaned up preprocessing for HAUM 2024-01-02 15:37:37 +01:00			`write.table(datraw, paste0("results/haum/raw_logfiles_", now, ".csv"),`
			`sep = ";", row.names = FALSE)`
Updated analysis files for haum and 8o8m 2023-09-22 16:16:20 +02:00
Cleaned up preprocessing file for HAUM 2023-10-25 17:12:22 +02:00			`#--------------- (2) Create event logs ---------------`
Trying out new functionality; working on reparing traces 2023-10-23 15:11:08 +02:00
			`datlogs <- create_eventlogs(datraw,`
Cleaned up preprocessing for HAUM 2024-01-02 15:37:37 +01:00			`#xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/",`
Worked on analysis 2024-01-16 09:59:23 +01:00			`glossar = FALSE, save = TRUE)`
Updated preprocessing file and let it run after cleaning up mtt code 2024-01-03 09:01:55 +01:00			`# 2,136,694 no change moves removed`
Cleaned up preprocessing for HAUM 2024-01-02 15:37:37 +01:00
Updated preprocessing file and let it run after cleaning up mtt code 2024-01-03 09:01:55 +01:00			`# OLD:`
Cleaned up preprocessing for HAUM 2024-01-02 15:37:37 +01:00			`# 6,064 glossar entries, that could not be matched, have been removed`
			`# 2,136,715 no change move events have been removed`
Worked on closing events and checking traces; needs more love still 2023-10-18 12:57:15 +02:00
Removed variable stateCode from final data set 2024-01-02 15:42:48 +01:00			`# items <- unique(datlogs$item)`
			`# topics <- extract_topics(items, xmlfiles = paste0(items, ".xml"),`
			`# xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/")`
Worked on analysis files, cleaned out and restructured folders 2023-09-26 18:34:59 +02:00
Cleaned up preprocessing for HAUM 2024-01-02 15:37:37 +01:00			`# Indices for topics:`
			`# 0 artist`
			`# 1 thema`
			`# 2 komposition`
			`# 3 leben des kunstwerks`
			`# 4 details`
			`# 5 licht und farbe`
			`# 6 extra info`
			`# 7 technik`

			`# ATTENTION: Need to know which topic maps onto which index!`
			`datlogs$topic <- factor(datlogs$topic, levels = 0:7,`
			`labels = c("artist", "thema", "komposition",`
			`"leben des kunstwerks", "details",`
			`"licht und farbe", "extra info",`
			`"technik"))`
Worked on analysis files, cleaned out and restructured folders 2023-09-26 18:34:59 +02:00
Cleaned up preprocessing file for HAUM 2023-10-25 17:12:22 +02:00			`#--------------- (3) Add meta data ---------------`
Trying out new functionality; working on reparing traces 2023-10-23 15:11:08 +02:00
			`## Read data for holiday`

			`hd0 <- read.table("../data/metadata/feiertage.csv", sep = ";", header = TRUE)`
			`hd0$X.br. <- NULL`

			`hd <- hd0[hd0$Abkuerzung == "NI", ]`
			`names(hd) <- c("state", "stateCode", "date", "holiday")`
			`hd$date <- as.POSIXct(hd$date)`
Cleaned up preprocessing for HAUM 2024-01-02 15:37:37 +01:00			`hd$state <- NULL`
Removed variable stateCode from final data set 2024-01-02 15:42:48 +01:00			`hd$stateCode <- NULL`
Trying out new functionality; working on reparing traces 2023-10-23 15:11:08 +02:00
			`## Read data for school vacations`

			`# https://ferien-api.de/#holidaysPerStateAndYear`
			`# Data extracted (on Linux) via:`
			`# curl https://ferien-api.de/api/v1/holidays/NI > schulferien_NI.json`

			`# library(jsonlite)`
			`#`
			`# dat <- read_json("data/metadata/schulferien_NI.json", simplify = TRUE)`
			`# dat$slug <- NULL`
			`#`
			`# dat$name <- paste0(gsub("^(.).niedersachsen.", "\\1", dat$name),`
			`# gsub("^.niedersachsen [0-9]{4}(.)", "\\1",`
			`# dat$name))`
			`#`
			`# write.table(dat, "data/metadata/schulferien_2019-2025_NI.csv", sep = ";",`
			`# row.names = FALSE, quote = FALSE)`

			`sf1 <- read.table("../data/metadata/schulferien_2016-2018_NI.csv", sep = ";",`
			`header = TRUE)`
			`sf2 <- read.table("../data/metadata/schulferien_2019-2025_NI.csv", sep = ";",`
			`header = TRUE)`
			`sf <- rbind(sf1, sf2)`
			`sf$start <- as.Date(sf$start)`
			`sf$end <- as.Date(sf$end)`

			`sfdat <- NULL`

			`for (i in seq_len(nrow(sf))) {`
			`date <- seq(sf$start[i], sf$end[i], by = 1)`
Removed variable stateCode from final data set 2024-01-02 15:42:48 +01:00			`sfdat <- rbind(sfdat, data.frame(date, vacation = sf$name[i],`
Cleaned up preprocessing for HAUM 2024-01-02 15:37:37 +01:00			`stateCode = sf$stateCode[i]))`
Trying out new functionality; working on reparing traces 2023-10-23 15:11:08 +02:00			`}`

Removed variable stateCode from final data set 2024-01-02 15:42:48 +01:00			`sfdat$stateCode <- NULL`
Trying out new functionality; working on reparing traces 2023-10-23 15:11:08 +02:00
Cleaned up preprocessing file for HAUM 2023-10-25 17:12:22 +02:00			`## Merge data`
Trying out new functionality; working on reparing traces 2023-10-23 15:11:08 +02:00
Cleaned up preprocessing for HAUM 2024-01-02 15:37:37 +01:00			`datlogs$date <- as.Date(datlogs$date.start)`
Tried out current state of mtt; parsing for 8o8m still not working correctly 2023-11-01 18:48:14 +01:00
Cleaned up preprocessing for HAUM 2024-01-02 15:37:37 +01:00			`dat1 <- merge(datlogs, hd, by.x = "date", by.y = "date", all.x = TRUE)`
Tried out current state of mtt; parsing for 8o8m still not working correctly 2023-11-01 18:48:14 +01:00			`dat2 <- merge(dat1, sfdat, by.x = "date", by.y = "date", all.x = TRUE)`
Removed variable stateCode from final data set 2024-01-02 15:42:48 +01:00
Cleaned up preprocessing for HAUM 2024-01-02 15:37:37 +01:00			`dat2$date <- NULL`
Trying out new functionality; working on reparing traces 2023-10-23 15:11:08 +02:00
			`## Export data`

Cleaned up preprocessing for HAUM 2024-01-02 15:37:37 +01:00			`write.table(dat2, paste0("results/haum/event_logfiles_", now, ".csv"),`
Updated analysis files for haum and 8o8m 2023-09-22 16:16:20 +02:00			`sep = ";", row.names = FALSE)`