diff --git a/code/01_preprocessing_8o8m.R b/code/01_preprocessing_8o8m.R index 6a2eecb..5b2509c 100644 --- a/code/01_preprocessing_8o8m.R +++ b/code/01_preprocessing_8o8m.R @@ -20,8 +20,7 @@ datraw2 <- datraw[!is.na(datraw$artwork), ] # TODO: Why is this happening? # convert to log events -datlogs <- create_eventlogs(datraw2, xmlfiles = paste0(artworks, "_de.xml"), - xmlpath = "../data/8o8m/Content8o8m/") +datlogs <- create_eventlogs(datraw2, xmlpath = "../data/8o8m/Content8o8m/") # export data write.table(datlogs, paste0("../data/8o8m/event_logfiles_", now, ".csv"), diff --git a/code/01_preprocessing_haum.R b/code/01_preprocessing_haum.R index cf3e3a6..a840b44 100644 --- a/code/01_preprocessing_haum.R +++ b/code/01_preprocessing_haum.R @@ -6,25 +6,25 @@ devtools::load_all("../../../../software/mtt") now <- format(Sys.time(), "%Y-%m-%d_%H-%M-%S") #now <- "2023-09-23_01-31-30" -path <- "../data/haum/LogFiles/" +#--------------- Parse raw log files --------------- +path <- "../data/haum/LogFiles/" folders <- dir(path) -# parse raw log files #datraw <- parse_logfiles(folders, path) datraw <- read.table("../data/haum/raw_logfiles_small_2023-09-26_13-50-20.csv", sep = ";", header = TRUE) # export data -write.table(datraw, paste0("../data/haum/raw_logfiles_", now, ".csv"), - sep = ";", row.names = FALSE) +# write.table(datraw, paste0("../data/haum/raw_logfiles_", now, ".csv"), +# sep = ";", row.names = FALSE) -# convert to log events -datlogs <- create_eventlogs(datraw, +#--------------- Create event logs --------------- + +datlogs <- create_eventlogs(datraw, xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/") artworks <- unique(na.omit(datlogs$artwork)) -artworks <- artworks[!artworks %in% c("504", "505")] topics <- extract_topics(artworks, xmlfiles = paste0(artworks, ".xml"), xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/") @@ -32,6 +32,72 @@ datlogs_topics <- add_topic(datlogs, topics = topics) # export data -write.table(datlogs_topics, paste0("../data/haum/event_logfiles_", now, ".csv"), +# write.table(datlogs_topics, paste0("../data/haum/event_logfiles_", now, ".csv"), +# sep = ";", row.names = FALSE) + +#--------------- Add meta data --------------- + +## Read data for holiday + +hd0 <- read.table("../data/metadata/feiertage.csv", sep = ";", header = TRUE) +hd0$X.br. <- NULL + +hd <- hd0[hd0$Abkuerzung == "NI", ] +names(hd) <- c("state", "stateCode", "date", "holiday") +hd$date <- as.POSIXct(hd$date) + +## Read data for school vacations + +# https://ferien-api.de/#holidaysPerStateAndYear +# Data extracted (on Linux) via: +# curl https://ferien-api.de/api/v1/holidays/NI > schulferien_NI.json + +# library(jsonlite) +# +# dat <- read_json("data/metadata/schulferien_NI.json", simplify = TRUE) +# dat$slug <- NULL +# +# dat$name <- paste0(gsub("^(.*).niedersachsen.*", "\\1", dat$name), +# gsub("^.*niedersachsen [0-9]{4}(.*)", "\\1", +# dat$name)) +# +# write.table(dat, "data/metadata/schulferien_2019-2025_NI.csv", sep = ";", +# row.names = FALSE, quote = FALSE) + +sf1 <- read.table("../data/metadata/schulferien_2016-2018_NI.csv", sep = ";", + header = TRUE) +sf2 <- read.table("../data/metadata/schulferien_2019-2025_NI.csv", sep = ";", + header = TRUE) +sf <- rbind(sf1, sf2) +sf$start <- as.Date(sf$start) +sf$end <- as.Date(sf$end) + +sfdat <- NULL + +for (i in seq_len(nrow(sf))) { + date <- seq(sf$start[i], sf$end[i], by = 1) + sfdat <- rbind(sfdat, data.frame(date, vacations = sf$name[i], + stateCodeVacations = sf$stateCode[i])) +} + +# TODO: How to handle stateCode? There will be several for certain types of +# data sets... Not important here, since I only do NI. + +## Add metadata + +# holidays +dat1 <- merge(datlogs_topics, hd, by.x = "date.start", by.y = "date", all.x = TRUE) +# school vacations +dat2 <- merge(dat1, sfdat, by.x = "date.start", by.y = "date", all.x = TRUE) + +## Export data + +write.table(dat2, + file = paste0("../data/haum/event_logfiles_small_metadata_", now, ".csv"), sep = ";", row.names = FALSE) +# TODO: Maybe add infos about artworks? + + + + diff --git a/code/02_metadata_haum.R b/code/02_metadata_haum.R index 9112fad..5e1f730 100644 --- a/code/02_metadata_haum.R +++ b/code/02_metadata_haum.R @@ -48,7 +48,7 @@ for (i in seq_len(nrow(sf))) { # load (small) event log data set #dat <- read.table("data/haum/event_logfiles_2023-09-23_01-31-30.csv", -dat <- read.table("data/haum/event_logfiles_small_2023-10-15_10-08-43.csv", +dat <- read.table("data/haum/event_logfiles_small_2023-10-19_18-25-26.csv", sep = ";", header = TRUE) dat$date.start <- as.POSIXct(dat$date.start) dat$date.stop <- as.POSIXct(dat$date.stop) @@ -65,7 +65,7 @@ dat2 <- merge(dat1, sfdat, by = "date", all.x = TRUE) ## Export data write.table(dat2, - file = "data/haum/event_logfiles_small_metadata_2023-10-15_10-08-43.csv", + file = "data/haum/event_logfiles_small_metadata_2023-10-19_18-25-26.csv", sep = ";", row.names = FALSE) # TODO: Maybe add infos about artworks? diff --git a/code/04_modeling_haum.R b/code/04_modeling_haum.R index 8c40744..d38b4a9 100644 --- a/code/04_modeling_haum.R +++ b/code/04_modeling_haum.R @@ -4,7 +4,7 @@ # dat0 <- read.table("../data/haum/event_logfiles_metadata_2023-09-23_01-31-30.csv", # sep = ";", header = TRUE) -dat0 <- read.table("../data/haum/event_logfiles_small_metadata_2023-10-15_10-08-43.csv", +dat0 <- read.table("../data/haum/event_logfiles_small_metadata_2023-10-19_18-25-26.csv", sep = ";", header = TRUE) dat0$date <- as.Date(dat0$date) dat0$date.start <- as.POSIXct(dat0$date.start) @@ -81,7 +81,7 @@ datrm[datrm$trace %in% ct, 1:10] ### WHY????? -alog <- activitylog(datrm, +alog <- activitylog(dat, case_id = "trace", activity_id = "event", #resource_id = "case",