Cleaned up preprocessing for HAUM
This commit is contained in:
parent
4d17f76fd5
commit
16fa9c4f2c
@ -20,35 +20,54 @@
|
|||||||
devtools::load_all("../../../../software/mtt")
|
devtools::load_all("../../../../software/mtt")
|
||||||
|
|
||||||
now <- format(Sys.time(), "%Y-%m-%d_%H-%M-%S")
|
now <- format(Sys.time(), "%Y-%m-%d_%H-%M-%S")
|
||||||
#now <- "2023-09-23_01-31-30"
|
|
||||||
|
|
||||||
#--------------- (1) Parse raw log files ---------------
|
#--------------- (1) Parse raw log files ---------------
|
||||||
|
|
||||||
#path <- "../data/haum/LogFiles/"
|
path <- "../data/haum/LogFiles/"
|
||||||
#folders <- dir(path)
|
folders <- dir(path)
|
||||||
#folders <- "2016"
|
#folders <- "2016"
|
||||||
|
|
||||||
#datraw <- parse_logfiles(folders, path)
|
datraw <- parse_logfiles(folders, path)
|
||||||
|
# 91 corrupt lines have been found and removed from the data set
|
||||||
|
|
||||||
datraw <- read.table("results/haum/raw_logfiles_2023-10-25_16-20-45.csv",
|
# datraw <- read.table("results/haum/raw_logfiles_2023-10-25_16-20-45.csv",
|
||||||
sep = ";", header = TRUE)
|
# sep = ";", header = TRUE)
|
||||||
|
|
||||||
## Export data
|
## Export data
|
||||||
|
|
||||||
#write.table(datraw, paste0("results/haum/raw_logfiles_small_", now, ".csv"),
|
write.table(datraw, paste0("results/haum/raw_logfiles_", now, ".csv"),
|
||||||
# sep = ";", row.names = FALSE)
|
sep = ";", row.names = FALSE)
|
||||||
|
|
||||||
#--------------- (2) Create event logs ---------------
|
#--------------- (2) Create event logs ---------------
|
||||||
|
|
||||||
datlogs <- create_eventlogs(datraw,
|
datlogs <- create_eventlogs(datraw,
|
||||||
xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/",
|
#xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/",
|
||||||
glossar = TRUE, save = TRUE)
|
glossar = FALSE, save = FALSE)
|
||||||
|
|
||||||
artworks <- unique(datlogs$artwork)
|
# 6,064 glossar entries that could not be matched have been removed
|
||||||
topics <- extract_topics(artworks, xmlfiles = paste0(artworks, ".xml"),
|
|
||||||
|
# 2,136,715 no-change move events have been removed
|
||||||
|
|
||||||
|
items <- unique(datlogs$item)
|
||||||
|
topics <- extract_topics(items, xmlfiles = paste0(items, ".xml"),
|
||||||
xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/")
|
xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/")
|
||||||
|
|
||||||
datlogs_topics <- add_topic(datlogs, topics = topics)
|
# Indices for topics:
|
||||||
|
# 0 artist
|
||||||
|
# 1 thema
|
||||||
|
# 2 komposition
|
||||||
|
# 3 leben des kunstwerks
|
||||||
|
# 4 details
|
||||||
|
# 5 licht und farbe
|
||||||
|
# 6 extra info
|
||||||
|
# 7 technik
|
||||||
|
|
||||||
|
# ATTENTION: Need to know which topic maps onto which index!
|
||||||
|
datlogs$topic <- factor(datlogs$topic, levels = 0:7,
|
||||||
|
labels = c("artist", "thema", "komposition",
|
||||||
|
"leben des kunstwerks", "details",
|
||||||
|
"licht und farbe", "extra info",
|
||||||
|
"technik"))
|
||||||
|
|
||||||
#--------------- (3) Add meta data ---------------
|
#--------------- (3) Add meta data ---------------
|
||||||
|
|
||||||
@ -60,6 +79,7 @@ hd0$X.br. <- NULL
|
|||||||
hd <- hd0[hd0$Abkuerzung == "NI", ]
|
hd <- hd0[hd0$Abkuerzung == "NI", ]
|
||||||
names(hd) <- c("state", "stateCode", "date", "holiday")
|
names(hd) <- c("state", "stateCode", "date", "holiday")
|
||||||
hd$date <- as.POSIXct(hd$date)
|
hd$date <- as.POSIXct(hd$date)
|
||||||
|
hd$state <- NULL
|
||||||
|
|
||||||
## Read data for school vacations
|
## Read data for school vacations
|
||||||
|
|
||||||
@ -92,7 +112,7 @@ sfdat <- NULL
|
|||||||
for (i in seq_len(nrow(sf))) {
|
for (i in seq_len(nrow(sf))) {
|
||||||
date <- seq(sf$start[i], sf$end[i], by = 1)
|
date <- seq(sf$start[i], sf$end[i], by = 1)
|
||||||
sfdat <- rbind(sfdat, data.frame(date, vacations = sf$name[i],
|
sfdat <- rbind(sfdat, data.frame(date, vacations = sf$name[i],
|
||||||
stateCodeVacations = sf$stateCode[i]))
|
stateCode = sf$stateCode[i]))
|
||||||
}
|
}
|
||||||
|
|
||||||
# TODO: How to handle stateCode? There will be several for certain types of
|
# TODO: How to handle stateCode? There will be several for certain types of
|
||||||
@ -100,14 +120,20 @@ for (i in seq_len(nrow(sf))) {
|
|||||||
|
|
||||||
## Merge data
|
## Merge data
|
||||||
|
|
||||||
datlogs_topics$date <- as.Date(datlogs_topics$date.start)
|
datlogs$date <- as.Date(datlogs$date.start)
|
||||||
|
|
||||||
dat1 <- merge(datlogs_topics, hd, by.x = "date", by.y = "date", all.x = TRUE)
|
dat1 <- merge(datlogs, hd, by.x = "date", by.y = "date", all.x = TRUE)
|
||||||
dat2 <- merge(dat1, sfdat, by.x = "date", by.y = "date", all.x = TRUE)
|
dat2 <- merge(dat1, sfdat, by.x = "date", by.y = "date", all.x = TRUE)
|
||||||
|
dat2$stateCode <- dat2$stateCode.y
|
||||||
|
dat2$stateCode <- ifelse(!is.na(dat2$stateCode.x),
|
||||||
|
dat2$stateCode.x, dat2$stateCode.y)
|
||||||
|
dat2$stateCode.x <- NULL
|
||||||
|
dat2$stateCode.y <- NULL
|
||||||
|
dat2$date <- NULL
|
||||||
|
|
||||||
## Export data
|
## Export data
|
||||||
|
|
||||||
write.table(dat2, paste0("results/haum/event_logfiles_glossar_", now, ".csv"),
|
write.table(dat2, paste0("results/haum/event_logfiles_", now, ".csv"),
|
||||||
sep = ";", row.names = FALSE)
|
sep = ";", row.names = FALSE)
|
||||||
|
|
||||||
# TODO: Maybe add infos about artworks?
|
# TODO: Maybe add infos about artworks?
|
||||||
|
Loading…
Reference in New Issue
Block a user