2023-09-22 16:16:20 +02:00
|
|
|
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
|
|
|
|
|
|
|
|
#library(mtt)
|
|
|
|
devtools::load_all("../../../../software/mtt")
|
|
|
|
|
2023-09-26 18:34:59 +02:00
|
|
|
# Timestamp of this run (local time), used as a suffix for the exported files
now <- strftime(Sys.time(), format = "%Y-%m-%d_%H-%M-%S")
|
2023-09-22 16:16:20 +02:00
|
|
|
|
|
|
|
# Names of all entries found directly inside the log file directory
folders <- list.files("../data/8o8m/LogFiles/")
|
|
|
|
#folders <- "Berlin"
|
|
|
|
|
|
|
|
# parse raw log files
|
|
|
|
# Pass every entry of `folders` to parse_logfiles(), resolved relative to the
# LogFiles directory. The result is later filtered by row on its `artwork`
# column, so it is presumably a data frame of raw log rows -- TODO confirm
# against the parse_logfiles() documentation.
datraw <- parse_logfiles(folders, path = "../data/8o8m/LogFiles/")
|
2023-11-01 18:48:14 +01:00
|
|
|
#artworks <- unique(na.omit(datraw$artwork))
|
2023-09-22 16:16:20 +02:00
|
|
|
|
2023-09-26 18:34:59 +02:00
|
|
|
# export data
|
|
|
|
# Write the parsed raw log data to a semicolon-separated file whose name
# carries the run timestamp `now`; row names are not written.
write.table(datraw,
            file = paste0("../data/8o8m/raw_logfiles_", now, ".csv"),
            sep = ";", row.names = FALSE)
|
|
|
|
|
2023-11-01 18:48:14 +01:00
|
|
|
#datraw[is.na(datraw$artwork), ]
|
|
|
|
# Keep only the rows that have a non-missing artwork entry
datraw <- datraw[which(!is.na(datraw$artwork)), ]
|
2023-09-26 18:34:59 +02:00
|
|
|
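# TODO: Why is this happening? (Presumably: why do some parsed rows have an
# NA artwork, so that they have to be dropped above -- confirm.)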
|
|
|
|
|
2023-09-22 16:16:20 +02:00
|
|
|
# convert to log events
|
2023-11-01 18:48:14 +01:00
|
|
|
# Build `datlogs` from the raw data (rows with NA artwork already removed).
# `xmlpath` points at the Content8o8m directory -- presumably the artwork XML
# files are consulted while creating the events; TODO confirm against the
# create_eventlogs() documentation.
datlogs <- create_eventlogs(datraw, xmlpath = "../data/8o8m/Content8o8m/")
|
|
|
|
|
|
|
|
# Distinct artwork identifiers that occur in the event log
artworks <- with(datlogs, unique(artwork))
|
|
|
|
|
|
|
|
|
|
|
|
# Extract the topics for every artwork in `artworks`. One XML file per artwork
# is passed, named "<artwork>_en.xml" (the English content files), and looked
# up in the Content8o8m directory.
topics <- extract_topics(artworks,
                         xmlfiles = paste0(artworks, "_en.xml"),
                         xmlpath = "../data/8o8m/Content8o8m/")
|
|
|
|
# TODO: What is wrong with the German XML files that the topics are
|
|
|
|
# extracted like this? (It works fine for the English versions...)
|
2023-09-22 16:16:20 +02:00
|
|
|
|
|
|
|
# export data
|
|
|
|
# Write the event log data to a semicolon-separated file whose name carries
# the run timestamp `now`; row names are not written.
write.table(datlogs,
            file = paste0("../data/8o8m/event_logfiles_", now, ".csv"),
            sep = ";", row.names = FALSE)
|
|
|
|
|