# mtt_haum/code/01_preprocessing_8o8m.R
#
# 38 lines
# 1.2 KiB
# R

# Preprocessing of the 8o8m log files: parse the raw log files, export them,
# convert them to event logs, extract the topics of the artworks, and export
# the event logs.

# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")

# Setup ----

# Load the mtt package from its source directory
# (currently disabled alternative: library(mtt))
devtools::load_all("../../../../software/mtt")

# Time stamp that is part of the names of both exported files
now <- format(Sys.time(), format = "%Y-%m-%d_%H-%M-%S")

# Input locations (trailing slashes are kept exactly as passed on below)
logfile_dir <- "../data/8o8m/LogFiles/"
xml_dir <- "../data/8o8m/Content8o8m/"

# Parse raw log files ----

folders <- list.files(logfile_dir)
# folders <- "Berlin"
datraw <- parse_logfiles(folders, path = logfile_dir)
# artworks <- unique(na.omit(datraw$artwork))

# Export the parsed data; this happens before the rows without an artwork
# are dropped, so the exported file still contains them
raw_outfile <- paste0("../data/8o8m/raw_logfiles_", now, ".csv")
write.table(datraw, file = raw_outfile, sep = ";", row.names = FALSE)

# Keep only the rows that have an artwork
# datraw[is.na(datraw$artwork), ]
has_artwork <- !is.na(datraw$artwork)
datraw <- datraw[has_artwork, ]
# TODO: Why is this happening? (presumably: why are there rows with a missing
# artwork in the parsed data at all -- to be confirmed)

# Convert to log events ----

datlogs <- create_eventlogs(datraw, xmlpath = xml_dir)

# Extract the topics from the English XML file of each artwork
artworks <- unique(datlogs$artwork)
xml_files_en <- paste0(artworks, "_en.xml")
topics <- extract_topics(artworks, xmlfiles = xml_files_en, xmlpath = xml_dir)
# TODO: What is wrong with the German XML files that the topics are
# extracted like this? (It works fine for the English versions...)

# Export the event logs
event_outfile <- paste0("../data/8o8m/event_logfiles_", now, ".csv")
write.table(datlogs, file = event_outfile, sep = ";", row.names = FALSE)