diff --git a/code/01_preprocessing_haum.R b/code/01_preprocessing_haum.R index b8bf2cd..4f95e06 100644 --- a/code/01_preprocessing_haum.R +++ b/code/01_preprocessing_haum.R @@ -10,9 +10,8 @@ # ../data/metadata/schulferien_2019-2025_NI.csv # output: raw_logfiles_.csv # event_logfiles_.csv -# event_logfiles_.csv # -# last mod: 2023-10-23, NW +# last mod: 2024-01-01, NW # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code") @@ -42,15 +41,15 @@ write.table(datraw, paste0("results/haum/raw_logfiles_", now, ".csv"), datlogs <- create_eventlogs(datraw, #xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/", - glossar = FALSE, save = FALSE) + glossar = FALSE) # 6,064 glossar entries, that could not be matched, have been removed # 2,136,715 no change move events have been removed -items <- unique(datlogs$item) -topics <- extract_topics(items, xmlfiles = paste0(items, ".xml"), - xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/") +# items <- unique(datlogs$item) +# topics <- extract_topics(items, xmlfiles = paste0(items, ".xml"), +# xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/") # Indices for topics: # 0 artist @@ -80,6 +79,7 @@ hd <- hd0[hd0$Abkuerzung == "NI", ] names(hd) <- c("state", "stateCode", "date", "holiday") hd$date <- as.POSIXct(hd$date) hd$state <- NULL +hd$stateCode <- NULL ## Read data for school vacations @@ -111,12 +111,11 @@ sfdat <- NULL for (i in seq_len(nrow(sf))) { date <- seq(sf$start[i], sf$end[i], by = 1) - sfdat <- rbind(sfdat, data.frame(date, vacations = sf$name[i], + sfdat <- rbind(sfdat, data.frame(date, vacation = sf$name[i], stateCode = sf$stateCode[i])) } -# TODO: How to handle stateCode? There will be several for certain types of -# data sets... Not important here, since I only do NI. +sfdat$stateCode <- NULL ## Merge data @@ -124,11 +123,7 @@ datlogs$date <- as.Date(datlogs$date.start) dat1 <- merge(datlogs, hd, by.x = "date", by.y = "date", all.x = TRUE) dat2 <- merge(dat1, sfdat, by.x = "date", by.y = "date", all.x = TRUE) -dat2$stateCode <- dat2$stateCode.y -dat2$stateCode <- ifelse(!is.na(dat2$stateCode.x), - dat2$stateCode.x, dat2$stateCode.y) -dat2$stateCode.x <- NULL -dat2$stateCode.y <- NULL + dat2$date <- NULL ## Export data @@ -136,5 +131,3 @@ dat2$date <- NULL write.table(dat2, paste0("results/haum/event_logfiles_", now, ".csv"), sep = ";", row.names = FALSE) -# TODO: Maybe add infos about artworks? -