diff --git a/code/02_preprocessing_8o8m.R b/code/02_preprocessing_8o8m.R index 434a761..6a2eecb 100644 --- a/code/02_preprocessing_8o8m.R +++ b/code/02_preprocessing_8o8m.R @@ -3,7 +3,7 @@ #library(mtt) devtools::load_all("../../../../software/mtt") -now <- format(Sys.time(), , "%Y-%m-%d_%H-%M-%S") +now <- format(Sys.time(), "%Y-%m-%d_%H-%M-%S") folders <- dir("../data/8o8m/LogFiles/") #folders <- "Berlin" @@ -12,8 +12,15 @@ folders <- dir("../data/8o8m/LogFiles/") datraw <- parse_logfiles(folders, path = "../data/8o8m/LogFiles/") artworks <- unique(na.omit(datraw$artwork)) +# export data +write.table(datraw, paste0("../data/8o8m/raw_logfiles_", now, ".csv"), + sep = ";", row.names = FALSE) + +datraw2 <- datraw[!is.na(datraw$artwork), ] +# TODO: Why is this happening? + # convert to log events -datlogs <- create_eventlogs(datraw, xmlfiles = paste0(artworks, "_de.xml"), +datlogs <- create_eventlogs(datraw2, xmlfiles = paste0(artworks, "_de.xml"), xmlpath = "../data/8o8m/Content8o8m/") # export data diff --git a/code/02_preprocessing_haum.R b/code/02_preprocessing_haum.R index 397680b..495329e 100644 --- a/code/02_preprocessing_haum.R +++ b/code/02_preprocessing_haum.R @@ -3,7 +3,8 @@ #library(mtt) devtools::load_all("../../../../software/mtt") -now <- format(Sys.time(), , "%Y-%m-%d_%H-%M-%S") +now <- format(Sys.time(), "%Y-%m-%d_%H-%M-%S") +#now <- "2023-09-23_01-31-30" path <- "../data/haum/LogFiles/" @@ -11,13 +12,27 @@ folders <- dir(path) # parse raw log files datraw <- parse_logfiles(folders, path) + +# export data +write.table(datraw, paste0("../data/haum/raw_logfiles_", now, ".csv"), + sep = ";", row.names = FALSE) +#save(datraw, file = paste0("../data/haum/datraw_", now, ".RData")) +#load("../data/haum/datraw_2023-09-23_01-31-30.RData") artworks <- unique(na.omit(datraw$artwork)) # convert to log events -datlogs <- create_eventlogs(datraw, xmlfiles = paste0(artworks, "_de.xml"), +datlogs <- create_eventlogs(datraw, xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/") +artworks <- unique(na.omit(datlogs$artwork)) +artworks <- artworks[!artworks %in% c("504", "505")] + +topics <- extract_topics(artworks, xmlfiles = paste0(artworks, ".xml"), + xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/") + +datlogs_topics <- add_topic(datlogs, topics = topics) + # export data -write.table(datlogs, paste0("../data/haum/event_logfiles_", now, ".csv"), +write.table(datlogs_topics, paste0("../data/haum/event_logfiles_", now, ".csv"), sep = ";", row.names = FALSE) diff --git a/code/metadata.R b/code/metadata.R new file mode 100644 index 0000000..201cc7f --- /dev/null +++ b/code/metadata.R @@ -0,0 +1,71 @@ +# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/") + +## Read data for holiday + +hd0 <- read.table("data/metadata/feiertage.csv", sep = ";", header = TRUE) +hd0$X.br. <- NULL + +hd <- hd0[hd0$Abkuerzung == "NI", ] +names(hd) <- c("state", "stateCode", "date", "holiday") +hd$date <- as.POSIXct(hd$date) + +## Read data for school vacations + +# https://ferien-api.de/#holidaysPerStateAndYear +# Data extracted (on Linux) via: +# curl https://ferien-api.de/api/v1/holidays/NI > schulferien_NI.json + +# library(jsonlite) +# +# dat <- read_json("data/metadata/schulferien_NI.json", simplify = TRUE) +# dat$slug <- NULL +# +# dat$name <- paste0(gsub("^(.*).niedersachsen.*", "\\1", dat$name), +# gsub("^.*niedersachsen [0-9]{4}(.*)", "\\1", +# dat$name)) +# +# write.table(dat, "data/metadata/schulferien_2019-2025_NI.csv", sep = ";", +# row.names = FALSE, quote = FALSE) + +sf1 <- read.table("data/metadata/schulferien_2016-2018_NI.csv", sep = ";", + header = TRUE) +sf2 <- read.table("data/metadata/schulferien_2019-2025_NI.csv", sep = ";", + header = TRUE) +sf <- rbind(sf1, sf2) +sf$start <- as.Date(sf$start) +sf$end <- as.Date(sf$end) + +sfdat <- NULL + +for (i in seq_len(nrow(sf))) { + date <- seq(sf$start[i], sf$end[i], by = 1) + sfdat <- rbind(sfdat, data.frame(date, vacations = sf$name[i], + stateCodeVacations = sf$stateCode[i])) +} + +# TODO: How to handle stateCode? There will be several for certain types of +# data sets... Not important here, since I only do NI. + +# load (small) event log data set +dat <- read.table("data/haum/event_logfiles_2023-09-23_01-31-30.csv", + sep = ";", header = TRUE) +dat$date.start <- as.POSIXct(dat$date.start) +dat$date.stop <- as.POSIXct(dat$date.stop) +dat$artwork <- sprintf("%03d", dat$artwork) +dat$date <- as.Date(dat$date.start) + +## Add metadata + +# holidays +dat1 <- merge(dat, hd, by = "date", all.x = TRUE) +# school vacations +dat2 <- merge(dat1, sfdat, by = "date", all.x = TRUE) + +## Export data + +write.table(dat2, + file = "data/haum/event_logfiles_metadata_2023-09-23_01-31-30.csv", + sep = ";", row.names = FALSE) + +# TODO: Maybe add infos about artworks? + diff --git a/code/overview_artworks_8o8m.R b/code/overview_artworks_8o8m.R index ebe1d0c..91b395f 100644 --- a/code/overview_artworks_8o8m.R +++ b/code/overview_artworks_8o8m.R @@ -10,7 +10,7 @@ if (file.exists(fname)) file.remove(fname) path <- "C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/8o8m/Content8o8m" artworks <- dir(path = path) -dat <- extract_artworks(artworks, file = paste0(artworks, "_de.xml"), path = path) +dat <- extract_artworks(artworks, xmlfiles = paste0(artworks, "_de.xml"), xmlpath = path) fout <- file(fname, "a") # open in append mode diff --git a/code/overview_artworks_haum.R b/code/overview_artworks_haum.R index 1fd34b1..c11e3a1 100644 --- a/code/overview_artworks_haum.R +++ b/code/overview_artworks_haum.R @@ -1,15 +1,16 @@ # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code") -source("functions.R") +devtools::load_all("../../../../software/mtt") fname <- "../haum/overview_artworks.tex" if (file.exists(fname)) file.remove(fname) -path <- "C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light" -artworks <- dir(path = path) +xmlpath <- "C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/ContentEyevisit/eyevisit_cards_light" +artworks <- dir(path = xmlpath) artworks <- artworks[artworks != "glossar"] -dat <- extract_artworks(artworks, files = paste0(artworks, ".xml"), path = path) +dat <- extract_artworks(artworks, xmlfiles = paste0(artworks, ".xml"), + xmlpath = xmlpath) fout <- file(fname, "a") # open in append mode @@ -35,7 +36,7 @@ for (artwork in dat$artwork) { writeLines("", fout) writeLines("\\begin{center}", fout) - art_path <- paste(path, artwork, artwork, sep = "/") + art_path <- paste(xmlpath, artwork, artwork, sep = "/") writeLines(paste0("\\includegraphics[width = 12cm]{", art_path, "}"), fout) writeLines("\\end{center}", fout) writeLines("", fout) diff --git a/code/schulferien_json-to-dataframe.R b/code/schulferien_json-to-dataframe.R deleted file mode 100644 index 4464845..0000000 --- a/code/schulferien_json-to-dataframe.R +++ /dev/null @@ -1,18 +0,0 @@ -# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/data/metadata") - -# https://ferien-api.de/#holidaysPerStateAndYear -# Data extracted (on Linux) via: -# curl https://ferien-api.de/api/v1/holidays/NI > schulferien_NI.json - -library(jsonlite) - -dat <- read_json("schulferien_NI.json", simplify = TRUE) -dat$slug <- NULL -dat$stateCode <- NULL - -dat$name <- paste0(gsub("^(.*).niedersachsen.*", "\\1", dat$name), - gsub("^.*niedersachsen [0-9]{4}(.*)", "\\1", - dat$name)) - -write.table(dat, "schulferien_2019-2025_NI.csv", sep = ";", row.names = FALSE) - diff --git a/code/visualization.R b/code/visualization.R index 255ab27..7d46cde 100644 --- a/code/visualization.R +++ b/code/visualization.R @@ -1,16 +1,178 @@ -setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code/") +# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code/") +library(lattice) + +# Read data +datlogs <- read.table("../data/haum/event_logfiles_metadata_2023-09-23_01-31-30.csv", + sep = ";", header = TRUE) +datlogs$date <- as.Date(datlogs$date.start) +datlogs$date.start <- as.POSIXct(datlogs$date.start) +datlogs$date.stop <- as.POSIXct(datlogs$date.stop) +datlogs$artwork <- sprintf("%03d", datlogs$artwork) + +### Which artwork gets touched most often/first? + +counts_artwork <- table(datlogs$artwork) +barchart(counts_artwork) + +artworks <- unique(datlogs$artwork) +artworks <- artworks[!artworks %in% c("504", "505")] +datart <- extract_artworks(artworks, + paste0(artworks, ".xml"), + "../data/haum/ContentEyevisit/eyevisit_cards_light/") +datart <- datart[order(datart$artwork), ] +names(counts_artwork) <- datart$title + +pdf("../figures/counts_artwork.pdf", width = 20, height = 10, pointsize = 10) +par(mai = c(5, .6, .1, .1)) +mtp <- barplot(counts_artwork, las = 2, ylim = c(0, 60000), border = "white") +text(tmp, counts_artwork + 1000, c(datart$artwork, "504", "505")) +dev.off() + +### Which artwork gets touched most often first? + +datcase <- datlogs[!duplicated(datlogs$case), ] +counts_case <- table(datcase$artwork) +names(counts_case) <- datart$title +tmp <- barplot(counts_case, las = 2, border = "white") +text(tmp, counts_case + 100, c(datart$artwork, "504", "505")) + +counts <- rbind(counts_artwork, counts_case) +barplot(counts, las = 2, border = "white", col = c("gray", "darkorange")) + +### Which teasers seem to work well? +barplot(table(datlogs$topic), las = 2) +barplot(table(datlogs$topicFile), las = 2) + + +### Dwell times/duration +datagg <- aggregate(duration ~ event + artwork, datlogs, mean) +datagg$ds <- datagg$duration / 1000 + +bwplot(ds ~ as.factor(event), datagg) +xyplot(ds ~ as.factor(event), datagg, groups = artwork) + +# without aggregation +bwplot(duration ~ as.factor(event), datlogs) +# in min +bwplot(I(duration/1000/60) ~ as.factor(event), datlogs) + + +datlogs$daydiff <- c(NA, diff(datlogs$date)) +plot(daydiff ~ date, datlogs, type = "b") + + +### Are there certain areas of the table that are touched most often? + +# heatmap +cuts <- 100 + +datlogs$x.start.cat <- cut(datlogs$x.start, cuts) +datlogs$y.start.cat <- cut(datlogs$y.start, cuts) + +tab <- xtabs( ~ x.start.cat + y.start.cat, datlogs) + +colnames(tab) <- paste0("c", 1:cuts) +rownames(tab) <- paste0("c", 1:cuts) + +heatmap(tab, Rowv = NA, Colv = NA) + + +library(ggplot2) + +ggplot(as.data.frame(tab)) + + geom_tile(aes(x = x.start.cat, y = y.start.cat, fill = Freq)) + + scale_fill_gradient(low = "gray40", high = "orange") + +dattrim <- datlogs[datlogs$x.start < 3840 & + datlogs$x.start > 0 & + datlogs$y.start < 2160 & + datlogs$y.start > 0 & + datlogs$x.stop < 3840 & + datlogs$x.stop > 0 & + datlogs$y.stop < 2160 & + datlogs$y.stop > 0, ] + +cuts <- 200 # 100, 70, ... + +# start +dattrim$x.start.cat <- cut(dattrim$x.start, cuts) +dattrim$y.start.cat <- cut(dattrim$y.start, cuts) + +tab.start <- xtabs( ~ x.start.cat + y.start.cat, dattrim) +colnames(tab.start) <- paste0("c", 1:cuts) +rownames(tab.start) <- paste0("c", 1:cuts) + +heatmap(tab.start, Rowv = NA, Colv = NA) +my_colors <- colorRampPalette(c("gray40", "orange")) +heatmap(tab.start, Rowv = NA, Colv = NA, col = my_colors(1000)) + +ggplot(as.data.frame(tab.start)) + + geom_tile(aes(x = x.start.cat, y = y.start.cat, fill = Freq)) + + scale_fill_gradient(low = "gray40", high = "orange") + +# stop +dattrim$x.stop.cat <- cut(dattrim$x.stop, cuts) +dattrim$y.stop.cat <- cut(dattrim$y.stop, cuts) +tab.stop <- xtabs( ~ x.stop.cat + y.stop.cat, dattrim) +colnames(tab.stop) <- paste0("c", 1:cuts) +rownames(tab.stop) <- paste0("c", 1:cuts) + +heatmap(tab.stop, Rowv = NA, Colv = NA) +heatmap(tab.stop, Rowv = NA, Colv = NA, col = my_colors(1000)) + +### How many visitors per day + +# Interactions per day +datint <- aggregate(case ~ date, datlogs, length) +plot(datint, type = "h") + +# Cases per day +datcase <- aggregate(case ~ date, datlogs, function(x) length(unique(x))) +plot(datcase, type = "h") + +# Traces per day +dattrace <- aggregate(trace ~ date, datlogs, function(x) length(unique(x))) +plot(dattrace, type = "h") + + +# function dependencies of mtt devtools::load_all("../../../../software/mtt") #library(mtt) -dat <- parse_logfiles("2016", path = "../data/haum/LogFiles/", - save = FALSE) -datlogs <- create_eventlogs(dat, "../data/haum/ContentEyevisit/eyevisit_cards_light/") +library(mvbutils) +foodweb(where = "package:mtt") + +foodweb(where = "package:mtt", + prune = c("parse_logfiles", "create_eventlogs", "extract_artworks", + "extract_topics", "add_topic"), + expand.ybox = 1.8, #cex = .6, + boxcolor = "gray", lwd = 2) + + + + + + + + + + + + + + + + + + + +### Other stuff dat001 <- datlogs[which(datlogs$artwork == "001"), ] index <- as.numeric(as.factor(dat001$trace)) -cc <- sample(colors(), length(unique(dat001$trace))) +cc <- sample(colors(), 100) plot(y.start ~ x.start, dat001, type = "n", xlab = "x", ylab = "y", xlim = c(0, 3840), ylim = c(0, 2160)) @@ -24,3 +186,15 @@ points(y.start ~ x.start, dat001, xlab = "x", ylab = "y", col = "blue") + +cc <- sample(colors(), 70) + +dat1 <- datlogs[!duplicated(datlogs$artwork), ] +dat1 <- dat1[order(dat1$artwork), ] + +plot(y.start ~ x.start, dat1, type = "n", xlim = c(-100, 4500), ylim = c(-100, 2500)) +abline(h = c(0, 2160), v = c(0, 3840), col = "lightgray") +with(dat1, points(x.start, y.start, col = cc, pch = 16)) +with(dat1, points(x.stop, y.stop, col = cc, pch = 16)) +with(dat1, arrows(x.start, y.start, x.stop, y.stop, length = .07, col = cc)) +