From d4e9676dd3947dd8b55199c9a01d209c65c7ac53 Mon Sep 17 00:00:00 2001 From: nwickel Date: Wed, 1 Nov 2023 18:46:39 +0100 Subject: [PATCH] Prepared slides for meeting with BB --- code/00_current-anaylsis.R | 370 +++++++++++++++++++++++++++++++++++++ 1 file changed, 370 insertions(+) create mode 100644 code/00_current-anaylsis.R diff --git a/code/00_current-anaylsis.R b/code/00_current-anaylsis.R new file mode 100644 index 0000000..36f0b20 --- /dev/null +++ b/code/00_current-anaylsis.R @@ -0,0 +1,370 @@ +# 00_current_analysis.R +# +# content: (1) Read event log data +# (2) Descriptives +# (3) Process Mining +# +# input: ../data/haum/event_logfiles_glossar_2023-10-29_10-26-42.csv +# output: +# +# last mod: 2023-11-02, NW + +# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code") + +#library(mtt) +devtools::load_all("../../../../software/mtt") +library(lattice) +library(bupaverse) + +# Overall Research Question: How do museum visitors interact with the +# artworks presented on the MTT? + + + +# What about popups? + + +# Distribution of bursts +# Can this be visualized in a nice way? + +#--------------- (1) Read data --------------- + +dat <- read.table("../data/haum/event_logfiles_glossar_2023-10-29_10-26-42.csv", + sep = ";", header = TRUE) +dat$date <- as.POSIXct(dat$date) +dat$date.start <- as.POSIXct(dat$date.start) +dat$date.stop <- as.POSIXct(dat$date.stop) +dat$artwork <- sprintf("%03d", dat$artwork) +dat$event <- factor(dat$event, levels = c("move", "flipCard", "openTopic", "openPopup")) + +# Add weekdays to data frame + +dat$weekdays <- factor(weekdays(dat$date.start), + levels = c("Montag", "Dienstag", "Mittwoch", + "Donnerstag", "Freitag", "Samstag", + "Sonntag"), + labels = c("Monday", "Tuesday", "Wednesday", + "Thursday", "Friday", "Saturday", + "Sunday")) + +names(dat)[names(dat) %in% c("date.start", "date.stop")] <- c("start", "complete") + +#--------------- (2) Descriptives --------------- +# How many events per topic, per trace, ... +# How many popups per artwork? + +# Number of events per artwork +tab <- xtabs( ~ artwork + event, dat) +addmargins(tab) + +proportions(tab, margin = "artwork") +proportions(tab, margin = "event") + +barplot(t(proportions(tab, margin = "artwork")), las = 2) + +# Proportion of events +proportions(xtabs( ~ event, dat)) +# Mean proportion of event per trace +colMeans(proportions(xtabs( ~ trace + event, dat), margin = "trace")) +# Mean proportion of event per artwork +colMeans(proportions(tab, margin = "artwork")) + +# Proportion of unclosed events + +nrow(dat[is.na(dat$complete), ]) +nrow(dat[is.na(dat$complete), ]) / nrow(dat) + + +#--------------- (3) Process Mining --------------- + +#--------------- (3.1) Check data quality --------------- + +alog <- activitylog(dat, + case_id = "trace", + activity_id = "event", + #resource_id = "case", + resource_id = "artwork", + timestamps = c("start", "complete")) + +# process_map(alog, frequency("relative")) +map_as_pdf(alog, file = "../figures/pm_trace-event.pdf") + +#--------------- (3.2) Interactions for different artworks --------------- +# Do interaction patterns for events per trace look different for different +# artworks? + +which.max(table(dat$artwork)) +which.min(table(dat$artwork)) +which.min(table(dat$artwork)[-c(71,72)]) + +alog080 <- activitylog(dat[dat$artwork == "080",], + case_id = "trace", + activity_id = "event", + resource_id = "artwork", + timestamps = c("start", "complete")) + +map_as_pdf(alog080, file = "../figures/pm_trace-event_080.pdf") + +alog087 <- activitylog(dat[dat$artwork == "087",], + case_id = "trace", + activity_id = "event", + resource_id = "artwork", + timestamps = c("start", "complete")) + +map_as_pdf(alog087, file = "../figures/pm_trace-event_087.pdf") + +alog504 <- activitylog(dat[dat$artwork == "504",], + case_id = "trace", + activity_id = "event", + resource_id = "artwork", + timestamps = c("start", "complete")) + +map_as_pdf(alog504, file = "../figures/pm_trace-event_504.pdf") + +#--------------- (3.3) Patterns of cases --------------- + +# What kind of patterns do we have? Are their typical sequences for cases? +# Do case patterns look different for ... +# ... mornings and afternoons? +# ... weekdays and weekends? +# ... weekdays for "normal" and school vacation days? +# ... pre and post corona? + +alog <- activitylog(dat, + case_id = "case", + activity_id = "event", + resource_id = "trace", + timestamps = c("start", "complete")) + +map_as_pdf(alog, file = "../figures/pm_case-event.pdf") + +alog080 <- activitylog(dat[dat$artwork == "080",], + case_id = "case", + activity_id = "event", + resource_id = "trace", + timestamps = c("start", "complete")) + +map_as_pdf(alog080, file = "../figures/pm_case-event_080.pdf") + +alog087 <- activitylog(dat[dat$artwork == "087",], + case_id = "case", + activity_id = "event", + resource_id = "trace", + timestamps = c("start", "complete")) + +map_as_pdf(alog087, file = "../figures/pm_case-event_087.pdf") + +### Mornings and afternoons + +dat$tod <- ifelse(lubridate::hour(dat$start) > 13, "afternoon", "morning") + +alog <- activitylog(dat[dat$tod == "morning",], + case_id = "case", + activity_id = "event", + resource_id = "trace", + timestamps = c("start", "complete")) + +map_as_pdf(alog, file = "../figures/pm_case-event_morning.pdf") + +alog <- activitylog(dat[dat$tod == "afternoon",], + case_id = "case", + activity_id = "event", + resource_id = "trace", + timestamps = c("start", "complete")) + +map_as_pdf(alog, file = "../figures/pm_case-event_afternoon.pdf") + +### Weekdays and weekends + +dat$wd <- ifelse(dat$weekdays %in% c("Saturday", "Sunday"), "weekend", "weekday") + +alog <- activitylog(dat[dat$wd == "weekend",], + case_id = "case", + activity_id = "event", + resource_id = "trace", + timestamps = c("start", "complete")) + +map_as_pdf(alog, file = "../figures/pm_case-event_weekend.pdf") + +alog <- activitylog(dat[dat$wd == "weekday",], + case_id = "case", + activity_id = "event", + resource_id = "trace", + timestamps = c("start", "complete")) + +map_as_pdf(alog, file = "../figures/pm_case-event_weekday.pdf") + +### Weekdays vs. school vacation weekdays + +dat$wds <- ifelse(!is.na(dat$vacation), "vacation", "school") +dat$wds[dat$wd == "weekend"] <- NA + +alog <- activitylog(dat[which(dat$wds == "school"),], + case_id = "case", + activity_id = "event", + resource_id = "trace", + timestamps = c("start", "complete")) + +map_as_pdf(alog, file = "../figures/pm_case-event_school.pdf") + +alog <- activitylog(dat[which(dat$wds == "vacation"),], + case_id = "case", + activity_id = "event", + resource_id = "trace", + timestamps = c("start", "complete")) + +map_as_pdf(alog, file = "../figures/pm_case-event_vacation.pdf") + +### Pre and post Corona + +dat$corona <- ifelse(dat$date < "2020-03-14", "pre", "post") + +alog <- activitylog(dat[which(dat$corona == "pre"),], + case_id = "case", + activity_id = "event", + resource_id = "trace", + timestamps = c("start", "complete")) + +map_as_pdf(alog, file = "../figures/pm_case-event_pre-corona.pdf") + +alog <- activitylog(dat[which(dat$corona == "post"),], + case_id = "case", + activity_id = "event", + resource_id = "trace", + timestamps = c("start", "complete")) + +map_as_pdf(alog, file = "../figures/pm_case-event_post-corona.pdf") + +#--------------- (3.4) Artwork sequences --------------- +# Order in which artworks are looked at + +nart <- 5 # select 5 artworks randomly +alog <- activitylog(dat,#[dat$artwork %in% sample(unique(dat$artwork), nart), ], + case_id = "case", + activity_id = "artwork", + resource_id = "trace", + timestamps = c("start", "complete")) + +#map <- process_map(alog, frequency("relative")) + +## select cases with Vermeer +length(unique(dat[dat$artwork == "080", "case"])) +# 12615 +case080 <- unique(dat[dat$artwork == "080", "case"]) +tmp <- dat[dat$case %in% case080, ] +table(tmp$artwork) +# --> all :) + +# select the ones most often (I am aiming for 10...) +barplot(table(tmp$artwork)) +abline(h = 12000, col = "red") +which(table(tmp$artwork) > 12000) + +often080 <- names(which(table(tmp$artwork) > 12000)) + +alog <- activitylog(dat[dat$artwork %in% often080, ], + case_id = "case", + activity_id = "artwork", + resource_id = "trace", + timestamps = c("start", "complete")) + +map_as_pdf(alog, file = "../figures/pm_case-artwork_often080.pdf") + + +#--------------- (3.5) Topics --------------- + +# Are there certain topics that people are interested in more than others? +# Do these topic distributions differ for comparable artworks? + +# Order of topics for Vermeer +# alog080 <- activitylog(dat[dat$artwork == "080",], +# case_id = "case", +# activity_id = "topic", +# resource_id = "trace", +# timestamps = c("start", "complete")) +# +# map_as_pdf(alog080, file = "../figures/pm_case-topic_080.pdf") +# +# +# alog080 <- activitylog(dat[dat$artwork == "080",], +# case_id = "case", +# activity_id = "topicFile", +# resource_id = "trace", +# timestamps = c("start", "complete")) +# +# #process_map(alog080, frequency("relative")) +# +# # Comparable artwork +# alog083 <- activitylog(dat[dat$artwork == "083",], +# case_id = "case", +# activity_id = "topic", +# resource_id = "trace", +# timestamps = c("start", "complete")) +# +# map_as_pdf(alog083, file = "../figures/pm_case-topic_083.pdf") + +# artworks that have the same topics than Vermeer +which(rowSums(xtabs( ~ artwork + topic, dat[dat$topic %in% + c("artist", "details", "extra info", "komposition", + "licht und farbe", "thema"), ]) != 0) == 6) + +#037 046 062 080 083 109 + +for (art in c("037", "046", "062", "080", "083", "109")) { + + alog <- activitylog(dat[dat$artwork == art,], + case_id = "case", + activity_id = "topic", + resource_id = "trace", + timestamps = c("start", "complete")) + + map_as_pdf(alog, file = paste0("../figures/pm_case-topic_", art, ".pdf")) +} + + + +# Angewandte Kunst, Graphik, Gemälde, Kultur + + +c("Kultur", "Kultur", "Graphik", "Gemälde", "Gemälde", "Gemälde", + "Gemälde", "Gemälde", "Graphik", "Gemälde", "Angewandte Kunst", "", + "Gemälde", "Angewandte Kunst", "", "", "Graphik", "Angewandte Kunst", + "Angewandte Kunst", "Gemälde", "Angewandte Kunst", "Gemälde", "", + "Gemälde", "Gemälde", "Gemälde", "Graphik", "Gemälde", "Gemälde", + "Gemälde", "", "Angewandte Kunst", "Angewandte Kunst", "Gemälde", + "Graphik", "Gemälde", "Gemälde", "Gemälde", "Gemälde", + "Angewandte Kunst", "Gemälde", "Gemälde", "Gemälde", "Kultur", "Kultur", + "Gemälde", "Kultur", "", "Gemälde", "", "Graphik", "Kultur", "Gemälde", + "", "Kultur", "Gemälde", "Kultur", "Gemälde", "Gemälde", "Gemälde", + "Kultur", "Kultur", "Kultur", "Kultur", "Kultur", "Kultur", + "Angewandte Kunst", "Info", "Info", "Info", "Kultur", "Kultur") + + + +# BURSTS +which.max(table(dat$date)) +tmp <- dat[dat$date == "2017-02-12", ] + +# number of traces per case on 2017-02-12 +rowSums(xtabs( ~ case + trace, tmp) != 0) + +range(tmp$start) +startcut <- cut(tmp$start, 7) +xtabs( ~ case + startcut, tmp) + +# distribution of cases over the day +colSums(xtabs( ~ case + startcut, tmp) != 0) +barplot(colSums(xtabs( ~ case + startcut, tmp) != 0)) + +aggregate(trace ~ case + startcut, tmp, length) + + +########################################################################### +# HELPER + +map_as_pdf <- function(alog, file, type = frequency("relative")) { + map <- process_map(alog, type = type) + g <- DiagrammeR::grViz(map$x$diagram) |> DiagrammeRsvg::export_svg() |> charToRaw() + rsvg::rsvg_pdf(g, file) +} +