510 lines
17 KiB
R
510 lines
17 KiB
R
# 00_current_analysis.R
|
|
#
|
|
# content: (1) Read event log data
|
|
# (2) Descriptives
|
|
# (3) Process Mining
|
|
#
|
|
# input: ../data/haum/event_logfiles_glossar_2023-10-29_10-26-42.csv
|
|
# output:
|
|
#
|
|
# last mod: 2023-11-02, NW
|
|
|
|
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
|
|
|
|
#library(mtt)
|
|
devtools::load_all("../../../../software/mtt")
|
|
library(lattice)
|
|
library(bupaverse)
|
|
|
|
# Overall Research Question: How do museum visitors interact with the
|
|
# artworks presented on the MTT?
|
|
|
|
#---------------------------------------------------------------------------
|
|
# https://billster45.github.io/rapid_r_data_vis_book/process-mining.html
|
|
# TODO: checkout different specifications for process maps:
|
|
|
|
# edeaR::filter_trace_frequency(percentage = 0.9)
|
|
|
|
# processmapR::process_map(type_nodes = processmapR::frequency("absolute"),
|
|
# sec_nodes = processmapR::frequency("relative"),
|
|
# type_edges = processmapR::frequency("absolute"),
|
|
# sec_edges = processmapR::frequency("relative"),
|
|
# rankdir = "TB")
|
|
|
|
# processmapR::trace_explorer(coverage = 1,
|
|
# type = "frequent",
|
|
# .abbreviate = T)
|
|
|
|
# edeaR::resource_frequency("resource-activity") %>%
|
|
# plot()
|
|
|
|
# edeaR::resource_frequency(level = "case") %>%
|
|
# plot()
|
|
|
|
# edeaR::resource_specialisation(level = "activity") %>%
|
|
# plot()
|
|
|
|
# Functions for conformance checking:
|
|
# https://bupaverse.github.io/processcheckR/
|
|
|
|
#---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
# Distribution of bursts
|
|
# Can this be visualized in a nice way?
|
|
|
|
#--------------- (1) Read data ---------------
|
|
|
|
dat <- read.table("../data/haum/event_logfiles_glossar_2023-11-03_17-46-28.csv",
|
|
sep = ";", header = TRUE)
|
|
dat$date <- as.POSIXct(dat$date)
|
|
dat$date.start <- as.POSIXct(dat$date.start)
|
|
dat$date.stop <- as.POSIXct(dat$date.stop)
|
|
dat$artwork <- sprintf("%03d", dat$artwork)
|
|
dat$event <- factor(dat$event, levels = c("move", "flipCard", "openTopic", "openPopup"))
|
|
|
|
# Add weekdays to data frame
|
|
|
|
dat$weekdays <- factor(weekdays(dat$date.start),
|
|
levels = c("Montag", "Dienstag", "Mittwoch",
|
|
"Donnerstag", "Freitag", "Samstag",
|
|
"Sonntag"),
|
|
labels = c("Monday", "Tuesday", "Wednesday",
|
|
"Thursday", "Friday", "Saturday",
|
|
"Sunday"))
|
|
|
|
names(dat)[names(dat) %in% c("date.start", "date.stop")] <- c("start", "complete")
|
|
|
|
#--------------- (2) Descriptives ---------------
|
|
# How many events per topic, per trace, ...
|
|
# How many popups per artwork?
|
|
|
|
# Number of events per artwork
|
|
tab <- xtabs( ~ artwork + event, dat)
|
|
addmargins(tab)
|
|
|
|
proportions(tab, margin = "artwork")
|
|
proportions(tab, margin = "event")
|
|
|
|
cc <- palette.colors(palette = "Okabe-Ito")[c(3,2,4,8)]
|
|
|
|
pdf("../figures/event-dist.pdf", height = 3.375, width = 12, pointsize = 10)
|
|
par(mai = c(.4,.4,.1,.1), mgp = c(2.4, 1, 0))
|
|
|
|
barplot(t(proportions(tab, margin = "artwork")), las = 2, col = cc,
|
|
legend.text = levels(dat$event), args.legend = list(x = "bottomleft", bg = "white"))
|
|
|
|
dev.off()
|
|
|
|
#barchart(proportions(tab, margin = "artwork"), las = 2)
|
|
|
|
# Proportion of events
|
|
proportions(xtabs( ~ event, dat))
|
|
# Mean proportion of event per trace
|
|
colMeans(proportions(xtabs( ~ trace + event, dat), margin = "trace"))
|
|
# Mean proportion of event per artwork
|
|
colMeans(proportions(tab, margin = "artwork"))
|
|
|
|
# Proportion of unclosed events
|
|
|
|
nrow(dat[is.na(dat$complete), ])
|
|
nrow(dat[is.na(dat$complete), ]) / nrow(dat)
|
|
|
|
# Proportion of events spanning more than one log file
|
|
sum(dat$fileId.start != dat$fileId.stop, na.rm = TRUE)
|
|
sum(dat$fileId.start != dat$fileId.stop, na.rm = TRUE) / nrow(dat)
|
|
|
|
|
|
#--------------- (3) Process Mining ---------------
|
|
|
|
#--------------- (3.1) Check data quality ---------------
|
|
|
|
dat$trace2 <- dat$trace
|
|
dat$trace <- NULL
|
|
|
|
alog <- activitylog(dat,
|
|
case_id = "trace2",
|
|
activity_id = "event",
|
|
#resource_id = "case",
|
|
resource_id = "artwork",
|
|
timestamps = c("start", "complete"))
|
|
|
|
# process_map(alog, frequency("relative"))
|
|
map_as_pdf(alog, file = "../figures/pm_trace-event.pdf")
|
|
|
|
alogf <- edeaR::filter_trace_frequency(alog, percentage = 0.9)
|
|
|
|
processmapR::process_map(alogf, # alog,
|
|
type_nodes = processmapR::frequency("absolute"),
|
|
sec_nodes = processmapR::frequency("relative"),
|
|
type_edges = processmapR::frequency("absolute"),
|
|
sec_edges = processmapR::frequency("relative"),
|
|
rankdir = "TB")
|
|
|
|
alog_no_move <- alog[alog$event != "move", ]
|
|
set.seed(1447)
|
|
processmapR::trace_explorer(alog_no_move[alog_no_move$trace2 %in%
|
|
sample(unique(alog_no_move$trace2), 400),],
|
|
coverage = 1, type = "frequent",
|
|
abbreviate = T)
|
|
|
|
ra_no_move <- edeaR::resource_frequency(alog_no_move, "resource-activity")
|
|
levels(ra_no_move$event) <- c("flipCard", "flipCard", "openTopic", "openPopup")
|
|
plot(ra_no_move)
|
|
|
|
ra <- edeaR::resource_frequency(alog, "resource-activity")
|
|
plot(ra)
|
|
|
|
heatmap(xtabs(relative_activity ~ artwork + event, ra))
|
|
heatmap(xtabs(relative_resource ~ artwork + event, ra_no_move))
|
|
heatmap(xtabs(relative_activity ~ artwork + event, ra_no_move))
|
|
|
|
aggregate(relative_activity ~ event, ra, sum)
|
|
aggregate(relative_resource ~ artwork, ra, sum)
|
|
|
|
|
|
#--------------- (3.2) Interactions for different artworks ---------------
|
|
# Do interaction patterns for events per trace look different for different
|
|
# artworks?
|
|
|
|
which.max(table(dat$artwork))
|
|
which.min(table(dat$artwork))
|
|
which.min(table(dat$artwork)[-c(71,72)])
|
|
|
|
alog080 <- activitylog(dat[dat$artwork == "080",],
|
|
case_id = "trace",
|
|
activity_id = "event",
|
|
resource_id = "artwork",
|
|
timestamps = c("start", "complete"))
|
|
|
|
map_as_pdf(alog080, file = "../figures/pm_trace-event_080.pdf")
|
|
|
|
alog087 <- activitylog(dat[dat$artwork == "087",],
|
|
case_id = "trace",
|
|
activity_id = "event",
|
|
resource_id = "artwork",
|
|
timestamps = c("start", "complete"))
|
|
|
|
map_as_pdf(alog087, file = "../figures/pm_trace-event_087.pdf")
|
|
|
|
alog504 <- activitylog(dat[dat$artwork == "504",],
|
|
case_id = "trace",
|
|
activity_id = "event",
|
|
resource_id = "artwork",
|
|
timestamps = c("start", "complete"))
|
|
|
|
map_as_pdf(alog504, file = "../figures/pm_trace-event_504.pdf")
|
|
|
|
#--------------- (3.3) Patterns of cases ---------------
|
|
|
|
# What kind of patterns do we have? Are their typical sequences for cases?
|
|
# Do case patterns look different for ...
|
|
# ... mornings and afternoons?
|
|
# ... weekdays and weekends?
|
|
# ... weekdays for "normal" and school vacation days?
|
|
# ... pre and post corona?
|
|
|
|
alog <- activitylog(dat,
|
|
case_id = "case",
|
|
activity_id = "event",
|
|
resource_id = "trace",
|
|
timestamps = c("start", "complete"))
|
|
|
|
map_as_pdf(alog, file = "../figures/pm_case-event.pdf")
|
|
|
|
alog080 <- activitylog(dat[dat$artwork == "080",],
|
|
case_id = "case",
|
|
activity_id = "event",
|
|
resource_id = "trace",
|
|
timestamps = c("start", "complete"))
|
|
|
|
map_as_pdf(alog080, file = "../figures/pm_case-event_080.pdf")
|
|
|
|
alog087 <- activitylog(dat[dat$artwork == "087",],
|
|
case_id = "case",
|
|
activity_id = "event",
|
|
resource_id = "trace",
|
|
timestamps = c("start", "complete"))
|
|
|
|
map_as_pdf(alog087, file = "../figures/pm_case-event_087.pdf")
|
|
|
|
### Mornings and afternoons
|
|
|
|
dat$tod <- ifelse(lubridate::hour(dat$start) > 13, "afternoon", "morning")
|
|
|
|
alog <- activitylog(dat[dat$tod == "morning",],
|
|
case_id = "case",
|
|
activity_id = "event",
|
|
resource_id = "trace",
|
|
timestamps = c("start", "complete"))
|
|
|
|
map_as_pdf(alog, file = "../figures/pm_case-event_morning.pdf")
|
|
|
|
alog <- activitylog(dat[dat$tod == "afternoon",],
|
|
case_id = "case",
|
|
activity_id = "event",
|
|
resource_id = "trace",
|
|
timestamps = c("start", "complete"))
|
|
|
|
map_as_pdf(alog, file = "../figures/pm_case-event_afternoon.pdf")
|
|
|
|
# Are the same artworks looked at?
|
|
pdf("../figures/bp_tod.pdf", height = 3.375, width = 12, pointsize = 10)
|
|
par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0))
|
|
|
|
barplot(proportions(xtabs( ~ tod + artwork, dat), margin = "tod"), #col = cc[1:2],
|
|
las = 2, beside = TRUE, legend = c("afternoon", "morning"),
|
|
args.legend = list(x = "topleft"))
|
|
|
|
dev.off()
|
|
|
|
### Weekdays and weekends
|
|
|
|
dat$wd <- ifelse(dat$weekdays %in% c("Saturday", "Sunday"), "weekend", "weekday")
|
|
|
|
alog <- activitylog(dat[dat$wd == "weekend",],
|
|
case_id = "case",
|
|
activity_id = "event",
|
|
resource_id = "trace",
|
|
timestamps = c("start", "complete"))
|
|
|
|
map_as_pdf(alog, file = "../figures/pm_case-event_weekend.pdf")
|
|
|
|
alog <- activitylog(dat[dat$wd == "weekday",],
|
|
case_id = "case",
|
|
activity_id = "event",
|
|
resource_id = "trace",
|
|
timestamps = c("start", "complete"))
|
|
|
|
map_as_pdf(alog, file = "../figures/pm_case-event_weekday.pdf")
|
|
|
|
# Are the same artworks looked at?
|
|
pdf("../figures/bp_wd.pdf", height = 3.375, width = 12, pointsize = 10)
|
|
par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0))
|
|
|
|
barplot(proportions(xtabs( ~ wd + artwork, dat), margin = "wd"),
|
|
las = 2, beside = TRUE, legend = c("weekday", "weekend"),
|
|
args.legend = list(x = "topleft"))
|
|
|
|
dev.off()
|
|
|
|
### Weekdays vs. school vacation weekdays
|
|
|
|
dat$wds <- ifelse(!is.na(dat$vacation), "vacation", "school")
|
|
dat$wds[dat$wd == "weekend"] <- NA
|
|
|
|
alog <- activitylog(dat[which(dat$wds == "school"),],
|
|
case_id = "case",
|
|
activity_id = "event",
|
|
resource_id = "trace",
|
|
timestamps = c("start", "complete"))
|
|
|
|
map_as_pdf(alog, file = "../figures/pm_case-event_school.pdf")
|
|
|
|
alog <- activitylog(dat[which(dat$wds == "vacation"),],
|
|
case_id = "case",
|
|
activity_id = "event",
|
|
resource_id = "trace",
|
|
timestamps = c("start", "complete"))
|
|
|
|
map_as_pdf(alog, file = "../figures/pm_case-event_vacation.pdf")
|
|
|
|
# Are the same artworks looked at?
|
|
pdf("../figures/bp_wds.pdf", height = 3.375, width = 12, pointsize = 10)
|
|
par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0))
|
|
|
|
#barplot(xtabs( ~ wds + artwork, dat), las = 2, beside = TRUE,
|
|
barplot(proportions(xtabs( ~ wds + artwork, dat), margin = "wds"),
|
|
las = 2, beside = TRUE,
|
|
legend = c("school", "vacation"), args.legend = list(x = "topleft"))
|
|
|
|
dev.off()
|
|
|
|
### Pre and post Corona
|
|
|
|
dat$corona <- ifelse(dat$date < "2020-03-14", "pre", "post")
|
|
|
|
alog <- activitylog(dat[which(dat$corona == "pre"),],
|
|
case_id = "case",
|
|
activity_id = "event",
|
|
resource_id = "trace",
|
|
timestamps = c("start", "complete"))
|
|
|
|
map_as_pdf(alog, file = "../figures/pm_case-event_pre-corona.pdf")
|
|
|
|
alog <- activitylog(dat[which(dat$corona == "post"),],
|
|
case_id = "case",
|
|
activity_id = "event",
|
|
resource_id = "trace",
|
|
timestamps = c("start", "complete"))
|
|
|
|
map_as_pdf(alog, file = "../figures/pm_case-event_post-corona.pdf")
|
|
|
|
# Are the same artworks looked at?
|
|
pdf("../figures/bp_corona.pdf", height = 3.375, width = 12, pointsize = 10)
|
|
par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0))
|
|
|
|
barplot(proportions(xtabs( ~ corona + artwork, dat), margin = "corona"),
|
|
las = 2, beside = TRUE,
|
|
legend = c("post", "pre"), args.legend = list(x = "topleft"))
|
|
|
|
dev.off()
|
|
|
|
#--------------- (3.4) Artwork sequences ---------------
|
|
# Order in which artworks are looked at
|
|
|
|
nart <- 5 # select 5 artworks randomly
|
|
alog <- activitylog(dat,#[dat$artwork %in% sample(unique(dat$artwork), nart), ],
|
|
case_id = "case",
|
|
activity_id = "artwork",
|
|
resource_id = "trace",
|
|
timestamps = c("start", "complete"))
|
|
|
|
#map <- process_map(alog, frequency("relative"))
|
|
|
|
## select cases with Vermeer
|
|
length(unique(dat[dat$artwork == "080", "case"]))
|
|
# 12615
|
|
case080 <- unique(dat[dat$artwork == "080", "case"])
|
|
tmp <- dat[dat$case %in% case080, ]
|
|
table(tmp$artwork)
|
|
# --> all :)
|
|
|
|
# select the ones most often (I am aiming for 10...)
|
|
barplot(table(tmp$artwork))
|
|
abline(h = 14000, col = "red")
|
|
which(table(tmp$artwork) > 14000)
|
|
|
|
often080 <- names(which(table(tmp$artwork) > 14000))
|
|
|
|
alog <- activitylog(dat[dat$artwork %in% often080, ],
|
|
case_id = "case",
|
|
activity_id = "artwork",
|
|
resource_id = "trace",
|
|
timestamps = c("start", "complete"))
|
|
|
|
map_as_pdf(alog, file = "../figures/pm_case-artwork_often080.pdf")
|
|
|
|
|
|
#--------------- (3.5) Topics ---------------
|
|
|
|
# Are there certain topics that people are interested in more than others?
|
|
# Do these topic distributions differ for comparable artworks?
|
|
|
|
alog <- activitylog(dat[which(dat$event == "openTopic"),],
|
|
case_id = "case",
|
|
activity_id = "topic",
|
|
resource_id = "trace",
|
|
timestamps = c("start", "complete"))
|
|
|
|
map_as_pdf(alog, file = "../figures/pm_case-topic.pdf")
|
|
|
|
# Order of topics for Vermeer
|
|
# alog080 <- activitylog(dat[dat$artwork == "080",],
|
|
# case_id = "case",
|
|
# activity_id = "topic",
|
|
# resource_id = "trace",
|
|
# timestamps = c("start", "complete"))
|
|
#
|
|
# map_as_pdf(alog080, file = "../figures/pm_case-topic_080.pdf")
|
|
#
|
|
#
|
|
# alog080 <- activitylog(dat[dat$artwork == "080",],
|
|
# case_id = "case",
|
|
# activity_id = "topicFile",
|
|
# resource_id = "trace",
|
|
# timestamps = c("start", "complete"))
|
|
#
|
|
# #process_map(alog080, frequency("relative"))
|
|
#
|
|
# # Comparable artwork
|
|
# alog083 <- activitylog(dat[dat$artwork == "083",],
|
|
# case_id = "case",
|
|
# activity_id = "topic",
|
|
# resource_id = "trace",
|
|
# timestamps = c("start", "complete"))
|
|
#
|
|
# map_as_pdf(alog083, file = "../figures/pm_case-topic_083.pdf")
|
|
|
|
# artworks that have the same topics than Vermeer
|
|
which(rowSums(xtabs( ~ artwork + topic, dat[dat$topic %in%
|
|
c("artist", "details", "extra info", "komposition",
|
|
"licht und farbe", "thema"), ]) != 0) == 6)
|
|
|
|
#037 046 062 080 083 109
|
|
|
|
for (art in c("037", "046", "062", "080", "083", "109")) {
|
|
|
|
alog <- activitylog(dat[dat$event == "openTopic" & dat$artwork == art,],
|
|
case_id = "case",
|
|
activity_id = "topic",
|
|
resource_id = "trace",
|
|
timestamps = c("start", "complete"))
|
|
|
|
map_as_pdf(alog, file = paste0("../figures/pm_case-topic_", art, ".pdf"))
|
|
}
|
|
|
|
|
|
|
|
# Angewandte Kunst, Graphik, Gemälde, Kultur
|
|
|
|
|
|
c("Kultur", "Kultur", "Graphik", "Gemälde", "Gemälde", "Gemälde",
|
|
"Gemälde", "Gemälde", "Graphik", "Gemälde", "Angewandte Kunst", "",
|
|
"Gemälde", "Angewandte Kunst", "", "", "Graphik", "Angewandte Kunst",
|
|
"Angewandte Kunst", "Gemälde", "Angewandte Kunst", "Gemälde", "",
|
|
"Gemälde", "Gemälde", "Gemälde", "Graphik", "Gemälde", "Gemälde",
|
|
"Gemälde", "", "Angewandte Kunst", "Angewandte Kunst", "Gemälde",
|
|
"Graphik", "Gemälde", "Gemälde", "Gemälde", "Gemälde",
|
|
"Angewandte Kunst", "Gemälde", "Gemälde", "Gemälde", "Kultur", "Kultur",
|
|
"Gemälde", "Kultur", "", "Gemälde", "", "Graphik", "Kultur", "Gemälde",
|
|
"", "Kultur", "Gemälde", "Kultur", "Gemälde", "Gemälde", "Gemälde",
|
|
"Kultur", "Kultur", "Kultur", "Kultur", "Kultur", "Kultur",
|
|
"Angewandte Kunst", "Info", "Info", "Info", "Kultur", "Kultur")
|
|
|
|
|
|
|
|
# BURSTS
|
|
which.max(table(dat$date))
|
|
tmp <- dat[dat$date == "2017-02-12", ]
|
|
|
|
# number of traces per case on 2017-02-12
|
|
rowSums(xtabs( ~ case + trace, tmp) != 0)
|
|
|
|
range(tmp$start)
|
|
hours <- lubridate::hour(tmp$start)
|
|
xtabs( ~ case + hours, tmp)
|
|
|
|
# distribution of cases over the day
|
|
colSums(xtabs( ~ case + hours, tmp) != 0)
|
|
barplot(colSums(xtabs( ~ case + hours, tmp) != 0))
|
|
|
|
aggregate(trace ~ case + hours, tmp, length)
|
|
|
|
|
|
|
|
|
|
tmp <- aggregate(trace2 ~ case, dat, length)
|
|
tmp$date <- as.Date(dat[!duplicated(dat$case), "start"])
|
|
tmp$time <- lubridate::hour(dat[!duplicated(dat$case), "start"])
|
|
|
|
tmp[tmp$trace2 > 200, ]
|
|
|
|
plot(trace2 ~ time, tmp, cex = 2, col = rgb(0,0,0,.3))
|
|
|
|
lattice::barchart(trace2 ~ time, tmp, horizontal=F)
|
|
|
|
|
|
|
|
###########################################################################
|
|
# HELPER
|
|
|
|
map_as_pdf <- function(alog, file, type = frequency("relative")) {
|
|
map <- process_map(alog, type = type)
|
|
g <- DiagrammeR::grViz(map$x$diagram) |> DiagrammeRsvg::export_svg() |> charToRaw()
|
|
rsvg::rsvg_pdf(g, file)
|
|
}
|
|
|