mtt_haum/code/02_descriptives.R

530 lines
18 KiB
R

# 02_descriptives.R
#
# content: (1) Read data
# (2) Descriptives
# (3) Process Mining
# (3.1) Check data quality
# (3.2) Interactions for different artworks
# (3.3) Patterns of cases
# (3.4) Artwork sequences
# (3.5) Topics
#
# input: results/haum/event_logfiles_2024-02-21_16-07-33.csv
# output:
#
# last mod: 2023-11-15, NW
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
library(lattice)
library(bupaverse)
# Overall Research Question: How do museum visitors interact with the
# artworks presented on the MTT?
# Distribution of bursts
# Can this be visualized in a nice way?
#--------------- (1) Read data ---------------
datlogs <- read.table("results/haum/event_logfiles_2024-02-21_16-07-33.csv",
colClasses = c("character", "character", "POSIXct",
"POSIXct", "character", "integer",
"numeric", "character", "character",
rep("numeric", 3), "character",
"character", rep("numeric", 11),
"character", "character"),
sep = ";", header = TRUE)
datlogs$event <- factor(datlogs$event, levels = c("move", "flipCard",
"openTopic",
"openPopup"))
datraw <- read.table("results/haum/raw_logfiles_2024-01-18_09-58-52.csv",
sep = ";", header = TRUE)
# Add weekdays to data frame
datlogs$weekdays <- factor(weekdays(datlogs$date.start),
levels = c("Montag", "Dienstag", "Mittwoch",
"Donnerstag", "Freitag", "Samstag",
"Sonntag"),
labels = c("Monday", "Tuesday", "Wednesday",
"Thursday", "Friday", "Saturday",
"Sunday"))
### Number of log files
length(unique(datraw$fileId))
# 39767
length(unique(c(datlogs$fileId.start, datlogs$fileId.stop)))
# 22789
### Number of activities
nrow(datlogs)
table(datlogs$event)
proportions(table(datlogs$event))
proportions(table(datlogs$event[datlogs$event != "move"]))
### Time range
range(as.Date(datlogs$date.start))
### Topics per item
print(xtabs( ~ item + topic, datlogs), zero = "-")
lattice::dotplot(xtabs( ~ item + topic, datlogs), auto.key = TRUE)
mat <- t(as.matrix(xtabs( ~ item + topic, datlogs)))
mat[mat == 0] <- NA
image(mat, axes = F, col = rainbow(100))
heatmap(t(mat))
datlogs$start <- datlogs$date.start
datlogs$complete <- datlogs$date.stop
#--------------- (2) Descriptives ---------------
# How many events per topic, per path, ...
# How many popups per artwork?
# Number of events per artwork
tab <- xtabs( ~ artwork + event, datlogs)
addmargins(tab)
proportions(tab, margin = "artwork")
proportions(tab, margin = "event")
cc <- palette.colors(palette = "Okabe-Ito")[c(3,2,4,8)]
pdf("../figures/event-dist.pdf", height = 3.375, width = 12, pointsize = 10)
par(mai = c(.4,.4,.1,.1), mgp = c(2.4, 1, 0))
barplot(t(proportions(tab, margin = "artwork")), las = 2, col = cc,
legend.text = levels(datlogs$event), args.legend = list(x = "bottomleft", bg = "white"))
dev.off()
#barchart(proportions(tab, margin = "artwork"), las = 2)
# Proportion of events
proportions(xtabs( ~ event, datlogs))
# Mean proportion of event per path
colMeans(proportions(xtabs( ~ path + event, datlogs), margin = "path"))
# Mean proportion of event per artwork
colMeans(proportions(tab, margin = "artwork"))
# Proportion of unclosed events
nrow(datlogs[is.na(datlogs$complete), ])
nrow(datlogs[is.na(datlogs$complete), ]) / nrow(datlogs)
# Proportion of events spanning more than one log file
sum(datlogs$fileId.start != datlogs$fileId.stop, na.rm = TRUE)
sum(datlogs$fileId.start != datlogs$fileId.stop, na.rm = TRUE) / nrow(datlogs)
#--------------- (3) Process Mining ---------------
#--------------- (3.1) Check data quality ---------------
alog <- activitylog(datlogs,
case_id = "path",
activity_id = "event",
#resource_id = "case",
resource_id = "artwork",
timestamps = c("start", "complete"))
# process_map(alog, frequency("relative"))
map_as_pdf(alog, file = "../figures/pm_trace-event.pdf")
alogf <- edeaR::filter_trace_frequency(alog, percentage = 0.9)
processmapR::process_map(alogf, # alog,
type_nodes = processmapR::frequency("absolute"),
sec_nodes = processmapR::frequency("relative"),
type_edges = processmapR::frequency("absolute"),
sec_edges = processmapR::frequency("relative"),
rankdir = "TB")
alog_no_move <- alog[alog$event != "move", ]
pdf("../figures/traceexplore_trace-event.pdf", height = 8, width = 12, pointsize = 10)
set.seed(1447)
processmapR::trace_explorer(alog_no_move[alog_no_move$path %in%
sample(unique(alog_no_move$path), 400),],
coverage = 1, type = "frequent",
abbreviate = T)
dev.off()
pdf("../figures/ra_trace-event.pdf", height = 8, width = 12, pointsize = 10)
ra_no_move <- edeaR::resource_frequency(alog_no_move, "resource-activity")
levels(ra_no_move$event) <- c("flipCard", "flipCard", "openTopic", "openPopup")
plot(ra_no_move)
dev.off()
ra <- edeaR::resource_frequency(alog, "resource-activity")
plot(ra)
heatmap(xtabs(relative_activity ~ artwork + event, ra))
heatmap(xtabs(relative_resource ~ artwork + event, ra_no_move))
heatmap(xtabs(relative_activity ~ artwork + event, ra_no_move))
aggregate(relative_activity ~ event, ra, sum)
aggregate(relative_resource ~ artwork, ra, sum)
#--------------- (3.2) Interactions for different artworks ---------------
# Do interaction patterns for events per trace look different for different
# artworks?
which.max(table(datlogs$artwork))
which.min(table(datlogs$artwork))
which.min(table(datlogs$artwork)[-c(71,72)])
alog080 <- activitylog(datlogs[datlogs$artwork == "080",],
case_id = "path",
activity_id = "event",
resource_id = "artwork",
timestamps = c("start", "complete"))
map_as_pdf(alog080, file = "../figures/pm_trace-event_080.pdf")
alog087 <- activitylog(datlogs[datlogs$artwork == "087",],
case_id = "path",
activity_id = "event",
resource_id = "artwork",
timestamps = c("start", "complete"))
map_as_pdf(alog087, file = "../figures/pm_trace-event_087.pdf")
alog504 <- activitylog(datlogs[datlogs$artwork == "504",],
case_id = "path",
activity_id = "event",
resource_id = "artwork",
timestamps = c("start", "complete"))
map_as_pdf(alog504, file = "../figures/pm_trace-event_504.pdf")
#--------------- (3.3) Patterns of cases ---------------
# What kind of patterns do we have? Are their typical sequences for cases?
# Do case patterns look different for ...
# ... mornings and afternoons?
# ... weekdays and weekends?
# ... weekdays for "normal" and school vacation days?
# ... pre and post corona?
alog <- activitylog(datlogs,
case_id = "case",
activity_id = "event",
resource_id = "path",
timestamps = c("start", "complete"))
map_as_pdf(alog, file = "../figures/pm_case-event.pdf")
alog_no_move <- alog[alog$event != "move", ]
pdf("../figures/traceexplore_case-event.pdf", height = 8, width = 12, pointsize = 10)
set.seed(1050)
processmapR::trace_explorer(alog_no_move[alog_no_move$path %in%
sample(unique(alog_no_move$path), 300),],
coverage = 1, type = "frequent",
abbreviate = T)
dev.off()
map_as_pdf(alog080, file = "../figures/pm_case-event_080.pdf")
alog087 <- activitylog(datlogs[datlogs$artwork == "087",],
case_id = "case",
activity_id = "event",
resource_id = "path",
timestamps = c("start", "complete"))
map_as_pdf(alog087, file = "../figures/pm_case-event_087.pdf")
### Mornings and afternoons
datlogs$tod <- ifelse(lubridate::hour(datlogs$start) > 13, "afternoon", "morning")
alog <- activitylog(datlogs[datlogs$tod == "morning",],
case_id = "case",
activity_id = "event",
resource_id = "path",
timestamps = c("start", "complete"))
map_as_pdf(alog, file = "../figures/pm_case-event_morning.pdf")
alog <- activitylog(datlogs[datlogs$tod == "afternoon",],
case_id = "case",
activity_id = "event",
resource_id = "path",
timestamps = c("start", "complete"))
map_as_pdf(alog, file = "../figures/pm_case-event_afternoon.pdf")
# Are the same artworks looked at?
pdf("../figures/bp_tod.pdf", height = 3.375, width = 12, pointsize = 10)
par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0))
barplot(proportions(xtabs( ~ tod + artwork, datlogs), margin = "tod"), #col = cc[1:2],
las = 2, beside = TRUE, legend = c("afternoon", "morning"),
args.legend = list(x = "topleft"))
dev.off()
### Weekdays and weekends
datlogs$wd <- ifelse(datlogs$weekdays %in% c("Saturday", "Sunday"), "weekend", "weekday")
alog <- activitylog(datlogs[datlogs$wd == "weekend",],
case_id = "case",
activity_id = "event",
resource_id = "path",
timestamps = c("start", "complete"))
map_as_pdf(alog, file = "../figures/pm_case-event_weekend.pdf")
alog <- activitylog(datlogs[datlogs$wd == "weekday",],
case_id = "case",
activity_id = "event",
resource_id = "path",
timestamps = c("start", "complete"))
map_as_pdf(alog, file = "../figures/pm_case-event_weekday.pdf")
# Are the same artworks looked at?
pdf("../figures/bp_wd.pdf", height = 3.375, width = 12, pointsize = 10)
par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0))
barplot(proportions(xtabs( ~ wd + artwork, datlogs), margin = "wd"),
las = 2, beside = TRUE, legend = c("weekday", "weekend"),
args.legend = list(x = "topleft"))
dev.off()
### Weekdays vs. school vacation weekdays
datlogs$wds <- ifelse(!is.na(datlogs$vacation), "vacation", "school")
datlogs$wds[datlogs$wd == "weekend"] <- NA
alog <- activitylog(datlogs[which(datlogs$wds == "school"),],
case_id = "case",
activity_id = "event",
resource_id = "path",
timestamps = c("start", "complete"))
map_as_pdf(alog, file = "../figures/pm_case-event_school.pdf")
alog <- activitylog(datlogs[which(datlogs$wds == "vacation"),],
case_id = "case",
activity_id = "event",
resource_id = "path",
timestamps = c("start", "complete"))
map_as_pdf(alog, file = "../figures/pm_case-event_vacation.pdf")
# Are the same artworks looked at?
pdf("../figures/bp_wds.pdf", height = 3.375, width = 12, pointsize = 10)
par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0))
#barplot(xtabs( ~ wds + artwork, datlogs), las = 2, beside = TRUE,
barplot(proportions(xtabs( ~ wds + artwork, datlogs), margin = "wds"),
las = 2, beside = TRUE,
legend = c("school", "vacation"), args.legend = list(x = "topleft"))
dev.off()
### Pre and post Corona
datlogs$corona <- ifelse(datlogs$date < "2020-03-14", "pre", "post")
alog <- activitylog(datlogs[which(datlogs$corona == "pre"),],
case_id = "case",
activity_id = "event",
resource_id = "path",
timestamps = c("start", "complete"))
map_as_pdf(alog, file = "../figures/pm_case-event_pre-corona.pdf")
alog <- activitylog(datlogs[which(datlogs$corona == "post"),],
case_id = "case",
activity_id = "event",
resource_id = "path",
timestamps = c("start", "complete"))
map_as_pdf(alog, file = "../figures/pm_case-event_post-corona.pdf")
# Are the same artworks looked at?
pdf("../figures/bp_corona.pdf", height = 3.375, width = 12, pointsize = 10)
par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0))
barplot(proportions(xtabs( ~ corona + artwork, datlogs), margin = "corona"),
las = 2, beside = TRUE,
legend = c("post", "pre"), args.legend = list(x = "topleft"))
dev.off()
#--------------- (3.4) Artwork sequences ---------------
# Order in which artworks are looked at
nart <- 5 # select 5 artworks randomly
alog <- activitylog(datlogs,#[datlogs$artwork %in% sample(unique(datlogs$artwork), nart), ],
case_id = "case",
activity_id = "artwork",
resource_id = "path",
timestamps = c("start", "complete"))
#map <- process_map(alog, frequency("relative"))
## select cases with Vermeer
length(unique(datlogs[datlogs$artwork == "080", "case"]))
# 12615
case080 <- unique(datlogs[datlogs$artwork == "080", "case"])
tmp <- datlogs[datlogs$case %in% case080, ]
table(tmp$artwork)
# --> all :)
# select the ones most often (I am aiming for 10...)
barplot(table(tmp$artwork))
abline(h = 14000, col = "red")
which(table(tmp$artwork) > 14000)
often080 <- names(which(table(tmp$artwork) > 14000))
alog <- activitylog(datlogs[datlogs$artwork %in% often080, ],
case_id = "case",
activity_id = "artwork",
resource_id = "path",
timestamps = c("start", "complete"))
map_as_pdf(alog, file = "../figures/pm_case-artwork_often080.pdf")
pdf("../figures/traceexplore_case-artwork_often080.pdf", height = 8, width = 12, pointsize = 10)
processmapR::trace_explorer(alog,
n_traces = 30, type = "frequent",
abbreviate = TRUE)
dev.off()
#--------------- (3.5) Topics ---------------
# Are there certain topics that people are interested in more than others?
# Do these topic distributions differ for comparable artworks?
alog <- activitylog(datlogs[which(datlogs$event == "openTopic"),],
case_id = "case",
activity_id = "topic",
resource_id = "path",
timestamps = c("start", "complete"))
map_as_pdf(alog, file = "../figures/pm_case-topic.pdf")
# Order of topics for Vermeer
# alog080 <- activitylog(datlogs[datlogs$artwork == "080",],
# case_id = "case",
# activity_id = "topic",
# resource_id = "path",
# timestamps = c("start", "complete"))
#
# map_as_pdf(alog080, file = "../figures/pm_case-topic_080.pdf")
#
#
# alog080 <- activitylog(datlogs[datlogs$artwork == "080",],
# case_id = "case",
# activity_id = "topicFile",
# resource_id = "path",
# timestamps = c("start", "complete"))
#
# #process_map(alog080, frequency("relative"))
#
# # Comparable artwork
# alog083 <- activitylog(datlogs[datlogs$artwork == "083",],
# case_id = "case",
# activity_id = "topic",
# resource_id = "path",
# timestamps = c("start", "complete"))
#
# map_as_pdf(alog083, file = "../figures/pm_case-topic_083.pdf")
# artworks that have the same topics than Vermeer
which(rowSums(xtabs( ~ artwork + topic, datlogs[datlogs$topic %in%
c("artist", "details", "extra info", "komposition",
"licht und farbe", "thema"), ]) != 0) == 6)
#037 046 062 080 083 109
for (art in c("037", "046", "062", "080", "083", "109")) {
alog <- activitylog(datlogs[datlogs$event == "openTopic" & datlogs$artwork == art,],
case_id = "case",
activity_id = "topic",
resource_id = "path",
timestamps = c("start", "complete"))
map_as_pdf(alog, file = paste0("../figures/pm_case-topic_", art, ".pdf"))
}
# Angewandte Kunst, Graphik, Gemälde, Kultur
c("Kultur", "Kultur", "Graphik", "Gemälde", "Gemälde", "Gemälde",
"Gemälde", "Gemälde", "Graphik", "Gemälde", "Angewandte Kunst", "",
"Gemälde", "Angewandte Kunst", "", "", "Graphik", "Angewandte Kunst",
"Angewandte Kunst", "Gemälde", "Angewandte Kunst", "Gemälde", "",
"Gemälde", "Gemälde", "Gemälde", "Graphik", "Gemälde", "Gemälde",
"Gemälde", "", "Angewandte Kunst", "Angewandte Kunst", "Gemälde",
"Graphik", "Gemälde", "Gemälde", "Gemälde", "Gemälde",
"Angewandte Kunst", "Gemälde", "Gemälde", "Gemälde", "Kultur", "Kultur",
"Gemälde", "Kultur", "", "Gemälde", "", "Graphik", "Kultur", "Gemälde",
"", "Kultur", "Gemälde", "Kultur", "Gemälde", "Gemälde", "Gemälde",
"Kultur", "Kultur", "Kultur", "Kultur", "Kultur", "Kultur",
"Angewandte Kunst", "Info", "Info", "Info", "Kultur", "Kultur")
# BURSTS
which.max(table(datlogs$date))
tmp <- datlogs[datlogs$date == "2017-02-12", ]
# number of traces per case on 2017-02-12
rowSums(xtabs( ~ case + path, tmp) != 0)
range(tmp$start)
hours <- lubridate::hour(tmp$start)
xtabs( ~ case + hours, tmp)
# distribution of cases over the day
colSums(xtabs( ~ case + hours, tmp) != 0)
barplot(colSums(xtabs( ~ case + hours, tmp) != 0))
aggregate(path ~ case + hours, tmp, length)
tmp <- aggregate(path ~ case, datlogs, length)
tmp$date <- as.Date(datlogs[!duplicated(datlogs$case), "start"])
tmp$time <- lubridate::hour(datlogs[!duplicated(datlogs$case), "start"])
tmp[tmp$path > 200, ]
plot(path ~ time, tmp, cex = 2, col = rgb(0,0,0,.3))
lattice::barchart(path ~ time, tmp, horizontal=F)
###########################################################################
# HELPER
map_as_pdf <- function(alog, file, type = frequency("relative")) {
map <- process_map(alog, type = type)
g <- DiagrammeR::grViz(map$x$diagram) |> DiagrammeRsvg::export_svg() |> charToRaw()
rsvg::rsvg_pdf(g, file)
}