371 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			R
		
	
	
	
	
	
			
		
		
	
	
			371 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			R
		
	
	
	
	
	
# 00_current_analysis.R
 | 
						|
#
 | 
						|
# content: (1) Read event log data
 | 
						|
#          (2) Descriptives
 | 
						|
#          (3) Process Mining
 | 
						|
#
 | 
						|
# input:  ../data/haum/event_logfiles_glossar_2023-10-29_10-26-42.csv
 | 
						|
# output: 
 | 
						|
#
 | 
						|
# last mod: 2023-11-02, NW
 | 
						|
 | 
						|
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
 | 
						|
 | 
						|
#library(mtt)
 | 
						|
devtools::load_all("../../../../software/mtt")
 | 
						|
library(lattice)
 | 
						|
library(bupaverse)
 | 
						|
 | 
						|
# Overall Research Question: How do museum visitors interact with the
 | 
						|
# artworks presented on the MTT?
 | 
						|
 | 
						|
 | 
						|
 | 
						|
# What about popups?
 | 
						|
 | 
						|
 | 
						|
# Distribution of bursts
 | 
						|
# Can this be visualized in a nice way?
 | 
						|
 | 
						|
#--------------- (1) Read data ---------------
 | 
						|
 | 
						|
dat <- read.table("../data/haum/event_logfiles_glossar_2023-10-29_10-26-42.csv",
 | 
						|
                  sep = ";", header = TRUE)
 | 
						|
dat$date <- as.POSIXct(dat$date)
 | 
						|
dat$date.start <- as.POSIXct(dat$date.start)
 | 
						|
dat$date.stop <- as.POSIXct(dat$date.stop)
 | 
						|
dat$artwork <- sprintf("%03d", dat$artwork)
 | 
						|
dat$event <- factor(dat$event, levels = c("move", "flipCard", "openTopic", "openPopup"))
 | 
						|
 | 
						|
# Add weekdays to data frame
 | 
						|
 | 
						|
dat$weekdays <- factor(weekdays(dat$date.start),
 | 
						|
                       levels = c("Montag", "Dienstag", "Mittwoch",
 | 
						|
                                  "Donnerstag", "Freitag", "Samstag",
 | 
						|
                                  "Sonntag"),
 | 
						|
                       labels = c("Monday", "Tuesday", "Wednesday",
 | 
						|
                                  "Thursday", "Friday", "Saturday",
 | 
						|
                                  "Sunday"))
 | 
						|
 | 
						|
names(dat)[names(dat) %in% c("date.start", "date.stop")] <- c("start", "complete")
 | 
						|
 | 
						|
#--------------- (2) Descriptives ---------------
 | 
						|
# How many events per topic, per trace, ...
 | 
						|
# How many popups per artwork?
 | 
						|
 | 
						|
# Number of events per artwork
 | 
						|
tab <- xtabs( ~ artwork + event, dat)
 | 
						|
addmargins(tab)
 | 
						|
 | 
						|
proportions(tab, margin = "artwork")
 | 
						|
proportions(tab, margin = "event")
 | 
						|
 | 
						|
barplot(t(proportions(tab, margin = "artwork")), las = 2)
 | 
						|
 | 
						|
# Proportion of events
 | 
						|
proportions(xtabs( ~ event, dat))
 | 
						|
# Mean proportion of event per trace
 | 
						|
colMeans(proportions(xtabs( ~ trace + event, dat), margin = "trace"))
 | 
						|
# Mean proportion of event per artwork
 | 
						|
colMeans(proportions(tab, margin = "artwork"))
 | 
						|
 | 
						|
# Proportion of unclosed events
 | 
						|
 | 
						|
nrow(dat[is.na(dat$complete), ])
 | 
						|
nrow(dat[is.na(dat$complete), ]) / nrow(dat)
 | 
						|
 | 
						|
 | 
						|
#--------------- (3) Process Mining ---------------
 | 
						|
 | 
						|
#--------------- (3.1) Check data quality ---------------
 | 
						|
 | 
						|
alog <- activitylog(dat,
 | 
						|
                    case_id = "trace",
 | 
						|
                    activity_id = "event",
 | 
						|
                    #resource_id = "case",
 | 
						|
                    resource_id = "artwork",
 | 
						|
                    timestamps = c("start", "complete"))
 | 
						|
 | 
						|
# process_map(alog, frequency("relative"))
 | 
						|
map_as_pdf(alog, file = "../figures/pm_trace-event.pdf")
 | 
						|
 | 
						|
#--------------- (3.2) Interactions for different artworks ---------------
 | 
						|
# Do interaction patterns for events per trace look different for different
 | 
						|
# artworks?
 | 
						|
 | 
						|
which.max(table(dat$artwork))
 | 
						|
which.min(table(dat$artwork))
 | 
						|
which.min(table(dat$artwork)[-c(71,72)])
 | 
						|
 | 
						|
alog080 <- activitylog(dat[dat$artwork == "080",],
 | 
						|
                    case_id = "trace",
 | 
						|
                    activity_id = "event",
 | 
						|
                    resource_id = "artwork",
 | 
						|
                    timestamps = c("start", "complete"))
 | 
						|
 | 
						|
map_as_pdf(alog080, file = "../figures/pm_trace-event_080.pdf")
 | 
						|
 | 
						|
alog087 <- activitylog(dat[dat$artwork == "087",],
 | 
						|
                    case_id = "trace",
 | 
						|
                    activity_id = "event",
 | 
						|
                    resource_id = "artwork",
 | 
						|
                    timestamps = c("start", "complete"))
 | 
						|
 | 
						|
map_as_pdf(alog087, file = "../figures/pm_trace-event_087.pdf")
 | 
						|
 | 
						|
alog504 <- activitylog(dat[dat$artwork == "504",],
 | 
						|
                    case_id = "trace",
 | 
						|
                    activity_id = "event",
 | 
						|
                    resource_id = "artwork",
 | 
						|
                    timestamps = c("start", "complete"))
 | 
						|
 | 
						|
map_as_pdf(alog504, file = "../figures/pm_trace-event_504.pdf")
 | 
						|
 | 
						|
#--------------- (3.3) Patterns of cases ---------------
 | 
						|
 | 
						|
# What kind of patterns do we have? Are their typical sequences for cases?
 | 
						|
# Do case patterns look different for ...
 | 
						|
# ... mornings and afternoons?
 | 
						|
# ... weekdays and weekends?
 | 
						|
# ... weekdays for "normal" and school vacation days?
 | 
						|
# ... pre and post corona?
 | 
						|
 | 
						|
alog <- activitylog(dat,
 | 
						|
                    case_id = "case",
 | 
						|
                    activity_id = "event",
 | 
						|
                    resource_id = "trace",
 | 
						|
                    timestamps = c("start", "complete"))
 | 
						|
 | 
						|
map_as_pdf(alog, file = "../figures/pm_case-event.pdf")
 | 
						|
 | 
						|
alog080 <- activitylog(dat[dat$artwork == "080",],
 | 
						|
                       case_id = "case",
 | 
						|
                       activity_id = "event",
 | 
						|
                       resource_id = "trace",
 | 
						|
                       timestamps = c("start", "complete"))
 | 
						|
 | 
						|
map_as_pdf(alog080, file = "../figures/pm_case-event_080.pdf")
 | 
						|
 | 
						|
alog087 <- activitylog(dat[dat$artwork == "087",],
 | 
						|
                       case_id = "case",
 | 
						|
                       activity_id = "event",
 | 
						|
                       resource_id = "trace",
 | 
						|
                       timestamps = c("start", "complete"))
 | 
						|
 | 
						|
map_as_pdf(alog087, file = "../figures/pm_case-event_087.pdf")
 | 
						|
 | 
						|
### Mornings and afternoons
 | 
						|
 | 
						|
dat$tod <- ifelse(lubridate::hour(dat$start) > 13, "afternoon", "morning")
 | 
						|
 | 
						|
alog <- activitylog(dat[dat$tod == "morning",],
 | 
						|
                    case_id = "case",
 | 
						|
                    activity_id = "event",
 | 
						|
                    resource_id = "trace",
 | 
						|
                    timestamps = c("start", "complete"))
 | 
						|
 | 
						|
map_as_pdf(alog, file = "../figures/pm_case-event_morning.pdf")
 | 
						|
 | 
						|
alog <- activitylog(dat[dat$tod == "afternoon",],
 | 
						|
                    case_id = "case",
 | 
						|
                    activity_id = "event",
 | 
						|
                    resource_id = "trace",
 | 
						|
                    timestamps = c("start", "complete"))
 | 
						|
 | 
						|
map_as_pdf(alog, file = "../figures/pm_case-event_afternoon.pdf")
 | 
						|
 | 
						|
### Weekdays and weekends
 | 
						|
 | 
						|
dat$wd <- ifelse(dat$weekdays %in% c("Saturday", "Sunday"), "weekend", "weekday")
 | 
						|
 | 
						|
alog <- activitylog(dat[dat$wd == "weekend",],
 | 
						|
                    case_id = "case",
 | 
						|
                    activity_id = "event",
 | 
						|
                    resource_id = "trace",
 | 
						|
                    timestamps = c("start", "complete"))
 | 
						|
 | 
						|
map_as_pdf(alog, file = "../figures/pm_case-event_weekend.pdf")
 | 
						|
 | 
						|
alog <- activitylog(dat[dat$wd == "weekday",],
 | 
						|
                    case_id = "case",
 | 
						|
                    activity_id = "event",
 | 
						|
                    resource_id = "trace",
 | 
						|
                    timestamps = c("start", "complete"))
 | 
						|
 | 
						|
map_as_pdf(alog, file = "../figures/pm_case-event_weekday.pdf")
 | 
						|
 | 
						|
### Weekdays vs. school vacation weekdays
 | 
						|
 | 
						|
dat$wds <- ifelse(!is.na(dat$vacation), "vacation", "school")
 | 
						|
dat$wds[dat$wd == "weekend"] <- NA
 | 
						|
 | 
						|
alog <- activitylog(dat[which(dat$wds == "school"),],
 | 
						|
                    case_id = "case",
 | 
						|
                    activity_id = "event",
 | 
						|
                    resource_id = "trace",
 | 
						|
                    timestamps = c("start", "complete"))
 | 
						|
 | 
						|
map_as_pdf(alog, file = "../figures/pm_case-event_school.pdf")
 | 
						|
 | 
						|
alog <- activitylog(dat[which(dat$wds == "vacation"),],
 | 
						|
                    case_id = "case",
 | 
						|
                    activity_id = "event",
 | 
						|
                    resource_id = "trace",
 | 
						|
                    timestamps = c("start", "complete"))
 | 
						|
 | 
						|
map_as_pdf(alog, file = "../figures/pm_case-event_vacation.pdf")
 | 
						|
 | 
						|
### Pre and post Corona
 | 
						|
 | 
						|
dat$corona <- ifelse(dat$date < "2020-03-14", "pre", "post")
 | 
						|
 | 
						|
alog <- activitylog(dat[which(dat$corona == "pre"),],
 | 
						|
                    case_id = "case",
 | 
						|
                    activity_id = "event",
 | 
						|
                    resource_id = "trace",
 | 
						|
                    timestamps = c("start", "complete"))
 | 
						|
 | 
						|
map_as_pdf(alog, file = "../figures/pm_case-event_pre-corona.pdf")
 | 
						|
 | 
						|
alog <- activitylog(dat[which(dat$corona == "post"),],
 | 
						|
                    case_id = "case",
 | 
						|
                    activity_id = "event",
 | 
						|
                    resource_id = "trace",
 | 
						|
                    timestamps = c("start", "complete"))
 | 
						|
 | 
						|
map_as_pdf(alog, file = "../figures/pm_case-event_post-corona.pdf")
 | 
						|
 | 
						|
#--------------- (3.4) Artwork sequences ---------------
 | 
						|
# Order in which artworks are looked at
 | 
						|
 | 
						|
nart <- 5     # select 5 artworks randomly
 | 
						|
alog <- activitylog(dat,#[dat$artwork %in% sample(unique(dat$artwork), nart), ],
 | 
						|
                    case_id = "case",
 | 
						|
                    activity_id = "artwork",
 | 
						|
                    resource_id = "trace",
 | 
						|
                    timestamps = c("start", "complete"))
 | 
						|
 | 
						|
#map <- process_map(alog, frequency("relative"))
 | 
						|
 | 
						|
## select cases with Vermeer
 | 
						|
length(unique(dat[dat$artwork == "080", "case"]))
 | 
						|
# 12615
 | 
						|
case080 <- unique(dat[dat$artwork == "080", "case"])
 | 
						|
tmp <- dat[dat$case %in% case080, ]
 | 
						|
table(tmp$artwork)
 | 
						|
# --> all :)
 | 
						|
 | 
						|
# select the ones most often (I am aiming for 10...)
 | 
						|
barplot(table(tmp$artwork))
 | 
						|
abline(h = 12000, col = "red")
 | 
						|
which(table(tmp$artwork) > 12000)
 | 
						|
 | 
						|
often080 <- names(which(table(tmp$artwork) > 12000))
 | 
						|
 | 
						|
alog <- activitylog(dat[dat$artwork %in% often080, ],
 | 
						|
                    case_id = "case",
 | 
						|
                    activity_id = "artwork",
 | 
						|
                    resource_id = "trace",
 | 
						|
                    timestamps = c("start", "complete"))
 | 
						|
 | 
						|
map_as_pdf(alog, file = "../figures/pm_case-artwork_often080.pdf")
 | 
						|
 | 
						|
 | 
						|
#--------------- (3.5) Topics ---------------
 | 
						|
 | 
						|
# Are there certain topics that people are interested in more than others?
 | 
						|
# Do these topic distributions differ for comparable artworks?
 | 
						|
 | 
						|
# Order of topics for Vermeer
 | 
						|
# alog080 <- activitylog(dat[dat$artwork == "080",],
 | 
						|
#                     case_id = "case",
 | 
						|
#                     activity_id = "topic",
 | 
						|
#                     resource_id = "trace",
 | 
						|
#                     timestamps = c("start", "complete"))
 | 
						|
# 
 | 
						|
# map_as_pdf(alog080, file = "../figures/pm_case-topic_080.pdf")
 | 
						|
# 
 | 
						|
# 
 | 
						|
# alog080 <- activitylog(dat[dat$artwork == "080",],
 | 
						|
#                        case_id = "case",
 | 
						|
#                        activity_id = "topicFile",
 | 
						|
#                        resource_id = "trace",
 | 
						|
#                        timestamps = c("start", "complete"))
 | 
						|
# 
 | 
						|
# #process_map(alog080, frequency("relative"))
 | 
						|
# 
 | 
						|
# # Comparable artwork
 | 
						|
# alog083 <- activitylog(dat[dat$artwork == "083",],
 | 
						|
#                        case_id = "case",
 | 
						|
#                        activity_id = "topic",
 | 
						|
#                        resource_id = "trace",
 | 
						|
#                        timestamps = c("start", "complete"))
 | 
						|
# 
 | 
						|
# map_as_pdf(alog083, file = "../figures/pm_case-topic_083.pdf")
 | 
						|
 | 
						|
# artworks that have the same topics than Vermeer
 | 
						|
which(rowSums(xtabs( ~ artwork + topic, dat[dat$topic %in%
 | 
						|
                    c("artist", "details", "extra info", "komposition",
 | 
						|
                      "licht und farbe", "thema"), ]) != 0) == 6)
 | 
						|
 | 
						|
#037 046 062 080 083 109 
 | 
						|
 | 
						|
for (art in c("037", "046", "062", "080", "083", "109")) {
 | 
						|
 | 
						|
  alog <- activitylog(dat[dat$artwork == art,],
 | 
						|
                      case_id = "case",
 | 
						|
                      activity_id = "topic",
 | 
						|
                      resource_id = "trace",
 | 
						|
                      timestamps = c("start", "complete"))
 | 
						|
  
 | 
						|
  map_as_pdf(alog, file = paste0("../figures/pm_case-topic_", art, ".pdf"))
 | 
						|
}
 | 
						|
 | 
						|
 | 
						|
 | 
						|
# Angewandte Kunst, Graphik, Gemälde, Kultur
 | 
						|
 | 
						|
 | 
						|
c("Kultur", "Kultur", "Graphik", "Gemälde", "Gemälde", "Gemälde",
 | 
						|
  "Gemälde", "Gemälde", "Graphik", "Gemälde", "Angewandte Kunst", "",
 | 
						|
  "Gemälde", "Angewandte Kunst", "", "", "Graphik", "Angewandte Kunst",
 | 
						|
  "Angewandte Kunst", "Gemälde", "Angewandte Kunst", "Gemälde", "",
 | 
						|
  "Gemälde", "Gemälde", "Gemälde", "Graphik", "Gemälde", "Gemälde",
 | 
						|
  "Gemälde", "", "Angewandte Kunst", "Angewandte Kunst", "Gemälde",
 | 
						|
  "Graphik", "Gemälde", "Gemälde", "Gemälde", "Gemälde",
 | 
						|
  "Angewandte Kunst", "Gemälde", "Gemälde", "Gemälde", "Kultur", "Kultur",
 | 
						|
  "Gemälde", "Kultur", "", "Gemälde", "", "Graphik", "Kultur", "Gemälde",
 | 
						|
  "", "Kultur", "Gemälde", "Kultur", "Gemälde", "Gemälde", "Gemälde",
 | 
						|
  "Kultur", "Kultur", "Kultur", "Kultur", "Kultur", "Kultur",
 | 
						|
  "Angewandte Kunst", "Info", "Info", "Info", "Kultur", "Kultur")
 | 
						|
 | 
						|
 | 
						|
 | 
						|
# BURSTS
 | 
						|
which.max(table(dat$date))
 | 
						|
tmp <- dat[dat$date == "2017-02-12", ]
 | 
						|
 | 
						|
# number of traces per case on 2017-02-12
 | 
						|
rowSums(xtabs( ~ case + trace, tmp) != 0)
 | 
						|
 | 
						|
range(tmp$start)
 | 
						|
startcut <- cut(tmp$start, 7)
 | 
						|
xtabs( ~ case + startcut, tmp)
 | 
						|
 | 
						|
# distribution of cases over the day
 | 
						|
colSums(xtabs( ~ case + startcut, tmp) != 0)
 | 
						|
barplot(colSums(xtabs( ~ case + startcut, tmp) != 0))
 | 
						|
 | 
						|
aggregate(trace ~ case + startcut, tmp, length)
 | 
						|
 | 
						|
 | 
						|
###########################################################################
 | 
						|
# HELPER
 | 
						|
 | 
						|
map_as_pdf <- function(alog, file, type = frequency("relative")) {
 | 
						|
  map <- process_map(alog, type = type)
 | 
						|
  g <- DiagrammeR::grViz(map$x$diagram) |> DiagrammeRsvg::export_svg() |> charToRaw()
 | 
						|
  rsvg::rsvg_pdf(g, file)
 | 
						|
}
 | 
						|
 |