Next round of script cleaning

This commit is contained in:
Nora Wickelmaier 2024-01-31 12:09:27 +01:00
parent 76af291686
commit 7d67482e0c
7 changed files with 124 additions and 409 deletions

View File

@ -27,46 +27,66 @@ library(bupaverse)
#--------------- (1) Read data --------------- #--------------- (1) Read data ---------------
dat <- read.table("results/haum/event_logfiles_glossar_2023-12-28_09-49-43.csv", datlogs <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv",
sep = ";", header = TRUE, colClasses = c("character", "character", "POSIXct",
colClasses = c("POSIXct", "character", "integer", "POSIXct", "character", "integer",
"integer", "numeric", "integer", "numeric", "character", "character",
"character", "character", "character", rep("numeric", 3), "character",
"character", "POSIXct", "POSIXct", "character", rep("numeric", 11),
"numeric", "numeric", "numeric", "character", "character"),
"integer", "character", sep = ";", header = TRUE)
rep("numeric", 11), "integer",
"character", "character", "logical",
"logical", "logical", "character",
"character"))
dat$date <- NULL datlogs$event <- factor(datlogs$event, levels = c("move", "flipCard",
# TODO: Remove, after rerunning preprocessing "openTopic",
"openPopup"))
dat$event <- factor(dat$event, levels = c("move", "flipCard", "openTopic", "openPopup")) datraw <- read.table("results/haum/raw_logfiles_2024-01-18_09-58-52.csv",
sep = ";", header = TRUE)
# Add weekdays to data frame # Add weekdays to data frame
dat$weekdays <- factor(weekdays(dat$date.start), datlogs$weekdays <- factor(weekdays(datlogs$date.start),
levels = c("Montag", "Dienstag", "Mittwoch", levels = c("Montag", "Dienstag", "Mittwoch",
"Donnerstag", "Freitag", "Samstag", "Donnerstag", "Freitag", "Samstag",
"Sonntag"), "Sonntag"),
labels = c("Monday", "Tuesday", "Wednesday", labels = c("Monday", "Tuesday", "Wednesday",
"Thursday", "Friday", "Saturday", "Thursday", "Friday", "Saturday",
"Sunday")) "Sunday"))
### Number of log files
length(unique(datraw$fileId))
# 39767
length(unique(c(datlogs$fileId.start, datlogs$fileId.stop)))
# 22789
### Number of activities
nrow(datlogs)
table(datlogs$event)
proportions(table(datlogs$event))
proportions(table(datlogs$event[datlogs$event != "move"]))
### Time range
range(as.Date(datlogs$date.start))
### Topics per item
print(xtabs( ~ item + topic, datlogs), zero = "-")
lattice::dotplot(xtabs( ~ item + topic, datlogs), auto.key = TRUE)
mat <- t(as.matrix(xtabs( ~ item + topic, datlogs)))
mat[mat == 0] <- NA
image(mat, axes = F, col = rainbow(100))
heatmap(t(mat))
datlogs$start <- datlogs$date.start
datlogs$complete <- datlogs$date.stop
names(dat)[names(dat) %in% c("date.start", "date.stop")] <- c("start", "complete")
dat$trail <- dat$trace
dat$trace <- NULL
# --> needs to be changed since "trace" is an inbuilt variable name in
# bupar
#--------------- (2) Descriptives --------------- #--------------- (2) Descriptives ---------------
# How many events per topic, per trace, ... # How many events per topic, per path, ...
# How many popups per artwork? # How many popups per artwork?
# Number of events per artwork # Number of events per artwork
tab <- xtabs( ~ artwork + event, dat) tab <- xtabs( ~ artwork + event, datlogs)
addmargins(tab) addmargins(tab)
proportions(tab, margin = "artwork") proportions(tab, margin = "artwork")
@ -78,35 +98,35 @@ pdf("../figures/event-dist.pdf", height = 3.375, width = 12, pointsize = 10)
par(mai = c(.4,.4,.1,.1), mgp = c(2.4, 1, 0)) par(mai = c(.4,.4,.1,.1), mgp = c(2.4, 1, 0))
barplot(t(proportions(tab, margin = "artwork")), las = 2, col = cc, barplot(t(proportions(tab, margin = "artwork")), las = 2, col = cc,
legend.text = levels(dat$event), args.legend = list(x = "bottomleft", bg = "white")) legend.text = levels(datlogs$event), args.legend = list(x = "bottomleft", bg = "white"))
dev.off() dev.off()
#barchart(proportions(tab, margin = "artwork"), las = 2) #barchart(proportions(tab, margin = "artwork"), las = 2)
# Proportion of events # Proportion of events
proportions(xtabs( ~ event, dat)) proportions(xtabs( ~ event, datlogs))
# Mean proportion of event per trace # Mean proportion of event per path
colMeans(proportions(xtabs( ~ trail + event, dat), margin = "trail")) colMeans(proportions(xtabs( ~ path + event, datlogs), margin = "path"))
# Mean proportion of event per artwork # Mean proportion of event per artwork
colMeans(proportions(tab, margin = "artwork")) colMeans(proportions(tab, margin = "artwork"))
# Proportion of unclosed events # Proportion of unclosed events
nrow(dat[is.na(dat$complete), ]) nrow(datlogs[is.na(datlogs$complete), ])
nrow(dat[is.na(dat$complete), ]) / nrow(dat) nrow(datlogs[is.na(datlogs$complete), ]) / nrow(datlogs)
# Proportion of events spanning more than one log file # Proportion of events spanning more than one log file
sum(dat$fileId.start != dat$fileId.stop, na.rm = TRUE) sum(datlogs$fileId.start != datlogs$fileId.stop, na.rm = TRUE)
sum(dat$fileId.start != dat$fileId.stop, na.rm = TRUE) / nrow(dat) sum(datlogs$fileId.start != datlogs$fileId.stop, na.rm = TRUE) / nrow(datlogs)
#--------------- (3) Process Mining --------------- #--------------- (3) Process Mining ---------------
#--------------- (3.1) Check data quality --------------- #--------------- (3.1) Check data quality ---------------
alog <- activitylog(dat, alog <- activitylog(datlogs,
case_id = "trail", case_id = "path",
activity_id = "event", activity_id = "event",
#resource_id = "case", #resource_id = "case",
resource_id = "artwork", resource_id = "artwork",
@ -128,8 +148,8 @@ alog_no_move <- alog[alog$event != "move", ]
pdf("../figures/traceexplore_trace-event.pdf", height = 8, width = 12, pointsize = 10) pdf("../figures/traceexplore_trace-event.pdf", height = 8, width = 12, pointsize = 10)
set.seed(1447) set.seed(1447)
processmapR::trace_explorer(alog_no_move[alog_no_move$trail %in% processmapR::trace_explorer(alog_no_move[alog_no_move$path %in%
sample(unique(alog_no_move$trail), 400),], sample(unique(alog_no_move$path), 400),],
coverage = 1, type = "frequent", coverage = 1, type = "frequent",
abbreviate = T) abbreviate = T)
dev.off() dev.off()
@ -157,28 +177,28 @@ aggregate(relative_resource ~ artwork, ra, sum)
# Do interaction patterns for events per trace look different for different # Do interaction patterns for events per trace look different for different
# artworks? # artworks?
which.max(table(dat$artwork)) which.max(table(datlogs$artwork))
which.min(table(dat$artwork)) which.min(table(datlogs$artwork))
which.min(table(dat$artwork)[-c(71,72)]) which.min(table(datlogs$artwork)[-c(71,72)])
alog080 <- activitylog(dat[dat$artwork == "080",], alog080 <- activitylog(datlogs[datlogs$artwork == "080",],
case_id = "trail", case_id = "path",
activity_id = "event", activity_id = "event",
resource_id = "artwork", resource_id = "artwork",
timestamps = c("start", "complete")) timestamps = c("start", "complete"))
map_as_pdf(alog080, file = "../figures/pm_trace-event_080.pdf") map_as_pdf(alog080, file = "../figures/pm_trace-event_080.pdf")
alog087 <- activitylog(dat[dat$artwork == "087",], alog087 <- activitylog(datlogs[datlogs$artwork == "087",],
case_id = "trail", case_id = "path",
activity_id = "event", activity_id = "event",
resource_id = "artwork", resource_id = "artwork",
timestamps = c("start", "complete")) timestamps = c("start", "complete"))
map_as_pdf(alog087, file = "../figures/pm_trace-event_087.pdf") map_as_pdf(alog087, file = "../figures/pm_trace-event_087.pdf")
alog504 <- activitylog(dat[dat$artwork == "504",], alog504 <- activitylog(datlogs[datlogs$artwork == "504",],
case_id = "trail", case_id = "path",
activity_id = "event", activity_id = "event",
resource_id = "artwork", resource_id = "artwork",
timestamps = c("start", "complete")) timestamps = c("start", "complete"))
@ -194,10 +214,10 @@ map_as_pdf(alog504, file = "../figures/pm_trace-event_504.pdf")
# ... weekdays for "normal" and school vacation days? # ... weekdays for "normal" and school vacation days?
# ... pre and post corona? # ... pre and post corona?
alog <- activitylog(dat, alog <- activitylog(datlogs,
case_id = "case", case_id = "case",
activity_id = "event", activity_id = "event",
resource_id = "trail", resource_id = "path",
timestamps = c("start", "complete")) timestamps = c("start", "complete"))
map_as_pdf(alog, file = "../figures/pm_case-event.pdf") map_as_pdf(alog, file = "../figures/pm_case-event.pdf")
@ -206,38 +226,38 @@ alog_no_move <- alog[alog$event != "move", ]
pdf("../figures/traceexplore_case-event.pdf", height = 8, width = 12, pointsize = 10) pdf("../figures/traceexplore_case-event.pdf", height = 8, width = 12, pointsize = 10)
set.seed(1050) set.seed(1050)
processmapR::trace_explorer(alog_no_move[alog_no_move$trail %in% processmapR::trace_explorer(alog_no_move[alog_no_move$path %in%
sample(unique(alog_no_move$trail), 300),], sample(unique(alog_no_move$path), 300),],
coverage = 1, type = "frequent", coverage = 1, type = "frequent",
abbreviate = T) abbreviate = T)
dev.off() dev.off()
map_as_pdf(alog080, file = "../figures/pm_case-event_080.pdf") map_as_pdf(alog080, file = "../figures/pm_case-event_080.pdf")
alog087 <- activitylog(dat[dat$artwork == "087",], alog087 <- activitylog(datlogs[datlogs$artwork == "087",],
case_id = "case", case_id = "case",
activity_id = "event", activity_id = "event",
resource_id = "trail", resource_id = "path",
timestamps = c("start", "complete")) timestamps = c("start", "complete"))
map_as_pdf(alog087, file = "../figures/pm_case-event_087.pdf") map_as_pdf(alog087, file = "../figures/pm_case-event_087.pdf")
### Mornings and afternoons ### Mornings and afternoons
dat$tod <- ifelse(lubridate::hour(dat$start) > 13, "afternoon", "morning") datlogs$tod <- ifelse(lubridate::hour(datlogs$start) > 13, "afternoon", "morning")
alog <- activitylog(dat[dat$tod == "morning",], alog <- activitylog(datlogs[datlogs$tod == "morning",],
case_id = "case", case_id = "case",
activity_id = "event", activity_id = "event",
resource_id = "trail", resource_id = "path",
timestamps = c("start", "complete")) timestamps = c("start", "complete"))
map_as_pdf(alog, file = "../figures/pm_case-event_morning.pdf") map_as_pdf(alog, file = "../figures/pm_case-event_morning.pdf")
alog <- activitylog(dat[dat$tod == "afternoon",], alog <- activitylog(datlogs[datlogs$tod == "afternoon",],
case_id = "case", case_id = "case",
activity_id = "event", activity_id = "event",
resource_id = "trail", resource_id = "path",
timestamps = c("start", "complete")) timestamps = c("start", "complete"))
map_as_pdf(alog, file = "../figures/pm_case-event_afternoon.pdf") map_as_pdf(alog, file = "../figures/pm_case-event_afternoon.pdf")
@ -246,7 +266,7 @@ map_as_pdf(alog, file = "../figures/pm_case-event_afternoon.pdf")
pdf("../figures/bp_tod.pdf", height = 3.375, width = 12, pointsize = 10) pdf("../figures/bp_tod.pdf", height = 3.375, width = 12, pointsize = 10)
par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0)) par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0))
barplot(proportions(xtabs( ~ tod + artwork, dat), margin = "tod"), #col = cc[1:2], barplot(proportions(xtabs( ~ tod + artwork, datlogs), margin = "tod"), #col = cc[1:2],
las = 2, beside = TRUE, legend = c("afternoon", "morning"), las = 2, beside = TRUE, legend = c("afternoon", "morning"),
args.legend = list(x = "topleft")) args.legend = list(x = "topleft"))
@ -254,20 +274,20 @@ dev.off()
### Weekdays and weekends ### Weekdays and weekends
dat$wd <- ifelse(dat$weekdays %in% c("Saturday", "Sunday"), "weekend", "weekday") datlogs$wd <- ifelse(datlogs$weekdays %in% c("Saturday", "Sunday"), "weekend", "weekday")
alog <- activitylog(dat[dat$wd == "weekend",], alog <- activitylog(datlogs[datlogs$wd == "weekend",],
case_id = "case", case_id = "case",
activity_id = "event", activity_id = "event",
resource_id = "trail", resource_id = "path",
timestamps = c("start", "complete")) timestamps = c("start", "complete"))
map_as_pdf(alog, file = "../figures/pm_case-event_weekend.pdf") map_as_pdf(alog, file = "../figures/pm_case-event_weekend.pdf")
alog <- activitylog(dat[dat$wd == "weekday",], alog <- activitylog(datlogs[datlogs$wd == "weekday",],
case_id = "case", case_id = "case",
activity_id = "event", activity_id = "event",
resource_id = "trail", resource_id = "path",
timestamps = c("start", "complete")) timestamps = c("start", "complete"))
map_as_pdf(alog, file = "../figures/pm_case-event_weekday.pdf") map_as_pdf(alog, file = "../figures/pm_case-event_weekday.pdf")
@ -276,7 +296,7 @@ map_as_pdf(alog, file = "../figures/pm_case-event_weekday.pdf")
pdf("../figures/bp_wd.pdf", height = 3.375, width = 12, pointsize = 10) pdf("../figures/bp_wd.pdf", height = 3.375, width = 12, pointsize = 10)
par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0)) par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0))
barplot(proportions(xtabs( ~ wd + artwork, dat), margin = "wd"), barplot(proportions(xtabs( ~ wd + artwork, datlogs), margin = "wd"),
las = 2, beside = TRUE, legend = c("weekday", "weekend"), las = 2, beside = TRUE, legend = c("weekday", "weekend"),
args.legend = list(x = "topleft")) args.legend = list(x = "topleft"))
@ -284,21 +304,21 @@ dev.off()
### Weekdays vs. school vacation weekdays ### Weekdays vs. school vacation weekdays
dat$wds <- ifelse(!is.na(dat$vacation), "vacation", "school") datlogs$wds <- ifelse(!is.na(datlogs$vacation), "vacation", "school")
dat$wds[dat$wd == "weekend"] <- NA datlogs$wds[datlogs$wd == "weekend"] <- NA
alog <- activitylog(dat[which(dat$wds == "school"),], alog <- activitylog(datlogs[which(datlogs$wds == "school"),],
case_id = "case", case_id = "case",
activity_id = "event", activity_id = "event",
resource_id = "trail", resource_id = "path",
timestamps = c("start", "complete")) timestamps = c("start", "complete"))
map_as_pdf(alog, file = "../figures/pm_case-event_school.pdf") map_as_pdf(alog, file = "../figures/pm_case-event_school.pdf")
alog <- activitylog(dat[which(dat$wds == "vacation"),], alog <- activitylog(datlogs[which(datlogs$wds == "vacation"),],
case_id = "case", case_id = "case",
activity_id = "event", activity_id = "event",
resource_id = "trail", resource_id = "path",
timestamps = c("start", "complete")) timestamps = c("start", "complete"))
map_as_pdf(alog, file = "../figures/pm_case-event_vacation.pdf") map_as_pdf(alog, file = "../figures/pm_case-event_vacation.pdf")
@ -307,8 +327,8 @@ map_as_pdf(alog, file = "../figures/pm_case-event_vacation.pdf")
pdf("../figures/bp_wds.pdf", height = 3.375, width = 12, pointsize = 10) pdf("../figures/bp_wds.pdf", height = 3.375, width = 12, pointsize = 10)
par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0)) par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0))
#barplot(xtabs( ~ wds + artwork, dat), las = 2, beside = TRUE, #barplot(xtabs( ~ wds + artwork, datlogs), las = 2, beside = TRUE,
barplot(proportions(xtabs( ~ wds + artwork, dat), margin = "wds"), barplot(proportions(xtabs( ~ wds + artwork, datlogs), margin = "wds"),
las = 2, beside = TRUE, las = 2, beside = TRUE,
legend = c("school", "vacation"), args.legend = list(x = "topleft")) legend = c("school", "vacation"), args.legend = list(x = "topleft"))
@ -316,20 +336,20 @@ dev.off()
### Pre and post Corona ### Pre and post Corona
dat$corona <- ifelse(dat$date < "2020-03-14", "pre", "post") datlogs$corona <- ifelse(datlogs$date < "2020-03-14", "pre", "post")
alog <- activitylog(dat[which(dat$corona == "pre"),], alog <- activitylog(datlogs[which(datlogs$corona == "pre"),],
case_id = "case", case_id = "case",
activity_id = "event", activity_id = "event",
resource_id = "trail", resource_id = "path",
timestamps = c("start", "complete")) timestamps = c("start", "complete"))
map_as_pdf(alog, file = "../figures/pm_case-event_pre-corona.pdf") map_as_pdf(alog, file = "../figures/pm_case-event_pre-corona.pdf")
alog <- activitylog(dat[which(dat$corona == "post"),], alog <- activitylog(datlogs[which(datlogs$corona == "post"),],
case_id = "case", case_id = "case",
activity_id = "event", activity_id = "event",
resource_id = "trail", resource_id = "path",
timestamps = c("start", "complete")) timestamps = c("start", "complete"))
map_as_pdf(alog, file = "../figures/pm_case-event_post-corona.pdf") map_as_pdf(alog, file = "../figures/pm_case-event_post-corona.pdf")
@ -338,7 +358,7 @@ map_as_pdf(alog, file = "../figures/pm_case-event_post-corona.pdf")
pdf("../figures/bp_corona.pdf", height = 3.375, width = 12, pointsize = 10) pdf("../figures/bp_corona.pdf", height = 3.375, width = 12, pointsize = 10)
par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0)) par(mai = c(.5,.6,.1,.1), mgp = c(2.4, 1, 0))
barplot(proportions(xtabs( ~ corona + artwork, dat), margin = "corona"), barplot(proportions(xtabs( ~ corona + artwork, datlogs), margin = "corona"),
las = 2, beside = TRUE, las = 2, beside = TRUE,
legend = c("post", "pre"), args.legend = list(x = "topleft")) legend = c("post", "pre"), args.legend = list(x = "topleft"))
@ -348,19 +368,19 @@ dev.off()
# Order in which artworks are looked at # Order in which artworks are looked at
nart <- 5 # select 5 artworks randomly nart <- 5 # select 5 artworks randomly
alog <- activitylog(dat,#[dat$artwork %in% sample(unique(dat$artwork), nart), ], alog <- activitylog(datlogs,#[datlogs$artwork %in% sample(unique(datlogs$artwork), nart), ],
case_id = "case", case_id = "case",
activity_id = "artwork", activity_id = "artwork",
resource_id = "trail", resource_id = "path",
timestamps = c("start", "complete")) timestamps = c("start", "complete"))
#map <- process_map(alog, frequency("relative")) #map <- process_map(alog, frequency("relative"))
## select cases with Vermeer ## select cases with Vermeer
length(unique(dat[dat$artwork == "080", "case"])) length(unique(datlogs[datlogs$artwork == "080", "case"]))
# 12615 # 12615
case080 <- unique(dat[dat$artwork == "080", "case"]) case080 <- unique(datlogs[datlogs$artwork == "080", "case"])
tmp <- dat[dat$case %in% case080, ] tmp <- datlogs[datlogs$case %in% case080, ]
table(tmp$artwork) table(tmp$artwork)
# --> all :) # --> all :)
@ -371,10 +391,10 @@ which(table(tmp$artwork) > 14000)
often080 <- names(which(table(tmp$artwork) > 14000)) often080 <- names(which(table(tmp$artwork) > 14000))
alog <- activitylog(dat[dat$artwork %in% often080, ], alog <- activitylog(datlogs[datlogs$artwork %in% often080, ],
case_id = "case", case_id = "case",
activity_id = "artwork", activity_id = "artwork",
resource_id = "trail", resource_id = "path",
timestamps = c("start", "complete")) timestamps = c("start", "complete"))
map_as_pdf(alog, file = "../figures/pm_case-artwork_often080.pdf") map_as_pdf(alog, file = "../figures/pm_case-artwork_often080.pdf")
@ -393,43 +413,43 @@ dev.off()
# Are there certain topics that people are interested in more than others? # Are there certain topics that people are interested in more than others?
# Do these topic distributions differ for comparable artworks? # Do these topic distributions differ for comparable artworks?
alog <- activitylog(dat[which(dat$event == "openTopic"),], alog <- activitylog(datlogs[which(datlogs$event == "openTopic"),],
case_id = "case", case_id = "case",
activity_id = "topic", activity_id = "topic",
resource_id = "trail", resource_id = "path",
timestamps = c("start", "complete")) timestamps = c("start", "complete"))
map_as_pdf(alog, file = "../figures/pm_case-topic.pdf") map_as_pdf(alog, file = "../figures/pm_case-topic.pdf")
# Order of topics for Vermeer # Order of topics for Vermeer
# alog080 <- activitylog(dat[dat$artwork == "080",], # alog080 <- activitylog(datlogs[datlogs$artwork == "080",],
# case_id = "case", # case_id = "case",
# activity_id = "topic", # activity_id = "topic",
# resource_id = "trail", # resource_id = "path",
# timestamps = c("start", "complete")) # timestamps = c("start", "complete"))
# #
# map_as_pdf(alog080, file = "../figures/pm_case-topic_080.pdf") # map_as_pdf(alog080, file = "../figures/pm_case-topic_080.pdf")
# #
# #
# alog080 <- activitylog(dat[dat$artwork == "080",], # alog080 <- activitylog(datlogs[datlogs$artwork == "080",],
# case_id = "case", # case_id = "case",
# activity_id = "topicFile", # activity_id = "topicFile",
# resource_id = "trail", # resource_id = "path",
# timestamps = c("start", "complete")) # timestamps = c("start", "complete"))
# #
# #process_map(alog080, frequency("relative")) # #process_map(alog080, frequency("relative"))
# #
# # Comparable artwork # # Comparable artwork
# alog083 <- activitylog(dat[dat$artwork == "083",], # alog083 <- activitylog(datlogs[datlogs$artwork == "083",],
# case_id = "case", # case_id = "case",
# activity_id = "topic", # activity_id = "topic",
# resource_id = "trail", # resource_id = "path",
# timestamps = c("start", "complete")) # timestamps = c("start", "complete"))
# #
# map_as_pdf(alog083, file = "../figures/pm_case-topic_083.pdf") # map_as_pdf(alog083, file = "../figures/pm_case-topic_083.pdf")
# artworks that have the same topics than Vermeer # artworks that have the same topics than Vermeer
which(rowSums(xtabs( ~ artwork + topic, dat[dat$topic %in% which(rowSums(xtabs( ~ artwork + topic, datlogs[datlogs$topic %in%
c("artist", "details", "extra info", "komposition", c("artist", "details", "extra info", "komposition",
"licht und farbe", "thema"), ]) != 0) == 6) "licht und farbe", "thema"), ]) != 0) == 6)
@ -437,10 +457,10 @@ which(rowSums(xtabs( ~ artwork + topic, dat[dat$topic %in%
for (art in c("037", "046", "062", "080", "083", "109")) { for (art in c("037", "046", "062", "080", "083", "109")) {
alog <- activitylog(dat[dat$event == "openTopic" & dat$artwork == art,], alog <- activitylog(datlogs[datlogs$event == "openTopic" & datlogs$artwork == art,],
case_id = "case", case_id = "case",
activity_id = "topic", activity_id = "topic",
resource_id = "trail", resource_id = "path",
timestamps = c("start", "complete")) timestamps = c("start", "complete"))
map_as_pdf(alog, file = paste0("../figures/pm_case-topic_", art, ".pdf")) map_as_pdf(alog, file = paste0("../figures/pm_case-topic_", art, ".pdf"))
@ -467,11 +487,11 @@ c("Kultur", "Kultur", "Graphik", "Gemälde", "Gemälde", "Gemälde",
# BURSTS # BURSTS
which.max(table(dat$date)) which.max(table(datlogs$date))
tmp <- dat[dat$date == "2017-02-12", ] tmp <- datlogs[datlogs$date == "2017-02-12", ]
# number of traces per case on 2017-02-12 # number of traces per case on 2017-02-12
rowSums(xtabs( ~ case + trail, tmp) != 0) rowSums(xtabs( ~ case + path, tmp) != 0)
range(tmp$start) range(tmp$start)
hours <- lubridate::hour(tmp$start) hours <- lubridate::hour(tmp$start)
@ -481,20 +501,20 @@ xtabs( ~ case + hours, tmp)
colSums(xtabs( ~ case + hours, tmp) != 0) colSums(xtabs( ~ case + hours, tmp) != 0)
barplot(colSums(xtabs( ~ case + hours, tmp) != 0)) barplot(colSums(xtabs( ~ case + hours, tmp) != 0))
aggregate(trail ~ case + hours, tmp, length) aggregate(path ~ case + hours, tmp, length)
tmp <- aggregate(trail ~ case, dat, length) tmp <- aggregate(path ~ case, datlogs, length)
tmp$date <- as.Date(dat[!duplicated(dat$case), "start"]) tmp$date <- as.Date(datlogs[!duplicated(datlogs$case), "start"])
tmp$time <- lubridate::hour(dat[!duplicated(dat$case), "start"]) tmp$time <- lubridate::hour(datlogs[!duplicated(datlogs$case), "start"])
tmp[tmp$trail > 200, ] tmp[tmp$path > 200, ]
plot(trail ~ time, tmp, cex = 2, col = rgb(0,0,0,.3)) plot(path ~ time, tmp, cex = 2, col = rgb(0,0,0,.3))
lattice::barchart(trail ~ time, tmp, horizontal=F) lattice::barchart(path ~ time, tmp, horizontal=F)

View File

@ -1,8 +1,6 @@
import pm4py import pm4py
import pandas as pd import pandas as pd
import numpy as np import numpy as np
from python_helpers import eval_pm, pn_infos_miner from python_helpers import eval_pm, pn_infos_miner
###### Load data and create event logs ###### ###### Load data and create event logs ######

View File

@ -1,5 +1,3 @@
# TODO: Clean me up! I am a mix of useful and useless!!!
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
#--------------- (1) Look at broken trace --------------- #--------------- (1) Look at broken trace ---------------

View File

@ -1,301 +0,0 @@
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code/")
library(lattice)
#library(mtt)
devtools::load_all("../../../../software/mtt")
# Achims colors (used by lattice)
#cc <- palette.colors(palette = "Okabe-Ito")
#plot(1:10, col = cc, pch = 16, cex = 2)
# Read data
datlogs <- read.table("results/haum/event_logfiles_2024-01-02_19-44-50.csv",
colClasses = c("character", "character", "POSIXct",
"POSIXct", "character", "integer",
"numeric", "character", "character",
rep("numeric", 3), "character",
"character", rep("numeric", 11),
"character", "character"),
sep = ";", header = TRUE)
datlogs$event <- factor(datlogs$event, levels = c("move", "flipCard",
"openTopic",
"openPopup"))
### Number of log files
length(unique(datlogs$fileId.start))
length(unique(datlogs$fileId.stop))
length(unique(c(datlogs$fileId.start, datlogs$fileId.stop)))
# 22803
### Number of activities
nrow(datlogs)
table(datlogs$event)
proportions(table(datlogs$event))
proportions(table(datlogs$event[datlogs$event != "move"]))
### Time range
range(as.Date(datlogs$date.start))
### Topics per item
print(xtabs( ~ item + topic, datlogs), zero = "-")
lattice::dotplot(xtabs( ~ item + topic, datlogs), auto.key = TRUE)
### Which item gets touched most often?
counts_item <- table(datlogs$item)
barchart(counts_item)
items <- unique(datlogs$item)
#items <- items[!items %in% c("504", "505")]
datart <- extract_artworks(items,
paste0(items, ".xml"),
"../data/haum/ContentEyevisit/eyevisit_cards_light/")
datart <- datart[order(datart$artwork), ]
names(counts_item) <- datart$title
pdf("../figures/counts_item.pdf", width = 20, height = 10, pointsize = 10)
par(mai = c(5, .6, .1, .1))
tmp <- barplot(counts_item, las = 2, ylim = c(0, 60000),
border = "white", col = "#3CB4DC")
text(tmp, counts_item + 1000, datart$artwork)
dev.off()
### Which item gets touched most often first?
datcase <- datlogs[!duplicated(datlogs$case), ]
counts_case <- table(datcase$item)
names(counts_case) <- datart$title
tmp <- barplot(counts_case, las = 2, border = "white")
text(tmp, counts_case + 100, datart$item)
counts <- rbind(counts_item, counts_case)
barplot(counts, las = 2, border = "white", col = c("gray", "darkorange"))
### Which teasers seem to work well?
barplot(table(datlogs$topic), las = 2)
### Dwell times/duration
datagg <- aggregate(duration ~ event + item, datlogs, mean)
datagg$ds <- datagg$duration / 1000
bwplot(ds ~ event, datagg)
bwplot(ds ~ event | item, datagg)
xyplot(ds ~ event, datagg, groups = item)
# without aggregation
bwplot(duration ~ event, datlogs)
# in min
set.seed(1027)
pdf("../figures/duration.pdf", width = 5, height = 5, pointsize = 10)
bwplot(I(duration/1000/60) ~ event, datlogs[sample(nrow(datlogs), 100000), ],
ylab = "Duration in min")
dev.off()
### Move events
datmove <- aggregate(cbind(duration, scaleSize, rotationDegree, distance, x.start,
y.start, x.stop, y.stop) ~ item, datlogs,
mean)
hist(log(datlogs$scaleSize))
# --> better interpretable on logscale
plot(y.start ~ x.start, datmove, pch = 16, col = "gray")
points(y.start ~ x.start, datmove, col = "#3CB4DC", cex = datmove$scaleSize)
plot(y.start ~ x.start, datmove, type = "n", xlab = "x", ylab = "y",
xlim = c(0, 3840), ylim = c(0, 2160))
with(datmove, text(x.start, y.start, item, col = "gray", cex = 1.5))
with(datmove,
arrows(x.start, y.start, x.stop, y.stop, length = 0.07, lwd = 2)
)
abline(v = c(0, 3840), h = c(0, 2160), col = "#3CB4DC", lwd = 2)
datscale <- aggregate(scaleSize ~ item, datlogs, max)
plot(y.start ~ x.start, datmove, pch = 16, col = "gray")
points(y.start ~ x.start, datmove, col = "#3CB4DC", cex = datscale$scaleSize)
plot(y.start ~ x.start, datmove, type = "n", xlab = "x", ylab = "y",
xlim = c(0, 3840), ylim = c(0, 2160))
#with(datmove, text(x.stop, y.stop, item))
with(datmove, text(x.start, y.start, item))
### Are there certain areas of the table that are touched most often?
# heatmap
cuts <- 100
datlogs$x.start.cat <- cut(datlogs$x.start, cuts)
datlogs$y.start.cat <- cut(datlogs$y.start, cuts)
tab <- xtabs( ~ x.start.cat + y.start.cat, datlogs)
colnames(tab) <- paste0("c", 1:cuts)
rownames(tab) <- paste0("c", 1:cuts)
heatmap(tab, Rowv = NA, Colv = NA)
library(ggplot2)
ggplot(as.data.frame(tab)) +
geom_tile(aes(x = x.start.cat, y = y.start.cat, fill = Freq)) +
scale_fill_gradient(low = "gray40", high = "orange")
dattrim <- datlogs[datlogs$x.start < 3840 &
datlogs$x.start > 0 &
datlogs$y.start < 2160 &
datlogs$y.start > 0 &
datlogs$x.stop < 3840 &
datlogs$x.stop > 0 &
datlogs$y.stop < 2160 &
datlogs$y.stop > 0, ]
cuts <- 100 # 200, 100, 70, ...
# start
dattrim$x.start.cat <- cut(dattrim$x.start, cuts)
dattrim$y.start.cat <- cut(dattrim$y.start, cuts)
tab.start <- xtabs( ~ x.start.cat + y.start.cat, dattrim)
colnames(tab.start) <- NULL
rownames(tab.start) <- NULL
pdf("../figures/heatmap_start.pdf", width = 5, height = 5, pointsize = 10)
heatmap(tab.start, Rowv = NA, Colv = NA)
dev.off()
my_colors <- colorRampPalette(c("#009E73", "#E69F00"))
heatmap(tab.start, Rowv = NA, Colv = NA, col = my_colors(1000))
ggplot(as.data.frame(tab.start)) +
geom_tile(aes(x = x.start.cat, y = y.start.cat, fill = Freq)) +
scale_fill_gradient(low = "#009E73", high = "#E69F00")
# stop
dattrim$x.stop.cat <- cut(dattrim$x.stop, cuts)
dattrim$y.stop.cat <- cut(dattrim$y.stop, cuts)
tab.stop <- xtabs( ~ x.stop.cat + y.stop.cat, dattrim)
colnames(tab.stop) <- NULL
rownames(tab.stop) <- NULL
pdf("../figures/heatmap_stop.pdf", width = 5, height = 5, pointsize = 10)
heatmap(tab.stop, Rowv = NA, Colv = NA)
dev.off()
heatmap(tab.stop, Rowv = NA, Colv = NA, col = my_colors(10))
### How many visitors per day
datlogs$date <- as.Date(datlogs$date.start)
# Interactions per day
datint <- aggregate(case ~ date, datlogs, length)
plot(datint, type = "h")
# Cases per day
datcase <- aggregate(case ~ date, datlogs, function(x) length(unique(x)))
plot(datcase, type = "h")
# Paths per day
datpath <- aggregate(path ~ date, datlogs, function(x) length(unique(x)))
plot(datpath, type = "h")
plot(path ~ date, datpath, type = "h", col = "#3CB4DC")
points(case ~ date, datcase, type = "h")
pdf("../figures/cases_per_day2.pdf", width = 9, height = 5, pointsize = 10)
par(mai = c(.6, .6, .1, .1), mgp = c(2.4, 1, 0))
plot(case ~ date, datcase, type = "h", col = "#3CB4DC")
abline(v = datcase$date[datcase$date %in% c("2020-03-13", "2022-10-25")],
col = "#FF6900", lty = 2)
text(datcase$date[datcase$date == "2020-03-13"]+470, 80,
"Corona gap from 2020-03-13 to 2022-10-25",
col = "#D55E00")
dev.off()
### Other stuff
## function dependencies of mtt
devtools::load_all("../../../../software/mtt")
#library(mtt)
library(mvbutils)
foodweb(where = "package:mtt")
pdf("../figures/fun_depend_mtt.pdf", width = 8, height = 4, pointsize = 10)
foodweb(where = "package:mtt",
prune = c("parse_logfiles", "create_eventlogs", "extract_artworks",
"extract_topics"),
#expand.ybox = 1.8, #cex = .6,
#border = TRUE,
#boxcolor = "gray",
color.lines = FALSE,
lwd = 2, mai = c(0, 0, 0, 0))
dev.off()
## weird behavior of timeMs
pdf("../figures/timeMs.pdf", width = 9, height = 6, pointsize = 10)
#par(mai = c(.6, .6, .1, .1), mgp = c(2.4, 1, 0))
#plot(timeMs.start ~ as.factor(fileId), datlogs[1:2000,], xlab = "fileId")
bwplot(timeMs.start ~ as.factor(fileId.start), datlogs[1:2000,], xlab = "",
scales = list(x = list(rot = 90), y = list(rot = 90)))
dev.off()
## x,y-coordinates out of range
set.seed(1522)
pdf("../figures/xycoord.pdf", width = 5, height = 5, pointsize = 10)
par(mai = c(.6, .6, .1, .1), mgp = c(2.4, 1, 0))
#par(mfrow = c(1, 2))
plot(y.start ~ x.start, datlogs[sample(nrow(datlogs), 10000), ])
abline(v = c(0, 3840), h = c(0, 2160), col = "#3CB4DC", lwd = 2)
#plot(y.stop ~ x.stop, datlogs)
#abline(v = c(0, 3840), h = c(0, 2160), col = "#3CB4DC", lwd = 2)
legend("bottomleft", "Random sample of 10,000", bg = "white")
legend("topleft", "4K-Display: 3840 x 2160", bg = "white")
dev.off()
## moves
dat001 <- datlogs[which(datlogs$item == "001"), ]
index <- as.numeric(as.factor(dat001$path))
cc <- sample(colors(), 100)
plot(y.start ~ x.start, dat001, type = "n", xlab = "x", ylab = "y",
xlim = c(0, 3840), ylim = c(0, 2160))
with(dat001[1:200,], arrows(x.start, y.start, x.stop, y.stop,
length = .07, col = cc[index]))
plot(y.start ~ x.start, dat001, xlab = "x", ylab = "y",
xlim = c(0, 3840), ylim = c(0, 2160), pch = 16, col = "gray")
points(y.start ~ x.start, dat001, xlab = "x", ylab = "y",
xlim = c(0, 3840), ylim = c(0, 2160), cex = dat001$scaleSize,
col = "blue")
cc <- sample(colors(), 70)
dat1 <- datlogs[!duplicated(datlogs$item), ]
dat1 <- dat1[order(dat1$item), ]
plot(y.start ~ x.start, dat1, type = "n", xlim = c(-100, 4500), ylim = c(-100, 2500))
abline(h = c(0, 2160), v = c(0, 3840), col = "lightgray")
with(dat1, points(x.start, y.start, col = cc, pch = 16))
with(dat1, points(x.stop, y.stop, col = cc, pch = 16))
with(dat1, arrows(x.start, y.start, x.stop, y.stop, length = .07, col = cc))