# mtt_haum/code/11_investigate-variants.R

# 11_investigate-variants.R
#
# content: (1) Read data
#          (2) Investigate variants (pre-corona data set)
#          (3) Investigate variants (2019)
#
# input:  results/eventlogs_pre-corona_cleaned.RData
#         results/dataframes_case_2019.RData
# output: ../thesis/figures/freq-traces.pdf
#         ../thesis/figures/freq-traces_powerlaw.pdf
#         ../thesis/figures/freq-traces_powerlaw_bw.pdf
#         ../thesis/figures/freq-traces_2019.pdf
#         ../thesis/figures/freq-traces_powerlaw_2019.pdf
#         ../thesis/figures/freq-traces_powerlaw_2019_bw.pdf
#
# last mod: 2024-04-17

# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/")

library(bupaverse)
#--------------- Helper functions ---------------

# Load an .RData file into the calling environment and stop if it does not
# define `dat`. Without this check a file lacking `dat` would go unnoticed
# and the analysis would silently continue with a previously loaded `dat`.
load_events <- function(file) {
  loaded <- load(file, envir = parent.frame())
  if (!"dat" %in% loaded) {
    stop("'", file, "' does not contain an object named 'dat'",
         call. = FALSE)
  }
  invisible(loaded)
}

# Build an activity log from an event data frame. The columns `start` and
# `complete` are copies of `date.start` and `date.stop`; they are created
# because these two names are passed to activitylog() as timestamps.
# The copy happens on the local `dat` only; the caller's data frame is not
# modified.
build_alog <- function(dat) {
  dat$start <- dat$date.start
  dat$complete <- dat$date.stop
  activitylog(dat,
              case_id = "case",
              activity_id = "item",
              resource_id = "path",
              timestamps = c("start", "complete"))
}

# Write the trace explorer plot for the `n_traces` most frequent traces to
# a PDF file. The plot is print()ed explicitly so that the file is also
# filled when the script is run via source(), and the device is closed via
# on.exit() even if plotting fails.
save_trace_explorer <- function(alog, file, n_traces = 25) {
  pdf(file, height = 7, width = 6, pointsize = 10)
  on.exit(dev.off(), add = TRUE)
  print(trace_explorer(alog, n_traces = n_traces))
  invisible(file)
}

# Fit a power law to the distribution of trace frequencies.
#   freq: absolute frequency of each trace (one entry per variant)
# Returns a list with
#   x:         the distinct frequency values
#   y:         number of variants having each frequency
#   fit:       lm() fit of log(y) on log(x)
#   pred:      fitted values on the original scale
#   prop_once: proportion of variants that occur exactly once
fit_trace_powerlaw <- function(freq) {
  tab <- table(freq)
  x <- as.numeric(names(tab))
  y <- as.numeric(tab)
  fit <- lm(log(y) ~ log(x))
  b <- unname(coef(fit))
  list(x = x,
       y = y,
       fit = fit,
       pred = exp(b[1]) * x^b[2],
       # counted directly instead of via tab[1], which is only the count of
       # once-only variants if the smallest observed frequency is 1
       prop_once = mean(freq == 1))
}

# Plot a power-law fit (as returned by fit_trace_powerlaw()) on log-log
# axes and write it to a PDF file.
#   col_points: colour of the observed points
# NOTE(review): the fitted line uses "#434F4F" in the colour and in the
# black-and-white version alike, as in the original script -- confirm that
# this is intended.
plot_trace_powerlaw <- function(pl, file, col_points) {
  pdf(file, height = 3.375, width = 3.375, pointsize = 10)
  on.exit(dev.off(), add = TRUE)
  par(mai = c(.6, .6, .1, .1), mgp = c(2.4, 1, 0))
  plot(pl$x, pl$y, log = "xy",
       xlab = "Process variants sorted by frequency",
       ylab = "Frequency", pch = 16, col = col_points)
  lines(pl$x, pl$pred, col = "#434F4F")
  legend("topright",
         paste0("Proportion of traces only occurring once: ",
                round(pl$prop_once, 2)),
         cex = .7, bty = "n")
  invisible(file)
}

#--------------- (1) Read data ---------------

load_events("results/eventlogs_pre-corona_cleaned.RData")

#--------------- (2) Investigate variants (pre-corona data set) ---------------

alog <- build_alog(dat)

save_trace_explorer(alog, "../thesis/figures/freq-traces.pdf")
# --> sequences of artworks are just too rare

trace_explorer(alog, n_traces = 10, type = "infrequent")

tr <- traces(alog)

# Number of events per trace and number of distinct items per trace
trace_items <- strsplit(tr$trace, ",")
trace_length <- lengths(trace_items)
trace_varied <- vapply(trace_items, function(x) length(unique(x)),
                       integer(1))

tr[trace_length > 10, ]
tr[trace_varied > 1, ]
table(tr[trace_varied > 2, "absolute_frequency"])
table(tr[trace_varied > 3, "absolute_frequency"])

summary(tr$absolute_frequency)
vioplot::vioplot(tr$absolute_frequency)

# Power law for frequencies of traces
pl <- fit_trace_powerlaw(tr$absolute_frequency)

plot_trace_powerlaw(pl, "../thesis/figures/freq-traces_powerlaw.pdf",
                    col_points = rgb(0.262, 0.309, 0.309, 0.5))

# Black and white
plot_trace_powerlaw(pl, "../thesis/figures/freq-traces_powerlaw_bw.pdf",
                    col_points = rgb(0.3, 0.3, 0.3, 0.5))

# Look at individual traces as examples
tr[trace_varied == 5 & trace_length > 50, ]
# --> every variant exists only once, of course

#--------------- (3) Investigate variants (2019) ---------------

# Replaces `dat` with the 2019 data; load_events() stops if the file does
# not provide it.
load_events("results/dataframes_case_2019.RData")

alog <- build_alog(dat)

save_trace_explorer(alog, "../thesis/figures/freq-traces_2019.pdf")

trace_explorer(alog, n_traces = 10, type = "infrequent")

tr <- traces(alog)

# Number of events per trace and number of distinct items per trace
trace_items <- strsplit(tr$trace, ",")
trace_length <- lengths(trace_items)
trace_varied <- vapply(trace_items, function(x) length(unique(x)),
                       integer(1))

tr[trace_length > 10, ]
tr[trace_varied > 1, ]
table(tr[trace_varied > 2, "absolute_frequency"])
table(tr[trace_varied > 3, "absolute_frequency"])

summary(tr$absolute_frequency)
vioplot::vioplot(tr$absolute_frequency)

# Power law for frequencies of traces
pl <- fit_trace_powerlaw(tr$absolute_frequency)

plot_trace_powerlaw(pl, "../thesis/figures/freq-traces_powerlaw_2019.pdf",
                    col_points = rgb(0.262, 0.309, 0.309, 0.5))

# Black and white
plot_trace_powerlaw(pl, "../thesis/figures/freq-traces_powerlaw_2019_bw.pdf",
                    col_points = rgb(0.3, 0.3, 0.3, 0.5))

# Look at individual traces as examples
tr[trace_varied == 5 & trace_length > 50, ]
# --> every variant exists only once, of course