diff --git a/code/11_investigate-variants.R b/code/11_investigate-variants.R
index 5fa9422..a585027 100644
--- a/code/11_investigate-variants.R
+++ b/code/11_investigate-variants.R
@@ -1,13 +1,17 @@
 # 11_investigate-variants.R
 #
 # content: (1) Read data
-#          (2) Investigate variants
+#          (2) Investigate variants (pre-corona data set)
+#          (3) Investigate variants (2019)
 #
 # input:  results/eventlogs_pre-corona_cleaned.RData
+#         results/dataframes_case_2019.RData
 # output: ../../thesis/figures/freq-traces.pdf
 #         ../../thesis/figures/freq-traces_powerlaw.pdf
+#         ../../thesis/figures/freq-traces_2019.pdf
+#         ../../thesis/figures/freq-traces_powerlaw_2019.pdf
 #
-# last mod: 2024-03-22
+# last mod: 2024-03-26
 
 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
 
@@ -17,7 +21,7 @@ library(bupaverse)
 
 load("results/eventlogs_pre-corona_cleaned.RData")
 
-#--------------- (2) Investigate variants ---------------
+#--------------- (2) Investigate variants (pre-corona data set) ---------------
 
 dat$start <- dat$date.start
 dat$complete <- dat$date.stop
@@ -71,3 +75,79 @@ dev.off()
 tr[trace_varied == 5 & trace_length > 50, ]
 # --> every variant exists only once, of course
 
+#--------------- (3) Investigate variants (2019) ---------------
+
+# NOTE: object names of section (2) are reused here (e.g., `dat`, `tr`,
+# `trace_length`, `trace_varied`) and hold the 2019 results from here on
+
+# load() returns the names of the restored objects; stop if the file does not
+# provide `dat`, otherwise the pre-corona `dat` would silently be reused
+loaded <- load("results/dataframes_case_2019.RData")
+stopifnot("dat" %in% loaded)
+
+# Timestamp columns under the names passed to activitylog() below
+dat$start <- dat$date.start
+dat$complete <- dat$date.stop
+
+alog <- activitylog(dat,
+                    case_id     = "case",
+                    activity_id = "item",
+                    resource_id = "path",
+                    timestamps  = c("start", "complete"))
+
+# Most frequent traces; print() is needed so that the plot is also written to
+# the PDF when the script is run via source(), which does not auto-print
+pdf("../../thesis/figures/freq-traces_2019.pdf", height = 7, width = 6, pointsize = 10)
+print(trace_explorer(alog, n_traces = 25))
+dev.off()
+
+# Least frequent traces
+trace_explorer(alog, n_traces = 10, type = "infrequent")
+
+tr <- traces(alog)
+
+# Split every trace into its activities once and reuse the result
+acts <- strsplit(tr$trace, ",")
+
+# Number of activities per trace
+trace_length <- pbapply::pbsapply(acts, length)
+tr[trace_length > 10, ]
+
+# Number of distinct activities per trace
+trace_varied <- pbapply::pbsapply(acts, function(x) length(unique(x)))
+tr[trace_varied > 1, ]
+table(tr[trace_varied > 2, "absolute_frequency"])
+table(tr[trace_varied > 3, "absolute_frequency"])
+
+summary(tr$absolute_frequency)
+vioplot::vioplot(tr$absolute_frequency)
+
+# Power law for frequencies of traces
+tab <- table(tr$absolute_frequency)
+x <- as.numeric(names(tab))
+y <- as.numeric(tab)
+
+# Linear fit on the log-log scale, back-transformed for plotting
+p1 <- lm(log(y) ~ log(x))
+pre <- exp(coef(p1)[1]) * x^coef(p1)[2]
+
+# Proportion of traces occurring exactly once; computed from the frequencies
+# directly since tab[1] is only that count if frequency 1 occurs at all
+prop_once <- mean(tr$absolute_frequency == 1)
+
+pdf("../../thesis/figures/freq-traces_powerlaw_2019.pdf", height = 3.375,
+    width = 3.375, pointsize = 10)
+par(mai = c(.6,.6,.1,.1), mgp = c(2.4, 1, 0))
+
+plot(x, y, log = "xy", xlab = "Absolute Frequency of Traces",
+     ylab = "Frequency", pch = 16, col = rgb(0.262, 0.309, 0.309, 0.5))
+lines(x, pre, col = "#434F4F")
+legend("topright", paste0("Proportion of traces only occurring once: ",
+                          round(prop_once, 2)), cex = .7, bty = "n")
+
+dev.off()
+
+# Look at individual traces as examples
+tr[trace_varied == 5 & trace_length > 50, ]
+# --> every variant exists only once, of course
+