# 11_investigate-variants.R
#
# content: (1) Read data
#          (2) Investigate variants (pre-corona data set)
#          (3) Investigate variants (2019)
#
# input:  results/eventlogs_pre-corona_cleaned.RData
#         results/dataframes_case_2019.RData
# output: ../thesis/figures/freq-traces.pdf
#         ../thesis/figures/freq-traces_powerlaw.pdf
#         ../thesis/figures/freq-traces_powerlaw_bw.pdf
#         ../thesis/figures/freq-traces_2019.pdf
#         ../thesis/figures/freq-traces_powerlaw_2019.pdf
#         ../thesis/figures/freq-traces_powerlaw_2019_bw.pdf
#
# last mod: 2024-04-17

# NOTE(review): machine-specific absolute path; all relative paths below
# depend on it. Consider running the script from the analysis directory
# (or using an RStudio project) instead of calling setwd().
setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/")

library(bupaverse)

#--------------- Helper functions ---------------

# Directory that receives all figures written by this script
fig_dir <- "../thesis/figures"

# Stop unless load() restored an object named `dat`.
#
# loaded: character vector of object names as returned by load()
# file:   path that was loaded (only used in the error message)
#
# Without this check a file that lacks `dat` would go unnoticed and the
# `dat` left over from a previous load() would be analysed again.
assert_dat_loaded <- function(loaded, file) {
  if (!("dat" %in% loaded)) {
    stop("'", file, "' does not contain an object named 'dat'",
         call. = FALSE)
  }
  invisible(TRUE)
}

# Write the 25 most frequent traces of an activity log to a PDF.
#
# alog: activity log as created by activitylog()
# file: path of the PDF to write
save_trace_explorer <- function(alog, file) {
  pdf(file, height = 7, width = 6, pointsize = 10)
  # Close the device even if plotting fails
  on.exit(dev.off(), add = TRUE)
  # trace_explorer() returns a plot object; it is only drawn when printed,
  # which does not happen automatically inside a function or via source()
  print(trace_explorer(alog, n_traces = 25))
  invisible(file)
}

# Fit a power law to the distribution of trace frequencies.
#
# freq: vector of absolute trace frequencies (one entry per variant)
#
# Returns a list with
#   x         distinct frequencies
#   y         number of variants having each frequency
#   model     lm fit of log(y) on log(x)
#   pred      fitted values back-transformed to the original scale
#   prop_once proportion of variants that occur exactly once
fit_powerlaw <- function(freq) {
  tab <- table(freq)
  x <- as.numeric(names(tab))
  y <- as.numeric(tab)
  model <- lm(log(y) ~ log(x))
  list(
    x = x,
    y = y,
    model = model,
    pred = exp(coef(model)[1]) * x^coef(model)[2],
    # Count frequency 1 explicitly rather than taking the first table
    # cell, which is only the smallest frequency that happens to occur
    prop_once = sum(freq == 1) / length(freq)
  )
}

# Draw the log-log power-law plot into a PDF.
#
# pl:   list as returned by fit_powerlaw()
# file: path of the PDF to write
# col:  colour of the points
plot_powerlaw <- function(pl, file, col) {
  pdf(file, height = 3.375, width = 3.375, pointsize = 10)
  # Close the device even if plotting fails
  on.exit(dev.off(), add = TRUE)
  # par() only affects the PDF device opened above
  par(mai = c(.6, .6, .1, .1), mgp = c(2.4, 1, 0))
  plot(pl$x, pl$y, log = "xy",
       xlab = "Process variants sorted by frequency",
       ylab = "Frequency", pch = 16, col = col)
  lines(pl$x, pl$pred, col = "#434F4F")
  legend("topright",
         paste0("Proportion of traces only occurring once: ",
                round(pl$prop_once, 2)),
         cex = .7, bty = "n")
  invisible(file)
}

# Run the complete variant investigation for one data set.
#
# dat:    data frame with (at least) the columns case, item, path,
#         date.start, and date.stop
# suffix: string inserted into the figure file names ("" or "_2019")
#
# Prints the exploratory summaries, writes three PDFs to `fig_dir`, and
# invisibly returns a list with the activity log (alog), the traces (tr),
# trace_length, trace_varied, and the power-law fit (powerlaw).
investigate_variants <- function(dat, suffix = "") {
  dat$start    <- dat$date.start
  dat$complete <- dat$date.stop

  alog <- activitylog(dat,
                      case_id     = "case",
                      activity_id = "item",
                      resource_id = "path",
                      timestamps  = c("start", "complete"))

  save_trace_explorer(alog,
                      file.path(fig_dir,
                                paste0("freq-traces", suffix, ".pdf")))
  # --> sequences of artworks are just too rare

  print(trace_explorer(alog, n_traces = 10, type = "infrequent"))

  tr <- traces(alog)

  # Split every trace into its activities once and reuse the result
  trace_items  <- strsplit(tr$trace, ",")
  # Number of activities per trace
  trace_length <- lengths(trace_items)
  # Number of distinct activities per trace
  trace_varied <- pbapply::pbsapply(trace_items,
                                    function(x) length(unique(x)))

  print(tr[trace_length > 10, ])
  print(tr[trace_varied > 1, ])
  print(table(tr[trace_varied > 2, "absolute_frequency"]))
  print(table(tr[trace_varied > 3, "absolute_frequency"]))

  print(summary(tr$absolute_frequency))
  vioplot::vioplot(tr$absolute_frequency)

  # Power law for frequencies of traces
  pl <- fit_powerlaw(tr$absolute_frequency)

  plot_powerlaw(pl,
                file.path(fig_dir,
                          paste0("freq-traces_powerlaw", suffix, ".pdf")),
                col = rgb(0.262, 0.309, 0.309, 0.5))

  # Black and white
  plot_powerlaw(pl,
                file.path(fig_dir,
                          paste0("freq-traces_powerlaw", suffix, "_bw.pdf")),
                col = rgb(0.3, 0.3, 0.3, 0.5))

  # Look at individual traces as examples
  print(tr[trace_varied == 5 & trace_length > 50, ])
  # --> every variant exists only once, of course

  invisible(list(alog = alog, tr = tr,
                 trace_length = trace_length,
                 trace_varied = trace_varied,
                 powerlaw = pl))
}

#--------------- (1) Read data ---------------

file_pre <- "results/eventlogs_pre-corona_cleaned.RData"
loaded <- load(file_pre)
assert_dat_loaded(loaded, file_pre)

#--------------- (2) Investigate variants (pre-corona data set) ---------------

variants_pre <- investigate_variants(dat)

#--------------- (3) Investigate variants (2019) ---------------

file_2019 <- "results/dataframes_case_2019.RData"
loaded <- load(file_2019)
# Guards against silently re-analysing the pre-corona `dat` from (1)
assert_dat_loaded(loaded, file_2019)

variants_2019 <- investigate_variants(dat, suffix = "_2019")