From b469ccfbcf383c34034f4d0c5361b186bc073846 Mon Sep 17 00:00:00 2001 From: nwickel Date: Tue, 30 Jan 2024 09:46:40 +0100 Subject: [PATCH] First round of cleaning up --- code/01_clustering.R | 314 ------------------ ...reprocessing_haum.R => 01_preprocessing.R} | 0 ...eate-petrinet.py => 02_create-petrinet.py} | 0 code/03_conformance-checking.py | 137 ++++++++ code/04_clustering_haum.R | 122 ------- code/{pm_infos-items.py => 04_infos-items.py} | 3 +- ...item_clustering.R => 05_item-clustering.R} | 0 ...infos-clusters.py => 06_infos-clusters.py} | 0 code/{04_modeling_haum.R => check-traces.R} | 35 ++ code/check_broken_trace.R | 28 -- code/check_traces.R | 101 ------ ...{03_haum_descriptives.R => descriptives.R} | 0 ..._current-analysis.R => descriptives_2nd.R} | 0 code/plots_processmaps.R | 39 +++ code/pm.py | 202 ----------- code/pm_conformance-checking.py | 266 --------------- code/pm_navigation-behavior.py | 126 ------- code/python_helpers.py | 33 +- code/trace-clustering.py | 174 ---------- 19 files changed, 242 insertions(+), 1338 deletions(-) delete mode 100644 code/01_clustering.R rename code/{01_preprocessing_haum.R => 01_preprocessing.R} (100%) rename code/{pm_create-petrinet.py => 02_create-petrinet.py} (100%) create mode 100644 code/03_conformance-checking.py delete mode 100644 code/04_clustering_haum.R rename code/{pm_infos-items.py => 04_infos-items.py} (99%) rename code/{item_clustering.R => 05_item-clustering.R} (100%) rename code/{pm_infos-clusters.py => 06_infos-clusters.py} (100%) rename code/{04_modeling_haum.R => check-traces.R} (87%) delete mode 100644 code/check_broken_trace.R delete mode 100644 code/check_traces.R rename code/{03_haum_descriptives.R => descriptives.R} (100%) rename code/{00_current-analysis.R => descriptives_2nd.R} (100%) create mode 100644 code/plots_processmaps.R delete mode 100644 code/pm.py delete mode 100644 code/pm_conformance-checking.py delete mode 100644 code/pm_navigation-behavior.py delete mode 100644 code/trace-clustering.py diff --git a/code/01_clustering.R b/code/01_clustering.R deleted file mode 100644 index 9afff5d..0000000 --- a/code/01_clustering.R +++ /dev/null @@ -1,314 +0,0 @@ -# 01_clustering.R -# -# content: (1) Read evaluation data -# (2) Clustering -# (3) Visualization with pictures -# (4) Read event logs -# (5) Frequency plot for clusters -# (6) DFGs for clusters -# -# input: results/eval_heuristics_artworks.csv -# results/eval_all-miners_complete.csv -# results/haum/event_logfiles_glossar_2023-11-03_17-46-28.csv -# output: ../figures/clustering_heuristics.pdf -# ../figures/clustering_heuristics.png -# ../figures/processmaps/dfg_complete_R.pdf -# ../figures/processmaps/dfg_complete_R.png -# ../figures/processmaps/dfg_cluster1_R.pdf -# ../figures/processmaps/dfg_cluster2_R.pdf -# ../figures/processmaps/dfg_cluster3_R.pdf -# ../figures/processmaps/dfg_cluster4_R.pdf -# -# last mod: 2023-12-21, NW - -# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code") - -#--------------- (1) Read evaluation data --------------- - -eval_heuristics <- read.table("results/eval_artworks_heuristics.csv", header = TRUE, - sep = ";", row.names = 1) -eval_inductive <- read.table("results/eval_artworks_inductive.csv", header = TRUE, - sep = ";", row.names = 1) -eval_alpha <- read.table("results/eval_artworks_alpha.csv", header = TRUE, - sep = ";", row.names = 1) -eval_ilp <- read.table("results/eval_artworks_ilp.csv", header = TRUE, - sep = ";", row.names = 1) - -#--------------- (2) Clustering --------------- - 
-set.seed(1607) - -# Heuristics Miner - -k1 <- kmeans(eval_heuristics, 4) - -colors <- c("#3CB4DC", "#78004B", "#91C86E", "#FF6900") - -plot(generalizability ~ precision, eval_heuristics, pch = 16, col = colors[k1$cluster]) - -## Scree plot - -ks <- 1:10 - -sse <- NULL -for (k in ks) sse <- c(sse, kmeans(eval_heuristics, k)$tot.withinss) - -plot(sse ~ ks, type = "l") - -# Inductive Miner - -k2 <- kmeans(eval_inductive, 4) - -plot(generalizability ~ precision, eval_inductive, pch = 16, col = colors[k2$cluster]) - -## Scree plot - -ks <- 1:10 - -sse <- NULL -for (k in ks) sse <- c(sse, kmeans(eval_inductive, k)$tot.withinss) - -plot(sse ~ ks, type = "l") - -# Alpha Miner - -k3 <- kmeans(eval_alpha, 4) - -par(mfrow = c(2, 2)) -plot(generalizability ~ precision, eval_alpha, pch = 16, col = colors[k3$cluster]) -plot(fitness ~ precision, eval_alpha, pch = 16, col = colors[k3$cluster]) -plot(fitness ~ generalizability, eval_alpha, pch = 16, col = colors[k3$cluster]) - -## Scree plot - -ks <- 1:10 - -sse <- NULL -for (k in ks) sse <- c(sse, kmeans(eval_alpha, k)$tot.withinss) - -plot(sse ~ ks, type = "l") - - -# ILP Miner - -k4 <- kmeans(eval_ilp, 4) - -plot(generalizability ~ precision, eval_ilp, pch = 16, col = colors[k4$cluster]) - -## Scree plot - -ks <- 1:10 - -sse <- NULL -for (k in ks) sse <- c(sse, kmeans(eval_ilp, k)$tot.withinss) - -plot(sse ~ ks, type = "l") - -#--------------- (3) Visualization with pictures --------------- - -library(png) -library(jpeg) -library(grid) - -## Heuristics Miner -#pdf("../figures/clustering_heuristics.pdf", height = 8, width = 8, pointsize = 10) -png("../figures/clustering_heuristics.png", units = "in", height = 8, width = 8, pointsize = 10, res = 300) -par(mai = c(.6,.6,.1,.1), mgp = c(2.4, 1, 0)) - -plot(generalizability ~ precision, eval_heuristics, type = "n", ylim = c(0.845, 0.98)) - -for (art in as.numeric(rownames(eval_heuristics))) { - - art_string <- sprintf("%03d", art) - - if (art == 125) { - - pic <- readJPEG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/", - art_string, "/", art_string, ".jpg")) - } else { - pic <- readPNG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/", - art_string, "/", art_string, ".png")) - } - - img <- as.raster(pic[,,1:3]) - - x <- eval_heuristics[rownames(eval_heuristics) == art, "precision"] - y <- eval_heuristics[rownames(eval_heuristics) == art, "generalizability"] - - points(x, y, col = colors[k1$cluster[as.character(art)]], cex = 8, pch = 15) - - rasterImage(img, - xleft = x - .002, - xright = x + .002, - ybottom = y - .004, - ytop = y + .004) - -} - -dev.off() - -## Inductive Miner -plot(generalizability ~ precision, eval_inductive, col = colors[k2$cluster], - cex = 8, pch = 15) - -for (art in as.numeric(rownames(eval_inductive))) { - - art_string <- sprintf("%03d", art) - - if (art == 125) { - - pic <- readJPEG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/", - art_string, "/", art_string, ".jpg")) - } else { - pic <- readPNG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/", - art_string, "/", art_string, ".png")) - } - - img <- as.raster(pic[,,1:3]) - - x <- eval_inductive[rownames(eval_inductive) == art, "precision"] - y <- eval_inductive[rownames(eval_inductive) == art, "generalizability"] - - rasterImage(img, - xleft = x - .001, - xright = x + .001, - ybottom = y - .002, - ytop = y + .002) - -} - -#--------------- (4) Read event logs --------------- - -dat <- read.table("results/haum/event_logfiles_glossar_2023-11-03_17-46-28.csv", - sep = ";", header = TRUE) 
-dat$date <- as.POSIXct(dat$date) -dat$date.start <- as.POSIXct(dat$date.start) -dat$date.stop <- as.POSIXct(dat$date.stop) -dat$artwork <- sprintf("%03d", dat$artwork) -dat$event <- factor(dat$event, levels = c("move", "flipCard", "openTopic", "openPopup")) - -dat$weekdays <- factor(weekdays(dat$date.start), - levels = c("Montag", "Dienstag", "Mittwoch", - "Donnerstag", "Freitag", "Samstag", - "Sonntag"), - labels = c("Monday", "Tuesday", "Wednesday", - "Thursday", "Friday", "Saturday", - "Sunday")) - - -#--------------- (5) Frequency plot for clusters --------------- - -# Only pre Corona -dat <- dat[dat$date < "2020-03-13",] - -counts_artwork <- table(dat$artwork) -dat_count <- as.data.frame(counts_artwork) -names(dat_count) <- c("artwork", "freq") -dat_count$cluster <- k1$cluster[order(as.numeric(names(k1$cluster)))] -dat_count$cluster <- factor(dat_count$cluster, levels = c(4, 2, 1, 3), labels = 4:1) -dat_count <- dat_count[order(dat_count$cluster, dat_count$freq, decreasing = TRUE), ] -dat_count$artwork <- factor(dat_count$artwork, levels = unique(dat_count$artwork)) - -png("../figures/counts_artworks_clusters.png", units = "in", height = 3.375, width = 12, pointsize = 10, res = 300) -par(mai = c(.6,.6,.1,.1), mgp = c(2.4, 1, 0)) -barplot(freq ~ artwork, dat_count, las = 2, ylim = c(0, 60000), - border = "white", ylab = "", - col = c("#FF6900", "#78004B", "#3CB4DC", "#91C86E" )[dat_count$cluster]) -dev.off() - -# compare to clusters - -png("../figures/pm_heuristics_clusters.png", units = "in", height = 3.375, width = 3.375, pointsize = 10, res = 300) -par(mai = c(.6,.6,.1,.1), mgp = c(2.4, 1, 0)) -plot(generalizability ~ precision, eval_heuristics, type = "n", ylim = c(0.845, 0.98)) -with(eval_heuristics, text(precision, generalizability, - rownames(eval_heuristics), - col = colors[k1$cluster])) -dev.off() - -#--------------- (6) DFGs for clusters --------------- - -library(bupaverse) - - -dat$start <- dat$date.start -dat$complete <- dat$date.stop - - -alog <- activitylog(dat, - case_id = "trace", - activity_id = "event", - resource_id = "artwork", - timestamps = c("start", "complete")) - - -alog_c1 <- filter_case_condition(alog, - artwork %in% dat_count[dat_count$cluster == 1, "artwork"]) -alog_c2 <- filter_case_condition(alog, - artwork %in% dat_count[dat_count$cluster == 2, "artwork"]) -alog_c3 <- filter_case_condition(alog, - artwork %in% dat_count[dat_count$cluster == 3, "artwork"]) -alog_c4 <- filter_case_condition(alog, - artwork %in% dat_count[dat_count$cluster == 4, "artwork"]) - -dfg_complete <- process_map(alog, - type_nodes = frequency("absolute", color_scale = "Greys"), - sec_nodes = frequency("relative"), - type_edges = frequency("absolute", color_edges = "#FF6900"), - sec_edges = frequency("relative"), - #rankdir = "TB", - render = FALSE) -export_map(dfg_complete, - file_name = "../figures/processmaps/dfg_complete_R.pdf", - file_type = "pdf", - title = "DFG complete") -export_map(dfg_complete, - file_name = "../figures/processmaps/dfg_complete_R.png", - file_type = "png") - -dfg_c1 <- process_map(alog_c1, - type_nodes = frequency("absolute", color_scale = "Greys"), - sec_nodes = frequency("relative"), - type_edges = frequency("absolute", color_edges = "#FF6900"), - sec_edges = frequency("relative"), - rankdir = "TB", - render = FALSE) -export_map(dfg_c1, - file_name = "../figures/processmaps/dfg_cluster1_R.pdf", - file_type = "pdf", - title = "DFG Cluster 1") -dfg_c2 <- process_map(alog_c2, - type_nodes = frequency("absolute", color_scale = "Greys"), - sec_nodes = 
frequency("relative"), - type_edges = frequency("absolute", color_edges = "#FF6900"), - sec_edges = frequency("relative"), - rankdir = "TB", - render = FALSE) -export_map(dfg_c2, - file_name = "../figures/processmaps/dfg_cluster2_R.pdf", - file_type = "pdf", - title = "DFG Cluster 2") -dfg_c3 <- process_map(alog_c3, - type_nodes = frequency("absolute", color_scale = "Greys"), - sec_nodes = frequency("relative"), - type_edges = frequency("absolute", color_edges = "#FF6900"), - sec_edges = frequency("relative"), - rankdir = "TB", - render = FALSE) -export_map(dfg_c3, - file_name = "../figures/processmaps/dfg_cluster3_R.pdf", - file_type = "pdf", - title = "DFG Cluster 3") -dfg_c4 <- process_map(alog_c4, - type_nodes = frequency("absolute", color_scale = "Greys"), - sec_nodes = frequency("relative"), - type_edges = frequency("absolute", color_edges = "#FF6900"), - sec_edges = frequency("relative"), - rankdir = "TB", - render = FALSE) -export_map(dfg_c4, - file_name = "../figures/processmaps/dfg_cluster4_R.pdf", - file_type = "pdf", - title = "DFG Cluster 4") - - diff --git a/code/01_preprocessing_haum.R b/code/01_preprocessing.R similarity index 100% rename from code/01_preprocessing_haum.R rename to code/01_preprocessing.R diff --git a/code/pm_create-petrinet.py b/code/02_create-petrinet.py similarity index 100% rename from code/pm_create-petrinet.py rename to code/02_create-petrinet.py diff --git a/code/03_conformance-checking.py b/code/03_conformance-checking.py new file mode 100644 index 0000000..05e0a66 --- /dev/null +++ b/code/03_conformance-checking.py @@ -0,0 +1,137 @@ +import pm4py + +import pandas as pd +import numpy as np + +from python_helpers import eval_pm, pn_infos_miner + +###### Load data and create event logs ###### + +dat = pd.read_csv("results/haum/event_logfiles_2024-01-18_09-58-52.csv", sep = ";") +#dat = dat[dat["date.start"] < "2020-03-13"] +# --> only pre corona (before artworks were updated) + +event_log = pm4py.format_dataframe(dat, case_id='path', activity_key='event', + timestamp_key='date.start') + +###### Descriptives of log data ###### + +# Distribution of events +event_log.event.value_counts() +event_log.event.value_counts(normalize = True) + +# Number of paths +len(event_log.path.unique()) + +# Number of variants +variants = pm4py.get_variants(event_log) +len(variants) + +sorted_variants = dict(sorted(variants.items(), key=lambda item: item[1], reverse = True)) +{k: sorted_variants[k] for k in list(sorted_variants)[:20]} + +filtered_log = event_log[event_log["event"] != "move"] +variants_no_move = pm4py.get_variants(filtered_log) +len(variants_no_move) +sorted_variants_no_move = dict(sorted(variants_no_move.items(), key=lambda item: item[1], reverse = True)) +{k: sorted_variants_no_move[k] for k in list(sorted_variants_no_move)[:20]} + +###### Read "conformative" Petri Net ###### + +basenet, initial_marking, final_marking = pm4py.read_pnml("results/haum/conformative_petrinet_con.pnml") + +# TBR +replayed_traces = pm4py.conformance_diagnostics_token_based_replay(event_log, basenet, initial_marking, final_marking) + +l1 = list() +l2 = list() +l3 = list() +l4 = list() +for i in range(len(replayed_traces)): + l1.append(replayed_traces[i]["remaining_tokens"]) + l2.append(replayed_traces[i]["missing_tokens"]) + l3.append(replayed_traces[i]["reached_marking"]) + l4.append(replayed_traces[i]["transitions_with_problems"]) + +set(l1) +x1 = np.array(l1) +index_broken = np.where(x1 == 1)[0].tolist() + +set(l3) +l4.count([]) + +[l3[i] for i in index_broken] +[l4[i] for i 
in index_broken]
+
+broken_traces = [replayed_traces[i] for i in index_broken]
+
+event_log[event_log['@@case_index'] == index_broken[0]].event
+event_log[event_log['@@case_index'] == index_broken[0]].path.unique().tolist()
+event_log[event_log['@@case_index'] == index_broken[0]].item.unique().tolist()
+event_log[event_log['@@case_index'] == index_broken[0]]["fileId.start"].unique().tolist()
+# --> logging error in raw file
+
+
+# Footprints
+from pm4py.algo.discovery.footprints import algorithm as footprints_discovery
+from pm4py.visualization.footprints import visualizer as fp_visualizer
+fp_log = footprints_discovery.apply(event_log, variant=footprints_discovery.Variants.ENTIRE_EVENT_LOG)
+fp_net = footprints_discovery.apply(basenet, initial_marking, final_marking)
+gviz = fp_visualizer.apply(fp_net, parameters={fp_visualizer.Variants.SINGLE.value.Parameters.FORMAT: "svg"})
+fp_visualizer.view(gviz)
+
+efg_graph = pm4py.discover_eventually_follows_graph(event_log)
+
+## Directly-follows graph
+dfg, start_activities, end_activities = pm4py.discover_dfg(event_log)
+pm4py.view_dfg(dfg, start_activities, end_activities)
+pm4py.save_vis_dfg(dfg, start_activities, end_activities, 'results/processmaps/dfg_complete_python.png')
+
+## Fitting different miners
+
+eval = pd.DataFrame(columns = ["fitness", "precision", "generalizability",
+                               "simplicity", "sound", "narcs", "ntrans",
+                               "nplaces", "nvariants", "mostfreq"])
+
+for miner in ["conformative", "alpha", "heuristics", "inductive", "ilp"]:
+    eval = pd.concat([eval, pn_infos_miner(event_log, miner)])
+
+## Export for all miners
+eval.to_csv("results/eval_all-miners_complete.csv", sep = ";")
+
+## Without broken trace
+event_log_clean = event_log[event_log['@@case_index'] != index_broken[0]]
+eval_clean = pd.DataFrame(columns = eval.columns)  # initialize before appending the miner results
+for miner in ["conformative", "alpha", "heuristics", "inductive", "ilp"]:
+    eval_clean = pd.concat([eval_clean, pn_infos_miner(event_log_clean, miner)])
+
+eval_clean.to_csv("results/eval_all-miners_clean.csv", sep = ";")
+
+# Export petri nets
+h_net, h_im, h_fm = pm4py.discover_petri_net_heuristics(event_log_clean)
+a_net, a_im, a_fm = pm4py.discover_petri_net_alpha(event_log_clean)
+i_net, i_im, i_fm = pm4py.discover_petri_net_inductive(event_log_clean)
+ilp_net, ilp_im, ilp_fm = pm4py.discover_petri_net_ilp(event_log_clean)
+
+pm4py.vis.save_vis_petri_net(h_net, h_im, h_fm, "results/processmaps/petrinet_heuristics_clean.png")
+pm4py.vis.save_vis_petri_net(a_net, a_im, a_fm, "results/processmaps/petrinet_alpha_clean.png")
+pm4py.vis.save_vis_petri_net(i_net, i_im, i_fm, "results/processmaps/petrinet_inductive_clean.png")
+pm4py.vis.save_vis_petri_net(ilp_net, ilp_im, ilp_fm, "results/processmaps/petrinet_ilp_clean.png")
+pm4py.vis.save_vis_petri_net(basenet, initial_marking, final_marking, "results/processmaps/petrinet_conformative.png")
+
+# convert to BPMN
+base_bpmn = pm4py.convert.convert_to_bpmn(basenet, initial_marking, final_marking)
+pm4py.vis.save_vis_bpmn(base_bpmn, "results/processmaps/bpmn_conformative.png")
+
+i_bpmn = pm4py.convert.convert_to_bpmn(i_net, i_im, i_fm)
+pm4py.vis.save_vis_bpmn(i_bpmn, "results/processmaps/bpmn_inductive_clean.png")
+
+ilp_bpmn = pm4py.convert.convert_to_bpmn(ilp_net, ilp_im, ilp_fm)
+pm4py.vis.save_vis_bpmn(ilp_bpmn, "results/processmaps/bpmn_ilp_clean.png")
+
+a_bpmn = pm4py.convert.convert_to_bpmn(a_net, a_im, a_fm)
+pm4py.vis.save_vis_bpmn(a_bpmn, "results/processmaps/bpmn_alpha_clean.png")
+
+h_bpmn = pm4py.convert.convert_to_bpmn(h_net, h_im, h_fm)
+pm4py.vis.save_vis_bpmn(h_bpmn, 
"results/processmaps/bpmn_heuristics_clean.png") + diff --git a/code/04_clustering_haum.R b/code/04_clustering_haum.R deleted file mode 100644 index 0755d5e..0000000 --- a/code/04_clustering_haum.R +++ /dev/null @@ -1,122 +0,0 @@ -# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code") - -# Read data - -dat0 <- read.table("../data/haum/event_logfiles_metadata_2023-09-23_01-31-30.csv", - sep = ";", header = TRUE) -dat0$date <- as.Date(dat0$date) -dat0$date.start <- as.POSIXct(dat0$date.start) -dat0$date.stop <- as.POSIXct(dat0$date.stop) -dat0$artwork <- sprintf("%03d", dat0$artwork) - -# Preprocess variables for clustering - -str(dat0) - -# year --> lubridate::year() -# duration --> numeric, remove NA -# topicNumber --> numeric, remove NA -# distance --> numeric, remove NA -# scaleSize --> numeric, remove NA -# rotationDegree --> numeric, remove NA -# holiday --> one/hot coding -# vacations --> one/hot coding -# artwork? --> one/hot coding (72 new variables) -# event? --> one/hot coding (4 new variables) - -dat <- dat0 - -dat$year <- lubridate::year(dat$date) -dat$holiday1 <- ifelse(is.na(dat$holiday), 0, 1) -dat$vacations1 <- ifelse(is.na(dat$vacations), 0, 1) -dat$topicNumber1 <- ifelse(is.na(dat$topicNumber), 0, dat$topicNumber) -dat$duration1 <- ifelse(is.na(dat$duration), 0, dat$duration) -dat$distance1 <- ifelse(is.na(dat$distance), 0, dat$distance) -dat$scaleSize1 <- ifelse(is.na(dat$scaleSize), 0, dat$scaleSize) -dat$rotationDegree1 <- ifelse(is.na(dat$rotationDegree), 0, dat$rotationDegree) - -for (artwork in unique(dat$artwork)) { - dat[[paste0("A", artwork)]] <- ifelse(dat$artwork == artwork, 1, 0) -} - -for (event in unique(dat$event)) { - dat[[event]] <- ifelse(dat$event == event, 1, 0) -} - -mat <- dat[, c("year", "duration", "topicNumber", "distance", "scaleSize", - "rotationDegree", "holiday1", "vacations1", - paste0("A", unique(dat$artwork)), "flipCard", "move", "openTopic", - "openPopup")] - - -mat1 <- dat[, c("year", "duration1", "topicNumber1", "distance1", "scaleSize1", - "rotationDegree1", "holiday1", "vacations1", - paste0("A", unique(dat$artwork)), "flipCard", "move", "openTopic", - "openPopup")] - -library(cluster) # for hierarchical clustering - -k1 <- kmeans(mat1, 2) -dat$kcluster <- k1$cluster - -mat1$artwork <- dat$artwork -datagg <- aggregate(. ~ artwork, mat1, mean) -aa <- datagg$artwork -datagg$artwork <- NULL - -k2 <- kmeans(datagg, 3) -datagg$cluster <- k2$cluster -datagg <- datagg[order(datagg$cluster), ] -aggregate(cbind(duration1, distance1, scaleSize1, rotationDegree1, - holiday1, vacations1) ~ cluster, datagg, mean) -# --> how to interpret this?? 
- - -# sample data for hierarchical clustering -n <- 200 -set.seed(1826) - -mat2 <- mat1[sample(nrow(mat1), n), ] -rownames(mat2) <- NULL -a1 <- agnes(mat2) - -d1 <- as.dendrogram(a1) -plot(d1) - -datagg$cluster <- NULL -rownames(datagg) <- NULL -a2 <- agnes(datagg) -d2 <- as.dendrogram(a2) -plot(d2) - -## Clustering for nominal features with nomclust package - -library(nomclust) - -dat <- as.data.frame(lapply(dat0[, c("folder", "holiday", "vacations", "artwork", - "event", "case", "trace")], as.factor)) -mat <- list() -mat$year <- as.numeric(dat$folder) -mat$holiday <- as.numeric(dat$holiday) -mat$vacations <- as.numeric(dat$vacations) -mat$artwork <- as.numeric(dat$artwork) -mat$event <- as.numeric(dat$event) -mat$case <- as.numeric(dat$case) -mat$trace <- as.numeric(dat$trace) - -mat$holiday <- ifelse(is.na(mat$holiday), 0, 1) -mat$vacations <- ifelse(is.na(mat$vacations), 0, 1) - -set.seed(1526) -ids <- sample(nrow(mat), 1000) -mat_small <- mat[ids, ] - -n1 <- nomclust(mat_small) - -n1$mem$clu_3 -dend.plot(n1, clusters = 3) - -mat_small[n1$mem$clu_6 == 6, ] - -cbind(mat_small[order(n1$mem$clu_3), ], n1$mem$clu_3[order(n1$mem$clu_3)]) - diff --git a/code/pm_infos-items.py b/code/04_infos-items.py similarity index 99% rename from code/pm_infos-items.py rename to code/04_infos-items.py index 6d1a910..3ff97fe 100644 --- a/code/pm_infos-items.py +++ b/code/04_infos-items.py @@ -1,4 +1,4 @@ -%reset +#%reset import pm4py import pandas as pd @@ -17,7 +17,6 @@ dat = dat[dat["path"] != 106098] # exclude broken trace log_path = pm4py.format_dataframe(dat, case_id = "path", activity_key = "event", timestamp_key = "date.start") - ###### Infos for items ###### mdi = pd.DataFrame(columns = ["fitness", "precision", "generalizability", diff --git a/code/item_clustering.R b/code/05_item-clustering.R similarity index 100% rename from code/item_clustering.R rename to code/05_item-clustering.R diff --git a/code/pm_infos-clusters.py b/code/06_infos-clusters.py similarity index 100% rename from code/pm_infos-clusters.py rename to code/06_infos-clusters.py diff --git a/code/04_modeling_haum.R b/code/check-traces.R similarity index 87% rename from code/04_modeling_haum.R rename to code/check-traces.R index 0aa798f..da2aea6 100644 --- a/code/04_modeling_haum.R +++ b/code/check-traces.R @@ -1,3 +1,5 @@ +# TODO: Clean me up! I am a mix of useful and useless!!! 
+ # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") library(bupaverse) @@ -259,3 +261,36 @@ process_map(alog, sec_edges = frequency("absolute"), rankdir = "LR") + + + + +# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") + +datraw <- read.table("results/haum/raw_logfiles_2024-01-18_09-58-52.csv", + header = TRUE, sep = ";") + + +# Read data + +datlogs <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv", + colClasses = c("character", "character", "POSIXct", + "POSIXct", "character", "integer", + "numeric", "character", "character", + rep("numeric", 3), "character", + "character", rep("numeric", 11), + "character", "character"), + sep = ";", header = TRUE) + +datlogs <- datlogs[order(datlogs$fileId.start, datlogs$date.start, datlogs$timeMs.start), ] + +artwork <- "176" +fileId <- c('2017_06_16-13_49_00.log', '2017_06_16-13_59_00.log') +path <- 106098 + +datraw[datraw$item == artwork & datraw$fileId %in% fileId, ] + +datlogs[datlogs$path == path, ] + + + diff --git a/code/check_broken_trace.R b/code/check_broken_trace.R deleted file mode 100644 index edc6d0e..0000000 --- a/code/check_broken_trace.R +++ /dev/null @@ -1,28 +0,0 @@ -# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") - -datraw <- read.table("results/haum/raw_logfiles_2024-01-18_09-58-52.csv", - header = TRUE, sep = ";") - - -# Read data - -datlogs <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv", - colClasses = c("character", "character", "POSIXct", - "POSIXct", "character", "integer", - "numeric", "character", "character", - rep("numeric", 3), "character", - "character", rep("numeric", 11), - "character", "character"), - sep = ";", header = TRUE) - -datlogs <- datlogs[order(datlogs$fileId.start, datlogs$date.start, datlogs$timeMs.start), ] - -artwork <- "176" -fileId <- c('2017_06_16-13_49_00.log', '2017_06_16-13_59_00.log') -path <- 106098 - -datraw[datraw$item == artwork & datraw$fileId %in% fileId, ] - -datlogs[datlogs$path == path, ] - - diff --git a/code/check_traces.R b/code/check_traces.R deleted file mode 100644 index 72c2364..0000000 --- a/code/check_traces.R +++ /dev/null @@ -1,101 +0,0 @@ -# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code") - -# Read data - -# dat <- read.table("results/haum/event_logfiles_metadata_2023-09-23_01-31-30.csv", -# sep = ";", header = TRUE) -dat <- read.table("results/haum/event_logfiles_small_metadata_2023-10-15_10-08-43.csv", - sep = ";", header = TRUE) -dat$date <- as.Date(dat$date) -dat$date.start <- as.POSIXct(dat$date.start) -dat$date.stop <- as.POSIXct(dat$date.stop) -dat$artwork <- sprintf("%03d", dat$artwork) - -library(bupaverse) - -names(dat)[names(dat) %in% c("date.start", "date.stop")] <- c("start", "complete") - -create_pdf <- function(trace, folder = "../figures/processmaps/") { - alog <- activitylog(dat[which(dat$trace == trace), ], - case_id = "trace", - activity_id = "event", - resource_id = "artwork", - timestamps = c("start", "complete")) - - map <- process_map(alog) - g <- DiagrammeR::grViz(map$x$diagram) |> DiagrammeRsvg::export_svg() |> charToRaw() - rsvg::rsvg_pdf(g, paste0(folder, trace, ".pdf")) -} - - - -find_trace <- function(trace) { - - alog <- activitylog(dat[which(dat$trace == trace), ], - case_id = "trace", - activity_id = "event", - resource_id = "artwork", - timestamps = c("start", "complete")) - - map <- process_map(alog) - d <- 
strsplit(map$x$diagram, "\n")[[1]] - o <- grep("^.{6}[[]label", d, value = TRUE) - p <- grep("^.{1}[1-6].->", d, value = TRUE) - num_ot <- gsub("^.{3}([1-6]).*", "\\1", grep("openTopic", o, value = TRUE)) - num_op <- gsub("^.{3}([1-6]).*", "\\1", grep("openPopup", o, value = TRUE)) - rel_path <- grep("^.{1}[2].->.[1-6]", p, value = TRUE) - rel_num <- gsub("^.{1}[2].->.([1-6]).*" , "\\1", rel_path) - num_fc <- gsub("^.{3}([1-6]).*", "\\1", grep("flipCard", o, value = TRUE)) - if (length(num_fc) > 0) { - rel_path_fc <- grep(paste0("^.{1}[", num_fc, "].->.[1-6]"), p, value = TRUE) - rel_num_fc <- gsub(paste0("^.{1}[", num_fc, "].->.([1-6]).*"), "\\1", rel_path_fc) - if (any(c(num_ot, num_op) %in% rel_num) | any(num_op == rel_num_fc)) { - trace - } - } else { - if (any(c(num_ot, num_op) %in% rel_num)) { - trace - } - } -} - -ctrace <- pbapply::pbsapply(unique(dat$trace), find_trace) - -unlist(ctrace) -length(unlist(ctrace)) - - -# create plots -for (trace in unlist(ctrace)) { - create_pdf(trace) -} - - - -alog <- activitylog(dat, - case_id = "trace", - activity_id = "event", - resource_id = "artwork", - timestamps = c("start", "complete")) - -map <- process_map(alog) -g <- DiagrammeR::grViz(map$x$diagram) |> DiagrammeRsvg::export_svg() |> charToRaw() -rsvg::rsvg_pdf(g, "../figures/processmap_haum.pdf", width = 10, height = 5) - -# adjusted colors -writeLines(map$x$diagram, "process_map_haum.gv") -g <- DiagrammeR::grViz("process_map_haum.gv") |> DiagrammeRsvg::export_svg() |> charToRaw() -rsvg::rsvg_pdf(g, "../figures/processmap_haum_adjusted.pdf", width = 10, height = 5) - - -alog <- activitylog(dat[!dat$trace %in% unlist(ctrace), ], - case_id = "trace", - activity_id = "event", - resource_id = "artwork", - timestamps = c("start", "complete")) - -map <- process_map(alog) -g <- DiagrammeR::grViz(map$x$diagram) |> DiagrammeRsvg::export_svg() |> charToRaw() -rsvg::rsvg_pdf(g, "../figures/processmap_haum_cleaned.pdf", width = 12, height = 5) - - diff --git a/code/03_haum_descriptives.R b/code/descriptives.R similarity index 100% rename from code/03_haum_descriptives.R rename to code/descriptives.R diff --git a/code/00_current-analysis.R b/code/descriptives_2nd.R similarity index 100% rename from code/00_current-analysis.R rename to code/descriptives_2nd.R diff --git a/code/plots_processmaps.R b/code/plots_processmaps.R new file mode 100644 index 0000000..9bab48e --- /dev/null +++ b/code/plots_processmaps.R @@ -0,0 +1,39 @@ +# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") + +library(bupaverse) + +dat0 <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv", + colClasses = c("character", "character", "POSIXct", + "POSIXct", "character", "integer", + "numeric", "character", "character", + rep("numeric", 3), "character", + "character", rep("numeric", 11), + "character", "character"), + sep = ";", header = TRUE) +dat0$event <- factor(dat0$event, levels = c("move", "flipCard", "openTopic", + "openPopup")) + +# Select data pre Corona +dat <- dat0[as.Date(dat0$date.start) < "2020-03-13", ] +dat <- dat[dat$path != 106098, ] + +dat$start <- dat$date.start +dat$complete <- dat$date.stop + +alog <- activitylog(dat, + case_id = "path", + activity_id = "event", + resource_id = "item", + timestamps = c("start", "complete")) + +dfg_complete <- process_map(alog, + type_nodes = frequency("absolute", color_scale = "Greys"), + sec_nodes = frequency("relative"), + type_edges = frequency("absolute", color_edges = "#FF6900"), + sec_edges = 
frequency("relative"), + #rankdir = "TB", + render = FALSE) +export_map(dfg_complete, + file_name = "results/processmaps/dfg_complete_R.png", + file_type = "png") + diff --git a/code/pm.py b/code/pm.py deleted file mode 100644 index 1e8166d..0000000 --- a/code/pm.py +++ /dev/null @@ -1,202 +0,0 @@ -#%% # needed for shortcuts to run properly in VSCode *eyeroll* -%reset - -import pm4py -#from pm4py.algo.evaluation.generalization import algorithm as generalization_evaluator -#from pm4py.algo.evaluation.simplicity import algorithm as simplicity_evaluator - -import pandas as pd -import numpy as np -import matplotlib.pyplot as plt -from sklearn.cluster import KMeans - -###### Load data and create event logs ###### - -dat = pd.read_csv("results/haum/event_logfiles_glossar_2023-11-03_17-46-28.csv", sep = ";") -dat = dat[dat.date < "2020-03-13"] -# --> only pre corona (before artworks were updated) - -event_log = pm4py.format_dataframe(dat, case_id='trace', activity_key='event', - timestamp_key='date.start') -# event_log = pm4py.format_dataframe(dat, case_id='trace', activity_key='event', -# timestamp_key='date.stop', start_timestamp_key='date.start') -event_log = event_log.rename(columns={'artwork': 'case:artwork'}) -#event_log = pm4py.convert_to_event_log(dat_log) # deprecated - -###### Process Mining - complete data set ##### - -def eval_pm(data, net, initial_marking, final_marking): - """Caculate fitness, precision, generalizability, and simplicity for petri net""" - fitness = pm4py.fitness_token_based_replay(data, net, initial_marking, final_marking) - #fitness = pm4py.fitness_alignments(data, net, initial_marking, final_marking) - precisison = pm4py.precision_token_based_replay(data, net, initial_marking, final_marking) - #precision = pm4py.precision_alignments(data, net, initial_marking, final_marking) - generalizability = pm4py.algo.evaluation.generalization.algorithm.apply(data, net, initial_marking, final_marking) - simplicity = pm4py.algo.evaluation.simplicity.algorithm.apply(net) - return [fitness['average_trace_fitness'], precisison, generalizability, simplicity] - - -## Directly-follows graph -dfg, start_activities, end_activities = pm4py.discover_dfg(event_log) -pm4py.view_dfg(dfg, start_activities, end_activities) -pm4py.save_vis_dfg(dfg, start_activities, end_activities, '../figures/processmaps/dfg_complete.png') - -## Heuristics Miner -net, im, fm = pm4py.discover_petri_net_heuristics(event_log) -h_eval = eval_pm(event_log, net, im, fm) -pm4py.vis.view_petri_net(net, im, fm) -pm4py.vis.save_vis_petri_net(net, im, fm, "../figures/processmaps/pn_heuristics_complete.png") - -is_sound = pm4py.check_soundness(net, im, fm) -is_sound[0] - -len(is_sound[1]["s_c_net"].arcs) -# 46 -len(is_sound[1]["s_c_net"].transitions) -# 23 -len(is_sound[1]["s_c_net"].places) -# 10 - - - -# decorated petri net -from pm4py.visualization.petri_net import visualizer as pn_visualizer -parameters = {pn_visualizer.Variants.FREQUENCY.value.Parameters.FORMAT: "png"} -gviz = pn_visualizer.apply(net, im, fm, parameters=parameters, variant=pn_visualizer.Variants.FREQUENCY, log=event_log) -pn_visualizer.save(gviz, "../figures/processmaps/pn_heuristics_complete_decorated.png") - -# convert to process tree -bpmn = pm4py.convert.convert_to_bpmn(net, im, fm) -pm4py.vis.view_bpmn(bpmn) - -## Alpha Miner -net, im, fm = pm4py.discover_petri_net_alpha(event_log) -a_eval = eval_pm(event_log, net, im, fm) -pm4py.vis.view_petri_net(net, im, fm) -pm4py.vis.save_vis_petri_net(net, im, fm, 
"../figures/processmaps/pn_alpha_complete.png") - -is_sound = pm4py.check_soundness(net, im, fm) -is_sound[0] - -len(is_sound[1]["s_c_net"].arcs) -len(is_sound[1]["s_c_net"].transitions) -len(is_sound[1]["s_c_net"].places) - -## Inductive Miner -net, im, fm = pm4py.discover_petri_net_inductive(event_log) -i_eval = eval_pm(event_log, net, im, fm) -pm4py.vis.view_petri_net(net, im, fm) -pm4py.vis.save_vis_petri_net(net, im, fm, "../figures/processmaps/pn_induction_complete.png") - -# as process tree (does not work for heuristics miner!) -pt = pm4py.discover_process_tree_inductive(event_log) -pm4py.vis.view_process_tree(pt) - -is_sound = pm4py.check_soundness(net, im, fm) -is_sound[0] - -## ILP Miner -net, im, fm = pm4py.discover_petri_net_ilp(event_log) -ilp_eval = eval_pm(event_log, net, im, fm) -pm4py.vis.view_petri_net(net, im, fm) -pm4py.vis.save_vis_petri_net(net, im, fm, "../figures/processmaps/pn_ilp_complete.png") - -is_sound = pm4py.check_soundness(net, im, fm) -is_sound[0] - -eval = pd.DataFrame(np.row_stack([h_eval, a_eval, i_eval, ilp_eval])) -eval.columns = ["fitness", "precision", "generalizability", "simplicity"] -eval.index = ["heuristics", "alpha", "inductive", "ilp"] -eval - -eval.to_csv("results/eval_all-miners_complete.csv", sep=";") - - -###### Process Mining - individual artworks ###### - -def pm_artworks(miner): - - retval1 = np.empty((len(event_log["case:artwork"].unique()), 4)) - retval2 = np.empty((len(event_log["case:artwork"].unique()), 4)) - - if miner == "heuristics": - net, im, fm = pm4py.discover_petri_net_heuristics(event_log) - elif miner == "inductive": - net, im, fm = pm4py.discover_petri_net_inductive(event_log) - elif miner == "alpha": - net, im, fm = pm4py.discover_petri_net_alpha(event_log) - elif miner == "ilp": - net, im, fm = pm4py.discover_petri_net_ilp(event_log) - - for i in range(len(event_log["case:artwork"].unique())): - artwork = event_log["case:artwork"].unique()[i] - subdata = pm4py.filter_event_attribute_values(event_log, "case:artwork", - [artwork], - level="case", retain=True) - if miner == "heuristics": - subnet, subim, subfm = pm4py.discover_petri_net_heuristics(subdata) - elif miner == "inductive": - subnet, subim, subfm = pm4py.discover_petri_net_inductive(subdata) - elif miner == "alpha": - subnet, subim, subfm = pm4py.discover_petri_net_alpha(subdata) - elif miner == "ilp": - subnet, subim, subfm = pm4py.discover_petri_net_ilp(subdata) - #pm4py.save_vis_petri_net(subnet, subim, subfm, - # "../figures/processmaps/artworks/petrinet_" + miner + "_" + str(artwork).zfill(3) + ".png") - retval1[i] = eval_pm(subdata, net, im, fm) - retval2[i] = eval_pm(subdata, subnet, subim, subfm) - - retval1 = pd.DataFrame(retval1) - retval1.columns = ["fitness", "precision", "generalizability", "simplicity"] - retval1.index = event_log["case:artwork"].unique() - retval1.insert(0, "nettype", "alldata") - retval2 = pd.DataFrame(retval2) - retval2.columns = ["fitness", "precision", "generalizability", "simplicity"] - retval2.index = event_log["case:artwork"].unique() - retval2.insert(0, "nettype", "subdata") - return pd.concat([retval1, retval2]) - - -for miner in ["heuristics", "inductive", "alpha", "ilp"]: - eval_art = pm_artworks(miner = miner) - eval_art.to_csv("results/eval_artworks_" + miner + ".csv", sep=";") - -eval_art = pm_artworks(miner = "inductive") - -##### Clustering ###### - -## KMeans - -#eval_artworks = eval_art[eval_art.nettype == "alldata"].iloc[:,range(1,5)] -eval_artworks = eval_art[eval_art.nettype == "subdata"].iloc[:,range(1,5)] 
- -kmeans = KMeans(n_clusters=4, max_iter=1000).fit(eval_artworks) - -#from sklearn.manifold import MDS -#coord = pd.DataFrame(MDS(normalized_stress='auto').fit_transform(eval_artworks)) - -coord = eval_artworks -coord["clusters"] = kmeans.labels_ - -for i in coord.clusters.unique(): - #plt.scatter(coord[coord.clusters == i].iloc[:,0], coord[coord.clusters == i].iloc[:,1], - plt.scatter(coord[coord.clusters == i].iloc[:,1], coord[coord.clusters == i].iloc[:,2], - #plt.scatter(coord[coord.clusters == i].iloc[:,2], coord[coord.clusters == i].iloc[:,4], - label = i) -plt.legend() -plt.show() - -### Scree plot - -sse = {} -for k in range(1, 10): - kmeans = KMeans(n_clusters=k, max_iter=1000).fit(eval_artworks[["precision", "generalizability"]]) - #data["clusters"] = kmeans.labels_ - #print(data["clusters"]) - sse[k] = kmeans.inertia_ # Inertia: Sum of distances of samples to their closest cluster center -plt.figure() -plt.plot(list(sse.keys()), list(sse.values())) -plt.xlabel("Number of clusters") -plt.ylabel("SSE") -plt.show() - diff --git a/code/pm_conformance-checking.py b/code/pm_conformance-checking.py deleted file mode 100644 index ac677e2..0000000 --- a/code/pm_conformance-checking.py +++ /dev/null @@ -1,266 +0,0 @@ -%reset - -import pm4py - -import pandas as pd -import numpy as np -import matplotlib.pyplot as plt - -###### Load data and create event logs ###### - -dat = pd.read_csv("results/haum/event_logfiles_2024-01-18_09-58-52.csv", sep = ";") -dat = dat[dat["date.start"] < "2020-03-13"] -# --> only pre corona (before artworks were updated) - -event_log = pm4py.format_dataframe(dat, case_id='path', activity_key='event', - timestamp_key='date.start') - -###### Descrptives of log data ###### - -# Distribution of events -event_log.event.value_counts() -event_log.event.value_counts(normalize=True) - -# Number of paths -len(event_log.path.unique()) - -# Number of variants -variants = pm4py.get_variants(event_log) -len(variants) - -sorted_variants = dict(sorted(variants.items(), key=lambda item: item[1], reverse = True)) -{k: sorted_variants[k] for k in list(sorted_variants)[:20]} - -filtered_log = event_log[event_log["event"] != "move"] -variants_no_move = pm4py.get_variants(filtered_log) -len(variants_no_move) -sorted_variants_no_move = dict(sorted(variants_no_move.items(), key=lambda item: item[1], reverse = True)) -{k: sorted_variants_no_move[k] for k in list(sorted_variants_no_move)[:20]} - -###### Read "conformative" Petri Net ###### - -basenet, initial_marking, final_marking = pm4py.read_pnml("results/conformative_petrinet_con.pnml") - -def eval_pm(data, net, initial_marking, final_marking): - """Caculate fitness, precision, generalizability, and simplicity for petri net""" - fitness = pm4py.fitness_token_based_replay(data, net, initial_marking, final_marking) - precisison = pm4py.precision_token_based_replay(data, net, initial_marking, final_marking) - generalizability = pm4py.algo.evaluation.generalization.algorithm.apply(data, net, - initial_marking, final_marking) - simplicity = pm4py.algo.evaluation.simplicity.algorithm.apply(net) - return [fitness['average_trace_fitness'], precisison, generalizability, simplicity] - -baseline_eval = eval_pm(event_log, basenet, initial_marking, final_marking) - -# TBR -replayed_traces = pm4py.conformance_diagnostics_token_based_replay(event_log, basenet, initial_marking, final_marking) - -l1 = list() -l2 = list() -l3 = list() -l4 = list() -for i in range(len(replayed_traces)): - l1.append(replayed_traces[i]["remaining_tokens"]) - 
l2.append(replayed_traces[i]["missing_tokens"]) - l3.append(replayed_traces[i]["reached_marking"]) - l4.append(replayed_traces[i]["transitions_with_problems"]) - -set(l1) -x1 = np.array(l1) -index_broken = np.where(x1 == 1)[0].tolist() - -set(l3) -l4.count([]) - -[l3[i] for i in index_broken] -[l4[i] for i in index_broken] - -broken_traces = [replayed_traces[i] for i in index_broken] - -event_log[event_log['@@case_index'] == index_broken].event -event_log[event_log['@@case_index'] == index_broken].path.unique().tolist() -event_log[event_log['@@case_index'] == index_broken].item.unique().tolist() -event_log[event_log['@@case_index'] == index_broken]["fileId.start"].unique().tolist() -# --> logging error in raw file - - -# Footprints -from pm4py.algo.discovery.footprints import algorithm as footprints_discovery -from pm4py.visualization.footprints import visualizer as fp_visualizer -fp_log = footprints_discovery.apply(event_log, variant=footprints_discovery.Variants.ENTIRE_EVENT_LOG) -fp_net = footprints_discovery.apply(basenet, initial_marking, final_marking) -gviz = fp_visualizer.apply(fp_net, parameters={fp_visualizer.Variants.SINGLE.value.Parameters.FORMAT: "svg"}) -fp_visualizer.view(gviz) - -pm4py.vis.view_petri_net(basenet, initial_marking, final_marking) -is_sound = pm4py.check_soundness(basenet, initial_marking, final_marking) -baseline_eval.append(is_sound[0]) -baseline_eval.append(len(basenet.arcs)) -baseline_eval.append(len(basenet.transitions)) -baseline_eval.append(len(basenet.places)) - -efg_graph = pm4py.discover_eventually_follows_graph(event_log) - -## Directly-follows graph -dfg, start_activities, end_activities = pm4py.discover_dfg(event_log) -pm4py.view_dfg(dfg, start_activities, end_activities) -pm4py.save_vis_dfg(dfg, start_activities, end_activities, '../figures/processmaps/dfg_complete.png') - -## Fitting different miners -### Heuristics Miner -h_net, im, fm = pm4py.discover_petri_net_heuristics(event_log) -h_eval = eval_pm(event_log, h_net, im, fm) -is_sound = pm4py.check_soundness(h_net, im, fm) -h_eval.append(is_sound[0]) -h_eval.append(len(h_net.arcs)) -h_eval.append(len(h_net.transitions)) -h_eval.append(len(h_net.places)) - -## Alpha Miner -a_net, im, fm = pm4py.discover_petri_net_alpha(event_log) -a_eval = eval_pm(event_log, a_net, im, fm) -is_sound = pm4py.check_soundness(a_net, im, fm) -a_eval.append(is_sound[0]) -a_eval.append(len(a_net.arcs)) -a_eval.append(len(a_net.transitions)) -a_eval.append(len(a_net.places)) - -## Inductive Miner -i_net, im, fm = pm4py.discover_petri_net_inductive(event_log) -i_eval = eval_pm(event_log, i_net, im, fm) -is_sound = pm4py.check_soundness(i_net, im, fm) -i_eval.append(is_sound[0]) -i_eval.append(len(i_net.arcs)) -i_eval.append(len(i_net.transitions)) -i_eval.append(len(i_net.places)) - -## ILP Miner -ilp_net, im, fm = pm4py.discover_petri_net_ilp(event_log) -ilp_eval = eval_pm(event_log, ilp_net, im, fm) -is_sound = pm4py.check_soundness(ilp_net, im, fm) -ilp_eval.append(is_sound[0]) -ilp_eval.append(len(ilp_net.arcs)) -ilp_eval.append(len(ilp_net.transitions)) -ilp_eval.append(len(ilp_net.places)) - -## Export for all miners -eval = pd.DataFrame(np.row_stack([baseline_eval, h_eval, a_eval, i_eval, ilp_eval])) -eval.columns = ["fitness", "precision", "generalizability", "simplicity", - "sound", "narcs", "ntrans", "nplaces"] -eval.index = ["conformative", "heuristics", "alpha", "inductive", "ilp"] -eval - -eval.to_csv("results/eval_all-miners_complete.csv", sep=" ") - -## Without broken trace -event_log_clean = 
event_log[event_log['@@case_index'] != index_broken[0]] -h_net, h_im, h_fm = pm4py.discover_petri_net_heuristics(event_log_clean) -a_net, a_im, a_fm = pm4py.discover_petri_net_alpha(event_log_clean) -i_net, i_im, i_fm = pm4py.discover_petri_net_inductive(event_log_clean) -ilp_net, ilp_im, ilp_fm = pm4py.discover_petri_net_ilp(event_log_clean) - -baseline_eval = eval_pm(event_log_clean, basenet, initial_marking, final_marking) -is_sound = pm4py.check_soundness(basenet, initial_marking, final_marking) -baseline_eval.append(is_sound[0]) -baseline_eval.append(len(basenet.arcs)) -baseline_eval.append(len(basenet.transitions)) -baseline_eval.append(len(basenet.places)) - -h_eval = eval_pm(event_log_clean, h_net, h_im, h_fm) -is_sound = pm4py.check_soundness(h_net, h_im, h_fm) -h_eval.append(is_sound[0]) -h_eval.append(len(h_net.arcs)) -h_eval.append(len(h_net.transitions)) -h_eval.append(len(h_net.places)) - -a_eval = eval_pm(event_log_clean, a_net, a_im, a_fm) -is_sound = pm4py.check_soundness(a_net, a_im, a_fm) -a_eval.append(is_sound[0]) -a_eval.append(len(a_net.arcs)) -a_eval.append(len(a_net.transitions)) -a_eval.append(len(a_net.places)) - -i_eval = eval_pm(event_log_clean, i_net, i_im, i_fm) -is_sound = pm4py.check_soundness(i_net, i_im, i_fm) -i_eval.append(is_sound[0]) -i_eval.append(len(i_net.arcs)) -i_eval.append(len(i_net.transitions)) -i_eval.append(len(i_net.places)) - -ilp_eval = eval_pm(event_log_clean, ilp_net, ilp_im, ilp_fm) -is_sound = pm4py.check_soundness(ilp_net, ilp_im, ilp_fm) -ilp_eval.append(is_sound[0]) -ilp_eval.append(len(ilp_net.arcs)) -ilp_eval.append(len(ilp_net.transitions)) -ilp_eval.append(len(ilp_net.places)) - -eval = pd.DataFrame(np.row_stack([baseline_eval, h_eval, a_eval, i_eval, ilp_eval])) -eval.columns = ["fitness", "precision", "generalizability", "simplicity", - "sound", "narcs", "ntrans", "nplaces"] -eval.index = ["conformative", "heuristics", "alpha", "inductive", "ilp"] -eval - -eval.to_csv("results/eval_all-miners_clean.csv", sep=" ") - -# Export petri nets -pm4py.vis.save_vis_petri_net(h_net, h_im, h_fm, "results/processmaps/petrinet_heuristics_clean.png") -pm4py.vis.save_vis_petri_net(a_net, a_im, a_fm, "results/processmaps/petrinet_alpha_clean.png") -pm4py.vis.save_vis_petri_net(i_net, i_im, i_fm, "results/processmaps/petrinet_inductive_clean.png") -pm4py.vis.save_vis_petri_net(ilp_net, ilp_im, ilp_fm, "results/processmaps/petrinet_ilp_clean.png") -pm4py.vis.save_vis_petri_net(basenet, initial_marking, final_marking, "results/processmaps/petrinet_conformative.png") - -# convert to BPMN -base_bpmn = pm4py.convert.convert_to_bpmn(basenet, initial_marking, final_marking) -pm4py.vis.save_vis_bpmn(base_bpmn, "results/processmaps/bpmn_conformative.png") - -i_bpmn = pm4py.convert.convert_to_bpmn(i_net, i_im, i_fm) -pm4py.vis.save_vis_bpmn(i_bpmn, "results/processmaps/bpmn_inductive_clean.png") - -ilp_bpmn = pm4py.convert.convert_to_bpmn(ilp_net, ilp_im, ilp_fm) -pm4py.vis.save_vis_bpmn(ilp_bpmn, "results/processmaps/bpmn_ilp_clean.png") - -a_bpmn = pm4py.convert.convert_to_bpmn(a_net, a_im, a_fm) -pm4py.vis.save_vis_bpmn(a_bpmn, "results/processmaps/bpmn_alpha_clean.png") - -h_bpmn = pm4py.convert.convert_to_bpmn(h_net, h_im, h_fm) -pm4py.vis.save_vis_bpmn(h_bpmn, "results/processmaps/bpmn_heuristics_clean.png") - -###### Process Mining - individual artworks ###### - -def pm_artworks(miner): - - retval1 = np.empty((len(event_log["item"].unique()), 4)) - retval2 = np.empty((len(event_log["item"].unique()), 4)) - - for i in 
range(len(event_log["item"].unique())): - artwork = event_log["item"].unique()[i] - subdata = pm4py.filter_event_attribute_values(event_log, "item", - [artwork], - level="case", retain=True) - if miner == "heuristics": - subnet, subim, subfm = pm4py.discover_petri_net_heuristics(subdata) - elif miner == "inductive": - subnet, subim, subfm = pm4py.discover_petri_net_inductive(subdata) - elif miner == "alpha": - subnet, subim, subfm = pm4py.discover_petri_net_alpha(subdata) - elif miner == "ilp": - subnet, subim, subfm = pm4py.discover_petri_net_ilp(subdata) - #pm4py.save_vis_petri_net(subnet, subim, subfm, - # "results/processmaps/artworks/petrinet_" + miner + "_" + str(artwork).zfill(3) + ".png") - retval1[i] = eval_pm(subdata, basenet, initial_marking, final_marking) - retval2[i] = eval_pm(subdata, subnet, subim, subfm) - - retval1 = pd.DataFrame(retval1) - retval1.columns = ["fitness", "precision", "generalizability", "simplicity"] - retval1.index = event_log["item"].unique() - retval1.insert(0, "nettype", "alldata") - retval2 = pd.DataFrame(retval2) - retval2.columns = ["fitness", "precision", "generalizability", "simplicity"] - retval2.index = event_log["item"].unique() - retval2.insert(0, "nettype", "subdata") - return pd.concat([retval1, retval2]) - - -for miner in ["heuristics", "inductive", "alpha", "ilp"]: - eval_art = pm_artworks(miner = miner) - eval_art.to_csv("results/eval_artworks_" + miner + ".csv", sep=";") diff --git a/code/pm_navigation-behavior.py b/code/pm_navigation-behavior.py deleted file mode 100644 index 7a95302..0000000 --- a/code/pm_navigation-behavior.py +++ /dev/null @@ -1,126 +0,0 @@ -%reset - -import pm4py - -import pandas as pd -import numpy as np -import matplotlib.pyplot as plt -from pm4py.visualization.petri_net import visualizer as pn_visualizer -parameters = {pn_visualizer.Variants.FREQUENCY.value.Parameters.FORMAT: "png"} - -###### Load data and create event logs ###### - -dat = pd.read_csv("results/haum/event_logfiles_2024-01-18_09-58-52.csv", sep = ";") -dat = dat[dat["date.start"] < "2020-03-13"] -dat = dat[dat["path"] != 106098] # exclude broken trace -# --> only pre corona (before artworks were updated) - -event_log = pm4py.format_dataframe(dat, case_id='case', activity_key='event', - timestamp_key='date.start') - -event_log.event.value_counts() -event_log.event.value_counts(normalize=True) - -dfg, start_activities, end_activities = pm4py.discover_dfg(event_log) -pm4py.view_dfg(dfg, start_activities, end_activities) - -#filtered_log = pm4py.filter_event_attribute_values(event_log, 'item', [80]) - -net, im, fm = pm4py.discover_petri_net_inductive(event_log) -pm4py.vis.view_petri_net(net, im, fm) - -pm4py.vis.view_petri_net(net, im, fm) -gviz = pn_visualizer.apply(net, im, fm, parameters=parameters, - variant=pn_visualizer.Variants.FREQUENCY, - log=event_log) -pn_visualizer.view(gviz) - -bpmn = pm4py.convert.convert_to_bpmn(net, im, fm) -pm4py.vis.view_bpmn(bpmn) - -net2, im2, fm2 = pm4py.discover_petri_net_inductive(event_log, noise_threshold=0.1) -pm4py.vis.view_petri_net(net2, im2, fm2) - -def eval_pm(data, net, initial_marking, final_marking): - """Caculate fitness, precision, generalizability, and simplicity for petri net""" - fitness = pm4py.fitness_token_based_replay(data, net, initial_marking, final_marking) - precisison = pm4py.precision_token_based_replay(data, net, initial_marking, final_marking) - #generalizability = pm4py.algo.evaluation.generalization.algorithm.apply(data, net, - # initial_marking, final_marking) - simplicity = 
pm4py.algo.evaluation.simplicity.algorithm.apply(net) - #return [fitness['average_trace_fitness'], precisison, generalizability, simplicity] - return [fitness['average_trace_fitness'], precisison, simplicity] - -eval = eval_pm(event_log, net, im, fm) -eval2 = eval_pm(event_log, net2, im2, fm2) - -len(net.places) -len(net.transitions) -len(net.arcs) - -# Number of cases -len(event_log.case.unique()) - -# Number of variants -variants = pm4py.get_variants(event_log) -len(variants) - -sorted_variants = dict(sorted(variants.items(), key=lambda item: item[1], reverse = True)) -{k: sorted_variants[k] for k in list(sorted_variants)[:20]} - -filtered_log = event_log[event_log["event"] != "move"] -variants_no_move = pm4py.get_variants(filtered_log) -len(variants_no_move) -sorted_variants_no_move = dict(sorted(variants_no_move.items(), key=lambda item: item[1], reverse = True)) -{k: sorted_variants_no_move[k] for k in list(sorted_variants_no_move)[:20]} - - - - - - -###### Navigation behavior for case ###### - -log_case = pm4py.format_dataframe(dat, case_id = "case", activity_key = "item", - timestamp_key = "date.start") -log_case = log_case.merge(tmp, on = "item", how = "left") - -#filtered_log = pm4py.filter_event_attribute_values(log_case, "kcluster", [3]) -filtered_log = log_case[log_case.hcluster == 1] - -net, im, fm = pm4py.discover_dfg(filtered_log) -pm4py.vis.view_dfg(net, im, fm) - - -net, im, fm = pm4py.discover_petri_net_inductive(filtered_log) -pm4py.vis.view_petri_net(net, im, fm) - -tree = pm4py.discovery.discover_process_tree_inductive(filtered_log) -pm4py.vis.view_process_tree(tree) - - -datcase = dat[~dat.duplicated(["case", "path", "item"])] -datcase = datcase[["case", "path", "event", "item", "date.start"]] -datcase = datcase.reset_index().drop("index", axis = 1) -#datcase = pd.concat([datcase, pd.get_dummies(datcase["item"], dtype = "int")], axis = 1) - -datcase["duration"] = dat.groupby("path")["duration"].mean().tolist() -datcase["distance"] = dat.groupby("path")["distance"].mean().tolist() -datcase["scaleSize"] = dat.groupby("path")["scaleSize"].mean().tolist() -datcase["rotationDegree"] = dat.groupby("path")["rotationDegree"].mean().tolist() - -datcase["item"] = [str(item).zfill(3) for item in datcase.item] -datcase = datcase.merge(xy[["item", "hcluster"]], on = "item", how = "left") - -log_case = pm4py.format_dataframe(dat, case_id = "case", activity_key = "item", - timestamp_key = "date.start") - -net, im, fm = pm4py.discover_dfg(log_case) -pm4py.vis.view_dfg(net, im, fm) -# don't know if this will eventually finish? 
-
-net, im, fm = pm4py.discover_dfg(log_case[log_case.hcluster == 1])
-pm4py.vis.view_dfg(net, im, fm)
-
-
-
diff --git a/code/python_helpers.py b/code/python_helpers.py
index 9c7de96..07e510d 100644
--- a/code/python_helpers.py
+++ b/code/python_helpers.py
@@ -18,13 +18,40 @@ def pn_infos(log, colname, filter):
     filtered_log = pm4py.filter_event_attribute_values(log, colname, [filter])
     net, im, fm = pm4py.discover_petri_net_inductive(filtered_log)
-    eval = eval_pm(filtered_log, net, im, fm)
+
+    eval = eval_append(filtered_log, net, im, fm)
+    eval.index = [str(filter).zfill(3)]
+    return eval
+
+
+def pn_infos_miner(log, miner):
+    """Create data frame with relevant infos for petri nets created with
+    different miners"""
+
+    if miner == "alpha":
+        net, im, fm = pm4py.discover_petri_net_alpha(log)
+    elif miner == "heuristics":
+        net, im, fm = pm4py.discover_petri_net_heuristics(log)
+    elif miner == "ilp":
+        net, im, fm = pm4py.discover_petri_net_ilp(log)
+    elif miner == "inductive":
+        net, im, fm = pm4py.discover_petri_net_inductive(log)
+    elif miner == "conformative":
+        net, im, fm = pm4py.read_pnml("results/haum/conformative_petrinet_con.pnml")
+
+    eval = eval_append(log, net, im, fm)
+    eval.index = [miner]
+    return eval
+
+def eval_append(log, net, im, fm):
+
+    eval = eval_pm(log, net, im, fm)
     is_sound = pm4py.check_soundness(net, im, fm)
     eval.append(is_sound[0])
     eval.append(len(net.arcs))
     eval.append(len(net.transitions))
     eval.append(len(net.places))
-    variants = pm4py.get_variants(filtered_log)
+    variants = pm4py.get_variants(log)
     eval.append(len(variants))
     sorted_variants = dict(sorted(variants.items(), key=lambda item: item[1], reverse = True))
@@ -33,5 +60,5 @@ def pn_infos(log, colname, filter):
     eval = pd.DataFrame(eval).T
     eval.columns = ["fitness", "precision", "generalizability", "simplicity",
                     "sound", "narcs", "ntrans", "nplaces", "nvariants", "mostfreq"]
-    eval.index = [str(filter).zfill(3)]
     return eval
+
diff --git a/code/trace-clustering.py b/code/trace-clustering.py
deleted file mode 100644
index c42dab6..0000000
--- a/code/trace-clustering.py
+++ /dev/null
@@ -1,174 +0,0 @@
-from sklearn.cluster import KMeans
-
-import matplotlib.pyplot as plt
-
-
-##### Clustering ######
-
-## KMeans
-
-#eval_artworks = eval_art[eval_art.nettype == "alldata"].iloc[:,range(1,5)]
-eval_artworks = eval_art[eval_art.nettype == "subdata"].iloc[:,range(1,5)]
-
-kmeans = KMeans(n_clusters=4, max_iter=1000).fit(eval_artworks)
-
-#from sklearn.manifold import MDS
-#coord = pd.DataFrame(MDS(normalized_stress='auto').fit_transform(eval_artworks))
-
-coord = eval_artworks
-coord["clusters"] = kmeans.labels_
-
-for i in coord.clusters.unique():
-    #plt.scatter(coord[coord.clusters == i].iloc[:,0], coord[coord.clusters == i].iloc[:,1],
-    plt.scatter(coord[coord.clusters == i].iloc[:,1], coord[coord.clusters == i].iloc[:,2],
-    #plt.scatter(coord[coord.clusters == i].iloc[:,2], coord[coord.clusters == i].iloc[:,4],
-                label = i)
-plt.legend()
-plt.show()
-
-### Scree plot
-
-sse = {}
-for k in range(1, 10):
-    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(eval_artworks[["precision", "generalizability"]])
-    #data["clusters"] = kmeans.labels_
-    #print(data["clusters"])
-    sse[k] = kmeans.inertia_ # Inertia: Sum of distances of samples to their closest cluster center
-plt.figure()
-plt.plot(list(sse.keys()), list(sse.values()))
-plt.xlabel("Number of clusters")
-plt.ylabel("SSE")
-plt.show()
-
-
-
-
-### TMP
-datitem = dat.groupby("item")[["duration", "distance",
-                               "scaleSize", "rotationDegree"]].mean()
-
-def 
length_path(data): - x = data.path - return len(x.unique()) -def length_case(data): - x = data.case - return len(x.unique()) -def length_topic(data): - x = data.topic.dropna() - return len(x.unique()) - -datitem["npaths"] = dat.groupby(["item"]).apply(length_path) -datitem["ncases"] = dat.groupby(["item"]).apply(length_case) -datitem["ntopics"] = dat.groupby(["item"]).apply(length_topic) - -datitem.index = datitem.index.astype(str).str.rjust(3, "0") -datitem = datitem.sort_index() -datitem.index = mdi.index - -datitem = pd.concat([mdi, datitem], axis = 1) - - - - - -###### Find clusters ###### - -myseed = 1420 - -mat = datitem.drop(["fitness", "sound", "mostfreq"], axis = 1) -mat = StandardScaler().fit_transform(mat) - -xy = pd.DataFrame(MDS(normalized_stress = 'auto', random_state = myseed).fit_transform(mat)) -xy.index = datitem.index - -### K-Means clustering ### - -kmeans = KMeans(n_clusters = 6, max_iter = 1000, random_state = myseed).fit(mat) -xy["kcluster"] = kmeans.labels_ - -for i in xy.kcluster.unique(): - plt.scatter(xy[xy.kcluster == i].iloc[:,0], xy[xy.kcluster == i].iloc[:,1], label = i) - for j, txt in enumerate(xy.index[xy.kcluster == i]): - plt.annotate(txt.split("_")[1], (xy[xy.kcluster == i].iloc[j,0], xy[xy.kcluster == i].iloc[j,1])) -plt.legend() -plt.show() - -xy.kcluster.value_counts() - -# Scree plot -sse = {} -for k in range(1, 10): - kmeans = KMeans(n_clusters = k, max_iter = 1000).fit(mat) - sse[k] = kmeans.inertia_ # Inertia: Sum of distances of samples to their closest cluster center -plt.figure() -plt.plot(list(sse.keys()), list(sse.values())) -plt.xlabel("Number of clusters") -plt.ylabel("SSE") -plt.show() - -c0_items = xy[xy.kcluster == 0].index -c1_items = xy[xy.kcluster == 1].index -c2_items = xy[xy.kcluster == 2].index -c3_items = xy[xy.kcluster == 3].index -c4_items = xy[xy.kcluster == 4].index -c5_items = xy[xy.kcluster == 5].index - -### Hierarchical clustering ### -from sklearn.cluster import AgglomerativeClustering - -hclust = AgglomerativeClustering(n_clusters = 6).fit(mat) -hclust.labels_ - -xy["hcluster"] = hclust.labels_ - -for i in xy.hcluster.unique(): - plt.scatter(xy[xy.hcluster == i].iloc[:,0], xy[xy.hcluster == i].iloc[:,1], label = i) - for j, txt in enumerate(xy.index[xy.hcluster == i]): - plt.annotate(txt.split("_")[1], (xy[xy.hcluster == i].iloc[j,0], xy[xy.hcluster == i].iloc[j,1])) -plt.legend() -plt.show() - -# dendrogram -from scipy.cluster.hierarchy import dendrogram - -def plot_dendrogram(model, **kwargs): - # Create linkage matrix and then plot the dendrogram - - # create the counts of samples under each node - counts = np.zeros(model.children_.shape[0]) - n_samples = len(model.labels_) - for i, merge in enumerate(model.children_): - current_count = 0 - for child_idx in merge: - if child_idx < n_samples: - current_count += 1 # leaf node - else: - current_count += counts[child_idx - n_samples] - counts[i] = current_count - - linkage_matrix = np.column_stack( - [model.children_, model.distances_, counts] - ).astype(float) - - # Plot the corresponding dendrogram - dendrogram(linkage_matrix, **kwargs) - -hclust = AgglomerativeClustering(distance_threshold = 0, n_clusters = None).fit(mat) - -plot_dendrogram(hclust) -plt.show() - -### Bisecting K-Means clustering ### -from sklearn.cluster import BisectingKMeans - -biKmeans = BisectingKMeans(n_clusters = 6, random_state = myseed).fit(mat) -biKmeans.labels_ - -xy["bcluster"] = biKmeans.labels_ - -for i in xy.bcluster.unique(): - plt.scatter(xy[xy.bcluster == i].iloc[:,0], xy[xy.bcluster 
== i].iloc[:,1], label = i) - for j, txt in enumerate(xy.index[xy.bcluster == i]): - plt.annotate(txt.split("_")[1], (xy[xy.bcluster == i].iloc[j,0], xy[xy.bcluster == i].iloc[j,1])) -plt.legend() -plt.show()
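--
A minimal sketch of the token-based-replay bookkeeping that code/03_conformance-checking.py
does with the four parallel lists (l1-l4), collected into a single data frame instead. It
assumes the per-trace dicts returned by pm4py.conformance_diagnostics_token_based_replay()
carry the keys used in that script ("remaining_tokens", "missing_tokens", "reached_marking",
"transitions_with_problems") and reuses its file paths:

    import pm4py
    import pandas as pd

    dat = pd.read_csv("results/haum/event_logfiles_2024-01-18_09-58-52.csv", sep = ";")
    event_log = pm4py.format_dataframe(dat, case_id = "path", activity_key = "event",
                                       timestamp_key = "date.start")
    basenet, im, fm = pm4py.read_pnml("results/haum/conformative_petrinet_con.pnml")

    # one diagnostics dict per trace -> one row per trace
    replayed_traces = pm4py.conformance_diagnostics_token_based_replay(event_log, basenet, im, fm)
    diag = pd.DataFrame(replayed_traces)[["remaining_tokens", "missing_tokens",
                                          "reached_marking", "transitions_with_problems"]]

    # traces that end with a token left behind are the broken ones
    index_broken = diag.index[diag["remaining_tokens"] == 1].tolist()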