diff --git a/code/01_preprocessing.R b/code/01_preprocessing.R index 6bc3d85..e7085cc 100644 --- a/code/01_preprocessing.R +++ b/code/01_preprocessing.R @@ -8,8 +8,8 @@ # ../data/metadata/feiertage.csv # ../data/metadata/schulferien_2016-2018_NI.csv # ../data/metadata/schulferien_2019-2025_NI.csv -# output: raw_logfiles_.csv -# event_logfiles_.csv +# output: results/raw_logfiles_.csv +# results/event_logfiles_.csv # # last mod: 2024-02-23, NW @@ -29,12 +29,12 @@ folders <- dir(path) datraw <- parse_logfiles(folders, path) # 91 corrupt lines have been found and removed from the data set -# datraw <- read.table("results/haum/raw_logfiles_2023-10-25_16-20-45.csv", +# datraw <- read.table("results/raw_logfiles_2023-10-25_16-20-45.csv", # sep = ";", header = TRUE) ## Export data -write.table(datraw, paste0("results/haum/raw_logfiles_", now, ".csv"), +write.table(datraw, paste0("results/raw_logfiles_", now, ".csv"), sep = ";", row.names = FALSE) #--------------- (2) Create event logs --------------- @@ -131,6 +131,6 @@ dat2 <- dat2[order(dat2$fileId.start, dat2$date.start, dat2$timeMs.start), ] ## Export data -write.table(dat2, paste0("results/haum/event_logfiles_", now, ".csv"), +write.table(dat2, paste0("results/event_logfiles_", now, ".csv"), sep = ";", row.names = FALSE) diff --git a/code/02_descriptives.R b/code/02_descriptives.R index 4e91155..3542224 100644 --- a/code/02_descriptives.R +++ b/code/02_descriptives.R @@ -9,8 +9,8 @@ # (3.4) Artwork sequences # (3.5) Topics # -# input: results/haum/event_logfiles_2024-02-21_16-07-33.csv -# results/haum/raw_logfiles_2024-02-21_16-07-33.csv +# input: results/event_logfiles_2024-02-21_16-07-33.csv +# results/raw_logfiles_2024-02-21_16-07-33.csv # output: results/figures/counts_item.pdf # results/figures/counts_item_firsttouch.pdf # results/figures/duration.pdf @@ -41,7 +41,7 @@ #--------------- (1) Read data --------------- -datlogs <- read.table("results/haum/event_logfiles_2024-02-21_16-07-33.csv", +datlogs <- 
read.table("results/event_logfiles_2024-02-21_16-07-33.csv", colClasses = c("character", "character", "POSIXct", "POSIXct", "character", "integer", "numeric", "character", "character", @@ -54,7 +54,7 @@ datlogs$event <- factor(datlogs$event, levels = c("move", "flipCard", "openTopic", "openPopup")) -datraw <- read.table("results/haum/raw_logfiles_2024-02-21_16-07-33.csv", +datraw <- read.table("results/raw_logfiles_2024-02-21_16-07-33.csv", sep = ";", header = TRUE) # Add weekdays to data frame diff --git a/code/04_conformance-checking.py b/code/04_conformance-checking.py index 2714195..25df866 100644 --- a/code/04_conformance-checking.py +++ b/code/04_conformance-checking.py @@ -1,25 +1,24 @@ # 04_conformance-checking.py # # content: (1) Load data and create event log -# (2) Infos for items +# (2) Check against normative Petri Net # -# input: results/haum/event_logfiles_2024-02-21_16-07-33.csv -# results/haum/conformative_petrinet_con.pnml -# output: results/processmaps/dfg_complete_python.png -# results/eval_all-miners_complete.csv +# input: results/event_logfiles_2024-02-21_16-07-33.csv +# results/normative_petrinet.pnml +# output: results/eval_all-miners_complete.csv # results/eval_all-miners_clean.csv -# results/processmaps/petrinet_conformative.png -# results/processmaps/petrinet_heuristics_clean.png -# results/processmaps/petrinet_alpha_clean.png -# results/processmaps/petrinet_inductive_clean.png -# results/processmaps/petrinet_ilp_clean.png -# results/processmaps/bpmn_conformative.png -# results/processmaps/bpmn_inductive_clean.png -# results/processmaps/bpmn_ilp_clean.png -# results/processmaps/bpmn_alpha_clean.png -# results/processmaps/bpmn_heuristics_clean.png +# ../../thesis/figures/petrinet_normative.png +# ../../thesis/figures/petrinet_heuristics_clean.png +# ../../thesis/figures/petrinet_alpha_clean.png +# ../../thesis/figures/petrinet_inductive_clean.png +# ../../thesis/figures/petrinet_ilp_clean.png +# ../../thesis/figures/bpmn_normative.png +# 
../../thesis/figures/bpmn_inductive_clean.png +# ../../thesis/figures/bpmn_ilp_clean.png +# ../../thesis/figures/bpmn_alpha_clean.png +# ../../thesis/figures/bpmn_heuristics_clean.png # -# last mod: 2024-03-06 +# last mod: 2024-03-22 import pm4py import pandas as pd @@ -29,13 +28,13 @@ from python_helpers import eval_pm, pn_infos_miner #--------------- (1) Load data and create event logs --------------- -dat = pd.read_csv("results/haum/event_logfiles_2024-02-21_16-07-33.csv", sep = ";") +dat = pd.read_csv("results/event_logfiles_2024-02-21_16-07-33.csv", sep = ";") event_log = pm4py.format_dataframe(dat, case_id = "path", activity_key = "event", timestamp_key = "date.start") -###### Descriptives of log data ###### +## Descriptives of log data # Distribution of events event_log.event.value_counts() @@ -57,9 +56,9 @@ len(variants_no_move) sorted_variants_no_move = dict(sorted(variants_no_move.items(), key=lambda item: item[1], reverse = True)) {k: sorted_variants_no_move[k] for k in list(sorted_variants_no_move)[:20]} -###### Check against "conformative" Petri Net ###### +#--------------- (2) Check against normative Petri Net --------------- -basenet, initial_marking, final_marking = pm4py.read_pnml("results/haum/conformative_petrinet_con.pnml") +basenet, initial_marking, final_marking = pm4py.read_pnml("results/normative_petrinet.pnml") # TBR replayed_traces = pm4py.conformance_diagnostics_token_based_replay(event_log, basenet, initial_marking, final_marking) @@ -93,23 +92,13 @@ event_log[event_log["@@case_index"] == index_broken[0]].item.unique().tolist() event_log[event_log["@@case_index"] == index_broken[0]]["fileId.start"].unique().tolist() # --> logging error in raw file -## Footprints -from pm4py.algo.discovery.footprints import algorithm as footprints_discovery -from pm4py.visualization.footprints import visualizer as fp_visualizer -fp_log = footprints_discovery.apply(event_log, variant=footprints_discovery.Variants.ENTIRE_EVENT_LOG) -fp_net = 
footprints_discovery.apply(basenet, initial_marking, final_marking) -gviz = fp_visualizer.apply(fp_net, parameters={fp_visualizer.Variants.SINGLE.value.Parameters.FORMAT: "svg"}) -fp_visualizer.view(gviz) - -efg_graph = pm4py.discover_eventually_follows_graph(event_log) - ## Fitting different miners eval = pd.DataFrame(columns = ["fitness", "precision", "generalizability", "simplicity", "sound", "narcs", "ntrans", "nplaces", "nvariants", "mostfreq"]) -for miner in ["conformative", "alpha", "heuristics", "inductive", "ilp"]: +for miner in ["normative", "alpha", "heuristics", "inductive", "ilp"]: eval = pd.concat([eval, pn_infos_miner(event_log, miner)]) eval.to_csv("results/eval_all-miners_complete.csv", sep = ";") @@ -121,7 +110,7 @@ eval_clean = pd.DataFrame(columns = ["fitness", "precision", "generalizability", "simplicity", "sound", "narcs", "ntrans", "nplaces", "nvariants", "mostfreq"]) -for miner in ["conformative", "alpha", "heuristics", "inductive", "ilp"]: +for miner in ["normative", "alpha", "heuristics", "inductive", "ilp"]: eval_clean = pd.concat([eval_clean, pn_infos_miner(event_log_clean, miner)]) eval_clean.to_csv("results/eval_all-miners_clean.csv", sep = ";") @@ -129,28 +118,27 @@ eval_clean.to_csv("results/eval_all-miners_clean.csv", sep = ";") ## Directly-follows graph dfg, start_activities, end_activities = pm4py.discover_dfg(event_log_clean) pm4py.view_dfg(dfg, start_activities, end_activities) -pm4py.save_vis_dfg(dfg, start_activities, end_activities, "results/processmaps/dfg_complete_python.png") ## Export petri nets -pm4py.vis.save_vis_petri_net(basenet, initial_marking, final_marking, "results/processmaps/petrinet_conformative.png") +pm4py.vis.save_vis_petri_net(basenet, initial_marking, final_marking, + "../../thesis/figures/petrinet_normative.png") h_net, h_im, h_fm = pm4py.discover_petri_net_heuristics(event_log_clean) -pm4py.vis.save_vis_petri_net(h_net, h_im, h_fm, "results/processmaps/petrinet_heuristics_clean.png") 
+pm4py.vis.save_vis_petri_net(h_net, h_im, h_fm, "../../thesis/figures/petrinet_heuristics_clean.png") a_net, a_im, a_fm = pm4py.discover_petri_net_alpha(event_log_clean) -pm4py.vis.save_vis_petri_net(a_net, a_im, a_fm, "results/processmaps/petrinet_alpha_clean.png") +pm4py.vis.save_vis_petri_net(a_net, a_im, a_fm, "../../thesis/figures/petrinet_alpha_clean.png") i_net, i_im, i_fm = pm4py.discover_petri_net_inductive(event_log_clean) -pm4py.vis.save_vis_petri_net(i_net, i_im, i_fm, "results/processmaps/petrinet_inductive_clean.png") +pm4py.vis.save_vis_petri_net(i_net, i_im, i_fm, "../../thesis/figures/petrinet_inductive_clean.png") ilp_net, ilp_im, ilp_fm = pm4py.discover_petri_net_ilp(event_log_clean) -pm4py.vis.save_vis_petri_net(ilp_net, ilp_im, ilp_fm, "results/processmaps/petrinet_ilp_clean.png") +pm4py.vis.save_vis_petri_net(ilp_net, ilp_im, ilp_fm, "../../thesis/figures/petrinet_ilp_clean.png") # convert to BPMN base_bpmn = pm4py.convert.convert_to_bpmn(basenet, initial_marking, final_marking) -pm4py.vis.save_vis_bpmn(base_bpmn, "results/processmaps/bpmn_conformative.png") +pm4py.vis.save_vis_bpmn(base_bpmn, "../../thesis/figures/bpmn_normative.png") i_bpmn = pm4py.convert.convert_to_bpmn(i_net, i_im, i_fm) -pm4py.vis.save_vis_bpmn(i_bpmn, "results/processmaps/bpmn_inductive_clean.png") +pm4py.vis.save_vis_bpmn(i_bpmn, "../../thesis/figures/bpmn_inductive_clean.png") ilp_bpmn = pm4py.convert.convert_to_bpmn(ilp_net, ilp_im, ilp_fm) -pm4py.vis.save_vis_bpmn(ilp_bpmn, "results/processmaps/bpmn_ilp_clean.png") +pm4py.vis.save_vis_bpmn(ilp_bpmn, "../../thesis/figures/bpmn_ilp_clean.png") a_bpmn = pm4py.convert.convert_to_bpmn(a_net, a_im, a_fm) -pm4py.vis.save_vis_bpmn(a_bpmn, "results/processmaps/bpmn_alpha_clean.png") +pm4py.vis.save_vis_bpmn(a_bpmn, "../../thesis/figures/bpmn_alpha_clean.png") h_bpmn = pm4py.convert.convert_to_bpmn(h_net, h_im, h_fm) -pm4py.vis.save_vis_bpmn(h_bpmn, "results/processmaps/bpmn_heuristics_clean.png") - 
+pm4py.vis.save_vis_bpmn(h_bpmn, "../../thesis/figures/bpmn_heuristics_clean.png") diff --git a/code/05_check-traces.R b/code/05_check-traces.R index 2e6fb33..1b715d1 100644 --- a/code/05_check-traces.R +++ b/code/05_check-traces.R @@ -5,22 +5,23 @@ # (3) DFG for complete data # (4) Export data frame for analyses # -# input: results/haum/event_logfiles_2024-02-21_16-07-33.csv -# results/haum/raw_logfiles_2024-02-21_16-07-33.csv -# output: results/haum/eventlogs_pre-corona_cleaned.RData -# results/haum/eventlogs_pre-corona_cleaned.csv +# input: results/event_logfiles_2024-02-21_16-07-33.csv +# results/raw_logfiles_2024-02-21_16-07-33.csv +# output: results/eventlogs_pre-corona_cleaned.RData +# results/eventlogs_pre-corona_cleaned.csv +# ../../thesis/figures/dfg_complete_WFnet_R.pdf # -# last mod: 2024-03-06 +# last mod: 2024-03-23 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") #--------------- (1) Look at broken trace --------------- -datraw <- read.table("results/haum/raw_logfiles_2024-02-21_16-07-33.csv", - header = TRUE, sep = ";") +datraw <- read.table("results/raw_logfiles_2024-02-21_16-07-33.csv", + header = TRUE, sep = ";") -datlogs <- read.table("results/haum/event_logfiles_2024-02-21_16-07-33.csv", +datlogs <- read.table("results/event_logfiles_2024-02-21_16-07-33.csv", colClasses = c("character", "character", "POSIXct", "POSIXct", "character", "integer", "numeric", "character", "character", @@ -84,7 +85,7 @@ dfg <- processmapR::process_map(alog, render = FALSE) processmapR::export_map(dfg, - file_name = paste0("results/processmaps/dfg_complete_WFnet_R.pdf"), + file_name = paste0("../../thesis/figures/dfg_complete_WFnet_R.pdf"), file_type = "pdf") rm(tmp) @@ -109,10 +110,10 @@ dat <- datlogs[as.Date(datlogs$date.start) < "2020-03-13", ] # Remove corrupt trace dat <- dat[dat$path != 106098, ] -save(dat, file = "results/haum/eventlogs_pre-corona_cleaned.RData") +save(dat, file = 
"results/eventlogs_pre-corona_cleaned.RData") write.table(dat, - file = "results/haum/eventlogs_pre-corona_cleaned.csv", + file = "results/eventlogs_pre-corona_cleaned.csv", sep = ";", quote = FALSE, row.names = FALSE) diff --git a/code/06_infos-items.py b/code/06_infos-items.py index 9f6ec9b..a292353 100644 --- a/code/06_infos-items.py +++ b/code/06_infos-items.py @@ -3,10 +3,10 @@ # content: (1) Load data and create event log # (2) Infos for items # -# input: results/haum/eventlogs_pre-corona_cleaned.csv -# output: results/haum/pn_infos_items.csv +# input: results/eventlogs_pre-corona_cleaned.csv +# output: results/pn_infos_items.csv # -# last mod: 2024-03-06 +# last mod: 2024-03-22 import pm4py import pandas as pd @@ -16,7 +16,7 @@ from python_helpers import eval_pm, pn_infos #--------------- (1) Load data and create event logs --------------- -dat = pd.read_csv("results/haum/eventlogs_pre-corona_cleaned", sep = ";") +dat = pd.read_csv("results/eventlogs_pre-corona_cleaned.csv", sep = ";") log_path = pm4py.format_dataframe(dat, case_id = "path", activity_key = "event", timestamp_key = "date.start") @@ -33,5 +33,5 @@ for item in log_path.item.unique().tolist(): eval = eval.sort_index() # Export -eval.to_csv("results/haum/pn_infos_items.csv", sep = ";") +eval.to_csv("results/pn_infos_items.csv", sep = ";") diff --git a/code/07_item-clustering.R b/code/07_item-clustering.R index e5a7ab8..f793145 100644 --- a/code/07_item-clustering.R +++ b/code/07_item-clustering.R @@ -7,12 +7,12 @@ # (2) Clustering # (3) Visualization with pictures # -# input: results/haum/eventlogs_pre-corona_cleaned.RData -# results/haum/pn_infos_items.csv -# output: results/haum/eventlogs_pre-corona_item-clusters.csv +# input: results/eventlogs_pre-corona_cleaned.RData +# results/pn_infos_items.csv +# output: results/eventlogs_pre-corona_item-clusters.csv # ../../thesis/figures/data/clustering_items.RData" # -# last mod: 2024-03-21 +# last mod: 2024-03-22 # 
setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") @@ -22,11 +22,11 @@ source("R_helpers.R") #--------------- (1.1) Read log event data --------------- -load("results/haum/eventlogs_pre-corona_cleaned.RData") +load("results/eventlogs_pre-corona_cleaned.RData") #--------------- (1.2) Read infos for PM for items --------------- -datitem <- read.table("results/haum/pn_infos_items.csv", header = TRUE, +datitem <- read.table("results/pn_infos_items.csv", header = TRUE, sep = ";", row.names = 1) #--------------- (1.3) Extract additional infos for clustering --------------- @@ -126,6 +126,28 @@ item <- sprintf("%03d", as.numeric(gsub("item_([0-9]{3})", "\\1", res <- merge(dat, data.frame(item, cluster), by = "item", all.x = TRUE) res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ] + +# DFGs for clusters +res$start <- res$date.start +res$complete <- res$date.stop + +for (clst in sort(unique(res$cluster))) { + + alog <- bupaR::activitylog(res[res$cluster == clst, ], + case_id = "path", + activity_id = "event", + resource_id = "item", + timestamps = c("start", "complete")) + + print(processmapR::process_map(alog, + type_nodes = processmapR::frequency("relative", color_scale = "Greys"), + sec_nodes = processmapR::frequency("absolute"), + type_edges = processmapR::frequency("relative", color_edges = "#FF6900"), + sec_edges = processmapR::frequency("absolute"), + rankdir = "LR")) +} + + # Look at clusters par(mfrow = c(2,2)) vioplot::vioplot(duration ~ cluster, res) @@ -134,7 +156,7 @@ vioplot::vioplot(scaleSize ~ cluster, res) vioplot::vioplot(rotationDegree ~ cluster, res) write.table(res, - file = "results/haum/eventlogs_pre-corona_item-clusters.csv", + file = "results/eventlogs_pre-corona_item-clusters.csv", sep = ";", quote = FALSE, row.names = FALSE) diff --git a/code/python_helpers.py b/code/python_helpers.py index e3a31be..693acaa 100644 --- a/code/python_helpers.py +++ b/code/python_helpers.py @@ -36,8 +36,8 @@ 
def pn_infos_miner(log, miner): net, im, fm = pm4py.discover_petri_net_ilp(log) elif miner == "inductive": net, im, fm = pm4py.discover_petri_net_inductive(log) - elif miner == "conformative": - net, im, fm = pm4py.read_pnml("results/haum/conformative_petrinet_con.pnml") + elif miner == "normative": + net, im, fm = pm4py.read_pnml("results/normative_petrinet.pnml") eval = eval_append(log, net, im, fm) eval.index = [miner]