#%% # needed for shortcuts to run properly in VSCode *eyeroll* %reset import pm4py import pandas as pd import numpy as np import matplotlib.pyplot as plt ###### Load data and create event logs ###### dat = pd.read_csv("results/haum/event_logfiles_2024-01-02_19-44-50.csv", sep = ";") dat = dat[dat["date.start"] < "2020-03-13"] # --> only pre corona (before artworks were updated) event_log = pm4py.format_dataframe(dat, case_id='path', activity_key='event', timestamp_key='date.start') event_log = event_log.rename(columns={'artwork': 'case:artwork'}) ###### Descrptives of log data ###### # Distribution of events event_log.event.value_counts() event_log.event.value_counts(normalize=True) # Number of paths len(event_log.path.unique()) # Number of variants variants = pm4py.get_variants(event_log) len(variants) sorted_variants = dict(sorted(variants.items(), key=lambda item: item[1], reverse = True)) {k: sorted_variants[k] for k in list(sorted_variants)[:20]} filtered_log = event_log[event_log["event"] != "move"] variants = pm4py.get_variants(filtered_log) len(variants) sorted_variants = dict(sorted(variants.items(), key=lambda item: item[1], reverse = True)) {k: sorted_variants[k] for k in list(sorted_variants)[:20]} # Path length event_log.path.value_counts() event_log.path.value_counts().mean() event_log.path.value_counts().median() event_log.path.value_counts().min() event_log.path.value_counts().max() plt.hist(event_log.path.value_counts(), bins=200) plt.show() # TODO: Do it again in R -- much smoother and more info, better plots ###### Read "conformative" Petri Net ###### basenet, initial_marking, final_marking = pm4py.read_pnml("results/conformative_petrinet_con.pnml") def eval_pm(data, net, initial_marking, final_marking): """Caculate fitness, precision, generalizability, and simplicity for petri net""" fitness = pm4py.fitness_token_based_replay(data, net, initial_marking, final_marking) precisison = pm4py.precision_token_based_replay(data, net, initial_marking, 
final_marking) generalizability = pm4py.algo.evaluation.generalization.algorithm.apply(data, net, initial_marking, final_marking) simplicity = pm4py.algo.evaluation.simplicity.algorithm.apply(net) return [fitness['average_trace_fitness'], precisison, generalizability, simplicity] baseline_eval = eval_pm(event_log, basenet, initial_marking, final_marking) # TBR replayed_traces = pm4py.conformance_diagnostics_token_based_replay(event_log, basenet, initial_marking, final_marking) l1 = list() l2 = list() l3 = list() l4 = list() for i in range(len(replayed_traces)): l1.append(replayed_traces[i]["remaining_tokens"]) l2.append(replayed_traces[i]["missing_tokens"]) l3.append(replayed_traces[i]["reached_marking"]) l4.append(replayed_traces[i]["transitions_with_problems"]) np.mean(l1) set(l1) index_broken = l1.index(1) np.mean(l2) set(l2) l2.index(1) set(l3) l4.count([]) l3[index_broken] l4[index_broken] replayed_traces[index_broken] # 216295 # --> broken trace! Must be in artwork 176!!!!! from pm4py.algo.conformance.tokenreplay import algorithm as token_based_replay parameters_tbr = {token_based_replay.Variants.TOKEN_REPLAY.value.Parameters.DISABLE_VARIANTS: True, token_based_replay.Variants.TOKEN_REPLAY.value.Parameters.ENABLE_PLTR_FITNESS: True} replayed_traces, place_fitness, trans_fitness, unwanted_activities = token_based_replay.apply(event_log, basenet, initial_marking, final_marking, parameters=parameters_tbr) from pm4py.algo.conformance.tokenreplay.diagnostics import duration_diagnostics trans_diagnostics = duration_diagnostics.diagnose_from_trans_fitness(event_log, trans_fitness) for trans in trans_diagnostics: print(trans, trans_diagnostics[trans]) # Footprints from pm4py.algo.discovery.footprints import algorithm as footprints_discovery fp_log = footprints_discovery.apply(event_log, variant=footprints_discovery.Variants.ENTIRE_EVENT_LOG) fp_trace_by_trace = footprints_discovery.apply(event_log, variant=footprints_discovery.Variants.TRACE_BY_TRACE) fp_net = 
# Visualize the model footprints alone, then compare log vs. model footprints
from pm4py.visualization.footprints import visualizer as fp_visualizer
gviz = fp_visualizer.apply(
    fp_net,
    parameters={fp_visualizer.Variants.SINGLE.value.Parameters.FORMAT: "svg"})
fp_visualizer.view(gviz)

gviz = fp_visualizer.apply(
    fp_log, fp_net,
    parameters={fp_visualizer.Variants.COMPARISON.value.Parameters.FORMAT: "svg"})
fp_visualizer.view(gviz)

conf_fp = pm4py.conformance_diagnostics_footprints(fp_trace_by_trace, fp_net)

from pm4py.algo.conformance.footprints import algorithm as fp_conformance
conf_result = fp_conformance.apply(fp_log, fp_net,
                                   variant=fp_conformance.Variants.LOG_EXTENSIVE)
from pm4py.algo.conformance.footprints.util import evaluation
fitness = evaluation.fp_fitness(fp_log, fp_net, conf_result)
precision = evaluation.fp_precision(fp_log, fp_net)

# Skeleton
from pm4py.algo.discovery.log_skeleton import algorithm as lsk_discovery
skeleton = lsk_discovery.apply(
    event_log,
    parameters={lsk_discovery.Variants.CLASSIC.value.Parameters.NOISE_THRESHOLD: 0.0})
from pm4py.algo.conformance.log_skeleton import algorithm as lsk_conformance
conf_result = lsk_conformance.apply(event_log, skeleton)

# Conformative net: soundness and size
pm4py.vis.view_petri_net(basenet, initial_marking, final_marking)
is_sound = pm4py.check_soundness(basenet, initial_marking, final_marking)
is_sound[0]
len(basenet.arcs)
len(basenet.transitions)
len(basenet.places)

efg_graph = pm4py.discover_eventually_follows_graph(event_log)

## Directly-follows graph
dfg, start_activities, end_activities = pm4py.discover_dfg(event_log)
pm4py.view_dfg(dfg, start_activities, end_activities)
pm4py.save_vis_dfg(dfg, start_activities, end_activities,
                   '../figures/processmaps/dfg_complete.png')

## Heuristics Miner
h_net, im, fm = pm4py.discover_petri_net_heuristics(event_log)
h_eval = eval_pm(event_log, h_net, im, fm)
pm4py.vis.view_petri_net(h_net, im, fm)
pm4py.vis.save_vis_petri_net(h_net, im, fm,
                             "../figures/processmaps/pn_heuristics_complete.png")
is_sound = pm4py.check_soundness(h_net, im, fm)
is_sound[0]
len(h_net.arcs)
len(h_net.transitions)
len(h_net.places)

# decorated petri net
from pm4py.visualization.petri_net import visualizer as pn_visualizer
parameters = {pn_visualizer.Variants.FREQUENCY.value.Parameters.FORMAT: "png"}
gviz = pn_visualizer.apply(h_net, im, fm, parameters=parameters,
                           variant=pn_visualizer.Variants.FREQUENCY, log=event_log)
pn_visualizer.save(gviz, "../figures/processmaps/pn_heuristics_complete_decorated.png")

# convert to BPMN
bpmn = pm4py.convert.convert_to_bpmn(h_net, im, fm)
pm4py.vis.view_bpmn(bpmn)

## Alpha Miner
a_net, im, fm = pm4py.discover_petri_net_alpha(event_log)
a_eval = eval_pm(event_log, a_net, im, fm)
pm4py.vis.view_petri_net(a_net, im, fm)
pm4py.vis.save_vis_petri_net(a_net, im, fm,
                             "../figures/processmaps/pn_alpha_complete.png")
is_sound = pm4py.check_soundness(a_net, im, fm)
is_sound[0]
len(a_net.arcs)
len(a_net.transitions)
len(a_net.places)

## Inductive Miner
i_net, im, fm = pm4py.discover_petri_net_inductive(event_log)
i_eval = eval_pm(event_log, i_net, im, fm)
pm4py.vis.view_petri_net(i_net, im, fm)
pm4py.vis.save_vis_petri_net(i_net, im, fm,
                             "../figures/processmaps/pn_induction_complete.png")

# as process tree (does not work for heuristics miner!)
pt = pm4py.discover_process_tree_inductive(event_log)
pm4py.vis.view_process_tree(pt)

is_sound = pm4py.check_soundness(i_net, im, fm)
is_sound[0]
# TODO: Can I show that this simpler net does not include all traces? (Probably not,
# since fitness is 1, but WHY?)
# Inductive net: size
len(i_net.arcs)
len(i_net.transitions)
len(i_net.places)

bpmn = pm4py.convert.convert_to_bpmn(i_net, im, fm)
pm4py.view_bpmn(bpmn)

# Token-based replay diagnostics on the inductive-miner net
from pm4py.algo.conformance.tokenreplay import algorithm as token_based_replay
parameters_tbr = {
    token_based_replay.Variants.TOKEN_REPLAY.value.Parameters.DISABLE_VARIANTS: True,
    token_based_replay.Variants.TOKEN_REPLAY.value.Parameters.ENABLE_PLTR_FITNESS: True}
replayed_traces, place_fitness, trans_fitness, unwanted_activities = token_based_replay.apply(
    event_log, i_net, im, fm, parameters=parameters_tbr)

l1 = list()
l2 = list()
l3 = list()
l4 = list()
for i in range(len(replayed_traces)):
    l1.append(replayed_traces[i]["remaining_tokens"])
    l2.append(replayed_traces[i]["missing_tokens"])
    l3.append(replayed_traces[i]["reached_marking"])
    l4.append(replayed_traces[i]["transitions_with_problems"])

np.mean(l1)
np.mean(l2)
set(l3)
l4.count([])

## ILP Miner
ilp_net, im, fm = pm4py.discover_petri_net_ilp(event_log)
ilp_eval = eval_pm(event_log, ilp_net, im, fm)
pm4py.vis.view_petri_net(ilp_net, im, fm)
pm4py.vis.save_vis_petri_net(ilp_net, im, fm,
                             "../figures/processmaps/pn_ilp_complete.png")
is_sound = pm4py.check_soundness(ilp_net, im, fm)
is_sound[0]
len(ilp_net.arcs)
len(ilp_net.transitions)
len(ilp_net.places)

## Export for all miners
# FIX: renamed "eval" -> "eval_df" (shadowed the builtin eval);
# FIX: np.row_stack is deprecated/removed in NumPy >= 2.0, use np.vstack
eval_df = pd.DataFrame(np.vstack([baseline_eval, h_eval, a_eval, i_eval, ilp_eval]))
eval_df.columns = ["fitness", "precision", "generalizability", "simplicity"]
eval_df.index = ["conformative", "heuristics", "alpha", "inductive", "ilp"]
eval_df
eval_df.to_csv("results/eval_all-miners_complete.csv", sep=";")

###### Process Mining - individual artworks ######

# Discovery functions by miner name; replaces the duplicated if/elif chains.
_MINERS = {
    "heuristics": pm4py.discover_petri_net_heuristics,
    "inductive": pm4py.discover_petri_net_inductive,
    "alpha": pm4py.discover_petri_net_alpha,
    "ilp": pm4py.discover_petri_net_ilp,
}


def pm_artworks(miner):
    """Evaluate a miner per artwork, against the global net and per-artwork nets.

    Parameters
    ----------
    miner : str, one of "heuristics", "inductive", "alpha", "ilp".

    Returns
    -------
    pd.DataFrame : one row per artwork and nettype ("alldata" = net mined on the
        full log, "subdata" = net mined on that artwork's cases only), with
        columns nettype, fitness, precision, generalizability, simplicity.

    Raises
    ------
    KeyError : if *miner* is not a known miner name (the original code would
        instead fail later with NameError on an undefined net).
    """
    discover = _MINERS[miner]
    # FIX: unique() was recomputed in the loop header and body every iteration
    artworks = event_log["case:artwork"].unique()
    retval1 = np.empty((len(artworks), 4))
    retval2 = np.empty((len(artworks), 4))
    # Net mined on the complete log, evaluated per artwork below
    net, im, fm = discover(event_log)
    for i, artwork in enumerate(artworks):
        subdata = pm4py.filter_event_attribute_values(
            event_log, "case:artwork", [artwork], level="case", retain=True)
        # Net mined on this artwork's cases only
        subnet, subim, subfm = discover(subdata)
        #pm4py.save_vis_petri_net(subnet, subim, subfm,
        #    "../figures/processmaps/artworks/petrinet_" + miner + "_" + str(artwork).zfill(3) + ".png")
        retval1[i] = eval_pm(subdata, net, im, fm)
        retval2[i] = eval_pm(subdata, subnet, subim, subfm)

    retval1 = pd.DataFrame(retval1)
    retval1.columns = ["fitness", "precision", "generalizability", "simplicity"]
    retval1.index = artworks
    retval1.insert(0, "nettype", "alldata")
    retval2 = pd.DataFrame(retval2)
    retval2.columns = ["fitness", "precision", "generalizability", "simplicity"]
    retval2.index = artworks
    retval2.insert(0, "nettype", "subdata")
    return pd.concat([retval1, retval2])


for miner in ["heuristics", "inductive", "alpha", "ilp"]:
    eval_art = pm_artworks(miner=miner)
    eval_art.to_csv("results/eval_artworks_" + miner + ".csv", sep=";")

eval_art = pm_artworks(miner="inductive")

##### Clustering ######

## KMeans
# FIX: KMeans was used without ever being imported (NameError at runtime)
from sklearn.cluster import KMeans

#eval_artworks = eval_art[eval_art.nettype == "alldata"].iloc[:, range(1, 5)]
eval_artworks = eval_art[eval_art.nettype == "subdata"].iloc[:, range(1, 5)]

kmeans = KMeans(n_clusters=4, max_iter=1000).fit(eval_artworks)

#from sklearn.manifold import MDS
#coord = pd.DataFrame(MDS(normalized_stress='auto').fit_transform(eval_artworks))
coord = eval_artworks
coord["clusters"] = kmeans.labels_

for i in coord.clusters.unique():
    #plt.scatter(coord[coord.clusters == i].iloc[:,0], coord[coord.clusters == i].iloc[:,1],
    plt.scatter(coord[coord.clusters == i].iloc[:, 1],
                coord[coord.clusters == i].iloc[:, 2],
                #plt.scatter(coord[coord.clusters == i].iloc[:,2], coord[coord.clusters == i].iloc[:,4],
                label=i)
plt.legend()
plt.show()

### Scree plot
sse = {}
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(
        eval_artworks[["precision", "generalizability"]])
    #data["clusters"] = kmeans.labels_
    #print(data["clusters"])
    # Inertia: Sum of distances of samples to their closest cluster center
    sse[k] = kmeans.inertia_

plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of clusters")
plt.ylabel("SSE")
plt.show()