First round of cleaning up
This commit is contained in:
parent
ae7e580749
commit
b469ccfbcf
@ -1,314 +0,0 @@
|
|||||||
# 01_clustering.R
|
|
||||||
#
|
|
||||||
# content: (1) Read evaluation data
|
|
||||||
# (2) Clustering
|
|
||||||
# (3) Visualization with pictures
|
|
||||||
# (4) Read event logs
|
|
||||||
# (5) Frequency plot for clusters
|
|
||||||
# (6) DFGs for clusters
|
|
||||||
#
|
|
||||||
# input: results/eval_heuristics_artworks.csv
|
|
||||||
# results/eval_all-miners_complete.csv
|
|
||||||
# results/haum/event_logfiles_glossar_2023-11-03_17-46-28.csv
|
|
||||||
# output: ../figures/clustering_heuristics.pdf
|
|
||||||
# ../figures/clustering_heuristics.png
|
|
||||||
# ../figures/processmaps/dfg_complete_R.pdf
|
|
||||||
# ../figures/processmaps/dfg_complete_R.png
|
|
||||||
# ../figures/processmaps/dfg_cluster1_R.pdf
|
|
||||||
# ../figures/processmaps/dfg_cluster2_R.pdf
|
|
||||||
# ../figures/processmaps/dfg_cluster3_R.pdf
|
|
||||||
# ../figures/processmaps/dfg_cluster4_R.pdf
|
|
||||||
#
|
|
||||||
# last mod: 2023-12-21, NW
|
|
||||||
|
|
||||||
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
|
|
||||||
|
|
||||||
#--------------- (1) Read evaluation data ---------------
|
|
||||||
|
|
||||||
eval_heuristics <- read.table("results/eval_artworks_heuristics.csv", header = TRUE,
|
|
||||||
sep = ";", row.names = 1)
|
|
||||||
eval_inductive <- read.table("results/eval_artworks_inductive.csv", header = TRUE,
|
|
||||||
sep = ";", row.names = 1)
|
|
||||||
eval_alpha <- read.table("results/eval_artworks_alpha.csv", header = TRUE,
|
|
||||||
sep = ";", row.names = 1)
|
|
||||||
eval_ilp <- read.table("results/eval_artworks_ilp.csv", header = TRUE,
|
|
||||||
sep = ";", row.names = 1)
|
|
||||||
|
|
||||||
#--------------- (2) Clustering ---------------
|
|
||||||
|
|
||||||
set.seed(1607)
|
|
||||||
|
|
||||||
# Heuristics Miner
|
|
||||||
|
|
||||||
k1 <- kmeans(eval_heuristics, 4)
|
|
||||||
|
|
||||||
colors <- c("#3CB4DC", "#78004B", "#91C86E", "#FF6900")
|
|
||||||
|
|
||||||
plot(generalizability ~ precision, eval_heuristics, pch = 16, col = colors[k1$cluster])
|
|
||||||
|
|
||||||
## Scree plot
|
|
||||||
|
|
||||||
ks <- 1:10
|
|
||||||
|
|
||||||
sse <- NULL
|
|
||||||
for (k in ks) sse <- c(sse, kmeans(eval_heuristics, k)$tot.withinss)
|
|
||||||
|
|
||||||
plot(sse ~ ks, type = "l")
|
|
||||||
|
|
||||||
# Inductive Miner
|
|
||||||
|
|
||||||
k2 <- kmeans(eval_inductive, 4)
|
|
||||||
|
|
||||||
plot(generalizability ~ precision, eval_inductive, pch = 16, col = colors[k2$cluster])
|
|
||||||
|
|
||||||
## Scree plot
|
|
||||||
|
|
||||||
ks <- 1:10
|
|
||||||
|
|
||||||
sse <- NULL
|
|
||||||
for (k in ks) sse <- c(sse, kmeans(eval_inductive, k)$tot.withinss)
|
|
||||||
|
|
||||||
plot(sse ~ ks, type = "l")
|
|
||||||
|
|
||||||
# Alpha Miner
|
|
||||||
|
|
||||||
k3 <- kmeans(eval_alpha, 4)
|
|
||||||
|
|
||||||
par(mfrow = c(2, 2))
|
|
||||||
plot(generalizability ~ precision, eval_alpha, pch = 16, col = colors[k3$cluster])
|
|
||||||
plot(fitness ~ precision, eval_alpha, pch = 16, col = colors[k3$cluster])
|
|
||||||
plot(fitness ~ generalizability, eval_alpha, pch = 16, col = colors[k3$cluster])
|
|
||||||
|
|
||||||
## Scree plot
|
|
||||||
|
|
||||||
ks <- 1:10
|
|
||||||
|
|
||||||
sse <- NULL
|
|
||||||
for (k in ks) sse <- c(sse, kmeans(eval_alpha, k)$tot.withinss)
|
|
||||||
|
|
||||||
plot(sse ~ ks, type = "l")
|
|
||||||
|
|
||||||
|
|
||||||
# ILP Miner
|
|
||||||
|
|
||||||
k4 <- kmeans(eval_ilp, 4)
|
|
||||||
|
|
||||||
plot(generalizability ~ precision, eval_ilp, pch = 16, col = colors[k4$cluster])
|
|
||||||
|
|
||||||
## Scree plot
|
|
||||||
|
|
||||||
ks <- 1:10
|
|
||||||
|
|
||||||
sse <- NULL
|
|
||||||
for (k in ks) sse <- c(sse, kmeans(eval_ilp, k)$tot.withinss)
|
|
||||||
|
|
||||||
plot(sse ~ ks, type = "l")
|
|
||||||
|
|
||||||
#--------------- (3) Visualization with pictures ---------------
|
|
||||||
|
|
||||||
library(png)
|
|
||||||
library(jpeg)
|
|
||||||
library(grid)
|
|
||||||
|
|
||||||
## Heuristics Miner
|
|
||||||
#pdf("../figures/clustering_heuristics.pdf", height = 8, width = 8, pointsize = 10)
|
|
||||||
png("../figures/clustering_heuristics.png", units = "in", height = 8, width = 8, pointsize = 10, res = 300)
|
|
||||||
par(mai = c(.6,.6,.1,.1), mgp = c(2.4, 1, 0))
|
|
||||||
|
|
||||||
plot(generalizability ~ precision, eval_heuristics, type = "n", ylim = c(0.845, 0.98))
|
|
||||||
|
|
||||||
for (art in as.numeric(rownames(eval_heuristics))) {
|
|
||||||
|
|
||||||
art_string <- sprintf("%03d", art)
|
|
||||||
|
|
||||||
if (art == 125) {
|
|
||||||
|
|
||||||
pic <- readJPEG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
|
|
||||||
art_string, "/", art_string, ".jpg"))
|
|
||||||
} else {
|
|
||||||
pic <- readPNG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
|
|
||||||
art_string, "/", art_string, ".png"))
|
|
||||||
}
|
|
||||||
|
|
||||||
img <- as.raster(pic[,,1:3])
|
|
||||||
|
|
||||||
x <- eval_heuristics[rownames(eval_heuristics) == art, "precision"]
|
|
||||||
y <- eval_heuristics[rownames(eval_heuristics) == art, "generalizability"]
|
|
||||||
|
|
||||||
points(x, y, col = colors[k1$cluster[as.character(art)]], cex = 8, pch = 15)
|
|
||||||
|
|
||||||
rasterImage(img,
|
|
||||||
xleft = x - .002,
|
|
||||||
xright = x + .002,
|
|
||||||
ybottom = y - .004,
|
|
||||||
ytop = y + .004)
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
dev.off()
|
|
||||||
|
|
||||||
## Inductive Miner
|
|
||||||
plot(generalizability ~ precision, eval_inductive, col = colors[k2$cluster],
|
|
||||||
cex = 8, pch = 15)
|
|
||||||
|
|
||||||
for (art in as.numeric(rownames(eval_inductive))) {
|
|
||||||
|
|
||||||
art_string <- sprintf("%03d", art)
|
|
||||||
|
|
||||||
if (art == 125) {
|
|
||||||
|
|
||||||
pic <- readJPEG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
|
|
||||||
art_string, "/", art_string, ".jpg"))
|
|
||||||
} else {
|
|
||||||
pic <- readPNG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
|
|
||||||
art_string, "/", art_string, ".png"))
|
|
||||||
}
|
|
||||||
|
|
||||||
img <- as.raster(pic[,,1:3])
|
|
||||||
|
|
||||||
x <- eval_inductive[rownames(eval_inductive) == art, "precision"]
|
|
||||||
y <- eval_inductive[rownames(eval_inductive) == art, "generalizability"]
|
|
||||||
|
|
||||||
rasterImage(img,
|
|
||||||
xleft = x - .001,
|
|
||||||
xright = x + .001,
|
|
||||||
ybottom = y - .002,
|
|
||||||
ytop = y + .002)
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
#--------------- (4) Read event logs ---------------
|
|
||||||
|
|
||||||
dat <- read.table("results/haum/event_logfiles_glossar_2023-11-03_17-46-28.csv",
|
|
||||||
sep = ";", header = TRUE)
|
|
||||||
dat$date <- as.POSIXct(dat$date)
|
|
||||||
dat$date.start <- as.POSIXct(dat$date.start)
|
|
||||||
dat$date.stop <- as.POSIXct(dat$date.stop)
|
|
||||||
dat$artwork <- sprintf("%03d", dat$artwork)
|
|
||||||
dat$event <- factor(dat$event, levels = c("move", "flipCard", "openTopic", "openPopup"))
|
|
||||||
|
|
||||||
dat$weekdays <- factor(weekdays(dat$date.start),
|
|
||||||
levels = c("Montag", "Dienstag", "Mittwoch",
|
|
||||||
"Donnerstag", "Freitag", "Samstag",
|
|
||||||
"Sonntag"),
|
|
||||||
labels = c("Monday", "Tuesday", "Wednesday",
|
|
||||||
"Thursday", "Friday", "Saturday",
|
|
||||||
"Sunday"))
|
|
||||||
|
|
||||||
|
|
||||||
#--------------- (5) Frequency plot for clusters ---------------
|
|
||||||
|
|
||||||
# Only pre Corona
|
|
||||||
dat <- dat[dat$date < "2020-03-13",]
|
|
||||||
|
|
||||||
counts_artwork <- table(dat$artwork)
|
|
||||||
dat_count <- as.data.frame(counts_artwork)
|
|
||||||
names(dat_count) <- c("artwork", "freq")
|
|
||||||
dat_count$cluster <- k1$cluster[order(as.numeric(names(k1$cluster)))]
|
|
||||||
dat_count$cluster <- factor(dat_count$cluster, levels = c(4, 2, 1, 3), labels = 4:1)
|
|
||||||
dat_count <- dat_count[order(dat_count$cluster, dat_count$freq, decreasing = TRUE), ]
|
|
||||||
dat_count$artwork <- factor(dat_count$artwork, levels = unique(dat_count$artwork))
|
|
||||||
|
|
||||||
png("../figures/counts_artworks_clusters.png", units = "in", height = 3.375, width = 12, pointsize = 10, res = 300)
|
|
||||||
par(mai = c(.6,.6,.1,.1), mgp = c(2.4, 1, 0))
|
|
||||||
barplot(freq ~ artwork, dat_count, las = 2, ylim = c(0, 60000),
|
|
||||||
border = "white", ylab = "",
|
|
||||||
col = c("#FF6900", "#78004B", "#3CB4DC", "#91C86E" )[dat_count$cluster])
|
|
||||||
dev.off()
|
|
||||||
|
|
||||||
# compare to clusters
|
|
||||||
|
|
||||||
png("../figures/pm_heuristics_clusters.png", units = "in", height = 3.375, width = 3.375, pointsize = 10, res = 300)
|
|
||||||
par(mai = c(.6,.6,.1,.1), mgp = c(2.4, 1, 0))
|
|
||||||
plot(generalizability ~ precision, eval_heuristics, type = "n", ylim = c(0.845, 0.98))
|
|
||||||
with(eval_heuristics, text(precision, generalizability,
|
|
||||||
rownames(eval_heuristics),
|
|
||||||
col = colors[k1$cluster]))
|
|
||||||
dev.off()
|
|
||||||
|
|
||||||
#--------------- (6) DFGs for clusters ---------------
|
|
||||||
|
|
||||||
library(bupaverse)
|
|
||||||
|
|
||||||
|
|
||||||
dat$start <- dat$date.start
|
|
||||||
dat$complete <- dat$date.stop
|
|
||||||
|
|
||||||
|
|
||||||
alog <- activitylog(dat,
|
|
||||||
case_id = "trace",
|
|
||||||
activity_id = "event",
|
|
||||||
resource_id = "artwork",
|
|
||||||
timestamps = c("start", "complete"))
|
|
||||||
|
|
||||||
|
|
||||||
alog_c1 <- filter_case_condition(alog,
|
|
||||||
artwork %in% dat_count[dat_count$cluster == 1, "artwork"])
|
|
||||||
alog_c2 <- filter_case_condition(alog,
|
|
||||||
artwork %in% dat_count[dat_count$cluster == 2, "artwork"])
|
|
||||||
alog_c3 <- filter_case_condition(alog,
|
|
||||||
artwork %in% dat_count[dat_count$cluster == 3, "artwork"])
|
|
||||||
alog_c4 <- filter_case_condition(alog,
|
|
||||||
artwork %in% dat_count[dat_count$cluster == 4, "artwork"])
|
|
||||||
|
|
||||||
dfg_complete <- process_map(alog,
|
|
||||||
type_nodes = frequency("absolute", color_scale = "Greys"),
|
|
||||||
sec_nodes = frequency("relative"),
|
|
||||||
type_edges = frequency("absolute", color_edges = "#FF6900"),
|
|
||||||
sec_edges = frequency("relative"),
|
|
||||||
#rankdir = "TB",
|
|
||||||
render = FALSE)
|
|
||||||
export_map(dfg_complete,
|
|
||||||
file_name = "../figures/processmaps/dfg_complete_R.pdf",
|
|
||||||
file_type = "pdf",
|
|
||||||
title = "DFG complete")
|
|
||||||
export_map(dfg_complete,
|
|
||||||
file_name = "../figures/processmaps/dfg_complete_R.png",
|
|
||||||
file_type = "png")
|
|
||||||
|
|
||||||
dfg_c1 <- process_map(alog_c1,
|
|
||||||
type_nodes = frequency("absolute", color_scale = "Greys"),
|
|
||||||
sec_nodes = frequency("relative"),
|
|
||||||
type_edges = frequency("absolute", color_edges = "#FF6900"),
|
|
||||||
sec_edges = frequency("relative"),
|
|
||||||
rankdir = "TB",
|
|
||||||
render = FALSE)
|
|
||||||
export_map(dfg_c1,
|
|
||||||
file_name = "../figures/processmaps/dfg_cluster1_R.pdf",
|
|
||||||
file_type = "pdf",
|
|
||||||
title = "DFG Cluster 1")
|
|
||||||
dfg_c2 <- process_map(alog_c2,
|
|
||||||
type_nodes = frequency("absolute", color_scale = "Greys"),
|
|
||||||
sec_nodes = frequency("relative"),
|
|
||||||
type_edges = frequency("absolute", color_edges = "#FF6900"),
|
|
||||||
sec_edges = frequency("relative"),
|
|
||||||
rankdir = "TB",
|
|
||||||
render = FALSE)
|
|
||||||
export_map(dfg_c2,
|
|
||||||
file_name = "../figures/processmaps/dfg_cluster2_R.pdf",
|
|
||||||
file_type = "pdf",
|
|
||||||
title = "DFG Cluster 2")
|
|
||||||
dfg_c3 <- process_map(alog_c3,
|
|
||||||
type_nodes = frequency("absolute", color_scale = "Greys"),
|
|
||||||
sec_nodes = frequency("relative"),
|
|
||||||
type_edges = frequency("absolute", color_edges = "#FF6900"),
|
|
||||||
sec_edges = frequency("relative"),
|
|
||||||
rankdir = "TB",
|
|
||||||
render = FALSE)
|
|
||||||
export_map(dfg_c3,
|
|
||||||
file_name = "../figures/processmaps/dfg_cluster3_R.pdf",
|
|
||||||
file_type = "pdf",
|
|
||||||
title = "DFG Cluster 3")
|
|
||||||
dfg_c4 <- process_map(alog_c4,
|
|
||||||
type_nodes = frequency("absolute", color_scale = "Greys"),
|
|
||||||
sec_nodes = frequency("relative"),
|
|
||||||
type_edges = frequency("absolute", color_edges = "#FF6900"),
|
|
||||||
sec_edges = frequency("relative"),
|
|
||||||
rankdir = "TB",
|
|
||||||
render = FALSE)
|
|
||||||
export_map(dfg_c4,
|
|
||||||
file_name = "../figures/processmaps/dfg_cluster4_R.pdf",
|
|
||||||
file_type = "pdf",
|
|
||||||
title = "DFG Cluster 4")
|
|
||||||
|
|
||||||
|
|
137
code/03_conformance-checking.py
Normal file
137
code/03_conformance-checking.py
Normal file
@ -0,0 +1,137 @@
|
|||||||
|
import pm4py
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from python_helpers import eval_pm, pn_infos_miner
|
||||||
|
|
||||||
|
###### Load data and create event logs ######
|
||||||
|
|
||||||
|
dat = pd.read_csv("results/haum/event_logfiles_2024-01-18_09-58-52.csv", sep = ";")
|
||||||
|
#dat = dat[dat["date.start"] < "2020-03-13"]
|
||||||
|
# --> only pre corona (before artworks were updated)
|
||||||
|
|
||||||
|
event_log = pm4py.format_dataframe(dat, case_id='path', activity_key='event',
|
||||||
|
timestamp_key='date.start')
|
||||||
|
|
||||||
|
###### Descriptives of log data ######
|
||||||
|
|
||||||
|
# Distribution of events
|
||||||
|
event_log.event.value_counts()
|
||||||
|
event_log.event.value_counts(normalize = True)
|
||||||
|
|
||||||
|
# Number of paths
|
||||||
|
len(event_log.path.unique())
|
||||||
|
|
||||||
|
# Number of variants
|
||||||
|
variants = pm4py.get_variants(event_log)
|
||||||
|
len(variants)
|
||||||
|
|
||||||
|
sorted_variants = dict(sorted(variants.items(), key=lambda item: item[1], reverse = True))
|
||||||
|
{k: sorted_variants[k] for k in list(sorted_variants)[:20]}
|
||||||
|
|
||||||
|
filtered_log = event_log[event_log["event"] != "move"]
|
||||||
|
variants_no_move = pm4py.get_variants(filtered_log)
|
||||||
|
len(variants_no_move)
|
||||||
|
sorted_variants_no_move = dict(sorted(variants_no_move.items(), key=lambda item: item[1], reverse = True))
|
||||||
|
{k: sorted_variants_no_move[k] for k in list(sorted_variants_no_move)[:20]}
|
||||||
|
|
||||||
|
###### Read "conformative" Petri Net ######
|
||||||
|
|
||||||
|
basenet, initial_marking, final_marking = pm4py.read_pnml("results/haum/conformative_petrinet_con.pnml")
|
||||||
|
|
||||||
|
# TBR
|
||||||
|
replayed_traces = pm4py.conformance_diagnostics_token_based_replay(event_log, basenet, initial_marking, final_marking)
|
||||||
|
|
||||||
|
l1 = list()
|
||||||
|
l2 = list()
|
||||||
|
l3 = list()
|
||||||
|
l4 = list()
|
||||||
|
for i in range(len(replayed_traces)):
|
||||||
|
l1.append(replayed_traces[i]["remaining_tokens"])
|
||||||
|
l2.append(replayed_traces[i]["missing_tokens"])
|
||||||
|
l3.append(replayed_traces[i]["reached_marking"])
|
||||||
|
l4.append(replayed_traces[i]["transitions_with_problems"])
|
||||||
|
|
||||||
|
set(l1)
|
||||||
|
x1 = np.array(l1)
|
||||||
|
index_broken = np.where(x1 == 1)[0].tolist()
|
||||||
|
|
||||||
|
set(l3)
|
||||||
|
l4.count([])
|
||||||
|
|
||||||
|
[l3[i] for i in index_broken]
|
||||||
|
[l4[i] for i in index_broken]
|
||||||
|
|
||||||
|
broken_traces = [replayed_traces[i] for i in index_broken]
|
||||||
|
|
||||||
|
event_log[event_log['@@case_index'] == index_broken[0]].event
|
||||||
|
event_log[event_log['@@case_index'] == index_broken[0]].path.unique().tolist()
|
||||||
|
event_log[event_log['@@case_index'] == index_broken[0]].item.unique().tolist()
|
||||||
|
event_log[event_log['@@case_index'] == index_broken[0]]["fileId.start"].unique().tolist()
|
||||||
|
# --> logging error in raw file
|
||||||
|
|
||||||
|
|
||||||
|
# Footprints
|
||||||
|
from pm4py.algo.discovery.footprints import algorithm as footprints_discovery
|
||||||
|
from pm4py.visualization.footprints import visualizer as fp_visualizer
|
||||||
|
fp_log = footprints_discovery.apply(event_log, variant=footprints_discovery.Variants.ENTIRE_EVENT_LOG)
|
||||||
|
fp_net = footprints_discovery.apply(basenet, initial_marking, final_marking)
|
||||||
|
gviz = fp_visualizer.apply(fp_net, parameters={fp_visualizer.Variants.SINGLE.value.Parameters.FORMAT: "svg"})
|
||||||
|
fp_visualizer.view(gviz)
|
||||||
|
|
||||||
|
efg_graph = pm4py.discover_eventually_follows_graph(event_log)
|
||||||
|
|
||||||
|
## Directly-follows graph
|
||||||
|
dfg, start_activities, end_activities = pm4py.discover_dfg(event_log)
|
||||||
|
pm4py.view_dfg(dfg, start_activities, end_activities)
|
||||||
|
pm4py.save_vis_dfg(dfg, start_activities, end_activities, 'results/processmaps/dfg_complete_python.png')
|
||||||
|
|
||||||
|
## Fitting different miners
|
||||||
|
|
||||||
|
eval = pd.DataFrame(columns = ["fitness", "precision", "generalizability",
|
||||||
|
"simplicity", "sound", "narcs", "ntrans",
|
||||||
|
"nplaces", "nvariants", "mostfreq"])
|
||||||
|
|
||||||
|
for miner in ["conformative", "alpha", "heuristics", "inductive", "ilp"]:
|
||||||
|
eval = pd.concat([eval, pn_infos_miner(event_log, miner)])
|
||||||
|
|
||||||
|
## Export for all miners
|
||||||
|
eval.to_csv("results/eval_all-miners_complete.csv", sep = ";")
|
||||||
|
|
||||||
|
## Without broken trace
|
||||||
|
event_log_clean = event_log[event_log['@@case_index'] != index_broken[0]]
|
||||||
|
|
||||||
|
for miner in ["conformative", "alpha", "heuristics", "inductive", "ilp"]:
|
||||||
|
eval_clean = pd.concat([eval_clean, pn_infos_miner(event_log_clean, miner)])
|
||||||
|
|
||||||
|
eval_clean.to_csv("results/eval_all-miners_clean.csv", sep = ";")
|
||||||
|
|
||||||
|
# Export petri nets
|
||||||
|
h_net, h_im, h_fm = pm4py.discover_petri_net_heuristics(event_log_clean)
|
||||||
|
a_net, a_im, a_fm = pm4py.discover_petri_net_alpha(event_log_clean)
|
||||||
|
i_net, i_im, i_fm = pm4py.discover_petri_net_inductive(event_log_clean)
|
||||||
|
ilp_net, ilp_im, ilp_fm = pm4py.discover_petri_net_ilp(event_log_clean)
|
||||||
|
|
||||||
|
pm4py.vis.save_vis_petri_net(h_net, h_im, h_fm, "results/processmaps/petrinet_heuristics_clean.png")
|
||||||
|
pm4py.vis.save_vis_petri_net(a_net, a_im, a_fm, "results/processmaps/petrinet_alpha_clean.png")
|
||||||
|
pm4py.vis.save_vis_petri_net(i_net, i_im, i_fm, "results/processmaps/petrinet_inductive_clean.png")
|
||||||
|
pm4py.vis.save_vis_petri_net(ilp_net, ilp_im, ilp_fm, "results/processmaps/petrinet_ilp_clean.png")
|
||||||
|
pm4py.vis.save_vis_petri_net(basenet, initial_marking, final_marking, "results/processmaps/petrinet_conformative.png")
|
||||||
|
|
||||||
|
# convert to BPMN
|
||||||
|
base_bpmn = pm4py.convert.convert_to_bpmn(basenet, initial_marking, final_marking)
|
||||||
|
pm4py.vis.save_vis_bpmn(base_bpmn, "results/processmaps/bpmn_conformative.png")
|
||||||
|
|
||||||
|
i_bpmn = pm4py.convert.convert_to_bpmn(i_net, i_im, i_fm)
|
||||||
|
pm4py.vis.save_vis_bpmn(i_bpmn, "results/processmaps/bpmn_inductive_clean.png")
|
||||||
|
|
||||||
|
ilp_bpmn = pm4py.convert.convert_to_bpmn(ilp_net, ilp_im, ilp_fm)
|
||||||
|
pm4py.vis.save_vis_bpmn(ilp_bpmn, "results/processmaps/bpmn_ilp_clean.png")
|
||||||
|
|
||||||
|
a_bpmn = pm4py.convert.convert_to_bpmn(a_net, a_im, a_fm)
|
||||||
|
pm4py.vis.save_vis_bpmn(a_bpmn, "results/processmaps/bpmn_alpha_clean.png")
|
||||||
|
|
||||||
|
h_bpmn = pm4py.convert.convert_to_bpmn(h_net, h_im, h_fm)
|
||||||
|
pm4py.vis.save_vis_bpmn(h_bpmn, "results/processmaps/bpmn_heuristics_clean.png")
|
||||||
|
|
@ -1,122 +0,0 @@
|
|||||||
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
|
|
||||||
|
|
||||||
# Read data
|
|
||||||
|
|
||||||
dat0 <- read.table("../data/haum/event_logfiles_metadata_2023-09-23_01-31-30.csv",
|
|
||||||
sep = ";", header = TRUE)
|
|
||||||
dat0$date <- as.Date(dat0$date)
|
|
||||||
dat0$date.start <- as.POSIXct(dat0$date.start)
|
|
||||||
dat0$date.stop <- as.POSIXct(dat0$date.stop)
|
|
||||||
dat0$artwork <- sprintf("%03d", dat0$artwork)
|
|
||||||
|
|
||||||
# Preprocess variables for clustering
|
|
||||||
|
|
||||||
str(dat0)
|
|
||||||
|
|
||||||
# year --> lubridate::year()
|
|
||||||
# duration --> numeric, remove NA
|
|
||||||
# topicNumber --> numeric, remove NA
|
|
||||||
# distance --> numeric, remove NA
|
|
||||||
# scaleSize --> numeric, remove NA
|
|
||||||
# rotationDegree --> numeric, remove NA
|
|
||||||
# holiday --> one/hot coding
|
|
||||||
# vacations --> one/hot coding
|
|
||||||
# artwork? --> one/hot coding (72 new variables)
|
|
||||||
# event? --> one/hot coding (4 new variables)
|
|
||||||
|
|
||||||
dat <- dat0
|
|
||||||
|
|
||||||
dat$year <- lubridate::year(dat$date)
|
|
||||||
dat$holiday1 <- ifelse(is.na(dat$holiday), 0, 1)
|
|
||||||
dat$vacations1 <- ifelse(is.na(dat$vacations), 0, 1)
|
|
||||||
dat$topicNumber1 <- ifelse(is.na(dat$topicNumber), 0, dat$topicNumber)
|
|
||||||
dat$duration1 <- ifelse(is.na(dat$duration), 0, dat$duration)
|
|
||||||
dat$distance1 <- ifelse(is.na(dat$distance), 0, dat$distance)
|
|
||||||
dat$scaleSize1 <- ifelse(is.na(dat$scaleSize), 0, dat$scaleSize)
|
|
||||||
dat$rotationDegree1 <- ifelse(is.na(dat$rotationDegree), 0, dat$rotationDegree)
|
|
||||||
|
|
||||||
for (artwork in unique(dat$artwork)) {
|
|
||||||
dat[[paste0("A", artwork)]] <- ifelse(dat$artwork == artwork, 1, 0)
|
|
||||||
}
|
|
||||||
|
|
||||||
for (event in unique(dat$event)) {
|
|
||||||
dat[[event]] <- ifelse(dat$event == event, 1, 0)
|
|
||||||
}
|
|
||||||
|
|
||||||
mat <- dat[, c("year", "duration", "topicNumber", "distance", "scaleSize",
|
|
||||||
"rotationDegree", "holiday1", "vacations1",
|
|
||||||
paste0("A", unique(dat$artwork)), "flipCard", "move", "openTopic",
|
|
||||||
"openPopup")]
|
|
||||||
|
|
||||||
|
|
||||||
mat1 <- dat[, c("year", "duration1", "topicNumber1", "distance1", "scaleSize1",
|
|
||||||
"rotationDegree1", "holiday1", "vacations1",
|
|
||||||
paste0("A", unique(dat$artwork)), "flipCard", "move", "openTopic",
|
|
||||||
"openPopup")]
|
|
||||||
|
|
||||||
library(cluster) # for hierarchical clustering
|
|
||||||
|
|
||||||
k1 <- kmeans(mat1, 2)
|
|
||||||
dat$kcluster <- k1$cluster
|
|
||||||
|
|
||||||
mat1$artwork <- dat$artwork
|
|
||||||
datagg <- aggregate(. ~ artwork, mat1, mean)
|
|
||||||
aa <- datagg$artwork
|
|
||||||
datagg$artwork <- NULL
|
|
||||||
|
|
||||||
k2 <- kmeans(datagg, 3)
|
|
||||||
datagg$cluster <- k2$cluster
|
|
||||||
datagg <- datagg[order(datagg$cluster), ]
|
|
||||||
aggregate(cbind(duration1, distance1, scaleSize1, rotationDegree1,
|
|
||||||
holiday1, vacations1) ~ cluster, datagg, mean)
|
|
||||||
# --> how to interpret this??
|
|
||||||
|
|
||||||
|
|
||||||
# sample data for hierarchical clustering
|
|
||||||
n <- 200
|
|
||||||
set.seed(1826)
|
|
||||||
|
|
||||||
mat2 <- mat1[sample(nrow(mat1), n), ]
|
|
||||||
rownames(mat2) <- NULL
|
|
||||||
a1 <- agnes(mat2)
|
|
||||||
|
|
||||||
d1 <- as.dendrogram(a1)
|
|
||||||
plot(d1)
|
|
||||||
|
|
||||||
datagg$cluster <- NULL
|
|
||||||
rownames(datagg) <- NULL
|
|
||||||
a2 <- agnes(datagg)
|
|
||||||
d2 <- as.dendrogram(a2)
|
|
||||||
plot(d2)
|
|
||||||
|
|
||||||
## Clustering for nominal features with nomclust package
|
|
||||||
|
|
||||||
library(nomclust)
|
|
||||||
|
|
||||||
dat <- as.data.frame(lapply(dat0[, c("folder", "holiday", "vacations", "artwork",
|
|
||||||
"event", "case", "trace")], as.factor))
|
|
||||||
mat <- list()
|
|
||||||
mat$year <- as.numeric(dat$folder)
|
|
||||||
mat$holiday <- as.numeric(dat$holiday)
|
|
||||||
mat$vacations <- as.numeric(dat$vacations)
|
|
||||||
mat$artwork <- as.numeric(dat$artwork)
|
|
||||||
mat$event <- as.numeric(dat$event)
|
|
||||||
mat$case <- as.numeric(dat$case)
|
|
||||||
mat$trace <- as.numeric(dat$trace)
|
|
||||||
|
|
||||||
mat$holiday <- ifelse(is.na(mat$holiday), 0, 1)
|
|
||||||
mat$vacations <- ifelse(is.na(mat$vacations), 0, 1)
|
|
||||||
|
|
||||||
set.seed(1526)
|
|
||||||
ids <- sample(nrow(mat), 1000)
|
|
||||||
mat_small <- mat[ids, ]
|
|
||||||
|
|
||||||
n1 <- nomclust(mat_small)
|
|
||||||
|
|
||||||
n1$mem$clu_3
|
|
||||||
dend.plot(n1, clusters = 3)
|
|
||||||
|
|
||||||
mat_small[n1$mem$clu_6 == 6, ]
|
|
||||||
|
|
||||||
cbind(mat_small[order(n1$mem$clu_3), ], n1$mem$clu_3[order(n1$mem$clu_3)])
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
|||||||
%reset
|
#%reset
|
||||||
|
|
||||||
import pm4py
|
import pm4py
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
@ -17,7 +17,6 @@ dat = dat[dat["path"] != 106098] # exclude broken trace
|
|||||||
log_path = pm4py.format_dataframe(dat, case_id = "path", activity_key = "event",
|
log_path = pm4py.format_dataframe(dat, case_id = "path", activity_key = "event",
|
||||||
timestamp_key = "date.start")
|
timestamp_key = "date.start")
|
||||||
|
|
||||||
|
|
||||||
###### Infos for items ######
|
###### Infos for items ######
|
||||||
|
|
||||||
mdi = pd.DataFrame(columns = ["fitness", "precision", "generalizability",
|
mdi = pd.DataFrame(columns = ["fitness", "precision", "generalizability",
|
@ -1,3 +1,5 @@
|
|||||||
|
# TODO: Clean me up! I am a mix of useful and useless!!!
|
||||||
|
|
||||||
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
|
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
|
||||||
|
|
||||||
library(bupaverse)
|
library(bupaverse)
|
||||||
@ -259,3 +261,36 @@ process_map(alog,
|
|||||||
sec_edges = frequency("absolute"),
|
sec_edges = frequency("absolute"),
|
||||||
rankdir = "LR")
|
rankdir = "LR")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
|
||||||
|
|
||||||
|
datraw <- read.table("results/haum/raw_logfiles_2024-01-18_09-58-52.csv",
|
||||||
|
header = TRUE, sep = ";")
|
||||||
|
|
||||||
|
|
||||||
|
# Read data
|
||||||
|
|
||||||
|
datlogs <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv",
|
||||||
|
colClasses = c("character", "character", "POSIXct",
|
||||||
|
"POSIXct", "character", "integer",
|
||||||
|
"numeric", "character", "character",
|
||||||
|
rep("numeric", 3), "character",
|
||||||
|
"character", rep("numeric", 11),
|
||||||
|
"character", "character"),
|
||||||
|
sep = ";", header = TRUE)
|
||||||
|
|
||||||
|
datlogs <- datlogs[order(datlogs$fileId.start, datlogs$date.start, datlogs$timeMs.start), ]
|
||||||
|
|
||||||
|
artwork <- "176"
|
||||||
|
fileId <- c('2017_06_16-13_49_00.log', '2017_06_16-13_59_00.log')
|
||||||
|
path <- 106098
|
||||||
|
|
||||||
|
datraw[datraw$item == artwork & datraw$fileId %in% fileId, ]
|
||||||
|
|
||||||
|
datlogs[datlogs$path == path, ]
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1,28 +0,0 @@
|
|||||||
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
|
|
||||||
|
|
||||||
datraw <- read.table("results/haum/raw_logfiles_2024-01-18_09-58-52.csv",
|
|
||||||
header = TRUE, sep = ";")
|
|
||||||
|
|
||||||
|
|
||||||
# Read data
|
|
||||||
|
|
||||||
datlogs <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv",
|
|
||||||
colClasses = c("character", "character", "POSIXct",
|
|
||||||
"POSIXct", "character", "integer",
|
|
||||||
"numeric", "character", "character",
|
|
||||||
rep("numeric", 3), "character",
|
|
||||||
"character", rep("numeric", 11),
|
|
||||||
"character", "character"),
|
|
||||||
sep = ";", header = TRUE)
|
|
||||||
|
|
||||||
datlogs <- datlogs[order(datlogs$fileId.start, datlogs$date.start, datlogs$timeMs.start), ]
|
|
||||||
|
|
||||||
artwork <- "176"
|
|
||||||
fileId <- c('2017_06_16-13_49_00.log', '2017_06_16-13_59_00.log')
|
|
||||||
path <- 106098
|
|
||||||
|
|
||||||
datraw[datraw$item == artwork & datraw$fileId %in% fileId, ]
|
|
||||||
|
|
||||||
datlogs[datlogs$path == path, ]
|
|
||||||
|
|
||||||
|
|
@ -1,101 +0,0 @@
|
|||||||
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
|
|
||||||
|
|
||||||
# Read data
|
|
||||||
|
|
||||||
# dat <- read.table("results/haum/event_logfiles_metadata_2023-09-23_01-31-30.csv",
|
|
||||||
# sep = ";", header = TRUE)
|
|
||||||
dat <- read.table("results/haum/event_logfiles_small_metadata_2023-10-15_10-08-43.csv",
|
|
||||||
sep = ";", header = TRUE)
|
|
||||||
dat$date <- as.Date(dat$date)
|
|
||||||
dat$date.start <- as.POSIXct(dat$date.start)
|
|
||||||
dat$date.stop <- as.POSIXct(dat$date.stop)
|
|
||||||
dat$artwork <- sprintf("%03d", dat$artwork)
|
|
||||||
|
|
||||||
library(bupaverse)
|
|
||||||
|
|
||||||
names(dat)[names(dat) %in% c("date.start", "date.stop")] <- c("start", "complete")
|
|
||||||
|
|
||||||
create_pdf <- function(trace, folder = "../figures/processmaps/") {
|
|
||||||
alog <- activitylog(dat[which(dat$trace == trace), ],
|
|
||||||
case_id = "trace",
|
|
||||||
activity_id = "event",
|
|
||||||
resource_id = "artwork",
|
|
||||||
timestamps = c("start", "complete"))
|
|
||||||
|
|
||||||
map <- process_map(alog)
|
|
||||||
g <- DiagrammeR::grViz(map$x$diagram) |> DiagrammeRsvg::export_svg() |> charToRaw()
|
|
||||||
rsvg::rsvg_pdf(g, paste0(folder, trace, ".pdf"))
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
find_trace <- function(trace) {
|
|
||||||
|
|
||||||
alog <- activitylog(dat[which(dat$trace == trace), ],
|
|
||||||
case_id = "trace",
|
|
||||||
activity_id = "event",
|
|
||||||
resource_id = "artwork",
|
|
||||||
timestamps = c("start", "complete"))
|
|
||||||
|
|
||||||
map <- process_map(alog)
|
|
||||||
d <- strsplit(map$x$diagram, "\n")[[1]]
|
|
||||||
o <- grep("^.{6}[[]label", d, value = TRUE)
|
|
||||||
p <- grep("^.{1}[1-6].->", d, value = TRUE)
|
|
||||||
num_ot <- gsub("^.{3}([1-6]).*", "\\1", grep("openTopic", o, value = TRUE))
|
|
||||||
num_op <- gsub("^.{3}([1-6]).*", "\\1", grep("openPopup", o, value = TRUE))
|
|
||||||
rel_path <- grep("^.{1}[2].->.[1-6]", p, value = TRUE)
|
|
||||||
rel_num <- gsub("^.{1}[2].->.([1-6]).*" , "\\1", rel_path)
|
|
||||||
num_fc <- gsub("^.{3}([1-6]).*", "\\1", grep("flipCard", o, value = TRUE))
|
|
||||||
if (length(num_fc) > 0) {
|
|
||||||
rel_path_fc <- grep(paste0("^.{1}[", num_fc, "].->.[1-6]"), p, value = TRUE)
|
|
||||||
rel_num_fc <- gsub(paste0("^.{1}[", num_fc, "].->.([1-6]).*"), "\\1", rel_path_fc)
|
|
||||||
if (any(c(num_ot, num_op) %in% rel_num) | any(num_op == rel_num_fc)) {
|
|
||||||
trace
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (any(c(num_ot, num_op) %in% rel_num)) {
|
|
||||||
trace
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
ctrace <- pbapply::pbsapply(unique(dat$trace), find_trace)
|
|
||||||
|
|
||||||
unlist(ctrace)
|
|
||||||
length(unlist(ctrace))
|
|
||||||
|
|
||||||
|
|
||||||
# create plots
|
|
||||||
for (trace in unlist(ctrace)) {
|
|
||||||
create_pdf(trace)
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
alog <- activitylog(dat,
|
|
||||||
case_id = "trace",
|
|
||||||
activity_id = "event",
|
|
||||||
resource_id = "artwork",
|
|
||||||
timestamps = c("start", "complete"))
|
|
||||||
|
|
||||||
map <- process_map(alog)
|
|
||||||
g <- DiagrammeR::grViz(map$x$diagram) |> DiagrammeRsvg::export_svg() |> charToRaw()
|
|
||||||
rsvg::rsvg_pdf(g, "../figures/processmap_haum.pdf", width = 10, height = 5)
|
|
||||||
|
|
||||||
# adjusted colors
|
|
||||||
writeLines(map$x$diagram, "process_map_haum.gv")
|
|
||||||
g <- DiagrammeR::grViz("process_map_haum.gv") |> DiagrammeRsvg::export_svg() |> charToRaw()
|
|
||||||
rsvg::rsvg_pdf(g, "../figures/processmap_haum_adjusted.pdf", width = 10, height = 5)
|
|
||||||
|
|
||||||
|
|
||||||
alog <- activitylog(dat[!dat$trace %in% unlist(ctrace), ],
|
|
||||||
case_id = "trace",
|
|
||||||
activity_id = "event",
|
|
||||||
resource_id = "artwork",
|
|
||||||
timestamps = c("start", "complete"))
|
|
||||||
|
|
||||||
map <- process_map(alog)
|
|
||||||
g <- DiagrammeR::grViz(map$x$diagram) |> DiagrammeRsvg::export_svg() |> charToRaw()
|
|
||||||
rsvg::rsvg_pdf(g, "../figures/processmap_haum_cleaned.pdf", width = 12, height = 5)
|
|
||||||
|
|
||||||
|
|
39
code/plots_processmaps.R
Normal file
39
code/plots_processmaps.R
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
|
||||||
|
|
||||||
|
library(bupaverse)
|
||||||
|
|
||||||
|
dat0 <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv",
|
||||||
|
colClasses = c("character", "character", "POSIXct",
|
||||||
|
"POSIXct", "character", "integer",
|
||||||
|
"numeric", "character", "character",
|
||||||
|
rep("numeric", 3), "character",
|
||||||
|
"character", rep("numeric", 11),
|
||||||
|
"character", "character"),
|
||||||
|
sep = ";", header = TRUE)
|
||||||
|
dat0$event <- factor(dat0$event, levels = c("move", "flipCard", "openTopic",
|
||||||
|
"openPopup"))
|
||||||
|
|
||||||
|
# Select data pre Corona
|
||||||
|
dat <- dat0[as.Date(dat0$date.start) < "2020-03-13", ]
|
||||||
|
dat <- dat[dat$path != 106098, ]
|
||||||
|
|
||||||
|
dat$start <- dat$date.start
|
||||||
|
dat$complete <- dat$date.stop
|
||||||
|
|
||||||
|
alog <- activitylog(dat,
|
||||||
|
case_id = "path",
|
||||||
|
activity_id = "event",
|
||||||
|
resource_id = "item",
|
||||||
|
timestamps = c("start", "complete"))
|
||||||
|
|
||||||
|
dfg_complete <- process_map(alog,
|
||||||
|
type_nodes = frequency("absolute", color_scale = "Greys"),
|
||||||
|
sec_nodes = frequency("relative"),
|
||||||
|
type_edges = frequency("absolute", color_edges = "#FF6900"),
|
||||||
|
sec_edges = frequency("relative"),
|
||||||
|
#rankdir = "TB",
|
||||||
|
render = FALSE)
|
||||||
|
export_map(dfg_complete,
|
||||||
|
file_name = "results/processmaps/dfg_complete_R.png",
|
||||||
|
file_type = "png")
|
||||||
|
|
202
code/pm.py
202
code/pm.py
@ -1,202 +0,0 @@
|
|||||||
#%% # needed for shortcuts to run properly in VSCode *eyeroll*
|
|
||||||
%reset
|
|
||||||
|
|
||||||
import pm4py
|
|
||||||
#from pm4py.algo.evaluation.generalization import algorithm as generalization_evaluator
|
|
||||||
#from pm4py.algo.evaluation.simplicity import algorithm as simplicity_evaluator
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
import numpy as np
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
from sklearn.cluster import KMeans
|
|
||||||
|
|
||||||
###### Load data and create event logs ######
|
|
||||||
|
|
||||||
dat = pd.read_csv("results/haum/event_logfiles_glossar_2023-11-03_17-46-28.csv", sep = ";")
|
|
||||||
dat = dat[dat.date < "2020-03-13"]
|
|
||||||
# --> only pre corona (before artworks were updated)
|
|
||||||
|
|
||||||
event_log = pm4py.format_dataframe(dat, case_id='trace', activity_key='event',
|
|
||||||
timestamp_key='date.start')
|
|
||||||
# event_log = pm4py.format_dataframe(dat, case_id='trace', activity_key='event',
|
|
||||||
# timestamp_key='date.stop', start_timestamp_key='date.start')
|
|
||||||
event_log = event_log.rename(columns={'artwork': 'case:artwork'})
|
|
||||||
#event_log = pm4py.convert_to_event_log(dat_log) # deprecated
|
|
||||||
|
|
||||||
###### Process Mining - complete data set #####
|
|
||||||
|
|
||||||
def eval_pm(data, net, initial_marking, final_marking):
|
|
||||||
"""Caculate fitness, precision, generalizability, and simplicity for petri net"""
|
|
||||||
fitness = pm4py.fitness_token_based_replay(data, net, initial_marking, final_marking)
|
|
||||||
#fitness = pm4py.fitness_alignments(data, net, initial_marking, final_marking)
|
|
||||||
precisison = pm4py.precision_token_based_replay(data, net, initial_marking, final_marking)
|
|
||||||
#precision = pm4py.precision_alignments(data, net, initial_marking, final_marking)
|
|
||||||
generalizability = pm4py.algo.evaluation.generalization.algorithm.apply(data, net, initial_marking, final_marking)
|
|
||||||
simplicity = pm4py.algo.evaluation.simplicity.algorithm.apply(net)
|
|
||||||
return [fitness['average_trace_fitness'], precisison, generalizability, simplicity]
|
|
||||||
|
|
||||||
|
|
||||||
## Directly-follows graph
|
|
||||||
dfg, start_activities, end_activities = pm4py.discover_dfg(event_log)
|
|
||||||
pm4py.view_dfg(dfg, start_activities, end_activities)
|
|
||||||
pm4py.save_vis_dfg(dfg, start_activities, end_activities, '../figures/processmaps/dfg_complete.png')
|
|
||||||
|
|
||||||
## Heuristics Miner
|
|
||||||
net, im, fm = pm4py.discover_petri_net_heuristics(event_log)
|
|
||||||
h_eval = eval_pm(event_log, net, im, fm)
|
|
||||||
pm4py.vis.view_petri_net(net, im, fm)
|
|
||||||
pm4py.vis.save_vis_petri_net(net, im, fm, "../figures/processmaps/pn_heuristics_complete.png")
|
|
||||||
|
|
||||||
is_sound = pm4py.check_soundness(net, im, fm)
|
|
||||||
is_sound[0]
|
|
||||||
|
|
||||||
len(is_sound[1]["s_c_net"].arcs)
|
|
||||||
# 46
|
|
||||||
len(is_sound[1]["s_c_net"].transitions)
|
|
||||||
# 23
|
|
||||||
len(is_sound[1]["s_c_net"].places)
|
|
||||||
# 10
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# decorated petri net
|
|
||||||
from pm4py.visualization.petri_net import visualizer as pn_visualizer
|
|
||||||
parameters = {pn_visualizer.Variants.FREQUENCY.value.Parameters.FORMAT: "png"}
|
|
||||||
gviz = pn_visualizer.apply(net, im, fm, parameters=parameters, variant=pn_visualizer.Variants.FREQUENCY, log=event_log)
|
|
||||||
pn_visualizer.save(gviz, "../figures/processmaps/pn_heuristics_complete_decorated.png")
|
|
||||||
|
|
||||||
# convert to process tree
|
|
||||||
bpmn = pm4py.convert.convert_to_bpmn(net, im, fm)
|
|
||||||
pm4py.vis.view_bpmn(bpmn)
|
|
||||||
|
|
||||||
## Alpha Miner
|
|
||||||
net, im, fm = pm4py.discover_petri_net_alpha(event_log)
|
|
||||||
a_eval = eval_pm(event_log, net, im, fm)
|
|
||||||
pm4py.vis.view_petri_net(net, im, fm)
|
|
||||||
pm4py.vis.save_vis_petri_net(net, im, fm, "../figures/processmaps/pn_alpha_complete.png")
|
|
||||||
|
|
||||||
is_sound = pm4py.check_soundness(net, im, fm)
|
|
||||||
is_sound[0]
|
|
||||||
|
|
||||||
len(is_sound[1]["s_c_net"].arcs)
|
|
||||||
len(is_sound[1]["s_c_net"].transitions)
|
|
||||||
len(is_sound[1]["s_c_net"].places)
|
|
||||||
|
|
||||||
## Inductive Miner
|
|
||||||
net, im, fm = pm4py.discover_petri_net_inductive(event_log)
|
|
||||||
i_eval = eval_pm(event_log, net, im, fm)
|
|
||||||
pm4py.vis.view_petri_net(net, im, fm)
|
|
||||||
pm4py.vis.save_vis_petri_net(net, im, fm, "../figures/processmaps/pn_induction_complete.png")
|
|
||||||
|
|
||||||
# as process tree (does not work for heuristics miner!)
|
|
||||||
pt = pm4py.discover_process_tree_inductive(event_log)
|
|
||||||
pm4py.vis.view_process_tree(pt)
|
|
||||||
|
|
||||||
is_sound = pm4py.check_soundness(net, im, fm)
|
|
||||||
is_sound[0]
|
|
||||||
|
|
||||||
## ILP Miner
|
|
||||||
net, im, fm = pm4py.discover_petri_net_ilp(event_log)
|
|
||||||
ilp_eval = eval_pm(event_log, net, im, fm)
|
|
||||||
pm4py.vis.view_petri_net(net, im, fm)
|
|
||||||
pm4py.vis.save_vis_petri_net(net, im, fm, "../figures/processmaps/pn_ilp_complete.png")
|
|
||||||
|
|
||||||
is_sound = pm4py.check_soundness(net, im, fm)
|
|
||||||
is_sound[0]
|
|
||||||
|
|
||||||
eval = pd.DataFrame(np.row_stack([h_eval, a_eval, i_eval, ilp_eval]))
|
|
||||||
eval.columns = ["fitness", "precision", "generalizability", "simplicity"]
|
|
||||||
eval.index = ["heuristics", "alpha", "inductive", "ilp"]
|
|
||||||
eval
|
|
||||||
|
|
||||||
eval.to_csv("results/eval_all-miners_complete.csv", sep=";")
|
|
||||||
|
|
||||||
|
|
||||||
###### Process Mining - individual artworks ######
|
|
||||||
|
|
||||||
def pm_artworks(miner):
|
|
||||||
|
|
||||||
retval1 = np.empty((len(event_log["case:artwork"].unique()), 4))
|
|
||||||
retval2 = np.empty((len(event_log["case:artwork"].unique()), 4))
|
|
||||||
|
|
||||||
if miner == "heuristics":
|
|
||||||
net, im, fm = pm4py.discover_petri_net_heuristics(event_log)
|
|
||||||
elif miner == "inductive":
|
|
||||||
net, im, fm = pm4py.discover_petri_net_inductive(event_log)
|
|
||||||
elif miner == "alpha":
|
|
||||||
net, im, fm = pm4py.discover_petri_net_alpha(event_log)
|
|
||||||
elif miner == "ilp":
|
|
||||||
net, im, fm = pm4py.discover_petri_net_ilp(event_log)
|
|
||||||
|
|
||||||
for i in range(len(event_log["case:artwork"].unique())):
|
|
||||||
artwork = event_log["case:artwork"].unique()[i]
|
|
||||||
subdata = pm4py.filter_event_attribute_values(event_log, "case:artwork",
|
|
||||||
[artwork],
|
|
||||||
level="case", retain=True)
|
|
||||||
if miner == "heuristics":
|
|
||||||
subnet, subim, subfm = pm4py.discover_petri_net_heuristics(subdata)
|
|
||||||
elif miner == "inductive":
|
|
||||||
subnet, subim, subfm = pm4py.discover_petri_net_inductive(subdata)
|
|
||||||
elif miner == "alpha":
|
|
||||||
subnet, subim, subfm = pm4py.discover_petri_net_alpha(subdata)
|
|
||||||
elif miner == "ilp":
|
|
||||||
subnet, subim, subfm = pm4py.discover_petri_net_ilp(subdata)
|
|
||||||
#pm4py.save_vis_petri_net(subnet, subim, subfm,
|
|
||||||
# "../figures/processmaps/artworks/petrinet_" + miner + "_" + str(artwork).zfill(3) + ".png")
|
|
||||||
retval1[i] = eval_pm(subdata, net, im, fm)
|
|
||||||
retval2[i] = eval_pm(subdata, subnet, subim, subfm)
|
|
||||||
|
|
||||||
retval1 = pd.DataFrame(retval1)
|
|
||||||
retval1.columns = ["fitness", "precision", "generalizability", "simplicity"]
|
|
||||||
retval1.index = event_log["case:artwork"].unique()
|
|
||||||
retval1.insert(0, "nettype", "alldata")
|
|
||||||
retval2 = pd.DataFrame(retval2)
|
|
||||||
retval2.columns = ["fitness", "precision", "generalizability", "simplicity"]
|
|
||||||
retval2.index = event_log["case:artwork"].unique()
|
|
||||||
retval2.insert(0, "nettype", "subdata")
|
|
||||||
return pd.concat([retval1, retval2])
|
|
||||||
|
|
||||||
|
|
||||||
for miner in ["heuristics", "inductive", "alpha", "ilp"]:
|
|
||||||
eval_art = pm_artworks(miner = miner)
|
|
||||||
eval_art.to_csv("results/eval_artworks_" + miner + ".csv", sep=";")
|
|
||||||
|
|
||||||
eval_art = pm_artworks(miner = "inductive")
|
|
||||||
|
|
||||||
##### Clustering ######
|
|
||||||
|
|
||||||
## KMeans
|
|
||||||
|
|
||||||
#eval_artworks = eval_art[eval_art.nettype == "alldata"].iloc[:,range(1,5)]
|
|
||||||
eval_artworks = eval_art[eval_art.nettype == "subdata"].iloc[:,range(1,5)]
|
|
||||||
|
|
||||||
kmeans = KMeans(n_clusters=4, max_iter=1000).fit(eval_artworks)
|
|
||||||
|
|
||||||
#from sklearn.manifold import MDS
|
|
||||||
#coord = pd.DataFrame(MDS(normalized_stress='auto').fit_transform(eval_artworks))
|
|
||||||
|
|
||||||
coord = eval_artworks
|
|
||||||
coord["clusters"] = kmeans.labels_
|
|
||||||
|
|
||||||
for i in coord.clusters.unique():
|
|
||||||
#plt.scatter(coord[coord.clusters == i].iloc[:,0], coord[coord.clusters == i].iloc[:,1],
|
|
||||||
plt.scatter(coord[coord.clusters == i].iloc[:,1], coord[coord.clusters == i].iloc[:,2],
|
|
||||||
#plt.scatter(coord[coord.clusters == i].iloc[:,2], coord[coord.clusters == i].iloc[:,4],
|
|
||||||
label = i)
|
|
||||||
plt.legend()
|
|
||||||
plt.show()
|
|
||||||
|
|
||||||
### Scree plot
|
|
||||||
|
|
||||||
sse = {}
|
|
||||||
for k in range(1, 10):
|
|
||||||
kmeans = KMeans(n_clusters=k, max_iter=1000).fit(eval_artworks[["precision", "generalizability"]])
|
|
||||||
#data["clusters"] = kmeans.labels_
|
|
||||||
#print(data["clusters"])
|
|
||||||
sse[k] = kmeans.inertia_ # Inertia: Sum of distances of samples to their closest cluster center
|
|
||||||
plt.figure()
|
|
||||||
plt.plot(list(sse.keys()), list(sse.values()))
|
|
||||||
plt.xlabel("Number of clusters")
|
|
||||||
plt.ylabel("SSE")
|
|
||||||
plt.show()
|
|
||||||
|
|
@ -1,266 +0,0 @@
|
|||||||
%reset
|
|
||||||
|
|
||||||
import pm4py
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
import numpy as np
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
|
|
||||||
###### Load data and create event logs ######
|
|
||||||
|
|
||||||
dat = pd.read_csv("results/haum/event_logfiles_2024-01-18_09-58-52.csv", sep = ";")
|
|
||||||
dat = dat[dat["date.start"] < "2020-03-13"]
|
|
||||||
# --> only pre corona (before artworks were updated)
|
|
||||||
|
|
||||||
event_log = pm4py.format_dataframe(dat, case_id='path', activity_key='event',
|
|
||||||
timestamp_key='date.start')
|
|
||||||
|
|
||||||
###### Descrptives of log data ######
|
|
||||||
|
|
||||||
# Distribution of events
|
|
||||||
event_log.event.value_counts()
|
|
||||||
event_log.event.value_counts(normalize=True)
|
|
||||||
|
|
||||||
# Number of paths
|
|
||||||
len(event_log.path.unique())
|
|
||||||
|
|
||||||
# Number of variants
|
|
||||||
variants = pm4py.get_variants(event_log)
|
|
||||||
len(variants)
|
|
||||||
|
|
||||||
sorted_variants = dict(sorted(variants.items(), key=lambda item: item[1], reverse = True))
|
|
||||||
{k: sorted_variants[k] for k in list(sorted_variants)[:20]}
|
|
||||||
|
|
||||||
filtered_log = event_log[event_log["event"] != "move"]
|
|
||||||
variants_no_move = pm4py.get_variants(filtered_log)
|
|
||||||
len(variants_no_move)
|
|
||||||
sorted_variants_no_move = dict(sorted(variants_no_move.items(), key=lambda item: item[1], reverse = True))
|
|
||||||
{k: sorted_variants_no_move[k] for k in list(sorted_variants_no_move)[:20]}
|
|
||||||
|
|
||||||
###### Read "conformative" Petri Net ######
|
|
||||||
|
|
||||||
basenet, initial_marking, final_marking = pm4py.read_pnml("results/conformative_petrinet_con.pnml")
|
|
||||||
|
|
||||||
def eval_pm(data, net, initial_marking, final_marking):
|
|
||||||
"""Caculate fitness, precision, generalizability, and simplicity for petri net"""
|
|
||||||
fitness = pm4py.fitness_token_based_replay(data, net, initial_marking, final_marking)
|
|
||||||
precisison = pm4py.precision_token_based_replay(data, net, initial_marking, final_marking)
|
|
||||||
generalizability = pm4py.algo.evaluation.generalization.algorithm.apply(data, net,
|
|
||||||
initial_marking, final_marking)
|
|
||||||
simplicity = pm4py.algo.evaluation.simplicity.algorithm.apply(net)
|
|
||||||
return [fitness['average_trace_fitness'], precisison, generalizability, simplicity]
|
|
||||||
|
|
||||||
baseline_eval = eval_pm(event_log, basenet, initial_marking, final_marking)
|
|
||||||
|
|
||||||
# TBR
|
|
||||||
replayed_traces = pm4py.conformance_diagnostics_token_based_replay(event_log, basenet, initial_marking, final_marking)
|
|
||||||
|
|
||||||
l1 = list()
|
|
||||||
l2 = list()
|
|
||||||
l3 = list()
|
|
||||||
l4 = list()
|
|
||||||
for i in range(len(replayed_traces)):
|
|
||||||
l1.append(replayed_traces[i]["remaining_tokens"])
|
|
||||||
l2.append(replayed_traces[i]["missing_tokens"])
|
|
||||||
l3.append(replayed_traces[i]["reached_marking"])
|
|
||||||
l4.append(replayed_traces[i]["transitions_with_problems"])
|
|
||||||
|
|
||||||
set(l1)
|
|
||||||
x1 = np.array(l1)
|
|
||||||
index_broken = np.where(x1 == 1)[0].tolist()
|
|
||||||
|
|
||||||
set(l3)
|
|
||||||
l4.count([])
|
|
||||||
|
|
||||||
[l3[i] for i in index_broken]
|
|
||||||
[l4[i] for i in index_broken]
|
|
||||||
|
|
||||||
broken_traces = [replayed_traces[i] for i in index_broken]
|
|
||||||
|
|
||||||
event_log[event_log['@@case_index'] == index_broken].event
|
|
||||||
event_log[event_log['@@case_index'] == index_broken].path.unique().tolist()
|
|
||||||
event_log[event_log['@@case_index'] == index_broken].item.unique().tolist()
|
|
||||||
event_log[event_log['@@case_index'] == index_broken]["fileId.start"].unique().tolist()
|
|
||||||
# --> logging error in raw file
|
|
||||||
|
|
||||||
|
|
||||||
# Footprints
|
|
||||||
from pm4py.algo.discovery.footprints import algorithm as footprints_discovery
|
|
||||||
from pm4py.visualization.footprints import visualizer as fp_visualizer
|
|
||||||
fp_log = footprints_discovery.apply(event_log, variant=footprints_discovery.Variants.ENTIRE_EVENT_LOG)
|
|
||||||
fp_net = footprints_discovery.apply(basenet, initial_marking, final_marking)
|
|
||||||
gviz = fp_visualizer.apply(fp_net, parameters={fp_visualizer.Variants.SINGLE.value.Parameters.FORMAT: "svg"})
|
|
||||||
fp_visualizer.view(gviz)
|
|
||||||
|
|
||||||
pm4py.vis.view_petri_net(basenet, initial_marking, final_marking)
|
|
||||||
is_sound = pm4py.check_soundness(basenet, initial_marking, final_marking)
|
|
||||||
baseline_eval.append(is_sound[0])
|
|
||||||
baseline_eval.append(len(basenet.arcs))
|
|
||||||
baseline_eval.append(len(basenet.transitions))
|
|
||||||
baseline_eval.append(len(basenet.places))
|
|
||||||
|
|
||||||
efg_graph = pm4py.discover_eventually_follows_graph(event_log)
|
|
||||||
|
|
||||||
## Directly-follows graph
|
|
||||||
dfg, start_activities, end_activities = pm4py.discover_dfg(event_log)
|
|
||||||
pm4py.view_dfg(dfg, start_activities, end_activities)
|
|
||||||
pm4py.save_vis_dfg(dfg, start_activities, end_activities, '../figures/processmaps/dfg_complete.png')
|
|
||||||
|
|
||||||
## Fitting different miners
|
|
||||||
### Heuristics Miner
|
|
||||||
h_net, im, fm = pm4py.discover_petri_net_heuristics(event_log)
|
|
||||||
h_eval = eval_pm(event_log, h_net, im, fm)
|
|
||||||
is_sound = pm4py.check_soundness(h_net, im, fm)
|
|
||||||
h_eval.append(is_sound[0])
|
|
||||||
h_eval.append(len(h_net.arcs))
|
|
||||||
h_eval.append(len(h_net.transitions))
|
|
||||||
h_eval.append(len(h_net.places))
|
|
||||||
|
|
||||||
## Alpha Miner
|
|
||||||
a_net, im, fm = pm4py.discover_petri_net_alpha(event_log)
|
|
||||||
a_eval = eval_pm(event_log, a_net, im, fm)
|
|
||||||
is_sound = pm4py.check_soundness(a_net, im, fm)
|
|
||||||
a_eval.append(is_sound[0])
|
|
||||||
a_eval.append(len(a_net.arcs))
|
|
||||||
a_eval.append(len(a_net.transitions))
|
|
||||||
a_eval.append(len(a_net.places))
|
|
||||||
|
|
||||||
## Inductive Miner
|
|
||||||
i_net, im, fm = pm4py.discover_petri_net_inductive(event_log)
|
|
||||||
i_eval = eval_pm(event_log, i_net, im, fm)
|
|
||||||
is_sound = pm4py.check_soundness(i_net, im, fm)
|
|
||||||
i_eval.append(is_sound[0])
|
|
||||||
i_eval.append(len(i_net.arcs))
|
|
||||||
i_eval.append(len(i_net.transitions))
|
|
||||||
i_eval.append(len(i_net.places))
|
|
||||||
|
|
||||||
## ILP Miner
|
|
||||||
ilp_net, im, fm = pm4py.discover_petri_net_ilp(event_log)
|
|
||||||
ilp_eval = eval_pm(event_log, ilp_net, im, fm)
|
|
||||||
is_sound = pm4py.check_soundness(ilp_net, im, fm)
|
|
||||||
ilp_eval.append(is_sound[0])
|
|
||||||
ilp_eval.append(len(ilp_net.arcs))
|
|
||||||
ilp_eval.append(len(ilp_net.transitions))
|
|
||||||
ilp_eval.append(len(ilp_net.places))
|
|
||||||
|
|
||||||
## Export for all miners
|
|
||||||
eval = pd.DataFrame(np.row_stack([baseline_eval, h_eval, a_eval, i_eval, ilp_eval]))
|
|
||||||
eval.columns = ["fitness", "precision", "generalizability", "simplicity",
|
|
||||||
"sound", "narcs", "ntrans", "nplaces"]
|
|
||||||
eval.index = ["conformative", "heuristics", "alpha", "inductive", "ilp"]
|
|
||||||
eval
|
|
||||||
|
|
||||||
eval.to_csv("results/eval_all-miners_complete.csv", sep=" ")
|
|
||||||
|
|
||||||
## Without broken trace
|
|
||||||
event_log_clean = event_log[event_log['@@case_index'] != index_broken[0]]
|
|
||||||
h_net, h_im, h_fm = pm4py.discover_petri_net_heuristics(event_log_clean)
|
|
||||||
a_net, a_im, a_fm = pm4py.discover_petri_net_alpha(event_log_clean)
|
|
||||||
i_net, i_im, i_fm = pm4py.discover_petri_net_inductive(event_log_clean)
|
|
||||||
ilp_net, ilp_im, ilp_fm = pm4py.discover_petri_net_ilp(event_log_clean)
|
|
||||||
|
|
||||||
baseline_eval = eval_pm(event_log_clean, basenet, initial_marking, final_marking)
|
|
||||||
is_sound = pm4py.check_soundness(basenet, initial_marking, final_marking)
|
|
||||||
baseline_eval.append(is_sound[0])
|
|
||||||
baseline_eval.append(len(basenet.arcs))
|
|
||||||
baseline_eval.append(len(basenet.transitions))
|
|
||||||
baseline_eval.append(len(basenet.places))
|
|
||||||
|
|
||||||
h_eval = eval_pm(event_log_clean, h_net, h_im, h_fm)
|
|
||||||
is_sound = pm4py.check_soundness(h_net, h_im, h_fm)
|
|
||||||
h_eval.append(is_sound[0])
|
|
||||||
h_eval.append(len(h_net.arcs))
|
|
||||||
h_eval.append(len(h_net.transitions))
|
|
||||||
h_eval.append(len(h_net.places))
|
|
||||||
|
|
||||||
a_eval = eval_pm(event_log_clean, a_net, a_im, a_fm)
|
|
||||||
is_sound = pm4py.check_soundness(a_net, a_im, a_fm)
|
|
||||||
a_eval.append(is_sound[0])
|
|
||||||
a_eval.append(len(a_net.arcs))
|
|
||||||
a_eval.append(len(a_net.transitions))
|
|
||||||
a_eval.append(len(a_net.places))
|
|
||||||
|
|
||||||
i_eval = eval_pm(event_log_clean, i_net, i_im, i_fm)
|
|
||||||
is_sound = pm4py.check_soundness(i_net, i_im, i_fm)
|
|
||||||
i_eval.append(is_sound[0])
|
|
||||||
i_eval.append(len(i_net.arcs))
|
|
||||||
i_eval.append(len(i_net.transitions))
|
|
||||||
i_eval.append(len(i_net.places))
|
|
||||||
|
|
||||||
ilp_eval = eval_pm(event_log_clean, ilp_net, ilp_im, ilp_fm)
|
|
||||||
is_sound = pm4py.check_soundness(ilp_net, ilp_im, ilp_fm)
|
|
||||||
ilp_eval.append(is_sound[0])
|
|
||||||
ilp_eval.append(len(ilp_net.arcs))
|
|
||||||
ilp_eval.append(len(ilp_net.transitions))
|
|
||||||
ilp_eval.append(len(ilp_net.places))
|
|
||||||
|
|
||||||
eval = pd.DataFrame(np.row_stack([baseline_eval, h_eval, a_eval, i_eval, ilp_eval]))
|
|
||||||
eval.columns = ["fitness", "precision", "generalizability", "simplicity",
|
|
||||||
"sound", "narcs", "ntrans", "nplaces"]
|
|
||||||
eval.index = ["conformative", "heuristics", "alpha", "inductive", "ilp"]

eval

eval.to_csv("results/eval_all-miners_clean.csv", sep=" ")

# Export petri nets

pm4py.vis.save_vis_petri_net(h_net, h_im, h_fm, "results/processmaps/petrinet_heuristics_clean.png")
pm4py.vis.save_vis_petri_net(a_net, a_im, a_fm, "results/processmaps/petrinet_alpha_clean.png")
pm4py.vis.save_vis_petri_net(i_net, i_im, i_fm, "results/processmaps/petrinet_inductive_clean.png")
pm4py.vis.save_vis_petri_net(ilp_net, ilp_im, ilp_fm, "results/processmaps/petrinet_ilp_clean.png")
pm4py.vis.save_vis_petri_net(basenet, initial_marking, final_marking, "results/processmaps/petrinet_conformative.png")

# Convert to BPMN

base_bpmn = pm4py.convert.convert_to_bpmn(basenet, initial_marking, final_marking)
pm4py.vis.save_vis_bpmn(base_bpmn, "results/processmaps/bpmn_conformative.png")

i_bpmn = pm4py.convert.convert_to_bpmn(i_net, i_im, i_fm)
pm4py.vis.save_vis_bpmn(i_bpmn, "results/processmaps/bpmn_inductive_clean.png")

ilp_bpmn = pm4py.convert.convert_to_bpmn(ilp_net, ilp_im, ilp_fm)
pm4py.vis.save_vis_bpmn(ilp_bpmn, "results/processmaps/bpmn_ilp_clean.png")

a_bpmn = pm4py.convert.convert_to_bpmn(a_net, a_im, a_fm)
pm4py.vis.save_vis_bpmn(a_bpmn, "results/processmaps/bpmn_alpha_clean.png")

h_bpmn = pm4py.convert.convert_to_bpmn(h_net, h_im, h_fm)
pm4py.vis.save_vis_bpmn(h_bpmn, "results/processmaps/bpmn_heuristics_clean.png")


###### Process Mining - individual artworks ######

def pm_artworks(miner):
    """Discover a petri net per artwork with the given miner and evaluate it.

    Returns one data frame with two rows per artwork: the conformative base net
    evaluated on the artwork's traces ("alldata") and the net discovered on the
    artwork's own traces ("subdata")."""

    retval1 = np.empty((len(event_log["item"].unique()), 4))
    retval2 = np.empty((len(event_log["item"].unique()), 4))

    for i in range(len(event_log["item"].unique())):
        artwork = event_log["item"].unique()[i]
        subdata = pm4py.filter_event_attribute_values(event_log, "item",
                                                      [artwork],
                                                      level="case", retain=True)
        if miner == "heuristics":
            subnet, subim, subfm = pm4py.discover_petri_net_heuristics(subdata)
        elif miner == "inductive":
            subnet, subim, subfm = pm4py.discover_petri_net_inductive(subdata)
        elif miner == "alpha":
            subnet, subim, subfm = pm4py.discover_petri_net_alpha(subdata)
        elif miner == "ilp":
            subnet, subim, subfm = pm4py.discover_petri_net_ilp(subdata)
        #pm4py.save_vis_petri_net(subnet, subim, subfm,
        #    "results/processmaps/artworks/petrinet_" + miner + "_" + str(artwork).zfill(3) + ".png")
        retval1[i] = eval_pm(subdata, basenet, initial_marking, final_marking)
        retval2[i] = eval_pm(subdata, subnet, subim, subfm)

    retval1 = pd.DataFrame(retval1)
    retval1.columns = ["fitness", "precision", "generalizability", "simplicity"]
    retval1.index = event_log["item"].unique()
    retval1.insert(0, "nettype", "alldata")

    retval2 = pd.DataFrame(retval2)
    retval2.columns = ["fitness", "precision", "generalizability", "simplicity"]
    retval2.index = event_log["item"].unique()
    retval2.insert(0, "nettype", "subdata")

    return pd.concat([retval1, retval2])


for miner in ["heuristics", "inductive", "alpha", "ilp"]:
    eval_art = pm_artworks(miner = miner)
    eval_art.to_csv("results/eval_artworks_" + miner + ".csv", sep=";")
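# Quick sanity check (illustrative, not part of the original script): read one of
# the per-miner result files back in; assumes the ";"-separated format with the
# artwork IDs as row index written above.
check = pd.read_csv("results/eval_artworks_heuristics.csv", sep=";", index_col=0)
check.head()
check.groupby("nettype").mean(numeric_only=True)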
@ -1,126 +0,0 @@
%reset

import pm4py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pm4py.visualization.petri_net import visualizer as pn_visualizer

parameters = {pn_visualizer.Variants.FREQUENCY.value.Parameters.FORMAT: "png"}


###### Load data and create event logs ######

dat = pd.read_csv("results/haum/event_logfiles_2024-01-18_09-58-52.csv", sep = ";")
dat = dat[dat["date.start"] < "2020-03-13"]
dat = dat[dat["path"] != 106098]   # exclude broken trace
# --> only pre corona (before artworks were updated)

event_log = pm4py.format_dataframe(dat, case_id='case', activity_key='event',
                                   timestamp_key='date.start')

event_log.event.value_counts()
event_log.event.value_counts(normalize=True)

dfg, start_activities, end_activities = pm4py.discover_dfg(event_log)
pm4py.view_dfg(dfg, start_activities, end_activities)

#filtered_log = pm4py.filter_event_attribute_values(event_log, 'item', [80])

net, im, fm = pm4py.discover_petri_net_inductive(event_log)
pm4py.vis.view_petri_net(net, im, fm)

pm4py.vis.view_petri_net(net, im, fm)
gviz = pn_visualizer.apply(net, im, fm, parameters=parameters,
                           variant=pn_visualizer.Variants.FREQUENCY,
                           log=event_log)
pn_visualizer.view(gviz)

bpmn = pm4py.convert.convert_to_bpmn(net, im, fm)
pm4py.vis.view_bpmn(bpmn)

net2, im2, fm2 = pm4py.discover_petri_net_inductive(event_log, noise_threshold=0.1)
pm4py.vis.view_petri_net(net2, im2, fm2)


def eval_pm(data, net, initial_marking, final_marking):
    """Calculate fitness, precision, generalizability, and simplicity for petri net"""
    fitness = pm4py.fitness_token_based_replay(data, net, initial_marking, final_marking)
    precision = pm4py.precision_token_based_replay(data, net, initial_marking, final_marking)
    #generalizability = pm4py.algo.evaluation.generalization.algorithm.apply(data, net,
    #                                                                        initial_marking, final_marking)
    simplicity = pm4py.algo.evaluation.simplicity.algorithm.apply(net)
    #return [fitness['average_trace_fitness'], precision, generalizability, simplicity]
    return [fitness['average_trace_fitness'], precision, simplicity]


eval = eval_pm(event_log, net, im, fm)
eval2 = eval_pm(event_log, net2, im2, fm2)

len(net.places)
len(net.transitions)
len(net.arcs)

# Number of cases
len(event_log.case.unique())

# Number of variants
variants = pm4py.get_variants(event_log)
len(variants)

sorted_variants = dict(sorted(variants.items(), key=lambda item: item[1], reverse = True))
{k: sorted_variants[k] for k in list(sorted_variants)[:20]}

filtered_log = event_log[event_log["event"] != "move"]
variants_no_move = pm4py.get_variants(filtered_log)
len(variants_no_move)
sorted_variants_no_move = dict(sorted(variants_no_move.items(), key=lambda item: item[1], reverse = True))
{k: sorted_variants_no_move[k] for k in list(sorted_variants_no_move)[:20]}
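# Illustrative check (not in the original analysis): share of all cases covered by
# the 20 most frequent "no move" variants; assumes get_variants() returns a
# {variant: count} mapping, as the sorting above already does.
sum(list(sorted_variants_no_move.values())[:20]) / sum(sorted_variants_no_move.values())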

###### Navigation behavior for case ######

log_case = pm4py.format_dataframe(dat, case_id = "case", activity_key = "item",
                                  timestamp_key = "date.start")
# tmp: item-level cluster assignments (kcluster/hcluster) created in the clustering script
log_case = log_case.merge(tmp, on = "item", how = "left")

#filtered_log = pm4py.filter_event_attribute_values(log_case, "kcluster", [3])
filtered_log = log_case[log_case.hcluster == 1]

dfg, start_activities, end_activities = pm4py.discover_dfg(filtered_log)
pm4py.vis.view_dfg(dfg, start_activities, end_activities)

net, im, fm = pm4py.discover_petri_net_inductive(filtered_log)
pm4py.vis.view_petri_net(net, im, fm)

tree = pm4py.discovery.discover_process_tree_inductive(filtered_log)
pm4py.vis.view_process_tree(tree)


datcase = dat[~dat.duplicated(["case", "path", "item"])]
datcase = datcase[["case", "path", "event", "item", "date.start"]]
datcase = datcase.reset_index().drop("index", axis = 1)
#datcase = pd.concat([datcase, pd.get_dummies(datcase["item"], dtype = "int")], axis = 1)

datcase["duration"] = dat.groupby("path")["duration"].mean().tolist()
datcase["distance"] = dat.groupby("path")["distance"].mean().tolist()
datcase["scaleSize"] = dat.groupby("path")["scaleSize"].mean().tolist()
datcase["rotationDegree"] = dat.groupby("path")["rotationDegree"].mean().tolist()

datcase["item"] = [str(item).zfill(3) for item in datcase.item]
# xy: MDS coordinates with cluster labels per item, created in the clustering script
datcase = datcase.merge(xy[["item", "hcluster"]], on = "item", how = "left")

log_case = pm4py.format_dataframe(dat, case_id = "case", activity_key = "item",
                                  timestamp_key = "date.start")

dfg, start_activities, end_activities = pm4py.discover_dfg(log_case)
pm4py.vis.view_dfg(dfg, start_activities, end_activities)
# don't know if this will eventually finish?

dfg, start_activities, end_activities = pm4py.discover_dfg(log_case[log_case.hcluster == 1])
pm4py.vis.view_dfg(dfg, start_activities, end_activities)
@ -18,13 +18,40 @@ def pn_infos(log, colname, filter):
    filtered_log = pm4py.filter_event_attribute_values(log, colname, [filter])

    net, im, fm = pm4py.discover_petri_net_inductive(filtered_log)

    eval = eval_append(log, net, im, fm)
    eval.index = [str(filter).zfill(3)]

    return eval


def pn_infos_miner(log, miner):
    """Create data frame with relevant infos for petri nets created with
    different miners"""

    if miner == "alpha":
        net, im, fm = pm4py.discover_petri_net_alpha(log)
    elif miner == "heuristics":
        net, im, fm = pm4py.discover_petri_net_heuristics(log)
    elif miner == "ilp":
        net, im, fm = pm4py.discover_petri_net_ilp(log)
    elif miner == "inductive":
        net, im, fm = pm4py.discover_petri_net_inductive(log)
    elif miner == "conformative":
        net, im, fm = pm4py.read_pnml("results/haum/conformative_petrinet_con.pnml")

    eval = eval_append(log, net, im, fm)
    eval.index = [miner]

    return eval


def eval_append(log, net, im, fm):

    eval = eval_pm(log, net, im, fm)
    is_sound = pm4py.check_soundness(net, im, fm)
    eval.append(is_sound[0])
    eval.append(len(net.arcs))
    eval.append(len(net.transitions))
    eval.append(len(net.places))
    variants = pm4py.get_variants(log)
    eval.append(len(variants))

    sorted_variants = dict(sorted(variants.items(), key=lambda item: item[1], reverse = True))
@ -33,5 +60,5 @@ def pn_infos(log, colname, filter):
    eval = pd.DataFrame(eval).T
    eval.columns = ["fitness", "precision", "generalizability", "simplicity",
                    "sound", "narcs", "ntrans", "nplaces", "nvariants", "mostfreq"]

    return eval
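# Usage sketch (not part of this module): the evaluation script can build the
# all-miners comparison table by stacking one row per miner; `event_log` is assumed
# to be the formatted event log of that script.
#
# eval_all = pd.concat([pn_infos_miner(event_log, miner) for miner in
#                       ["conformative", "heuristics", "alpha", "inductive", "ilp"]])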
@ -1,174 +0,0 @@
from sklearn.cluster import KMeans
from sklearn.manifold import MDS
from sklearn.preprocessing import StandardScaler

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


###### Clustering ######

## KMeans

# eval_art: per-artwork evaluation results from pm_artworks() in the process mining script
#eval_artworks = eval_art[eval_art.nettype == "alldata"].iloc[:,range(1,5)]
eval_artworks = eval_art[eval_art.nettype == "subdata"].iloc[:,range(1,5)]

kmeans = KMeans(n_clusters=4, max_iter=1000).fit(eval_artworks)

#coord = pd.DataFrame(MDS(normalized_stress='auto').fit_transform(eval_artworks))
coord = eval_artworks
coord["clusters"] = kmeans.labels_

for i in coord.clusters.unique():
    #plt.scatter(coord[coord.clusters == i].iloc[:,0], coord[coord.clusters == i].iloc[:,1],
    plt.scatter(coord[coord.clusters == i].iloc[:,1], coord[coord.clusters == i].iloc[:,2],
    #plt.scatter(coord[coord.clusters == i].iloc[:,2], coord[coord.clusters == i].iloc[:,4],
                label = i)
plt.legend()
plt.show()

### Scree plot

sse = {}
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(eval_artworks[["precision", "generalizability"]])
    #data["clusters"] = kmeans.labels_
    #print(data["clusters"])
    sse[k] = kmeans.inertia_   # inertia: sum of distances of samples to their closest cluster center

plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of clusters")
plt.ylabel("SSE")
plt.show()


### TMP

datitem = dat.groupby("item")[["duration", "distance",
                               "scaleSize", "rotationDegree"]].mean()

def length_path(data):
    x = data.path
    return len(x.unique())

def length_case(data):
    x = data.case
    return len(x.unique())

def length_topic(data):
    x = data.topic.dropna()
    return len(x.unique())

datitem["npaths"] = dat.groupby(["item"]).apply(length_path)
datitem["ncases"] = dat.groupby(["item"]).apply(length_case)
datitem["ntopics"] = dat.groupby(["item"]).apply(length_topic)

datitem.index = datitem.index.astype(str).str.rjust(3, "0")
datitem = datitem.sort_index()
# mdi: per-item metrics (fitness, sound, mostfreq, ...) created in another script
datitem.index = mdi.index

datitem = pd.concat([mdi, datitem], axis = 1)


###### Find clusters ######

myseed = 1420

mat = datitem.drop(["fitness", "sound", "mostfreq"], axis = 1)
mat = StandardScaler().fit_transform(mat)

xy = pd.DataFrame(MDS(normalized_stress = 'auto', random_state = myseed).fit_transform(mat))
xy.index = datitem.index

### K-Means clustering ###

kmeans = KMeans(n_clusters = 6, max_iter = 1000, random_state = myseed).fit(mat)
xy["kcluster"] = kmeans.labels_

for i in xy.kcluster.unique():
    plt.scatter(xy[xy.kcluster == i].iloc[:,0], xy[xy.kcluster == i].iloc[:,1], label = i)
    for j, txt in enumerate(xy.index[xy.kcluster == i]):
        plt.annotate(txt.split("_")[1], (xy[xy.kcluster == i].iloc[j,0], xy[xy.kcluster == i].iloc[j,1]))
plt.legend()
plt.show()

xy.kcluster.value_counts()

# Scree plot
sse = {}
for k in range(1, 10):
    kmeans = KMeans(n_clusters = k, max_iter = 1000).fit(mat)
    sse[k] = kmeans.inertia_   # inertia: sum of distances of samples to their closest cluster center

plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of clusters")
plt.ylabel("SSE")
plt.show()

c0_items = xy[xy.kcluster == 0].index
c1_items = xy[xy.kcluster == 1].index
c2_items = xy[xy.kcluster == 2].index
c3_items = xy[xy.kcluster == 3].index
c4_items = xy[xy.kcluster == 4].index
c5_items = xy[xy.kcluster == 5].index

### Hierarchical clustering ###

from sklearn.cluster import AgglomerativeClustering

hclust = AgglomerativeClustering(n_clusters = 6).fit(mat)
hclust.labels_

xy["hcluster"] = hclust.labels_

for i in xy.hcluster.unique():
    plt.scatter(xy[xy.hcluster == i].iloc[:,0], xy[xy.hcluster == i].iloc[:,1], label = i)
    for j, txt in enumerate(xy.index[xy.hcluster == i]):
        plt.annotate(txt.split("_")[1], (xy[xy.hcluster == i].iloc[j,0], xy[xy.hcluster == i].iloc[j,1]))
plt.legend()
plt.show()

# Dendrogram
from scipy.cluster.hierarchy import dendrogram

def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1   # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

hclust = AgglomerativeClustering(distance_threshold = 0, n_clusters = None).fit(mat)

plot_dendrogram(hclust)
plt.show()

### Bisecting K-Means clustering ###

from sklearn.cluster import BisectingKMeans

biKmeans = BisectingKMeans(n_clusters = 6, random_state = myseed).fit(mat)
biKmeans.labels_

xy["bcluster"] = biKmeans.labels_

for i in xy.bcluster.unique():
    plt.scatter(xy[xy.bcluster == i].iloc[:,0], xy[xy.bcluster == i].iloc[:,1], label = i)
    for j, txt in enumerate(xy.index[xy.bcluster == i]):
        plt.annotate(txt.split("_")[1], (xy[xy.bcluster == i].iloc[j,0], xy[xy.bcluster == i].iloc[j,1]))
plt.legend()
plt.show()
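# Not part of the original script: quick cross-tabulations to compare how the
# k-means, hierarchical, and bisecting k-means assignments relate to each other.
pd.crosstab(xy.kcluster, xy.hcluster)
pd.crosstab(xy.kcluster, xy.bcluster)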