Switched to python for fitting real process mining models; added clustering based on eval criteria in R
This commit is contained in:
parent
7bacefbdee
commit
7a4859227a
145
code/00_pm.py
Normal file
145
code/00_pm.py
Normal file
@ -0,0 +1,145 @@
|
||||
#%% # needed for shortcuts to run properly in VSCode *eyeroll*
|
||||
%reset
|
||||
|
||||
import pm4py
|
||||
from pm4py.algo.evaluation.generalization import algorithm as generalization_evaluator
|
||||
from pm4py.algo.evaluation.simplicity import algorithm as simplicity_evaluator
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn.cluster import KMeans
|
||||
|
||||
###### Load data and create event logs ######
|
||||
|
||||
# Read the semicolon-separated event log export.
dat = pd.read_csv(
    "../data/haum/event_logfiles_glossar_2023-11-03_17-46-28.csv",
    sep=";",
)

# --> only pre corona (before artworks were updated)
# NOTE(review): `date` is not parsed by read_csv here, so this is presumably a
# lexicographic comparison on ISO-formatted date strings -- confirm.
is_pre_corona = dat["date"] < "2020-03-13"
dat = dat.loc[is_pre_corona]

# Let pm4py add its standard case/activity/timestamp columns, then expose the
# artwork under a "case:"-prefixed column name.
event_log = pm4py.format_dataframe(
    dat,
    case_id="trace",
    activity_key="event",
    timestamp_key="date.start",
).rename(columns={"artwork": "case:artwork"})
# (pm4py.convert_to_event_log was tried here before and is marked as deprecated.)

# The bare names below only display the values when run as an interactive cell.
start_activities = pm4py.get_start_activities(event_log)
start_activities
end_activities = pm4py.get_end_activities(event_log)
end_activities
|
||||
|
||||
###### Process Mining - complete data set #####
|
||||
|
||||
def eval_pm(data, net, initial_marking, final_marking):
    """Calculate fitness, precision, generalizability, and simplicity for a Petri net.

    Fitness and precision are computed with token-based replay of ``data`` on
    the net; generalization and simplicity come from the pm4py evaluation
    algorithms.

    Returns:
        A list ``[fitness, precision, generalizability, simplicity]`` where
        fitness is the ``average_trace_fitness`` entry of the replay result.
    """
    model = (net, initial_marking, final_marking)

    # Alignment-based alternatives: pm4py.fitness_alignments and
    # pm4py.precision_alignments (same argument order).
    replay_fitness = pm4py.fitness_token_based_replay(data, *model)
    replay_precision = pm4py.precision_token_based_replay(data, *model)

    generalization = pm4py.algo.evaluation.generalization.algorithm.apply(data, *model)
    net_simplicity = pm4py.algo.evaluation.simplicity.algorithm.apply(net)

    return [
        replay_fitness['average_trace_fitness'],
        replay_precision,
        generalization,
        net_simplicity,
    ]
|
||||
|
||||
|
||||
## Directly-follows graph
dfg, start_activities, end_activities = pm4py.discover_dfg(event_log)
dfg_parts = (dfg, start_activities, end_activities)
pm4py.view_dfg(*dfg_parts)
pm4py.save_vis_dfg(*dfg_parts, '../figures/processmaps/dfg_complete.png')


def _mine_complete(discover, figure_path):
    """Discover a Petri net on the complete log, score it, show it and save it.

    ``discover`` is one of the ``pm4py.discover_petri_net_*`` functions and
    ``figure_path`` is where the rendered net is written.

    Returns ``(net, im, fm, scores)`` with ``scores`` being the eval_pm() list.
    """
    net, im, fm = discover(event_log)
    scores = eval_pm(event_log, net, im, fm)
    pm4py.vis.view_petri_net(net, im, fm)
    pm4py.vis.save_vis_petri_net(net, im, fm, figure_path)
    return net, im, fm, scores


## Heuristics Miner
net, im, fm, h_eval = _mine_complete(
    pm4py.discover_petri_net_heuristics,
    "../figures/processmaps/pn_heuristics_complete.png",
)

# decorated petri net (frequency variant, drawn for the heuristics net)
from pm4py.visualization.petri_net import visualizer as pn_visualizer
parameters = {pn_visualizer.Variants.FREQUENCY.value.Parameters.FORMAT: "png"}
gviz = pn_visualizer.apply(
    net, im, fm,
    parameters=parameters,
    variant=pn_visualizer.Variants.FREQUENCY,
    log=event_log,
)
pn_visualizer.save(gviz, "../figures/processmaps/pn_heuristics_complete_decorated.png")

## Alpha Miner
net, im, fm, a_eval = _mine_complete(
    pm4py.discover_petri_net_alpha,
    "../figures/processmaps/pn_alpha_complete.png",
)

## Inductive Miner
net, im, fm, i_eval = _mine_complete(
    pm4py.discover_petri_net_inductive,
    "../figures/processmaps/pn_induction_complete.png",
)

## ILP Miner
net, im, fm, ilp_eval = _mine_complete(
    pm4py.discover_petri_net_ilp,
    "../figures/processmaps/pn_ilp_complete.png",
)
|
||||
|
||||
|
||||
# Comparison table: one row per miner, one column per quality dimension in the
# order returned by eval_pm().
# The table is called `eval_miners` rather than `eval` so the builtin eval()
# is not shadowed for the rest of the interactive session.
# np.vstack replaces np.row_stack, which is deprecated as of NumPy 2.0.
eval_miners = pd.DataFrame(
    np.vstack([h_eval, a_eval, i_eval, ilp_eval]),
    index=["heuristics", "alpha", "inductive", "ilp"],
    columns=["fitness", "precision", "generalizability", "simplicity"],
)
eval_miners

eval_miners.to_csv("results/eval_all-miners_complete.csv", sep=";")
|
||||
|
||||
|
||||
###### Process Mining - individual artworks ######
|
||||
|
||||
# Evaluate, for every artwork, how well a model discovered on the COMPLETE
# event log describes the cases of that artwork.
#
# Previously the miner and the output file were switched by (un)commenting
# lines, and the committed state discovered a heuristics net but wrote the
# scores to the "inductive" file. Both miners are now run and each result is
# written to the file that carries its own name.
artworks = event_log["case:artwork"].unique()
metric_names = ["fitness", "precision", "generalizability", "simplicity"]
miners = {
    "heuristics": pm4py.discover_petri_net_heuristics,
    "inductive": pm4py.discover_petri_net_inductive,
}

eval_art_by_miner = {}
for miner_name, discover in miners.items():
    # The net is discovered once on all data and reused for every artwork.
    net, im, fm = discover(event_log)

    scores = np.empty((len(artworks), len(metric_names)))
    for i, artwork in enumerate(artworks):
        subdata = pm4py.filter_event_attribute_values(
            event_log, "case:artwork", [artwork], level="case", retain=True
        )
        scores[i] = eval_pm(subdata, net, im, fm)

    result = pd.DataFrame(scores, columns=metric_names, index=artworks)
    result.to_csv(f"results/eval_{miner_name}_artworks.csv", sep=";")
    eval_art_by_miner[miner_name] = result

# The clustering further down works on `eval_art`; keep it bound to the
# heuristics result, which is the miner that was active here before.
eval_art = eval_art_by_miner["heuristics"]
|
||||
|
||||
|
||||
##### Clustering ######
|
||||
|
||||
## KMeans
|
||||
|
||||
# Cluster the artworks on all four evaluation metrics.
# NOTE(review): no random_state is set, so the cluster numbering (and thus
# the legend labels/colors) can differ between runs.
metric_columns = ["fitness", "precision", "generalizability", "simplicity"]
kmeans = KMeans(n_clusters=4, max_iter=1000).fit(eval_art[metric_columns])

# Work on a copy: assigning the labels to `eval_art` itself would add a
# "clusters" column to the evaluation table, and re-running this cell would
# then cluster on the previous labels as a fifth feature.
coord = eval_art.copy()
coord["clusters"] = kmeans.labels_

# One scatter call per cluster so that each cluster gets its own legend entry.
# Columns are addressed by name; swap the two names to look at another pair
# of metrics.
for label, members in coord.groupby("clusters", sort=False):
    plt.scatter(members["precision"], members["generalizability"], label=label)
plt.legend()
plt.show()
|
||||
|
||||
### Scree plot
|
||||
|
||||
# Within-cluster sum of squares for k = 1..9, clustering on precision and
# generalizability only.
# The models are fitted inside the comprehension so that the 4-cluster model
# stored in `kmeans` above is not overwritten.
scree_features = eval_art[["precision", "generalizability"]]
sse = {
    # inertia_: sum of squared distances of samples to their closest cluster center
    k: KMeans(n_clusters=k, max_iter=1000).fit(scree_features).inertia_
    for k in range(1, 10)
}

plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of clusters")
plt.ylabel("SSE")
plt.show()
|
||||
|
||||
# TODO: Redo it for data pre corona, so I do not have artefacts for 504 and 505
|
||||
# TODO: Create plot with artworks in it:
|
||||
# https://stackoverflow.com/questions/27800307/adding-a-picture-to-plot-in-r
|
256
code/01_clustering.R
Normal file
256
code/01_clustering.R
Normal file
@ -0,0 +1,256 @@
|
||||
# 01_clustering.R
|
||||
#
|
||||
# content: (1) Read evaluation data
|
||||
# (2) Clustering
|
||||
#          (3) Visualization with pictures
#          (4) Read event logs
#          (5) Frequency plot for clusters
#          (6) DFGs for clusters
|
||||
#
|
||||
# input:   results/eval_heuristics_artworks.csv
#          results/eval_inductive_artworks.csv
|
||||
# results/eval_all-miners_complete.csv
|
||||
# output: --
|
||||
#
|
||||
# last mod: 2023-12-08, NW
|
||||
|
||||
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
|
||||
|
||||
#--------------- (1) Read evaluation data ---------------
|
||||
|
||||
# Per-artwork evaluation tables; the first column of each file is used as
# row names.
eval_files <- c(heuristics = "results/eval_heuristics_artworks.csv",
                inductive  = "results/eval_inductive_artworks.csv")

eval_tables <- lapply(eval_files, read.table,
                      header = TRUE, sep = ";", row.names = 1)

eval_heuristics <- eval_tables$heuristics
eval_inductive  <- eval_tables$inductive
|
||||
|
||||
#--------------- (2) Clustering ---------------
|
||||
|
||||
# NOTE: kmeans() draws random starting centers, so the ORDER of the kmeans()
# calls below determines the result after set.seed(). Do not reorder them.
set.seed(1607)

colors <- c("#3CB4DC", "#78004B", "#91C86E", "#FF6900")
ks <- 1:10

# Total within-cluster sum of squares of a k-means fit for every k in `ks`.
scree_sse <- function(eval_data) {
  vapply(ks, function(k) kmeans(eval_data, k)$tot.withinss, numeric(1))
}

# Heuristics Miner

k1 <- kmeans(eval_heuristics, 4)

plot(generalizability ~ precision, eval_heuristics, pch = 16,
     col = colors[k1$cluster])

## Scree plot

sse <- scree_sse(eval_heuristics)
plot(sse ~ ks, type = "l")

# Inductive Miner

k2 <- kmeans(eval_inductive, 4)

plot(generalizability ~ precision, eval_inductive, pch = 16,
     col = colors[k2$cluster])

## Scree plot

sse <- scree_sse(eval_inductive)
plot(sse ~ ks, type = "l")
|
||||
|
||||
#--------------- (3) Visualization with pictures ---------------
|
||||
|
||||
library(png)
library(jpeg)
library(grid)

# Read the card picture of one artwork (numeric id) and return it as a raster
# built from the first three channels of the image array.
read_artwork_raster <- function(art) {
  art_string <- sprintf("%03d", art)
  # Artwork 125 is read as a JPEG file, every other artwork as PNG.
  pic <- if (art == 125) {
    readJPEG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
                    art_string, "/", art_string, ".jpg"))
  } else {
    readPNG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
                   art_string, "/", art_string, ".png"))
  }
  as.raster(pic[,,1:3])
}

## Heuristics Miner
#pdf("../figures/clustering_heuristics.pdf", height = 8, width = 8, pointsize = 10)
png("../figures/clustering_heuristics.png", units = "in", height = 8, width = 8, pointsize = 10, res = 300)
par(mai = c(.6,.6,.1,.1), mgp = c(2.4, 1, 0))

# Empty frame first; squares and pictures are added artwork by artwork.
plot(generalizability ~ precision, eval_heuristics, type = "n", ylim = c(0.845, 0.98))

for (art in as.numeric(rownames(eval_heuristics))) {

  img <- read_artwork_raster(art)

  is_art <- rownames(eval_heuristics) == art
  x <- eval_heuristics[is_art, "precision"]
  y <- eval_heuristics[is_art, "generalizability"]

  # Colored square in the cluster color, with the picture drawn on top of it.
  points(x, y, col = colors[k1$cluster[as.character(art)]], cex = 8, pch = 15)

  rasterImage(img,
              xleft   = x - .002,
              xright  = x + .002,
              ybottom = y - .004,
              ytop    = y + .004)
}

dev.off()

## Inductive Miner
# Drawn on the current device; all cluster squares are plotted at once here.
plot(generalizability ~ precision, eval_inductive, col = colors[k2$cluster],
     cex = 8, pch = 15)

for (art in as.numeric(rownames(eval_inductive))) {

  img <- read_artwork_raster(art)

  is_art <- rownames(eval_inductive) == art
  x <- eval_inductive[is_art, "precision"]
  y <- eval_inductive[is_art, "generalizability"]

  rasterImage(img,
              xleft   = x - .001,
              xright  = x + .001,
              ybottom = y - .002,
              ytop    = y + .002)
}
|
||||
|
||||
#--------------- (4) Read event logs ---------------
|
||||
|
||||
dat <- read.table("../data/haum/event_logfiles_glossar_2023-11-03_17-46-28.csv",
                  sep = ";", header = TRUE)

# Convert the three timestamp columns to date-times.
dat$date       <- as.POSIXct(dat$date)
dat$date.start <- as.POSIXct(dat$date.start)
dat$date.stop  <- as.POSIXct(dat$date.stop)

# Zero-padded, three-digit artwork ids ("001", "002", ...).
dat$artwork <- sprintf("%03d", dat$artwork)
dat$event   <- factor(dat$event, levels = c("move", "flipCard", "openTopic", "openPopup"))

# Weekday of the event start as a factor ordered Monday..Sunday.
# weekdays() returns names in the session locale, so matching them against
# German level names yields only NA on a non-German system. "%u" gives the
# weekday as a number (1 = Monday, ..., 7 = Sunday) in every locale.
dat$weekdays <- factor(format(dat$date.start, "%u"),
                       levels = as.character(1:7),
                       labels = c("Monday", "Tuesday", "Wednesday",
                                  "Thursday", "Friday", "Saturday",
                                  "Sunday"))
|
||||
|
||||
|
||||
#--------------- (5) Frequency plot for clusters ---------------
|
||||
|
||||
# Only pre Corona
dat <- dat[dat$date < "2020-03-13",]

# Number of logged events per artwork.
counts_artwork <- table(dat$artwork)
dat_count <- as.data.frame(counts_artwork)
names(dat_count) <- c("artwork", "freq")

# Attach the heuristics-miner cluster of every artwork. The lookup is done by
# artwork id instead of by position, so a different ordering or a different
# set of artworks in the two sources cannot silently mislabel rows.
cluster_ids <- as.numeric(names(k1$cluster))
count_ids   <- as.numeric(as.character(dat_count$artwork))
cluster_pos <- match(count_ids, cluster_ids)
stopifnot("artwork without cluster assignment" = !anyNA(cluster_pos))
dat_count$cluster <- k1$cluster[cluster_pos]

# Relabel the k-means cluster numbers for the plot.
# NOTE(review): the mapping c(4, 2, 1, 3) -> 4:1 and the color order below are
# hard-coded; presumably they fit the numbering obtained with set.seed(1607).
# Re-check them whenever the clustering changes.
dat_count$cluster <- factor(dat_count$cluster, levels = c(4, 2, 1, 3), labels = 4:1)

# Sort bars by cluster and, within a cluster, by frequency (both decreasing),
# and fix that order in the factor levels so barplot() keeps it.
dat_count <- dat_count[order(dat_count$cluster, dat_count$freq, decreasing = TRUE), ]
dat_count$artwork <- factor(dat_count$artwork, levels = unique(dat_count$artwork))

barplot(freq ~ artwork, dat_count, las = 2, ylim = c(0, 60000),
        border = "white", ylab = "",
        col = c("#FF6900", "#78004B", "#3CB4DC", "#91C86E" )[dat_count$cluster])

# compare to clusters
plot(generalizability ~ precision, eval_heuristics, type = "n", ylim = c(0.845, 0.98))
with(eval_heuristics, text(precision, generalizability,
                           rownames(eval_heuristics),
                           col = colors[k1$cluster]))
|
||||
|
||||
#--------------- (6) DFGs for clusters ---------------
|
||||
|
||||
library(bupaverse)

# activitylog() is given one start and one complete timestamp column per row.
dat$start    <- dat$date.start
dat$complete <- dat$date.stop

alog <- activitylog(dat,
                    case_id     = "trace",
                    activity_id = "event",
                    resource_id = "artwork",
                    timestamps  = c("start", "complete"))

# One sub-log per cluster, selected via the artworks assigned to that cluster
# in dat_count.
alog_c1 <- filter_case_condition(alog,
             artwork %in% dat_count[dat_count$cluster == 1, "artwork"])
alog_c2 <- filter_case_condition(alog,
             artwork %in% dat_count[dat_count$cluster == 2, "artwork"])
alog_c3 <- filter_case_condition(alog,
             artwork %in% dat_count[dat_count$cluster == 3, "artwork"])
alog_c4 <- filter_case_condition(alog,
             artwork %in% dat_count[dat_count$cluster == 4, "artwork"])

# Build the process map for `log` without rendering it: absolute frequencies
# as primary and relative frequencies as secondary labels on nodes and edges.
dfg_map <- function(log) {
  process_map(log,
              type_nodes = frequency("absolute", color_scale = "Greys"),
              sec_nodes  = frequency("relative"),
              type_edges = frequency("absolute", color_edges = "#FF6900"),
              sec_edges  = frequency("relative"),
              rankdir    = "TB",
              render     = FALSE)
}

dfg_complete <- dfg_map(alog)
export_map(dfg_complete,
           file_name = "../figures/processmaps/dfg_complete_R.pdf",
           file_type = "pdf",
           title = "DFG complete")

dfg_c1 <- dfg_map(alog_c1)
export_map(dfg_c1,
           file_name = "../figures/processmaps/dfg_cluster1_R.pdf",
           file_type = "pdf",
           title = "DFG Cluster 1")

dfg_c2 <- dfg_map(alog_c2)
export_map(dfg_c2,
           file_name = "../figures/processmaps/dfg_cluster2_R.pdf",
           file_type = "pdf",
           title = "DFG Cluster 2")

dfg_c3 <- dfg_map(alog_c3)
export_map(dfg_c3,
           file_name = "../figures/processmaps/dfg_cluster3_R.pdf",
           file_type = "pdf",
           title = "DFG Cluster 3")

dfg_c4 <- dfg_map(alog_c4)
export_map(dfg_c4,
           file_name = "../figures/processmaps/dfg_cluster4_R.pdf",
           file_type = "pdf",
           title = "DFG Cluster 4")
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user