diff --git a/code/00_pm.py b/code/00_pm.py
new file mode 100644
index 0000000..ec11bdd
--- /dev/null
+++ b/code/00_pm.py
@@ -0,0 +1,145 @@
+#%% # needed for shortcuts to run properly in VSCode *eyeroll*
+%reset
+
+import pm4py
+from pm4py.algo.evaluation.generalization import algorithm as generalization_evaluator
+from pm4py.algo.evaluation.simplicity import algorithm as simplicity_evaluator
+
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.cluster import KMeans
+
+###### Load data and create event logs ######
+
+dat = pd.read_csv("../data/haum/event_logfiles_glossar_2023-11-03_17-46-28.csv", sep = ";")
+dat = dat[dat.date < "2020-03-13"]
+# --> only pre corona (before artworks were updated)
+
+event_log = pm4py.format_dataframe(dat, case_id='trace', activity_key='event',
+                                   timestamp_key='date.start')
+event_log = event_log.rename(columns={'artwork': 'case:artwork'})
+#event_log = pm4py.convert_to_event_log(dat_log) # deprecated
+start_activities = pm4py.get_start_activities(event_log)
+start_activities
+end_activities = pm4py.get_end_activities(event_log)
+end_activities
+
+###### Process Mining - complete data set #####
+
+def eval_pm(data, net, initial_marking, final_marking):
+    """Caculate fitness, precision, generalizability, and simplicity for petri net"""
+    fitness = pm4py.fitness_token_based_replay(data, net, initial_marking, final_marking)
+    #fitness = pm4py.fitness_alignments(data, net, initial_marking, final_marking)
+    precisison = pm4py.precision_token_based_replay(data, net, initial_marking, final_marking)
+    #precision = pm4py.precision_alignments(data, net, initial_marking, final_marking)
+    generalizability = pm4py.algo.evaluation.generalization.algorithm.apply(data, net, initial_marking, final_marking)
+    simplicity = pm4py.algo.evaluation.simplicity.algorithm.apply(net)
+    return [fitness['average_trace_fitness'], precisison, generalizability, simplicity]
+
+
+## Directly-follows graph
+dfg, start_activities, end_activities = pm4py.discover_dfg(event_log)
+pm4py.view_dfg(dfg, start_activities, end_activities)
+pm4py.save_vis_dfg(dfg, start_activities, end_activities, '../figures/processmaps/dfg_complete.png')
+
+## Heuristics Miner
+net, im, fm = pm4py.discover_petri_net_heuristics(event_log)
+h_eval = eval_pm(event_log, net, im, fm)
+pm4py.vis.view_petri_net(net, im, fm)
+pm4py.vis.save_vis_petri_net(net, im, fm, "../figures/processmaps/pn_heuristics_complete.png")
+
+# decorated petri net
+from pm4py.visualization.petri_net import visualizer as pn_visualizer
+parameters = {pn_visualizer.Variants.FREQUENCY.value.Parameters.FORMAT: "png"}
+gviz = pn_visualizer.apply(net, im, fm, parameters=parameters, variant=pn_visualizer.Variants.FREQUENCY, log=event_log)
+pn_visualizer.save(gviz, "../figures/processmaps/pn_heuristics_complete_decorated.png")
+
+## Alpha Miner
+net, im, fm = pm4py.discover_petri_net_alpha(event_log)
+a_eval = eval_pm(event_log, net, im, fm)
+pm4py.vis.view_petri_net(net, im, fm)
+pm4py.vis.save_vis_petri_net(net, im, fm, "../figures/processmaps/pn_alpha_complete.png")
+
+## Inductive Miner
+net, im, fm = pm4py.discover_petri_net_inductive(event_log)
+i_eval = eval_pm(event_log, net, im, fm)
+pm4py.vis.view_petri_net(net, im, fm)
+pm4py.vis.save_vis_petri_net(net, im, fm, "../figures/processmaps/pn_induction_complete.png")
+
+
+## ILP Miner
+net, im, fm = pm4py.discover_petri_net_ilp(event_log)
+ilp_eval = eval_pm(event_log, net, im, fm)
+pm4py.vis.view_petri_net(net, im, fm)
+pm4py.vis.save_vis_petri_net(net, im, fm, "../figures/processmaps/pn_ilp_complete.png")
+
+
+eval = pd.DataFrame(np.row_stack([h_eval, a_eval, i_eval, ilp_eval]))
+eval.columns = ["fitness", "precision", "generalizability", "simplicity"]
+eval.index = ["heuristics", "alpha", "inductive", "ilp"]
+eval
+
+eval.to_csv("results/eval_all-miners_complete.csv", sep=";")
+
+
+###### Process Mining - individual artworks ######
+
+net, im, fm = pm4py.discover_petri_net_heuristics(event_log)
+#net, im, fm = pm4py.discover_petri_net_inductive(event_log)
+
+eval_art = np.empty((len(event_log["case:artwork"].unique()), 4))
+
+for i in range(len(event_log["case:artwork"].unique())):
+
+    subdata = pm4py.filter_event_attribute_values(event_log, "case:artwork",
+                                                  [event_log["case:artwork"].unique()[i]],
+                                                  level="case", retain=True)
+    #net, im, fm = pm4py.discover_petri_net_heuristics(subdata)
+    eval_art[i] = eval_pm(subdata, net, im, fm)
+
+eval_art = pd.DataFrame(eval_art)
+eval_art.columns = ["fitness", "precision", "generalizability", "simplicity"]
+eval_art.index = event_log["case:artwork"].unique()
+
+#eval_art.to_csv("results/eval_heuristics_artworks.csv", sep=";")
+eval_art.to_csv("results/eval_inductive_artworks.csv", sep=";")
+
+
+##### Clustering ######
+
+## KMeans
+
+kmeans = KMeans(n_clusters=4, max_iter=1000).fit(eval_art)
+
+#from sklearn.manifold import MDS
+#coord = pd.DataFrame(MDS(normalized_stress='auto').fit_transform(eval_art))
+
+coord = eval_art
+coord["clusters"] = kmeans.labels_
+
+for i in coord.clusters.unique():
+    #plt.scatter(coord[coord.clusters == i].iloc[:,0], coord[coord.clusters == i].iloc[:,1],
+    plt.scatter(coord[coord.clusters == i].iloc[:,1], coord[coord.clusters == i].iloc[:,2],
+    #plt.scatter(coord[coord.clusters == i].iloc[:,2], coord[coord.clusters == i].iloc[:,4],
+                label = i)
+plt.legend()
+plt.show()
+
+### Scree plot
+
+sse = {}
+for k in range(1, 10):
+    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(eval_art[["precision", "generalizability"]])
+    #data["clusters"] = kmeans.labels_
+    #print(data["clusters"])
+    sse[k] = kmeans.inertia_ # Inertia: Sum of distances of samples to their closest cluster center
+plt.figure()
+plt.plot(list(sse.keys()), list(sse.values()))
+plt.xlabel("Number of clusters")
+plt.ylabel("SSE")
+plt.show()
+
+# TODO: Redo it for data pre corona, so I do not have artefacts for 504 and 505
+# TODO: Create plot with artworks in it:
+# https://stackoverflow.com/questions/27800307/adding-a-picture-to-plot-in-r
diff --git a/code/01_clustering.R b/code/01_clustering.R
new file mode 100644
index 0000000..7bafa80
--- /dev/null
+++ b/code/01_clustering.R
@@ -0,0 +1,256 @@
+# 00_current_analysis.R
+#
+# content: (1) Read evalutation data
+#          (2) Clustering
+#          (3) Visualization with pictures
+#
+# input: results/eval_heuristics_artworks.csv
+#        results/eval_all-miners_complete.csv
+# output: --
+#
+# last mod: 2023-12-08, NW
+
+# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
+
+#--------------- (1) Read evaluation data ---------------
+
+eval_heuristics <- read.table("results/eval_heuristics_artworks.csv", header = TRUE,
+                              sep = ";", row.names = 1)
+eval_inductive <- read.table("results/eval_inductive_artworks.csv", header = TRUE,
+                             sep = ";", row.names = 1)
+
+#--------------- (2) Clustering ---------------
+
+set.seed(1607)
+
+# Heuristics Miner
+
+k1 <- kmeans(eval_heuristics, 4)
+
+colors <- c("#3CB4DC", "#78004B", "#91C86E", "#FF6900")
+
+plot(generalizability ~ precision, eval_heuristics, pch = 16, col = colors[k1$cluster])
+
+
+## Scree plot
+
+ks <- 1:10
+
+sse <- NULL
+for (k in ks) sse <- c(sse, kmeans(eval_heuristics, k)$tot.withinss)
+
+plot(sse ~ ks, type = "l")
+
+# Inductive Miner
+
+k2 <- kmeans(eval_inductive, 4)
+
+plot(generalizability ~ precision, eval_inductive, pch = 16, col = colors[k2$cluster])
+
+
+## Scree plot
+
+ks <- 1:10
+
+sse <- NULL
+for (k in ks) sse <- c(sse, kmeans(eval_inductive, k)$tot.withinss)
+
+plot(sse ~ ks, type = "l")
+
+#--------------- (3) Visualization with pictures ---------------
+
+library(png)
+library(jpeg)
+library(grid)
+
+## Heuristics Miner
+#pdf("../figures/clustering_heuristics.pdf", height = 8, width = 8, pointsize = 10)
+png("../figures/clustering_heuristics.png", units = "in", height = 8, width = 8, pointsize = 10, res = 300)
+par(mai = c(.6,.6,.1,.1), mgp = c(2.4, 1, 0))
+
+plot(generalizability ~ precision, eval_heuristics, type = "n", ylim = c(0.845, 0.98))
+
+for (art in as.numeric(rownames(eval_heuristics))) {
+
+  art_string <- sprintf("%03d", art)
+
+  if (art == 125) {
+
+    pic <- readJPEG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
+                           art_string, "/", art_string, ".jpg"))
+  } else {
+    pic <- readPNG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
+                          art_string, "/", art_string, ".png"))
+  }
+
+  img <- as.raster(pic[,,1:3])
+
+  x <- eval_heuristics[rownames(eval_heuristics) == art, "precision"]
+  y <- eval_heuristics[rownames(eval_heuristics) == art, "generalizability"]
+
+  points(x, y, col = colors[k1$cluster[as.character(art)]], cex = 8, pch = 15)
+
+  rasterImage(img,
+              xleft = x - .002,
+              xright = x + .002,
+              ybottom = y - .004,
+              ytop = y + .004)
+
+}
+
+dev.off()
+
+## Inductive Miner
+plot(generalizability ~ precision, eval_inductive, col = colors[k2$cluster],
+     cex = 8, pch = 15)
+
+for (art in as.numeric(rownames(eval_inductive))) {
+
+  art_string <- sprintf("%03d", art)
+
+  if (art == 125) {
+
+    pic <- readJPEG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
+                           art_string, "/", art_string, ".jpg"))
+  } else {
+    pic <- readPNG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
+                          art_string, "/", art_string, ".png"))
+  }
+
+  img <- as.raster(pic[,,1:3])
+
+  x <- eval_inductive[rownames(eval_inductive) == art, "precision"]
+  y <- eval_inductive[rownames(eval_inductive) == art, "generalizability"]
+
+  rasterImage(img,
+              xleft = x - .001,
+              xright = x + .001,
+              ybottom = y - .002,
+              ytop = y + .002)
+
+}
+
+#--------------- (4) Read event logs ---------------
+
+dat <- read.table("../data/haum/event_logfiles_glossar_2023-11-03_17-46-28.csv",
+                  sep = ";", header = TRUE)
+dat$date <- as.POSIXct(dat$date)
+dat$date.start <- as.POSIXct(dat$date.start)
+dat$date.stop <- as.POSIXct(dat$date.stop)
+dat$artwork <- sprintf("%03d", dat$artwork)
+dat$event <- factor(dat$event, levels = c("move", "flipCard", "openTopic", "openPopup"))
+
+dat$weekdays <- factor(weekdays(dat$date.start),
+                       levels = c("Montag", "Dienstag", "Mittwoch",
+                                  "Donnerstag", "Freitag", "Samstag",
+                                  "Sonntag"),
+                       labels = c("Monday", "Tuesday", "Wednesday",
+                                  "Thursday", "Friday", "Saturday",
+                                  "Sunday"))
+
+
+#--------------- (5) Frequency plot for clusters ---------------
+
+# Only pre Corona
+dat <- dat[dat$date < "2020-03-13",]
+
+counts_artwork <- table(dat$artwork)
+dat_count <- as.data.frame(counts_artwork)
+names(dat_count) <- c("artwork", "freq")
+dat_count$cluster <- k1$cluster[order(as.numeric(names(k1$cluster)))]
+dat_count$cluster <- factor(dat_count$cluster, levels = c(4, 2, 1, 3), labels = 4:1)
+dat_count <- dat_count[order(dat_count$cluster, dat_count$freq, decreasing = TRUE), ]
+dat_count$artwork <- factor(dat_count$artwork, levels = unique(dat_count$artwork))
+
+barplot(freq ~ artwork, dat_count, las = 2, ylim = c(0, 60000),
+        border = "white", ylab = "",
+        col = c("#FF6900", "#78004B", "#3CB4DC", "#91C86E" )[dat_count$cluster])
+
+# compare to clusters
+plot(generalizability ~ precision, eval_heuristics, type = "n", ylim = c(0.845, 0.98))
+with(eval_heuristics, text(precision, generalizability,
+                           rownames(eval_heuristics),
+                           col = colors[k1$cluster]))
+
+#--------------- (6) DFGs for clusters ---------------
+
+library(bupaverse)
+
+
+dat$start <- dat$date.start
+dat$complete <- dat$date.stop
+
+
+alog <- activitylog(dat,
+                    case_id = "trace",
+                    activity_id = "event",
+                    resource_id = "artwork",
+                    timestamps = c("start", "complete"))
+
+
+alog_c1 <- filter_case_condition(alog,
+                                 artwork %in% dat_count[dat_count$cluster == 1, "artwork"])
+alog_c2 <- filter_case_condition(alog,
+                                 artwork %in% dat_count[dat_count$cluster == 2, "artwork"])
+alog_c3 <- filter_case_condition(alog,
+                                 artwork %in% dat_count[dat_count$cluster == 3, "artwork"])
+alog_c4 <- filter_case_condition(alog,
+                                 artwork %in% dat_count[dat_count$cluster == 4, "artwork"])
+
+dfg_complete <- process_map(alog,
+                            type_nodes = frequency("absolute", color_scale = "Greys"),
+                            sec_nodes = frequency("relative"),
+                            type_edges = frequency("absolute", color_edges = "#FF6900"),
+                            sec_edges = frequency("relative"),
+                            rankdir = "TB",
+                            render = FALSE)
+export_map(dfg_complete,
+           file_name = "../figures/processmaps/dfg_complete_R.pdf",
+           file_type = "pdf",
+           title = "DFG complete")
+dfg_c1 <- process_map(alog_c1,
+                      type_nodes = frequency("absolute", color_scale = "Greys"),
+                      sec_nodes = frequency("relative"),
+                      type_edges = frequency("absolute", color_edges = "#FF6900"),
+                      sec_edges = frequency("relative"),
+                      rankdir = "TB",
+                      render = FALSE)
+export_map(dfg_c1,
+           file_name = "../figures/processmaps/dfg_cluster1_R.pdf",
+           file_type = "pdf",
+           title = "DFG Cluster 1")
+dfg_c2 <- process_map(alog_c2,
+                      type_nodes = frequency("absolute", color_scale = "Greys"),
+                      sec_nodes = frequency("relative"),
+                      type_edges = frequency("absolute", color_edges = "#FF6900"),
+                      sec_edges = frequency("relative"),
+                      rankdir = "TB",
+                      render = FALSE)
+export_map(dfg_c2,
+           file_name = "../figures/processmaps/dfg_cluster2_R.pdf",
+           file_type = "pdf",
+           title = "DFG Cluster 2")
+dfg_c3 <- process_map(alog_c3,
+                      type_nodes = frequency("absolute", color_scale = "Greys"),
+                      sec_nodes = frequency("relative"),
+                      type_edges = frequency("absolute", color_edges = "#FF6900"),
+                      sec_edges = frequency("relative"),
+                      rankdir = "TB",
+                      render = FALSE)
+export_map(dfg_c3,
+           file_name = "../figures/processmaps/dfg_cluster3_R.pdf",
+           file_type = "pdf",
+           title = "DFG Cluster 3")
+dfg_c4 <- process_map(alog_c4,
+                      type_nodes = frequency("absolute", color_scale = "Greys"),
+                      sec_nodes = frequency("relative"),
+                      type_edges = frequency("absolute", color_edges = "#FF6900"),
+                      sec_edges = frequency("relative"),
+                      rankdir = "TB",
+                      render = FALSE)
+export_map(dfg_c4,
+           file_name = "../figures/processmaps/dfg_cluster4_R.pdf",
+           file_type = "pdf",
+           title = "DFG Cluster 4")
+