From 6cfc19a8746bd11dea5a13360d98954b414b812b Mon Sep 17 00:00:00 2001
From: nwickel
Date: Wed, 6 Mar 2024 17:59:22 +0100
Subject: [PATCH] Clean up scripts; improve data export

---
 code/01_preprocessing.R         |   2 +-
 code/03_create-petrinet.py      |  24 ++++++-
 code/04_conformance-checking.py |  27 +++++++-
 code/05_check-traces.R          |  41 +++++++++++
 code/06_infos-items.py          |  25 ++++---
 code/07_item-clustering.R       | 116 ++++++++++----------------------
 code/08_infos-clusters.py       |  28 +++++---
 7 files changed, 161 insertions(+), 102 deletions(-)

diff --git a/code/01_preprocessing.R b/code/01_preprocessing.R
index 7d2d4a3..6bc3d85 100644
--- a/code/01_preprocessing.R
+++ b/code/01_preprocessing.R
@@ -11,7 +11,7 @@
 # output: raw_logfiles_.csv
 #         event_logfiles_.csv
 #
-# last mod: 2024-01-18, NW
+# last mod: 2024-02-23, NW
 #
 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")

diff --git a/code/03_create-petrinet.py b/code/03_create-petrinet.py
index d9b416b..c83bbfa 100644
--- a/code/03_create-petrinet.py
+++ b/code/03_create-petrinet.py
@@ -1,7 +1,25 @@
+# 03_create-petrinet.py
+#
+# content: (1) Create places and transitions
+#          (2) Sequential net
+#          (3) Concurrent net
+#
+# input:  --
+# output: results/haum/conformative_petrinet_con.pnml
+#         results/processmaps/conformative_petrinet_con.png
+#         results/processmaps/conformative_bpmn_con.png
+#         results/haum/conformative_petrinet_seq.pnml
+#         results/processmaps/conformative_petrinet_seq.png
+#         results/processmaps/conformative_bpmn_seq.png
+#
+# last mod: 2024-03-06
+
 import pm4py
 from pm4py.objects.petri_net.obj import PetriNet, Marking
 from pm4py.objects.petri_net.utils import petri_utils
 
+#--------------- (1) Create places and transitions ---------------
+
 # Create places
 source = PetriNet.Place("source")
 sink = PetriNet.Place("sink")
@@ -44,7 +62,8 @@ t_16 = PetriNet.Transition("t_16")
 t_17 = PetriNet.Transition("t_17")
 t_18 = PetriNet.Transition("t_18")
 
-## Sequential net
+#--------------- (2) Sequential net ---------------
+
 net_seq = PetriNet("new_petri_net")
 
 # Add places
@@ -149,7 +168,8 @@ pm4py.view_bpmn(bpmn)
 pm4py.vis.save_vis_bpmn(bpmn, "results/processmaps/conformative_bpmn_seq.png")
 
 
-## Concurrent net
+#--------------- (3) Concurrent net ---------------
+
 net_con = PetriNet("new_petri_net")
 
 # Add places
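Aside for reviewers: 03_create-petrinet.py builds the conformative nets by hand from places, transitions, and arcs. A minimal sketch of that same pm4py construction pattern, shrunk to a two-transition sequential net; the names (t_flipCard, t_move, mini_seq_net.pnml) are illustrative and not part of the repository:

import pm4py
from pm4py.objects.petri_net.obj import PetriNet, Marking
from pm4py.objects.petri_net.utils import petri_utils

net = PetriNet("mini_seq_net")

# Places: source -> p_1 -> sink
source = PetriNet.Place("source")
p_1 = PetriNet.Place("p_1")
sink = PetriNet.Place("sink")
for p in (source, p_1, sink):
    net.places.add(p)

# Transitions; the label is what replay matches against event names
t_flipCard = PetriNet.Transition("t_flipCard", label="flipCard")
t_move = PetriNet.Transition("t_move", label="move")
for t in (t_flipCard, t_move):
    net.transitions.add(t)

# Arcs wire the net into source -> flipCard -> p_1 -> move -> sink
petri_utils.add_arc_from_to(source, t_flipCard, net)
petri_utils.add_arc_from_to(t_flipCard, p_1, net)
petri_utils.add_arc_from_to(p_1, t_move, net)
petri_utils.add_arc_from_to(t_move, sink, net)

# Initial marking: one token in source; final marking: one token in sink
im = Marking({source: 1})
fm = Marking({sink: 1})

pm4py.write_pnml(net, im, fm, "mini_seq_net.pnml")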
diff --git a/code/04_conformance-checking.py b/code/04_conformance-checking.py
index 9ffff09..74ca403 100644
--- a/code/04_conformance-checking.py
+++ b/code/04_conformance-checking.py
@@ -1,9 +1,33 @@
+# 04_conformance-checking.py
+#
+# content: (1) Load data and create event log
+#          (2) Evaluate different miners
+#
+# input:  results/haum/event_logfiles_2024-02-21_16-07-33.csv
+#         results/haum/conformative_petrinet_con.pnml
+# output: results/processmaps/dfg_complete_python.png
+#         results/eval_all-miners_complete.csv
+#         results/eval_all-miners_clean.csv
+#         results/processmaps/petrinet_conformative.png
+#         results/processmaps/petrinet_heuristics_clean.png
+#         results/processmaps/petrinet_alpha_clean.png
+#         results/processmaps/petrinet_inductive_clean.png
+#         results/processmaps/petrinet_ilp_clean.png
+#         results/processmaps/bpmn_conformative.png
+#         results/processmaps/bpmn_inductive_clean.png
+#         results/processmaps/bpmn_ilp_clean.png
+#         results/processmaps/bpmn_alpha_clean.png
+#         results/processmaps/bpmn_heuristics_clean.png
+#
+# last mod: 2024-03-06
+
 import pm4py
 import pandas as pd
 import numpy as np
+
 from python_helpers import eval_pm, pn_infos_miner
 
-###### Load data and create event logs ######
+#--------------- (1) Load data and create event logs ---------------
 
 dat = pd.read_csv("results/haum/event_logfiles_2024-02-21_16-07-33.csv", sep = ";")
 
@@ -129,3 +153,4 @@ a_bpmn = pm4py.convert.convert_to_bpmn(a_net, a_im, a_fm)
 pm4py.vis.save_vis_bpmn(a_bpmn, "results/processmaps/bpmn_alpha_clean.png")
 h_bpmn = pm4py.convert.convert_to_bpmn(h_net, h_im, h_fm)
 pm4py.vis.save_vis_bpmn(h_bpmn, "results/processmaps/bpmn_heuristics_clean.png")
+
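The header above lists the conformative net as an input, so the core of this script is scoring discovered and hand-built models against the log. A compact sketch of that scoring step with pm4py token-based replay, assuming the file names from the header; this is a sketch alongside the patch, not the repository's eval_pm helper:

import pm4py
import pandas as pd

dat = pd.read_csv("results/haum/event_logfiles_2024-02-21_16-07-33.csv", sep=";")
log = pm4py.format_dataframe(dat, case_id="path", activity_key="event",
                             timestamp_key="date.start")

# Load the hand-built conformative Petri net exported by 03_create-petrinet.py
net, im, fm = pm4py.read_pnml("results/haum/conformative_petrinet_con.pnml")

# Token-based replay: fitness near 1 means the log replays well on the model
fitness = pm4py.fitness_token_based_replay(log, net, im, fm)
precision = pm4py.precision_token_based_replay(log, net, im, fm)
print(fitness["average_trace_fitness"], precision)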
diff --git a/code/05_check-traces.R b/code/05_check-traces.R
index b9431ca..1f77f2a 100644
--- a/code/05_check-traces.R
+++ b/code/05_check-traces.R
@@ -1,3 +1,16 @@
+# 05_check-traces.R
+#
+# content: (1) Look at broken trace
+#          (2) Function to find broken traces
+#          (3) Export data frame for analyses
+#
+# input:  results/haum/event_logfiles_2024-02-21_16-07-33.csv
+#         results/haum/raw_logfiles_2024-02-21_16-07-33.csv
+# output: results/haum/eventlogs_pre-corona_cleaned.RData
+#         results/haum/eventlogs_pre-corona_cleaned.csv
+#
+# last mod: 2024-03-06
+
 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
 
 #--------------- (1) Look at broken trace ---------------
@@ -49,3 +62,31 @@ check <- check_traces(tmp)
 check[check$check, ]
 
 
+#--------------- (3) Export data frame for analyses ---------------
+
+datlogs$event <- factor(datlogs$event, levels = c("move", "flipCard",
+                                                  "openTopic",
+                                                  "openPopup"))
+datlogs$topic <- factor(datlogs$topic)
+
+datlogs$weekdays <- factor(weekdays(datlogs$date.start),
+                           levels = c("Montag", "Dienstag", "Mittwoch",
+                                      "Donnerstag", "Freitag", "Samstag",
+                                      "Sonntag"),
+                           labels = c("Monday", "Tuesday", "Wednesday",
+                                      "Thursday", "Friday", "Saturday",
+                                      "Sunday"))
+
+# Select data pre Corona
+dat <- datlogs[as.Date(datlogs$date.start) < "2020-03-13", ]
+# Remove corrupt trace
+dat <- dat[dat$path != 106098, ]
+
+save(dat, file = "results/haum/eventlogs_pre-corona_cleaned.RData")
+
+write.table(dat,
+            file = "results/haum/eventlogs_pre-corona_cleaned.csv",
+            sep = ";",
+            quote = FALSE,
+            row.names = FALSE)
+
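The export above is what the downstream Python scripts now consume. For reference, a sketch of the same pre-Corona filtering in pandas, mirroring the filter lines this patch removes from 06_infos-items.py; not part of the patch itself:

import pandas as pd

dat = pd.read_csv("results/haum/event_logfiles_2024-02-21_16-07-33.csv", sep=";")

# Keep only data before the Corona closure (ISO date strings compare correctly)
dat = dat[dat["date.start"] < "2020-03-13"]
# Remove the corrupt trace identified in 05_check-traces.R
dat = dat[dat["path"] != 106098]

dat.to_csv("results/haum/eventlogs_pre-corona_cleaned.csv", sep=";", index=False)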
diff --git a/code/06_infos-items.py b/code/06_infos-items.py
index d43e217..9f6ec9b 100644
--- a/code/06_infos-items.py
+++ b/code/06_infos-items.py
@@ -1,28 +1,37 @@
+# 06_infos-items.py
+#
+# content: (1) Load data and create event log
+#          (2) Infos for items
+#
+# input:  results/haum/eventlogs_pre-corona_cleaned.csv
+# output: results/haum/pn_infos_items.csv
+#
+# last mod: 2024-03-06
+
 import pm4py
 import pandas as pd
 import numpy as np
 from python_helpers import eval_pm, pn_infos
 
-###### Load data and create event logs ######
+#--------------- (1) Load data and create event logs ---------------
 
-dat = pd.read_csv("results/haum/event_logfiles_2024-02-21_16-07-33.csv", sep = ";")
-dat = dat[dat["date.start"] < "2020-03-13"]
-# --> only pre corona (before artworks were updated)
-dat = dat[dat["path"] != 106098]
-# exclude broken trace
+dat = pd.read_csv("results/haum/eventlogs_pre-corona_cleaned.csv", sep = ";")
 
 log_path = pm4py.format_dataframe(dat, case_id = "path",
                                   activity_key = "event",
-        timestamp_key = "date.start")
+                                  timestamp_key = "date.start")
 
-###### Infos for items ######
+#--------------- (2) Infos for items ---------------
 
 eval = pd.DataFrame(columns = ["fitness", "precision", "generalizability",
                                "simplicity", "sound", "narcs", "ntrans",
                                "nplaces", "nvariants", "mostfreq"])
+
 for item in log_path.item.unique().tolist():
     eval = pd.concat([eval, pn_infos(log_path, "item", item)])
+
 eval = eval.sort_index()
 
 # Export
 eval.to_csv("results/haum/pn_infos_items.csv", sep = ";")
+
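pn_infos is a project helper that is not shown in this patch. A rough sketch of the kind of per-item metrics its output columns suggest, written with plain pm4py calls; the discovery algorithm and column subset here are assumptions for illustration only:

import pm4py
import pandas as pd

dat = pd.read_csv("results/haum/eventlogs_pre-corona_cleaned.csv", sep=";")
log = pm4py.format_dataframe(dat, case_id="path", activity_key="event",
                             timestamp_key="date.start")

rows = []
for item in log["item"].unique().tolist():
    sublog = log[log["item"] == item]
    # Discover one Petri net per item and record size and replay fitness
    net, im, fm = pm4py.discover_petri_net_inductive(sublog)
    fit = pm4py.fitness_token_based_replay(sublog, net, im, fm)
    rows.append({"item": item,
                 "fitness": fit["average_trace_fitness"],
                 "narcs": len(net.arcs),
                 "ntrans": len(net.transitions),
                 "nplaces": len(net.places)})

print(pd.DataFrame(rows).set_index("item").sort_index())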
diff --git a/code/07_item-clustering.R b/code/07_item-clustering.R
index a3651a8..563bfd6 100644
--- a/code/07_item-clustering.R
+++ b/code/07_item-clustering.R
@@ -7,12 +7,11 @@
 #          (2) Clustering
 #          (3) Visualization with pictures
 #
-# input:  results/haum/event_logfiles_2024-02-21_16-07-33.csv
+# input:  results/haum/eventlogs_pre-corona_cleaned.RData
 #         results/haum/pn_infos_items.csv
-# output: results/haum/event_logfiles_pre-corona_with-clusters.csv
+# output: results/haum/eventlogs_pre-corona_item-clusters.csv
 #
-# last mod: 2024-02-23
-
+# last mod: 2024-03-06
 
 # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
@@ -23,34 +22,16 @@ library(factoextra)
 
 #--------------- (1.1) Read log event data ---------------
 
-dat0 <- read.table("results/haum/event_logfiles_2024-02-21_16-07-33.csv",
-                   colClasses = c("character", "character", "POSIXct",
-                                  "POSIXct", "character", "integer",
-                                  "numeric", "character", "character",
-                                  rep("numeric", 3), "character",
-                                  "character", rep("numeric", 11),
-                                  "character", "character"),
-                   sep = ";", header = TRUE)
-dat0$event <- factor(dat0$event, levels = c("move", "flipCard", "openTopic",
-                                            "openPopup"))
-
-# TODO: Maybe look at this with complete data?
-
-# Select data pre Corona
-dat <- dat0[as.Date(dat0$date.start) < "2020-03-13", ]
-dat <- dat[dat$path != 106098, ]
+load("results/haum/eventlogs_pre-corona_cleaned.RData")
 
 #--------------- (1.2) Read infos for PM for items ---------------
 
 datitem <- read.table("results/haum/pn_infos_items.csv", header = TRUE,
                       sep = ";", row.names = 1)
-
 #--------------- (1.3) Extract additional infos for clustering ---------------
 
-dat_split <- split(dat, ~ path)
-
-time_minmax <- function(subdata) {
+time_minmax_ms <- function(subdata) {
   subdata$min_time <- min(subdata$timeMs.start)
   if (all(is.na(subdata$timeMs.stop))) {
     subdata$max_time <- NA
@@ -59,18 +40,18 @@
   }
   subdata
 }
+# TODO: Move to helper file
 
-dat_list <- pbapply::pblapply(dat_split, time_minmax)
+# Get average duration per path
+dat_split <- split(dat, ~ path)
+dat_list <- pbapply::pblapply(dat_split, time_minmax_ms)
 dat_minmax <- dplyr::bind_rows(dat_list)
 
 datpath <- aggregate(duration ~ item + path, dat, mean, na.action = NULL)
-
 datpath$min_time <- aggregate(min_time ~ path, dat_minmax, unique, na.action = NULL)$min_time
 datpath$max_time <- aggregate(max_time ~ path, dat_minmax, unique, na.action = NULL)$max_time
+datpath$duration <- datpath$max_time - datpath$min_time
 
-datpath$duration_path <- datpath$max_time - datpath$min_time
-
-# average duration per path
 datitem$duration <- aggregate(duration ~ item, datpath, mean)$duration
 datitem$distance <- aggregate(distance ~ item, dat, mean)$distance
 datitem$scaleSize <- aggregate(scaleSize ~ item, dat, mean)$scaleSize
@@ -89,66 +70,39 @@ df <- datitem[, c("precision", "generalizability", "nvariants",
                   "duration", "ncases", "nmoves", "nopenTopic",
                   "nopenPopup")] |> scale()
 
-mat <- dist(df)
+dist_mat <- dist(df)
 
-heatmap(as.matrix(mat))
+heatmap(as.matrix(dist_mat))
 
 # Choosing best linkage method
-h1 <- hclust(mat, method = "average")
-h2 <- hclust(mat, method = "complete")
-h3 <- hclust(mat, method = "ward.D")
-h4 <- hclust(mat, method = "ward.D2")
-h5 <- hclust(mat, method = "single")
+method <- c(average = "average", single = "single", complete = "complete",
+            ward = "ward")
 
-# Cophenetic Distances, for each linkage
-c1 <- cophenetic(h1)
-c2 <- cophenetic(h2)
-c3 <- cophenetic(h3)
-c4 <- cophenetic(h4)
-c5 <- cophenetic(h5)
-
-# Correlations
-cor(mat, c1)
-cor(mat, c2)
-cor(mat, c3)
-cor(mat, c4)
-cor(mat, c5)
-# https://en.wikipedia.org/wiki/Cophenetic_correlation
-# https://stats.stackexchange.com/questions/195446/choosing-the-right-linkage-method-for-hierarchical-clustering
+hcs <- lapply(method, function(x) cluster::agnes(dist_mat, method = x))
+acs <- sapply(hcs, function(x) x$ac)
 
 # Dendograms
-par(mfrow=c(3,2))
-plot(h1, main = "Average Linkage")
-plot(h2, main = "Complete Linkage")
-plot(h3, main = "Ward Linkage")
-plot(h4, main = "Ward 2 Linkage")
-plot(h5, main = "Single Linkage")
+par(mfrow=c(4,2))
+for (hc in hcs) plot(hc, main = "")
 
-
-hc <- h1
-# Note that ‘agnes(*, method="ward")’ corresponds to ‘hclust(*, "ward.D2")’
+hc <- hcs$ward
 
 k <- 4 # number of clusters
 
+mycols <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
+
 grp <- cutree(hc, k = k)
 datitem$grp <- grp
 
 fviz_dend(hc, k = k,
           cex = 0.5,
-          k_colors = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E",
-                       "#000000", "gold", "#434F4F"),
+          k_colors = mycols,
           #type = "phylogenic",
           rect = TRUE
 )
 
-plot(hc)
-rect.hclust(hc, k=8, border="red")
-rect.hclust(hc, k=7, border="blue")
-rect.hclust(hc, k=6, border="green")
-
 p <- fviz_cluster(list(data = df, cluster = grp),
-                  palette = c("#78004B", "#FF6900", "#3CB4DC", "#91C86E",
-                              "#000000", "#434F4F", "gold"),
+                  palette = mycols,
                   ellipse.type = "convex",
                   repel = TRUE,
                   show.clust.cent = FALSE, ggtheme = theme_bw())
@@ -156,14 +110,16 @@ p
 
 aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths,
           ncases, nmoves, nflipCard, nopenTopic, nopenPopup) ~ grp,
-          datitem, median)
+          datitem, mean)
+
+
+aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths,
+          ncases, nmoves, nflipCard, nopenTopic, nopenPopup) ~ grp,
+          datitem, max)
+
 
 # Something like a scree plot (??)
-plot(rev(seq_along(hc$height)), hc$height, type = "l")
-points(rev(seq_along(hc$height)), hc$height, pch = 16, cex = .5)
-
-
-
+plot(rev(hc$height), type = "b", pch = 16, cex = .5)
 
 datitem$item <- sprintf("%03d", as.numeric(gsub("item_([0-9]{3})", "\\1",
                                                 row.names(datitem))))
@@ -179,7 +135,7 @@ vioplot::vioplot(scaleSize ~ grp, res)
 vioplot::vioplot(rotationDegree ~ grp, res)
 
 write.table(res,
-            file = "results/haum/event_logfiles_pre-corona_with-clusters.csv",
+            file = "results/haum/eventlogs_pre-corona_item-clusters.csv",
             sep = ";",
             quote = FALSE,
             row.names = FALSE)
@@ -207,8 +163,6 @@ for (cluster in sort(unique(res$grp))) {
                    file_name = paste0("results/processmaps/dfg_cluster", cluster, "_R.pdf"),
                    file_type = "pdf",
                    title = paste("DFG Cluster", cluster))
-
-
 }
 
 #--------------- (3) Visualization with pictures ---------------
@@ -217,8 +171,6 @@ library(png)
 library(jpeg)
 library(grid)
 
-colors <- c("#78004B", "#FF6900", "#3CB4DC", "#91C86E")
-
 pdf("results/figures/clustering_artworks.pdf", height = 8, width = 8, pointsize = 10)
 #png("results/figures/clustering_artworks.png", units = "in", height = 8, width = 8, pointsize = 10, res = 300)
 
@@ -244,7 +196,7 @@ for (item in sprintf("%03d", as.numeric(rownames(p$data)))) {
   x <- p$data$x[sprintf("%03d", as.numeric(rownames(p$data))) == item]
   y <- p$data$y[sprintf("%03d", as.numeric(rownames(p$data))) == item]
 
   points(x, y,
-         col = colors[p$data$cluster[sprintf("%03d", as.numeric(rownames(p$data))) == item]],
+         col = mycols[p$data$cluster[sprintf("%03d", as.numeric(rownames(p$data))) == item]],
          cex = 9,
          pch = 15)
@@ -255,7 +207,7 @@ for (item in sprintf("%03d", as.numeric(rownames(p$data)))) {
            ytop = y + .2)
 }
 
-legend("topright", paste("Cluster", 1:k), col = colors, pch = 15, bty = "n")
+legend("topright", paste("Cluster", 1:k), col = mycols, pch = 15, bty = "n")
 
 dev.off()
 
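07_item-clustering.R drops the cophenetic-correlation comparison in favor of agnes' agglomerative coefficient. For readers more at home in Python, a sketch of the analogous linkage comparison with scipy, using cophenetic correlation as the selection criterion; the random matrix is a stand-in for the scaled item features, and all names are illustrative:

import numpy as np
from scipy.cluster.hierarchy import linkage, cophenet, fcluster
from scipy.spatial.distance import pdist

rng = np.random.default_rng(1)
X = rng.normal(size=(70, 10))   # stand-in for scale(df) in the R script
d = pdist(X)                    # condensed distances, like dist(df)

for method in ("average", "single", "complete", "ward"):
    Z = linkage(d, method=method)
    c, _ = cophenet(Z, d)       # cophenetic correlation for this linkage
    print(method, round(c, 3))

# Cut the ward tree into k = 4 groups, like cutree(hc, k = 4)
grp = fcluster(linkage(d, method="ward"), t=4, criterion="maxclust")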
"results/processmaps/petrinet_cluster" + str(cluster).zfill(3) + ".png") bpmn = pm4py.convert.convert_to_bpmn(subnet, subim, subfm) - pm4py.vis.save_vis_bpmn(bpmn, "results/processmaps/bpmn_cluster_" + str(cluster).zfill(3) + ".png") + pm4py.vis.save_vis_bpmn(bpmn, "results/processmaps/bpmn_cluster_" + + str(cluster).zfill(3) + ".png")