From e8aac63504f68784dad2814407e292a307cb9cc2 Mon Sep 17 00:00:00 2001 From: nwickel Date: Thu, 25 Jan 2024 17:21:18 +0100 Subject: [PATCH] Script dump after trying out a hundred things; needs serious cleaning --- code/04_clustering_haum.R | 2 +- code/04_modeling_haum.R | 356 +++++++++++------- code/check_broken_trace.R | 28 ++ code/item_clustering.R | 158 ++++++++ code/pm_conformance-checking.py | 299 ++++++--------- ...eate-petrinet.py => pm_create-petrinet.py} | 12 +- code/pm_infos-clusters.py | 38 ++ code/pm_infos-items.py | 54 +++ code/pm_navigation-behavior.py | 113 ++++-- code/python_helpers.py | 37 ++ code/trace-clustering.py | 133 +++++++ 11 files changed, 870 insertions(+), 360 deletions(-) create mode 100644 code/check_broken_trace.R create mode 100644 code/item_clustering.R rename code/{create-petrinet.py => pm_create-petrinet.py} (95%) create mode 100644 code/pm_infos-clusters.py create mode 100644 code/pm_infos-items.py create mode 100644 code/python_helpers.py diff --git a/code/04_clustering_haum.R b/code/04_clustering_haum.R index 7161c3d..0755d5e 100644 --- a/code/04_clustering_haum.R +++ b/code/04_clustering_haum.R @@ -54,7 +54,7 @@ mat1 <- dat[, c("year", "duration1", "topicNumber1", "distance1", "scaleSize1", paste0("A", unique(dat$artwork)), "flipCard", "move", "openTopic", "openPopup")] -library(cluster) # for hiereachical clustering +library(cluster) # for hierarchical clustering k1 <- kmeans(mat1, 2) dat$kcluster <- k1$cluster diff --git a/code/04_modeling_haum.R b/code/04_modeling_haum.R index 400f8e0..0aa798f 100644 --- a/code/04_modeling_haum.R +++ b/code/04_modeling_haum.R @@ -1,8 +1,10 @@ -# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code") +# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") + +library(bupaverse) # Read data -dat <- read.table("results/haum/event_logfiles_2024-01-02_19-44-50.csv", +dat0 <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv", colClasses = c("character", "character", "POSIXct", "POSIXct", "character", "integer", "numeric", "character", "character", @@ -11,10 +13,10 @@ dat <- read.table("results/haum/event_logfiles_2024-01-02_19-44-50.csv", "character", "character"), sep = ";", header = TRUE) -dat$event <- factor(dat$event, levels = c("move", "flipCard", "openTopic", +dat0$event <- factor(dat0$event, levels = c("move", "flipCard", "openTopic", "openPopup")) -dat$weekdays <- factor(weekdays(dat$date.start), +dat0$weekdays <- factor(weekdays(dat0$date.start), levels = c("Montag", "Dienstag", "Mittwoch", "Donnerstag", "Freitag", "Samstag", "Sonntag"), @@ -23,105 +25,160 @@ dat$weekdays <- factor(weekdays(dat$date.start), "Sunday")) # Select data pre Corona -dat <- dat[as.Date(dat$date.start) < "2020-03-13", ] -dat <- dat[dat["path"] != 81621, ] +dat <- dat0[as.Date(dat0$date.start) < "2020-03-13", ] +dat <- dat[dat$path != 106098, ] + + +table(table(dat$start)) table(dat$event) proportions(table(dat$event)) +dat_dur <- aggregate(duration ~ item, dat, mean) +barplot(duration - mean(dat_dur$duration) ~ item, dat_dur, col = "#434F4F", + las = 3) + + # Investigate paths (will separate items and give clusters of artworks!) 
length(unique(dat$path)) - -datpath <- aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ - path, dat, function(x) mean(x, na.rm = TRUE), na.action = NULL) - -datpath$length <- aggregate(item ~ path, dat, length)$item -datpath$nitems <- aggregate(item ~ path, dat, function(x) - length(unique(x)), na.action = NULL)$item -datpath$ntopics <- aggregate(topic ~ path, dat, - function(x) ifelse(all(is.na(x)), NA, length(unique(na.omit(x)))), - na.action = NULL)$topic - -datpath$vacation <- aggregate(vacation ~ path, dat, - function(x) ifelse(all(is.na(x)), 0, 1), - na.action = NULL)$vacation -datpath$holiday <- aggregate(holiday ~ path, dat, - function(x) ifelse(all(is.na(x)), 0, 1), - na.action = NULL)$holiday -datpath$weekend <- aggregate(weekdays ~ path, dat, - function(x) ifelse(any(x %in% c("Saturday", "Sunday")), 1, 0), - na.action = NULL)$weekdays -datpath$morning <- aggregate(date.start ~ path, dat, - function(x) ifelse(lubridate::hour(x[1]) > 13, 0, 1), - na.action = NULL)$date.start - - -# Investigate cases (= interactions per time intervall) -length(unique(dat$case)) - -datcase <- aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ - case, dat, function(x) mean(x, na.rm = TRUE), na.action = NULL) - -datcase$length <- aggregate(item ~ case, dat, length)$item -datcase$nitems <- aggregate(item ~ case, dat, function(x) - length(unique(x)), na.action = NULL)$item -datcase$ntopics <- aggregate(topic ~ case, dat, - function(x) ifelse(all(is.na(x)), NA, length(unique(na.omit(x)))), - na.action = NULL)$topic - -datcase$vacation <- aggregate(vacation ~ case, dat, - function(x) ifelse(all(is.na(x)), 0, 1), - na.action = NULL)$vacation -datcase$holiday <- aggregate(holiday ~ case, dat, - function(x) ifelse(all(is.na(x)), 0, 1), - na.action = NULL)$holiday -datcase$weekend <- aggregate(weekdays ~ case, dat, - function(x) ifelse(any(x %in% c("Saturday", "Sunday")), 1, 0), - na.action = NULL)$weekdays -datcase$morning <- aggregate(date.start ~ case, dat, - function(x) ifelse(lubridate::hour(x[1]) > 13, 0, 1), - na.action = NULL)$date.start - - - -# Paths with more than one case associated -tmp <- aggregate(case ~ path, dat, function(x) length(unique(x))) -sum(tmp$case > 1) -table(tmp$case) - -dat$date <- as.Date(dat$date.start) - -tmp <- aggregate(date ~ path, dat, function(x) length(unique(x))) -sum(tmp$date > 1) -table(tmp$date) -tmp[tmp$date > 1, ] - -for (p in tmp$path[tmp$date > 1]) { - print(dat[dat$path == p, 3:9]) - cat("\n\n") -} - - -dat[dat$date == "2017-02-28" & dat$item == "503", ] - - -# Creating event logs - -library(bupaverse) - -dat$start <- dat$date.start +# DFGs per Cluster +dat$start <- dat$date.start dat$complete <- dat$date.stop -table(table(dat$start)) -# --> hmm... 
- summary(aggregate(duration ~ path, dat, mean)) alog <- activitylog(dat, - case_id = "path", + case_id = "path", activity_id = "event", resource_id = "item", - timestamps = c("start", "complete")) + timestamps = c("start", "complete")) + +process_map(alog, + type_nodes = frequency("absolute"), + sec_nodes = frequency("relative"), + type_edges = frequency("absolute"), + sec_edges = frequency("relative"), + rankdir = "LR") + +### Separate for items + +datitem <- aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ + item, dat, function(x) mean(x, na.rm = TRUE), na.action = NULL) +datitem$npaths <- aggregate(path ~ item, dat, + function(x) length(unique(x)), + na.action = NULL)$path +datitem$ncases <- aggregate(case ~ item, dat, + function(x) length(unique(x)), + na.action = NULL)$case +datitem$ntopics <- aggregate(topic ~ item, dat, + function(x) ifelse(all(is.na(x)), NA, length(unique(na.omit(x)))), + na.action = NULL)$topic + + +set.seed(1211) + +nclusters <- 6 +k1 <- kmeans(datitem[, -1], nclusters) + +#colors <- c("#3CB4DC", "#78004B", "#91C86E", "#FF6900") + +colors <- palette.colors(palette = "Okabe-Ito") + +xy <- cmdscale(dist(datitem[, -1])) + +plot(xy, type = "n") +text(xy[,1], xy[,2], datitem$item, col = colors[k1$cluster]) +legend("topright", paste("Cluster", 1:nclusters), col = colors, lty = 1) + +## Scree plot + +ks <- 1:10 + +sse <- NULL +for (k in ks) sse <- c(sse, kmeans(datitem[, -1], k)$tot.withinss) + +plot(sse ~ ks, type = "l") + + +datitem$cluster <- k1$cluster + +datitem_agg <- aggregate(. ~ cluster, datitem[, -1], mean) + + +dat_cl <- merge(dat, datitem[, c("item", "cluster")], by = "item", all.x = TRUE) +dat_cl <- dat_cl[order(dat_cl$fileId.start, dat_cl$date.start, dat_cl$timeMs.start), ] + +write.table(dat_cl, "results/haum/event_logfiles_with-clusters_kmeans.csv", + sep = ";", row.names = FALSE) + +vioplot::vioplot(datitem$duration) + +vioplot::vioplot(duration ~ item, dat, las = 3) + +vioplot::vioplot(duration ~ cluster, dat_cl) +vioplot::vioplot(distance ~ cluster, dat_cl) +vioplot::vioplot(scaleSize ~ cluster, dat_cl) +vioplot::vioplot(rotationDegree ~ cluster, dat_cl) + + + +for (cluster in sort(unique(dat_cl$cluster))) { + + alog <- activitylog(dat_cl[dat_cl$cluster == cluster, ], + case_id = "path", + activity_id = "event", + resource_id = "item", + timestamps = c("start", "complete")) + + dfg <- process_map(alog, + type_nodes = frequency("relative"), + sec_nodes = frequency("absolute"), + type_edges = frequency("relative"), + sec_edges = frequency("absolute"), + rankdir = "LR", + render = FALSE) + export_map(dfg, + file_name = paste0("results/processmaps/dfg_cluster", cluster, "_R.pdf"), + file_type = "pdf", + title = paste("DFG Cluster", cluster)) + + +} + + +tmp <- dat[dat$event != "move", ] + +check_traces <- function(data) { + + datagg <- aggregate(event ~ path, data, + function(x) ifelse("openPopup" %in% x, T, F)) + paths <- datagg$path[datagg$event] + datcheck <- data[data$path %in% paths, c("path", "event")] + datcheck <- datcheck[!duplicated(datcheck), ] + datcheck <- datcheck[order(datcheck$path), ] + + retval <- NULL + for (path in unique(datcheck$path)) { + check <- !all(as.character(datcheck$event[datcheck$path == path]) == + c("flipCard", "openTopic", "openPopup")) + retval <- rbind(retval, data.frame(path, check)) + } + retval +} + +check <- check_traces(tmp) + +sum(check$check) + + + + + +alog <- activitylog(dat, + case_id = "case", + activity_id = "item", + resource_id = "path", + timestamps = c("start", "complete")) 
process_map(alog, type_nodes = frequency("absolute"), @@ -131,55 +188,74 @@ process_map(alog, rankdir = "LR") -alog2 <- activitylog(dat, - case_id = "case", - activity_id = "event", - resource_id = "item", - timestamps = c("start", "complete")) -process_map(alog2, - type_nodes = frequency("absolute"), - sec_nodes = frequency("relative"), - type_edges = frequency("absolute"), - sec_edges = frequency("relative"), + +datcase <- dat[!duplicated(dat[, c("case", "path", "item")]), + c("case", "path", "event", "item")] +datcase$duration <- aggregate(duration ~ path, dat, + function(x) mean(x, na.rm = TRUE), na.action = NULL)$duration +datcase$distance <- aggregate(distance ~ path, dat, + function(x) mean(x, na.rm = TRUE), na.action = NULL)$distance +datcase$scaleSize <- aggregate(scaleSize ~ path, dat, + function(x) mean(x, na.rm = TRUE), na.action = NULL)$scaleSize +datcase$rotationDegree <- aggregate(rotationDegree ~ path, dat, + function(x) mean(x, na.rm = TRUE), na.action = NULL)$rotationDegree +# datcase$ntopics <- aggregate(topic ~ path, dat, +# function(x) ifelse(all(is.na(x)), NA, length(unique(na.omit(x)))), +# na.action = NULL)$topic +datcase$move <- ifelse(datcase$event == "move", 1, 0) +# paths that start with move + +for (item in sort(unique(datcase$item))) { + datcase[paste0("item_", item)] <- ifelse(datcase$item == item, 1, 0) +} + +mat <- na.omit(datcase[, -c(1:4)]) + + +set.seed(1610) + +nclusters <- 6 +k1 <- kmeans(mat, nclusters) + +#colors <- c("#3CB4DC", "#78004B", "#91C86E", "#FF6900") + +colors <- palette.colors(palette = "Okabe-Ito")[1:nclusters] + +library(distances) +mat_dist <- distances(mat) + +xy <- cmdscale(mat_dist) + +plot(xy, type = "n") +text(xy[,1], xy[,2], datcase$path, col = colors[k1$cluster]) +legend("topright", paste("Cluster", 1:nclusters), col = colors, lty = 1) + +## Scree plot + +ks <- 1:10 + +sse <- NULL +for (k in ks) sse <- c(sse, kmeans(datitem[, -1], k)$tot.withinss) + +plot(sse ~ ks, type = "l") + + + + + + + + +alog <- activitylog(datcase, + case_id = "case", + activity_id = "item", + resource_id = "path", + timestamps = c("start", "complete")) + +process_map(alog, + type_nodes = frequency("relative"), + sec_nodes = frequency("absolute"), + type_edges = frequency("relative"), + sec_edges = frequency("absolute"), rankdir = "LR") - - -library(processanimateR) - -animate_process(to_eventlog(alog)) - -col_vector <- c("#7FC97F", "#BEAED4", "#FDC086", "#FFFF99", "#386CB0", - "#F0027F", "#BF5B17", "#666666", "#1B9E77", "#D95F02", - "#7570B3", "#E7298A", "#66A61E", "#E6AB02", "#A6761D", - "#666666", "#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", - "#FB9A99", "#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", - "#6A3D9A", "#FFFF99", "#B15928", "#FBB4AE", "#B3CDE3", - "#CCEBC5", "#DECBE4", "#FED9A6", "#FFFFCC", "#E5D8BD", - "#FDDAEC", "#F2F2F2", "#B3E2CD", "#FDCDAC", "#CBD5E8", - "#F4CAE4", "#E6F5C9", "#FFF2AE", "#F1E2CC", "#CCCCCC", - "#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", - "#FFFF33", "#A65628", "#F781BF", "#999999", "#66C2A5", - "#FC8D62", "#8DA0CB", "#E78AC3", "#A6D854", "#FFD92F", - "#E5C494", "#B3B3B3", "#8DD3C7", "#FFFFB3", "#BEBADA", - "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", - "#D9D9D9") - -animate_process(to_eventlog(alog), mode = "relative", jitter = 10, legend = "color", - mapping = token_aes(color = token_scale("artwork", - scale = "ordinal", - range = col_vector))) - -elog <- to_eventlog(alog) -animate_process(elog[elog$artwork == "054", ]) -animate_process(elog[elog$artwork == "080", ]) -animate_process(elog[elog$artwork 
== "501", ]) - -process_map(alog[alog$artwork == "054", ]) - -animate_process(elog[elog$artwork %in% c("080", "054"), ], - mode = "relative", jitter = 10, legend = "color", - mapping = token_aes(color = token_scale("artwork", - scale = "ordinal", - range = c("black", "gray")))) - diff --git a/code/check_broken_trace.R b/code/check_broken_trace.R new file mode 100644 index 0000000..edc6d0e --- /dev/null +++ b/code/check_broken_trace.R @@ -0,0 +1,28 @@ +# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") + +datraw <- read.table("results/haum/raw_logfiles_2024-01-18_09-58-52.csv", + header = TRUE, sep = ";") + + +# Read data + +datlogs <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv", + colClasses = c("character", "character", "POSIXct", + "POSIXct", "character", "integer", + "numeric", "character", "character", + rep("numeric", 3), "character", + "character", rep("numeric", 11), + "character", "character"), + sep = ";", header = TRUE) + +datlogs <- datlogs[order(datlogs$fileId.start, datlogs$date.start, datlogs$timeMs.start), ] + +artwork <- "176" +fileId <- c('2017_06_16-13_49_00.log', '2017_06_16-13_59_00.log') +path <- 106098 + +datraw[datraw$item == artwork & datraw$fileId %in% fileId, ] + +datlogs[datlogs$path == path, ] + + diff --git a/code/item_clustering.R b/code/item_clustering.R new file mode 100644 index 0000000..54b0438 --- /dev/null +++ b/code/item_clustering.R @@ -0,0 +1,158 @@ +# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") + +#--------------- (1) Read data --------------- + +#--------------- (1.1) Read log event data --------------- + +dat0 <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv", + colClasses = c("character", "character", "POSIXct", + "POSIXct", "character", "integer", + "numeric", "character", "character", + rep("numeric", 3), "character", + "character", rep("numeric", 11), + "character", "character"), + sep = ";", header = TRUE) +dat0$event <- factor(dat0$event, levels = c("move", "flipCard", "openTopic", + "openPopup")) + +# Select data pre Corona +dat <- dat0[as.Date(dat0$date.start) < "2020-03-13", ] +dat <- dat[dat$path != 106098, ] + +#--------------- (1.2) Read infos for PM for items --------------- + +datitem <- read.table("results/haum/pn_infos_items.csv", header = TRUE, + sep = ";", row.names = 1) + + +#--------------- (1.3) Extract additional infos for clustering --------------- + +datitem$duration <- aggregate(duration ~ item, dat, mean)$duration +datitem$distance <- aggregate(distance ~ item, dat, mean)$distance +datitem$scaleSize <- aggregate(scaleSize ~ item, dat, mean)$scaleSize +datitem$rotationDegree <- aggregate(rotationDegree ~ item, dat, mean)$rotationDegree +datitem$npaths <- aggregate(path ~ item, dat, function(x) length(unique(x)))$path +datitem$ncases <- aggregate(case ~ item, dat, function(x) length(unique(x)))$case +datitem$ntopics <- aggregate(topic ~ item, dat, function(x) length(unique(x)))$topic +datitem$mostfreq_num <- as.numeric(gsub(".*: (.*)}", "\\1", datitem$mostfreq)) + +#--------------- (2) Clustering --------------- + +df <- datitem[, c("precision", "generalizability", "nvariants", + "mostfreq_num", "duration", "distance", "scaleSize", + "rotationDegree", "npaths", "ncases", "ntopics")] |> + scale() +mat <- dist(df) + +hc <- hclust(mat, method = "ward.D2") + +library(factoextra) +fviz_dend(hc, cex = 0.5) + +datitem$grp <- cutree(hc, k = 6) + +fviz_dend(hc, k = 6, + cex = 0.5, + k_colors = 
c("#78004B", "#000000", "#3CB4DC", "#91C86E", + "#FF6900", "#434F4F"), + #type = "phylogenic", + rect = TRUE +) + +p <- fviz_cluster(list(data = df, cluster = grp), + palette = c("#78004B", "#000000", "#3CB4DC", "#91C86E", + "#FF6900", "#434F4F"), + ellipse.type = "convex", + repel = TRUE, + show.clust.cent = FALSE, ggtheme = theme_bw()) +p + +aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths, + ncases, ntopics) ~ grp, datitem, mean) + +datitem$item <- gsub("item_([0-9]{3})", "\\1", row.names(datitem)) + +res <- merge(dat, datitem[, c("item", "grp")], by = "item", all.x = TRUE) +res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ] + +write.table(res, + file = "results/haum/event_logfiles_pre-corona_with-clusters.csv", + sep = ";", + quote = FALSE, + row.names = FALSE) + +library(bupaverse) + +res$start <- res$date.start +res$complete <- res$date.stop + +for (cluster in sort(unique(res$grp))) { + + alog <- activitylog(res[res$grp == cluster, ], + case_id = "path", + activity_id = "event", + resource_id = "item", + timestamps = c("start", "complete")) + + dfg <- process_map(alog, + type_nodes = frequency("relative"), + sec_nodes = frequency("absolute"), + type_edges = frequency("relative"), + sec_edges = frequency("absolute"), + rankdir = "LR", + render = FALSE) + export_map(dfg, + file_name = paste0("results/processmaps/dfg_cluster", cluster, "_R.pdf"), + file_type = "pdf", + title = paste("DFG Cluster", cluster)) + + +} + +#--------------- (3) Visualization with pictures --------------- + +library(png) +library(jpeg) +library(grid) + +colors <- c("#78004B", "#000000", "#3CB4DC", "#91C86E", "#FF6900", + "#434F4F") + +#pdf("results/haum/figures/clustering_artworks.pdf", height = 8, width = 8, pointsize = 10) +png("results/haum/figures/clustering_artworks.png", units = "in", height = 8, width = 8, pointsize = 10, res = 300) + +par(mai = c(.6,.6,.1,.1), mgp = c(2.4, 1, 0)) + +plot(y ~ x, p$data, type = "n", ylim = c(-3.2, 3), xlim = c(-4.7, 6.4)) + +for (item in sprintf("%03d", as.numeric(rownames(p$data)))) { + + if (item == "125") { + + pic <- readJPEG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/", + item, "/", item, ".jpg")) + } else { + pic <- readPNG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/", + item, "/", item, ".png")) + } + + img <- as.raster(pic[,,1:3]) + + x <- p$data$x[sprintf("%03d", as.numeric(rownames(p$data))) == item] + y <- p$data$y[sprintf("%03d", as.numeric(rownames(p$data))) == item] + + points(x, y, + col = colors[p$data$cluster[sprintf("%03d", as.numeric(rownames(p$data))) == item]], + cex = 9, + pch = 15) + + rasterImage(img, + xleft = x - .4, + xright = x + .4, + ybottom = y - .2, + ytop = y + .2) + +} + +dev.off() + diff --git a/code/pm_conformance-checking.py b/code/pm_conformance-checking.py index bc37867..ac677e2 100644 --- a/code/pm_conformance-checking.py +++ b/code/pm_conformance-checking.py @@ -1,4 +1,3 @@ -#%% # needed for shortcuts to run properly in VSCode *eyeroll* %reset import pm4py @@ -9,13 +8,12 @@ import matplotlib.pyplot as plt ###### Load data and create event logs ###### -dat = pd.read_csv("results/haum/event_logfiles_2024-01-02_19-44-50.csv", sep = ";") +dat = pd.read_csv("results/haum/event_logfiles_2024-01-18_09-58-52.csv", sep = ";") dat = dat[dat["date.start"] < "2020-03-13"] # --> only pre corona (before artworks were updated) event_log = pm4py.format_dataframe(dat, case_id='path', activity_key='event', timestamp_key='date.start') -event_log = 
event_log.rename(columns={'item': 'case:item'}) ###### Descrptives of log data ###### @@ -39,30 +37,17 @@ len(variants_no_move) sorted_variants_no_move = dict(sorted(variants_no_move.items(), key=lambda item: item[1], reverse = True)) {k: sorted_variants_no_move[k] for k in list(sorted_variants_no_move)[:20]} -# Path length -event_log.path.value_counts() -event_log.path.value_counts().mean() -event_log.path.value_counts().median() -event_log.path.value_counts().min() -event_log.path.value_counts().max() - -plt.hist(event_log.path.value_counts(), bins=200) -plt.show() - -# TODO: Do it again in R -- much smoother and more info, better plots - ###### Read "conformative" Petri Net ###### basenet, initial_marking, final_marking = pm4py.read_pnml("results/conformative_petrinet_con.pnml") def eval_pm(data, net, initial_marking, final_marking): """Caculate fitness, precision, generalizability, and simplicity for petri net""" - fitness = pm4py.fitness_token_based_replay(data, net, initial_marking, final_marking) - precisison = pm4py.precision_token_based_replay(data, net, - initial_marking, final_marking) + fitness = pm4py.fitness_token_based_replay(data, net, initial_marking, final_marking) + precisison = pm4py.precision_token_based_replay(data, net, initial_marking, final_marking) generalizability = pm4py.algo.evaluation.generalization.algorithm.apply(data, net, - initial_marking, final_marking) - simplicity = pm4py.algo.evaluation.simplicity.algorithm.apply(net) + initial_marking, final_marking) + simplicity = pm4py.algo.evaluation.simplicity.algorithm.apply(net) return [fitness['average_trace_fitness'], precisison, generalizability, simplicity] baseline_eval = eval_pm(event_log, basenet, initial_marking, final_marking) @@ -80,75 +65,39 @@ for i in range(len(replayed_traces)): l3.append(replayed_traces[i]["reached_marking"]) l4.append(replayed_traces[i]["transitions_with_problems"]) -np.mean(l1) set(l1) -index_broken = l1.index(1) -np.mean(l2) -set(l2) -l2.index(1) +x1 = np.array(l1) +index_broken = np.where(x1 == 1)[0].tolist() + set(l3) l4.count([]) -l3[index_broken] -l4[index_broken] +[l3[i] for i in index_broken] +[l4[i] for i in index_broken] -replayed_traces[index_broken] +broken_traces = [replayed_traces[i] for i in index_broken] event_log[event_log['@@case_index'] == index_broken].event -event_log[event_log['@@case_index'] == index_broken].path -event_log[event_log['@@case_index'] == index_broken].item -event_log[event_log['@@case_index'] == index_broken]["fileId.start"] -# --> logging error in file! 
+event_log[event_log['@@case_index'] == index_broken].path.unique().tolist() +event_log[event_log['@@case_index'] == index_broken].item.unique().tolist() +event_log[event_log['@@case_index'] == index_broken]["fileId.start"].unique().tolist() +# --> logging error in raw file -from pm4py.algo.conformance.tokenreplay import algorithm as token_based_replay -parameters_tbr = {token_based_replay.Variants.TOKEN_REPLAY.value.Parameters.DISABLE_VARIANTS: True, token_based_replay.Variants.TOKEN_REPLAY.value.Parameters.ENABLE_PLTR_FITNESS: True} -replayed_traces, place_fitness, trans_fitness, unwanted_activities = token_based_replay.apply(event_log, basenet, - initial_marking, - final_marking, - parameters=parameters_tbr) - -from pm4py.algo.conformance.tokenreplay.diagnostics import duration_diagnostics -trans_diagnostics = duration_diagnostics.diagnose_from_trans_fitness(event_log, trans_fitness) -for trans in trans_diagnostics: - print(trans, trans_diagnostics[trans]) # Footprints from pm4py.algo.discovery.footprints import algorithm as footprints_discovery -fp_log = footprints_discovery.apply(event_log, variant=footprints_discovery.Variants.ENTIRE_EVENT_LOG) - -fp_trace_by_trace = footprints_discovery.apply(event_log, variant=footprints_discovery.Variants.TRACE_BY_TRACE) - -fp_net = footprints_discovery.apply(basenet, initial_marking, final_marking) - from pm4py.visualization.footprints import visualizer as fp_visualizer +fp_log = footprints_discovery.apply(event_log, variant=footprints_discovery.Variants.ENTIRE_EVENT_LOG) +fp_net = footprints_discovery.apply(basenet, initial_marking, final_marking) gviz = fp_visualizer.apply(fp_net, parameters={fp_visualizer.Variants.SINGLE.value.Parameters.FORMAT: "svg"}) fp_visualizer.view(gviz) -gviz = fp_visualizer.apply(fp_log, fp_net, parameters={fp_visualizer.Variants.COMPARISON.value.Parameters.FORMAT: "svg"}) -fp_visualizer.view(gviz) - -conf_fp = pm4py.conformance_diagnostics_footprints(fp_trace_by_trace, fp_net) - -from pm4py.algo.conformance.footprints import algorithm as fp_conformance -conf_result = fp_conformance.apply(fp_log, fp_net, variant=fp_conformance.Variants.LOG_EXTENSIVE) - -from pm4py.algo.conformance.footprints.util import evaluation -fitness = evaluation.fp_fitness(fp_log, fp_net, conf_result) -precision = evaluation.fp_precision(fp_log, fp_net) - -# Skeleton -from pm4py.algo.discovery.log_skeleton import algorithm as lsk_discovery -skeleton = lsk_discovery.apply(event_log, parameters={lsk_discovery.Variants.CLASSIC.value.Parameters.NOISE_THRESHOLD: 0.0}) - -from pm4py.algo.conformance.log_skeleton import algorithm as lsk_conformance -conf_result = lsk_conformance.apply(event_log, skeleton) - pm4py.vis.view_petri_net(basenet, initial_marking, final_marking) is_sound = pm4py.check_soundness(basenet, initial_marking, final_marking) -is_sound[0] -len(basenet.arcs) -len(basenet.transitions) -len(basenet.places) +baseline_eval.append(is_sound[0]) +baseline_eval.append(len(basenet.arcs)) +baseline_eval.append(len(basenet.transitions)) +baseline_eval.append(len(basenet.places)) efg_graph = pm4py.discover_eventually_follows_graph(event_log) @@ -157,163 +106,135 @@ dfg, start_activities, end_activities = pm4py.discover_dfg(event_log) pm4py.view_dfg(dfg, start_activities, end_activities) pm4py.save_vis_dfg(dfg, start_activities, end_activities, '../figures/processmaps/dfg_complete.png') -## Heuristics Miner +## Fitting different miners +### Heuristics Miner h_net, im, fm = pm4py.discover_petri_net_heuristics(event_log) 
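+
+# Sketch (not part of the original script): the soundness and net-size
+# bookkeeping that is repeated for every miner below could be collected in
+# one small helper. It only assumes the eval_pm() function defined above
+# and the pm4py calls already used in this file.
+def describe_net(log, net, im, fm):
+    """Return eval_pm metrics plus soundness, #arcs, #transitions, #places."""
+    res = eval_pm(log, net, im, fm)
+    res.append(pm4py.check_soundness(net, im, fm)[0])
+    res.extend([len(net.arcs), len(net.transitions), len(net.places)])
+    return res
+
+# e.g. h_eval = describe_net(event_log, h_net, im, fm)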
-pm4py.vis.view_petri_net(h_net, im, fm) -pm4py.vis.save_vis_petri_net(h_net, im, fm, "../figures/processmaps/petrinet_heuristics_complete.png") h_eval = eval_pm(event_log, h_net, im, fm) - is_sound = pm4py.check_soundness(h_net, im, fm) -is_sound[0] - -len(h_net.arcs) -len(h_net.transitions) -len(h_net.places) - - -# decorated petri net -from pm4py.visualization.petri_net import visualizer as pn_visualizer -parameters = {pn_visualizer.Variants.FREQUENCY.value.Parameters.FORMAT: "png"} -gviz = pn_visualizer.apply(h_net, im, fm, parameters=parameters, variant=pn_visualizer.Variants.FREQUENCY, log=event_log) -pn_visualizer.save(gviz, "../figures/processmaps/petrinet_heuristics_complete_decorated.png") - -# convert to BPMN -bpmn = pm4py.convert.convert_to_bpmn(h_net, im, fm) -pm4py.vis.view_bpmn(bpmn) +h_eval.append(is_sound[0]) +h_eval.append(len(h_net.arcs)) +h_eval.append(len(h_net.transitions)) +h_eval.append(len(h_net.places)) ## Alpha Miner a_net, im, fm = pm4py.discover_petri_net_alpha(event_log) -pm4py.vis.view_petri_net(a_net, im, fm) -pm4py.vis.save_vis_petri_net(a_net, im, fm, "../figures/processmaps/petrinet_alpha_complete.png") a_eval = eval_pm(event_log, a_net, im, fm) - is_sound = pm4py.check_soundness(a_net, im, fm) -is_sound[0] - -len(a_net.arcs) -len(a_net.transitions) -len(a_net.places) +a_eval.append(is_sound[0]) +a_eval.append(len(a_net.arcs)) +a_eval.append(len(a_net.transitions)) +a_eval.append(len(a_net.places)) ## Inductive Miner i_net, im, fm = pm4py.discover_petri_net_inductive(event_log) -pm4py.vis.view_petri_net(i_net, im, fm) -pm4py.vis.save_vis_petri_net(i_net, im, fm, "../figures/processmaps/petrinet_induction_complete.png") i_eval = eval_pm(event_log, i_net, im, fm) - -# as process tree (does not work for heuristics miner!) -pt = pm4py.discover_process_tree_inductive(event_log) -pm4py.vis.view_process_tree(pt) - is_sound = pm4py.check_soundness(i_net, im, fm) -is_sound[0] - -# TODO: Can I show that this simpler net does not include all traces? (Probably not, -# since fitness is 1, but WHY?) 
- -len(i_net.arcs) -len(i_net.transitions) -len(i_net.places) - -bpmn = pm4py.convert.convert_to_bpmn(i_net, im, fm) -pm4py.view_bpmn(bpmn) - -from pm4py.algo.conformance.tokenreplay import algorithm as token_based_replay -parameters_tbr = {token_based_replay.Variants.TOKEN_REPLAY.value.Parameters.DISABLE_VARIANTS: True, token_based_replay.Variants.TOKEN_REPLAY.value.Parameters.ENABLE_PLTR_FITNESS: True} -replayed_traces, place_fitness, trans_fitness, unwanted_activities = token_based_replay.apply(event_log, i_net, - im, - fm, - parameters=parameters_tbr) - -l1 = list() -l2 = list() -l3 = list() -l4 = list() -for i in range(len(replayed_traces)): - l1.append(replayed_traces[i]["remaining_tokens"]) - l2.append(replayed_traces[i]["missing_tokens"]) - l3.append(replayed_traces[i]["reached_marking"]) - l4.append(replayed_traces[i]["transitions_with_problems"]) - -np.mean(l1) -set(l1) -index_broken = l1.index(1) -np.mean(l2) -set(l2) -l2.index(1) -set(l3) -l4.count([]) - -l3[index_broken] -l4[index_broken] - -replayed_traces[index_broken] - -event_log[event_log['@@case_index'] == index_broken].event -event_log[event_log['@@case_index'] == index_broken].path -event_log[event_log['@@case_index'] == index_broken].item -event_log[event_log['@@case_index'] == index_broken]["fileId.start"] +i_eval.append(is_sound[0]) +i_eval.append(len(i_net.arcs)) +i_eval.append(len(i_net.transitions)) +i_eval.append(len(i_net.places)) ## ILP Miner ilp_net, im, fm = pm4py.discover_petri_net_ilp(event_log) -pm4py.vis.view_petri_net(ilp_net, im, fm) -pm4py.vis.save_vis_petri_net(ilp_net, im, fm, "../figures/processmaps/petrinet_ilp_complete.png") ilp_eval = eval_pm(event_log, ilp_net, im, fm) - is_sound = pm4py.check_soundness(ilp_net, im, fm) -is_sound[0] - -len(ilp_net.arcs) -len(ilp_net.transitions) -len(ilp_net.places) +ilp_eval.append(is_sound[0]) +ilp_eval.append(len(ilp_net.arcs)) +ilp_eval.append(len(ilp_net.transitions)) +ilp_eval.append(len(ilp_net.places)) ## Export for all miners eval = pd.DataFrame(np.row_stack([baseline_eval, h_eval, a_eval, i_eval, ilp_eval])) -eval.columns = ["fitness", "precision", "generalizability", "simplicity"] +eval.columns = ["fitness", "precision", "generalizability", "simplicity", + "sound", "narcs", "ntrans", "nplaces"] eval.index = ["conformative", "heuristics", "alpha", "inductive", "ilp"] eval -eval.to_csv("results/eval_all-miners_complete.csv", sep=";") +eval.to_csv("results/eval_all-miners_complete.csv", sep=" ") ## Without broken trace -event_log_clean = event_log[event_log['@@case_index'] != index_broken] -h_net, a_im, h_fm = pm4py.discover_petri_net_heuristics(event_log_clean) -a_net, h_im, a_fm = pm4py.discover_petri_net_alpha(event_log_clean) +event_log_clean = event_log[event_log['@@case_index'] != index_broken[0]] +h_net, h_im, h_fm = pm4py.discover_petri_net_heuristics(event_log_clean) +a_net, a_im, a_fm = pm4py.discover_petri_net_alpha(event_log_clean) i_net, i_im, i_fm = pm4py.discover_petri_net_inductive(event_log_clean) ilp_net, ilp_im, ilp_fm = pm4py.discover_petri_net_ilp(event_log_clean) baseline_eval = eval_pm(event_log_clean, basenet, initial_marking, final_marking) +is_sound = pm4py.check_soundness(basenet, initial_marking, final_marking) +baseline_eval.append(is_sound[0]) +baseline_eval.append(len(basenet.arcs)) +baseline_eval.append(len(basenet.transitions)) +baseline_eval.append(len(basenet.places)) + h_eval = eval_pm(event_log_clean, h_net, h_im, h_fm) +is_sound = pm4py.check_soundness(h_net, h_im, h_fm) +h_eval.append(is_sound[0]) 
+h_eval.append(len(h_net.arcs)) +h_eval.append(len(h_net.transitions)) +h_eval.append(len(h_net.places)) + a_eval = eval_pm(event_log_clean, a_net, a_im, a_fm) +is_sound = pm4py.check_soundness(a_net, a_im, a_fm) +a_eval.append(is_sound[0]) +a_eval.append(len(a_net.arcs)) +a_eval.append(len(a_net.transitions)) +a_eval.append(len(a_net.places)) + i_eval = eval_pm(event_log_clean, i_net, i_im, i_fm) +is_sound = pm4py.check_soundness(i_net, i_im, i_fm) +i_eval.append(is_sound[0]) +i_eval.append(len(i_net.arcs)) +i_eval.append(len(i_net.transitions)) +i_eval.append(len(i_net.places)) + ilp_eval = eval_pm(event_log_clean, ilp_net, ilp_im, ilp_fm) +is_sound = pm4py.check_soundness(ilp_net, ilp_im, ilp_fm) +ilp_eval.append(is_sound[0]) +ilp_eval.append(len(ilp_net.arcs)) +ilp_eval.append(len(ilp_net.transitions)) +ilp_eval.append(len(ilp_net.places)) eval = pd.DataFrame(np.row_stack([baseline_eval, h_eval, a_eval, i_eval, ilp_eval])) -eval.columns = ["fitness", "precision", "generalizability", "simplicity"] +eval.columns = ["fitness", "precision", "generalizability", "simplicity", + "sound", "narcs", "ntrans", "nplaces"] eval.index = ["conformative", "heuristics", "alpha", "inductive", "ilp"] eval -eval.to_csv("results/eval_all-miners_clean.csv", sep=";") +eval.to_csv("results/eval_all-miners_clean.csv", sep=" ") +# Export petri nets +pm4py.vis.save_vis_petri_net(h_net, h_im, h_fm, "results/processmaps/petrinet_heuristics_clean.png") +pm4py.vis.save_vis_petri_net(a_net, a_im, a_fm, "results/processmaps/petrinet_alpha_clean.png") +pm4py.vis.save_vis_petri_net(i_net, i_im, i_fm, "results/processmaps/petrinet_inductive_clean.png") +pm4py.vis.save_vis_petri_net(ilp_net, ilp_im, ilp_fm, "results/processmaps/petrinet_ilp_clean.png") +pm4py.vis.save_vis_petri_net(basenet, initial_marking, final_marking, "results/processmaps/petrinet_conformative.png") + +# convert to BPMN +base_bpmn = pm4py.convert.convert_to_bpmn(basenet, initial_marking, final_marking) +pm4py.vis.save_vis_bpmn(base_bpmn, "results/processmaps/bpmn_conformative.png") + +i_bpmn = pm4py.convert.convert_to_bpmn(i_net, i_im, i_fm) +pm4py.vis.save_vis_bpmn(i_bpmn, "results/processmaps/bpmn_inductive_clean.png") + +ilp_bpmn = pm4py.convert.convert_to_bpmn(ilp_net, ilp_im, ilp_fm) +pm4py.vis.save_vis_bpmn(ilp_bpmn, "results/processmaps/bpmn_ilp_clean.png") + +a_bpmn = pm4py.convert.convert_to_bpmn(a_net, a_im, a_fm) +pm4py.vis.save_vis_bpmn(a_bpmn, "results/processmaps/bpmn_alpha_clean.png") + +h_bpmn = pm4py.convert.convert_to_bpmn(h_net, h_im, h_fm) +pm4py.vis.save_vis_bpmn(h_bpmn, "results/processmaps/bpmn_heuristics_clean.png") ###### Process Mining - individual artworks ###### def pm_artworks(miner): - retval1 = np.empty((len(event_log["case:artwork"].unique()), 4)) - retval2 = np.empty((len(event_log["case:artwork"].unique()), 4)) + retval1 = np.empty((len(event_log["item"].unique()), 4)) + retval2 = np.empty((len(event_log["item"].unique()), 4)) - if miner == "heuristics": - net, im, fm = pm4py.discover_petri_net_heuristics(event_log) - elif miner == "inductive": - net, im, fm = pm4py.discover_petri_net_inductive(event_log) - elif miner == "alpha": - net, im, fm = pm4py.discover_petri_net_alpha(event_log) - elif miner == "ilp": - net, im, fm = pm4py.discover_petri_net_ilp(event_log) - - for i in range(len(event_log["case:artwork"].unique())): - artwork = event_log["case:artwork"].unique()[i] - subdata = pm4py.filter_event_attribute_values(event_log, "case:artwork", + for i in range(len(event_log["item"].unique())): + artwork = 
event_log["item"].unique()[i] + subdata = pm4py.filter_event_attribute_values(event_log, "item", [artwork], level="case", retain=True) if miner == "heuristics": @@ -325,17 +246,17 @@ def pm_artworks(miner): elif miner == "ilp": subnet, subim, subfm = pm4py.discover_petri_net_ilp(subdata) #pm4py.save_vis_petri_net(subnet, subim, subfm, - # "../figures/processmaps/artworks/petrinet_" + miner + "_" + str(artwork).zfill(3) + ".png") - retval1[i] = eval_pm(subdata, net, im, fm) + # "results/processmaps/artworks/petrinet_" + miner + "_" + str(artwork).zfill(3) + ".png") + retval1[i] = eval_pm(subdata, basenet, initial_marking, final_marking) retval2[i] = eval_pm(subdata, subnet, subim, subfm) retval1 = pd.DataFrame(retval1) retval1.columns = ["fitness", "precision", "generalizability", "simplicity"] - retval1.index = event_log["case:artwork"].unique() + retval1.index = event_log["item"].unique() retval1.insert(0, "nettype", "alldata") retval2 = pd.DataFrame(retval2) retval2.columns = ["fitness", "precision", "generalizability", "simplicity"] - retval2.index = event_log["case:artwork"].unique() + retval2.index = event_log["item"].unique() retval2.insert(0, "nettype", "subdata") return pd.concat([retval1, retval2]) @@ -343,7 +264,3 @@ def pm_artworks(miner): for miner in ["heuristics", "inductive", "alpha", "ilp"]: eval_art = pm_artworks(miner = miner) eval_art.to_csv("results/eval_artworks_" + miner + ".csv", sep=";") - -eval_art = pm_artworks(miner = "inductive") - - diff --git a/code/create-petrinet.py b/code/pm_create-petrinet.py similarity index 95% rename from code/create-petrinet.py rename to code/pm_create-petrinet.py index bccf41e..d9b416b 100644 --- a/code/create-petrinet.py +++ b/code/pm_create-petrinet.py @@ -138,15 +138,15 @@ final_marking = Marking() final_marking[sink] = 1 pm4py.view_petri_net(net_seq, initial_marking, final_marking) -pm4py.write_pnml(net_seq, initial_marking, final_marking, "results/conformative_petrinet_seq.pnml") +pm4py.write_pnml(net_seq, initial_marking, final_marking, "results/haum/conformative_petrinet_seq.pnml") pm4py.vis.save_vis_petri_net(net_seq, initial_marking, final_marking, - "../figures/conformative_petrinet_seq.png") + "results/processmaps/conformative_petrinet_seq.png") bpmn = pm4py.convert.convert_to_bpmn(net_seq, initial_marking, final_marking) pm4py.view_bpmn(bpmn) -pm4py.vis.save_vis_bpmn(bpmn, "../figures/conformative_bpmn_seq.png") +pm4py.vis.save_vis_bpmn(bpmn, "results/processmaps/conformative_bpmn_seq.png") ## Concurrent net @@ -240,12 +240,12 @@ final_marking = Marking() final_marking[sink] = 1 pm4py.view_petri_net(net_con, initial_marking, final_marking) -pm4py.write_pnml(net_con, initial_marking, final_marking, "results/conformative_petrinet_con.pnml") +pm4py.write_pnml(net_con, initial_marking, final_marking, "results/haum/conformative_petrinet_con.pnml") pm4py.vis.save_vis_petri_net(net_con, initial_marking, final_marking, - "../figures/conformative_petrinet_con.png") + "results/processmaps/conformative_petrinet_con.png") bpmn = pm4py.convert.convert_to_bpmn(net_con, initial_marking, final_marking) pm4py.view_bpmn(bpmn) -pm4py.vis.save_vis_bpmn(bpmn, "../figures/conformative_bpmn_con.png") +pm4py.vis.save_vis_bpmn(bpmn, "results/processmaps/conformative_bpmn_con.png") diff --git a/code/pm_infos-clusters.py b/code/pm_infos-clusters.py new file mode 100644 index 0000000..6e2e85c --- /dev/null +++ b/code/pm_infos-clusters.py @@ -0,0 +1,38 @@ +%reset + +import pm4py +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt 
+ +from python_helpers import eval_pm, pn_infos + +###### Load data and create event logs ###### + +dat = pd.read_csv("results/haum/event_logfiles_pre-corona_with-clusters.csv", sep = ";") + +log_path = pm4py.format_dataframe(dat, case_id = "path", activity_key = "event", + timestamp_key = "date.start") + +###### Infos for clusters ###### + +# Merge clusters into data frame +mdc = pd.DataFrame(columns = ["fitness", "precision", "generalizability", + "simplicity", "sound", "narcs", "ntrans", + "nplaces", "nvariants", "mostfreq"]) +for cluster in log_path.grp.unique().tolist(): + mdc = pd.concat([mdc, pn_infos(log_path, "grp", cluster)]) +mdc = mdc.sort_index() + +# Export +mdc.to_csv("results/haum/pn_infos_clusters.csv", sep = ";") + +###### Process maps for clusters ###### + +for cluster in log_path.grp.unique().tolist(): + subdata = log_path[log_path.grp == cluster] + subnet, subim, subfm = pm4py.discover_petri_net_inductive(subdata) + pm4py.save_vis_petri_net(subnet, subim, subfm, + "results/processmaps/petrinet_cluster" + str(cluster).zfill(3) + ".png") + bpmn = pm4py.convert.convert_to_bpmn(subnet, subim, subfm) + pm4py.vis.save_vis_bpmn(bpmn, "results/processmaps/bpmn_cluster_" + str(cluster).zfill(3) + ".png") diff --git a/code/pm_infos-items.py b/code/pm_infos-items.py new file mode 100644 index 0000000..6d1a910 --- /dev/null +++ b/code/pm_infos-items.py @@ -0,0 +1,54 @@ +%reset + +import pm4py +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt + +from python_helpers import eval_pm, pn_infos + +###### Load data and create event logs ###### + +dat = pd.read_csv("results/haum/event_logfiles_2024-01-18_09-58-52.csv", sep = ";") +dat = dat[dat["date.start"] < "2020-03-13"] +dat = dat[dat["path"] != 106098] # exclude broken trace +# --> only pre corona (before artworks were updated) + +log_path = pm4py.format_dataframe(dat, case_id = "path", activity_key = "event", + timestamp_key = "date.start") + + +###### Infos for items ###### + +mdi = pd.DataFrame(columns = ["fitness", "precision", "generalizability", + "simplicity", "sound", "narcs", "ntrans", + "nplaces", "nvariants", "mostfreq"]) +for item in log_path.item.unique().tolist(): + mdi = pd.concat([mdi, pn_infos(log_path, "item", item)]) +mdi = mdi.sort_index() + +# Export +mdi.to_csv("results/haum/pn_infos_items.csv", sep = ";") + +# datitem = dat.groupby("item")[["duration", "distance", +# "scaleSize", "rotationDegree"]].mean() +# +# def length_path(data): +# x = data.path +# return len(x.unique()) +# def length_case(data): +# x = data.case +# return len(x.unique()) +# def length_topic(data): +# x = data.topic.dropna() +# return len(x.unique()) +# +# datitem["npaths"] = dat.groupby(["item"]).apply(length_path) +# datitem["ncases"] = dat.groupby(["item"]).apply(length_case) +# datitem["ntopics"] = dat.groupby(["item"]).apply(length_topic) +# +# datitem.index = datitem.index.astype(str).str.rjust(3, "0") +# datitem = datitem.sort_index() +# datitem.index = mdi.index +# +# datitem = pd.concat([mdi, datitem], yaxis = 1) diff --git a/code/pm_navigation-behavior.py b/code/pm_navigation-behavior.py index 9aec338..7a95302 100644 --- a/code/pm_navigation-behavior.py +++ b/code/pm_navigation-behavior.py @@ -10,9 +10,9 @@ parameters = {pn_visualizer.Variants.FREQUENCY.value.Parameters.FORMAT: "png"} ###### Load data and create event logs ###### -dat = pd.read_csv("results/haum/event_logfiles_2024-01-02_19-44-50.csv", sep = ";") +dat = pd.read_csv("results/haum/event_logfiles_2024-01-18_09-58-52.csv", sep = ";") dat = 
dat[dat["date.start"] < "2020-03-13"] -dat = dat[dat["path"] != 81621] # exclude broken trace +dat = dat[dat["path"] != 106098] # exclude broken trace # --> only pre corona (before artworks were updated) event_log = pm4py.format_dataframe(dat, case_id='case', activity_key='event', @@ -26,32 +26,101 @@ pm4py.view_dfg(dfg, start_activities, end_activities) #filtered_log = pm4py.filter_event_attribute_values(event_log, 'item', [80]) -i_net, im, fm = pm4py.discover_petri_net_inductive(event_log) -pm4py.vis.view_petri_net(i_net, im, fm) -gviz = pn_visualizer.apply(i_net, im, fm, parameters=parameters, +net, im, fm = pm4py.discover_petri_net_inductive(event_log) +pm4py.vis.view_petri_net(net, im, fm) + +pm4py.vis.view_petri_net(net, im, fm) +gviz = pn_visualizer.apply(net, im, fm, parameters=parameters, variant=pn_visualizer.Variants.FREQUENCY, log=event_log) pn_visualizer.view(gviz) -len(i_net.places) -len(i_net.transitions) -len(i_net.arcs) +bpmn = pm4py.convert.convert_to_bpmn(net, im, fm) +pm4py.vis.view_bpmn(bpmn) -a_net, im, fm = pm4py.discover_petri_net_alpha(event_log) -pm4py.vis.view_petri_net(a_net, im, fm) -gviz = pn_visualizer.apply(a_net, im, fm, parameters=parameters, - variant=pn_visualizer.Variants.FREQUENCY, - log=event_log) -pn_visualizer.view(gviz) +net2, im2, fm2 = pm4py.discover_petri_net_inductive(event_log, noise_threshold=0.1) +pm4py.vis.view_petri_net(net2, im2, fm2) + +def eval_pm(data, net, initial_marking, final_marking): + """Caculate fitness, precision, generalizability, and simplicity for petri net""" + fitness = pm4py.fitness_token_based_replay(data, net, initial_marking, final_marking) + precisison = pm4py.precision_token_based_replay(data, net, initial_marking, final_marking) + #generalizability = pm4py.algo.evaluation.generalization.algorithm.apply(data, net, + # initial_marking, final_marking) + simplicity = pm4py.algo.evaluation.simplicity.algorithm.apply(net) + #return [fitness['average_trace_fitness'], precisison, generalizability, simplicity] + return [fitness['average_trace_fitness'], precisison, simplicity] + +eval = eval_pm(event_log, net, im, fm) +eval2 = eval_pm(event_log, net2, im2, fm2) + +len(net.places) +len(net.transitions) +len(net.arcs) + +# Number of cases +len(event_log.case.unique()) + +# Number of variants +variants = pm4py.get_variants(event_log) +len(variants) + +sorted_variants = dict(sorted(variants.items(), key=lambda item: item[1], reverse = True)) +{k: sorted_variants[k] for k in list(sorted_variants)[:20]} + +filtered_log = event_log[event_log["event"] != "move"] +variants_no_move = pm4py.get_variants(filtered_log) +len(variants_no_move) +sorted_variants_no_move = dict(sorted(variants_no_move.items(), key=lambda item: item[1], reverse = True)) +{k: sorted_variants_no_move[k] for k in list(sorted_variants_no_move)[:20]} -len(a_net.places) -len(a_net.transitions) -len(a_net.arcs) -h_net, im, fm = pm4py.discover_petri_net_heuristics(filtered_log) -pm4py.vis.view_petri_net(h_net, im, fm) -len(h_net.places) -len(h_net.transitions) -len(h_net.arcs) + + +###### Navigation behavior for case ###### + +log_case = pm4py.format_dataframe(dat, case_id = "case", activity_key = "item", + timestamp_key = "date.start") +log_case = log_case.merge(tmp, on = "item", how = "left") + +#filtered_log = pm4py.filter_event_attribute_values(log_case, "kcluster", [3]) +filtered_log = log_case[log_case.hcluster == 1] + +net, im, fm = pm4py.discover_dfg(filtered_log) +pm4py.vis.view_dfg(net, im, fm) + + +net, im, fm = 
pm4py.discover_petri_net_inductive(filtered_log) +pm4py.vis.view_petri_net(net, im, fm) + +tree = pm4py.discovery.discover_process_tree_inductive(filtered_log) +pm4py.vis.view_process_tree(tree) + + +datcase = dat[~dat.duplicated(["case", "path", "item"])] +datcase = datcase[["case", "path", "event", "item", "date.start"]] +datcase = datcase.reset_index().drop("index", axis = 1) +#datcase = pd.concat([datcase, pd.get_dummies(datcase["item"], dtype = "int")], axis = 1) + +datcase["duration"] = dat.groupby("path")["duration"].mean().tolist() +datcase["distance"] = dat.groupby("path")["distance"].mean().tolist() +datcase["scaleSize"] = dat.groupby("path")["scaleSize"].mean().tolist() +datcase["rotationDegree"] = dat.groupby("path")["rotationDegree"].mean().tolist() + +datcase["item"] = [str(item).zfill(3) for item in datcase.item] +datcase = datcase.merge(xy[["item", "hcluster"]], on = "item", how = "left") + +log_case = pm4py.format_dataframe(dat, case_id = "case", activity_key = "item", + timestamp_key = "date.start") + +net, im, fm = pm4py.discover_dfg(log_case) +pm4py.vis.view_dfg(net, im, fm) +# don't know if this will eventually finish? + +net, im, fm = pm4py.discover_dfg(log_case[log_case.hcluster == 1]) +pm4py.vis.view_dfg(net, im, fm) + + + diff --git a/code/python_helpers.py b/code/python_helpers.py new file mode 100644 index 0000000..9c7de96 --- /dev/null +++ b/code/python_helpers.py @@ -0,0 +1,37 @@ +import pm4py +import pandas as pd + +###### Extract metadata for petri nets on filtered logs ###### + +def eval_pm(data, net, initial_marking, final_marking): + """Caculate fitness, precision, generalizability, and simplicity for petri net""" + fitness = pm4py.fitness_token_based_replay(data, net, initial_marking, final_marking) + precisison = pm4py.precision_token_based_replay(data, net, initial_marking, final_marking) + generalizability = pm4py.algo.evaluation.generalization.algorithm.apply(data, net, + initial_marking, final_marking) + simplicity = pm4py.algo.evaluation.simplicity.algorithm.apply(net) + return [fitness['average_trace_fitness'], precisison, generalizability, simplicity] + + +def pn_infos(log, colname, filter): + """Create data frame with relevant infos for petri nets on filtered logs""" + filtered_log = pm4py.filter_event_attribute_values(log, colname, [filter]) + + net, im, fm = pm4py.discover_petri_net_inductive(filtered_log) + eval = eval_pm(filtered_log, net, im, fm) + is_sound = pm4py.check_soundness(net, im, fm) + eval.append(is_sound[0]) + eval.append(len(net.arcs)) + eval.append(len(net.transitions)) + eval.append(len(net.places)) + variants = pm4py.get_variants(filtered_log) + eval.append(len(variants)) + + sorted_variants = dict(sorted(variants.items(), key=lambda item: item[1], reverse = True)) + eval.append({k: sorted_variants[k] for k in list(sorted_variants)[:1]}) + + eval = pd.DataFrame(eval).T + eval.columns = ["fitness", "precision", "generalizability", "simplicity", + "sound", "narcs", "ntrans", "nplaces", "nvariants", "mostfreq"] + eval.index = [str(filter).zfill(3)] + return eval diff --git a/code/trace-clustering.py b/code/trace-clustering.py index f35cb2f..c42dab6 100644 --- a/code/trace-clustering.py +++ b/code/trace-clustering.py @@ -39,3 +39,136 @@ plt.plot(list(sse.keys()), list(sse.values())) plt.xlabel("Number of clusters") plt.ylabel("SSE") plt.show() + + + + +### TMP +datitem = dat.groupby("item")[["duration", "distance", + "scaleSize", "rotationDegree"]].mean() + +def length_path(data): + x = data.path + return len(x.unique()) +def 
length_case(data): + x = data.case + return len(x.unique()) +def length_topic(data): + x = data.topic.dropna() + return len(x.unique()) + +datitem["npaths"] = dat.groupby(["item"]).apply(length_path) +datitem["ncases"] = dat.groupby(["item"]).apply(length_case) +datitem["ntopics"] = dat.groupby(["item"]).apply(length_topic) + +datitem.index = datitem.index.astype(str).str.rjust(3, "0") +datitem = datitem.sort_index() +datitem.index = mdi.index + +datitem = pd.concat([mdi, datitem], axis = 1) + + + + + +###### Find clusters ###### + +myseed = 1420 + +mat = datitem.drop(["fitness", "sound", "mostfreq"], axis = 1) +mat = StandardScaler().fit_transform(mat) + +xy = pd.DataFrame(MDS(normalized_stress = 'auto', random_state = myseed).fit_transform(mat)) +xy.index = datitem.index + +### K-Means clustering ### + +kmeans = KMeans(n_clusters = 6, max_iter = 1000, random_state = myseed).fit(mat) +xy["kcluster"] = kmeans.labels_ + +for i in xy.kcluster.unique(): + plt.scatter(xy[xy.kcluster == i].iloc[:,0], xy[xy.kcluster == i].iloc[:,1], label = i) + for j, txt in enumerate(xy.index[xy.kcluster == i]): + plt.annotate(txt.split("_")[1], (xy[xy.kcluster == i].iloc[j,0], xy[xy.kcluster == i].iloc[j,1])) +plt.legend() +plt.show() + +xy.kcluster.value_counts() + +# Scree plot +sse = {} +for k in range(1, 10): + kmeans = KMeans(n_clusters = k, max_iter = 1000).fit(mat) + sse[k] = kmeans.inertia_ # Inertia: Sum of distances of samples to their closest cluster center +plt.figure() +plt.plot(list(sse.keys()), list(sse.values())) +plt.xlabel("Number of clusters") +plt.ylabel("SSE") +plt.show() + +c0_items = xy[xy.kcluster == 0].index +c1_items = xy[xy.kcluster == 1].index +c2_items = xy[xy.kcluster == 2].index +c3_items = xy[xy.kcluster == 3].index +c4_items = xy[xy.kcluster == 4].index +c5_items = xy[xy.kcluster == 5].index + +### Hierarchical clustering ### +from sklearn.cluster import AgglomerativeClustering + +hclust = AgglomerativeClustering(n_clusters = 6).fit(mat) +hclust.labels_ + +xy["hcluster"] = hclust.labels_ + +for i in xy.hcluster.unique(): + plt.scatter(xy[xy.hcluster == i].iloc[:,0], xy[xy.hcluster == i].iloc[:,1], label = i) + for j, txt in enumerate(xy.index[xy.hcluster == i]): + plt.annotate(txt.split("_")[1], (xy[xy.hcluster == i].iloc[j,0], xy[xy.hcluster == i].iloc[j,1])) +plt.legend() +plt.show() + +# dendrogram +from scipy.cluster.hierarchy import dendrogram + +def plot_dendrogram(model, **kwargs): + # Create linkage matrix and then plot the dendrogram + + # create the counts of samples under each node + counts = np.zeros(model.children_.shape[0]) + n_samples = len(model.labels_) + for i, merge in enumerate(model.children_): + current_count = 0 + for child_idx in merge: + if child_idx < n_samples: + current_count += 1 # leaf node + else: + current_count += counts[child_idx - n_samples] + counts[i] = current_count + + linkage_matrix = np.column_stack( + [model.children_, model.distances_, counts] + ).astype(float) + + # Plot the corresponding dendrogram + dendrogram(linkage_matrix, **kwargs) + +hclust = AgglomerativeClustering(distance_threshold = 0, n_clusters = None).fit(mat) + +plot_dendrogram(hclust) +plt.show() + +### Bisecting K-Means clustering ### +from sklearn.cluster import BisectingKMeans + +biKmeans = BisectingKMeans(n_clusters = 6, random_state = myseed).fit(mat) +biKmeans.labels_ + +xy["bcluster"] = biKmeans.labels_ + +for i in xy.bcluster.unique(): + plt.scatter(xy[xy.bcluster == i].iloc[:,0], xy[xy.bcluster == i].iloc[:,1], label = i) + for j, txt in 
enumerate(xy.index[xy.bcluster == i]): + plt.annotate(txt.split("_")[1], (xy[xy.bcluster == i].iloc[j,0], xy[xy.bcluster == i].iloc[j,1])) +plt.legend() +plt.show()
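+
+### Compare clusterings (sketch, not part of the original script) ###
+# The k-means, hierarchical, and bisecting k-means labels computed above are
+# never compared directly; their agreement could be quantified with the
+# adjusted Rand index. Assumes `xy` still holds the kcluster, hcluster, and
+# bcluster columns created earlier in this script.
+from sklearn.metrics import adjusted_rand_score
+
+print(adjusted_rand_score(xy.kcluster, xy.hcluster))
+print(adjusted_rand_score(xy.kcluster, xy.bcluster))
+print(adjusted_rand_score(xy.hcluster, xy.bcluster))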