Script dump after trying out a hundred things; needs serious cleaning

Nora Wickelmaier 2024-01-25 17:21:18 +01:00
parent b4ca4dd5eb
commit e8aac63504
11 changed files with 870 additions and 360 deletions

@@ -54,7 +54,7 @@ mat1 <- dat[, c("year", "duration1", "topicNumber1", "distance1", "scaleSize1",
paste0("A", unique(dat$artwork)), "flipCard", "move", "openTopic",
"openPopup")]
library(cluster) # for hiereachical clustering
library(cluster) # for hierarchical clustering
k1 <- kmeans(mat1, 2)
dat$kcluster <- k1$cluster

@@ -1,8 +1,10 @@
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
library(bupaverse)
# Read data
dat <- read.table("results/haum/event_logfiles_2024-01-02_19-44-50.csv",
dat0 <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv",
colClasses = c("character", "character", "POSIXct",
"POSIXct", "character", "integer",
"numeric", "character", "character",
@@ -11,10 +13,10 @@ dat <- read.table("results/haum/event_logfiles_2024-01-02_19-44-50.csv",
"character", "character"),
sep = ";", header = TRUE)
dat$event <- factor(dat$event, levels = c("move", "flipCard", "openTopic",
dat0$event <- factor(dat0$event, levels = c("move", "flipCard", "openTopic",
"openPopup"))
dat$weekdays <- factor(weekdays(dat$date.start),
dat0$weekdays <- factor(weekdays(dat0$date.start),
levels = c("Montag", "Dienstag", "Mittwoch",
"Donnerstag", "Freitag", "Samstag",
"Sonntag"),
@@ -23,105 +25,160 @@ dat$weekdays <- factor(weekdays(dat$date.start),
"Sunday"))
# Select data pre Corona
dat <- dat[as.Date(dat$date.start) < "2020-03-13", ]
dat <- dat[dat["path"] != 81621, ]
dat <- dat0[as.Date(dat0$date.start) < "2020-03-13", ]
dat <- dat[dat$path != 106098, ]
table(table(dat$start))
table(dat$event)
proportions(table(dat$event))
dat_dur <- aggregate(duration ~ item, dat, mean)
barplot(duration - mean(dat_dur$duration) ~ item, dat_dur, col = "#434F4F",
las = 3)
# Investigate paths (will separate items and give clusters of artworks!)
length(unique(dat$path))
datpath <- aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~
path, dat, function(x) mean(x, na.rm = TRUE), na.action = NULL)
datpath$length <- aggregate(item ~ path, dat, length)$item
datpath$nitems <- aggregate(item ~ path, dat, function(x)
length(unique(x)), na.action = NULL)$item
datpath$ntopics <- aggregate(topic ~ path, dat,
function(x) ifelse(all(is.na(x)), NA, length(unique(na.omit(x)))),
na.action = NULL)$topic
datpath$vacation <- aggregate(vacation ~ path, dat,
function(x) ifelse(all(is.na(x)), 0, 1),
na.action = NULL)$vacation
datpath$holiday <- aggregate(holiday ~ path, dat,
function(x) ifelse(all(is.na(x)), 0, 1),
na.action = NULL)$holiday
datpath$weekend <- aggregate(weekdays ~ path, dat,
function(x) ifelse(any(x %in% c("Saturday", "Sunday")), 1, 0),
na.action = NULL)$weekdays
datpath$morning <- aggregate(date.start ~ path, dat,
function(x) ifelse(lubridate::hour(x[1]) > 13, 0, 1),
na.action = NULL)$date.start
# Investigate cases (= interactions per time interval)
length(unique(dat$case))
datcase <- aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~
case, dat, function(x) mean(x, na.rm = TRUE), na.action = NULL)
datcase$length <- aggregate(item ~ case, dat, length)$item
datcase$nitems <- aggregate(item ~ case, dat, function(x)
length(unique(x)), na.action = NULL)$item
datcase$ntopics <- aggregate(topic ~ case, dat,
function(x) ifelse(all(is.na(x)), NA, length(unique(na.omit(x)))),
na.action = NULL)$topic
datcase$vacation <- aggregate(vacation ~ case, dat,
function(x) ifelse(all(is.na(x)), 0, 1),
na.action = NULL)$vacation
datcase$holiday <- aggregate(holiday ~ case, dat,
function(x) ifelse(all(is.na(x)), 0, 1),
na.action = NULL)$holiday
datcase$weekend <- aggregate(weekdays ~ case, dat,
function(x) ifelse(any(x %in% c("Saturday", "Sunday")), 1, 0),
na.action = NULL)$weekdays
datcase$morning <- aggregate(date.start ~ case, dat,
function(x) ifelse(lubridate::hour(x[1]) > 13, 0, 1),
na.action = NULL)$date.start
# Paths with more than one case associated
tmp <- aggregate(case ~ path, dat, function(x) length(unique(x)))
sum(tmp$case > 1)
table(tmp$case)
dat$date <- as.Date(dat$date.start)
tmp <- aggregate(date ~ path, dat, function(x) length(unique(x)))
sum(tmp$date > 1)
table(tmp$date)
tmp[tmp$date > 1, ]
for (p in tmp$path[tmp$date > 1]) {
print(dat[dat$path == p, 3:9])
cat("\n\n")
}
dat[dat$date == "2017-02-28" & dat$item == "503", ]
# Creating event logs
library(bupaverse)
dat$start <- dat$date.start
# DFGs per Cluster
dat$start <- dat$date.start
dat$complete <- dat$date.stop
table(table(dat$start))
# --> hmm...
summary(aggregate(duration ~ path, dat, mean))
alog <- activitylog(dat,
case_id = "path",
case_id = "path",
activity_id = "event",
resource_id = "item",
timestamps = c("start", "complete"))
timestamps = c("start", "complete"))
process_map(alog,
type_nodes = frequency("absolute"),
sec_nodes = frequency("relative"),
type_edges = frequency("absolute"),
sec_edges = frequency("relative"),
rankdir = "LR")
### Separate for items
datitem <- aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~
item, dat, function(x) mean(x, na.rm = TRUE), na.action = NULL)
datitem$npaths <- aggregate(path ~ item, dat,
function(x) length(unique(x)),
na.action = NULL)$path
datitem$ncases <- aggregate(case ~ item, dat,
function(x) length(unique(x)),
na.action = NULL)$case
datitem$ntopics <- aggregate(topic ~ item, dat,
function(x) ifelse(all(is.na(x)), NA, length(unique(na.omit(x)))),
na.action = NULL)$topic
set.seed(1211)
nclusters <- 6
k1 <- kmeans(datitem[, -1], nclusters)
#colors <- c("#3CB4DC", "#78004B", "#91C86E", "#FF6900")
colors <- palette.colors(palette = "Okabe-Ito")
xy <- cmdscale(dist(datitem[, -1]))
plot(xy, type = "n")
text(xy[,1], xy[,2], datitem$item, col = colors[k1$cluster])
legend("topright", paste("Cluster", 1:nclusters), col = colors, lty = 1)
## Scree plot
ks <- 1:10
sse <- NULL
for (k in ks) sse <- c(sse, kmeans(datitem[, -1], k)$tot.withinss)
plot(sse ~ ks, type = "l")
datitem$cluster <- k1$cluster
datitem_agg <- aggregate(. ~ cluster, datitem[, -1], mean)
dat_cl <- merge(dat, datitem[, c("item", "cluster")], by = "item", all.x = TRUE)
dat_cl <- dat_cl[order(dat_cl$fileId.start, dat_cl$date.start, dat_cl$timeMs.start), ]
write.table(dat_cl, "results/haum/event_logfiles_with-clusters_kmeans.csv",
sep = ";", row.names = FALSE)
vioplot::vioplot(datitem$duration)
vioplot::vioplot(duration ~ item, dat, las = 3)
vioplot::vioplot(duration ~ cluster, dat_cl)
vioplot::vioplot(distance ~ cluster, dat_cl)
vioplot::vioplot(scaleSize ~ cluster, dat_cl)
vioplot::vioplot(rotationDegree ~ cluster, dat_cl)
for (cluster in sort(unique(dat_cl$cluster))) {
alog <- activitylog(dat_cl[dat_cl$cluster == cluster, ],
case_id = "path",
activity_id = "event",
resource_id = "item",
timestamps = c("start", "complete"))
dfg <- process_map(alog,
type_nodes = frequency("relative"),
sec_nodes = frequency("absolute"),
type_edges = frequency("relative"),
sec_edges = frequency("absolute"),
rankdir = "LR",
render = FALSE)
export_map(dfg,
file_name = paste0("results/processmaps/dfg_cluster", cluster, "_R.pdf"),
file_type = "pdf",
title = paste("DFG Cluster", cluster))
}
tmp <- dat[dat$event != "move", ]
check_traces <- function(data) {
datagg <- aggregate(event ~ path, data,
function(x) ifelse("openPopup" %in% x, T, F))
paths <- datagg$path[datagg$event]
datcheck <- data[data$path %in% paths, c("path", "event")]
datcheck <- datcheck[!duplicated(datcheck), ]
datcheck <- datcheck[order(datcheck$path), ]
retval <- NULL
for (path in unique(datcheck$path)) {
check <- !all(as.character(datcheck$event[datcheck$path == path]) ==
c("flipCard", "openTopic", "openPopup"))
retval <- rbind(retval, data.frame(path, check))
}
retval
}
check <- check_traces(tmp)
sum(check$check)
alog <- activitylog(dat,
case_id = "case",
activity_id = "item",
resource_id = "path",
timestamps = c("start", "complete"))
process_map(alog,
type_nodes = frequency("absolute"),
@@ -131,55 +188,74 @@ process_map(alog,
rankdir = "LR")
alog2 <- activitylog(dat,
case_id = "case",
activity_id = "event",
resource_id = "item",
timestamps = c("start", "complete"))
process_map(alog2,
type_nodes = frequency("absolute"),
sec_nodes = frequency("relative"),
type_edges = frequency("absolute"),
sec_edges = frequency("relative"),
datcase <- dat[!duplicated(dat[, c("case", "path", "item")]),
c("case", "path", "event", "item")]
datcase$duration <- aggregate(duration ~ path, dat,
function(x) mean(x, na.rm = TRUE), na.action = NULL)$duration
datcase$distance <- aggregate(distance ~ path, dat,
function(x) mean(x, na.rm = TRUE), na.action = NULL)$distance
datcase$scaleSize <- aggregate(scaleSize ~ path, dat,
function(x) mean(x, na.rm = TRUE), na.action = NULL)$scaleSize
datcase$rotationDegree <- aggregate(rotationDegree ~ path, dat,
function(x) mean(x, na.rm = TRUE), na.action = NULL)$rotationDegree
# datcase$ntopics <- aggregate(topic ~ path, dat,
# function(x) ifelse(all(is.na(x)), NA, length(unique(na.omit(x)))),
# na.action = NULL)$topic
datcase$move <- ifelse(datcase$event == "move", 1, 0)
# paths that start with move
for (item in sort(unique(datcase$item))) {
datcase[paste0("item_", item)] <- ifelse(datcase$item == item, 1, 0)
}
mat <- na.omit(datcase[, -c(1:4)])
set.seed(1610)
nclusters <- 6
k1 <- kmeans(mat, nclusters)
#colors <- c("#3CB4DC", "#78004B", "#91C86E", "#FF6900")
colors <- palette.colors(palette = "Okabe-Ito")[1:nclusters]
library(distances)
mat_dist <- distances(mat)
xy <- cmdscale(mat_dist)
plot(xy, type = "n")
text(xy[,1], xy[,2], datcase$path, col = colors[k1$cluster])
legend("topright", paste("Cluster", 1:nclusters), col = colors, lty = 1)
## Scree plot
ks <- 1:10
sse <- NULL
for (k in ks) sse <- c(sse, kmeans(mat, k)$tot.withinss)
plot(sse ~ ks, type = "l")
alog <- activitylog(datcase,
case_id = "case",
activity_id = "item",
resource_id = "path",
timestamps = c("start", "complete"))
process_map(alog,
type_nodes = frequency("relative"),
sec_nodes = frequency("absolute"),
type_edges = frequency("relative"),
sec_edges = frequency("absolute"),
rankdir = "LR")
library(processanimateR)
animate_process(to_eventlog(alog))
col_vector <- c("#7FC97F", "#BEAED4", "#FDC086", "#FFFF99", "#386CB0",
"#F0027F", "#BF5B17", "#666666", "#1B9E77", "#D95F02",
"#7570B3", "#E7298A", "#66A61E", "#E6AB02", "#A6761D",
"#666666", "#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C",
"#FB9A99", "#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6",
"#6A3D9A", "#FFFF99", "#B15928", "#FBB4AE", "#B3CDE3",
"#CCEBC5", "#DECBE4", "#FED9A6", "#FFFFCC", "#E5D8BD",
"#FDDAEC", "#F2F2F2", "#B3E2CD", "#FDCDAC", "#CBD5E8",
"#F4CAE4", "#E6F5C9", "#FFF2AE", "#F1E2CC", "#CCCCCC",
"#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00",
"#FFFF33", "#A65628", "#F781BF", "#999999", "#66C2A5",
"#FC8D62", "#8DA0CB", "#E78AC3", "#A6D854", "#FFD92F",
"#E5C494", "#B3B3B3", "#8DD3C7", "#FFFFB3", "#BEBADA",
"#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5",
"#D9D9D9")
animate_process(to_eventlog(alog), mode = "relative", jitter = 10, legend = "color",
mapping = token_aes(color = token_scale("artwork",
scale = "ordinal",
range = col_vector)))
elog <- to_eventlog(alog)
animate_process(elog[elog$artwork == "054", ])
animate_process(elog[elog$artwork == "080", ])
animate_process(elog[elog$artwork == "501", ])
process_map(alog[alog$artwork == "054", ])
animate_process(elog[elog$artwork %in% c("080", "054"), ],
mode = "relative", jitter = 10, legend = "color",
mapping = token_aes(color = token_scale("artwork",
scale = "ordinal",
range = c("black", "gray"))))

code/check_broken_trace.R Normal file (+28 lines)

@@ -0,0 +1,28 @@
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
datraw <- read.table("results/haum/raw_logfiles_2024-01-18_09-58-52.csv",
header = TRUE, sep = ";")
# Read data
datlogs <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv",
colClasses = c("character", "character", "POSIXct",
"POSIXct", "character", "integer",
"numeric", "character", "character",
rep("numeric", 3), "character",
"character", rep("numeric", 11),
"character", "character"),
sep = ";", header = TRUE)
datlogs <- datlogs[order(datlogs$fileId.start, datlogs$date.start, datlogs$timeMs.start), ]
artwork <- "176"
fileId <- c('2017_06_16-13_49_00.log', '2017_06_16-13_59_00.log')
path <- 106098
datraw[datraw$item == artwork & datraw$fileId %in% fileId, ]
datlogs[datlogs$path == path, ]

code/item_clustering.R Normal file (+158 lines)

@@ -0,0 +1,158 @@
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")
#--------------- (1) Read data ---------------
#--------------- (1.1) Read log event data ---------------
dat0 <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv",
colClasses = c("character", "character", "POSIXct",
"POSIXct", "character", "integer",
"numeric", "character", "character",
rep("numeric", 3), "character",
"character", rep("numeric", 11),
"character", "character"),
sep = ";", header = TRUE)
dat0$event <- factor(dat0$event, levels = c("move", "flipCard", "openTopic",
"openPopup"))
# Select data pre Corona
dat <- dat0[as.Date(dat0$date.start) < "2020-03-13", ]
dat <- dat[dat$path != 106098, ]
#--------------- (1.2) Read infos for PM for items ---------------
datitem <- read.table("results/haum/pn_infos_items.csv", header = TRUE,
sep = ";", row.names = 1)
#--------------- (1.3) Extract additional infos for clustering ---------------
datitem$duration <- aggregate(duration ~ item, dat, mean)$duration
datitem$distance <- aggregate(distance ~ item, dat, mean)$distance
datitem$scaleSize <- aggregate(scaleSize ~ item, dat, mean)$scaleSize
datitem$rotationDegree <- aggregate(rotationDegree ~ item, dat, mean)$rotationDegree
datitem$npaths <- aggregate(path ~ item, dat, function(x) length(unique(x)))$path
datitem$ncases <- aggregate(case ~ item, dat, function(x) length(unique(x)))$case
datitem$ntopics <- aggregate(topic ~ item, dat, function(x) length(unique(x)))$topic
# frequency count of the most frequent variant, parsed from the "mostfreq" dict string written by the Python helpers
datitem$mostfreq_num <- as.numeric(gsub(".*: (.*)}", "\\1", datitem$mostfreq))
#--------------- (2) Clustering ---------------
df <- datitem[, c("precision", "generalizability", "nvariants",
"mostfreq_num", "duration", "distance", "scaleSize",
"rotationDegree", "npaths", "ncases", "ntopics")] |>
scale()
mat <- dist(df)
hc <- hclust(mat, method = "ward.D2")
library(factoextra)
fviz_dend(hc, cex = 0.5)
datitem$grp <- cutree(hc, k = 6)
fviz_dend(hc, k = 6,
cex = 0.5,
k_colors = c("#78004B", "#000000", "#3CB4DC", "#91C86E",
"#FF6900", "#434F4F"),
#type = "phylogenic",
rect = TRUE
)
p <- fviz_cluster(list(data = df, cluster = datitem$grp),
palette = c("#78004B", "#000000", "#3CB4DC", "#91C86E",
"#FF6900", "#434F4F"),
ellipse.type = "convex",
repel = TRUE,
show.clust.cent = FALSE, ggtheme = theme_bw())
p
aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths,
ncases, ntopics) ~ grp, datitem, mean)
datitem$item <- gsub("item_([0-9]{3})", "\\1", row.names(datitem))
res <- merge(dat, datitem[, c("item", "grp")], by = "item", all.x = TRUE)
res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ]
write.table(res,
file = "results/haum/event_logfiles_pre-corona_with-clusters.csv",
sep = ";",
quote = FALSE,
row.names = FALSE)
library(bupaverse)
res$start <- res$date.start
res$complete <- res$date.stop
for (cluster in sort(unique(res$grp))) {
alog <- activitylog(res[res$grp == cluster, ],
case_id = "path",
activity_id = "event",
resource_id = "item",
timestamps = c("start", "complete"))
dfg <- process_map(alog,
type_nodes = frequency("relative"),
sec_nodes = frequency("absolute"),
type_edges = frequency("relative"),
sec_edges = frequency("absolute"),
rankdir = "LR",
render = FALSE)
export_map(dfg,
file_name = paste0("results/processmaps/dfg_cluster", cluster, "_R.pdf"),
file_type = "pdf",
title = paste("DFG Cluster", cluster))
}
#--------------- (3) Visualization with pictures ---------------
library(png)
library(jpeg)
library(grid)
colors <- c("#78004B", "#000000", "#3CB4DC", "#91C86E", "#FF6900",
"#434F4F")
#pdf("results/haum/figures/clustering_artworks.pdf", height = 8, width = 8, pointsize = 10)
png("results/haum/figures/clustering_artworks.png", units = "in", height = 8, width = 8, pointsize = 10, res = 300)
par(mai = c(.6,.6,.1,.1), mgp = c(2.4, 1, 0))
plot(y ~ x, p$data, type = "n", ylim = c(-3.2, 3), xlim = c(-4.7, 6.4))
for (item in sprintf("%03d", as.numeric(rownames(p$data)))) {
if (item == "125") {
pic <- readJPEG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
item, "/", item, ".jpg"))
} else {
pic <- readPNG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
item, "/", item, ".png"))
}
img <- as.raster(pic[,,1:3])
x <- p$data$x[sprintf("%03d", as.numeric(rownames(p$data))) == item]
y <- p$data$y[sprintf("%03d", as.numeric(rownames(p$data))) == item]
points(x, y,
col = colors[p$data$cluster[sprintf("%03d", as.numeric(rownames(p$data))) == item]],
cex = 9,
pch = 15)
rasterImage(img,
xleft = x - .4,
xright = x + .4,
ybottom = y - .2,
ytop = y + .2)
}
dev.off()

@@ -1,4 +1,3 @@
#%% # needed for shortcuts to run properly in VSCode *eyeroll*
%reset
import pm4py
@@ -9,13 +8,12 @@ import matplotlib.pyplot as plt
###### Load data and create event logs ######
dat = pd.read_csv("results/haum/event_logfiles_2024-01-02_19-44-50.csv", sep = ";")
dat = pd.read_csv("results/haum/event_logfiles_2024-01-18_09-58-52.csv", sep = ";")
dat = dat[dat["date.start"] < "2020-03-13"]
# --> only pre corona (before artworks were updated)
event_log = pm4py.format_dataframe(dat, case_id='path', activity_key='event',
timestamp_key='date.start')
event_log = event_log.rename(columns={'item': 'case:item'})
###### Descriptives of log data ######
@@ -39,30 +37,17 @@ len(variants_no_move)
sorted_variants_no_move = dict(sorted(variants_no_move.items(), key=lambda item: item[1], reverse = True))
{k: sorted_variants_no_move[k] for k in list(sorted_variants_no_move)[:20]}
# Path length
event_log.path.value_counts()
event_log.path.value_counts().mean()
event_log.path.value_counts().median()
event_log.path.value_counts().min()
event_log.path.value_counts().max()
plt.hist(event_log.path.value_counts(), bins=200)
plt.show()
# TODO: Do it again in R -- much smoother and more info, better plots
###### Read "conformative" Petri Net ######
basenet, initial_marking, final_marking = pm4py.read_pnml("results/conformative_petrinet_con.pnml")
def eval_pm(data, net, initial_marking, final_marking):
"""Caculate fitness, precision, generalizability, and simplicity for petri net"""
fitness = pm4py.fitness_token_based_replay(data, net, initial_marking, final_marking)
precisison = pm4py.precision_token_based_replay(data, net,
initial_marking, final_marking)
fitness = pm4py.fitness_token_based_replay(data, net, initial_marking, final_marking)
precisison = pm4py.precision_token_based_replay(data, net, initial_marking, final_marking)
generalizability = pm4py.algo.evaluation.generalization.algorithm.apply(data, net,
initial_marking, final_marking)
simplicity = pm4py.algo.evaluation.simplicity.algorithm.apply(net)
initial_marking, final_marking)
simplicity = pm4py.algo.evaluation.simplicity.algorithm.apply(net)
return [fitness['average_trace_fitness'], precisison, generalizability, simplicity]
baseline_eval = eval_pm(event_log, basenet, initial_marking, final_marking)
@@ -80,75 +65,39 @@ for i in range(len(replayed_traces)):
l3.append(replayed_traces[i]["reached_marking"])
l4.append(replayed_traces[i]["transitions_with_problems"])
np.mean(l1)
set(l1)
index_broken = l1.index(1)
np.mean(l2)
set(l2)
l2.index(1)
x1 = np.array(l1)
index_broken = np.where(x1 == 1)[0].tolist()
set(l3)
l4.count([])
l3[index_broken]
l4[index_broken]
[l3[i] for i in index_broken]
[l4[i] for i in index_broken]
replayed_traces[index_broken]
broken_traces = [replayed_traces[i] for i in index_broken]
event_log[event_log['@@case_index'] == index_broken].event
event_log[event_log['@@case_index'] == index_broken].path
event_log[event_log['@@case_index'] == index_broken].item
event_log[event_log['@@case_index'] == index_broken]["fileId.start"]
# --> logging error in file!
event_log[event_log['@@case_index'] == index_broken].path.unique().tolist()
event_log[event_log['@@case_index'] == index_broken].item.unique().tolist()
event_log[event_log['@@case_index'] == index_broken]["fileId.start"].unique().tolist()
# --> logging error in raw file
from pm4py.algo.conformance.tokenreplay import algorithm as token_based_replay
parameters_tbr = {token_based_replay.Variants.TOKEN_REPLAY.value.Parameters.DISABLE_VARIANTS: True, token_based_replay.Variants.TOKEN_REPLAY.value.Parameters.ENABLE_PLTR_FITNESS: True}
replayed_traces, place_fitness, trans_fitness, unwanted_activities = token_based_replay.apply(event_log, basenet,
initial_marking,
final_marking,
parameters=parameters_tbr)
from pm4py.algo.conformance.tokenreplay.diagnostics import duration_diagnostics
trans_diagnostics = duration_diagnostics.diagnose_from_trans_fitness(event_log, trans_fitness)
for trans in trans_diagnostics:
print(trans, trans_diagnostics[trans])
# Footprints
from pm4py.algo.discovery.footprints import algorithm as footprints_discovery
fp_log = footprints_discovery.apply(event_log, variant=footprints_discovery.Variants.ENTIRE_EVENT_LOG)
fp_trace_by_trace = footprints_discovery.apply(event_log, variant=footprints_discovery.Variants.TRACE_BY_TRACE)
fp_net = footprints_discovery.apply(basenet, initial_marking, final_marking)
from pm4py.visualization.footprints import visualizer as fp_visualizer
fp_log = footprints_discovery.apply(event_log, variant=footprints_discovery.Variants.ENTIRE_EVENT_LOG)
fp_net = footprints_discovery.apply(basenet, initial_marking, final_marking)
gviz = fp_visualizer.apply(fp_net, parameters={fp_visualizer.Variants.SINGLE.value.Parameters.FORMAT: "svg"})
fp_visualizer.view(gviz)
gviz = fp_visualizer.apply(fp_log, fp_net, parameters={fp_visualizer.Variants.COMPARISON.value.Parameters.FORMAT: "svg"})
fp_visualizer.view(gviz)
conf_fp = pm4py.conformance_diagnostics_footprints(fp_trace_by_trace, fp_net)
from pm4py.algo.conformance.footprints import algorithm as fp_conformance
conf_result = fp_conformance.apply(fp_log, fp_net, variant=fp_conformance.Variants.LOG_EXTENSIVE)
from pm4py.algo.conformance.footprints.util import evaluation
fitness = evaluation.fp_fitness(fp_log, fp_net, conf_result)
precision = evaluation.fp_precision(fp_log, fp_net)
# Skeleton
from pm4py.algo.discovery.log_skeleton import algorithm as lsk_discovery
skeleton = lsk_discovery.apply(event_log, parameters={lsk_discovery.Variants.CLASSIC.value.Parameters.NOISE_THRESHOLD: 0.0})
from pm4py.algo.conformance.log_skeleton import algorithm as lsk_conformance
conf_result = lsk_conformance.apply(event_log, skeleton)
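# Sketch (not in the original script): summarise the skeleton conformance result.
# Assumption: lsk_conformance.apply() returns one diagnostics dict per trace with
# an "is_fit" flag (pm4py 2.x); adjust the key name if your version differs.
n_fit = sum(1 for d in conf_result if d.get("is_fit", False))
print(f"{n_fit} of {len(conf_result)} traces conform to the log skeleton")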
pm4py.vis.view_petri_net(basenet, initial_marking, final_marking)
is_sound = pm4py.check_soundness(basenet, initial_marking, final_marking)
is_sound[0]
len(basenet.arcs)
len(basenet.transitions)
len(basenet.places)
baseline_eval.append(is_sound[0])
baseline_eval.append(len(basenet.arcs))
baseline_eval.append(len(basenet.transitions))
baseline_eval.append(len(basenet.places))
efg_graph = pm4py.discover_eventually_follows_graph(event_log)
@@ -157,163 +106,135 @@ dfg, start_activities, end_activities = pm4py.discover_dfg(event_log)
pm4py.view_dfg(dfg, start_activities, end_activities)
pm4py.save_vis_dfg(dfg, start_activities, end_activities, '../figures/processmaps/dfg_complete.png')
## Heuristics Miner
## Fitting different miners
### Heuristics Miner
h_net, im, fm = pm4py.discover_petri_net_heuristics(event_log)
pm4py.vis.view_petri_net(h_net, im, fm)
pm4py.vis.save_vis_petri_net(h_net, im, fm, "../figures/processmaps/petrinet_heuristics_complete.png")
h_eval = eval_pm(event_log, h_net, im, fm)
is_sound = pm4py.check_soundness(h_net, im, fm)
is_sound[0]
len(h_net.arcs)
len(h_net.transitions)
len(h_net.places)
# decorated petri net
from pm4py.visualization.petri_net import visualizer as pn_visualizer
parameters = {pn_visualizer.Variants.FREQUENCY.value.Parameters.FORMAT: "png"}
gviz = pn_visualizer.apply(h_net, im, fm, parameters=parameters, variant=pn_visualizer.Variants.FREQUENCY, log=event_log)
pn_visualizer.save(gviz, "../figures/processmaps/petrinet_heuristics_complete_decorated.png")
# convert to BPMN
bpmn = pm4py.convert.convert_to_bpmn(h_net, im, fm)
pm4py.vis.view_bpmn(bpmn)
h_eval.append(is_sound[0])
h_eval.append(len(h_net.arcs))
h_eval.append(len(h_net.transitions))
h_eval.append(len(h_net.places))
## Alpha Miner
a_net, im, fm = pm4py.discover_petri_net_alpha(event_log)
pm4py.vis.view_petri_net(a_net, im, fm)
pm4py.vis.save_vis_petri_net(a_net, im, fm, "../figures/processmaps/petrinet_alpha_complete.png")
a_eval = eval_pm(event_log, a_net, im, fm)
is_sound = pm4py.check_soundness(a_net, im, fm)
is_sound[0]
len(a_net.arcs)
len(a_net.transitions)
len(a_net.places)
a_eval.append(is_sound[0])
a_eval.append(len(a_net.arcs))
a_eval.append(len(a_net.transitions))
a_eval.append(len(a_net.places))
## Inductive Miner
i_net, im, fm = pm4py.discover_petri_net_inductive(event_log)
pm4py.vis.view_petri_net(i_net, im, fm)
pm4py.vis.save_vis_petri_net(i_net, im, fm, "../figures/processmaps/petrinet_induction_complete.png")
i_eval = eval_pm(event_log, i_net, im, fm)
# as process tree (does not work for heuristics miner!)
pt = pm4py.discover_process_tree_inductive(event_log)
pm4py.vis.view_process_tree(pt)
is_sound = pm4py.check_soundness(i_net, im, fm)
is_sound[0]
# TODO: Can I show that this simpler net does not include all traces? (Probably not,
# since fitness is 1, but WHY?)
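# One way to probe this (sketch, not part of the original analysis): replay every
# trace on the inductive net with the high-level pm4py API and collect the variants
# that are not perfectly fit. Assumes pm4py 2.x, where each diagnostics dict
# carries a "trace_is_fit" flag; adjust the key name if your version differs.
diag = pm4py.conformance_diagnostics_token_based_replay(event_log, i_net, im, fm)
unfit = [i for i, d in enumerate(diag) if not d.get("trace_is_fit", True)]
len(unfit)  # 0 would mean the simpler net still replays every trace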
len(i_net.arcs)
len(i_net.transitions)
len(i_net.places)
bpmn = pm4py.convert.convert_to_bpmn(i_net, im, fm)
pm4py.view_bpmn(bpmn)
from pm4py.algo.conformance.tokenreplay import algorithm as token_based_replay
parameters_tbr = {token_based_replay.Variants.TOKEN_REPLAY.value.Parameters.DISABLE_VARIANTS: True, token_based_replay.Variants.TOKEN_REPLAY.value.Parameters.ENABLE_PLTR_FITNESS: True}
replayed_traces, place_fitness, trans_fitness, unwanted_activities = token_based_replay.apply(event_log, i_net,
im,
fm,
parameters=parameters_tbr)
l1 = list()
l2 = list()
l3 = list()
l4 = list()
for i in range(len(replayed_traces)):
l1.append(replayed_traces[i]["remaining_tokens"])
l2.append(replayed_traces[i]["missing_tokens"])
l3.append(replayed_traces[i]["reached_marking"])
l4.append(replayed_traces[i]["transitions_with_problems"])
np.mean(l1)
set(l1)
index_broken = l1.index(1)
np.mean(l2)
set(l2)
l2.index(1)
set(l3)
l4.count([])
l3[index_broken]
l4[index_broken]
replayed_traces[index_broken]
event_log[event_log['@@case_index'] == index_broken].event
event_log[event_log['@@case_index'] == index_broken].path
event_log[event_log['@@case_index'] == index_broken].item
event_log[event_log['@@case_index'] == index_broken]["fileId.start"]
i_eval.append(is_sound[0])
i_eval.append(len(i_net.arcs))
i_eval.append(len(i_net.transitions))
i_eval.append(len(i_net.places))
## ILP Miner
ilp_net, im, fm = pm4py.discover_petri_net_ilp(event_log)
pm4py.vis.view_petri_net(ilp_net, im, fm)
pm4py.vis.save_vis_petri_net(ilp_net, im, fm, "../figures/processmaps/petrinet_ilp_complete.png")
ilp_eval = eval_pm(event_log, ilp_net, im, fm)
is_sound = pm4py.check_soundness(ilp_net, im, fm)
is_sound[0]
len(ilp_net.arcs)
len(ilp_net.transitions)
len(ilp_net.places)
ilp_eval.append(is_sound[0])
ilp_eval.append(len(ilp_net.arcs))
ilp_eval.append(len(ilp_net.transitions))
ilp_eval.append(len(ilp_net.places))
## Export for all miners
eval = pd.DataFrame(np.row_stack([baseline_eval, h_eval, a_eval, i_eval, ilp_eval]))
eval.columns = ["fitness", "precision", "generalizability", "simplicity"]
eval.columns = ["fitness", "precision", "generalizability", "simplicity",
"sound", "narcs", "ntrans", "nplaces"]
eval.index = ["conformative", "heuristics", "alpha", "inductive", "ilp"]
eval
eval.to_csv("results/eval_all-miners_complete.csv", sep=";")
eval.to_csv("results/eval_all-miners_complete.csv", sep=" ")
## Without broken trace
event_log_clean = event_log[event_log['@@case_index'] != index_broken]
h_net, a_im, h_fm = pm4py.discover_petri_net_heuristics(event_log_clean)
a_net, h_im, a_fm = pm4py.discover_petri_net_alpha(event_log_clean)
event_log_clean = event_log[event_log['@@case_index'] != index_broken[0]]
h_net, h_im, h_fm = pm4py.discover_petri_net_heuristics(event_log_clean)
a_net, a_im, a_fm = pm4py.discover_petri_net_alpha(event_log_clean)
i_net, i_im, i_fm = pm4py.discover_petri_net_inductive(event_log_clean)
ilp_net, ilp_im, ilp_fm = pm4py.discover_petri_net_ilp(event_log_clean)
baseline_eval = eval_pm(event_log_clean, basenet, initial_marking, final_marking)
is_sound = pm4py.check_soundness(basenet, initial_marking, final_marking)
baseline_eval.append(is_sound[0])
baseline_eval.append(len(basenet.arcs))
baseline_eval.append(len(basenet.transitions))
baseline_eval.append(len(basenet.places))
h_eval = eval_pm(event_log_clean, h_net, h_im, h_fm)
is_sound = pm4py.check_soundness(h_net, h_im, h_fm)
h_eval.append(is_sound[0])
h_eval.append(len(h_net.arcs))
h_eval.append(len(h_net.transitions))
h_eval.append(len(h_net.places))
a_eval = eval_pm(event_log_clean, a_net, a_im, a_fm)
is_sound = pm4py.check_soundness(a_net, a_im, a_fm)
a_eval.append(is_sound[0])
a_eval.append(len(a_net.arcs))
a_eval.append(len(a_net.transitions))
a_eval.append(len(a_net.places))
i_eval = eval_pm(event_log_clean, i_net, i_im, i_fm)
is_sound = pm4py.check_soundness(i_net, i_im, i_fm)
i_eval.append(is_sound[0])
i_eval.append(len(i_net.arcs))
i_eval.append(len(i_net.transitions))
i_eval.append(len(i_net.places))
ilp_eval = eval_pm(event_log_clean, ilp_net, ilp_im, ilp_fm)
is_sound = pm4py.check_soundness(ilp_net, ilp_im, ilp_fm)
ilp_eval.append(is_sound[0])
ilp_eval.append(len(ilp_net.arcs))
ilp_eval.append(len(ilp_net.transitions))
ilp_eval.append(len(ilp_net.places))
eval = pd.DataFrame(np.row_stack([baseline_eval, h_eval, a_eval, i_eval, ilp_eval]))
eval.columns = ["fitness", "precision", "generalizability", "simplicity"]
eval.columns = ["fitness", "precision", "generalizability", "simplicity",
"sound", "narcs", "ntrans", "nplaces"]
eval.index = ["conformative", "heuristics", "alpha", "inductive", "ilp"]
eval
eval.to_csv("results/eval_all-miners_clean.csv", sep=";")
eval.to_csv("results/eval_all-miners_clean.csv", sep=" ")
# Export petri nets
pm4py.vis.save_vis_petri_net(h_net, h_im, h_fm, "results/processmaps/petrinet_heuristics_clean.png")
pm4py.vis.save_vis_petri_net(a_net, a_im, a_fm, "results/processmaps/petrinet_alpha_clean.png")
pm4py.vis.save_vis_petri_net(i_net, i_im, i_fm, "results/processmaps/petrinet_inductive_clean.png")
pm4py.vis.save_vis_petri_net(ilp_net, ilp_im, ilp_fm, "results/processmaps/petrinet_ilp_clean.png")
pm4py.vis.save_vis_petri_net(basenet, initial_marking, final_marking, "results/processmaps/petrinet_conformative.png")
# convert to BPMN
base_bpmn = pm4py.convert.convert_to_bpmn(basenet, initial_marking, final_marking)
pm4py.vis.save_vis_bpmn(base_bpmn, "results/processmaps/bpmn_conformative.png")
i_bpmn = pm4py.convert.convert_to_bpmn(i_net, i_im, i_fm)
pm4py.vis.save_vis_bpmn(i_bpmn, "results/processmaps/bpmn_inductive_clean.png")
ilp_bpmn = pm4py.convert.convert_to_bpmn(ilp_net, ilp_im, ilp_fm)
pm4py.vis.save_vis_bpmn(ilp_bpmn, "results/processmaps/bpmn_ilp_clean.png")
a_bpmn = pm4py.convert.convert_to_bpmn(a_net, a_im, a_fm)
pm4py.vis.save_vis_bpmn(a_bpmn, "results/processmaps/bpmn_alpha_clean.png")
h_bpmn = pm4py.convert.convert_to_bpmn(h_net, h_im, h_fm)
pm4py.vis.save_vis_bpmn(h_bpmn, "results/processmaps/bpmn_heuristics_clean.png")
###### Process Mining - individual artworks ######
def pm_artworks(miner):
retval1 = np.empty((len(event_log["case:artwork"].unique()), 4))
retval2 = np.empty((len(event_log["case:artwork"].unique()), 4))
retval1 = np.empty((len(event_log["item"].unique()), 4))
retval2 = np.empty((len(event_log["item"].unique()), 4))
if miner == "heuristics":
net, im, fm = pm4py.discover_petri_net_heuristics(event_log)
elif miner == "inductive":
net, im, fm = pm4py.discover_petri_net_inductive(event_log)
elif miner == "alpha":
net, im, fm = pm4py.discover_petri_net_alpha(event_log)
elif miner == "ilp":
net, im, fm = pm4py.discover_petri_net_ilp(event_log)
for i in range(len(event_log["case:artwork"].unique())):
artwork = event_log["case:artwork"].unique()[i]
subdata = pm4py.filter_event_attribute_values(event_log, "case:artwork",
for i in range(len(event_log["item"].unique())):
artwork = event_log["item"].unique()[i]
subdata = pm4py.filter_event_attribute_values(event_log, "item",
[artwork],
level="case", retain=True)
if miner == "heuristics":
@@ -325,17 +246,17 @@ def pm_artworks(miner):
elif miner == "ilp":
subnet, subim, subfm = pm4py.discover_petri_net_ilp(subdata)
#pm4py.save_vis_petri_net(subnet, subim, subfm,
# "../figures/processmaps/artworks/petrinet_" + miner + "_" + str(artwork).zfill(3) + ".png")
retval1[i] = eval_pm(subdata, net, im, fm)
# "results/processmaps/artworks/petrinet_" + miner + "_" + str(artwork).zfill(3) + ".png")
retval1[i] = eval_pm(subdata, basenet, initial_marking, final_marking)
retval2[i] = eval_pm(subdata, subnet, subim, subfm)
retval1 = pd.DataFrame(retval1)
retval1.columns = ["fitness", "precision", "generalizability", "simplicity"]
retval1.index = event_log["case:artwork"].unique()
retval1.index = event_log["item"].unique()
retval1.insert(0, "nettype", "alldata")
retval2 = pd.DataFrame(retval2)
retval2.columns = ["fitness", "precision", "generalizability", "simplicity"]
retval2.index = event_log["case:artwork"].unique()
retval2.index = event_log["item"].unique()
retval2.insert(0, "nettype", "subdata")
return pd.concat([retval1, retval2])
@@ -343,7 +264,3 @@ def pm_artworks(miner):
for miner in ["heuristics", "inductive", "alpha", "ilp"]:
eval_art = pm_artworks(miner = miner)
eval_art.to_csv("results/eval_artworks_" + miner + ".csv", sep=";")
eval_art = pm_artworks(miner = "inductive")

@@ -138,15 +138,15 @@ final_marking = Marking()
final_marking[sink] = 1
pm4py.view_petri_net(net_seq, initial_marking, final_marking)
pm4py.write_pnml(net_seq, initial_marking, final_marking, "results/conformative_petrinet_seq.pnml")
pm4py.write_pnml(net_seq, initial_marking, final_marking, "results/haum/conformative_petrinet_seq.pnml")
pm4py.vis.save_vis_petri_net(net_seq, initial_marking, final_marking,
"../figures/conformative_petrinet_seq.png")
"results/processmaps/conformative_petrinet_seq.png")
bpmn = pm4py.convert.convert_to_bpmn(net_seq, initial_marking, final_marking)
pm4py.view_bpmn(bpmn)
pm4py.vis.save_vis_bpmn(bpmn, "../figures/conformative_bpmn_seq.png")
pm4py.vis.save_vis_bpmn(bpmn, "results/processmaps/conformative_bpmn_seq.png")
## Concurrent net
@@ -240,12 +240,12 @@ final_marking = Marking()
final_marking[sink] = 1
pm4py.view_petri_net(net_con, initial_marking, final_marking)
pm4py.write_pnml(net_con, initial_marking, final_marking, "results/conformative_petrinet_con.pnml")
pm4py.write_pnml(net_con, initial_marking, final_marking, "results/haum/conformative_petrinet_con.pnml")
pm4py.vis.save_vis_petri_net(net_con, initial_marking, final_marking,
"../figures/conformative_petrinet_con.png")
"results/processmaps/conformative_petrinet_con.png")
bpmn = pm4py.convert.convert_to_bpmn(net_con, initial_marking, final_marking)
pm4py.view_bpmn(bpmn)
pm4py.vis.save_vis_bpmn(bpmn, "../figures/conformative_bpmn_con.png")
pm4py.vis.save_vis_bpmn(bpmn, "results/processmaps/conformative_bpmn_con.png")

code/pm_infos-clusters.py Normal file (+38 lines)

@@ -0,0 +1,38 @@
%reset
import pm4py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from python_helpers import eval_pm, pn_infos
###### Load data and create event logs ######
dat = pd.read_csv("results/haum/event_logfiles_pre-corona_with-clusters.csv", sep = ";")
log_path = pm4py.format_dataframe(dat, case_id = "path", activity_key = "event",
timestamp_key = "date.start")
###### Infos for clusters ######
# Merge clusters into data frame
mdc = pd.DataFrame(columns = ["fitness", "precision", "generalizability",
"simplicity", "sound", "narcs", "ntrans",
"nplaces", "nvariants", "mostfreq"])
for cluster in log_path.grp.unique().tolist():
mdc = pd.concat([mdc, pn_infos(log_path, "grp", cluster)])
mdc = mdc.sort_index()
# Export
mdc.to_csv("results/haum/pn_infos_clusters.csv", sep = ";")
###### Process maps for clusters ######
for cluster in log_path.grp.unique().tolist():
subdata = log_path[log_path.grp == cluster]
subnet, subim, subfm = pm4py.discover_petri_net_inductive(subdata)
pm4py.save_vis_petri_net(subnet, subim, subfm,
"results/processmaps/petrinet_cluster" + str(cluster).zfill(3) + ".png")
bpmn = pm4py.convert.convert_to_bpmn(subnet, subim, subfm)
pm4py.vis.save_vis_bpmn(bpmn, "results/processmaps/bpmn_cluster_" + str(cluster).zfill(3) + ".png")

code/pm_infos-items.py Normal file (+54 lines)

@@ -0,0 +1,54 @@
%reset
import pm4py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from python_helpers import eval_pm, pn_infos
###### Load data and create event logs ######
dat = pd.read_csv("results/haum/event_logfiles_2024-01-18_09-58-52.csv", sep = ";")
dat = dat[dat["date.start"] < "2020-03-13"]
dat = dat[dat["path"] != 106098] # exclude broken trace
# --> only pre corona (before artworks were updated)
log_path = pm4py.format_dataframe(dat, case_id = "path", activity_key = "event",
timestamp_key = "date.start")
###### Infos for items ######
mdi = pd.DataFrame(columns = ["fitness", "precision", "generalizability",
"simplicity", "sound", "narcs", "ntrans",
"nplaces", "nvariants", "mostfreq"])
for item in log_path.item.unique().tolist():
mdi = pd.concat([mdi, pn_infos(log_path, "item", item)])
mdi = mdi.sort_index()
# Export
mdi.to_csv("results/haum/pn_infos_items.csv", sep = ";")
# datitem = dat.groupby("item")[["duration", "distance",
# "scaleSize", "rotationDegree"]].mean()
#
# def length_path(data):
# x = data.path
# return len(x.unique())
# def length_case(data):
# x = data.case
# return len(x.unique())
# def length_topic(data):
# x = data.topic.dropna()
# return len(x.unique())
#
# datitem["npaths"] = dat.groupby(["item"]).apply(length_path)
# datitem["ncases"] = dat.groupby(["item"]).apply(length_case)
# datitem["ntopics"] = dat.groupby(["item"]).apply(length_topic)
#
# datitem.index = datitem.index.astype(str).str.rjust(3, "0")
# datitem = datitem.sort_index()
# datitem.index = mdi.index
#
# datitem = pd.concat([mdi, datitem], axis = 1)

@@ -10,9 +10,9 @@ parameters = {pn_visualizer.Variants.FREQUENCY.value.Parameters.FORMAT: "png"}
###### Load data and create event logs ######
dat = pd.read_csv("results/haum/event_logfiles_2024-01-02_19-44-50.csv", sep = ";")
dat = pd.read_csv("results/haum/event_logfiles_2024-01-18_09-58-52.csv", sep = ";")
dat = dat[dat["date.start"] < "2020-03-13"]
dat = dat[dat["path"] != 81621] # exclude broken trace
dat = dat[dat["path"] != 106098] # exclude broken trace
# --> only pre corona (before artworks were updated)
event_log = pm4py.format_dataframe(dat, case_id='case', activity_key='event',
@@ -26,32 +26,101 @@ pm4py.view_dfg(dfg, start_activities, end_activities)
#filtered_log = pm4py.filter_event_attribute_values(event_log, 'item', [80])
i_net, im, fm = pm4py.discover_petri_net_inductive(event_log)
pm4py.vis.view_petri_net(i_net, im, fm)
gviz = pn_visualizer.apply(i_net, im, fm, parameters=parameters,
net, im, fm = pm4py.discover_petri_net_inductive(event_log)
pm4py.vis.view_petri_net(net, im, fm)
pm4py.vis.view_petri_net(net, im, fm)
gviz = pn_visualizer.apply(net, im, fm, parameters=parameters,
variant=pn_visualizer.Variants.FREQUENCY,
log=event_log)
pn_visualizer.view(gviz)
len(i_net.places)
len(i_net.transitions)
len(i_net.arcs)
bpmn = pm4py.convert.convert_to_bpmn(net, im, fm)
pm4py.vis.view_bpmn(bpmn)
a_net, im, fm = pm4py.discover_petri_net_alpha(event_log)
pm4py.vis.view_petri_net(a_net, im, fm)
gviz = pn_visualizer.apply(a_net, im, fm, parameters=parameters,
variant=pn_visualizer.Variants.FREQUENCY,
log=event_log)
pn_visualizer.view(gviz)
net2, im2, fm2 = pm4py.discover_petri_net_inductive(event_log, noise_threshold=0.1)
pm4py.vis.view_petri_net(net2, im2, fm2)
def eval_pm(data, net, initial_marking, final_marking):
"""Caculate fitness, precision, generalizability, and simplicity for petri net"""
fitness = pm4py.fitness_token_based_replay(data, net, initial_marking, final_marking)
precisison = pm4py.precision_token_based_replay(data, net, initial_marking, final_marking)
#generalizability = pm4py.algo.evaluation.generalization.algorithm.apply(data, net,
# initial_marking, final_marking)
simplicity = pm4py.algo.evaluation.simplicity.algorithm.apply(net)
#return [fitness['average_trace_fitness'], precisison, generalizability, simplicity]
return [fitness['average_trace_fitness'], precisison, simplicity]
eval = eval_pm(event_log, net, im, fm)
eval2 = eval_pm(event_log, net2, im2, fm2)
len(net.places)
len(net.transitions)
len(net.arcs)
# Number of cases
len(event_log.case.unique())
# Number of variants
variants = pm4py.get_variants(event_log)
len(variants)
sorted_variants = dict(sorted(variants.items(), key=lambda item: item[1], reverse = True))
{k: sorted_variants[k] for k in list(sorted_variants)[:20]}
filtered_log = event_log[event_log["event"] != "move"]
variants_no_move = pm4py.get_variants(filtered_log)
len(variants_no_move)
sorted_variants_no_move = dict(sorted(variants_no_move.items(), key=lambda item: item[1], reverse = True))
{k: sorted_variants_no_move[k] for k in list(sorted_variants_no_move)[:20]}
len(a_net.places)
len(a_net.transitions)
len(a_net.arcs)
h_net, im, fm = pm4py.discover_petri_net_heuristics(filtered_log)
pm4py.vis.view_petri_net(h_net, im, fm)
len(h_net.places)
len(h_net.transitions)
len(h_net.arcs)
###### Navigation behavior for case ######
log_case = pm4py.format_dataframe(dat, case_id = "case", activity_key = "item",
timestamp_key = "date.start")
log_case = log_case.merge(tmp, on = "item", how = "left")
#filtered_log = pm4py.filter_event_attribute_values(log_case, "kcluster", [3])
filtered_log = log_case[log_case.hcluster == 1]
net, im, fm = pm4py.discover_dfg(filtered_log)
pm4py.vis.view_dfg(net, im, fm)
net, im, fm = pm4py.discover_petri_net_inductive(filtered_log)
pm4py.vis.view_petri_net(net, im, fm)
tree = pm4py.discovery.discover_process_tree_inductive(filtered_log)
pm4py.vis.view_process_tree(tree)
datcase = dat[~dat.duplicated(["case", "path", "item"])]
datcase = datcase[["case", "path", "event", "item", "date.start"]]
datcase = datcase.reset_index().drop("index", axis = 1)
#datcase = pd.concat([datcase, pd.get_dummies(datcase["item"], dtype = "int")], axis = 1)
datcase["duration"] = dat.groupby("path")["duration"].mean().tolist()
datcase["distance"] = dat.groupby("path")["distance"].mean().tolist()
datcase["scaleSize"] = dat.groupby("path")["scaleSize"].mean().tolist()
datcase["rotationDegree"] = dat.groupby("path")["rotationDegree"].mean().tolist()
datcase["item"] = [str(item).zfill(3) for item in datcase.item]
datcase = datcase.merge(xy[["item", "hcluster"]], on = "item", how = "left")
log_case = pm4py.format_dataframe(dat, case_id = "case", activity_key = "item",
timestamp_key = "date.start")
net, im, fm = pm4py.discover_dfg(log_case)
pm4py.vis.view_dfg(net, im, fm)
# not sure whether this will ever finish
net, im, fm = pm4py.discover_dfg(log_case[log_case.hcluster == 1])
pm4py.vis.view_dfg(net, im, fm)

code/python_helpers.py Normal file (+37 lines)

@@ -0,0 +1,37 @@
import pm4py
import pandas as pd
###### Extract metadata for petri nets on filtered logs ######
def eval_pm(data, net, initial_marking, final_marking):
"""Caculate fitness, precision, generalizability, and simplicity for petri net"""
fitness = pm4py.fitness_token_based_replay(data, net, initial_marking, final_marking)
precisison = pm4py.precision_token_based_replay(data, net, initial_marking, final_marking)
generalizability = pm4py.algo.evaluation.generalization.algorithm.apply(data, net,
initial_marking, final_marking)
simplicity = pm4py.algo.evaluation.simplicity.algorithm.apply(net)
return [fitness['average_trace_fitness'], precisison, generalizability, simplicity]
def pn_infos(log, colname, filter):
"""Create data frame with relevant infos for petri nets on filtered logs"""
filtered_log = pm4py.filter_event_attribute_values(log, colname, [filter])
net, im, fm = pm4py.discover_petri_net_inductive(filtered_log)
eval = eval_pm(filtered_log, net, im, fm)
is_sound = pm4py.check_soundness(net, im, fm)
eval.append(is_sound[0])
eval.append(len(net.arcs))
eval.append(len(net.transitions))
eval.append(len(net.places))
variants = pm4py.get_variants(filtered_log)
eval.append(len(variants))
sorted_variants = dict(sorted(variants.items(), key=lambda item: item[1], reverse = True))
eval.append({k: sorted_variants[k] for k in list(sorted_variants)[:1]})
eval = pd.DataFrame(eval).T
eval.columns = ["fitness", "precision", "generalizability", "simplicity",
"sound", "narcs", "ntrans", "nplaces", "nvariants", "mostfreq"]
eval.index = [str(filter).zfill(3)]
return eval
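# Usage sketch (mirrors pm_infos-items.py; assumes `dat` is the event logfile data
# frame read by the calling script and that the log carries an "item" attribute;
# the two item ids are only examples taken from the analysis scripts):
#
#   import pm4py
#   import pandas as pd
#   from python_helpers import pn_infos
#   log_path = pm4py.format_dataframe(dat, case_id = "path", activity_key = "event",
#                                     timestamp_key = "date.start")
#   infos = pd.concat([pn_infos(log_path, "item", i) for i in ["054", "080"]])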

@@ -39,3 +39,136 @@ plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of clusters")
plt.ylabel("SSE")
plt.show()
### TMP
datitem = dat.groupby("item")[["duration", "distance",
"scaleSize", "rotationDegree"]].mean()
def length_path(data):
x = data.path
return len(x.unique())
def length_case(data):
x = data.case
return len(x.unique())
def length_topic(data):
x = data.topic.dropna()
return len(x.unique())
datitem["npaths"] = dat.groupby(["item"]).apply(length_path)
datitem["ncases"] = dat.groupby(["item"]).apply(length_case)
datitem["ntopics"] = dat.groupby(["item"]).apply(length_topic)
datitem.index = datitem.index.astype(str).str.rjust(3, "0")
datitem = datitem.sort_index()
datitem.index = mdi.index
datitem = pd.concat([mdi, datitem], axis = 1)
###### Find clusters ######
myseed = 1420
mat = datitem.drop(["fitness", "sound", "mostfreq"], axis = 1)
mat = StandardScaler().fit_transform(mat)
xy = pd.DataFrame(MDS(normalized_stress = 'auto', random_state = myseed).fit_transform(mat))
xy.index = datitem.index
### K-Means clustering ###
kmeans = KMeans(n_clusters = 6, max_iter = 1000, random_state = myseed).fit(mat)
xy["kcluster"] = kmeans.labels_
for i in xy.kcluster.unique():
plt.scatter(xy[xy.kcluster == i].iloc[:,0], xy[xy.kcluster == i].iloc[:,1], label = i)
for j, txt in enumerate(xy.index[xy.kcluster == i]):
plt.annotate(txt.split("_")[1], (xy[xy.kcluster == i].iloc[j,0], xy[xy.kcluster == i].iloc[j,1]))
plt.legend()
plt.show()
xy.kcluster.value_counts()
# Scree plot
sse = {}
for k in range(1, 10):
kmeans = KMeans(n_clusters = k, max_iter = 1000).fit(mat)
sse[k] = kmeans.inertia_ # Inertia: Sum of distances of samples to their closest cluster center
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of clusters")
plt.ylabel("SSE")
plt.show()
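# Optional cross-check of the number of clusters (sketch, not in the original
# script): average silhouette width over a comparable range of k; higher is better.
from sklearn.metrics import silhouette_score
sil = {}
for k in range(2, 10):
    labels = KMeans(n_clusters = k, max_iter = 1000, random_state = myseed).fit(mat).labels_
    sil[k] = silhouette_score(mat, labels)
plt.figure()
plt.plot(list(sil.keys()), list(sil.values()))
plt.xlabel("Number of clusters")
plt.ylabel("Average silhouette width")
plt.show()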
c0_items = xy[xy.kcluster == 0].index
c1_items = xy[xy.kcluster == 1].index
c2_items = xy[xy.kcluster == 2].index
c3_items = xy[xy.kcluster == 3].index
c4_items = xy[xy.kcluster == 4].index
c5_items = xy[xy.kcluster == 5].index
### Hierarchical clustering ###
from sklearn.cluster import AgglomerativeClustering
hclust = AgglomerativeClustering(n_clusters = 6).fit(mat)
hclust.labels_
xy["hcluster"] = hclust.labels_
for i in xy.hcluster.unique():
plt.scatter(xy[xy.hcluster == i].iloc[:,0], xy[xy.hcluster == i].iloc[:,1], label = i)
for j, txt in enumerate(xy.index[xy.hcluster == i]):
plt.annotate(txt.split("_")[1], (xy[xy.hcluster == i].iloc[j,0], xy[xy.hcluster == i].iloc[j,1]))
plt.legend()
plt.show()
# dendrogram
from scipy.cluster.hierarchy import dendrogram
def plot_dendrogram(model, **kwargs):
# Create linkage matrix and then plot the dendrogram
# create the counts of samples under each node
counts = np.zeros(model.children_.shape[0])
n_samples = len(model.labels_)
for i, merge in enumerate(model.children_):
current_count = 0
for child_idx in merge:
if child_idx < n_samples:
current_count += 1 # leaf node
else:
current_count += counts[child_idx - n_samples]
counts[i] = current_count
linkage_matrix = np.column_stack(
[model.children_, model.distances_, counts]
).astype(float)
# Plot the corresponding dendrogram
dendrogram(linkage_matrix, **kwargs)
hclust = AgglomerativeClustering(distance_threshold = 0, n_clusters = None).fit(mat)
plot_dendrogram(hclust)
plt.show()
### Bisecting K-Means clustering ###
from sklearn.cluster import BisectingKMeans
biKmeans = BisectingKMeans(n_clusters = 6, random_state = myseed).fit(mat)
biKmeans.labels_
xy["bcluster"] = biKmeans.labels_
for i in xy.bcluster.unique():
plt.scatter(xy[xy.bcluster == i].iloc[:,0], xy[xy.bcluster == i].iloc[:,1], label = i)
for j, txt in enumerate(xy.index[xy.bcluster == i]):
plt.annotate(txt.split("_")[1], (xy[xy.bcluster == i].iloc[j,0], xy[xy.bcluster == i].iloc[j,1]))
plt.legend()
plt.show()