Script dump after trying out a hundred things; needs serious cleaning
parent b4ca4dd5eb
commit e8aac63504
@ -54,7 +54,7 @@ mat1 <- dat[, c("year", "duration1", "topicNumber1", "distance1", "scaleSize1",
 paste0("A", unique(dat$artwork)), "flipCard", "move", "openTopic",
 "openPopup")]

-library(cluster) # for hiereachical clustering
+library(cluster) # for hierarchical clustering

 k1 <- kmeans(mat1, 2)
 dat$kcluster <- k1$cluster
@ -1,8 +1,10 @@
-# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/code")
+# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")

+library(bupaverse)

 # Read data

-dat <- read.table("results/haum/event_logfiles_2024-01-02_19-44-50.csv",
+dat0 <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv",
                    colClasses = c("character", "character", "POSIXct",
                                   "POSIXct", "character", "integer",
                                   "numeric", "character", "character",
@ -11,10 +13,10 @@ dat <- read.table("results/haum/event_logfiles_2024-01-02_19-44-50.csv",
                                   "character", "character"),
                    sep = ";", header = TRUE)

-dat$event <- factor(dat$event, levels = c("move", "flipCard", "openTopic",
+dat0$event <- factor(dat0$event, levels = c("move", "flipCard", "openTopic",
                                            "openPopup"))

-dat$weekdays <- factor(weekdays(dat$date.start),
+dat0$weekdays <- factor(weekdays(dat0$date.start),
                        levels = c("Montag", "Dienstag", "Mittwoch",
                                   "Donnerstag", "Freitag", "Samstag",
                                   "Sonntag"),
@ -23,98 +25,26 @@ dat$weekdays <- factor(weekdays(dat$date.start),
                                  "Sunday"))

 # Select data pre Corona
-dat <- dat[as.Date(dat$date.start) < "2020-03-13", ]
+dat <- dat0[as.Date(dat0$date.start) < "2020-03-13", ]
-dat <- dat[dat["path"] != 81621, ]
+dat <- dat[dat$path != 106098, ]

+table(table(dat$start))

 table(dat$event)
 proportions(table(dat$event))

+dat_dur <- aggregate(duration ~ item, dat, mean)
+barplot(duration - mean(dat_dur$duration) ~ item, dat_dur, col = "#434F4F",
+        las = 3)

 # Investigate paths (will separate items and give clusters of artworks!)
 length(unique(dat$path))
+# DFGs per Cluster

-datpath <- aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~
-                     path, dat, function(x) mean(x, na.rm = TRUE), na.action = NULL)
-
-datpath$length <- aggregate(item ~ path, dat, length)$item
-datpath$nitems <- aggregate(item ~ path, dat, function(x)
-                  length(unique(x)), na.action = NULL)$item
-datpath$ntopics <- aggregate(topic ~ path, dat,
-                   function(x) ifelse(all(is.na(x)), NA, length(unique(na.omit(x)))),
-                   na.action = NULL)$topic
-
-datpath$vacation <- aggregate(vacation ~ path, dat,
-                    function(x) ifelse(all(is.na(x)), 0, 1),
-                    na.action = NULL)$vacation
-datpath$holiday <- aggregate(holiday ~ path, dat,
-                   function(x) ifelse(all(is.na(x)), 0, 1),
-                   na.action = NULL)$holiday
-datpath$weekend <- aggregate(weekdays ~ path, dat,
-                   function(x) ifelse(any(x %in% c("Saturday", "Sunday")), 1, 0),
-                   na.action = NULL)$weekdays
-datpath$morning <- aggregate(date.start ~ path, dat,
-                   function(x) ifelse(lubridate::hour(x[1]) > 13, 0, 1),
-                   na.action = NULL)$date.start
-
-# Investigate cases (= interactions per time intervall)
-length(unique(dat$case))
-
-datcase <- aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~
-           case, dat, function(x) mean(x, na.rm = TRUE), na.action = NULL)
-
-datcase$length <- aggregate(item ~ case, dat, length)$item
-datcase$nitems <- aggregate(item ~ case, dat, function(x)
-                  length(unique(x)), na.action = NULL)$item
-datcase$ntopics <- aggregate(topic ~ case, dat,
-                   function(x) ifelse(all(is.na(x)), NA, length(unique(na.omit(x)))),
-                   na.action = NULL)$topic
-
-datcase$vacation <- aggregate(vacation ~ case, dat,
-                    function(x) ifelse(all(is.na(x)), 0, 1),
-                    na.action = NULL)$vacation
-datcase$holiday <- aggregate(holiday ~ case, dat,
-                   function(x) ifelse(all(is.na(x)), 0, 1),
-                   na.action = NULL)$holiday
-datcase$weekend <- aggregate(weekdays ~ case, dat,
-                   function(x) ifelse(any(x %in% c("Saturday", "Sunday")), 1, 0),
-                   na.action = NULL)$weekdays
-datcase$morning <- aggregate(date.start ~ case, dat,
-                   function(x) ifelse(lubridate::hour(x[1]) > 13, 0, 1),
-                   na.action = NULL)$date.start
-
-# Paths with more than one case associated
-tmp <- aggregate(case ~ path, dat, function(x) length(unique(x)))
-sum(tmp$case > 1)
-table(tmp$case)
-
-dat$date <- as.Date(dat$date.start)
-
-tmp <- aggregate(date ~ path, dat, function(x) length(unique(x)))
-sum(tmp$date > 1)
-table(tmp$date)
-tmp[tmp$date > 1, ]
-
-for (p in tmp$path[tmp$date > 1]) {
-  print(dat[dat$path == p, 3:9])
-  cat("\n\n")
-}
-
-dat[dat$date == "2017-02-28" & dat$item == "503", ]
-
-# Creating event logs
-
-library(bupaverse)

 dat$start <- dat$date.start
 dat$complete <- dat$date.stop

-table(table(dat$start))
-# --> hmm...

 summary(aggregate(duration ~ path, dat, mean))

 alog <- activitylog(dat,
@ -130,13 +60,127 @@ process_map(alog,
             sec_edges = frequency("relative"),
             rankdir = "LR")

+### Separate for items

-alog2 <- activitylog(dat,
-                     case_id = "case",
+datitem <- aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~
+           item, dat, function(x) mean(x, na.rm = TRUE), na.action = NULL)
+datitem$npaths <- aggregate(path ~ item, dat,
+                  function(x) length(unique(x)),
+                  na.action = NULL)$path
+datitem$ncases <- aggregate(case ~ item, dat,
+                  function(x) length(unique(x)),
+                  na.action = NULL)$case
+datitem$ntopics <- aggregate(topic ~ item, dat,
+                   function(x) ifelse(all(is.na(x)), NA, length(unique(na.omit(x)))),
+                   na.action = NULL)$topic
+
+set.seed(1211)
+
+nclusters <- 6
+k1 <- kmeans(datitem[, -1], nclusters)
+
+#colors <- c("#3CB4DC", "#78004B", "#91C86E", "#FF6900")
+colors <- palette.colors(palette = "Okabe-Ito")
+
+xy <- cmdscale(dist(datitem[, -1]))
+
+plot(xy, type = "n")
+text(xy[,1], xy[,2], datitem$item, col = colors[k1$cluster])
+legend("topright", paste("Cluster", 1:nclusters), col = colors, lty = 1)
+
+## Scree plot
+
+ks <- 1:10
+
+sse <- NULL
+for (k in ks) sse <- c(sse, kmeans(datitem[, -1], k)$tot.withinss)
+
+plot(sse ~ ks, type = "l")
+
+datitem$cluster <- k1$cluster
+
+datitem_agg <- aggregate(. ~ cluster, datitem[, -1], mean)
+
+dat_cl <- merge(dat, datitem[, c("item", "cluster")], by = "item", all.x = TRUE)
+dat_cl <- dat_cl[order(dat_cl$fileId.start, dat_cl$date.start, dat_cl$timeMs.start), ]
+
+write.table(dat_cl, "results/haum/event_logfiles_with-clusters_kmeans.csv",
+            sep = ";", row.names = FALSE)
+
+vioplot::vioplot(datitem$duration)
+
+vioplot::vioplot(duration ~ item, dat, las = 3)
+
+vioplot::vioplot(duration ~ cluster, dat_cl)
+vioplot::vioplot(distance ~ cluster, dat_cl)
+vioplot::vioplot(scaleSize ~ cluster, dat_cl)
+vioplot::vioplot(rotationDegree ~ cluster, dat_cl)
+
+for (cluster in sort(unique(dat_cl$cluster))) {
+
+  alog <- activitylog(dat_cl[dat_cl$cluster == cluster, ],
+                      case_id = "path",
                       activity_id = "event",
                       resource_id = "item",
                       timestamps = c("start", "complete"))
-process_map(alog2,
+
+  dfg <- process_map(alog,
+                     type_nodes = frequency("relative"),
+                     sec_nodes = frequency("absolute"),
+                     type_edges = frequency("relative"),
+                     sec_edges = frequency("absolute"),
+                     rankdir = "LR",
+                     render = FALSE)
+  export_map(dfg,
+             file_name = paste0("results/processmaps/dfg_cluster", cluster, "_R.pdf"),
+             file_type = "pdf",
+             title = paste("DFG Cluster", cluster))
+
+}
+
+tmp <- dat[dat$event != "move", ]
+
+check_traces <- function(data) {
+
+  datagg <- aggregate(event ~ path, data,
+                      function(x) ifelse("openPopup" %in% x, T, F))
+  paths <- datagg$path[datagg$event]
+  datcheck <- data[data$path %in% paths, c("path", "event")]
+  datcheck <- datcheck[!duplicated(datcheck), ]
+  datcheck <- datcheck[order(datcheck$path), ]
+
+  retval <- NULL
+  for (path in unique(datcheck$path)) {
+    check <- !all(as.character(datcheck$event[datcheck$path == path]) ==
+                  c("flipCard", "openTopic", "openPopup"))
+    retval <- rbind(retval, data.frame(path, check))
+  }
+  retval
+}
+
+check <- check_traces(tmp)
+
+sum(check$check)
+
+alog <- activitylog(dat,
+                    case_id = "case",
+                    activity_id = "item",
+                    resource_id = "path",
+                    timestamps = c("start", "complete"))
+
+process_map(alog,
             type_nodes = frequency("absolute"),
             sec_nodes = frequency("relative"),
             type_edges = frequency("absolute"),
@ -145,41 +189,73 @@ process_map(alog2,

-library(processanimateR)
+datcase <- dat[!duplicated(dat[, c("case", "path", "item")]),
+               c("case", "path", "event", "item")]
+datcase$duration <- aggregate(duration ~ path, dat,
+                    function(x) mean(x, na.rm = TRUE), na.action = NULL)$duration
+datcase$distance <- aggregate(distance ~ path, dat,
+                    function(x) mean(x, na.rm = TRUE), na.action = NULL)$distance
+datcase$scaleSize <- aggregate(scaleSize ~ path, dat,
+                     function(x) mean(x, na.rm = TRUE), na.action = NULL)$scaleSize
+datcase$rotationDegree <- aggregate(rotationDegree ~ path, dat,
+                          function(x) mean(x, na.rm = TRUE), na.action = NULL)$rotationDegree
+# datcase$ntopics <- aggregate(topic ~ path, dat,
+#                    function(x) ifelse(all(is.na(x)), NA, length(unique(na.omit(x)))),
+#                    na.action = NULL)$topic
+datcase$move <- ifelse(datcase$event == "move", 1, 0)
+# paths that start with move

-animate_process(to_eventlog(alog))
+for (item in sort(unique(datcase$item))) {
+  datcase[paste0("item_", item)] <- ifelse(datcase$item == item, 1, 0)
+}

-col_vector <- c("#7FC97F", "#BEAED4", "#FDC086", "#FFFF99", "#386CB0",
-                "#F0027F", "#BF5B17", "#666666", "#1B9E77", "#D95F02",
-                "#7570B3", "#E7298A", "#66A61E", "#E6AB02", "#A6761D",
-                "#666666", "#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C",
-                "#FB9A99", "#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6",
-                "#6A3D9A", "#FFFF99", "#B15928", "#FBB4AE", "#B3CDE3",
-                "#CCEBC5", "#DECBE4", "#FED9A6", "#FFFFCC", "#E5D8BD",
-                "#FDDAEC", "#F2F2F2", "#B3E2CD", "#FDCDAC", "#CBD5E8",
-                "#F4CAE4", "#E6F5C9", "#FFF2AE", "#F1E2CC", "#CCCCCC",
-                "#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00",
-                "#FFFF33", "#A65628", "#F781BF", "#999999", "#66C2A5",
-                "#FC8D62", "#8DA0CB", "#E78AC3", "#A6D854", "#FFD92F",
-                "#E5C494", "#B3B3B3", "#8DD3C7", "#FFFFB3", "#BEBADA",
-                "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5",
-                "#D9D9D9")
+mat <- na.omit(datcase[, -c(1:4)])

-animate_process(to_eventlog(alog), mode = "relative", jitter = 10, legend = "color",
-                mapping = token_aes(color = token_scale("artwork",
-                scale = "ordinal",
-                range = col_vector)))
-
-elog <- to_eventlog(alog)
-animate_process(elog[elog$artwork == "054", ])
-animate_process(elog[elog$artwork == "080", ])
-animate_process(elog[elog$artwork == "501", ])
-
-process_map(alog[alog$artwork == "054", ])
-
-animate_process(elog[elog$artwork %in% c("080", "054"), ],
-                mode = "relative", jitter = 10, legend = "color",
-                mapping = token_aes(color = token_scale("artwork",
-                scale = "ordinal",
-                range = c("black", "gray"))))
+set.seed(1610)
+
+nclusters <- 6
+k1 <- kmeans(mat, nclusters)
+
+#colors <- c("#3CB4DC", "#78004B", "#91C86E", "#FF6900")
+colors <- palette.colors(palette = "Okabe-Ito")[1:nclusters]
+
+library(distances)
+mat_dist <- distances(mat)
+
+xy <- cmdscale(mat_dist)
+
+plot(xy, type = "n")
+text(xy[,1], xy[,2], datcase$path, col = colors[k1$cluster])
+legend("topright", paste("Cluster", 1:nclusters), col = colors, lty = 1)
+
+## Scree plot
+
+ks <- 1:10
+
+sse <- NULL
+for (k in ks) sse <- c(sse, kmeans(datitem[, -1], k)$tot.withinss)
+
+plot(sse ~ ks, type = "l")
+
+alog <- activitylog(datcase,
+                    case_id = "case",
+                    activity_id = "item",
+                    resource_id = "path",
+                    timestamps = c("start", "complete"))
+
+process_map(alog,
+            type_nodes = frequency("relative"),
+            sec_nodes = frequency("absolute"),
+            type_edges = frequency("relative"),
+            sec_edges = frequency("absolute"),
+            rankdir = "LR")
code/check_broken_trace.R (new file, 28 lines)
@ -0,0 +1,28 @@
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")

datraw <- read.table("results/haum/raw_logfiles_2024-01-18_09-58-52.csv",
                     header = TRUE, sep = ";")

# Read data

datlogs <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv",
                      colClasses = c("character", "character", "POSIXct",
                                     "POSIXct", "character", "integer",
                                     "numeric", "character", "character",
                                     rep("numeric", 3), "character",
                                     "character", rep("numeric", 11),
                                     "character", "character"),
                      sep = ";", header = TRUE)

datlogs <- datlogs[order(datlogs$fileId.start, datlogs$date.start, datlogs$timeMs.start), ]

artwork <- "176"
fileId <- c('2017_06_16-13_49_00.log', '2017_06_16-13_59_00.log')
path <- 106098

datraw[datraw$item == artwork & datraw$fileId %in% fileId, ]

datlogs[datlogs$path == path, ]
code/item_clustering.R (new file, 158 lines)
@ -0,0 +1,158 @@
# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code")

#--------------- (1) Read data ---------------

#--------------- (1.1) Read log event data ---------------

dat0 <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv",
                   colClasses = c("character", "character", "POSIXct",
                                  "POSIXct", "character", "integer",
                                  "numeric", "character", "character",
                                  rep("numeric", 3), "character",
                                  "character", rep("numeric", 11),
                                  "character", "character"),
                   sep = ";", header = TRUE)
dat0$event <- factor(dat0$event, levels = c("move", "flipCard", "openTopic",
                                            "openPopup"))

# Select data pre Corona
dat <- dat0[as.Date(dat0$date.start) < "2020-03-13", ]
dat <- dat[dat$path != 106098, ]

#--------------- (1.2) Read infos for PM for items ---------------

datitem <- read.table("results/haum/pn_infos_items.csv", header = TRUE,
                      sep = ";", row.names = 1)

#--------------- (1.3) Extract additional infos for clustering ---------------

datitem$duration <- aggregate(duration ~ item, dat, mean)$duration
datitem$distance <- aggregate(distance ~ item, dat, mean)$distance
datitem$scaleSize <- aggregate(scaleSize ~ item, dat, mean)$scaleSize
datitem$rotationDegree <- aggregate(rotationDegree ~ item, dat, mean)$rotationDegree
datitem$npaths <- aggregate(path ~ item, dat, function(x) length(unique(x)))$path
datitem$ncases <- aggregate(case ~ item, dat, function(x) length(unique(x)))$case
datitem$ntopics <- aggregate(topic ~ item, dat, function(x) length(unique(x)))$topic
datitem$mostfreq_num <- as.numeric(gsub(".*: (.*)}", "\\1", datitem$mostfreq))

#--------------- (2) Clustering ---------------

df <- datitem[, c("precision", "generalizability", "nvariants",
                  "mostfreq_num", "duration", "distance", "scaleSize",
                  "rotationDegree", "npaths", "ncases", "ntopics")] |>
  scale()
mat <- dist(df)

hc <- hclust(mat, method = "ward.D2")

library(factoextra)
fviz_dend(hc, cex = 0.5)

datitem$grp <- cutree(hc, k = 6)

fviz_dend(hc, k = 6,
          cex = 0.5,
          k_colors = c("#78004B", "#000000", "#3CB4DC", "#91C86E",
                       "#FF6900", "#434F4F"),
          #type = "phylogenic",
          rect = TRUE
)

p <- fviz_cluster(list(data = df, cluster = grp),
                  palette = c("#78004B", "#000000", "#3CB4DC", "#91C86E",
                              "#FF6900", "#434F4F"),
                  ellipse.type = "convex",
                  repel = TRUE,
                  show.clust.cent = FALSE, ggtheme = theme_bw())
p

aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths,
                ncases, ntopics) ~ grp, datitem, mean)

datitem$item <- gsub("item_([0-9]{3})", "\\1", row.names(datitem))

res <- merge(dat, datitem[, c("item", "grp")], by = "item", all.x = TRUE)
res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ]

write.table(res,
            file = "results/haum/event_logfiles_pre-corona_with-clusters.csv",
            sep = ";",
            quote = FALSE,
            row.names = FALSE)

library(bupaverse)

res$start <- res$date.start
res$complete <- res$date.stop

for (cluster in sort(unique(res$grp))) {

  alog <- activitylog(res[res$grp == cluster, ],
                      case_id = "path",
                      activity_id = "event",
                      resource_id = "item",
                      timestamps = c("start", "complete"))

  dfg <- process_map(alog,
                     type_nodes = frequency("relative"),
                     sec_nodes = frequency("absolute"),
                     type_edges = frequency("relative"),
                     sec_edges = frequency("absolute"),
                     rankdir = "LR",
                     render = FALSE)
  export_map(dfg,
             file_name = paste0("results/processmaps/dfg_cluster", cluster, "_R.pdf"),
             file_type = "pdf",
             title = paste("DFG Cluster", cluster))

}

#--------------- (3) Visualization with pictures ---------------

library(png)
library(jpeg)
library(grid)

colors <- c("#78004B", "#000000", "#3CB4DC", "#91C86E", "#FF6900",
            "#434F4F")

#pdf("results/haum/figures/clustering_artworks.pdf", height = 8, width = 8, pointsize = 10)
png("results/haum/figures/clustering_artworks.png", units = "in", height = 8, width = 8, pointsize = 10, res = 300)

par(mai = c(.6,.6,.1,.1), mgp = c(2.4, 1, 0))

plot(y ~ x, p$data, type = "n", ylim = c(-3.2, 3), xlim = c(-4.7, 6.4))

for (item in sprintf("%03d", as.numeric(rownames(p$data)))) {

  if (item == "125") {
    pic <- readJPEG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
                           item, "/", item, ".jpg"))
  } else {
    pic <- readPNG(paste0("../data/haum/ContentEyevisit/eyevisit_cards_light/",
                          item, "/", item, ".png"))
  }

  img <- as.raster(pic[,,1:3])

  x <- p$data$x[sprintf("%03d", as.numeric(rownames(p$data))) == item]
  y <- p$data$y[sprintf("%03d", as.numeric(rownames(p$data))) == item]

  points(x, y,
         col = colors[p$data$cluster[sprintf("%03d", as.numeric(rownames(p$data))) == item]],
         cex = 9,
         pch = 15)

  rasterImage(img,
              xleft = x - .4,
              xright = x + .4,
              ybottom = y - .2,
              ytop = y + .2)

}

dev.off()
@ -1,4 +1,3 @@
-#%% # needed for shortcuts to run properly in VSCode *eyeroll*
 %reset

 import pm4py
@ -9,13 +8,12 @@ import matplotlib.pyplot as plt

 ###### Load data and create event logs ######

-dat = pd.read_csv("results/haum/event_logfiles_2024-01-02_19-44-50.csv", sep = ";")
+dat = pd.read_csv("results/haum/event_logfiles_2024-01-18_09-58-52.csv", sep = ";")
 dat = dat[dat["date.start"] < "2020-03-13"]
 # --> only pre corona (before artworks were updated)

 event_log = pm4py.format_dataframe(dat, case_id='path', activity_key='event',
                                    timestamp_key='date.start')
-event_log = event_log.rename(columns={'item': 'case:item'})

 ###### Descrptives of log data ######

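The format_dataframe call in this hunk only maps the script's own columns onto pm4py's standard event-log schema; a minimal sketch with a hypothetical two-row frame:

import pandas as pd
import pm4py

toy = pd.DataFrame({"path": [1, 1],
                    "event": ["flipCard", "openTopic"],
                    "date.start": pd.to_datetime(["2017-01-01 10:00", "2017-01-01 10:01"])})
elog = pm4py.format_dataframe(toy, case_id="path", activity_key="event",
                              timestamp_key="date.start")
# elog keeps the original columns and gains the standard ones the miners expect:
# case:concept:name (from path), concept:name (from event), time:timestamp (from date.start)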
@ -39,18 +37,6 @@ len(variants_no_move)
 sorted_variants_no_move = dict(sorted(variants_no_move.items(), key=lambda item: item[1], reverse = True))
 {k: sorted_variants_no_move[k] for k in list(sorted_variants_no_move)[:20]}

-# Path length
-event_log.path.value_counts()
-event_log.path.value_counts().mean()
-event_log.path.value_counts().median()
-event_log.path.value_counts().min()
-event_log.path.value_counts().max()
-
-plt.hist(event_log.path.value_counts(), bins=200)
-plt.show()
-
-# TODO: Do it again in R -- much smoother and more info, better plots

 ###### Read "conformative" Petri Net ######

 basenet, initial_marking, final_marking = pm4py.read_pnml("results/conformative_petrinet_con.pnml")
@ -58,8 +44,7 @@ basenet, initial_marking, final_marking = pm4py.read_pnml("results/conformative_petrinet_con.pnml")
 def eval_pm(data, net, initial_marking, final_marking):
     """Caculate fitness, precision, generalizability, and simplicity for petri net"""
     fitness = pm4py.fitness_token_based_replay(data, net, initial_marking, final_marking)
-    precisison = pm4py.precision_token_based_replay(data, net,
-                                                    initial_marking, final_marking)
+    precisison = pm4py.precision_token_based_replay(data, net, initial_marking, final_marking)
     generalizability = pm4py.algo.evaluation.generalization.algorithm.apply(data, net,
                                                                             initial_marking, final_marking)
     simplicity = pm4py.algo.evaluation.simplicity.algorithm.apply(net)
@ -80,75 +65,39 @@ for i in range(len(replayed_traces)):
     l3.append(replayed_traces[i]["reached_marking"])
     l4.append(replayed_traces[i]["transitions_with_problems"])

-np.mean(l1)
 set(l1)
-index_broken = l1.index(1)
-np.mean(l2)
-set(l2)
-l2.index(1)
+x1 = np.array(l1)
+index_broken = np.where(x1 == 1)[0].tolist()
 set(l3)
 l4.count([])

-l3[index_broken]
-l4[index_broken]
+[l3[i] for i in index_broken]
+[l4[i] for i in index_broken]

-replayed_traces[index_broken]
+broken_traces = [replayed_traces[i] for i in index_broken]

 event_log[event_log['@@case_index'] == index_broken].event
-event_log[event_log['@@case_index'] == index_broken].path
-event_log[event_log['@@case_index'] == index_broken].item
-event_log[event_log['@@case_index'] == index_broken]["fileId.start"]
-# --> logging error in file!
+event_log[event_log['@@case_index'] == index_broken].path.unique().tolist()
+event_log[event_log['@@case_index'] == index_broken].item.unique().tolist()
+event_log[event_log['@@case_index'] == index_broken]["fileId.start"].unique().tolist()
+# --> logging error in raw file

-from pm4py.algo.conformance.tokenreplay import algorithm as token_based_replay
-parameters_tbr = {token_based_replay.Variants.TOKEN_REPLAY.value.Parameters.DISABLE_VARIANTS: True, token_based_replay.Variants.TOKEN_REPLAY.value.Parameters.ENABLE_PLTR_FITNESS: True}
-replayed_traces, place_fitness, trans_fitness, unwanted_activities = token_based_replay.apply(event_log, basenet,
-                                                                                              initial_marking,
-                                                                                              final_marking,
-                                                                                              parameters=parameters_tbr)
-
-from pm4py.algo.conformance.tokenreplay.diagnostics import duration_diagnostics
-trans_diagnostics = duration_diagnostics.diagnose_from_trans_fitness(event_log, trans_fitness)
-for trans in trans_diagnostics:
-    print(trans, trans_diagnostics[trans])

 # Footprints
 from pm4py.algo.discovery.footprints import algorithm as footprints_discovery
-fp_log = footprints_discovery.apply(event_log, variant=footprints_discovery.Variants.ENTIRE_EVENT_LOG)
-
-fp_trace_by_trace = footprints_discovery.apply(event_log, variant=footprints_discovery.Variants.TRACE_BY_TRACE)
-
-fp_net = footprints_discovery.apply(basenet, initial_marking, final_marking)
-
 from pm4py.visualization.footprints import visualizer as fp_visualizer
+fp_log = footprints_discovery.apply(event_log, variant=footprints_discovery.Variants.ENTIRE_EVENT_LOG)
+fp_net = footprints_discovery.apply(basenet, initial_marking, final_marking)
 gviz = fp_visualizer.apply(fp_net, parameters={fp_visualizer.Variants.SINGLE.value.Parameters.FORMAT: "svg"})
 fp_visualizer.view(gviz)

-gviz = fp_visualizer.apply(fp_log, fp_net, parameters={fp_visualizer.Variants.COMPARISON.value.Parameters.FORMAT: "svg"})
-fp_visualizer.view(gviz)
-
-conf_fp = pm4py.conformance_diagnostics_footprints(fp_trace_by_trace, fp_net)
-
-from pm4py.algo.conformance.footprints import algorithm as fp_conformance
-conf_result = fp_conformance.apply(fp_log, fp_net, variant=fp_conformance.Variants.LOG_EXTENSIVE)
-
-from pm4py.algo.conformance.footprints.util import evaluation
-fitness = evaluation.fp_fitness(fp_log, fp_net, conf_result)
-precision = evaluation.fp_precision(fp_log, fp_net)
-
-# Skeleton
-from pm4py.algo.discovery.log_skeleton import algorithm as lsk_discovery
-skeleton = lsk_discovery.apply(event_log, parameters={lsk_discovery.Variants.CLASSIC.value.Parameters.NOISE_THRESHOLD: 0.0})
-
-from pm4py.algo.conformance.log_skeleton import algorithm as lsk_conformance
-conf_result = lsk_conformance.apply(event_log, skeleton)

 pm4py.vis.view_petri_net(basenet, initial_marking, final_marking)
 is_sound = pm4py.check_soundness(basenet, initial_marking, final_marking)
-is_sound[0]
-len(basenet.arcs)
-len(basenet.transitions)
-len(basenet.places)
+baseline_eval.append(is_sound[0])
+baseline_eval.append(len(basenet.arcs))
+baseline_eval.append(len(basenet.transitions))
+baseline_eval.append(len(basenet.places))

 efg_graph = pm4py.discover_eventually_follows_graph(event_log)

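The switch from l1.index(1) to np.where(...) above is what lets the script handle more than one broken trace: list.index stops at the first match, while np.where returns every matching position. A minimal sketch with toy values:

import numpy as np

l1 = [0, 0, 1, 0, 1]                      # remaining tokens per replayed trace (toy values)
l1.index(1)                               # -> 2, only the first broken trace
np.where(np.array(l1) == 1)[0].tolist()   # -> [2, 4], all broken traces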
@ -157,163 +106,135 @@ dfg, start_activities, end_activities = pm4py.discover_dfg(event_log)
 pm4py.view_dfg(dfg, start_activities, end_activities)
 pm4py.save_vis_dfg(dfg, start_activities, end_activities, '../figures/processmaps/dfg_complete.png')

-## Heuristics Miner
+## Fitting different miners

+### Heuristics Miner
 h_net, im, fm = pm4py.discover_petri_net_heuristics(event_log)
-pm4py.vis.view_petri_net(h_net, im, fm)
-pm4py.vis.save_vis_petri_net(h_net, im, fm, "../figures/processmaps/petrinet_heuristics_complete.png")
 h_eval = eval_pm(event_log, h_net, im, fm)

 is_sound = pm4py.check_soundness(h_net, im, fm)
-is_sound[0]
-len(h_net.arcs)
-len(h_net.transitions)
-len(h_net.places)
+h_eval.append(is_sound[0])
+h_eval.append(len(h_net.arcs))
+h_eval.append(len(h_net.transitions))
+h_eval.append(len(h_net.places))

-# decorated petri net
-from pm4py.visualization.petri_net import visualizer as pn_visualizer
-parameters = {pn_visualizer.Variants.FREQUENCY.value.Parameters.FORMAT: "png"}
-gviz = pn_visualizer.apply(h_net, im, fm, parameters=parameters, variant=pn_visualizer.Variants.FREQUENCY, log=event_log)
-pn_visualizer.save(gviz, "../figures/processmaps/petrinet_heuristics_complete_decorated.png")
-
-# convert to BPMN
-bpmn = pm4py.convert.convert_to_bpmn(h_net, im, fm)
-pm4py.vis.view_bpmn(bpmn)

 ## Alpha Miner
 a_net, im, fm = pm4py.discover_petri_net_alpha(event_log)
-pm4py.vis.view_petri_net(a_net, im, fm)
-pm4py.vis.save_vis_petri_net(a_net, im, fm, "../figures/processmaps/petrinet_alpha_complete.png")
 a_eval = eval_pm(event_log, a_net, im, fm)

 is_sound = pm4py.check_soundness(a_net, im, fm)
-is_sound[0]
-len(a_net.arcs)
-len(a_net.transitions)
-len(a_net.places)
+a_eval.append(is_sound[0])
+a_eval.append(len(a_net.arcs))
+a_eval.append(len(a_net.transitions))
+a_eval.append(len(a_net.places))

 ## Inductive Miner
 i_net, im, fm = pm4py.discover_petri_net_inductive(event_log)
-pm4py.vis.view_petri_net(i_net, im, fm)
-pm4py.vis.save_vis_petri_net(i_net, im, fm, "../figures/processmaps/petrinet_induction_complete.png")
 i_eval = eval_pm(event_log, i_net, im, fm)

-# as process tree (does not work for heuristics miner!)
-pt = pm4py.discover_process_tree_inductive(event_log)
-pm4py.vis.view_process_tree(pt)

 is_sound = pm4py.check_soundness(i_net, im, fm)
-is_sound[0]
-# TODO: Can I show that this simpler net does not include all traces? (Probably not,
-# since fitness is 1, but WHY?)
-
-len(i_net.arcs)
-len(i_net.transitions)
-len(i_net.places)
-
-bpmn = pm4py.convert.convert_to_bpmn(i_net, im, fm)
-pm4py.view_bpmn(bpmn)
-
-from pm4py.algo.conformance.tokenreplay import algorithm as token_based_replay
-parameters_tbr = {token_based_replay.Variants.TOKEN_REPLAY.value.Parameters.DISABLE_VARIANTS: True, token_based_replay.Variants.TOKEN_REPLAY.value.Parameters.ENABLE_PLTR_FITNESS: True}
-replayed_traces, place_fitness, trans_fitness, unwanted_activities = token_based_replay.apply(event_log, i_net,
-                                                                                              im,
-                                                                                              fm,
-                                                                                              parameters=parameters_tbr)
-
-l1 = list()
-l2 = list()
-l3 = list()
-l4 = list()
-for i in range(len(replayed_traces)):
-    l1.append(replayed_traces[i]["remaining_tokens"])
-    l2.append(replayed_traces[i]["missing_tokens"])
-    l3.append(replayed_traces[i]["reached_marking"])
-    l4.append(replayed_traces[i]["transitions_with_problems"])
-
-np.mean(l1)
-set(l1)
-index_broken = l1.index(1)
-np.mean(l2)
-set(l2)
-l2.index(1)
-set(l3)
-l4.count([])
-
-l3[index_broken]
-l4[index_broken]
-
-replayed_traces[index_broken]
-
-event_log[event_log['@@case_index'] == index_broken].event
-event_log[event_log['@@case_index'] == index_broken].path
-event_log[event_log['@@case_index'] == index_broken].item
-event_log[event_log['@@case_index'] == index_broken]["fileId.start"]
+i_eval.append(is_sound[0])
+i_eval.append(len(i_net.arcs))
+i_eval.append(len(i_net.transitions))
+i_eval.append(len(i_net.places))

 ## ILP Miner
 ilp_net, im, fm = pm4py.discover_petri_net_ilp(event_log)
-pm4py.vis.view_petri_net(ilp_net, im, fm)
-pm4py.vis.save_vis_petri_net(ilp_net, im, fm, "../figures/processmaps/petrinet_ilp_complete.png")
 ilp_eval = eval_pm(event_log, ilp_net, im, fm)

 is_sound = pm4py.check_soundness(ilp_net, im, fm)
-is_sound[0]
-len(ilp_net.arcs)
-len(ilp_net.transitions)
-len(ilp_net.places)
+ilp_eval.append(is_sound[0])
+ilp_eval.append(len(ilp_net.arcs))
+ilp_eval.append(len(ilp_net.transitions))
+ilp_eval.append(len(ilp_net.places))

 ## Export for all miners
 eval = pd.DataFrame(np.row_stack([baseline_eval, h_eval, a_eval, i_eval, ilp_eval]))
-eval.columns = ["fitness", "precision", "generalizability", "simplicity"]
+eval.columns = ["fitness", "precision", "generalizability", "simplicity",
+                "sound", "narcs", "ntrans", "nplaces"]
 eval.index = ["conformative", "heuristics", "alpha", "inductive", "ilp"]
 eval

-eval.to_csv("results/eval_all-miners_complete.csv", sep=";")
+eval.to_csv("results/eval_all-miners_complete.csv", sep=" ")

 ## Without broken trace
-event_log_clean = event_log[event_log['@@case_index'] != index_broken]
-h_net, a_im, h_fm = pm4py.discover_petri_net_heuristics(event_log_clean)
-a_net, h_im, a_fm = pm4py.discover_petri_net_alpha(event_log_clean)
+event_log_clean = event_log[event_log['@@case_index'] != index_broken[0]]
+h_net, h_im, h_fm = pm4py.discover_petri_net_heuristics(event_log_clean)
+a_net, a_im, a_fm = pm4py.discover_petri_net_alpha(event_log_clean)
 i_net, i_im, i_fm = pm4py.discover_petri_net_inductive(event_log_clean)
 ilp_net, ilp_im, ilp_fm = pm4py.discover_petri_net_ilp(event_log_clean)

 baseline_eval = eval_pm(event_log_clean, basenet, initial_marking, final_marking)
+is_sound = pm4py.check_soundness(basenet, initial_marking, final_marking)
+baseline_eval.append(is_sound[0])
+baseline_eval.append(len(basenet.arcs))
+baseline_eval.append(len(basenet.transitions))
+baseline_eval.append(len(basenet.places))

 h_eval = eval_pm(event_log_clean, h_net, h_im, h_fm)
+is_sound = pm4py.check_soundness(h_net, h_im, h_fm)
+h_eval.append(is_sound[0])
+h_eval.append(len(h_net.arcs))
+h_eval.append(len(h_net.transitions))
+h_eval.append(len(h_net.places))

 a_eval = eval_pm(event_log_clean, a_net, a_im, a_fm)
+is_sound = pm4py.check_soundness(a_net, a_im, a_fm)
+a_eval.append(is_sound[0])
+a_eval.append(len(a_net.arcs))
+a_eval.append(len(a_net.transitions))
+a_eval.append(len(a_net.places))

 i_eval = eval_pm(event_log_clean, i_net, i_im, i_fm)
+is_sound = pm4py.check_soundness(i_net, i_im, i_fm)
+i_eval.append(is_sound[0])
+i_eval.append(len(i_net.arcs))
+i_eval.append(len(i_net.transitions))
+i_eval.append(len(i_net.places))

 ilp_eval = eval_pm(event_log_clean, ilp_net, ilp_im, ilp_fm)
+is_sound = pm4py.check_soundness(ilp_net, ilp_im, ilp_fm)
+ilp_eval.append(is_sound[0])
+ilp_eval.append(len(ilp_net.arcs))
+ilp_eval.append(len(ilp_net.transitions))
+ilp_eval.append(len(ilp_net.places))

 eval = pd.DataFrame(np.row_stack([baseline_eval, h_eval, a_eval, i_eval, ilp_eval]))
-eval.columns = ["fitness", "precision", "generalizability", "simplicity"]
+eval.columns = ["fitness", "precision", "generalizability", "simplicity",
+                "sound", "narcs", "ntrans", "nplaces"]
 eval.index = ["conformative", "heuristics", "alpha", "inductive", "ilp"]
 eval

-eval.to_csv("results/eval_all-miners_clean.csv", sep=";")
+eval.to_csv("results/eval_all-miners_clean.csv", sep=" ")

+# Export petri nets
+pm4py.vis.save_vis_petri_net(h_net, h_im, h_fm, "results/processmaps/petrinet_heuristics_clean.png")
+pm4py.vis.save_vis_petri_net(a_net, a_im, a_fm, "results/processmaps/petrinet_alpha_clean.png")
+pm4py.vis.save_vis_petri_net(i_net, i_im, i_fm, "results/processmaps/petrinet_inductive_clean.png")
+pm4py.vis.save_vis_petri_net(ilp_net, ilp_im, ilp_fm, "results/processmaps/petrinet_ilp_clean.png")
+pm4py.vis.save_vis_petri_net(basenet, initial_marking, final_marking, "results/processmaps/petrinet_conformative.png")
+
+# convert to BPMN
+base_bpmn = pm4py.convert.convert_to_bpmn(basenet, initial_marking, final_marking)
+pm4py.vis.save_vis_bpmn(base_bpmn, "results/processmaps/bpmn_conformative.png")
+
+i_bpmn = pm4py.convert.convert_to_bpmn(i_net, i_im, i_fm)
+pm4py.vis.save_vis_bpmn(i_bpmn, "results/processmaps/bpmn_inductive_clean.png")
+
+ilp_bpmn = pm4py.convert.convert_to_bpmn(ilp_net, ilp_im, ilp_fm)
+pm4py.vis.save_vis_bpmn(ilp_bpmn, "results/processmaps/bpmn_ilp_clean.png")
+
+a_bpmn = pm4py.convert.convert_to_bpmn(a_net, a_im, a_fm)
+pm4py.vis.save_vis_bpmn(a_bpmn, "results/processmaps/bpmn_alpha_clean.png")
+
+h_bpmn = pm4py.convert.convert_to_bpmn(h_net, h_im, h_fm)
+pm4py.vis.save_vis_bpmn(h_bpmn, "results/processmaps/bpmn_heuristics_clean.png")

 ###### Process Mining - individual artworks ######

 def pm_artworks(miner):

-    retval1 = np.empty((len(event_log["case:artwork"].unique()), 4))
-    retval2 = np.empty((len(event_log["case:artwork"].unique()), 4))
+    retval1 = np.empty((len(event_log["item"].unique()), 4))
+    retval2 = np.empty((len(event_log["item"].unique()), 4))

-    if miner == "heuristics":
-        net, im, fm = pm4py.discover_petri_net_heuristics(event_log)
-    elif miner == "inductive":
-        net, im, fm = pm4py.discover_petri_net_inductive(event_log)
-    elif miner == "alpha":
-        net, im, fm = pm4py.discover_petri_net_alpha(event_log)
-    elif miner == "ilp":
-        net, im, fm = pm4py.discover_petri_net_ilp(event_log)
-
-    for i in range(len(event_log["case:artwork"].unique())):
-        artwork = event_log["case:artwork"].unique()[i]
-        subdata = pm4py.filter_event_attribute_values(event_log, "case:artwork",
+    for i in range(len(event_log["item"].unique())):
+        artwork = event_log["item"].unique()[i]
+        subdata = pm4py.filter_event_attribute_values(event_log, "item",
                                                       [artwork],
                                                       level="case", retain=True)
         if miner == "heuristics":
@ -325,17 +246,17 @@ def pm_artworks(miner):
         elif miner == "ilp":
             subnet, subim, subfm = pm4py.discover_petri_net_ilp(subdata)
         #pm4py.save_vis_petri_net(subnet, subim, subfm,
-        #    "../figures/processmaps/artworks/petrinet_" + miner + "_" + str(artwork).zfill(3) + ".png")
+        #    "results/processmaps/artworks/petrinet_" + miner + "_" + str(artwork).zfill(3) + ".png")
-        retval1[i] = eval_pm(subdata, net, im, fm)
+        retval1[i] = eval_pm(subdata, basenet, initial_marking, final_marking)
         retval2[i] = eval_pm(subdata, subnet, subim, subfm)

     retval1 = pd.DataFrame(retval1)
     retval1.columns = ["fitness", "precision", "generalizability", "simplicity"]
-    retval1.index = event_log["case:artwork"].unique()
+    retval1.index = event_log["item"].unique()
     retval1.insert(0, "nettype", "alldata")
     retval2 = pd.DataFrame(retval2)
     retval2.columns = ["fitness", "precision", "generalizability", "simplicity"]
-    retval2.index = event_log["case:artwork"].unique()
+    retval2.index = event_log["item"].unique()
     retval2.insert(0, "nettype", "subdata")
     return pd.concat([retval1, retval2])

@ -343,7 +264,3 @@ def pm_artworks(miner):
 for miner in ["heuristics", "inductive", "alpha", "ilp"]:
     eval_art = pm_artworks(miner = miner)
     eval_art.to_csv("results/eval_artworks_" + miner + ".csv", sep=";")

-eval_art = pm_artworks(miner = "inductive")

@ -138,15 +138,15 @@ final_marking = Marking()
 final_marking[sink] = 1

 pm4py.view_petri_net(net_seq, initial_marking, final_marking)
-pm4py.write_pnml(net_seq, initial_marking, final_marking, "results/conformative_petrinet_seq.pnml")
+pm4py.write_pnml(net_seq, initial_marking, final_marking, "results/haum/conformative_petrinet_seq.pnml")

 pm4py.vis.save_vis_petri_net(net_seq, initial_marking, final_marking,
-                             "../figures/conformative_petrinet_seq.png")
+                             "results/processmaps/conformative_petrinet_seq.png")

 bpmn = pm4py.convert.convert_to_bpmn(net_seq, initial_marking, final_marking)
 pm4py.view_bpmn(bpmn)

-pm4py.vis.save_vis_bpmn(bpmn, "../figures/conformative_bpmn_seq.png")
+pm4py.vis.save_vis_bpmn(bpmn, "results/processmaps/conformative_bpmn_seq.png")

 ## Concurrent net
@ -240,12 +240,12 @@ final_marking = Marking()
 final_marking[sink] = 1

 pm4py.view_petri_net(net_con, initial_marking, final_marking)
-pm4py.write_pnml(net_con, initial_marking, final_marking, "results/conformative_petrinet_con.pnml")
+pm4py.write_pnml(net_con, initial_marking, final_marking, "results/haum/conformative_petrinet_con.pnml")

 pm4py.vis.save_vis_petri_net(net_con, initial_marking, final_marking,
-                             "../figures/conformative_petrinet_con.png")
+                             "results/processmaps/conformative_petrinet_con.png")

 bpmn = pm4py.convert.convert_to_bpmn(net_con, initial_marking, final_marking)
 pm4py.view_bpmn(bpmn)

-pm4py.vis.save_vis_bpmn(bpmn, "../figures/conformative_bpmn_con.png")
+pm4py.vis.save_vis_bpmn(bpmn, "results/processmaps/conformative_bpmn_con.png")
code/pm_infos-clusters.py (new file, 38 lines)
@ -0,0 +1,38 @@
%reset

import pm4py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from python_helpers import eval_pm, pn_infos

###### Load data and create event logs ######

dat = pd.read_csv("results/haum/event_logfiles_pre-corona_with-clusters.csv", sep = ";")

log_path = pm4py.format_dataframe(dat, case_id = "path", activity_key = "event",
                                  timestamp_key = "date.start")

###### Infos for clusters ######

# Merge clusters into data frame
mdc = pd.DataFrame(columns = ["fitness", "precision", "generalizability",
                              "simplicity", "sound", "narcs", "ntrans",
                              "nplaces", "nvariants", "mostfreq"])
for cluster in log_path.grp.unique().tolist():
    mdc = pd.concat([mdc, pn_infos(log_path, "grp", cluster)])
mdc = mdc.sort_index()

# Export
mdc.to_csv("results/haum/pn_infos_clusters.csv", sep = ";")

###### Process maps for clusters ######

for cluster in log_path.grp.unique().tolist():
    subdata = log_path[log_path.grp == cluster]
    subnet, subim, subfm = pm4py.discover_petri_net_inductive(subdata)
    pm4py.save_vis_petri_net(subnet, subim, subfm,
                             "results/processmaps/petrinet_cluster" + str(cluster).zfill(3) + ".png")
    bpmn = pm4py.convert.convert_to_bpmn(subnet, subim, subfm)
    pm4py.vis.save_vis_bpmn(bpmn, "results/processmaps/bpmn_cluster_" + str(cluster).zfill(3) + ".png")
code/pm_infos-items.py (new file, 54 lines)
@ -0,0 +1,54 @@
%reset

import pm4py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from python_helpers import eval_pm, pn_infos

###### Load data and create event logs ######

dat = pd.read_csv("results/haum/event_logfiles_2024-01-18_09-58-52.csv", sep = ";")
dat = dat[dat["date.start"] < "2020-03-13"]
dat = dat[dat["path"] != 106098] # exclude broken trace
# --> only pre corona (before artworks were updated)

log_path = pm4py.format_dataframe(dat, case_id = "path", activity_key = "event",
                                  timestamp_key = "date.start")

###### Infos for items ######

mdi = pd.DataFrame(columns = ["fitness", "precision", "generalizability",
                              "simplicity", "sound", "narcs", "ntrans",
                              "nplaces", "nvariants", "mostfreq"])
for item in log_path.item.unique().tolist():
    mdi = pd.concat([mdi, pn_infos(log_path, "item", item)])
mdi = mdi.sort_index()

# Export
mdi.to_csv("results/haum/pn_infos_items.csv", sep = ";")

# datitem = dat.groupby("item")[["duration", "distance",
#                                "scaleSize", "rotationDegree"]].mean()
#
# def length_path(data):
#     x = data.path
#     return len(x.unique())
# def length_case(data):
#     x = data.case
#     return len(x.unique())
# def length_topic(data):
#     x = data.topic.dropna()
#     return len(x.unique())
#
# datitem["npaths"] = dat.groupby(["item"]).apply(length_path)
# datitem["ncases"] = dat.groupby(["item"]).apply(length_case)
# datitem["ntopics"] = dat.groupby(["item"]).apply(length_topic)
#
# datitem.index = datitem.index.astype(str).str.rjust(3, "0")
# datitem = datitem.sort_index()
# datitem.index = mdi.index
#
# datitem = pd.concat([mdi, datitem], yaxis = 1)
@ -10,9 +10,9 @@ parameters = {pn_visualizer.Variants.FREQUENCY.value.Parameters.FORMAT: "png"}

 ###### Load data and create event logs ######

-dat = pd.read_csv("results/haum/event_logfiles_2024-01-02_19-44-50.csv", sep = ";")
+dat = pd.read_csv("results/haum/event_logfiles_2024-01-18_09-58-52.csv", sep = ";")
 dat = dat[dat["date.start"] < "2020-03-13"]
-dat = dat[dat["path"] != 81621] # exclude broken trace
+dat = dat[dat["path"] != 106098] # exclude broken trace
 # --> only pre corona (before artworks were updated)

 event_log = pm4py.format_dataframe(dat, case_id='case', activity_key='event',
@ -26,32 +26,101 @@ pm4py.view_dfg(dfg, start_activities, end_activities)

#filtered_log = pm4py.filter_event_attribute_values(event_log, 'item', [80])

i_net, im, fm = pm4py.discover_petri_net_inductive(event_log)
net, im, fm = pm4py.discover_petri_net_inductive(event_log)
pm4py.vis.view_petri_net(i_net, im, fm)
pm4py.vis.view_petri_net(net, im, fm)

gviz = pn_visualizer.apply(i_net, im, fm, parameters=parameters,
gviz = pn_visualizer.apply(net, im, fm, parameters=parameters,
                           variant=pn_visualizer.Variants.FREQUENCY,
                           log=event_log)
pn_visualizer.view(gviz)

len(i_net.places)
len(i_net.transitions)
len(i_net.arcs)

bpmn = pm4py.convert.convert_to_bpmn(net, im, fm)
pm4py.vis.view_bpmn(bpmn)

a_net, im, fm = pm4py.discover_petri_net_alpha(event_log)
pm4py.vis.view_petri_net(a_net, im, fm)
gviz = pn_visualizer.apply(a_net, im, fm, parameters=parameters,
                           variant=pn_visualizer.Variants.FREQUENCY,
                           log=event_log)
pn_visualizer.view(gviz)

net2, im2, fm2 = pm4py.discover_petri_net_inductive(event_log, noise_threshold=0.1)
pm4py.vis.view_petri_net(net2, im2, fm2)

def eval_pm(data, net, initial_marking, final_marking):
    """Calculate fitness, precision, generalizability, and simplicity for petri net"""
    fitness = pm4py.fitness_token_based_replay(data, net, initial_marking, final_marking)
    precision = pm4py.precision_token_based_replay(data, net, initial_marking, final_marking)
    #generalizability = pm4py.algo.evaluation.generalization.algorithm.apply(data, net,
    #                                                                        initial_marking, final_marking)
    simplicity = pm4py.algo.evaluation.simplicity.algorithm.apply(net)
    #return [fitness['average_trace_fitness'], precision, generalizability, simplicity]
    return [fitness['average_trace_fitness'], precision, simplicity]

eval = eval_pm(event_log, net, im, fm)
eval2 = eval_pm(event_log, net2, im2, fm2)
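
# Sketch (assumption, not part of the original script): tabulate the two
# evaluations side by side; column names follow the return order of eval_pm().
eval_overview = pd.DataFrame([eval, eval2],
                             columns = ["fitness", "precision", "simplicity"],
                             index = ["inductive", "inductive_noise_0.1"])
print(eval_overview)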
len(net.places)
len(net.transitions)
len(net.arcs)

# Number of cases
len(event_log.case.unique())

# Number of variants
variants = pm4py.get_variants(event_log)
len(variants)

sorted_variants = dict(sorted(variants.items(), key=lambda item: item[1], reverse = True))
{k: sorted_variants[k] for k in list(sorted_variants)[:20]}

filtered_log = event_log[event_log["event"] != "move"]
variants_no_move = pm4py.get_variants(filtered_log)
len(variants_no_move)
sorted_variants_no_move = dict(sorted(variants_no_move.items(), key=lambda item: item[1], reverse = True))
{k: sorted_variants_no_move[k] for k in list(sorted_variants_no_move)[:20]}
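
# Sketch (assumption): how much of the log do the 20 most frequent variants
# cover? Treats the values of `variants` as counts, as the sorting above does.
total = sum(sorted_variants.values())
top20 = sum(sorted_variants[k] for k in list(sorted_variants)[:20])
print(f"Top 20 variants cover {top20 / total:.1%} of all cases")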
len(a_net.places)
len(a_net.transitions)
len(a_net.arcs)

h_net, im, fm = pm4py.discover_petri_net_heuristics(filtered_log)
pm4py.vis.view_petri_net(h_net, im, fm)

len(h_net.places)
len(h_net.transitions)
len(h_net.arcs)

###### Navigation behavior for case ######

log_case = pm4py.format_dataframe(dat, case_id = "case", activity_key = "item",
                                  timestamp_key = "date.start")
log_case = log_case.merge(tmp, on = "item", how = "left")

#filtered_log = pm4py.filter_event_attribute_values(log_case, "kcluster", [3])
filtered_log = log_case[log_case.hcluster == 1]

net, im, fm = pm4py.discover_dfg(filtered_log)
pm4py.vis.view_dfg(net, im, fm)

net, im, fm = pm4py.discover_petri_net_inductive(filtered_log)
pm4py.vis.view_petri_net(net, im, fm)

tree = pm4py.discovery.discover_process_tree_inductive(filtered_log)
pm4py.vis.view_process_tree(tree)

datcase = dat[~dat.duplicated(["case", "path", "item"])]
datcase = datcase[["case", "path", "event", "item", "date.start"]]
datcase = datcase.reset_index().drop("index", axis = 1)
#datcase = pd.concat([datcase, pd.get_dummies(datcase["item"], dtype = "int")], axis = 1)

datcase["duration"] = dat.groupby("path")["duration"].mean().tolist()
datcase["distance"] = dat.groupby("path")["distance"].mean().tolist()
datcase["scaleSize"] = dat.groupby("path")["scaleSize"].mean().tolist()
datcase["rotationDegree"] = dat.groupby("path")["rotationDegree"].mean().tolist()

datcase["item"] = [str(item).zfill(3) for item in datcase.item]
datcase = datcase.merge(xy[["item", "hcluster"]], on = "item", how = "left")

log_case = pm4py.format_dataframe(dat, case_id = "case", activity_key = "item",
                                  timestamp_key = "date.start")

net, im, fm = pm4py.discover_dfg(log_case)
pm4py.vis.view_dfg(net, im, fm)
# don't know if this will eventually finish?

net, im, fm = pm4py.discover_dfg(log_case[log_case.hcluster == 1])
pm4py.vis.view_dfg(net, im, fm)
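
# Sketch (assumption): if the DFG on the full log_case is too slow or cluttered
# (see the comment above), restricting the log to its most frequent variants
# first should help; assumes pm4py.filter_variants_top_k() is available in the
# installed pm4py version.
log_case_top = pm4py.filter_variants_top_k(log_case, 50)
net, im, fm = pm4py.discover_dfg(log_case_top)
pm4py.vis.view_dfg(net, im, fm)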
37 code/python_helpers.py Normal file
@ -0,0 +1,37 @@
import pm4py
import pandas as pd

###### Extract metadata for petri nets on filtered logs ######

def eval_pm(data, net, initial_marking, final_marking):
    """Calculate fitness, precision, generalizability, and simplicity for petri net"""
    fitness = pm4py.fitness_token_based_replay(data, net, initial_marking, final_marking)
    precision = pm4py.precision_token_based_replay(data, net, initial_marking, final_marking)
    generalizability = pm4py.algo.evaluation.generalization.algorithm.apply(data, net,
                                                                            initial_marking, final_marking)
    simplicity = pm4py.algo.evaluation.simplicity.algorithm.apply(net)
    return [fitness['average_trace_fitness'], precision, generalizability, simplicity]


def pn_infos(log, colname, filter):
    """Create data frame with relevant infos for petri nets on filtered logs"""
    filtered_log = pm4py.filter_event_attribute_values(log, colname, [filter])

    net, im, fm = pm4py.discover_petri_net_inductive(filtered_log)
    eval = eval_pm(filtered_log, net, im, fm)
    is_sound = pm4py.check_soundness(net, im, fm)
    eval.append(is_sound[0])
    eval.append(len(net.arcs))
    eval.append(len(net.transitions))
    eval.append(len(net.places))
    variants = pm4py.get_variants(filtered_log)
    eval.append(len(variants))

    sorted_variants = dict(sorted(variants.items(), key=lambda item: item[1], reverse = True))
    eval.append({k: sorted_variants[k] for k in list(sorted_variants)[:1]})

    eval = pd.DataFrame(eval).T
    eval.columns = ["fitness", "precision", "generalizability", "simplicity",
                    "sound", "narcs", "ntrans", "nplaces", "nvariants", "mostfreq"]
    eval.index = [str(filter).zfill(3)]
    return eval
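
# Usage sketch (assumption, mirrors the call pattern in the analysis script):
# build one row per item and stack the rows into a single overview data frame.
#
#   mdi = pd.concat([pn_infos(log_path, "item", item)
#                    for item in log_path.item.unique().tolist()])
#   mdi = mdi.sort_index()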
@ -39,3 +39,136 @@ plt.plot(list(sse.keys()), list(sse.values()))

plt.xlabel("Number of clusters")
plt.ylabel("SSE")
plt.show()


### TMP

datitem = dat.groupby("item")[["duration", "distance",
                               "scaleSize", "rotationDegree"]].mean()

def length_path(data):
    x = data.path
    return len(x.unique())

def length_case(data):
    x = data.case
    return len(x.unique())

def length_topic(data):
    x = data.topic.dropna()
    return len(x.unique())

datitem["npaths"] = dat.groupby(["item"]).apply(length_path)
datitem["ncases"] = dat.groupby(["item"]).apply(length_case)
datitem["ntopics"] = dat.groupby(["item"]).apply(length_topic)

datitem.index = datitem.index.astype(str).str.rjust(3, "0")
datitem = datitem.sort_index()
datitem.index = mdi.index

datitem = pd.concat([mdi, datitem], axis = 1)


###### Find clusters ######

myseed = 1420

mat = datitem.drop(["fitness", "sound", "mostfreq"], axis = 1)
mat = StandardScaler().fit_transform(mat)

xy = pd.DataFrame(MDS(normalized_stress = 'auto', random_state = myseed).fit_transform(mat))
xy.index = datitem.index
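
# Sketch (assumption): keep the fitted MDS object around to inspect its stress
# value before trusting the 2D embedding; `stress_` is sklearn's raw stress.
mds = MDS(normalized_stress = 'auto', random_state = myseed)
xy_check = mds.fit_transform(mat)
print(mds.stress_)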
### K-Means clustering ###

kmeans = KMeans(n_clusters = 6, max_iter = 1000, random_state = myseed).fit(mat)
xy["kcluster"] = kmeans.labels_

for i in xy.kcluster.unique():
    plt.scatter(xy[xy.kcluster == i].iloc[:,0], xy[xy.kcluster == i].iloc[:,1], label = i)
    for j, txt in enumerate(xy.index[xy.kcluster == i]):
        plt.annotate(txt.split("_")[1], (xy[xy.kcluster == i].iloc[j,0], xy[xy.kcluster == i].iloc[j,1]))
plt.legend()
plt.show()

xy.kcluster.value_counts()

# Scree plot
sse = {}
for k in range(1, 10):
    kmeans = KMeans(n_clusters = k, max_iter = 1000).fit(mat)
    sse[k] = kmeans.inertia_  # Inertia: Sum of distances of samples to their closest cluster center
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of clusters")
plt.ylabel("SSE")
plt.show()
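
# Sketch (assumption): complement the elbow plot with silhouette scores, which
# often make the choice of k less ambiguous than SSE alone.
from sklearn.metrics import silhouette_score
for k in range(2, 10):
    labels = KMeans(n_clusters = k, max_iter = 1000, random_state = myseed).fit(mat).labels_
    print(k, round(silhouette_score(mat, labels), 3))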
c0_items = xy[xy.kcluster == 0].index
c1_items = xy[xy.kcluster == 1].index
c2_items = xy[xy.kcluster == 2].index
c3_items = xy[xy.kcluster == 3].index
c4_items = xy[xy.kcluster == 4].index
c5_items = xy[xy.kcluster == 5].index


### Hierarchical clustering ###

from sklearn.cluster import AgglomerativeClustering

hclust = AgglomerativeClustering(n_clusters = 6).fit(mat)
hclust.labels_

xy["hcluster"] = hclust.labels_

for i in xy.hcluster.unique():
    plt.scatter(xy[xy.hcluster == i].iloc[:,0], xy[xy.hcluster == i].iloc[:,1], label = i)
    for j, txt in enumerate(xy.index[xy.hcluster == i]):
        plt.annotate(txt.split("_")[1], (xy[xy.hcluster == i].iloc[j,0], xy[xy.hcluster == i].iloc[j,1]))
plt.legend()
plt.show()

# dendrogram
from scipy.cluster.hierarchy import dendrogram

def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

hclust = AgglomerativeClustering(distance_threshold = 0, n_clusters = None).fit(mat)

plot_dendrogram(hclust)
plt.show()
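
# Sketch (assumption): the same kind of dendrogram can be produced directly with
# scipy, which avoids rebuilding the linkage matrix from the sklearn model by hand.
from scipy.cluster.hierarchy import linkage
Z = linkage(mat, method = "ward")
dendrogram(Z, labels = datitem.index.tolist())
plt.show()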
### Bisecting K-Means clustering ###

from sklearn.cluster import BisectingKMeans

biKmeans = BisectingKMeans(n_clusters = 6, random_state = myseed).fit(mat)
biKmeans.labels_

xy["bcluster"] = biKmeans.labels_

for i in xy.bcluster.unique():
    plt.scatter(xy[xy.bcluster == i].iloc[:,0], xy[xy.bcluster == i].iloc[:,1], label = i)
    for j, txt in enumerate(xy.index[xy.bcluster == i]):
        plt.annotate(txt.split("_")[1], (xy[xy.bcluster == i].iloc[j,0], xy[xy.bcluster == i].iloc[j,1]))
plt.legend()
plt.show()
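
# Sketch (assumption): quantify how much the three cluster solutions agree,
# using the adjusted Rand index (1 = identical partitions, ~0 = chance level).
from sklearn.metrics import adjusted_rand_score
print(adjusted_rand_score(xy.kcluster, xy.hcluster))
print(adjusted_rand_score(xy.kcluster, xy.bcluster))
print(adjusted_rand_score(xy.hcluster, xy.bcluster))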