From 76af2916864df732f7c0d2353ecafa77e9324043 Mon Sep 17 00:00:00 2001 From: nwickel Date: Tue, 30 Jan 2024 11:48:48 +0100 Subject: [PATCH] Went over clustering and helper scripts --- code/04_infos-items.py | 41 ++---- code/05_item-clustering.R | 41 ++++-- code/06_infos-clusters.py | 17 +-- code/check-traces.R | 281 +++----------------------------------- code/python_helpers.py | 2 +- 5 files changed, 66 insertions(+), 316 deletions(-) diff --git a/code/04_infos-items.py b/code/04_infos-items.py index 3ff97fe..901167d 100644 --- a/code/04_infos-items.py +++ b/code/04_infos-items.py @@ -1,9 +1,6 @@ -#%reset - import pm4py import pandas as pd import numpy as np -import matplotlib.pyplot as plt from python_helpers import eval_pm, pn_infos @@ -11,43 +8,21 @@ from python_helpers import eval_pm, pn_infos dat = pd.read_csv("results/haum/event_logfiles_2024-01-18_09-58-52.csv", sep = ";") dat = dat[dat["date.start"] < "2020-03-13"] -dat = dat[dat["path"] != 106098] # exclude broken trace # --> only pre corona (before artworks were updated) +dat = dat[dat["path"] != 106098] +# exclude broken trace log_path = pm4py.format_dataframe(dat, case_id = "path", activity_key = "event", timestamp_key = "date.start") ###### Infos for items ###### -mdi = pd.DataFrame(columns = ["fitness", "precision", "generalizability", - "simplicity", "sound", "narcs", "ntrans", - "nplaces", "nvariants", "mostfreq"]) +eval = pd.DataFrame(columns = ["fitness", "precision", "generalizability", + "simplicity", "sound", "narcs", "ntrans", + "nplaces", "nvariants", "mostfreq"]) for item in log_path.item.unique().tolist(): - mdi = pd.concat([mdi, pn_infos(log_path, "item", item)]) -mdi = mdi.sort_index() + eval = pd.concat([eval, pn_infos(log_path, "item", item)]) +eval = eval.sort_index() # Export -mdi.to_csv("results/haum/pn_infos_items.csv", sep = ";") - -# datitem = dat.groupby("item")[["duration", "distance", -# "scaleSize", "rotationDegree"]].mean() -# -# def length_path(data): -# x = data.path -# return len(x.unique()) -# def length_case(data): -# x = data.case -# return len(x.unique()) -# def length_topic(data): -# x = data.topic.dropna() -# return len(x.unique()) -# -# datitem["npaths"] = dat.groupby(["item"]).apply(length_path) -# datitem["ncases"] = dat.groupby(["item"]).apply(length_case) -# datitem["ntopics"] = dat.groupby(["item"]).apply(length_topic) -# -# datitem.index = datitem.index.astype(str).str.rjust(3, "0") -# datitem = datitem.sort_index() -# datitem.index = mdi.index -# -# datitem = pd.concat([mdi, datitem], yaxis = 1) +eval.to_csv("results/haum/pn_infos_items.csv", sep = ";") diff --git a/code/05_item-clustering.R b/code/05_item-clustering.R index 54b0438..d901a61 100644 --- a/code/05_item-clustering.R +++ b/code/05_item-clustering.R @@ -1,5 +1,24 @@ +# 05_item-clustering.R +# +# content: (1) Read data +# (1.1) Read log event data +# (1.2) Read infos for PM for infos +# (1.3) Extract additional infos for clustering +# (2) Clustering +# (3) Visualization with pictures +# +# input: results/haum/event_logfiles_2024-01-18_09-58-52.csv +# results/haum/pn_infos_items.csv +# output: results/haum/event_logfiles_pre-corona_with-clusters.csv +# +# last mod: 2024-01-30 + + # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") +library(bupaverse) +library(factoextra) + #--------------- (1) Read data --------------- #--------------- (1.1) Read log event data --------------- @@ -46,10 +65,8 @@ mat <- dist(df) hc <- hclust(mat, method = "ward.D2") -library(factoextra) -fviz_dend(hc, cex = 0.5) - -datitem$grp <- cutree(hc, k = 6) +grp <- cutree(hc, k = 6) +datitem$grp <- grp fviz_dend(hc, k = 6, cex = 0.5, @@ -70,19 +87,25 @@ p aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths, ncases, ntopics) ~ grp, datitem, mean) -datitem$item <- gsub("item_([0-9]{3})", "\\1", row.names(datitem)) +datitem$item <- sprintf("%03d", + as.numeric(gsub("item_([0-9]{3})", "\\1", row.names(datitem)))) res <- merge(dat, datitem[, c("item", "grp")], by = "item", all.x = TRUE) res <- res[order(res$fileId.start, res$date.start, res$timeMs.start), ] +# Look at clusters +vioplot::vioplot(duration ~ grp, res) +vioplot::vioplot(distance ~ grp, res) +vioplot::vioplot(scaleSize ~ grp, res) +vioplot::vioplot(rotationDegree ~ grp, res) + write.table(res, file = "results/haum/event_logfiles_pre-corona_with-clusters.csv", sep = ";", quote = FALSE, row.names = FALSE) -library(bupaverse) - +# DFGs for clusters res$start <- res$date.start res$complete <- res$date.stop @@ -95,9 +118,9 @@ for (cluster in sort(unique(res$grp))) { timestamps = c("start", "complete")) dfg <- process_map(alog, - type_nodes = frequency("relative"), + type_nodes = frequency("relative", color_scale = "Greys"), sec_nodes = frequency("absolute"), - type_edges = frequency("relative"), + type_edges = frequency("relative", color_edges = "#FF6900"), sec_edges = frequency("absolute"), rankdir = "LR", render = FALSE) diff --git a/code/06_infos-clusters.py b/code/06_infos-clusters.py index 6e2e85c..994ec2c 100644 --- a/code/06_infos-clusters.py +++ b/code/06_infos-clusters.py @@ -1,9 +1,5 @@ -%reset - import pm4py import pandas as pd -import numpy as np -import matplotlib.pyplot as plt from python_helpers import eval_pm, pn_infos @@ -17,15 +13,14 @@ log_path = pm4py.format_dataframe(dat, case_id = "path", activity_key = "event", ###### Infos for clusters ###### # Merge clusters into data frame -mdc = pd.DataFrame(columns = ["fitness", "precision", "generalizability", - "simplicity", "sound", "narcs", "ntrans", - "nplaces", "nvariants", "mostfreq"]) +eval = pd.DataFrame(columns = ["fitness", "precision", "generalizability", + "simplicity", "sound", "narcs", "ntrans", + "nplaces", "nvariants", "mostfreq"]) for cluster in log_path.grp.unique().tolist(): - mdc = pd.concat([mdc, pn_infos(log_path, "grp", cluster)]) -mdc = mdc.sort_index() + eval = pd.concat([eval, pn_infos(log_path, "grp", cluster)]) +eval = eval.sort_index() -# Export -mdc.to_csv("results/haum/pn_infos_clusters.csv", sep = ";") +eval.to_csv("results/haum/pn_infos_clusters.csv", sep = ";") ###### Process maps for clusters ###### diff --git a/code/check-traces.R b/code/check-traces.R index da2aea6..8665c53 100644 --- a/code/check-traces.R +++ b/code/check-traces.R @@ -2,153 +2,32 @@ # setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") -library(bupaverse) +#--------------- (1) Look at broken trace --------------- -# Read data - -dat0 <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv", - colClasses = c("character", "character", "POSIXct", - "POSIXct", "character", "integer", - "numeric", "character", "character", - rep("numeric", 3), "character", - "character", rep("numeric", 11), - "character", "character"), - sep = ";", header = TRUE) - -dat0$event <- factor(dat0$event, levels = c("move", "flipCard", "openTopic", - "openPopup")) - -dat0$weekdays <- factor(weekdays(dat0$date.start), - levels = c("Montag", "Dienstag", "Mittwoch", - "Donnerstag", "Freitag", "Samstag", - "Sonntag"), - labels = c("Monday", "Tuesday", "Wednesday", - "Thursday", "Friday", "Saturday", - "Sunday")) - -# Select data pre Corona -dat <- dat0[as.Date(dat0$date.start) < "2020-03-13", ] -dat <- dat[dat$path != 106098, ] +datraw <- read.table("results/haum/raw_logfiles_2024-01-18_09-58-52.csv", + header = TRUE, sep = ";") -table(table(dat$start)) +datlogs <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv", + colClasses = c("character", "character", "POSIXct", + "POSIXct", "character", "integer", + "numeric", "character", "character", + rep("numeric", 3), "character", + "character", rep("numeric", 11), + "character", "character"), + sep = ";", header = TRUE) -table(dat$event) -proportions(table(dat$event)) +artwork <- "176" +fileId <- c('2017_06_16-13_49_00.log', '2017_06_16-13_59_00.log') +path <- 106098 -dat_dur <- aggregate(duration ~ item, dat, mean) -barplot(duration - mean(dat_dur$duration) ~ item, dat_dur, col = "#434F4F", - las = 3) +datraw[datraw$item == artwork & datraw$fileId %in% fileId, ] +datlogs[datlogs$path == path, ] -# Investigate paths (will separate items and give clusters of artworks!) -length(unique(dat$path)) -# DFGs per Cluster -dat$start <- dat$date.start -dat$complete <- dat$date.stop +#--------------- (2) Function to find broken traces --------------- -summary(aggregate(duration ~ path, dat, mean)) - -alog <- activitylog(dat, - case_id = "path", - activity_id = "event", - resource_id = "item", - timestamps = c("start", "complete")) - -process_map(alog, - type_nodes = frequency("absolute"), - sec_nodes = frequency("relative"), - type_edges = frequency("absolute"), - sec_edges = frequency("relative"), - rankdir = "LR") - -### Separate for items - -datitem <- aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ - item, dat, function(x) mean(x, na.rm = TRUE), na.action = NULL) -datitem$npaths <- aggregate(path ~ item, dat, - function(x) length(unique(x)), - na.action = NULL)$path -datitem$ncases <- aggregate(case ~ item, dat, - function(x) length(unique(x)), - na.action = NULL)$case -datitem$ntopics <- aggregate(topic ~ item, dat, - function(x) ifelse(all(is.na(x)), NA, length(unique(na.omit(x)))), - na.action = NULL)$topic - - -set.seed(1211) - -nclusters <- 6 -k1 <- kmeans(datitem[, -1], nclusters) - -#colors <- c("#3CB4DC", "#78004B", "#91C86E", "#FF6900") - -colors <- palette.colors(palette = "Okabe-Ito") - -xy <- cmdscale(dist(datitem[, -1])) - -plot(xy, type = "n") -text(xy[,1], xy[,2], datitem$item, col = colors[k1$cluster]) -legend("topright", paste("Cluster", 1:nclusters), col = colors, lty = 1) - -## Scree plot - -ks <- 1:10 - -sse <- NULL -for (k in ks) sse <- c(sse, kmeans(datitem[, -1], k)$tot.withinss) - -plot(sse ~ ks, type = "l") - - -datitem$cluster <- k1$cluster - -datitem_agg <- aggregate(. ~ cluster, datitem[, -1], mean) - - -dat_cl <- merge(dat, datitem[, c("item", "cluster")], by = "item", all.x = TRUE) -dat_cl <- dat_cl[order(dat_cl$fileId.start, dat_cl$date.start, dat_cl$timeMs.start), ] - -write.table(dat_cl, "results/haum/event_logfiles_with-clusters_kmeans.csv", - sep = ";", row.names = FALSE) - -vioplot::vioplot(datitem$duration) - -vioplot::vioplot(duration ~ item, dat, las = 3) - -vioplot::vioplot(duration ~ cluster, dat_cl) -vioplot::vioplot(distance ~ cluster, dat_cl) -vioplot::vioplot(scaleSize ~ cluster, dat_cl) -vioplot::vioplot(rotationDegree ~ cluster, dat_cl) - - - -for (cluster in sort(unique(dat_cl$cluster))) { - - alog <- activitylog(dat_cl[dat_cl$cluster == cluster, ], - case_id = "path", - activity_id = "event", - resource_id = "item", - timestamps = c("start", "complete")) - - dfg <- process_map(alog, - type_nodes = frequency("relative"), - sec_nodes = frequency("absolute"), - type_edges = frequency("relative"), - sec_edges = frequency("absolute"), - rankdir = "LR", - render = FALSE) - export_map(dfg, - file_name = paste0("results/processmaps/dfg_cluster", cluster, "_R.pdf"), - file_type = "pdf", - title = paste("DFG Cluster", cluster)) - - -} - - -tmp <- dat[dat$event != "move", ] +tmp <- datlogs[datlogs$event != "move", ] check_traces <- function(data) { @@ -170,127 +49,5 @@ check_traces <- function(data) { check <- check_traces(tmp) -sum(check$check) - - - - - -alog <- activitylog(dat, - case_id = "case", - activity_id = "item", - resource_id = "path", - timestamps = c("start", "complete")) - -process_map(alog, - type_nodes = frequency("absolute"), - sec_nodes = frequency("relative"), - type_edges = frequency("absolute"), - sec_edges = frequency("relative"), - rankdir = "LR") - - - -datcase <- dat[!duplicated(dat[, c("case", "path", "item")]), - c("case", "path", "event", "item")] -datcase$duration <- aggregate(duration ~ path, dat, - function(x) mean(x, na.rm = TRUE), na.action = NULL)$duration -datcase$distance <- aggregate(distance ~ path, dat, - function(x) mean(x, na.rm = TRUE), na.action = NULL)$distance -datcase$scaleSize <- aggregate(scaleSize ~ path, dat, - function(x) mean(x, na.rm = TRUE), na.action = NULL)$scaleSize -datcase$rotationDegree <- aggregate(rotationDegree ~ path, dat, - function(x) mean(x, na.rm = TRUE), na.action = NULL)$rotationDegree -# datcase$ntopics <- aggregate(topic ~ path, dat, -# function(x) ifelse(all(is.na(x)), NA, length(unique(na.omit(x)))), -# na.action = NULL)$topic -datcase$move <- ifelse(datcase$event == "move", 1, 0) -# paths that start with move - -for (item in sort(unique(datcase$item))) { - datcase[paste0("item_", item)] <- ifelse(datcase$item == item, 1, 0) -} - -mat <- na.omit(datcase[, -c(1:4)]) - - -set.seed(1610) - -nclusters <- 6 -k1 <- kmeans(mat, nclusters) - -#colors <- c("#3CB4DC", "#78004B", "#91C86E", "#FF6900") - -colors <- palette.colors(palette = "Okabe-Ito")[1:nclusters] - -library(distances) -mat_dist <- distances(mat) - -xy <- cmdscale(mat_dist) - -plot(xy, type = "n") -text(xy[,1], xy[,2], datcase$path, col = colors[k1$cluster]) -legend("topright", paste("Cluster", 1:nclusters), col = colors, lty = 1) - -## Scree plot - -ks <- 1:10 - -sse <- NULL -for (k in ks) sse <- c(sse, kmeans(datitem[, -1], k)$tot.withinss) - -plot(sse ~ ks, type = "l") - - - - - - - - -alog <- activitylog(datcase, - case_id = "case", - activity_id = "item", - resource_id = "path", - timestamps = c("start", "complete")) - -process_map(alog, - type_nodes = frequency("relative"), - sec_nodes = frequency("absolute"), - type_edges = frequency("relative"), - sec_edges = frequency("absolute"), - rankdir = "LR") - - - - - -# setwd("C:/Users/nwickelmaier/Nextcloud/Documents/MDS/2023ss/60100_master_thesis/analysis/code") - -datraw <- read.table("results/haum/raw_logfiles_2024-01-18_09-58-52.csv", - header = TRUE, sep = ";") - - -# Read data - -datlogs <- read.table("results/haum/event_logfiles_2024-01-18_09-58-52.csv", - colClasses = c("character", "character", "POSIXct", - "POSIXct", "character", "integer", - "numeric", "character", "character", - rep("numeric", 3), "character", - "character", rep("numeric", 11), - "character", "character"), - sep = ";", header = TRUE) - -datlogs <- datlogs[order(datlogs$fileId.start, datlogs$date.start, datlogs$timeMs.start), ] - -artwork <- "176" -fileId <- c('2017_06_16-13_49_00.log', '2017_06_16-13_59_00.log') -path <- 106098 - -datraw[datraw$item == artwork & datraw$fileId %in% fileId, ] - -datlogs[datlogs$path == path, ] - - +check[check$check, ] diff --git a/code/python_helpers.py b/code/python_helpers.py index 07e510d..e3a31be 100644 --- a/code/python_helpers.py +++ b/code/python_helpers.py @@ -19,7 +19,7 @@ def pn_infos(log, colname, filter): net, im, fm = pm4py.discover_petri_net_inductive(filtered_log) - eval = eval_append(log, net, im, fm) + eval = eval_append(filtered_log, net, im, fm) eval.index = [str(filter).zfill(3)] return eval