From b29790dfc11ee2e13973e5aaba9682ff66926979 Mon Sep 17 00:00:00 2001 From: nwickel Date: Sat, 9 Mar 2024 17:22:46 +0100 Subject: [PATCH] Working on finalizing the clustering --- code/04_conformance-checking.py | 10 ++++---- code/05_check-traces.R | 31 ++++++++++++++++++++++-- code/07_item-clustering.R | 25 +++++++++++++++---- code/09_case-characteristics.R | 2 +- code/10_user-navigation.R | 43 +++++++++++++-------------------- 5 files changed, 72 insertions(+), 39 deletions(-) diff --git a/code/04_conformance-checking.py b/code/04_conformance-checking.py index 74ca403..2714195 100644 --- a/code/04_conformance-checking.py +++ b/code/04_conformance-checking.py @@ -103,11 +103,6 @@ fp_visualizer.view(gviz) efg_graph = pm4py.discover_eventually_follows_graph(event_log) -## Directly-follows graph -dfg, start_activities, end_activities = pm4py.discover_dfg(event_log) -pm4py.view_dfg(dfg, start_activities, end_activities) -pm4py.save_vis_dfg(dfg, start_activities, end_activities, "results/processmaps/dfg_complete_python.png") - ## Fitting different miners eval = pd.DataFrame(columns = ["fitness", "precision", "generalizability", @@ -131,6 +126,11 @@ for miner in ["conformative", "alpha", "heuristics", "inductive", "ilp"]: eval_clean.to_csv("results/eval_all-miners_clean.csv", sep = ";") +## Directly-follows graph +dfg, start_activities, end_activities = pm4py.discover_dfg(event_log_clean) +pm4py.view_dfg(dfg, start_activities, end_activities) +pm4py.save_vis_dfg(dfg, start_activities, end_activities, "results/processmaps/dfg_complete_python.png") + ## Export petri nets pm4py.vis.save_vis_petri_net(basenet, initial_marking, final_marking, "results/processmaps/petrinet_conformative.png") h_net, h_im, h_fm = pm4py.discover_petri_net_heuristics(event_log_clean) diff --git a/code/05_check-traces.R b/code/05_check-traces.R index 1f77f2a..2e37f7e 100644 --- a/code/05_check-traces.R +++ b/code/05_check-traces.R @@ -2,7 +2,8 @@ # # content: (1) Look at broken trace # (2) Function to find broken traces -# (3) Export data frame for analyses +# (3) DFG for complete data +# (4) Export data frame for analyses # # input: results/haum/event_logfiles_2024-02-21_16-07-33.csv # results/haum/raw_logfiles_2024-02-21_16-07-33.csv @@ -62,7 +63,33 @@ check <- check_traces(tmp) check[check$check, ] -#--------------- (3) Export data frame for analyses --------------- +#--------------- (3) DFG for complete data --------------- + +tmp <- datlogs[datlogs$path != 106098, ] +tmp$start <- tmp$date.start +tmp$complete <- tmp$date.stop + +alog <- bupaR::activitylog(tmp, + case_id = "path", + activity_id = "event", + resource_id = "item", + timestamps = c("start", "complete")) + +dfg <- processmapR::process_map(alog, + type_nodes = processmapR::frequency("relative", color_scale = "Greys"), + sec_nodes = processmapR::frequency("absolute"), + type_edges = processmapR::frequency("relative", color_edges = "#FF6900"), + sec_edges = processmapR::frequency("absolute"), + rankdir = "LR", + render = FALSE) + +processmapR::export_map(dfg, + file_name = paste0("results/processmaps/dfg_complete_R.pdf"), + file_type = "pdf") + +rm(tmp) + +#--------------- (4) Export data frame for analyses --------------- datlogs$event <- factor(datlogs$event, levels = c("move", "flipCard", "openTopic", diff --git a/code/07_item-clustering.R b/code/07_item-clustering.R index 8a3282d..717c134 100644 --- a/code/07_item-clustering.R +++ b/code/07_item-clustering.R @@ -10,6 +10,8 @@ # input: results/haum/eventlogs_pre-corona_cleaned.RData # results/haum/pn_infos_items.csv # output: results/haum/eventlogs_pre-corona_item-clusters.csv +# results/figures/dendrogram_items.pdf +# results/figures/clustering_items.pdf # results/figures/clustering_artworks.pdf # results/figures/clustering_artworks.png # @@ -85,7 +87,7 @@ factoextra::fviz_nbclust(df, FUNcluster = factoextra::hcut, method = "silhouette gap_stat <- cluster::clusGap(df, FUNcluster = factoextra::hcut, hc_func = "agnes", hc_method = "ward", - K.max = 10) + K.max = 15) factoextra::fviz_gap_stat(gap_stat) k <- 6 # number of clusters @@ -94,23 +96,36 @@ mycols <- c("#434F4F", "#78004B", "#FF6900", "#3CB4DC", "#91C86E", "Black") cluster <- cutree(hc, k = k) +pdf("results/figures/dendrogram_items.pdf", width = 6.5, height = 5.5, pointsize = 10) + factoextra::fviz_dend(hc, k = k, cex = 0.5, k_colors = mycols, #type = "phylogenic", - rect = TRUE + rect = TRUE, + main = "", + ylab = "" + #ggtheme = ggplot2::theme_bw() ) +dev.off() + +pdf("results/figures/clustering_items.pdf", width = 6.5, height = 5.5, pointsize = 10) + factoextra::fviz_cluster(list(data = df, cluster = cluster), palette = mycols, ellipse.type = "convex", repel = TRUE, show.clust.cent = FALSE, + main = "", ggtheme = ggplot2::theme_bw()) -aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths, - ncases, nmoves, nflipCard, nopenTopic, nopenPopup) ~ cluster, - datitem, mean) +dev.off() + +aggregate(cbind(precision, generalizability, nvariants, duration, distance, + scaleSize , rotationDegree, npaths, ncases, nmoves, + nflipCard, nopenTopic, nopenPopup) ~ cluster, datitem, + mean) aggregate(cbind(duration, distance, scaleSize , rotationDegree, npaths, ncases, nmoves, nflipCard, nopenTopic, nopenPopup) ~ cluster, diff --git a/code/09_case-characteristics.R b/code/09_case-characteristics.R index 7f84ee6..a786821 100644 --- a/code/09_case-characteristics.R +++ b/code/09_case-characteristics.R @@ -171,7 +171,7 @@ dattree$AvDurItemNorm <- normalize(dattree$AvDurItem) #--------------- (4) Export data frames --------------- -save(datcase, dattree, file = "results/haum/dataframes_case_2019.RData") +save(dat, datcase, dattree, file = "results/haum/dataframes_case_2019.RData") write.table(datcase, file = "results/haum/datcase.csv", diff --git a/code/10_user-navigation.R b/code/10_user-navigation.R index 72cfb35..a050730 100644 --- a/code/10_user-navigation.R +++ b/code/10_user-navigation.R @@ -27,25 +27,24 @@ summary(df) dist_mat <- cluster::daisy(df, metric = "gower") # "Flatten" with MDS -# coor_2d <- as.data.frame(cmdscale(dist_mat, k = 2)) # coor_3d <- as.data.frame(cmdscale(dist_mat, k = 3)) - -# coor_2d <- prcomp(df)$x[, 1:2] # coor_3d <- prcomp(df)$x[, 1:3] -coor_2d <- smacof::mds(dist_mat, ndim = 2, type = "ordinal")$conf -coor_3d <- smacof::mds(dist_mat, ndim = 2, type = "ordinal")$conf +coor_3d <- smacof::mds(dist_mat, ndim = 3, type = "ordinal")$conf +coor_2d <- coor_3d[, 1:2] plot(coor_2d) rgl::plot3d(coor_3d) -# method <- c(average = "average", single = "single", complete = "complete", -# ward = "ward") -# hcs <- pbapply::pblapply(method, function(x) cluster::agnes(dist_mat, method = x)) -# acs <- pbapply::pbsapply(hcs, function(x) x$ac) -# hc <- hcs$ward +method <- c(average = "average", single = "single", complete = "complete", + ward = "ward") +hcs <- pbapply::pblapply(method, function(x) cluster::agnes(dist_mat, method = x)) +acs <- pbapply::pbsapply(hcs, function(x) x$ac) +# average single complete ward +# 0.9881224 0.9725661 0.9937669 0.9994267 +hc <- hcs$ward -hc <- cluster::agnes(dist_mat, method = "ward") +#hc <- cluster::agnes(dist_mat, method = "ward") k <- 5 @@ -59,17 +58,7 @@ plot(coor_2d, col = mycols[cluster]) legend("topleft", paste("Cl", 1:4), col = mycols, pch = 21) rgl::plot3d(coor_3d, col = mycols[cluster]) -table(dattree[cluster == 1, "Pattern"]) -table(dattree[cluster == 2, "Pattern"]) -table(dattree[cluster == 3, "Pattern"]) -table(dattree[cluster == 4, "Pattern"]) -table(dattree[cluster == 5, "Pattern"]) - -table(dattree[cluster == 1, "InfocardOnly"]) -table(dattree[cluster == 2, "InfocardOnly"]) -table(dattree[cluster == 3, "InfocardOnly"]) -table(dattree[cluster == 4, "InfocardOnly"]) -table(dattree[cluster == 5, "InfocardOnly"]) +ftable(xtabs( ~ InfocardOnly + Pattern + cluster, dattree)) aggregate(. ~ cluster, df, mean) @@ -78,9 +67,6 @@ aggregate(cbind(duration, distance, scaleSize, rotationDegree, length, nitems, mean) ### Look at selected cases ########################################### - -load("") - tmp <- dat tmp$start <- tmp$date.start tmp$complete <- tmp$date.stop @@ -133,7 +119,9 @@ c1 <- rpart::rpart(as.factor(cluster) ~ ., data = dattree[, c("PropMoves", "InfocardOnly")], method = "class") +pdf("results/figures/tree_items_rpart.pdf", height = 5, width = 15, pointsize = 10) plot(partykit::as.party(c1)) +dev.off() # with conditional tree c2 <- partykit::ctree(as.factor(cluster) ~ ., data = dattree[, c("PropMoves", @@ -143,5 +131,8 @@ c2 <- partykit::ctree(as.factor(cluster) ~ ., data = dattree[, c("PropMoves", "Pattern", "InfocardOnly")], alpha = 0.001) -plot(c2) + +pdf("results/figures/tree_items_ctree.pdf", height = 7, width = 20, pointsize = 10) +plot(c2) +dev.off()