diff --git a/code/01_preprocessing_haum.R b/code/01_preprocessing_haum.R index 55ba519..a7feea8 100644 --- a/code/01_preprocessing_haum.R +++ b/code/01_preprocessing_haum.R @@ -41,7 +41,7 @@ write.table(datraw, paste0("results/haum/raw_logfiles_", now, ".csv"), datlogs <- create_eventlogs(datraw, #xmlpath = "../data/haum/ContentEyevisit/eyevisit_cards_light/", - glossar = FALSE) + glossar = FALSE, save = TRUE) # 2,136,694 no change moves removed # OLD: diff --git a/code/04_modeling_haum.R b/code/04_modeling_haum.R index 5124f4f..400f8e0 100644 --- a/code/04_modeling_haum.R +++ b/code/04_modeling_haum.R @@ -2,78 +2,148 @@ # Read data -dat0 <- read.table("results/haum/event_logfiles_2023-10-25_17-29-52.csv", +dat <- read.table("results/haum/event_logfiles_2024-01-02_19-44-50.csv", + colClasses = c("character", "character", "POSIXct", + "POSIXct", "character", "integer", + "numeric", "character", "character", + rep("numeric", 3), "character", + "character", rep("numeric", 11), + "character", "character"), sep = ";", header = TRUE) -dat0$date.start <- as.POSIXct(dat0$date.start) -dat0$date.stop <- as.POSIXct(dat0$date.stop) -dat0$artwork <- sprintf("%03d", dat0$artwork) -table(dat0[!duplicated(dat0$trace), "event"]) +dat$event <- factor(dat$event, levels = c("move", "flipCard", "openTopic", + "openPopup")) -proportions(table(dat0[!duplicated(dat0$trace), "event"])) +dat$weekdays <- factor(weekdays(dat$date.start), + levels = c("Montag", "Dienstag", "Mittwoch", + "Donnerstag", "Freitag", "Samstag", + "Sonntag"), + labels = c("Monday", "Tuesday", "Wednesday", + "Thursday", "Friday", "Saturday", + "Sunday")) -tmp <- dat0[!duplicated(dat0$trace) & dat0$event %in% c("openTopic", - "openPopup"), ] +# Select data pre Corona +dat <- dat[as.Date(dat$date.start) < "2020-03-13", ] +dat <- dat[dat["path"] != 81621, ] -dat <- dat0 -i <- 1 -stop <- 1 +table(dat$event) +proportions(table(dat$event)) -while (stop > 0) { - stop <- sum(!duplicated(dat$trace) & dat$event %in% 
c("openTopic", "openPopup")) - dat <- dat[!(!duplicated(dat$trace) & - dat$event %in% c("openTopic", "openPopup")), ] - print(i) - i <- i + 1 - print(table(dat[!duplicated(dat$trace), "event"])) +# Investigate paths (will separate items and give clusters of artworks!) +length(unique(dat$path)) + +datpath <- aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ + path, dat, function(x) mean(x, na.rm = TRUE), na.action = NULL) + +datpath$length <- aggregate(item ~ path, dat, length)$item +datpath$nitems <- aggregate(item ~ path, dat, function(x) + length(unique(x)), na.action = NULL)$item +datpath$ntopics <- aggregate(topic ~ path, dat, + function(x) ifelse(all(is.na(x)), NA, length(unique(na.omit(x)))), + na.action = NULL)$topic + +datpath$vacation <- aggregate(vacation ~ path, dat, + function(x) ifelse(all(is.na(x)), 0, 1), + na.action = NULL)$vacation +datpath$holiday <- aggregate(holiday ~ path, dat, + function(x) ifelse(all(is.na(x)), 0, 1), + na.action = NULL)$holiday +datpath$weekend <- aggregate(weekdays ~ path, dat, + function(x) ifelse(any(x %in% c("Saturday", "Sunday")), 1, 0), + na.action = NULL)$weekdays +datpath$morning <- aggregate(date.start ~ path, dat, + function(x) ifelse(lubridate::hour(x[1]) > 13, 0, 1), + na.action = NULL)$date.start + + +# Investigate cases (= interactions per time intervall) +length(unique(dat$case)) + +datcase <- aggregate(cbind(duration, distance, scaleSize, rotationDegree) ~ + case, dat, function(x) mean(x, na.rm = TRUE), na.action = NULL) + +datcase$length <- aggregate(item ~ case, dat, length)$item +datcase$nitems <- aggregate(item ~ case, dat, function(x) + length(unique(x)), na.action = NULL)$item +datcase$ntopics <- aggregate(topic ~ case, dat, + function(x) ifelse(all(is.na(x)), NA, length(unique(na.omit(x)))), + na.action = NULL)$topic + +datcase$vacation <- aggregate(vacation ~ case, dat, + function(x) ifelse(all(is.na(x)), 0, 1), + na.action = NULL)$vacation +datcase$holiday <- aggregate(holiday ~ case, 
dat, + function(x) ifelse(all(is.na(x)), 0, 1), + na.action = NULL)$holiday +datcase$weekend <- aggregate(weekdays ~ case, dat, + function(x) ifelse(any(x %in% c("Saturday", "Sunday")), 1, 0), + na.action = NULL)$weekdays +datcase$morning <- aggregate(date.start ~ case, dat, + function(x) ifelse(lubridate::hour(x[1]) > 13, 0, 1), + na.action = NULL)$date.start + + + +# Paths with more than one case associated +tmp <- aggregate(case ~ path, dat, function(x) length(unique(x))) +sum(tmp$case > 1) +table(tmp$case) + +dat$date <- as.Date(dat$date.start) + +tmp <- aggregate(date ~ path, dat, function(x) length(unique(x))) +sum(tmp$date > 1) +table(tmp$date) +tmp[tmp$date > 1, ] + +for (p in tmp$path[tmp$date > 1]) { + print(dat[dat$path == p, 3:9]) + cat("\n\n") } + +dat[dat$date == "2017-02-28" & dat$item == "503", ] + + # Creating event logs library(bupaverse) -names(dat)[names(dat) %in% c("date.start", "date.stop")] <- c("start", - "complete") +dat$start <- dat$date.start +dat$complete <- dat$date.stop table(table(dat$start)) # --> hmm... 
-summary(aggregate(duration ~ trace, dat, mean)) - -# TODO: Find trace that has flipCard --> openPopup --> openTopic -dato <- dat[dat$event != "move", ] -dato_split <- split(dato, ~ trace) -tmp <- lapply(dato_split, function(x) unique(x$event)) -#tmp <- lapply(unique(dato$trace), function(x) unique(dato[dato$trace == x, "event"])) - -ids <- sapply(tmp, length) == 3 -tmp2 <- as.data.frame(do.call(rbind, tmp[ids])) -names(tmp2) <- c("flipCard", "openTopic", "openPopup") - -table(tmp2$flipCard) -table(tmp2$openTopic) -table(tmp2$openPopup) - -frag_ids <- which(tmp2$openTopic == "openPopup") - -tmp3 <- dat[dat$trace %in% rownames(tmp2)[frag_ids], ] - -tmp4 <- tmp3[!tmp3$glossar == 1, ] - -dat6 <- rbind(dat[!dat$trace %in% rownames(tmp2)[frag_ids], ], tmp4) - +summary(aggregate(duration ~ path, dat, mean)) alog <- activitylog(dat, - case_id = "trace", + case_id = "path", activity_id = "event", - #resource_id = "case", - resource_id = "artwork", + resource_id = "item", timestamps = c("start", "complete")) -process_map(alog) +process_map(alog, + type_nodes = frequency("absolute"), + sec_nodes = frequency("relative"), + type_edges = frequency("absolute"), + sec_edges = frequency("relative"), + rankdir = "LR") + + +alog2 <- activitylog(dat, + case_id = "case", + activity_id = "event", + resource_id = "item", + timestamps = c("start", "complete")) +process_map(alog2, + type_nodes = frequency("absolute"), + sec_nodes = frequency("relative"), + type_edges = frequency("absolute"), + sec_edges = frequency("relative"), + rankdir = "LR") + -process_map(alog, frequency("relative")) -process_map(alog, frequency("relative_consequent")) library(processanimateR) @@ -112,30 +182,4 @@ animate_process(elog[elog$artwork %in% c("080", "054"), ], mapping = token_aes(color = token_scale("artwork", scale = "ordinal", range = c("black", "gray")))) -# --> not sure, yet, how to interpret this... 
- -alog080 <- activitylog(dat[dat$artwork %in% "080", ], - #case_id = "case", - case_id = "trace", - activity_id = "event", - #resource_id = "trace", - resource_id = "case", - timestamps = c("start", "complete")) - -process_map(alog080, frequency("relative")) - - - -alog054 <- activitylog(dat[dat$artwork %in% "054", ], - #case_id = "case", - case_id = "trace", - activity_id = "event", - #resource_id = "trace", - resource_id = "case", - timestamps = c("start", "complete")) - -process_map(alog054, frequency("relative")) - - - diff --git a/code/plots_petri-nets.py b/code/plots_petri-nets.py index 4d467a1..e1acdb7 100644 --- a/code/plots_petri-nets.py +++ b/code/plots_petri-nets.py @@ -10,24 +10,7 @@ net_con.places net_con.transitions net_con.arcs -help(pm4py.objects.petri_net.obj.Marking) - -# Places -source = PetriNet.Place("source") -sink = PetriNet.Place("sink") -p_1 = PetriNet.Place("p_1") -p_2 = PetriNet.Place("p_2") -p_3 = PetriNet.Place("p_3") -p_4 = PetriNet.Place("p_4") -p_5 = PetriNet.Place("p_5") -p_6 = PetriNet.Place("p_6") -p_7 = PetriNet.Place("p_7") -p_8 = PetriNet.Place("p_8") -p_9 = PetriNet.Place("p_9") -p_10 = PetriNet.Place("p_10") -p_11 = PetriNet.Place("p_11") -p_12 = PetriNet.Place("p_12") - +final_marking = Marking() # Add tokens for traces # ('flipCard', 'openTopic', 'openPopup', 'openTopic', 'move'): 14 @@ -75,7 +58,8 @@ pm4py.vis.save_vis_petri_net(net_con, marking, final_marking, file_path="../figu marking = pm4py.generate_marking(net_con, {'p_5': 1, 'p_12' : 1}) pm4py.vis.save_vis_petri_net(net_con, marking, final_marking, file_path="../figures/processmaps/conformative_net_con_markings_1_15.png") #pm4py.view_petri_net(net_con, marking) -pm4py.vis.save_vis_petri_net(net_con, final_marking, final_marking, file_path="../figures/processmaps/conformative_net_con_markings_1_16.png") +marking = pm4py.generate_marking(net_con, {'sink': 1}) +pm4py.vis.save_vis_petri_net(net_con, marking, final_marking, 
file_path="../figures/processmaps/conformative_net_con_markings_1_16.png") #pm4py.view_petri_net(net_con, final_marking) # ('move', 'move', 'flipCard', 'move', 'openTopic', 'openPopup'): 14 @@ -110,5 +94,6 @@ marking = pm4py.generate_marking(net_con, {'p_4': 1, 'p_12' : 1}) pm4py.vis.save_vis_petri_net(net_con, marking, final_marking, file_path="../figures/processmaps/conformative_net_con_markings_2_15.png") marking = pm4py.generate_marking(net_con, {'p_5': 1, 'p_12' : 1}) pm4py.vis.save_vis_petri_net(net_con, marking, final_marking, file_path="../figures/processmaps/conformative_net_con_markings_2_16.png") -pm4py.vis.save_vis_petri_net(net_con, final_marking, final_marking, file_path="../figures/processmaps/conformative_net_con_markings_2_17.png") +marking = pm4py.generate_marking(net_con, {'sink': 1}) +pm4py.vis.save_vis_petri_net(net_con, marking, final_marking, file_path="../figures/processmaps/conformative_net_con_markings_2_17.png") diff --git a/code/00_pm.py b/code/pm.py similarity index 100% rename from code/00_pm.py rename to code/pm.py diff --git a/code/conformance-checking.py b/code/pm_conformance-checking.py similarity index 83% rename from code/conformance-checking.py rename to code/pm_conformance-checking.py index a4508de..bc37867 100644 --- a/code/conformance-checking.py +++ b/code/pm_conformance-checking.py @@ -15,7 +15,7 @@ dat = dat[dat["date.start"] < "2020-03-13"] event_log = pm4py.format_dataframe(dat, case_id='path', activity_key='event', timestamp_key='date.start') -event_log = event_log.rename(columns={'artwork': 'case:artwork'}) +event_log = event_log.rename(columns={'item': 'case:item'}) ###### Descrptives of log data ###### @@ -34,10 +34,10 @@ sorted_variants = dict(sorted(variants.items(), key=lambda item: item[1], revers {k: sorted_variants[k] for k in list(sorted_variants)[:20]} filtered_log = event_log[event_log["event"] != "move"] -variants = pm4py.get_variants(filtered_log) -len(variants) -sorted_variants = 
dict(sorted(variants.items(), key=lambda item: item[1], reverse = True)) -{k: sorted_variants[k] for k in list(sorted_variants)[:20]} +variants_no_move = pm4py.get_variants(filtered_log) +len(variants_no_move) +sorted_variants_no_move = dict(sorted(variants_no_move.items(), key=lambda item: item[1], reverse = True)) +{k: sorted_variants_no_move[k] for k in list(sorted_variants_no_move)[:20]} # Path length event_log.path.value_counts() @@ -94,8 +94,11 @@ l4[index_broken] replayed_traces[index_broken] -# 216295 # --> broken trace! Must be in artwork 176!!!!! - +event_log[event_log['@@case_index'] == index_broken].event +event_log[event_log['@@case_index'] == index_broken].path +event_log[event_log['@@case_index'] == index_broken].item +event_log[event_log['@@case_index'] == index_broken]["fileId.start"] +# --> logging error in file! from pm4py.algo.conformance.tokenreplay import algorithm as token_based_replay parameters_tbr = {token_based_replay.Variants.TOKEN_REPLAY.value.Parameters.DISABLE_VARIANTS: True, token_based_replay.Variants.TOKEN_REPLAY.value.Parameters.ENABLE_PLTR_FITNESS: True} @@ -156,9 +159,9 @@ pm4py.save_vis_dfg(dfg, start_activities, end_activities, '../figures/processmap ## Heuristics Miner h_net, im, fm = pm4py.discover_petri_net_heuristics(event_log) -h_eval = eval_pm(event_log, h_net, im, fm) pm4py.vis.view_petri_net(h_net, im, fm) -pm4py.vis.save_vis_petri_net(h_net, im, fm, "../figures/processmaps/pn_heuristics_complete.png") +pm4py.vis.save_vis_petri_net(h_net, im, fm, "../figures/processmaps/petrinet_heuristics_complete.png") +h_eval = eval_pm(event_log, h_net, im, fm) is_sound = pm4py.check_soundness(h_net, im, fm) is_sound[0] @@ -172,7 +175,7 @@ len(h_net.places) from pm4py.visualization.petri_net import visualizer as pn_visualizer parameters = {pn_visualizer.Variants.FREQUENCY.value.Parameters.FORMAT: "png"} gviz = pn_visualizer.apply(h_net, im, fm, parameters=parameters, variant=pn_visualizer.Variants.FREQUENCY, log=event_log) 
-pn_visualizer.save(gviz, "../figures/processmaps/pn_heuristics_complete_decorated.png") +pn_visualizer.save(gviz, "../figures/processmaps/petrinet_heuristics_complete_decorated.png") # convert to BPMN bpmn = pm4py.convert.convert_to_bpmn(h_net, im, fm) @@ -180,9 +183,9 @@ pm4py.vis.view_bpmn(bpmn) ## Alpha Miner a_net, im, fm = pm4py.discover_petri_net_alpha(event_log) -a_eval = eval_pm(event_log, a_net, im, fm) pm4py.vis.view_petri_net(a_net, im, fm) -pm4py.vis.save_vis_petri_net(a_net, im, fm, "../figures/processmaps/pn_alpha_complete.png") +pm4py.vis.save_vis_petri_net(a_net, im, fm, "../figures/processmaps/petrinet_alpha_complete.png") +a_eval = eval_pm(event_log, a_net, im, fm) is_sound = pm4py.check_soundness(a_net, im, fm) is_sound[0] @@ -193,9 +196,9 @@ len(a_net.places) ## Inductive Miner i_net, im, fm = pm4py.discover_petri_net_inductive(event_log) -i_eval = eval_pm(event_log, i_net, im, fm) pm4py.vis.view_petri_net(i_net, im, fm) -pm4py.vis.save_vis_petri_net(i_net, im, fm, "../figures/processmaps/pn_induction_complete.png") +pm4py.vis.save_vis_petri_net(i_net, im, fm, "../figures/processmaps/petrinet_induction_complete.png") +i_eval = eval_pm(event_log, i_net, im, fm) # as process tree (does not work for heuristics miner!) 
pt = pm4py.discover_process_tree_inductive(event_log) @@ -217,9 +220,9 @@ pm4py.view_bpmn(bpmn) from pm4py.algo.conformance.tokenreplay import algorithm as token_based_replay parameters_tbr = {token_based_replay.Variants.TOKEN_REPLAY.value.Parameters.DISABLE_VARIANTS: True, token_based_replay.Variants.TOKEN_REPLAY.value.Parameters.ENABLE_PLTR_FITNESS: True} replayed_traces, place_fitness, trans_fitness, unwanted_activities = token_based_replay.apply(event_log, i_net, - im, - fm, - parameters=parameters_tbr) + im, + fm, + parameters=parameters_tbr) l1 = list() l2 = list() @@ -232,15 +235,29 @@ for i in range(len(replayed_traces)): l4.append(replayed_traces[i]["transitions_with_problems"]) np.mean(l1) +set(l1) +index_broken = l1.index(1) np.mean(l2) +set(l2) +l2.index(1) set(l3) l4.count([]) +l3[index_broken] +l4[index_broken] + +replayed_traces[index_broken] + +event_log[event_log['@@case_index'] == index_broken].event +event_log[event_log['@@case_index'] == index_broken].path +event_log[event_log['@@case_index'] == index_broken].item +event_log[event_log['@@case_index'] == index_broken]["fileId.start"] + ## ILP Miner ilp_net, im, fm = pm4py.discover_petri_net_ilp(event_log) -ilp_eval = eval_pm(event_log, ilp_net, im, fm) pm4py.vis.view_petri_net(ilp_net, im, fm) -pm4py.vis.save_vis_petri_net(ilp_net, im, fm, "../figures/processmaps/pn_ilp_complete.png") +pm4py.vis.save_vis_petri_net(ilp_net, im, fm, "../figures/processmaps/petrinet_ilp_complete.png") +ilp_eval = eval_pm(event_log, ilp_net, im, fm) is_sound = pm4py.check_soundness(ilp_net, im, fm) is_sound[0] @@ -257,6 +274,27 @@ eval eval.to_csv("results/eval_all-miners_complete.csv", sep=";") +## Without broken trace +event_log_clean = event_log[event_log['@@case_index'] != index_broken] +h_net, h_im, h_fm = pm4py.discover_petri_net_heuristics(event_log_clean) +a_net, a_im, a_fm = pm4py.discover_petri_net_alpha(event_log_clean) +i_net, i_im, i_fm = pm4py.discover_petri_net_inductive(event_log_clean) +ilp_net, 
ilp_im, ilp_fm = pm4py.discover_petri_net_ilp(event_log_clean) + +baseline_eval = eval_pm(event_log_clean, basenet, initial_marking, final_marking) +h_eval = eval_pm(event_log_clean, h_net, h_im, h_fm) +a_eval = eval_pm(event_log_clean, a_net, a_im, a_fm) +i_eval = eval_pm(event_log_clean, i_net, i_im, i_fm) +ilp_eval = eval_pm(event_log_clean, ilp_net, ilp_im, ilp_fm) + +eval = pd.DataFrame(np.row_stack([baseline_eval, h_eval, a_eval, i_eval, ilp_eval])) +eval.columns = ["fitness", "precision", "generalizability", "simplicity"] +eval.index = ["conformative", "heuristics", "alpha", "inductive", "ilp"] +eval + +eval.to_csv("results/eval_all-miners_clean.csv", sep=";") + + ###### Process Mining - individual artworks ###### def pm_artworks(miner): @@ -308,40 +346,4 @@ for miner in ["heuristics", "inductive", "alpha", "ilp"]: eval_art = pm_artworks(miner = "inductive") -##### Clustering ###### - -## KMeans - -#eval_artworks = eval_art[eval_art.nettype == "alldata"].iloc[:,range(1,5)] -eval_artworks = eval_art[eval_art.nettype == "subdata"].iloc[:,range(1,5)] - -kmeans = KMeans(n_clusters=4, max_iter=1000).fit(eval_artworks) - -#from sklearn.manifold import MDS -#coord = pd.DataFrame(MDS(normalized_stress='auto').fit_transform(eval_artworks)) - -coord = eval_artworks -coord["clusters"] = kmeans.labels_ - -for i in coord.clusters.unique(): - #plt.scatter(coord[coord.clusters == i].iloc[:,0], coord[coord.clusters == i].iloc[:,1], - plt.scatter(coord[coord.clusters == i].iloc[:,1], coord[coord.clusters == i].iloc[:,2], - #plt.scatter(coord[coord.clusters == i].iloc[:,2], coord[coord.clusters == i].iloc[:,4], - label = i) -plt.legend() -plt.show() - -### Scree plot - -sse = {} -for k in range(1, 10): - kmeans = KMeans(n_clusters=k, max_iter=1000).fit(eval_artworks[["precision", "generalizability"]]) - #data["clusters"] = kmeans.labels_ - #print(data["clusters"]) - sse[k] = kmeans.inertia_ # Inertia: Sum of distances of samples to their closest cluster center -plt.figure() 
-plt.plot(list(sse.keys()), list(sse.values())) -plt.xlabel("Number of clusters") -plt.ylabel("SSE") -plt.show() diff --git a/code/pm_navigation-behavior.py b/code/pm_navigation-behavior.py new file mode 100644 index 0000000..9aec338 --- /dev/null +++ b/code/pm_navigation-behavior.py @@ -0,0 +1,57 @@ +%reset + +import pm4py + +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +from pm4py.visualization.petri_net import visualizer as pn_visualizer +parameters = {pn_visualizer.Variants.FREQUENCY.value.Parameters.FORMAT: "png"} + +###### Load data and create event logs ###### + +dat = pd.read_csv("results/haum/event_logfiles_2024-01-02_19-44-50.csv", sep = ";") +dat = dat[dat["date.start"] < "2020-03-13"] +dat = dat[dat["path"] != 81621] # exclude broken trace +# --> only pre corona (before artworks were updated) + +event_log = pm4py.format_dataframe(dat, case_id='case', activity_key='event', + timestamp_key='date.start') + +event_log.event.value_counts() +event_log.event.value_counts(normalize=True) + +dfg, start_activities, end_activities = pm4py.discover_dfg(event_log) +pm4py.view_dfg(dfg, start_activities, end_activities) + +#filtered_log = pm4py.filter_event_attribute_values(event_log, 'item', [80]) + +i_net, im, fm = pm4py.discover_petri_net_inductive(event_log) +pm4py.vis.view_petri_net(i_net, im, fm) +gviz = pn_visualizer.apply(i_net, im, fm, parameters=parameters, + variant=pn_visualizer.Variants.FREQUENCY, + log=event_log) +pn_visualizer.view(gviz) + +len(i_net.places) +len(i_net.transitions) +len(i_net.arcs) + +a_net, im, fm = pm4py.discover_petri_net_alpha(event_log) +pm4py.vis.view_petri_net(a_net, im, fm) +gviz = pn_visualizer.apply(a_net, im, fm, parameters=parameters, + variant=pn_visualizer.Variants.FREQUENCY, + log=event_log) +pn_visualizer.view(gviz) + + +len(a_net.places) +len(a_net.transitions) +len(a_net.arcs) + +h_net, im, fm = pm4py.discover_petri_net_heuristics(event_log) +pm4py.vis.view_petri_net(h_net, im, fm) + 
+len(h_net.places) +len(h_net.transitions) +len(h_net.arcs) diff --git a/code/trace-clustering.py b/code/trace-clustering.py new file mode 100644 index 0000000..f35cb2f --- /dev/null +++ b/code/trace-clustering.py @@ -0,0 +1,41 @@ +from sklearn.cluster import KMeans + +import matplotlib.pyplot as plt + + +##### Clustering ###### + +## KMeans + +#eval_artworks = eval_art[eval_art.nettype == "alldata"].iloc[:,range(1,5)] +eval_artworks = eval_art[eval_art.nettype == "subdata"].iloc[:,range(1,5)] + +kmeans = KMeans(n_clusters=4, max_iter=1000).fit(eval_artworks) + +#from sklearn.manifold import MDS +#coord = pd.DataFrame(MDS(normalized_stress='auto').fit_transform(eval_artworks)) + +coord = eval_artworks +coord["clusters"] = kmeans.labels_ + +for i in coord.clusters.unique(): + #plt.scatter(coord[coord.clusters == i].iloc[:,0], coord[coord.clusters == i].iloc[:,1], + plt.scatter(coord[coord.clusters == i].iloc[:,1], coord[coord.clusters == i].iloc[:,2], + #plt.scatter(coord[coord.clusters == i].iloc[:,2], coord[coord.clusters == i].iloc[:,4], + label = i) +plt.legend() +plt.show() + +### Scree plot + +sse = {} +for k in range(1, 10): + kmeans = KMeans(n_clusters=k, max_iter=1000).fit(eval_artworks[["precision", "generalizability"]]) + #data["clusters"] = kmeans.labels_ + #print(data["clusters"]) + sse[k] = kmeans.inertia_ # Inertia: Sum of distances of samples to their closest cluster center +plt.figure() +plt.plot(list(sse.keys()), list(sse.values())) +plt.xlabel("Number of clusters") +plt.ylabel("SSE") +plt.show()