# Clustering analysis: k-means, hierarchical, and bisecting k-means, first on
# artwork-evaluation metrics, then on per-item interaction aggregates embedded
# with MDS.
#
# NOTE(review): this chunk references names that must already be in scope when
# it runs: `eval_art`, `dat`, `mdi` (data frames) and `pd`/`np` (pandas/numpy)
# — TODO confirm they are defined earlier in the file.

from sklearn.cluster import KMeans, AgglomerativeClustering, BisectingKMeans
from sklearn.manifold import MDS                  # was used below but never imported
from sklearn.preprocessing import StandardScaler  # was used below but never imported
from scipy.cluster.hierarchy import dendrogram
import matplotlib.pyplot as plt


def _plot_scree(data, k_max=9, **kmeans_kwargs):
    """Fit k-means for k = 1..k_max on `data` and plot SSE (inertia) vs. k.

    Inertia is the sum of squared distances of samples to their closest
    cluster center; the "elbow" of this curve suggests a cluster count.
    """
    sse = {}
    for k in range(1, k_max + 1):
        km = KMeans(n_clusters=k, max_iter=1000, **kmeans_kwargs).fit(data)
        sse[k] = km.inertia_
    plt.figure()
    plt.plot(list(sse.keys()), list(sse.values()))
    plt.xlabel("Number of clusters")
    plt.ylabel("SSE")
    plt.show()


def _plot_clusters(xy, cluster_col):
    """Scatter the first two columns of `xy` colored by `xy[cluster_col]`,
    annotating each point with the part of its index label after "_"."""
    for lab in xy[cluster_col].unique():
        members = xy[xy[cluster_col] == lab]
        plt.scatter(members.iloc[:, 0], members.iloc[:, 1], label=lab)
        for j, txt in enumerate(members.index):
            plt.annotate(txt.split("_")[1], (members.iloc[j, 0], members.iloc[j, 1]))
    plt.legend()
    plt.show()


###### Clustering ######

## K-means on the evaluation metrics of the "subdata" networks
eval_artworks = eval_art[eval_art.nettype == "subdata"].iloc[:, range(1, 5)]
kmeans = KMeans(n_clusters=4, max_iter=1000).fit(eval_artworks)

# Work on a copy so the label column does not leak back into the slice of
# `eval_art` (the original assigned into the slice directly — SettingWithCopy).
coord = eval_artworks.copy()
coord["clusters"] = kmeans.labels_

# Plot metric columns 1 vs. 2 per cluster.
for lab in coord.clusters.unique():
    members = coord[coord.clusters == lab]
    plt.scatter(members.iloc[:, 1], members.iloc[:, 2], label=lab)
plt.legend()
plt.show()

### Scree plot on the two headline metrics
_plot_scree(eval_artworks[["precision", "generalizability"]])


### Per-item aggregates ###

datitem = dat.groupby("item")[
    ["duration", "distance", "scaleSize", "rotationDegree"]
].mean()


def length_path(data):
    """Number of distinct `path` values in `data` (NaN counts as a value,
    matching len(unique()) of the original implementation)."""
    return data.path.nunique(dropna=False)


def length_case(data):
    """Number of distinct `case` values in `data` (NaN counts as a value)."""
    return data.case.nunique(dropna=False)


def length_topic(data):
    """Number of distinct non-NaN `topic` values in `data`."""
    return data.topic.nunique()


datitem["npaths"] = dat.groupby(["item"]).apply(length_path)
datitem["ncases"] = dat.groupby(["item"]).apply(length_case)
datitem["ntopics"] = dat.groupby(["item"]).apply(length_topic)

# Zero-pad item ids to three digits so the lexicographic sort below gives
# numeric order, then align with the `mdi` frame.
datitem.index = datitem.index.astype(str).str.rjust(3, "0")
datitem = datitem.sort_index()
# NOTE(review): assumes mdi rows are in the same (sorted) item order — verify.
datitem.index = mdi.index
datitem = pd.concat([mdi, datitem], axis=1)


###### Find clusters ######

myseed = 1420

mat = datitem.drop(["fitness", "sound", "mostfreq"], axis=1)
mat = StandardScaler().fit_transform(mat)

# 2-D MDS embedding of the standardized feature matrix, used for all plots.
xy = pd.DataFrame(MDS(normalized_stress="auto", random_state=myseed).fit_transform(mat))
xy.index = datitem.index

### K-means clustering ###
kmeans = KMeans(n_clusters=6, max_iter=1000, random_state=myseed).fit(mat)
xy["kcluster"] = kmeans.labels_
_plot_clusters(xy, "kcluster")

xy.kcluster.value_counts()  # REPL inspection only; no effect when run as a script

# Scree plot (no fixed seed here, matching the original exploratory run)
_plot_scree(mat)

c0_items = xy[xy.kcluster == 0].index
c1_items = xy[xy.kcluster == 1].index
c2_items = xy[xy.kcluster == 2].index
c3_items = xy[xy.kcluster == 3].index
c4_items = xy[xy.kcluster == 4].index
c5_items = xy[xy.kcluster == 5].index

### Hierarchical clustering ###
hclust = AgglomerativeClustering(n_clusters=6).fit(mat)
xy["hcluster"] = hclust.labels_
_plot_clusters(xy, "hcluster")


def plot_dendrogram(model, **kwargs):
    """Plot a SciPy dendrogram for a fitted AgglomerativeClustering model.

    Builds the linkage matrix from model.children_ and model.distances_;
    the model must have been fitted with `distance_threshold` set so that
    `distances_` is populated. Extra kwargs are forwarded to dendrogram().
    """
    # Count the number of original samples under each internal node.
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)
    dendrogram(linkage_matrix, **kwargs)


hclust = AgglomerativeClustering(distance_threshold=0, n_clusters=None).fit(mat)
plot_dendrogram(hclust)
plt.show()

### Bisecting k-means clustering ###
biKmeans = BisectingKMeans(n_clusters=6, random_state=myseed).fit(mat)
xy["bcluster"] = biKmeans.labels_
_plot_clusters(xy, "bcluster")