# mtt_haum/code/trace-clustering.py
# (175 lines, 5.1 KiB, Python)
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
##### Clustering ######
## KMeans
# Alternative: cluster the evaluation metrics of the full networks instead
#eval_artworks = eval_art[eval_art.nettype == "alldata"].iloc[:,range(1,5)]
eval_artworks = eval_art[eval_art.nettype == "subdata"].iloc[:,range(1,5)]

kmeans = KMeans(n_clusters=4, max_iter=1000).fit(eval_artworks)

# Optionally embed the metrics into 2D with MDS before plotting
#from sklearn.manifold import MDS
#coord = pd.DataFrame(MDS(normalized_stress='auto').fit_transform(eval_artworks))
coord = eval_artworks
coord["clusters"] = kmeans.labels_

# Scatter one pair of evaluation metrics, one color per cluster
for cluster_id in coord.clusters.unique():
    members = coord[coord.clusters == cluster_id]
    #plt.scatter(members.iloc[:,0], members.iloc[:,1], label=cluster_id)
    plt.scatter(members.iloc[:,1], members.iloc[:,2], label=cluster_id)
    #plt.scatter(members.iloc[:,2], members.iloc[:,4], label=cluster_id)
plt.legend()
plt.show()
### Scree plot
# SSE (inertia) for k = 1..9 to eyeball the elbow
sse = {}
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(
        eval_artworks[["precision", "generalizability"]])
    # Inertia: sum of distances of samples to their closest cluster center
    sse[k] = kmeans.inertia_

plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of clusters")
plt.ylabel("SSE")
plt.show()
### TMP
# Per-item means of the numeric interaction measures
numeric_cols = ["duration", "distance", "scaleSize", "rotationDegree"]
datitem = dat.groupby("item")[numeric_cols].mean()
def length_path(data):
    """Return the number of distinct path values in *data* (NaN counts)."""
    return data.path.unique().size
def length_case(data):
    """Return the number of distinct case values in *data* (NaN counts)."""
    return data.case.unique().size
def length_topic(data):
    """Return the number of distinct non-missing topic values in *data*."""
    topics = data.topic.dropna()
    return topics.unique().size
# Attach per-item counts of distinct paths, cases, and topics
grouped = dat.groupby(["item"])
datitem["npaths"] = grouped.apply(length_path)
datitem["ncases"] = grouped.apply(length_case)
datitem["ntopics"] = grouped.apply(length_topic)

# Zero-pad item ids to width 3 so lexicographic sort matches numeric order,
# then align row-for-row with the metadata frame and merge
datitem.index = datitem.index.astype(str).str.rjust(3, "0")
datitem = datitem.sort_index()
datitem.index = mdi.index
datitem = pd.concat([mdi, datitem], axis=1)
###### Find clusters ######
myseed = 1420

# Standardize the features, then embed into 2D via MDS for visualization
mat = datitem.drop(["fitness", "sound", "mostfreq"], axis=1)
mat = StandardScaler().fit_transform(mat)
xy = pd.DataFrame(
    MDS(normalized_stress='auto', random_state=myseed).fit_transform(mat))
xy.index = datitem.index
### K-Means clustering ###
kmeans = KMeans(n_clusters=6, max_iter=1000, random_state=myseed).fit(mat)
xy["kcluster"] = kmeans.labels_

# One scatter call per cluster; annotate every point with its item label
for cluster_id in xy.kcluster.unique():
    members = xy[xy.kcluster == cluster_id]
    plt.scatter(members.iloc[:, 0], members.iloc[:, 1], label=cluster_id)
    for pos, item_name in enumerate(members.index):
        plt.annotate(item_name.split("_")[1],
                     (members.iloc[pos, 0], members.iloc[pos, 1]))
plt.legend()
plt.show()

xy.kcluster.value_counts()
# Scree plot
sse = {}
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(mat)
    # Inertia: sum of distances of samples to their closest cluster center
    sse[k] = kmeans.inertia_

plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of clusters")
plt.ylabel("SSE")
plt.show()

# Item ids belonging to each of the six k-means clusters
c0_items = xy[xy.kcluster == 0].index
c1_items = xy[xy.kcluster == 1].index
c2_items = xy[xy.kcluster == 2].index
c3_items = xy[xy.kcluster == 3].index
c4_items = xy[xy.kcluster == 4].index
c5_items = xy[xy.kcluster == 5].index
### Hierarchical clustering ###
from sklearn.cluster import AgglomerativeClustering

hclust = AgglomerativeClustering(n_clusters=6).fit(mat)
hclust.labels_
xy["hcluster"] = hclust.labels_

# Same MDS scatter as for k-means, colored by hierarchical cluster
for cluster_id in xy.hcluster.unique():
    members = xy[xy.hcluster == cluster_id]
    plt.scatter(members.iloc[:, 0], members.iloc[:, 1], label=cluster_id)
    for pos, item_name in enumerate(members.index):
        plt.annotate(item_name.split("_")[1],
                     (members.iloc[pos, 0], members.iloc[pos, 1]))
plt.legend()
plt.show()
# dendrogram
from scipy.cluster.hierarchy import dendrogram
def plot_dendrogram(model, **kwargs):
    """Draw a scipy dendrogram for a fitted AgglomerativeClustering model.

    Builds the linkage matrix scipy expects from the model's ``children_``
    and ``distances_``; any extra keyword arguments are forwarded to
    ``scipy.cluster.hierarchy.dendrogram``.
    """
    n_samples = len(model.labels_)

    # Count how many original samples sit under each internal node:
    # a child index < n_samples is a leaf, otherwise it refers to the
    # earlier merge (child - n_samples) whose count is already known.
    counts = np.zeros(model.children_.shape[0])
    for node, merge in enumerate(model.children_):
        total = 0
        for child in merge:
            total += 1 if child < n_samples else counts[child - n_samples]
        counts[node] = total

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)
# Refit with no cluster-count cutoff so distances_ covers the full tree
hclust = AgglomerativeClustering(distance_threshold=0, n_clusters=None).fit(mat)
plot_dendrogram(hclust)
plt.show()
### Bisecting K-Means clustering ###
from sklearn.cluster import BisectingKMeans

biKmeans = BisectingKMeans(n_clusters=6, random_state=myseed).fit(mat)
biKmeans.labels_
xy["bcluster"] = biKmeans.labels_

# MDS scatter colored by bisecting-k-means cluster, points annotated
for cluster_id in xy.bcluster.unique():
    members = xy[xy.bcluster == cluster_id]
    plt.scatter(members.iloc[:, 0], members.iloc[:, 1], label=cluster_id)
    for pos, item_name in enumerate(members.index):
        plt.annotate(item_name.split("_")[1],
                     (members.iloc[pos, 0], members.iloc[pos, 1]))
plt.legend()
plt.show()