# mtt_haum/code/trace-clustering.py
# (175 lines, 5.1 KiB, Python)
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
##### Clustering ######
## KMeans
# Alternative: cluster the evaluation metrics of the full networks instead
#eval_artworks = eval_art[eval_art.nettype == "alldata"].iloc[:,range(1,5)]
eval_artworks = eval_art[eval_art.nettype == "subdata"].iloc[:,range(1,5)]

kmeans = KMeans(n_clusters=4, max_iter=1000).fit(eval_artworks)

# Optionally embed the metrics into 2D with MDS before plotting
#from sklearn.manifold import MDS
#coord = pd.DataFrame(MDS(normalized_stress='auto').fit_transform(eval_artworks))
coord = eval_artworks
coord["clusters"] = kmeans.labels_

# Scatter one pair of evaluation metrics, one color per cluster
for cluster_id in coord.clusters.unique():
    members = coord[coord.clusters == cluster_id]
    #plt.scatter(members.iloc[:,0], members.iloc[:,1], label=cluster_id)
    plt.scatter(members.iloc[:,1], members.iloc[:,2], label=cluster_id)
    #plt.scatter(members.iloc[:,2], members.iloc[:,4], label=cluster_id)
plt.legend()
plt.show()
### Scree plot
# SSE (inertia) for k = 1..9 to eyeball the elbow
sse = {}
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(
        eval_artworks[["precision", "generalizability"]])
    # Inertia: sum of distances of samples to their closest cluster center
    sse[k] = kmeans.inertia_

plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of clusters")
plt.ylabel("SSE")
plt.show()
### TMP
# Per-item means of the numeric interaction measures
numeric_cols = ["duration", "distance", "scaleSize", "rotationDegree"]
datitem = dat.groupby("item")[numeric_cols].mean()
def length_path(data):
    """Return the number of distinct path values in *data* (NaN counts)."""
    return data.path.unique().size
def length_case(data):
    """Return the number of distinct case values in *data* (NaN counts)."""
    return data.case.unique().size
def length_topic(data):
    """Return the number of distinct non-missing topic values in *data*."""
    topics = data.topic.dropna()
    return topics.unique().size
# Attach per-item counts of distinct paths, cases, and topics
grouped = dat.groupby(["item"])
datitem["npaths"] = grouped.apply(length_path)
datitem["ncases"] = grouped.apply(length_case)
datitem["ntopics"] = grouped.apply(length_topic)

# Zero-pad item ids to width 3 so lexicographic sort matches numeric order,
# then align row-for-row with the metadata frame and merge
datitem.index = datitem.index.astype(str).str.rjust(3, "0")
datitem = datitem.sort_index()
datitem.index = mdi.index
datitem = pd.concat([mdi, datitem], axis=1)
###### Find clusters ######
myseed = 1420

# Standardize the features, then embed into 2D via MDS for visualization
mat = datitem.drop(["fitness", "sound", "mostfreq"], axis=1)
mat = StandardScaler().fit_transform(mat)
xy = pd.DataFrame(
    MDS(normalized_stress='auto', random_state=myseed).fit_transform(mat))
xy.index = datitem.index
### K-Means clustering ###
kmeans = KMeans(n_clusters=6, max_iter=1000, random_state=myseed).fit(mat)
xy["kcluster"] = kmeans.labels_

# One scatter call per cluster; annotate every point with its item label
for cluster_id in xy.kcluster.unique():
    members = xy[xy.kcluster == cluster_id]
    plt.scatter(members.iloc[:, 0], members.iloc[:, 1], label=cluster_id)
    for pos, item_name in enumerate(members.index):
        plt.annotate(item_name.split("_")[1],
                     (members.iloc[pos, 0], members.iloc[pos, 1]))
plt.legend()
plt.show()

xy.kcluster.value_counts()
# Scree plot
sse = {}
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(mat)
    # Inertia: sum of distances of samples to their closest cluster center
    sse[k] = kmeans.inertia_

plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of clusters")
plt.ylabel("SSE")
plt.show()

# Item ids belonging to each of the six k-means clusters
c0_items = xy[xy.kcluster == 0].index
c1_items = xy[xy.kcluster == 1].index
c2_items = xy[xy.kcluster == 2].index
c3_items = xy[xy.kcluster == 3].index
c4_items = xy[xy.kcluster == 4].index
c5_items = xy[xy.kcluster == 5].index
### Hierarchical clustering ###
from sklearn.cluster import AgglomerativeClustering

hclust = AgglomerativeClustering(n_clusters=6).fit(mat)
hclust.labels_
xy["hcluster"] = hclust.labels_

# Same MDS scatter as for k-means, colored by hierarchical cluster
for cluster_id in xy.hcluster.unique():
    members = xy[xy.hcluster == cluster_id]
    plt.scatter(members.iloc[:, 0], members.iloc[:, 1], label=cluster_id)
    for pos, item_name in enumerate(members.index):
        plt.annotate(item_name.split("_")[1],
                     (members.iloc[pos, 0], members.iloc[pos, 1]))
plt.legend()
plt.show()
# dendrogram
from scipy.cluster.hierarchy import dendrogram
def plot_dendrogram(model, **kwargs):
    """Draw a scipy dendrogram for a fitted AgglomerativeClustering model.

    Builds the linkage matrix scipy expects from the model's ``children_``
    and ``distances_``; any extra keyword arguments are forwarded to
    ``scipy.cluster.hierarchy.dendrogram``.
    """
    n_samples = len(model.labels_)

    # Count how many original samples sit under each internal node:
    # a child index < n_samples is a leaf, otherwise it refers to the
    # earlier merge (child - n_samples) whose count is already known.
    counts = np.zeros(model.children_.shape[0])
    for node, merge in enumerate(model.children_):
        total = 0
        for child in merge:
            total += 1 if child < n_samples else counts[child - n_samples]
        counts[node] = total

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)
# Refit with no cluster-count cutoff so distances_ covers the full tree
hclust = AgglomerativeClustering(distance_threshold=0, n_clusters=None).fit(mat)
plot_dendrogram(hclust)
plt.show()
### Bisecting K-Means clustering ###
from sklearn.cluster import BisectingKMeans

biKmeans = BisectingKMeans(n_clusters=6, random_state=myseed).fit(mat)
biKmeans.labels_
xy["bcluster"] = biKmeans.labels_

# MDS scatter colored by bisecting-k-means cluster, points annotated
for cluster_id in xy.bcluster.unique():
    members = xy[xy.bcluster == cluster_id]
    plt.scatter(members.iloc[:, 0], members.iloc[:, 1], label=cluster_id)
    for pos, item_name in enumerate(members.index):
        plt.annotate(item_name.split("_")[1],
                     (members.iloc[pos, 0], members.iloc[pos, 1]))
plt.legend()
plt.show()