# Clustering of item/artwork evaluation metrics: k-means, hierarchical, bisecting k-means.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.manifold import MDS
from sklearn.preprocessing import StandardScaler
|
|
|
|
|
|
##### Clustering ######

## KMeans

# Artwork-level evaluation metrics for the "subdata" networks
# (columns 1-4 of eval_art; swap the filter to "alldata" to cluster those runs).
#eval_artworks = eval_art[eval_art.nettype == "alldata"].iloc[:,range(1,5)]
eval_artworks = eval_art[eval_art.nettype == "subdata"].iloc[:, range(1, 5)]

kmeans = KMeans(n_clusters=4, max_iter=1000).fit(eval_artworks)

# Alternative: project to 2-D with MDS before plotting.
#from sklearn.manifold import MDS
#coord = pd.DataFrame(MDS(normalized_stress='auto').fit_transform(eval_artworks))

# BUG FIX: work on a copy — assigning the "clusters" column below would
# otherwise mutate eval_artworks (and the eval_art slice it was taken from).
coord = eval_artworks.copy()
coord["clusters"] = kmeans.labels_

# One scatter series per cluster on two of the metric axes (columns 1 vs 2);
# the commented calls are alternative axis pairs.
for i in coord.clusters.unique():
    #plt.scatter(coord[coord.clusters == i].iloc[:,0], coord[coord.clusters == i].iloc[:,1],
    plt.scatter(coord[coord.clusters == i].iloc[:, 1],
                coord[coord.clusters == i].iloc[:, 2],
    #plt.scatter(coord[coord.clusters == i].iloc[:,2], coord[coord.clusters == i].iloc[:,4],
                label=i)
plt.legend()
plt.show()
|
|
|
|
### Scree plot

# Elbow heuristic: fit k-means for k = 1..9 on the two metrics of interest
# and record the inertia (within-cluster sum of squared distances).
sse = {}
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000)
    kmeans.fit(eval_artworks[["precision", "generalizability"]])
    # Inertia: sum of distances of samples to their closest cluster center.
    sse[k] = kmeans.inertia_

plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of clusters")
plt.ylabel("SSE")
plt.show()
|
|
|
|
|
|
|
|
|
|
### TMP

# Per-item means of the four interaction measures.
agg_cols = ["duration", "distance", "scaleSize", "rotationDegree"]
datitem = dat.groupby("item")[agg_cols].mean()
|
|
|
|
def length_path(data):
    """Return the number of distinct values in data.path (NaN counts as a value)."""
    return data.path.unique().size
|
|
def length_case(data):
    """Return the number of distinct values in data.case (NaN counts as a value)."""
    return data.case.unique().size
|
|
def length_topic(data):
    """Return the number of distinct non-missing values in data.topic."""
    # Series.nunique() ignores NaN by default, matching dropna() + unique().
    return data.topic.nunique()
|
|
|
|
# Per-item diversity counts: number of distinct paths / cases / topics.
datitem["npaths"] = dat.groupby(["item"]).apply(length_path)

datitem["ncases"] = dat.groupby(["item"]).apply(length_case)

datitem["ntopics"] = dat.groupby(["item"]).apply(length_topic)

# Zero-pad the item ids to width 3 so lexicographic sort equals numeric sort.
datitem.index = datitem.index.astype(str).str.rjust(3, "0")

datitem = datitem.sort_index()

# NOTE(review): this assumes mdi has exactly the same length and the same item
# ordering as the sorted datitem — confirm; a mismatch silently mislabels rows.
datitem.index = mdi.index

# Column-wise merge of the metadata (mdi) with the per-item statistics.
datitem = pd.concat([mdi, datitem], axis = 1)
|
|
|
|
|
|
|
|
|
|
|
|
###### Find clusters ######

myseed = 1420  # RNG seed shared by MDS and the seeded clustering fits below

# Feature matrix: drop the non-numeric/derived columns, then z-score the rest.
mat = StandardScaler().fit_transform(
    datitem.drop(columns=["fitness", "sound", "mostfreq"])
)

# 2-D MDS embedding of the items, used for the scatter plots below.
xy = pd.DataFrame(
    MDS(normalized_stress="auto", random_state=myseed).fit_transform(mat)
)
xy.index = datitem.index
|
|
|
|
### K-Means clustering ###

kmeans = KMeans(n_clusters=6, max_iter=1000, random_state=myseed).fit(mat)
xy["kcluster"] = kmeans.labels_

# One scatter series per cluster in the MDS plane, each point annotated with
# the part of its index after the first "_".
for i in xy.kcluster.unique():
    members = xy[xy.kcluster == i]
    plt.scatter(members.iloc[:, 0], members.iloc[:, 1], label=i)
    for j, txt in enumerate(members.index):
        plt.annotate(txt.split("_")[1], (members.iloc[j, 0], members.iloc[j, 1]))
plt.legend()
plt.show()

xy.kcluster.value_counts()  # cluster sizes (echoed in a REPL; no-op in a script)
|
|
|
|
# Scree plot
# Elbow heuristic on the full standardized feature matrix.
sse = {}
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(mat)
    # Inertia: sum of distances of samples to their closest cluster center.
    sse[k] = kmeans.inertia_

plt.figure()
plt.plot(sorted(sse), [sse[k] for k in sorted(sse)])
plt.xlabel("Number of clusters")
plt.ylabel("SSE")
plt.show()
|
|
|
|
# Item ids belonging to each of the six k-means clusters.
c0_items, c1_items, c2_items, c3_items, c4_items, c5_items = (
    xy[xy.kcluster == k].index for k in range(6)
)
|
|
|
|
### Hierarchical clustering ###

from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering on the same standardized feature matrix.
hclust = AgglomerativeClustering(n_clusters=6).fit(mat)
hclust.labels_  # echoes the labels in a REPL; no effect in a script

xy["hcluster"] = hclust.labels_
|
|
|
|
# Scatter the MDS embedding colored by hierarchical cluster, annotated with item ids.
for lab in xy.hcluster.unique():
    grp = xy[xy.hcluster == lab]
    plt.scatter(grp.iloc[:, 0], grp.iloc[:, 1], label=lab)
    for j, txt in enumerate(grp.index):
        plt.annotate(txt.split("_")[1], (grp.iloc[j, 0], grp.iloc[j, 1]))
plt.legend()
plt.show()
|
|
|
|
# dendrogram
|
|
from scipy.cluster.hierarchy import dendrogram
|
|
|
|
def plot_dendrogram(model, **kwargs):
    """Draw a dendrogram for a fitted AgglomerativeClustering-like model.

    The model must expose ``children_``, ``distances_`` and ``labels_``
    (i.e. it was fit with a distance_threshold so distances are computed).
    Extra keyword arguments are forwarded to scipy's ``dendrogram``.
    """
    n_samples = len(model.labels_)

    # Leaf count under each internal node: a child index < n_samples is a
    # single observation, otherwise it refers to an earlier merge row.
    counts = np.zeros(model.children_.shape[0])
    for node, (left, right) in enumerate(model.children_):
        size = 0
        for child in (left, right):
            size += 1 if child < n_samples else counts[child - n_samples]
        counts[node] = size

    # scipy linkage format: [child_a, child_b, distance, leaf_count] per merge.
    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram.
    dendrogram(linkage_matrix, **kwargs)
|
|
|
|
# Refit without a cluster-count cutoff so the full merge tree (and the
# distances_ attribute plot_dendrogram needs) is available.
full_tree = AgglomerativeClustering(distance_threshold=0, n_clusters=None)
hclust = full_tree.fit(mat)

plot_dendrogram(hclust)
plt.show()
|
|
|
|
### Bisecting K-Means clustering ###

from sklearn.cluster import BisectingKMeans

# Top-down k-means: repeatedly bisects clusters until n_clusters remain.
biKmeans = BisectingKMeans(n_clusters=6, random_state=myseed).fit(mat)
biKmeans.labels_  # echoes the labels in a REPL; no effect in a script

xy["bcluster"] = biKmeans.labels_

# Scatter the MDS embedding colored by bisecting-k-means cluster,
# annotated with item ids.
for lab in xy.bcluster.unique():
    grp = xy[xy.bcluster == lab]
    plt.scatter(grp.iloc[:, 0], grp.iloc[:, 1], label=lab)
    for j, txt in enumerate(grp.index):
        plt.annotate(txt.split("_")[1], (grp.iloc[j, 0], grp.iloc[j, 1]))
plt.legend()
plt.show()
|