Spaces:
Sleeping
Sleeping
Commit
·
f8607ee
1
Parent(s):
60ded96
Create clustering.py
Browse files- clustering.py +34 -0
clustering.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sklearn.cluster import AgglomerativeClustering
|
2 |
+
import numpy as np
|
3 |
+
|
4 |
+
def cluster_aspect_terms(nlp, aspects):
    """Map every aspect term to the most frequent term of its cluster.

    The distinct values of ``aspects['aspect']`` are embedded with ``nlp``
    and grouped by average-linkage agglomerative clustering on cosine
    distance (merge threshold 0.2). Within each cluster, the term that
    occurs most often in ``aspects`` becomes the representative.

    Args:
        nlp: Object exposing ``pipe(texts)`` that yields items with a
            ``.vector`` attribute (presumably a spaCy pipeline with word
            vectors -- confirm against the caller).
        aspects: pandas DataFrame with an ``'aspect'`` column, one row per
            extracted aspect occurrence. Rows whose aspect is missing are
            ignored.

    Returns:
        dict mapping each distinct aspect term to its cluster's
        representative term. A term alone in its cluster maps to itself.
        With fewer than two distinct terms no clustering is performed and
        the identity mapping (possibly empty) is returned.
    """
    # Terms and their occurrence counts come from one groupby so that
    # position i in both sequences is guaranteed to describe the same term.
    term_counts = aspects.groupby('aspect').size().sort_index()
    aspect_terms = list(term_counts.index)
    aspect_terms_sizes = term_counts.values

    # Hierarchical clustering needs at least two samples; with zero or one
    # term there is nothing to merge, so skip the embedding step entirely.
    if len(aspect_terms) < 2:
        return {term: term for term in aspect_terms}

    # NOTE(review): all-zero vectors (e.g. out-of-vocabulary terms) likely
    # make cosine-distance clustering fail -- confirm and filter upstream.
    aspect_terms_vectors = [doc.vector for doc in nlp.pipe(aspect_terms)]

    clustering_options = dict(n_clusters=None,
                              linkage='average',
                              distance_threshold=0.2)
    try:
        # scikit-learn >= 1.2 names the distance parameter `metric`;
        # `affinity` was removed in 1.4.
        clusterer = AgglomerativeClustering(metric='cosine',
                                            **clustering_options)
    except TypeError:
        # Older scikit-learn releases only accept `affinity`.
        clusterer = AgglomerativeClustering(affinity='cosine',
                                            **clustering_options)

    clusterer.fit(aspect_terms_vectors)

    term_replacements = {}
    for cluster in range(clusterer.n_clusters_):
        idxs = np.nonzero(clusterer.labels_ == cluster)[0]
        # argmax returns the first maximum, so ties go to the
        # alphabetically first term (terms are in sorted order).
        main_term = aspect_terms[idxs[np.argmax(aspect_terms_sizes[idxs])]]
        for i in idxs:
            term_replacements[aspect_terms[i]] = main_term

    return term_replacements
|