analytics-jiten committed on
Commit
f8607ee
·
1 Parent(s): 60ded96

Create clustering.py

Browse files
Files changed (1) hide show
  1. clustering.py +34 -0
clustering.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.cluster import AgglomerativeClustering
2
+ import numpy as np
3
+
4
def cluster_aspect_terms(nlp, aspects, distance_threshold=0.2):
    """Map every aspect term to the most frequent term of its cluster.

    The distinct values of ``aspects['aspect']`` are embedded with
    ``nlp.pipe`` (each yielded item must expose a ``.vector``), grouped by
    average-linkage agglomerative clustering on cosine distance, and each
    term is then mapped to the term of its cluster that occurs most often
    in ``aspects``.

    Args:
        nlp: Object with a ``pipe(texts)`` method yielding items that have a
            ``.vector`` attribute (presumably a spaCy pipeline with word
            vectors -- confirm against the caller).
        aspects: pandas DataFrame with an ``'aspect'`` column holding one
            row per extracted aspect term occurrence.
        distance_threshold: Cosine-distance cut-off above which clusters are
            not merged. Defaults to the previously hard-coded ``0.2``.

    Returns:
        dict mapping each distinct aspect term to the representative
        (most frequent) term of its cluster. Ties go to the term that sorts
        first. Empty input yields ``{}``; a single distinct term maps to
        itself.
    """
    # Take terms and their counts from the *same* grouped series so that
    # position i always refers to the same term in both sequences (a
    # separate set()/groupby pair can disagree, e.g. on missing values).
    term_counts = aspects.groupby('aspect').size().sort_index()
    # Drop zero-count groups (unobserved categories of a categorical column).
    term_counts = term_counts[term_counts > 0]
    aspect_terms = list(term_counts.index)
    aspect_terms_sizes = term_counts.values

    # Agglomerative clustering needs at least two samples; with fewer terms
    # there is nothing to merge, so every term represents itself.
    if len(aspect_terms) < 2:
        return {term: term for term in aspect_terms}

    # NOTE(review): terms without a vector (all zeros) may be rejected by the
    # cosine metric -- confirm how the caller's pipeline handles OOV terms.
    aspect_terms_vectors = [doc.vector for doc in nlp.pipe(aspect_terms)]

    clusterer_options = {
        'n_clusters': None,
        'linkage': 'average',
        'distance_threshold': distance_threshold,
    }
    try:
        # scikit-learn >= 1.2 spells the distance argument ``metric``;
        # ``affinity`` was removed in 1.4.
        clusterer = AgglomerativeClustering(metric='cosine',
                                            **clusterer_options)
    except TypeError:
        # Older scikit-learn releases only accept ``affinity``.
        clusterer = AgglomerativeClustering(affinity='cosine',
                                            **clusterer_options)

    clusterer.fit(aspect_terms_vectors)
    labels = clusterer.labels_

    term_replacements = {}

    for cluster in range(clusterer.n_clusters_):
        # Positions (ascending) of the terms assigned to this cluster.
        idxs = np.flatnonzero(labels == cluster)

        # argmax returns the first maximum, so ties resolve to the term
        # that sorts first.
        main_term = aspect_terms[idxs[np.argmax(aspect_terms_sizes[idxs])]]

        for i in idxs:
            term_replacements[aspect_terms[i]] = main_term

    return term_replacements