Spaces:
Sleeping
Sleeping
File size: 968 Bytes
f8607ee |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
from sklearn.cluster import AgglomerativeClustering
import numpy as np
def cluster_aspect_terms(nlp, aspects, distance_threshold=0.2):
    """Map every aspect term to the most frequent term of its cluster.

    Distinct values of ``aspects['aspect']`` are embedded with
    ``nlp.pipe(...)`` (each yielded object must expose ``.vector``), then
    grouped by average-linkage agglomerative clustering on cosine distance.
    Within each cluster the term with the highest row count is chosen as
    the representative; ties go to the term that sorts first.

    Args:
        nlp: Object with a ``pipe(texts)`` method yielding items that have
            a ``.vector`` attribute (presumably a spaCy pipeline with word
            vectors -- confirm against the caller).
        aspects: Table supporting ``groupby('aspect').size()`` (presumably
            a pandas DataFrame with an ``aspect`` column).
        distance_threshold: Cosine-distance linkage threshold at or above
            which clusters are not merged. Defaults to 0.2.

    Returns:
        dict mapping each distinct aspect term to its cluster's
        representative term. Empty when there are no terms.
    """
    # Derive terms and their counts from one groupby so the two sequences
    # are guaranteed to have the same length and order.
    counts = aspects.groupby('aspect').size().sort_index()
    aspect_terms = list(counts.index)
    aspect_terms_sizes = counts.values

    # AgglomerativeClustering needs at least two samples; with zero or one
    # term every term is trivially its own representative.
    if len(aspect_terms) < 2:
        return {term: term for term in aspect_terms}

    # NOTE(review): cosine distance is undefined for all-zero vectors;
    # confirm the pipeline provides a vector for every term.
    aspect_terms_vectors = [doc.vector for doc in nlp.pipe(aspect_terms)]

    shared_kwargs = dict(n_clusters=None,
                         linkage='average',
                         distance_threshold=distance_threshold)
    try:
        # scikit-learn >= 1.2 names the distance parameter `metric`.
        clusterer = AgglomerativeClustering(metric='cosine', **shared_kwargs)
    except TypeError:
        # Older scikit-learn only accepts the legacy `affinity` keyword.
        clusterer = AgglomerativeClustering(affinity='cosine', **shared_kwargs)
    clusterer.fit(aspect_terms_vectors)

    term_replacements = {}
    for cluster in range(clusterer.n_clusters_):
        idxs = np.nonzero(clusterer.labels_ == cluster)[0]
        # argmax returns the first maximum, so ties resolve to the term
        # that comes first in sorted order.
        main_term = aspect_terms[idxs[np.argmax(aspect_terms_sizes[idxs])]]
        for i in idxs:
            term_replacements[aspect_terms[i]] = main_term
    return term_replacements
|