Spaces:
Sleeping
Sleeping
from sklearn.cluster import AgglomerativeClustering | |
import numpy as np | |
def cluster_aspect_terms(nlp, aspects, *, distance_threshold=0.2):
    """Map every aspect term to the most frequent term of its cluster.

    The distinct values of ``aspects['aspect']`` are embedded via
    ``nlp.pipe`` (the ``.vector`` of each yielded item is used) and grouped
    with average-linkage agglomerative clustering on cosine distance. Within
    each cluster, the term that occurs most often in ``aspects`` becomes the
    representative; ties go to the term that sorts first.

    Args:
        nlp: Object exposing ``pipe(texts)`` whose items have a ``.vector``
            attribute (presumably a spaCy pipeline with word vectors --
            confirm against the caller).
        aspects: pandas DataFrame with an ``aspect`` column holding one row
            per extracted aspect mention.
        distance_threshold: Linkage distance at or above which clusters are
            not merged. Defaults to the previously hard-coded ``0.2``.

    Returns:
        dict mapping each distinct aspect term to its cluster's
        representative term. Empty when ``aspects`` has no terms.
    """
    # Take terms and their counts from the same groupby result so the two
    # stay index-aligned (groupby skips missing values by default, whereas a
    # set built from the raw column would keep them).
    term_counts = aspects.groupby('aspect').size().sort_index()
    aspect_terms = list(term_counts.index)
    aspect_terms_sizes = term_counts.values

    # Clustering needs at least two samples; with zero or one term every
    # term is trivially its own representative.
    if len(aspect_terms) < 2:
        return {term: term for term in aspect_terms}

    aspect_terms_vectors = [doc.vector for doc in nlp.pipe(aspect_terms)]

    clustering_options = {
        'n_clusters': None,
        'linkage': 'average',
        'distance_threshold': distance_threshold,
    }
    try:
        # scikit-learn >= 1.2 names the distance parameter ``metric``.
        clusterer = AgglomerativeClustering(metric='cosine',
                                            **clustering_options)
    except TypeError:
        # Older scikit-learn releases only know the ``affinity`` keyword.
        clusterer = AgglomerativeClustering(affinity='cosine',
                                            **clustering_options)
    clusterer.fit(aspect_terms_vectors)

    term_replacements = {}
    for cluster in range(clusterer.n_clusters_):
        # Positions (ascending) of the terms assigned to this cluster.
        idxs = np.nonzero(clusterer.labels_ == cluster)[0]
        terms = [aspect_terms[i] for i in idxs]
        # argmax returns the first maximum, so ties keep sorted order.
        main_term = terms[np.argmax(aspect_terms_sizes[idxs])]
        for term in terms:
            term_replacements[term] = main_term
    return term_replacements