reviews-insights / clustering.py
analytics-jiten's picture
Create clustering.py
f8607ee
raw
history blame
968 Bytes
from sklearn.cluster import AgglomerativeClustering
import numpy as np
def cluster_aspect_terms(nlp, aspects):
    """Map each aspect term to the most frequent term of its cluster.

    Distinct values of ``aspects['aspect']`` are embedded with ``nlp`` and
    grouped by average-linkage agglomerative clustering on cosine distance
    (clusters are merged while their distance stays below 0.2). Within each
    cluster, the term that occurs most often in ``aspects`` becomes the
    representative; ties go to the term that sorts first.

    Args:
        nlp: Object exposing ``pipe(texts)`` that yields items with a
            ``.vector`` attribute (presumably a spaCy ``Language`` pipeline
            with word vectors -- confirm against the caller).
        aspects: pandas DataFrame with an ``'aspect'`` column of terms, one
            row per occurrence.

    Returns:
        dict mapping every distinct aspect term to its representative term.
        A term that cannot be clustered (it is the only term, or its vector
        is all zeros) maps to itself.
    """
    # Take terms and frequencies from ONE grouped Series so position i in
    # both sequences is guaranteed to describe the same term.
    term_counts = aspects.groupby('aspect').size().sort_index()
    aspect_terms = list(term_counts.index)
    aspect_terms_sizes = term_counts.values

    # Start from the identity mapping; clustering below only overrides it.
    term_replacements = {term: term for term in aspect_terms}

    # Agglomerative clustering needs at least two samples.
    if len(aspect_terms) < 2:
        return term_replacements

    aspect_terms_vectors = np.asarray(
        [doc.vector for doc in nlp.pipe(aspect_terms)])

    # Cosine distance is undefined for all-zero vectors (e.g. terms without
    # an embedding) and scikit-learn rejects them, so leave those terms
    # mapped to themselves and cluster only the remainder.
    clusterable = np.flatnonzero(np.any(aspect_terms_vectors != 0, axis=1))
    if len(clusterable) < 2:
        return term_replacements

    params = dict(n_clusters=None, linkage='average', distance_threshold=0.2)
    try:
        # scikit-learn >= 1.2 names the distance parameter ``metric``.
        clusterer = AgglomerativeClustering(metric='cosine', **params)
    except TypeError:
        # Older releases only know the (since removed) ``affinity`` keyword.
        clusterer = AgglomerativeClustering(affinity='cosine', **params)
    clusterer.fit(aspect_terms_vectors[clusterable])

    for cluster in range(clusterer.n_clusters_):
        # Translate positions within the clustered subset back to positions
        # in ``aspect_terms``.
        idxs = clusterable[clusterer.labels_ == cluster]
        main_term = aspect_terms[idxs[np.argmax(aspect_terms_sizes[idxs])]]
        for i in idxs:
            term_replacements[aspect_terms[i]] = main_term
    return term_replacements