Spaces:
Sleeping
Sleeping
from sklearn.feature_extraction.text import TfidfVectorizer | |
import numpy as np | |
import jieba.analyse | |
def extract_keywords_per_cluster(sentences, labels, top_k=5):
    """Return the top TF-IDF terms for every cluster of sentences.

    Sentences are grouped by their label. For each group a fresh
    ``TfidfVectorizer`` (vocabulary capped at 1000 features) is fitted on
    that group's sentences only, and terms are ranked by their mean TF-IDF
    weight across the group.

    Args:
        sentences: Iterable of sentence strings.
        labels: Iterable of hashable cluster labels, paired positionally
            with ``sentences``. Label ``-1`` is skipped (presumably the
            clusterer's noise label -- confirm against the caller).
        top_k: Maximum number of terms to keep per cluster.

    Returns:
        dict mapping each cluster label to a list of up to ``top_k`` terms,
        highest mean weight first. A cluster whose sentences produce no
        vocabulary maps to an empty list.
    """
    # Group once instead of rescanning every sentence for every cluster.
    # Labels that have no paired sentence never create a group, so they
    # cannot reach the vectorizer with an empty document list.
    grouped = {}
    for sentence, label in zip(sentences, labels):
        if label == -1:
            continue
        grouped.setdefault(label, []).append(sentence)

    cluster_keywords = {}
    for label, cluster_sents in grouped.items():
        vectorizer = TfidfVectorizer(max_features=1000)
        try:
            tfidf_matrix = vectorizer.fit_transform(cluster_sents)
        except ValueError as exc:
            # scikit-learn signals "no token survived tokenization" with a
            # ValueError mentioning an empty vocabulary. One such cluster
            # should not abort keyword extraction for all the others.
            if "empty vocabulary" not in str(exc):
                raise
            cluster_keywords[label] = []
            continue

        # Mean weight of each term over the cluster's sentences, flattened
        # from the (1, n_terms) matrix returned by ``mean(axis=0)``.
        scores = np.asarray(tfidf_matrix.mean(axis=0)).ravel()
        terms = np.array(vectorizer.get_feature_names_out())
        ranked = terms[np.argsort(scores)[::-1]]
        cluster_keywords[label] = ranked[:top_k].tolist()
    return cluster_keywords
def summarize_per_cluster(sentences, labels, top_k=3):
    """Return TextRank keywords for every cluster of sentences.

    For each distinct label other than ``-1``, the sentences carrying that
    label are joined with the "。" separator and the joined text is passed
    to ``jieba.analyse.textrank``.

    Args:
        sentences: Sentence strings, paired positionally with ``labels``.
        labels: Hashable cluster labels; ``-1`` is skipped (presumably the
            clusterer's noise label -- confirm against the caller).
        top_k: Forwarded to TextRank as ``topK``.

    Returns:
        dict mapping each cluster label to the list of keywords returned by
        TextRank (requested without weights).
    """
    summaries = {}
    for cluster_id in set(labels):
        if cluster_id != -1:
            members = [
                sent for sent, lab in zip(sentences, labels) if lab == cluster_id
            ]
            joined = "。".join(members)
            ranked = jieba.analyse.textrank(joined, topK=top_k, withWeight=False)
            summaries[cluster_id] = list(ranked)
    return summaries