Spaces:
Sleeping
Sleeping
Commit
·
5ddcb1d
1
Parent(s):
869d101
the first update
Browse files- README.md +7 -11
- app.py +49 -0
- clustering_utils.py +16 -0
- requirements.txt +7 -0
- viz_utils.py +40 -0
README.md
CHANGED
@@ -1,12 +1,8 @@
|
|
1 |
-
|
2 |
-
title: Sentcluster
|
3 |
-
emoji: 📉
|
4 |
-
colorFrom: gray
|
5 |
-
colorTo: green
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 5.34.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
---
|
11 |
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# 中文句子语义聚类空间
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
+
使用 [text2vec-bge-large-chinese](https://huggingface.co/shibing624/text2vec-bge-large-chinese) 生成句向量,并通过 HDBSCAN 实现中文句子聚类分析。支持 Silhouette 系数与 Davies-Bouldin(DB)指数评估、可视化展示与 CSV 导出。
|
4 |
+
|
5 |
+
## 使用说明
|
6 |
+
- 输入文本或上传 txt 文件
|
7 |
+
- 自动编码并聚类
|
8 |
+
- 展示评分、ECharts 图、聚类结果
|
app.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from clustering_utils import cluster_sentences
|
3 |
+
from viz_utils import generate_force_graph, generate_bubble_chart, generate_umap_plot
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
def process_input(sentences, file_obj):
    """Cluster the submitted sentences and build every output shown in the UI.

    Args:
        sentences: Text typed by the user, one sentence per line. Ignored
            when ``file_obj`` is provided.
        file_obj: Optional uploaded text file. Accepted as a path string, an
            object exposing its path as ``.name``, or a readable file-like
            object, because the upload component may deliver any of these.

    Returns:
        A 5-tuple ``(status, table, force_data, bubble_data, umap_data)``.
        When no non-blank sentence is available, ``status`` is a prompt
        asking for input and the remaining four items are ``None``.
    """
    if file_obj:
        if hasattr(file_obj, "read"):
            raw = file_obj.read()
        else:
            # The upload may arrive as a path rather than an open file.
            path = file_obj if isinstance(file_obj, str) else file_obj.name
            with open(path, "rb") as handle:
                raw = handle.read()
        if isinstance(raw, bytes):
            # "utf-8-sig" decodes plain UTF-8 and also drops a leading BOM,
            # which would otherwise end up inside the first sentence.
            raw = raw.decode("utf-8-sig")
        sentences = raw.splitlines()
    elif isinstance(sentences, str):
        sentences = sentences.strip().splitlines()

    # Drop blank lines; `or []` keeps a missing text input from crashing.
    sentences = [s.strip() for s in (sentences or []) if s.strip()]
    if not sentences:
        return "请输入句子或上传文件", None, None, None, None

    labels, embeddings, scores = cluster_sentences(sentences)
    df = pd.DataFrame({
        "句子": sentences,
        "簇ID": labels,
    })
    force_data = generate_force_graph(sentences, labels)
    bubble_data = generate_bubble_chart(sentences, labels)
    umap_data = generate_umap_plot(embeddings, labels)
    status = f"✅ Silhouette: {scores['silhouette']:.4f}, DB: {scores['db']:.4f}"
    return status, df, force_data, bubble_data, umap_data
|
23 |
+
|
24 |
+
# Build the page layout and wire the event handlers.
with gr.Blocks(title="Text2Vec 语义聚类") as demo:
    gr.Markdown("## 🧠 中文句子语义聚类(HDBSCAN + BGE)")

    # Two alternative inputs: pasted text or an uploaded text file.
    with gr.Row():
        txt_input = gr.Textbox(lines=5, label="批量输入句子(每行一句)")
        file_input = gr.File(label="或上传 .txt 文件")

    run_btn = gr.Button("开始聚类")
    status = gr.Markdown()
    df_output = gr.Dataframe(label="聚类结果", interactive=False)

    # Each tab shows the raw chart payload as JSON.
    with gr.Tabs():
        with gr.Tab("力导图"):
            force_plot = gr.JSON(label="Echarts 数据")
        with gr.Tab("气泡图"):
            bubble_plot = gr.JSON(label="Echarts 数据")
        with gr.Tab("UMAP分布图"):
            umap_plot = gr.JSON(label="Echarts 数据")

    # Hidden until a result table exists; export_csv reveals it.
    csv_btn = gr.File(label="下载 CSV", visible=False)

    def export_csv(df):
        """Write the result table to a CSV file and reveal the download box.

        Args:
            df: The table currently held by ``df_output``.

        Returns:
            A component update for ``csv_btn``: the CSV path with
            ``visible=True``, or a cleared and hidden component when there
            is nothing to export.
        """
        import os
        import tempfile

        if df is None or df.empty:
            return gr.update(value=None, visible=False)

        # A fresh directory per export keeps concurrent sessions from
        # overwriting each other's file.
        csv_path = os.path.join(tempfile.mkdtemp(), "cluster_result.csv")
        # The BOM lets spreadsheet tools detect UTF-8 for the Chinese text.
        df.to_csv(csv_path, index=False, encoding="utf-8-sig")
        return gr.update(value=csv_path, visible=True)

    run_btn.click(
        process_input,
        inputs=[txt_input, file_input],
        outputs=[status, df_output, force_plot, bubble_plot, umap_plot],
    )
    # Regenerate the CSV whenever the result table changes.
    df_output.change(export_csv, inputs=[df_output], outputs=[csv_btn])

demo.launch()
|
clustering_utils.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sentence_transformers import SentenceTransformer
|
2 |
+
import hdbscan
|
3 |
+
from sklearn.metrics import silhouette_score, davies_bouldin_score
|
4 |
+
|
5 |
+
# Sentence-embedding model, created once at import time; cluster_sentences
# below reuses this single instance for every call.
model = SentenceTransformer("shibing624/text2vec-bge-large-chinese")
|
6 |
+
|
7 |
+
def cluster_sentences(sentences):
    """Embed sentences, cluster them with HDBSCAN and score the result.

    Args:
        sentences: Sequence of sentence strings to cluster.

    Returns:
        A tuple ``(labels, embeddings, scores)``. ``labels`` holds one
        cluster id per sentence, with ``-1`` marking HDBSCAN noise points.
        ``embeddings`` are the normalised sentence vectors. ``scores`` is a
        dict with keys ``"silhouette"`` and ``"db"``; a value of ``-1``
        means the metric could not be computed.
    """
    embeddings = model.encode(sentences, normalize_embeddings=True)
    clusterer = hdbscan.HDBSCAN(min_cluster_size=3, prediction_data=True)
    labels = clusterer.fit_predict(embeddings)

    # Noise points (label -1) are not a cluster, so scoring them as one
    # would distort both metrics. Score only the clustered points.
    clustered = labels != -1
    kept_labels = labels[clustered]
    n_clusters = len(set(kept_labels.tolist()))

    sil, db = -1, -1
    # Both metrics require 2 <= n_clusters <= n_samples - 1.
    if 2 <= n_clusters <= len(kept_labels) - 1:
        kept_embeddings = embeddings[clustered]
        # Score each metric on its own so one failure does not discard
        # the other.
        try:
            sil = silhouette_score(kept_embeddings, kept_labels)
        except ValueError:
            sil = -1
        try:
            db = davies_bouldin_score(kept_embeddings, kept_labels)
        except ValueError:
            db = -1
    return labels, embeddings, {"silhouette": sil, "db": db}
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio>=4.0.0
|
2 |
+
sentence-transformers
|
3 |
+
hdbscan
|
4 |
+
scikit-learn
|
5 |
+
umap-learn
|
6 |
+
numpy
|
7 |
+
pandas
|
viz_utils.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import umap
|
3 |
+
from sklearn.preprocessing import MinMaxScaler
|
4 |
+
from collections import defaultdict
|
5 |
+
import random
|
6 |
+
|
7 |
+
def color_for_label(label):
    """Return a deterministic ``rgb(r, g, b)`` colour string for a label.

    The same label always yields the same colour. A private generator is
    used so the process-wide ``random`` state is left untouched.

    Args:
        label: Integer cluster id; NumPy integer scalars are accepted.

    Returns:
        A string such as ``"rgb(120, 87, 199)"`` with each channel in
        the range 50..200.
    """
    # int() is needed because newer Python versions reject NumPy integer
    # scalars as seeds.
    rng = random.Random(int(label) + 1000)
    red = rng.randint(50, 200)
    green = rng.randint(50, 200)
    blue = rng.randint(50, 200)
    return f"rgb({red}, {green}, {blue})"
|
10 |
+
|
11 |
+
def generate_force_graph(sentences, labels):
    """Build the node/link payload for a force-directed graph.

    Every sentence becomes a node coloured by its label, and every pair of
    sentences sharing a label is joined by one link.

    Args:
        sentences: Indexable sequence of sentence strings.
        labels: Cluster label for each sentence, in the same order.

    Returns:
        A dict with keys ``"type"`` (always ``"force"``), ``"nodes"`` and
        ``"links"``.
    """
    members = defaultdict(list)
    nodes = []
    for idx, (text, cluster) in enumerate(zip(sentences, labels)):
        colour = color_for_label(cluster)
        node = {
            "name": text,
            "symbolSize": 10,
            "category": int(cluster),
            "itemStyle": {"color": colour},
        }
        nodes.append(node)
        members[cluster].append(idx)

    # Indices within a group are stored in increasing order, so pairing each
    # member with the ones after it yields every pair exactly once.
    links = [
        {"source": sentences[first], "target": sentences[second]}
        for group in members.values()
        for pos, first in enumerate(group)
        for second in group[pos + 1:]
    ]
    return {"type": "force", "nodes": nodes, "links": links}
|
26 |
+
|
27 |
+
def generate_bubble_chart(sentences, labels):
    """Build the payload for a bubble chart of cluster sizes.

    Args:
        sentences: Unused; present so the signature matches the other
            chart builders.
        labels: Cluster label for each sentence.

    Returns:
        A dict with ``"type": "bubble"`` and a single scatter series whose
        data holds one entry per distinct label, in first-seen order, with
        the number of sentences carrying that label as its value.
    """
    sizes = {}
    for cluster in labels:
        sizes[cluster] = sizes.get(cluster, 0) + 1

    bubbles = []
    for cluster, size in sizes.items():
        bubbles.append({
            "name": f"簇{cluster}",
            "value": size,
            "itemStyle": {"color": color_for_label(cluster)},
        })
    return {"type": "bubble", "series": [{"type": "scatter", "data": bubbles}]}
|
33 |
+
|
34 |
+
def generate_umap_plot(embeddings, labels):
    """Project embeddings to 2-D with UMAP and build a scatter payload.

    Args:
        embeddings: Sentence vectors, one row per sentence.
        labels: Cluster label for each row of ``embeddings``.

    Returns:
        A dict with ``"type": "scatter"`` and one series whose data holds a
        point per sentence: min-max scaled ``x``/``y`` coordinates, the
        integer label and the label's colour.
    """
    projection = umap.UMAP(n_components=2).fit_transform(embeddings)
    normalised = MinMaxScaler().fit_transform(projection)

    points = []
    for coords, cluster in zip(normalised, labels):
        x_val, y_val = coords
        points.append({
            "x": float(x_val),
            "y": float(y_val),
            "label": int(cluster),
            "itemStyle": {"color": color_for_label(cluster)},
        })
    return {"type": "scatter", "series": [{"data": points}]}
|