strongeryongchao committed on
Commit
5ddcb1d
·
1 Parent(s): 869d101

the first update

Browse files
Files changed (5) hide show
  1. README.md +7 -11
  2. app.py +49 -0
  3. clustering_utils.py +16 -0
  4. requirements.txt +7 -0
  5. viz_utils.py +40 -0
README.md CHANGED
@@ -1,12 +1,8 @@
1
- ---
2
- title: Sentcluster
3
- emoji: 📉
4
- colorFrom: gray
5
- colorTo: green
6
- sdk: gradio
7
- sdk_version: 5.34.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
1
+ # 中文句子语义聚类空间
 
 
 
 
 
 
 
 
 
2
 
3
+ 使用 [text2vec-bge-large-chinese](https://huggingface.co/shibing624/text2vec-bge-large-chinese) + HDBSCAN 实现中文句子聚类分析。支持 Silhouette、DB 指数评估,可视化展示与 CSV 导出。
4
+
5
+ ## 使用说明
6
+ - 输入文本或上传 txt 文件
7
+ - 自动编码并聚类
8
+ - 展示评分、ECharts 图、聚类结果
app.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from clustering_utils import cluster_sentences
3
+ from viz_utils import generate_force_graph, generate_bubble_chart, generate_umap_plot
4
+ import pandas as pd
5
+
6
def _read_uploaded_text(file_obj):
    """Return the text content of an uploaded file as ``str``.

    Accepts a filesystem path (``str`` / ``os.PathLike``), an open
    file-like object exposing ``read()``, or any object carrying the path
    in a ``name`` attribute. Bytes are decoded as UTF-8; a leading byte
    order mark is dropped so it does not end up glued to the first sentence.
    """
    import os

    if isinstance(file_obj, (str, os.PathLike)):
        with open(file_obj, "rb") as handle:
            raw = handle.read()
    elif hasattr(file_obj, "read"):
        raw = file_obj.read()
    else:
        with open(file_obj.name, "rb") as handle:
            raw = handle.read()

    if isinstance(raw, (bytes, bytearray)):
        return bytes(raw).decode("utf-8-sig")
    return raw.lstrip("\ufeff")


def process_input(sentences, file_obj):
    """Cluster the submitted sentences and build every output of the UI.

    Args:
        sentences: Text typed by the user, one sentence per line (a list of
            strings is also accepted). May be ``None`` or empty.
        file_obj: Optional uploaded file. When truthy it takes precedence
            over ``sentences``. It may be a path or a file-like object.

    Returns:
        A 5-tuple ``(status, dataframe, force_data, bubble_data, umap_data)``.
        When there is nothing to cluster, the status is a prompt message and
        the other four items are ``None``.
    """
    if file_obj:
        # The upload may arrive as a path string rather than an open
        # binary file, so reading is delegated to a helper that handles both.
        sentences = _read_uploaded_text(file_obj).splitlines()
    elif isinstance(sentences, str):
        sentences = sentences.splitlines()

    # `or []` keeps a missing text value (None) from raising on iteration.
    sentences = [s.strip() for s in (sentences or []) if s and s.strip()]
    if not sentences:
        return "请输入句子或上传文件", None, None, None, None

    labels, embeddings, scores = cluster_sentences(sentences)
    df = pd.DataFrame({
        "句子": sentences,
        "簇ID": labels,
    })
    force_data = generate_force_graph(sentences, labels)
    bubble_data = generate_bubble_chart(sentences, labels)
    umap_data = generate_umap_plot(embeddings, labels)
    status = f"✅ Silhouette: {scores['silhouette']:.4f}, DB: {scores['db']:.4f}"
    return status, df, force_data, bubble_data, umap_data
23
+
24
def export_csv(df):
    """Write the clustering table to a CSV file and reveal the download slot.

    Args:
        df: The table currently shown in the results component
            (presumably a pandas DataFrame - the code calls ``to_csv`` on it).

    Returns:
        A component update for the download file: hidden and empty when
        there is no table, otherwise visible and pointing at a new CSV.
    """
    import tempfile

    if df is None or df.empty:
        return gr.update(value=None, visible=False)

    # A unique temp file per export: a single fixed name in the working
    # directory would be overwritten by concurrent sessions.
    # NOTE: files are created with delete=False and are not cleaned up here.
    # utf-8-sig writes a BOM so spreadsheet tools detect the encoding of the
    # Chinese text.
    with tempfile.NamedTemporaryFile(
        mode="w",
        suffix=".csv",
        prefix="cluster_result_",
        delete=False,
        encoding="utf-8-sig",
        newline="",
    ) as handle:
        df.to_csv(handle, index=False)
    # The download component starts hidden; it must be made visible here or
    # the exported file can never be reached from the page.
    return gr.update(value=handle.name, visible=True)


# Page layout: inputs on top, then status, result table, one tab per chart
# payload, and a download slot that stays hidden until a result exists.
with gr.Blocks(title="Text2Vec 语义聚类") as demo:
    gr.Markdown("## 🧠 中文句子语义聚类(HDBSCAN + BGE)")
    with gr.Row():
        txt_input = gr.Textbox(lines=5, label="批量输入句子(每行一句)")
        file_input = gr.File(label="或上传 .txt 文件")
    run_btn = gr.Button("开始聚类")
    status = gr.Markdown()
    df_output = gr.Dataframe(label="聚类结果", interactive=False)
    with gr.Tabs():
        with gr.Tab("力导图"):
            force_plot = gr.JSON(label="Echarts 数据")
        with gr.Tab("气泡图"):
            bubble_plot = gr.JSON(label="Echarts 数据")
        with gr.Tab("UMAP分布图"):
            umap_plot = gr.JSON(label="Echarts 数据")
    csv_btn = gr.File(label="下载 CSV", visible=False)

    run_btn.click(
        process_input,
        inputs=[txt_input, file_input],
        outputs=[status, df_output, force_plot, bubble_plot, umap_plot],
    )
    # Re-export whenever the result table changes.
    df_output.change(export_csv, inputs=[df_output], outputs=[csv_btn])

# Only start the server when run as a script, so importing this module
# (e.g. from a test) does not launch a web server.
if __name__ == "__main__":
    demo.launch()
clustering_utils.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer
2
+ import hdbscan
3
+ from sklearn.metrics import silhouette_score, davies_bouldin_score
4
+
5
# Loaded once at import time so every request reuses the same encoder.
model = SentenceTransformer("shibing624/text2vec-bge-large-chinese")


def cluster_sentences(sentences, min_cluster_size=3):
    """Embed sentences, cluster them with HDBSCAN, and score the result.

    Args:
        sentences: List of sentence strings to cluster.
        min_cluster_size: Smallest group HDBSCAN may report as a cluster.
            Defaults to 3.

    Returns:
        ``(labels, embeddings, scores)``. ``labels`` holds one cluster id per
        sentence, with -1 marking noise. ``scores`` is a dict with keys
        ``"silhouette"`` and ``"db"``; both are -1 when the metrics cannot be
        computed (fewer than two real clusters).
    """
    import numpy as np

    embeddings = model.encode(sentences, normalize_embeddings=True)
    unscored = {"silhouette": -1, "db": -1}

    # With fewer points than min_cluster_size no cluster can exist, so skip
    # the clusterer and report every sentence as noise.
    if len(sentences) < min_cluster_size:
        return np.full(len(sentences), -1, dtype=int), embeddings, unscored

    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size, prediction_data=True
    )
    labels = clusterer.fit_predict(embeddings)

    # Noise (-1) is not a cluster; scoring it as one distorts both metrics,
    # so only points assigned to a real cluster are evaluated.
    clustered = labels != -1
    cluster_count = len(set(labels[clustered].tolist()))
    point_count = int(clustered.sum())
    # Both metrics need at least two clusters and more points than clusters.
    if cluster_count < 2 or point_count <= cluster_count:
        return labels, embeddings, unscored

    try:
        sil = silhouette_score(embeddings[clustered], labels[clustered])
        db = davies_bouldin_score(embeddings[clustered], labels[clustered])
    except ValueError:
        # Raised by scikit-learn for label layouts it cannot score.
        return labels, embeddings, unscored
    return labels, embeddings, {"silhouette": sil, "db": db}
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ sentence-transformers
3
+ hdbscan
4
+ scikit-learn
5
+ umap-learn
6
+ numpy
7
+ pandas
viz_utils.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import umap
3
+ from sklearn.preprocessing import MinMaxScaler
4
+ from collections import defaultdict
5
+ import random
6
+
7
def color_for_label(label):
    """Return a stable ``"rgb(r, g, b)"`` colour string for a cluster label.

    The same label always yields the same colour. Each channel is drawn from
    the range 50..200 inclusive.

    Args:
        label: Integer cluster id. Plain ``int`` and integer-like scalars
            such as ``numpy.int64`` are accepted; -1 is valid.

    Returns:
        A CSS-style colour string, e.g. ``"rgb(120, 64, 187)"``.
    """
    # A private generator is used instead of random.seed(): seeding the
    # module-level RNG would reset random state for the whole process, and
    # random.seed() rejects non-int scalars (e.g. numpy integers) on
    # Python 3.11+. int() normalises the label first.
    rng = random.Random(int(label) + 1000)
    red = rng.randint(50, 200)
    green = rng.randint(50, 200)
    blue = rng.randint(50, 200)
    return f"rgb({red}, {green}, {blue})"
10
+
11
def generate_force_graph(sentences, labels):
    """Build the node/link payload for a force-directed graph of clusters.

    Every sentence becomes a node coloured by its cluster. Sentences that
    share a cluster are connected pairwise. Noise points (label -1) get a
    node but no links, since they do not belong to any cluster.

    Args:
        sentences: Sentence strings, used as node names.
        labels: One integer cluster id per sentence (-1 marks noise).

    Returns:
        ``{"type": "force", "nodes": [...], "links": [...]}`` where each link
        refers to its endpoints by sentence text.
    """
    from itertools import combinations

    noise_label = -1
    nodes = []
    members = {}
    for index, (sentence, raw_label) in enumerate(zip(sentences, labels)):
        # Normalise to a plain int so colouring and grouping do not depend
        # on the scalar type the clusterer returned.
        label = int(raw_label)
        nodes.append({
            "name": sentence,
            "symbolSize": 10,
            "category": label,
            "itemStyle": {"color": color_for_label(label)},
        })
        members.setdefault(label, []).append(index)

    # NOTE: links address nodes by name, so duplicate sentences share
    # endpoints.
    links = [
        {"source": sentences[first], "target": sentences[second]}
        for label, group in members.items()
        if label != noise_label
        for first, second in combinations(group, 2)
    ]
    return {"type": "force", "nodes": nodes, "links": links}
26
+
27
def generate_bubble_chart(sentences, labels):
    """Build the payload for a bubble chart of cluster sizes.

    Args:
        sentences: Unused; kept so the signature matches the other chart
            builders.
        labels: One integer cluster id per sentence (-1 marks noise).

    Returns:
        ``{"type": "bubble", "series": [{"type": "scatter", "data": [...]}]}``
        with one entry per distinct label, in order of first appearance.
        Each entry carries the label name, its member count, and its colour.
    """
    from collections import Counter

    # Labels are normalised to plain ints so the colour lookup does not
    # depend on the scalar type the clusterer returned.
    sizes = Counter(int(label) for label in labels)
    data = [
        {
            "name": f"簇{label}",
            "value": size,
            "itemStyle": {"color": color_for_label(label)},
        }
        for label, size in sizes.items()
    ]
    return {"type": "bubble", "series": [{"type": "scatter", "data": data}]}
33
+
34
def generate_umap_plot(embeddings, labels):
    """Project embeddings to 2-D and build a scatter payload.

    Coordinates are min-max scaled to the unit square. With fewer than three
    points no projection is attempted and the points are placed on a
    horizontal line instead.

    Args:
        embeddings: One embedding vector per sentence.
        labels: One integer cluster id per sentence (-1 marks noise).

    Returns:
        ``{"type": "scatter", "series": [{"data": [...]}]}`` where each point
        has ``x``, ``y``, ``label`` and an ``itemStyle`` colour.
    """
    sample_count = len(embeddings)

    if sample_count < 3:
        # UMAP needs at least two neighbours per point, which is impossible
        # with one or two samples - use a trivial layout instead of failing.
        if sample_count == 1:
            coords = [(0.5, 0.5)]
        else:
            coords = [(float(i), 0.5) for i in range(sample_count)]
    else:
        # Keep n_neighbors below the sample count (15 is UMAP's default).
        # For a handful of points use random initialisation:
        # NOTE(review): spectral init appears unreliable on very small
        # inputs; the threshold of 10 is a conservative guess - confirm.
        reducer = umap.UMAP(
            n_components=2,
            n_neighbors=min(15, sample_count - 1),
            init="random" if sample_count < 10 else "spectral",
        )
        projected = reducer.fit_transform(embeddings)
        coords = MinMaxScaler().fit_transform(projected)

    data = []
    for (x, y), raw_label in zip(coords, labels):
        # Plain ints keep the payload JSON-friendly and the colour lookup
        # independent of the scalar type the clusterer returned.
        label = int(raw_label)
        data.append({
            "x": float(x),
            "y": float(y),
            "label": label,
            "itemStyle": {"color": color_for_label(label)},
        })
    return {"type": "scatter", "series": [{"data": data}]}