Commit 07e97de
Parent(s): ff6d08e

Add Gradio app for NER + RE

Changed files:
- app.py  +244 -107
- requirements.txt  +6 -6
app.py
CHANGED
@@ -1,133 +1,270 @@
--- a/app.py
-import gradio as gr
-from transformers import BertTokenizerFast, BertForTokenClassification, BertForSequenceClassification
 import torch
-from …
-…
 import os

-# …
-…
-re_model = BertForSequenceClassification.from_pretrained("bert-base-chinese", num_labels=5)
-re_tokenizer = BertTokenizerFast.from_pretrained("bert-base-chinese")
-
-# Define the label and relation types
-label_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC", "PAD"]
-relation_list = ["no_relation", "per-org", "per-loc", "org-loc", "org-misc"]

-# …
 knowledge_graph = {
-    "entities": …
     "relations": []
 }

-def ner_predict(text):
-    inputs = ner_tokenizer(text, return_tensors="pt", truncation=True)
-    with torch.no_grad():
-        outputs = ner_model(**inputs).logits
-    predictions = torch.argmax(outputs, dim=2)
-    tokens = ner_tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
-    predicted_labels = [label_list[label_id] for label_id in predictions[0].numpy()]
-
-    entities = []
-    current_entity = ""
-    current_label = ""
-    start = None
-    special_tokens = {"[CLS]", "[SEP]", "[PAD]"}
-
-    for idx, (token, label) in enumerate(zip(tokens, predicted_labels)):
-        if token in special_tokens:
-            continue
-        if label.startswith("B-"):
-            if current_entity:
-                entities.append((current_entity, current_label, start, idx))
-            current_entity = token.replace("##", "")
-            current_label = label[2:]
-            start = idx
-        elif label.startswith("I-") and current_label == label[2:]:
-            current_entity += token.replace("##", "")
-        else:
-            if current_entity:
-                entities.append((current_entity, current_label, start, idx))
-            current_entity = ""
-            current_label = ""
-    if current_entity:
-        entities.append((current_entity, current_label, start, len(tokens)))
-    return entities
-
-def re_predict(text, entities):
-    relations = []
-    for i in range(len(entities)):
-        for j in range(len(entities)):
-            if i == j:
-                continue
-            head, tail = entities[i][0], entities[j][0]
-            input_text = f"{head} 和 {tail} 有什么关系?{text}"
-            inputs = re_tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)
-            with torch.no_grad():
-                outputs = re_model(**inputs).logits
-            prediction = torch.argmax(outputs, dim=1).item()
-            if relation_list[prediction] != "no_relation":
-                relations.append((head, tail, relation_list[prediction]))
-    return relations

-def …
-    …

-    # Update the knowledge graph
-    knowledge_graph["entities"] = [(ent[0], ent[1]) for ent in entities]
-    knowledge_graph["relations"] = relations
-
-    return "\n".join(entity_list), "\n".join(relation_list_text)

 def visualize_kg():
-    …
-    net = Network(height="600px", width="100%", notebook=False, directed=True)
     node_map = {}

     for idx, (name, type_) in enumerate(knowledge_graph["entities"]):
         node_map[name] = idx
-        net.add_node(idx, …

     for head, tail, relation in knowledge_graph["relations"]:
         if head in node_map and tail in node_map:
-            net.add_edge(node_map[head], node_map[tail], …

     net.set_options("""
     {
-        "…
     }
     """)

-    net.…
-    …

 if __name__ == "__main__":
-    demo.launch()
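The removed model-driven pipeline is largely intact above. Its BIO merging loop, which turns per-token labels into entity spans, can be exercised standalone with dummy labels (a sketch; no model weights involved):

# Standalone exercise of the BIO merging loop from the removed ner_predict;
# dummy tokens and labels stand in for model output.
tokens = ["[CLS]", "张", "三", "在", "北", "京", "[SEP]"]
labels = ["O", "B-PER", "I-PER", "O", "B-LOC", "I-LOC", "O"]

entities, current_entity, current_label, start = [], "", "", None
special_tokens = {"[CLS]", "[SEP]", "[PAD]"}
for idx, (token, label) in enumerate(zip(tokens, labels)):
    if token in special_tokens:
        continue
    if label.startswith("B-"):
        if current_entity:
            entities.append((current_entity, current_label, start, idx))
        current_entity, current_label, start = token.replace("##", ""), label[2:], idx
    elif label.startswith("I-") and current_label == label[2:]:
        current_entity += token.replace("##", "")
    else:
        if current_entity:
            entities.append((current_entity, current_label, start, idx))
        current_entity, current_label = "", ""
if current_entity:
    entities.append((current_entity, current_label, start, len(tokens)))

print(entities)  # [('张三', 'PER', 1, 3), ('北京', 'LOC', 4, 7)]; a trailing entity closes at len(tokens)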
+++ b/app.py
 import torch
+from transformers import BertTokenizer, BertModel
+import gradio as gr
+import re
 import os
+import json
+import pandas as pd
+import chardet
+from pyvis.network import Network
+import networkx as nx

+# Initialize the model
+model_name = "bert-base-chinese"
+tokenizer = BertTokenizer.from_pretrained(model_name)
+model = BertModel.from_pretrained(model_name)

+# Knowledge-graph data store
 knowledge_graph = {
+    "entities": set(),
     "relations": []
 }

+def update_knowledge_graph(entities, relations):
+    """Update the knowledge-graph data."""
+    for e in entities:
+        knowledge_graph["entities"].add((e['text'], e['type']))
+    for r in relations:
+        knowledge_graph["relations"].append((r['head'], r['tail'], r['relation']))

 def visualize_kg():
+    """Generate an interactive knowledge-graph visualization (returns HTML)."""
+    net = Network(height="600px", width="100%", notebook=True, directed=True)
     node_map = {}

+    # Add nodes
     for idx, (name, type_) in enumerate(knowledge_graph["entities"]):
         node_map[name] = idx
+        net.add_node(idx,
+                     label=name,
+                     title=f"类型:{type_}",
+                     group=type_,
+                     font={"size": 20})

+    # Add edges
     for head, tail, relation in knowledge_graph["relations"]:
         if head in node_map and tail in node_map:
+            net.add_edge(node_map[head], node_map[tail],
+                         label=relation,
+                         arrows='to',
+                         font={"size": 16})

+    # Configure the visualization
     net.set_options("""
     {
+        "nodes": {
+            "scaling": {
+                "min": 20,
+                "max": 40
+            }
+        },
+        "physics": {
+            "stabilization": {
+                "iterations": 200
+            },
+            "barnesHut": {
+                "springLength": 200
+            }
+        },
+        "interaction": {
+            "hover": true,
+            "tooltipDelay": 200
+        }
     }
     """)

+    # Generate the HTML and fix the CDN references
+    html = net.generate_html()
+    html = html.replace('//cdnjs.cloudflare.com', 'https://cdnjs.cloudflare.com')
+    html = html.replace('//unpkg.com', 'https://unpkg.com')
+    return html
+
+
+# ----------- NER and RE extraction logic -----------------
+def ner(text):
+    pattern_name = r"[赵钱孙李周吴郑王冯陈褚卫蒋沈韩杨朱秦尤许何吕施张孔曹严华金魏陶姜][\u4e00-\u9fa5]{1,2}"
+    pattern_id = r"\b[a-zA-Z_][a-zA-Z0-9_]{4,}\b"
+    entities = []
+
+    # Recognize Chinese personal names
+    for match in re.finditer(pattern_name, text):
+        entities.append({
+            "text": match.group(),
+            "start": match.start(),
+            "end": match.end(),
+            "type": "PersonName"
+        })
+
+    # Recognize user IDs
+    for match in re.finditer(pattern_id, text):
+        if not any(e["start"] == match.start() for e in entities):
+            entities.append({
+                "text": match.group(),
+                "start": match.start(),
+                "end": match.end(),
+                "type": "UserID"
+            })
+
+    return sorted(entities, key=lambda x: x["start"])
+
+
+def re_extract(entities, text):
+    relations = []
+    if len(entities) >= 2:
+        for i in range(len(entities) - 1):
+            head = entities[i]["text"]
+            tail = entities[i + 1]["text"]
+            context = text[entities[i]["end"]:entities[i + 1]["start"]]
+
+            # Relation-typing heuristics
+            if "推荐" in context or "找" in context:
+                relation = "recommend"
+            elif "发送" in context or "发给" in context:
+                relation = "send_to"
+            elif "提到" in context or "说" in context:
+                relation = "mention"
+            else:
+                relation = "knows"
+
+            relations.append({
+                "head": head,
+                "tail": tail,
+                "relation": relation
+            })
+    return relations
+
+
+# ----------- Text processing -----------------
+def process_text(text):
+    # Entity recognition
+    entities = ner(text)
+
+    # Relation extraction
+    relations = re_extract(entities, text)
+
+    # Update the knowledge graph
+    update_knowledge_graph(entities, relations)
+
+    # Build the outputs
+    entity_output = "\n".join([f"{e['text']} ({e['type']}) [{e['start']}, {e['end']}]" for e in entities])
+    relation_output = "\n".join([f"{r['head']} --[{r['relation']}]-> {r['tail']}" for r in relations])
+    kg_html = visualize_kg()
+
+    return entity_output, relation_output, gr.HTML(kg_html)
+
+
+# ----------- File processing -----------------
+def detect_encoding(file_path):
+    with open(file_path, 'rb') as f:
+        raw_data = f.read(4096)
+    result = chardet.detect(raw_data)
+    return result['encoding'] if result['encoding'] else 'utf-8'
+
+
+def process_file(file):
+    ext = os.path.splitext(file.name)[-1].lower()
+    full_text = ""
+    warning = ""
+
+    try:
+        encoding = detect_encoding(file.name)
+
+        # Handle the supported file formats
+        if ext == ".txt":
+            with open(file.name, "r", encoding=encoding) as f:
+                full_text = f.read()
+
+        elif ext == ".jsonl":
+            with open(file.name, "r", encoding=encoding) as f:
+                lines = f.readlines()
+            texts = []
+            skipped_lines = []
+            for i, line in enumerate(lines, start=1):
+                try:
+                    obj = json.loads(line)
+                    texts.append(obj.get("text", ""))
+                except Exception:
+                    skipped_lines.append(i)
+            full_text = "\n".join(texts)
+            if skipped_lines:
+                warning = f"⚠️ 跳过 {len(skipped_lines)} 行无效 JSON(如第 {skipped_lines[0]} 行)\n\n"
+
+        elif ext == ".json":
+            with open(file.name, "r", encoding=encoding) as f:
+                data = json.load(f)
+            if isinstance(data, list):
+                full_text = "\n".join([str(item.get("text", "")) for item in data])
+            elif isinstance(data, dict):
+                full_text = data.get("text", "")
+            else:
+                return "❌ JSON 文件格式无法解析", "", gr.HTML()
+
+        elif ext == ".csv":
+            df = pd.read_csv(file.name, encoding=encoding)
+            if "text" in df.columns:
+                full_text = "\n".join(df["text"].astype(str))
+            else:
+                return "❌ CSV 中未找到 'text' 列", "", gr.HTML()
+
+        else:
+            return f"❌ 不支持的文件格式:{ext}", "", gr.HTML()
+
+    except Exception as e:
+        return f"❌ 文件读取错误:{str(e)}", "", gr.HTML()
+
+    # Analyze the text and build the results
+    entity_out, relation_out, kg_html = process_text(full_text)
+    return warning + entity_out, relation_out, kg_html
+
+
+# ----------- Gradio UI -----------------
+with gr.Blocks(
+        css=".kg-container {border: 1px solid #e0e0e0; border-radius: 10px; padding: 20px; margin-top: 20px;}") as demo:
+    gr.Markdown("""# 📱 微信聊天记录智能分析系统
+    **功能**:实体识别(NER) → 关系抽取(RE) → 动态知识图谱""")
+
+    with gr.Tab("✍️ 直接输入文本"):
+        gr.Markdown("## 直接输入聊天内容进行分析")
+        input_text = gr.Textbox(label="输入内容", lines=8,
+                                placeholder="示例:\n张三:推荐李四加入项目组\n王五:把需求文档发送给赵六")
+        analyze_btn = gr.Button("开始分析", variant="primary")
+
+        with gr.Row():
+            entity_output1 = gr.Textbox(label="识别出的实体", interactive=False)
+            relation_output1 = gr.Textbox(label="抽取的关系", interactive=False)
+        kg_html1 = gr.HTML(label="知识图谱展示", elem_classes="kg-container")
+
+        analyze_btn.click(
+            fn=process_text,
+            inputs=[input_text],
+            outputs=[entity_output1, relation_output1, kg_html1]
+        )
+
+    with gr.Tab("📁 上传文件"):
+        gr.Markdown("## 上传聊天记录文件(支持多种格式)")
+        file_input = gr.File(label="选择文件", file_types=[".txt", ".jsonl", ".json", ".csv"])
+        analyze_file_btn = gr.Button("分析文件", variant="primary")
+
+        with gr.Row():
+            entity_output2 = gr.Textbox(label="识别出的实体", interactive=False)
+            relation_output2 = gr.Textbox(label="抽取的关系", interactive=False)
+        kg_html2 = gr.HTML(label="知识图谱展示", elem_classes="kg-container")
+
+        analyze_file_btn.click(
+            fn=process_file,
+            inputs=[file_input],
+            outputs=[entity_output2, relation_output2, kg_html2]
+        )
+
+    with gr.Tab("🗺️ 完整知识图谱"):
+        gr.Markdown("## 动态更新的完整知识图谱")
+        with gr.Row():
+            gr.Markdown("点击按钮刷新查看累计分析结果")
+            refresh_btn = gr.Button("立即刷新", variant="secondary")
+        full_kg = gr.HTML(elem_classes="kg-container")
+        refresh_btn.click(fn=lambda: visualize_kg(), outputs=full_kg)

 if __name__ == "__main__":
+    demo.launch()
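As a sanity check on the new heuristic pipeline, the name regex from `ner` can be run standalone on the placeholder example shown in the UI (a sketch; the printed values follow from the regex itself, not from captured app output):

import re

# The surname-anchored name pattern from ner, applied to the UI placeholder text.
pattern_name = r"[赵钱孙李周吴郑王冯陈褚卫蒋沈韩杨朱秦尤许何吕施张孔曹严华金魏陶姜][\u4e00-\u9fa5]{1,2}"
text = "张三:推荐李四加入项目组"

names = [m.group() for m in re.finditer(pattern_name, text)]
print(names)  # ['张三', '李四加']; the greedy {1,2} overruns the two-character name 李四

# The span between the two matches is ":推荐", which contains "推荐",
# so re_extract would label the pair 张三 --[recommend]-> 李四加.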
requirements.txt
CHANGED
@@ -1,7 +1,7 @@
--- a/requirements.txt
-…
-…
-gradio==…
-pandas==2.…
-chardet==5.…
-networkx==3.2.1
 pyvis==0.3.2

+++ b/requirements.txt
+transformers==4.30.2
+torch==2.0.1
+gradio==3.39.0
+pandas==2.0.3
+chardet==5.1.0
 pyvis==0.3.2
+networkx==3.1
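pyvis stays pinned at 0.3.2, the version whose `generate_html` call `visualize_kg` relies on. A minimal standalone check of that embedding pattern (a sketch, assuming only the pinned packages are installed):

# Minimal check of the pattern used by visualize_kg: build a tiny graph,
# render it to an HTML string, and force https on the protocol-relative CDN
# URLs so the assets resolve wherever the HTML ends up embedded.
from pyvis.network import Network

net = Network(height="600px", width="100%", notebook=True, directed=True)
net.add_node(0, label="张三", group="PersonName")
net.add_node(1, label="李四", group="PersonName")
net.add_edge(0, 1, label="recommend", arrows="to")

html = net.generate_html()
html = html.replace("//cdnjs.cloudflare.com", "https://cdnjs.cloudflare.com")
html = html.replace("//unpkg.com", "https://unpkg.com")
assert "<html>" in html.lower()  # an embeddable page, ready for gr.HTML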