Commit 26ec260
Parent(s): 1a6560a
add app.py and requirements.txt

Files changed:
- app.py +107 -186
- requirements.txt +5 -4
app.py CHANGED
@@ -8,32 +8,20 @@ import pandas as pd
 import chardet
 from pyvis.network import Network
 import time
 
-#
 bert_model_name = "bert-base-chinese"
 bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name)
 bert_model = BertModel.from_pretrained(bert_model_name)
 
-# 加载中文模型 ChatGLM3-6B
 chatglm_model_name = "THUDM/chatglm3-6b"
-chatglm_tokenizer = AutoTokenizer.from_pretrained(
-    chatglm_model_name,
-    trust_remote_code=True
-)
-chatglm_model = AutoModel.from_pretrained(
-    chatglm_model_name,
-    trust_remote_code=True,
-    device_map="auto",
-    torch_dtype=torch.float16
-).eval()
-
-# 知识图谱数据存储
-knowledge_graph = {
-    "entities": set(),
-    "relations": []
-}
 
 
 def update_knowledge_graph(entities, relations):
     for e in entities:
         if isinstance(e, dict) and 'text' in e and 'type' in e:
@@ -42,232 +30,165 @@ def update_knowledge_graph(entities, relations):
         if isinstance(r, dict) and all(k in r for k in ("head", "tail", "relation")):
             knowledge_graph["relations"].append((r['head'], r['tail'], r['relation']))
 
-
 def visualize_kg():
     net = Network(height="600px", width="100%", notebook=True, directed=True)
     node_map = {}
     idx = 0
     for ent in knowledge_graph["entities"]:
…
-                     label=name,
-                     title=f"类型:{type_}",
-                     group=type_,
-                     font={'size': 20, 'face': 'SimHei'})
-        idx += 1
-
     seen_edges = set()
     for head, tail, relation in knowledge_graph["relations"]:
         if head in node_map and tail in node_map:
             edge_key = f"{head}-{tail}-{relation}"
             if edge_key not in seen_edges:
-                net.add_edge(node_map[head], node_map[tail],
-                             label=relation,
-                             arrows='to',
-                             font={'size': 14})
                 seen_edges.add(edge_key)
-
…
-                "max": 40
-            }
-        },
-        "physics": {
-            "stabilization": {
-                "enabled": true,
-                "iterations": 200,
-                "updateInterval": 25
-            },
-            "barnesHut": {
-                "gravitationalConstant": -2000,
-                "springLength": 150
-            }
-        },
-        "interaction": {
-            "hover": true,
-            "tooltipDelay": 200
-        }
-    }
-    """)
-
-    html = net.generate_html()
-    html = html.replace('//cdnjs.cloudflare.com', 'https://cdnjs.cloudflare.com')
-    html = html.replace('//unpkg.com', 'https://unpkg.com')
     return f'<div class="kg-graph">{html}</div>'
 
-
 def ner(text, model_type="bert"):
     start_time = time.time()
     if model_type == "bert":
-        # BERT 中文实体识别(原逻辑保留)
         name_pattern = r"([\u4e00-\u9fa5]{2,4})(?![的等地得啦啊哦])"
-        id_pattern = r"(?<!\S)([a-zA-Z_][a-zA-Z0-9_]{4,})(?![
     else:
-
-        response, _ = chatglm_model.chat(
-            chatglm_tokenizer,
-            f"请从以下文本中识别所有实体,用JSON格式返回:[{text}]",
-            temperature=0.1
-        )
     try:
         entities = json.loads(response)
         return entities, time.time() - start_time
     except:
-
-
-        # 如果模型响应失败,使用备用正则
-        name_pattern = r"([\\u4e00-\\u9fa5]{2,4})(?![的等地得啦啊哦])"
-        id_pattern = r"(?<!\S)([a-zA-Z_][a-zA-Z0-9_]{4,})"
 
     entities = []
     occupied = set()
-
     def is_occupied(start, end):
         return any(s <= start < e or s < end <= e for s, e in occupied)
 
     for match in re.finditer(name_pattern, text):
         start, end = match.start(1), match.end(1)
         if not is_occupied(start, end):
-            entities.append({
-                "text": match.group(1),
-                "start": start,
-                "end": end,
-                "type": "人名"
-            })
             occupied.add((start, end))
 
     for match in re.finditer(id_pattern, text):
         start, end = match.start(1), match.end(1)
         if not is_occupied(start, end):
-            entities.append({
-                "text": match.group(1),
-                "start": start,
-                "end": end,
-                "type": "用户ID"
-            })
             occupied.add((start, end))
 
-
-    return entities, processing_time
-
 
 def re_extract(entities, text):
     relations = []
     if len(entities) < 2:
         return relations
-
-    # 使用ChatGLM分析关系
     entity_list = [e['text'] for e in entities]
     prompt = f"分析以下实体之间的关系:{entity_list}\n文本上下文:{text}"
     response, _ = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.1)
-
     try:
         relations = json.loads(response)
     except:
-        # 备用简单关系生成
         for i in range(len(entities)):
             for j in range(i + 1, len(entities)):
-                relations.append({
-                    "head": entities[i]['text'],
-                    "tail": entities[j]['text'],
-                    "relation": "相关"
-                })
-
     return relations
 
-
 def process_text(text, model_type="bert"):
…
-            f"{e['text']} ({e['type']}) [{e['start']}-{e['end']}]"
-            for e in entities
-        )
-        relation_output = "\n".join(
-            f"{r['head']} --[{r['relation']}]-> {r['tail']}"
-            for r in relations
-        )
-        kg_html = visualize_kg()
-
-        return entity_output, relation_output, gr.HTML(kg_html), f"处理时间:{processing_time:.2f}秒"
-
-    except Exception as e:
-        return f"处理出错: {str(e)}", "", gr.HTML(), ""
-
 
 def process_file(file, model_type="bert"):
…
 with gr.Blocks(css=css) as demo:
-    gr.Markdown("# 🚀
 
     with gr.Tab("✍️ 文本分析"):
-        input_text = gr.Textbox(label="输入内容"
…
-        kg_output = gr.HTML(label="知识图谱")
-        time_output = gr.Textbox(label="处理时间")
 
     with gr.Tab("📄 文件分析"):
-        file_input = gr.File(
…

 import chardet
 from pyvis.network import Network
 import time
+from sklearn.metrics import precision_score, recall_score, f1_score
 
+# ==== 模型初始化 ====
 bert_model_name = "bert-base-chinese"
 bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name)
 bert_model = BertModel.from_pretrained(bert_model_name)
 
 chatglm_model_name = "THUDM/chatglm3-6b"
+chatglm_tokenizer = AutoTokenizer.from_pretrained(chatglm_model_name, trust_remote_code=True)
+chatglm_model = AutoModel.from_pretrained(chatglm_model_name, trust_remote_code=True, device_map="auto", torch_dtype=torch.float16).eval()
 
+knowledge_graph = {"entities": set(), "relations": []}
 
+# ==== 核心处理函数 ====
 def update_knowledge_graph(entities, relations):
     for e in entities:
         if isinstance(e, dict) and 'text' in e and 'type' in e:
…
         if isinstance(r, dict) and all(k in r for k in ("head", "tail", "relation")):
             knowledge_graph["relations"].append((r['head'], r['tail'], r['relation']))
 
 def visualize_kg():
     net = Network(height="600px", width="100%", notebook=True, directed=True)
     node_map = {}
     idx = 0
     for ent in knowledge_graph["entities"]:
+        name, type_ = ent
+        node_map[name] = idx
+        net.add_node(idx, label=name, title=f"类型:{type_}", group=type_, font={'size': 20, 'face': 'SimHei'})
+        idx += 1
     seen_edges = set()
     for head, tail, relation in knowledge_graph["relations"]:
         if head in node_map and tail in node_map:
             edge_key = f"{head}-{tail}-{relation}"
             if edge_key not in seen_edges:
+                net.add_edge(node_map[head], node_map[tail], label=relation, arrows='to', font={'size': 14})
                 seen_edges.add(edge_key)
+    net.set_options("""{
+        "nodes": {"scaling": {"min": 20, "max": 40}},
+        "physics": {"stabilization": {"enabled": true, "iterations": 200}, "barnesHut": {"gravitationalConstant": -2000, "springLength": 150}},
+        "interaction": {"hover": true, "tooltipDelay": 200}
+    }""")
+    html = net.generate_html().replace('//cdnjs.cloudflare.com', 'https://cdnjs.cloudflare.com').replace('//unpkg.com', 'https://unpkg.com')
     return f'<div class="kg-graph">{html}</div>'
 
 def ner(text, model_type="bert"):
     start_time = time.time()
     if model_type == "bert":
         name_pattern = r"([\u4e00-\u9fa5]{2,4})(?![的等地得啦啊哦])"
+        id_pattern = r"(?<!\S)([a-zA-Z_][a-zA-Z0-9_]{4,})(?![\u4e00-\u9fa5])"
     else:
+        response, _ = chatglm_model.chat(chatglm_tokenizer, f"请从以下文本中识别所有实体,用JSON格式返回:[{text}]", temperature=0.1)
     try:
         entities = json.loads(response)
         return entities, time.time() - start_time
     except:
+        name_pattern = r"([\u4e00-\u9fa5]{2,4})(?![的等地得啦啊哦])"
+        id_pattern = r"(?<!\S)([a-zA-Z_][a-zA-Z0-9_]{4,})"
 
     entities = []
     occupied = set()
     def is_occupied(start, end):
         return any(s <= start < e or s < end <= e for s, e in occupied)
 
     for match in re.finditer(name_pattern, text):
         start, end = match.start(1), match.end(1)
         if not is_occupied(start, end):
+            entities.append({"text": match.group(1), "start": start, "end": end, "type": "人名"})
             occupied.add((start, end))
 
     for match in re.finditer(id_pattern, text):
         start, end = match.start(1), match.end(1)
         if not is_occupied(start, end):
+            entities.append({"text": match.group(1), "start": start, "end": end, "type": "用户ID"})
             occupied.add((start, end))
 
+    return entities, time.time() - start_time
 
 def re_extract(entities, text):
     relations = []
     if len(entities) < 2:
         return relations
     entity_list = [e['text'] for e in entities]
     prompt = f"分析以下实体之间的关系:{entity_list}\n文本上下文:{text}"
     response, _ = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.1)
     try:
         relations = json.loads(response)
     except:
         for i in range(len(entities)):
             for j in range(i + 1, len(entities)):
+                relations.append({"head": entities[i]['text'], "tail": entities[j]['text'], "relation": "相关"})
     return relations
 
 def process_text(text, model_type="bert"):
+    entities, processing_time = ner(text, model_type)
+    relations = re_extract(entities, text)
+    update_knowledge_graph(entities, relations)
+    entity_output = "\n".join(f"{e['text']} ({e['type']}) [{e['start']}-{e['end']}]" for e in entities)
+    relation_output = "\n".join(f"{r['head']} --[{r['relation']}]-> {r['tail']}" for r in relations)
+    return entity_output, relation_output, gr.HTML(visualize_kg()), f"处理时间:{processing_time:.2f}秒"
 
 def process_file(file, model_type="bert"):
+    content_bytes = file.read()
+    if len(content_bytes) > 5 * 1024 * 1024:
+        return "❌ 文件太大", "", gr.HTML(), ""
+    encoding = chardet.detect(content_bytes)['encoding'] or 'utf-8'
+    text = content_bytes.decode(encoding)
+    return process_text(text, model_type)
+
+# ==== 评估功能与自动标注 ====
+def convert_telegram_json_to_eval_format(path):
+    data = json.load(open(path, encoding="utf-8"))
+    result = []
+    for m in data.get("messages", []):
+        if isinstance(m.get("text"), str):
+            result.append({"text": m["text"], "entities": []})
+        elif isinstance(m.get("text"), list):
+            txt = ''.join([x["text"] if isinstance(x, dict) else x for x in m["text"]])
+            result.append({"text": txt, "entities": []})
+    return result
+
+def evaluate_ner_model(data, model_type):
+    y_true, y_pred = [], []
+    for item in data:
+        gold = set(e['text'] for e in item['entities'])
+        pred, _ = ner(item['text'], model_type)
+        pred = set(e['text'] for e in pred)
+        for ent in gold.union(pred):
+            y_true.append(1 if ent in gold else 0)
+            y_pred.append(1 if ent in pred else 0)
+    return f"📊 {model_type} 实体识别评估:\nPrecision: {precision_score(y_true,y_pred):.2f}\nRecall: {recall_score(y_true,y_pred):.2f}\nF1: {f1_score(y_true,y_pred):.2f}"
+
+def auto_annotate(file, model_type):
+    data = convert_telegram_json_to_eval_format(file.name)
+    for item in data:
+        ents, _ = ner(item["text"], model_type)
+        item["entities"] = ents
+    return json.dumps(data, ensure_ascii=False, indent=2)
+
+def save_json(json_text):
+    fname = "auto_labeled.json"
+    with open(fname, "w", encoding="utf-8") as f:
+        f.write(json_text)
+    return fname
+
+# ==== Gradio UI ====
+css = ".kg-graph { height: 600px; }"
 with gr.Blocks(css=css) as demo:
+    gr.Markdown("# 🚀 智能聊天分析系统 + 标注评估工具")
 
     with gr.Tab("✍️ 文本分析"):
+        input_text = gr.Textbox(lines=6, label="输入内容")
+        model_type = gr.Radio(["bert", "chatglm"], value="bert", label="模型")
+        btn = gr.Button("开始分析")
+        out1 = gr.Textbox(label="实体")
+        out2 = gr.Textbox(label="关系")
+        out3 = gr.HTML()
+        out4 = gr.Textbox(label="耗时")
+        btn.click(fn=process_text, inputs=[input_text, model_type], outputs=[out1, out2, out3, out4])
 
     with gr.Tab("📄 文件分析"):
+        file_input = gr.File(file_types=[".txt", ".json", ".csv"])
+        btn2 = gr.Button("分析文件")
+        fout1, fout2, fout3, fout4 = gr.Textbox(), gr.Textbox(), gr.HTML(), gr.Textbox()
+        btn2.click(fn=process_file, inputs=[file_input, model_type], outputs=[fout1, fout2, fout3, fout4])
+
+    with gr.Tab("📊 模型评估"):
+        eval_file = gr.File(label="上传标注数据集")
+        eval_model = gr.Radio(["bert", "chatglm"], value="bert")
+        eval_btn = gr.Button("开始评估")
+        eval_output = gr.Textbox(label="评估结果", lines=5)
+        eval_btn.click(lambda f, m: evaluate_ner_model(convert_telegram_json_to_eval_format(f.name), m), [eval_file, eval_model], eval_output)
+
+    with gr.Tab("🖍 实体标注助手"):
+        raw_file = gr.File(label="上传原始 Telegram JSON")
+        auto_model = gr.Radio(["bert", "chatglm"], value="bert")
+        auto_btn = gr.Button("自动初标")
+        marked_texts = gr.Textbox(label="初步标注结果(可下载)", lines=20)
+        download_btn = gr.Button("💾 下载JSON")
+        auto_btn.click(fn=auto_annotate, inputs=[raw_file, auto_model], outputs=marked_texts)
+        download_btn.click(fn=save_json, inputs=marked_texts, outputs=gr.File())
+
+demo.launch(server_name="0.0.0.0", server_port=7860)
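
The evaluation and auto-annotation helpers added above assume a Telegram-style JSON export: a top-level "messages" list whose "text" field is either a plain string or a list of string and {"text": ...} fragments. A minimal sketch of that assumed input, with a hypothetical file name and sample values:

import json

# Hypothetical Telegram export; only the fields read by
# convert_telegram_json_to_eval_format are included.
sample_export = {
    "messages": [
        {"text": "张三 给 lisi_2024 发了消息"},
        {"text": [{"text": "王五"}, " 回复了 ", {"text": "zhao_liu"}]},
    ]
}

with open("telegram_export.json", "w", encoding="utf-8") as f:
    json.dump(sample_export, f, ensure_ascii=False)

# convert_telegram_json_to_eval_format("telegram_export.json") would return:
# [{"text": "张三 给 lisi_2024 发了消息", "entities": []},
#  {"text": "王五 回复了 zhao_liu", "entities": []}]
# Once "entities" is filled with gold labels, the list can be passed to
# evaluate_ner_model(data, "bert").
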
requirements.txt CHANGED
@@ -1,11 +1,12 @@
 gradio==3.50.2
 transformers==4.39.3
 torch>=2.1.0
-pandas>=2.0.0
-chardet>=5.0.0
 networkx>=3.0
-pyvis>=0.3.2
 python-dotenv>=1.0.0
 sentencepiece>=0.2.0
 cpm-kernels>=1.0.11
-accelerate>=0.27.0
+accelerate>=0.27.0
+scikit-learn>=1.3.0
+chardet>=5.2.0
+pandas>=2.1.0
+pyvis>=0.3.2