Commit 1eeeaf5 · Parent: ee61e9e
add app.py and requirements.txt
app.py
CHANGED
@@ -4,29 +4,23 @@ import gradio as gr
 import re
 import os
 import json
-import pandas as pd
 import chardet
 from sklearn.metrics import precision_score, recall_score, f1_score
 import time
 
-#
+# ======================== 模型加载 ========================
 bert_model_name = "bert-base-chinese"
 bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name)
 bert_model = BertModel.from_pretrained(bert_model_name)
 
-# chatglm3 模型检测与安全加载
 chatglm_model, chatglm_tokenizer = None, None
 use_chatglm = False
-
 try:
     if torch.cuda.is_available():
         chatglm_model_name = "THUDM/chatglm3-6b"
         chatglm_tokenizer = AutoTokenizer.from_pretrained(chatglm_model_name, trust_remote_code=True)
         chatglm_model = AutoModel.from_pretrained(
-            chatglm_model_name,
-            trust_remote_code=True,
-            device_map="auto",
-            torch_dtype=torch.float16
+            chatglm_model_name, trust_remote_code=True, device_map="auto", torch_dtype=torch.float16
         ).eval()
         use_chatglm = True
     else:
@@ -34,7 +28,7 @@ try:
 except Exception as e:
     print(f"❌ ChatGLM 加载失败: {e}")
 
-#
+# ======================== 知识图谱结构 ========================
 knowledge_graph = {"entities": set(), "relations": []}
 
 def update_knowledge_graph(entities, relations):
@@ -50,16 +44,17 @@ def visualize_kg_text():
     edges = [f"{h} --[{r}]-> {t}" for h, t, r in knowledge_graph["relations"]]
     return "📌 实体:\n" + "\n".join(nodes) + "\n\n📎 关系:\n" + "\n".join(edges)
 
-#
+# ======================== 实体识别(NER) ========================
 def ner(text, model_type="bert"):
     start_time = time.time()
     if model_type == "chatglm" and use_chatglm:
         try:
-
+            prompt = f"请从以下文本中识别所有实体,用JSON格式返回:[{text}]"
+            response, _ = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.1)
             entities = json.loads(response)
             return entities, time.time() - start_time
         except Exception as e:
-            print(f"❌ ChatGLM
+            print(f"❌ ChatGLM 实体识别失败:{e}")
             return [], time.time() - start_time
 
     name_pattern = r"([\u4e00-\u9fa5]{2,4})(?![的等地得啦啊哦])"
@@ -83,7 +78,7 @@ def ner(text, model_type="bert"):
 
     return entities, time.time() - start_time
 
-#
+# ======================== 关系抽取(RE) ========================
 def re_extract(entities, text):
     if len(entities) < 2:
         return []
@@ -93,11 +88,11 @@ def re_extract(entities, text):
         if use_chatglm:
             response, _ = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.1)
             return json.loads(response)
-    except:
-
+    except Exception as e:
+        print(f"❌ ChatGLM 关系抽取失败:{e}")
     return [{"head": e1['text'], "tail": e2['text'], "relation": "相关"} for i, e1 in enumerate(entities) for e2 in entities[i+1:]]
 
-#
+# ======================== 文本分析主流程 ========================
 def process_text(text, model_type="bert"):
     entities, duration = ner(text, model_type)
     relations = re_extract(entities, text)
@@ -106,7 +101,7 @@ def process_text(text, model_type="bert"):
     ent_text = "\n".join(f"{e['text']} ({e['type']}) [{e['start']}-{e['end']}]" for e in entities)
     rel_text = "\n".join(f"{r['head']} --[{r['relation']}]-> {r['tail']}" for r in relations)
     kg_text = visualize_kg_text()
-    return ent_text, rel_text, kg_text, f"{duration:.2f}秒"
+    return ent_text, rel_text, kg_text, f"{duration:.2f} 秒"
 
 def process_file(file, model_type="bert"):
     content = file.read()
@@ -116,7 +111,7 @@ def process_file(file, model_type="bert"):
     text = content.decode(encoding)
     return process_text(text, model_type)
 
-#
+# ======================== 模型评估与自动标注 ========================
 def convert_telegram_json_to_eval_format(path):
     data = json.load(open(path, encoding="utf-8"))
     result = []
@@ -152,7 +147,7 @@ def save_json(json_text):
         f.write(json_text)
     return fname
 
-#
+# ======================== Gradio 界面 ========================
 with gr.Blocks(css=".kg-graph {height: 500px;}") as demo:
     gr.Markdown("# 🤖 聊天记录实体关系识别系统")
 
@@ -162,7 +157,7 @@ with gr.Blocks(css=".kg-graph {height: 500px;}") as demo:
         btn = gr.Button("开始分析")
         out1 = gr.Textbox(label="识别实体")
         out2 = gr.Textbox(label="识别关系")
-        out3 = gr.Textbox(label="
+        out3 = gr.Textbox(label="知识图谱")
         out4 = gr.Textbox(label="耗时")
         btn.click(fn=process_text, inputs=[input_text, model_type], outputs=[out1, out2, out3, out4])
 
@@ -179,12 +174,12 @@ with gr.Blocks(css=".kg-graph {height: 500px;}") as demo:
         eval_output = gr.Textbox(label="评估结果", lines=5)
         eval_btn.click(lambda f, m: evaluate_ner_model(convert_telegram_json_to_eval_format(f.name), m), [eval_file, eval_model], eval_output)
 
-    with gr.Tab("✏️
+    with gr.Tab("✏️ 自动标注"):
         raw_file = gr.File(label="上传 Telegram 原始 JSON")
         auto_model = gr.Radio(["bert", "chatglm"], value="bert")
-        auto_btn = gr.Button("
-        marked_texts = gr.Textbox(label="
-        download_btn = gr.Button("💾
+        auto_btn = gr.Button("自动标注")
+        marked_texts = gr.Textbox(label="标注结果", lines=20)
+        download_btn = gr.Button("💾 下载标注文件")
         auto_btn.click(fn=auto_annotate, inputs=[raw_file, auto_model], outputs=marked_texts)
         download_btn.click(fn=save_json, inputs=marked_texts, outputs=gr.File())
 
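The hunks above only show the signature of update_knowledge_graph() and the knowledge_graph structure it fills. As a minimal sketch (an editor reconstruction, not code from this commit), a body consistent with what visualize_kg_text() expects, where relations are stored as (head, tail, relation) tuples, could look like this:

knowledge_graph = {"entities": set(), "relations": []}  # structure as defined in the diff

def update_knowledge_graph(entities, relations):
    # entities: dicts with "text"/"type"/"start"/"end"; relations: dicts with "head"/"tail"/"relation"
    for e in entities:
        knowledge_graph["entities"].add(e["text"])
    for r in relations:
        # visualize_kg_text() unpacks these as "for h, t, r in knowledge_graph['relations']"
        knowledge_graph["relations"].append((r["head"], r["tail"], r["relation"]))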
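Similarly, the regex fallback inside ner() is visible only through name_pattern and the entity fields that process_text() later reads. A minimal sketch of that fallback (the helper name regex_entities and the single "PER" label are hypothetical; the real label set is outside the hunks):

import re

name_pattern = r"([\u4e00-\u9fa5]{2,4})(?![的等地得啦啊哦])"  # pattern taken from the diff

def regex_entities(text):
    # Emit entities in the shape process_text() consumes: text / type / start / end.
    entities = []
    for m in re.finditer(name_pattern, text):
        entities.append({"text": m.group(1), "type": "PER", "start": m.start(), "end": m.end()})
    return entities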
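Finally, process_file() decodes the uploaded bytes with content.decode(encoding), but the chardet detection step that produces `encoding` sits outside the changed lines. A sketch of that step (the helper name detect_and_decode is hypothetical), assuming a UTF-8 fallback when detection returns nothing:

import chardet

def detect_and_decode(content: bytes) -> str:
    # chardet.detect() returns a dict whose "encoding" value can be None for ambiguous input.
    encoding = chardet.detect(content)["encoding"] or "utf-8"
    return content.decode(encoding, errors="replace")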