Spaces:

chen666-666
/

wechat-ner-re

Sleeping

App Files Community

chen666-666 committed on Apr 16

Commit

0207e75

1 Parent(s): cebeeca

add app.py and requirements.txt

Browse files

Files changed (1) hide show

app.py +103 -30

app.py CHANGED Viewed

@@ -30,16 +30,22 @@ except Exception as e:
     print(f"❌ ChatGLM 加载失败: {e}")
 # ======================== 知识图谱结构 ========================
-knowledge_graph = {"entities": set(), "relations": []}
 def update_knowledge_graph(entities, relations):
     for e in entities:
         if isinstance(e, dict) and 'text' in e and 'type' in e:
             knowledge_graph["entities"].add((e['text'], e['type']))
     for r in relations:
         if isinstance(r, dict) and all(k in r for k in ("head", "tail", "relation")):
-            knowledge_graph["relations"].append((r['head'], r['tail'], r['relation']))
 def visualize_kg_text():
     nodes = [f"{ent[0]} ({ent[1]})" for ent in knowledge_graph["entities"]]
     edges = [f"{h} --[{r}]-> {t}" for h, t, r in knowledge_graph["relations"]]
@@ -50,14 +56,28 @@ def ner(text, model_type="bert"):
     start_time = time.time()
     if model_type == "chatglm" and use_chatglm:
         try:
-            prompt = f"请从以下文本中识别所有实体，用JSON格式返回:[{text}]"
             response = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.1)
             if isinstance(response, tuple):
                 response = response[0]
-            entities = json.loads(response)
-            return entities, time.time() - start_time
         except Exception as e:
-            print(f"❌ ChatGLM 实体识别失败：{e}")
             return [], time.time() - start_time
     # 使用微调的 BERT 中文 NER 模型
@@ -76,17 +96,41 @@ def ner(text, model_type="bert"):
 def re_extract(entities, text):
     if len(entities) < 2:
         return []
     try:
-        entity_list = [e['text'] for e in entities]
-        prompt = f"分析以下实体之间的关系：{entity_list}\n文本上下文：{text}"
         if use_chatglm:
             response = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.1)
             if isinstance(response, tuple):
                 response = response[0]
-            return json.loads(response)
     except Exception as e:
-        print(f"❌ ChatGLM 关系抽取失败：{e}")
-    return [{"head": e1['text'], "tail": e2['text'], "relation": "相关"} for i, e1 in enumerate(entities) for e2 in entities[i+1:]]
 # ======================== 文本分析主流程 ========================
 def process_text(text, model_type="bert"):
@@ -99,13 +143,33 @@ def process_text(text, model_type="bert"):
     kg_text = visualize_kg_text()
     return ent_text, rel_text, kg_text, f"{duration:.2f} 秒"
 def process_file(file, model_type="bert"):
-    content = file.read()
-    if len(content) > 5 * 1024 * 1024:
-        return "❌ 文件太大", "", "", ""
-    encoding = chardet.detect(content)['encoding'] or 'utf-8'
-    text = content.decode(encoding)
-    return process_text(text, model_type)
 # ======================== 模型评估与自动标注 ========================
 def convert_telegram_json_to_eval_format(path):
@@ -128,23 +192,32 @@ def convert_telegram_json_to_eval_format(path):
         return result
     return []
 def evaluate_ner_model(data, model_type):
     y_true, y_pred = [], []
     for item in data:
         text = item["text"]
-        gold = set()
-        for e in item["entities"]:
-            # 兼容 text 或 start/end 两种格式
-            if "text" in e:
-                gold.add(e["text"])
-            elif "start" in e and "end" in e:
-                gold.add(text[e["start"]:e["end"]])
         pred, _ = ner(text, model_type)
-        pred = set(e['text'] for e in pred)
-        for ent in gold.union(pred):
-            y_true.append(1 if ent in gold else 0)
-            y_pred.append(1 if ent in pred else 0)
-    return f"Precision: {precision_score(y_true, y_pred):.2f}\\nRecall: {recall_score(y_true, y_pred):.2f}\\nF1: {f1_score(y_true, y_pred):.2f}"
 def auto_annotate(file, model_type):
     data = convert_telegram_json_to_eval_format(file.name)

     print(f"❌ ChatGLM 加载失败: {e}")
 # ======================== 知识图谱结构 ========================
+knowledge_graph = {"entities": set(), "relations": set()}
 def update_knowledge_graph(entities, relations):
     for e in entities:
         if isinstance(e, dict) and 'text' in e and 'type' in e:
             knowledge_graph["entities"].add((e['text'], e['type']))
     for r in relations:
         if isinstance(r, dict) and all(k in r for k in ("head", "tail", "relation")):
+            # 标准化关系方向
+            relation_tuple = (r['head'], r['tail'], r['relation'])
+            reverse_tuple = (r['tail'], r['head'], r['relation'])
+            if reverse_tuple not in knowledge_graph["relations"]:
+                knowledge_graph["relations"].add(relation_tuple)
 def visualize_kg_text():
     nodes = [f"{ent[0]} ({ent[1]})" for ent in knowledge_graph["entities"]]
     edges = [f"{h} --[{r}]-> {t}" for h, t, r in knowledge_graph["relations"]]
     start_time = time.time()
     if model_type == "chatglm" and use_chatglm:
         try:
+            prompt = f"""请从以下文本中识别所有实体，严格按照JSON列表格式返回，每个实体包含text、type、start、end字段：
+示例：[{{"text": "北京", "type": "LOC", "start": 0, "end": 2}}]
+文本：{text}"""
             response = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.1)
             if isinstance(response, tuple):
                 response = response[0]
+            # 增强 JSON 解析
+            try:
+                json_str = re.search(r'\[.*\]', response, re.DOTALL).group()
+                entities = json.loads(json_str)
+                # 验证字段
+                valid_entities = []
+                for ent in entities:
+                    if all(k in ent for k in ("text", "type", "start", "end")):
+                        valid_entities.append(ent)
+                return valid_entities, time.time() - start_time
+            except Exception as e:
+                print(f"JSON 解析失败: {e}")
+                return [], time.time() - start_time
         except Exception as e:
+            print(f"ChatGLM 调用失败：{e}")
             return [], time.time() - start_time
     # 使用微调的 BERT 中文 NER 模型
 def re_extract(entities, text):
     if len(entities) < 2:
         return []
+    relations = []
     try:
+        entity_pairs = [(e1, e2) for i, e1 in enumerate(entities) for e2 in entities[i + 1:]]
+        prompt = f"""分析文本中的实体关系，返回JSON列表：
+文本：{text}
+实体列表：{[e['text'] for e in entities]}
+要求：
+1. 仅返回存在明确关系的实体对
+2. 关系类型使用：属于、位于、参与、其他
+3. 格式示例：[{{"head": "北京", "tail": "中国", "relation": "位于"}}]"""
         if use_chatglm:
             response = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.1)
             if isinstance(response, tuple):
                 response = response[0]
+            # 提取 JSON
+            try:
+                json_str = re.search(r'\[.*\]', response, re.DOTALL).group()
+                relations = json.loads(json_str)
+                # 验证关系
+                valid_relations = []
+                valid_types = {"属于", "位于", "参与", "其他"}
+                for rel in relations:
+                    if all(k in rel for k in ("head", "tail", "relation")) and rel["relation"] in valid_types:
+                        valid_relations.append(rel)
+                return valid_relations
+            except Exception as e:
+                print(f"关系解析失败: {e}")
     except Exception as e:
+        print(f"关系抽取失败: {e}")
+    # 默认不生成任何关系
+    return []
 # ======================== 文本分析主流程 ========================
 def process_text(text, model_type="bert"):
     kg_text = visualize_kg_text()
     return ent_text, rel_text, kg_text, f"{duration:.2f} 秒"
 def process_file(file, model_type="bert"):
+    try:
+        with open(file.name, 'rb') as f:
+            content = f.read()
+        if len(content) > 5 * 1024 * 1024:
+            return "❌ 文件太大", "", "", ""
+        # 检测编码
+        try:
+            encoding = chardet.detect(content)['encoding'] or 'utf-8'
+            text = content.decode(encoding)
+        except UnicodeDecodeError:
+            # 尝试常见中文编码
+            for enc in ['gb18030', 'utf-16', 'big5']:
+                try:
+                    text = content.decode(enc)
+                    break
+                except:
+                    continue
+            else:
+                return "❌ 编码解析失败", "", "", ""
+        return process_text(text, model_type)
+    except Exception as e:
+        return f"❌ 文件处理错误: {str(e)}", "", "", ""
 # ======================== 模型评估与自动标注 ========================
 def convert_telegram_json_to_eval_format(path):
         return result
     return []
 def evaluate_ner_model(data, model_type):
     y_true, y_pred = [], []
     for item in data:
         text = item["text"]
+        gold_entities = []
+        for e in item.get("entities", []):
+            if "text" in e and "type" in e:
+                # 使用哈希避免重复
+                gold_entities.append(f"{e['text']}|{e['type']}|{e.get('start', -1)}|{e.get('end', -1)}")
+        pred_entities = []
         pred, _ = ner(text, model_type)
+        for e in pred:
+            pred_entities.append(f"{e['text']}|{e['type']}|{e['start']}|{e['end']}")
+        # 创建所有可能的实体集合
+        all_entities = set(gold_entities + pred_entities)
+        for ent in all_entities:
+            y_true.append(1 if ent in gold_entities else 0)
+            y_pred.append(1 if ent in pred_entities else 0)
+    if not y_true:
+        return "⚠️ 无有效标注数据"
+    return f"Precision: {precision_score(y_true, y_pred):.2f}\nRecall: {recall_score(y_true, y_pred):.2f}\nF1: {f1_score(y_true, y_pred):.2f}"
 def auto_annotate(file, model_type):
     data = convert_telegram_json_to_eval_format(file.name)