chen666-666 committed on
Commit
0207e75
·
1 Parent(s): cebeeca

add app.py and requirements.txt

Browse files
Files changed (1) hide show
  1. app.py +103 -30
app.py CHANGED
@@ -30,16 +30,22 @@ except Exception as e:
30
  print(f"❌ ChatGLM 加载失败: {e}")
31
 
32
  # ======================== 知识图谱结构 ========================
33
- knowledge_graph = {"entities": set(), "relations": []}
 
34
 
35
  def update_knowledge_graph(entities, relations):
36
  for e in entities:
37
  if isinstance(e, dict) and 'text' in e and 'type' in e:
38
  knowledge_graph["entities"].add((e['text'], e['type']))
 
39
  for r in relations:
40
  if isinstance(r, dict) and all(k in r for k in ("head", "tail", "relation")):
41
- knowledge_graph["relations"].append((r['head'], r['tail'], r['relation']))
42
-
 
 
 
 
43
  def visualize_kg_text():
44
  nodes = [f"{ent[0]} ({ent[1]})" for ent in knowledge_graph["entities"]]
45
  edges = [f"{h} --[{r}]-> {t}" for h, t, r in knowledge_graph["relations"]]
@@ -50,14 +56,28 @@ def ner(text, model_type="bert"):
50
  start_time = time.time()
51
  if model_type == "chatglm" and use_chatglm:
52
  try:
53
- prompt = f"请从以下文本中识别所有实体,用JSON格式返回:[{text}]"
 
 
54
  response = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.1)
55
  if isinstance(response, tuple):
56
  response = response[0]
57
- entities = json.loads(response)
58
- return entities, time.time() - start_time
 
 
 
 
 
 
 
 
 
 
 
 
59
  except Exception as e:
60
- print(f"ChatGLM 实体识别失败:{e}")
61
  return [], time.time() - start_time
62
 
63
  # 使用微调的 BERT 中文 NER 模型
@@ -76,17 +96,41 @@ def ner(text, model_type="bert"):
76
  def re_extract(entities, text):
77
  if len(entities) < 2:
78
  return []
 
 
79
  try:
80
- entity_list = [e['text'] for e in entities]
81
- prompt = f"分析以下实体之间的关系:{entity_list}\n文本上下文:{text}"
 
 
 
 
 
 
 
82
  if use_chatglm:
83
  response = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.1)
84
  if isinstance(response, tuple):
85
  response = response[0]
86
- return json.loads(response)
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  except Exception as e:
88
- print(f" ChatGLM 关系抽取失败:{e}")
89
- return [{"head": e1['text'], "tail": e2['text'], "relation": "相关"} for i, e1 in enumerate(entities) for e2 in entities[i+1:]]
 
 
90
 
91
  # ======================== 文本分析主流程 ========================
92
  def process_text(text, model_type="bert"):
@@ -99,13 +143,33 @@ def process_text(text, model_type="bert"):
99
  kg_text = visualize_kg_text()
100
  return ent_text, rel_text, kg_text, f"{duration:.2f} 秒"
101
 
 
102
  def process_file(file, model_type="bert"):
103
- content = file.read()
104
- if len(content) > 5 * 1024 * 1024:
105
- return "❌ 文件太大", "", "", ""
106
- encoding = chardet.detect(content)['encoding'] or 'utf-8'
107
- text = content.decode(encoding)
108
- return process_text(text, model_type)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
  # ======================== 模型评估与自动标注 ========================
111
  def convert_telegram_json_to_eval_format(path):
@@ -128,23 +192,32 @@ def convert_telegram_json_to_eval_format(path):
128
  return result
129
  return []
130
 
 
131
  def evaluate_ner_model(data, model_type):
132
  y_true, y_pred = [], []
133
  for item in data:
134
  text = item["text"]
135
- gold = set()
136
- for e in item["entities"]:
137
- # 兼容 text start/end 两种格式
138
- if "text" in e:
139
- gold.add(e["text"])
140
- elif "start" in e and "end" in e:
141
- gold.add(text[e["start"]:e["end"]])
142
  pred, _ = ner(text, model_type)
143
- pred = set(e['text'] for e in pred)
144
- for ent in gold.union(pred):
145
- y_true.append(1 if ent in gold else 0)
146
- y_pred.append(1 if ent in pred else 0)
147
- return f"Precision: {precision_score(y_true, y_pred):.2f}\\nRecall: {recall_score(y_true, y_pred):.2f}\\nF1: {f1_score(y_true, y_pred):.2f}"
 
 
 
 
 
 
 
 
148
 
149
  def auto_annotate(file, model_type):
150
  data = convert_telegram_json_to_eval_format(file.name)
 
30
  print(f"❌ ChatGLM 加载失败: {e}")
31
 
32
  # ======================== 知识图谱结构 ========================
33
+ knowledge_graph = {"entities": set(), "relations": set()}
34
+
35
 
36
  def update_knowledge_graph(entities, relations):
37
  for e in entities:
38
  if isinstance(e, dict) and 'text' in e and 'type' in e:
39
  knowledge_graph["entities"].add((e['text'], e['type']))
40
+
41
  for r in relations:
42
  if isinstance(r, dict) and all(k in r for k in ("head", "tail", "relation")):
43
+ # 标准化关系方向
44
+ relation_tuple = (r['head'], r['tail'], r['relation'])
45
+ reverse_tuple = (r['tail'], r['head'], r['relation'])
46
+ if reverse_tuple not in knowledge_graph["relations"]:
47
+ knowledge_graph["relations"].add(relation_tuple)
48
+
49
  def visualize_kg_text():
50
  nodes = [f"{ent[0]} ({ent[1]})" for ent in knowledge_graph["entities"]]
51
  edges = [f"{h} --[{r}]-> {t}" for h, t, r in knowledge_graph["relations"]]
 
56
  start_time = time.time()
57
  if model_type == "chatglm" and use_chatglm:
58
  try:
59
+ prompt = f"""请从以下文本中识别所有实体,严格按照JSON列表格式返回,每个实体包含text、type、start、end字段:
60
+ 示例:[{{"text": "北京", "type": "LOC", "start": 0, "end": 2}}]
61
+ 文本:{text}"""
62
  response = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.1)
63
  if isinstance(response, tuple):
64
  response = response[0]
65
+
66
+ # 增强 JSON 解析
67
+ try:
68
+ json_str = re.search(r'\[.*\]', response, re.DOTALL).group()
69
+ entities = json.loads(json_str)
70
+ # 验证字段
71
+ valid_entities = []
72
+ for ent in entities:
73
+ if all(k in ent for k in ("text", "type", "start", "end")):
74
+ valid_entities.append(ent)
75
+ return valid_entities, time.time() - start_time
76
+ except Exception as e:
77
+ print(f"JSON 解析失败: {e}")
78
+ return [], time.time() - start_time
79
  except Exception as e:
80
+ print(f"ChatGLM 调用失败:{e}")
81
  return [], time.time() - start_time
82
 
83
  # 使用微调的 BERT 中文 NER 模型
 
96
  def re_extract(entities, text):
97
  if len(entities) < 2:
98
  return []
99
+
100
+ relations = []
101
  try:
102
+ entity_pairs = [(e1, e2) for i, e1 in enumerate(entities) for e2 in entities[i + 1:]]
103
+ prompt = f"""分析文本中的实体关系,返回JSON列表:
104
+ 文本:{text}
105
+ 实体列表:{[e['text'] for e in entities]}
106
+ 要求:
107
+ 1. 仅返回存在明确关系的实体对
108
+ 2. 关系类型使用:属于、位于、参与、其他
109
+ 3. 格式示例:[{{"head": "北京", "tail": "中国", "relation": "位于"}}]"""
110
+
111
  if use_chatglm:
112
  response = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.1)
113
  if isinstance(response, tuple):
114
  response = response[0]
115
+
116
+ # 提取 JSON
117
+ try:
118
+ json_str = re.search(r'\[.*\]', response, re.DOTALL).group()
119
+ relations = json.loads(json_str)
120
+ # 验证关系
121
+ valid_relations = []
122
+ valid_types = {"属于", "位于", "参与", "其他"}
123
+ for rel in relations:
124
+ if all(k in rel for k in ("head", "tail", "relation")) and rel["relation"] in valid_types:
125
+ valid_relations.append(rel)
126
+ return valid_relations
127
+ except Exception as e:
128
+ print(f"关系解析失败: {e}")
129
  except Exception as e:
130
+ print(f"关系抽取失败: {e}")
131
+
132
+ # 默认不生成任何关系
133
+ return []
134
 
135
  # ======================== 文本分析主流程 ========================
136
  def process_text(text, model_type="bert"):
 
143
  kg_text = visualize_kg_text()
144
  return ent_text, rel_text, kg_text, f"{duration:.2f} 秒"
145
 
146
+
147
  def process_file(file, model_type="bert"):
148
+ try:
149
+ with open(file.name, 'rb') as f:
150
+ content = f.read()
151
+
152
+ if len(content) > 5 * 1024 * 1024:
153
+ return "❌ 文件太大", "", "", ""
154
+
155
+ # 检测编码
156
+ try:
157
+ encoding = chardet.detect(content)['encoding'] or 'utf-8'
158
+ text = content.decode(encoding)
159
+ except UnicodeDecodeError:
160
+ # 尝试常见中文编码
161
+ for enc in ['gb18030', 'utf-16', 'big5']:
162
+ try:
163
+ text = content.decode(enc)
164
+ break
165
+ except:
166
+ continue
167
+ else:
168
+ return "❌ 编码解析失败", "", "", ""
169
+
170
+ return process_text(text, model_type)
171
+ except Exception as e:
172
+ return f"❌ 文件处理错误: {str(e)}", "", "", ""
173
 
174
  # ======================== 模型评估与自动标注 ========================
175
  def convert_telegram_json_to_eval_format(path):
 
192
  return result
193
  return []
194
 
195
+
196
  def evaluate_ner_model(data, model_type):
197
  y_true, y_pred = [], []
198
  for item in data:
199
  text = item["text"]
200
+ gold_entities = []
201
+ for e in item.get("entities", []):
202
+ if "text" in e and "type" in e:
203
+ # 使用哈希避免重复
204
+ gold_entities.append(f"{e['text']}|{e['type']}|{e.get('start', -1)}|{e.get('end', -1)}")
205
+
206
+ pred_entities = []
207
  pred, _ = ner(text, model_type)
208
+ for e in pred:
209
+ pred_entities.append(f"{e['text']}|{e['type']}|{e['start']}|{e['end']}")
210
+
211
+ # 创建所有可能的实体集合
212
+ all_entities = set(gold_entities + pred_entities)
213
+ for ent in all_entities:
214
+ y_true.append(1 if ent in gold_entities else 0)
215
+ y_pred.append(1 if ent in pred_entities else 0)
216
+
217
+ if not y_true:
218
+ return "⚠️ 无有效标注数据"
219
+
220
+ return f"Precision: {precision_score(y_true, y_pred):.2f}\nRecall: {recall_score(y_true, y_pred):.2f}\nF1: {f1_score(y_true, y_pred):.2f}"
221
 
222
  def auto_annotate(file, model_type):
223
  data = convert_telegram_json_to_eval_format(file.name)