chen666-666 committed on
Commit
b45e38c
·
1 Parent(s): 4cd7a63

add app.py and requirements.txt

Browse files
Files changed (1) hide show
  1. app.py +109 -30
app.py CHANGED
@@ -132,49 +132,128 @@ def ner(text, model_type="bert"):
132
  return entities, time.time() - start_time
133
 
134
 
 
135
  # ======================== 关系抽取(RE) ========================
136
def re_extract(entities, text):
    """Extract relations between entities by prompting the ChatGLM model.

    Args:
        entities: list of entity dicts; each is expected to carry a
            ``"text"`` and a ``"type"`` key.
        text: the source text the entities were recognised in.

    Returns:
        A list of ``{"head", "tail", "relation"}`` dicts whose relation is
        one of 属于 / 位于 / 参与 / 其他.  An empty list is returned when fewer
        than two PER/LOC/ORG entities are available, when the module-level
        ``use_chatglm`` flag is off, or when the model answer cannot be
        parsed.
    """
    if not entities or not text:
        return []

    # Only person / location / organisation entities take part in relations.
    # ``.get`` keeps entities without a type from raising KeyError.
    valid_entity_types = {"PER", "LOC", "ORG"}
    filtered_entities = [
        e for e in entities
        if isinstance(e, dict) and e.get("type") in valid_entity_types and e.get("text")
    ]
    if len(filtered_entities) < 2:
        return []

    # Without the model there is nothing else to try in this version.
    if not use_chatglm:
        return []

    valid_types = {"属于", "位于", "参与", "其他"}
    try:
        # The prompt lists the *filtered* entities so that the type filter
        # above actually constrains what the model is asked about.
        prompt = f"""分析文本中的实体关系,返回JSON列表:
文本:{text}
实体列表:{[e['text'] for e in filtered_entities]}
要求:
1. 仅返回存在明确关系的实体对
2. 关系类型使用:属于、位于、参与、其他
3. 格式示例:[{{"head": "北京", "tail": "中国", "relation": "位于"}}]"""

        response = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.1)
        if isinstance(response, tuple):
            response = response[0]

        # Pull the JSON array out of the (possibly chatty) model answer.
        found = re.search(r'\[.*\]', response, re.DOTALL)
        if found is None:
            print("关系解析失败: no JSON list in model response")
            return []
        try:
            relations = json.loads(found.group())
        except ValueError as e:
            print(f"关系解析失败: {e}")
            return []

        # Keep well-formed items only; one malformed item no longer
        # discards the whole answer.
        return [
            rel for rel in relations
            if isinstance(rel, dict)
            and all(k in rel for k in ("head", "tail", "relation"))
            and isinstance(rel["relation"], str)
            and rel["relation"] in valid_types
        ]
    except Exception as e:
        # Best effort: a failing model call must not break the caller.
        print(f"关系抽取失败: {e}")

    # No relation is produced by default.
    return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
 
180
  # ======================== 文本分析主流程 ========================
 
132
  return entities, time.time() - start_time
133
 
134
 
135
+ # ======================== 关系抽取(RE) ========================
136
  # ======================== 关系抽取(RE) ========================
137
def re_extract(entities, text):
    """Extract relations between the recognised entities of ``text``.

    Strategy:
      * exactly one usable entity -> keyword rules (job title / location);
      * two or more               -> ChatGLM when the module-level
        ``use_chatglm`` flag is set, with regex rules as a fallback when the
        model is disabled, fails, or yields nothing.

    Args:
        entities: list of entity dicts with ``"text"`` and ``"type"`` keys.
            Entities whose type is not PER/LOC/ORG/TITLE, or that lack a
            string ``"text"``, are ignored.
        text: the source text the entities were recognised in.

    Returns:
        A de-duplicated list of ``{"head", "tail", "relation"}`` dicts.  In
        the multi-entity case both head and tail are guaranteed to be texts
        of the filtered entities.
    """
    if not entities or not text:
        return []

    # Entity type filter (adjust to business needs).
    valid_entity_types = {"PER", "LOC", "ORG", "TITLE"}
    filtered_entities = [
        e for e in entities
        if isinstance(e, dict)
        and e.get("type") in valid_entity_types
        and isinstance(e.get("text"), str)
        and e["text"]
    ]
    if not filtered_entities:
        return []

    if len(filtered_entities) == 1:
        return _single_entity_relations(filtered_entities[0], text)

    entity_texts = [e["text"] for e in filtered_entities]
    relations = []

    # Option 1: ask ChatGLM.
    if use_chatglm:
        try:
            relations = _chatglm_relations(entity_texts, text)
        except Exception as e:
            # Best effort: a failing model call falls through to the rules.
            print(f"ChatGLM关系抽取异常: {str(e)}")

    # Option 2: rule fallback when the model is unavailable or found nothing.
    if not relations:
        relations = _rule_based_relations(text)

    # Post-processing: drop duplicates and relations whose ends are not
    # known entities.
    known = set(entity_texts)
    seen = set()
    final_relations = []
    for rel in relations:
        key = (rel["head"], rel["tail"], rel["relation"])
        if key in seen or rel["head"] not in known or rel["tail"] not in known:
            continue
        seen.add(key)
        final_relations.append(rel)
    return final_relations


def _single_entity_relations(ent, text):
    """Apply keyword rules when only one usable entity was recognised."""
    name = ent["text"]
    found = []

    # Rule 1: a person plus the first job-title keyword present in the text.
    if ent["type"] == "PER":
        for keyword in ("CEO", "经理", "总监", "工程师", "教授"):
            if keyword in text:
                found.append({"head": name, "tail": keyword, "relation": "担任职位"})
                break

    # Rule 2: "<entity><location verb><place>" up to the next delimiter.
    if ent["type"] in ("ORG", "LOC"):
        for verb in ("位于", "坐落于", "地处"):
            # re.escape: the entity text is data, not a pattern ("C++公司"
            # would otherwise raise re.error).
            match = re.search(re.escape(name) + re.escape(verb) + r"(.*?)[,,。]", text)
            if match and match.group(1).strip():
                found.append({
                    "head": name,
                    "tail": match.group(1).strip(),
                    "relation": "位置",
                })
                break
    return found


def _chatglm_relations(entity_list, text):
    """Prompt ChatGLM for relations among ``entity_list`` and parse the answer."""
    prompt = f"""请分析以下文本中的实体关系,严格按照JSON列表格式返回:
文本内容:{text}
候选实体:{entity_list}
要求:
1. 只返回存在明确关系的实体对
2. 关系类型使用:属于、位于、任职于、合作、其他
3. 示例格式:[{{"head":"实体1", "tail":"实体2", "relation":"关系类型"}}]
请直接返回JSON,不要多余内容:"""

    response = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.01)
    if isinstance(response, tuple):
        response = response[0]
    return _parse_relation_json(response, entity_list)


def _parse_relation_json(response, entity_list):
    """Return the validated relation dicts found in a model response."""
    found = re.search(r'(\[.*?\])', response, re.DOTALL)
    if not found:
        return []
    json_str = found.group(1)
    json_str = re.sub(r'[\u201c\u201d]', '"', json_str)  # curly quotes -> ASCII
    json_str = re.sub(r'(?<!,)\n', '', json_str)  # keep newlines that follow a comma
    try:
        parsed = json.loads(json_str)
    except ValueError as e:
        print(f"[DEBUG] 关系解析失败: {str(e)}")
        return []

    # Validate into a fresh list so that unvalidated model output can never
    # leak to the caller; the str check keeps unhashable values away from
    # the set lookup.
    valid_rel_types = {"属于", "位于", "任职于", "合作", "其他"}
    return [
        rel for rel in parsed
        if isinstance(rel, dict)
        and isinstance(rel.get("relation"), str)
        and rel["relation"] in valid_rel_types
        and rel.get("head") in entity_list
        and rel.get("tail") in entity_list
    ]


def _rule_based_relations(text):
    """Collect candidate relations from fixed surface patterns in ``text``.

    NOTE: head and tail are whole clause fragments around the verb; the
    caller keeps only those that equal an entity text exactly.
    """
    relations = []

    # Rule 1: A 位于 B.  A non-capturing group is required here: square
    # brackets would form a character class matching one single character.
    for match in re.finditer(r'([^\s,,。]+)(?:位于|坐落于|地处)([^\s,,。]+)', text):
        head, tail = match.groups()
        relations.append({"head": head, "tail": tail, "relation": "位于"})

    # Rule 2: A 属于 B.  The head is lazy and 隶属于 is tried first so that
    # "A隶属于B" is split as A / B rather than "A隶" / B.
    for match in re.finditer(r'([^\s,,。]+?)(?:隶属于|属于)([^\s,,。]+)', text):
        head, tail = match.groups()
        relations.append({"head": head, "tail": tail, "relation": "属于"})

    # Rule 3: person - organisation.
    person_org_pattern = r'([\u4e00-\u9fa5]{2,4})(现任|担任|就职于)([\u4e00-\u9fa5]+?公司|[\u4e00-\u9fa5]+?大学)'
    for match in re.finditer(person_org_pattern, text):
        head, _, tail = match.groups()
        relations.append({"head": head, "tail": tail, "relation": "任职于"})

    return relations
257
 
258
 
259
  # ======================== 文本分析主流程 ========================