Spaces:
Sleeping
Sleeping
Commit
·
b45e38c
1
Parent(s):
4cd7a63
add app.py and requirements.txt
Browse files
app.py
CHANGED
@@ -132,49 +132,128 @@ def ner(text, model_type="bert"):
|
|
132 |
return entities, time.time() - start_time
|
133 |
|
134 |
|
|
|
135 |
# ======================== 关系抽取(RE) ========================
|
136 |
def re_extract(entities, text):
|
137 |
-
#
|
138 |
-
|
139 |
-
filtered_entities = [e for e in entities if e["type"] in valid_entity_types]
|
140 |
-
|
141 |
-
if len(filtered_entities) < 2:
|
142 |
return []
|
143 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
144 |
relations = []
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
|
|
|
|
|
|
150 |
要求:
|
151 |
-
1.
|
152 |
-
2.
|
153 |
-
3.
|
|
|
154 |
|
155 |
-
|
156 |
-
response = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.1)
|
157 |
if isinstance(response, tuple):
|
158 |
response = response[0]
|
159 |
|
160 |
-
#
|
161 |
try:
|
162 |
-
json_str = re.search(r'\[
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
except Exception as e:
|
172 |
-
print(f"关系解析失败: {e}")
|
173 |
-
except Exception as e:
|
174 |
-
print(f"关系抽取失败: {e}")
|
175 |
|
176 |
-
|
177 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
178 |
|
179 |
|
180 |
# ======================== 文本分析主流程 ========================
|
|
|
132 |
return entities, time.time() - start_time
|
133 |
|
134 |
|
135 |
+
# ======================== 关系抽取(RE) ========================
|
136 |
# ======================== 关系抽取(RE) ========================
|
137 |
def re_extract(entities, text):
    """Extract relations between the recognised entities of ``text``.

    Strategy:
      * exactly one usable entity -> keyword/regex rules that relate the
        entity to a job title or a location mentioned in the text;
      * two or more usable entities -> ask ChatGLM (when the module-level
        ``use_chatglm`` flag is truthy) and fall back to regex rules when the
        model is disabled, fails, or yields nothing valid.

    Args:
        entities: iterable of dicts with at least ``"text"`` and ``"type"``
            keys (NER output).
        text: the raw text the entities were extracted from.

    Returns:
        A list of ``{"head": str, "tail": str, "relation": str}`` dicts.
        In the multi-entity path the list is de-duplicated and restricted to
        relations whose head and tail are both known entity texts.
    """
    # Parameter validation.
    if not entities or not text:
        return []

    # Entity type filter (adjust to business needs).
    valid_entity_types = {"PER", "LOC", "ORG", "TITLE"}
    filtered_entities = [e for e in entities if e.get("type") in valid_entity_types]

    # ---------------------- single-entity scenario ----------------------
    if len(filtered_entities) == 1:
        return _single_entity_relations(filtered_entities[0], text)
    if len(filtered_entities) < 2:
        # No usable entity: nothing could pass the head/tail validation below.
        return []

    # --------------------- multi-entity extraction ----------------------
    entity_names = [e["text"] for e in filtered_entities]

    relations = []
    # Option 1: let ChatGLM propose relations.
    if use_chatglm:
        relations = _chatglm_relations(entity_names, text)
    # Option 2: rule-based fallback (model unavailable or nothing extracted).
    if not relations:
        relations = _rule_based_relations(text)

    # Post-processing: drop duplicates and relations on unknown entities.
    known_names = set(entity_names)
    seen = set()
    final_relations = []
    for rel in relations:
        key = (rel["head"], rel["tail"], rel["relation"])
        if key in seen:
            continue
        if rel["head"] in known_names and rel["tail"] in known_names:
            final_relations.append(rel)
            seen.add(key)

    return final_relations


def _single_entity_relations(ent, text):
    """Apply the single-entity rules (job title / location) to ``ent``."""
    single_relations = []

    # Rule 1: person + first job-title keyword found anywhere in the text.
    if ent["type"] == "PER":
        position_keywords = ["CEO", "经理", "总监", "工程师", "教授"]
        for keyword in position_keywords:
            if keyword in text:
                single_relations.append({
                    "head": ent["text"],
                    "tail": keyword,
                    "relation": "担任职位",
                })
                break

    # Rule 2: "<entity><location verb><place>" up to the next delimiter.
    if ent["type"] in ("ORG", "LOC"):
        location_verbs = ["位于", "坐落于", "地处"]
        # Escape the entity text: names such as "C++公司" contain regex
        # metacharacters and must be matched literally.
        escaped_name = re.escape(ent["text"])
        for verb in location_verbs:
            if verb not in text:
                continue
            # Accept both the ASCII and the full-width comma as delimiter.
            match = re.search(rf"{escaped_name}{verb}(.*?)[,,。]", text)
            if match:
                single_relations.append({
                    "head": ent["text"],
                    "tail": match.group(1).strip(),
                    "relation": "位置",
                })
                break

    return single_relations


def _chatglm_relations(entity_list, text):
    """Ask ChatGLM for relations among ``entity_list``; return only valid ones.

    Never raises: any model or parsing failure is printed and results in an
    empty list so the caller can fall back to the rule-based extractor.
    """
    prompt = f"""请分析以下文本中的实体关系,严格按照JSON列表格式返回:
文本内容:{text}
候选实体:{entity_list}
要求:
1. 只返回存在明确关系的实体对
2. 关系类型使用:属于、位于、任职于、合作、其他
3. 示例格式:[{{"head":"实体1", "tail":"实体2", "relation":"关系类型"}}]
请直接返回JSON,不要多余内容:"""

    try:
        response = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.01)
        # The call may hand back a tuple; only its first element is used
        # (presumably (reply, history) - confirm against the ChatGLM version).
        if isinstance(response, tuple):
            response = response[0]
    except Exception as e:  # best-effort boundary around the model call
        print(f"ChatGLM关系抽取异常: {str(e)}")
        return []

    try:
        candidates = _parse_relation_json(response)
    except Exception as e:
        print(f"[DEBUG] 关系解析失败: {str(e)}")
        return []

    # Validate before anything is returned, so malformed model output can
    # never reach the caller's post-processing step.
    valid_rel_types = {"属于", "位于", "任职于", "合作", "其他"}
    return [
        rel for rel in candidates
        if isinstance(rel, dict)
        and rel.get("head") in entity_list
        and rel.get("tail") in entity_list
        and isinstance(rel.get("relation"), str)
        and rel.get("relation") in valid_rel_types
    ]


def _parse_relation_json(response):
    """Return the first JSON list embedded in ``response`` ([] if none)."""
    # Normalise curly quotes the model sometimes emits instead of '"'.
    cleaned = re.sub(r'[\u201c\u201d]', '"', response)
    decoder = json.JSONDecoder()
    # Try every '[' as a possible start: the reply may echo a non-JSON list
    # (e.g. the candidate entities) before the actual answer.
    for bracket in re.finditer(r'\[', cleaned):
        try:
            parsed, _ = decoder.raw_decode(cleaned, bracket.start())
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, list):
            return parsed
    return []


def _rule_based_relations(text):
    """Extract candidate relations from ``text`` with fixed regex rules."""
    relations = []

    # Rule 1: A 位于/坐落于/地处 B.  The verbs are an alternation group (a
    # character class would match a single character), and the head is lazy
    # so the verb is matched at its first occurrence in the clause.
    location_pattern = r'([^\s,,。]+?)(?:位于|坐落于|地处)([^\s,,。]+)'
    for match in re.finditer(location_pattern, text):
        head, tail = match.groups()
        relations.append({"head": head, "tail": tail, "relation": "位于"})

    # Rule 2: A 属于/隶属于 B.  The longer verb is listed first and the head is
    # lazy; otherwise "隶" would be swallowed by the head.
    belong_pattern = r'([^\s,,。]+?)(?:隶属于|属于)([^\s,,。]+)'
    for match in re.finditer(belong_pattern, text):
        head, tail = match.groups()
        relations.append({"head": head, "tail": tail, "relation": "属于"})

    # Rule 3: person - organisation.
    person_org_pattern = (
        r'([\u4e00-\u9fa5]{2,4})(现任|担任|就职于)'
        r'([\u4e00-\u9fa5]+?公司|[\u4e00-\u9fa5]+?大学)'
    )
    for match in re.finditer(person_org_pattern, text):
        head, _, tail = match.groups()
        relations.append({"head": head, "tail": tail, "relation": "任职于"})

    return relations
|
257 |
|
258 |
|
259 |
# ======================== 文本分析主流程 ========================
|