Spaces:
Sleeping
Sleeping
Commit
·
0207e75
1
Parent(s):
cebeeca
add app.py and requirements.txt
Browse files
app.py
CHANGED
@@ -30,16 +30,22 @@ except Exception as e:
|
|
30 |
print(f"❌ ChatGLM 加载失败: {e}")
|
31 |
|
32 |
# ======================== 知识图谱结构 ========================
|
33 |
-
knowledge_graph = {"entities": set(), "relations":
|
|
|
34 |
|
35 |
def update_knowledge_graph(entities, relations):
|
36 |
for e in entities:
|
37 |
if isinstance(e, dict) and 'text' in e and 'type' in e:
|
38 |
knowledge_graph["entities"].add((e['text'], e['type']))
|
|
|
39 |
for r in relations:
|
40 |
if isinstance(r, dict) and all(k in r for k in ("head", "tail", "relation")):
|
41 |
-
|
42 |
-
|
|
|
|
|
|
|
|
|
43 |
def visualize_kg_text():
|
44 |
nodes = [f"{ent[0]} ({ent[1]})" for ent in knowledge_graph["entities"]]
|
45 |
edges = [f"{h} --[{r}]-> {t}" for h, t, r in knowledge_graph["relations"]]
|
@@ -50,14 +56,28 @@ def ner(text, model_type="bert"):
|
|
50 |
start_time = time.time()
|
51 |
if model_type == "chatglm" and use_chatglm:
|
52 |
try:
|
53 |
-
prompt = f"
|
|
|
|
|
54 |
response = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.1)
|
55 |
if isinstance(response, tuple):
|
56 |
response = response[0]
|
57 |
-
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
except Exception as e:
|
60 |
-
print(f"
|
61 |
return [], time.time() - start_time
|
62 |
|
63 |
# 使用微调的 BERT 中文 NER 模型
|
@@ -76,17 +96,41 @@ def ner(text, model_type="bert"):
|
|
76 |
def re_extract(entities, text):
|
77 |
if len(entities) < 2:
|
78 |
return []
|
|
|
|
|
79 |
try:
|
80 |
-
|
81 |
-
prompt = f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
if use_chatglm:
|
83 |
response = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.1)
|
84 |
if isinstance(response, tuple):
|
85 |
response = response[0]
|
86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
except Exception as e:
|
88 |
-
print(f"
|
89 |
-
|
|
|
|
|
90 |
|
91 |
# ======================== 文本分析主流程 ========================
|
92 |
def process_text(text, model_type="bert"):
|
@@ -99,13 +143,33 @@ def process_text(text, model_type="bert"):
|
|
99 |
kg_text = visualize_kg_text()
|
100 |
return ent_text, rel_text, kg_text, f"{duration:.2f} 秒"
|
101 |
|
|
|
102 |
def process_file(file, model_type="bert"):
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
109 |
|
110 |
# ======================== 模型评估与自动标注 ========================
|
111 |
def convert_telegram_json_to_eval_format(path):
|
@@ -128,23 +192,32 @@ def convert_telegram_json_to_eval_format(path):
|
|
128 |
return result
|
129 |
return []
|
130 |
|
|
|
131 |
def evaluate_ner_model(data, model_type):
|
132 |
y_true, y_pred = [], []
|
133 |
for item in data:
|
134 |
text = item["text"]
|
135 |
-
|
136 |
-
for e in item
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
pred, _ = ner(text, model_type)
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
148 |
|
149 |
def auto_annotate(file, model_type):
|
150 |
data = convert_telegram_json_to_eval_format(file.name)
|
|
|
30 |
print(f"❌ ChatGLM 加载失败: {e}")
|
31 |
|
32 |
# ======================== 知识图谱结构 ========================
|
33 |
+
# Process-wide knowledge graph: a set of (text, type) entity tuples and a set
# of (head, tail, relation) triples.
knowledge_graph = {"entities": set(), "relations": set()}


def update_knowledge_graph(entities, relations):
    """Merge extracted entities and relations into the global knowledge graph.

    Args:
        entities: Iterable of candidate entities. Only dicts that carry both
            ``'text'`` and ``'type'`` are stored, as ``(text, type)`` tuples.
        relations: Iterable of candidate relations. Only dicts that carry
            ``'head'``, ``'tail'`` and ``'relation'`` are stored, as
            ``(head, tail, relation)`` triples.

    A relation is not added when the reversed triple (tail, head, same
    relation label) is already stored, so each pair is kept in one direction
    only. Items whose values are unhashable (for example a nested list coming
    out of model-generated JSON) are skipped instead of aborting the update.
    """
    stored_entities = knowledge_graph["entities"]
    for entity in entities:
        if not (isinstance(entity, dict) and 'text' in entity and 'type' in entity):
            continue
        try:
            stored_entities.add((entity['text'], entity['type']))
        except TypeError:
            # Unhashable field value; it cannot live in a set, so drop it.
            continue

    stored_relations = knowledge_graph["relations"]
    for rel in relations:
        if not (isinstance(rel, dict)
                and all(k in rel for k in ("head", "tail", "relation"))):
            continue
        forward = (rel['head'], rel['tail'], rel['relation'])
        backward = (rel['tail'], rel['head'], rel['relation'])
        try:
            # Keep a single direction per pair: skip if the mirror exists.
            if backward not in stored_relations:
                stored_relations.add(forward)
        except TypeError:
            # Unhashable field value; skip this relation only.
            continue
|
48 |
+
|
49 |
def visualize_kg_text():
|
50 |
nodes = [f"{ent[0]} ({ent[1]})" for ent in knowledge_graph["entities"]]
|
51 |
edges = [f"{h} --[{r}]-> {t}" for h, t, r in knowledge_graph["relations"]]
|
|
|
56 |
start_time = time.time()
|
57 |
if model_type == "chatglm" and use_chatglm:
|
58 |
try:
|
59 |
+
prompt = f"""请从以下文本中识别所有实体,严格按照JSON列表格式返回,每个实体包含text、type、start、end字段:
|
60 |
+
示例:[{{"text": "北京", "type": "LOC", "start": 0, "end": 2}}]
|
61 |
+
文本:{text}"""
|
62 |
response = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.1)
|
63 |
if isinstance(response, tuple):
|
64 |
response = response[0]
|
65 |
+
|
66 |
+
# 增强 JSON 解析
|
67 |
+
try:
|
68 |
+
json_str = re.search(r'\[.*\]', response, re.DOTALL).group()
|
69 |
+
entities = json.loads(json_str)
|
70 |
+
# 验证字段
|
71 |
+
valid_entities = []
|
72 |
+
for ent in entities:
|
73 |
+
if all(k in ent for k in ("text", "type", "start", "end")):
|
74 |
+
valid_entities.append(ent)
|
75 |
+
return valid_entities, time.time() - start_time
|
76 |
+
except Exception as e:
|
77 |
+
print(f"JSON 解析失败: {e}")
|
78 |
+
return [], time.time() - start_time
|
79 |
except Exception as e:
|
80 |
+
print(f"ChatGLM 调用失败:{e}")
|
81 |
return [], time.time() - start_time
|
82 |
|
83 |
# 使用微调的 BERT 中文 NER 模型
|
|
|
96 |
def re_extract(entities, text):
    """Extract relations between recognised entities using ChatGLM.

    Args:
        entities: List of entity dicts; each is expected to have a ``'text'``
            key, which is what gets listed in the prompt.
        text: The source text the entities were found in.

    Returns:
        A list of relation dicts, each with ``'head'``, ``'tail'`` and
        ``'relation'`` keys, where ``relation`` is one of the four labels the
        prompt allows. Returns ``[]`` when there are fewer than two entities,
        when ChatGLM is disabled, when the reply contains no JSON list, or
        when any step fails (failures are printed, never raised).
    """
    if len(entities) < 2:
        return []

    allowed_relations = {"属于", "位于", "参与", "其他"}
    try:
        # Without the model there is nothing to ask; no default relations
        # are fabricated.
        if not use_chatglm:
            return []

        prompt = f"""分析文本中的实体关系,返回JSON列表:
文本:{text}
实体列表:{[e['text'] for e in entities]}
要求:
1. 仅返回存在明确关系的实体对
2. 关系类型使用:属于、位于、参与、其他
3. 格式示例:[{{"head": "北京", "tail": "中国", "relation": "位于"}}]"""

        response = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.1)
        if isinstance(response, tuple):
            response = response[0]

        # The model may wrap the JSON in prose; take the outermost [...] span.
        match = re.search(r'\[.*\]', response, re.DOTALL)
        if match is None:
            print("关系解析失败: 未找到 JSON 列表")
            return []
        try:
            candidates = json.loads(match.group())
        except json.JSONDecodeError as e:
            print(f"关系解析失败: {e}")
            return []

        # Validate item by item so one malformed entry cannot discard the
        # well-formed ones.
        return [
            rel for rel in candidates
            if isinstance(rel, dict)
            and all(k in rel for k in ("head", "tail", "relation"))
            and isinstance(rel["relation"], str)
            and rel["relation"] in allowed_relations
        ]
    except Exception as e:
        print(f"关系抽取失败: {e}")

    # Default: produce no relations.
    return []
|
134 |
|
135 |
# ======================== 文本分析主流程 ========================
|
136 |
def process_text(text, model_type="bert"):
|
|
|
143 |
kg_text = visualize_kg_text()
|
144 |
return ent_text, rel_text, kg_text, f"{duration:.2f} 秒"
|
145 |
|
146 |
+
|
147 |
def process_file(file, model_type="bert"):
    """Read an uploaded file, decode it to text and run the text pipeline.

    Args:
        file: Object exposing the on-disk path of the upload as ``file.name``.
        model_type: Forwarded unchanged to ``process_text``.

    Returns:
        Whatever ``process_text`` returns on success. On failure, a 4-tuple
        whose first element is an error message and whose remaining elements
        are empty strings. Exceptions are reported this way, never raised.
    """
    max_bytes = 5 * 1024 * 1024
    fallback_encodings = ('gb18030', 'utf-16', 'big5')
    try:
        with open(file.name, 'rb') as f:
            # Read one byte past the limit: enough to detect an oversized
            # file without loading all of it into memory.
            content = f.read(max_bytes + 1)

        if len(content) > max_bytes:
            return "❌ 文件太大", "", "", ""

        # Detect the encoding, falling back to common Chinese encodings.
        try:
            encoding = chardet.detect(content)['encoding'] or 'utf-8'
            text = content.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # LookupError: the detector named a codec Python does not know.
            for enc in fallback_encodings:
                try:
                    text = content.decode(enc)
                    break
                except UnicodeDecodeError:
                    continue
            else:
                return "❌ 编码解析失败", "", "", ""

        return process_text(text, model_type)
    except Exception as e:
        return f"❌ 文件处理错误: {str(e)}", "", "", ""
|
173 |
|
174 |
# ======================== 模型评估与自动标注 ========================
|
175 |
def convert_telegram_json_to_eval_format(path):
|
|
|
192 |
return result
|
193 |
return []
|
194 |
|
195 |
+
|
196 |
def evaluate_ner_model(data, model_type):
    """Score NER predictions against gold annotations.

    Args:
        data: Iterable of items, each a dict with a ``'text'`` string and an
            optional ``'entities'`` list of gold entity dicts.
        model_type: Forwarded unchanged to ``ner``.

    Returns:
        A three-line string with precision, recall and F1 (two decimals), or
        a warning string when there is nothing to score.

    Each entity is reduced to a ``text|type|start|end`` key. For every item,
    each key in the union of gold and predicted keys contributes one binary
    label pair: whether it is gold, and whether it was predicted.
    """
    def entity_key(e):
        # Missing offsets become -1 for gold and predicted entities alike.
        return f"{e['text']}|{e['type']}|{e.get('start', -1)}|{e.get('end', -1)}"

    y_true, y_pred = [], []
    for item in data:
        text = item["text"]
        gold_keys = {
            entity_key(e)
            for e in item.get("entities", [])
            if "text" in e and "type" in e
        }

        pred, _ = ner(text, model_type)
        pred_keys = {entity_key(e) for e in pred}

        # Sets make the membership tests below O(1) per key.
        for key in gold_keys | pred_keys:
            y_true.append(1 if key in gold_keys else 0)
            y_pred.append(1 if key in pred_keys else 0)

    if not y_true:
        return "⚠️ 无有效标注数据"

    return f"Precision: {precision_score(y_true, y_pred):.2f}\nRecall: {recall_score(y_true, y_pred):.2f}\nF1: {f1_score(y_true, y_pred):.2f}"
|
221 |
|
222 |
def auto_annotate(file, model_type):
|
223 |
data = convert_telegram_json_to_eval_format(file.name)
|