Upload app.py
app.py
CHANGED
@@ -1,769 +1,771 @@
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, AutoModel
import gradio as gr
import re
import os
import json
import chardet
from sklearn.metrics import precision_score, recall_score, f1_score
import time
from functools import lru_cache  # needed for caching the pipelines below
# ======================== Database module ========================
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from contextlib import contextmanager
import logging
import networkx as nx
from pyvis.network import Network
import pandas as pd

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Manage database connections with SQLAlchemy's connection pool
DATABASE_URL = "mysql+pymysql://user:password@host/dbname"  # adjust the connection string to your environment

# Create the engine (connection pool)
engine = create_engine(DATABASE_URL, pool_size=10, max_overflow=20, echo=True)

# Create the session factory
Session = sessionmaker(bind=engine)

@contextmanager
def get_db_connection():
    """
    Obtain a database session via a context manager.
    """
    session = None
    try:
        session = Session()  # take a connection from the pool
        logging.info("✅ 数据库连接已建立")
        yield session  # hand the session to the caller for database operations
    except Exception as e:
        logging.error(f"❌ 数据库操作时发生错误: {e}")
        if session:
            session.rollback()  # roll back the transaction
    finally:
        if session:
            try:
                session.commit()  # commit the transaction
                logging.info("✅ 数据库事务已提交")
            except Exception as e:
                logging.error(f"❌ 提交事务时发生错误: {e}")
            finally:
                session.close()  # close the session and release the connection
                logging.info("✅ 数据库连接已关闭")

def save_to_db(table, data):
    """
    Save a record to the database.
    :param table: table name
    :param data: dict of column values
    """
    try:
        valid_tables = ["entities", "relations"]  # writes are only allowed to these tables
        if table not in valid_tables:
            raise ValueError(f"Invalid table: {table}")

        with get_db_connection() as conn:
            if conn:
                # The insert assumes ORM models; adapt this to your actual table schema
                table_model = get_table_model(table)  # resolve the ORM model for the table name
                new_record = table_model(**data)
                conn.add(new_record)
                conn.commit()  # commit the transaction
    except Exception as e:
        logging.error(f"❌ 保存数据时发生错误: {e}")
        return False
    return True

def get_table_model(table_name):
    """
    Resolve the ORM model for a table name (assumes models mapped to the tables are defined).
    :param table_name: table name
    :return: the corresponding ORM model
    """
    if table_name == "entities":
        from models import Entity  # assumes the ORM model is defined in models.py
        return Entity
    elif table_name == "relations":
        from models import Relation  # assumes the ORM model is defined in models.py
        return Relation
    else:
        raise ValueError(f"Unknown table: {table_name}")

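# NOTE: models.py is not included in this upload. A minimal sketch of what it is assumed
# to provide, based only on the column names used by update_knowledge_graph() below
# (hypothetical; the real schema may differ):
#
#   from sqlalchemy.orm import declarative_base
#   from sqlalchemy import Column, Integer, String, Text
#
#   Base = declarative_base()
#
#   class Entity(Base):
#       __tablename__ = "entities"
#       id = Column(Integer, primary_key=True)
#       text = Column(String(255))
#       type = Column(String(64))
#       start_pos = Column(Integer)
#       end_pos = Column(Integer)
#       source = Column(String(64))
#
#   class Relation(Base):
#       __tablename__ = "relations"
#       id = Column(Integer, primary_key=True)
#       head_entity = Column(String(255))
#       tail_entity = Column(String(255))
#       relation_type = Column(String(64))
#       source_text = Column(Text)
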
# ======================== Model loading ========================
NER_MODEL_NAME = "uer/roberta-base-finetuned-cluener2020-chinese"

@lru_cache(maxsize=1)
def get_ner_pipeline():
    tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_NAME)
    model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_NAME)
    return pipeline(
        "ner",
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="first"
    )


# ChatGLM is optional; these defaults keep the "chatglm" code path from raising
# NameError when the model is not loaded.
chatglm_model, chatglm_tokenizer = None, None
use_chatglm = False
# try:
#     chatglm_model_name = "THUDM/chatglm-6b-int4"
#     chatglm_tokenizer = AutoTokenizer.from_pretrained(chatglm_model_name, trust_remote_code=True)
#     chatglm_model = AutoModel.from_pretrained(
#         chatglm_model_name,
#         trust_remote_code=True,
#         device_map="cpu",
#         torch_dtype=torch.float32
#     ).eval()
#     use_chatglm = True
#     print("✅ 4-bit量化版ChatGLM加载成功")
# except Exception as e:
#     print(f"❌ ChatGLM加载失败: {e}")

# ======================== Knowledge graph structure ========================
knowledge_graph = {"entities": set(), "relations": set()}


def update_knowledge_graph(entities, relations):
    # Save entities
    for e in entities:
        if isinstance(e, dict) and 'text' in e and 'type' in e:
            save_to_db('entities', {
                'text': e['text'],
                'type': e['type'],
                'start_pos': e.get('start', -1),
                'end_pos': e.get('end', -1),
                'source': 'user_input'
            })

    # Save relations
    for r in relations:
        if isinstance(r, dict) and all(k in r for k in ("head", "tail", "relation")):
            save_to_db('relations', {
                'head_entity': r['head'],
                'tail_entity': r['tail'],
                'relation_type': r['relation'],
                'source_text': ''  # the source text could be attached here
            })


def visualize_kg_text():
    nodes = [f"{ent[0]} ({ent[1]})" for ent in knowledge_graph["entities"]]
    edges = [f"{h} --[{r}]-> {t}" for h, t, r in knowledge_graph["relations"]]
    return "\n".join(["📌 实体:"] + nodes + ["", "📎 关系:"] + edges)

def visualize_kg_interactive(entities, relations):
    """
    Generate an interactive visualization of the knowledge graph.
    """
    # Create a new network graph
    net = Network(height="500px", width="100%", bgcolor="#ffffff", font_color="black")

    # Colors for entity nodes
    entity_colors = {
        'PER': '#FF6B6B',   # person - red
        'ORG': '#4ECDC4',   # organization - teal
        'LOC': '#45B7D1',   # location - blue
        'TIME': '#96CEB4',  # time - green
        'MISC': '#D4A5A5'   # other - grey
    }

    # Add entity nodes
    for entity in entities:
        node_color = entity_colors.get(entity['type'], '#D3D3D3')
        net.add_node(entity['text'],
                     label=f"{entity['text']}\n({entity['type']})",
                     color=node_color,
                     title=f"类型: {entity['type']}")

    # Add relation edges
    for relation in relations:
        net.add_edge(relation['head'],
                     relation['tail'],
                     label=relation['relation'],
                     arrows='to')

    # Physics layout options
    net.set_options('''
    var options = {
      "physics": {
        "forceAtlas2Based": {
          "gravitationalConstant": -50,
          "centralGravity": 0.01,
          "springLength": 100,
          "springConstant": 0.08
        },
        "maxVelocity": 50,
        "solver": "forceAtlas2Based",
        "timestep": 0.35,
        "stabilization": {"iterations": 150}
      }
    }
    ''')

    # Write the HTML file
    html_path = "knowledge_graph.html"
    net.save_graph(html_path)
    return html_path

# ======================== Named entity recognition (NER) ========================
def merge_adjacent_entities(entities):
    if not entities:
        return entities

    merged = [entities[0]]
    for entity in entities[1:]:
        last = merged[-1]
        # Merge adjacent entities of the same type
        if (entity["type"] == last["type"] and
                entity["start"] == last["end"]):
            last["text"] += entity["text"]
            last["end"] = entity["end"]
        else:
            merged.append(entity)

    return merged


def ner(text, model_type="bert"):
    start_time = time.time()

    # If ChatGLM is selected, run NER through ChatGLM
    if model_type == "chatglm" and use_chatglm:
        try:
            prompt = f"""请从以下文本中识别所有实体,严格按照JSON列表格式返回,每个实体包含text、type、start、end字段:
示例:[{{"text": "北京", "type": "LOC", "start": 0, "end": 2}}]
文本:{text}"""
            response = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.1)
            if isinstance(response, tuple):
                response = response[0]

            try:
                json_str = re.search(r'\[.*\]', response, re.DOTALL).group()
                entities = json.loads(json_str)
                valid_entities = [ent for ent in entities if all(k in ent for k in ("text", "type", "start", "end"))]
                return valid_entities, time.time() - start_time
            except Exception as e:
                print(f"JSON解析失败: {e}")
                return [], time.time() - start_time
        except Exception as e:
            print(f"ChatGLM调用失败: {e}")
            return [], time.time() - start_time

    # BERT-based NER
    text_chunks = [text[i:i + 510] for i in range(0, len(text), 510)]  # split into chunks that fit the model
    raw_results = []

    # Get the cached NER pipeline
    ner_pipeline = get_ner_pipeline()

    for idx, chunk in enumerate(text_chunks):
        chunk_results = ner_pipeline(chunk)
        for r in chunk_results:
            r["start"] += idx * 510
            r["end"] += idx * 510
        raw_results.extend(chunk_results)

    entities = [{
        "text": r['word'].replace(' ', ''),
        "start": r['start'],
        "end": r['end'],
        "type": LABEL_MAPPING.get(r.get('entity_group') or r.get('entity'), r.get('entity_group') or r.get('entity'))
    } for r in raw_results]

    entities = merge_adjacent_entities(entities)
    return entities, time.time() - start_time


# ------------------ Entity type normalization ------------------
LABEL_MAPPING = {
    "address": "LOC",
    "company": "ORG",
    "name": "PER",
    "organization": "ORG",
    "position": "TITLE",
    "government": "ORG",
    "scene": "LOC",
    "book": "WORK",
    "movie": "WORK",
    "game": "WORK"
}

# Extract entities (module-level smoke test; runs at import)
entities, processing_time = ner("Google in New York met Alice")

# Normalize entity types
for e in entities:
    e["type"] = LABEL_MAPPING.get(e.get("type"), e.get("type"))

# Print the normalized entities
print(f"[DEBUG] 标准化后实体列表: {[{'text': e['text'], 'type': e['type']} for e in entities]}")

# Print the processing time
print(f"处理时间: {processing_time:.2f}秒")


# ======================== Relation extraction (RE) ========================
@lru_cache(maxsize=1)
def get_re_pipeline():
    tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_NAME)
    model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_NAME)
    return pipeline(
        "ner",  # reuse the NER pipeline
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="first"
    )

def re_extract(entities, text, use_bert_model=True):
    if not entities or not text:
        return [], 0

    start_time = time.time()
    try:
        # Rule-based relation matching
        relations = []

        # Relation keywords and the entity-type constraints for each relation
        relation_rules = {
            "位于": {
                "keywords": ["位于", "在", "坐落于"],
                "valid_types": {
                    "head": ["ORG", "PER", "LOC"],
                    "tail": ["LOC"]
                }
            },
            "属于": {
                "keywords": ["属于", "是", "为"],
                "valid_types": {
                    "head": ["ORG", "PER"],
                    "tail": ["ORG", "LOC"]
                }
            },
            "任职于": {
                "keywords": ["任职于", "就职于", "工作于"],
                "valid_types": {
                    "head": ["PER"],
                    "tail": ["ORG"]
                }
            }
        }

        # Pre-process entities: drop duplicates and partial matches
        processed_entities = []
        for e in entities:
            # Skip entities that are substrings of an already kept entity
            is_subset = False
            for pe in processed_entities:
                if e["text"] in pe["text"] and e["text"] != pe["text"]:
                    is_subset = True
                    break
            if not is_subset:
                processed_entities.append(e)

        # Walk through each sentence in the text
        sentences = re.split('[。!?.!?]', text)
        for sentence in sentences:
            if not sentence.strip():
                continue

            # Entities appearing in the current sentence
            sentence_entities = [e for e in processed_entities if e["text"] in sentence]

            # Check every relation type
            for rel_type, rule in relation_rules.items():
                for keyword in rule["keywords"]:
                    if keyword in sentence:
                        # Look for entity pairs that satisfy the type constraints
                        for i, ent1 in enumerate(sentence_entities):
                            for j, ent2 in enumerate(sentence_entities):
                                if i != j:  # avoid self-loops
                                    # Check whether the entity types match the rule
                                    if (ent1["type"] in rule["valid_types"]["head"] and
                                            ent2["type"] in rule["valid_types"]["tail"]):
                                        # Check the order of the entities in the sentence
                                        if sentence.find(ent1["text"]) < sentence.find(ent2["text"]):
                                            relations.append({
                                                "head": ent1["text"],
                                                "tail": ent2["text"],
                                                "relation": rel_type
                                            })

        # Deduplicate
        unique_relations = []
        seen = set()
        for rel in relations:
            rel_key = (rel["head"], rel["tail"], rel["relation"])
            if rel_key not in seen:
                seen.add(rel_key)
                unique_relations.append(rel)

        return unique_relations, time.time() - start_time

    except Exception as e:
        logging.error(f"关系抽取失败: {e}")
        return [], time.time() - start_time


# ======================== Main text-analysis pipeline ========================
def create_knowledge_graph(entities, relations):
    """
    Build the knowledge graph as an interactive network visualization.
    """
    # Create a new network graph
    net = Network(height="600px", width="100%", bgcolor="#ffffff", font_color="black", directed=True)

    # Color mapping for entity types
    entity_colors = {
        'PER': '#FF6B6B',    # person - red
        'ORG': '#4ECDC4',    # organization - teal
        'LOC': '#45B7D1',    # location - blue
        'TIME': '#96CEB4',   # time - green
        'TITLE': '#D4A5A5'   # title/position - pink
    }

    # Add entity nodes
    added_nodes = set()
    for entity in entities:
        if entity['text'] not in added_nodes:
            node_color = entity_colors.get(entity['type'], '#D3D3D3')
            net.add_node(
                entity['text'],
                label=entity['text'],
                title=f"类型: {entity['type']}",
                color=node_color,
                size=20,
                font={'size': 16}
            )
            added_nodes.add(entity['text'])

    # Add relation edges
    for relation in relations:
        if relation['head'] in added_nodes and relation['tail'] in added_nodes:
            net.add_edge(
                relation['head'],
                relation['tail'],
                label=relation['relation'],
                title=relation['relation'],
                arrows={'to': {'enabled': True, 'type': 'arrow'}},
                color={'color': '#666666'},
                font={'size': 12}
            )

    # Physics layout options
    net.set_options('''
    {
      "nodes": {
        "shape": "dot",
        "shadow": true
      },
      "edges": {
        "smooth": {
          "type": "continuous",
          "forceDirection": "none"
        },
        "shadow": true
      },
      "physics": {
        "barnesHut": {
          "gravitationalConstant": -2000,
          "centralGravity": 0.3,
          "springLength": 200,
          "springConstant": 0.04,
          "damping": 0.09
        },
        "minVelocity": 0.75
      },
      "interaction": {
        "hover": true,
        "navigationButtons": true,
        "keyboard": true
      }
    }
    ''')

    try:
        # Create the temp directory if it does not exist
        temp_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "temp")
        os.makedirs(temp_dir, exist_ok=True)

        # Generate a unique file name
        timestamp = int(time.time())
        html_file = os.path.join(temp_dir, f"kg_{timestamp}.html")

        # Save the HTML file
        net.save_graph(html_file)

        # Read the HTML content back
        with open(html_file, 'r', encoding='utf-8') as f:
            html_content = f.read()

        # Add a legend
        legend_html = f"""
        <div style="margin-bottom: 10px; padding: 10px; background-color: #f8f9fa; border-radius: 5px;">
            <div style="font-weight: bold; margin-bottom: 5px;">图例说明:</div>
            <div style="display: flex; gap: 15px; flex-wrap: wrap;">
                <div style="display: flex; align-items: center; gap: 5px;">
                    <div style="width: 15px; height: 15px; background: {entity_colors['PER']}; border-radius: 50%;"></div>
                    <span>人物 (PER)</span>
                </div>
                <div style="display: flex; align-items: center; gap: 5px;">
                    <div style="width: 15px; height: 15px; background: {entity_colors['ORG']}; border-radius: 50%;"></div>
                    <span>组织 (ORG)</span>
                </div>
                <div style="display: flex; align-items: center; gap: 5px;">
                    <div style="width: 15px; height: 15px; background: {entity_colors['LOC']}; border-radius: 50%;"></div>
                    <span>地点 (LOC)</span>
                </div>
                <div style="display: flex; align-items: center; gap: 5px;">
                    <div style="width: 15px; height: 15px; background: {entity_colors['TITLE']}; border-radius: 50%;"></div>
                    <span>职位 (TITLE)</span>
                </div>
            </div>
        </div>
        """

        # Prepend the legend to the HTML content
        html_content = legend_html + html_content

        # Clean up old temporary files
        for old_file in os.listdir(temp_dir):
            if old_file.startswith("kg_") and old_file.endswith(".html"):
                old_path = os.path.join(temp_dir, old_file)
                if os.path.getmtime(old_path) < time.time() - 3600:  # delete files older than one hour
                    try:
                        os.remove(old_path)
                    except OSError:
                        pass

        return html_content

    except Exception as e:
        logging.error(f"生成知识图谱失败: {str(e)}")
        return f"<div class='error'>生成知识图谱失败: {str(e)}</div>"

def process_text(text, model_type="bert"):
    """
    Run entity recognition and relation extraction on a piece of text.
    """
    start_time = time.time()

    # Entity recognition
    entities, ner_duration = ner(text, model_type)
    if not entities:
        return "", "", "", f"{time.time() - start_time:.2f} 秒"

    # Relation extraction
    relations, re_duration = re_extract(entities, text)

    # Plain-text summary of entities and relations
    ent_text = "📌 实体:\n" + "\n".join([f"{e['text']} ({e['type']})" for e in entities])
    rel_text = "\n\n📎 关系:\n" + "\n".join([f"{r['head']} --[{r['relation']}]--> {r['tail']}" for r in relations])

    # Build the knowledge graph
    kg_text = create_knowledge_graph(entities, relations)

    total_duration = time.time() - start_time
    return ent_text, rel_text, kg_text, f"{total_duration:.2f} 秒"


def process_file(file, model_type="bert"):
    try:
        with open(file.name, 'rb') as f:
            content = f.read()

        if len(content) > 5 * 1024 * 1024:
            return "❌ 文件太大", "", "", ""

        # Detect the encoding
        try:
            encoding = chardet.detect(content)['encoding'] or 'utf-8'
            text = content.decode(encoding)
        except UnicodeDecodeError:
            # Fall back to common Chinese encodings
            for enc in ['gb18030', 'utf-16', 'big5']:
                try:
                    text = content.decode(enc)
                    break
                except UnicodeDecodeError:
                    continue
            else:
                return "❌ 编码解析失败", "", "", ""

        # Delegate to process_text
        return process_text(text, model_type)

    except Exception as e:
        logging.error(f"文件处理错误: {str(e)}")
        return f"❌ 文件处理错误: {str(e)}", "", "", ""


# ======================== Model evaluation and auto-annotation ========================
def convert_telegram_json_to_eval_format(path):
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    if isinstance(data, dict) and "text" in data:
        return [{"text": data["text"], "entities": [
            {"text": data["text"][e["start"]:e["end"]]} for e in data.get("entities", [])
        ]}]
    elif isinstance(data, list):
        return data
    elif isinstance(data, dict) and "messages" in data:
        result = []
        for m in data.get("messages", []):
            if isinstance(m.get("text"), str):
                result.append({"text": m["text"], "entities": []})
            elif isinstance(m.get("text"), list):
                txt = ''.join([x["text"] if isinstance(x, dict) else x for x in m["text"]])
                result.append({"text": txt, "entities": []})
        return result
    return []

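# For reference, evaluate_ner_model() below expects each item of the annotated data to
# look roughly like the following (a hypothetical example inferred from the fields the
# function reads; it is not a file shipped with this Space):
#
#   {
#       "text": "马云在杭州创办了阿里巴巴",
#       "entities": [
#           {"text": "马云", "type": "name", "start": 0, "end": 2},
#           {"text": "杭州", "type": "address", "start": 3, "end": 5},
#           {"text": "阿里巴巴", "type": "company", "start": 8, "end": 12}
#       ]
#   }
#
# Types are normalized through LABEL_MAPPING, and start/end offsets are compared with a
# tolerance of POS_TOLERANCE characters.
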
def evaluate_ner_model(data, model_type):
    tp, fp, fn = 0, 0, 0
    POS_TOLERANCE = 1

    for item in data:
        text = item["text"]
        # Collect the gold annotations
        gold_entities = []
        for e in item.get("entities", []):
            if "text" in e and "type" in e:
                norm_type = LABEL_MAPPING.get(e["type"], e["type"])
                gold_entities.append({
                    "text": e["text"],
                    "type": norm_type,
                    "start": e.get("start", -1),
                    "end": e.get("end", -1)
                })

        # Get the model predictions
        pred_entities, _ = ner(text, model_type)

        # Initialize match bookkeeping
        matched_gold = [False] * len(gold_entities)
        matched_pred = [False] * len(pred_entities)

        # Match each predicted entity against the gold entities
        for p_idx, p in enumerate(pred_entities):
            for g_idx, g in enumerate(gold_entities):
                if not matched_gold[g_idx] and \
                        p["text"] == g["text"] and \
                        p["type"] == g["type"] and \
                        abs(p["start"] - g["start"]) <= POS_TOLERANCE and \
                        abs(p["end"] - g["end"]) <= POS_TOLERANCE:
                    matched_gold[g_idx] = True
                    matched_pred[p_idx] = True
                    break

        # Accumulate the counts
        tp += sum(matched_pred)
        fp += len(pred_entities) - sum(matched_pred)
        fn += len(gold_entities) - sum(matched_gold)

    # Guard against division by zero
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return (f"Precision: {precision:.2f}\n"
            f"Recall: {recall:.2f}\n"
            f"F1: {f1:.2f}")


def auto_annotate(file, model_type):
    data = convert_telegram_json_to_eval_format(file.name)
    for item in data:
        ents, _ = ner(item["text"], model_type)
        item["entities"] = ents
    return json.dumps(data, ensure_ascii=False, indent=2)


def save_json(json_text):
    fname = f"auto_labeled_{int(time.time())}.json"
    with open(fname, "w", encoding="utf-8") as f:
        f.write(json_text)
    return fname


# ======================== Dataset import ========================
def import_dataset(path="D:/云边智算/暗语识别/filtered_results"):
    processed = []
    for filename in os.listdir(path):
        if filename.endswith('.json'):
            filepath = os.path.join(path, filename)
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Run the existing processing pipeline
            process_text(data['text'])
            print(f"已处理文件: {filename}")
            processed.append(filename)
    # Return a short log so the Gradio output box has something to show
    return "已处理文件:\n" + "\n".join(processed) if processed else "未找到可导入的JSON文件"


# ======================== Gradio UI ========================
with gr.Blocks(css="""
    .kg-graph {height: 700px; overflow-y: auto;}
    .warning {color: #ff6b6b;}
    .error {color: #ff0000; padding: 10px; background-color: #ffeeee; border-radius: 5px;}
""") as demo:
    gr.Markdown("# 🤖 聊天记录实体关系识别系统")

    with gr.Tab("📄 文本分析"):
        input_text = gr.Textbox(lines=6, label="输入文本")
        model_type = gr.Radio(["bert", "chatglm"], value="bert", label="选择模型")
        btn = gr.Button("开始分析")
        out1 = gr.Textbox(label="识别实体")
        out2 = gr.Textbox(label="识别关系")
        out3 = gr.HTML(label="知识图谱")
        out4 = gr.Textbox(label="耗时")
        btn.click(fn=process_text, inputs=[input_text, model_type], outputs=[out1, out2, out3, out4])

    with gr.Tab("🗂 文件分析"):
        file_input = gr.File(file_types=[".txt", ".json"])
        file_btn = gr.Button("上传并分析")
        fout1, fout2, fout3, fout4 = gr.Textbox(), gr.Textbox(), gr.Textbox(), gr.Textbox()
        file_btn.click(fn=process_file, inputs=[file_input, model_type], outputs=[fout1, fout2, fout3, fout4])

    with gr.Tab("📊 模型评估"):
        eval_file = gr.File(label="上传标注 JSON")
        eval_model = gr.Radio(["bert", "chatglm"], value="bert")
        eval_btn = gr.Button("开始评估")
        eval_output = gr.Textbox(label="评估结果", lines=5)
        eval_btn.click(lambda f, m: evaluate_ner_model(convert_telegram_json_to_eval_format(f.name), m),
                       [eval_file, eval_model], eval_output)

    with gr.Tab("✏️ 自动标注"):
        raw_file = gr.File(label="上传 Telegram 原始 JSON")
        auto_model = gr.Radio(["bert", "chatglm"], value="bert")
        auto_btn = gr.Button("自动标注")
        marked_texts = gr.Textbox(label="标注结果", lines=20)
        download_btn = gr.Button("💾 下载标注文件")
        auto_btn.click(fn=auto_annotate, inputs=[raw_file, auto_model], outputs=marked_texts)
        download_btn.click(fn=save_json, inputs=marked_texts, outputs=gr.File())

    with gr.Tab("📂 数据管理"):
        gr.Markdown("### 数据集导入")
        dataset_path = gr.Textbox(
            value="D:/云边智算/暗语识别/filtered_results",
            label="数据集路径"
        )
        import_btn = gr.Button("导入数据集到数据库")
        import_output = gr.Textbox(label="导入日志")
        # Pass the textbox as an input so the user-edited path (not just the default value) is used
        import_btn.click(fn=import_dataset, inputs=dataset_path, outputs=import_output)

demo.launch(server_name="0.0.0.0", server_port=7860)