Spaces:

chen666-666
/

wechat-ner-re

Sleeping

File size: 25,113 Bytes

import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, AutoModel
import gradio as gr
import re
import os
import json
import chardet
from sklearn.metrics import precision_score, recall_score, f1_score
import time
from functools import lru_cache  # 添加这行导入
# ======================== 数据库模块 ========================
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from contextlib import contextmanager
import logging
import networkx as nx
from pyvis.network import Network
import pandas as pd
import matplotlib.pyplot as plt

# 配置日志
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# 使用SQLAlchemy的连接池来管理数据库连接
DATABASE_URL = "mysql+pymysql://user:password@host/dbname"  # 请根据实际情况修改连接字符串

# 创建引擎（连接池）
engine = create_engine(DATABASE_URL, pool_size=10, max_overflow=20, echo=True)

# 创建session类
Session = sessionmaker(bind=engine)

@contextmanager
def get_db_connection():
    """
    使用上下文管理器获取数据库连接
    """
    session = None
    try:
        session = Session()  # 从连接池中获取一个连接
        logging.info("✅ 数据库连接已建立")
        yield session  # 使用session进行数据库操作
    except Exception as e:
        logging.error(f"❌ 数据库操作时发生错误: {e}")
        if session:
            session.rollback()  # 回滚事务
    finally:
        if session:
            try:
                session.commit()  # 提交事务
                logging.info("✅ 数据库事务已提交")
            except Exception as e:
                logging.error(f"❌ 提交事务时发生错误: {e}")
            finally:
                session.close()  # 关闭会话，释放连接
                logging.info("✅ 数据库连接已关闭")

def save_to_db(table, data):
    """
    将数据保存到数据库
    :param table: 表名
    :param data: 数据字典
    """
    try:
        valid_tables = ["entities", "relations"]  # 只允许保存到这些表
        if table not in valid_tables:
            raise ValueError(f"Invalid table: {table}")
        
        with get_db_connection() as conn:
            if conn:
                # 这里的操作假设使用了ORM模型来处理插入，实际根据你数据库的表结构来调整
                table_model = get_table_model(table)  # 假设你有一个方法来根据表名获得ORM模型
                new_record = table_model(**data)
                conn.add(new_record)
                conn.commit()  # 提交事务
    except Exception as e:
        logging.error(f"❌ 保存数据时发生错误: {e}")
        return False
    return True

def get_table_model(table_name):
    """
    根据表名获取ORM模型（这里假设你有一个映射到数据库表的模型）
    :param table_name: 表名
    :return: 对应的ORM模型
    """
    if table_name == "entities":
        from models import Entity  # 假设你已经定义了ORM模型
        return Entity
    elif table_name == "relations":
        from models import Relation  # 假设你已经定义了ORM模型
        return Relation
    else:
        raise ValueError(f"Unknown table: {table_name}")


# ======================== 模型加载 ========================
NER_MODEL_NAME = "uer/roberta-base-finetuned-cluener2020-chinese"

@lru_cache(maxsize=1)
def get_ner_pipeline():
    tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_NAME)
    model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_NAME)
    return pipeline(
        "ner",
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="first"
    )

@lru_cache(maxsize=1)
def get_re_pipeline():
    return pipeline(
        "text2text-generation",
        model=NER_MODEL_NAME,
        tokenizer=NER_MODEL_NAME,
        max_length=512,
        device=0 if torch.cuda.is_available() else -1
    )


# chatglm_model, chatglm_tokenizer = None, None
# use_chatglm = False
# try:
#     chatglm_model_name = "THUDM/chatglm-6b-int4"
#     chatglm_tokenizer = AutoTokenizer.from_pretrained(chatglm_model_name, trust_remote_code=True)
#     chatglm_model = AutoModel.from_pretrained(
#         chatglm_model_name,
#         trust_remote_code=True,
#         device_map="cpu",
#         torch_dtype=torch.float32
#     ).eval()
#     use_chatglm = True
#     print("✅ 4-bit量化版ChatGLM加载成功")
# except Exception as e:
#     print(f"❌ ChatGLM加载失败: {e}")

# ======================== 知识图谱结构 ========================
knowledge_graph = {"entities": set(), "relations": set()}


def update_knowledge_graph(entities, relations):
    # 保存实体
    for e in entities:
        if isinstance(e, dict) and 'text' in e and 'type' in e:
            save_to_db('entities', {
                'text': e['text'],
                'type': e['type'],
                'start_pos': e.get('start', -1),
                'end_pos': e.get('end', -1),
                'source': 'user_input'
            })

    # 保存关系
    for r in relations:
        if isinstance(r, dict) and all(k in r for k in ("head", "tail", "relation")):
            save_to_db('relations', {
                'head_entity': r['head'],
                'tail_entity': r['tail'],
                'relation_type': r['relation'],
                'source_text': ''  # 可添加原文关联
            })


def visualize_kg_text():
    nodes = [f"{ent[0]} ({ent[1]})" for ent in knowledge_graph["entities"]]
    edges = [f"{h} --[{r}]-> {t}" for h, t, r in knowledge_graph["relations"]]
    return "\n".join(["📌 实体:"] + nodes + ["", "📎 关系:"] + edges)

def visualize_kg_interactive(entities, relations):
    """
    生成交互式的知识图谱可视化
    """
    # 创建一个新的网络图
    net = Network(height="500px", width="100%", bgcolor="#ffffff", font_color="black")
    
    # 添加节点
    entity_colors = {
        'PER': '#FF6B6B',  # 人物-红色
        'ORG': '#4ECDC4',  # 组织-青色
        'LOC': '#45B7D1',  # 地点-蓝色
        'TIME': '#96CEB4', # 时间-绿色
        'MISC': '#D4A5A5'  # 其他-灰色
    }
    
    # 添加实体节点
    for entity in entities:
        node_color = entity_colors.get(entity['type'], '#D3D3D3')
        net.add_node(entity['text'], 
                    label=f"{entity['text']}\n({entity['type']})",
                    color=node_color,
                    title=f"类型: {entity['type']}")
    
    # 添加关系边
    for relation in relations:
        net.add_edge(relation['head'], 
                    relation['tail'],
                    label=relation['relation'],
                    arrows='to')
    
    # 设置物理布局
    net.set_options('''
    var options = {
        "physics": {
            "forceAtlas2Based": {
                "gravitationalConstant": -50,
                "centralGravity": 0.01,
                "springLength": 100,
                "springConstant": 0.08
            },
            "maxVelocity": 50,
            "solver": "forceAtlas2Based",
            "timestep": 0.35,
            "stabilization": {"iterations": 150}
        }
    }
    ''')
    
    # 生成HTML文件
    html_path = "knowledge_graph.html"
    net.save_graph(html_path)
    return html_path

# ======================== 实体识别（NER） ========================
def merge_adjacent_entities(entities):
    if not entities:
        return entities

    merged = [entities[0]]
    for entity in entities[1:]:
        last = merged[-1]
        # 合并相邻的同类型实体
        if (entity["type"] == last["type"] and
                entity["start"] == last["end"]):
            last["text"] += entity["text"]
            last["end"] = entity["end"]
        else:
            merged.append(entity)

    return merged


def ner(text, model_type="bert"):
    start_time = time.time()

    # 如果使用的是 ChatGLM 模型，执行 ChatGLM 的NER
    if model_type == "chatglm" and use_chatglm:
        try:
            prompt = f"""请从以下文本中识别所有实体，严格按照JSON列表格式返回，每个实体包含text、type、start、end字段：
示例：[{{"text": "北京", "type": "LOC", "start": 0, "end": 2}}]
文本：{text}"""
            response = chatglm_model.chat(chatglm_tokenizer, prompt, temperature=0.1)
            if isinstance(response, tuple):
                response = response[0]

            try:
                json_str = re.search(r'\[.*\]', response, re.DOTALL).group()
                entities = json.loads(json_str)
                valid_entities = [ent for ent in entities if all(k in ent for k in ("text", "type", "start", "end"))]
                return valid_entities, time.time() - start_time
            except Exception as e:
                print(f"JSON解析失败: {e}")
                return [], time.time() - start_time
        except Exception as e:
            print(f"ChatGLM调用失败: {e}")
            return [], time.time() - start_time

    # 使用BERT NER
    text_chunks = [text[i:i + 510] for i in range(0, len(text), 510)]  # 安全分段
    raw_results = []
    
    # 获取NER pipeline
    ner_pipeline = get_ner_pipeline()  # 使用缓存的pipeline
    
    for idx, chunk in enumerate(text_chunks):
        chunk_results = ner_pipeline(chunk)  # 使用获取的pipeline
        for r in chunk_results:
            r["start"] += idx * 510
            r["end"] += idx * 510
        raw_results.extend(chunk_results)

    entities = [{
        "text": r['word'].replace(' ', ''),
        "start": r['start'],
        "end": r['end'],
        "type": LABEL_MAPPING.get(r.get('entity_group') or r.get('entity'), r.get('entity_group') or r.get('entity'))
    } for r in raw_results]

    entities = merge_adjacent_entities(entities)
    return entities, time.time() - start_time


# ------------------ 实体类型标准化 ------------------
LABEL_MAPPING = {
    "address": "LOC",
    "company": "ORG",
    "name": "PER",
    "organization": "ORG",
    "position": "TITLE",
    "government": "ORG",
    "scene": "LOC",
    "book": "WORK",
    "movie": "WORK",
    "game": "WORK"
}

# 提取实体
entities, processing_time = ner("Google in New York met Alice")

# 标准化实体类型
for e in entities:
    e["type"] = LABEL_MAPPING.get(e.get("type"), e.get("type"))

# 打印标准化后的实体
print(f"[DEBUG] 标准化后实体列表: {[{'text': e['text'], 'type': e['type']} for e in entities]}")

# 打印处理时间
print(f"处理时间: {processing_time:.2f}秒")


# ======================== 关系抽取（RE） ========================
@lru_cache(maxsize=1)
def get_re_pipeline():
    tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_NAME)
    model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_NAME)
    return pipeline(
        "ner",  # 使用NER pipeline
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="first"
    )

def re_extract(entities, text, use_bert_model=True):
    if not entities or not text:
        return [], 0
    
    start_time = time.time()
    try:
        # 使用规则匹配关系
        relations = []
        
        # 定义关系关键词和对应的实体类型约束
        relation_rules = {
            "位于": {
                "keywords": ["位于", "在", "坐落于"],
                "valid_types": {
                    "head": ["ORG", "PER", "LOC"],
                    "tail": ["LOC"]
                }
            },
            "属于": {
                "keywords": ["属于", "是", "为"],
                "valid_types": {
                    "head": ["ORG", "PER"],
                    "tail": ["ORG", "LOC"]
                }
            },
            "任职于": {
                "keywords": ["任职于", "就职于", "工作于"],
                "valid_types": {
                    "head": ["PER"],
                    "tail": ["ORG"]
                }
            }
        }
        
        # 预处理实体，去除重复和部分匹配
        processed_entities = []
        for e in entities:
            # 检查是否与已有实体重叠
            is_subset = False
            for pe in processed_entities:
                if e["text"] in pe["text"] and e["text"] != pe["text"]:
                    is_subset = True
                    break
            if not is_subset:
                processed_entities.append(e)
        
        # 遍历文本中的每个句子
        sentences = re.split('[。！？.!?]', text)
        for sentence in sentences:
            if not sentence.strip():
                continue
            
            # 获取当前句子中的实体
            sentence_entities = [e for e in processed_entities if e["text"] in sentence]
            
            # 检查每个关系类型
            for rel_type, rule in relation_rules.items():
                for keyword in rule["keywords"]:
                    if keyword in sentence:
                        # 在句子中查找符合类型约束的实体对
                        for i, ent1 in enumerate(sentence_entities):
                            for j, ent2 in enumerate(sentence_entities):
                                if i != j:  # 避免自循环
                                    # 检查实体类型是否符合规则
                                    if (ent1["type"] in rule["valid_types"]["head"] and 
                                        ent2["type"] in rule["valid_types"]["tail"]):
                                        # 检查实体在句子中的位置关系
                                        if sentence.find(ent1["text"]) < sentence.find(ent2["text"]):
                                            relations.append({
                                                "head": ent1["text"],
                                                "tail": ent2["text"],
                                                "relation": rel_type
                                            })
        
        # 去重
        unique_relations = []
        seen = set()
        for rel in relations:
            rel_key = (rel["head"], rel["tail"], rel["relation"])
            if rel_key not in seen:
                seen.add(rel_key)
                unique_relations.append(rel)
        
        return unique_relations, time.time() - start_time
            
    except Exception as e:
        logging.error(f"关系抽取失败: {e}")
        return [], time.time() - start_time


# ======================== 文本分析主流程 ========================
def create_knowledge_graph(entities, relations):
    """
    创建知识图谱可视化
    """
    # 创建一个新的有向图
    G = nx.DiGraph()
    
    # 设置实体类型的颜色映射
    entity_colors = {
        'PER': '#FF6B6B',  # 人物-红色
        'ORG': '#4ECDC4',  # 组织-青色
        'LOC': '#45B7D1',  # 地点-蓝色
        'TIME': '#96CEB4', # 时间-绿色
        'TITLE': '#D4A5A5' # 职位-粉色
    }
    
    # 添加节点
    node_colors = []
    for entity in entities:
        G.add_node(entity['text'])
        node_colors.append(entity_colors.get(entity['type'], '#D3D3D3'))
    
    # 添加边
    for relation in relations:
        G.add_edge(relation['head'], relation['tail'], label=relation['relation'])
    
    # 创建图形
    plt.figure(figsize=(12, 8))
    pos = nx.spring_layout(G, k=1, iterations=50)
    
    # 绘制节点
    nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=1000, alpha=0.8)
    
    # 绘制边
    nx.draw_networkx_edges(G, pos, edge_color='gray', arrows=True, arrowsize=20)
    
    # 绘制标签
    nx.draw_networkx_labels(G, pos, font_size=10, font_family='SimHei')
    
    # 绘制边的标签
    edge_labels = nx.get_edge_attributes(G, 'label')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=8, font_family='SimHei')
    
    # 添加图例
    legend_elements = [
        plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color, label=label, markersize=10)
        for label, color in entity_colors.items()
    ]
    plt.legend(handles=legend_elements, loc='upper right', bbox_to_anchor=(1.15, 1))
    
    # 设置图形属性
    plt.title("知识图谱", fontsize=16, fontfamily='SimHei')
    plt.axis('off')
    
    # 保存图形到临时文件
    temp_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "temp")
    os.makedirs(temp_dir, exist_ok=True)
    timestamp = int(time.time())
    image_path = os.path.join(temp_dir, f"kg_{timestamp}.png")
    plt.savefig(image_path, bbox_inches='tight', dpi=300)
    plt.close()
    
    # 返回图片路径
    return image_path

def process_text(text, model_type="bert"):
    """
    处理文本，进行实体识别和关系抽取
    """
    start_time = time.time()
    
    # 实体识别
    entities, ner_duration = ner(text, model_type)
    if not entities:
        return "", "", "", f"{time.time() - start_time:.2f} 秒"
    
    # 关系抽取
    relations, re_duration = re_extract(entities, text)
    
    # 生成文本格式的实体和关系描述
    ent_text = "📌 实体:\n" + "\n".join([f"{e['text']} ({e['type']})" for e in entities])
    rel_text = "\n\n📎 关系:\n" + "\n".join([f"{r['head']} --[{r['relation']}]--> {r['tail']}" for r in relations])
    
    # 生成知识图谱
    kg_image = create_knowledge_graph(entities, relations)
    
    total_duration = time.time() - start_time
    return ent_text, rel_text, kg_image, f"{total_duration:.2f} 秒"


def process_file(file, model_type="bert"):
    try:
        with open(file.name, 'rb') as f:
            content = f.read()

        if len(content) > 5 * 1024 * 1024:
            return "❌ 文件太大", "", "", ""

        # 检测编码
        try:
            encoding = chardet.detect(content)['encoding'] or 'utf-8'
            text = content.decode(encoding)
        except UnicodeDecodeError:
            # 尝试常见中文编码
            for enc in ['gb18030', 'utf-16', 'big5']:
                try:
                    text = content.decode(enc)
                    break
                except:
                    continue
            else:
                return "❌ 编码解析失败", "", "", ""

        # 直接调用process_text处理文本
        return process_text(text, model_type)
        
    except Exception as e:
        logging.error(f"文件处理错误: {str(e)}")
        return f"❌ 文件处理错误: {str(e)}", "", "", ""



# ======================== 模型评估与自动标注 ========================
def convert_telegram_json_to_eval_format(path):
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    if isinstance(data, dict) and "text" in data:
        return [{"text": data["text"], "entities": [
            {"text": data["text"][e["start"]:e["end"]]} for e in data.get("entities", [])
        ]}]
    elif isinstance(data, list):
        return data
    elif isinstance(data, dict) and "messages" in data:
        result = []
        for m in data.get("messages", []):
            if isinstance(m.get("text"), str):
                result.append({"text": m["text"], "entities": []})
            elif isinstance(m.get("text"), list):
                txt = ''.join([x["text"] if isinstance(x, dict) else x for x in m["text"]])
                result.append({"text": txt, "entities": []})
        return result
    return []


def evaluate_ner_model(data, model_type):
    tp, fp, fn = 0, 0, 0
    POS_TOLERANCE = 1

    for item in data:
        text = item["text"]
        # 处理标注数据
        gold_entities = []
        for e in item.get("entities", []):
            if "text" in e and "type" in e:
                norm_type = LABEL_MAPPING.get(e["type"], e["type"])
                gold_entities.append({
                    "text": e["text"],
                    "type": norm_type,
                    "start": e.get("start", -1),
                    "end": e.get("end", -1)
                })

        # 获取预测结果
        pred_entities, _ = ner(text, model_type)

        # 初始化匹配状态
        matched_gold = [False] * len(gold_entities)
        matched_pred = [False] * len(pred_entities)

        # 遍历预测实体寻找匹配
        for p_idx, p in enumerate(pred_entities):
            for g_idx, g in enumerate(gold_entities):
                if not matched_gold[g_idx] and \
                        p["text"] == g["text"] and \
                        p["type"] == g["type"] and \
                        abs(p["start"] - g["start"]) <= POS_TOLERANCE and \
                        abs(p["end"] - g["end"]) <= POS_TOLERANCE:
                    matched_gold[g_idx] = True
                    matched_pred[p_idx] = True
                    break

        # 统计指标
        tp += sum(matched_pred)
        fp += len(pred_entities) - sum(matched_pred)
        fn += len(gold_entities) - sum(matched_gold)

    # 处理除零情况
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return (f"Precision: {precision:.2f}\n"
            f"Recall: {recall:.2f}\n"
            f"F1: {f1:.2f}")


def auto_annotate(file, model_type):
    data = convert_telegram_json_to_eval_format(file.name)
    for item in data:
        ents, _ = ner(item["text"], model_type)
        item["entities"] = ents
    return json.dumps(data, ensure_ascii=False, indent=2)


def save_json(json_text):
    fname = f"auto_labeled_{int(time.time())}.json"
    with open(fname, "w", encoding="utf-8") as f:
        f.write(json_text)
    return fname


# ======================== 数据集导入 ========================
def import_dataset(path="D:/云边智算/暗语识别/filtered_results"):
    import os
    import json

    for filename in os.listdir(path):
        if filename.endswith('.json'):
            filepath = os.path.join(path, filename)
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
                # 调用现有处理流程
                process_text(data['text'])
                print(f"已处理文件: {filename}")


# ======================== Gradio 界面 ========================
with gr.Blocks(css="""
    .kg-graph {height: 700px; overflow-y: auto;}
    .warning {color: #ff6b6b;}
    .error {color: #ff0000; padding: 10px; background-color: #ffeeee; border-radius: 5px;}
""") as demo:
    gr.Markdown("# 🤖 聊天记录实体关系识别系统")

    with gr.Tab("📄 文本分析"):
        input_text = gr.Textbox(lines=6, label="输入文本")
        model_type = gr.Radio(["bert", "chatglm"], value="bert", label="选择模型")
        btn = gr.Button("开始分析")
        out1 = gr.Textbox(label="识别实体")
        out2 = gr.Textbox(label="识别关系")
        out3 = gr.Image(label="知识图谱")
        out4 = gr.Textbox(label="耗时")
        btn.click(fn=process_text, inputs=[input_text, model_type], outputs=[out1, out2, out3, out4])

    with gr.Tab("🗂 文件分析"):
        file_input = gr.File(file_types=[".txt", ".json"])
        file_btn = gr.Button("上传并分析")
        fout1, fout2, fout3, fout4 = gr.Textbox(), gr.Textbox(), gr.Textbox(), gr.Textbox()
        file_btn.click(fn=process_file, inputs=[file_input, model_type], outputs=[fout1, fout2, fout3, fout4])

    with gr.Tab("📊 模型评估"):
        eval_file = gr.File(label="上传标注 JSON")
        eval_model = gr.Radio(["bert", "chatglm"], value="bert")
        eval_btn = gr.Button("开始评估")
        eval_output = gr.Textbox(label="评估结果", lines=5)
        eval_btn.click(lambda f, m: evaluate_ner_model(convert_telegram_json_to_eval_format(f.name), m),
                       [eval_file, eval_model], eval_output)

    with gr.Tab("✏️ 自动标注"):
        raw_file = gr.File(label="上传 Telegram 原始 JSON")
        auto_model = gr.Radio(["bert", "chatglm"], value="bert")
        auto_btn = gr.Button("自动标注")
        marked_texts = gr.Textbox(label="标注结果", lines=20)
        download_btn = gr.Button("💾 下载标注文件")
        auto_btn.click(fn=auto_annotate, inputs=[raw_file, auto_model], outputs=marked_texts)
        download_btn.click(fn=save_json, inputs=marked_texts, outputs=gr.File())

    with gr.Tab("📂 数据管理"):
        gr.Markdown("### 数据集导入")
        dataset_path = gr.Textbox(
            value="D:/云边智算/暗语识别/filtered_results",
            label="数据集路径"
        )
        import_btn = gr.Button("导入数据集到数据库")
        import_output = gr.Textbox(label="导入日志")
        import_btn.click(fn=lambda: import_dataset(dataset_path.value), outputs=import_output)

demo.launch(server_name="0.0.0.0", server_port=7860)