jackkuo commited on
Commit
79899c0
·
1 Parent(s): 02f96ad
Files changed (43) hide show
  1. .gitattributes +1 -0
  2. app.py +160 -0
  3. config.json +5 -5
  4. docker-compose.yml +12 -0
  5. logs/bio_rag_2025-08-25.log +0 -0
  6. python-services/Retrieve/.gitignore +9 -0
  7. python-services/Retrieve/Dockerfile +41 -0
  8. python-services/Retrieve/MCP_USAGE.md +207 -0
  9. python-services/Retrieve/README_ENV.md +99 -0
  10. python-services/Retrieve/bio_agent/rewrite_agent.py +255 -0
  11. python-services/Retrieve/bio_requests/chat_request.py +17 -0
  12. python-services/Retrieve/bio_requests/rag_request.py +44 -0
  13. python-services/Retrieve/config/2023JCR(完整).xlsx +3 -0
  14. python-services/Retrieve/config/app_config_dev.yaml +60 -0
  15. python-services/Retrieve/config/global_storage.py +121 -0
  16. python-services/Retrieve/dto/bio_document.py +111 -0
  17. python-services/Retrieve/main.py +93 -0
  18. python-services/Retrieve/readme.md +284 -0
  19. python-services/Retrieve/requirements.txt +19 -0
  20. python-services/Retrieve/routers/mcp_sensor.py +81 -0
  21. python-services/Retrieve/routers/sensor.py +83 -0
  22. python-services/Retrieve/search_service/base_search.py +28 -0
  23. python-services/Retrieve/search_service/pubmed_search.py +197 -0
  24. python-services/Retrieve/search_service/web_search.py +163 -0
  25. python-services/Retrieve/service/__init__.py +0 -0
  26. python-services/Retrieve/service/chat.py +468 -0
  27. python-services/Retrieve/service/pubmed_api.py +164 -0
  28. python-services/Retrieve/service/pubmed_async_api.py +195 -0
  29. python-services/Retrieve/service/pubmed_xml_parse.py +232 -0
  30. python-services/Retrieve/service/query_rewrite.py +354 -0
  31. python-services/Retrieve/service/rag.py +54 -0
  32. python-services/Retrieve/service/rerank.py +60 -0
  33. python-services/Retrieve/service/web_search.py +406 -0
  34. python-services/Retrieve/utils/bio_logger.py +253 -0
  35. python-services/Retrieve/utils/http_util.py +275 -0
  36. python-services/Retrieve/utils/i18n_context.py +125 -0
  37. python-services/Retrieve/utils/i18n_messages.py +262 -0
  38. python-services/Retrieve/utils/i18n_types.py +12 -0
  39. python-services/Retrieve/utils/i18n_util.py +302 -0
  40. python-services/Retrieve/utils/snowflake_id.py +252 -0
  41. python-services/Retrieve/utils/token_util.py +63 -0
  42. requirements.txt +18 -1
  43. requirements_back.txt +15 -0
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  assets/*.png filter=lfs diff=lfs merge=lfs -text
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  assets/*.png filter=lfs diff=lfs merge=lfs -text
37
+ *.xlsx filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -5,6 +5,10 @@ import json
5
  import os
6
  import platform
7
  import time
 
 
 
 
8
 
9
  if platform.system() == "Windows":
10
  asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
@@ -88,6 +92,112 @@ def save_config_to_json(config):
88
  st.error(f"Error saving settings file: {str(e)}")
89
  return False
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  # Initialize login session variables
92
  if "authenticated" not in st.session_state:
93
  st.session_state.authenticated = False
@@ -843,6 +953,14 @@ async def initialize_session(mcp_config=None):
843
  # Load settings from config.json file
844
  mcp_config = load_config_from_json()
845
 
 
 
 
 
 
 
 
 
846
  # Validate MCP configuration before connecting
847
  st.info("🔍 Validating MCP server configurations...")
848
  config_errors = []
@@ -1369,6 +1487,32 @@ with st.sidebar:
1369
  # Action buttons section
1370
  st.subheader("🔄 Actions")
1371
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1372
  # Reset conversation button
1373
  if st.button("Reset Conversation", use_container_width=True, type="primary"):
1374
  # Reset thread_id
@@ -1435,3 +1579,19 @@ if user_query:
1435
  st.warning(
1436
  "⚠️ MCP server and agent are not initialized. Please click the 'Apply Settings' button in the left sidebar to initialize."
1437
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import os
6
  import platform
7
  import time
8
+ import subprocess
9
+ import threading
10
+ import signal
11
+ import sys
12
 
13
  if platform.system() == "Windows":
14
  asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
 
92
  st.error(f"Error saving settings file: {str(e)}")
93
  return False
94
 
95
+ def start_retrieve_service():
96
+ """
97
+ 启动 Retrieve 服务作为后台进程
98
+ """
99
+ try:
100
+ # 检查服务是否已经在运行
101
+ if "retrieve_process" in st.session_state and st.session_state.retrieve_process:
102
+ try:
103
+ # 检查进程是否还在运行
104
+ if st.session_state.retrieve_process.poll() is None:
105
+ st.info("✅ Retrieve 服务已经在运行")
106
+ return True
107
+ except:
108
+ pass
109
+
110
+ # 启动服务
111
+ st.info("🚀 正在启动 Retrieve 服务...")
112
+
113
+ # 构建命令 - 使用 cwd 参数设置工作目录
114
+ cmd = ["python", "main.py"]
115
+
116
+ # 启动进程
117
+ process = subprocess.Popen(
118
+ cmd,
119
+ stdout=subprocess.PIPE,
120
+ stderr=subprocess.PIPE,
121
+ text=True,
122
+ bufsize=1,
123
+ universal_newlines=True,
124
+ cwd="python-services/Retrieve" # 设置工作目录
125
+ )
126
+
127
+ # 存储进程引用
128
+ st.session_state.retrieve_process = process
129
+ st.session_state.retrieve_started = True
130
+
131
+ # 启动后台线程来监控进程输出
132
+ def monitor_process():
133
+ try:
134
+ while process.poll() is None:
135
+ # 读取输出
136
+ output = process.stdout.readline()
137
+ if output:
138
+ st.info(f"Retrieve 服务: {output.strip()}")
139
+
140
+ # 检查错误输出
141
+ error = process.stderr.readline()
142
+ if error:
143
+ st.warning(f"Retrieve 服务错误: {error.strip()}")
144
+
145
+ time.sleep(0.1)
146
+
147
+ # 进程结束
148
+ st.warning(f"Retrieve 服务已停止,退出码: {process.returncode}")
149
+ st.session_state.retrieve_started = False
150
+
151
+ except Exception as e:
152
+ st.error(f"监控 Retrieve 服务时出错: {str(e)}")
153
+
154
+ # 启动监控线程
155
+ monitor_thread = threading.Thread(target=monitor_process, daemon=True)
156
+ monitor_thread.start()
157
+
158
+ # 等待一下确保服务启动
159
+ time.sleep(2)
160
+
161
+ # 检查服务是否成功启动
162
+ if process.poll() is None:
163
+ st.success("✅ Retrieve 服务启动成功")
164
+ return True
165
+ else:
166
+ st.error("❌ Retrieve 服务启动失败")
167
+ return False
168
+
169
+ except Exception as e:
170
+ st.error(f"启动 Retrieve 服务时出错: {str(e)}")
171
+ return False
172
+
173
+ def stop_retrieve_service():
174
+ """
175
+ 停止 Retrieve 服务
176
+ """
177
+ try:
178
+ if "retrieve_process" in st.session_state and st.session_state.retrieve_process:
179
+ process = st.session_state.retrieve_process
180
+ if process.poll() is None:
181
+ # 发送终止信号
182
+ process.terminate()
183
+
184
+ # 等待进程结束
185
+ try:
186
+ process.wait(timeout=5)
187
+ except subprocess.TimeoutExpired:
188
+ # 强制杀死进程
189
+ process.kill()
190
+
191
+ st.success("✅ Retrieve 服务已停止")
192
+ else:
193
+ st.info("Retrieve 服务已经停止")
194
+
195
+ st.session_state.retrieve_started = False
196
+ st.session_state.retrieve_process = None
197
+
198
+ except Exception as e:
199
+ st.error(f"停止 Retrieve 服务时出错: {str(e)}")
200
+
201
  # Initialize login session variables
202
  if "authenticated" not in st.session_state:
203
  st.session_state.authenticated = False
 
953
  # Load settings from config.json file
954
  mcp_config = load_config_from_json()
955
 
956
+ # 自动启动 Retrieve 服务(如果配置中存在)
957
+ if "bio-qa-chat" in mcp_config:
958
+ st.info("🚀 检测到 bio-qa-chat 服务,正在启动...")
959
+ if start_retrieve_service():
960
+ st.success("✅ Retrieve 服务启动成功")
961
+ else:
962
+ st.warning("⚠️ Retrieve 服务启动失败,但继续初始化其他服务")
963
+
964
  # Validate MCP configuration before connecting
965
  st.info("🔍 Validating MCP server configurations...")
966
  config_errors = []
 
1487
  # Action buttons section
1488
  st.subheader("🔄 Actions")
1489
 
1490
+ # Retrieve 服务控制按钮
1491
+ st.subheader("🔧 Retrieve 服务控制")
1492
+
1493
+ col1, col2 = st.columns(2)
1494
+
1495
+ with col1:
1496
+ if st.button("🚀 启动服务", use_container_width=True, type="primary"):
1497
+ if start_retrieve_service():
1498
+ st.success("✅ 服务启动成功")
1499
+ else:
1500
+ st.error("❌ 服务启动失败")
1501
+ st.rerun()
1502
+
1503
+ with col2:
1504
+ if st.button("🛑 停止服务", use_container_width=True, type="secondary"):
1505
+ stop_retrieve_service()
1506
+ st.rerun()
1507
+
1508
+ # 显示服务状态
1509
+ if st.session_state.get("retrieve_started", False):
1510
+ st.success("🟢 Retrieve 服务运行中")
1511
+ else:
1512
+ st.warning("🔴 Retrieve 服务未运行")
1513
+
1514
+ st.divider()
1515
+
1516
  # Reset conversation button
1517
  if st.button("Reset Conversation", use_container_width=True, type="primary"):
1518
  # Reset thread_id
 
1579
  st.warning(
1580
  "⚠️ MCP server and agent are not initialized. Please click the 'Apply Settings' button in the left sidebar to initialize."
1581
  )
1582
+
1583
+ # 应用退出时的清理逻辑
1584
+ def cleanup_on_exit():
1585
+ """应用退出时清理资源"""
1586
+ try:
1587
+ if "retrieve_process" in st.session_state and st.session_state.retrieve_process:
1588
+ stop_retrieve_service()
1589
+ except:
1590
+ pass
1591
+
1592
+ # 注册清理函数
1593
+ import atexit
1594
+ atexit.register(cleanup_on_exit)
1595
+
1596
+ # 注意:在 Streamlit 中不能使用信号处理器,因为它在子线程中运行
1597
+ # 清理逻辑通过 atexit 和页面刷新时的状态检查来处理
config.json CHANGED
@@ -27,16 +27,16 @@
27
  ],
28
  "transport": "stdio"
29
  },
30
- "qa": {
31
- "transport": "sse",
32
- "url": "http://10.15.56.148:9230/sse"
33
- },
34
  "review_generate": {
35
  "transport": "sse",
36
  "url": "http://10.15.56.148:8000/review"
37
  },
38
- "机械问题": {
39
  "transport": "streamable_http",
40
  "url": "http://127.0.0.1:7860/gradio_api/mcp/"
 
 
 
 
41
  }
42
  }
 
27
  ],
28
  "transport": "stdio"
29
  },
 
 
 
 
30
  "review_generate": {
31
  "transport": "sse",
32
  "url": "http://10.15.56.148:8000/review"
33
  },
34
+ "get_200_words": {
35
  "transport": "streamable_http",
36
  "url": "http://127.0.0.1:7860/gradio_api/mcp/"
37
+ },
38
+ "bio-qa-chat": {
39
+ "transport": "sse",
40
+ "url": "http://10.15.56.148:9487/sse"
41
  }
42
  }
docker-compose.yml CHANGED
@@ -15,6 +15,18 @@ services:
15
  networks:
16
  - mcp-network
17
  restart: unless-stopped
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  networks:
20
  mcp-network:
 
15
  networks:
16
  - mcp-network
17
  restart: unless-stopped
18
+ depends_on:
19
+ - retrieve-service
20
+
21
+ # Retrieve服务
22
+ retrieve-service:
23
+ build: ./python-services/Retrieve
24
+ container_name: retrieve-service
25
+ ports:
26
+ - "9487:9487"
27
+ networks:
28
+ - mcp-network
29
+ restart: unless-stopped
30
 
31
  networks:
32
  mcp-network:
logs/bio_rag_2025-08-25.log ADDED
File without changes
python-services/Retrieve/.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ logs/*
2
+ *.pyc
3
+ py_milvus_test.py
4
+ test_vector_search.py
5
+ .vscode/settings.json
6
+ service/Qwen3-Reranker-0.6B
7
+ test_model_api.py
8
+ test/logs
9
+ .conda/*
python-services/Retrieve/Dockerfile ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from python:3.11-slim as builder
2
+
3
+ WORKDIR /app
4
+
5
+ # 首先只复制依赖文件
6
+ COPY requirements.txt .
7
+ RUN pip install --no-cache-dir -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
8
+ RUN pip install -U crawl4ai
9
+ # 运行安装后设置
10
+ RUN crawl4ai-setup
11
+
12
+ # Verify your installation
13
+ RUN crawl4ai-doctor
14
+
15
+ # RUN python -m playwright install --with-deps chromium
16
+ # 第二阶段
17
+ #from python:3.11-slim
18
+
19
+ #WORKDIR /app
20
+
21
+ # 从构建阶段复制已安装的依赖
22
+ #COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
23
+ #COPY --from=builder /ms-playwright /ms-playwright
24
+
25
+
26
+ # 复制应用代码
27
+ COPY . .
28
+
29
+
30
+ # 声明端口
31
+ EXPOSE 9487
32
+
33
+ USER root
34
+
35
+
36
+ # 3. 设置缓存路径并赋予权限
37
+
38
+ # 4. 切换非root用户(避免权限问题)
39
+ # RUN useradd -m appuser && chown -R appuser:appuser /app
40
+ # USER appuser
41
+ CMD ["python", "main.py"]
python-services/Retrieve/MCP_USAGE.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MCP 包装服务使用说明
2
+
3
+ ## 概述
4
+
5
+ 这个服务使用 `FastApiMCP` 将生物医学RAG服务包装成MCP(Model Context Protocol)工具,可以通过MCP客户端调用。
6
+
7
+ ## 服务配置
8
+
9
+ 在 `main.py` 中,服务被包装为:
10
+
11
+ ```python
12
+ mcp = FastApiMCP(app, name="bio qa mcp", include_operations=["bio_qa_stream_chat"])
13
+ mcp.mount_sse()
14
+ ```
15
+
16
+ ## 可用的MCP操作
17
+
18
+ ### 1. bio_qa_stream_chat
19
+
20
+ 这是主要的生物医学问答操作,提供流式RAG问答服务。
21
+
22
+ ## 调用方式
23
+
24
+ ### 方式1: 通过MCP客户端调用
25
+
26
+ #### 1.1 配置MCP客户端
27
+
28
+ 在你的MCP客户端配置中添加:
29
+
30
+ ```json
31
+ {
32
+ "bio_qa_mcp": {
33
+ "url": "http://localhost:9487/sse",
34
+ "transport": "sse"
35
+ }
36
+ }
37
+ ```
38
+
39
+ #### 1.2 调用示例
40
+
41
+ ```python
42
+ # 使用MCP客户端调用
43
+ from langchain_mcp_adapters.client import MultiServerMCPClient
44
+
45
+ # 配置MCP服务器
46
+ mcp_config = {
47
+ "bio_qa_mcp": {
48
+ "url": "http://localhost:9487/sse",
49
+ "transport": "sse"
50
+ }
51
+ }
52
+
53
+ # 创建客户端
54
+ client = MultiServerMCPClient(mcp_config)
55
+
56
+ # 获取工具
57
+ tools = await client.get_tools()
58
+
59
+ # 使用工具
60
+ # 工具名称: bio_qa_stream_chat
61
+ # 参数: query (问题), lang (语言,可选,默认"en")
62
+ ```
63
+
64
+ ### 方式2: 直接HTTP调用
65
+
66
+ #### 2.1 直接调用API端点
67
+
68
+ ```bash
69
+ # 调用生物医学问答接口
70
+ curl -X POST "http://localhost:9487/mcp/bio_qa" \
71
+ -H "Content-Type: application/x-www-form-urlencoded" \
72
+ -d "query=什么是糖尿病?&lang=zh"
73
+ ```
74
+
75
+ #### 2.2 Python requests调用
76
+
77
+ ```python
78
+ import requests
79
+
80
+ # 调用接口
81
+ response = requests.post(
82
+ "http://localhost:9487/mcp/bio_qa",
83
+ data={
84
+ "query": "什么是糖尿病?",
85
+ "lang": "zh"
86
+ }
87
+ )
88
+
89
+ # 处理流式响应
90
+ for line in response.iter_lines():
91
+ if line:
92
+ print(line.decode('utf-8'))
93
+ ```
94
+
95
+ ## 参数说明
96
+
97
+ ### bio_qa_stream_chat 操作
98
+
99
+ - **query** (必需): 问题内容
100
+ - **lang** (可选): 语言设置
101
+ - `"zh"`: 中文
102
+ - `"en"`: 英文(默认)
103
+
104
+ ## 响应格式
105
+
106
+ 服务返回流式响应(Server-Sent Events),格式为:
107
+
108
+ ```
109
+ data: {"type": "result", "content": "回答内容..."}
110
+ data: {"type": "result", "content": "更多内容..."}
111
+ data: {"type": "done", "content": "完成"}
112
+ ```
113
+
114
+ ## 使用场景
115
+
116
+ ### 1. 在LangChain中使用
117
+
118
+ ```python
119
+ from langchain.agents import AgentExecutor, create_openai_functions_agent
120
+ from langchain_openai import ChatOpenAI
121
+
122
+ # 创建代理
123
+ llm = ChatOpenAI(model="gpt-4")
124
+ agent = create_openai_functions_agent(llm, tools, prompt)
125
+ agent_executor = AgentExecutor(agent=agent, tools=tools)
126
+
127
+ # 执行问答
128
+ result = await agent_executor.ainvoke({
129
+ "input": "请帮我查询关于糖尿病的相关信息"
130
+ })
131
+ ```
132
+
133
+ ### 2. 在Streamlit应用中使用
134
+
135
+ ```python
136
+ import streamlit as st
137
+ from langchain_mcp_adapters.client import MultiServerMCPClient
138
+
139
+ # 初始化MCP客户端
140
+ @st.cache_resource
141
+ def get_mcp_client():
142
+ config = {
143
+ "bio_qa_mcp": {
144
+ "url": "http://localhost:9487/sse",
145
+ "transport": "sse"
146
+ }
147
+ }
148
+ return MultiServerMCPClient(config)
149
+
150
+ # 使用
151
+ client = get_mcp_client()
152
+ tools = await client.get_tools()
153
+ ```
154
+
155
+ ## 部署说明
156
+
157
+ ### 1. 启动服务
158
+
159
+ ```bash
160
+ cd python-services/Retrieve
161
+ python main.py
162
+ ```
163
+
164
+ 服务将在 `http://localhost:9487` 启动。
165
+
166
+ ### 2. 环境变量配置
167
+
168
+ 确保设置了必要的环境变量:
169
+
170
+ ```bash
171
+ export ENVIRONMENT=prod
172
+ export QA_LLM_MAIN_API_KEY=your-api-key
173
+ export QA_LLM_MAIN_BASE_URL=your-api-url
174
+ # ... 其他配置
175
+ ```
176
+
177
+ ### 3. 网络访问
178
+
179
+ - 本地访问: `http://localhost:9487`
180
+ - 远程访问: `http://your-server-ip:9487`
181
+
182
+ ## 故障排除
183
+
184
+ ### 常见问题
185
+
186
+ 1. **连接失败**: 检查服务是否启动,端口是否正确
187
+ 2. **认证错误**: 检查API密钥配置
188
+ 3. **流式响应中断**: 检查网络连接稳定性
189
+
190
+ ### 日志查看
191
+
192
+ 服务会记录详细的日志信息,包括:
193
+ - 请求处理时间
194
+ - 错误信息
195
+ - 操作状态
196
+
197
+ ## 扩展功能
198
+
199
+ ### 添加新的MCP操作
200
+
201
+ 1. 在 `routers/mcp_sensor.py` 中添加新的路由
202
+ 2. 在 `main.py` 的 `include_operations` 中添加操作名称
203
+ 3. 重新启动服务
204
+
205
+ ### 自定义响应格式
206
+
207
+ 可以修改 `ChatService` 来定制响应格式,满足不同的MCP客户端需求。
python-services/Retrieve/README_ENV.md ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 混合配置说明
2
+
3
+ ## 概述
4
+
5
+ 本项目采用混合配置方式:
6
+ - **大部分配置**:从YAML配置文件加载(`app_config_dev.yaml` 或 `app_config_prod.yaml`)
7
+ - **敏感配置**:API密钥和base_url从环境变量加载,覆盖YAML文件中的值
8
+
9
+ ## 环境变量列表
10
+
11
+ ### 基础配置
12
+ - `ENVIRONMENT`: 环境类型,可选值:`dev`(开发环境)或 `prod`(生产环境),默认为 `dev`
13
+
14
+ ### API密钥和Base URL配置(从环境变量加载)
15
+
16
+ #### QA LLM 主模型
17
+ - `QA_LLM_MAIN_API_KEY`: API密钥
18
+ - `QA_LLM_MAIN_BASE_URL`: API基础URL
19
+
20
+ #### QA LLM 备用模型
21
+ - `QA_LLM_BACKUP_API_KEY`: API密钥
22
+ - `QA_LLM_BACKUP_BASE_URL`: API基础URL
23
+
24
+ #### Rewrite LLM 备用模型 (GPT-4o)
25
+ - `REWRITE_LLM_BACKUP_API_KEY`: API密钥
26
+ - `REWRITE_LLM_BACKUP_BASE_URL`: API基础URL
27
+
28
+ #### Rewrite LLM 主模型
29
+ - `REWRITE_LLM_MAIN_API_KEY`: API密钥
30
+ - `REWRITE_LLM_MAIN_BASE_URL`: API基础URL
31
+
32
+ #### Web搜索服务
33
+ - `SERPER_API_KEY`: Serper API密钥(用于网络搜索)
34
+
35
+ ## 其他配置(从YAML文件加载)
36
+
37
+ 以下配置仍然从YAML文件加载,包括:
38
+ - 模型名称
39
+ - max_tokens
40
+ - temperature
41
+ - recall配置
42
+ - qa-topk配置
43
+ - qa-prompt-max-token配置
44
+ - chat配置
45
+
46
+ ## 使用方法
47
+
48
+ ### 1. 设置环境变量
49
+
50
+ ```bash
51
+ # 设置环境
52
+ export ENVIRONMENT=prod
53
+
54
+ # 设置API密钥和base_url
55
+ export QA_LLM_MAIN_API_KEY=your-actual-api-key
56
+ export QA_LLM_MAIN_BASE_URL=https://your-api-endpoint.com
57
+
58
+ export REWRITE_LLM_BACKUP_API_KEY=your-gpt4o-api-key
59
+ export REWRITE_LLM_BACKUP_BASE_URL=https://api.openai.com/v1
60
+
61
+ # 设置Web搜索API密钥
62
+ export SERPER_API_KEY=your-serper-api-key
63
+
64
+ # ... 其他API配置
65
+ ```
66
+
67
+ ### 2. 在代码中使用
68
+
69
+ ```python
70
+ from config.global_storage import get_model_config
71
+
72
+ # 获取配置
73
+ config = get_model_config()
74
+
75
+ # 使用配置(API密钥和base_url来自环境变量,其他来自YAML)
76
+ model_name = config['qa-llm']['main']['model'] # 来自YAML
77
+ api_key = config['qa-llm']['main']['api_key'] # 来自环境变量
78
+ base_url = config['qa-llm']['main']['base_url'] # 来自环境变量
79
+ ```
80
+
81
+ ## 配置优先级
82
+
83
+ 1. **环境变量**:API密钥和base_url(最高优先级)
84
+ 2. **YAML文件**:其他所有配置(基础配置)
85
+
86
+ ## 优势
87
+
88
+ 1. **安全性**: 敏感信息(API密钥)从环境变量加载,不会出现在代码或配置文件中
89
+ 2. **灵活性**: 可以轻松切换不同环境的API端点
90
+ 3. **维护性**: 大部分配置仍在YAML文件中,便于管理和版本控制
91
+ 4. **部署友好**: 生产环境只需要设置环境变量即可
92
+
93
+ ## 注意事项
94
+
95
+ 1. 如果环境变量未设置,将使用YAML文件中的默认值
96
+ 2. 确保 `.env` 文件已添加到 `.gitignore` 中
97
+ 3. 生产环境建议使用环境变量而不是 `.env` 文件
98
+ 4. YAML文件中的API密钥和base_url值会被环境变量覆盖
99
+ 5. 对于Web搜索服务,如果未设置 `SERPER_API_KEY`,将使用代码中的默认密钥(不推荐用于生产环境)
python-services/Retrieve/bio_agent/rewrite_agent.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from typing import Any, List
3
+ from agents import Agent, OpenAIChatCompletionsModel, Runner
4
+ from agents.agent_output import AgentOutputSchemaBase
5
+ from openai import AsyncOpenAI
6
+ from config.global_storage import get_model_config
7
+ from utils.bio_logger import bio_logger as logger
8
+ from typing import List, Dict
9
+ from pydantic import BaseModel, Field,ConfigDict
10
+
11
+
12
+ class DateRange(BaseModel):
13
+ # model_config = ConfigDict(strict=True)
14
+ model_config = ConfigDict(strict=True, extra="forbid",json_schema_extra={"required": ["start", "end"]})
15
+ start: str = Field('', description="Start date in YYYY-MM-DD format")
16
+ end: str = Field('', description="End date in YYYY-MM-DD format")
17
+
18
+ class Journal(BaseModel):
19
+ # model_config = ConfigDict(strict=True)
20
+ model_config = ConfigDict(strict=True, extra="forbid",json_schema_extra={"required": ["name", "EISSN"]})
21
+ name: str = Field(..., description="Journal name")
22
+ EISSN: str = Field(..., description="Journal EISSN")
23
+
24
+ class AuthorFilter(BaseModel):
25
+ # model_config = ConfigDict(strict=True)
26
+ model_config = ConfigDict(strict=True, extra="forbid",json_schema_extra={"required": ["name", "first_author", "last_author"]})
27
+ name: str = Field("", description="Author name to filter")
28
+ first_author: bool = Field(False, description="Is first author?")
29
+ last_author: bool = Field(False, description="Is last author?")
30
+
31
+
32
+ class Filters(BaseModel):
33
+ # model_config = ConfigDict(strict=True)
34
+ model_config = ConfigDict(strict=True, extra="forbid",json_schema_extra={"required": ["date_range", "article_types", "languages", "subjects", "journals", "author"]})
35
+ date_range: DateRange = Field(...,default_factory=DateRange)
36
+ article_types: List[str] = Field(...,default_factory=list)
37
+ languages: List[str] = Field(["English"],)
38
+ subjects: List[str] = Field(...,default_factory=list)
39
+ journals: List[str] = Field([""])
40
+ author: AuthorFilter = Field(...,default_factory=AuthorFilter)
41
+
42
+ class RewriteJsonOutput(BaseModel):
43
+ model_config = ConfigDict(strict=True, extra="forbid",json_schema_extra={"required": ["category", "key_words", "key_journals", "queries", "filters"]})
44
+ category: str = Field(..., description="Query category")
45
+ key_words: List[str] = Field(...,default_factory=list)
46
+ key_journals: List[Journal] = Field(...,default_factory=list)
47
+ queries: List[str] = Field(...,default_factory=list)
48
+ filters: Filters = Field(...,default_factory=Filters)
49
+
50
+
51
+ class SimpleJsonOutput(BaseModel):
52
+ key_words: List[str] = Field(...,default_factory=list)
53
+
54
+
55
+ class RewriteJsonOutputSchema(AgentOutputSchemaBase):
56
+ def is_plain_text(self):
57
+ return False
58
+ def name(self):
59
+ return "RewriteJsonOutput"
60
+ def json_schema(self):
61
+ return RewriteJsonOutput.model_json_schema()
62
+ def is_strict_json_schema(self):
63
+ return True
64
+ def validate_json(self, json_data: Dict[str, Any]) -> bool:
65
+ try:
66
+ if isinstance(json_data, str):
67
+ json_data = json.loads(json_data)
68
+ return RewriteJsonOutput.model_validate(json_data)
69
+
70
+ except Exception as e:
71
+ logger.error(f"Validation error: {e}")
72
+ # return False
73
+ def parse(self, json_data: Dict[str, Any]) -> Any:
74
+ if isinstance(json_data, str):
75
+ json_data = json.loads(json_data)
76
+ return json_data
77
+
78
+ class RewriteAgent:
79
+ def __init__(self):
80
+ self.model_config = get_model_config()
81
+ self.agent_name = "rewrite agent"
82
+ self.selected_model = OpenAIChatCompletionsModel(
83
+ model=self.model_config["rewrite-llm"]["main"]["model"],
84
+ openai_client=AsyncOpenAI(
85
+ api_key=self.model_config["rewrite-llm"]["main"]["api_key"],
86
+ base_url=self.model_config["rewrite-llm"]["main"]["base_url"],
87
+ timeout=120.0,
88
+ max_retries=2,
89
+ ),
90
+ )
91
+
92
+ # self.openai_client = AsyncOpenAI(
93
+ # api_key=self.model_config["llm"]["api_key"],
94
+ # base_url=self.model_config["llm"]["base_url"],
95
+ # )
96
+
97
+
98
+
99
+ async def rewrite_query(self, query: str,INSTRUCTIONS: str,simple_version=False) -> List[str]:
100
+ try:
101
+ logger.info(f"Rewriting query with main configuration.")
102
+ if not simple_version:
103
+ rewrite_agent = Agent(
104
+ name=self.agent_name,
105
+ instructions=' Your task is to rewrite the query into a structured JSON format. Please do not answer the question.',
106
+ model=self.selected_model,
107
+ output_type=RewriteJsonOutputSchema(), # Use the Pydantic model for structured output
108
+ )
109
+ else:
110
+ rewrite_agent = Agent(
111
+ name=self.agent_name,
112
+ instructions=' Your task is to rewrite the query into a structured JSON format. Please do not answer the question.',
113
+ model=self.selected_model,
114
+ output_type=SimpleJsonOutput, # Use the Pydantic model for structured output
115
+ )
116
+ result = await Runner.run(rewrite_agent, input=INSTRUCTIONS + 'Here is the question: '+query)
117
+ # completion = await self.openai_client.chat.completions.create(
118
+ # model=self.model_config["llm"]["model"],
119
+ # messages=[
120
+ # # {
121
+ # # "role": "system",
122
+ # # "content": "You are a helpful assistant.",
123
+ # # },
124
+ # {
125
+ # "role": "user",
126
+ # "content": INSTRUCTIONS +' Here is the question: ' + query,
127
+ # },
128
+ # ],
129
+ # temperature=self.model_config["llm"]["temperature"],
130
+ # # max_tokens=self.model_config["llm"]["max_tokens"],
131
+ # )
132
+ try:
133
+ # query_result = self.parse_json_output(completion.choices[0].message.content)
134
+ query_result = self.parse_json_output(result.final_output.model_dump_json())
135
+ # query_result = self.parse_json_output(completion.model_dump_json())
136
+ except Exception as e:
137
+ # print(completion.choices[0].message.content)
138
+ logger.error(f"Failed to parse JSON output: {e}")
139
+ return query_result
140
+ except Exception as main_error:
141
+ self.selected_model_backup = OpenAIChatCompletionsModel(
142
+ model=self.model_config["rewrite-llm"]["backup"]["model"],
143
+ openai_client=AsyncOpenAI(
144
+ api_key=self.model_config["rewrite-llm"]["backup"]["api_key"],
145
+ base_url=self.model_config["rewrite-llm"]["backup"]["base_url"],
146
+ timeout=120.0,
147
+ max_retries=2,
148
+ ),
149
+ )
150
+ logger.error(f"Error with main model: {main_error}", exc_info=main_error)
151
+ logger.info("Trying backup model for rewriting query.")
152
+ if not simple_version:
153
+ rewrite_agent = Agent(
154
+ name=self.agent_name,
155
+ instructions=' Your task is to rewrite the query into a structured JSON format. Please do not answer the question.',
156
+ model=self.selected_model_backup,
157
+ output_type=RewriteJsonOutputSchema(), # Use the Pydantic model for structured output
158
+ )
159
+ else:
160
+ rewrite_agent = Agent(
161
+ name=self.agent_name,
162
+ instructions=' Your task is to rewrite the query into a structured JSON format. Please do not answer the question.',
163
+ model=self.selected_model_backup,
164
+ output_type=SimpleJsonOutput, # Use the Pydantic model for structured output
165
+ )
166
+ result = await Runner.run(rewrite_agent, input=INSTRUCTIONS + 'Here is the question: '+query)
167
+ # completion = await self.openai_client.chat.completions.create(
168
+ # model=self.model_config["llm"]["model"],
169
+ # messages=[
170
+ # # {
171
+ # # "role": "system",
172
+ # # "content": "You are a helpful assistant.",
173
+ # # },
174
+ # {
175
+ # "role": "user",
176
+ # "content": INSTRUCTIONS +' Here is the question: ' + query,
177
+ # },
178
+ # ],
179
+ # temperature=self.model_config["llm"]["temperature"],
180
+ # # max_tokens=self.model_config["llm"]["max_tokens"],
181
+ # )
182
+ try:
183
+ # query_result = self.parse_json_output(completion.choices[0].message.content)
184
+ query_result = self.parse_json_output(result.final_output.model_dump_json())
185
+ # query_result = self.parse_json_output(completion.model_dump_json())
186
+ except Exception as e:
187
+ # print(completion.choices[0].message.content)
188
+ logger.error(f"Failed to parse JSON output: {e}")
189
+ return query_result
190
+
191
+ def parse_json_output(self, output: str) -> Any:
192
+ """Take a string output and parse it as JSON"""
193
+ # First try to load the string as JSON
194
+ try:
195
+ return json.loads(output)
196
+ except json.JSONDecodeError as e:
197
+ logger.info(f"Output is not valid JSON: {output}")
198
+ logger.error(f"Failed to parse output as direct JSON: {e}")
199
+
200
+ # If that fails, assume that the output is in a code block - remove the code block markers and try again
201
+ parsed_output = output
202
+ if "```" in parsed_output:
203
+ try:
204
+ parts = parsed_output.split("```")
205
+ if len(parts) >= 3:
206
+ parsed_output = parts[1]
207
+ if parsed_output.startswith("json") or parsed_output.startswith(
208
+ "JSON"
209
+ ):
210
+ parsed_output = parsed_output[4:].strip()
211
+ return json.loads(parsed_output)
212
+ except (IndexError, json.JSONDecodeError) as e:
213
+ logger.error(f"Failed to parse output from code block: {e}")
214
+
215
+ # As a last attempt, try to manually find the JSON object in the output and parse it
216
+ parsed_output = self.find_json_in_string(output)
217
+ if parsed_output:
218
+ try:
219
+ return json.loads(parsed_output)
220
+ except json.JSONDecodeError as e:
221
+ logger.error(f"Failed to parse extracted JSON: {e}")
222
+ logger.error(f"Extracted JSON: {parsed_output}")
223
+ return {"queries": []}
224
+ else:
225
+ logger.error("No valid JSON found in the output:{output}")
226
+ # If all fails, raise an error
227
+ return {"queries": []}
228
+
229
+ def find_json_in_string(self, string: str) -> str:
230
+ """
231
+ Method to extract all text in the left-most brace that appears in a string.
232
+ Used to extract JSON from a string (note that this function does not validate the JSON).
233
+
234
+ Example:
235
+ string = "bla bla bla {this is {some} text{{}and it's sneaky}} because {it's} confusing"
236
+ output = "{this is {some} text{{}and it's sneaky}}"
237
+ """
238
+ stack = 0
239
+ start_index = None
240
+
241
+ for i, c in enumerate(string):
242
+ if c == "{":
243
+ if stack == 0:
244
+ start_index = i # Start index of the first '{'
245
+ stack += 1 # Push to stack
246
+ elif c == "}":
247
+ stack -= 1 # Pop stack
248
+ if stack == 0:
249
+ # Return the substring from the start of the first '{' to the current '}'
250
+ return (
251
+ string[start_index : i + 1] if start_index is not None else ""
252
+ )
253
+
254
+ # If no complete set of braces is found, return an empty string
255
+ return ""
python-services/Retrieve/bio_requests/chat_request.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field
2
+
3
+
4
+ class ChatRequest(BaseModel):
5
+ query: str = Field(default="", description="Search query")
6
+
7
+ is_web: bool = Field(
8
+ default=False, description="Whether to use web search, default is False"
9
+ )
10
+
11
+ is_pubmed: bool = Field(
12
+ default=True, description="Whether to use pubmed search, default is True"
13
+ )
14
+
15
+ language: str = Field(
16
+ default="en", description="Response language (zh/en), default is English"
17
+ )
python-services/Retrieve/bio_requests/rag_request.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ RAG request class, used to encapsulate the parameters of RAG requests
3
+ """
4
+
5
+ from typing import List, Optional
6
+ from pydantic import BaseModel, Field
7
+
8
+
9
+ class RagRequest(BaseModel):
10
+ """
11
+ RAG request class, used to encapsulate the parameters of RAG requests
12
+ """
13
+
14
+ query: str = Field(default="", description="Search query")
15
+
16
+ top_k: int = Field(default=5, ge=1, description="Number of results to return")
17
+
18
+ search_type: Optional[str] = Field(
19
+ default="keyword",
20
+ description="Type of search to perform (keyword or advanced), please note that if data_source is not ['pubmed'], this field will be ignored",
21
+ )
22
+
23
+ is_rewrite: Optional[bool] = Field(
24
+ default=True, description="Whether the query is a subquery of a larger query"
25
+ )
26
+
27
+ data_source: List[str] = Field(
28
+ default=["pubmed"],
29
+ description="Data source to search in (e.g., pubmed, web)",
30
+ )
31
+
32
+ pubmed_topk: int = Field(
33
+ default=30,
34
+ description="Number of results to return from one specific pubmed search, only used when is_rewrite is True",
35
+ )
36
+
37
+ is_rerank: Optional[bool] = Field(
38
+ default=True,
39
+ description="Whether to use reranker to rerank the results, only used when data_source is ['pubmed']",
40
+ )
41
+
42
+ language: Optional[str] = Field(
43
+ default="en", description="Response language (zh/en), default is English"
44
+ )
python-services/Retrieve/config/2023JCR(完整).xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:346311258d5c7843558c36d874a95a1603ff9f38c5ec32c9b58e93f41f71b023
3
+ size 1922687
python-services/Retrieve/config/app_config_dev.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ qa-llm:
2
+ main:
3
+ model: deepseek-r1
4
+ api_key: sk-sk-*************
5
+ base_url: https://dashscope.aliyuncs.com/compatible-mode/v1
6
+ max_tokens: 1024
7
+ temperature: 0.7
8
+ backup:
9
+ model: qwen-plus-latest
10
+ api_key: sk-sk-*************
11
+ base_url: https://dashscope.aliyuncs.com/compatible-mode/v1
12
+ max_tokens: 1024
13
+ temperature: 0.7
14
+
15
+ rewrite-llm:
16
+ backup:
17
+ model: gpt-4o
18
+ api_key: sk-**********
19
+ base_url: https://openai.sohoyo.io/v1
20
+ max_tokens: 1024
21
+ temperature: 0.7
22
+ main:
23
+ model: qwen-plus-latest
24
+ api_key: sk-sk-*************
25
+ base_url: https://dashscope.aliyuncs.com/compatible-mode/v1
26
+ max_tokens: 1024
27
+ temperature: 0.7
28
+
29
+ recall:
30
+ pubmed_topk: 30
31
+ es_topk: 30
32
+
33
+ qa-topk:
34
+ personal_vector: 40
35
+ pubmed: 10
36
+ web: 5
37
+
38
+ qa-prompt-max-token:
39
+ max_tokens: 120000
40
+
41
+
42
+ chat:
43
+ rag_prompt: |
44
+ # The following contents are the search results related to the user's message:
45
+ {search_results}
46
+ In the search results I provide to you, each result is formatted as [document X begin]...[document X end], where X represents the numerical index of each article.
47
+ When responding, please keep the following points in mind:
48
+ - Today is {cur_date}.
49
+ - Not all content in the search results is closely related to the user's question. You need to evaluate and filter the search results based on the question.
50
+ - If all the search results are irrelevant, please answer the question by yourself professionally and concisely.
51
+ - The search results may focus only on a few points, use the information it provided, but do not favor those points in your answer, reason and answer by yourself all-sidedly with full consideration.
52
+ - For listing-type questions (e.g., listing all flight information), try to limit the answer to 10 key points and inform the user that they can refer to the search sources for complete information. Prioritize providing the most complete and relevant items in the list. Avoid mentioning content not provided in the search results unless necessary.
53
+ - If the response is lengthy, structure it well and summarize it in paragraphs. If a point-by-point format is needed, try to limit it to 5 points and merge related content.
54
+ - For objective Q&A, if the answer is very brief, you may add one or two related sentences to enrich the content.
55
+ - Choose an appropriate and visually appealing format for your response based on the user's requirements and the content of the answer, ensuring strong readability.
56
+ - Your answer should synthesize information from multiple relevant documents.
57
+ - Unless the user requests otherwise, your response should be in the same language as the user's question.
58
+ # The user's message is:
59
+ {question}
60
+ - The content should be concise and direct, and you MUST include proper citations using ONLY "[bdd-rag-citation:X]" format reference marks to indicate the sources of your information. Do NOT use any other citation formats such as [document X], [Author, Year], or parenthetical bibliographical references.
python-services/Retrieve/config/global_storage.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """全局配置存储模块,提供配置文件的加载和缓存功能,API密钥和base_url从环境变量加载。"""
2
+
3
+ import os
4
+ from typing import Any, Dict, Optional
5
+
6
+ import yaml
7
+
8
+
9
+ class ConfigManager:
10
+ """配置管理器,使用单例模式缓存配置,API密钥和base_url从环境变量加载。"""
11
+
12
+ _instance = None
13
+ _config: Optional[Dict[str, Any]] = None
14
+
15
+ def __new__(cls):
16
+ if cls._instance is None:
17
+ cls._instance = super().__new__(cls)
18
+ return cls._instance
19
+
20
+ def get_config(self) -> Dict[str, Any]:
21
+ """获取配置,如果未加载则自动加载。
22
+
23
+ Returns:
24
+ 包含所有配置信息的字典
25
+ """
26
+ if self._config is None:
27
+ self._config = self._load_config()
28
+ return self._config
29
+
30
+ def _get_environment(self) -> str:
31
+ """获取当前环境类型。
32
+
33
+ Returns:
34
+ 环境类型:'prod' 或 'dev'
35
+ """
36
+ return os.getenv("ENVIRONMENT", "dev").lower()
37
+
38
+ def _get_config_path(self) -> str:
39
+ """根据环境获取配置文件路径。
40
+
41
+ Returns:
42
+ 配置文件路径
43
+ """
44
+ env = self._get_environment()
45
+ if env == "prod":
46
+ return "config/app_config_prod.yaml"
47
+
48
+ return "config/app_config_dev.yaml"
49
+
50
+ def _load_config(self) -> Dict[str, Any]:
51
+ """加载配置文件,并覆盖API密钥和base_url为环境变量值。
52
+
53
+ Returns:
54
+ 从YAML文件加载的配置字典,API密钥和base_url从环境变量覆盖
55
+ """
56
+ config_path = self._get_config_path()
57
+ try:
58
+ with open(config_path, "r", encoding="utf-8") as file:
59
+ config = yaml.safe_load(file)
60
+ # 添加环境信息到配置中
61
+ config["environment"] = self._get_environment()
62
+
63
+ # 从环境变量覆盖API密钥和base_url
64
+ self._override_api_configs(config)
65
+
66
+ return config
67
+ except FileNotFoundError as exc:
68
+ raise FileNotFoundError(f"配置文件未找到: {config_path}") from exc
69
+ except yaml.YAMLError as exc:
70
+ raise ValueError(f"配置文件格式错误: {exc}") from exc
71
+
72
+ def _override_api_configs(self, config: Dict[str, Any]) -> None:
73
+ """从环境变量覆盖API密钥和base_url配置。
74
+
75
+ Args:
76
+ config: 配置字典
77
+ """
78
+ # QA LLM 主模型
79
+ if "qa-llm" in config and "main" in config["qa-llm"]:
80
+ main_config = config["qa-llm"]["main"]
81
+ if os.getenv("QA_LLM_MAIN_API_KEY"):
82
+ main_config["api_key"] = os.getenv("QA_LLM_MAIN_API_KEY")
83
+ if os.getenv("QA_LLM_MAIN_BASE_URL"):
84
+ main_config["base_url"] = os.getenv("QA_LLM_MAIN_BASE_URL")
85
+
86
+ # QA LLM 备用模型
87
+ if "qa-llm" in config and "backup" in config["qa-llm"]:
88
+ backup_config = config["qa-llm"]["backup"]
89
+ if os.getenv("QA_LLM_BACKUP_API_KEY"):
90
+ backup_config["api_key"] = os.getenv("QA_LLM_BACKUP_API_KEY")
91
+ if os.getenv("QA_LLM_BACKUP_BASE_URL"):
92
+ backup_config["base_url"] = os.getenv("QA_LLM_BACKUP_BASE_URL")
93
+
94
+ # Rewrite LLM 备用模型 (GPT-4o)
95
+ if "rewrite-llm" in config and "backup" in config["rewrite-llm"]:
96
+ backup_config = config["rewrite-llm"]["backup"]
97
+ if os.getenv("REWRITE_LLM_BACKUP_API_KEY"):
98
+ backup_config["api_key"] = os.getenv("REWRITE_LLM_BACKUP_API_KEY")
99
+ if os.getenv("REWRITE_LLM_BACKUP_BASE_URL"):
100
+ backup_config["base_url"] = os.getenv("REWRITE_LLM_BACKUP_BASE_URL")
101
+
102
+ # Rewrite LLM 主模型
103
+ if "rewrite-llm" in config and "main" in config["rewrite-llm"]:
104
+ main_config = config["rewrite-llm"]["main"]
105
+ if os.getenv("REWRITE_LLM_MAIN_API_KEY"):
106
+ main_config["api_key"] = os.getenv("REWRITE_LLM_MAIN_API_KEY")
107
+ if os.getenv("REWRITE_LLM_MAIN_BASE_URL"):
108
+ main_config["base_url"] = os.getenv("REWRITE_LLM_MAIN_BASE_URL")
109
+
110
+
111
+ # 全局配置管理器实例
112
+ _config_manager = ConfigManager()
113
+
114
+
115
+ def get_model_config() -> Dict[str, Any]:
116
+ """获取模型配置。
117
+
118
+ Returns:
119
+ 包含所有配置信息的字典
120
+ """
121
+ return _config_manager.get_config()
python-services/Retrieve/dto/bio_document.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from dataclasses import dataclass, field
from typing import Optional
from utils.snowflake_id import snowflake_id_str


@dataclass
class BaseBioDocument:
    """Base class for biomedical documents.

    Holds the fields shared by every search type.
    """

    # Unique id generated per document (snowflake id rendered as a string).
    bio_id: Optional[str] = field(default_factory=snowflake_id_str)
    title: Optional[str] = None
    text: Optional[str] = None
    source: Optional[str] = None
    source_id: Optional[str] = None


@dataclass
class PubMedDocument(BaseBioDocument):
    """PubMed academic-literature document with publication-specific fields."""

    abstract: Optional[str] = None
    authors: Optional[str] = None
    doi: Optional[str] = None
    journal: Optional[str] = None
    pub_date: Optional[str] = None
    if_score: Optional[float] = None
    url: Optional[str] = None

    def __post_init__(self):
        # Default the source tag when the caller did not provide one.
        if self.source is None:
            self.source = "pubmed"


@dataclass
class PersonalDocument(BaseBioDocument):
    """Personal vector-search document with user/file-specific fields."""

    if_score: Optional[float] = None
    doc_id: Optional[str] = None
    index: Optional[int] = 0
    user_id: Optional[str] = None
    file_name: Optional[str] = None

    def __post_init__(self):
        if self.source is None:
            self.source = "personal_vector"


@dataclass
class WebDocument(BaseBioDocument):
    """Web-search document with page-specific fields."""

    url: Optional[str] = None
    description: Optional[str] = None

    def __post_init__(self):
        if self.source is None:
            self.source = "web"


# Kept for backward compatibility; prefer the specialised document types above.
@dataclass
class BioDocument(BaseBioDocument):
    """Generic biomedical document (backward compatible).

    Contains every possible field; the dedicated document types are preferred.
    """

    abstract: Optional[str] = None
    authors: Optional[str] = None
    doi: Optional[str] = None
    journal: Optional[str] = None
    pub_date: Optional[str] = None
    if_score: Optional[float] = None
    url: Optional[str] = None
    doc_id: Optional[str] = None


def create_bio_document(source: str, **kwargs) -> BaseBioDocument:
    """Create the document type matching ``source``.

    Args:
        source: Document origin type ("pubmed", "personal_vector", "web").
        **kwargs: Document fields.

    Returns:
        The corresponding document instance.
    """
    # Forward the source value as a field. Previously the generic fallback
    # branch silently dropped it (BioDocument has no __post_init__), leaving
    # source=None; the specialised types accept it as a regular field, so
    # setting it here is behaviour-equivalent for them.
    kwargs.setdefault("source", source)
    if source == "pubmed":
        return PubMedDocument(**kwargs)
    if source == "personal_vector":
        return PersonalDocument(**kwargs)
    if source == "web":
        return WebDocument(**kwargs)
    # Fall back to the generic document for unknown sources.
    return BioDocument(**kwargs)
python-services/Retrieve/main.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """生物医学RAG服务主程序入口。"""
2
+
3
+ import importlib
4
+ import pkgutil
5
+ import time
6
+ import os
7
+ from dotenv import load_dotenv
8
+
9
+ # 加载环境变量
10
+ load_dotenv()
11
+
12
+ import uvicorn
13
+ from asgi_correlation_id import CorrelationIdMiddleware, correlation_id
14
+ from fastapi import FastAPI, Request
15
+ from fastapi_mcp import FastApiMCP
16
+ from fastapi.middleware.cors import CORSMiddleware
17
+
18
+ from routers import sensor, mcp_sensor
19
+ from utils.bio_logger import bio_logger as logger
20
+
21
+ # 调试:验证环境变量是否加载
22
+ logger.info(f"SERPER_API_KEY loaded: {'Yes' if os.getenv('SERPER_API_KEY') else 'No'}")
23
+
24
+
25
+ app = FastAPI(
26
+ docs_url=None, # 关闭 Swagger UI 文档
27
+ redoc_url=None, # 关闭 ReDoc 文档
28
+ openapi_url=None, # 关闭 OpenAPI 规范文件
29
+ debug=False, # 关闭调试模式
30
+ )
31
+
32
+ # 第一个添加的中间件
33
+ app.add_middleware(CorrelationIdMiddleware)
34
+ # 配置CORS
35
+ app.add_middleware(
36
+ CORSMiddleware,
37
+ allow_origins=["*"],
38
+ allow_credentials=True,
39
+ allow_methods=["*"],
40
+ allow_headers=["*"],
41
+ )
42
+
43
+ # 路由
44
+ app.include_router(sensor.router)
45
+ app.include_router(mcp_sensor.router) # 包含 MCP 路由
46
+
47
+
48
+ @app.middleware("http")
49
+ async def add_process_time_header(request: Request, call_next):
50
+ """HTTP中间件,记录请求处理时间和状态。"""
51
+ start_time = time.time()
52
+
53
+ logger.info(f"Request started | URL: {request.url}")
54
+
55
+ response = await call_next(request)
56
+ process_time = time.time() - start_time
57
+
58
+ logger.info(
59
+ f"Request completed | "
60
+ f"Status: {response.status_code} | "
61
+ f"Time: {process_time:.2f}s"
62
+ )
63
+
64
+ return response
65
+
66
+
67
+ def dynamic_import_subclasses(parent_dir: str) -> None:
68
+ """动态导入指定目录下的所有Python模块。
69
+
70
+ Args:
71
+ parent_dir: 要导入的目录路径
72
+ """
73
+ for _, module_name, _ in pkgutil.iter_modules([parent_dir]):
74
+ module = importlib.import_module(f"{parent_dir}.{module_name}")
75
+ logger.info(f"Imported: {module.__name__}")
76
+
77
+
78
+ # Add MCP server to the FastAPI app
79
+ mcp = FastApiMCP(
80
+ app,
81
+ name="bio qa mcp",
82
+ include_operations=["bio_qa_stream_chat"]
83
+ )
84
+
85
+ # Mount the MCP server to the FastAPI app
86
+ # 挂载SSE端点到 /mcp/sse
87
+ mcp.mount_sse()
88
+
89
+
90
+ if __name__ == "__main__":
91
+ logger.info("Starting Bio RAG Server...")
92
+ dynamic_import_subclasses("search_service")
93
+ uvicorn.run(app, host="0.0.0.0", port=9487)
python-services/Retrieve/readme.md ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Bio RAG Server
2
+
3
+ 一个基于FastAPI的生物医学检索增强生成(RAG)服务,支持PubMed文献检索、Web搜索和向量数据库查询,提供智能问答和文档检索功能。
4
+
5
+ ## 🚀 功能特性
6
+
7
+ - **多源数据检索**: 支持PubMed、Web搜索、个人向量数据库等多种数据源
8
+ - **智能问答**: 基于大语言模型的RAG问答,支持流式响应
9
+ - **查询重写**: 智能查询拆分和重写,提高检索精度
10
+ - **主备切换**: 支持LLM服务的主备配置,自动故障转移
11
+ - **流式响应**: 实时流式聊天响应,提升用户体验
12
+ - **国际化支持**: 支持中英文切换,包含87个国际化消息,涵盖8种消息类型
13
+ - **日志追踪**: 完整的请求追踪和日志记录
14
+ - **CORS支持**: 跨域请求支持,便于前端集成
15
+
16
+ ## 🏗️ 系统架构
17
+
18
+ ```
19
+ bio_rag_server/
20
+ ├── bio_agent/ # AI代理相关
21
+ ├── bio_requests/ # 请求模型定义
22
+ ├── config/ # 配置文件
23
+ ├── dto/ # 数据传输对象
24
+ ├── routers/ # API路由
25
+ ├── search_service/ # 搜索服务
26
+ ├── service/ # 核心业务服务
27
+ ├── utils/ # 工具类
28
+ └── test/ # 测试文件
29
+ ```
30
+
31
+ ## 📋 环境要求
32
+
33
+ - Python 3.8+
34
+ - OpenAI API 或兼容的LLM服务
35
+
36
+ ## 🛠️ 安装部署
37
+
38
+ ### 1. 克隆项目
39
+
40
+ ```bash
41
+ git clone <repository-url>
42
+ cd bio_rag_server-1
43
+ ```
44
+
45
+ ### 2. 安装依赖
46
+
47
+ ```bash
48
+ pip install -r requirements.txt
49
+ ```
50
+
51
+ ### 3. 配置环境
52
+
53
+ 复制并修改配置文件 `config/app_config_dev.yaml`(生产环境使用 `config/app_config_prod.yaml`):
54
+
55
+ ```yaml
56
+
57
+ llm:
58
+ model: gpt-4o
59
+ api_key: your-openai-api-key
60
+ base_url: https://api.openai.com/v1
61
+ max_tokens: 1024
62
+ temperature: 0.7
63
+
64
+ qa-llm:
65
+ main:
66
+ model: deepseek-r1
67
+ api_key: your-main-api-key
68
+ base_url: https://your-main-endpoint/v1
69
+ max_tokens: 1024
70
+ temperature: 0.7
71
+ backup:
72
+ model: qwen-plus-latest
73
+ api_key: your-backup-api-key
74
+ base_url: https://your-backup-endpoint/v1
75
+ max_tokens: 1024
76
+ temperature: 0.7
77
+ ```
78
+
79
+ ### 4. 启动服务
80
+
81
+ ```bash
82
+ python main.py
83
+ ```
84
+
85
+ 或使用Docker:
86
+
87
+ ```bash
88
+ docker build -t bio-rag-server .
89
+ docker run -p 9487:9487 bio-rag-server
90
+ ```
91
+
92
+ 服务将在 `http://localhost:9487` 启动。
93
+
94
+ ## 📚 API 文档
95
+
96
+ ### 1. 文档检索 API
97
+
98
+ **端点**: `POST /retrieve`
99
+
100
+ **请求体**:
101
+ ```json
102
+ {
103
+ "query": "cancer treatment",
104
+ "top_k": 5,
105
+ "search_type": "keyword",
106
+ "is_rewrite": true,
107
+ "data_source": ["pubmed"],
108
+ "user_id": "user123",
109
+ "pubmed_topk": 30
110
+ }
111
+ ```
112
+
113
+ **响应**:
114
+ ```json
115
+ [
116
+ {
117
+ "title": "Cancer Treatment Advances",
118
+ "abstract": "Recent advances in cancer treatment...",
119
+ "url": "https://pubmed.ncbi.nlm.nih.gov/...",
120
+ "score": 0.95
121
+ }
122
+ ]
123
+ ```
124
+
125
+ ### 2. 流式聊天 API
126
+
127
+ **端点**: `POST /stream-chat`
128
+
129
+ **请求体**:
130
+ ```json
131
+ {
132
+ "query": "What are the latest treatments for breast cancer?",
133
+ "is_web": true,
134
+ "is_pubmed": true,
135
+ "language": "en" // 可选:响应语言 (zh/en)
136
+ }
137
+ ```
138
+
139
+ **响应**: Server-Sent Events (SSE) 流式响应
140
+
141
+ ### 3. 国际化支持
142
+
143
+ 所有API接口都支持国际化,通过 `language` 参数指定响应语言:
144
+
145
+ - `en` (默认): 英文响应
146
+ - `zh`: 中文响应
147
+
148
+ **响应格式示例**:
149
+ ```json
150
+ {
151
+ "success": true,
152
+ "data": [...],
153
+ "message": "搜索成功", // 或 "Search successful"
154
+ "language": "zh"
155
+ }
156
+ ```
157
+
158
+ **错误响应格式**:
159
+ ```json
160
+ {
161
+ "success": false,
162
+ "error": {
163
+ "code": 500,
164
+ "message": "搜索失败", // 或 "Search failed"
165
+ "language": "zh",
166
+ "details": "具体错误信息"
167
+ }
168
+ }
169
+ ```
170
+
171
+ ## 🔧 配置说明
172
+
173
+ ### 数据源配置
174
+
175
+ - **pubmed**: PubMed文献数据库
176
+ - **web**: Web搜索
177
+
178
+
179
+ ### LLM配置
180
+
181
+ 支持主备配置,当主配置失败时自动切换到备用配置:
182
+
183
+ ```yaml
184
+ qa-llm:
185
+ main:
186
+ model: deepseek-r1
187
+ api_key: main-api-key
188
+ base_url: main-endpoint
189
+ backup:
190
+ model: qwen-plus-latest
191
+ api_key: backup-api-key
192
+ base_url: backup-endpoint
193
+ ```
194
+
195
+ ## 🧪 测试
196
+
197
+ ### 基本功能测试
198
+
199
+ 运行测试用例:
200
+
201
+ ```bash
202
+ cd test
203
+ python client.py
204
+ ```
205
+
206
+ ### 国际化功能测试
207
+
208
+ ```bash
209
+ # 基本国际化功能测试
210
+ python test/test_i18n.py
211
+
212
+ # Label国际化功能测试
213
+ python test/test_label_i18n.py
214
+
215
+ # 新的消息文件结构测试
216
+ python test/test_i18n_messages.py
217
+
218
+ # 运行客户端测试示例
219
+ python test/client_test.py
220
+ ```
221
+
222
+ ### 使用示例
223
+
224
+ ```python
225
+ import requests
226
+
227
+ # 中文检索
228
+ response_zh = requests.post("http://localhost:9487/retrieve", json={
229
+ "query": "人工智能",
230
+ "language": "zh"
231
+ })
232
+
233
+ # 英文检索
234
+ response_en = requests.post("http://localhost:9487/retrieve", json={
235
+ "query": "artificial intelligence",
236
+ "language": "en"
237
+ })
238
+ ```
239
+
240
+ ## 📊 监控和日志
241
+
242
+ - 日志文件位置: `logs/bio_rag_YYYY-MM-DD.log`
243
+ - 请求追踪: 每个请求都有唯一的correlation_id
244
+ - 性能监控: 自动记录请求处理时间
245
+
246
+ ## 🔒 安全特性
247
+
248
+ - API密钥配置化管理
249
+ - 请求日志记录
250
+ - CORS配置
251
+ - 错误处理和安全异常
252
+
253
+ ## 🤝 贡献指南
254
+
255
+ 1. Fork 项目
256
+ 2. 创建功能分支 (`git checkout -b feature/AmazingFeature`)
257
+ 3. 提交更改 (`git commit -m 'Add some AmazingFeature'`)
258
+ 4. 推送到分支 (`git push origin feature/AmazingFeature`)
259
+ 5. 打开 Pull Request
260
+
261
+ ## 📄 许可证
262
+
263
+ 本项目采用 MIT 许可证 - 查看 [LICENSE](LICENSE) 文件了解详情。
264
+
265
+ ## 🆘 支持
266
+
267
+ 如有问题或建议,请:
268
+
269
+ 1. 查看 [Issues](../../issues) 页面
270
+ 2. 创建新的 Issue
271
+ 3. 联系项目维护者
272
+
273
+ ## 🗺️ 路线图
274
+
275
+ - [ ] 支持更多数据源
276
+ - [ ] 增加用户认证和权限管理
277
+ - [ ] 优化向量搜索性能
278
+ - [ ] 添加更多LLM模型支持
279
+ - [ ] 实现缓存机制
280
+ - [ ] 增加API限流功能
281
+
282
+ ---
283
+
284
+ **注意**: 请确保在使用前正确配置所有必要的API密钥和服务端点。
python-services/Retrieve/requirements.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ asgi_correlation_id==4.3.4
2
+ fastapi==0.115.12
3
+ uvicorn==0.34.0
4
+ loguru==0.7.3
5
+ pyyaml==6.0.2
6
+ httpx==0.28.1
7
+ requests==2.32.3
8
+ biopython==1.85
9
+ openpyxl==3.1.5
10
+ openai==1.86.0
11
+ openai-agents==0.0.17
12
+ pandas==2.2.3
13
+ pymilvus==2.5.8
14
+ crawl4ai==0.7.0
15
+ aiohttp==3.11.18
16
+ beautifulsoup4==4.12.3
17
+ tiktoken==0.9.0
18
+ fastapi-mcp==0.4.0
19
+ python-dotenv==1.0.1
python-services/Retrieve/routers/mcp_sensor.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from asgi_correlation_id import correlation_id
from fastapi import APIRouter
from fastapi.responses import StreamingResponse, JSONResponse

from utils.bio_logger import bio_logger as logger
from utils.i18n_util import (
    get_language,
    create_error_response,
)
from utils.i18n_context import with_language

from bio_requests.chat_request import ChatRequest

from service.chat import ChatService

router = APIRouter(prefix="/mcp", tags=["MCP"])


@router.post("/bio_qa", response_model=None, operation_id="bio_qa_stream_chat")
async def bio_qa(query: str, lang: str = "en"):
    """
    Biomedical Q&A endpoint providing the RAG answering service.
    query: the question text
    lang: language setting, "zh" for Chinese, "en" for English
    """

    logger.info(f"{correlation_id.get()} Bio QA for {query}")
    chat_request = ChatRequest(query=query, language=lang)
    # Resolve the language setting.
    language = get_language(chat_request.language)

    # Apply the language for the duration of the request via context manager.
    with with_language(language):
        try:
            chat_service = ChatService()
            return StreamingResponse(
                chat_service.generate_stream(chat_request),
                media_type="text/event-stream",
                headers={
                    "Connection": "keep-alive",
                    "Cache-Control": "no-cache",
                },
            )
        # NOTE(review): this only catches errors raised while constructing the
        # response; exceptions raised inside the streaming generator happen
        # after the response has started and are not handled here.
        except Exception as e:
            logger.error(f"{correlation_id.get()} Stream chat error: {e}")
            error_response = create_error_response(
                error_key="service_unavailable",
                details=str(e),
                error_code=500,
            )
            return JSONResponse(content=error_response, status_code=500)


# Endpoint required by the MCP protocol.
@router.get("/tools")
async def list_tools():
    """List the available MCP tools (static schema for bio_qa_stream_chat)."""
    return {
        "tools": [
            {
                "name": "bio_qa_stream_chat",
                "description": "生物医学问答服务,提供RAG问答功能",
                "inputSchema": {
                    "type": "object",
                    "properties": {
                        "query": {
                            "type": "string",
                            "description": "问题内容"
                        },
                        "lang": {
                            "type": "string",
                            "description": "语言设置,zh代表中文,en代表英文",
                            "enum": ["zh", "en"],
                            "default": "en"
                        }
                    },
                    "required": ["query"]
                }
            }
        ]
    }
python-services/Retrieve/routers/sensor.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """API路由模块"""
2
+
3
+ from asgi_correlation_id import correlation_id
4
+ from fastapi import APIRouter
5
+ from fastapi.responses import StreamingResponse, JSONResponse
6
+
7
+ from utils.bio_logger import bio_logger as logger
8
+ from utils.i18n_util import (
9
+ get_language,
10
+ create_success_response,
11
+ create_error_response,
12
+ )
13
+ from utils.i18n_context import with_language
14
+ from bio_requests.rag_request import RagRequest
15
+ from bio_requests.chat_request import ChatRequest
16
+ from service.rag import RagService
17
+ from service.chat import ChatService
18
+
19
+ router = APIRouter()
20
+
21
+
22
+ @router.post("/retrieve")
23
+ async def search(rag_request: RagRequest) -> JSONResponse:
24
+ """文档检索接口,支持多源数据检索。"""
25
+
26
+ logger.info(f"{correlation_id.get()} Searching for {rag_request}")
27
+
28
+ # 解析语言设置
29
+ language = get_language(rag_request.language)
30
+
31
+ # 使用上下文管理器设置语言
32
+ with with_language(language):
33
+ try:
34
+ rag_assistant = RagService()
35
+ documents = await rag_assistant.multi_query(rag_request)
36
+
37
+ logger.info(f"{correlation_id.get()} Found {len(documents)} documents")
38
+ results = [document.__dict__ for document in documents]
39
+
40
+ # 返回国际化响应
41
+ response_data = create_success_response(
42
+ data=results, message_key="search_success"
43
+ )
44
+
45
+ return JSONResponse(content=response_data)
46
+
47
+ except Exception as e:
48
+ logger.error(f"{correlation_id.get()} Search error: {e}")
49
+ error_response = create_error_response(
50
+ error_key="search_failed", details=str(e), error_code=500
51
+ )
52
+ return JSONResponse(content=error_response, status_code=500)
53
+
54
+
55
+ @router.post("/stream-chat")
56
+ async def stream_chat(chat_request: ChatRequest):
57
+ """流式聊天接口,提供RAG问答服务。"""
58
+
59
+ logger.info(f"{correlation_id.get()} Streaming chat for {chat_request}")
60
+
61
+ # 解析语言设置
62
+ language = get_language(chat_request.language)
63
+
64
+ # 使用上下文管理器设置语言
65
+ with with_language(language):
66
+ try:
67
+ chat_service = ChatService()
68
+ return StreamingResponse(
69
+ chat_service.generate_stream(chat_request),
70
+ media_type="text/event-stream",
71
+ headers={
72
+ "Connection": "keep-alive",
73
+ "Cache-Control": "no-cache",
74
+ },
75
+ )
76
+ except Exception as e:
77
+ logger.error(f"{correlation_id.get()} Stream chat error: {e}")
78
+ error_response = create_error_response(
79
+ error_key="service_unavailable",
80
+ details=str(e),
81
+ error_code=500,
82
+ )
83
+ return JSONResponse(content=error_response, status_code=500)
python-services/Retrieve/search_service/base_search.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import List

from bio_requests.rag_request import RagRequest
from dto.bio_document import BaseBioDocument


class BaseSearchService:
    """Base class for search services.

    Every subclass is auto-registered at class-definition time via
    ``__init_subclass__``; the registry lets the application fan a request
    out to all available data sources.
    """

    # Class-level registry of all search-service subclasses.
    _registry = []

    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
        BaseSearchService._registry.append(cls)

    @classmethod
    def get_subclasses(cls):
        """Return every registered search-service subclass."""
        return cls._registry

    def __init__(self):
        # Subclasses override this with their data-source identifier
        # (e.g. "pubmed", "web"). (Removed a dead `pass` after the assignment.)
        self.data_source = "Base"

    async def filter_search(self, rag_request: RagRequest) -> List[BaseBioDocument]:
        """Run the search only if this service's data source was requested.

        Returns:
            Search results, or an empty list when the request does not target
            this data source.
        """
        if self.data_source in rag_request.data_source:
            return await self.search(rag_request)
        return []

    async def search(self, rag_request: RagRequest) -> List[BaseBioDocument]:
        """Perform the actual search; subclasses override. Base returns []."""
        return []
python-services/Retrieve/search_service/pubmed_search.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import asyncio
import re
import time
from typing import Dict, List

from dto.bio_document import BaseBioDocument, create_bio_document
from search_service.base_search import BaseSearchService
from bio_requests.rag_request import RagRequest
from utils.bio_logger import bio_logger as logger


from service.query_rewrite import QueryRewriteService
from service.pubmed_api import PubMedApi
from service.pubmed_async_api import PubMedAsyncApi
from config.global_storage import get_model_config


class PubMedSearchService(BaseSearchService):
    """Search service backed by PubMed (data_source "pubmed")."""

    def __init__(self):
        self.query_rewrite_service = QueryRewriteService()
        self.model_config = get_model_config()

        # Default result counts from config; both may be overridden per
        # request inside get_query_list().
        self.pubmed_topk = self.model_config["recall"]["pubmed_topk"]
        self.es_topk = self.model_config["recall"]["es_topk"]
        self.data_source = "pubmed"

    async def get_query_list(self, rag_request: RagRequest) -> List[Dict]:
        """Build the list of {query_item, search_type} queries from the request.

        With rewriting enabled, the query is split into sub-queries, falling
        back to a simpler split when the first pass returns nothing.

        NOTE(review): this mutates self.pubmed_topk / self.es_topk, so a
        shared instance is not safe for concurrent requests with different
        top-k values — confirm a fresh instance is used per request.
        """
        if rag_request.is_rewrite:
            query_list = await self.query_rewrite_service.query_split(rag_request.query)
            logger.info(f"length of query_list after query_split: {len(query_list)}")
            if len(query_list) == 0:
                logger.info("query_list is empty, use query_split_for_simple")
                query_list = await self.query_rewrite_service.query_split_for_simple(
                    rag_request.query
                )
                logger.info(
                    f"length of query_list after query_split_for_simple: {len(query_list)}"
                )
            self.pubmed_topk = rag_request.pubmed_topk
            self.es_topk = rag_request.pubmed_topk
        else:
            self.pubmed_topk = rag_request.top_k
            self.es_topk = rag_request.top_k
            query_list = [
                {
                    "query_item": rag_request.query,
                    "search_type": rag_request.search_type,
                }
            ]
        return query_list

    async def search(self, rag_request: RagRequest) -> List[BaseBioDocument]:
        """Asynchronously search PubMed and return article documents."""
        if not rag_request.query:
            return []

        start_time = time.time()
        query_list = await self.get_query_list(rag_request)

        # Concurrency via asyncio rather than a thread pool at this level.
        articles_id_list = []
        es_articles = []

        try:
            # One task per query, each wrapping PubMedApi.search_database.
            async_tasks = []
            for query in query_list:
                task = self._search_pubmed_with_sync_api(
                    query["query_item"], self.pubmed_topk, query["search_type"]
                )
                async_tasks.append((query, task))

            # Run all search tasks concurrently.
            results = await asyncio.gather(
                *[task for _, task in async_tasks], return_exceptions=True
            )

            # Collect ids; per-query failures are logged, not raised.
            for i, (query, _) in enumerate(async_tasks):
                result = results[i]

                if isinstance(result, Exception):
                    logger.error(f"Error in search pubmed: {result}")
                else:
                    articles_id_list.extend(result)

        except Exception as e:
            logger.error(f"Error in concurrent PubMed search: {e}")

        # Fetch full article details for the collected ids.
        pubmed_docs = await self.fetch_article_details(articles_id_list)

        # Merge result sets (es_articles is currently always empty).
        all_results = []
        all_results.extend(pubmed_docs)
        all_results.extend(es_articles)

        logger.info(
            f"""Finished searching PubMed, query:{rag_request.query},
            total articles: {len(articles_id_list)}, total time: {time.time() - start_time:.2f}s"""
        )
        return all_results

    async def _search_pubmed_with_sync_api(
        self, query: str, top_k: int, search_type: str
    ) -> List[str]:
        """Run the synchronous PubMedApi.search_database without blocking.

        The blocking call is dispatched to the default thread pool via
        run_in_executor so multiple searches proceed concurrently.

        Args:
            query: Search query.
            top_k: Number of results to return.
            search_type: Search type.

        Returns:
            List of article ids.

        Raises:
            Exception: re-raised after logging so gather() can capture it.
        """
        try:
            # Run the synchronous search_database method in a thread.
            loop = asyncio.get_event_loop()
            pubmed_api = PubMedApi()

            # run_in_executor keeps the event loop responsive.
            id_list = await loop.run_in_executor(
                None,  # default thread pool
                pubmed_api.search_database,
                query,
                top_k,
                search_type,
            )
            return id_list
        except Exception as e:
            logger.error(f"Error in PubMed search for query '{query}': {e}")
            raise e

    async def fetch_article_details(
        self, articles_id_list: List[str]
    ) -> List[BaseBioDocument]:
        """Fetch detailed article records from PubMed for the given ids."""
        if not articles_id_list:
            return []

        # De-duplicate the ids (order is not preserved by set()).
        articles_id_list = list(set(articles_id_list))

        # Split the ids into groups of group_size for batched fetching.
        group_size = 80
        articles_id_groups = [
            articles_id_list[i : i + group_size]
            for i in range(0, len(articles_id_list), group_size)
        ]

        try:
            # Fetch all groups concurrently.
            batch_tasks = []
            for ids in articles_id_groups:
                pubmed_async_api = PubMedAsyncApi()
                task = pubmed_async_api.fetch_details(id_list=ids)
                batch_tasks.append(task)

            task_results = await asyncio.gather(*batch_tasks, return_exceptions=True)

            fetch_results = []
            for result in task_results:
                if isinstance(result, Exception):
                    logger.error(f"Error in fetch_details: {result}")
                    continue
                fetch_results.extend(result)

        except Exception as e:
            logger.error(f"Error in concurrent fetch_details: {e}")
            return []

        # Convert raw records into document objects.
        all_results = [
            create_bio_document(
                title=result["title"],
                abstract=result["abstract"],
                authors=self.process_authors(result["authors"]),
                doi=result["doi"],
                source=self.data_source,
                source_id=result["pmid"],
                pub_date=result["pub_date"],
                journal=result["journal"],
                text=result["abstract"],
                url=f'https://pubmed.ncbi.nlm.nih.gov/{result["pmid"]}',
            )
            for result in fetch_results
        ]
        return all_results

    def process_authors(self, author_list: List[Dict]) -> str:
        """Join author records into a single "Forename Lastname, ..." string."""
        return ", ".join(
            [f"{author['forename']} {author['lastname']}" for author in author_list]
        )
python-services/Retrieve/search_service/web_search.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Web search service for retrieving and processing web content.
3
+
4
+ This module provides functionality to search the web using Serper API
5
+ and extract content from web pages using crawl4ai.
6
+ """
7
+
8
+ import asyncio
9
+ import os
10
+ from typing import List, Optional
11
+
12
+ from bio_requests.rag_request import RagRequest
13
+ from dto.bio_document import BaseBioDocument, create_bio_document
14
+ from search_service.base_search import BaseSearchService
15
+ from service.web_search import SerperClient, scrape_urls, url_to_fit_contents
16
+ from utils.bio_logger import bio_logger as logger
17
+
18
+
19
class WebSearchService(BaseSearchService):
    """
    Web search service that retrieves content from web pages.

    Serper supplies the search hits; crawl4ai-based helpers extract the page
    content that is wrapped into BaseBioDocument objects.
    """

    def __init__(self):
        """Set up defaults; the Serper client itself is created lazily."""
        self.data_source = "web"
        self._serper_client: Optional[SerperClient] = None
        self._max_results = 5
        self._content_length_limit = 40000  # ~10k tokens of page text

    @property
    def serper_client(self) -> SerperClient:
        """Create the SerperClient on first access and cache it."""
        if self._serper_client is not None:
            return self._serper_client
        # API key comes from the environment; a missing key is only warned about.
        api_key = os.getenv("SERPER_API_KEY")
        if not api_key:
            logger.warning("SERPER_API_KEY environment variable not set, using default key")
        self._serper_client = SerperClient(api_key=api_key)
        return self._serper_client

    async def search(self, rag_request: RagRequest) -> List[BaseBioDocument]:
        """
        Perform web search and extract content from search results.

        Args:
            rag_request: The RAG request containing the search query

        Returns:
            List of BaseBioDocument objects with extracted web content
        """
        try:
            query = rag_request.query
            logger.info(f"Starting web search for query: {query}")

            hits = await self.search_serper(query, rag_request.top_k)
            if not hits:
                logger.info(f"No search results found for query: {query}")
                return []

            search_results = await self.enrich_url_results_with_contents(hits)
            logger.info(f"Web search completed. Found {len(search_results)} documents")
            return search_results
        except Exception as e:
            logger.error(f"Error during web search: {str(e)}", exc_info=e)
            return []

    async def enrich_url_results_with_contents(
        self, results: List
    ) -> List[BaseBioDocument]:
        """
        Extract content from URLs and create BaseBioDocument objects.

        Args:
            results: List of search results with URLs

        Returns:
            List of BaseBioDocument objects with extracted content
        """
        try:
            # Extract every page concurrently; exceptions are returned in-line.
            contents = await asyncio.gather(
                *(self._extract_content_from_url(res) for res in results),
                return_exceptions=True,
            )

            enriched_results: List[BaseBioDocument] = []
            for res, content in zip(results, contents):
                # Skip hits whose extraction raised.
                if isinstance(content, Exception):
                    logger.error(f"Failed to extract content from {res.url}: {content}")
                    continue
                enriched_results.append(
                    create_bio_document(
                        title=res.title,
                        url=res.url,
                        text=str(content)[: self._content_length_limit],
                        source=self.data_source,
                    )
                )
            return enriched_results
        except Exception as e:
            logger.error(f"Error enriching URL results: {str(e)}", exc_info=e)
            return []

    async def _extract_content_from_url(self, res) -> str:
        """
        Extract content from a single URL with error handling.

        Args:
            res: Search result object containing URL information

        Returns:
            Extracted content as string (an error message string on failure)
        """
        try:
            return await url_to_fit_contents(res)
        except Exception as e:
            logger.error(f"Error extracting content from {res.url}: {str(e)}")
            return f"Error extracting content: {str(e)}"

    async def search_serper(
        self, query: str, max_results: Optional[int] = None
    ) -> List:
        """
        Perform web search using Serper API.

        Args:
            query: Search query string
            max_results: Maximum number of results to return

        Returns:
            List of search results with URLs
        """
        try:
            if max_results is None:
                max_results = self._max_results
            logger.info(f"Searching Serper for: {query} (max_results: {max_results})")

            search_results = await self.serper_client.search(
                query, filter_for_relevance=True, max_results=max_results
            )
            if not search_results:
                logger.info(f"No search results from Serper for query: {query}")
                return []

            # Scrape content from the returned URLs.
            results = await scrape_urls(search_results)
            logger.info(f"Serper search completed. Found {len(results)} results")
            return results
        except Exception as e:
            logger.error(f"Error in Serper search: {str(e)}", exc_info=e)
            return []
python-services/Retrieve/service/__init__.py ADDED
File without changes
python-services/Retrieve/service/chat.py ADDED
@@ -0,0 +1,468 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """生物医学聊天服务模块,提供RAG问答和流式响应功能。"""
2
+
3
+ import datetime
4
+ import json
5
+ import time
6
+ from typing import Any, AsyncGenerator, List
7
+
8
+ from openai import AsyncOpenAI
9
+ from openai.types.chat import ChatCompletionMessageParam
10
+
11
+ from bio_requests.chat_request import ChatRequest
12
+ from bio_requests.rag_request import RagRequest
13
+ from config.global_storage import get_model_config
14
+ from search_service.pubmed_search import PubMedSearchService
15
+
16
+ from search_service.web_search import WebSearchService
17
+ from service.query_rewrite import QueryRewriteService
18
+ from service.rerank import RerankService
19
+ from utils.bio_logger import bio_logger as logger
20
+ from utils.i18n_util import get_error_message, get_label_message
21
+ from utils.token_util import num_tokens_from_messages, num_tokens_from_text
22
+ from utils.snowflake_id import snowflake_id_str
23
+
24
+
25
class ChatService:
    """Biomedical chat service: retrieval-augmented QA with streamed responses.

    Pipeline per request: PubMed retrieval -> web retrieval -> prompt assembly
    (with token-budget truncation) -> streamed LLM completion with a
    main/backup model fallback.
    """

    def __init__(self):
        self.pubmed_search_service = PubMedSearchService()
        self.web_search_service = WebSearchService()
        self.query_rewrite_service = QueryRewriteService()

        # Shared request object reused across the searches of one chat turn.
        self.rag_request = RagRequest()
        self.rerank_service = RerankService()
        self.model_config = get_model_config()

    def _initialize_rag_request(self, chat_request: ChatRequest) -> None:
        """Seed the shared RAG request with the user's query."""
        self.rag_request.query = chat_request.query

    async def generate_stream(self, chat_request: ChatRequest):
        """
        Generate a stream of messages for the chat request.

        Yields, in order: a PubMed task block, a web-search task block, a
        citation block, then the LLM answer chunks.

        Args:
            chat_request: the incoming chat request
        """
        start_time = time.time()

        try:
            self._initialize_rag_request(chat_request)

            # PubMed retrieval
            logger.info("QA-RAG: Start search pubmed...")
            pubmed_results = await self._search_pubmed(chat_request)
            pubmed_task_text = self._generate_pubmed_search_task_text(pubmed_results)
            yield pubmed_task_text
            logger.info(
                f"QA-RAG: Finished search pubmed, length: {len(pubmed_results)}"
            )

            # Web retrieval
            web_results = []
            logger.info("QA-RAG: Start search web...")
            web_urls, task_text = await self._search_web()
            logger.info("QA-RAG: Finished search web...")
            web_results = (
                await self.web_search_service.enrich_url_results_with_contents(web_urls)
            )
            yield task_text

            # Build prompt messages plus the citation payload for the client.
            messages, citation_list = self._create_messages(
                pubmed_results, web_results, chat_request
            )
            citation_text = self._generate_citation_text(citation_list)
            yield citation_text

            # Stream the model answer.
            async for content in self._stream_chat_completion(messages):
                yield content

            logger.info(
                f"Finished search and chat, query: [{chat_request.query}], total time: {time.time() - start_time:.2f}s"
            )

        except Exception as e:
            logger.error(f"Error occurred: {e}")
            # Return a localized error message based on the request context.
            error_msg = get_error_message("llm_service_error")
            yield f"data: {error_msg}\n\n"
            return

    def _generate_citation_text(self, citation_list: List[Any]) -> str:
        """Render the citation list as a bdd-resource-lookup fenced block."""
        return f"""
```bdd-resource-lookup
{json.dumps(citation_list)}
```
"""

    async def _search_pubmed(self, chat_request: ChatRequest) -> List[Any]:
        """Run the PubMed search, returning at most top_k documents ([] on error)."""
        try:
            logger.info(f"query: {chat_request.query}, Using pubmed search...")
            self.rag_request.top_k = self.model_config["qa-topk"]["pubmed"]
            self.rag_request.pubmed_topk = self.model_config["qa-topk"]["pubmed"]

            start_search_time = time.time()
            pubmed_results = await self.pubmed_search_service.search(self.rag_request)
            end_search_time = time.time()

            logger.info(
                f"length of pubmed_results: {len(pubmed_results)},time used:{end_search_time - start_search_time:.2f}s"
            )
            # NOTE(review): only a head-slice is applied here; no rerank call is
            # made despite the log wording below.
            pubmed_results = pubmed_results[0 : self.rag_request.top_k]
            logger.info(f"length of pubmed_results after rerank: {len(pubmed_results)}")

            end_rerank_time = time.time()
            logger.info(
                f"Reranked {len(pubmed_results)} results,time used:{end_rerank_time - end_search_time:.2f}s"
            )

            return pubmed_results
        except Exception as e:
            logger.error(f"error in search pubmed: {e}")
            return []

    async def _search_web(self) -> tuple[List[Any], str]:
        """Run the web search (with query rewrite) and build its task text."""
        web_topk = self.model_config["qa-topk"]["web"]
        try:
            # Try to get a rewritten query suited for web search.
            query_list = await self.query_rewrite_service.query_split_for_web(
                self.rag_request.query
            )
            # Safely pick the rewritten query; fall back to the original query
            # if the rewrite list is empty or yields a blank string.
            serper_query = (
                query_list[0].get("query_item", "").strip() if query_list else None
            )
            if not serper_query:
                serper_query = self.rag_request.query
            # Search with the final query.
            url_results = await self.web_search_service.search_serper(
                query=serper_query, max_results=web_topk
            )
        except Exception as e:
            logger.error(f"error in query rewrite web or serper retrieval: {e}")
            # On failure, retry the search with the original query.
            url_results = await self.web_search_service.search_serper(
                query=self.rag_request.query, max_results=web_topk
            )

        # Build the client-facing task text.
        task_text = self._generate_web_search_task_text(url_results)
        return url_results, task_text

    def _generate_pubmed_search_task_text(self, pubmed_results: List[Any]) -> str:
        """Build the PubMed search task block shown in the client UI."""
        docs = [
            {
                "docId": result.bio_id,
                "url": result.url,
                "title": result.title,
                "description": result.text,
                "author": result.authors,
                "JournalInfo": result.journal.get("title", "")
                + "."
                + result.journal.get("year", "")
                + "."
                + (
                    result.journal.get("start_page", "")
                    + "-"
                    + result.journal.get("end_page", "")
                    + "."
                    if result.journal.get("start_page")
                    and result.journal.get("end_page")
                    else ""
                )
                + "doi:"
                + result.doi,
                "PMID": result.source_id,
            }
            for result in pubmed_results
        ]
        label = get_label_message("pubmed_search")
        return self._generate_task_text(label, "pubmed", docs)

    def _generate_web_search_task_text(self, url_results: List[Any]) -> str:
        """Build the web search task block shown in the client UI."""
        web_docs = [
            {
                "docId": snowflake_id_str(),
                "url": url_result.url,
                "title": url_result.title,
                "description": url_result.description,
            }
            for url_result in url_results
        ]

        logger.info(f"URL Results: {web_docs}")

        label = get_label_message("web_search")

        return self._generate_task_text(label, "webSearch", web_docs)

    def _generate_task_text(self, label, source, bio_docs: List[Any]):
        """Render a search task payload as a bdd-chat-agent-task fenced block."""
        task = {
            "type": "search",
            "label": label,
            "hoverable": True,
            "handler": "QASearch",
            "status": "running",
            "handlerParam": {"source": source, "bioDocs": bio_docs},
        }
        return f"""
```bdd-chat-agent-task
{json.dumps(task)}
```
"""

    def _build_document_texts(
        self, pubmed_results: List[Any], web_results: List[Any]
    ) -> tuple[str, str, List[Any]]:
        """Format retrieved documents as numbered prompt snippets + citations."""
        citation_list = []
        temp_doc_list = []

        # PubMed results.
        pubmed_offset = 0
        for idx, doc in enumerate(pubmed_results):
            _idx = idx + 1 + pubmed_offset
            temp_doc_list.append(
                "[document {idx} begin] title: {title}. content: {abstract} [document {idx} end]".format(
                    idx=_idx, title=doc.title, abstract=doc.abstract
                )
            )
            citation_list.append(
                {"source": "pubmed", "docId": doc.bio_id, "citation": _idx}
            )
        pubmed_texts = "\n".join(temp_doc_list)

        temp_doc_list = []
        # Web search results, numbered after the PubMed ones.
        web_offset = pubmed_offset + len(pubmed_results)
        for idx, doc in enumerate(web_results):
            _idx = idx + 1 + web_offset
            temp_doc_list.append(
                "[document {idx} begin] title: {title}. content: {content} [document {idx} end]".format(
                    idx=_idx, title=doc.title, content=doc.text
                )
            )
            citation_list.append(
                {"source": "webSearch", "docId": doc.bio_id, "citation": _idx}
            )
        web_texts = "\n".join(temp_doc_list)

        return pubmed_texts, web_texts, citation_list

    def _truncate_documents_to_token_limit(
        self,
        pubmed_texts: str,
        web_texts: str,
        chat_request: ChatRequest,
    ) -> tuple[List[ChatCompletionMessageParam], int]:
        """Drop documents from the tail until the prompt fits the token budget.

        Returns the final messages and their token count.
        """
        pubmed_list = pubmed_texts.split("\n")
        web_list = web_texts.split("\n")

        today = datetime.date.today()
        openai_client_rag_prompt = self.model_config["chat"]["rag_prompt"]
        max_tokens = self.model_config["qa-prompt-max-token"]["max_tokens"]
        pubmed_token_limit = max_tokens
        web_token_limit = 60000
        personal_vector_token_limit = 80000
        # Per-source token budgets by enabled sources.
        # BUG FIX: the last two branches previously repeated the first two
        # conditions (`is_pubmed and is_web`, `is_pubmed and not is_web`) and
        # were unreachable; they are the `not is_pubmed` cases.
        if chat_request.is_pubmed and chat_request.is_web:
            personal_vector_token_limit = 40000
            pubmed_token_limit = 20000
            web_token_limit = 60000
        elif chat_request.is_pubmed and not chat_request.is_web:
            personal_vector_token_limit = 80000
            pubmed_token_limit = 40000
            web_token_limit = 0
        elif not chat_request.is_pubmed and chat_request.is_web:
            personal_vector_token_limit = 0
            pubmed_token_limit = 60000
            web_token_limit = 60000
        elif not chat_request.is_pubmed and not chat_request.is_web:
            personal_vector_token_limit = 0
            pubmed_token_limit = 120000
            web_token_limit = 0

        def calculate_num_tokens(
            pubmed_list: List[str], web_list: List[str]
        ) -> tuple[int, List[ChatCompletionMessageParam]]:
            """Fill the RAG prompt template and count its tokens."""
            docs_text = "\n".join(pubmed_list + web_list)

            pt = (
                openai_client_rag_prompt.replace("{search_results}", docs_text)
                .replace("{cur_date}", str(today))
                .replace("{question}", chat_request.query)
            )
            messages: List[ChatCompletionMessageParam] = [
                {"role": "user", "content": pt}
            ]
            num_tokens = num_tokens_from_messages(messages)
            return num_tokens, messages

        while True:
            num_tokens, messages = calculate_num_tokens(pubmed_list, web_list)
            if num_tokens <= max_tokens:
                break
            # Over budget: trim each source down to its own limit, re-checking
            # the total after each source.
            logger.info(
                f"start truncate documents to token limit: max_tokens: {max_tokens}"
            )
            logger.info(
                f"pubmed_token_limit: {pubmed_token_limit}, web_token_limit: {web_token_limit}, personal_vector_token_limit: {personal_vector_token_limit}"
            )

            while True:
                if num_tokens_from_text("\n".join(pubmed_list)) > pubmed_token_limit:
                    pubmed_list.pop()
                else:
                    break

            num_tokens, messages = calculate_num_tokens(pubmed_list, web_list)
            if num_tokens <= max_tokens:
                break

            while True:
                if num_tokens_from_text("\n".join(web_list)) > web_token_limit:
                    web_list.pop()
                else:
                    break

            num_tokens, messages = calculate_num_tokens(pubmed_list, web_list)
            if num_tokens <= max_tokens:
                break

        logger.info(f"Final token count: {num_tokens}")
        return messages, num_tokens

    def _create_messages(
        self,
        pubmed_results: List[Any],
        web_results: List[Any],
        chat_request: ChatRequest,
    ) -> tuple[List[ChatCompletionMessageParam], List[Any]]:
        """Create the chat messages and the citation list for the client."""
        if len(pubmed_results) == 0 and len(web_results) == 0:
            # No retrieved context: fall back to the bare user query.
            logger.info(f"No results found for query: {chat_request.query}")
            pt = chat_request.query
            messages: List[ChatCompletionMessageParam] = [
                {"role": "user", "content": pt}
            ]
            num_tokens = num_tokens_from_messages(messages)
            logger.info(f"Total tokens: {num_tokens}")
            return messages, []

        pubmed_texts, web_texts, citation_list = self._build_document_texts(
            pubmed_results, web_results
        )

        messages, num_tokens = self._truncate_documents_to_token_limit(
            pubmed_texts, web_texts, chat_request
        )

        return messages, citation_list

    async def _stream_chat_completion(
        self, messages: List[ChatCompletionMessageParam]
    ) -> AsyncGenerator[bytes, None]:
        """Stream the completion, preferring the main qa-llm config and
        falling back to the backup config when the main one fails."""

        async def create_stream_with_config(
            qa_config: dict, config_name: str
        ) -> AsyncGenerator[bytes, None]:
            """Open a streaming completion with the given config and yield chunks."""
            try:
                logger.info(f"Using qa-llm {config_name} configuration")

                client = AsyncOpenAI(
                    api_key=qa_config["api_key"],
                    base_url=qa_config["base_url"],
                )

                chat_start_time = time.time()

                stream = await client.chat.completions.create(
                    model=qa_config["model"],
                    messages=messages,
                    stream=True,
                    temperature=qa_config["temperature"],
                    max_tokens=qa_config["max_tokens"],
                )

                logger.info(
                    f"Finished chat completion with {config_name} config, total time: {time.time() - chat_start_time:.2f}s"
                )

                is_start_answer = False
                async for chunk in stream:
                    if chunk.choices and (content := chunk.choices[0].delta.content):
                        if not is_start_answer:
                            is_start_answer = True

                        yield content.encode("utf-8")

            except Exception as e:
                logger.info(f"qa-llm {config_name} configuration failed: {e}")
                raise e

        async def with_fallback(main_func, backup_func):
            """Yield from main_func; on failure, switch to backup_func."""
            try:
                async for content in main_func():
                    yield content
            except Exception as main_error:
                logger.info("Main config failed, falling back to backup configuration")
                try:
                    async for content in backup_func():
                        yield content
                except Exception as backup_error:
                    logger.error(
                        f"Both main and backup qa-llm configurations failed. "
                        f"Main error: {main_error}, Backup error: {backup_error}"
                    )
                    raise backup_error

        async def main_stream():
            logger.info("Using main qa-llm configuration")
            async for content in create_stream_with_config(
                self.model_config["qa-llm"]["main"], "main"
            ):
                yield content

        async def backup_stream():
            logger.info("Using backup qa-llm configuration")
            async for content in create_stream_with_config(
                self.model_config["qa-llm"]["backup"], "backup"
            ):
                yield content

        async for content in with_fallback(main_stream, backup_stream):
            yield content
python-services/Retrieve/service/pubmed_api.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from typing import Dict, List
3
+ from Bio import Entrez
4
+ import requests
5
+ from config.global_storage import get_model_config
6
+ from dto.bio_document import PubMedDocument
7
+ from service.pubmed_xml_parse import PubmedXmlParse
8
+ from utils.bio_logger import bio_logger as logger
9
+
10
# Pool of NCBI E-utilities accounts; one is picked pseudo-randomly per request
# to spread load across per-key rate limits.
# NOTE(review): these API keys are credentials hard-coded in source control —
# they should be rotated and moved to environment variables / secret storage.
PUBMED_ACCOUNT = [
    {"email": "[email protected]", "api_key": "60eb67add17f39aa588a43e30bb7fce98809"},
    {"email": "[email protected]", "api_key": "fd9bb5b827c95086b9c2d579df20beca2708"},
    {"email": "[email protected]", "api_key": "026586b79437a2b21d1e27d8c3f339230208"},
    {"email": "[email protected]", "api_key": "bca0489d8fe314bfdbb1f7bfe63fb5d76e09"},
]
16
+
17
+
18
class PubMedApi:
    """Synchronous PubMed client: Entrez ESearch for ids + EFetch for details."""

    def __init__(self):
        self.pubmed_xml_parse = PubmedXmlParse()
        self.model_config = get_model_config()

    def pubmed_search_function(
        self, query: str, top_k: int, search_type: str
    ) -> List[PubMedDocument]:
        """Search PubMed and return parsed article documents.

        Args:
            query: query string (keywords or an advanced PubMed expression).
            top_k: maximum number of article ids to retrieve.
            search_type: "keyword" (filters applied) or "advanced" (verbatim).

        Raises:
            Exception: re-raised after logging on any search/fetch failure.
        """
        try:
            start_time = time.time()
            logger.info(
                f'Trying to search PubMed for "{query}", top_k={top_k}, search_type={search_type}'
            )
            id_list = self.search_database(query, retmax=top_k, search_type=search_type)
            records = self.fetch_details(id_list, db="pubmed", rettype="abstract")

            end_search_pubmed_time = time.time()
            logger.info(
                f'Finished searching PubMed for "{query}", took {end_search_pubmed_time - start_time:.2f} seconds, found {len(records)} results'
            )

            return [
                PubMedDocument(
                    title=result["title"],
                    abstract=result["abstract"],
                    authors=self.process_authors(result["authors"]),
                    doi=result["doi"],
                    source="pubmed",
                    source_id=result["pmid"],
                    pub_date=result["pub_date"],
                    journal=result["journal"],
                    text=result["abstract"],
                )
                for result in records
            ]
        except Exception as e:
            logger.error(f"Error searching PubMed query: {query} error: {e}")
            raise e

    def process_authors(self, author_list: List[Dict]) -> str:
        """Join author dicts into "Forename Lastname, ..." form.

        Uses ``.get()`` so collective/corporate authors missing
        'forename'/'lastname' no longer raise KeyError; empty entries are
        skipped.
        """
        names = []
        for author in author_list:
            full_name = (
                f"{author.get('forename') or ''} {author.get('lastname') or ''}".strip()
            )
            if full_name:
                names.append(full_name)
        return ", ".join(names)

    # Query the database for matching record ids (ESearch).
    def search_database(
        self, query: str, retmax: int, search_type: str = "keyword"
    ) -> List[str]:
        """
        Retrieve the list of matching record ids from PubMed.

        :param search_type: "keyword" or "advanced"
        :param query: the query string
        :param retmax: maximum number of ids to return
        """
        start_time = time.time()
        db = "pubmed"
        # Pick one account from the pool pseudo-randomly (by wall-clock ms).
        random_index = int((time.time() * 1000) % len(PUBMED_ACCOUNT))
        random_pubmed_account = PUBMED_ACCOUNT[random_index]
        Entrez.email = random_pubmed_account["email"]
        Entrez.api_key = random_pubmed_account["api_key"]
        if search_type == "keyword":
            # Publication types excluded from keyword searches (non-research items).
            art_type_list = [
                "Address",
                "Bibliography",
                "Biography",
                "Books and Documents",
                "Clinical Conference",
                "Clinical Study",
                "Collected Works",
                "Comment",
                "Congress",
                "Consensus Development Conference",
                "Consensus Development Conference, NIH",
                "Dictionary",
                "Directory",
                "Duplicate Publication",
                "Editorial",
                "Festschrift",
                "Government Document",
                "Guideline",
                "Interactive Tutorial",
                "Interview",
                "Lecture",
                "Legal Case",
                "Legislation",
                "Letter",
                "News",
                "Newspaper Article",
                "Patient Education Handout",
                "Periodical Index",
                "Personal Narrative",
                "Practice Guideline",
                "Published Erratum",
                "Technical Report",
                "Video-Audio Media",
                "Webcast",
            ]
            art_type = "(" + " OR ".join(f'"{j}"[Filter]' for j in art_type_list) + ")"
            query = "( " + query + ")"
            query += " AND (fha[Filter]) NOT " + art_type
            handle = Entrez.esearch(
                db=db, term=query, usehistory="y", sort="relevance", retmax=retmax
            )
        elif search_type == "advanced":
            handle = Entrez.esearch(
                db=db, term=query, usehistory="y", sort="relevance", retmax=retmax
            )
        else:
            raise ValueError("search_type must be either 'keyword' or 'advanced'")

        results = Entrez.read(handle)
        handle.close()
        id_list = results["IdList"]
        logger.info(
            f"Finished searching PubMed id, took {time.time() - start_time:.2f} seconds, found {len(id_list)} results,query: {query}"
        )
        logger.info(
            f"Search type:{search_type} PubMed search query: {query}, id_list: {id_list}"
        )
        if len(id_list) == 0:
            return []
        return id_list

    def fetch_details(self, id_list, db="pubmed", rettype="abstract"):
        """Fetch and parse full records (EFetch) for the given ids.

        Returns a list of parsed article dicts, or [] on failure.
        """
        start_time = time.time()
        try:
            ids = ",".join(id_list)
            server = "efetch"

            random_index = int((time.time() * 1000) % len(PUBMED_ACCOUNT))
            random_pubmed_account = PUBMED_ACCOUNT[random_index]
            api_key = random_pubmed_account["api_key"]
            url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{server}.fcgi?db={db}&id={ids}&retmode=xml&api_key={api_key}&rettype={rettype}"
            # BUG FIX: requests.get without a timeout can block forever on a
            # stalled connection; cap it so the caller's error path is reached.
            response = requests.get(url, timeout=30)
            articles = self.pubmed_xml_parse.parse_pubmed_xml(response.text)
            logger.info(
                f"pubmed_async_http fetch detail, Time taken: {time.time() - start_time}"
            )
            return articles
        except Exception as e:
            logger.error(f"Error fetching details for id_list: {id_list}, error: {e}")
            # TODO: fall back to exact pmid matching on parse failure.
            return []
python-services/Retrieve/service/pubmed_async_api.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import time
3
+ from typing import Dict, List
4
+ import aiohttp
5
+
6
+ from config.global_storage import get_model_config
7
+ from dto.bio_document import PubMedDocument
8
+ from service.pubmed_xml_parse import PubmedXmlParse
9
+ from utils.bio_logger import bio_logger as logger
10
+
11
# Pool of NCBI E-utilities accounts; one is picked pseudo-randomly per request
# to spread load across per-key rate limits.
# NOTE(review): these API keys are credentials hard-coded in source control —
# they should be rotated and moved to environment variables / secret storage.
PUBMED_ACCOUNT = [
    {"email": "[email protected]", "api_key": "60eb67add17f39aa588a43e30bb7fce98809"},
    {"email": "[email protected]", "api_key": "fd9bb5b827c95086b9c2d579df20beca2708"},
    {"email": "[email protected]", "api_key": "026586b79437a2b21d1e27d8c3f339230208"},
    {"email": "[email protected]", "api_key": "bca0489d8fe314bfdbb1f7bfe63fb5d76e09"},
]
17
+
18
+
19
class PubMedAsyncApi:
    """Async PubMed client: ESearch for ids + EFetch for details over aiohttp."""

    def __init__(self):
        self.pubmed_xml_parse = PubmedXmlParse()
        self.model_config = get_model_config()

    async def pubmed_search_function(
        self, query: str, top_k: int, search_type: str
    ) -> List[PubMedDocument]:
        """Search PubMed and return parsed article documents.

        Raises the underlying exception (after logging) on failure.
        """
        try:
            start_time = time.time()
            logger.info(
                f'Trying to search PubMed for "{query}", top_k={top_k}, search_type={search_type}'
            )
            id_list = await self.search_database(
                query, db="pubmed", retmax=top_k, search_type=search_type
            )
            articles = await self.fetch_details(
                id_list, db="pubmed", rettype="abstract"
            )

            end_search_pubmed_time = time.time()
            logger.info(
                f'Finished searching PubMed for "{query}", took {end_search_pubmed_time - start_time:.2f} seconds, found {len(articles)} results'
            )

            return [
                PubMedDocument(
                    title=result["title"],
                    abstract=result["abstract"],
                    authors=self.process_authors(result["authors"]),
                    doi=result["doi"],
                    source="pubmed",
                    source_id=result["pmid"],
                    pub_date=result["pub_date"],
                    journal=result["journal"],
                )
                for result in articles
            ]
        except Exception as e:
            logger.error(f"Error searching PubMed query: {query} error: {e}")
            raise e

    def process_authors(self, author_list: List[Dict]) -> str:
        """Join author dicts into "Forename Lastname, ..." form.

        NOTE(review): direct indexing raises KeyError for collective authors
        lacking 'forename'/'lastname' — consider .get() defaults; verify
        against real PubMed records.
        """
        return ", ".join(
            [f"{author['forename']} {author['lastname']}" for author in author_list]
        )

    # Query the database for matching record ids (ESearch).
    async def search_database(
        self, query: str, db: str, retmax: int, search_type: str = "keyword"
    ) -> List[str]:
        """Return matching PubMed ids; "keyword" mode appends exclusion filters."""
        if search_type not in ["keyword", "advanced"]:
            raise ValueError("search_type must be one of 'keyword' or 'advanced'")

        if search_type == "keyword":
            # Publication types excluded from keyword searches (non-research items).
            art_type_list = [
                "Address",
                "Bibliography",
                "Biography",
                "Books and Documents",
                "Clinical Conference",
                "Clinical Study",
                "Collected Works",
                "Comment",
                "Congress",
                "Consensus Development Conference",
                "Consensus Development Conference, NIH",
                "Dictionary",
                "Directory",
                "Duplicate Publication",
                "Editorial",
                "Festschrift",
                "Government Document",
                "Guideline",
                "Interactive Tutorial",
                "Interview",
                "Lecture",
                "Legal Case",
                "Legislation",
                "Letter",
                "News",
                "Newspaper Article",
                "Patient Education Handout",
                "Periodical Index",
                "Personal Narrative",
                "Practice Guideline",
                "Published Erratum",
                "Technical Report",
                "Video-Audio Media",
                "Webcast",
            ]
            art_type = "(" + " OR ".join(f'"{j}"[Filter]' for j in art_type_list) + ")"
            query = "( " + query + ")"
            query += " AND (fha[Filter]) NOT " + art_type

        id_list = await self.esearch(query=query, retmax=retmax)

        if len(id_list) == 0:
            return []

        return id_list

    async def esearch(self, query=None, retmax=10):
        """Call the ESearch endpoint and return the raw id list."""
        start_time = time.time()
        db = "pubmed"
        server = "esearch"
        # Pick one account from the pool pseudo-randomly (by wall-clock ms).
        random_index = int((time.time() * 1000) % len(PUBMED_ACCOUNT))
        random_pubmed_account = PUBMED_ACCOUNT[random_index]

        api_key = random_pubmed_account["api_key"]
        url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{server}.fcgi?db={db}&term={query}&retmode=json&api_key={api_key}&sort=relevance&retmax={retmax}"
        response = await self.async_http_get(url=url)

        id_list = response["esearchresult"]["idlist"]
        logger.info(
            f"pubmed_async_http get id_list, search Time taken: {time.time() - start_time}s"
        )

        return id_list

    async def async_http_get(self, url: str):
        """GET a JSON payload with up to 3 retries (0.5s pause between tries)."""
        async with aiohttp.ClientSession() as session:
            try_time = 1
            while try_time < 4:
                async with session.get(url) as response:
                    if response.status == 200:
                        return await response.json()
                    else:
                        logger.error(
                            f"{url},try_time:{try_time},Error: {response.status}"
                        )
                        try_time += 1
                        # Sleep 0.5s before retrying.
                        await asyncio.sleep(0.5)
            raise Exception(f"Failed to fetch data from {url} after 3 attempts")

    async def async_http_get_text(self, url: str, params=None):
        """GET a text payload with up to 3 retries (0.5s pause between tries)."""
        async with aiohttp.ClientSession() as session:
            try_time = 1
            while try_time < 4:
                async with session.get(url, params=params) as response:
                    if response.status == 200:

                        return await response.text()
                    else:
                        logger.error(
                            f"{url},try_time:{try_time},Error: {response.status}"
                        )
                        try_time += 1
                        # Sleep 0.5s before retrying.
                        await asyncio.sleep(0.5)
            raise Exception(f"Failed to fetch data from {url} after 3 attempts")

    # Fetch full article records (EFetch).
    async def fetch_details(self, id_list, db="pubmed", rettype="abstract"):
        """Fetch and parse full records for the given ids; [] on failure."""
        start_time = time.time()
        try:
            ids = ",".join(id_list)
            server = "efetch"

            random_index = int((time.time() * 1000) % len(PUBMED_ACCOUNT))
            random_pubmed_account = PUBMED_ACCOUNT[random_index]
            api_key = random_pubmed_account["api_key"]
            url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{server}.fcgi?db={db}&id={ids}&retmode=xml&api_key={api_key}&rettype={rettype}"
            response = await self.async_http_get_text(url=url)
            articles = self.pubmed_xml_parse.parse_pubmed_xml(response)
            logger.info(
                f"pubmed_async_http fetch detail, Time taken: {time.time() - start_time}"
            )
            return articles
        except Exception as e:
            logger.error(f"Error fetching details for id_list: {id_list}, error: {e}")
            # TODO: fall back to exact pmid matching on parse failure.

            return []
python-services/Retrieve/service/pubmed_xml_parse.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import xml.etree.ElementTree as ET
2
+ import re
3
+
4
+
5
class PubmedXmlParse:
    """Parse PubMed EFetch XML payloads into plain-dict article records."""

    def __init__(self):
        pass

    def remove_xml_tags(self, text):
        """Strip markup tags from *text*, returning plain text."""
        return re.sub(r"<.*?>", "", text)

    def _find_text(self, elem, path, default=""):
        """Return ``elem.find(path).text``, or *default* when the node is absent.

        Note: a present-but-empty node still yields its ``.text`` (``None``),
        matching the behavior of the original inline lookups.
        """
        node = elem.find(path)
        return node.text if node is not None else default

    # Parse the EFetch XML response.
    def parse_pubmed_xml(self, xml_data):
        """Parse an EFetch XML document into a list of article dicts.

        Each dict carries: pmid, pmcid, title, abstract, journal, authors,
        pub_date, keywords, doi, mesh_terms, references.
        """
        root = ET.ElementTree(ET.fromstring(xml_data)).getroot()

        articles = []
        for article in root.findall(".//PubmedArticle"):
            # Titles may contain inline markup (<i>, <sub>, ...); serialize
            # the element and strip every tag to obtain plain text.
            article_title = ""
            title_elem = article.find(".//ArticleTitle")
            if title_elem is not None:
                title_text = ET.tostring(
                    title_elem, encoding="unicode", method="xml"
                )
                title_text = title_text.replace("<ArticleTitle>", "").replace(
                    "</ArticleTitle>", ""
                )
                article_title = self.remove_xml_tags(title_text).strip()

            pmid = self._find_text(article, ".//ArticleId[@IdType='pubmed']")

            abstract_texts = article.findall(".//AbstractText")
            abstract_text = (
                " ".join(
                    abstract.text if abstract.text is not None else ""
                    for abstract in abstract_texts
                )
                if abstract_texts
                else ""
            )

            # Author list; every field defaults to "" when missing.
            authors = [
                {
                    "lastname": self._find_text(author, ".//LastName"),
                    "forename": self._find_text(author, ".//ForeName"),
                    "initials": self._find_text(author, ".//Initials"),
                    "affiliation": self._find_text(
                        author, ".//AffiliationInfo/Affiliation"
                    ),
                }
                for author in article.findall(".//Author")
            ]

            journal = {
                "issn": self._find_text(article, ".//Journal/ISSN"),
                "title": self._find_text(article, ".//Journal/Title"),
                "abbreviation": self._find_text(article, ".//Journal/ISOAbbreviation"),
                "startPage": self._find_text(article, ".//Pagination/StartPage"),
                "endPage": self._find_text(article, ".//Pagination/EndPage"),
                "volume": self._find_text(article, ".//Journal/JournalIssue/Volume"),
                "issue": self._find_text(article, ".//Journal/JournalIssue/Issue"),
                "year": self._find_text(
                    article, ".//Journal/JournalIssue/PubDate/Year"
                ),
            }

            medline = article.find("MedlineCitation")
            references = article.findall(".//PubmedData/ReferenceList/Reference")

            articles.append(
                {
                    "pmid": pmid,
                    "pmcid": self._find_text(
                        article,
                        ".//PubmedData/ArticleIdList/ArticleId[@IdType='pmc']",
                    ),
                    "title": article_title,
                    "abstract": abstract_text,
                    "journal": journal,
                    "authors": authors,
                    "pub_date": {
                        "year": self._find_text(
                            article, ".//Journal/JournalIssue/PubDate/Year"
                        ),
                        "month": self._find_text(
                            article, ".//Journal/JournalIssue/PubDate/Month"
                        ),
                        "day": self._find_text(
                            article, ".//Journal/JournalIssue/PubDate/Day"
                        ),
                    },
                    # findall() never returns None, so the old "is not None"
                    # fallback was dead code; no keywords simply yields [].
                    "keywords": [
                        k.text for k in medline.findall(".//KeywordList/Keyword")
                    ],
                    "doi": self.parse_doi(medline.find("Article"), article),
                    "mesh_terms": [
                        self.parse_mesh(m)
                        for m in medline.findall("MeshHeadingList/MeshHeading")
                    ],
                    "references": [self.parse_reference(r) for r in references],
                }
            )

        return articles

    def parse_doi(self, article, article_elem) -> str:
        """Extract the DOI, preferring Article/ELocationID over ArticleIdList.

        Always returns a string ("" when no non-empty DOI is found).  The
        previous implementation could fall through and return None, and never
        consulted the ArticleIdList when an ELocationID existed but was empty.
        """
        elocation = article.find(".//ELocationID[@EIdType='doi']")
        if elocation is not None and elocation.text:
            return elocation.text
        article_id = article_elem.find(".//ArticleIdList/ArticleId[@IdType='doi']")
        if article_id is not None and article_id.text:
            return article_id.text
        return ""

    def parse_mesh(self, mesh_elem):
        """Parse one MeshHeading into its descriptor and qualifier names.

        Fix: the qualifier loop previously called q.find(".//QualifierName")
        on each QualifierName element (which searches descendants only), so
        every qualifier came back as "" — read q.text directly instead.
        """
        return {
            "descriptor": self._find_text(mesh_elem, ".//DescriptorName"),
            "qualifiers": [
                q.text if q.text is not None else ""
                for q in mesh_elem.findall(".//QualifierName")
            ],
        }

    def parse_reference(self, reference_elem):
        """Parse one Reference into its citation text and identifiers."""
        return {
            "citation": self._find_text(reference_elem, "Citation"),
            "doi": self._find_text(
                reference_elem, ".//ArticleId[@IdType='doi']"
            ),
            "pmid": self._find_text(
                reference_elem, ".//ArticleId[@IdType='pubmed']"
            ),
            "pmcid": self._find_text(
                reference_elem, ".//ArticleId[@IdType='pmcid']"
            ),
        }
python-services/Retrieve/service/query_rewrite.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from bio_agent.rewrite_agent import RewriteAgent
3
+ from utils.bio_logger import bio_logger as logger
4
+ from datetime import datetime
5
+
6
# Prompt for the full query-rewrite pass (category, keywords, journals,
# sub-queries, filters).  Typos fixed: "Frist" -> "First", "relavant" ->
# "relevant", plus minor grammar in the filter descriptions.
INSTRUCTIONS_rewrite = f"""
You are a research expert with strong skills in question categorization and optimizing PubMed searches.

First, classify the research question into exactly one of the following categories:
- Review: Queries that summarize existing knowledge or literature on a topic.
- Question_Answer: Queries that seek specific answers to scientific questions.


Secondly, extract the 3-6 key words of the research question. The key words should be the most important terms or phrases that capture the essence of the research question. These key words should be relevant to the topic and can be used to generate search queries. These key words should be relevant to medicine, biology, health, disease.

Thirdly, using the given keywords, please identify at least 60 leading authoritative journals in this field, including their names and EISSNs. It would be ok to include journals that are not strictly in the field of medicine, biology, health, or disease, but are relevant to the topic and the journals should be well-known and respected in their respective fields. The EISSN is the electronic International Standard Serial Number for the journal.

Next, break down this research question into specific search queries for PubMed that comprehensively cover all important aspects of the topic. Generate as many search queries as necessary to ensure thorough coverage - don't limit yourself to a fixed number.

Each query should:
1. Be concise (3-6 words maximum)
2. Focus on a specific aspect of the research question
3. Use appropriate scientific terminology
4. Be suitable for a scientific database search
5. Collectively cover the full breadth of the research topic

If the query's type is review, generate additional queries (10-20) to ensure thorough coverage. If the query's type is question-answer, fewer queries (5-10) may be sufficient.

Avoid long phrases, questions, or full sentences, as these are not effective for database searches.

Examples of good queries:
- "CRISPR cancer therapy"
- "tau protein Alzheimer's"
- "microbiome obesity metabolism"

Then, construct the final PubMed search query based on the following filters:
- "date_range": {{"start": "YYYY/MM/DD", "end": "YYYY/MM/DD",}}, only populate this field if the query contains phrases like "the past x years" or "the last x years"; otherwise, leave blank as default.
- "article_types": [], array of publication types, only if the user specifies some publication types, otherwise leave blank as default.
- "languages": [], array of language filters; if the user does not specify, use English as default.
- "subjects": [], if the user does not specify, use human as default.
- "journals": [], if the user does not specify, use [] as default.
- "author": [{{"name": string, "first_author": boolean, "last_author": boolean}}], if the user does not specify, use {{}} as default.


IMPORTANT: Your output MUST be a valid JSON object with a "queries" field containing an array of strings. For example:
```
{{ "category": "Review",
"key_words":["CRISPR", "cancer", "therapy"],
"key_journals":[{{"name":"Nature","EISSN":"1476-4687"}}],
"queries": [
"CRISPR cancer therapy",
"tau protein Alzheimer's",
"microbiome obesity metabolism"
],
"filters": {{"date_range": {{"start": "2019/01/01", "end": "2024/01/01"}},
"article_types": [],
"languages": ["English"],
"subjects": ["human"],
"journals": [],
"author": {{"name": "", "first_author": false, "last_author": false}}
}}
}}

Only output JSON. Follow the JSON schema below. Do not output anything else. I will be parsing this with Pydantic so output valid JSON only. If you are not sure about the output, output an empty array.

"""
+
69
# Fallback prompt: keyword extraction only.  Typo fixed: "relavant" ->
# "relevant".
SIMPLE_INSTRUCTIONS_rewrite = f"""
You are a research expert with strong skills in question categorization and optimizing PubMed searches.
Extract the 3-6 key words of the research question. The key words should be the most important terms or phrases that capture the essence of the research question. These key words should be relevant to the topic and can be used to generate search queries. These key words should be relevant to medicine, biology, health, disease.
IMPORTANT: Your output MUST be a valid JSON object. For example:
```
{{
"key_words":["CRISPR", "cancer", "therapy"],
}}

Only output JSON. Follow the JSON schema below. Do not output anything else. I will be parsing this with Pydantic so output valid JSON only. If you are not sure about the output, output an empty array.
"""
80
+
81
+
82
def build_pubmed_filter_query(data):
    """Assemble the PubMed filter clause from ``data["filters"]``.

    Joins the individual filter expressions (date range, publication type,
    language, journal, author) with " AND ".  Returns "" when no filter
    applies.
    """
    spec = data["filters"]
    clauses = []

    # Publication-date window; a missing endpoint falls back to a very early
    # date / today so a half-open range still works.
    window = spec.get("date_range", {})
    if window.get("start") or window.get("end"):
        begin = window.get("start", "1000/01/01")
        finish = window.get("end", datetime.now().strftime("%Y/%m/%d"))
        clauses.append(
            f'("{begin}"[Date - Publication] : "{finish}"[Date - Publication])'
        )

    # Publication-type filter.
    pub_types = spec.get("article_types", [])
    if pub_types:
        joined = " OR ".join(f'"{at}"[Publication Type]' for at in pub_types)
        clauses.append(f"({joined})")

    # Language filter.
    langs = spec.get("languages", [])
    if langs:
        joined = " OR ".join(f'"{lang}"[Language]' for lang in langs)
        clauses.append(f"({joined})")

    # Subject filter intentionally disabled (kept from the original):
    # subjects = spec.get("subjects", [])
    # if subjects:
    #     joined = " OR ".join(f'"{subj}"[MeSH Terms]' for subj in subjects)
    #     clauses.append(f"({joined})")

    # Journal filter.
    journals = spec.get("journals", [])
    if journals:
        joined = " OR ".join(f'"{journal}"[Journal]' for journal in journals)
        clauses.append(f"({joined})")

    # Author filter: explicit first/last position flags, plain [Author]
    # when neither flag is set.
    author = spec.get("author", {})
    if author and author.get("name"):
        name = author["name"]
        positions = []
        if author.get("first_author", False):
            positions.append(f'"{name}"[Author - First]')
        if author.get("last_author", False):
            positions.append(f'"{name}"[Author - Last]')
        if not positions:
            positions.append(f'"{name}"[Author]')
        clauses.append(f"({' OR '.join(positions)})")

    return " AND ".join(clauses) if clauses else ""
+
143
+
144
class QueryRewriteService:
    """Rewrites a free-form research question into PubMed query strings.

    Delegates the rewrite to :class:`RewriteAgent` and post-processes the
    returned JSON (category, keywords, journals, filters) into query dicts
    consumable by the search layer.
    """

    def __init__(self):
        self.rewrite_agent = RewriteAgent()
        # self.aclient = OPENAI_CLIENT
        # self.pd_data= pd.read_excel('config/2023JCR(完整).xlsx')
        # self.pd_data = self.pd_data[["名字", "EISSN"]]


    async def query_split(self, query: str):
        """Split *query* into multiple advanced PubMed search queries.

        Tries the full rewrite prompt up to 3 times; on repeated failure
        falls back to a simpler keyword-only flow (also up to 3 tries).
        Returns a list of {"query_item": ..., "search_type": "advanced"}
        dicts, or [] when every attempt failed.
        """
        start_time = time.time()
        query_list = []
        queries = []
        key_journals = {"name": "", "EISSN": ""}
        category = "Review"
        try_count = 0
        while try_count < 3:
            try:
                # Anchor the prompt to today's date so relative ranges like
                # "the past 5 years" resolve correctly.
                query_dict = await self.rewrite_agent.rewrite_query(
                    query, INSTRUCTIONS_rewrite + ' Please note: Today is ' + datetime.now().strftime("%Y/%m/%d") + '.'
                )
                logger.info(f"query_dict: {query_dict}")
                # logger.info(f"query_dict filter: {query_dict['filters']}")
                if (
                    "queries" not in query_dict
                    or "key_journals" not in query_dict
                    or "category" not in query_dict
                ):
                    logger.error(f"Invalid JSON structure, {query_dict}")

                    raise ValueError("Invalid JSON structure")
                queries = query_dict.get("queries")
                key_journals = query_dict.get("key_journals")
                category = query_dict.get("category")
                key_words = query_dict.get("key_words")
                # Turn each suggested journal's EISSN into a [Journal] clause.
                journal_list =[]
                for journal in key_journals:
                    journal_list.append(journal.get("EISSN", ""))
                journal_list = [
                    f"""("{journal_EISSN}"[Journal])"""
                    for journal_EISSN in journal_list
                ]
                # Always append a fixed set of flagship journals.
                journal_list += [
                    "(Nature[Journal])",
                    "(Science[Journal])",
                    "(Nature Reviews Methods Primers[Journal])",
                    "(Innovation[Journal])",
                    "(National Science Review[Journal])",
                    "(Nature Communications[Journal])",
                    "(Science Bulletin[Journal])",
                    "(Science Advances[Journal])",
                    "(BMJ[Journal])",
                ]
                if category == "Review":
                    for sub_query in queries:
                        # NOTE(review): this first entry ignores sub_query (it
                        # only joins key_words), so an identical dict is
                        # appended once per sub_query — looks like it was
                        # meant to be added only once; confirm intent.
                        query_list.append(
                            {
                                "query_item": "( "
                                # + sub_query.strip()
                                + ' '.join(key_words)
                                # + " ) AND ("
                                # + " OR ".join(journal_list)
                                + ") AND (fha[Filter]) AND "
                                + build_pubmed_filter_query(query_dict),
                                "search_type": "advanced",
                            }
                        )
                        query_list.append(
                            {
                                "query_item": "( "
                                + sub_query.strip()
                                + " ) AND ("
                                + " OR ".join(journal_list)
                                + ") AND (fha[Filter]) AND "
                                + build_pubmed_filter_query(query_dict),
                                "search_type": "advanced",
                            }
                        )

                else:
                    # query_list.append(
                    #     {
                    #         "query_item": "( "
                    #         + sub_query.strip()
                    #         + " ) AND ("
                    #         + " OR ".join(journal_list)
                    #         + ") AND (fha[Filter]) AND "
                    #         + build_pubmed_filter_query(query_dict),
                    #         "search_type": "advanced",
                    #     }
                    # )
                    query_list.append(
                        {
                            "query_item": "( "
                            # + sub_query.strip()
                            + ' '.join(key_words)
                            # + " ) AND ("
                            # + " OR ".join(journal_list)
                            + ") AND (fha[Filter]) AND "
                            + build_pubmed_filter_query(query_dict),
                            "search_type": "advanced",
                        }
                    )
                logger.info(
                    f"Original query: {query}, count: {len(query_list)}, wait time: {time.time() - start_time:.2f}s, rewrite result: {query_list}"
                )
                return query_list
            except Exception as e:
                logger.error(f"Error in query rewrite: {e},trying again...",exc_info=e)
                try_count += 1
                # NOTE(review): time.sleep blocks the event loop inside an
                # async method — asyncio.sleep would be the non-blocking form.
                time.sleep(0.1)
        new_try_count = 0
        logger.info(f"Error in query rewrite,trying a simple version again...")
        while new_try_count < 3:
            try:
                # NOTE(review): this "simple version" fallback still sends the
                # full INSTRUCTIONS_rewrite prompt; SIMPLE_INSTRUCTIONS_rewrite
                # was presumably intended here — confirm.
                query_dict = await self.rewrite_agent.rewrite_query(
                    query, INSTRUCTIONS_rewrite + ' Please note: Today is ' + datetime.now().strftime("%Y/%m/%d") + '.'
                )
                logger.info(f"query_dict: {query_dict}")
                if "key_words" not in query_dict:
                    logger.error(f"SIMPLE_version:Invalid JSON structure, {query_dict}")
                    raise ValueError("SIMPLE_version:Invalid JSON structure")
                key_words = query_dict.get("key_words")
                query_list.append(
                    {
                        "query_item": "( "
                        + ' '.join(key_words)
                        + " ) AND (fha[Filter]) AND "
                        + build_pubmed_filter_query(query_dict),
                        "search_type": "advanced",
                    }
                )
                logger.info(
                    f"SIMPLE_version: Original query: {query}, count: {len(query_list)}, wait time: {time.time() - start_time:.2f}s, rewrite result: {query_list}"
                )
                return query_list
            except Exception as e:
                logger.error(f"SIMPLE_version: Error in query rewrite: {e}")
                new_try_count += 1
                time.sleep(0.1)
        return []
    async def query_split_for_web(self,query: str):
        """
        For web use, only return the key words.

        Returns a single-element list [{"query_item": "<joined keywords>"}],
        or [{"query_item": ""}] after 3 failed attempts.
        """
        start_time = time.time()
        query_list = []
        try_count = 0
        while try_count < 3:
            try:
                # Third positional argument True selects the agent's
                # simplified rewrite path — assumed from the call shape;
                # confirm against RewriteAgent.rewrite_query.
                query_dict = await self.rewrite_agent.rewrite_query(
                    query, INSTRUCTIONS_rewrite + ' Please note: Today is ' + datetime.now().strftime("%Y/%m/%d") + '.',True
                )
                logger.info(f"query_dict: {query_dict}")
                if "key_words" not in query_dict:
                    logger.error(f"SIMPLE_version for web:Invalid JSON structure, {query_dict}")
                    raise ValueError("SIMPLE_version for web:Invalid JSON structure")
                key_words = query_dict.get("key_words")
                query_list.append(
                    {
                        "query_item":
                        ' '.join(key_words)
                        # + " ) AND (fha[Filter]) AND "
                        # + build_pubmed_filter_query(query_dict),
                        # "search_type": "advanced",
                    }
                )
                logger.info(
                    f"SIMPLE_version for web: Original query: {query}, count: {len(query_list)}, wait time: {time.time() - start_time:.2f}s, rewrite result: {query_list}"
                )
                return query_list
            except Exception as e:
                logger.error(f"SIMPLE_version: Error in query rewrite: {e}")
                try_count += 1
                time.sleep(0.1)
        return [{"query_item": ""}]

    async def query_split_for_simple(self,query: str):
        """
        For simple use, only return the key words.

        Same shape as query_split_for_web but uses the keyword-only prompt
        and tags the result with search_type "keyword".
        """
        start_time = time.time()
        query_list = []
        try_count = 0
        while try_count < 3:
            try:
                query_dict = await self.rewrite_agent.rewrite_query(
                    query, SIMPLE_INSTRUCTIONS_rewrite + ' Please note: Today is ' + datetime.now().strftime("%Y/%m/%d") + '.',True
                )
                logger.info(f"query_dict: {query_dict}")
                if "key_words" not in query_dict:
                    logger.error(f"SIMPLE_version for simple:Invalid JSON structure, {query_dict}")
                    raise ValueError("SIMPLE_version for simple:Invalid JSON structure")
                key_words = query_dict.get("key_words")
                query_list.append(
                    {
                        "query_item":
                        ' '.join(key_words),
                        # + " ) AND (fha[Filter]) AND "
                        # + build_pubmed_filter_query(query_dict),
                        "search_type": "keyword",
                    }
                )
                logger.info(
                    f"SIMPLE_version for simple: Original query: {query}, count: {len(query_list)}, wait time: {time.time() - start_time:.2f}s, rewrite result: {query_list}"
                )
                return query_list
            except Exception as e:
                logger.error(f"SIMPLE_version for simple: Error in query rewrite: {e}")
                try_count += 1
                time.sleep(0.1)
        return [{"query_item": ""}]
python-services/Retrieve/service/rag.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import time
3
+ from typing import List
4
+ from service.rerank import RerankService
5
+ from search_service.base_search import BaseSearchService
6
+ from utils.bio_logger import bio_logger as logger
7
+
8
+ from dto.bio_document import BaseBioDocument
9
+
10
+ from bio_requests.rag_request import RagRequest
11
+
12
+
13
class RagService:
    """Fans a RAG request out to every registered search service, merges the
    results, and optionally reranks them."""

    def __init__(self):
        self.rerank_service = RerankService()
        # Instantiate every registered BaseSearchService subclass.
        self.search_services = [cls() for cls in BaseSearchService.get_subclasses()]
        logger.info(
            f"Loaded search services: {[service.__class__.__name__ for service in self.search_services]}"
        )

    async def multi_query(self, rag_request: RagRequest) -> List[BaseBioDocument]:
        """Run every search service concurrently, optionally rerank, and
        return at most ``rag_request.top_k`` documents."""
        began = time.time()
        pending = [
            svc.filter_search(rag_request=rag_request) for svc in self.search_services
        ]
        # return_exceptions=True so one failing service cannot sink the rest.
        outcomes = await asyncio.gather(*pending, return_exceptions=True)

        merged = []
        for outcome in outcomes:
            if isinstance(outcome, Exception):
                logger.error(f"Error in search service: {outcome}")
            else:
                merged.extend(outcome)

        searched_at = time.time()
        logger.info(
            f"Found {len(merged)} results in total,time used:{searched_at - began:.2f}s"
        )

        if not rag_request.is_rerank:
            logger.info("RerankService: is_rerank is False, skip rerank")
            return merged[0 : rag_request.top_k]

        logger.info("RerankService: is_rerank is True")
        ranked = await self.rerank_service.rerank(
            rag_request=rag_request, documents=merged
        )
        logger.info(
            f"Reranked {len(ranked)} results,time used:{time.time() - searched_at:.2f}s"
        )
        return ranked[0 : rag_request.top_k]
python-services/Retrieve/service/rerank.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ from bio_requests.rag_request import RagRequest
3
+ from dto.bio_document import BaseBioDocument
4
+ from utils.bio_logger import bio_logger as logger
5
+
6
+ import pandas as pd
7
+
8
# Load the 2023 JCR journal-metrics spreadsheet once at import time.
df = pd.read_excel("config/2023JCR(完整).xlsx")

# Keep only the columns used for scoring: print ISSN, 5-year impact factor
# ("5年IF"), and electronic ISSN.
df = df[["ISSN", "5年IF", "EISSN"]]

# Coerce the impact-factor column to float; non-numeric cells become 0.01
# so unknown journals sort to the bottom rather than raising.
df["5年IF"] = pd.to_numeric(df["5年IF"], errors="coerce").fillna(0.01)
+
17
+
18
class RerankService:
    """Orders PubMed documents by their journal's 5-year impact factor."""

    def __init__(self):
        # Module-level JCR table (ISSN / EISSN -> 5-year IF), loaded once.
        self.df = df

    async def rerank(
        self, rag_request: RagRequest, documents: List[BaseBioDocument] = []
    ) -> List[BaseBioDocument]:
        """Deduplicate and sort *documents* by 5-year IF, highest first.

        Only PubMed-sourced requests are reranked; anything else is returned
        untouched.  (The mutable default is benign here: the parameter is
        only rebound, never mutated in place.)
        """
        if not rag_request.data_source or "pubmed" not in rag_request.data_source:
            logger.info("RerankService: data_source is not pubmed, skip rerank")
            return documents
        logger.info("RerankService: start rerank")

        # Step 1: attach an impact-factor score to each document, trying the
        # print-ISSN column first and the electronic-ISSN column second.
        for doc in documents:
            issn = doc.journal["issn"]
            hit = self.df.loc[self.df["ISSN"] == issn, "5年IF"].values
            if hit.size == 0:
                hit = self.df.loc[self.df["EISSN"] == issn, "5年IF"].values
            doc.if_score = hit[0] if hit.size > 0 else None

        # Step 2: drop duplicate bio_ids (dict keeps the last occurrence).
        unique_docs = list({d.bio_id: d for d in documents}.values())

        # Step 3: highest impact factor first; unknown journals count as 0.01.
        return sorted(
            unique_docs,
            key=lambda d: 0.01 if d.if_score is None else d.if_score,
            reverse=True,
        )
python-services/Retrieve/service/web_search.py ADDED
@@ -0,0 +1,406 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import ssl
4
+ import aiohttp
5
+ import asyncio
6
+ from agents import function_tool
7
+
8
+ # from ..workers.baseclass import ResearchAgent, ResearchRunner
9
+ # from ..workers.utils.parse_output import create_type_parser
10
+ from typing import List, Union, Optional
11
+ from bs4 import BeautifulSoup
12
+ from dotenv import load_dotenv
13
+ from pydantic import BaseModel, Field
14
+ from crawl4ai import *
15
+
16
+ load_dotenv()
17
+ CONTENT_LENGTH_LIMIT = 10000 # Trim scraped content to this length to avoid large context / token limit issues
18
+ SEARCH_PROVIDER = os.getenv("SEARCH_PROVIDER", "serper").lower()
19
+
20
+
21
+ # ------- DEFINE TYPES -------
22
+
23
+
24
class ScrapeResult(BaseModel):
    # Full scrape of a single webpage: snippet fields plus the page text.
    url: str = Field(description="The URL of the webpage")
    text: str = Field(description="The full text content of the webpage")
    title: str = Field(description="The title of the webpage")
    description: str = Field(description="A short description of the webpage")
+
30
+
31
class WebpageSnippet(BaseModel):
    # Lightweight SERP entry (no page text fetched yet).
    url: str = Field(description="The URL of the webpage")
    title: str = Field(description="The title of the webpage")
    description: Optional[str] = Field(description="A short description of the webpage")
+
36
+
37
class SearchResults(BaseModel):
    # Wrapper so the filter agent can return a JSON array of snippets.
    results_list: List[WebpageSnippet]
+
40
+
41
+ # ------- DEFINE TOOL -------
42
+
43
+ # Add a module-level variable to store the singleton instance
44
+ _serper_client = None
45
+
46
+
47
@function_tool
async def web_search(query: str) -> Union[List[ScrapeResult], str]:
    """Perform a web search for a given query and get back the URLs along with their titles, descriptions and text contents.

    Args:
        query: The search query

    Returns:
        List of ScrapeResult objects which have the following fields:
        - url: The URL of the search result
        - title: The title of the search result
        - description: The description of the search result
        - text: The full text content of the search result
    """
    if SEARCH_PROVIDER == "openai":
        # With the OpenAI provider the agents module's WebSearchTool is used
        # instead, so this tool should never be invoked directly.
        return "The web_search function is not used when SEARCH_PROVIDER is set to 'openai'. Please check your configuration."

    try:
        # Lazily create (and thereafter reuse) a single SerperClient.
        global _serper_client
        if _serper_client is None:
            _serper_client = SerperClient()

        snippets = await _serper_client.search(
            query, filter_for_relevance=True, max_results=5
        )
        return await scrape_urls(snippets)
    except Exception as e:
        # Surface a user-friendly message instead of raising.
        return f"Sorry, I encountered an error while searching: {str(e)}"
+
82
+
83
+ # ------- DEFINE AGENT FOR FILTERING SEARCH RESULTS BY RELEVANCE -------
84
+
85
+ FILTER_AGENT_INSTRUCTIONS = f"""
86
+ You are a search result filter. Your task is to analyze a list of SERP search results and determine which ones are relevant
87
+ to the original query based on the link, title and snippet. Return only the relevant results in the specified format.
88
+
89
+ - Remove any results that refer to entities that have similar names to the queried entity, but are not the same.
90
+ - E.g. if the query asks about a company "Amce Inc, acme.com", remove results with "acmesolutions.com" or "acme.net" in the link.
91
+
92
+ Only output JSON. Follow the JSON schema below. Do not output anything else. I will be parsing this with Pydantic so output valid JSON only:
93
+ {SearchResults.model_json_schema()}
94
+ """
95
+
96
+ # selected_model = fast_model
97
+ #
98
+ # filter_agent = ResearchAgent(
99
+ # name="SearchFilterAgent",
100
+ # instructions=FILTER_AGENT_INSTRUCTIONS,
101
+ # model=selected_model,
102
+ # output_type=SearchResults if model_supports_structured_output(selected_model) else None,
103
+ # output_parser=create_type_parser(SearchResults) if not model_supports_structured_output(selected_model) else None
104
+ # )
105
+
106
+ # ------- DEFINE UNDERLYING TOOL LOGIC -------
107
+
108
# Shared TLS settings for outbound scraping requests.
# WARNING: certificate verification is DISABLED (check_hostname=False,
# CERT_NONE) and legacy cipher suites are allowed via SECLEVEL=1 — this
# trades transport security for compatibility with old/misconfigured hosts.
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE
ssl_context.set_ciphers(
    "DEFAULT:@SECLEVEL=1"
)  # Allow older cipher suites (OpenSSL security level 1)
115
+
116
+
117
class SerperClient:
    """A client for the Serper API to perform Google searches.

    Results are post-filtered by dropping PubMed/PMC links and removing URLs
    that do not respond with HTTP 200; the LLM relevance filter is currently
    disabled (kept below, commented out, for reference).
    """

    def __init__(self, api_key: str = None):
        # Fall back to the SERPER_API_KEY environment variable when no key is
        # passed explicitly.
        self.api_key = api_key or os.getenv("SERPER_API_KEY")
        if not self.api_key:
            raise ValueError(
                "No API key provided. Set SERPER_API_KEY environment variable."
            )

        self.url = "https://google.serper.dev/search"
        self.headers = {"X-API-KEY": self.api_key, "Content-Type": "application/json"}

    async def search(
        self, query: str, filter_for_relevance: bool = True, max_results: int = 5
    ) -> List[WebpageSnippet]:
        """Perform a Google search using Serper API and fetch basic details for top results.

        Args:
            query: The search query
            filter_for_relevance: When True, drop PubMed sources and
                unreachable URLs before truncating; when False just truncate.
            max_results: Maximum number of results to return

        Returns:
            List of WebpageSnippet objects built from the "organic" results
        """
        # Fresh session per call; the module-level ssl_context disables
        # certificate verification (see the note where it is defined).
        connector = aiohttp.TCPConnector(ssl=ssl_context)
        async with aiohttp.ClientSession(connector=connector) as session:
            async with session.post(
                self.url, headers=self.headers, json={"q": query, "autocorrect": False}
            ) as response:
                response.raise_for_status()
                results = await response.json()
                results_list = [
                    WebpageSnippet(
                        url=result.get("link", ""),
                        title=result.get("title", ""),
                        description=result.get("snippet", ""),
                    )
                    for result in results.get("organic", [])
                ]

                if not results_list:
                    return []

                if not filter_for_relevance:
                    return results_list[:max_results]

                # return results_list[:max_results]

                return await self._filter_results(results_list, query, max_results=max_results)

    async def _filter_results(
        self, results: List[WebpageSnippet], query: str, max_results: int = 5
    ) -> List[WebpageSnippet]:
        # Drop PubMed / PMC sources: those are covered by the dedicated
        # PubMed search path elsewhere in the service.
        filtered_results = [
            res
            for res in results
            if "pmc.ncbi.nlm.nih.gov" not in res.url
            and "pubmed.ncbi.nlm.nih.gov" not in res.url
        ]

        # # get rid of unrelated data (LLM filter, currently disabled)
        # serialized_results = [result.model_dump() if isinstance(result, WebpageSnippet) else result for result in
        #                       filtered_results]
        #
        # user_prompt = f"""
        # Original search query: {query}
        #
        # Search results to analyze:
        # {json.dumps(serialized_results, indent=2)}
        #
        # Return {max_results} search results or less.
        # """
        #
        # try:
        #     result = await ResearchRunner.run(filter_agent, user_prompt)
        #     output = result.final_output_as(SearchResults)
        #     return output.results_list
        # except Exception as e:
        #     print("Error filtering urls:", str(e))
        #     return filtered_results[:max_results]

        async def fetch_url(session, url) -> bool:
            # Probe the URL; True only when it answers HTTP 200 within 5s.
            try:
                async with session.get(url, timeout=5) as response:
                    return response.status == 200
            except Exception as e:
                print(f"Error accessing {url}: {str(e)}")
                return False  # False means the URL is unreachable

        async def filter_unreachable_urls(results):
            # Probe all candidates concurrently and keep only reachable ones,
            # preserving the original ordering via zip.
            async with aiohttp.ClientSession() as session:
                tasks = [fetch_url(session, res.url) for res in results]
                reachable = await asyncio.gather(*tasks)
                return [
                    res for res, can_access in zip(results, reachable) if can_access
                ]

        reachable_results = await filter_unreachable_urls(filtered_results)

        # Return the first `max_results` or less if there are not enough reachable results
        return reachable_results[:max_results]
220
+
221
+
222
async def scrape_urls(items: List[WebpageSnippet]) -> List[ScrapeResult]:
    """Fetch text content from provided URLs.

    Args:
        items: List of SearchEngineResult items to extract content from

    Returns:
        List of ScrapeResult objects with the following fields:
        - url: The URL of the search result
        - title: The title of the search result
        - description: The description of the search result
        - text: The full text content of the search result
        Items that failed to scrape are silently dropped.
    """
    connector = aiohttp.TCPConnector(ssl=ssl_context)
    async with aiohttp.ClientSession(connector=connector) as session:
        # Fan out one fetch task per non-empty URL and run them concurrently.
        pending = [
            fetch_and_process_url(session, entry) for entry in items if entry.url
        ]
        outcomes = await asyncio.gather(*pending, return_exceptions=True)

        # Keep only successful scrapes; exceptions captured by gather are discarded.
        return [outcome for outcome in outcomes if isinstance(outcome, ScrapeResult)]
248
+
249
+
250
async def fetch_and_process_url(
    session: aiohttp.ClientSession, item: WebpageSnippet
) -> ScrapeResult:
    """Fetch a single URL and convert its HTML body to plain text.

    Never raises: every failure mode (restricted file extension, non-200
    status, network/timeout errors) is reported inside the returned
    ScrapeResult's ``text`` field, so callers using asyncio.gather keep
    their partial results.

    Args:
        session: Shared aiohttp session to issue the request on
        item: Snippet whose ``url`` is fetched; title/description are copied
            through to the result unchanged

    Returns:
        ScrapeResult with the scraped text, or an error message in ``text``
    """

    def _error_result(reason: str) -> ScrapeResult:
        # Single place to build the error envelope (previously duplicated
        # three times, once with a placeholder-less f-string).
        return ScrapeResult(
            url=item.url,
            title=item.title,
            description=item.description,
            text=f"Error fetching content: {reason}",
        )

    if not is_valid_url(item.url):
        return _error_result("URL contains restricted file extension")

    try:
        async with session.get(item.url, timeout=8) as response:
            if response.status != 200:
                # Report the status instead of raising so gather keeps going.
                return _error_result(f"HTTP {response.status}")
            content = await response.text()
            # Run html_to_text in a thread pool to avoid blocking the event loop.
            text_content = await asyncio.get_event_loop().run_in_executor(
                None, html_to_text, content
            )
            # Trim content to avoid exceeding the model's token limit.
            text_content = text_content[:CONTENT_LENGTH_LIMIT]
            return ScrapeResult(
                url=item.url,
                title=item.title,
                description=item.description,
                text=text_content,
            )
    except Exception as e:
        return _error_result(str(e))
296
+
297
+
298
def html_to_text(html_content: str) -> str:
    """
    Strip unnecessary elements from raw HTML and return readable text.

    Only headline (h1-h6), paragraph, list-item and blockquote tags are
    kept; each tag's stripped text becomes one line of output.
    """
    # Parse with lxml for speed.
    parsed = BeautifulSoup(html_content, "lxml")

    relevant_tags = ("h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "blockquote")

    lines = []
    for node in parsed.find_all(relevant_tags):
        stripped = node.get_text(strip=True)
        if stripped:
            lines.append(stripped)

    return "\n".join(lines)
316
+
317
+
318
def is_valid_url(url: str) -> bool:
    """Check that a URL does not point at a restricted (non-HTML) file type.

    The previous implementation used substring search over the whole URL,
    which wrongly rejected e.g. ``/data.json`` (contains ``.js``) and any
    URL whose query string happened to mention an extension. The check is
    now a suffix match on the URL *path* only. The ``x`` variants of Office
    formats (.docx/.xlsx/.pptx), which the substring test caught by
    accident, are listed explicitly.

    Args:
        url: The URL to validate

    Returns:
        True if the URL may be scraped, False if its path ends with a
        restricted extension
    """
    from urllib.parse import urlparse

    restricted_extensions = (
        ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
        ".zip", ".rar", ".7z", ".txt", ".js", ".xml", ".css",
        ".png", ".jpg", ".jpeg", ".gif", ".ico", ".svg", ".webp",
        ".mp3", ".mp4", ".avi", ".mov", ".wmv", ".flv", ".wma",
        ".wav", ".m4a", ".m4v", ".m4b", ".m4p", ".m4u",
    )
    path = urlparse(url).path.lower()
    # str.endswith accepts a tuple, so one call covers every extension.
    return not path.endswith(restricted_extensions)
358
+
359
+
360
async def url_to_contents(url):
    """Crawl a single URL with crawl4ai and return its markdown rendering."""
    async with AsyncWebCrawler() as crawler:
        crawl_result = await crawler.arun(url=url)
        # Full page markdown, unfiltered.
        return crawl_result.markdown
368
+
369
+
370
async def url_to_fit_contents(res):
    """Crawl res.url and return pruned ("fit") markdown, capped in length.

    Falls back to the coarsely extracted ``res.text`` when the crawl times
    out or fails for any other reason, so the caller always gets content.
    """
    # ~40,000 chars is roughly 10,000 tokens, so five concatenated pages
    # stay under ~50k tokens.
    str_fit_max = 40000

    browser_config = BrowserConfig(
        headless=True,
        verbose=True,
    )
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.DISABLED,
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                threshold=1.0, threshold_type="fixed", min_word_threshold=0
            )
        ),
        # Alternative: query-focused filtering, e.g.
        # DefaultMarkdownGenerator(content_filter=BM25ContentFilter(
        #     user_query="WHEN_WE_FOCUS_BASED_ON_A_USER_QUERY", bm25_threshold=1.0))
    )

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            # Bound the crawl with asyncio.wait_for so one slow page cannot
            # stall the whole pipeline.
            result = await asyncio.wait_for(
                crawler.arun(url=res.url, config=run_config), timeout=15
            )
            print(f"char before filtering {len(result.markdown.raw_markdown)}.")
            print(f"char after filtering {len(result.markdown.fit_markdown)}.")
            # On success, return the first str_fit_max characters.
            return result.markdown.fit_markdown[:str_fit_max]
    except asyncio.TimeoutError:
        print(f"Timeout occurred while accessing {res.url}.")
        # Timed out: fall back to the rough extraction carried on res.
        return res.text[:str_fit_max]
    except Exception as e:
        print(f"Exception occurred: {str(e)}")
        # Any other failure: same fallback.
        return res.text[:str_fit_max]
python-services/Retrieve/utils/bio_logger.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BioLogger - A comprehensive logging utility for the bio RAG server.
3
+
4
+ This module provides a centralized logging system with correlation ID support,
5
+ structured logging, and configurable output handlers.
6
+ """
7
+
8
+ import sys
9
+ import traceback
10
+ from pathlib import Path
11
+ from typing import Any, Optional
12
+
13
+ from asgi_correlation_id import correlation_id
14
+ from loguru import logger
15
+
16
+
17
class BioLogger:
    """
    Enhanced logging utility with correlation ID support and structured logging.

    This class provides a unified interface for logging with automatic
    correlation ID binding and comprehensive error tracking.
    """

    def __init__(self, log_dir: str = "logs", max_retention_days: int = 30):
        """
        Initialize the BioLogger.

        Args:
            log_dir: Directory to store log files
            max_retention_days: Maximum number of days to retain log files
        """
        self.log_dir = Path(log_dir)
        self.max_retention_days = max_retention_days
        self._setup_logging()

    def _setup_logging(self) -> None:
        """Configure loguru with a terminal sink and one rotating file sink.

        Fix: the previous version registered two file sinks (INFO and ERROR)
        on the *same* file path, which wrote every ERROR record twice and
        made two rotation handlers compete for one file. A single INFO-level
        sink already captures WARNING/ERROR/CRITICAL records.
        """
        # Remove loguru's default stderr handler so we fully control output.
        logger.remove()

        # Create log directory
        self.log_dir.mkdir(exist_ok=True)

        # Terminal handler
        logger.add(
            sys.stderr,
            format=self._get_format_string(),
            level="INFO",
            colorize=True,
            backtrace=True,
            diagnose=True,
        )

        # Single daily-rotating, compressed file handler.
        log_file = self.log_dir / "bio_rag_{time:YYYY-MM-DD}.log"
        logger.add(
            str(log_file),
            format=self._get_format_string(),
            level="INFO",
            rotation="1 day",
            retention=f"{self.max_retention_days} days",
            compression="zip",
            backtrace=True,
            diagnose=True,
        )

    def _get_format_string(self) -> str:
        """Get the log format string with correlation ID."""
        return "{time:YYYY-MM-DD HH:mm:ss.SSS} | {level: <8} | [CID:{extra[correlation_id]}] | {name}:{function}:{line} | {message}"

    def _get_correlation_id(self) -> str:
        """Get the current correlation ID or return SYSTEM."""
        return correlation_id.get() or "SYSTEM"

    def _bind_logger(self):
        """Bind logger with current correlation ID."""
        return logger.bind(correlation_id=self._get_correlation_id())

    def debug(self, message: str, **kwargs: Any) -> None:
        """
        Log a debug message.

        Args:
            message: The message to log
            **kwargs: Additional context data
        """
        self._bind_logger().debug(message, **kwargs)

    def info(self, message: str, **kwargs: Any) -> None:
        """
        Log an info message.

        Args:
            message: The message to log
            **kwargs: Additional context data
        """
        self._bind_logger().info(message, **kwargs)

    def warning(self, message: str, **kwargs: Any) -> None:
        """
        Log a warning message.

        Args:
            message: The message to log
            **kwargs: Additional context data
        """
        self._bind_logger().warning(message, **kwargs)

    def error(
        self, message: str, exc_info: Optional[Exception] = None, **kwargs: Any
    ) -> None:
        """
        Log an error message with optional exception information.

        Args:
            message: The error message
            exc_info: Optional exception object for detailed error tracking
            **kwargs: Additional context data
        """
        if exc_info is not None:
            error_details = self._format_exception_details(message, exc_info)
            self._bind_logger().error(error_details, **kwargs)
        else:
            self._bind_logger().error(message, **kwargs)

    def critical(
        self, message: str, exc_info: Optional[Exception] = None, **kwargs: Any
    ) -> None:
        """
        Log a critical error message.

        Args:
            message: The critical error message
            exc_info: Optional exception object for detailed error tracking
            **kwargs: Additional context data
        """
        if exc_info is not None:
            error_details = self._format_exception_details(message, exc_info)
            self._bind_logger().critical(error_details, **kwargs)
        else:
            self._bind_logger().critical(message, **kwargs)

    def _format_exception_details(self, message: str, exc_info: Exception) -> str:
        """
        Format exception details for logging.

        Args:
            message: The base error message
            exc_info: The exception object

        Returns:
            Formatted error details string (message, type, details and, when
            the exception carries a traceback, one line per stack frame)
        """
        exc_type = exc_info.__class__.__name__
        exc_message = str(exc_info)

        # Get stack trace (empty for exceptions that were never raised).
        stack_trace = []
        if exc_info.__traceback__:
            tb_list = traceback.extract_tb(exc_info.__traceback__)
            for tb in tb_list:
                stack_trace.append(
                    f"  File: {tb.filename}, "
                    f"Line: {tb.lineno}, "
                    f"Function: {tb.name}"
                )

        # Format error details
        error_details = [
            f"Error Message: {message}",
            f"Exception Type: {exc_type}",
            f"Exception Details: {exc_message}",
        ]

        if stack_trace:
            error_details.append("Stack Trace:")
            error_details.extend(stack_trace)

        return "\n".join(error_details)

    def log_performance(self, operation: str, duration: float, **kwargs: Any) -> None:
        """
        Log performance metrics.

        Args:
            operation: Name of the operation
            duration: Duration in seconds
            **kwargs: Additional performance metrics
        """
        message = f"Performance: {operation} took {duration:.3f}s"
        if kwargs:
            metrics = ", ".join(f"{k}={v}" for k, v in kwargs.items())
            message += f" | {metrics}"

        self.info(message)

    def log_api_call(
        self, method: str, url: str, status_code: int, duration: float
    ) -> None:
        """
        Log API call details.

        Args:
            method: HTTP method
            url: API endpoint URL
            status_code: HTTP status code
            duration: Request duration in seconds
        """
        # 4xx/5xx responses are logged at ERROR level, everything else at INFO.
        level = "error" if status_code >= 400 else "info"
        message = f"API Call: {method} {url} -> {status_code} ({duration:.3f}s)"

        if level == "error":
            self.error(message)
        else:
            self.info(message)

    def log_database_operation(
        self, operation: str, table: str, duration: float, **kwargs: Any
    ) -> None:
        """
        Log database operation details.

        Args:
            operation: Database operation (SELECT, INSERT, etc.)
            table: Table name
            duration: Operation duration in seconds
            **kwargs: Additional operation details
        """
        message = f"Database: {operation} on {table} took {duration:.3f}s"
        if kwargs:
            details = ", ".join(f"{k}={v}" for k, v in kwargs.items())
            message += f" | {details}"

        self.info(message)
250
+
251
+
252
# Create singleton instance; import `bio_logger` from this module so all
# components share a single configured logger.
bio_logger = BioLogger()
python-services/Retrieve/utils/http_util.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HTTP utility functions for making synchronous and asynchronous HTTP requests.
3
+
4
+ This module provides a unified interface for HTTP operations using httpx,
5
+ with proper error handling, timeout configuration, and retry logic.
6
+ """
7
+
8
+ import asyncio
9
+ import os
10
+ import time
11
+ import uuid
12
+ from typing import Any, Dict, Optional
13
+
14
+ import httpx
15
+
16
+ from utils.bio_logger import bio_logger as logger
17
+
18
+
19
class HTTPError(Exception):
    """Custom exception for HTTP-related errors.

    Carries the failing status code, a human-readable message and the
    requested URL, in addition to the formatted exception string.
    """

    def __init__(self, status_code: int, message: str, url: str):
        self.status_code = status_code
        self.message = message
        self.url = url
        summary = f"HTTP {status_code}: {message} for {url}"
        super().__init__(summary)
27
+
28
+
29
def _create_timeout(timeout: float = 10.0) -> httpx.Timeout:
    """Create a timeout configuration for HTTP requests.

    Args:
        timeout: Overall request timeout in seconds; the connect phase is
            additionally capped at 5 seconds regardless of this value.
    """
    return httpx.Timeout(timeout, connect=5.0)
32
+
33
+
34
def _handle_response(response: httpx.Response, url: str) -> Any:
    """Decode a 200 response as JSON, or log and raise HTTPError otherwise."""
    status = response.status_code
    if status != 200:
        # Guard clause: anything but 200 is treated as a failure.
        logger.error(f"HTTP request failed: {status} for {url}")
        raise HTTPError(
            status_code=status,
            message=f"Request failed with status {status}",
            url=url,
        )
    return response.json()
45
+
46
+
47
async def async_http_get(
    url: str,
    params: Optional[Dict[str, Any]] = None,
    timeout: float = 10.0,
    headers: Optional[Dict[str, str]] = None,
) -> Any:
    """
    Make an asynchronous HTTP GET request.

    Args:
        url: The URL to make the request to
        params: Query parameters to include in the request
        timeout: Request timeout in seconds
        headers: Optional headers to include in the request

    Returns:
        The JSON response from the server

    Raises:
        HTTPError: If the server responds with a non-200 status
        httpx.RequestError: If there's a network error
    """
    started = time.time()
    async with httpx.AsyncClient(timeout=_create_timeout(timeout)) as client:
        response = await client.get(url=url, params=params, headers=headers)
        # Record round-trip time for observability.
        logger.log_api_call("GET", url, response.status_code, time.time() - started)
        return _handle_response(response, url)
80
+
81
+
82
def http_get(
    url: str,
    params: Optional[Dict[str, Any]] = None,
    timeout: float = 10.0,
    headers: Optional[Dict[str, str]] = None,
) -> Any:
    """
    Make a synchronous HTTP GET request.

    Args:
        url: The URL to make the request to
        params: Query parameters to include in the request
        timeout: Request timeout in seconds
        headers: Optional headers to include in the request

    Returns:
        The JSON response from the server

    Raises:
        HTTPError: If the server responds with a non-200 status
        httpx.RequestError: If there's a network error
    """
    started = time.time()
    with httpx.Client(timeout=_create_timeout(timeout)) as client:
        response = client.get(url=url, params=params, headers=headers)
        # Record round-trip time for observability.
        logger.log_api_call("GET", url, response.status_code, time.time() - started)
        return _handle_response(response, url)
115
+
116
+
117
def http_post(
    url: str, data: Any, headers: Optional[Dict[str, Any]] = None, timeout: float = 10.0
) -> Any:
    """
    Make a synchronous HTTP POST request.

    Args:
        url: The URL to make the request to
        data: The data to send in the request body (JSON-encoded)
        headers: Optional headers to include in the request
        timeout: Request timeout in seconds

    Returns:
        The JSON response from the server

    Raises:
        HTTPError: If the server responds with a non-200 status
        httpx.RequestError: If there's a network error
    """
    started = time.time()
    with httpx.Client(timeout=_create_timeout(timeout)) as client:
        response = client.post(url=url, json=data, headers=headers)
        # Record round-trip time for observability.
        logger.log_api_call("POST", url, response.status_code, time.time() - started)
        return _handle_response(response, url)
147
+
148
+
149
async def async_http_post(
    url: str,
    data: Any,
    headers: Optional[Dict[str, Any]] = None,
    timeout: float = 10.0,
    max_retries: int = 3,
    retry_delay: float = 0.5,
) -> Any:
    """
    Make an asynchronous HTTP POST request with retry logic.

    Both non-200 responses and network errors are retried up to
    ``max_retries`` times with a fixed ``retry_delay`` between attempts;
    the client (and its connection pool) is reused across attempts.

    Args:
        url: The URL to make the request to
        data: The data to send in the request body (JSON-encoded)
        headers: Optional headers to include in the request
        timeout: Request timeout in seconds
        max_retries: Maximum number of retry attempts
        retry_delay: Delay between retries in seconds

    Returns:
        The JSON response from the server

    Raises:
        HTTPError: If the request fails after all retries
        httpx.RequestError: If there's a network error
    """
    timeout_config = _create_timeout(timeout)

    async with httpx.AsyncClient(timeout=timeout_config) as client:
        for attempt in range(1, max_retries + 1):
            try:
                start_time = time.time()
                response = await client.post(url=url, json=data, headers=headers)
                duration = time.time() - start_time

                # Log the API call
                logger.log_api_call("POST", url, response.status_code, duration)

                if response.status_code == 200:
                    return response.json()
                else:
                    logger.error(
                        f"HTTP POST failed (attempt {attempt}/{max_retries}): "
                        f"{response.status_code} for {url}"
                    )
                    if attempt < max_retries:
                        await asyncio.sleep(retry_delay)
                    else:
                        # NOTE: HTTPError is not a subclass of
                        # httpx.RequestError, so this deliberately escapes
                        # the except-clause below and propagates to the caller.
                        raise HTTPError(
                            status_code=response.status_code,
                            message=f"Request failed after {max_retries} attempts",
                            url=url,
                        )
            except httpx.RequestError as e:
                logger.error(f"Network error on attempt {attempt}: {e}")
                if attempt < max_retries:
                    await asyncio.sleep(retry_delay)
                else:
                    raise HTTPError(
                        status_code=0,
                        message=f"Network error after {max_retries} attempts: {str(e)}",
                        url=url,
                    ) from e

    # Defensive fallback: the loop above always returns or raises on the
    # final attempt, so this is unreachable in practice.
    raise HTTPError(
        status_code=0,
        message=f"Failed to fetch data from {url} after {max_retries} attempts",
        url=url,
    )
218
+
219
+
220
def download_file(
    file_url: str, directory_path: str, timeout: int = 60, verify_ssl: bool = True
) -> Optional[str]:
    """
    Download a file from a URL to a local directory under a random name.

    Args:
        file_url: The URL of the file to download
        directory_path: The directory to save the file in
        timeout: Request timeout in seconds
        verify_ssl: Whether to verify SSL certificates

    Returns:
        The path to the downloaded file, or None if the download failed
    """
    from urllib.parse import urlparse

    # Derive the extension from the URL *path*, not the raw URL: the old
    # `file_url.split(".")[-1]` produced garbage for extension-less URLs
    # (e.g. "https://host/file" -> extension "com/file").
    path = urlparse(file_url).path
    _, dot, extension = path.rpartition(".")
    if not dot or not extension or "/" in extension:
        extension = "bin"  # safe default when the path has no real extension
    random_filename = f"{uuid.uuid4()}.{extension}"

    # Create directory if it doesn't exist
    os.makedirs(directory_path, exist_ok=True)
    file_path = os.path.join(directory_path, random_filename)

    try:
        with httpx.Client(timeout=timeout, verify=verify_ssl) as client:
            with client.stream("GET", file_url) as response:
                if response.status_code == 200:
                    with open(file_path, "wb") as file:
                        # Stream to disk in 8 KiB chunks to bound memory use.
                        for chunk in response.iter_bytes(chunk_size=8192):
                            file.write(chunk)
                    logger.info(f"Successfully downloaded file to {file_path}")
                    return file_path
                else:
                    logger.error(
                        f"Download failed with status code: {response.status_code}"
                    )
                    return None
    except httpx.TimeoutException:
        logger.error("Download request timed out")
        return None
    except httpx.RequestError as e:
        logger.error(f"Download request failed: {e}")
        return None
    except Exception as e:
        logger.error(f"Unexpected error during download: {e}")
        return None
266
+
267
+
268
# Backward compatibility functions
async def async_http_post_legacy(url: str, params: dict) -> Any:
    """
    Legacy async HTTP POST function for backward compatibility.

    This function maintains the old interface but delegates to the new
    async_http_post implementation with its default timeout/retry settings.
    """
    return await async_http_post(url=url, data=params)
python-services/Retrieve/utils/i18n_context.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 国际化上下文管理器
3
+ 提供更优雅的语言设置方式,避免在函数间传递language参数
4
+ """
5
+
6
+ import contextvars
7
+ from utils.i18n_types import Language
8
+
9
# Context variable holding the per-context language; defaults to English.
# contextvars makes the value task-local, so concurrent requests do not
# clobber each other's language setting.
_language_context = contextvars.ContextVar("language", default=Language.ENGLISH)
11
+
12
+
13
class I18nContext:
    """Static helpers for reading and writing the context-local language."""

    @staticmethod
    def set_language(language: Language) -> None:
        """
        Set the language for the current context.

        Args:
            language: Language enum value
        """
        _language_context.set(language)

    @staticmethod
    def get_language() -> Language:
        """
        Get the language of the current context.

        Returns:
            The current Language enum value
        """
        return _language_context.get()

    @staticmethod
    def reset_language() -> None:
        """Reset the language back to the default (English)."""
        _language_context.set(Language.ENGLISH)

    @staticmethod
    def get_language_value() -> str:
        """
        Get the current language as its plain string value.

        Returns:
            The language string value
        """
        return _language_context.get().value
50
+
51
+
52
class I18nContextManager:
    """Context manager that temporarily switches the active language (supports `with`)."""

    def __init__(self, language: Language):
        """
        Initialize the context manager.

        Args:
            language: The language to activate on entry
        """
        self.language = language
        self._previous_language = None

    def __enter__(self):
        """Save the current language and activate the new one."""
        self._previous_language = I18nContext.get_language()
        I18nContext.set_language(self.language)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Restore the previously active language on exit."""
        if self._previous_language is not None:
            I18nContext.set_language(self._previous_language)
75
+
76
+
77
# Convenience module-level wrappers around I18nContext
def set_language(language: Language) -> None:
    """Set the language for the current context."""
    I18nContext.set_language(language)


def get_language() -> Language:
    """Get the language of the current context."""
    return I18nContext.get_language()


def reset_language() -> None:
    """Reset the language to the default (English)."""
    I18nContext.reset_language()
91
+
92
+
93
def with_language(language: Language):
    """
    Create a language context manager.

    Args:
        language: The language to activate inside the ``with`` block

    Returns:
        An I18nContextManager instance
    """
    return I18nContextManager(language)
104
+
105
+
106
+ # 装饰器,用于自动设置语言
107
+ def with_language_decorator(language: Language):
108
+ """
109
+ 装饰器,为函数自动设置语言上下文
110
+
111
+ Args:
112
+ language: 要设置的语言
113
+
114
+ Returns:
115
+ 装饰器函数
116
+ """
117
+
118
+ def decorator(func):
119
+ def wrapper(*args, **kwargs):
120
+ with I18nContextManager(language):
121
+ return func(*args, **kwargs)
122
+
123
+ return wrapper
124
+
125
+ return decorator
python-services/Retrieve/utils/i18n_messages.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 国际化消息配置文件
3
+ 包含所有错误消息、成功消息、状态消息和UI标签消息的中英文映射
4
+ """
5
+
6
+ from utils.i18n_types import Language
7
+
8
+
9
+ # 错误消息国际化
10
+ ERROR_MESSAGES = {
11
+ Language.CHINESE: {
12
+ "invalid_request": "无效的请求参数",
13
+ "search_failed": "搜索失败",
14
+ "no_results": "未找到相关结果",
15
+ "service_unavailable": "服务暂时不可用",
16
+ "internal_error": "内部服务器错误",
17
+ "invalid_language": "不支持的语言设置",
18
+ "query_too_long": "查询内容过长",
19
+ "rate_limit_exceeded": "请求频率过高,请稍后重试",
20
+ "authentication_failed": "认证失败",
21
+ "permission_denied": "权限不足",
22
+ "resource_not_found": "资源未找到",
23
+ "network_error": "网络连接错误",
24
+ "timeout_error": "请求超时",
25
+ "invalid_format": "数据格式错误",
26
+ "missing_required_field": "缺少必需字段",
27
+ "invalid_user_id": "无效的用户ID",
28
+ "search_service_error": "搜索服务错误",
29
+ "llm_service_error": "语言模型服务错误",
30
+ "embedding_service_error": "向量化服务错误",
31
+ "database_error": "数据库错误",
32
+ },
33
+ Language.ENGLISH: {
34
+ "invalid_request": "Invalid request parameters",
35
+ "search_failed": "Search failed",
36
+ "no_results": "No relevant results found",
37
+ "service_unavailable": "Service temporarily unavailable",
38
+ "internal_error": "Internal server error",
39
+ "invalid_language": "Unsupported language setting",
40
+ "query_too_long": "Query content too long",
41
+ "rate_limit_exceeded": "Request rate exceeded, please try again later",
42
+ "authentication_failed": "Authentication failed",
43
+ "permission_denied": "Permission denied",
44
+ "resource_not_found": "Resource not found",
45
+ "network_error": "Network connection error",
46
+ "timeout_error": "Request timeout",
47
+ "invalid_format": "Invalid data format",
48
+ "missing_required_field": "Missing required field",
49
+ "invalid_user_id": "Invalid user ID",
50
+ "search_service_error": "Search service error",
51
+ "llm_service_error": "Language model service error",
52
+ "embedding_service_error": "Embedding service error",
53
+ "database_error": "Database error",
54
+ },
55
+ }
56
+
57
+ # 成功消息国际化
58
+ SUCCESS_MESSAGES = {
59
+ Language.CHINESE: {
60
+ "search_success": "搜索成功",
61
+ "chat_success": "聊天服务正常",
62
+ "health_check_ok": "服务运行正常",
63
+ "results_found": "找到相关结果",
64
+ "processing_complete": "处理完成",
65
+ },
66
+ Language.ENGLISH: {
67
+ "search_success": "Search successful",
68
+ "chat_success": "Chat service normal",
69
+ "health_check_ok": "Service running normally",
70
+ "results_found": "Relevant results found",
71
+ "processing_complete": "Processing complete",
72
+ },
73
+ }
74
+
75
+ # 状态消息国际化
76
+ STATUS_MESSAGES = {
77
+ Language.CHINESE: {
78
+ "processing": "正在处理",
79
+ "searching": "正在搜索",
80
+ "generating": "正在生成回答",
81
+ "completed": "已完成",
82
+ "failed": "处理失败",
83
+ },
84
+ Language.ENGLISH: {
85
+ "processing": "Processing",
86
+ "searching": "Searching",
87
+ "generating": "Generating answer",
88
+ "completed": "Completed",
89
+ "failed": "Processing failed",
90
+ },
91
+ }
92
+
93
+ # UI标签消息国际化
94
+ LABEL_MESSAGES = {
95
+ Language.CHINESE: {
96
+ "web_search_start": "正在调用 Browser 进行内容检索,所需时间较长,请等待...",
97
+ "web_search": "正在调用 Browser 进行内容检索",
98
+ "personal_search_start": "正在调用 个人知识库 进行内容检索,所需时间较长,请等待...",
99
+ "personal_search": "正在调用 个人知识库 进行内容检索",
100
+ "pubmed_search_start": "正在调用 PubMed 进行内容检索,所需时间较长,请等待...",
101
+ "pubmed_search": "正在调用 PubMed 进行内容检索",
102
+ "generating_answer": "正在生成回答",
103
+ "processing": "正在处理",
104
+ "personal_search_description": "片段 {index}",
105
+ },
106
+ Language.ENGLISH: {
107
+ "web_search_start": "Retrieving content from Browser, this may take a while, please wait...",
108
+ "web_search": "Retrieving content from Browser",
109
+ "personal_search_start": "Retrieving content from Personal Knowledge Base, this may take a while, please wait...",
110
+ "personal_search": "Retrieving content from Personal Knowledge Base",
111
+ "pubmed_search_start": "Retrieving content from PubMed, this may take a while, please wait...",
112
+ "pubmed_search": "Retrieving content from PubMed",
113
+ "generating_answer": "Generating answer",
114
+ "processing": "Processing",
115
+ "personal_search_description": "Chunk {index} from this reference.",
116
+ },
117
+ }
118
+
119
+ # 系统消息国际化
120
+ SYSTEM_MESSAGES = {
121
+ Language.CHINESE: {
122
+ "welcome": "欢迎使用生物医学RAG服务",
123
+ "service_start": "服务已启动",
124
+ "service_stop": "服务已停止",
125
+ "connection_established": "连接已建立",
126
+ "connection_lost": "连接已断开",
127
+ "maintenance_mode": "系统维护中",
128
+ "updating": "系统更新中",
129
+ "backup_restore": "备份恢复中",
130
+ },
131
+ Language.ENGLISH: {
132
+ "welcome": "Welcome to Biomedical RAG Service",
133
+ "service_start": "Service started",
134
+ "service_stop": "Service stopped",
135
+ "connection_established": "Connection established",
136
+ "connection_lost": "Connection lost",
137
+ "maintenance_mode": "System under maintenance",
138
+ "updating": "System updating",
139
+ "backup_restore": "Backup restoring",
140
+ },
141
+ }
142
+
143
+
144
+ # 业务消息国际化
145
+ BUSINESS_MESSAGES = {
146
+ Language.CHINESE: {
147
+ "search_started": "开始搜索...",
148
+ "search_completed": "搜索完成",
149
+ "no_search_results": "未找到搜索结果",
150
+ "processing_request": "正在处理请求...",
151
+ "request_completed": "请求处理完成",
152
+ "upload_success": "文件上传成功",
153
+ "upload_failed": "文件上传失败",
154
+ "download_started": "开始下载...",
155
+ "download_completed": "下载完成",
156
+ "operation_success": "操作成功",
157
+ "operation_failed": "操作失败",
158
+ "data_saved": "数据已保存",
159
+ "data_deleted": "数据已删除",
160
+ "data_updated": "数据已更新",
161
+ "connection_timeout": "连接超时",
162
+ "server_busy": "服务器繁忙",
163
+ "maintenance_notice": "系统维护通知",
164
+ },
165
+ Language.ENGLISH: {
166
+ "search_started": "Search started...",
167
+ "search_completed": "Search completed",
168
+ "no_search_results": "No search results found",
169
+ "processing_request": "Processing request...",
170
+ "request_completed": "Request completed",
171
+ "upload_success": "File uploaded successfully",
172
+ "upload_failed": "File upload failed",
173
+ "download_started": "Download started...",
174
+ "download_completed": "Download completed",
175
+ "operation_success": "Operation successful",
176
+ "operation_failed": "Operation failed",
177
+ "data_saved": "Data saved",
178
+ "data_deleted": "Data deleted",
179
+ "data_updated": "Data updated",
180
+ "connection_timeout": "Connection timeout",
181
+ "server_busy": "Server busy",
182
+ "maintenance_notice": "System maintenance notice",
183
+ },
184
+ }
185
+
186
+ # 所有消息类型的映射
187
+ ALL_MESSAGE_TYPES = {
188
+ "error": ERROR_MESSAGES,
189
+ "success": SUCCESS_MESSAGES,
190
+ "status": STATUS_MESSAGES,
191
+ "label": LABEL_MESSAGES,
192
+ "system": SYSTEM_MESSAGES,
193
+ "business": BUSINESS_MESSAGES,
194
+ }
195
+
196
+
197
def get_message(message_type: str, key: str, language: Language) -> str:
    """Look up an internationalized message.

    Args:
        message_type: One of "error", "success", "status", "label",
            "system", "business".
        key: Message key within that type.
        language: Requested language.

    Returns:
        The translated message. Falls back to the Chinese table when the
        requested language is absent, and to a diagnostic placeholder when
        the type or key is unknown.
    """
    table = ALL_MESSAGE_TYPES.get(message_type)
    if table is None:
        return f"Unknown message type: {message_type}"

    # Chinese is the reference language: every key exists there.
    fallback = table[Language.CHINESE]
    localized = table.get(language, fallback)
    return localized.get(
        key,
        fallback.get(key, f"Unknown {message_type} message: {key}"),
    )
219
+
220
+
221
def get_all_messages_for_language(language: Language) -> dict:
    """Return every message table translated to *language*.

    Args:
        language: Requested language.

    Returns:
        Mapping of message type -> {key: message}; the Chinese table is
        used for any type that lacks the requested language.
    """
    return {
        message_type: table.get(language, table[Language.CHINESE])
        for message_type, table in ALL_MESSAGE_TYPES.items()
    }
235
+
236
+
237
def get_available_message_types() -> list:
    """Return the names of all registered message types.

    Returns:
        List of message-type identifiers (e.g. "error", "success").
    """
    return [*ALL_MESSAGE_TYPES]
245
+
246
+
247
def get_available_keys_for_type(message_type: str) -> list:
    """Return all message keys defined for *message_type*.

    Args:
        message_type: Message type to inspect.

    Returns:
        List of keys, taken from the Chinese table (the reference
        language); empty list for an unknown type.
    """
    table = ALL_MESSAGE_TYPES.get(message_type)
    if table is None:
        return []
    return [*table[Language.CHINESE]]
python-services/Retrieve/utils/i18n_types.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 国际化类型定义
3
+ """
4
+
5
+ from enum import Enum
6
+
7
+
8
class Language(Enum):
    """Languages supported by the i18n subsystem."""

    CHINESE = "zh"  # Simplified Chinese
    ENGLISH = "en"  # English
python-services/Retrieve/utils/i18n_util.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 国际化工具类,支持中英文切换功能
3
+ """
4
+
5
+ from typing import Dict, Any, Optional
6
+ from utils.i18n_types import Language
7
+ from utils.i18n_messages import get_message
8
+ from utils.i18n_context import I18nContext
9
+
10
+
11
class I18nUtil:
    """Internationalization facade.

    Provides language-code parsing, typed accessors for localized messages,
    and helpers that build language-annotated API response payloads. Every
    accessor takes an optional explicit ``language``; when omitted, the
    language bound to the current I18nContext is used.
    """

    # Language used when a code is missing or unrecognized.
    DEFAULT_LANGUAGE = Language.ENGLISH

    # Normalized code -> enum; "-" is folded to "_" before lookup,
    # so "zh-CN" and "zh_CN" both resolve to CHINESE.
    LANGUAGE_MAPPING = {
        "zh": Language.CHINESE,
        "zh_cn": Language.CHINESE,
        "en": Language.ENGLISH,
        "en_us": Language.ENGLISH,
    }

    @classmethod
    def parse_language(cls, language_str: Optional[str]) -> Language:
        """Parse a language code such as "en", "en-US" or "zh_CN".

        Args:
            language_str: Raw code; None or empty yields DEFAULT_LANGUAGE.

        Returns:
            Matching Language, or DEFAULT_LANGUAGE when unknown.
        """
        if not language_str:
            return cls.DEFAULT_LANGUAGE
        normalized = language_str.lower().replace("-", "_")
        return cls.LANGUAGE_MAPPING.get(normalized, cls.DEFAULT_LANGUAGE)

    @classmethod
    def _resolve_language(cls, language: Optional[Language]) -> Language:
        """Return *language*, or the current context language when None.

        Consolidates the None-check previously repeated in every accessor.
        """
        return language if language is not None else I18nContext.get_language()

    @classmethod
    def get_error_message(cls, key: str, language: Optional[Language] = None) -> str:
        """Return the localized error message for *key*."""
        return get_message("error", key, cls._resolve_language(language))

    @classmethod
    def get_success_message(cls, key: str, language: Optional[Language] = None) -> str:
        """Return the localized success message for *key*."""
        return get_message("success", key, cls._resolve_language(language))

    @classmethod
    def get_status_message(cls, key: str, language: Optional[Language] = None) -> str:
        """Return the localized status message for *key*."""
        return get_message("status", key, cls._resolve_language(language))

    @classmethod
    def get_label_message(cls, key: str, language: Optional[Language] = None) -> str:
        """Return the localized UI-label message for *key*."""
        return get_message("label", key, cls._resolve_language(language))

    @classmethod
    def get_system_message(cls, key: str, language: Optional[Language] = None) -> str:
        """Return the localized system message for *key*."""
        return get_message("system", key, cls._resolve_language(language))

    @classmethod
    def get_business_message(cls, key: str, language: Optional[Language] = None) -> str:
        """Return the localized business message for *key*."""
        return get_message("business", key, cls._resolve_language(language))

    @classmethod
    def create_error_response(
        cls,
        error_key: str,
        language: Optional[Language] = None,
        details: Optional[str] = None,
        error_code: int = 400,
    ) -> Dict[str, Any]:
        """Build a standard error payload.

        Args:
            error_key: Error message key.
            language: Language; defaults to the context language.
            details: Optional extra detail string (attached only if truthy).
            error_code: Numeric error code (default 400).

        Returns:
            Dict with ``success=False`` and a localized ``error`` object.
        """
        language = cls._resolve_language(language)

        response: Dict[str, Any] = {
            "success": False,
            "error": {
                "code": error_code,
                "message": cls.get_error_message(error_key, language),
                "language": language.value,
            },
        }
        if details:
            response["error"]["details"] = details
        return response

    @classmethod
    def create_success_response(
        cls,
        data: Any,
        language: Optional[Language] = None,
        message_key: str = "search_success",
    ) -> Dict[str, Any]:
        """Build a standard success payload carrying *data*.

        Args:
            data: Response data.
            language: Language; defaults to the context language.
            message_key: Success message key (default "search_success").

        Returns:
            Dict with ``success=True``, the data, and a localized message.
        """
        language = cls._resolve_language(language)

        return {
            "success": True,
            "data": data,
            "message": cls.get_success_message(message_key, language),
            "language": language.value,
        }

    @classmethod
    def create_status_response(
        cls,
        status_key: str,
        language: Optional[Language] = None,
        data: Optional[Any] = None,
    ) -> Dict[str, Any]:
        """Build a status payload; *data* is attached only when not None.

        Args:
            status_key: Status message key.
            language: Language; defaults to the context language.
            data: Optional response data.

        Returns:
            Dict with a localized status and the language code.
        """
        language = cls._resolve_language(language)

        response: Dict[str, Any] = {
            "status": cls.get_status_message(status_key, language),
            "language": language.value,
        }
        if data is not None:
            response["data"] = data
        return response
243
+
244
+
245
+ # 便捷函数
246
+ def get_language(language_str: Optional[str]) -> Language:
247
+ """获取语言枚举值"""
248
+ return I18nUtil.parse_language(language_str)
249
+
250
+
251
+ def get_error_message(key: str, language: Optional[Language] = None) -> str:
252
+ """获取错误消息"""
253
+ return I18nUtil.get_error_message(key, language)
254
+
255
+
256
+ def get_success_message(key: str, language: Optional[Language] = None) -> str:
257
+ """获取成功消息"""
258
+ return I18nUtil.get_success_message(key, language)
259
+
260
+
261
+ def get_status_message(key: str, language: Optional[Language] = None) -> str:
262
+ """获取状态消息"""
263
+ return I18nUtil.get_status_message(key, language)
264
+
265
+
266
+ def get_label_message(key: str, language: Optional[Language] = None) -> str:
267
+ """获取UI标签消息"""
268
+ return I18nUtil.get_label_message(key, language)
269
+
270
+
271
+ def get_system_message(key: str, language: Optional[Language] = None) -> str:
272
+ """获取系统消息"""
273
+ return I18nUtil.get_system_message(key, language)
274
+
275
+
276
+ def get_business_message(key: str, language: Optional[Language] = None) -> str:
277
+ """获取业务消息"""
278
+ return I18nUtil.get_business_message(key, language)
279
+
280
+
281
+ def create_error_response(
282
+ error_key: str,
283
+ language: Optional[Language] = None,
284
+ details: Optional[str] = None,
285
+ error_code: int = 400,
286
+ ) -> Dict[str, Any]:
287
+ """创建错误响应"""
288
+ return I18nUtil.create_error_response(error_key, language, details, error_code)
289
+
290
+
291
+ def create_success_response(
292
+ data: Any, language: Optional[Language] = None, message_key: str = "search_success"
293
+ ) -> Dict[str, Any]:
294
+ """创建成功响应"""
295
+ return I18nUtil.create_success_response(data, language, message_key)
296
+
297
+
298
+ def create_status_response(
299
+ status_key: str, language: Optional[Language] = None, data: Optional[Any] = None
300
+ ) -> Dict[str, Any]:
301
+ """创建状态响应"""
302
+ return I18nUtil.create_status_response(status_key, language, data)
python-services/Retrieve/utils/snowflake_id.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import threading
3
+ from typing import Optional
4
+
5
+
6
class SnowflakeIDGenerator:
    """Snowflake ID generator.

    64-bit layout:
      - sign bit: 1 bit, always 0
      - timestamp: 41 bits, milliseconds since a custom epoch
      - machine id: 10 bits (5-bit datacenter id + 5-bit worker id)
      - sequence: 12 bits, per-millisecond counter

    Properties: roughly time-ordered, unique across distinctly configured
    machines, thread-safe, high throughput.
    """

    def __init__(self, datacenter_id: int = 1, worker_id: int = 1, sequence: int = 0):
        """Initialize the generator.

        Args:
            datacenter_id: Datacenter id (0-31).
            worker_id: Worker id (0-31).
            sequence: Initial per-millisecond sequence value.

        Raises:
            ValueError: If datacenter_id or worker_id is out of range.
        """
        # Bit widths of each field.
        self.TIMESTAMP_BITS = 41
        self.DATACENTER_ID_BITS = 5
        self.WORKER_ID_BITS = 5
        self.SEQUENCE_BITS = 12

        # Per-field maxima: (1 << bits) - 1, i.e. all bits set.
        self.MAX_DATACENTER_ID = (1 << self.DATACENTER_ID_BITS) - 1
        self.MAX_WORKER_ID = (1 << self.WORKER_ID_BITS) - 1
        self.MAX_SEQUENCE = (1 << self.SEQUENCE_BITS) - 1

        # Left-shift offsets of each field within the final id.
        self.WORKER_ID_SHIFT = self.SEQUENCE_BITS
        self.DATACENTER_ID_SHIFT = self.SEQUENCE_BITS + self.WORKER_ID_BITS
        self.TIMESTAMP_LEFT_SHIFT = (
            self.SEQUENCE_BITS + self.WORKER_ID_BITS + self.DATACENTER_ID_BITS
        )

        # Validate machine identifiers before accepting them.
        if not 0 <= datacenter_id <= self.MAX_DATACENTER_ID:
            raise ValueError(
                f"Datacenter ID must be between 0 and {self.MAX_DATACENTER_ID}"
            )
        if not 0 <= worker_id <= self.MAX_WORKER_ID:
            raise ValueError(f"Worker ID must be between 0 and {self.MAX_WORKER_ID}")

        self.datacenter_id = datacenter_id
        self.worker_id = worker_id
        self.sequence = sequence

        # Custom epoch: 2023-01-01 00:00:00 UTC, in milliseconds.
        self.EPOCH = 1672531200000

        # Timestamp of the most recently issued id (-1 = none yet).
        self.last_timestamp = -1

        # Serializes id generation across threads.
        self.lock = threading.Lock()

    def _get_timestamp(self) -> int:
        """Return the current wall-clock time in whole milliseconds."""
        return int(time.time() * 1000)

    def _wait_for_next_millis(self, last_timestamp: int) -> int:
        """Busy-wait until the clock advances past *last_timestamp*.

        Args:
            last_timestamp: Millisecond timestamp to move beyond.

        Returns:
            The first observed timestamp greater than *last_timestamp*.
        """
        now = self._get_timestamp()
        while now <= last_timestamp:
            now = self._get_timestamp()
        return now

    def generate_id(self) -> int:
        """Produce the next snowflake id.

        Returns:
            A 64-bit snowflake id.

        Raises:
            RuntimeError: When the system clock has moved backwards.
        """
        with self.lock:
            now = self._get_timestamp()

            # A backwards-moving clock could re-issue earlier ids; refuse.
            if now < self.last_timestamp:
                raise RuntimeError(
                    f"Clock moved backwards. Refusing to generate id for {self.last_timestamp - now} milliseconds"
                )

            if now == self.last_timestamp:
                # Same millisecond: bump the sequence; on wrap-around,
                # spin until the next millisecond.
                self.sequence = (self.sequence + 1) & self.MAX_SEQUENCE
                if self.sequence == 0:
                    now = self._wait_for_next_millis(self.last_timestamp)
            else:
                # New millisecond: restart the sequence.
                self.sequence = 0

            self.last_timestamp = now

            # Assemble the id from its bit fields.
            return (
                ((now - self.EPOCH) << self.TIMESTAMP_LEFT_SHIFT)
                | (self.datacenter_id << self.DATACENTER_ID_SHIFT)
                | (self.worker_id << self.WORKER_ID_SHIFT)
                | self.sequence
            )

    def generate_id_str(self) -> str:
        """Produce the next snowflake id as a decimal string."""
        return str(self.generate_id())

    def parse_id(self, snowflake_id: int) -> dict:
        """Decompose *snowflake_id* into its constituent fields.

        Args:
            snowflake_id: An id produced with this generator's layout.

        Returns:
            Dict with "timestamp" (epoch ms), "datacenter_id", "worker_id",
            "sequence", and a human-readable local-time "datetime" string.
        """
        millis = (snowflake_id >> self.TIMESTAMP_LEFT_SHIFT) + self.EPOCH
        return {
            "timestamp": millis,
            "datacenter_id": (snowflake_id >> self.DATACENTER_ID_SHIFT)
            & self.MAX_DATACENTER_ID,
            "worker_id": (snowflake_id >> self.WORKER_ID_SHIFT) & self.MAX_WORKER_ID,
            "sequence": snowflake_id & self.MAX_SEQUENCE,
            "datetime": time.strftime(
                "%Y-%m-%d %H:%M:%S", time.localtime(millis / 1000)
            ),
        }
171
+
172
+
173
# Lazily created process-wide generator, guarded by _generator_lock.
_snowflake_generator: Optional[SnowflakeIDGenerator] = None
_generator_lock = threading.Lock()


def get_snowflake_generator(
    datacenter_id: int = 1, worker_id: int = 1
) -> SnowflakeIDGenerator:
    """Return the shared generator, creating it on first use.

    Args:
        datacenter_id: Datacenter id used ONLY on the very first call.
        worker_id: Worker id used ONLY on the very first call.

    Returns:
        The process-wide SnowflakeIDGenerator instance.
    """
    global _snowflake_generator

    # Double-checked locking: cheap unsynchronized read first, then
    # re-check under the lock before constructing.
    if _snowflake_generator is None:
        with _generator_lock:
            if _snowflake_generator is None:
                _snowflake_generator = SnowflakeIDGenerator(datacenter_id, worker_id)

    return _snowflake_generator


def generate_snowflake_id() -> int:
    """Generate a 64-bit snowflake id with the default shared generator."""
    return get_snowflake_generator().generate_id()


def generate_snowflake_id_str() -> str:
    """Generate a string-form snowflake id with the default shared generator."""
    return get_snowflake_generator().generate_id_str()


def parse_snowflake_id(snowflake_id: int) -> dict:
    """Decompose *snowflake_id* using the shared generator's bit layout.

    Args:
        snowflake_id: A snowflake id.

    Returns:
        Dict of parsed fields (see SnowflakeIDGenerator.parse_id).
    """
    return get_snowflake_generator().parse_id(snowflake_id)


# Short aliases.
def snowflake_id() -> int:
    """Convenience alias for generate_snowflake_id()."""
    return generate_snowflake_id()


def snowflake_id_str() -> str:
    """Convenience alias for generate_snowflake_id_str()."""
    return generate_snowflake_id_str()
python-services/Retrieve/utils/token_util.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tiktoken
2
+
3
+
4
def num_tokens_from_messages(messages, model="gpt-4o"):
    """Return the number of tokens consumed by a list of chat messages.

    Args:
        messages (list): Chat messages as dicts of role/name/content strings.
        model (str): Model name used to select the tokenizer and the
            per-message framing constants.

    Returns:
        int: Token count including per-message and reply-priming overhead.

    Raises:
        NotImplementedError: For models without known framing constants.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")

    # Alias newer model names onto versions with known framing constants.
    if model == "gpt-3.5-turbo":
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301")
    if model == "gpt-4o":
        return num_tokens_from_messages(messages, model="gpt-4-0314")

    if model == "gpt-3.5-turbo-0301":
        # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_message = 4
        # if there's a name, the role is omitted
        tokens_per_name = -1
    elif model == "gpt-4-0314":
        tokens_per_message = 3
        tokens_per_name = 1
    else:
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )

    total = 0
    for message in messages:
        total += tokens_per_message
        for key, value in message.items():
            total += len(encoding.encode(value))
            if key == "name":
                total += tokens_per_name
    # every reply is primed with <|start|>assistant<|message|>
    total += 3
    return total
45
+
46
+
47
def num_tokens_from_text(text: str, model: str = "gpt-4o") -> int:
    """Return the number of tokens in *text*.

    Args:
        text (str): The text to tokenize; falsy input counts as 0 tokens.
        model (str): Model name used to select the tokenizer; unknown
            models fall back to the cl100k_base encoding.

    Returns:
        int: Token count of *text*.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    if not text:
        return 0
    return len(encoding.encode(text))
requirements.txt CHANGED
@@ -12,4 +12,21 @@ pymupdf>=1.25.4
12
  python-dotenv>=1.1.0
13
  streamlit>=1.44.1
14
  nest-asyncio>=1.6.0
15
- fastapi
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  python-dotenv>=1.1.0
13
  streamlit>=1.44.1
14
  nest-asyncio>=1.6.0
15
+ asgi_correlation_id==4.3.4
16
+ fastapi==0.115.12
17
+ uvicorn==0.34.0
18
+ loguru==0.7.3
19
+ pyyaml==6.0.2
20
+ httpx==0.28.1
21
+ requests==2.32.3
22
+ biopython==1.85
23
+ openpyxl==3.1.5
24
+ openai==1.86.0
25
+ openai-agents==0.0.17
26
+ pandas==2.2.3
27
+ pymilvus==2.5.8
28
+ crawl4ai==0.7.0
29
+ aiohttp==3.11.18
30
+ beautifulsoup4==4.12.3
31
+ tiktoken==0.9.0
32
+ fastapi-mcp==0.4.0
requirements_back.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ faiss-cpu>=1.10.0
2
+ jupyter>=1.1.1
3
+ langchain-anthropic>=0.3.10
4
+ langchain-community>=0.3.20
5
+ langchain-mcp-adapters==0.1.9
6
+ langchain-openai>=0.3.11
7
+ langgraph>=0.3.21
8
+ mcp>=1.6.0
9
+ fastmcp
10
+ notebook>=7.3.3
11
+ pymupdf>=1.25.4
12
+ python-dotenv>=1.1.0
13
+ streamlit>=1.44.1
14
+ nest-asyncio>=1.6.0
15
+ fastapi