Spaces:

mgbam
/

Synthetic_Biology

Sleeping

App Files Files Community

mgbam committed 7 days ago

Commit

e2c04b6

verified ·

1 Parent(s): 86b948e

Update genesis/pipeline.py

Browse files

Files changed (1) hide show

genesis/pipeline.py +98 -28

genesis/pipeline.py CHANGED Viewed

@@ -1,20 +1,31 @@
-## `genesis/pipeline.py`
 from __future__ import annotations
-import os, json
 from typing import Any, Dict, List
 from pydantic import BaseModel
 from openai import AsyncOpenAI
 from agents import Agent, Runner, RunConfig, WebSearchTool, HostedMCPTool
 from .safety import SafetyGuard
-from .tools import OntologyTool, PubMedTool, StructureTool, CrossrefTool, HFRerankTool
-# Env & client
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
 OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1")
 os.environ["OPENAI_AGENTS_DISABLE_TRACING"] = os.getenv("GENESIS_DISABLE_TRACING", "1")
 client = AsyncOpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL, timeout=600.0)
 DEEP_MODEL_PRIMARY = os.getenv("GENESIS_DEEP_MODEL", "o3-deep-research")
@@ -24,8 +35,16 @@ TRIAGE_MODEL = os.getenv("GENESIS_TRIAGE_MODEL", "gpt-4o-mini")
 CLARIFY_MODEL = os.getenv("GENESIS_CLARIFY_MODEL", "gpt-4o-mini")
 MCP_URL = os.getenv("GENESIS_MCP_URL")
 safety_guard = SafetyGuard()
 class Clarifications(BaseModel):
     questions: List[str]
@@ -38,18 +57,38 @@ CLARIFY_PROMPT = (
 INSTRUCTION_PROMPT = (
     "Rewrite the user query into detailed DEEP RESEARCH instructions in English. OUTPUT ONLY the instructions. "
     "Include dimensions: organism/system, target, scope/timeframe, evaluation axes, required tables. "
-    "Format as a report with headers: Abstract, Background, Findings, Synthesis, Open Questions, Limitations, Risk & Ethics, References. "
-    "Prefer primary literature (PubMed/Crossref) and databases (UMLS/BioPortal/RCSB). Strictly avoid wet-lab protocols."
 )
 # Tools
-base_tools = [WebSearchTool(), OntologyTool(), PubMedTool(), StructureTool(), CrossrefTool()]
 if MCP_URL:
-    base_tools.append(HostedMCPTool(tool_config={
-        "type": "mcp", "server_label": "file_search", "server_url": MCP_URL, "require_approval": "never"
-    }))
 # Agents
 research_agent = Agent(
     name="Synthetic Biology Research Agent",
     model=DEEP_MODEL_PRIMARY,
@@ -85,7 +124,12 @@ triage_agent = Agent(
     handoffs=[clarifying_agent, instruction_agent],
 )
 async def _extract_citations(stream) -> List[Dict[str, str]]:
     citations: List[Dict[str, str]] = []
     try:
         for item in reversed(stream.new_items):
@@ -93,42 +137,68 @@ async def _extract_citations(stream) -> List[Dict[str, str]]:
                 for content in getattr(item.raw_item, "content", []):
                     for ann in getattr(content, "annotations", []):
                         if getattr(ann, "type", None) == "url_citation":
-                            citations.append({"title": getattr(ann, "title", ""), "url": getattr(ann, "url", "")})
                 break
     except Exception:
         pass
     return citations
-async def research_once(query: str, fast: bool = False, rerank_model: str | None = None) -> Dict[str, Any]:
     # Safety gate input
-    dec = safety_guard.gate(query)
-    if not dec.allowed:
-        query = "SAFE-ONLY REVIEW: " + query + "
-Only produce high-level literature synthesis with citations."
-    # Switch to fast model if requested
     if fast and research_agent.model != DEEP_MODEL_FAST:
         research_agent.model = DEEP_MODEL_FAST
-    # Run pipeline
-    stream = Runner.run_streamed(triage_agent, query, run_config=RunConfig(tracing_disabled=True))
     async for _ in stream.stream_events():
         pass
     final_text = stream.final_output
     citations = await _extract_citations(stream)
-    # Optional HF rerank to reorder citations by relevance to query
     if rerank_model and citations:
-        from .tools import HFRerankTool
-        rerank = HFRerankTool(model_id=rerank_model)
-        docs = [c.get("title") or c.get("url", "") for c in citations]
         try:
-            rr = await rerank.call(query, docs)
             order = rr.get("order")
             if order:
                 citations = [citations[i] for i in order]
         except Exception:
             pass
-    return {"final_output": final_text, "citations": citations}

 from __future__ import annotations
+import os
 from typing import Any, Dict, List
 from pydantic import BaseModel
+# OpenAI Agents SDK + Deep Research
 from openai import AsyncOpenAI
 from agents import Agent, Runner, RunConfig, WebSearchTool, HostedMCPTool
 from .safety import SafetyGuard
+from .tools import (
+    OntologyTool,
+    PubMedTool,
+    StructureTool,
+    CrossrefTool,
+    HFRerankTool,
+)
+# ─────────────────────────────────────────────────────────────
+# Environment & client
+# ─────────────────────────────────────────────────────────────
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
 OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1")
 os.environ["OPENAI_AGENTS_DISABLE_TRACING"] = os.getenv("GENESIS_DISABLE_TRACING", "1")
+# The AsyncOpenAI client is created for completeness; the Agents SDK uses your default client.
 client = AsyncOpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL, timeout=600.0)
 DEEP_MODEL_PRIMARY = os.getenv("GENESIS_DEEP_MODEL", "o3-deep-research")
 CLARIFY_MODEL = os.getenv("GENESIS_CLARIFY_MODEL", "gpt-4o-mini")
 MCP_URL = os.getenv("GENESIS_MCP_URL")
+# ─────────────────────────────────────────────────────────────
+# Safety
+# ─────────────────────────────────────────────────────────────
 safety_guard = SafetyGuard()
+# ─────────────────────────────────────────────────────────────
+# Agent prompts
+# ─────────────────────────────────────────────────────────────
 class Clarifications(BaseModel):
     questions: List[str]
 INSTRUCTION_PROMPT = (
     "Rewrite the user query into detailed DEEP RESEARCH instructions in English. OUTPUT ONLY the instructions. "
     "Include dimensions: organism/system, target, scope/timeframe, evaluation axes, required tables. "
+    "Format as a report with headers: Abstract, Background, Findings, Synthesis, Open Questions, "
+    "Limitations, Risk & Ethics, References. Prefer primary literature (PubMed/Crossref) and databases "
+    "(UMLS/BioPortal/RCSB). Strictly avoid wet-lab protocols."
 )
+# ─────────────────────────────────────────────────────────────
 # Tools
+# ─────────────────────────────────────────────────────────────
+base_tools = [
+    WebSearchTool(),
+    OntologyTool(),
+    PubMedTool(),
+    StructureTool(),
+    CrossrefTool(),
+]
 if MCP_URL:
+    base_tools.append(
+        HostedMCPTool(
+            tool_config={
+                "type": "mcp",
+                "server_label": "file_search",
+                "server_url": MCP_URL,
+                "require_approval": "never",
+            }
+        )
+    )
+# ─────────────────────────────────────────────────────────────
 # Agents
+# ─────────────────────────────────────────────────────────────
 research_agent = Agent(
     name="Synthetic Biology Research Agent",
     model=DEEP_MODEL_PRIMARY,
     handoffs=[clarifying_agent, instruction_agent],
 )
+# ─────────────────────────────────────────────────────────────
+# Helpers
+# ─────────────────────────────────────────────────────────────
 async def _extract_citations(stream) -> List[Dict[str, str]]:
+    """Extract URL citations from the final message, if any."""
     citations: List[Dict[str, str]] = []
     try:
         for item in reversed(stream.new_items):
                 for content in getattr(item.raw_item, "content", []):
                     for ann in getattr(content, "annotations", []):
                         if getattr(ann, "type", None) == "url_citation":
+                            citations.append(
+                                {
+                                    "title": getattr(ann, "title", "") or "",
+                                    "url": getattr(ann, "url", "") or "",
+                                }
+                            )
                 break
     except Exception:
         pass
     return citations
+# ─────────────────────────────────────────────────────────────
+# Public API
+# ─────────────────────────────────────────────────────────────
+async def research_once(
+    query: str,
+    fast: bool = False,
+    rerank_model: str | None = None,
+) -> Dict[str, Any]:
+    """
+    Run the triage → (clarify|instruct) → research pipeline once and return:
+      { "final_output": <str or structured>, "citations": [ {title, url}, ... ] }
+    Safety: if the input appears operational/dual-use, we transform it to a SAFE-ONLY prompt.
+    """
     # Safety gate input
+    decision = safety_guard.gate(query)
+    if not decision.allowed:
+        query = (
+            "SAFE-ONLY REVIEW: "
+            + query
+            + "\nOnly produce high-level literature synthesis with citations."
+        )
+    # Switch to fast deep-research model if requested
     if fast and research_agent.model != DEEP_MODEL_FAST:
         research_agent.model = DEEP_MODEL_FAST
+    # Run streamed; we just drain to completion here
+    stream = Runner.run_streamed(
+        triage_agent,
+        query,
+        run_config=RunConfig(tracing_disabled=True),
+    )
     async for _ in stream.stream_events():
         pass
     final_text = stream.final_output
     citations = await _extract_citations(stream)
+    # Optional: rerank citations with a HF reranker (if configured)
     if rerank_model and citations:
         try:
+            reranker = HFRerankTool(model_id=rerank_model)
+            docs = [c.get("title") or c.get("url", "") for c in citations]
+            rr = await reranker.call(query, docs)
             order = rr.get("order")
             if order:
                 citations = [citations[i] for i in order]
         except Exception:
+            # Best-effort; keep original order on failure
             pass
+    return {"final_output": final_text, "citations": citations}