mgbam commited on
Commit
e2c04b6
Β·
verified Β·
1 Parent(s): 86b948e

Update genesis/pipeline.py

Browse files
Files changed (1) hide show
  1. genesis/pipeline.py +98 -28
genesis/pipeline.py CHANGED
@@ -1,20 +1,31 @@
1
- ## `genesis/pipeline.py`
2
-
3
  from __future__ import annotations
4
- import os, json
 
5
  from typing import Any, Dict, List
6
  from pydantic import BaseModel
7
 
 
8
  from openai import AsyncOpenAI
9
  from agents import Agent, Runner, RunConfig, WebSearchTool, HostedMCPTool
10
 
11
  from .safety import SafetyGuard
12
- from .tools import OntologyTool, PubMedTool, StructureTool, CrossrefTool, HFRerankTool
 
 
 
 
 
 
 
 
 
 
13
 
14
- # Env & client
15
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
16
  OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1")
17
  os.environ["OPENAI_AGENTS_DISABLE_TRACING"] = os.getenv("GENESIS_DISABLE_TRACING", "1")
 
 
18
  client = AsyncOpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL, timeout=600.0)
19
 
20
  DEEP_MODEL_PRIMARY = os.getenv("GENESIS_DEEP_MODEL", "o3-deep-research")
@@ -24,8 +35,16 @@ TRIAGE_MODEL = os.getenv("GENESIS_TRIAGE_MODEL", "gpt-4o-mini")
24
  CLARIFY_MODEL = os.getenv("GENESIS_CLARIFY_MODEL", "gpt-4o-mini")
25
  MCP_URL = os.getenv("GENESIS_MCP_URL")
26
 
 
 
 
 
27
  safety_guard = SafetyGuard()
28
 
 
 
 
 
29
  class Clarifications(BaseModel):
30
  questions: List[str]
31
 
@@ -38,18 +57,38 @@ CLARIFY_PROMPT = (
38
  INSTRUCTION_PROMPT = (
39
  "Rewrite the user query into detailed DEEP RESEARCH instructions in English. OUTPUT ONLY the instructions. "
40
  "Include dimensions: organism/system, target, scope/timeframe, evaluation axes, required tables. "
41
- "Format as a report with headers: Abstract, Background, Findings, Synthesis, Open Questions, Limitations, Risk & Ethics, References. "
42
- "Prefer primary literature (PubMed/Crossref) and databases (UMLS/BioPortal/RCSB). Strictly avoid wet-lab protocols."
 
43
  )
44
 
 
45
  # Tools
46
- base_tools = [WebSearchTool(), OntologyTool(), PubMedTool(), StructureTool(), CrossrefTool()]
 
 
 
 
 
 
 
 
47
  if MCP_URL:
48
- base_tools.append(HostedMCPTool(tool_config={
49
- "type": "mcp", "server_label": "file_search", "server_url": MCP_URL, "require_approval": "never"
50
- }))
51
-
 
 
 
 
 
 
 
 
52
  # Agents
 
 
53
  research_agent = Agent(
54
  name="Synthetic Biology Research Agent",
55
  model=DEEP_MODEL_PRIMARY,
@@ -85,7 +124,12 @@ triage_agent = Agent(
85
  handoffs=[clarifying_agent, instruction_agent],
86
  )
87
 
 
 
 
 
88
  async def _extract_citations(stream) -> List[Dict[str, str]]:
 
89
  citations: List[Dict[str, str]] = []
90
  try:
91
  for item in reversed(stream.new_items):
@@ -93,42 +137,68 @@ async def _extract_citations(stream) -> List[Dict[str, str]]:
93
  for content in getattr(item.raw_item, "content", []):
94
  for ann in getattr(content, "annotations", []):
95
  if getattr(ann, "type", None) == "url_citation":
96
- citations.append({"title": getattr(ann, "title", ""), "url": getattr(ann, "url", "")})
 
 
 
 
 
97
  break
98
  except Exception:
99
  pass
100
  return citations
101
 
102
- async def research_once(query: str, fast: bool = False, rerank_model: str | None = None) -> Dict[str, Any]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  # Safety gate input
104
- dec = safety_guard.gate(query)
105
- if not dec.allowed:
106
- query = "SAFE-ONLY REVIEW: " + query + "
107
- Only produce high-level literature synthesis with citations."
108
-
109
- # Switch to fast model if requested
 
 
 
110
  if fast and research_agent.model != DEEP_MODEL_FAST:
111
  research_agent.model = DEEP_MODEL_FAST
112
 
113
- # Run pipeline
114
- stream = Runner.run_streamed(triage_agent, query, run_config=RunConfig(tracing_disabled=True))
 
 
 
 
115
  async for _ in stream.stream_events():
116
  pass
117
 
118
  final_text = stream.final_output
119
  citations = await _extract_citations(stream)
120
 
121
- # Optional HF rerank to reorder citations by relevance to query
122
  if rerank_model and citations:
123
- from .tools import HFRerankTool
124
- rerank = HFRerankTool(model_id=rerank_model)
125
- docs = [c.get("title") or c.get("url", "") for c in citations]
126
  try:
127
- rr = await rerank.call(query, docs)
 
 
128
  order = rr.get("order")
129
  if order:
130
  citations = [citations[i] for i in order]
131
  except Exception:
 
132
  pass
133
 
134
- return {"final_output": final_text, "citations": citations}
 
 
 
1
  from __future__ import annotations
2
+
3
+ import os
4
  from typing import Any, Dict, List
5
  from pydantic import BaseModel
6
 
7
+ # OpenAI Agents SDK + Deep Research
8
  from openai import AsyncOpenAI
9
  from agents import Agent, Runner, RunConfig, WebSearchTool, HostedMCPTool
10
 
11
  from .safety import SafetyGuard
12
+ from .tools import (
13
+ OntologyTool,
14
+ PubMedTool,
15
+ StructureTool,
16
+ CrossrefTool,
17
+ HFRerankTool,
18
+ )
19
+
20
+ # ─────────────────────────────────────────────────────────────
21
+ # Environment & client
22
+ # ─────────────────────────────────────────────────────────────
23
 
 
24
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
25
  OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1")
26
  os.environ["OPENAI_AGENTS_DISABLE_TRACING"] = os.getenv("GENESIS_DISABLE_TRACING", "1")
27
+
28
+ # The AsyncOpenAI client is created for completeness; the Agents SDK uses your default client.
29
  client = AsyncOpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL, timeout=600.0)
30
 
31
  DEEP_MODEL_PRIMARY = os.getenv("GENESIS_DEEP_MODEL", "o3-deep-research")
 
35
  CLARIFY_MODEL = os.getenv("GENESIS_CLARIFY_MODEL", "gpt-4o-mini")
36
  MCP_URL = os.getenv("GENESIS_MCP_URL")
37
 
38
+ # ─────────────────────────────────────────────────────────────
39
+ # Safety
40
+ # ─────────────────────────────────────────────────────────────
41
+
42
  safety_guard = SafetyGuard()
43
 
44
+ # ─────────────────────────────────────────────────────────────
45
+ # Agent prompts
46
+ # ─────────────────────────────────────────────────────────────
47
+
48
  class Clarifications(BaseModel):
49
  questions: List[str]
50
 
 
57
  INSTRUCTION_PROMPT = (
58
  "Rewrite the user query into detailed DEEP RESEARCH instructions in English. OUTPUT ONLY the instructions. "
59
  "Include dimensions: organism/system, target, scope/timeframe, evaluation axes, required tables. "
60
+ "Format as a report with headers: Abstract, Background, Findings, Synthesis, Open Questions, "
61
+ "Limitations, Risk & Ethics, References. Prefer primary literature (PubMed/Crossref) and databases "
62
+ "(UMLS/BioPortal/RCSB). Strictly avoid wet-lab protocols."
63
  )
64
 
65
+ # ─────────────────────────────────────────────────────────────
66
  # Tools
67
+ # ─────────────────────────────────────────────────────────────
68
+
69
+ base_tools = [
70
+ WebSearchTool(),
71
+ OntologyTool(),
72
+ PubMedTool(),
73
+ StructureTool(),
74
+ CrossrefTool(),
75
+ ]
76
  if MCP_URL:
77
+ base_tools.append(
78
+ HostedMCPTool(
79
+ tool_config={
80
+ "type": "mcp",
81
+ "server_label": "file_search",
82
+ "server_url": MCP_URL,
83
+ "require_approval": "never",
84
+ }
85
+ )
86
+ )
87
+
88
+ # ─────────────────────────────────────────────────────────────
89
  # Agents
90
+ # ─────────────────────────────────────────────────────────────
91
+
92
  research_agent = Agent(
93
  name="Synthetic Biology Research Agent",
94
  model=DEEP_MODEL_PRIMARY,
 
124
  handoffs=[clarifying_agent, instruction_agent],
125
  )
126
 
127
+ # ────────────────────────────────────────��────────────────────
128
+ # Helpers
129
+ # ─────────────────────────────────────────────────────────────
130
+
131
  async def _extract_citations(stream) -> List[Dict[str, str]]:
132
+ """Extract URL citations from the final message, if any."""
133
  citations: List[Dict[str, str]] = []
134
  try:
135
  for item in reversed(stream.new_items):
 
137
  for content in getattr(item.raw_item, "content", []):
138
  for ann in getattr(content, "annotations", []):
139
  if getattr(ann, "type", None) == "url_citation":
140
+ citations.append(
141
+ {
142
+ "title": getattr(ann, "title", "") or "",
143
+ "url": getattr(ann, "url", "") or "",
144
+ }
145
+ )
146
  break
147
  except Exception:
148
  pass
149
  return citations
150
 
151
+ # ─────────────────────────────────────────────────────────────
152
+ # Public API
153
+ # ─────────────────────────────────────────────────────────────
154
+
155
+ async def research_once(
156
+ query: str,
157
+ fast: bool = False,
158
+ rerank_model: str | None = None,
159
+ ) -> Dict[str, Any]:
160
+ """
161
+ Run the triage β†’ (clarify|instruct) β†’ research pipeline once and return:
162
+ { "final_output": <str or structured>, "citations": [ {title, url}, ... ] }
163
+
164
+ Safety: if the input appears operational/dual-use, we transform it to a SAFE-ONLY prompt.
165
+ """
166
  # Safety gate input
167
+ decision = safety_guard.gate(query)
168
+ if not decision.allowed:
169
+ query = (
170
+ "SAFE-ONLY REVIEW: "
171
+ + query
172
+ + "\nOnly produce high-level literature synthesis with citations."
173
+ )
174
+
175
+ # Switch to fast deep-research model if requested
176
  if fast and research_agent.model != DEEP_MODEL_FAST:
177
  research_agent.model = DEEP_MODEL_FAST
178
 
179
+ # Run streamed; we just drain to completion here
180
+ stream = Runner.run_streamed(
181
+ triage_agent,
182
+ query,
183
+ run_config=RunConfig(tracing_disabled=True),
184
+ )
185
  async for _ in stream.stream_events():
186
  pass
187
 
188
  final_text = stream.final_output
189
  citations = await _extract_citations(stream)
190
 
191
+ # Optional: rerank citations with a HF reranker (if configured)
192
  if rerank_model and citations:
 
 
 
193
  try:
194
+ reranker = HFRerankTool(model_id=rerank_model)
195
+ docs = [c.get("title") or c.get("url", "") for c in citations]
196
+ rr = await reranker.call(query, docs)
197
  order = rr.get("order")
198
  if order:
199
  citations = [citations[i] for i in order]
200
  except Exception:
201
+ # Best-effort; keep original order on failure
202
  pass
203
 
204
+ return {"final_output": final_text, "citations": citations}