mgbam committed on
Commit
e1444b4
·
verified ·
1 Parent(s): e22ad8c

Update genesis/pipeline.py

Browse files
Files changed (1) hide show
  1. genesis/pipeline.py +53 -100
genesis/pipeline.py CHANGED
@@ -1,106 +1,59 @@
1
-
2
  from __future__ import annotations
3
  import os, json
 
 
4
  from typing import Any, Dict, List
5
- from pydantic import BaseModel
6
-
7
- from openai import AsyncOpenAI
8
- from agents import Agent, Runner, RunConfig, WebSearchTool, HostedMCPTool
9
-
10
- from .safety import SafetyGuard
11
- from .tools import OntologyTool, PubMedTool, StructureTool, CrossrefTool
12
-
13
# --- OpenAI / agents configuration (every value overridable via environment) ---
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY","")
OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL","https://api.openai.com/v1")
GENESIS_DISABLE_TRACING = os.getenv("GENESIS_DISABLE_TRACING","1")
# Propagate the tracing switch to the agents SDK before any Agent is constructed.
os.environ["OPENAI_AGENTS_DISABLE_TRACING"] = GENESIS_DISABLE_TRACING

# Long timeout: deep-research runs can stream for many minutes.
client = AsyncOpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL, timeout=600.0)

# Per-stage model selection, each overridable via env.
DEEP_MODEL_PRIMARY = os.getenv("GENESIS_DEEP_MODEL", "o3-deep-research")
DEEP_MODEL_FAST = os.getenv("GENESIS_DEEP_FAST_MODEL", "o4-mini-deep-research")
INSTRUCTION_MODEL = os.getenv("GENESIS_INSTRUCTION_MODEL", "gpt-4o-mini")
TRIAGE_MODEL = os.getenv("GENESIS_TRIAGE_MODEL", "gpt-4o-mini")
CLARIFY_MODEL = os.getenv("GENESIS_CLARIFY_MODEL", "gpt-4o-mini")
# Optional hosted-MCP server URL; when unset, the MCP tool is simply not attached.
MCP_URL = os.getenv("GENESIS_MCP_URL")

# Module-level guard used by research_once() to gate incoming queries.
safety_guard = SafetyGuard()

class Clarifications(BaseModel):
    """Structured output of the clarifying agent: at most 3 follow-up questions."""
    questions: List[str]

CLARIFY_PROMPT = """
Ask at most 3 essential questions to improve a high-level synthetic biology research brief.
Focus only on: organism/system, target (gene/protein/pathway), timeframe, preferred outputs.
Never request operational lab details. Friendly tone.
"""

INSTRUCTION_PROMPT = """
Rewrite the user query into detailed DEEP RESEARCH instructions in English.
OUTPUT ONLY the instructions.
Include dimensions: organism/system, target, scope/timeframe, evaluation axes, required tables.
Format requested output as a report with headers: Abstract, Background, Findings, Synthesis, Open Questions,
Limitations, Risk & Ethics, References. Prefer primary literature (PubMed/Crossref) and databases (UMLS/BioPortal/RCSB).
Strictly avoid operational wet-lab protocols.
"""

# Tools shared by the research agent; the hosted MCP file-search tool is
# appended only when a server URL is configured.
base_tools = [WebSearchTool(), OntologyTool(), PubMedTool(), StructureTool(), CrossrefTool()]
if MCP_URL:
    base_tools.append(HostedMCPTool(tool_config={"type":"mcp","server_label":"file_search","server_url":MCP_URL,"require_approval":"never"}))

# Terminal agent of the handoff chain: performs the actual deep research.
research_agent = Agent(
    name="Synthetic Biology Research Agent",
    model=DEEP_MODEL_PRIMARY,
    instructions=("Perform high-level empirical research with citations. Use tools judiciously. "
                  "NEVER produce step-by-step lab instructions or protocols."),
    tools=base_tools,
)

# Rewrites the user query into detailed research instructions, then hands off
# to the research agent.
instruction_agent = Agent(
    name="Research Instruction Agent",
    model=INSTRUCTION_MODEL,
    instructions=INSTRUCTION_PROMPT,
    handoffs=[research_agent],
)

# Asks up to 3 clarifying questions (typed output) before instruction writing.
clarifying_agent = Agent(
    name="Clarifying Questions Agent",
    model=CLARIFY_MODEL,
    instructions=CLARIFY_PROMPT,
    output_type=Clarifications,
    handoffs=[instruction_agent],
)

# Entry point: routes the query either to clarification or straight to
# instruction writing. Chain: triage -> (clarify ->) instruction -> research.
triage_agent = Agent(
    name="Triage Agent",
    model=TRIAGE_MODEL,
    instructions=("If the user query lacks essential context, handoff to Clarifying Questions Agent; "
                  "otherwise handoff to Research Instruction Agent. Return EXACTLY one function call."),
    handoffs=[clarifying_agent, instruction_agent],
)
81
-
82
async def research_once(query: str, fast: bool=False) -> Dict[str, Any]:
    """Run the triage -> (clarify | instruction) -> research chain once.

    Args:
        query: Free-text research question from the user.
        fast: When True, run the research agent on the fast deep-research model.

    Returns:
        Dict with "final_output" (the report text) and "citations"
        (list of {"title", "url"} dicts harvested from url_citation annotations).
    """
    dec = safety_guard.gate(query)
    if not dec.allowed:
        # Blocked queries are downgraded to a literature-review-only prompt
        # rather than rejected outright.
        query = "SAFE-ONLY REVIEW: " + query + "\nOnly produce high-level literature synthesis with citations."
    # BUGFIX: the previous code switched the shared agent to the fast model on
    # the first fast=True call and never switched back, so every subsequent
    # fast=False call silently kept using the fast model. Select explicitly.
    desired_model = DEEP_MODEL_FAST if fast else DEEP_MODEL_PRIMARY
    if research_agent.model != desired_model:
        research_agent.model = desired_model

    stream = Runner.run_streamed(triage_agent, query, run_config=RunConfig(tracing_disabled=True))
    # Drain the event stream; we only need the final aggregated output.
    async for _ in stream.stream_events():
        pass
    final_text = stream.final_output

    citations: List[Dict[str, str]] = []
    try:
        # Walk items newest-first and harvest url_citation annotations from
        # the last message item only (break after processing it).
        for item in reversed(stream.new_items):
            if item.type == "message_output_item":
                for content in getattr(item.raw_item, "content", []):
                    for ann in getattr(content, "annotations", []):
                        if getattr(ann, "type", None) == "url_citation":
                            citations.append({"title": getattr(ann, "title", ""), "url": getattr(ann, "url", "")})
                break
    except Exception:
        # Citation harvesting is best-effort; never fail the run over it.
        pass

    return {"final_output": final_text, "citations": citations}
 
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
  import os, json
3
+ import httpx
4
+ import google.generativeai as genai
5
  from typing import Any, Dict, List
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
+ # Optional post-processors for polishing final summaries (NO lab steps)
8
+
9
async def gemini_postprocess(text: str, citations: List[dict]) -> str:
    """Polish a finished high-level synthesis with Gemini (best-effort).

    Args:
        text: The synthesis to polish.
        citations: Citation dicts (currently unused; kept for a uniform
            post-processor signature).

    Returns:
        The polished text, or *text* unchanged when GEMINI_API_KEY is unset
        or the API call fails for any reason.
    """
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        return text
    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel("gemini-1.5-flash")
        prompt = (
            "Polish the following high-level scientific synthesis for clarity and flow. "
            "Do NOT add wet-lab procedures or operational details. "
            "Maintain citations list context.\n\n" + text
        )
        # BUGFIX: the SDK exposes the async call directly as
        # generate_content_async(); the previous `model.asynchronous.…`
        # attribute does not exist and raised AttributeError on every call.
        resp = await model.generate_content_async(prompt)
        return resp.text or text
    except Exception:
        # Degrade gracefully on any SDK/network failure, mirroring
        # deepseek_postprocess below.
        return text
23
+
24
async def deepseek_postprocess(text: str, citations: List[dict]) -> str:
    """Polish a finished synthesis via an OpenAI-compatible DeepSeek endpoint.

    Args:
        text: The synthesis to polish.
        citations: Citation dicts (currently unused; kept for a uniform
            post-processor signature).

    Returns:
        The polished text, or *text* unchanged when DEEPSEEK_BASE_URL /
        DEEPSEEK_API_KEY are unset or on any request/parse failure.
    """
    base = os.getenv("DEEPSEEK_BASE_URL")
    key = os.getenv("DEEPSEEK_API_KEY")
    if not base or not key:
        return text
    try:
        async with httpx.AsyncClient(timeout=60.0) as http:
            r = await http.post(
                f"{base.rstrip('/')}/v1/chat/completions",
                headers={"Authorization": f"Bearer {key}", "Content-Type": "application/json"},
                json={
                    "model": os.getenv("DEEPSEEK_MODEL", "deepseek-chat"),
                    "messages": [
                        {"role": "system", "content": "You are a scientific editor. Never add lab protocols."},
                        {"role": "user", "content": (
                            "Polish the following high-level synthesis without adding operational details.\n\n" + text
                        )},
                    ],
                    "temperature": 0.3,
                },
            )
            # Surface non-2xx responses as exceptions so an HTML error page is
            # not fed to the JSON parser below.
            r.raise_for_status()
        data = r.json()
        content = data.get("choices", [{}])[0].get("message", {}).get("content")
        # `or text` also covers an explicit `"content": null` in the payload,
        # which dict.get(..., default) would have passed through as None.
        return content or text
    except Exception:
        # Any failure (network, HTTP status, malformed payload) degrades
        # gracefully to the unpolished text.
        return text
52
+
53
async def postprocess_summary(base_text: str, citations: List[dict], engine: str = "none") -> str:
    """Optionally route *base_text* through a polishing engine.

    Engine matching is case-insensitive; anything other than "gemini" or
    "deepseek" (including None or empty) returns the text untouched.
    """
    choice = (engine or "none").lower()
    if choice == "gemini":
        result = await gemini_postprocess(base_text, citations)
    elif choice == "deepseek":
        result = await deepseek_postprocess(base_text, citations)
    else:
        result = base_text
    return result