mgbam committed
Commit 2689723 · verified · 1 Parent(s): a9868dc

Upload 4 files

Files changed (4)
  1. genesis/__init__.py +1 -0
  2. genesis/pipeline.py +106 -0
  3. genesis/safety.py +40 -0
  4. genesis/tools.py +111 -0
genesis/__init__.py ADDED
@@ -0,0 +1 @@
+ from .pipeline import research_once
genesis/pipeline.py ADDED
@@ -0,0 +1,106 @@
+
+ from __future__ import annotations
+ import os, json
+ from typing import Any, Dict, List
+ from pydantic import BaseModel
+
+ from openai import AsyncOpenAI
+ from agents import Agent, Runner, RunConfig, WebSearchTool, HostedMCPTool
+
+ from .safety import SafetyGuard
+ from .tools import OntologyTool, PubMedTool, StructureTool, CrossrefTool
+
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY","")
+ OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL","https://api.openai.com/v1")
+ GENESIS_DISABLE_TRACING = os.getenv("GENESIS_DISABLE_TRACING","1")
+ os.environ["OPENAI_AGENTS_DISABLE_TRACING"] = GENESIS_DISABLE_TRACING
+
+ client = AsyncOpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL, timeout=600.0)
+
+ DEEP_MODEL_PRIMARY = os.getenv("GENESIS_DEEP_MODEL", "o3-deep-research")
+ DEEP_MODEL_FAST = os.getenv("GENESIS_DEEP_FAST_MODEL", "o4-mini-deep-research")
+ INSTRUCTION_MODEL = os.getenv("GENESIS_INSTRUCTION_MODEL", "gpt-4o-mini")
+ TRIAGE_MODEL = os.getenv("GENESIS_TRIAGE_MODEL", "gpt-4o-mini")
+ CLARIFY_MODEL = os.getenv("GENESIS_CLARIFY_MODEL", "gpt-4o-mini")
+ MCP_URL = os.getenv("GENESIS_MCP_URL")
+
+ safety_guard = SafetyGuard()
+
+ class Clarifications(BaseModel):
+     questions: List[str]
+
+ CLARIFY_PROMPT = """
+ Ask at most 3 essential questions to improve a high-level synthetic biology research brief.
+ Focus only on: organism/system, target (gene/protein/pathway), timeframe, preferred outputs.
+ Never request operational lab details. Friendly tone.
+ """
+
+ INSTRUCTION_PROMPT = """
+ Rewrite the user query into detailed DEEP RESEARCH instructions in English.
+ OUTPUT ONLY the instructions.
+ Include dimensions: organism/system, target, scope/timeframe, evaluation axes, required tables.
+ Format requested output as a report with headers: Abstract, Background, Findings, Synthesis, Open Questions,
+ Limitations, Risk & Ethics, References. Prefer primary literature (PubMed/Crossref) and databases (UMLS/BioPortal/RCSB).
+ Strictly avoid operational wet-lab protocols.
+ """
+
+ base_tools = [WebSearchTool(), OntologyTool(), PubMedTool(), StructureTool(), CrossrefTool()]
+ if MCP_URL:
+     base_tools.append(HostedMCPTool(tool_config={"type":"mcp","server_label":"file_search","server_url":MCP_URL,"require_approval":"never"}))
+
+ research_agent = Agent(
+     name="Synthetic Biology Research Agent",
+     model=DEEP_MODEL_PRIMARY,
+     instructions=("Perform high-level empirical research with citations. Use tools judiciously. "
+                   "NEVER produce step-by-step lab instructions or protocols."),
+     tools=base_tools,
+ )
+
+ instruction_agent = Agent(
+     name="Research Instruction Agent",
+     model=INSTRUCTION_MODEL,
+     instructions=INSTRUCTION_PROMPT,
+     handoffs=[research_agent],
+ )
+
+ clarifying_agent = Agent(
+     name="Clarifying Questions Agent",
+     model=CLARIFY_MODEL,
+     instructions=CLARIFY_PROMPT,
+     output_type=Clarifications,
+     handoffs=[instruction_agent],
+ )
+
+ triage_agent = Agent(
+     name="Triage Agent",
+     model=TRIAGE_MODEL,
+     instructions=("If the user query lacks essential context, handoff to Clarifying Questions Agent; "
+                   "otherwise handoff to Research Instruction Agent. Return EXACTLY one function call."),
+     handoffs=[clarifying_agent, instruction_agent],
+ )
+
+ async def research_once(query: str, fast: bool=False) -> Dict[str, Any]:
+     dec = safety_guard.gate(query)
+     if not dec.allowed:
+         query = "SAFE-ONLY REVIEW: " + query + "\nOnly produce high-level literature synthesis with citations."
+     if fast and research_agent.model != DEEP_MODEL_FAST:
+         research_agent.model = DEEP_MODEL_FAST
+
+     stream = Runner.run_streamed(triage_agent, query, run_config=RunConfig(tracing_disabled=True))
+     async for _ in stream.stream_events():
+         pass
+     final_text = stream.final_output
+
+     citations = []
+     try:
+         for item in reversed(stream.new_items):
+             if item.type == "message_output_item":
+                 for content in getattr(item.raw_item, "content", []):
+                     for ann in getattr(content, "annotations", []):
+                         if getattr(ann, "type", None) == "url_citation":
+                             citations.append({"title": getattr(ann,"title",""), "url": getattr(ann,"url","")})
+                 break
+     except Exception:
+         pass
+
+     return {"final_output": final_text, "citations": citations}
genesis/safety.py ADDED
@@ -0,0 +1,40 @@
+
+ from __future__ import annotations
+ import re
+ from pydantic import BaseModel
+ from typing import List
+
+ DUAL_USE_PATTERNS = [
+     r"step-?by-?step",
+     r"protocol",
+     r"wet[- ]?lab",
+     r"culture\s*conditions",
+     r"viral\s*vector",
+     r"pathogen",
+     r"gain[- ]of[- ]function",
+     r"increase\s*virulence",
+     r"synthesis\s*of\s*toxin",
+     r"biosafety\s*level\s*(2|3|4)",
+     r"kill\s*curve",
+     r"CFU|colony\s*forming\s*units",
+ ]
+
+ class SafetyDecision(BaseModel):
+     allowed: bool
+     rationale: str
+     redactions: List[str] = []
+
+ class SafetyGuard:
+     # Block operational/dual-use outputs; allow high-level literature review only.
+     def __init__(self) -> None:
+         pass
+
+     def gate(self, text: str) -> SafetyDecision:
+         hits = [p for p in DUAL_USE_PATTERNS if re.search(p, text, flags=re.I)]
+         if hits:
+             return SafetyDecision(
+                 allowed=False,
+                 rationale="Operational/dual-use intent detected. Only high-level review permitted.",
+                 redactions=hits,
+             )
+         return SafetyDecision(allowed=True, rationale="High-level research intent.")
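
A short illustration (not included in the commit) of how SafetyGuard.gate classifies queries; both example strings below are hypothetical.

# Illustrative check of SafetyGuard.gate; example queries are hypothetical.
from genesis.safety import SafetyGuard

guard = SafetyGuard()

ok = guard.gate("Review of promoter engineering strategies in E. coli")
blocked = guard.gate("Give me a step-by-step wet-lab protocol for plasmid assembly")

print(ok.allowed, ok.rationale)             # True  "High-level research intent."
print(blocked.allowed, blocked.redactions)  # False ['step-?by-?step', 'protocol', 'wet[- ]?lab']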
genesis/tools.py ADDED
@@ -0,0 +1,111 @@
+
+ from __future__ import annotations
+ import os, json, re
+ import httpx
+ from typing import Any, Dict, Optional, List
+
+ class ToolBase:
+     name: str = "tool"
+     description: str = ""
+
+     async def call(self, *args, **kwargs) -> Dict[str, Any]:
+         raise NotImplementedError
+
+ class OntologyTool(ToolBase):
+     name = "ontology_normalize"
+     description = "Normalize biomedical terms via BioPortal; returns concept info (no protocols)."
+
+     def __init__(self, timeout: float = 20.0):
+         self.http = httpx.AsyncClient(timeout=timeout)
+         self.bioportal_key = os.getenv("BIOPORTAL_API_KEY")
+
+     async def call(self, term: str) -> dict:
+         out = {"term": term, "bioportal": None}
+         try:
+             if self.bioportal_key:
+                 r = await self.http.get(
+                     "https://data.bioontology.org/search",
+                     params={"q": term, "pagesize": 5},
+                     headers={"Authorization": f"apikey token={self.bioportal_key}"}
+                 )
+                 out["bioportal"] = r.json()
+         except Exception as e:
+             out["bioportal_error"] = str(e)
+         return out
+
+ class PubMedTool(ToolBase):
+     name = "pubmed_search"
+     description = "Search PubMed via NCBI; return metadata with citations."
+
+     def __init__(self, timeout: float = 20.0):
+         self.http = httpx.AsyncClient(timeout=timeout)
+         self.key = os.getenv("NCBI_API_KEY")
+         self.email = os.getenv("NCBI_EMAIL")
+
+     async def call(self, query: str) -> dict:
+         base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
+         try:
+             es = await self.http.get(
+                 base + "esearch.fcgi",
+                 params={"db":"pubmed","term":query,"retmode":"json","retmax":20,"api_key":self.key,"email":self.email}
+             )
+             ids = es.json().get("esearchresult",{}).get("idlist",[])
+             if not ids: return {"query":query,"results":[]}
+             su = await self.http.get(
+                 base + "esummary.fcgi",
+                 params={"db":"pubmed","id":",".join(ids),"retmode":"json","api_key":self.key,"email":self.email}
+             )
+             recs = su.json().get("result",{})
+             items = []
+             for pmid in ids:
+                 r = recs.get(pmid, {})
+                 items.append({
+                     "pmid": pmid,
+                     "title": r.get("title"),
+                     "journal": r.get("fulljournalname"),
+                     "year": (r.get("pubdate") or "")[:4],
+                     "authors": [a.get("name") for a in r.get("authors",[])],
+                 })
+             return {"query":query,"results":items}
+         except Exception as e:
+             return {"query":query,"error":str(e)}
+
+ class StructureTool(ToolBase):
+     name = "structure_info"
+     description = "Query RCSB structure metadata (no lab steps)."
+
+     def __init__(self, timeout: float = 20.0):
+         self.http = httpx.AsyncClient(timeout=timeout)
+
+     async def call(self, pdb_id: str) -> dict:
+         out = {"pdb_id": pdb_id}
+         try:
+             r = await self.http.get(f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}")
+             r.raise_for_status()
+             out["rcsb_core"] = r.json()
+         except Exception as e:
+             out["error"] = str(e)
+         return out
+
+ class CrossrefTool(ToolBase):
+     name = "crossref_search"
+     description = "Crossref search for DOIs; titles, years, authors."
+
+     def __init__(self, timeout: float = 20.0):
+         self.http = httpx.AsyncClient(timeout=timeout)
+
+     async def call(self, query: str) -> dict:
+         try:
+             r = await self.http.get("https://api.crossref.org/works", params={"query":query,"rows":10})
+             items = r.json().get("message",{}).get("items",[])
+             papers = []
+             for it in items:
+                 papers.append({
+                     "title": (it.get("title") or [None])[0],
+                     "doi": it.get("DOI"),
+                     "year": (it.get("issued") or {}).get("date-parts", [[None]])[0][0],
+                     "authors": [f"{a.get('given','')} {a.get('family','')}".strip() for a in it.get("author",[])],
+                 })
+             return {"query":query,"results":papers}
+         except Exception as e:
+             return {"query":query,"error":str(e)}