Spaces:
Sleeping
Sleeping
Update genesis/pipeline.py
Browse files- genesis/pipeline.py +33 -24
genesis/pipeline.py
CHANGED
@@ -1,17 +1,20 @@
|
|
1 |
# genesis/pipeline.py
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
import os
|
3 |
import re
|
4 |
from datetime import datetime
|
5 |
from typing import Dict, Any, List
|
6 |
|
7 |
-
# API
|
8 |
from genesis.api_clients.pubmed_api import search_pubmed_literature
|
9 |
from genesis.api_clients.bioportal_api import expand_with_bioportal
|
10 |
from genesis.api_clients.umls_api import expand_with_umls
|
11 |
-
from genesis.
|
12 |
-
from genesis.api_clients.ncbi_api import fetch_ncbi_structure
|
13 |
-
from genesis.utils.pdf_export import export_report_to_pdf
|
14 |
-
|
15 |
|
16 |
# Core logic providers
|
17 |
from genesis.providers import (
|
@@ -23,26 +26,29 @@ from genesis.providers import (
|
|
23 |
run_hf_image,
|
24 |
narrate_text_elevenlabs
|
25 |
)
|
|
|
|
|
26 |
from genesis.utils.pdf_export import export_report_to_pdf
|
|
|
|
|
|
|
27 |
from genesis.visualization import generate_pathway_graph, generate_funding_network
|
|
|
|
|
28 |
from genesis.funding import fetch_funding_data
|
29 |
from genesis.trials import fetch_clinical_trials
|
30 |
from genesis.biosecurity import analyze_biosecurity_risks
|
31 |
from genesis.regulation import fetch_regulatory_info
|
32 |
from genesis.safety import analyze_safety_concerns
|
33 |
-
from genesis.structures import fetch_structures_for_terms
|
34 |
from genesis.ontology import merge_ontology_terms
|
35 |
-
from genesis.utils.graph_tools import write_topic_and_papers
|
36 |
|
37 |
# Environment vars
|
38 |
ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")
|
39 |
NEO4J_URI = os.getenv("NEO4J_URI")
|
40 |
-
NEO4J_USER = os.getenv("NEO4J_USER")
|
41 |
-
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
|
42 |
|
43 |
SYNBIO_MODE = True # Bias towards synthetic biology context
|
44 |
|
45 |
-
# Demo queries
|
46 |
DEMO_QUERIES = [
|
47 |
"CRISPR living therapeutics in clinical trials since 2020",
|
48 |
"AI-designed enzymes for plastic degradation β literature + pathways",
|
@@ -51,9 +57,10 @@ DEMO_QUERIES = [
|
|
51 |
"Oncolytic virus engineering β biosecurity risk analysis"
|
52 |
]
|
53 |
|
|
|
54 |
|
55 |
def extract_citations(text: str) -> List[Dict[str, str]]:
|
56 |
-
"""Extract
|
57 |
citations = []
|
58 |
doi_pattern = r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)"
|
59 |
pmid_pattern = r"PMID:\s*(\d+)"
|
@@ -66,12 +73,10 @@ def extract_citations(text: str) -> List[Dict[str, str]]:
|
|
66 |
for match in re.finditer(url_pattern, text, re.IGNORECASE):
|
67 |
if not any(c["url"] == match.group(1) for c in citations):
|
68 |
citations.append({"type": "URL", "id": "", "url": match.group(1)})
|
69 |
-
|
70 |
return citations
|
71 |
|
72 |
-
|
73 |
def inject_synbio_context(query: str, expanded_terms: List[str]) -> str:
|
74 |
-
"""
|
75 |
context = (
|
76 |
"You are an expert in synthetic biology, biosecurity, and regulatory affairs. "
|
77 |
"Provide literature review, molecular insights, market trends, and policy implications. "
|
@@ -80,9 +85,14 @@ def inject_synbio_context(query: str, expanded_terms: List[str]) -> str:
|
|
80 |
)
|
81 |
return f"{context}\n\nQuery: {query}\nExpanded terms: {', '.join(expanded_terms)}"
|
82 |
|
|
|
|
|
|
|
|
|
|
|
83 |
|
84 |
def multimodal_research(query: str, narration: bool = False, generate_pdf: bool = False) -> Dict[str, Any]:
|
85 |
-
"""Main
|
86 |
print(f"[Pipeline] Starting research for query: {query}")
|
87 |
|
88 |
# 1 — Expand query with ontology
|
@@ -93,10 +103,10 @@ def multimodal_research(query: str, narration: bool = False, generate_pdf: bool
|
|
93 |
)
|
94 |
print(f"[Pipeline] Expanded terms: {expanded_terms}")
|
95 |
|
96 |
-
# 2 β
|
97 |
enriched_query = inject_synbio_context(query, expanded_terms) if SYNBIO_MODE else query
|
98 |
|
99 |
-
# 3 — Summarization
|
100 |
summary = None
|
101 |
for summarizer in [run_deepseek_summary, run_gemini_summary, run_openai_summary]:
|
102 |
try:
|
@@ -110,13 +120,13 @@ def multimodal_research(query: str, narration: bool = False, generate_pdf: bool
|
|
110 |
if not summary:
|
111 |
summary = "No summary generated β please refine your query."
|
112 |
|
113 |
-
# 4 β
|
114 |
citations = extract_citations(summary)
|
115 |
if not citations:
|
116 |
print("[Pipeline] No citations in summary, querying PubMed...")
|
117 |
citations = search_pubmed_literature(query)
|
118 |
|
119 |
-
# 5 β
|
120 |
structures = fetch_structures_for_terms(expanded_terms)
|
121 |
|
122 |
# 6 — Image generation with fallback
|
@@ -130,7 +140,7 @@ def multimodal_research(query: str, narration: bool = False, generate_pdf: bool
|
|
130 |
except Exception as e:
|
131 |
print(f"[Pipeline] {img_fn.__name__} failed: {e}")
|
132 |
|
133 |
-
# 7 β Funding,
|
134 |
funding_data = fetch_funding_data(query) or []
|
135 |
trial_data = fetch_clinical_trials(query) or []
|
136 |
regulation_data = fetch_regulatory_info(query) or []
|
@@ -141,7 +151,7 @@ def multimodal_research(query: str, narration: bool = False, generate_pdf: bool
|
|
141 |
pathway_graph = generate_pathway_graph(query, expanded_terms) if expanded_terms else None
|
142 |
funding_graph = generate_funding_network(query, funding_data) if funding_data else None
|
143 |
|
144 |
-
# 9 — Save to Neo4j
|
145 |
if NEO4J_URI:
|
146 |
try:
|
147 |
write_topic_and_papers(query, citations, expanded_terms)
|
@@ -149,7 +159,7 @@ def multimodal_research(query: str, narration: bool = False, generate_pdf: bool
|
|
149 |
except Exception as e:
|
150 |
print(f"[Pipeline] Neo4j save failed: {e}")
|
151 |
|
152 |
-
# 10 β
|
153 |
audio_url = None
|
154 |
if narration and ELEVEN_LABS_API_KEY:
|
155 |
try:
|
@@ -158,7 +168,7 @@ def multimodal_research(query: str, narration: bool = False, generate_pdf: bool
|
|
158 |
except Exception as e:
|
159 |
print(f"[Pipeline] Narration failed: {e}")
|
160 |
|
161 |
-
# 11 β
|
162 |
pdf_path = None
|
163 |
if generate_pdf:
|
164 |
try:
|
@@ -167,7 +177,6 @@ def multimodal_research(query: str, narration: bool = False, generate_pdf: bool
|
|
167 |
except Exception as e:
|
168 |
print(f"[Pipeline] PDF generation failed: {e}")
|
169 |
|
170 |
-
# 12 — Build output
|
171 |
return {
|
172 |
"timestamp": datetime.utcnow().isoformat(),
|
173 |
"query": query,
|
|
|
1 |
# genesis/pipeline.py
|
2 |
+
"""
|
3 |
+
GENESIS-AI — Multimodal Synthetic Biology Research Pipeline
|
4 |
+
Coordinates ontology expansion, literature review, AI summarization, image generation,
|
5 |
+
funding intelligence, safety/biosecurity checks, and report export.
|
6 |
+
"""
|
7 |
+
|
8 |
import os
|
9 |
import re
|
10 |
from datetime import datetime
|
11 |
from typing import Dict, Any, List
|
12 |
|
13 |
+
# API clients
|
14 |
from genesis.api_clients.pubmed_api import search_pubmed_literature
|
15 |
from genesis.api_clients.bioportal_api import expand_with_bioportal
|
16 |
from genesis.api_clients.umls_api import expand_with_umls
|
17 |
+
from genesis.structures import fetch_structures_for_terms
|
|
|
|
|
|
|
18 |
|
19 |
# Core logic providers
|
20 |
from genesis.providers import (
|
|
|
26 |
run_hf_image,
|
27 |
narrate_text_elevenlabs
|
28 |
)
|
29 |
+
|
30 |
+
# Utility modules
|
31 |
from genesis.utils.pdf_export import export_report_to_pdf
|
32 |
+
from genesis.utils.graph_tools import write_topic_and_papers
|
33 |
+
|
34 |
+
# Visualizations
|
35 |
from genesis.visualization import generate_pathway_graph, generate_funding_network
|
36 |
+
|
37 |
+
# Data sources
|
38 |
from genesis.funding import fetch_funding_data
|
39 |
from genesis.trials import fetch_clinical_trials
|
40 |
from genesis.biosecurity import analyze_biosecurity_risks
|
41 |
from genesis.regulation import fetch_regulatory_info
|
42 |
from genesis.safety import analyze_safety_concerns
|
|
|
43 |
from genesis.ontology import merge_ontology_terms
|
|
|
44 |
|
45 |
# Environment vars
|
46 |
ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")
|
47 |
NEO4J_URI = os.getenv("NEO4J_URI")
|
|
|
|
|
48 |
|
49 |
SYNBIO_MODE = True # Bias towards synthetic biology context
|
50 |
|
51 |
+
# Demo queries
|
52 |
DEMO_QUERIES = [
|
53 |
"CRISPR living therapeutics in clinical trials since 2020",
|
54 |
"AI-designed enzymes for plastic degradation β literature + pathways",
|
|
|
57 |
"Oncolytic virus engineering β biosecurity risk analysis"
|
58 |
]
|
59 |
|
60 |
+
# ---------- Helper Functions ----------
|
61 |
|
62 |
def extract_citations(text: str) -> List[Dict[str, str]]:
|
63 |
+
"""Extract DOI, PMID, and URLs from text."""
|
64 |
citations = []
|
65 |
doi_pattern = r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)"
|
66 |
pmid_pattern = r"PMID:\s*(\d+)"
|
|
|
73 |
for match in re.finditer(url_pattern, text, re.IGNORECASE):
|
74 |
if not any(c["url"] == match.group(1) for c in citations):
|
75 |
citations.append({"type": "URL", "id": "", "url": match.group(1)})
|
|
|
76 |
return citations
|
77 |
|
|
|
78 |
def inject_synbio_context(query: str, expanded_terms: List[str]) -> str:
|
79 |
+
"""Inject synthetic biology expertise into the prompt."""
|
80 |
context = (
|
81 |
"You are an expert in synthetic biology, biosecurity, and regulatory affairs. "
|
82 |
"Provide literature review, molecular insights, market trends, and policy implications. "
|
|
|
85 |
)
|
86 |
return f"{context}\n\nQuery: {query}\nExpanded terms: {', '.join(expanded_terms)}"
|
87 |
|
88 |
+
# ---------- Main Pipeline ----------
|
89 |
+
|
90 |
+
def research_once(topic: str) -> Dict[str, Any]:
|
91 |
+
"""Alias to multimodal_research for compatibility."""
|
92 |
+
return multimodal_research(topic)
|
93 |
|
94 |
def multimodal_research(query: str, narration: bool = False, generate_pdf: bool = False) -> Dict[str, Any]:
|
95 |
+
"""Main research pipeline for GENESIS-AI."""
|
96 |
print(f"[Pipeline] Starting research for query: {query}")
|
97 |
|
98 |
# 1 — Expand query with ontology
|
|
|
103 |
)
|
104 |
print(f"[Pipeline] Expanded terms: {expanded_terms}")
|
105 |
|
106 |
+
# 2 — Enrich query with domain-specific context
|
107 |
enriched_query = inject_synbio_context(query, expanded_terms) if SYNBIO_MODE else query
|
108 |
|
109 |
+
# 3 — Summarization (fallback order)
|
110 |
summary = None
|
111 |
for summarizer in [run_deepseek_summary, run_gemini_summary, run_openai_summary]:
|
112 |
try:
|
|
|
120 |
if not summary:
|
121 |
summary = "No summary generated β please refine your query."
|
122 |
|
123 |
+
# 4 — Extract citations, fallback to PubMed if none found
|
124 |
citations = extract_citations(summary)
|
125 |
if not citations:
|
126 |
print("[Pipeline] No citations in summary, querying PubMed...")
|
127 |
citations = search_pubmed_literature(query)
|
128 |
|
129 |
+
# 5 — Fetch related structures (NCBI, ChEMBL)
|
130 |
structures = fetch_structures_for_terms(expanded_terms)
|
131 |
|
132 |
# 6 — Image generation with fallback
|
|
|
140 |
except Exception as e:
|
141 |
print(f"[Pipeline] {img_fn.__name__} failed: {e}")
|
142 |
|
143 |
+
# 7 — Funding, trials, regulations, safety, biosecurity
|
144 |
funding_data = fetch_funding_data(query) or []
|
145 |
trial_data = fetch_clinical_trials(query) or []
|
146 |
regulation_data = fetch_regulatory_info(query) or []
|
|
|
151 |
pathway_graph = generate_pathway_graph(query, expanded_terms) if expanded_terms else None
|
152 |
funding_graph = generate_funding_network(query, funding_data) if funding_data else None
|
153 |
|
154 |
+
# 9 — Save to Neo4j if configured
|
155 |
if NEO4J_URI:
|
156 |
try:
|
157 |
write_topic_and_papers(query, citations, expanded_terms)
|
|
|
159 |
except Exception as e:
|
160 |
print(f"[Pipeline] Neo4j save failed: {e}")
|
161 |
|
162 |
+
# 10 — Narration (optional)
|
163 |
audio_url = None
|
164 |
if narration and ELEVEN_LABS_API_KEY:
|
165 |
try:
|
|
|
168 |
except Exception as e:
|
169 |
print(f"[Pipeline] Narration failed: {e}")
|
170 |
|
171 |
+
# 11 — PDF export (optional)
|
172 |
pdf_path = None
|
173 |
if generate_pdf:
|
174 |
try:
|
|
|
177 |
except Exception as e:
|
178 |
print(f"[Pipeline] PDF generation failed: {e}")
|
179 |
|
|
|
180 |
return {
|
181 |
"timestamp": datetime.utcnow().isoformat(),
|
182 |
"query": query,
|