Spaces:
Sleeping
Sleeping
Update genesis/pipeline.py
Browse files- genesis/pipeline.py +136 -44
genesis/pipeline.py
CHANGED
@@ -1,78 +1,170 @@
|
|
1 |
# genesis/pipeline.py
|
2 |
import os
|
|
|
|
|
3 |
from datetime import datetime
|
4 |
-
from typing import Dict, Any, List
|
5 |
|
6 |
-
|
7 |
-
from genesis.
|
8 |
-
from genesis.
|
9 |
-
from genesis.
|
|
|
|
|
|
|
|
|
10 |
from genesis.providers import (
|
11 |
-
run_pubmed_literature,
|
12 |
run_deepseek_summary,
|
13 |
-
|
14 |
-
|
|
|
|
|
|
|
|
|
15 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
NCBI_EMAIL = os.getenv("NCBI_EMAIL")
|
21 |
-
NEO4J_URI = os.getenv("NEO4J_URI")
|
22 |
ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")
|
|
|
|
|
|
|
23 |
|
24 |
-
SYNBIO_MODE = True
|
25 |
|
|
|
26 |
DEMO_QUERIES = [
|
27 |
-
"
|
28 |
-
"
|
29 |
-
"Synthetic biology startups
|
30 |
-
"
|
31 |
-
"
|
32 |
]
|
33 |
|
34 |
-
|
35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
context = (
|
37 |
-
"You are an expert synthetic
|
38 |
-
"
|
39 |
-
"
|
40 |
-
"
|
41 |
)
|
42 |
return f"{context}\n\nQuery: {query}\nExpanded terms: {', '.join(expanded_terms)}"
|
43 |
|
44 |
-
def research_once(query: str, graph_preview: bool = True, narration: bool = True) -> Dict[str, Any]:
|
45 |
-
"""Main research pipeline."""
|
46 |
-
expanded_terms = expand_terms_with_ontology(query, UMLS_API_KEY, BIOPORTAL_API_KEY)
|
47 |
-
enriched_query = synthetic_biology_prompt_inject(query, expanded_terms) if SYNBIO_MODE else query
|
48 |
|
49 |
-
|
50 |
-
|
51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
|
53 |
-
|
54 |
-
|
55 |
|
56 |
-
#
|
|
|
|
|
|
|
|
|
|
|
57 |
structures = fetch_structures_for_terms(expanded_terms)
|
58 |
|
59 |
-
# Image generation
|
60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
|
62 |
-
#
|
63 |
-
|
|
|
|
|
|
|
|
|
64 |
write_topic_and_papers(query, citations, expanded_terms)
|
65 |
|
66 |
-
#
|
67 |
-
audio_url =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
|
|
|
69 |
return {
|
70 |
"timestamp": datetime.utcnow().isoformat(),
|
71 |
"query": query,
|
72 |
"expanded_terms": expanded_terms,
|
73 |
-
"summary":
|
74 |
"citations": citations,
|
75 |
"structures": structures,
|
76 |
-
"
|
77 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
}
|
|
|
1 |
# genesis/pipeline.py
|
2 |
import os
|
3 |
+
import json
|
4 |
+
import re
|
5 |
from datetime import datetime
|
6 |
+
from typing import Dict, Any, List, Optional
|
7 |
|
8 |
+
# API client imports
|
9 |
+
from genesis.api_clients.pubmed_api import search_pubmed_literature
|
10 |
+
from genesis.api_clients.bioportal_api import expand_with_bioportal
|
11 |
+
from genesis.api_clients.umls_api import expand_with_umls
|
12 |
+
from genesis.api_clients.chembl_api import get_molecule_data
|
13 |
+
from genesis.api_clients.ncbi_api import fetch_ncbi_structure
|
14 |
+
|
15 |
+
# Core logic
|
16 |
from genesis.providers import (
|
|
|
17 |
run_deepseek_summary,
|
18 |
+
run_gemini_summary,
|
19 |
+
run_openai_summary,
|
20 |
+
run_gemini_image,
|
21 |
+
run_openai_image,
|
22 |
+
run_hf_image,
|
23 |
+
narrate_text_elevenlabs
|
24 |
)
|
25 |
+
from genesis.utils.pdf_export import export_report_to_pdf
|
26 |
+
from genesis.visualization import generate_pathway_graph, generate_funding_network
|
27 |
+
from genesis.funding import fetch_funding_data
|
28 |
+
from genesis.trials import fetch_clinical_trials
|
29 |
+
from genesis.biosecurity import analyze_biosecurity_risks
|
30 |
+
from genesis.regulation import fetch_regulatory_info
|
31 |
+
from genesis.safety import analyze_safety_concerns
|
32 |
+
from genesis.structures import fetch_structures_for_terms
|
33 |
+
from genesis.ontology import merge_ontology_terms
|
34 |
|
35 |
+
from genesis.graph_tools import write_topic_and_papers
|
36 |
+
|
37 |
+
# Environment vars
|
|
|
|
|
38 |
# Credentials and connection settings, read from the environment once at
# import time. Each value is None when the variable is not set.
ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = os.getenv("NEO4J_USER")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

SYNBIO_MODE = True  # always bias to synthetic biology domain

# Preloaded demo queries.
# NOTE(review): the separator in three of these strings was a mis-encoded
# character (it rendered as a Greek beta). It is restored here as an em dash;
# confirm against the intended wording.
DEMO_QUERIES = [
    "CRISPR living therapeutics in clinical trials since 2020",
    "AI-designed enzymes for plastic degradation — literature + pathways",
    "Synthetic biology startups in oncology — funding map",
    "Metabolic pathway for artemisinin biosynthesis in yeast",
    "Oncolytic virus engineering — biosecurity risk analysis",
]
|
53 |
|
54 |
+
|
55 |
+
# Compiled once at import time; all three are matched case-insensitively.
_DOI_RE = re.compile(r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)", re.IGNORECASE)
_PMID_RE = re.compile(r"PMID:\s*(\d+)", re.IGNORECASE)
_URL_RE = re.compile(r"(https?://[^\s)]+)", re.IGNORECASE)


def _strip_trailing_punctuation(token: str) -> str:
    """Drop sentence punctuation and unbalanced ')' from the end of *token*."""
    while token:
        trimmed = token.rstrip(".,;:!?'\"")
        # A closing parenthesis is only noise when it has no opening partner;
        # DOIs such as 10.1016/S0140-6736(20)30183-5 keep their balanced pair.
        if trimmed.endswith(")") and trimmed.count(")") > trimmed.count("("):
            trimmed = trimmed[:-1]
        if trimmed == token:
            break
        token = trimmed
    return token


def extract_citations(text: str) -> List[Dict[str, str]]:
    """Extract citations (DOI, PMID, URLs) from text.

    Args:
        text: Free text to scan. An empty or ``None`` value yields no
            citations.

    Returns:
        A list of ``{"type", "id", "url"}`` dicts: all DOIs first, then all
        PMIDs, then all plain URLs, each group in order of appearance. Plain
        URLs carry an empty ``"id"``. Entries are unique by ``"url"``, so a
        URL that already backs a DOI or PMID entry is not repeated.
    """
    if not text:
        return []

    citations: List[Dict[str, str]] = []
    seen_urls = set()

    def _add(kind: str, ident: str, url: str) -> None:
        # De-duplicate on the resolved URL so repeated mentions collapse.
        if url and url not in seen_urls:
            seen_urls.add(url)
            citations.append({"type": kind, "id": ident, "url": url})

    for match in _DOI_RE.finditer(text):
        # The DOI character class includes '.', ';', ':' and ')', so a DOI at
        # the end of a sentence or inside parentheses picks up that trailer.
        doi = _strip_trailing_punctuation(match.group(1))
        if doi:
            _add("DOI", doi, f"https://doi.org/{doi}")

    for match in _PMID_RE.finditer(text):
        pmid = match.group(1)
        _add("PMID", pmid, f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/")

    for match in _URL_RE.finditer(text):
        _add("URL", "", _strip_trailing_punctuation(match.group(1)))

    return citations
|
71 |
+
|
72 |
+
|
73 |
+
def inject_synbio_context(query: str, expanded_terms: List[str]) -> str:
    """Prefix *query* with a fixed synthetic-biology expert instruction.

    Args:
        query: The user's research question, inserted verbatim.
        expanded_terms: Related terms to list after the query. ``None`` is
            treated as an empty list, and non-string terms are converted
            with ``str``.

    Returns:
        The instruction text, a blank line, then ``Query: ...`` and
        ``Expanded terms: ...`` lines, with the terms joined by ``", "``.
    """
    context = (
        "You are an expert in synthetic biology, biosecurity, and regulatory affairs. "
        "Provide literature review, molecular insights, market trends, and policy implications. "
        "Focus on CRISPR, metabolic engineering, living therapeutics, protein design, biosensors, and biosecurity. "
        "Be concise, factual, and provide citations."
    )
    # str.join raises TypeError on None or on non-string items, so normalise
    # the terms first; output for a list of strings is unchanged.
    terms = ", ".join(str(term) for term in (expanded_terms or ()))
    return f"{context}\n\nQuery: {query}\nExpanded terms: {terms}"
|
82 |
|
|
|
|
|
|
|
|
|
83 |
|
84 |
+
def _first_truthy_result(providers, argument, stage):
    """Call each provider with *argument* until one returns a truthy value.

    A provider that raises is logged and skipped. The return value is the
    last value any provider returned (truthy if one succeeded), or ``None``
    when no provider returned at all.
    """
    import logging  # local import: keeps this block independent of the file's import list

    log = logging.getLogger(__name__)
    result = None
    for provider in providers:
        try:
            result = provider(argument)
        except Exception:
            # Best-effort chain: record why this provider failed, then move on.
            log.warning(
                "%s provider %s failed",
                stage,
                getattr(provider, "__name__", provider),
                exc_info=True,
            )
            continue
        if result:
            break
    return result


def multimodal_research(query: str, narration: bool = False, generate_pdf: bool = False) -> Dict[str, Any]:
    """Main multi-modal synthetic biology pipeline.

    Args:
        query: The research question to investigate.
        narration: When true and ``ELEVEN_LABS_API_KEY`` is set, the summary
            is passed to ``narrate_text_elevenlabs``.
        generate_pdf: When true, the report is passed to
            ``export_report_to_pdf``.

    Returns:
        A dict with the keys ``timestamp``, ``query``, ``expanded_terms``,
        ``summary``, ``citations``, ``structures``, ``image_url``,
        ``funding_data``, ``trial_data``, ``regulation_data``,
        ``safety_data``, ``biosecurity_data``, ``pathway_graph``,
        ``funding_graph``, ``audio_url`` and ``pdf_path``. ``audio_url`` and
        ``pdf_path`` are ``None`` when the matching option is off.

    Only the summary and image provider chains tolerate failures; an
    exception from any other step propagates to the caller.
    """
    from datetime import timezone  # local import: the file imports only `datetime`

    # 1. Expand the query with ontology terms.
    expanded_terms = merge_ontology_terms(
        query,
        expand_with_umls(query),
        expand_with_bioportal(query),
    )

    # 2. Inject domain-specific context.
    enriched_query = inject_synbio_context(query, expanded_terms) if SYNBIO_MODE else query

    # 3. Summarization with provider fallback.
    summary = _first_truthy_result(
        [run_deepseek_summary, run_gemini_summary, run_openai_summary],
        enriched_query,
        "summary",
    )
    if not summary:
        summary = "No summary generated — please refine your query."

    # 4. Citations from the summary, falling back to a PubMed search.
    citations = extract_citations(summary)
    if not citations:
        citations = search_pubmed_literature(query)

    # 5. Structures for the expanded terms.
    structures = fetch_structures_for_terms(expanded_terms)

    # 6. Image generation with provider fallback (uses the raw query).
    image_url = _first_truthy_result(
        [run_gemini_image, run_openai_image, run_hf_image],
        query,
        "image",
    )

    # 7. Funding, trials, regulation, safety, biosecurity.
    funding_data = fetch_funding_data(query)
    trial_data = fetch_clinical_trials(query)
    regulation_data = fetch_regulatory_info(query)
    safety_data = analyze_safety_concerns(query)
    biosecurity_data = analyze_biosecurity_risks(query)

    # 8. Graph visualizations.
    pathway_graph = generate_pathway_graph(query, expanded_terms)
    funding_graph = generate_funding_network(query, funding_data)

    # 9. Save to Neo4j, only when a connection URI is configured.
    if NEO4J_URI:
        write_topic_and_papers(query, citations, expanded_terms)

    # 10. Optional narration.
    audio_url = None
    if narration and ELEVEN_LABS_API_KEY:
        audio_url = narrate_text_elevenlabs(summary)

    # 11. Optional PDF export.
    pdf_path = None
    if generate_pdf:
        pdf_path = export_report_to_pdf(query, summary, citations, structures, funding_data, regulation_data)

    # 12. Build output. datetime.utcnow() is deprecated; dropping tzinfo from
    # an aware UTC time gives the same naive ISO-8601 string it produced.
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    return {
        "timestamp": timestamp,
        "query": query,
        "expanded_terms": expanded_terms,
        "summary": summary,
        "citations": citations,
        "structures": structures,
        "image_url": image_url,
        "funding_data": funding_data,
        "trial_data": trial_data,
        "regulation_data": regulation_data,
        "safety_data": safety_data,
        "biosecurity_data": biosecurity_data,
        "pathway_graph": pathway_graph,
        "funding_graph": funding_graph,
        "audio_url": audio_url,
        "pdf_path": pdf_path,
    }
|