# NOTE: scraped page header ("Spaces: Sleeping / Sleeping") -- not part of the module source.
# genesis/pipeline.py | |
import os | |
import re | |
import json | |
from typing import Dict, Any, List, Optional | |
from datetime import datetime | |
from .ontology import expand_terms_with_ontology | |
from .structures import fetch_structures_for_terms | |
from .narration import narrate_text | |
from .graphdb import write_topic_and_papers | |
from .providers import run_deepseek_summary, run_gemini_polish, run_gemini_image | |
from .providers import pubmed_fallback_search | |
# Credentials and endpoints, read once from the environment at import time.
# os.getenv returns None for any variable that is not set; several pipeline
# steps in research_once are skipped when their key is missing.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # not referenced in this part of the file
UMLS_API_KEY = os.getenv("UMLS_API_KEY")  # passed to expand_terms_with_ontology
BIOPORTAL_API_KEY = os.getenv("BIOPORTAL_API_KEY")  # passed to expand_terms_with_ontology
NCBI_API_KEY = os.getenv("NCBI_API_KEY")  # passed to pubmed_fallback_search
NCBI_EMAIL = os.getenv("NCBI_EMAIL")  # passed to pubmed_fallback_search
ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")  # gates audio narration
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")  # gates image generation
NEO4J_USER = os.getenv("NEO4J_USER")  # not referenced in this part of the file
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")  # not referenced in this part of the file
NEO4J_URI = os.getenv("NEO4J_URI")  # gates the graph write

# When True, research_once wraps the query with the synthetic-biology prompt
# built by synthetic_biology_prompt_inject instead of sending it verbatim.
SYNBIO_MODE = True

# Preloaded showcase queries for demos.
# NOTE(review): the third entry previously contained a mis-decoded character
# ("β"); it is restored here as an em dash -- confirm against the original.
DEMO_QUERIES = [
    "Map all CRISPR-based living therapeutics in clinical trials since 2020",
    "Graph metabolic engineering pathways for bio-based drug production",
    "Synthetic biology startups developing oncolytic viruses — funding + trials",
    "3D bioprinting advances for organ transplantation with regulatory analysis",
    "AI-driven biosensor design for early cancer detection",
]
# Compiled once at import time. All three are matched case-insensitively.
_DOI_RE = re.compile(r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)", re.IGNORECASE)
_PMID_RE = re.compile(r"PMID:\s*(\d+)", re.IGNORECASE)
_URL_RE = re.compile(r"(https?://[^\s)]+)", re.IGNORECASE)


def _trim_trailing_punctuation(token: str) -> str:
    """Drop sentence punctuation and unbalanced ')' from the end of *token*.

    The DOI and URL patterns are greedy and happily swallow the '.', ',',
    ';' or ':' that ends a sentence, as well as a ')' that closes a
    parenthetical the identifier merely sits inside. A ')' is kept when it
    is balanced by a '(' inside the token (e.g. ``10.1016/S0140-6736(05)``).
    """
    while token:
        last = token[-1]
        if last in ".,;:":
            token = token[:-1]
        elif last == ")" and token.count(")") > token.count("("):
            token = token[:-1]
        else:
            break
    return token


def extract_citations(text: str) -> List[Dict[str, str]]:
    """Extract DOI, PMID and plain-URL citations from model output.

    Args:
        text: Free text to scan. A falsy value (``""`` or ``None``) yields
            an empty list.

    Returns:
        A list of dicts with the keys ``"type"`` (``"DOI"``, ``"PMID"`` or
        ``"URL"``), ``"id"`` (the DOI or PMID; empty for plain URLs) and
        ``"url"``. DOIs come first, then PMIDs, then remaining URLs, each
        group in order of appearance. Every ``"url"`` appears at most once,
        so an identifier mentioned twice, or a URL that merely spells out
        an already-found DOI/PMID link, is not repeated.
    """
    if not text:
        return []

    citations: List[Dict[str, str]] = []
    seen_urls = set()

    def add(kind: str, ident: str, url: str) -> None:
        # De-duplicate on the resolved URL, which is unique per identifier.
        if url in seen_urls:
            return
        seen_urls.add(url)
        citations.append({"type": kind, "id": ident, "url": url})

    for match in _DOI_RE.finditer(text):
        doi = _trim_trailing_punctuation(match.group(1))
        if doi.endswith("/"):
            # Only punctuation followed the registrant prefix; not a DOI.
            continue
        add("DOI", doi, f"https://doi.org/{doi}")

    for match in _PMID_RE.finditer(text):
        pmid = match.group(1)
        add("PMID", pmid, f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/")

    for match in _URL_RE.finditer(text):
        add("URL", "", _trim_trailing_punctuation(match.group(1)))

    return citations
def synthetic_biology_prompt_inject(query: str, expanded_terms: List[str]) -> str:
    """Wrap *query* in a fixed synthetic-biology system preamble.

    Args:
        query: The user's research question, inserted verbatim.
        expanded_terms: Related terms, rendered as a comma-separated list.

    Returns:
        The preamble, a blank line, a ``Query:`` line and an
        ``Expanded terms:`` line, joined with newlines.
    """
    preamble = "".join(
        (
            "You are an expert synthetic biologist and AI researcher. ",
            "Focus on CRISPR, metabolic engineering, living therapeutics, protein design, ",
            "biosensors, and biosecurity. Integrate literature, molecular structures, market trends, ",
            "and policy/regulatory outlook. Produce a structured, citation-rich report.",
        )
    )
    term_list = ", ".join(expanded_terms)
    # The empty element produces the blank line between preamble and query.
    prompt_lines = (
        preamble,
        "",
        f"Query: {query}",
        f"Expanded terms: {term_list}",
    )
    return "\n".join(prompt_lines)
def research_once(query: str, graph_preview: bool = True, narration: bool = True) -> Dict[str, Any]:
    """Run the research pipeline once for *query* and return the report.

    Steps, in order: ontology term expansion, optional synthetic-biology
    prompt wrapping (when ``SYNBIO_MODE`` is set), DeepSeek summarization,
    Gemini polishing, citation extraction (with a PubMed search as fallback
    when the summary yields none), structure lookup for the expanded terms,
    optional image generation, optional graph write, optional narration.

    Args:
        query: The research question.
        graph_preview: When true and ``NEO4J_URI`` is set, the topic,
            citations and expanded terms are passed to
            ``write_topic_and_papers``.
        narration: When true and ``ELEVEN_LABS_API_KEY`` is set, the
            polished summary is passed to ``narrate_text``.

    Returns:
        A dict with the keys ``timestamp`` (naive UTC ISO-8601 string),
        ``query``, ``expanded_terms``, ``summary``, ``citations``,
        ``structures``, ``visual_image_url`` and ``audio_url``. The last two
        are ``None`` when the corresponding step was skipped.

    Raises:
        Nothing is caught here; any exception raised by a helper propagates.
    """
    # Local import: the module header only imports `datetime` itself.
    from datetime import timezone

    # 1. Expand query with ontology
    expanded_terms = expand_terms_with_ontology(query, UMLS_API_KEY, BIOPORTAL_API_KEY)

    # 2. Inject synthetic biology context
    enriched_query = synthetic_biology_prompt_inject(query, expanded_terms) if SYNBIO_MODE else query

    # 3. Run DeepSeek summarization
    raw_summary = run_deepseek_summary(enriched_query)

    # 4. Polish with Gemini
    # NOTE(review): unlike step 7 this is not gated on GEMINI_API_KEY --
    # confirm run_gemini_polish copes with a missing key.
    polished_summary = run_gemini_polish(raw_summary)

    # 5. Extract citations
    citations = extract_citations(polished_summary)
    if not citations:
        # Fallback: run a PubMed search on the original (un-enriched) query.
        fallback_cites = pubmed_fallback_search(query, NCBI_API_KEY, NCBI_EMAIL)
        # Tolerate a falsy result (e.g. None) instead of failing in extend().
        citations.extend(fallback_cites or [])

    # 6. Fetch molecular structures for any mentioned terms
    structures = fetch_structures_for_terms(expanded_terms)

    # 7. Generate visual diagram with Gemini
    visual_image_url = run_gemini_image(query) if GEMINI_API_KEY else None

    # 8. Write to Neo4j
    if graph_preview and NEO4J_URI:
        write_topic_and_papers(query, citations, expanded_terms)

    # 9. Narrate executive summary
    audio_url = narrate_text(polished_summary) if narration and ELEVEN_LABS_API_KEY else None

    # 10. Build output
    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # time and drop tzinfo so the serialized format (no offset suffix) is
    # exactly what utcnow().isoformat() produced.
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    report = {
        "timestamp": timestamp,
        "query": query,
        "expanded_terms": expanded_terms,
        "summary": polished_summary,
        "citations": citations,
        "structures": structures,
        "visual_image_url": visual_image_url,
        "audio_url": audio_url,
    }
    return report