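# genesis/pipeline.py
# Hugging Face file-viewer header captured along with the source (not part of the module):
#   author avatar: "mgbam's picture"
#   last commit: "Update genesis/pipeline.py" (7b0bd9a, verified)
#   viewer links: raw / history / blame
#   file size: 3.96 kB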
import os
import re
from datetime import datetime
from typing import Dict, Any, List
from .ontology import expand_terms_with_ontology
from .structures import fetch_structures_for_terms
from .narration import narrate_text
from .graphdb import write_topic_and_papers
from .providers import (
run_deepseek_summary,
run_gemini_polish,
run_openai_image,
run_hf_image,
pubmed_fallback_search
)
# Configuration read from the process environment once, at import time.
# A variable that is not set resolves to None.
UMLS_API_KEY = os.environ.get("UMLS_API_KEY")
BIOPORTAL_API_KEY = os.environ.get("BIOPORTAL_API_KEY")
NCBI_API_KEY = os.environ.get("NCBI_API_KEY")
NCBI_EMAIL = os.environ.get("NCBI_EMAIL")
ELEVEN_LABS_API_KEY = os.environ.get("ELEVEN_LABS_API_KEY")
NEO4J_URI = os.environ.get("NEO4J_URI")

# Hard-coded switch: when true, research_once() wraps the user query in the
# synthetic-biology prompt before summarization.
SYNBIO_MODE = True
def _trim_trailing_punctuation(token: str) -> str:
    """Drop sentence punctuation and unbalanced ')' from the end of *token*."""
    while token:
        last = token[-1]
        if last in ".,;:":
            token = token[:-1]
        elif last == ")" and token.count(")") > token.count("("):
            # A closing parenthesis without a matching opener belongs to the
            # surrounding prose, e.g. "(doi:10.1000/abc)"; balanced ones are
            # kept because DOI suffixes may legitimately contain parentheses.
            token = token[:-1]
        else:
            break
    return token


def extract_citations(text: str) -> List[Dict[str, str]]:
    """Extract DOI, PMID and plain-URL citations from model output.

    The text is scanned three times, in this order: DOIs, ``PMID: <digits>``
    markers, then ``http(s)://`` URLs. Matching is case-insensitive.
    Punctuation that trails a DOI or URL (``. , ; :`` and an unbalanced
    closing parenthesis) is stripped, and an entry is skipped when an entry
    with the same ``url`` was already collected.

    Args:
        text: Free text to scan. Empty or ``None`` input yields an empty list.

    Returns:
        A list of dicts with the keys ``"type"`` (``"DOI"``, ``"PMID"`` or
        ``"URL"``), ``"id"`` (the DOI or PMID; empty string for plain URLs)
        and ``"url"``, in order of discovery.
    """
    citations: List[Dict[str, str]] = []
    if not text:
        return citations

    doi_pattern = r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)"
    pmid_pattern = r"PMID:\s*(\d+)"
    url_pattern = r"(https?://[^\s)]+)"
    seen_urls = set()

    def add(kind: str, identifier: str, url: str) -> None:
        # The resolved URL is the de-duplication key, so a DOI that also
        # appears as a https://doi.org/... link is reported only once.
        if url not in seen_urls:
            seen_urls.add(url)
            citations.append({"type": kind, "id": identifier, "url": url})

    for match in re.finditer(doi_pattern, text, re.IGNORECASE):
        doi = _trim_trailing_punctuation(match.group(1))
        # Skip the degenerate case where trimming removed the whole suffix.
        if doi.partition("/")[2]:
            add("DOI", doi, f"https://doi.org/{doi}")

    for match in re.finditer(pmid_pattern, text, re.IGNORECASE):
        pmid = match.group(1)
        add("PMID", pmid, f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/")

    for match in re.finditer(url_pattern, text, re.IGNORECASE):
        add("URL", "", _trim_trailing_punctuation(match.group(1)))

    return citations
def synthetic_biology_prompt_inject(query: str, expanded_terms: List[str]) -> str:
    """Prefix *query* with a fixed synthetic-biology expert persona.

    Args:
        query: The user's research question, inserted verbatim.
        expanded_terms: Related terms, rendered as a comma-separated list.

    Returns:
        The persona paragraph, a blank line, a ``Query:`` line and an
        ``Expanded terms:`` line, joined with newlines.
    """
    persona = "".join(
        (
            "You are an expert synthetic biologist and AI researcher. ",
            "Focus on CRISPR, metabolic engineering, living therapeutics, protein design, ",
            "biosensors, and biosecurity. Integrate literature, molecular structures, market trends, ",
            "and policy/regulatory outlook. Produce a structured, citation-rich report.",
        )
    )
    terms_line = ", ".join(expanded_terms)
    # The empty element produces the blank line between persona and query.
    sections = [persona, "", f"Query: {query}", f"Expanded terms: {terms_line}"]
    return "\n".join(sections)
def research_once(query: str, graph_preview: bool = True, narration: bool = True) -> Dict[str, Any]:
    """Run the research pipeline once for *query* and return its artifacts.

    Steps, in order: ontology term expansion, prompt enrichment (only when
    ``SYNBIO_MODE`` is true), DeepSeek summary, Gemini polish, citation
    extraction with a PubMed fallback, structure lookup, image generation
    with a Hugging Face fallback, optional graph write, optional narration.

    Args:
        query: The research question. It is passed unchanged to the PubMed
            fallback, the image generators and the graph writer; only the
            summarizer receives the enriched prompt.
        graph_preview: When true and ``NEO4J_URI`` is set, the topic,
            citations and expanded terms are passed to
            ``write_topic_and_papers``.
        narration: When true and ``ELEVEN_LABS_API_KEY`` is set, the polished
            summary is passed to ``narrate_text``.

    Returns:
        A dict with the keys ``timestamp`` (naive UTC time in ISO 8601
        format), ``query``, ``expanded_terms``, ``summary``, ``citations``,
        ``structures``, ``visual_image_url`` and ``audio_url`` (``None`` when
        narration is skipped).
    """
    # Function-scope import so this block does not depend on the file-level
    # import list; only needed for the timestamp below.
    from datetime import timezone

    # 1. Expand query with ontology
    expanded_terms = expand_terms_with_ontology(query, UMLS_API_KEY, BIOPORTAL_API_KEY)

    # 2. Inject synthetic biology context
    if SYNBIO_MODE:
        enriched_query = synthetic_biology_prompt_inject(query, expanded_terms)
    else:
        enriched_query = query

    # 3. Run DeepSeek summarization
    raw_summary = run_deepseek_summary(enriched_query)

    # 4. Polish with Gemini
    polished_summary = run_gemini_polish(raw_summary)

    # 5. Extract citations; query PubMed only when the summary cited nothing
    citations = extract_citations(polished_summary)
    if not citations:
        fallback_cites = pubmed_fallback_search(query, NCBI_API_KEY, NCBI_EMAIL)
        # Guard against a None result so a failed lookup cannot raise
        # TypeError here (the helper's failure value is not visible from
        # this file).
        citations.extend(fallback_cites or [])

    # 6. Fetch molecular structures
    structures = fetch_structures_for_terms(expanded_terms)

    # 7. Generate visual diagram; a falsy result triggers the fallback
    visual_image_url = run_openai_image(query)
    if not visual_image_url:
        print("[Image] Falling back to Hugging Face Stable Diffusion...")
        visual_image_url = run_hf_image(f"Scientific diagram about {query}")

    # 8. Write to Neo4j
    if graph_preview and NEO4J_URI:
        write_topic_and_papers(query, citations, expanded_terms)

    # 9. Narrate executive summary
    audio_url = None
    if narration and ELEVEN_LABS_API_KEY:
        audio_url = narrate_text(polished_summary)

    # 10. Return structured output
    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # "now" and drop tzinfo so the ISO string keeps its existing naive form.
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    return {
        "timestamp": timestamp,
        "query": query,
        "expanded_terms": expanded_terms,
        "summary": polished_summary,
        "citations": citations,
        "structures": structures,
        "visual_image_url": visual_image_url,
        "audio_url": audio_url,
    }