# mgbam's picture
# Update genesis/pipeline.py
# bc27b65 verified
"""
GENESIS-AI — Multimodal Synthetic Biology Research Pipeline
Coordinates ontology expansion, literature review, AI summarization, image generation,
funding intelligence, safety/biosecurity checks, and report export.
"""
import os
import re
from datetime import datetime
from typing import Dict, Any, List
# API clients
from genesis.api_clients.pubmed_api import search_pubmed_literature
from genesis.api_clients.bioportal_api import expand_with_bioportal
from genesis.api_clients.umls_api import expand_with_umls
from genesis.api_clients.chembl_api import get_molecule_data
from genesis.structures import fetch_structures_for_terms
# Core logic providers
from genesis.providers import (
run_deepseek_summary,
run_gemini_summary,
run_openai_summary,
run_gemini_image,
run_openai_image,
run_hf_image,
narrate_text_elevenlabs
)
# Utility modules
from genesis.utils.pdf_export import export_report_to_pdf
from genesis.utils.graph_tools import write_topic_and_papers
# Visualizations
from genesis.visualization import generate_pathway_graph, generate_funding_network
# Data sources
from genesis.funding import fetch_funding_data
from genesis.trials import fetch_clinical_trials
from genesis.biosecurity import analyze_biosecurity_risks
from genesis.regulation import fetch_regulatory_info
from genesis.safety import analyze_safety_concerns
from genesis.ontology import merge_ontology_terms
# Environment vars
# Optional integrations: each is switched on by the presence of its variable.
ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")  # gates audio narration
NEO4J_URI = os.getenv("NEO4J_URI")  # gates persistence of topics/papers to Neo4j
SYNBIO_MODE = True  # Bias towards synthetic biology context

# Demo queries (em dashes restored from mis-encoded text)
DEMO_QUERIES = [
    "CRISPR living therapeutics in clinical trials since 2020",
    "AI-designed enzymes for plastic degradation — literature + pathways",
    "Synthetic biology startups in oncology — funding map",
    "Metabolic pathway for artemisinin biosynthesis in yeast",
    "Oncolytic virus engineering — biosecurity risk analysis",
]
# ---------- Helper Functions ----------
def extract_citations(text: str) -> List[Dict[str, str]]:
    """Extract DOI, PMID, and URL citations from free text.

    Results are grouped by kind (all DOIs, then all PMIDs, then bare URLs),
    each group in order of first appearance. Every citation is reported once:
    entries are de-duplicated on their resolved ``url``, so a DOI written as
    a ``https://doi.org/...`` link is not repeated as a plain URL.

    Args:
        text: Text to scan. Empty or ``None`` input yields an empty list.

    Returns:
        A list of dicts with the keys ``"type"`` (``"DOI"``, ``"PMID"`` or
        ``"URL"``), ``"id"`` (empty string for plain URLs) and ``"url"``.
    """
    if not text:
        return []

    doi_pattern = r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)"
    pmid_pattern = r"PMID:\s*(\d+)"
    url_pattern = r"(https?://[^\s)]+)"

    def _trim(token: str) -> str:
        # '.', ',', ';' and ':' are legal inside DOIs/URLs, so the patterns
        # also swallow them when they merely end a sentence or clause.
        token = token.rstrip(".,;:")
        # A ')' is kept when it closes a '(' inside the identifier (such DOIs
        # exist) and dropped when it belongs to the surrounding prose.
        while token.endswith(")") and token.count(")") > token.count("("):
            token = token[:-1].rstrip(".,;:")
        return token

    citations: List[Dict[str, str]] = []
    seen_urls = set()

    def _add(kind: str, ident: str, url: str) -> None:
        if url in seen_urls:
            return
        seen_urls.add(url)
        citations.append({"type": kind, "id": ident, "url": url})

    for match in re.finditer(doi_pattern, text, re.IGNORECASE):
        doi = _trim(match.group(1))
        _add("DOI", doi, f"https://doi.org/{doi}")
    for match in re.finditer(pmid_pattern, text, re.IGNORECASE):
        pmid = match.group(1)
        _add("PMID", pmid, f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/")
    for match in re.finditer(url_pattern, text, re.IGNORECASE):
        _add("URL", "", _trim(match.group(1)))
    return citations
def inject_synbio_context(query: str, expanded_terms: List[str]) -> str:
    """Wrap *query* in a synthetic-biology expert preamble.

    The returned prompt is the fixed persona text, a blank line, a
    ``Query:`` line holding *query*, and an ``Expanded terms:`` line holding
    *expanded_terms* joined with ", ".
    """
    persona_sentences = (
        "You are an expert in synthetic biology, biosecurity, and regulatory affairs. ",
        "Provide literature review, molecular insights, market trends, and policy implications. ",
        "Focus on CRISPR, metabolic engineering, living therapeutics, protein design, biosensors, and biosecurity. ",
        "Be concise, factual, and provide citations.",
    )
    preamble = "".join(persona_sentences)
    joined_terms = ", ".join(expanded_terms)
    return f"{preamble}\n\nQuery: {query}\nExpanded terms: {joined_terms}"
# ---------- Main Pipeline ----------
def _first_successful(providers: List[Any], prompt: str, label: str) -> Any:
    """Return the first truthy result from *providers*, tried in order.

    A provider that raises is logged and skipped. When no provider yields a
    truthy value, the last (falsy) value seen is returned, or ``None``.
    """
    result = None
    for provider in providers:
        try:
            result = provider(prompt)
        except Exception as exc:  # one provider outage must not end the run
            print(f"[Pipeline] {provider.__name__} failed: {exc}")
            continue
        if result:
            print(f"[Pipeline] {label} generated by {provider.__name__}")
            return result
    return result


def _best_effort(label: str, fetch: Any, *args: Any) -> Any:
    """Call ``fetch(*args)``; on failure log it under *label* and return None."""
    try:
        return fetch(*args)
    except Exception as exc:
        print(f"[Pipeline] {label} failed: {exc}")
        return None


def multimodal_research(query: str, narration: bool = False, generate_pdf: bool = False) -> Dict[str, Any]:
    """Run the full GENESIS-AI research pipeline for one query.

    Steps: ontology expansion, prompt enrichment, summarization with provider
    fallback, citation extraction (PubMed fallback), structure lookup, image
    generation, auxiliary data lookups, graph building, and the optional
    Neo4j save, narration and PDF export.

    Everything after the summary that is supplementary (PubMed fallback,
    funding, trials, regulation, safety, biosecurity, graphs, Neo4j, narration,
    PDF) is best-effort: a failure is printed and the corresponding result is
    left empty, so one unavailable source does not discard the summary.

    Args:
        query: Free-text research question.
        narration: Generate audio for the summary; only attempted when
            ``ELEVEN_LABS_API_KEY`` is set.
        generate_pdf: Export a PDF report.

    Returns:
        A dict with the keys ``timestamp`` (naive UTC, ISO 8601), ``query``,
        ``expanded_terms``, ``summary``, ``citations``, ``structures``,
        ``image_url``, ``funding_data``, ``trial_data``, ``regulation_data``,
        ``safety_data``, ``biosecurity_data``, ``pathway_graph``,
        ``funding_graph``, ``audio_url`` and ``pdf_path``.

    Raises:
        Exception: Errors from ontology expansion, prompt enrichment, or the
            structure lookup are not caught and propagate to the caller.
    """
    # Function-scope import: the module header only imports `datetime`.
    from datetime import timezone

    print(f"[Pipeline] Starting research for query: {query}")

    # 1. Expand query with ontology
    expanded_terms = merge_ontology_terms(
        query,
        expand_with_umls(query),
        expand_with_bioportal(query),
    )
    print(f"[Pipeline] Expanded terms: {expanded_terms}")

    # 2. Enrich query with domain-specific context
    enriched_query = inject_synbio_context(query, expanded_terms) if SYNBIO_MODE else query

    # 3. Summarization (fallback order)
    summary = _first_successful(
        [run_deepseek_summary, run_gemini_summary, run_openai_summary],
        enriched_query,
        "Summary",
    )
    if not summary:
        summary = "No summary generated — please refine your query."

    # 4. Extract citations, fallback to PubMed if none found
    citations = extract_citations(summary)
    if not citations:
        print("[Pipeline] No citations in summary, querying PubMed...")
        citations = _best_effort("PubMed search", search_pubmed_literature, query) or []

    # 5. Fetch related structures (NCBI, ChEMBL)
    structures = fetch_structures_for_terms(expanded_terms)

    # 6. Image generation with fallback (uses the raw query, not the enriched prompt)
    image_url = _first_successful(
        [run_gemini_image, run_openai_image, run_hf_image],
        query,
        "Image",
    )

    # 7. Funding, trials, regulations, safety, biosecurity
    funding_data = _best_effort("Funding lookup", fetch_funding_data, query) or []
    trial_data = _best_effort("Clinical trials lookup", fetch_clinical_trials, query) or []
    regulation_data = _best_effort("Regulatory lookup", fetch_regulatory_info, query) or []
    safety_data = _best_effort("Safety analysis", analyze_safety_concerns, query) or []
    biosecurity_data = _best_effort("Biosecurity analysis", analyze_biosecurity_risks, query) or []

    # 8. Graph visualizations
    # NOTE(review): run_pathway_analysis in this file calls
    # generate_pathway_graph with a second (terms) argument; confirm which
    # arity the function actually expects.
    pathway_graph = (
        _best_effort("Pathway graph", generate_pathway_graph, query) if expanded_terms else None
    )
    funding_graph = (
        _best_effort("Funding graph", generate_funding_network, query) if funding_data else None
    )

    # 9. Save to Neo4j if configured
    if NEO4J_URI:
        try:
            write_topic_and_papers(query, citations, expanded_terms)
            print("[Pipeline] Data saved to Neo4j")
        except Exception as e:
            print(f"[Pipeline] Neo4j save failed: {e}")

    # 10. Narration (optional)
    audio_url = None
    if narration and ELEVEN_LABS_API_KEY:
        try:
            audio_url = narrate_text_elevenlabs(summary)
            print("[Pipeline] Narration generated")
        except Exception as e:
            print(f"[Pipeline] Narration failed: {e}")

    # 11. PDF export (optional)
    pdf_path = None
    if generate_pdf:
        try:
            pdf_path = export_report_to_pdf(query, summary, citations, structures, funding_data, regulation_data)
            print("[Pipeline] PDF report generated")
        except Exception as e:
            print(f"[Pipeline] PDF generation failed: {e}")

    return {
        # datetime.utcnow() is deprecated; dropping tzinfo keeps the exact
        # naive-UTC ISO string that utcnow().isoformat() produced.
        "timestamp": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
        "query": query,
        "expanded_terms": expanded_terms,
        "summary": summary,
        "citations": citations,
        "structures": structures,
        "image_url": image_url,
        "funding_data": funding_data,
        "trial_data": trial_data,
        "regulation_data": regulation_data,
        "safety_data": safety_data,
        "biosecurity_data": biosecurity_data,
        "pathway_graph": pathway_graph,
        "funding_graph": funding_graph,
        "audio_url": audio_url,
        "pdf_path": pdf_path,
    }
# ---------- Wrappers for app.py ----------
def research_once(topic: str) -> Dict[str, Any]:
    """Run the research pipeline once for *topic*.

    Thin alias of ``multimodal_research`` called with only the topic, so
    narration and PDF export keep that function's defaults.
    """
    report = multimodal_research(topic)
    return report
def run_literature_review(query: str):
    """Literature review tab: return ``(summary, citations)`` for *query*.

    Runs the whole ``multimodal_research`` pipeline and keeps only those two
    fields of its result.
    """
    report = multimodal_research(query)
    summary, citations = report["summary"], report["citations"]
    return summary, citations
def run_molecule_lookup(molecule_name: str):
    """Molecule lookup tab: gather ChEMBL data, structures and an image.

    Both data lookups are best-effort: if one raises, its slot in the report
    holds ``{"error": <message>}`` and the other lookup still runs. Image
    providers are tried in order until one returns a truthy value; a provider
    that raises is logged and skipped.

    Args:
        molecule_name: Name of the molecule to look up.

    Returns:
        A ``(report, image_url)`` tuple. ``report`` is the ``str()`` of a dict
        with the keys ``"chembl"`` and ``"structures"``; ``image_url`` is the
        first truthy provider result, otherwise the last falsy value or None.
    """
    try:
        chembl_data = get_molecule_data(molecule_name)
    except Exception as e:
        chembl_data = {"error": str(e)}

    # Guarded like the ChEMBL call so a structure-service failure is reported
    # in the payload and does not abort the whole lookup.
    try:
        structures = fetch_structures_for_terms([molecule_name])
    except Exception as e:
        structures = {"error": str(e)}

    img_url = None
    for img_fn in [run_gemini_image, run_openai_image, run_hf_image]:
        try:
            img_url = img_fn(molecule_name)
        except Exception as e:
            # `Exception` and not a bare except: Ctrl-C / SystemExit must
            # still propagate, and the failure is logged, not hidden.
            print(f"[Pipeline] {img_fn.__name__} failed: {e}")
            continue
        if img_url:
            break
    return str({"chembl": chembl_data, "structures": structures}), img_url
def run_pathway_analysis(pathway_name: str):
    """Pathway analysis tab: return ``(caption, graph_data)``.

    The graph is built by ``generate_pathway_graph`` from the pathway name and
    a one-element term list containing that same name.
    """
    # NOTE(review): multimodal_research calls generate_pathway_graph with the
    # query only; confirm the expected signature.
    seed_terms = [pathway_name]
    graph_data = generate_pathway_graph(pathway_name, seed_terms)
    caption = f"Pathway analysis for {pathway_name}"
    return caption, graph_data
def run_funding_analysis(keyword: str):
    """Funding analysis tab: return funding data for *keyword* as a string.

    Whatever ``fetch_funding_data`` returns is passed through ``str()``.
    """
    return str(fetch_funding_data(keyword))
def run_image_analysis(image_path: str):
    """Image analysis tab: ask the text summarizers to analyze an image.

    Summarizers are tried in order (Gemini, OpenAI, DeepSeek) until one
    returns a truthy result; a summarizer that raises is logged and skipped.

    Args:
        image_path: Path of the image; it is interpolated into the prompt.

    Returns:
        The first truthy analysis, or ``"Image analysis failed."`` when no
        summarizer produced one.
    """
    # NOTE(review): only the path string is placed in the prompt; this
    # function never reads or uploads the image itself. Confirm whether the
    # summarizers can resolve a path on their own.
    prompt = f"Analyze this microscopy or biological image: {image_path}"
    analysis = None
    for summarizer in [run_gemini_summary, run_openai_summary, run_deepseek_summary]:
        try:
            analysis = summarizer(prompt)
        except Exception as e:
            # `Exception` and not a bare except: Ctrl-C / SystemExit must
            # still propagate, and the failure is logged, not hidden.
            print(f"[Pipeline] {summarizer.__name__} failed: {e}")
            continue
        if analysis:
            break
    return analysis or "Image analysis failed."