""" GENESIS-AI — Multimodal Synthetic Biology Research Pipeline Coordinates ontology expansion, literature review, AI summarization, image generation, funding intelligence, safety/biosecurity checks, and report export. """ import os import re from datetime import datetime from typing import Dict, Any, List # API clients from genesis.api_clients.pubmed_api import search_pubmed_literature from genesis.api_clients.bioportal_api import expand_with_bioportal from genesis.api_clients.umls_api import expand_with_umls from genesis.api_clients.chembl_api import get_molecule_data from genesis.structures import fetch_structures_for_terms # Core logic providers from genesis.providers import ( run_deepseek_summary, run_gemini_summary, run_openai_summary, run_gemini_image, run_openai_image, run_hf_image, narrate_text_elevenlabs ) # Utility modules from genesis.utils.pdf_export import export_report_to_pdf from genesis.utils.graph_tools import write_topic_and_papers # Visualizations from genesis.visualization import generate_pathway_graph, generate_funding_network # Data sources from genesis.funding import fetch_funding_data from genesis.trials import fetch_clinical_trials from genesis.biosecurity import analyze_biosecurity_risks from genesis.regulation import fetch_regulatory_info from genesis.safety import analyze_safety_concerns from genesis.ontology import merge_ontology_terms # Environment vars ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY") NEO4J_URI = os.getenv("NEO4J_URI") SYNBIO_MODE = True # Bias towards synthetic biology context # Demo queries DEMO_QUERIES = [ "CRISPR living therapeutics in clinical trials since 2020", "AI-designed enzymes for plastic degradation — literature + pathways", "Synthetic biology startups in oncology — funding map", "Metabolic pathway for artemisinin biosynthesis in yeast", "Oncolytic virus engineering — biosecurity risk analysis" ] # ---------- Helper Functions ---------- def extract_citations(text: str) -> List[Dict[str, str]]: """Extract DOI, PMID, and URLs from text.""" citations = [] doi_pattern = r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)" pmid_pattern = r"PMID:\s*(\d+)" url_pattern = r"(https?://[^\s)]+)" for match in re.finditer(doi_pattern, text, re.IGNORECASE): citations.append({"type": "DOI", "id": match.group(1), "url": f"https://doi.org/{match.group(1)}"}) for match in re.finditer(pmid_pattern, text, re.IGNORECASE): citations.append({"type": "PMID", "id": match.group(1), "url": f"https://pubmed.ncbi.nlm.nih.gov/{match.group(1)}/"}) for match in re.finditer(url_pattern, text, re.IGNORECASE): if not any(c["url"] == match.group(1) for c in citations): citations.append({"type": "URL", "id": "", "url": match.group(1)}) return citations def inject_synbio_context(query: str, expanded_terms: List[str]) -> str: """Inject synthetic biology expertise into the prompt.""" context = ( "You are an expert in synthetic biology, biosecurity, and regulatory affairs. " "Provide literature review, molecular insights, market trends, and policy implications. " "Focus on CRISPR, metabolic engineering, living therapeutics, protein design, biosensors, and biosecurity. " "Be concise, factual, and provide citations." ) return f"{context}\n\nQuery: {query}\nExpanded terms: {', '.join(expanded_terms)}" # ---------- Main Pipeline ---------- def multimodal_research(query: str, narration: bool = False, generate_pdf: bool = False) -> Dict[str, Any]: """Main research pipeline for GENESIS-AI.""" print(f"[Pipeline] Starting research for query: {query}") # 1 — Expand query with ontology expanded_terms = merge_ontology_terms( query, expand_with_umls(query), expand_with_bioportal(query) ) print(f"[Pipeline] Expanded terms: {expanded_terms}") # 2 — Enrich query with domain-specific context enriched_query = inject_synbio_context(query, expanded_terms) if SYNBIO_MODE else query # 3 — Summarization (fallback order) summary = None for summarizer in [run_deepseek_summary, run_gemini_summary, run_openai_summary]: try: summary = summarizer(enriched_query) if summary: print(f"[Pipeline] Summary generated by {summarizer.__name__}") break except Exception as e: print(f"[Pipeline] {summarizer.__name__} failed: {e}") if not summary: summary = "No summary generated — please refine your query." # 4 — Extract citations, fallback to PubMed if none found citations = extract_citations(summary) if not citations: print("[Pipeline] No citations in summary, querying PubMed...") citations = search_pubmed_literature(query) # 5 — Fetch related structures (NCBI, ChEMBL) structures = fetch_structures_for_terms(expanded_terms) # 6 — Image generation with fallback image_url = None for img_fn in [run_gemini_image, run_openai_image, run_hf_image]: try: image_url = img_fn(query) if image_url: print(f"[Pipeline] Image generated by {img_fn.__name__}") break except Exception as e: print(f"[Pipeline] {img_fn.__name__} failed: {e}") # 7 — Funding, trials, regulations, safety, biosecurity funding_data = fetch_funding_data(query) or [] trial_data = fetch_clinical_trials(query) or [] regulation_data = fetch_regulatory_info(query) or [] safety_data = analyze_safety_concerns(query) or [] biosecurity_data = analyze_biosecurity_risks(query) or [] # 8 — Graph visualizations pathway_graph = generate_pathway_graph(query) if expanded_terms else None funding_graph = generate_funding_network(query) if funding_data else None # 9 — Save to Neo4j if configured if NEO4J_URI: try: write_topic_and_papers(query, citations, expanded_terms) print("[Pipeline] Data saved to Neo4j") except Exception as e: print(f"[Pipeline] Neo4j save failed: {e}") # 10 — Narration (optional) audio_url = None if narration and ELEVEN_LABS_API_KEY: try: audio_url = narrate_text_elevenlabs(summary) print("[Pipeline] Narration generated") except Exception as e: print(f"[Pipeline] Narration failed: {e}") # 11 — PDF export (optional) pdf_path = None if generate_pdf: try: pdf_path = export_report_to_pdf(query, summary, citations, structures, funding_data, regulation_data) print("[Pipeline] PDF report generated") except Exception as e: print(f"[Pipeline] PDF generation failed: {e}") return { "timestamp": datetime.utcnow().isoformat(), "query": query, "expanded_terms": expanded_terms, "summary": summary, "citations": citations, "structures": structures, "image_url": image_url, "funding_data": funding_data, "trial_data": trial_data, "regulation_data": regulation_data, "safety_data": safety_data, "biosecurity_data": biosecurity_data, "pathway_graph": pathway_graph, "funding_graph": funding_graph, "audio_url": audio_url, "pdf_path": pdf_path } # ---------- Wrappers for app.py ---------- def research_once(topic: str) -> Dict[str, Any]: """Alias for multimodal_research.""" return multimodal_research(topic) def run_literature_review(query: str): """For literature review tab.""" result = multimodal_research(query) return result["summary"], result["citations"] def run_molecule_lookup(molecule_name: str): """For molecule lookup tab.""" try: chembl_data = get_molecule_data(molecule_name) except Exception as e: chembl_data = {"error": str(e)} structures = fetch_structures_for_terms([molecule_name]) img_url = None for img_fn in [run_gemini_image, run_openai_image, run_hf_image]: try: img_url = img_fn(molecule_name) if img_url: break except: pass return str({"chembl": chembl_data, "structures": structures}), img_url def run_pathway_analysis(pathway_name: str): """For pathway analysis tab.""" graph_data = generate_pathway_graph(pathway_name, [pathway_name]) return f"Pathway analysis for {pathway_name}", graph_data def run_funding_analysis(keyword: str): """For funding analysis tab.""" funding_info = fetch_funding_data(keyword) return str(funding_info) def run_image_analysis(image_path: str): """For image analysis tab.""" prompt = f"Analyze this microscopy or biological image: {image_path}" analysis = None for summarizer in [run_gemini_summary, run_openai_summary, run_deepseek_summary]: try: analysis = summarizer(prompt) if analysis: break except: pass return analysis or "Image analysis failed."