Spaces:

mgbam
/

Synthetic_Biology

Sleeping

File size: 7,359 Bytes

4d521f6
e2c04b6
6f23dc2
fa94666
875ce0d
fa94666
6f23dc2
 
 
 
 
 
e844fe0
 
6f23dc2
875ce0d
47975fb
fa94666
6f23dc2
 
 
 
 
 
7b0bd9a
6f23dc2
 
 
 
 
 
 
 
 
875ce0d
6f23dc2
 
fa94666
6f23dc2
 
 
5d480b1
875ce0d
7b0bd9a
875ce0d
fa94666
6f23dc2
 
 
 
 
fa94666
7b0bd9a
6f23dc2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa94666
6f23dc2
 
 
 
fa94666
 
7b0bd9a
fa94666
6f23dc2
 
875ce0d
 
6f23dc2
 
 
 
 
 
875ce0d
6f23dc2
 
 
 
 
 
 
 
 
 
875ce0d
6f23dc2
875ce0d
 
fa94666
6f23dc2
 
fa94666
6f23dc2
 
 
875ce0d
6f23dc2
 
 
fa94666
 
6f23dc2
 
 
 
 
 
875ce0d
6f23dc2
875ce0d
 
6f23dc2
 
875ce0d
 
 
 
 
fa94666
6f23dc2
875ce0d
 
6f23dc2
 
 
875ce0d
 
 
 
 
fa94666
6f23dc2
 
 
875ce0d
 
 
 
 
6f23dc2
 
 
 
875ce0d
 
 
 
 
7b0bd9a
6f23dc2
fa94666
 
 
 
6f23dc2
fa94666
 
6f23dc2
 
 
 
 
 
 
 
 
 
5d480b1

# genesis/pipeline.py
import os
import re
from datetime import datetime
from typing import Dict, Any, List

# API client imports
from genesis.api_clients.pubmed_api import search_pubmed_literature
from genesis.api_clients.bioportal_api import expand_with_bioportal
from genesis.api_clients.umls_api import expand_with_umls
from genesis.api_clients.chembl_api import get_molecule_data
from genesis.api_clients.ncbi_api import fetch_ncbi_structure
from genesis.utils.pdf_export import export_report_to_pdf


# Core logic providers
from genesis.providers import (
    run_deepseek_summary,
    run_gemini_summary,
    run_openai_summary,
    run_gemini_image,
    run_openai_image,
    run_hf_image,
    narrate_text_elevenlabs
)
from genesis.utils.pdf_export import export_report_to_pdf
from genesis.visualization import generate_pathway_graph, generate_funding_network
from genesis.funding import fetch_funding_data
from genesis.trials import fetch_clinical_trials
from genesis.biosecurity import analyze_biosecurity_risks
from genesis.regulation import fetch_regulatory_info
from genesis.safety import analyze_safety_concerns
from genesis.structures import fetch_structures_for_terms
from genesis.ontology import merge_ontology_terms
from genesis.utils.graph_tools import write_topic_and_papers

# Environment vars — read once at import time; each is None when the variable is unset.
# ELEVEN_LABS_API_KEY gates the optional narration step in multimodal_research.
ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")
# NEO4J_URI gates the optional Neo4j save step in multimodal_research.
NEO4J_URI = os.getenv("NEO4J_URI")
# NOTE(review): NEO4J_USER / NEO4J_PASSWORD are not referenced in the visible part of
# this module — presumably consumed by the graph tooling; verify before removing.
NEO4J_USER = os.getenv("NEO4J_USER")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

# When True, multimodal_research wraps the query with inject_synbio_context
# before handing it to the summarizers; when False the raw query is used.
SYNBIO_MODE = True  # Bias towards synthetic biology context

# Demo queries for UI preload
DEMO_QUERIES = [
    "CRISPR living therapeutics in clinical trials since 2020",
    "AI-designed enzymes for plastic degradation — literature + pathways",
    "Synthetic biology startups in oncology — funding map",
    "Metabolic pathway for artemisinin biosynthesis in yeast",
    "Oncolytic virus engineering — biosecurity risk analysis"
]


# Compiled once at import time; all three are matched case-insensitively.
_DOI_RE = re.compile(r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)", re.IGNORECASE)
_PMID_RE = re.compile(r"PMID:\s*(\d+)", re.IGNORECASE)
_URL_RE = re.compile(r"(https?://[^\s)]+)", re.IGNORECASE)


def _trim_trailing_punctuation(token: str) -> str:
    """Strip sentence punctuation and unbalanced ')' from the end of *token*."""
    while token:
        last = token[-1]
        if last in ".,;:!?'\">":
            token = token[:-1]
        elif last == ")" and token.count(")") > token.count("("):
            # A closing paren with no partner belongs to the surrounding prose,
            # e.g. "(doi: 10.1000/abc)"; balanced parens inside a DOI are kept.
            token = token[:-1]
        else:
            break
    return token


def extract_citations(text: str) -> List[Dict[str, str]]:
    """Extract citations (DOI, PMID, URLs) from text.

    Args:
        text: Free text to scan. Empty or falsy input yields an empty list.

    Returns:
        A list of dicts with the keys ``"type"`` (``"DOI"``, ``"PMID"`` or
        ``"URL"``), ``"id"`` (the DOI or PMID; empty for plain URLs) and
        ``"url"``. DOIs come first, then PMIDs, then URLs, each in order of
        appearance. Entries are unique by ``"url"``.
    """
    if not text:
        return []

    citations: List[Dict[str, str]] = []
    seen_urls = set()

    def add(kind: str, identifier: str, url: str) -> None:
        # De-duplicate on the resolved URL so a DOI that is also written as a
        # https://doi.org/... link (or simply repeated) is reported once.
        if url not in seen_urls:
            seen_urls.add(url)
            citations.append({"type": kind, "id": identifier, "url": url})

    for match in _DOI_RE.finditer(text):
        # The DOI character class includes '.', ';', ':' and ')', so a match at
        # the end of a sentence swallows the sentence punctuation; trim it.
        doi = _trim_trailing_punctuation(match.group(1))
        if _DOI_RE.fullmatch(doi):  # skip matches left with an empty suffix
            add("DOI", doi, f"https://doi.org/{doi}")

    for match in _PMID_RE.finditer(text):
        pmid = match.group(1)
        add("PMID", pmid, f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/")

    for match in _URL_RE.finditer(text):
        url = _trim_trailing_punctuation(match.group(1))
        if _URL_RE.fullmatch(url):  # skip a bare scheme such as "https://"
            add("URL", "", url)

    return citations


def inject_synbio_context(query: str, expanded_terms: List[str]) -> str:
    """Build a prompt that prefixes *query* with a synthetic-biology expert persona.

    Args:
        query: The user's research question, inserted verbatim.
        expanded_terms: Related terms, rendered as a comma-separated list.

    Returns:
        The persona text, a blank line, a ``Query:`` line and an
        ``Expanded terms:`` line, in that order.
    """
    persona_parts = (
        "You are an expert in synthetic biology, biosecurity, and regulatory affairs. ",
        "Provide literature review, molecular insights, market trends, and policy implications. ",
        "Focus on CRISPR, metabolic engineering, living therapeutics, protein design, biosensors, and biosecurity. ",
        "Be concise, factual, and provide citations.",
    )
    persona = "".join(persona_parts)
    term_list = ", ".join(expanded_terms)
    prompt_lines = [persona, "", f"Query: {query}", f"Expanded terms: {term_list}"]
    return "\n".join(prompt_lines)


def _first_successful(providers: List[Any], prompt: str, label: str) -> Any:
    """Try each provider with *prompt* in order and stop at the first truthy result.

    A provider that raises is reported and skipped. If no provider yields a
    truthy value, the last value returned (or None if none returned) is passed
    back so the caller can decide on a fallback.
    """
    result = None
    for provider in providers:
        try:
            result = provider(prompt)
        except Exception as e:
            print(f"[Pipeline] {provider.__name__} failed: {e}")
            continue
        if result:
            print(f"[Pipeline] {label} generated by {provider.__name__}")
            break
    return result


def _best_effort(label: str, fn: Any, *args: Any) -> Any:
    """Return ``fn(*args)``, or None (after reporting the error) if it raises."""
    try:
        return fn(*args)
    except Exception as e:
        print(f"[Pipeline] {label} failed: {e}")
        return None


def multimodal_research(query: str, narration: bool = False, generate_pdf: bool = False) -> Dict[str, Any]:
    """Run the multi-modal synthetic biology research pipeline for *query*.

    The pipeline expands the query with ontology terms, produces a summary via
    the first summarizer that succeeds, collects citations (falling back to a
    PubMed search when the summary contains none), and gathers structures, an
    image, funding/trial/regulatory/safety/biosecurity data and graphs.
    Secondary steps are best-effort: a failure is printed and the
    corresponding field falls back to an empty list or None.

    Args:
        query: The research question.
        narration: When True and ``ELEVEN_LABS_API_KEY`` is set, narrate the
            summary and report the result under ``"audio_url"``.
        generate_pdf: When True, export a PDF report and report its location
            under ``"pdf_path"``.

    Returns:
        A dict with the keys ``timestamp``, ``query``, ``expanded_terms``,
        ``summary``, ``citations``, ``structures``, ``image_url``,
        ``funding_data``, ``trial_data``, ``regulation_data``, ``safety_data``,
        ``biosecurity_data``, ``pathway_graph``, ``funding_graph``,
        ``audio_url`` and ``pdf_path``.

    Raises:
        Exception: Whatever ontology expansion (step 1) or structure lookup
            (step 5) raises; those two steps are not guarded.
    """
    # Local import keeps this block self-contained; the module only imports
    # `datetime` from the datetime package.
    from datetime import timezone

    print(f"[Pipeline] Starting research for query: {query}")

    # 1 — Expand query with ontology
    # NOTE(review): not guarded — no safe fallback value for the expanded
    # terms is evident here, so a failing expander still propagates.
    expanded_terms = merge_ontology_terms(
        query,
        expand_with_umls(query),
        expand_with_bioportal(query)
    )
    print(f"[Pipeline] Expanded terms: {expanded_terms}")

    # 2 — Inject domain-specific context
    enriched_query = inject_synbio_context(query, expanded_terms) if SYNBIO_MODE else query

    # 3 — Summarization with fallback
    summary = _first_successful(
        [run_deepseek_summary, run_gemini_summary, run_openai_summary],
        enriched_query,
        "Summary",
    )
    if not summary:
        summary = "No summary generated — please refine your query."

    # 4 — Citations extraction & PubMed fallback
    citations = extract_citations(summary)
    if not citations:
        print("[Pipeline] No citations in summary, querying PubMed...")
        # The fallback must not discard an already generated summary: a
        # failed or empty PubMed lookup leaves an empty citation list.
        citations = _best_effort("PubMed search", search_pubmed_literature, query) or []

    # 5 — Structures (NCBI, ChEMBL)
    # NOTE(review): not guarded — the expected shape of the result is not
    # visible here, so no fallback value is substituted.
    structures = fetch_structures_for_terms(expanded_terms)

    # 6 — Image generation with fallback (uses the raw query, not the enriched prompt)
    image_url = _first_successful(
        [run_gemini_image, run_openai_image, run_hf_image],
        query,
        "Image",
    )

    # 7 — Funding, Trials, Regulation, Safety, Biosecurity
    funding_data = _best_effort("Funding lookup", fetch_funding_data, query) or []
    trial_data = _best_effort("Clinical trials lookup", fetch_clinical_trials, query) or []
    regulation_data = _best_effort("Regulatory lookup", fetch_regulatory_info, query) or []
    safety_data = _best_effort("Safety analysis", analyze_safety_concerns, query) or []
    biosecurity_data = _best_effort("Biosecurity analysis", analyze_biosecurity_risks, query) or []

    # 8 — Graph visualizations (only built when there is data to draw)
    pathway_graph = (
        _best_effort("Pathway graph", generate_pathway_graph, query, expanded_terms)
        if expanded_terms else None
    )
    funding_graph = (
        _best_effort("Funding graph", generate_funding_network, query, funding_data)
        if funding_data else None
    )

    # 9 — Save to Neo4j
    if NEO4J_URI:
        try:
            write_topic_and_papers(query, citations, expanded_terms)
            print("[Pipeline] Data saved to Neo4j")
        except Exception as e:
            print(f"[Pipeline] Neo4j save failed: {e}")

    # 10 — Optional narration
    audio_url = None
    if narration and ELEVEN_LABS_API_KEY:
        try:
            audio_url = narrate_text_elevenlabs(summary)
            print("[Pipeline] Narration generated")
        except Exception as e:
            print(f"[Pipeline] Narration failed: {e}")

    # 11 — Optional PDF export
    pdf_path = None
    if generate_pdf:
        try:
            pdf_path = export_report_to_pdf(query, summary, citations, structures, funding_data, regulation_data)
            print("[Pipeline] PDF report generated")
        except Exception as e:
            print(f"[Pipeline] PDF generation failed: {e}")

    # 12 — Build output
    # datetime.utcnow() is deprecated since Python 3.12. Dropping tzinfo keeps
    # the emitted string in the same naive-UTC ISO format as before.
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    return {
        "timestamp": timestamp,
        "query": query,
        "expanded_terms": expanded_terms,
        "summary": summary,
        "citations": citations,
        "structures": structures,
        "image_url": image_url,
        "funding_data": funding_data,
        "trial_data": trial_data,
        "regulation_data": regulation_data,
        "safety_data": safety_data,
        "biosecurity_data": biosecurity_data,
        "pathway_graph": pathway_graph,
        "funding_graph": funding_graph,
        "audio_url": audio_url,
        "pdf_path": pdf_path
    }