Spaces:

mgbam
/

Synthetic_Biology

Sleeping

File size: 3,596 Bytes

5d480b1
e2c04b6
5d480b1
 
a75a9dc
5d480b1
 
 
 
 
7f4ed38
5d480b1
 
 
 
 
 
 
 
 
a75a9dc
5d480b1
 
 
 
 
 
 
e2c04b6
 
5d480b1
 
 
 
 
 
 
 
 
 
 
 
2efa720
 
5d480b1
 
 
 
 
 
e2c04b6
5d480b1
 
 
 
 
 
 
a75a9dc
5d480b1
a75a9dc
5d480b1
 
 
a75a9dc
5d480b1

# genesis/pipeline.py
import os
import re
from datetime import datetime
from typing import Dict, Any, List

from .ontology import expand_terms_with_ontology
from .structures import fetch_structures_for_terms
from .narration import narrate_text
from .graphdb import write_topic_and_papers
from .providers import run_deepseek_summary, run_gemini_polish, run_openai_image, pubmed_fallback_search

# --- Environment-driven configuration -------------------------------------
# Every value below is read once at import time and is None when the
# corresponding environment variable is not set.

# Provider / ontology / literature credentials.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
UMLS_API_KEY = os.environ.get("UMLS_API_KEY")
BIOPORTAL_API_KEY = os.environ.get("BIOPORTAL_API_KEY")
NCBI_API_KEY = os.environ.get("NCBI_API_KEY")
NCBI_EMAIL = os.environ.get("NCBI_EMAIL")

# Optional integrations: research_once() skips narration / graph writes when
# these are unset.
ELEVEN_LABS_API_KEY = os.environ.get("ELEVEN_LABS_API_KEY")
NEO4J_URI = os.environ.get("NEO4J_URI")

# When true, research_once() wraps the query in the synthetic-biology prompt.
SYNBIO_MODE = True

# Example queries shipped with the module.
DEMO_QUERIES = [
    "Map all CRISPR-based living therapeutics in clinical trials since 2020",
    "Graph metabolic engineering pathways for bio-based drug production",
    "Synthetic biology startups developing oncolytic viruses — funding + trials",
    "3D bioprinting advances for organ transplantation with regulatory analysis",
    "AI-driven biosensor design for early cancer detection",
]

def _trim_citation_token(token: str) -> str:
    """Strip sentence punctuation that the citation regexes greedily absorb.

    Trailing ``.``, ``,``, ``;`` and ``:`` are always removed. A trailing ``)``
    is removed only while the token has more closing than opening
    parentheses, so identifiers with balanced parentheses (for example
    ``10.1016/S0140-6736(20)30183-5``) are left intact.
    """
    trimmed = token.rstrip(".,;:")
    while trimmed.endswith(")") and trimmed.count(")") > trimmed.count("("):
        trimmed = trimmed[:-1].rstrip(".,;:")
    return trimmed


def extract_citations(text: str) -> List[Dict[str, str]]:
    """Extract DOI, PMID and plain-URL citations from free text.

    Citations are returned in three passes (all DOIs, then all PMIDs, then
    all remaining URLs), each pass in order of appearance. Entries are
    de-duplicated by their resolved URL, so a DOI that also appears as a
    ``https://doi.org/...`` link is reported once, as a DOI.

    Args:
        text: Text to scan. Empty or ``None`` input yields an empty list.

    Returns:
        A list of dicts with the keys ``"type"`` (``"DOI"``, ``"PMID"`` or
        ``"URL"``), ``"id"`` (empty string for plain URLs) and ``"url"``.
    """
    if not text:
        return []

    doi_pattern = r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)"
    pmid_pattern = r"PMID:\s*(\d+)"
    url_pattern = r"(https?://[^\s)]+)"

    citations: List[Dict[str, str]] = []
    seen_urls = set()

    def _add(kind: str, ident: str, url: str) -> None:
        # De-duplicate on the resolved URL; the first occurrence wins.
        if url in seen_urls:
            return
        seen_urls.add(url)
        citations.append({"type": kind, "id": ident, "url": url})

    for match in re.finditer(doi_pattern, text, re.IGNORECASE):
        doi = _trim_citation_token(match.group(1))
        # Skip matches whose suffix was nothing but punctuation.
        if doi.partition("/")[2]:
            _add("DOI", doi, f"https://doi.org/{doi}")

    for match in re.finditer(pmid_pattern, text, re.IGNORECASE):
        pmid = match.group(1)
        _add("PMID", pmid, f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/")

    for match in re.finditer(url_pattern, text, re.IGNORECASE):
        url = _trim_citation_token(match.group(1))
        _add("URL", "", url)

    return citations

def synthetic_biology_prompt_inject(query: str, expanded_terms: List[str]) -> str:
    """Build the synthetic-biology prompt for ``query``.

    The result is a fixed expert-persona preamble, a blank line, a
    ``Query:`` line and an ``Expanded terms:`` line listing
    ``expanded_terms`` separated by ``", "``.
    """
    preamble = (
        "You are an expert synthetic biologist and AI researcher. "
        "Focus on CRISPR, metabolic engineering, living therapeutics, protein design, "
        "biosensors, and biosecurity. Integrate literature, molecular structures, market trends, "
        "and policy/regulatory outlook. Produce a structured, citation-rich report."
    )
    joined_terms = ", ".join(expanded_terms)
    prompt_lines = [
        preamble,
        "",  # blank separator between the persona and the request
        f"Query: {query}",
        f"Expanded terms: {joined_terms}",
    ]
    return "\n".join(prompt_lines)

def research_once(query: str, graph_preview: bool = True, narration: bool = True) -> Dict[str, Any]:
    """Run one research pass for ``query`` and return the collected artifacts.

    Steps, in order:

    1. Expand the query terms via ``expand_terms_with_ontology``.
    2. Wrap the query in the synthetic-biology prompt when ``SYNBIO_MODE``
       is true; otherwise use the query as-is.
    3. Summarize with ``run_deepseek_summary`` and refine the result with
       ``run_gemini_polish``.
    4. Extract citations from the polished summary; if none are found, use
       ``pubmed_fallback_search`` on the original query instead.
    5. Fetch structures for the expanded terms and request an image for the
       query.
    6. Optionally write the topic to the graph store and narrate the summary.

    Args:
        query: The research question.
        graph_preview: When true and ``NEO4J_URI`` is set, call
            ``write_topic_and_papers`` with the query, citations and terms.
        narration: When true and ``ELEVEN_LABS_API_KEY`` is set, call
            ``narrate_text`` on the polished summary.

    Returns:
        A dict with the keys ``timestamp`` (naive UTC, ISO 8601), ``query``,
        ``expanded_terms``, ``summary``, ``citations``, ``structures``,
        ``visual_image_url`` and ``audio_url`` (``None`` when narration was
        skipped).
    """
    # Function-scope import: the module header imports only ``datetime``.
    from datetime import timezone

    expanded_terms = expand_terms_with_ontology(query, UMLS_API_KEY, BIOPORTAL_API_KEY)
    enriched_query = synthetic_biology_prompt_inject(query, expanded_terms) if SYNBIO_MODE else query

    raw_summary = run_deepseek_summary(enriched_query)
    polished_summary = run_gemini_polish(raw_summary)

    # Fall back to a literature search only when the summary cites nothing.
    citations = extract_citations(polished_summary) or pubmed_fallback_search(query, NCBI_API_KEY, NCBI_EMAIL)
    structures = fetch_structures_for_terms(expanded_terms)
    visual_image_url = run_openai_image(query)

    if graph_preview and NEO4J_URI:
        write_topic_and_papers(query, citations, expanded_terms)

    audio_url = narrate_text(polished_summary) if narration and ELEVEN_LABS_API_KEY else None

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # time and drop tzinfo so the emitted string keeps its original naive
    # format (no "+00:00" suffix).
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    return {
        "timestamp": timestamp,
        "query": query,
        "expanded_terms": expanded_terms,
        "summary": polished_summary,
        "citations": citations,
        "structures": structures,
        "visual_image_url": visual_image_url,
        "audio_url": audio_url,
    }