File size: 3,957 Bytes
e2c04b6
5d480b1
 
a75a9dc
5d480b1
 
 
 
 
7b0bd9a
 
 
 
 
 
 
5d480b1
7b0bd9a
5d480b1
 
 
 
 
 
 
a75a9dc
5d480b1
 
7b0bd9a
5d480b1
 
 
 
7b0bd9a
5d480b1
 
7b0bd9a
5d480b1
 
7b0bd9a
5d480b1
 
 
7b0bd9a
2efa720
 
5d480b1
7b0bd9a
5d480b1
 
 
 
 
e2c04b6
5d480b1
 
 
7b0bd9a
 
5d480b1
7b0bd9a
 
5d480b1
7b0bd9a
 
5d480b1
7b0bd9a
 
5d480b1
7b0bd9a
 
 
 
 
 
 
 
5d480b1
7b0bd9a
 
a75a9dc
7b0bd9a
 
 
 
 
5d480b1
 
7b0bd9a
 
5d480b1
7b0bd9a
 
a75a9dc
5d480b1
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import os
import re
from datetime import datetime
from typing import Dict, Any, List

from .ontology import expand_terms_with_ontology
from .structures import fetch_structures_for_terms
from .narration import narrate_text
from .graphdb import write_topic_and_papers
from .providers import (
    run_deepseek_summary,
    run_gemini_polish,
    run_openai_image,
    run_hf_image,
    pubmed_fallback_search
)

# Service credentials and endpoints, read once from the process environment
# when this module is imported. Each value is None if the variable is unset.
UMLS_API_KEY = os.environ.get("UMLS_API_KEY")
BIOPORTAL_API_KEY = os.environ.get("BIOPORTAL_API_KEY")
NCBI_API_KEY = os.environ.get("NCBI_API_KEY")
NCBI_EMAIL = os.environ.get("NCBI_EMAIL")
ELEVEN_LABS_API_KEY = os.environ.get("ELEVEN_LABS_API_KEY")
NEO4J_URI = os.environ.get("NEO4J_URI")

# Module-wide switch: when true, research_once wraps the query with the
# synthetic-biology prompt before summarization.
SYNBIO_MODE = True

# Compiled once at import time instead of on every call.
_DOI_RE = re.compile(r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)", re.IGNORECASE)
_PMID_RE = re.compile(r"PMID:\s*(\d+)", re.IGNORECASE)
_URL_RE = re.compile(r"(https?://[^\s)]+)", re.IGNORECASE)


def _trim_trailing_punctuation(token: str) -> str:
    """Strip sentence punctuation and unbalanced ")" from the end of *token*."""
    while token:
        trimmed = token.rstrip(".,;:")
        # A closing paren with no matching opener belongs to the surrounding
        # prose, e.g. "(doi:10.1000/xyz)"; balanced ones are part of the DOI.
        if trimmed.endswith(")") and trimmed.count(")") > trimmed.count("("):
            trimmed = trimmed[:-1]
        if trimmed == token:
            break
        token = trimmed
    return token


def extract_citations(text: str) -> List[Dict[str, str]]:
    """Extract DOI, PMID and URL citations from model output.

    Citations are returned in a fixed order: all DOIs, then all PMIDs, then
    any remaining URLs, each group in order of appearance. Every entry is
    unique by its ``url`` value, so a DOI that also appears as a
    ``https://doi.org/...`` link is reported once, as a DOI.

    Args:
        text: Free text to scan. Empty or ``None`` yields an empty list.

    Returns:
        A list of dicts with the keys ``type`` (``"DOI"``, ``"PMID"`` or
        ``"URL"``), ``id`` (empty for plain URLs) and ``url``.
    """
    if not text:
        return []

    citations: List[Dict[str, str]] = []
    seen_urls = set()

    for match in _DOI_RE.finditer(text):
        # The character class admits ".", ";", ":" and ")", so a DOI at the
        # end of a sentence or inside parentheses drags punctuation along.
        doi = _trim_trailing_punctuation(match.group(1))
        if not _DOI_RE.fullmatch(doi):
            continue  # nothing usable left after trimming
        url = f"https://doi.org/{doi}"
        if url not in seen_urls:
            seen_urls.add(url)
            citations.append({"type": "DOI", "id": doi, "url": url})

    for match in _PMID_RE.finditer(text):
        pmid = match.group(1)
        url = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
        if url not in seen_urls:
            seen_urls.add(url)
            citations.append({"type": "PMID", "id": pmid, "url": url})

    for match in _URL_RE.finditer(text):
        url = _trim_trailing_punctuation(match.group(1))
        if not _URL_RE.fullmatch(url):
            continue
        if url not in seen_urls:
            seen_urls.add(url)
            citations.append({"type": "URL", "id": "", "url": url})

    return citations

def synthetic_biology_prompt_inject(query: str, expanded_terms: List[str]) -> str:
    """Wrap *query* in a synthetic-biology system preamble.

    Args:
        query: The user's original research question.
        expanded_terms: Related terms to list after the query; they are
            joined with ", ".

    Returns:
        The preamble, a blank line, then ``Query: ...`` and
        ``Expanded terms: ...`` on separate lines.
    """
    preamble_sentences = (
        "You are an expert synthetic biologist and AI researcher.",
        "Focus on CRISPR, metabolic engineering, living therapeutics, protein design, biosensors, and biosecurity.",
        "Integrate literature, molecular structures, market trends, and policy/regulatory outlook.",
        "Produce a structured, citation-rich report.",
    )
    preamble = " ".join(preamble_sentences)
    joined_terms = ", ".join(expanded_terms)

    # The empty element produces the blank line between preamble and query.
    return "\n".join(
        [
            preamble,
            "",
            f"Query: {query}",
            f"Expanded terms: {joined_terms}",
        ]
    )

def research_once(query: str, graph_preview: bool = True, narration: bool = True) -> Dict[str, Any]:
    """Run the research pipeline once for *query* and collect the results.

    The stages run strictly in this order: ontology expansion, prompt
    enrichment (only when ``SYNBIO_MODE`` is true), DeepSeek summary, Gemini
    polish, citation extraction, structure lookup, image generation, graph
    write, narration.

    Args:
        query: The research question to investigate.
        graph_preview: When true and ``NEO4J_URI`` is set, the query,
            citations and expanded terms are passed to
            ``write_topic_and_papers``.
        narration: When true and ``ELEVEN_LABS_API_KEY`` is set, the polished
            summary is passed to ``narrate_text``.

    Returns:
        A dict with the keys ``timestamp`` (naive UTC, ISO 8601), ``query``,
        ``expanded_terms``, ``summary``, ``citations``, ``structures``,
        ``visual_image_url`` and ``audio_url`` (``None`` when narration is
        skipped).
    """
    terms = expand_terms_with_ontology(query, UMLS_API_KEY, BIOPORTAL_API_KEY)

    # Only bias the prompt toward synthetic biology when the module switch is on.
    if SYNBIO_MODE:
        prompt = synthetic_biology_prompt_inject(query, terms)
    else:
        prompt = query

    # Draft with DeepSeek, then hand the draft to Gemini for polishing.
    summary = run_gemini_polish(run_deepseek_summary(prompt))

    # If the polished text carries no citations, fall back to
    # pubmed_fallback_search on the raw query.
    found_citations = extract_citations(summary)
    if not found_citations:
        found_citations.extend(pubmed_fallback_search(query, NCBI_API_KEY, NCBI_EMAIL))

    molecular_structures = fetch_structures_for_terms(terms)

    # Try OpenAI first; a falsy result triggers the Hugging Face fallback.
    image_url = run_openai_image(query)
    if not image_url:
        print("[Image] Falling back to Hugging Face Stable Diffusion...")
        image_url = run_hf_image(f"Scientific diagram about {query}")

    if graph_preview and NEO4J_URI:
        write_topic_and_papers(query, found_citations, terms)

    narration_url = None
    if narration and ELEVEN_LABS_API_KEY:
        narration_url = narrate_text(summary)

    # Timestamp is taken last so it reflects when the run finished.
    report: Dict[str, Any] = {
        "timestamp": datetime.utcnow().isoformat(),
        "query": query,
        "expanded_terms": terms,
        "summary": summary,
        "citations": found_citations,
        "structures": molecular_structures,
        "visual_image_url": image_url,
        "audio_url": narration_url,
    }
    return report