File size: 7,359 Bytes
4d521f6
e2c04b6
6f23dc2
fa94666
875ce0d
fa94666
6f23dc2
 
 
 
 
 
e844fe0
 
6f23dc2
875ce0d
47975fb
fa94666
6f23dc2
 
 
 
 
 
7b0bd9a
6f23dc2
 
 
 
 
 
 
 
 
875ce0d
6f23dc2
 
fa94666
6f23dc2
 
 
5d480b1
875ce0d
7b0bd9a
875ce0d
fa94666
6f23dc2
 
 
 
 
fa94666
7b0bd9a
6f23dc2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa94666
6f23dc2
 
 
 
fa94666
 
7b0bd9a
fa94666
6f23dc2
 
875ce0d
 
6f23dc2
 
 
 
 
 
875ce0d
6f23dc2
 
 
 
 
 
 
 
 
 
875ce0d
6f23dc2
875ce0d
 
fa94666
6f23dc2
 
fa94666
6f23dc2
 
 
875ce0d
6f23dc2
 
 
fa94666
 
6f23dc2
 
 
 
 
 
875ce0d
6f23dc2
875ce0d
 
6f23dc2
 
875ce0d
 
 
 
 
fa94666
6f23dc2
875ce0d
 
6f23dc2
 
 
875ce0d
 
 
 
 
fa94666
6f23dc2
 
 
875ce0d
 
 
 
 
6f23dc2
 
 
 
875ce0d
 
 
 
 
7b0bd9a
6f23dc2
fa94666
 
 
 
6f23dc2
fa94666
 
6f23dc2
 
 
 
 
 
 
 
 
 
5d480b1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# genesis/pipeline.py
import os
import re
from datetime import datetime
from typing import Dict, Any, List

# API client imports
from genesis.api_clients.pubmed_api import search_pubmed_literature
from genesis.api_clients.bioportal_api import expand_with_bioportal
from genesis.api_clients.umls_api import expand_with_umls
from genesis.api_clients.chembl_api import get_molecule_data
from genesis.api_clients.ncbi_api import fetch_ncbi_structure
from genesis.utils.pdf_export import export_report_to_pdf


# Core logic providers
from genesis.providers import (
    run_deepseek_summary,
    run_gemini_summary,
    run_openai_summary,
    run_gemini_image,
    run_openai_image,
    run_hf_image,
    narrate_text_elevenlabs
)
from genesis.utils.pdf_export import export_report_to_pdf
from genesis.visualization import generate_pathway_graph, generate_funding_network
from genesis.funding import fetch_funding_data
from genesis.trials import fetch_clinical_trials
from genesis.biosecurity import analyze_biosecurity_risks
from genesis.regulation import fetch_regulatory_info
from genesis.safety import analyze_safety_concerns
from genesis.structures import fetch_structures_for_terms
from genesis.ontology import merge_ontology_terms
from genesis.utils.graph_tools import write_topic_and_papers

# Runtime configuration read from the process environment at import time.
# Each value is None when the corresponding variable is unset.
ELEVEN_LABS_API_KEY = os.environ.get("ELEVEN_LABS_API_KEY")  # narration runs only when this is set
NEO4J_URI = os.environ.get("NEO4J_URI")  # the Neo4j save step runs only when this is set
NEO4J_USER = os.environ.get("NEO4J_USER")  # NOTE(review): not referenced in this module's visible code
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")  # NOTE(review): not referenced in this module's visible code

# When True, queries are wrapped with the synthetic-biology expert preamble
# before being sent to the summarizers.
SYNBIO_MODE = True

# Demo queries for UI preload.
# The separators are real em dashes (U+2014); the previous text carried the
# mis-decoded byte sequence of that character, which would be shown verbatim.
DEMO_QUERIES = [
    "CRISPR living therapeutics in clinical trials since 2020",
    "AI-designed enzymes for plastic degradation — literature + pathways",
    "Synthetic biology startups in oncology — funding map",
    "Metabolic pathway for artemisinin biosynthesis in yeast",
    "Oncolytic virus engineering — biosecurity risk analysis",
]


# Compiled once at import time; all three are matched case-insensitively.
_DOI_RE = re.compile(r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)", re.IGNORECASE)
_PMID_RE = re.compile(r"PMID:\s*(\d+)", re.IGNORECASE)
_URL_RE = re.compile(r"(https?://[^\s)]+)", re.IGNORECASE)


def _trim_trailing_punct(token: str) -> str:
    """Drop sentence punctuation and unbalanced ')' that the regex swept up at the end."""
    while token:
        last = token[-1]
        if last in ".,;:":
            token = token[:-1]
        elif last == ")" and token.count(")") > token.count("("):
            # Keep balanced parentheses such as "S0140-6736(20)30183-5".
            token = token[:-1]
        else:
            break
    return token


def extract_citations(text: str) -> List[Dict[str, str]]:
    """Extract citations (DOI, PMID, URLs) from text.

    Args:
        text: Free text to scan. Empty or ``None`` yields an empty list.

    Returns:
        A list of ``{"type", "id", "url"}`` dicts: all DOIs first, then PMIDs,
        then bare URLs, each group in order of appearance. ``type`` is one of
        ``"DOI"``, ``"PMID"`` or ``"URL"``; ``id`` is empty for bare URLs.
        Entries are unique by ``url``, so a URL that merely spells out an
        already-found DOI/PMID link is not repeated.
    """
    if not text:
        return []

    citations: List[Dict[str, str]] = []
    seen_urls = set()

    def add(kind: str, ident: str, url: str) -> None:
        # De-duplicate on the resolved link so repeated mentions collapse.
        if url not in seen_urls:
            seen_urls.add(url)
            citations.append({"type": kind, "id": ident, "url": url})

    for match in _DOI_RE.finditer(text):
        doi = _trim_trailing_punct(match.group(1))
        # Trimming can leave a bare prefix ("10.1234/"); skip those.
        if _DOI_RE.fullmatch(doi):
            add("DOI", doi, f"https://doi.org/{doi}")

    for match in _PMID_RE.finditer(text):
        pmid = match.group(1)
        add("PMID", pmid, f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/")

    for match in _URL_RE.finditer(text):
        url = _trim_trailing_punct(match.group(1))
        if _URL_RE.fullmatch(url):
            add("URL", "", url)

    return citations


def inject_synbio_context(query: str, expanded_terms: List[str]) -> str:
    """Wrap ``query`` in the synthetic-biology expert prompt.

    The result is the fixed expert preamble, a blank line, a ``Query:`` line
    holding ``query`` and an ``Expanded terms:`` line holding
    ``expanded_terms`` joined with ", ".
    """
    persona_sentences = (
        "You are an expert in synthetic biology, biosecurity, and regulatory affairs.",
        "Provide literature review, molecular insights, market trends, and policy implications.",
        "Focus on CRISPR, metabolic engineering, living therapeutics, protein design, biosensors, and biosecurity.",
        "Be concise, factual, and provide citations.",
    )
    preamble = " ".join(persona_sentences)
    term_list = ", ".join(expanded_terms)
    # The empty element produces the blank line between preamble and query.
    sections = [preamble, "", f"Query: {query}", f"Expanded terms: {term_list}"]
    return "\n".join(sections)


def _first_successful(providers: List[Any], prompt: str, label: str) -> Any:
    """Call each provider with ``prompt`` in order until one returns a truthy value.

    A provider that raises is reported on stdout and skipped. ``label`` is only
    used in the success message. Returns the last value a provider returned
    (truthy on success), or ``None`` if no provider returned at all.
    """
    result = None
    for provider in providers:
        try:
            result = provider(prompt)
        except Exception as exc:  # best-effort: fall through to the next provider
            print(f"[Pipeline] {provider.__name__} failed: {exc}")
            continue
        if result:
            print(f"[Pipeline] {label} generated by {provider.__name__}")
            break
    return result


def multimodal_research(query: str, narration: bool = False, generate_pdf: bool = False) -> Dict[str, Any]:
    """Main multi-modal synthetic biology pipeline.

    Args:
        query: The user's research question.
        narration: When True and ``ELEVEN_LABS_API_KEY`` is set, also request
            an audio narration of the summary.
        generate_pdf: When True, also export a PDF report.

    Returns:
        A dict with the keys ``timestamp``, ``query``, ``expanded_terms``,
        ``summary``, ``citations``, ``structures``, ``image_url``,
        ``funding_data``, ``trial_data``, ``regulation_data``, ``safety_data``,
        ``biosecurity_data``, ``pathway_graph``, ``funding_graph``,
        ``audio_url`` and ``pdf_path``. Optional artefacts that were not
        requested or failed are ``None``.

    Summarization, image generation, the Neo4j save, narration and PDF export
    are best-effort (failures are printed and skipped). The ontology, PubMed,
    structure, funding/trial/regulation/safety/biosecurity and graph calls are
    not guarded here, so an exception from them propagates to the caller.
    """
    print(f"[Pipeline] Starting research for query: {query}")

    # 1: Expand query with ontology
    expanded_terms = merge_ontology_terms(
        query,
        expand_with_umls(query),
        expand_with_bioportal(query),
    )
    print(f"[Pipeline] Expanded terms: {expanded_terms}")

    # 2: Inject domain-specific context
    enriched_query = inject_synbio_context(query, expanded_terms) if SYNBIO_MODE else query

    # 3: Summarization with fallback (providers tried in this order)
    summary = _first_successful(
        [run_deepseek_summary, run_gemini_summary, run_openai_summary],
        enriched_query,
        "Summary",
    )
    if not summary:
        summary = "No summary generated — please refine your query."

    # 4: Citations extraction & PubMed fallback
    citations = extract_citations(summary)
    if not citations:
        print("[Pipeline] No citations in summary, querying PubMed...")
        # Normalise a None/empty result to a list, like the fetchers in step 7.
        # NOTE(review): assumes search_pubmed_literature returns a list-like of
        # citation records; confirm against the API client.
        citations = search_pubmed_literature(query) or []

    # 5: Structures (NCBI, ChEMBL)
    structures = fetch_structures_for_terms(expanded_terms)

    # 6: Image generation with fallback; uses the raw query, not the enriched prompt
    image_url = _first_successful(
        [run_gemini_image, run_openai_image, run_hf_image],
        query,
        "Image",
    )

    # 7: Funding, Trials, Regulation, Safety, Biosecurity
    funding_data = fetch_funding_data(query) or []
    trial_data = fetch_clinical_trials(query) or []
    regulation_data = fetch_regulatory_info(query) or []
    safety_data = analyze_safety_concerns(query) or []
    biosecurity_data = analyze_biosecurity_risks(query) or []

    # 8: Graph visualizations (skipped when there is nothing to draw)
    pathway_graph = generate_pathway_graph(query, expanded_terms) if expanded_terms else None
    funding_graph = generate_funding_network(query, funding_data) if funding_data else None

    # 9: Save to Neo4j (only when a URI is configured)
    if NEO4J_URI:
        try:
            write_topic_and_papers(query, citations, expanded_terms)
            print("[Pipeline] Data saved to Neo4j")
        except Exception as e:
            print(f"[Pipeline] Neo4j save failed: {e}")

    # 10: Optional narration
    audio_url = None
    if narration and ELEVEN_LABS_API_KEY:
        try:
            audio_url = narrate_text_elevenlabs(summary)
            print("[Pipeline] Narration generated")
        except Exception as e:
            print(f"[Pipeline] Narration failed: {e}")

    # 11: Optional PDF export
    pdf_path = None
    if generate_pdf:
        try:
            pdf_path = export_report_to_pdf(query, summary, citations, structures, funding_data, regulation_data)
            print("[Pipeline] PDF report generated")
        except Exception as e:
            print(f"[Pipeline] PDF generation failed: {e}")

    # 12: Build output
    return {
        # Naive UTC timestamp without offset, kept for output compatibility.
        # NOTE(review): datetime.utcnow() is deprecated since Python 3.12.
        "timestamp": datetime.utcnow().isoformat(),
        "query": query,
        "expanded_terms": expanded_terms,
        "summary": summary,
        "citations": citations,
        "structures": structures,
        "image_url": image_url,
        "funding_data": funding_data,
        "trial_data": trial_data,
        "regulation_data": regulation_data,
        "safety_data": safety_data,
        "biosecurity_data": biosecurity_data,
        "pathway_graph": pathway_graph,
        "funding_graph": funding_graph,
        "audio_url": audio_url,
        "pdf_path": pdf_path,
    }