Spaces:
Sleeping
Sleeping
Update genesis/pipeline.py
Browse files- genesis/pipeline.py +41 -12
genesis/pipeline.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
-
# genesis/pipeline.py
|
2 |
import os
|
3 |
import re
|
4 |
from datetime import datetime
|
@@ -8,9 +7,15 @@ from .ontology import expand_terms_with_ontology
|
|
8 |
from .structures import fetch_structures_for_terms
|
9 |
from .narration import narrate_text
|
10 |
from .graphdb import write_topic_and_papers
|
11 |
-
from .providers import
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
-
|
14 |
UMLS_API_KEY = os.getenv("UMLS_API_KEY")
|
15 |
BIOPORTAL_API_KEY = os.getenv("BIOPORTAL_API_KEY")
|
16 |
NCBI_API_KEY = os.getenv("NCBI_API_KEY")
|
@@ -20,29 +25,27 @@ NEO4J_URI = os.getenv("NEO4J_URI")
|
|
20 |
|
21 |
SYNBIO_MODE = True
|
22 |
|
23 |
-
DEMO_QUERIES = [
|
24 |
-
"Map all CRISPR-based living therapeutics in clinical trials since 2020",
|
25 |
-
"Graph metabolic engineering pathways for bio-based drug production",
|
26 |
-
"Synthetic biology startups developing oncolytic viruses — funding + trials",
|
27 |
-
"3D bioprinting advances for organ transplantation with regulatory analysis",
|
28 |
-
"AI-driven biosensor design for early cancer detection"
|
29 |
-
]
|
30 |
-
|
31 |
def extract_citations(text: str) -> List[Dict[str, str]]:
    """Collect DOI, PMID, and bare-URL citations mentioned in *text*.

    Three passes keep the result grouped by type: DOIs first, then PMIDs,
    then generic URLs.  Each entry is a dict with ``type``, ``id`` (empty
    for bare URLs) and a resolvable ``url``.  A generic URL is skipped
    when an earlier entry of any type already resolved to the same link.
    """
    doi_re = r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)"
    pmid_re = r"PMID:\s*(\d+)"
    url_re = r"(https?://[^\s)]+)"

    found: List[Dict[str, str]] = [
        {"type": "DOI", "id": doi, "url": f"https://doi.org/{doi}"}
        for doi in re.findall(doi_re, text, re.IGNORECASE)
    ]
    found += [
        {"type": "PMID", "id": pmid, "url": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"}
        for pmid in re.findall(pmid_re, text, re.IGNORECASE)
    ]
    # Bare URLs are deduplicated against everything collected so far,
    # including as-we-go repeats of the same bare URL.
    for url in re.findall(url_re, text, re.IGNORECASE):
        if all(entry["url"] != url for entry in found):
            found.append({"type": "URL", "id": "", "url": url})
    return found
|
44 |
|
45 |
def synthetic_biology_prompt_inject(query: str, expanded_terms: List[str]) -> str:
|
|
|
46 |
synbio_context = (
|
47 |
"You are an expert synthetic biologist and AI researcher. "
|
48 |
"Focus on CRISPR, metabolic engineering, living therapeutics, protein design, "
|
@@ -52,16 +55,42 @@ def synthetic_biology_prompt_inject(query: str, expanded_terms: List[str]) -> st
|
|
52 |
return f"{synbio_context}\n\nQuery: {query}\nExpanded terms: {', '.join(expanded_terms)}"
|
53 |
|
54 |
def research_once(query: str, graph_preview: bool = True, narration: bool = True) -> Dict[str, Any]:
|
|
|
|
|
55 |
expanded_terms = expand_terms_with_ontology(query, UMLS_API_KEY, BIOPORTAL_API_KEY)
|
|
|
|
|
56 |
enriched_query = synthetic_biology_prompt_inject(query, expanded_terms) if SYNBIO_MODE else query
|
|
|
|
|
57 |
raw_summary = run_deepseek_summary(enriched_query)
|
|
|
|
|
58 |
polished_summary = run_gemini_polish(raw_summary)
|
59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
structures = fetch_structures_for_terms(expanded_terms)
|
|
|
|
|
61 |
visual_image_url = run_openai_image(query)
|
|
|
|
|
|
|
|
|
|
|
62 |
if graph_preview and NEO4J_URI:
|
63 |
write_topic_and_papers(query, citations, expanded_terms)
|
|
|
|
|
64 |
audio_url = narrate_text(polished_summary) if narration and ELEVEN_LABS_API_KEY else None
|
|
|
|
|
65 |
return {
|
66 |
"timestamp": datetime.utcnow().isoformat(),
|
67 |
"query": query,
|
|
|
|
|
1 |
import os
|
2 |
import re
|
3 |
from datetime import datetime
|
|
|
7 |
from .structures import fetch_structures_for_terms
|
8 |
from .narration import narrate_text
|
9 |
from .graphdb import write_topic_and_papers
|
10 |
+
from .providers import (
|
11 |
+
run_deepseek_summary,
|
12 |
+
run_gemini_polish,
|
13 |
+
run_openai_image,
|
14 |
+
run_hf_image,
|
15 |
+
pubmed_fallback_search
|
16 |
+
)
|
17 |
|
18 |
+
# Environment variables
|
19 |
UMLS_API_KEY = os.getenv("UMLS_API_KEY")
|
20 |
BIOPORTAL_API_KEY = os.getenv("BIOPORTAL_API_KEY")
|
21 |
NCBI_API_KEY = os.getenv("NCBI_API_KEY")
|
|
|
25 |
|
26 |
SYNBIO_MODE = True
|
27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
def extract_citations(text: str) -> List[Dict[str, str]]:
    """Extract citations (DOIs, PMIDs, bare URLs) from model output.

    Scans *text* in three passes -- DOIs first, then PMIDs, then generic
    URLs -- so the result stays grouped by citation type.  Each citation
    is a dict with ``type`` ("DOI" | "PMID" | "URL"), ``id`` (empty for
    bare URLs) and a resolvable ``url``.

    Fix over the previous version: deduplication now applies to every
    pass, keyed on the resolved URL.  Previously only the generic-URL
    pass deduplicated, so a DOI or PMID mentioned twice was reported
    twice.
    """
    doi_pattern = r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)"
    pmid_pattern = r"PMID:\s*(\d+)"
    url_pattern = r"(https?://[^\s)]+)"

    citations: List[Dict[str, str]] = []
    seen_urls = set()

    def _add(kind: str, ident: str, url: str) -> None:
        # One entry per resolved URL, regardless of citation type.
        if url not in seen_urls:
            seen_urls.add(url)
            citations.append({"type": kind, "id": ident, "url": url})

    for match in re.finditer(doi_pattern, text, re.IGNORECASE):
        _add("DOI", match.group(1), f"https://doi.org/{match.group(1)}")

    for match in re.finditer(pmid_pattern, text, re.IGNORECASE):
        _add("PMID", match.group(1), f"https://pubmed.ncbi.nlm.nih.gov/{match.group(1)}/")

    for match in re.finditer(url_pattern, text, re.IGNORECASE):
        _add("URL", "", match.group(1))

    return citations
|
46 |
|
47 |
def synthetic_biology_prompt_inject(query: str, expanded_terms: List[str]) -> str:
|
48 |
+
"""Bias prompt toward synthetic biology domain."""
|
49 |
synbio_context = (
|
50 |
"You are an expert synthetic biologist and AI researcher. "
|
51 |
"Focus on CRISPR, metabolic engineering, living therapeutics, protein design, "
|
|
|
55 |
return f"{synbio_context}\n\nQuery: {query}\nExpanded terms: {', '.join(expanded_terms)}"
|
56 |
|
57 |
def research_once(query: str, graph_preview: bool = True, narration: bool = True) -> Dict[str, Any]:
|
58 |
+
"""Main synthetic biology research pipeline."""
|
59 |
+
# 1. Expand query with ontology
|
60 |
expanded_terms = expand_terms_with_ontology(query, UMLS_API_KEY, BIOPORTAL_API_KEY)
|
61 |
+
|
62 |
+
# 2. Inject synthetic biology context
|
63 |
enriched_query = synthetic_biology_prompt_inject(query, expanded_terms) if SYNBIO_MODE else query
|
64 |
+
|
65 |
+
# 3. Run DeepSeek summarization
|
66 |
raw_summary = run_deepseek_summary(enriched_query)
|
67 |
+
|
68 |
+
# 4. Polish with Gemini
|
69 |
polished_summary = run_gemini_polish(raw_summary)
|
70 |
+
|
71 |
+
# 5. Extract citations
|
72 |
+
citations = extract_citations(polished_summary)
|
73 |
+
if not citations:
|
74 |
+
fallback_cites = pubmed_fallback_search(query, NCBI_API_KEY, NCBI_EMAIL)
|
75 |
+
citations.extend(fallback_cites)
|
76 |
+
|
77 |
+
# 6. Fetch molecular structures
|
78 |
structures = fetch_structures_for_terms(expanded_terms)
|
79 |
+
|
80 |
+
# 7. Generate visual diagram
|
81 |
visual_image_url = run_openai_image(query)
|
82 |
+
if not visual_image_url:
|
83 |
+
print("[Image] Falling back to Hugging Face Stable Diffusion...")
|
84 |
+
visual_image_url = run_hf_image(f"Scientific diagram about {query}")
|
85 |
+
|
86 |
+
# 8. Write to Neo4j
|
87 |
if graph_preview and NEO4J_URI:
|
88 |
write_topic_and_papers(query, citations, expanded_terms)
|
89 |
+
|
90 |
+
# 9. Narrate executive summary
|
91 |
audio_url = narrate_text(polished_summary) if narration and ELEVEN_LABS_API_KEY else None
|
92 |
+
|
93 |
+
# 10. Return structured output
|
94 |
return {
|
95 |
"timestamp": datetime.utcnow().isoformat(),
|
96 |
"query": query,
|