Update genesis/pipeline.py
genesis/pipeline.py    +5 -45    CHANGED
@@ -1,31 +1,25 @@
 # genesis/pipeline.py
 import os
 import re
-from typing import Dict, Any, List
 from datetime import datetime
+from typing import Dict, Any, List
 
 from .ontology import expand_terms_with_ontology
 from .structures import fetch_structures_for_terms
 from .narration import narrate_text
 from .graphdb import write_topic_and_papers
 from .providers import run_deepseek_summary, run_gemini_polish, run_openai_image, pubmed_fallback_search
-# genesis/pipeline.py
-from .providers import run_deepseek_summary, run_gemini_polish, run_gemini_image, pubmed_fallback_search
 
-# ===== ENV VARIABLES =====
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 UMLS_API_KEY = os.getenv("UMLS_API_KEY")
 BIOPORTAL_API_KEY = os.getenv("BIOPORTAL_API_KEY")
 NCBI_API_KEY = os.getenv("NCBI_API_KEY")
 NCBI_EMAIL = os.getenv("NCBI_EMAIL")
 ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")
-NEO4J_USER = os.getenv("NEO4J_USER")
-NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
 NEO4J_URI = os.getenv("NEO4J_URI")
 
-SYNBIO_MODE = True
+SYNBIO_MODE = True
 
-# ===== DEMO PRESETS =====
 DEMO_QUERIES = [
     "Map all CRISPR-based living therapeutics in clinical trials since 2020",
     "Graph metabolic engineering pathways for bio-based drug production",
@@ -34,29 +28,21 @@ DEMO_QUERIES = [
     "AI-driven biosensor design for early cancer detection"
 ]
 
-
 def extract_citations(text: str) -> List[Dict[str, str]]:
-    """Extract citations from model output."""
     citations = []
     doi_pattern = r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)"
    pmid_pattern = r"PMID:\s*(\d+)"
     url_pattern = r"(https?://[^\s)]+)"
-
     for match in re.finditer(doi_pattern, text, re.IGNORECASE):
         citations.append({"type": "DOI", "id": match.group(1), "url": f"https://doi.org/{match.group(1)}"})
-
     for match in re.finditer(pmid_pattern, text, re.IGNORECASE):
         citations.append({"type": "PMID", "id": match.group(1), "url": f"https://pubmed.ncbi.nlm.nih.gov/{match.group(1)}/"})
-
     for match in re.finditer(url_pattern, text, re.IGNORECASE):
         if not any(c["url"] == match.group(1) for c in citations):
             citations.append({"type": "URL", "id": "", "url": match.group(1)})
-
     return citations
 
-
 def synthetic_biology_prompt_inject(query: str, expanded_terms: List[str]) -> str:
-    """Bias prompt toward synthetic biology domain."""
     synbio_context = (
         "You are an expert synthetic biologist and AI researcher. "
         "Focus on CRISPR, metabolic engineering, living therapeutics, protein design, "
@@ -65,43 +51,18 @@ def synthetic_biology_prompt_inject(query: str, expanded_terms: List[str]) -> st
     )
     return f"{synbio_context}\n\nQuery: {query}\nExpanded terms: {', '.join(expanded_terms)}"
 
-
 def research_once(query: str, graph_preview: bool = True, narration: bool = True) -> Dict[str, Any]:
-    """Main synthetic biology research pipeline."""
-    # 1. Expand query with ontology
     expanded_terms = expand_terms_with_ontology(query, UMLS_API_KEY, BIOPORTAL_API_KEY)
-
-    # 2. Inject synthetic biology context
     enriched_query = synthetic_biology_prompt_inject(query, expanded_terms) if SYNBIO_MODE else query
-
-    # 3. Run DeepSeek summarization
     raw_summary = run_deepseek_summary(enriched_query)
-
-    # 4. Polish with Gemini
     polished_summary = run_gemini_polish(raw_summary)
-
-    # 5. Extract citations
-    citations = extract_citations(polished_summary)
-    if not citations:
-        # fallback: run PubMed search
-        fallback_cites = pubmed_fallback_search(query, NCBI_API_KEY, NCBI_EMAIL)
-        citations.extend(fallback_cites)
-
-    # 6. Fetch molecular structures
+    citations = extract_citations(polished_summary) or pubmed_fallback_search(query, NCBI_API_KEY, NCBI_EMAIL)
     structures = fetch_structures_for_terms(expanded_terms)
-
-    # 7. Generate visual diagram with OpenAI image model
-    visual_image_url = run_openai_image(query) if OPENAI_API_KEY else None
-
-    # 8. Write to Neo4j graph
+    visual_image_url = run_openai_image(query)
     if graph_preview and NEO4J_URI:
         write_topic_and_papers(query, citations, expanded_terms)
-
-    # 9. Narrate executive summary
     audio_url = narrate_text(polished_summary) if narration and ELEVEN_LABS_API_KEY else None
-
-    # 10. Build output package
-    report = {
+    return {
         "timestamp": datetime.utcnow().isoformat(),
         "query": query,
        "expanded_terms": expanded_terms,
@@ -111,4 +72,3 @@ def research_once(query: str, graph_preview: bool = True, narration: bool = True
         "visual_image_url": visual_image_url,
         "audio_url": audio_url
     }
-    return report
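For a quick sanity check of the citation extractor shown above, a minimal driver along these lines should work. The module path genesis.pipeline follows the repo layout, the script name quick_check_citations.py is hypothetical, and the sample DOI, PMID, and URL are invented placeholders, not real references:

# quick_check_citations.py -- illustrative only; sample identifiers below are placeholders
from genesis.pipeline import extract_citations

sample = (
    "Engineered probiotic therapeutics are reviewed in 10.1234/example.2020.001 "
    "(PMID: 12345678); see also https://example.org/preprint for a related draft."
)

for cite in extract_citations(sample):
    # each entry is a dict with "type", "id", and "url" keys
    print(cite["type"], cite["id"], cite["url"])

Each entry comes back as a dict with "type", "id", and "url" keys, and bare URLs are only appended when they were not already captured via the DOI or PMID patterns.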
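A sketch of how the updated entry point might be exercised end to end, assuming the provider keys read at the top of the module (OPENAI_API_KEY, UMLS_API_KEY, NCBI_API_KEY, and so on) are exported in the environment. run_demo.py is a hypothetical script name, and only report fields visible in the diff are printed:

# run_demo.py -- illustrative driver; requires the provider API keys to be set beforehand
from genesis.pipeline import research_once, DEMO_QUERIES

if __name__ == "__main__":
    # skip the Neo4j write and narration to keep external side effects to a minimum
    report = research_once(DEMO_QUERIES[0], graph_preview=False, narration=False)
    print(report["timestamp"])
    print(report["query"])
    print(", ".join(report["expanded_terms"]))
    print(report["visual_image_url"])
    print(report["audio_url"])  # None here because narration=False

With graph_preview=False the Neo4j write is skipped even when NEO4J_URI is set, so apart from the provider calls the sketch stays read-only.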