Spaces:
Sleeping
Sleeping
Update genesis/pipeline.py
Browse files- genesis/pipeline.py +39 -64
genesis/pipeline.py
CHANGED
@@ -1,103 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
-
import re
|
3 |
from datetime import datetime
|
4 |
-
from typing import Dict, Any, List
|
5 |
|
6 |
from .ontology import expand_terms_with_ontology
|
7 |
-
from .
|
8 |
from .narration import narrate_text
|
9 |
-
from .graphdb import write_topic_and_papers
|
10 |
from .providers import (
|
11 |
run_deepseek_summary,
|
12 |
run_gemini_polish,
|
13 |
run_openai_image,
|
14 |
-
run_hf_image,
|
15 |
pubmed_fallback_search
|
16 |
)
|
|
|
17 |
|
18 |
-
#
|
19 |
-
UMLS_API_KEY = os.getenv("UMLS_API_KEY")
|
20 |
-
BIOPORTAL_API_KEY = os.getenv("BIOPORTAL_API_KEY")
|
21 |
-
NCBI_API_KEY = os.getenv("NCBI_API_KEY")
|
22 |
-
NCBI_EMAIL = os.getenv("NCBI_EMAIL")
|
23 |
ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")
|
|
|
24 |
NEO4J_URI = os.getenv("NEO4J_URI")
|
25 |
|
26 |
SYNBIO_MODE = True
|
27 |
|
28 |
-
def
|
29 |
-
"""
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
for match in re.finditer(doi_pattern, text, re.IGNORECASE):
|
36 |
-
citations.append({"type": "DOI", "id": match.group(1), "url": f"https://doi.org/{match.group(1)}"})
|
37 |
-
|
38 |
-
for match in re.finditer(pmid_pattern, text, re.IGNORECASE):
|
39 |
-
citations.append({"type": "PMID", "id": match.group(1), "url": f"https://pubmed.ncbi.nlm.nih.gov/{match.group(1)}/"})
|
40 |
-
|
41 |
-
for match in re.finditer(url_pattern, text, re.IGNORECASE):
|
42 |
-
if not any(c["url"] == match.group(1) for c in citations):
|
43 |
-
citations.append({"type": "URL", "id": "", "url": match.group(1)})
|
44 |
-
|
45 |
-
return citations
|
46 |
-
|
47 |
-
def synthetic_biology_prompt_inject(query: str, expanded_terms: List[str]) -> str:
|
48 |
-
"""Bias prompt toward synthetic biology domain."""
|
49 |
-
synbio_context = (
|
50 |
-
"You are an expert synthetic biologist and AI researcher. "
|
51 |
-
"Focus on CRISPR, metabolic engineering, living therapeutics, protein design, "
|
52 |
-
"biosensors, and biosecurity. Integrate literature, molecular structures, market trends, "
|
53 |
-
"and policy/regulatory outlook. Produce a structured, citation-rich report."
|
54 |
)
|
55 |
-
return f"{
|
56 |
|
57 |
-
def research_once(query
|
58 |
-
"""
|
59 |
-
|
60 |
-
|
|
|
61 |
|
62 |
-
# 2.
|
63 |
enriched_query = synthetic_biology_prompt_inject(query, expanded_terms) if SYNBIO_MODE else query
|
64 |
|
65 |
-
# 3.
|
66 |
-
|
67 |
|
68 |
-
# 4. Polish
|
69 |
-
|
70 |
|
71 |
-
# 5.
|
72 |
-
citations =
|
73 |
-
if not citations:
|
74 |
-
fallback_cites = pubmed_fallback_search(query, NCBI_API_KEY, NCBI_EMAIL)
|
75 |
-
citations.extend(fallback_cites)
|
76 |
|
77 |
-
# 6.
|
78 |
-
structures =
|
79 |
|
80 |
-
# 7.
|
81 |
-
|
82 |
-
if not visual_image_url:
|
83 |
-
print("[Image] Falling back to Hugging Face Stable Diffusion...")
|
84 |
-
visual_image_url = run_hf_image(f"Scientific diagram about {query}")
|
85 |
|
86 |
-
# 8.
|
87 |
if graph_preview and NEO4J_URI:
|
88 |
write_topic_and_papers(query, citations, expanded_terms)
|
89 |
|
90 |
-
# 9.
|
91 |
-
audio_url = narrate_text(
|
92 |
|
93 |
-
# 10.
|
94 |
return {
|
95 |
"timestamp": datetime.utcnow().isoformat(),
|
96 |
"query": query,
|
97 |
"expanded_terms": expanded_terms,
|
98 |
-
"summary":
|
99 |
"citations": citations,
|
100 |
"structures": structures,
|
101 |
-
"
|
102 |
"audio_url": audio_url
|
103 |
}
|
|
|
1 |
+
# genesis/pipeline.py
|
2 |
+
"""
|
3 |
+
GENESIS-AI Research Pipeline
|
4 |
+
Coordinates ontology expansion, literature retrieval, summaries, citations, structure fetching, graphDB storage, and narration.
|
5 |
+
"""
|
6 |
+
|
7 |
import os
|
|
|
8 |
from datetime import datetime
|
|
|
9 |
|
10 |
from .ontology import expand_terms_with_ontology
|
11 |
+
from .molecule_viewer import fetch_structure
|
12 |
from .narration import narrate_text
|
|
|
13 |
from .providers import (
|
14 |
run_deepseek_summary,
|
15 |
run_gemini_polish,
|
16 |
run_openai_image,
|
|
|
17 |
pubmed_fallback_search
|
18 |
)
|
19 |
+
from .graphdb import write_topic_and_papers
|
20 |
|
21 |
+
# ENV
# Configuration read from the process environment once, at import time.
# os.getenv returns None for any variable that is not set.
ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")  # research_once skips narration when falsy
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")  # not read in this module's visible code
NEO4J_URI = os.getenv("NEO4J_URI")  # research_once skips the graph write when falsy

# When True, research_once wraps the query with synthetic_biology_prompt_inject
# before summarisation; when False the raw query is used.
SYNBIO_MODE = True
|
27 |
|
28 |
+
def synthetic_biology_prompt_inject(query, expanded_terms):
    """Prefix a research query with a fixed synthetic-biology framing.

    Args:
        query: The research question; interpolated into the prompt as-is.
        expanded_terms: Iterable of related terms listed after the query.
            ``None`` is treated as "no terms", and non-string items are
            converted with ``str()``.

    Returns:
        A prompt string made of the fixed context paragraph, a blank line,
        a ``Query: ...`` line and an ``Expanded terms: ...`` line (terms
        joined with ``", "``).
    """
    context = (
        "You are an expert in synthetic biology. Focus on CRISPR, metabolic engineering, "
        "living therapeutics, protein design, biosensors, and biosecurity. Include literature, "
        "structures, market trends, and regulatory insights with citations."
    )
    # str.join raises TypeError on None or on non-string items, so coerce
    # both cases instead of failing the whole pipeline over a prompt detail.
    joined_terms = ", ".join(str(term) for term in (expanded_terms or ()))
    return f"{context}\n\nQuery: {query}\nExpanded terms: {joined_terms}"
|
36 |
|
37 |
+
def research_once(query, graph_preview=True, narration=True):
    """Run the GENESIS-AI pipeline once for a research query.

    The steps run sequentially, in the order shown below; an exception from
    any helper propagates to the caller.

    Args:
        query: The research question passed to every stage.
        graph_preview: When truthy and ``NEO4J_URI`` is set, the topic,
            citations and expanded terms are passed to
            ``write_topic_and_papers``.
        narration: When truthy and ``ELEVEN_LABS_API_KEY`` is set, the
            polished summary is passed to ``narrate_text``.

    Returns:
        A dict with the keys ``timestamp`` (naive UTC, ISO-8601 string),
        ``query``, ``expanded_terms``, ``summary``, ``citations``,
        ``structures``, ``image_url`` and ``audio_url`` (``None`` when
        narration is skipped). The value types of the helper-produced
        entries are whatever the corresponding helpers return.
    """
    # Imported locally so this block does not depend on a file-level import change.
    from datetime import timezone

    # 1. Expand ontology
    expanded_terms = expand_terms_with_ontology(query)

    # 2. Domain injection (only when SYNBIO_MODE is enabled)
    if SYNBIO_MODE:
        enriched_query = synthetic_biology_prompt_inject(query, expanded_terms)
    else:
        enriched_query = query

    # 3. Summarize (DeepSeek)
    summary_raw = run_deepseek_summary(enriched_query)

    # 4. Polish (Gemini)
    summary_polished = run_gemini_polish(summary_raw)

    # 5. Citations: looked up from the raw query, not the enriched prompt
    citations = pubmed_fallback_search(query)

    # 6. Structures: one fetch per expanded term, results kept in term order
    structures = [fetch_structure(term) for term in expanded_terms]

    # 7. Visual (OpenAI Image)
    image_url = run_openai_image(query)

    # 8. GraphDB: skipped unless requested and a Neo4j URI is configured
    if graph_preview and NEO4J_URI:
        write_topic_and_papers(query, citations, expanded_terms)

    # 9. Narration: skipped unless requested and an ElevenLabs key is configured
    audio_url = None
    if narration and ELEVEN_LABS_API_KEY:
        audio_url = narrate_text(summary_polished)

    # 10. Output
    # datetime.utcnow() is deprecated since Python 3.12. Dropping tzinfo from
    # an aware UTC "now" gives the same naive-UTC ISO string as before.
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    return {
        "timestamp": timestamp,
        "query": query,
        "expanded_terms": expanded_terms,
        "summary": summary_polished,
        "citations": citations,
        "structures": structures,
        "image_url": image_url,
        "audio_url": audio_url,
    }
|