mgbam committed on
Commit
7b0bd9a
·
verified ·
1 Parent(s): 45f823c

Update genesis/pipeline.py

Browse files
Files changed (1) hide show
  1. genesis/pipeline.py +41 -12
genesis/pipeline.py CHANGED
@@ -1,4 +1,3 @@
1
- # genesis/pipeline.py
2
  import os
3
  import re
4
  from datetime import datetime
@@ -8,9 +7,15 @@ from .ontology import expand_terms_with_ontology
8
  from .structures import fetch_structures_for_terms
9
  from .narration import narrate_text
10
  from .graphdb import write_topic_and_papers
11
- from .providers import run_deepseek_summary, run_gemini_polish, run_openai_image, pubmed_fallback_search
 
 
 
 
 
 
12
 
13
- OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
14
  UMLS_API_KEY = os.getenv("UMLS_API_KEY")
15
  BIOPORTAL_API_KEY = os.getenv("BIOPORTAL_API_KEY")
16
  NCBI_API_KEY = os.getenv("NCBI_API_KEY")
@@ -20,29 +25,27 @@ NEO4J_URI = os.getenv("NEO4J_URI")
20
 
21
  SYNBIO_MODE = True
22
 
23
- DEMO_QUERIES = [
24
- "Map all CRISPR-based living therapeutics in clinical trials since 2020",
25
- "Graph metabolic engineering pathways for bio-based drug production",
26
- "Synthetic biology startups developing oncolytic viruses — funding + trials",
27
- "3D bioprinting advances for organ transplantation with regulatory analysis",
28
- "AI-driven biosensor design for early cancer detection"
29
- ]
30
-
31
  def extract_citations(text: str) -> List[Dict[str, str]]:
 
32
  citations = []
33
  doi_pattern = r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)"
34
  pmid_pattern = r"PMID:\s*(\d+)"
35
  url_pattern = r"(https?://[^\s)]+)"
 
36
  for match in re.finditer(doi_pattern, text, re.IGNORECASE):
37
  citations.append({"type": "DOI", "id": match.group(1), "url": f"https://doi.org/{match.group(1)}"})
 
38
  for match in re.finditer(pmid_pattern, text, re.IGNORECASE):
39
  citations.append({"type": "PMID", "id": match.group(1), "url": f"https://pubmed.ncbi.nlm.nih.gov/{match.group(1)}/"})
 
40
  for match in re.finditer(url_pattern, text, re.IGNORECASE):
41
  if not any(c["url"] == match.group(1) for c in citations):
42
  citations.append({"type": "URL", "id": "", "url": match.group(1)})
 
43
  return citations
44
 
45
  def synthetic_biology_prompt_inject(query: str, expanded_terms: List[str]) -> str:
 
46
  synbio_context = (
47
  "You are an expert synthetic biologist and AI researcher. "
48
  "Focus on CRISPR, metabolic engineering, living therapeutics, protein design, "
@@ -52,16 +55,42 @@ def synthetic_biology_prompt_inject(query: str, expanded_terms: List[str]) -> st
52
  return f"{synbio_context}\n\nQuery: {query}\nExpanded terms: {', '.join(expanded_terms)}"
53
 
54
  def research_once(query: str, graph_preview: bool = True, narration: bool = True) -> Dict[str, Any]:
 
 
55
  expanded_terms = expand_terms_with_ontology(query, UMLS_API_KEY, BIOPORTAL_API_KEY)
 
 
56
  enriched_query = synthetic_biology_prompt_inject(query, expanded_terms) if SYNBIO_MODE else query
 
 
57
  raw_summary = run_deepseek_summary(enriched_query)
 
 
58
  polished_summary = run_gemini_polish(raw_summary)
59
- citations = extract_citations(polished_summary) or pubmed_fallback_search(query, NCBI_API_KEY, NCBI_EMAIL)
 
 
 
 
 
 
 
60
  structures = fetch_structures_for_terms(expanded_terms)
 
 
61
  visual_image_url = run_openai_image(query)
 
 
 
 
 
62
  if graph_preview and NEO4J_URI:
63
  write_topic_and_papers(query, citations, expanded_terms)
 
 
64
  audio_url = narrate_text(polished_summary) if narration and ELEVEN_LABS_API_KEY else None
 
 
65
  return {
66
  "timestamp": datetime.utcnow().isoformat(),
67
  "query": query,
 
 
1
  import os
2
  import re
3
  from datetime import datetime
 
7
  from .structures import fetch_structures_for_terms
8
  from .narration import narrate_text
9
  from .graphdb import write_topic_and_papers
10
+ from .providers import (
11
+ run_deepseek_summary,
12
+ run_gemini_polish,
13
+ run_openai_image,
14
+ run_hf_image,
15
+ pubmed_fallback_search
16
+ )
17
 
18
+ # Environment variables
19
  UMLS_API_KEY = os.getenv("UMLS_API_KEY")
20
  BIOPORTAL_API_KEY = os.getenv("BIOPORTAL_API_KEY")
21
  NCBI_API_KEY = os.getenv("NCBI_API_KEY")
 
25
 
26
  SYNBIO_MODE = True
27
 
 
 
 
 
 
 
 
 
28
  def extract_citations(text: str) -> List[Dict[str, str]]:
29
+ """Extract citations from model output."""
30
  citations = []
31
  doi_pattern = r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)"
32
  pmid_pattern = r"PMID:\s*(\d+)"
33
  url_pattern = r"(https?://[^\s)]+)"
34
+
35
  for match in re.finditer(doi_pattern, text, re.IGNORECASE):
36
  citations.append({"type": "DOI", "id": match.group(1), "url": f"https://doi.org/{match.group(1)}"})
37
+
38
  for match in re.finditer(pmid_pattern, text, re.IGNORECASE):
39
  citations.append({"type": "PMID", "id": match.group(1), "url": f"https://pubmed.ncbi.nlm.nih.gov/{match.group(1)}/"})
40
+
41
  for match in re.finditer(url_pattern, text, re.IGNORECASE):
42
  if not any(c["url"] == match.group(1) for c in citations):
43
  citations.append({"type": "URL", "id": "", "url": match.group(1)})
44
+
45
  return citations
46
 
47
  def synthetic_biology_prompt_inject(query: str, expanded_terms: List[str]) -> str:
48
+ """Bias prompt toward synthetic biology domain."""
49
  synbio_context = (
50
  "You are an expert synthetic biologist and AI researcher. "
51
  "Focus on CRISPR, metabolic engineering, living therapeutics, protein design, "
 
55
  return f"{synbio_context}\n\nQuery: {query}\nExpanded terms: {', '.join(expanded_terms)}"
56
 
57
  def research_once(query: str, graph_preview: bool = True, narration: bool = True) -> Dict[str, Any]:
58
+ """Main synthetic biology research pipeline."""
59
+ # 1. Expand query with ontology
60
  expanded_terms = expand_terms_with_ontology(query, UMLS_API_KEY, BIOPORTAL_API_KEY)
61
+
62
+ # 2. Inject synthetic biology context
63
  enriched_query = synthetic_biology_prompt_inject(query, expanded_terms) if SYNBIO_MODE else query
64
+
65
+ # 3. Run DeepSeek summarization
66
  raw_summary = run_deepseek_summary(enriched_query)
67
+
68
+ # 4. Polish with Gemini
69
  polished_summary = run_gemini_polish(raw_summary)
70
+
71
+ # 5. Extract citations
72
+ citations = extract_citations(polished_summary)
73
+ if not citations:
74
+ fallback_cites = pubmed_fallback_search(query, NCBI_API_KEY, NCBI_EMAIL)
75
+ citations.extend(fallback_cites)
76
+
77
+ # 6. Fetch molecular structures
78
  structures = fetch_structures_for_terms(expanded_terms)
79
+
80
+ # 7. Generate visual diagram
81
  visual_image_url = run_openai_image(query)
82
+ if not visual_image_url:
83
+ print("[Image] Falling back to Hugging Face Stable Diffusion...")
84
+ visual_image_url = run_hf_image(f"Scientific diagram about {query}")
85
+
86
+ # 8. Write to Neo4j
87
  if graph_preview and NEO4J_URI:
88
  write_topic_and_papers(query, citations, expanded_terms)
89
+
90
+ # 9. Narrate executive summary
91
  audio_url = narrate_text(polished_summary) if narration and ELEVEN_LABS_API_KEY else None
92
+
93
+ # 10. Return structured output
94
  return {
95
  "timestamp": datetime.utcnow().isoformat(),
96
  "query": query,