mgbam committed on
Commit
a75a9dc
·
verified ·
1 Parent(s): 82c40eb

Update genesis/pipeline.py

Browse files
Files changed (1) hide show
  1. genesis/pipeline.py +5 -45
genesis/pipeline.py CHANGED
@@ -1,31 +1,25 @@
1
  # genesis/pipeline.py
2
  import os
3
  import re
4
- from typing import Dict, Any, List
5
  from datetime import datetime
 
6
 
7
  from .ontology import expand_terms_with_ontology
8
  from .structures import fetch_structures_for_terms
9
  from .narration import narrate_text
10
  from .graphdb import write_topic_and_papers
11
  from .providers import run_deepseek_summary, run_gemini_polish, run_openai_image, pubmed_fallback_search
12
- # genesis/pipeline.py
13
- from .providers import run_deepseek_summary, run_gemini_polish, run_gemini_image, pubmed_fallback_search
14
 
15
- # ===== ENV VARIABLES =====
16
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
17
  UMLS_API_KEY = os.getenv("UMLS_API_KEY")
18
  BIOPORTAL_API_KEY = os.getenv("BIOPORTAL_API_KEY")
19
  NCBI_API_KEY = os.getenv("NCBI_API_KEY")
20
  NCBI_EMAIL = os.getenv("NCBI_EMAIL")
21
  ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")
22
- NEO4J_USER = os.getenv("NEO4J_USER")
23
- NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
24
  NEO4J_URI = os.getenv("NEO4J_URI")
25
 
26
- SYNBIO_MODE = True # force synthetic biology bias for demo
27
 
28
- # ===== DEMO PRESETS =====
29
  DEMO_QUERIES = [
30
  "Map all CRISPR-based living therapeutics in clinical trials since 2020",
31
  "Graph metabolic engineering pathways for bio-based drug production",
@@ -34,29 +28,21 @@ DEMO_QUERIES = [
34
  "AI-driven biosensor design for early cancer detection"
35
  ]
36
 
37
-
38
  def extract_citations(text: str) -> List[Dict[str, str]]:
39
- """Extract citations from model output."""
40
  citations = []
41
  doi_pattern = r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)"
42
  pmid_pattern = r"PMID:\s*(\d+)"
43
  url_pattern = r"(https?://[^\s)]+)"
44
-
45
  for match in re.finditer(doi_pattern, text, re.IGNORECASE):
46
  citations.append({"type": "DOI", "id": match.group(1), "url": f"https://doi.org/{match.group(1)}"})
47
-
48
  for match in re.finditer(pmid_pattern, text, re.IGNORECASE):
49
  citations.append({"type": "PMID", "id": match.group(1), "url": f"https://pubmed.ncbi.nlm.nih.gov/{match.group(1)}/"})
50
-
51
  for match in re.finditer(url_pattern, text, re.IGNORECASE):
52
  if not any(c["url"] == match.group(1) for c in citations):
53
  citations.append({"type": "URL", "id": "", "url": match.group(1)})
54
-
55
  return citations
56
 
57
-
58
  def synthetic_biology_prompt_inject(query: str, expanded_terms: List[str]) -> str:
59
- """Bias prompt toward synthetic biology domain."""
60
  synbio_context = (
61
  "You are an expert synthetic biologist and AI researcher. "
62
  "Focus on CRISPR, metabolic engineering, living therapeutics, protein design, "
@@ -65,43 +51,18 @@ def synthetic_biology_prompt_inject(query: str, expanded_terms: List[str]) -> st
65
  )
66
  return f"{synbio_context}\n\nQuery: {query}\nExpanded terms: {', '.join(expanded_terms)}"
67
 
68
-
69
  def research_once(query: str, graph_preview: bool = True, narration: bool = True) -> Dict[str, Any]:
70
- """Main synthetic biology research pipeline."""
71
- # 1. Expand query with ontology
72
  expanded_terms = expand_terms_with_ontology(query, UMLS_API_KEY, BIOPORTAL_API_KEY)
73
-
74
- # 2. Inject synthetic biology context
75
  enriched_query = synthetic_biology_prompt_inject(query, expanded_terms) if SYNBIO_MODE else query
76
-
77
- # 3. Run DeepSeek summarization
78
  raw_summary = run_deepseek_summary(enriched_query)
79
-
80
- # 4. Polish with Gemini
81
  polished_summary = run_gemini_polish(raw_summary)
82
-
83
- # 5. Extract citations
84
- citations = extract_citations(polished_summary)
85
- if not citations:
86
- # fallback: run PubMed search
87
- fallback_cites = pubmed_fallback_search(query, NCBI_API_KEY, NCBI_EMAIL)
88
- citations.extend(fallback_cites)
89
-
90
- # 6. Fetch molecular structures
91
  structures = fetch_structures_for_terms(expanded_terms)
92
-
93
- # 7. Generate visual diagram with OpenAI image model
94
- visual_image_url = run_openai_image(query) if OPENAI_API_KEY else None
95
-
96
- # 8. Write to Neo4j graph
97
  if graph_preview and NEO4J_URI:
98
  write_topic_and_papers(query, citations, expanded_terms)
99
-
100
- # 9. Narrate executive summary
101
  audio_url = narrate_text(polished_summary) if narration and ELEVEN_LABS_API_KEY else None
102
-
103
- # 10. Build output package
104
- report = {
105
  "timestamp": datetime.utcnow().isoformat(),
106
  "query": query,
107
  "expanded_terms": expanded_terms,
@@ -111,4 +72,3 @@ def research_once(query: str, graph_preview: bool = True, narration: bool = True
111
  "visual_image_url": visual_image_url,
112
  "audio_url": audio_url
113
  }
114
- return report
 
1
  # genesis/pipeline.py
2
  import os
3
  import re
 
4
  from datetime import datetime
5
+ from typing import Dict, Any, List
6
 
7
  from .ontology import expand_terms_with_ontology
8
  from .structures import fetch_structures_for_terms
9
  from .narration import narrate_text
10
  from .graphdb import write_topic_and_papers
11
  from .providers import run_deepseek_summary, run_gemini_polish, run_openai_image, pubmed_fallback_search
 
 
12
 
 
13
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
14
  UMLS_API_KEY = os.getenv("UMLS_API_KEY")
15
  BIOPORTAL_API_KEY = os.getenv("BIOPORTAL_API_KEY")
16
  NCBI_API_KEY = os.getenv("NCBI_API_KEY")
17
  NCBI_EMAIL = os.getenv("NCBI_EMAIL")
18
  ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")
 
 
19
  NEO4J_URI = os.getenv("NEO4J_URI")
20
 
21
+ SYNBIO_MODE = True
22
 
 
23
  DEMO_QUERIES = [
24
  "Map all CRISPR-based living therapeutics in clinical trials since 2020",
25
  "Graph metabolic engineering pathways for bio-based drug production",
 
28
  "AI-driven biosensor design for early cancer detection"
29
  ]
30
 
 
31
  def extract_citations(text: str) -> List[Dict[str, str]]:
 
32
  citations = []
33
  doi_pattern = r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)"
34
  pmid_pattern = r"PMID:\s*(\d+)"
35
  url_pattern = r"(https?://[^\s)]+)"
 
36
  for match in re.finditer(doi_pattern, text, re.IGNORECASE):
37
  citations.append({"type": "DOI", "id": match.group(1), "url": f"https://doi.org/{match.group(1)}"})
 
38
  for match in re.finditer(pmid_pattern, text, re.IGNORECASE):
39
  citations.append({"type": "PMID", "id": match.group(1), "url": f"https://pubmed.ncbi.nlm.nih.gov/{match.group(1)}/"})
 
40
  for match in re.finditer(url_pattern, text, re.IGNORECASE):
41
  if not any(c["url"] == match.group(1) for c in citations):
42
  citations.append({"type": "URL", "id": "", "url": match.group(1)})
 
43
  return citations
44
 
 
45
  def synthetic_biology_prompt_inject(query: str, expanded_terms: List[str]) -> str:
 
46
  synbio_context = (
47
  "You are an expert synthetic biologist and AI researcher. "
48
  "Focus on CRISPR, metabolic engineering, living therapeutics, protein design, "
 
51
  )
52
  return f"{synbio_context}\n\nQuery: {query}\nExpanded terms: {', '.join(expanded_terms)}"
53
 
 
54
  def research_once(query: str, graph_preview: bool = True, narration: bool = True) -> Dict[str, Any]:
 
 
55
  expanded_terms = expand_terms_with_ontology(query, UMLS_API_KEY, BIOPORTAL_API_KEY)
 
 
56
  enriched_query = synthetic_biology_prompt_inject(query, expanded_terms) if SYNBIO_MODE else query
 
 
57
  raw_summary = run_deepseek_summary(enriched_query)
 
 
58
  polished_summary = run_gemini_polish(raw_summary)
59
+ citations = extract_citations(polished_summary) or pubmed_fallback_search(query, NCBI_API_KEY, NCBI_EMAIL)
 
 
 
 
 
 
 
 
60
  structures = fetch_structures_for_terms(expanded_terms)
61
+ visual_image_url = run_openai_image(query)
 
 
 
 
62
  if graph_preview and NEO4J_URI:
63
  write_topic_and_papers(query, citations, expanded_terms)
 
 
64
  audio_url = narrate_text(polished_summary) if narration and ELEVEN_LABS_API_KEY else None
65
+ return {
 
 
66
  "timestamp": datetime.utcnow().isoformat(),
67
  "query": query,
68
  "expanded_terms": expanded_terms,
 
72
  "visual_image_url": visual_image_url,
73
  "audio_url": audio_url
74
  }