mgbam committed on
Commit
57d5f2c
·
verified ·
1 Parent(s): 2782916

Update genesis/pipeline.py

Browse files
Files changed (1) hide show
  1. genesis/pipeline.py +33 -24
genesis/pipeline.py CHANGED
@@ -1,17 +1,20 @@
1
  # genesis/pipeline.py
 
 
 
 
 
 
2
  import os
3
  import re
4
  from datetime import datetime
5
  from typing import Dict, Any, List
6
 
7
- # API client imports
8
  from genesis.api_clients.pubmed_api import search_pubmed_literature
9
  from genesis.api_clients.bioportal_api import expand_with_bioportal
10
  from genesis.api_clients.umls_api import expand_with_umls
11
- from genesis.api_clients.chembl_api import get_molecule_data
12
- from genesis.api_clients.ncbi_api import fetch_ncbi_structure
13
- from genesis.utils.pdf_export import export_report_to_pdf
14
-
15
 
16
  # Core logic providers
17
  from genesis.providers import (
@@ -23,26 +26,29 @@ from genesis.providers import (
23
  run_hf_image,
24
  narrate_text_elevenlabs
25
  )
 
 
26
  from genesis.utils.pdf_export import export_report_to_pdf
 
 
 
27
  from genesis.visualization import generate_pathway_graph, generate_funding_network
 
 
28
  from genesis.funding import fetch_funding_data
29
  from genesis.trials import fetch_clinical_trials
30
  from genesis.biosecurity import analyze_biosecurity_risks
31
  from genesis.regulation import fetch_regulatory_info
32
  from genesis.safety import analyze_safety_concerns
33
- from genesis.structures import fetch_structures_for_terms
34
  from genesis.ontology import merge_ontology_terms
35
- from genesis.utils.graph_tools import write_topic_and_papers
36
 
37
  # Environment vars
38
  ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")
39
  NEO4J_URI = os.getenv("NEO4J_URI")
40
- NEO4J_USER = os.getenv("NEO4J_USER")
41
- NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
42
 
43
  SYNBIO_MODE = True # Bias towards synthetic biology context
44
 
45
- # Demo queries for UI preload
46
  DEMO_QUERIES = [
47
  "CRISPR living therapeutics in clinical trials since 2020",
48
  "AI-designed enzymes for plastic degradation β€” literature + pathways",
@@ -51,9 +57,10 @@ DEMO_QUERIES = [
51
  "Oncolytic virus engineering β€” biosecurity risk analysis"
52
  ]
53
 
 
54
 
55
  def extract_citations(text: str) -> List[Dict[str, str]]:
56
- """Extract citations (DOI, PMID, URLs) from text."""
57
  citations = []
58
  doi_pattern = r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)"
59
  pmid_pattern = r"PMID:\s*(\d+)"
@@ -66,12 +73,10 @@ def extract_citations(text: str) -> List[Dict[str, str]]:
66
  for match in re.finditer(url_pattern, text, re.IGNORECASE):
67
  if not any(c["url"] == match.group(1) for c in citations):
68
  citations.append({"type": "URL", "id": "", "url": match.group(1)})
69
-
70
  return citations
71
 
72
-
73
  def inject_synbio_context(query: str, expanded_terms: List[str]) -> str:
74
- """Injects synthetic biology expertise into the prompt."""
75
  context = (
76
  "You are an expert in synthetic biology, biosecurity, and regulatory affairs. "
77
  "Provide literature review, molecular insights, market trends, and policy implications. "
@@ -80,9 +85,14 @@ def inject_synbio_context(query: str, expanded_terms: List[str]) -> str:
80
  )
81
  return f"{context}\n\nQuery: {query}\nExpanded terms: {', '.join(expanded_terms)}"
82
 
 
 
 
 
 
83
 
84
  def multimodal_research(query: str, narration: bool = False, generate_pdf: bool = False) -> Dict[str, Any]:
85
- """Main multi-modal synthetic biology pipeline."""
86
  print(f"[Pipeline] Starting research for query: {query}")
87
 
88
  # 1 — Expand query with ontology
@@ -93,10 +103,10 @@ def multimodal_research(query: str, narration: bool = False, generate_pdf: bool
93
  )
94
  print(f"[Pipeline] Expanded terms: {expanded_terms}")
95
 
96
- # 2 — Inject domain-specific context
97
  enriched_query = inject_synbio_context(query, expanded_terms) if SYNBIO_MODE else query
98
 
99
- # 3 — Summarization with fallback
100
  summary = None
101
  for summarizer in [run_deepseek_summary, run_gemini_summary, run_openai_summary]:
102
  try:
@@ -110,13 +120,13 @@ def multimodal_research(query: str, narration: bool = False, generate_pdf: bool
110
  if not summary:
111
  summary = "No summary generated β€” please refine your query."
112
 
113
- # 4 — Citations extraction & PubMed fallback
114
  citations = extract_citations(summary)
115
  if not citations:
116
  print("[Pipeline] No citations in summary, querying PubMed...")
117
  citations = search_pubmed_literature(query)
118
 
119
- # 5 — Structures (NCBI, ChEMBL)
120
  structures = fetch_structures_for_terms(expanded_terms)
121
 
122
  # 6 — Image generation with fallback
@@ -130,7 +140,7 @@ def multimodal_research(query: str, narration: bool = False, generate_pdf: bool
130
  except Exception as e:
131
  print(f"[Pipeline] {img_fn.__name__} failed: {e}")
132
 
133
- # 7 — Funding, Trials, Regulation, Safety, Biosecurity
134
  funding_data = fetch_funding_data(query) or []
135
  trial_data = fetch_clinical_trials(query) or []
136
  regulation_data = fetch_regulatory_info(query) or []
@@ -141,7 +151,7 @@ def multimodal_research(query: str, narration: bool = False, generate_pdf: bool
141
  pathway_graph = generate_pathway_graph(query, expanded_terms) if expanded_terms else None
142
  funding_graph = generate_funding_network(query, funding_data) if funding_data else None
143
 
144
- # 9 — Save to Neo4j
145
  if NEO4J_URI:
146
  try:
147
  write_topic_and_papers(query, citations, expanded_terms)
@@ -149,7 +159,7 @@ def multimodal_research(query: str, narration: bool = False, generate_pdf: bool
149
  except Exception as e:
150
  print(f"[Pipeline] Neo4j save failed: {e}")
151
 
152
- # 10 — Optional narration
153
  audio_url = None
154
  if narration and ELEVEN_LABS_API_KEY:
155
  try:
@@ -158,7 +168,7 @@ def multimodal_research(query: str, narration: bool = False, generate_pdf: bool
158
  except Exception as e:
159
  print(f"[Pipeline] Narration failed: {e}")
160
 
161
- # 11 — Optional PDF export
162
  pdf_path = None
163
  if generate_pdf:
164
  try:
@@ -167,7 +177,6 @@ def multimodal_research(query: str, narration: bool = False, generate_pdf: bool
167
  except Exception as e:
168
  print(f"[Pipeline] PDF generation failed: {e}")
169
 
170
- # 12 — Build output
171
  return {
172
  "timestamp": datetime.utcnow().isoformat(),
173
  "query": query,
 
1
  # genesis/pipeline.py
2
+ """
3
+ GENESIS-AI — Multimodal Synthetic Biology Research Pipeline
4
+ Coordinates ontology expansion, literature review, AI summarization, image generation,
5
+ funding intelligence, safety/biosecurity checks, and report export.
6
+ """
7
+
8
  import os
9
  import re
10
  from datetime import datetime
11
  from typing import Dict, Any, List
12
 
13
+ # API clients
14
  from genesis.api_clients.pubmed_api import search_pubmed_literature
15
  from genesis.api_clients.bioportal_api import expand_with_bioportal
16
  from genesis.api_clients.umls_api import expand_with_umls
17
+ from genesis.structures import fetch_structures_for_terms
 
 
 
18
 
19
  # Core logic providers
20
  from genesis.providers import (
 
26
  run_hf_image,
27
  narrate_text_elevenlabs
28
  )
29
+
30
+ # Utility modules
31
  from genesis.utils.pdf_export import export_report_to_pdf
32
+ from genesis.utils.graph_tools import write_topic_and_papers
33
+
34
+ # Visualizations
35
  from genesis.visualization import generate_pathway_graph, generate_funding_network
36
+
37
+ # Data sources
38
  from genesis.funding import fetch_funding_data
39
  from genesis.trials import fetch_clinical_trials
40
  from genesis.biosecurity import analyze_biosecurity_risks
41
  from genesis.regulation import fetch_regulatory_info
42
  from genesis.safety import analyze_safety_concerns
 
43
  from genesis.ontology import merge_ontology_terms
 
44
 
45
  # Environment vars
46
  ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")
47
  NEO4J_URI = os.getenv("NEO4J_URI")
 
 
48
 
49
  SYNBIO_MODE = True # Bias towards synthetic biology context
50
 
51
+ # Demo queries
52
  DEMO_QUERIES = [
53
  "CRISPR living therapeutics in clinical trials since 2020",
54
  "AI-designed enzymes for plastic degradation β€” literature + pathways",
 
57
  "Oncolytic virus engineering β€” biosecurity risk analysis"
58
  ]
59
 
60
+ # ---------- Helper Functions ----------
61
 
62
  def extract_citations(text: str) -> List[Dict[str, str]]:
63
+ """Extract DOI, PMID, and URLs from text."""
64
  citations = []
65
  doi_pattern = r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)"
66
  pmid_pattern = r"PMID:\s*(\d+)"
 
73
  for match in re.finditer(url_pattern, text, re.IGNORECASE):
74
  if not any(c["url"] == match.group(1) for c in citations):
75
  citations.append({"type": "URL", "id": "", "url": match.group(1)})
 
76
  return citations
77
 
 
78
  def inject_synbio_context(query: str, expanded_terms: List[str]) -> str:
79
+ """Inject synthetic biology expertise into the prompt."""
80
  context = (
81
  "You are an expert in synthetic biology, biosecurity, and regulatory affairs. "
82
  "Provide literature review, molecular insights, market trends, and policy implications. "
 
85
  )
86
  return f"{context}\n\nQuery: {query}\nExpanded terms: {', '.join(expanded_terms)}"
87
 
88
+ # ---------- Main Pipeline ----------
89
+
90
+ def research_once(topic: str) -> Dict[str, Any]:
91
+ """Alias to multimodal_research for compatibility."""
92
+ return multimodal_research(topic)
93
 
94
  def multimodal_research(query: str, narration: bool = False, generate_pdf: bool = False) -> Dict[str, Any]:
95
+ """Main research pipeline for GENESIS-AI."""
96
  print(f"[Pipeline] Starting research for query: {query}")
97
 
98
  # 1 — Expand query with ontology
 
103
  )
104
  print(f"[Pipeline] Expanded terms: {expanded_terms}")
105
 
106
+ # 2 — Enrich query with domain-specific context
107
  enriched_query = inject_synbio_context(query, expanded_terms) if SYNBIO_MODE else query
108
 
109
+ # 3 — Summarization (fallback order)
110
  summary = None
111
  for summarizer in [run_deepseek_summary, run_gemini_summary, run_openai_summary]:
112
  try:
 
120
  if not summary:
121
  summary = "No summary generated β€” please refine your query."
122
 
123
+ # 4 — Extract citations, fallback to PubMed if none found
124
  citations = extract_citations(summary)
125
  if not citations:
126
  print("[Pipeline] No citations in summary, querying PubMed...")
127
  citations = search_pubmed_literature(query)
128
 
129
+ # 5 — Fetch related structures (NCBI, ChEMBL)
130
  structures = fetch_structures_for_terms(expanded_terms)
131
 
132
  # 6 — Image generation with fallback
 
140
  except Exception as e:
141
  print(f"[Pipeline] {img_fn.__name__} failed: {e}")
142
 
143
+ # 7 — Funding, trials, regulations, safety, biosecurity
144
  funding_data = fetch_funding_data(query) or []
145
  trial_data = fetch_clinical_trials(query) or []
146
  regulation_data = fetch_regulatory_info(query) or []
 
151
  pathway_graph = generate_pathway_graph(query, expanded_terms) if expanded_terms else None
152
  funding_graph = generate_funding_network(query, funding_data) if funding_data else None
153
 
154
+ # 9 — Save to Neo4j if configured
155
  if NEO4J_URI:
156
  try:
157
  write_topic_and_papers(query, citations, expanded_terms)
 
159
  except Exception as e:
160
  print(f"[Pipeline] Neo4j save failed: {e}")
161
 
162
+ # 10 — Narration (optional)
163
  audio_url = None
164
  if narration and ELEVEN_LABS_API_KEY:
165
  try:
 
168
  except Exception as e:
169
  print(f"[Pipeline] Narration failed: {e}")
170
 
171
+ # 11 — PDF export (optional)
172
  pdf_path = None
173
  if generate_pdf:
174
  try:
 
177
  except Exception as e:
178
  print(f"[Pipeline] PDF generation failed: {e}")
179
 
 
180
  return {
181
  "timestamp": datetime.utcnow().isoformat(),
182
  "query": query,