mgbam committed on
Commit
6f23dc2
·
verified ·
1 Parent(s): 800d67f

Update genesis/pipeline.py

Browse files
Files changed (1) hide show
  1. genesis/pipeline.py +136 -44
genesis/pipeline.py CHANGED
@@ -1,78 +1,170 @@
1
  # genesis/pipeline.py
2
  import os
 
 
3
  from datetime import datetime
4
- from typing import Dict, Any, List
5
 
6
- from genesis.ontology import expand_terms_with_ontology
7
- from genesis.structures import fetch_structures_for_terms
8
- from genesis.narration import narrate_text
9
- from genesis.graphdb import write_topic_and_papers
 
 
 
 
10
  from genesis.providers import (
11
- run_pubmed_literature,
12
  run_deepseek_summary,
13
- run_gemini_polish,
14
- run_image_generation
 
 
 
 
15
  )
 
 
 
 
 
 
 
 
 
16
 
17
- UMLS_API_KEY = os.getenv("UMLS_API_KEY")
18
- BIOPORTAL_API_KEY = os.getenv("BIOPORTAL_API_KEY")
19
- NCBI_API_KEY = os.getenv("NCBI_API_KEY")
20
- NCBI_EMAIL = os.getenv("NCBI_EMAIL")
21
- NEO4J_URI = os.getenv("NEO4J_URI")
22
  ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")
 
 
 
23
 
24
- SYNBIO_MODE = True
25
 
 
26
  DEMO_QUERIES = [
27
- "Map all CRISPR-based living therapeutics in clinical trials since 2020",
28
- "Graph metabolic engineering pathways for bio-based drug production",
29
- "Synthetic biology startups developing oncolytic viruses β€” funding + trials",
30
- "3D bioprinting advances for organ transplantation with regulatory analysis",
31
- "AI-driven biosensor design for early cancer detection"
32
  ]
33
 
34
- def synthetic_biology_prompt_inject(query: str, expanded_terms: List[str]) -> str:
35
- """Bias toward synthetic biology."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  context = (
37
- "You are an expert synthetic biologist and AI researcher. "
38
- "Focus on CRISPR, metabolic engineering, living therapeutics, protein design, "
39
- "biosensors, and biosecurity. Integrate literature, molecular structures, market trends, "
40
- "and policy/regulatory outlook. Produce a structured, citation-rich report."
41
  )
42
  return f"{context}\n\nQuery: {query}\nExpanded terms: {', '.join(expanded_terms)}"
43
 
44
- def research_once(query: str, graph_preview: bool = True, narration: bool = True) -> Dict[str, Any]:
45
- """Main research pipeline."""
46
- expanded_terms = expand_terms_with_ontology(query, UMLS_API_KEY, BIOPORTAL_API_KEY)
47
- enriched_query = synthetic_biology_prompt_inject(query, expanded_terms) if SYNBIO_MODE else query
48
 
49
- # Summarize with DeepSeek
50
- raw_summary = run_deepseek_summary(enriched_query) or "Summary unavailable."
51
- polished_summary = run_gemini_polish(raw_summary)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
- # Literature from PubMed
54
- citations = run_pubmed_literature(query, max_results=10)
55
 
56
- # Molecular structures
 
 
 
 
 
57
  structures = fetch_structures_for_terms(expanded_terms)
58
 
59
- # Image generation
60
- visual_image_url = run_image_generation(query)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
- # GraphDB integration
63
- if graph_preview and NEO4J_URI:
 
 
 
 
64
  write_topic_and_papers(query, citations, expanded_terms)
65
 
66
- # Narration
67
- audio_url = narrate_text(polished_summary) if narration and ELEVEN_LABS_API_KEY else None
 
 
 
 
 
 
 
68
 
 
69
  return {
70
  "timestamp": datetime.utcnow().isoformat(),
71
  "query": query,
72
  "expanded_terms": expanded_terms,
73
- "summary": polished_summary,
74
  "citations": citations,
75
  "structures": structures,
76
- "visual_image_url": visual_image_url,
77
- "audio_url": audio_url
 
 
 
 
 
 
 
 
78
  }
 
1
  # genesis/pipeline.py
2
  import os
3
+ import json
4
+ import re
5
  from datetime import datetime
6
+ from typing import Dict, Any, List, Optional
7
 
8
+ # API client imports
9
+ from genesis.api_clients.pubmed_api import search_pubmed_literature
10
+ from genesis.api_clients.bioportal_api import expand_with_bioportal
11
+ from genesis.api_clients.umls_api import expand_with_umls
12
+ from genesis.api_clients.chembl_api import get_molecule_data
13
+ from genesis.api_clients.ncbi_api import fetch_ncbi_structure
14
+
15
+ # Core logic
16
  from genesis.providers import (
 
17
  run_deepseek_summary,
18
+ run_gemini_summary,
19
+ run_openai_summary,
20
+ run_gemini_image,
21
+ run_openai_image,
22
+ run_hf_image,
23
+ narrate_text_elevenlabs
24
  )
25
+ from genesis.utils.pdf_export import export_report_to_pdf
26
+ from genesis.visualization import generate_pathway_graph, generate_funding_network
27
+ from genesis.funding import fetch_funding_data
28
+ from genesis.trials import fetch_clinical_trials
29
+ from genesis.biosecurity import analyze_biosecurity_risks
30
+ from genesis.regulation import fetch_regulatory_info
31
+ from genesis.safety import analyze_safety_concerns
32
+ from genesis.structures import fetch_structures_for_terms
33
+ from genesis.ontology import merge_ontology_terms
34
 
35
+ from genesis.graph_tools import write_topic_and_papers
36
+
37
# Environment vars
# Each value is None when the variable is unset. multimodal_research() only
# narrates when ELEVEN_LABS_API_KEY is set and only writes to the graph when
# NEO4J_URI is set.
ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")
NEO4J_URI = os.getenv("NEO4J_URI")
# NOTE(review): NEO4J_USER / NEO4J_PASSWORD are not read in this module's
# visible code; presumably consumed by the graph layer -- confirm.
NEO4J_USER = os.getenv("NEO4J_USER")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

SYNBIO_MODE = True  # always bias to synthetic biology domain

# Preloaded demo queries
DEMO_QUERIES = [
    "CRISPR living therapeutics in clinical trials since 2020",
    "AI-designed enzymes for plastic degradation — literature + pathways",
    "Synthetic biology startups in oncology — funding map",
    "Metabolic pathway for artemisinin biosynthesis in yeast",
    "Oncolytic virus engineering — biosecurity risk analysis",
]
53
 
54
+
55
# Compiled once at import time; the patterns themselves are unchanged.
_DOI_PATTERN = re.compile(r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)", re.IGNORECASE)
_PMID_PATTERN = re.compile(r"PMID:\s*(\d+)", re.IGNORECASE)
_URL_PATTERN = re.compile(r"(https?://[^\s)]+)", re.IGNORECASE)


def _trim_citation_token(token: str) -> str:
    """Strip sentence punctuation that the permissive patterns swallow."""
    while token:
        last = token[-1]
        if last in ".,;:":
            token = token[:-1]
        elif last == ")" and token.count(")") > token.count("("):
            # Only drop a closing paren that has no opener inside the token,
            # so DOIs such as 10.1016/S0140-6736(05)66954-5 stay intact.
            token = token[:-1]
        else:
            break
    return token


def extract_citations(text: str) -> List[Dict[str, str]]:
    """Extract citations (DOI, PMID, URLs) from text.

    Args:
        text: Free text to scan, e.g. a model-generated summary. Empty or
            ``None`` input yields an empty list.

    Returns:
        A list of ``{"type", "id", "url"}`` dicts in the order DOIs, PMIDs,
        then plain URLs. ``type`` is ``"DOI"``, ``"PMID"`` or ``"URL"``;
        ``id`` is empty for plain URLs. Trailing sentence punctuation is
        removed from DOIs and URLs, and each resolved URL appears only once.
    """
    if not text:
        return []

    citations: List[Dict[str, str]] = []
    seen_urls = set()

    def _add(kind: str, ident: str, url: str) -> None:
        # De-duplicate on the resolved URL so a DOI quoted twice, or a DOI
        # that also appears as a doi.org link, is reported once.
        if url not in seen_urls:
            seen_urls.add(url)
            citations.append({"type": kind, "id": ident, "url": url})

    for match in _DOI_PATTERN.finditer(text):
        doi = _trim_citation_token(match.group(1))
        # Trimming can eat the whole suffix ("10.1000/."); skip such stubs.
        if _DOI_PATTERN.fullmatch(doi):
            _add("DOI", doi, f"https://doi.org/{doi}")

    for match in _PMID_PATTERN.finditer(text):
        pmid = match.group(1)
        _add("PMID", pmid, f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/")

    for match in _URL_PATTERN.finditer(text):
        url = _trim_citation_token(match.group(1))
        if _URL_PATTERN.fullmatch(url):
            _add("URL", "", url)

    return citations
71
+
72
+
73
def inject_synbio_context(query: str, expanded_terms: List[str]) -> str:
    """Prefix the query with a synthetic-biology expert system context.

    Args:
        query: The user's research question, inserted verbatim.
        expanded_terms: Ontology-expanded terms to list after the query.
            ``None`` is treated as no terms, and non-string items are
            converted with ``str()`` so a malformed expansion result cannot
            raise ``TypeError`` here.

    Returns:
        The prompt text: the fixed context paragraph, a blank line, then
        ``Query: ...`` and ``Expanded terms: ...`` lines.
    """
    context = (
        "You are an expert in synthetic biology, biosecurity, and regulatory affairs. "
        "Provide literature review, molecular insights, market trends, and policy implications. "
        "Focus on CRISPR, metabolic engineering, living therapeutics, protein design, biosensors, and biosecurity. "
        "Be concise, factual, and provide citations."
    )
    terms = ", ".join(str(term) for term in (expanded_terms or []))
    return f"{context}\n\nQuery: {query}\nExpanded terms: {terms}"
82
 
 
 
 
 
83
 
84
def _run_with_fallback(providers, prompt, stage):
    """Call each provider with *prompt* until one returns a truthy value.

    A provider that raises is logged and skipped. The value returned is the
    last result obtained (truthy on success, otherwise whatever falsy value
    the last non-raising provider produced, or ``None``).
    """
    import logging

    log = logging.getLogger(__name__)
    result = None
    for provider in providers:
        try:
            result = provider(prompt)
        except Exception:
            # Deliberately best-effort: record the failure, then fall through
            # to the next provider instead of aborting the whole pipeline.
            log.warning(
                "%s provider %s failed; trying the next one",
                stage,
                getattr(provider, "__name__", repr(provider)),
                exc_info=True,
            )
            continue
        if result:
            break
    return result


def multimodal_research(query: str, narration: bool = False, generate_pdf: bool = False) -> Dict[str, Any]:
    """Main multi-modal synthetic biology pipeline.

    Args:
        query: The research question to investigate.
        narration: When true and ``ELEVEN_LABS_API_KEY`` is set, narrate the
            summary and return the result under ``audio_url``.
        generate_pdf: When true, export a PDF report and return its path
            under ``pdf_path``.

    Returns:
        A dict with the keys ``timestamp`` (naive UTC, ISO 8601), ``query``,
        ``expanded_terms``, ``summary``, ``citations``, ``structures``,
        ``image_url``, ``funding_data``, ``trial_data``, ``regulation_data``,
        ``safety_data``, ``biosecurity_data``, ``pathway_graph``,
        ``funding_graph``, ``audio_url`` and ``pdf_path``.

    Raises:
        Whatever the non-fallback steps raise (ontology expansion, PubMed
        search, structures, funding/trials/regulation/safety/biosecurity
        lookups, graph generation, Neo4j write, narration, PDF export).
        Only the summarizer and image-provider chains swallow exceptions.
    """
    from datetime import timezone

    # 1 - Expand query with ontology
    expanded_terms = merge_ontology_terms(
        query,
        expand_with_umls(query),
        expand_with_bioportal(query),
    )

    # 2 - Inject domain-specific context
    enriched_query = inject_synbio_context(query, expanded_terms) if SYNBIO_MODE else query

    # 3 - Summarization with fallback
    summary = _run_with_fallback(
        [run_deepseek_summary, run_gemini_summary, run_openai_summary],
        enriched_query,
        "summary",
    )
    if not summary:
        summary = "No summary generated — please refine your query."

    # 4 - Citations extraction & PubMed fallback
    citations = extract_citations(summary)
    if not citations:
        citations = search_pubmed_literature(query)

    # 5 - Structures (NCBI, ChEMBL)
    structures = fetch_structures_for_terms(expanded_terms)

    # 6 - Image generation with fallback (uses the raw query, not the
    # enriched prompt)
    image_url = _run_with_fallback(
        [run_gemini_image, run_openai_image, run_hf_image],
        query,
        "image",
    )

    # 7 - Funding, Trials, Regulation, Safety, Biosecurity
    funding_data = fetch_funding_data(query)
    trial_data = fetch_clinical_trials(query)
    regulation_data = fetch_regulatory_info(query)
    safety_data = analyze_safety_concerns(query)
    biosecurity_data = analyze_biosecurity_risks(query)

    # 8 - Graph visualizations
    pathway_graph = generate_pathway_graph(query, expanded_terms)
    funding_graph = generate_funding_network(query, funding_data)

    # 9 - Save to Neo4j (only when a URI is configured)
    if NEO4J_URI:
        write_topic_and_papers(query, citations, expanded_terms)

    # 10 - Optional narration
    audio_url = None
    if narration and ELEVEN_LABS_API_KEY:
        audio_url = narrate_text_elevenlabs(summary)

    # 11 - Optional PDF export
    pdf_path = None
    if generate_pdf:
        pdf_path = export_report_to_pdf(query, summary, citations, structures, funding_data, regulation_data)

    # 12 - Build output
    # datetime.utcnow() is deprecated; dropping tzinfo keeps the emitted
    # string in the same naive-UTC format callers already receive.
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    return {
        "timestamp": timestamp,
        "query": query,
        "expanded_terms": expanded_terms,
        "summary": summary,
        "citations": citations,
        "structures": structures,
        "image_url": image_url,
        "funding_data": funding_data,
        "trial_data": trial_data,
        "regulation_data": regulation_data,
        "safety_data": safety_data,
        "biosecurity_data": biosecurity_data,
        "pathway_graph": pathway_graph,
        "funding_graph": funding_graph,
        "audio_url": audio_url,
        "pdf_path": pdf_path,
    }