mgbam committed on
Commit
4d521f6
·
verified ·
1 Parent(s): 75cfd1d

Update genesis/pipeline.py

Browse files
Files changed (1) hide show
  1. genesis/pipeline.py +39 -64
genesis/pipeline.py CHANGED
@@ -1,103 +1,78 @@
 
 
 
 
 
 
1
  import os
2
- import re
3
  from datetime import datetime
4
- from typing import Dict, Any, List
5
 
6
  from .ontology import expand_terms_with_ontology
7
- from .structures import fetch_structures_for_terms
8
  from .narration import narrate_text
9
- from .graphdb import write_topic_and_papers
10
  from .providers import (
11
  run_deepseek_summary,
12
  run_gemini_polish,
13
  run_openai_image,
14
- run_hf_image,
15
  pubmed_fallback_search
16
  )
 
17
 
18
- # Environment variables
19
- UMLS_API_KEY = os.getenv("UMLS_API_KEY")
20
- BIOPORTAL_API_KEY = os.getenv("BIOPORTAL_API_KEY")
21
- NCBI_API_KEY = os.getenv("NCBI_API_KEY")
22
- NCBI_EMAIL = os.getenv("NCBI_EMAIL")
23
  ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")
 
24
  NEO4J_URI = os.getenv("NEO4J_URI")
25
 
26
  SYNBIO_MODE = True
27
 
28
- def extract_citations(text: str) -> List[Dict[str, str]]:
29
- """Extract citations from model output."""
30
- citations = []
31
- doi_pattern = r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)"
32
- pmid_pattern = r"PMID:\s*(\d+)"
33
- url_pattern = r"(https?://[^\s)]+)"
34
-
35
- for match in re.finditer(doi_pattern, text, re.IGNORECASE):
36
- citations.append({"type": "DOI", "id": match.group(1), "url": f"https://doi.org/{match.group(1)}"})
37
-
38
- for match in re.finditer(pmid_pattern, text, re.IGNORECASE):
39
- citations.append({"type": "PMID", "id": match.group(1), "url": f"https://pubmed.ncbi.nlm.nih.gov/{match.group(1)}/"})
40
-
41
- for match in re.finditer(url_pattern, text, re.IGNORECASE):
42
- if not any(c["url"] == match.group(1) for c in citations):
43
- citations.append({"type": "URL", "id": "", "url": match.group(1)})
44
-
45
- return citations
46
-
47
- def synthetic_biology_prompt_inject(query: str, expanded_terms: List[str]) -> str:
48
- """Bias prompt toward synthetic biology domain."""
49
- synbio_context = (
50
- "You are an expert synthetic biologist and AI researcher. "
51
- "Focus on CRISPR, metabolic engineering, living therapeutics, protein design, "
52
- "biosensors, and biosecurity. Integrate literature, molecular structures, market trends, "
53
- "and policy/regulatory outlook. Produce a structured, citation-rich report."
54
  )
55
- return f"{synbio_context}\n\nQuery: {query}\nExpanded terms: {', '.join(expanded_terms)}"
56
 
57
- def research_once(query: str, graph_preview: bool = True, narration: bool = True) -> Dict[str, Any]:
58
- """Main synthetic biology research pipeline."""
59
- # 1. Expand query with ontology
60
- expanded_terms = expand_terms_with_ontology(query, UMLS_API_KEY, BIOPORTAL_API_KEY)
 
61
 
62
- # 2. Inject synthetic biology context
63
  enriched_query = synthetic_biology_prompt_inject(query, expanded_terms) if SYNBIO_MODE else query
64
 
65
- # 3. Run DeepSeek summarization
66
- raw_summary = run_deepseek_summary(enriched_query)
67
 
68
- # 4. Polish with Gemini
69
- polished_summary = run_gemini_polish(raw_summary)
70
 
71
- # 5. Extract citations
72
- citations = extract_citations(polished_summary)
73
- if not citations:
74
- fallback_cites = pubmed_fallback_search(query, NCBI_API_KEY, NCBI_EMAIL)
75
- citations.extend(fallback_cites)
76
 
77
- # 6. Fetch molecular structures
78
- structures = fetch_structures_for_terms(expanded_terms)
79
 
80
- # 7. Generate visual diagram
81
- visual_image_url = run_openai_image(query)
82
- if not visual_image_url:
83
- print("[Image] Falling back to Hugging Face Stable Diffusion...")
84
- visual_image_url = run_hf_image(f"Scientific diagram about {query}")
85
 
86
- # 8. Write to Neo4j
87
  if graph_preview and NEO4J_URI:
88
  write_topic_and_papers(query, citations, expanded_terms)
89
 
90
- # 9. Narrate executive summary
91
- audio_url = narrate_text(polished_summary) if narration and ELEVEN_LABS_API_KEY else None
92
 
93
- # 10. Return structured output
94
  return {
95
  "timestamp": datetime.utcnow().isoformat(),
96
  "query": query,
97
  "expanded_terms": expanded_terms,
98
- "summary": polished_summary,
99
  "citations": citations,
100
  "structures": structures,
101
- "visual_image_url": visual_image_url,
102
  "audio_url": audio_url
103
  }
 
1
+ # genesis/pipeline.py
2
+ """
3
+ GENESIS-AI Research Pipeline
4
+ Coordinates ontology expansion, literature retrieval, summaries, citations, structure fetching, graphDB storage, and narration.
5
+ """
6
+
7
  import os
 
8
  from datetime import datetime
 
9
 
10
  from .ontology import expand_terms_with_ontology
11
+ from .molecule_viewer import fetch_structure
12
  from .narration import narrate_text
 
13
  from .providers import (
14
  run_deepseek_summary,
15
  run_gemini_polish,
16
  run_openai_image,
 
17
  pubmed_fallback_search
18
  )
19
+ from .graphdb import write_topic_and_papers
20
 
21
+ # ENV
 
 
 
 
22
  ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")
23
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
24
  NEO4J_URI = os.getenv("NEO4J_URI")
25
 
26
  SYNBIO_MODE = True
27
 
28
+ def synthetic_biology_prompt_inject(query, expanded_terms):
29
+ """Injects domain-specific bias toward synthetic biology research."""
30
+ context = (
31
+ "You are an expert in synthetic biology. Focus on CRISPR, metabolic engineering, "
32
+ "living therapeutics, protein design, biosensors, and biosecurity. Include literature, "
33
+ "structures, market trends, and regulatory insights with citations."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  )
35
+ return f"{context}\n\nQuery: {query}\nExpanded terms: {', '.join(expanded_terms)}"
36
 
37
+ def research_once(query, graph_preview=True, narration=True):
38
+ """Runs the GENESIS-AI pipeline for a given research query."""
39
+
40
+ # 1. Expand ontology
41
+ expanded_terms = expand_terms_with_ontology(query)
42
 
43
+ # 2. Domain injection
44
  enriched_query = synthetic_biology_prompt_inject(query, expanded_terms) if SYNBIO_MODE else query
45
 
46
+ # 3. Summarize (DeepSeek)
47
+ summary_raw = run_deepseek_summary(enriched_query)
48
 
49
+ # 4. Polish (Gemini)
50
+ summary_polished = run_gemini_polish(summary_raw)
51
 
52
+ # 5. Citations
53
+ citations = pubmed_fallback_search(query)
 
 
 
54
 
55
+ # 6. Structures
56
+ structures = [fetch_structure(term) for term in expanded_terms]
57
 
58
+ # 7. Visual (OpenAI Image)
59
+ image_url = run_openai_image(query)
 
 
 
60
 
61
+ # 8. GraphDB
62
  if graph_preview and NEO4J_URI:
63
  write_topic_and_papers(query, citations, expanded_terms)
64
 
65
+ # 9. Narration
66
+ audio_url = narrate_text(summary_polished) if narration and ELEVEN_LABS_API_KEY else None
67
 
68
+ # 10. Output
69
  return {
70
  "timestamp": datetime.utcnow().isoformat(),
71
  "query": query,
72
  "expanded_terms": expanded_terms,
73
+ "summary": summary_polished,
74
  "citations": citations,
75
  "structures": structures,
76
+ "image_url": image_url,
77
  "audio_url": audio_url
78
  }