# genesis/pipeline.py
import os
import re
from datetime import datetime
from typing import Dict, Any, List
# API client imports
from genesis.api_clients.pubmed_api import search_pubmed_literature
from genesis.api_clients.bioportal_api import expand_with_bioportal
from genesis.api_clients.umls_api import expand_with_umls
from genesis.api_clients.chembl_api import get_molecule_data
from genesis.api_clients.ncbi_api import fetch_ncbi_structure
from genesis.utils.pdf_export import export_report_to_pdf
# Core logic providers
from genesis.providers import (
    run_deepseek_summary,
    run_gemini_summary,
    run_openai_summary,
    run_gemini_image,
    run_openai_image,
    run_hf_image,
    narrate_text_elevenlabs
)
from genesis.visualization import generate_pathway_graph, generate_funding_network
from genesis.funding import fetch_funding_data
from genesis.trials import fetch_clinical_trials
from genesis.biosecurity import analyze_biosecurity_risks
from genesis.regulation import fetch_regulatory_info
from genesis.safety import analyze_safety_concerns
from genesis.structures import fetch_structures_for_terms
from genesis.ontology import merge_ontology_terms
from genesis.utils.graph_tools import write_topic_and_papers
# Environment vars
ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = os.getenv("NEO4J_USER")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
SYNBIO_MODE = True # Bias towards synthetic biology context
# Demo queries for UI preload
DEMO_QUERIES = [
    "CRISPR living therapeutics in clinical trials since 2020",
    "AI-designed enzymes for plastic degradation → literature + pathways",
    "Synthetic biology startups in oncology → funding map",
    "Metabolic pathway for artemisinin biosynthesis in yeast",
    "Oncolytic virus engineering → biosecurity risk analysis"
]

def extract_citations(text: str) -> List[Dict[str, str]]:
    """Extract citations (DOI, PMID, URLs) from text."""
    citations = []
    doi_pattern = r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)"
    pmid_pattern = r"PMID:\s*(\d+)"
    url_pattern = r"(https?://[^\s)]+)"
    for match in re.finditer(doi_pattern, text, re.IGNORECASE):
        citations.append({"type": "DOI", "id": match.group(1), "url": f"https://doi.org/{match.group(1)}"})
    for match in re.finditer(pmid_pattern, text, re.IGNORECASE):
        citations.append({"type": "PMID", "id": match.group(1), "url": f"https://pubmed.ncbi.nlm.nih.gov/{match.group(1)}/"})
    for match in re.finditer(url_pattern, text, re.IGNORECASE):
        if not any(c["url"] == match.group(1) for c in citations):
            citations.append({"type": "URL", "id": "", "url": match.group(1)})
    return citations
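
# Illustrative example with hypothetical identifiers (not real references):
#   extract_citations("See 10.1234/example.doi and PMID: 12345678 for details.")
# would return:
#   [{"type": "DOI", "id": "10.1234/example.doi", "url": "https://doi.org/10.1234/example.doi"},
#    {"type": "PMID", "id": "12345678", "url": "https://pubmed.ncbi.nlm.nih.gov/12345678/"}]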

def inject_synbio_context(query: str, expanded_terms: List[str]) -> str:
    """Injects synthetic biology expertise into the prompt."""
    context = (
        "You are an expert in synthetic biology, biosecurity, and regulatory affairs. "
        "Provide literature review, molecular insights, market trends, and policy implications. "
        "Focus on CRISPR, metabolic engineering, living therapeutics, protein design, biosensors, and biosecurity. "
        "Be concise, factual, and provide citations."
    )
    return f"{context}\n\nQuery: {query}\nExpanded terms: {', '.join(expanded_terms)}"

def multimodal_research(query: str, narration: bool = False, generate_pdf: bool = False) -> Dict[str, Any]:
    """Main multi-modal synthetic biology pipeline."""
    print(f"[Pipeline] Starting research for query: {query}")

    # 1. Expand query with ontology
    expanded_terms = merge_ontology_terms(
        query,
        expand_with_umls(query),
        expand_with_bioportal(query)
    )
    print(f"[Pipeline] Expanded terms: {expanded_terms}")

    # 2. Inject domain-specific context
    enriched_query = inject_synbio_context(query, expanded_terms) if SYNBIO_MODE else query

    # 3. Summarization with fallback
    summary = None
    for summarizer in [run_deepseek_summary, run_gemini_summary, run_openai_summary]:
        try:
            summary = summarizer(enriched_query)
            if summary:
                print(f"[Pipeline] Summary generated by {summarizer.__name__}")
                break
        except Exception as e:
            print(f"[Pipeline] {summarizer.__name__} failed: {e}")
    if not summary:
        summary = "No summary generated; please refine your query."

    # 4. Citations extraction & PubMed fallback
    citations = extract_citations(summary)
    if not citations:
        print("[Pipeline] No citations in summary, querying PubMed...")
        citations = search_pubmed_literature(query)

    # 5. Structures (NCBI, ChEMBL)
    structures = fetch_structures_for_terms(expanded_terms)

    # 6. Image generation with fallback
    image_url = None
    for img_fn in [run_gemini_image, run_openai_image, run_hf_image]:
        try:
            image_url = img_fn(query)
            if image_url:
                print(f"[Pipeline] Image generated by {img_fn.__name__}")
                break
        except Exception as e:
            print(f"[Pipeline] {img_fn.__name__} failed: {e}")

    # 7. Funding, Trials, Regulation, Safety, Biosecurity
    funding_data = fetch_funding_data(query) or []
    trial_data = fetch_clinical_trials(query) or []
    regulation_data = fetch_regulatory_info(query) or []
    safety_data = analyze_safety_concerns(query) or []
    biosecurity_data = analyze_biosecurity_risks(query) or []

    # 8. Graph visualizations
    pathway_graph = generate_pathway_graph(query, expanded_terms) if expanded_terms else None
    funding_graph = generate_funding_network(query, funding_data) if funding_data else None

    # 9. Save to Neo4j
    if NEO4J_URI:
        try:
            write_topic_and_papers(query, citations, expanded_terms)
            print("[Pipeline] Data saved to Neo4j")
        except Exception as e:
            print(f"[Pipeline] Neo4j save failed: {e}")

    # 10. Optional narration
    audio_url = None
    if narration and ELEVEN_LABS_API_KEY:
        try:
            audio_url = narrate_text_elevenlabs(summary)
            print("[Pipeline] Narration generated")
        except Exception as e:
            print(f"[Pipeline] Narration failed: {e}")

    # 11. Optional PDF export
    pdf_path = None
    if generate_pdf:
        try:
            pdf_path = export_report_to_pdf(query, summary, citations, structures, funding_data, regulation_data)
            print("[Pipeline] PDF report generated")
        except Exception as e:
            print(f"[Pipeline] PDF generation failed: {e}")

    # 12. Build output
    return {
        "timestamp": datetime.utcnow().isoformat(),
        "query": query,
        "expanded_terms": expanded_terms,
        "summary": summary,
        "citations": citations,
        "structures": structures,
        "image_url": image_url,
        "funding_data": funding_data,
        "trial_data": trial_data,
        "regulation_data": regulation_data,
        "safety_data": safety_data,
        "biosecurity_data": biosecurity_data,
        "pathway_graph": pathway_graph,
        "funding_graph": funding_graph,
        "audio_url": audio_url,
        "pdf_path": pdf_path
    }
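
# Minimal usage sketch (illustrative; assumes the API keys and services referenced
# above, e.g. model providers, Neo4j, ElevenLabs, are configured in the environment
# where the corresponding steps are expected to succeed).
if __name__ == "__main__":
    demo_result = multimodal_research(DEMO_QUERIES[0])
    print(demo_result["summary"])
    print(f"Citations found: {len(demo_result['citations'] or [])}")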