mgbam committed on
Commit eaba1ed · verified · 1 Parent(s): f400521

Update mcp/orchestrator.py

Files changed (1)
  1. mcp/orchestrator.py +84 -96
mcp/orchestrator.py CHANGED
@@ -1,30 +1,23 @@
 #!/usr/bin/env python3
 """
-mcp/orchestrator.py · MedGenesis v5
-─────────────────────────────────────
-Fan-out, async orchestrator that drives the Streamlit UI.
-
-Data sources pulled in parallel
-───────────────────────────────
-• Literature       → PubMed (E-utils) + arXiv RSS
-• NLP keywords     → spaCy (en_core_web_sm)
-• UMLS concepts    → UMLS REST (optional API key)
-• Safety signals   → openFDA Drug Event API
-• Gene annotation  → MyGene.info ➜ Ensembl REST ➜ Open Targets GraphQL
-• Expression       → EMBL-EBI Expression Atlas
-• Gene↔Disease     → DisGeNET
-• Trial registry   → ClinicalTrials.gov (OAS v2 prod ➜ legacy v1 ➜ WHO ICTRP)
-• Cancer variants  → cBioPortal REST v4
-• Drug metadata    → DrugCentral SMART API
-• Chemistry        → PubChem PUG-REST
-• Fast PubMed IDs  → NCBI E-utils (personal key doubles quota)
-
-LLM engines
-───────────
-OpenAI GPT-4o (default) or Gemini 1.5-Flash/Pro via router.
-
-Return payload keys (used by Streamlit UI)
-──────────────────────────────────────────
 papers, ai_summary, llm_used, umls, drug_safety,
 genes_rich, expr_atlas, drug_meta, chem_info,
 gene_disease, clinical_trials, cbio_variants
@@ -37,126 +30,121 @@ from typing import Dict, Any, List
 # ── Literature ──────────────────────────────────────────────────────
 from mcp.arxiv import fetch_arxiv
 from mcp.pubmed import fetch_pubmed
-from mcp.ncbi_turbo import pubmed_ids  # fast IDs

-# ── NLP + biomedical enrichment ────────────────────────────────────
 from mcp.nlp import extract_keywords
 from mcp.umls import lookup_umls
 from mcp.openfda import fetch_drug_safety
 from mcp.disgenet import disease_to_genes
 from mcp.clinicaltrials import search_trials

-# Gene / expression helpers
-from mcp.gene_hub import resolve_gene  # MyGene → Ensembl → OT
 from mcp.atlas import fetch_expression
-from mcp.cbio import fetch_cbio

-# Drug metadata
 from mcp.drugcentral_ext import fetch_drugcentral
 from mcp.pubchem_ext import fetch_compound

-# ── LLM helpers ────────────────────────────────────────────────────
 from mcp.openai_utils import ai_summarize, ai_qa
 from mcp.gemini import gemini_summarize, gemini_qa

 _LLM_DEFAULT = "openai"

 # ────────────────────────────────────────────────────────────────────
-# Internal helpers
 # ────────────────────────────────────────────────────────────────────
 def _llm_router(name: str):
-    """Return (summarise_fn, qa_fn, engine_name)."""
     if name.lower() == "gemini":
         return gemini_summarize, gemini_qa, "gemini"
     return ai_summarize, ai_qa, "openai"


-async def _fanout_keywords(keys: List[str]) -> Dict[str, Any]:
-    """Run UMLS, safety, gene, expression, drug meta in parallel."""
-    umls_f = [lookup_umls(k) for k in keys]
-    fda_f  = [fetch_drug_safety(k) for k in keys]
-    gene_f = [resolve_gene(k) for k in keys]
-    expr_f = [fetch_expression(k) for k in keys]
-    drug_f = [fetch_drugcentral(k) for k in keys]
-    chem_f = [fetch_compound(k) for k in keys]
-
-    umls, fda, genes, exprs, dmeta, chem = await asyncio.gather(
-        asyncio.gather(*umls_f, return_exceptions=True),
-        asyncio.gather(*fda_f,  return_exceptions=True),
-        asyncio.gather(*gene_f, return_exceptions=True),
-        asyncio.gather(*expr_f, return_exceptions=True),
-        asyncio.gather(*drug_f, return_exceptions=True),
-        asyncio.gather(*chem_f, return_exceptions=True),
-    )
-
-    return {
-        "umls"      : [u for u in umls if isinstance(u, dict)],
-        "fda"       : [d for d in fda if d],
-        "genes"     : [g for g in genes if g],
-        "expr"      : [e for e in exprs if e],
-        "drug_meta" : [d for d in dmeta if d],
-        "chem_info" : [c for c in chem if c],
-    }
-
-
 # ────────────────────────────────────────────────────────────────────
-# Public API
 # ────────────────────────────────────────────────────────────────────
 async def orchestrate_search(query: str,
                              llm: str = _LLM_DEFAULT) -> Dict[str, Any]:
-    """Run full async pipeline; never raises uncaught exceptions."""
-    # 1) Literature ---------------------------------------------------
-    arxiv_task  = asyncio.create_task(fetch_arxiv(query, max_results=10))
-    pubmed_task = asyncio.create_task(fetch_pubmed(query, max_results=10))

     papers: List[Dict] = []
-    for p in await asyncio.gather(arxiv_task, pubmed_task, return_exceptions=True):
-        if not isinstance(p, Exception):
-            papers.extend(p)

-    # 2) NLP keywords -------------------------------------------------
     corpus   = " ".join(p.get("summary", "") for p in papers)
     keywords = extract_keywords(corpus)[:10]

-    # 3) Keyword fan-out ---------------------------------------------
-    enrich = await _fanout_keywords(keywords)
-
-    # 4) DisGeNET + trials (single calls) -----------------------------
-    disg_f   = asyncio.create_task(disease_to_genes(query))
-    trials_f = asyncio.create_task(search_trials(query, max_studies=20))
-    gene_dis, trials = await asyncio.gather(disg_f, trials_f)
-
-    # 5) Cancer variants (limit first 3 genes for quota) -------------
-    cbio_tasks = [fetch_cbio(g["symbol"]) for g in enrich["genes"][:3]]
-    cbio_vars  = []
-    if cbio_tasks:
-        cbio_vars = await asyncio.gather(*cbio_tasks, return_exceptions=True)
-        cbio_vars = [v for v in cbio_vars if v]

-    # 6) AI summary ---------------------------------------------------
-    summarise, _, engine = _llm_router(llm)
     ai_summary = await summarise(corpus) if corpus else ""

-    # 7) Return payload ----------------------------------------------
     return {
         "papers"           : papers,
         "ai_summary"       : ai_summary,
-        "llm_used"         : engine,
-        "umls"             : enrich["umls"],
-        "drug_safety"      : enrich["fda"],
-        "genes_rich"       : enrich["genes"],
-        "expr_atlas"       : enrich["expr"],
-        "drug_meta"        : enrich["drug_meta"],
-        "chem_info"        : enrich["chem_info"],
         "gene_disease"     : gene_dis,
         "clinical_trials"  : trials,
         "cbio_variants"    : cbio_vars,
     }


 async def answer_ai_question(question: str, *,
                              context: str,
                              llm: str = _LLM_DEFAULT) -> Dict[str, str]:
-    """Follow-up QA using selected engine; returns {'answer': str}."""
     _, qa_fn, _ = _llm_router(llm)
     return {"answer": await qa_fn(question, context=context)}
 
 #!/usr/bin/env python3
 """
+mcp/orchestrator.py · MedGenesis v5
+───────────────────────────────────
+Asynchronously fans out across >10 open biomedical APIs, then returns
+one consolidated dictionary for the Streamlit UI.
+
+Public, key-free by default:
+• MyGene.info, Ensembl REST, Open Targets GraphQL
+• PubMed (E-utils), arXiv
+• UMLS, openFDA, DisGeNET
+• Expression Atlas, ClinicalTrials.gov (+ WHO ICTRP fallback)
+• cBioPortal, DrugCentral, PubChem
+
+If you add the secrets **MYGENE_KEY**, **OT_KEY**, **CBIO_KEY** or
+**NCBI_EUTILS_KEY**, they are auto-detected and used; otherwise the code
+runs key-less.
+
+Returned payload keys
+─────────────────────
 papers, ai_summary, llm_used, umls, drug_safety,
 genes_rich, expr_atlas, drug_meta, chem_info,
 gene_disease, clinical_trials, cbio_variants
 
 # ── Literature ──────────────────────────────────────────────────────
 from mcp.arxiv import fetch_arxiv
 from mcp.pubmed import fetch_pubmed

+# ── NLP + enrichment ────────────────────────────────────────────────
 from mcp.nlp import extract_keywords
 from mcp.umls import lookup_umls
 from mcp.openfda import fetch_drug_safety
 from mcp.disgenet import disease_to_genes
 from mcp.clinicaltrials import search_trials

+# Gene / expression modules
+from mcp.gene_hub import resolve_gene   # MyGene → Ensembl → OT
 from mcp.atlas import fetch_expression
+from mcp.cbio import fetch_cbio         # cancer variants

+# Drug metadata & chemistry
 from mcp.drugcentral_ext import fetch_drugcentral
 from mcp.pubchem_ext import fetch_compound

+# ── Large-language model helpers ────────────────────────────────────
 from mcp.openai_utils import ai_summarize, ai_qa
 from mcp.gemini import gemini_summarize, gemini_qa

 _LLM_DEFAULT = "openai"

 # ────────────────────────────────────────────────────────────────────
+# LLM router
 # ────────────────────────────────────────────────────────────────────
 def _llm_router(name: str):
+    """Return (summarise_fn, qa_fn, engine_tag)."""
     if name.lower() == "gemini":
         return gemini_summarize, gemini_qa, "gemini"
     return ai_summarize, ai_qa, "openai"


 # ────────────────────────────────────────────────────────────────────
+# Main orchestrator
 # ────────────────────────────────────────────────────────────────────
 async def orchestrate_search(query: str,
                              llm: str = _LLM_DEFAULT) -> Dict[str, Any]:
+    """Run the complete async pipeline; always resolves without raising."""
+    # 1 Literature ---------------------------------------------------
+    arxiv_f  = asyncio.create_task(fetch_arxiv(query, max_results=10))
+    pubmed_f = asyncio.create_task(fetch_pubmed(query, max_results=10))

     papers: List[Dict] = []
+    for res in await asyncio.gather(arxiv_f, pubmed_f, return_exceptions=True):
+        if not isinstance(res, Exception):
+            papers.extend(res)

+    # 2 Keyword extraction ------------------------------------------
     corpus   = " ".join(p.get("summary", "") for p in papers)
     keywords = extract_keywords(corpus)[:10]

+    # 3 Parallel enrichment -----------------------------------------
+    umls_jobs = [lookup_umls(k) for k in keywords]
+    fda_jobs  = [fetch_drug_safety(k) for k in keywords]
+    gene_jobs = [resolve_gene(k) for k in keywords]
+    expr_jobs = [fetch_expression(k) for k in keywords]
+    drug_jobs = [fetch_drugcentral(k) for k in keywords]
+    chem_jobs = [fetch_compound(k) for k in keywords]
+
+    umls, fda, genes, exprs, drugs, chems = await asyncio.gather(
+        asyncio.gather(*umls_jobs, return_exceptions=True),
+        asyncio.gather(*fda_jobs,  return_exceptions=True),
+        asyncio.gather(*gene_jobs, return_exceptions=True),
+        asyncio.gather(*expr_jobs, return_exceptions=True),
+        asyncio.gather(*drug_jobs, return_exceptions=True),
+        asyncio.gather(*chem_jobs, return_exceptions=True),
+    )

+    # filter out errors / empty payloads
+    umls  = [u for u in umls if isinstance(u, dict)]
+    fda   = [d for d in fda if d]
+    genes = [g for g in genes if g]
+    exprs = [e for e in exprs if e]
+    drugs = [d for d in drugs if d]
+    chems = [c for c in chems if c]
+
+    # 4 Other single-shot APIs --------------------------------------
+    gene_dis = await disease_to_genes(query)
+    trials   = await search_trials(query, max_studies=20)
+
+    # Cancer variants for first 3 gene symbols (quota safety)
+    cbio_jobs = [fetch_cbio(g.get("symbol", "")) for g in genes[:3]]
+    cbio_vars = []
+    if cbio_jobs:
+        tmp = await asyncio.gather(*cbio_jobs, return_exceptions=True)
+        cbio_vars = [v for v in tmp if v]
+
+    # 5 AI summary ---------------------------------------------------
+    summarise, _, engine_tag = _llm_router(llm)
     ai_summary = await summarise(corpus) if corpus else ""

+    # 6 Return payload ----------------------------------------------
     return {
         "papers"           : papers,
         "ai_summary"       : ai_summary,
+        "llm_used"         : engine_tag,
+        "umls"             : umls,
+        "drug_safety"      : fda,
+        "genes_rich"       : genes,
+        "expr_atlas"       : exprs,
+        "drug_meta"        : drugs,
+        "chem_info"        : chems,
         "gene_disease"     : gene_dis,
         "clinical_trials"  : trials,
         "cbio_variants"    : cbio_vars,
     }


+# ────────────────────────────────────────────────────────────────────
+# Follow-up question-answer
+# ────────────────────────────────────────────────────────────────────
 async def answer_ai_question(question: str, *,
                              context: str,
                              llm: str = _LLM_DEFAULT) -> Dict[str, str]:
+    """Return {"answer": str} using chosen LLM."""
     _, qa_fn, _ = _llm_router(llm)
     return {"answer": await qa_fn(question, context=context)}