mgbam committed · verified
Commit f400521 · 1 Parent(s): 20e7231

Update mcp/orchestrator.py

Files changed (1)
  1. mcp/orchestrator.py +129 -80
mcp/orchestrator.py CHANGED
@@ -1,113 +1,162 @@
  """
- MedGenesis – parallel multi-API orchestrator
- --------------------------------------------
- Now pulls PubMed, arXiv, UMLS, OpenFDA, DisGeNET, ClinicalTrials.gov,
- PLUS: MyGene.info, Ensembl, OpenTargets, Expression Atlas, cBioPortal,
- DrugCentral & PubChem.
-
- Call         : orchestrate_search(query, llm="openai" | "gemini")
- Returns      : dict ready for Streamlit UI
- Follow-up QA : answer_ai_question(...)
  """

- import asyncio, httpx
  from typing import Dict, Any, List

  from mcp.arxiv import fetch_arxiv
  from mcp.pubmed import fetch_pubmed
  from mcp.nlp import extract_keywords
  from mcp.umls import lookup_umls
  from mcp.openfda import fetch_drug_safety
- from mcp.ncbi import search_gene, get_mesh_definition   # legacy
- from mcp.ncbi_turbo import pubmed_ids
  from mcp.disgenet import disease_to_genes
  from mcp.clinicaltrials import search_trials
- from mcp.openai_utils import ai_summarize, ai_qa
- from mcp.gemini import gemini_summarize, gemini_qa
- from mcp.mygene import fetch_gene_info
- from mcp.ensembl import fetch_ensembl
- from mcp.opentargets import fetch_ot
  from mcp.atlas import fetch_expression
  from mcp.drugcentral_ext import fetch_drugcentral
  from mcp.pubchem_ext import fetch_compound
- from mcp.cbio import fetch_cbio

- _DEF = "openai"

- # ---------- LLM router ----------
- def _llm_router(llm: str):
-     if llm.lower() == "gemini":
          return gemini_summarize, gemini_qa, "gemini"
      return ai_summarize, ai_qa, "openai"

- # ---------- gene resolver ----------
- async def _resolve_gene(sym: str) -> dict:
-     for fn in (fetch_gene_info, fetch_ensembl, fetch_ot):
-         try:
-             data = await fn(sym)
-             if data:
-                 return data
-         except Exception:
-             pass
-     return {}
-
- # ---------- orchestrator ----------
- async def orchestrate_search(query: str,
-                              llm: str = _DEF) -> Dict[str, Any]:
-     # 1 Literature
-     arxiv_f  = asyncio.create_task(fetch_arxiv(query))
-     pubmed_f = asyncio.create_task(fetch_pubmed(query))
-     papers   = sum(await asyncio.gather(arxiv_f, pubmed_f), [])
-
-     # 2 Keyword extraction
-     blob     = " ".join(p["summary"] for p in papers)
-     keywords = extract_keywords(blob)[:10]
-
-     # 3 Enrichment fan-out
-     umls_tasks = [lookup_umls(k)       for k in keywords]
-     fda_tasks  = [fetch_drug_safety(k) for k in keywords]
-     gene_tasks = [_resolve_gene(k)     for k in keywords]
-     expr_tasks = [fetch_expression(k)  for k in keywords]
-     dcz_tasks  = [fetch_drugcentral(k) for k in keywords]
-     chem_tasks = [fetch_compound(k)    for k in keywords]
-     cbio_tasks = [fetch_cbio(k)        for k in keywords[:3]]   # limit API
-
-     genes, exprs, dcz, chems, cbio = await asyncio.gather(
-         asyncio.gather(*gene_tasks, return_exceptions=True),
-         asyncio.gather(*expr_tasks, return_exceptions=True),
-         asyncio.gather(*dcz_tasks,  return_exceptions=True),
-         asyncio.gather(*chem_tasks, return_exceptions=True),
-         asyncio.gather(*cbio_tasks, return_exceptions=True),
-     )

-     umls, fda = await asyncio.gather(
-         asyncio.gather(*umls_tasks, return_exceptions=True),
-         asyncio.gather(*fda_tasks,  return_exceptions=True),
      )

-     trials = await search_trials(query, max_studies=20)

-     # 4 AI summary
-     summarise, _, used_llm = _llm_router(llm)
-     summary = await summarise(blob)

      return {
          "papers"          : papers,
-         "ai_summary"      : summary,
-         "llm_used"        : used_llm,
-         "umls"            : umls,
-         "drug_safety"     : fda,
-         "genes_rich"      : [g for g in genes if g],
-         "expr_atlas"      : [e for e in exprs if e],
-         "drug_meta"       : [d for d in dcz if d],
-         "chem_info"       : [c for c in chems if c],
          "clinical_trials" : trials,
-         "cbio_variants"   : [v for v in cbio if v],
      }

- # ---------- follow-up QA ----------
  async def answer_ai_question(question: str, *,
                               context: str,
-                              llm: str = _DEF) -> Dict[str, str]:
      _, qa_fn, _ = _llm_router(llm)
-     return {"answer": await qa_fn(question, context)}
 
+ #!/usr/bin/env python3
  """
+ mcp/orchestrator.py · MedGenesis v5
+ ─────────────────────────────────────
+ Fan-out, async orchestrator that drives the Streamlit UI.
+
+ Data sources pulled in parallel
+ ───────────────────────────────
+ • Literature       → PubMed (E-utils) + arXiv RSS
+ • NLP keywords     → spaCy (en_core_web_sm)
+ • UMLS concepts    → UMLS REST (optional API key)
+ • Safety signals   → openFDA Drug Event API
+ • Gene annotation  → MyGene.info ➜ Ensembl REST ➜ Open Targets GraphQL
+ • Expression       → EMBL-EBI Expression Atlas
+ • Gene↔Disease     → DisGeNET
+ • Trial registry   → ClinicalTrials.gov (β OAS v2 ➜ prod v2 ➜ legacy v1 ➜ WHO ICTRP)
+ • Cancer variants  → cBioPortal REST v4
+ • Drug metadata    → DrugCentral SMART API
+ • Chemistry        → PubChem PUG-REST
+ • Fast PubMed IDs  → NCBI E-utils (personal key doubles quota)
+
+ LLM engines
+ ───────────
+ OpenAI GPT-4o (default) or Gemini 1.5-Flash/Pro via router.
+
+ Return payload keys (used by Streamlit UI)
+ ──────────────────────────────────────────
+ papers, ai_summary, llm_used, umls, drug_safety,
+ genes_rich, expr_atlas, drug_meta, chem_info,
+ gene_disease, clinical_trials, cbio_variants
  """

+ from __future__ import annotations
+ import asyncio
  from typing import Dict, Any, List

+ # ── Literature ──────────────────────────────────────────────────────
  from mcp.arxiv import fetch_arxiv
  from mcp.pubmed import fetch_pubmed
+ from mcp.ncbi_turbo import pubmed_ids          # fast IDs
+
+ # ── NLP + biomedical enrichment ────────────────────────────────────
  from mcp.nlp import extract_keywords
  from mcp.umls import lookup_umls
  from mcp.openfda import fetch_drug_safety
  from mcp.disgenet import disease_to_genes
  from mcp.clinicaltrials import search_trials
+
+ # Gene / expression helpers
+ from mcp.gene_hub import resolve_gene          # MyGene → Ensembl → OT
  from mcp.atlas import fetch_expression
+ from mcp.cbio import fetch_cbio
+
+ # Drug metadata
  from mcp.drugcentral_ext import fetch_drugcentral
  from mcp.pubchem_ext import fetch_compound

+ # ── LLM helpers ────────────────────────────────────────────────────
+ from mcp.openai_utils import ai_summarize, ai_qa
+ from mcp.gemini import gemini_summarize, gemini_qa
+
+ _LLM_DEFAULT = "openai"

+ # ────────────────────────────────────────────────────────────────────
+ # Internal helpers
+ # ────────────────────────────────────────────────────────────────────
+ def _llm_router(name: str):
+     """Return (summarise_fn, qa_fn, engine_name)."""
+     if name.lower() == "gemini":
          return gemini_summarize, gemini_qa, "gemini"
      return ai_summarize, ai_qa, "openai"


+ async def _fanout_keywords(keys: List[str]) -> Dict[str, Any]:
+     """Run UMLS, safety, gene, expression, drug meta in parallel."""
+     umls_f = [lookup_umls(k)       for k in keys]
+     fda_f  = [fetch_drug_safety(k) for k in keys]
+     gene_f = [resolve_gene(k)      for k in keys]
+     expr_f = [fetch_expression(k)  for k in keys]
+     drug_f = [fetch_drugcentral(k) for k in keys]
+     chem_f = [fetch_compound(k)    for k in keys]
+
+     umls, fda, genes, exprs, dmeta, chem = await asyncio.gather(
+         asyncio.gather(*umls_f, return_exceptions=True),
+         asyncio.gather(*fda_f,  return_exceptions=True),
+         asyncio.gather(*gene_f, return_exceptions=True),
+         asyncio.gather(*expr_f, return_exceptions=True),
+         asyncio.gather(*drug_f, return_exceptions=True),
+         asyncio.gather(*chem_f, return_exceptions=True),
      )

+     return {
+         "umls"      : [u for u in umls if isinstance(u, dict)],
+         "fda"       : [d for d in fda if d],
+         "genes"     : [g for g in genes if g],
+         "expr"      : [e for e in exprs if e],
+         "drug_meta" : [d for d in dmeta if d],
+         "chem_info" : [c for c in chem if c],
+     }
+
+
+ # ────────────────────────────────────────────────────────────────────
+ # Public API
+ # ────────────────────────────────────────────────────────────────────
+ async def orchestrate_search(query: str,
+                              llm: str = _LLM_DEFAULT) -> Dict[str, Any]:
+     """Run full async pipeline; never raises uncaught exceptions."""
+     # 1) Literature ---------------------------------------------------
+     arxiv_task  = asyncio.create_task(fetch_arxiv(query, max_results=10))
+     pubmed_task = asyncio.create_task(fetch_pubmed(query, max_results=10))

+     papers: List[Dict] = []
+     for p in await asyncio.gather(arxiv_task, pubmed_task, return_exceptions=True):
+         if not isinstance(p, Exception):
+             papers.extend(p)

+     # 2) NLP keywords -------------------------------------------------
+     corpus   = " ".join(p.get("summary", "") for p in papers)
+     keywords = extract_keywords(corpus)[:10]
+
+     # 3) Keyword fan-out ---------------------------------------------
+     enrich = await _fanout_keywords(keywords)
+
+     # 4) DisGeNET + trials (single calls) -----------------------------
+     disg_f   = asyncio.create_task(disease_to_genes(query))
+     trials_f = asyncio.create_task(search_trials(query, max_studies=20))
+     gene_dis, trials = await asyncio.gather(disg_f, trials_f)
+
+     # 5) Cancer variants (limit first 3 genes for quota) -------------
+     cbio_tasks = [fetch_cbio(g["symbol"]) for g in enrich["genes"][:3]]
+     cbio_vars  = []
+     if cbio_tasks:
+         cbio_vars = await asyncio.gather(*cbio_tasks, return_exceptions=True)
+         cbio_vars = [v for v in cbio_vars if v]
+
+     # 6) AI summary ---------------------------------------------------
+     summarise, _, engine = _llm_router(llm)
+     ai_summary = await summarise(corpus) if corpus else ""
+
+     # 7) Return payload ----------------------------------------------
      return {
          "papers"          : papers,
+         "ai_summary"      : ai_summary,
+         "llm_used"        : engine,
+         "umls"            : enrich["umls"],
+         "drug_safety"     : enrich["fda"],
+         "genes_rich"      : enrich["genes"],
+         "expr_atlas"      : enrich["expr"],
+         "drug_meta"       : enrich["drug_meta"],
+         "chem_info"       : enrich["chem_info"],
+         "gene_disease"    : gene_dis,
          "clinical_trials" : trials,
+         "cbio_variants"   : cbio_vars,
      }

+
  async def answer_ai_question(question: str, *,
                               context: str,
+                              llm: str = _LLM_DEFAULT) -> Dict[str, str]:
+     """Follow-up QA using selected engine; returns {'answer': str}."""
      _, qa_fn, _ = _llm_router(llm)
+     return {"answer": await qa_fn(question, context=context)}