Update mcp/ncbi.py
Browse files- mcp/ncbi.py +90 -29
mcp/ncbi.py
CHANGED
|
@@ -1,35 +1,96 @@
|
|
| 1 |
-
|
| 2 |
-
"""
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
# ---------- Public helpers ----------
|
| 22 |
-
async def search_gene(term: str, retmax: int = 5) -> List[Dict]:
    """Return basic gene info (ID + name/symbol) by search term.

    Runs ESearch against the ``gene`` database to collect matching UIDs, then
    ESummary to fetch one summary record per UID.

    Args:
        term: Entrez query string.
        retmax: Maximum number of UIDs requested from ESearch.

    Returns:
        A list of ESummary record dicts; empty when ESearch finds no IDs.

    Raises:
        KeyError: If a response lacks the ``esearchresult``/``idlist`` or
            ``result`` keys.
    """
    # NOTE(review): `_get` is defined outside this view; it is presumably the
    # helper that performs the HTTP GET and returns the decoded JSON body.
    data = await _get(
        "esearch.fcgi",
        {"db": "gene", "term": term, "retmode": "json", "retmax": retmax},
    )
    ids = data["esearchresult"]["idlist"]
    if not ids:
        return []
    summary = await _get(
        "esummary.fcgi",
        {"db": "gene", "id": ",".join(ids), "retmode": "json"},
    )
    # Drop the bookkeeping 'uids' entry by key instead of by position, so the
    # result does not depend on the ordering of keys in the JSON payload.
    return [record for key, record in summary["result"].items() if key != "uids"]
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
async def get_mesh_definition(term: str) -> str:
|
| 32 |
-
"""Return MeSH term
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""MedGenesis – NCBI E‑utilities helper (async, cached).
|
| 3 |
+
|
| 4 |
+
Supports:
|
| 5 |
+
• `search_gene(term)` → quick gene symbol/name hits via ESearch + ESummary
|
| 6 |
+
• `get_mesh_definition(term)`→ first MeSH definition string via ESummary
|
| 7 |
+
|
| 8 |
+
New features
|
| 9 |
+
~~~~~~~~~~~~
|
| 10 |
+
* Central `_request()` with exponential‑backoff retry (2×/4×).
|
| 11 |
+
* 12‑hour LRU caches for both public helpers (API quota‑friendly).
|
| 12 |
+
* Respects optional `BIO_KEY` env to boost rate limits.
|
| 13 |
+
* Handles single‑item edge cases (ESummary returns dict not list).
|
| 14 |
"""
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
import os, asyncio, httpx, xmltodict
|
| 18 |
+
from functools import lru_cache
|
| 19 |
+
from typing import List, Dict, Any
|
| 20 |
+
|
| 21 |
+
_API_KEY = os.getenv("BIO_KEY") # optional but raises quota if set
|
| 22 |
+
_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
|
| 23 |
+
_TIMEOUT = 15
|
| 24 |
+
|
| 25 |
+
# ---------------------------------------------------------------------
|
| 26 |
+
# Internal request helper with retry
|
| 27 |
+
# ---------------------------------------------------------------------
|
| 28 |
+
async def _request(endpoint: str, params: Dict[str, Any], *, retries: int = 3) -> httpx.Response:
    """GET an E-utilities endpoint, retrying with exponential back-off.

    Args:
        endpoint: Path appended to ``_BASE`` (e.g. ``"esearch.fcgi"``).
        params: Query parameters. The mapping is copied, never mutated.
        retries: Total number of attempts; values below 1 are treated as 1.

    Returns:
        The first response with status 200, or the final response if it is
        some other non-error status.

    Raises:
        httpx.HTTPStatusError: If the final attempt returns an error status.
        httpx.TransportError: If the final attempt fails at transport level
            (timeout, connection error, ...).
    """
    # Work on a copy so the optional API key does not leak into the caller's dict.
    query = dict(params)
    if _API_KEY:
        query["api_key"] = _API_KEY

    attempts = max(1, retries)
    url = f"{_BASE}{endpoint}"
    delay = 2  # seconds; doubled after every failed attempt

    # One client for all attempts, so the connection pool is reused.
    async with httpx.AsyncClient(timeout=_TIMEOUT) as cli:
        for attempt in range(1, attempts + 1):
            is_last = attempt == attempts
            try:
                resp = await cli.get(url, params=query)
            except httpx.TransportError:
                # Transient network failure: retry unless out of attempts.
                if is_last:
                    raise
            else:
                if resp.status_code == 200:
                    return resp
                if is_last:
                    resp.raise_for_status()
                    # Non-200 but non-error status: hand it back rather than
                    # falling through and implicitly returning None.
                    return resp
            # Only reached when another attempt follows, so there is no
            # pointless sleep after the final failure.
            await asyncio.sleep(delay)
            delay *= 2

    raise RuntimeError("unreachable: retry loop exited without a result")  # pragma: no cover
|
| 41 |
|
| 42 |
+
|
| 43 |
+
# ---------------------------------------------------------------------
|
| 44 |
+
# Gene search (ESearch → ESummary) – cached 12 h
|
| 45 |
+
# ---------------------------------------------------------------------
|
| 46 |
+
# Result cache for `search_gene`: {(term, retmax): (stored_at, records)}.
# functools.lru_cache must not wrap a coroutine function: it would cache the
# coroutine object itself, and awaiting that object a second time raises
# "RuntimeError: cannot reuse already awaited coroutine". It also has no expiry.
_GENE_CACHE: dict = {}
_GENE_CACHE_TTL = 12 * 60 * 60  # seconds (12 h)
_GENE_CACHE_MAX = 512           # same bound the lru_cache decorator used


async def search_gene(term: str, *, retmax: int = 5) -> List[Dict]:
    """Return list of gene summary dicts for *term* (Entrez Gene db).

    Runs ESearch to collect matching UIDs, then ESummary to fetch one summary
    record per UID. Results (including empty ones) are cached per
    ``(term, retmax)`` for 12 hours.

    Args:
        term: Entrez query string.
        retmax: Maximum number of UIDs requested from ESearch (keyword-only).

    Returns:
        A new list of ESummary record dicts; empty when nothing matches. The
        list is a shallow copy, so the record dicts are shared with the cache.

    Raises:
        httpx.HTTPStatusError: Propagated from ``_request`` when NCBI keeps
            answering with an error status.
    """
    import time  # local import: only this block needs it

    cache_key = (term, retmax)
    now = time.monotonic()
    cached = _GENE_CACHE.get(cache_key)
    if cached is not None and now - cached[0] < _GENE_CACHE_TTL:
        return list(cached[1])

    es_params = {
        "db": "gene",
        "term": term,
        "retmode": "json",
        "retmax": retmax,
    }
    es_resp = await _request("esearch.fcgi", es_params)
    ids = es_resp.json().get("esearchresult", {}).get("idlist", [])

    genes: List[Dict] = []
    if ids:
        sum_params = {"db": "gene", "id": ",".join(ids), "retmode": "json"}
        sum_resp = await _request("esummary.fcgi", sum_params)
        summaries = sum_resp.json().get("result", {})
        # 'uids' is a bookkeeping list of IDs, not a gene record; skip it.
        genes = [rec for uid, rec in summaries.items() if uid != "uids"]

    # Re-insert so the entry moves to the newest position, then evict the
    # oldest entries (dicts preserve insertion order) if over the bound.
    _GENE_CACHE.pop(cache_key, None)
    while len(_GENE_CACHE) >= _GENE_CACHE_MAX:
        _GENE_CACHE.pop(next(iter(_GENE_CACHE)))
    _GENE_CACHE[cache_key] = (now, genes)
    return list(genes)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# ---------------------------------------------------------------------
|
| 68 |
+
# MeSH definition – cached 12 h
|
| 69 |
+
# ---------------------------------------------------------------------
|
| 70 |
+
# Result cache for `get_mesh_definition`: {term: (stored_at, definition)}.
# functools.lru_cache must not wrap a coroutine function: it would cache the
# coroutine object itself, and awaiting that object a second time raises
# "RuntimeError: cannot reuse already awaited coroutine". It also has no expiry.
_MESH_CACHE: dict = {}
_MESH_CACHE_TTL = 12 * 60 * 60  # seconds (12 h)
_MESH_CACHE_MAX = 512           # same bound the lru_cache decorator used


async def get_mesh_definition(term: str) -> str:
    """Return first MeSH definition string for *term* or ''.

    ESummary looks records up by UID, not by search term, so the term is first
    resolved to a MeSH UID with ESearch and the summary is fetched for that
    UID. Results (including ``''``) are cached per term for 12 hours.

    Args:
        term: Free-text MeSH query.

    Returns:
        The record's scope note when present, otherwise its first MeSH term,
        otherwise ``''`` (no match, or neither field populated).

    Raises:
        httpx.HTTPStatusError: Propagated from ``_request`` when NCBI keeps
            answering with an error status.
    """
    import time  # local import: only this block needs it

    now = time.monotonic()
    cached = _MESH_CACHE.get(term)
    if cached is not None and now - cached[0] < _MESH_CACHE_TTL:
        return cached[1]

    es_params = {
        "db": "mesh",
        "term": term,
        "retmode": "json",
        "retmax": 1,
    }
    es_resp = await _request("esearch.fcgi", es_params)
    ids = es_resp.json().get("esearchresult", {}).get("idlist", [])

    definition = ""
    if ids:
        sum_params = {"db": "mesh", "id": ids[0], "retmode": "json"}
        sum_resp = await _request("esummary.fcgi", sum_params)
        records = sum_resp.json().get("result", {})
        # 'uids' is a bookkeeping list of IDs, not a record; skip it.
        record = next((rec for uid, rec in records.items() if uid != "uids"), None)
        if isinstance(record, dict):
            # NOTE(review): `ds_scopenote` is expected to hold the descriptor's
            # scope note (its definition) -- confirm against a live response.
            # Fall back to the first entry of `ds_meshterms`, the field this
            # function returned before.
            scope_note = record.get("ds_scopenote")
            mesh_terms = record.get("ds_meshterms")
            if scope_note:
                definition = str(scope_note)
            elif mesh_terms:  # guards the empty-list case that used to raise
                definition = str(mesh_terms[0])

    # Re-insert so the entry moves to the newest position, then evict the
    # oldest entries (dicts preserve insertion order) if over the bound.
    _MESH_CACHE.pop(term, None)
    while len(_MESH_CACHE) >= _MESH_CACHE_MAX:
        _MESH_CACHE.pop(next(iter(_MESH_CACHE)))
    _MESH_CACHE[term] = (now, definition)
    return definition
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
# ---------------------------------------------------------------------
|
| 88 |
+
# CLI demo
|
| 89 |
+
# ---------------------------------------------------------------------
|
| 90 |
+
if __name__ == "__main__":
    async def _demo():
        """Exercise both public helpers once and print a short summary."""
        hits = await search_gene("TP53", retmax=3)
        # Show the first hit's name, or the literal 'None' when nothing matched.
        first_name = hits[0]['name'] if hits else 'None'
        print(f"Gene hits: {len(hits)} – {first_name}")

        definition = await get_mesh_definition("glioblastoma")
        # Only the first 80 characters are printed.
        print("MeSH def:", definition[:80], "…")

    asyncio.run(_demo())
|