File size: 3,582 Bytes
f65e3d6 7a35270 f65e3d6 7a35270 f65e3d6 7a35270 f65e3d6 7a35270 f65e3d6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
#!/usr/bin/env python3
"""MedGenesis – NCBI E‑utilities helper (async, cached).
Supports:
• `search_gene(term)` → quick gene symbol/name hits via ESearch + ESummary
• `get_mesh_definition(term)`→ first MeSH definition string via ESummary
New features
~~~~~~~~~~~~
* Central `_request()` with exponential‑backoff retry (the wait starts at 2 s and doubles after every failed attempt).
* 12‑hour LRU caches for both public helpers (API quota‑friendly).
* Sends the optional `BIO_KEY` environment variable as the NCBI `api_key` parameter to raise rate limits.
* Handles single‑item edge cases (ESummary returns dict not list).
"""
from __future__ import annotations
import asyncio
import os
import time
from functools import lru_cache
from typing import List, Dict, Any

import httpx
import xmltodict
# NCBI API key read from the BIO_KEY environment variable; None when unset.
# When truthy, _request() adds it to every query as the `api_key` parameter.
_API_KEY = os.getenv("BIO_KEY") # optional but raises quota if set
# Base URL; _request() appends the endpoint name (e.g. "esearch.fcgi") to it.
_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
# Timeout handed to httpx.AsyncClient (httpx reads a bare number as seconds).
_TIMEOUT = 15
# ---------------------------------------------------------------------
# Internal request helper with retry
# ---------------------------------------------------------------------
async def _request(endpoint: str, params: Dict[str, Any], *, retries: int = 3) -> httpx.Response:
    """GET an E-utilities endpoint, retrying with exponential backoff.

    Args:
        endpoint: Endpoint file name appended to ``_BASE``
            (e.g. ``"esearch.fcgi"``).
        params: Query parameters. The mapping is copied, so the caller's
            dict is never modified when the API key is injected.
        retries: Maximum number of attempts. Values below 1 are treated as
            a single attempt.

    Returns:
        The first response with status 200. If no attempt yields a 200 and
        the last response is not an HTTP error status, that last response
        is returned as-is.

    Raises:
        httpx.HTTPStatusError: The last attempt returned an error status.
        httpx.TransportError: The last attempt failed at the transport level
            (timeout, connection error, ...).
    """
    # Work on a copy: adding the key to `params` itself would leak it into
    # the caller's dict.
    query = dict(params)
    if _API_KEY:
        query["api_key"] = _API_KEY

    attempts = max(1, retries)
    delay = 2
    last_resp = None
    last_exc = None

    # One client for all attempts so retries can reuse the connection pool.
    async with httpx.AsyncClient(timeout=_TIMEOUT) as cli:
        for attempt in range(attempts):
            try:
                last_resp = await cli.get(f"{_BASE}{endpoint}", params=query)
            except httpx.TransportError as exc:
                # Timeouts and connection failures are transient; retry them.
                last_resp, last_exc = None, exc
            else:
                if last_resp.status_code == 200:
                    return last_resp
            # Back off only when another attempt will actually follow.
            if attempt < attempts - 1:
                await asyncio.sleep(delay)
                delay *= 2

    if last_resp is None:
        # Every remaining path here had a transport failure on the last try.
        raise last_exc
    last_resp.raise_for_status()
    return last_resp
# ---------------------------------------------------------------------
# Gene search (ESearch → ESummary) – cached 12 h
# ---------------------------------------------------------------------
# (term, retmax) -> (expiry on the monotonic clock, list of gene summaries).
# functools.lru_cache cannot be used on an ``async def``: it would store the
# coroutine object, which can be awaited only once, so every cache hit would
# raise RuntimeError. Results are therefore cached explicitly.
_GENE_CACHE: Dict[tuple, tuple] = {}
_GENE_CACHE_TTL = 12 * 60 * 60  # seconds (12 h)
_GENE_CACHE_MAX = 512


async def search_gene(term: str, *, retmax: int = 5) -> List[Dict]:
    """Return list of gene summary dicts for *term* (Entrez Gene db).

    Runs ESearch to resolve *term* to gene UIDs, then ESummary to fetch the
    summaries. Results (including empty ones) are cached per
    ``(term, retmax)`` for 12 hours; at most 512 entries are kept.

    Args:
        term: Entrez query string.
        retmax: Maximum number of UIDs requested from ESearch.

    Returns:
        A new list holding one summary dict per hit, or ``[]`` when ESearch
        finds nothing. The dicts themselves are shared with the cache.

    Raises:
        Whatever ``_request`` raises when the HTTP calls fail.
    """
    key = (term, retmax)
    now = time.monotonic()

    cached = _GENE_CACHE.get(key)
    if cached is not None and cached[0] > now:
        # Re-insert so the entry becomes the most recently used one.
        _GENE_CACHE[key] = _GENE_CACHE.pop(key)
        return list(cached[1])

    es_params = {
        "db": "gene",
        "term": term,
        "retmode": "json",
        "retmax": retmax,
    }
    es_resp = await _request("esearch.fcgi", es_params)
    ids = es_resp.json().get("esearchresult", {}).get("idlist", [])

    genes: List[Dict] = []
    if ids:
        sum_params = {"db": "gene", "id": ",".join(ids), "retmode": "json"}
        sum_resp = await _request("esummary.fcgi", sum_params)
        data = sum_resp.json().get("result", {})
        # "uids" is the index list, not a record; skip it.
        genes = [v for k, v in data.items() if k != "uids"]

    # Drop a stale entry for this key, then evict least recently used
    # entries (dicts iterate in insertion order) until there is room.
    _GENE_CACHE.pop(key, None)
    while len(_GENE_CACHE) >= _GENE_CACHE_MAX:
        _GENE_CACHE.pop(next(iter(_GENE_CACHE)))
    _GENE_CACHE[key] = (now + _GENE_CACHE_TTL, genes)
    return list(genes)


# Keep the ``cache_clear()`` hook that the former lru_cache wrapper exposed.
search_gene.cache_clear = _GENE_CACHE.clear
# ---------------------------------------------------------------------
# MeSH definition – cached 12 h
# ---------------------------------------------------------------------
# term -> (expiry on the monotonic clock, definition string).
# functools.lru_cache cannot be used on an ``async def``: it would store the
# coroutine object, which can be awaited only once, so every cache hit would
# raise RuntimeError. Results are therefore cached explicitly.
_MESH_CACHE: Dict[str, tuple] = {}
_MESH_CACHE_TTL = 12 * 60 * 60  # seconds (12 h)
_MESH_CACHE_MAX = 512


async def get_mesh_definition(term: str) -> str:
    """Return first MeSH definition string for *term* or ''.

    ESummary only accepts UIDs, so *term* is first resolved to a MeSH UID
    with ESearch and that UID is then summarised. Results (including ``''``)
    are cached per term for 12 hours; at most 512 entries are kept.

    Args:
        term: MeSH search term.

    Returns:
        The scope note of the first matching record. If the record has no
        scope note, its first MeSH term is returned instead (the value the
        previous implementation aimed for); ``''`` when nothing matches.

    Raises:
        Whatever ``_request`` raises when the HTTP calls fail.
    """
    now = time.monotonic()

    cached = _MESH_CACHE.get(term)
    if cached is not None and cached[0] > now:
        # Re-insert so the entry becomes the most recently used one.
        _MESH_CACHE[term] = _MESH_CACHE.pop(term)
        return cached[1]

    es_params = {
        "db": "mesh",
        "term": term,
        "retmode": "json",
        "retmax": 1,
    }
    es_resp = await _request("esearch.fcgi", es_params)
    ids = es_resp.json().get("esearchresult", {}).get("idlist", [])

    definition = ""
    if ids:
        sum_params = {"db": "mesh", "id": ids[0], "retmode": "json"}
        sum_resp = await _request("esummary.fcgi", sum_params)
        data = sum_resp.json().get("result", {})
        # "uids" is the index list, not a record; skip it.
        recs = [v for k, v in data.items() if k != "uids"]
        if recs:
            rec = recs[0]
            # NOTE(review): "ds_scopenote" is assumed to be the docsum field
            # that carries the definition - confirm against a live response.
            definition = rec.get("ds_scopenote") or ""
            if not definition:
                # `or [""]` also covers an empty list, which used to raise
                # IndexError.
                terms = rec.get("ds_meshterms") or [""]
                definition = terms[0]

    # Drop a stale entry for this term, then evict least recently used
    # entries (dicts iterate in insertion order) until there is room.
    _MESH_CACHE.pop(term, None)
    while len(_MESH_CACHE) >= _MESH_CACHE_MAX:
        _MESH_CACHE.pop(next(iter(_MESH_CACHE)))
    _MESH_CACHE[term] = (now + _MESH_CACHE_TTL, definition)
    return definition


# Keep the ``cache_clear()`` hook that the former lru_cache wrapper exposed.
get_mesh_definition.cache_clear = _MESH_CACHE.clear
# ---------------------------------------------------------------------
# CLI demo
# ---------------------------------------------------------------------
if __name__ == "__main__":

    async def _demo():
        """Call both public helpers once against the live API and print a summary."""
        # Gene lookup: report the hit count and the first hit's name, if any.
        hits = await search_gene("TP53", retmax=3)
        print(f"Gene hits: {len(hits)} – {hits[0]['name'] if hits else 'None'}")

        # MeSH lookup: show at most the first 80 characters of the result.
        definition = await get_mesh_definition("glioblastoma")
        print("MeSH def:", definition[:80], "…")

    asyncio.run(_demo())
|