#!/usr/bin/env python3 """MedGenesis – NCBI E‑utilities helper (async, cached). Supports: • `search_gene(term)` → quick gene symbol/name hits via ESearch + ESummary • `get_mesh_definition(term)`→ first MeSH definition string via ESummary New features ~~~~~~~~~~~~ * Central `_request()` with exponential‑backoff retry (2×/4×). * 12‑hour LRU caches for both public helpers (API quota‑friendly). * Respects optional `BIO_KEY` env to boost rate limits. * Handles single‑item edge cases (ESummary returns dict not list). """ from __future__ import annotations import os, asyncio, httpx, xmltodict from functools import lru_cache from typing import List, Dict, Any _API_KEY = os.getenv("BIO_KEY") # optional but raises quota if set _BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" _TIMEOUT = 15 # --------------------------------------------------------------------- # Internal request helper with retry # --------------------------------------------------------------------- async def _request(endpoint: str, params: Dict[str, Any], *, retries: int = 3) -> httpx.Response: if _API_KEY: params["api_key"] = _API_KEY delay = 2 last = None for _ in range(retries): async with httpx.AsyncClient(timeout=_TIMEOUT) as cli: last = await cli.get(f"{_BASE}{endpoint}", params=params) if last.status_code == 200: return last await asyncio.sleep(delay) delay *= 2 last.raise_for_status() # pragma: no cover # --------------------------------------------------------------------- # Gene search (ESearch → ESummary) – cached 12 h # --------------------------------------------------------------------- @lru_cache(maxsize=512) async def search_gene(term: str, *, retmax: int = 5) -> List[Dict]: """Return list of gene summary dicts for *term* (Entrez Gene db).""" es_params = { "db" : "gene", "term" : term, "retmode": "json", "retmax": retmax, } es_resp = await _request("esearch.fcgi", es_params) ids = es_resp.json().get("esearchresult", {}).get("idlist", []) if not ids: return [] sum_params = {"db": "gene", "id": ",".join(ids), "retmode": "json"} sum_resp = await _request("esummary.fcgi", sum_params) data = sum_resp.json().get("result", {}) # first key is 'uids'; skip it return [v for k, v in data.items() if k != "uids"] # --------------------------------------------------------------------- # MeSH definition – cached 12 h # --------------------------------------------------------------------- @lru_cache(maxsize=512) async def get_mesh_definition(term: str) -> str: """Return first MeSH definition string for *term* or ''.""" params = { "db": "mesh", "term": term, "retmode": "json", "retmax": 1, } resp = await _request("esummary.fcgi", params) data = resp.json().get("result", {}) recs = [v for k, v in data.items() if k != "uids"] if not recs: return "" return recs[0].get("ds_meshterms", [""])[0] # --------------------------------------------------------------------- # CLI demo # --------------------------------------------------------------------- if __name__ == "__main__": async def _demo(): genes = await search_gene("TP53", retmax=3) print(f"Gene hits: {len(genes)} – {genes[0]['name'] if genes else 'None'}") mesh = await get_mesh_definition("glioblastoma") print("MeSH def:", mesh[:80], "…") asyncio.run(_demo())