Update mcp/ncbi.py
Browse files- mcp/ncbi.py +90 -29
    	
        mcp/ncbi.py
    CHANGED
    
    | @@ -1,35 +1,96 @@ | |
| 1 | 
            -
             | 
| 2 | 
            -
            """
         | 
| 3 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 4 | 
             
            """
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 5 |  | 
| 6 | 
            -
             | 
| 7 | 
            -
             | 
| 8 | 
            -
             | 
| 9 | 
            -
             | 
| 10 | 
            -
             | 
| 11 | 
            -
             | 
| 12 | 
            -
             | 
| 13 | 
            -
             | 
| 14 | 
            -
                 | 
| 15 | 
            -
                     | 
| 16 | 
            -
             | 
| 17 | 
            -
                     | 
| 18 | 
            -
             | 
| 19 | 
            -
             | 
| 20 | 
            -
             | 
| 21 | 
            -
            # ---------- Public helpers ----------
         | 
| 22 | 
            -
            async def search_gene(term: str, retmax: int = 5) -> List[Dict]:
         | 
| 23 | 
            -
                """Return basic gene info (ID + name/symbol) by search term."""
         | 
| 24 | 
            -
                data = await _get("esearch.fcgi", {"db": "gene", "term": term, "retmode": "json", "retmax": retmax})
         | 
| 25 | 
            -
                ids = data["esearchresult"]["idlist"]
         | 
| 26 | 
             
                if not ids:
         | 
| 27 | 
             
                    return []
         | 
| 28 | 
            -
                summary = await _get("esummary.fcgi", {"db": "gene", "id": ",".join(ids), "retmode": "json"})
         | 
| 29 | 
            -
                return list(summary["result"].values())[1:]  # first key is 'uids'
         | 
| 30 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 31 | 
             
            async def get_mesh_definition(term: str) -> str:
         | 
| 32 | 
            -
                """Return MeSH term  | 
| 33 | 
            -
                 | 
| 34 | 
            -
             | 
| 35 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            #!/usr/bin/env python3
         | 
| 2 | 
            +
            """MedGenesis – NCBI E‑utilities helper (async, cached).
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            Supports:
         | 
| 5 | 
            +
            • `search_gene(term)`        → quick gene symbol/name hits via ESearch + ESummary
         | 
| 6 | 
            +
            • `get_mesh_definition(term)`→ first MeSH definition string via ESummary
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            New features
         | 
| 9 | 
            +
            ~~~~~~~~~~~~
         | 
| 10 | 
            +
            * Central `_request()` with exponential‑backoff retry (2×/4×).
         | 
| 11 | 
            +
            * 12‑hour LRU caches for both public helpers (API quota‑friendly).
         | 
| 12 | 
            +
            * Respects optional `BIO_KEY` env to boost rate limits.
         | 
| 13 | 
            +
            * Handles single‑item edge cases (ESummary returns dict not list).
         | 
| 14 | 
             
            """
         | 
| 15 | 
            +
            from __future__ import annotations
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            import os, asyncio, httpx, xmltodict
         | 
| 18 | 
            +
            from functools import lru_cache
         | 
| 19 | 
            +
            from typing import List, Dict, Any
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            _API_KEY = os.getenv("BIO_KEY")  # optional but raises quota if set
         | 
| 22 | 
            +
            _BASE    = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
         | 
| 23 | 
            +
            _TIMEOUT = 15
         | 
| 24 | 
            +
             | 
| 25 | 
            +
            # ---------------------------------------------------------------------
         | 
| 26 | 
            +
            # Internal request helper with retry
         | 
| 27 | 
            +
            # ---------------------------------------------------------------------
         | 
| 28 | 
            +
            async def _request(endpoint: str, params: Dict[str, Any], *, retries: int = 3) -> httpx.Response:
                """Send a GET to an E-utilities endpoint, retrying with exponential back-off.

                Args:
                    endpoint: Path appended to ``_BASE`` (e.g. ``"esearch.fcgi"``).
                    params: Query parameters. The mapping is copied and never mutated.
                    retries: Total number of attempts; values below 1 are treated as 1.

                Returns:
                    The first response whose status code is 200.

                Raises:
                    httpx.HTTPStatusError: If the final attempt answered with a non-200 status.
                    httpx.TransportError: If the final attempt failed at the transport
                        level (timeout, connection error, ...).
                """
                # Work on a copy so adding the API key never leaks into the caller's dict.
                query = dict(params)
                if _API_KEY:
                    query["api_key"] = _API_KEY

                attempts = max(1, retries)  # always try at least once
                delay = 2
                last = None   # most recent HTTP response, if the attempt got one
                error = None  # most recent transport-level failure, if any

                # One client for all attempts so the connection pool is reused.
                async with httpx.AsyncClient(timeout=_TIMEOUT) as cli:
                    for attempt in range(attempts):
                        try:
                            last = await cli.get(f"{_BASE}{endpoint}", params=query)
                        except httpx.TransportError as exc:
                            # Timeouts / connection drops are transient: retry them too.
                            last, error = None, exc
                        else:
                            if last.status_code == 200:
                                return last
                            error = None
                        # Back off only when another attempt follows (2 s, 4 s, ...).
                        if attempt < attempts - 1:
                            await asyncio.sleep(delay)
                            delay *= 2

                if last is None:
                    raise error
                last.raise_for_status()
                # Reached only for a non-200 status that httpx does not itself treat as
                # an error; raise explicitly instead of implicitly returning None.
                raise httpx.HTTPStatusError(
                    f"Unexpected status {last.status_code} from {endpoint}",
                    request=last.request,
                    response=last,
                )
         | 
| 41 |  | 
| 42 | 
            +
             | 
| 43 | 
            +
            # ---------------------------------------------------------------------
         | 
| 44 | 
            +
            # Gene search (ESearch → ESummary) – cached 12 h
         | 
| 45 | 
            +
            # ---------------------------------------------------------------------
         | 
| 46 | 
            +
            # Result cache for search_gene. functools.lru_cache cannot be used on an
            # ``async def``: it would store the coroutine object, which can only be
            # awaited once, so every repeated call would raise RuntimeError.
            _GENE_CACHE_TTL = 12 * 60 * 60  # seconds (12 h)
            _GENE_CACHE_MAX = 512           # maximum number of cached queries
            _GENE_CACHE = {}                # (term, retmax) -> (expiry, list of hits)


            async def search_gene(term: str, *, retmax: int = 5) -> List[Dict]:
                """Return list of gene summary dicts for *term* (Entrez Gene db).

                Runs ESearch to resolve *term* to gene UIDs, then ESummary to fetch one
                summary dict per UID. Successful results (including empty ones) are
                cached per ``(term, retmax)`` for 12 hours; failures are not cached.

                Args:
                    term: Entrez search expression.
                    retmax: Maximum number of UIDs requested from ESearch.

                Returns:
                    A new list of summary dicts (empty when nothing matched).
                """
                import time

                key = (term, retmax)
                now = time.monotonic()
                cached = _GENE_CACHE.get(key)
                if cached is not None and cached[0] > now:
                    # Hand out a copy so callers cannot alter the cached list.
                    return list(cached[1])

                es_params = {
                    "db": "gene",
                    "term": term,
                    "retmode": "json",
                    "retmax": retmax,
                }
                es_resp = await _request("esearch.fcgi", es_params)
                ids = es_resp.json().get("esearchresult", {}).get("idlist", [])

                hits = []
                if ids:
                    sum_params = {"db": "gene", "id": ",".join(ids), "retmode": "json"}
                    sum_resp = await _request("esummary.fcgi", sum_params)
                    data = sum_resp.json().get("result", {})
                    # 'uids' is an index of the returned ids, not a record; skip it.
                    hits = [v for k, v in data.items() if k != "uids"]

                # Re-insert at the end, evicting the oldest entry when the cache is full.
                _GENE_CACHE.pop(key, None)
                if len(_GENE_CACHE) >= _GENE_CACHE_MAX:
                    _GENE_CACHE.pop(next(iter(_GENE_CACHE)))
                _GENE_CACHE[key] = (now + _GENE_CACHE_TTL, hits)
                return list(hits)


            # Keep the ``cache_clear()`` hook the former lru_cache wrapper exposed.
            search_gene.cache_clear = _GENE_CACHE.clear
         | 
| 65 | 
            +
             | 
| 66 | 
            +
             | 
| 67 | 
            +
            # ---------------------------------------------------------------------
         | 
| 68 | 
            +
            # MeSH definition – cached 12 h
         | 
| 69 | 
            +
            # ---------------------------------------------------------------------
         | 
| 70 | 
            +
            # Result cache for get_mesh_definition. functools.lru_cache cannot be used
            # on an ``async def``: it would store the coroutine object, which can only
            # be awaited once, so every repeated call would raise RuntimeError.
            _MESH_CACHE_TTL = 12 * 60 * 60  # seconds (12 h)
            _MESH_CACHE_MAX = 512           # maximum number of cached terms
            _MESH_CACHE = {}                # term -> (expiry, definition string)


            async def get_mesh_definition(term: str) -> str:
                """Return first MeSH definition string for *term* or ''.

                ESummary only accepts UIDs, so *term* is first resolved to a MeSH UID
                with ESearch and the summary is then fetched by ``id``. Successful
                results (including ``''``) are cached per term for 12 hours.

                Args:
                    term: MeSH search expression.

                Returns:
                    The record's scope note when present, otherwise its first entry in
                    ``ds_meshterms``, otherwise ``''``.
                """
                import time

                now = time.monotonic()
                cached = _MESH_CACHE.get(term)
                if cached is not None and cached[0] > now:
                    return cached[1]

                search_params = {
                    "db": "mesh",
                    "term": term,
                    "retmode": "json",
                    "retmax": 1,
                }
                search_resp = await _request("esearch.fcgi", search_params)
                ids = search_resp.json().get("esearchresult", {}).get("idlist", [])

                definition = ""
                if ids:
                    sum_params = {"db": "mesh", "id": ids[0], "retmode": "json"}
                    resp = await _request("esummary.fcgi", sum_params)
                    data = resp.json().get("result", {})
                    # 'uids' is an index of the returned ids, not a record; skip it.
                    recs = [v for k, v in data.items() if k != "uids"]
                    if recs:
                        rec = recs[0]
                        # NOTE(review): 'ds_scopenote' is assumed to carry the definition
                        # text in the MeSH DocSum - confirm against a live response.
                        terms = rec.get("ds_meshterms") or [""]  # guards the empty list
                        definition = rec.get("ds_scopenote") or terms[0]

                # Re-insert at the end, evicting the oldest entry when the cache is full.
                _MESH_CACHE.pop(term, None)
                if len(_MESH_CACHE) >= _MESH_CACHE_MAX:
                    _MESH_CACHE.pop(next(iter(_MESH_CACHE)))
                _MESH_CACHE[term] = (now + _MESH_CACHE_TTL, definition)
                return definition


            # Keep the ``cache_clear()`` hook the former lru_cache wrapper exposed.
            get_mesh_definition.cache_clear = _MESH_CACHE.clear
         | 
| 85 | 
            +
             | 
| 86 | 
            +
             | 
| 87 | 
            +
            # ---------------------------------------------------------------------
         | 
| 88 | 
            +
            # CLI demo
         | 
| 89 | 
            +
            # ---------------------------------------------------------------------
         | 
| 90 | 
            +
            if __name__ == "__main__":

                async def _demo():
                    """Smoke-test both public helpers and print a short summary of each."""
                    hits = await search_gene("TP53", retmax=3)
                    # Show the first hit's name, or the literal 'None' when nothing matched.
                    first_name = hits[0]['name'] if hits else 'None'
                    print(f"Gene hits: {len(hits)} – {first_name}")

                    definition = await get_mesh_definition("glioblastoma")
                    # Only the first 80 characters are shown to keep the output short.
                    print("MeSH def:", definition[:80], "…")

                asyncio.run(_demo())
         | 
