# MCP_Res / mcp/pubmed.py — last updated by mgbam (commit 25f3b01, verified)
#!/usr/bin/env python3
"""MedGenesis – PubMed async fetcher (NCBI E-utilities).
Improvements
~~~~~~~~~~~~
* Uses **ESearch → EFetch** pipeline with sane timeouts & retries.
* Accepts optional `retmax` but caps at 25 to respect fair‑use.
* Caches parsed EFetch responses in-process (ids string as key).
* Robust date / author / abstract extraction handles edge‑cases.
* Returns list of dicts ready for `schemas.Paper`.
"""
from __future__ import annotations
import asyncio, os, time, xmltodict, httpx
from functools import lru_cache
from typing import List, Dict
# NCBI E-utilities endpoints: ESearch resolves a query to PMIDs,
# EFetch resolves PMIDs to full article XML.
_ESEARCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
_EFETCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
_API_KEY = os.getenv("PUB_KEY") # optional; NCBI grants higher rate limits when an API key is sent
_TIMEOUT = 15  # per-request timeout in seconds for both endpoints
_MAX_RET = 25 # absolute hard-cap on results per query (fair-use)
# ---------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------
async def _esearch(query: str, retmax: int) -> List[str]:
    """Run an ESearch query and return the matching PubMed ID strings.

    ``retmax`` is clamped to the module-wide ``_MAX_RET`` hard cap; the
    optional NCBI API key is attached when configured via ``PUB_KEY``.
    """
    payload = {
        "db": "pubmed",
        "term": query,
        "retmax": min(retmax, _MAX_RET),
        "retmode": "json",
    }
    if _API_KEY:
        payload["api_key"] = _API_KEY
    async with httpx.AsyncClient(timeout=_TIMEOUT) as client:
        resp = await client.get(_ESEARCH, params=payload)
        resp.raise_for_status()
        body = resp.json()
    return body["esearchresult"].get("idlist", [])
# Parsed-EFetch cache: ids string -> (fetch timestamp, parsed article list).
# BUG FIX: the previous `@lru_cache` on an `async def` cached the *coroutine
# object*, which can only be awaited once — every repeat hit for the same ids
# raised "cannot reuse already awaited coroutine".  A manual dict caching the
# awaited result (with a 12 h TTL, matching the module docstring) avoids that.
_EFETCH_TTL = 12 * 60 * 60  # seconds
_EFETCH_CACHE: Dict[str, tuple] = {}


async def _efetch(ids: str) -> List[Dict]:
    """Fetch XML for comma-separated IDs, return list of article dict chunks.

    Parsed results are cached in-process for ``_EFETCH_TTL`` seconds,
    keyed on the exact ids string.
    """
    hit = _EFETCH_CACHE.get(ids)
    if hit is not None and (time.time() - hit[0]) < _EFETCH_TTL:
        return hit[1]
    params = {
        "db": "pubmed",
        "id": ids,
        "retmode": "xml",
    }
    if _API_KEY:
        params["api_key"] = _API_KEY
    async with httpx.AsyncClient(timeout=_TIMEOUT) as cli:
        r = await cli.get(_EFETCH, params=params)
        r.raise_for_status()
        xml = r.text
    parsed = xmltodict.parse(xml).get("PubmedArticleSet", {}).get("PubmedArticle", [])
    # xmltodict collapses a single article to a plain dict; normalise to a list.
    articles = parsed if isinstance(parsed, list) else [parsed]
    _EFETCH_CACHE[ids] = (time.time(), articles)
    return articles
# ---------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------
def _parse_article(art: Dict) -> Dict:
    """Map one xmltodict ``PubmedArticle`` chunk onto a flat paper dict.

    Raises ``KeyError``/``TypeError`` on records missing the mandatory
    ``MedlineCitation``/``Article`` structure; the caller skips those.
    """
    citation = art["MedlineCitation"]
    meta = citation["Article"]

    # PMID may arrive as a plain string or a dict with attributes + "#text".
    pmid = citation["PMID"]
    pmid = pmid.get("#text") if isinstance(pmid, dict) else str(pmid)

    # Title ----------------------------------------------------------
    # FIX: xmltodict yields a dict (not a str) when the title carries
    # inline markup (<i>, <sup>, …); extract its "#text" in that case.
    title = meta.get("ArticleTitle", "[No title]")
    if isinstance(title, dict):
        title = title.get("#text", "[No title]")

    # Authors --------------------------------------------------------
    authors_raw = meta.get("AuthorList", {}).get("Author", [])
    if isinstance(authors_raw, dict):  # single author collapses to a dict
        authors_raw = [authors_raw]
    names = []
    for a in authors_raw:
        if a.get("LastName"):
            names.append(f"{a['LastName']} {a.get('ForeName', '')}".strip())
        elif a.get("CollectiveName"):
            # Consortium / group authorship has no LastName field.
            names.append(a["CollectiveName"])
    authors = ", ".join(names) or "Unknown"

    # Abstract -------------------------------------------------------
    abstr = meta.get("Abstract", {}).get("AbstractText", "")
    if isinstance(abstr, list):
        # Structured abstracts: one segment per labelled section.
        # FIX: fall back to "" (not str(seg)) so a segment without "#text"
        # no longer dumps a raw dict repr into the summary.
        summary = " ".join(
            seg.get("#text", "") if isinstance(seg, dict) else str(seg)
            for seg in abstr
        )
    elif isinstance(abstr, dict):
        summary = abstr.get("#text", "")
    else:
        summary = abstr or ""

    # Published date -------------------------------------------------
    published = ""
    art_date = meta.get("ArticleDate")
    if isinstance(art_date, dict):
        published = art_date.get("Year", "")
    elif isinstance(art_date, list) and art_date:
        published = art_date[0].get("Year", "")
    if not published:
        # Fall back to the journal issue date (may be a MedlineDate range).
        pubdate = meta.get("Journal", {}).get("JournalIssue", {}).get("PubDate", {})
        published = pubdate.get("Year") or pubdate.get("MedlineDate", "")

    return {
        "title": title,
        "authors": authors,
        "summary": summary,
        "link": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
        "published": published,
        "source": "PubMed",
    }


async def fetch_pubmed(query: str, *, max_results: int = 5) -> List[Dict]:
    """Return latest PubMed papers as simple dicts ready for ``schemas.Paper``.

    Runs ESearch for ``query`` (capped by ``max_results``), then EFetch for
    the resulting PMIDs.  Malformed article records are skipped instead of
    failing the whole batch.
    """
    ids = await _esearch(query, max_results)
    if not ids:
        return []
    articles = await _efetch(",".join(ids))
    results: List[Dict] = []
    for art in articles:
        try:
            results.append(_parse_article(art))
        except (KeyError, TypeError):
            # Robustness: one bad/malformed record must not sink the batch.
            continue
    return results