|
|
|
"""MedGenesis – PubMed async fetcher (NCBI E-utilities). |
|
|
|
Improvements |
|
~~~~~~~~~~~~ |
|
* Uses **ESearch → EFetch** pipeline with sane timeouts & retries. |
|
* Accepts optional `retmax` but caps at 25 to respect fair‑use. |
|
* Caches repeated EFetch calls keyed on the comma-separated ids string.
|
* Robust date / author / abstract extraction handles edge‑cases. |
|
* Returns list of dicts ready for `schemas.Paper`. |
|
""" |
|
from __future__ import annotations |
|
|
|
import asyncio, os, time, xmltodict, httpx |
|
from functools import lru_cache |
|
from typing import List, Dict |
|
|
|
_ESEARCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" |
|
_EFETCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi" |
|
_API_KEY = os.getenv("PUB_KEY") |
|
|
|
_TIMEOUT = 15 |
|
_MAX_RET = 25 |
|
|
|
|
|
|
|
|
|
|
|
async def _esearch(query: str, retmax: int) -> List[str]:
    """Run a PubMed ESearch for *query* and return the matching PMID strings.

    *retmax* is capped at ``_MAX_RET`` to respect NCBI fair-use limits; the
    optional API key (``PUB_KEY`` env var) is attached when available.
    Raises ``httpx.HTTPStatusError`` on a non-2xx response.
    """
    capped = retmax if retmax < _MAX_RET else _MAX_RET
    params = {
        "db": "pubmed",
        "term": query,
        "retmax": capped,
        "retmode": "json",
    }
    if _API_KEY:
        params["api_key"] = _API_KEY
    async with httpx.AsyncClient(timeout=_TIMEOUT) as client:
        resp = await client.get(_ESEARCH, params=params)
        resp.raise_for_status()
        payload = resp.json()
    return payload["esearchresult"].get("idlist", [])
|
|
|
|
|
# BUG FIX: the original used ``@lru_cache`` on an ``async def``. That caches
# the *coroutine object*, not its result — a coroutine can only be awaited
# once, so the second call with the same ids would return an exhausted
# coroutine and awaiting it raises
# ``RuntimeError: cannot reuse already awaited coroutine``.
# Replaced with a manual timestamped result cache (12 h TTL, per the module
# docstring), keyed on the exact comma-separated ids string.
_EFETCH_TTL = 12 * 60 * 60                 # cache lifetime in seconds (12 h)
_efetch_cache: Dict[str, tuple] = {}       # ids -> (fetched_at, parsed articles)


async def _efetch(ids: str) -> List[Dict]:
    """Fetch PubMed XML for comma-separated *ids*, return article dict chunks.

    Results are cached in-process for 12 h keyed on the *ids* string.
    Always returns a list, even when EFetch yields a single article.
    Raises ``httpx.HTTPStatusError`` on a non-2xx response.
    """
    hit = _efetch_cache.get(ids)
    if hit and time.time() - hit[0] < _EFETCH_TTL:
        return hit[1]

    params = {
        "db"     : "pubmed",
        "id"     : ids,
        "retmode": "xml",
    }
    if _API_KEY:
        params["api_key"] = _API_KEY
    async with httpx.AsyncClient(timeout=_TIMEOUT) as cli:
        r = await cli.get(_EFETCH, params=params)
        r.raise_for_status()
        xml = r.text
    parsed = xmltodict.parse(xml).get("PubmedArticleSet", {}).get("PubmedArticle", [])
    # xmltodict collapses a single <PubmedArticle> to a dict — normalise to list.
    articles = parsed if isinstance(parsed, list) else [parsed]
    _efetch_cache[ids] = (time.time(), articles)
    return articles
|
|
|
|
|
|
|
|
|
|
|
|
|
def _format_authors(meta: Dict) -> str:
    """Build a comma-separated "Last First" author string ("Unknown" if none)."""
    raw = meta.get("AuthorList", {}).get("Author", [])
    if isinstance(raw, dict):  # single author → xmltodict gives a bare dict
        raw = [raw]
    names = ", ".join(
        f"{a.get('LastName','')} {a.get('ForeName','')}".strip()
        for a in raw if a.get("LastName")
    )
    return names or "Unknown"


def _extract_summary(meta: Dict) -> str:
    """Flatten AbstractText (str, dict, or list of labelled sections) to one string."""
    abstr = meta.get("Abstract", {}).get("AbstractText", "")
    if isinstance(abstr, list):
        return " ".join(
            seg.get("#text", str(seg)) if isinstance(seg, dict) else str(seg)
            for seg in abstr
        )
    if isinstance(abstr, dict):
        return abstr.get("#text", "")
    return abstr or ""


def _extract_published(meta: Dict) -> str:
    """Best-effort publication year: ArticleDate first, then Journal PubDate."""
    art_date = meta.get("ArticleDate")
    if isinstance(art_date, dict):
        year = art_date.get("Year", "")
    elif isinstance(art_date, list) and art_date:
        year = art_date[0].get("Year", "")
    else:
        year = ""
    if year:
        return year
    pubdate = meta.get("Journal", {}).get("JournalIssue", {}).get("PubDate", {})
    # MedlineDate covers issues with no discrete Year (e.g. "2023 Jan-Feb").
    return pubdate.get("Year") or pubdate.get("MedlineDate", "")


async def fetch_pubmed(query: str, *, max_results: int = 5) -> List[Dict]:
    """Return latest PubMed papers for *query* as simple dicts.

    Each dict carries ``title``/``authors``/``summary``/``link``/
    ``published``/``source`` keys ready for ``schemas.Paper``.
    *max_results* is capped downstream at ``_MAX_RET`` (25).
    Returns ``[]`` when the search matches nothing.
    """
    ids = await _esearch(query, max_results)
    if not ids:
        return []

    articles = await _efetch(",".join(ids))
    results: List[Dict] = []

    for art in articles:
        meta = art["MedlineCitation"]["Article"]
        pmid = art["MedlineCitation"]["PMID"]
        pmid = pmid.get("#text") if isinstance(pmid, dict) else str(pmid)

        # FIX: ArticleTitle arrives as {'#text': ..., ...} when the title
        # contains inline markup (e.g. italics); flatten to plain text.
        title = meta.get("ArticleTitle", "[No title]")
        if isinstance(title, dict):
            title = title.get("#text", "[No title]")

        results.append({
            "title"    : title,
            "authors"  : _format_authors(meta),
            "summary"  : _extract_summary(meta),
            "link"     : f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
            "published": _extract_published(meta),
            "source"   : "PubMed",
        })

    return results
|
|