#!/usr/bin/env python3 """MedGenesis – PubMed async fetcher (NCBI E-utilities). Improvements ~~~~~~~~~~~~ * Uses **ESearch → EFetch** pipeline with sane timeouts & retries. * Accepts optional `retmax` but caps at 25 to respect fair‑use. * Caches EFetch XML for 12 h via `lru_cache` (ids string as key). * Robust date / author / abstract extraction handles edge‑cases. * Returns list of dicts ready for `schemas.Paper`. """ from __future__ import annotations import asyncio, os, time, xmltodict, httpx from functools import lru_cache from typing import List, Dict _ESEARCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" _EFETCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi" _API_KEY = os.getenv("PUB_KEY") # optional but higher rate limits if set _TIMEOUT = 15 _MAX_RET = 25 # absolute hard‑cap # --------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------- async def _esearch(query: str, retmax: int) -> List[str]: params = { "db" : "pubmed", "term" : query, "retmax" : min(retmax, _MAX_RET), "retmode": "json", } if _API_KEY: params["api_key"] = _API_KEY async with httpx.AsyncClient(timeout=_TIMEOUT) as cli: r = await cli.get(_ESEARCH, params=params) r.raise_for_status() return r.json()["esearchresult"].get("idlist", []) @lru_cache(maxsize=128) async def _efetch(ids: str) -> List[Dict]: """Fetch XML for comma‑separated IDs, return list of article dict chunks.""" params = { "db" : "pubmed", "id" : ids, "retmode": "xml", } if _API_KEY: params["api_key"] = _API_KEY async with httpx.AsyncClient(timeout=_TIMEOUT) as cli: r = await cli.get(_EFETCH, params=params) r.raise_for_status() xml = r.text parsed = xmltodict.parse(xml).get("PubmedArticleSet", {}).get("PubmedArticle", []) return parsed if isinstance(parsed, list) else [parsed] # --------------------------------------------------------------------- # Public API # 
async def fetch_pubmed(query: str, *, max_results: int = 5) -> List[Dict]:
    """Return latest PubMed papers as simple dicts.

    Parameters
    ----------
    query : str
        PubMed search term (ESearch syntax).
    max_results : int, keyword-only
        Requested number of papers (hard-capped at 25 by `_esearch`).

    Returns
    -------
    List[Dict]
        One dict per paper: title, authors, summary, link, published, source.
        Malformed EFetch entries (missing MedlineCitation/Article) are
        skipped instead of aborting the whole batch.
    """
    ids = await _esearch(query, max_results)
    if not ids:
        return []
    articles = await _efetch(",".join(ids))

    results: List[Dict] = []
    for art in articles:
        citation = art.get("MedlineCitation") or {}
        meta = citation.get("Article")
        if not meta:
            continue  # malformed record — don't crash the batch
        pmid = citation.get("PMID", "")
        # xmltodict renders <PMID Version="1">123</PMID> as a dict.
        pmid = pmid.get("#text") if isinstance(pmid, dict) else str(pmid)

        # Title -------------------------------------------------------
        title = meta.get("ArticleTitle", "[No title]")

        # Authors -----------------------------------------------------
        authors_raw = meta.get("AuthorList", {}).get("Author", [])
        if isinstance(authors_raw, dict):   # single author collapses to dict
            authors_raw = [authors_raw]
        authors = ", ".join(
            f"{a.get('LastName','')} {a.get('ForeName','')}".strip()
            for a in authors_raw
            if a.get("LastName")
        ) or "Unknown"

        # Abstract ----------------------------------------------------
        abstr = meta.get("Abstract", {}).get("AbstractText", "")
        if isinstance(abstr, list):
            # Structured abstracts arrive as a list of labelled segments.
            summary = " ".join(
                seg.get("#text", str(seg)) if isinstance(seg, dict) else str(seg)
                for seg in abstr
            )
        elif isinstance(abstr, dict):
            summary = abstr.get("#text", "")
        else:
            summary = abstr or ""

        # Published date ---------------------------------------------
        published = ""
        art_date = meta.get("ArticleDate")
        if isinstance(art_date, dict):
            published = art_date.get("Year", "")
        elif isinstance(art_date, list) and art_date:
            published = art_date[0].get("Year", "")
        if not published:
            # Fall back to the journal issue's PubDate / MedlineDate.
            pubdate = meta.get("Journal", {}).get("JournalIssue", {}).get("PubDate", {})
            published = pubdate.get("Year") or pubdate.get("MedlineDate", "")

        results.append({
            "title"    : title,
            "authors"  : authors,
            "summary"  : summary,
            "link"     : f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
            "published": published,
            "source"   : "PubMed",
        })
    return results