|
|
|
"""MedGenesis – PubMed async fetcher (NCBI E-utilities). |
|
|
|
Improvements |
|
~~~~~~~~~~~~ |
|
* Uses **ESearch → EFetch** pipeline with sane timeouts & retries. |
|
* Accepts optional `retmax` but caps at 25 to respect fair‑use. |
|
* Caches repeated EFetch calls keyed on the comma-separated ids string.
|
* Robust date / author / abstract extraction handles edge‑cases. |
|
* Returns list of dicts ready for `schemas.Paper`. |
|
""" |
|
from __future__ import annotations |
|
|
|
import asyncio, os, time, xmltodict, httpx |
|
from functools import lru_cache |
|
from typing import List, Dict |
|
|
|
_ESEARCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" |
|
_EFETCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi" |
|
_API_KEY = os.getenv("PUB_KEY") |
|
|
|
_TIMEOUT = 15 |
|
_MAX_RET = 25 |
|
|
|
|
|
|
|
|
|
|
|
async def _esearch(query: str, retmax: int) -> List[str]:
    """Run a PubMed ESearch for *query* and return the matching PMID strings.

    *retmax* is capped at ``_MAX_RET`` to respect NCBI fair-use limits; the
    optional API key (``PUB_KEY`` env var) is attached when available.
    Raises ``httpx.HTTPStatusError`` on a non-2xx response.
    """
    capped = retmax if retmax < _MAX_RET else _MAX_RET
    params = {
        "db": "pubmed",
        "term": query,
        "retmax": capped,
        "retmode": "json",
    }
    if _API_KEY:
        params["api_key"] = _API_KEY
    async with httpx.AsyncClient(timeout=_TIMEOUT) as client:
        resp = await client.get(_ESEARCH, params=params)
        resp.raise_for_status()
        payload = resp.json()
    return payload["esearchresult"].get("idlist", [])
|
|
|
|
|
# BUG FIX: the original used ``@lru_cache`` on an ``async def``. That caches
# the *coroutine object*, not its result — a coroutine can only be awaited
# once, so the second call with the same ids would return an exhausted
# coroutine and awaiting it raises
# ``RuntimeError: cannot reuse already awaited coroutine``.
# Replaced with a manual timestamped result cache (12 h TTL, per the module
# docstring), keyed on the exact comma-separated ids string.
_EFETCH_TTL = 12 * 60 * 60                 # cache lifetime in seconds (12 h)
_efetch_cache: Dict[str, tuple] = {}       # ids -> (fetched_at, parsed articles)


async def _efetch(ids: str) -> List[Dict]:
    """Fetch PubMed XML for comma-separated *ids*, return article dict chunks.

    Results are cached in-process for 12 h keyed on the *ids* string.
    Always returns a list, even when EFetch yields a single article.
    Raises ``httpx.HTTPStatusError`` on a non-2xx response.
    """
    hit = _efetch_cache.get(ids)
    if hit and time.time() - hit[0] < _EFETCH_TTL:
        return hit[1]

    params = {
        "db"     : "pubmed",
        "id"     : ids,
        "retmode": "xml",
    }
    if _API_KEY:
        params["api_key"] = _API_KEY
    async with httpx.AsyncClient(timeout=_TIMEOUT) as cli:
        r = await cli.get(_EFETCH, params=params)
        r.raise_for_status()
        xml = r.text
    parsed = xmltodict.parse(xml).get("PubmedArticleSet", {}).get("PubmedArticle", [])
    # xmltodict collapses a single <PubmedArticle> to a dict — normalise to list.
    articles = parsed if isinstance(parsed, list) else [parsed]
    _efetch_cache[ids] = (time.time(), articles)
    return articles
|
|
|
|
|
|
|
|
|
|
|
|
|
def _format_authors(meta: Dict) -> str:
    """Build a comma-separated "Last First" author string ("Unknown" if none)."""
    raw = meta.get("AuthorList", {}).get("Author", [])
    if isinstance(raw, dict):  # single author → xmltodict gives a bare dict
        raw = [raw]
    names = ", ".join(
        f"{a.get('LastName','')} {a.get('ForeName','')}".strip()
        for a in raw if a.get("LastName")
    )
    return names or "Unknown"


def _extract_summary(meta: Dict) -> str:
    """Flatten AbstractText (str, dict, or list of labelled sections) to one string."""
    abstr = meta.get("Abstract", {}).get("AbstractText", "")
    if isinstance(abstr, list):
        return " ".join(
            seg.get("#text", str(seg)) if isinstance(seg, dict) else str(seg)
            for seg in abstr
        )
    if isinstance(abstr, dict):
        return abstr.get("#text", "")
    return abstr or ""


def _extract_published(meta: Dict) -> str:
    """Best-effort publication year: ArticleDate first, then Journal PubDate."""
    art_date = meta.get("ArticleDate")
    if isinstance(art_date, dict):
        year = art_date.get("Year", "")
    elif isinstance(art_date, list) and art_date:
        year = art_date[0].get("Year", "")
    else:
        year = ""
    if year:
        return year
    pubdate = meta.get("Journal", {}).get("JournalIssue", {}).get("PubDate", {})
    # MedlineDate covers issues with no discrete Year (e.g. "2023 Jan-Feb").
    return pubdate.get("Year") or pubdate.get("MedlineDate", "")


async def fetch_pubmed(query: str, *, max_results: int = 5) -> List[Dict]:
    """Return latest PubMed papers for *query* as simple dicts.

    Each dict carries ``title``/``authors``/``summary``/``link``/
    ``published``/``source`` keys ready for ``schemas.Paper``.
    *max_results* is capped downstream at ``_MAX_RET`` (25).
    Returns ``[]`` when the search matches nothing.
    """
    ids = await _esearch(query, max_results)
    if not ids:
        return []

    articles = await _efetch(",".join(ids))
    results: List[Dict] = []

    for art in articles:
        meta = art["MedlineCitation"]["Article"]
        pmid = art["MedlineCitation"]["PMID"]
        pmid = pmid.get("#text") if isinstance(pmid, dict) else str(pmid)

        # FIX: ArticleTitle arrives as {'#text': ..., ...} when the title
        # contains inline markup (e.g. italics); flatten to plain text.
        title = meta.get("ArticleTitle", "[No title]")
        if isinstance(title, dict):
            title = title.get("#text", "[No title]")

        results.append({
            "title"    : title,
            "authors"  : _format_authors(meta),
            "summary"  : _extract_summary(meta),
            "link"     : f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
            "published": _extract_published(meta),
            "source"   : "PubMed",
        })

    return results
|
|