File size: 4,521 Bytes
25f3b01
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env python3
"""MedGenesis – PubMed async fetcher (NCBI E-utilities).

Improvements
~~~~~~~~~~~~
* Uses **ESearch → EFetch** pipeline with sane timeouts.
* Accepts optional `retmax` but caps at 25 to respect fair‑use.
* Caches EFetch results in‑memory for the process lifetime (ids string as key).
* Robust date / author / abstract extraction handles edge‑cases.
* Returns list of dicts ready for `schemas.Paper`.
"""
from __future__ import annotations

import asyncio, os, time, xmltodict, httpx
from functools import lru_cache
from typing import List, Dict

# NCBI E-utilities endpoints: ESearch (query → PMID list), EFetch (PMIDs → XML).
_ESEARCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
_EFETCH  = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
_API_KEY = os.getenv("PUB_KEY")  # optional but higher rate limits if set

_TIMEOUT = 15  # per-request HTTP timeout in seconds (passed to httpx.AsyncClient)
_MAX_RET = 25  # absolute hard‑cap on results per query (fair-use)

# ---------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------

async def _esearch(query: str, retmax: int) -> List[str]:
    """Run an ESearch query and return the matching PubMed ID strings.

    ``retmax`` is clamped to ``_MAX_RET``; the optional API key is attached
    when configured. Raises ``httpx.HTTPStatusError`` on a non-2xx response.
    """
    payload = {
        "db": "pubmed",
        "term": query,
        "retmax": min(retmax, _MAX_RET),
        "retmode": "json",
    }
    if _API_KEY:
        payload["api_key"] = _API_KEY
    async with httpx.AsyncClient(timeout=_TIMEOUT) as client:
        resp = await client.get(_ESEARCH, params=payload)
        resp.raise_for_status()
        body = resp.json()
    # Missing "idlist" (no hits) degrades to an empty list.
    return body["esearchresult"].get("idlist", [])


# Explicit result cache. The original ``@lru_cache`` on an ``async def`` is a
# bug: it caches the *coroutine object*, which can be awaited only once, so the
# first repeated call with the same ids raised
# ``RuntimeError: cannot reuse already awaited coroutine``.
_EFETCH_CACHE: Dict[str, List[Dict]] = {}


async def _efetch(ids: str) -> List[Dict]:
    """Fetch XML for comma‑separated IDs, return list of article dict chunks.

    Parsed results are cached in-memory per unique *ids* string for the
    process lifetime, so repeated queries skip the network round-trip.
    Raises ``httpx.HTTPStatusError`` on a non-2xx response.
    """
    cached = _EFETCH_CACHE.get(ids)
    if cached is not None:
        return cached

    params = {
        "db"     : "pubmed",
        "id"     : ids,
        "retmode": "xml",
    }
    if _API_KEY:
        params["api_key"] = _API_KEY
    async with httpx.AsyncClient(timeout=_TIMEOUT) as cli:
        r = await cli.get(_EFETCH, params=params)
        r.raise_for_status()
        xml = r.text

    parsed = xmltodict.parse(xml).get("PubmedArticleSet", {}).get("PubmedArticle", [])
    # xmltodict returns a bare dict for a single article; normalise to a list.
    articles = parsed if isinstance(parsed, list) else [parsed]
    _EFETCH_CACHE[ids] = articles
    return articles


# ---------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------

async def fetch_pubmed(query: str, *, max_results: int = 5) -> List[Dict]:
    """Return latest PubMed papers for *query* as simple dicts.

    Parameters
    ----------
    query:
        Free-text PubMed search term.
    max_results:
        Requested result count; capped at ``_MAX_RET`` inside ``_esearch``.

    Returns a list of dicts with keys ``title``, ``authors``, ``summary``,
    ``link``, ``published`` and ``source`` — ready for ``schemas.Paper``.
    Returns ``[]`` when the search yields no IDs.
    """
    ids = await _esearch(query, max_results)
    if not ids:
        return []

    articles = await _efetch(",".join(ids))
    return [_article_record(art) for art in articles]


def _article_record(art: Dict) -> Dict:
    """Flatten one xmltodict ``PubmedArticle`` node into a plain dict."""
    citation = art["MedlineCitation"]
    meta = citation["Article"]

    # PMID is a dict ({"#text": ..., "@Version": ...}) when attributes exist.
    pmid = citation["PMID"]
    pmid = pmid.get("#text") if isinstance(pmid, dict) else str(pmid)

    return {
        "title"    : _extract_title(meta),
        "authors"  : _extract_authors(meta),
        "summary"  : _extract_abstract(meta),
        "link"     : f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
        "published": _extract_published(meta),
        "source"   : "PubMed",
    }


def _extract_title(meta: Dict) -> str:
    """Title text; xmltodict yields a dict when the element carries markup."""
    title = meta.get("ArticleTitle", "[No title]")
    if isinstance(title, dict):
        # Fix: the original passed this dict through unflattened.
        return title.get("#text", "[No title]")
    return title or "[No title]"


def _extract_authors(meta: Dict) -> str:
    """Comma-joined "Last First" names; "Unknown" when none have a LastName."""
    authors_raw = (meta.get("AuthorList") or {}).get("Author", [])
    if isinstance(authors_raw, dict):
        authors_raw = [authors_raw]
    names = [
        f"{a.get('LastName','')} {a.get('ForeName','')}".strip()
        for a in authors_raw if a.get("LastName")
    ]
    return ", ".join(names) or "Unknown"


def _extract_abstract(meta: Dict) -> str:
    """AbstractText may be a str, a dict, or a list of either — join to str."""
    abstr = (meta.get("Abstract") or {}).get("AbstractText", "")
    if isinstance(abstr, list):
        return " ".join(
            seg.get("#text", str(seg)) if isinstance(seg, dict) else str(seg)
            for seg in abstr
        )
    if isinstance(abstr, dict):
        return abstr.get("#text", "")
    return abstr or ""


def _extract_published(meta: Dict) -> str:
    """Publication year: prefer ArticleDate, fall back to Journal PubDate."""
    published = ""
    art_date = meta.get("ArticleDate")
    if isinstance(art_date, dict):
        published = art_date.get("Year", "")
    elif isinstance(art_date, list) and art_date:
        published = art_date[0].get("Year", "")
    if not published:
        pubdate = ((meta.get("Journal") or {}).get("JournalIssue") or {}).get("PubDate") or {}
        # MedlineDate holds free-form ranges like "2003 Jan-Feb" when no Year.
        published = pubdate.get("Year") or pubdate.get("MedlineDate", "")
    return published