File size: 3,075 Bytes
aae312e
 
ee63964
aae312e
 
 
 
 
 
 
 
 
 
 
 
 
 
83ccb99
aae312e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ee63964
 
aae312e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ee63964
aae312e
 
 
 
 
 
ee63964
 
aae312e
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python3
"""MedGenesis – arXiv async fetcher (Atom API).

Improvements over the legacy helper
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* Uses **httpx.AsyncClient** with 10‑second timeout & *exponential back‑off retry*.
* Caches results in process memory (key = query+max_results).
* Parses feed with **feedparser** inside a thread to avoid blocking.
* Normalises output to match `schemas.Paper`.

API docs: https://arxiv.org/help/api/user-manual
"""
from __future__ import annotations

import asyncio
import time
from functools import lru_cache
from typing import List, Dict
from urllib.parse import quote_plus

import feedparser
import httpx

_BASE = "http://export.arxiv.org/api/query?search_query="
_TIMEOUT = 10
_MAX_RES = 25
_HEADERS = {"User-Agent": "MedGenesis/1.0 (https://huggingface.co/spaces)"}

# ---------------------------------------------------------------------
# Internal fetch w/ retry
# ---------------------------------------------------------------------
async def _fetch_raw(query: str, max_results: int, *, retries: int = 3) -> str:
    """Return the Atom XML text of an arXiv search, retrying with back-off.

    Args:
        query: Search expression; it is URL-encoded with ``quote_plus``
            before being appended to ``_BASE``.
        max_results: Requested number of entries, clamped to the range
            ``1 .. _MAX_RES``.
        retries: Maximum number of attempts. Values below 1 perform no
            request at all.

    Returns:
        The body of the first response whose status code is 200.

    Raises:
        RuntimeError: If every attempt returned a non-200 status, or no
            attempt was made.
        httpx.TransportError: If the final attempt failed at the network
            level (connection error, timeout, ...).
    """
    max_results = max(1, min(max_results, _MAX_RES))
    url = f"{_BASE}{quote_plus(query)}&max_results={max_results}"
    delay = 2
    last: httpx.Response | None = None
    # A single client serves all attempts so its connection pool is reused.
    async with httpx.AsyncClient(timeout=_TIMEOUT, headers=_HEADERS) as cli:
        for attempt in range(1, retries + 1):
            try:
                last = await cli.get(url)
            except httpx.TransportError:
                # Network-level failures are retried like bad statuses; only
                # the last one is allowed to propagate to the caller.
                if attempt >= retries:
                    raise
            else:
                if last.status_code == 200:
                    return last.text
            # Back off only when another attempt will actually follow.
            if attempt < retries:
                await asyncio.sleep(delay)
                delay *= 2
    status = last.status_code if last is not None else "No response"
    raise RuntimeError(f"arXiv API failed: {status}")


# ---------------------------------------------------------------------
# Cached fetch + parse
# ---------------------------------------------------------------------
# NOTE: functools.lru_cache must not decorate an ``async def``: it would cache
# the coroutine object, and awaiting that object a second time raises
# ``RuntimeError: cannot reuse already awaited coroutine``.  The parsed results
# are cached explicitly instead.
_CACHE_TTL = 6 * 60 * 60  # seconds a cached result stays fresh (6 h)
_CACHE_MAX = 256          # maximum number of cached queries
# Maps (query, clamped max_results) -> (monotonic timestamp, list of paper dicts).
_ARXIV_CACHE: dict = {}


async def fetch_arxiv(query: str, *, max_results: int = 5) -> List[Dict]:
    """Return list of arXiv paper dicts compatible with `schemas.Paper`.

    Results are cached in process memory for ``_CACHE_TTL`` seconds, keyed by
    the query and the clamped ``max_results``; at most ``_CACHE_MAX`` queries
    are retained (oldest inserted entry is dropped first).

    Args:
        query: Search expression forwarded to ``_fetch_raw``.
        max_results: Requested number of entries (clamped to ``1 .. _MAX_RES``).

    Returns:
        A fresh list of dicts with the keys ``title``, ``authors``,
        ``summary``, ``link``, ``published`` and ``source``.  ``authors`` is a
        comma-separated string, or ``"Unknown"`` when the entry carries no
        author names.  Callers may mutate the returned objects freely.

    Raises:
        Whatever ``_fetch_raw`` raises when the request ultimately fails.
    """
    # Use the same clamping as _fetch_raw so equivalent requests share a key.
    key = (query, max(1, min(max_results, _MAX_RES)))
    hit = _ARXIV_CACHE.get(key)
    if hit is not None and time.monotonic() - hit[0] < _CACHE_TTL:
        return [dict(paper) for paper in hit[1]]

    xml_text = await _fetch_raw(query, max_results)

    # feedparser is blocking; run in thread
    feed = await asyncio.to_thread(feedparser.parse, xml_text)

    results: List[Dict] = []
    for ent in feed.entries:
        # Tolerate a missing author list as well as authors without a name.
        names = [getattr(a, "name", "") for a in getattr(ent, "authors", [])]
        authors = ", ".join(n for n in names if n) or "Unknown"
        results.append({
            "title"    : getattr(ent, "title", "[No title]"),
            "authors"  : authors,
            "summary"  : getattr(ent, "summary", ""),
            "link"     : getattr(ent, "link", ""),
            "published": getattr(ent, "published", ""),
            "source"   : "arXiv",
        })

    # Re-insert so a refreshed key counts as the newest entry, then trim.
    _ARXIV_CACHE.pop(key, None)
    _ARXIV_CACHE[key] = (time.monotonic(), results)
    while len(_ARXIV_CACHE) > _CACHE_MAX:
        del _ARXIV_CACHE[next(iter(_ARXIV_CACHE))]

    # Hand out copies so callers cannot corrupt the cached data.
    return [dict(paper) for paper in results]


# Kept for callers that relied on the former lru_cache wrapper's API.
fetch_arxiv.cache_clear = _ARXIV_CACHE.clear


# ---------------------------------------------------------------------
# CLI demo
# ---------------------------------------------------------------------
if __name__ == "__main__":
    import json

    async def _demo():
        """Fetch three sample papers and print the start of their JSON dump."""
        sample = await fetch_arxiv("glioblastoma CRISPR", max_results=3)
        rendered = json.dumps(sample, indent=2)
        # Only the first 500 characters are shown to keep the output short.
        print(rendered[:500])

    asyncio.run(_demo())