"""
MedGenesis – arXiv async fetcher (Atom API).

* Uses HTTPS (`https://export.arxiv.org/...`) to avoid HTTP 301 redirects.
* Async httpx fetch with 2×/4× exponential-back-off retry.
* Parses the Atom feed with feedparser inside a thread (non-blocking).
* 6-hour LRU cache keyed by “query+max_results”.
* Returns a list of dicts matching schemas.Paper.

API docs: https://arxiv.org/help/api/user-manual
"""

from __future__ import annotations

import asyncio
import time
from functools import lru_cache
from typing import List, Dict
from urllib.parse import quote_plus

import feedparser
import httpx

# Query endpoint; the HTTPS host is used so requests are not answered with an
# HTTP 301 redirect.
_BASE = "https://export.arxiv.org/api/query?search_query="
# Timeout handed to httpx.AsyncClient, in seconds.
_TIMEOUT = 10
# Upper bound applied to the caller's max_results before the URL is built.
_MAX_RES = 25
# Headers sent with every request.
_HEADERS = {"User-Agent": "MedGenesis/1.0 (https://huggingface.co/spaces)"}


async def _fetch_raw(query: str, max_results: int, *, retries: int = 3) -> str:
    """Return the raw Atom XML that arXiv serves for *query*.

    Args:
        query: Free-text search expression; URL-encoded with ``quote_plus``.
        max_results: Requested number of entries, clamped to ``1.._MAX_RES``.
        retries: Maximum number of GET attempts.

    Returns:
        The body of the first response that comes back with HTTP 200.

    Raises:
        RuntimeError: If every attempt completed with a non-200 status, or
            if ``retries`` is not positive so no request was sent.
        httpx.TransportError: If the final attempt fails at the transport
            level (timeout, connection error). Earlier transport failures
            are retried instead of propagating.
    """
    max_results = max(1, min(max_results, _MAX_RES))
    url = f"{_BASE}{quote_plus(query)}&max_results={max_results}"
    delay = 2
    last: httpx.Response | None = None

    # One client for all attempts so connections can be reused between retries.
    async with httpx.AsyncClient(timeout=_TIMEOUT, headers=_HEADERS) as cli:
        for attempt in range(1, retries + 1):
            try:
                last = await cli.get(url)
            except httpx.TransportError:
                # Transient network failure: retry, but let the last one
                # surface unchanged so callers still see the httpx exception.
                if attempt >= retries:
                    raise
            else:
                if last.status_code == 200:
                    return last.text
            # Back off only between attempts; sleeping after the final
            # failure would just delay the error.
            if attempt < retries:
                await asyncio.sleep(delay)
                delay *= 2

    status = last.status_code if last is not None else "No response"
    raise RuntimeError(f"arXiv API failed: {status}")


# Seconds a cached result stays valid (6 hours).
_CACHE_TTL = 6 * 60 * 60
# Maximum number of distinct (query, max_results) keys kept in memory.
_CACHE_SIZE = 256
# (query, max_results) -> (time.monotonic() stamp, parsed papers).
# Insertion order doubles as LRU order: the first key is the least recent.
_CACHE: dict = {}


def _entry_to_paper(ent) -> Dict:
    """Map one parsed feed entry onto a schemas.Paper-compatible dict."""
    # Authors without a usable name are skipped rather than raising.
    names = [getattr(a, "name", "") for a in getattr(ent, "authors", None) or []]
    return {
        "title": getattr(ent, "title", "[No title]"),
        "authors": ", ".join(n for n in names if n) or "Unknown",
        "summary": getattr(ent, "summary", ""),
        "link": getattr(ent, "link", ""),
        "published": getattr(ent, "published", ""),
        "source": "arXiv",
    }


async def fetch_arxiv(query: str, *, max_results: int = 5) -> List[Dict]:
    """Return arXiv paper dicts compatible with schemas.Paper.

    Results are cached per ``(query, max_results)`` for ``_CACHE_TTL``
    seconds, with at most ``_CACHE_SIZE`` keys retained. The parsed result
    is cached rather than the coroutine: ``functools.lru_cache`` on an
    ``async def`` stores the coroutine object, which cannot be awaited twice.

    Args:
        query: Free-text search expression passed to ``_fetch_raw``.
        max_results: Requested number of entries (``_fetch_raw`` clamps it).

    Returns:
        A list of dicts with the keys ``title``, ``authors``, ``summary``,
        ``link``, ``published`` and ``source``. Each call returns fresh
        copies, so callers may mutate them without affecting the cache.

    Raises:
        RuntimeError: Propagated from ``_fetch_raw`` when the API call fails.
    """
    key = (query, max_results)
    hit = _CACHE.get(key)
    if hit is not None:
        stamp, cached = hit
        if time.monotonic() - stamp < _CACHE_TTL:
            # Re-insert to mark the key as most recently used.
            _CACHE[key] = _CACHE.pop(key)
            return [dict(p) for p in cached]
        del _CACHE[key]  # expired

    xml_text = await _fetch_raw(query, max_results)
    # feedparser is synchronous; parse in a worker thread to keep the loop free.
    feed = await asyncio.to_thread(feedparser.parse, xml_text)
    papers = [_entry_to_paper(ent) for ent in feed.entries]

    # Another task may have stored the same key while this one was awaiting.
    _CACHE.pop(key, None)
    _CACHE[key] = (time.monotonic(), papers)
    while len(_CACHE) > _CACHE_SIZE:
        del _CACHE[next(iter(_CACHE))]
    return [dict(p) for p in papers]


# Keeps the reset hook that the previous lru_cache wrapper exposed.
fetch_arxiv.cache_clear = _CACHE.clear


if __name__ == "__main__":
    # Manual smoke test: calls fetch_arxiv for a sample query and prints titles.
    async def _demo() -> None:
        """Fetch three papers for a sample query and print each title."""
        papers = await fetch_arxiv("glioblastoma CRISPR", max_results=3)
        for p in papers:
            print(p["title"])

    asyncio.run(_demo())