#!/usr/bin/env python3 """ MedGenesis – arXiv async fetcher (Atom API). * Uses HTTPS (`https://export.arxiv.org/...`) to avoid HTTP 301 redirects. * Async httpx fetch with 2×/4× exponential-back-off retry. * Parses the Atom feed with feedparser inside a thread (non-blocking). * 6-hour LRU cache keyed by “query+max_results”. * Returns a list of dicts matching schemas.Paper. API docs: https://arxiv.org/help/api/user-manual """ from __future__ import annotations import asyncio from functools import lru_cache from typing import List, Dict from urllib.parse import quote_plus import feedparser import httpx _BASE = "https://export.arxiv.org/api/query?search_query=" _TIMEOUT = 10 _MAX_RES = 25 _HEADERS = {"User-Agent": "MedGenesis/1.0 (https://huggingface.co/spaces)"} # ────────────────────────────────────────────────────────────────────── # Internal fetch helper with retry # ────────────────────────────────────────────────────────────────────── async def _fetch_raw(query: str, max_results: int, *, retries: int = 3) -> str: """Return raw Atom XML from arXiv.""" max_results = max(1, min(max_results, _MAX_RES)) url = f"{_BASE}{quote_plus(query)}&max_results={max_results}" delay = 2 last: httpx.Response | None = None for _ in range(retries): async with httpx.AsyncClient(timeout=_TIMEOUT, headers=_HEADERS) as cli: last = await cli.get(url) if last.status_code == 200: return last.text await asyncio.sleep(delay) delay *= 2 raise RuntimeError(f"arXiv API failed: {last.status_code if last else 'No response'}") # ────────────────────────────────────────────────────────────────────── # Public cached fetch + parse # ────────────────────────────────────────────────────────────────────── @lru_cache(maxsize=256) async def fetch_arxiv(query: str, *, max_results: int = 5) -> List[Dict]: """Return arXiv paper dicts compatible with schemas.Paper.""" xml_text = await _fetch_raw(query, max_results) # feedparser is blocking; parse in thread feed = await asyncio.to_thread(feedparser.parse, xml_text) papers: List[Dict] = [] for ent in feed.entries: authors = ( ", ".join(a.name for a in getattr(ent, "authors", [])) if hasattr(ent, "authors") else "Unknown" ) papers.append({ "title" : getattr(ent, "title", "[No title]"), "authors" : authors, "summary" : getattr(ent, "summary", ""), "link" : getattr(ent, "link", ""), "published": getattr(ent, "published", ""), "source" : "arXiv", }) return papers # ────────────────────────────────────────────────────────────────────── # CLI demo # ────────────────────────────────────────────────────────────────────── if __name__ == "__main__": async def _demo(): papers = await fetch_arxiv("glioblastoma CRISPR", max_results=3) for p in papers: print(p["title"]) asyncio.run(_demo())