#!/usr/bin/env python3
"""MedGenesis – arXiv async fetcher (Atom API).

Improvements over the legacy helper
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* Uses **httpx.AsyncClient** with 10‑second timeout & *exponential back‑off retry*
  (non-200 responses and transport errors are both retried).
* Caches raw XML for 6 h in a bounded in-process LRU (key = query+max_results).
* Parses feed with **feedparser** inside a thread to avoid blocking.
* Normalises output to match `schemas.Paper`.

API docs: https://arxiv.org/help/api/user-manual
"""
from __future__ import annotations

import asyncio
import time
from collections import OrderedDict
from functools import lru_cache  # noqa: F401  (no longer used here; import kept so nothing importing it from this module breaks)
from typing import Dict, List
from urllib.parse import quote_plus

import feedparser
import httpx

_BASE = "http://export.arxiv.org/api/query?search_query="
_TIMEOUT = 10
_MAX_RES = 25
_HEADERS = {"User-Agent": "MedGenesis/1.0 (https://huggingface.co/spaces)"}

_CACHE_TTL = 6 * 60 * 60  # seconds a cached XML payload stays fresh (6 h)
_CACHE_MAX = 256          # maximum number of cached queries (LRU eviction)

# (query, clamped max_results) -> (monotonic timestamp, raw Atom XML).
# Ordered oldest-used first so the least recently used entry is evicted.
_xml_cache: OrderedDict[tuple[str, int], tuple[float, str]] = OrderedDict()


# ---------------------------------------------------------------------
# Internal fetch w/ retry
# ---------------------------------------------------------------------
async def _fetch_raw(query: str, max_results: int, *, retries: int = 3) -> str:
    """Return Atom XML text from arXiv.

    Args:
        query: Free-text search query; URL-encoded before being sent.
        max_results: Requested number of entries, clamped to ``1.._MAX_RES``.
        retries: Total number of attempts before giving up.

    Returns:
        The response body of the first attempt that answers HTTP 200.

    Raises:
        RuntimeError: If every attempt completed with a non-200 status, or
            if ``retries`` is not positive (no request is made).
        httpx.RequestError: If the final attempt fails at the transport
            level (timeout, connection error, ...).
    """
    max_results = max(1, min(max_results, _MAX_RES))
    url = f"{_BASE}{quote_plus(query)}&max_results={max_results}"

    delay = 2
    last: httpx.Response | None = None
    # One client for all attempts so the connection pool is reused.
    async with httpx.AsyncClient(timeout=_TIMEOUT, headers=_HEADERS) as cli:
        for attempt in range(retries):
            is_final = attempt == retries - 1
            try:
                last = await cli.get(url)
            except httpx.RequestError:
                # Transient network failures get the same back-off as bad
                # statuses; only the last one is allowed to propagate.
                if is_final:
                    raise
            else:
                if last.status_code == 200:
                    return last.text
            # Back off only when another attempt will actually follow.
            if not is_final:
                await asyncio.sleep(delay)
                delay *= 2

    status = last.status_code if last is not None else "No response"
    raise RuntimeError(f"arXiv API failed: {status}")


# ---------------------------------------------------------------------
# Cached fetch + parse
# ---------------------------------------------------------------------
async def _fetch_cached(query: str, max_results: int) -> str:
    """Return Atom XML for the query, reusing a cached copy younger than 6 h."""
    # Key on the clamped value so e.g. 25 and 100 share a single entry.
    key = (query, max(1, min(max_results, _MAX_RES)))
    now = time.monotonic()

    hit = _xml_cache.get(key)
    if hit is not None:
        stamp, xml_text = hit
        if now - stamp < _CACHE_TTL:
            _xml_cache.move_to_end(key)  # mark as most recently used
            return xml_text
        del _xml_cache[key]  # expired

    xml_text = await _fetch_raw(query, max_results)
    _xml_cache[key] = (time.monotonic(), xml_text)
    _xml_cache.move_to_end(key)
    while len(_xml_cache) > _CACHE_MAX:
        _xml_cache.popitem(last=False)  # drop least recently used
    return xml_text


def _entry_to_paper(ent) -> Dict:
    """Convert one parsed feed entry into a plain paper dict."""
    names = [
        name
        for name in (getattr(a, "name", "") for a in getattr(ent, "authors", []))
        if name
    ]
    return {
        "title"    : getattr(ent, "title", "[No title]"),
        # Fall back to "Unknown" when there are no authors or none has a name.
        "authors"  : ", ".join(names) or "Unknown",
        "summary"  : getattr(ent, "summary", ""),
        "link"     : getattr(ent, "link", ""),
        "published": getattr(ent, "published", ""),
        "source"   : "arXiv",
    }


async def fetch_arxiv(query: str, *, max_results: int = 5) -> List[Dict]:
    """Return list of arXiv paper dicts compatible with `schemas.Paper`.

    The raw XML is cached (see ``_fetch_cached``); the result list is built
    anew on every call, so callers may mutate it freely.

    Args:
        query: Free-text search query.
        max_results: Maximum number of papers to request (clamped to
            ``1.._MAX_RES`` by the fetch layer).

    Returns:
        One dict per feed entry with the keys ``title``, ``authors``,
        ``summary``, ``link``, ``published`` and ``source``.

    Raises:
        RuntimeError: If arXiv never answered with HTTP 200.
        httpx.RequestError: If the final attempt failed at the transport level.
    """
    xml_text = await _fetch_cached(query, max_results)

    # feedparser is blocking; run in thread
    feed = await asyncio.to_thread(feedparser.parse, xml_text)
    return [_entry_to_paper(ent) for ent in feed.entries]


# ``functools.lru_cache`` used to wrap ``fetch_arxiv`` and exposed
# ``cache_clear()``; keep that entry point working for existing callers.
fetch_arxiv.cache_clear = _xml_cache.clear  # type: ignore[attr-defined]


# ---------------------------------------------------------------------
# CLI demo
# ---------------------------------------------------------------------
if __name__ == "__main__":
    import json

    async def _demo():
        papers = await fetch_arxiv("glioblastoma CRISPR", max_results=3)
        print(json.dumps(papers, indent=2)[:500])

    asyncio.run(_demo())