MCP_Res / mcp /arxiv.py
mgbam's picture
Update mcp/arxiv.py
f62a8d2 verified
#!/usr/bin/env python3
"""
MedGenesis – arXiv async fetcher (Atom API).
* Uses HTTPS (`https://export.arxiv.org/...`) to avoid HTTP 301 redirects.
* Async httpx fetch with 2×/4× exponential-back-off retry.
* Parses the Atom feed with feedparser inside a thread (non-blocking).
* 6-hour LRU cache keyed by “query+max_results”.
* Returns a list of dicts matching schemas.Paper.
API docs: https://arxiv.org/help/api/user-manual
"""
from __future__ import annotations

import asyncio
import time
from functools import lru_cache
from typing import List, Dict
from urllib.parse import quote_plus

import feedparser
import httpx
# Query endpoint prefix; the URL-encoded search expression is appended directly.
_BASE = "https://export.arxiv.org/api/query?search_query="
# Timeout handed to the httpx client (httpx treats a bare number as seconds).
_TIMEOUT = 10
# Upper bound that a caller-supplied ``max_results`` is clamped to.
_MAX_RES = 25
# Headers sent with every request to the API.
_HEADERS = {"User-Agent": "MedGenesis/1.0 (https://huggingface.co/spaces)"}
# ──────────────────────────────────────────────────────────────────────
# Internal fetch helper with retry
# ──────────────────────────────────────────────────────────────────────
async def _fetch_raw(query: str, max_results: int, *, retries: int = 3) -> str:
    """Return raw Atom XML from arXiv, retrying with exponential back-off.

    Args:
        query: Free-text search expression; URL-encoded before sending.
        max_results: Requested number of entries, clamped to ``1.._MAX_RES``.
        retries: Total number of attempts to make before giving up.

    Returns:
        The response body (Atom XML) of the first HTTP 200 reply.

    Raises:
        RuntimeError: If the last attempt produced a non-200 response, or if
            no attempt was made at all (``retries <= 0``).
        httpx.TransportError: If the last attempt failed at the transport
            level (timeout, connection error, ...).
    """
    max_results = max(1, min(max_results, _MAX_RES))
    url = f"{_BASE}{quote_plus(query)}&max_results={max_results}"
    delay = 2
    last: httpx.Response | None = None
    last_exc: httpx.TransportError | None = None

    # One client for all attempts so the connection pool is reused.
    async with httpx.AsyncClient(timeout=_TIMEOUT, headers=_HEADERS) as cli:
        for attempt in range(retries):
            # Back off *between* attempts only; sleeping after the final
            # failure would just delay the error.
            if attempt:
                await asyncio.sleep(delay)
                delay *= 2
            try:
                last = await cli.get(url)
            except httpx.TransportError as exc:
                # Transient network problems deserve a retry just like a
                # non-200 status does.
                last, last_exc = None, exc
                continue
            last_exc = None
            if last.status_code == 200:
                return last.text

    if last_exc is not None:
        # Surface the real network error instead of masking it.
        raise last_exc
    raise RuntimeError(f"arXiv API failed: {last.status_code if last else 'No response'}")
# ──────────────────────────────────────────────────────────────────────
# Public cached fetch + parse
# ──────────────────────────────────────────────────────────────────────
# Result cache. ``functools.lru_cache`` cannot be used on a coroutine
# function: it would store the coroutine object itself, and awaiting that
# object a second time raises "cannot reuse already awaited coroutine".
_CACHE_TTL = 6 * 60 * 60  # seconds an entry stays fresh (6 hours)
_CACHE_MAX = 256          # maximum number of cached queries
# (query, max_results) -> (monotonic timestamp, parsed papers)
_cache: dict[tuple[str, int], tuple[float, List[Dict]]] = {}


def _entry_to_paper(ent) -> Dict:
    """Convert one feedparser entry into a schemas.Paper-compatible dict."""
    names = [
        name
        for name in (getattr(a, "name", "") for a in getattr(ent, "authors", []))
        if name
    ]
    return {
        "title": getattr(ent, "title", "[No title]"),
        # Fall back to "Unknown" when the entry has no usable author names.
        "authors": ", ".join(names) if names else "Unknown",
        "summary": getattr(ent, "summary", ""),
        "link": getattr(ent, "link", ""),
        "published": getattr(ent, "published", ""),
        "source": "arXiv",
    }


async def fetch_arxiv(query: str, *, max_results: int = 5) -> List[Dict]:
    """Return arXiv paper dicts compatible with schemas.Paper.

    Successful results are cached per ``(query, max_results)`` for
    ``_CACHE_TTL`` seconds; at most ``_CACHE_MAX`` queries are kept and the
    least recently used one is evicted first. Failures are never cached.

    Args:
        query: Free-text search expression.
        max_results: Number of entries to request.

    Returns:
        A fresh list of dicts with the keys ``title``, ``authors``,
        ``summary``, ``link``, ``published`` and ``source``.

    Raises:
        Whatever ``_fetch_raw`` raises when the API cannot be reached.
    """
    key = (query, max_results)
    hit = _cache.get(key)
    if hit is not None and time.monotonic() - hit[0] < _CACHE_TTL:
        # Re-insert so this key becomes the most recently used one.
        _cache[key] = _cache.pop(key)
        # Hand out copies so callers cannot mutate the cached data.
        return [dict(p) for p in hit[1]]

    xml_text = await _fetch_raw(query, max_results)
    # feedparser is blocking; parse in a worker thread.
    feed = await asyncio.to_thread(feedparser.parse, xml_text)
    papers = [_entry_to_paper(ent) for ent in feed.entries]

    _cache.pop(key, None)
    _cache[key] = (time.monotonic(), papers)
    # dicts preserve insertion order, so the first key is the LRU entry.
    while len(_cache) > _CACHE_MAX:
        del _cache[next(iter(_cache))]
    return [dict(p) for p in papers]


# Keep the ``cache_clear`` hook that the former ``lru_cache`` wrapper exposed.
fetch_arxiv.cache_clear = _cache.clear  # type: ignore[attr-defined]
# ──────────────────────────────────────────────────────────────────────
# CLI demo
# ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":

    async def _demo():
        """Run a small sample search and print the title of each hit."""
        hits = await fetch_arxiv("glioblastoma CRISPR", max_results=3)
        titles = [hit["title"] for hit in hits]
        for title in titles:
            print(title)

    asyncio.run(_demo())