MCP_Res / mcp /arxiv.py
mgbam's picture
Update mcp/arxiv.py
aae312e verified
raw
history blame
3.08 kB
#!/usr/bin/env python3
"""MedGenesis – arXiv async fetcher (Atom API).
Improvements over the legacy helper
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* Uses **httpx.AsyncClient** with 10‑second timeout & *exponential back‑off retry*.
* Caches results in memory (key = query+max_results).
* Parses feed with **feedparser** inside a thread to avoid blocking.
* Normalises output to match `schemas.Paper`.
API docs: https://arxiv.org/help/api/user-manual
"""
from __future__ import annotations
import asyncio, feedparser
from functools import lru_cache
from typing import List, Dict
from urllib.parse import quote_plus
import httpx
# arXiv query endpoint; the URL-encoded search string is appended directly
# after `search_query=`.
_BASE = "http://export.arxiv.org/api/query?search_query="
# Timeout (seconds) handed to httpx.AsyncClient.
_TIMEOUT = 10
# Upper bound that `_fetch_raw` clamps `max_results` to.
_MAX_RES = 25
# Default headers attached to the HTTP client used for arXiv requests.
_HEADERS = {"User-Agent": "MedGenesis/1.0 (https://huggingface.co/spaces)"}
# ---------------------------------------------------------------------
# Internal fetch w/ retry
# ---------------------------------------------------------------------
async def _fetch_raw(query: str, max_results: int, *, retries: int = 3) -> str:
    """Return Atom XML text from arXiv.

    Args:
        query: Free-text search string; URL-encoded before being sent.
        max_results: Requested number of entries, clamped to ``1.._MAX_RES``.
        retries: Total number of attempts before giving up.

    Returns:
        The response body of the first attempt that answers with HTTP 200.

    Raises:
        RuntimeError: If no attempt returned HTTP 200 (the message carries
            the last status code, or ``No response`` when nothing was
            received).
        httpx.RequestError: If the final attempt failed at the transport
            level (timeout, connection error, ...).
    """
    max_results = max(1, min(max_results, _MAX_RES))
    url = f"{_BASE}{quote_plus(query)}&max_results={max_results}"
    delay = 2
    last: httpx.Response | None = None

    # One client for all attempts so the connection pool is reused.
    async with httpx.AsyncClient(timeout=_TIMEOUT, headers=_HEADERS) as cli:
        for attempt in range(retries):
            if attempt:
                # Back off only *between* attempts; sleeping after the final
                # failure would just delay the error.
                await asyncio.sleep(delay)
                delay *= 2
            try:
                last = await cli.get(url)
            except httpx.RequestError:
                # Transient transport failures are retried like bad statuses;
                # the last one propagates unchanged so callers still see the
                # httpx exception type.
                if attempt == retries - 1:
                    raise
                continue
            if last.status_code == 200:
                return last.text

    status = last.status_code if last is not None else "No response"
    raise RuntimeError(f"arXiv API failed: {status}")
# ---------------------------------------------------------------------
# Cached fetch + parse
# ---------------------------------------------------------------------
# NOTE: `functools.lru_cache` must not wrap a coroutine function: it would
# cache the coroutine *object*, and awaiting it a second time raises
# "RuntimeError: cannot reuse already awaited coroutine". Parsed results are
# therefore cached explicitly below.
_ARXIV_CACHE_TTL = 6 * 60 * 60  # seconds a cached result stays valid
_ARXIV_CACHE_MAX = 256          # maximum number of cached queries
_ARXIV_CACHE: dict[tuple[str, int], tuple[float, list]] = {}


def _entry_to_paper(ent) -> Dict:
    """Convert one feedparser entry into a paper dict."""
    names = [getattr(a, "name", None) for a in getattr(ent, "authors", None) or []]
    # Fall back to "Unknown" when the entry lists no usable author name.
    authors = ", ".join(n for n in names if n) or "Unknown"
    return {
        "title": getattr(ent, "title", "[No title]"),
        "authors": authors,
        "summary": getattr(ent, "summary", ""),
        "link": getattr(ent, "link", ""),
        "published": getattr(ent, "published", ""),
        "source": "arXiv",
    }


async def fetch_arxiv(query: str, *, max_results: int = 5) -> List[Dict]:
    """Return list of arXiv paper dicts compatible with `schemas.Paper`.

    Results are cached in memory per ``(query, max_results)`` for
    ``_ARXIV_CACHE_TTL`` seconds; every call returns fresh copies, so callers
    may mutate what they receive.

    Args:
        query: Free-text search string.
        max_results: Number of entries to request.

    Returns:
        One dict per feed entry with the keys ``title``, ``authors``,
        ``summary``, ``link``, ``published`` and ``source``.

    Raises:
        Whatever `_fetch_raw` raises when the request fails.
    """
    import time  # function-scope import: `time` is not imported at file level

    key = (query, max_results)
    cached = _ARXIV_CACHE.get(key)
    if cached is not None and time.monotonic() - cached[0] < _ARXIV_CACHE_TTL:
        return [dict(paper) for paper in cached[1]]

    xml_text = await _fetch_raw(query, max_results)
    # feedparser is blocking; run in thread
    feed = await asyncio.to_thread(feedparser.parse, xml_text)
    results: List[Dict] = [_entry_to_paper(ent) for ent in feed.entries]

    # Re-insert so the key moves to the newest position, then drop the
    # oldest entries if the cache is full (dicts preserve insertion order).
    _ARXIV_CACHE.pop(key, None)
    while len(_ARXIV_CACHE) >= _ARXIV_CACHE_MAX:
        _ARXIV_CACHE.pop(next(iter(_ARXIV_CACHE)))
    _ARXIV_CACHE[key] = (time.monotonic(), results)
    return [dict(paper) for paper in results]


# Kept for callers that relied on the attribute the lru_cache wrapper exposed.
fetch_arxiv.cache_clear = _ARXIV_CACHE.clear
# ---------------------------------------------------------------------
# CLI demo
# ---------------------------------------------------------------------
if __name__ == "__main__":
    import asyncio
    import json

    async def _demo():
        """Fetch a few papers and print the start of the JSON dump."""
        hits = await fetch_arxiv("glioblastoma CRISPR", max_results=3)
        rendered = json.dumps(hits, indent=2)
        # Only the first 500 characters are shown.
        print(rendered[:500])

    asyncio.run(_demo())