Spaces:

mgbam
/

MCP_Res

Running

App Files Files Community

MCP_Res / mcp /arxiv.py

mgbam

Update mcp/arxiv.py

f62a8d2 verified 4 days ago

raw

history blame contribute delete

3.89 kB

	#!/usr/bin/env python3
	"""
	MedGenesis – arXiv async fetcher (Atom API).

	* Uses HTTPS (`https://export.arxiv.org/...`) to avoid HTTP 301 redirects.
	* Async httpx fetch with 2×/4× exponential-back-off retry.
	* Parses the Atom feed with feedparser inside a thread (non-blocking).
	* 6-hour LRU cache keyed by “query+max_results”.
	* Returns a list of dicts matching schemas.Paper.

	API docs: https://arxiv.org/help/api/user-manual
	"""
	from __future__ import annotations

	import asyncio
	import time
	from functools import lru_cache
	from typing import List, Dict
	from urllib.parse import quote_plus

	import feedparser
	import httpx

	# Query endpoint prefix; the URL-encoded search expression is appended directly.
	_BASE = "https://export.arxiv.org/api/query?search_query="
	# Passed as `timeout=` to httpx.AsyncClient (httpx interprets this as seconds).
	_TIMEOUT = 10
	# Upper bound that _fetch_raw clamps `max_results` to.
	_MAX_RES = 25
	# Default headers attached to every request made by _fetch_raw.
	_HEADERS = {"User-Agent": "MedGenesis/1.0 (https://huggingface.co/spaces)"}


	# ──────────────────────────────────────────────────────────────────────
	# Internal fetch helper with retry
	# ──────────────────────────────────────────────────────────────────────
	async def _fetch_raw(query: str, max_results: int, *, retries: int = 3) -> str:
	    """Return raw Atom XML from arXiv, retrying with exponential back-off.

	    Args:
	        query: Free-text search expression; it is URL-encoded before sending.
	        max_results: Requested number of entries, clamped to ``1.._MAX_RES``.
	        retries: Maximum number of attempts. Waits of 2 s, 4 s, ... are
	            inserted *between* attempts only.

	    Returns:
	        The response body of the first attempt that answers HTTP 200.

	    Raises:
	        RuntimeError: If no attempt returned HTTP 200.
	        httpx.TransportError: If the final attempt fails at the transport
	            level (timeout, connection error, ...).
	    """
	    max_results = max(1, min(max_results, _MAX_RES))
	    url = f"{_BASE}{quote_plus(query)}&max_results={max_results}"
	    delay = 2
	    last: httpx.Response | None = None
	    # One client for all attempts so the connection pool is reused.
	    async with httpx.AsyncClient(timeout=_TIMEOUT, headers=_HEADERS) as cli:
	        for attempt in range(retries):
	            is_final = attempt == retries - 1
	            try:
	                last = await cli.get(url)
	            except httpx.TransportError:
	                # Transient network failures are retried like non-200
	                # answers; the last one propagates unchanged.
	                if is_final:
	                    raise
	            else:
	                if last.status_code == 200:
	                    return last.text
	            # Sleeping after the final attempt would only delay the error.
	            if not is_final:
	                await asyncio.sleep(delay)
	                delay *= 2
	    raise RuntimeError(f"arXiv API failed: {last.status_code if last else 'No response'}")


	# ──────────────────────────────────────────────────────────────────────
	# Public cached fetch + parse
	# ──────────────────────────────────────────────────────────────────────
	# functools.lru_cache must not wrap a coroutine function: it would cache the
	# coroutine *object*, and a second call with the same arguments would fail
	# with "cannot reuse already awaited coroutine". Parsed results are cached
	# explicitly instead, with a time-to-live and an LRU size bound.
	_CACHE_TTL = 6 * 60 * 60  # seconds (six hours)
	_CACHE_MAX = 256  # maximum number of cached (query, max_results) keys
	# (query, max_results) -> (time.monotonic() at fetch time, parsed papers)
	_cache: Dict[tuple, tuple] = {}


	def _entry_to_paper(ent) -> Dict:
	    """Convert one parsed feed entry into a paper dict."""
	    names = [getattr(a, "name", "") for a in getattr(ent, "authors", None) or []]
	    return {
	        "title": getattr(ent, "title", "[No title]"),
	        # Missing, empty or nameless author lists all fall back to "Unknown".
	        "authors": ", ".join(n for n in names if n) or "Unknown",
	        "summary": getattr(ent, "summary", ""),
	        "link": getattr(ent, "link", ""),
	        "published": getattr(ent, "published", ""),
	        "source": "arXiv",
	    }


	async def fetch_arxiv(query: str, *, max_results: int = 5) -> List[Dict]:
	    """Return arXiv paper dicts compatible with schemas.Paper.

	    Results are cached per ``(query, max_results)`` for ``_CACHE_TTL``
	    seconds; at most ``_CACHE_MAX`` keys are kept, evicting the least
	    recently used first.

	    Args:
	        query: Free-text arXiv search expression.
	        max_results: Number of entries to request.

	    Returns:
	        A list of dicts with the keys ``title``, ``authors``, ``summary``,
	        ``link``, ``published`` and ``source``. Each call returns fresh
	        copies, so callers may mutate the result without touching the cache.

	    Raises:
	        Whatever ``_fetch_raw`` raises when the request fails.
	    """
	    key = (query, max_results)
	    hit = _cache.get(key)
	    if hit is not None and time.monotonic() - hit[0] < _CACHE_TTL:
	        # Re-insert so this key becomes the most recently used one.
	        _cache[key] = _cache.pop(key)
	        return [dict(p) for p in hit[1]]

	    xml_text = await _fetch_raw(query, max_results)

	    # feedparser is blocking; parse in a worker thread.
	    feed = await asyncio.to_thread(feedparser.parse, xml_text)
	    papers = [_entry_to_paper(ent) for ent in feed.entries]

	    _cache.pop(key, None)  # drop any expired entry so the key moves to the end
	    _cache[key] = (time.monotonic(), papers)
	    while len(_cache) > _CACHE_MAX:
	        # dicts keep insertion order, so the first key is the oldest.
	        del _cache[next(iter(_cache))]
	    return [dict(p) for p in papers]


	# Keep the cache_clear() hook that the lru_cache wrapper used to expose.
	fetch_arxiv.cache_clear = _cache.clear  # type: ignore[attr-defined]


	# ──────────────────────────────────────────────────────────────────────
	# CLI demo
	# ──────────────────────────────────────────────────────────────────────
	if __name__ == "__main__":
	    async def _demo():
	        """Fetch three sample papers and print one title per line."""
	        results = await fetch_arxiv("glioblastoma CRISPR", max_results=3)
	        for title in (paper["title"] for paper in results):
	            print(title)

	    asyncio.run(_demo())