Spaces:

mgbam
/

MCP_Res

Runtime error

App Files Files Community

MCP_Res / mcp /arxiv.py

mgbam

Update mcp/arxiv.py

aae312e verified 22 days ago

raw

history blame

3.08 kB

	#!/usr/bin/env python3
	"""MedGenesis – arXiv async fetcher (Atom API).

	Improvements over the legacy helper
	~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	* Uses httpx.AsyncClient with 10‑second timeout & exponential back‑off retry.
	* Caches results per (query, max_results) so repeated searches avoid extra API calls.
	* Parses feed with feedparser inside a thread to avoid blocking.
	* Normalises output to match `schemas.Paper`.

	API docs: https://arxiv.org/help/api/user-manual
	"""
	from __future__ import annotations

	import asyncio, feedparser
	from functools import lru_cache
	from typing import List, Dict
	from urllib.parse import quote_plus
	import httpx

	# arXiv query endpoint; the URL-encoded search string is appended directly.
	# NOTE(review): plain HTTP - httpx does not follow redirects unless asked to,
	# so confirm this endpoint still answers 200 without redirecting to HTTPS.
	_BASE = "http://export.arxiv.org/api/query?search_query="
	# Timeout handed to httpx.AsyncClient (10 seconds per the module docstring).
	_TIMEOUT = 10
	# Upper bound applied to the caller's max_results before the URL is built.
	_MAX_RES = 25
	# Headers sent with every request to identify this client.
	_HEADERS = {"User-Agent": "MedGenesis/1.0 (https://huggingface.co/spaces)"}

	# ---------------------------------------------------------------------
	# Internal fetch w/ retry
	# ---------------------------------------------------------------------
	async def _fetch_raw(query: str, max_results: int, *, retries: int = 3) -> str:
	    """Return Atom XML text from arXiv.

	    The request is attempted up to ``retries`` times with exponential
	    back-off (2 s, 4 s, ...) between attempts. Both non-200 responses and
	    transport-level failures (timeouts, connection errors) are retried.

	    Args:
	        query: Free-text search string; URL-encoded before being sent.
	        max_results: Requested number of entries, clamped to 1.._MAX_RES.
	        retries: Maximum number of attempts.

	    Returns:
	        The body of the first HTTP 200 response.

	    Raises:
	        RuntimeError: If no attempt produced an HTTP 200 response.
	        httpx.TransportError: If the final attempt fails at the transport
	            level (the original exception is re-raised unchanged).
	    """
	    max_results = max(1, min(max_results, _MAX_RES))
	    url = f"{_BASE}{quote_plus(query)}&max_results={max_results}"
	    delay = 2
	    last: httpx.Response | None = None
	    # One client for all attempts so connection setup is not repeated.
	    async with httpx.AsyncClient(timeout=_TIMEOUT, headers=_HEADERS) as cli:
	        for attempt in range(retries):
	            final_attempt = attempt == retries - 1
	            try:
	                last = await cli.get(url)
	            except httpx.TransportError:
	                # Transient network trouble is retried; only the last
	                # failure is allowed to propagate to the caller.
	                if final_attempt:
	                    raise
	            else:
	                if last.status_code == 200:
	                    return last.text
	            # Back off only when another attempt will actually follow.
	            if not final_attempt:
	                await asyncio.sleep(delay)
	                delay *= 2
	    status = last.status_code if last is not None else "No response"
	    raise RuntimeError(f"arXiv API failed: {status}")


	# ---------------------------------------------------------------------
	# Cached fetch + parse
	# ---------------------------------------------------------------------
	# Parsed results keyed by (query, max_results) -> (expiry timestamp, papers).
	# A plain dict is used instead of functools.lru_cache because lru_cache on a
	# coroutine function stores the coroutine object itself, and awaiting that
	# object a second time raises RuntimeError.
	_PAPER_CACHE = {}
	_PAPER_CACHE_TTL = 6 * 60 * 60  # seconds (6 h)
	_PAPER_CACHE_MAX = 256  # maximum number of cached queries


	async def fetch_arxiv(query: str, *, max_results: int = 5) -> List[Dict]:
	    """Return list of arXiv paper dicts compatible with `schemas.Paper`.

	    Results are cached in-process per ``(query, max_results)`` for six hours;
	    a cache hit performs no network request.

	    Args:
	        query: Free-text search string forwarded to the arXiv API.
	        max_results: Number of entries to request (keyword-only).

	    Returns:
	        A new list of dicts with the keys ``title``, ``authors``,
	        ``summary``, ``link``, ``published`` and ``source``. The list and
	        dicts are copies, so callers may mutate them freely.

	    Raises:
	        RuntimeError: Propagated from ``_fetch_raw`` when arXiv never
	            answers with HTTP 200.
	    """
	    import time  # local import: only this function needs a clock

	    key = (query, max_results)
	    cached = _PAPER_CACHE.get(key)
	    if cached is not None and cached[0] > time.monotonic():
	        # Hand out copies so callers cannot corrupt the cached entries.
	        return [dict(paper) for paper in cached[1]]

	    xml_text = await _fetch_raw(query, max_results)

	    # feedparser is blocking; run in thread
	    feed = await asyncio.to_thread(feedparser.parse, xml_text)

	    results: List[Dict] = []
	    for ent in feed.entries:
	        # Tolerate author records without a name, and fall back to "Unknown"
	        # when no usable name is present at all.
	        names = [getattr(a, "name", "") for a in getattr(ent, "authors", [])]
	        authors = ", ".join(n for n in names if n) or "Unknown"
	        results.append({
	            "title": getattr(ent, "title", "[No title]"),
	            "authors": authors,
	            "summary": getattr(ent, "summary", ""),
	            "link": getattr(ent, "link", ""),
	            "published": getattr(ent, "published", ""),
	            "source": "arXiv",
	        })

	    now = time.monotonic()
	    # Re-inserting moves a refreshed key to the end of the dict order.
	    _PAPER_CACHE.pop(key, None)
	    if len(_PAPER_CACHE) >= _PAPER_CACHE_MAX:
	        # Drop expired entries first ...
	        stale = [k for k, (expires, _) in _PAPER_CACHE.items() if expires <= now]
	        for k in stale:
	            del _PAPER_CACHE[k]
	    if len(_PAPER_CACHE) >= _PAPER_CACHE_MAX:
	        # ... and, if still full, the oldest inserted entry.
	        del _PAPER_CACHE[next(iter(_PAPER_CACHE))]
	    _PAPER_CACHE[key] = (now + _PAPER_CACHE_TTL, results)
	    return [dict(paper) for paper in results]


	# ---------------------------------------------------------------------
	# CLI demo
	# ---------------------------------------------------------------------
	if __name__ == "__main__":
	    import asyncio
	    import json

	    async def _show_sample() -> None:
	        """Fetch three demo papers and print the first 500 chars of their JSON."""
	        sample = await fetch_arxiv("glioblastoma CRISPR", max_results=3)
	        rendered = json.dumps(sample, indent=2)
	        print(rendered[:500])

	    asyncio.run(_show_sample())