mgbam committed · Commit aae312e · verified · 1 Parent(s): b506ef3

Update mcp/arxiv.py

Files changed (1):
  1. mcp/arxiv.py +72 -16
mcp/arxiv.py CHANGED
@@ -1,23 +1,79 @@
-# mcp/arxiv.py
+#!/usr/bin/env python3
+"""MedGenesis – arXiv async fetcher (Atom API).
 
-import feedparser
+Improvements over the legacy helper
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+* Uses **httpx.AsyncClient** with a 10-second timeout and exponential back-off retry.
+* Caches parsed results in-process, keyed by (query, max_results).
+* Parses the feed with **feedparser** in a worker thread so the event loop never blocks.
+* Normalises output to match `schemas.Paper`.
+
+API docs: https://arxiv.org/help/api/user-manual
+"""
+from __future__ import annotations
+
+import asyncio
+
+import feedparser
+import httpx
+from typing import Dict, List
 from urllib.parse import quote_plus
 
-ARXIV_BASE = "http://export.arxiv.org/api/query?search_query="
+_BASE = "http://export.arxiv.org/api/query?search_query="
+_TIMEOUT = 10
+_MAX_RES = 25
+_HEADERS = {"User-Agent": "MedGenesis/1.0 (https://huggingface.co/spaces)"}
 
-async def fetch_arxiv(query: str, max_results: int = 5):
-    """Fetch latest arXiv papers for the query."""
-    encoded_query = quote_plus(query)
-    search_url = f"{ARXIV_BASE}{encoded_query}&max_results={max_results}"
-    feed = feedparser.parse(search_url)
-    results = []
-    for entry in feed.entries:
+# ---------------------------------------------------------------------
+# Internal fetch w/ retry
+# ---------------------------------------------------------------------
+async def _fetch_raw(query: str, max_results: int, *, retries: int = 3) -> str:
+    """Return Atom XML text from arXiv."""
+    max_results = max(1, min(max_results, _MAX_RES))
+    url = f"{_BASE}{quote_plus(query)}&max_results={max_results}"
+    delay = 2
+    last: httpx.Response | None = None
+    for attempt in range(retries):
+        async with httpx.AsyncClient(timeout=_TIMEOUT, headers=_HEADERS) as cli:
+            last = await cli.get(url)
+            if last.status_code == 200:
+                return last.text
+        if attempt < retries - 1:  # back off between tries, not after the last one
+            await asyncio.sleep(delay)
+            delay *= 2
+    raise RuntimeError(f"arXiv API failed: {last.status_code if last else 'no response'}")
+
+
+# ---------------------------------------------------------------------
+# Cached fetch + parse
+# ---------------------------------------------------------------------
+# functools.lru_cache cannot wrap a coroutine function (it would cache the
+# coroutine object, which can be awaited only once), so a plain dict is used.
+_CACHE: Dict[tuple, List[Dict]] = {}
+
+
+async def fetch_arxiv(query: str, *, max_results: int = 5) -> List[Dict]:
+    """Return a list of arXiv paper dicts compatible with `schemas.Paper`."""
+    key = (query, max_results)
+    if key in _CACHE:
+        return _CACHE[key]
+    xml_text = await _fetch_raw(query, max_results)
+
+    # feedparser is blocking; run it in a worker thread
+    feed = await asyncio.to_thread(feedparser.parse, xml_text)
+
+    results: List[Dict] = []
+    for ent in feed.entries:
+        authors = ", ".join(a.name for a in getattr(ent, "authors", [])) or "Unknown"
         results.append({
-            "title": getattr(entry, "title", ""),
-            "authors": ", ".join([a.name for a in getattr(entry, "authors", [])]) if hasattr(entry, 'authors') else "",
-            "summary": getattr(entry, "summary", ""),
-            "link": getattr(entry, "link", ""),
-            "published": entry.get("published", "") if hasattr(entry, 'get') else getattr(entry, "published", ""),
-            "source": "arXiv"
+            "title":     getattr(ent, "title", "[No title]"),
+            "authors":   authors,
+            "summary":   getattr(ent, "summary", ""),
+            "link":      getattr(ent, "link", ""),
+            "published": getattr(ent, "published", ""),
+            "source":    "arXiv",
         })
+    _CACHE[key] = results
     return results
+
+
+# ---------------------------------------------------------------------
+# CLI demo
+# ---------------------------------------------------------------------
+if __name__ == "__main__":
+    import json
+
+    async def _demo():
+        papers = await fetch_arxiv("glioblastoma CRISPR", max_results=3)
+        print(json.dumps(papers, indent=2)[:500])
+
+    asyncio.run(_demo())
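
Note on `schemas.Paper`: the module this diff claims to normalise against is not part of this commit. As a rough sketch only, assuming the project defines it as a pydantic model, the fields implied by the dict keys in fetch_arxiv would look something like the following (names and types are inferred, not the repo's actual definition):

    # Hypothetical sketch of schemas.Paper, inferred from the dict keys
    # produced by fetch_arxiv(); the real MedGenesis model may differ.
    from pydantic import BaseModel

    class Paper(BaseModel):
        title: str
        authors: str      # comma-separated names, as fetch_arxiv() builds them
        summary: str
        link: str
        published: str    # raw Atom timestamp string from the feed
        source: str = "arXiv"

    # Each dict returned by fetch_arxiv() should then validate directly:
    #   paper = Paper(**results[0])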
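Because fetch_arxiv is a coroutine, callers can overlap several queries on one event loop; that is the main benefit of replacing the blocking feedparser.parse(url) call with httpx.AsyncClient. A minimal usage sketch, assuming the module is importable as `mcp.arxiv` (the query strings are illustrative):

    import asyncio

    from mcp.arxiv import fetch_arxiv

    async def main() -> None:
        queries = ["glioblastoma CRISPR", "long COVID biomarkers"]
        # Fan the requests out concurrently instead of awaiting them one by one.
        batches = await asyncio.gather(*(fetch_arxiv(q, max_results=3) for q in queries))
        for query, papers in zip(queries, batches):
            print(f"{query}: {len(papers)} papers")

    if __name__ == "__main__":
        asyncio.run(main())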