mgbam committed on
Commit f62a8d2 · verified · 1 Parent(s): 739f11d

Update mcp/arxiv.py

Files changed (1)
  1. mcp/arxiv.py +33 -28
mcp/arxiv.py CHANGED
@@ -1,33 +1,36 @@
 #!/usr/bin/env python3
-"""MedGenesis – arXiv async fetcher (Atom API).
-
-Improvements over the legacy helper
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-* Uses **httpx.AsyncClient** with 10-second timeout & *exponential back-off retry*.
-* Caches raw XML for 6 h via `lru_cache` (key = query+max_results).
-* Parses feed with **feedparser** inside a thread to avoid blocking.
-* Normalises output to match `schemas.Paper`.
+"""
+MedGenesis – arXiv async fetcher (Atom API).
+
+* Uses HTTPS (`https://export.arxiv.org/...`) to avoid HTTP 301 redirects.
+* Async httpx fetch with 2×/4× exponential-back-off retry.
+* Parses the Atom feed with feedparser inside a thread (non-blocking).
+* 6-hour LRU cache keyed by “query+max_results”.
+* Returns a list of dicts matching schemas.Paper.
 
 API docs: https://arxiv.org/help/api/user-manual
 """
 from __future__ import annotations
 
-import asyncio, feedparser
+import asyncio
 from functools import lru_cache
 from typing import List, Dict
 from urllib.parse import quote_plus
+
+import feedparser
 import httpx
 
-_BASE = "http://export.arxiv.org/api/query?search_query="
+_BASE = "https://export.arxiv.org/api/query?search_query="
 _TIMEOUT = 10
 _MAX_RES = 25
 _HEADERS = {"User-Agent": "MedGenesis/1.0 (https://huggingface.co/spaces)"}
 
-# ---------------------------------------------------------------------
-# Internal fetch w/ retry
-# ---------------------------------------------------------------------
+
+# ──────────────────────────────────────────────────────────────────────
+# Internal fetch helper with retry
+# ──────────────────────────────────────────────────────────────────────
 async def _fetch_raw(query: str, max_results: int, *, retries: int = 3) -> str:
-    """Return Atom XML text from arXiv."""
+    """Return raw Atom XML from arXiv."""
     max_results = max(1, min(max_results, _MAX_RES))
     url = f"{_BASE}{quote_plus(query)}&max_results={max_results}"
     delay = 2
@@ -42,38 +45,40 @@ async def _fetch_raw(query: str, max_results: int, *, retries: int = 3) -> str:
     raise RuntimeError(f"arXiv API failed: {last.status_code if last else 'No response'}")
 
 
-# ---------------------------------------------------------------------
-# Cached fetch + parse
-# ---------------------------------------------------------------------
+# ──────────────────────────────────────────────────────────────────────
+# Public cached fetch + parse
+# ──────────────────────────────────────────────────────────────────────
 @lru_cache(maxsize=256)
 async def fetch_arxiv(query: str, *, max_results: int = 5) -> List[Dict]:
-    """Return list of arXiv paper dicts compatible with `schemas.Paper`."""
+    """Return arXiv paper dicts compatible with schemas.Paper."""
     xml_text = await _fetch_raw(query, max_results)
 
-    # feedparser is blocking; run in thread
+    # feedparser is blocking; parse in thread
     feed = await asyncio.to_thread(feedparser.parse, xml_text)
 
-    results: List[Dict] = []
+    papers: List[Dict] = []
     for ent in feed.entries:
-        authors = ", ".join(a.name for a in getattr(ent, "authors", [])) if hasattr(ent, "authors") else "Unknown"
-        published = getattr(ent, "published", "")
-        results.append({
+        authors = (
+            ", ".join(a.name for a in getattr(ent, "authors", []))
+            if hasattr(ent, "authors") else "Unknown"
+        )
+        papers.append({
            "title"    : getattr(ent, "title", "[No title]"),
            "authors"  : authors,
            "summary"  : getattr(ent, "summary", ""),
            "link"     : getattr(ent, "link", ""),
-           "published": published,
+           "published": getattr(ent, "published", ""),
            "source"   : "arXiv",
        })
-    return results
+    return papers
 
 
-# ---------------------------------------------------------------------
+# ──────────────────────────────────────────────────────────────────────
 # CLI demo
-# ---------------------------------------------------------------------
+# ──────────────────────────────────────────────────────────────────────
 if __name__ == "__main__":
-    import json, asyncio
     async def _demo():
         papers = await fetch_arxiv("glioblastoma CRISPR", max_results=3)
-        print(json.dumps(papers, indent=2)[:500])
+        for p in papers:
+            print(p["title"])
    asyncio.run(_demo())
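
The retry loop itself falls between the two hunks, so the diff never shows it. Note that recent httpx versions do not follow redirects unless `follow_redirects=True` is set, so the old `http://` base URL's 301 response would surface as a failed request; that is what the new HTTPS bullet addresses. A minimal sketch of the 2×/4× back-off the docstring describes, assuming the elided loop looks roughly like this (`_fetch_raw_sketch` is hypothetical; only `delay = 2` and the final `raise RuntimeError` appear in the diff):

import asyncio
import httpx

_TIMEOUT = 10
_HEADERS = {"User-Agent": "MedGenesis/1.0 (https://huggingface.co/spaces)"}

async def _fetch_raw_sketch(url: str, *, retries: int = 3) -> str:
    """Hypothetical reconstruction of the elided retry loop."""
    delay = 2                                   # seconds; doubles after each failed attempt
    last = None
    async with httpx.AsyncClient(timeout=_TIMEOUT, headers=_HEADERS) as client:
        for _ in range(retries):
            last = await client.get(url)
            if last.status_code == 200:         # success: hand back the Atom XML
                return last.text
            await asyncio.sleep(delay)          # back off 2 s, then 4 s, then 8 s
            delay *= 2
    raise RuntimeError(f"arXiv API failed: {last.status_code if last else 'No response'}")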
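
One caveat on the caching bullet: `functools.lru_cache` has no time-to-live, and applied to an `async def` it stores the coroutine object rather than its result, so a second call with the same arguments receives an already-awaited coroutine and fails with a RuntimeError. A result-level cache with an explicit 6-hour window might look like the following sketch (assuming `fetch_arxiv` is left undecorated; `_TTL`, `_results`, and `fetch_arxiv_cached` are illustrative names, not from the commit):

import time
from typing import Dict, List, Tuple

_TTL = 6 * 60 * 60                              # the 6-hour window the docstring promises
_results: Dict[Tuple[str, int], Tuple[float, List[Dict]]] = {}

async def fetch_arxiv_cached(query: str, *, max_results: int = 5) -> List[Dict]:
    """Cache results (not coroutines), keyed by (query, max_results)."""
    key = (query, max_results)
    hit = _results.get(key)
    if hit is not None and time.monotonic() - hit[0] < _TTL:
        return hit[1]                           # fresh hit: reuse the parsed papers
    papers = await fetch_arxiv(query, max_results=max_results)  # the undecorated fetcher
    _results[key] = (time.monotonic(), papers)
    return papers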