mgbam committed on
Commit
169858b
·
verified ·
1 Parent(s): b90d413

Update mcp/arxiv.py

Browse files
Files changed (1) hide show
  1. mcp/arxiv.py +17 -81
mcp/arxiv.py CHANGED
@@ -1,84 +1,20 @@
1
- #!/usr/bin/env python3
2
- """
3
- MedGenesis – arXiv async fetcher (Atom API).
4
-
5
- * Uses HTTPS (`https://export.arxiv.org/...`) to avoid HTTP 301 redirects.
6
- * Async httpx fetch with 2×/4× exponential-back-off retry.
7
- * Parses the Atom feed with feedparser inside a thread (non-blocking).
8
- * 6-hour LRU cache keyed by “query+max_results”.
9
- * Returns a list of dicts matching schemas.Paper.
10
-
11
- API docs: https://arxiv.org/help/api/user-manual
12
- """
13
- from __future__ import annotations
14
-
15
- import asyncio
16
- from functools import lru_cache
17
- from typing import List, Dict
18
- from urllib.parse import quote_plus
19
-
20
  import feedparser
21
- import httpx
22
-
23
- _BASE = "https://export.arxiv.org/api/query?search_query="
24
- _TIMEOUT = 10
25
- _MAX_RES = 25
26
- _HEADERS = {"User-Agent": "MedGenesis/1.0 (https://huggingface.co/spaces)"}
27
-
28
-
29
- # ──────────────────────────────────────────────────────────────────────
30
- # Internal fetch helper with retry
31
- # ──────────────────────────────────────────────────────────────────────
32
- async def _fetch_raw(query: str, max_results: int, *, retries: int = 3) -> str:
33
- """Return raw Atom XML from arXiv."""
34
- max_results = max(1, min(max_results, _MAX_RES))
35
- url = f"{_BASE}{quote_plus(query)}&max_results={max_results}"
36
- delay = 2
37
- last: httpx.Response | None = None
38
- for _ in range(retries):
39
- async with httpx.AsyncClient(timeout=_TIMEOUT, headers=_HEADERS) as cli:
40
- last = await cli.get(url)
41
- if last.status_code == 200:
42
- return last.text
43
- await asyncio.sleep(delay)
44
- delay *= 2
45
- raise RuntimeError(f"arXiv API failed: {last.status_code if last else 'No response'}")
46
-
47
-
48
- # ──────────────────────────────────────────────────────────────────────
49
- # Public cached fetch + parse
50
- # ──────────────────────────────────────────────────────────────────────
51
- @lru_cache(maxsize=256)
52
- async def fetch_arxiv(query: str, *, max_results: int = 5) -> List[Dict]:
53
- """Return arXiv paper dicts compatible with schemas.Paper."""
54
- xml_text = await _fetch_raw(query, max_results)
55
-
56
- # feedparser is blocking; parse in thread
57
- feed = await asyncio.to_thread(feedparser.parse, xml_text)
58
 
59
- papers: List[Dict] = []
60
- for ent in feed.entries:
61
- authors = (
62
- ", ".join(a.name for a in getattr(ent, "authors", []))
63
- if hasattr(ent, "authors") else "Unknown"
64
- )
65
- papers.append({
66
- "title" : getattr(ent, "title", "[No title]"),
67
- "authors" : authors,
68
- "summary" : getattr(ent, "summary", ""),
69
- "link" : getattr(ent, "link", ""),
70
- "published": getattr(ent, "published", ""),
71
- "source" : "arXiv",
 
72
  })
73
- return papers
74
-
75
-
76
- # ──────────────────────────────────────────────────────────────────────
77
- # CLI demo
78
- # ──────────────────────────────────────────────────────────────────────
79
- if __name__ == "__main__":
80
- async def _demo():
81
- papers = await fetch_arxiv("glioblastoma CRISPR", max_results=3)
82
- for p in papers:
83
- print(p["title"])
84
- asyncio.run(_demo())
 
1
+ # mcp/arxiv.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import feedparser
3
+ from urllib.parse import quote_plus
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
+ ARXIV_BASE = "http://export.arxiv.org/api/query?search_query="
6
+
7
+ async def fetch_arxiv(query: str, max_results: int = 5) -> list[dict]:
8
+ url = f"{ARXIV_BASE}{quote_plus(query)}&max_results={max_results}"
9
+ feed = feedparser.parse(url)
10
+ out = []
11
+ for e in feed.entries:
12
+ out.append({
13
+ "title": e.get("title",""),
14
+ "authors": ", ".join([a.name for a in getattr(e,"authors",[])]),
15
+ "summary": e.get("summary",""),
16
+ "link": e.get("link",""),
17
+ "published": getattr(e,"published",""),
18
+ "source": "arXiv"
19
  })
20
+ return out