mgbam committed on
Commit f62a8d2 · verified · 1 Parent(s): 739f11d

Update mcp/arxiv.py

Files changed (1)
  1. mcp/arxiv.py +33 -28
mcp/arxiv.py CHANGED
@@ -1,33 +1,36 @@
 #!/usr/bin/env python3
-"""MedGenesis – arXiv async fetcher (Atom API).
-
-Improvements over the legacy helper
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-* Uses **httpx.AsyncClient** with 10-second timeout & *exponential back-off retry*.
-* Caches raw XML for 6 h via `lru_cache` (key = query+max_results).
-* Parses feed with **feedparser** inside a thread to avoid blocking.
-* Normalises output to match `schemas.Paper`.
+"""
+MedGenesis – arXiv async fetcher (Atom API).
+
+* Uses HTTPS (`https://export.arxiv.org/...`) to avoid HTTP 301 redirects.
+* Async httpx fetch with 2×/4× exponential-back-off retry.
+* Parses the Atom feed with feedparser inside a thread (non-blocking).
+* 6-hour LRU cache keyed by “query+max_results”.
+* Returns a list of dicts matching schemas.Paper.
 
 API docs: https://arxiv.org/help/api/user-manual
 """
 from __future__ import annotations
 
-import asyncio, feedparser
+import asyncio
 from functools import lru_cache
 from typing import List, Dict
 from urllib.parse import quote_plus
+
+import feedparser
 import httpx
 
-_BASE = "http://export.arxiv.org/api/query?search_query="
+_BASE = "https://export.arxiv.org/api/query?search_query="
 _TIMEOUT = 10
 _MAX_RES = 25
 _HEADERS = {"User-Agent": "MedGenesis/1.0 (https://huggingface.co/spaces)"}
 
-# ---------------------------------------------------------------------
-# Internal fetch w/ retry
-# ---------------------------------------------------------------------
+
+# ──────────────────────────────────────────────────────────────────────
+# Internal fetch helper with retry
+# ──────────────────────────────────────────────────────────────────────
 async def _fetch_raw(query: str, max_results: int, *, retries: int = 3) -> str:
-    """Return Atom XML text from arXiv."""
+    """Return raw Atom XML from arXiv."""
     max_results = max(1, min(max_results, _MAX_RES))
     url = f"{_BASE}{quote_plus(query)}&max_results={max_results}"
     delay = 2
@@ -42,38 +45,40 @@ async def _fetch_raw(query: str, max_results: int, *, retries: int = 3) -> str:
     raise RuntimeError(f"arXiv API failed: {last.status_code if last else 'No response'}")
 
 
-# ---------------------------------------------------------------------
-# Cached fetch + parse
-# ---------------------------------------------------------------------
+# ──────────────────────────────────────────────────────────────────────
+# Public cached fetch + parse
+# ──────────────────────────────────────────────────────────────────────
 @lru_cache(maxsize=256)
 async def fetch_arxiv(query: str, *, max_results: int = 5) -> List[Dict]:
-    """Return list of arXiv paper dicts compatible with `schemas.Paper`."""
+    """Return arXiv paper dicts compatible with schemas.Paper."""
     xml_text = await _fetch_raw(query, max_results)
 
-    # feedparser is blocking; run in thread
+    # feedparser is blocking; parse in thread
     feed = await asyncio.to_thread(feedparser.parse, xml_text)
 
-    results: List[Dict] = []
+    papers: List[Dict] = []
     for ent in feed.entries:
-        authors = ", ".join(a.name for a in getattr(ent, "authors", [])) if hasattr(ent, "authors") else "Unknown"
-        published = getattr(ent, "published", "")
-        results.append({
+        authors = (
+            ", ".join(a.name for a in getattr(ent, "authors", []))
+            if hasattr(ent, "authors") else "Unknown"
+        )
+        papers.append({
            "title"    : getattr(ent, "title", "[No title]"),
            "authors"  : authors,
            "summary"  : getattr(ent, "summary", ""),
            "link"     : getattr(ent, "link", ""),
-           "published": published,
+           "published": getattr(ent, "published", ""),
            "source"   : "arXiv",
        })
-    return results
+    return papers
 
 
-# ---------------------------------------------------------------------
+# ──────────────────────────────────────────────────────────────────────
 # CLI demo
-# ---------------------------------------------------------------------
+# ──────────────────────────────────────────────────────────────────────
 if __name__ == "__main__":
-    import json, asyncio
     async def _demo():
         papers = await fetch_arxiv("glioblastoma CRISPR", max_results=3)
-        print(json.dumps(papers, indent=2)[:500])
+        for p in papers:
+            print(p["title"])
    asyncio.run(_demo())
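
The retry loop itself falls between the two hunks, so the diff never shows it. Note that recent httpx versions do not follow redirects unless `follow_redirects=True` is set, so the old `http://` base URL's 301 response would surface as a failed request; that is what the new HTTPS bullet addresses. A minimal sketch of the 2×/4× back-off the docstring describes, assuming the elided loop looks roughly like this (`_fetch_raw_sketch` is hypothetical; only `delay = 2` and the final `raise RuntimeError` appear in the diff):

import asyncio
import httpx

_TIMEOUT = 10
_HEADERS = {"User-Agent": "MedGenesis/1.0 (https://huggingface.co/spaces)"}

async def _fetch_raw_sketch(url: str, *, retries: int = 3) -> str:
    """Hypothetical reconstruction of the elided retry loop."""
    delay = 2                                   # seconds; doubles after each failed attempt
    last = None
    async with httpx.AsyncClient(timeout=_TIMEOUT, headers=_HEADERS) as client:
        for _ in range(retries):
            last = await client.get(url)
            if last.status_code == 200:         # success: hand back the Atom XML
                return last.text
            await asyncio.sleep(delay)          # back off 2 s, then 4 s, then 8 s
            delay *= 2
    raise RuntimeError(f"arXiv API failed: {last.status_code if last else 'No response'}")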
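
One caveat on the caching bullet: `functools.lru_cache` has no time-to-live, and applied to an `async def` it stores the coroutine object rather than its result, so a second call with the same arguments receives an already-awaited coroutine and fails with a RuntimeError. A result-level cache with an explicit 6-hour window might look like the following sketch (assuming `fetch_arxiv` is left undecorated; `_TTL`, `_results`, and `fetch_arxiv_cached` are illustrative names, not from the commit):

import time
from typing import Dict, List, Tuple

_TTL = 6 * 60 * 60                              # the 6-hour window the docstring promises
_results: Dict[Tuple[str, int], Tuple[float, List[Dict]]] = {}

async def fetch_arxiv_cached(query: str, *, max_results: int = 5) -> List[Dict]:
    """Cache results (not coroutines), keyed by (query, max_results)."""
    key = (query, max_results)
    hit = _results.get(key)
    if hit is not None and time.monotonic() - hit[0] < _TTL:
        return hit[1]                           # fresh hit: reuse the parsed papers
    papers = await fetch_arxiv(query, max_results=max_results)  # the undecorated fetcher
    _results[key] = (time.monotonic(), papers)
    return papers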