Update mcp/arxiv.py
Browse files- mcp/arxiv.py +33 -28
    	
        mcp/arxiv.py
    CHANGED
    
    | @@ -1,33 +1,36 @@ | |
| 1 | 
             
            #!/usr/bin/env python3
         | 
| 2 | 
            -
            """ | 
|  | |
| 3 |  | 
| 4 | 
            -
             | 
| 5 | 
            -
             | 
| 6 | 
            -
            *  | 
| 7 | 
            -
            *  | 
| 8 | 
            -
            *  | 
| 9 | 
            -
            * Normalises output to match `schemas.Paper`.
         | 
| 10 |  | 
| 11 | 
             
            API docs: https://arxiv.org/help/api/user-manual
         | 
| 12 | 
             
            """
         | 
| 13 | 
             
            from __future__ import annotations
         | 
| 14 |  | 
| 15 | 
            -
            import asyncio | 
| 16 | 
             
            from functools import lru_cache
         | 
| 17 | 
             
            from typing import List, Dict
         | 
| 18 | 
             
            from urllib.parse import quote_plus
         | 
|  | |
|  | |
| 19 | 
             
            import httpx
         | 
| 20 |  | 
| 21 | 
            -
            _BASE | 
| 22 | 
             
            _TIMEOUT = 10
         | 
| 23 | 
             
            _MAX_RES = 25
         | 
| 24 | 
             
            _HEADERS = {"User-Agent": "MedGenesis/1.0 (https://huggingface.co/spaces)"}
         | 
| 25 |  | 
| 26 | 
            -
             | 
| 27 | 
            -
            #  | 
| 28 | 
            -
            #  | 
|  | |
| 29 | 
             
            async def _fetch_raw(query: str, max_results: int, *, retries: int = 3) -> str:
         | 
| 30 | 
            -
                """Return Atom XML  | 
| 31 | 
             
                max_results = max(1, min(max_results, _MAX_RES))
         | 
| 32 | 
             
                url = f"{_BASE}{quote_plus(query)}&max_results={max_results}"
         | 
| 33 | 
             
                delay = 2
         | 
| @@ -42,38 +45,40 @@ async def _fetch_raw(query: str, max_results: int, *, retries: int = 3) -> str: | |
| 42 | 
             
                raise RuntimeError(f"arXiv API failed: {last.status_code if last else 'No response'}")
         | 
| 43 |  | 
| 44 |  | 
| 45 | 
            -
            #  | 
| 46 | 
            -
            #  | 
| 47 | 
            -
            #  | 
| 48 | 
             
            @lru_cache(maxsize=256)
         | 
| 49 | 
             
            async def fetch_arxiv(query: str, *, max_results: int = 5) -> List[Dict]:
         | 
| 50 | 
            -
                """Return  | 
| 51 | 
             
                xml_text = await _fetch_raw(query, max_results)
         | 
| 52 |  | 
| 53 | 
            -
                # feedparser is blocking;  | 
| 54 | 
             
                feed = await asyncio.to_thread(feedparser.parse, xml_text)
         | 
| 55 |  | 
| 56 | 
            -
                 | 
| 57 | 
             
                for ent in feed.entries:
         | 
| 58 | 
            -
                    authors =  | 
| 59 | 
            -
             | 
| 60 | 
            -
             | 
|  | |
|  | |
| 61 | 
             
                        "title"    : getattr(ent, "title", "[No title]"),
         | 
| 62 | 
             
                        "authors"  : authors,
         | 
| 63 | 
             
                        "summary"  : getattr(ent, "summary", ""),
         | 
| 64 | 
             
                        "link"     : getattr(ent, "link", ""),
         | 
| 65 | 
            -
                        "published": published,
         | 
| 66 | 
             
                        "source"   : "arXiv",
         | 
| 67 | 
             
                    })
         | 
| 68 | 
            -
                return  | 
| 69 |  | 
| 70 |  | 
| 71 | 
            -
            #  | 
| 72 | 
             
            # CLI demo
         | 
| 73 | 
            -
            #  | 
| 74 | 
             
            if __name__ == "__main__":
         | 
| 75 | 
            -
                import json, asyncio
         | 
| 76 | 
             
                async def _demo():
         | 
| 77 | 
             
                    papers = await fetch_arxiv("glioblastoma CRISPR", max_results=3)
         | 
| 78 | 
            -
                     | 
|  | |
| 79 | 
             
                asyncio.run(_demo())
         | 
|  | |
| 1 | 
             
            #!/usr/bin/env python3
         | 
| 2 | 
            +
            """
         | 
| 3 | 
            +
            MedGenesis – arXiv async fetcher (Atom API).
         | 
| 4 |  | 
| 5 | 
            +
            * Uses HTTPS (`https://export.arxiv.org/...`) to avoid HTTP 301 redirects.
         | 
| 6 | 
            +
            * Async httpx fetch with 2×/4× exponential-back-off retry.
         | 
| 7 | 
            +
            * Parses the Atom feed with feedparser inside a thread (non-blocking).
         | 
| 8 | 
            +
            * 6-hour LRU cache keyed by "query+max_results".
         | 
| 9 | 
            +
            * Returns a list of dicts matching schemas.Paper.
         | 
|  | |
| 10 |  | 
| 11 | 
             
            API docs: https://arxiv.org/help/api/user-manual
         | 
| 12 | 
             
            """
         | 
| 13 | 
             
            from __future__ import annotations
         | 
| 14 |  | 
| 15 | 
            +
            import asyncio
         | 
| 16 | 
             
            from functools import lru_cache
         | 
| 17 | 
             
            from typing import List, Dict
         | 
| 18 | 
             
            from urllib.parse import quote_plus
         | 
| 19 | 
            +
             | 
| 20 | 
            +
            import feedparser
         | 
| 21 | 
             
            import httpx
         | 
| 22 |  | 
| 23 | 
            +
            _BASE   = "https://export.arxiv.org/api/query?search_query="
         | 
| 24 | 
             
            _TIMEOUT = 10
         | 
| 25 | 
             
            _MAX_RES = 25
         | 
| 26 | 
             
            _HEADERS = {"User-Agent": "MedGenesis/1.0 (https://huggingface.co/spaces)"}
         | 
| 27 |  | 
| 28 | 
            +
             | 
| 29 | 
            +
            # ──────────────────────────────────────────────────────────────────────
         | 
| 30 | 
            +
            # Internal fetch helper with retry
         | 
| 31 | 
            +
            # ──────────────────────────────────────────────────────────────────────
         | 
| 32 | 
             
            async def _fetch_raw(query: str, max_results: int, *, retries: int = 3) -> str:
         | 
| 33 | 
            +
                """Return raw Atom XML from arXiv."""
         | 
| 34 | 
             
                max_results = max(1, min(max_results, _MAX_RES))
         | 
| 35 | 
             
                url = f"{_BASE}{quote_plus(query)}&max_results={max_results}"
         | 
| 36 | 
             
                delay = 2
         | 
|  | |
| 45 | 
             
                raise RuntimeError(f"arXiv API failed: {last.status_code if last else 'No response'}")
         | 
| 46 |  | 
| 47 |  | 
| 48 | 
            +
            # ──────────────────────────────────────────────────────────────────────
         | 
| 49 | 
            +
            # Public cached fetch + parse
         | 
| 50 | 
            +
            # ──────────────────────────────────────────────────────────────────────
         | 
| 51 | 
             
            @lru_cache(maxsize=256)
         | 
| 52 | 
             
            async def fetch_arxiv(query: str, *, max_results: int = 5) -> List[Dict]:
         | 
| 53 | 
            +
                """Return arXiv paper dicts compatible with schemas.Paper."""
         | 
| 54 | 
             
                xml_text = await _fetch_raw(query, max_results)
         | 
| 55 |  | 
| 56 | 
            +
                # feedparser is blocking; parse in thread
         | 
| 57 | 
             
                feed = await asyncio.to_thread(feedparser.parse, xml_text)
         | 
| 58 |  | 
| 59 | 
            +
                papers: List[Dict] = []
         | 
| 60 | 
             
                for ent in feed.entries:
         | 
| 61 | 
            +
                    authors = (
         | 
| 62 | 
            +
                        ", ".join(a.name for a in getattr(ent, "authors", []))
         | 
| 63 | 
            +
                        if hasattr(ent, "authors") else "Unknown"
         | 
| 64 | 
            +
                    )
         | 
| 65 | 
            +
                    papers.append({
         | 
| 66 | 
             
                        "title"    : getattr(ent, "title", "[No title]"),
         | 
| 67 | 
             
                        "authors"  : authors,
         | 
| 68 | 
             
                        "summary"  : getattr(ent, "summary", ""),
         | 
| 69 | 
             
                        "link"     : getattr(ent, "link", ""),
         | 
| 70 | 
            +
                        "published": getattr(ent, "published", ""),
         | 
| 71 | 
             
                        "source"   : "arXiv",
         | 
| 72 | 
             
                    })
         | 
| 73 | 
            +
                return papers
         | 
| 74 |  | 
| 75 |  | 
| 76 | 
            +
            # ──────────────────────────────────────────────────────────────────────
         | 
| 77 | 
             
            # CLI demo
         | 
| 78 | 
            +
            # ──────────────────────────────────────────────────────────────────────
         | 
| 79 | 
             
            if __name__ == "__main__":
         | 
|  | |
| 80 | 
             
                async def _demo():
         | 
| 81 | 
             
                    papers = await fetch_arxiv("glioblastoma CRISPR", max_results=3)
         | 
| 82 | 
            +
                    for p in papers:
         | 
| 83 | 
            +
                        print(p["title"])
         | 
| 84 | 
             
                asyncio.run(_demo())
         | 
