#!/usr/bin/env python3
"""
MedGenesis – arXiv async fetcher (Atom API).

* Uses HTTPS (`https://export.arxiv.org/...`) to avoid HTTP 301 redirects.
* Async httpx fetch with 2×/4× exponential-back-off retry.
* Parses the Atom feed with feedparser inside a thread (non-blocking).
* 6-hour LRU cache keyed by “query+max_results”.
* Returns a list of dicts matching schemas.Paper.

API docs: https://arxiv.org/help/api/user-manual
"""
from __future__ import annotations

import asyncio
import time
from functools import lru_cache
from typing import List, Dict
from urllib.parse import quote_plus

import feedparser
import httpx

# arXiv Atom query endpoint; the URL-encoded search expression is appended.
_BASE = "https://export.arxiv.org/api/query?search_query="
# Timeout handed to the httpx client for every request.
_TIMEOUT = 10
# Upper bound applied to the caller-supplied ``max_results``.
_MAX_RES = 25
# Headers sent with every request so the service can identify this client.
_HEADERS = {"User-Agent": "MedGenesis/1.0 (https://huggingface.co/spaces)"}


# ──────────────────────────────────────────────────────────────────────
# Internal fetch helper with retry
# ──────────────────────────────────────────────────────────────────────
async def _fetch_raw(query: str, max_results: int, *, retries: int = 3) -> str:
    """Return raw Atom XML from arXiv.

    Args:
        query: Free-text search expression; URL-encoded before being sent.
        max_results: Requested number of entries, clamped to ``1.._MAX_RES``.
        retries: Total number of attempts. Between attempts the coroutine
            sleeps 2 s, then 4 s, doubling each time.

    Returns:
        The response body of the first attempt that answers with HTTP 200.

    Raises:
        httpx.TransportError: If the final attempt failed at the network
            level (timeout, connection error, ...).
        RuntimeError: If the final attempt returned a non-200 status, or if
            ``retries`` is not positive so that no request was made.
    """
    max_results = max(1, min(max_results, _MAX_RES))
    url = f"{_BASE}{quote_plus(query)}&max_results={max_results}"
    delay = 2
    last: httpx.Response | None = None
    error: httpx.TransportError | None = None

    # One client for all attempts so the connection pool is reused.
    async with httpx.AsyncClient(timeout=_TIMEOUT, headers=_HEADERS) as cli:
        for attempt in range(retries):
            try:
                last = await cli.get(url)
            except httpx.TransportError as exc:
                # Network-level failures are transient too: retry them
                # instead of letting the first one abort the whole fetch.
                error = exc
            else:
                error = None
                if last.status_code == 200:
                    return last.text
            # Back off only when another attempt will actually follow.
            if attempt < retries - 1:
                await asyncio.sleep(delay)
                delay *= 2

    if error is not None:
        # Surface the original transport exception so callers that already
        # catch httpx errors keep working.
        raise error
    status = last.status_code if last is not None else "No response"
    raise RuntimeError(f"arXiv API failed: {status}")


# ──────────────────────────────────────────────────────────────────────
# Public cached fetch + parse
# ──────────────────────────────────────────────────────────────────────
# Result cache: (query, max_results) -> (expiry on the monotonic clock, papers).
# ``functools.lru_cache`` cannot be used on a coroutine function: it would
# store the coroutine object itself, and awaiting that object a second time
# raises ``RuntimeError: cannot reuse already awaited coroutine``.
_CACHE_TTL = 6 * 60 * 60  # seconds
_CACHE_MAX = 256
_CACHE: Dict[tuple, tuple] = {}


async def fetch_arxiv(query: str, *, max_results: int = 5) -> List[Dict]:
    """Return arXiv paper dicts compatible with schemas.Paper.

    Parsed results are cached per ``(query, max_results)`` for six hours; the
    cache holds at most 256 entries and evicts the least recently used one.

    Args:
        query: Free-text search expression passed to the arXiv API.
        max_results: Number of entries to request (keyword-only).

    Returns:
        A list of dicts with the keys ``title``, ``authors``, ``summary``,
        ``link``, ``published`` and ``source``. Each call returns fresh dict
        copies, so mutating the result does not alter the cached data.

    Raises:
        Whatever ``_fetch_raw`` raises when the request ultimately fails.
    """
    key = (query, max_results)
    hit = _CACHE.get(key)
    if hit is not None:
        expires_at, cached = hit
        if time.monotonic() < expires_at:
            # Re-insert so this key becomes the most recently used one.
            _CACHE[key] = _CACHE.pop(key)
            return [dict(paper) for paper in cached]
        _CACHE.pop(key, None)  # stale entry

    xml_text = await _fetch_raw(query, max_results)

    # feedparser is blocking; parse in thread
    feed = await asyncio.to_thread(feedparser.parse, xml_text)

    papers: List[Dict] = []
    for ent in feed.entries:
        # Tolerate author records without a name instead of raising.
        names = [getattr(a, "name", "") for a in getattr(ent, "authors", [])]
        authors = ", ".join(n for n in names if n) or "Unknown"
        papers.append({
            "title"    : getattr(ent, "title", "[No title]"),
            "authors"  : authors,
            "summary"  : getattr(ent, "summary", ""),
            "link"     : getattr(ent, "link", ""),
            "published": getattr(ent, "published", ""),
            "source"   : "arXiv",
        })

    # The cache may have changed while this coroutine was suspended, so drop
    # any entry for this key first, then evict oldest entries to stay bounded.
    _CACHE.pop(key, None)
    while len(_CACHE) >= _CACHE_MAX:
        _CACHE.pop(next(iter(_CACHE)))
    _CACHE[key] = (time.monotonic() + _CACHE_TTL, papers)
    return [dict(paper) for paper in papers]


# Kept for callers that relied on the attribute the lru_cache wrapper exposed.
fetch_arxiv.cache_clear = _CACHE.clear


# ──────────────────────────────────────────────────────────────────────
# CLI demo
# ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    async def _demo():
        """Fetch three papers for a sample query and print one title per line."""
        results = await fetch_arxiv("glioblastoma CRISPR", max_results=3)
        for title in (paper["title"] for paper in results):
            print(title)

    asyncio.run(_demo())