import re
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional

import httpx
from bs4 import BeautifulSoup

from src.logger import logger


class HuggingFaceDailyPapers:
    """Class for crawling and parsing Hugging Face daily papers"""

    def __init__(self):
        self.base_url = "https://huggingface.co/papers/date"
        self.timeout = 20

    def extract_arxiv_id(self, url: str) -> Optional[str]:
        """Extract an arXiv ID from a URL"""
        if not url:
            return None
        # Matches e.g. https://huggingface.co/papers/2508.10711
        m = re.search(r"huggingface\.co/papers/(\d{4,5}\.\d+)(v\d+)?", url)
        if m:
            return m.group(1)
        return None

    def extract_json_data(self, html: str) -> Dict[str, Any]:
        """Extract GitHub star counts and other metadata from the HTML page."""
        try:
            soup = BeautifulSoup(html, "lxml")

            # GitHub star counts are displayed next to GitHub SVG icons, so scan
            # the page for SVG elements that might be the GitHub logo.
            svg_elements = soup.find_all("svg")
            github_stars_map = {}

            for svg in svg_elements:
                # GitHub-related SVGs usually have a telltale path or the
                # "0 0 256 250" viewBox of the GitHub logo.
                svg_html = str(svg)
                if "github" in svg_html.lower() or "256 250" in svg_html:
                    # Look for a number near this SVG; it is likely the star count.
                    parent = svg.parent
                    if parent:
                        text_content = parent.get_text()
                        numbers = re.findall(r"\b(\d+)\b", text_content)
                        if numbers:
                            star_count = int(numbers[0])
                            # Associate the count with the enclosing paper card
                            # via the closest article container's title.
                            article = svg.find_parent("article")
                            if article:
                                title_elem = article.find("h3")
                                if title_elem:
                                    paper_title = title_elem.get_text(strip=True)
                                    github_stars_map[paper_title] = star_count

            # Also look for any elements with GitHub-related text
            github_text_elements = soup.find_all(
                string=lambda text: text and "github" in text.lower()
            )
            for text_elem in github_text_elements:
                parent = text_elem.parent
                if parent:
                    text_content = parent.get_text()
                    numbers = re.findall(r"\b(\d+)\b", text_content)
                    if numbers:
                        star_count = int(numbers[0])
                        # Try to find the paper title
                        article = parent.find_parent("article")
                        if article:
                            title_elem = article.find("h3")
                            if title_elem:
                                paper_title = title_elem.get_text(strip=True)
                                if paper_title not in github_stars_map:
                                    github_stars_map[paper_title] = star_count

            return {"github_stars_map": github_stars_map}
        except Exception as e:
            logger.error(f"Error extracting JSON data: {e}")
            return {"github_stars_map": {}}

    async def fetch_daily_html(self, target_date: str) -> tuple[str, str]:
        """Fetch daily papers HTML, falling back to the latest available date"""
        async with httpx.AsyncClient(
            timeout=self.timeout, follow_redirects=False
        ) as client:
            # First try the requested date
            url = f"{self.base_url}/{target_date}"
            try:
                r = await client.get(url)
                if r.status_code in [301, 302, 303, 307, 308]:
                    # We got redirected; extract the actual date from the
                    # redirect location (e.g., /papers/date/2025-08-08).
                    location = r.headers.get("location", "")
                    logger.info(f"Got redirect to: {location}")
                    date_match = re.search(r"/papers/date/(\d{4}-\d{2}-\d{2})", location)
                    if date_match:
                        actual_date = date_match.group(1)
                        logger.info(f"Redirected from {target_date} to {actual_date}")
                        # Fetch the page the redirect points to
                        actual_url = f"https://huggingface.co{location}"
                        r = await client.get(actual_url)
                        if r.status_code == 200:
                            return actual_date, r.text
                        else:
                            raise Exception(
                                f"Failed to fetch redirected page: {r.status_code}"
                            )
                    else:
                        # Couldn't extract a date from the redirect; use the fallback
                        raise Exception("Could not extract date from redirect")
                elif r.status_code == 200:
                    # Direct success; check that the page actually contains
                    # the expected content for the requested date.
                    if target_date in r.text or "Daily Papers" in r.text:
                        return target_date, r.text
                    else:
                        raise Exception("Page exists but doesn't contain expected content")
                else:
                    raise Exception(f"Status code {r.status_code}")
            except Exception as e:
                logger.error(f"Failed to fetch {target_date}: {e}")
                # The requested date failed; find the latest available date instead
                actual_date, html = await self.find_latest_available_date(client)
                return actual_date, html

    async def find_latest_available_date(self, client: httpx.AsyncClient) -> tuple[str, str]:
        """Find the latest available date by checking recent dates"""
        # Start from today and go backwards up to 30 days
        today = datetime.now()
        for i in range(30):
            check_date = today - timedelta(days=i)
            date_str = check_date.strftime("%Y-%m-%d")
            url = f"{self.base_url}/{date_str}"
            try:
                r = await client.get(url)
                if r.status_code == 200:
                    # Check that the page actually has content
                    # (not just a 404 or an empty page)
                    if "Daily Papers" in r.text and len(r.text) > 1000:
                        logger.info(f"Found latest available date: {date_str}")
                        return date_str, r.text
            except Exception:
                continue
        # No usable date found in the window, so give up
        raise Exception("No available daily papers found in the last 30 days")

    def parse_daily_cards(self, html: str) -> List[Dict[str, Any]]:
        """Parse daily papers HTML and extract paper cards"""
        soup = BeautifulSoup(html, "lxml")

        # First, extract the GitHub star data from the page
        json_data = self.extract_json_data(html)

        cards: List[Dict[str, Any]] = []

        # Look for article elements with the card class structure Hugging Face uses
        for article in soup.select(
            "article.relative.flex.flex-col.overflow-hidden.rounded-xl.border"
        ):
            try:
                card_data = {}

                # Extract title and link
                title_link = article.select_one("h3 a")
                if title_link:
                    card_data["title"] = title_link.get_text(strip=True)
                    href = title_link.get("href")
                    if href:
                        if href.startswith("http"):
                            card_data["huggingface_url"] = href
                        else:
                            card_data["huggingface_url"] = f"https://huggingface.co{href}"

                # Extract upvote count
                upvote_div = article.select_one("div.shadow-alternate div.leading-none")
                if upvote_div:
                    upvote_text = upvote_div.get_text(strip=True)
                    try:
                        card_data["upvotes"] = int(upvote_text)
                    except ValueError:
                        card_data["upvotes"] = 0

                # Extract author count, e.g. the number in "· 10 authors"
                author_count_div = article.select_one("div.flex.truncate.text-sm")
                if author_count_div:
                    author_text = author_count_div.get_text(strip=True)
                    author_match = re.search(r"(\d+)\s*authors?", author_text)
                    if author_match:
                        card_data["author_count"] = int(author_match.group(1))
                    else:
                        card_data["author_count"] = 0

                # GitHub stars come from the data extracted above; they are
                # filled in below once the title is known.
                card_data["github_stars"] = 0  # Default value

                # Extract comment count from the community links
                comment_links = article.select("a[href*='#community']")
                for comment_link in comment_links:
                    comment_text = comment_link.get_text(strip=True)
                    try:
                        card_data["comments"] = int(comment_text)
                        break
                    except ValueError:
                        continue

                # Extract submitter information
                submitted_div = article.select_one("div.shadow-xs")
                if submitted_div:
                    submitter_text = submitted_div.get_text(strip=True)
                    # Extract the submitter name from "Submitted byLiang0223" (no space)
                    submitter_match = re.search(r"Submitted by(\S+)", submitter_text)
                    if submitter_match:
                        card_data["submitter"] = submitter_match.group(1)

                # Extract the arXiv ID from the URL
                if card_data.get("huggingface_url"):
                    arxiv_id = self.extract_arxiv_id(card_data["huggingface_url"])
                    if arxiv_id:
                        card_data["arxiv_id"] = arxiv_id

                # Look up GitHub stars by matching the paper title
                paper_title = card_data.get("title", "")
                if paper_title in json_data.get("github_stars_map", {}):
                    card_data["github_stars"] = json_data["github_stars_map"][paper_title]

                # Only add cards that have at least a title
                if card_data.get("title"):
                    cards.append(card_data)
            except Exception as e:
                logger.error(f"Error parsing card: {e}")
                continue

        # If the selector above matched nothing, fall back to the old method
        if not cards:
            logger.info("Falling back to old parsing method")
            for h3 in soup.select("h3"):
                # Title and Hugging Face paper link (if present)
                a = h3.find("a")
                title = h3.get_text(strip=True)
                hf_link = None
                if a and a.get("href"):
                    href = a.get("href")
                    # Absolute URL to huggingface.co
                    if href.startswith("http"):
                        hf_link = href
                    else:
                        hf_link = f"https://huggingface.co{href}"

                # Try to capture sibling info (authors, votes, etc.) as a small snippet
                meta_text = None
                parent = h3.parent
                if parent:
                    # Join the immediate text content following the h3
                    collected: List[str] = []
                    for sib in parent.find_all(string=True, recursive=False):
                        t = (sib or "").strip()
                        if t:
                            collected.append(t)
                    if collected:
                        meta_text = " ".join(collected)

                # Try to discover any arXiv link inside nearby anchors
                arxiv_id: Optional[str] = None
                container = parent if parent else h3
                for link in container.find_all("a", href=True):
                    possible = self.extract_arxiv_id(link["href"])
                    if possible:
                        arxiv_id = possible
                        break

                cards.append(
                    {
                        "title": title,
                        "huggingface_url": hf_link,
                        "meta": meta_text,
                        "arxiv_id": arxiv_id,
                    }
                )

        # Deduplicate by title
        seen = set()
        unique_cards: List[Dict[str, Any]] = []
        for c in cards:
            key = c.get("title") or ""
            if key and key not in seen:
                seen.add(key)
                unique_cards.append(c)

        logger.info(f"Parsed {len(unique_cards)} cards")
        return unique_cards

    async def get_daily_papers(self, target_date: str) -> tuple[str, List[Dict[str, Any]]]:
        """Get daily papers for a specific date"""
        date_str, html = await self.fetch_daily_html(target_date)
        cards = self.parse_daily_cards(html)
        return date_str, cards
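

# Minimal usage sketch (an assumption, not part of the original module): drives the
# crawler end to end for one date. Requires this file's package layout so that
# `src.logger` is importable; the date below is illustrative.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        crawler = HuggingFaceDailyPapers()
        # Falls back to the latest available date if this one redirects or is missing
        actual_date, cards = await crawler.get_daily_papers("2025-08-08")
        print(f"{len(cards)} papers for {actual_date}")
        for card in cards[:3]:
            print(card.get("title"), card.get("arxiv_id"), card.get("upvotes"))

    asyncio.run(_demo())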