# paperindex/src/crawl/huggingface_daily.py
import re
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional

import httpx
from bs4 import BeautifulSoup

from src.logger import logger


class HuggingFaceDailyPapers:
"""Class for crawling and parsing Hugging Face daily papers"""
def __init__(self):
self.base_url = "https://huggingface.co/papers/date"
self.timeout = 20
def extract_arxiv_id(self, url: str) -> Optional[str]:
"""Extract arXiv ID from a URL"""
if not url:
return None
# matches https://huggingface.co/papers/2508.10711
m = re.search(r"huggingface\.co/papers/(\d{4,5}\.\d+)(v\d+)?", url)
if m:
return m.group(1)
return None
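    # A quick illustration of the expected mapping (the first URL is the example
    # from the comment above; the others are hypothetical inputs, not fetched data):
    #   extract_arxiv_id("https://huggingface.co/papers/2508.10711")   -> "2508.10711"
    #   extract_arxiv_id("https://huggingface.co/papers/2508.10711v2") -> "2508.10711"
    #   extract_arxiv_id("https://example.com/not-a-paper")            -> None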
def extract_json_data(self, html: str) -> Dict[str, Any]:
"""Extract JSON data from the HTML page to get GitHub stars and other metadata."""
try:
soup = BeautifulSoup(html, "lxml")
# Look for GitHub stars in the HTML structure
# Based on the user's description, GitHub stars are displayed with SVG icons
# Look for SVG elements that might represent GitHub stars
svg_elements = soup.find_all("svg")
github_stars_map = {}
for svg in svg_elements:
# Look for GitHub-related SVG (usually has specific viewBox or path)
svg_html = str(svg)
if "github" in svg_html.lower() or "256 250" in svg_html: # GitHub icon viewBox
# Look for the star count near this SVG
parent = svg.parent
if parent:
# Look for numbers that might be star counts
text_content = parent.get_text()
numbers = re.findall(r'\b(\d+)\b', text_content)
if numbers:
# The number near a GitHub SVG is likely the star count
star_count = int(numbers[0])
# Try to find the paper title or ID to associate with
# Look for the closest article or card container
article = svg.find_parent("article")
if article:
title_elem = article.find("h3")
if title_elem:
paper_title = title_elem.get_text(strip=True)
github_stars_map[paper_title] = star_count
# Also look for any elements with GitHub-related text
github_text_elements = soup.find_all(string=lambda text: text and "github" in text.lower())
for text_elem in github_text_elements:
parent = text_elem.parent
if parent:
text_content = parent.get_text()
numbers = re.findall(r'\b(\d+)\b', text_content)
if numbers:
star_count = int(numbers[0])
# Try to find the paper title
article = parent.find_parent("article")
if article:
title_elem = article.find("h3")
if title_elem:
paper_title = title_elem.get_text(strip=True)
if paper_title not in github_stars_map:
github_stars_map[paper_title] = star_count
return {"github_stars_map": github_stars_map}
except Exception as e:
logger.error(f"Error extracting JSON data: {e}")
return {"github_stars_map": {}}
async def fetch_daily_html(self, target_date: str) -> tuple[str, str]:
"""Fetch daily papers HTML, with fallback to find the latest available date"""
async with httpx.AsyncClient(timeout=self.timeout, follow_redirects=False) as client:
# First try the requested date
url = f"{self.base_url}/{target_date}"
try:
r = await client.get(url)
# Check if we got redirected
if r.status_code in [301, 302, 303, 307, 308]:
# We got redirected, extract the actual date from the redirect location
location = r.headers.get('location', '')
logger.info(f"Got redirect to: {location}")
# Extract date from redirect URL (e.g., /papers/date/2025-08-08)
date_match = re.search(r'/papers/date/(\d{4}-\d{2}-\d{2})', location)
if date_match:
actual_date = date_match.group(1)
logger.info(f"Redirected from {target_date} to {actual_date}")
                        # Fetch the actual page; the redirect location may be
                        # relative ("/papers/date/...") or absolute
                        actual_url = location if location.startswith("http") else f"https://huggingface.co{location}"
r = await client.get(actual_url)
if r.status_code == 200:
return actual_date, r.text
else:
raise Exception(f"Failed to fetch redirected page: {r.status_code}")
else:
# Couldn't extract date from redirect, use fallback
raise Exception("Could not extract date from redirect")
elif r.status_code == 200:
# Direct success, check if the page actually contains the requested date
if target_date in r.text or "Daily Papers" in r.text:
return target_date, r.text
else:
# Page exists but doesn't contain expected content
raise Exception("Page exists but doesn't contain expected content")
else:
# Other error status
raise Exception(f"Status code {r.status_code}")
except Exception as e:
logger.error(f"Failed to fetch {target_date}: {e}")
# If the requested date fails, try to find the latest available date
actual_date, html = await self.find_latest_available_date(client)
return actual_date, html
async def find_latest_available_date(self, client: httpx.AsyncClient) -> tuple[str, str]:
"""Find the latest available date by checking recent dates"""
# Start from today and go backwards up to 30 days
today = datetime.now()
for i in range(30):
check_date = today - timedelta(days=i)
date_str = check_date.strftime("%Y-%m-%d")
url = f"{self.base_url}/{date_str}"
try:
r = await client.get(url)
if r.status_code == 200:
# Check if the page actually has content (not just a 404 or empty page)
if "Daily Papers" in r.text and len(r.text) > 1000:
logger.info(f"Found latest available date: {date_str}")
return date_str, r.text
except Exception:
continue
# If no date found, return a default page or raise an error
raise Exception("No available daily papers found in the last 30 days")
def parse_daily_cards(self, html: str) -> List[Dict[str, Any]]:
"""Parse daily papers HTML and extract paper cards"""
soup = BeautifulSoup(html, "lxml")
        # First, extract page metadata (GitHub star counts keyed by paper title)
json_data = self.extract_json_data(html)
# Find all article elements that contain paper cards
cards: List[Dict[str, Any]] = []
# Look for article elements with the specific class structure from Hugging Face
for article in soup.select("article.relative.flex.flex-col.overflow-hidden.rounded-xl.border"):
try:
card_data = {}
# Extract title and link
title_link = article.select_one("h3 a")
if title_link:
card_data["title"] = title_link.get_text(strip=True)
href = title_link.get("href")
if href:
if href.startswith("http"):
card_data["huggingface_url"] = href
else:
card_data["huggingface_url"] = f"https://huggingface.co{href}"
# Extract upvote count
upvote_div = article.select_one("div.shadow-alternate div.leading-none")
if upvote_div:
upvote_text = upvote_div.get_text(strip=True)
try:
card_data["upvotes"] = int(upvote_text)
except ValueError:
card_data["upvotes"] = 0
# Extract author count - look for the author count text
author_count_div = article.select_one("div.flex.truncate.text-sm")
if author_count_div:
author_text = author_count_div.get_text(strip=True)
# Extract number from "· 10 authors"
author_match = re.search(r'(\d+)\s*authors?', author_text)
if author_match:
card_data["author_count"] = int(author_match.group(1))
else:
card_data["author_count"] = 0
                # GitHub stars default to 0; the real value is filled in below from
                # the title -> star-count map built by extract_json_data()
                card_data["github_stars"] = 0
# Extract comments count - look for comment icon and number
comment_links = article.select("a[href*='#community']")
for comment_link in comment_links:
comment_text = comment_link.get_text(strip=True)
try:
card_data["comments"] = int(comment_text)
break
except ValueError:
continue
# Extract submitter information
submitted_div = article.select_one("div.shadow-xs")
if submitted_div:
submitter_text = submitted_div.get_text(strip=True)
# Extract submitter name from "Submitted byLiang0223" (no space)
submitter_match = re.search(r'Submitted by(\S+)', submitter_text)
if submitter_match:
card_data["submitter"] = submitter_match.group(1)
# Extract arXiv ID from the URL
if card_data.get("huggingface_url"):
arxiv_id = self.extract_arxiv_id(card_data["huggingface_url"])
if arxiv_id:
card_data["arxiv_id"] = arxiv_id
# Try to get GitHub stars from the extracted data
# Look for GitHub stars by matching paper title
paper_title = card_data.get("title", "")
if paper_title in json_data.get("github_stars_map", {}):
card_data["github_stars"] = json_data["github_stars_map"][paper_title]
# Only add cards that have at least a title
if card_data.get("title"):
cards.append(card_data)
except Exception as e:
logger.error(f"Error parsing card: {e}")
continue
# If the above method didn't work, fall back to the old method
if not cards:
logger.info("Falling back to old parsing method")
for h3 in soup.select("h3"):
# Title and Hugging Face paper link (if present)
a = h3.find("a")
title = h3.get_text(strip=True)
hf_link = None
if a and a.get("href"):
href = a.get("href")
# Absolute URL to huggingface
if href.startswith("http"):
hf_link = href
else:
hf_link = f"https://huggingface.co{href}"
# Try to capture sibling info (authors, votes, etc.) as a small snippet
meta_text = None
parent = h3.parent
if parent:
# Join immediate text content following h3
collected: List[str] = []
                    for sib in parent.find_all(string=True, recursive=False):
t = (sib or "").strip()
if t:
collected.append(t)
if collected:
meta_text = " ".join(collected)
# Try to discover any arXiv link inside nearby anchors
arxiv_id: Optional[str] = None
container = parent if parent else h3
for link in container.find_all("a", href=True):
possible = self.extract_arxiv_id(link["href"])
if possible:
arxiv_id = possible
break
cards.append(
{
"title": title,
"huggingface_url": hf_link,
"meta": meta_text,
"arxiv_id": arxiv_id,
}
)
# Deduplicate by title
seen = set()
unique_cards: List[Dict[str, Any]] = []
for c in cards:
key = c.get("title") or ""
if key and key not in seen:
seen.add(key)
unique_cards.append(c)
logger.info(f"Parsed {len(unique_cards)} cards")
return unique_cards
async def get_daily_papers(self, target_date: str) -> tuple[str, List[Dict[str, Any]]]:
"""Get daily papers for a specific date"""
date_str, html = await self.fetch_daily_html(target_date)
cards = self.parse_daily_cards(html)
return date_str, cards
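

# --- Usage sketch ---------------------------------------------------------
# Not part of the crawler itself; an illustrative assumption about how the
# class might be driven. get_daily_papers expects a YYYY-MM-DD date string and
# returns the date actually fetched (which may differ if Hugging Face redirects
# to the latest available day) together with the parsed cards.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        crawler = HuggingFaceDailyPapers()
        date_str, cards = await crawler.get_daily_papers("2025-08-08")
        for card in cards:
            print(date_str, card.get("title"), card.get("arxiv_id"), card.get("upvotes"))

    asyncio.run(_demo())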