from typing import List, Dict, Any, Optional
import re
import httpx
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from src.logger import logger


class HuggingFaceDailyPapers:
    """Class for crawling and parsing Hugging Face daily papers"""

    def __init__(self):
        self.base_url = "https://huggingface.co/papers/date"
        self.timeout = 20
    def extract_arxiv_id(self, url: str) -> Optional[str]:
        """Extract arXiv ID from a URL"""
        if not url:
            return None
        # matches https://huggingface.co/papers/2508.10711
        m = re.search(r"huggingface\.co/papers/(\d{4,5}\.\d+)(v\d+)?", url)
        if m:
            return m.group(1)
        return None
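    # Illustrative behaviour (comment only, not part of the original source): for the
    # URL in the comment above, "https://huggingface.co/papers/2508.10711", this method
    # returns "2508.10711"; a trailing version suffix such as "v1" is dropped, and a
    # non-matching or empty URL returns None.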
    def extract_json_data(self, html: str) -> Dict[str, Any]:
        """Extract GitHub star counts and other metadata embedded in the HTML page."""
        try:
            soup = BeautifulSoup(html, "lxml")
            # GitHub stars are rendered next to SVG icons in the page markup,
            # so scan SVG elements and read the adjacent number.
            svg_elements = soup.find_all("svg")
            github_stars_map = {}
            for svg in svg_elements:
                # GitHub-related SVGs usually have a telltale viewBox or path
                svg_html = str(svg)
                if "github" in svg_html.lower() or "256 250" in svg_html:  # GitHub icon viewBox
                    # Look for the star count near this SVG
                    parent = svg.parent
                    if parent:
                        # Look for numbers that might be star counts
                        text_content = parent.get_text()
                        numbers = re.findall(r'\b(\d+)\b', text_content)
                        if numbers:
                            # The number near a GitHub SVG is likely the star count
                            star_count = int(numbers[0])
                            # Associate it with the enclosing paper card's title
                            article = svg.find_parent("article")
                            if article:
                                title_elem = article.find("h3")
                                if title_elem:
                                    paper_title = title_elem.get_text(strip=True)
                                    github_stars_map[paper_title] = star_count
            # Also look for any elements with GitHub-related text
            github_text_elements = soup.find_all(string=lambda text: text and "github" in text.lower())
            for text_elem in github_text_elements:
                parent = text_elem.parent
                if parent:
                    text_content = parent.get_text()
                    numbers = re.findall(r'\b(\d+)\b', text_content)
                    if numbers:
                        star_count = int(numbers[0])
                        # Associate it with the enclosing paper card's title
                        article = parent.find_parent("article")
                        if article:
                            title_elem = article.find("h3")
                            if title_elem:
                                paper_title = title_elem.get_text(strip=True)
                                if paper_title not in github_stars_map:
                                    github_stars_map[paper_title] = star_count
            return {"github_stars_map": github_stars_map}
        except Exception as e:
            logger.error(f"Error extracting JSON data: {e}")
            return {"github_stars_map": {}}
    async def fetch_daily_html(self, target_date: str) -> tuple[str, str]:
        """Fetch daily papers HTML, with fallback to find the latest available date"""
        async with httpx.AsyncClient(timeout=self.timeout, follow_redirects=False) as client:
            # First try the requested date
            url = f"{self.base_url}/{target_date}"
            try:
                r = await client.get(url)
                # Check if we got redirected
                if r.status_code in [301, 302, 303, 307, 308]:
                    # We got redirected, extract the actual date from the redirect location
                    location = r.headers.get('location', '')
                    logger.info(f"Got redirect to: {location}")
                    # Extract date from redirect URL (e.g., /papers/date/2025-08-08)
                    date_match = re.search(r'/papers/date/(\d{4}-\d{2}-\d{2})', location)
                    if date_match:
                        actual_date = date_match.group(1)
                        logger.info(f"Redirected from {target_date} to {actual_date}")
                        # Fetch the actual page
                        actual_url = f"https://huggingface.co{location}"
                        r = await client.get(actual_url)
                        if r.status_code == 200:
                            return actual_date, r.text
                        else:
                            raise Exception(f"Failed to fetch redirected page: {r.status_code}")
                    else:
                        # Couldn't extract date from redirect, use fallback
                        raise Exception("Could not extract date from redirect")
                elif r.status_code == 200:
                    # Direct success, check if the page actually contains the requested date
                    if target_date in r.text or "Daily Papers" in r.text:
                        return target_date, r.text
                    else:
                        # Page exists but doesn't contain expected content
                        raise Exception("Page exists but doesn't contain expected content")
                else:
                    # Other error status
                    raise Exception(f"Status code {r.status_code}")
            except Exception as e:
                logger.error(f"Failed to fetch {target_date}: {e}")
                # If the requested date fails, try to find the latest available date
                actual_date, html = await self.find_latest_available_date(client)
                return actual_date, html
    async def find_latest_available_date(self, client: httpx.AsyncClient) -> tuple[str, str]:
        """Find the latest available date by checking recent dates"""
        # Start from today and go backwards up to 30 days
        today = datetime.now()
        for i in range(30):
            check_date = today - timedelta(days=i)
            date_str = check_date.strftime("%Y-%m-%d")
            url = f"{self.base_url}/{date_str}"
            try:
                r = await client.get(url)
                if r.status_code == 200:
                    # Check if the page actually has content (not just a 404 or empty page)
                    if "Daily Papers" in r.text and len(r.text) > 1000:
                        logger.info(f"Found latest available date: {date_str}")
                        return date_str, r.text
            except Exception:
                continue
        # No usable page found in the last 30 days
        raise Exception("No available daily papers found in the last 30 days")
    def parse_daily_cards(self, html: str) -> List[Dict[str, Any]]:
        """Parse daily papers HTML and extract paper cards"""
        soup = BeautifulSoup(html, "lxml")
        # First, extract the GitHub star data embedded in the page
        json_data = self.extract_json_data(html)
        # Find all article elements that contain paper cards
        cards: List[Dict[str, Any]] = []
        # Look for article elements with the specific class structure from Hugging Face
        for article in soup.select("article.relative.flex.flex-col.overflow-hidden.rounded-xl.border"):
            try:
                card_data = {}
                # Extract title and link
                title_link = article.select_one("h3 a")
                if title_link:
                    card_data["title"] = title_link.get_text(strip=True)
                    href = title_link.get("href")
                    if href:
                        if href.startswith("http"):
                            card_data["huggingface_url"] = href
                        else:
                            card_data["huggingface_url"] = f"https://huggingface.co{href}"
                # Extract upvote count
                upvote_div = article.select_one("div.shadow-alternate div.leading-none")
                if upvote_div:
                    upvote_text = upvote_div.get_text(strip=True)
                    try:
                        card_data["upvotes"] = int(upvote_text)
                    except ValueError:
                        card_data["upvotes"] = 0
                # Extract author count from text such as "· 10 authors"
                author_count_div = article.select_one("div.flex.truncate.text-sm")
                if author_count_div:
                    author_text = author_count_div.get_text(strip=True)
                    author_match = re.search(r'(\d+)\s*authors?', author_text)
                    if author_match:
                        card_data["author_count"] = int(author_match.group(1))
                    else:
                        card_data["author_count"] = 0
                # GitHub stars default to 0 and are filled in below from the extracted page data
                card_data["github_stars"] = 0
                # Extract comments count - look for links to the community tab
                comment_links = article.select("a[href*='#community']")
                for comment_link in comment_links:
                    comment_text = comment_link.get_text(strip=True)
                    try:
                        card_data["comments"] = int(comment_text)
                        break
                    except ValueError:
                        continue
                # Extract submitter information
                submitted_div = article.select_one("div.shadow-xs")
                if submitted_div:
                    submitter_text = submitted_div.get_text(strip=True)
                    # Extract submitter name from "Submitted byLiang0223" (no space)
                    submitter_match = re.search(r'Submitted by(\S+)', submitter_text)
                    if submitter_match:
                        card_data["submitter"] = submitter_match.group(1)
                # Extract arXiv ID from the URL
                if card_data.get("huggingface_url"):
                    arxiv_id = self.extract_arxiv_id(card_data["huggingface_url"])
                    if arxiv_id:
                        card_data["arxiv_id"] = arxiv_id
                # Fill in GitHub stars by matching the paper title against the extracted data
                paper_title = card_data.get("title", "")
                if paper_title in json_data.get("github_stars_map", {}):
                    card_data["github_stars"] = json_data["github_stars_map"][paper_title]
                # Only add cards that have at least a title
                if card_data.get("title"):
                    cards.append(card_data)
            except Exception as e:
                logger.error(f"Error parsing card: {e}")
                continue
        # If the selector-based method found nothing, fall back to the older h3-based method
        if not cards:
            logger.info("Falling back to old parsing method")
            for h3 in soup.select("h3"):
                # Title and Hugging Face paper link (if present)
                a = h3.find("a")
                title = h3.get_text(strip=True)
                hf_link = None
                if a and a.get("href"):
                    href = a.get("href")
                    # Absolute URL to huggingface
                    if href.startswith("http"):
                        hf_link = href
                    else:
                        hf_link = f"https://huggingface.co{href}"
                # Try to capture sibling info (authors, votes, etc.) as a small snippet
                meta_text = None
                parent = h3.parent
                if parent:
                    # Join immediate text content following h3
                    collected: List[str] = []
                    for sib in parent.find_all(string=True, recursive=False):
                        t = (sib or "").strip()
                        if t:
                            collected.append(t)
                    if collected:
                        meta_text = " ".join(collected)
                # Try to discover any arXiv link inside nearby anchors
                arxiv_id: Optional[str] = None
                container = parent if parent else h3
                for link in container.find_all("a", href=True):
                    possible = self.extract_arxiv_id(link["href"])
                    if possible:
                        arxiv_id = possible
                        break
                cards.append(
                    {
                        "title": title,
                        "huggingface_url": hf_link,
                        "meta": meta_text,
                        "arxiv_id": arxiv_id,
                    }
                )
        # Deduplicate by title
        seen = set()
        unique_cards: List[Dict[str, Any]] = []
        for c in cards:
            key = c.get("title") or ""
            if key and key not in seen:
                seen.add(key)
                unique_cards.append(c)
        logger.info(f"Parsed {len(unique_cards)} cards")
        return unique_cards
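    # A parsed card is a plain dict; a representative entry from the selector-based path
    # (values illustrative, taken from the examples in the comments above) looks like:
    #   {"title": "...", "huggingface_url": "https://huggingface.co/papers/2508.10711",
    #    "upvotes": 12, "author_count": 10, "github_stars": 0, "comments": 3,
    #    "submitter": "Liang0223", "arxiv_id": "2508.10711"}
    # The fallback path yields only "title", "huggingface_url", "meta", and "arxiv_id".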
    async def get_daily_papers(self, target_date: str) -> tuple[str, List[Dict[str, Any]]]:
        """Get daily papers for a specific date"""
        date_str, html = await self.fetch_daily_html(target_date)
        cards = self.parse_daily_cards(html)
        return date_str, cards
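
# Minimal usage sketch (illustrative addition, not part of the original module).
# It assumes src.logger is importable in the current environment; the date below is an
# arbitrary example and, if that date has no papers, fetch_daily_html() falls back to
# the latest available date.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        crawler = HuggingFaceDailyPapers()
        date_str, cards = await crawler.get_daily_papers("2025-08-08")
        print(f"{date_str}: {len(cards)} papers")
        for card in cards[:3]:
            print("-", card.get("title"), card.get("arxiv_id"))

    asyncio.run(_demo())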