import base64
import os
import tempfile
from datetime import datetime, timezone, timedelta

import pymupdf
import requests
from tqdm.auto import tqdm

DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers"

# Seconds to wait for a connection / for the next chunk of a response before
# giving up. Without a timeout, `requests` blocks indefinitely on a stalled
# server.
REQUEST_TIMEOUT_SECONDS = 30


class PaperManager:
    """Fetch the daily papers listing and extract the text of the top paper.

    Attributes are populated by the methods below:
        raw_papers: the JSON payload stored by the last successful
            `fetch_papers` call.
        papers: the entries selected by the last `get_top_content` call.
    """

    def __init__(self):
        # Initialise state up front so reading these attributes before the
        # first fetch yields an empty list instead of an AttributeError.
        self.raw_papers = []
        self.papers = []

    def fetch_papers(self, date=None):
        """
        Fetch papers from the API with optional date filtering.

        On success the decoded JSON payload is stored in `self.raw_papers`.
        Errors are reported with `print` and signalled through the return
        value; they are not raised to the caller.

        Args:
            date (str, optional): Date string in 'YYYY-MM-DD' format.
                Defaults to today's date.

        Returns:
            bool: True if papers were successfully fetched, False otherwise
                (request failure, unexpected error, or an empty payload).
        """
        # Use today's date if none provided.
        # NOTE(review): this is the machine's local date; if the API buckets
        # papers by UTC day, `datetime.now(timezone.utc)` may be the intended
        # call -- confirm before changing.
        if date is None:
            date = datetime.now().strftime('%Y-%m-%d')

        # Construct the URL with the date parameter
        url = f"{DAILY_PAPERS_API_URL}?date={date}&limit=100"
        print(f"Fetching papers from: {url}")

        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
            response.raise_for_status()
            data = response.json()
        except requests.RequestException as e:
            print(f"Error fetching papers: {e}")
            return False
        except Exception as e:
            # Deliberate catch-all boundary: this method reports failure via
            # its return value rather than by raising.
            print(f"Unexpected error: {e}")
            return False

        if not data:
            print("No data received from API.")
            return False

        self.raw_papers = data  # Store raw data
        print(f"Found {len(data)} papers for date {date}")
        return True

    def get_top_content(self):
        """
        Get the most upvoted paper from today's submissions.

        Calls `fetch_papers()` with its default date, keeps the entry with
        the highest `paper.upvotes` value (missing values count as 0) in
        `self.papers`, and downloads its text.

        Returns:
            dict: Maps the paper title to a dict with keys "id" (the paper
                id) and "content" (the extracted PDF text). Empty if the
                fetch failed.

        Raises:
            RuntimeError: Propagated from `get_paper_text` when the PDF
                download does not return HTTP 200.
            requests.RequestException: Propagated from `get_paper_text` on
                network failures or timeouts.
        """
        # Fetch papers from today
        if not self.fetch_papers():
            return {}

        if self.raw_papers:
            # Only the top entry is needed, so pick it directly instead of
            # sorting the whole list. On ties `max` returns the earliest
            # entry, the same one a stable descending sort would put first.
            top_paper = max(
                self.raw_papers,
                key=lambda entry: entry.get('paper', {}).get('upvotes', 0),
            )
            self.papers = [top_paper]
        else:
            print("No papers found for today.")
            self.papers = []

        # Get content
        contents = {}
        print(f"Processing {len(self.papers)} papers:")
        for paper in tqdm(self.papers):
            details = paper["paper"]
            paper_id = details['id']
            contents[details['title']] = {
                "id": paper_id,
                "content": self.get_paper_text(paper_id),
            }
        return contents

    def get_paper_text(self, paper_id):
        """
        Download a paper's PDF from arXiv and return its text.

        The PDF is parsed directly from the downloaded bytes, so nothing is
        written to disk.

        Args:
            paper_id (str): Identifier interpolated into
                `https://arxiv.org/pdf/{paper_id}.pdf`.

        Returns:
            str: Concatenation of `get_text()` over every page, in order.

        Raises:
            RuntimeError: If the response status code is not 200.
            requests.RequestException: If the request itself fails or
                times out.
        """
        url = f"https://arxiv.org/pdf/{paper_id}.pdf"
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        if response.status_code != 200:
            raise RuntimeError(f"Failed to download PDF: {response.status_code}")

        # Open from memory rather than a shared "temp.pdf" in the working
        # directory: no leftover file, no clobbering, safe for parallel use.
        with pymupdf.open(stream=response.content, filetype="pdf") as doc:
            return "".join(page.get_text() for page in doc)