# open-notebooklm / papers.py
import os
import requests
import tempfile
from datetime import datetime, timezone
import base64
from tqdm.auto import tqdm
import pymupdf

DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers"


class PaperManager:
    def __init__(self, papers_per_page=30):
        self.papers_per_page = papers_per_page  # accepted for pagination; not used elsewhere in this module
        self.papers = []       # filtered / ranked papers
        self.raw_papers = []   # raw entries fetched from the daily_papers API
    def calculate_rising_score(self, paper):
        """
        Calculate the rising score of a paper.
        This emphasizes recent upvotes and the rate of upvote accumulation.
        """
        upvotes = paper.get('paper', {}).get('upvotes', 0)
        published_at_str = paper.get('publishedAt', datetime.now(timezone.utc).isoformat())
        try:
            published_time = datetime.fromisoformat(published_at_str.replace('Z', '+00:00'))
        except ValueError:
            published_time = datetime.now(timezone.utc)

        time_diff = datetime.now(timezone.utc) - published_time
        time_diff_hours = time_diff.total_seconds() / 3600  # convert the time difference to hours

        # The rising score favors papers that are gaining upvotes quickly:
        # upvotes per hour since publication, with +1 in the denominator to avoid
        # division by zero for very recent papers.
        score = upvotes / (time_diff_hours + 1)
        return score
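
    # Example for calculate_rising_score: a paper with 12 upvotes published
    # 5 hours ago scores 12 / (5 + 1) = 2.0; the same paper 23 hours after
    # publication scores 12 / (23 + 1) = 0.5.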
    def fetch_papers(self):
        """Fetch the latest daily papers and store them in self.raw_papers."""
        try:
            response = requests.get(f"{DAILY_PAPERS_API_URL}?limit=100", timeout=30)
            response.raise_for_status()
            data = response.json()

            if not data:
                print("No data received from API.")
                return False

            self.raw_papers = data  # store the raw API entries
            return True

        except requests.RequestException as e:
            print(f"Error fetching papers: {e}")
            return False
        except Exception as e:
            print(f"Unexpected error: {e}")
            return False
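
    # Each entry in self.raw_papers is a dict returned by the daily_papers API.
    # The methods below rely only on a few of its fields: the top-level
    # "publishedAt" and "title", and the nested "paper" dict's "id", "title",
    # and "upvotes".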
    def filter_top_papers(self, threshold_general=2.0, threshold_agent=0.7):
        """Rank papers by rising score (boosted for 'agent' papers) and keep the top two."""
        self.papers = []
        for paper in self.raw_papers:
            # The score-threshold filter is currently disabled, so every fetched
            # paper is kept and the ranking below decides what survives:
            # paper_score = self.calculate_rising_score(paper)
            # if paper_score >= threshold_general or ('agent' in paper['title'].lower() and paper_score >= threshold_agent):
            self.papers.append(paper)

        # Papers whose title mentions "agent" get a 3x score boost, and only the
        # two highest-scoring papers are kept.
        self.papers = sorted(
            self.papers,
            key=lambda x: self.calculate_rising_score(x) * (3 if 'agent' in x['title'].lower() else 1),
            reverse=True
        )[:2]
        return self.papers
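
    # The method below is an earlier, disabled variant that fetched the PDF with
    # httpx (not imported here) and returned its bytes base64-encoded, which is
    # why the base64 import above exists. It is kept commented out for reference.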
# def get_paper_content(self, paper_id):
# pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
# print("Processing paper:", pdf_url)
# client = httpx.Client(follow_redirects=True)
# response = client.get(pdf_url)
# # First verification - check if we got a valid PDF response
# if response.status_code != 200:
# raise Exception(f"Failed to fetch PDF: {response.status_code}")
# if not response.headers.get('content-type', '').startswith('application/pdf'):
# raise Exception(f"Unexpected content type: {response.headers.get('content-type')}")
# # Second verification - check the first few bytes of the content
# if not response.content.startswith(b'%PDF'):
# raise Exception("Content doesn't appear to be a valid PDF")
# pdf_data = base64.standard_b64encode(response.content).decode("utf-8")
# return {"pdf": pdf_data, "url": pdf_url}
    def get_paper_text(self, paper_id):
        """Download the paper's PDF from arXiv and return its extracted plain text."""
        url = f"https://arxiv.org/pdf/{paper_id}.pdf"
        response = requests.get(url, timeout=60)
        if response.status_code != 200:
            raise Exception(f"Failed to download PDF: {response.status_code}")

        # Write the PDF to a temporary file rather than a fixed "temp.pdf" in the
        # working directory, then let PyMuPDF extract the text page by page.
        with tempfile.NamedTemporaryFile(suffix=".pdf") as tmp:
            tmp.write(response.content)
            tmp.flush()
            with pymupdf.open(tmp.name) as doc:
                text = ""
                for page in doc:
                    text += page.get_text()
        return text
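
    # Note: PyMuPDF can also open a document directly from bytes, which would
    # avoid touching the filesystem at all, e.g.:
    #     with pymupdf.open(stream=response.content, filetype="pdf") as doc:
    #         ...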
    def get_top_content(self):
        """Fetch, rank, and download the top papers, returning {title: full text}."""
        self.fetch_papers()
        self.filter_top_papers()
        contents = {}
        print(f"Processing {len(self.papers)} papers:")
        for paper in tqdm(self.papers):
            paper_id = paper["paper"]["id"]
            contents[paper["paper"]["title"]] = self.get_paper_text(paper_id)
        return contents
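

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original module): fetch today's daily
    # papers, keep the top two by rising score, and print a short preview of each.
    manager = PaperManager()
    top_contents = manager.get_top_content()
    for title, text in top_contents.items():
        print(f"\n=== {title} ({len(text)} characters extracted) ===")
        print(text[:300])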