File size: 3,838 Bytes
b358061
547fef1
b358061
e325224
b358061
 
 
547fef1
 
 
b358061
e325224
 
 
 
 
 
 
 
 
 
b358061
e325224
 
 
 
 
 
 
 
 
b358061
 
 
 
 
 
 
 
e325224
b358061
 
 
 
 
 
 
 
 
e325224
 
 
b358061
e325224
 
 
 
 
 
b358061
e325224
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55e52b0
 
e325224
 
b358061
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19da3fb
 
 
 
 
 
 
 
 
 
e325224
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import os
import requests
import tempfile
from datetime import datetime, timezone, timedelta
import base64
from tqdm.auto import tqdm
import pymupdf

DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers"

class PaperManager:
    """Fetch Hugging Face daily papers and extract the text of the top one.

    Typical use is ``PaperManager().get_top_content()``, which fetches the
    current day's listing, picks the most upvoted entry and downloads its
    PDF from arXiv to extract the text.
    """

    # Seconds to wait on a single HTTP request before giving up, so a stalled
    # server cannot hang the caller indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        # Entries returned by the last successful fetch_papers() call.
        self.raw_papers = []
        # Entries selected for processing by get_top_content().
        self.papers = []

    def fetch_papers(self, date=None):
        """
        Fetch papers from the API with optional date filtering.

        On success the decoded JSON payload is stored in ``self.raw_papers``;
        on failure ``self.raw_papers`` is left unchanged. Errors are printed
        rather than raised.

        Args:
            date (str, optional): Date string in 'YYYY-MM-DD' format. Defaults
                to today's date as given by ``datetime.now()`` (local time).

        Returns:
            bool: True if papers were successfully fetched, False otherwise
                (request error, empty payload, or any unexpected error).
        """
        try:
            # Use today's date if none provided
            if date is None:
                date = datetime.now().strftime('%Y-%m-%d')

            # Construct the URL with the date parameter
            url = f"{DAILY_PAPERS_API_URL}?date={date}&limit=100"

            print(f"Fetching papers from: {url}")
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            data = response.json()

            if not data:
                print("No data received from API.")
                return False

            self.raw_papers = data  # Store raw data
            print(f"Found {len(data)} papers for date {date}")
            return True

        except requests.RequestException as e:
            # Covers connection errors, timeouts and non-2xx responses.
            print(f"Error fetching papers: {e}")
            return False
        except Exception as e:
            # Deliberate catch-all: this method reports failure via its
            # return value instead of raising.
            print(f"Unexpected error: {e}")
            return False

    @staticmethod
    def _select_top_paper(raw_papers):
        """Return a one-element list with the most upvoted entry, or [] if empty.

        Entries lacking ``paper`` or ``upvotes`` count as zero upvotes. On a
        tie the entry that appears first in ``raw_papers`` wins.
        """
        if not raw_papers:
            return []
        top = max(
            raw_papers,
            key=lambda entry: entry.get('paper', {}).get('upvotes', 0),
        )
        return [top]

    def get_top_content(self):
        """
        Get the most upvoted paper from today's submissions.

        The selected entry is also stored in ``self.papers``.

        Returns:
            dict: Maps the paper title to ``{"id": <paper id>, "content":
                <extracted PDF text>}``. Empty if fetching failed.

        Raises:
            Exception: Propagated from ``get_paper_text`` when the PDF cannot
                be downloaded.
        """
        # Fetch papers from today
        if not self.fetch_papers():
            return {}

        # Keep only the single most upvoted paper
        self.papers = self._select_top_paper(self.raw_papers)
        if not self.papers:
            print("No papers found for today.")

        # Get content
        contents = {}
        print(f"Processing {len(self.papers)} papers:")
        for paper in tqdm(self.papers):
            paper_id = paper["paper"]['id']
            content = self.get_paper_text(paper_id)
            contents[paper["paper"]['title']] = {"id": paper_id, "content": content}

        return contents

    def get_paper_text(self, paper_id):
        """
        Download a paper's PDF from arXiv and return its extracted text.

        Args:
            paper_id (str): Identifier interpolated into
                ``https://arxiv.org/pdf/{paper_id}.pdf``.

        Returns:
            str: Text of all pages, concatenated in page order.

        Raises:
            Exception: If the download does not return HTTP 200.
        """
        url = f"https://arxiv.org/pdf/{paper_id}.pdf"
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)

        if response.status_code != 200:
            raise Exception(f"Failed to download PDF: {response.status_code}")

        # Parse the PDF straight from memory: no fixed-name file is left
        # behind in the working directory, and concurrent runs cannot
        # overwrite each other's download.
        with pymupdf.open(stream=response.content, filetype="pdf") as doc:
            return "".join(page.get_text() for page in doc)


    # def get_top_content(self):
    #     self.fetch_papers()
    #     self.filter_top_papers()
    #     contents =  {}
    #     print(f"Processing {len(self.papers)} papers:")
    #     for paper in tqdm(self.papers):
    #         paper_id = paper["paper"]['id']
    #         contents[paper["paper"]['title']] = self.get_paper_text(paper_id)
    #     return contents

    
# Example usage
if __name__ == "__main__":
    paper_manager = PaperManager()
    top_papers = paper_manager.get_top_content()
    # Each value is a dict with "id" and "content" keys, so the text has to be
    # looked up before slicing; slicing the dict itself raises.
    for title, entry in top_papers.items():
        print(f"Title: {title}")
        print(f"Content: {entry['content'][:100]}...")  # Print first 100 characters of content