m-ric HF Staff commited on
Commit
6cc0694
·
1 Parent(s): 8569025

Add paper management

Browse files
Files changed (3) hide show
  1. app.py +8 -2
  2. papers.py +117 -0
  3. requirements.txt +3 -2
app.py CHANGED
@@ -12,9 +12,15 @@ from huggingface_hub import InferenceClient
12
  from kokoro import KModel, KPipeline
13
 
14
  # -----------------------------------------------------------------------------
15
- # Hard‑coded podcast subject
16
  # -----------------------------------------------------------------------------
17
- PODCAST_SUBJECT = "The future of AI and its impact on society"
 
 
 
 
 
 
18
 
19
  # -----------------------------------------------------------------------------
20
  # LLM that writes the script (unchanged)
 
12
  from kokoro import KModel, KPipeline
13
 
14
# -----------------------------------------------------------------------------
# Get podcast subject
# -----------------------------------------------------------------------------
from papers import PaperManager

# BUG FIX: was bound as `paper_manage` but used as `paper_manager` (NameError).
paper_manager = PaperManager()
top_papers = paper_manager.get_top_content()
print("TOP PAPERS", top_papers)

# BUG FIX: get_top_content() returns {paper_title: full_text}; the original
# `top_papers[0]["paper_id"]` would raise KeyError(0) on a dict. Take the
# text of the top-ranked paper (dicts preserve insertion order, and papers
# are inserted in ranked order).
PODCAST_SUBJECT = next(iter(top_papers.values()))
24
 
25
  # -----------------------------------------------------------------------------
26
  # LLM that writes the script (unchanged)
papers.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import requests
import tempfile
from datetime import datetime, timezone
import base64
from tqdm.auto import tqdm
# BUG FIX: the pip package "PyMuPDF" installs its module as `fitz`
# (`pymupdf` only in >=1.24), so `import PyMuPDF` raises
# ModuleNotFoundError. Alias keeps the `PyMuPDF.open(...)` call sites working.
import fitz as PyMuPDF

# Hugging Face daily-papers listing endpoint (public JSON API).
DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers"
10
+
11
class PaperManager:
    """Fetch Hugging Face daily papers, rank them by a "rising" score,
    and extract the full text of the top papers from their arXiv PDFs."""

    def __init__(self, papers_per_page=30):
        # BUG FIX: papers_per_page was accepted but silently discarded.
        self.papers_per_page = papers_per_page
        self.papers = []       # ranked/filtered papers (set by filter_top_papers)
        self.raw_papers = []   # raw API payload (set by fetch_papers)

    def calculate_rising_score(self, paper):
        """
        Calculate the rising score of a paper.
        This emphasizes recent upvotes and the rate of upvote accumulation.

        A missing/unparseable `publishedAt` falls back to "now", i.e. a
        time-diff of ~0 hours, so the score degrades to the raw upvote count.
        """
        upvotes = paper.get('paper', {}).get('upvotes', 0)
        published_at_str = paper.get('publishedAt', datetime.now(timezone.utc).isoformat())
        try:
            published_time = datetime.fromisoformat(published_at_str.replace('Z', '+00:00'))
        except ValueError:
            published_time = datetime.now(timezone.utc)

        time_diff = datetime.now(timezone.utc) - published_time
        time_diff_hours = time_diff.total_seconds() / 3600  # convert to hours

        # Rising score favors papers gaining upvotes quickly; linear decay
        # over time, +1 in the denominator avoids division by zero.
        return upvotes / (time_diff_hours + 1)

    def fetch_papers(self):
        """Populate self.raw_papers from the daily-papers API.

        Returns True on success, False on any failure (logged, not raised).
        """
        try:
            response = requests.get(f"{DAILY_PAPERS_API_URL}?limit=100", timeout=30)
            response.raise_for_status()
            data = response.json()

            if not data:
                print("No data received from API.")
                return False

            self.raw_papers = data  # store raw data
            return True

        except requests.RequestException as e:
            print(f"Error fetching papers: {e}")
            return False
        except Exception as e:
            print(f"Unexpected error: {e}")
            return False

    def filter_top_papers(self, threshold_general=2.0, threshold_agent=0.7):
        """Rank self.raw_papers and keep the top 2 in self.papers.

        The threshold parameters are kept for interface compatibility; the
        score-threshold filter is currently disabled (all papers are ranked).
        Papers whose title mentions "agent" get a 3x score boost.
        """
        # NOTE: the original looped over raw_papers computing an unused
        # per-paper score (the threshold filter was commented out); that dead
        # work is removed — ranking below is unchanged.
        self.papers = sorted(
            self.raw_papers,
            key=lambda x: self.calculate_rising_score(x) * (3 if 'agent' in x['title'].lower() else 1),
            reverse=True
        )[:2]
        return self.papers

    def get_paper_text(self, paper_id):
        """Download the arXiv PDF for `paper_id` and return its extracted text.

        Raises Exception if the download does not return HTTP 200.
        """
        url = f"https://arxiv.org/pdf/{paper_id}.pdf"
        response = requests.get(url)

        if response.status_code != 200:
            raise Exception(f"Failed to download PDF: {response.status_code}")

        # BUG FIX: original wrote a literal "temp.pdf" into the CWD and never
        # removed it (races between concurrent calls, leaks the file). Use a
        # real temporary file and always clean it up.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            tmp.write(response.content)
            tmp_path = tmp.name
        try:
            # Extract text using PyMuPDF
            with PyMuPDF.open(tmp_path) as doc:
                return "".join(page.get_text() for page in doc)
        finally:
            os.remove(tmp_path)

    def get_top_content(self):
        """Fetch, rank, and return {paper_title: full_text} for the top papers."""
        self.fetch_papers()
        self.filter_top_papers()
        contents = {}
        print(f"Processing {len(self.papers)} papers:")
        for paper in tqdm(self.papers):
            paper_id = paper["paper"]['id']
            # BUG FIX: original called self.get_paper_content, which is
            # commented out in this module -> AttributeError at runtime.
            contents[paper["paper"]['title']] = self.get_paper_text(paper_id)
        return contents
requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
- git+https://github.com/nari-labs/dia.git
2
  huggingface_hub
3
- transformers
 
 
1
+ kokoro
2
  huggingface_hub
3
+ transformers
4
+ PyMuPDF