m-ric HF Staff commited on
Commit
6cc0694
·
1 Parent(s): 8569025

Add paper management

Browse files
Files changed (3) hide show
  1. app.py +8 -2
  2. papers.py +117 -0
  3. requirements.txt +3 -2
app.py CHANGED
@@ -12,9 +12,15 @@ from huggingface_hub import InferenceClient
12
  from kokoro import KModel, KPipeline
13
 
14
  # -----------------------------------------------------------------------------
15
- # Hard‑coded podcast subject
16
  # -----------------------------------------------------------------------------
17
- PODCAST_SUBJECT = "The future of AI and its impact on society"
 
 
 
 
 
 
18
 
19
  # -----------------------------------------------------------------------------
20
  # LLM that writes the script (unchanged)
 
12
  from kokoro import KModel, KPipeline
13
 
14
# -----------------------------------------------------------------------------
# Get podcast subject
# -----------------------------------------------------------------------------
from papers import PaperManager

# BUG FIX: was bound as `paper_manage` but used as `paper_manager` (NameError).
paper_manager = PaperManager()
top_papers = paper_manager.get_top_content()
print("TOP PAPERS", top_papers)

# BUG FIX: get_top_content() returns {paper_title: full_text}; the original
# `top_papers[0]["paper_id"]` would raise KeyError(0) on a dict. Take the
# text of the top-ranked paper (dicts preserve insertion order, and papers
# are inserted in ranked order).
PODCAST_SUBJECT = next(iter(top_papers.values()))
24
 
25
  # -----------------------------------------------------------------------------
26
  # LLM that writes the script (unchanged)
papers.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import requests
import tempfile
from datetime import datetime, timezone
import base64
from tqdm.auto import tqdm
# BUG FIX: the pip package "PyMuPDF" installs its module as `fitz`
# (`pymupdf` only in >=1.24), so `import PyMuPDF` raises
# ModuleNotFoundError. Alias keeps the `PyMuPDF.open(...)` call sites working.
import fitz as PyMuPDF

# Hugging Face daily-papers listing endpoint (public JSON API).
DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers"
10
+
11
class PaperManager:
    """Fetch Hugging Face daily papers, rank them by a "rising" score,
    and extract the full text of the top papers from their arXiv PDFs."""

    def __init__(self, papers_per_page=30):
        # BUG FIX: papers_per_page was accepted but silently discarded.
        self.papers_per_page = papers_per_page
        self.papers = []       # ranked/filtered papers (set by filter_top_papers)
        self.raw_papers = []   # raw API payload (set by fetch_papers)

    def calculate_rising_score(self, paper):
        """
        Calculate the rising score of a paper.
        This emphasizes recent upvotes and the rate of upvote accumulation.

        A missing/unparseable `publishedAt` falls back to "now", i.e. a
        time-diff of ~0 hours, so the score degrades to the raw upvote count.
        """
        upvotes = paper.get('paper', {}).get('upvotes', 0)
        published_at_str = paper.get('publishedAt', datetime.now(timezone.utc).isoformat())
        try:
            published_time = datetime.fromisoformat(published_at_str.replace('Z', '+00:00'))
        except ValueError:
            published_time = datetime.now(timezone.utc)

        time_diff = datetime.now(timezone.utc) - published_time
        time_diff_hours = time_diff.total_seconds() / 3600  # convert to hours

        # Rising score favors papers gaining upvotes quickly; linear decay
        # over time, +1 in the denominator avoids division by zero.
        return upvotes / (time_diff_hours + 1)

    def fetch_papers(self):
        """Populate self.raw_papers from the daily-papers API.

        Returns True on success, False on any failure (logged, not raised).
        """
        try:
            response = requests.get(f"{DAILY_PAPERS_API_URL}?limit=100", timeout=30)
            response.raise_for_status()
            data = response.json()

            if not data:
                print("No data received from API.")
                return False

            self.raw_papers = data  # store raw data
            return True

        except requests.RequestException as e:
            print(f"Error fetching papers: {e}")
            return False
        except Exception as e:
            print(f"Unexpected error: {e}")
            return False

    def filter_top_papers(self, threshold_general=2.0, threshold_agent=0.7):
        """Rank self.raw_papers and keep the top 2 in self.papers.

        The threshold parameters are kept for interface compatibility; the
        score-threshold filter is currently disabled (all papers are ranked).
        Papers whose title mentions "agent" get a 3x score boost.
        """
        # NOTE: the original looped over raw_papers computing an unused
        # per-paper score (the threshold filter was commented out); that dead
        # work is removed — ranking below is unchanged.
        self.papers = sorted(
            self.raw_papers,
            key=lambda x: self.calculate_rising_score(x) * (3 if 'agent' in x['title'].lower() else 1),
            reverse=True
        )[:2]
        return self.papers

    def get_paper_text(self, paper_id):
        """Download the arXiv PDF for `paper_id` and return its extracted text.

        Raises Exception if the download does not return HTTP 200.
        """
        url = f"https://arxiv.org/pdf/{paper_id}.pdf"
        response = requests.get(url)

        if response.status_code != 200:
            raise Exception(f"Failed to download PDF: {response.status_code}")

        # BUG FIX: original wrote a literal "temp.pdf" into the CWD and never
        # removed it (races between concurrent calls, leaks the file). Use a
        # real temporary file and always clean it up.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            tmp.write(response.content)
            tmp_path = tmp.name
        try:
            # Extract text using PyMuPDF
            with PyMuPDF.open(tmp_path) as doc:
                return "".join(page.get_text() for page in doc)
        finally:
            os.remove(tmp_path)

    def get_top_content(self):
        """Fetch, rank, and return {paper_title: full_text} for the top papers."""
        self.fetch_papers()
        self.filter_top_papers()
        contents = {}
        print(f"Processing {len(self.papers)} papers:")
        for paper in tqdm(self.papers):
            paper_id = paper["paper"]['id']
            # BUG FIX: original called self.get_paper_content, which is
            # commented out in this module -> AttributeError at runtime.
            contents[paper["paper"]['title']] = self.get_paper_text(paper_id)
        return contents
requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
- git+https://github.com/nari-labs/dia.git
2
  huggingface_hub
3
- transformers
 
 
1
+ kokoro
2
  huggingface_hub
3
+ transformers
4
+ PyMuPDF