push chat
Files changed:
- app.py (+25, -9)
- df/PaperCentral.py (+18, -1)
- paper_chat_tab.py (+281, -0)
- requirements.txt (+14, -0)
- style.css (+63, -1)
app.py
CHANGED
@@ -10,6 +10,8 @@ import json
 import requests
 
 from author_leaderboard_contrib_tab import author_resource_leaderboard_tab
+from paper_chat_tab import paper_chat_tab
+
 from zoneinfo import ZoneInfo  # Available in Python 3.9 and later
 
 # Initialize the PaperCentral class instance
@@ -60,6 +62,9 @@ with gr.Blocks(css_paths="style.css") as demo:
         with gr.Column(scale=1):
             with gr.Accordion(label="⭐Release notes", open=False):
                 gr.Markdown("""
+                - **November 21, 2024** – NeurIPS D&B 2024 proceedings added.
+                - **November 20, 2024** – NeurIPS 2024 proceedings added.
+                - **November 15, 2024** – EMNLP 2024 proceedings added.
                 - **October 24, 2024** – CoRL 2024 proceedings added.
                 - **October 20, 2024** – You can now add or edit papers.
                 - **October 19, 2024** – Papers with github now have github stars.
@@ -182,6 +187,12 @@ with gr.Blocks(css_paths="style.css") as demo:
            author_resource_leaderboard_tab()
 
 
+        with gr.Tab("Chat With Paper", id="tab-chat-with-paper"):
+            gr.Markdown("## Chat with Paper")
+            arxiv_id = gr.State(value=None)
+            paper_chat_tab(arxiv_id)
+
+
     # Define function to move to the next day
     def go_to_next_day(
             date: Union[str, datetime],
@@ -468,13 +479,14 @@ with gr.Blocks(css_paths="style.css") as demo:
        date_range = gr.update(value=None)
        conferences = gr.update(value=[])
        hf_options = gr.update(value=[])
-
+       selected_tab = gr.Tabs()
+       paper_id = gr.update(value=None)
 
        if request:
-           print("Request headers dictionary:", dict(request.headers))
-           print("IP address:", request.client.host)
-           print("Query parameters:", dict(request.query_params))
-           print("Session hash:", request.session_hash)
+           # print("Request headers dictionary:", dict(request.headers))
+           # print("IP address:", request.client.host)
+           # print("Query parameters:", dict(request.query_params))
+           # print("Session hash:", request.session_hash)
 
            if 'date' in request.query_params:
                calendar = gr.update(value=request.query_params['date'])
@@ -502,9 +514,13 @@ with gr.Blocks(css_paths="style.css") as demo:
            if "tab" in request.query_params:
                tab = request.query_params['tab']
                if tab == "tab-leaderboards":
+                   selected_tab = gr.Tabs(selected="tab-leaderboards")
+               elif tab == "tab-chat-with-paper":
+                   selected_tab = gr.Tabs(selected="tab-chat-with-paper")
+                   if "paper_id" in request.query_params:
+                       paper_id = request.query_params['paper_id']
 
-       return calendar, date_range, conferences, hf_options,
+       return calendar, date_range, conferences, hf_options, selected_tab, paper_id
 
 
    demo.load(
@@ -514,7 +530,7 @@ with gr.Blocks(css_paths="style.css") as demo:
        api_name="update_data",
    ).then(
        fn=echo,
-       outputs=[calendar, date_range_radio, conference_options, hf_options, tabs],
+       outputs=[calendar, date_range_radio, conference_options, hf_options, tabs, arxiv_id],
        api_name=False,
    ).then(
        # New then to handle LoginButton and HTML components
@@ -529,7 +545,7 @@ def main():
    """
    Launches the Gradio app.
    """
-   demo.launch()
+   demo.launch(ssr_mode=False)
 
 
 # Run the main function when the script is executed
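The tab routing added here reduces to a small self-contained pattern. The sketch below is illustrative rather than part of the commit; the tab ids and handler name are placeholders, and it assumes Gradio 5.x behavior where returning gr.Tabs(selected=...) from a demo.load handler switches the active tab.

import gradio as gr

# Minimal sketch: read ?tab=... from the request on page load and
# switch the selected tab accordingly. Ids are placeholders.
with gr.Blocks() as demo:
    with gr.Tabs() as tabs:
        with gr.Tab("Papers", id="tab-papers"):
            gr.Markdown("Papers tab")
        with gr.Tab("Chat With Paper", id="tab-chat-with-paper"):
            gr.Markdown("Chat tab")

    def route(request: gr.Request):
        selected = gr.Tabs()  # default: leave the selection unchanged
        if request and "tab" in request.query_params:
            selected = gr.Tabs(selected=request.query_params["tab"])
        return selected

    demo.load(fn=route, outputs=[tabs], api_name=False)

demo.launch()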
df/PaperCentral.py
CHANGED
@@ -15,7 +15,7 @@ import gradio as gr
 from utils import load_and_process
 import numpy as np
 from datetime import datetime, timedelta
-
+import re
 
 class PaperCentral:
     """
@@ -53,6 +53,7 @@ class PaperCentral:
     ]
 
     COLUMNS_ORDER_PAPER_PAGE: List[str] = [
+        'chat_with_paper',
         'date',
         'arxiv_id',
         'paper_page',
@@ -90,6 +91,7 @@ class PaperCentral:
         'authors': 'str',
         'github_stars': 'number',
         'project_page': 'markdown',
+        'chat_with_paper': 'markdown',
     }
 
     # Mapping for renaming columns for display purposes
@@ -101,6 +103,7 @@ class PaperCentral:
         'github_stars': 'GitHub⭐',
         'num_comments': '💬',
         'upvotes': '👍',
+        'chat_with_paper': 'Chat',
     }
 
     def __init__(self):
@@ -475,6 +478,20 @@ class PaperCentral:
             )
             filtered_df = filtered_df[conference_filter]
 
+            if any(conf in ["NeurIPS2024 D&B", "NeurIPS2024"] for conf in conference_options):
+                def create_chat_link(row):
+                    neurips_id = re.search(r'id=([^&]+)', row["proceedings"])
+                    if neurips_id:
+                        neurips_id = neurips_id.group(1)
+                        return f'<a href="/?tab=tab-chat-with-paper&paper_id={neurips_id}" id="custom_button" target="_blank" rel="noopener noreferrer" aria-disabled="false">✨ Chat with paper</a>'
+                    else:
+                        return ""
+
+                # Add the "chat_with_paper" column
+                filtered_df['chat_with_paper'] = filtered_df.apply(create_chat_link, axis=1)
+                if 'chat_with_paper' not in columns_to_show:
+                    columns_to_show.append('chat_with_paper')
+
         # Prettify the DataFrame
         filtered_df = self.prettify(filtered_df)
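The chat link is plain HTML rendered through the dataframe's markdown column, and the id comes straight out of the proceedings URL. A standalone check of that extraction, using a made-up URL:

import re

# Hypothetical proceedings URL containing an OpenReview-style id parameter
proceedings_url = "https://openreview.net/forum?id=AbCdEf123&noteId=xyz"

match = re.search(r'id=([^&]+)', proceedings_url)
if match:
    neurips_id = match.group(1)  # "AbCdEf123"
    link = (f'<a href="/?tab=tab-chat-with-paper&paper_id={neurips_id}" '
            f'id="custom_button">✨ Chat with paper</a>')
    print(link)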
paper_chat_tab.py
ADDED
@@ -0,0 +1,281 @@
+import gradio as gr
+from PyPDF2 import PdfReader
+from bs4 import BeautifulSoup
+
+import requests
+from io import BytesIO
+from transformers import AutoTokenizer
+
+import os
+from openai import OpenAI
+
+# Cache for tokenizers to avoid reloading
+tokenizer_cache = {}
+
+
+# Function to fetch paper information from OpenReview
+def fetch_paper_info_neurips(paper_id):
+    url = f"https://openreview.net/forum?id={paper_id}"
+    response = requests.get(url)
+    if response.status_code != 200:
+        return None
+
+    html_content = response.content
+    soup = BeautifulSoup(html_content, 'html.parser')
+
+    # Extract title
+    title_tag = soup.find('h2', class_='citation_title')
+    title = title_tag.get_text(strip=True) if title_tag else 'Title not found'
+
+    # Extract authors
+    authors = []
+    author_div = soup.find('div', class_='forum-authors')
+    if author_div:
+        author_tags = author_div.find_all('a')
+        authors = [tag.get_text(strip=True) for tag in author_tags]
+    author_list = ', '.join(authors) if authors else 'Authors not found'
+
+    # Extract abstract
+    abstract_div = soup.find('strong', text='Abstract:')
+    if abstract_div:
+        abstract_paragraph = abstract_div.find_next_sibling('div')
+        abstract = abstract_paragraph.get_text(strip=True) if abstract_paragraph else 'Abstract not found'
+    else:
+        abstract = 'Abstract not found'
+
+    # Construct preamble in Markdown
+    # preamble = f"**[{title}](https://openreview.net/forum?id={paper_id})**\n\n{author_list}\n\n**Abstract:**\n{abstract}"
+    preamble = f"**[{title}](https://openreview.net/forum?id={paper_id})**\n\n{author_list}\n\n"
+
+    return preamble
+
+
+def fetch_paper_content(paper_id):
+    try:
+        # Construct the URL
+        url = f"https://openreview.net/pdf?id={paper_id}"
+
+        # Fetch the PDF
+        response = requests.get(url)
+        response.raise_for_status()  # Raise an exception for HTTP errors
+
+        # Read the PDF content
+        pdf_content = BytesIO(response.content)
+        reader = PdfReader(pdf_content)
+
+        # Extract text from the PDF
+        text = ""
+        for page in reader.pages:
+            text += page.extract_text()
+
+        return text  # Return full text; truncation will be handled later
+
+    except Exception as e:
+        print(f"An error occurred: {e}")
+        return None
+
+
+def paper_chat_tab(paper_id):
+    with gr.Blocks() as demo:
+        with gr.Column():
+            # Markdown component to display the paper title and authors
+            content = gr.Markdown(value="")
+
+            # Preamble message to hint the user
+            gr.Markdown("**Note:** Providing your own SambaNova token can help you avoid rate limits.")
+
+            # Input for SambaNova API token
+            hf_token_input = gr.Textbox(
+                label="Enter your SambaNova token (optional)",
+                type="password",
+                placeholder="Enter your SambaNova token to avoid rate limits"
+            )
+
+            models = [
+                "Meta-Llama-3.1-8B-Instruct",
+                "Meta-Llama-3.1-70B-Instruct",
+                "Meta-Llama-3.1-405B-Instruct",
+            ]
+
+            default_model = models[-1]
+
+            # Dropdown for selecting the model
+            model_dropdown = gr.Dropdown(
+                label="Select Model",
+                choices=models,
+                value=default_model
+            )
+
+            # State to store the paper content
+            paper_content = gr.State()
+
+            # Create a column for each model, only visible if it's the default model
+            columns = []
+            for model_name in models:
+                column = gr.Column(visible=(model_name == default_model))
+                with column:
+                    chatbot = create_chat_interface(model_name, paper_content, hf_token_input)
+                columns.append(column)
+            gr.HTML(
+                '<img src="https://venturebeat.com/wp-content/uploads/2020/02/SambaNovaLogo_H_F.jpg" width="100px" />')
+            gr.Markdown("**Note:** This model is supported by SambaNova.")
+
+            # Update visibility of columns based on the selected model
+            def update_columns(selected_model):
+                visibility = []
+                for model_name in models:
+                    is_visible = model_name == selected_model
+                    visibility.append(gr.update(visible=is_visible))
+                return visibility
+
+            model_dropdown.change(
+                fn=update_columns,
+                inputs=model_dropdown,
+                outputs=columns,
+                api_name=False,
+                queue=False,
+            )
+
+            # Function to update the content Markdown and paper_content when paper ID or model changes
+            def update_paper_info(paper_id, selected_model):
+                preamble = fetch_paper_info_neurips(paper_id)
+                text = fetch_paper_content(paper_id)
+                if text is None:
+                    return preamble, None
+
+                return preamble, text
+
+            # Update paper content when paper ID or model changes
+            paper_id.change(
+                fn=update_paper_info,
+                inputs=[paper_id, model_dropdown],
+                outputs=[content, paper_content]
+            )
+
+            model_dropdown.change(
+                fn=update_paper_info,
+                inputs=[paper_id, model_dropdown],
+                outputs=[content, paper_content],
+                queue=False,
+            )
+    return demo
+
+
+def create_chat_interface(model_name, paper_content, hf_token_input):
+    # Load tokenizer and cache it
+    if model_name not in tokenizer_cache:
+        # Load the tokenizer from Hugging Face
+        # tokenizer = AutoTokenizer.from_pretrained(model_name)
+        tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
+        tokenizer_cache[model_name] = tokenizer
+    else:
+        tokenizer = tokenizer_cache[model_name]
+
+    max_total_tokens = 50000  # Maximum tokens allowed
+
+    # Define the function to handle the chat
+    def get_fn(message, history, paper_content_value, hf_token_value):
+        # Include the paper content as context
+        if paper_content_value:
+            context = f"The following is the content of the paper:\n{paper_content_value}\n\n"
+        else:
+            context = ""
+
+        # Tokenize the context
+        context_tokens = tokenizer.encode(context)
+        context_token_length = len(context_tokens)
+
+        # Prepare the messages without context
+        messages = []
+        message_tokens_list = []
+        total_tokens = context_token_length  # Start with context tokens
+
+        for user_msg, assistant_msg in history:
+            # Tokenize user message
+            user_tokens = tokenizer.encode(user_msg)
+            messages.append({"role": "user", "content": user_msg})
+            message_tokens_list.append(len(user_tokens))
+            total_tokens += len(user_tokens)
+
+            # Tokenize assistant message
+            if assistant_msg:
+                assistant_tokens = tokenizer.encode(assistant_msg)
+                messages.append({"role": "assistant", "content": assistant_msg})
+                message_tokens_list.append(len(assistant_tokens))
+                total_tokens += len(assistant_tokens)
+
+        # Tokenize the new user message
+        message_tokens = tokenizer.encode(message)
+        messages.append({"role": "user", "content": message})
+        message_tokens_list.append(len(message_tokens))
+        total_tokens += len(message_tokens)
+
+        # Check if total tokens exceed the maximum allowed tokens
+        if total_tokens > max_total_tokens:
+            # Attempt to truncate the context first
+            available_tokens = max_total_tokens - (total_tokens - context_token_length)
+            if available_tokens > 0:
+                # Truncate the context to fit the available tokens
+                truncated_context_tokens = context_tokens[:available_tokens]
+                context = tokenizer.decode(truncated_context_tokens)
+                context_token_length = available_tokens
+                total_tokens = total_tokens - len(context_tokens) + context_token_length
+            else:
+                # Not enough space for context; remove it
+                context = ""
+                total_tokens -= context_token_length
+                context_token_length = 0
+
+        # If total tokens still exceed the limit, truncate the message history
+        while total_tokens > max_total_tokens and len(messages) > 1:
+            # Remove the oldest message
+            removed_message = messages.pop(0)
+            removed_tokens = message_tokens_list.pop(0)
+            total_tokens -= removed_tokens
+
+        # Rebuild the final messages list including the (possibly truncated) context
+        final_messages = []
+        if context:
+            final_messages.append({"role": "system", "content": context})
+        final_messages.extend(messages)
+
+        # Use the SambaNova token if provided
+        api_key = hf_token_value or os.environ.get("SAMBANOVA_API_KEY")
+        if not api_key:
+            raise ValueError("API token is not provided.")
+
+        # Initialize the OpenAI client
+        client = OpenAI(
+            base_url="https://api.sambanova.ai/v1/",
+            api_key=api_key,
+        )
+
+        try:
+            # Create the chat completion
+            completion = client.chat.completions.create(
+                model=model_name,
+                messages=final_messages,
+                stream=True,
+            )
+            response_text = ""
+            for chunk in completion:
+                delta = chunk.choices[0].delta.content or ""
+                response_text += delta
+                yield response_text
+        except Exception as e:
+            error_message = f"Error: {str(e)}"
+            yield error_message
+
+    # Create the ChatInterface
+    chat_interface = gr.ChatInterface(
+        fn=get_fn,
+        chatbot=gr.Chatbot(
+            label="Chatbot",
+            scale=1,
+            height=400,
+            autoscroll=True
+        ),
+        additional_inputs=[paper_content, hf_token_input],
+        # examples=["What are the main findings of this paper?", "Explain the methodology used in this research."]
+    )
+    return chat_interface
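The budget handling in get_fn interleaves tokenization with trimming, which makes the policy harder to see. Reduced to its essentials it is: trim the paper context to whatever room the history leaves, then drop the oldest turns until everything fits. A sketch of that policy, with plain token lists and counts standing in for real tokenizer output:

# Reduced sketch of the truncation policy in get_fn; token lists and
# per-message counts stand in for real tokenizer output.
def fit_budget(context_tokens, history_token_counts, budget=50000):
    used = sum(history_token_counts)
    # Trim the context to whatever room the history leaves over
    context_tokens = context_tokens[:max(0, budget - used)]
    # Then drop the oldest history entries until everything fits
    while used + len(context_tokens) > budget and history_token_counts:
        used -= history_token_counts.pop(0)
    return context_tokens, history_token_counts

ctx, hist = fit_budget(list(range(60000)), [1000, 2000, 3000])
assert len(ctx) + sum(hist) <= 50000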
requirements.txt
CHANGED
@@ -2,3 +2,17 @@ gradio==5.6.0
 gradio_calendar
 datasets
 scholarly
+arxiv
+PyPDF2
+transformers
+beautifulsoup4
+# Set the primary index URL to PyTorch's CPU wheels
+--index-url https://download.pytorch.org/whl/cpu
+
+# Ensure PyPI is still accessible for other packages
+--extra-index-url https://pypi.org/simple
+
+# List all your packages
+torch
+torchvision
+torchaudio
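One detail worth knowing about this layout: pip treats --index-url and --extra-index-url as options for the install as a whole, wherever they appear in a requirements file, so the torch packages resolve against the CPU wheel index while the remaining packages can still come from PyPI.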
style.css
CHANGED
@@ -57,4 +57,66 @@ body a:hover {
     height: 1.38rem;
     overflow: hidden;
     border-radius: 9999px;
-}
+}
+
+
+/* CSS Variables for Button Styling */
+:root {
+    /* Border and Padding */
+    --button-border-width: 0px;
+    --button-small-padding: 8px 12px; /* Example values */
+    --button-small-radius: 4px; /* Example values */
+
+    /* Colors */
+    --button-secondary-border-color: #e5e7eb; /* Example neutral-200 */
+    --button-secondary-background-fill: #f3f4f6; /* Example neutral-200 */
+    --button-secondary-background-fill-hover: #d1d5db; /* Example neutral-300 */
+    --button-secondary-text-color: #000000;
+    --button-secondary-text-color-hover: #000000;
+
+    /* Typography */
+    --button-small-text-size: 14px; /* Example text-sm */
+    --button-small-text-weight: 400;
+
+    /* Shadows and Transitions */
+    --button-secondary-shadow: none;
+    --button-secondary-shadow-hover: none;
+    --button-secondary-shadow-active: none;
+    --button-transition: all 0.2s ease;
+}
+
+/* Custom Button Styles */
+#custom_button {
+    display: inline-flex;
+    align-items: center;
+    justify-content: center;
+    border: var(--button-border-width) solid var(--button-secondary-border-color);
+    background: var(--button-secondary-background-fill);
+    color: var(--button-secondary-text-color);
+    padding: var(--button-small-padding);
+    border-radius: var(--button-small-radius);
+    font-size: var(--button-small-text-size);
+    font-weight: var(--button-small-text-weight);
+    text-decoration: none;
+    box-shadow: var(--button-secondary-shadow);
+    transition: var(--button-transition);
+}
+
+#custom_button:hover {
+    background: var(--button-secondary-background-fill-hover);
+    border-color: var(--button-secondary-border-color-hover);
+    color: var(--button-secondary-text-color-hover);
+    box-shadow: var(--button-secondary-shadow-hover);
+}
+
+#custom_button:active {
+    box-shadow: var(--button-secondary-shadow-active);
+}
+
+/* Icon Styling */
+#custom_button .button-icon {
+    margin-right: 8px; /* Adjust spacing between icon and text as needed */
+    width: 20px; /* Adjust icon size as needed */
+    height: 20px; /* Adjust icon size as needed */
+}
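These rules target the #custom_button id on the anchor emitted by create_chat_link in df/PaperCentral.py, so the "✨ Chat with paper" link renders as a small button inside the dataframe rather than as a bare hyperlink.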