import gradio as gr
import arxiv
import traceback
import logging
from typing import List, Dict, Any
from datasets import load_dataset, Dataset
from huggingface_hub import HfApi
from config import DATASET_NAME
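# The config module is assumed to expose the target Hub dataset repo id.
# A minimal sketch of config.py under that assumption (the repo id below
# is a hypothetical placeholder, not this Space's actual value):
#
#     DATASET_NAME = "your-username/arxiv-metadata"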
# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Arxiv Fetcher logic
def fetch_metadata(query: str, max_results: int = 10) -> List[Dict[str, Any]]:
    """Fetch paper metadata from arXiv for the given query, newest first."""
    logging.info(f"Fetching arXiv metadata for query: {query}")
    if not query.strip():
        logging.warning("Empty or whitespace-only query provided")
        return []
    client = arxiv.Client(page_size=max_results, delay_seconds=3, num_retries=3)
    search = arxiv.Search(query=query, max_results=max_results, sort_by=arxiv.SortCriterion.SubmittedDate)
    results = []
    try:
        for result in client.results(search):
            metadata = {
                "title": result.title,
                "authors": [author.name for author in result.authors],
                "published": result.published.isoformat(),
                "updated": result.updated.isoformat(),
                "pdf_url": result.pdf_url,
                "entry_id": result.entry_id,
                "summary": result.summary,
                "categories": result.categories,
                "primary_category": result.primary_category,
                "html_url": f"https://arxiv.org/abs/{result.entry_id.split('/')[-1]}"
            }
            results.append(metadata)
        logging.info(f"Fetched metadata for {len(results)} papers")
    except Exception as e:
        logging.error(f"Error fetching metadata: {str(e)}")
    return results
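# Illustrative usage (the query string follows arXiv API search syntax; the
# printed fields mirror the metadata dict built above):
#
#     papers = fetch_metadata('ti:"graph neural networks"', max_results=5)
#     for paper in papers:
#         print(paper["published"], paper["title"])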
# Arxiv Metadata Service logic
class ArxivMetadataService:
    def __init__(self):
        self.hf_api = HfApi()

    def extract_metadata_and_update_dataset(self, query: str, max_results: int = 10) -> str:
        metadata_list = fetch_metadata(query, max_results)
        if not metadata_list:
            return "No metadata found for the given query."
        return self.update_dataset(metadata_list)
    def update_dataset(self, metadata_list: List[Dict[str, Any]]) -> str:
        """Merge fetched papers into the Hub dataset, deduplicating by arXiv id."""
        try:
            # Load the existing dataset as column-oriented data
            try:
                dataset = load_dataset(DATASET_NAME, split="train")
                current_data = dataset.to_dict()
            except Exception:
                # If loading fails (e.g. the dataset does not exist yet), start empty
                current_data = {}

            # If the dataset is empty, initialize its columns from the first paper's keys
            if not current_data:
                current_data = {key: [] for key in metadata_list[0].keys()}

            updated = False
            # Stored entry_id values are full URLs, so compare on the same
            # versioned id tail (e.g. "2101.00001v1") used for incoming papers
            existing_ids = [eid.split('/')[-1] for eid in current_data.get('entry_id', [])]
            for paper in metadata_list:
                entry_id = paper['entry_id'].split('/')[-1]
                if entry_id not in existing_ids:
                    # Add new paper (every paper dict shares the same keys,
                    # so the columns stay equal in length)
                    for key, value in paper.items():
                        current_data.setdefault(key, []).append(value)
                    existing_ids.append(entry_id)
                    updated = True
                else:
                    # Update the existing row in place
                    index = existing_ids.index(entry_id)
                    for key, value in paper.items():
                        if current_data[key][index] != value:
                            current_data[key][index] = value
                            updated = True

            if updated:
                updated_dataset = Dataset.from_dict(current_data)
                updated_dataset.push_to_hub(DATASET_NAME, split="train")
                return f"Successfully updated dataset with {len(metadata_list)} papers"
            else:
                return "No new data to update."
        except Exception as e:
            logging.error(f"Failed to update dataset: {str(e)}")
            return f"Failed to update dataset: {str(e)}"
    def get_dataset_records(self):
        try:
            dataset = load_dataset(DATASET_NAME, split="train")
            records = dataset.to_pandas().to_dict(orient="records")
            return records
        except Exception as e:
            return f"Error loading dataset: {str(e)}"
# Initialize Arxiv Metadata Service
arxiv_service = ArxivMetadataService()

# Define Gradio functions
def handle_metadata_extraction(query: str, max_results: int):
    try:
        # Gradio sliders deliver floats, so cast before passing along
        result = arxiv_service.extract_metadata_and_update_dataset(query, int(max_results))
        logging.info(f"Extraction result: {result}")
        return result
    except Exception as e:
        error_msg = f"An error occurred: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        logging.error(error_msg)
        return error_msg
def handle_dataset_view():
    try:
        records = arxiv_service.get_dataset_records()
        return records
    except Exception as e:
        return f"Error loading dataset: {str(e)}"
# Define Gradio interface
with gr.Blocks() as demo:
    gr.Markdown(
        f"""Extract metadata from arXiv papers and update the dataset.
        \n\nCurrently writes to the following dataset:
        \n- [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME}/viewer)
        """
    )
    with gr.Tab("Extract Metadata"):
        query_input = gr.Textbox(label="ArXiv Query")
        max_results = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Max Results")
        submit_button = gr.Button("Extract Metadata")
        output = gr.Textbox(label="Result")
        submit_button.click(
            fn=handle_metadata_extraction,
            inputs=[query_input, max_results],
            outputs=output
        )
    with gr.Tab("View Dataset"):
        refresh_button = gr.Button("Refresh Dataset Info")
        dataset_info = gr.JSON(label="Dataset Info")
        refresh_button.click(
            fn=handle_dataset_view,
            inputs=[],
            outputs=dataset_info
        )
if __name__ == "__main__":
    demo.queue()
    demo.launch()
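# To run locally (assuming this file is the Space's app.py, the gradio, arxiv,
# datasets, and huggingface_hub packages are installed, and a token with write
# access to DATASET_NAME is available via `huggingface-cli login` or HF_TOKEN):
#
#     python app.py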