import gradio as gr
import arxiv
import traceback
import logging
from typing import List, Dict, Any
from datasets import load_dataset, Dataset
from huggingface_hub import HfApi
from config import DATASET_NAME
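
# NOTE: config.py is not included in this file; it is assumed to expose DATASET_NAME
# as the Hugging Face dataset repo id, e.g. (hypothetical value):
#     DATASET_NAME = "your-username/arxiv-metadata"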

# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Arxiv Fetcher logic
def fetch_metadata(query: str, max_results: int = 10) -> List[Dict[str, Any]]:
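    """Query the arXiv API and return metadata dicts for up to max_results matching papers."""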
    logging.info(f"Fetching arXiv metadata for query: {query}")
    if not query.strip():
        logging.warning("Empty or whitespace-only query provided")
        return []
    
    client = arxiv.Client(page_size=max_results, delay_seconds=3, num_retries=3)
    search = arxiv.Search(query=query, max_results=max_results, sort_by=arxiv.SortCriterion.SubmittedDate)
    
    results = []
    try:
        for result in client.results(search):
            metadata = {
                "title": result.title,
                "authors": [author.name for author in result.authors],
                "published": result.published.isoformat(),
                "updated": result.updated.isoformat(),
                "pdf_url": result.pdf_url,
                "entry_id": result.entry_id,
                "summary": result.summary,
                "categories": result.categories,
                "primary_category": result.primary_category,
                "html_url": f"http://arxiv.org/abs/{result.entry_id.split('/')[-1]}"
            }
            results.append(metadata)
        logging.info(f"Fetched metadata for {len(results)} papers")
    except Exception as e:
        logging.error(f"Error fetching metadata: {str(e)}")
    
    return results
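
# Usage sketch (illustrative only, hypothetical query string):
#     papers = fetch_metadata("cat:cs.CL AND abs:summarization", max_results=5)
#     for p in papers:
#         print(p["title"], p["pdf_url"])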

# Arxiv Metadata Service logic
class ArxivMetadataService:
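    """Fetches arXiv metadata and keeps the configured Hugging Face dataset in sync with it."""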
    def __init__(self):
        self.hf_api = HfApi()

    def extract_metadata_and_update_dataset(self, query: str, max_results: int = 10) -> str:
        metadata_list = fetch_metadata(query, max_results)
        if not metadata_list:
            return "No metadata found for the given query."
        return self.update_dataset(metadata_list)

    def update_dataset(self, metadata_list: List[Dict[str, Any]]) -> str:
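        """Merge the given paper metadata into the Hub dataset, adding new rows and updating existing ones."""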
        try:
            # Load the existing dataset
            try:
                dataset = load_dataset(DATASET_NAME, split="train")
                current_data = dataset.to_dict()
            except Exception:
                # If loading fails, start with an empty dictionary
                current_data = {}

            # If the dataset is empty, initialize it with the structure from metadata_list
            if not current_data:
                current_data = {key: [] for key in metadata_list[0].keys()}

            updated = False
            # Stored entry_id values are full URLs (e.g. "http://arxiv.org/abs/2301.00001v1"),
            # so compare on the short arXiv ID to avoid re-adding papers that already exist.
            existing_ids = [eid.split('/')[-1] for eid in current_data.get('entry_id', [])]
            for paper in metadata_list:
                entry_id = paper['entry_id'].split('/')[-1]
                if entry_id not in existing_ids:
                    # Add new paper
                    for key, value in paper.items():
                        current_data.setdefault(key, []).append(value)
                    existing_ids.append(entry_id)
                    updated = True
                else:
                    # Update existing paper in place
                    index = existing_ids.index(entry_id)
                    for key, value in paper.items():
                        if current_data[key][index] != value:
                            current_data[key][index] = value
                            updated = True

            if updated:
                updated_dataset = Dataset.from_dict(current_data)
                updated_dataset.push_to_hub(DATASET_NAME, split="train")
                return f"Successfully updated dataset with {len(metadata_list)} papers"
            else:
                return "No new data to update."
        except Exception as e:
            logging.error(f"Failed to update dataset: {str(e)}")
            return f"Failed to update dataset: {str(e)}"

    def get_dataset_records(self):
        try:
            dataset = load_dataset(DATASET_NAME, split="train")
            records = dataset.to_pandas().to_dict(orient="records")
            return records
        except Exception as e:
            return f"Error loading dataset: {str(e)}"

# Initialize Arxiv Metadata Service
arxiv_service = ArxivMetadataService()
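
# Sketch of driving the service without the UI (assumes Hugging Face credentials are
# already configured, e.g. via `huggingface-cli login`; query string is hypothetical):
#     arxiv_service.extract_metadata_and_update_dataset("cat:cs.LG", max_results=5)
#     arxiv_service.get_dataset_records()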

# Define Gradio functions
def handle_metadata_extraction(query: str, max_results: int):
    try:
        result = arxiv_service.extract_metadata_and_update_dataset(query, max_results)
        logging.info(f"Extraction result: {result}")
        return result
    except Exception as e:
        error_msg = f"An error occurred: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        logging.error(error_msg)
        return error_msg

def handle_dataset_view():
    try:
        records = arxiv_service.get_dataset_records()
        return records
    except Exception as e:
        return f"Error loading dataset: {str(e)}"

# Define Gradio interface
with gr.Blocks() as demo:
    gr.Markdown(
        f"""Extract metadata from ArXiv papers and update the dataset.
        \n\nCurrently leverages the following datasets:
        \n- [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME}/viewer) dataset.
        """
    )
    
    with gr.Tab("Extract Metadata"):
        query_input = gr.Textbox(label="ArXiv Query")
        max_results = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Max Results")
        submit_button = gr.Button("Extract Metadata")
        output = gr.Textbox(label="Result")
        
        submit_button.click(
            fn=handle_metadata_extraction,
            inputs=[query_input, max_results],
            outputs=output
        )
    
    with gr.Tab("View Dataset"):
        refresh_button = gr.Button("Refresh Dataset Info")
        dataset_info = gr.JSON(label="Dataset Info")
        
        refresh_button.click(
            fn=handle_dataset_view,
            inputs=[],
            outputs=dataset_info
        )

if __name__ == "__main__":
    demo.queue()
    demo.launch()