import gradio as gr
import arxiv
import traceback
import logging
from typing import List, Dict, Any
from datasets import load_dataset, Dataset
from huggingface_hub import HfApi
from config import DATASET_NAME
# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Arxiv Fetcher logic
def fetch_metadata(query: str, max_results: int = 10) -> List[Dict[str, Any]]:
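    """Query the arXiv API and return a list of metadata dicts, one per matching paper."""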
logging.info(f"Fetching arXiv metadata for query: {query}")
if not query.strip():
logging.warning("Empty or whitespace-only query provided")
return []
client = arxiv.Client(page_size=max_results, delay_seconds=3, num_retries=3)
search = arxiv.Search(query=query, max_results=max_results, sort_by=arxiv.SortCriterion.SubmittedDate)
results = []
try:
for result in client.results(search):
metadata = {
"title": result.title,
"authors": [author.name for author in result.authors],
"published": result.published.isoformat(),
"updated": result.updated.isoformat(),
"pdf_url": result.pdf_url,
"entry_id": result.entry_id,
"summary": result.summary,
"categories": result.categories,
"primary_category": result.primary_category,
"html_url": f"http://arxiv.org/abs/{result.entry_id.split('/')[-1]}"
}
results.append(metadata)
logging.info(f"Fetched metadata for {len(results)} papers")
except Exception as e:
logging.error(f"Error fetching metadata: {str(e)}")
return results
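
# Example usage (hypothetical query string; any valid arXiv search syntax works):
#   fetch_metadata("cat:cs.CL AND all:retrieval", max_results=5)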
# Arxiv Metadata Service logic
class ArxivMetadataService:
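    """Service that pushes fetched arXiv metadata into the Hugging Face dataset named by DATASET_NAME."""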
    def __init__(self):
        self.hf_api = HfApi()

    def extract_metadata_and_update_dataset(self, query: str, max_results: int = 10) -> str:
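        """Fetch metadata for the query and push it to the Hub dataset, returning a status message."""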
        metadata_list = fetch_metadata(query, max_results)
        if not metadata_list:
            return "No metadata found for the given query."
        return self.update_dataset(metadata_list)

    def update_dataset(self, metadata_list: List[Dict[str, Any]]) -> str:
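        """Merge new papers into the existing dataset and update entries that are already present."""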
        try:
            # Load the existing dataset
            try:
                dataset = load_dataset(DATASET_NAME, split="train")
                current_data = dataset.to_dict()
            except Exception:
                # If loading fails, start with an empty dictionary
                current_data = {}

            # If the dataset is empty, initialize it with the structure from metadata_list
            if not current_data:
                current_data = {key: [] for key in metadata_list[0].keys()}

            updated = False
            # Compare on the bare arXiv ID, since the stored entry_id values are full URLs
            existing_ids = [eid.split('/')[-1] for eid in current_data.get('entry_id', [])]
            for paper in metadata_list:
                entry_id = paper['entry_id'].split('/')[-1]
                if entry_id not in existing_ids:
                    # Add new paper
                    for key, value in paper.items():
                        current_data.setdefault(key, []).append(value)
                    existing_ids.append(entry_id)
                    updated = True
                else:
                    # Update existing paper in place
                    index = existing_ids.index(entry_id)
                    for key, value in paper.items():
                        if current_data[key][index] != value:
                            current_data[key][index] = value
                            updated = True
            if updated:
                updated_dataset = Dataset.from_dict(current_data)
                updated_dataset.push_to_hub(DATASET_NAME, split="train")
                return f"Successfully updated dataset with {len(metadata_list)} papers"
            else:
                return "No new data to update."
        except Exception as e:
            logging.error(f"Failed to update dataset: {str(e)}")
            return f"Failed to update dataset: {str(e)}"

    def get_dataset_records(self):
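        """Return all dataset rows as a list of dicts, or an error string if loading fails."""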
        try:
            dataset = load_dataset(DATASET_NAME, split="train")
            records = dataset.to_pandas().to_dict(orient="records")
            return records
        except Exception as e:
            return f"Error loading dataset: {str(e)}"
# Initialize Arxiv Metadata Service
arxiv_service = ArxivMetadataService()
# Define Gradio functions
def handle_metadata_extraction(query: str, max_results: int):
    try:
        result = arxiv_service.extract_metadata_and_update_dataset(query, max_results)
        logging.info(f"Extraction result: {result}")
        return result
    except Exception as e:
        error_msg = f"An error occurred: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        logging.error(error_msg)
        return error_msg
def handle_dataset_view():
    try:
        records = arxiv_service.get_dataset_records()
        return records
    except Exception as e:
        return f"Error loading dataset: {str(e)}"
# Define Gradio interface
with gr.Blocks() as demo:
    gr.Markdown(
        f"""Extract metadata from arXiv papers and update the dataset.
\n\nCurrently leverages the following dataset:
\n- [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME}/viewer)
"""
    )
with gr.Tab("Extract Metadata"):
query_input = gr.Textbox(label="ArXiv Query")
max_results = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Max Results")
submit_button = gr.Button("Extract Metadata")
output = gr.Textbox(label="Result")
submit_button.click(
fn=handle_metadata_extraction,
inputs=[query_input, max_results],
outputs=output
)
with gr.Tab("View Dataset"):
refresh_button = gr.Button("Refresh Dataset Info")
dataset_info = gr.JSON(label="Dataset Info")
refresh_button.click(
fn=handle_dataset_view,
inputs=[],
outputs=dataset_info
)
if __name__ == "__main__":
    demo.queue()
    demo.launch()