Spaces:
Paused
Paused
from arxiv_fetcher import fetch_arxiv_metadata | |
from datasets import load_dataset, Dataset | |
from config import DATASET_NAME | |
import logging | |
from typing import List, Dict, Any | |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') | |
class ArxivMetadataService:
    """Fetch arXiv paper metadata and upsert it into the hub dataset ``DATASET_NAME``."""

    def extract_and_update(self, query: str, max_results: int = 10) -> str:
        """Fetch metadata for *query* and merge it into the dataset.

        Args:
            query: Search query forwarded to ``fetch_arxiv_metadata``.
            max_results: Result limit forwarded to ``fetch_arxiv_metadata``.

        Returns:
            The status message produced by :meth:`update_dataset`.
        """
        # NOTE(review): fetch_arxiv_metadata is assumed to return a list of
        # dicts, each carrying an 'id' key -- confirm against arxiv_fetcher.
        metadata_list = fetch_arxiv_metadata(query, max_results)
        return self.update_dataset(metadata_list)

    def update_dataset(self, metadata_list: List[Dict[str, Any]]) -> str:
        """Upsert *metadata_list* into the ``train`` split and push it to the hub.

        Papers whose ``'id'`` already exists in the dataset overwrite the
        matching row; all other papers are appended as new rows.

        Args:
            metadata_list: Paper records; every record must contain ``'id'``.

        Returns:
            A human-readable success or failure message. Failures are logged
            and reported in the message instead of being raised.
        """
        try:
            if not metadata_list:
                # Nothing to merge: skip the download/upload round trip.
                return f"Successfully updated dataset with {len(metadata_list)} papers"

            dataset = load_dataset(DATASET_NAME, split="train")
            merged = self._merge_papers(dataset.to_dict(), metadata_list)

            updated_dataset = Dataset.from_dict(merged)
            updated_dataset.push_to_hub(DATASET_NAME, split="train")
            return f"Successfully updated dataset with {len(metadata_list)} papers"
        except Exception as e:
            # Boundary handler: callers receive a status string, never an
            # exception. logging.exception keeps the traceback in the log.
            logging.exception("Failed to update dataset: %s", e)
            return f"Failed to update dataset: {str(e)}"

    @staticmethod
    def _merge_papers(
        current_data: Dict[str, List[Any]],
        metadata_list: List[Dict[str, Any]],
    ) -> Dict[str, List[Any]]:
        """Upsert papers into column-oriented *current_data* in place and return it.

        Every column is kept at the same length: cells a paper does not
        provide, and the older rows of a newly introduced column, are filled
        with ``None`` so the result can be turned back into a table.

        Raises:
            KeyError: If a paper has no ``'id'`` key.
        """
        row_count = max((len(column) for column in current_data.values()), default=0)

        # Map each id to its first row (same row list.index() would pick) so
        # lookups are O(1) instead of rescanning the id column per paper.
        row_of: Dict[Any, int] = {}
        for row, paper_id in enumerate(current_data.get('id', [])):
            row_of.setdefault(paper_id, row)

        for paper in metadata_list:
            paper_id = paper['id']
            row = row_of.get(paper_id)
            if row is None:
                # New paper: open one blank row across all existing columns.
                row = row_count
                row_count += 1
                row_of[paper_id] = row
                for column in current_data.values():
                    column.append(None)

            for key, value in paper.items():
                if key not in current_data:
                    # Column first seen now: back-fill earlier rows with None.
                    current_data[key] = [None] * row_count
                current_data[key][row] = value

        return current_data