# arxiv_metadata_service.py
"""Fetch arXiv metadata for a query and persist it to a Hugging Face dataset.

Running this module as a script executes ``test_arxiv_metadata_service``, a
manual smoke test that performs a real fetch-and-persist round trip.
"""

from typing import List, Dict, Any
import logging

from huggingface_dataset_manager import HuggingFaceDatasetManager
from arxiv_fetcher import fetch_arxiv_metadata

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


class ArxivMetadataService:
    """Glue between ``fetch_arxiv_metadata`` and a ``HuggingFaceDatasetManager``."""

    def __init__(self, dataset_name: str):
        """Create the dataset manager for ``dataset_name``.

        Args:
            dataset_name: Passed unchanged to ``HuggingFaceDatasetManager``.
        """
        self.dataset_manager = HuggingFaceDatasetManager(dataset_name)

    def fetch_and_persist_metadata(self, query: str, max_results: int = 10) -> bool:
        """Fetch metadata for ``query`` and hand it to the dataset manager.

        Args:
            query: Search query forwarded to ``fetch_arxiv_metadata``.
            max_results: Result limit forwarded to ``fetch_arxiv_metadata``.

        Returns:
            ``False`` when the fetch produced nothing (a warning is logged);
            otherwise whatever ``persist_to_dataset`` returns.
        """
        metadata_list = fetch_arxiv_metadata(query, max_results)
        if not metadata_list:
            # Nothing to write: skip the persist call entirely.
            logging.warning("No metadata fetched to persist.")
            return False
        return self.dataset_manager.persist_to_dataset(metadata_list)


def test_arxiv_metadata_service():
    """Manual smoke test: fetch a few papers, persist them, print dataset info.

    NOTE(review): this appears to hit live services through the imported
    helpers and writes to the dataset named below — confirm before running
    it in automated test collection.
    """
    # Use a test dataset name
    test_dataset_name = "dwb2023/arxiv-papers-dataset"
    service = ArxivMetadataService(test_dataset_name)

    # Test query
    test_query = "quantum computing"
    max_results = 5

    success = service.fetch_and_persist_metadata(test_query, max_results)
    if success:
        print(f"Successfully fetched and persisted metadata for query: '{test_query}'")
    else:
        print(f"Failed to fetch and persist metadata for query: '{test_query}'")

    # Get and print dataset info.
    # Fall back to an empty mapping so a None/empty result prints "N/A"
    # instead of raising AttributeError on ``.get``.
    info = service.dataset_manager.get_dataset_info() or {}
    print("\nDataset Info:")
    print(f"Number of rows: {info.get('num_rows', 'N/A')}")
    print(f"Features: {info.get('features', 'N/A')}")
    print(f"Last modified: {info.get('last_modified', 'N/A')}")


if __name__ == "__main__":
    test_arxiv_metadata_service()