import logging

from datasets import Dataset
from huggingface_hub import HfApi

from config import DATASET_NAME

# Configure the root logger once, at import time, with timestamped records.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)


def initialize_dataset():
    """Create an empty dataset and publish it to the Hub as the 'train' split.

    The dataset is built with one empty column per expected field and then
    pushed to the repository named by ``DATASET_NAME``.

    Raises:
        Exception: any error raised while pushing is logged and re-raised
            unchanged.
    """
    # Column names, in the order they appear in the created dataset.
    column_names = (
        "entry_id",
        "title",
        "authors",
        "published",
        "updated",
        "pdf_url",
        "summary",
        "categories",
        "primary_category",
        "html_url",
    )

    # Each column gets its own fresh empty list.
    # NOTE(review): no explicit feature types are declared for these empty
    # columns -- confirm the inferred schema suits whatever appends rows later.
    empty_columns = {name: [] for name in column_names}
    empty_dataset = Dataset.from_dict(empty_columns)

    try:
        # Upload the empty dataset so the repository exists with a 'train' split.
        empty_dataset.push_to_hub(DATASET_NAME, split="train")
        logging.info(f"Dataset {DATASET_NAME} initialized successfully with 'train' split.")
    except Exception as e:
        # Record the failure, then let the caller (or the interpreter) see it.
        logging.error(f"Failed to initialize dataset: {str(e)}")
        raise


if __name__ == "__main__":
    initialize_dataset()