retrieval_metadata / initialize_dataset.py
donb-hf's picture
initialize dataset script
778b735
raw
history blame
530 Bytes
from datasets import Dataset
from config import DATASET_NAME
import huggingface_hub
# Column names of the dataset, listed in the order they should appear.
_COLUMN_NAMES = (
    "id",
    "title",
    "authors",
    "published",
    "updated",
    "pdf_url",
    "entry_id",
    "summary",
    "categories",
    "primary_category",
    "html_url",
)

# Seed mapping with zero rows: each column gets its own, separate empty list.
initial_data = {column: [] for column in _COLUMN_NAMES}
# Build a Dataset object from the (row-less) column mapping.
# NOTE(review): every column is an empty list and no explicit `features` is
# passed, so column types are left to inference -- confirm the resulting
# schema is compatible with the records that get added later.
dataset = Dataset.from_dict(mapping=initial_data)

# Upload it to the Hub repository named by DATASET_NAME as the "train" split.
dataset.push_to_hub(repo_id=DATASET_NAME, split="train")