import os
from datetime import datetime

import datasets
import pandas as pd
from huggingface_hub import HfApi

from config import BACKUP_FOLDER, HF_DATASET_REPO_ID, HF_TOKEN, RESULTS_CSV_FILE, CSV_HEADERS


def main():
"""
Gets the dataset from HF Hub where preferences are being collected,
save it locally to a backup folder with a timestamp.
Then creates an empty dataset with the same structure and saves it to the HF Hub.
"""
print(f"Attempting to load dataset '{HF_DATASET_REPO_ID}' from Hugging Face Hub (file: {RESULTS_CSV_FILE})...")
try:
# 1. Get the dataset from HF Hub
# Ensure the token has write permissions for pushing later.
dataset = datasets.load_dataset(HF_DATASET_REPO_ID, data_files=RESULTS_CSV_FILE, token=HF_TOKEN, split='train')
print(f"Successfully loaded dataset. It has {len(dataset)} entries.")
dataset_df = dataset.to_pandas()
except Exception as e:
print(f"Error loading dataset from Hugging Face Hub: {e}")
print("This could be due to the dataset/file not existing, or token issues.")
print("Attempting to proceed by creating an empty structure for backup and remote reset.")
# If loading fails, we might still want to try to clear the remote
# or at least create an empty local backup structure.
dataset_df = pd.DataFrame(columns=CSV_HEADERS) # Use predefined headers
    # 2. Save it locally to a backup folder with a timestamp.
    if not os.path.exists(BACKUP_FOLDER):
        os.makedirs(BACKUP_FOLDER)
        print(f"Created backup folder: {BACKUP_FOLDER}")
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    backup_filename = f"preferences_backup_{timestamp}.csv"
    backup_filepath = os.path.join(BACKUP_FOLDER, backup_filename)
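    # Example of a resulting path (the folder name depends on config.BACKUP_FOLDER):
    #   <BACKUP_FOLDER>/preferences_backup_20250101_123000.csv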
    try:
        dataset_df.to_csv(backup_filepath, index=False)
        print(f"Successfully backed up current preferences (or empty structure) to: {backup_filepath}")
    except Exception as e:
        print(f"Error saving backup to {backup_filepath}: {e}")
        # Continue even if the backup fails, so the remote reset below is
        # still attempted.
    # 3. Create an empty dataset with the same structure (config.CSV_HEADERS).
    print(f"Creating an empty dataset structure using predefined CSV_HEADERS: {CSV_HEADERS}")
    empty_df = pd.DataFrame(columns=CSV_HEADERS)
    empty_dataset = datasets.Dataset.from_pandas(empty_df)
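    # from_pandas() infers column types, and every column of an empty DataFrame
    # has dtype "object". If schema inference ever causes trouble, one option
    # (a sketch, assuming all columns hold strings) is to pin the features
    # explicitly:
    #   features = datasets.Features({h: datasets.Value("string") for h in CSV_HEADERS})
    #   empty_dataset = datasets.Dataset.from_pandas(empty_df, features=features)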

    # 4. Push the empty dataset to the HF Hub, replacing the remote CSV.
    print(f"Attempting to push the empty dataset to '{HF_DATASET_REPO_ID}' (file: {RESULTS_CSV_FILE}) on Hugging Face Hub...")
    # Note: push_to_hub() on a Dataset or DatasetDict writes Parquet files by
    # default, so it would not overwrite the CSV that load_dataset() reads
    # above. The most direct way to replace RESULTS_CSV_FILE is to write the
    # empty frame to a temporary local CSV and upload that single file to the
    # repo via the huggingface_hub API.
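    # For reference, the Parquet-based alternative would look roughly like this
    # (a sketch, not used here, precisely because it does not touch the CSV):
    #   dataset_dict = datasets.DatasetDict({"train": empty_dataset})
    #   dataset_dict.push_to_hub(HF_DATASET_REPO_ID, token=HF_TOKEN)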
    # Define the temp path before the try block so the cleanup code in the
    # except handler can always reference it, even on an early failure.
    temp_empty_csv_path = "_temp_empty_prefs.csv"
    try:
        empty_df.to_csv(temp_empty_csv_path, index=False)
        api = HfApi(token=os.getenv("HF_HUB_TOKEN", HF_TOKEN))
        api.upload_file(
            path_or_fileobj=temp_empty_csv_path,
            path_in_repo=RESULTS_CSV_FILE,  # the path of the CSV file inside the repo
            repo_id=HF_DATASET_REPO_ID,
            repo_type="dataset",
            commit_message=f"Reset {RESULTS_CSV_FILE} to empty by script",
        )
        if os.path.exists(temp_empty_csv_path):
            os.remove(temp_empty_csv_path)
        print(f"Successfully pushed empty dataset to replace {RESULTS_CSV_FILE} in Hugging Face Hub: {HF_DATASET_REPO_ID}")
        print("The remote dataset CSV should now be empty but retain its structure based on CSV_HEADERS.")
        print(f"IMPORTANT: The old data (if any) is backed up at {backup_filepath}")
    except Exception as e:
        print(f"Error pushing empty dataset to Hugging Face Hub: {e}")
        if os.path.exists(temp_empty_csv_path):
            os.remove(temp_empty_csv_path)
        print("The remote dataset might not have been cleared. Please check the Hugging Face Hub.")

if __name__ == "__main__":
    main()