# clean_preferences.py
import os
from datetime import datetime

import datasets
import pandas as pd

from config import BACKUP_FOLDER, HF_DATASET_REPO_ID, HF_TOKEN, RESULTS_CSV_FILE, CSV_HEADERS
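
# config.py (project-local) is expected to provide:
#   BACKUP_FOLDER       local directory for timestamped CSV backups
#   HF_DATASET_REPO_ID  id of the dataset repo on the Hub
#   HF_TOKEN            a token with write access to that repo
#   RESULTS_CSV_FILE    path of the preferences CSV inside the repo
#   CSV_HEADERS         list of column names for the preferences CSV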


def main():
    """
    Loads the dataset where preferences are being collected from the HF Hub
    and saves it locally to a backup folder with a timestamp.
    Then creates an empty dataset with the same structure and pushes it to
    the HF Hub, replacing the remote results CSV.
    """
print(f"Attempting to load dataset '{HF_DATASET_REPO_ID}' from Hugging Face Hub (file: {RESULTS_CSV_FILE})...")
try:
# 1. Get the dataset from HF Hub
# Ensure the token has write permissions for pushing later.
dataset = datasets.load_dataset(HF_DATASET_REPO_ID, data_files=RESULTS_CSV_FILE, token=HF_TOKEN, split='train')
print(f"Successfully loaded dataset. It has {len(dataset)} entries.")
dataset_df = dataset.to_pandas()
except Exception as e:
print(f"Error loading dataset from Hugging Face Hub: {e}")
print("This could be due to the dataset/file not existing, or token issues.")
print("Attempting to proceed by creating an empty structure for backup and remote reset.")
# If loading fails, we might still want to try to clear the remote
# or at least create an empty local backup structure.
dataset_df = pd.DataFrame(columns=CSV_HEADERS) # Use predefined headers

    # 2. Save the data locally to a backup folder with a timestamp.
    if not os.path.exists(BACKUP_FOLDER):
        os.makedirs(BACKUP_FOLDER)
        print(f"Created backup folder: {BACKUP_FOLDER}")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    backup_filename = f"preferences_backup_{timestamp}.csv"  # e.g. preferences_backup_20240513_153045.csv
    backup_filepath = os.path.join(BACKUP_FOLDER, backup_filename)

    try:
        dataset_df.to_csv(backup_filepath, index=False)
        print(f"Successfully backed up current preferences (or empty structure) to: {backup_filepath}")
    except Exception as e:
        print(f"Error saving backup to {backup_filepath}: {e}")
        # Even if the backup fails, continue and attempt to clear the remote.

    # 3. Create an empty dataset with the same structure (using config.CSV_HEADERS).
    print(f"Creating an empty dataset structure using predefined CSV_HEADERS: {CSV_HEADERS}")
    empty_df = pd.DataFrame(columns=CSV_HEADERS)
    empty_dataset = datasets.Dataset.from_pandas(empty_df)
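
    # Optional sanity check (an addition for illustration, not in the original
    # script): the empty Dataset should still expose the CSV_HEADERS columns.
    print(f"Empty dataset columns: {empty_dataset.column_names}")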

    # 4. Push the empty dataset to the HF Hub, replacing the remote CSV.
    print(f"Attempting to push the empty dataset to '{HF_DATASET_REPO_ID}' (file: {RESULTS_CSV_FILE}) on Hugging Face Hub...")
    # Define the temp path before the try block so the except handler below
    # can safely check for it.
    temp_empty_csv_path = "_temp_empty_prefs.csv"
    try:
        # Write the empty frame to a temporary local CSV and upload that file
        # directly, so the exact file loaded by `load_dataset` above
        # (RESULTS_CSV_FILE) is replaced in the dataset repo. This requires
        # the `huggingface_hub` library and a token with write access.
        empty_df.to_csv(temp_empty_csv_path, index=False)

        from huggingface_hub import HfApi
        api = HfApi(token=os.getenv("HF_HUB_TOKEN", HF_TOKEN))
        api.upload_file(
            path_or_fileobj=temp_empty_csv_path,
            path_in_repo=RESULTS_CSV_FILE,  # Path of the CSV file inside the repo.
            repo_id=HF_DATASET_REPO_ID,
            repo_type="dataset",
            commit_message=f"Reset {RESULTS_CSV_FILE} to empty by script"
        )
        if os.path.exists(temp_empty_csv_path):
            os.remove(temp_empty_csv_path)
        print(f"Successfully pushed empty dataset to replace {RESULTS_CSV_FILE} in Hugging Face Hub: {HF_DATASET_REPO_ID}")
        print("The remote dataset CSV should now be empty but retain its structure based on CSV_HEADERS.")
        print(f"IMPORTANT: The old data (if any) is backed up at {backup_filepath}")
    except Exception as e:
        print(f"Error pushing empty dataset to Hugging Face Hub: {e}")
        if os.path.exists(temp_empty_csv_path):
            os.remove(temp_empty_csv_path)
        print("The remote dataset might not have been cleared. Please check the Hugging Face Hub.")


if __name__ == "__main__":
    main()