import os
from datetime import datetime

import datasets
import pandas as pd
from huggingface_hub import HfApi

from config import BACKUP_FOLDER, HF_DATASET_REPO_ID, HF_TOKEN, RESULTS_CSV_FILE, CSV_HEADERS
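
# A minimal sketch of what `config` is assumed to provide. The names match the
# import above, but the values are illustrative placeholders, not the project's
# actual settings:
#
#   BACKUP_FOLDER = "backups"
#   HF_DATASET_REPO_ID = "your-username/preferences"   # hypothetical repo id
#   HF_TOKEN = "hf_..."                # must have write access to the repo
#   RESULTS_CSV_FILE = "results.csv"
#   CSV_HEADERS = ["timestamp", "option_a", "option_b", "preference"]  # illustrative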

def main():
    """
    Gets the dataset from HF Hub where preferences are being collected,
    save it locally to a backup folder with a timestamp.
    Then creates an empty dataset with the same structure and saves it to the HF Hub.
    """
    print(f"Attempting to load dataset '{HF_DATASET_REPO_ID}' from Hugging Face Hub (file: {RESULTS_CSV_FILE})...")
    try:
        # 1. Get the dataset from HF Hub
        # Ensure the token has write permissions for pushing later.
        dataset = datasets.load_dataset(HF_DATASET_REPO_ID, data_files=RESULTS_CSV_FILE, token=HF_TOKEN, split='train')
        print(f"Successfully loaded dataset. It has {len(dataset)} entries.")
        dataset_df = dataset.to_pandas()
    except Exception as e:
        print(f"Error loading dataset from Hugging Face Hub: {e}")
        print("This could be due to the dataset/file not existing, or token issues.")
        print("Attempting to proceed by creating an empty structure for backup and remote reset.")
        # If loading fails, fall back to an empty frame so we can still write
        # an (empty) local backup and attempt to reset the remote dataset.
        dataset_df = pd.DataFrame(columns=CSV_HEADERS)  # use the predefined headers
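        # A possible diagnostic pre-check (a sketch; `HfApi.repo_exists` is
        # available in recent versions of huggingface_hub):
        #   if not HfApi(token=HF_TOKEN).repo_exists(HF_DATASET_REPO_ID, repo_type="dataset"):
        #       print("Dataset repo not found; check HF_DATASET_REPO_ID and the token.")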

    # 2. Save it locally to a backup folder with a timestamp
    if not os.path.exists(BACKUP_FOLDER):
        os.makedirs(BACKUP_FOLDER)
        print(f"Created backup folder: {BACKUP_FOLDER}")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    backup_filename = f"preferences_backup_{timestamp}.csv"
    backup_filepath = os.path.join(BACKUP_FOLDER, backup_filename)
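    # Example resulting filename (illustrative): preferences_backup_20240514_153012.csv,
    # written under BACKUP_FOLDER.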

    try:
        dataset_df.to_csv(backup_filepath, index=False)
        print(f"Successfully backed up current preferences (or empty structure) to: {backup_filepath}")
    except Exception as e:
        print(f"Error saving backup to {backup_filepath}: {e}")
        # Even if the backup fails, continue and attempt to clear the remote
        # dataset; abort here instead if losing the old data is unacceptable.
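    # To restore from a backup later (a sketch mirroring the upload in step 4),
    # the backup CSV can be uploaded back over the remote file:
    #   HfApi(token=HF_TOKEN).upload_file(
    #       path_or_fileobj=backup_filepath, path_in_repo=RESULTS_CSV_FILE,
    #       repo_id=HF_DATASET_REPO_ID, repo_type="dataset",
    #       commit_message="Restore preferences from backup")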

    # 3. Create an empty DataFrame with the same structure (using config.CSV_HEADERS)
    print(f"Creating an empty dataset structure using predefined CSV_HEADERS: {CSV_HEADERS}")
    empty_df = pd.DataFrame(columns=CSV_HEADERS)

    # 4. Push the empty dataset to the HF Hub, replacing the old CSV.
    print(f"Attempting to push the empty dataset to '{HF_DATASET_REPO_ID}' (file: {RESULTS_CSV_FILE}) on Hugging Face Hub...")
    # `push_to_hub` on a Dataset/DatasetDict writes Parquet files by default,
    # so it would not overwrite the CSV at RESULTS_CSV_FILE. Instead, write the
    # empty frame to a temporary local CSV and upload that file directly with
    # HfApi.upload_file, which replaces the file at `path_in_repo` in a single
    # commit. This requires the `huggingface_hub` library and a write token.
    temp_empty_csv_path = "_temp_empty_prefs.csv"
    empty_df.to_csv(temp_empty_csv_path, index=False)
    try:
        api = HfApi(token=os.getenv("HF_HUB_TOKEN", HF_TOKEN))
        api.upload_file(
            path_or_fileobj=temp_empty_csv_path,
            path_in_repo=RESULTS_CSV_FILE,  # path to the CSV file in the repo
            repo_id=HF_DATASET_REPO_ID,
            repo_type="dataset",
            commit_message=f"Reset {RESULTS_CSV_FILE} to empty by script"
        )
        print(f"Successfully pushed empty dataset to replace {RESULTS_CSV_FILE} in Hugging Face Hub: {HF_DATASET_REPO_ID}")
        print("The remote dataset CSV should now be empty but retain its structure based on CSV_HEADERS.")
        print(f"IMPORTANT: The old data (if any) is backed up at {backup_filepath}")
    except Exception as e:
        print(f"Error pushing empty dataset to Hugging Face Hub: {e}")
        print("The remote dataset might not have been cleared. Please check the Hugging Face Hub.")
    finally:
        # Remove the temporary CSV whether or not the upload succeeded.
        if os.path.exists(temp_empty_csv_path):
            os.remove(temp_empty_csv_path)


if __name__ == "__main__":
    main()
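
# Usage (a sketch; the script filename below is assumed):
#   python reset_remote_preferences.py
# Requires the `datasets`, `pandas`, and `huggingface_hub` packages, plus a
# Hugging Face token with write access to HF_DATASET_REPO_ID.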