import os
import json
import random
import shutil
from glob import glob

from huggingface_hub import Repository

# Name of the combined file
output_file_name = "combined_conversations.jsonl"

# Hugging Face Hub repository ID (also used as the local clone directory)
repo_id = "AlignmentLab-AI/idonteven"


def shuffle_and_combine_jsonls(output_file_name):
    """Merge every ``*.jsonl`` file in the current directory into one shuffled file.

    The output file itself is excluded from the inputs, so running the script
    repeatedly does not fold earlier output back in and duplicate records.
    Blank lines are dropped and every record is written with exactly one
    trailing newline, so a source file that lacks a final newline cannot end
    up with its last record glued to another one after shuffling.

    Args:
        output_file_name: Path of the combined file to (over)write.

    Returns:
        The same ``output_file_name`` that was passed in.
    """
    output_abs = os.path.abspath(output_file_name)
    all_lines = []
    # Sorted so the pre-shuffle order does not depend on directory listing order.
    for jsonl_file in sorted(glob("*.jsonl")):
        if os.path.abspath(jsonl_file) == output_abs:
            # A previous run's output also matches the glob; never re-ingest it.
            continue
        # Files are read as UTF-8 (assumed: the usual encoding for JSON Lines).
        with open(jsonl_file, "r", encoding="utf-8") as file:
            for line in file:
                record = line.rstrip("\r\n")
                if record.strip():
                    all_lines.append(record + "\n")

    random.shuffle(all_lines)

    with open(output_file_name, "w", encoding="utf-8") as outfile:
        outfile.writelines(all_lines)
    return output_file_name


def clone_repository(repo_id):
    """Clone the Hub repository into a local directory named after ``repo_id``.

    Args:
        repo_id: Hub repository ID; used both as the clone source and as the
            local directory path.

    Returns:
        The ``Repository`` object for the local clone.
    """
    # NOTE(review): the commit message below says "Update dataset"; if this is
    # a dataset repo, ``Repository`` may need ``repo_type="dataset"`` — confirm.
    # NOTE(review): ``Repository`` is reportedly deprecated in newer
    # huggingface_hub releases — confirm the installed version still ships it.
    repo = Repository(repo_id, clone_from=repo_id)
    return repo


def copy_files_to_repo(combined_jsonl_path):
    """Copy the combined file and the other regular files in the cwd into the clone.

    Files are copied into the directory named by the module-level ``repo_id``.
    Directories (including the clone's own parent directory) and hidden files
    are not copied. Copies go through ``shutil`` rather than a shell command,
    so file names containing spaces or shell metacharacters are handled
    safely and a failed copy raises instead of being silently ignored.

    Args:
        combined_jsonl_path: Path of the combined JSONL file to copy.

    Raises:
        OSError: If a file cannot be copied (e.g. the clone directory is missing).
    """
    # Copy the combined jsonl file
    shutil.copy(combined_jsonl_path, repo_id)

    # Copy other necessary files, e.g., scripts
    combined_abs = os.path.abspath(combined_jsonl_path)
    for file in glob("*"):
        if not os.path.isfile(file):
            # Skips directories such as the top-level folder of the clone.
            continue
        if os.path.abspath(file) == combined_abs:
            # Already copied above.
            continue
        shutil.copy(file, repo_id)


def push_to_hub(repo):
    """Stage, commit and push the clone's changes.

    Args:
        repo: The ``Repository`` object returned by ``clone_repository``.
    """
    repo.git_add()
    repo.git_commit("Update dataset")
    repo.git_push()


# Run all steps (executed at module level, as in the original script)
combined_jsonl_path = shuffle_and_combine_jsonls(output_file_name)
repo = clone_repository(repo_id)
copy_files_to_repo(combined_jsonl_path)
push_to_hub(repo)