File size: 1,477 Bytes

cf4acc9

import os
import json
import random
from glob import glob
from huggingface_hub import Repository

# Name of the combined file. It is passed to shuffle_and_combine_jsonls() at the
# bottom of the script, which writes the shuffled lines to this path (relative
# to the current working directory).
output_file_name = "combined_conversations.jsonl"
# Hugging Face Hub repository ID. The same string is used three ways in this
# script: as the repository to clone from, as the local clone directory passed
# to Repository(), and as the destination directory when copying files.
repo_id = "AlignmentLab-AI/idonteven"

# Shuffle and combine jsonl files
def shuffle_and_combine_jsonls(output_file_name):
    """Merge every ``*.jsonl`` file in the working directory into one shuffled file.

    Args:
        output_file_name: Path of the combined file to write. If a file with
            this path already exists in the working directory (for example
            from a previous run) it is NOT used as an input, so re-running the
            script does not duplicate records.

    Returns:
        ``output_file_name``, unchanged, for convenient chaining.

    Notes:
        * Every record is written with exactly one trailing newline. Without
          this, the last record of a file lacking a final newline would be
          glued to whichever record follows it after shuffling.
        * Whitespace-only lines are dropped; they are not valid JSONL records.
        * Files are read and written as UTF-8, the encoding JSON mandates,
          rather than the platform's locale encoding.
    """
    output_path = os.path.abspath(output_file_name)
    all_lines = []
    # sorted() makes the pre-shuffle order deterministic, so seeding `random`
    # yields a reproducible result regardless of directory listing order.
    for jsonl_file in sorted(glob("*.jsonl")):
        if os.path.abspath(jsonl_file) == output_path:
            # Skip our own output left over from an earlier run.
            continue
        with open(jsonl_file, 'r', encoding='utf-8') as file:
            for line in file:
                if not line.strip():
                    continue
                all_lines.append(line if line.endswith("\n") else line + "\n")
    random.shuffle(all_lines)
    with open(output_file_name, 'w', encoding='utf-8') as outfile:
        outfile.writelines(all_lines)
    return output_file_name

# Clone your repository from Hugging Face and return the local path
def clone_repository(repo_id):
    """Create a ``Repository`` for ``repo_id`` and hand it back to the caller.

    The same string is used for both arguments: as the local directory of the
    working copy and as the Hub repository to clone from.

    Args:
        repo_id: Hub repository ID, also used as the local clone path.

    Returns:
        The constructed ``huggingface_hub.Repository`` object.
    """
    # NOTE(review): no repo_type is passed, so the library default applies;
    # confirm that matches the kind of repository being targeted.
    return Repository(local_dir=repo_id, clone_from=repo_id)

# Copy the combined jsonl file and scripts to the cloned repository
def copy_files_to_repo(combined_jsonl_path, repo_dir=None):
    """Copy the combined JSONL file and the working-directory files into the clone.

    Files are copied with ``shutil`` instead of a shell ``cp`` command, so
    names containing spaces or shell metacharacters are handled safely and a
    failed copy raises instead of being silently ignored.

    Args:
        combined_jsonl_path: Path of the combined JSONL file to copy.
        repo_dir: Destination directory (the local clone). Defaults to the
            module-level ``repo_id``, which is what the script uses as its
            clone directory.

    Raises:
        NotADirectoryError: If the destination directory does not exist.
        OSError: If a file cannot be copied (e.g. ``combined_jsonl_path`` is
            missing).
    """
    import shutil  # local import keeps this block self-contained

    if repo_dir is None:
        repo_dir = repo_id
    if not os.path.isdir(repo_dir):
        raise NotADirectoryError(
            f"Destination repository directory does not exist: {repo_dir!r}"
        )

    # The combined file first, then every regular file in the working
    # directory. Directories are skipped: this also excludes the clone itself
    # (or its parent directory when repo_dir is nested like "org/name"), which
    # a plain name comparison against repo_dir would fail to catch.
    candidates = [combined_jsonl_path]
    candidates.extend(path for path in glob("*") if os.path.isfile(path))

    already_copied = set()
    for source in candidates:
        key = os.path.abspath(source)
        if key in already_copied:
            # The combined file usually also matches glob("*"); copy it once.
            continue
        already_copied.add(key)
        shutil.copy(source, repo_dir)

# Commit and push changes to the Hugging Face repository
def push_to_hub(repo):
    """Stage, commit and push the working copy held by ``repo``.

    Calls, in order, ``repo.git_add()`` with its default arguments,
    ``repo.git_commit()`` with a fixed commit message, and ``repo.git_push()``.

    Args:
        repo: Object exposing ``git_add``, ``git_commit`` and ``git_push``
            (the script passes the result of ``clone_repository``).
    """
    commit_message = "Update dataset"
    repo.git_add()
    repo.git_commit(commit_message)
    repo.git_push()

# Run all steps. The order matters: each step relies on the side effects
# (files on disk, the cloned working copy) produced by the one before it.
# NOTE(review): these statements are not behind an `if __name__ == "__main__":`
# guard, so they also execute when this file is imported.
# 1. Merge and shuffle the *.jsonl files of the working directory into one file.
combined_jsonl_path = shuffle_and_combine_jsonls(output_file_name)
# 2. Clone the Hub repository; repo_id doubles as the local clone directory.
repo = clone_repository(repo_id)
# 3. Copy the combined file and the other working-directory files into the clone.
copy_files_to_repo(combined_jsonl_path)
# 4. Stage, commit and push whatever is now in the clone.
push_to_hub(repo)