# File size: 1,477 bytes (commit cf4acc9)
import json
import os
import random
import shutil
from glob import glob

from huggingface_hub import Repository
# Name of the combined file; passed to shuffle_and_combine_jsonls() as the
# path to write the shuffled output to.
output_file_name = "combined_conversations.jsonl"
# Hugging Face Hub repository ID. The same string is also used as the local
# clone directory and as the destination of the file copies below.
repo_id = "AlignmentLab-AI/idonteven"
# Shuffle and combine jsonl files
def shuffle_and_combine_jsonls(output_file_name):
    """Merge every ``*.jsonl`` file in the current directory into one shuffled file.

    All records are read into memory, shuffled with ``random.shuffle`` and
    written to ``output_file_name`` one record per line.

    Args:
        output_file_name: Path of the combined file to write. It is
            overwritten if it already exists and is never used as an input.

    Returns:
        ``output_file_name``, unchanged.
    """
    # A combined file left over from an earlier run also matches "*.jsonl";
    # reading it back in would duplicate every record on each re-run.
    output_path = os.path.abspath(output_file_name)

    all_lines = []
    for jsonl_file in glob("*.jsonl"):
        if os.path.abspath(jsonl_file) == output_path:
            continue
        with open(jsonl_file, "r", encoding="utf-8") as file:
            for line in file:
                record = line.rstrip("\r\n")
                if not record.strip():
                    # Blank lines carry no record; keeping them would
                    # scatter empty lines through the shuffled output.
                    continue
                # Re-terminate every record. A file whose last line has no
                # trailing newline would otherwise be glued to whichever
                # record follows it after the shuffle.
                all_lines.append(record + "\n")

    random.shuffle(all_lines)
    with open(output_file_name, "w", encoding="utf-8") as outfile:
        outfile.writelines(all_lines)
    return output_file_name
# Clone the repository from the Hugging Face Hub and return the Repository object
def clone_repository(repo_id):
    """Clone ``repo_id`` from the Hugging Face Hub and return the repository handle.

    The same string is passed twice: as the local directory argument and as
    the remote to clone from.

    Args:
        repo_id: Hub repository ID to clone.

    Returns:
        The ``Repository`` object wrapping the clone (not a path string).
    """
    return Repository(local_dir=repo_id, clone_from=repo_id)
# Copy the combined jsonl file and scripts to the cloned repository
def copy_files_to_repo(combined_jsonl_path, dest_dir=None):
    """Copy the combined JSONL file and the working-directory files into the clone.

    The combined file is copied first, then every regular file matched by
    ``glob("*")`` in the current directory. Directories are skipped.

    Args:
        combined_jsonl_path: Path of the combined JSONL file to copy.
        dest_dir: Directory to copy into. When omitted, the module-level
            ``repo_id`` is used.

    Raises:
        NotADirectoryError: If the destination is not an existing directory.
    """
    if dest_dir is None:
        dest_dir = repo_id
    if not os.path.isdir(dest_dir):
        raise NotADirectoryError(f"Destination is not a directory: {dest_dir!r}")

    # shutil rather than a shell "cp" string: names containing spaces or
    # shell metacharacters are copied literally, failures raise instead of
    # being ignored, and no external "cp" binary is required.
    shutil.copy(combined_jsonl_path, dest_dir)
    already_copied = os.path.abspath(combined_jsonl_path)

    # Copy other necessary files, e.g., scripts
    for file in glob("*"):
        # Only regular files are copied. This also keeps the directory that
        # holds the clone out of the copy; comparing names against a
        # destination that contains a path separator could never match,
        # because glob("*") yields single path components.
        if not os.path.isfile(file):
            continue
        if os.path.abspath(file) == already_copied:
            continue  # the combined file is already in place
        shutil.copy(file, dest_dir)
# Commit and push changes to the Hugging Face repository
def push_to_hub(repo):
    """Stage, commit and push the changes in ``repo``.

    Calls ``git_add``, ``git_commit`` and ``git_push`` on ``repo`` in that
    order; the commit message is always "Update dataset".

    Args:
        repo: Repository handle, as returned by ``clone_repository``.
    """
    commit_message = "Update dataset"
    repo.git_add()  # no pattern given, so the library default decides what is staged
    repo.git_commit(commit_message)
    repo.git_push()
# Run all steps. These statements sit at module level, so they execute on
# import as well as when the file is run as a script.
# Order matters: the combined file is written first, the repository is cloned
# next, the files are copied into the clone, and the push comes last.
combined_jsonl_path = shuffle_and_combine_jsonls(output_file_name)
repo = clone_repository(repo_id)
copy_files_to_repo(combined_jsonl_path)
push_to_hub(repo)
# End of file.