# lfs-enable-largefiles / script2.py
# Alignment-Lab-AI's picture
# Upload folder using huggingface_hub
# cf4acc9
# raw
# history blame
# 1.48 kB
import os
import json
import random
from glob import glob
from huggingface_hub import Repository
# File name of the shuffled, combined JSONL output. It is passed to
# shuffle_and_combine_jsonls, which opens it for writing as given (so a bare
# name like this one lands in the current working directory).
output_file_name = "combined_conversations.jsonl"
# Hugging Face Hub repository ID. clone_repository passes this same string to
# Repository both as its first argument and as clone_from, and
# copy_files_to_repo uses it as the destination path for copied files.
repo_id = "AlignmentLab-AI/idonteven"
# Shuffle and combine jsonl files
def shuffle_and_combine_jsonls(output_file_name):
    """Merge every ``*.jsonl`` file in the current directory into one shuffled file.

    All non-blank lines of the matching files are collected, shuffled with
    ``random.shuffle`` and written to ``output_file_name``.

    Args:
        output_file_name: Path of the combined file to write. If a file with
            this path already exists in the current directory (for example
            from a previous run) it is NOT read as an input, so re-running
            the script does not duplicate records.

    Returns:
        ``output_file_name``, unchanged, for convenient chaining.
    """
    output_path = os.path.abspath(output_file_name)
    all_lines = []
    # Sorted so that the result is reproducible when ``random`` is seeded;
    # glob() itself returns names in arbitrary order.
    for jsonl_file in sorted(glob("*.jsonl")):
        # Never feed a stale copy of the output back into itself.
        if os.path.abspath(jsonl_file) == output_path:
            continue
        with open(jsonl_file, "r", encoding="utf-8") as file:
            for line in file:
                # Blank lines carry no record and would end up scattered
                # through the shuffled output.
                if not line.strip():
                    continue
                # A final line without "\n" would be glued to whatever record
                # follows it after shuffling, corrupting both records.
                all_lines.append(line if line.endswith("\n") else line + "\n")
    random.shuffle(all_lines)
    with open(output_file_name, "w", encoding="utf-8") as outfile:
        outfile.writelines(all_lines)
    return output_file_name
# Clone the repository from Hugging Face and return the Repository handle
def clone_repository(repo_id):
    """Clone ``repo_id`` from the Hugging Face Hub.

    ``repo_id`` is passed to ``Repository`` twice: as the first positional
    argument and as ``clone_from``.
    NOTE(review): in huggingface_hub the first positional argument is the
    local checkout directory, so the clone presumably lands in a directory
    path equal to the repository ID -- confirm against the library docs.

    Args:
        repo_id: Hub repository identifier to clone.

    Returns:
        The ``Repository`` object created for the clone (not a path string).
    """
    return Repository(repo_id, clone_from=repo_id)
# Copy the combined jsonl file and scripts to the cloned repository
def copy_files_to_repo(combined_jsonl_path, target_dir=None):
    """Copy the combined JSONL file and the working-directory files into the repo.

    The combined file is copied first, followed by every regular file matched
    by ``glob("*")`` in the current working directory. Directories (including
    the cloned repository itself) are skipped.

    Files are copied with ``shutil.copy`` instead of a shell ``cp`` command so
    that names containing spaces or shell metacharacters are handled
    correctly and failures raise instead of being silently ignored.

    Args:
        combined_jsonl_path: Path of the combined JSONL file to copy.
        target_dir: Destination directory. Defaults to the module-level
            ``repo_id``, which is the destination the script has always used.

    Raises:
        NotADirectoryError: If the destination directory does not exist
            (e.g. the repository was not cloned).
        OSError: If a file cannot be copied.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import shutil

    if target_dir is None:
        target_dir = repo_id
    # Copying to a missing directory would silently create a plain file with
    # the directory's name, so fail loudly instead.
    if not os.path.isdir(target_dir):
        raise NotADirectoryError(
            f"Destination directory does not exist: {target_dir}"
        )

    # Copy the combined jsonl file
    shutil.copy(combined_jsonl_path, target_dir)
    combined_abs = os.path.abspath(combined_jsonl_path)

    # Copy other necessary files, e.g., scripts
    for file in glob("*"):
        # Skip directories and anything else that is not a regular file.
        if not os.path.isfile(file):
            continue
        # The combined file was already copied above.
        if os.path.abspath(file) == combined_abs:
            continue
        shutil.copy(file, target_dir)
# Stage, commit and push the working tree of the cloned repository
def push_to_hub(repo):
    """Publish the current state of ``repo`` to the Hugging Face Hub.

    Calls, in order, ``repo.git_add()``, ``repo.git_commit(...)`` with the
    fixed message ``"Update dataset"``, and ``repo.git_push()``.

    Args:
        repo: Object exposing ``git_add``, ``git_commit`` and ``git_push``
            (the script passes the value returned by ``clone_repository``).
    """
    commit_message = "Update dataset"
    repo.git_add()
    repo.git_commit(commit_message)
    repo.git_push()
# Run all steps
# NOTE: there is no `if __name__ == "__main__":` guard, so these statements
# run whenever the module is executed or imported.
# 1. Merge and shuffle the *.jsonl files into output_file_name.
combined_jsonl_path = shuffle_and_combine_jsonls(output_file_name)
# 2. Clone the Hub repository; `repo` is the handle used for the push below.
repo = clone_repository(repo_id)
# 3. Copy the combined file and the other working-directory files into the clone.
copy_files_to_repo(combined_jsonl_path)
# 4. Stage, commit ("Update dataset") and push the clone.
push_to_hub(repo)