|
import os |
|
import json |
|
import random |
|
from glob import glob |
|
from huggingface_hub import Repository |
|
|
|
|
|
# Name of the merged, shuffled JSONL file written into the working directory.
output_file_name = "combined_conversations.jsonl"

# Hub repository identifier; the script also uses it as the local clone path.
repo_id = "AlignmentLab-AI/idonteven"
|
|
|
|
|
def shuffle_and_combine_jsonls(output_file_name):
    """Merge every ``*.jsonl`` file in the current directory into one shuffled file.

    The output file itself is excluded from the inputs, so re-running the
    script does not feed a previous result back in and duplicate records.
    Blank lines are dropped and every record is written with exactly one
    trailing newline, so a source file that lacks a final newline cannot
    fuse its last record with another one after shuffling.

    Args:
        output_file_name: Path of the combined file to write.

    Returns:
        ``output_file_name``, unchanged, for convenient chaining.
    """
    output_path = os.path.abspath(output_file_name)

    records = []
    # Sorted so the pre-shuffle order does not depend on directory listing
    # order; with a seeded ``random`` the result is then reproducible.
    for jsonl_file in sorted(glob("*.jsonl")):
        if os.path.abspath(jsonl_file) == output_path:
            continue  # never read back our own previous output
        with open(jsonl_file, "r", encoding="utf-8") as infile:
            records.extend(
                line.rstrip("\n") + "\n" for line in infile if line.strip()
            )

    random.shuffle(records)

    with open(output_file_name, "w", encoding="utf-8") as outfile:
        outfile.writelines(records)

    return output_file_name
|
|
|
|
|
def clone_repository(repo_id):
    """Clone the Hub repository ``repo_id`` and return its ``Repository`` handle.

    The same string is used both as the remote to clone from and as the
    local directory path handed to ``Repository``.

    Args:
        repo_id: Hub repository identifier, e.g. ``"namespace/name"``.

    Returns:
        The ``Repository`` object created for the clone.
    """
    # NOTE(review): no ``repo_type`` is passed, so the library default
    # applies — confirm that is right if this repository is a dataset.
    return Repository(local_dir=repo_id, clone_from=repo_id)
|
|
|
|
|
def copy_files_to_repo(combined_jsonl_path, dest_dir=None):
    """Copy the combined JSONL and every regular file in the CWD into the clone.

    Files are copied with ``shutil.copy`` instead of shelling out to ``cp``,
    so names containing spaces or shell metacharacters are handled safely.
    Directories (including the clone's own parent directory) are skipped,
    and each file is copied at most once.

    Args:
        combined_jsonl_path: Path of the combined JSONL file to publish.
        dest_dir: Directory to copy into. Defaults to the module-level
            ``repo_id``, the destination the function previously hard-coded.

    Raises:
        NotADirectoryError: If the destination directory does not exist.
        FileNotFoundError: If ``combined_jsonl_path`` is not an existing file.
    """
    # Function-scope import: shutil is not among the file's top-level imports.
    import shutil

    target_dir = repo_id if dest_dir is None else dest_dir
    if not os.path.isdir(target_dir):
        raise NotADirectoryError(
            f"Destination directory does not exist: {target_dir!r}"
        )
    if not os.path.isfile(combined_jsonl_path):
        raise FileNotFoundError(
            f"Combined JSONL file not found: {combined_jsonl_path!r}"
        )

    target_abs = os.path.abspath(target_dir)
    copied = set()
    # The combined file goes first; the glob usually lists it again, and the
    # ``copied`` set prevents the second copy.
    for path in [combined_jsonl_path, *sorted(glob("*"))]:
        source = os.path.abspath(path)
        if source in copied or not os.path.isfile(path):
            continue  # duplicates and directories are not copied
        if os.path.dirname(source) == target_abs:
            continue  # already located in the destination
        shutil.copy(path, target_dir)
        copied.add(source)
|
|
|
|
|
def push_to_hub(repo):
    """Stage, commit and push the clone's pending changes.

    Calls ``git_add``, ``git_commit`` and ``git_push`` on ``repo``, in that
    order, using a fixed commit message.

    Args:
        repo: Object exposing ``git_add()``, ``git_commit(message)`` and
            ``git_push()``.
    """
    commit_message = "Update dataset"

    repo.git_add()
    repo.git_commit(commit_message)
    repo.git_push()
|
|
|
|
|
# Run the pipeline only when executed as a script, so importing this module
# (for example to reuse a helper) does not clone or push anything.
if __name__ == "__main__":
    # 1. Merge and shuffle the local JSONL files into one output file.
    combined_jsonl_path = shuffle_and_combine_jsonls(output_file_name)
    # 2. Clone the target repository next to the data.
    repo = clone_repository(repo_id)
    # 3. Copy the combined file and the other local files into the clone.
    copy_files_to_repo(combined_jsonl_path)
    # 4. Commit and push the result.
    push_to_hub(repo)
|
|