Commit
·
cf4acc9
1
Parent(s):
0dd4f1b
Upload folder using huggingface_hub
Browse files- .gitattributes +3 -0
- cc.jsonl +3 -0
- combined_conversations.jsonl +3 -0
- dd.jsonl +3 -0
- script.py +80 -0
- script2.py +47 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
cc.jsonl filter=lfs diff=lfs merge=lfs -text
|
37 |
+
combined_conversations.jsonl filter=lfs diff=lfs merge=lfs -text
|
38 |
+
dd.jsonl filter=lfs diff=lfs merge=lfs -text
|
cc.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:abe1ef071fe87d40adee42190c42e514f1739262d2dcc9017dad256d5277c5c5
|
3 |
+
size 467189195
|
combined_conversations.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:35150095d33f2455275f7ba2e7e2004d46f66bf8a301b6e8d8678cc16231fedf
|
3 |
+
size 1829804506
|
dd.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:58ab571bc54f28c5dc71dfd40ad1801cc09acbe69d5413096797478192471a53
|
3 |
+
size 1362615311
|
script.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import random
|
3 |
+
import re
|
4 |
+
from tqdm import tqdm
|
5 |
+
from glob import glob
|
6 |
+
|
# Choose the system prompt for a conversation from its first assistant reply.
def get_system_content(assistant_content):
    """Return a system-prompt string derived from *assistant_content*.

    Priority order:
      1. Looks like C-style code (contains a bare keyword) -> coding prompt.
      2. Contains *asterisk-wrapped* roleplay text -> storytelling prompt.
      3. Otherwise -> a "start like ..." prompt seeded with the reply's
         first three whitespace-separated words.
    """
    code_keywords = r'\b(?:int|float|char|struct|for|while|if|else)\b'
    if re.search(code_keywords, assistant_content):
        return "you are a genius!"
    if re.search(r"\*[^*]+\*", assistant_content):
        return "lets tell a story"
    # Fall back to echoing the opening words of the assistant's turn.
    opening_words = ' '.join(assistant_content.split()[:3])
    return f"start like {opening_words}"
# Ensure the conversation begins with a system turn (mutates in place).
def add_system_role(conversation, total_turns):
    """Give *conversation* a leading system turn and return it.

    Args:
        conversation: list of {"from": ..., "value": ...} turns; index 1 is
            read unconditionally, so at least two turns are required.
        total_turns: the caller-supplied turn count (parity decides the path).

    Even turn count: prepend a brand-new system turn (making the count odd).
    Odd turn count: relabel the opening turn as the system turn.
    """
    first_reply = conversation[1]["value"]
    if total_turns % 2 != 0:
        # Odd: repurpose the first turn as the system turn.
        conversation[0]["from"] = "system"
    else:
        # Even: insert a fresh system turn derived from the first reply.
        conversation.insert(0, {"from": "system", "value": get_system_content(first_reply)})
    return conversation
# Convert one conversation into the content/do_train/role record format.
def reformat_conversation(conversation):
    """Reformat *conversation* into a list of
    {"content", "do_train", "role"} dicts.

    A system turn is injected/relabelled first via add_system_role, then
    roles are assigned: explicit system turns become "System"; the rest
    alternate by index parity (odd -> "User", even -> "Assistant").
    NOTE(review): the User/Assistant split relies on index parity, not on
    the turn's "from" field — verify the data always alternates strictly.
    do_train is a coin flip per turn.
    """
    conversation = add_system_role(conversation, len(conversation))
    records = []
    for index, turn in enumerate(conversation):
        if turn["from"] == "system":
            speaker = "System"
        elif index % 2 == 1:
            speaker = "User"
        else:
            speaker = "Assistant"
        records.append({
            "content": turn["value"],
            "do_train": random.choice([True, False]),
            "role": speaker,
        })
    return records
# Load all local .jsonl files, reformat every conversation, shuffle, return.
def load_and_reformat_conversations():
    """Read every ``*.jsonl`` file in the current directory, reformat each
    line's ``conversations`` payload, and return the shuffled result.

    Returns:
        list[dict]: items shaped {"conversation": [reformatted turns]}.

    NOTE(review): on a second run the output file combined_conversations.jsonl
    also matches ``*.jsonl`` and would be re-ingested — confirm this script is
    only run once per clean directory.
    """
    all_conversations = []
    # Fix: removed the dead `even_conversations_count` counter — it was
    # initialised but never updated or read.
    for file_name in tqdm(glob("*.jsonl"), desc="Processing files"):
        with open(file_name, 'r') as file:
            # One JSON object per line; nested tqdm shows per-file progress.
            for line in tqdm(file, desc=f"Processing {file_name}", leave=False):
                data = json.loads(line)
                reformatted_convo = reformat_conversation(data['conversations'])
                all_conversations.append({"conversation": reformatted_convo})
    # Shuffle so downstream training order does not follow file order.
    random.shuffle(all_conversations)
    return all_conversations
# --- Script entry point: build, validate, and persist the dataset. ---
reformatted_conversations = load_and_reformat_conversations()

# Every conversation must end with an odd turn count (one system turn plus
# alternating user/assistant pairs); abort loudly otherwise.
if not all(len(convo["conversation"]) % 2 != 0 for convo in reformatted_conversations):
    raise ValueError("Some conversations have an even number of turns after reformatting.")

# Persist one JSON object per line.
output_file = 'combined_conversations.jsonl'
with open(output_file, 'w') as outfile:
    for convo in reformatted_conversations:
        json.dump(convo, outfile)
        outfile.write('\n')

# Bare expression — a leftover notebook-cell "return"; has no effect when
# this file is run as a script.
output_file
script2.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json
import os
import random
import shutil
from glob import glob

from huggingface_hub import Repository

# File that receives the shuffled, combined dataset.
output_file_name = "combined_conversations.jsonl"
# Destination dataset repository on the Hugging Face Hub.
repo_id = "AlignmentLab-AI/idonteven"
# Merge every local .jsonl file into one shuffled output file.
def shuffle_and_combine_jsonls(output_file_name):
    """Concatenate all ``*.jsonl`` files in the cwd, shuffle the records,
    and write them to *output_file_name*.

    Args:
        output_file_name: path of the combined file to create.
    Returns:
        str: *output_file_name*, unchanged, for chaining.

    Fixes over the original:
      * the output file itself is excluded from the glob, so a re-run no
        longer re-ingests (and duplicates) previously combined records;
      * every record is forced to end with a newline, so a source file
        whose last line is unterminated can no longer merge with the
        record shuffled in after it.
    """
    all_lines = []
    for jsonl_file in glob("*.jsonl"):
        if jsonl_file == output_file_name:
            continue  # never read our own output back in
        with open(jsonl_file, 'r') as file:
            for line in file:
                if not line.endswith('\n'):
                    line += '\n'  # keep records separated after shuffling
                all_lines.append(line)
    random.shuffle(all_lines)
    with open(output_file_name, 'w') as outfile:
        outfile.writelines(all_lines)
    return output_file_name
# Clone the target repository from the Hub into the working directory.
def clone_repository(repo_id):
    """Clone *repo_id* from the Hugging Face Hub and return the handle.

    The first argument to Repository is the local directory, so the clone
    lands in a path named after the repo id.
    NOTE(review): huggingface_hub.Repository is the legacy git-based API —
    consider HfApi/upload_folder for new code.
    """
    return Repository(repo_id, clone_from=repo_id)
# Copy the combined jsonl file and supporting scripts into the cloned repo.
def copy_files_to_repo(combined_jsonl_path):
    """Copy *combined_jsonl_path* and every other regular file in the cwd
    into the local clone directory named by the module-level ``repo_id``.

    Fixes over the original:
      * ``shutil.copy`` replaces ``os.system(f"cp ...")`` — works with
        spaces in paths, is not shell-injectable, is portable, and raises
        on failure instead of silently returning a status code;
      * directories (e.g. the clone itself) are skipped — plain ``cp``
        on a directory simply failed;
      * the combined file is no longer copied twice (once explicitly and
        once again via the ``glob("*")`` loop).
    """
    destination = repo_id
    # Copy the combined dataset file first.
    shutil.copy(combined_jsonl_path, destination)
    # Then every other regular file (scripts, etc.); skip directories.
    for name in glob("*"):
        if name == combined_jsonl_path or not os.path.isfile(name):
            continue
        shutil.copy(name, destination)
# Stage, commit, and push all pending changes in the local clone.
def push_to_hub(repo):
    """Stage everything in *repo*, commit with a fixed message, and push."""
    commit_message = "Update dataset"
    repo.git_add()
    repo.git_commit(commit_message)
    repo.git_push()
# Run the full publish pipeline, in order:
# 1. merge and shuffle the local .jsonl files,
combined_jsonl_path = shuffle_and_combine_jsonls(output_file_name)
# 2. clone the destination repository from the Hub,
repo = clone_repository(repo_id)
# 3. stage the combined file and scripts inside the clone,
copy_files_to_repo(combined_jsonl_path)
# 4. commit and push the result.
push_to_hub(repo)