Alignment-Lab-AI committed on
Commit
cf4acc9
·
1 Parent(s): 0dd4f1b

Upload folder using huggingface_hub

Browse files
Files changed (6) hide show
  1. .gitattributes +3 -0
  2. cc.jsonl +3 -0
  3. combined_conversations.jsonl +3 -0
  4. dd.jsonl +3 -0
  5. script.py +80 -0
  6. script2.py +47 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ cc.jsonl filter=lfs diff=lfs merge=lfs -text
37
+ combined_conversations.jsonl filter=lfs diff=lfs merge=lfs -text
38
+ dd.jsonl filter=lfs diff=lfs merge=lfs -text
cc.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abe1ef071fe87d40adee42190c42e514f1739262d2dcc9017dad256d5277c5c5
3
+ size 467189195
combined_conversations.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35150095d33f2455275f7ba2e7e2004d46f66bf8a301b6e8d8678cc16231fedf
3
+ size 1829804506
dd.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58ab571bc54f28c5dc71dfd40ad1801cc09acbe69d5413096797478192471a53
3
+ size 1362615311
script.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import random
3
+ import re
4
+ from tqdm import tqdm
5
+ from glob import glob
6
+
7
# Choose the system prompt that goes with an assistant reply
def get_system_content(assistant_content):
    """Return a system prompt derived from the assistant's text.

    The rules are tried in order and the first match wins:

    1. the text contains one of the whole words ``int``, ``float``, ``char``,
       ``struct``, ``for``, ``while``, ``if`` or ``else``;
    2. the text contains a ``*...*`` span with no asterisk inside it;
    3. otherwise the prompt quotes the first three whitespace-separated words
       of the text (fewer if the text is shorter).

    NOTE(review): ``for``, ``while``, ``if`` and ``else`` are also ordinary
    English words, so rule 1 matches plain prose that contains them.

    Args:
        assistant_content: Text of an assistant turn.

    Returns:
        The system prompt string.
    """
    # (pattern, prompt) pairs, checked top to bottom.
    prompt_rules = (
        (r'\b(?:int|float|char|struct|for|while|if|else)\b', "you are a genius!"),
        (r"\*[^*]+\*", "lets tell a story"),
    )
    for pattern, prompt in prompt_rules:
        if re.search(pattern, assistant_content) is not None:
            return prompt

    # No rule matched: echo how the assistant started its reply.
    opening = ' '.join(assistant_content.split()[:3])
    return f"start like {opening}"
17
+
18
# Function to add a System role to the conversation
def add_system_role(conversation, total_turns):
    """Give the conversation a leading system turn, modifying it in place.

    * Even ``total_turns``: a new system turn is inserted at index 0. Its text
      comes from ``get_system_content`` applied to the turn at index 1, which
      this code treats as the first assistant turn.
    * Odd ``total_turns``: the existing first turn is relabelled as the system
      turn, so the number of turns does not change.

    An empty conversation is returned unchanged because there is no turn to
    relabel and no reply to derive a prompt from.

    Args:
        conversation: List of turn dicts with ``"from"`` and ``"value"`` keys.
        total_turns: Turn count used for the even/odd decision.

    Returns:
        The same list object that was passed in.
    """
    if not conversation:
        return conversation

    if total_turns % 2 == 0:
        # Only this branch needs the assistant text. Reading it up front made
        # single-turn conversations fail with IndexError on the odd path.
        assistant_content = conversation[1]["value"] if len(conversation) > 1 else ""
        system_content = get_system_content(assistant_content)
        conversation.insert(0, {"from": "system", "value": system_content})
    else:
        conversation[0]["from"] = "system"
    return conversation
29
+
30
# Function to reformat a single conversation
def reformat_conversation(conversation):
    """Convert one conversation into the output turn format.

    The conversation first gets its system turn via ``add_system_role``, which
    modifies the list that was passed in. Each turn then becomes a dict with
    ``"content"``, ``"do_train"`` (picked at random per turn) and ``"role"``.

    Roles are taken from each turn's ``"from"`` label rather than from its
    position. When ``add_system_role`` relabels the first turn instead of
    inserting one, the turn at index 1 is the assistant reply, and a purely
    positional rule would call it "User" and flip every turn after it.

    NOTE(review): the label names below assume ShareGPT-style data
    ("human"/"gpt") plus the common "user"/"assistant" spellings - confirm
    against the input files. Any other label falls back to the positional
    rule (odd index -> "User", even index -> "Assistant").

    Args:
        conversation: List of turn dicts with ``"from"`` and ``"value"`` keys.

    Returns:
        A new list of reformatted turn dicts.
    """
    role_by_speaker = {
        "system": "System",
        "human": "User",
        "user": "User",
        "gpt": "Assistant",
        "assistant": "Assistant",
    }

    conversation = add_system_role(conversation, len(conversation))

    reformatted_convo = []
    for i, turn in enumerate(conversation):
        role = role_by_speaker.get(turn["from"])
        if role is None:
            # Unknown speaker label: keep the positional behaviour.
            role = "User" if i % 2 == 1 else "Assistant"
        reformatted_convo.append({
            "content": turn["value"],
            "do_train": random.choice([True, False]),
            "role": role,
        })
    return reformatted_convo
44
+
45
# Function to load all .jsonl files and reformat their conversations
def load_and_reformat_conversations(pattern="*.jsonl",
                                    exclude=("combined_conversations.jsonl",)):
    """Load every matching JSONL file and reformat the conversations in it.

    Each non-blank line must be a JSON object with a ``"conversations"`` list.
    That list is passed to ``reformat_conversation`` and the result is stored
    as ``{"conversation": [...]}``. The combined list is shuffled before it is
    returned.

    Args:
        pattern: Glob pattern for the input files, relative to the current
            directory.
        exclude: File names to skip. The default skips this script's own
            output file, which matches ``*.jsonl`` too and stores its data
            under a different key (``"conversation"``), so re-reading it on a
            second run would fail.

    Returns:
        A shuffled list of ``{"conversation": [...]}`` dicts.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"conversations"`` key.
    """
    all_conversations = []
    input_files = [name for name in glob(pattern) if name not in exclude]

    # Outer progress bar tracks files, inner one tracks lines of the current file.
    for file_name in tqdm(input_files, desc="Processing files"):
        with open(file_name, 'r', encoding="utf-8") as file:
            for line in tqdm(file, desc=f"Processing {file_name}", leave=False):
                if not line.strip():
                    # A blank line (e.g. at the end of the file) is not a record.
                    continue
                data = json.loads(line)
                reformatted_convo = reformat_conversation(data['conversations'])
                all_conversations.append({"conversation": reformatted_convo})

    random.shuffle(all_conversations)
    return all_conversations
63
+
64
# Build the reformatted data set from every input file
reformatted_conversations = load_and_reformat_conversations()

# Sanity check: no conversation may end up with an even number of turns
odd_turns_check = not any(
    len(entry["conversation"]) % 2 == 0 for entry in reformatted_conversations
)
if not odd_turns_check:
    raise ValueError("Some conversations have an even number of turns after reformatting.")

# Write one JSON object per line to the combined output file
output_file = 'combined_conversations.jsonl'
with open(output_file, 'w') as outfile:
    for convo in reformatted_conversations:
        outfile.write(json.dumps(convo))
        outfile.write('\n')

# Bare expression: displays the file name in an interactive session, no effect in a script
output_file
script2.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import random
4
+ from glob import glob
5
+ from huggingface_hub import Repository
6
+
7
# File that the shuffled, combined JSONL lines are written to
output_file_name: str = "combined_conversations.jsonl"
# Hugging Face Hub repository ID (also used below as the local clone directory)
repo_id: str = "AlignmentLab-AI/idonteven"
11
+
12
# Shuffle and combine jsonl files
def shuffle_and_combine_jsonls(output_file_name):
    """Merge the lines of every ``*.jsonl`` file in the current directory.

    All records are collected, shuffled, and written to ``output_file_name``.

    * The output file is skipped as an input. It matches ``*.jsonl`` itself,
      so re-running the function would otherwise fold the previous result
      back in and duplicate every record.
    * Every record is written with exactly one trailing newline. A source
      file whose last line has no newline would otherwise be fused with
      whichever record the shuffle places after it.
    * Blank lines are dropped.

    Args:
        output_file_name: Path of the combined file to write.

    Returns:
        ``output_file_name``, unchanged.
    """
    output_path = os.path.abspath(output_file_name)

    all_lines = []
    for jsonl_file in glob("*.jsonl"):
        if os.path.abspath(jsonl_file) == output_path:
            continue
        with open(jsonl_file, 'r', encoding="utf-8") as file:
            for line in file:
                record = line.rstrip("\r\n")
                if record.strip():
                    all_lines.append(record + "\n")

    random.shuffle(all_lines)
    with open(output_file_name, 'w', encoding="utf-8") as outfile:
        outfile.writelines(all_lines)
    return output_file_name
22
+
23
# Clone the Hugging Face repository and hand back its Repository object
def clone_repository(repo_id):
    """Clone ``repo_id`` from the Hugging Face Hub.

    The repo ID is passed twice: as the first positional argument of
    ``Repository`` and as ``clone_from``.
    NOTE(review): the first positional argument is presumably the local
    directory, which would put the clone in a directory named after the
    repo ID - confirm against the huggingface_hub documentation.

    Args:
        repo_id: Hub repository ID.

    Returns:
        The ``Repository`` object for the clone.
    """
    return Repository(repo_id, clone_from=repo_id)
27
+
28
# Copy the combined jsonl file and scripts to the cloned repository
def copy_files_to_repo(combined_jsonl_path, dest_dir=None):
    """Copy the combined file and every other regular file in the current
    directory into the cloned repository.

    Files are copied with ``shutil`` instead of a shell ``cp`` command, so
    names containing spaces or shell metacharacters are handled safely and a
    failed copy raises instead of being silently ignored. Directories are
    skipped: that covers the directory holding the clone itself, which a
    plain name comparison against a repo ID containing ``/`` never matches.

    Args:
        combined_jsonl_path: Path of the combined JSONL file.
        dest_dir: Directory to copy into. Defaults to the module-level
            ``repo_id``, which is where ``clone_repository`` places the clone.

    Raises:
        FileNotFoundError: If the destination directory or the combined file
            does not exist.
    """
    import shutil  # local import keeps the file's top-level imports unchanged

    if dest_dir is None:
        dest_dir = repo_id
    if not os.path.isdir(dest_dir):
        raise FileNotFoundError(f"Destination directory does not exist: {dest_dir}")

    # The combined file is mandatory, so a missing file is an error here.
    shutil.copy(combined_jsonl_path, dest_dir)

    # Everything else in the working directory (scripts, source data files).
    for file in glob("*"):
        if file == combined_jsonl_path or not os.path.isfile(file):
            continue
        shutil.copy(file, dest_dir)
36
+
37
# Stage, commit and push the working tree of the cloned repository
def push_to_hub(repo):
    """Publish the current state of ``repo``.

    Calls ``git_add``, ``git_commit`` and ``git_push`` on ``repo``, in that
    order, with a fixed commit message.

    Args:
        repo: Object exposing ``git_add()``, ``git_commit(message)`` and
            ``git_push()``; the script passes the object returned by
            ``clone_repository``.
    """
    commit_message = "Update dataset"
    repo.git_add()
    repo.git_commit(commit_message)
    repo.git_push()
42
+
43
# Pipeline: combine the data, clone the repo, copy the files in, publish
# 1. Merge and shuffle every .jsonl file into the combined output file
combined_jsonl_path = shuffle_and_combine_jsonls(output_file_name)
# 2. Clone the target repository
repo = clone_repository(repo_id)
# 3. Copy the combined file and the other working-directory files into the clone
copy_files_to_repo(combined_jsonl_path)
# 4. Commit and push
push_to_hub(repo)