Commit
·
c6a30f0
1
Parent(s):
2c4519d
Delete script.py
Browse files
script.py
DELETED
@@ -1,80 +0,0 @@
|
|
1 |
-
import json
|
2 |
-
import random
|
3 |
-
import re
|
4 |
-
from tqdm import tqdm
|
5 |
-
from glob import glob
|
6 |
-
|
7 |
-
# Function to check for special content and return appropriate system content
|
8 |
-
def get_system_content(assistant_content):
    """Choose the system-prompt text that matches an assistant message.

    Rules are tried in order and the first match wins:

    1. The text contains one of the whole words int, float, char, struct,
       for, while, if or else -> "you are a genius!".
    2. The text contains a span wrapped in single asterisks (``*like this*``)
       -> "lets tell a story".
    3. Otherwise -> "start like " followed by the first three
       whitespace-separated words of the text (fewer if the text is shorter).

    Args:
        assistant_content: Text of the assistant turn to inspect.

    Returns:
        The system-prompt string selected by the rules above.
    """
    # NOTE(review): "for", "if", "while" and "else" are also everyday English
    # words, so rule 1 fires on plain prose as well as on code -- confirm that
    # this is intended.
    canned_prompts = (
        (r'\b(?:int|float|char|struct|for|while|if|else)\b', "you are a genius!"),
        (r"\*[^*]+\*", "lets tell a story"),
    )
    for pattern, prompt in canned_prompts:
        if re.search(pattern, assistant_content):
            return prompt

    # Fallback: echo how the assistant message begins.
    opening = ' '.join(assistant_content.split()[:3])
    return f"start like {opening}"
|
17 |
-
|
18 |
-
# Function to add a System role to the conversation
|
19 |
-
def add_system_role(conversation, total_turns):
    """Make the conversation start with a system turn, editing it in place.

    Args:
        conversation: List of turn dicts; the code reads and writes their
            "from" and "value" keys.
        total_turns: Turn count that selects the strategy. When it is odd,
            the existing first turn is relabelled as the system turn. When it
            is even, a new system turn is inserted at the front, with text
            chosen by ``get_system_content`` from the second turn's value.

    Returns:
        The same list object that was passed in.
    """
    if total_turns % 2 != 0:
        # Odd: reuse the first turn as the system turn, so the length is
        # unchanged and stays odd.
        conversation[0]["from"] = "system"
        return conversation

    # Even: only this branch needs the second turn's text. Looking it up here
    # (rather than unconditionally) keeps one-turn conversations from raising
    # IndexError on the odd path, and the length guard covers conversations
    # with fewer than two turns.
    assistant_content = conversation[1]["value"] if len(conversation) > 1 else ""
    system_content = get_system_content(assistant_content)
    conversation.insert(0, {"from": "system", "value": system_content})
    return conversation
|
29 |
-
|
30 |
-
# Function to reformat a single conversation
|
31 |
-
def reformat_conversation(conversation):
    """Convert one conversation into a list of content/do_train/role records.

    The conversation is first passed through ``add_system_role`` (which
    modifies the list it is given). Each resulting turn then becomes a dict
    with:

    * "content"  -- the turn's "value";
    * "do_train" -- True or False, picked at random per turn;
    * "role"     -- "System" when the turn's "from" is "system", otherwise
      "User" at odd positions and "Assistant" at even positions.

    Args:
        conversation: List of turn dicts with "from" and "value" keys.

    Returns:
        A new list of reformatted turn dicts, in the same order.
    """
    turns = add_system_role(conversation, len(conversation))

    records = []
    for position, turn in enumerate(turns):
        if turn["from"] == "system":
            role = "System"
        elif position % 2 == 0:
            role = "Assistant"
        else:
            role = "User"

        record = {
            "content": turn["value"],
            "do_train": random.choice([True, False]),
            "role": role,
        }
        records.append(record)
    return records
|
44 |
-
|
45 |
-
# Function to load all .jsonl files, reformat them, and ensure odd number of turns
|
46 |
-
def load_and_reformat_conversations(exclude=("combined_conversations.jsonl",)):
    """Load every ``*.jsonl`` file in the current directory and reformat it.

    Each non-blank line is parsed as a JSON object, and its "conversations"
    entry is passed through ``reformat_conversation``. The combined results
    are shuffled before being returned. This function does not itself check
    turn counts.

    Args:
        exclude: File names to skip. The default skips the file this script
            writes its own output to, so that re-running in the same
            directory does not try to re-ingest it (its records have no
            "conversations" key).

    Returns:
        A shuffled list of ``{"conversation": [...]}`` dicts.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no "conversations" key.
    """
    all_conversations = []
    input_files = [name for name in glob("*.jsonl") if name not in exclude]

    # Outer progress bar tracks files; the inner one tracks lines of one file.
    for file_name in tqdm(input_files, desc="Processing files"):
        # Decode as UTF-8 explicitly instead of relying on the platform's
        # locale default.
        with open(file_name, 'r', encoding="utf-8") as file:
            for line in tqdm(file, desc=f"Processing {file_name}", leave=False):
                # Blank lines (for example a trailing newline at the end of
                # the file) are not records; skip them rather than crash.
                if not line.strip():
                    continue
                data = json.loads(line)
                reformatted_convo = reformat_conversation(data['conversations'])
                all_conversations.append({"conversation": reformatted_convo})

    # Mix conversations from all files together.
    random.shuffle(all_conversations)
    return all_conversations
|
63 |
-
|
64 |
-
# Execute the reformatting function and save the result
|
65 |
-
reformatted_conversations = load_and_reformat_conversations()

# Sanity check before anything is written: no reformatted conversation may
# have an even number of turns.
odd_turns_check = not any(
    len(convo["conversation"]) % 2 == 0 for convo in reformatted_conversations
)
if not odd_turns_check:
    raise ValueError("Some conversations have an even number of turns after reformatting.")

# Write one JSON object per line (JSON Lines) to a new file in the current
# directory.
output_file = 'combined_conversations.jsonl'
with open(output_file, 'w') as outfile:
    for convo in reformatted_conversations:
        outfile.write(json.dumps(convo) + '\n')

# Bare expression: evaluates the output file name and discards it when this
# file is run as a plain script.
output_file
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|