Alignment-Lab-AI's picture
Upload folder using huggingface_hub
cf4acc9
raw
history blame
3.55 kB
import json
import random
import re
from tqdm import tqdm
from glob import glob
# Pick the system prompt that matches the style of an assistant reply
def get_system_content(assistant_content):
    """Choose a system prompt from the text of an assistant turn.

    The checks run in order and the first one that hits wins:

    1. The text contains one of the whole words ``int``, ``float``,
       ``char``, ``struct``, ``for``, ``while``, ``if`` or ``else``.
    2. The text contains a ``*...*`` span with at least one non-asterisk
       character between the two asterisks.
    3. Otherwise the prompt quotes the first three whitespace-separated
       words of the text (fewer when the text is shorter).

    NOTE(review): the keyword list contains common English words
    (``for``, ``if``, ``while``, ``else``), so ordinary prose also hits
    rule 1 -- confirm this is intended.

    Args:
        assistant_content: Text of the assistant turn to inspect.

    Returns:
        The system prompt string selected by the rules above.
    """
    keyword_hit = re.search(
        r'\b(?:int|float|char|struct|for|while|if|else)\b', assistant_content
    )
    if keyword_hit is not None:
        return "you are a genius!"

    if re.search(r"\*[^*]+\*", assistant_content) is not None:
        return "lets tell a story"

    # Fall back to echoing how the reply opens (up to three words)
    opening_words = assistant_content.split()[:3]
    return "start like " + ' '.join(opening_words)
# Function to add a System role to the conversation
def add_system_role(conversation, total_turns):
    """Give the conversation a leading system turn, modifying it in place.

    When ``total_turns`` is even, a new system turn is inserted at the
    front; its text is derived from the second turn (index 1) via
    ``get_system_content``. When ``total_turns`` is odd, the first turn is
    relabelled as the system turn and nothing is inserted.

    Args:
        conversation: List of turn dicts with ``"from"`` and ``"value"``
            keys. The list (and, in the odd case, its first dict) is
            mutated.
        total_turns: Turn count whose parity selects the strategy.

    Returns:
        The same list object that was passed in.

    Raises:
        IndexError: If ``total_turns`` is even but the conversation has
            fewer than two turns, or odd but the conversation is empty.
    """
    if total_turns % 2 == 0:
        # Only this branch needs the second turn, so look it up here;
        # reading it unconditionally broke single-turn conversations.
        system_content = get_system_content(conversation[1]["value"])
        # Insert the new System turn at the beginning
        conversation.insert(0, {"from": "system", "value": system_content})
    else:
        # Odd: reuse the first turn as the System turn
        conversation[0]["from"] = "system"
    return conversation
# Convert one raw conversation into the output turn format
def reformat_conversation(conversation):
    """Reformat a single conversation into content/do_train/role dicts.

    The conversation is first passed through ``add_system_role`` together
    with its own length. Each resulting turn then becomes a dict with the
    keys ``"content"`` (the turn's ``"value"``), ``"do_train"`` (a random
    boolean) and ``"role"``.

    NOTE(review): apart from turns marked ``"system"``, roles are assigned
    purely by position (odd index -> "User", even index -> "Assistant"),
    not by the original ``"from"`` value -- confirm this is intended.

    Args:
        conversation: List of turn dicts with ``"from"`` and ``"value"``
            keys.

    Returns:
        A new list with one reformatted dict per turn.
    """
    turns = add_system_role(conversation, len(conversation))

    reformatted = []
    for position, turn in enumerate(turns):
        if turn["from"] == "system":
            role = "System"
        elif position % 2:
            role = "User"
        else:
            role = "Assistant"
        reformatted.append(
            dict(
                content=turn["value"],
                do_train=random.choice([True, False]),
                role=role,
            )
        )
    return reformatted
# Function to load all .jsonl files, reformat them, and shuffle the result
def load_and_reformat_conversations(pattern="*.jsonl",
                                    exclude=("combined_conversations.jsonl",)):
    """Load every matching .jsonl file and reformat its conversations.

    Each non-blank line is parsed as a JSON object whose ``"conversations"``
    entry is passed to ``reformat_conversation``. The combined list is
    shuffled before it is returned.

    Args:
        pattern: Glob pattern selecting the input files. Defaults to every
            ``.jsonl`` file in the current directory.
        exclude: Paths (exactly as ``glob`` returns them) to skip. The
            default skips the file this script writes, so a re-run does not
            ingest its own output. Pass ``()`` to disable.

    Returns:
        A shuffled list of ``{"conversation": [...]}`` dicts.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"conversations"`` entry.
    """
    skipped = set(exclude)
    file_names = [name for name in glob(pattern) if name not in skipped]

    all_conversations = []
    # Iterate over the selected files with a progress bar
    for file_name in tqdm(file_names, desc="Processing files"):
        # JSON text is UTF-8; do not depend on the platform default encoding
        with open(file_name, 'r', encoding="utf-8") as file:
            # Process each line in the current file with a progress bar
            for line in tqdm(file, desc=f"Processing {file_name}", leave=False):
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                if not line.strip():
                    continue
                data = json.loads(line)
                reformatted_convo = reformat_conversation(data['conversations'])
                all_conversations.append({"conversation": reformatted_convo})

    # Shuffle the combined list of all conversations
    random.shuffle(all_conversations)
    return all_conversations
# Run the pipeline: reformat every input file, validate, then save
reformatted_conversations = load_and_reformat_conversations()

# Every reformatted conversation must contain an odd number of turns
odd_turns_check = not any(
    len(convo["conversation"]) % 2 == 0 for convo in reformatted_conversations
)
if not odd_turns_check:
    raise ValueError("Some conversations have an even number of turns after reformatting.")

# Write one JSON object per line to the combined output file
output_file = 'combined_conversations.jsonl'
with open(output_file, 'w') as outfile:
    for convo in reformatted_conversations:
        outfile.write(json.dumps(convo) + '\n')

# Bare expression naming the output file; it has no effect when run as a script
output_file