"""Combine conversation ``.jsonl`` files into one shuffled training file.

Each input line is a JSON object with a ``"conversations"`` list of turns
shaped like ``{"from": ..., "value": ...}``.  Every conversation is given a
leading system turn, converted to ``{"content", "do_train", "role"}`` turns,
and written (one JSON object per line) to ``combined_conversations.jsonl``.
"""
import json
import os
import random
import re
from glob import glob

try:
    from tqdm import tqdm
except ImportError:  # the progress bar is cosmetic; keep working without it
    def tqdm(iterable, **_kwargs):
        """Stand-in for ``tqdm.tqdm`` that just returns the iterable."""
        return iterable

# Name of the file the script writes its combined result to.
OUTPUT_FILE = 'combined_conversations.jsonl'

# Whole-word match on a handful of C-like keywords.
# NOTE: "for", "if", "else" and "while" are also everyday English words, so
# this pattern fires on plain prose as well as on code.
_CODE_KEYWORD_RE = re.compile(r'\b(?:int|float|char|struct|for|while|if|else)\b')
# A span of text wrapped in single asterisks, e.g. "*waves*".
_ASTERISK_SPAN_RE = re.compile(r"\*[^*]+\*")


def get_system_content(assistant_content):
    """Choose a system prompt from the text of an assistant turn.

    Returns:
        ``"you are a genius!"`` if the text contains one of the keywords in
        ``_CODE_KEYWORD_RE``; otherwise ``"lets tell a story"`` if it contains
        an asterisk-wrapped span; otherwise ``"start like "`` followed by the
        first three whitespace-separated words of the text.
    """
    if _CODE_KEYWORD_RE.search(assistant_content):
        return "you are a genius!"
    if _ASTERISK_SPAN_RE.search(assistant_content):
        return "lets tell a story"
    first_three_words = ' '.join(assistant_content.split()[:3])
    return f"start like {first_three_words}"


def add_system_role(conversation, total_turns):
    """Make the conversation start with a ``"system"`` turn (in place).

    Args:
        conversation: list of ``{"from": ..., "value": ...}`` turns; it is
            modified in place.
        total_turns: number of turns used for the even/odd decision (the
            caller in this file passes ``len(conversation)``).

    Returns:
        The same ``conversation`` list.  For an even ``total_turns`` a new
        system turn, derived from the second turn's text, is inserted at the
        front; for an odd ``total_turns`` the first turn is relabelled as the
        system turn.

    Raises:
        IndexError: if ``total_turns`` is even but there is no second turn to
            derive the system prompt from (e.g. an empty conversation).
    """
    if total_turns % 2 == 0:
        if len(conversation) < 2:
            raise IndexError(
                "cannot derive a system prompt: conversation has no second turn"
            )
        # The second turn is treated as the first assistant reply
        # (presumably the data is user-first -- confirm against the inputs).
        system_content = get_system_content(conversation[1]["value"])
        conversation.insert(0, {"from": "system", "value": system_content})
    else:
        # Only the first turn is touched here, so a single-turn conversation
        # is valid input and must not be indexed at position 1.
        conversation[0]["from"] = "system"
    return conversation


def reformat_conversation(conversation):
    """Convert one raw conversation into the output turn format.

    The input list is modified by ``add_system_role``.  Each resulting turn
    becomes ``{"content", "do_train", "role"}`` where ``role`` is ``"System"``
    for turns marked ``"system"``, and otherwise ``"User"`` at odd positions
    and ``"Assistant"`` at even positions.  ``do_train`` is a random boolean.
    """
    conversation = add_system_role(conversation, len(conversation))

    reformatted_convo = []
    for i, turn in enumerate(conversation):
        if turn["from"] == "system":
            role = "System"
        else:
            role = "User" if i % 2 == 1 else "Assistant"
        reformatted_convo.append({
            "content": turn["value"],
            "do_train": random.choice([True, False]),
            "role": role
        })
    return reformatted_convo


def load_and_reformat_conversations(pattern="*.jsonl", exclude=(OUTPUT_FILE,)):
    """Load, reformat and shuffle every conversation found by ``pattern``.

    Args:
        pattern: glob pattern selecting the input files (defaults to every
            ``.jsonl`` file in the current directory).
        exclude: paths that must not be read even if they match ``pattern``.
            Defaults to the script's own output file, which also ends in
            ``.jsonl`` and would otherwise be re-read on a second run.

    Returns:
        A shuffled list of ``{"conversation": [...]}`` dicts.
    """
    excluded = {os.path.abspath(path) for path in exclude}
    # Sorted so the pre-shuffle order does not depend on directory listing
    # order, which keeps runs reproducible under a fixed random seed.
    file_names = [
        name for name in sorted(glob(pattern))
        if os.path.abspath(name) not in excluded
    ]

    all_conversations = []
    for file_name in tqdm(file_names, desc="Processing files"):
        with open(file_name, 'r', encoding='utf-8') as file:
            for line in tqdm(file, desc=f"Processing {file_name}", leave=False):
                if not line.strip():
                    continue  # tolerate blank lines between records
                data = json.loads(line)
                reformatted_convo = reformat_conversation(data['conversations'])
                all_conversations.append({"conversation": reformatted_convo})

    random.shuffle(all_conversations)
    return all_conversations


def main(output_file=OUTPUT_FILE):
    """Run the full pipeline and write the result to ``output_file``.

    Returns:
        The name of the file that was written.

    Raises:
        ValueError: if any reformatted conversation has an even number of
            turns.
    """
    reformatted_conversations = load_and_reformat_conversations(
        exclude=(output_file,)
    )

    # Every conversation should be one system turn plus user/assistant pairs.
    odd_turns_check = all(
        len(convo["conversation"]) % 2 != 0
        for convo in reformatted_conversations
    )
    if not odd_turns_check:
        raise ValueError("Some conversations have an even number of turns after reformatting.")

    with open(output_file, 'w', encoding='utf-8') as outfile:
        for convo in reformatted_conversations:
            json.dump(convo, outfile)
            outfile.write('\n')
    return output_file


if __name__ == "__main__":
    output_file = main()