File size: 3,548 Bytes
cf4acc9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import json
import random
import re
from tqdm import tqdm
from glob import glob

# Function to check for special content and return appropriate system content
def get_system_content(assistant_content):
    if re.search(r'\b(?:int|float|char|struct|for|while|if|else)\b', assistant_content):
        return "you are a genius!"
    elif re.search(r"\*[^*]+\*", assistant_content):
        return "lets tell a story"
    else:
        # Get the first three words from the assistant's turn
        first_three_words = ' '.join(assistant_content.split()[:3])
        return f"start like {first_three_words}"

# Function to add a System role to the conversation
def add_system_role(conversation, total_turns):
    """Make the first turn of ``conversation`` a system turn, in place.

    Args:
        conversation: List of turn dicts, each with ``"from"`` and
            ``"value"`` keys. The list (and, in the odd case, its first
            dict) is mutated.
        total_turns: Turn count used to pick the strategy.

    Returns:
        The same ``conversation`` list object that was passed in.

    Raises:
        IndexError: If ``total_turns`` is even but the conversation has
            fewer than two turns, or if it is odd and the list is empty.
    """
    if total_turns % 2 == 0:
        # Even: prepend a brand-new system turn whose text is derived from
        # the second turn (treated as the first assistant turn). The lookup
        # lives in this branch because it is the only one that needs it;
        # reading it unconditionally broke single-turn conversations.
        assistant_content = conversation[1]["value"]
        system_content = get_system_content(assistant_content)
        conversation.insert(0, {"from": "system", "value": system_content})
    else:
        # Odd: reuse the existing first turn as the system turn, so the
        # total number of turns is unchanged.
        conversation[0]["from"] = "system"
    return conversation

# Convert one raw conversation into the output turn layout
def reformat_conversation(conversation):
    """Return ``conversation`` as a list of ``content``/``do_train``/``role`` dicts.

    The conversation is first passed through ``add_system_role`` (which
    mutates it). Roles are then assigned by position: a turn whose
    ``"from"`` is ``"system"`` becomes ``"System"``; otherwise odd indices
    become ``"User"`` and even indices ``"Assistant"``. ``do_train`` is a
    random boolean drawn independently for every turn.
    """
    turns = add_system_role(conversation, len(conversation))

    converted = []
    for position, turn in enumerate(turns):
        # Role depends on the system marker first, then on index parity.
        if turn["from"] == "system":
            role = "System"
        elif position % 2 == 1:
            role = "User"
        else:
            role = "Assistant"

        converted.append(
            dict(
                content=turn["value"],
                do_train=random.choice([True, False]),
                role=role,
            )
        )
    return converted

# Function to load all .jsonl files, reformat them, and shuffle the result
def load_and_reformat_conversations(exclude=("combined_conversations.jsonl",)):
    """Load every ``*.jsonl`` file in the current directory and reformat it.

    Each non-blank line is parsed as a JSON object whose ``"conversations"``
    entry is passed to ``reformat_conversation``.

    Args:
        exclude: File names to skip. Defaults to the name this script writes
            its combined output to, so that a re-run in the same directory
            does not try to re-read its own output (whose records have no
            ``"conversations"`` key).

    Returns:
        A shuffled list of ``{"conversation": [...]}`` dicts, one per input
        line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"conversations"`` key.
    """
    skipped_names = set(exclude)
    input_files = [name for name in glob("*.jsonl") if name not in skipped_names]

    all_conversations = []
    # Iterate over the input files with a progress bar
    for file_name in tqdm(input_files, desc="Processing files"):
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(file_name, 'r', encoding='utf-8') as file:
            # Process each line in the current file with a progress bar
            for line in tqdm(file, desc=f"Processing {file_name}", leave=False):
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                data = json.loads(line)
                reformatted_convo = reformat_conversation(data['conversations'])
                all_conversations.append({"conversation": reformatted_convo})

    # Shuffle the combined list of all conversations
    random.shuffle(all_conversations)
    return all_conversations

# Run the pipeline: load, reformat and shuffle every conversation
reformatted_conversations = load_and_reformat_conversations()

# Sanity check: no conversation may end up with an even number of turns
odd_turns_check = not any(
    len(convo["conversation"]) % 2 == 0
    for convo in reformatted_conversations
)
if not odd_turns_check:
    raise ValueError("Some conversations have an even number of turns after reformatting.")

# Write the result as JSON Lines: one conversation object per line
output_file = 'combined_conversations.jsonl'
with open(output_file, 'w') as outfile:
    for convo in reformatted_conversations:
        outfile.write(json.dumps(convo) + '\n')

# Bare expression kept from the original; it has no effect when run as a
# script and only displays the name in an interactive session.
output_file