|
import json |
|
import random |
|
import re |
|
from tqdm import tqdm |
|
from glob import glob |
|
|
|
|
|
def get_system_content(assistant_content):
    """Choose a system prompt string from the text of an assistant reply.

    The checks run in order and the first match wins:

    1. The text contains one of the words ``int``, ``float``, ``char``,
       ``struct``, ``for``, ``while``, ``if`` or ``else`` as a whole word.
    2. The text contains a span wrapped in single asterisks (``*like this*``).
    3. Otherwise the prompt echoes up to the first three
       whitespace-separated words of the text.

    Args:
        assistant_content: The reply text to inspect.

    Returns:
        The system prompt string for the matching case.
    """
    keyword_pattern = r'\b(?:int|float|char|struct|for|while|if|else)\b'
    if re.search(keyword_pattern, assistant_content) is not None:
        return "you are a genius!"

    if re.search(r"\*[^*]+\*", assistant_content) is not None:
        return "lets tell a story"

    # Fewer than three words (or none at all) simply yields a shorter suffix.
    opening_words = assistant_content.split()[:3]
    return "start like " + ' '.join(opening_words)
|
|
|
|
|
def add_system_role(conversation, total_turns):
    """Make sure the conversation begins with a system turn.

    The list is modified in place and the same list object is returned.

    * Even ``total_turns``: a new ``{"from": "system", ...}`` turn is inserted
      at the front. Its text is derived by ``get_system_content`` from the
      ``"value"`` of the turn at index 1 (treated as the assistant reply).
    * Odd ``total_turns``: the existing first turn is relabelled by setting
      its ``"from"`` field to ``"system"``.

    Args:
        conversation: List of turn dicts with ``"from"`` and ``"value"`` keys.
        total_turns: Turn count used to pick the branch; the visible caller
            passes ``len(conversation)``.

    Returns:
        The same ``conversation`` list, now starting with a system turn.

    Raises:
        IndexError: If the branch taken needs a turn that does not exist
            (index 1 for even counts, index 0 for odd counts).
    """
    if total_turns % 2 == 0:
        # Read the reply only in the branch that uses it, so a one-turn
        # (odd) conversation no longer fails on a missing index 1.
        assistant_content = conversation[1]["value"]
        system_content = get_system_content(assistant_content)
        conversation.insert(0, {"from": "system", "value": system_content})
    else:
        conversation[0]["from"] = "system"
    return conversation
|
|
|
|
|
def reformat_conversation(conversation):
    """Convert raw turns into ``content`` / ``do_train`` / ``role`` records.

    The conversation is first passed through ``add_system_role`` (which
    modifies the given list in place). Each turn is then mapped to a dict:

    * ``"content"``: the turn's ``"value"``.
    * ``"do_train"``: ``True`` or ``False``, picked at random per turn.
    * ``"role"``: ``"System"`` when the turn's ``"from"`` is ``"system"``;
      otherwise ``"User"`` at odd positions and ``"Assistant"`` at even ones.

    Args:
        conversation: List of turn dicts with ``"from"`` and ``"value"`` keys.

    Returns:
        A new list with one reformatted dict per turn, in the original order.
    """
    turns = add_system_role(conversation, len(conversation))

    records = []
    for position, turn in enumerate(turns):
        # Resolve the role label first; non-system turns alternate by index.
        if turn["from"] == "system":
            role = "System"
        elif position % 2 == 1:
            role = "User"
        else:
            role = "Assistant"

        record = {
            "content": turn["value"],
            "do_train": random.choice([True, False]),
            "role": role,
        }
        records.append(record)
    return records
|
|
|
|
|
def load_and_reformat_conversations(exclude=("combined_conversations.jsonl",)):
    """Load every ``*.jsonl`` file in the working directory and reformat it.

    Each non-blank line is parsed as one JSON object. Its ``'conversations'``
    entry is passed through ``reformat_conversation`` and the result is
    collected as ``{"conversation": [...]}``. The collected list is shuffled
    before being returned.

    Args:
        exclude: File names, as returned by ``glob("*.jsonl")``, to skip.
            The default skips this script's own output file: it also matches
            ``*.jsonl``, and its records have no ``'conversations'`` key, so
            reading it on a re-run would raise ``KeyError``.

    Returns:
        A shuffled list of ``{"conversation": reformatted_turns}`` dicts.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed record has no ``'conversations'`` key.
    """
    all_conversations = []

    input_files = [name for name in glob("*.jsonl") if name not in exclude]
    for file_name in tqdm(input_files, desc="Processing files"):
        # Decode as UTF-8 explicitly rather than relying on the platform's
        # default text encoding.
        with open(file_name, 'r', encoding='utf-8') as file:
            for line in tqdm(file, desc=f"Processing {file_name}", leave=False):
                # Blank lines carry no record and would make json.loads fail.
                if not line.strip():
                    continue
                data = json.loads(line)
                reformatted_convo = reformat_conversation(data['conversations'])
                all_conversations.append({"conversation": reformatted_convo})

    random.shuffle(all_conversations)
    return all_conversations
|
|
|
|
|
# Load, reformat and shuffle all conversations found by the loader.
reformatted_conversations = load_and_reformat_conversations()

# Sanity check: every reformatted conversation must have an odd turn count.
odd_turns_check = not any(
    len(convo["conversation"]) % 2 == 0
    for convo in reformatted_conversations
)
if not odd_turns_check:
    raise ValueError("Some conversations have an even number of turns after reformatting.")

# Write the result as JSON Lines: one conversation object per line.
output_file = 'combined_conversations.jsonl'
with open(output_file, 'w') as outfile:
    for convo in reformatted_conversations:
        outfile.write(json.dumps(convo) + '\n')

# Bare expression: a no-op when run as a script (presumably left over from an
# interactive session, where it would echo the output path).
output_file
|
|