Alignment-Lab-AI committed on
Commit
c6a30f0
·
1 Parent(s): 2c4519d

Delete script.py

Browse files
Files changed (1) hide show
  1. script.py +0 -80
script.py DELETED
@@ -1,80 +0,0 @@
1
- import json
2
- import random
3
- import re
4
- from tqdm import tqdm
5
- from glob import glob
6
-
7
- # Function to check for special content and return appropriate system content
8
def get_system_content(assistant_content):
    """Choose a system-prompt string from the text of an assistant turn.

    The checks run in priority order:

    1. If the text contains any of the whole words int, float, char, struct,
       for, while, if or else, return "you are a genius!".  Note that several
       of these are also everyday English words, so plain prose matches too.
    2. Otherwise, if the text contains a span wrapped in single asterisks
       (e.g. ``*waves*``), return "lets tell a story".
    3. Otherwise return "start like " followed by the first three
       whitespace-separated words of the text (fewer if the text is shorter).

    Args:
        assistant_content: The assistant turn's text.

    Returns:
        The selected system-prompt string.
    """
    keyword_hit = re.search(
        r'\b(?:int|float|char|struct|for|while|if|else)\b', assistant_content
    )
    if keyword_hit:
        return "you are a genius!"

    if re.search(r"\*[^*]+\*", assistant_content):
        return "lets tell a story"

    # Fall back to echoing how the assistant's reply opens.
    opening_words = assistant_content.split()[:3]
    return "start like " + ' '.join(opening_words)
17
-
18
- # Function to add a System role to the conversation
19
def add_system_role(conversation, total_turns):
    """Give the conversation a leading system turn, modifying it in place.

    When ``total_turns`` is even, a new system turn is inserted at index 0;
    its text is derived via ``get_system_content`` from the second turn
    (presumably the first assistant reply).  When ``total_turns`` is odd, the
    existing first turn is relabelled as the system turn instead.  Either way
    an input whose length equals ``total_turns`` ends up with an odd number of
    turns.

    Args:
        conversation: List of turn dicts, each with "from" and "value" keys.
        total_turns: Turn count used for the parity decision.

    Returns:
        The same list object that was passed in, after modification.

    Raises:
        IndexError: If ``total_turns`` is even but the conversation has fewer
            than two turns, or if it is odd and the conversation is empty.
    """
    if total_turns % 2 == 0:
        # Only this branch needs the second turn.  Reading it here, rather
        # than unconditionally, lets a single-turn conversation take the odd
        # branch below instead of failing with IndexError.
        assistant_content = conversation[1]["value"]
        system_content = get_system_content(assistant_content)
        conversation.insert(0, {"from": "system", "value": system_content})
    else:
        # Odd count: reuse the first turn as the system turn.
        conversation[0]["from"] = "system"
    return conversation
29
-
30
- # Function to reformat a single conversation
31
def reformat_conversation(conversation):
    """Convert one raw conversation into the content/do_train/role layout.

    The conversation is first passed through ``add_system_role`` so that it
    starts with a system turn.  Each turn is then mapped to a dict with:

    * "content":  the turn's "value" text,
    * "do_train": a coin flip from ``random.choice([True, False])``,
    * "role":     "System" for turns marked "system"; otherwise decided purely
                  by position, odd indices being "User" and even indices
                  "Assistant".

    Args:
        conversation: List of turn dicts with "from" and "value" keys.

    Returns:
        A new list of reformatted turn dicts, in the original order.
    """
    turns = add_system_role(conversation, len(conversation))

    def convert(position, turn):
        # Role is resolved first, then the output record is assembled.
        if turn["from"] == "system":
            label = "System"
        elif position % 2 == 1:
            label = "User"
        else:
            label = "Assistant"
        return {
            "content": turn["value"],
            "do_train": random.choice([True, False]),
            "role": label,
        }

    return [convert(position, turn) for position, turn in enumerate(turns)]
44
-
45
- # Function to load all .jsonl files, reformat them, and ensure odd number of turns
46
def load_and_reformat_conversations(pattern="*.jsonl",
                                    exclude=("combined_conversations.jsonl",)):
    """Load matching JSON Lines files, reformat every conversation, and shuffle.

    Each non-blank line of each input file is parsed as JSON, its
    'conversations' list is run through ``reformat_conversation``, and the
    result is collected as ``{"conversation": [...]}``.

    Args:
        pattern: Glob pattern (relative to the current directory) selecting
            the input files.
        exclude: File names, as returned by ``glob``, to skip.  The default
            skips this script's own output file: on a re-run it would match
            the default pattern, and its records have no 'conversations' key.

    Returns:
        A randomly shuffled list of ``{"conversation": [...]}`` dicts.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no 'conversations' key.
    """
    all_conversations = []
    skipped_names = set(exclude)
    input_files = [name for name in glob(pattern) if name not in skipped_names]

    # Outer progress bar over files, inner (transient) bar over lines.
    for file_name in tqdm(input_files, desc="Processing files"):
        # JSON text is UTF-8; do not rely on the platform default encoding.
        with open(file_name, 'r', encoding='utf-8') as file:
            for line in tqdm(file, desc=f"Processing {file_name}", leave=False):
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                data = json.loads(line)
                reformatted_convo = reformat_conversation(data['conversations'])
                all_conversations.append({"conversation": reformatted_convo})

    # Mix conversations from all files together.
    random.shuffle(all_conversations)
    return all_conversations
63
-
64
- # Execute the reformatting function and save the result
65
# Run the whole pipeline: load, reformat and shuffle every conversation.
reformatted_conversations = load_and_reformat_conversations()

# Sanity check: every reformatted conversation must contain an odd number of
# turns; abort before writing anything if even one does not.
odd_turns_check = not any(
    len(entry["conversation"]) % 2 == 0 for entry in reformatted_conversations
)
if odd_turns_check is False:
    raise ValueError("Some conversations have an even number of turns after reformatting.")

# Write the result as JSON Lines: one conversation object per line.
output_file = 'combined_conversations.jsonl'
with open(output_file, 'w') as outfile:
    for convo in reformatted_conversations:
        outfile.write(json.dumps(convo) + '\n')

# Bare expression: shows the output file name when run in a notebook/REPL
# cell; it has no effect when run as a plain script.
output_file