Commit
·
cf4acc9
1
Parent(s):
0dd4f1b
Upload folder using huggingface_hub
Browse files- .gitattributes +3 -0
- cc.jsonl +3 -0
- combined_conversations.jsonl +3 -0
- dd.jsonl +3 -0
- script.py +80 -0
- script2.py +47 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
cc.jsonl filter=lfs diff=lfs merge=lfs -text
|
37 |
+
combined_conversations.jsonl filter=lfs diff=lfs merge=lfs -text
|
38 |
+
dd.jsonl filter=lfs diff=lfs merge=lfs -text
|
cc.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:abe1ef071fe87d40adee42190c42e514f1739262d2dcc9017dad256d5277c5c5
|
3 |
+
size 467189195
|
combined_conversations.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:35150095d33f2455275f7ba2e7e2004d46f66bf8a301b6e8d8678cc16231fedf
|
3 |
+
size 1829804506
|
dd.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:58ab571bc54f28c5dc71dfd40ad1801cc09acbe69d5413096797478192471a53
|
3 |
+
size 1362615311
|
script.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import random
|
3 |
+
import re
|
4 |
+
from tqdm import tqdm
|
5 |
+
from glob import glob
|
6 |
+
|
# Choose the system prompt for a conversation from its first assistant reply.
def get_system_content(assistant_content):
    """Return a system-prompt string derived from *assistant_content*.

    Priority order:
      1. Looks like C-style code (contains a bare keyword) -> coding prompt.
      2. Contains *asterisk-wrapped* roleplay text -> storytelling prompt.
      3. Otherwise -> a "start like ..." prompt seeded with the reply's
         first three whitespace-separated words.
    """
    code_keywords = r'\b(?:int|float|char|struct|for|while|if|else)\b'
    if re.search(code_keywords, assistant_content):
        return "you are a genius!"
    if re.search(r"\*[^*]+\*", assistant_content):
        return "lets tell a story"
    # Fall back to echoing the opening words of the assistant's turn.
    opening_words = ' '.join(assistant_content.split()[:3])
    return f"start like {opening_words}"
# Ensure the conversation begins with a system turn (mutates in place).
def add_system_role(conversation, total_turns):
    """Give *conversation* a leading system turn and return it.

    Args:
        conversation: list of {"from": ..., "value": ...} turns; index 1 is
            read unconditionally, so at least two turns are required.
        total_turns: the caller-supplied turn count (parity decides the path).

    Even turn count: prepend a brand-new system turn (making the count odd).
    Odd turn count: relabel the opening turn as the system turn.
    """
    first_reply = conversation[1]["value"]
    if total_turns % 2 != 0:
        # Odd: repurpose the first turn as the system turn.
        conversation[0]["from"] = "system"
    else:
        # Even: insert a fresh system turn derived from the first reply.
        conversation.insert(0, {"from": "system", "value": get_system_content(first_reply)})
    return conversation
# Convert one conversation into the content/do_train/role record format.
def reformat_conversation(conversation):
    """Reformat *conversation* into a list of
    {"content", "do_train", "role"} dicts.

    A system turn is injected/relabelled first via add_system_role, then
    roles are assigned: explicit system turns become "System"; the rest
    alternate by index parity (odd -> "User", even -> "Assistant").
    NOTE(review): the User/Assistant split relies on index parity, not on
    the turn's "from" field — verify the data always alternates strictly.
    do_train is a coin flip per turn.
    """
    conversation = add_system_role(conversation, len(conversation))
    records = []
    for index, turn in enumerate(conversation):
        if turn["from"] == "system":
            speaker = "System"
        elif index % 2 == 1:
            speaker = "User"
        else:
            speaker = "Assistant"
        records.append({
            "content": turn["value"],
            "do_train": random.choice([True, False]),
            "role": speaker,
        })
    return records
# Load all local .jsonl files, reformat every conversation, shuffle, return.
def load_and_reformat_conversations():
    """Read every ``*.jsonl`` file in the current directory, reformat each
    line's ``conversations`` payload, and return the shuffled result.

    Returns:
        list[dict]: items shaped {"conversation": [reformatted turns]}.

    NOTE(review): on a second run the output file combined_conversations.jsonl
    also matches ``*.jsonl`` and would be re-ingested — confirm this script is
    only run once per clean directory.
    """
    all_conversations = []
    # Fix: removed the dead `even_conversations_count` counter — it was
    # initialised but never updated or read.
    for file_name in tqdm(glob("*.jsonl"), desc="Processing files"):
        with open(file_name, 'r') as file:
            # One JSON object per line; nested tqdm shows per-file progress.
            for line in tqdm(file, desc=f"Processing {file_name}", leave=False):
                data = json.loads(line)
                reformatted_convo = reformat_conversation(data['conversations'])
                all_conversations.append({"conversation": reformatted_convo})
    # Shuffle so downstream training order does not follow file order.
    random.shuffle(all_conversations)
    return all_conversations
# --- Script entry point: build, validate, and persist the dataset. ---
reformatted_conversations = load_and_reformat_conversations()

# Every conversation must end with an odd turn count (one system turn plus
# alternating user/assistant pairs); abort loudly otherwise.
if not all(len(convo["conversation"]) % 2 != 0 for convo in reformatted_conversations):
    raise ValueError("Some conversations have an even number of turns after reformatting.")

# Persist one JSON object per line.
output_file = 'combined_conversations.jsonl'
with open(output_file, 'w') as outfile:
    for convo in reformatted_conversations:
        json.dump(convo, outfile)
        outfile.write('\n')

# Bare expression — a leftover notebook-cell "return"; has no effect when
# this file is run as a script.
output_file
script2.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json
import os
import random
import shutil
from glob import glob

from huggingface_hub import Repository

# File that receives the shuffled, combined dataset.
output_file_name = "combined_conversations.jsonl"
# Destination dataset repository on the Hugging Face Hub.
repo_id = "AlignmentLab-AI/idonteven"
# Merge every local .jsonl file into one shuffled output file.
def shuffle_and_combine_jsonls(output_file_name):
    """Concatenate all ``*.jsonl`` files in the cwd, shuffle the records,
    and write them to *output_file_name*.

    Args:
        output_file_name: path of the combined file to create.
    Returns:
        str: *output_file_name*, unchanged, for chaining.

    Fixes over the original:
      * the output file itself is excluded from the glob, so a re-run no
        longer re-ingests (and duplicates) previously combined records;
      * every record is forced to end with a newline, so a source file
        whose last line is unterminated can no longer merge with the
        record shuffled in after it.
    """
    all_lines = []
    for jsonl_file in glob("*.jsonl"):
        if jsonl_file == output_file_name:
            continue  # never read our own output back in
        with open(jsonl_file, 'r') as file:
            for line in file:
                if not line.endswith('\n'):
                    line += '\n'  # keep records separated after shuffling
                all_lines.append(line)
    random.shuffle(all_lines)
    with open(output_file_name, 'w') as outfile:
        outfile.writelines(all_lines)
    return output_file_name
# Clone the target repository from the Hub into the working directory.
def clone_repository(repo_id):
    """Clone *repo_id* from the Hugging Face Hub and return the handle.

    The first argument to Repository is the local directory, so the clone
    lands in a path named after the repo id.
    NOTE(review): huggingface_hub.Repository is the legacy git-based API —
    consider HfApi/upload_folder for new code.
    """
    return Repository(repo_id, clone_from=repo_id)
# Copy the combined jsonl file and supporting scripts into the cloned repo.
def copy_files_to_repo(combined_jsonl_path):
    """Copy *combined_jsonl_path* and every other regular file in the cwd
    into the local clone directory named by the module-level ``repo_id``.

    Fixes over the original:
      * ``shutil.copy`` replaces ``os.system(f"cp ...")`` — works with
        spaces in paths, is not shell-injectable, is portable, and raises
        on failure instead of silently returning a status code;
      * directories (e.g. the clone itself) are skipped — plain ``cp``
        on a directory simply failed;
      * the combined file is no longer copied twice (once explicitly and
        once again via the ``glob("*")`` loop).
    """
    destination = repo_id
    # Copy the combined dataset file first.
    shutil.copy(combined_jsonl_path, destination)
    # Then every other regular file (scripts, etc.); skip directories.
    for name in glob("*"):
        if name == combined_jsonl_path or not os.path.isfile(name):
            continue
        shutil.copy(name, destination)
# Stage, commit, and push all pending changes in the local clone.
def push_to_hub(repo):
    """Stage everything in *repo*, commit with a fixed message, and push."""
    commit_message = "Update dataset"
    repo.git_add()
    repo.git_commit(commit_message)
    repo.git_push()
# Run the full publish pipeline, in order:
# 1. merge and shuffle the local .jsonl files,
combined_jsonl_path = shuffle_and_combine_jsonls(output_file_name)
# 2. clone the destination repository from the Hub,
repo = clone_repository(repo_id)
# 3. stage the combined file and scripts inside the clone,
copy_files_to_repo(combined_jsonl_path)
# 4. commit and push the result.
push_to_hub(repo)