Spaces:

UcsTurkey
/

mistral7b

Paused

ciyidogan committed on May 20

Commit

f99c444

verified ·

1 Parent(s): ea55d0f

Update tokenize_and_upload_mistral.py

Files changed (1) hide show

tokenize_and_upload_mistral.py CHANGED Viewed

@@ -73,7 +73,8 @@ buffer_counter_train = 0
 buffer_counter_rag = 0
 def tokenize(example):
-    prompt = f"SORU: {example['instruction']}\nCEVAP: {example['output']}"
     tokenized = tokenizer(prompt, truncation=True, padding="max_length", max_length=MAX_LEN)
     tokenized["labels"] = [
         -100 if token_id == tokenizer.pad_token_id else token_id for token_id in tokenized["input_ids"]
@@ -125,6 +126,7 @@ for idx, filename in enumerate(selected_files):
             buffer_counter_train += 1
             if buffer_counter_train >= BUFFER_SIZE:
                 buffer_counter_train = upload_if_ready(TRAIN_FOLDER, TRAIN_TARGET_DATASET_ID)
     except Exception as e:
         log(f"❌ Hata oluştu: {filename} → {e}")
         traceback.print_exc()

 buffer_counter_rag = 0
 def tokenize(example):
+    # ✅ Mistral-7B-Instruct formatına uygun prompt
+    prompt = f"### Instruction:\n{example['instruction']}\n\n### Response:\n{example['output']}"
     tokenized = tokenizer(prompt, truncation=True, padding="max_length", max_length=MAX_LEN)
     tokenized["labels"] = [
         -100 if token_id == tokenizer.pad_token_id else token_id for token_id in tokenized["input_ids"]
             buffer_counter_train += 1
             if buffer_counter_train >= BUFFER_SIZE:
                 buffer_counter_train = upload_if_ready(TRAIN_FOLDER, TRAIN_TARGET_DATASET_ID)
     except Exception as e:
         log(f"❌ Hata oluştu: {filename} → {e}")
         traceback.print_exc()