Update tokenize_and_upload_mistral.py
Browse files
tokenize_and_upload_mistral.py
CHANGED
@@ -73,7 +73,8 @@ buffer_counter_train = 0
|
|
73 |
buffer_counter_rag = 0
|
74 |
|
75 |
def tokenize(example):
|
76 |
-
|
|
|
77 |
tokenized = tokenizer(prompt, truncation=True, padding="max_length", max_length=MAX_LEN)
|
78 |
tokenized["labels"] = [
|
79 |
-100 if token_id == tokenizer.pad_token_id else token_id for token_id in tokenized["input_ids"]
|
@@ -125,6 +126,7 @@ for idx, filename in enumerate(selected_files):
|
|
125 |
buffer_counter_train += 1
|
126 |
if buffer_counter_train >= BUFFER_SIZE:
|
127 |
buffer_counter_train = upload_if_ready(TRAIN_FOLDER, TRAIN_TARGET_DATASET_ID)
|
|
|
128 |
except Exception as e:
|
129 |
log(f"❌ Hata oluştu: {filename} → {e}")
|
130 |
traceback.print_exc()
|
|
|
73 |
buffer_counter_rag = 0
|
74 |
|
75 |
def tokenize(example):
|
76 |
+
# ✅ Mistral-7B-Instruct formatına uygun prompt
|
77 |
+
prompt = f"### Instruction:\n{example['instruction']}\n\n### Response:\n{example['output']}"
|
78 |
tokenized = tokenizer(prompt, truncation=True, padding="max_length", max_length=MAX_LEN)
|
79 |
tokenized["labels"] = [
|
80 |
-100 if token_id == tokenizer.pad_token_id else token_id for token_id in tokenized["input_ids"]
|
|
|
126 |
buffer_counter_train += 1
|
127 |
if buffer_counter_train >= BUFFER_SIZE:
|
128 |
buffer_counter_train = upload_if_ready(TRAIN_FOLDER, TRAIN_TARGET_DATASET_ID)
|
129 |
+
|
130 |
except Exception as e:
|
131 |
log(f"❌ Hata oluştu: {filename} → {e}")
|
132 |
traceback.print_exc()
|