ciyidogan committed on
Commit
f99c444
·
verified ·
1 Parent(s): ea55d0f

Update tokenize_and_upload_mistral.py

Browse files
Files changed (1) hide show
  1. tokenize_and_upload_mistral.py +3 -1
tokenize_and_upload_mistral.py CHANGED
@@ -73,7 +73,8 @@ buffer_counter_train = 0
73
  buffer_counter_rag = 0
74
 
75
  def tokenize(example):
76
- prompt = f"SORU: {example['instruction']}\nCEVAP: {example['output']}"
 
77
  tokenized = tokenizer(prompt, truncation=True, padding="max_length", max_length=MAX_LEN)
78
  tokenized["labels"] = [
79
  -100 if token_id == tokenizer.pad_token_id else token_id for token_id in tokenized["input_ids"]
@@ -125,6 +126,7 @@ for idx, filename in enumerate(selected_files):
125
  buffer_counter_train += 1
126
  if buffer_counter_train >= BUFFER_SIZE:
127
  buffer_counter_train = upload_if_ready(TRAIN_FOLDER, TRAIN_TARGET_DATASET_ID)
 
128
  except Exception as e:
129
  log(f"❌ Hata oluştu: {filename} → {e}")
130
  traceback.print_exc()
 
73
  buffer_counter_rag = 0
74
 
75
  def tokenize(example):
76
+ # Mistral-7B-Instruct formatına uygun prompt
77
+ prompt = f"### Instruction:\n{example['instruction']}\n\n### Response:\n{example['output']}"
78
  tokenized = tokenizer(prompt, truncation=True, padding="max_length", max_length=MAX_LEN)
79
  tokenized["labels"] = [
80
  -100 if token_id == tokenizer.pad_token_id else token_id for token_id in tokenized["input_ids"]
 
126
  buffer_counter_train += 1
127
  if buffer_counter_train >= BUFFER_SIZE:
128
  buffer_counter_train = upload_if_ready(TRAIN_FOLDER, TRAIN_TARGET_DATASET_ID)
129
+
130
  except Exception as e:
131
  log(f"❌ Hata oluştu: {filename} → {e}")
132
  traceback.print_exc()