Cylanoid committed on
Commit ccbe1fa · 1 Parent(s): 4fc46ee

dafeafdf dfae

Files changed (1)
  1. train_llama.py +14 -9
train_llama.py CHANGED
@@ -43,20 +43,25 @@ model.print_trainable_parameters()
 dataset = datasets.load_dataset("json", data_files="final_combined_fraud_data.json", field="training_pairs")
 print("First example from dataset:", dataset["train"][0])

-# Tokenization with fixed length
+# Tokenization with validation
 def tokenize_data(example):
     formatted_text = f"{example['input']} {example['output']}"
     inputs = tokenizer(formatted_text, truncation=True, max_length=512, padding="max_length", return_tensors="pt")
-    input_ids = inputs["input_ids"].squeeze(0).tolist()
-    attention_mask = inputs["attention_mask"].squeeze(0).tolist()
-    labels = input_ids.copy()
+    input_ids = inputs["input_ids"].squeeze(0)
+    attention_mask = inputs["attention_mask"].squeeze(0)
+    labels = input_ids.clone()
     input_len = len(tokenizer(example['input'])["input_ids"])
-    labels[:input_len] = [-100] * input_len
-    print(f"Debug: input_ids[:5] = {input_ids[:5]}, labels[:5] = {labels[:5]}, attention_mask[:5] = {attention_mask[:5]}")
+    labels[:input_len] = -100  # Mask input part in labels only
+    # Validate input_ids
+    vocab_size = model.config.vocab_size  # Should be 32000 for LLaMA-2
+    if (input_ids < 0).any() or (input_ids >= vocab_size).any():
+        print(f"Invalid input_ids: min={input_ids.min()}, max={input_ids.max()}, vocab_size={vocab_size}")
+        raise ValueError("input_ids contains invalid indices")
+    print(f"Debug: input_ids[:5] = {input_ids[:5].tolist()}, labels[:5] = {labels[:5].tolist()}, attention_mask[:5] = {attention_mask[:5].tolist()}")
     return {
-        "input_ids": input_ids,
-        "labels": labels,
-        "attention_mask": attention_mask
+        "input_ids": input_ids.tolist(),
+        "labels": labels.tolist(),
+        "attention_mask": attention_mask.tolist()
     }

 tokenized_dataset = dataset["train"].map(tokenize_data, batched=False, remove_columns=dataset["train"].column_names)
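Side note (not part of the commit): the -100 values written into labels rely on the default ignore_index of PyTorch's CrossEntropyLoss, which Hugging Face causal-LM heads use when computing the loss, so the masked prompt tokens are simply excluded from training. A minimal, self-contained sketch of that behavior, with dummy logits and hypothetical sizes rather than the script's real model:

import torch

# Dummy logits for a 6-token sequence over a 32000-token vocabulary (hypothetical sizes).
logits = torch.randn(6, 32000)
# The first two positions stand in for the masked prompt; -100 is CrossEntropyLoss's default ignore_index.
labels = torch.tensor([-100, -100, 11, 7, 302, 9])

loss_fn = torch.nn.CrossEntropyLoss()  # ignore_index=-100 by default
loss = loss_fn(logits, labels)
print(loss)  # averaged over the four unmasked positions only

This is why masking only the input span trains the model to predict the output tokens while the full prompt still participates in attention.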