Cylanoid committed on
Commit 4700a7a · 1 Parent(s): 9773d48

you best commit this time 4 bucks an hour mr.

Files changed (1)
  1. train_llama.py +11 -17
train_llama.py CHANGED
@@ -22,13 +22,12 @@ if tokenizer.pad_token is None:
 # Quantization config
 quantization_config = BitsAndBytesConfig(load_in_8bit=True)
 
-# Load model with FlashAttention 2
+# Load model without FlashAttention
 model = LlamaForCausalLM.from_pretrained(
     MODEL_ID,
     torch_dtype=torch.bfloat16,
     device_map="auto",
-    quantization_config=quantization_config,
-    attn_implementation="flash_attention_2"
+    quantization_config=quantization_config
 )
 
 # Prepare for LoRA
@@ -44,15 +43,16 @@ model.print_trainable_parameters()
 dataset = datasets.load_dataset("json", data_files="final_combined_fraud_data.json", field="training_pairs")
 print("First example from dataset:", dataset["train"][0])
 
-# Tokenization with lists (no tensors)
+# Tokenization with fixed length
 def tokenize_data(example):
     formatted_text = f"{example['input']} {example['output']}"
-    inputs = tokenizer(formatted_text, truncation=True, max_length=2048)
-    input_ids = inputs["input_ids"]
-    attention_mask = inputs["attention_mask"]
+    inputs = tokenizer(formatted_text, truncation=True, max_length=512, padding="max_length", return_tensors="pt")
+    input_ids = inputs["input_ids"].squeeze(0).tolist()
+    attention_mask = inputs["attention_mask"].squeeze(0).tolist()
     labels = input_ids.copy()
     input_len = len(tokenizer(example['input'])["input_ids"])
     labels[:input_len] = [-100] * input_len
+    print(f"Debug: input_ids[:5] = {input_ids[:5]}, labels[:5] = {labels[:5]}, attention_mask[:5] = {attention_mask[:5]}")
     return {
         "input_ids": input_ids,
         "labels": labels,
@@ -60,24 +60,18 @@ def tokenize_data(example):
     }
 
 tokenized_dataset = dataset["train"].map(tokenize_data, batched=False, remove_columns=dataset["train"].column_names)
-# Print first example (lists with lengths)
 first_example = tokenized_dataset[0]
 print("First tokenized example:", {k: (type(v), len(v)) for k, v in first_example.items()})
 
-# Data collator: convert lists to tensors and pad
+# Data collator with tensor stacking
 def custom_data_collator(features):
     input_ids = [torch.tensor(f["input_ids"]) for f in features]
     attention_mask = [torch.tensor(f["attention_mask"]) for f in features]
     labels = [torch.tensor(f["labels"]) for f in features]
-
-    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
-    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
-    labels = pad_sequence(labels, batch_first=True, padding_value=-100)
-
     return {
-        "input_ids": input_ids,
-        "attention_mask": attention_mask,
-        "labels": labels
+        "input_ids": torch.stack(input_ids),
+        "attention_mask": torch.stack(attention_mask),
+        "labels": torch.stack(labels)
     }
 
 # Accelerator and training
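
For context on the first hunk: the commit drops the attn_implementation="flash_attention_2" argument entirely. A hedged alternative, sketched below and not taken from this repository, is to pick the attention backend at runtime so FlashAttention 2 is requested only when the flash_attn package is installed; MODEL_ID here is a placeholder, since train_llama.py defines its own.

import importlib.util
import torch
from transformers import BitsAndBytesConfig, LlamaForCausalLM

MODEL_ID = "meta-llama/Llama-2-7b-hf"  # placeholder; the script defines its own MODEL_ID

quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# Request FlashAttention 2 only when the flash_attn package is importable;
# otherwise fall back to PyTorch's built-in SDPA attention.
attn_impl = "flash_attention_2" if importlib.util.find_spec("flash_attn") else "sdpa"

model = LlamaForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=quantization_config,
    attn_implementation=attn_impl,
)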
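
The second and third hunks work as a pair: padding="max_length" makes every tokenized example exactly 512 ids long, so the collator can torch.stack the per-example tensors instead of running pad_sequence, while the -100 labels over the prompt keep the input portion out of the loss. The toy sketch below uses made-up token ids and a stand-in padding helper (fake_pad), not the project's tokenizer, just to show the shapes this pattern produces.

import torch

MAX_LENGTH = 8              # the commit uses max_length=512
PAD_ID, IGNORE_INDEX = 0, -100

def fake_pad(prompt_ids, output_ids):
    # Stand-in for tokenizer(..., truncation=True, padding="max_length").
    ids = (prompt_ids + output_ids)[:MAX_LENGTH]
    attention_mask = [1] * len(ids) + [0] * (MAX_LENGTH - len(ids))
    ids = ids + [PAD_ID] * (MAX_LENGTH - len(ids))
    labels = ids.copy()
    labels[:len(prompt_ids)] = [IGNORE_INDEX] * len(prompt_ids)  # mask the prompt
    return {"input_ids": ids, "attention_mask": attention_mask, "labels": labels}

features = [fake_pad([11, 12], [13, 14, 15]), fake_pad([21, 22, 23], [24])]

def custom_data_collator(features):
    # Every feature already has length MAX_LENGTH, so stacking needs no extra padding.
    return {
        key: torch.stack([torch.tensor(f[key]) for f in features])
        for key in ("input_ids", "attention_mask", "labels")
    }

batch = custom_data_collator(features)
print({k: tuple(v.shape) for k, v in batch.items()})  # all (2, 8)
print(batch["labels"][0])                             # prompt positions are -100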