Cylanoid committed
Commit 8a0527c · 1 Parent(s): 9340dd5

removed flash

Files changed (1)
  1. train_llama.py +51 -92
train_llama.py CHANGED
@@ -1,6 +1,8 @@
 from transformers import LlamaForCausalLM, LlamaTokenizer, Trainer, TrainingArguments
+from transformers import BitsAndBytesConfig
 import datasets
 import torch
+from torch.nn.utils.rnn import pad_sequence
 from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
 from accelerate import Accelerator
 
@@ -14,122 +16,79 @@ print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
 MODEL_ID = "meta-llama/Llama-2-7b-hf"
 tokenizer = LlamaTokenizer.from_pretrained(MODEL_ID)
 
-# Add padding token if it doesn't exist
 if tokenizer.pad_token is None:
     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
 
-# Load the model with optimizations for A100 GPU
+# Quantization config
+quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+
+# Load model without FlashAttention
 model = LlamaForCausalLM.from_pretrained(
     MODEL_ID,
-    torch_dtype=torch.bfloat16, # Better for A100 GPUs
+    torch_dtype=torch.bfloat16,
     device_map="auto",
-    use_flash_attention_2=True, # Flash Attention for faster training
-    load_in_8bit=True # Quantization for memory efficiency
+    quantization_config=quantization_config
 )
 
-# Prepare the model for training with LoRA (more memory-efficient)
+# Prepare for LoRA
 model = prepare_model_for_kbit_training(model)
-
-# LoRA configuration
 peft_config = LoraConfig(
-    r=16, # Rank
-    lora_alpha=32, # Alpha
-    lora_dropout=0.05, # Dropout
-    bias="none",
-    task_type="CAUSAL_LM",
-    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"] # Attention modules for Llama
+    r=16, lora_alpha=32, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM",
+    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
 )
-
 model = get_peft_model(model, peft_config)
-model.print_trainable_parameters() # Print percentage of trainable parameters
+model.print_trainable_parameters()
 
-# Load the dataset with field="training_pairs"
+# Load dataset
 dataset = datasets.load_dataset("json", data_files="final_combined_fraud_data.json", field="training_pairs")
-
-# Verify the dataset structure
 print("First example from dataset:", dataset["train"][0])
 
-# Define instruction template for formatting inputs
-def format_instruction(example):
-    # Adapt this template based on your specific use case and dataset format
-    return f"""<s>[INST] {example['input']} [/INST] {example['output']}</s>"""
-
-# Tokenization function
+# Tokenization with fixed length
 def tokenize_data(example):
-    formatted_text = format_instruction(example)
-
-    # Tokenize with appropriate padding and truncation
-    inputs = tokenizer(
-        formatted_text,
-        padding="max_length",
-        truncation=True,
-        max_length=2048, # Llama 2 context length
-        return_tensors="pt"
-    )
-
-    # Create labels (for causal language modeling, labels are the same as input_ids)
-    inputs["labels"] = inputs["input_ids"].clone()
-
-    # Keep tensors as-is
-    inputs = {k: v.squeeze(0) for k, v in inputs.items()}
-    return inputs
-
-# Map without forcing Arrow schema
-tokenized_dataset = dataset["train"].map(
-    tokenize_data,
-    batched=False,
-    remove_columns=dataset["train"].column_names
-)
-
-# Debug: Print the first tokenized example
-print("First tokenized example:", {k: (type(v), v.shape if isinstance(v, torch.Tensor) else "list") for k, v in tokenized_dataset[0].items()})
-
-# Custom data collator
+    formatted_text = f"{example['input']} {example['output']}"
+    inputs = tokenizer(formatted_text, truncation=True, max_length=512, padding="max_length", return_tensors="pt")
+    input_ids = inputs["input_ids"].squeeze(0).tolist()
+    attention_mask = inputs["attention_mask"].squeeze(0).tolist()
+    labels = input_ids.copy()
+    input_len = len(tokenizer(example['input'])["input_ids"])
+    labels[:input_len] = [-100] * input_len
+    print(f"Debug: input_ids[:5] = {input_ids[:5]}, labels[:5] = {labels[:5]}, attention_mask[:5] = {attention_mask[:5]}")
+    return {
+        "input_ids": input_ids,
+        "labels": labels,
+        "attention_mask": attention_mask
+    }
+
+tokenized_dataset = dataset["train"].map(tokenize_data, batched=False, remove_columns=dataset["train"].column_names)
+first_example = tokenized_dataset[0]
+print("First tokenized example:", {k: (type(v), len(v)) for k, v in first_example.items()})
+
+# Data collator with tensor stacking
 def custom_data_collator(features):
-    batch = {}
-
-    # Stack tensors
-    batch["input_ids"] = torch.stack([f["input_ids"] for f in features])
-    batch["attention_mask"] = torch.stack([f["attention_mask"] for f in features])
-    batch["labels"] = torch.stack([f["labels"] for f in features])
-
-    return batch
-
-# Initialize accelerator for distributed training
+    input_ids = [torch.tensor(f["input_ids"]) for f in features]
+    attention_mask = [torch.tensor(f["attention_mask"]) for f in features]
+    labels = [torch.tensor(f["labels"]) for f in features]
+    return {
+        "input_ids": torch.stack(input_ids),
+        "attention_mask": torch.stack(attention_mask),
+        "labels": torch.stack(labels)
+    }
+
+# Accelerator and training
 accelerator = Accelerator()
-
-# Training setup
 training_args = TrainingArguments(
-    output_dir="./fine_tuned_llama2",
-    per_device_train_batch_size=4, # Larger batch size for A100
-    gradient_accumulation_steps=8, # Accumulate gradients to increase effective batch size
-    eval_strategy="no",
-    save_strategy="steps",
-    save_steps=100,
-    save_total_limit=3,
-    num_train_epochs=3,
-    learning_rate=2e-5,
-    weight_decay=0.01,
-    logging_dir="./logs",
-    logging_steps=10,
-    bf16=True, # Use bfloat16 for A100 GPUs
-    gradient_checkpointing=True, # Memory optimization
-    optim="adamw_torch",
-    warmup_steps=100,
+    output_dir="./fine_tuned_llama2", per_device_train_batch_size=4, gradient_accumulation_steps=4,
+    eval_strategy="steps", eval_steps=50, save_strategy="steps", save_steps=100, save_total_limit=3,
+    num_train_epochs=3, learning_rate=2e-5, weight_decay=0.01, logging_dir="./logs", logging_steps=10,
+    bf16=True, gradient_checkpointing=True, optim="adamw_torch", warmup_steps=100
 )
-
 trainer = Trainer(
-    model=model,
-    args=training_args,
-    train_dataset=tokenized_dataset,
-    data_collator=custom_data_collator,
+    model=model, args=training_args,
+    train_dataset=tokenized_dataset.select(range(90)),
+    eval_dataset=tokenized_dataset.select(range(90, 112)),
+    data_collator=custom_data_collator
 )
-
-# Start fine-tuning
 trainer.train()
-
-# Save the fine-tuned model and tokenizer
 model.save_pretrained("./fine_tuned_llama2")
 tokenizer.save_pretrained("./fine_tuned_llama2")
-
 print("Training complete. Model and tokenizer saved to ./fine_tuned_llama2")
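The core of this commit is the model-loading call: the use_flash_attention_2=True and load_in_8bit=True flags are dropped from from_pretrained, and 8-bit loading is requested through an explicit BitsAndBytesConfig instead. For readers who want to exercise just that load path outside the training script, here is a minimal sketch; it is not part of the commit and assumes the same model ID, a bfloat16-capable GPU, and the bitsandbytes package installed.

import torch
from transformers import LlamaForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

MODEL_ID = "meta-llama/Llama-2-7b-hf"

# 8-bit quantization is requested through BitsAndBytesConfig rather than a load_in_8bit flag
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# No use_flash_attention_2 argument: the default attention implementation is used
model = LlamaForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config,
)

# Same LoRA wrapping as the training script
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

One loose end in the new file: pad_sequence is imported but never used, because tokenize_data pads every example to max_length=512 and custom_data_collator simply stacks the resulting fixed-length lists. If padding were moved out of tokenize_data, a dynamic-padding collator along the following lines could use that import. This is a sketch, not part of the commit; dynamic_padding_collator is a hypothetical name, and it assumes right-padding with tokenizer.pad_token_id and -100 as the label ignore index.

import torch
from torch.nn.utils.rnn import pad_sequence

def dynamic_padding_collator(features, pad_token_id):
    # Rebuild tensors from the per-example lists produced by tokenize_data
    input_ids = [torch.tensor(f["input_ids"]) for f in features]
    attention_mask = [torch.tensor(f["attention_mask"]) for f in features]
    labels = [torch.tensor(f["labels"]) for f in features]
    return {
        # Pad each field to the longest sequence in the batch instead of a fixed 512
        "input_ids": pad_sequence(input_ids, batch_first=True, padding_value=pad_token_id),
        "attention_mask": pad_sequence(attention_mask, batch_first=True, padding_value=0),
        # -100 is the ignore index for the causal LM loss, so padded label positions are skipped
        "labels": pad_sequence(labels, batch_first=True, padding_value=-100),
    }

Because Trainer expects a collator that takes only the feature list, this variant would be passed in as functools.partial(dynamic_padding_collator, pad_token_id=tokenizer.pad_token_id).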