from transformers import LlamaForCausalLM, LlamaTokenizer, Trainer, TrainingArguments, BitsAndBytesConfig
import datasets
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from accelerate import Accelerator
# Version and CUDA check
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")
print(f"Is CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
# Load Llama model and tokenizer
MODEL_ID = "meta-llama/Llama-2-7b-hf"
tokenizer = LlamaTokenizer.from_pretrained(MODEL_ID)
# Llama has no padding token by default; reuse the EOS token so the embedding
# matrix does not need to be resized for a newly added special token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Load the model with optimizations for A100 GPUs
model = LlamaForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,                                  # bfloat16 is well supported on A100
    device_map="auto",
    attn_implementation="flash_attention_2",                     # faster attention; requires the flash-attn package
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),   # 8-bit weights for memory efficiency
)
# Prepare the model for training with LoRA (more memory-efficient)
model = prepare_model_for_kbit_training(model)
# LoRA configuration
peft_config = LoraConfig(
    r=16,                   # LoRA rank
    lora_alpha=32,          # LoRA scaling factor
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # attention projections in Llama
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters() # Print percentage of trainable parameters
# Load the dataset with field="training_pairs"
dataset = datasets.load_dataset("json", data_files="final_combined_fraud_data.json", field="training_pairs")
# Verify the dataset structure
print("First example from dataset:", dataset["train"][0])
# Define instruction template for formatting inputs
def format_instruction(example):
    # Llama 2 chat-style prompt; adapt this template to your use case and dataset format
    return f"""<s>[INST] {example['input']} [/INST] {example['output']}</s>"""
# Tokenization function
def tokenize_data(example):
    formatted_text = format_instruction(example)
    # Tokenize with padding and truncation up to the Llama 2 context length
    inputs = tokenizer(
        formatted_text,
        padding="max_length",
        truncation=True,
        max_length=2048,
        return_tensors="pt",
    )
    # For causal language modeling the labels are the input_ids, with padded
    # positions masked out (-100) so they do not contribute to the loss
    inputs["labels"] = inputs["input_ids"].clone()
    inputs["labels"][inputs["attention_mask"] == 0] = -100
    # Drop the leading batch dimension added by return_tensors="pt"
    inputs = {k: v.squeeze(0) for k, v in inputs.items()}
    return inputs
# Tokenize every example; the original columns are dropped so only model inputs remain
tokenized_dataset = dataset["train"].map(
    tokenize_data,
    batched=False,
    remove_columns=dataset["train"].column_names,
)
# datasets stores mapped columns as Python lists, so set the format back to torch
# tensors before batching, otherwise torch.stack in the collator below will fail
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
# Debug: Print the first tokenized example
print("First tokenized example:", {k: (type(v), v.shape if isinstance(v, torch.Tensor) else "list") for k, v in tokenized_dataset[0].items()})
# Custom data collator: stack the per-example tensors into a batch
def custom_data_collator(features):
    batch = {}
    batch["input_ids"] = torch.stack([f["input_ids"] for f in features])
    batch["attention_mask"] = torch.stack([f["attention_mask"] for f in features])
    batch["labels"] = torch.stack([f["labels"] for f in features])
    return batch
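# Note: because every example is already padded to a fixed length and stored as
# torch tensors, the stock collator from transformers should work here as well
# (an assumption, not verified in this script), e.g.:
#
#   from transformers import default_data_collator
#   trainer = Trainer(..., data_collator=default_data_collator)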
# Accelerator handle for distributed setups; Trainer already manages device
# placement and distributed training via accelerate internally, so this is not
# strictly required for the Trainer-based run below
accelerator = Accelerator()
# Training setup (effective batch size per device = 4 * 8 = 32)
training_args = TrainingArguments(
    output_dir="./fine_tuned_llama2",
    per_device_train_batch_size=4,      # a larger batch size is feasible on an A100
    gradient_accumulation_steps=8,      # accumulate gradients to raise the effective batch size
    eval_strategy="no",
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    bf16=True,                          # bfloat16 mixed precision for A100 GPUs
    gradient_checkpointing=True,        # trade compute for memory
    optim="adamw_torch",
    warmup_steps=100,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=custom_data_collator,
)
# Start fine-tuning
trainer.train()
# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_llama2")
tokenizer.save_pretrained("./fine_tuned_llama2")
print("Training complete. Model and tokenizer saved to ./fine_tuned_llama2") |