from transformers import LlamaForCausalLM, LlamaTokenizer, Trainer, TrainingArguments, BitsAndBytesConfig
import datasets
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from accelerate import Accelerator
# Version and CUDA check
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")
print(f"Is CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
# Load Llama model and tokenizer
MODEL_ID = "meta-llama/Llama-2-7b-hf"
tokenizer = LlamaTokenizer.from_pretrained(MODEL_ID)
# Llama has no pad token by default; reuse the EOS token for padding so the
# model's embedding table does not need to be resized for a new [PAD] token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Load the model with optimizations for an A100 GPU
# (flash_attention_2 requires the flash-attn package; 8-bit loading requires bitsandbytes)
model = LlamaForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,               # bfloat16 is well supported on A100 GPUs
    device_map="auto",
    attn_implementation="flash_attention_2",  # replaces the deprecated use_flash_attention_2 flag
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # 8-bit weights for memory efficiency
)
# Prepare the quantized model for LoRA training: freezes the base weights, casts
# norm layers to fp32, and enables input gradients for gradient checkpointing
model = prepare_model_for_kbit_training(model)
# LoRA configuration
peft_config = LoraConfig(
r=16, # Rank
lora_alpha=32, # Alpha
lora_dropout=0.05, # Dropout
bias="none",
task_type="CAUSAL_LM",
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"] # Attention modules for Llama
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters() # Print percentage of trainable parameters
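# With r=16 on the four attention projections of Llama-2-7B (32 layers, 4096 hidden size),
# this should report roughly 16-17M trainable parameters, on the order of 0.25% of the base model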
# Load the dataset with field="training_pairs"
dataset = datasets.load_dataset("json", data_files="final_combined_fraud_data.json", field="training_pairs")
# Verify the dataset structure
print("First example from dataset:", dataset["train"][0])
# Define instruction template for formatting inputs
def format_instruction(example):
# Adapt this template based on your specific use case and dataset format
return f"""<s>[INST] {example['input']} [/INST] {example['output']}</s>"""
# Tokenization function
def tokenize_data(example):
    formatted_text = format_instruction(example)
    # Tokenize with padding and truncation; the template already contains <s> and </s>,
    # so skip the tokenizer's automatic BOS to avoid a duplicated start token
    inputs = tokenizer(
        formatted_text,
        add_special_tokens=False,
        padding="max_length",
        truncation=True,
        max_length=2048,  # Llama 2 context length
        return_tensors="pt",
    )
    # For causal language modeling the labels are the input_ids, with padding
    # positions masked to -100 so they are ignored by the loss
    labels = inputs["input_ids"].clone()
    labels[inputs["attention_mask"] == 0] = -100
    inputs["labels"] = labels
    # Drop the batch dimension added by return_tensors="pt"
    inputs = {k: v.squeeze(0) for k, v in inputs.items()}
    return inputs
# Tokenize the training split; datasets stores the mapped columns as Python lists
# (Arrow), so they are converted back to tensors in the collator below
tokenized_dataset = dataset["train"].map(
    tokenize_data,
    batched=False,
    remove_columns=dataset["train"].column_names,
)
# Debug: Print the first tokenized example
print("First tokenized example:", {k: (type(v), v.shape if isinstance(v, torch.Tensor) else "list") for k, v in tokenized_dataset[0].items()})
# Custom data collator: the mapped dataset yields Python lists, so convert each
# field back to a tensor before stacking into a batch
def custom_data_collator(features):
    return {
        key: torch.stack([torch.as_tensor(f[key]) for f in features])
        for key in ("input_ids", "attention_mask", "labels")
    }
# Initialize accelerator (note: Trainer drives Accelerate internally, so this explicit
# instance is not strictly required for a single-node run)
accelerator = Accelerator()
# Training setup
training_args = TrainingArguments(
output_dir="./fine_tuned_llama2",
per_device_train_batch_size=4, # Larger batch size for A100
gradient_accumulation_steps=8, # Accumulate gradients to increase effective batch size
eval_strategy="no",
save_strategy="steps",
save_steps=100,
save_total_limit=3,
num_train_epochs=3,
learning_rate=2e-5,
weight_decay=0.01,
logging_dir="./logs",
logging_steps=10,
bf16=True, # Use bfloat16 for A100 GPUs
gradient_checkpointing=True, # Memory optimization
optim="adamw_torch",
warmup_steps=100,
)
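# With these settings the effective batch size per GPU is
# per_device_train_batch_size * gradient_accumulation_steps = 4 * 8 = 32 sequences per optimizer step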
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_dataset,
data_collator=custom_data_collator,
)
# Start fine-tuning
trainer.train()
# Save the fine-tuned adapter and tokenizer (with PEFT, save_pretrained writes only
# the LoRA adapter weights and config, not the full base model)
model.save_pretrained("./fine_tuned_llama2")
tokenizer.save_pretrained("./fine_tuned_llama2")
print("Training complete. LoRA adapter and tokenizer saved to ./fine_tuned_llama2")