# QLoRA fine-tuning for CodeLlama-7B-Instruct on 1x H200
# Requirements: transformers, peft, accelerate, bitsandbytes, datasets

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorForSeq2Seq,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
import torch
import os
import wandb

os.environ["WANDB_PROJECT"] = "codellama-7b-instruct-qlora-linux-bugfix"
os.environ["WANDB_NAME"] = "run-v1"

# Paths and model
BASE_MODEL = "codellama/CodeLlama-7b-Instruct-hf"
DATA_PATH = "../dataset/training_data_100k.jsonl"
OUTPUT_DIR = "./output/qlora-codellama-bugfix"

# Load dataset (prompt-completion format)
dataset = load_dataset("json", data_files=DATA_PATH, split="train")

# bitsandbytes config for QLoRA: 4-bit NF4 weights, double quantization,
# bfloat16 compute (well suited to H100/H200)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
)
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable()
torch.backends.cuda.matmul.allow_tf32 = True

# Apply QLoRA (LoRA config)
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.config.use_cache = False  # KV cache is incompatible with gradient checkpointing
model.config.return_dict = True
model.config.pad_token_id = tokenizer.pad_token_id
model.print_trainable_parameters()

# Format and tokenize the dataset
model_max_len = tokenizer.model_max_length

def format_example(example):
    # Tokenize prompt and completion separately so the prompt can be masked out of the loss
    prompt_ids = tokenizer(example["prompt"], truncation=True, max_length=1024)["input_ids"]
    # No BOS on the completion; append EOS so the model learns to stop
    completion_ids = tokenizer(
        example["completion"], truncation=True, max_length=512, add_special_tokens=False
    )["input_ids"] + [tokenizer.eos_token_id]

    input_ids = prompt_ids + completion_ids
    labels = [-100] * len(prompt_ids) + completion_ids  # ignore prompt tokens in the loss

    # Truncate input_ids and labels to the same length
    max_len = min(len(input_ids), model_max_len)
    return {
        "input_ids": input_ids[:max_len],
        "labels": labels[:max_len],
    }

# Sanity check: push one example through the model and confirm the loss is differentiable
print("Sanity checking one example...")
sample = format_example(dataset[0])
test_input = torch.tensor(sample["input_ids"]).unsqueeze(0).to(model.device)
test_labels = torch.tensor(sample["labels"]).unsqueeze(0).to(model.device)
model.train()
out = model(input_ids=test_input, labels=test_labels)
assert out.loss.requires_grad, "Sanity check failed: loss does not require grad."
print("Sanity check passed. Proceeding to map()...")

# Apply formatting to the entire dataset
dataset = dataset.map(format_example, remove_columns=["prompt", "completion"])

# Pads input_ids/attention_mask with the pad token and labels with -100
collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer, model=model, return_tensors="pt", pad_to_multiple_of=8
)

# Training arguments
training_args = TrainingArguments(
    report_to="wandb",
    run_name="codellama-7b-instruct-qlora-linux-bugfix",
    logging_dir=f"{OUTPUT_DIR}/logs",
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=64,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    gradient_checkpointing=True,
    bf16=True,  # important for H200
    fp16=False,
    max_grad_norm=1.0,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    logging_steps=50,
    push_to_hub=False,
    label_names=["labels"],
    remove_unused_columns=False,  # keep the pre-tokenized input_ids/labels columns
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=collator,
)

# Begin training (resume from the last checkpoint only if one exists)
model.train()
print(
    f"Track this run in Weights & Biases: project {os.environ['WANDB_PROJECT']}, "
    f"run {os.environ['WANDB_NAME']}"
)
has_checkpoint = os.path.isdir(OUTPUT_DIR) and any(
    d.startswith("checkpoint-") for d in os.listdir(OUTPUT_DIR)
)
trainer.train(resume_from_checkpoint=has_checkpoint)

# Save final adapter and tokenizer
model.save_pretrained(OUTPUT_DIR, safe_serialization=True)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"[DONE] Model saved to {OUTPUT_DIR}")
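
# Optional smoke test after saving -- a minimal sketch, not part of the original
# pipeline. It reuses the in-memory fine-tuned model to generate one completion
# and confirm the adapter produces coherent output. The prompt below is a
# hypothetical example; substitute a real sample from your dataset.
model.eval()
model.config.use_cache = True  # re-enable the KV cache for generation
test_prompt = "Fix the off-by-one error in the following C function:\n"  # hypothetical prompt
inputs = tokenizer(test_prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    generated = model.generate(**inputs, max_new_tokens=128, do_sample=False)
print(tokenizer.decode(generated[0], skip_special_tokens=True))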