# train_llama4.py
from transformers import (
    AutoTokenizer, Llama4ForConditionalGeneration, BitsAndBytesConfig,
    Trainer, TrainingArguments, DataCollatorForLanguageModeling
)
import datasets
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import huggingface_hub
import os
print("Running train_llama4.py with CPU offloading (version: 2025-04-22 v1)")
# -- Authenticate with Hugging Face
LLAMA = os.getenv("LLama")
if not LLAMA:
    raise ValueError("LLama token not found. Set it in environment as 'LLama'.")
huggingface_hub.login(token=LLAMA)
# -- Tokenizer
MODEL_ID = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# -- Quantization + CPU offload config
quant_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True
)
print("Loading model with 8-bit quantization, CPU offload, and automatic device mapping")
model = Llama4ForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=quant_config,
    offload_folder="./offload"
)
# -- Resize embeddings if pad was added
model.resize_token_embeddings(len(tokenizer))
# -- No manual Accelerator.prepare() here: transformers.Trainer integrates with
# Accelerate internally, and an 8-bit model dispatched via device_map="auto"
# should not be re-wrapped by hand.
# -- Load training data
dataset = datasets.load_dataset('json', data_files="Bingaman_training_data.json")['train']
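# -- Tokenize for causal-LM training. A minimal sketch: it assumes each JSON
# record has a "text" field and uses a placeholder max_length of 2048; adjust
# both to the actual schema and context budget of Bingaman_training_data.json.
def tokenize_fn(batch):
    return tokenizer(batch["text"], truncation=True, max_length=2048)

tokenized_dataset = dataset.map(tokenize_fn, batched=True, remove_columns=dataset.column_names)

# Collator pads each batch and copies input_ids into labels (mlm=False -> causal LM)
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)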
# -- LoRA setup
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
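# Report how many parameters the LoRA adapters actually train
model.print_trainable_parameters()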
# -- Training arguments
training_args = {
    "output_dir": "./results",
    "num_train_epochs": 1,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 8,
    "optim": "adamw_torch",
    "save_steps": 500,
    "logging_steps": 100,
    "learning_rate": 2e-4,
    "bf16": True,  # match the bfloat16 compute dtype the model was loaded with
    "max_grad_norm": 0.3,
    "warmup_ratio": 0.03,
    "lr_scheduler_type": "cosine"
}
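# Effective batch size per device: 2 * 8 gradient-accumulation steps = 16 samples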
# -- Initialize Trainer (Trainer and TrainingArguments come from transformers,
# not datasets; Trainer handles Accelerate and device placement on its own)
trainer = Trainer(
    model=model,
    args=TrainingArguments(**training_args),
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)
# -- Run training
trainer.train()
model.save_pretrained("./fine_tuned_model")
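# Save the tokenizer alongside the adapters so the added [PAD] token is preserved
tokenizer.save_pretrained("./fine_tuned_model")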
print("Training completed!")