# train_llama4.py
# Script to fine-tune Llama 4 Maverick for healthcare fraud detection
from transformers import (
    AutoTokenizer,
    Llama4ForConditionalGeneration,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
import datasets
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import huggingface_hub
import os
# Debug: Confirm file version
print("Running train_llama4.py with CPU offloading (version: 2025-04-21 v2)")
# Authenticate with Hugging Face
LLama = os.getenv("LLama")
if not LLama:
    raise ValueError("LLama token not found. Set it in Hugging Face Space secrets as 'LLama'.")
huggingface_hub.login(token=LLama)
# Model setup
MODEL_ID = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Custom device map for CPU offloading: embeddings, final norm, and lm_head plus
# the first 16 decoder layers stay on GPU 0; layers 16-31 are offloaded to CPU.
# (Accelerate device maps need one entry per module; layer ranges like "0-15" are not supported.)
device_map = {
    "model.embed_tokens": 0,
    "model.norm": 0,
    "lm_head": 0,
}
device_map.update({f"model.layers.{i}": 0 for i in range(16)})
device_map.update({f"model.layers.{i}": "cpu" for i in range(16, 32)})
# Debug: Confirm offloading settings
print("Loading model with CPU offloading: llm_int8_enable_fp32_cpu_offload=True, device_map=", device_map)
# Load model with 8-bit quantization and CPU offloading.
# llm_int8_enable_fp32_cpu_offload belongs on the BitsAndBytesConfig, not on from_pretrained.
quant_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)
model = Llama4ForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map=device_map,
    quantization_config=quant_config,
    attn_implementation="flex_attention",
)
# Resize token embeddings
model.resize_token_embeddings(len(tokenizer))
# No manual Accelerator wrapping needed: the quantized model is already dispatched
# across devices via device_map, and Trainer manages Accelerate internally.
# Load dataset
dataset = datasets.load_dataset('json', data_files="Bingaman_training_data.json")['train']
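# Minimal preprocessing sketch: Trainer needs token IDs, not raw JSON records.
# Assumption: each record carries its training example in a single "text" field;
# the field name and max_length=1024 are placeholders, adjust to the actual schema.
def tokenize_fn(batch):
    # Truncate long examples; padding is handled later by the data collator
    return tokenizer(batch["text"], truncation=True, max_length=1024)
dataset = dataset.map(tokenize_fn, batched=True, remove_columns=dataset.column_names)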
# LoRA configuration
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
# Prepare model for fine-tuning
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
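# Optional sanity check: report how few parameters the LoRA adapters actually train
model.print_trainable_parameters()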
# Training arguments
training_args = {
    "output_dir": "./results",
    "num_train_epochs": 1,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 8,
    "optim": "adamw_torch",
    "save_steps": 500,
    "logging_steps": 100,
    "learning_rate": 2e-4,
    "bf16": True,  # match the bfloat16 weights loaded above; fp16 AMP would conflict
    "max_grad_norm": 0.3,
    "warmup_ratio": 0.03,
    "lr_scheduler_type": "cosine"
}
# Initialize trainer (Trainer and TrainingArguments come from transformers, not datasets)
trainer = Trainer(
    model=model,
    args=TrainingArguments(**training_args),
    train_dataset=dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
# Train
trainer.train()
model.save_pretrained("./fine_tuned_model")
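# Also persist the tokenizer so the added [PAD] token travels with the saved adapter
tokenizer.save_pretrained("./fine_tuned_model")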
print("Training completed!") |