from transformers import LlamaForCausalLM, LlamaTokenizer, Trainer, TrainingArguments
from transformers import BitsAndBytesConfig
import datasets
import torch
from torch.nn.utils.rnn import pad_sequence
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from accelerate import Accelerator
# Version and CUDA check
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")
print(f"Is CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
# Load Llama model and tokenizer
MODEL_ID = "meta-llama/Llama-2-7b-hf"
tokenizer = LlamaTokenizer.from_pretrained(MODEL_ID)
# Set pad token to the existing EOS token </s> (ID 2) instead of adding a new one
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Use </s> as pad token
    tokenizer.pad_token_id = tokenizer.eos_token_id  # Should be 2 for Llama-2
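# Optional sanity check (an addition, not required for training): confirm the pad
# token now aliases EOS, so no new embedding row has to be added or resized.
print(f"pad_token={tokenizer.pad_token!r}, pad_token_id={tokenizer.pad_token_id}, eos_token_id={tokenizer.eos_token_id}")
assert tokenizer.pad_token_id == tokenizer.eos_token_id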
# Quantization config
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
# Load model without FlashAttention
model = LlamaForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=quantization_config
)
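# Optional: report the 8-bit model's memory footprint; get_memory_footprint() is a
# standard transformers PreTrainedModel helper, and the exact number depends on your setup.
print(f"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")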
# Prepare for LoRA
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
# Load dataset
dataset = datasets.load_dataset("json", data_files="final_combined_fraud_data.json", field="training_pairs")
print("First example from dataset:", dataset["train"][0])
# Tokenization with validation
def tokenize_data(example):
    formatted_text = f"{example['input']} {example['output']}"
    inputs = tokenizer(formatted_text, truncation=True, max_length=512, padding="max_length", return_tensors="pt")
    input_ids = inputs["input_ids"].squeeze(0)
    attention_mask = inputs["attention_mask"].squeeze(0)
    labels = input_ids.clone()
    input_len = len(tokenizer(example['input'])["input_ids"])
    labels[:input_len] = -100  # Mask input part in labels only
    labels[attention_mask == 0] = -100  # Also mask padding positions so they don't contribute to the loss
    # Validate input_ids
    vocab_size = model.config.vocab_size  # Should be 32000 for LLaMA-2
    if (input_ids < 0).any() or (input_ids >= vocab_size).any():
        print(f"Invalid input_ids: min={input_ids.min()}, max={input_ids.max()}, vocab_size={vocab_size}")
        raise ValueError("input_ids contains invalid indices")
    print(f"Debug: input_ids[:5] = {input_ids[:5].tolist()}, labels[:5] = {labels[:5].tolist()}, attention_mask[:5] = {attention_mask[:5].tolist()}")
    return {
        "input_ids": input_ids.tolist(),
        "labels": labels.tolist(),
        "attention_mask": attention_mask.tolist()
    }
tokenized_dataset = dataset["train"].map(tokenize_data, batched=False, remove_columns=dataset["train"].column_names)
first_example = tokenized_dataset[0]
print("First tokenized example:", {k: (type(v), len(v)) for k, v in first_example.items()})
# Data collator with tensor stacking
def custom_data_collator(features):
    input_ids = [torch.tensor(f["input_ids"]) for f in features]
    attention_mask = [torch.tensor(f["attention_mask"]) for f in features]
    labels = [torch.tensor(f["labels"]) for f in features]
    return {
        "input_ids": torch.stack(input_ids),
        "attention_mask": torch.stack(attention_mask),
        "labels": torch.stack(labels)
    }
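# Smoke test for the collator (uses the first two tokenized examples): every tensor
# should come out with shape (2, 512) given padding="max_length" above.
_batch = custom_data_collator([tokenized_dataset[i] for i in range(2)])
print("Collated batch shapes:", {k: tuple(v.shape) for k, v in _batch.items()})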
# Accelerator and training. Note: Trainer drives accelerate internally, so this
# Accelerator instance is not passed to it; it only initializes the accelerate environment.
accelerator = Accelerator()
training_args = TrainingArguments(
    output_dir="./fine_tuned_llama2", per_device_train_batch_size=4, gradient_accumulation_steps=4,
    eval_strategy="steps", eval_steps=50, save_strategy="steps", save_steps=100, save_total_limit=3,
    num_train_epochs=3, learning_rate=2e-5, weight_decay=0.01, logging_dir="./logs", logging_steps=10,
    bf16=True, gradient_checkpointing=True, optim="adamw_torch", warmup_steps=100
)
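# Rough bookkeeping (single GPU assumed): effective batch size and optimizer steps per
# epoch for the 90-example train split selected below. Note that eval_steps=50 and
# warmup_steps=100 both exceed the total step count at this dataset size.
effective_batch = training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
steps_per_epoch = max(1, 90 // effective_batch)
print(f"Effective batch size: {effective_batch}, ~{steps_per_epoch} optimizer steps per epoch")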
trainer = Trainer(
    model=model, args=training_args,
    train_dataset=tokenized_dataset.select(range(90)),
    eval_dataset=tokenized_dataset.select(range(90, 112)),
    data_collator=custom_data_collator
)
trainer.train()
model.save_pretrained("./fine_tuned_llama2")
tokenizer.save_pretrained("./fine_tuned_llama2")
print("Training complete. Model and tokenizer saved to ./fine_tuned_llama2") |