from transformers import (
    BitsAndBytesConfig,
    LlamaForCausalLM,
    LlamaTokenizer,
    Trainer,
    TrainingArguments,
)
import datasets
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Version and CUDA check
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")
print(f"Is CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")

# Load Llama model and tokenizer
MODEL_ID = "meta-llama/Llama-2-7b-hf"
tokenizer = LlamaTokenizer.from_pretrained(MODEL_ID)

# Use the existing </s> EOS token (ID 2) as the pad token instead of adding a new one,
# so the embedding matrix does not need to be resized
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id  # 2 for LLaMA-2

# Quantization config
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# Load model without FlashAttention
model = LlamaForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=quantization_config
)

# Prepare the 8-bit model for LoRA fine-tuning
model = prepare_model_for_kbit_training(model)
model.config.use_cache = False  # KV cache is incompatible with gradient checkpointing during training
peft_config = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Load dataset
dataset = datasets.load_dataset("json", data_files="final_combined_fraud_data.json", field="training_pairs")
print("First example from dataset:", dataset["train"][0])

# Tokenization with validation
def tokenize_data(example):
    formatted_text = f"{example['input']} {example['output']}"
    inputs = tokenizer(formatted_text, truncation=True, max_length=512, padding="max_length", return_tensors="pt")
    input_ids = inputs["input_ids"].squeeze(0)
    attention_mask = inputs["attention_mask"].squeeze(0)
    labels = input_ids.clone()
    input_len = len(tokenizer(example['input'])["input_ids"])
    labels[:input_len] = -100           # Mask the prompt part of the labels
    labels[attention_mask == 0] = -100  # Mask padding so it does not contribute to the loss
    # Validate input_ids
    vocab_size = model.config.vocab_size  # Should be 32000 for LLaMA-2
    if (input_ids < 0).any() or (input_ids >= vocab_size).any():
        print(f"Invalid input_ids: min={input_ids.min()}, max={input_ids.max()}, vocab_size={vocab_size}")
        raise ValueError("input_ids contains invalid indices")
    print(f"Debug: input_ids[:5] = {input_ids[:5].tolist()}, labels[:5] = {labels[:5].tolist()}, attention_mask[:5] = {attention_mask[:5].tolist()}")
    return {
        "input_ids": input_ids.tolist(),
        "labels": labels.tolist(),
        "attention_mask": attention_mask.tolist()
    }

tokenized_dataset = dataset["train"].map(tokenize_data, batched=False, remove_columns=dataset["train"].column_names)
first_example = tokenized_dataset[0]
print("First tokenized example:", {k: (type(v), len(v)) for k, v in first_example.items()})

# Data collator with tensor stacking
def custom_data_collator(features):
    input_ids = [torch.tensor(f["input_ids"]) for f in features]
    attention_mask = [torch.tensor(f["attention_mask"]) for f in features]
    labels = [torch.tensor(f["labels"]) for f in features]
    return {
        "input_ids": torch.stack(input_ids),
        "attention_mask": torch.stack(attention_mask),
        "labels": torch.stack(labels)
    }
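
# Optional sanity check (an illustrative sketch, not part of the original pipeline): collating
# the first two tokenized examples should yield [2, 512] tensors for every key, since each
# example was padded to max_length=512 in tokenize_data.
sample_batch = custom_data_collator([tokenized_dataset[0], tokenized_dataset[1]])
print("Collated batch shapes:", {k: tuple(v.shape) for k, v in sample_batch.items()})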

# Training setup (Trainer handles accelerate/device placement internally, so no explicit Accelerator is needed)
training_args = TrainingArguments(
    output_dir="./fine_tuned_llama2", per_device_train_batch_size=4, gradient_accumulation_steps=4,
    eval_strategy="steps", eval_steps=50, save_strategy="steps", save_steps=100, save_total_limit=3,
    num_train_epochs=3, learning_rate=2e-5, weight_decay=0.01, logging_dir="./logs", logging_steps=10,
    bf16=True, gradient_checkpointing=True, optim="adamw_torch", warmup_steps=100
)
trainer = Trainer(
    model=model, args=training_args,
    train_dataset=tokenized_dataset.select(range(90)),
    eval_dataset=tokenized_dataset.select(range(90, 112)),
    data_collator=custom_data_collator
)
trainer.train()
model.save_pretrained("./fine_tuned_llama2")
tokenizer.save_pretrained("./fine_tuned_llama2")
print("Training complete. Model and tokenizer saved to ./fine_tuned_llama2")