from transformers import (
    LlamaForCausalLM,
    LlamaTokenizer,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
)
import datasets
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from accelerate import Accelerator

# Version and CUDA check
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")
print(f"Is CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")

# Load Llama model and tokenizer
MODEL_ID = "meta-llama/Llama-2-7b-hf"
tokenizer = LlamaTokenizer.from_pretrained(MODEL_ID)

# Llama has no padding token by default; reuse the EOS token so the embedding
# matrix does not have to be resized after the model is loaded in 8-bit
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the model with optimizations for an A100 GPU
model = LlamaForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,  # bfloat16 is well supported on A100 GPUs
    device_map="auto",
    attn_implementation="flash_attention_2",  # Flash Attention 2 for faster training
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # 8-bit quantization for memory efficiency
)

# Prepare the model for training with LoRA (more memory-efficient)
model = prepare_model_for_kbit_training(model)

# LoRA configuration
peft_config = LoraConfig(
    r=16,               # Rank
    lora_alpha=32,      # Alpha
    lora_dropout=0.05,  # Dropout
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]  # Attention modules for Llama
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # Print percentage of trainable parameters
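
# With r=16 on the four attention projections of a 7B Llama model, roughly 16-17M
# parameters (well under 1% of the total) should be reported as trainable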

# Load the dataset with field="training_pairs"
dataset = datasets.load_dataset("json", data_files="final_combined_fraud_data.json", field="training_pairs")

# Verify the dataset structure
print("First example from dataset:", dataset["train"][0])

# Define the instruction template used to format each training example
def format_instruction(example):
    # Adapt this template to your use case; the tokenizer adds the leading <s> (BOS) token itself
    return f"[INST] {example['input']} [/INST] {example['output']}</s>"

# Tokenization function
def tokenize_data(example):
    formatted_text = format_instruction(example)

    # Tokenize with padding and truncation up to the Llama 2 context length
    inputs = tokenizer(
        formatted_text,
        padding="max_length",
        truncation=True,
        max_length=2048,
        return_tensors="pt"
    )

    # For causal language modeling the labels are the input_ids, with padding
    # positions set to -100 so they are ignored by the loss
    labels = inputs["input_ids"].clone()
    labels[inputs["attention_mask"] == 0] = -100
    inputs["labels"] = labels

    # Drop the leading batch dimension added by return_tensors="pt"
    inputs = {k: v.squeeze(0) for k, v in inputs.items()}
    return inputs

# Tokenize the training split (Datasets stores the mapped columns as plain lists)
tokenized_dataset = dataset["train"].map(
    tokenize_data,
    batched=False,
    remove_columns=dataset["train"].column_names
)

# Convert the stored columns back to torch tensors so the collator can stack them
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Debug: print the type and shape of the first tokenized example
print("First tokenized example:", {k: (type(v), v.shape if isinstance(v, torch.Tensor) else "list") for k, v in tokenized_dataset[0].items()})

# Custom data collator
def custom_data_collator(features):
    batch = {}
    
    # Stack tensors
    batch["input_ids"] = torch.stack([f["input_ids"] for f in features])
    batch["attention_mask"] = torch.stack([f["attention_mask"] for f in features])
    batch["labels"] = torch.stack([f["labels"] for f in features])
    
    return batch
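
# Note: since set_format("torch") was applied above, transformers' default_data_collator
# should behave equivalently here; the explicit collator is kept for transparency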

# Note: Trainer drives Accelerate internally, so this explicit Accelerator instance
# is not strictly required for single-node training and is unused below
accelerator = Accelerator()

# Training setup
training_args = TrainingArguments(
    output_dir="./fine_tuned_llama2",
    per_device_train_batch_size=4,  # Larger batch size for A100
    gradient_accumulation_steps=8,  # Accumulate gradients to increase effective batch size
    eval_strategy="no",
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    bf16=True,  # Use bfloat16 for A100 GPUs
    gradient_checkpointing=True,  # Memory optimization
    optim="adamw_torch",
    warmup_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=custom_data_collator,
)

# Start fine-tuning
trainer.train()

# Save the LoRA adapter weights and tokenizer (the quantized base model itself is not re-saved)
model.save_pretrained("./fine_tuned_llama2")
tokenizer.save_pretrained("./fine_tuned_llama2")

print("Training complete. Model and tokenizer saved to ./fine_tuned_llama2")