from transformers import (
    LlamaForCausalLM,
    LlamaTokenizer,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
)
import datasets
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Version and CUDA check
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")
print(f"Is CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")

# Load the Llama tokenizer
MODEL_ID = "meta-llama/Llama-2-7b-hf"
tokenizer = LlamaTokenizer.from_pretrained(MODEL_ID)

# Llama 2 ships without a padding token; reuse EOS so we don't have to resize
# the embeddings of a quantized model after adding a new special token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the model with optimizations for an A100 GPU
model = LlamaForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,                # bfloat16 is well supported on A100
    device_map="auto",
    attn_implementation="flash_attention_2",   # Flash Attention 2 for faster training
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # 8-bit weights for memory efficiency
)

# Prepare the quantized model for training (casts norms, enables input grads for checkpointing)
model = prepare_model_for_kbit_training(model)

# LoRA configuration
peft_config = LoraConfig(
    r=16,              # Rank of the LoRA update matrices
    lora_alpha=32,     # Scaling factor
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # Attention projections in Llama
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # Shows the percentage of trainable parameters

# Load the dataset; the JSON file stores its examples under the "training_pairs" field
dataset = datasets.load_dataset(
    "json",
    data_files="final_combined_fraud_data.json",
    field="training_pairs",
)

# Verify the dataset structure
print("First example from dataset:", dataset["train"][0])

# Instruction template for formatting inputs
def format_instruction(example):
    # Adapt this template to your specific use case and dataset format
    return f"[INST] {example['input']} [/INST] {example['output']}"

# Tokenization function
def tokenize_data(example):
    formatted_text = format_instruction(example)
    # Tokenize with padding and truncation to a fixed length
    inputs = tokenizer(
        formatted_text,
        padding="max_length",
        truncation=True,
        max_length=2048,   # Llama 2 context length
        return_tensors="pt",
    )
    # For causal LM the labels are the input_ids, with padding positions set to -100
    # so the loss ignores pad tokens
    labels = inputs["input_ids"].clone()
    labels[inputs["attention_mask"] == 0] = -100
    inputs["labels"] = labels
    # Drop the batch dimension added by return_tensors="pt"
    return {k: v.squeeze(0) for k, v in inputs.items()}

tokenized_dataset = dataset["train"].map(
    tokenize_data,
    batched=False,
    remove_columns=dataset["train"].column_names,
)

# map() stores results as Python lists in Arrow; switch the format to torch tensors
# so the collator below can stack them directly
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Debug: print the first tokenized example
print(
    "First tokenized example:",
    {k: (type(v), v.shape if isinstance(v, torch.Tensor) else "list") for k, v in tokenized_dataset[0].items()},
)

# Custom data collator: every example is already padded to max_length, so stacking is enough
def custom_data_collator(features):
    return {
        "input_ids": torch.stack([f["input_ids"] for f in features]),
        "attention_mask": torch.stack([f["attention_mask"] for f in features]),
        "labels": torch.stack([f["labels"] for f in features]),
    }

# Training setup (Trainer handles device placement and distributed training itself,
# so no separate Accelerator instance is needed)
training_args = TrainingArguments(
    output_dir="./fine_tuned_llama2",
    per_device_train_batch_size=4,    # A100 has room for a larger per-device batch
    gradient_accumulation_steps=8,    # Effective batch size of 32
    eval_strategy="no",
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    bf16=True,                        # bfloat16 mixed precision on A100
    gradient_checkpointing=True,      # Trade compute for memory
    optim="adamw_torch",
    warmup_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=custom_data_collator,
)

# Start fine-tuning
trainer.train()

# Save the fine-tuned model and tokenizer.
# Note: for a PEFT model, save_pretrained() writes only the LoRA adapter weights,
# not the full base model.
model.save_pretrained("./fine_tuned_llama2")
tokenizer.save_pretrained("./fine_tuned_llama2")
print("Training complete. Adapter and tokenizer saved to ./fine_tuned_llama2")
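After training, the saved adapter can be reloaded for a quick inference check. The snippet below is a minimal sketch, assuming peft's AutoPeftModelForCausalLM helper and the output directory used above; the prompt string is a hypothetical placeholder, not part of the original script.

# Optional: reload the saved LoRA adapter and run a test generation (sketch)
import torch
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM

inference_model = AutoPeftModelForCausalLM.from_pretrained(
    "./fine_tuned_llama2",       # adapter directory saved above; base weights are pulled from the adapter config
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
inference_tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_llama2")

prompt = "[INST] your test input here [/INST]"  # hypothetical placeholder prompt
inputs = inference_tokenizer(prompt, return_tensors="pt").to(inference_model.device)
outputs = inference_model.generate(**inputs, max_new_tokens=200)
print(inference_tokenizer.decode(outputs[0], skip_special_tokens=True))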