from transformers import LlamaForCausalLM, LlamaTokenizer, Trainer, TrainingArguments, BitsAndBytesConfig
import datasets
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from accelerate import Accelerator
# Version and CUDA check
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")
print(f"Is CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
# Load Llama model and tokenizer
MODEL_ID = "meta-llama/Llama-2-7b-hf"
tokenizer = LlamaTokenizer.from_pretrained(MODEL_ID)
# Llama has no pad token by default; reuse the EOS token for padding so the
# model's embedding table does not need to be resized for a new [PAD] token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Load the model with optimizations for an A100 GPU
# (flash_attention_2 requires the flash-attn package; 8-bit loading requires bitsandbytes)
model = LlamaForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,               # bfloat16 is well supported on A100 GPUs
    device_map="auto",
    attn_implementation="flash_attention_2",  # replaces the deprecated use_flash_attention_2 flag
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # 8-bit weights for memory efficiency
)
# Prepare the quantized model for LoRA training: freezes the base weights, casts
# norm layers to fp32, and enables input gradients for gradient checkpointing
model = prepare_model_for_kbit_training(model)
# LoRA configuration
peft_config = LoraConfig(
r=16, # Rank
lora_alpha=32, # Alpha
lora_dropout=0.05, # Dropout
bias="none",
task_type="CAUSAL_LM",
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"] # Attention modules for Llama
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters() # Print percentage of trainable parameters
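# With r=16 on the four attention projections of Llama-2-7B (32 layers, 4096 hidden size),
# this should report roughly 16-17M trainable parameters, on the order of 0.25% of the base model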
# Load the dataset with field="training_pairs"
dataset = datasets.load_dataset("json", data_files="final_combined_fraud_data.json", field="training_pairs")
# Verify the dataset structure
print("First example from dataset:", dataset["train"][0])
# Define instruction template for formatting inputs
def format_instruction(example):
# Adapt this template based on your specific use case and dataset format
return f"""<s>[INST] {example['input']} [/INST] {example['output']}</s>"""
# Tokenization function
def tokenize_data(example):
    formatted_text = format_instruction(example)
    # Tokenize with padding and truncation; the template already contains <s> and </s>,
    # so skip the tokenizer's automatic BOS to avoid a duplicated start token
    inputs = tokenizer(
        formatted_text,
        add_special_tokens=False,
        padding="max_length",
        truncation=True,
        max_length=2048,  # Llama 2 context length
        return_tensors="pt",
    )
    # For causal language modeling the labels are the input_ids, with padding
    # positions masked to -100 so they are ignored by the loss
    labels = inputs["input_ids"].clone()
    labels[inputs["attention_mask"] == 0] = -100
    inputs["labels"] = labels
    # Drop the batch dimension added by return_tensors="pt"
    inputs = {k: v.squeeze(0) for k, v in inputs.items()}
    return inputs
# Tokenize the training split; datasets stores the mapped columns as Python lists
# (Arrow), so they are converted back to tensors in the collator below
tokenized_dataset = dataset["train"].map(
    tokenize_data,
    batched=False,
    remove_columns=dataset["train"].column_names,
)
# Debug: Print the first tokenized example
print("First tokenized example:", {k: (type(v), v.shape if isinstance(v, torch.Tensor) else "list") for k, v in tokenized_dataset[0].items()})
# Custom data collator: the mapped dataset yields Python lists, so convert each
# field back to a tensor before stacking into a batch
def custom_data_collator(features):
    return {
        key: torch.stack([torch.as_tensor(f[key]) for f in features])
        for key in ("input_ids", "attention_mask", "labels")
    }
# Initialize accelerator (note: Trainer drives Accelerate internally, so this explicit
# instance is not strictly required for a single-node run)
accelerator = Accelerator()
# Training setup
training_args = TrainingArguments(
output_dir="./fine_tuned_llama2",
per_device_train_batch_size=4, # Larger batch size for A100
gradient_accumulation_steps=8, # Accumulate gradients to increase effective batch size
eval_strategy="no",
save_strategy="steps",
save_steps=100,
save_total_limit=3,
num_train_epochs=3,
learning_rate=2e-5,
weight_decay=0.01,
logging_dir="./logs",
logging_steps=10,
bf16=True, # Use bfloat16 for A100 GPUs
gradient_checkpointing=True, # Memory optimization
optim="adamw_torch",
warmup_steps=100,
)
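# With these settings the effective batch size per GPU is
# per_device_train_batch_size * gradient_accumulation_steps = 4 * 8 = 32 sequences per optimizer step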
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_dataset,
data_collator=custom_data_collator,
)
# Start fine-tuning
trainer.train()
# Save the fine-tuned adapter and tokenizer (with PEFT, save_pretrained writes only
# the LoRA adapter weights and config, not the full base model)
model.save_pretrained("./fine_tuned_llama2")
tokenizer.save_pretrained("./fine_tuned_llama2")
print("Training complete. LoRA adapter and tokenizer saved to ./fine_tuned_llama2")