#!/usr/bin/env python3
"""
Fine-tuning script for SmolLM2-135M model using Unsloth.
This script demonstrates how to:
1. Install and configure Unsloth
2. Prepare and format training data
3. Configure and run the training process
4. Save and evaluate the model
To run this script:
1. Install dependencies: pip install -r requirements.txt
2. Run: python train.py
"""

import os  # used only by the commented-out install helper below
from typing import Union

# Import Unsloth before transformers/trl so its runtime patches are applied.
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import get_chat_template

from datasets import (
    Dataset,
    DatasetDict,
    IterableDataset,
    IterableDatasetDict,
    load_dataset,
)
from transformers import AutoTokenizer, Trainer, TrainingArguments
from trl import SFTTrainer

# Configuration
max_seq_length = 2048  # Unsloth handles RoPE scaling internally for longer contexts
dtype = None  # None = auto-detect. float16 for Tesla T4/V100, bfloat16 for Ampere+ GPUs
load_in_4bit = True  # Use 4-bit quantization to reduce memory usage
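
# Note: a 4-bit quantized base model combined with LoRA adapters (configured in
# load_model below) is a QLoRA-style setup: the frozen base weights stay quantized
# while only the small adapter matrices are trained in higher precision.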


# def install_dependencies():
# """Install required dependencies."""
# os.system('pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"')
# os.system('pip install --no-deps xformers trl peft accelerate bitsandbytes')


def load_model() -> tuple[FastLanguageModel, AutoTokenizer]:
"""Load and configure the model."""
model, tokenizer = FastLanguageModel.from_pretrained(
model_name="unsloth/SmolLM2-135M-Instruct-bnb-4bit",
max_seq_length=max_seq_length,
dtype=dtype,
load_in_4bit=load_in_4bit,
)
# Configure LoRA
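    # LoRA is applied to all attention and MLP projection layers. With use_rslora=True
    # the effective scaling is lora_alpha / sqrt(r) = 128 / 8 = 16 (plain LoRA would use
    # lora_alpha / r = 2). use_gradient_checkpointing="unsloth" trades extra compute for
    # lower activation memory.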
model = FastLanguageModel.get_peft_model(
model,
r=64,
target_modules=[
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj",
],
lora_alpha=128,
lora_dropout=0.05,
bias="none",
use_gradient_checkpointing="unsloth",
random_state=3407,
use_rslora=True,
loftq_config=None,
)
return model, tokenizer


def load_and_format_dataset(
tokenizer: AutoTokenizer,
) -> tuple[
Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset], AutoTokenizer
]:
"""Load and format the training dataset."""
# Load the code-act dataset
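    # Each example is expected to provide a ShareGPT-style "conversations" list of
    # {"from": ..., "value": ...} turns; the chat-template mapping below converts
    # these keys to role/content.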
dataset = load_dataset("xingyaoww/code-act", split="codeact")
# Configure chat template
tokenizer = get_chat_template(
tokenizer,
chat_template="chatml", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
mapping={
"role": "from",
"content": "value",
"user": "human",
"assistant": "gpt",
}, # ShareGPT style
map_eos_token=True, # Maps <|im_end|> to </s> instead
)
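
    # With the "chatml" template, each conversation is rendered as plain text roughly like:
    #   <|im_start|>user
    #   ...<|im_end|>
    #   <|im_start|>assistant
    #   ...<|im_end|>
    # map_eos_token=True maps <|im_end|> onto the tokenizer's existing EOS token (</s>)
    # instead of adding a new special token.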

    def formatting_prompts_func(examples):
convos = examples["conversations"]
texts = [
tokenizer.apply_chat_template(
convo, tokenize=False, add_generation_prompt=False
)
for convo in convos
]
return {"text": texts}

    # Apply formatting to dataset
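    # map(batched=True) adds a "text" column with the fully formatted strings, which
    # SFTTrainer consumes via dataset_text_field="text".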
dataset = dataset.map(formatting_prompts_func, batched=True)
return dataset, tokenizer


def create_trainer(
model: FastLanguageModel,
tokenizer: AutoTokenizer,
dataset: Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset],
) -> Trainer:
"""Create and configure the SFTTrainer."""
return SFTTrainer(
model=model,
tokenizer=tokenizer,
train_dataset=dataset,
dataset_text_field="text",
max_seq_length=max_seq_length,
dataset_num_proc=2,
packing=False,
args=TrainingArguments(
per_device_train_batch_size=2,
gradient_accumulation_steps=16,
warmup_steps=100,
max_steps=120,
learning_rate=5e-5,
fp16=not is_bfloat16_supported(),
bf16=is_bfloat16_supported(),
logging_steps=1,
optim="adamw_8bit",
weight_decay=0.01,
lr_scheduler_type="cosine_with_restarts",
seed=3407,
output_dir="outputs",
gradient_checkpointing=True,
save_strategy="steps",
save_steps=30,
save_total_limit=2,
),
)


def main():
"""Main training function."""
# Install dependencies
# install_dependencies()
# Load model and tokenizer
model, tokenizer = load_model()
# Load and prepare dataset
dataset, tokenizer = load_and_format_dataset(tokenizer)
# Create trainer
trainer: Trainer = create_trainer(model, tokenizer, dataset)
# Train
trainer.train()
# Save model
trainer.save_model("final_model")
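
    # With a LoRA/PEFT model, save_model stores the adapter weights and config rather
    # than a merged full-weight checkpoint.
    # Optional smoke test (a sketch, left commented out; message keys follow the
    # ShareGPT-style mapping configured in load_and_format_dataset):
    # FastLanguageModel.for_inference(model)
    # inputs = tokenizer.apply_chat_template(
    #     [{"from": "human", "value": "Write a Python function that adds two numbers."}],
    #     add_generation_prompt=True,
    #     return_tensors="pt",
    # ).to(model.device)
    # print(tokenizer.decode(model.generate(inputs, max_new_tokens=128)[0]))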


if __name__ == "__main__":
main()