#!/usr/bin/env python3
"""
Fine-tuning script for SmolLM2-135M model using Unsloth.
This script demonstrates how to:
1. Install and configure Unsloth
2. Prepare and format training data
3. Configure and run the training process
4. Save and evaluate the model
To run this script:
1. Install dependencies: pip install -r requirements.txt
2. Run: python train.py
"""

import os  # used only by the commented-out install helper below
from typing import Union

# Import Unsloth before transformers/trl so its runtime patches are applied.
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import get_chat_template

from datasets import (
    Dataset,
    DatasetDict,
    IterableDataset,
    IterableDatasetDict,
    load_dataset,
)
from transformers import AutoTokenizer, Trainer, TrainingArguments
from trl import SFTTrainer

# Configuration
max_seq_length = 2048  # Unsloth handles RoPE scaling internally for longer contexts
dtype = None  # None = auto-detect. float16 for Tesla T4/V100, bfloat16 for Ampere+ GPUs
load_in_4bit = True  # Use 4-bit quantization to reduce memory usage
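
# Note: a 4-bit quantized base model combined with LoRA adapters (configured in
# load_model below) is a QLoRA-style setup: the frozen base weights stay quantized
# while only the small adapter matrices are trained in higher precision.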


# def install_dependencies():
# """Install required dependencies."""
# os.system('pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"')
# os.system('pip install --no-deps xformers trl peft accelerate bitsandbytes')


def load_model() -> tuple[FastLanguageModel, AutoTokenizer]:
"""Load and configure the model."""
model, tokenizer = FastLanguageModel.from_pretrained(
model_name="unsloth/SmolLM2-135M-Instruct-bnb-4bit",
max_seq_length=max_seq_length,
dtype=dtype,
load_in_4bit=load_in_4bit,
)
# Configure LoRA
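    # LoRA is applied to all attention and MLP projection layers. With use_rslora=True
    # the effective scaling is lora_alpha / sqrt(r) = 128 / 8 = 16 (plain LoRA would use
    # lora_alpha / r = 2). use_gradient_checkpointing="unsloth" trades extra compute for
    # lower activation memory.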
model = FastLanguageModel.get_peft_model(
model,
r=64,
target_modules=[
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj",
],
lora_alpha=128,
lora_dropout=0.05,
bias="none",
use_gradient_checkpointing="unsloth",
random_state=3407,
use_rslora=True,
loftq_config=None,
)
return model, tokenizer


def load_and_format_dataset(
tokenizer: AutoTokenizer,
) -> tuple[
Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset], AutoTokenizer
]:
"""Load and format the training dataset."""
# Load the code-act dataset
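    # Each example is expected to provide a ShareGPT-style "conversations" list of
    # {"from": ..., "value": ...} turns; the chat-template mapping below converts
    # these keys to role/content.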
dataset = load_dataset("xingyaoww/code-act", split="codeact")
# Configure chat template
tokenizer = get_chat_template(
tokenizer,
chat_template="chatml", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
mapping={
"role": "from",
"content": "value",
"user": "human",
"assistant": "gpt",
}, # ShareGPT style
map_eos_token=True, # Maps <|im_end|> to </s> instead
)
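
    # With the "chatml" template, each conversation is rendered as plain text roughly like:
    #   <|im_start|>user
    #   ...<|im_end|>
    #   <|im_start|>assistant
    #   ...<|im_end|>
    # map_eos_token=True maps <|im_end|> onto the tokenizer's existing EOS token (</s>)
    # instead of adding a new special token.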

    def formatting_prompts_func(examples):
convos = examples["conversations"]
texts = [
tokenizer.apply_chat_template(
convo, tokenize=False, add_generation_prompt=False
)
for convo in convos
]
return {"text": texts}

    # Apply formatting to dataset
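    # map(batched=True) adds a "text" column with the fully formatted strings, which
    # SFTTrainer consumes via dataset_text_field="text".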
dataset = dataset.map(formatting_prompts_func, batched=True)
return dataset, tokenizer


def create_trainer(
model: FastLanguageModel,
tokenizer: AutoTokenizer,
dataset: Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset],
) -> Trainer:
"""Create and configure the SFTTrainer."""
return SFTTrainer(
model=model,
tokenizer=tokenizer,
train_dataset=dataset,
dataset_text_field="text",
max_seq_length=max_seq_length,
dataset_num_proc=2,
packing=False,
args=TrainingArguments(
per_device_train_batch_size=2,
gradient_accumulation_steps=16,
warmup_steps=100,
max_steps=120,
learning_rate=5e-5,
fp16=not is_bfloat16_supported(),
bf16=is_bfloat16_supported(),
logging_steps=1,
optim="adamw_8bit",
weight_decay=0.01,
lr_scheduler_type="cosine_with_restarts",
seed=3407,
output_dir="outputs",
gradient_checkpointing=True,
save_strategy="steps",
save_steps=30,
save_total_limit=2,
),
)


def main():
"""Main training function."""
# Install dependencies
# install_dependencies()
# Load model and tokenizer
model, tokenizer = load_model()
# Load and prepare dataset
dataset, tokenizer = load_and_format_dataset(tokenizer)
# Create trainer
trainer: Trainer = create_trainer(model, tokenizer, dataset)
# Train
trainer.train()
# Save model
trainer.save_model("final_model")
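
    # With a LoRA/PEFT model, save_model stores the adapter weights and config rather
    # than a merged full-weight checkpoint.
    # Optional smoke test (a sketch, left commented out; message keys follow the
    # ShareGPT-style mapping configured in load_and_format_dataset):
    # FastLanguageModel.for_inference(model)
    # inputs = tokenizer.apply_chat_template(
    #     [{"from": "human", "value": "Write a Python function that adds two numbers."}],
    #     add_generation_prompt=True,
    #     return_tensors="pt",
    # ).to(model.device)
    # print(tokenizer.decode(model.generate(inputs, max_new_tokens=128)[0]))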


if __name__ == "__main__":
main()