Unsloth — The Fastest Fine-Tuning Tool

Episode 6 · 20 min

Why Unsloth?

So far we’ve seen various tools for fine-tuning — Hugging Face Transformers, PEFT, TRL. They’re all good, but Unsloth has something special: exceptionally optimized speed and memory usage.

  • 2x faster than Hugging Face + PEFT
  • 60% less memory consumption
  • No accuracy loss — the computations are exact, not approximated
  • Easy installation and an API similar to Hugging Face

Unsloth achieves this by hand-writing GPU kernels in OpenAI's Triton language and manually deriving the backpropagation steps, eliminating redundant work along the way. It performs the same computations as the standard stack, just far more efficiently.

Installing Unsloth

# Install in Google Colab or local environment
# pip install unsloth

# Or the nightly version for latest features
# pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Verify installation
import unsloth
print(f"Unsloth version: {unsloth.__version__}")

import torch
print(f"CUDA: {torch.cuda.is_available()}")
print(f"GPU: {torch.cuda.get_device_name(0)}")

Step 1: Load the Model

from unsloth import FastLanguageModel
import torch

# Configuration
max_seq_length = 2048
dtype = None  # auto-detect (best dtype for GPU)
load_in_4bit = True  # QLoRA

# Load model with Unsloth
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.1-8B-Instruct",  # Model from Hugging Face
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

print("Model loaded successfully!")
print(f"Memory usage: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

Note: Unsloth publishes its own optimized models (prefixed with unsloth/). These load faster, but you can use any Hugging Face model as well.
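
For instance, pointing model_name at an original repo works the same way. A sketch, assuming you have access (meta-llama/Llama-3.1-8B-Instruct is a gated repo, so a Hugging Face token is required; substitute any model you can access):

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="meta-llama/Llama-3.1-8B-Instruct",  # original (gated) repo
    max_seq_length=2048,
    load_in_4bit=True,
    token="hf_your_token",  # needed for gated models
)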

Step 2: Configure LoRA

# Add LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r=16,                      # LoRA rank
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,            # Unsloth recommends 0
    bias="none",
    use_gradient_checkpointing="unsloth",  # Unsloth memory optimization
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# Check trainable parameters
model.print_trainable_parameters()
# Example output: trainable params: 41,943,040 (~0.52% of all parameters)
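
Where does that number come from? Each adapted weight matrix gains r × (d_in + d_out) parameters. A quick sanity check using Llama-3.1-8B's dimensions (hidden size 4096, GQA key/value dim 1024, MLP dim 14336, 32 layers — taken from the model config, not from the snippet above):

r = 16
per_layer = (
    r * (4096 + 4096) * 2     # q_proj, o_proj
    + r * (4096 + 1024) * 2   # k_proj, v_proj (smaller KV dim due to GQA)
    + r * (4096 + 14336) * 3  # gate_proj, up_proj, down_proj
)
print(per_layer * 32)  # 41,943,040 — matches the printout above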

Step 3: Prepare the Dataset

from datasets import load_dataset

# Load your dataset — here, a local JSONL file
dataset = load_dataset("json", data_files="my_dataset.jsonl", split="train")

# Use the model's own chat template
def format_prompts(examples):
    texts = []
    for messages in examples["messages"]:
        text = tokenizer.apply_chat_template(
            messages, 
            tokenize=False, 
            add_generation_prompt=False
        )
        texts.append(text)
    return {"text": texts}

dataset = dataset.map(format_prompts, batched=True)

# Check a sample
print(dataset[0]["text"][:500])
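
The loader above expects each line of my_dataset.jsonl to be a JSON object with a messages field. A minimal sketch of one such record (the contents are hypothetical):

import json

# One conversation per line; roles follow the usual chat schema
record = {
    "messages": [
        {"role": "user", "content": "What is LoRA?"},
        {"role": "assistant", "content": "LoRA adds small trainable matrices to a frozen model..."},
    ]
}
print(json.dumps(record, ensure_ascii=False))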

If your dataset is in instruction format:

# Convert instruction format to messages
def convert_to_messages(examples):
    all_messages = []
    for instruction, output in zip(examples["instruction"], examples["output"]):
        messages = [
            {"role": "system", "content": "You are a helpful and accurate assistant."},
            {"role": "user", "content": instruction},
            {"role": "assistant", "content": output},
        ]
        all_messages.append(messages)
    return {"messages": all_messages}

dataset = dataset.map(convert_to_messages, batched=True)
dataset = dataset.map(format_prompts, batched=True)

Step 4: Training Configuration

from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # True for short examples (saves time)
    args=TrainingArguments(
        # Core settings
        output_dir="outputs",
        num_train_epochs=3,
        
        # Batch size
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,   # effective batch = 8
        
        # Learning rate
        learning_rate=2e-4,
        warmup_steps=5,
        
        # Memory optimization
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        optim="adamw_8bit",
        
        # Logging
        logging_steps=10,
        save_strategy="epoch",
        
        # Speed
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
    ),
)

# Check memory before starting
gpu_stats = torch.cuda.get_device_properties(0)
used_memory = round(torch.cuda.memory_allocated() / 1024**3, 2)
total_memory = round(gpu_stats.total_memory / 1024**3, 2)
print(f"GPU: {gpu_stats.name}")
print(f"Memory used: {used_memory} GB / {total_memory} GB")

Step 5: Start Training

# Start training!
trainer_stats = trainer.train()

# Display training stats
print(f"Training time: {trainer_stats.metrics['train_runtime']:.0f} seconds")
print(f"Samples per second: {trainer_stats.metrics['train_samples_per_second']:.1f}")

# Final memory usage
used_memory = round(torch.cuda.max_memory_allocated() / 1024**3, 2)
print(f"Peak memory usage: {used_memory} GB")

Step 6: Test the Model

# Quick test of the fine-tuned model
FastLanguageModel.for_inference(model)  # Switch to inference mode

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Explain the difference between LoRA and QLoRA."},
]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

outputs = model.generate(
    input_ids=inputs,
    max_new_tokens=512,
    do_sample=True,   # temperature has no effect unless sampling is enabled
    temperature=0.7,
    use_cache=True,
)

response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)
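
For a more interactive feel, you can stream tokens to the console as they are generated; transformers' TextStreamer plugs straight into generate (a sketch reusing the inputs from above):

from transformers import TextStreamer

# Prints each token to stdout as soon as it is generated
streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    input_ids=inputs,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    streamer=streamer,
)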

Step 7: Save the Model

# Method 1: Save only the adapter (small)
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")
# Size: ~50-100 MB

# Method 2: Save the merged model (full)
model.save_pretrained_merged(
    "merged_model",
    tokenizer,
    save_method="merged_16bit",  # or "merged_4bit"
)
# Size: ~16 GB (for 8B model)

# Method 3: Save in GGUF format (for llama.cpp)
model.save_pretrained_gguf(
    "gguf_model",
    tokenizer,
    quantization_method="q4_k_m",  # or q8_0, f16, ...
)
# Size: ~4-8 GB

# Method 4: Upload to Hugging Face
model.push_to_hub_merged(
    "your-username/my-fine-tuned-model",
    tokenizer,
    save_method="merged_16bit",
    token="hf_your_token",
)
print("Model uploaded!")

Complete Code from Start to Finish

"""
Complete Fine-tuning with Unsloth
From model loading to final save
"""
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset
import torch

# ===== Configuration =====
MODEL_NAME = "unsloth/Llama-3.1-8B-Instruct"
DATASET_PATH = "my_dataset.jsonl"
OUTPUT_DIR = "outputs"
MAX_SEQ_LENGTH = 2048
EPOCHS = 3
BATCH_SIZE = 2
GRAD_ACCUM = 4
LEARNING_RATE = 2e-4
LORA_RANK = 16

# ===== Load Model =====
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    load_in_4bit=True,
)

# ===== LoRA =====
model = FastLanguageModel.get_peft_model(
    model, r=LORA_RANK, lora_alpha=LORA_RANK,
    target_modules=["q_proj","k_proj","v_proj","o_proj",
                     "gate_proj","up_proj","down_proj"],
    lora_dropout=0, bias="none",
    use_gradient_checkpointing="unsloth",
)

# ===== Dataset =====
dataset = load_dataset("json", data_files=DATASET_PATH, split="train")

def format_fn(examples):
    texts = []
    for msgs in examples["messages"]:
        texts.append(tokenizer.apply_chat_template(
            msgs, tokenize=False, add_generation_prompt=False
        ))
    return {"text": texts}

dataset = dataset.map(format_fn, batched=True)

# ===== Training =====
trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    args=TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRAD_ACCUM,
        learning_rate=LEARNING_RATE,
        warmup_steps=5,
        bf16=torch.cuda.is_bf16_supported(),
        fp16=not torch.cuda.is_bf16_supported(),
        optim="adamw_8bit",
        logging_steps=10,
        save_strategy="epoch",
        seed=3407,
    ),
)

stats = trainer.train()
print(f"Training complete! Time: {stats.metrics['train_runtime']:.0f}s")

# ===== Save =====
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")
print("Model saved!")

Tips and Tricks

  • Packing: If your examples are short (under 512 tokens), set packing=True to fit multiple examples in one sequence
  • gradient_checkpointing: Always pass "unsloth" — it saves roughly 30% VRAM compared to the standard implementation
  • lora_dropout=0: Unsloth recommends 0 — it is the fast path its kernels are optimized for
  • Unsloth models: Models prefixed with unsloth/ load faster
  • Multi-GPU: Unsloth currently works on a single GPU only

Unsloth is the best choice for single-GPU fine-tuning. If you have multiple GPUs, use Hugging Face Accelerate or DeepSpeed.

Summary

Unsloth has made fine-tuning simple and fast. With just a few lines of code, you can load a model, add LoRA, train, and save. Its main advantages are speed and low memory usage, which means you can get by with cheaper GPUs.

But an important question remains: how do you know if your fine-tuned model is actually good? In the next episode, we’ll explore methods for evaluating your model.