How Do You Know If Your Model Is Good?
Fine-tuning is done. The model is trained. Loss has decreased. But is the model actually doing the right thing? Low loss doesn’t necessarily mean a good model — the model might have just memorized the training data (overfitting) without truly learning.
Evaluating a fine-tuned model is an art. You need to know what to measure and how to interpret it.
1. Perplexity — The First Metric
Perplexity measures how “confused” the model is when predicting the next token: it is the exponential of the average per-token cross-entropy loss, so lower values mean the model assigns higher probability to the tokens that actually appear.
import torch
import math
from transformers import AutoModelForCausalLM, AutoTokenizer
def calculate_perplexity(model, tokenizer, texts, max_length=2048):
    """Calculate Perplexity on a set of texts"""
    model.eval()
    total_loss = 0
    total_tokens = 0

    with torch.no_grad():
        for text in texts:
            inputs = tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=max_length
            ).to(model.device)

            outputs = model(**inputs, labels=inputs["input_ids"])
            total_loss += outputs.loss.item() * inputs["input_ids"].size(1)
            total_tokens += inputs["input_ids"].size(1)

    avg_loss = total_loss / total_tokens
    perplexity = math.exp(avg_loss)
    return perplexity
# Compare model before and after fine-tuning
ppl_before = calculate_perplexity(base_model, tokenizer, val_texts)
ppl_after = calculate_perplexity(fine_tuned_model, tokenizer, val_texts)
print(f"Perplexity before: {ppl_before:.2f}")
print(f"Perplexity after: {ppl_after:.2f}")
print(f"Improvement: {(1 - ppl_after/ppl_before) * 100:.1f}%")
2. Human Evaluation — The Most Important Metric
No automated metric can replace human evaluation. A person needs to see the model’s output and judge it.
import json
import random
def create_evaluation_set(test_data, num_samples=50):
    """Create a human evaluation set"""
    samples = random.sample(test_data, min(num_samples, len(test_data)))
    evaluation_sheet = []

    for i, sample in enumerate(samples):
        # Generate response from fine-tuned model
        prompt = sample["instruction"]
        generated = generate_response(fine_tuned_model, prompt)

        evaluation_sheet.append({
            "id": i + 1,
            "prompt": prompt,
            "expected": sample["output"],   # Reference answer
            "generated": generated,         # Model answer
            "scores": {
                "accuracy": None,   # 1-5: content accuracy
                "relevance": None,  # 1-5: relevance
                "fluency": None,    # 1-5: text fluency
                "format": None,     # 1-5: format compliance
            },
            "notes": ""
        })

    # Save for review
    with open("evaluation_sheet.json", "w", encoding="utf-8") as f:
        json.dump(evaluation_sheet, f, ensure_ascii=False, indent=2)

    print(f"{len(evaluation_sheet)} samples ready for evaluation.")
    return evaluation_sheet
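The generate_response helper used above (and in the sections that follow) isn't defined in this post. A minimal sketch, assuming the Hugging Face model and tokenizer loaded earlier; the generation settings are illustrative, not tuned:

def generate_response(model, prompt, max_new_tokens=256):
    """Generate a completion for a single prompt (greedy decoding by default)."""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )
    # Decode only the newly generated tokens, not the prompt
    new_tokens = output_ids[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)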
3. Task-Specific Evaluation
Depending on the task, there are specific metrics:
# For summarization: ROUGE
from rouge_score import rouge_scorer
def evaluate_summarization(model, test_data):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
    scores = {"rouge1": [], "rouge2": [], "rougeL": []}

    for item in test_data:
        generated = generate_response(model, item["instruction"])
        score = scorer.score(item["output"], generated)
        for key in scores:
            scores[key].append(score[key].fmeasure)

    print("ROUGE Results:")
    for key, vals in scores.items():
        print(f"  {key}: {sum(vals)/len(vals):.4f}")
# For classification: Accuracy, F1
from sklearn.metrics import classification_report
def evaluate_classification(model, test_data, labels):
    predictions = []
    ground_truth = []

    for item in test_data:
        pred = generate_response(model, item["instruction"])
        pred_label = extract_label(pred, labels)
        predictions.append(pred_label)
        ground_truth.append(item["expected_label"])

    print(classification_report(ground_truth, predictions, target_names=labels))
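The extract_label helper is also assumed here: it has to map the model's free-form output back to one of the known labels. A naive, hypothetical sketch based on case-insensitive substring matching:

def extract_label(text, labels, fallback=None):
    """Return the first known label mentioned in the model output."""
    lowered = text.lower()
    for label in labels:
        if label.lower() in lowered:
            return label
    # Nothing matched: fall back so classification_report still runs
    # (in practice, track these cases separately as parsing failures)
    return fallback if fallback is not None else labels[0]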
# For code generation: pass@k
def evaluate_code_generation(model, test_problems, k=1):
    """Simplified version: one sample per problem, so this effectively measures pass@1."""
    correct = 0
    total = len(test_problems)

    for problem in test_problems:
        generated_code = generate_response(model, problem["prompt"])

        # Run the tests. Warning: exec runs untrusted model output;
        # use a sandboxed subprocess with a timeout in real evaluations.
        try:
            exec(generated_code + "\n" + problem["test_code"])
            correct += 1
        except Exception:
            pass

    pass_at_k = correct / total
    print(f"pass@{k}: {pass_at_k:.2%}")
4. A/B Testing
Direct comparison of the fine-tuned model with the base model:
import random
def ab_test(base_model, fine_tuned_model, test_prompts):
    """A/B test between two models, judged by a human"""
    results = {"base_wins": 0, "ft_wins": 0, "tie": 0}

    for prompt in test_prompts:
        # Generate a response from both models
        base_response = generate_response(base_model, prompt)
        ft_response = generate_response(fine_tuned_model, prompt)

        # Shuffle the presentation order so the judge can't tell which model is which
        if random.random() > 0.5:
            option_a, option_b = base_response, ft_response
            mapping = {"A": "base", "B": "ft"}
        else:
            option_a, option_b = ft_response, base_response
            mapping = {"A": "ft", "B": "base"}

        print(f"\nPrompt: {prompt}")
        print(f"\nOption A: {option_a[:200]}...")
        print(f"\nOption B: {option_b[:200]}...")

        # Ask the evaluator to choose
        choice = input("Which is better? (A/B/T for tie): ").upper()
        if choice == "T":
            results["tie"] += 1
        elif mapping.get(choice) == "base":
            results["base_wins"] += 1
        elif mapping.get(choice) == "ft":
            results["ft_wins"] += 1

    total = sum(results.values())
    print("\nA/B Test Results:")
    print(f"  Base model wins: {results['base_wins']} ({results['base_wins']/total:.0%})")
    print(f"  Fine-tuned wins: {results['ft_wins']} ({results['ft_wins']/total:.0%})")
    print(f"  Tie: {results['tie']} ({results['tie']/total:.0%})")
    return results
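With only a few dozen prompts, win rates are noisy. A quick sanity check is a two-sided sign test on the non-tied comparisons: could the observed win/loss split have come from a fair coin? A sketch using scipy (an extra dependency, not used elsewhere in this post):

from scipy.stats import binomtest  # scipy >= 1.7; older releases expose binom_test

def ab_significance(results):
    """p-value for the hypothesis that wins and losses are equally likely (ties ignored)."""
    n = results["ft_wins"] + results["base_wins"]
    return binomtest(results["ft_wins"], n, p=0.5, alternative="two-sided").pvalue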
5. Detecting Overfitting
Overfitting means the model has only memorized the training data and doesn’t perform well on new data.
def detect_overfitting(trainer, train_data, val_data):
    """Detect overfitting by comparing training and evaluation loss"""
    # Calculate loss on train and validation
    train_loss = evaluate_loss(trainer.model, train_data)
    val_loss = evaluate_loss(trainer.model, val_data)

    gap = val_loss - train_loss

    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation Loss: {val_loss:.4f}")
    print(f"Gap: {gap:.4f}")

    if gap > 0.5:
        print("Warning: High probability of overfitting!")
        print("Suggestions:")
        print("  - Reduce the number of epochs")
        print("  - Add more data")
        print("  - Increase lora_dropout")
        print("  - Reduce the rank")
    elif gap > 0.2:
        print("Some overfitting present — acceptable")
    else:
        print("Looking good — no overfitting detected")
Signs of Overfitting
- Training loss decreases but validation loss increases (the sketch after this list shows how to catch this during training)
- The model generates responses that are exact copies from the training dataset
- Performs well on similar questions but poorly on different ones
- Very low response diversity
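A practical safeguard against the first sign is to evaluate on the validation set during training and stop, or roll back, once validation loss starts rising. A sketch with the Hugging Face Trainer, assuming tokenized train_dataset and eval_dataset are already prepared (the argument is named evaluation_strategy in older transformers releases):

from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

args = TrainingArguments(
    output_dir="checkpoints",
    eval_strategy="steps",            # "evaluation_strategy" in older versions
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    load_best_model_at_end=True,      # restore the checkpoint with the lowest eval loss
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

trainer = Trainer(
    model=model,                      # assumed: model and datasets defined elsewhere
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
# trainer.train() will now stop early if eval loss fails to improve for 3 evaluations in a row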
6. Validation Set Strategy
Every metric above is only as trustworthy as the validation set it is computed on. A random split is usually fine, but time-dependent or multi-category data needs a more careful split to avoid leakage:
from sklearn.model_selection import train_test_split
def prepare_eval_split(data, test_size=0.1, strategy="random"):
    """Split data into train and validation"""
    if strategy == "random":
        train, val = train_test_split(data, test_size=test_size, random_state=42)

    elif strategy == "temporal":
        # For time-series data — latest examples as validation
        split_idx = int(len(data) * (1 - test_size))
        train, val = data[:split_idx], data[split_idx:]

    elif strategy == "category":
        # Split by category
        from collections import defaultdict
        categories = defaultdict(list)
        for item in data:
            cat = item.get("category", "general")
            categories[cat].append(item)

        train, val = [], []
        for cat, items in categories.items():
            t, v = train_test_split(items, test_size=test_size, random_state=42)
            train.extend(t)
            val.extend(v)

    print(f"Train: {len(train)} | Validation: {len(val)}")
    return train, val
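Usage is straightforward (assuming data is the list of instruction/output examples used throughout this post):

train_data, val_data = prepare_eval_split(data, test_size=0.1, strategy="category")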
Evaluation Checklist
After fine-tuning, follow these steps:
- Step 1: Check the training curve — has loss decreased properly?
- Step 2: Calculate perplexity on the validation set
- Step 3: Check for overfitting (compare train vs val loss)
- Step 4: Manually review 20-50 examples
- Step 5: Run an A/B test against the base model
- Step 6: Calculate task-specific metrics
Before deploying the model, always do human evaluation. Numbers and metrics matter, but the final judgment belongs to humans.
Summary
Evaluating a fine-tuned model isn’t just looking at the loss. You need to examine it from multiple angles: perplexity, human evaluation, task-specific metrics, A/B testing, and overfitting detection. Each metric reveals a different part of the picture.
In the next episode, we’ll explore DPO — a technique that can take your model’s quality one step higher after SFT, without the complexity of RLHF.