Fine-tuning Large Language Models: Adapting LLMs for Specific Tasks
Fine-tuning adapts pre-trained LLMs to specific tasks or domains. This guide covers efficient fine-tuning methods and practical implementation.
Fine-tuning Fundamentals
Why Fine-tune?
# Pre-trained model limitations
# - General knowledge, not domain-specific
# - May not follow specific instructions
# - Lacks task-specific patterns
# Fine-tuning benefits
# - Adapt to specific domain
# - Improve performance on target task
# - Reduce hallucinations
# - Better instruction following
Full Fine-tuning vs Parameter-Efficient Methods
# Full fine-tuning: Update all parameters
# Pros: Best performance
# Cons: High memory, slow, requires large dataset
# LoRA (Low-Rank Adaptation): Update small matrices
# Pros: Memory efficient, fast, works with small datasets
# Cons: Slightly lower performance
# QLoRA: Quantized LoRA
# Pros: Very memory efficient, runs on consumer GPUs
# Cons: Slower training
# Prefix tuning: Add learnable prefix
# Pros: Efficient, modular
# Cons: Limited flexibility
Full Fine-tuning
Fine-tuning with Hugging Face
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)
from datasets import load_dataset
import torch

# Load model and tokenizer.
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# GPT-2 ships without a pad token; reuse EOS so uneven batches can be padded.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)

# Prepare dataset: the 'text' loader yields one example per line of each file.
dataset = load_dataset('text', data_files={'train': 'train.txt', 'validation': 'val.txt'})

def tokenize_function(examples):
    """Tokenize a batch of raw text examples, truncating to the model context."""
    return tokenizer(examples['text'], truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# The collator pads each batch dynamically and copies input_ids into labels
# (mlm=False selects the causal-LM objective). Without it the Trainer
# receives no 'labels' field and cannot compute a loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy='epoch',
    save_strategy='epoch',       # must match evaluation_strategy for the line below
    load_best_model_at_end=True,
)

# Trainer wires model, data, and hyperparameters together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator,
)

# Train.
trainer.train()

# Save model and tokenizer together so the output directory is self-contained.
model.save_pretrained('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')
Fine-tuning for Instruction Following
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)
from datasets import Dataset
import json

# Load instruction-following dataset: one JSON object per line (JSONL).
with open('instructions.jsonl', 'r') as f:
    data = [json.loads(line) for line in f]

# Format each record as a single prompt/response training string.
texts = [
    f"Instruction: {item['instruction']}\nResponse: {item['response']}"
    for item in data
]
dataset = Dataset.from_dict({'text': texts})

# Tokenize.
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default

def tokenize_function(examples):
    """Tokenize a batch of formatted instruction strings."""
    # No static padding here: the collator below pads per batch, which is
    # cheaper than padding every example to max_length.
    return tokenizer(examples['text'], truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Pads each batch and mirrors input_ids into labels (causal-LM objective);
# without a collator the Trainer has no labels to compute a loss from.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Train.
model = AutoModelForCausalLM.from_pretrained('gpt2')
training_args = TrainingArguments(
    output_dir='./instruction_model',
    num_train_epochs=5,
    per_device_train_batch_size=4,
    learning_rate=2e-5,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)
trainer.train()
Parameter-Efficient Fine-tuning
LoRA (Low-Rank Adaptation)
from peft import get_peft_model, LoraConfig, TaskType
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the base model and tokenizer to be adapted.
model_name = "meta-llama/Llama-2-7b"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# LoRA injects small trainable low-rank matrices into the selected
# projection layers while the base weights stay frozen.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,                                  # rank of the low-rank update
    lora_alpha=32,                        # scaling applied to the update
    lora_dropout=0.1,
    bias="none",                          # leave bias terms untouched
    target_modules=["q_proj", "v_proj"],  # which modules receive adapters
)

# Wrap the base model with the adapters.
model = get_peft_model(model, lora_config)

# Show how few parameters actually train, e.g.:
# trainable params: 4,194,304 || all params: 6,738,415,616 || trainable%: 0.06
model.print_trainable_parameters()

# Fine-tune with LoRA.
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='./lora_model',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=1e-4,   # adapters tolerate a higher LR than full fine-tuning
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,  # assumes a tokenized dataset prepared earlier
)
trainer.train()

# Persist only the small adapter weights, not the full base model.
model.save_pretrained('./lora_weights')
QLoRA (Quantized LoRA)
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig, TaskType
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# 4-bit NF4 quantization with double quantization; computation in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the base model already quantized, placed automatically across
# available devices.
model_name = "meta-llama/Llama-2-7b"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# Prepare the quantized model for training (PEFT helper for k-bit models).
model = prepare_model_for_kbit_training(model)

# Adapter configuration: higher rank than the plain-LoRA example and all
# four attention projections targeted.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
)

# Attach the LoRA adapters to the quantized base.
model = get_peft_model(model, lora_config)

# Training then proceeds as with plain LoRA, at a fraction of the memory
# cost of full fine-tuning.
Instruction Fine-tuning
Creating Instruction Datasets
import json

# Format 1: plain instruction/response pairs.
instruction_data = [
    {
        "instruction": "What is machine learning?",
        "response": "Machine learning is a subset of AI that enables systems to learn from data."
    },
    {
        "instruction": "Explain neural networks",
        "response": "Neural networks are computational models inspired by biological neurons..."
    }
]

# Format 2: instruction plus a separate input the model should operate on.
instruction_input_output = [
    {
        "instruction": "Classify sentiment",
        "input": "This product is amazing!",
        "output": "Positive"
    }
]

# Format 3: multi-turn conversation transcripts.
conversation_data = [
    {
        "messages": [
            {"role": "user", "content": "What is Python?"},
            {"role": "assistant", "content": "Python is a programming language..."},
            {"role": "user", "content": "What are its uses?"},
            {"role": "assistant", "content": "Python is used for..."}
        ]
    }
]

# Persist format 1 as JSONL: one JSON object per line.
with open('instructions.jsonl', 'w') as f:
    f.writelines(json.dumps(record) + '\n' for record in instruction_data)
Fine-tuning for Chat
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)
from datasets import load_dataset

# Load chat dataset: one {"messages": [...]} object per line.
dataset = load_dataset('json', data_files='chat_data.jsonl')

# Tokenize conversations.
# NOTE(review): the hub id for this model is usually
# 'meta-llama/Llama-2-7b-chat-hf' — verify against the Hugging Face Hub.
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-chat')
if tokenizer.pad_token is None:
    # Llama tokenizers ship without a pad token; needed for batch padding.
    tokenizer.pad_token = tokenizer.eos_token

def format_chat(examples):
    """Flatten each message list into a 'User:/Assistant:' transcript, then tokenize."""
    formatted = []
    for messages in examples['messages']:
        text = ""
        for msg in messages:
            if msg['role'] == 'user':
                text += f"User: {msg['content']}\n"
            else:
                text += f"Assistant: {msg['content']}\n"
        formatted.append(text)
    return tokenizer(formatted, truncation=True, max_length=512)

tokenized_dataset = dataset.map(format_chat, batched=True)

# Pads each batch and mirrors input_ids into labels for the causal-LM loss;
# without it the Trainer cannot batch variable-length examples or compute loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Train.
model = AutoModelForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat')
training_args = TrainingArguments(
    output_dir='./chat_model',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    learning_rate=2e-5,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    data_collator=data_collator,
)
trainer.train()
Evaluation and Testing
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch

# --- Qualitative check: sample generations from the fine-tuned model ---
model_path = './fine_tuned_model'
generator = pipeline('text-generation', model=model_path)

test_prompts = [
    "Machine learning is",
    "The future of AI",
    "Python is used for"
]
for prompt in test_prompts:
    result = generator(prompt, max_length=50, num_return_sequences=1)
    print(f"Prompt: {prompt}")
    print(f"Generated: {result[0]['generated_text']}\n")

# --- Quantitative check: accuracy/F1 on a labeled benchmark ---
from datasets import load_dataset
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score

# Use the validation split: the GLUE MRPC *test* split ships with label -1
# (unlabeled), so metrics computed on it are meaningless.
test_dataset = load_dataset('glue', 'mrpc', split='validation')

# A text-generation model has no class logits; classification metrics need a
# model with a sequence-classification head fine-tuned for this task.
# NOTE(review): assumes model_path contains such a checkpoint — confirm.
clf_tokenizer = AutoTokenizer.from_pretrained(model_path)
clf_model = AutoModelForSequenceClassification.from_pretrained(model_path)
clf_model.eval()  # disable dropout for deterministic evaluation

def collate(batch):
    """Tokenize a batch of MRPC sentence pairs; raw strings can't be fed to the model."""
    enc = clf_tokenizer(
        [ex['sentence1'] for ex in batch],
        [ex['sentence2'] for ex in batch],
        truncation=True,
        padding=True,
        return_tensors='pt',
    )
    enc['labels'] = torch.tensor([ex['label'] for ex in batch])
    return enc

predictions = []
labels = []
for batch in DataLoader(test_dataset, batch_size=8, collate_fn=collate):
    gold = batch.pop('labels')  # keep labels out of the forward pass inputs
    with torch.no_grad():
        outputs = clf_model(**batch)
    preds = torch.argmax(outputs.logits, dim=-1)
    predictions.extend(preds.cpu().numpy())
    labels.extend(gold.numpy())

accuracy = accuracy_score(labels, predictions)
f1 = f1_score(labels, predictions, average='weighted')
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
Best Practices
- Start with small learning rate: 1e-5 to 5e-5 for fine-tuning
- Use validation set: Monitor overfitting
- Gradient accumulation: Simulate larger batch sizes
- Warmup steps: Gradually increase learning rate
- Early stopping: Stop when validation loss plateaus
- Save checkpoints: Save best model during training
- Use LoRA for efficiency: Especially for large models
Common Pitfalls
Bad Practice:
# Don't: Use high learning rate
# (0.01 is orders of magnitude above typical fine-tuning rates and will
# catastrophically overwrite the pre-trained weights.)
training_args = TrainingArguments(learning_rate=0.01)
# Don't: No validation
# (Without eval_dataset there is no way to detect overfitting during training.)
trainer = Trainer(model=model, args=args, train_dataset=train_data)
# Don't: Train on full dataset without splitting
# NOTE(review): `.fit(...)` is Keras-style, not a transformers API — shown
# here only as an anti-pattern illustration.
model.fit(all_data, epochs=100)
Good Practice:
# Do: Use appropriate learning rate
# (2e-5 is within the commonly recommended 1e-5 to 5e-5 range for fine-tuning.)
training_args = TrainingArguments(learning_rate=2e-5)
# Do: Monitor validation
# (Passing eval_dataset lets the Trainer report validation loss and enables
# early stopping / best-model selection.)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_data,
    eval_dataset=val_data,
)
# Do: Split data properly
# (requires: from sklearn.model_selection import train_test_split)
train_data, val_data = train_test_split(data, test_size=0.2)
Conclusion
Fine-tuning adapts pre-trained LLMs to specific tasks efficiently. Use parameter-efficient methods like LoRA for resource-constrained environments. Carefully prepare datasets, monitor training, and evaluate on appropriate benchmarks to build effective domain-specific models.
Comments