Introduction
Large Language Models have achieved remarkable capabilities, but their deployment remains challenging due to massive memory and computational requirements. A 70-billion parameter model in FP32 requires 280GB of memory—just for weights. Model quantization offers a solution by representing weights and activations with lower precision data types, dramatically reducing model size and accelerating inference.
In 2026, quantization has become essential for deploying LLMs. From INT8 to 2-bit representations, various algorithms now enable running large models on consumer hardware. This guide explores quantization fundamentals, algorithms, and practical implementations for compressing deep learning models.
Fundamentals of Model Quantization
What is Quantization?
Quantization maps floating-point values to a smaller set of discrete values:
import copy

import numpy as np
import torch
def naive_quantize(values: np.ndarray, bits: int = 8) -> np.ndarray:
    """
    Simple uniform (affine) quantization.

    Maps float values onto ``2**bits`` evenly spaced integer levels
    covering ``[values.min(), values.max()]``.

    Args:
        values: Array of float values to quantize.
        bits: Bit width of the target representation.

    Returns:
        Array of integer codes in ``[0, 2**bits - 1]``.  The caller must
        keep ``values.min()``/``values.max()`` to dequantize faithfully.
    """
    min_val = values.min()
    max_val = values.max()
    levels = 2 ** bits
    scale = (max_val - min_val) / (levels - 1)
    # Guard against a constant input, which would make scale == 0 and
    # cause a division by zero below.
    if scale == 0:
        scale = 1.0
    quantized = np.round((values - min_val) / scale)
    quantized = np.clip(quantized, 0, levels - 1)
    # uint8 only holds 8 bits; wider codes need uint16 so they don't wrap.
    dtype = np.uint8 if bits <= 8 else np.uint16
    return quantized.astype(dtype)
def naive_dequantize(quantized: np.ndarray, bits: int = 8,
                     min_val: float = None, max_val: float = None) -> np.ndarray:
    """
    Map integer codes produced by uniform quantization back to floats.

    Args:
        quantized: Integer codes in ``[0, 2**bits - 1]``.
        bits: Bit width used during quantization.
        min_val: Minimum of the ORIGINAL float range.  Required for a
            faithful reconstruction; if omitted, the code range itself is
            used, which only yields a degenerate identity-like mapping.
        max_val: Maximum of the ORIGINAL float range (see ``min_val``).

    Returns:
        Float array approximating the original values.
    """
    levels = 2 ** bits
    # NOTE(review): falling back to the code range loses the original
    # scale -- callers should always pass the true min/max.
    if min_val is None:
        min_val = quantized.min()
    if max_val is None:
        max_val = quantized.max()
    scale = (max_val - min_val) / (levels - 1)
    # Cast the unsigned codes to float first so the arithmetic is done
    # entirely in floating point.
    return quantized.astype(np.float64) * scale + min_val
Quantization Metrics
def compute_quantization_metrics(original: torch.Tensor,
                                 quantized: torch.Tensor) -> dict:
    """
    Compute quality metrics comparing a tensor with its quantized version.

    Args:
        original: Reference (full-precision) tensor.
        quantized: Dequantized tensor of the same shape.

    Returns:
        Dict with MSE, per-channel MSE (last dim treated as channels),
        SNR in dB, cosine similarity, and a ``perplexity`` placeholder.
    """
    err = (original - quantized) ** 2
    mse = err.mean().item()
    # Per-channel error; previously computed but accidentally never
    # returned -- now part of the result dict.
    per_channel_mse = err.mean(dim=-1)
    signal = (original ** 2).mean()
    noise = err.mean()
    # Tiny epsilon keeps the ratio finite when quantization is lossless.
    snr = 10 * torch.log10(signal / (noise + 1e-12)).item()
    cos_sim = torch.nn.functional.cosine_similarity(
        original.flatten(),
        quantized.flatten(),
        dim=0
    ).item()
    return {
        "mse": mse,
        "per_channel_mse": per_channel_mse,
        "snr_db": snr,
        "cosine_similarity": cos_sim,
        "perplexity": None,  # would be computed on the downstream task
    }
Quantization Approaches
1. Post-Training Quantization (PTQ)
class PostTrainingQuantizer:
    """
    Post-training quantization (PTQ): quantize a trained model's weights
    without any further training.
    """

    def __init__(self, model: torch.nn.Module, bits: int = 8):
        self.model = model           # reference model (left untouched)
        self.bits = bits             # target bit width
        self.quantized_model = None  # populated by quantize_weights()

    def quantize_weights(self, dataloader=None):
        """
        Quantize the weights of a deep copy of the model.

        Args:
            dataloader: Optional calibration data (used by the
                learned-scale path; ignored by uniform/binary paths).

        Returns:
            The quantized copy of the model.
        """
        self.quantized_model = copy.deepcopy(self.model)
        for name, param in self.quantized_model.named_parameters():
            # Skip non-float parameters.  The original code skipped
            # params with requires_grad=True, i.e. exactly the weights
            # it was supposed to quantize.
            if not param.data.is_floating_point():
                continue
            weight = param.data
            if self.bits == 8:
                param.data = self._uniform_quantize(weight)
            elif self.bits == 4:
                param.data = self._learned_scale_quantize(weight, dataloader)
            else:
                param.data = self._binary_quantize(weight)
        return self.quantized_model

    def _uniform_quantize(self, weight: torch.Tensor) -> torch.Tensor:
        """
        Symmetric per-tensor uniform quantization (quantize + dequantize).
        """
        max_val = weight.abs().max()
        # Largest representable magnitude of a signed b-bit code, e.g.
        # 127 for 8 bits.  Scaling by 2**bits - 1 while clamping to
        # +/- (2**bits - 1)//2 would saturate half the weight range.
        qmax = 2 ** (self.bits - 1) - 1
        scale = max_val / qmax
        if scale == 0:
            return weight.clone()  # all-zero tensor: nothing to quantize
        quantized = torch.round(weight / scale)
        quantized = torch.clamp(quantized, -qmax, qmax)
        return quantized * scale

    def _learned_scale_quantize(self, weight: torch.Tensor,
                                dataloader) -> torch.Tensor:
        """
        Placeholder for GPTQ-style quantization with learned scales.
        Currently falls back to uniform quantization.
        """
        return self._uniform_quantize(weight)

    def _binary_quantize(self, weight: torch.Tensor) -> torch.Tensor:
        """
        1-bit quantization: sign of the weight scaled by the mean
        magnitude (BinaryConnect / XNOR-style scaling).
        """
        scale = weight.abs().mean()
        return torch.sign(weight) * scale
2. Dynamic Quantization
class DynamicQuantizer:
"""
Quantization where scales are computed dynamically at runtime.
"""
@staticmethod
def quantize_dynamic(module: torch.nn.Module) -> torch.nn.Module:
"""
Apply dynamic quantization to linear layers.
"""
quantized = copy.deepcopy(module)
for name, submodule in quantized.named_children():
if isinstance(submodule, torch.nn.Linear):
# Quantize weights dynamically
weight = submodule.weight.data
# Compute scale per channel
scales = weight.abs().max(dim=1)[0]
# Quantize
quantized_weight = torch.round(weight / scales.unsqueeze(1))
quantized_weight = torch.clamp(quantized_weight, -127, 127)
# Store as int8
submodule.weight.data = quantized_weight.to(torch.int8)
submodule.weight_scale = scales
return quantized
@staticmethod
def quantize_observer(module: torch.nn.Module,
calibration_data: list) -> torch.nn.Module:
"""
Quantization with calibration to compute optimal scales.
"""
# First pass: collect statistics
observer = torch.quantization.QuantObserver()
# Second pass: compute scales
for data in calibration_data:
observer(module(data))
# Apply quantization with computed scales
return torch.quantization.quantize_dynamic(
module, observer.scales
)
3. Quantization-Aware Training (QAT)
class QuantizationAwareTraining:
    """
    Quantization-aware training (QAT): simulate quantization during
    training so the model learns to be robust to it.
    """

    def __init__(self, model: torch.nn.Module, bits: int = 8):
        self.model = model
        self.bits = bits
        # Names of layers where fake quantization should be inserted.
        self.fake_quant_modules = []

    def add_fake_quant(self):
        """
        Record the Linear/Conv2d layers that should be wrapped with fake
        quantization.

        NOTE(review): actually splicing fake quantization into the
        forward pass requires replacing each module on its parent; that
        wiring is out of scope here.  FakeQuant is a
        ``torch.autograd.Function``, so it must be invoked via
        ``FakeQuant.apply(x, bits)`` -- the original code instantiated it
        like a module and discarded the result.
        """
        for name, module in self.model.named_modules():
            if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)):
                self.fake_quant_modules.append(name)

    def train_step(self, optimizer, data, target):
        """
        One optimization step (cross-entropy classification).

        Returns:
            The scalar loss value for this batch.
        """
        self.model.train()
        output = self.model(data)
        loss = torch.nn.functional.cross_entropy(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        return loss.item()
class FakeQuant(torch.autograd.Function):
    """
    Fake quantization with a straight-through estimator (STE).

    Forward simulates symmetric b-bit quantization (quantize, clamp,
    dequantize); backward passes the gradient through unchanged.
    Use as ``FakeQuant.apply(x, bits, scale)``.
    """

    @staticmethod
    def forward(ctx, x, bits=8, scale=None):
        # Largest signed code, e.g. 127 for 8 bits.  Deriving the scale
        # from 2**bits - 1 while clamping to +/- (2**bits - 1)//2 (as the
        # original did) silently halves the usable range.
        qmax = 2 ** (bits - 1) - 1
        if scale is None:
            scale = x.abs().max() / qmax
        if float(scale) == 0.0:
            return x.clone()  # all-zero input: identity, avoid 0/0
        x_quant = torch.clamp(torch.round(x / scale), -qmax, qmax)
        return x_quant * scale

    @staticmethod
    def backward(ctx, grad_output):
        # Straight-through estimator: d(out)/d(x) ~= 1; `bits` and
        # `scale` receive no gradient.  (Nothing needs saving in forward.)
        return grad_output, None, None
Advanced Quantization Algorithms
1. GPTQ: Accurate Post-Training Quantization
class GPTQQuantizer:
    """
    GPTQ-style post-training quantization for LLMs (simplified sketch).

    Key idea: quantize layer by layer, using (an approximation of) the
    Hessian of the layer's inputs to weight the quantization error, in
    the style of optimal brain quantization.  Here the Hessian is the
    identity and groups are quantized independently.
    """

    def __init__(self, model, bits=4, group_size=128):
        self.model = model
        self.bits = bits
        self.group_size = group_size  # columns sharing one scale
        self.hessian_cache = {}

    def quantize_layer(self, layer: torch.nn.Linear,
                       dataloader, device='cuda'):
        """
        Quantize a single Linear layer in place.

        Runs the calibration data through the model, captures the layer's
        input activations with a forward hook, builds a Hessian
        approximation, and replaces the layer's weights with their
        quantized version.
        """
        in_features = layer.in_features
        out_features = layer.out_features

        def hook_fn(module, input, output):
            # NOTE(review): only the LAST calibration batch is kept; a
            # faithful GPTQ accumulates statistics over all batches.
            self.activations = input[0].detach()

        handle = layer.register_forward_hook(hook_fn)
        try:
            for data in dataloader:
                if isinstance(data, dict):
                    data = data['input']
                data = data.to(device)
                self.model(data)
        finally:
            handle.remove()  # never leave the hook attached on error

        H = self._compute_hessian(self.activations)
        weights = layer.weight.data.clone()
        layer.weight.data = self._quantize_weights(weights, H)

    def _compute_hessian(self, activations):
        """
        Hessian approximation for the layer input.

        Simplified to the identity matrix; real GPTQ computes
        ``2 * X X^T`` over the calibration activations.
        """
        return torch.eye(activations.shape[-1], device=activations.device)

    def _quantize_weights(self, weights, H):
        """
        Quantize a (out_features, in_features) weight matrix group-wise.
        """
        out_features, in_features = weights.shape
        # Ceil division so a trailing partial group is still quantized;
        # floor division silently left those columns at zero.
        num_groups = (in_features + self.group_size - 1) // self.group_size
        quantized = torch.zeros_like(weights)
        for g in range(num_groups):
            start = g * self.group_size
            end = min((g + 1) * self.group_size, in_features)
            w_g = weights[:, start:end]
            H_g = H[start:end, start:end]
            quantized[:, start:end] = self._quantize_group(w_g, H_g)
        return quantized

    def _quantize_group(self, w, H):
        """
        Symmetric quantize/dequantize of one weight group.

        ``H`` is accepted for API completeness; with the identity
        Hessian it does not influence the scale.
        """
        qmax = 2 ** (self.bits - 1) - 1  # e.g. 7 for signed 4-bit
        scale = w.abs().max() / qmax
        if scale == 0:
            return w.clone()  # all-zero group
        w_q = torch.clamp(torch.round(w / scale), -qmax, qmax)
        return w_q * scale
2. AWQ: Activation-Aware Quantization
class AWQQuantizer:
    """
    AWQ-style activation-aware weight quantization (simplified).

    Key insight: weight channels that see large activations matter more,
    so the quantization step for each input channel is derived from the
    geometric mean of the activation scale and the weight magnitude.
    """

    def __init__(self, model, bits=4):
        self.model = model
        self.bits = bits

    def quantize(self, dataloader, device='cuda'):
        """
        Quantize every Linear layer of the model in place, using
        activation statistics gathered from ``dataloader``.
        """
        act_scales = self._measure_activation_scales(dataloader, device)
        for name, module in self.model.named_modules():
            if isinstance(module, torch.nn.Linear) and name in act_scales:
                self._quantize_with_scales(module, act_scales[name], device)

    def _measure_activation_scales(self, dataloader, device):
        """
        Mean absolute input activation per channel for each Linear layer.

        Returns:
            Dict mapping layer name -> 1-D tensor of length ``in_features``.
        """
        scales = {}
        hooks = []

        def hook_fn(name):
            def fn(module, input, output):
                act = input[0] if isinstance(input, tuple) else input
                # Average over every leading dim (batch, sequence, ...)
                # so the result is one scale per INPUT CHANNEL.  The
                # original ``mean(dim=(0, 2))`` averaged over the channel
                # axis itself for (B, T, C) input, which is wrong.
                per_channel = act.abs().reshape(-1, act.shape[-1]).mean(dim=0)
                scales.setdefault(name, []).append(per_channel.detach())
            return fn

        for name, module in self.model.named_modules():
            if isinstance(module, torch.nn.Linear):
                hooks.append(module.register_forward_hook(hook_fn(name)))
        self.model.eval()
        with torch.no_grad():
            for data in dataloader:
                if isinstance(data, dict):
                    data = data['input']
                self.model(data.to(device))
        for h in hooks:
            h.remove()
        # Average the per-batch statistics.
        for name in scales:
            scales[name] = torch.stack(scales[name]).mean(dim=0)
        return scales

    def _quantize_with_scales(self, module, act_scale, device):
        """
        Quantize one Linear layer with per-input-channel steps.

        The step for channel j is sqrt(act_scale_j * max_i |W_ij|):
        channels with large activations get steps matched to their
        magnitude, protecting the important channels.
        """
        weight = module.weight.data  # (out_features, in_features)
        # Per-INPUT-channel weight magnitude.  The original took the max
        # over dim=1 (per OUTPUT channel), which cannot be combined with
        # the per-input-channel activation scale.
        weight_scale = weight.abs().amax(dim=0)
        combined = (act_scale.reshape(-1) * weight_scale).sqrt()
        combined = combined.clamp(min=1e-4)  # avoid zero steps
        qmax = 2 ** (self.bits - 1) - 1  # symmetric signed range
        w_q = torch.round(weight / combined.unsqueeze(0))
        w_q = torch.clamp(w_q, -qmax, qmax)
        module.weight.data = (w_q * combined.unsqueeze(0)).to(device)
3. SmoothQuant: Activation-Aware Per-Channel Quantization
class SmoothQuantizer:
    """
    SmoothQuant-style smoothing (simplified).

    Activation outliers make activations hard to quantize while weights
    are easy.  Smoothing migrates the difficulty: per input channel j,
    activations are divided by s_j and the matching weight COLUMN is
    multiplied by s_j, leaving the layer output mathematically unchanged.
    """

    def __init__(self, model, alpha=0.5):
        self.model = model
        # alpha balances migration strength (1.0 = all difficulty onto
        # weights, 0.0 = none); 0.5 is the paper's default.
        self.alpha = alpha

    def smooth_model(self, dataloader, device='cuda'):
        """
        Compute per-channel smoothing factors from calibration data and
        fold them into every Linear layer's weights.
        """
        act_stats = self._compute_activation_stats(dataloader, device)
        weight_stats = self._compute_weight_stats()
        for name, module in self.model.named_modules():
            if isinstance(module, torch.nn.Linear):
                if name in act_stats and name in weight_stats:
                    s = self._compute_smoothing_factor(
                        act_stats[name],
                        weight_stats[name],
                        self.alpha
                    )
                    # Scale weight COLUMNS (input channels) by s, per the
                    # SmoothQuant formulation.  The original divided the
                    # weight ROWS, which both mismatches dimensions for
                    # non-square layers and inverts the migration.
                    module.weight.data = module.weight.data * s.unsqueeze(0)
                    # Inference must divide the input activations by s.
                    module.smoothing_factor = s

    def _compute_activation_stats(self, dataloader, device):
        """Per-input-channel max |activation| for each Linear layer."""
        stats = {}
        hooks = []

        def hook_fn(name):
            def fn(module, input, output):
                act = input[0] if isinstance(input, tuple) else input
                # Flatten every leading dim so the max is per CHANNEL;
                # ``max(dim=0)`` alone is only correct for 2-D input.
                flat = act.abs().reshape(-1, act.shape[-1])
                stats.setdefault(name, []).append(flat.max(dim=0)[0].detach())
            return fn

        for name, module in self.model.named_modules():
            if isinstance(module, torch.nn.Linear):
                hooks.append(module.register_forward_hook(hook_fn(name)))
        self.model.eval()
        with torch.no_grad():
            for data in dataloader:
                if isinstance(data, dict):
                    data = data['input']
                self.model(data.to(device))
        for h in hooks:
            h.remove()
        # Average the per-batch maxima.
        for name in stats:
            stats[name] = torch.stack(stats[name]).mean(dim=0)
        return stats

    def _compute_weight_stats(self):
        """Per-INPUT-channel max |weight| for each Linear layer."""
        stats = {}
        for name, module in self.model.named_modules():
            if isinstance(module, torch.nn.Linear):
                # Max over dim=0 (rows) gives one value per input
                # channel, matching the activation statistics' shape.
                stats[name] = module.weight.abs().amax(dim=0)
        return stats

    def _compute_smoothing_factor(self, act_scale, weight_scale, alpha):
        """
        SmoothQuant formula: s_j = act_j**alpha / weight_j**(1 - alpha).
        """
        # Clamp the denominator to avoid division by zero for dead channels.
        denom = (weight_scale ** (1 - alpha)).clamp(min=1e-5)
        return (act_scale ** alpha) / denom
Implementation Framework
class QuantizationPipeline:
    """
    End-to-end quantization pipeline: dispatch to a quantization method
    and evaluate the result.
    """

    def __init__(self, model, config):
        self.model = model
        self.config = config
        # Registry of available quantizer implementations.
        self.quantizers = {
            'gptq': GPTQQuantizer,
            'awq': AWQQuantizer,
            'smoothquant': SmoothQuantizer,
            'dynamic': DynamicQuantizer,
        }

    def quantize(self, method='gptq', calibration_data=None,
                 bits=4, device='cuda'):
        """
        Quantize a deep copy of the model with the chosen method.

        Args:
            method: One of 'gptq', 'awq', 'smoothquant', 'dynamic'.
            calibration_data: Batches for calibration-based methods.
            bits: Target bit width (ignored by 'smoothquant'/'dynamic').
            device: Device calibration batches are moved to.

        Returns:
            The quantized model copy (the untouched copy for an
            unrecognized method).
        """
        quantized_model = copy.deepcopy(self.model)
        if method == 'gptq':
            quantizer = GPTQQuantizer(quantized_model, bits=bits)
            # GPTQ works layer by layer.
            for name, module in quantized_model.named_modules():
                if isinstance(module, torch.nn.Linear):
                    quantizer.quantize_layer(module, calibration_data, device)
        elif method == 'awq':
            quantizer = AWQQuantizer(quantized_model, bits=bits)
            quantizer.quantize(calibration_data, device)
        elif method == 'smoothquant':
            quantizer = SmoothQuantizer(quantized_model, alpha=0.5)
            quantizer.smooth_model(calibration_data, device)
        elif method == 'dynamic':
            quantized_model = torch.quantization.quantize_dynamic(
                quantized_model,
                {torch.nn.Linear},
                dtype=torch.qint8
            )
        return quantized_model

    def evaluate(self, quantized_model, test_data, metric='perplexity'):
        """
        Token-level perplexity of the model on ``test_data``.

        Batches are dicts with 'input_ids' and 'labels'.  ``metric`` is
        accepted for API stability; only perplexity is implemented.
        """
        quantized_model.eval()
        # Infer the device from the model instead of hard-coding 'cuda',
        # which crashed on CPU-only hosts.
        try:
            device = next(quantized_model.parameters()).device
        except StopIteration:
            device = torch.device('cpu')  # parameter-free model
        total_loss = 0.0
        num_tokens = 0
        with torch.no_grad():
            for batch in test_data:
                input_ids = batch['input_ids'].to(device)
                labels = batch['labels'].to(device)
                outputs = quantized_model(input_ids)
                loss = torch.nn.functional.cross_entropy(
                    outputs.view(-1, outputs.size(-1)),
                    labels.view(-1)
                )
                # Weight by token count so batches of different sizes
                # average correctly.
                total_loss += loss.item() * labels.numel()
                num_tokens += labels.numel()
        perplexity = np.exp(total_loss / num_tokens)
        return {'perplexity': perplexity}
Practical Considerations
1. Choosing Quantization Method
def select_quantization_method(model_size, hardware, accuracy_requirement):
    """
    Pick a quantization method from model scale, target hardware, and
    the accuracy requirement ('high', 'medium', or anything else for
    fastest deployment).
    """
    LARGE_MODEL_THRESHOLD = 30_000_000_000  # 30B parameters

    if accuracy_requirement == 'high':
        # Accuracy-first: GPTQ shines on very large models; AWQ is the
        # better balance below ~30B parameters.
        return 'gptq' if model_size > LARGE_MODEL_THRESHOLD else 'awq'
    if accuracy_requirement == 'medium':
        # Moderate accuracy: smooth outliers, then quantize.
        return 'smoothquant'
    # Default: dynamic quantization deploys with the least effort.
    return 'dynamic'
2. Mixed-Precision Quantization
class MixedPrecisionQuantizer:
    """
    Assign different bit widths per layer: sensitive layers (embeddings,
    output head) stay at 8 bits, everything else drops to 4.
    """

    def __init__(self, model):
        self.model = model
        # name -> bit width, decided once up front.
        self.layer_bits = self._analyze_layer_importance()

    def _analyze_layer_importance(self):
        """
        Heuristic precision assignment for each Linear layer.

        A production system would rank layers by activation magnitude,
        gradient sensitivity, etc.; here the embedding and head layers
        are simply kept at higher precision.
        """
        sensitive_tags = ('lm_head', 'embed')
        return {
            name: 8 if any(tag in name for tag in sensitive_tags) else 4
            for name, module in self.model.named_modules()
            if isinstance(module, torch.nn.Linear)
        }

    def quantize(self):
        """
        Apply per-layer quantization at the chosen bit widths.
        (Placeholder: the actual weight rewrite is not implemented.)
        """
        for name, module in self.model.named_modules():
            if name not in self.layer_bits:
                continue
            bits = self.layer_bits[name]
            # TODO: apply `bits`-wide quantization to `module`.
3. Hardware-Optimized Kernels
class QuantizedLinear(torch.nn.Module):
    """
    Linear layer holding int8 weight codes plus a per-output-channel
    float scale; weights are dequantized on the fly in forward().
    """

    def __init__(self, weight, weight_scale, bias=None):
        super().__init__()
        self.weight = weight              # int8 codes, shape (out, in)
        self.weight_scale = weight_scale  # float scale per output row
        self.bias = bias                  # optional float bias

    def forward(self, x):
        """Dequantize the weights, then run a standard linear transform."""
        row_scale = self.weight_scale.unsqueeze(1)
        dequantized = self.weight.float() * row_scale
        return torch.nn.functional.linear(x, dequantized, self.bias)
Quantization Results
Size Reduction
| Precision | Size Reduction | Typical Quality Loss |
|---|---|---|
| FP16 | 2x | Minimal |
| INT8 | 4x | 1-3% |
| INT4 | 8x | 3-8% |
| INT2 | 16x | 10-20% |
| INT1 (binary) | 32x | Significant |
Latency Improvements
def benchmark_inference(model, quantized_model, test_data):
    """
    Compare wall-clock inference time of the original and quantized
    models over 100 forward passes each.

    Returns:
        Dict with both raw timings (seconds) and the speedup factor.
    """
    import time

    # Warm up BOTH models so caching / lazy-init costs don't bias the
    # comparison against whichever runs first (the original only warmed
    # up `model`).
    for _ in range(10):
        model(test_data)
        quantized_model(test_data)

    # perf_counter is monotonic and higher-resolution than time.time.
    start = time.perf_counter()
    for _ in range(100):
        model(test_data)
    original_time = time.perf_counter() - start

    start = time.perf_counter()
    for _ in range(100):
        quantized_model(test_data)
    quantized_time = time.perf_counter() - start

    return {
        'original_time': original_time,
        'quantized_time': quantized_time,
        'speedup': original_time / quantized_time,
    }
Best Practices
- Calibration Data: Use 100-1000 samples representative of deployment domain
- Layer Sensitivity: Test each layer’s sensitivity to quantization
- Mixed Precision: Keep sensitive layers at higher precision
- Evaluation: Always evaluate on actual task metrics
- Hardware: Match quantization to target hardware capabilities
Future Directions in 2026
Emerging Techniques
- QuIP#: Extreme low-bit quantization using incoherence processing and lattice codebooks
- QuaRot: Rotation-based quantization for LLMs
- Activation-Quantization: Quantizing activations for even more speed
- Hardware Co-design: Specialized quantization for specific chips
Resources
Conclusion
Model quantization has become essential for deploying large AI models efficiently. From simple post-training quantization to sophisticated methods like GPTQ and AWQ, we now have tools to reduce model size by 8x or more while maintaining quality.
The key is choosing the right method for your use case: GPTQ for maximum accuracy on large models, AWQ for a good balance, and dynamic quantization for fastest deployment. As quantization techniques continue to improve, we’ll see even larger models running on consumer hardware.
In 2026, understanding quantization is essential for any AI engineer working with large models. The techniques explored here provide a foundation for building efficient, deployable AI systems.
Comments