Transformers and Attention Mechanisms: Modern Deep Learning Architecture
Transformers have revolutionized deep learning by replacing recurrence with attention mechanisms. They power state-of-the-art models like BERT, GPT, and T5, enabling parallel processing and better long-range dependencies.
Attention Mechanism Fundamentals
The Attention Concept
Attention allows models to focus on relevant parts of input when processing each element. Instead of processing sequentially, attention computes relationships between all positions simultaneously.
import numpy as np
import matplotlib.pyplot as plt
def scaled_dot_product_attention(Q, K, V, mask=None):
    """
    Scaled dot-product attention.

    Attention(Q, K, V) = softmax(Q @ K^T / sqrt(d_k)) @ V

    Works for any number of leading batch dimensions, e.g. single-head
    (batch, seq_len, d_k) or multi-head (batch, heads, seq_len, d_k) inputs.

    Args:
        Q: Query tensor (..., seq_len_q, d_k)
        K: Key tensor (..., seq_len_k, d_k)
        V: Value tensor (..., seq_len_k, d_v)
        mask: Optional boolean mask broadcastable to (..., seq_len_q, seq_len_k);
            True = attend, False = mask out (padding or causal attention).

    Returns:
        (output, attention_weights): output has shape (..., seq_len_q, d_v),
        attention_weights has shape (..., seq_len_q, seq_len_k) and sums to 1
        over the last axis.
    """
    d_k = Q.shape[-1]
    # Compute attention scores. swapaxes(-1, -2) instead of transpose(0, 2, 1)
    # so 4-D multi-head inputs (batch, heads, seq, d_k) also work.
    scores = np.matmul(Q, np.swapaxes(K, -1, -2)) / np.sqrt(d_k)
    # Masked positions get a large negative score so softmax drives them to ~0.
    if mask is not None:
        scores = np.where(mask, scores, -1e9)
    # Numerically stable softmax over the key axis.
    attention_weights = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
    attention_weights = attention_weights / np.sum(attention_weights, axis=-1, keepdims=True)
    # Weighted sum of values.
    output = np.matmul(attention_weights, V)
    return output, attention_weights
# Demo: run single-head attention on random tensors and inspect the result.
batch_size, seq_len, d_k = 2, 4, 8
Q = np.random.randn(batch_size, seq_len, d_k)
K = np.random.randn(batch_size, seq_len, d_k)
V = np.random.randn(batch_size, seq_len, d_k)

output, weights = scaled_dot_product_attention(Q, K, V)
print(f"Output shape: {output.shape}")
print(f"Attention weights shape: {weights.shape}")
print(f"Attention weights sum to 1: {np.allclose(weights.sum(axis=-1), 1)}")

# Heatmap of the first batch element's attention matrix
# (rows = query positions, columns = key positions).
plt.figure(figsize=(8, 6))
plt.imshow(weights[0], cmap='viridis')
plt.title('Attention Weights Heatmap')
plt.xlabel('Key Position')
plt.ylabel('Query Position')
plt.colorbar(label='Attention Weight')
plt.show()
Multi-Head Attention
Multiple attention heads allow the model to attend to different representation subspaces.
class MultiHeadAttention:
    """Multi-head attention built from NumPy primitives.

    Projects queries/keys/values into num_heads subspaces of size
    d_model // num_heads, runs scaled dot-product attention in each
    subspace, then recombines the heads and applies an output projection.
    """

    def __init__(self, d_model, num_heads):
        """
        Args:
            d_model: Model (embedding) dimension.
            num_heads: Number of attention heads; must divide d_model.
        """
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        # Small random initialization for the four projection matrices.
        self.W_q = np.random.randn(d_model, d_model) * 0.01
        self.W_k = np.random.randn(d_model, d_model) * 0.01
        self.W_v = np.random.randn(d_model, d_model) * 0.01
        self.W_o = np.random.randn(d_model, d_model) * 0.01

    def split_heads(self, x):
        """Reshape (batch, seq, d_model) -> (batch, heads, seq, d_k)."""
        n_batch, n_seq, _ = x.shape
        return x.reshape(n_batch, n_seq, self.num_heads, self.d_k).transpose(0, 2, 1, 3)

    def combine_heads(self, x):
        """Reshape (batch, heads, seq, d_k) -> (batch, seq, d_model)."""
        n_batch, _, n_seq, _ = x.shape
        return x.transpose(0, 2, 1, 3).reshape(n_batch, n_seq, self.d_model)

    def forward(self, Q, K, V):
        """Run multi-head attention; Q, K, V are all (batch, seq, d_model)."""
        # Project inputs, then split each projection into per-head chunks.
        q = self.split_heads(np.dot(Q, self.W_q))
        k = self.split_heads(np.dot(K, self.W_k))
        v = self.split_heads(np.dot(V, self.W_v))
        # Scaled dot-product attention runs per head; weights are discarded.
        heads_out, _ = scaled_dot_product_attention(q, k, v)
        # Merge heads back together and apply the final output projection.
        return np.dot(self.combine_heads(heads_out), self.W_o)
# Demo: multi-head attention on random inputs.
d_model, num_heads, seq_len = 512, 8, 10
Q = np.random.randn(2, seq_len, d_model)
K = np.random.randn(2, seq_len, d_model)
V = np.random.randn(2, seq_len, d_model)

mha = MultiHeadAttention(d_model, num_heads)
output = mha.forward(Q, K, V)
# Output keeps the input shape: (batch, seq_len, d_model).
print(f"Multi-head attention output shape: {output.shape}")
Positional Encoding
Since transformers don’t have recurrence, they need positional information.
def positional_encoding(seq_len, d_model):
    """Sinusoidal positional encodings (Vaswani et al., 2017).

    PE(pos, 2i)   = sin(pos / 10000^(2i/d_model))
    PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))

    Returns:
        Array of shape (seq_len, d_model).
    """
    positions = np.arange(seq_len)[:, np.newaxis]   # (seq_len, 1)
    dims = np.arange(d_model)[np.newaxis, :]        # (1, d_model)
    # Each sin/cos pair shares one frequency, hence dims // 2.
    inv_freq = 1 / np.power(10000, (2 * (dims // 2)) / np.float32(d_model))
    angles = positions * inv_freq                   # (seq_len, d_model)
    encoding = np.zeros((seq_len, d_model))
    encoding[:, 0::2] = np.sin(angles[:, 0::2])     # even dimensions -> sin
    encoding[:, 1::2] = np.cos(angles[:, 1::2])     # odd dimensions -> cos
    return encoding
# Demo: visualize how the encoding varies across positions and dimensions.
enc = positional_encoding(100, 512)
plt.figure(figsize=(12, 4))
# Transpose so position runs along x and dimension along y.
plt.imshow(enc.T, cmap='viridis', aspect='auto')
plt.title('Positional Encoding')
plt.xlabel('Position')
plt.ylabel('Dimension')
plt.colorbar(label='Encoding Value')
plt.show()
Transformer Architecture
Complete Transformer Block
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
def create_transformer_block(d_model, num_heads, dff, dropout_rate=0.1):
    """Build one post-norm transformer encoder block as a Keras model.

    Structure: multi-head self-attention -> dropout -> residual + LayerNorm,
    then a two-layer feed-forward net -> dropout -> residual + LayerNorm.

    Args:
        d_model: Model dimension.
        num_heads: Number of attention heads.
        dff: Hidden size of the feed-forward sublayer.
        dropout_rate: Dropout rate applied after each sublayer.
    """
    block_input = keras.Input(shape=(None, d_model))

    # Self-attention sublayer (query and value are both the block input).
    attn_out = layers.MultiHeadAttention(
        num_heads=num_heads,
        key_dim=d_model // num_heads,
        dropout=dropout_rate,
    )(block_input, block_input)
    attn_out = layers.Dropout(dropout_rate)(attn_out)
    normed_attn = layers.LayerNormalization(epsilon=1e-6)(
        layers.Add()([block_input, attn_out])
    )

    # Position-wise feed-forward sublayer.
    ffn = keras.Sequential([
        layers.Dense(dff, activation='relu'),
        layers.Dense(d_model),
    ])
    ffn_out = layers.Dropout(dropout_rate)(ffn(normed_attn))
    block_output = layers.LayerNormalization(epsilon=1e-6)(
        layers.Add()([normed_attn, ffn_out])
    )
    return keras.Model(inputs=block_input, outputs=block_output)
# Build a demo transformer block and show its layer summary.
transformer_block = create_transformer_block(
    d_model=512,
    num_heads=8,
    dff=2048,
    dropout_rate=0.1
)
# Model.summary() prints the table itself and returns None,
# so call it directly instead of wrapping it in print().
transformer_block.summary()
Full Transformer Model
def create_transformer_model(
    vocab_size,
    max_seq_len,
    d_model=512,
    num_heads=8,
    num_layers=6,
    dff=2048,
    dropout_rate=0.1,
    num_classes=10,
):
    """Build a complete transformer encoder classifier.

    Args:
        vocab_size: Size of the token vocabulary.
        max_seq_len: Fixed input sequence length (in tokens).
        d_model: Embedding / model dimension.
        num_heads: Attention heads per transformer block.
        num_layers: Number of stacked transformer blocks.
        dff: Feed-forward hidden dimension.
        dropout_rate: Dropout rate used throughout.
        num_classes: Number of output classes (default 10, matching the
            previously hard-coded classification head).

    Returns:
        An uncompiled keras.Model mapping (batch, max_seq_len) int32 tokens
        to (batch, num_classes) softmax probabilities.
    """
    inputs = keras.Input(shape=(max_seq_len,), dtype=tf.int32)
    # Token embedding
    x = layers.Embedding(vocab_size, d_model)(inputs)
    # Learned positional embedding, added to the token embeddings
    positions = tf.range(start=0, limit=max_seq_len, delta=1)
    position_embedding = layers.Embedding(
        input_dim=max_seq_len,
        output_dim=d_model
    )(positions)
    x = x + position_embedding
    x = layers.Dropout(dropout_rate)(x)
    # Stacked post-norm transformer encoder blocks
    for _ in range(num_layers):
        # Multi-head self-attention sublayer
        attention = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=d_model // num_heads,
            dropout=dropout_rate
        )(x, x)
        attention = layers.Dropout(dropout_rate)(attention)
        x = layers.Add()([x, attention])
        x = layers.LayerNormalization(epsilon=1e-6)(x)
        # Feed-forward sublayer
        ffn = keras.Sequential([
            layers.Dense(dff, activation='relu'),
            layers.Dense(d_model),
        ])
        ffn_output = ffn(x)
        ffn_output = layers.Dropout(dropout_rate)(ffn_output)
        x = layers.Add()([x, ffn_output])
        x = layers.LayerNormalization(epsilon=1e-6)(x)
    # Pool over the sequence dimension, then classify
    x = layers.GlobalAveragePooling1D()(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model
# Build and compile a smaller demo model.
transformer_model = create_transformer_model(
    vocab_size=10000,
    max_seq_len=100,
    d_model=256,
    num_heads=8,
    num_layers=4,
    dff=1024
)
transformer_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
# summary() prints the table and returns None; don't wrap it in print().
transformer_model.summary()
Using Pre-trained Transformers with Hugging Face
# Install transformers library
# pip install transformers torch
from transformers import AutoTokenizer, AutoModel, pipeline

# Load a pre-trained BERT checkpoint together with its matching tokenizer.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

# Tokenize a sentence and run it through the encoder to get contextual embeddings.
text = "The quick brown fox jumps over the lazy dog"
encoded = tokenizer(text, return_tensors="pt")
model_output = model(**encoded)
embeddings = model_output.last_hidden_state
print(f"Embeddings shape: {embeddings.shape}")

# pipeline() bundles tokenizer + model + post-processing for common tasks.

# Sentiment analysis
sentiment_pipeline = pipeline("sentiment-analysis")
result = sentiment_pipeline("I love this movie!")
print(f"Sentiment: {result}")

# Text generation
generation_pipeline = pipeline("text-generation", model="gpt2")
result = generation_pipeline("The future of AI is", max_length=50)
print(f"Generated text: {result}")

# Extractive question answering
qa_pipeline = pipeline("question-answering")
context = "The Eiffel Tower is located in Paris, France."
question = "Where is the Eiffel Tower?"
result = qa_pipeline(question=question, context=context)
print(f"Answer: {result['answer']}")
Fine-tuning Transformers
from transformers import TextClassificationPipeline, AutoModelForSequenceClassification
import torch

# Load a pre-trained encoder with a fresh sequence-classification head.
model_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2  # Binary classification
)

# Toy training data (1: positive, 0: negative).
texts = [
    "This movie is great!",
    "I didn't like this movie",
    "Amazing performance!",
    "Terrible waste of time"
]
labels = [1, 0, 1, 0]

# Tokenize the batch.
# NOTE(review): this relies on `tokenizer` created in the previous section;
# it must have been loaded from the same checkpoint as `model_name`,
# otherwise the token ids will not match the model's vocabulary.
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
labels = torch.tensor(labels)

# Fine-tune. from_pretrained() returns the model in eval mode, so switch to
# train mode explicitly — otherwise dropout stays disabled during training.
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
for epoch in range(3):
    optimizer.zero_grad()  # clear gradients before each backward pass
    outputs = model(**inputs, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")
Best Practices
- Use pre-trained models: Transfer learning is faster and more effective
- Appropriate sequence length: Balance between context and computational cost
- Normalization: use layer normalization (not batch normalization) in transformers
- Gradient clipping: Prevent exploding gradients
- Learning rate scheduling: Warm-up followed by decay
- Validation monitoring: Detect overfitting early
Common Pitfalls
Bad Practice:
# Don't: Train transformer from scratch on small dataset
model = create_transformer_model(...)
model.fit(small_dataset, epochs=100)
# Don't: Use very long sequences
inputs = tokenizer(very_long_text, max_length=4096)
# Don't: Forget to use appropriate learning rate
optimizer = keras.optimizers.Adam(learning_rate=0.1)
Good Practice:
# Do: Start from a pre-trained checkpoint and fine-tune for a few epochs
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
# ...then train with the Hugging Face Trainer API or a short manual training loop
# Do: Use reasonable sequence lengths
inputs = tokenizer(text, max_length=512, truncation=True)
# Do: Use appropriate learning rate for fine-tuning
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
Conclusion
Transformers have become the foundation of modern NLP and beyond. Understanding attention mechanisms, positional encoding, and the transformer architecture enables building state-of-the-art models. Leverage pre-trained models from Hugging Face for most applications, and fine-tune them for specific tasks. The combination of transformers’ parallel processing and attention’s ability to capture long-range dependencies makes them powerful for sequential data.
Comments