Transformers and Attention Mechanisms: Modern Deep Learning Architecture
Transformers have revolutionized deep learning by replacing recurrence with attention mechanisms. They power state-of-the-art models like BERT, GPT, and T5, enabling parallel processing and better long-range dependencies.
Attention Mechanism Fundamentals
The Attention Concept
Attention allows models to focus on relevant parts of input when processing each element. Instead of processing sequentially, attention computes relationships between all positions simultaneously.
import numpy as np
import matplotlib.pyplot as plt
def scaled_dot_product_attention(Q, K, V, mask=None):
    """
    Scaled dot-product attention.

    Attention(Q, K, V) = softmax(Q @ K^T / sqrt(d_k)) @ V

    Works for any number of leading batch dimensions, e.g. single-head
    (batch, seq_len, d_k) or multi-head (batch, heads, seq_len, d_k) inputs.

    Args:
        Q: Query tensor (..., seq_len_q, d_k)
        K: Key tensor (..., seq_len_k, d_k)
        V: Value tensor (..., seq_len_k, d_v)
        mask: Optional boolean mask broadcastable to (..., seq_len_q, seq_len_k);
            True = attend, False = mask out (padding or causal attention).

    Returns:
        (output, attention_weights): output has shape (..., seq_len_q, d_v),
        attention_weights has shape (..., seq_len_q, seq_len_k) and sums to 1
        over the last axis.
    """
    d_k = Q.shape[-1]
    # Compute attention scores. swapaxes(-1, -2) instead of transpose(0, 2, 1)
    # so 4-D multi-head inputs (batch, heads, seq, d_k) also work.
    scores = np.matmul(Q, np.swapaxes(K, -1, -2)) / np.sqrt(d_k)
    # Masked positions get a large negative score so softmax drives them to ~0.
    if mask is not None:
        scores = np.where(mask, scores, -1e9)
    # Numerically stable softmax over the key axis.
    attention_weights = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
    attention_weights = attention_weights / np.sum(attention_weights, axis=-1, keepdims=True)
    # Weighted sum of values.
    output = np.matmul(attention_weights, V)
    return output, attention_weights
# Demo: run single-head attention on random tensors and inspect the result.
batch_size, seq_len, d_k = 2, 4, 8
Q = np.random.randn(batch_size, seq_len, d_k)
K = np.random.randn(batch_size, seq_len, d_k)
V = np.random.randn(batch_size, seq_len, d_k)

output, weights = scaled_dot_product_attention(Q, K, V)
print(f"Output shape: {output.shape}")
print(f"Attention weights shape: {weights.shape}")
print(f"Attention weights sum to 1: {np.allclose(weights.sum(axis=-1), 1)}")

# Heatmap of the first batch element's attention matrix
# (rows = query positions, columns = key positions).
plt.figure(figsize=(8, 6))
plt.imshow(weights[0], cmap='viridis')
plt.title('Attention Weights Heatmap')
plt.xlabel('Key Position')
plt.ylabel('Query Position')
plt.colorbar(label='Attention Weight')
plt.show()
Multi-Head Attention
Multiple attention heads allow the model to attend to different representation subspaces.
class MultiHeadAttention:
    """Multi-head attention built from NumPy primitives.

    Projects queries/keys/values into num_heads subspaces of size
    d_model // num_heads, runs scaled dot-product attention in each
    subspace, then recombines the heads and applies an output projection.
    """

    def __init__(self, d_model, num_heads):
        """
        Args:
            d_model: Model (embedding) dimension.
            num_heads: Number of attention heads; must divide d_model.
        """
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        # Small random initialization for the four projection matrices.
        self.W_q = np.random.randn(d_model, d_model) * 0.01
        self.W_k = np.random.randn(d_model, d_model) * 0.01
        self.W_v = np.random.randn(d_model, d_model) * 0.01
        self.W_o = np.random.randn(d_model, d_model) * 0.01

    def split_heads(self, x):
        """Reshape (batch, seq, d_model) -> (batch, heads, seq, d_k)."""
        n_batch, n_seq, _ = x.shape
        return x.reshape(n_batch, n_seq, self.num_heads, self.d_k).transpose(0, 2, 1, 3)

    def combine_heads(self, x):
        """Reshape (batch, heads, seq, d_k) -> (batch, seq, d_model)."""
        n_batch, _, n_seq, _ = x.shape
        return x.transpose(0, 2, 1, 3).reshape(n_batch, n_seq, self.d_model)

    def forward(self, Q, K, V):
        """Run multi-head attention; Q, K, V are all (batch, seq, d_model)."""
        # Project inputs, then split each projection into per-head chunks.
        q = self.split_heads(np.dot(Q, self.W_q))
        k = self.split_heads(np.dot(K, self.W_k))
        v = self.split_heads(np.dot(V, self.W_v))
        # Scaled dot-product attention runs per head; weights are discarded.
        heads_out, _ = scaled_dot_product_attention(q, k, v)
        # Merge heads back together and apply the final output projection.
        return np.dot(self.combine_heads(heads_out), self.W_o)
# Demo: multi-head attention on random inputs.
d_model, num_heads, seq_len = 512, 8, 10
Q = np.random.randn(2, seq_len, d_model)
K = np.random.randn(2, seq_len, d_model)
V = np.random.randn(2, seq_len, d_model)

mha = MultiHeadAttention(d_model, num_heads)
output = mha.forward(Q, K, V)
# Output keeps the input shape: (batch, seq_len, d_model).
print(f"Multi-head attention output shape: {output.shape}")
Positional Encoding
Since transformers don’t have recurrence, they need positional information.
def positional_encoding(seq_len, d_model):
    """Sinusoidal positional encodings (Vaswani et al., 2017).

    PE(pos, 2i)   = sin(pos / 10000^(2i/d_model))
    PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))

    Returns:
        Array of shape (seq_len, d_model).
    """
    positions = np.arange(seq_len)[:, np.newaxis]   # (seq_len, 1)
    dims = np.arange(d_model)[np.newaxis, :]        # (1, d_model)
    # Each sin/cos pair shares one frequency, hence dims // 2.
    inv_freq = 1 / np.power(10000, (2 * (dims // 2)) / np.float32(d_model))
    angles = positions * inv_freq                   # (seq_len, d_model)
    encoding = np.zeros((seq_len, d_model))
    encoding[:, 0::2] = np.sin(angles[:, 0::2])     # even dimensions -> sin
    encoding[:, 1::2] = np.cos(angles[:, 1::2])     # odd dimensions -> cos
    return encoding
# Demo: visualize how the encoding varies across positions and dimensions.
enc = positional_encoding(100, 512)
plt.figure(figsize=(12, 4))
# Transpose so position runs along x and dimension along y.
plt.imshow(enc.T, cmap='viridis', aspect='auto')
plt.title('Positional Encoding')
plt.xlabel('Position')
plt.ylabel('Dimension')
plt.colorbar(label='Encoding Value')
plt.show()
Transformer Architecture
Complete Transformer Block
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
def create_transformer_block(d_model, num_heads, dff, dropout_rate=0.1):
    """Build one post-norm transformer encoder block as a Keras model.

    Structure: multi-head self-attention -> dropout -> residual + LayerNorm,
    then a two-layer feed-forward net -> dropout -> residual + LayerNorm.

    Args:
        d_model: Model dimension.
        num_heads: Number of attention heads.
        dff: Hidden size of the feed-forward sublayer.
        dropout_rate: Dropout rate applied after each sublayer.
    """
    block_input = keras.Input(shape=(None, d_model))

    # Self-attention sublayer (query and value are both the block input).
    attn_out = layers.MultiHeadAttention(
        num_heads=num_heads,
        key_dim=d_model // num_heads,
        dropout=dropout_rate,
    )(block_input, block_input)
    attn_out = layers.Dropout(dropout_rate)(attn_out)
    normed_attn = layers.LayerNormalization(epsilon=1e-6)(
        layers.Add()([block_input, attn_out])
    )

    # Position-wise feed-forward sublayer.
    ffn = keras.Sequential([
        layers.Dense(dff, activation='relu'),
        layers.Dense(d_model),
    ])
    ffn_out = layers.Dropout(dropout_rate)(ffn(normed_attn))
    block_output = layers.LayerNormalization(epsilon=1e-6)(
        layers.Add()([normed_attn, ffn_out])
    )
    return keras.Model(inputs=block_input, outputs=block_output)
# Build a demo transformer block and show its layer summary.
transformer_block = create_transformer_block(
    d_model=512,
    num_heads=8,
    dff=2048,
    dropout_rate=0.1
)
# Model.summary() prints the table itself and returns None,
# so call it directly instead of wrapping it in print().
transformer_block.summary()
Full Transformer Model
def create_transformer_model(
    vocab_size,
    max_seq_len,
    d_model=512,
    num_heads=8,
    num_layers=6,
    dff=2048,
    dropout_rate=0.1,
    num_classes=10,
):
    """Build a complete transformer encoder classifier.

    Args:
        vocab_size: Size of the token vocabulary.
        max_seq_len: Fixed input sequence length (in tokens).
        d_model: Embedding / model dimension.
        num_heads: Attention heads per transformer block.
        num_layers: Number of stacked transformer blocks.
        dff: Feed-forward hidden dimension.
        dropout_rate: Dropout rate used throughout.
        num_classes: Number of output classes (default 10, matching the
            previously hard-coded classification head).

    Returns:
        An uncompiled keras.Model mapping (batch, max_seq_len) int32 tokens
        to (batch, num_classes) softmax probabilities.
    """
    inputs = keras.Input(shape=(max_seq_len,), dtype=tf.int32)
    # Token embedding
    x = layers.Embedding(vocab_size, d_model)(inputs)
    # Learned positional embedding, added to the token embeddings
    positions = tf.range(start=0, limit=max_seq_len, delta=1)
    position_embedding = layers.Embedding(
        input_dim=max_seq_len,
        output_dim=d_model
    )(positions)
    x = x + position_embedding
    x = layers.Dropout(dropout_rate)(x)
    # Stacked post-norm transformer encoder blocks
    for _ in range(num_layers):
        # Multi-head self-attention sublayer
        attention = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=d_model // num_heads,
            dropout=dropout_rate
        )(x, x)
        attention = layers.Dropout(dropout_rate)(attention)
        x = layers.Add()([x, attention])
        x = layers.LayerNormalization(epsilon=1e-6)(x)
        # Feed-forward sublayer
        ffn = keras.Sequential([
            layers.Dense(dff, activation='relu'),
            layers.Dense(d_model),
        ])
        ffn_output = ffn(x)
        ffn_output = layers.Dropout(dropout_rate)(ffn_output)
        x = layers.Add()([x, ffn_output])
        x = layers.LayerNormalization(epsilon=1e-6)(x)
    # Pool over the sequence dimension, then classify
    x = layers.GlobalAveragePooling1D()(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model
# Build and compile a smaller demo model.
transformer_model = create_transformer_model(
    vocab_size=10000,
    max_seq_len=100,
    d_model=256,
    num_heads=8,
    num_layers=4,
    dff=1024
)
transformer_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
# summary() prints the table and returns None; don't wrap it in print().
transformer_model.summary()
Using Pre-trained Transformers with Hugging Face
# Install transformers library
# pip install transformers torch
from transformers import AutoTokenizer, AutoModel, pipeline

# Load a pre-trained BERT checkpoint together with its matching tokenizer.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

# Tokenize a sentence and run it through the encoder to get contextual embeddings.
text = "The quick brown fox jumps over the lazy dog"
encoded = tokenizer(text, return_tensors="pt")
model_output = model(**encoded)
embeddings = model_output.last_hidden_state
print(f"Embeddings shape: {embeddings.shape}")

# pipeline() bundles tokenizer + model + post-processing for common tasks.

# Sentiment analysis
sentiment_pipeline = pipeline("sentiment-analysis")
result = sentiment_pipeline("I love this movie!")
print(f"Sentiment: {result}")

# Text generation
generation_pipeline = pipeline("text-generation", model="gpt2")
result = generation_pipeline("The future of AI is", max_length=50)
print(f"Generated text: {result}")

# Extractive question answering
qa_pipeline = pipeline("question-answering")
context = "The Eiffel Tower is located in Paris, France."
question = "Where is the Eiffel Tower?"
result = qa_pipeline(question=question, context=context)
print(f"Answer: {result['answer']}")
Fine-tuning Transformers
from transformers import TextClassificationPipeline, AutoModelForSequenceClassification
import torch

# Load a pre-trained encoder with a fresh sequence-classification head.
model_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2  # Binary classification
)

# Toy training data (1: positive, 0: negative).
texts = [
    "This movie is great!",
    "I didn't like this movie",
    "Amazing performance!",
    "Terrible waste of time"
]
labels = [1, 0, 1, 0]

# Tokenize the batch.
# NOTE(review): this relies on `tokenizer` created in the previous section;
# it must have been loaded from the same checkpoint as `model_name`,
# otherwise the token ids will not match the model's vocabulary.
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
labels = torch.tensor(labels)

# Fine-tune. from_pretrained() returns the model in eval mode, so switch to
# train mode explicitly — otherwise dropout stays disabled during training.
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
for epoch in range(3):
    optimizer.zero_grad()  # clear gradients before each backward pass
    outputs = model(**inputs, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")
Best Practices
- Use pre-trained models: Transfer learning is faster and more effective
- Appropriate sequence length: Balance between context and computational cost
- Normalization: use layer normalization (not batch normalization) in transformers
- Gradient clipping: Prevent exploding gradients
- Learning rate scheduling: Warm-up followed by decay
- Validation monitoring: Detect overfitting early
Common Pitfalls
Bad Practice:
# Don't: Train transformer from scratch on small dataset
model = create_transformer_model(...)
model.fit(small_dataset, epochs=100)
# Don't: Use very long sequences
inputs = tokenizer(very_long_text, max_length=4096)
# Don't: Forget to use appropriate learning rate
optimizer = keras.optimizers.Adam(learning_rate=0.1)
Good Practice:
# Do: Start from a pre-trained checkpoint and fine-tune for a few epochs
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
# ...then train with the Hugging Face Trainer API or a short manual training loop
# Do: Use reasonable sequence lengths
inputs = tokenizer(text, max_length=512, truncation=True)
# Do: Use appropriate learning rate for fine-tuning
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
Conclusion
Transformers have become the foundation of modern NLP and beyond. Understanding attention mechanisms, positional encoding, and the transformer architecture enables building state-of-the-art models. Leverage pre-trained models from Hugging Face for most applications, and fine-tune them for specific tasks. The combination of transformers’ parallel processing and attention’s ability to capture long-range dependencies makes them powerful for sequential data.
Comments