PyTorch: Dynamic Neural Networks and Flexible Deep Learning

PyTorch is an open-source machine learning framework originally developed at Facebook (now Meta), known for its dynamic computation graphs and Pythonic syntax. It is particularly popular in both research and production for its flexibility and ease of debugging.
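
As a minimal illustration of what "dynamic" means here: the graph is recorded as operations execute, so ordinary Python control flow can depend on tensor values and autograd still tracks the path that was actually taken.

import torch

x = torch.randn(3, requires_grad=True)
y = x * 2
while y.norm() < 100:   # data-dependent loop length
    y = y * 2
y.sum().backward()
print(x.grad)           # gradients reflect however many doublings actually ran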

Getting Started with PyTorch

Installation

# Install PyTorch (CPU version)
pip install torch torchvision torchaudio

# Install PyTorch with CUDA support (GPU)
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Verify installation (the second value reports whether CUDA is usable)
python -c "import torch; print(torch.__version__, torch.cuda.is_available())"

Basic Tensor Operations

import torch
import numpy as np

# Create tensors
tensor_zeros = torch.zeros(2, 3)
tensor_ones = torch.ones(2, 3)
tensor_random = torch.randn(2, 3)
tensor_range = torch.arange(0, 10, 2)

print(f"Zeros:\n{tensor_zeros}")
print(f"Ones:\n{tensor_ones}")
print(f"Random:\n{tensor_random}")

# Tensor properties
print(f"Shape: {tensor_random.shape}")
print(f"Data type: {tensor_random.dtype}")
print(f"Device: {tensor_random.device}")

# Convert between NumPy and PyTorch (on CPU, both directions share the same underlying memory)
numpy_array = np.array([1, 2, 3])
torch_tensor = torch.from_numpy(numpy_array)
back_to_numpy = torch_tensor.numpy()

print(f"NumPy to PyTorch: {torch_tensor}")
print(f"PyTorch to NumPy: {back_to_numpy}")

# Tensor operations
a = torch.tensor([[1, 2], [3, 4]], dtype=torch.float32)
b = torch.tensor([[5, 6], [7, 8]], dtype=torch.float32)

print(f"Addition: {a + b}")
print(f"Element-wise multiplication: {a * b}")
print(f"Matrix multiplication: {torch.matmul(a, b)}")
print(f"Transpose: {a.t()}")

# GPU support
if torch.cuda.is_available():
    tensor_gpu = a.to('cuda')
    print(f"Tensor on GPU: {tensor_gpu.device}")
    tensor_cpu = tensor_gpu.to('cpu')

Autograd: Automatic Differentiation

PyTorch’s autograd system automatically computes gradients for backpropagation.

import torch

# Enable gradient computation
x = torch.tensor([2.0, 3.0], requires_grad=True)
y = torch.tensor([1.0, 4.0], requires_grad=True)

# Perform operations
z = x ** 2 + y ** 3

# Compute loss
loss = z.sum()

# Backward pass (compute gradients)
loss.backward()

# Access gradients
print(f"x gradients: {x.grad}")  # dz/dx = 2x = [4, 6]
print(f"y gradients: {y.grad}")  # dz/dy = 3y^2 = [3, 48]

# Clear gradients for next iteration
x.grad.zero_()
y.grad.zero_()

# Example: Gradient descent optimization
x = torch.tensor([5.0], requires_grad=True)
optimizer = torch.optim.SGD([x], lr=0.01)

for i in range(100):
    optimizer.zero_grad()
    
    # Loss function: (x - 3)^2
    loss = (x - 3) ** 2
    
    loss.backward()
    optimizer.step()
    
    if i % 20 == 0:
        print(f"Iteration {i}, x: {x.item():.4f}, loss: {loss.item():.4f}")

Building Neural Networks with PyTorch

Using nn.Module

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Define a neural network
class SimpleNN(nn.Module):
    def __init__(self, input_size=784, hidden_size=128, num_classes=10):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(hidden_size, 64)
        self.fc3 = nn.Linear(64, num_classes)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x

# Create model
model = SimpleNN()
print(model)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Create dummy data
X_train = torch.randn(1000, 784)
y_train = torch.randint(0, 10, (1000,))

# Create data loader
dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    total_loss = 0
    for batch_X, batch_y in train_loader:
        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

Convolutional Neural Networks (CNN)

class CNN(nn.Module):
    def __init__(self, num_classes=10):
        super(CNN, self).__init__()
        
        # Convolutional layers
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        
        # Pooling
        self.pool = nn.MaxPool2d(2, 2)
        
        # Fully connected layers
        self.fc1 = nn.Linear(64 * 3 * 3, 128)  # 64 * 3 * 3 assumes 28x28 inputs (e.g. MNIST)
        self.fc2 = nn.Linear(128, num_classes)
        self.dropout = nn.Dropout(0.5)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        # Conv block 1
        x = self.relu(self.conv1(x))
        x = self.pool(x)
        
        # Conv block 2
        x = self.relu(self.conv2(x))
        x = self.pool(x)
        
        # Conv block 3
        x = self.relu(self.conv3(x))
        x = self.pool(x)
        
        # Flatten
        x = x.view(x.size(0), -1)
        
        # Fully connected
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        
        return x

# Create CNN model
cnn_model = CNN()
print(cnn_model)
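
Assuming 28x28 grayscale inputs (e.g. MNIST), which is the input size the 64 * 3 * 3 flattened dimension of fc1 corresponds to, a quick forward pass with a dummy batch confirms the shapes:

dummy_batch = torch.randn(4, 1, 28, 28)   # (batch, channels, height, width)
logits = cnn_model(dummy_batch)
print(logits.shape)                        # torch.Size([4, 10])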

Recurrent Neural Networks (RNN/LSTM)

class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, 
                           batch_first=True, dropout=0.2)
        self.fc = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x):
        # LSTM forward pass
        lstm_out, (h_n, c_n) = self.lstm(x)
        
        # Use last hidden state
        out = self.fc(h_n[-1])
        
        return out

# Create LSTM model
lstm_model = LSTMModel(input_size=10, hidden_size=64, num_layers=2, num_classes=5)
print(lstm_model)
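
A forward pass with a dummy batch shows the expected shapes (the sequence length of 15 is arbitrary; the last dimension must match input_size=10):

dummy_seq = torch.randn(8, 15, 10)   # (batch, seq_len, input_size) because batch_first=True
out = lstm_model(dummy_seq)
print(out.shape)                     # torch.Size([8, 5])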

Advanced Training Techniques

Custom Training Loop

def train_epoch(model, train_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        
        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    return total_loss / len(train_loader)

def validate(model, val_loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            
            total_loss += loss.item()
            
            _, predicted = torch.max(outputs, 1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()
    
    accuracy = 100 * correct / total
    return total_loss / len(val_loader), accuracy

# Training with validation (reuses train_loader from above; builds a matching val_loader)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SimpleNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Dummy validation data
X_val = torch.randn(200, 784)
y_val = torch.randint(0, 10, (200,))
val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=32)

num_epochs = 10
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = validate(model, val_loader, criterion, device)
    
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"  Train Loss: {train_loss:.4f}")
    print(f"  Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")

Learning Rate Scheduling

# Step-based scheduler
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

# Exponential decay
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

# Cosine annealing
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

# Use scheduler in training loop
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    scheduler.step()

Model Saving and Loading

# Save model state
torch.save(model.state_dict(), 'model_weights.pth')

# Load model state (map_location lets a file saved on GPU load onto CPU)
model = SimpleNN()
model.load_state_dict(torch.load('model_weights.pth', map_location=device))

# Save the entire model (not recommended: the pickle ties the file to the exact class definition)
torch.save(model, 'model.pth')
loaded_model = torch.load('model.pth')

# Save checkpoint with optimizer state
checkpoint = {
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss,
}
torch.save(checkpoint, 'checkpoint.pth')

# Load checkpoint
checkpoint = torch.load('checkpoint.pth')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']

Transfer Learning

import torchvision.models as models

# Load pre-trained ResNet50 (the weights argument replaces the deprecated pretrained=True)
resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)

# Freeze all layers
for param in resnet.parameters():
    param.requires_grad = False

# Replace final layer
num_features = resnet.fc.in_features
resnet.fc = nn.Linear(num_features, 10)

# Only train final layer
optimizer = optim.Adam(resnet.fc.parameters(), lr=0.001)

# Fine-tune: unfreeze some layers
for param in resnet.layer4.parameters():
    param.requires_grad = True

# Re-create the optimizer over all parameters that are now trainable
optimizer = optim.Adam(filter(lambda p: p.requires_grad, resnet.parameters()), 
                       lr=0.0001)
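
With torchvision 0.13 or newer, the same weights enum also exposes the preprocessing (resize, crop, normalization) that the pretrained backbone was trained with, which input images should go through:

weights = models.ResNet50_Weights.DEFAULT
preprocess = weights.transforms()
print(preprocess)  # the ImageClassification transform to apply to input images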

Best Practices

  1. Use GPU when available: Move tensors and models to GPU for faster training
  2. Batch normalization: Stabilizes training and improves convergence
  3. Gradient clipping: Prevents exploding gradients, especially in RNNs (see the sketch after this list)
  4. Checkpointing: Save best model during training
  5. Validation monitoring: Detect overfitting early
  6. Learning rate scheduling: Adjust learning rate during training
  7. Mixed precision training: Use float16 where safe for faster training on GPUs (combined with gradient clipping in the sketch after this list)
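
A minimal sketch of items 3 and 7 together, assuming the model, train_loader, criterion, optimizer, and device defined earlier and a CUDA-capable GPU:

scaler = torch.cuda.amp.GradScaler()

for batch_X, batch_y in train_loader:
    batch_X, batch_y = batch_X.to(device), batch_y.to(device)
    optimizer.zero_grad()

    # Run the forward pass in float16 where it is numerically safe
    with torch.cuda.amp.autocast():
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

    # Scale the loss to avoid float16 gradient underflow, then backpropagate
    scaler.scale(loss).backward()

    # Unscale before clipping so the max_norm threshold applies to the true gradients
    scaler.unscale_(optimizer)
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

    scaler.step(optimizer)
    scaler.update()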

Common Pitfalls

Bad Practice:

# Don't: Forget to zero gradients
for batch_X, batch_y in train_loader:
    outputs = model(batch_X)
    loss = criterion(outputs, batch_y)
    loss.backward()
    optimizer.step()  # Gradients accumulate!

# Don't: Train on CPU when GPU available
model = model.to('cpu')  # Should be 'cuda'

# Don't: Forget model.eval() during validation
model.train()  # Should be model.eval()
with torch.no_grad():
    outputs = model(val_data)

Good Practice:

# Do: Zero gradients each iteration
for batch_X, batch_y in train_loader:
    optimizer.zero_grad()
    outputs = model(batch_X)
    loss = criterion(outputs, batch_y)
    loss.backward()
    optimizer.step()

# Do: Use GPU when available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Do: Set model to eval mode for validation
model.eval()
with torch.no_grad():
    outputs = model(val_data)

Conclusion

PyTorch’s dynamic computation graphs and intuitive API make it ideal for research and production deep learning. The combination of autograd for automatic differentiation and flexible module design enables building complex architectures. Master tensor operations, understand autograd, and practice with real datasets to become proficient with PyTorch.
