Convolutional Neural Networks (CNNs): Image Processing and Computer Vision
Convolutional Neural Networks are specialized neural networks designed for processing grid-like data, particularly images. They’ve revolutionized computer vision by automatically learning hierarchical features from raw pixels.
CNN Fundamentals
Why CNNs for Images?
Traditional fully connected networks have limitations with images:
- Too many parameters (a 28×28 image = 784 inputs, so even one dense layer of 100 units needs ~78,400 weights)
- Lose spatial information
- Not translation invariant
CNNs solve these through:
- Local connectivity: Neurons connect to small regions
- Weight sharing: Same filters applied across image
- Spatial hierarchy: Learn features at multiple scales
Convolution Operation
The core operation in CNNs is convolution: sliding a filter over an image and computing element-wise products.
import numpy as np
import matplotlib.pyplot as plt
def convolve2d(image, kernel):
    """Valid-mode 2D convolution (no padding, stride 1).

    Note: the kernel is not flipped, so strictly this computes
    cross-correlation — the convention used by deep-learning frameworks.

    Args:
        image: 2D array of shape (H, W).
        kernel: 2D array of shape (kH, kW), with kH <= H and kW <= W.

    Returns:
        2D float array of shape (H - kH + 1, W - kW + 1).
    """
    kh, kw = kernel.shape
    rows = image.shape[0] - kh + 1
    cols = image.shape[1] - kw + 1
    out = np.zeros((rows, cols))
    for r in range(rows):
        for c in range(cols):
            # Element-wise product of the window with the kernel, summed.
            window = image[r:r + kh, c:c + kw]
            out[r, c] = (window * kernel).sum()
    return out
# Example: Edge detection with Sobel filter
img = np.random.rand(5, 5)
sobel_x = np.array([[-1, 0, 1],
                    [-2, 0, 2],
                    [-1, 0, 1]])
edges = convolve2d(img, sobel_x)
print(f"Input shape: {img.shape}")
print(f"Kernel shape: {sobel_x.shape}")
print(f"Output shape: {edges.shape}")
# Visualize input, filter, and response side by side
fig, axes = plt.subplots(1, 3, figsize=(12, 3))
panels = [(img, 'Input Image'), (sobel_x, 'Sobel Filter'), (edges, 'Convolution Output')]
for ax, (data, title) in zip(axes, panels):
    ax.imshow(data, cmap='gray')
    ax.set_title(title)
plt.tight_layout()
plt.show()
Pooling Operations
Pooling reduces spatial dimensions while preserving important features.
def max_pool(image, pool_size=2, stride=2):
    """Max pooling over non-overlapping (or strided) windows.

    Args:
        image: 2D array of shape (H, W).
        pool_size: Window size — either a single int (square window,
            backward compatible) or an (h, w) tuple for rectangular windows.
        stride: Step between window origins in both dimensions.

    Returns:
        2D float array of shape
        ((H - pool_h) // stride + 1, (W - pool_w) // stride + 1).
    """
    img_h, img_w = image.shape
    # Accept both the original int form and an (h, w) pair.
    if isinstance(pool_size, (tuple, list)):
        pool_h, pool_w = pool_size
    else:
        pool_h = pool_w = pool_size
    output_h = (img_h - pool_h) // stride + 1
    output_w = (img_w - pool_w) // stride + 1
    output = np.zeros((output_h, output_w))
    for i in range(output_h):
        for j in range(output_w):
            region = image[i*stride:i*stride+pool_h, j*stride:j*stride+pool_w]
            output[i, j] = np.max(region)
    return output
def avg_pool(image, pool_size=2, stride=2):
    """Average pooling over non-overlapping (or strided) windows.

    Args:
        image: 2D array of shape (H, W).
        pool_size: Window size — either a single int (square window,
            backward compatible) or an (h, w) tuple for rectangular windows.
        stride: Step between window origins in both dimensions.

    Returns:
        2D float array of shape
        ((H - pool_h) // stride + 1, (W - pool_w) // stride + 1).
    """
    img_h, img_w = image.shape
    # Accept both the original int form and an (h, w) pair.
    if isinstance(pool_size, (tuple, list)):
        pool_h, pool_w = pool_size
    else:
        pool_h = pool_w = pool_size
    output_h = (img_h - pool_h) // stride + 1
    output_w = (img_w - pool_w) // stride + 1
    output = np.zeros((output_h, output_w))
    for i in range(output_h):
        for j in range(output_w):
            region = image[i*stride:i*stride+pool_h, j*stride:j*stride+pool_w]
            output[i, j] = np.mean(region)
    return output
# Example: both pooling variants halve each spatial dimension here
feature_map = np.random.rand(8, 8)
pooled = {
    'max': max_pool(feature_map, pool_size=2, stride=2),
    'avg': avg_pool(feature_map, pool_size=2, stride=2),
}
print(f"Original shape: {feature_map.shape}")
print(f"Max pooled shape: {pooled['max'].shape}")
print(f"Avg pooled shape: {pooled['avg'].shape}")
Building CNNs with TensorFlow/Keras
Simple CNN for MNIST
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.datasets import mnist
# Load and preprocess data
(X_train, y_train), (X_test, y_test) = mnist.load_data()
# Normalize and reshape
# Scale pixel intensities from [0, 255] to [0, 1] for stable optimization
X_train = X_train.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0
# Add the trailing channel axis Conv2D expects: (N, 28, 28, 1)
X_train = X_train.reshape(-1, 28, 28, 1)
X_test = X_test.reshape(-1, 28, 28, 1)
# Build CNN
# Three conv stages (channels 32 -> 64 -> 64) with 2x2 max-pooling between
# the first two, then a small dense head with dropout for regularization.
model = keras.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(10, activation='softmax')
])
model.compile(
    optimizer='adam',
    # sparse_* variant: labels are integer class ids, not one-hot vectors
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
model.summary()
# Train
# 20% of the training set is held out for validation each epoch
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=128,
    validation_split=0.2,
    verbose=1
)
# Evaluate
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.4f}")
VGG-like Architecture
# VGG-inspired architecture
# Pattern: stacks of same-padded 3x3 convs, doubling the channel count
# (64 -> 128 -> 256 -> 512) with a 2x2 max-pool after each block, then
# two wide dense layers. With 224x224 input, four pools leave a 14x14 map
# before Flatten, so the first Dense layer dominates the parameter count.
vgg_model = keras.Sequential([
    # Block 1
    layers.Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(224, 224, 3)),
    layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D((2, 2)),
    # Block 2
    layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
    layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D((2, 2)),
    # Block 3
    layers.Conv2D(256, (3, 3), activation='relu', padding='same'),
    layers.Conv2D(256, (3, 3), activation='relu', padding='same'),
    layers.Conv2D(256, (3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D((2, 2)),
    # Block 4
    layers.Conv2D(512, (3, 3), activation='relu', padding='same'),
    layers.Conv2D(512, (3, 3), activation='relu', padding='same'),
    layers.Conv2D(512, (3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D((2, 2)),
    # Classification head
    layers.Flatten(),
    layers.Dense(4096, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(4096, activation='relu'),
    layers.Dropout(0.5),
    # 1000 classes — NOTE(review): assumes ImageNet-style labels; adjust for your dataset
    layers.Dense(1000, activation='softmax')
])
# categorical_crossentropy expects one-hot encoded targets
vgg_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
ResNet-like Architecture with Skip Connections
def residual_block(x, filters, kernel_size=3):
    """Residual block with a skip connection.

    Args:
        x: 4-D Keras tensor, shape (batch, height, width, channels).
        filters: Number of filters in each conv layer.
        kernel_size: Spatial size of the conv kernels (default 3).

    Returns:
        Tensor with the same spatial size and `filters` channels.
    """
    shortcut = x
    # Main path: the second conv has no fused activation so that the
    # final ReLU is applied after the addition.
    x = layers.Conv2D(filters, kernel_size, padding='same', activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Conv2D(filters, kernel_size, padding='same')(x)
    x = layers.BatchNormalization()(x)
    # Skip connection. If the incoming channel count differs from
    # `filters`, Add() would fail — project the shortcut with a 1x1 conv.
    # When channels already match this branch is skipped, preserving the
    # original identity-shortcut behavior.
    if shortcut.shape[-1] != filters:
        shortcut = layers.Conv2D(filters, 1, padding='same')(shortcut)
    x = layers.Add()([x, shortcut])
    x = layers.Activation('relu')(x)
    return x
# Build ResNet-like model (Keras functional API)
inputs = keras.Input(shape=(32, 32, 3))
# Stem conv lifts the input to 64 channels so every residual block
# below can use an identity shortcut (channels already match).
x = layers.Conv2D(64, 3, padding='same', activation='relu')(inputs)
# Residual blocks
for _ in range(3):
    x = residual_block(x, 64)
# Global average pooling replaces Flatten + large Dense head:
# one value per channel, far fewer parameters.
x = layers.GlobalAveragePooling2D()(x)
outputs = layers.Dense(10, activation='softmax')(x)
resnet_model = keras.Model(inputs, outputs)
# categorical_crossentropy expects one-hot encoded targets
resnet_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
Building CNNs with PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
class SimpleCNN(nn.Module):
    """Three conv stages (conv -> ReLU -> 2x2 max-pool) followed by a
    two-layer classifier head for 28x28 single-channel inputs."""

    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        # Spatial size after three 2x2 pools: 28 -> 14 -> 7 -> 3,
        # with 64 channels, hence 64 * 3 * 3 flattened features.
        self.fc1 = nn.Linear(64 * 3 * 3, 128)
        self.fc2 = nn.Linear(128, 10)
        self.dropout = nn.Dropout(0.5)
        self.relu = nn.ReLU()

    def forward(self, x):
        # Feature extractor: identical conv/pool sequence, written as a loop.
        for conv in (self.conv1, self.conv2, self.conv3):
            x = self.pool(self.relu(conv(x)))
        # Flatten everything except the batch dimension.
        flat = x.view(x.size(0), -1)
        hidden = self.dropout(self.relu(self.fc1(flat)))
        # Raw logits — pair with nn.CrossEntropyLoss (no softmax here).
        return self.fc2(hidden)
# Load MNIST
transform = transforms.Compose([
    transforms.ToTensor(),
    # Per-channel mean/std — the widely used MNIST normalization constants
    transforms.Normalize((0.1307,), (0.3081,))
])
# download=True fetches the dataset into ./data on first run
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# Train
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SimpleCNN().to(device)
# CrossEntropyLoss expects raw logits (model applies no softmax)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
for epoch in range(5):
    total_loss = 0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        loss = criterion(outputs, labels)
        # Standard step: clear stale gradients, backprop, update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss for the epoch
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")
Advanced CNN Techniques
Batch Normalization
# Batch normalization stabilizes training
# Layer order used here: Conv/Dense -> BatchNormalization -> Activation,
# so the conv layers carry no fused activation of their own.
model = keras.Sequential([
    layers.Conv2D(32, (3, 3), input_shape=(28, 28, 1)),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3)),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(128),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dense(10, activation='softmax')
])
Data Augmentation
# NOTE(review): ImageDataGenerator is a legacy API in recent TF releases —
# consider the tf.keras preprocessing layers (RandomRotation, RandomZoom, ...)
# instead; verify against your TensorFlow version.
from tensorflow.keras.preprocessing.image import ImageDataGenerator
# Create augmented data generator
# Each parameter defines a random transform applied per batch at train time.
train_datagen = ImageDataGenerator(
    rotation_range=20,         # degrees
    width_shift_range=0.2,     # fraction of width
    height_shift_range=0.2,    # fraction of height
    horizontal_flip=True,
    zoom_range=0.2,
    shear_range=0.2,
    fill_mode='nearest'        # how to fill pixels exposed by transforms
)
# Use in training
# model.fit(train_datagen.flow(X_train, y_train, batch_size=32),
# epochs=10, validation_data=(X_val, y_val))
Best Practices
- Start with pre-trained models: Transfer learning is faster than training from scratch
- Use batch normalization: Stabilizes training and improves convergence
- Data augmentation: Increases effective dataset size and improves generalization
- Regularization: Use dropout and L2 regularization to prevent overfitting
- Monitor validation loss: Detect overfitting early with validation set
- Appropriate architecture: Match architecture complexity to dataset size
Common Pitfalls
Bad Practice:
# Illustrative anti-pattern fragments — `model`, `X_raw`, `X_small`, etc.
# are placeholders, not definitions from this file.
# Don't: Use raw pixel values
model.fit(X_raw, y) # X_raw in [0, 255]
# Don't: No data augmentation with small datasets
model.fit(X_small, y_small, epochs=100)
# Don't: Forget to reshape for channels
X = X.reshape(-1, 28, 28) # Missing channel dimension
Good Practice:
# Illustrative counterpart fragments to the anti-patterns above —
# names are placeholders, not definitions from this file.
# Do: Normalize pixel values
X_normalized = X / 255.0
# Do: Use data augmentation
augmented_data = ImageDataGenerator(...).flow(X, y)
model.fit(augmented_data, epochs=100)
# Do: Include channel dimension
X = X.reshape(-1, 28, 28, 1)
Conclusion
CNNs have transformed computer vision by automatically learning hierarchical features from images. Understanding convolution, pooling, and modern architectures enables building powerful image processing systems. Start with pre-trained models, use data augmentation, and monitor validation performance to build effective CNN applications.
Comments