Recurrent Neural Networks (RNNs) and LSTMs: Sequence Processing
Recurrent Neural Networks are designed for sequential data where the order matters. They maintain hidden state that captures information from previous time steps, making them ideal for time series, text, and other sequential tasks.
RNN Fundamentals
Why RNNs for Sequences?
Traditional feedforward networks treat inputs independently, but sequences have temporal dependencies:
- Previous words influence the next word in a text
- Past prices influence future stock prices
- Previous frames affect the next frame in a video
RNNs solve this through recurrent connections that maintain hidden state across time steps.
Basic RNN Architecture
import numpy as np
import matplotlib.pyplot as plt
class SimpleRNN:
    """Minimal many-to-one vanilla RNN trained with backpropagation through time.

    Processes a batch of sequences, carries a single tanh hidden state across
    time steps, and emits one softmax class distribution per sequence computed
    from the final hidden state.
    """

    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate
        # Small random weights keep tanh near its linear regime at the start.
        self.Wxh = np.random.randn(input_size, hidden_size) * 0.01   # input -> hidden
        self.Whh = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden -> hidden
        self.Why = np.random.randn(hidden_size, output_size) * 0.01  # hidden -> output
        self.bh = np.zeros((1, hidden_size))
        self.by = np.zeros((1, output_size))

    def tanh(self, x):
        """Hidden-state activation."""
        return np.tanh(x)

    def tanh_derivative(self, x):
        """Derivative of tanh w.r.t. its *pre-activation* input x.

        Note: if you already hold the activation h = tanh(x), the derivative
        is simply 1 - h**2 (used directly in backward()).
        """
        return 1 - np.tanh(x) ** 2

    def softmax(self, x):
        """Row-wise softmax, shifted by the row max for numerical stability."""
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def forward(self, X):
        """Forward pass through the sequence.

        Args:
            X: array of shape (batch_size, seq_len, input_size).

        Returns:
            Softmax class probabilities, shape (batch_size, output_size).
        """
        batch_size, seq_len, _ = X.shape
        h = np.zeros((batch_size, self.hidden_size))
        # Cache activations for BPTT: h_cache[t] is the state entering step t,
        # h_cache[t + 1] the (post-tanh) state produced by step t.
        self.h_cache = [h]
        self.x_cache = []
        for t in range(seq_len):
            x_t = X[:, t, :]
            self.x_cache.append(x_t)
            # h_t = tanh(x_t @ Wxh + h_{t-1} @ Whh + bh)
            h = self.tanh(np.dot(x_t, self.Wxh) + np.dot(h, self.Whh) + self.bh)
            self.h_cache.append(h)
        # Many-to-one: classify from the final hidden state only.
        y = np.dot(h, self.Why) + self.by
        return self.softmax(y)

    def backward(self, X, y, output):
        """Backpropagation through time (BPTT) followed by an SGD step.

        The implied loss is the summed cross-entropy over the batch
        (dL/dlogits = output - y for softmax + cross-entropy).

        Args:
            X: inputs, shape (batch_size, seq_len, input_size).
            y: one-hot targets, shape (batch_size, output_size).
            output: softmax probabilities returned by forward(X).
        """
        batch_size, seq_len, _ = X.shape
        # Softmax + cross-entropy gradient w.r.t. the output logits.
        dy = output - y
        dWhy = np.dot(self.h_cache[-1].T, dy)
        dby = np.sum(dy, axis=0, keepdims=True)
        # Gradient entering the final hidden state.
        dh = np.dot(dy, self.Why.T)
        dWxh = np.zeros_like(self.Wxh)
        dWhh = np.zeros_like(self.Whh)
        dbh = np.zeros_like(self.bh)
        for t in range(seq_len - 1, -1, -1):
            # h_cache stores post-tanh activations, so d tanh/dz = 1 - h**2.
            # (Bug fix: the previous code called tanh_derivative on the
            # already-activated h, computing 1 - tanh(tanh(z))**2.)
            dh_raw = dh * (1 - self.h_cache[t + 1] ** 2)
            # Accumulate parameter gradients across time steps.
            dWxh += np.dot(self.x_cache[t].T, dh_raw)
            dWhh += np.dot(self.h_cache[t].T, dh_raw)
            dbh += np.sum(dh_raw, axis=0, keepdims=True)
            # Carry the gradient back to the previous time step.
            dh = np.dot(dh_raw, self.Whh.T)
        # Element-wise clipping guards against exploding gradients.
        for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
            np.clip(dparam, -5, 5, out=dparam)
        # Vanilla SGD update.
        self.Wxh -= self.learning_rate * dWxh
        self.Whh -= self.learning_rate * dWhh
        self.Why -= self.learning_rate * dWhy
        self.bh -= self.learning_rate * dbh
        self.by -= self.learning_rate * dby
# Example: Simple sequence prediction (many-to-one classification)
np.random.seed(42)  # reproducible demo
X = np.random.randn(32, 10, 5)  # batch_size=32, seq_len=10, input_size=5
# Bug fix: targets must be one-hot vectors of shape (32, 3) to pair with the
# softmax output; random ints in a (32, 3) array are not valid one-hot labels.
labels = np.random.randint(0, 3, size=32)  # 3 output classes
y = np.eye(3)[labels]                      # one-hot, shape (32, 3)
rnn = SimpleRNN(input_size=5, hidden_size=20, output_size=3)
output = rnn.forward(X)
print(f"Output shape: {output.shape}")
The Vanishing Gradient Problem
RNNs suffer from vanishing gradients when backpropagating through many time steps.
# Visualization of vanishing gradients
import numpy as np
import matplotlib.pyplot as plt
# Simulate gradient flow through time
def simulate_gradient_flow(num_steps, gradient_factor=0.9):
    """Simulate how a gradient's magnitude shrinks across time steps.

    Starts at magnitude 1.0 and multiplies by gradient_factor once per step,
    returning the full history (length num_steps + 1).
    """
    magnitude = 1.0
    history = [magnitude]
    for _ in range(num_steps):
        magnitude *= gradient_factor
        history.append(magnitude)
    return history
# Compare how quickly the gradient decays for several per-step factors.
steps = 50
plt.figure(figsize=(10, 5))
factors = [0.5, 0.8, 0.9, 0.95]
for factor in factors:
    decay_curve = simulate_gradient_flow(steps, factor)
    plt.plot(decay_curve, label=f'Factor: {factor}')
plt.xlabel('Time Step')
plt.ylabel('Gradient Magnitude')
plt.title('Vanishing Gradient Problem in RNNs')
plt.legend()
plt.yscale('log')  # exponential decay appears as a straight line on a log axis
plt.grid(True)
plt.show()
LSTMs: Long Short-Term Memory
LSTMs solve the vanishing gradient problem through gating mechanisms.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# LSTM cell components
class LSTMCell:
    """Reference equations for a single LSTM cell (documentation only).

    All gates operate on the concatenated input [h_{t-1}, x_t]:

        Forget gate:  f_t = sigmoid(Wf @ [h_{t-1}, x_t] + bf)
        Input gate:   i_t = sigmoid(Wi @ [h_{t-1}, x_t] + bi)
        Candidate:    C_tilde = tanh(Wc @ [h_{t-1}, x_t] + bc)
        Cell state:   C_t = f_t * C_{t-1} + i_t * C_tilde
        Output gate:  o_t = sigmoid(Wo @ [h_{t-1}, x_t] + bo)
        Hidden state: h_t = o_t * tanh(C_t)
    """
    pass
# Build LSTM model with Keras.
# Bug fix: `timesteps` and `features` were never defined anywhere in the file,
# so this snippet raised NameError; give them concrete example values.
timesteps, features = 50, 10  # example input shape; adjust to your data
lstm_model = keras.Sequential([
    # NOTE(review): activation='relu' overrides the default tanh and disables
    # the fused cuDNN kernel; kept as-is to preserve the example's behavior.
    layers.LSTM(128, activation='relu', input_shape=(timesteps, features),
                return_sequences=True),  # emit full sequence so the next LSTM can stack
    layers.Dropout(0.2),
    layers.LSTM(64, activation='relu'),  # final LSTM returns only the last state
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')  # binary classification head
])
lstm_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)
Building RNN/LSTM Models with TensorFlow/Keras
Time Series Prediction
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import MinMaxScaler
# Generate synthetic time series data
def generate_time_series(num_samples=1000, seq_length=50):
    """Generate a noisy sine wave and slice it into (window, next value) pairs.

    Returns:
        windows: array of shape (num_samples - seq_length, seq_length).
        targets: array of shape (num_samples - seq_length,), the value that
            immediately follows each window.
    """
    # Sine wave plus Gaussian noise (sigma = 0.1).
    signal = np.sin(np.linspace(0, 100, num_samples)) + np.random.normal(0, 0.1, num_samples)
    num_windows = len(signal) - seq_length
    windows = [signal[start:start + seq_length] for start in range(num_windows)]
    targets = [signal[start + seq_length] for start in range(num_windows)]
    return np.array(windows), np.array(targets)
# Prepare data
X, y = generate_time_series(num_samples=1000, seq_length=50)
X = X.reshape(X.shape[0], X.shape[1], 1)  # LSTM expects (samples, timesteps, features)

# Chronological 80/20 split — no shuffling for time series.
split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# Two stacked LSTM layers with dropout, then a small dense regression head.
model = keras.Sequential()
model.add(layers.LSTM(64, activation='relu', input_shape=(50, 1), return_sequences=True))
model.add(layers.Dropout(0.2))
model.add(layers.LSTM(32, activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1))  # linear output for regression
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train, holding out 20% of the training windows for validation.
history = model.fit(
    X_train, y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

# Evaluate on the held-out tail of the series.
test_loss, test_mae = model.evaluate(X_test, y_test)
print(f"Test MAE: {test_mae:.4f}")

# Predict the next value for the first ten test windows.
predictions = model.predict(X_test[:10])
Text Generation with RNN
# Simple character-level text generation
text = "The quick brown fox jumps over the lazy dog"

# Vocabulary: one entry per distinct character, in sorted order.
chars = sorted(set(text))
char_to_idx = {c: i for i, c in enumerate(chars)}
idx_to_char = {i: c for i, c in enumerate(chars)}

# Slide a seq_length-character window over the text; the target for each
# window is the character that immediately follows it.
seq_length = 10
encoded_windows = []
next_char_ids = []
for start in range(len(text) - seq_length):
    window = text[start:start + seq_length]
    encoded_windows.append([char_to_idx[c] for c in window])
    next_char_ids.append(char_to_idx[text[start + seq_length]])
X_text = np.array(encoded_windows)
y_text = np.array(next_char_ids)
# Build model: embedding -> two stacked LSTMs -> softmax over the vocabulary.
text_model = keras.Sequential()
text_model.add(layers.Embedding(len(chars), 32, input_length=seq_length))
text_model.add(layers.LSTM(128, return_sequences=True))
text_model.add(layers.Dropout(0.2))
text_model.add(layers.LSTM(64))
text_model.add(layers.Dropout(0.2))
text_model.add(layers.Dense(len(chars), activation='softmax'))
# sparse_categorical_crossentropy: targets are integer character ids, not one-hot.
text_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# text_model.fit(X_text, y_text, epochs=50, batch_size=16)
Building RNNs with PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
class LSTMModel(nn.Module):
    """Stacked LSTM with a linear head; predicts from the final hidden state."""

    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # batch_first=True -> inputs are (batch, seq_len, input_size).
        self.lstm = nn.LSTM(
            input_size, hidden_size, num_layers,
            batch_first=True, dropout=dropout
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a batch of sequences to one output vector per sequence."""
        _, (final_hidden, _) = self.lstm(x)
        # final_hidden[-1]: top layer's last hidden state, shape (batch, hidden_size).
        return self.fc(final_hidden[-1])
# Create the model on GPU when available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMModel(input_size=10, hidden_size=64, num_layers=2, output_size=1).to(device)

# Training setup: mean-squared-error regression with the Adam optimizer.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Dummy data
X_train = torch.randn(100, 50, 10).to(device)  # batch_size=100, seq_len=50, input_size=10
y_train = torch.randn(100, 1).to(device)

for epoch in range(1, 11):
    optimizer.zero_grad()               # clear gradients from the previous step
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()                     # backprop through the whole sequence
    optimizer.step()
    if epoch % 2 == 0:                  # report every second epoch
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")
GRU: Gated Recurrent Unit
GRU is a simpler alternative to LSTM with fewer parameters:
# GRU model with Keras.
# Bug fix: `timesteps` and `features` were never defined anywhere in the file,
# so this snippet raised NameError; give them concrete example values.
timesteps, features = 50, 10  # example input shape; adjust to your data
gru_model = keras.Sequential([
    layers.GRU(128, activation='relu', input_shape=(timesteps, features),
               return_sequences=True),  # keep the sequence for the stacked GRU
    layers.Dropout(0.2),
    layers.GRU(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')  # binary classification head
])
gru_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# GRU with PyTorch
class GRUModel(nn.Module):
    """Stacked GRU with a linear head; mirrors LSTMModel but uses GRU cells."""

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        # batch_first=True -> inputs are (batch, seq_len, input_size).
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return one output vector per input sequence."""
        _, final_hidden = self.gru(x)
        # final_hidden[-1]: top layer's last hidden state, shape (batch, hidden_size).
        return self.fc(final_hidden[-1])
Bidirectional RNNs
Process sequences in both directions:
# Bidirectional LSTM: each layer reads the sequence forwards and backwards
# and concatenates both directions' outputs.
# Bug fix: `timesteps` and `features` were never defined anywhere in the file,
# so this snippet raised NameError; give them concrete example values.
timesteps, features = 50, 10  # example input shape; adjust to your data
bidirectional_model = keras.Sequential([
    layers.Bidirectional(
        layers.LSTM(64, activation='relu', return_sequences=True),
        input_shape=(timesteps, features)
    ),
    layers.Dropout(0.2),
    layers.Bidirectional(layers.LSTM(32, activation='relu')),
    layers.Dropout(0.2),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid')  # binary classification head
])
bidirectional_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
Best Practices
- Gradient clipping: Prevent exploding gradients in RNNs
- Use LSTM/GRU: Better than vanilla RNN for long sequences
- Bidirectional processing: When full sequence is available
- Stateful RNNs: For very long sequences, process in chunks
- Sequence padding: Handle variable-length sequences
- Dropout: Prevent overfitting in deep RNNs
Common Pitfalls
Bad Practice:
# Don't: Use vanilla RNN for long sequences
# (The gradient shrinks multiplicatively at every step, so over 1000 steps
# almost nothing reaches the early time steps.)
model = keras.Sequential([
    layers.SimpleRNN(64, input_shape=(1000, 10)),  # Vanishing gradients!
    layers.Dense(1)
])
# Don't: Forget to reshape for time dimension
# (Recurrent layers need 3-D input: (samples, timesteps, features).)
X = X.reshape(-1, 10)  # Missing time dimension
# Don't: No gradient clipping
model.fit(X, y, epochs=100)  # May explode
Good Practice:
# Do: Use LSTM for long sequences
# (Gating lets gradients flow through the cell state over many steps.)
model = keras.Sequential([
    layers.LSTM(64, input_shape=(1000, 10)),
    layers.Dense(1)
])
# Do: Include time dimension
# (Keras recurrent layers expect (samples, timesteps, features).)
X = X.reshape(-1, timesteps, features)
# Do: Use gradient clipping
# (clipvalue=1.0 clips each gradient element to the range [-1, 1].)
optimizer = keras.optimizers.Adam(clipvalue=1.0)
model.compile(optimizer=optimizer, loss='mse')
Conclusion
RNNs and LSTMs enable processing of sequential data with temporal dependencies. LSTMs solve the vanishing gradient problem through gating mechanisms, making them suitable for long sequences. Understand the architecture, use appropriate variants (LSTM/GRU), and monitor training carefully to build effective sequence models.
Comments