Recurrent Networks and Sequence Models
RNNs process sequential data by maintaining hidden state across time steps. Unlike CNNs which exploit spatial structure, RNNs handle dependencies in sequences: text, time series, audio, and video.
Recurrent Neural Networks: The Basic Idea
An RNN processes sequences step-by-step, maintaining a hidden state that captures information from previous steps:
import torch
import torch.nn as nn
# Simple RNN from scratch (conceptual)
class SimpleRNN:
    """Minimal vanilla RNN: h_t = tanh(W_x x_t + W_h h_{t-1} + b).

    Educational only — weights are plain tensors, not nn.Parameters,
    so this class is not trainable via autograd/optimizers.
    """

    def __init__(self, input_size, hidden_size):
        self.hidden_size = hidden_size
        # Small random init keeps tanh near its linear regime at the start.
        self.W_h = torch.randn(hidden_size, hidden_size) * 0.01
        self.W_x = torch.randn(hidden_size, input_size) * 0.01
        self.b = torch.zeros(hidden_size)

    def forward(self, X):
        """
        X: [sequence_length, batch_size, input_size]
        Returns: outputs [sequence_length, batch_size, hidden_size]
        """
        sequence_length, batch_size, _ = X.shape
        # Fix: allocate h on X's device/dtype instead of the CPU default,
        # so the recurrence does not break for non-default inputs.
        h = torch.zeros(batch_size, self.hidden_size,
                        dtype=X.dtype, device=X.device)
        outputs = []
        for t in range(sequence_length):
            # h_t = tanh(W_x * x_t + W_h * h_{t-1} + b)
            h = torch.tanh(
                torch.matmul(X[t], self.W_x.T) +
                torch.matmul(h, self.W_h.T) + self.b
            )
            outputs.append(h)
        return torch.stack(outputs, dim=0)
# In practice, use PyTorch's built-in RNN module
rnn = nn.RNN(10, 20, batch_first=False)
# Time-major layout: [seq_len=5, batch=32, features=10]
X = torch.randn(5, 32, 10)
output, h_n = rnn(X)
print(f"Output shape: {output.shape}")  # [5, 32, 20]
print(f"Final hidden state shape: {h_n.shape}")  # [1, 32, 20]
LSTM: Solving Vanishing Gradients
LSTMs introduce a cell state and gates to better capture long-term dependencies:
# LSTM cell operations
class LSTMCell(nn.Module):
    """One LSTM time step: gates computed from [x_t ; h_{t-1}]."""

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        # A single fused projection yields all four gate pre-activations.
        self.W = nn.Linear(input_size + hidden_size, 4 * hidden_size)

    def forward(self, x_t, state):
        h_prev, c_prev = state
        # Joint projection of the input and previous hidden state.
        pre_act = self.W(torch.cat([x_t, h_prev], dim=1))
        i_gate, f_gate, g_cand, o_gate = pre_act.chunk(4, dim=1)
        i_gate = torch.sigmoid(i_gate)  # input gate: admit new information
        f_gate = torch.sigmoid(f_gate)  # forget gate: drop old cell content
        g_cand = torch.tanh(g_cand)     # candidate values to write
        o_gate = torch.sigmoid(o_gate)  # output gate: expose cell state
        # New cell state blends retained memory with gated candidates.
        c_t = f_gate * c_prev + i_gate * g_cand
        h_t = o_gate * torch.tanh(c_t)
        return h_t, (h_t, c_t)
# In practice, use PyTorch's built-in LSTM module
lstm = nn.LSTM(10, 20, num_layers=2, batch_first=True)
# Batch-major layout: [batch=32, seq_len=5, features=10]
X = torch.randn(32, 5, 10)
output, (h_n, c_n) = lstm(X)
print(f"Output shape: {output.shape}")  # [32, 5, 20]
print(f"Hidden state shape: {h_n.shape}")  # [2, 32, 20] (2 layers)
print(f"Cell state shape: {c_n.shape}")  # [2, 32, 20]
LSTM key components:
- Input gate (i): Controls information flow into cell
- Forget gate (f): Controls what to forget from previous state
- Cell candidate (g): New information to consider
- Output gate (o): Controls what to output
This gate mechanism allows gradients to flow better through time.
GRU: Simplified LSTM
GRUs simplify LSTMs while maintaining performance:
# GRU has 3 gates instead of LSTM's 4, and no separate cell state
gru = nn.GRU(10, 20, num_layers=2, batch_first=True)
X = torch.randn(32, 5, 10)
output, h_n = gru(X)
print(f"Output shape: {output.shape}")  # [32, 5, 20]
print(f"Hidden state shape: {h_n.shape}")  # [2, 32, 20]
GRU components:
- Update gate: Blends previous and candidate hidden states
- Reset gate: How much of previous state to consider for candidate
GRU is faster and uses fewer parameters than LSTM, often with comparable performance.
Sequence Classification: Sentiment Analysis
Use RNN final hidden state for sequence classification:
class SentimentClassifier(nn.Module):
    """Bidirectional 2-layer LSTM classifier over token-index sequences."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=2,
            bidirectional=True,
            dropout=0.3,
            batch_first=True,
        )
        # Forward and backward final states are concatenated, hence *2.
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(0.3)

    def forward(self, text):
        # text: [batch_size, seq_len] token indices
        embedded = self.embedding(text)  # [batch_size, seq_len, embed_dim]
        # h_final holds one state per (layer, direction): [4, batch, hidden]
        _, (h_final, _) = self.lstm(embedded)
        # The last two entries are the top layer's forward/backward states.
        summary = torch.cat((h_final[-2, :, :], h_final[-1, :, :]), dim=1)
        summary = self.dropout(summary)
        return self.fc(summary)  # [batch_size, output_dim]
# Example usage
model = SentimentClassifier(
    vocab_size=10000,
    embedding_dim=100,
    hidden_dim=128,
    output_dim=1,
)
# Batch of 32 token-index sequences, each of length 50
X = torch.randint(0, 10000, (32, 50))
output = model(X)
print(f"Output shape: {output.shape}")  # [32, 1]
Sequence-to-Sequence: Encoder-Decoder
Translation and summarization use encoder-decoder architectures:
class EncoderDecoder(nn.Module):
    """Seq2seq encoder-decoder over token-index sequences.

    encoder_input_dim / decoder_input_dim are vocabulary sizes; inputs
    are integer index tensors of shape [batch, seq_len].
    """

    def __init__(self, encoder_input_dim, decoder_input_dim, hidden_dim,
                 output_dim, embedding_dim=None):
        super().__init__()
        # Fix: the demo below passes vocabulary sizes and integer token
        # tensors, but nn.LSTM expects 3-D float features. Embed tokens
        # first. embedding_dim defaults to hidden_dim for compatibility.
        if embedding_dim is None:
            embedding_dim = hidden_dim
        self.src_embedding = nn.Embedding(encoder_input_dim, embedding_dim)
        self.tgt_embedding = nn.Embedding(decoder_input_dim, embedding_dim)
        # Encoder: process input sequence
        self.encoder = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        # Decoder: generate output sequence
        self.decoder = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        # Output projection to target vocabulary logits
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, encoder_input, decoder_input):
        # encoder_input: [batch, src_len] indices; decoder_input: [batch, tgt_len]
        # Encoder: its final (h, c) summarizes the source sequence.
        _, (h_enc, c_enc) = self.encoder(self.src_embedding(encoder_input))
        # Decoder: conditioned on the encoder's final state (teacher forcing).
        decoder_output, _ = self.decoder(self.tgt_embedding(decoder_input),
                                         (h_enc, c_enc))
        # Project to output vocabulary
        output = self.fc(decoder_output)  # [batch, seq_len, output_dim]
        return output
# Example: machine translation
# Source language: English, target: French
model = EncoderDecoder(
    encoder_input_dim=5000,  # English vocab
    decoder_input_dim=5000,  # French vocab
    hidden_dim=256,
    output_dim=5000,
)
encoder_input = torch.randint(0, 5000, (32, 10))  # Batch, max_len
decoder_input = torch.randint(0, 5000, (32, 12))  # Batch, target_len
output = model(encoder_input, decoder_input)
print(f"Output shape: {output.shape}")  # [32, 12, 5000]
Attention Mechanism: Focusing on Relevant Information
Attention lets the model focus on important parts of the sequence:
class SimpleAttention(nn.Module):
    """Attention scored by a single linear layer over [enc ; dec] pairs."""

    def __init__(self, hidden_dim):
        super().__init__()
        # Scores each encoder position from the concatenated pair.
        self.attention = nn.Linear(hidden_dim * 2, 1)

    def forward(self, decoder_hidden, encoder_outputs):
        """
        decoder_hidden: [batch, hidden_dim]
        encoder_outputs: [batch, seq_len, hidden_dim]
        """
        seq_len = encoder_outputs.size(1)
        # Repeat the decoder state so it pairs with every encoder position.
        query = decoder_hidden.unsqueeze(1).expand(-1, seq_len, -1)
        # [batch, seq_len, hidden_dim*2] -> per-position scores
        scores = self.attention(torch.cat([encoder_outputs, query], dim=2))
        # Normalize over time: weights sum to 1 for each example.
        attention_weights = torch.softmax(scores, dim=1)  # [batch, seq_len, 1]
        # Context vector = attention-weighted sum of encoder outputs.
        context = (encoder_outputs * attention_weights).sum(dim=1)
        return context, attention_weights
Time Series Prediction
RNNs excel at predicting next values in sequences:
class TimeSeriesRNN(nn.Module):
    """Stacked LSTM that emits a prediction at every time step."""

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=0.2,
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # x: [batch_size, seq_len, input_size]
        features, _ = self.lstm(x)  # [batch_size, seq_len, hidden_size]
        # Project every time step, not just the final one.
        return self.fc(features)  # [batch_size, seq_len, output_size]
# Training on time series
model = TimeSeriesRNN(input_size=1, hidden_size=64, num_layers=2, output_size=1)
# Windowed data: 100 samples, 10-step windows, 1 feature per step
X = torch.randn(100, 10, 1)
# One target per time step (same shape as the model's predictions)
y = torch.randn(100, 10, 1)
output = model(X)
criterion = nn.MSELoss()
loss = criterion(output, y)
Key Takeaway
RNNs and LSTMs handle sequences naturally by maintaining state across time steps. LSTM’s gates solve the gradient flow problem of vanilla RNNs, making them the go-to choice for sequences of significant length. For most tasks, GRU provides comparable performance with less complexity.
Practical Exercise
Implement sentiment analysis on movie reviews:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
# Your task:
# 1. Load IMDB movie review dataset
# 2. Build vocabulary and tokenizer
# 3. Create SentimentClassifier (LSTM-based)
# 4. Train for 5 epochs
# 5. Evaluate on test set
# 6. Predict sentiment on new review
# Expected components:
# - Text preprocessing pipeline
# - Embedding layer
# - LSTM encoder
# - Output classification
# - Training loop with validation
# - Test set accuracy > 80%
# Bonus:
# - Use bidirectional LSTM
# - Add attention mechanism
# - Implement advanced padding/truncation
# - Compare GRU vs LSTM performance
This exercise teaches practical NLP with RNNs—a foundation for language understanding.