PyTorch Fundamentals
PyTorch has become the dominant deep learning framework in research and production. Its dynamic computation graphs and intuitive design make it perfect for building and experimenting with neural networks. This lesson covers the essential PyTorch concepts you’ll use daily.
Tensors: PyTorch’s Core Data Structure
Tensors are multi-dimensional arrays optimized for GPU computation. Think of them as NumPy arrays on steroids:
import torch
import numpy as np

# --- Creating tensors ---------------------------------------------------
t1 = torch.tensor([1, 2, 3])         # from a Python list
t2 = torch.tensor([[1, 2], [3, 4]])  # 2-D tensor (matrix)
t3 = torch.zeros(3, 4)               # filled with zeros
t4 = torch.ones(2, 3)                # filled with ones
t5 = torch.randn(5, 5)               # samples from a standard normal

# --- Inspecting a tensor ------------------------------------------------
print(f"Shape: {t2.shape}")
print(f"Data type: {t2.dtype}")
print(f"Device: {t2.device}")  # CPU or GPU

# --- NumPy interop ------------------------------------------------------
np_array = np.array([1, 2, 3])
torch_tensor = torch.from_numpy(np_array)  # NOTE: shares memory with np_array
back_to_numpy = torch_tensor.numpy()

# --- Device placement ---------------------------------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
t_gpu = t2.to(device)
Automatic Differentiation: Autograd
PyTorch’s magic: it automatically computes the gradients of a scalar output with respect to every tensor that produced it:
import torch

# Track operations on x so gradients can be computed later.
x = torch.tensor([2.0, 3.0], requires_grad=True)

# y = x^2 + 2x, computed elementwise
y = x ** 2 + 2 * x

# backward() needs a scalar, so reduce with sum() first.
loss = y.sum()
loss.backward()  # backpropagation fills in x.grad

print(f"Gradient of x: {x.grad}")
# Analytic check: d/dx (x^2 + 2x) = 2x + 2
#   at x = 2 -> 2*2 + 2 = 6
#   at x = 3 -> 2*3 + 2 = 8
Autograd works with complex computation graphs:
# A computation graph built from several chained operations
x = torch.tensor([[1.0, 2.0], [3.0, 4.0]], requires_grad=True)

y = x * 2
z = y.sum() + y.mean()
loss = z ** 2

# A single backward() call differentiates through every step above.
loss.backward()
print(f"Gradient of x:\n{x.grad}")

# Gradients ACCUMULATE across backward() calls by default,
# so clear them before running the next backward pass.
x.grad.zero_()
Building Models: nn.Module
PyTorch models inherit from nn.Module:
import torch
import torch.nn as nn


class SimpleNet(nn.Module):
    """Three-layer fully connected classifier: 784 -> 128 -> 64 -> 10."""

    def __init__(self):
        super().__init__()
        # Attribute names (fc1/fc2/fc3) double as state_dict keys.
        self.fc1 = nn.Linear(784, 128)  # 784 inputs, 128 outputs
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)    # 10 output classes

    def forward(self, x):
        """Flatten each sample, then run it through the layer stack."""
        x = x.view(x.size(0), -1)  # (batch, ...) -> (batch, 784)
        for hidden in (self.fc1, self.fc2):
            x = self.relu(hidden(x))
        return self.fc3(x)  # raw logits, no softmax


# Instantiate and inspect the model
model = SimpleNet()
print(model)

# Parameter count: total elements across every weight/bias tensor
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params}")

# Forward pass on a fake batch of 32 flattened images
x = torch.randn(32, 784)
output = model(x)
print(f"Output shape: {output.shape}")  # [32, 10]
Loss Functions and Optimization
PyTorch provides standard loss functions and optimizers:
import torch.nn as nn
import torch.optim as optim

model = SimpleNet()

# CrossEntropyLoss fuses LogSoftmax + NLLLoss in one op,
# so the model should output raw logits.
criterion = nn.CrossEntropyLoss()

# Adam is a solid default; SGD, AdamW, RMSprop are also available.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# One synthetic mini-batch: 32 samples with integer labels in [0, 10)
X_batch = torch.randn(32, 784)
y_batch = torch.randint(0, 10, (32,))

# Forward: predictions and loss
outputs = model(X_batch)
loss = criterion(outputs, y_batch)

# Backward: clear stale grads, differentiate, apply the update
optimizer.zero_grad()
loss.backward()
optimizer.step()

print(f"Loss: {loss.item():.4f}")
Complete Training Loop
A full training loop combines everything:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Synthetic stand-in data (random labels, so accuracy stays near chance)
X_train = torch.randn(1000, 784)
y_train = torch.randint(0, 10, (1000,))
X_test = torch.randn(200, 784)
y_test = torch.randint(0, 10, (200,))

# Wrap tensors in DataLoaders to get (shuffled) mini-batches
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=32)

# Model, loss, optimizer, device
model = SimpleNet()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

num_epochs = 10
for epoch in range(num_epochs):
    # ---- training phase ----
    model.train()
    train_loss = 0.0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    avg_train_loss = train_loss / len(train_loader)

    # ---- validation phase ----
    model.eval()  # dropout off, batchnorm uses running stats
    test_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():  # no autograd graph needed for evaluation
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            test_loss += criterion(outputs, y_batch).item()
            predicted = outputs.argmax(dim=1)
            correct += (predicted == y_batch).sum().item()
            total += y_batch.size(0)
    avg_test_loss = test_loss / len(test_loader)
    accuracy = correct / total

    print(f"Epoch [{epoch+1}/{num_epochs}] "
          f"Train Loss: {avg_train_loss:.4f} | "
          f"Test Loss: {avg_test_loss:.4f} | "
          f"Test Acc: {accuracy:.4f}")
Common Layer Types
PyTorch provides pre-built layers for common operations:
import torch.nn as nn

# Fully connected (dense) layer
linear = nn.Linear(in_features=10, out_features=5)

# Common activation functions
relu = nn.ReLU()
sigmoid = nn.Sigmoid()
tanh = nn.Tanh()
leaky_relu = nn.LeakyReLU(negative_slope=0.01)

# Regularization layers
dropout = nn.Dropout(p=0.5)                    # randomly zeroes activations in train()
batch_norm = nn.BatchNorm1d(num_features=128)  # normalizes over the batch dim

# Convolution and pooling, for image data
conv2d = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3)
max_pool = nn.MaxPool2d(kernel_size=2)
avg_pool = nn.AvgPool2d(kernel_size=2)

# Recurrent layers, for sequence data
lstm = nn.LSTM(input_size=100, hidden_size=50, num_layers=2)
gru = nn.GRU(input_size=100, hidden_size=50)
Model Saving and Loading
Persist trained models:
# Preferred: save only the parameters (state_dict), not the whole object.
torch.save(model.state_dict(), 'model.pth')

# Reload: rebuild the architecture in code, then load the weights.
# weights_only=True restricts unpickling to tensors/primitives — the safe
# way to load checkpoints (and the default from PyTorch 2.6 onward).
model = SimpleNet()
model.load_state_dict(torch.load('model.pth', weights_only=True))

# Alternative: pickle the entire module (architecture + weights).
# Fragile across code refactors and requires full unpickling, so loading
# needs weights_only=False — only do this with files you trust.
torch.save(model, 'model_full.pth')
loaded_model = torch.load('model_full.pth', weights_only=False)
Debugging and Profiling
Tools to understand what’s happening:
# Print the mean absolute gradient per parameter — a quick way to spot
# vanishing (~0) or missing (None) gradients after a backward() call.
def check_gradients(model):
    for name, param in model.named_parameters():
        if param.grad is not None:
            print(f"{name}: {param.grad.abs().mean():.6f}")
        else:
            print(f"{name}: No gradient")

# Forward hook: runs after the module's forward pass; lets you inspect
# intermediate activations without editing the model code.
def forward_hook(module, input, output):
    print(f"Module: {module.__class__.__name__}")
    print(f"Output shape: {output.shape}")

model.fc1.register_forward_hook(forward_hook)

# Timing: perf_counter() is monotonic and high-resolution; time.time()
# is wall-clock and can jump, so it is unsuitable for benchmarking.
import time
start = time.perf_counter()
output = model(torch.randn(32, 784))
end = time.perf_counter()
print(f"Forward pass time: {(end-start)*1000:.2f}ms")
Key Takeaway
PyTorch combines mathematical expressiveness with practical convenience. Master the fundamentals—tensors, autograd, nn.Module, and training loops—and you have the foundation for any deep learning project.
Practical Exercise
Build a neural network classifier for MNIST:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
# Your task:
# 1. Download the MNIST dataset (datasets.MNIST with transforms.ToTensor())
# 2. Build a 3-layer neural network
# 3. Train for 5 epochs with the Adam optimizer
# 4. Evaluate accuracy on the test set
# 5. Save the trained model (state_dict)
# 6. Load it back and verify predictions match
# Expected structure:
# - Data loading with transforms and DataLoaders
# - Custom nn.Module model
# - Full training loop with validation
# - Test set evaluation
# - Model persistence
# Hints:
# - MNIST: 28x28 grayscale images, 10 classes
# - Flatten each image to a 784-dim vector before the first Linear layer
# - Use CrossEntropyLoss for classification (model outputs raw logits)
# - Track both train and test metrics every epoch
This exercise solidifies PyTorch fundamentals before moving to specialized architectures.