Foundations

PyTorch Fundamentals

Lesson 2 of 4 · Estimated time: 55 min

PyTorch Fundamentals

PyTorch has become the dominant deep learning framework in research and production. Its dynamic computation graphs and intuitive design make it perfect for building and experimenting with neural networks. This lesson covers the essential PyTorch concepts you’ll use daily.

Tensors: PyTorch’s Core Data Structure

Tensors are multi-dimensional arrays optimized for GPU computation. Think of them as NumPy arrays on steroids:

import torch
import numpy as np

# --- Constructing tensors ---
t1 = torch.tensor([1, 2, 3])         # 1-D tensor built from a Python list
t2 = torch.tensor([[1, 2], [3, 4]])  # 2-D tensor (a 2x2 matrix)

t3 = torch.zeros(3, 4)   # 3x4 tensor filled with zeros
t4 = torch.ones(2, 3)    # 2x3 tensor filled with ones
t5 = torch.randn(5, 5)   # 5x5 tensor sampled from the standard normal

# --- Inspecting tensor metadata ---
print(f"Shape: {t2.shape}")
print(f"Data type: {t2.dtype}")
print(f"Device: {t2.device}")  # CPU or GPU

# --- Bridging to/from NumPy ---
np_array = np.array([1, 2, 3])
torch_tensor = torch.from_numpy(np_array)  # shares memory with np_array
back_to_numpy = torch_tensor.numpy()

# --- Device placement: prefer the GPU when one is present ---
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
t_gpu = t2.to(device)

Automatic Differentiation: Autograd

PyTorch’s magic: automatic computation of the gradients of a scalar output with respect to every tensor that produced it:

import torch

# requires_grad=True tells autograd to record every operation on x.
x = torch.tensor([2.0, 3.0], requires_grad=True)

# Elementwise function: y = x^2 + 2x
y = x * x + 2 * x

# Reduce to a scalar; backward() on a scalar needs no gradient argument.
loss = y.sum()
loss.backward()  # walks the recorded graph and fills x.grad

print(f"Gradient of x: {x.grad}")
# Expected: [6, 8] (derivative of x^2 + 2*x is 2*x + 2)

# Hand check of d/dx(x^2 + 2x) = 2x + 2:
#   x = 2  ->  2*2 + 2 = 6
#   x = 3  ->  2*3 + 2 = 8

Autograd works with complex computation graphs:

# A deeper computation graph: autograd composes gradients through
# every intermediate operation automatically (chain rule).
x = torch.tensor([[1.0, 2.0], [3.0, 4.0]], requires_grad=True)

y = x * 2                  # elementwise scaling
z = y.sum() + y.mean()     # two reductions combined into one scalar
loss = z * z               # square the scalar

# One backward() call propagates through the whole graph.
loss.backward()

print(f"Gradient of x:\n{x.grad}")

# Gradients ACCUMULATE across backward() calls by default, so reset
# them before the next pass.
x.grad.zero_()

Building Models: nn.Module

PyTorch models inherit from nn.Module:

import torch
import torch.nn as nn

class SimpleNet(nn.Module):
    """Fully connected classifier: 784 -> 128 -> 64 -> 10.

    Accepts any input that flattens to 784 features per sample
    (e.g. 28x28 images) and returns raw class logits.
    """

    def __init__(self):
        super().__init__()
        # Declaring layers as attributes registers their parameters
        # with the module automatically.
        self.fc1 = nn.Linear(784, 128)  # 784 inputs, 128 outputs
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)    # 10 output classes

    def forward(self, x):
        """Flatten the batch, then apply fc1 -> ReLU -> fc2 -> ReLU -> fc3."""
        flat = x.view(x.size(0), -1)          # keep batch dim, merge the rest
        hidden = self.relu(self.fc1(flat))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)              # logits, no softmax here

# Instantiate the network; printing a module lists its layers.
model = SimpleNet()
print(model)

# Total number of trainable values across all layers.
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params}")

# Calling the model invokes forward() under the hood.
x = torch.randn(32, 784)  # a batch of 32 flattened images
output = model(x)
print(f"Output shape: {output.shape}")  # [32, 10]

Loss Functions and Optimization

PyTorch provides standard loss functions and optimizers:

import torch.nn as nn
import torch.optim as optim

model = SimpleNet()

# CrossEntropyLoss expects raw logits: it applies LogSoftmax + NLLLoss itself.
criterion = nn.CrossEntropyLoss()

# Adam with the common default learning rate.
# Alternatives include SGD, AdamW, and RMSprop.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# One synthetic mini-batch: 32 flattened images plus integer class labels.
X_batch = torch.randn(32, 784)
y_batch = torch.randint(0, 10, (32,))

# Forward: predictions, then the scalar loss.
outputs = model(X_batch)
loss = criterion(outputs, y_batch)

# Backward: drop stale gradients, differentiate, apply the update.
optimizer.zero_grad()
loss.backward()
optimizer.step()

print(f"Loss: {loss.item():.4f}")

Complete Training Loop

A full training loop combines everything:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Synthetic stand-in for a real dataset: 784-dim "images", 10 classes.
X_train = torch.randn(1000, 784)
y_train = torch.randint(0, 10, (1000,))

X_test = torch.randn(200, 784)
y_test = torch.randint(0, 10, (200,))

# DataLoaders handle batching; shuffle only the training split.
train_loader = DataLoader(TensorDataset(X_train, y_train),
                          batch_size=32, shuffle=True)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=32)

# Model, loss, optimizer, and device placement.
model = SimpleNet()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

num_epochs = 10
for epoch in range(num_epochs):
    # --- Training phase ---
    model.train()
    running_train_loss = 0.0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)

        optimizer.zero_grad()  # clear gradients left over from the last step
        loss.backward()        # backpropagate
        optimizer.step()       # apply the parameter update

        running_train_loss += loss.item()

    avg_train_loss = running_train_loss / len(train_loader)

    # --- Evaluation phase ---
    model.eval()  # switches dropout/batchnorm into inference behavior
    running_test_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():  # no graph construction: faster, less memory
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            outputs = model(X_batch)
            running_test_loss += criterion(outputs, y_batch).item()

            # Predicted class = index of the largest logit per sample.
            predicted = outputs.argmax(dim=1)
            correct += (predicted == y_batch).sum().item()
            total += y_batch.size(0)

    avg_test_loss = running_test_loss / len(test_loader)
    accuracy = correct / total

    print(f"Epoch [{epoch+1}/{num_epochs}] "
          f"Train Loss: {avg_train_loss:.4f} | "
          f"Test Loss: {avg_test_loss:.4f} | "
          f"Test Acc: {accuracy:.4f}")

Common Layer Types

PyTorch provides pre-built layers for common operations:

import torch.nn as nn

# Fully connected (affine) layer: y = xW^T + b
linear = nn.Linear(in_features=10, out_features=5)

# Elementwise nonlinearities
relu = nn.ReLU()
sigmoid = nn.Sigmoid()
tanh = nn.Tanh()
leaky_relu = nn.LeakyReLU(negative_slope=0.01)

# Regularization layers
dropout = nn.Dropout(p=0.5)                    # zeroes activations at random during training
batch_norm = nn.BatchNorm1d(num_features=128)  # normalizes over the batch dimension

# 2-D convolution for image-like inputs (channels x height x width)
conv2d = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3)

# Spatial downsampling
max_pool = nn.MaxPool2d(kernel_size=2)
avg_pool = nn.AvgPool2d(kernel_size=2)

# Recurrent layers for sequential data
lstm = nn.LSTM(input_size=100, hidden_size=50, num_layers=2)
gru = nn.GRU(input_size=100, hidden_size=50)

Model Saving and Loading

Persist trained models:

# Save only the learned parameters (recommended): small, portable, and
# independent of the class's module path.
torch.save(model.state_dict(), 'model.pth')

# Load: rebuild the architecture, then restore the parameters into it.
# weights_only=True restricts unpickling to tensors/primitives, guarding
# against arbitrary code execution from untrusted checkpoint files
# (and it is the default behavior since PyTorch 2.6).
model = SimpleNet()
model.load_state_dict(torch.load('model.pth', weights_only=True))

# Save the entire model object (architecture + weights). This pickles the
# class itself, so loading needs the identical source layout AND full
# unpickling — weights_only=False must be passed explicitly on PyTorch 2.6+.
# Only do this with files you trust.
torch.save(model, 'model_full.pth')
loaded_model = torch.load('model_full.pth', weights_only=False)

Debugging and Profiling

Tools to understand what’s happening:

# Check gradient flow
def check_gradients(model):
    """Print the mean absolute gradient of every parameter in *model*.

    Parameters whose .grad is still None (no backward pass has reached
    them yet) are reported as having no gradient.
    """
    for name, param in model.named_parameters():
        grad = param.grad
        if grad is None:
            print(f"{name}: No gradient")
        else:
            print(f"{name}: {grad.abs().mean():.6f}")

# Forward hook: inspect intermediate activations
def forward_hook(module, input, output):
    """Forward hook: log which module just ran and the shape it produced."""
    cls_name = type(module).__name__
    print(f"Module: {cls_name}")
    print(f"Output shape: {output.shape}")

# Attach the hook to fc1; it fires on every forward pass through that layer.
model.fc1.register_forward_hook(forward_hook)

# Crude wall-clock timing of one forward pass.
# NOTE(review): on GPU this underestimates the true cost — CUDA kernels run
# asynchronously, so call torch.cuda.synchronize() before reading the clock.
import time
t0 = time.time()
output = model(torch.randn(32, 784))
t1 = time.time()
print(f"Forward pass time: {(t1-t0)*1000:.2f}ms")

Key Takeaway

PyTorch combines mathematical expressiveness with practical convenience. Master the fundamentals—tensors, autograd, nn.Module, and training loops—and you have the foundation for any deep learning project.

Practical Exercise

Build a neural network classifier for MNIST:

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Your task:
# 1. Download MNIST dataset
# 2. Build a 3-layer neural network
# 3. Train for 5 epochs with Adam optimizer
# 4. Evaluate on test set
# 5. Save the trained model
# 6. Load and verify predictions

# Expected structure:
# - Data loading with transforms
# - Custom nn.Module model
# - Full training loop with validation
# - Test set evaluation
# - Model persistence

# Hints:
# - MNIST: 28x28 grayscale images, 10 classes
# - Flatten to 784-dim vectors
# - Use CrossEntropyLoss for classification
# - Track both train and test metrics

This exercise solidifies PyTorch fundamentals before moving to specialized architectures.