Foundations

Neural Networks from Scratch

Lesson 1 of 4 — Estimated Time: 55 min

Neural Networks from Scratch

Before using PyTorch or TensorFlow, understanding how neural networks work at the mathematical level is essential. You’ll implement a simple neural network from scratch in NumPy, building intuition for backpropagation and gradient descent.

The Perceptron: Foundation of Neural Networks

The perceptron is the simplest neural network unit. It takes inputs, applies weights, adds a bias, and passes through an activation function:

import numpy as np

class Perceptron:
    """A single linear threshold unit trained with the classic perceptron rule."""

    def __init__(self, n_inputs, learning_rate=0.01):
        # Small random weights break symmetry; the bias starts at zero.
        self.weights = np.random.randn(n_inputs) * 0.01
        self.bias = 0
        self.learning_rate = learning_rate

    def forward(self, X):
        """Affine pre-activation: z = X @ w + b."""
        return X @ self.weights + self.bias

    def predict(self, X):
        """Threshold the pre-activation at zero to produce a 0/1 label."""
        return (self.forward(X) > 0).astype(int)

    def train(self, X, y, epochs=100):
        """Sample-by-sample perceptron updates over the data for `epochs` passes.

        A correct prediction gives a zero error, so the parameters only
        move when a sample is misclassified.
        """
        for _ in range(epochs):
            for features, target in zip(X, y):
                err = target - self.predict(features.reshape(1, -1))[0]
                self.weights = self.weights + self.learning_rate * err * features
                self.bias = self.bias + self.learning_rate * err

# Example: Classify linearly separable data
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([0, 1, 1, 1])  # OR operation

perceptron = Perceptron(n_inputs=2)
perceptron.train(X, y, epochs=10)

# predict() is vectorized, so all four samples can be scored in one call.
predictions = perceptron.predict(X)
print(f"Predictions: {predictions}")
print(f"Accuracy: {(predictions == y).mean():.2f}")

Activation Functions: Adding Nonlinearity

Without activation functions, stacking layers would just create another linear transformation. Activation functions introduce nonlinearity:

import matplotlib.pyplot as plt

# Sample the pre-activation axis densely so the curves look smooth.
z = np.linspace(-5, 5, 100)

# Sigmoid: smooth 0-1 squashing
sigmoid = 1 / (1 + np.exp(-z))

# ReLU: popular in deep networks
relu = np.maximum(0, z)

# Tanh: smooth -1 to 1
tanh = np.tanh(z)

# One panel per activation; only the first panel carries the y-label.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, curve, title in zip(axes, (sigmoid, relu, tanh), ('Sigmoid', 'ReLU', 'Tanh')):
    ax.plot(z, curve)
    ax.set_title(title)
    ax.grid()
axes[0].set_ylabel('Output')

plt.tight_layout()
plt.show()

# Derivatives needed for backprop
def sigmoid_derivative(z):
    """d/dz sigmoid(z) = sigmoid(z) * (1 - sigmoid(z)); peaks at 0.25 when z = 0."""
    sig = 1 / (1 + np.exp(-z))
    return sig * (1 - sig)

def relu_derivative(z):
    """Subgradient of ReLU: 1.0 where z > 0, else 0.0 (we pick 0 at z == 0)."""
    return np.greater(z, 0).astype(float)

def tanh_derivative(z):
    """d/dz tanh(z) = 1 - tanh(z)^2; equals 1 at z = 0 and decays toward 0."""
    th = np.tanh(z)
    return 1 - th**2

Forward Propagation: Computing Predictions

A multi-layer neural network stacks multiple transformations:

class NeuralNetwork:
    def __init__(self, layer_sizes):
        """
        layer_sizes: [input_size, hidden1_size, hidden2_size, output_size]
        Example: [784, 128, 64, 10] for MNIST
        """
        self.layer_sizes = layer_sizes
        self.layers = len(layer_sizes) - 1
        self.params = {}

        # Initialize weights and biases
        for i in range(self.layers):
            self.params[f'W{i+1}'] = np.random.randn(
                layer_sizes[i], layer_sizes[i+1]
            ) * 0.01
            self.params[f'b{i+1}'] = np.zeros((1, layer_sizes[i+1]))

    def forward(self, X):
        """Forward pass through all layers"""
        self.cache = {'A0': X}
        A = X

        # Hidden layers with ReLU
        for i in range(self.layers - 1):
            W = self.params[f'W{i+1}']
            b = self.params[f'b{i+1}']

            Z = np.dot(A, W) + b
            A = np.maximum(0, Z)  # ReLU

            self.cache[f'Z{i+1}'] = Z
            self.cache[f'A{i+1}'] = A

        # Output layer with sigmoid
        W = self.params[f'W{self.layers}']
        b = self.params[f'b{self.layers}']

        Z = np.dot(A, W) + b
        A = 1 / (1 + np.exp(-Z))  # Sigmoid

        self.cache[f'Z{self.layers}'] = Z
        self.cache[f'A{self.layers}'] = A

        return A

# Example network: 2 -> 4 -> 1
nn = NeuralNetwork([2, 4, 1])

# The four corners of the unit square, one sample per row.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
output = nn.forward(X)
print(f"Predictions shape: {output.shape}")
print(f"First prediction: {output[0]}")

Backpropagation: Learning from Errors

Backpropagation computes gradients of the loss with respect to weights, enabling gradient descent:

class NeuralNetworkWithBackprop(NeuralNetwork):
    """Extends NeuralNetwork with gradient computation and gradient-descent training."""

    def backward(self, y):
        """
        Compute gradients using chain rule.
        y: true labels, shape (m, output_size)
        """
        m = y.shape[0]
        self.gradients = {}

        # Output-layer error signal. NOTE(review): A - y is the exact dL/dZ
        # for binary cross-entropy with a sigmoid output; train() tracks MSE,
        # so this is a sign-correct surrogate rather than the true MSE gradient.
        delta = self.cache[f'A{self.layers}'] - y

        # Walk the layers from last to first, reusing the cached forward values.
        for layer in range(self.layers, 0, -1):
            self.gradients[f'dW{layer}'] = self.cache[f'A{layer-1}'].T @ delta / m
            self.gradients[f'db{layer}'] = delta.sum(axis=0, keepdims=True) / m

            # Propagate the error signal through the layer's weights and
            # the ReLU mask, except past the input layer.
            if layer > 1:
                upstream = delta @ self.params[f'W{layer}'].T
                delta = upstream * (self.cache[f'Z{layer-1}'] > 0)  # ReLU derivative

    def update_weights(self, learning_rate=0.01):
        """Take one gradient-descent step on every weight matrix and bias row."""
        for layer in range(1, self.layers + 1):
            self.params[f'W{layer}'] -= learning_rate * self.gradients[f'dW{layer}']
            self.params[f'b{layer}'] -= learning_rate * self.gradients[f'db{layer}']

    def train(self, X, y, epochs=100, learning_rate=0.01):
        """Full-batch training loop; returns the per-epoch MSE losses."""
        losses = []
        for epoch in range(epochs):
            # Forward pass populates the cache backward() relies on.
            output = self.forward(X)

            # Mean squared error, tracked for reporting/plotting.
            loss = np.mean((output - y)**2)
            losses.append(loss)

            self.backward(y)
            self.update_weights(learning_rate)

            # Periodic progress report.
            if (epoch + 1) % 20 == 0:
                print(f"Epoch {epoch+1}, Loss: {loss:.4f}")

        return losses

# Train network on XOR (nonlinear problem)
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])  # XOR

nn = NeuralNetworkWithBackprop([2, 4, 1])
losses = nn.train(X, y, epochs=1000, learning_rate=0.1)

# Check predictions
predictions = nn.forward(X)
print("\nFinal predictions:")
for sample, prob in zip(X, predictions):
    print(f"{sample} -> {prob[0]:.4f}")

Gradient Descent: Optimization

Gradient descent minimizes the loss by repeatedly updating the weights in the direction of the negative gradient — the direction of steepest local decrease:

import matplotlib.pyplot as plt

def loss_landscape(w1, w2):
    """Convex quadratic bowl with its unique minimum at (w1, w2) = (2, -1)."""
    return (w1 - 2) ** 2 + 2 * (w2 + 1) ** 2

# Visualize landscape
w1_range = np.linspace(-2, 6, 100)
w2_range = np.linspace(-5, 3, 100)
W1, W2 = np.meshgrid(w1_range, w2_range)
Loss = loss_landscape(W1, W2)

plt.figure(figsize=(12, 5))

# Contour plot
plt.subplot(1, 2, 1)
plt.contour(W1, W2, Loss, levels=20)
plt.xlabel('w1')
plt.ylabel('w2')
plt.title('Loss Landscape')

# Gradient descent starting at the origin; record every iterate.
w = np.array([0.0, 0.0])
learning_rate = 0.01
path = [w.copy()]

for _ in range(100):
    # Analytic gradient of the quadratic landscape above.
    gradient = np.array([2 * (w[0] - 2), 4 * (w[1] + 1)])
    # Step against the gradient.
    w = w - learning_rate * gradient
    path.append(w.copy())

path = np.array(path)

# Overlay the optimization path on the same contours.
plt.subplot(1, 2, 2)
plt.contour(W1, W2, Loss, levels=20, alpha=0.6)
plt.plot(path[:, 0], path[:, 1], 'ro-', markersize=3, linewidth=1)
plt.scatter([2], [-1], c='g', s=200, marker='*', label='Minimum')
plt.xlabel('w1')
plt.ylabel('w2')
plt.title('Gradient Descent Path')
plt.legend()

plt.tight_layout()
plt.show()

Key Challenges: Vanishing and Exploding Gradients

Deep networks face gradient flow problems:

# Vanishing gradients: sigmoid derivative is small (max 0.25)
z = np.array([[-5, -2, 0, 2, 5]])
sig = 1 / (1 + np.exp(-z))
sig_grad = sig * (1 - sig)
print(f"Sigmoid gradients: {sig_grad}")
print(f"Max gradient: {sig_grad.max():.4f}")

# In deep networks, these multiply: 0.25^100 ≈ 0 (vanishing!)
# Solution: Use ReLU instead

# Exploding gradients: large weight updates
# Solution: Gradient clipping, batch normalization

# Example: clip gradients — rescale so the norm never exceeds max_norm.
gradient = np.array([-100, 50, -200, 75])
max_norm = 1.0
norm = np.linalg.norm(gradient)
if norm > max_norm:
    gradient = gradient * (max_norm / norm)
print(f"Clipped gradient: {gradient}")

Key Takeaway

Understanding forward propagation, backpropagation, and gradient descent at the mathematical level builds intuition that frameworks like PyTorch will later abstract away. This foundation is invaluable when debugging and designing networks.

Practical Exercise

Implement a 2-layer neural network from scratch that:

  1. Solves the XOR problem (nonlinear)
  2. Computes forward and backward passes correctly
  3. Trains using gradient descent
  4. Visualizes loss over epochs
  5. Tests on new inputs
import numpy as np
import matplotlib.pyplot as plt

class SimpleNN:
    """2-layer neural network: input -> hidden -> output"""

    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01):
        # Your implementation
        # Hint: create W1 (input_size x hidden_size), b1, W2
        # (hidden_size x output_size), b2, and store learning_rate.
        pass

    def forward(self, X):
        # Forward pass: compute predictions
        # Hint: hidden = activation(X @ W1 + b1), output = sigmoid(hidden @ W2 + b2).
        # Cache the intermediates — backward() will need them.
        pass

    def backward(self, X, y):
        # Compute gradients with backpropagation
        # Hint: start from the output error and apply the chain rule layer
        # by layer, mirroring NeuralNetworkWithBackprop earlier in this lesson.
        pass

    def train(self, X, y, epochs=1000):
        # Training loop with gradient descent
        # Hint: forward -> loss -> backward -> parameter update each epoch;
        # collect the loss values and return them as a list.
        pass

# Test on XOR problem
# XOR is not linearly separable, so a hidden layer is required.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])

nn = SimpleNN(input_size=2, hidden_size=4, output_size=1, learning_rate=0.1)
losses = nn.train(X, y, epochs=5000)

# Visualization:
# 1. Loss curve over epochs
# 2. Final predictions vs targets
# 3. Decision boundary visualization

Complete this implementation to solidify your understanding before moving to PyTorch.