Neural Networks from Scratch
Before using PyTorch or TensorFlow, understanding how neural networks work at the mathematical level is essential. You’ll implement a simple neural network from scratch in NumPy, building intuition for backpropagation and gradient descent.
The Perceptron: Foundation of Neural Networks
The perceptron is the simplest neural network unit. It takes inputs, applies weights, adds a bias, and passes through an activation function:
import numpy as np
class Perceptron:
    """A single linear threshold unit trained with the classic perceptron rule."""

    def __init__(self, n_inputs, learning_rate=0.01):
        # Small random weights break symmetry; the bias starts at zero.
        self.weights = np.random.randn(n_inputs) * 0.01
        self.bias = 0
        self.learning_rate = learning_rate

    def forward(self, X):
        """Affine pre-activation: z = X @ w + b."""
        return np.dot(X, self.weights) + self.bias

    def predict(self, X):
        """Step activation: 1 where z > 0, otherwise 0."""
        return (self.forward(X) > 0).astype(int)

    def train(self, X, y, epochs=100):
        """Apply the perceptron update once per sample, for `epochs` full passes."""
        for _ in range(epochs):
            for sample, target in zip(X, y):
                predicted = self.predict(sample.reshape(1, -1))[0]
                delta = target - predicted
                # delta == 0 for correct predictions, so these lines are no-ops then.
                self.weights += self.learning_rate * delta * sample
                self.bias += self.learning_rate * delta
# Example: Classify linearly separable data
# The four 2-bit inputs with OR labels are linearly separable, so a single
# perceptron can fit them exactly.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([0, 1, 1, 1]) # OR operation
perceptron = Perceptron(n_inputs=2)
perceptron.train(X, y, epochs=10)
# Predict each sample individually (predict expects a 2-D batch, hence reshape).
predictions = np.array([perceptron.predict(x.reshape(1, -1))[0] for x in X])
print(f"Predictions: {predictions}")
print(f"Accuracy: {(predictions == y).mean():.2f}")
Activation Functions: Adding Nonlinearity
Without activation functions, stacking layers would just create another linear transformation. Activation functions introduce nonlinearity:
import matplotlib.pyplot as plt
# Create sample input: 100 evenly spaced points covering [-5, 5].
z = np.linspace(-5, 5, 100)
# Sigmoid: smooth 0-1 squashing
sigmoid = 1 / (1 + np.exp(-z))
# ReLU: popular in deep networks
relu = np.maximum(0, z)
# Tanh: smooth -1 to 1
tanh = np.tanh(z)
# Plot all three activations side by side over the same input range.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
axes[0].plot(z, sigmoid)
axes[0].set_title('Sigmoid')
axes[0].grid()
axes[0].set_ylabel('Output')
axes[1].plot(z, relu)
axes[1].set_title('ReLU')
axes[1].grid()
axes[2].plot(z, tanh)
axes[2].set_title('Tanh')
axes[2].grid()
plt.tight_layout()
plt.show()
# Derivatives needed for backprop
def sigmoid_derivative(z):
    """Gradient of the logistic sigmoid, using the identity s'(z) = s(z) * (1 - s(z))."""
    sig = 1 / (1 + np.exp(-z))
    return sig * (1 - sig)
def relu_derivative(z):
    """Piecewise derivative of ReLU: 1.0 for positive inputs, 0.0 otherwise (including z == 0)."""
    positive_mask = z > 0
    return positive_mask.astype(float)
def tanh_derivative(z):
    """Derivative of tanh: 1 - tanh(z)^2."""
    t = np.tanh(z)
    return 1 - t**2
Forward Propagation: Computing Predictions
A multi-layer neural network stacks multiple transformations:
class NeuralNetwork:
    """Fully connected feed-forward network: ReLU hidden layers, sigmoid output.

    Parameters live in ``self.params`` under keys ``W1..Wn`` / ``b1..bn``.
    Each forward pass records pre-activations and activations in ``self.cache``
    under ``Z1..Zn`` / ``A0..An`` so a backward pass can reuse them.
    """

    def __init__(self, layer_sizes):
        """
        layer_sizes: [input_size, hidden1_size, hidden2_size, output_size]
        Example: [784, 128, 64, 10] for MNIST
        """
        self.layer_sizes = layer_sizes
        self.layers = len(layer_sizes) - 1
        self.params = {}
        # One weight matrix and one bias row per layer transition.
        for idx, (fan_in, fan_out) in enumerate(zip(layer_sizes, layer_sizes[1:]), start=1):
            self.params[f'W{idx}'] = np.random.randn(fan_in, fan_out) * 0.01
            self.params[f'b{idx}'] = np.zeros((1, fan_out))

    def forward(self, X):
        """Propagate X through every layer; return the sigmoid output activations."""
        self.cache = {'A0': X}
        activation = X
        # Every layer except the last applies ReLU.
        for idx in range(1, self.layers):
            pre = np.dot(activation, self.params[f'W{idx}']) + self.params[f'b{idx}']
            activation = np.maximum(0, pre)  # ReLU
            self.cache[f'Z{idx}'] = pre
            self.cache[f'A{idx}'] = activation
        # Final layer squashes into (0, 1) with a sigmoid.
        pre = np.dot(activation, self.params[f'W{self.layers}']) + self.params[f'b{self.layers}']
        activation = 1 / (1 + np.exp(-pre))  # Sigmoid
        self.cache[f'Z{self.layers}'] = pre
        self.cache[f'A{self.layers}'] = activation
        return activation
# Example network: 2 -> 4 -> 1
# (2 input features, one hidden layer of 4 ReLU units, 1 sigmoid output)
nn = NeuralNetwork([2, 4, 1])
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
output = nn.forward(X)
# With untrained 0.01-scale weights the sigmoid outputs sit very close to 0.5.
print(f"Predictions shape: {output.shape}")
print(f"First prediction: {output[0]}")
Backpropagation: Learning from Errors
Backpropagation computes gradients of the loss with respect to weights, enabling gradient descent:
class NeuralNetworkWithBackprop(NeuralNetwork):
    """Extends NeuralNetwork with backpropagation and gradient-descent training.

    The output layer is a sigmoid, and the output-layer gradient ``dZ = A - y``
    used below is the exact gradient of the binary cross-entropy loss with
    respect to the output pre-activation — so that is the loss this network
    actually minimizes (the original comment mislabeled it as the MSE gradient).
    """

    def backward(self, y):
        """
        Compute gradients of the loss w.r.t. every weight and bias via the chain rule.

        y: true labels, shape (m, output_size)

        Requires a preceding forward() call (reads ``self.cache``); stores
        results in ``self.gradients`` under ``dW1..dWn`` / ``db1..dbn``.
        """
        m = y.shape[0]
        self.gradients = {}
        # Output layer: for a sigmoid output with binary cross-entropy loss,
        # dL/dZ simplifies to (A - y).  NOTE: this is the cross-entropy
        # gradient, not the MSE gradient.
        dZ = self.cache[f'A{self.layers}'] - y
        # Walk backwards through the layers.
        for i in range(self.layers, 0, -1):
            # Gradients w.r.t. this layer's weights and biases, averaged over the batch.
            dW = np.dot(self.cache[f'A{i-1}'].T, dZ) / m
            db = np.sum(dZ, axis=0, keepdims=True) / m
            self.gradients[f'dW{i}'] = dW
            self.gradients[f'db{i}'] = db
            # Propagate the error into the previous (ReLU) layer.
            if i > 1:
                dA = np.dot(dZ, self.params[f'W{i}'].T)
                dZ = dA * (self.cache[f'Z{i-1}'] > 0)  # ReLU derivative

    def update_weights(self, learning_rate=0.01):
        """Vanilla gradient-descent step over all parameters."""
        for i in range(1, self.layers + 1):
            self.params[f'W{i}'] -= learning_rate * self.gradients[f'dW{i}']
            self.params[f'b{i}'] -= learning_rate * self.gradients[f'db{i}']

    def train(self, X, y, epochs=100, learning_rate=0.01):
        """Full-batch training loop; returns the per-epoch loss history.

        The reported loss is binary cross-entropy, matching the gradient
        computed in backward() (the original reported MSE while descending
        the cross-entropy gradient, so the logged number did not track the
        optimized objective).
        """
        losses = []
        for epoch in range(epochs):
            # Forward pass
            output = self.forward(X)
            # Binary cross-entropy; clip predictions to keep log() finite.
            p = np.clip(output, 1e-12, 1 - 1e-12)
            loss = -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))
            losses.append(loss)
            # Backward pass, then parameter update.
            self.backward(y)
            self.update_weights(learning_rate)
            if (epoch + 1) % 20 == 0:
                print(f"Epoch {epoch+1}, Loss: {loss:.4f}")
        return losses
# Train network on XOR (nonlinear problem)
# XOR is not linearly separable, so a hidden layer is required.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]]) # XOR
nn = NeuralNetworkWithBackprop([2, 4, 1])
losses = nn.train(X, y, epochs=1000, learning_rate=0.1)
# Check predictions
# NOTE(review): with 0.01-scale random init, 1000 epochs may not always be
# enough for the outputs to separate cleanly — rerun or raise epochs if not.
predictions = nn.forward(X)
print("\nFinal predictions:")
for x, pred in zip(X, predictions):
    print(f"{x} -> {pred[0]:.4f}")
Gradient Descent: Optimization
Gradient descent minimizes loss by updating weights in the direction of negative gradients:
import matplotlib.pyplot as plt
def loss_landscape(w1, w2):
    """Toy convex quadratic bowl whose minimum sits at (w1, w2) = (2, -1)."""
    horizontal = (w1 - 2)**2
    vertical = 2 * (w2 + 1)**2
    return horizontal + vertical
# Visualize landscape over a grid around the minimum at (2, -1).
w1_range = np.linspace(-2, 6, 100)
w2_range = np.linspace(-5, 3, 100)
W1, W2 = np.meshgrid(w1_range, w2_range)
Loss = loss_landscape(W1, W2)
plt.figure(figsize=(12, 5))
# Contour plot
plt.subplot(1, 2, 1)
plt.contour(W1, W2, Loss, levels=20)
plt.xlabel('w1')
plt.ylabel('w2')
plt.title('Loss Landscape')
# Gradient descent optimization: start at (0, 0) and repeatedly step
# opposite the analytic gradient, recording every point visited.
w = np.array([0.0, 0.0])
learning_rate = 0.01
path = [w.copy()]
for _ in range(100):
    # Gradient — analytic partial derivatives of loss_landscape:
    # d/dw1 (w1-2)^2 = 2(w1-2);  d/dw2 2(w2+1)^2 = 4(w2+1)
    grad_w1 = 2 * (w[0] - 2)
    grad_w2 = 4 * (w[1] + 1)
    gradient = np.array([grad_w1, grad_w2])
    # Update (copy() so the stored path points are not aliased)
    w = w - learning_rate * gradient
    path.append(w.copy())
path = np.array(path)
# Plot path over the same contours, with the true minimum starred.
plt.subplot(1, 2, 2)
plt.contour(W1, W2, Loss, levels=20, alpha=0.6)
plt.plot(path[:, 0], path[:, 1], 'ro-', markersize=3, linewidth=1)
plt.scatter([2], [-1], c='g', s=200, marker='*', label='Minimum')
plt.xlabel('w1')
plt.ylabel('w2')
plt.title('Gradient Descent Path')
plt.legend()
plt.tight_layout()
plt.show()
Key Challenges: Vanishing and Exploding Gradients
Deep networks face gradient flow problems:
# Vanishing gradients: sigmoid derivative is small (max 0.25, at z = 0)
z = np.array([[-5, -2, 0, 2, 5]])
sig = 1 / (1 + np.exp(-z))
sig_grad = sig * (1 - sig)
print(f"Sigmoid gradients: {sig_grad}")
print(f"Max gradient: {sig_grad.max():.4f}")
# In deep networks, these multiply: 0.25^100 ≈ 0 (vanishing!)
# Solution: Use ReLU instead
# Exploding gradients: large weight updates
# Solution: Gradient clipping, batch normalization
# Example: clip gradients by rescaling the whole vector so its L2 norm is
# at most max_norm — norm clipping preserves the gradient's direction.
gradient = np.array([-100, 50, -200, 75])
max_norm = 1.0
if np.linalg.norm(gradient) > max_norm:
    gradient = gradient * (max_norm / np.linalg.norm(gradient))
print(f"Clipped gradient: {gradient}")
Key Takeaway
Understanding forward propagation, backpropagation, and gradient descent at the mathematical level builds intuition that frameworks like PyTorch will later abstract away. This foundation is invaluable when debugging and designing networks.
Practical Exercise
Implement a 2-layer neural network from scratch that:
- Solves the XOR problem (nonlinear)
- Computes forward and backward passes correctly
- Trains using gradient descent
- Visualizes loss over epochs
- Tests on new inputs
import numpy as np
import matplotlib.pyplot as plt
class SimpleNN:
    """2-layer neural network: input -> hidden -> output

    Exercise skeleton: every method is intentionally left as a stub for the
    reader to implement, mirroring the NeuralNetworkWithBackprop design above.
    """
    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01):
        # Your implementation
        # (e.g. initialize small random weight matrices and zero biases)
        pass
    def forward(self, X):
        # Forward pass: compute predictions
        pass
    def backward(self, X, y):
        # Compute gradients with backpropagation
        pass
    def train(self, X, y, epochs=1000):
        # Training loop with gradient descent
        # (the calling code expects this to return the per-epoch loss history)
        pass
# Test on XOR problem
# NOTE: this driver only works once SimpleNN's stub methods are implemented;
# as written, train() returns None.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])
nn = SimpleNN(input_size=2, hidden_size=4, output_size=1, learning_rate=0.1)
losses = nn.train(X, y, epochs=5000)
# Visualization:
# 1. Loss curve over epochs
# 2. Final predictions vs targets
# 3. Decision boundary visualization
Complete this implementation to solidify your understanding before moving to PyTorch.