Advanced CNN Architectures
Modern CNNs go far beyond simple stacked convolutions. Architectures like ResNet, EfficientNet, and Vision Transformers have revolutionized computer vision through clever innovations in connectivity patterns, normalization, and attention mechanisms.
Residual Connections: The ResNet Revolution
ResNets introduced skip connections, allowing gradients to flow directly through layers:
import torch
import torch.nn as nn
class ResidualBlock(nn.Module):
    """Basic ResNet building block: two 3x3 convs plus an identity shortcut.

    Computes relu(F(x) + shortcut(x)), where the shortcut is the identity
    when input and output shapes match, and a 1x1 projection otherwise.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        # Main path: conv -> BN -> ReLU -> conv -> BN
        self.conv1 = nn.Conv2d(
            in_channels, out_channels, kernel_size=3,
            stride=stride, padding=1, bias=False,
        )
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(
            out_channels, out_channels, kernel_size=3, padding=1, bias=False,
        )
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Shortcut path: identity unless spatial size or channel count
        # changes, in which case a 1x1 conv projects x to the output shape.
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        residual = self.shortcut(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        # Skip connection (key innovation): add the input back in, then ReLU.
        return self.relu(out + residual)
class ResNet(nn.Module):
    """Generic ResNet: stem conv -> four residual stages -> global pool -> FC.

    Args:
        block: residual block class, called as ``block(in_ch, out_ch, stride)``.
        layers: number of blocks in each of the four stages.
        num_classes: size of the final classification layer.
    """

    def __init__(self, block, layers, num_classes=1000):
        super().__init__()
        self.in_channels = 64

        # Stem: 7x7 conv halves resolution, max-pool halves it again.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four residual stages; stages 2-4 halve resolution and double width.
        self.layer1 = self._make_layer(block, 64, layers[0], stride=1)
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        # Global average pooling followed by the classifier head.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)

    def _make_layer(self, block, channels, blocks, stride):
        # The first block of a stage may downsample; the rest keep shape.
        stage = [block(self.in_channels, channels, stride)]
        self.in_channels = channels
        stage.extend(block(channels, channels, stride=1) for _ in range(1, blocks))
        return nn.Sequential(*stage)

    def forward(self, x):
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)  # same as x.view(x.size(0), -1)
        return self.fc(x)
# Build a ResNet-34-style network: basic (two-conv) blocks arranged 3-4-6-3.
# NOTE: the original comment called this ResNet-50, but ResNet-50 uses
# three-conv *bottleneck* blocks with a 4x channel expansion; with the basic
# ResidualBlock above, the [3, 4, 6, 3] layout is ResNet-34.
resnet34 = ResNet(ResidualBlock, [3, 4, 6, 3], num_classes=1000)
x = torch.randn(1, 3, 224, 224)
output = resnet34(x)
print(f"Output shape: {output.shape}")  # [1, 1000]
Skip connections solve the vanishing gradient problem, enabling networks 100+ layers deep.
EfficientNet: Scaling with Efficiency
EfficientNet scales networks systematically—not just depth, but width and resolution:
# EfficientNet concepts (using torchvision)
from torchvision import models

# Load pretrained EfficientNets via the modern `weights=` API.
# (`pretrained=True` is deprecated since torchvision 0.13; the enum form
# also matches the transfer-learning examples later in this chapter.)
efficient_net_b0 = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1)
efficient_net_b7 = models.efficientnet_b7(weights=models.EfficientNet_B7_Weights.IMAGENET1K_V1)

# B0 to B7 systematically scale depth, width, and resolution
# B0: baseline, B7: largest variant

# For your task: freeze the backbone and replace only the final layer.
for param in efficient_net_b0.parameters():
    param.requires_grad = False

num_classes = 10
# classifier is Sequential(Dropout, Linear); index 1 is the Linear head.
# The new Linear is trainable by default (requires_grad=True).
efficient_net_b0.classifier[1] = nn.Linear(
    efficient_net_b0.classifier[1].in_features, num_classes
)

x = torch.randn(4, 3, 224, 224)
output = efficient_net_b0(x)
print(f"Output shape: {output.shape}")  # [4, 10]
EfficientNet’s compound scaling formula:
- Depth: α^φ (deeper networks)
- Width: β^φ (more channels)
- Resolution: γ^φ (higher input resolution)
All scaled together by compound coefficient φ. This balanced approach achieves better accuracy per parameter.
Vision Transformers: Attention for Images
Vision Transformers (ViT) apply the transformer architecture to images:
class PatchEmbedding(nn.Module):
    """Split an image into non-overlapping patches and embed each as a vector.

    A Conv2d whose kernel size equals its stride extracts every patch and
    applies the linear projection in a single operation.
    """

    def __init__(self, img_size, patch_size, in_channels, embed_dim):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = (img_size // patch_size) ** 2
        # Patch extraction + linear embedding fused into one conv.
        self.proj = nn.Conv2d(
            in_channels, embed_dim, kernel_size=patch_size, stride=patch_size
        )

    def forward(self, x):
        # x: [batch, channels, H, W] -> [batch, embed_dim, H/p, W/p]
        patches = self.proj(x)
        # Flatten the spatial grid, then put the sequence axis before the
        # feature axis: [batch, embed_dim, n] -> [batch, n, embed_dim]
        return patches.flatten(2).transpose(1, 2)
class VisionTransformer(nn.Module):
    """Minimal ViT: patch embedding + [CLS] token + transformer encoder head."""

    def __init__(self, img_size=224, patch_size=16, in_channels=3,
                 num_classes=1000, embed_dim=768, num_layers=12, num_heads=12):
        super().__init__()
        self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim)
        num_patches = self.patch_embed.num_patches

        # Learnable class token, prepended to every patch sequence.
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        # Learnable positional embeddings for [CLS] + each patch position.
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))

        # Stack of standard transformer encoder layers.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=num_heads,
            dim_feedforward=3072, dropout=0.1, batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Classification head applied to the final [CLS] representation.
        self.norm = nn.LayerNorm(embed_dim)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        tokens = self.patch_embed(x)  # [batch, num_patches, embed_dim]
        # Prepend one [CLS] token per sample, then add position information.
        cls = self.cls_token.expand(tokens.size(0), -1, -1)
        tokens = torch.cat([cls, tokens], dim=1) + self.pos_embed
        tokens = self.transformer(tokens)
        # Classify from the [CLS] token alone.
        return self.fc(self.norm(tokens[:, 0]))
# Instantiate a ViT-Base/16-sized model and run a sanity-check forward pass.
vit = VisionTransformer(img_size=224, patch_size=16, num_classes=1000)
batch = torch.randn(4, 3, 224, 224)
output = vit(batch)
print(f"Output shape: {output.shape}")  # [4, 1000]
ViT treats images as sequences of patches, applying pure transformer attention. Surprisingly, it works remarkably well and scales to larger datasets.
Transfer Learning with Pretrained Models
Most modern projects start with pretrained weights:
from torchvision import models
# Load pretrained models (trained on ImageNet-21k or ImageNet-1k)
resnet50 = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
vit_b16 = models.vit_b_16(weights=models.ViT_B_16_Weights.IMAGENET1K_V1)
efficient_b0 = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1)
# Fine-tuning strategy 1: Freeze backbone, train only head
for param in resnet50.parameters():
param.requires_grad = False
resnet50.fc = nn.Linear(resnet50.fc.in_features, 10) # Your task classes
optimizer = optim.Adam(resnet50.fc.parameters(), lr=0.001)
# Fine-tuning strategy 2: Lower learning rate for backbone
base_params = [p for p in resnet50.parameters() if p not in resnet50.fc.parameters()]
head_params = list(resnet50.fc.parameters())
param_groups = [
{'params': base_params, 'lr': 0.0001},
{'params': head_params, 'lr': 0.001}
]
optimizer = optim.Adam(param_groups)
# Fine-tuning strategy 3: Gradual unfreezing (discriminative learning rates)
groups = []
lr = 0.0001
for layer in [resnet50.layer4, resnet50.layer3, resnet50.layer2, resnet50.layer1]:
groups.append({'params': layer.parameters(), 'lr': lr})
lr *= 2
groups.append({'params': resnet50.fc.parameters(), 'lr': lr})
optimizer = optim.Adam(groups)
Multi-Task Learning
Train single network on multiple related tasks:
class MultiTaskCNN(nn.Module):
    """One shared ResNet-50 backbone feeding three task-specific heads."""

    def __init__(self):
        super().__init__()
        # Shared backbone: pretrained ResNet-50 with its classifier removed.
        self.backbone = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
        feature_dim = self.backbone.fc.in_features
        self.backbone.fc = nn.Identity()  # expose pooled features directly

        # One lightweight linear head per task.
        self.task1_head = nn.Linear(feature_dim, 10)   # Classification
        self.task2_head = nn.Linear(feature_dim, 4)    # Bounding box
        self.task3_head = nn.Linear(feature_dim, 128)  # Feature embedding

    def forward(self, x):
        shared = self.backbone(x)
        # Every head consumes the same shared representation.
        return (
            self.task1_head(shared),
            self.task2_head(shared),
            self.task3_head(shared),
        )
# Training with combined loss
model = MultiTaskCNN()
x = torch.randn(4, 3, 224, 224)
task1_pred, task2_pred, task3_pred = model(x)

# Per-task losses.
# BUG FIX: nn.CosineSimilarity returns a per-sample *similarity* vector
# (shape [4], higher = better), not a scalar loss — summing it in as-is
# makes total_loss a tensor that backward() rejects, and minimizing raw
# similarity would push embeddings apart. Convert it to a scalar loss:
# average over the batch and subtract from 1.
loss_task1 = nn.CrossEntropyLoss()(task1_pred, torch.randint(0, 10, (4,)))
loss_task2 = nn.MSELoss()(task2_pred, torch.randn(4, 4))
loss_task3 = 1 - nn.CosineSimilarity(dim=1)(task3_pred, torch.randn(4, 128)).mean()

# Weighted sum: a single scalar suitable for total_loss.backward().
total_loss = loss_task1 + 0.5 * loss_task2 + 0.2 * loss_task3
Key Takeaway
Skip connections, systematic scaling, and attention mechanisms represent the evolution of CNN design. In practice, starting with a pretrained model (ResNet, EfficientNet, or ViT) and fine-tuning for your task is the standard approach—training from scratch is rarely necessary.
Practical Exercise
Build a multi-scale object detector:
import torch
import torch.nn as nn
from torchvision import models
# Your task:
# 1. Load pretrained ResNet50
# 2. Extract features from multiple layers
# 3. Create FPN (Feature Pyramid Network)
# 4. Add detection head for multi-scale objects
# 5. Implement loss combining classification + localization
# Expected components:
# - Feature extraction at different scales
# - Pyramid construction
# - RPN (Region Proposal Network) head
# - Classification and bounding box regression heads
# - Training on COCO or Pascal VOC
# Challenge: Implement FPN module that:
# - Takes features from C3, C4, C5 (different resolution levels)
# - Creates P3-P7 pyramids with consistent dimensions
# - Applies lateral and smooth convolutions
This exercise teaches modern architecture design principles through detection.