Intermediate

Advanced CNN Architectures

Lesson 1 of 4 · Estimated time: 55 min

Advanced CNN Architectures

Modern CNNs go far beyond simple stacked convolutions. Architectures like ResNet, EfficientNet, and Vision Transformers have revolutionized computer vision through clever innovations in connectivity patterns, normalization, and attention mechanisms.

Residual Connections: The ResNet Revolution

ResNets introduced skip connections, allowing gradients to flow directly through layers:

import torch
import torch.nn as nn

class ResidualBlock(nn.Module):
    """Basic two-conv residual block: out = ReLU(F(x) + shortcut(x)).

    The shortcut is the identity by default; a 1x1 conv + BatchNorm
    projection is used whenever the stride or channel count changes, so
    the two branches can be added element-wise.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()

        # Residual (main) branch: conv-BN-ReLU-conv-BN.
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Shortcut branch: project only when input/output shapes differ.
        needs_projection = stride != 1 or in_channels != out_channels
        if needs_projection:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.shortcut = nn.Sequential()  # empty == identity

    def forward(self, x):
        residual = self.shortcut(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        # Skip connection (key innovation!): add before the final activation.
        return self.relu(out + residual)

class ResNet(nn.Module):
    """Generic ResNet backbone built from a user-supplied block type.

    Layout: 7x7 stem conv -> max pool -> four residual stages
    (64/128/256/512 channels) -> global average pool -> linear classifier.
    `layers` gives the number of blocks per stage, e.g. [3, 4, 6, 3].
    """

    def __init__(self, block, layers, num_classes=1000):
        super().__init__()

        self.in_channels = 64

        # Stem: aggressive downsampling before the residual stages.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four residual stages; all but the first halve spatial resolution.
        self.layer1 = self._make_layer(block, 64, layers[0], stride=1)
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        # Head: collapse spatial dims, then classify.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)

    def _make_layer(self, block, channels, blocks, stride):
        # Only the first block of a stage may downsample / change width.
        stage = [block(self.in_channels, channels, stride)]
        self.in_channels = channels
        stage.extend(block(channels, channels, stride=1) for _ in range(1, blocks))
        return nn.Sequential(*stage)

    def forward(self, x):
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))
        x = self.layer4(self.layer3(self.layer2(self.layer1(x))))
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)  # [B, 512, 1, 1] -> [B, 512]
        return self.fc(x)

# Create ResNet-34: 4 stages with 3, 4, 6, 3 basic residual blocks.
# NOTE: true ResNet-50 uses *bottleneck* blocks (1x1-3x3-1x1 with a 4x
# channel expansion); with the plain two-conv ResidualBlock above, the
# [3, 4, 6, 3] layout is ResNet-34, not ResNet-50.
resnet34 = ResNet(ResidualBlock, [3, 4, 6, 3], num_classes=1000)
x = torch.randn(1, 3, 224, 224)
output = resnet34(x)
print(f"Output shape: {output.shape}")  # [1, 1000]

Skip connections solve the vanishing gradient problem, enabling networks 100+ layers deep.

EfficientNet: Scaling with Efficiency

EfficientNet scales networks systematically—not just depth, but width and resolution:

# EfficientNet concepts (using torchvision)
from torchvision import models

# Load pretrained EfficientNets. The legacy `pretrained=True` flag is
# deprecated (removed in recent torchvision); the weights enum is the
# supported API and matches the transfer-learning examples below.
# NOTE: each call downloads the checkpoint on first use.
efficient_net_b0 = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1)
efficient_net_b7 = models.efficientnet_b7(weights=models.EfficientNet_B7_Weights.IMAGENET1K_V1)

# B0 to B7 systematically scale depth, width, and resolution
# B0: baseline, B7: largest variant

# For your task: freeze the backbone, then replace the final layer.
for param in efficient_net_b0.parameters():
    param.requires_grad = False

num_classes = 10
# classifier is Sequential(Dropout, Linear); index 1 is the Linear head.
# The new Linear is created after the freeze, so it remains trainable.
efficient_net_b0.classifier[1] = nn.Linear(efficient_net_b0.classifier[1].in_features, num_classes)

x = torch.randn(4, 3, 224, 224)
output = efficient_net_b0(x)
print(f"Output shape: {output.shape}")  # [4, 10]

EfficientNet’s compound scaling formula:

  • Depth: α^φ (deeper networks)
  • Width: β^φ (more channels)
  • Resolution: γ^φ (higher input resolution)

All scaled together by compound coefficient φ. This balanced approach achieves better accuracy per parameter.

Vision Transformers: Attention for Images

Vision Transformers (ViT) apply the transformer architecture to images:

class PatchEmbedding(nn.Module):
    """Split an image into non-overlapping patches and embed each one.

    Implemented as a single conv whose kernel size and stride both equal
    the patch size — equivalent to flattening each patch and applying a
    shared linear projection to `embed_dim`.
    """

    def __init__(self, img_size, patch_size, in_channels, embed_dim):
        super().__init__()

        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = (img_size // patch_size) ** 2

        # One conv application per patch -> one embedding per patch.
        self.proj = nn.Conv2d(in_channels, embed_dim,
                              kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        # [B, C, H, W] -> [B, embed_dim, H/p, W/p]
        patches = self.proj(x)
        # -> [B, embed_dim, num_patches] -> [B, num_patches, embed_dim]
        return patches.flatten(2).transpose(1, 2)

class VisionTransformer(nn.Module):
    """Minimal ViT: patch embedding + class token + transformer encoder.

    Args:
        img_size: input image height/width (square images assumed).
        patch_size: side length of each square patch.
        in_channels: number of input channels (3 for RGB).
        num_classes: size of the classification output.
        embed_dim: token embedding dimension.
        num_layers: number of transformer encoder layers.
        num_heads: attention heads per encoder layer.
        mlp_dim: hidden size of each encoder's feed-forward block
            (default 3072, the ViT-Base value; previously hard-coded).
        dropout: dropout rate inside the encoder layers
            (default 0.1; previously hard-coded).
    """

    def __init__(self, img_size=224, patch_size=16, in_channels=3,
                 num_classes=1000, embed_dim=768, num_layers=12, num_heads=12,
                 mlp_dim=3072, dropout=0.1):
        super().__init__()

        self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim)
        num_patches = self.patch_embed.num_patches

        # Learnable class token, prepended to every patch sequence.
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))

        # Learnable positional embeddings (one per patch, plus the CLS slot).
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))

        # Stack of standard transformer encoder layers.
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads,
                                                   dim_feedforward=mlp_dim,
                                                   dropout=dropout,
                                                   batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Classification head applied to the final CLS representation.
        self.norm = nn.LayerNorm(embed_dim)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        # [B, C, H, W] -> [B, num_patches, embed_dim]
        x = self.patch_embed(x)

        # Prepend one CLS token per sample.
        cls_tokens = self.cls_token.expand(x.size(0), -1, -1)
        x = torch.cat([cls_tokens, x], dim=1)  # [B, num_patches+1, embed_dim]

        # Inject position information, then run the encoder.
        x = x + self.pos_embed
        x = self.transformer(x)

        # Classify from the CLS token only.
        x = x[:, 0]
        x = self.norm(x)
        return self.fc(x)

# Create a ViT-Base-style model: 224x224 input, 16x16 patches -> 196 tokens
vit = VisionTransformer(img_size=224, patch_size=16, num_classes=1000)
x = torch.randn(4, 3, 224, 224)  # dummy batch of 4 RGB images
output = vit(x)
print(f"Output shape: {output.shape}")  # [4, 1000]

ViT treats images as sequences of patches, applying pure transformer attention with no convolutional inductive bias. Surprisingly, this works remarkably well — particularly when pretrained on large datasets, where ViT matches or exceeds comparable CNNs.

Transfer Learning with Pretrained Models

Most modern projects start with pretrained weights:

from torch import optim  # needed for the optimizers below; never imported before
from torchvision import models

# Load pretrained models (ImageNet-1k weights, downloaded on first use)
resnet50 = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
vit_b16 = models.vit_b_16(weights=models.ViT_B_16_Weights.IMAGENET1K_V1)
efficient_b0 = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1)

# Fine-tuning strategy 1: Freeze backbone, train only head
for param in resnet50.parameters():
    param.requires_grad = False

# The replacement head is created after the freeze, so its parameters
# keep requires_grad=True and are the only ones trained.
resnet50.fc = nn.Linear(resnet50.fc.in_features, 10)  # Your task classes

optimizer = optim.Adam(resnet50.fc.parameters(), lr=0.001)

# Fine-tuning strategy 2: Lower learning rate for backbone.
# Compare parameters by identity (id): `p in resnet50.fc.parameters()`
# would trigger elementwise tensor `==`, which raises for mismatched
# shapes / ambiguous truth values.
head_param_ids = {id(p) for p in resnet50.fc.parameters()}
base_params = [p for p in resnet50.parameters() if id(p) not in head_param_ids]
head_params = list(resnet50.fc.parameters())

param_groups = [
    {'params': base_params, 'lr': 0.0001},
    {'params': head_params, 'lr': 0.001}
]
optimizer = optim.Adam(param_groups)

# Fine-tuning strategy 3: discriminative learning rates — earlier (more
# generic) layers get the smallest learning rates; later layers and the
# new head get progressively larger ones.
groups = []
lr = 0.0001
for layer in [resnet50.layer1, resnet50.layer2, resnet50.layer3, resnet50.layer4]:
    groups.append({'params': layer.parameters(), 'lr': lr})
    lr *= 2

groups.append({'params': resnet50.fc.parameters(), 'lr': lr})
optimizer = optim.Adam(groups)

Multi-Task Learning

Train single network on multiple related tasks:

class MultiTaskCNN(nn.Module):
    """One shared ResNet-50 backbone feeding three task-specific heads.

    Heads: 10-way classification, 4-value bounding-box regression, and a
    128-d feature embedding. The backbone runs once per input; all heads
    consume the same pooled feature vector.
    """

    def __init__(self):
        super().__init__()

        # Shared backbone: pretrained ResNet-50 with its classifier removed
        # so it emits raw pooled features instead of class logits.
        self.backbone = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
        backbone_out = self.backbone.fc.in_features
        self.backbone.fc = nn.Identity()

        # One linear head per task.
        self.task1_head = nn.Linear(backbone_out, 10)   # Classification
        self.task2_head = nn.Linear(backbone_out, 4)    # Bounding box
        self.task3_head = nn.Linear(backbone_out, 128)  # Feature embedding

    def forward(self, x):
        # Single backbone pass, then fan out to every head.
        features = self.backbone(x)
        return (self.task1_head(features),
                self.task2_head(features),
                self.task3_head(features))

# Training with combined loss
model = MultiTaskCNN()
x = torch.randn(4, 3, 224, 224)
task1_pred, task2_pred, task3_pred = model(x)

# Per-task losses (dummy targets for illustration)
loss_task1 = nn.CrossEntropyLoss()(task1_pred, torch.randint(0, 10, (4,)))
loss_task2 = nn.MSELoss()(task2_pred, torch.randn(4, 4))
# CosineSimilarity returns a per-sample similarity in [-1, 1] where higher
# is better — it is NOT a loss by itself (wrong sign, and a [4]-vector).
# Convert to a scalar loss by minimizing mean(1 - cos_sim).
cos_sim = nn.CosineSimilarity(dim=1)(task3_pred, torch.randn(4, 128))
loss_task3 = (1 - cos_sim).mean()

# Weighted sum of scalar losses -> single scalar suitable for .backward()
total_loss = loss_task1 + 0.5 * loss_task2 + 0.2 * loss_task3

Key Takeaway

Skip connections, systematic scaling, and attention mechanisms represent the evolution of CNN design. In practice, starting with a pretrained model (ResNet, EfficientNet, or ViT) and fine-tuning for your task is the standard approach—training from scratch is rarely necessary.

Practical Exercise

Build a multi-scale object detector:

import torch
import torch.nn as nn
from torchvision import models

# Your task:
# 1. Load pretrained ResNet50
# 2. Extract features from multiple layers
# 3. Create FPN (Feature Pyramid Network)
# 4. Add detection head for multi-scale objects
# 5. Implement loss combining classification + localization

# Expected components:
# - Feature extraction at different scales
# - Pyramid construction
# - RPN (Region Proposal Network) head
# - Classification and bounding box regression heads
# - Training on COCO or Pascal VOC

# Challenge: Implement FPN module that:
# - Takes features from C3, C4, C5 (different resolution levels)
# - Creates P3-P7 pyramids with consistent dimensions
# - Applies lateral and smooth convolutions

This exercise teaches modern architecture design principles through detection.