写在前面

参考书籍

Aston Zhang, Zachary C. Lipton, Mu Li, Alexander J. Smola. Dive into Deep Learning. 2020.

简介 - Dive-into-DL-PyTorch (tangshusen.me)

现代卷积神经网络

source code: NJU-ymhui/DeepLearning: Deep Learning with pytorch (github.com)

use git to clone: https://github.com/NJU-ymhui/DeepLearning.git

/modernCNN

AlexNet.py VGG.py NiN.py GoogLeNet.py tensor_normalize_self.py tensor_normalize_lib.py

AlexNet

理论部分见8.1. Deep Convolutional Neural Networks (AlexNet) — Dive into Deep Learning 1.0.3 documentation (d2l.ai)

code

import torch
from torch import nn
from d2l import torch as d2l


if __name__ == "__main__":
    # AlexNet for single-channel images with a 10-class output.
    feature_layers = [
        # A large 11x11 window captures objects, and stride 4 shrinks the
        # output height and width. The channel count is much larger than
        # in LeNet.
        nn.Conv2d(1, 96, kernel_size=11, stride=4, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=3, stride=2),
        # Smaller 5x5 window; padding 2 keeps height and width unchanged
        # while the number of output channels grows.
        nn.Conv2d(96, 256, kernel_size=5, padding=2),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=3, stride=2),
        # Three consecutive 3x3 convolutions with no pooling in between.
        # The channel count grows again, except in the last convolution.
        nn.Conv2d(256, 384, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.Conv2d(384, 384, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.Conv2d(384, 256, kernel_size=3, padding=1),
        nn.ReLU(),
        # Final pooling reduces the spatial size before the dense layers.
        nn.MaxPool2d(kernel_size=3, stride=2),
    ]
    classifier_layers = [
        # Flatten the feature maps into one vector per example.
        nn.Flatten(),
        # Wide fully connected layers; dropout is used to reduce overfitting.
        nn.Linear(6400, 4096),
        nn.ReLU(),
        nn.Dropout(p=0.5),
        nn.Linear(4096, 4096),
        nn.ReLU(),
        nn.Dropout(p=0.5),
        # Output layer: 10 classes for Fashion-MNIST instead of the 1000
        # used in the paper.
        nn.Linear(4096, 10),
    ]
    net = nn.Sequential(*feature_layers, *classifier_layers)

    # Push one random 224x224 sample through the network and report the
    # output shape after every layer.
    sample = torch.randn(1, 1, 224, 224)
    for module in net:
        sample = module(sample)
        print(type(module).__name__, 'output shape: ', sample.shape)

    # Load Fashion-MNIST, resized to 224x224.
    batch_size = 128
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224)

    # Train, then show the training curves.
    print("start training:")
    lr = 0.01
    num_epochs = 10
    d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, device=d2l.try_gpu())
    d2l.plt.show()

output

Conv2d output shape:  torch.Size([1, 96, 54, 54])
ReLU output shape:  torch.Size([1, 96, 54, 54])
MaxPool2d output shape:  torch.Size([1, 96, 26, 26])
Conv2d output shape:  torch.Size([1, 256, 26, 26])
ReLU output shape:  torch.Size([1, 256, 26, 26])
MaxPool2d output shape:  torch.Size([1, 256, 12, 12])
Conv2d output shape:  torch.Size([1, 384, 12, 12])
ReLU output shape:  torch.Size([1, 384, 12, 12])
Conv2d output shape:  torch.Size([1, 384, 12, 12])
ReLU output shape:  torch.Size([1, 384, 12, 12])
Conv2d output shape:  torch.Size([1, 256, 12, 12])
ReLU output shape:  torch.Size([1, 256, 12, 12])
MaxPool2d output shape:  torch.Size([1, 256, 5, 5])
Flatten output shape:  torch.Size([1, 6400])
Linear output shape:  torch.Size([1, 4096])
ReLU output shape:  torch.Size([1, 4096])
Dropout output shape:  torch.Size([1, 4096])
Linear output shape:  torch.Size([1, 4096])
ReLU output shape:  torch.Size([1, 4096])
Dropout output shape:  torch.Size([1, 4096])
Linear output shape:  torch.Size([1, 10])
start training:
training on cpu
loss 0.331, train acc 0.879, test acc 0.884
26.8 examples/sec on cpu

使用块的网络VGG

VGG可用于启发设计深层神经网络。

经典卷积神经网络的基本组成部分是下面的这个序列：

带填充以保持分辨率的卷积层
非线性激活函数，如ReLU
汇聚层，如最大汇聚层

一个VGG块与之类似，由一系列卷积层组成，后面再加上用于空间下采样的最大汇聚层; 8.2. Networks Using Blocks (VGG) — Dive into Deep Learning 1.0.3 documentation (d2l.ai)

code

import torch
from torch import nn
from d2l import torch as d2l


def vgg_block(num_convs, in_channels, out_channels):
    """
    Build one VGG block.

    The block stacks ``num_convs`` pairs of a 3x3 convolution (padding 1)
    and a ReLU, and ends with a 2x2 max-pooling layer of stride 2.

    :param num_convs: number of convolutional layers in the block
    :param in_channels: number of input channels of the first convolution
    :param out_channels: number of output channels of every convolution
    :return: ``nn.Sequential`` holding the conv/ReLU pairs and the pooling layer
    """
    stack = []
    channels = in_channels
    for _ in range(num_convs):
        stack += [
            nn.Conv2d(channels, out_channels, kernel_size=3, padding=1),
            nn.ReLU(),
        ]
        # Every convolution after the first consumes the block's own output.
        channels = out_channels
    stack.append(nn.MaxPool2d(kernel_size=2, stride=2))
    return nn.Sequential(*stack)


# VGG network
def vgg(conv_arch):
    """
    Assemble a VGG network for single-channel input and 10 output classes.

    :param conv_arch: iterable of ``(num_convs, out_channels)`` pairs, one per
        VGG block
    :return: ``nn.Sequential`` of the VGG blocks followed by the classifier
    """
    blocks = []
    prev_channels = 1
    # Convolutional part: chain the blocks, feeding each block the channel
    # count produced by the previous one.
    for num_convs, out_channels in conv_arch:
        blocks.append(vgg_block(num_convs, prev_channels, out_channels))
        prev_channels = out_channels

    # Fully connected part. The first layer expects a 7x7 feature map from the
    # last block, so its input size depends on that block's output channels.
    classifier = [
        nn.Flatten(),
        nn.Linear(out_channels * 7 * 7, 4096),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(4096, 4096),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(4096, 10),
    ]
    return nn.Sequential(*blocks, *classifier)


if __name__ == "__main__":
    # Full-size VGG: one (num_convs, out_channels) pair per block.
    conv_arch = ((1, 64), (1, 128), (2, 256), (2, 512), (2, 512))
    net = vgg(conv_arch)

    # Report the output shape after each top-level module for one random
    # 224x224 sample.
    sample = torch.randn(size=(1, 1, 224, 224))
    for module in net:
        sample = module(sample)
        print(type(module).__name__, "output shape: ", sample.shape)

    # VGG-11 is more expensive than AlexNet, so train a narrower network
    # whose channel counts are divided by `ratio`; that is enough for this
    # dataset.
    ratio = 4
    small_conv_arch = [(num_convs, channels // ratio) for num_convs, channels in conv_arch]
    net = vgg(small_conv_arch)

    lr = 0.05
    num_epochs = 10
    batch_size = 128
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224)
    d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())
    d2l.plt.show()  # display the training curves

output

Sequential output shape:  torch.Size([1, 64, 112, 112])
Sequential output shape:  torch.Size([1, 128, 56, 56])
Sequential output shape:  torch.Size([1, 256, 28, 28])
Sequential output shape:  torch.Size([1, 512, 14, 14])
Sequential output shape:  torch.Size([1, 512, 7, 7])
Flatten output shape:  torch.Size([1, 25088])
Linear output shape:  torch.Size([1, 4096])
ReLU output shape:  torch.Size([1, 4096])
Dropout output shape:  torch.Size([1, 4096])
Linear output shape:  torch.Size([1, 4096])
ReLU output shape:  torch.Size([1, 4096])
Dropout output shape:  torch.Size([1, 4096])
Linear output shape:  torch.Size([1, 10])
training on cpu
loss 0.178, train acc 0.934, test acc 0.923
49.2 examples/sec on cpu

网络中的网络NIN

原理及与VGG的比较见8.3. Network in Network (NiN) — Dive into Deep Learning 1.0.3 documentation (d2l.ai)

code

import torch
from torch import nn
from d2l import torch as d2l


# Network in Network (NiN)
def nin_block(in_channels, out_channels, kernel_size, strides, padding):
    """
    Build one NiN block.

    The block is a convolution with the given kernel size, stride and padding,
    followed by two 1x1 convolutions; each convolution is followed by a ReLU.

    :param in_channels: number of input channels of the first convolution
    :param out_channels: number of output channels of every convolution
    :param kernel_size: kernel size of the first convolution
    :param strides: stride of the first convolution
    :param padding: padding of the first convolution
    :return: ``nn.Sequential`` with the three conv/ReLU pairs
    """
    layers = [
        nn.Conv2d(in_channels, out_channels, kernel_size, strides, padding),
        nn.ReLU(),
    ]
    # Two 1x1 convolutions that keep the channel count unchanged.
    for _ in range(2):
        layers += [nn.Conv2d(out_channels, out_channels, kernel_size=1), nn.ReLU()]
    return nn.Sequential(*layers)


if __name__ == "__main__":
    # NiN model: NiN blocks separated by 3x3 max-pooling layers of stride 2.
    layers = [
        nin_block(1, 96, kernel_size=11, strides=4, padding=0),
        nn.MaxPool2d(3, stride=2),
        nin_block(96, 256, kernel_size=5, strides=1, padding=2),
        nn.MaxPool2d(3, stride=2),
        nin_block(256, 384, kernel_size=3, strides=1, padding=1),
        nn.MaxPool2d(3, stride=2),
        nn.Dropout(0.5),
        # The last block outputs one channel per label class (10 classes).
        nin_block(384, 10, kernel_size=3, strides=1, padding=1),
        nn.AdaptiveAvgPool2d((1, 1)),
        # Turn the 4-D output into a 2-D one of shape (batch size, 10).
        nn.Flatten(),
    ]
    net = nn.Sequential(*layers)

    # Check the output shape of every top-level module on one random sample.
    sample = torch.rand(size=(1, 1, 224, 224))
    for module in net:
        sample = module(sample)
        print(type(module).__name__, "output shape: ", sample.shape)

    # Train on Fashion-MNIST resized to 224x224.
    lr = 0.1
    num_epochs = 10
    batch_size = 128
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224)
    d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())
    d2l.plt.show()  # display the training curves

output

Sequential output shape:  torch.Size([1, 96, 54, 54])
MaxPool2d output shape:  torch.Size([1, 96, 26, 26])
Sequential output shape:  torch.Size([1, 256, 26, 26])
MaxPool2d output shape:  torch.Size([1, 256, 12, 12])
Sequential output shape:  torch.Size([1, 384, 12, 12])
MaxPool2d output shape:  torch.Size([1, 384, 5, 5])
Dropout output shape:  torch.Size([1, 384, 5, 5])
Sequential output shape:  torch.Size([1, 10, 5, 5])
AdaptiveAvgPool2d output shape:  torch.Size([1, 10, 1, 1])
Flatten output shape:  torch.Size([1, 10])
training on cpu
loss 0.335, train acc 0.875, test acc 0.881
42.1 examples/sec on cpu

含并行连接的网络GoogLeNet

GoogLeNet的一个重要观点是：有时使用不同大小的卷积核组合是有利的。在GoogLeNet中，基本的卷积块被称为Inception块。一个Inception块的示例如下：

这个Inception块由四条并行路径组成，前三条路径使用窗口大小为1×1、3×3和5×5的卷积层，从不同空间大小中提取信息。中间的两条路径在输入上执行1 × 1卷积，以减少通道数，从而降低模型的复杂性。第四条路径使用3 × 3最大汇聚层，然后使用1 × 1卷积层来改变通道数。这四条路径都使用合适的填充来使输入与输出的高和宽一致，最后我们将每条线路的输出在通道维度上连结，并构成Inception块的输出。在Inception块中，通常调整的超参数是每层输出通道数。

现在我们来实现这样一个GoogLeNet：

code

import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l


class Inception(nn.Module):
    """
    Inception block with four parallel paths joined along the channel axis.

    ``c1`` and ``c4`` are the output channel counts of paths 1 and 4.
    ``c2`` and ``c3`` are pairs: (channels after the 1x1 convolution,
    channels after the 3x3 or 5x5 convolution) for paths 2 and 3.
    Extra keyword arguments are forwarded to ``nn.Module.__init__``.
    """

    def __init__(self, in_channels, c1, c2, c3, c4, **kwargs):
        super().__init__(**kwargs)
        reduce2, out2 = c2[0], c2[1]
        reduce3, out3 = c3[0], c3[1]
        # Path 1: a single 1x1 convolution.
        self.p1_1 = nn.Conv2d(in_channels, c1, kernel_size=1)
        # Path 2: 1x1 convolution followed by a 3x3 convolution (padding 1).
        self.p2_1 = nn.Conv2d(in_channels, reduce2, kernel_size=1)
        self.p2_2 = nn.Conv2d(reduce2, out2, kernel_size=3, padding=1)
        # Path 3: 1x1 convolution followed by a 5x5 convolution (padding 2).
        self.p3_1 = nn.Conv2d(in_channels, reduce3, kernel_size=1)
        self.p3_2 = nn.Conv2d(reduce3, out3, kernel_size=5, padding=2)
        # Path 4: 3x3 max pooling (stride 1, padding 1) followed by a 1x1
        # convolution.
        self.p4_1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
        self.p4_2 = nn.Conv2d(in_channels, c4, kernel_size=1)

    def forward(self, x):
        """Run the four paths on ``x`` and concatenate their outputs on dim 1."""
        # Path 1: 1x1 convolution + ReLU.
        branch1 = F.relu(self.p1_1(x))
        # Path 2: 1x1 convolution + ReLU, then 3x3 convolution + ReLU.
        hidden2 = F.relu(self.p2_1(x))
        branch2 = F.relu(self.p2_2(hidden2))
        # Path 3: 1x1 convolution + ReLU, then 5x5 convolution + ReLU.
        hidden3 = F.relu(self.p3_1(x))
        branch3 = F.relu(self.p3_2(hidden3))
        # Path 4: max pooling (no activation), then 1x1 convolution + ReLU.
        pooled = self.p4_1(x)
        branch4 = F.relu(self.p4_2(pooled))
        # Concatenate the path outputs along the channel dimension.
        return torch.cat([branch1, branch2, branch3, branch4], dim=1)

if __name__ == "__main__":
    # Build GoogLeNet stage by stage.
    # Stage 1: 7x7 convolution (stride 2) followed by 3x3 max pooling (stride 2).
    stage1 = nn.Sequential(
        nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
    )
    # Stage 2: 1x1 convolution, 3x3 convolution, then max pooling.
    stage2 = nn.Sequential(
        nn.Conv2d(64, 64, kernel_size=1),
        nn.ReLU(),
        nn.Conv2d(64, 192, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
    )
    # Stage 3: two Inception blocks, then max pooling.
    stage3 = nn.Sequential(
        Inception(192, 64, (96, 128), (16, 32), 32),
        Inception(256, 128, (128, 192), (32, 96), 64),
        nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
    )
    # Stage 4: five Inception blocks, then max pooling.
    stage4 = nn.Sequential(
        Inception(480, 192, (96, 208), (16, 48), 64),
        Inception(512, 160, (112, 224), (24, 64), 64),
        Inception(512, 128, (128, 256), (24, 64), 64),
        Inception(512, 112, (144, 288), (32, 64), 64),
        Inception(528, 256, (160, 320), (32, 128), 128),
        nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
    )
    # Stage 5: two Inception blocks, global average pooling, then flatten.
    stage5 = nn.Sequential(
        Inception(832, 256, (160, 320), (32, 128), 128),
        Inception(832, 384, (192, 384), (48, 128), 128),
        nn.AdaptiveAvgPool2d((1, 1)),
        nn.Flatten(),
    )
    # Full model: the five stages plus a 10-way linear output layer.
    net = nn.Sequential(stage1, stage2, stage3, stage4, stage5, nn.Linear(1024, 10))

    # GoogLeNet is expensive to compute, so the input height and width are
    # reduced from 224 to 96.
    sample = torch.rand(size=(1, 1, 96, 96))
    for module in net:
        # Feed the sample through the top-level modules in order.
        sample = module(sample)
        print(type(module).__name__, "output shape: ", sample.shape)

    # Train on Fashion-MNIST resized to 96x96.
    lr = 0.1
    num_epochs = 10
    batch_size = 128
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
    d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())
    d2l.plt.show()  # display the training curves

output

Sequential output shape:  torch.Size([1, 64, 24, 24])
Sequential output shape:  torch.Size([1, 192, 12, 12])
Sequential output shape:  torch.Size([1, 480, 6, 6])
Sequential output shape:  torch.Size([1, 832, 3, 3])
Sequential output shape:  torch.Size([1, 1024])
Linear output shape:  torch.Size([1, 10])
training on cpu
loss 0.272, train acc 0.896, test acc 0.875
126.8 examples/sec on cpu

批量规范化

训练深层神经网络十分困难，特别是希望在短时间内使它们收敛。批量规范化是一种有效的技术，可以加速深层神经网络的收敛。

理论部分见8.5. Batch Normalization — Dive into Deep Learning 1.0.3 documentation (d2l.ai)

从零实现批量规范化层

下面基于张量运算，从零开始实现一个批量规范化层。

code

import torch
from torch import nn
from d2l import torch as d2l


# Batch normalization
def batch_norm(X, gamma, beta, moving_mean, moving_tar, eps, momentum):
    """
    Apply batch normalization to ``X`` and return the running statistics.

    Prediction mode is detected with ``torch.is_grad_enabled()``: when
    gradients are disabled, the running statistics passed in are used for
    normalization and returned unchanged. Otherwise the statistics of the
    current batch are used and the running statistics are updated as
    ``momentum * old + (1 - momentum) * batch``.

    :param X: input of rank 2 (statistics over dim 0) or rank 4 (statistics
        over dims 0, 2 and 3, i.e. per channel on dim 1)
    :param gamma: scale applied to the normalized input
    :param beta: shift added after scaling
    :param moving_mean: running mean
    :param moving_tar: running variance (the parameter name is kept as-is so
        existing callers keep working)
    :param eps: small constant added to the variance before the square root
    :param momentum: weight of the old running statistics in the update
    :return: tuple ``(Y, moving_mean, moving_var)`` where the running
        statistics are detached from the autograd graph
    """
    if not torch.is_grad_enabled():
        # Prediction mode: normalize with the running statistics as given.
        X_hat = (X - moving_mean) / torch.sqrt(moving_tar + eps)
        # The running variance is unchanged in this mode; bind it so the
        # return statement below has a value for it.
        moving_var = moving_tar
    else:
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:
            # Fully connected case: statistics over the batch dimension.
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # 2-D convolution case: per-channel statistics (channel is dim 1).
            # keepdim=True keeps the shape broadcastable against X.
            mean = X.mean(dim=(0, 2, 3), keepdim=True)
            var = ((X - mean) ** 2).mean(dim=(0, 2, 3), keepdim=True)
        # Training mode: normalize with the current batch statistics.
        X_hat = (X - mean) / torch.sqrt(var + eps)
        # Update the running mean and variance.
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
        moving_var = momentum * moving_tar + (1.0 - momentum) * var
    Y = gamma * X_hat + beta
    return Y, moving_mean.detach(), moving_var.detach()


# BatchNorm layer
class BatchNorm(nn.Module):
    """
    Batch normalization layer built on the ``batch_norm`` function.

    ``num_features`` is the number of outputs of a fully connected layer or
    the number of output channels of a convolutional layer. ``num_dims`` is 2
    for a fully connected layer; any other value selects the 4-D
    (convolutional) parameter shape.
    """

    def __init__(self, num_features, num_dims):
        super().__init__()
        shape = (1, num_features) if num_dims == 2 else (1, num_features, 1, 1)
        # Learnable scale and shift, initialized to 1 and 0 respectively.
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))
        # Running statistics are plain tensors (not model parameters),
        # initialized to 0 (mean) and 1 (variance).
        self.moving_mean = torch.zeros(shape)
        self.moving_var = torch.ones(shape)

    def forward(self, X):
        """Normalize ``X`` and store the updated running statistics."""
        target = X.device
        # Copy the running statistics to X's device when they live elsewhere.
        if self.moving_mean.device != target:
            self.moving_mean = self.moving_mean.to(target)
            self.moving_var = self.moving_var.to(target)
        Y, new_mean, new_var = batch_norm(
            X, self.gamma, self.beta, self.moving_mean, self.moving_var, eps=1e-5, momentum=0.9
        )
        # Keep the statistics returned by batch_norm for the next call.
        self.moving_mean, self.moving_var = new_mean, new_var
        return Y


if __name__ == "__main__":
    # LeNet-style network with the custom BatchNorm layer inserted before
    # every sigmoid activation.
    conv_part = [
        nn.Conv2d(1, 6, kernel_size=5),
        BatchNorm(6, num_dims=4),
        nn.Sigmoid(),
        nn.AvgPool2d(kernel_size=2, stride=2),
        nn.Conv2d(6, 16, kernel_size=5),
        BatchNorm(16, num_dims=4),
        nn.Sigmoid(),
        nn.AvgPool2d(kernel_size=2, stride=2),
    ]
    dense_part = [
        nn.Flatten(),
        nn.Linear(16 * 4 * 4, 120),
        BatchNorm(120, num_dims=2),
        nn.Sigmoid(),
        nn.Linear(120, 84),
        BatchNorm(84, num_dims=2),
        nn.Sigmoid(),
        nn.Linear(84, 10),
    ]
    net = nn.Sequential(*conv_part, *dense_part)

    # Train on Fashion-MNIST.
    lr = 1.0
    num_epochs = 10
    batch_size = 256
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)
    d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())
    d2l.plt.show()  # display the training curves

output

# TODO

简洁实现的批量规范化层

code

from torch import nn
from d2l import torch as d2l


if __name__ == "__main__":
    # Same LeNet-style network, using the built-in batch normalization layers.
    conv_part = [
        nn.Conv2d(1, 6, kernel_size=5),
        nn.BatchNorm2d(6),
        nn.Sigmoid(),
        nn.AvgPool2d(kernel_size=2, stride=2),
        nn.Conv2d(6, 16, kernel_size=5),
        nn.BatchNorm2d(16),
        nn.Sigmoid(),
        nn.AvgPool2d(kernel_size=2, stride=2),
    ]
    dense_part = [
        nn.Flatten(),
        nn.Linear(256, 120),
        nn.BatchNorm1d(120),
        nn.Sigmoid(),
        nn.Linear(120, 84),
        nn.BatchNorm1d(84),
        nn.Sigmoid(),
        nn.Linear(84, 10),
    ]
    net = nn.Sequential(*conv_part, *dense_part)

    # Train on Fashion-MNIST.
    lr = 1.0
    num_epochs = 10
    batch_size = 256
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())
    d2l.plt.show()  # display the training curves

output

training on cpu
loss 0.271, train acc 0.900, test acc 0.729
23296.6 examples/sec on cpu

残差网络ResNet

组件：

残差块
ResNet模型

原理见8.6. Residual Networks (ResNet) and ResNeXt — Dive into Deep Learning 1.0.3 documentation (d2l.ai)（比较抽象）

code

import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l


# Residual block
class Residual(nn.Module):
    """
    Residual block: two 3x3 convolutions, each followed by batch
    normalization, plus a shortcut that is added before the final ReLU.

    With ``use_1x1conv=True`` the shortcut goes through a 1x1 convolution
    (with stride ``strides``) so that its channels and resolution match the
    main path. Otherwise the input is added unchanged.
    """

    def __init__(self, input_channels, num_channels, use_1x1conv=False, strides=1):
        super().__init__()
        self.conv1 = nn.Conv2d(
            input_channels, num_channels, kernel_size=3, padding=1, stride=strides
        )
        self.conv2 = nn.Conv2d(num_channels, num_channels, kernel_size=3, padding=1)
        # Optional 1x1 convolution on the shortcut; None means identity shortcut.
        self.conv3 = (
            nn.Conv2d(input_channels, num_channels, kernel_size=1, stride=strides)
            if use_1x1conv
            else None
        )
        self.bn1 = nn.BatchNorm2d(num_channels)
        self.bn2 = nn.BatchNorm2d(num_channels)

    def forward(self, X):
        """Return ``relu(main_path(X) + shortcut(X))``."""
        main = F.relu(self.bn1(self.conv1(X)))
        main = self.bn2(self.conv2(main))
        shortcut = X
        if self.conv3 is not None:
            shortcut = self.conv3(shortcut)
        # Add the shortcut before applying the final ReLU.
        main += shortcut
        return F.relu(main)


# ResNet uses four modules made of residual blocks; each module uses several
# residual blocks with the same number of output channels.
# The first module keeps the channel count of its input. A max-pooling layer
# with stride 2 has already been applied, so height and width are not reduced.
# In every later module the first residual block doubles the channel count of
# the previous module and halves the height and width.
def resnet_block(input_channels, num_channels, num_residuals, first_block=False):
    """
    Build the residual blocks of one ResNet module.

    :param input_channels: number of channels entering the module
    :param num_channels: number of channels produced by every residual block
    :param num_residuals: number of residual blocks in the module
    :param first_block: True for the first module, whose first block neither
        downsamples nor uses the 1x1 shortcut convolution
    :return: list of ``Residual`` blocks
    """
    def make_residual(index):
        # Only the leading block of a non-first module changes the channel
        # count and halves the resolution.
        if index == 0 and not first_block:
            return Residual(input_channels, num_channels, use_1x1conv=True, strides=2)
        return Residual(num_channels, num_channels)

    return [make_residual(index) for index in range(num_residuals)]


if __name__ == "__main__":
    # Residual block whose input and output shapes are the same.
    blk = Residual(3, 3)
    X = torch.rand(4, 3, 6, 6)
    print(blk(X).shape)

    # Residual block that doubles the channels and halves height and width.
    blk = Residual(3, 6, use_1x1conv=True, strides=2)
    print(blk(X).shape)

    # The first layers match GoogLeNet: a 7x7 convolution with 64 output
    # channels and stride 2, followed by a 3x3 max-pooling layer with stride 2.
    # The difference is the batch normalization layer after the convolution.
    stem = nn.Sequential(
        nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
        nn.BatchNorm2d(64),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
    )
    # Four modules of two residual blocks each; only the first module keeps
    # the channel count and resolution of its input.
    stages = [nn.Sequential(*resnet_block(64, 64, 2, first_block=True))]
    for in_channels, out_channels in ((64, 128), (128, 256), (256, 512)):
        stages.append(nn.Sequential(*resnet_block(in_channels, out_channels, 2)))
    net = nn.Sequential(
        stem,
        *stages,
        nn.AdaptiveAvgPool2d((1, 1)),
        nn.Flatten(),
        nn.Linear(512, 10),
    )

    # Observe how the output shape changes across the top-level modules.
    # This trace uses a 224x224 sample, while training below resizes to 96x96.
    sample = torch.rand(size=(1, 1, 224, 224))
    for module in net:
        sample = module(sample)
        print(type(module).__name__, "output shape: ", sample.shape)

    # Train on Fashion-MNIST resized to 96x96.
    lr = 0.05
    num_epochs = 10
    batch_size = 256
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
    d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())
    d2l.plt.show()  # display the training curves

output

# TODO

稠密连接网络DenseNet

组件：

稠密块体
过渡层
DenseNet模型

原理见8.7. Densely Connected Networks (DenseNet) — Dive into Deep Learning 1.0.3 documentation (d2l.ai)

code

import torch
from torch import nn
from d2l import torch as d2l


# DenseNet uses the modified ResNet ordering of
# batch normalization -> activation -> convolution.
def conv_block(input_channels, num_channels):
    """
    Build a batch-normalization, ReLU and 3x3-convolution block.

    :param input_channels: number of channels of the input
    :param num_channels: number of channels produced by the convolution
    :return: ``nn.Sequential`` of BatchNorm2d, ReLU and Conv2d (padding 1)
    """
    norm = nn.BatchNorm2d(input_channels)
    activation = nn.ReLU()
    conv = nn.Conv2d(input_channels, num_channels, kernel_size=3, padding=1)
    return nn.Sequential(norm, activation, conv)


# Dense block
class DenseBlock(nn.Module):
    """
    Dense block made of ``num_convs`` convolution blocks.

    Each convolution block produces ``num_channels`` channels, and its output
    is concatenated to its input along the channel dimension. The i-th block
    therefore receives ``input_channels + i * num_channels`` channels.
    """

    def __init__(self, num_convs, input_channels, num_channels):
        super().__init__()
        blocks = [
            conv_block(input_channels + index * num_channels, num_channels)
            for index in range(num_convs)
        ]
        self.net = nn.Sequential(*blocks)

    def forward(self, X):
        """Return ``X`` with the output of every convolution block appended on dim 1."""
        features = X
        for block in self.net:
            new_features = block(features)
            # Concatenate each block's input and output along the channels.
            features = torch.cat((features, new_features), dim=1)
        return features


# Every dense block increases the number of channels, and using too many of
# them makes the model overly complex. A transition layer controls that
# complexity: a 1x1 convolution reduces the number of channels, and an average
# pooling layer with stride 2 halves the height and width.
def transition_block(input_channels, num_channels):
    """
    Build a transition layer.

    :param input_channels: number of channels of the input
    :param num_channels: number of channels produced by the 1x1 convolution
    :return: ``nn.Sequential`` of BatchNorm2d, ReLU, 1x1 Conv2d and 2x2 AvgPool2d
    """
    layers = [
        nn.BatchNorm2d(input_channels),
        nn.ReLU(),
        nn.Conv2d(input_channels, num_channels, kernel_size=1),
        nn.AvgPool2d(kernel_size=2, stride=2),
    ]
    return nn.Sequential(*layers)


if __name__ == "__main__":
    # Dense block with 2 convolution blocks of 10 channels on a 3-channel input.
    dense = DenseBlock(2, 3, 10)
    X = torch.randn(4, 3, 8, 8)
    Y = dense(X)
    print(Y.shape)  # 4, 23, 8, 8

    # Apply a transition layer with 10 output channels to the dense block's
    # output; height and width are halved.
    transition = transition_block(23, 10)
    print(transition(Y).shape)  # 4, 10, 4, 4

    # DenseNet model: the stem matches the one used for ResNet.
    stem = nn.Sequential(
        nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
        nn.BatchNorm2d(64),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
    )
    # num_channels tracks the current number of channels.
    num_channels = 64
    growth_rate = 32
    num_convs_in_dense_block = [4, 4, 4, 4]
    last_index = len(num_convs_in_dense_block) - 1
    blks = []
    for index, num_convs in enumerate(num_convs_in_dense_block):
        blks.append(DenseBlock(num_convs, num_channels, growth_rate))
        # Number of channels leaving the dense block just added.
        num_channels += num_convs * growth_rate
        if index == last_index:
            continue
        # Between dense blocks, add a transition layer that halves the
        # number of channels.
        halved = num_channels // 2
        blks.append(transition_block(num_channels, halved))
        num_channels = halved

    # As with ResNet, finish with global pooling and a fully connected layer.
    net = nn.Sequential(
        stem,
        *blks,
        nn.BatchNorm2d(num_channels),
        nn.ReLU(),
        nn.AdaptiveAvgPool2d((1, 1)),
        nn.Flatten(),
        nn.Linear(num_channels, 10),
    )

    # Train on Fashion-MNIST resized to 96x96.
    lr = 0.1
    num_epochs = 10
    batch_size = 256
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
    d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())
    d2l.plt.show()  # display the training curves

output

torch.Size([4, 23, 8, 8])
torch.Size([4, 10, 4, 4])
training on cpu
loss 0.143, train acc 0.947, test acc 0.906
142.7 examples/sec on cpu

(•‿•)