# PyTorch 深度学习

基于小土堆:https://www.bilibili.com/video/BV1hE411t7RN

# 安装

基于 conda 环境来安装

conda create --name pytorch python=3.11

查询已有环境

conda info --envs

conda environments:
base * D:\anaconda
pytorch D:\anaconda\envs\pytorch

激活 pytorch 环境

conda activate pytorch

其他命令

conda remove -n xxxxx(名字) --all	# 环境删除命令
deactivate	# 退出虚拟环境
pip list	# 查看虚拟环境的库

访问 pytorch 网站:https://pytorch.org/get-started/locally/


截至 2025 年 8 月 Start Locally 条目给予 python 版本提示:

NOTE: Latest PyTorch requires Python 3.9 or later.


基于:Stable (2.7.1) - Windows - Pip - Python - CUDA 11.8

pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
(pytorch) C:\Users\Karry>pip list
Package           Version
----------------- ------------
filelock          3.13.1
fsspec            2024.6.1
Jinja2            3.1.4
MarkupSafe        2.1.5
mpmath            1.3.0
networkx          3.3
numpy             2.1.2
pillow            11.0.0
pip               25.1
setuptools        78.1.1
sympy             1.13.3
torch             2.7.1+cu118
torchaudio        2.7.1+cu118
torchvision       0.22.1+cu118
typing_extensions 4.12.2
wheel             0.45.1

验证:

import torch
torch.cuda.is_available()	# True

# 两大法宝函数 - dir 与 help

dir 主要是来查看一个工具包下还有什么子工具包或者工具

help 主要是查看一些工具有什么作用

比如 help (torch.cuda.is_available)

help(torch.cuda.is_available)
Help on function is_available in module torch.cuda:
is_available() -> bool
    Return a bool indicating if CUDA is currently available.

# PyTorch 数据读取

数据 —— Dataset(提供一种方式获取 label) —— Dataloader(为后面的网络提供不同的数据形式)

组织结构:

+—— hymenoptera_data
| +—— train
| | —— ants
| | —— bees

对应代码:

from torch.utils.data import Dataset
from PIL import Image
import os
# MyData is a custom dataset class; `MyData(Dataset)` expresses the inheritance from Dataset.
class MyData(Dataset):
    """Image-folder dataset: every image under root_dir/label_dir shares the label `label_dir`."""
    # Initialize the paths; __init__ overrides the parent-class hook.
    def __init__(self, root_dir, label_dir):
        # dataset root directory
        self.root_dir = root_dir
        # class-folder name; it doubles as the label of every image inside it
        self.label_dir = label_dir
        # directory that actually holds the image files
        self.path = os.path.join(self.root_dir, self.label_dir)
        # list of image file names under that directory
        self.img_path = os.listdir(self.path)
    # Required by Dataset: return one sample — a (PIL image, label-string) pair.
    def __getitem__(self, idx):
        img_name = self.img_path[idx]
        img_item_path = os.path.join(self.root_dir, self.label_dir, img_name)
        img = Image.open(img_item_path)
        label = self.label_dir
        return img, label
    def __len__(self):
        # number of images in the folder
        return len(self.img_path)
root_dir = "hymenoptera_data\\train"
ant_label_dir = "ants"
bees_label_dir = "bees"
ants_dataset = MyData(root_dir, ant_label_dir)
bees_dataset = MyData(root_dir, bees_label_dir)
# Dataset supports `+` (ConcatDataset): ants samples first, then bees.
train_dataset = ants_dataset + bees_dataset

组织结构:

+—— hymenoptera_data_ex
| +—— train
| | —— ants_image
| | —— ants_label
| | —— bees_image
| | —— bees_label

对应代码:

from torch.utils.data import Dataset
from PIL import Image
import os
class MyData(Dataset):
    """Dataset whose images and labels live in two parallel folders.

    root_dir/img_dir holds the images; root_dir/label_dir holds one text
    file per image whose content is that image's label.
    """
    def __init__(self, root_dir, img_dir, label_dir):
        self.root_dir = root_dir
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.path_img = os.path.join(self.root_dir, self.img_dir)
        self.path_label = os.path.join(self.root_dir, self.label_dir)
        self.img_path = os.listdir(self.path_img)
        self.label_path = os.listdir(self.path_label)
    def __getitem__(self, item):
        # Image and label files are matched by list position, so both folders
        # must contain the same number of identically ordered files.
        img_name = self.img_path[item]
        img_item_path = os.path.join(self.path_img, img_name)
        img = Image.open(img_item_path)
        label_name = self.label_path[item]
        label_item_path = os.path.join(self.path_label, label_name)
        with open(label_item_path, "r", encoding="utf-8") as f:
            label = f.read()
            return img, label
    def __len__(self):
        # Fix: a map-style Dataset must implement __len__ — DataLoader and the
        # default sampler rely on it; the original class omitted it.
        return len(self.img_path)
root_dir = "hymenoptera_data_ex\\train"
img_dir = "ants_image"
label_dir = "ants_label"
ants_dataset = MyData(root_dir, img_dir, label_dir)

# Tensorboard 的使用

# add_scalar

from torch.utils.tensorboard import SummaryWriter
# event files are written into the "logs" directory
writer = SummaryWriter("logs")
# writer.add_image()
# y = x
for i in range(100):
    # add_scalar(tag, scalar_value, global_step): plots y = i^2 against step i
    writer.add_scalar("y=i^2", i * i, i)
writer.close()
tensorboard --logdir=logs --port=6007

1.png

writer.add_scalar (“y=i^2”, i * i, i),这个的参数分别是:标签、y、x

# add_image

import numpy as np
from PIL import Image
from torch.utils.tensorboard import SummaryWriter
img_path = "hymenoptera_data/train/ants/0013035.jpg"
img_PIL = Image.open(img_path)
# add_image cannot take a PIL image directly, so convert it to a numpy ndarray first
img_array = np.array(img_PIL)
# hymenoptera_data/train/ants/0013035.jpg
writer = SummaryWriter("logs")
# dataformats="HWC": this ndarray is laid out Height x Width x Channels
writer.add_image("img", img_array, 1,dataformats="HWC")
# y = x
for i in range(100):
    writer.add_scalar("y=i^2", i * i, i)
writer.close()

2.png

add_image (“img”, img_array, 1,dataformats=“HWC”),这个参数分别是:标签、ndarray 类型的图片、步骤、dataformats-HWC

# Transforms 的使用

Transforms 主要是对图片的各种变换,属于数据预处理步骤。

from PIL import Image
from torchvision import transforms
# Image.open returns a PIL image object
img = Image.open('hymenoptera_data/train/ants/0013035.jpg')
# ToTensor() builds a callable transform object
tensor_trans = transforms.ToTensor()
# calling it converts the PIL image into a torch.Tensor
tensor_img = tensor_trans(img)
print(tensor_img)

Image.open 返回了 PIL 类型的图片,通过 transforms.ToTensor () 创建了工具对象,最后使用 tensor_trans (img) 转化为 tensor 类型

tensor 数据类型:包装了神经网络所需要的理论基础参数

使用 SummaryWriter 将 tensor_img 写入

from PIL import Image
from torch.utils.tensorboard import SummaryWriter
from torchvision import transforms
img = Image.open('hymenoptera_data/train/ants/0013035.jpg')
tensor_trans = transforms.ToTensor()
# ToTensor yields CHW data, which add_image accepts without a dataformats argument
tensor_img = tensor_trans(img)
# print(tensor_img)
writer = SummaryWriter('logs')
writer.add_image('tensor_img', tensor_img)
writer.close()

3.png


题外话,理解一下 py 的面向对象:

class Person:
    """Minimal demo of Python OO hooks: constructor, __call__, and a plain method."""

    def __init__(self, name):
        # Constructor: runs when Person("...") is evaluated.
        self.name = name

    def __call__(self):
        # Makes an instance callable like a function: person().
        print("hello", self.name)

    def hello(self):
        # Ordinary bound method, invoked as person.hello().
        print("hello_ex", self.name)


person = Person("Karry")
person()        # triggers __call__
person.hello()  # plain method call

init 方法实际上是一个构造方法,person = Person (“Karry”) 执行后调用构造方法

call 方法像初次见面问好一样 person () 执行后,调用 call 方法,是让实例像函数一样用的钩子方法

person.hello () 只是一个类的普通方法


# ToTensor 的使用

ToTensor 图片张量化工具

from PIL import Image
from torch.utils.tensorboard import SummaryWriter
from torchvision import transforms
writer = SummaryWriter('logs')
img = Image.open('images/pytorch.png')
# ToTensor converts a PIL image (or ndarray) into a torch.Tensor
toTensorTools = transforms.ToTensor()
img_tensor = toTensorTools(img)
writer.add_image('ToTensor', img_tensor)
writer.close()

# Normalize 的使用

Normalize 归一化、标准化,均值为 0,方差为 1,数值位于 - 1 到 1 之间

如果图片不是 RGB 模式需要做 img.convert (‘RGB’)

writer = SummaryWriter('logs')
img = Image.open('images/pytorch.png')
# convert img to RGB (Normalize below expects exactly 3 channels)
img = img.convert('RGB')
toTensorTools = transforms.ToTensor()
img_tensor = toTensorTools(img)
writer.add_image('ToTensor', img_tensor)
print(img_tensor[0][0][0])
# normalization: output = (input - mean) / std, per channel
transforms_normalize = transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
img_normalize = transforms_normalize(img_tensor)
print(img_normalize[0][0][0])
writer.close()

tensor(0.1333)
tensor(-0.7333)

0.1333*2 - 1 = -0.7333

# Resize 的使用

Resize 重调整

# Using Resize
print(img.size)
# Resize([h, w]) scales to the exact 128x128 size
transforms_resize = transforms.Resize([128, 128])
img_resize = transforms_resize(img_tensor)
writer.add_image('Resize', img_resize, 0)
print(img_resize.size())
# a single int resizes the shorter edge to 512, keeping the aspect ratio
resize_2 = transforms.Resize(512)
# Compose chains transforms: resize the PIL image first, then convert to tensor
transforms_compose = transforms.Compose([resize_2, toTensorTools])
img_resize_2 = transforms_compose(img)
writer.add_image('Compose', img_resize_2, 1)

起初我们使用 img = Image.open (‘images/pytorch.png’),此时这是一个 PIL 图片,然后我们使用 toTensorTools = transforms.ToTensor () 创建张量转换工具,使用 img_tensor = toTensorTools (img) 将 PIL 图片转化为 img_tensor 张量图片,紧接着我们使用 transforms_resize = transforms.Resize ([128, 128]) 创建尺寸调整工具,使用 img_resize = transforms_resize (img_tensor) 对张量图片重调整。

Compose 的意义在于它可以将多个图像变换操作(如缩放、裁剪、归一化等)按顺序组合成一个流水线,输入图像会依次通过这些变换。

我们使用 transforms_compose = transforms.Compose ([resize_2, toTensorTools]) 创建了一个工具链,resize_2 用于调整图像尺寸,toTensorTools 用于其转化为张量图片。

从 img_resize_2 = transforms_compose (img) 我们可以看到,参数 img 是一个 PIL 图片他通过 Compose 先后进行了重调整和张量化,最终返回 img_resize_2 张量图片。


提示:resize_2 = transforms.Resize (512) 是一个等比例调整。

tensor(0.4039)
tensor(0.6797)
(3600, 2700)
torch.Size([3, 128, 128])
torch.Size([3, 512, 682])


# RandomCrop 的使用

RandomCrop 随机裁剪

# Using RandomCrop
# random_crop = transforms.RandomCrop(500, 1000)
random_crop = transforms.RandomCrop(512)
compose_random_crop = transforms.Compose([random_crop, toTensorTools])
# each iteration crops a different random 512x512 patch from the same image
for i in range(10):
    img_random_crop = compose_random_crop(img)
    writer.add_image('RandomCrop', img_random_crop, i)

# 使用 TorchVision 的数据集(DataSet)

CIFAR —— Canadian Institute For Advanced Research(加拿大高级研究所)

root 数据集所在目录、train 是训练集还是测试集、transform 应用的变换操作或操作集合、download 是否启用下载

train_set = torchvision.datasets.CIFAR10(root="./data", train=True, transform=transforms_compose_dataset, download=True)
test_set = torchvision.datasets.CIFAR10(root="./data", train=False, transform=transforms_compose_dataset, download=True)

4.png

CIFAR-10 and CIFAR-100 datasets

The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, with 6000 images per class. There are 50000 training images and 10000 test images.

CIFAR-10 数据集由 10 类的 60000 张 32x32 彩色图像组成,每类 6000 张图像。有 50000 张训练图像和 10000 张测试图像。

The dataset is divided into five training batches and one test batch, each with 10000 images. The test batch contains exactly 1000 randomly-selected images from each class. The training batches contain the remaining images in random order, but some training batches may contain more images from one class than another. Between them, the training batches contain exactly 5000 images from each class.

数据集分为五个训练批次和一个测试批次,每个训练批次有 10000 张图像。测试批次恰好包含每个类中随机选择的 1000 张图像。训练批次以随机顺序包含剩余的图像,但某些训练批次可能包含来自一个类的图像多于另一个类的图像。在它们之间,训练批次恰好包含每个类的 5000 张图像。

# 数据集联动 Tensorboard

import torchvision
from torch.utils.tensorboard import SummaryWriter
# write events to the "p10" log directory
tensorboard = SummaryWriter("p10")
transforms_compose_dataset = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])
# download CIFAR-10 into ./data; the transform converts each PIL image to a tensor
train_set = torchvision.datasets.CIFAR10(root="./data", train=True, transform=transforms_compose_dataset, download=True)
test_set = torchvision.datasets.CIFAR10(root="./data", train=False, transform=transforms_compose_dataset, download=True)
# each sample is a (tensor image, class-index) tuple
print(test_set[0])
for i in range(20):
    img, target = test_set[i]
    tensorboard.add_image("test_set", img, i)
tensorboard.close()

其中 img, target = test_set [i] 返回了一个元组

5.png

0 号位是转化过后的张量图,1 号位是其标签索引,数据的标签列表可以在 test_set.classes 中看到。

# add_image 源码提示

def add_image(
    self, tag, img_tensor, global_step=None, walltime=None, dataformats="CHW"
):

tag, img_tensor, global_step=None 分别对应 tensorboard 标签,张量图,以及步骤 i

tensorboard.add_image("test_set", img, i)

注意:tensorboard 使用过后须关闭 tensorboard.close ()

# DataLoader 的使用

torch.utils.data — PyTorch 2.8 documentation

参数初见:

  • dataset (Dataset) – dataset from which to load the data.

  • batch_size (int, optional) – how many samples per batch to load (default: 1 ).

  • shuffle (bool, optional) – set to True to have the data reshuffled at every epoch (default: False ).

  • batch_sampler (Sampler or Iterable*,* optional) – like sampler , but returns a batch of indices at a time. Mutually exclusive with batch_size , shuffle , sampler , and drop_last .

  • num_workers (int, optional) – how many subprocesses to use for data loading. 0 means that the data will be loaded in the main process. (default: 0 )

  • drop_last (bool, optional) – set to True to drop the last incomplete batch, if the dataset size is not divisible by the batch size. If False and the size of dataset is not divisible by the batch size, then the last batch will be smaller. (default: False )

# 准备的测试数据
test_data = torchvision.datasets.CIFAR10(root="./data", train=False, transform=torchvision.transforms.ToTensor(),
                                         download=True)
# 创建测试数据集
test_loader = DataLoader(dataset=test_data, batch_size=64, shuffle=True, num_workers=0, drop_last=True)

dataset 数据集、batch_size 一次打包多少个、shuffle 是否打乱、num_workers 加载数据子进程数、drop_last 多余部分是否删除

torch.Size([3, 32, 32])
3
cat

# 联动 Tensorboard

import torchvision.datasets
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
# test split of CIFAR-10, converted to tensors
test_data = torchvision.datasets.CIFAR10(root="./data", train=False, transform=torchvision.transforms.ToTensor(),
                                         download=True)
# batches of 64, reshuffled each epoch; the last incomplete batch is dropped
test_loader = DataLoader(dataset=test_data, batch_size=64, shuffle=True, num_workers=0, drop_last=True)
image, target = test_data[0]
# shape and label of the first test image
print(image.shape)
print(target)
print(test_data.classes[target])
print("____________________")
writer = SummaryWriter("DataLoader")
step = 0
for loaderX in test_loader:
    # each batch is a (images, targets) pair of stacked tensors
    images, targets = loaderX
    # print(images.shape)
    # print(targets)
    writer.add_images("test_data_drop_last", images, step)
    step += 1
writer.close()

# 利用 Epoch 变量控制训练或测试轮次

for epoch in range(2):
    step = 0
    for loaderX in test_loader:
        images, targets = loaderX
        # print(images.shape)
        # print(targets)
        writer.add_images("Epoch:{}".format(epoch), images, step)
        step += 1

"Epoch:{}".format(epoch) 是 Python 的 字符串格式化 方法,它会将 epoch 的值动态插入到字符串的 {} 占位符中。

6.png

此时当 shuffle=True 时,Epoch0 和 Epoch1 并不一样

test_loader = DataLoader(dataset=test_data, batch_size=64, shuffle=True, num_workers=0, drop_last=True)

7.png

# 神经网络 (Neural Network) 基本骨架

torch.nn — PyTorch 2.8 documentation

Module — PyTorch 2.8 documentation

import torch.nn as nn
import torch.nn.functional as F


class Model(nn.Module):
    """Two stacked 5x5 convolutions (1 -> 20 -> 20 channels), each followed by ReLU."""

    def __init__(self) -> None:
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, 5)
        self.conv2 = nn.Conv2d(20, 20, 5)

    def forward(self, x):
        # first conv + non-linearity, then second conv + non-linearity
        first = F.relu(self.conv1(x))
        second = F.relu(self.conv2(first))
        return second

forward 前向传播:

def forward(self, x):
        x = F.relu(self.conv1(x))
        return F.relu(self.conv2(x))

x 先经过一次 conv1 卷积,再经过一次 relu 非线性处理 x = F.relu(self.conv1(x))

然后 x 在经过一次 conv2 卷积,再经过一次 relu 非线性处理,最后返回 return F.relu(self.conv2(x))

# 简单的骨架

简单的骨架就是这样,有一个输入经过 forward 后每次加一

import torch
from torch import nn


class Module(nn.Module):
    """Smallest possible nn.Module: forward simply adds one to its input."""

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

    def forward(self, input):
        # calling the module instance (karry(x)) routes here via nn.Module.__call__
        return input + 1


karry = Module()
x = torch.tensor(1.0)
print(karry(x))

# convolution 卷积操作

https://www.bilibili.com/video/BV1hE411t7RN?p=17

# Stride 跨步 = 1

Stride 是每次跨步数,1 就是每次跨一步

8.png

注意到红色部分就是对应位置相乘再相加

import torch
import torch.nn.functional as F

# Build the 5x5 input feature map and the 3x3 convolution kernel.
input_matrix = torch.tensor([[1, 2, 0, 3, 1], [0, 1, 2, 3, 1], [1, 2, 1, 0, 0], [5, 2, 3, 1, 1], [2, 1, 0, 1, 1]])
kernel = torch.tensor([[1, 2, 1], [0, 1, 0], [2, 1, 0]])
# F.conv2d expects 4-D (batch, channel, H, W) tensors, so add two leading dims.
input_matrix = input_matrix.reshape(1, 1, 5, 5)
kernel = kernel.reshape(1, 1, 3, 3)
# Slide the kernel one step at a time; each output cell is an elementwise
# multiply of the covered window with the kernel, summed up.
output_ans = F.conv2d(input_matrix, kernel, stride=1)
print(input_matrix.shape)
print(kernel.shape)
print(output_ans)

torch.Size([1, 1, 5, 5])
torch.Size([1, 1, 3, 3])
tensor([[[[10, 12, 12],
[18, 16, 16],
[13, 9, 3]]]])

# Stride 跨步 = 2

9.png

# 卷积
output_ans = F.conv2d(input_matrix, kernel, stride=2)
print(output_ans)

tensor([[[[10, 12],
[13, 3]]]])

# Padding 填充 = 1

Padding 填充就是在原始数据的最外侧填充一些数据(像素),一般情况下是设置为零

10.png

一样地,红色部分对应位置相乘再相加,最外圈绿色为 padding=1 所产生的额外填充

# 卷积
output_ans = F.conv2d(input_matrix, kernel, stride=1, padding=1)
print(output_ans)

tensor([[[[ 1, 3, 4, 10, 8],
[ 5, 10, 12, 12, 6],
[ 7, 18, 16, 16, 8],
[11, 13, 9, 3, 4],
[14, 13, 9, 7, 4]]]])

# Convolution Layers 卷积层

torch.nn — PyTorch 2.8 documentation

class torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, padding_mode=‘zeros’, device=None, dtype=None)

其中最主要的参数设置是这五个:in_channels, out_channels, kernel_size, stride=1, padding=0

11.png

weight 实际上就是卷积核,input 是通道数据,bias 是偏置

13.png

卷积层的 每个输出通道 都是 所有输入通道的加权卷积结果相加,权重就是 weight[j, k]

假设输入是 RGB 彩色图像(3 个通道:R、G、B),卷积层的一个输出通道是这样算的:

12.png

为什么要 “所有输入通道相加”?

  1. 图像的特征可能跨通道(比如红绿蓝组合才能构成颜色信息)
  2. 一个卷积核只看单个通道的信息是不完整的
  3. 把多个输入通道的卷积结果加在一起,就相当于在融合这些通道的信息

这也是为什么多通道卷积的 weight 是四维的:C_out、C_in、k_h、k_w

14.png

Convolution animations:conv_arithmetic/README.md at master · vdumoulin/conv_arithmetic · GitHub

# in_channel=1, out_channel=1

只有一个输入通道和一个输出通道,不存在跨通道求和。

16.png

1×1+2×0+3×0+4×(−1)=1+0+0−4=−3

[[−3]]

in_channel=1, out_channel=1 时,就是用一个卷积核直接作用于输入通道,卷积后加上 bias 得到结果。

没有跨通道加权,没有额外求和步骤。

import torch
from torch import nn

# One input channel, one output channel, no bias -> a pure kernel dot product.
nn_conv_d = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=2, stride=1, padding=0, bias=False)
# Overwrite the randomly initialised weight with a hand-picked 2x2 kernel
# (weight shape must be (out_channels, in_channels, kH, kW) = (1, 1, 2, 2)).
nn_conv_d.weight.data = torch.tensor([1.0, 0.0, 0.0, -1.0]).reshape(1, 1, 2, 2)
input = torch.tensor([[[[1, 2], [3, 4]]]], dtype=torch.float32)
print(nn_conv_d(input))

tensor([[[[-3.]]]], grad_fn=)

起步解释:

  1. nn_conv_d.weight.data 自定义卷积核
  2. nn.Conv2d 要求 [batch, channels, height, width] ,所以二维矩阵要用 unsqueeze 扩成 4 维,或者在初始化时就是四维的
  3. 卷积层的权重和 bias 是 ** 浮点型 ( torch.float32 )** 因此要 dtype=torch.float32
  4. 为了得到精确值偏置量应当设为假(不偏置):bias=False

以上均为 torch 框架中自带的参数调试变量,这些操作主要是 调试和理解卷积的计算过程

在实际神经网络训练中:

  • 权重 weight 会被优化器自动更新
  • 输入通常是四维 tensor
  • bias 是否启用视网络设计而定

掌握这些基本参数调试方法,是为了:

  1. 理解卷积计算机制
  2. 验证卷积操作是否如预期
  3. 为后续做图像或特征识别的神经网络打基础

# in_channel=1, out_channel=2

每个输出通道是独立的卷积结果

17.png

1×1+2×0+3×0+4×(−1)=1+0+0−4=−3

1×0+2×1+3×(−1)+4×0=0+2−3+0=−1

输出张量 (batch=1, out_channel=2, H=1, W=1):output=[[−3],[−1]]

in_channel=1 时,每个输出通道都直接在这个输入通道上用不同卷积核独立运算。

最终的两个通道结果是并列存储,不做求和。

import torch
from torch import nn

# One input channel fanned out to two independent output channels, no bias.
nn_conv_d = nn.Conv2d(in_channels=1, out_channels=2, kernel_size=2, stride=1, padding=0, bias=False)
# Weight shape is (out_channels, in_channels, kH, kW) = (2, 1, 2, 2).
kernel_out1 = [[1, 0], [0, -1]]   # kernel for output channel 1 (input channel 1)
kernel_out2 = [[0, 1], [-1, 0]]   # kernel for output channel 2 (input channel 1)
nn_conv_d.weight.data = torch.tensor([[kernel_out1], [kernel_out2]], dtype=torch.float32)
input = torch.tensor([[[[1, 2], [3, 4]]]], dtype=torch.float32)
print(nn_conv_d(input))

tensor([[[[-3.]],

​ [[-1.]]]], grad_fn=)

# in_channel=2, out_channel=1

两个输入通道各自用自己的卷积核卷积 → 得到两个结果。

18.png

1×1+2×0+3×0+4×(−1)=1+0+0−4=−3

5×0+6×1+7×(−1)+8×0=0+6−7+0=−1

加权求和(合成一个输出通道):output=(−3)+(−1)=−4

输出张量 (batch=1, out_channel=1, H=1, W=1):[[−4]]

import torch
from torch import nn

# Two input channels merged into one output channel, no bias.
nn_conv_d = nn.Conv2d(in_channels=2, out_channels=1, kernel_size=2, stride=1, padding=0, bias=False)
# Weight shape (1, 2, 2, 2): one kernel per input channel; their results are summed.
nn_conv_d.weight.data = torch.tensor(
    [[[[1, 0], [0, -1]],     # kernel applied to input channel 1
      [[0, 1], [-1, 0]]]],   # kernel applied to input channel 2
    dtype=torch.float32)
# Input shape (1, 2, 2, 2): a batch of one sample with two channels.
input = torch.tensor(
    [[[[1, 2], [3, 4]],
      [[5, 6], [7, 8]]]], dtype=torch.float32)
print(nn_conv_d(input))

tensor([[[[-4.]]]], grad_fn=)

# in_channel=2, out_channel=2

19.png

X_channel1 * W_out1_in1 = [[1×1 + 2×0 + 3×0 + 4×1]] = [[1+0+0+4]] = [[5]]
X_channel2 * W_out1_in2 = [[0×0 + 1×1 + 1×1 + 0×0]] = [[0+1+1+0]] = [[2]]
Y_out1 = 5 + 2 = 7

X_channel1 * W_out2_in1 = [[1×1 + 2×1 + 3×1 + 4×1]] = [[10]]
X_channel2 * W_out2_in2 = [[0×(−1) + 1×0 + 1×0 + 0×(−1)]] = [[0]]
Y_out2 = 10 + 0 = 10

Y_out1 = [[7]] Y_out2 = [[10]]

结果为:[[7],[10]]

import torch
from torch import nn

# Two input channels -> two output channels, no bias.
nn_conv_d = nn.Conv2d(in_channels=2, out_channels=2, kernel_size=2, stride=1, padding=0, bias=False)
# Weight shape (2, 2, 2, 2): each output channel owns one kernel per input channel.
nn_conv_d.weight.data = torch.tensor(
    [[[[1, 0], [0, 1]],      # out 1 <- in 1
      [[0, 1], [1, 0]]],     # out 1 <- in 2
     [[[1, 1], [1, 1]],      # out 2 <- in 1
      [[-1, 0], [0, -1]]]],  # out 2 <- in 2
    dtype=torch.float32)
input = torch.tensor(
    [[[[1, 2], [3, 4]],
      [[0, 1], [1, 0]]]], dtype=torch.float32)
print(nn_conv_d(input))

tensor([[[[ 7.]],

​ [[10.]]]], grad_fn=)

# 向前传播的卷积(正向传播)

import torch
import torchvision
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

dataset = torchvision.datasets.CIFAR10("data", train=False, transform=torchvision.transforms.ToTensor(),
                                       download=True)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, num_workers=0)


class Net(nn.Module):
    """Single conv layer: 3 input channels (RGB) -> 6 output channels, 3x3 kernel."""

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=3, stride=1, padding=0)

    def forward(self, x):
        x = self.conv1(x)
        return x


writer = SummaryWriter("logs")
net = Net()
step = 0
for data in dataloader:
    imgs, targets = data
    outputs = net(imgs)
    print(imgs.shape)
    print(outputs.shape)
    # torch.Size([64, 3, 32, 32])
    writer.add_images("input", imgs, global_step=step)
    # torch.Size([64, 6, 30, 30]) — add_images can only render 3-channel images,
    # so fold the 6 output channels into extra batch entries of 3 channels each
    outputs = torch.reshape(outputs, (-1, 3, 30, 30))
    writer.add_images("output", outputs, global_step=step)
    step += 1
# Fix: the original never closed the writer, so buffered events could be lost
writer.close()

self.conv1 是一个 卷积层( nn.Conv2d,在 forward 里写 x = self.conv1(x) ,就是把输入 x 通过卷积层进行前向计算

# Pooling Layers 池化层

torch.nn — PyTorch 2.8 documentation

池化是卷积神经网络(CNN)中一个很重要的操作。它的主要作用可以总结为以下几点:

  1. 降维与减少计算量
  • 为什么:卷积层输出的特征图通常很大,如果不缩小,后面网络层的计算量会非常庞大。
  • 怎么做:池化通过取局部区域的最大值(Max Pooling)或平均值(Average Pooling)来缩小特征图尺寸。
  • 效果:减少参数量和计算量,加快训练和推理速度。

  1. 特征的平移不变性
  • 什么意思:如果图片里一个物体稍微移动了,网络依然能识别。
  • 为什么能做到:池化会在一个小范围内提取统计特征(最大值或平均值),因此即使输入图像有微小的偏移,结果变化也不会太大。

  1. 突出重要特征,抑制不重要信息
  • Max Pooling:保留一个区域的最大值,倾向于保留最显著的边缘或纹理特征。
  • Average Pooling:保留区域的平均值,得到更加平滑的特征。
  • 作用:相当于 “特征压缩”,让后续层更容易提取全局信息。

  1. 防止过拟合
  • 通过减少参数和对细节的依赖,网络更关注大局特征而不是局部噪声,从而减轻过拟合风险。

20.png

核心参数

  1. kernel_size (Union[int, tuple[int, int]]) – the size of the window to take a max over
  2. stride (Union[int, tuple[int, int]]) – the stride of the window. Default value is kernel_size
  3. padding (Union[int, tuple[int, int]]) – Implicit negative infinity padding to be added on both sides
  4. dilation (Union[int, tuple[int, int]]) – a parameter that controls the stride of elements in the window
  5. return_indices (bool) – if True, will return the max indices along with the outputs. Useful for torch.nn.MaxUnpool2d later
  6. ceil_mode (bool) – when True, will use ceil instead of floor to compute the output shape

池化示意图:

21.png

# dilation 扩张率(空洞卷积)

这个参数在 卷积(特别是卷积神经网络中的卷积层)里起很重要的作用。它和 kernel_size、stride 一样,决定了卷积核是怎么在输入特征图上取值的。

dilation.png

dilation 卷积,其实就是我们常说的 空洞卷积 (Dilated Convolution / Atrous Convolution)

# ceil 模式和 floor 模式

  1. floor 模式(默认)
  • 取整时向下取整(floor)。
  • 多余的边缘(不足一个 kernel 的区域)会被丢弃。
  • 比如:
    • 输入长度 5
    • kernel=2, stride=2
    • 计算:(5−2)/2+1=2.5 (5-2)/2 + 1 = 2.5 (5−2)/2+1=2.5 → floor → 2
    • 输出长度就是 2(最后一个位置没覆盖到)。
  1. ceil 模式(开启 ceil_mode=True
  • 取整时向上取整(ceil)。
  • 边缘不足 kernel 的部分,也会被保留(通常会用 padding 补齐)。
  • 上面例子:
    • 输入长度 5
    • kernel=2, stride=2, ceil_mode=True
    • 计算:2.5 → ceil → 3
    • 输出长度就是 3(最后一个区域会只覆盖部分输入,或者补 0)。

floor 模式(默认):计算稳定,常用于训练。

ceil 模式:当你希望输入和输出严格对齐,或者想保留更多边缘信息时使用(比如某些图像分割任务)。

# 提示

卷积是提取特征,池化是压缩特征

1080P -> 720P

# 最大池化

池化的默认步长等于池化核的大小(MaxPool2d 的 stride 默认为 kernel_size)。

22.png

# 最大池化实例

ceil_mode=True:

import torch
import torch.nn as nn
input = torch.tensor([[1, 2, 0, 3, 1],
                      [0, 1, 2, 3, 1],
                      [1, 2, 1, 0, 0],
                      [5, 2, 3, 1, 1],
                      [2, 1, 0, 1, 1]])
input = torch.reshape(input, (-1, 1, 5, 5))
class Net(nn.Module):
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.maxPool1 = nn.MaxPool2d(kernel_size=3, ceil_mode=True)
    def forward(self, input):
        output = self.maxPool1(input)
        return output
net = Net()
output = net(input)
print(output.shape)
print(output)

torch.Size([1, 1, 2, 2])
tensor([[[[2, 3],
[5, 1]]]])

ceil_mode=False:

self.maxPool1 = nn.MaxPool2d(kernel_size=3, ceil_mode=False)

torch.Size([1, 1, 1, 1])
tensor([[[[2]]]])

# 图片操作

import torch
import torch.nn as nn
import torchvision.datasets
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
# small integer example kept from the previous section (unused by the loop below)
input = torch.tensor([[1, 2, 0, 3, 1],
                      [0, 1, 2, 3, 1],
                      [1, 2, 1, 0, 0],
                      [5, 2, 3, 1, 1],
                      [2, 1, 0, 1, 1]])
input = torch.reshape(input, (-1, 1, 5, 5))
data_set = torchvision.datasets.CIFAR10(root="data", train=False, transform=torchvision.transforms.ToTensor(),
                                        download=True)
dataloader = DataLoader(data_set, batch_size=64, shuffle=True, num_workers=0)
class Net(nn.Module):
    # 3x3 max-pool; floor mode discards the border that does not fill a window.
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.maxPool1 = nn.MaxPool2d(kernel_size=3, ceil_mode=False)
    def forward(self, input):
        output = self.maxPool1(input)
        return output
writer = SummaryWriter("logs_maxpool")
net = Net()
step = 0
for data in dataloader:
    img, label = data
    writer.add_images("input", img, global_step=step)
    # pooling shrinks each 32x32 image, giving the mosaic effect shown below
    output = net(img)
    writer.add_images("output", output, global_step=step)
    step += 1
writer.close()

可以看到,变成马赛克了:

23.png

# Padding Layers 填充层

torch.nn — PyTorch 2.8 documentation

填充是应用于图片外围的,主要进行一些值的填充,基本不用到

最多的会使用到:nn.ZeroPad2d Pads the input tensor boundaries with zero.

# Non-linear Activations 非线性激活

最常见的是:ReLU — PyTorch 2.8 documentation

24.png

其次是:Sigmoid — PyTorch 2.8 documentation

25.png

# ReLU 示例

import torch

input = torch.tensor([[1, -0.5], [-1, 3]])
# reshape to (batch, channel, H, W) = (1, 1, 2, 2)
output = torch.reshape(input, (-1, 1, 2, 2))
print(output)


class Net(torch.nn.Module):
    """Applies element-wise ReLU: negatives become 0, non-negatives pass through."""

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.relu = torch.nn.ReLU()

    def forward(self, input) -> torch.Tensor:
        return self.relu(input)


net = Net()
output = net(output)
print(output)

# inplace 源码提示

def __init__(self, inplace: bool = False):
    super().__init__()
    self.inplace = inplace

当 inplace 为假时,不改变源数据

当 inplace 为真时,执行时改变原始数据(就地算法)

# 图片操作

import torch
import torchvision.datasets
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
dataset = torchvision.datasets.CIFAR10("data", train=False, transform=torchvision.transforms.ToTensor(), download=True)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, num_workers=0)
class Net(torch.nn.Module):
    # Holds both activations; forward currently applies only Sigmoid.
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.relu = torch.nn.ReLU()
        self.sigmoid = torch.nn.Sigmoid()
    def forward(self, input) -> torch.Tensor:
        # output = self.relu(input)
        # Sigmoid squashes every pixel value into the open interval (0, 1)
        output = self.sigmoid(input)
        return output
net = Net()
writer = SummaryWriter("logs_relu")
step = 0
for data in dataloader:
    imgs, targets = data
    writer.add_images("input", imgs, global_step=step)
    output = net(imgs)
    writer.add_images("output", output, global_step=step)
    step += 1
writer.close()

结果如下:

26.png

# Normalization Layers 正则化层

https://docs.pytorch.org/docs/stable/nn.html#normalization-layers

有一篇论文提到正则化层可以加速训练

# Recurrent Layers 循环层

提示:RNN

torch.nn — PyTorch 2.8 documentation

序列的每一步计算都会依赖前一步的隐藏状态,从而能捕捉序列的时间依赖性。

# Transformer Layers

https://docs.pytorch.org/docs/stable/nn.html#transformer-layers

Transformer 的核心思想就是用 自注意力机制(Self-Attention) 来替代传统 RNN 或 CNN 处理序列时的缺陷。

# Linear Layers 线性层(全连接)

torch.nn — PyTorch 2.8 documentation

27.png

# 源码提示

torch.nn.Linear(in_features, out_features, bias=True)

in_features 含义:输入特征的维度(每个样本的输入向量长度)。

out_features 含义:输出特征的维度(每个样本的输出向量长度)。

bias 是否使用偏置项,默认 True

28.png

import torch
import torchvision
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
dataset = torchvision.datasets.CIFAR10("data", train=False, transform=torchvision.transforms.ToTensor(),
                                       download=True)
# drop_last=True matters: the Linear layer below accepts exactly 196608 inputs
dataLoader = DataLoader(dataset, batch_size=64, shuffle=True, num_workers=0, drop_last=True)
# writer = SummaryWriter("log_liner")
class Net(nn.Module):
    # One fully connected layer: 196608 input features -> 10 outputs.
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.linear1 = nn.Linear(196608, 10)
    def forward(self, input):
        output = self.linear1(input)
        return output
net = Net()
for data in dataLoader:
    img, label = data
    print(img.shape)
    # NOTE(review): torch.flatten(img) flattens the batch dimension too — the
    # whole batch becomes ONE vector (64*3*32*32 == 196608), which only works
    # here by construction; per-sample flattening would use start_dim=1
    output = torch.flatten(img)
    print(output.shape)
    output = net(output)
    print(output.shape)

展平动作:torch.flatten (img)

输入 196608,输出 10:nn.Linear (196608, 10)

# Dropout Layers

torch.nn — PyTorch 2.8 documentation

# Sequential 序列操作 —— 以简单网络模型实战为例

相当于 transforms 的 compose,将一些操作组装到一起

29.png

import torch
from torch import nn
from torch.nn import Conv2d, MaxPool2d, Flatten, Linear


class Net(nn.Module):
    """CIFAR-10 style CNN: three conv+pool stages, then two fully connected layers."""

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # padding=2 keeps the 32x32 spatial size for a 5x5 kernel
        self.conv1 = Conv2d(in_channels=3, out_channels=32, kernel_size=5, stride=1, padding=2)
        self.maxPool1 = MaxPool2d(kernel_size=2, ceil_mode=False)
        self.conv2 = Conv2d(in_channels=32, out_channels=32, kernel_size=5, stride=1, padding=2)
        self.maxPool2 = MaxPool2d(kernel_size=2, ceil_mode=False)
        self.conv3 = Conv2d(in_channels=32, out_channels=64, kernel_size=5, stride=1, padding=2)
        self.maxPool3 = MaxPool2d(kernel_size=2, ceil_mode=False)
        self.flatten = Flatten()
        # 64 channels * 4 * 4 spatial cells = 1024 features after three 2x pools
        self.linear0 = Linear(1024, 64)
        self.linear1 = Linear(64, 10)

    def forward(self, x):
        # run the conv/pool pipeline, then flatten and classify
        for layer in (self.conv1, self.maxPool1, self.conv2, self.maxPool2,
                      self.conv3, self.maxPool3, self.flatten,
                      self.linear0, self.linear1):
            x = layer(x)
        return x


net = Net()
print(net)
# sanity-check the network with a dummy batch
input = torch.ones(64, 3, 32, 32)
outputs = net(input)
print(outputs.shape)

输入是一张彩色图片(3 个通道)

  1. 首先用一个卷积层(conv1)去提取一些低级特征,比如边缘、颜色块,然后通过一次池化(maxPool1)把图片 “缩小一半”,同时保留主要特征
  2. 接着再来一次卷积(conv2),这时候的输入已经有 32 个通道了,网络会继续在前面提取到的特征基础上,找到更复杂的形状、纹理,然后再做一次池化(maxPool2),图像再缩小一半
  3. 然后再卷积一次(conv3),这次输出通道变成 64 个,能提取更丰富、更抽象的特征,比如局部的结构、物体的一部分,再池化一次(maxPool3),图像又缩小
  4. 接下来把这些 “缩小后的特征图” 拉直成一维向量(flatten),然后经过一个全连接层(linear0),把大规模的特征压缩成一个 64 维的向量
  5. 最后再经过一个全连接层(linear1),输出 10 个数,这通常对应 10 个分类的可能性

总结一下:它就是一个典型的卷积神经网络,前面几层卷积 + 池化负责逐步提取和浓缩图像特征,后面的全连接层负责把这些特征转成分类结果。

# Sequential

class Net(nn.Module):
    """Same CNN as before, but the layers are also packed into one Sequential pipeline."""

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.conv1 = Conv2d(in_channels=3, out_channels=32, kernel_size=5, stride=1, padding=2)
        self.maxPool1 = MaxPool2d(kernel_size=2, ceil_mode=False)
        self.conv2 = Conv2d(in_channels=32, out_channels=32, kernel_size=5, stride=1, padding=2)
        self.maxPool2 = MaxPool2d(kernel_size=2, ceil_mode=False)
        self.conv3 = Conv2d(in_channels=32, out_channels=64, kernel_size=5, stride=1, padding=2)
        self.maxPool3 = MaxPool2d(kernel_size=2, ceil_mode=False)
        self.flatten = Flatten()
        self.linear0 = Linear(1024, 64)
        self.linear1 = Linear(64, 10)
        # Sequential chains the layers so forward() is a single call
        self.model1 = Sequential(self.conv1, self.maxPool1, self.conv2, self.maxPool2,
                                 self.conv3, self.maxPool3, self.flatten,
                                 self.linear0, self.linear1)

    def forward(self, x):
        return self.model1(x)

其中 Sequential 将一些操作组装到一起:

self.model1 = Sequential(self.conv1, self.maxPool1, self.conv2, self.maxPool2, self.conv3, self.maxPool3,self.flatten, self.linear0, self.linear1)

进一步地可以这样写:

class Net(nn.Module):
    """CNN defined purely inline inside a single Sequential container."""

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.model2 = Sequential(
            # three conv (5x5, padding=2) + 2x2 max-pool stages
            Conv2d(in_channels=3, out_channels=32, kernel_size=5, stride=1, padding=2),
            MaxPool2d(kernel_size=2, ceil_mode=False),
            Conv2d(in_channels=32, out_channels=32, kernel_size=5, stride=1, padding=2),
            MaxPool2d(kernel_size=2, ceil_mode=False),
            Conv2d(in_channels=32, out_channels=64, kernel_size=5, stride=1, padding=2),
            MaxPool2d(kernel_size=2, ceil_mode=False),
            # flatten to 1024 features, then two linear layers down to 10 classes
            Flatten(),
            Linear(1024, 64),
            Linear(64, 10))

    def forward(self, x):
        return self.model2(x)

# 流程可视化

writer = SummaryWriter("logs_seq")
writer.add_graph(net, input)
writer.close()

30.png

可以观察到具体流程就是:卷积 1、池化 1、卷积 2、池化 2、卷积 3、池化 3、展平、全连接 1(线性 1)、全连接 2(线性 2)。

# 损失函数与反向传播

计算实际输出和目标之间的差距,为我们更新输出提供一定的依据(反向传播), grad

torch.nn — PyTorch 2.8 documentation


nn.L1Loss

nn.L1Loss:

X:1, 2, 3

Y:1, 2, 5

L1loss = (0+0+2)/3 = 2/3 ≈ 0.667,这里自然是越小越好

import torch
from torch.nn import L1Loss
input = torch.tensor([1, 2, 3], dtype=torch.float32)
target = torch.tensor([1, 2, 5], dtype=torch.float32)
input = torch.reshape(input, (1, 1, 1, 3))
target = torch.reshape(target, (1, 1, 1, 3))
loss = L1Loss()
result = loss(input, target)
print(result)

输出:tensor (0.6667)


nn.MSELoss

MSE = (0+0+2^2)/3=4/3=1.333

注意每个位置要平方

import torch
from torch.nn import L1Loss
input = torch.tensor([1, 2, 3], dtype=torch.float32)
target = torch.tensor([1, 2, 5], dtype=torch.float32)
input = torch.reshape(input, (1, 1, 1, 3))
target = torch.reshape(target, (1, 1, 1, 3))
loss_Mse = torch.nn.MSELoss()
result_Mse = loss_Mse(input, target)
print(result_Mse)

交叉熵: nn.CrossEntropyLoss 分类问题

31.png

此处 log 是以 e 为底数的(ln),常见 target 就是目标代号,1 就是第二个标签


使用 result_loss.backward () 开启反向传播

backward 根据损失值 result_loss 自动计算出每个参数的梯度,并把梯度存到参数的 .grad 属性中。

import torch
import torchvision
from torch import nn
from torch.nn import Conv2d, Sequential, MaxPool2d, Linear, Flatten
dataset = torchvision.datasets.CIFAR10("data", train=False, transform=torchvision.transforms.ToTensor(),
                                       download=True)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=1)
class Net(nn.Module):
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.model1 = Sequential(
            Conv2d(in_channels=3, out_channels=32, kernel_size=5, stride=1, padding=2),
            MaxPool2d(kernel_size=2, ceil_mode=False),
            Conv2d(in_channels=32, out_channels=32, kernel_size=5, stride=1, padding=2),
            MaxPool2d(kernel_size=2, ceil_mode=False),
            Conv2d(in_channels=32, out_channels=64, kernel_size=5, stride=1, padding=2),
            MaxPool2d(kernel_size=2, ceil_mode=False),
            Flatten(),
            Linear(1024, 64),
            Linear(64, 10))
    def forward(self, x):
        x = self.model1(x)
        return x
net = Net()
loss = nn.CrossEntropyLoss()
for data in dataloader:
    imgs, targets = data
    outputs = net(imgs)
    result_loss = loss(outputs, targets)
    result_loss.backward()
    print("ok")
    print(result_loss)

# optim 优化器

它负责根据参数的梯度 .grad 来更新模型的参数,从而让模型越来越接近目标。

训练一个神经网络时流程是这样的:

  1. 前向传播 (forward)
    输入数据 → 模型输出 → 计算损失 loss
  2. 反向传播 (backward)
    调用 loss.backward() ,PyTorch 会自动算出每个参数的梯度,并存到 param.grad 里。
  3. 参数更新 (update step)
    这一步就是 优化器的作用
    优化器会读取参数的 .grad ,然后根据优化算法(如 SGD、Adam)来调整参数值。

optim(update step)会根据 backward 计算出来的梯度来更新参数

import torch
import torchvision
from torch import nn
from torch.nn import Conv2d, Sequential, MaxPool2d, Linear, Flatten
dataset = torchvision.datasets.CIFAR10("data", train=False, transform=torchvision.transforms.ToTensor(),
                                       download=True)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=1)
class Net(nn.Module):
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.model1 = Sequential(
            Conv2d(in_channels=3, out_channels=32, kernel_size=5, stride=1, padding=2),
            MaxPool2d(kernel_size=2, ceil_mode=False),
            Conv2d(in_channels=32, out_channels=32, kernel_size=5, stride=1, padding=2),
            MaxPool2d(kernel_size=2, ceil_mode=False),
            Conv2d(in_channels=32, out_channels=64, kernel_size=5, stride=1, padding=2),
            MaxPool2d(kernel_size=2, ceil_mode=False),
            Flatten(),
            Linear(1024, 64),
            Linear(64, 10))
    def forward(self, x):
        x = self.model1(x)
        return x
net = Net()
loss = nn.CrossEntropyLoss()
optim = torch.optim.SGD(net.parameters(), lr=0.01)
for epoch in range(20):
    print("Epoch:{}".format(epoch))
    running_loss = 0.0
    for data in dataloader:
        imgs, targets = data
        outputs = net(imgs)
        result_loss = loss(outputs, targets)
        optim.zero_grad()
        result_loss.backward()
        optim.step()
        running_loss += result_loss
    print("Loss:{}".format(running_loss))

关键在于:

optim.zero_grad()
result_loss.backward()
optim.step()

首先 optim 将梯度数据清零,然后 backward 计算这一次的梯度,最后 optim 根据梯度更新参数。(注意:累加损失时建议使用 result_loss.item(),避免累积计算图、占用显存/内存)

对于这里的循环:

for data in dataloader:
        imgs, targets = data
        outputs = net(imgs)
        result_loss = loss(outputs, targets)
        optim.zero_grad()
        result_loss.backward()
        optim.step()
        running_loss += result_loss

只是对数据进行了一轮的学习,我们需要多轮学习才能最优化结果:

for epoch in range(20):
    print("Epoch:{}".format(epoch))
    running_loss = 0.0
    for data in dataloader:
        imgs, targets = data
        outputs = net(imgs)
        result_loss = loss(outputs, targets)
        optim.zero_grad()
        result_loss.backward()
        optim.step()
        running_loss += result_loss
    print("Loss:{}".format(running_loss))

因此我们这样操作

# SGD(随机梯度下降)

训练神经网络的目标是让 损失函数 Loss 最小化

想象损失函数是一个 “山谷地形”,我们要沿着山坡往下走,直到找到最低点

梯度(Gradient) 就是告诉我们 “往哪个方向下坡最快”

32.png

optim = torch.optim.SGD(net.parameters(), lr=0.01)

模型参数即 net.parameters (),学习速率即 lr=0.01

学习率 (learning rate, lr):

lr 太小:走得慢,收敛速度慢,可能训练很久 loss 才下降。

lr 太大:走得快,但可能 “跨过山谷底部”,造成震荡甚至发散(loss 变大)。

# 现有模型的使用与修改(迁移学习)

借用别人训练好的模型知识,在新任务上减少训练成本,提高效果。

在现有 vgg16 中加一些新的层

import os
import torchvision.datasets
from torch import nn
os.environ["TORCH_HOME"] = "E:/PyCharmCode/pytorchSTU/data"
vgg16_false = torchvision.models.vgg16(pretrained=False)
vgg16_true = torchvision.models.vgg16(pretrained=True)
print(vgg16_true)
train_data = torchvision.datasets.CIFAR10(root="data", train=True, transform=torchvision.transforms.ToTensor(),
                                          download=True)
vgg16_true.classifier.add_module("add_linear", nn.Linear(1000, 10))
print(vgg16_true)

源:

(classifier): Sequential(
(0): Linear(in_features=25088, out_features=4096, bias=True)
(1): ReLU(inplace=True)
(2): Dropout(p=0.5, inplace=False)
(3): Linear(in_features=4096, out_features=4096, bias=True)
(4): ReLU(inplace=True)
(5): Dropout(p=0.5, inplace=False)
(6): Linear(in_features=4096, out_features=1000, bias=True)
)

现:

(classifier): Sequential(
(0): Linear(in_features=25088, out_features=4096, bias=True)
(1): ReLU(inplace=True)
(2): Dropout(p=0.5, inplace=False)
(3): Linear(in_features=4096, out_features=4096, bias=True)
(4): ReLU(inplace=True)
(5): Dropout(p=0.5, inplace=False)
(6): Linear(in_features=4096, out_features=1000, bias=True)
(add_linear): Linear(in_features=1000, out_features=10, bias=True)
)

在现有 vgg16 中修改层

import os
import torchvision.datasets
from torch import nn
# train_data = torchvision.datasets.ImageNet(root="data", split="train", download=True,
#                                            transform=torchvision.transforms.ToTensor())
os.environ["TORCH_HOME"] = "E:/PyCharmCode/pytorchSTU/data"
vgg16_false = torchvision.models.vgg16(pretrained=False)
vgg16_true = torchvision.models.vgg16(pretrained=True)
print(vgg16_false)
vgg16_false.classifier[6] = nn.Linear(in_features=4096, out_features=10)
print(vgg16_false)

源:

(classifier): Sequential(
(0): Linear(in_features=25088, out_features=4096, bias=True)
(1): ReLU(inplace=True)
(2): Dropout(p=0.5, inplace=False)
(3): Linear(in_features=4096, out_features=4096, bias=True)
(4): ReLU(inplace=True)
(5): Dropout(p=0.5, inplace=False)
(6): Linear(in_features=4096, out_features=1000, bias=True)
)

现:

(classifier): Sequential(
(0): Linear(in_features=25088, out_features=4096, bias=True)
(1): ReLU(inplace=True)
(2): Dropout(p=0.5, inplace=False)
(3): Linear(in_features=4096, out_features=4096, bias=True)
(4): ReLU(inplace=True)
(5): Dropout(p=0.5, inplace=False)
(6): Linear(in_features=4096, out_features=10, bias=True)
)

# 模型的保存与加载

保存:

import os
import torch
import torchvision.models
from torch import nn
os.environ["TORCH_HOME"] = "E:/PyCharmCode/pytorchSTU/data"
vgg16 = torchvision.models.vgg16(pretrained=False)
# 方式 1:保存模型结构 + 参数
torch.save(vgg16, "vgg16_method1.pth")
# 方式 2:保存模型参数 (官方推荐)
torch.save(vgg16.state_dict(), "vgg16_method2.pth")
# 陷阱 1
class Net(nn.Module):
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.conv1 = nn.Conv2d(3, 32, 5, 1, 2)
    def forward(self, x):
        x = self.conv1(x)
        return x
net = Net()
torch.save(net, "net.pth")

加载:

import torch
import torchvision.models
import model_save
from torch import nn
# 方式 1:模型结构 + 参数加载 1
torch_load = torch.load("vgg16_method1.pth", weights_only=False)
# print(torch_load)
# 方式 2:模型参数加载 2
vgg16 = torchvision.models.vgg16(pretrained=False)
torch_load = torch.load("vgg16_method2.pth", weights_only=True)
vgg16.load_state_dict(torch_load)
# print(vgg16)
# 陷阱 1: 需要把模型引入才能进行加载 import model_save
model = torch.load("net.pth", weights_only=False)
print(model)

# 模型的完整训练套路

  1. 准备数据集、获取数据集大小(可选)
  2. dataLoader 加载数据集
  3. 搭建神经网络 Net
  4. 创建损失函数 loss_fn、创建优化器 optimizer、可选使用 TensorBoard
  5. 设置训练网络一些参数:训练的次数 total_train_step、测试的次数 total_test_step、训练的轮数 epoch
  6. 开始训练,导出 imgs, targets 并进入网络
  7. 计算损失函数
  8. 清零梯度、反向传播计算梯度,优化器根据梯度更新参数
  9. 暂时关闭梯度计算、开始测试
  10. 计算损失函数并累计、计算当前轮次准确率
  11. 重复 6~10,直到完成所有轮次
import torch
import torchvision.datasets
from torch.utils.tensorboard import SummaryWriter
from model import *
from torch.utils.data import DataLoader
# 准备数据集
train_data = torchvision.datasets.CIFAR10("data", train=True, transform=torchvision.transforms.ToTensor(),
                                          download=True)
test_data = torchvision.datasets.CIFAR10("data", train=False, transform=torchvision.transforms.ToTensor(),
                                         download=True)
# 获取数据集大小
train_data_size = len(train_data)
test_data_size = len(test_data)
print(f"训练数据集的长度为{train_data_size}")
print(f"测试数据集的长度为{test_data_size}")
# dataLoader 加载数据集
dataLoader_train = DataLoader(train_data, batch_size=64)
dataLoader_test = DataLoader(test_data, batch_size=64)
# 搭建神经网络
net = Net()
# 创建损失函数
loss_fn = nn.CrossEntropyLoss()
# 创建优化器
lr = 0.01
optimizer = torch.optim.SGD(net.parameters(), lr=lr)
# TensorBoard
writer = SummaryWriter("logs")
# 设置训练网络一些参数
# 训练的次数
total_train_step = 0
# 测试的次数
total_test_step = 0
# 训练的轮数
epoch = 10
for i in range(epoch):
    print("--------第 {} 轮训练开始--------".format(i + 1))
    # 训练开始
    net.train()
    for data in dataLoader_train:
        imgs, targets = data
        outputs = net(imgs)
        loss = loss_fn(outputs, targets)  # 损失函数
        optimizer.zero_grad()  # 清零梯度
        loss.backward()  # 反向传播
        optimizer.step()  # 优化器更新参数
        total_train_step += 1
        if total_train_step % 100 == 0:
            print("训练次数:{}, Loss: {}".format(total_train_step, loss.item()))
            writer.add_scalar("train_loss", loss.item(), total_train_step)
    # 测试步骤开始
    net.eval()
    total_test_loss = 0
    total_acc = 0
    with torch.no_grad():  # 不进行梯度计算 with 的作用是:临时关闭梯度计算,退出后自动恢复。在上下文里关闭,出了上下文就恢复。
        for data in dataLoader_test:
            imgs, targets = data
            outputs = net(imgs)  # 测试步骤开始
            loss = loss_fn(outputs, targets)  # 损失函数
            total_test_loss += loss.item()  # 求和
            acc = (outputs.argmax(1) == targets).sum()
            total_acc += acc
        print("整体测试集上的Loss:{}".format(total_test_loss))
        print("整体测试集上的正确率:{}".format(total_acc / test_data_size))
        writer.add_scalar("test_loss", total_test_loss, total_test_step)
        writer.add_scalar("test_acc", total_acc / test_data_size, total_test_step)
        total_test_step += 1
    torch.save(net, "net_{}.pth".format(i))
    print("模型已保存")
writer.close()

10 轮学习后,整体测试集上的正确率:0.5428000092506409

# test_acc

33.png

# test_loss

34.png

# train_loss

35.png

# 使用 GPU 训练 1(.cuda)

  1. 网络可以使用 GPU 加速

    net = Net()
    # 交给 GPU
    if torch.cuda.is_available():
        net = net.cuda()
  2. 损失函数可以使用 GPU 加速

    loss_fn = nn.CrossEntropyLoss()
    # 交给 GPU
    if torch.cuda.is_available():
        loss_fn = loss_fn.cuda()
  3. 数据和标签可以使用 GPU 加速

    # 交给 GPU
    if torch.cuda.is_available():
        imgs = imgs.cuda()
        targets = targets.cuda()

优化后:

# _*_ coding : utf-8 _*_
# @Time : 2025/8/26 14:38
# @Author : KarryLiu
# File : train_gpu_1
# @Project : pytorchSTU
import torch
import torchvision.datasets
from torch import nn
from torch.nn import Sequential, Conv2d, MaxPool2d, Linear, Flatten
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
# 准备数据集
train_data = torchvision.datasets.CIFAR10("data", train=True, transform=torchvision.transforms.ToTensor(),
                                          download=True)
test_data = torchvision.datasets.CIFAR10("data", train=False, transform=torchvision.transforms.ToTensor(),
                                         download=True)
# 获取数据集大小
train_data_size = len(train_data)
test_data_size = len(test_data)
print(f"训练数据集的长度为{train_data_size}")
print(f"测试数据集的长度为{test_data_size}")
# dataLoader 加载数据集
dataLoader_train = DataLoader(train_data, batch_size=64)
dataLoader_test = DataLoader(test_data, batch_size=64)
# 搭建神经网络
class Net(nn.Module):
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.model = Sequential(
            Conv2d(in_channels=3, out_channels=32, kernel_size=5, stride=1, padding=2),
            MaxPool2d(kernel_size=2, ceil_mode=False),
            Conv2d(in_channels=32, out_channels=32, kernel_size=5, stride=1, padding=2),
            MaxPool2d(kernel_size=2, ceil_mode=False),
            Conv2d(in_channels=32, out_channels=64, kernel_size=5, stride=1, padding=2),
            MaxPool2d(kernel_size=2, ceil_mode=False),
            Flatten(),
            Linear(1024, 64),
            Linear(64, 10)
        )
    def forward(self, x):
        x = self.model(x)
        return x
net = Net()
# 交给 GPU
if torch.cuda.is_available():
    net = net.cuda()
# 创建损失函数
loss_fn = nn.CrossEntropyLoss()
# 交给 GPU
if torch.cuda.is_available():
    loss_fn = loss_fn.cuda()
# 创建优化器
lr = 0.01
optimizer = torch.optim.SGD(net.parameters(), lr=lr)
# TensorBoard
writer = SummaryWriter("logs")
# 设置训练网络一些参数
# 训练的次数
total_train_step = 0
# 测试的次数
total_test_step = 0
# 训练的轮数
epoch = 10
for i in range(epoch):
    print("--------第 {} 轮训练开始--------".format(i + 1))
    # 训练开始
    net.train()
    for data in dataLoader_train:
        imgs, targets = data
        # 交给 GPU
        if torch.cuda.is_available():
            imgs = imgs.cuda()
            targets = targets.cuda()
        outputs = net(imgs)
        loss = loss_fn(outputs, targets)  # 损失函数
        optimizer.zero_grad()  # 清零梯度
        loss.backward()  # 反向传播
        optimizer.step()  # 优化器更新参数
        total_train_step += 1
        if total_train_step % 100 == 0:
            print("训练次数:{}, Loss: {}".format(total_train_step, loss.item()))
            writer.add_scalar("train_loss", loss.item(), total_train_step)
    # 测试步骤开始
    net.eval()
    total_test_loss = 0
    total_acc = 0
    with torch.no_grad():  # 不进行梯度计算 with 的作用是:临时关闭梯度计算,退出后自动恢复。在上下文里关闭,出了上下文就恢复。
        for data in dataLoader_test:
            imgs, targets = data
            # 交给 GPU
            if torch.cuda.is_available():
                imgs = imgs.cuda()
                targets = targets.cuda()
            outputs = net(imgs)  # 测试步骤开始
            loss = loss_fn(outputs, targets)  # 损失函数
            total_test_loss += loss.item()  # 求和
            acc = (outputs.argmax(1) == targets).sum()
            total_acc += acc
        print("整体测试集上的Loss:{}".format(total_test_loss))
        print("整体测试集上的正确率:{}".format(total_acc / test_data_size))
        writer.add_scalar("test_loss", total_test_loss, total_test_step)
        writer.add_scalar("test_acc", total_acc / test_data_size, total_test_step)
        total_test_step += 1
    torch.save(net, "net_{}.pth".format(i))
    print("模型已保存")
writer.close()

使用 GPU 后,学习 100 次只需要大约 1.23s

# 使用 GPU 训练 2(.to)

使用:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  1. 网络可以使用 GPU 加速

    net = Net()
    # 交给 GPU
    net = net.to(device)
  2. 损失函数可以使用 GPU 加速

    loss_fn = nn.CrossEntropyLoss()
    # 交给 GPU
    loss_fn = loss_fn.to(device)
  3. 数据和标签可以使用 GPU 加速

    # 交给 GPU
    imgs = imgs.to(device)
    targets = targets.to(device)

优化后:

import torch
import torchvision.datasets
from torch import nn
from torch.nn import Sequential, Conv2d, MaxPool2d, Linear, Flatten
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# 准备数据集
train_data = torchvision.datasets.CIFAR10("data", train=True, transform=torchvision.transforms.ToTensor(),
                                          download=True)
test_data = torchvision.datasets.CIFAR10("data", train=False, transform=torchvision.transforms.ToTensor(),
                                         download=True)
# 获取数据集大小
train_data_size = len(train_data)
test_data_size = len(test_data)
print(f"训练数据集的长度为{train_data_size}")
print(f"测试数据集的长度为{test_data_size}")
# dataLoader 加载数据集
dataLoader_train = DataLoader(train_data, batch_size=64)
dataLoader_test = DataLoader(test_data, batch_size=64)
# 搭建神经网络
class Net(nn.Module):
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.model = Sequential(
            Conv2d(in_channels=3, out_channels=32, kernel_size=5, stride=1, padding=2),
            MaxPool2d(kernel_size=2, ceil_mode=False),
            Conv2d(in_channels=32, out_channels=32, kernel_size=5, stride=1, padding=2),
            MaxPool2d(kernel_size=2, ceil_mode=False),
            Conv2d(in_channels=32, out_channels=64, kernel_size=5, stride=1, padding=2),
            MaxPool2d(kernel_size=2, ceil_mode=False),
            Flatten(),
            Linear(1024, 64),
            Linear(64, 10)
        )
    def forward(self, x):
        x = self.model(x)
        return x
net = Net()
# 交给 GPU
net = net.to(device)
# 创建损失函数
loss_fn = nn.CrossEntropyLoss()
# 交给 GPU
loss_fn = loss_fn.to(device)
# 创建优化器
lr = 0.01
optimizer = torch.optim.SGD(net.parameters(), lr=lr)
# TensorBoard
writer = SummaryWriter("logs")
# 设置训练网络一些参数
# 训练的次数
total_train_step = 0
# 测试的次数
total_test_step = 0
# 训练的轮数
epoch = 10
for i in range(epoch):
    print("--------第 {} 轮训练开始--------".format(i + 1))
    # 训练开始
    net.train()
    for data in dataLoader_train:
        imgs, targets = data
        # 交给 GPU
        imgs = imgs.to(device)
        targets = targets.to(device)
        outputs = net(imgs)
        loss = loss_fn(outputs, targets)  # 损失函数
        optimizer.zero_grad()  # 清零梯度
        loss.backward()  # 反向传播
        optimizer.step()  # 优化器更新参数
        total_train_step += 1
        if total_train_step % 100 == 0:
            print("训练次数:{}, Loss: {}".format(total_train_step, loss.item()))
            writer.add_scalar("train_loss", loss.item(), total_train_step)
    # 测试步骤开始
    net.eval()
    total_test_loss = 0
    total_acc = 0
    with torch.no_grad():  # 不进行梯度计算 with 的作用是:临时关闭梯度计算,退出后自动恢复。在上下文里关闭,出了上下文就恢复。
        for data in dataLoader_test:
            imgs, targets = data
            # 交给 GPU
            imgs = imgs.to(device)
            targets = targets.to(device)
            outputs = net(imgs)  # 测试步骤开始
            loss = loss_fn(outputs, targets)  # 损失函数
            total_test_loss += loss.item()  # 求和
            acc = (outputs.argmax(1) == targets).sum()
            total_acc += acc
        print("整体测试集上的Loss:{}".format(total_test_loss))
        print("整体测试集上的正确率:{}".format(total_acc / test_data_size))
        writer.add_scalar("test_loss", total_test_loss, total_test_step)
        writer.add_scalar("test_acc", total_acc / test_data_size, total_test_step)
        total_test_step += 1
    torch.save(net, "net_{}.pth".format(i))
    print("模型已保存")
writer.close()

# 验证套路(利用训练好的模型,给他提供输入)

import torch
import torchvision.transforms
from PIL import Image
from torch import nn
from torch.nn import Sequential, Conv2d, MaxPool2d, Flatten, Linear
# 如果是由 GPU 训练得到模型,则需要将模型移动到 GPU 上
# 如果仅想使用 CPU 测试,可以在 torch.load 中使用:map_location="cpu"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
image_path = "images/fff.png"
# 此处 png 是 RGBA 模式,我们转换成 RGB 模式
image = Image.open(image_path).convert("RGB")
trans = torchvision.transforms.Compose([
    torchvision.transforms.Resize(size=(32, 32)),
    torchvision.transforms.ToTensor(),
])
image = trans(image)
# 添加一个维度代表 1 张图片,并交给 GPU
image = torch.reshape(image, (1, 3, 32, 32)).to(device)
print(image.shape)
# 搭建神经网络
class Net(nn.Module):
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.model = Sequential(
            Conv2d(in_channels=3, out_channels=32, kernel_size=5, stride=1, padding=2),
            MaxPool2d(kernel_size=2, ceil_mode=False),
            Conv2d(in_channels=32, out_channels=32, kernel_size=5, stride=1, padding=2),
            MaxPool2d(kernel_size=2, ceil_mode=False),
            Conv2d(in_channels=32, out_channels=64, kernel_size=5, stride=1, padding=2),
            MaxPool2d(kernel_size=2, ceil_mode=False),
            Flatten(),
            Linear(1024, 64),
            Linear(64, 10)
        )
    def forward(self, x):
        x = self.model(x)
        return x
# 仅仅在 CPU 测试
# model = torch.load("net_9.pth", weights_only=False, map_location="cpu")
model = torch.load("net_9.pth", weights_only=False)
model.eval()
with torch.no_grad():
    output = model(image)
    print(output)
    print(torch.argmax(output))

torch.Size([1, 3, 32, 32])
tensor([[ 4.1824, -0.9885, 2.4846, -0.8212, 0.5465, -1.7889, -1.6802, -0.3970,
0.7540, -0.7934]])
tensor(0)

36.png

37.png

# GPU50 轮学习后

38.png

后面出现了过拟合

# 附录

简单入门了一下,后面还有很长的路要走,继续保持持续学习的动力。

相关代码已公开在 GitHub 中:https://github.com/735690757/pytorch_stu_up

Swim in the ocean of art and programming, weave the future with code art.

更新于 阅读次数

请我喝[茶]~( ̄▽ ̄)~*

KarryLiu 微信支付

微信支付

KarryLiu 支付宝

支付宝