1. Five Ways to Construct a Model
All of the examples below assume the following imports:
import torch
from torch import nn
from torch.nn import functional as F
nn.Sequential()
Sequen_net = nn.Sequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))
X = torch.rand(3, 20)
print("X: ", X)
net1 = Sequen_net(X)
print("net1: ", net1)
Output:
X: tensor([[0.6513, 0.1193, 0.2606, 0.4497, 0.2970, 0.9101, 0.1941, 0.9283, 0.5954,
0.0683, 0.6453, 0.8361, 0.5083, 0.7473, 0.4150, 0.8179, 0.3845, 0.1216,
0.7870, 0.9959],
[0.2798, 0.2982, 0.8267, 0.9409, 0.2763, 0.8739, 0.6138, 0.5610, 0.6146,
0.3400, 0.9731, 0.7196, 0.7783, 0.0356, 0.0168, 0.9358, 0.8763, 0.0903,
0.8762, 0.4948],
[0.4094, 0.7204, 0.8304, 0.4614, 0.3940, 0.4561, 0.8946, 0.5907, 0.8838,
0.1645, 0.5259, 0.9783, 0.8897, 0.3766, 0.9449, 0.3011, 0.3275, 0.5742,
0.7173, 0.3336]])
net1: tensor([[ 0.0874, 0.1452, -0.1404, -0.0427, 0.2162, -0.2174, 0.0608, 0.0526,
0.0731, -0.0225],
[ 0.0295, 0.0829, -0.1371, -0.0923, 0.1686, -0.2626, 0.1728, 0.0373,
-0.0400, 0.0174],
[-0.0100, 0.0885, -0.1002, -0.0672, 0.0749, -0.1790, 0.1430, 0.1640,
0.0572, 0.0146]], grad_fn=<AddmmBackward0>)
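nn.Sequential also supports integer indexing into its layers, which is handy for inspection; a quick sketch:
print(Sequen_net)     # lists the three layers in registration order
print(Sequen_net[2])  # Linear(in_features=256, out_features=10, bias=True)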
Custom class (subclassing nn.Module)
X = torch.rand(3, 20)
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.hidden = nn.Linear(20, 256)
        self.out = nn.Linear(256, 10)
    def forward(self, X):
        return self.out(F.relu(self.hidden(X)))
net2 = MLP()
print("net2: ", net2(X))
Output:
net2: tensor([[ 0.1871, -0.0803, 0.2287, -0.1312, 0.1852, 0.0470, -0.1633, 0.0177,
0.0817, 0.1049],
[ 0.2333, 0.0757, 0.2480, -0.0570, 0.2533, 0.0042, -0.2117, 0.0129,
0.0444, 0.0404],
[ 0.1890, 0.0458, 0.2060, -0.0098, 0.1948, 0.0225, -0.1746, -0.0491,
0.0820, 0.0435]], grad_fn=<AddmmBackward0>)
Sequential block (hand-rolled nn.Sequential)
class MySequential(nn.Module):
    def __init__(self, *args):
        super(MySequential, self).__init__()
        # module keys must be strings, so use each block's index as its key
        for idx, block in enumerate(args):
            self._modules[str(idx)] = block
    def forward(self, X):
        for block in self._modules.values():
            X = block(X)
        return X
net3 = MySequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))
print("net3: ", net3(X))
Output:
net3: tensor([[-0.0630, 0.1005, 0.1431, 0.2444, -0.0660, -0.0654, 0.0188, 0.2016,
-0.4174, -0.1706],
[ 0.0219, 0.0262, 0.1507, 0.1690, -0.0820, 0.0212, -0.1419, 0.3118,
-0.3934, -0.1119],
[-0.0213, -0.0285, 0.0478, 0.1897, -0.1706, 0.0083, -0.1190, 0.0981,
-0.3782, -0.1059]], grad_fn=<AddmmBackward0>)
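The same chaining can also be written with nn.ModuleList, which registers each block as a submodule so its parameters are tracked; a minimal sketch under the same imports (ListSequential is an illustrative name):
class ListSequential(nn.Module):
    def __init__(self, *args):
        super(ListSequential, self).__init__()
        self.blocks = nn.ModuleList(args)  # registers each block as a submodule
    def forward(self, X):
        for block in self.blocks:
            X = block(X)
        return X
net3b = ListSequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))
print(net3b(X))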
requires_grad=False
class otherMLP(nn.Module):
    def __init__(self):
        super(otherMLP, self).__init__()
        # constant weight: not a Parameter, so it is never updated during training
        self.rand_weight = torch.rand((20, 20), requires_grad=False)
        self.linear = nn.Linear(20, 20)
    def forward(self, X):
        X = self.linear(X)
        X = F.relu(torch.mm(X, self.rand_weight) + 1)
        # reuse self.linear: both calls share the same parameters
        X = self.linear(X)
        # arbitrary Python control flow is allowed inside forward
        while X.abs().sum() > 1:
            X /= 2
        return X.sum()
net4 = otherMLP()
print("net4: ", net4(X))
Output:
net4: tensor(0.0289, grad_fn=<SumBackward0>)
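Note that a plain tensor attribute like rand_weight is not saved in state_dict and does not follow net.to(device); when that matters, register_buffer is the usual alternative. A sketch (BufferMLP is an illustrative name):
class BufferMLP(nn.Module):
    def __init__(self):
        super(BufferMLP, self).__init__()
        # a buffer is saved in state_dict and moved by .to()/.cuda(),
        # but never receives gradients
        self.register_buffer('rand_weight', torch.rand(20, 20))
        self.linear = nn.Linear(20, 20)
    def forward(self, X):
        return F.relu(torch.mm(self.linear(X), self.rand_weight))
print(BufferMLP()(X).shape)  # torch.Size([3, 20])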
Mixing and matching composed blocks
class NestMLP(nn.Module):
    def __init__(self):
        super(NestMLP, self).__init__()
        self.net = nn.Sequential(nn.Linear(20, 64), nn.ReLU(),
                                 nn.Linear(64, 16), nn.ReLU())
        self.linear = nn.Linear(16, 16)
    def forward(self, X):
        return self.linear(self.net(X))
net5 = nn.Sequential(NestMLP(), nn.Linear(16, 20), otherMLP())
print("net5: ", net5(X))
Output:
net5: tensor(-0.1181, grad_fn=<SumBackward0>)
2. Parameter Management
net.state_dict()
net_paraManag = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
X_paraManag = torch.rand(size=(3, 4))
print(net_paraManag(X_paraManag))
print(net_paraManag.state_dict())
print(type(net_paraManag[2].bias))
print(net_paraManag[2].bias)
print(net_paraManag[2].bias.data)
print(net_paraManag[2].weight.grad == None)
Output:
tensor([[0.4404],
[0.3632],
[0.3589]], grad_fn=<AddmmBackward0>)
OrderedDict([('0.weight', tensor([[-3.0088e-01, 3.7643e-01, 2.2169e-01, -1.7857e-01],
[-4.6796e-01, 1.9626e-01, -1.0750e-01, -8.1731e-02],
[-4.6789e-02, -1.4509e-02, 4.1151e-02, -4.7081e-01],
[ 3.3119e-01, 1.0578e-01, 1.8293e-01, 1.6103e-01],
[-3.4791e-04, 4.7955e-01, 4.3611e-01, 3.2194e-01],
[ 7.4373e-02, 1.2937e-01, -3.0729e-01, -1.8899e-01],
[ 3.1637e-01, -2.4891e-01, 2.0294e-01, 4.1119e-01],
[-1.4292e-01, 3.6126e-01, -2.9002e-01, -1.3629e-01]])), ('0.bias', tensor([ 0.1134, -0.2226, -0.2156, -0.2025, -0.2483, -0.0088, 0.2076, -0.4190])), ('2.weight', tensor([[ 0.1526, -0.3418, 0.1399, 0.2730, -0.2132, -0.0511, 0.2570, -0.1498]])), ('2.bias', tensor([0.2878]))])
<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([0.2878], requires_grad=True)
tensor([0.2878])
True
net.named_parameters()
print(*[(name, param.shape) for name, param in net_paraManag[0].named_parameters()])
print(*[(name, param.shape) for name, param in net_paraManag.named_parameters()])
print(net_paraManag.state_dict()['2.bias'].data)
('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))
tensor([0.2878])
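parameters()/named_parameters() also make it easy to count trainable parameters; a quick sketch for the net above:
num_params = sum(p.numel() for p in net_paraManag.parameters() if p.requires_grad)
print(num_params)  # (4*8 + 8) + (8*1 + 1) = 49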
Nested blocks in a network
net.add_module
X = torch.rand(size=(3, 4))
def blk1():
    return nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 4))
def blk2():
    net_blk = nn.Sequential()
    for i in range(4):
        net_blk.add_module(f'block {i}', blk1())
    return net_blk
rg_net = nn.Sequential(blk2(), nn.Linear(4, 1))
print(rg_net(X))
print(rg_net.state_dict())
print(*[(name, param.shape) for name, param in rg_net.named_parameters()])
print(rg_net)
Output:
tensor([[-0.4122],
[-0.4122],
[-0.4122]], grad_fn=<AddmmBackward0>)
OrderedDict([('0.block 0.0.weight', tensor([[-0.4171, -0.1780, 0.0606, 0.3748],
[ 0.4168, -0.1772, -0.1909, -0.2394],
[-0.1034, -0.0757, 0.3555, -0.1779],
[ 0.1085, -0.3186, 0.0096, -0.2394],
[-0.1383, 0.4044, 0.3551, -0.3015],
[ 0.4526, -0.0596, 0.0173, -0.1155],
[ 0.3260, -0.2712, 0.1022, 0.2748],
[-0.0684, -0.3660, 0.2312, -0.1745]])), ('0.block 0.0.bias', tensor([ 0.2892, 0.0427, -0.3143, -0.3937, 0.3830, 0.4406, -0.4819, 0.2689])), ('0.block 0.2.weight', tensor([[-0.1373, 0.3090, 0.1439, -0.1540, 0.0539, -0.1121, 0.3362, 0.0686],
[-0.1190, -0.0290, 0.2045, -0.0352, -0.1836, 0.2612, -0.0020, -0.2542],
[ 0.2692, 0.1711, 0.3402, 0.1287, -0.1026, 0.1157, 0.1243, 0.0963],
[-0.1632, -0.0101, -0.2477, -0.3182, 0.3411, -0.0662, -0.2787, -0.2854]])), ('0.block 0.2.bias', tensor([ 0.1856, 0.2060, -0.2202, -0.2335])), ('0.block 1.0.weight', tensor([[-0.2876, -0.2843, 0.2626, -0.2522],
[-0.4314, -0.0281, 0.3921, 0.1997],
[-0.4291, -0.2965, 0.2171, 0.4714],
[-0.2635, 0.1402, 0.4460, 0.2259],
[-0.2485, 0.1630, 0.2658, 0.1465],
[-0.2018, 0.1986, 0.2293, -0.3614],
[-0.2771, 0.4530, -0.1518, 0.0359],
[-0.4866, 0.3631, -0.3512, -0.1767]])), ('0.block 1.0.bias', tensor([ 0.3211, -0.3084, 0.4175, -0.2714, -0.3879, 0.4064, -0.4108, 0.4524])), ('0.block 1.2.weight', tensor([[-0.1630, 0.0587, 0.0727, 0.2492, 0.0867, 0.2773, -0.3044, 0.1747],
[-0.0317, -0.1035, -0.0919, 0.2908, -0.2533, -0.1995, 0.3382, -0.3458],
[ 0.3418, 0.2402, -0.2481, 0.0028, 0.2577, 0.0366, -0.1611, 0.1437],
[-0.2612, 0.2073, 0.3012, -0.0942, -0.2812, 0.0807, 0.1528, -0.1563]])), ('0.block 1.2.bias', tensor([ 0.3424, 0.0663, -0.2212, -0.2977])), ('0.block 2.0.weight', tensor([[-0.2276, 0.4482, -0.1265, 0.4275],
[ 0.1494, -0.4260, 0.1137, -0.2846],
[ 0.3537, -0.2347, 0.1338, -0.3326],
[ 0.0920, 0.2532, -0.1029, -0.2644],
[ 0.0546, -0.1787, 0.4898, -0.0615],
[-0.3642, -0.1389, -0.1747, -0.1744],
[ 0.3488, 0.4022, -0.0566, 0.4181],
[ 0.4516, 0.3673, 0.0574, 0.0241]])), ('0.block 2.0.bias', tensor([-0.3022, 0.1639, 0.1999, -0.1497, 0.4291, -0.2849, -0.2372, 0.3733])), ('0.block 2.2.weight', tensor([[-0.1605, 0.1799, 0.0969, -0.3066, 0.0343, -0.0491, 0.2340, -0.0662],
[ 0.1513, 0.0432, -0.1792, -0.2874, 0.0522, -0.1464, 0.0801, -0.0581],
[-0.0155, 0.1103, -0.0999, 0.2740, -0.1410, -0.2216, -0.1957, -0.2005],
[ 0.0011, -0.3436, 0.0429, -0.3246, -0.1136, 0.2585, -0.3093, -0.0797]])), ('0.block 2.2.bias', tensor([ 0.1010, 0.1079, -0.3058, 0.0265])), ('0.block 3.0.weight', tensor([[ 0.2588, 0.0358, -0.3987, 0.4909],
[ 0.2737, 0.3185, 0.4886, -0.2483],
[ 0.4014, 0.4572, 0.1416, 0.1966],
[ 0.4365, 0.3307, -0.0075, 0.3789],
[-0.3204, -0.0930, 0.4388, 0.2200],
[-0.0976, -0.0429, -0.4074, 0.1534],
[ 0.3454, 0.1831, -0.0017, 0.4038],
[ 0.3662, -0.4278, -0.4763, 0.3758]])), ('0.block 3.0.bias', tensor([-0.0556, -0.4557, -0.2513, -0.2545, -0.1326, -0.3917, 0.4147, -0.0952])), ('0.block 3.2.weight', tensor([[-0.2281, 0.1095, -0.3121, -0.1222, 0.1357, 0.2677, -0.0877, -0.0851],
[-0.0383, -0.2003, -0.2518, 0.0728, 0.2119, 0.1021, 0.0172, -0.3027],
[ 0.0219, -0.1426, 0.0483, 0.1317, -0.1951, 0.2851, -0.1330, -0.1818],
[-0.2049, 0.2587, 0.0653, 0.1295, -0.0574, 0.1290, -0.0289, -0.0458]])), ('0.block 3.2.bias', tensor([ 0.3209, -0.0175, 0.1921, -0.0818])), ('1.weight', tensor([[-0.4423, -0.2640, 0.3931, 0.3123]])), ('1.bias', tensor([-0.3232]))])
('0.block 0.0.weight', torch.Size([8, 4])) ('0.block 0.0.bias', torch.Size([8])) ('0.block 0.2.weight', torch.Size([4, 8])) ('0.block 0.2.bias', torch.Size([4])) ('0.block 1.0.weight', torch.Size([8, 4])) ('0.block 1.0.bias', torch.Size([8])) ('0.block 1.2.weight', torch.Size([4, 8])) ('0.block 1.2.bias', torch.Size([4])) ('0.block 2.0.weight', torch.Size([8, 4])) ('0.block 2.0.bias', torch.Size([8])) ('0.block 2.2.weight', torch.Size([4, 8])) ('0.block 2.2.bias', torch.Size([4])) ('0.block 3.0.weight', torch.Size([8, 4])) ('0.block 3.0.bias', torch.Size([8])) ('0.block 3.2.weight', torch.Size([4, 8])) ('0.block 3.2.bias', torch.Size([4])) ('1.weight', torch.Size([1, 4])) ('1.bias', torch.Size([1]))
Sequential(
(0): Sequential(
(block 0): Sequential(
(0): Linear(in_features=4, out_features=8, bias=True)
(1): ReLU()
(2): Linear(in_features=8, out_features=4, bias=True)
)
(block 1): Sequential(
(0): Linear(in_features=4, out_features=8, bias=True)
(1): ReLU()
(2): Linear(in_features=8, out_features=4, bias=True)
)
(block 2): Sequential(
(0): Linear(in_features=4, out_features=8, bias=True)
(1): ReLU()
(2): Linear(in_features=8, out_features=4, bias=True)
)
(block 3): Sequential(
(0): Linear(in_features=4, out_features=8, bias=True)
(1): ReLU()
(2): Linear(in_features=8, out_features=4, bias=True)
)
)
(1): Linear(in_features=4, out_features=1, bias=True)
)
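Because the blocks are nested, individual layers can also be reached by chained indexing; for example, a quick sketch pulling the bias of the first layer inside 'block 1':
print(rg_net[0][1][0].bias.data)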
nn.init.normal_(m.weight, mean=0, std=0.01)
nn.init.zeros_(m.bias)
def init_normal(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, mean=0, std=0.01)
        nn.init.zeros_(m.bias)
rg_net.apply(init_normal)
print(rg_net.state_dict())
Output:
OrderedDict([('0.block 0.0.weight', tensor([[-0.0005, -0.0156, -0.0060, 0.0215],
[-0.0034, -0.0072, 0.0202, -0.0293],
[-0.0038, 0.0044, 0.0092, 0.0033],
[-0.0180, 0.0007, 0.0112, 0.0146],
[ 0.0056, 0.0016, -0.0126, 0.0023],
[ 0.0129, -0.0162, 0.0021, 0.0114],
[-0.0059, -0.0165, -0.0054, -0.0024],
[ 0.0119, -0.0080, -0.0047, 0.0144]])), ('0.block 0.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('0.block 0.2.weight', tensor([[-0.0079, 0.0108, 0.0019, 0.0021, 0.0067, -0.0080, 0.0136, -0.0100],
[ 0.0132, 0.0003, 0.0324, -0.0023, 0.0139, -0.0006, 0.0063, 0.0035],
[ 0.0104, 0.0098, 0.0100, -0.0092, -0.0110, -0.0063, -0.0102, -0.0252],
[-0.0131, 0.0164, 0.0010, 0.0028, 0.0039, 0.0144, -0.0070, 0.0112]])), ('0.block 0.2.bias', tensor([0., 0., 0., 0.])), ('0.block 1.0.weight', tensor([[-0.0065, -0.0065, -0.0049, -0.0068],
[ 0.0111, 0.0094, 0.0023, -0.0018],
[ 0.0125, 0.0005, -0.0009, 0.0083],
[ 0.0151, -0.0119, 0.0130, 0.0044],
[ 0.0059, -0.0127, 0.0008, 0.0002],
[-0.0142, 0.0148, 0.0154, -0.0192],
[ 0.0060, -0.0111, 0.0227, 0.0052],
[ 0.0026, -0.0114, 0.0009, 0.0065]])), ('0.block 1.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('0.block 1.2.weight', tensor([[-0.0177, 0.0073, -0.0108, -0.0105, 0.0122, -0.0111, -0.0058, -0.0075],
[-0.0055, 0.0120, 0.0076, -0.0040, 0.0091, -0.0071, 0.0057, -0.0131],
[-0.0011, 0.0067, 0.0130, 0.0168, -0.0164, -0.0165, 0.0149, -0.0038],
[ 0.0024, 0.0023, 0.0034, 0.0109, 0.0055, -0.0070, 0.0014, -0.0062]])), ('0.block 1.2.bias', tensor([0., 0., 0., 0.])), ('0.block 2.0.weight', tensor([[ 1.0085e-02, 8.0402e-03, -9.1719e-03, 1.3070e-02],
[-8.9510e-03, 8.2770e-03, -9.4393e-03, -1.7498e-03],
[ 1.7968e-02, 6.7528e-03, 1.2561e-02, -7.4435e-03],
[ 3.2517e-03, -1.0305e-02, 1.6995e-05, -4.5249e-03],
[-1.5941e-03, -1.1127e-02, 7.5920e-03, 3.9745e-03],
[-8.2059e-03, -4.8210e-03, -1.1469e-02, 1.9077e-02],
[ 2.3864e-03, -3.9443e-03, -1.7923e-02, -1.9211e-02],
[-1.5732e-02, 4.3284e-03, -2.9533e-03, 1.6779e-02]])), ('0.block 2.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('0.block 2.2.weight', tensor([[ 0.0130, -0.0097, -0.0064, -0.0129, -0.0056, -0.0153, -0.0075, 0.0038],
[ 0.0022, -0.0114, 0.0283, 0.0007, 0.0044, 0.0140, -0.0086, 0.0052],
[ 0.0043, -0.0071, -0.0021, -0.0050, 0.0017, -0.0125, -0.0006, -0.0131],
[-0.0109, -0.0074, -0.0005, 0.0041, -0.0061, -0.0030, 0.0152, 0.0061]])), ('0.block 2.2.bias', tensor([0., 0., 0., 0.])), ('0.block 3.0.weight', tensor([[ 0.0129, -0.0063, 0.0013, 0.0052],
[ 0.0015, 0.0039, -0.0005, 0.0026],
[ 0.0020, -0.0285, -0.0073, -0.0100],
[ 0.0063, -0.0016, 0.0011, 0.0034],
[ 0.0006, 0.0322, -0.0118, 0.0084],
[ 0.0017, -0.0090, 0.0004, 0.0121],
[ 0.0057, -0.0053, -0.0099, 0.0110],
[-0.0032, 0.0015, 0.0132, 0.0021]])), ('0.block 3.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('0.block 3.2.weight', tensor([[ 0.0053, 0.0018, -0.0010, 0.0125, 0.0173, 0.0061, -0.0098, 0.0042],
[-0.0015, 0.0005, -0.0129, -0.0032, 0.0152, -0.0107, 0.0058, -0.0132],
[ 0.0038, -0.0153, -0.0120, 0.0186, 0.0042, -0.0002, -0.0070, 0.0054],
[-0.0036, 0.0046, 0.0096, 0.0048, -0.0034, 0.0261, 0.0107, -0.0033]])), ('0.block 3.2.bias', tensor([0., 0., 0., 0.])), ('1.weight', tensor([[-0.0115, -0.0152, 0.0224, 0.0118]])), ('1.bias', tensor([0.]))])
nn.init.constant_(m.weight, 1)
def init_constant(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 1)
        nn.init.zeros_(m.bias)
rg_net.apply(init_constant)
print(rg_net.state_dict())
Output:
OrderedDict([('0.block 0.0.weight', tensor([[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.]])), ('0.block 0.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('0.block 0.2.weight', tensor([[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.]])), ('0.block 0.2.bias', tensor([0., 0., 0., 0.])), ('0.block 1.0.weight', tensor([[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.]])), ('0.block 1.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('0.block 1.2.weight', tensor([[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.]])), ('0.block 1.2.bias', tensor([0., 0., 0., 0.])), ('0.block 2.0.weight', tensor([[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.]])), ('0.block 2.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('0.block 2.2.weight', tensor([[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.]])), ('0.block 2.2.bias', tensor([0., 0., 0., 0.])), ('0.block 3.0.weight', tensor([[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.]])), ('0.block 3.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('0.block 3.2.weight', tensor([[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.]])), ('0.block 3.2.bias', tensor([0., 0., 0., 0.])), ('1.weight', tensor([[1., 1., 1., 1.]])), ('1.bias', tensor([0.]))])
nn.init.xavier_uniform_(m.weight)
def xavier(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)
def init_2(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 2)
print(rg_net[0][0])
rg_net[0][0].apply(xavier)   # Xavier init for 'block 0' only
rg_net[0][1].apply(init_2)   # constant 2 for 'block 1' only
print(rg_net[0].state_dict())
Output:
Sequential(
(0): Linear(in_features=4, out_features=8, bias=True)
(1): ReLU()
(2): Linear(in_features=8, out_features=4, bias=True)
)
OrderedDict([('block 0.0.weight', tensor([[ 0.6565, 0.4152, 0.3616, 0.0241],
[-0.2715, 0.1828, 0.0867, -0.5255],
[-0.3187, 0.1219, -0.4898, -0.5391],
[-0.0607, 0.4178, -0.1593, 0.4205],
[ 0.3720, -0.0351, 0.5027, -0.1858],
[-0.2447, -0.1904, 0.6159, -0.0746],
[-0.0756, -0.6110, 0.0704, -0.5446],
[-0.3130, -0.0007, 0.3470, -0.0409]])), ('block 0.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('block 0.2.weight', tensor([[ 0.3093, 0.6073, 0.5924, 0.2230, -0.0619, -0.5506, 0.1878, 0.4609],
[ 0.5646, -0.2433, 0.2026, -0.2918, 0.1207, -0.5572, 0.0366, 0.0012],
[ 0.4603, -0.1414, -0.1358, -0.5018, -0.1071, 0.2308, 0.2204, -0.2279],
[-0.5665, 0.1631, -0.3504, -0.1455, 0.1157, 0.5545, -0.3063, 0.3872]])), ('block 0.2.bias', tensor([0., 0., 0., 0.])), ('block 1.0.weight', tensor([[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.]])), ('block 1.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('block 1.2.weight', tensor([[2., 2., 2., 2., 2., 2., 2., 2.],
[2., 2., 2., 2., 2., 2., 2., 2.],
[2., 2., 2., 2., 2., 2., 2., 2.],
[2., 2., 2., 2., 2., 2., 2., 2.]])), ('block 1.2.bias', tensor([0., 0., 0., 0.])), ('block 2.0.weight', tensor([[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.]])), ('block 2.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('block 2.2.weight', tensor([[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.]])), ('block 2.2.bias', tensor([0., 0., 0., 0.])), ('block 3.0.weight', tensor([[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.]])), ('block 3.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('block 3.2.weight', tensor([[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.]])), ('block 3.2.bias', tensor([0., 0., 0., 0.]))])
nn.init.uniform_(m.weight, -10, 10)
def my_init(m):
    if type(m) == nn.Linear:
        print(
            "Init",
            *[(name, param.shape) for name, param in rg_net.named_parameters()][0]
        )
        nn.init.uniform_(m.weight, -10, 10)
        # keep only entries with |w| >= 5, zeroing out the rest
        m.weight.data *= m.weight.data.abs() >= 5
rg_net[0][2].apply(my_init)
print(rg_net[0].state_dict())
Output:
Init 0.block 0.0.weight torch.Size([8, 4])
Init 0.block 0.0.weight torch.Size([8, 4])
OrderedDict([('block 0.0.weight', tensor([[ 0.6565, 0.4152, 0.3616, 0.0241],
[-0.2715, 0.1828, 0.0867, -0.5255],
[-0.3187, 0.1219, -0.4898, -0.5391],
[-0.0607, 0.4178, -0.1593, 0.4205],
[ 0.3720, -0.0351, 0.5027, -0.1858],
[-0.2447, -0.1904, 0.6159, -0.0746],
[-0.0756, -0.6110, 0.0704, -0.5446],
[-0.3130, -0.0007, 0.3470, -0.0409]])), ('block 0.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('block 0.2.weight', tensor([[ 0.3093, 0.6073, 0.5924, 0.2230, -0.0619, -0.5506, 0.1878, 0.4609],
[ 0.5646, -0.2433, 0.2026, -0.2918, 0.1207, -0.5572, 0.0366, 0.0012],
[ 0.4603, -0.1414, -0.1358, -0.5018, -0.1071, 0.2308, 0.2204, -0.2279],
[-0.5665, 0.1631, -0.3504, -0.1455, 0.1157, 0.5545, -0.3063, 0.3872]])), ('block 0.2.bias', tensor([0., 0., 0., 0.])), ('block 1.0.weight', tensor([[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.]])), ('block 1.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('block 1.2.weight', tensor([[2., 2., 2., 2., 2., 2., 2., 2.],
[2., 2., 2., 2., 2., 2., 2., 2.],
[2., 2., 2., 2., 2., 2., 2., 2.],
[2., 2., 2., 2., 2., 2., 2., 2.]])), ('block 1.2.bias', tensor([0., 0., 0., 0.])), ('block 2.0.weight', tensor([[ 5.4197, -9.7842, -0.0000, 9.2256],
[-0.0000, -0.0000, 7.9126, -0.0000],
[-0.0000, 9.0827, 0.0000, -5.7510],
[-0.0000, 8.2361, -6.8953, 0.0000],
[ 5.2814, -0.0000, -7.2411, -0.0000],
[ 8.2870, -7.5425, 8.3590, -8.6748],
[ 6.4971, 7.6553, -9.7904, 8.5761],
[-5.6998, -6.8568, 0.0000, -0.0000]])), ('block 2.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('block 2.2.weight', tensor([[-7.5476, 0.0000, -5.7735, -6.1913, -0.0000, 8.8335, 0.0000, -8.3723],
[ 7.7029, -6.5640, -0.0000, -9.6283, -5.5881, -0.0000, 0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, -7.1479, 0.0000, -0.0000, -7.5409, 7.4545],
[-0.0000, 6.9711, -0.0000, 0.0000, 9.4580, -0.0000, -8.5716, 6.6650]])), ('block 2.2.bias', tensor([0., 0., 0., 0.])), ('block 3.0.weight', tensor([[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.]])), ('block 3.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('block 3.2.weight', tensor([[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.]])), ('block 3.2.bias', tensor([0., 0., 0., 0.]))])
Directly overwriting weight/bias values
rg_net[1].weight.data[:] += 1
print(rg_net[1].state_dict())
rg_net[1].weight.data[0, 0] = 42
print(rg_net[1].state_dict())
Output:
OrderedDict([('weight', tensor([[2., 2., 2., 2.]])), ('bias', tensor([0.]))])
OrderedDict([('weight', tensor([[42., 2., 2., 2.]])), ('bias', tensor([0.]))])
Parameter tying (shared parameters)
shared = nn.Linear(8, 8)
net_share = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), shared, nn.ReLU(), shared,
                          nn.ReLU(), nn.Linear(8, 1))
net_share(X)
print(net_share[2].weight.data[0] == net_share[4].weight.data[0])
net_share[2].weight.data[0, 0] = 1000
print(net_share[2].weight.data[0] == net_share[4].weight.data[0])
Output:
tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])
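Since both slots hold the same nn.Linear object, there is only one set of parameters, and gradients from both uses accumulate into it. A quick check, sketched with the objects above:
print(net_share[2] is net_share[4])  # True: literally the same module object
net_share(X).sum().backward()
print(net_share[2].weight.grad is net_share[4].weight.grad)  # True: one shared gradient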
3. Custom Layers
class CenteredLayer(nn.Module):
    def __init__(self):
        super(CenteredLayer, self).__init__()
    def forward(self, X):
        return X - X.mean()
layer = CenteredLayer()
print(layer(torch.FloatTensor([1, 2, 3, 4, 5])))
net = nn.Sequential(nn.Linear(8, 12), CenteredLayer())
Y = net(torch.rand(4, 8))
print(Y)
Output:
tensor([-2., -1., 0., 1., 2.])
tensor([[ 0.1277, -0.0479, -0.1686, 0.0222, -0.4566, 0.0111, 0.2503, 0.2069,
-0.3122, 0.3772, -0.1147, 0.0919],
[-0.0777, 0.1050, -0.1751, 0.0301, -0.5946, 0.1693, 0.1831, 0.4101,
-0.4953, 0.5953, -0.1591, 0.1412],
[ 0.1005, 0.3416, 0.2313, -0.3359, -0.8277, -0.0700, 0.0232, 0.0690,
-0.2101, 0.5695, -0.2455, 0.2663],
[-0.0245, 0.0388, -0.0631, -0.1355, -0.6780, 0.2602, 0.0541, 0.3108,
-0.4082, 0.5630, -0.1461, 0.1969]], grad_fn=<SubBackward0>)
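As a sanity check, the centered output should have mean zero up to floating-point rounding:
print(Y.mean())  # prints a value very close to 0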
nn.Parameter()
class MyLinear(nn.Module):
    def __init__(self, in_units, units):
        super(MyLinear, self).__init__()
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(torch.randn(units,))
    def forward(self, X):
        # NB: .data detaches these ops from autograd; see the variant below
        linear = torch.matmul(X, self.weight.data) + self.bias.data
        return F.relu(linear)
dense = MyLinear(5, 3)
print(dense.weight)
Output:
Parameter containing:
tensor([[ 0.7404, 0.2722, -1.0920],
[ 0.1745, -1.1313, -0.8230],
[-0.6691, -1.3758, -0.4024],
[ 0.2260, -1.0750, -1.0215],
[ 1.1589, -0.2882, -0.0932]], requires_grad=True)
print(dense(torch.rand(2, 5)))
Output:
tensor([[0.0000, 2.7653, 0.0000],
[0.0000, 0.9031, 0.0000]])
net = nn.Sequential(MyLinear(64, 8), MyLinear(8, 1))
print(net(torch.rand(3, 64)))
Output:
tensor([[3.5120],
[0.0000],
[6.2135]])
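Because the forward above multiplies by self.weight.data, the computation is detached from autograd and this MyLinear would not train. If gradients are needed, use the Parameters directly; a corrected sketch (MyLinearGrad is an illustrative name):
class MyLinearGrad(nn.Module):
    def __init__(self, in_units, units):
        super(MyLinearGrad, self).__init__()
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(torch.randn(units))
    def forward(self, X):
        # using the Parameters directly keeps the matmul in the autograd graph
        return F.relu(torch.matmul(X, self.weight) + self.bias)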
4. Reading and Writing Files (Saving Parameters)
x = torch.arange(4)
torch.save(x, './params/x_file.txt')
x2 = torch.load('./params/x_file.txt')
print(x2)
y = torch.zeros(4)
torch.save([x, y], './params/x_file2.txt')
x2, y2 = torch.load('./params/x_file2.txt')
print(x2, y2)
mydict = {'x': x, 'y': y}
torch.save(mydict, './params/mydict_file2.txt')
mydict2 = torch.load('./params/mydict_file2.txt')
print(mydict2)
Output:
tensor([0, 1, 2, 3])
tensor([0, 1, 2, 3]) tensor([0., 0., 0., 0.])
{'x': tensor([0, 1, 2, 3]), 'y': tensor([0., 0., 0., 0.])}
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.Linear1 = nn.Linear(20, 256)
        self.Linear2 = nn.Linear(256, 10)
    def forward(self, x):
        return self.Linear2(F.relu(self.Linear1(x)))
net = MLP()
X = torch.randn(size=(2, 20))
Y = net(X)
print(Y)
Output:
tensor([[ 0.2036, -0.0329, -0.1639, 0.0509, 0.3943, -0.1709, 0.2458, -0.0923,
-0.1430, 0.4085],
[ 0.1013, -0.0231, 0.2619, 0.0948, 0.1205, 0.1814, 0.2518, 0.0211,
0.0743, 0.0249]], grad_fn=<AddmmBackward0>)
torch.save(net.state_dict(), './params/mlp_params.txt')
clone.load_state_dict(torch.load('./params/mlp_params.txt'))
torch.save(net.state_dict(), './params/mlp_params.txt')
clone = MLP()
clone.load_state_dict(torch.load('./params/mlp_params.txt'))
clone.eval()
Y_clone = clone(X)
print(Y_clone == Y)
print(Y)
Output:
tensor([[True, True, True, True, True, True, True, True, True, True],
[True, True, True, True, True, True, True, True, True, True]])
tensor([[ 0.2036, -0.0329, -0.1639, 0.0509, 0.3943, -0.1709, 0.2458, -0.0923,
-0.1430, 0.4085],
[ 0.1013, -0.0231, 0.2619, 0.0948, 0.1205, 0.1814, 0.2518, 0.0211,
0.0743, 0.0249]], grad_fn=<AddmmBackward0>)
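The same state_dict mechanism extends to full training checkpoints that also capture optimizer state; a common pattern, sketched here with an illustrative SGD optimizer and file name:
trainer = torch.optim.SGD(net.parameters(), lr=0.1)  # illustrative optimizer
checkpoint = {'model': net.state_dict(), 'optimizer': trainer.state_dict()}
torch.save(checkpoint, './params/checkpoint.pt')
state = torch.load('./params/checkpoint.pt')
clone.load_state_dict(state['model'])  # optimizer state restores the same way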
5. Running on a GPU
torch.device('cpu')
torch.cuda.device('cuda')
torch.cuda.device('cuda:1')
torch.cuda.device_count()
print(torch.device('cpu'))
print(torch.cuda.device('cuda'))
print(torch.cuda.device('cuda:1'))
print(torch.cuda.device_count())
def try_gpu(i=0):
    """Return gpu(i) if it exists, otherwise return cpu()."""
    if torch.cuda.device_count() >= i + 1:
        return torch.device(f'cuda:{i}')
    return torch.device('cpu')
def try_all_gpu():
    """Return all available GPUs, or [cpu(),] if there is no GPU."""
    devices = [
        torch.device(f'cuda:{i}') for i in range(torch.cuda.device_count())
    ]
    return devices if devices else [torch.device('cpu')]
print(try_gpu())
print(try_gpu(10))
print(try_all_gpu())
x = torch.tensor([1, 2, 3])
print('x is currently on', x.device)
x1 = torch.tensor([1, 2, 3], device=try_gpu())
print(x1)
# the lines below assume at least two GPUs; otherwise try_gpu(1) falls back to cpu
x2 = torch.tensor([1, 2, 3], device=try_gpu(1))
print(x2)
z = x.cuda(1)  # copies x from CPU to GPU 1 (requires a second GPU)
print('z:', z)
print('x2+z=', x2 + z)  # operands must live on the same device
print('z.cuda(1) is z:', z.cuda(1) is z)  # True: already on GPU 1, no copy is made
net = nn.Sequential(nn.Linear(3, 1))
net = net.to(device=try_gpu(2))
# nn.Linear expects float inputs on the same device as the model,
# so build a float tensor there rather than reusing the integer x
X_dev = torch.rand(2, 3, device=try_gpu(2))
print(net(X_dev))
print(net[0].weight.data.device)
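Putting the pieces together, the usual device-agnostic pattern picks a device once, moves the model once, and moves each batch of data to the same device; a minimal sketch with stand-in data:
device = try_gpu()
model = nn.Sequential(nn.Linear(3, 1)).to(device)
X_batch = torch.rand(4, 3).to(device)  # move every batch to the model's device
y_batch = torch.rand(4, 1).to(device)
loss = ((model(X_batch) - y_batch) ** 2).mean()
loss.backward()  # gradients are computed on the same device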