1. Five Ways to Construct a Model
All of the examples below assume the following imports:
import torch
from torch import nn
from torch.nn import functional as F
nn.Sequential()
Sequen_net = nn.Sequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))
X = torch.rand(3, 20)
print("X: ", X)
net1 = Sequen_net(X)
print("net1: ", net1)
Output:
X: tensor([[0.6513, 0.1193, 0.2606, 0.4497, 0.2970, 0.9101, 0.1941, 0.9283, 0.5954,
0.0683, 0.6453, 0.8361, 0.5083, 0.7473, 0.4150, 0.8179, 0.3845, 0.1216,
0.7870, 0.9959],
[0.2798, 0.2982, 0.8267, 0.9409, 0.2763, 0.8739, 0.6138, 0.5610, 0.6146,
0.3400, 0.9731, 0.7196, 0.7783, 0.0356, 0.0168, 0.9358, 0.8763, 0.0903,
0.8762, 0.4948],
[0.4094, 0.7204, 0.8304, 0.4614, 0.3940, 0.4561, 0.8946, 0.5907, 0.8838,
0.1645, 0.5259, 0.9783, 0.8897, 0.3766, 0.9449, 0.3011, 0.3275, 0.5742,
0.7173, 0.3336]])
net1: tensor([[ 0.0874, 0.1452, -0.1404, -0.0427, 0.2162, -0.2174, 0.0608, 0.0526,
0.0731, -0.0225],
[ 0.0295, 0.0829, -0.1371, -0.0923, 0.1686, -0.2626, 0.1728, 0.0373,
-0.0400, 0.0174],
[-0.0100, 0.0885, -0.1002, -0.0672, 0.0749, -0.1790, 0.1430, 0.1640,
0.0572, 0.0146]], grad_fn=<AddmmBackward0>)
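nn.Sequential also supports integer indexing into its layers, which is handy for inspection; a quick sketch:
print(Sequen_net)     # lists the three layers in registration order
print(Sequen_net[2])  # Linear(in_features=256, out_features=10, bias=True)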
Custom class (subclassing nn.Module)
X = torch.rand(3, 20)
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.hidden = nn.Linear(20, 256)
        self.out = nn.Linear(256, 10)
    def forward(self, X):
        return self.out(F.relu(self.hidden(X)))
net2 = MLP()
print("net2: ", net2(X))
Output:
net2: tensor([[ 0.1871, -0.0803, 0.2287, -0.1312, 0.1852, 0.0470, -0.1633, 0.0177,
0.0817, 0.1049],
[ 0.2333, 0.0757, 0.2480, -0.0570, 0.2533, 0.0042, -0.2117, 0.0129,
0.0444, 0.0404],
[ 0.1890, 0.0458, 0.2060, -0.0098, 0.1948, 0.0225, -0.1746, -0.0491,
0.0820, 0.0435]], grad_fn=<AddmmBackward0>)
Sequential block (hand-rolled nn.Sequential)
class MySequential(nn.Module):
    def __init__(self, *args):
        super(MySequential, self).__init__()
        # module keys must be strings, so use each block's index as its key
        for idx, block in enumerate(args):
            self._modules[str(idx)] = block
    def forward(self, X):
        for block in self._modules.values():
            X = block(X)
        return X
net3 = MySequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))
print("net3: ", net3(X))
Output:
net3: tensor([[-0.0630, 0.1005, 0.1431, 0.2444, -0.0660, -0.0654, 0.0188, 0.2016,
-0.4174, -0.1706],
[ 0.0219, 0.0262, 0.1507, 0.1690, -0.0820, 0.0212, -0.1419, 0.3118,
-0.3934, -0.1119],
[-0.0213, -0.0285, 0.0478, 0.1897, -0.1706, 0.0083, -0.1190, 0.0981,
-0.3782, -0.1059]], grad_fn=<AddmmBackward0>)
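The same chaining can also be written with nn.ModuleList, which registers each block as a submodule so its parameters are tracked; a minimal sketch under the same imports (ListSequential is an illustrative name):
class ListSequential(nn.Module):
    def __init__(self, *args):
        super(ListSequential, self).__init__()
        self.blocks = nn.ModuleList(args)  # registers each block as a submodule
    def forward(self, X):
        for block in self.blocks:
            X = block(X)
        return X
net3b = ListSequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))
print(net3b(X))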
requires_grad=False
class otherMLP(nn.Module):
    def __init__(self):
        super(otherMLP, self).__init__()
        # constant weight: not a Parameter, so it is never updated during training
        self.rand_weight = torch.rand((20, 20), requires_grad=False)
        self.linear = nn.Linear(20, 20)
    def forward(self, X):
        X = self.linear(X)
        X = F.relu(torch.mm(X, self.rand_weight) + 1)
        # reuse self.linear: both calls share the same parameters
        X = self.linear(X)
        # arbitrary Python control flow is allowed inside forward
        while X.abs().sum() > 1:
            X /= 2
        return X.sum()
net4 = otherMLP()
print("net4: ", net4(X))
Output:
net4: tensor(0.0289, grad_fn=<SumBackward0>)
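Note that a plain tensor attribute like rand_weight is not saved in state_dict and does not follow net.to(device); when that matters, register_buffer is the usual alternative. A sketch (BufferMLP is an illustrative name):
class BufferMLP(nn.Module):
    def __init__(self):
        super(BufferMLP, self).__init__()
        # a buffer is saved in state_dict and moved by .to()/.cuda(),
        # but never receives gradients
        self.register_buffer('rand_weight', torch.rand(20, 20))
        self.linear = nn.Linear(20, 20)
    def forward(self, X):
        return F.relu(torch.mm(self.linear(X), self.rand_weight))
print(BufferMLP()(X).shape)  # torch.Size([3, 20])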
Mixing and matching composed blocks
class NestMLP(nn.Module):
    def __init__(self):
        super(NestMLP, self).__init__()
        self.net = nn.Sequential(nn.Linear(20, 64), nn.ReLU(),
                                 nn.Linear(64, 16), nn.ReLU())
        self.linear = nn.Linear(16, 16)
    def forward(self, X):
        return self.linear(self.net(X))
net5 = nn.Sequential(NestMLP(), nn.Linear(16, 20), otherMLP())
print("net5: ", net5(X))
Output:
net5: tensor(-0.1181, grad_fn=<SumBackward0>)
2. Parameter Management
net.state_dict()
net_paraManag = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
X_paraManag = torch.rand(size=(3, 4))
print(net_paraManag(X_paraManag))
print(net_paraManag.state_dict())
print(type(net_paraManag[2].bias))
print(net_paraManag[2].bias)
print(net_paraManag[2].bias.data)
print(net_paraManag[2].weight.grad == None)
Output:
tensor([[0.4404],
[0.3632],
[0.3589]], grad_fn=<AddmmBackward0>)
OrderedDict([('0.weight', tensor([[-3.0088e-01, 3.7643e-01, 2.2169e-01, -1.7857e-01],
[-4.6796e-01, 1.9626e-01, -1.0750e-01, -8.1731e-02],
[-4.6789e-02, -1.4509e-02, 4.1151e-02, -4.7081e-01],
[ 3.3119e-01, 1.0578e-01, 1.8293e-01, 1.6103e-01],
[-3.4791e-04, 4.7955e-01, 4.3611e-01, 3.2194e-01],
[ 7.4373e-02, 1.2937e-01, -3.0729e-01, -1.8899e-01],
[ 3.1637e-01, -2.4891e-01, 2.0294e-01, 4.1119e-01],
[-1.4292e-01, 3.6126e-01, -2.9002e-01, -1.3629e-01]])), ('0.bias', tensor([ 0.1134, -0.2226, -0.2156, -0.2025, -0.2483, -0.0088, 0.2076, -0.4190])), ('2.weight', tensor([[ 0.1526, -0.3418, 0.1399, 0.2730, -0.2132, -0.0511, 0.2570, -0.1498]])), ('2.bias', tensor([0.2878]))])
<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([0.2878], requires_grad=True)
tensor([0.2878])
True
net.named_parameters()
print(*[(name, param.shape) for name, param in net_paraManag[0].named_parameters()])
print(*[(name, param.shape) for name, param in net_paraManag.named_parameters()])
print(net_paraManag.state_dict()['2.bias'].data)
('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))
tensor([0.2878])
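parameters()/named_parameters() also make it easy to count trainable parameters; a quick sketch for the net above:
num_params = sum(p.numel() for p in net_paraManag.parameters() if p.requires_grad)
print(num_params)  # (4*8 + 8) + (8*1 + 1) = 49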
Nested blocks in a network
net.add_module
X = torch.rand(size=(3, 4))
def blk1():
    return nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 4))
def blk2():
    net_blk = nn.Sequential()
    for i in range(4):
        net_blk.add_module(f'block {i}', blk1())
    return net_blk
rg_net = nn.Sequential(blk2(), nn.Linear(4, 1))
print(rg_net(X))
print(rg_net.state_dict())
print(*[(name, param.shape) for name, param in rg_net.named_parameters()])
print(rg_net)
Output:
tensor([[-0.4122],
[-0.4122],
[-0.4122]], grad_fn=<AddmmBackward0>)
OrderedDict([('0.block 0.0.weight', tensor([[-0.4171, -0.1780, 0.0606, 0.3748],
[ 0.4168, -0.1772, -0.1909, -0.2394],
[-0.1034, -0.0757, 0.3555, -0.1779],
[ 0.1085, -0.3186, 0.0096, -0.2394],
[-0.1383, 0.4044, 0.3551, -0.3015],
[ 0.4526, -0.0596, 0.0173, -0.1155],
[ 0.3260, -0.2712, 0.1022, 0.2748],
[-0.0684, -0.3660, 0.2312, -0.1745]])), ('0.block 0.0.bias', tensor([ 0.2892, 0.0427, -0.3143, -0.3937, 0.3830, 0.4406, -0.4819, 0.2689])), ('0.block 0.2.weight', tensor([[-0.1373, 0.3090, 0.1439, -0.1540, 0.0539, -0.1121, 0.3362, 0.0686],
[-0.1190, -0.0290, 0.2045, -0.0352, -0.1836, 0.2612, -0.0020, -0.2542],
[ 0.2692, 0.1711, 0.3402, 0.1287, -0.1026, 0.1157, 0.1243, 0.0963],
[-0.1632, -0.0101, -0.2477, -0.3182, 0.3411, -0.0662, -0.2787, -0.2854]])), ('0.block 0.2.bias', tensor([ 0.1856, 0.2060, -0.2202, -0.2335])), ('0.block 1.0.weight', tensor([[-0.2876, -0.2843, 0.2626, -0.2522],
[-0.4314, -0.0281, 0.3921, 0.1997],
[-0.4291, -0.2965, 0.2171, 0.4714],
[-0.2635, 0.1402, 0.4460, 0.2259],
[-0.2485, 0.1630, 0.2658, 0.1465],
[-0.2018, 0.1986, 0.2293, -0.3614],
[-0.2771, 0.4530, -0.1518, 0.0359],
[-0.4866, 0.3631, -0.3512, -0.1767]])), ('0.block 1.0.bias', tensor([ 0.3211, -0.3084, 0.4175, -0.2714, -0.3879, 0.4064, -0.4108, 0.4524])), ('0.block 1.2.weight', tensor([[-0.1630, 0.0587, 0.0727, 0.2492, 0.0867, 0.2773, -0.3044, 0.1747],
[-0.0317, -0.1035, -0.0919, 0.2908, -0.2533, -0.1995, 0.3382, -0.3458],
[ 0.3418, 0.2402, -0.2481, 0.0028, 0.2577, 0.0366, -0.1611, 0.1437],
[-0.2612, 0.2073, 0.3012, -0.0942, -0.2812, 0.0807, 0.1528, -0.1563]])), ('0.block 1.2.bias', tensor([ 0.3424, 0.0663, -0.2212, -0.2977])), ('0.block 2.0.weight', tensor([[-0.2276, 0.4482, -0.1265, 0.4275],
[ 0.1494, -0.4260, 0.1137, -0.2846],
[ 0.3537, -0.2347, 0.1338, -0.3326],
[ 0.0920, 0.2532, -0.1029, -0.2644],
[ 0.0546, -0.1787, 0.4898, -0.0615],
[-0.3642, -0.1389, -0.1747, -0.1744],
[ 0.3488, 0.4022, -0.0566, 0.4181],
[ 0.4516, 0.3673, 0.0574, 0.0241]])), ('0.block 2.0.bias', tensor([-0.3022, 0.1639, 0.1999, -0.1497, 0.4291, -0.2849, -0.2372, 0.3733])), ('0.block 2.2.weight', tensor([[-0.1605, 0.1799, 0.0969, -0.3066, 0.0343, -0.0491, 0.2340, -0.0662],
[ 0.1513, 0.0432, -0.1792, -0.2874, 0.0522, -0.1464, 0.0801, -0.0581],
[-0.0155, 0.1103, -0.0999, 0.2740, -0.1410, -0.2216, -0.1957, -0.2005],
[ 0.0011, -0.3436, 0.0429, -0.3246, -0.1136, 0.2585, -0.3093, -0.0797]])), ('0.block 2.2.bias', tensor([ 0.1010, 0.1079, -0.3058, 0.0265])), ('0.block 3.0.weight', tensor([[ 0.2588, 0.0358, -0.3987, 0.4909],
[ 0.2737, 0.3185, 0.4886, -0.2483],
[ 0.4014, 0.4572, 0.1416, 0.1966],
[ 0.4365, 0.3307, -0.0075, 0.3789],
[-0.3204, -0.0930, 0.4388, 0.2200],
[-0.0976, -0.0429, -0.4074, 0.1534],
[ 0.3454, 0.1831, -0.0017, 0.4038],
[ 0.3662, -0.4278, -0.4763, 0.3758]])), ('0.block 3.0.bias', tensor([-0.0556, -0.4557, -0.2513, -0.2545, -0.1326, -0.3917, 0.4147, -0.0952])), ('0.block 3.2.weight', tensor([[-0.2281, 0.1095, -0.3121, -0.1222, 0.1357, 0.2677, -0.0877, -0.0851],
[-0.0383, -0.2003, -0.2518, 0.0728, 0.2119, 0.1021, 0.0172, -0.3027],
[ 0.0219, -0.1426, 0.0483, 0.1317, -0.1951, 0.2851, -0.1330, -0.1818],
[-0.2049, 0.2587, 0.0653, 0.1295, -0.0574, 0.1290, -0.0289, -0.0458]])), ('0.block 3.2.bias', tensor([ 0.3209, -0.0175, 0.1921, -0.0818])), ('1.weight', tensor([[-0.4423, -0.2640, 0.3931, 0.3123]])), ('1.bias', tensor([-0.3232]))])
('0.block 0.0.weight', torch.Size([8, 4])) ('0.block 0.0.bias', torch.Size([8])) ('0.block 0.2.weight', torch.Size([4, 8])) ('0.block 0.2.bias', torch.Size([4])) ('0.block 1.0.weight', torch.Size([8, 4])) ('0.block 1.0.bias', torch.Size([8])) ('0.block 1.2.weight', torch.Size([4, 8])) ('0.block 1.2.bias', torch.Size([4])) ('0.block 2.0.weight', torch.Size([8, 4])) ('0.block 2.0.bias', torch.Size([8])) ('0.block 2.2.weight', torch.Size([4, 8])) ('0.block 2.2.bias', torch.Size([4])) ('0.block 3.0.weight', torch.Size([8, 4])) ('0.block 3.0.bias', torch.Size([8])) ('0.block 3.2.weight', torch.Size([4, 8])) ('0.block 3.2.bias', torch.Size([4])) ('1.weight', torch.Size([1, 4])) ('1.bias', torch.Size([1]))
Sequential(
(0): Sequential(
(block 0): Sequential(
(0): Linear(in_features=4, out_features=8, bias=True)
(1): ReLU()
(2): Linear(in_features=8, out_features=4, bias=True)
)
(block 1): Sequential(
(0): Linear(in_features=4, out_features=8, bias=True)
(1): ReLU()
(2): Linear(in_features=8, out_features=4, bias=True)
)
(block 2): Sequential(
(0): Linear(in_features=4, out_features=8, bias=True)
(1): ReLU()
(2): Linear(in_features=8, out_features=4, bias=True)
)
(block 3): Sequential(
(0): Linear(in_features=4, out_features=8, bias=True)
(1): ReLU()
(2): Linear(in_features=8, out_features=4, bias=True)
)
)
(1): Linear(in_features=4, out_features=1, bias=True)
)
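Because the blocks are nested, individual layers can also be reached by chained indexing; for example, a quick sketch pulling the bias of the first layer inside 'block 1':
print(rg_net[0][1][0].bias.data)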
nn.init.normal_(m.weight, mean=0, std=0.01)
nn.init.zeros_(m.bias)
def init_normal(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, mean=0, std=0.01)
        nn.init.zeros_(m.bias)
rg_net.apply(init_normal)
print(rg_net.state_dict())
Output:
OrderedDict([('0.block 0.0.weight', tensor([[-0.0005, -0.0156, -0.0060, 0.0215],
[-0.0034, -0.0072, 0.0202, -0.0293],
[-0.0038, 0.0044, 0.0092, 0.0033],
[-0.0180, 0.0007, 0.0112, 0.0146],
[ 0.0056, 0.0016, -0.0126, 0.0023],
[ 0.0129, -0.0162, 0.0021, 0.0114],
[-0.0059, -0.0165, -0.0054, -0.0024],
[ 0.0119, -0.0080, -0.0047, 0.0144]])), ('0.block 0.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('0.block 0.2.weight', tensor([[-0.0079, 0.0108, 0.0019, 0.0021, 0.0067, -0.0080, 0.0136, -0.0100],
[ 0.0132, 0.0003, 0.0324, -0.0023, 0.0139, -0.0006, 0.0063, 0.0035],
[ 0.0104, 0.0098, 0.0100, -0.0092, -0.0110, -0.0063, -0.0102, -0.0252],
[-0.0131, 0.0164, 0.0010, 0.0028, 0.0039, 0.0144, -0.0070, 0.0112]])), ('0.block 0.2.bias', tensor([0., 0., 0., 0.])), ('0.block 1.0.weight', tensor([[-0.0065, -0.0065, -0.0049, -0.0068],
[ 0.0111, 0.0094, 0.0023, -0.0018],
[ 0.0125, 0.0005, -0.0009, 0.0083],
[ 0.0151, -0.0119, 0.0130, 0.0044],
[ 0.0059, -0.0127, 0.0008, 0.0002],
[-0.0142, 0.0148, 0.0154, -0.0192],
[ 0.0060, -0.0111, 0.0227, 0.0052],
[ 0.0026, -0.0114, 0.0009, 0.0065]])), ('0.block 1.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('0.block 1.2.weight', tensor([[-0.0177, 0.0073, -0.0108, -0.0105, 0.0122, -0.0111, -0.0058, -0.0075],
[-0.0055, 0.0120, 0.0076, -0.0040, 0.0091, -0.0071, 0.0057, -0.0131],
[-0.0011, 0.0067, 0.0130, 0.0168, -0.0164, -0.0165, 0.0149, -0.0038],
[ 0.0024, 0.0023, 0.0034, 0.0109, 0.0055, -0.0070, 0.0014, -0.0062]])), ('0.block 1.2.bias', tensor([0., 0., 0., 0.])), ('0.block 2.0.weight', tensor([[ 1.0085e-02, 8.0402e-03, -9.1719e-03, 1.3070e-02],
[-8.9510e-03, 8.2770e-03, -9.4393e-03, -1.7498e-03],
[ 1.7968e-02, 6.7528e-03, 1.2561e-02, -7.4435e-03],
[ 3.2517e-03, -1.0305e-02, 1.6995e-05, -4.5249e-03],
[-1.5941e-03, -1.1127e-02, 7.5920e-03, 3.9745e-03],
[-8.2059e-03, -4.8210e-03, -1.1469e-02, 1.9077e-02],
[ 2.3864e-03, -3.9443e-03, -1.7923e-02, -1.9211e-02],
[-1.5732e-02, 4.3284e-03, -2.9533e-03, 1.6779e-02]])), ('0.block 2.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('0.block 2.2.weight', tensor([[ 0.0130, -0.0097, -0.0064, -0.0129, -0.0056, -0.0153, -0.0075, 0.0038],
[ 0.0022, -0.0114, 0.0283, 0.0007, 0.0044, 0.0140, -0.0086, 0.0052],
[ 0.0043, -0.0071, -0.0021, -0.0050, 0.0017, -0.0125, -0.0006, -0.0131],
[-0.0109, -0.0074, -0.0005, 0.0041, -0.0061, -0.0030, 0.0152, 0.0061]])), ('0.block 2.2.bias', tensor([0., 0., 0., 0.])), ('0.block 3.0.weight', tensor([[ 0.0129, -0.0063, 0.0013, 0.0052],
[ 0.0015, 0.0039, -0.0005, 0.0026],
[ 0.0020, -0.0285, -0.0073, -0.0100],
[ 0.0063, -0.0016, 0.0011, 0.0034],
[ 0.0006, 0.0322, -0.0118, 0.0084],
[ 0.0017, -0.0090, 0.0004, 0.0121],
[ 0.0057, -0.0053, -0.0099, 0.0110],
[-0.0032, 0.0015, 0.0132, 0.0021]])), ('0.block 3.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('0.block 3.2.weight', tensor([[ 0.0053, 0.0018, -0.0010, 0.0125, 0.0173, 0.0061, -0.0098, 0.0042],
[-0.0015, 0.0005, -0.0129, -0.0032, 0.0152, -0.0107, 0.0058, -0.0132],
[ 0.0038, -0.0153, -0.0120, 0.0186, 0.0042, -0.0002, -0.0070, 0.0054],
[-0.0036, 0.0046, 0.0096, 0.0048, -0.0034, 0.0261, 0.0107, -0.0033]])), ('0.block 3.2.bias', tensor([0., 0., 0., 0.])), ('1.weight', tensor([[-0.0115, -0.0152, 0.0224, 0.0118]])), ('1.bias', tensor([0.]))])
nn.init.constant_(m.weight, 1)
def init_constant(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 1)
        nn.init.zeros_(m.bias)
rg_net.apply(init_constant)
print(rg_net.state_dict())
Output:
OrderedDict([('0.block 0.0.weight', tensor([[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.]])), ('0.block 0.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('0.block 0.2.weight', tensor([[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.]])), ('0.block 0.2.bias', tensor([0., 0., 0., 0.])), ('0.block 1.0.weight', tensor([[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.]])), ('0.block 1.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('0.block 1.2.weight', tensor([[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.]])), ('0.block 1.2.bias', tensor([0., 0., 0., 0.])), ('0.block 2.0.weight', tensor([[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.]])), ('0.block 2.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('0.block 2.2.weight', tensor([[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.]])), ('0.block 2.2.bias', tensor([0., 0., 0., 0.])), ('0.block 3.0.weight', tensor([[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.]])), ('0.block 3.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('0.block 3.2.weight', tensor([[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.]])), ('0.block 3.2.bias', tensor([0., 0., 0., 0.])), ('1.weight', tensor([[1., 1., 1., 1.]])), ('1.bias', tensor([0.]))])
nn.init.xavier_uniform_(m.weight)
def xavier(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)
def init_2(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 2)
print(rg_net[0][0])
rg_net[0][0].apply(xavier)   # Xavier init for 'block 0' only
rg_net[0][1].apply(init_2)   # constant 2 for 'block 1' only
print(rg_net[0].state_dict())
Output:
Sequential(
(0): Linear(in_features=4, out_features=8, bias=True)
(1): ReLU()
(2): Linear(in_features=8, out_features=4, bias=True)
)
OrderedDict([('block 0.0.weight', tensor([[ 0.6565, 0.4152, 0.3616, 0.0241],
[-0.2715, 0.1828, 0.0867, -0.5255],
[-0.3187, 0.1219, -0.4898, -0.5391],
[-0.0607, 0.4178, -0.1593, 0.4205],
[ 0.3720, -0.0351, 0.5027, -0.1858],
[-0.2447, -0.1904, 0.6159, -0.0746],
[-0.0756, -0.6110, 0.0704, -0.5446],
[-0.3130, -0.0007, 0.3470, -0.0409]])), ('block 0.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('block 0.2.weight', tensor([[ 0.3093, 0.6073, 0.5924, 0.2230, -0.0619, -0.5506, 0.1878, 0.4609],
[ 0.5646, -0.2433, 0.2026, -0.2918, 0.1207, -0.5572, 0.0366, 0.0012],
[ 0.4603, -0.1414, -0.1358, -0.5018, -0.1071, 0.2308, 0.2204, -0.2279],
[-0.5665, 0.1631, -0.3504, -0.1455, 0.1157, 0.5545, -0.3063, 0.3872]])), ('block 0.2.bias', tensor([0., 0., 0., 0.])), ('block 1.0.weight', tensor([[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.]])), ('block 1.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('block 1.2.weight', tensor([[2., 2., 2., 2., 2., 2., 2., 2.],
[2., 2., 2., 2., 2., 2., 2., 2.],
[2., 2., 2., 2., 2., 2., 2., 2.],
[2., 2., 2., 2., 2., 2., 2., 2.]])), ('block 1.2.bias', tensor([0., 0., 0., 0.])), ('block 2.0.weight', tensor([[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.]])), ('block 2.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('block 2.2.weight', tensor([[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.]])), ('block 2.2.bias', tensor([0., 0., 0., 0.])), ('block 3.0.weight', tensor([[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.]])), ('block 3.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('block 3.2.weight', tensor([[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.]])), ('block 3.2.bias', tensor([0., 0., 0., 0.]))])
nn.init.uniform_(m.weight, -10, 10)
def my_init(m):
    if type(m) == nn.Linear:
        print(
            "Init",
            *[(name, param.shape) for name, param in rg_net.named_parameters()][0]
        )
        nn.init.uniform_(m.weight, -10, 10)
        # keep only entries with |w| >= 5, zeroing out the rest
        m.weight.data *= m.weight.data.abs() >= 5
rg_net[0][2].apply(my_init)
print(rg_net[0].state_dict())
Output:
Init 0.block 0.0.weight torch.Size([8, 4])
Init 0.block 0.0.weight torch.Size([8, 4])
OrderedDict([('block 0.0.weight', tensor([[ 0.6565, 0.4152, 0.3616, 0.0241],
[-0.2715, 0.1828, 0.0867, -0.5255],
[-0.3187, 0.1219, -0.4898, -0.5391],
[-0.0607, 0.4178, -0.1593, 0.4205],
[ 0.3720, -0.0351, 0.5027, -0.1858],
[-0.2447, -0.1904, 0.6159, -0.0746],
[-0.0756, -0.6110, 0.0704, -0.5446],
[-0.3130, -0.0007, 0.3470, -0.0409]])), ('block 0.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('block 0.2.weight', tensor([[ 0.3093, 0.6073, 0.5924, 0.2230, -0.0619, -0.5506, 0.1878, 0.4609],
[ 0.5646, -0.2433, 0.2026, -0.2918, 0.1207, -0.5572, 0.0366, 0.0012],
[ 0.4603, -0.1414, -0.1358, -0.5018, -0.1071, 0.2308, 0.2204, -0.2279],
[-0.5665, 0.1631, -0.3504, -0.1455, 0.1157, 0.5545, -0.3063, 0.3872]])), ('block 0.2.bias', tensor([0., 0., 0., 0.])), ('block 1.0.weight', tensor([[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.],
[2., 2., 2., 2.]])), ('block 1.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('block 1.2.weight', tensor([[2., 2., 2., 2., 2., 2., 2., 2.],
[2., 2., 2., 2., 2., 2., 2., 2.],
[2., 2., 2., 2., 2., 2., 2., 2.],
[2., 2., 2., 2., 2., 2., 2., 2.]])), ('block 1.2.bias', tensor([0., 0., 0., 0.])), ('block 2.0.weight', tensor([[ 5.4197, -9.7842, -0.0000, 9.2256],
[-0.0000, -0.0000, 7.9126, -0.0000],
[-0.0000, 9.0827, 0.0000, -5.7510],
[-0.0000, 8.2361, -6.8953, 0.0000],
[ 5.2814, -0.0000, -7.2411, -0.0000],
[ 8.2870, -7.5425, 8.3590, -8.6748],
[ 6.4971, 7.6553, -9.7904, 8.5761],
[-5.6998, -6.8568, 0.0000, -0.0000]])), ('block 2.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('block 2.2.weight', tensor([[-7.5476, 0.0000, -5.7735, -6.1913, -0.0000, 8.8335, 0.0000, -8.3723],
[ 7.7029, -6.5640, -0.0000, -9.6283, -5.5881, -0.0000, 0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, -7.1479, 0.0000, -0.0000, -7.5409, 7.4545],
[-0.0000, 6.9711, -0.0000, 0.0000, 9.4580, -0.0000, -8.5716, 6.6650]])), ('block 2.2.bias', tensor([0., 0., 0., 0.])), ('block 3.0.weight', tensor([[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.]])), ('block 3.0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])), ('block 3.2.weight', tensor([[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1., 1., 1.]])), ('block 3.2.bias', tensor([0., 0., 0., 0.]))])
Directly overwriting weight/bias values
rg_net[1].weight.data[:] += 1
print(rg_net[1].state_dict())
rg_net[1].weight.data[0, 0] = 42
print(rg_net[1].state_dict())
Output:
OrderedDict([('weight', tensor([[2., 2., 2., 2.]])), ('bias', tensor([0.]))])
OrderedDict([('weight', tensor([[42., 2., 2., 2.]])), ('bias', tensor([0.]))])
Parameter tying (shared parameters)
shared = nn.Linear(8, 8)
net_share = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), shared, nn.ReLU(), shared,
                          nn.ReLU(), nn.Linear(8, 1))
net_share(X)
print(net_share[2].weight.data[0] == net_share[4].weight.data[0])
net_share[2].weight.data[0, 0] = 1000
print(net_share[2].weight.data[0] == net_share[4].weight.data[0])
Output:
tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])
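Since both slots hold the same nn.Linear object, there is only one set of parameters, and gradients from both uses accumulate into it. A quick check, sketched with the objects above:
print(net_share[2] is net_share[4])  # True: literally the same module object
net_share(X).sum().backward()
print(net_share[2].weight.grad is net_share[4].weight.grad)  # True: one shared gradient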
3. Custom Layers
class CenteredLayer(nn.Module):
    def __init__(self):
        super(CenteredLayer, self).__init__()
    def forward(self, X):
        return X - X.mean()
layer = CenteredLayer()
print(layer(torch.FloatTensor([1, 2, 3, 4, 5])))
net = nn.Sequential(nn.Linear(8, 12), CenteredLayer())
Y = net(torch.rand(4, 8))
print(Y)
Output:
tensor([-2., -1., 0., 1., 2.])
tensor([[ 0.1277, -0.0479, -0.1686, 0.0222, -0.4566, 0.0111, 0.2503, 0.2069,
-0.3122, 0.3772, -0.1147, 0.0919],
[-0.0777, 0.1050, -0.1751, 0.0301, -0.5946, 0.1693, 0.1831, 0.4101,
-0.4953, 0.5953, -0.1591, 0.1412],
[ 0.1005, 0.3416, 0.2313, -0.3359, -0.8277, -0.0700, 0.0232, 0.0690,
-0.2101, 0.5695, -0.2455, 0.2663],
[-0.0245, 0.0388, -0.0631, -0.1355, -0.6780, 0.2602, 0.0541, 0.3108,
-0.4082, 0.5630, -0.1461, 0.1969]], grad_fn=<SubBackward0>)
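As a sanity check, the centered output should have mean zero up to floating-point rounding:
print(Y.mean())  # prints a value very close to 0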
nn.Parameter()
class MyLinear(nn.Module):
    def __init__(self, in_units, units):
        super(MyLinear, self).__init__()
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(torch.randn(units,))
    def forward(self, X):
        # NB: .data detaches these ops from autograd; see the variant below
        linear = torch.matmul(X, self.weight.data) + self.bias.data
        return F.relu(linear)
dense = MyLinear(5, 3)
print(dense.weight)
Output:
Parameter containing:
tensor([[ 0.7404, 0.2722, -1.0920],
[ 0.1745, -1.1313, -0.8230],
[-0.6691, -1.3758, -0.4024],
[ 0.2260, -1.0750, -1.0215],
[ 1.1589, -0.2882, -0.0932]], requires_grad=True)
print(dense(torch.rand(2, 5)))
Output:
tensor([[0.0000, 2.7653, 0.0000],
[0.0000, 0.9031, 0.0000]])
net = nn.Sequential(MyLinear(64, 8), MyLinear(8, 1))
print(net(torch.rand(3, 64)))
Output:
tensor([[3.5120],
[0.0000],
[6.2135]])
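Because the forward above multiplies by self.weight.data, the computation is detached from autograd and this MyLinear would not train. If gradients are needed, use the Parameters directly; a corrected sketch (MyLinearGrad is an illustrative name):
class MyLinearGrad(nn.Module):
    def __init__(self, in_units, units):
        super(MyLinearGrad, self).__init__()
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(torch.randn(units))
    def forward(self, X):
        # using the Parameters directly keeps the matmul in the autograd graph
        return F.relu(torch.matmul(X, self.weight) + self.bias)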
4. Reading and Writing Files (Saving Parameters)
x = torch.arange(4)
torch.save(x, './params/x_file.txt')
x2 = torch.load('./params/x_file.txt')
print(x2)
y = torch.zeros(4)
torch.save([x, y], './params/x_file2.txt')
x2, y2 = torch.load('./params/x_file2.txt')
print(x2, y2)
mydict = {'x': x, 'y': y}
torch.save(mydict, './params/mydict_file2.txt')
mydict2 = torch.load('./params/mydict_file2.txt')
print(mydict2)
Output:
tensor([0, 1, 2, 3])
tensor([0, 1, 2, 3]) tensor([0., 0., 0., 0.])
{'x': tensor([0, 1, 2, 3]), 'y': tensor([0., 0., 0., 0.])}
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.Linear1 = nn.Linear(20, 256)
        self.Linear2 = nn.Linear(256, 10)
    def forward(self, x):
        return self.Linear2(F.relu(self.Linear1(x)))
net = MLP()
X = torch.randn(size=(2, 20))
Y = net(X)
print(Y)
Output:
tensor([[ 0.2036, -0.0329, -0.1639, 0.0509, 0.3943, -0.1709, 0.2458, -0.0923,
-0.1430, 0.4085],
[ 0.1013, -0.0231, 0.2619, 0.0948, 0.1205, 0.1814, 0.2518, 0.0211,
0.0743, 0.0249]], grad_fn=<AddmmBackward0>)
torch.save(net.state_dict(), './params/mlp_params.txt')
clone.load_state_dict(torch.load('./params/mlp_params.txt'))
torch.save(net.state_dict(), './params/mlp_params.txt')
clone = MLP()
clone.load_state_dict(torch.load('./params/mlp_params.txt'))
clone.eval()
Y_clone = clone(X)
print(Y_clone == Y)
print(Y)
Output:
tensor([[True, True, True, True, True, True, True, True, True, True],
[True, True, True, True, True, True, True, True, True, True]])
tensor([[ 0.2036, -0.0329, -0.1639, 0.0509, 0.3943, -0.1709, 0.2458, -0.0923,
-0.1430, 0.4085],
[ 0.1013, -0.0231, 0.2619, 0.0948, 0.1205, 0.1814, 0.2518, 0.0211,
0.0743, 0.0249]], grad_fn=<AddmmBackward0>)
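The same state_dict mechanism extends to full training checkpoints that also capture optimizer state; a common pattern, sketched here with an illustrative SGD optimizer and file name:
trainer = torch.optim.SGD(net.parameters(), lr=0.1)  # illustrative optimizer
checkpoint = {'model': net.state_dict(), 'optimizer': trainer.state_dict()}
torch.save(checkpoint, './params/checkpoint.pt')
state = torch.load('./params/checkpoint.pt')
clone.load_state_dict(state['model'])  # optimizer state restores the same way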
5. Running on a GPU
torch.device('cpu')
torch.cuda.device('cuda')
torch.cuda.device('cuda:1')
torch.cuda.device_count()
print(torch.device('cpu'))
print(torch.cuda.device('cuda'))
print(torch.cuda.device('cuda:1'))
print(torch.cuda.device_count())
def try_gpu(i=0):
    """Return gpu(i) if it exists, otherwise return cpu()."""
    if torch.cuda.device_count() >= i + 1:
        return torch.device(f'cuda:{i}')
    return torch.device('cpu')
def try_all_gpu():
    """Return all available GPUs, or [cpu(),] if there is no GPU."""
    devices = [
        torch.device(f'cuda:{i}') for i in range(torch.cuda.device_count())
    ]
    return devices if devices else [torch.device('cpu')]
print(try_gpu())
print(try_gpu(10))
print(try_all_gpu())
x = torch.tensor([1, 2, 3])
print('x is currently on', x.device)
x1 = torch.tensor([1, 2, 3], device=try_gpu())
print(x1)
# the lines below assume at least two GPUs; otherwise try_gpu(1) falls back to cpu
x2 = torch.tensor([1, 2, 3], device=try_gpu(1))
print(x2)
z = x.cuda(1)  # copies x from CPU to GPU 1 (requires a second GPU)
print('z:', z)
print('x2+z=', x2 + z)  # operands must live on the same device
print('z.cuda(1) is z:', z.cuda(1) is z)  # True: already on GPU 1, no copy is made
net = nn.Sequential(nn.Linear(3, 1))
net = net.to(device=try_gpu(2))
# nn.Linear expects float inputs on the same device as the model,
# so build a float tensor there rather than reusing the integer x
X_dev = torch.rand(2, 3, device=try_gpu(2))
print(net(X_dev))
print(net[0].weight.data.device)
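Putting the pieces together, the usual device-agnostic pattern picks a device once, moves the model once, and moves each batch of data to the same device; a minimal sketch with stand-in data:
device = try_gpu()
model = nn.Sequential(nn.Linear(3, 1)).to(device)
X_batch = torch.rand(4, 3).to(device)  # move every batch to the model's device
y_batch = torch.rand(4, 1).to(device)
loss = ((model(X_batch) - y_batch) ** 2).mean()
loss.backward()  # gradients are computed on the same device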