1. Dropout
Motivation: a good model should be robust to perturbations of its input data.
Training on noise-injected data is equivalent to Tikhonov regularization.
Dropout: inject noise between the layers of a network.
In practice, dropout is usually applied to the outputs of hidden fully connected layers.
The drop probability is a hyperparameter that controls model complexity.
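Concretely, with drop probability p a hidden unit h is transformed as (standard formulation, spelled out here for reference):

h' = 0            with probability p
h' = h / (1 - p)  with probability 1 - p

so E[h'] = (1 - p) * h / (1 - p) = h: the scaling by 1 / (1 - p) keeps the expected output of the layer unchanged, which is also why the code below divides by keep_prob.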
2. Implementing Dropout from Scratch
from matplotlib import pyplot as plt
import torch
import torch.nn as nn
import numpy as np
import sys
sys.path.append("..")
import d2lzh_pytorch as d2l
def dropout(X, drop_prob):
    # Zero each element of X with probability drop_prob and rescale the
    # survivors by 1 / keep_prob so the expected value is unchanged.
    X = X.float()
    assert 0 <= drop_prob <= 1
    keep_prob = 1 - drop_prob
    if keep_prob == 0:  # everything is dropped
        return torch.zeros_like(X)
    # torch.rand samples uniformly from [0, 1), so each mask entry is 1
    # with probability keep_prob
    mask = (torch.rand(X.shape) < keep_prob).float()
    return mask * X / keep_prob

X = torch.arange(16).view(2, 8)
print(dropout(X, 0))    # identity: nothing dropped
print(dropout(X, 0.5))  # about half the entries zeroed, the rest doubled
print(dropout(X, 1.0))  # all zeros
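Because surviving entries are rescaled by 1 / keep_prob, each element's expectation is preserved. A quick numeric sanity check (illustrative snippet, not from the original notes):

t = torch.ones(100000)
print(dropout(t, 0.5).mean())  # close to 1.0: the doubling compensates for the zeroed half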
"""
定义模型参数
"""
num_inputs, num_outputs, num_hiddens1, num_hiddens2 = 784, 10, 256, 256
W1 = torch.tensor(np.random.normal(0, 0.01, size=(num_inputs, num_hiddens1)), dtype=torch.float, requires_grad=True)
b1 = torch.zeros(num_hiddens1, requires_grad=True, dtype=torch.float)
W2 = torch.tensor(np.random.normal(0, 0.01, size=(num_hiddens1, num_hiddens2)), dtype=torch.float, requires_grad=True)
b2 = torch.zeros(num_hiddens2, requires_grad=True, dtype=torch.float)
W3 = torch.tensor(np.random.normal(0, 0.01, size=(num_hiddens2, num_outputs)), dtype=torch.float, requires_grad=True)
b3 = torch.zeros(num_outputs, requires_grad=True, dtype=torch.float)
params = [W1, b1, W2, b2, W3, b3]
"""
定义模型
"""
drop_prob1, drop_prob2 = 0.2, 0.5

def net(X, is_training=True):
    X = X.view(-1, num_inputs)
    H1 = (torch.matmul(X, W1) + b1).relu()
    if is_training:  # apply dropout only during training
        H1 = dropout(H1, drop_prob1)  # dropout after the first fully connected layer
    H2 = (torch.matmul(H1, W2) + b2).relu()
    if is_training:
        H2 = dropout(H2, drop_prob2)  # dropout after the second fully connected layer
    return torch.matmul(H2, W3) + b3
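Dropout must be switched off at evaluation time, which is what the is_training flag is for. Below is a minimal sketch of an accuracy loop that disables it; d2lzh_pytorch's evaluate_accuracy (called inside train_ch3) should behave the same way for model functions that take an is_training argument, though its exact implementation may differ:

def evaluate_accuracy(data_iter, net):
    # Count correct predictions over data_iter with dropout disabled.
    acc_sum, n = 0.0, 0
    for X, y in data_iter:
        y_hat = net(X, is_training=False)  # forward pass without dropout
        acc_sum += (y_hat.argmax(dim=1) == y).float().sum().item()
        n += y.shape[0]
    return acc_sum / n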
"""
训练和测试模型
"""
num_epochs, lr, batch_size = 5, 100.0, 256  # lr looks large because d2l's sgd divides the update by batch_size
loss = torch.nn.CrossEntropyLoss()
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
# Passing params and lr makes train_ch3 run d2l's plain minibatch SGD on the
# hand-built parameters; the concise version below passes an optimizer instead.
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, params, lr)
3. Concise Implementation of Dropout
from matplotlib import pyplot as plt
import torch
import torch.nn as nn
import numpy as np
import sys
sys.path.append("..")
import d2lzh_pytorch as d2l
"""
定义模型参数
"""
num_inputs, num_outputs, num_hiddens1, num_hiddens2 = 784, 10, 256, 256
drop_prob1, drop_prob2 = 0.2, 0.5
net = nn.Sequential(
    d2l.FlattenLayer(),
    nn.Linear(num_inputs, num_hiddens1),
    nn.ReLU(),
    nn.Dropout(drop_prob1),  # dropout after the first hidden layer
    nn.Linear(num_hiddens1, num_hiddens2),
    nn.ReLU(),
    nn.Dropout(drop_prob2),  # dropout after the second hidden layer
    nn.Linear(num_hiddens2, num_outputs)
)
for param in net.parameters():
    nn.init.normal_(param, mean=0, std=0.01)
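Unlike the hand-written dropout above, nn.Dropout keys off the module's training mode: it is active after net.train() and acts as the identity after net.eval() (d2l's evaluation code should rely on exactly this to compute test accuracy without dropout for nn.Module models). A quick check:

X = torch.ones(2, num_inputs)
net.train()                  # dropout active: two forward passes use different random masks
print(net(X).equal(net(X)))  # almost surely False
net.eval()                   # dropout acts as the identity: the forward pass is deterministic
print(net(X).equal(net(X)))  # True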
"""
训练并测试模型
"""
num_epochs, batch_size = 5, 256
optimizer = torch.optim.SGD(net.parameters(), lr=0.5)
loss = torch.nn.CrossEntropyLoss()
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
# params and lr are None here: the torch.optim optimizer performs the updates.
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs,
              batch_size, None, None, optimizer)