dropout_layers.py
Contains the dropout forward and backward passes, plus the composite affine-ReLU-dropout layers.
import sys, os
sys.path.append(os.path.realpath(os.path.dirname(os.path.realpath(__file__))))
import numpy as np
from layers import *
def dropout_forward(x, dropout_param):
    """
    Perform the dropout forward pass (inverted dropout).
    Inputs:
    - x: input data
    - dropout_param: dict with the following keys:
      - p: dropout parameter; the probability of keeping each neuron active
      - mode: 'test' or 'train'. In train mode dropout is applied; in test mode
        the input is simply returned.
      - seed: seed for the random number generator.
    Outputs:
    - out: same shape as the input data
    - cache: tuple (dropout_param, mask).
      In train mode, mask is the dropout mask applied to this layer's neurons;
      in test mode no mask is used.
    """
    p, mode = dropout_param['p'], dropout_param['mode']
    if 'seed' in dropout_param:
        np.random.seed(dropout_param['seed'])
    mask = None
    out = None
    if mode == 'train':
        # Inverted dropout: scale by 1/p at train time so no scaling is needed at test time
        mask = (np.random.rand(*x.shape) < p) / p
        out = x * mask
    elif mode == 'test':
        out = x
    cache = (dropout_param, mask)
    out = out.astype(x.dtype, copy=False)
    return out, cache
def dropout_backward(dout, cache):
    """
    Dropout backward pass.
    Inputs:
    - dout: upstream gradient
    - cache: (dropout_param, mask) cached by dropout_forward.
    """
    dropout_param, mask = cache
    mode = dropout_param['mode']
    dx = None
    if mode == 'train':
        # The gradient flows only through the units that were kept
        dx = dout * mask
    elif mode == 'test':
        dx = dout
    return dx
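As a quick sanity check (not part of dropout_layers.py), here is a minimal sketch of how the two functions fit together; the shapes, the seed and the keep probability p=0.5 below are illustrative assumptions only:

import numpy as np

# Hypothetical usage: keep each activation with probability 0.5 at train time.
x = np.random.randn(4, 10)
dropout_param = {'mode': 'train', 'p': 0.5, 'seed': 0}
out, cache = dropout_forward(x, dropout_param)

# The backward pass simply routes the upstream gradient through the same mask.
dout = np.ones_like(out)
dx = dropout_backward(dout, cache)
assert dx.shape == x.shape

# In test mode the input passes through unchanged.
dropout_param['mode'] = 'test'
out_test, _ = dropout_forward(x, dropout_param)
assert np.allclose(out_test, x)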
def affine_relu_dropout_forward(x, w, b, dropout_param):
    """
    Forward pass for the composite affine-ReLU-dropout layer.
    Inputs:
    - x: input data
    - w: weight matrix
    - b: bias vector
    - dropout_param: dict with the following keys:
      - p: dropout parameter; the probability of keeping each neuron active
      - mode: 'test' or 'train'. In train mode dropout is applied; in test mode
        the input is simply returned.
      - seed: seed for the random number generator.
    Outputs:
    - out: output of the dropout layer, shape (N, M)
    - cache: tuple (cache_affine, cache_relu, cache_dropout)
    """
    out_dropout = None
    cache = None
    out_affine, cache_affine = affine_forward(x, w, b)
    out_relu, cache_relu = relu_forward(out_affine)
    out_dropout, cache_dropout = dropout_forward(out_relu, dropout_param)
    cache = (cache_affine, cache_relu, cache_dropout)
    return out_dropout, cache
def affine_relu_dropout_backward(dout, cache):
    """
    Backward pass for the affine-ReLU-dropout layer.
    Input:
    - dout: upstream error gradient
    - cache: (cache_affine, cache_relu, cache_dropout)
    Returns:
    - dx: gradient with respect to the input x
    - dw: gradient with respect to the weight matrix w
    - db: gradient with respect to the bias vector b
    """
    cache_affine, cache_relu, cache_dropout = cache
    dx, dw, db = None, None, None
    ddropout = dropout_backward(dout, cache_dropout)
    drelu = relu_backward(ddropout, cache_relu)
    dx, dw, db = affine_backward(drelu, cache_affine)
    return dx, dw, db
layers.py
The forward and backward passes written earlier, plus the softmax loss function.
import sys, os
sys.path.append(os.path.realpath(os.path.dirname(os.path.realpath(__file__))))
import numpy as np
def affine_forward(x, w, b):
    """
    Compute the forward pass for the current (fully connected) layer, i.e. the
    score function of a fully connected layer.
    Note: if the affine transformation is unfamiliar, simply read it as the
    score function of a fully connected layer.
    The input x has shape (N, d_1, ..., d_k), where N is the number of examples
    and (d_1, ..., d_k) are the dimensions of each example (for images: height,
    width, channels). The total dimensionality is D = d_1 * ... * d_k, so x is
    reshaped to (N, D) before applying the affine transformation.
    Inputs:
    - x: numpy array of input data, shape (N, d_1, ..., d_k)
    - w: numpy array of weights, shape (D, M); D is the input dimensionality and
      M the output dimensionality (think of D as the number of input neurons and
      M as the number of output neurons)
    - b: numpy array of biases, shape (M,)
    Returns a tuple of:
    - out: output of shape (N, M)
    - cache: the cached inputs (x, w, b)
    """
    out = None
    N = x.shape[0]
    x_new = x.reshape(N, -1)          # flatten each example into a row vector
    out = np.dot(x_new, w) + b
    cache = (x, w, b)
    return out, cache
def affine_backward(dout, cache):
    """
    Compute the backward pass for an affine layer.
    Inputs:
    - dout: upstream gradient of shape (N, M)
    - cache: tuple of:
      - x: input data of shape (N, d_1, ..., d_k)
      - w: weight matrix of shape (D, M)
    Returns a tuple of:
    - dx: gradient with respect to x, shape (N, d_1, ..., d_k)
    - dw: gradient with respect to w, shape (D, M)
    - db: gradient with respect to b, shape (M,)
    """
    x, w, b = cache
    dx, dw, db = None, None, None
    db = np.sum(dout, axis=0)
    xx = x.reshape(x.shape[0], -1)
    dw = np.dot(xx.T, dout)
    dx = np.dot(dout, w.T)
    dx = np.reshape(dx, x.shape)
    return dx, dw, db
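To see that affine_backward matches a numerical gradient, a small sketch follows; the num_grad helper is written just for this example (it is not part of layers.py), and the shapes are arbitrary:

import numpy as np

def num_grad(f, x, h=1e-5):
    # Centered finite differences over every entry of x; f must return a scalar.
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        old = x[idx]
        x[idx] = old + h; fp = f(x)
        x[idx] = old - h; fm = f(x)
        x[idx] = old
        grad[idx] = (fp - fm) / (2 * h)
        it.iternext()
    return grad

x = np.random.randn(3, 4)
w = np.random.randn(4, 5)
b = np.random.randn(5)
out, cache = affine_forward(x, w, b)
dout = np.random.randn(*out.shape)
dx, dw, db = affine_backward(dout, cache)

# Compare against the numerical gradient of the scalar np.sum(out * dout).
dx_num = num_grad(lambda x_: np.sum(affine_forward(x_, w, b)[0] * dout), x)
print(np.max(np.abs(dx - dx_num)))   # should be tiny (around 1e-9 or smaller)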
def relu_forward(x):
    """
    Compute the forward pass for rectified linear units (ReLUs) and cache the input.
    Input:
    - x: input data
    Returns a tuple of:
    - out: same shape as the input x
    - cache: x
    """
    out = None
    out = np.maximum(0, x)
    cache = x
    return out, cache
def relu_backward(dout, cache):
    """
    Compute the backward pass for rectified linear units (ReLUs).
    Input:
    - dout: upstream error gradient
    - cache: the input x, which should have the same shape as dout
    Returns:
    - dx: gradient with respect to x
    """
    dx, x = None, cache
    dx = dout.copy()      # copy so the upstream gradient is not modified in place
    dx[x <= 0] = 0
    return dx
def affine_relu_forward(x, w, b):
    """
    Forward pass for an affine layer followed by a ReLU.
    Inputs:
    - x: input to the affine layer
    - w, b: weights and biases of the affine layer
    Returns a tuple of:
    - out: output of the ReLU
    - cache: caches for the backward pass
    """
    a, fc_cache = affine_forward(x, w, b)
    out, relu_cache = relu_forward(a)
    cache = (fc_cache, relu_cache)
    return out, cache
def affine_relu_backward(dout, cache):
    """
    Backward pass for the affine-ReLU layer.
    Input:
    - dout: upstream error gradient
    - cache: the affine cache and the ReLU cache
    Returns:
    - dx: gradient with respect to the input x
    - dw: gradient with respect to the weight matrix w
    - db: gradient with respect to the bias vector b
    """
    fc_cache, relu_cache = cache
    da = relu_backward(dout, relu_cache)
    dx, dw, db = affine_backward(da, fc_cache)
    return dx, dw, db
def softmax_loss(x, y):
    """
    Softmax loss and gradient for scores x of shape (N, C) and labels y of shape (N,).
    Returns the scalar cross-entropy loss and dx, the gradient of the loss with respect to x.
    """
    # Shift scores by the row maximum for numerical stability before exponentiating
    probs = np.exp(x - np.max(x, axis=1, keepdims=True))
    probs /= np.sum(probs, axis=1, keepdims=True)
    N = x.shape[0]
    loss = -np.sum(np.log(probs[np.arange(N), y])) / N
    dx = probs.copy()
    dx[np.arange(N), y] -= 1
    dx /= N
    return loss, dx
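A tiny usage example for softmax_loss (the scores and labels below are made up for illustration):

import numpy as np

# Toy example: 3 samples, 4 classes; labels are class indices.
scores = np.array([[2.0, 1.0, 0.1, -1.0],
                   [0.5, 2.5, 0.3, 0.0],
                   [1.0, 1.0, 1.0, 1.0]])
labels = np.array([0, 1, 3])
loss, dx = softmax_loss(scores, labels)
print(loss)        # average cross-entropy over the 3 samples
print(dx.shape)    # (3, 4); each row of dx sums to 0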
fc_net.py
Implements a deep fully connected neural network.
import sys, os
sys.path.append(os.path.realpath(os.path.dirname(os.path.realpath(__file__))))
import numpy as np
from layers import *
from dropout_layers import *
from bn_layers import *
class FullyConnectedNet(object):
    """
    {affine - [batch norm] - relu - [dropout]} x (L - 1) - affine - softmax
    """
    def __init__(self, input_dim=3*32*32, hidden_dims=[100], num_classes=10,
                 dropout=0, use_batchnorm=False, reg=0.0,
                 weight_scale=1e-2, seed=None):
        """
        Initialize the fully connected network.
        Inputs:
        - input_dim: input dimensionality
        - hidden_dims: list of hidden-layer sizes, e.g. [100, 100]
        - num_classes: number of classes
        - dropout: keep probability passed to the dropout layers; dropout=0 disables dropout
        - use_batchnorm: boolean, whether to use batch normalization (BN)
        - reg: regularization strength
        - weight_scale: standard deviation used for weight initialization
        - seed: random seed, so dropout masks are reproducible
        """
        self.use_batchnorm = use_batchnorm
        self.use_dropout = dropout > 0
        self.reg = reg
        self.num_layers = 1 + len(hidden_dims)
        self.params = {}
        layers_dims = [input_dim] + hidden_dims + [num_classes]
        for i in range(self.num_layers):
            self.params['W'+str(i+1)] = weight_scale*np.random.randn(layers_dims[i],
                                                                     layers_dims[i+1])
            self.params['b'+str(i+1)] = np.zeros((1, layers_dims[i+1]))
            if self.use_batchnorm and i < len(hidden_dims):
                self.params['gamma'+str(i+1)] = np.ones((1, layers_dims[i+1]))
                self.params['beta'+str(i+1)] = np.zeros((1, layers_dims[i+1]))
        self.dropout_param = {}
        if self.use_dropout:
            self.dropout_param = {'mode': 'train', 'p': dropout}
            if seed is not None:
                self.dropout_param['seed'] = seed
        self.bn_params = []
        if self.use_batchnorm:
            self.bn_params = [{'mode': 'train'} for i in range(self.num_layers - 1)]
    def loss(self, X, y=None):
        """
        Compute the loss and gradients.
        Parameters
        ----------
        X : input data
        y : labels; if None, the method runs in test mode and returns scores
        Returns
        -------
        scores in test mode, or the tuple (loss, grads) in train mode.
        """
        mode = 'test' if y is None else 'train'
        if self.dropout_param is not None:
            self.dropout_param['mode'] = mode
        if self.use_batchnorm:
            for bn_param in self.bn_params:
                bn_param['mode'] = mode
        scores = None
        outs, cache = {}, {}
        outs[0] = X
        num_h = self.num_layers-1
        # Forward pass through the L-1 hidden layers
        for i in range(num_h):
            if self.use_dropout:
                outs[i+1], cache[i+1] = affine_relu_dropout_forward(
                    outs[i], self.params['W'+str(i+1)], self.params['b'+str(i+1)],
                    self.dropout_param)
            elif self.use_batchnorm:
                gamma = self.params['gamma'+str(i+1)]
                beta = self.params['beta'+str(i+1)]
                outs[i+1], cache[i+1] = affine_bn_relu_forward(
                    outs[i], self.params['W'+str(i+1)], self.params['b'+str(i+1)],
                    gamma, beta, self.bn_params[i])
            else:
                outs[i+1], cache[i+1] = affine_relu_forward(
                    outs[i], self.params['W'+str(i+1)], self.params['b'+str(i+1)])
        # Final affine layer produces the class scores
        scores, cache[num_h+1] = affine_forward(
            outs[num_h], self.params['W'+str(num_h+1)], self.params['b'+str(num_h+1)])
        if mode == 'test':
            return scores
        loss, grads = 0.0, {}
        dout = {}
        loss, dy = softmax_loss(scores, y)
        h = self.num_layers-1
        # Add L2 regularization on every weight matrix
        for i in range(self.num_layers):
            loss += 0.5*self.reg*(np.sum(self.params['W'+str(i+1)]*self.params['W'+str(i+1)]))
        # Backward pass through the last affine layer
        dout[h], grads['W'+str(h+1)], grads['b'+str(h+1)] = affine_backward(dy, cache[h+1])
        grads['W'+str(h+1)] += self.reg*self.params['W'+str(h+1)]
        # Backward pass through the hidden layers, from last to first
        for i in range(h):
            if self.use_dropout:
                dx, dw, db = affine_relu_dropout_backward(dout[h-i], cache[h-i])
                dout[h-1-i] = dx
                grads['W'+str(h-i)] = dw
                grads['b'+str(h-i)] = db
            elif self.use_batchnorm:
                dx, dw, db, dgamma, dbeta = affine_bn_relu_backward(dout[h-i], cache[h-i])
                dout[h-1-i] = dx
                grads['W'+str(h-i)] = dw
                grads['b'+str(h-i)] = db
                grads['gamma'+str(h-i)] = dgamma
                grads['beta'+str(h-i)] = dbeta
            else:
                dx, dw, db = affine_relu_backward(dout[h-i], cache[h-i])
                dout[h-1-i] = dx
                grads['W'+str(h-i)] = dw
                grads['b'+str(h-i)] = db
            grads['W'+str(h-i)] += self.reg*self.params['W'+str(h-i)]
        return loss, grads
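A minimal smoke test of the network (not part of fc_net.py); the input size, hidden sizes and data below are illustrative assumptions:

import numpy as np

# Toy test: 5 examples of a 20-dimensional input, two hidden layers.
np.random.seed(0)
X = np.random.randn(5, 20)
y = np.random.randint(0, 10, size=5)

net = FullyConnectedNet(input_dim=20, hidden_dims=[30, 30], num_classes=10,
                        dropout=0.5, reg=0.1, seed=0)

loss, grads = net.loss(X, y)            # train mode: y is provided
print(loss)
print(sorted(grads.keys()))             # W1, W2, W3, b1, b2, b3

scores = net.loss(X)                    # test mode: dropout is switched off
print(scores.shape)                     # (5, 10)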
trainer.py
Implementation of the decoupled trainer.
import sys, os
sys.path.append(os.path.realpath(os.path.dirname(os.path.realpath(__file__))))
import numpy as np
import updater
class Trainer(object):
    """
    Example usage:
    data = {
      'X_train': # training data
      'y_train': # training labels
      'X_val':   # validation data
      'y_val':   # validation labels
    }
    model = MyAwesomeModel(hidden_size=100, reg=10)
    trainer = Trainer(model, data,
                      update_rule='sgd',
                      updater_config={
                        'learning_rate': 1e-3,
                      },
                      lr_decay=0.95,
                      num_epochs=10, batch_size=100,
                      print_every=100)
    trainer.train()
    """
    def __init__(self, model, data, **kwargs):
        """
        Construct a new Trainer instance.
        Required arguments:
        - model: the network model
        - data: a data dictionary containing:
          'X_train': training data of shape (N_train, d_1, ..., d_k)
          'X_val': validation data of shape (N_val, d_1, ..., d_k)
          'y_train': training labels of shape (N_train,)
          'y_val': validation labels of shape (N_val,)
        Optional arguments:
        - update_rule: name of an update rule defined in updater.py; defaults to 'sgd'.
        - updater_config: dict of hyperparameters for the update rule; see updater.py.
        - lr_decay: learning rate decay factor.
        - batch_size: minibatch size.
        - num_epochs: number of training epochs.
        - print_every: integer; print intermediate results every print_every iterations.
        - verbose: boolean; whether to print intermediate results during training.
        """
        self.model = model
        self.X_train = data['X_train']
        self.y_train = data['y_train']
        self.X_val = data['X_val']
        self.y_val = data['y_val']
        self.update_rule = kwargs.pop('update_rule', 'sgd')
        self.updater_config = kwargs.pop('updater_config', {})
        self.lr_decay = kwargs.pop('lr_decay', 1.0)
        self.batch_size = kwargs.pop('batch_size', 100)
        self.num_epochs = kwargs.pop('num_epochs', 10)
        self.print_every = kwargs.pop('print_every', 10)
        self.verbose = kwargs.pop('verbose', True)
        if len(kwargs) > 0:
            extra = ', '.join('"%s"' % k for k in kwargs.keys())
            raise ValueError('Unrecognized arguments %s' % extra)
        if not hasattr(updater, self.update_rule):
            raise ValueError('Invalid update_rule "%s"' % self.update_rule)
        self.update_rule = getattr(updater, self.update_rule)
        self.epoch = 0
        self.best_val_acc = 0
        self.best_params = {}
        self.loss_history = []
        self.train_acc_history = []
        self.val_acc_history = []
        # Each parameter gets its own copy of the updater configuration so that
        # per-parameter state (velocity, cache, ...) stays separate.
        self.updater_configs = {}
        for p in self.model.params:
            d = {k: v for k, v in self.updater_config.items()}
            self.updater_configs[p] = d
    def _step(self):
        """
        Perform a single gradient update step.
        """
        num_train = self.X_train.shape[0]
        batch_mask = np.random.choice(num_train, self.batch_size)
        X_batch = self.X_train[batch_mask]
        y_batch = self.y_train[batch_mask]
        loss, grads = self.model.loss(X_batch, y_batch)
        self.loss_history.append(loss)
        for p, w in self.model.params.items():
            dw = grads[p]
            config = self.updater_configs[p]
            next_w, next_config = self.update_rule(w, dw, config)
            self.model.params[p] = next_w
            self.updater_configs[p] = next_config
    def check_accuracy(self, X, y, num_samples=None, batch_size=100):
        """
        Check accuracy on the provided data; if the data set is large,
        a subsample can be used.
        Inputs:
        - X: data of shape (N, d_1, ..., d_k)
        - y: labels of shape (N,)
        - num_samples: number of samples to draw
        - batch_size: minibatch size
        Returns:
        - acc: accuracy on the given data
        """
        N = X.shape[0]
        if num_samples is not None and N > num_samples:
            mask = np.random.choice(N, num_samples)
            N = num_samples
            X = X[mask]
            y = y[mask]
        num_batches = N // batch_size
        if N % batch_size != 0:
            num_batches += 1
        y_pred = []
        for i in range(num_batches):
            start = i * batch_size
            end = (i + 1) * batch_size
            scores = self.model.loss(X[start:end])
            y_pred.append(np.argmax(scores, axis=1))
        y_pred = np.hstack(y_pred)
        acc = np.mean(y_pred == y)
        return acc
    def train(self):
        """
        Train the model according to the configuration.
        """
        num_train = self.X_train.shape[0]
        iterations_per_epoch = max(num_train // self.batch_size, 1)
        num_iterations = self.num_epochs * iterations_per_epoch
        for t in range(num_iterations):
            self._step()
            if self.verbose and t % self.print_every == 0:
                print('(Iteration %d / %d) loss: %f' % (t + 1, num_iterations, self.loss_history[-1]))
            # At the end of each epoch, decay the learning rate
            epoch_end = (t + 1) % iterations_per_epoch == 0
            if epoch_end:
                self.epoch += 1
                for k in self.updater_configs:
                    self.updater_configs[k]['learning_rate'] *= self.lr_decay
            # Check train and validation accuracy on the first and last iteration,
            # and at the end of every epoch
            first_it = (t == 0)
            last_it = (t == num_iterations - 1)
            if first_it or last_it or epoch_end:
                train_acc = self.check_accuracy(self.X_train, self.y_train,
                                                num_samples=1000)
                val_acc = self.check_accuracy(self.X_val, self.y_val)
                self.train_acc_history.append(train_acc)
                self.val_acc_history.append(val_acc)
                if self.verbose:
                    print('(Epoch %d / %d) train acc: %f; val acc: %f' % (
                        self.epoch, self.num_epochs, train_acc, val_acc))
                # Keep a copy of the best parameters seen on the validation set
                if val_acc > self.best_val_acc:
                    self.best_val_acc = val_acc
                    self.best_params = {}
                    for k, v in self.model.params.items():
                        self.best_params[k] = v.copy()
        # At the end of training, swap in the best parameters found
        self.model.params = self.best_params
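A minimal end-to-end sketch of the trainer (not part of trainer.py), assuming FullyConnectedNet from fc_net.py and Trainer are importable in the same script; the synthetic data and hyperparameters below are illustrative only:

import numpy as np

# Synthetic data just to exercise the training loop end to end.
np.random.seed(0)
data = {
    'X_train': np.random.randn(200, 20),
    'y_train': np.random.randint(0, 10, size=200),
    'X_val':   np.random.randn(50, 20),
    'y_val':   np.random.randint(0, 10, size=50),
}
model = FullyConnectedNet(input_dim=20, hidden_dims=[50], num_classes=10, reg=0.0)
trainer = Trainer(model, data,
                  update_rule='adam',
                  updater_config={'learning_rate': 1e-3},
                  lr_decay=0.95,
                  num_epochs=5, batch_size=50,
                  print_every=10)
trainer.train()
print(trainer.best_val_acc)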
updater.py
Decoupled updater, responsible for updating the network weights. Each rule takes the current weights w, the gradient dw of those weights, and the corresponding update configuration.
import sys, os
sys.path.append(os.path.realpath(os.path.dirname(os.path.realpath(__file__))))
import numpy as np
"""
频繁使用在训练神经网络中的一阶梯度更新规则。每次更新接受当前的权重,
对应的梯度,以及相关配置进行权重更新。
def update(w, dw, config=None):
Inputs:
- w:当前权重.
- dw: 和权重形状相同的梯度.
- config: 字典型超参数配置,比如学习率,动量值等。如果更新规则需要用到缓存,
在配置中需要保存相应的缓存。
Returns:
- next_w: 更新后的权重.
- config: 更新规则相应的配置.
"""
def sgd(w, dw, config=None):
    """
    Vanilla stochastic gradient descent update rule.
    config format:
    - learning_rate: the learning rate.
    """
    if config is None: config = {}
    config.setdefault('learning_rate', 1e-2)
    w -= config['learning_rate'] * dw
    return w, config
def sgd_momentum(w, dw, config=None):
    """
    Stochastic gradient descent with momentum.
    config format:
    - learning_rate: the learning rate.
    - momentum: momentum decay factor in [0, 1]; 0 means no momentum, i.e. plain SGD.
    - velocity: velocity with the same shape as w and dw.
    """
    if config is None:
        config = {}
    config.setdefault('learning_rate', 1e-2)
    config.setdefault('momentum', 0.9)
    v = config.setdefault('velocity', np.zeros_like(w))
    next_w = None
    v = config['momentum']*config['velocity'] - config['learning_rate']*dw
    next_w = w + v
    config['velocity'] = v
    return next_w, config
def rmsprop(w, dw, config=None):
    """
    RMSProp update rule.
    config format:
    - learning_rate: the learning rate.
    - decay_rate: decay factor in [0, 1] for the running average of squared gradients.
    - epsilon: small value to avoid division by zero.
    - cache: running average of squared gradients.
    """
    if config is None: config = {}
    config.setdefault('learning_rate', 1e-2)
    config.setdefault('decay_rate', 0.99)
    config.setdefault('epsilon', 1e-8)
    config.setdefault('cache', np.zeros_like(w))
    next_w = None
    config['cache'] = config['decay_rate']*config['cache']+(1-config['decay_rate'])*dw**2
    next_w = w - config['learning_rate']*dw/(np.sqrt(config['cache']+config['epsilon']))
    return next_w, config
def adam(w, dw, config=None):
    """
    Adam update rule, including the bias-correction ("warm-up") step.
    config format:
    - learning_rate: the learning rate.
    - beta1: decay factor for the first moment (momentum).
    - beta2: decay factor for the second moment.
    - epsilon: small value to avoid division by zero.
    - m: running average of the gradient.
    - v: running average of the squared gradient.
    - t: iteration counter.
    """
    if config is None: config = {}
    config.setdefault('learning_rate', 1e-3)
    config.setdefault('beta1', 0.9)
    config.setdefault('beta2', 0.999)
    config.setdefault('epsilon', 1e-8)
    config.setdefault('m', np.zeros_like(w))
    config.setdefault('v', np.zeros_like(w))
    config.setdefault('t', 0)
    next_w = None
    config['t'] += 1
    beta1 = config['beta1']
    beta2 = config['beta2']
    epsilon = config['epsilon']
    learning_rate = config['learning_rate']
    config['m'] = beta1*config['m']+(1-beta1)*dw
    config['v'] = beta2*config['v']+(1-beta2)*dw**2
    # Bias-corrected first and second moment estimates
    mb = config['m']/(1-beta1**config['t'])
    vb = config['v']/(1-beta2**config['t'])
    next_w = w - learning_rate*mb/(np.sqrt(vb)+epsilon)
    return next_w, config
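A small sketch (not part of updater.py) showing how the config dict carries state across calls, using adam on a toy quadratic; the learning rate and step count are arbitrary choices:

import numpy as np

# Minimizing f(w) = 0.5 * ||w||^2, whose gradient is simply w.
w = np.array([1.0, -2.0, 3.0])
config = {'learning_rate': 0.1}
for step in range(100):
    dw = w                        # gradient of 0.5 * ||w||^2
    w, config = adam(w, dw, config)
print(w)                          # close to the minimum at 0
print(config['t'])                # 100 update steps recorded in the config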
bn_layers.py
Implements the forward and backward passes of batch normalization (BN).
import sys, os
sys.path.append(os.path.realpath(os.path.dirname(os.path.realpath(__file__))))
import numpy as np
from layers import *
from dropout_layers import *
def batchnorm_forward(x, gamma, beta, bn_param):
    """
    Estimate the population mean and variance with a momentum-style running average, e.g.:
    running_mean = momentum * running_mean + (1 - momentum) * sample_mean
    running_var = momentum * running_var + (1 - momentum) * sample_var
    Input:
    - x: data of shape (N, D)
    - gamma: scale parameter (D,)
    - beta: shift parameter (D,)
    - bn_param: dict with the following keys:
      - mode: 'train' or 'test'
      - eps: constant for numerical stability
      - momentum: decay factor for the running averages
      - running_mean: running mean of shape (D,)
      - running_var: running variance of shape (D,)
    Returns a tuple of:
    - out: output of shape (N, D)
    - cache: cache for the backward pass
    """
    mode = bn_param['mode']
    eps = bn_param.get('eps', 1e-5)
    momentum = bn_param.get('momentum', 0.9)
    N, D = x.shape
    running_mean = bn_param.get('running_mean', np.zeros(D, dtype=x.dtype))
    running_var = bn_param.get('running_var', np.zeros(D, dtype=x.dtype))
    out, cache = None, None
    if mode == 'train':
        # Batch statistics
        mu = 1/float(N)*np.sum(x, axis=0)
        xmu = x-mu
        carre = xmu**2
        var = 1/float(N)*np.sum(carre, axis=0)
        sqrtvar = np.sqrt(var+eps)
        invvar = 1./sqrtvar
        va2 = xmu*invvar          # normalized input
        va3 = gamma*va2           # scaled
        out = va3+beta            # shifted
        running_mean = momentum*running_mean+(1.0-momentum)*mu
        running_var = momentum*running_var+(1.0-momentum)*var
        cache = (mu, xmu, carre, var, sqrtvar, invvar, va2, va3, gamma, beta, x, bn_param)
    elif mode == 'test':
        # Use the running statistics accumulated during training
        mu = running_mean
        var = running_var
        xhat = (x-mu)/np.sqrt(var+eps)
        out = gamma*xhat+beta
        cache = (mu, var, gamma, beta, bn_param)
    else:
        raise ValueError('Unrecognized batchnorm mode: "%s"' % mode)
    bn_param['running_mean'] = running_mean
    bn_param['running_var'] = running_var
    return out, cache
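A short sketch (not part of bn_layers.py) of the train/test behaviour: train mode normalizes with batch statistics and accumulates the running averages, which test mode then reuses; the data below is synthetic:

import numpy as np

np.random.seed(0)
x = 5.0 + 2.0 * np.random.randn(64, 3)
gamma, beta = np.ones(3), np.zeros(3)
bn_param = {'mode': 'train'}

# Repeated train-mode calls let the running averages converge to the data statistics.
for _ in range(50):
    out, _ = batchnorm_forward(x, gamma, beta, bn_param)

print(out.mean(axis=0))              # ~0 per feature in train mode
print(out.std(axis=0))               # ~1 per feature in train mode
print(bn_param['running_mean'])      # approaches the data mean (~5)

bn_param['mode'] = 'test'
out_test, _ = batchnorm_forward(x, gamma, beta, bn_param)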
def batchnorm_backward(dout, cache):
    """
    Batch normalization backward pass.
    Inputs:
    - dout: upstream gradient (N, D)
    - cache: cache from the forward pass.
    Returns a tuple of:
    - dx: gradient with respect to x (N, D)
    - dgamma: gradient with respect to gamma (D,)
    - dbeta: gradient with respect to beta (D,)
    """
    dx, dgamma, dbeta = None, None, None
    mu, xmu, carre, var, sqrtvar, invvar, va2, va3, gamma, beta, x, bn_param = cache
    eps = bn_param.get('eps', 1e-5)
    N, D = dout.shape
    # Backpropagate step by step through the forward computation graph
    dva3 = dout
    dbeta = np.sum(dout, axis=0)
    dva2 = gamma*dva3
    dgamma = np.sum(va2*dva3, axis=0)
    dxmu = invvar*dva2
    dinvvar = np.sum(xmu*dva2, axis=0)
    dsqrtvar = -1./(sqrtvar**2)*dinvvar
    dvar = 0.5*(var+eps)**(-0.5)*dsqrtvar
    dcarre = 1/float(N)*np.ones((carre.shape))*dvar
    dxmu += 2*xmu*dcarre
    dx = dxmu
    dmu = -np.sum(dxmu, axis=0)
    dx += 1/float(N)*np.ones((dxmu.shape))*dmu
    return dx, dgamma, dbeta
def batchnorm_backward_alt(dout, cache):
    """
    Alternative batch normalization backward pass, using the simplified
    closed-form expression for dx.
    """
    dx, dgamma, dbeta = None, None, None
    mu, xmu, carre, var, sqrtvar, invvar, va2, va3, gamma, beta, x, bn_param = cache
    eps = bn_param.get('eps', 1e-5)
    N, D = dout.shape
    dbeta = np.sum(dout, axis=0)
    dgamma = np.sum((x - mu) * (var + eps)**(-1. / 2.) * dout, axis=0)
    dx = (1./N) * gamma * (var + eps)**(-1./2.)*(N*dout-np.sum(
        dout, axis=0)-(x-mu)*(var+eps)**(-1.0)*np.sum(dout*(x-mu), axis=0))
    return dx, dgamma, dbeta
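The step-by-step and closed-form backward passes should agree up to floating-point error; a quick comparison sketch follows (not part of bn_layers.py), with arbitrary shapes:

import numpy as np

np.random.seed(0)
x = np.random.randn(16, 4)
gamma, beta = np.random.randn(4), np.random.randn(4)
bn_param = {'mode': 'train'}

_, cache = batchnorm_forward(x, gamma, beta, bn_param)
dout = np.random.randn(16, 4)

dx1, dgamma1, dbeta1 = batchnorm_backward(dout, cache)
dx2, dgamma2, dbeta2 = batchnorm_backward_alt(dout, cache)
print(np.max(np.abs(dx1 - dx2)))   # expected to be ~1e-10 or smaller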
def affine_bn_relu_forward(x, w, b, gamma, beta, bn_param):
    """Forward pass for the composite affine-batchnorm-ReLU layer."""
    x_affine, cache_affine = affine_forward(x, w, b)
    x_bn, cache_bn = batchnorm_forward(x_affine, gamma, beta, bn_param)
    out, cache_relu = relu_forward(x_bn)
    cache = (cache_affine, cache_bn, cache_relu)
    return out, cache
def affine_bn_relu_backward(dout, cache):
    """Backward pass for the affine-batchnorm-ReLU layer."""
    cache_affine, cache_bn, cache_relu = cache
    drelu = relu_backward(dout, cache_relu)
    dbn, dgamma, dbeta = batchnorm_backward_alt(drelu, cache_bn)
    dx, dw, db = affine_backward(dbn, cache_affine)
    return dx, dw, db, dgamma, dbeta