import numpy as np
import matplotlib.pyplot as plt
class MLP:
" Multi-layer perceptron "
def __init__(self, sizes, beta=1, momentum=0.9):
"""
        sizes is a list of length four. The first element is the number of features
        in each sample. In the MNIST dataset, this is 784 (28*28). The second
        and third elements are the numbers of neurons in the first
        and second hidden layers, respectively. The fourth element is the
        number of neurons in the output layer, which is determined by the number
        of classes. For example, if the sizes list is [784, 5, 7, 10], the
        first hidden layer has 5 neurons and the second hidden layer has 7 neurons.
beta is a scalar used in the sigmoid function
momentum is a scalar used for the gradient descent with momentum
"""
self.beta = beta
self.momentum = momentum
self.nin = sizes[0] # number of features in each sample
self.nhidden1 = sizes[1] # number of neurons in the first hidden layer
self.nhidden2 = sizes[2] # number of neurons in the second hidden layer
self.nout = sizes[3] # number of classes / the number of neurons in the output layer
        # Initialise the weights of the two hidden layers and the output layer
self.weights1 = (np.random.rand(self.nin + 1, self.nhidden1) - 0.5) * 2 / np.sqrt(self.nin) # hidden layer 1
self.weights2 = (np.random.rand(self.nhidden1 + 1, self.nhidden2) - 0.5) * 2 / np.sqrt(
self.nhidden1) # hidden layer 2
self.weights3 = (np.random.rand(self.nhidden2 + 1, self.nout) - 0.5) * 2 / np.sqrt(
self.nhidden2) # output layer
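        # Each weight matrix has one extra input row for the bias (the constant -1
        # appended to the previous layer's activations); entries are drawn uniformly
        # from [-1/sqrt(fan_in), 1/sqrt(fan_in)] to keep the initial pre-activations
        # in the sensitive range of the sigmoid.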
def train(self, inputs, targets, eta, niterations):
"""
        inputs is a numpy array of shape (num_train, D) containing the training images,
        i.e. num_train samples each of dimension D.
        targets is a numpy array of shape (num_train, nclasses) containing the training
        labels in 1-of-N (one-hot) encoding.
eta is the learning rate for optimization
niterations is the number of iterations for updating the weights
"""
ndata = np.shape(inputs)[0] # number of data samples
# adding the bias
inputs = np.concatenate((inputs, -np.ones((ndata, 1))), axis=1)
        # numpy arrays to store the weight updates (needed for the momentum term)
updatew1 = np.zeros((np.shape(self.weights1)))
updatew2 = np.zeros((np.shape(self.weights2)))
updatew3 = np.zeros((np.shape(self.weights3)))
self.loss = []
for n in range(niterations):
#############################################################################
# TODO: implement the training phase of one iteration which consists of two phases:
# the forward phase and the backward phase. you will implement the forward phase in
# the self.forwardPass method and return the outputs to self.outputs. Then compute
# the error (hints: similar to what we did in the lab). Next is to implement the
# backward phase where you will compute the derivative of the layers and update
# their weights.
#############################################################################
# forward phase
self.outputs = self.forwardPass(inputs)
# Error using the sum-of-squares error function
loss = 0.5 * np.sum((self.outputs - targets) ** 2)
if (np.mod(n, 100) == 0):
self.loss.append(loss)
print("Iteration: ", n, " Loss: ", loss)
# backward phase
# Compute the derivative of the output layer. NOTE: you will need to compute the derivative of
# the softmax function. Hints: equation 4.55 in the book.
deltao = (self.outputs - targets) * self.outputs * (1 - self.outputs)
            # compute the delta of the second hidden layer
            # (delta_sigmoid already applies the factor beta from the sigmoid derivative)
            deltah2 = self.delta_sigmoid(self.hidden2, deltao, self.weights3)
            # compute the delta of the first hidden layer; the bias column of deltah2
            # is dropped because the bias unit receives no backpropagated error
            deltah1 = self.delta_sigmoid(self.hidden1, deltah2[:, :-1], self.weights2)
            # update the weights of the three layers: self.weights1, self.weights2 and self.weights3
            # using gradient descent (as in the week 4 lab) with momentum
            updatew1 = self.update_weights(updatew1, inputs, deltah1, eta, self.momentum)
            updatew2 = self.update_weights(updatew2, self.hidden1, deltah2, eta, self.momentum)
            # the output delta has no bias column, so this update is written out explicitly
            # instead of going through update_weights (which strips the last column of delta)
            updatew3 = eta * np.dot(np.transpose(self.hidden2), deltao) + self.momentum * updatew3
self.weights1 -= updatew1
self.weights2 -= updatew2
self.weights3 -= updatew3
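            # Summary of the backward pass above, for E = 0.5 * sum((y - t)^2):
            #   deltao  = (y - t) * y * (1 - y)                        (output layer)
            #   deltah2 = beta * h2 * (1 - h2) * (deltao  . W3^T)      (second hidden layer)
            #   deltah1 = beta * h1 * (1 - h1) * (deltah2 . W2^T)      (first hidden layer)
            #   dW(n)   = eta * a^T . delta + momentum * dW(n-1),
            # where a is the activation feeding the layer whose weights W are being updated.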
#############################################################################
# END of YOUR CODE
#############################################################################
def forwardPass(self, inputs):
"""
        inputs is a numpy array of shape (num_samples, D+1) containing the input images
        with the bias column already appended, i.e. num_samples samples of dimension D+1.
"""
#############################################################################
# TODO: Implement the forward phase of the model. It has two hidden layers
# and the output layer. The activation function of the two hidden layers is
# sigmoid function. The output layer activation function is the softmax function
# because we are working with multi-class classification.
#############################################################################
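        # The layers below compute, for each input row x (which already has the -1 bias appended):
        #   h1 = sigmoid(beta * (x  . W1)), then a -1 bias column is appended to h1
        #   h2 = sigmoid(beta * (h1 . W2)), then a -1 bias column is appended to h2
        #   y  = softmax(h2 . W3)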
# layer 1
# compute the forward pass on the first hidden layer with the sigmoid function
        self.hidden1 = np.dot(inputs, self.weights1)  # (ndata, nin+1) . (nin+1, nhidden1) = (ndata, nhidden1)
        # bias column: a constant -1 appended to the hidden activations
        b1 = -np.ones((np.shape(inputs)[0], 1))
        # sigmoid
        self.hidden1 = self.sigmoid_fun(self.hidden1)  # (ndata, nhidden1)
        self.hidden1 = np.concatenate((self.hidden1, b1), axis=1)  # (ndata, nhidden1+1)
# layer 2
# compute the forward pass on the second hidden layer with the sigmoid function
        self.hidden2 = np.dot(self.hidden1, self.weights2)  # (ndata, nhidden1+1) . (nhidden1+1, nhidden2) = (ndata, nhidden2)
        # bias column: a constant -1 appended to the hidden activations
        b2 = -np.ones((np.shape(self.hidden1)[0], 1))
        # sigmoid
        self.hidden2 = self.sigmoid_fun(self.hidden2)  # (ndata, nhidden2)
        self.hidden2 = np.concatenate((self.hidden2, b2), axis=1)  # (ndata, nhidden2+1)
# output layer
# compute the forward pass on the output layer with softmax function
        outputs = np.dot(self.hidden2, self.weights3)  # (ndata, nhidden2+1) . (nhidden2+1, nout) = (ndata, nout)
outputs = self.softmax_fun(outputs)
#############################################################################
# END of YOUR CODE
#############################################################################
return outputs
def evaluate(self, X, y):
"""
this method is to evaluate our model on unseen samples
it computes the confusion matrix and the accuracy
        X is a numpy array of shape (num_test, D) containing the test images,
        i.e. num_test samples each of dimension D.
        y is a numpy array of shape (num_test, nclasses) containing the test labels
        in 1-of-N (one-hot) encoding.
"""
inputs = np.concatenate((X, -np.ones((np.shape(X)[0], 1))), axis=1)
outputs = self.forwardPass(inputs)
nclasses = np.shape(y)[1]
        # convert the 1-of-N encoded outputs and targets back to class indices
outputs = np.argmax(outputs, 1)
targets = np.argmax(y, 1)
cm = np.zeros((nclasses, nclasses))
for i in range(nclasses):
for j in range(nclasses):
cm[i, j] = np.sum(np.where(outputs == i, 1, 0) * np.where(targets == j, 1, 0))
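        # cm[i, j] counts the samples predicted as class i whose true class is j,
        # so the diagonal holds the correctly classified samples.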
print("The confusion matrix is:")
print(cm)
        self.accuracy = np.trace(cm) / np.sum(cm) * 100
        print("The accuracy is ", self.accuracy)
return cm
    # Activation function: logistic sigmoid with slope parameter beta
    def sigmoid_fun(self, x):
        return 1.0 / (1.0 + np.exp(-self.beta * x))
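    # Note: sigmoid_fun(0) = 0.5 for any beta, and a larger beta makes the transition
    # between 0 and 1 steeper (approaching a step function).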
    # Output activation: softmax over the classes (each output row sums to 1)
    def softmax_fun(self, x):
        # subtract the row-wise maximum for numerical stability; this does not
        # change the result because softmax is invariant to a constant shift
        ex = np.exp(x - np.max(x, axis=1, keepdims=True))
        return ex / np.sum(ex, axis=1, keepdims=True)
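    # Sanity check (illustrative): applying softmax_fun to a matrix of zeros returns
    # uniform rows (each entry equal to 1/nclasses), and every output row sums to 1.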
    # Backpropagated delta through a sigmoid layer: beta * x * (1 - x) is the
    # derivative of sigmoid(beta * a) with respect to a, evaluated at the layer output x
    def delta_sigmoid(self, x, delta, w):
        return self.beta * x * (1.0 - x) * np.dot(delta, np.transpose(w))
    # Gradient-descent weight update with momentum; the last column of delta (the
    # delta of the bias unit) is dropped because no weight feeds into the bias unit
    def update_weights(self, updatew, x, delta, lr, momentum):
        updatew = lr * np.dot(np.transpose(x), delta[:, :-1]) + momentum * updatew
return updatew
if __name__ == '__main__':
import pickle, gzip
    # Plot the training loss curve (the loss is recorded every 100 iterations)
    def plot_error(niter, Y):
        X = list(range(0, int(niter), 100))
        plt.plot(X, Y, 'g*-')
    # Save the best parameters
    def save_params(file, params):
# TODO: run the following code to save the best parameters and
# the weights of the network that achieves the desired accuracy
with open(file, 'wb') as handle:
pickle.dump(params, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # Load the MNIST data
f = gzip.open('mnist.pkl.gz', 'rb')
tset, vset, teset = pickle.load(f, encoding='latin1')
print(tset[0].shape, vset[0].shape, teset[0].shape)
f.close()
# Just use the first 9000 images for training
tread = 9000
train_in = tset[0][:tread, :]
# This is a little bit of work -- 1 of N encoding
# Make sure you understand how it does it
train_tgt = np.zeros((tread, 10))
for i in range(tread):
train_tgt[i, tset[1][i]] = 1
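    # e.g. a label of 3 becomes the one-hot row [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]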
# and use 1000 images for testing
teread = 1000
test_in = teset[0][:teread, :]
test_tgt = np.zeros((teread, 10))
for i in range(teread):
test_tgt[i, teset[1][i]] = 1
    # Set the hyperparameters and train
best_sizes = [784, 50, 30, 10]
best_beta = 2
best_momentum = 0.5
best_lr = 0.001 # best learning rate
best_niterations = 1200
best_classifier = MLP(sizes=best_sizes, beta=best_beta, momentum=best_momentum)
best_classifier.train(train_in, train_tgt, best_lr, best_niterations)
best_classifier.evaluate(test_in, test_tgt)
    # Plot the training loss curve
plot_error(best_niterations,best_classifier.loss)
    plt.xlabel('number of iterations')
    plt.ylabel('sum-of-squares error')
accuracy = round(best_classifier.accuracy, 2)
    plt.text(best_niterations / 2, 4000, r'$accuracy:\ ' + str(accuracy) + r'\%$', fontdict={'size': '12', 'color': 'r'})
plt.title('sizes:{}, beta:{}, momentum:{}, lr:{}, '
'niter:{}'.format(best_sizes, best_beta, best_momentum, best_lr, best_niterations))
plt.show()
    # Save the parameters and weights
file = "best_parameters.pkl"
best_parameters = {
'sizes': best_sizes,
'beta': best_beta,
'momentum': best_momentum,
'lr': best_lr,
'niterations': best_niterations,
'weights_1': best_classifier.weights1,
'weights_2': best_classifier.weights2,
'weights_3': best_classifier.weights3,
}
save_params(file,best_parameters)
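    # Illustrative follow-up (a minimal sketch, not required by the assignment): reload
    # the parameters that were just saved and rebuild the network from them, so it can
    # be evaluated again without retraining. The names 'loaded' and 'restored' below are
    # introduced purely for illustration.
    with open(file, 'rb') as handle:
        loaded = pickle.load(handle)
    restored = MLP(sizes=loaded['sizes'], beta=loaded['beta'], momentum=loaded['momentum'])
    restored.weights1 = loaded['weights_1']
    restored.weights2 = loaded['weights_2']
    restored.weights3 = loaded['weights_3']
    restored.evaluate(test_in, test_tgt)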