[AI] Handwritten Neural Network | CUDA Version

  • Recently, to compare how efficiently a neural network runs in different languages, I hand-wrote the same 3-layer DNN model in four languages, and I am recording it here.
  • The model is a 3-layer fully connected network trained on the MNIST handwritten digit dataset, implemented in four versions: PyTorch, Python, C++, and CUDA.
  • The code, algorithm derivations, and formulas follow 《深度学习入门 基于Python的理论与实现》 (Deep Learning from Scratch); Aliyun drive link: Link. The book is very thorough, so refer to it for anything in the code that is unclear.

0. Series Contents

  1. Experimental results
  2. PyTorch version code
  3. Python version
    3.1. NumPy version code
    3.2. Pure Python version code
  4. C++ version code
  5. CUDA version code

1. Notes

  1. A fully connected neural network implemented in CUDA; all data is handled as flat, one-dimensional arrays (see the indexing sketch after this list).
  2. To keep the code simple, the batch size can currently only be set to 1.
  3. Complete code on GitHub:
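
All of the kernels below use the same row-major flattening: element (row, col) of an h x w matrix is stored at index row * w + col in a flat float array. A minimal host-side sketch of that convention (the array and values here are just for illustration):

#include <cstdio>

int main()
{
	// a 2 x 3 matrix stored row-major in a flat array: a[row][col] -> a[row * w + col]
	const int h = 2, w = 3;
	float a[h * w] = { 1, 2, 3,
	                   4, 5, 6 };
	printf("a[1][2] = %.0f\n", a[1 * w + 2]);// prints 6
	return 0;
}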

2. Code

dataloader.cuh

#pragma once

#include <string>

using namespace std;

void read_Mnist_Images(string filename, float* img_array);
void read_Mnist_Label(string filename, float* label_array);

dataloader.cu

//dataloader.cu
#include <iostream>
#include <fstream>
#include <string>
#include <cstring>
#include "dataloader.cuh"
using namespace std;

// The MNIST IDX files store their header integers in big-endian order,
// so on a little-endian machine the bytes have to be swapped.
int ReverseInt(int i)
{
	unsigned char ch1, ch2, ch3, ch4;
	ch1 = i & 255;
	ch2 = (i >> 8) & 255;
	ch3 = (i >> 16) & 255;
	ch4 = (i >> 24) & 255;
	return ((int)ch1 << 24) + ((int)ch2 << 16) + ((int)ch3 << 8) + ch4;
}


void read_Mnist_Label(string filename, float* label_array)
{
	ifstream file(filename, ios::binary);
	if (file.is_open())
	{

		int magic_number = 0;
		int number_of_images = 0;
		file.read((char*)&magic_number, sizeof(magic_number));
		file.read((char*)&number_of_images, sizeof(number_of_images));
		magic_number = ReverseInt(magic_number);
		number_of_images = ReverseInt(number_of_images);
		cout << "magic number = " << magic_number << endl;
		cout << "number of images = " << number_of_images << endl;
		//float label_array[];
		for (int i = 0; i < number_of_images; i++)
		{
			unsigned char label = 0;
			file.read((char*)&label, sizeof(label));
			//onehot[(int)label] = 1;
			label_array[i * 10 + (int)label] = 1;//convert the label to one-hot format here
		}
	}
	else {
		cout << "open file failed." << endl;
	}
	file.close();
}

void read_Mnist_Images(string filename, float* img_array)
{
	ifstream file(filename, ios::binary);
	if (file.is_open())
	{
		int magic_number = 0;
		int number_of_images = 0;
		int n_rows = 0;
		int n_cols = 0;
		//unsigned char label;
		file.read((char*)&magic_number, sizeof(magic_number));
		file.read((char*)&number_of_images, sizeof(number_of_images));
		file.read((char*)&n_rows, sizeof(n_rows));
		file.read((char*)&n_cols, sizeof(n_cols));
		magic_number = ReverseInt(magic_number);
		number_of_images = ReverseInt(number_of_images);
		n_rows = ReverseInt(n_rows);
		n_cols = ReverseInt(n_cols);
		cout << "magic number = " << magic_number << endl;
		cout << "number of images = " << number_of_images << endl;
		cout << "rows = " << n_rows << endl;
		cout << "cols = " << n_cols << endl;
		//Mat temp(n_rows, n_cols, CV_8UC1, Scalar::all(0));
		for (int i = 0; i < number_of_images; i++)
		{
			//img_array[i] = new float[784];
			int img_num = 0;
			for (int r = 0; r < n_rows; r++)
			{
				for (int c = 0; c < n_cols; c++)
				{

					unsigned char image = 0;
					file.read((char*)&image, sizeof(image));
					img_array[i * 784 + img_num] = (float)image / 255.0;//scale the pixel to [0, 1]
					img_num++;
				}
			}
		}
	}
	else {
		cout << "open file failed." << endl;
	}
	file.close();
}
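
The loaders can be sanity-checked on the host before wiring them into training. A small standalone sketch, assuming the MNIST files sit in ./dataset as in main.cu below (the file name check_loader.cpp and the printed statistics are just for illustration):

//check_loader.cpp (standalone sketch)
#include <iostream>
#include "dataloader.cuh"
using namespace std;

int main()
{
	const int train_size = 60000;
	float* labels = new float[train_size * 10]();// zero-initialized so the one-hot rows start at 0
	float* images = new float[train_size * 784];
	read_Mnist_Label("./dataset/train-labels.idx1-ubyte", labels);
	read_Mnist_Images("./dataset/train-images.idx3-ubyte", images);

	// one-hot row of the first sample, plus a rough pixel mean of the first image
	for (int j = 0; j < 10; j++) cout << labels[j] << " ";
	cout << endl;
	float mean = 0;
	for (int k = 0; k < 784; k++) mean += images[k];
	cout << "mean pixel of image 0: " << mean / 784 << endl;

	delete[] labels;
	delete[] images;
	return 0;
}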

functions.cuh

#pragma once

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

__global__ void MatMulwithBias(float* x, int x_h, int x_w, float* w, int w_h, int w_w, float* b, float* out);
__global__ void MatMul(float* x, int x_h, int x_w, float* w, int w_h, int w_w, float* out);
__global__ void MatTrans(float* A, int A_w, int A_h, float* out);

__global__ void CudaReluForward(float* input, float* output);
__global__ void CudaReluBackward(float* input, float* value, float* output);

void softmax(float* input, float* output);
float cross_entropy_error(float* y, float* t);

void FC_init(float* params, int size);

void SGD_update(float* params, float* grads, int size);

__global__ void cuda_swl_backward(float* y, float* t, float* dx, int n);


functions.cu

#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "functions.cuh"
#include <random>
#include <ctime>
using namespace std;


// out (x_h x w_w) = x (x_h x x_w) * w (w_h x w_w) + b, with b shaped like out; one thread per output element
__global__ void MatMulwithBias(float* x, int x_h, int x_w, float* w, int w_h, int w_w, float* b, float* out)
{
	int row = threadIdx.y + blockIdx.y*blockDim.y;
	int col = threadIdx.x + blockIdx.x*blockDim.x;
	if (row < x_h&&col < w_w)
	{
		float temp = 0.0;
		for (int i = 0; i < x_w; i++)
		{
			temp += x[row*x_w + i] * w[i*w_w + col];
		}
		temp += b[row*w_w + col];
		out[row*w_w + col] = temp;
	}
}

// out (x_h x w_w) = x (x_h x x_w) * w (w_h x w_w); one thread per output element
__global__ void MatMul(float* x, int x_h, int x_w, float* w, int w_h, int w_w, float* out)
{
	int row = threadIdx.y + blockIdx.y*blockDim.y;
	int col = threadIdx.x + blockIdx.x*blockDim.x;
	if (row < x_h&&col < w_w)
	{
		float temp = 0.0;
		for (int i = 0; i < x_w; i++)
		{
			temp += x[row*x_w + i] * w[i*w_w + col];
		}
		out[row*w_w + col] = temp;
	}
}

// Transpose: as called in layers.cu, A has A_w rows and A_h columns (row-major); out receives the A_h x A_w transpose
__global__ void MatTrans(float* A, int A_w, int A_h, float* out)
{
	int ny = threadIdx.y + blockIdx.y*blockDim.y;
	int nx = threadIdx.x + blockIdx.x*blockDim.x;

	if (nx < A_h&&ny < A_w)
	{
		out[nx*A_w + ny] = A[ny*A_h + nx];
	}
}

// element-wise ReLU: output[i] = max(input[i], 0); no bounds check, so the launch must cover exactly the array
__global__ void CudaReluForward(float* input, float* output)
{
	int i = threadIdx.x + blockIdx.x*blockDim.x;
	output[i] = (input[i] > 0) ? input[i] : 0;
}

// ReLU backward: pass value[i] through where the forward output (input[i]) was positive, otherwise 0
__global__ void CudaReluBackward(float* input, float* value, float* output)
{
	int i = threadIdx.x + blockIdx.x*blockDim.x;
	output[i] = (input[i] > 0) ? value[i] : 0;
}

// In-place tree reduction (sum) over this block's segment of input.
// The guards keep threads from reading past the n valid elements.
__global__ void reduceSum(float* input, float* output, int n)
{
	int tid = threadIdx.x;
	//split the array into per-block segments: input is the base pointer, data points to this block's segment
	float* data = input + blockIdx.x*blockDim.x;
	for (int stride = 1; stride < blockDim.x; stride *= 2)
	{
		if (tid < n && (tid % (2 * stride)) == 0 && tid + stride < n)
		{
			data[tid] += data[tid + stride];
		}
		__syncthreads();
	}
	if (tid == 0)
	{
		output[blockIdx.x] = data[0];
	}
}

// In-place tree reduction (max) over this block's segment; same layout and guards as reduceSum.
__global__ void reduceMax(float* input, float* output, int n)
{
	int tid = threadIdx.x;

	float* data = input + blockIdx.x*blockDim.x;
	for (int stride = 1; stride < blockDim.x; stride *= 2)
	{
		if (tid < n && (tid % (2 * stride)) == 0 && tid + stride < n)
		{
			if (data[tid] < data[tid + stride])
			{
				data[tid] = data[tid + stride];
			}
		}
		__syncthreads();
	}
	if (tid == 0)
	{
		output[blockIdx.x] = data[0];
	}
}

__global__ void cudaExp(float* input, float* c, int n)
{
	int i = threadIdx.x + blockIdx.x*blockDim.x;
	if (i < n)
	{
		input[i] = expf(input[i] - c[0]);
	}
}

__global__ void cudaDivide(float* input, float* output, float* d, int n)
{
	int i = threadIdx.x + blockIdx.x*blockDim.x;
	if (i < n)
	{
		output[i] = input[i] / d[0];
	}
}

// Numerically stable softmax over the 10 output logits: subtract the max (reduceMax),
// exponentiate (cudaExp), then normalize by the sum (reduceSum + cudaDivide).
// Note that input is modified in place.
void softmax(float* input, float* output)
{

	const int input_size = 10;
	float* in_copy = nullptr;
	cudaMalloc(&in_copy, sizeof(float)*input_size);
	cudaMemcpy(in_copy, input, sizeof(float)*input_size, cudaMemcpyDeviceToDevice);

	dim3 block_sfm(1024, 1);
	dim3 grid_sfm((input_size - 1) / block_sfm.x + 1, 1);
	//Max
	float* t_B = nullptr;
	cudaMalloc(&t_B, sizeof(float)*grid_sfm.x);
	reduceMax << <grid_sfm, block_sfm >> > (in_copy, t_B, input_size);//Max=t_B[0]
	//exp
	cudaExp << <grid_sfm, block_sfm >> > (input, t_B, input_size);
	float* exp_sum_input = nullptr;
	cudaMalloc(&exp_sum_input, sizeof(float)*input_size);
	cudaMemcpy(exp_sum_input, input, sizeof(float)*input_size, cudaMemcpyDeviceToDevice);
	//sum
	float* t_C = nullptr;
	cudaMalloc(&t_C, sizeof(float)*grid_sfm.x);
	reduceSum << <grid_sfm, block_sfm >> > (exp_sum_input, t_C, input_size);//sum=t_C[0]
	//divide
	cudaDivide << <grid_sfm, block_sfm >> > (input, output, t_C, input_size);

	cudaFree(t_B);
	cudaFree(t_C);
	cudaFree(in_copy);
	cudaFree(exp_sum_input);
}

// Cross-entropy with a one-hot target: only the thread at the hot index writes -t[i]*log(y[i] + delta) to out[0]
__global__ void cudaCSEerror(float* y, float* t, int n, float* out)
{
	int i = threadIdx.x + blockIdx.x*blockDim.x;
	float delta = 1e-7;

	if (i < n&&t[i] != 0)
	{
		out[0] = -(t[i] * log(y[i] + delta));
	}
}

float cross_entropy_error(float* y, float* t)
{
	const int input_size = 10;
	float out = 0.0f;
	float* out_array = nullptr;
	cudaMalloc(&out_array, sizeof(float) * 1);
	cudaMemset(out_array, 0, sizeof(float));
	dim3 block(1024, 1);
	dim3 grid((input_size - 1) / block.x + 1, 1);
	cudaCSEerror << <grid, block >> > (y, t, input_size, out_array);
	cudaMemcpy(&out, out_array, sizeof(float) * 1, cudaMemcpyDeviceToHost);
	cudaFree(out_array);

	return out;
}



// Backward pass of softmax-with-loss: dx = y - t
__global__ void cuda_swl_backward(float* y, float* t, float* dx, int n)
{
	int i = threadIdx.x + blockIdx.x*blockDim.x;
	if (i < n)
	{
		dx[i] = y[i] - t[i];
	}
}

// Host-side weight initialization: small uniform values in (-0.02, 0.02).
void FC_init(float* params, int size)
{
	default_random_engine e(time(0));
	uniform_real_distribution<float> u(-2, 2);
	for (int i = 0; i < size; i++)
	{
		params[i] = 0.01*u(e);
	}
}

// SGD step with a fixed learning rate of 0.01.
__global__ void SGD_update_kernel(float* params, float* grads, int size)
{
	int i = threadIdx.x + blockDim.x*blockIdx.x;
	if (i < size)//the grid is rounded up to whole blocks, so guard against out-of-range writes
	{
		params[i] -= 0.01*grads[i];
	}
}

void SGD_update(float* params, float* grads, int size)
{
	dim3 block(1024, 1);
	dim3 grid((size - 1) / block.x + 1, 1);
	SGD_update_kernel << <grid, block >> > (params, grads, size);
}
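
The softmax and cross-entropy pipeline above (reduceMax, cudaExp, reduceSum, cudaDivide, cudaCSEerror) can be checked against a plain CPU reference. This is only a verification sketch with made-up logits, not part of the project:

//softmax_check.cpp (CPU reference, for comparison only)
#include <cmath>
#include <cstdio>

// softmax over n logits: subtract the max for numerical stability, exponentiate, normalize
void softmax_cpu(const float* in, float* out, int n)
{
	float m = in[0];
	for (int i = 1; i < n; i++) if (in[i] > m) m = in[i];
	float sum = 0.0f;
	for (int i = 0; i < n; i++) { out[i] = expf(in[i] - m); sum += out[i]; }
	for (int i = 0; i < n; i++) out[i] /= sum;
}

// cross-entropy against a one-hot target, with the same 1e-7 delta as the CUDA version
float cross_entropy_cpu(const float* y, const float* t, int n)
{
	const float delta = 1e-7f;
	float loss = 0.0f;
	for (int i = 0; i < n; i++) loss -= t[i] * logf(y[i] + delta);
	return loss;
}

int main()
{
	float logits[10] = { 0.1f, 2.0f, -1.0f, 0.0f, 0.5f, 0.3f, -0.2f, 1.0f, 0.0f, 0.7f };
	float t[10] = { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 };
	float y[10];
	softmax_cpu(logits, y, 10);
	printf("loss = %f\n", cross_entropy_cpu(y, t, 10));
	return 0;
}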

layers.cuh

#pragma once
#include "functions.cuh"



class FC
{
public:
	float* self_w;
	float* self_b;
	float* self_x;
	float* self_dw;
	float* self_db;
	float* self_output;
	float* self_dx;
	//private:
	int self_input_size;
	int self_output_size;
public:
	FC();
	FC(float* w, float* b, int input_size, int output_size);
	~FC();
	void fc_forward(float* x);
	void fc_backward(float* dout);
};


class Relu
{
public:
	Relu();
	Relu(int size);
	~Relu();
	void r_forward(float* x);
	void r_backward(float* x);
private:
	int self_size;
public:
	float* self_mask;
	float* self_dx;
};

class Softmax_with_loss
{
public:
	Softmax_with_loss();
	~Softmax_with_loss();
	void swl_forward(float* x, float* t);
	void swl_backward();

private:
	int self_input_size;
public:
	float* self_y;
	float* self_t;
	float* self_dx;
	float self_loss;
	float* self_t_dx;
};

layers.cu

#include <iostream>
#include "functions.cuh"
#include "layers.cuh"
using namespace std;

FC::FC() {}

FC::FC(float* w, float* b, int input_size, int output_size)
	:self_w(w),
	self_b(b),
	self_input_size(input_size),
	self_output_size(output_size)
{

	self_x = nullptr;
	self_dw = nullptr;
	self_db = nullptr;
	self_output = nullptr;
	self_dx = nullptr;

	cudaMalloc(&self_x, self_input_size * sizeof(float));
	cudaMalloc(&self_dw, self_input_size*self_output_size * sizeof(float));
	cudaMalloc(&self_db, self_output_size * sizeof(float));
	cudaMalloc(&self_output, self_output_size * sizeof(float));
	cudaMalloc(&self_dx, self_input_size * sizeof(float));
}

FC::~FC() {}

void FC::fc_forward(float* x)
{
	cudaMemcpy(self_x, x, self_input_size * sizeof(float), cudaMemcpyDeviceToDevice);
	dim3 blockSize(32, 32);//a block cannot have more than 1024 threads
	//the grid must cover both the input and output dimensions; sizing it from
	//self_input_size alone would leave output columns beyond it uncomputed
	int grid_dim = (self_input_size > self_output_size) ? self_input_size : self_output_size;
	dim3 gridSize((grid_dim + blockSize.x - 1) / blockSize.x, (grid_dim + blockSize.y - 1) / blockSize.y);
	MatMulwithBias << <gridSize, blockSize >> > (x, 1, self_input_size, self_w, self_input_size, self_output_size, self_b, self_output);
}

void FC::fc_backward(float* dout)
{
	float* self_w_T = nullptr;
	cudaMalloc(&self_w_T, self_input_size * self_output_size * sizeof(float));

	dim3 blockSize(32, 32);//a block cannot have more than 1024 threads
	//cover both dimensions so every element of w_T, dx and dw gets a thread
	int grid_dim = (self_input_size > self_output_size) ? self_input_size : self_output_size;
	dim3 gridSize((grid_dim + blockSize.x - 1) / blockSize.x, (grid_dim + blockSize.y - 1) / blockSize.y);
	MatTrans << <gridSize, blockSize >> > (self_w, self_input_size, self_output_size, self_w_T);
	MatMul << <gridSize, blockSize >> > (dout, 1, self_output_size, self_w_T, self_output_size, self_input_size, self_dx);
	cudaFree(self_w_T);
	MatMul << <gridSize, blockSize >> > (self_x, self_input_size, 1, dout, 1, self_output_size, self_dw);
	cudaMemcpy(self_db, dout, sizeof(float)*self_output_size, cudaMemcpyDeviceToDevice);
}

Relu::Relu() {};

Relu::Relu(int size) :self_size(size)
{
	self_mask = nullptr;
	self_dx = nullptr;

	cudaMalloc(&self_mask, sizeof(float)*self_size);
	cudaMalloc(&self_dx, sizeof(float)*self_size);
}

Relu::~Relu()
{
	//intentionally left empty (like FC::~FC): the layers are copy-assigned inside Model's
	//constructor, so freeing the device buffers here would leave those copies with dangling pointers
}
void Relu::r_forward(float* x)
{
	dim3 blockSize(256);
	dim3 gridSize((self_size + blockSize.x - 1) / blockSize.x);
	CudaReluForward << <gridSize, blockSize >> > (x, self_mask);

	cudaMemcpy(x, self_mask, sizeof(float)*self_size, cudaMemcpyDeviceToDevice);
}

void Relu::r_backward(float* dout)
{
	dim3 blockSize(256);
	dim3 gridSize((self_size + blockSize.x - 1) / blockSize.x);
	CudaReluBackward << <gridSize, blockSize >> > (self_mask, dout, self_dx);
}

Softmax_with_loss::Softmax_with_loss() :self_input_size(10)
{
	self_t = nullptr;
	self_y = nullptr;
	self_dx = nullptr;
	cudaMalloc(&self_t, sizeof(float)*self_input_size);
	cudaMalloc(&self_y, sizeof(float)*self_input_size);
	cudaMalloc(&self_dx, sizeof(float)*self_input_size);
}

Softmax_with_loss::~Softmax_with_loss() {}

void Softmax_with_loss::swl_forward(float* x, float* t)
{
	cudaMemcpy(self_t, t, sizeof(float)*self_input_size, cudaMemcpyDeviceToDevice);
	softmax(x, self_y);
	self_loss = cross_entropy_error(self_y, self_t);
}

void Softmax_with_loss::swl_backward()
{
	dim3 block(1024, 1);
	dim3 grid((self_input_size - 1) / block.x + 1, 1);

	cuda_swl_backward << <grid, block >> > (self_y, self_t, self_dx, self_input_size);
}
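
With the batch size fixed at 1, the shapes in FC are: x is 1 x input_size, w is input_size x output_size, and the output is 1 x output_size. For reference, a standalone forward pass through one FC layer with dummy data (the file name fc_demo.cu, the sizes, and the values are just for illustration):

//fc_demo.cu (standalone sketch)
#include <iostream>
#include "cuda_runtime.h"
#include "functions.cuh"
#include "layers.cuh"
using namespace std;

int main()
{
	const int in_size = 4, out_size = 3;

	//initialize the weights on the host, then copy everything to the device
	float* h_w = new float[in_size * out_size];
	FC_init(h_w, in_size * out_size);
	float* d_w = nullptr, *d_b = nullptr, *d_x = nullptr;
	cudaMalloc(&d_w, sizeof(float) * in_size * out_size);
	cudaMalloc(&d_b, sizeof(float) * out_size);
	cudaMalloc(&d_x, sizeof(float) * in_size);
	cudaMemcpy(d_w, h_w, sizeof(float) * in_size * out_size, cudaMemcpyHostToDevice);
	cudaMemset(d_b, 0, sizeof(float) * out_size);

	float h_x[in_size] = { 1.0f, 0.5f, -0.5f, 2.0f };
	cudaMemcpy(d_x, h_x, sizeof(float) * in_size, cudaMemcpyHostToDevice);

	FC fc(d_w, d_b, in_size, out_size);
	fc.fc_forward(d_x);//the result lands in fc.self_output (1 x out_size)

	float h_out[out_size];
	cudaMemcpy(h_out, fc.self_output, sizeof(float) * out_size, cudaMemcpyDeviceToHost);
	for (int i = 0; i < out_size; i++) cout << h_out[i] << " ";
	cout << endl;

	delete[] h_w;
	cudaFree(d_w); cudaFree(d_b); cudaFree(d_x);
	return 0;
}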

main.cuh

#pragma once
#include "layers.cuh"

class Model
{
public:
	float* self_params_w1;
	float* self_params_b1;
	float* self_params_w2;
	float* self_params_b2;
	float* self_params_w3;
	float* self_params_b3;

	float* self_grads_w1;
	float* self_grads_b1;
	float* self_grads_w2;
	float* self_grads_b2;
	float* self_grads_w3;
	float* self_grads_b3;

	FC self_layer1;
	Relu self_layer1_act;
	FC self_layer2;
	Relu self_layer2_act;
	FC self_layer3;
	//Relu self_layer3_act;
	Softmax_with_loss self_layer4;


private:
	int* self_model_size;

public:
	Model(int* model_size);
	~Model();
	float* model_forward(float* x);
	float loss(float* x, float* t);
	float gradient(float* x, float* t);
	void SGD();
};

main.cu

#include <iostream>
#include <algorithm>//std::copy, std::max_element
#include <cstring>//memset
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "layers.cuh"
#include "functions.cuh"
#include "dataloader.cuh"
#include "main.cuh"
#include <time.h>
#include <opencv2/opencv.hpp>
using namespace cv;
using namespace std;


Model::Model(int* model_size) :self_model_size(model_size)
{
	//allocate device (CUDA) memory for the params
	self_params_w1 = nullptr;
	self_params_b1 = nullptr;
	self_params_w2 = nullptr;
	self_params_b2 = nullptr;
	self_params_w3 = nullptr;
	self_params_b3 = nullptr;
	cudaMalloc(&self_params_w1, sizeof(float)*self_model_size[0] * self_model_size[1]);
	cudaMalloc(&self_params_b1, sizeof(float)*self_model_size[1]);
	cudaMalloc(&self_params_w2, sizeof(float)*self_model_size[1] * self_model_size[2]);
	cudaMalloc(&self_params_b2, sizeof(float)*self_model_size[2]);
	cudaMalloc(&self_params_w3, sizeof(float)*self_model_size[2] * self_model_size[3]);
	cudaMalloc(&self_params_b3, sizeof(float)*self_model_size[3]);

	//grads
	self_grads_w1 = nullptr;
	self_grads_b1 = nullptr;
	self_grads_w2 = nullptr;
	self_grads_b2 = nullptr;
	self_grads_w3 = nullptr;
	self_grads_b3 = nullptr;
	cudaMalloc(&self_grads_w1, sizeof(float)*self_model_size[0] * self_model_size[1]);
	cudaMalloc(&self_grads_b1, sizeof(float)*self_model_size[1]);
	cudaMalloc(&self_grads_w2, sizeof(float)*self_model_size[1] * self_model_size[2]);
	cudaMalloc(&self_grads_b2, sizeof(float)*self_model_size[2]);
	cudaMalloc(&self_grads_w3, sizeof(float)*self_model_size[2] * self_model_size[3]);
	cudaMalloc(&self_grads_b3, sizeof(float)*self_model_size[3]);

	//initialize the params on the host
	float* params_w1 = new float[self_model_size[0] * self_model_size[1]];
	float* params_w2 = new float[self_model_size[1] * self_model_size[2]];
	float* params_w3 = new float[self_model_size[2] * self_model_size[3]];
	FC_init(params_w1, self_model_size[0] * self_model_size[1]);
	FC_init(params_w2, self_model_size[1] * self_model_size[2]);
	FC_init(params_w3, self_model_size[2] * self_model_size[3]);


	//copy the initialized parameters from the host to the device
	cudaMemcpy(self_params_w1, params_w1, sizeof(float)*self_model_size[0] * self_model_size[1], cudaMemcpyHostToDevice);
	cudaMemcpy(self_params_w2, params_w2, sizeof(float)*self_model_size[1] * self_model_size[2], cudaMemcpyHostToDevice);
	cudaMemcpy(self_params_w3, params_w3, sizeof(float)*self_model_size[2] * self_model_size[3], cudaMemcpyHostToDevice);
	cudaMemset(self_params_b1, 0, sizeof(float)*self_model_size[1]);
	cudaMemset(self_params_b2, 0, sizeof(float)*self_model_size[2]);
	cudaMemset(self_params_b3, 0, sizeof(float)*self_model_size[3]);

	//instantiate the layers
	self_layer1 = FC(self_params_w1, self_params_b1, self_model_size[0], self_model_size[1]);
	self_layer1_act = Relu(self_model_size[1]);
	self_layer2 = FC(self_params_w2, self_params_b2, self_model_size[1], self_model_size[2]);
	self_layer2_act = Relu(self_model_size[2]);
	self_layer3 = FC(self_params_w3, self_params_b3, self_model_size[2], self_model_size[3]);
	self_layer4 = Softmax_with_loss();

	delete[]params_w1;
	delete[]params_w2;
	delete[]params_w3;
}


Model::~Model() {}

float* Model::model_forward(float* x)//(device)
{
	self_layer1.fc_forward(x);
	self_layer1_act.r_forward(self_layer1.self_output);
	self_layer2.fc_forward(self_layer1.self_output);
	self_layer2_act.r_forward(self_layer2.self_output);
	self_layer3.fc_forward(self_layer2.self_output);

	return self_layer3.self_output;
}

float Model::loss(float* x, float* t)
{
	self_layer4.swl_forward(x, t);
	return self_layer4.self_loss;
}

float Model::gradient(float* x, float* t)//(device,host)
{
	float* y = model_forward(x);
	float loss_value = loss(y, t);
	//backward pass
	self_layer4.swl_backward();
	self_layer3.fc_backward(self_layer4.self_dx);
	self_layer2_act.r_backward(self_layer3.self_dx);
	self_layer2.fc_backward(self_layer2_act.self_dx);
	self_layer1_act.r_backward(self_layer2.self_dx);
	self_layer1.fc_backward(self_layer1_act.self_dx);
	//store the grads
	cudaMemcpy(self_grads_w1, self_layer1.self_dw, sizeof(float) * self_model_size[0] * self_model_size[1], cudaMemcpyDeviceToDevice);
	cudaMemcpy(self_grads_b1, self_layer1.self_db, sizeof(float) * self_model_size[1], cudaMemcpyDeviceToDevice);
	cudaMemcpy(self_grads_w2, self_layer2.self_dw, sizeof(float) * self_model_size[1] * self_model_size[2], cudaMemcpyDeviceToDevice);
	cudaMemcpy(self_grads_b2, self_layer2.self_db, sizeof(float) * self_model_size[2], cudaMemcpyDeviceToDevice);
	cudaMemcpy(self_grads_w3, self_layer3.self_dw, sizeof(float) * self_model_size[2] * self_model_size[3], cudaMemcpyDeviceToDevice);
	cudaMemcpy(self_grads_b3, self_layer3.self_db, sizeof(float) * self_model_size[3], cudaMemcpyDeviceToDevice);
	return loss_value;

}

void Model::SGD()
{
	SGD_update(self_params_w1, self_grads_w1, self_model_size[0] * self_model_size[1]);
	SGD_update(self_params_b1, self_grads_b1, self_model_size[1]);
	SGD_update(self_params_w2, self_grads_w2, self_model_size[1] * self_model_size[2]);
	SGD_update(self_params_b2, self_grads_b2, self_model_size[2]);
	SGD_update(self_params_w3, self_grads_w3, self_model_size[2] * self_model_size[3]);
	SGD_update(self_params_b3, self_grads_b3, self_model_size[3]);
}

int main()
{
	//instantiate the Model
	int model_size[4] = { 784,1024,1024,10 };
	Model DNN(model_size);

	//load the dataset
	const int train_size = 60000;
	const int test_size = 10000;
	const int label_size = 10;
	const int image_size = 784;
	float* train_label = new float[train_size * label_size];//array holding the one-hot labels
	float* train_image = new float[train_size * image_size];//flattened array holding the images
	memset(train_label, 0, train_size * label_size * sizeof(float));
	read_Mnist_Label("./dataset/train-labels.idx1-ubyte", train_label);
	read_Mnist_Images("./dataset/train-images.idx3-ubyte", train_image);

	//Train
	const int epoch = 1;
	for (int e = 0; e < epoch; e++)
	{
		float loss_sum = 0;
		clock_t t1, t2;
		t1 = clock();
		for (int i = 0; i < 3000; i++)
		{
			//copy one training sample to the device
			float* train_x = new float[image_size];
			float* train_t = new float[label_size];
			copy(train_label + (i * label_size), train_label + (i + 1) * label_size, train_t);
			copy(train_image + (i * image_size), train_image + (i + 1) * image_size, train_x);
			float* t_train_x = nullptr;
			float* t_train_t = nullptr;
			cudaMalloc(&t_train_x, sizeof(float)*image_size);
			cudaMemcpy(t_train_x, train_x, sizeof(float) * image_size, cudaMemcpyHostToDevice);
			cudaMalloc(&t_train_t, sizeof(float)*label_size);
			cudaMemcpy(t_train_t, train_t, sizeof(float) * label_size, cudaMemcpyHostToDevice);

			float loss_ = DNN.gradient(t_train_x, t_train_t);
			DNN.SGD();
			loss_sum += loss_;

			cudaFree(t_train_x);
			cudaFree(t_train_t);
			delete[]train_x;
			delete[]train_t;

			if (i % 500 == 0)
			{
				t2 = clock();
				float loss_avg = loss_sum / 500;
				cout << i << " Loss:" << loss_avg << " | ";
				cout << "Time: " << (double)(t2 - t1) / CLOCKS_PER_SEC << endl;
				t1 = clock();
				loss_sum = 0;
			}

		}
	}
	//read in a single image for testing
	Mat img = imread("0.jpg", 0);
	if (!img.data) { cout << "failed to read the test image" << endl; }
	float* test_array = new float[28 * 28];
	int a = 0;
	for (int i = 0; i < img.rows; i++)
	{
		for (int j = 0; j < img.cols; j++)
		{
			float value = (img.at<uchar>(i, j)) / 255.0;
			test_array[a] = value;
			a += 1;
		}
	}

	float* t_test_array = nullptr;
	cudaMalloc(&t_test_array, sizeof(float) * 28 * 28);
	cudaMemcpy(t_test_array, test_array, sizeof(float) * 28 * 28, cudaMemcpyHostToDevice);
	float* t_test_y = DNN.model_forward(t_test_array);
	float* test_y_softmax = nullptr;
	cudaMalloc(&test_y_softmax, sizeof(float) * 10);

	softmax(t_test_y, test_y_softmax);

	float* h_y_softmax = new float[10];
	cudaMemcpy(h_y_softmax, test_y_softmax, sizeof(float) * 10, cudaMemcpyDeviceToHost);
	int y_out = max_element(h_y_softmax, h_y_softmax + 10) - h_y_softmax;
	cout << "预测值:" << y_out << endl;

	return 0;
}
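
To build, one option (a guess; adjust paths and libraries to your environment, assuming the CUDA toolkit and OpenCV are installed) is to compile all four .cu files together with nvcc, e.g. nvcc main.cu layers.cu functions.cu dataloader.cu -o dnn $(pkg-config --cflags --libs opencv4), or add them to a CUDA project in Visual Studio and link OpenCV there.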

