[AI] Handwritten Neural Network | CUDA Version

  • Recently, to compare how efficiently a neural network runs in different languages, I hand-wrote the same 3-layer DNN model in four languages, and I am recording it here.
  • The model is a 3-layer fully connected network trained on the MNIST handwritten digit dataset, implemented in four versions: PyTorch, Python, C++, and CUDA.
  • The code, algorithm derivations, and formulas follow 《深度学习入门 基于Python的理论与实现》 (Deep Learning from Scratch); Aliyun drive link: Link. The book is very thorough, so refer to it for anything in the code that is unclear.

0. Series Contents

  1. Experimental results
  2. PyTorch version code
  3. Python version
    3.1. NumPy version code
    3.2. Pure Python version code
  4. C++ version code
  5. CUDA version code

1. Notes

  1. A fully connected neural network implemented in CUDA; all data is handled as flat, one-dimensional arrays (see the indexing sketch after this list).
  2. To keep the code simple, the batch size can currently only be set to 1.
  3. Complete code on GitHub:
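
All of the kernels below use the same row-major flattening: element (row, col) of an h x w matrix is stored at index row * w + col in a flat float array. A minimal host-side sketch of that convention (the array and values here are just for illustration):

#include <cstdio>

int main()
{
	// a 2 x 3 matrix stored row-major in a flat array: a[row][col] -> a[row * w + col]
	const int h = 2, w = 3;
	float a[h * w] = { 1, 2, 3,
	                   4, 5, 6 };
	printf("a[1][2] = %.0f\n", a[1 * w + 2]);// prints 6
	return 0;
}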

2. Code

dataloader.cuh

#pragma once

#include <string>

using namespace std;

void read_Mnist_Images(string filename, float* img_array);
void read_Mnist_Label(string filename, float* label_array);

dataloader.cu

//dataloader.cu
#include <iostream>
#include <fstream>
#include <string>
#include <cstring>
#include "dataloader.cuh"
using namespace std;

// The MNIST IDX files store their header integers in big-endian order,
// so on a little-endian machine the bytes have to be swapped.
int ReverseInt(int i)
{
	unsigned char ch1, ch2, ch3, ch4;
	ch1 = i & 255;
	ch2 = (i >> 8) & 255;
	ch3 = (i >> 16) & 255;
	ch4 = (i >> 24) & 255;
	return ((int)ch1 << 24) + ((int)ch2 << 16) + ((int)ch3 << 8) + ch4;
}


void read_Mnist_Label(string filename, float* label_array)
{
	ifstream file(filename, ios::binary);
	if (file.is_open())
	{

		int magic_number = 0;
		int number_of_images = 0;
		file.read((char*)&magic_number, sizeof(magic_number));
		file.read((char*)&number_of_images, sizeof(number_of_images));
		magic_number = ReverseInt(magic_number);
		number_of_images = ReverseInt(number_of_images);
		cout << "magic number = " << magic_number << endl;
		cout << "number of images = " << number_of_images << endl;
		//float label_array[];
		for (int i = 0; i < number_of_images; i++)
		{
			unsigned char label = 0;
			file.read((char*)&label, sizeof(label));
			//onehot[(int)label] = 1;
			label_array[i * 10 + (int)label] = 1;//convert the label to one-hot format here
		}
	}
	else {
		cout << "open file failed." << endl;
	}
	file.close();
}

void read_Mnist_Images(string filename, float* img_array)
{
	ifstream file(filename, ios::binary);
	if (file.is_open())
	{
		int magic_number = 0;
		int number_of_images = 0;
		int n_rows = 0;
		int n_cols = 0;
		//unsigned char label;
		file.read((char*)&magic_number, sizeof(magic_number));
		file.read((char*)&number_of_images, sizeof(number_of_images));
		file.read((char*)&n_rows, sizeof(n_rows));
		file.read((char*)&n_cols, sizeof(n_cols));
		magic_number = ReverseInt(magic_number);
		number_of_images = ReverseInt(number_of_images);
		n_rows = ReverseInt(n_rows);
		n_cols = ReverseInt(n_cols);
		cout << "magic number = " << magic_number << endl;
		cout << "number of images = " << number_of_images << endl;
		cout << "rows = " << n_rows << endl;
		cout << "cols = " << n_cols << endl;
		//Mat temp(n_rows, n_cols, CV_8UC1, Scalar::all(0));
		for (int i = 0; i < number_of_images; i++)
		{
			//img_array[i] = new float[784];
			int img_num = 0;
			for (int r = 0; r < n_rows; r++)
			{
				for (int c = 0; c < n_cols; c++)
				{

					unsigned char image = 0;
					file.read((char*)&image, sizeof(image));
					img_array[i * 784 + img_num] = (float)image / 255.0;//scale the pixel to [0, 1]
					img_num++;
				}
			}
		}
	}
	else {
		cout << "open file failed." << endl;
	}
	file.close();
}
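
The loaders can be sanity-checked on the host before wiring them into training. A small standalone sketch, assuming the MNIST files sit in ./dataset as in main.cu below (the file name check_loader.cpp and the printed statistics are just for illustration):

//check_loader.cpp (standalone sketch)
#include <iostream>
#include "dataloader.cuh"
using namespace std;

int main()
{
	const int train_size = 60000;
	float* labels = new float[train_size * 10]();// zero-initialized so the one-hot rows start at 0
	float* images = new float[train_size * 784];
	read_Mnist_Label("./dataset/train-labels.idx1-ubyte", labels);
	read_Mnist_Images("./dataset/train-images.idx3-ubyte", images);

	// one-hot row of the first sample, plus a rough pixel mean of the first image
	for (int j = 0; j < 10; j++) cout << labels[j] << " ";
	cout << endl;
	float mean = 0;
	for (int k = 0; k < 784; k++) mean += images[k];
	cout << "mean pixel of image 0: " << mean / 784 << endl;

	delete[] labels;
	delete[] images;
	return 0;
}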

functions.cuh

#pragma once

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

__global__ void MatMulwithBias(float* x, int x_h, int x_w, float* w, int w_h, int w_w, float* b, float* out);
__global__ void MatMul(float* x, int x_h, int x_w, float* w, int w_h, int w_w, float* out);
__global__ void MatTrans(float* A, int A_w, int A_h, float* out);

__global__ void CudaReluForward(float* input, float* output);
__global__ void CudaReluBackward(float* input, float* value, float* output);

void softmax(float* input, float* output);
float cross_entropy_error(float* y, float* t);

void FC_init(float* params, int size);

void SGD_update(float* params, float* grads, int size);

__global__ void cuda_swl_backward(float* y, float* t, float* dx, int n);


functions.cu

#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "functions.cuh"
#include <random>
#include <ctime>
using namespace std;


// out (x_h x w_w) = x (x_h x x_w) * w (w_h x w_w) + b, with b shaped like out; one thread per output element
__global__ void MatMulwithBias(float* x, int x_h, int x_w, float* w, int w_h, int w_w, float* b, float* out)
{
	int row = threadIdx.y + blockIdx.y*blockDim.y;
	int col = threadIdx.x + blockIdx.x*blockDim.x;
	if (row < x_h&&col < w_w)
	{
		float temp = 0.0;
		for (int i = 0; i < x_w; i++)
		{
			temp += x[row*x_w + i] * w[i*w_w + col];
		}
		temp += b[row*w_w + col];
		out[row*w_w + col] = temp;
	}
}

// out (x_h x w_w) = x (x_h x x_w) * w (w_h x w_w); one thread per output element
__global__ void MatMul(float* x, int x_h, int x_w, float* w, int w_h, int w_w, float* out)
{
	int row = threadIdx.y + blockIdx.y*blockDim.y;
	int col = threadIdx.x + blockIdx.x*blockDim.x;
	if (row < x_h&&col < w_w)
	{
		float temp = 0.0;
		for (int i = 0; i < x_w; i++)
		{
			temp += x[row*x_w + i] * w[i*w_w + col];
		}
		out[row*w_w + col] = temp;
	}
}

// Transpose: as called in layers.cu, A has A_w rows and A_h columns (row-major); out receives the A_h x A_w transpose
__global__ void MatTrans(float* A, int A_w, int A_h, float* out)
{
	int ny = threadIdx.y + blockIdx.y*blockDim.y;
	int nx = threadIdx.x + blockIdx.x*blockDim.x;

	if (nx < A_h&&ny < A_w)
	{
		out[nx*A_w + ny] = A[ny*A_h + nx];
	}
}

// element-wise ReLU: output[i] = max(input[i], 0); no bounds check, so the launch must cover exactly the array
__global__ void CudaReluForward(float* input, float* output)
{
	int i = threadIdx.x + blockIdx.x*blockDim.x;
	output[i] = (input[i] > 0) ? input[i] : 0;
}

// ReLU backward: pass value[i] through where the forward output (input[i]) was positive, otherwise 0
__global__ void CudaReluBackward(float* input, float* value, float* output)
{
	int i = threadIdx.x + blockIdx.x*blockDim.x;
	output[i] = (input[i] > 0) ? value[i] : 0;
}

// In-place tree reduction (sum) over this block's segment of input.
// The guards keep threads from reading past the n valid elements.
__global__ void reduceSum(float* input, float* output, int n)
{
	int tid = threadIdx.x;
	//split the array into per-block segments: input is the base pointer, data points to this block's segment
	float* data = input + blockIdx.x*blockDim.x;
	for (int stride = 1; stride < blockDim.x; stride *= 2)
	{
		if (tid < n && (tid % (2 * stride)) == 0 && tid + stride < n)
		{
			data[tid] += data[tid + stride];
		}
		__syncthreads();
	}
	if (tid == 0)
	{
		output[blockIdx.x] = data[0];
	}
}

// In-place tree reduction (max) over this block's segment; same layout and guards as reduceSum.
__global__ void reduceMax(float* input, float* output, int n)
{
	int tid = threadIdx.x;

	float* data = input + blockIdx.x*blockDim.x;
	for (int stride = 1; stride < blockDim.x; stride *= 2)
	{
		if (tid < n && (tid % (2 * stride)) == 0 && tid + stride < n)
		{
			if (data[tid] < data[tid + stride])
			{
				data[tid] = data[tid + stride];
			}
		}
		__syncthreads();
	}
	if (tid == 0)
	{
		output[blockIdx.x] = data[0];
	}
}

__global__ void cudaExp(float* input, float* c, int n)
{
	int i = threadIdx.x + blockIdx.x*blockDim.x;
	if (i < n)
	{
		input[i] = expf(input[i] - c[0]);
	}
}

__global__ void cudaDivide(float* input, float* output, float* d, int n)
{
	int i = threadIdx.x + blockIdx.x*blockDim.x;
	if (i < n)
	{
		output[i] = input[i] / d[0];
	}
}

// Numerically stable softmax over the 10 output logits: subtract the max (reduceMax),
// exponentiate (cudaExp), then normalize by the sum (reduceSum + cudaDivide).
// Note that input is modified in place.
void softmax(float* input, float* output)
{

	const int input_size = 10;
	float* in_copy = nullptr;
	cudaMalloc(&in_copy, sizeof(float)*input_size);
	cudaMemcpy(in_copy, input, sizeof(float)*input_size, cudaMemcpyDeviceToDevice);

	dim3 block_sfm(1024, 1);
	dim3 grid_sfm((input_size - 1) / block_sfm.x + 1, 1);
	//Max
	float* t_B = nullptr;
	cudaMalloc(&t_B, sizeof(float)*grid_sfm.x);
	reduceMax << <grid_sfm, block_sfm >> > (in_copy, t_B, input_size);//Max=t_B[0]
	//exp
	cudaExp << <grid_sfm, block_sfm >> > (input, t_B, input_size);
	float* exp_sum_input = nullptr;
	cudaMalloc(&exp_sum_input, sizeof(float)*input_size);
	cudaMemcpy(exp_sum_input, input, sizeof(float)*input_size, cudaMemcpyDeviceToDevice);
	//sum
	float* t_C = nullptr;
	cudaMalloc(&t_C, sizeof(float)*grid_sfm.x);
	reduceSum << <grid_sfm, block_sfm >> > (exp_sum_input, t_C, input_size);//sum=t_C[0]
	//divide
	cudaDivide << <grid_sfm, block_sfm >> > (input, output, t_C, input_size);

	cudaFree(t_B);
	cudaFree(t_C);
	cudaFree(in_copy);
	cudaFree(exp_sum_input);
}

// Cross-entropy with a one-hot target: only the thread at the hot index writes -t[i]*log(y[i] + delta) to out[0]
__global__ void cudaCSEerror(float* y, float* t, int n, float* out)
{
	int i = threadIdx.x + blockIdx.x*blockDim.x;
	float delta = 1e-7;

	if (i < n&&t[i] != 0)
	{
		out[0] = -(t[i] * log(y[i] + delta));
	}
}

float cross_entropy_error(float* y, float* t)
{
	const int input_size = 10;
	float out = 0.0f;
	float* out_array = nullptr;
	cudaMalloc(&out_array, sizeof(float) * 1);
	cudaMemset(out_array, 0, sizeof(float));
	dim3 block(1024, 1);
	dim3 grid((input_size - 1) / block.x + 1, 1);
	cudaCSEerror << <grid, block >> > (y, t, input_size, out_array);
	cudaMemcpy(&out, out_array, sizeof(float) * 1, cudaMemcpyDeviceToHost);
	cudaFree(out_array);

	return out;
}



// Backward pass of softmax-with-loss: dx = y - t
__global__ void cuda_swl_backward(float* y, float* t, float* dx, int n)
{
	int i = threadIdx.x + blockIdx.x*blockDim.x;
	if (i < n)
	{
		dx[i] = y[i] - t[i];
	}
}

// Host-side weight initialization: small uniform values in (-0.02, 0.02).
void FC_init(float* params, int size)
{
	default_random_engine e(time(0));
	uniform_real_distribution<float> u(-2, 2);
	for (int i = 0; i < size; i++)
	{
		params[i] = 0.01*u(e);
	}
}

// SGD step with a fixed learning rate of 0.01.
__global__ void SGD_update_kernel(float* params, float* grads, int size)
{
	int i = threadIdx.x + blockDim.x*blockIdx.x;
	if (i < size)//the grid is rounded up to whole blocks, so guard against out-of-range writes
	{
		params[i] -= 0.01*grads[i];
	}
}

void SGD_update(float* params, float* grads, int size)
{
	dim3 block(1024, 1);
	dim3 grid((size - 1) / block.x + 1, 1);
	SGD_update_kernel << <grid, block >> > (params, grads, size);
}
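
The softmax and cross-entropy pipeline above (reduceMax, cudaExp, reduceSum, cudaDivide, cudaCSEerror) can be checked against a plain CPU reference. This is only a verification sketch with made-up logits, not part of the project:

//softmax_check.cpp (CPU reference, for comparison only)
#include <cmath>
#include <cstdio>

// softmax over n logits: subtract the max for numerical stability, exponentiate, normalize
void softmax_cpu(const float* in, float* out, int n)
{
	float m = in[0];
	for (int i = 1; i < n; i++) if (in[i] > m) m = in[i];
	float sum = 0.0f;
	for (int i = 0; i < n; i++) { out[i] = expf(in[i] - m); sum += out[i]; }
	for (int i = 0; i < n; i++) out[i] /= sum;
}

// cross-entropy against a one-hot target, with the same 1e-7 delta as the CUDA version
float cross_entropy_cpu(const float* y, const float* t, int n)
{
	const float delta = 1e-7f;
	float loss = 0.0f;
	for (int i = 0; i < n; i++) loss -= t[i] * logf(y[i] + delta);
	return loss;
}

int main()
{
	float logits[10] = { 0.1f, 2.0f, -1.0f, 0.0f, 0.5f, 0.3f, -0.2f, 1.0f, 0.0f, 0.7f };
	float t[10] = { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 };
	float y[10];
	softmax_cpu(logits, y, 10);
	printf("loss = %f\n", cross_entropy_cpu(y, t, 10));
	return 0;
}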

layers.cuh

#pragma once
#include "functions.cuh"



class FC
{
public:
	float* self_w;
	float* self_b;
	float* self_x;
	float* self_dw;
	float* self_db;
	float* self_output;
	float* self_dx;
	//private:
	int self_input_size;
	int self_output_size;
public:
	FC();
	FC(float* w, float* b, int input_size, int output_size);
	~FC();
	void fc_forward(float* x);
	void fc_backward(float* dout);
};


class Relu
{
public:
	Relu();
	Relu(int size);
	~Relu();
	void r_forward(float* x);
	void r_backward(float* x);
private:
	int self_size;
public:
	float* self_mask;
	float* self_dx;
};

class Softmax_with_loss
{
public:
	Softmax_with_loss();
	~Softmax_with_loss();
	void swl_forward(float* x, float* t);
	void swl_backward();

private:
	int self_input_size;
public:
	float* self_y;
	float* self_t;
	float* self_dx;
	float self_loss;
	float* self_t_dx;
};

layers.cu

#include <iostream>
#include "functions.cuh"
#include "layers.cuh"
using namespace std;

FC::FC() {}

FC::FC(float* w, float* b, int input_size, int output_size)
	:self_w(w),
	self_b(b),
	self_input_size(input_size),
	self_output_size(output_size)
{

	self_x = nullptr;
	self_dw = nullptr;
	self_db = nullptr;
	self_output = nullptr;
	self_dx = nullptr;

	cudaMalloc(&self_x, self_input_size * sizeof(float));
	cudaMalloc(&self_dw, self_input_size*self_output_size * sizeof(float));
	cudaMalloc(&self_db, self_output_size * sizeof(float));
	cudaMalloc(&self_output, self_output_size * sizeof(float));
	cudaMalloc(&self_dx, self_input_size * sizeof(float));
}

FC::~FC() {}

void FC::fc_forward(float* x)
{
	cudaMemcpy(self_x, x, self_input_size * sizeof(float), cudaMemcpyDeviceToDevice);
	dim3 blockSize(32, 32);//a block cannot have more than 1024 threads
	//the grid must cover both the input and output dimensions; sizing it from
	//self_input_size alone would leave output columns beyond it uncomputed
	int grid_dim = (self_input_size > self_output_size) ? self_input_size : self_output_size;
	dim3 gridSize((grid_dim + blockSize.x - 1) / blockSize.x, (grid_dim + blockSize.y - 1) / blockSize.y);
	MatMulwithBias << <gridSize, blockSize >> > (x, 1, self_input_size, self_w, self_input_size, self_output_size, self_b, self_output);
}

void FC::fc_backward(float* dout)
{
	float* self_w_T = nullptr;
	cudaMalloc(&self_w_T, self_input_size * self_output_size * sizeof(float));

	dim3 blockSize(32, 32);//a block cannot have more than 1024 threads
	//cover both dimensions so every element of w_T, dx and dw gets a thread
	int grid_dim = (self_input_size > self_output_size) ? self_input_size : self_output_size;
	dim3 gridSize((grid_dim + blockSize.x - 1) / blockSize.x, (grid_dim + blockSize.y - 1) / blockSize.y);
	MatTrans << <gridSize, blockSize >> > (self_w, self_input_size, self_output_size, self_w_T);
	MatMul << <gridSize, blockSize >> > (dout, 1, self_output_size, self_w_T, self_output_size, self_input_size, self_dx);
	cudaFree(self_w_T);
	MatMul << <gridSize, blockSize >> > (self_x, self_input_size, 1, dout, 1, self_output_size, self_dw);
	cudaMemcpy(self_db, dout, sizeof(float)*self_output_size, cudaMemcpyDeviceToDevice);
}

Relu::Relu() {};

Relu::Relu(int size) :self_size(size)
{
	self_mask = nullptr;
	self_dx = nullptr;

	cudaMalloc(&self_mask, sizeof(float)*self_size);
	cudaMalloc(&self_dx, sizeof(float)*self_size);
}

Relu::~Relu()
{
	//intentionally left empty (like FC::~FC): the layers are copy-assigned inside Model's
	//constructor, so freeing the device buffers here would leave those copies with dangling pointers
}
void Relu::r_forward(float* x)
{
	dim3 blockSize(256);
	dim3 gridSize((self_size + blockSize.x - 1) / blockSize.x);
	CudaReluForward << <gridSize, blockSize >> > (x, self_mask);

	cudaMemcpy(x, self_mask, sizeof(float)*self_size, cudaMemcpyDeviceToDevice);
}

void Relu::r_backward(float* dout)
{
	dim3 blockSize(256);
	dim3 gridSize((self_size + blockSize.x - 1) / blockSize.x);
	CudaReluBackward << <gridSize, blockSize >> > (self_mask, dout, self_dx);
}

Softmax_with_loss::Softmax_with_loss() :self_input_size(10)
{
	self_t = nullptr;
	self_y = nullptr;
	self_dx = nullptr;
	cudaMalloc(&self_t, sizeof(float)*self_input_size);
	cudaMalloc(&self_y, sizeof(float)*self_input_size);
	cudaMalloc(&self_dx, sizeof(float)*self_input_size);
}

Softmax_with_loss::~Softmax_with_loss() {}

void Softmax_with_loss::swl_forward(float* x, float* t)
{
	cudaMemcpy(self_t, t, sizeof(float)*self_input_size, cudaMemcpyDeviceToDevice);
	softmax(x, self_y);
	self_loss = cross_entropy_error(self_y, self_t);
}

void Softmax_with_loss::swl_backward()
{
	dim3 block(1024, 1);
	dim3 grid((self_input_size - 1) / block.x + 1, 1);

	cuda_swl_backward << <grid, block >> > (self_y, self_t, self_dx, self_input_size);
}
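
With the batch size fixed at 1, the shapes in FC are: x is 1 x input_size, w is input_size x output_size, and the output is 1 x output_size. For reference, a standalone forward pass through one FC layer with dummy data (the file name fc_demo.cu, the sizes, and the values are just for illustration):

//fc_demo.cu (standalone sketch)
#include <iostream>
#include "cuda_runtime.h"
#include "functions.cuh"
#include "layers.cuh"
using namespace std;

int main()
{
	const int in_size = 4, out_size = 3;

	//initialize the weights on the host, then copy everything to the device
	float* h_w = new float[in_size * out_size];
	FC_init(h_w, in_size * out_size);
	float* d_w = nullptr, *d_b = nullptr, *d_x = nullptr;
	cudaMalloc(&d_w, sizeof(float) * in_size * out_size);
	cudaMalloc(&d_b, sizeof(float) * out_size);
	cudaMalloc(&d_x, sizeof(float) * in_size);
	cudaMemcpy(d_w, h_w, sizeof(float) * in_size * out_size, cudaMemcpyHostToDevice);
	cudaMemset(d_b, 0, sizeof(float) * out_size);

	float h_x[in_size] = { 1.0f, 0.5f, -0.5f, 2.0f };
	cudaMemcpy(d_x, h_x, sizeof(float) * in_size, cudaMemcpyHostToDevice);

	FC fc(d_w, d_b, in_size, out_size);
	fc.fc_forward(d_x);//the result lands in fc.self_output (1 x out_size)

	float h_out[out_size];
	cudaMemcpy(h_out, fc.self_output, sizeof(float) * out_size, cudaMemcpyDeviceToHost);
	for (int i = 0; i < out_size; i++) cout << h_out[i] << " ";
	cout << endl;

	delete[] h_w;
	cudaFree(d_w); cudaFree(d_b); cudaFree(d_x);
	return 0;
}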

main.cuh

#pragma once
#include "layers.cuh"

class Model
{
public:
	float* self_params_w1;
	float* self_params_b1;
	float* self_params_w2;
	float* self_params_b2;
	float* self_params_w3;
	float* self_params_b3;

	float* self_grads_w1;
	float* self_grads_b1;
	float* self_grads_w2;
	float* self_grads_b2;
	float* self_grads_w3;
	float* self_grads_b3;

	FC self_layer1;
	Relu self_layer1_act;
	FC self_layer2;
	Relu self_layer2_act;
	FC self_layer3;
	//Relu self_layer3_act;
	Softmax_with_loss self_layer4;


private:
	int* self_model_size;

public:
	Model(int* model_size);
	~Model();
	float* model_forward(float* x);
	float loss(float* x, float* t);
	float gradient(float* x, float* t);
	void SGD();
};

main.cu

#include <iostream>
#include <algorithm>//std::copy, std::max_element
#include <cstring>//memset
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "layers.cuh"
#include "functions.cuh"
#include "dataloader.cuh"
#include "main.cuh"
#include <time.h>
#include <opencv2/opencv.hpp>
using namespace cv;
using namespace std;


Model::Model(int* model_size) :self_model_size(model_size)
{
	//allocate device (CUDA) memory for the params
	self_params_w1 = nullptr;
	self_params_b1 = nullptr;
	self_params_w2 = nullptr;
	self_params_b2 = nullptr;
	self_params_w3 = nullptr;
	self_params_b3 = nullptr;
	cudaMalloc(&self_params_w1, sizeof(float)*self_model_size[0] * self_model_size[1]);
	cudaMalloc(&self_params_b1, sizeof(float)*self_model_size[1]);
	cudaMalloc(&self_params_w2, sizeof(float)*self_model_size[1] * self_model_size[2]);
	cudaMalloc(&self_params_b2, sizeof(float)*self_model_size[2]);
	cudaMalloc(&self_params_w3, sizeof(float)*self_model_size[2] * self_model_size[3]);
	cudaMalloc(&self_params_b3, sizeof(float)*self_model_size[3]);

	//grads
	self_grads_w1 = nullptr;
	self_grads_b1 = nullptr;
	self_grads_w2 = nullptr;
	self_grads_b2 = nullptr;
	self_grads_w3 = nullptr;
	self_grads_b3 = nullptr;
	cudaMalloc(&self_grads_w1, sizeof(float)*self_model_size[0] * self_model_size[1]);
	cudaMalloc(&self_grads_b1, sizeof(float)*self_model_size[1]);
	cudaMalloc(&self_grads_w2, sizeof(float)*self_model_size[1] * self_model_size[2]);
	cudaMalloc(&self_grads_b2, sizeof(float)*self_model_size[2]);
	cudaMalloc(&self_grads_w3, sizeof(float)*self_model_size[2] * self_model_size[3]);
	cudaMalloc(&self_grads_b3, sizeof(float)*self_model_size[3]);

	//initialize the params on the host
	float* params_w1 = new float[self_model_size[0] * self_model_size[1]];
	float* params_w2 = new float[self_model_size[1] * self_model_size[2]];
	float* params_w3 = new float[self_model_size[2] * self_model_size[3]];
	FC_init(params_w1, self_model_size[0] * self_model_size[1]);
	FC_init(params_w2, self_model_size[1] * self_model_size[2]);
	FC_init(params_w3, self_model_size[2] * self_model_size[3]);


	//copy the initialized parameters from the host to the device
	cudaMemcpy(self_params_w1, params_w1, sizeof(float)*self_model_size[0] * self_model_size[1], cudaMemcpyHostToDevice);
	cudaMemcpy(self_params_w2, params_w2, sizeof(float)*self_model_size[1] * self_model_size[2], cudaMemcpyHostToDevice);
	cudaMemcpy(self_params_w3, params_w3, sizeof(float)*self_model_size[2] * self_model_size[3], cudaMemcpyHostToDevice);
	cudaMemset(self_params_b1, 0, sizeof(float)*self_model_size[1]);
	cudaMemset(self_params_b2, 0, sizeof(float)*self_model_size[2]);
	cudaMemset(self_params_b3, 0, sizeof(float)*self_model_size[3]);

	//instantiate the layers
	self_layer1 = FC(self_params_w1, self_params_b1, self_model_size[0], self_model_size[1]);
	self_layer1_act = Relu(self_model_size[1]);
	self_layer2 = FC(self_params_w2, self_params_b2, self_model_size[1], self_model_size[2]);
	self_layer2_act = Relu(self_model_size[2]);
	self_layer3 = FC(self_params_w3, self_params_b3, self_model_size[2], self_model_size[3]);
	self_layer4 = Softmax_with_loss();

	delete[]params_w1;
	delete[]params_w2;
	delete[]params_w3;
}


Model::~Model() {}

float* Model::model_forward(float* x)//(device)
{
	self_layer1.fc_forward(x);
	self_layer1_act.r_forward(self_layer1.self_output);
	self_layer2.fc_forward(self_layer1.self_output);
	self_layer2_act.r_forward(self_layer2.self_output);
	self_layer3.fc_forward(self_layer2.self_output);

	return self_layer3.self_output;
}

float Model::loss(float* x, float* t)
{
	self_layer4.swl_forward(x, t);
	return self_layer4.self_loss;
}

float Model::gradient(float* x, float* t)//(device,host)
{
	float* y = model_forward(x);
	float loss_value = loss(y, t);
	//backward pass
	self_layer4.swl_backward();
	self_layer3.fc_backward(self_layer4.self_dx);
	self_layer2_act.r_backward(self_layer3.self_dx);
	self_layer2.fc_backward(self_layer2_act.self_dx);
	self_layer1_act.r_backward(self_layer2.self_dx);
	self_layer1.fc_backward(self_layer1_act.self_dx);
	//store the grads
	cudaMemcpy(self_grads_w1, self_layer1.self_dw, sizeof(float) * self_model_size[0] * self_model_size[1], cudaMemcpyDeviceToDevice);
	cudaMemcpy(self_grads_b1, self_layer1.self_db, sizeof(float) * self_model_size[1], cudaMemcpyDeviceToDevice);
	cudaMemcpy(self_grads_w2, self_layer2.self_dw, sizeof(float) * self_model_size[1] * self_model_size[2], cudaMemcpyDeviceToDevice);
	cudaMemcpy(self_grads_b2, self_layer2.self_db, sizeof(float) * self_model_size[2], cudaMemcpyDeviceToDevice);
	cudaMemcpy(self_grads_w3, self_layer3.self_dw, sizeof(float) * self_model_size[2] * self_model_size[3], cudaMemcpyDeviceToDevice);
	cudaMemcpy(self_grads_b3, self_layer3.self_db, sizeof(float) * self_model_size[3], cudaMemcpyDeviceToDevice);
	return loss_value;

}

void Model::SGD()
{
	SGD_update(self_params_w1, self_grads_w1, self_model_size[0] * self_model_size[1]);
	SGD_update(self_params_b1, self_grads_b1, self_model_size[1]);
	SGD_update(self_params_w2, self_grads_w2, self_model_size[1] * self_model_size[2]);
	SGD_update(self_params_b2, self_grads_b2, self_model_size[2]);
	SGD_update(self_params_w3, self_grads_w3, self_model_size[2] * self_model_size[3]);
	SGD_update(self_params_b3, self_grads_b3, self_model_size[3]);
}

int main()
{
	//instantiate the Model
	int model_size[4] = { 784,1024,1024,10 };
	Model DNN(model_size);

	//load the dataset
	const int train_size = 60000;
	const int test_size = 10000;
	const int label_size = 10;
	const int image_size = 784;
	float* train_label = new float[train_size * label_size];//array holding the one-hot labels
	float* train_image = new float[train_size * image_size];//flattened array holding the images
	memset(train_label, 0, train_size * label_size * sizeof(float));
	read_Mnist_Label("./dataset/train-labels.idx1-ubyte", train_label);
	read_Mnist_Images("./dataset/train-images.idx3-ubyte", train_image);

	//Train
	const int epoch = 1;
	for (int e = 0; e < epoch; e++)
	{
		float loss_sum = 0;
		clock_t t1, t2;
		t1 = clock();
		for (int i = 0; i < 3000; i++)
		{
			//copy one training sample to the device
			float* train_x = new float[image_size];
			float* train_t = new float[label_size];
			copy(train_label + (i * label_size), train_label + (i + 1) * label_size, train_t);
			copy(train_image + (i * image_size), train_image + (i + 1) * image_size, train_x);
			float* t_train_x = nullptr;
			float* t_train_t = nullptr;
			cudaMalloc(&t_train_x, sizeof(float)*image_size);
			cudaMemcpy(t_train_x, train_x, sizeof(float) * image_size, cudaMemcpyHostToDevice);
			cudaMalloc(&t_train_t, sizeof(float)*label_size);
			cudaMemcpy(t_train_t, train_t, sizeof(float) * label_size, cudaMemcpyHostToDevice);

			float loss_ = DNN.gradient(t_train_x, t_train_t);
			DNN.SGD();
			loss_sum += loss_;

			cudaFree(t_train_x);
			cudaFree(t_train_t);
			delete[]train_x;
			delete[]train_t;

			if (i % 500 == 0)
			{
				t2 = clock();
				float loss_avg = loss_sum / 500;
				cout << i << " Loss:" << loss_avg << " | ";
				cout << "Time: " << (double)(t2 - t1) / CLOCKS_PER_SEC << endl;
				t1 = clock();
				loss_sum = 0;
			}

		}
	}
	//read in a single image for testing
	Mat img = imread("0.jpg", 0);
	if (!img.data) { cout << "failed to read the test image" << endl; }
	float* test_array = new float[28 * 28];
	int a = 0;
	for (int i = 0; i < img.rows; i++)
	{
		for (int j = 0; j < img.cols; j++)
		{
			float value = (img.at<uchar>(i, j)) / 255.0;
			test_array[a] = value;
			a += 1;
		}
	}

	float* t_test_array = nullptr;
	cudaMalloc(&t_test_array, sizeof(float) * 28 * 28);
	cudaMemcpy(t_test_array, test_array, sizeof(float) * 28 * 28, cudaMemcpyHostToDevice);
	float* t_test_y = DNN.model_forward(t_test_array);
	float* test_y_softmax = nullptr;
	cudaMalloc(&test_y_softmax, sizeof(float) * 10);

	softmax(t_test_y, test_y_softmax);

	float* h_y_softmax = new float[10];
	cudaMemcpy(h_y_softmax, test_y_softmax, sizeof(float) * 10, cudaMemcpyDeviceToHost);
	int y_out = max_element(h_y_softmax, h_y_softmax + 10) - h_y_softmax;
	cout << "预测值:" << y_out << endl;

	return 0;
}
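
To build, one option (a guess; adjust paths and libraries to your environment, assuming the CUDA toolkit and OpenCV are installed) is to compile all four .cu files together with nvcc, e.g. nvcc main.cu layers.cu functions.cu dataloader.cu -o dnn $(pkg-config --cflags --libs opencv4), or add them to a CUDA project in Visual Studio and link OpenCV there.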

