- 最近为了对比神经网络在不同语言中的运行效率,于是手搓了3层DNN模型在4种语言中的代码,在这儿记录一下。
- 基于MNIST手写数字数据集,采用3层全连接网络,由Pytorch、Python、C++、CUDA四个版本实现。
- 代码及相关算法原理和公式参照《深度学习入门 基于Python的理论与实现》,阿里云盘链接:Link。这本书写的很详细,代码中有看不懂的地方可以参照该书。
0.系列文章目录
- 实验结果
- Pytorch版代码
- Python版
2.1.Numpy版代码 2.2.纯Python版代码 - C++版代码
- CUDA版代码
1.说明
- 基于CUDA实现的全连接神经网络,以一维数组的方式对数据进行处理。
- 为了简化代码,目前batchsize只能设置为1
- 完整代码,Github:
2.代码
dataloader.cuh
#pragma once
using namespace std;
void read_Mnist_Images(string filename, float* img_array);
void read_Mnist_Label(string filename, float* label_array);
dataloader.cu
#include <iostream>
#include <fstream>
#include <string>
#include <cstring>
using namespace std;
int ReverseInt(int i)
{
unsigned char ch1, ch2, ch3, ch4;
ch1 = i & 255;
ch2 = (i >> 8) & 255;
ch3 = (i >> 16) & 255;
ch4 = (i >> 24) & 255;
return((int)ch1 << 24) + ((int)ch2 << 16) + ((int)ch3 << 8) + ch4;
}
void read_Mnist_Label(string filename, float* label_array)
{
ifstream file(filename, ios::binary);
if (file.is_open())
{
int magic_number = 0;
int number_of_images = 0;
file.read((char*)&magic_number, sizeof(magic_number));
file.read((char*)&number_of_images, sizeof(number_of_images));
magic_number = ReverseInt(magic_number);
number_of_images = ReverseInt(number_of_images);
cout << "magic number = " << magic_number << endl;
cout << "number of images = " << number_of_images << endl;
for (int i = 0; i < number_of_images; i++)
{
unsigned char label = 0;
file.read((char*)&label, sizeof(label));
label_array[i * 10 + (int)label] = 1;
}
}
else {
cout << "open file failed." << endl;
}
file.close();
}
void read_Mnist_Images(string filename, float* img_array)
{
ifstream file(filename, ios::binary);
if (file.is_open())
{
int magic_number = 0;
int number_of_images = 0;
int n_rows = 0;
int n_cols = 0;
file.read((char*)&magic_number, sizeof(magic_number));
file.read((char*)&number_of_images, sizeof(number_of_images));
file.read((char*)&n_rows, sizeof(n_rows));
file.read((char*)&n_cols, sizeof(n_cols));
magic_number = ReverseInt(magic_number);
number_of_images = ReverseInt(number_of_images);
n_rows = ReverseInt(n_rows);
n_cols = ReverseInt(n_cols);
cout << "magic number = " << magic_number << endl;
cout << "number of images = " << number_of_images << endl;
cout << "rows = " << n_rows << endl;
cout << "cols = " << n_cols << endl;
for (int i = 0; i < number_of_images; i++)
{
int img_num = 0;
for (int r = 0; r < n_rows; r++)
{
for (int c = 0; c < n_cols; c++)
{
unsigned char image = 0;
file.read((char*)&image, sizeof(image));
img_array[i * 784 + img_num] = (float)image / 255.0;
img_num++;
}
}
}
}
else {
cout << "open file failed." << endl;
}
file.close();
}
functions.cuh
#pragma once
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
__global__ void MatMulwithBias(float* x, int x_w, int x_h, float* w, int w_w, int w_h, float* b, float* out);
__global__ void MatMul(float* x, int x_h, int x_w, float* w, int w_h, int w_w, float* out);
__global__ void MatTrans(float* A, int A_w, int A_h, float* out);
__global__ void CudaReluForward(float* input, float* output);
__global__ void CudaReluBackward(float* input, float* value, float* output);
void softmax(float* input, float* output);
float cross_entropy_error(float* y, float* t);
void FC_init(float* params, int size);
void SGD_update(float* params, float* grads, int size);
__global__ void cuda_swl_backward(float* y, float* t, float* dx, int n);
functions.cu
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "functions.cuh"
#include <random>
#include <ctime>
using namespace std;
__global__ void MatMulwithBias(float* x, int x_h, int x_w, float* w, int w_h, int w_w, float* b, float* out)
{
int row = threadIdx.y + blockIdx.y*blockDim.y;
int col = threadIdx.x + blockIdx.x*blockDim.x;
if (row < x_h&&col < w_w)
{
float temp = 0.0;
for (int i = 0; i < x_w; i++)
{
temp += x[row*x_w + i] * w[i*w_w + col];
}
temp += b[row*w_w + col];
out[row*w_w + col] = temp;
}
}
__global__ void MatMul(float* x, int x_h, int x_w, float* w, int w_h, int w_w, float* out)
{
int row = threadIdx.y + blockIdx.y*blockDim.y;
int col = threadIdx.x + blockIdx.x*blockDim.x;
if (row < x_h&&col < w_w)
{
float temp = 0.0;
for (int i = 0; i < x_w; i++)
{
temp += x[row*x_w + i] * w[i*w_w + col];
}
out[row*w_w + col] = temp;
}
}
__global__ void MatTrans(float* A, int A_w, int A_h, float* out)
{
int ny = threadIdx.y + blockIdx.y*blockDim.y;
int nx = threadIdx.x + blockIdx.x*blockDim.x;
if (nx < A_h&&ny < A_w)
{
out[nx*A_w + ny] = A[ny*A_h + nx];
}
}
__global__ void CudaReluForward(float* input, float* output)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
output[i] = (input[i] > 0) ? input[i] : 0;
}
__global__ void CudaReluBackward(float* input, float* value, float* output)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
output[i] = (input[i] > 0) ? value[i] : 0;
}
__global__ void reduceSum(float* input, float* output, int n)
{
int tid = threadIdx.x;
if (tid >= n) return;
float* data = input + blockIdx.x*blockDim.x;
for (int stride = 1; stride < blockDim.x; stride *= 2)
{
if ((tid % (2 * stride)) == 0)
{
data[tid] += data[tid + stride];
}
__syncthreads();
}
if (tid == 0)
{
output[blockIdx.x] = data[0];
}
}
__global__ void reduceMax(float* input, float* output, int n)
{
int tid = threadIdx.x;
if (tid >= n)return;
float* data = input + blockIdx.x*blockDim.x;
for (int stride = 1; stride < blockDim.x; stride *= 2)
{
if ((tid % (2 * stride)) == 0)
{
if (data[tid] < data[tid + stride])
{
data[tid] = data[tid + stride];
}
}
__syncthreads();
}
if (tid == 0)
{
output[blockIdx.x] = data[0];
}
}
__global__ void cudaExp(float* input, float* c, int n)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i < n)
{
input[i] = expf(input[i] - c[0]);
}
}
__global__ void cudaDivide(float* input, float* output, float* d, int n)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i < n)
{
output[i] = input[i] / d[0];
}
}
void softmax(float* input, float* output)
{
const int input_size = 10;
float* in_copy = nullptr;
cudaMalloc(&in_copy, sizeof(float)*input_size);
cudaMemcpy(in_copy, input, sizeof(float)*input_size, cudaMemcpyDeviceToDevice);
dim3 block_sfm(1024, 1);
dim3 grid_sfm((input_size - 1) / block_sfm.x + 1, 1);
float* t_B = nullptr;
cudaMalloc(&t_B, sizeof(float)*grid_sfm.x);
reduceMax << <grid_sfm, block_sfm >> > (in_copy, t_B, input_size);
cudaExp << <grid_sfm, block_sfm >> > (input, t_B, input_size);
float* exp_sum_input = nullptr;
cudaMalloc(&exp_sum_input, sizeof(float)*input_size);
cudaMemcpy(exp_sum_input, input, sizeof(float)*input_size, cudaMemcpyDeviceToDevice);
float* t_C = nullptr;
cudaMalloc(&t_C, sizeof(float)*grid_sfm.x);
reduceSum << <grid_sfm, block_sfm >> > (exp_sum_input, t_C, input_size);
cudaDivide << <grid_sfm, block_sfm >> > (input, output, t_C, input_size);
cudaFree(t_B);
cudaFree(t_C);
cudaFree(in_copy);
cudaFree(exp_sum_input);
}
__global__ void cudaCSEerror(float* y, float* t, int n, float* out)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
float delta = 1e-7;
if (i < n&&t[i] != 0)
{
out[0] = -(t[i] * log(y[i] + delta));
}
}
float cross_entropy_error(float* y, float* t)
{
const int input_size = 10;
float* out = new float[1];
float* out_array = nullptr;
cudaMalloc(&out_array, sizeof(float) * 1);
dim3 block(1024, 1);
dim3 grid((input_size - 1) / block.x + 1, 1);
cudaCSEerror << <grid, block >> > (y, t, input_size, out_array);
cudaMemcpy(out, out_array, sizeof(float) * 1, cudaMemcpyDeviceToHost);
return out[0];
}
__global__ void cuda_swl_backward(float* y, float* t, float* dx, int n)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i < n)
{
dx[i] = y[i] - t[i];
}
}
void FC_init(float* params, int size)
{
default_random_engine e(time(0));
uniform_real_distribution<float> u(-2, 2);
for (int i = 0; i < size; i++)
{
params[i] = 0.01*u(e);
}
}
__global__ void SGD_update_kernel(float* params, float* grads, int size)
{
int i = threadIdx.x + blockDim.x*blockIdx.x;
params[i] -= 0.01*grads[i];
}
void SGD_update(float* params, float* grads, int size)
{
dim3 block(1024, 1);
dim3 grid((size - 1) / block.x + 1, 1);
SGD_update_kernel << <grid, block >> > (params, grads, size);
}
layers.cuh
#pragma once
#include "functions.cuh"
class FC
{
public:
float* self_w;
float* self_b;
float* self_x;
float* self_dw;
float* self_db;
float* self_output;
float* self_dx;
int self_input_size;
int self_output_size;
public:
FC();
FC(float* w, float* b, int input_size, int output_size);
~FC();
void fc_forward(float* x);
void fc_backward(float* dout);
};
class Relu
{
public:
Relu();
Relu(int size);
~Relu();
void r_forward(float* x);
void r_backward(float* x);
private:
int self_size;
public:
float* self_mask;
float* self_dx;
};
class Softmax_with_loss
{
public:
Softmax_with_loss();
~Softmax_with_loss();
void swl_forward(float* x, float* t);
void swl_backward();
private:
int self_input_size;
public:
float* self_y;
float* self_t;
float* self_dx;
float self_loss;
float* self_t_dx;
};
layers.cu
#include <iostream>
#include "functions.cuh"
#include "layers.cuh"
using namespace std;
FC::FC() {}
FC::FC(float* w, float* b, int input_size, int output_size)
:self_w(w),
self_b(b),
self_input_size(input_size),
self_output_size(output_size)
{
self_x = nullptr;
self_dw = nullptr;
self_db = nullptr;
self_output = nullptr;
self_dx = nullptr;
cudaMalloc(&self_x, self_input_size * sizeof(float));
cudaMalloc(&self_dw, self_input_size*self_output_size * sizeof(float));
cudaMalloc(&self_db, self_output_size * sizeof(float));
cudaMalloc(&self_output, self_output_size * sizeof(float));
cudaMalloc(&self_dx, self_input_size * sizeof(float));
}
FC::~FC() {}
void FC::fc_forward(float* x)
{
cudaMemcpy(self_x, x, self_input_size * sizeof(float), cudaMemcpyDeviceToDevice);
dim3 blockSize(32, 32);
dim3 gridSize((self_input_size + blockSize.x - 1) / blockSize.x, (self_input_size + blockSize.y - 1) / blockSize.y);
MatMulwithBias << <gridSize, blockSize >> > (x, 1, self_input_size, self_w, self_input_size, self_output_size, self_b, self_output);
}
void FC::fc_backward(float* dout)
{
float* self_w_T = nullptr;
cudaMalloc(&self_w_T, self_input_size * self_output_size * sizeof(float));
dim3 blockSize(32, 32);
dim3 gridSize((self_input_size + blockSize.x - 1) / blockSize.x, (self_input_size + blockSize.y - 1) / blockSize.y);
MatTrans << <gridSize, blockSize >> > (self_w, self_input_size, self_output_size, self_w_T);
MatMul << <gridSize, blockSize >> > (dout, 1, self_output_size, self_w_T, self_output_size, self_input_size, self_dx);
cudaFree(self_w_T);
MatMul << <gridSize, blockSize >> > (self_x, self_input_size, 1, dout, 1, self_output_size, self_dw);
cudaMemcpy(self_db, dout, sizeof(float)*self_output_size, cudaMemcpyDeviceToDevice);
}
Relu::Relu() {};
Relu::Relu(int size) :self_size(size)
{
self_mask = nullptr;
self_dx = nullptr;
cudaMalloc(&self_mask, sizeof(float)*self_size);
cudaMalloc(&self_dx, sizeof(float)*self_size);
}
Relu::~Relu()
{
cudaFree(self_mask);
cudaFree(self_dx);
}
void Relu::r_forward(float* x)
{
dim3 blockSize(256);
dim3 gridSize((self_size + blockSize.x - 1) / blockSize.x);
CudaReluForward << <gridSize, blockSize >> > (x, self_mask);
cudaMemcpy(x, self_mask, sizeof(float)*self_size, cudaMemcpyDeviceToDevice);
}
void Relu::r_backward(float* dout)
{
dim3 blockSize(256);
dim3 gridSize((self_size + blockSize.x - 1) / blockSize.x);
CudaReluBackward << <gridSize, blockSize >> > (self_mask, dout, self_dx);
}
Softmax_with_loss::Softmax_with_loss() :self_input_size(10)
{
self_t = nullptr;
self_y = nullptr;
self_dx = nullptr;
cudaMalloc(&self_t, sizeof(float)*self_input_size);
cudaMalloc(&self_y, sizeof(float)*self_input_size);
cudaMalloc(&self_dx, sizeof(float)*self_input_size);
}
Softmax_with_loss::~Softmax_with_loss() {}
void Softmax_with_loss::swl_forward(float* x, float* t)
{
cudaMemcpy(self_t, t, sizeof(float)*self_input_size, cudaMemcpyDeviceToDevice);
softmax(x, self_y);
self_loss = cross_entropy_error(self_y, self_t);
}
void Softmax_with_loss::swl_backward()
{
dim3 block(1024, 1);
dim3 grid((self_input_size - 1) / block.x + 1, 1);
cuda_swl_backward << <grid, block >> > (self_y, self_t, self_dx, self_input_size);
}
main.cuh
#pragma once
#include "layers.cuh"
class Model
{
public:
float* self_params_w1;
float* self_params_b1;
float* self_params_w2;
float* self_params_b2;
float* self_params_w3;
float* self_params_b3;
float* self_grads_w1;
float* self_grads_b1;
float* self_grads_w2;
float* self_grads_b2;
float* self_grads_w3;
float* self_grads_b3;
FC self_layer1;
Relu self_layer1_act;
FC self_layer2;
Relu self_layer2_act;
FC self_layer3;
Softmax_with_loss self_layer4;
private:
int* self_model_size;
public:
Model(int* model_size);
~Model();
float* model_forward(float* x);
float loss(float* x, float* t);
float gradient(float* x, float* t);
void SGD();
};
main.cu
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "layers.cuh"
#include "functions.cuh"
#include "dataloader.cuh"
#include "main.cuh"
#include <time.h>
#include <opencv2/opencv.hpp>
using namespace cv;
using namespace std;
Model::Model(int* model_size) :self_model_size(model_size)
{
self_params_w1 = nullptr;
self_params_b1 = nullptr;
self_params_w2 = nullptr;
self_params_b2 = nullptr;
self_params_w3 = nullptr;
self_params_b3 = nullptr;
cudaMalloc(&self_params_w1, sizeof(float)*self_model_size[0] * self_model_size[1]);
cudaMalloc(&self_params_b1, sizeof(float)*self_model_size[1]);
cudaMalloc(&self_params_w2, sizeof(float)*self_model_size[1] * self_model_size[2]);
cudaMalloc(&self_params_b2, sizeof(float)*self_model_size[2]);
cudaMalloc(&self_params_w3, sizeof(float)*self_model_size[2] * self_model_size[3]);
cudaMalloc(&self_params_b3, sizeof(float)*self_model_size[3]);
self_grads_w1 = nullptr;
self_grads_b1 = nullptr;
self_grads_w2 = nullptr;
self_grads_b2 = nullptr;
self_grads_w3 = nullptr;
self_grads_b3 = nullptr;
cudaMalloc(&self_grads_w1, sizeof(float)*self_model_size[0] * self_model_size[1]);
cudaMalloc(&self_grads_b1, sizeof(float)*self_model_size[1]);
cudaMalloc(&self_grads_w2, sizeof(float)*self_model_size[1] * self_model_size[2]);
cudaMalloc(&self_grads_b2, sizeof(float)*self_model_size[2]);
cudaMalloc(&self_grads_w3, sizeof(float)*self_model_size[2] * self_model_size[3]);
cudaMalloc(&self_grads_b3, sizeof(float)*self_model_size[3]);
float* params_w1 = new float[self_model_size[0] * self_model_size[1]];
float* params_w2 = new float[self_model_size[1] * self_model_size[2]];
float* params_w3 = new float[self_model_size[2] * self_model_size[3]];
FC_init(params_w1, self_model_size[0] * self_model_size[1]);
FC_init(params_w2, self_model_size[1] * self_model_size[2]);
FC_init(params_w3, self_model_size[1] * self_model_size[3]);
cudaMemcpy(self_params_w1, params_w1, sizeof(float)*self_model_size[0] * self_model_size[1], cudaMemcpyHostToDevice);
cudaMemcpy(self_params_w2, params_w2, sizeof(float)*self_model_size[1] * self_model_size[2], cudaMemcpyHostToDevice);
cudaMemcpy(self_params_w3, params_w3, sizeof(float)*self_model_size[2] * self_model_size[3], cudaMemcpyHostToDevice);
cudaMemset(self_params_b1, 0, sizeof(float)*self_model_size[1]);
cudaMemset(self_params_b2, 0, sizeof(float)*self_model_size[2]);
cudaMemset(self_params_b3, 0, sizeof(float)*self_model_size[3]);
self_layer1 = FC(self_params_w1, self_params_b1, self_model_size[0], self_model_size[1]);
self_layer1_act = Relu(self_model_size[1]);
self_layer2 = FC(self_params_w2, self_params_b2, self_model_size[1], self_model_size[2]);
self_layer2_act = Relu(self_model_size[2]);
self_layer3 = FC(self_params_w3, self_params_b3, self_model_size[2], self_model_size[3]);
self_layer4 = Softmax_with_loss();
delete[]params_w1;
delete[]params_w2;
delete[]params_w3;
}
Model::~Model() {}
float* Model::model_forward(float* x)
{
self_layer1.fc_forward(x);
self_layer1_act.r_forward(self_layer1.self_output);
self_layer2.fc_forward(self_layer1.self_output);
self_layer2_act.r_forward(self_layer2.self_output);
self_layer3.fc_forward(self_layer2.self_output);
return self_layer3.self_output;
}
float Model::loss(float* x, float* t)
{
self_layer4.swl_forward(x, t);
return self_layer4.self_loss;
}
float Model::gradient(float* x, float* t)
{
float* y = model_forward(x);
float loss_value = loss(y, t);
self_layer4.swl_backward();
self_layer3.fc_backward(self_layer4.self_dx);
self_layer2_act.r_backward(self_layer3.self_dx);
self_layer2.fc_backward(self_layer2_act.self_dx);
self_layer1_act.r_backward(self_layer2.self_dx);
self_layer1.fc_backward(self_layer1_act.self_dx);
cudaMemcpy(self_grads_w1, self_layer1.self_dw, sizeof(float) * self_model_size[0] * self_model_size[1], cudaMemcpyDeviceToDevice);
cudaMemcpy(self_grads_b1, self_layer1.self_db, sizeof(float) * self_model_size[1], cudaMemcpyDeviceToDevice);
cudaMemcpy(self_grads_w2, self_layer2.self_dw, sizeof(float) * self_model_size[1] * self_model_size[2], cudaMemcpyDeviceToDevice);
cudaMemcpy(self_grads_b2, self_layer2.self_db, sizeof(float) * self_model_size[2], cudaMemcpyDeviceToDevice);
cudaMemcpy(self_grads_w3, self_layer3.self_dw, sizeof(float) * self_model_size[2] * self_model_size[3], cudaMemcpyDeviceToDevice);
cudaMemcpy(self_grads_b3, self_layer3.self_db, sizeof(float) * self_model_size[3], cudaMemcpyDeviceToDevice);
return loss_value;
}
void Model::SGD()
{
SGD_update(self_params_w1, self_grads_w1, self_model_size[0] * self_model_size[1]);
SGD_update(self_params_b1, self_grads_b1, self_model_size[1]);
SGD_update(self_params_w2, self_grads_w2, self_model_size[1] * self_model_size[2]);
SGD_update(self_params_b2, self_grads_b2, self_model_size[2]);
SGD_update(self_params_w3, self_grads_w3, self_model_size[2] * self_model_size[3]);
SGD_update(self_params_b3, self_grads_b3, self_model_size[3]);
}
int main()
{
int model_size[4] = { 784,1024,1024,10 };
Model DNN(model_size);
const int train_size = 60000;
const int test_size = 10000;
const int label_size = 10;
const int image_size = 784;
float* train_label = new float[train_size * label_size];
float* train_image = new float[train_size * image_size];
memset(train_label, 0, train_size * label_size * sizeof(float));
read_Mnist_Label("./dataset/train-labels.idx1-ubyte", train_label);
read_Mnist_Images("./dataset/train-images.idx3-ubyte", train_image);
const int epoch = 1;
for (int e = 0; e < epoch; e++)
{
float loss_sum = 0;
clock_t t1, t2;
t1 = clock();
for (int i = 0; i < 3000; i++)
{
float* train_x = new float[image_size];
float* train_t = new float[label_size];
copy(train_label + (i * label_size), train_label + (i + 1) * label_size, train_t);
copy(train_image + (i * image_size), train_image + (i + 1) * image_size, train_x);
float* t_train_x = nullptr;
float* t_train_t = nullptr;
cudaMalloc(&t_train_x, sizeof(float)*image_size);
cudaMemcpy(t_train_x, train_x, sizeof(float) * image_size, cudaMemcpyHostToDevice);
cudaMalloc(&t_train_t, sizeof(float)*label_size);
cudaMemcpy(t_train_t, train_t, sizeof(float) * label_size, cudaMemcpyHostToDevice);
float loss_ = DNN.gradient(t_train_x, t_train_t);
DNN.SGD();
loss_sum += loss_;
cudaFree(t_train_x);
cudaFree(t_train_t);
delete[]train_x;
delete[]train_t;
if (i % 500 == 0)
{
t2 = clock();
float loss_avg = loss_sum / 500;
cout << i << " Loss:" << loss_avg << " | ";
cout << "Time: " << (double)(t2 - t1) / CLOCKS_PER_SEC << endl;
t1 = clock();
loss_sum = 0;
}
}
}
Mat img = imread("0.jpg", 0);
if (!img.data) { cout << "测试图片读取错误" << endl; }
float* test_array = new float[28 * 28];
int a = 0;
for (int i = 0; i < img.rows; i++)
{
for (int j = 0; j < img.cols; j++)
{
float value = (img.at<uchar>(i, j)) / 255.0;
test_array[a] = value;
a += 1;
}
}
float* t_test_array = nullptr;
cudaMalloc(&t_test_array, sizeof(float) * 28 * 28);
cudaMemcpy(t_test_array, test_array, sizeof(float) * 28 * 28, cudaMemcpyHostToDevice);
float* t_test_y = DNN.model_forward(t_test_array);
float* test_y_softmax = nullptr;
cudaMalloc(&test_y_softmax, sizeof(float) * 10);
softmax(t_test_y, test_y_softmax);
float* h_y_softmax = new float[10];
cudaMemcpy(h_y_softmax, test_y_softmax, sizeof(float) * 10, cudaMemcpyDeviceToHost);
int y_out = max_element(h_y_softmax, h_y_softmax + 10) - h_y_softmax;
cout << "预测值:" << y_out << endl;
return 0;
}
|