前言:AlexeyAB 版 darknet 出现后,与 pjreddie 版相比,训练与推理速度有了一定的提升,且多了很多功能,如自动绘制训练过程的 loss 曲线图、自动生成 anchor 值等,但是精度有一些损失,在实际使用过程中可以权衡利弊。
两个版本的安装都比较容易:目录下已经有 Makefile 文件,在目录下打开终端即可直接编译 darknet。但由于 pjreddie 版长时间未更新,随着 CUDA 与 cuDNN 版本的升级,兼容性出现了一些问题:当 cuDNN 版本为 8.0 及以上时,会出现找不到 cudnn 的问题。本文给出了适配 cuDNN 8.0 及以上版本的解决方案。
使用以下代码替换原先 src/convolutional_layer.c 文件中的 cudnn_convolutional_setup 函数,再执行编译。
/* Upper bound, in bytes, on the workspace a selected algorithm may require.
 * Guarded so an existing definition elsewhere in the file takes precedence. */
#ifndef MEMORY_LIMIT
#define MEMORY_LIMIT 2000000000
#endif

/* Set to a value > 0 to print every candidate returned by the cudnnFind*
 * searches (cuDNN >= 8 path only). */
#ifndef PRINT_CUDNN_ALGO
#define PRINT_CUDNN_ALGO 0
#endif

/*
 * Fill in the cuDNN descriptors of layer `l` and store the chosen forward,
 * backward-data and backward-filter algorithms in l->fw_algo, l->bd_algo and
 * l->bf_algo.
 *
 * The code adapts to the cuDNN version at compile time:
 *   - CUDNN_MAJOR >= 6: cudnnSetConvolution2dDescriptor takes a compute type.
 *   - CUDNN_MAJOR >= 7: the group count is set on the descriptor; older
 *     versions call error() when l->groups > 1.
 *   - CUDNN_MAJOR >= 8: algorithms are picked from the cudnnFind* result
 *     lists; older versions use the cudnnGet* calls with a workspace limit.
 *
 * On the cuDNN >= 8 path the first candidate that reports
 * CUDNN_STATUS_SUCCESS and needs less than MEMORY_LIMIT bytes is taken. If no
 * candidate qualifies (or the search itself fails) the corresponding
 * l->*_algo field is left unchanged.
 */
void cudnn_convolutional_setup(layer *l)
{
    /* Tensor descriptors: inputs use (batch, c, h, w), outputs use
     * (batch, out_c, out_h, out_w); the "d"-prefixed ones mirror them. */
    cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
    cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);

    cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
    cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
    cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1);

    /* Filter descriptors: n filters of (c / groups) x size x size. */
    cudnnSetFilter4dDescriptor(l->dweightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c/l->groups, l->size, l->size);
    cudnnSetFilter4dDescriptor(l->weightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c/l->groups, l->size, l->size);

#if CUDNN_MAJOR >= 6
    cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);
#else
    cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION);
#endif

#if CUDNN_MAJOR >= 7
    cudnnSetConvolutionGroupCount(l->convDesc, l->groups);
#else
    if(l->groups > 1){
        error("CUDNN < 7 doesn't support groups, please upgrade!");
    }
#endif

#if CUDNN_MAJOR >= 8
    /* Reset before every search so a failed call cannot leave a stale or
     * indeterminate count for the loop that follows. */
    int returnedAlgoCount = 0;
    cudnnConvolutionFwdAlgoPerf_t       fw_results[2 * CUDNN_CONVOLUTION_FWD_ALGO_COUNT];
    cudnnConvolutionBwdDataAlgoPerf_t   bd_results[2 * CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT];
    cudnnConvolutionBwdFilterAlgoPerf_t bf_results[2 * CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT];

    /* ---- forward ---- */
    cudnnFindConvolutionForwardAlgorithm(cudnn_handle(),
            l->srcTensorDesc,
            l->weightDesc,
            l->convDesc,
            l->dstTensorDesc,
            CUDNN_CONVOLUTION_FWD_ALGO_COUNT,
            &returnedAlgoCount,
            fw_results);
    for(int algoIndex = 0; algoIndex < returnedAlgoCount; ++algoIndex){
#if PRINT_CUDNN_ALGO > 0
        printf("^^^^ %s for Algo %d: %f time requiring %llu memory\n",
                cudnnGetErrorString(fw_results[algoIndex].status),
                fw_results[algoIndex].algo, fw_results[algoIndex].time,
                (unsigned long long)fw_results[algoIndex].memory);
#endif
        /* Skip candidates that failed; their memory field is not a valid
         * basis for selection. */
        if( fw_results[algoIndex].status == CUDNN_STATUS_SUCCESS
                && fw_results[algoIndex].memory < MEMORY_LIMIT ){
            l->fw_algo = fw_results[algoIndex].algo;
            break;
        }
    }

    /* ---- backward data ---- */
    returnedAlgoCount = 0;
    cudnnFindConvolutionBackwardDataAlgorithm(cudnn_handle(),
            l->weightDesc,
            l->ddstTensorDesc,
            l->convDesc,
            l->dsrcTensorDesc,
            CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT,
            &returnedAlgoCount,
            bd_results);
    for(int algoIndex = 0; algoIndex < returnedAlgoCount; ++algoIndex){
#if PRINT_CUDNN_ALGO > 0
        printf("^^^^ %s for Algo %d: %f time requiring %llu memory\n",
                cudnnGetErrorString(bd_results[algoIndex].status),
                bd_results[algoIndex].algo, bd_results[algoIndex].time,
                (unsigned long long)bd_results[algoIndex].memory);
#endif
        if( bd_results[algoIndex].status == CUDNN_STATUS_SUCCESS
                && bd_results[algoIndex].memory < MEMORY_LIMIT ){
            l->bd_algo = bd_results[algoIndex].algo;
            break;
        }
    }

    /* ---- backward filter ---- */
    returnedAlgoCount = 0;
    cudnnFindConvolutionBackwardFilterAlgorithm(cudnn_handle(),
            l->srcTensorDesc,
            l->ddstTensorDesc,
            l->convDesc,
            l->dweightDesc,
            CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT,
            &returnedAlgoCount,
            bf_results);
    for(int algoIndex = 0; algoIndex < returnedAlgoCount; ++algoIndex){
#if PRINT_CUDNN_ALGO > 0
        printf("^^^^ %s for Algo %d: %f time requiring %llu memory\n",
                cudnnGetErrorString(bf_results[algoIndex].status),
                bf_results[algoIndex].algo, bf_results[algoIndex].time,
                (unsigned long long)bf_results[algoIndex].memory);
#endif
        if( bf_results[algoIndex].status == CUDNN_STATUS_SUCCESS
                && bf_results[algoIndex].memory < MEMORY_LIMIT ){
            l->bf_algo = bf_results[algoIndex].algo;
            break;
        }
    }
#else
    /* Pre-cuDNN-8 path: let cuDNN choose under the same workspace limit. */
    cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
            l->srcTensorDesc,
            l->weightDesc,
            l->convDesc,
            l->dstTensorDesc,
            CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
            MEMORY_LIMIT,
            &l->fw_algo);
    cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
            l->weightDesc,
            l->ddstTensorDesc,
            l->convDesc,
            l->dsrcTensorDesc,
            CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
            MEMORY_LIMIT,
            &l->bd_algo);
    cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
            l->srcTensorDesc,
            l->ddstTensorDesc,
            l->convDesc,
            l->dweightDesc,
            CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
            MEMORY_LIMIT,
            &l->bf_algo);
#endif
}
|