最近有个小米的K30 5G手机不用了,解锁BL后,root, 用来测试opencl ,芯片是骁龙765G。
1 首先 从高通平台下载 opencl-sdk, 我下载的是opencl-sdk-1.4.2.zip ,解压后,从里面提取头文件CL目录;
2 在ubuntu20.04上安装android sdk和NDK工具,不需要安装android studio, 但是通过android studio也方便,我们主要是想通过NDK进行C++代码的编译,链接opencl库;
3 获取 libOpenCL 库:由于编译目标是 64 位的 arm64-v8a,需要使用 64 位的 OpenCL 库,不要用 32 位的。库文件可以直接用 adb 从手机上拉取:
adb pull /system/vendor/lib/libOpenCL.so # 这里是32位程序
adb pull /vendor/lib64/libOpenCL.so # 这里是64位程序
4 创建工程,目录如下
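(原文此处为工程目录截图,已丢失;根据下文 CMakeLists.txt 中使用的路径,目录结构大致如下)
├── build.sh
├── CMakeLists.txt
├── hello_world_cl.cpp
├── matvec.cl
├── include/CL/        (OpenCL 头文件)
└── lib/bit64/libOpenCL.so
除了头文件和库文件外,主要有三个文件:build.sh、CMakeLists.txt 和 hello_world_cl.cpp,以及 kernel 文件 matvec.cl。各个文件的内容如下: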
build.sh
#!/bin/bash
# Configures and builds the OpenCL demo for Android (arm64-v8a) with the NDK
# CMake toolchain and Ninja. The build tree is recreated from scratch in
# ./build next to this script on every run.
#
# SDK_PATH and NDK_PATH may be overridden from the environment; the defaults
# are the locations used when this script was written.

# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere in a pipeline, so a failed configure never reaches ninja.
set -euo pipefail

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
# Quoted so that a checkout path containing spaces still works.
cd "${SCRIPT_DIR}"

rm -rf build
mkdir build
cd build

SDK_PATH="${SDK_PATH:-/home/guo/Android/Sdk}"
NDK_PATH="${NDK_PATH:-${SDK_PATH}/ndk/24.0.8215888}"
ANDROID_ABI=arm64-v8a
MINSDKVERSION=29

# Configure
cmake \
    -DCMAKE_BUILD_TYPE=Release \
    -DCMAKE_TOOLCHAIN_FILE="${NDK_PATH}/build/cmake/android.toolchain.cmake" \
    -DANDROID_ABI="${ANDROID_ABI}" \
    -DANDROID_NDK="${NDK_PATH}" \
    -DANDROID_PLATFORM="android-${MINSDKVERSION}" \
    -DCMAKE_ANDROID_ARCH_ABI="${ANDROID_ABI}" \
    -DCMAKE_ANDROID_NDK="${NDK_PATH}" \
    -DCMAKE_MAKE_PROGRAM=/usr/bin/ninja \
    -DCMAKE_SYSTEM_NAME=Android \
    -DCMAKE_SYSTEM_VERSION="${MINSDKVERSION}" \
    -DANDROID_STL=c++_static \
    -GNinja \
    ..

# Build
/usr/bin/ninja
CMakeLists.txt
cmake_minimum_required(VERSION 3.10)
project(cl_main LANGUAGES CXX)

# The demo executable: a single-source OpenCL host program.
add_executable(cl_main
  hello_world_cl.cpp
)

# RUNTIME_OUTPUT_DIRECTORY places the binary in <source>/cl_hello_bin, the same
# location the deprecated EXECUTABLE_OUTPUT_PATH variable selected.
# LINK_FLAGS_RELEASE passes -s to the link step of Release builds only; without
# stripping, the CMake-generated binary is much bigger than the ndk-build one.
set_target_properties(cl_main PROPERTIES
  RUNTIME_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/cl_hello_bin"
  LINK_FLAGS_RELEASE "-s"
)

# OpenCL headers (include/CL/...) shipped alongside the sources.
target_include_directories(cl_main
  PRIVATE
    "${CMAKE_CURRENT_SOURCE_DIR}/include"
)

# Prebuilt 64-bit libOpenCL.so pulled from the device, wrapped in an imported
# target so it is linked by absolute path instead of a directory-wide -L/-l.
add_library(OpenCL::OpenCL SHARED IMPORTED)
set_target_properties(OpenCL::OpenCL PROPERTIES
  IMPORTED_LOCATION "${CMAKE_CURRENT_SOURCE_DIR}/lib/bit64/libOpenCL.so"
)

target_link_libraries(cl_main
  PRIVATE
    OpenCL::OpenCL
)
hello_world_cl.cpp:这是一个计算矩阵与向量相乘的程序
#include <stdio.h>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <vector>
#include <sys/types.h>
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#define PROGRAM_FILE "matvec.cl"
#define KERNEL_FUNC "matvec_mult"
//矩阵乘以向量
int main(int argc, char** argv)
{
cl_platform_id platform;
cl_device_id device;
cl_context context;
cl_command_queue queue;
cl_int err;
cl_program program;
FILE *program_handle;
char *program_buffer, *program_log;
size_t program_size, log_size;
cl_kernel kernel;
size_t work_units_per_kernel;
float mat[16], vec[4], result[4];
float correct[4] = {0.0f, 0.0f, 0.0f, 0.0f};
cl_mem mat_buff, vec_buff, res_buff;
//初始化数据
for (cl_int i = 0; i < 16; ++i) {
mat[i] = i * 2.0f;
}
for (cl_int i = 0; i < 4; ++i) {
vec[i] = i * 3.0f;
correct[0] += mat[i] * vec[i];
correct[1] += mat[i+4] * vec[i];
correct[2] += mat[i+8] * vec[i];
correct[3] += mat[i+12] * vec[i];
}
clGetPlatformIDs(1, &platform, NULL);
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
context = clCreateContext(NULL, 1,&device, NULL, NULL,&err);
program_handle = fopen(PROGRAM_FILE, "r");
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_buffer = (char *) malloc(program_size+1);
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);
fclose(program_handle);
program = clCreateProgramWithSource(context,1,(const char **)&program_buffer,&program_size,&err);
free(program_buffer);
clBuildProgram(program,0,NULL,NULL,NULL,NULL);
kernel = clCreateKernel(program, KERNEL_FUNC, &err);
queue = clCreateCommandQueue(context,device,0, &err);
mat_buff = clCreateBuffer(context,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(float)*16, mat,&err);
vec_buff = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(float)*4,vec,&err);
res_buff = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float)*4, NULL, &err);
clSetKernelArg(kernel, 0, sizeof(cl_mem), &mat_buff);
clSetKernelArg(kernel, 1, sizeof(cl_mem),&vec_buff);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &res_buff);
work_units_per_kernel = 4;
clEnqueueNDRangeKernel(queue,kernel,1,NULL,&work_units_per_kernel,
NULL,0,NULL,NULL);
clEnqueueReadBuffer(queue,res_buff,CL_TRUE,0, sizeof(float)*4,result,
0,NULL,NULL);
if((result[0] == correct[0])&&(result[1] == correct[1])&&(result[2] == correct[2])&&(result[3] == correct[3])){
printf("Matrix-vector multiplication successful.\n");
} else{
printf("Matrix-vector multiplication unsuccessful.\n");
}
clReleaseMemObject(mat_buff);
clReleaseMemObject(vec_buff);
clReleaseMemObject(res_buff);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);
return 0;
}
kernel 文件 matvec.cl
// One work-item per output element: work-item `row` takes the float4 at
// matrix[row], forms its dot product with the first float4 of `vector`,
// and stores the scalar in result[row].
kernel void matvec_mult(global float4* matrix,
                        global float4* vector,
                        global float* result)
{
    const int row = get_global_id(0);
    const float4 lhs = matrix[row];
    const float4 rhs = *vector;
    result[row] = dot(lhs, rhs);
}
把编译后的文件 cl_main、libOpenCL.so 和 matvec.cl push 到手机的 /data/armtest/helloworld-opencl 目录下,
执行 chmod +x cl_main,然后在该目录下运行(程序通过相对路径打开 matvec.cl,因此需要先 cd 到这个目录)。
运行成功!