From: @yeyunpeng2020 Reviewed-by: @zhanghaibo5,@ddwsky Signed-off-by: @ddwskypull/15622/MERGE
| @@ -29,6 +29,8 @@ class OpenCLExecutor : public Executor { | |||
| public: | |||
| OpenCLExecutor() : Executor() { allocator_ = ocl_runtime.GetInstance()->GetAllocator(); } | |||
| ~OpenCLExecutor() override = default; | |||
| int Prepare(const std::vector<kernel::LiteKernel *> &kernels) override { return RET_OK; } | |||
| int Run(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, | |||
| @@ -56,6 +56,7 @@ class RegistryInferShape { | |||
| RegistryInferShape(int prim_type, InferShape func) { | |||
| InferManager::GetInstance()->InsertInferShapeFunc(prim_type, func); | |||
| } | |||
| ~RegistryInferShape() = default; | |||
| }; | |||
| #define REG_INFER_SHAPE(op, prim_type, func) static RegistryInferShape g_##op##InferShape(prim_type, func); | |||
| @@ -3,7 +3,7 @@ | |||
| #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) | |||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | |||
| __kernel void MatMul_2d(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight, | |||
| int4 in_shape, int4 out_shape) { | |||
| __read_only image2d_t bias, int4 in_shape, int4 out_shape) { | |||
| int gidx = get_global_id(0); // CO4 | |||
| int gidz = get_global_id(2); // N | |||
| int lidx = get_local_id(0); | |||
| @@ -28,12 +28,13 @@ __kernel void MatMul_2d(__read_only image2d_t input, __write_only image2d_t outp | |||
| result += temp[lidx][1]; | |||
| result += temp[lidx][2]; | |||
| result += temp[lidx][3]; | |||
| result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); | |||
| WRITE_IMAGE(output, (int2)(gidx, gidz), result); | |||
| } | |||
| } | |||
| __kernel void MatMul_4d(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight, | |||
| int4 in_shape, int4 out_shape) { | |||
| __read_only image2d_t bias, int4 in_shape, int4 out_shape) { | |||
| int gidx = get_global_id(0); // CO4 | |||
| int gidy = get_global_id(1); // N * H * 4 | |||
| int gidz = get_global_id(2); // W | |||
| @@ -62,12 +63,14 @@ __kernel void MatMul_4d(__read_only image2d_t input, __write_only image2d_t outp | |||
| result += temp[lidx][1]; | |||
| result += temp[lidx][2]; | |||
| result += temp[lidx][3]; | |||
| result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); | |||
| WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, nh_index), result); | |||
| } | |||
| } | |||
| __kernel void MatMulActWeightTransposeB_4d(__read_only image2d_t input, __write_only image2d_t output, | |||
| __read_only image2d_t weight, int4 in_shape, int4 out_shape) { | |||
| __read_only image2d_t weight, __read_only image2d_t bias, int4 in_shape, | |||
| int4 out_shape) { | |||
| int gidx = get_global_id(0); // CO4 | |||
| int gidy = get_global_id(1); // N * H * 4 | |||
| int gidz = get_global_id(2); // W | |||
| @@ -99,12 +102,14 @@ __kernel void MatMulActWeightTransposeB_4d(__read_only image2d_t input, __write_ | |||
| result += temp[lidx][1]; | |||
| result += temp[lidx][2]; | |||
| result += temp[lidx][3]; | |||
| result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); | |||
| WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, nh_index), result); | |||
| } | |||
| } | |||
| __kernel void MatMulActWeight_4d(__read_only image2d_t input, __write_only image2d_t output, | |||
| __read_only image2d_t weight, int4 in_shape, int4 out_shape) { | |||
| __read_only image2d_t weight, __read_only image2d_t bias, int4 in_shape, | |||
| int4 out_shape) { | |||
| int gidx = get_global_id(0); // CO4 | |||
| int gidy = get_global_id(1); // N * H * 4 | |||
| int gidz = get_global_id(2); // W | |||
| @@ -136,6 +141,7 @@ __kernel void MatMulActWeight_4d(__read_only image2d_t input, __write_only image | |||
| result += temp[lidx][1]; | |||
| result += temp[lidx][2]; | |||
| result += temp[lidx][3]; | |||
| result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); | |||
| WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, nh_index), result); | |||
| } | |||
| } | |||
| @@ -49,7 +49,7 @@ bool IsUseStrassenMatmul(const std::vector<lite::Tensor *> &in_tensors_) { | |||
| } | |||
| int MatMulOpenCLKernel::CheckSpecs() { | |||
| if (in_tensors_.size() != 2 || out_tensors_.size() != 1) { | |||
| if (!(in_tensors_.size() == 2 || in_tensors_.size() == 3) || out_tensors_.size() != 1) { | |||
| MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size(); | |||
| return RET_ERROR; | |||
| } | |||
| @@ -180,6 +180,40 @@ int MatMulOpenCLKernel::InitWeights() { | |||
| } | |||
| allocator->UnmapBuffer(padWeight_); | |||
| return InitBias(); | |||
| } | |||
| int MatMulOpenCLKernel::InitBias() { | |||
| // pad FC Bias | |||
| CO_ = GpuTensorInfo(out_tensors_[0]).C; | |||
| auto allocator = ocl_runtime_->GetAllocator(); | |||
| int co4 = UP_DIV(CO_, C4NUM); | |||
| size_t dtype_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float); | |||
| size_t im_dst_x, im_dst_y; | |||
| im_dst_x = co4; | |||
| im_dst_y = 1; | |||
| size_t img_dtype = CL_FLOAT; | |||
| if (enable_fp16_) { | |||
| img_dtype = CL_HALF_FLOAT; | |||
| } | |||
| lite::opencl::ImageSize img_size{im_dst_x, im_dst_y, img_dtype}; | |||
| bias_ = allocator->Malloc(img_size); | |||
| bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true); | |||
| memset(bias_, 0x00, co4 * C4NUM * dtype_size); | |||
| if (in_tensors_.size() == 3) { | |||
| if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) { | |||
| for (int i = 0; i < CO_; i++) { | |||
| reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->data_c())[i]; | |||
| } | |||
| } else if (in_tensors_[2]->data_type() == kNumberTypeFloat16 && !enable_fp16_) { | |||
| for (int i = 0; i < CO_; i++) { | |||
| reinterpret_cast<float *>(bias_)[i] = reinterpret_cast<float16_t *>(in_tensors_[2]->data_c())[i]; | |||
| } | |||
| } else { | |||
| memcpy(bias_, in_tensors_[2]->data_c(), CO_ * dtype_size); | |||
| } | |||
| } | |||
| allocator->UnmapBuffer(bias_); | |||
| return RET_OK; | |||
| } | |||
| @@ -202,6 +236,7 @@ void MatMulOpenCLKernel::SetConstArgs() { | |||
| } else { | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF); | |||
| } | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape); | |||
| } | |||
| @@ -37,6 +37,7 @@ class MatMulOpenCLKernel : public OpenCLKernel { | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| int Tune() override { return lite::RET_OK; } | |||
| int InitBias(); | |||
| protected: | |||
| void *padWeight_{nullptr}; | |||
| @@ -44,6 +45,8 @@ class MatMulOpenCLKernel : public OpenCLKernel { | |||
| bool transposeA{false}; | |||
| bool transposeB{true}; | |||
| int dims{}; | |||
| void *bias_{nullptr}; | |||
| int CO_{1}; | |||
| static constexpr int MAX_DIMS{4}; // max supported matmul dims | |||
| bool act_weight_{false}; | |||
| std::vector<int> inShape{std::vector<int>(MAX_DIMS, 1)}; | |||
| @@ -405,6 +405,12 @@ kernel::LiteKernel *Scheduler::FindGpuKernel(const std::vector<Tensor *> &in_ten | |||
| return nullptr; | |||
| } | |||
| // we don't need to restore tensor for copy data | |||
| ret = CopyConstTensorData(in_tensors, op_parameter->type_); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(DEBUG) << "CopyConstTensorsData failed: " << ret; | |||
| return nullptr; | |||
| } | |||
| auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, context_, gpu_desc, op_parameter); | |||
| if (kernel != nullptr) { | |||
| MS_LOG(DEBUG) << "Get gpu op success: " << PrimitiveCurVersionTypeName(gpu_desc.type); | |||
| @@ -41,4 +41,19 @@ Q_new_detect.tflite | |||
| Q_object_scene.tflite | |||
| Q_pose.tflite | |||
| matmul.pb | |||
| add_uint8.tflite;2 | |||
| add_uint8.tflite;0.5;2 | |||
| mtk_face_features_v3.onnx | |||
| hdc_Face_Landmark5_MTI_Aesthetic.onnx | |||
| inception_v3.pb;0.5;1;1,299,299,3 | |||
| mobilenet_v1_0.25_128_frozen.pb;0.5;1;1,128,128,3 | |||
| mobilenet_v2_1.0_224_frozen.pb;0.5;1;1,224,224,3 | |||
| ml_face_openclose.pb;0.5;1;1,32,32,3 | |||
| hiai_AADB_HADB_MBV2_model.pb;0.5;1;1,224,224,3 | |||
| hiai_model_0909_kd_rot_ps_softmax.pb;0.5;1;1,224,224,3 | |||
| model_normalize_object_scene_ps_20200519.pb;0.5;1;1,224,224,3 | |||
| mtk_AADB_HADB_MBV2_model.pb;0.5;1;1,224,224,3 | |||
| mtk_AADB_HADB_MBV3_model.pb;0.5;1;1,224,224,3 | |||
| mtk_model_face_dress.pb;0.5;1;1,128,128,3 | |||
| hiai_model_normalize_object_scene_ps_20200519.pb;0.5;1;1,224,224,3 | |||
| hiai_label_and_video.pb;0.5;1;1,224,224,3 | |||
| tinyyolov2-8.onnx;0.5;1;1,416,416,3 | |||
| @@ -2109,11 +2109,13 @@ function Run_gpu() { | |||
| continue | |||
| fi | |||
| model_name=`echo ${line} | awk -F ';' '{print $1}'` | |||
| input_num=`echo ${line} | awk -F ';' '{print $2}'` | |||
| accuracy_limit=`echo ${line} | awk -F ';' '{print $2}'` | |||
| input_num=`echo ${line} | awk -F ';' '{print $3}'` | |||
| input_shapes=`echo ${line} | awk -F ';' '{print $4}'` | |||
| input_files="" | |||
| data_path="/data/local/tmp/input_output/" | |||
| output_file=${data_path}'output/'${model_name}'.ms.out' | |||
| if [[ ${input_num} == "" ]]; then | |||
| if [[ ${input_num} == "" || ${input_num} == 1 ]]; then | |||
| input_files=/data/local/tmp/input_output/input/${model_name}.ms.bin | |||
| else | |||
| for i in $(seq 1 $input_num) | |||
| @@ -2123,8 +2125,15 @@ function Run_gpu() { | |||
| fi | |||
| echo ${model_name} >> "${run_gpu_log_file}" | |||
| echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt | |||
| echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> "${run_gpu_log_file}" | |||
| echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> adb_run_cmd.txt | |||
| if [[ $input_shapes == "" ]]; then | |||
| echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> "${run_gpu_log_file}" | |||
| echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> adb_run_cmd.txt | |||
| else | |||
| echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --inputShapes='${input_shapes}' --accuracyThreshold='${accuracy_limit}' --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> "${run_gpu_log_file}" | |||
| echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --inputShapes='${input_shapes}' --accuracyThreshold='${accuracy_limit}' --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> adb_run_cmd.txt | |||
| fi | |||
| adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_gpu_log_file}" | |||
| if [ $? = 0 ]; then | |||
| run_result='arm64_gpu: '${model_name}' pass'; echo ${run_result} >> ${run_benchmark_result_file} | |||