From 8ffe5a67b8db1e6c058657305a0a4e201d2cbcea Mon Sep 17 00:00:00 2001 From: wandongdong Date: Thu, 24 Dec 2020 22:38:51 -0800 Subject: [PATCH] fix quant abort and pow off bugs and support java call opencl --- build.sh | 8 +- cmake/external_libs/opencl.cmake | 4 +- .../app/src/main/native/runtime/ms_config.cpp | 7 +- .../kernel/opencl/kernel/depthwise_conv2d.cc | 78 ++++++++++--------- .../kernel/opencl/kernel/depthwise_conv2d.h | 6 +- .../runtime/kernel/opencl/opencl_kernel.cc | 7 +- .../src/runtime/kernel/opencl/opencl_kernel.h | 1 + .../src/runtime/opencl/opencl_allocator.cc | 2 + 8 files changed, 67 insertions(+), 46 deletions(-) diff --git a/build.sh b/build.sh index 1acb973039..8ad4845092 100755 --- a/build.sh +++ b/build.sh @@ -508,7 +508,8 @@ build_lite() LITE_ENABLE_NPU="on" fi - if [ "${ENABLE_GPU}" == "on" ] && [ "${LITE_PLATFORM}" == "arm64" ] || [ $1 == "arm64" ]; then + if [[ "${LITE_ENABLE_GPU}" == "on" || $1 == "arm64" ]]; then + LITE_ENABLE_GPU="on" echo "start get opencl" fi if [ "${LITE_ENABLE_NPU}" == "on" ]; then @@ -545,7 +546,7 @@ build_lite() -DANDROID_STL=${ANDROID_STL} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ -DPLATFORM_ARM32=on -DENABLE_NEON=on -DSUPPORT_TRAIN=${SUPPORT_TRAIN} \ -DENABLE_TOOLS=${ENABLE_TOOLS} -DENABLE_CONVERTER=${ENABLE_CONVERTER} -DBUILD_TESTCASES=${RUN_TESTCASES} \ - -DSUPPORT_GPU=${ENABLE_GPU} -DSUPPORT_NPU=${ENABLE_NPU} -DENABLE_V0=on \ + -DSUPPORT_GPU=${LITE_ENABLE_GPU} -DSUPPORT_NPU=${ENABLE_NPU} -DENABLE_V0=on \ -DOFFLINE_COMPILE=${OPENCL_OFFLINE_COMPILE} -DBUILD_MINDDATA=${COMPILE_MINDDATA_LITE} \ -DCMAKE_INSTALL_PREFIX=${BASEPATH}/output/tmp -DMS_VERSION_MAJOR=${VERSION_MAJOR} \ -DMS_VERSION_MINOR=${VERSION_MINOR} -DMS_VERSION_REVISION=${VERSION_REVISION} -DENABLE_VERBOSE=${ENABLE_VERBOSE} \ @@ -553,7 +554,7 @@ build_lite() else cmake -DPLATFORM_ARM64=off -DSUPPORT_TRAIN=${SUPPORT_TRAIN} \ -DENABLE_TOOLS=${ENABLE_TOOLS} -DENABLE_CONVERTER=${ENABLE_CONVERTER} -DBUILD_TESTCASES=${RUN_TESTCASES} \ - -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DSUPPORT_GPU=${ENABLE_GPU} -DSUPPORT_NPU=${ENABLE_NPU} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DSUPPORT_GPU=${LITE_ENABLE_GPU} -DSUPPORT_NPU=${ENABLE_NPU} \ -DBUILD_MINDDATA=${COMPILE_MINDDATA_LITE} -DENABLE_V0=on \ -DOFFLINE_COMPILE=${OPENCL_OFFLINE_COMPILE} -DCMAKE_INSTALL_PREFIX=${BASEPATH}/output/tmp \ -DMS_VERSION_MAJOR=${VERSION_MAJOR} -DMS_VERSION_MINOR=${VERSION_MINOR} -DMS_VERSION_REVISION=${VERSION_REVISION} \ @@ -647,6 +648,7 @@ build_jni_arm32() { build_java() { JAVA_PATH=${BASEPATH}/mindspore/lite/java + LITE_ENABLE_GPU="on" get_version build_lite_java_arm64 build_lite_java_arm32 diff --git a/cmake/external_libs/opencl.cmake b/cmake/external_libs/opencl.cmake index 90ad02c314..4737f14478 100644 --- a/cmake/external_libs/opencl.cmake +++ b/cmake/external_libs/opencl.cmake @@ -1,10 +1,10 @@ if (ENABLE_GITEE) set(REQ_URL "https://gitee.com/mirrors/OpenCL-Headers/repository/archive/v2020.06.16.tar.gz") - set(MD5 "fc7627b5a8a95ecbe3d5df43bc88aa44") + set(MD5 "8797a525aff953ea536ebe338a9f5ef6") set(PKG_GIT_TAG "") __download_pkg_with_git(OpenCL-Headers ${REQ_URL} ${PKG_GIT_TAG} ${MD5}) set(REQ_URL "https://gitee.com/mirrors/OpenCL-CLHPP/repository/archive/v2.0.12.tar.gz") - set(MD5 "bd00fca8f861b3b65660d719f00a58dd") + set(MD5 "a07b45d676b02644482bc2c3bb90b891") set(PKG_GIT_TAG "") __download_pkg_with_git(OpenCL-CLHPP ${REQ_URL} ${PKG_GIT_TAG} ${MD5}) else() diff --git a/mindspore/lite/java/java/app/src/main/native/runtime/ms_config.cpp b/mindspore/lite/java/java/app/src/main/native/runtime/ms_config.cpp index ca48c731a9..b4bf610665 100644 --- a/mindspore/lite/java/java/app/src/main/native/runtime/ms_config.cpp +++ b/mindspore/lite/java/java/app/src/main/native/runtime/ms_config.cpp @@ -32,9 +32,12 @@ extern "C" JNIEXPORT jlong JNICALL Java_com_mindspore_lite_config_MSConfig_creat context->device_list_[0].device_type_ = mindspore::lite::DT_CPU; break; case 1: // DT_GPU - MS_LOGE("We only support CPU now."); - return (jlong)context; + { + mindspore::lite::DeviceContext gpu_device_ctx{mindspore::lite::DT_GPU, {false}}; + gpu_device_ctx.device_info_.gpu_device_info_.enable_float16_ = enable_float16; + context->device_list_.push_back(gpu_device_ctx); break; + } case 2: // DT_NPU MS_LOGE("We only support CPU now."); return (jlong)context; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc index 4b913ab3f2..ab236d4483 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc @@ -51,6 +51,14 @@ int DepthwiseConv2dOpenCLKernel::CheckSpecs() { MS_LOG(ERROR) << "Unsupported data type " << in_tensors_[0]->data_type(); return RET_ERROR; } + if (!in_tensors_.at(kWeightIndex)->IsConst()) { + MS_LOG(ERROR) << "DepthwiseConv2d don't support non-constant weight yet."; + return RET_ERROR; + } + if (in_tensors_.size() == 3 && !in_tensors_.at(kBiasIndex)->IsConst()) { + MS_LOG(ERROR) << "DepthwiseConv2d don't support non-constant bias yet."; + return RET_ERROR; + } return RET_OK; } int DepthwiseConv2dOpenCLKernel::Prepare() { @@ -62,13 +70,10 @@ int DepthwiseConv2dOpenCLKernel::Prepare() { } kernel_name += "_NHWC4"; auto parameter = reinterpret_cast(op_parameter_); - if (parameter->kernel_h_ == 1) { + if (parameter->kernel_h_ == 1 && parameter->kernel_w_ == 1) { kernel_name += "_1x1"; } - kernel_name += "_b"; - for (auto iv : block_size_) { - kernel_name += std::to_string(iv); - } + kernel_name += "_b" + std::to_string(block_size_.H) + std::to_string(block_size_.W) + std::to_string(block_size_.C); #ifdef PROGRAM_WITH_IL kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); #else @@ -100,9 +105,10 @@ int DepthwiseConv2dOpenCLKernel::InitWeights() { auto allocator = ocl_runtime_->GetAllocator(); bool is_fp16 = ocl_runtime_->GetFp16Enable(); + auto out_info = GpuTensorInfo(out_tensors_[0]); // weight: o, h, w, i; o == group, i == 1 void *origin_weight = in_tensors_.at(kWeightIndex)->data_c(); - int CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM); + int CO4 = UP_DIV(out_info.C, C4NUM * block_size_.C); int pack_weight_size = C4NUM * CO4 * parameter->kernel_h_ * parameter->kernel_w_; int plane = parameter->kernel_h_ * parameter->kernel_w_; @@ -111,13 +117,13 @@ int DepthwiseConv2dOpenCLKernel::InitWeights() { packed_weight_ = allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true); if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16) { std::function to_dtype = [](int16_t x) -> int16_t { return x; }; - PackNCHWToNC4HW4(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel(), to_dtype); + PackNCHWToNC4HW4(origin_weight, packed_weight_, 1, plane, out_info.C, to_dtype); } else if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat32) { std::function to_dtype = [](float x) -> float16_t { return static_cast(x); }; - PackNCHWToNC4HW4(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel(), to_dtype); + PackNCHWToNC4HW4(origin_weight, packed_weight_, 1, plane, out_info.C, to_dtype); } else { // int8 or int16 std::function to_dtype = [](int16_t x) -> int16_t { return x; }; - PackNCHWToNC4HW4(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel(), to_dtype); + PackNCHWToNC4HW4(origin_weight, packed_weight_, 1, plane, out_info.C, to_dtype); FreeDequantedWeight(); } } else { @@ -125,51 +131,53 @@ int DepthwiseConv2dOpenCLKernel::InitWeights() { packed_weight_ = allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true); if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat32) { std::function to_dtype = [](float x) -> float { return x; }; - PackNCHWToNC4HW4(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel(), to_dtype); + PackNCHWToNC4HW4(origin_weight, packed_weight_, 1, plane, out_info.C, to_dtype); } else if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16) { std::function to_dtype = [](float16_t x) -> float { return static_cast(x); }; - PackNCHWToNC4HW4(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel(), to_dtype); + PackNCHWToNC4HW4(origin_weight, packed_weight_, 1, plane, out_info.C, to_dtype); } else { // int8 or int16 std::function to_dtype = [](float x) -> float { return x; }; - PackNCHWToNC4HW4(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel(), to_dtype); + PackNCHWToNC4HW4(origin_weight, packed_weight_, 1, plane, out_info.C, to_dtype); FreeDequantedWeight(); } } - allocator->UnmapBuffer(packed_weight_); + size_t dtype_size = sizeof(float); + if (is_fp16 && in_tensors_.at(kBiasIndex)->data_type() == kNumberTypeFloat16) { + dtype_size = sizeof(int16_t); + } + bias_data_ = allocator->Malloc(C4NUM * CO4 * dtype_size); + bias_data_ = allocator->MapBuffer(bias_data_, CL_MAP_WRITE, nullptr, true); + size_t up_co_size = C4NUM * CO4 * dtype_size; + memset(bias_data_, 0, up_co_size); if (in_tensors_.size() == kInputSize2) { - if (!in_tensors_.at(2)->IsConst()) { - MS_LOG(ERROR) << "DepthwiseConv2d don't support non-constant bias yet."; - return RET_ERROR; - } - size_t dtype_size = sizeof(float); - if (is_fp16 && in_tensors_.at(kBiasIndex)->data_type() == kNumberTypeFloat16) { - dtype_size = sizeof(int16_t); - } - bias_data_ = allocator->Malloc(C4NUM * CO4 * dtype_size); - bias_data_ = allocator->MapBuffer(bias_data_, CL_MAP_WRITE, nullptr, true); - size_t up_co_size = C4NUM * CO4 * dtype_size; - memset(bias_data_, 0, up_co_size); auto ori_bias = in_tensors_.at(kBiasIndex)->data_c(); if (is_fp16 && in_tensors_.at(kBiasIndex)->data_type() == kNumberTypeFloat32) { float16_t *bias_ptr = static_cast(bias_data_); for (size_t i = 0; i < in_tensors_.at(kBiasIndex)->ElementsNum(); ++i) { bias_ptr[i] = static_cast(static_cast(ori_bias)[i]); } + } else if (!is_fp16 && in_tensors_.at(kBiasIndex)->data_type() == kNumberTypeFloat16) { + float32_t *bias_ptr = static_cast(bias_data_); + for (size_t i = 0; i < in_tensors_.at(kBiasIndex)->ElementsNum(); ++i) { + bias_ptr[i] = static_cast(static_cast(ori_bias)[i]); + } } else { - memcpy(bias_data_, ori_bias, out_tensors_[0]->Channel() * dtype_size); + memcpy(bias_data_, ori_bias, out_info.C * dtype_size); } - allocator->UnmapBuffer(bias_data_); } else { MS_ASSERT(in_tensors_.size() == kInputSize1); } + allocator->UnmapBuffer(bias_data_); return mindspore::lite::RET_OK; } void DepthwiseConv2dOpenCLKernel::SetConstArgs() { auto parameter = reinterpret_cast(op_parameter_); - size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM); - size_t CI4 = UP_DIV(in_tensors_[0]->Channel(), C4NUM); + auto in_info = GpuTensorInfo(in_tensors_[0]); + auto out_info = GpuTensorInfo(out_tensors_[0]); + size_t CO4 = UP_DIV(out_info.C, C4NUM); + size_t CI4 = UP_DIV(in_info.C, C4NUM); std::map> relu_clips{ {ActType_No, {-FLT_MAX, FLT_MAX}}, {ActType_Relu, {0.0, FLT_MAX}}, {ActType_Relu6, {0, 6.0}}}; @@ -177,9 +185,8 @@ void DepthwiseConv2dOpenCLKernel::SetConstArgs() { cl_int2 stride = {parameter->stride_h_, parameter->stride_w_}; cl_int2 padding = {-parameter->pad_u_, -parameter->pad_l_}; cl_int2 dilation = {parameter->dilation_h_, parameter->dilation_w_}; - cl_int4 src_size = {in_tensors_[0]->Width(), in_tensors_[0]->Height(), (cl_int)CI4, in_tensors_[0]->Batch()}; - cl_int4 dst_size = {(cl_int)out_tensors_[0]->Width(), (cl_int)out_tensors_[0]->Height(), (cl_int)CO4, - (cl_int)out_tensors_[0]->Batch()}; + cl_int4 src_size = {(cl_int)in_info.W, (cl_int)in_info.H, (cl_int)CI4, (cl_int)in_info.N}; + cl_int4 dst_size = {(cl_int)out_info.W, (cl_int)out_info.H, (cl_int)CO4, (cl_int)out_info.N}; int arg_cnt = 2; ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, packed_weight_, lite::opencl::MemType::BUF); @@ -194,10 +201,11 @@ void DepthwiseConv2dOpenCLKernel::SetConstArgs() { ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].second); } void DepthwiseConv2dOpenCLKernel::SetGlobalLocal() { + auto out_info = GpuTensorInfo(out_tensors_[0]); // set global - size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM * block_size_[2]); - global_size_ = {CO4, (size_t)UP_DIV(out_tensors_[0]->Width(), block_size_[1]), - (size_t)UP_DIV(out_tensors_[0]->Height() * out_tensors_[0]->Batch(), block_size_[0])}; + size_t CO4 = UP_DIV(out_info.C, C4NUM * block_size_.C); + global_size_ = {CO4, (size_t)UP_DIV(out_info.W, block_size_.W), + (size_t)UP_DIV(out_info.H * out_info.N, block_size_.H)}; // set local const int max_group_size = ocl_runtime_->DeviceMaxWorkGroupSize(); int z = global_size_[0]; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h index 17bafbd094..fbaccaf535 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h @@ -42,7 +42,11 @@ class DepthwiseConv2dOpenCLKernel : public OpenCLKernel { private: void *packed_weight_{nullptr}; void *bias_data_{nullptr}; - std::vector block_size_{2, 2, 1}; + struct { + int H{2}; + int W{2}; + int C{1}; + } block_size_; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.cc b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.cc index f18f1e2b16..176cba173c 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.cc @@ -203,9 +203,9 @@ std::set OpenCLKernel::GenerateLocalByGlobal(size_t global_i) { int OpenCLKernel::DequantWeight() { bool is_fp16 = ocl_runtime_->GetFp16Enable(); auto *weight_tensor = in_tensors_.at(kWeightIndex); - auto *restore_data = weight_tensor->data_c(); - dequant_flag_ = - !weight_tensor->quant_params().empty() && weight_tensor->quant_params().front().inited && restore_data != nullptr; + restore_quant_data_ = weight_tensor->data_c(); + dequant_flag_ = !weight_tensor->quant_params().empty() && weight_tensor->quant_params().front().inited && + restore_quant_data_ != nullptr; if (dequant_flag_) { void *dequant_weight{nullptr}; bool set_flag{true}; @@ -242,6 +242,7 @@ void OpenCLKernel::FreeDequantedWeight() { auto *weight_tensor = in_tensors_.at(kWeightIndex); if (dequant_flag_) { free(weight_tensor->data_c()); + weight_tensor->set_data(restore_quant_data_); } } } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h index ee9d927676..b822616c88 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h @@ -209,6 +209,7 @@ class OpenCLKernel : public LiteKernel { std::vector local_size_; cl::Kernel kernel_; cl::Event event_; + void *restore_quant_data_{nullptr}; bool dequant_flag_{false}; private: diff --git a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc index dab5258cdf..b19d7d19a0 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc @@ -150,7 +150,9 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector &img_size, total_size_ += size; const uint64_t max_size = ocl_runtime_->GetGlobalMemSize(); if (total_size_ >= max_size) { + UnLock(); MS_LOG(ERROR) << "Mem pool out of max_size, total size: " << total_size_ << ", max size: " << max_size; + return nullptr; } cl::Buffer *buffer = nullptr; cl::Image2D *image = nullptr;