From e8b27b323fc03049230d0273a7305c825b0ab748 Mon Sep 17 00:00:00 2001 From: Pengyongrong Date: Mon, 26 Oct 2020 22:49:13 -0700 Subject: [PATCH] anewdd new ops named hswish --- .../kernel/opencl/cl/fullconnection.cl | 21 +-- .../src/runtime/kernel/opencl/cl/hswish.cl | 19 +++ .../kernel/opencl/kernel/fullconnection.cc | 2 +- .../runtime/kernel/opencl/kernel/hswish.cc | 128 ++++++++++++++++++ .../src/runtime/kernel/opencl/kernel/hswish.h | 50 +++++++ .../kernel/opencl/subgraph_opencl_kernel.cc | 16 --- mindspore/lite/test/models_fp32_gpu.cfg | 1 + mindspore/lite/test/models_tflite.cfg | 1 + mindspore/lite/test/run_test.sh | 1 + .../src/runtime/kernel/opencl/hswish_tests.cc | 100 ++++++++++++++ 10 files changed, 312 insertions(+), 27 deletions(-) create mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/hswish.cl create mode 100644 mindspore/lite/src/runtime/kernel/opencl/kernel/hswish.cc create mode 100644 mindspore/lite/src/runtime/kernel/opencl/kernel/hswish.h create mode 100644 mindspore/lite/test/ut/src/runtime/kernel/opencl/hswish_tests.cc diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fullconnection.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fullconnection.cl index 4c7febb2bb..0714674fb6 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fullconnection.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fullconnection.cl @@ -11,13 +11,14 @@ __kernel void FullConnection_NHWC4(__read_only image2d_t input, __global FLT16 * int lidy = get_local_id(1); int ci4 = UP_DIV(in_shape.w, C4NUM); int hwci4 = ci4 * in_shape.y * in_shape.z; + int wci4 = ci4 * in_shape.z; int co4 = UP_DIV(out_shape.y, C4NUM); int n = out_shape.x; - if (gidx >= co4 || gidz >= n) return; + bool inside = gidx < co4 && gidz < n; FLT4 result = (FLT4)(0.0f); - for (uint i = lidy; i < hwci4; i += 4) { - int index_h = i / (ci4 * in_shape.z); - int index_wci4 = i % (ci4 * in_shape.z); + for (uint i = lidy; i < hwci4 && inside; i += 4) { + int index_h = i / wci4; + int index_wci4 = i % wci4; FLT4 v = READ_IMAGE(input, smp_zero, (int2)(index_wci4, gidz * in_shape.y + index_h)); FLT16 w = weight[i * co4 + gidx]; result.x += dot(v, w.s0123); @@ -25,13 +26,13 @@ __kernel void FullConnection_NHWC4(__read_only image2d_t input, __global FLT16 * result.z += dot(v, w.s89ab); result.w += dot(v, w.scdef); } - __local FLT4 temp[4]; - temp[lidy] = result; + __local FLT4 temp[32][4]; + temp[lidx][lidy] = result; barrier(CLK_LOCAL_MEM_FENCE); - if (lidy == 0) { - result += temp[1]; - result += temp[2]; - result += temp[3]; + if (lidy == 0 && inside) { + result += temp[lidx][1]; + result += temp[lidx][2]; + result += temp[lidx][3]; result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); result = clamp(result, (FLT)(act_min), (FLT)(act_max)); WRITE_IMAGE(output, (int2)(gidx, gidz), result); diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/hswish.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/hswish.cl new file mode 100644 index 0000000000..fa7486bb63 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/hswish.cl @@ -0,0 +1,19 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; +__kernel void hswish(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 tensor_shape) { + int X = get_global_id(0); // n*h n: default =1 + int Y = get_global_id(1); // w + int Z = get_global_id(2); // c + if (X >= tensor_shape.x * tensor_shape.y || Y >= tensor_shape.z || Z >= tensor_shape.w || tensor_shape.y == 0) { + return; + } + int n = X / tensor_shape.y; + int h = X % tensor_shape.y; + FLT4 temp = READ_IMAGE(src_data, smp_none, (int2)((Y)*tensor_shape.w + Z, (n * tensor_shape.y + h))); + FLT4 result = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + result.x = temp.x <= -3 ? 0 : (temp.x >= 3 ? 1 : temp.x / 6 + 0.5f); + result.y = temp.y <= -3 ? 0 : (temp.y >= 3 ? 1 : temp.y / 6 + 0.5f); + result.z = temp.z <= -3 ? 0 : (temp.z >= 3 ? 1 : temp.z / 6 + 0.5f); + result.w = temp.w <= -3 ? 0 : (temp.w >= 3 ? 1 : temp.w / 6 + 0.5f); + WRITE_IMAGE(dst_data, (int2)((Y)*tensor_shape.w + Z, (n * tensor_shape.y + h)), result); +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc index bf6061660f..a881860277 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc @@ -171,7 +171,7 @@ void FullConnectionOpenCLKernel::PadWeight() { int FullConnectionOpenCLKernel::Run() { MS_LOG(DEBUG) << this->name() << " Running!"; - std::vector local = {1, 4, 1}; + std::vector local = {32, 4, 1}; std::vector global = {UP_DIV(outShape.C, C4NUM), 4, outShape.N}; int arg_count = 0; cl_int4 in_shape = {static_cast(inShape.N), static_cast(inShape.H), static_cast(inShape.W), diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/hswish.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/hswish.cc new file mode 100644 index 0000000000..099548aca1 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/hswish.cc @@ -0,0 +1,128 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/kernel/opencl/kernel/hswish.h" +#include +#include +#include +#include +#include "src/kernel_registry.h" +#include "src/runtime/kernel/opencl/utils.h" +#include "src/runtime/kernel/opencl/cl/hswish.cl.inc" + +using mindspore::kernel::KERNEL_ARCH::kGPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_Activation; + +namespace mindspore::kernel { + +int HswishOpenCLKernel::Init() { + if (out_tensors_[0]->shape().size() > 4) { + MS_LOG(ERROR) << " only support dim <= 4"; + return RET_ERROR; + } + + std::string kernel_name = "hswish"; + std::set build_options; + std::string source = hswish_source; + std::string program_name = "hswish"; + ocl_runtime_->LoadSource(program_name, source); + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); + MS_LOG(DEBUG) << kernel_name << " Init Done!"; + return RET_OK; +} + +void HswishGetWorkGroup(const std::vector &global, std::vector *local, int max_size) { + const int max_divider = 8; + const int max_x = 2, max_y = 8; + int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x); + int yz = max_size / x; + int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y); + int z = std::min(yz / y, static_cast(UP_DIV(global[2], 2))); + + local->clear(); + local->push_back(x); + local->push_back(y); + local->push_back(z); +} + +int HswishOpenCLKernel::InferShapeTo4D() { + if (in_tensors_[0]->shape().size() <= 4) { + if (in_tensors_[0]->shape().size() == 1) { + N_ = in_tensors_[0]->shape()[0]; + } else if (in_tensors_[0]->shape().size() == 2) { + N_ = in_tensors_[0]->shape()[0]; + C_ = in_tensors_[0]->shape()[1]; + } else if (in_tensors_[0]->shape().size() == 3) { + N_ = in_tensors_[0]->shape()[0]; + W_ = in_tensors_[0]->shape()[1]; + C_ = in_tensors_[0]->shape()[2]; + } else { + N_ = in_tensors_[0]->shape()[0]; + H_ = in_tensors_[0]->shape()[1]; + W_ = in_tensors_[0]->shape()[2]; + C_ = in_tensors_[0]->shape()[3]; + } + } else { + MS_LOG(ERROR) << "Unsupported inputdim: " << in_tensors_[0]->shape().size(); + return RET_ERROR; + } + return RET_OK; +} + +int HswishOpenCLKernel::Run() { + MS_LOG(DEBUG) << this->name() << " Running! "; + auto output_shape = out_tensors_[0]->shape(); + InferShapeTo4D(); + cl_int4 output_shape_ = {static_cast(N_), static_cast(H_), static_cast(W_), + static_cast(UP_DIV(C_, C4NUM))}; + const std::vector &max_global = ocl_runtime_->GetWorkItemSize(); + std::vector local = {1, 1, 1}; + uint32_t OH = N_ * H_; + uint32_t OW = W_; + uint32_t OC = UP_DIV(C_, C4NUM); + std::vector global = {OH, OW, OC}; + HswishGetWorkGroup(global, &local, max_global[0]); + int arg_cn = 0; + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_); + ocl_runtime_->RunKernel(kernel_, global, local, nullptr); + return RET_OK; +} + +kernel::LiteKernel *HswishOpenCLKernelCreator(const std::vector &inputs, + const std::vector &outputs, OpParameter *opParameter, + const lite::InnerContext *ctx, const kernel::KernelKey &desc, + const mindspore::lite::PrimitiveC *primitive) { + auto *kernel = new (std::nothrow) HswishOpenCLKernel(opParameter, inputs, outputs); + if (kernel == nullptr) { + MS_LOG(ERROR) << " new HswishOpenCLKernel failed "; + free(opParameter); + return nullptr; + } + auto ret = kernel->Init(); + if (ret != RET_OK) { + MS_LOG(ERROR) << " Init kernel failed, name: hswish "; + delete kernel; + return nullptr; + } + return kernel; +} + +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/hswish.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/hswish.h new file mode 100644 index 0000000000..bd44c198dc --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/hswish.h @@ -0,0 +1,50 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_HSWISH_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_HSWISH_H_ + +#include +#include "mindspore/lite/nnacl/fp32/activation.h" +#include "src/runtime/kernel/opencl/opencl_kernel.h" + +namespace mindspore::kernel { + +class HswishOpenCLKernel : public OpenCLKernel { + public: + HswishOpenCLKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs) + : OpenCLKernel(parameter, inputs, outputs) {} + + ~HswishOpenCLKernel() override = default; + + int Init() override; + + int Run() override; + + private: + int InferShapeTo4D(); + cl::Kernel kernel_; + + private: + size_t N_{1}; + size_t H_{1}; + size_t W_{1}; + size_t C_{1}; +}; + +} // namespace mindspore::kernel +#endif diff --git a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc index bd8f334103..c02eff91a1 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc @@ -48,22 +48,6 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector &in_te } } for (size_t i = 0; i < in_tensors.size(); ++i) { - if (in_tensors.at(i)->shape().size() <= 1) { - if (mem_type == OpenCLMemType::IMG) { - for (auto &iv : in_kernels[i]) { - auto tensors = iv->in_tensors(); - tensors.emplace_back(in_tensors.at(i)); - iv->set_in_tensors(tensors); - } - } else { - for (auto &iv : in_kernels[i]) { - auto tensors = iv->out_tensors(); - tensors.emplace_back(in_tensors.at(i)); - iv->set_out_tensors(tensors); - } - } - continue; - } auto dst_format = (mem_type == OpenCLMemType::IMG) ? schema::Format::Format_NHWC4 : schema::Format::Format_NHWC; auto src_format = (mem_type == OpenCLMemType::IMG) ? schema::Format::Format_NHWC : schema::Format::Format_NHWC4; auto *new_tensor = new (std::nothrow) lite::Tensor(); diff --git a/mindspore/lite/test/models_fp32_gpu.cfg b/mindspore/lite/test/models_fp32_gpu.cfg index 9b1bd59a2a..116b2f78a8 100644 --- a/mindspore/lite/test/models_fp32_gpu.cfg +++ b/mindspore/lite/test/models_fp32_gpu.cfg @@ -16,6 +16,7 @@ mtk_model_normalize_object_scene_ps_20200826_f32_no_softmax.tflite mtk_276landmark_0913.tflite mtk_face_recognition.tflite mtk_convert_model.tflite +mtk_model_face_dress_fp16.tflite detection_retinaface_fix landmark PoseNet_dla_17_x512 diff --git a/mindspore/lite/test/models_tflite.cfg b/mindspore/lite/test/models_tflite.cfg index e854a4155a..b984dbe566 100644 --- a/mindspore/lite/test/models_tflite.cfg +++ b/mindspore/lite/test/models_tflite.cfg @@ -130,3 +130,4 @@ mtk_model_normalize_object_scene_ps_20200826_f32_no_softmax.tflite mtk_276landmark_0913.tflite mtk_face_recognition.tflite mtk_convert_model.tflite +mtk_model_face_dress_fp16.tflite diff --git a/mindspore/lite/test/run_test.sh b/mindspore/lite/test/run_test.sh index fdf6c642a1..23626f1601 100755 --- a/mindspore/lite/test/run_test.sh +++ b/mindspore/lite/test/run_test.sh @@ -47,3 +47,4 @@ cp -fr $TEST_DATA_DIR/testPK ./data ./lite-test --gtest_filter="TestScaleOpenCL*" ./lite-test --gtest_filter="TestFullConnectionOpenCL*" ./lite-test --gtest_filter="TestResizeOpenCL*" +./lite-test --gtest_filter="TestSwishOpenCLCI.Fp32CI" diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/hswish_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/hswish_tests.cc new file mode 100644 index 0000000000..aab2821ddb --- /dev/null +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/hswish_tests.cc @@ -0,0 +1,100 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include "src/common/log_adapter.h" +#include "common/common_test.h" +#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" +#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" +#include "mindspore/lite/src/runtime/kernel/opencl/kernel/hswish.h" +using mindspore::lite::Tensor; +using mindspore::schema::Format::Format_NHWC; +namespace mindspore { +class TestSwishOpenCLCI : public mindspore::CommonTest { + public: + TestSwishOpenCLCI() {} +}; + +TEST_F(TestSwishOpenCLCI, Fp32CI) { + MS_LOG(INFO) << " begin test "; + auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper(); + auto runtime = runtime_wrapper.GetInstance(); + runtime->Init(); + auto allocator = runtime->GetAllocator(); + + MS_LOG(INFO) << " init tensors "; + std::vector input_shape = {2, 10, 1, 4}; + std::vector output_shape = {2, 10, 1, 4}; + auto data_type = kNumberTypeFloat32; + auto tensor_type = lite::Tensor::CONST_TENSOR; + float input_data[] = {2.5f, 6.0f, -7.4f, -3.5f, 5.9f, 6.5f, -8.0f, 7.4f, 5.9f, 6.5f, -8.0f, 7.4f, 7.5f, 6.0f, + -7.4f, -3.5f, 7.5f, 6.0f, -7.4f, -3.5f, 5.9f, 6.5f, -8.0f, 7.4f, 5.9f, 6.5f, -8.0f, 7.4f, + 7.5f, 6.0f, -7.4f, -3.5f, 7.5f, 6.0f, -7.4f, -3.5f, 5.9f, 6.5f, -8.0f, 7.4f, 5.9f, 6.5f, + -8.0f, 7.4f, 7.5f, 6.0f, -7.4f, -3.5f, 7.5f, 6.0f, -7.4f, -3.5f, 5.9f, 6.5f, -8.0f, 7.4f, + 5.9f, 6.5f, -8.0f, 7.4f, 7.5f, 6.0f, -7.4f, -3.5f, 7.5f, 6.0f, -7.4f, -3.5f, 5.9f, 6.5f, + -8.0f, 7.4f, 5.9f, 6.5f, -8.0f, 7.4f, 7.5f, 6.0f, -7.4f, -3.5f}; + + float correctOutput[] = {0.9167f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 1.0f, + 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 1.0f, 0.0f, 1.0f, + 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 1.0f, + 0.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 1.0f, + 1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, + 0.0f, 1.0f, 1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f}; + auto output_tensor = Tensor(data_type, input_shape, Format_NHWC, tensor_type); + auto in_tensor = Tensor(data_type, output_shape, Format_NHWC, tensor_type); + std::vector inputs{&in_tensor}; + std::vector outputs{&output_tensor}; + + MS_LOG(INFO) << " initialize tensors "; + auto param = reinterpret_cast(malloc(sizeof(ActivationParameter))); + if (param == nullptr) { + MS_LOG(INFO) << " new ActivationParameter failed "; + return; + } + + auto *hswish_kernel = + new (std::nothrow) kernel::HswishOpenCLKernel(reinterpret_cast(param), inputs, outputs); + if (hswish_kernel == nullptr) { + MS_LOG(INFO) << " new kernel::HswishOpenCLKernel failed "; + delete param; + return; + } + hswish_kernel->Init(); + // to do allocate memory for inputs + for (auto &input_tensor : inputs) { + input_tensor->MallocData(allocator); + } + + MS_LOG(INFO) << " initialize sub_graph "; + std::vector kernels{hswish_kernel}; + auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); + if (sub_graph == nullptr) { + MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed "; + delete param; + delete hswish_kernel; + return; + } + sub_graph->Init(); + MS_LOG(INFO) << " initialize input data "; + memcpy(inputs[0]->data_c(), input_data, sizeof(input_data)); + + std::cout << "==================output data================" << std::endl; + sub_graph->Run(); + auto *output_data_gpu = reinterpret_cast(output_tensor.data_c()); + CompareOutputData(output_data_gpu, correctOutput, output_tensor.ElementsNum(), 0.0001); + delete sub_graph; +} +} // namespace mindspore