diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/scale.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/scale.cl
new file mode 100644
index 0000000000..92dc04d45b
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/scale.cl
@@ -0,0 +1,42 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+
+__kernel void Scale_IMG(__read_only image2d_t input, __read_only image2d_t scale, __read_only image2d_t offset,
+                        __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 in = READ_IMAGE(input, smp_none, (int2)(X, Y));
+  FLT4 s = READ_IMAGE(scale, smp_none, (int2)(X, Y));
+  FLT4 o = READ_IMAGE(offset, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), in * s + o);
+}
+
+__kernel void BoardcastScale_IMG(__read_only image2d_t input, float scale, float offset, __write_only image2d_t output,
+                                 const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 in = READ_IMAGE(input, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), in * (FLT)scale + (FLT)offset);
+}
+
+__kernel void Scale_C_IMG(__read_only image2d_t input, __read_only image2d_t scale, __read_only image2d_t offset,
+                          __write_only image2d_t output, const int2 output_shape, const int C) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+
+  FLT4 in = READ_IMAGE(input, smp_none, (int2)(X, Y));
+  FLT4 s = READ_IMAGE(scale, smp_none, (int2)(X % C, 0));
+  FLT4 o = READ_IMAGE(offset, smp_none, (int2)(X % C, 0));
+  WRITE_IMAGE(output, (int2)(X, Y), in * s + o);
+}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc new file mode 100644 index 0000000000..d972aa0cb7 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc @@ -0,0 +1,384 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/kernel/opencl/kernel/scale.h" +#include +#include +#include +#include "schema/model_generated.h" +#include "src/kernel_registry.h" +#include "nnacl/fp32/common_func.h" +#include "src/runtime/kernel/opencl/utils.h" +#ifndef PROGRAM_WITH_IL +#include "src/runtime/kernel/opencl/cl/scale.cl.inc" +#endif + +using mindspore::kernel::KERNEL_ARCH::kGPU; +using mindspore::lite::KernelRegistrar; +using mindspore::schema::PrimitiveType_Scale; + +namespace mindspore::kernel { + +ScaleOpenCLKernel::~ScaleOpenCLKernel() { + auto allocator = ocl_runtime_->GetAllocator(); + if (scale_ptr_ != nullptr) { + allocator->Free(scale_ptr_); + scale_ptr_ = nullptr; + } + if (offset_ptr_ != nullptr) { + allocator->Free(offset_ptr_); + offset_ptr_ = nullptr; + } +} + +std::vector ScaleOpenCLKernel::InitGlobalSize() const { + const size_t global_x = out_tensors_[0]->Width(); + const size_t global_y = out_tensors_[0]->Height(); + const size_t global_z = UP_ROUND_DIV(out_tensors_[0]->Channel(), C4NUM); + std::vector global = {global_x, global_y, global_z}; + return global; +} + +void ScaleOpenCLKernel::Image2dGetWorkGroupSize() { + local_size_ = {16, 16}; + 
if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) { + size_t H = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Channel(), C4NUM); + size_t W = out_tensors_[0]->Width(); + global_size_ = {W, H}; + } else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) { + size_t H = out_tensors_[0]->Batch() * out_tensors_[0]->Height(); + size_t W = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM); + global_size_ = {W, H}; + } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) { + size_t H = out_tensors_[0]->Batch(); + size_t W = UP_DIV(out_tensors_[0]->Channel(), C4NUM); + global_size_ = {W, H}; + } else { + MS_LOG(ERROR) << "Unsupport data format " << out_tensors_[0]->GetFormat(); + } +} + +void ScaleOpenCLKernel::BufferGetWorkGroupSize() { + uint32_t element_num = out_tensors_[0]->ElementsC4Num(); + global_size_ = {element_num}; +} + +int ScaleOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { + size_t im_dst_x, im_dst_y; + if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) { + im_dst_x = out_tensors_[0]->Width(); + im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Channel(), C4NUM); + } else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) { + im_dst_x = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM); + im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height(); + } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) { + im_dst_y = out_tensors_[0]->Batch(); + im_dst_x = UP_DIV(out_tensors_[0]->Channel(), C4NUM); + } else { + MS_LOG(ERROR) << "Unsupport data format " << out_tensors_[0]->GetFormat(); + return RET_ERROR; + } + + size_t img_dtype = CL_FLOAT; + if (in_tensors_[0]->data_type() == kNumberTypeFloat16) { + img_dtype = CL_HALF_FLOAT; + } else if (in_tensors_[0]->data_type() == kNumberTypeFloat32) { + img_dtype = CL_FLOAT; + } else { + MS_LOG(ERROR) << "Unsupport data type " << 
in_tensors_[0]->data_type(); + } + img_size->clear(); + std::vector vec{im_dst_x, im_dst_y, img_dtype}; + *img_size = vec; + return RET_OK; +} + +int ScaleOpenCLKernel::InitBuffer() { + if (!element_flag_) { + return RET_OK; + } + if (in_tensors_[1]->TensorType() == schema::NodeType_ValueNode && in_tensors_[1]->Data() != nullptr) { + auto allocator = ocl_runtime_->GetAllocator(); + std::vector img_size; + GetImageSize(0, &img_size); + if (in_tensors_[1]->shape().size() == 1 && axis_ == 3) { + img_size[0] = 1; + img_size[1] = UP_DIV(in_tensors_[1]->shape()[0], C4NUM); + scale_ptr_ = allocator->CreateImageFromHost(in_tensors_[1]->Data(), in_tensors_[1]->ElementsNum(), img_size); + offset_ptr_ = allocator->CreateImageFromHost(in_tensors_[2]->Data(), in_tensors_[2]->ElementsNum(), img_size); + return RET_OK; + } + int pack_weight_size = in_tensors_[1]->ElementsC4Num(); + int plane = in_tensors_[1]->Height() * in_tensors_[1]->Width(); + int channel = in_tensors_[1]->Channel(); + int batch = in_tensors_[1]->Batch(); + if (in_tensors_[0]->GetFormat() == in_tensors_[1]->GetFormat()) { + if (in_tensors_[0]->data_type() == in_tensors_[1]->data_type()) { + scale_ptr_ = allocator->CreateImageFromHost(in_tensors_[1]->Data(), in_tensors_[1]->ElementsNum(), img_size); + offset_ptr_ = allocator->CreateImageFromHost(in_tensors_[2]->Data(), in_tensors_[2]->ElementsNum(), img_size); + } else { + MS_LOG(ERROR) << "Unsupport data type transpose from " << in_tensors_[1]->data_type() << "to " + << in_tensors_[0]->data_type(); + return RET_ERROR; + } + } else if (in_tensors_[0]->GetFormat() == schema::Format_NC4HW4) { + if (in_tensors_[1]->GetFormat() == schema::Format_NHWC) { + if (in_tensors_[0]->data_type() == kNumberTypeFloat32) { + float *scale = new (std::nothrow) float[pack_weight_size]; + if (scale == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed!"; + return RET_ERROR; + } + float *offset = new (std::nothrow) float[pack_weight_size]; + if (offset == nullptr) { + 
MS_LOG(ERROR) << "Malloc buffer failed!"; + delete[] scale; + return RET_ERROR; + } + std::function to_dtype = [](float x) -> float { return (float)x; }; + PackNHWCToNC4HW4(in_tensors_[1]->Data(), scale, batch, plane, channel, to_dtype); + PackNHWCToNC4HW4(in_tensors_[2]->Data(), offset, batch, plane, channel, to_dtype); + scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size); + offset_ptr_ = allocator->CreateImageFromHost(offset, in_tensors_[2]->ElementsNum(), img_size); + delete[] scale; + delete[] offset; + } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) { + int16_t *scale = new (std::nothrow) int16_t[pack_weight_size]; + if (scale == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed!"; + return RET_ERROR; + } + int16_t *offset = new (std::nothrow) int16_t[pack_weight_size]; + if (offset == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed!"; + delete[] scale; + return RET_ERROR; + } + std::function to_dtype = Float32ToShort; + PackNHWCToNC4HW4(in_tensors_[1]->Data(), scale, batch, plane, channel, to_dtype); + PackNHWCToNC4HW4(in_tensors_[2]->Data(), offset, batch, plane, channel, to_dtype); + scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size); + offset_ptr_ = allocator->CreateImageFromHost(offset, in_tensors_[2]->ElementsNum(), img_size); + delete[] scale; + delete[] offset; + } else { + MS_LOG(ERROR) << "Unsupport data type transpose from " << in_tensors_[1]->data_type() << "to " + << in_tensors_[0]->data_type(); + return RET_ERROR; + } + } else { + MS_LOG(ERROR) << "Unsupport format transpose from " << in_tensors_[1]->GetFormat() << "to " + << in_tensors_[0]->GetFormat(); + return RET_ERROR; + } + } else if (in_tensors_[0]->GetFormat() == schema::Format_NHWC4) { + if (in_tensors_[1]->GetFormat() == schema::Format_NHWC) { + if (in_tensors_[0]->data_type() == kNumberTypeFloat32) { + float *scale = new (std::nothrow) float[pack_weight_size]; + if (scale == nullptr) { 
+ MS_LOG(ERROR) << "Malloc buffer failed!"; + return RET_ERROR; + } + float *offset = new (std::nothrow) float[pack_weight_size]; + if (offset == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed!"; + delete[] scale; + return RET_ERROR; + } + std::function to_dtype = [](float x) -> float { return (float)x; }; + PackNHWCToNHWC4(in_tensors_[1]->Data(), scale, batch, plane, channel, to_dtype); + PackNHWCToNHWC4(in_tensors_[2]->Data(), offset, batch, plane, channel, to_dtype); + scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size); + offset_ptr_ = allocator->CreateImageFromHost(offset, in_tensors_[2]->ElementsNum(), img_size); + delete[] scale; + delete[] offset; + } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) { + int16_t *scale = new (std::nothrow) int16_t[pack_weight_size]; + if (scale == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed!"; + return RET_ERROR; + } + int16_t *offset = new (std::nothrow) int16_t[pack_weight_size]; + if (offset == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed!"; + delete[] scale; + return RET_ERROR; + } + std::function to_dtype = Float32ToShort; + PackNHWCToNHWC4(in_tensors_[1]->Data(), scale, batch, plane, channel, to_dtype); + PackNHWCToNHWC4(in_tensors_[2]->Data(), offset, batch, plane, channel, to_dtype); + scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size); + offset_ptr_ = allocator->CreateImageFromHost(offset, in_tensors_[2]->ElementsNum(), img_size); + delete[] scale; + delete[] offset; + } else { + MS_LOG(ERROR) << "Unsupport data type transpose from " << in_tensors_[1]->data_type() << "to " + << in_tensors_[0]->data_type(); + return RET_ERROR; + } + } else { + MS_LOG(ERROR) << "Unsupport format transpose from " << in_tensors_[1]->GetFormat() << "to " + << in_tensors_[0]->GetFormat(); + return RET_ERROR; + } + } + return RET_OK; + } + return RET_OK; +} + +int ScaleOpenCLKernel::Init() { + ocl_runtime_ = 
lite::opencl::OpenCLRuntime::GetInstance(); + std::string kernel_name; + + const ScaleParameter *scale_param = reinterpret_cast(op_parameter_); + auto in_tensor = in_tensors_.at(0); + auto in_shape = in_tensor->shape(); + auto scale_tensor = in_tensors_.at(1); + auto scale_shape = scale_tensor->shape(); + axis_ = scale_param->axis_; + if (axis_ < 0) { + axis_ = axis_ + in_shape.size(); + } + if (scale_shape.size() != in_shape.size()) { + if (scale_tensor->ElementsNum() == 1) { + element_flag_ = false; + kernel_name = "BoardcastScale"; + } else if (axis_ == 3 && scale_shape.size() == 1) { + element_flag_ = true; + kernel_name = "Scale_C"; + } + } else { + element_flag_ = true; + kernel_name = "Scale"; + } + lite::STATUS error_code = RET_OK; +#ifdef PROGRAM_WITH_IL + kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); +#else + if (out_mem_type_ == OpenCLMemType::IMG) { + kernel_name += "_IMG"; + } else { + kernel_name += "_BUF"; + } + std::string program_name = "Scale"; + std::set build_options; + std::string source = scale_source; + ocl_runtime_->LoadSource(program_name, source); + error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); +#endif + if (error_code != RET_OK) { + return error_code; + } + + auto format = op_format_; + if (out_tensors_[0]->shape().size() == 2) { + format = schema::Format_NC4; + } + in_ori_format_ = in_tensors_[0]->GetFormat(); + out_ori_format_ = out_tensors_[0]->GetFormat(); + in_tensors_[0]->SetFormat(format); + if (element_flag_ && in_tensors_[1]->TensorType() != schema::NodeType_ValueNode) { + in_tensors_[1]->SetFormat(format); + in_tensors_[2]->SetFormat(format); + } + out_tensors_[0]->SetFormat(format); + Image2dGetWorkGroupSize(); + InitBuffer(); + return RET_OK; +} + +int ScaleOpenCLKernel::Run() { + MS_LOG(DEBUG) << this->name() << " Running!"; + + int arg_idx = 0; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->Data()); + if (element_flag_) { + void *scale = scale_ptr_ == 
nullptr ? in_tensors_[1]->Data() : scale_ptr_; + void *offset = offset_ptr_ == nullptr ? in_tensors_[2]->Data() : offset_ptr_; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale); + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, offset); + } else { + if (in_tensors_[0]->data_type() == kNumberTypeFloat32) { + float scale = static_cast(in_tensors_[1]->Data())[0]; + float offset = static_cast(in_tensors_[2]->Data())[0]; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale); + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, offset); + } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) { + if (in_tensors_[1]->data_type() == kNumberTypeFloat32) { + float scale = static_cast(in_tensors_[1]->Data())[0]; + float offset = static_cast(in_tensors_[2]->Data())[0]; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(scale)); + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(offset)); + } else if (in_tensors_[1]->data_type() == kNumberTypeFloat16) { + int16_t scale = static_cast(in_tensors_[1]->Data())[0]; + int16_t offset = static_cast(in_tensors_[2]->Data())[0]; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(scale)); + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(offset)); + } else { + MS_LOG(ERROR) << "Unsupport data type " << in_tensors_[1]->data_type(); + return RET_ERROR; + } + } + } + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->Data()); + int H = 0; + int W = 0; + if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) { + H = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Channel(), C4NUM); + W = out_tensors_[0]->Width(); + } else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) { + H = out_tensors_[0]->Batch() * out_tensors_[0]->Height(); + W = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM); + } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) { + H = out_tensors_[0]->Batch(); + W = 
UP_DIV(out_tensors_[0]->Channel(), C4NUM); + } else { + MS_LOG(ERROR) << "Error output type " << out_tensors_[0]->GetFormat(); + return RET_ERROR; + } + cl_int2 output_shape{W, H}; + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); + if (element_flag_ && axis_ == 3) { + ocl_runtime_->SetKernelArg(kernel_, arg_idx++, UP_DIV(in_tensors_[1]->shape()[0], C4NUM)); + } + ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); + return RET_OK; +} + +kernel::LiteKernel *OpenCLScaleKernelCreator(const std::vector &inputs, + const std::vector &outputs, + OpParameter *opParameter, const lite::Context *ctx, + const kernel::KernelKey &desc, + const mindspore::lite::PrimitiveC *primitive) { + auto *kernel = + new (std::nothrow) ScaleOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs, ctx); + if (kernel == nullptr) { + MS_LOG(ERROR) << "Create OpenCL Scale kernel failed!"; + return nullptr; + } + auto ret = kernel->Init(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Init kernel failed, name: Scale"; + delete kernel; + return nullptr; + } + return kernel; +} + +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Scale, OpenCLScaleKernelCreator) +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Scale, OpenCLScaleKernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h new file mode 100644 index 0000000000..82e754a3bd --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h @@ -0,0 +1,56 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_SCALE_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_SCALE_H_ + +#include +#include "nnacl/scale.h" +#include "src/runtime/opencl/opencl_runtime.h" +#include "src/runtime/kernel/opencl/opencl_kernel.h" + +namespace mindspore::kernel { + +class ScaleOpenCLKernel : public OpenCLKernel { + public: + explicit ScaleOpenCLKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::Context *ctx) + : OpenCLKernel(parameter, inputs, outputs) {} + ~ScaleOpenCLKernel() override; + + int Init() override; + int Run() override; + int GetImageSize(size_t idx, std::vector *img_size) override; + + private: + std::vector InitGlobalSize() const; + void Image2dGetWorkGroupSize(); + void BufferGetWorkGroupSize(); + int InitBuffer(); + + cl::Kernel kernel_; + lite::opencl::OpenCLRuntime *ocl_runtime_; + bool element_flag_{true}; + void *scale_ptr_{nullptr}; + void *offset_ptr_{nullptr}; + int axis_{0}; + + std::vector local_size_; + std::vector global_size_; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_SCALE_H_ diff --git a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc index a193c302c3..896d460a8b 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc @@ -288,13 +288,14 @@ void OpenCLAllocator::Clear() { MS_LOG(DEBUG) << "OpenCL free svm buffer : " << 
it->second->host_ptr_; } else { cl::Buffer *buffer = static_cast(it->second->device_ptr_); - MS_LOG(DEBUG) << "OpenCL free device buffer : " << buffer; if (buffer != nullptr) { + MS_LOG(DEBUG) << "OpenCL free device buffer : " << buffer; delete buffer; it->second->device_ptr_ = nullptr; } cl::Image *image = static_cast(it->second->image_ptr_); if (image != nullptr) { + MS_LOG(DEBUG) << "OpenCL free image : " << image; delete image; it->second->image_ptr_ = nullptr; } diff --git a/mindspore/lite/test/CMakeLists.txt b/mindspore/lite/test/CMakeLists.txt index f0011c79b8..db1cd38cbd 100644 --- a/mindspore/lite/test/CMakeLists.txt +++ b/mindspore/lite/test/CMakeLists.txt @@ -161,6 +161,7 @@ if (SUPPORT_GPU) ${LITE_DIR}/src/runtime/kernel/opencl/kernel/prelu.cc ${LITE_DIR}/src/runtime/kernel/opencl/kernel/to_format.cc ${LITE_DIR}/src/runtime/kernel/opencl/kernel/biasadd.cc + ${LITE_DIR}/src/runtime/kernel/opencl/kernel/scale.cc ) endif() ### minddata lite @@ -349,6 +350,7 @@ if (SUPPORT_GPU) ${TEST_DIR}/ut/src/runtime/kernel/opencl/prelu_tests.cc ${TEST_DIR}/ut/src/runtime/kernel/opencl/reshape_tests.cc ${TEST_DIR}/ut/src/runtime/kernel/opencl/biasadd_tests.cc + ${TEST_DIR}/ut/src/runtime/kernel/opencl/scale_tests.cc ) endif() diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc index a752594142..5efe368204 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc @@ -136,8 +136,8 @@ void TestCase(const std::vector &shape_a, const std::vector &shape_b) std::vector arithmetic_inputs = {tensor_a, tensor_b}; lite::Context ctx; - auto *arith_kernel = - new kernel::ArithmeticOpenCLKernel(reinterpret_cast(param), arithmetic_inputs, outputs, &ctx); + auto *arith_kernel = new (std::nothrow) + kernel::ArithmeticOpenCLKernel(reinterpret_cast(param), arithmetic_inputs, outputs, &ctx); 
if (arith_kernel == nullptr) { MS_LOG(ERROR) << "Create ArithmeticOpenCLKernel failed!"; delete tensor_a; @@ -216,7 +216,7 @@ TEST_F(TestArithmeticOpenCL, AddElementwiseTest) { TestCase(shape_a, shape_b); } -TEST_F(TestArithmeticOpenCL, AddBoardcaseTest) { +TEST_F(TestArithmeticOpenCL, AddBroadcastTest) { const std::vector &shape_a = {1, 128, 128, 4}; const std::vector &shape_b = {}; TestCase(shape_a, shape_b); diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/scale_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/scale_tests.cc new file mode 100644 index 0000000000..e27c3d30b4 --- /dev/null +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/scale_tests.cc @@ -0,0 +1,265 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "common/common_test.h" +#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" +#include "mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h" + +namespace mindspore { + +template +static void BoardcaseScale(const T *in, const T scale, const T offset, T *out, const int size) { + for (int i = 0; i < size; i++) { + out[i] = in[i] * scale + offset; + } +} + +template +static void Scale(const T *in, const T *scale, T *offset, T *out, const int size) { + for (int i = 0; i < size; i++) { + out[i] = in[i] * scale[i] + offset[i]; + } +} + +template +static bool DataCompare(const T *a, const T *b, const int size, const T accuracy = 1e-4) { + for (int i = 0; i < size; i++) { + auto diff = fabs(a[i] - b[i]); + if (diff > accuracy) { + MS_LOG(ERROR) << "compare failed at " << i << " exp " << a[i] << " bug got " << b[i]; + return false; + } + } + return true; +} + +template +static void InitData(void *data, const int size) { + T *data_float = reinterpret_cast(data); + static unsigned int seed = 123; + for (int i = 0; i < size; i++) { + data_float[i] = static_cast(rand_r(&seed)) % 100; + } +} + +template +static void LogData(void *data, const int size, const std::string prefix) { + std::cout << prefix; + T *data_float = reinterpret_cast(data); + for (int i = 0; i < size; i++) { + std::cout << data_float[i] << ","; + } + std::cout << std::endl; +} + +template +static void TestCase(const std::vector &shape_a, const std::vector &shape_b) { + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + auto allocator = ocl_runtime->GetAllocator(); + + bool is_broadcast = shape_b.empty(); + auto tensorType = schema::NodeType_ValueNode; + auto format = schema::Format_NHWC4; + + auto data_type = kNumberTypeFloat32; + if (sizeof(T) == 2) { + data_type = kNumberTypeFloat16; + ocl_runtime->SetFp16Enable(true); + } + lite::tensor::Tensor *tensor_in = new (std::nothrow) lite::tensor::Tensor(data_type, shape_a, format, tensorType); + 
lite::tensor::Tensor *tensor_scale = new (std::nothrow) lite::tensor::Tensor(data_type, shape_b, format, tensorType); + lite::tensor::Tensor *tensor_offset = new (std::nothrow) lite::tensor::Tensor(data_type, shape_b, format, tensorType); + lite::tensor::Tensor *tensor_out = new (std::nothrow) lite::tensor::Tensor(data_type, shape_a, format, tensorType); + if (tensor_in == nullptr || tensor_scale == nullptr || tensor_offset == nullptr) { + MS_LOG(ERROR) << "Create tensor failed!"; + delete tensor_in; + delete tensor_scale; + delete tensor_offset; + delete tensor_out; + return; + } + + int64_t element_num = tensor_in->ElementsC4Num(); + int64_t element_num_b = is_broadcast ? 1 : tensor_scale->ElementsC4Num(); + + T *data_in = new (std::nothrow) T[element_num]; + T *data_scale = new (std::nothrow) T[element_num_b]; + T *data_offset = new (std::nothrow) T[element_num_b]; + T *data_out_cpu = new (std::nothrow) T[element_num]; + T *data_out_ocl = new (std::nothrow) T[element_num]; + if (data_in == nullptr || data_scale == nullptr || data_out_cpu == nullptr || data_out_ocl == nullptr) { + MS_LOG(ERROR) << "Create buffer failed!"; + delete tensor_in; + delete tensor_scale; + delete tensor_offset; + delete tensor_out; + delete[] data_in; + delete[] data_scale; + delete[] data_offset; + delete[] data_out_cpu; + delete[] data_out_ocl; + return; + } + + InitData(data_in, element_num); + InitData(data_scale, element_num_b); + InitData(data_offset, element_num_b); + memset(data_out_ocl, 0, sizeof(T) * element_num); + + if (is_broadcast) { + BoardcaseScale(data_in, static_cast(data_scale)[0], static_cast(data_offset)[0], data_out_cpu, + element_num); + } else { + Scale(data_in, data_scale, data_offset, data_out_cpu, element_num); + } + + std::vector inputs = {tensor_in}; + if (!is_broadcast) { + inputs.push_back(tensor_scale); + inputs.push_back(tensor_offset); + } else { + tensor_scale->MallocData(); + tensor_offset->MallocData(); + memcpy(tensor_scale->Data(), data_scale, 
sizeof(T)); + memcpy(tensor_offset->Data(), data_offset, sizeof(T)); + } + std::vector outputs = {tensor_out}; + + ScaleParameter *param = new (std::nothrow) ScaleParameter(); + if (param == nullptr) { + MS_LOG(ERROR) << "Create parameter failed!"; + delete tensor_in; + delete tensor_scale; + delete tensor_offset; + delete tensor_out; + delete[] data_in; + delete[] data_scale; + delete[] data_offset; + delete[] data_out_cpu; + delete[] data_out_ocl; + return; + } + param->axis_ = 0; + param->op_parameter_.type_ = schema::PrimitiveType_Scale; + + std::vector scale_inputs = {tensor_in, tensor_scale, tensor_offset}; + lite::Context ctx; + auto *scale_kernel = + new (std::nothrow) kernel::ScaleOpenCLKernel(reinterpret_cast(param), scale_inputs, outputs, &ctx); + if (scale_kernel == nullptr) { + MS_LOG(ERROR) << "Create ScaleOpenCLKernel failed!"; + delete tensor_in; + delete tensor_scale; + delete tensor_offset; + delete tensor_out; + delete[] data_in; + delete[] data_scale; + delete[] data_offset; + delete[] data_out_cpu; + delete[] data_out_ocl; + delete param; + return; + } + scale_kernel->Init(); + + tensor_in->MallocData(allocator); + tensor_scale->MallocData(allocator); + tensor_offset->MallocData(allocator); + std::vector kernels{scale_kernel}; + auto *kernel = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); + if (scale_kernel == nullptr) { + MS_LOG(ERROR) << "Create SubGraphOpenCLKernel failed!"; + delete tensor_in; + delete tensor_scale; + delete tensor_offset; + delete tensor_out; + delete[] data_in; + delete[] data_scale; + delete[] data_offset; + delete[] data_out_cpu; + delete[] data_out_ocl; + delete scale_kernel; + return; + } + kernel->Init(); + + memcpy(inputs[0]->Data(), data_in, sizeof(T) * element_num); + if (!is_broadcast) { + memcpy(inputs[1]->Data(), data_scale, sizeof(T) * element_num_b); + memcpy(inputs[2]->Data(), data_offset, sizeof(T) * element_num_b); + } + + kernel->Run(); + + 
memcpy(data_out_ocl, outputs[0]->Data(), sizeof(T) * element_num); + + LogData(data_in, 10, "Data input : "); + LogData(data_scale, tensor_scale->shape().empty() ? 1 : 10, "Data scale : "); + LogData(data_offset, tensor_offset->shape().empty() ? 1 : 10, "Data offset : "); + LogData(data_out_cpu, 10, "Expect compute : "); + LogData(outputs[0]->Data(), 10, "OpenCL compute : "); + bool cmp = DataCompare(data_out_cpu, data_out_ocl, element_num); + MS_LOG(INFO) << "Compare " << (cmp ? "success!" : "failed!"); + EXPECT_EQ(true, cmp); + + // free + delete[] data_in; + delete[] data_scale; + delete[] data_offset; + delete[] data_out_cpu; + delete[] data_out_ocl; + + delete kernel; + delete scale_kernel; + delete param; + for (auto tensor : inputs) { + delete tensor; + } + for (auto tensor : outputs) { + delete tensor; + } + lite::opencl::OpenCLRuntime::DeleteInstance(); +} + +class TestScaleOpenCL : public mindspore::CommonTest { + public: + TestScaleOpenCL() {} +}; + +TEST_F(TestScaleOpenCL, ElementFP32) { + const std::vector &shape_a = {1, 1024, 1024, 4}; + const std::vector &shape_b = {1, 1024, 1024, 4}; + TestCase(shape_a, shape_b); +} + +TEST_F(TestScaleOpenCL, BroadcastFP32) { + const std::vector &shape_a = {1, 128, 128, 4}; + const std::vector &shape_b = {}; + TestCase(shape_a, shape_b); +} + +TEST_F(TestScaleOpenCL, ElementFP16) { + const std::vector &shape_a = {1, 1024, 1024, 4}; + const std::vector &shape_b = {1, 1024, 1024, 4}; + TestCase(shape_a, shape_b); +} + +TEST_F(TestScaleOpenCL, BroadcastFP16) { + const std::vector &shape_a = {1, 128, 128, 4}; + const std::vector &shape_b = {}; + TestCase(shape_a, shape_b); +} +} // namespace mindspore