From 03044858fa6b104aef78e44ec4a98295ffc67750 Mon Sep 17 00:00:00 2001 From: Pengyongrong Date: Tue, 15 Sep 2020 02:02:45 -0700 Subject: [PATCH] add new ops name cast --- .../lite/src/runtime/kernel/opencl/cl/cast.cl | 46 ++++ .../runtime/kernel/opencl/kernel/batchnorm.cc | 12 +- .../src/runtime/kernel/opencl/kernel/cast.cc | 152 +++++++++++++ .../src/runtime/kernel/opencl/kernel/cast.h | 52 +++++ .../runtime/kernel/opencl/kernel/concat.cc | 28 +-- .../src/runtime/kernel/opencl/kernel/slice.cc | 4 +- .../runtime/kernel/opencl/batchnorm_tests.cc | 24 +- .../src/runtime/kernel/opencl/cast_tests.cc | 212 ++++++++++++++++++ .../src/runtime/kernel/opencl/concat_tests.cc | 32 +-- .../src/runtime/kernel/opencl/slice_tests.cc | 8 +- 10 files changed, 516 insertions(+), 54 deletions(-) create mode 100644 mindspore/lite/src/runtime/kernel/opencl/cl/cast.cl create mode 100644 mindspore/lite/src/runtime/kernel/opencl/kernel/cast.cc create mode 100644 mindspore/lite/src/runtime/kernel/opencl/kernel/cast.h create mode 100644 mindspore/lite/test/ut/src/runtime/kernel/opencl/cast_tests.cc diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/cast.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/cast.cl new file mode 100644 index 0000000000..ff3a3971bb --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/cast.cl @@ -0,0 +1,46 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; + +__kernel void Cast_Fp32ToFp16_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + half4 result = convert_half4(READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X)))); + write_imageh(output, (int2)((Y)*output_shape.w + Z, (X)), result); +} + +__kernel void Cast_Fp32ToFp16_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + half4 result = convert_half4(READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)))); + write_imageh(output, (int2)((Y), (Z * output_shape.y + X)), result); +} + +__kernel void Cast_Fp16ToFp32_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + float4 result = convert_float4(READ_IMAGE(input0, smp_none, (int2)((Y)*output_shape.w + Z, (X)))); + WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); +} + +__kernel void Cast_Fp16ToFp32_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, int4 output_shape) { + int X = get_global_id(0); // N*H + int Y = get_global_id(1); // W + int Z = get_global_id(2); // c/4 + if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { + return; + } + float4 result = convert_float4(READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X)))); + WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result); +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc index e07ea96fc5..44825acae0 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc @@ -112,12 +112,12 @@ int BatchNormOpenCLKernel::Run() { std::vector global = {OH, OW, OC}; BatchNormGetWorkGroup(global, &local, max_global[0]); int arg_cn = 0; - ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->MutableData()); // input tensor - ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->MutableData()); // scale - ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->MutableData()); // offest - ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->MutableData()); // mean - ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[4]->MutableData()); // variance - ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->MutableData()); // out tensor + ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); // input tensor + ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c()); // scale + ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->data_c()); // offest + ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->data_c()); // mean + ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[4]->data_c()); // variance + ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape_); ocl_runtime->SetKernelArg(kernel_, arg_cn++, param->epsilon_); ocl_runtime->RunKernel(kernel_, global, local, nullptr); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/cast.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/cast.cc new file mode 100644 index 0000000000..da57b7f250 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/cast.cc @@ -0,0 +1,152 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include "src/kernel_registry.h" +#include "src/runtime/opencl/opencl_runtime.h" +#include "src/runtime/kernel/opencl/kernel/cast.h" +#include "src/runtime/kernel/opencl/utils.h" +#include "src/runtime/kernel/opencl/cl/cast.cl.inc" + +using mindspore::kernel::KERNEL_ARCH::kGPU; +using mindspore::lite::KernelRegistrar; +using mindspore::schema::PrimitiveType_Cast; + +namespace mindspore::kernel { + +int CastOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { + size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM); + size_t im_dst_x, im_dst_y; + if (in_tensors_[0]->GetFormat() == schema::Format::Format_NHWC4) { + im_dst_x = out_tensors_[0]->Width() * CO4; + im_dst_y = out_tensors_[0]->Height(); + } else { + im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * CO4; + im_dst_x = out_tensors_[0]->Width(); + } + size_t img_dtype = CL_FLOAT; + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + auto enable_fp16_ = ocl_runtime->GetFp16Enable(); + if (enable_fp16_) { + img_dtype = CL_HALF_FLOAT; + } + img_size->clear(); + std::vector vec{im_dst_x, im_dst_y, img_dtype}; + *img_size = vec; + return RET_OK; +} + +void CastOpenCLKernel::GetKernelName(std::string *kernel_name, CastParameter *param) { + if (param->src_type_ == kNumberTypeFloat32 && param->dst_type_ == kNumberTypeFloat16) { + kernel_name[0] += "_Fp32ToFp16"; + } else if (param->src_type_ == kNumberTypeFloat16 && param->dst_type_ == kNumberTypeFloat32) { + kernel_name[0] += "_Fp16ToFp32"; + } else { + MS_LOG(ERROR) << "unsupported convert format from : " << param->src_type_ << "to " << param->dst_type_; + } +} + +int CastOpenCLKernel::Init() { + auto param = reinterpret_cast(this->op_parameter_); + auto in_format = op_format_; + if (in_format != schema::Format_NHWC4 && in_format != schema::Format_NC4HW4) { + MS_LOG(ERROR) << "input format(" << in_format << ") " + << "format not support!"; + return RET_ERROR; + } + in_ori_format_ = in_tensors_[0]->GetFormat(); + in_tensors_[0]->SetFormat(op_format_); + out_ori_format_ = out_tensors_[0]->GetFormat(); + out_tensors_[0]->SetFormat(op_format_); + std::string kernel_name = "Cast"; + GetKernelName(&kernel_name, param); + if (in_format == schema::Format_NC4HW4) { + kernel_name += "_NC4HW4"; + } else if (in_format == schema::Format_NHWC4) { + kernel_name += "_NHWC4"; + } + std::set build_options; + std::string source = cast_source; + std::string program_name = "cast"; + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + ocl_runtime->LoadSource(program_name, source); + ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); + + return RET_OK; +} + +int CastOpenCLKernel::ReSize() { return RET_OK; } + +void CastGetWorkGroup(const std::vector &global, std::vector *local, int max_size) { + const int max_divider = 8; + const int max_x = 4, max_y = 8; + int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x); + int yz = max_size / x; + int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y); + int z = std::min(yz / y, static_cast(UP_DIV(global[2], 2))); + + local->clear(); + local->push_back(x); + local->push_back(y); + local->push_back(z); +} + +int CastOpenCLKernel::Run() { + MS_LOG(DEBUG) << this->name() << " Running! "; + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + auto input_shape = in_tensors_[0]->shape(); + cl_int4 input_shape_ = {input_shape[0], input_shape[1], input_shape[2], UP_DIV(input_shape[3], C4NUM)}; + + uint32_t OH = input_shape[1]; + uint32_t OW = input_shape[2]; + uint32_t OC = UP_DIV(input_shape[3], C4NUM); + + const std::vector &max_global = ocl_runtime->GetWorkItemSize(); + std::vector local = {1, 1, 1}; // init local + std::vector global = {OH, OW, OC}; + CastGetWorkGroup(global, &local, max_global[0]); + int arg_cn = 0; + ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); // input tensor + ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor + ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape_); + ocl_runtime->RunKernel(kernel_, global, local, nullptr); + + return RET_OK; +} + +kernel::LiteKernel *OpenCLCastKernelCreator(const std::vector &inputs, + const std::vector &outputs, OpParameter *opParameter, + const lite::Context *ctx, const kernel::KernelKey &desc, + const mindspore::lite::PrimitiveC *primitive) { + auto *kernel = new (std::nothrow) CastOpenCLKernel(opParameter, inputs, outputs); + if (kernel == nullptr) { + MS_LOG(ERROR) << " new CastOpenCLKernel failed "; + return nullptr; + } + auto ret = kernel->Init(); + if (ret != RET_OK) { + MS_LOG(ERROR) << " Init kernel failed, name: Cast "; + delete kernel; + return nullptr; + } + return kernel; +} + +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Cast, OpenCLCastKernelCreator); +REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Cast, OpenCLCastKernelCreator); +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/cast.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/cast.h new file mode 100644 index 0000000000..f2001ced08 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/cast.h @@ -0,0 +1,52 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CAST_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CAST_H_ + +#include +#include +#include "ir/anf.h" +#include "src/runtime/kernel/opencl/opencl_kernel.h" +#include "src/runtime/opencl/opencl_runtime.h" +#include "nnacl/fp32/cast.h" + +namespace mindspore::kernel { + +class CastOpenCLKernel : public OpenCLKernel { + public: + explicit CastOpenCLKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs) + : OpenCLKernel(parameter, inputs, outputs) {} + + ~CastOpenCLKernel() override{}; + + int Init() override; + + int ReSize() override; + + int Run() override; + + void GetKernelName(std::string *kernel_name, CastParameter *param); + + int GetImageSize(size_t idx, std::vector *img_size) override; + + private: + cl::Kernel kernel_; +}; + +} // namespace mindspore::kernel +#endif diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc index d1776b18de..b7e79ec745 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc @@ -55,11 +55,11 @@ int ConcatOpenCLKernel::RunAxis0() { auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); auto allocator_ = ocl_runtime->GetAllocator(); std::vector img_size; - auto dst_data = out_tensors_[0]->MutableData(); + auto dst_data = out_tensors_[0]->data_c(); auto dst_origin = cl::array{0, 0, 0}; cl::Image2D *out_image = reinterpret_cast(allocator_->GetImage(dst_data)); for (int i = 0; i < in_tensors_.size(); i++) { - auto src_data = in_tensors_[i]->MutableData(); + auto src_data = in_tensors_[i]->data_c(); allocator_->GetImageSize(src_data, &img_size); auto src_origin = cl::array{0, 0, 0}; auto region = cl::array{img_size[0], img_size[1], 1}; @@ -176,9 +176,9 @@ int ConcatOpenCLKernel::Run() { int arg_cn = 0; if (in_tensors_.size() == 2) { - ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->MutableData()); - ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->MutableData()); - ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->MutableData()); + ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); + ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c()); + ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape1_); ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape2_); ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_); @@ -187,10 +187,10 @@ int ConcatOpenCLKernel::Run() { auto input3_shape = in_tensors_[2]->shape(); cl_int4 input_shape3_ = {input3_shape[0], input3_shape[1], input3_shape[2], UP_DIV(input3_shape[3], C4NUM)}; - ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->MutableData()); - ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->MutableData()); - ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->MutableData()); - ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->MutableData()); + ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); + ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c()); + ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->data_c()); + ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape1_); ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape2_); ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape3_); @@ -202,11 +202,11 @@ int ConcatOpenCLKernel::Run() { cl_int4 input_shape3_ = {input3_shape[0], input3_shape[1], input3_shape[2], UP_DIV(input3_shape[3], C4NUM)}; cl_int4 input_shape4_ = {input4_shape[0], input4_shape[1], input4_shape[2], UP_DIV(input4_shape[3], C4NUM)}; - ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->MutableData()); - ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->MutableData()); - ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->MutableData()); - ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->MutableData()); - ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->MutableData()); + ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); + ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c()); + ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->data_c()); + ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->data_c()); + ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape1_); ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape2_); ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape3_); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/slice.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/slice.cc index 7fb58b9e9f..f55131046b 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/slice.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/slice.cc @@ -110,8 +110,8 @@ int SliceOpenCLKernel::Run() { std::vector global = {1, OH, OW}; SlcieGetWorkGroup(global, &local, max_global[0]); int arg_cn = 0; - ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->MutableData()); // input tensor - ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->MutableData()); // out tensor + ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); // input tensor + ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape_); ocl_runtime->SetKernelArg(kernel_, arg_cn++, size_); ocl_runtime->SetKernelArg(kernel_, arg_cn++, begin_); diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/batchnorm_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/batchnorm_tests.cc index cde15bc740..df75b7bc3d 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/batchnorm_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/batchnorm_tests.cc @@ -130,15 +130,15 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) { } sub_graph->Init(); MS_LOG(INFO) << " init tensors "; - memcpy(inputs[0]->MutableData(), input_data, input_size); - memcpy(inputs[1]->MutableData(), scale_data, scale_size); - memcpy(inputs[2]->MutableData(), offset_data, offset_size); - memcpy(inputs[3]->MutableData(), mean_data, mean_size); - memcpy(inputs[4]->MutableData(), var_data, var_size); + memcpy(inputs[0]->data_c(), input_data, input_size); + memcpy(inputs[1]->data_c(), scale_data, scale_size); + memcpy(inputs[2]->data_c(), offset_data, offset_size); + memcpy(inputs[3]->data_c(), mean_data, mean_size); + memcpy(inputs[4]->data_c(), var_data, var_size); std::cout << "==================output data================" << std::endl; sub_graph->Run(); - auto *output_data_gpu = reinterpret_cast(output_tensor->MutableData()); + auto *output_data_gpu = reinterpret_cast(output_tensor->data_c()); CompareOutputData(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.01); for (auto tensor : inputs) { delete tensor; @@ -247,15 +247,15 @@ TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) { } sub_graph->Init(); MS_LOG(INFO) << " init tensors "; - memcpy(inputs[0]->MutableData(), input_data, input_size); - memcpy(inputs[1]->MutableData(), scale_data, scale_size); - memcpy(inputs[2]->MutableData(), offset_data, offset_size); - memcpy(inputs[3]->MutableData(), mean_data, mean_size); - memcpy(inputs[4]->MutableData(), var_data, var_size); + memcpy(inputs[0]->data_c(), input_data, input_size); + memcpy(inputs[1]->data_c(), scale_data, scale_size); + memcpy(inputs[2]->data_c(), offset_data, offset_size); + memcpy(inputs[3]->data_c(), mean_data, mean_size); + memcpy(inputs[4]->data_c(), var_data, var_size); std::cout << "==================output data================" << std::endl; sub_graph->Run(); - auto *output_data_gpu = reinterpret_cast(output_tensor->MutableData()); + auto *output_data_gpu = reinterpret_cast(output_tensor->data_c()); CompareOutputData(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.0001); for (auto tensor : inputs) { delete tensor; diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/cast_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/cast_tests.cc new file mode 100644 index 0000000000..6b98f85355 --- /dev/null +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/cast_tests.cc @@ -0,0 +1,212 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include "utils/log_adapter.h" +#include "common/common_test.h" +#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" +#include "mindspore/lite/src/common/file_utils.h" +#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" +#include "mindspore/lite/src/runtime/kernel/opencl/kernel/cast.h" + +namespace mindspore { +class TestCastSelfOpenCL : public mindspore::CommonTest { + public: + TestCastSelfOpenCL() {} +}; + +template +void CompareOutputData1(T *output_data, T *correct_data, int size, float err_bound) { + for (size_t i = 0; i < size; i++) { + T abs = fabs(output_data[i] - correct_data[i]); + ASSERT_LE(abs, err_bound); + } +} + +TEST_F(TestCastSelfOpenCL, Castfp32tofp16) { + MS_LOG(INFO) << " begin test "; + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + ocl_runtime->Init(); + auto allocator = ocl_runtime->GetAllocator(); + + // get the input from .bin + size_t input1_size, output_size; + std::string input1Ppath = "./test_data/in_castfp32.bin"; + std::string correctOutputPath = "./test_data/out_castfp16.bin"; + + MS_LOG(INFO) << " initialize param "; + auto param = new (std::nothrow) CastParameter(); + if (param == nullptr) { + MS_LOG(INFO) << " new CastParameter failed "; + return; + } + param->src_type_ = kNumberTypeFloat32; + param->dst_type_ = kNumberTypeFloat16; + auto input_data = reinterpret_cast(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size)); + auto correctOutput = + reinterpret_cast(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size)); + + MS_LOG(INFO) << " init tensors "; + std::vector shape = {1, 23, 39, 47}; + auto tensor_type = lite::TensorCategory(schema::NodeType_ValueNode); + auto *input_tensor = new (std::nothrow) lite::Tensor(kNumberTypeFloat32, shape, schema::Format_NHWC, tensor_type); + auto *output_tensor = new (std::nothrow) lite::Tensor(kNumberTypeFloat16, shape, schema::Format_NHWC, tensor_type); + if (input_tensor == nullptr || output_tensor == nullptr) { + MS_LOG(INFO) << " new input_tensor or output_tensor failed "; + return; + } + std::vector inputs{input_tensor}; + std::vector outputs{output_tensor}; + + auto *cast_kernel = + new (std::nothrow) kernel::CastOpenCLKernel(reinterpret_cast(param), inputs, outputs); + if (cast_kernel == nullptr) { + MS_LOG(INFO) << " new kernel::CastOpenCLKernel failed "; + for (auto tensor : inputs) { + delete tensor; + } + for (auto tensor : outputs) { + delete tensor; + } + delete param; + return; + } + cast_kernel->SetFormatType(schema::Format_NC4HW4); + cast_kernel->Init(); + // to do allocate memory for inputs and outputs + for (auto &input_tensor : inputs) { + input_tensor->MallocData(allocator); + } + MS_LOG(INFO) << " initialize sub_graph "; + std::vector kernels{cast_kernel}; + auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); + if (sub_graph == nullptr) { + MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed "; + for (auto tensor : inputs) { + delete tensor; + } + for (auto tensor : outputs) { + delete tensor; + } + delete param; + delete cast_kernel; + return; + } + sub_graph->Init(); + MS_LOG(INFO) << " initialize input data "; + memcpy(inputs[0]->data_c(), input_data, input1_size); + + std::cout << "==================output data================" << std::endl; + sub_graph->Run(); + auto *output_data_gpu = reinterpret_cast(output_tensor->data_c()); + CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001); + for (auto tensor : inputs) { + delete tensor; + } + for (auto tensor : outputs) { + delete tensor; + } + delete param; + delete cast_kernel; + delete sub_graph; +} +TEST_F(TestCastSelfOpenCL, Castfp16tofp32) { + MS_LOG(INFO) << " begin test "; + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + ocl_runtime->Init(); + auto allocator = ocl_runtime->GetAllocator(); + + // get the input from .bin + size_t input1_size, output_size; + std::string input1Ppath = "./test_data/in_castfp16.bin"; + std::string correctOutputPath = "./test_data/out_castfp32.bin"; + + MS_LOG(INFO) << " initialize param "; + auto param = new (std::nothrow) CastParameter(); + if (param == nullptr) { + MS_LOG(INFO) << " new CastParameter failed "; + return; + } + param->src_type_ = kNumberTypeFloat16; + param->dst_type_ = kNumberTypeFloat32; + auto input_data = reinterpret_cast(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size)); + auto correctOutput = reinterpret_cast(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size)); + + MS_LOG(INFO) << " init tensors "; + std::vector shape = {1, 23, 39, 47}; + auto tensor_type = lite::TensorCategory(schema::NodeType_ValueNode); + auto *input_tensor = new (std::nothrow) lite::Tensor(kNumberTypeFloat16, shape, schema::Format_NHWC, tensor_type); + auto *output_tensor = new (std::nothrow) lite::Tensor(kNumberTypeFloat32, shape, schema::Format_NHWC, tensor_type); + if (input_tensor == nullptr || output_tensor == nullptr) { + MS_LOG(INFO) << " new input_tensor or output_tensor failed "; + return; + } + std::vector inputs{input_tensor}; + std::vector outputs{output_tensor}; + + auto *cast_kernel = + new (std::nothrow) kernel::CastOpenCLKernel(reinterpret_cast(param), inputs, outputs); + if (cast_kernel == nullptr) { + MS_LOG(INFO) << " new kernel::CastOpenCLKernel failed "; + for (auto tensor : inputs) { + delete tensor; + } + for (auto tensor : outputs) { + delete tensor; + } + delete param; + return; + } + cast_kernel->SetFormatType(schema::Format_NC4HW4); + cast_kernel->Init(); + // to do allocate memory for inputs and outputs + for (auto &input_tensor : inputs) { + input_tensor->MallocData(allocator); + } + MS_LOG(INFO) << " initialize sub_graph "; + std::vector kernels{cast_kernel}; + auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); + if (sub_graph == nullptr) { + MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed "; + for (auto tensor : inputs) { + delete tensor; + } + for (auto tensor : outputs) { + delete tensor; + } + delete param; + delete cast_kernel; + return; + } + sub_graph->Init(); + MS_LOG(INFO) << " initialize input data "; + memcpy(inputs[0]->data_c(), input_data, input1_size); + + std::cout << "==================output data================" << std::endl; + sub_graph->Run(); + auto *output_data_gpu = reinterpret_cast(output_tensor->data_c()); + CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001); + for (auto tensor : inputs) { + delete tensor; + } + for (auto tensor : outputs) { + delete tensor; + } + delete param; + delete cast_kernel; + delete sub_graph; +} +} // namespace mindspore diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc index 4f79dbbcd8..2342b0668f 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc @@ -138,24 +138,24 @@ TEST_F(TestConcatOpenCLfp16, ConcatFp16_2input_dim4_axis3) { sub_graph->Init(); MS_LOG(INFO) << " initialize input data "; if (inputs.size() == 2) { - memcpy(inputs[0]->MutableData(), input_data1, input1_size); - memcpy(inputs[1]->MutableData(), input_data2, input2_size); + memcpy(inputs[0]->data_c(), input_data1, input1_size); + memcpy(inputs[1]->data_c(), input_data2, input2_size); } else if (inputs.size() == 3) { - memcpy(inputs[0]->MutableData(), input_data1, input1_size); - memcpy(inputs[1]->MutableData(), input_data2, input2_size); - memcpy(inputs[2]->MutableData(), input_data3, input3_size); + memcpy(inputs[0]->data_c(), input_data1, input1_size); + memcpy(inputs[1]->data_c(), input_data2, input2_size); + memcpy(inputs[2]->data_c(), input_data3, input3_size); } else if (inputs.size() == 4) { - memcpy(inputs[0]->MutableData(), input_data1, input1_size); - memcpy(inputs[1]->MutableData(), input_data2, input2_size); - memcpy(inputs[2]->MutableData(), input_data3, input3_size); - memcpy(inputs[3]->MutableData(), input_data4, input4_size); + memcpy(inputs[0]->data_c(), input_data1, input1_size); + memcpy(inputs[1]->data_c(), input_data2, input2_size); + memcpy(inputs[2]->data_c(), input_data3, input3_size); + memcpy(inputs[3]->data_c(), input_data4, input4_size); } else { MS_LOG(ERROR) << " input size must be 2 or 3 or 4"; } std::cout << "==================output data================" << std::endl; sub_graph->Run(); - auto *output_data_gpu = reinterpret_cast(output_tensor->MutableData()); + auto *output_data_gpu = reinterpret_cast(output_tensor->data_c()); CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001); for (auto tensor : inputs) { delete tensor; @@ -263,19 +263,19 @@ TEST_F(TestConcatOpenCLfp32, ConcatFp32_2input_dim4_axis3) { sub_graph->Init(); MS_LOG(INFO) << " initialize input data "; if (inputs.size() == 2) { - memcpy(inputs[0]->MutableData(), input_data1, input1_size); - memcpy(inputs[1]->MutableData(), input_data2, input2_size); + memcpy(inputs[0]->data_c(), input_data1, input1_size); + memcpy(inputs[1]->data_c(), input_data2, input2_size); } else if (inputs.size() == 3) { - memcpy(inputs[0]->MutableData(), input_data1, input1_size); - memcpy(inputs[1]->MutableData(), input_data2, input2_size); - memcpy(inputs[2]->MutableData(), input_data3, input3_size); + memcpy(inputs[0]->data_c(), input_data1, input1_size); + memcpy(inputs[1]->data_c(), input_data2, input2_size); + memcpy(inputs[2]->data_c(), input_data3, input3_size); } else { MS_LOG(ERROR) << " input size must be 2 or 3 "; } std::cout << "==================output data================" << std::endl; sub_graph->Run(); - auto *output_data_gpu = reinterpret_cast(output_tensor->MutableData()); + auto *output_data_gpu = reinterpret_cast(output_tensor->data_c()); CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.00001); for (auto tensor : inputs) { delete tensor; diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/slice_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/slice_tests.cc index 7a3c7beb35..fcb7cc86c7 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/slice_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/slice_tests.cc @@ -130,12 +130,12 @@ TEST_F(TestSliceOpenCLfp32, Slicefp32input_dim4) { sub_graph->Init(); MS_LOG(INFO) << " init tensors "; - memcpy(inputs[0]->MutableData(), input_data, input_size); + memcpy(inputs[0]->data_c(), input_data, input_size); std::cout << "==================output data================" << std::endl; sub_graph->Run(); - auto *output_data_gpu = reinterpret_cast(output_tensor->MutableData()); + auto *output_data_gpu = reinterpret_cast(output_tensor->data_c()); CompareOutputData1(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.0001); for (auto tensor : inputs) { delete tensor; @@ -238,12 +238,12 @@ TEST_F(TestSliceOpenCLfp16, Slicefp16input_dim4) { sub_graph->Init(); MS_LOG(INFO) << " init tensors "; - memcpy(inputs[0]->MutableData(), input_data, input_size); + memcpy(inputs[0]->data_c(), input_data, input_size); std::cout << "==================output data================" << std::endl; sub_graph->Run(); - auto *output_data_gpu = reinterpret_cast(output_tensor->MutableData()); + auto *output_data_gpu = reinterpret_cast(output_tensor->data_c()); CompareOutputData1(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.0001); for (auto tensor : inputs) { delete tensor;