From af2c8072631121b9909f2995204047e742c6edf3 Mon Sep 17 00:00:00 2001 From: chenzupeng Date: Mon, 2 Nov 2020 14:59:26 +0800 Subject: [PATCH] fix arm32 gpu build bug --- .../kernel/opencl/kernel/arithmetic.cc | 2 +- .../runtime/kernel/opencl/kernel/arithmetic.h | 2 +- .../src/runtime/kernel/opencl/kernel/scale.cc | 3 +- .../src/runtime/kernel/opencl/kernel/scale.h | 2 +- .../runtime/kernel/opencl/arithmetic_tests.cc | 333 +++++++---------- .../src/runtime/kernel/opencl/scale_tests.cc | 346 +++++++----------- 6 files changed, 275 insertions(+), 413 deletions(-) diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc index 71bfd3749f..c57c9f2104 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc @@ -295,7 +295,7 @@ kernel::LiteKernel *OpenCLArithmeticKernelCreator(const std::vector(opParameter), inputs, outputs, ctx); + new (std::nothrow) ArithmeticOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); if (kernel == nullptr) { MS_LOG(ERROR) << "Create OpenCL Arithmetic kernel failed!"; free(opParameter); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h index c41b92afe0..d7557d8c94 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h @@ -26,7 +26,7 @@ namespace mindspore::kernel { class ArithmeticOpenCLKernel : public OpenCLKernel { public: ArithmeticOpenCLKernel(OpParameter *parameter, const std::vector &inputs, - const std::vector &outputs, const lite::InnerContext *ctx) + const std::vector &outputs) : OpenCLKernel(parameter, inputs, outputs) {} ~ArithmeticOpenCLKernel() override = default; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc index 6ff046bb47..b68441b438 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc @@ -251,8 +251,7 @@ kernel::LiteKernel *OpenCLScaleKernelCreator(const std::vector & const std::vector &outputs, OpParameter *opParameter, const lite::InnerContext *ctx, const kernel::KernelKey &desc, const mindspore::lite::PrimitiveC *primitive) { - auto *kernel = - new (std::nothrow) ScaleOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs, ctx); + auto *kernel = new (std::nothrow) ScaleOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); if (kernel == nullptr) { MS_LOG(ERROR) << "Create OpenCL Scale kernel failed!"; free(opParameter); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h index 5ac3927460..6aeb16c18c 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h @@ -26,7 +26,7 @@ namespace mindspore::kernel { class ScaleOpenCLKernel : public OpenCLKernel { public: ScaleOpenCLKernel(OpParameter *parameter, const std::vector &inputs, - const std::vector &outputs, const lite::InnerContext *ctx) + const std::vector &outputs) : OpenCLKernel(parameter, inputs, outputs) {} ~ScaleOpenCLKernel() override; diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc index d052b3b2ed..327e92c267 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc @@ -13,233 +13,178 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - +#include +#include +#include "src/common/log_adapter.h" #include "common/common_test.h" +#include "mindspore/lite/src/common/file_utils.h" +#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" #include "mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h" +#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h" namespace mindspore { +class TestArithmeticOpenCL : public mindspore::CommonTest { + public: + TestArithmeticOpenCL() {} +}; -template -static void BoardcaseAdd(const T *a, const T b, T *c, const int size) { - for (int i = 0; i < size; i++) { - c[i] = a[i] + b; - } -} - -template -static void ElementAdd(const T *a, const T *b, T *c, const int size) { - for (int i = 0; i < size; i++) { - c[i] = a[i] + b[i]; - } -} - -template -static bool DataCompare(const T *a, const T *b, const int size, const float accuracy = 1e-4) { - for (int i = 0; i < size; i++) { - auto diff = fabs(a[i] - b[i]); - if (diff > accuracy) { - MS_LOG(ERROR) << "compare failed at " << i << " exp " << a[i] << " bug got " << b[i]; - return false; - } - } - return true; -} - -template -static void InitData(void *data, const int size) { - T *data_float = reinterpret_cast(data); - static unsigned int seed = 123; - for (int i = 0; i < size; i++) { - data_float[i] = static_cast(rand_r(&seed)) % 100; - } -} - -template -static void LogData(void *data, const int size, const std::string prefix) { - std::cout << prefix; - T *data_float = reinterpret_cast(data); - for (int i = 0; i < size; i++) { - std::cout << data_float[i] << ","; - } - std::cout << std::endl; -} - -template -static void TestCase(const std::vector &shape_a, const std::vector &shape_b) { - bool is_log_data = false; +void RunTestCaseArithmetic(void *input_data0, const std::vector &input_shape, void *input_data1, + const std::vector &weight_shape, void *output_data, const std::vector &out_shape, + bool enable_fp16, int op_type, int act_type = schema::ActivationType_NO_ACTIVATION) { auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance(); + ocl_runtime->Init(); + size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float); + ocl_runtime->SetFp16Enable(enable_fp16); auto allocator = ocl_runtime->GetAllocator(); - - bool is_bias_add = shape_b.empty(); - auto data_type = kNumberTypeFloat32; - if (sizeof(T) == 2) { - data_type = kNumberTypeFloat16; - ocl_runtime->SetFp16Enable(true); - } - - lite::Tensor *tensor_a = new (std::nothrow) lite::Tensor(data_type, shape_a, schema::Format_NHWC4); - lite::Tensor *tensor_b = new (std::nothrow) lite::Tensor(data_type, shape_b, schema::Format_NHWC4); - lite::Tensor *tensor_c = new (std::nothrow) lite::Tensor(data_type, shape_a, schema::Format_NHWC4); - if (tensor_a == nullptr || tensor_b == nullptr || tensor_c == nullptr) { - MS_LOG(ERROR) << "Create tensor failed!"; - delete tensor_a; - delete tensor_b; - delete tensor_c; + auto param = static_cast(malloc(sizeof(ArithmeticParameter))); + if (param == nullptr) { + MS_LOG(ERROR) << "param_ptr create error."; return; } - - int64_t element_num = tensor_a->ElementsC4Num(); - int64_t element_num_b = is_bias_add ? 1 : tensor_b->ElementsC4Num(); - - T *data_a = new (std::nothrow) T[element_num]; - T *data_b = new (std::nothrow) T[element_num_b]; - T *data_c_cpu = new (std::nothrow) T[element_num]; - T *data_c_ocl = new (std::nothrow) T[element_num]; - if (data_a == nullptr || data_b == nullptr || data_c_cpu == nullptr || data_c_ocl == nullptr) { - MS_LOG(ERROR) << "Create buffer failed!"; - delete tensor_a; - delete tensor_b; - delete tensor_c; - delete[] data_a; - delete[] data_b; - delete[] data_c_cpu; - delete[] data_c_ocl; + int input0_size = std::accumulate(input_shape.begin(), input_shape.end(), 1LL, std::multiplies()); + int input1_size = std::accumulate(weight_shape.begin(), weight_shape.end(), 1LL, std::multiplies()); + if (input0_size != input1_size) { + param->broadcasting_ = true; + } + param->op_parameter_.type_ = op_type; + param->activation_type_ = act_type; + auto tensor_x_ptr = + std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), input_shape); + auto tensor_x = tensor_x_ptr.get(); + if (tensor_x == nullptr) { + MS_LOG(ERROR) << "tensor_x create error."; return; } - InitData(data_a, element_num); - InitData(data_b, element_num_b); - memset(data_c_ocl, 0, sizeof(T) * element_num); - - if (is_bias_add) { - BoardcaseAdd(data_a, static_cast(data_b)[0], data_c_cpu, element_num); - } else { - ElementAdd(data_a, data_b, data_c_cpu, element_num); - } - - std::vector inputs = {tensor_a}; - if (!is_bias_add) { - inputs.push_back(tensor_b); - } else { - tensor_b->MallocData(); - memcpy(tensor_b->data_c(), data_b, sizeof(T)); - } - std::vector outputs = {tensor_c}; - - ArithmeticParameter *param = static_cast(malloc(sizeof(ArithmeticParameter))); - param->broadcasting_ = is_bias_add; - if (param == nullptr) { - MS_LOG(ERROR) << "Create parameter failed!"; - delete tensor_a; - delete tensor_b; - delete tensor_c; - delete[] data_a; - delete[] data_b; - delete[] data_c_cpu; - delete[] data_c_ocl; + auto tensor_w_ptr = std::make_unique( + TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), weight_shape, schema::Format_NHWC, + input1_size != 1 ? lite::Tensor::Category::CONST_TENSOR : lite::Tensor::Category::CONST_SCALAR); + auto tensor_w = tensor_w_ptr.get(); + if (tensor_w == nullptr) { + MS_LOG(ERROR) << "tensor_w create error."; return; } - param->ndim_ = 4; - param->op_parameter_.type_ = PrimitiveType_Add; - - std::vector arithmetic_inputs = {tensor_a, tensor_b}; - lite::InnerContext ctx; - ASSERT_EQ(lite::RET_OK, ctx.Init()); - auto *arith_kernel = new (std::nothrow) - kernel::ArithmeticOpenCLKernel(reinterpret_cast(param), arithmetic_inputs, outputs, &ctx); - if (arith_kernel == nullptr) { - MS_LOG(ERROR) << "Create ArithmeticOpenCLKernel failed!"; - delete tensor_a; - delete tensor_b; - delete tensor_c; - delete[] data_a; - delete[] data_b; - delete[] data_c_cpu; - delete[] data_c_ocl; - free(param); + tensor_w->set_data(input_data1); + auto tensor_out_ptr = + std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), out_shape); + auto tensor_out = tensor_out_ptr.get(); + if (tensor_out == nullptr) { + MS_LOG(ERROR) << "tensor_out create error."; return; } - arith_kernel->Init(); - - tensor_a->MallocData(allocator); - tensor_b->MallocData(allocator); - std::vector kernels{arith_kernel}; - auto *kernel = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); - if (arith_kernel == nullptr) { - MS_LOG(ERROR) << "Create SubGraphOpenCLKernel failed!"; - delete tensor_a; - delete tensor_b; - delete tensor_c; - delete[] data_a; - delete[] data_b; - delete[] data_c_cpu; - delete[] data_c_ocl; - delete arith_kernel; + std::vector inputs{tensor_x, tensor_w}; + std::vector outputs{tensor_out}; + auto op_kernel_ptr = + std::make_unique(reinterpret_cast(param), inputs, outputs); + auto op_kernel = op_kernel_ptr.release(); + if (op_kernel == nullptr) { + MS_LOG(ERROR) << "op_kernel create error."; return; } - kernel->Init(); + op_kernel->Init(); + inputs[0]->MallocData(allocator); - memcpy(inputs[0]->data_c(), data_a, sizeof(T) * element_num); - if (!is_bias_add) { - memcpy(inputs[1]->data_c(), data_b, sizeof(T) * element_num_b); - } - - kernel->Run(); - - memcpy(data_c_ocl, outputs[0]->data_c(), sizeof(T) * element_num); + std::vector kernels{op_kernel}; - if (is_log_data) { - LogData(data_a, 10, "Data A : "); - LogData(data_b, tensor_b->shape().empty() ? 1 : 10, "Data B : "); - LogData(data_c_cpu, 10, "Expect compute : "); - LogData(outputs[0]->data_c(), 10, "OpenCL compute : "); + std::vector inputs_g{tensor_x}; + auto pGraph_ptr = std::make_unique(inputs_g, outputs, kernels, kernels, kernels); + auto pGraph = pGraph_ptr.get(); + if (pGraph == nullptr) { + MS_LOG(ERROR) << "pGraph create error."; + return; + } + pGraph->Init(); + memcpy(inputs[0]->MutableData(), input_data0, tensor_x->ElementsNum() * dtype_size); + pGraph->Run(); + if (enable_fp16) { + CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast(1e-3), + 2e-2); + } else { + CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast(1e-5)); } - bool cmp = DataCompare(data_c_cpu, data_c_ocl, element_num); - MS_LOG(INFO) << "Compare " << (cmp ? "success!" : "failed!"); - EXPECT_EQ(true, cmp); - - // free - delete[] data_a; - delete[] data_b; - delete[] data_c_cpu; - delete[] data_c_ocl; - delete kernel; - for (auto tensor : inputs) { - delete tensor; + for (auto t : inputs) { + t->set_data(nullptr); } - for (auto tensor : outputs) { - delete tensor; + for (auto t : outputs) { + t->set_data(nullptr); } + MS_LOG(INFO) << "TestArithmetic passed"; } -class TestArithmeticOpenCL : public mindspore::CommonTest { - public: - TestArithmeticOpenCL() {} -}; +TEST_F(TestArithmeticOpenCL, ArithmeticElementwiseAddFp32) { + int n = 1; + int h = 2; + int w = 2; + int c = 3; + std::vector in_shape0 = {n, h, w, c}; + std::vector in_shape1 = {n, h, w, c}; + std::vector out_shape = {n, h, w, c}; + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f}; + std::vector weight_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f}; + std::vector output_data = {2.0f, 4.0f, 6.0f, 8.0f, 10.0f, 12.0f, 14.0f, 16.0f, 18.0f, 20.0f, 22.0f, 24.0f}; + RunTestCaseArithmetic(input_data.data(), in_shape0, weight_data.data(), in_shape1, output_data.data(), out_shape, + false, schema::PrimitiveType_Add); +} -TEST_F(TestArithmeticOpenCL, AddElementwiseFP32) { - const std::vector &shape_a = {1, 1024, 1024, 4}; - const std::vector &shape_b = {1, 1024, 1024, 4}; - TestCase(shape_a, shape_b); +TEST_F(TestArithmeticOpenCL, ArithmeticScalarMulFp32) { + int n = 1; + int h = 2; + int w = 2; + int c = 3; + std::vector in_shape0 = {n, h, w, c}; + std::vector in_shape1 = {1}; + std::vector out_shape = {n, h, w, c}; + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f}; + std::vector weight_data = {2.0f}; + std::vector output_data = {2.0f, 4.0f, 6.0f, 8.0f, 10.0f, 12.0f, 14.0f, 16.0f, 18.0f, 20.0f, 22.0f, 24.0f}; + RunTestCaseArithmetic(input_data.data(), in_shape0, weight_data.data(), in_shape1, output_data.data(), out_shape, + false, schema::PrimitiveType_Mul); } -TEST_F(TestArithmeticOpenCL, AddBroadcastFP32) { - const std::vector &shape_a = {1, 128, 128, 4}; - const std::vector &shape_b = {}; - TestCase(shape_a, shape_b); +TEST_F(TestArithmeticOpenCL, ArithmeticBroadcastSubReLU6Fp32) { + int n = 1; + int h = 2; + int w = 2; + int c = 3; + std::vector in_shape0 = {n, h, w, c}; + std::vector in_shape1 = {c}; + std::vector out_shape = {n, h, w, c}; + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f}; + std::vector weight_data = {1.0f, 2.0f, 3.0f}; + std::vector output_data = {0.0f, 0.0f, 0.0f, 3.0f, 3.0f, 3.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f, 6.0f}; + RunTestCaseArithmetic(input_data.data(), in_shape0, weight_data.data(), in_shape1, output_data.data(), out_shape, + false, schema::PrimitiveType_Sub, schema::ActivationType_RELU6); } -TEST_F(TestArithmeticOpenCL, AddElementwiseFP16) { - const std::vector &shape_a = {1, 1024, 1024, 4}; - const std::vector &shape_b = {1, 1024, 1024, 4}; - TestCase(shape_a, shape_b); +TEST_F(TestArithmeticOpenCL, ArithmeticBroadcastSub2Fp32) { + int n = 1; + int h = 2; + int w = 2; + int c = 3; + std::vector in_shape0 = {n, c}; + std::vector in_shape1 = {n, h, w, c}; + std::vector out_shape = {n, h, w, c}; + std::vector input_data = {1.0f, 2.0f, 3.0f}; + std::vector weight_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f}; + std::vector output_data = {0.0f, 0.0f, 0.0f, -3.0f, -3.0f, -3.0f, -6.0f, -6.0f, -6.0f, -9.0f, -9.0f, -9.0f}; + RunTestCaseArithmetic(input_data.data(), in_shape0, weight_data.data(), in_shape1, output_data.data(), out_shape, + false, schema::PrimitiveType_Sub); } -TEST_F(TestArithmeticOpenCL, AddBroadcastFP16) { - const std::vector &shape_a = {1, 128, 128, 4}; - const std::vector &shape_b = {}; - TestCase(shape_a, shape_b); +TEST_F(TestArithmeticOpenCL, ArithmeticElementwiseDivFp16) { + int n = 1; + int h = 2; + int w = 2; + int c = 3; + std::vector in_shape0 = {n, h, w, c}; + std::vector in_shape1 = {n, h, w, c}; + std::vector out_shape = {n, h, w, c}; + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f}; + std::vector weight_data = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f}; + std::vector output_data = {1.0f, 2.0f, 3.0f, 2.0f, 2.5, 3.0f, 7.0f, 8.0f, 9.0f, 5.0f, 5.5, 6.0f}; + RunTestCaseArithmetic(input_data.data(), in_shape0, weight_data.data(), in_shape1, output_data.data(), out_shape, + true, schema::PrimitiveType_Div); } } // namespace mindspore diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/scale_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/scale_tests.cc index 96ff554b09..1ce7d6e870 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/scale_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/scale_tests.cc @@ -13,253 +13,171 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - +#include +#include +#include "src/common/log_adapter.h" #include "common/common_test.h" +#include "mindspore/lite/src/common/file_utils.h" +#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" #include "mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h" +#include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h" namespace mindspore { +class TestScaleOpenCL : public mindspore::CommonTest { + public: + TestScaleOpenCL() {} +}; -template -static void BoardcaseScale(const T *in, const T scale, const T offset, T *out, const int size) { - for (int i = 0; i < size; i++) { - out[i] = in[i] * scale + offset; - } -} - -template -static void Scale(const T *in, const T *scale, T *offset, T *out, const int size) { - for (int i = 0; i < size; i++) { - out[i] = in[i] * scale[i] + offset[i]; - } -} - -template -static bool DataCompare(const T *a, const T *b, const int size, const T accuracy = 1e-4) { - for (int i = 0; i < size; i++) { - auto diff = fabs(a[i] - b[i]); - if (diff > accuracy) { - MS_LOG(ERROR) << "compare failed at " << i << " exp " << a[i] << " bug got " << b[i]; - return false; - } - } - return true; -} - -template -static void InitData(void *data, const int size) { - T *data_float = reinterpret_cast(data); - static unsigned int seed = 123; - for (int i = 0; i < size; i++) { - data_float[i] = static_cast(rand_r(&seed)) % 100; - } -} - -template -static void LogData(void *data, const int size, const std::string prefix) { - std::cout << prefix; - T *data_float = reinterpret_cast(data); - for (int i = 0; i < size; i++) { - std::cout << data_float[i] << ","; - } - std::cout << std::endl; -} - -template -static void TestCase(const std::vector &shape_a, const std::vector &shape_b) { - bool is_log_data = false; +void RunTestCaseScale(void *input_data0, const std::vector &input_shape, void *scale_data, void *offset_data, + const std::vector &weight_shape, void *output_data, const std::vector &out_shape, + bool enable_fp16, int axis, int act_type = schema::ActivationType_NO_ACTIVATION) { auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance(); + ocl_runtime->Init(); + size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float); + ocl_runtime->SetFp16Enable(enable_fp16); auto allocator = ocl_runtime->GetAllocator(); - - bool is_broadcast = shape_b.empty(); - auto format = schema::Format_NHWC4; - - auto data_type = kNumberTypeFloat32; - if (sizeof(T) == 2) { - data_type = kNumberTypeFloat16; - ocl_runtime->SetFp16Enable(true); - } - lite::Tensor *tensor_in = new (std::nothrow) lite::Tensor(data_type, shape_a, format); - lite::Tensor *tensor_scale = new (std::nothrow) lite::Tensor(data_type, shape_b, format); - lite::Tensor *tensor_offset = new (std::nothrow) lite::Tensor(data_type, shape_b, format); - lite::Tensor *tensor_out = new (std::nothrow) lite::Tensor(data_type, shape_a, format); - if (tensor_in == nullptr || tensor_scale == nullptr || tensor_offset == nullptr) { - MS_LOG(ERROR) << "Create tensor failed!"; - delete tensor_in; - delete tensor_scale; - delete tensor_offset; - delete tensor_out; + auto param = static_cast(malloc(sizeof(ScaleParameter))); + if (param == nullptr) { + MS_LOG(ERROR) << "param_ptr create error."; return; } - - int64_t element_num = tensor_in->ElementsC4Num(); - int64_t element_num_b = is_broadcast ? 1 : tensor_scale->ElementsC4Num(); - - T *data_in = new (std::nothrow) T[element_num]; - T *data_scale = new (std::nothrow) T[element_num_b]; - T *data_offset = new (std::nothrow) T[element_num_b]; - T *data_out_cpu = new (std::nothrow) T[element_num]; - T *data_out_ocl = new (std::nothrow) T[element_num]; - if (data_in == nullptr || data_scale == nullptr || data_out_cpu == nullptr || data_out_ocl == nullptr) { - MS_LOG(ERROR) << "Create buffer failed!"; - delete tensor_in; - delete tensor_scale; - delete tensor_offset; - delete tensor_out; - delete[] data_in; - delete[] data_scale; - delete[] data_offset; - delete[] data_out_cpu; - delete[] data_out_ocl; + param->axis_ = axis; + param->activation_type_ = act_type; + auto tensor_x_ptr = + std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), input_shape); + auto tensor_x = tensor_x_ptr.get(); + if (tensor_x == nullptr) { + MS_LOG(ERROR) << "tensor_x create error."; return; } - InitData(data_in, element_num); - InitData(data_scale, element_num_b); - InitData(data_offset, element_num_b); - memset(data_out_ocl, 0, sizeof(T) * element_num); - - if (is_broadcast) { - BoardcaseScale(data_in, static_cast(data_scale)[0], static_cast(data_offset)[0], data_out_cpu, - element_num); - } else { - Scale(data_in, data_scale, data_offset, data_out_cpu, element_num); - } - - std::vector inputs = {tensor_in}; - if (!is_broadcast) { - inputs.push_back(tensor_scale); - inputs.push_back(tensor_offset); - } else { - tensor_scale->MallocData(); - tensor_offset->MallocData(); - memcpy(tensor_scale->data_c(), data_scale, sizeof(T)); - memcpy(tensor_offset->data_c(), data_offset, sizeof(T)); - } - std::vector outputs = {tensor_out}; - - ScaleParameter *param = static_cast(malloc(sizeof(ScaleParameter))); - if (param == nullptr) { - MS_LOG(ERROR) << "Create parameter failed!"; - delete tensor_in; - delete tensor_scale; - delete tensor_offset; - delete tensor_out; - delete[] data_in; - delete[] data_scale; - delete[] data_offset; - delete[] data_out_cpu; - delete[] data_out_ocl; + auto tensor_scale_ptr = + std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), weight_shape, + schema::Format_NHWC, lite::Tensor::Category::CONST_TENSOR); + auto tensor_scale = tensor_scale_ptr.get(); + if (tensor_scale == nullptr) { + MS_LOG(ERROR) << "tensor_scale create error."; return; } - param->axis_ = 0; - param->op_parameter_.type_ = schema::PrimitiveType_Scale; - - std::vector scale_inputs = {tensor_in, tensor_scale, tensor_offset}; - lite::InnerContext ctx; - ASSERT_EQ(lite::RET_OK, ctx.Init()); - auto *scale_kernel = - new (std::nothrow) kernel::ScaleOpenCLKernel(reinterpret_cast(param), scale_inputs, outputs, &ctx); - if (scale_kernel == nullptr) { - MS_LOG(ERROR) << "Create ScaleOpenCLKernel failed!"; - delete tensor_in; - delete tensor_scale; - delete tensor_offset; - delete tensor_out; - delete[] data_in; - delete[] data_scale; - delete[] data_offset; - delete[] data_out_cpu; - delete[] data_out_ocl; - free(param); + tensor_scale->set_data(scale_data); + auto tensor_offset_ptr = + std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), weight_shape, + schema::Format_NHWC, lite::Tensor::Category::CONST_TENSOR); + auto tensor_offset = tensor_offset_ptr.get(); + if (tensor_offset == nullptr) { + MS_LOG(ERROR) << "tensor_offset create error."; return; } - scale_kernel->Init(); - - tensor_in->MallocData(allocator); - tensor_scale->MallocData(allocator); - tensor_offset->MallocData(allocator); - std::vector kernels{scale_kernel}; - auto *kernel = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); - if (scale_kernel == nullptr) { - MS_LOG(ERROR) << "Create SubGraphOpenCLKernel failed!"; - delete tensor_in; - delete tensor_scale; - delete tensor_offset; - delete tensor_out; - delete[] data_in; - delete[] data_scale; - delete[] data_offset; - delete[] data_out_cpu; - delete[] data_out_ocl; - delete scale_kernel; + tensor_offset->set_data(offset_data); + auto tensor_out_ptr = + std::make_unique(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), out_shape); + auto tensor_out = tensor_out_ptr.get(); + if (tensor_out == nullptr) { + MS_LOG(ERROR) << "tensor_out create error."; return; } - kernel->Init(); - - memcpy(inputs[0]->data_c(), data_in, sizeof(T) * element_num); - if (!is_broadcast) { - memcpy(inputs[1]->data_c(), data_scale, sizeof(T) * element_num_b); - memcpy(inputs[2]->data_c(), data_offset, sizeof(T) * element_num_b); + std::vector inputs{tensor_x, tensor_scale, tensor_offset}; + std::vector outputs{tensor_out}; + auto op_kernel_ptr = + std::make_unique(reinterpret_cast(param), inputs, outputs); + auto op_kernel = op_kernel_ptr.release(); + if (op_kernel == nullptr) { + MS_LOG(ERROR) << "op_kernel create error."; + return; } + op_kernel->Init(); + inputs[0]->MallocData(allocator); - kernel->Run(); - - memcpy(data_out_ocl, outputs[0]->data_c(), sizeof(T) * element_num); + std::vector kernels{op_kernel}; - if (is_log_data) { - LogData(data_in, 10, "Data input : "); - LogData(data_scale, tensor_scale->shape().empty() ? 1 : 10, "Data scale : "); - LogData(data_offset, tensor_offset->shape().empty() ? 1 : 10, "Data offset : "); - LogData(data_out_cpu, 10, "Expect compute : "); - LogData(outputs[0]->data_c(), 10, "OpenCL compute : "); + std::vector inputs_g{tensor_x}; + auto pGraph_ptr = std::make_unique(inputs_g, outputs, kernels, kernels, kernels); + auto pGraph = pGraph_ptr.get(); + if (pGraph == nullptr) { + MS_LOG(ERROR) << "pGraph create error."; + return; + } + pGraph->Init(); + memcpy(inputs[0]->MutableData(), input_data0, tensor_x->ElementsNum() * dtype_size); + pGraph->Run(); + if (enable_fp16) { + CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast(1e-3), + 2e-2); + } else { + CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast(1e-5)); } - bool cmp = DataCompare(data_out_cpu, data_out_ocl, element_num); - MS_LOG(INFO) << "Compare " << (cmp ? "success!" : "failed!"); - EXPECT_EQ(true, cmp); - - // free - delete[] data_in; - delete[] data_scale; - delete[] data_offset; - delete[] data_out_cpu; - delete[] data_out_ocl; - delete kernel; - for (auto tensor : inputs) { - delete tensor; + for (auto t : inputs) { + t->set_data(nullptr); } - for (auto tensor : outputs) { - delete tensor; + for (auto t : outputs) { + t->set_data(nullptr); } + MS_LOG(INFO) << "TestScale passed"; } -class TestScaleOpenCL : public mindspore::CommonTest { - public: - TestScaleOpenCL() {} -}; - -TEST_F(TestScaleOpenCL, ElementFP32) { - const std::vector &shape_a = {1, 1024, 1024, 4}; - const std::vector &shape_b = {1, 1024, 1024, 4}; - TestCase(shape_a, shape_b); +TEST_F(TestScaleOpenCL, ScaleAxis3Fp32) { + int n = 1; + int h = 2; + int w = 2; + int c = 3; + std::vector in_shape0 = {n, h, w, c}; + std::vector weight_shape = {c}; + std::vector out_shape = {n, h, w, c}; + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f}; + std::vector scale_data = {1.0f, 2.0f, 3.0f}; + std::vector offset_data = {1.0f, 2.0f, 3.0f}; + std::vector output_data = {2.0f, 6.0f, 12.0f, 5.0f, 12.0f, 21.0f, 8.0f, 18.0f, 30.0f, 11.0f, 24.0f, 39.0f}; + RunTestCaseScale(input_data.data(), in_shape0, scale_data.data(), offset_data.data(), weight_shape, + output_data.data(), out_shape, false, 3); } -TEST_F(TestScaleOpenCL, BroadcastFP32) { - const std::vector &shape_a = {1, 128, 128, 4}; - const std::vector &shape_b = {}; - TestCase(shape_a, shape_b); +TEST_F(TestScaleOpenCL, ScaleAxis1Fp32) { + int n = 1; + int h = 2; + int w = 2; + int c = 3; + std::vector in_shape0 = {n, h, w, c}; + std::vector weight_shape = {h}; + std::vector out_shape = {n, h, w, c}; + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f}; + std::vector scale_data = {1.0f, 2.0f}; + std::vector offset_data = {1.0f, 2.0f}; + std::vector output_data = {2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 16.0f, 18.0f, 20.0f, 22.0f, 24.0f, 26.0f}; + RunTestCaseScale(input_data.data(), in_shape0, scale_data.data(), offset_data.data(), weight_shape, + output_data.data(), out_shape, false, 1); } -TEST_F(TestScaleOpenCL, ElementFP16) { - const std::vector &shape_a = {1, 1024, 1024, 4}; - const std::vector &shape_b = {1, 1024, 1024, 4}; - TestCase(shape_a, shape_b); +TEST_F(TestScaleOpenCL, ScaleAxis3ReLU6Fp32) { + int n = 1; + int h = 2; + int w = 2; + int c = 3; + std::vector in_shape0 = {n, h, w, c}; + std::vector weight_shape = {c}; + std::vector out_shape = {n, h, w, c}; + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f}; + std::vector scale_data = {1.0f, 2.0f, -1.0f}; + std::vector offset_data = {1.0f, 2.0f, 3.0f}; + std::vector output_data = {2.0f, 6.0f, 0.0f, 5.0f, 6.0f, 0.0f, 6.0f, 6.0f, 0.0f, 6.0f, 6.0f, 0.0f}; + RunTestCaseScale(input_data.data(), in_shape0, scale_data.data(), offset_data.data(), weight_shape, + output_data.data(), out_shape, false, 3, schema::ActivationType_RELU6); } -TEST_F(TestScaleOpenCL, BroadcastFP16) { - const std::vector &shape_a = {1, 128, 128, 4}; - const std::vector &shape_b = {}; - TestCase(shape_a, shape_b); +TEST_F(TestScaleOpenCL, ScaleAxis3Fp16) { + int n = 1; + int h = 2; + int w = 2; + int c = 3; + std::vector in_shape0 = {n, h, w, c}; + std::vector weight_shape = {c}; + std::vector out_shape = {n, h, w, c}; + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f}; + std::vector scale_data = {1.0f, 2.0f, 3.0f}; + std::vector offset_data = {1.0f, 2.0f, 3.0f}; + std::vector output_data = {2.0f, 6.0f, 12.0f, 5.0f, 12.0f, 21.0f, 8.0f, 18.0f, 30.0f, 11.0f, 24.0f, 39.0f}; + RunTestCaseScale(input_data.data(), in_shape0, scale_data.data(), offset_data.data(), weight_shape, + output_data.data(), out_shape, true, 3); } } // namespace mindspore