Browse Source

add opencl kernel stack, fix scale/reduce bug, add some models to CI

tags/v1.1.0
Pengyongrong 5 years ago
parent
commit
292e4a4e2e
13 changed files with 770 additions and 72 deletions
  1. +3
    -3
      mindspore/lite/src/runtime/kernel/opencl/cl/reduce.cl
  2. +29
    -6
      mindspore/lite/src/runtime/kernel/opencl/cl/scale.cl
  3. +137
    -0
      mindspore/lite/src/runtime/kernel/opencl/cl/stack.cl
  4. +9
    -39
      mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
  5. +0
    -4
      mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h
  6. +14
    -7
      mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
  7. +211
    -0
      mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc
  8. +59
    -0
      mindspore/lite/src/runtime/kernel/opencl/kernel/stack.h
  9. +9
    -0
      mindspore/lite/test/models_fp32_gpu.cfg
  10. +11
    -0
      mindspore/lite/test/models_tflite.cfg
  11. +1
    -1
      mindspore/lite/test/run_benchmark_nets.sh
  12. +4
    -12
      mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc
  13. +283
    -0
      mindspore/lite/test/ut/src/runtime/kernel/opencl/stack_tests.cc

+ 3
- 3
mindspore/lite/src/runtime/kernel/opencl/cl/reduce.cl View File

@@ -7,14 +7,14 @@ __kernel void mean_NHWC4(__read_only image2d_t src_data, __write_only image2d_t
if (X >= size.z) {
return;
}
FLT4 result = (FLT4)0.f;
float4 result = (float4)0.f;
for (int h = 0; h < size.x; h++) {
for (int w = 0; w < size.y; w++) {
result += READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h));
result += convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h)));
}
}
result /= size.x * size.y;
WRITE_IMAGE(dst_data, (int2)(X, 0), result);
WRITE_IMAGE(dst_data, (int2)(X, 0), TO_FLT4(result));
}

__kernel void mean_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {


+ 29
- 6
mindspore/lite/src/runtime/kernel/opencl/cl/scale.cl View File

@@ -1,8 +1,13 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;

#define ActType_No 0
#define ActType_Relu 1
#define ActType_Sigmod 2
#define ActType_Relu6 3

__kernel void Scale_IMG(__read_only image2d_t input, __read_only image2d_t scale, __read_only image2d_t offset,
__write_only image2d_t output, const int2 output_shape) {
__write_only image2d_t output, const int2 output_shape, const int act_type) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) {
@@ -12,11 +17,17 @@ __kernel void Scale_IMG(__read_only image2d_t input, __read_only image2d_t scale
FLT4 in = READ_IMAGE(input, smp_none, (int2)(X, Y));
FLT4 s = READ_IMAGE(scale, smp_none, (int2)(X, Y));
FLT4 o = READ_IMAGE(offset, smp_none, (int2)(X, Y));
WRITE_IMAGE(output, (int2)(X, Y), in * s + o);
FLT4 out = in * s + o;
if (act_type == ActType_Relu) {
out = max(out, (FLT4)(0.0f));
} else if (act_type == ActType_Relu6) {
out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f));
}
WRITE_IMAGE(output, (int2)(X, Y), out);
}

__kernel void BoardcastScale_IMG(__read_only image2d_t input, float scale, float offset, __write_only image2d_t output,
const int2 output_shape) {
const int2 output_shape, const int act_type) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) {
@@ -24,11 +35,17 @@ __kernel void BoardcastScale_IMG(__read_only image2d_t input, float scale, float
}

FLT4 in = READ_IMAGE(input, smp_none, (int2)(X, Y));
WRITE_IMAGE(output, (int2)(X, Y), in * (FLT)scale + (FLT)offset);
FLT4 out = in * (FLT)scale + (FLT)offset;
if (act_type == ActType_Relu) {
out = max(out, (FLT4)(0.0f));
} else if (act_type == ActType_Relu6) {
out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f));
}
WRITE_IMAGE(output, (int2)(X, Y), out);
}

__kernel void Scale_C_IMG(__read_only image2d_t input, __read_only image2d_t scale, __read_only image2d_t offset,
__write_only image2d_t output, const int2 output_shape, const int C) {
__write_only image2d_t output, const int2 output_shape, const int C, const int act_type) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y || C == 0) {
@@ -38,5 +55,11 @@ __kernel void Scale_C_IMG(__read_only image2d_t input, __read_only image2d_t sca
FLT4 in = READ_IMAGE(input, smp_none, (int2)(X, Y));
FLT4 s = READ_IMAGE(scale, smp_none, (int2)(X % C, 0));
FLT4 o = READ_IMAGE(offset, smp_none, (int2)(X % C, 0));
WRITE_IMAGE(output, (int2)(X, Y), in * s + o);
FLT4 out = in * s + o;
if (act_type == ActType_Relu) {
out = max(out, (FLT4)(0.0f));
} else if (act_type == ActType_Relu6) {
out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f));
}
WRITE_IMAGE(output, (int2)(X, Y), out);
}

+ 137
- 0
mindspore/lite/src/runtime/kernel/opencl/cl/stack.cl View File

@@ -0,0 +1,137 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define INT4 int4
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
// Shared prologue for every stack kernel: fetch the 3D global id, bounds-check
// it against the packed output range (X spans N*H rows, Y spans W, Z spans the
// C4 = ceil(C/4) slices), and declare the result register. Comments cannot be
// placed inside the macro body because of the line continuations.
#define CHECK_IDX_FOR_STACK \
  int X = get_global_id(0); \
  int Y = get_global_id(1); \
  int Z = get_global_id(2); \
  if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { \
    return; \
  } \
  FLT4 result;

// Stacks 8 inputs along axis 1. Global X walks the output's packed N*H rows;
// the chain of cumulative input_shapeK.y sums selects the source image and
// rebases X into that image's own row range. Each input_shapeK is the padded
// 4D shape (N, H, W, C4) with C4 = ceil(C/4).
__kernel void stack8inputaxis1(__read_only image2d_t input0, __read_only image2d_t input1, __read_only image2d_t input2,
                               __read_only image2d_t input3, __read_only image2d_t input4, __read_only image2d_t input5,
                               __read_only image2d_t input6, __read_only image2d_t input7,
                               __write_only image2d_t output, int4 input_shape0, int4 input_shape1, int4 input_shape2,
                               int4 input_shape3, int4 input_shape4, int4 input_shape5, int4 input_shape6,
                               int4 input_shape7, int4 output_shape) {
  CHECK_IDX_FOR_STACK;
  // Pick the input whose cumulative row span contains X; subtract the rows of
  // every earlier input to get the local row index.
  if (X < input_shape0.y) {
    result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));
  } else if (X < (input_shape0.y + input_shape1.y)) {
    result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z, (X - input_shape0.y)));
  } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y)) {
    result = READ_IMAGE(input2, smp_none, (int2)((Y)*input_shape2.w + Z, (X - input_shape0.y - input_shape1.y)));
  } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y + input_shape3.y)) {
    result = READ_IMAGE(input3, smp_none,
                        (int2)((Y)*input_shape3.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y)));
  } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y + input_shape3.y + input_shape4.y)) {
    result = READ_IMAGE(
      input4, smp_none,
      (int2)((Y)*input_shape4.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y - input_shape3.y)));
  } else if (X <
             (input_shape0.y + input_shape1.y + input_shape2.y + input_shape3.y + input_shape4.y + input_shape5.y)) {
    result = READ_IMAGE(input5, smp_none,
                        (int2)((Y)*input_shape5.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y -
                                                        input_shape3.y - input_shape4.y)));
  } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y + input_shape3.y + input_shape4.y + input_shape5.y +
                  input_shape6.y)) {
    result = READ_IMAGE(input6, smp_none,
                        (int2)((Y)*input_shape6.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y -
                                                        input_shape3.y - input_shape4.y - input_shape5.y)));
  } else {
    result =
      READ_IMAGE(input7, smp_none,
                 (int2)((Y)*input_shape7.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y - input_shape3.y -
                                                 input_shape4.y - input_shape5.y - input_shape6.y)));
  }
  // Image column packs W and C4: column = Y * C4 + Z; row = X.
  WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}

// Stacks 8 inputs along axis 2. Global Y walks the output's W dimension; the
// cumulative input_shapeK.z sums select the source image, and the rebased Y is
// folded back into the packed image column (Y * C4 + Z). input_shapeK is the
// padded 4D shape (N, H, W, C4).
__kernel void stack8inputaxis2(__read_only image2d_t input0, __read_only image2d_t input1, __read_only image2d_t input2,
                               __read_only image2d_t input3, __read_only image2d_t input4, __read_only image2d_t input5,
                               __read_only image2d_t input6, __read_only image2d_t input7,
                               __write_only image2d_t output, int4 input_shape0, int4 input_shape1, int4 input_shape2,
                               int4 input_shape3, int4 input_shape4, int4 input_shape5, int4 input_shape6,
                               int4 input_shape7, int4 output_shape) {
  CHECK_IDX_FOR_STACK;
  // Pick the input whose cumulative W span contains Y; subtract the widths of
  // every earlier input before recomputing the column index.
  if (Y < input_shape0.z) {
    result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));
  } else if (Y < (input_shape0.z + input_shape1.z)) {
    result = READ_IMAGE(input1, smp_none, (int2)((Y - input_shape0.z) * input_shape1.w + Z, (X)));
  } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z)) {
    result = READ_IMAGE(input2, smp_none, (int2)((Y - input_shape0.z - input_shape1.z) * input_shape2.w + Z, (X)));
  } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z + input_shape3.z)) {
    result = READ_IMAGE(input3, smp_none,
                        (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z) * input_shape3.w + Z, (X)));
  } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z + input_shape3.z + input_shape4.z)) {
    result = READ_IMAGE(
      input4, smp_none,
      (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z - input_shape3.z) * input_shape4.w + Z, (X)));
  } else if (Y <
             (input_shape0.z + input_shape1.z + input_shape2.z + input_shape3.z + input_shape4.z + input_shape5.z)) {
    result = READ_IMAGE(
      input5, smp_none,
      (int2)(
        (Y - input_shape0.z - input_shape1.z - input_shape2.z - input_shape3.z - input_shape4.z) * input_shape5.w + Z,
        (X)));
  } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z + input_shape3.z + input_shape4.z + input_shape5.z +
                  input_shape6.z)) {
    result = READ_IMAGE(
      input6, smp_none,
      (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z - input_shape3.z - input_shape4.z - input_shape5.z) *
               input_shape6.w +
             Z,
             (X)));
  } else {
    result = READ_IMAGE(input7, smp_none,
                        (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z - input_shape3.z - input_shape4.z -
                                input_shape5.z - input_shape6.z) *
                                 input_shape7.w +
                               Z,
                               (X)));
  }
  WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}

// Stacks 8 inputs along axis 3 (the C4 slice dimension). Global Z walks the
// output's C4 slices; the cumulative input_shapeK.w sums select the source
// image, and the accumulated slice offset is subtracted inside the packed
// column index (Y * C4 + Z - offset). input_shapeK is (N, H, W, C4).
__kernel void stack8inputaxis3(__read_only image2d_t input0, __read_only image2d_t input1, __read_only image2d_t input2,
                               __read_only image2d_t input3, __read_only image2d_t input4, __read_only image2d_t input5,
                               __read_only image2d_t input6, __read_only image2d_t input7,
                               __write_only image2d_t output, int4 input_shape0, int4 input_shape1, int4 input_shape2,
                               int4 input_shape3, int4 input_shape4, int4 input_shape5, int4 input_shape6,
                               int4 input_shape7, int4 output_shape) {
  CHECK_IDX_FOR_STACK;
  // Pick the input whose cumulative C4 span contains Z; earlier inputs' slice
  // counts are subtracted from the column coordinate.
  if (Z < input_shape0.w) {
    result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));
  } else if (Z < (input_shape0.w + input_shape1.w)) {
    result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z - input_shape0.w, (X)));
  } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w)) {
    result = READ_IMAGE(input2, smp_none, (int2)((Y)*input_shape2.w + Z - input_shape0.w - input_shape1.w, (X)));
  } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w + input_shape3.w)) {
    result = READ_IMAGE(input3, smp_none,
                        (int2)((Y)*input_shape3.w + Z - input_shape0.w - input_shape1.w - input_shape2.w, (X)));
  } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w + input_shape3.w + input_shape4.w)) {
    result = READ_IMAGE(
      input4, smp_none,
      (int2)((Y)*input_shape4.w + Z - input_shape0.w - input_shape1.w - input_shape2.w - input_shape3.w, (X)));
  } else if (Z <
             (input_shape0.w + input_shape1.w + input_shape2.w + input_shape3.w + input_shape4.w + input_shape5.w)) {
    result = READ_IMAGE(input5, smp_none,
                        (int2)((Y)*input_shape5.w + Z - input_shape0.w - input_shape1.w - input_shape2.w -
                               input_shape3.w - input_shape4.w,
                               (X)));
  } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w + input_shape3.w + input_shape4.w + input_shape5.w +
                  input_shape6.w)) {
    result = READ_IMAGE(input6, smp_none,
                        (int2)((Y)*input_shape6.w + Z - input_shape0.w - input_shape1.w - input_shape2.w -
                               input_shape3.w - input_shape4.w - input_shape5.w,
                               (X)));
  } else {
    result = READ_IMAGE(input7, smp_none,
                        (int2)((Y)*input_shape7.w + Z - input_shape0.w - input_shape1.w - input_shape2.w -
                               input_shape3.w - input_shape4.w - input_shape5.w - input_shape6.w,
                               (X)));
  }
  WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
}

+ 9
- 39
mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc View File

@@ -66,7 +66,7 @@ int ConcatOpenCLKernel::Init() {
}

std::string kernel_name = "Concat";
if (in_tensors_.size() == 2 || in_tensors_.size() == 3 || in_tensors_.size() == 4 || in_tensors_.size() == 4) {
if (in_tensors_.size() == 2 || in_tensors_.size() == 3 || in_tensors_.size() == 4 || in_tensors_.size() == 6) {
kernel_name += std::to_string(in_tensors_.size()) + "inputaxis" + std::to_string(param->axis_);
} else {
MS_LOG(ERROR) << " input must be 2 , 3 , 4 or 6";
@@ -83,41 +83,9 @@ int ConcatOpenCLKernel::Init() {
return RET_OK;
}

int ConcatOpenCLKernel::IntegraShapeToXYZ() {
if (out_tensors_[0]->shape().size() > 4 || out_tensors_[0]->shape().empty()) {
MS_LOG(ERROR) << "in_tensors_.shape() must between 0~4";
return RET_ERROR;
}

if (out_tensors_[0]->shape().size() == 4) {
for (int i = 0; i < in_tensors_.size(); ++i) {
cl_int4 temp_cl;
auto temp = in_tensors_[i]->shape();
temp_cl = {temp[0], temp[1], temp[2], UP_DIV(temp[3], C4NUM)};
XYZShape.push_back(temp_cl);
}
} else {
for (int i = 0; i < in_tensors_.size(); ++i) {
auto temp = in_tensors_[i]->shape();
for (int j = temp.size(); j < C4NUM; ++j) {
temp.push_back(1);
}
cl_int4 temp_cl = {temp[0], temp[1], temp[2], UP_DIV(temp[3], C4NUM)};
XYZShape.push_back(temp_cl);
}
auto temp = out_tensors_[0]->shape();
for (int i = out_tensors_[0]->shape().size(); i < C4NUM; ++i) {
temp.push_back(1);
}
}
shape_nhwc = {out_tensors_[0]->shape()[0] * out_tensors_[0]->shape()[1], out_tensors_[0]->shape()[2],
UP_DIV(out_tensors_[0]->shape()[3], C4NUM)};
return RET_OK;
}

void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
const int max_divider = 8;
const int max_x = 4, max_y = 8;
const int max_x = 2, max_y = 8;
int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
int yz = max_size / x;
int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
@@ -137,11 +105,12 @@ int ConcatOpenCLKernel::Run() {
}
auto output_shape = out_tensors_[0]->shape();
cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], UP_DIV(output_shape[3], C4NUM)};
IntegraShapeToXYZ();
const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
std::vector<size_t> local = {1, 1, 1};
std::vector<size_t> global = {static_cast<size_t>(shape_nhwc.s[0]), static_cast<size_t>(shape_nhwc.s[1]),
static_cast<size_t>(shape_nhwc.s[2])};
uint32_t OH = output_shape_.s[0] * output_shape_.s[1];
uint32_t OW = output_shape_.s[2];
uint32_t OC = output_shape_.s[3];
std::vector<size_t> global = {OH, OW, OC};
ConcatGetWorkGroup(global, &local, max_global[0]);
if (in_tensors_.size() == 2 || in_tensors_.size() == 3 || in_tensors_.size() == 4 || in_tensors_.size() == 6) {
int arg_cn = 0;
@@ -149,8 +118,9 @@ int ConcatOpenCLKernel::Run() {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[i]->data_c());
}
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
for (int i = 0; i < XYZShape.size(); ++i) {
cl_int4 temp = {XYZShape[i].s[0], XYZShape[i].s[1], XYZShape[i].s[2], XYZShape[i].s[3]};
for (int i = 0; i < in_tensors_.size(); ++i) {
cl_int4 temp = {in_tensors_[i]->shape()[0], in_tensors_[i]->shape()[1], in_tensors_[i]->shape()[2],
UP_DIV(in_tensors_[i]->shape()[3], C4NUM)};
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, temp);
}
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);


+ 0
- 4
mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h View File

@@ -38,11 +38,7 @@ class ConcatOpenCLKernel : public OpenCLKernel {
private:
int RunAxis0();

int IntegraShapeToXYZ();

cl::Kernel kernel_;
std::vector<cl_int3> XYZShape;
cl_int4 shape_nhwc{};
};

} // namespace mindspore::kernel


+ 14
- 7
mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc View File

@@ -91,12 +91,12 @@ int ScaleOpenCLKernel::InitBuffer() {
} else if (in_tensors_[0]->GetFormat() == schema::Format_NHWC) {
if (in_tensors_[1]->GetFormat() == schema::Format_NHWC) {
if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
float *scale = new (std::nothrow) float[pack_weight_size];
auto *scale = new (std::nothrow) float[pack_weight_size];
if (scale == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed!";
return RET_ERROR;
}
float *offset = new (std::nothrow) float[pack_weight_size];
auto *offset = new (std::nothrow) float[pack_weight_size];
if (offset == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed!";
delete[] scale;
@@ -110,12 +110,12 @@ int ScaleOpenCLKernel::InitBuffer() {
delete[] scale;
delete[] offset;
} else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
float16_t *scale = new (std::nothrow) float16_t[pack_weight_size];
auto *scale = new (std::nothrow) float16_t[pack_weight_size];
if (scale == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed!";
return RET_ERROR;
}
float16_t *offset = new (std::nothrow) float16_t[pack_weight_size];
auto *offset = new (std::nothrow) float16_t[pack_weight_size];
if (offset == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed!";
delete[] scale;
@@ -146,15 +146,14 @@ int ScaleOpenCLKernel::InitBuffer() {

int ScaleOpenCLKernel::Init() {
std::string kernel_name;

const ScaleParameter *scale_param = reinterpret_cast<const ScaleParameter *>(op_parameter_);
auto *scale_param = reinterpret_cast<const ScaleParameter *>(op_parameter_);
auto in_tensor = in_tensors_.at(0);
auto in_shape = in_tensor->shape();
auto scale_tensor = in_tensors_.at(1);
auto scale_shape = scale_tensor->shape();
axis_ = scale_param->axis_;
if (axis_ < 0) {
axis_ = axis_ + in_shape.size();
axis_ += in_shape.size();
}
if (scale_shape.size() != in_shape.size()) {
if (scale_tensor->ElementsNum() == 1) {
@@ -197,6 +196,13 @@ int ScaleOpenCLKernel::Init() {

int ScaleOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
auto *param = reinterpret_cast<const ScaleParameter *>(op_parameter_);
cl_int act_type = 0;
if (param->activation_type_ == ActType_Relu) {
act_type = 1;
} else if (param->activation_type_ == ActType_Relu6) {
act_type = 3;
}

int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
@@ -227,6 +233,7 @@ int ScaleOpenCLKernel::Run() {
if (element_flag_ && scale_C_flag_) {
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, UP_DIV(in_tensors_[1]->shape()[0], C4NUM));
}
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, act_type);
ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
return RET_OK;
}


+ 211
- 0
mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc View File

@@ -0,0 +1,211 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "src/runtime/kernel/opencl/kernel/stack.h"
#include <cstring>
#include <string>
#include <algorithm>
#include <set>
#include "src/kernel_registry.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "src/runtime/kernel/opencl/cl/stack.cl.inc"

using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar;
using mindspore::schema::PrimitiveType_Stack;

namespace mindspore::kernel {

// Stacking on axis 0 needs no kernel: each input's image is copied wholesale
// into the output image, one below the other, via clEnqueueCopyImage.
int StackOpenCLKernel::RunAxis0() {
  auto allocator_ = ocl_runtime_->GetAllocator();
  std::vector<size_t> img_size;
  auto dst_data = out_tensors_[0]->data_c();
  // Destination write cursor; its y component advances past each copied region.
  auto dst_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
  cl::Image2D *out_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(dst_data));
  for (int i = 0; i < in_tensors_.size(); i++) {
    auto src_data = in_tensors_[i]->data_c();
    // img_size[0]/img_size[1] are the image width/height used as the copy region.
    allocator_->GetImageSize(src_data, &img_size);
    auto src_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
    auto region = cl::array<cl::size_type, 3U>{img_size[0], img_size[1], 1};
    cl::Image2D *input_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data));
    ocl_runtime_->GetDefaultCommandQueue()->enqueueCopyImage(*input_image, *out_image, src_origin, dst_origin, region);
    // Next input starts right below the rows just written.
    dst_origin[1] += region[1];
  }
  return RET_OK;
}

// Validates the input rank and stack axis, normalizes axis_ to the 4D layout
// the OpenCL kernels expect, and builds the kernel matching the input count
// and axis. Returns RET_ERROR on any unsupported configuration.
int StackOpenCLKernel::Init() {
  const size_t in_rank = in_tensors_[0]->shape().size();
  // shape().size() is unsigned, so the original "<= 0" comparison was really
  // an empty() check; spell it out to avoid the tautological comparison.
  if (in_rank > 4 || in_tensors_[0]->shape().empty()) {
    MS_LOG(ERROR) << " only support dim <= 4 ";
    return RET_ERROR;
  }
  auto param = reinterpret_cast<StackParameter *>(this->op_parameter_);
  // Validate the raw axis before deriving anything from it (the original
  // normalized first and only then range-checked).
  if (param->axis_ < -3 || param->axis_ > 3) {
    MS_LOG(ERROR) << " only support axis >= -3 and axis <= 3 ";
    return RET_ERROR;
  }
  axis_ = param->axis_;
  // Stack inserts a new dimension, so a negative axis wraps around rank + 1.
  if (axis_ < 0) {
    axis_ += in_rank + 1;
  }
  // 2D inputs are mapped onto the last two dims of the padded 4D layout.
  if (in_rank == 2) {
    axis_ += 2;
  }

  std::string kernel_name = "stack";
  if (in_tensors_.size() == 8) {
    kernel_name += "8inputaxis" + std::to_string(axis_);
  } else {
    MS_LOG(ERROR) << " input must be 8";
    return RET_ERROR;
  }
  MS_LOG(DEBUG) << "kernel_name=: " << kernel_name;
  std::set<std::string> build_options;
  std::string source = stack_source;
  std::string program_name = "stack";
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);

  return RET_OK;
}

int StackOpenCLKernel::ReSize() { return RET_OK; }

// Derives a 3D local work-group size for the stack kernels: x and y use the
// largest divisor of the corresponding global size (capped), and z takes
// whatever budget of max_size remains.
void StackGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
  constexpr int kMaxDivider = 8;
  constexpr int kMaxX = 4;
  constexpr int kMaxY = 8;
  const int local_x = std::min(GetMaxDivisorStrategy1(global[0], kMaxDivider), kMaxX);
  const int budget_yz = max_size / local_x;  // work items left for the y/z dims
  const int local_y = std::min(std::min(GetMaxDivisorStrategy1(global[1], kMaxDivider), budget_yz), kMaxY);
  const int local_z = std::min(budget_yz / local_y, static_cast<int>(UP_DIV(global[2], 2)));
  *local = {static_cast<size_t>(local_x), static_cast<size_t>(local_y), static_cast<size_t>(local_z)};
}

int StackOpenCLKernel::InferInTensorShapeTo4D(int *arg_cn) {
if (in_tensors_.size() == 8) {
int size = in_tensors_[0]->shape().size();
switch (size) {
case 1:
for (int i = 0; i < in_tensors_.size(); ++i) {
ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, in_tensors_[i]->data_c());
}
ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, out_tensors_[0]->data_c());
for (int i = 0; i < in_tensors_.size(); ++i) {
cl_int4 temp = {in_tensors_[i]->shape()[0], 1, 1, 1};
ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, temp);
}
break;
case 2:
for (int i = 0; i < in_tensors_.size(); ++i) {
ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, in_tensors_[i]->data_c());
}
ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, out_tensors_[0]->data_c());
for (int i = 0; i < in_tensors_.size(); ++i) {
cl_int4 temp = {in_tensors_[i]->shape()[0], 1, 1, UP_DIV(in_tensors_[i]->shape()[1], C4NUM)};
ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, temp);
}
break;
case 3:
for (int i = 0; i < in_tensors_.size(); ++i) {
ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, in_tensors_[i]->data_c());
}
ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, out_tensors_[0]->data_c());
for (int i = 0; i < in_tensors_.size(); ++i) {
cl_int4 temp = {in_tensors_[i]->shape()[0], 1, in_tensors_[i]->shape()[1],
UP_DIV(in_tensors_[i]->shape()[2], C4NUM)};
ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, temp);
}
break;
default:
MS_LOG(ERROR) << "unsupported input size > 3 or size <= 0 :" << in_tensors_.size();
return RET_ERROR;
}
} else {
MS_LOG(ERROR) << "unsupported input size :" << in_tensors_.size();
return RET_ERROR;
}
return RET_OK;
}

// Maps the output tensor shape onto the padded 4D (N, H, W, C4) form, caches
// the global-range dims (OH_ = N*H, OW_ = W, OC_ = C4) and fills *output_shape
// for the kernel. Returns RET_ERROR on unsupported rank/axis combinations.
int StackOpenCLKernel::InferOutTensorShapeTo4D(cl_int4 *output_shape) {
  std::vector<int> out_shape = out_tensors_[0]->shape();
  if (out_shape.size() == 3) {
    // 3D output: H and W stay at their default of 1; the last two dims are
    // folded into the channel-slice dimension.
    N_ = out_shape[0];
    C_ = out_shape[1] * UP_DIV(out_shape[2], C4NUM);
  } else if (out_shape.size() == 4) {
    if (axis_ == 1) {
      N_ = out_shape[0];
      H_ = out_shape[1];
      W_ = out_shape[2];
      C_ = UP_DIV(out_shape[3], C4NUM);
    } else {
      MS_LOG(ERROR) << "Unsupported out_shape.size=: " << out_shape.size() << " axis=: " << axis_;
      return RET_ERROR;
    }
  } else {
    // BUG FIX: other ranks previously fell through silently, leaving every dim
    // at 1 and launching the kernel over a bogus {1, 1, 1} global range.
    MS_LOG(ERROR) << "Unsupported out_shape.size=: " << out_shape.size() << " axis=: " << axis_;
    return RET_ERROR;
  }
  OH_ = N_ * H_;
  OW_ = W_;
  OC_ = C_;
  output_shape->s[0] = N_;
  output_shape->s[1] = H_;
  output_shape->s[2] = W_;
  output_shape->s[3] = C_;
  return RET_OK;
}

int StackOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
if (axis_ == 0) {
return RunAxis0();
}
cl_int4 output_shape = {1, 1, 1, 1};
const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
std::vector<size_t> local = {1, 1, 1};
int arg_cn = 0;
InferInTensorShapeTo4D(&arg_cn);
InferOutTensorShapeTo4D(&output_shape);
std::vector<size_t> global = {OH_, OW_, OC_};
StackGetWorkGroup(global, &local, max_global[0]);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape);
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
return RET_OK;
}

// Factory registered for PrimitiveType_Stack on the GPU backend: allocates the
// kernel, runs Init(), and returns nullptr (after cleanup) on any failure.
kernel::LiteKernel *OpenCLStackKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                             const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                             const lite::InnerContext *ctx, const kernel::KernelKey &desc,
                                             const mindspore::lite::PrimitiveC *primitive) {
  auto *kernel = new (std::nothrow) StackOpenCLKernel(opParameter, inputs, outputs);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << " new StackOpenCLKernel failed ";
    return nullptr;
  }
  if (kernel->Init() != RET_OK) {
    MS_LOG(ERROR) << " Init kernel failed, name: Stack ";
    delete kernel;
    return nullptr;
  }
  return kernel;
}

REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Stack, OpenCLStackKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Stack, OpenCLStackKernelCreator);
} // namespace mindspore::kernel

+ 59
- 0
mindspore/lite/src/runtime/kernel/opencl/kernel/stack.h View File

@@ -0,0 +1,59 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_STACK_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_STACK_H_

#include <vector>
#include "src/runtime/kernel/opencl/opencl_kernel.h"
#include "nnacl/stack_parameter.h"

namespace mindspore::kernel {

// OpenCL (GPU) implementation of the Stack operator. Supports exactly 8 inputs;
// axis 0 is executed as a plain image copy, other axes via generated kernels.
class StackOpenCLKernel : public OpenCLKernel {
 public:
  explicit StackOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                             const std::vector<lite::Tensor *> &outputs)
      : OpenCLKernel(parameter, inputs, outputs) {}

  ~StackOpenCLKernel() override{};

  int Init() override;

  int ReSize() override;

  int Run() override;

 private:
  // Axis-0 fast path: copies each input image into the output image.
  int RunAxis0();

  // Pads input shapes to 4D and sets kernel args; advances *arg_cn.
  int InferInTensorShapeTo4D(int *arg_cn);

  // Pads the output shape to 4D and caches the global-range dims below.
  int InferOutTensorShapeTo4D(cl_int4 *output_shape);

  cl::Kernel kernel_;
  int axis_{0};     // stack axis, normalized to the padded 4D layout in Init()
  size_t N_{1};     // padded output dims (N, H, W, C4)
  size_t H_{1};
  size_t W_{1};
  size_t C_{1};
  size_t OH_{1};    // global range: OH_ = N_*H_, OW_ = W_, OC_ = C_
  size_t OW_{1};
  size_t OC_{1};
};

} // namespace mindspore::kernel
#endif

mindspore/lite/test/models_tflite_gpu.cfg → mindspore/lite/test/models_fp32_gpu.cfg View File

@@ -7,3 +7,12 @@ hiai_cn_recognize_modify_padv2.tflite
hiai_cv_focusShootOCRModel_08.tflite
hiai_model_normalize_object_scene_ps_20200519.tflite
inception_v3.tflite
mtk_isface.tflite
mtk_landmark.tflite
mtk_new_detect.tflite
mtk_pose.tflite
mtk_model_emotions_0727_nosoftmax.tflite
mtk_model_normalize_object_scene_ps_20200826_f32_no_softmax.tflite
mtk_276landmark_0913.tflite
mtk_face_recognition.tflite
mtk_convert_model.tflite

+ 11
- 0
mindspore/lite/test/models_tflite.cfg View File

@@ -119,3 +119,14 @@ magenta_arbitrary-image-stylization-v1-256_fp16_prediction_1.tflite
lite-model_cartoongan_fp16_1.tflite
lite-model_arbitrary-image-stylization-inceptionv3_fp16_predict_1.tflite
gts_detect_5k_tf115.tflite
mtk_isface.tflite
mtk_landmark.tflite
mtk_new_detect.tflite
mtk_pose.tflite
mtk_age_gender_fp16.tflite
mtk_model_emotions_0727_nosoftmax.tflite
mtk_model_face_dress_fp16.tflite
mtk_model_normalize_object_scene_ps_20200826_f32_no_softmax.tflite
mtk_276landmark_0913.tflite
mtk_face_recognition.tflite
mtk_convert_model.tflite

+ 1
- 1
mindspore/lite/test/run_benchmark_nets.sh View File

@@ -766,7 +766,7 @@ models_onnx_config=${basepath}/models_onnx.cfg
models_fp16_config=${basepath}/models_fp16.cfg
models_mindspore_config=${basepath}/models_mindspore.cfg
models_mindspore_train_config=${basepath}/models_mindspore_train.cfg
models_tflite_gpu_config=${basepath}/models_tflite_gpu.cfg
models_tflite_gpu_config=${basepath}/models_fp32_gpu.cfg
models_fp16_gpu_config=${basepath}/models_fp16_gpu.cfg
models_arm32_config=${basepath}/models_arm32.cfg
models_compatibility_config=${basepath}/models_compatibility.cfg


+ 4
- 12
mindspore/lite/test/ut/src/runtime/kernel/opencl/concat_tests.cc View File

@@ -37,14 +37,6 @@ class TestConcatOpenCLCI : public mindspore::CommonTest {
TestConcatOpenCLCI() {}
};

template <typename T>
void CompareOutputData1(T *output_data, T *correct_data, int size, float err_bound) {
for (size_t i = 0; i < size; i++) {
T abs = fabs(output_data[i] - correct_data[i]);
ASSERT_LE(abs, err_bound);
}
}

TEST_F(TestConcatOpenCLCI, ConcatFp32_2inputforCI) {
MS_LOG(INFO) << " begin test ";
auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
@@ -132,7 +124,7 @@ TEST_F(TestConcatOpenCLCI, ConcatFp32_2inputforCI) {
std::cout << "==================output data================" << std::endl;
sub_graph->Run();
auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->data_c());
CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.00001);
CompareOutputData(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.00001);
for (auto tensor : inputs) {
tensor->SetData(nullptr);
delete tensor;
@@ -260,7 +252,7 @@ TEST_F(TestConcatOpenCLfp16, ConcatFp16_4input_dim4_axis1) {
std::cout << "==================output data================" << std::endl;
sub_graph->Run();
auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->data_c());
CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001);
CompareOutputData(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001);
for (auto tensor : inputs) {
tensor->SetData(nullptr);
delete tensor;
@@ -379,7 +371,7 @@ TEST_F(TestConcatOpenCLfp32, ConcatFp32_3input_dim4_axis1) {
std::cout << "==================output data================" << std::endl;
sub_graph->Run();
auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->data_c());
CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.00001);
CompareOutputData(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.00001);
for (auto tensor : inputs) {
tensor->SetData(nullptr);
delete tensor;
@@ -518,7 +510,7 @@ TEST_F(TestConcatOpenCLfp16, ConcatFp16_6input_dim4_axis1) {
std::cout << "==================output data================" << std::endl;
sub_graph->Run();
auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->MutableData());
CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001);
CompareOutputData(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001);
for (auto tensor : inputs) {
tensor->SetData(nullptr);
delete tensor;


+ 283
- 0
mindspore/lite/test/ut/src/runtime/kernel/opencl/stack_tests.cc View File

@@ -0,0 +1,283 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <iostream>
#include <memory>
#include "common/common_test.h"
#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
#include "mindspore/lite/src/common/file_utils.h"
#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
#include "mindspore/lite/src/runtime/kernel/opencl/kernel/stack.h"
namespace mindspore {
// Fixture for the CI-friendly float32 Stack OpenCL kernel test.
class TestStackOpenCLCI : public mindspore::CommonTest {
 public:
  TestStackOpenCLCI() = default;
};

// Fixture for the fp16 Stack OpenCL kernel test driven by .bin test data.
class TestStackOpenCLfp16 : public mindspore::CommonTest {
 public:
  TestStackOpenCLfp16() = default;
};

// Verifies the OpenCL Stack kernel: 8 float32 inputs of shape {1, 1, 8} are
// stacked along axis 0 into one {8, 1, 1, 8} output, which is compared
// element-wise against a precomputed reference (the inputs laid out in order).
TEST_F(TestStackOpenCLCI, StackFp32_8inputforCI) {
  MS_LOG(INFO) << " begin test ";
  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
  ocl_runtime->Init();
  auto allocator = ocl_runtime->GetAllocator();

  MS_LOG(INFO) << " init tensors ";
  constexpr int INPUT_NUM = 8;
  std::array<std::vector<int>, INPUT_NUM> input_shapes = {
    std::vector<int>{1, 1, 8}, std::vector<int>{1, 1, 8}, std::vector<int>{1, 1, 8}, std::vector<int>{1, 1, 8},
    std::vector<int>{1, 1, 8}, std::vector<int>{1, 1, 8}, std::vector<int>{1, 1, 8}, std::vector<int>{1, 1, 8}};
  std::vector<int> output_shape = {8, 1, 1, 8};
  auto data_type = kNumberTypeFloat32;
  auto tensor_type = lite::TensorCategory(schema::NodeType_ValueNode);
  float input_data1[] = {0.75f, 0.06f, 0.74f, 0.30f, 0.9f, 0.59f, 0.03f, 0.37f};
  float input_data2[] = {0.5f, 0.6f, 0.74f, 0.23f, 0.46f, 0.69f, 0.13f, 0.47f};
  float input_data3[] = {0.31f, 0.63f, 0.84f, 0.43f, 0.56f, 0.79f, 0.12f, 0.57f};
  float input_data4[] = {0.35f, 0.26f, 0.17f, 0.33f, 0.66f, 0.89f, 0.93f, 0.77f};
  float input_data5[] = {0.57f, 0.6f, 0.84f, 0.83f, 0.48f, 0.78f, 0.63f, 0.87f};
  float input_data6[] = {0.66f, 0.56f, 0.64f, 0.63f, 0.56f, 0.59f, 0.73f, 0.37f};
  float input_data7[] = {0.35f, 0.26f, 0.54f, 0.33f, 0.76f, 0.59f, 0.73f, 0.34f};
  float input_data8[] = {0.15f, 0.36f, 0.44f, 0.73f, 0.56f, 0.49f, 0.93f, 0.37f};
  // Expected output: the 8 inputs concatenated in order along the new axis 0.
  float correctOutput[] = {0.75f, 0.06f, 0.74f, 0.30f, 0.9f, 0.59f, 0.03f, 0.37f, 0.5f, 0.6f, 0.74f, 0.23f, 0.46f,
                           0.69f, 0.13f, 0.47f, 0.31f, 0.63f, 0.84f, 0.43f, 0.56f, 0.79f, 0.12f, 0.57f, 0.35f, 0.26f,
                           0.17f, 0.33f, 0.66f, 0.89f, 0.93f, 0.77f, 0.57f, 0.6f, 0.84f, 0.83f, 0.48f, 0.78f, 0.63f,
                           0.87f, 0.66f, 0.56f, 0.64f, 0.63f, 0.56f, 0.59f, 0.73f, 0.37f, 0.35f, 0.26f, 0.54f, 0.33f,
                           0.76f, 0.59f, 0.73f, 0.34f, 0.15f, 0.36f, 0.44f, 0.73f, 0.56f, 0.49f, 0.93f, 0.37f};
  auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, output_shape, schema::Format_NHWC, tensor_type);
  if (output_tensor == nullptr) {
    MS_LOG(INFO) << " new output_tensor failed ";
    return;
  }
  std::vector<lite::Tensor *> inputs;
  std::vector<lite::Tensor *> outputs{output_tensor};
  for (auto &shape : input_shapes) {
    auto input_temp = new (std::nothrow) lite::Tensor(data_type, shape, schema::Format_NHWC, tensor_type);
    // Check before push_back so the vector never holds a nullptr, and free
    // everything allocated so far on failure (the original leaked here).
    if (input_temp == nullptr) {
      MS_LOG(INFO) << " new input_tensor failed ";
      for (auto tensor : inputs) {
        delete tensor;
      }
      delete output_tensor;
      return;
    }
    inputs.push_back(input_temp);
  }

  MS_LOG(INFO) << " initialize tensors ";
  auto param = reinterpret_cast<StackParameter *>(malloc(sizeof(StackParameter)));
  if (param == nullptr) {
    MS_LOG(INFO) << " new StackParameter failed ";
    for (auto tensor : inputs) {
      delete tensor;
    }
    for (auto tensor : outputs) {
      delete tensor;
    }
    return;
  }
  param->axis_ = 0;
  auto *stack_kernel =
    new (std::nothrow) kernel::StackOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
  if (stack_kernel == nullptr) {
    MS_LOG(INFO) << " new kernel::StackOpenCLKernel failed ";
    for (auto tensor : inputs) {
      delete tensor;
    }
    for (auto tensor : outputs) {
      delete tensor;
    }
    delete param;
    return;
  }
  stack_kernel->Init();
  // allocate device memory for the inputs (outputs are handled by sub_graph)
  for (auto &input_tensor : inputs) {
    input_tensor->MallocData(allocator);
  }

  MS_LOG(INFO) << " initialize sub_graph ";
  std::vector<kernel::LiteKernel *> kernels{stack_kernel};
  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
  if (sub_graph == nullptr) {
    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
    for (auto tensor : inputs) {
      delete tensor;
    }
    for (auto tensor : outputs) {
      delete tensor;
    }
    delete param;
    delete stack_kernel;
    return;
  }
  sub_graph->Init();
  MS_LOG(INFO) << " initialize input data ";
  // Each copy uses the size of its own source array; the original reused
  // sizeof(input_data1)/sizeof(input_data2) for inputs 3-8 (a latent bug if
  // the shapes ever diverge).
  memcpy(inputs[0]->data_c(), input_data1, sizeof(input_data1));
  memcpy(inputs[1]->data_c(), input_data2, sizeof(input_data2));
  memcpy(inputs[2]->data_c(), input_data3, sizeof(input_data3));
  memcpy(inputs[3]->data_c(), input_data4, sizeof(input_data4));
  memcpy(inputs[4]->data_c(), input_data5, sizeof(input_data5));
  memcpy(inputs[5]->data_c(), input_data6, sizeof(input_data6));
  memcpy(inputs[6]->data_c(), input_data7, sizeof(input_data7));
  memcpy(inputs[7]->data_c(), input_data8, sizeof(input_data8));

  std::cout << "==================output data================" << std::endl;
  sub_graph->Run();
  auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->data_c());
  CompareOutputData(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.00001);
  // Detach device buffers before delete: the allocator owns them.
  for (auto tensor : inputs) {
    tensor->SetData(nullptr);
    delete tensor;
  }
  for (auto tensor : outputs) {
    tensor->SetData(nullptr);
    delete tensor;
  }
  delete sub_graph;
}

// Verifies the OpenCL Stack kernel in fp16 mode: 8 inputs of shape
// {1, 17, 18} stacked along axis 1 into a {1, 8, 17, 18} output; inputs and
// the expected output are loaded from .bin files under ./test_data.
TEST_F(TestStackOpenCLfp16, StackFp32_8inputaxis1) {
  MS_LOG(INFO) << " begin test ";
  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
  ocl_runtime->SetFp16Enable(true);
  ocl_runtime->Init();
  auto allocator = ocl_runtime->GetAllocator();

  // get the input from .bin
  size_t input1_size, input2_size, input3_size, input4_size, input5_size, input6_size, input7_size, input8_size,
    output_size;
  std::string input1Ppath = "./test_data/stackfp16_input1.bin";
  std::string input2Ppath = "./test_data/stackfp16_input2.bin";
  std::string input3Ppath = "./test_data/stackfp16_input3.bin";
  std::string input4Ppath = "./test_data/stackfp16_input4.bin";
  std::string input5Ppath = "./test_data/stackfp16_input5.bin";
  std::string input6Ppath = "./test_data/stackfp16_input6.bin";
  std::string input7Ppath = "./test_data/stackfp16_input7.bin";
  std::string input8Ppath = "./test_data/stackfp16_input8.bin";
  std::string correctOutputPath = "./test_data/stackfp16_output.bin";
  auto input_data1 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size));
  auto input_data2 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input2Ppath.c_str(), &input2_size));
  auto input_data3 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input3Ppath.c_str(), &input3_size));
  auto input_data4 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input4Ppath.c_str(), &input4_size));
  auto input_data5 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input5Ppath.c_str(), &input5_size));
  auto input_data6 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input6Ppath.c_str(), &input6_size));
  auto input_data7 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input7Ppath.c_str(), &input7_size));
  auto input_data8 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input8Ppath.c_str(), &input8_size));
  auto correctOutput =
    reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size));
  // ReadFile reports failure with nullptr; bail out before any dereference.
  for (auto *buffer : {input_data1, input_data2, input_data3, input_data4, input_data5, input_data6, input_data7,
                       input_data8, correctOutput}) {
    if (buffer == nullptr) {
      MS_LOG(ERROR) << " read test data failed ";
      return;
    }
  }
  MS_LOG(INFO) << " init tensors ";
  constexpr int INPUT_NUM = 8;
  std::array<std::vector<int>, INPUT_NUM> input_shapes = {
    std::vector<int>{1, 17, 18}, std::vector<int>{1, 17, 18}, std::vector<int>{1, 17, 18}, std::vector<int>{1, 17, 18},
    std::vector<int>{1, 17, 18}, std::vector<int>{1, 17, 18}, std::vector<int>{1, 17, 18}, std::vector<int>{1, 17, 18}};
  std::vector<int> output_shape = {1, 8, 17, 18};
  auto data_type = kNumberTypeFloat16;
  auto tensor_type = lite::TensorCategory(schema::NodeType_ValueNode);
  std::vector<lite::Tensor *> inputs;
  for (auto &shape : input_shapes) {
    auto input_temp = new (std::nothrow) lite::Tensor(data_type, shape, schema::Format_NHWC, tensor_type);
    inputs.push_back(input_temp);
    if (input_temp == nullptr) {
      MS_LOG(INFO) << " new input_tensor failed ";
      return;
    }
  }
  auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, output_shape, schema::Format_NHWC, tensor_type);
  if (output_tensor == nullptr) {
    MS_LOG(INFO) << " new output_tensor failed ";
    for (auto tensor : inputs) {
      delete tensor;
    }
    return;
  }
  std::vector<lite::Tensor *> outputs{output_tensor};
  MS_LOG(INFO) << " input_shapes size =: " << input_shapes.size();

  MS_LOG(INFO) << " initialize tensors ";
  auto param = reinterpret_cast<StackParameter *>(malloc(sizeof(StackParameter)));
  if (param == nullptr) {
    MS_LOG(INFO) << " new StackParameter failed ";
    for (auto tensor : inputs) {
      delete tensor;
    }
    for (auto tensor : outputs) {
      delete tensor;
    }
    return;
  }
  param->axis_ = 1;
  auto *stack_kernel =
    new (std::nothrow) kernel::StackOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
  if (stack_kernel == nullptr) {
    MS_LOG(INFO) << " new kernel::StackOpenCLKernel failed ";
    for (auto tensor : inputs) {
      delete tensor;
    }
    for (auto tensor : outputs) {
      delete tensor;
    }
    delete param;
    return;
  }
  stack_kernel->Init();
  // allocate device memory for the inputs (outputs are handled by sub_graph)
  for (auto &input_tensor : inputs) {
    input_tensor->MallocData(allocator);
  }
  MS_LOG(INFO) << " initialize sub_graph ";
  std::vector<kernel::LiteKernel *> kernels{stack_kernel};
  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
  if (sub_graph == nullptr) {
    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
    for (auto tensor : inputs) {
      delete tensor;
    }
    for (auto tensor : outputs) {
      delete tensor;
    }
    delete param;
    delete stack_kernel;
    return;
  }
  sub_graph->Init();
  MS_LOG(INFO) << " initialize input data ";
  if (inputs.size() == 8) {
    memcpy(inputs[0]->data_c(), input_data1, input1_size);
    memcpy(inputs[1]->data_c(), input_data2, input2_size);
    memcpy(inputs[2]->data_c(), input_data3, input3_size);
    memcpy(inputs[3]->data_c(), input_data4, input4_size);
    memcpy(inputs[4]->data_c(), input_data5, input5_size);
    memcpy(inputs[5]->data_c(), input_data6, input6_size);
    memcpy(inputs[6]->data_c(), input_data7, input7_size);
    memcpy(inputs[7]->data_c(), input_data8, input8_size);
  } else {
    // Fixed message: this test stacks exactly 8 inputs (the original text
    // was copy-pasted from the concat tests).
    MS_LOG(ERROR) << " input size must be 8";
  }

  std::cout << "==================output data================" << std::endl;
  sub_graph->Run();
  auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->MutableData());
  // NOTE(review): 1e-6 is tighter than fp16 precision (~1e-3); confirm the
  // tolerance semantics of CompareOutputData for fp16 data.
  CompareOutputData(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001);
  // Detach device buffers before delete: the allocator owns them.
  for (auto tensor : inputs) {
    tensor->SetData(nullptr);
    delete tensor;
  }
  for (auto tensor : outputs) {
    tensor->SetData(nullptr);
    delete tensor;
  }
  delete sub_graph;
  // ReadFile allocates the host buffers with new[]; release them (the
  // original leaked all nine).
  for (auto *buffer : {input_data1, input_data2, input_data3, input_data4, input_data5, input_data6, input_data7,
                       input_data8, correctOutput}) {
    delete[] reinterpret_cast<char *>(buffer);
  }
}

} // namespace mindspore

Loading…
Cancel
Save