| @@ -7,14 +7,14 @@ __kernel void mean_NHWC4(__read_only image2d_t src_data, __write_only image2d_t | |||
| if (X >= size.z) { | |||
| return; | |||
| } | |||
| FLT4 result = (FLT4)0.f; | |||
| float4 result = (float4)0.f; | |||
| for (int h = 0; h < size.x; h++) { | |||
| for (int w = 0; w < size.y; w++) { | |||
| result += READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h)); | |||
| result += convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h))); | |||
| } | |||
| } | |||
| result /= size.x * size.y; | |||
| WRITE_IMAGE(dst_data, (int2)(X, 0), result); | |||
| WRITE_IMAGE(dst_data, (int2)(X, 0), TO_FLT4(result)); | |||
| } | |||
| __kernel void mean_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { | |||
| @@ -1,8 +1,13 @@ | |||
| #pragma OPENCL EXTENSION cl_khr_fp16 : enable | |||
| __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; | |||
| #define ActType_No 0 | |||
| #define ActType_Relu 1 | |||
| #define ActType_Sigmod 2 | |||
| #define ActType_Relu6 3 | |||
| __kernel void Scale_IMG(__read_only image2d_t input, __read_only image2d_t scale, __read_only image2d_t offset, | |||
| __write_only image2d_t output, const int2 output_shape) { | |||
| __write_only image2d_t output, const int2 output_shape, const int act_type) { | |||
| int X = get_global_id(0); | |||
| int Y = get_global_id(1); | |||
| if (X >= output_shape.x || Y >= output_shape.y) { | |||
| @@ -12,11 +17,17 @@ __kernel void Scale_IMG(__read_only image2d_t input, __read_only image2d_t scale | |||
| FLT4 in = READ_IMAGE(input, smp_none, (int2)(X, Y)); | |||
| FLT4 s = READ_IMAGE(scale, smp_none, (int2)(X, Y)); | |||
| FLT4 o = READ_IMAGE(offset, smp_none, (int2)(X, Y)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), in * s + o); | |||
| FLT4 out = in * s + o; | |||
| if (act_type == ActType_Relu) { | |||
| out = max(out, (FLT4)(0.0f)); | |||
| } else if (act_type == ActType_Relu6) { | |||
| out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f)); | |||
| } | |||
| WRITE_IMAGE(output, (int2)(X, Y), out); | |||
| } | |||
| __kernel void BoardcastScale_IMG(__read_only image2d_t input, float scale, float offset, __write_only image2d_t output, | |||
| const int2 output_shape) { | |||
| const int2 output_shape, const int act_type) { | |||
| int X = get_global_id(0); | |||
| int Y = get_global_id(1); | |||
| if (X >= output_shape.x || Y >= output_shape.y) { | |||
| @@ -24,11 +35,17 @@ __kernel void BoardcastScale_IMG(__read_only image2d_t input, float scale, float | |||
| } | |||
| FLT4 in = READ_IMAGE(input, smp_none, (int2)(X, Y)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), in * (FLT)scale + (FLT)offset); | |||
| FLT4 out = in * (FLT)scale + (FLT)offset; | |||
| if (act_type == ActType_Relu) { | |||
| out = max(out, (FLT4)(0.0f)); | |||
| } else if (act_type == ActType_Relu6) { | |||
| out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f)); | |||
| } | |||
| WRITE_IMAGE(output, (int2)(X, Y), out); | |||
| } | |||
| __kernel void Scale_C_IMG(__read_only image2d_t input, __read_only image2d_t scale, __read_only image2d_t offset, | |||
| __write_only image2d_t output, const int2 output_shape, const int C) { | |||
| __write_only image2d_t output, const int2 output_shape, const int C, const int act_type) { | |||
| int X = get_global_id(0); | |||
| int Y = get_global_id(1); | |||
| if (X >= output_shape.x || Y >= output_shape.y || C == 0) { | |||
| @@ -38,5 +55,11 @@ __kernel void Scale_C_IMG(__read_only image2d_t input, __read_only image2d_t sca | |||
| FLT4 in = READ_IMAGE(input, smp_none, (int2)(X, Y)); | |||
| FLT4 s = READ_IMAGE(scale, smp_none, (int2)(X % C, 0)); | |||
| FLT4 o = READ_IMAGE(offset, smp_none, (int2)(X % C, 0)); | |||
| WRITE_IMAGE(output, (int2)(X, Y), in * s + o); | |||
| FLT4 out = in * s + o; | |||
| if (act_type == ActType_Relu) { | |||
| out = max(out, (FLT4)(0.0f)); | |||
| } else if (act_type == ActType_Relu6) { | |||
| out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f)); | |||
| } | |||
| WRITE_IMAGE(output, (int2)(X, Y), out); | |||
| } | |||
| @@ -0,0 +1,137 @@ | |||
| #pragma OPENCL EXTENSION cl_khr_fp16 : enable | |||
| #define INT4 int4 | |||
| __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; | |||
| #define CHECK_IDX_FOR_STACK \ | |||
| int X = get_global_id(0); \ | |||
| int Y = get_global_id(1); \ | |||
| int Z = get_global_id(2); \ | |||
| if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { \ | |||
| return; \ | |||
| } \ | |||
| FLT4 result; | |||
| __kernel void stack8inputaxis1(__read_only image2d_t input0, __read_only image2d_t input1, __read_only image2d_t input2, | |||
| __read_only image2d_t input3, __read_only image2d_t input4, __read_only image2d_t input5, | |||
| __read_only image2d_t input6, __read_only image2d_t input7, | |||
| __write_only image2d_t output, int4 input_shape0, int4 input_shape1, int4 input_shape2, | |||
| int4 input_shape3, int4 input_shape4, int4 input_shape5, int4 input_shape6, | |||
| int4 input_shape7, int4 output_shape) { | |||
| CHECK_IDX_FOR_STACK; | |||
| if (X < input_shape0.y) { | |||
| result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X))); | |||
| } else if (X < (input_shape0.y + input_shape1.y)) { | |||
| result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z, (X - input_shape0.y))); | |||
| } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y)) { | |||
| result = READ_IMAGE(input2, smp_none, (int2)((Y)*input_shape2.w + Z, (X - input_shape0.y - input_shape1.y))); | |||
| } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y + input_shape3.y)) { | |||
| result = READ_IMAGE(input3, smp_none, | |||
| (int2)((Y)*input_shape3.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y))); | |||
| } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y + input_shape3.y + input_shape4.y)) { | |||
| result = READ_IMAGE( | |||
| input4, smp_none, | |||
| (int2)((Y)*input_shape4.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y - input_shape3.y))); | |||
| } else if (X < | |||
| (input_shape0.y + input_shape1.y + input_shape2.y + input_shape3.y + input_shape4.y + input_shape5.y)) { | |||
| result = READ_IMAGE(input5, smp_none, | |||
| (int2)((Y)*input_shape5.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y - | |||
| input_shape3.y - input_shape4.y))); | |||
| } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y + input_shape3.y + input_shape4.y + input_shape5.y + | |||
| input_shape6.y)) { | |||
| result = READ_IMAGE(input6, smp_none, | |||
| (int2)((Y)*input_shape6.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y - | |||
| input_shape3.y - input_shape4.y - input_shape5.y))); | |||
| } else { | |||
| result = | |||
| READ_IMAGE(input7, smp_none, | |||
| (int2)((Y)*input_shape7.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y - input_shape3.y - | |||
| input_shape4.y - input_shape5.y - input_shape6.y))); | |||
| } | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||
| } | |||
| __kernel void stack8inputaxis2(__read_only image2d_t input0, __read_only image2d_t input1, __read_only image2d_t input2, | |||
| __read_only image2d_t input3, __read_only image2d_t input4, __read_only image2d_t input5, | |||
| __read_only image2d_t input6, __read_only image2d_t input7, | |||
| __write_only image2d_t output, int4 input_shape0, int4 input_shape1, int4 input_shape2, | |||
| int4 input_shape3, int4 input_shape4, int4 input_shape5, int4 input_shape6, | |||
| int4 input_shape7, int4 output_shape) { | |||
| CHECK_IDX_FOR_STACK; | |||
| if (Y < input_shape0.z) { | |||
| result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X))); | |||
| } else if (Y < (input_shape0.z + input_shape1.z)) { | |||
| result = READ_IMAGE(input1, smp_none, (int2)((Y - input_shape0.z) * input_shape1.w + Z, (X))); | |||
| } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z)) { | |||
| result = READ_IMAGE(input2, smp_none, (int2)((Y - input_shape0.z - input_shape1.z) * input_shape2.w + Z, (X))); | |||
| } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z + input_shape3.z)) { | |||
| result = READ_IMAGE(input3, smp_none, | |||
| (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z) * input_shape3.w + Z, (X))); | |||
| } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z + input_shape3.z + input_shape4.z)) { | |||
| result = READ_IMAGE( | |||
| input4, smp_none, | |||
| (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z - input_shape3.z) * input_shape4.w + Z, (X))); | |||
| } else if (Y < | |||
| (input_shape0.z + input_shape1.z + input_shape2.z + input_shape3.z + input_shape4.z + input_shape5.z)) { | |||
| result = READ_IMAGE( | |||
| input5, smp_none, | |||
| (int2)( | |||
| (Y - input_shape0.z - input_shape1.z - input_shape2.z - input_shape3.z - input_shape4.z) * input_shape5.w + Z, | |||
| (X))); | |||
| } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z + input_shape3.z + input_shape4.z + input_shape5.z + | |||
| input_shape6.z)) { | |||
| result = READ_IMAGE( | |||
| input6, smp_none, | |||
| (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z - input_shape3.z - input_shape4.z - input_shape5.z) * | |||
| input_shape6.w + | |||
| Z, | |||
| (X))); | |||
| } else { | |||
| result = READ_IMAGE(input7, smp_none, | |||
| (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z - input_shape3.z - input_shape4.z - | |||
| input_shape5.z - input_shape6.z) * | |||
| input_shape7.w + | |||
| Z, | |||
| (X))); | |||
| } | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||
| } | |||
| __kernel void stack8inputaxis3(__read_only image2d_t input0, __read_only image2d_t input1, __read_only image2d_t input2, | |||
| __read_only image2d_t input3, __read_only image2d_t input4, __read_only image2d_t input5, | |||
| __read_only image2d_t input6, __read_only image2d_t input7, | |||
| __write_only image2d_t output, int4 input_shape0, int4 input_shape1, int4 input_shape2, | |||
| int4 input_shape3, int4 input_shape4, int4 input_shape5, int4 input_shape6, | |||
| int4 input_shape7, int4 output_shape) { | |||
| CHECK_IDX_FOR_STACK; | |||
| if (Z < input_shape0.w) { | |||
| result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X))); | |||
| } else if (Z < (input_shape0.w + input_shape1.w)) { | |||
| result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z - input_shape0.w, (X))); | |||
| } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w)) { | |||
| result = READ_IMAGE(input2, smp_none, (int2)((Y)*input_shape2.w + Z - input_shape0.w - input_shape1.w, (X))); | |||
| } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w + input_shape3.w)) { | |||
| result = READ_IMAGE(input3, smp_none, | |||
| (int2)((Y)*input_shape3.w + Z - input_shape0.w - input_shape1.w - input_shape2.w, (X))); | |||
| } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w + input_shape3.w + input_shape4.w)) { | |||
| result = READ_IMAGE( | |||
| input4, smp_none, | |||
| (int2)((Y)*input_shape4.w + Z - input_shape0.w - input_shape1.w - input_shape2.w - input_shape3.w, (X))); | |||
| } else if (Z < | |||
| (input_shape0.w + input_shape1.w + input_shape2.w + input_shape3.w + input_shape4.w + input_shape5.w)) { | |||
| result = READ_IMAGE(input5, smp_none, | |||
| (int2)((Y)*input_shape5.w + Z - input_shape0.w - input_shape1.w - input_shape2.w - | |||
| input_shape3.w - input_shape4.w, | |||
| (X))); | |||
| } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w + input_shape3.w + input_shape4.w + input_shape5.w + | |||
| input_shape6.w)) { | |||
| result = READ_IMAGE(input6, smp_none, | |||
| (int2)((Y)*input_shape6.w + Z - input_shape0.w - input_shape1.w - input_shape2.w - | |||
| input_shape3.w - input_shape4.w - input_shape5.w, | |||
| (X))); | |||
| } else { | |||
| result = READ_IMAGE(input7, smp_none, | |||
| (int2)((Y)*input_shape7.w + Z - input_shape0.w - input_shape1.w - input_shape2.w - | |||
| input_shape3.w - input_shape4.w - input_shape5.w - input_shape6.w, | |||
| (X))); | |||
| } | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||
| } | |||
| @@ -66,7 +66,7 @@ int ConcatOpenCLKernel::Init() { | |||
| } | |||
| std::string kernel_name = "Concat"; | |||
| if (in_tensors_.size() == 2 || in_tensors_.size() == 3 || in_tensors_.size() == 4 || in_tensors_.size() == 4) { | |||
| if (in_tensors_.size() == 2 || in_tensors_.size() == 3 || in_tensors_.size() == 4 || in_tensors_.size() == 6) { | |||
| kernel_name += std::to_string(in_tensors_.size()) + "inputaxis" + std::to_string(param->axis_); | |||
| } else { | |||
| MS_LOG(ERROR) << " input must be 2 , 3 , 4 or 6"; | |||
| @@ -83,41 +83,9 @@ int ConcatOpenCLKernel::Init() { | |||
| return RET_OK; | |||
| } | |||
| int ConcatOpenCLKernel::IntegraShapeToXYZ() { | |||
| if (out_tensors_[0]->shape().size() > 4 || out_tensors_[0]->shape().empty()) { | |||
| MS_LOG(ERROR) << "in_tensors_.shape() must between 0~4"; | |||
| return RET_ERROR; | |||
| } | |||
| if (out_tensors_[0]->shape().size() == 4) { | |||
| for (int i = 0; i < in_tensors_.size(); ++i) { | |||
| cl_int4 temp_cl; | |||
| auto temp = in_tensors_[i]->shape(); | |||
| temp_cl = {temp[0], temp[1], temp[2], UP_DIV(temp[3], C4NUM)}; | |||
| XYZShape.push_back(temp_cl); | |||
| } | |||
| } else { | |||
| for (int i = 0; i < in_tensors_.size(); ++i) { | |||
| auto temp = in_tensors_[i]->shape(); | |||
| for (int j = temp.size(); j < C4NUM; ++j) { | |||
| temp.push_back(1); | |||
| } | |||
| cl_int4 temp_cl = {temp[0], temp[1], temp[2], UP_DIV(temp[3], C4NUM)}; | |||
| XYZShape.push_back(temp_cl); | |||
| } | |||
| auto temp = out_tensors_[0]->shape(); | |||
| for (int i = out_tensors_[0]->shape().size(); i < C4NUM; ++i) { | |||
| temp.push_back(1); | |||
| } | |||
| } | |||
| shape_nhwc = {out_tensors_[0]->shape()[0] * out_tensors_[0]->shape()[1], out_tensors_[0]->shape()[2], | |||
| UP_DIV(out_tensors_[0]->shape()[3], C4NUM)}; | |||
| return RET_OK; | |||
| } | |||
| void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) { | |||
| const int max_divider = 8; | |||
| const int max_x = 4, max_y = 8; | |||
| const int max_x = 2, max_y = 8; | |||
| int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x); | |||
| int yz = max_size / x; | |||
| int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y); | |||
| @@ -137,11 +105,12 @@ int ConcatOpenCLKernel::Run() { | |||
| } | |||
| auto output_shape = out_tensors_[0]->shape(); | |||
| cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], UP_DIV(output_shape[3], C4NUM)}; | |||
| IntegraShapeToXYZ(); | |||
| const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize(); | |||
| std::vector<size_t> local = {1, 1, 1}; | |||
| std::vector<size_t> global = {static_cast<size_t>(shape_nhwc.s[0]), static_cast<size_t>(shape_nhwc.s[1]), | |||
| static_cast<size_t>(shape_nhwc.s[2])}; | |||
| uint32_t OH = output_shape_.s[0] * output_shape_.s[1]; | |||
| uint32_t OW = output_shape_.s[2]; | |||
| uint32_t OC = output_shape_.s[3]; | |||
| std::vector<size_t> global = {OH, OW, OC}; | |||
| ConcatGetWorkGroup(global, &local, max_global[0]); | |||
| if (in_tensors_.size() == 2 || in_tensors_.size() == 3 || in_tensors_.size() == 4 || in_tensors_.size() == 6) { | |||
| int arg_cn = 0; | |||
| @@ -149,8 +118,9 @@ int ConcatOpenCLKernel::Run() { | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[i]->data_c()); | |||
| } | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); | |||
| for (int i = 0; i < XYZShape.size(); ++i) { | |||
| cl_int4 temp = {XYZShape[i].s[0], XYZShape[i].s[1], XYZShape[i].s[2], XYZShape[i].s[3]}; | |||
| for (int i = 0; i < in_tensors_.size(); ++i) { | |||
| cl_int4 temp = {in_tensors_[i]->shape()[0], in_tensors_[i]->shape()[1], in_tensors_[i]->shape()[2], | |||
| UP_DIV(in_tensors_[i]->shape()[3], C4NUM)}; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, temp); | |||
| } | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_); | |||
| @@ -38,11 +38,7 @@ class ConcatOpenCLKernel : public OpenCLKernel { | |||
| private: | |||
| int RunAxis0(); | |||
| int IntegraShapeToXYZ(); | |||
| cl::Kernel kernel_; | |||
| std::vector<cl_int3> XYZShape; | |||
| cl_int4 shape_nhwc{}; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -91,12 +91,12 @@ int ScaleOpenCLKernel::InitBuffer() { | |||
| } else if (in_tensors_[0]->GetFormat() == schema::Format_NHWC) { | |||
| if (in_tensors_[1]->GetFormat() == schema::Format_NHWC) { | |||
| if (in_tensors_[0]->data_type() == kNumberTypeFloat32) { | |||
| float *scale = new (std::nothrow) float[pack_weight_size]; | |||
| auto *scale = new (std::nothrow) float[pack_weight_size]; | |||
| if (scale == nullptr) { | |||
| MS_LOG(ERROR) << "Malloc buffer failed!"; | |||
| return RET_ERROR; | |||
| } | |||
| float *offset = new (std::nothrow) float[pack_weight_size]; | |||
| auto *offset = new (std::nothrow) float[pack_weight_size]; | |||
| if (offset == nullptr) { | |||
| MS_LOG(ERROR) << "Malloc buffer failed!"; | |||
| delete[] scale; | |||
| @@ -110,12 +110,12 @@ int ScaleOpenCLKernel::InitBuffer() { | |||
| delete[] scale; | |||
| delete[] offset; | |||
| } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) { | |||
| float16_t *scale = new (std::nothrow) float16_t[pack_weight_size]; | |||
| auto *scale = new (std::nothrow) float16_t[pack_weight_size]; | |||
| if (scale == nullptr) { | |||
| MS_LOG(ERROR) << "Malloc buffer failed!"; | |||
| return RET_ERROR; | |||
| } | |||
| float16_t *offset = new (std::nothrow) float16_t[pack_weight_size]; | |||
| auto *offset = new (std::nothrow) float16_t[pack_weight_size]; | |||
| if (offset == nullptr) { | |||
| MS_LOG(ERROR) << "Malloc buffer failed!"; | |||
| delete[] scale; | |||
| @@ -146,15 +146,14 @@ int ScaleOpenCLKernel::InitBuffer() { | |||
| int ScaleOpenCLKernel::Init() { | |||
| std::string kernel_name; | |||
| const ScaleParameter *scale_param = reinterpret_cast<const ScaleParameter *>(op_parameter_); | |||
| auto *scale_param = reinterpret_cast<const ScaleParameter *>(op_parameter_); | |||
| auto in_tensor = in_tensors_.at(0); | |||
| auto in_shape = in_tensor->shape(); | |||
| auto scale_tensor = in_tensors_.at(1); | |||
| auto scale_shape = scale_tensor->shape(); | |||
| axis_ = scale_param->axis_; | |||
| if (axis_ < 0) { | |||
| axis_ = axis_ + in_shape.size(); | |||
| axis_ += in_shape.size(); | |||
| } | |||
| if (scale_shape.size() != in_shape.size()) { | |||
| if (scale_tensor->ElementsNum() == 1) { | |||
| @@ -197,6 +196,13 @@ int ScaleOpenCLKernel::Init() { | |||
| int ScaleOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| auto *param = reinterpret_cast<const ScaleParameter *>(op_parameter_); | |||
| cl_int act_type = 0; | |||
| if (param->activation_type_ == ActType_Relu) { | |||
| act_type = 1; | |||
| } else if (param->activation_type_ == ActType_Relu6) { | |||
| act_type = 3; | |||
| } | |||
| int arg_idx = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | |||
| @@ -227,6 +233,7 @@ int ScaleOpenCLKernel::Run() { | |||
| if (element_flag_ && scale_C_flag_) { | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, UP_DIV(in_tensors_[1]->shape()[0], C4NUM)); | |||
| } | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, act_type); | |||
| ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); | |||
| return RET_OK; | |||
| } | |||
| @@ -0,0 +1,211 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/runtime/kernel/opencl/kernel/stack.h" | |||
| #include <cstring> | |||
| #include <string> | |||
| #include <algorithm> | |||
| #include <set> | |||
| #include "src/kernel_registry.h" | |||
| #include "src/runtime/kernel/opencl/utils.h" | |||
| #include "src/runtime/kernel/opencl/cl/stack.cl.inc" | |||
| using mindspore::kernel::KERNEL_ARCH::kGPU; | |||
| using mindspore::lite::KernelRegistrar; | |||
| using mindspore::schema::PrimitiveType_Stack; | |||
| namespace mindspore::kernel { | |||
| int StackOpenCLKernel::RunAxis0() { | |||
| auto allocator_ = ocl_runtime_->GetAllocator(); | |||
| std::vector<size_t> img_size; | |||
| auto dst_data = out_tensors_[0]->data_c(); | |||
| auto dst_origin = cl::array<cl::size_type, 3U>{0, 0, 0}; | |||
| cl::Image2D *out_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(dst_data)); | |||
| for (int i = 0; i < in_tensors_.size(); i++) { | |||
| auto src_data = in_tensors_[i]->data_c(); | |||
| allocator_->GetImageSize(src_data, &img_size); | |||
| auto src_origin = cl::array<cl::size_type, 3U>{0, 0, 0}; | |||
| auto region = cl::array<cl::size_type, 3U>{img_size[0], img_size[1], 1}; | |||
| cl::Image2D *input_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data)); | |||
| ocl_runtime_->GetDefaultCommandQueue()->enqueueCopyImage(*input_image, *out_image, src_origin, dst_origin, region); | |||
| dst_origin[1] += region[1]; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int StackOpenCLKernel::Init() { | |||
| if (in_tensors_[0]->shape().size() > 4 || in_tensors_[0]->shape().size() <= 0) { | |||
| MS_LOG(ERROR) << " only support dim <= 4 "; | |||
| return RET_ERROR; | |||
| } | |||
| auto param = reinterpret_cast<StackParameter *>(this->op_parameter_); | |||
| axis_ = param->axis_; | |||
| axis_ = axis_ < 0 ? axis_ + in_tensors_[0]->shape().size() + 1 : axis_; | |||
| if (in_tensors_[0]->shape().size() != 4) { | |||
| if (in_tensors_[0]->shape().size() == 2) { | |||
| axis_ = axis_ + 2; | |||
| } | |||
| } | |||
| if (param->axis_ < -3 || param->axis_ > 3) { | |||
| MS_LOG(ERROR) << " only support axis >= -3 and axis <= 3 "; | |||
| return RET_ERROR; | |||
| } | |||
| std::string kernel_name = "stack"; | |||
| if (in_tensors_.size() == 8) { | |||
| kernel_name += "8inputaxis" + std::to_string(axis_); | |||
| } else { | |||
| MS_LOG(ERROR) << " input must be 8"; | |||
| return RET_ERROR; | |||
| } | |||
| MS_LOG(DEBUG) << "kernel_name=: " << kernel_name; | |||
| std::set<std::string> build_options; | |||
| std::string source = stack_source; | |||
| std::string program_name = "stack"; | |||
| ocl_runtime_->LoadSource(program_name, source); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||
| return RET_OK; | |||
| } | |||
| int StackOpenCLKernel::ReSize() { return RET_OK; } | |||
| void StackGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) { | |||
| const int max_divider = 8; | |||
| const int max_x = 4, max_y = 8; | |||
| int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x); | |||
| int yz = max_size / x; | |||
| int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y); | |||
| int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2))); | |||
| local->clear(); | |||
| local->push_back(x); | |||
| local->push_back(y); | |||
| local->push_back(z); | |||
| } | |||
| int StackOpenCLKernel::InferInTensorShapeTo4D(int *arg_cn) { | |||
| if (in_tensors_.size() == 8) { | |||
| int size = in_tensors_[0]->shape().size(); | |||
| switch (size) { | |||
| case 1: | |||
| for (int i = 0; i < in_tensors_.size(); ++i) { | |||
| ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, in_tensors_[i]->data_c()); | |||
| } | |||
| ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, out_tensors_[0]->data_c()); | |||
| for (int i = 0; i < in_tensors_.size(); ++i) { | |||
| cl_int4 temp = {in_tensors_[i]->shape()[0], 1, 1, 1}; | |||
| ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, temp); | |||
| } | |||
| break; | |||
| case 2: | |||
| for (int i = 0; i < in_tensors_.size(); ++i) { | |||
| ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, in_tensors_[i]->data_c()); | |||
| } | |||
| ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, out_tensors_[0]->data_c()); | |||
| for (int i = 0; i < in_tensors_.size(); ++i) { | |||
| cl_int4 temp = {in_tensors_[i]->shape()[0], 1, 1, UP_DIV(in_tensors_[i]->shape()[1], C4NUM)}; | |||
| ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, temp); | |||
| } | |||
| break; | |||
| case 3: | |||
| for (int i = 0; i < in_tensors_.size(); ++i) { | |||
| ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, in_tensors_[i]->data_c()); | |||
| } | |||
| ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, out_tensors_[0]->data_c()); | |||
| for (int i = 0; i < in_tensors_.size(); ++i) { | |||
| cl_int4 temp = {in_tensors_[i]->shape()[0], 1, in_tensors_[i]->shape()[1], | |||
| UP_DIV(in_tensors_[i]->shape()[2], C4NUM)}; | |||
| ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, temp); | |||
| } | |||
| break; | |||
| default: | |||
| MS_LOG(ERROR) << "unsupported input size > 3 or size <= 0 :" << in_tensors_.size(); | |||
| return RET_ERROR; | |||
| } | |||
| } else { | |||
| MS_LOG(ERROR) << "unsupported input size :" << in_tensors_.size(); | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int StackOpenCLKernel::InferOutTensorShapeTo4D(cl_int4 *output_shape) { | |||
| std::vector<int> out_shape = out_tensors_[0]->shape(); | |||
| if (out_shape.size() == 3) { | |||
| N_ = out_shape[0]; | |||
| C_ = out_shape[1] * UP_DIV(out_shape[2], C4NUM); | |||
| } else if (out_shape.size() == 4) { | |||
| if (axis_ == 1) { | |||
| N_ = out_shape[0]; | |||
| H_ = out_shape[1]; | |||
| W_ = out_shape[2]; | |||
| C_ = UP_DIV(out_shape[3], C4NUM); | |||
| } else { | |||
| MS_LOG(ERROR) << "Unsupported out_shape.size=: " << out_shape.size() << " axis=: " << axis_; | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| OH_ = N_ * H_; | |||
| OW_ = W_; | |||
| OC_ = C_; | |||
| output_shape->s[0] = N_; | |||
| output_shape->s[1] = H_; | |||
| output_shape->s[2] = W_; | |||
| output_shape->s[3] = C_; | |||
| return RET_OK; | |||
| } | |||
| int StackOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running! "; | |||
| if (axis_ == 0) { | |||
| return RunAxis0(); | |||
| } | |||
| cl_int4 output_shape = {1, 1, 1, 1}; | |||
| const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize(); | |||
| std::vector<size_t> local = {1, 1, 1}; | |||
| int arg_cn = 0; | |||
| InferInTensorShapeTo4D(&arg_cn); | |||
| InferOutTensorShapeTo4D(&output_shape); | |||
| std::vector<size_t> global = {OH_, OW_, OC_}; | |||
| StackGetWorkGroup(global, &local, max_global[0]); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape); | |||
| ocl_runtime_->RunKernel(kernel_, global, local, nullptr); | |||
| return RET_OK; | |||
| } | |||
| kernel::LiteKernel *OpenCLStackKernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | |||
| const lite::InnerContext *ctx, const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| auto *kernel = new (std::nothrow) StackOpenCLKernel(opParameter, inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << " new StackOpenCLKernel failed "; | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << " Init kernel failed, name: Stack "; | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| } | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Stack, OpenCLStackKernelCreator); | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Stack, OpenCLStackKernelCreator); | |||
| } // namespace mindspore::kernel | |||
| @@ -0,0 +1,59 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_STACK_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_STACK_H_ | |||
| #include <vector> | |||
| #include "src/runtime/kernel/opencl/opencl_kernel.h" | |||
| #include "nnacl/stack_parameter.h" | |||
| namespace mindspore::kernel { | |||
| class StackOpenCLKernel : public OpenCLKernel { | |||
| public: | |||
| explicit StackOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs) | |||
| : OpenCLKernel(parameter, inputs, outputs) {} | |||
| ~StackOpenCLKernel() override{}; | |||
| int Init() override; | |||
| int ReSize() override; | |||
| int Run() override; | |||
| private: | |||
| int RunAxis0(); | |||
| int InferInTensorShapeTo4D(int *arg_cn); | |||
| int InferOutTensorShapeTo4D(cl_int4 *output_shape); | |||
| cl::Kernel kernel_; | |||
| int axis_{0}; | |||
| size_t N_{1}; | |||
| size_t H_{1}; | |||
| size_t W_{1}; | |||
| size_t C_{1}; | |||
| size_t OH_{1}; | |||
| size_t OW_{1}; | |||
| size_t OC_{1}; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif | |||
| @@ -7,3 +7,12 @@ hiai_cn_recognize_modify_padv2.tflite | |||
| hiai_cv_focusShootOCRModel_08.tflite | |||
| hiai_model_normalize_object_scene_ps_20200519.tflite | |||
| inception_v3.tflite | |||
| mtk_isface.tflite | |||
| mtk_landmark.tflite | |||
| mtk_new_detect.tflite | |||
| mtk_pose.tflite | |||
| mtk_model_emotions_0727_nosoftmax.tflite | |||
| mtk_model_normalize_object_scene_ps_20200826_f32_no_softmax.tflite | |||
| mtk_276landmark_0913.tflite | |||
| mtk_face_recognition.tflite | |||
| mtk_convert_model.tflite | |||
| @@ -119,3 +119,14 @@ magenta_arbitrary-image-stylization-v1-256_fp16_prediction_1.tflite | |||
| lite-model_cartoongan_fp16_1.tflite | |||
| lite-model_arbitrary-image-stylization-inceptionv3_fp16_predict_1.tflite | |||
| gts_detect_5k_tf115.tflite | |||
| mtk_isface.tflite | |||
| mtk_landmark.tflite | |||
| mtk_new_detect.tflite | |||
| mtk_pose.tflite | |||
| mtk_age_gender_fp16.tflite | |||
| mtk_model_emotions_0727_nosoftmax.tflite | |||
| mtk_model_face_dress_fp16.tflite | |||
| mtk_model_normalize_object_scene_ps_20200826_f32_no_softmax.tflite | |||
| mtk_276landmark_0913.tflite | |||
| mtk_face_recognition.tflite | |||
| mtk_convert_model.tflite | |||
| @@ -766,7 +766,7 @@ models_onnx_config=${basepath}/models_onnx.cfg | |||
| models_fp16_config=${basepath}/models_fp16.cfg | |||
| models_mindspore_config=${basepath}/models_mindspore.cfg | |||
| models_mindspore_train_config=${basepath}/models_mindspore_train.cfg | |||
| models_tflite_gpu_config=${basepath}/models_tflite_gpu.cfg | |||
| models_tflite_gpu_config=${basepath}/models_fp32_gpu.cfg | |||
| models_fp16_gpu_config=${basepath}/models_fp16_gpu.cfg | |||
| models_arm32_config=${basepath}/models_arm32.cfg | |||
| models_compatibility_config=${basepath}/models_compatibility.cfg | |||
| @@ -37,14 +37,6 @@ class TestConcatOpenCLCI : public mindspore::CommonTest { | |||
| TestConcatOpenCLCI() {} | |||
| }; | |||
// Element-wise comparison helper: asserts |output_data[i] - correct_data[i]| <= err_bound
// for every one of `size` elements. Fails the enclosing gtest on the first violation.
template <typename T>
void CompareOutputData1(T *output_data, T *correct_data, int size, float err_bound) {
  // `size` is a signed int, so use an int index — the original `size_t i < size`
  // mixed signed and unsigned operands in the loop condition.
  for (int i = 0; i < size; i++) {
    T abs = fabs(output_data[i] - correct_data[i]);
    ASSERT_LE(abs, err_bound);
  }
}
| TEST_F(TestConcatOpenCLCI, ConcatFp32_2inputforCI) { | |||
| MS_LOG(INFO) << " begin test "; | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance(); | |||
| @@ -132,7 +124,7 @@ TEST_F(TestConcatOpenCLCI, ConcatFp32_2inputforCI) { | |||
| std::cout << "==================output data================" << std::endl; | |||
| sub_graph->Run(); | |||
| auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->data_c()); | |||
| CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.00001); | |||
| CompareOutputData(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.00001); | |||
| for (auto tensor : inputs) { | |||
| tensor->SetData(nullptr); | |||
| delete tensor; | |||
| @@ -260,7 +252,7 @@ TEST_F(TestConcatOpenCLfp16, ConcatFp16_4input_dim4_axis1) { | |||
| std::cout << "==================output data================" << std::endl; | |||
| sub_graph->Run(); | |||
| auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->data_c()); | |||
| CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001); | |||
| CompareOutputData(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001); | |||
| for (auto tensor : inputs) { | |||
| tensor->SetData(nullptr); | |||
| delete tensor; | |||
| @@ -379,7 +371,7 @@ TEST_F(TestConcatOpenCLfp32, ConcatFp32_3input_dim4_axis1) { | |||
| std::cout << "==================output data================" << std::endl; | |||
| sub_graph->Run(); | |||
| auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->data_c()); | |||
| CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.00001); | |||
| CompareOutputData(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.00001); | |||
| for (auto tensor : inputs) { | |||
| tensor->SetData(nullptr); | |||
| delete tensor; | |||
| @@ -518,7 +510,7 @@ TEST_F(TestConcatOpenCLfp16, ConcatFp16_6input_dim4_axis1) { | |||
| std::cout << "==================output data================" << std::endl; | |||
| sub_graph->Run(); | |||
| auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->MutableData()); | |||
| CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001); | |||
| CompareOutputData(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001); | |||
| for (auto tensor : inputs) { | |||
| tensor->SetData(nullptr); | |||
| delete tensor; | |||
| @@ -0,0 +1,283 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <iostream> | |||
| #include <memory> | |||
| #include "common/common_test.h" | |||
| #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" | |||
| #include "mindspore/lite/src/common/file_utils.h" | |||
| #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" | |||
| #include "mindspore/lite/src/runtime/kernel/opencl/kernel/stack.h" | |||
| namespace mindspore { | |||
| // Test fixture for fp32 Stack OpenCL kernel tests that run in CI. | |||
| class TestStackOpenCLCI : public mindspore::CommonTest { | |||
| public: | |||
| TestStackOpenCLCI() {} | |||
| }; | |||
| // Test fixture for fp16 Stack OpenCL kernel tests (inputs loaded from .bin fixtures). | |||
| class TestStackOpenCLfp16 : public mindspore::CommonTest { | |||
| public: | |||
| TestStackOpenCLfp16() {} | |||
| }; | |||
| TEST_F(TestStackOpenCLCI, StackFp32_8inputforCI) { | |||
| MS_LOG(INFO) << " begin test "; | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance(); | |||
| ocl_runtime->Init(); | |||
| auto allocator = ocl_runtime->GetAllocator(); | |||
| MS_LOG(INFO) << " init tensors "; | |||
| constexpr int INPUT_NUM = 8; | |||
| std::array<std::vector<int>, INPUT_NUM> input_shapes = { | |||
| std::vector<int>{1, 1, 8}, std::vector<int>{1, 1, 8}, std::vector<int>{1, 1, 8}, std::vector<int>{1, 1, 8}, | |||
| std::vector<int>{1, 1, 8}, std::vector<int>{1, 1, 8}, std::vector<int>{1, 1, 8}, std::vector<int>{1, 1, 8}}; | |||
| std::vector<int> output_shape = {8, 1, 1, 8}; | |||
| auto data_type = kNumberTypeFloat32; | |||
| auto tensor_type = lite::TensorCategory(schema::NodeType_ValueNode); | |||
| float input_data1[] = {0.75f, 0.06f, 0.74f, 0.30f, 0.9f, 0.59f, 0.03f, 0.37f}; | |||
| float input_data2[] = {0.5f, 0.6f, 0.74f, 0.23f, 0.46f, 0.69f, 0.13f, 0.47f}; | |||
| float input_data3[] = {0.31f, 0.63f, 0.84f, 0.43f, 0.56f, 0.79f, 0.12f, 0.57f}; | |||
| float input_data4[] = {0.35f, 0.26f, 0.17f, 0.33f, 0.66f, 0.89f, 0.93f, 0.77f}; | |||
| float input_data5[] = {0.57f, 0.6f, 0.84f, 0.83f, 0.48f, 0.78f, 0.63f, 0.87f}; | |||
| float input_data6[] = {0.66f, 0.56f, 0.64f, 0.63f, 0.56f, 0.59f, 0.73f, 0.37f}; | |||
| float input_data7[] = {0.35f, 0.26f, 0.54f, 0.33f, 0.76f, 0.59f, 0.73f, 0.34f}; | |||
| float input_data8[] = {0.15f, 0.36f, 0.44f, 0.73f, 0.56f, 0.49f, 0.93f, 0.37f}; | |||
| float correctOutput[] = {0.75f, 0.06f, 0.74f, 0.30f, 0.9f, 0.59f, 0.03f, 0.37f, 0.5f, 0.6f, 0.74f, 0.23f, 0.46f, | |||
| 0.69f, 0.13f, 0.47f, 0.31f, 0.63f, 0.84f, 0.43f, 0.56f, 0.79f, 0.12f, 0.57f, 0.35f, 0.26f, | |||
| 0.17f, 0.33f, 0.66f, 0.89f, 0.93f, 0.77f, 0.57f, 0.6f, 0.84f, 0.83f, 0.48f, 0.78f, 0.63f, | |||
| 0.87f, 0.66f, 0.56f, 0.64f, 0.63f, 0.56f, 0.59f, 0.73f, 0.37f, 0.35f, 0.26f, 0.54f, 0.33f, | |||
| 0.76f, 0.59f, 0.73f, 0.34f, 0.15f, 0.36f, 0.44f, 0.73f, 0.56f, 0.49f, 0.93f, 0.37f}; | |||
| auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, output_shape, schema::Format_NHWC, tensor_type); | |||
| if (output_tensor == nullptr) { | |||
| MS_LOG(INFO) << " new output_tensor failed "; | |||
| return; | |||
| } | |||
| std::vector<lite::Tensor *> inputs; | |||
| std::vector<lite::Tensor *> outputs{output_tensor}; | |||
| for (auto &shape : input_shapes) { | |||
| auto input_temp = new (std::nothrow) lite::Tensor(data_type, shape, schema::Format_NHWC, tensor_type); | |||
| inputs.push_back(input_temp); | |||
| if (input_temp == nullptr) { | |||
| MS_LOG(INFO) << " new input_tensor failed "; | |||
| return; | |||
| } | |||
| } | |||
| MS_LOG(INFO) << " initialize tensors "; | |||
| auto param = reinterpret_cast<StackParameter *>(malloc(sizeof(StackParameter))); | |||
| if (param == nullptr) { | |||
| MS_LOG(INFO) << " new StackParameter failed "; | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| for (auto tensor : outputs) { | |||
| delete tensor; | |||
| } | |||
| return; | |||
| } | |||
| param->axis_ = 0; | |||
| auto *stack_kernel = | |||
| new (std::nothrow) kernel::StackOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| if (stack_kernel == nullptr) { | |||
| MS_LOG(INFO) << " new kernel::StackOpenCLKernel failed "; | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| for (auto tensor : outputs) { | |||
| delete tensor; | |||
| } | |||
| delete param; | |||
| return; | |||
| } | |||
| stack_kernel->Init(); | |||
| // to do allocate memory for inputs | |||
| for (auto &input_tensor : inputs) { | |||
| input_tensor->MallocData(allocator); | |||
| } | |||
| MS_LOG(INFO) << " initialize sub_graph "; | |||
| std::vector<kernel::LiteKernel *> kernels{stack_kernel}; | |||
| auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); | |||
| if (sub_graph == nullptr) { | |||
| MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed "; | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| for (auto tensor : outputs) { | |||
| delete tensor; | |||
| } | |||
| delete param; | |||
| delete stack_kernel; | |||
| return; | |||
| } | |||
| sub_graph->Init(); | |||
| MS_LOG(INFO) << " initialize input data "; | |||
| memcpy(inputs[0]->data_c(), input_data1, sizeof(input_data1)); | |||
| memcpy(inputs[1]->data_c(), input_data2, sizeof(input_data2)); | |||
| memcpy(inputs[2]->data_c(), input_data3, sizeof(input_data1)); | |||
| memcpy(inputs[3]->data_c(), input_data4, sizeof(input_data2)); | |||
| memcpy(inputs[4]->data_c(), input_data5, sizeof(input_data1)); | |||
| memcpy(inputs[5]->data_c(), input_data6, sizeof(input_data2)); | |||
| memcpy(inputs[6]->data_c(), input_data7, sizeof(input_data1)); | |||
| memcpy(inputs[7]->data_c(), input_data8, sizeof(input_data2)); | |||
| std::cout << "==================output data================" << std::endl; | |||
| sub_graph->Run(); | |||
| auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->data_c()); | |||
| CompareOutputData(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.00001); | |||
| for (auto tensor : inputs) { | |||
| tensor->SetData(nullptr); | |||
| delete tensor; | |||
| } | |||
| for (auto tensor : outputs) { | |||
| tensor->SetData(nullptr); | |||
| delete tensor; | |||
| } | |||
| delete sub_graph; | |||
| } | |||
| TEST_F(TestStackOpenCLfp16, StackFp32_8inputaxis1) { | |||
| MS_LOG(INFO) << " begin test "; | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance(); | |||
| ocl_runtime->SetFp16Enable(true); | |||
| ocl_runtime->Init(); | |||
| auto allocator = ocl_runtime->GetAllocator(); | |||
| // get the input from .bin | |||
| size_t input1_size, input2_size, input3_size, input4_size, input5_size, input6_size, input7_size, input8_size, | |||
| output_size; | |||
| std::string input1Ppath = "./test_data/stackfp16_input1.bin"; | |||
| std::string input2Ppath = "./test_data/stackfp16_input2.bin"; | |||
| std::string input3Ppath = "./test_data/stackfp16_input3.bin"; | |||
| std::string input4Ppath = "./test_data/stackfp16_input4.bin"; | |||
| std::string input5Ppath = "./test_data/stackfp16_input5.bin"; | |||
| std::string input6Ppath = "./test_data/stackfp16_input6.bin"; | |||
| std::string input7Ppath = "./test_data/stackfp16_input7.bin"; | |||
| std::string input8Ppath = "./test_data/stackfp16_input8.bin"; | |||
| std::string correctOutputPath = "./test_data/stackfp16_output.bin"; | |||
| auto input_data1 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size)); | |||
| auto input_data2 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input2Ppath.c_str(), &input2_size)); | |||
| auto input_data3 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input3Ppath.c_str(), &input3_size)); | |||
| auto input_data4 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input4Ppath.c_str(), &input4_size)); | |||
| auto input_data5 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input5Ppath.c_str(), &input5_size)); | |||
| auto input_data6 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input6Ppath.c_str(), &input6_size)); | |||
| auto input_data7 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input7Ppath.c_str(), &input7_size)); | |||
| auto input_data8 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input8Ppath.c_str(), &input8_size)); | |||
| auto correctOutput = | |||
| reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size)); | |||
| MS_LOG(INFO) << " init tensors "; | |||
| constexpr int INPUT_NUM = 8; | |||
| std::array<std::vector<int>, INPUT_NUM> input_shapes = { | |||
| std::vector<int>{1, 17, 18}, std::vector<int>{1, 17, 18}, std::vector<int>{1, 17, 18}, std::vector<int>{1, 17, 18}, | |||
| std::vector<int>{1, 17, 18}, std::vector<int>{1, 17, 18}, std::vector<int>{1, 17, 18}, std::vector<int>{1, 17, 18}}; | |||
| std::vector<int> output_shape = {1, 8, 17, 18}; | |||
| auto data_type = kNumberTypeFloat16; | |||
| auto tensor_type = lite::TensorCategory(schema::NodeType_ValueNode); | |||
| std::vector<lite::Tensor *> inputs; | |||
| for (auto &shape : input_shapes) { | |||
| auto input_temp = new (std::nothrow) lite::Tensor(data_type, shape, schema::Format_NHWC, tensor_type); | |||
| inputs.push_back(input_temp); | |||
| if (input_temp == nullptr) { | |||
| MS_LOG(INFO) << " new input_tensor failed "; | |||
| return; | |||
| } | |||
| } | |||
| auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, output_shape, schema::Format_NHWC, tensor_type); | |||
| if (output_tensor == nullptr) { | |||
| MS_LOG(INFO) << " new output_tensor failed "; | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| return; | |||
| } | |||
| std::vector<lite::Tensor *> outputs{output_tensor}; | |||
| MS_LOG(INFO) << " input_shapes size =: " << input_shapes.size(); | |||
| MS_LOG(INFO) << " initialize tensors "; | |||
| auto param = reinterpret_cast<StackParameter *>(malloc(sizeof(StackParameter))); | |||
| if (param == nullptr) { | |||
| MS_LOG(INFO) << " new StackParameter failed "; | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| for (auto tensor : outputs) { | |||
| delete tensor; | |||
| } | |||
| return; | |||
| } | |||
| param->axis_ = 1; | |||
| auto *stack_kernel = | |||
| new (std::nothrow) kernel::StackOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| if (stack_kernel == nullptr) { | |||
| MS_LOG(INFO) << " new kernel::StackOpenCLKernel failed "; | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| for (auto tensor : outputs) { | |||
| delete tensor; | |||
| } | |||
| delete param; | |||
| return; | |||
| } | |||
| stack_kernel->Init(); | |||
| // to allocate memory for inputs and outputs | |||
| for (auto &input_tensor : inputs) { | |||
| input_tensor->MallocData(allocator); | |||
| } | |||
| MS_LOG(INFO) << " initialize sub_graph "; | |||
| std::vector<kernel::LiteKernel *> kernels{stack_kernel}; | |||
| auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); | |||
| if (sub_graph == nullptr) { | |||
| MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed "; | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| for (auto tensor : outputs) { | |||
| delete tensor; | |||
| } | |||
| delete param; | |||
| delete stack_kernel; | |||
| return; | |||
| } | |||
| sub_graph->Init(); | |||
| MS_LOG(INFO) << " initialize input data "; | |||
| if (inputs.size() == 8) { | |||
| memcpy(inputs[0]->data_c(), input_data1, input1_size); | |||
| memcpy(inputs[1]->data_c(), input_data2, input2_size); | |||
| memcpy(inputs[2]->data_c(), input_data3, input3_size); | |||
| memcpy(inputs[3]->data_c(), input_data4, input4_size); | |||
| memcpy(inputs[4]->data_c(), input_data5, input5_size); | |||
| memcpy(inputs[5]->data_c(), input_data6, input6_size); | |||
| memcpy(inputs[6]->data_c(), input_data7, input7_size); | |||
| memcpy(inputs[7]->data_c(), input_data8, input8_size); | |||
| } else { | |||
| MS_LOG(ERROR) << " input size must be 2 or 3 or 4"; | |||
| } | |||
| std::cout << "==================output data================" << std::endl; | |||
| sub_graph->Run(); | |||
| auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->MutableData()); | |||
| CompareOutputData(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001); | |||
| for (auto tensor : inputs) { | |||
| tensor->SetData(nullptr); | |||
| delete tensor; | |||
| } | |||
| for (auto tensor : outputs) { | |||
| tensor->SetData(nullptr); | |||
| delete tensor; | |||
| } | |||
| delete sub_graph; | |||
| } | |||
| } // namespace mindspore | |||