@@ -7,14 +7,14 @@ __kernel void mean_NHWC4(__read_only image2d_t src_data, __write_only image2d_t
   if (X >= size.z) {
     return;
   }
-  FLT4 result = (FLT4)0.f;
+  float4 result = (float4)0.f;
   for (int h = 0; h < size.x; h++) {
     for (int w = 0; w < size.y; w++) {
-      result += READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h));
+      result += convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h)));
     }
   }
   result /= size.x * size.y;
-  WRITE_IMAGE(dst_data, (int2)(X, 0), result);
+  WRITE_IMAGE(dst_data, (int2)(X, 0), TO_FLT4(result));
 }
 __kernel void mean_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
@@ -1,8 +1,13 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
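+// Activation type codes passed in via act_type; they match the host-side mapping in ScaleOpenCLKernel::Run.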
+#define ActType_No 0
+#define ActType_Relu 1
+#define ActType_Sigmod 2
+#define ActType_Relu6 3
 __kernel void Scale_IMG(__read_only image2d_t input, __read_only image2d_t scale, __read_only image2d_t offset,
-                        __write_only image2d_t output, const int2 output_shape) {
+                        __write_only image2d_t output, const int2 output_shape, const int act_type) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
   if (X >= output_shape.x || Y >= output_shape.y) {
@@ -12,11 +17,17 @@ __kernel void Scale_IMG(__read_only image2d_t input, __read_only image2d_t scale
   FLT4 in = READ_IMAGE(input, smp_none, (int2)(X, Y));
   FLT4 s = READ_IMAGE(scale, smp_none, (int2)(X, Y));
   FLT4 o = READ_IMAGE(offset, smp_none, (int2)(X, Y));
-  WRITE_IMAGE(output, (int2)(X, Y), in * s + o);
+  FLT4 out = in * s + o;
+  if (act_type == ActType_Relu) {
+    out = max(out, (FLT4)(0.0f));
+  } else if (act_type == ActType_Relu6) {
+    out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f));
+  }
+  WRITE_IMAGE(output, (int2)(X, Y), out);
 }
 __kernel void BoardcastScale_IMG(__read_only image2d_t input, float scale, float offset, __write_only image2d_t output,
-                                 const int2 output_shape) {
+                                 const int2 output_shape, const int act_type) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
   if (X >= output_shape.x || Y >= output_shape.y) {
@@ -24,11 +35,17 @@ __kernel void BoardcastScale_IMG(__read_only image2d_t input, float scale, float
   }
   FLT4 in = READ_IMAGE(input, smp_none, (int2)(X, Y));
-  WRITE_IMAGE(output, (int2)(X, Y), in * (FLT)scale + (FLT)offset);
+  FLT4 out = in * (FLT)scale + (FLT)offset;
+  if (act_type == ActType_Relu) {
+    out = max(out, (FLT4)(0.0f));
+  } else if (act_type == ActType_Relu6) {
+    out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f));
+  }
+  WRITE_IMAGE(output, (int2)(X, Y), out);
 }
 __kernel void Scale_C_IMG(__read_only image2d_t input, __read_only image2d_t scale, __read_only image2d_t offset,
-                          __write_only image2d_t output, const int2 output_shape, const int C) {
+                          __write_only image2d_t output, const int2 output_shape, const int C, const int act_type) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
   if (X >= output_shape.x || Y >= output_shape.y || C == 0) {
@@ -38,5 +55,11 @@ __kernel void Scale_C_IMG(__read_only image2d_t input, __read_only image2d_t sca
   FLT4 in = READ_IMAGE(input, smp_none, (int2)(X, Y));
   FLT4 s = READ_IMAGE(scale, smp_none, (int2)(X % C, 0));
   FLT4 o = READ_IMAGE(offset, smp_none, (int2)(X % C, 0));
-  WRITE_IMAGE(output, (int2)(X, Y), in * s + o);
+  FLT4 out = in * s + o;
+  if (act_type == ActType_Relu) {
+    out = max(out, (FLT4)(0.0f));
+  } else if (act_type == ActType_Relu6) {
+    out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f));
+  }
+  WRITE_IMAGE(output, (int2)(X, Y), out);
 }
@@ -0,0 +1,137 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#define INT4 int4
+__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+#define CHECK_IDX_FOR_STACK \
+  int X = get_global_id(0); \
+  int Y = get_global_id(1); \
+  int Z = get_global_id(2); \
+  if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { \
+    return;                                                                                 \
+  }                                                                                         \
+  FLT4 result;
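+// Each stack kernel below picks the source image by comparing the global id against the running
+// sum of the per-input extents along the stacked axis, then reads with the local offset.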
+__kernel void stack8inputaxis1(__read_only image2d_t input0, __read_only image2d_t input1, __read_only image2d_t input2,
+                               __read_only image2d_t input3, __read_only image2d_t input4, __read_only image2d_t input5,
+                               __read_only image2d_t input6, __read_only image2d_t input7,
+                               __write_only image2d_t output, int4 input_shape0, int4 input_shape1, int4 input_shape2,
+                               int4 input_shape3, int4 input_shape4, int4 input_shape5, int4 input_shape6,
+                               int4 input_shape7, int4 output_shape) {
+  CHECK_IDX_FOR_STACK;
+  if (X < input_shape0.y) {
+    result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));
+  } else if (X < (input_shape0.y + input_shape1.y)) {
+    result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z, (X - input_shape0.y)));
+  } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y)) {
+    result = READ_IMAGE(input2, smp_none, (int2)((Y)*input_shape2.w + Z, (X - input_shape0.y - input_shape1.y)));
+  } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y + input_shape3.y)) {
+    result = READ_IMAGE(input3, smp_none,
+                        (int2)((Y)*input_shape3.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y)));
+  } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y + input_shape3.y + input_shape4.y)) {
+    result = READ_IMAGE(
+      input4, smp_none,
+      (int2)((Y)*input_shape4.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y - input_shape3.y)));
+  } else if (X <
+             (input_shape0.y + input_shape1.y + input_shape2.y + input_shape3.y + input_shape4.y + input_shape5.y)) {
+    result = READ_IMAGE(input5, smp_none,
+                        (int2)((Y)*input_shape5.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y -
+                                                        input_shape3.y - input_shape4.y)));
+  } else if (X < (input_shape0.y + input_shape1.y + input_shape2.y + input_shape3.y + input_shape4.y + input_shape5.y +
+                  input_shape6.y)) {
+    result = READ_IMAGE(input6, smp_none,
+                        (int2)((Y)*input_shape6.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y -
+                                                        input_shape3.y - input_shape4.y - input_shape5.y)));
+  } else {
+    result =
+      READ_IMAGE(input7, smp_none,
+                 (int2)((Y)*input_shape7.w + Z, (X - input_shape0.y - input_shape1.y - input_shape2.y - input_shape3.y -
+                                                 input_shape4.y - input_shape5.y - input_shape6.y)));
+  }
+  WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
+}
+__kernel void stack8inputaxis2(__read_only image2d_t input0, __read_only image2d_t input1, __read_only image2d_t input2,
+                               __read_only image2d_t input3, __read_only image2d_t input4, __read_only image2d_t input5,
+                               __read_only image2d_t input6, __read_only image2d_t input7,
+                               __write_only image2d_t output, int4 input_shape0, int4 input_shape1, int4 input_shape2,
+                               int4 input_shape3, int4 input_shape4, int4 input_shape5, int4 input_shape6,
+                               int4 input_shape7, int4 output_shape) {
+  CHECK_IDX_FOR_STACK;
+  if (Y < input_shape0.z) {
+    result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));
+  } else if (Y < (input_shape0.z + input_shape1.z)) {
+    result = READ_IMAGE(input1, smp_none, (int2)((Y - input_shape0.z) * input_shape1.w + Z, (X)));
+  } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z)) {
+    result = READ_IMAGE(input2, smp_none, (int2)((Y - input_shape0.z - input_shape1.z) * input_shape2.w + Z, (X)));
+  } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z + input_shape3.z)) {
+    result = READ_IMAGE(input3, smp_none,
+                        (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z) * input_shape3.w + Z, (X)));
+  } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z + input_shape3.z + input_shape4.z)) {
+    result = READ_IMAGE(
+      input4, smp_none,
+      (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z - input_shape3.z) * input_shape4.w + Z, (X)));
+  } else if (Y <
+             (input_shape0.z + input_shape1.z + input_shape2.z + input_shape3.z + input_shape4.z + input_shape5.z)) {
+    result = READ_IMAGE(
+      input5, smp_none,
+      (int2)(
+        (Y - input_shape0.z - input_shape1.z - input_shape2.z - input_shape3.z - input_shape4.z) * input_shape5.w + Z,
+        (X)));
+  } else if (Y < (input_shape0.z + input_shape1.z + input_shape2.z + input_shape3.z + input_shape4.z + input_shape5.z +
+                  input_shape6.z)) {
+    result = READ_IMAGE(
+      input6, smp_none,
+      (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z - input_shape3.z - input_shape4.z - input_shape5.z) *
+               input_shape6.w +
+             Z,
+             (X)));
+  } else {
+    result = READ_IMAGE(input7, smp_none,
+                        (int2)((Y - input_shape0.z - input_shape1.z - input_shape2.z - input_shape3.z - input_shape4.z -
+                                input_shape5.z - input_shape6.z) *
+                                 input_shape7.w +
+                               Z,
+                               (X)));
+  }
+  WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
+}
+__kernel void stack8inputaxis3(__read_only image2d_t input0, __read_only image2d_t input1, __read_only image2d_t input2,
+                               __read_only image2d_t input3, __read_only image2d_t input4, __read_only image2d_t input5,
+                               __read_only image2d_t input6, __read_only image2d_t input7,
+                               __write_only image2d_t output, int4 input_shape0, int4 input_shape1, int4 input_shape2,
+                               int4 input_shape3, int4 input_shape4, int4 input_shape5, int4 input_shape6,
+                               int4 input_shape7, int4 output_shape) {
+  CHECK_IDX_FOR_STACK;
+  if (Z < input_shape0.w) {
+    result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X)));
+  } else if (Z < (input_shape0.w + input_shape1.w)) {
+    result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z - input_shape0.w, (X)));
+  } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w)) {
+    result = READ_IMAGE(input2, smp_none, (int2)((Y)*input_shape2.w + Z - input_shape0.w - input_shape1.w, (X)));
+  } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w + input_shape3.w)) {
+    result = READ_IMAGE(input3, smp_none,
+                        (int2)((Y)*input_shape3.w + Z - input_shape0.w - input_shape1.w - input_shape2.w, (X)));
+  } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w + input_shape3.w + input_shape4.w)) {
+    result = READ_IMAGE(
+      input4, smp_none,
+      (int2)((Y)*input_shape4.w + Z - input_shape0.w - input_shape1.w - input_shape2.w - input_shape3.w, (X)));
+  } else if (Z <
+             (input_shape0.w + input_shape1.w + input_shape2.w + input_shape3.w + input_shape4.w + input_shape5.w)) {
+    result = READ_IMAGE(input5, smp_none,
+                        (int2)((Y)*input_shape5.w + Z - input_shape0.w - input_shape1.w - input_shape2.w -
+                                 input_shape3.w - input_shape4.w,
+                               (X)));
+  } else if (Z < (input_shape0.w + input_shape1.w + input_shape2.w + input_shape3.w + input_shape4.w + input_shape5.w +
+                  input_shape6.w)) {
+    result = READ_IMAGE(input6, smp_none,
+                        (int2)((Y)*input_shape6.w + Z - input_shape0.w - input_shape1.w - input_shape2.w -
+                                 input_shape3.w - input_shape4.w - input_shape5.w,
+                               (X)));
+  } else {
+    result = READ_IMAGE(input7, smp_none,
+                        (int2)((Y)*input_shape7.w + Z - input_shape0.w - input_shape1.w - input_shape2.w -
+                                 input_shape3.w - input_shape4.w - input_shape5.w - input_shape6.w,
+                               (X)));
+  }
+  WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result);
+}
@@ -66,7 +66,7 @@ int ConcatOpenCLKernel::Init() {
   }
   std::string kernel_name = "Concat";
-  if (in_tensors_.size() == 2 || in_tensors_.size() == 3 || in_tensors_.size() == 4 || in_tensors_.size() == 4) {
+  if (in_tensors_.size() == 2 || in_tensors_.size() == 3 || in_tensors_.size() == 4 || in_tensors_.size() == 6) {
     kernel_name += std::to_string(in_tensors_.size()) + "inputaxis" + std::to_string(param->axis_);
   } else {
     MS_LOG(ERROR) << " input must be 2 , 3 , 4 or 6";
@@ -83,41 +83,9 @@ int ConcatOpenCLKernel::Init() {
   return RET_OK;
 }
-int ConcatOpenCLKernel::IntegraShapeToXYZ() {
-  if (out_tensors_[0]->shape().size() > 4 || out_tensors_[0]->shape().empty()) {
-    MS_LOG(ERROR) << "in_tensors_.shape() must between 0~4";
-    return RET_ERROR;
-  }
-  if (out_tensors_[0]->shape().size() == 4) {
-    for (int i = 0; i < in_tensors_.size(); ++i) {
-      cl_int4 temp_cl;
-      auto temp = in_tensors_[i]->shape();
-      temp_cl = {temp[0], temp[1], temp[2], UP_DIV(temp[3], C4NUM)};
-      XYZShape.push_back(temp_cl);
-    }
-  } else {
-    for (int i = 0; i < in_tensors_.size(); ++i) {
-      auto temp = in_tensors_[i]->shape();
-      for (int j = temp.size(); j < C4NUM; ++j) {
-        temp.push_back(1);
-      }
-      cl_int4 temp_cl = {temp[0], temp[1], temp[2], UP_DIV(temp[3], C4NUM)};
-      XYZShape.push_back(temp_cl);
-    }
-    auto temp = out_tensors_[0]->shape();
-    for (int i = out_tensors_[0]->shape().size(); i < C4NUM; ++i) {
-      temp.push_back(1);
-    }
-  }
-  shape_nhwc = {out_tensors_[0]->shape()[0] * out_tensors_[0]->shape()[1], out_tensors_[0]->shape()[2],
-                UP_DIV(out_tensors_[0]->shape()[3], C4NUM)};
-  return RET_OK;
-}
 void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
   const int max_divider = 8;
-  const int max_x = 4, max_y = 8;
+  const int max_x = 2, max_y = 8;
   int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
   int yz = max_size / x;
   int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
@@ -137,11 +105,12 @@ int ConcatOpenCLKernel::Run() {
   }
   auto output_shape = out_tensors_[0]->shape();
   cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], UP_DIV(output_shape[3], C4NUM)};
-  IntegraShapeToXYZ();
   const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
   std::vector<size_t> local = {1, 1, 1};
-  std::vector<size_t> global = {static_cast<size_t>(shape_nhwc.s[0]), static_cast<size_t>(shape_nhwc.s[1]),
-                                static_cast<size_t>(shape_nhwc.s[2])};
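+  // The global work size is now taken straight from the padded NHWC output shape: (N * H, W, ceil(C / 4)).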
+  uint32_t OH = output_shape_.s[0] * output_shape_.s[1];
+  uint32_t OW = output_shape_.s[2];
+  uint32_t OC = output_shape_.s[3];
+  std::vector<size_t> global = {OH, OW, OC};
   ConcatGetWorkGroup(global, &local, max_global[0]);
   if (in_tensors_.size() == 2 || in_tensors_.size() == 3 || in_tensors_.size() == 4 || in_tensors_.size() == 6) {
     int arg_cn = 0;
@@ -149,8 +118,9 @@ int ConcatOpenCLKernel::Run() {
       ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[i]->data_c());
     }
     ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c());
-    for (int i = 0; i < XYZShape.size(); ++i) {
-      cl_int4 temp = {XYZShape[i].s[0], XYZShape[i].s[1], XYZShape[i].s[2], XYZShape[i].s[3]};
+    for (int i = 0; i < in_tensors_.size(); ++i) {
+      cl_int4 temp = {in_tensors_[i]->shape()[0], in_tensors_[i]->shape()[1], in_tensors_[i]->shape()[2],
+                      UP_DIV(in_tensors_[i]->shape()[3], C4NUM)};
       ocl_runtime_->SetKernelArg(kernel_, arg_cn++, temp);
     }
     ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_);
@@ -38,11 +38,7 @@ class ConcatOpenCLKernel : public OpenCLKernel {
  private:
   int RunAxis0();
-  int IntegraShapeToXYZ();
   cl::Kernel kernel_;
-  std::vector<cl_int3> XYZShape;
-  cl_int4 shape_nhwc{};
 };
 }  // namespace mindspore::kernel
@@ -91,12 +91,12 @@ int ScaleOpenCLKernel::InitBuffer() {
   } else if (in_tensors_[0]->GetFormat() == schema::Format_NHWC) {
     if (in_tensors_[1]->GetFormat() == schema::Format_NHWC) {
       if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
-        float *scale = new (std::nothrow) float[pack_weight_size];
+        auto *scale = new (std::nothrow) float[pack_weight_size];
         if (scale == nullptr) {
           MS_LOG(ERROR) << "Malloc buffer failed!";
           return RET_ERROR;
         }
-        float *offset = new (std::nothrow) float[pack_weight_size];
+        auto *offset = new (std::nothrow) float[pack_weight_size];
         if (offset == nullptr) {
           MS_LOG(ERROR) << "Malloc buffer failed!";
           delete[] scale;
@@ -110,12 +110,12 @@ int ScaleOpenCLKernel::InitBuffer() {
         delete[] scale;
         delete[] offset;
       } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
-        float16_t *scale = new (std::nothrow) float16_t[pack_weight_size];
+        auto *scale = new (std::nothrow) float16_t[pack_weight_size];
        if (scale == nullptr) {
           MS_LOG(ERROR) << "Malloc buffer failed!";
           return RET_ERROR;
         }
-        float16_t *offset = new (std::nothrow) float16_t[pack_weight_size];
+        auto *offset = new (std::nothrow) float16_t[pack_weight_size];
         if (offset == nullptr) {
           MS_LOG(ERROR) << "Malloc buffer failed!";
           delete[] scale;
@@ -146,15 +146,14 @@ int ScaleOpenCLKernel::InitBuffer() {
 int ScaleOpenCLKernel::Init() {
   std::string kernel_name;
-  const ScaleParameter *scale_param = reinterpret_cast<const ScaleParameter *>(op_parameter_);
+  auto *scale_param = reinterpret_cast<const ScaleParameter *>(op_parameter_);
   auto in_tensor = in_tensors_.at(0);
   auto in_shape = in_tensor->shape();
   auto scale_tensor = in_tensors_.at(1);
   auto scale_shape = scale_tensor->shape();
   axis_ = scale_param->axis_;
   if (axis_ < 0) {
-    axis_ = axis_ + in_shape.size();
+    axis_ += in_shape.size();
   }
   if (scale_shape.size() != in_shape.size()) {
     if (scale_tensor->ElementsNum() == 1) {
@@ -197,6 +196,13 @@ int ScaleOpenCLKernel::Init() {
 int ScaleOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
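+  // Map the parameter's activation type to the codes the scale kernels expect (0: none, 1: relu, 3: relu6),
+  // matching the ActType_* defines in scale.cl.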
+  auto *param = reinterpret_cast<const ScaleParameter *>(op_parameter_);
+  cl_int act_type = 0;
+  if (param->activation_type_ == ActType_Relu) {
+    act_type = 1;
+  } else if (param->activation_type_ == ActType_Relu6) {
+    act_type = 3;
+  }
   int arg_idx = 0;
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
@@ -227,6 +233,7 @@ int ScaleOpenCLKernel::Run() {
   if (element_flag_ && scale_C_flag_) {
     ocl_runtime_->SetKernelArg(kernel_, arg_idx++, UP_DIV(in_tensors_[1]->shape()[0], C4NUM));
   }
+  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, act_type);
   ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
   return RET_OK;
 }
@@ -0,0 +1,211 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "src/runtime/kernel/opencl/kernel/stack.h"
+#include <cstring>
+#include <string>
+#include <algorithm>
+#include <set>
+#include "src/kernel_registry.h"
+#include "src/runtime/kernel/opencl/utils.h"
+#include "src/runtime/kernel/opencl/cl/stack.cl.inc"
+using mindspore::kernel::KERNEL_ARCH::kGPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::schema::PrimitiveType_Stack;
+namespace mindspore::kernel {
+int StackOpenCLKernel::RunAxis0() {
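+  // Stacking along axis 0 reduces to concatenating the inputs along the image height, so it is
+  // handled with plain enqueueCopyImage calls instead of launching a kernel.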
+  auto allocator_ = ocl_runtime_->GetAllocator();
+  std::vector<size_t> img_size;
+  auto dst_data = out_tensors_[0]->data_c();
+  auto dst_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
+  cl::Image2D *out_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(dst_data));
+  for (int i = 0; i < in_tensors_.size(); i++) {
+    auto src_data = in_tensors_[i]->data_c();
+    allocator_->GetImageSize(src_data, &img_size);
+    auto src_origin = cl::array<cl::size_type, 3U>{0, 0, 0};
+    auto region = cl::array<cl::size_type, 3U>{img_size[0], img_size[1], 1};
+    cl::Image2D *input_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data));
+    ocl_runtime_->GetDefaultCommandQueue()->enqueueCopyImage(*input_image, *out_image, src_origin, dst_origin, region);
+    dst_origin[1] += region[1];
+  }
+  return RET_OK;
+}
+int StackOpenCLKernel::Init() {
+  if (in_tensors_[0]->shape().size() > 4 || in_tensors_[0]->shape().size() <= 0) {
+    MS_LOG(ERROR) << " only support dim <= 4 ";
+    return RET_ERROR;
+  }
+  auto param = reinterpret_cast<StackParameter *>(this->op_parameter_);
+  axis_ = param->axis_;
+  axis_ = axis_ < 0 ? axis_ + in_tensors_[0]->shape().size() + 1 : axis_;
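+  // Stack inserts a new axis, hence the "+ 1" when normalizing a negative axis; inputs with fewer
+  // than 4 dims are padded to 4D later, so a 2-D input's axis is shifted by 2 below.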
+  if (in_tensors_[0]->shape().size() != 4) {
+    if (in_tensors_[0]->shape().size() == 2) {
+      axis_ = axis_ + 2;
+    }
+  }
+  if (param->axis_ < -3 || param->axis_ > 3) {
+    MS_LOG(ERROR) << " only support axis >= -3 and axis <= 3 ";
+    return RET_ERROR;
+  }
+  std::string kernel_name = "stack";
+  if (in_tensors_.size() == 8) {
+    kernel_name += "8inputaxis" + std::to_string(axis_);
+  } else {
+    MS_LOG(ERROR) << " input must be 8";
+    return RET_ERROR;
+  }
+  MS_LOG(DEBUG) << "kernel_name=: " << kernel_name;
+  std::set<std::string> build_options;
+  std::string source = stack_source;
+  std::string program_name = "stack";
+  ocl_runtime_->LoadSource(program_name, source);
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
+  return RET_OK;
+}
+int StackOpenCLKernel::ReSize() { return RET_OK; }
+void StackGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
+  const int max_divider = 8;
+  const int max_x = 4, max_y = 8;
+  int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
+  int yz = max_size / x;
+  int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
+  int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));
+  local->clear();
+  local->push_back(x);
+  local->push_back(y);
+  local->push_back(z);
+}
+int StackOpenCLKernel::InferInTensorShapeTo4D(int *arg_cn) {
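+  // Pads each input shape up to 4D and passes the image handles plus the padded shapes as kernel
+  // arguments, in the order the stack kernels expect.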
+  if (in_tensors_.size() == 8) {
+    int size = in_tensors_[0]->shape().size();
+    switch (size) {
+      case 1:
+        for (int i = 0; i < in_tensors_.size(); ++i) {
+          ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, in_tensors_[i]->data_c());
+        }
+        ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, out_tensors_[0]->data_c());
+        for (int i = 0; i < in_tensors_.size(); ++i) {
+          cl_int4 temp = {in_tensors_[i]->shape()[0], 1, 1, 1};
+          ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, temp);
+        }
+        break;
+      case 2:
+        for (int i = 0; i < in_tensors_.size(); ++i) {
+          ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, in_tensors_[i]->data_c());
+        }
+        ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, out_tensors_[0]->data_c());
+        for (int i = 0; i < in_tensors_.size(); ++i) {
+          cl_int4 temp = {in_tensors_[i]->shape()[0], 1, 1, UP_DIV(in_tensors_[i]->shape()[1], C4NUM)};
+          ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, temp);
+        }
+        break;
+      case 3:
+        for (int i = 0; i < in_tensors_.size(); ++i) {
+          ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, in_tensors_[i]->data_c());
+        }
+        ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, out_tensors_[0]->data_c());
+        for (int i = 0; i < in_tensors_.size(); ++i) {
+          cl_int4 temp = {in_tensors_[i]->shape()[0], 1, in_tensors_[i]->shape()[1],
+                          UP_DIV(in_tensors_[i]->shape()[2], C4NUM)};
+          ocl_runtime_->SetKernelArg(kernel_, (*arg_cn)++, temp);
+        }
+        break;
+      default:
| MS_LOG(ERROR) << "unsupported input size > 3 or size <= 0 :" << in_tensors_.size(); | |||||
+        return RET_ERROR;
+    }
+  } else {
+    MS_LOG(ERROR) << "unsupported input size :" << in_tensors_.size();
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+int StackOpenCLKernel::InferOutTensorShapeTo4D(cl_int4 *output_shape) {
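+  // Maps the output shape onto the padded 4D layout used by the kernels; the resulting global
+  // work size is (N * H, W, ceil(C / 4)).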
+  std::vector<int> out_shape = out_tensors_[0]->shape();
+  if (out_shape.size() == 3) {
+    N_ = out_shape[0];
+    C_ = out_shape[1] * UP_DIV(out_shape[2], C4NUM);
+  } else if (out_shape.size() == 4) {
+    if (axis_ == 1) {
+      N_ = out_shape[0];
+      H_ = out_shape[1];
+      W_ = out_shape[2];
+      C_ = UP_DIV(out_shape[3], C4NUM);
+    } else {
+      MS_LOG(ERROR) << "Unsupported out_shape.size=: " << out_shape.size() << " axis=: " << axis_;
+      return RET_ERROR;
+    }
+  }
+  OH_ = N_ * H_;
+  OW_ = W_;
+  OC_ = C_;
+  output_shape->s[0] = N_;
+  output_shape->s[1] = H_;
+  output_shape->s[2] = W_;
+  output_shape->s[3] = C_;
+  return RET_OK;
+}
+int StackOpenCLKernel::Run() {
+  MS_LOG(DEBUG) << this->name() << " Running! ";
+  if (axis_ == 0) {
+    return RunAxis0();
+  }
+  cl_int4 output_shape = {1, 1, 1, 1};
+  const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize();
+  std::vector<size_t> local = {1, 1, 1};
+  int arg_cn = 0;
+  InferInTensorShapeTo4D(&arg_cn);
+  InferOutTensorShapeTo4D(&output_shape);
+  std::vector<size_t> global = {OH_, OW_, OC_};
+  StackGetWorkGroup(global, &local, max_global[0]);
+  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape);
+  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
+  return RET_OK;
+}
+kernel::LiteKernel *OpenCLStackKernelCreator(const std::vector<lite::Tensor *> &inputs,
+                                             const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
+                                             const lite::InnerContext *ctx, const kernel::KernelKey &desc,
+                                             const mindspore::lite::PrimitiveC *primitive) {
+  auto *kernel = new (std::nothrow) StackOpenCLKernel(opParameter, inputs, outputs);
+  if (kernel == nullptr) {
+    MS_LOG(ERROR) << " new StackOpenCLKernel failed ";
+    return nullptr;
+  }
+  auto ret = kernel->Init();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << " Init kernel failed, name: Stack ";
+    delete kernel;
+    return nullptr;
+  }
+  return kernel;
+}
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Stack, OpenCLStackKernelCreator);
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Stack, OpenCLStackKernelCreator);
+}  // namespace mindspore::kernel
@@ -0,0 +1,59 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_STACK_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_STACK_H_
+#include <vector>
+#include "src/runtime/kernel/opencl/opencl_kernel.h"
+#include "nnacl/stack_parameter.h"
+namespace mindspore::kernel {
+class StackOpenCLKernel : public OpenCLKernel {
+ public:
+  explicit StackOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+                             const std::vector<lite::Tensor *> &outputs)
+      : OpenCLKernel(parameter, inputs, outputs) {}
+  ~StackOpenCLKernel() override{};
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+ private:
+  int RunAxis0();
+  int InferInTensorShapeTo4D(int *arg_cn);
+  int InferOutTensorShapeTo4D(cl_int4 *output_shape);
+  cl::Kernel kernel_;
+  int axis_{0};
+  size_t N_{1};
+  size_t H_{1};
+  size_t W_{1};
+  size_t C_{1};
+  size_t OH_{1};
+  size_t OW_{1};
+  size_t OC_{1};
+};
+}  // namespace mindspore::kernel
+#endif
@@ -7,3 +7,12 @@ hiai_cn_recognize_modify_padv2.tflite
 hiai_cv_focusShootOCRModel_08.tflite
 hiai_model_normalize_object_scene_ps_20200519.tflite
 inception_v3.tflite
+mtk_isface.tflite
+mtk_landmark.tflite
+mtk_new_detect.tflite
+mtk_pose.tflite
+mtk_model_emotions_0727_nosoftmax.tflite
+mtk_model_normalize_object_scene_ps_20200826_f32_no_softmax.tflite
+mtk_276landmark_0913.tflite
+mtk_face_recognition.tflite
+mtk_convert_model.tflite
@@ -119,3 +119,14 @@ magenta_arbitrary-image-stylization-v1-256_fp16_prediction_1.tflite
 lite-model_cartoongan_fp16_1.tflite
 lite-model_arbitrary-image-stylization-inceptionv3_fp16_predict_1.tflite
 gts_detect_5k_tf115.tflite
+mtk_isface.tflite
+mtk_landmark.tflite
+mtk_new_detect.tflite
+mtk_pose.tflite
+mtk_age_gender_fp16.tflite
+mtk_model_emotions_0727_nosoftmax.tflite
+mtk_model_face_dress_fp16.tflite
+mtk_model_normalize_object_scene_ps_20200826_f32_no_softmax.tflite
+mtk_276landmark_0913.tflite
+mtk_face_recognition.tflite
+mtk_convert_model.tflite
@@ -766,7 +766,7 @@ models_onnx_config=${basepath}/models_onnx.cfg
 models_fp16_config=${basepath}/models_fp16.cfg
 models_mindspore_config=${basepath}/models_mindspore.cfg
 models_mindspore_train_config=${basepath}/models_mindspore_train.cfg
-models_tflite_gpu_config=${basepath}/models_tflite_gpu.cfg
+models_tflite_gpu_config=${basepath}/models_fp32_gpu.cfg
 models_fp16_gpu_config=${basepath}/models_fp16_gpu.cfg
 models_arm32_config=${basepath}/models_arm32.cfg
 models_compatibility_config=${basepath}/models_compatibility.cfg
@@ -37,14 +37,6 @@ class TestConcatOpenCLCI : public mindspore::CommonTest {
   TestConcatOpenCLCI() {}
 };
-template <typename T>
-void CompareOutputData1(T *output_data, T *correct_data, int size, float err_bound) {
-  for (size_t i = 0; i < size; i++) {
-    T abs = fabs(output_data[i] - correct_data[i]);
-    ASSERT_LE(abs, err_bound);
-  }
-}
 TEST_F(TestConcatOpenCLCI, ConcatFp32_2inputforCI) {
   MS_LOG(INFO) << " begin test ";
   auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
@@ -132,7 +124,7 @@ TEST_F(TestConcatOpenCLCI, ConcatFp32_2inputforCI) {
   std::cout << "==================output data================" << std::endl;
   sub_graph->Run();
   auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->data_c());
-  CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.00001);
+  CompareOutputData(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.00001);
   for (auto tensor : inputs) {
     tensor->SetData(nullptr);
     delete tensor;
@@ -260,7 +252,7 @@ TEST_F(TestConcatOpenCLfp16, ConcatFp16_4input_dim4_axis1) {
   std::cout << "==================output data================" << std::endl;
   sub_graph->Run();
   auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->data_c());
-  CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001);
+  CompareOutputData(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001);
   for (auto tensor : inputs) {
     tensor->SetData(nullptr);
     delete tensor;
@@ -379,7 +371,7 @@ TEST_F(TestConcatOpenCLfp32, ConcatFp32_3input_dim4_axis1) {
   std::cout << "==================output data================" << std::endl;
   sub_graph->Run();
   auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->data_c());
-  CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.00001);
+  CompareOutputData(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.00001);
   for (auto tensor : inputs) {
     tensor->SetData(nullptr);
     delete tensor;
@@ -518,7 +510,7 @@ TEST_F(TestConcatOpenCLfp16, ConcatFp16_6input_dim4_axis1) {
   std::cout << "==================output data================" << std::endl;
   sub_graph->Run();
   auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->MutableData());
-  CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001);
+  CompareOutputData(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001);
   for (auto tensor : inputs) {
     tensor->SetData(nullptr);
     delete tensor;
@@ -0,0 +1,283 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <iostream>
+#include <memory>
+#include "common/common_test.h"
+#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
+#include "mindspore/lite/src/common/file_utils.h"
+#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
+#include "mindspore/lite/src/runtime/kernel/opencl/kernel/stack.h"
+namespace mindspore {
+class TestStackOpenCLCI : public mindspore::CommonTest {
+ public:
+  TestStackOpenCLCI() {}
+};
+class TestStackOpenCLfp16 : public mindspore::CommonTest {
+ public:
+  TestStackOpenCLfp16() {}
+};
+TEST_F(TestStackOpenCLCI, StackFp32_8inputforCI) {
+  MS_LOG(INFO) << " begin test ";
+  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
+  ocl_runtime->Init();
+  auto allocator = ocl_runtime->GetAllocator();
+  MS_LOG(INFO) << " init tensors ";
+  constexpr int INPUT_NUM = 8;
+  std::array<std::vector<int>, INPUT_NUM> input_shapes = {
+    std::vector<int>{1, 1, 8}, std::vector<int>{1, 1, 8}, std::vector<int>{1, 1, 8}, std::vector<int>{1, 1, 8},
+    std::vector<int>{1, 1, 8}, std::vector<int>{1, 1, 8}, std::vector<int>{1, 1, 8}, std::vector<int>{1, 1, 8}};
+  std::vector<int> output_shape = {8, 1, 1, 8};
+  auto data_type = kNumberTypeFloat32;
+  auto tensor_type = lite::TensorCategory(schema::NodeType_ValueNode);
+  float input_data1[] = {0.75f, 0.06f, 0.74f, 0.30f, 0.9f, 0.59f, 0.03f, 0.37f};
+  float input_data2[] = {0.5f, 0.6f, 0.74f, 0.23f, 0.46f, 0.69f, 0.13f, 0.47f};
+  float input_data3[] = {0.31f, 0.63f, 0.84f, 0.43f, 0.56f, 0.79f, 0.12f, 0.57f};
+  float input_data4[] = {0.35f, 0.26f, 0.17f, 0.33f, 0.66f, 0.89f, 0.93f, 0.77f};
+  float input_data5[] = {0.57f, 0.6f, 0.84f, 0.83f, 0.48f, 0.78f, 0.63f, 0.87f};
+  float input_data6[] = {0.66f, 0.56f, 0.64f, 0.63f, 0.56f, 0.59f, 0.73f, 0.37f};
+  float input_data7[] = {0.35f, 0.26f, 0.54f, 0.33f, 0.76f, 0.59f, 0.73f, 0.34f};
+  float input_data8[] = {0.15f, 0.36f, 0.44f, 0.73f, 0.56f, 0.49f, 0.93f, 0.37f};
+  float correctOutput[] = {0.75f, 0.06f, 0.74f, 0.30f, 0.9f,  0.59f, 0.03f, 0.37f, 0.5f,  0.6f,  0.74f, 0.23f, 0.46f,
+                           0.69f, 0.13f, 0.47f, 0.31f, 0.63f, 0.84f, 0.43f, 0.56f, 0.79f, 0.12f, 0.57f, 0.35f, 0.26f,
+                           0.17f, 0.33f, 0.66f, 0.89f, 0.93f, 0.77f, 0.57f, 0.6f,  0.84f, 0.83f, 0.48f, 0.78f, 0.63f,
+                           0.87f, 0.66f, 0.56f, 0.64f, 0.63f, 0.56f, 0.59f, 0.73f, 0.37f, 0.35f, 0.26f, 0.54f, 0.33f,
+                           0.76f, 0.59f, 0.73f, 0.34f, 0.15f, 0.36f, 0.44f, 0.73f, 0.56f, 0.49f, 0.93f, 0.37f};
+  auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, output_shape, schema::Format_NHWC, tensor_type);
+  if (output_tensor == nullptr) {
+    MS_LOG(INFO) << " new output_tensor failed ";
+    return;
+  }
+  std::vector<lite::Tensor *> inputs;
+  std::vector<lite::Tensor *> outputs{output_tensor};
+  for (auto &shape : input_shapes) {
+    auto input_temp = new (std::nothrow) lite::Tensor(data_type, shape, schema::Format_NHWC, tensor_type);
+    inputs.push_back(input_temp);
+    if (input_temp == nullptr) {
+      MS_LOG(INFO) << " new input_tensor failed ";
+      return;
+    }
+  }
+  MS_LOG(INFO) << " initialize tensors ";
+  auto param = reinterpret_cast<StackParameter *>(malloc(sizeof(StackParameter)));
+  if (param == nullptr) {
+    MS_LOG(INFO) << " new StackParameter failed ";
+    for (auto tensor : inputs) {
+      delete tensor;
+    }
+    for (auto tensor : outputs) {
+      delete tensor;
+    }
+    return;
+  }
+  param->axis_ = 0;
+  auto *stack_kernel =
+    new (std::nothrow) kernel::StackOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
+  if (stack_kernel == nullptr) {
+    MS_LOG(INFO) << " new kernel::StackOpenCLKernel failed ";
+    for (auto tensor : inputs) {
+      delete tensor;
+    }
+    for (auto tensor : outputs) {
+      delete tensor;
+    }
+    delete param;
+    return;
+  }
+  stack_kernel->Init();
+  // to do allocate memory for inputs
+  for (auto &input_tensor : inputs) {
+    input_tensor->MallocData(allocator);
+  }
+  MS_LOG(INFO) << " initialize sub_graph ";
+  std::vector<kernel::LiteKernel *> kernels{stack_kernel};
+  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
+  if (sub_graph == nullptr) {
+    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
+    for (auto tensor : inputs) {
+      delete tensor;
+    }
+    for (auto tensor : outputs) {
+      delete tensor;
+    }
+    delete param;
+    delete stack_kernel;
+    return;
+  }
+  sub_graph->Init();
+  MS_LOG(INFO) << " initialize input data ";
+  memcpy(inputs[0]->data_c(), input_data1, sizeof(input_data1));
+  memcpy(inputs[1]->data_c(), input_data2, sizeof(input_data2));
+  memcpy(inputs[2]->data_c(), input_data3, sizeof(input_data3));
+  memcpy(inputs[3]->data_c(), input_data4, sizeof(input_data4));
+  memcpy(inputs[4]->data_c(), input_data5, sizeof(input_data5));
+  memcpy(inputs[5]->data_c(), input_data6, sizeof(input_data6));
+  memcpy(inputs[6]->data_c(), input_data7, sizeof(input_data7));
+  memcpy(inputs[7]->data_c(), input_data8, sizeof(input_data8));
+  std::cout << "==================output data================" << std::endl;
+  sub_graph->Run();
+  auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->data_c());
+  CompareOutputData(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.00001);
+  for (auto tensor : inputs) {
+    tensor->SetData(nullptr);
+    delete tensor;
+  }
+  for (auto tensor : outputs) {
+    tensor->SetData(nullptr);
+    delete tensor;
+  }
+  delete sub_graph;
+}
+TEST_F(TestStackOpenCLfp16, StackFp32_8inputaxis1) {
+  MS_LOG(INFO) << " begin test ";
+  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
+  ocl_runtime->SetFp16Enable(true);
+  ocl_runtime->Init();
+  auto allocator = ocl_runtime->GetAllocator();
+  // get the input from .bin
+  size_t input1_size, input2_size, input3_size, input4_size, input5_size, input6_size, input7_size, input8_size,
+    output_size;
+  std::string input1Ppath = "./test_data/stackfp16_input1.bin";
+  std::string input2Ppath = "./test_data/stackfp16_input2.bin";
+  std::string input3Ppath = "./test_data/stackfp16_input3.bin";
+  std::string input4Ppath = "./test_data/stackfp16_input4.bin";
+  std::string input5Ppath = "./test_data/stackfp16_input5.bin";
+  std::string input6Ppath = "./test_data/stackfp16_input6.bin";
+  std::string input7Ppath = "./test_data/stackfp16_input7.bin";
+  std::string input8Ppath = "./test_data/stackfp16_input8.bin";
+  std::string correctOutputPath = "./test_data/stackfp16_output.bin";
+  auto input_data1 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size));
+  auto input_data2 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input2Ppath.c_str(), &input2_size));
+  auto input_data3 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input3Ppath.c_str(), &input3_size));
+  auto input_data4 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input4Ppath.c_str(), &input4_size));
+  auto input_data5 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input5Ppath.c_str(), &input5_size));
+  auto input_data6 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input6Ppath.c_str(), &input6_size));
+  auto input_data7 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input7Ppath.c_str(), &input7_size));
+  auto input_data8 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input8Ppath.c_str(), &input8_size));
+  auto correctOutput =
+    reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size));
+  MS_LOG(INFO) << " init tensors ";
+  constexpr int INPUT_NUM = 8;
+  std::array<std::vector<int>, INPUT_NUM> input_shapes = {
+    std::vector<int>{1, 17, 18}, std::vector<int>{1, 17, 18}, std::vector<int>{1, 17, 18}, std::vector<int>{1, 17, 18},
+    std::vector<int>{1, 17, 18}, std::vector<int>{1, 17, 18}, std::vector<int>{1, 17, 18}, std::vector<int>{1, 17, 18}};
+  std::vector<int> output_shape = {1, 8, 17, 18};
+  auto data_type = kNumberTypeFloat16;
+  auto tensor_type = lite::TensorCategory(schema::NodeType_ValueNode);
+  std::vector<lite::Tensor *> inputs;
+  for (auto &shape : input_shapes) {
+    auto input_temp = new (std::nothrow) lite::Tensor(data_type, shape, schema::Format_NHWC, tensor_type);
+    inputs.push_back(input_temp);
+    if (input_temp == nullptr) {
+      MS_LOG(INFO) << " new input_tensor failed ";
+      return;
+    }
+  }
+  auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, output_shape, schema::Format_NHWC, tensor_type);
+  if (output_tensor == nullptr) {
+    MS_LOG(INFO) << " new output_tensor failed ";
+    for (auto tensor : inputs) {
+      delete tensor;
+    }
+    return;
+  }
+  std::vector<lite::Tensor *> outputs{output_tensor};
+  MS_LOG(INFO) << " input_shapes size =: " << input_shapes.size();
+  MS_LOG(INFO) << " initialize tensors ";
+  auto param = reinterpret_cast<StackParameter *>(malloc(sizeof(StackParameter)));
+  if (param == nullptr) {
+    MS_LOG(INFO) << " new StackParameter failed ";
+    for (auto tensor : inputs) {
+      delete tensor;
+    }
+    for (auto tensor : outputs) {
+      delete tensor;
+    }
+    return;
+  }
+  param->axis_ = 1;
+  auto *stack_kernel =
+    new (std::nothrow) kernel::StackOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
+  if (stack_kernel == nullptr) {
+    MS_LOG(INFO) << " new kernel::StackOpenCLKernel failed ";
+    for (auto tensor : inputs) {
+      delete tensor;
+    }
+    for (auto tensor : outputs) {
+      delete tensor;
+    }
+    delete param;
+    return;
+  }
+  stack_kernel->Init();
+  // to allocate memory for inputs and outputs
+  for (auto &input_tensor : inputs) {
+    input_tensor->MallocData(allocator);
+  }
+  MS_LOG(INFO) << " initialize sub_graph ";
+  std::vector<kernel::LiteKernel *> kernels{stack_kernel};
+  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
+  if (sub_graph == nullptr) {
+    MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed ";
+    for (auto tensor : inputs) {
+      delete tensor;
+    }
+    for (auto tensor : outputs) {
+      delete tensor;
+    }
+    delete param;
+    delete stack_kernel;
+    return;
+  }
+  sub_graph->Init();
+  MS_LOG(INFO) << " initialize input data ";
+  if (inputs.size() == 8) {
+    memcpy(inputs[0]->data_c(), input_data1, input1_size);
+    memcpy(inputs[1]->data_c(), input_data2, input2_size);
+    memcpy(inputs[2]->data_c(), input_data3, input3_size);
+    memcpy(inputs[3]->data_c(), input_data4, input4_size);
+    memcpy(inputs[4]->data_c(), input_data5, input5_size);
+    memcpy(inputs[5]->data_c(), input_data6, input6_size);
+    memcpy(inputs[6]->data_c(), input_data7, input7_size);
+    memcpy(inputs[7]->data_c(), input_data8, input8_size);
+  } else {
| MS_LOG(ERROR) << " input size must be 2 or 3 or 4"; | |||||
+  }
+  std::cout << "==================output data================" << std::endl;
+  sub_graph->Run();
+  auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->MutableData());
+  CompareOutputData(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001);
+  for (auto tensor : inputs) {
+    tensor->SetData(nullptr);
+    delete tensor;
+  }
+  for (auto tensor : outputs) {
+    tensor->SetData(nullptr);
+    delete tensor;
+  }
+  delete sub_graph;
+}
+}  // namespace mindspore