add ops supported fp16 add dimension 0-3 for concattags/v1.0.0
| @@ -2,39 +2,106 @@ | |||
| __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; | |||
| __kernel void Concat(__read_only image2d_t input0, __read_only image2d_t input1, __write_only image2d_t output, | |||
| int2 input_channels, int4 output_shape) { | |||
| int X = get_global_id(0); // H | |||
| int4 input_shape0, int4 input_shape1, int4 output_shape, const int axis) { | |||
| int X = get_global_id(0); // N*H | |||
| int Y = get_global_id(1); // W | |||
| int Z = get_global_id(2); // c/4 | |||
| if (X >= output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { | |||
| if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { | |||
| return; | |||
| } | |||
| if (Z < input_channels.x) { | |||
| FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_channels.x + Z, (X))); | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||
| if (axis == 0) { | |||
| if (X < input_shape0.x * input_shape0.y) { | |||
| FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X))); | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||
| } else { | |||
| FLT4 result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z, (X - input_shape0.x * input_shape0.y))); | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||
| } | |||
| } else if (axis == 1) { | |||
| if (X < input_shape0.y) { | |||
| FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X))); | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||
| } else { | |||
| FLT4 result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z, (X - input_shape0.y))); | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||
| } | |||
| } else if (axis == 2) { | |||
| if (Y < input_shape0.z) { | |||
| FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X))); | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||
| } else { | |||
| FLT4 result = READ_IMAGE(input1, smp_none, (int2)((Y - input_shape0.z) * input_shape1.w + Z, (X))); | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||
| } | |||
| } else { | |||
| FLT4 result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_channels.y + Z - input_channels.x, (X))); | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||
| if (Z < input_shape0.w) { | |||
| FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X))); | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||
| } else { | |||
| FLT4 result = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z - input_shape0.w, (X))); | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||
| } | |||
| } | |||
| } | |||
| __kernel void Concat3input(__read_only image2d_t input0, __read_only image2d_t input1, __read_only image2d_t input2, | |||
| __write_only image2d_t output, int3 input_channels, int4 output_shape) { | |||
| int X = get_global_id(0); // H | |||
| __write_only image2d_t output, int4 input_shape0, int4 input_shape1, int4 input_shape2, | |||
| int4 output_shape, const int axis) { | |||
| int X = get_global_id(0); // N*H | |||
| int Y = get_global_id(1); // W | |||
| int Z = get_global_id(2); // c/4 | |||
| if (X >= output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { | |||
| if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { | |||
| return; | |||
| } | |||
| if (Z < input_channels.x) { | |||
| FLT4 result0 = READ_IMAGE(input0, smp_none, (int2)((Y)*input_channels.x + Z, (X))); | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result0); | |||
| } else if (Z < (input_channels.x + input_channels.y)) { | |||
| FLT4 result1 = READ_IMAGE(input1, smp_none, (int2)((Y)*input_channels.y + Z - input_channels.x, (X))); | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result1); | |||
| if (axis == 0) { | |||
| if (X < input_shape0.x * input_shape0.y) { | |||
| FLT4 result0 = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X))); | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result0); | |||
| } else if (X < (input_shape0.x * input_shape0.y + input_shape1.x * input_shape1.y)) { | |||
| FLT4 result1 = | |||
| READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z, (X - input_shape0.x * input_shape0.y))); | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result1); | |||
| } else { | |||
| FLT4 result2 = READ_IMAGE( | |||
| input2, smp_none, | |||
| (int2)((Y)*input_shape2.w + Z, (X - input_shape0.x * input_shape0.y - input_shape1.x * input_shape1.y))); | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result2); | |||
| } | |||
| } else if (axis == 1) { | |||
| if (X < input_shape0.y) { | |||
| FLT4 result0 = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X))); | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result0); | |||
| } else if (X < (input_shape0.y + input_shape1.y)) { | |||
| FLT4 result1 = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z, (X - input_shape0.y))); | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result1); | |||
| } else { | |||
| FLT4 result2 = | |||
| READ_IMAGE(input2, smp_none, (int2)((Y)*input_shape2.w + Z, (X - input_shape0.y - input_shape1.y))); | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result2); | |||
| } | |||
| } else if (axis == 2) { | |||
| if (Y < input_shape0.z) { | |||
| FLT4 result0 = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X))); | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result0); | |||
| } else if (Y < (input_shape0.z + input_shape0.z)) { | |||
| FLT4 result1 = READ_IMAGE(input1, smp_none, (int2)((Y - input_shape0.z) * input_shape1.w + Z, (X))); | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result1); | |||
| } else { | |||
| FLT4 result2 = | |||
| READ_IMAGE(input2, smp_none, (int2)((Y - input_shape0.z - input_shape1.z) * input_shape2.w + Z, (X))); | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result2); | |||
| } | |||
| } else { | |||
| FLT4 result2 = | |||
| READ_IMAGE(input2, smp_none, (int2)((Y)*input_channels.z + Z - input_channels.x - input_channels.y, (X))); | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result2); | |||
| if (Z < input_shape0.w) { | |||
| FLT4 result0 = READ_IMAGE(input0, smp_none, (int2)((Y)*input_shape0.w + Z, (X))); | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result0); | |||
| } else if (Z < (input_shape0.w + input_shape0.w)) { | |||
| FLT4 result1 = READ_IMAGE(input1, smp_none, (int2)((Y)*input_shape1.w + Z - input_shape0.w, (X))); | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result1); | |||
| } else { | |||
| FLT4 result2 = | |||
| READ_IMAGE(input2, smp_none, (int2)((Y)*input_shape2.w + Z - input_shape0.w - input_shape1.w, (X))); | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result2); | |||
| } | |||
| } | |||
| } | |||
| @@ -100,7 +100,7 @@ void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t | |||
| local->push_back(z); | |||
| } | |||
| int BatchNormOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| MS_LOG(DEBUG) << this->name() << " Running! "; | |||
| auto param = reinterpret_cast<BatchNormParameter *>(this->op_parameter_); | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| auto input0_shape = in_tensors_[0]->shape(); | |||
| @@ -136,12 +136,12 @@ kernel::LiteKernel *OpenCLBatchnormKernelCreator(const std::vector<lite::tensor: | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| auto *kernel = new (std::nothrow) BatchNormOpenCLKernel(opParameter, inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "new BatchnormOpenCLKernel failed"; | |||
| MS_LOG(ERROR) << " new BatchnormOpenCLKernel failed "; | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init kernel failed, name: Convolution"; | |||
| MS_LOG(ERROR) << " Init kernel failed, name: Batchnorm "; | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| @@ -33,10 +33,10 @@ int ConcatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) | |||
| size_t im_dst_x, im_dst_y; | |||
| if (in_tensors_[0]->GetFormat() == schema::Format_NHWC4) { | |||
| im_dst_x = out_tensors_[0]->Width() * CO4; | |||
| im_dst_y = out_tensors_[0]->Height(); | |||
| im_dst_y = out_tensors_[0]->Height() * out_tensors_[0]->Batch(); | |||
| } else { | |||
| im_dst_y = out_tensors_[0]->Height() * CO4; | |||
| im_dst_x = out_tensors_[0]->Width(); | |||
| im_dst_x = out_tensors_[0]->Width() * out_tensors_[0]->Batch(); | |||
| } | |||
| size_t img_dtype = CL_FLOAT; | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| @@ -51,20 +51,17 @@ int ConcatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) | |||
| } | |||
| int ConcatOpenCLKernel::Init() { | |||
| if (in_tensors_[0]->shape().size() != 4) { | |||
| MS_LOG(ERROR) << "only support dim=4"; | |||
| MS_LOG(ERROR) << " only support dim = 4 "; | |||
| return RET_ERROR; | |||
| } | |||
| auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_); | |||
| MS_LOG(DEBUG) << "concat at axis=: " << param->axis_; | |||
| if (param->axis_ != 0 && param->axis_ != 3) { | |||
| MS_LOG(ERROR) << "only support axis=0 or axis=3"; | |||
| MS_LOG(DEBUG) << " concat at axis=: " << param->axis_; | |||
| if (param->axis_ < 0 || param->axis_ > 3) { | |||
| MS_LOG(ERROR) << " only support axis >= 0 and axis <= 3 "; | |||
| return RET_ERROR; | |||
| } | |||
| if (param->axis_ == 0) { | |||
| return RET_OK; | |||
| } | |||
| if (in_tensors_.size() == 2) { | |||
| std::set<std::string> build_options; | |||
| std::string source = concat_source; | |||
| @@ -94,33 +91,6 @@ int ConcatOpenCLKernel::Init() { | |||
| int ConcatOpenCLKernel::ReSize() { return RET_OK; } | |||
| int ConcatOpenCLKernel::Run_axis0() { | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| auto allocator_ = ocl_runtime->GetAllocator(); | |||
| cl::CommandQueue *command_queue = ocl_runtime->GetDefaultCommandQueue(); | |||
| for (auto &tensor : in_tensors_) { | |||
| auto buffer = static_cast<cl::Buffer *>(allocator_->GetBuffer(tensor->Data())); | |||
| ocl_runtime->MapBuffer(*buffer, CL_MAP_READ, tensor->Size(), command_queue, true); | |||
| } | |||
| for (auto &tensor : out_tensors_) { | |||
| auto buffer = static_cast<cl::Buffer *>(allocator_->GetBuffer(tensor->Data())); | |||
| ocl_runtime->MapBuffer(*buffer, CL_MAP_WRITE, tensor->Size(), command_queue, true); | |||
| } | |||
| memcpy(out_tensors_[0]->Data(), in_tensors_[0]->Data(), in_tensors_[0]->Size()); | |||
| memcpy(reinterpret_cast<char *>(out_tensors_[0]->Data()) + in_tensors_[0]->Size(), in_tensors_[1]->Data(), | |||
| in_tensors_[1]->Size()); | |||
| for (auto tensors : {&in_tensors_, &out_tensors_}) { | |||
| for (auto &tensor : *tensors) { | |||
| auto buffer = static_cast<cl::Buffer *>(allocator_->GetBuffer(tensor->Data())); | |||
| ocl_runtime->UnmapBuffer(*buffer, tensor->Data()); | |||
| } | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int ConcatGetBiggestDividerWithPriority(int number, int max_divider) { | |||
| if (number % 8 == 0 && max_divider >= 8) { | |||
| return number / 8; | |||
| @@ -154,21 +124,19 @@ void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> * | |||
| local->push_back(z); | |||
| } | |||
| int ConcatOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| MS_LOG(DEBUG) << this->name() << " Running! "; | |||
| auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_); | |||
| if (param->axis_ == 0) { | |||
| return Run_axis0(); | |||
| } | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| auto input0_shape = in_tensors_[0]->shape(); | |||
| auto input1_shape = in_tensors_[1]->shape(); | |||
| auto input1_shape = in_tensors_[0]->shape(); | |||
| auto input2_shape = in_tensors_[1]->shape(); | |||
| auto output_shape = out_tensors_[0]->shape(); | |||
| cl_int2 input0_shape2_ = {UP_DIV(input0_shape[3], C4NUM), UP_DIV(input1_shape[3], C4NUM)}; // change | |||
| cl_int4 input_shape1_ = {input1_shape[0], input1_shape[1], input1_shape[2], UP_DIV(input1_shape[3], C4NUM)}; | |||
| cl_int4 input_shape2_ = {input2_shape[0], input2_shape[1], input2_shape[2], UP_DIV(input2_shape[3], C4NUM)}; | |||
| cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], UP_DIV(output_shape[3], C4NUM)}; | |||
| uint32_t OH = output_shape[1]; // N*H | |||
| uint32_t OH = output_shape[0] * output_shape[1]; // N*H | |||
| uint32_t OW = output_shape[2]; | |||
| uint32_t OC = UP_DIV(output_shape[3], C4NUM); | |||
| @@ -182,23 +150,28 @@ int ConcatOpenCLKernel::Run() { | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->Data()); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->Data()); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->Data()); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, input0_shape2_); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape1_); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape2_); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, param->axis_); | |||
| } else if (in_tensors_.size() == 3) { | |||
| auto input2_shape = in_tensors_[2]->shape(); | |||
| cl_int3 input0_shape3_ = {UP_DIV(input0_shape[3], C4NUM), UP_DIV(input1_shape[3], C4NUM), | |||
| UP_DIV(input2_shape[3], C4NUM)}; | |||
| auto input3_shape = in_tensors_[2]->shape(); | |||
| cl_int4 input_shape3_ = {input3_shape[0], input3_shape[1], input3_shape[2], UP_DIV(input3_shape[3], C4NUM)}; | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->Data()); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->Data()); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[2]->Data()); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->Data()); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, input0_shape3_); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape1_); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape2_); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape3_); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape_); | |||
| ocl_runtime->SetKernelArg(kernel_, arg_cn++, param->axis_); | |||
| } else if (in_tensors_.size() < 2) { | |||
| MS_LOG(ERROR) << " inputs must been >=2"; | |||
| MS_LOG(ERROR) << " input sizes must >= 2 "; | |||
| return RET_ERROR; | |||
| } else { | |||
| MS_LOG(ERROR) << "only support inputs<=3"; | |||
| MS_LOG(ERROR) << " only support inputs <= 3 "; | |||
| return RET_ERROR; | |||
| } | |||
| ocl_runtime->RunKernel(kernel_, global, local, nullptr); | |||
| @@ -213,12 +186,12 @@ kernel::LiteKernel *OpenCLConcatKernelCreator(const std::vector<lite::tensor::Te | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| auto *kernel = new (std::nothrow) ConcatOpenCLKernel(opParameter, inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "new ConcatOpenCLKernel failed"; | |||
| MS_LOG(ERROR) << " new ConcatOpenCLKernel failed "; | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init kernel failed, name: Convolution"; | |||
| MS_LOG(ERROR) << " Init kernel failed, name: Concat "; | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| @@ -37,8 +37,6 @@ class ConcatOpenCLKernel : public OpenCLKernel { | |||
| int ReSize() override; | |||
| int Run_axis0(); | |||
| int Run() override; | |||
| int GetImageSize(size_t idx, std::vector<size_t> *img_size) override; | |||
| @@ -97,7 +97,7 @@ void SlcieGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *l | |||
| local->push_back(z); | |||
| } | |||
| int SliceOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| MS_LOG(DEBUG) << this->name() << " Running! "; | |||
| auto param = reinterpret_cast<SliceParameter *>(this->op_parameter_); | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| auto input_shape = in_tensors_[0]->shape(); | |||
| @@ -131,12 +131,12 @@ kernel::LiteKernel *OpenCLSliceKernelCreator(const std::vector<lite::tensor::Ten | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| auto *kernel = new (std::nothrow) SliceOpenCLKernel(opParameter, inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "new SliceOpenCLKernel failed"; | |||
| MS_LOG(ERROR) << " new SliceOpenCLKernel failed "; | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init kernel failed, name: Convolution"; | |||
| MS_LOG(ERROR) << " Init kernel failed, name: Slice "; | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| @@ -94,7 +94,7 @@ int ToFormatOpenCLKernel::InitNHWCShape() { | |||
| int ToFormatOpenCLKernel::ReSize() { return RET_OK; } | |||
| int ToFormatOpenCLKernel::GetGlobalSize(size_t idx, std::vector<size_t> *global_size) { | |||
| std::vector<size_t> vec = {nhwc_shape_[1], nhwc_shape_[2], UP_DIV(nhwc_shape_[3], C4NUM)}; | |||
| std::vector<size_t> vec = {nhwc_shape_[0] * nhwc_shape_[1], nhwc_shape_[2], UP_DIV(nhwc_shape_[3], C4NUM)}; | |||
| *global_size = std::move(vec); | |||
| return RET_OK; | |||
| } | |||
| @@ -107,13 +107,13 @@ int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size | |||
| size_t im_dst_x, im_dst_y; | |||
| std::vector<int> shapex = out_tensors_[0]->shape(); | |||
| if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) { | |||
| int c = shapex[1]; | |||
| int h = shapex[2]; | |||
| int c = shapex[1] * shapex[2]; | |||
| int h = shapex[0]; | |||
| int w = shapex[3]; | |||
| im_dst_y = h * UP_DIV(c, C4NUM); | |||
| im_dst_x = w; | |||
| } else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) { | |||
| int h = shapex[1]; | |||
| int h = shapex[0] * shapex[1]; | |||
| int w = shapex[2]; | |||
| int c = shapex[3]; | |||
| im_dst_x = w * UP_DIV(c, C4NUM); | |||
| @@ -32,13 +32,6 @@ class TestBatchnormOpenCLfp16 : public mindspore::CommonTest { | |||
| TestBatchnormOpenCLfp16() {} | |||
| }; | |||
| template <typename T> | |||
| void CompareOutputData1(T *output_data, T *correct_data, int size, float err_bound) { | |||
| for (size_t i = 0; i < size; i++) { | |||
| T abs = fabs(output_data[i] - correct_data[i]); | |||
| ASSERT_LE(abs, err_bound); | |||
| } | |||
| } | |||
| TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) { | |||
| MS_LOG(INFO) << "begin test"; | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| @@ -46,7 +39,7 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) { | |||
| ocl_runtime->Init(); | |||
| auto allocator = ocl_runtime->GetAllocator(); | |||
| MS_LOG(INFO) << "Read tensors from .bin"; | |||
| MS_LOG(INFO) << " Read tensors from .bin "; | |||
| std::vector<int> input_shape = {1, 256, 256, 48}; | |||
| std::vector<int> output_shape = {1, 256, 256, 48}; | |||
| auto data_type = kNumberTypeFloat32; | |||
| @@ -59,7 +52,7 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) { | |||
| std::string var_path = "./test_data/batchnorm_varfp16.bin"; | |||
| std::string offset_path = "./test_data/batchnorm_offsetfp16.bin"; | |||
| std::string scale_path = "./test_data/batchnorm_scalefp16.bin"; | |||
| std::string output_path = "./test_data/batchnorm_out_datafp16.bin"; | |||
| std::string output_path = "./test_data/batchnorm_correctdatafp16.bin"; | |||
| auto input_data = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); | |||
| auto correct_data = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(output_path.c_str(), &output_size)); | |||
| size_t mean_size, var_size, scale_size, offset_size; | |||
| @@ -68,7 +61,7 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) { | |||
| auto scale_data = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(scale_path.c_str(), &scale_size)); | |||
| auto offset_data = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(offset_path.c_str(), &offset_size)); | |||
| MS_LOG(INFO) << "construct tensors"; | |||
| MS_LOG(INFO) << " construct tensors "; | |||
| lite::tensor::Tensor *tensor_data = | |||
| new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NHWC, tensor_type); | |||
| lite::tensor::Tensor *tensor_mean = | |||
| @@ -81,13 +74,13 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) { | |||
| new (std::nothrow) lite::tensor::Tensor(data_type, {1, 1, 1, input_shape[3]}, schema::Format_NHWC, tensor_type); | |||
| if (tensor_data == nullptr || tensor_mean == nullptr || tensor_var == nullptr || tensor_scale == nullptr || | |||
| tensor_offset == nullptr) { | |||
| MS_LOG(INFO) << "init tensor failed"; | |||
| MS_LOG(INFO) << " init tensor failed "; | |||
| return; | |||
| } | |||
| auto *output_tensor = | |||
| new (std::nothrow) lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC4, tensor_type); | |||
| if (output_tensor == nullptr) { | |||
| MS_LOG(INFO) << "init tensor failed"; | |||
| MS_LOG(INFO) << " init tensor failed "; | |||
| delete tensor_data; | |||
| delete tensor_mean; | |||
| delete tensor_var; | |||
| @@ -98,10 +91,10 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) { | |||
| std::vector<lite::tensor::Tensor *> inputs = {tensor_data, tensor_scale, tensor_offset, tensor_mean, tensor_var}; | |||
| std::vector<lite::tensor::Tensor *> outputs{output_tensor}; | |||
| MS_LOG(INFO) << "initialize tensors"; | |||
| MS_LOG(INFO) << " initialize tensors "; | |||
| auto param = new (std::nothrow) BatchNormParameter(); | |||
| if (param == nullptr) { | |||
| MS_LOG(INFO) << "new BatchNormParameter failed"; | |||
| MS_LOG(INFO) << " new BatchNormParameter failed "; | |||
| for (auto tensor : outputs) { | |||
| delete tensor; | |||
| } | |||
| @@ -111,7 +104,7 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) { | |||
| auto *batchnorm_kernel = | |||
| new (std::nothrow) kernel::BatchNormOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| if (batchnorm_kernel == nullptr) { | |||
| MS_LOG(INFO) << "new kernel::BatchNorm_kernel failed"; | |||
| MS_LOG(INFO) << " new kernel::BatchNorm_kernel failed "; | |||
| for (auto tensor : outputs) { | |||
| delete tensor; | |||
| } | |||
| @@ -125,11 +118,11 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) { | |||
| input_tensor->MallocData(allocator); | |||
| } | |||
| MS_LOG(INFO) << "initialize sub_graph"; | |||
| MS_LOG(INFO) << " initialize sub_graph "; | |||
| std::vector<kernel::LiteKernel *> kernels{batchnorm_kernel}; | |||
| auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); | |||
| if (sub_graph == nullptr) { | |||
| MS_LOG(INFO) << "new kernel::SubGraphOpenCLKernel failed"; | |||
| MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed "; | |||
| for (auto tensor : outputs) { | |||
| delete tensor; | |||
| } | |||
| @@ -138,7 +131,7 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) { | |||
| return; | |||
| } | |||
| sub_graph->Init(); | |||
| MS_LOG(INFO) << "init tensors"; | |||
| MS_LOG(INFO) << " init tensors "; | |||
| memcpy(inputs[0]->Data(), input_data, input_size); | |||
| memcpy(inputs[1]->Data(), scale_data, scale_size); | |||
| memcpy(inputs[2]->Data(), offset_data, offset_size); | |||
| @@ -148,7 +141,7 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) { | |||
| sub_graph->Run(); | |||
| auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->Data()); | |||
| CompareOutputData1(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.0001); | |||
| CompareOutputData(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.01); | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| @@ -158,15 +151,14 @@ TEST_F(TestBatchnormOpenCLfp16, Batchnormfp16input_dim4) { | |||
| delete param; | |||
| delete batchnorm_kernel; | |||
| delete sub_graph; | |||
| lite::opencl::OpenCLRuntime::DeleteInstance(); | |||
| } | |||
| TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) { | |||
| MS_LOG(INFO) << "begin test"; | |||
| MS_LOG(INFO) << " begin test "; | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| ocl_runtime->Init(); | |||
| auto allocator = ocl_runtime->GetAllocator(); | |||
| MS_LOG(INFO) << "Read tensors from .bin"; | |||
| MS_LOG(INFO) << " Read tensors from .bin "; | |||
| std::vector<int> input_shape = {1, 256, 256, 47}; | |||
| std::vector<int> output_shape = {1, 256, 256, 47}; | |||
| auto data_type = kNumberTypeFloat32; | |||
| @@ -188,7 +180,7 @@ TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) { | |||
| auto scale_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(scale_path.c_str(), &scale_size)); | |||
| auto offset_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(offset_path.c_str(), &offset_size)); | |||
| MS_LOG(INFO) << "construct tensors"; | |||
| MS_LOG(INFO) << " construct tensors "; | |||
| lite::tensor::Tensor *tensor_data = | |||
| new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NHWC, tensor_type); | |||
| lite::tensor::Tensor *tensor_mean = | |||
| @@ -201,13 +193,13 @@ TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) { | |||
| new (std::nothrow) lite::tensor::Tensor(data_type, {1, 1, 1, input_shape[3]}, schema::Format_NHWC, tensor_type); | |||
| if (tensor_data == nullptr || tensor_mean == nullptr || tensor_var == nullptr || tensor_scale == nullptr || | |||
| tensor_offset == nullptr) { | |||
| MS_LOG(INFO) << "init tensor failed"; | |||
| MS_LOG(INFO) << " init tensor failed "; | |||
| return; | |||
| } | |||
| auto *output_tensor = | |||
| new (std::nothrow) lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC4, tensor_type); | |||
| if (output_tensor == nullptr) { | |||
| MS_LOG(INFO) << "init tensor failed"; | |||
| MS_LOG(INFO) << " init tensor failed "; | |||
| delete tensor_data; | |||
| delete tensor_mean; | |||
| delete tensor_var; | |||
| @@ -218,10 +210,10 @@ TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) { | |||
| std::vector<lite::tensor::Tensor *> inputs = {tensor_data, tensor_scale, tensor_offset, tensor_mean, tensor_var}; | |||
| std::vector<lite::tensor::Tensor *> outputs{output_tensor}; | |||
| MS_LOG(INFO) << "initialize tensors"; | |||
| MS_LOG(INFO) << " initialize tensors "; | |||
| auto param = new (std::nothrow) BatchNormParameter(); | |||
| if (param == nullptr) { | |||
| MS_LOG(INFO) << "new BatchNormParameter failed"; | |||
| MS_LOG(INFO) << " new BatchNormParameter failed "; | |||
| for (auto tensor : outputs) { | |||
| delete tensor; | |||
| } | |||
| @@ -231,7 +223,7 @@ TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) { | |||
| auto *batchnorm_kernel = | |||
| new (std::nothrow) kernel::BatchNormOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| if (batchnorm_kernel == nullptr) { | |||
| MS_LOG(INFO) << "new kernel::BatchNorm_kernel failed"; | |||
| MS_LOG(INFO) << " new kernel::BatchNorm_kernel failed "; | |||
| for (auto tensor : outputs) { | |||
| delete tensor; | |||
| } | |||
| @@ -245,11 +237,11 @@ TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) { | |||
| input_tensor->MallocData(allocator); | |||
| } | |||
| MS_LOG(INFO) << "initialize sub_graph"; | |||
| MS_LOG(INFO) << " initialize sub_graph "; | |||
| std::vector<kernel::LiteKernel *> kernels{batchnorm_kernel}; | |||
| auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); | |||
| if (sub_graph == nullptr) { | |||
| MS_LOG(INFO) << "new kernel::SubGraphOpenCLKernel failed"; | |||
| MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed "; | |||
| for (auto tensor : outputs) { | |||
| delete tensor; | |||
| } | |||
| @@ -258,7 +250,7 @@ TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) { | |||
| return; | |||
| } | |||
| sub_graph->Init(); | |||
| MS_LOG(INFO) << "init tensors"; | |||
| MS_LOG(INFO) << " init tensors "; | |||
| memcpy(inputs[0]->Data(), input_data, input_size); | |||
| memcpy(inputs[1]->Data(), scale_data, scale_size); | |||
| memcpy(inputs[2]->Data(), offset_data, offset_size); | |||
| @@ -268,7 +260,7 @@ TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) { | |||
| sub_graph->Run(); | |||
| auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->Data()); | |||
| CompareOutputData1(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.0001); | |||
| CompareOutputData(output_data_gpu, correct_data, output_tensor->ElementsNum(), 0.0001); | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| @@ -278,6 +270,5 @@ TEST_F(TestBatchnormOpenCLfp32, Batchnormfp32input_dim4) { | |||
| delete param; | |||
| delete batchnorm_kernel; | |||
| delete sub_graph; | |||
| lite::opencl::OpenCLRuntime::DeleteInstance(); | |||
| } | |||
| } // namespace mindspore | |||
| @@ -18,70 +18,10 @@ | |||
| #include "utils/log_adapter.h" | |||
| #include "common/common_test.h" | |||
| #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" | |||
| #include "mindspore/lite/src/common/file_utils.h" | |||
| #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" | |||
| #include "mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h" | |||
| template <typename T> | |||
| void ConcatComputeByCPU_2input_dim4_axis3(const T *input0, const T *input1, T *output, std::vector<int> input_shape0, | |||
| std::vector<int> input_shape1, std::vector<int> output_shape, | |||
| const int axis) { | |||
| int postion, index0 = 0, index1 = 0; | |||
| for (int i = 0; i < output_shape[0]; i++) { | |||
| for (int j = 0; j < output_shape[1]; j++) { | |||
| for (int k = 0; k < output_shape[2]; k++) { | |||
| postion = i * output_shape[1] * output_shape[2] * output_shape[3] + j * output_shape[2] * output_shape[3] + | |||
| k * output_shape[3]; | |||
| for (int w = 0; w < output_shape[3]; w++) { | |||
| if (w < input_shape0[3] + input_shape1[3]) { | |||
| output[postion++] = (w < input_shape0[3]) ? input0[index0++] : input1[index1++]; | |||
| } else { | |||
| for (int ind = input_shape0[3] + input_shape1[3]; ind < output_shape[3]; ind++) { | |||
| output[postion++] = 0; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| template <typename T> | |||
| void ConcatComputeByCPU_3input_dim4_axis3(T *input0, T *input1, T *input2, T *output, std::vector<int> input_shape0, | |||
| std::vector<int> input_shape1, std::vector<int> input_shape2, | |||
| std::vector<int> output_shape, const int axis) { | |||
| int postion, index0 = 0, index1 = 0, index2 = 0; | |||
| for (int i = 0; i < output_shape[0]; i++) { | |||
| for (int j = 0; j < output_shape[1]; j++) { | |||
| for (int k = 0; k < output_shape[2]; k++) { | |||
| postion = i * output_shape[1] * output_shape[2] * output_shape[3] + j * output_shape[2] * output_shape[3] + | |||
| k * output_shape[3]; | |||
| for (int w = 0; w < output_shape[3]; w++) { | |||
| if (w < input_shape0[3]) { | |||
| int align = UP_DIV(input_shape0[3], 4) * 4; | |||
| index0 = i * input_shape0[1] * input_shape0[2] * align + j * input_shape0[2] * align + k * align + w; | |||
| output[postion++] = input0[index0]; | |||
| } else if (w >= input_shape0[3] && w < (input_shape0[3] + input_shape1[3])) { | |||
| int align = UP_DIV(input_shape1[3], 4) * 4; | |||
| index1 = i * input_shape1[1] * input_shape1[2] * align + j * input_shape1[2] * align + k * align + w - | |||
| input_shape0[3]; | |||
| output[postion++] = input1[index1]; | |||
| } else if ((input_shape0[3] + input_shape1[3]) <= w && | |||
| w < (input_shape0[3] + input_shape1[3] + input_shape2[3])) { | |||
| int align = UP_DIV(input_shape2[3], 4) * 4; | |||
| index2 = i * input_shape2[1] * input_shape2[2] * align + j * input_shape2[2] * align + k * align + w - | |||
| input_shape0[3] - input_shape1[3]; | |||
| output[postion++] = input2[index2]; | |||
| } else { | |||
| for (int ind = input_shape0[3] + input_shape1[3] + input_shape2[3]; ind < output_shape[3]; ind++) { | |||
| output[postion++] = 0; | |||
| } | |||
| break; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| namespace mindspore { | |||
| class TestConcatOpenCLfp32 : public mindspore::CommonTest { | |||
| public: | |||
| @@ -100,17 +40,29 @@ void CompareOutputData1(T *output_data, T *correct_data, int size, float err_bou | |||
| } | |||
| } | |||
| TEST_F(TestConcatOpenCLfp16, ConcatFp16_2input_dim4_axis3) { | |||
| MS_LOG(INFO) << "begin test"; | |||
| MS_LOG(INFO) << " begin test "; | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| ocl_runtime->SetFp16Enable(true); | |||
| ocl_runtime->Init(); | |||
| auto allocator = ocl_runtime->GetAllocator(); | |||
| MS_LOG(INFO) << "init tensors"; | |||
| constexpr int INPUT_NUM = 3; | |||
| std::array<std::vector<int>, INPUT_NUM> input_shapes = { | |||
| std::vector<int>{1, 16, 256, 80}, std::vector<int>{1, 16, 256, 80}, std::vector<int>{1, 16, 256, 80}}; | |||
| std::vector<int> output_shape = {1, 16, 256, 240}; | |||
| // get the input from .bin | |||
| size_t input1_size, input2_size, input3_size, output_size; | |||
| std::string input1Ppath = "./test_data/concatfp16_input1.bin"; | |||
| std::string input2Ppath = "./test_data/concatfp16_input2.bin"; | |||
| std::string input3Ppath = "./test_data/concatfp16_input3.bin"; | |||
| std::string correctOutputPath = "./test_data/concatfp16_output.bin"; | |||
| auto input_data1 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size)); | |||
| auto input_data2 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input2Ppath.c_str(), &input2_size)); | |||
| auto input_data3 = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input3Ppath.c_str(), &input3_size)); | |||
| auto correctOutput = | |||
| reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size)); | |||
| MS_LOG(INFO) << " init tensors "; | |||
| constexpr int INPUT_NUM = 2; | |||
| std::array<std::vector<int>, INPUT_NUM> input_shapes = {std::vector<int>{1, 19, 19, 96}, | |||
| std::vector<int>{1, 19, 19, 96}}; | |||
| std::vector<int> output_shape = {2, 19, 19, 96}; | |||
| auto data_type = kNumberTypeFloat16; | |||
| auto tensor_type = schema::NodeType_ValueNode; | |||
| std::vector<lite::tensor::Tensor *> inputs; | |||
| @@ -118,26 +70,26 @@ TEST_F(TestConcatOpenCLfp16, ConcatFp16_2input_dim4_axis3) { | |||
| auto input_temp = new (std::nothrow) lite::tensor::Tensor(data_type, shape, schema::Format_NHWC4, tensor_type); | |||
| inputs.push_back(input_temp); | |||
| if (input_temp == nullptr) { | |||
| MS_LOG(INFO) << "new input_tensor failed"; | |||
| MS_LOG(INFO) << " new input_tensor failed "; | |||
| return; | |||
| } | |||
| } | |||
| auto *output_tensor = | |||
| new (std::nothrow) lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC4, tensor_type); | |||
| if (output_tensor == nullptr) { | |||
| MS_LOG(INFO) << "new output_tensor failed"; | |||
| MS_LOG(INFO) << " new output_tensor failed "; | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| return; | |||
| } | |||
| std::vector<lite::tensor::Tensor *> outputs{output_tensor}; | |||
| MS_LOG(INFO) << "input_shapes size=: " << input_shapes.size(); | |||
| MS_LOG(INFO) << " input_shapes size =: " << input_shapes.size(); | |||
| MS_LOG(INFO) << "initialize tensors"; | |||
| MS_LOG(INFO) << " initialize tensors "; | |||
| auto param = new (std::nothrow) ConcatParameter(); | |||
| if (param == nullptr) { | |||
| MS_LOG(INFO) << "new ConcatParameter failed"; | |||
| MS_LOG(INFO) << " new ConcatParameter failed "; | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| @@ -146,11 +98,11 @@ TEST_F(TestConcatOpenCLfp16, ConcatFp16_2input_dim4_axis3) { | |||
| } | |||
| return; | |||
| } | |||
| param->axis_ = 3; | |||
| param->axis_ = 0; | |||
| auto *concat_kernel = | |||
| new (std::nothrow) kernel::ConcatOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| if (concat_kernel == nullptr) { | |||
| MS_LOG(INFO) << "new kernel::ConcatOpenCLKernel failed"; | |||
| MS_LOG(INFO) << " new kernel::ConcatOpenCLKernel failed "; | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| @@ -165,12 +117,11 @@ TEST_F(TestConcatOpenCLfp16, ConcatFp16_2input_dim4_axis3) { | |||
| for (auto &input_tensor : inputs) { | |||
| input_tensor->MallocData(allocator); | |||
| } | |||
| MS_LOG(INFO) << "initialize sub_graph"; | |||
| MS_LOG(INFO) << " initialize sub_graph "; | |||
| std::vector<kernel::LiteKernel *> kernels{concat_kernel}; | |||
| auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); | |||
| if (sub_graph == nullptr) { | |||
| MS_LOG(INFO) << "new kernel::SubGraphOpenCLKernel failed"; | |||
| MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed "; | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| @@ -182,33 +133,22 @@ TEST_F(TestConcatOpenCLfp16, ConcatFp16_2input_dim4_axis3) { | |||
| return; | |||
| } | |||
| sub_graph->Init(); | |||
| unsigned int seed = 123; | |||
| MS_LOG(INFO) << "initialize input data"; | |||
| for (auto &input_tensor : inputs) { | |||
| auto input_data = reinterpret_cast<float16_t *>(input_tensor->Data()); | |||
| for (int i = 0; i < input_tensor->ElementsNum(); ++i) { | |||
| input_data[i] = static_cast<float16_t>(rand_r(&seed) % 10 + 1); | |||
| } | |||
| } | |||
| // compute the result for CPU | |||
| auto *input_data0 = reinterpret_cast<float16_t *>(inputs[0]->Data()); | |||
| auto *input_data1 = reinterpret_cast<float16_t *>(inputs[1]->Data()); | |||
| std::vector<float16_t> output_data_cpu(output_shape[0] * output_shape[1] * output_shape[2] * output_shape[3]); | |||
| MS_LOG(INFO) << " initialize input data "; | |||
| if (inputs.size() == 2) { | |||
| ConcatComputeByCPU_2input_dim4_axis3(input_data0, input_data1, output_data_cpu.data(), input_shapes[0], | |||
| input_shapes[1], output_shape, param->axis_); | |||
| } | |||
| if (inputs.size() == 3) { | |||
| auto *input_data2 = reinterpret_cast<float16_t *>(inputs[2]->Data()); | |||
| ConcatComputeByCPU_3input_dim4_axis3(input_data0, input_data1, input_data2, output_data_cpu.data(), input_shapes[0], | |||
| input_shapes[1], input_shapes[2], output_shape, param->axis_); | |||
| memcpy(inputs[0]->Data(), input_data1, input1_size); | |||
| memcpy(inputs[1]->Data(), input_data2, input2_size); | |||
| } else if (inputs.size() == 3) { | |||
| memcpy(inputs[0]->Data(), input_data1, input1_size); | |||
| memcpy(inputs[1]->Data(), input_data2, input2_size); | |||
| memcpy(inputs[2]->Data(), input_data3, input3_size); | |||
| } else { | |||
| MS_LOG(ERROR) << " input size must be 2 or 3"; | |||
| } | |||
| std::cout << "==================output data================" << std::endl; | |||
| sub_graph->Run(); | |||
| auto *output_data_gpu = reinterpret_cast<float16_t *>(output_tensor->Data()); | |||
| CompareOutputData1(output_data_gpu, output_data_cpu.data(), output_tensor->ElementsNum(), 0.00001); | |||
| CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.000001); | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| @@ -218,47 +158,57 @@ TEST_F(TestConcatOpenCLfp16, ConcatFp16_2input_dim4_axis3) { | |||
| delete param; | |||
| delete concat_kernel; | |||
| delete sub_graph; | |||
| lite::opencl::OpenCLRuntime::DeleteInstance(); | |||
| } | |||
| TEST_F(TestConcatOpenCLfp32, ConcatFp32_2input_dim4_axis3) { | |||
| MS_LOG(INFO) << "begin test"; | |||
| MS_LOG(INFO) << " begin test "; | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| ocl_runtime->Init(); | |||
| auto allocator = ocl_runtime->GetAllocator(); | |||
| MS_LOG(INFO) << "init tensors"; | |||
| // get the input from .bin | |||
| size_t input1_size, input2_size, input3_size, output_size; | |||
| std::string input1Ppath = "./test_data/concat_input1.bin"; | |||
| std::string input2Ppath = "./test_data/concat_input2.bin"; | |||
| std::string input3Ppath = "./test_data/concat_input3.bin"; | |||
| std::string correctOutputPath = "./test_data/concat_output.bin"; | |||
| auto input_data1 = reinterpret_cast<float *>(mindspore::lite::ReadFile(input1Ppath.c_str(), &input1_size)); | |||
| auto input_data2 = reinterpret_cast<float *>(mindspore::lite::ReadFile(input2Ppath.c_str(), &input2_size)); | |||
| auto input_data3 = reinterpret_cast<float *>(mindspore::lite::ReadFile(input3Ppath.c_str(), &input3_size)); | |||
| auto correctOutput = reinterpret_cast<float *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size)); | |||
| MS_LOG(INFO) << " init tensors "; | |||
| constexpr int INPUT_NUM = 3; | |||
| std::array<std::vector<int>, INPUT_NUM> input_shapes = { | |||
| std::vector<int>{1, 16, 256, 80}, std::vector<int>{1, 16, 256, 80}, std::vector<int>{1, 16, 256, 80}}; | |||
| std::vector<int> output_shape = {1, 16, 256, 240}; | |||
| std::vector<int> output_shape = {1, 48, 256, 80}; | |||
| auto data_type = kNumberTypeFloat32; | |||
| auto tensor_type = schema::NodeType_ValueNode; | |||
| std::vector<lite::tensor::Tensor *> inputs; | |||
| for (auto &shape : input_shapes) { | |||
| auto input_temp = new (std::nothrow) lite::tensor::Tensor(data_type, shape, schema::Format_NHWC4, tensor_type); | |||
| auto input_temp = new (std::nothrow) lite::tensor::Tensor(data_type, shape, schema::Format_NHWC, tensor_type); | |||
| inputs.push_back(input_temp); | |||
| if (input_temp == nullptr) { | |||
| MS_LOG(INFO) << "new input_tensor failed"; | |||
| MS_LOG(INFO) << " new input_tensor failed "; | |||
| return; | |||
| } | |||
| } | |||
| auto *output_tensor = | |||
| new (std::nothrow) lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC4, tensor_type); | |||
| new (std::nothrow) lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC, tensor_type); | |||
| if (output_tensor == nullptr) { | |||
| MS_LOG(INFO) << "new output_tensor failed"; | |||
| MS_LOG(INFO) << " new output_tensor failed "; | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| return; | |||
| } | |||
| std::vector<lite::tensor::Tensor *> outputs{output_tensor}; | |||
| MS_LOG(INFO) << "input_shapes size=: " << input_shapes.size(); | |||
| MS_LOG(INFO) << " input_shapes size=: " << input_shapes.size(); | |||
| MS_LOG(INFO) << "initialize tensors"; | |||
| MS_LOG(INFO) << " initialize tensors "; | |||
| auto param = new (std::nothrow) ConcatParameter(); | |||
| if (param == nullptr) { | |||
| MS_LOG(INFO) << "new ConcatParameter failed"; | |||
| MS_LOG(INFO) << " new ConcatParameter failed "; | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| @@ -267,11 +217,11 @@ TEST_F(TestConcatOpenCLfp32, ConcatFp32_2input_dim4_axis3) { | |||
| } | |||
| return; | |||
| } | |||
| param->axis_ = 3; | |||
| param->axis_ = 1; | |||
| auto *concat_kernel = | |||
| new (std::nothrow) kernel::ConcatOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| if (concat_kernel == nullptr) { | |||
| MS_LOG(INFO) << "new kernel::ConcatOpenCLKernel failed"; | |||
| MS_LOG(INFO) << " new kernel::ConcatOpenCLKernel failed "; | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| @@ -287,11 +237,11 @@ TEST_F(TestConcatOpenCLfp32, ConcatFp32_2input_dim4_axis3) { | |||
| input_tensor->MallocData(allocator); | |||
| } | |||
| MS_LOG(INFO) << "initialize sub_graph"; | |||
| MS_LOG(INFO) << " initialize sub_graph "; | |||
| std::vector<kernel::LiteKernel *> kernels{concat_kernel}; | |||
| auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); | |||
| if (sub_graph == nullptr) { | |||
| MS_LOG(INFO) << "new kernel::SubGraphOpenCLKernel failed"; | |||
| MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed "; | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| @@ -303,33 +253,22 @@ TEST_F(TestConcatOpenCLfp32, ConcatFp32_2input_dim4_axis3) { | |||
| return; | |||
| } | |||
| sub_graph->Init(); | |||
| unsigned int seed = 123; | |||
| MS_LOG(INFO) << "initialize input data"; | |||
| for (auto &input_tensor : inputs) { | |||
| auto input_data = reinterpret_cast<float *>(input_tensor->Data()); | |||
| for (int i = 0; i < input_tensor->ElementsNum(); ++i) { | |||
| input_data[i] = static_cast<float>(rand_r(&seed) % 10 + 1); | |||
| } | |||
| } | |||
| // compute the result for CPU | |||
| auto *input_data0 = reinterpret_cast<float *>(inputs[0]->Data()); | |||
| auto *input_data1 = reinterpret_cast<float *>(inputs[1]->Data()); | |||
| std::vector<float> output_data_cpu(output_shape[0] * output_shape[1] * output_shape[2] * output_shape[3]); | |||
| MS_LOG(INFO) << " initialize input data "; | |||
| if (inputs.size() == 2) { | |||
| ConcatComputeByCPU_2input_dim4_axis3(input_data0, input_data1, output_data_cpu.data(), input_shapes[0], | |||
| input_shapes[1], output_shape, param->axis_); | |||
| } | |||
| if (inputs.size() == 3) { | |||
| auto *input_data2 = reinterpret_cast<float *>(inputs[2]->Data()); | |||
| ConcatComputeByCPU_3input_dim4_axis3(input_data0, input_data1, input_data2, output_data_cpu.data(), input_shapes[0], | |||
| input_shapes[1], input_shapes[2], output_shape, param->axis_); | |||
| memcpy(inputs[0]->Data(), input_data1, input1_size); | |||
| memcpy(inputs[1]->Data(), input_data2, input2_size); | |||
| } else if (inputs.size() == 3) { | |||
| memcpy(inputs[0]->Data(), input_data1, input1_size); | |||
| memcpy(inputs[1]->Data(), input_data2, input2_size); | |||
| memcpy(inputs[2]->Data(), input_data3, input3_size); | |||
| } else { | |||
| MS_LOG(ERROR) << " input size must be 2 or 3 "; | |||
| } | |||
| std::cout << "==================output data================" << std::endl; | |||
| sub_graph->Run(); | |||
| auto *output_data_gpu = reinterpret_cast<float *>(output_tensor->Data()); | |||
| CompareOutputData1(output_data_gpu, output_data_cpu.data(), output_tensor->ElementsNum(), 0.00001); | |||
| CompareOutputData1(output_data_gpu, correctOutput, output_tensor->ElementsNum(), 0.00001); | |||
| for (auto tensor : inputs) { | |||
| delete tensor; | |||
| } | |||
| @@ -339,6 +278,5 @@ TEST_F(TestConcatOpenCLfp32, ConcatFp32_2input_dim4_axis3) { | |||
| delete param; | |||
| delete concat_kernel; | |||
| delete sub_graph; | |||
| lite::opencl::OpenCLRuntime::DeleteInstance(); | |||
| } | |||
| } // namespace mindspore | |||
| @@ -41,44 +41,43 @@ void CompareOutputData1(T *output_data, T *correct_data, int size, float err_bou | |||
| } | |||
| TEST_F(TestSliceOpenCLfp32, Slicefp32input_dim4) { | |||
| MS_LOG(INFO) << "begin test"; | |||
| MS_LOG(INFO) << " begin test "; | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| ocl_runtime->Init(); | |||
| auto allocator = ocl_runtime->GetAllocator(); | |||
| MS_LOG(INFO) << "Read tensors from .bin"; | |||
| std::vector<int> input_shape = {1, 256, 256, 48}; | |||
| std::vector<int> output_shape = {1, 255, 255, 15}; | |||
| std::vector<int> begin = {0, 1, 1, 7}; | |||
| std::vector<int> size = {1, 255, 255, 15}; | |||
| MS_LOG(INFO) << " Read tensors from .bin "; | |||
| std::vector<int> input_shape = {1, 19, 19, 96}; | |||
| std::vector<int> output_shape = {1, 10, 10, 13}; | |||
| std::vector<int> begin = {0, 2, 3, 4}; | |||
| std::vector<int> size = {1, 10, 10, 13}; | |||
| auto data_type = kNumberTypeFloat32; | |||
| auto tensor_type = schema::NodeType_ValueNode; | |||
| // get the input from .bin | |||
| size_t input_size, output_size; | |||
| std::string input_path = "./test_data/in_datafp32.bin"; | |||
| std::string output_path = "./test_data/out_datafp32.bin"; | |||
| std::string input_path = "./test_data/in_slicefp32.bin"; | |||
| std::string output_path = "./test_data/out_slicefp32.bin"; | |||
| auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); | |||
| auto correct_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(output_path.c_str(), &output_size)); | |||
| MS_LOG(INFO) << "construct tensors"; | |||
| MS_LOG(INFO) << " construct tensors "; | |||
| lite::tensor::Tensor *tensor_data = | |||
| new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NHWC, tensor_type); | |||
| if (tensor_data == nullptr) { | |||
| MS_LOG(INFO) << "init tensor failed"; | |||
| MS_LOG(INFO) << " init tensor failed "; | |||
| return; | |||
| } | |||
| auto *output_tensor = | |||
| new (std::nothrow) lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC4, tensor_type); | |||
| if (output_tensor == nullptr) { | |||
| delete tensor_data; | |||
| MS_LOG(INFO) << "init tensor failed"; | |||
| MS_LOG(INFO) << " init tensor failed "; | |||
| return; | |||
| } | |||
| std::vector<lite::tensor::Tensor *> inputs = {tensor_data}; | |||
| std::vector<lite::tensor::Tensor *> outputs = {output_tensor}; | |||
| MS_LOG(INFO) << "setting SliceParameter"; | |||
| MS_LOG(INFO) << "setting SliceParameter "; | |||
| auto param = new (std::nothrow) SliceParameter(); | |||
| if (param == nullptr) { | |||
| for (auto tensor : inputs) { | |||
| @@ -87,7 +86,7 @@ TEST_F(TestSliceOpenCLfp32, Slicefp32input_dim4) { | |||
| for (auto tensor : outputs) { | |||
| delete tensor; | |||
| } | |||
| MS_LOG(INFO) << "new SliceParameter failed"; | |||
| MS_LOG(INFO) << "new SliceParameter failed "; | |||
| return; | |||
| } | |||
| for (int i = 0; i < input_shape.size(); i++) { | |||
| @@ -105,7 +104,7 @@ TEST_F(TestSliceOpenCLfp32, Slicefp32input_dim4) { | |||
| delete tensor; | |||
| } | |||
| delete param; | |||
| MS_LOG(INFO) << "new kernel::slice_kernel failed"; | |||
| MS_LOG(INFO) << "new kernel::slice_kernel failed "; | |||
| return; | |||
| } | |||
| slice_kernel->Init(); | |||
| @@ -115,7 +114,7 @@ TEST_F(TestSliceOpenCLfp32, Slicefp32input_dim4) { | |||
| input_tensor->MallocData(allocator); | |||
| } | |||
| MS_LOG(INFO) << "initialize sub_graph"; | |||
| MS_LOG(INFO) << " initialize sub_graph "; | |||
| std::vector<kernel::LiteKernel *> kernels{slice_kernel}; | |||
| auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); | |||
| if (sub_graph == nullptr) { | |||
| @@ -127,12 +126,12 @@ TEST_F(TestSliceOpenCLfp32, Slicefp32input_dim4) { | |||
| } | |||
| delete param; | |||
| delete slice_kernel; | |||
| MS_LOG(INFO) << "new kernel::SubGraphOpenCLKernel failed"; | |||
| MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed "; | |||
| return; | |||
| } | |||
| sub_graph->Init(); | |||
| MS_LOG(INFO) << "init tensors"; | |||
| MS_LOG(INFO) << " init tensors "; | |||
| memcpy(inputs[0]->Data(), input_data, input_size); | |||
| std::cout << "==================output data================" << std::endl; | |||
| @@ -148,16 +147,15 @@ TEST_F(TestSliceOpenCLfp32, Slicefp32input_dim4) { | |||
| } | |||
| delete slice_kernel; | |||
| delete sub_graph; | |||
| lite::opencl::OpenCLRuntime::DeleteInstance(); | |||
| } | |||
| TEST_F(TestSliceOpenCLfp16, Slicefp16input_dim4) { | |||
| MS_LOG(INFO) << "begin test"; | |||
| MS_LOG(INFO) << " begin test "; | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| ocl_runtime->SetFp16Enable(true); | |||
| ocl_runtime->Init(); | |||
| auto allocator = ocl_runtime->GetAllocator(); | |||
| MS_LOG(INFO) << "Read tensors from .bin"; | |||
| MS_LOG(INFO) << " Read tensors from .bin "; | |||
| std::vector<int> input_shape = {1, 256, 256, 48}; | |||
| std::vector<int> output_shape = {1, 255, 255, 15}; | |||
| std::vector<int> begin = {0, 1, 1, 7}; | |||
| @@ -172,24 +170,24 @@ TEST_F(TestSliceOpenCLfp16, Slicefp16input_dim4) { | |||
| auto input_data = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); | |||
| auto correct_data = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(output_path.c_str(), &output_size)); | |||
| MS_LOG(INFO) << "construct tensors"; | |||
| MS_LOG(INFO) << " construct tensors "; | |||
| lite::tensor::Tensor *tensor_data = | |||
| new (std::nothrow) lite::tensor::Tensor(data_type, input_shape, schema::Format_NHWC, tensor_type); | |||
| if (tensor_data == nullptr) { | |||
| MS_LOG(INFO) << "init tensor failed"; | |||
| MS_LOG(INFO) << " init tensor failed "; | |||
| return; | |||
| } | |||
| auto *output_tensor = | |||
| new (std::nothrow) lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC4, tensor_type); | |||
| if (output_tensor == nullptr) { | |||
| delete tensor_data; | |||
| MS_LOG(INFO) << "init tensor failed"; | |||
| MS_LOG(INFO) << " init tensor failed "; | |||
| return; | |||
| } | |||
| std::vector<lite::tensor::Tensor *> inputs = {tensor_data}; | |||
| std::vector<lite::tensor::Tensor *> outputs = {output_tensor}; | |||
| MS_LOG(INFO) << "setting SliceParameter"; | |||
| MS_LOG(INFO) << " setting SliceParameter "; | |||
| auto param = new (std::nothrow) SliceParameter(); | |||
| if (param == nullptr) { | |||
| for (auto tensor : inputs) { | |||
| @@ -198,10 +196,10 @@ TEST_F(TestSliceOpenCLfp16, Slicefp16input_dim4) { | |||
| for (auto tensor : outputs) { | |||
| delete tensor; | |||
| } | |||
| MS_LOG(INFO) << "new SliceParameter failed"; | |||
| MS_LOG(INFO) << " new SliceParameter failed "; | |||
| return; | |||
| } | |||
| for (int i = 0; i < 4; i++) { | |||
| for (int i = 0; i < input_shape.size(); i++) { | |||
| param->begin_[i] = begin[i]; | |||
| param->size_[i] = size[i]; | |||
| } | |||
| @@ -216,7 +214,7 @@ TEST_F(TestSliceOpenCLfp16, Slicefp16input_dim4) { | |||
| delete tensor; | |||
| } | |||
| delete param; | |||
| MS_LOG(INFO) << "new kernel::slice_kernel failed"; | |||
| MS_LOG(INFO) << " new kernel::slice_kernel failed "; | |||
| return; | |||
| } | |||
| slice_kernel->Init(); | |||
| @@ -226,7 +224,7 @@ TEST_F(TestSliceOpenCLfp16, Slicefp16input_dim4) { | |||
| input_tensor->MallocData(allocator); | |||
| } | |||
| MS_LOG(INFO) << "initialize sub_graph"; | |||
| MS_LOG(INFO) << " initialize sub_graph "; | |||
| std::vector<kernel::LiteKernel *> kernels{slice_kernel}; | |||
| auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); | |||
| if (sub_graph == nullptr) { | |||
| @@ -238,12 +236,12 @@ TEST_F(TestSliceOpenCLfp16, Slicefp16input_dim4) { | |||
| } | |||
| delete param; | |||
| delete slice_kernel; | |||
| MS_LOG(INFO) << "new kernel::SubGraphOpenCLKernel failed"; | |||
| MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed "; | |||
| return; | |||
| } | |||
| sub_graph->Init(); | |||
| MS_LOG(INFO) << "init tensors"; | |||
| MS_LOG(INFO) << " init tensors "; | |||
| memcpy(inputs[0]->Data(), input_data, input_size); | |||
| std::cout << "==================output data================" << std::endl; | |||
| @@ -259,6 +257,5 @@ TEST_F(TestSliceOpenCLfp16, Slicefp16input_dim4) { | |||
| } | |||
| delete slice_kernel; | |||
| delete sub_graph; | |||
| lite::opencl::OpenCLRuntime::DeleteInstance(); | |||
| } | |||
| } // namespace mindspore | |||