diff --git a/mindspore/lite/schema/model.fbs b/mindspore/lite/schema/model.fbs index 733fc28d76..0c0ad360b6 100644 --- a/mindspore/lite/schema/model.fbs +++ b/mindspore/lite/schema/model.fbs @@ -192,7 +192,8 @@ union PrimitiveType { SpaceToBatchND, TopKV2, Return, - MakeTuple + MakeTuple, + ToFormat, } enum QuantType: int { diff --git a/mindspore/lite/schema/ops.fbs b/mindspore/lite/schema/ops.fbs index e6584dc059..2e93983c16 100644 --- a/mindspore/lite/schema/ops.fbs +++ b/mindspore/lite/schema/ops.fbs @@ -881,5 +881,10 @@ table TopKV2 { table MakeTuple { } +table ToFormat { + srcT: int; + dstT: int; +} + table Return { } \ No newline at end of file diff --git a/mindspore/lite/src/lite_kernel.h b/mindspore/lite/src/lite_kernel.h index d278b27351..36b90701d0 100644 --- a/mindspore/lite/src/lite_kernel.h +++ b/mindspore/lite/src/lite_kernel.h @@ -131,6 +131,10 @@ class LiteKernel { void AddOutKernel(LiteKernel *kernel) { this->out_kernels_.emplace_back(kernel); } + void SetInKernel(const std::vector &kernel) { this->in_kernels_ = kernel; } + + void SetOutKernel(const std::vector &kernel) { this->out_kernels_ = kernel; } + std::vector &in_kernels() { return this->in_kernels_; } std::vector &out_kernels() { return this->out_kernels_; } @@ -167,16 +171,14 @@ class SubGraphKernel : public LiteKernel { public: explicit SubGraphKernel(const std::vector &inputs, const std::vector &outputs, - const std::vector &inKernels, - const std::vector &outKernels, + const std::vector &in_kernels, + const std::vector &out_kernels, const std::vector &nodes, const lite::Context *ctx, const lite::Primitive *primitive) - : LiteKernel(nullptr, inputs, outputs, ctx, primitive), - inputs_(inputs), - outputs_(outputs), - inkernels_(inKernels), - outkernels_(outKernels), - nodes_(nodes) {} + : LiteKernel(nullptr, inputs, outputs, ctx, primitive), nodes_(nodes) { + in_kernels_ = in_kernels; + out_kernels_ = out_kernels; + } virtual int Init() { return -1; } virtual int InferShape() { return -1; 
} @@ -184,10 +186,6 @@ class SubGraphKernel : public LiteKernel { virtual int Run() { return -1; } protected: - std::vector inputs_; - std::vector outputs_; - std::vector inkernels_; - std::vector outkernels_; std::vector nodes_; }; diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl index 6438f24e6b..33d6143f9d 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl @@ -49,9 +49,6 @@ __kernel void ElementDiv(__read_only image2d_t input_a, __read_only image2d_t in float4 a = read_imagef(input_a, smp_none, (int2)(X, Y)); float4 b = read_imagef(input_b, smp_none, (int2)(X, Y)); - if (b == 0) { - return; - } write_imagef(output, (int2)(X, Y), a / b); } diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/to_format.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/to_format.cl new file mode 100644 index 0000000000..0d877b778c --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/to_format.cl @@ -0,0 +1,235 @@ +#define FLT float +#define FLT4 float4 +#define READ_IMAGE read_imagef +#define WRITE_IMAGE write_imagef +// enum Format { +// Format_NCHW = 0, +// Format_NHWC = 1, +// Format_NHWC4 = 2, +// Format_HWKC = 3, +// Format_HWCK = 4, +// Format_KCHW = 5, +// Format_CKHW = 6, +// Format_KHWC = 7, +// Format_CHWK = 8, +// Format_NC4HW4 = 100, +// Format_NUM_OF_FORMAT = 101, +// Format_MIN = Format_NCHW, +// Format_MAX = Format_NUM_OF_FORMAT +//}; +__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; +__kernel void to_format_NCHW_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + // 
WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); +} +__kernel void to_format_NHWC_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + int offset = (X * shape.z + Y) * shape.w + Z * 4; + __global FLT *src_addr = (__global FLT *)src_data; + src_addr += offset; + FLT4 data = (FLT4)(0.f); + if ((Z + 1) * 4 <= shape.w) { + data = ((__global FLT4 *)src_addr)[0]; + } else { + if ((shape.w - Z * 4) >= 1) { + data.x = src_addr[0]; + } + if ((shape.w - Z * 4) >= 2) { + data.y = src_addr[1]; + } + if ((shape.w - Z * 4) >= 3) { + data.z = src_addr[2]; + } + } + WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data); +} +__kernel void to_format_NHWC4_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); +} +__kernel void to_format_NC4HW4_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); +} +__kernel void to_format_NCHW_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), 
READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); +} +__kernel void to_format_NHWC_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); +} +__kernel void to_format_NHWC4_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); +} +__kernel void to_format_NC4HW4_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + // FLT4 src_final = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))]; + WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), src_data[(Y * size.z + Z) * size.x + X]); +} + +__kernel void to_format_NCHW_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + dst_data[(Z * size.y + Y) * size.x + X] = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.x + X, Z)); +} +__kernel void to_format_NHWC_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + // WRITE_IMAGE(dst_data, 
(int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); +} +__kernel void to_format_NHWC4_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); +} +__kernel void to_format_NC4HW4_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); +} +__kernel void to_format_NCHW_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); +} +__kernel void to_format_NHWC_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); +} +__kernel void to_format_NHWC4_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + FLT4 data = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z 
+ Z, X)); + int offset = (X * shape.z + Y) * shape.w + Z * 4; + __global FLT *dst_addr = (__global FLT *)dst_data; + dst_addr += offset; + if ((Z + 1) * 4 <= shape.w) { + ((__global FLT4 *)dst_addr)[0] = data; + } else { + if (shape.w - Z * 4 >= 1) { + dst_addr[0] = data.x; + } + if (shape.w - Z * 4 >= 2) { + dst_addr[1] = data.y; + } + if (shape.w - Z * 4 >= 3) { + dst_addr[2] = data.z; + } + } +} +__kernel void to_format_NC4HW4_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); +} +__kernel void to_format_NC4HW4_to_NC4HW4_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + dst_data[(Y * size.z + Z) * size.x + X] = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)); +} +__kernel void to_format_NHWC4_to_NHWC4_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, + int4 shape) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= size.x || Y >= size.y || Z >= size.z) { + return; + } + dst_data[(Y * size.z + Z) * size.x + X] = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)); +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc index daddfe73a9..726dc0ac36 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc @@ -329,7 +329,8 @@ kernel::LiteKernel *OpenCLConvolutionKernelCreator(const std::vector &outputs, OpParameter
*opParameter, const lite::Context *ctx, const kernel::KernelKey &desc, const lite::Primitive *primitive) { - auto *kernel = new ConvolutionOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); + auto *kernel = + new (std::nothrow) ConvolutionOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); if (kernel == nullptr) { MS_LOG(ERROR) << "Create OpenCL Convolution kernel failed!"; return nullptr; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc index 184dd5cd9e..99607ff43e 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc @@ -145,7 +145,7 @@ kernel::LiteKernel *OpenCLPooling2dKernelCreator(const std::vector &outputs, OpParameter *opParameter, const lite::Context *ctx, const kernel::KernelKey &desc, const lite::Primitive *primitive) { - auto *kernel = new (std::nothrow)PoolingOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); + auto *kernel = new (std::nothrow) PoolingOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); if (kernel == nullptr) { MS_LOG(ERROR) << "Create OpenCL Pooling kernel failed!"; return nullptr; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc index 242a27b2a6..1396cae004 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc @@ -158,7 +158,7 @@ kernel::LiteKernel *OpenCLSoftMaxKernelCreator(const std::vector &outputs, OpParameter *opParameter, const lite::Context *ctx, const kernel::KernelKey &desc, const lite::Primitive *primitive) { - auto *kernel = new (std::nothrow)SoftmaxOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); + auto *kernel = new (std::nothrow) SoftmaxOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); if (kernel == nullptr) { 
MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; return nullptr; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc new file mode 100644 index 0000000000..44d1cf8dde --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc @@ -0,0 +1,167 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/kernel/opencl/kernel/to_format.h" +#include +#include +#include +#include +#include "include/errorcode.h" +#include "src/kernel_registry.h" +#include "src/runtime/opencl/opencl_runtime.h" +#include "src/runtime/kernel/opencl/cl/fp32/to_format.cl.inc" + +using mindspore::kernel::KERNEL_ARCH::kGPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_ToFormat; + +namespace mindspore::kernel { + +int ToFormatOpenCLKernel::Init() { + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + auto parameter = reinterpret_cast(op_parameter_); + out_mem_type_ = parameter->out_mem_type; + std::string program_name = "to_format"; + std::map format_str{{schema::Format_NCHW, "NCHW"}, + {schema::Format_NHWC, "NHWC"}, + {schema::Format_NC4HW4, "NC4HW4"}, + {schema::Format_NHWC4, "NHWC4"}}; + std::string kernel_name = + "to_format_" + format_str[in_tensors_[0]->GetFormat()] + "_to_" 
+ format_str[out_tensors_[0]->GetFormat()]; + if (out_mem_type_ == OpenCLMemType::IMG) { + kernel_name += "_IMG"; + } else { + kernel_name += "_BUF"; + } + +#ifdef PROGRAM_WITH_IL + ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name); +#else + std::set build_options; +#ifdef ENABLE_FP16 + std::string source = to_format_source_fp16; +#else + std::string source = to_format_source_fp32; +#endif + ocl_runtime->LoadSource(program_name, source); + ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); +#endif + MS_LOG(DEBUG) << kernel_name << " Init Done!"; + return RET_OK; +} + +int ToFormatOpenCLKernel::ReSize() { return RET_OK; } + +int ToFormatOpenCLKernel::GetGlobalSize(size_t idx, std::vector *global_size) { + std::vector shapex = out_tensors_[0]->shape(); + if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4 || out_tensors_[0]->GetFormat() == schema::Format_NHWC) { + int h = shapex[1]; + int w = shapex[2]; + int c = shapex[3]; + int c4 = UP_DIV(c, C4NUM); + std::vector vec = {(size_t)h, (size_t)w, (size_t)c4}; + *global_size = std::move(vec); + } else if (out_tensors_[0]->GetFormat() == + schema::Format_NC4HW4) { + int h = shapex[2]; + int w = shapex[3]; + int c = shapex[1]; + int c4 = UP_DIV(c, C4NUM); + std::vector vec = {(size_t)c4, (size_t)h, (size_t)w}; + *global_size = std::move(vec); + } else if (out_tensors_[0]->GetFormat() == schema::Format_NCHW) { + int h = shapex[2]; + int w = shapex[3]; + int c = shapex[1]; + int w4 = UP_DIV(w, C4NUM); + std::vector vec = {(size_t)w4, (size_t)h, (size_t)c}; + *global_size = std::move(vec); + } + return RET_OK; +} +int ToFormatOpenCLKernel::GetLocalSize(size_t idx, const std::vector &global_size, + std::vector *local_size) { + return RET_OK; +} + +int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { + size_t im_dst_x, im_dst_y; + std::vector shapex = out_tensors_[0]->shape(); + if
(out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) { + int h = shapex[1]; + int w = shapex[2]; + int c = shapex[3]; + im_dst_y = UP_DIV(h * c, C4NUM); + im_dst_x = w; + } else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) { + int h = shapex[2]; + int w = shapex[3]; + int c = shapex[1]; + im_dst_x = UP_DIV(w * c, C4NUM); + im_dst_y = h; + } else { + MS_LOG(ERROR) << "Unsupported format. " << out_tensors_[0]->GetFormat(); + } + img_size->clear(); +#ifdef ENABLE_FP16 + size_t img_dtype = CL_HALF_FLOAT; +#else + size_t img_dtype = CL_FLOAT; +#endif + std::vector vec{im_dst_x, im_dst_y, img_dtype}; + *img_size = vec; + return RET_OK; +} + +int ToFormatOpenCLKernel::Run() { + MS_LOG(DEBUG) << "ToFormat" << " Running!"; + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + std::vector local = {}; + std::vector global; + GetGlobalSize(0, &global); + auto shapex = in_tensors_[0]->shape(); + cl_int4 shape{shapex.size() > 0 ? shapex[0] : 1, shapex.size() > 1 ? shapex[1] : 1, shapex.size() > 2 ? shapex[2] : 1, + shapex.size() > 3 ? 
shapex[3] : 1}; + cl_int4 gsize{(cl_int)global[0], (cl_int)global[1], (cl_int)global[2], 1}; + ocl_runtime->SetKernelArg(kernel_, 0, in_tensors_[0]->Data()); + ocl_runtime->SetKernelArg(kernel_, 1, out_tensors_[0]->Data()); + ocl_runtime->SetKernelArg(kernel_, 2, gsize); + ocl_runtime->SetKernelArg(kernel_, 3, shape); + ocl_runtime->RunKernel(kernel_, global, local, nullptr); + return RET_OK; +} + +kernel::LiteKernel *OpenCLToFormatKernelCreator(const std::vector &inputs, + const std::vector &outputs, + OpParameter *opParameter, const lite::Context *ctx, + const kernel::KernelKey &desc, const lite::Primitive *primitive) { + auto *kernel = new (std::nothrow) ToFormatOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); + if (kernel == nullptr) { + MS_LOG(ERROR) << "kernel " << opParameter->name_ << " create failed."; + return nullptr; + } + auto ret = kernel->Init(); + if (ret != RET_OK) { + delete kernel; + return nullptr; + } + return kernel; +} + +REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_ToFormat, OpenCLToFormatKernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h new file mode 100644 index 0000000000..09d31b7454 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h @@ -0,0 +1,46 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_SRC_BACKEND_OPENCL_TO_FORMAT_H_ +#define MINDSPORE_LITE_SRC_BACKEND_OPENCL_TO_FORMAT_H_ + +#include + +#include "src/lite_kernel.h" +#include "src/runtime/opencl/opencl_runtime.h" +#include "src/runtime/kernel/opencl/opencl_kernel.h" + +namespace mindspore::kernel { +class ToFormatOpenCLKernel : public OpenCLKernel { + public: + explicit ToFormatOpenCLKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs) + : OpenCLKernel(parameter, inputs, outputs) {} + ~ToFormatOpenCLKernel() override{}; + + int Init() override; + int ReSize() override; + int Run() override; + int GetImageSize(size_t idx, std::vector *img_size) override; + int GetGlobalSize(size_t idx, std::vector *global_size) override; + int GetLocalSize(size_t idx, const std::vector &global_size, std::vector *local_size) override; + + private: + cl::Kernel kernel_; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_BACKEND_OPENCL_RESHAPE_H_ diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc index b0052e0f5b..c76920c47f 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc @@ -109,7 +109,8 @@ kernel::LiteKernel *OpenCLTransposeKernelCreator(const std::vector &outputs, OpParameter *opParameter, const lite::Context *ctx, const kernel::KernelKey &desc, const lite::Primitive *primitive) { - auto *kernel = new (std::nothrow)TransposeOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); + auto *kernel = + new (std::nothrow) TransposeOpenCLKernel(reinterpret_cast(opParameter), inputs, outputs); if (kernel == nullptr) { MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; return nullptr; diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h index 10dabfea71..d1facd0bc1 
100644 --- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h @@ -21,21 +21,37 @@ #include "src/lite_kernel.h" namespace mindspore::kernel { + +enum class OpenCLMemType { BUF, IMG }; + +struct OpenCLToFormatParameter { + OpParameter op_parameter; + schema::Format src_format{schema::Format_NHWC}; + schema::Format dst_format{schema::Format_NHWC4}; + OpenCLMemType out_mem_type{OpenCLMemType::IMG}; +}; + class OpenCLKernel : public LiteKernel { public: explicit OpenCLKernel(OpParameter *parameter, const std::vector &inputs, const std::vector &outputs) - : LiteKernel(parameter, inputs, outputs, nullptr, nullptr) {} + : LiteKernel(parameter, inputs, outputs, nullptr, nullptr) {} virtual int Init() { return -1; } virtual int Prepare() { return -1; } virtual int InferShape() { return -1; } virtual int ReSize() { return -1; } virtual int Run() { return -1; } - virtual int GetImageSize(size_t idx, std::vector* img_size) { return -1; } - virtual int GetGlobalSize(size_t idx, std::vector* global_size) { return -1; } - virtual int GetLocalSize(size_t idx, const std::vector& global_size, - std::vector* local_size) { return -1; } + virtual int GetImageSize(size_t idx, std::vector *img_size) { return -1; } + virtual int GetGlobalSize(size_t idx, std::vector *global_size) { return -1; } + virtual int GetLocalSize(size_t idx, const std::vector &global_size, std::vector *local_size) { + return -1; + } + OpenCLMemType GetMemType() { return out_mem_type_; } + void SetMemType(OpenCLMemType mem_type) { out_mem_type_ = mem_type; } + + protected: + OpenCLMemType out_mem_type_{OpenCLMemType::IMG}; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc index 7c495cf68f..05514815d7 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc +++ 
b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc @@ -17,22 +17,117 @@ #include "src/runtime/kernel/opencl/subgraph_opencl_kernel.h" #include "src/runtime/opencl/opencl_executor.h" #include "src/runtime/opencl/opencl_runtime.h" +#include "src/runtime/kernel/opencl/utils.h" +#include "include/errorcode.h" +#include "src/common/utils.h" namespace mindspore::kernel { SubGraphOpenCLKernel::~SubGraphOpenCLKernel() { UnInit(); } +int SubGraphOpenCLKernel::GenToFormatOp(const std::vector &in_tensors, + const std::vector in_kernels, + std::vector *out_tensors, + std::vector *out_parameters, + std::vector *out_convert_ops, OpenCLMemType mem_type) { + out_tensors->clear(); + out_parameters->clear(); + out_convert_ops->clear(); + for (size_t i = 0; i < in_tensors.size(); ++i) { + lite::tensor::Tensor *new_tensor = new (std::nothrow) lite::tensor::Tensor(); + MS_ASSERT(new_tensor); + if (new_tensor == nullptr) { + MS_LOG(ERROR) << "SubGraphOpenCLKernel new tensor failed!"; + return RET_ERROR; + } + new_tensor->CopyTensor(*in_tensors[i]); + auto dst_format = + (mem_type == OpenCLMemType::IMG) ? in_kernels.back()->out_tensors()[0]->GetFormat() : in_tensors[i]->GetFormat(); + auto src_format = + (mem_type == OpenCLMemType::IMG) ? 
in_tensors[i]->GetFormat() : in_kernels.front()->out_tensors()[0]->GetFormat(); + if ((dst_format == schema::Format_NCHW || dst_format == schema::Format_NC4HW4) && + (src_format == schema::Format_NHWC || src_format == schema::Format_NHWC4)) { + auto &shape = new_tensor->shape(); + std::vector dst_shape{shape[0], shape[3], shape[1], shape[2]}; + new_tensor->set_shape(dst_shape); + } + if ((dst_format == schema::Format_NHWC || dst_format == schema::Format_NHWC4) && + (src_format == schema::Format_NCHW || src_format == schema::Format_NC4HW4)) { + auto &shape = new_tensor->shape(); + std::vector dst_shape{shape[0], shape[2], shape[3], shape[1]}; + new_tensor->set_shape(dst_shape); + } + new_tensor->SetFormat(dst_format); + out_tensors->emplace_back(new_tensor); +#ifdef ENABLE_FP16 + KernelKey desc{kGPU, kNumberTypeFloat16, schema::PrimitiveType_ToFormat}; +#else + KernelKey desc{kGPU, kNumberTypeFloat32, schema::PrimitiveType_ToFormat}; +#endif + OpenCLToFormatParameter *parameter = new (std::nothrow) OpenCLToFormatParameter; + MS_ASSERT(parameter); + if (parameter == nullptr) { + MS_LOG(ERROR) << "SubGraphOpenCLKernel new parameter failed!"; + return RET_ERROR; + } + parameter->src_format = src_format; + parameter->dst_format = dst_format; + parameter->out_mem_type = mem_type; + out_parameters->emplace_back(parameter); + LiteKernel *in_convert_op; + if (mem_type == OpenCLMemType::IMG) { + in_convert_op = + lite::GetOpenCLKernel({in_tensors[i]}, {new_tensor}, reinterpret_cast(parameter), nullptr, desc); + } else { + in_convert_op = + lite::GetOpenCLKernel({new_tensor}, {in_tensors[i]}, reinterpret_cast(parameter), nullptr, desc); + } + MS_ASSERT(in_convert_op); + if (in_convert_op == nullptr) { + MS_LOG(ERROR) << "SubGraphOpenCLKernel create op failed!"; + return RET_ERROR; + } + auto in_opencl_op = reinterpret_cast(in_convert_op); + if (mem_type == OpenCLMemType::IMG) { + in_opencl_op->AddOutKernel(in_kernels[i]); +
reinterpret_cast(in_kernels[i])->SetInKernel({in_convert_op}); + reinterpret_cast(in_kernels[i])->set_in_tensors({new_tensor}); + } else { + reinterpret_cast(in_kernels[i])->SetOutKernel({in_convert_op}); + reinterpret_cast(in_kernels[i])->set_out_tensors({new_tensor}); + in_convert_op->AddInKernel(in_kernels[i]); + } + out_convert_ops->emplace_back(in_convert_op); + } + return RET_OK; +} + int SubGraphOpenCLKernel::Init() { allocator_ = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator(); - MS_LOG(DEBUG) << "input num=" << inputs_.size() << ", output num=" << outputs_.size(); - for (const auto tensor : inputs_) { + MS_LOG(DEBUG) << "input num=" << in_tensors_.size() << ", output num=" << out_tensors_.size(); + for (const auto tensor : in_tensors_) { tensor->set_allocator(allocator_); } - for (const auto tensor : outputs_) { + for (const auto tensor : out_tensors_) { tensor->set_allocator(allocator_); } + int ret = GenToFormatOp(in_tensors_, in_kernels_, &in_convert_tensors_, &in_parameters_, &in_convert_ops_, + OpenCLMemType::IMG); + if (ret != RET_OK) { + return RET_ERROR; + } + nodes_.insert(nodes_.begin(), in_convert_ops_.begin(), in_convert_ops_.end()); + ret = GenToFormatOp(out_tensors_, out_kernels_, &out_convert_tensors_, &out_parameters_, &out_convert_ops_, + OpenCLMemType::BUF); + if (ret != RET_OK) { + return RET_ERROR; + } + nodes_.insert(nodes_.end(), out_convert_ops_.begin(), out_convert_ops_.end()); + + MallocTensorWithReuse(); + // Map buffer for write, it is not necessary for fine-grained - for (auto &tensor : inputs_) { + for (auto &tensor : in_tensors_) { void *data = tensor->Data(); // It is required with coarse-grained SVM if (data != nullptr) { @@ -42,44 +137,118 @@ int SubGraphOpenCLKernel::Init() { MS_LOG(ERROR) << "SubGraphOpenCLKernel input nullptr!"; } } - return 0; + return RET_OK; +} +int SubGraphOpenCLKernel::MallocTensorWithReuse() { + kernel::LiteKernelUtil::InitTensorRefCount(nodes_); + for (auto *kernel : nodes_) { + 
MS_ASSERT(nullptr != kernel); + kernel::OpenCLKernel *op_kernel = reinterpret_cast(kernel); + auto &outputs = kernel->out_tensors(); + for (auto i = 0; i < outputs.size(); ++i) { + auto *output = outputs.at(i); + MS_ASSERT(nullptr != output); + if (op_kernel->GetMemType() == OpenCLMemType::IMG) { + std::vector img_size; + op_kernel->GetImageSize(i, &img_size); + auto data_ptr = allocator_->Malloc(output->Size(), img_size); + output->SetData(data_ptr); + } else { + output->MallocData(allocator_); + } + output->set_allocator(allocator_); + } + for (auto input_kernel : kernel->in_kernels()) { + MS_EXCEPTION_IF_NULL(input_kernel); + auto ret = input_kernel->DecOutTensorRefCount(); + if (0 != ret) { + MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed"; + } + } + } + for (auto kernel : out_kernels_) { + MS_EXCEPTION_IF_NULL(kernel); + auto ret = kernel->DecOutTensorRefCount(); + if (0 != ret) { + MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed"; + } + } + for (auto kernel : in_convert_ops_) { + MS_EXCEPTION_IF_NULL(kernel); + auto ret = kernel->DecOutTensorRefCount(); + if (0 != ret) { + MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed"; + } + } + for (auto kernel : out_convert_ops_) { + MS_EXCEPTION_IF_NULL(kernel); + auto ret = kernel->DecOutTensorRefCount(); + if (0 != ret) { + MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed"; + } + } + return RET_OK; } int SubGraphOpenCLKernel::UnInit() { - for (auto &tensor : outputs_) { + for (auto &tensor : out_tensors_) { + allocator_->UnmapBuffer(tensor->Data()); + } + for (const auto tensor : in_tensors_) { + if (tensor != nullptr) { + tensor->FreeData(); + } + } + for (const auto tensor : out_tensors_) { + if (tensor != nullptr) { + tensor->FreeData(); + } + } + for (auto &tensor : out_tensors_) { allocator_->UnmapBuffer(tensor->Data()); } - for (const auto tensor : inputs_) { + for (const 
auto tensor : in_convert_tensors_) { if (tensor != nullptr) { tensor->FreeData(); + delete tensor; } } - for (const auto tensor : outputs_) { + for (const auto tensor : out_convert_tensors_) { if (tensor != nullptr) { tensor->FreeData(); + delete tensor; } } - return 0; + for (const auto parameter : in_parameters_) { + if (parameter != nullptr) { + delete parameter; + } + } + for (const auto op : in_convert_ops_) { + if (op != nullptr) { + delete op; + } + } + return RET_OK; } -int SubGraphOpenCLKernel::InferShape() { return 0; } +int SubGraphOpenCLKernel::InferShape() { return RET_OK; } -int SubGraphOpenCLKernel::ReSize() { return 0; } +int SubGraphOpenCLKernel::ReSize() { return RET_OK; } int SubGraphOpenCLKernel::Run() { - auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); - for (auto &tensor : inputs_) { + for (auto &tensor : in_tensors_) { allocator_->UnmapBuffer(tensor->Data()); } lite::opencl::OpenCLExecutor executor; - executor.Run(inputs_, outputs_, nodes_, allocator_); - ocl_runtime->SyncCommandQueue(); - for (auto &tensor : outputs_) { + executor.Run(in_tensors_, out_tensors_, nodes_, allocator_); + + for (auto &tensor : out_tensors_) { void *data = allocator_->MapBuffer(tensor->Data(), CL_MAP_READ, nullptr, true); tensor->SetData(data); } - return 0; + return RET_OK; } } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h index d965c2e7bd..cd926f0ab9 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h +++ b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h @@ -36,7 +36,7 @@ class SubGraphOpenCLKernel : public SubGraphKernel { const std::vector inKernels, const std::vector outKernels, const std::vector nodes) - : SubGraphKernel(inputs, outputs, inKernels, outKernels, nodes, nullptr, nullptr) {} + : SubGraphKernel(inputs, outputs, inKernels, outKernels, nodes, nullptr, nullptr) 
{} ~SubGraphOpenCLKernel() override; int Init() override; @@ -45,11 +45,24 @@ class SubGraphOpenCLKernel : public SubGraphKernel { int Run() override; int UnInit(); + protected: + int MallocTensorWithReuse(); + int GenToFormatOp(const std::vector &in_tensors, + const std::vector in_kernels, + std::vector *out_tensors, + std::vector *out_parameters, std::vector *out_convert_ops, + OpenCLMemType mem_type); + private: SubGraphOpenCLParameter *subgraph_ocl_parameter_; lite::opencl::OpenCLAllocator *allocator_; + std::vector in_convert_tensors_; + std::vector out_convert_tensors_; + std::vector in_parameters_; + std::vector out_parameters_; + std::vector in_convert_ops_; + std::vector out_convert_ops_; }; } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_BACKEND_OPENCL_SUBGRAPH_OPENCL_KERNEL_H_ - diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.cc b/mindspore/lite/src/runtime/kernel/opencl/utils.cc index aab9795734..41ed6df6cf 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/utils.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/utils.cc @@ -18,6 +18,22 @@ #include #include #include +#include "src/kernel_registry.h" + +using mindspore::lite::KernelRegistrar; + +namespace mindspore::lite { +kernel::LiteKernel *GetOpenCLKernel(const std::vector &in_tensors, + const std::vector &out_tensors, OpParameter *parameter, + const Context *ctx, const kernel::KernelKey &key) { + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + if (creator != nullptr) { + auto kernel = creator(in_tensors, out_tensors, parameter, nullptr, key, nullptr); + return kernel; + } + return nullptr; +} +} // namespace mindspore::lite namespace mindspore { namespace kernel { diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.h b/mindspore/lite/src/runtime/kernel/opencl/utils.h index f34135c29c..d1c0d130f3 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/utils.h +++ b/mindspore/lite/src/runtime/kernel/opencl/utils.h @@ -22,6 +22,13 @@ #include 
"CL/cl2.hpp" #include "utils/log_adapter.h" #include "src/runtime/kernel/arm/nnacl/op_base.h" +#include "src/lite_kernel.h" + +namespace mindspore::lite { +kernel::LiteKernel *GetOpenCLKernel(const std::vector &in_tensors, + const std::vector &out_tensors, OpParameter *parameter, + const Context *ctx, const kernel::KernelKey &key); +} namespace mindspore::kernel { diff --git a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc index ab11ff06ae..8c34e2737f 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc @@ -16,10 +16,10 @@ #include "src/runtime/opencl/opencl_allocator.h" #include -#include "utils/log_adapter.h" #include "src/runtime/opencl/opencl_runtime.h" -#include "include/errorcode.h" #include "src/runtime/kernel/opencl/utils.h" +#include "utils/log_adapter.h" +#include "include/errorcode.h" namespace mindspore::lite::opencl { @@ -61,7 +61,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector &img_size) } Lock(); auto iter = free_list_.lower_bound(size); - if (iter != free_list_.end() && (iter->second->size_ >= size) && (iter->second->size_ < (size << shift_factor_))) { + while (iter != free_list_.end() && (iter->second->size_ >= size) && (iter->second->size_ < (size << shift_factor_))) { auto mem_buf = iter->second; bool is_match{mem_buf->img_size.size() == img_size.size()}; for (int i = 0; i < img_size.size() && is_match; ++i) { @@ -75,6 +75,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector &img_size) << ", host addr: " << mem_buf->host_ptr_ << ", device addr: " << mem_buf->device_ptr_; return mem_buf->host_ptr_; } + ++iter; } void *host_ptr = nullptr; void *device_ptr = nullptr; @@ -136,7 +137,7 @@ void *OpenCLAllocator::CreateImageFromHost(void *data, size_t size, const std::v auto ocl_runtime = opencl::OpenCLRuntime::GetInstance(); Lock(); auto iter = free_list_.lower_bound(size); - if (iter != 
free_list_.end() && (iter->second->size_ >= size) && (iter->second->size_ < (size << shift_factor_))) { + while (iter != free_list_.end() && (iter->second->size_ >= size) && (iter->second->size_ < (size << shift_factor_))) { auto mem_buf = iter->second; bool is_match{mem_buf->img_size.size() == img_size.size()}; for (int i = 0; i < img_size.size() && is_match; ++i) { @@ -150,6 +151,7 @@ void *OpenCLAllocator::CreateImageFromHost(void *data, size_t size, const std::v << ", host addr: " << mem_buf->host_ptr_ << ", device addr: " << mem_buf->device_ptr_; return mem_buf->host_ptr_; } + ++iter; } void *host_ptr = nullptr; void *device_ptr = nullptr; @@ -198,10 +200,13 @@ void OpenCLAllocator::Free(void *buf) { allocated_list_.erase(iter); free_list_.insert(std::make_pair(mem_buf->size_, mem_buf)); UnLock(); + MS_LOG(DEBUG) << "Free a new Image2D. size: " << mem_buf->size_ << ", host addr: " << mem_buf->host_ptr_ + << ", device addr: " << mem_buf->device_ptr_ << ", image addr: " << mem_buf->image_ptr_; return; } UnLock(); free(buf); + MS_LOG(DEBUG) << "Free host ptr: " << buf; } size_t OpenCLAllocator::GetTotalSize() { diff --git a/mindspore/lite/src/runtime/opencl/opencl_executor.cc b/mindspore/lite/src/runtime/opencl/opencl_executor.cc index 2dc619afe2..38841a067b 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_executor.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_executor.cc @@ -24,41 +24,9 @@ namespace mindspore::lite::opencl { int OpenCLExecutor::Run(std::vector &inputs, std::vector &outputs, std::vector &kernels, Allocator *allocator, const session::KernelCallBack &before, const session::KernelCallBack &after) { - MS_ASSERT(nullptr != allocator); - for (auto &inTensor : inputs) { - if (inTensor == nullptr) { - MS_LOG(ERROR) << "Graph input tensor is nullptr"; - return RET_ERROR; - } - if (inTensor->GetFormat() != schema::Format_NHWC4 && inTensor->GetFormat() != schema::Format_NC4HW4 && - inTensor->GetFormat() != schema::Format_NHWC) { - MS_LOG(ERROR) << 
"input should be NHWC/NHWC4/NC4HW4, actual is " << schema::EnumNameFormat(inTensor->GetFormat()); - return RET_ERROR; - } else { - TransformTensorLayout(inTensor, inTensor->GetFormat(), schema::Format_NHWC4, true); - // TransformTensorLayout(inTensor, inTensor->GetFormat(), schema::Format_NC4HW4, true); - } - } kernel::LiteKernelUtil::InitTensorRefCount(kernels); - OpenCLAllocator* op_allocator = reinterpret_cast(allocator); for (auto *kernel : kernels) { MS_ASSERT(nullptr != kernel); - kernel::OpenCLKernel *op_kernel = reinterpret_cast(kernel); - auto &outputs = kernel->out_tensors(); - for (auto i = 0; i < outputs.size(); ++i) { - auto *output = outputs.at(i); - MS_ASSERT(nullptr != output); - if (is_image2d_out_) { - std::vector img_size; - op_kernel->GetImageSize(i, &img_size); - auto data_ptr = op_allocator->Malloc(output->Size(), img_size); - - output->SetData(data_ptr); - } else { - output->MallocData(allocator); - } - output->set_allocator(allocator); - } session::CallBackParam callbackParam; callbackParam.name_callback_param = kernel->name(); @@ -67,6 +35,21 @@ int OpenCLExecutor::Run(std::vector &inputs, std::vectorname(); } } + kernel::OpenCLKernel *op_kernel = reinterpret_cast(kernel); + auto &cur_outputs = kernel->out_tensors(); + for (auto i = 0; i < cur_outputs.size(); ++i) { + auto *output = cur_outputs.at(i); + MS_ASSERT(nullptr != output); + if (op_kernel->GetMemType() == kernel::OpenCLMemType::IMG) { + std::vector img_size; + op_kernel->GetImageSize(i, &img_size); + auto data_ptr = allocator_->Malloc(output->Size(), img_size); + output->SetData(data_ptr); + } else { + output->MallocData(allocator_); + } + } + auto ret = kernel->Run(); if (0 != ret) { MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name(); @@ -86,21 +69,11 @@ int OpenCLExecutor::Run(std::vector &inputs, std::vectorGetFormat() != schema::Format_NHWC) { - TransformTensorLayout(outTensor, outTensor->GetFormat(), schema::Format_NHWC, false); - } - } return RET_OK; } -int 
OpenCLExecutor::TransformTensorLayout(tensor::Tensor *tensor, schema::Format src_format, - schema::Format dst_format, bool trans_dir) { +int OpenCLExecutor::TransformTensorLayout(tensor::Tensor *tensor, schema::Format src_format, schema::Format dst_format, + bool trans_dir) { MS_ASSERT(nullptr != tensor); MS_ASSERT(4 == tensor->shape().size()); auto data_type = tensor->data_type(); @@ -114,11 +87,10 @@ int OpenCLExecutor::TransformTensorLayout(tensor::Tensor *tensor, schema::Format << schema::EnumNameFormat(dst_format); return RET_ERROR; } - return RET_OK; } int OpenCLExecutor::TransformTensorLayoutFp32(tensor::Tensor *tensor, schema::Format src_format, - schema::Format dst_format, bool trans_dir) { + schema::Format dst_format, bool trans_dir) { MS_ASSERT(nullptr != tensor); MS_ASSERT(nullptr != allocator_); MS_ASSERT(4 == tensor->shape().size()); @@ -138,11 +110,11 @@ int OpenCLExecutor::TransformTensorLayoutFp32(tensor::Tensor *tensor, schema::Fo } int OpenCLExecutor::TransformTensorLayoutToBuffer(tensor::Tensor *tensor, schema::Format src_format, - schema::Format dst_format) { + schema::Format dst_format) { if (dst_format == schema::Format_NHWC4) { auto *src_data = tensor->Data(); size_t C4 = UP_DIV(tensor->Channel(), C4NUM); - std::vector img_size{tensor->Width() * C4, (size_t) tensor->Height(), CL_FLOAT}; + std::vector img_size{tensor->Width() * C4, (size_t)tensor->Height(), CL_FLOAT}; if (src_format == schema::Format_NHWC) { auto *dst_data = allocator_->Malloc(tensor->Size(), img_size); if (dst_data == nullptr) { @@ -168,7 +140,7 @@ int OpenCLExecutor::TransformTensorLayoutToBuffer(tensor::Tensor *tensor, schema } int OpenCLExecutor::TransformTensorLayoutToImage(tensor::Tensor *tensor, schema::Format src_format, - schema::Format dst_format) { + schema::Format dst_format) { if (dst_format == schema::Format_NHWC4) { tensor->SetFormat(schema::Format_NHWC4); // convert to nhwc4 @@ -202,15 +174,15 @@ int OpenCLExecutor::TransformTensorLayoutToImage(tensor::Tensor 
*tensor, schema: } int OpenCLExecutor::TransformTensorLayoutFromImage(tensor::Tensor *tensor, schema::Format src_format, - schema::Format dst_format) { + schema::Format dst_format) { if (dst_format == schema::Format_NHWC) { auto src_data = tensor->Data(); auto dst_data = allocator_->Malloc(tensor->Size()); cl::Image2D *out_mem = reinterpret_cast(allocator_->GetImage(src_data)); std::vector img_size; allocator_->GetImageSize(src_data, &img_size); - auto origin = cl::array < cl::size_type, 3U > {0, 0, 0}; - auto region = cl::array < cl::size_type, 3U > {img_size[0], img_size[1], 1}; + auto origin = cl::array{0, 0, 0}; + auto region = cl::array{img_size[0], img_size[1], 1}; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->GetDefaultCommandQueue()->enqueueReadImage(*out_mem, CL_TRUE, origin, region, 0, 0, dst_data); tensor->SetData(dst_data); @@ -224,7 +196,7 @@ int OpenCLExecutor::TransformTensorLayoutFromImage(tensor::Tensor *tensor, schem } int OpenCLExecutor::TransformTensorLayoutUint8(tensor::Tensor *tensor, schema::Format src_format, - schema::Format dst_format, bool is_image) { + schema::Format dst_format, bool is_image) { MS_ASSERT(nullptr != tensor); MS_ASSERT(4 == tensor->shape().size()); // auto src_format = tensor->GetFormat(); @@ -234,4 +206,3 @@ int OpenCLExecutor::TransformTensorLayoutUint8(tensor::Tensor *tensor, schema::F return RET_ERROR; } } // namespace mindspore::lite::opencl - diff --git a/mindspore/lite/test/CMakeLists.txt b/mindspore/lite/test/CMakeLists.txt index eaae765d67..b6b961868d 100644 --- a/mindspore/lite/test/CMakeLists.txt +++ b/mindspore/lite/test/CMakeLists.txt @@ -148,6 +148,7 @@ if (SUPPORT_GPU) ${LITE_DIR}/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc ${LITE_DIR}/src/runtime/kernel/opencl/kernel/transpose.cc ${LITE_DIR}/src/runtime/kernel/opencl/kernel/reshape.cc + ${LITE_DIR}/src/runtime/kernel/opencl/kernel/to_format.cc ) endif() ### minddata lite @@ -323,6 +324,8 @@ if (SUPPORT_GPU) 
${TEST_DIR}/ut/src/runtime/kernel/opencl/transpose_tests.cc ${TEST_DIR}/ut/src/runtime/kernel/opencl/convolution_tests.cc ${TEST_DIR}/ut/src/runtime/kernel/opencl/activation_tests.cc + #${TEST_DIR}/ut/src/runtime/kernel/opencl/leakyrelu_tests.cc + ${TEST_DIR}/ut/src/runtime/kernel/opencl/to_format_tests.cc ) endif() diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/to_format_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/to_format_tests.cc new file mode 100644 index 0000000000..475183bb7f --- /dev/null +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/to_format_tests.cc @@ -0,0 +1,79 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include "mindspore/core/utils/log_adapter.h" +#include "common/common_test.h" +#include "mindspore/lite/src/common/file_utils.h" +#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" +#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" +#include "mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h" + +namespace mindspore { +class TestToFormatOpenCL : public mindspore::CommonTest { + public: + TestToFormatOpenCL() {} +}; + +TEST_F(TestToFormatOpenCL, TransposeFp32) { + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + ocl_runtime->Init(); + auto allocator = ocl_runtime->GetAllocator(); + int h = 64; + int w = 1; + int c = 7360; + size_t input_size; + std::string input_path = "./test_data/transpose/transpose_fp32_input.bin"; + auto input_data = reinterpret_cast(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); + + lite::tensor::Tensor *tensor_x = + new lite::tensor::Tensor(TypeId(kNumberTypeFloat32), {1, h, w, c}, schema::Format_NHWC4); + + lite::tensor::Tensor *tensor_out = new lite::tensor::Tensor(TypeId(kNumberTypeFloat32), {1, c, h, w}); + std::vector inputs{tensor_x}; + std::vector outputs{tensor_out}; + auto *arith_kernel = new kernel::TransposeOpenCLKernel(nullptr, inputs, outputs); + arith_kernel->Init(); + + inputs[0]->MallocData(allocator); + + std::vector kernels{arith_kernel}; + auto *pGraph = new kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); + pGraph->Init(); + memcpy(inputs[0]->Data(), input_data, input_size); + pGraph->Run(); + + size_t output_size; + std::string output_path = "./test_data/transpose/transpose_fp32_output.bin"; + auto correct_data = reinterpret_cast(mindspore::lite::ReadFile(output_path.c_str(), &output_size)); + printf("==================output data=================\n"); + float *output_data = reinterpret_cast(tensor_out->Data()); + std::cout << std::endl; + int size_n = h * w * c; + size_n = size_n > 100 ? 
100 : size_n; + for (int i = 0; i < size_n; i++) { + std::cout << output_data[i] << " "; + if ((i + 1) % c == 0) { + std::cout << std::endl; + } + } + std::cout << std::endl; + + // compare + CompareOutputData(output_data, correct_data, h * w * c, 0.00001); + MS_LOG(INFO) << "TestMatMulFp32 passed"; +} +} // namespace mindspore