Merge pull request !4524 from wandongdong/uptags/v0.7.0-beta
| @@ -192,7 +192,8 @@ union PrimitiveType { | |||
| SpaceToBatchND, | |||
| TopKV2, | |||
| Return, | |||
| MakeTuple | |||
| MakeTuple, | |||
| ToFormat, | |||
| } | |||
| enum QuantType: int { | |||
| @@ -881,5 +881,10 @@ table TopKV2 { | |||
| table MakeTuple { | |||
| } | |||
// Op that converts a tensor's on-device representation (layout / element type).
table ToFormat {
    srcT: int;  // source id — by the schema's srcT/dstT convention presumably a type/format enum value; TODO confirm
    dstT: int;  // destination id — same convention as srcT
}
| table Return { | |||
| } | |||
| @@ -131,6 +131,10 @@ class LiteKernel { | |||
| void AddOutKernel(LiteKernel *kernel) { this->out_kernels_.emplace_back(kernel); } | |||
| void SetInKernel(const std::vector<LiteKernel *> &kernel) { this->in_kernels_ = kernel; } | |||
| void SetOutKernel(const std::vector<LiteKernel *> &kernel) { this->out_kernels_ = kernel; } | |||
| std::vector<LiteKernel *> &in_kernels() { return this->in_kernels_; } | |||
| std::vector<LiteKernel *> &out_kernels() { return this->out_kernels_; } | |||
| @@ -167,16 +171,14 @@ class SubGraphKernel : public LiteKernel { | |||
| public: | |||
| explicit SubGraphKernel(const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, | |||
| const std::vector<kernel::LiteKernel *> &inKernels, | |||
| const std::vector<kernel::LiteKernel *> &outKernels, | |||
| const std::vector<kernel::LiteKernel *> &in_kernels, | |||
| const std::vector<kernel::LiteKernel *> &out_kernels, | |||
| const std::vector<kernel::LiteKernel *> &nodes, const lite::Context *ctx, | |||
| const lite::Primitive *primitive) | |||
| : LiteKernel(nullptr, inputs, outputs, ctx, primitive), | |||
| inputs_(inputs), | |||
| outputs_(outputs), | |||
| inkernels_(inKernels), | |||
| outkernels_(outKernels), | |||
| nodes_(nodes) {} | |||
| : LiteKernel(nullptr, inputs, outputs, ctx, primitive), nodes_(nodes) { | |||
| in_kernels_ = in_kernels; | |||
| out_kernels_ = out_kernels; | |||
| } | |||
| virtual int Init() { return -1; } | |||
| virtual int InferShape() { return -1; } | |||
| @@ -184,10 +186,6 @@ class SubGraphKernel : public LiteKernel { | |||
| virtual int Run() { return -1; } | |||
| protected: | |||
| std::vector<lite::tensor::Tensor *> inputs_; | |||
| std::vector<lite::tensor::Tensor *> outputs_; | |||
| std::vector<LiteKernel *> inkernels_; | |||
| std::vector<LiteKernel *> outkernels_; | |||
| std::vector<LiteKernel *> nodes_; | |||
| }; | |||
| @@ -49,9 +49,6 @@ __kernel void ElementDiv(__read_only image2d_t input_a, __read_only image2d_t in | |||
| float4 a = read_imagef(input_a, smp_none, (int2)(X, Y)); | |||
| float4 b = read_imagef(input_b, smp_none, (int2)(X, Y)); | |||
| if (b == 0) { | |||
| return; | |||
| } | |||
| write_imagef(output, (int2)(X, Y), a / b); | |||
| } | |||
| @@ -0,0 +1,235 @@ | |||
| #define FLT float | |||
| #define FLT4 float4 | |||
| #define READ_IMAGE read_imagef | |||
| #define WRITE_IMAGE write_imagef | |||
| // enum Format { | |||
| // Format_NCHW = 0, | |||
| // Format_NHWC = 1, | |||
| // Format_NHWC4 = 2, | |||
| // Format_HWKC = 3, | |||
| // Format_HWCK = 4, | |||
| // Format_KCHW = 5, | |||
| // Format_CKHW = 6, | |||
| // Format_KHWC = 7, | |||
| // Format_CHWK = 8, | |||
| // Format_NC4HW4 = 100, | |||
| // Format_NUM_OF_FORMAT = 101, | |||
| // Format_MIN = Format_NCHW, | |||
| // Format_MAX = Format_NUM_OF_FORMAT | |||
| //}; | |||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | |||
// NCHW buffer -> NHWC4 image. NOTE(review): stub — only the bounds check runs;
// the copy is commented out, so this conversion is effectively unimplemented.
__kernel void to_format_NCHW_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
                                          int4 shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
// NHWC buffer -> NHWC4 image. One work-item packs one FLT4 pixel:
// X indexes height, Y indexes width, Z indexes a 4-channel slice.
// Channels past shape.w are zero-padded in the last slice.
__kernel void to_format_NHWC_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
                                          int4 shape) {
  const int X = get_global_id(0);
  const int Y = get_global_id(1);
  const int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  // Flat offset of the first channel of this slice in the NHWC buffer.
  const int base = (X * shape.z + Y) * shape.w + Z * 4;
  __global FLT *src = (__global FLT *)src_data + base;
  const int remain = shape.w - Z * 4;  // valid channels left in this slice
  FLT4 pixel = (FLT4)(0.f);
  if (remain >= 4) {
    pixel = ((__global FLT4 *)src)[0];  // full slice: single vector load
  } else {
    if (remain >= 1) {
      pixel.x = src[0];
    }
    if (remain >= 2) {
      pixel.y = src[1];
    }
    if (remain >= 3) {
      pixel.z = src[2];
    }
  }
  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), pixel);
}
// NHWC4 buffer -> NHWC4 image (same layout). NOTE(review): stub — copy is
// commented out; only the bounds check runs.
__kernel void to_format_NHWC4_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
                                           int4 shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
// NC4HW4 buffer -> NHWC4 image. NOTE(review): stub — conversion unimplemented.
__kernel void to_format_NC4HW4_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
                                            int4 shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
// NCHW buffer -> NC4HW4 image. NOTE(review): stub — conversion unimplemented.
__kernel void to_format_NCHW_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
                                           int4 shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
// NHWC buffer -> NC4HW4 image. NOTE(review): stub — conversion unimplemented.
__kernel void to_format_NHWC_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
                                           int4 shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
// NHWC4 buffer -> NC4HW4 image. NOTE(review): stub — conversion unimplemented.
__kernel void to_format_NHWC4_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
                                            int4 shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
// NC4HW4 buffer -> image: straight copy, one FLT4 per work-item.
// Image coordinate is (Y * size.z + Z, X); the buffer is indexed with the
// same linear offset scaled by size.x.
__kernel void to_format_NC4HW4_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
                                             int4 shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  // FLT4 src_final = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))];
  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), src_data[(Y * size.z + Z) * size.x + X]);
}
// NCHW image -> NCHW buffer: straight copy, one FLT4 per work-item.
// Reads image at (Y * size.x + X, Z) and stores linearly at
// (Z * size.y + Y) * size.x + X.
__kernel void to_format_NCHW_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
                                         int4 shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  dst_data[(Z * size.y + Y) * size.x + X] = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.x + X, Z));
}
// NHWC image -> NCHW buffer. NOTE(review): stub — conversion unimplemented.
__kernel void to_format_NHWC_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
                                         int4 shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
// NHWC4 image -> NCHW buffer. NOTE(review): stub — conversion unimplemented.
__kernel void to_format_NHWC4_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
                                          int4 shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
// NC4HW4 image -> NCHW buffer. NOTE(review): stub — conversion unimplemented.
__kernel void to_format_NC4HW4_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
                                           int4 shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
// NCHW image -> NHWC buffer. NOTE(review): stub — conversion unimplemented.
__kernel void to_format_NCHW_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
                                         int4 shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
// NHWC image -> NHWC buffer. NOTE(review): stub — conversion unimplemented.
__kernel void to_format_NHWC_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
                                         int4 shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
// NHWC4 image -> NHWC buffer. Each work-item unpacks one FLT4 pixel into up
// to four scalar channels; the zero-padded tail of the last slice is dropped.
__kernel void to_format_NHWC4_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
                                          int4 shape) {
  const int X = get_global_id(0);
  const int Y = get_global_id(1);
  const int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  const FLT4 pixel = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X));
  // Flat offset of the first channel of this slice in the NHWC buffer.
  const int base = (X * shape.z + Y) * shape.w + Z * 4;
  __global FLT *dst = (__global FLT *)dst_data + base;
  const int remain = shape.w - Z * 4;  // valid channels in this slice
  if (remain >= 4) {
    ((__global FLT4 *)dst)[0] = pixel;  // full slice: single vector store
  } else {
    if (remain >= 1) {
      dst[0] = pixel.x;
    }
    if (remain >= 2) {
      dst[1] = pixel.y;
    }
    if (remain >= 3) {
      dst[2] = pixel.z;
    }
  }
}
// NC4HW4 image -> NHWC buffer. NOTE(review): stub — conversion unimplemented.
// FIX: kernel was named "to_format_NC4HW4_to_to_NHWC_BUF" (doubled "_to_").
// The host composes names as "to_format_<src>_to_<dst>_BUF", so the old name
// could never be looked up; renamed to match the host-side lookup.
__kernel void to_format_NC4HW4_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
                                           int4 shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
// NC4HW4 image -> buffer: straight copy, one FLT4 per work-item, using the
// same (Y * size.z + Z, X) coordinate on both sides.
__kernel void to_format_NC4HW4_to_NC4HW4_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
                                             int4 shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  dst_data[(Y * size.z + Z) * size.x + X] = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X));
}
// NHWC4 image -> buffer: straight copy, one FLT4 per work-item, using the
// same (Y * size.z + Z, X) coordinate on both sides.
__kernel void to_format_NHWC4_to_NHWC4_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
                                           int4 shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  dst_data[(Y * size.z + Z) * size.x + X] = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X));
}
| @@ -329,7 +329,8 @@ kernel::LiteKernel *OpenCLConvolutionKernelCreator(const std::vector<lite::tenso | |||
| const std::vector<lite::tensor::Tensor *> &outputs, | |||
| OpParameter *opParameter, const lite::Context *ctx, | |||
| const kernel::KernelKey &desc, const lite::Primitive *primitive) { | |||
| auto *kernel = new ConvolutionOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||
| auto *kernel = | |||
| new (std::nothrow) ConvolutionOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "Create OpenCL Convolution kernel failed!"; | |||
| return nullptr; | |||
| @@ -145,7 +145,7 @@ kernel::LiteKernel *OpenCLPooling2dKernelCreator(const std::vector<lite::tensor: | |||
| const std::vector<lite::tensor::Tensor *> &outputs, | |||
| OpParameter *opParameter, const lite::Context *ctx, | |||
| const kernel::KernelKey &desc, const lite::Primitive *primitive) { | |||
| auto *kernel = new (std::nothrow)PoolingOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||
| auto *kernel = new (std::nothrow) PoolingOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "Create OpenCL Pooling kernel failed!"; | |||
| return nullptr; | |||
| @@ -158,7 +158,7 @@ kernel::LiteKernel *OpenCLSoftMaxKernelCreator(const std::vector<lite::tensor::T | |||
| const std::vector<lite::tensor::Tensor *> &outputs, | |||
| OpParameter *opParameter, const lite::Context *ctx, | |||
| const kernel::KernelKey &desc, const lite::Primitive *primitive) { | |||
| auto *kernel = new (std::nothrow)SoftmaxOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||
| auto *kernel = new (std::nothrow) SoftmaxOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; | |||
| return nullptr; | |||
| @@ -0,0 +1,167 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/runtime/kernel/opencl/kernel/to_format.h" | |||
| #include <set> | |||
| #include <map> | |||
| #include <string> | |||
| #include <utility> | |||
| #include "include/errorcode.h" | |||
| #include "src/kernel_registry.h" | |||
| #include "src/runtime/opencl/opencl_runtime.h" | |||
| #include "src/runtime/kernel/opencl/cl/fp32/to_format.cl.inc" | |||
| using mindspore::kernel::KERNEL_ARCH::kGPU; | |||
| using mindspore::lite::KernelRegistrar; | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_OK; | |||
| using mindspore::schema::PrimitiveType_ToFormat; | |||
| namespace mindspore::kernel { | |||
// Reads the target memory type from the op parameter, derives the OpenCL
// kernel name from the input/output tensor formats
// ("to_format_<src>_to_<dst>_{IMG|BUF}"), then builds the program (or loads
// it from IL when PROGRAM_WITH_IL is defined).
int ToFormatOpenCLKernel::Init() {
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  auto parameter = reinterpret_cast<OpenCLToFormatParameter *>(op_parameter_);
  out_mem_type_ = parameter->out_mem_type;
  std::string program_name = "to_format";
  // Token table for composing the kernel name. NOTE(review): an unsupported
  // format yields an empty token here — presumably the scheduler only emits
  // supported pairs; verify upstream.
  std::map<schema::Format, std::string> format_str{{schema::Format_NCHW, "NCHW"},
                                                   {schema::Format_NHWC, "NHWC"},
                                                   {schema::Format_NC4HW4, "NC4HW4"},
                                                   {schema::Format_NHWC4, "NHWC4"}};
  std::string kernel_name =
    "to_format_" + format_str[in_tensors_[0]->GetFormat()] + "_to_" + format_str[out_tensors_[0]->GetFormat()];
  if (out_mem_type_ == OpenCLMemType::IMG) {
    kernel_name += "_IMG";  // destination lives in image memory
  } else {
    kernel_name += "_BUF";  // destination lives in buffer memory
  }
#ifdef PROGRAM_WITH_IL
  ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
#else
  std::set<std::string> build_options;
#ifdef ENABLE_FP16
  std::string source = to_format_source_fp16;
#else
  std::string source = to_format_source_fp32;
#endif
  ocl_runtime->LoadSource(program_name, source);
  ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
#endif
  MS_LOG(DEBUG) << kernel_name << " Init Done!";
  return RET_OK;
}
// Nothing to recompute on resize: launch sizes are derived per-run from the tensor shapes.
int ToFormatOpenCLKernel::ReSize() { return RET_OK; }
| int ToFormatOpenCLKernel::GetGlobalSize(size_t idx, std::vector<size_t> *global_size) { | |||
| std::vector<int> shapex = out_tensors_[0]->shape(); | |||
| if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4 || out_tensors_[0]->GetFormat() == schema::Format_NHWC) { | |||
| int h = shapex[1]; | |||
| int w = shapex[2]; | |||
| int c = shapex[3]; | |||
| int c4 = UP_DIV(c, C4NUM); | |||
| std::vector<size_t> vec = {(size_t)h, (size_t)w, (size_t)c4}; | |||
| *global_size = std::move(vec); | |||
| } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4 || | |||
| out_tensors_[0]->GetFormat() == schema::Format_NCHW) { | |||
| int h = shapex[2]; | |||
| int w = shapex[3]; | |||
| int c = shapex[1]; | |||
| int c4 = UP_DIV(c, C4NUM); | |||
| std::vector<size_t> vec = {(size_t)c4, (size_t)h, (size_t)w}; | |||
| *global_size = std::move(vec); | |||
| } else if (out_tensors_[0]->GetFormat() == out_tensors_[0]->GetFormat() == schema::Format_NCHW) { | |||
| int h = shapex[2]; | |||
| int w = shapex[3]; | |||
| int c = shapex[1]; | |||
| int w4 = UP_DIV(w, C4NUM); | |||
| std::vector<size_t> vec = {(size_t)w4, (size_t)h, (size_t)c}; | |||
| *global_size = std::move(vec); | |||
| } | |||
| return RET_OK; | |||
| } | |||
// Intentionally leaves local_size untouched (empty) so the runtime/driver
// chooses the work-group size at enqueue time.
int ToFormatOpenCLKernel::GetLocalSize(size_t idx, const std::vector<size_t> &global_size,
                                       std::vector<size_t> *local_size) {
  return RET_OK;
}
| int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) { | |||
| size_t im_dst_x, im_dst_y; | |||
| std::vector<int> shapex = out_tensors_[0]->shape(); | |||
| if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) { | |||
| int h = shapex[1]; | |||
| int w = shapex[2]; | |||
| int c = shapex[3]; | |||
| im_dst_y = UP_DIV(h * c, C4NUM); | |||
| im_dst_x = w; | |||
| } else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) { | |||
| int h = shapex[2]; | |||
| int w = shapex[3]; | |||
| int c = shapex[1]; | |||
| im_dst_x = UP_DIV(w * c, C4NUM); | |||
| im_dst_y = h; | |||
| } else { | |||
| MS_LOG(ERROR) << "Unsupported format. " << out_tensors_[0]->GetFormat(); | |||
| } | |||
| img_size->clear(); | |||
| #ifdef ENABLE_FP16 | |||
| size_t img_dtype = CL_HALF_FLOAT; | |||
| #else | |||
| size_t img_dtype = CL_FLOAT; | |||
| #endif | |||
| std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype}; | |||
| *img_size = vec; | |||
| return RET_OK; | |||
| } | |||
// Enqueues one conversion. Kernel args: 0 = src data, 1 = dst data,
// 2 = launch extents (x, y, z, 1), 3 = input shape padded to 4 dims with 1s.
int ToFormatOpenCLKernel::Run() {
  MS_LOG(DEBUG) << "ToFormat" << " Running!";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  std::vector<size_t> local = {};  // empty -> driver picks the work-group size
  std::vector<size_t> global;
  // NOTE(review): return value ignored — if the output format is unsupported
  // `global` stays empty and global[0..2] below is out of bounds; presumably
  // only supported formats reach Run(). Confirm.
  GetGlobalSize(0, &global);
  auto shapex = in_tensors_[0]->shape();
  cl_int4 shape{shapex.size() > 0 ? shapex[0] : 1, shapex.size() > 1 ? shapex[1] : 1, shapex.size() > 2 ? shapex[2] : 1,
                shapex.size() > 3 ? shapex[3] : 1};
  cl_int4 gsize{(cl_int)global[0], (cl_int)global[1], (cl_int)global[2], 1};
  ocl_runtime->SetKernelArg(kernel_, 0, in_tensors_[0]->Data());
  ocl_runtime->SetKernelArg(kernel_, 1, out_tensors_[0]->Data());
  ocl_runtime->SetKernelArg(kernel_, 2, gsize);
  ocl_runtime->SetKernelArg(kernel_, 3, shape);
  ocl_runtime->RunKernel(kernel_, global, local, nullptr);
  return RET_OK;
}
| kernel::LiteKernel *OpenCLToFormatKernelCreator(const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, | |||
| OpParameter *opParameter, const lite::Context *ctx, | |||
| const kernel::KernelKey &desc, const lite::Primitive *primitive) { | |||
| auto *kernel = new (std::nothrow) ToFormatOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "kernel " << opParameter->name_ << " create failed."; | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != RET_OK) { | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| } | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_ToFormat, OpenCLToFormatKernelCreator) | |||
| } // namespace mindspore::kernel | |||
| @@ -0,0 +1,46 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_BACKEND_OPENCL_TO_FORMAT_H_ | |||
| #define MINDSPORE_LITE_SRC_BACKEND_OPENCL_TO_FORMAT_H_ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "src/runtime/opencl/opencl_runtime.h" | |||
| #include "src/runtime/kernel/opencl/opencl_kernel.h" | |||
| namespace mindspore::kernel { | |||
| class ToFormatOpenCLKernel : public OpenCLKernel { | |||
| public: | |||
| explicit ToFormatOpenCLKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs) | |||
| : OpenCLKernel(parameter, inputs, outputs) {} | |||
| ~ToFormatOpenCLKernel() override{}; | |||
| int Init() override; | |||
| int ReSize() override; | |||
| int Run() override; | |||
| int GetImageSize(size_t idx, std::vector<size_t> *img_size) override; | |||
| int GetGlobalSize(size_t idx, std::vector<size_t> *global_size) override; | |||
| int GetLocalSize(size_t idx, const std::vector<size_t> &global_size, std::vector<size_t> *local_size) override; | |||
| private: | |||
| cl::Kernel kernel_; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
#endif  // MINDSPORE_LITE_SRC_BACKEND_OPENCL_TO_FORMAT_H_
| @@ -109,7 +109,8 @@ kernel::LiteKernel *OpenCLTransposeKernelCreator(const std::vector<lite::tensor: | |||
| const std::vector<lite::tensor::Tensor *> &outputs, | |||
| OpParameter *opParameter, const lite::Context *ctx, | |||
| const kernel::KernelKey &desc, const lite::Primitive *primitive) { | |||
| auto *kernel = new (std::nothrow)TransposeOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||
| auto *kernel = | |||
| new (std::nothrow) TransposeOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; | |||
| return nullptr; | |||
| @@ -21,21 +21,37 @@ | |||
| #include "src/lite_kernel.h" | |||
| namespace mindspore::kernel { | |||
| enum class OpenCLMemType { BUF, IMG }; | |||
// Parameter blob for the ToFormat conversion op.
struct OpenCLToFormatParameter {
  OpParameter op_parameter;  // must stay the first member: the blob is passed around as OpParameter*
  schema::Format src_format{schema::Format_NHWC};   // layout of the source tensor
  schema::Format dst_format{schema::Format_NHWC4};  // layout to convert into
  OpenCLMemType out_mem_type{OpenCLMemType::IMG};   // whether output lives in image or buffer memory
};
| class OpenCLKernel : public LiteKernel { | |||
| public: | |||
| explicit OpenCLKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs) | |||
| : LiteKernel(parameter, inputs, outputs, nullptr, nullptr) {} | |||
| : LiteKernel(parameter, inputs, outputs, nullptr, nullptr) {} | |||
| virtual int Init() { return -1; } | |||
| virtual int Prepare() { return -1; } | |||
| virtual int InferShape() { return -1; } | |||
| virtual int ReSize() { return -1; } | |||
| virtual int Run() { return -1; } | |||
| virtual int GetImageSize(size_t idx, std::vector<size_t>* img_size) { return -1; } | |||
| virtual int GetGlobalSize(size_t idx, std::vector<size_t>* global_size) { return -1; } | |||
| virtual int GetLocalSize(size_t idx, const std::vector<size_t>& global_size, | |||
| std::vector<size_t>* local_size) { return -1; } | |||
| virtual int GetImageSize(size_t idx, std::vector<size_t> *img_size) { return -1; } | |||
| virtual int GetGlobalSize(size_t idx, std::vector<size_t> *global_size) { return -1; } | |||
| virtual int GetLocalSize(size_t idx, const std::vector<size_t> &global_size, std::vector<size_t> *local_size) { | |||
| return -1; | |||
| } | |||
| OpenCLMemType GetMemType() { return out_mem_type_; } | |||
| void SetMemType(OpenCLMemType mem_type) { out_mem_type_ = mem_type; } | |||
| protected: | |||
| OpenCLMemType out_mem_type_{OpenCLMemType::IMG}; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -17,22 +17,117 @@ | |||
| #include "src/runtime/kernel/opencl/subgraph_opencl_kernel.h" | |||
| #include "src/runtime/opencl/opencl_executor.h" | |||
| #include "src/runtime/opencl/opencl_runtime.h" | |||
| #include "src/runtime/kernel/opencl/utils.h" | |||
| #include "include/errorcode.h" | |||
| #include "src/common/utils.h" | |||
| namespace mindspore::kernel { | |||
| SubGraphOpenCLKernel::~SubGraphOpenCLKernel() { UnInit(); } | |||
| int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::tensor::Tensor *> &in_tensors, | |||
| const std::vector<kernel::LiteKernel *> in_kernels, | |||
| std::vector<lite::tensor::Tensor *> *out_tensors, | |||
| std::vector<OpenCLToFormatParameter *> *out_parameters, | |||
| std::vector<LiteKernel *> *out_convert_ops, OpenCLMemType mem_type) { | |||
| out_tensors->clear(); | |||
| out_parameters->clear(); | |||
| out_convert_ops->clear(); | |||
| for (size_t i = 0; i < in_tensors.size(); ++i) { | |||
| lite::tensor::Tensor *new_tensor = new (std::nothrow) lite::tensor::Tensor(); | |||
| MS_ASSERT(new_tensor); | |||
| if (new_tensor == nullptr) { | |||
| MS_LOG(ERROR) << "SubGraphOpenCLKernel new tensor failed!"; | |||
| return RET_ERROR; | |||
| } | |||
| new_tensor->CopyTensor(*in_tensors[i]); | |||
| auto dst_format = | |||
| (mem_type == OpenCLMemType::IMG) ? in_kernels.back()->out_tensors()[0]->GetFormat() : in_tensors[i]->GetFormat(); | |||
| auto src_format = | |||
| (mem_type == OpenCLMemType::IMG) ? in_tensors[i]->GetFormat() : in_kernels.front()->out_tensors()[0]->GetFormat(); | |||
| if ((dst_format == schema::Format_NCHW || dst_format == schema::Format_NC4HW4) && | |||
| (src_format == schema::Format_NHWC || src_format == schema::Format_NHWC4)) { | |||
| auto &shape = new_tensor->shape(); | |||
| std::vector<int> dst_shape{shape[0], shape[3], shape[1], shape[2]}; | |||
| new_tensor->set_shape(shape); | |||
| } | |||
| if ((dst_format == schema::Format_NHWC || dst_format == schema::Format_NHWC4) && | |||
| (src_format == schema::Format_NCHW || src_format == schema::Format_NC4HW4)) { | |||
| auto &shape = new_tensor->shape(); | |||
| std::vector<int> dst_shape{shape[0], shape[2], shape[3], shape[1]}; | |||
| new_tensor->set_shape(shape); | |||
| } | |||
| new_tensor->SetFormat(dst_format); | |||
| out_tensors->emplace_back(new_tensor); | |||
| #ifdef ENABLE_FP16 | |||
| KernelKey desc{kGPU, kNumberTypeFloat16, schema::PrimitiveType_ToFormat}; | |||
| #else | |||
| KernelKey desc{kGPU, kNumberTypeFloat32, schema::PrimitiveType_ToFormat}; | |||
| #endif | |||
| OpenCLToFormatParameter *parameter = new (std::nothrow) OpenCLToFormatParameter; | |||
| MS_ASSERT(parameter); | |||
| if (parameter == nullptr) { | |||
| MS_LOG(ERROR) << "SubGraphOpenCLKernel new parameter failed!"; | |||
| return RET_ERROR; | |||
| } | |||
| parameter->src_format = src_format; | |||
| parameter->dst_format = dst_format; | |||
| parameter->out_mem_type = mem_type; | |||
| out_parameters->emplace_back(parameter); | |||
| LiteKernel *in_convert_op; | |||
| if (mem_type == OpenCLMemType::IMG) { | |||
| in_convert_op = | |||
| lite::GetOpenCLKernel({in_tensors[i]}, {new_tensor}, reinterpret_cast<OpParameter *>(parameter), nullptr, desc); | |||
| } else { | |||
| in_convert_op = | |||
| lite::GetOpenCLKernel({new_tensor}, {in_tensors[i]}, reinterpret_cast<OpParameter *>(parameter), nullptr, desc); | |||
| } | |||
| MS_ASSERT(in_convert_op); | |||
| if (in_convert_op == nullptr) { | |||
| MS_LOG(ERROR) << "SubGraphOpenCLKernel create op failed!"; | |||
| return RET_ERROR; | |||
| } | |||
| auto in_opencl_op = reinterpret_cast<OpenCLKernel *>(in_convert_op); | |||
| if (mem_type == OpenCLMemType::IMG) { | |||
| in_opencl_op->AddOutKernel(in_kernels[i]); | |||
| reinterpret_cast<OpenCLKernel *>(in_kernels[i])->SetInKernel({in_convert_op}); | |||
| reinterpret_cast<OpenCLKernel *>(in_kernels[i])->set_in_tensors({new_tensor}); | |||
| } else { | |||
| reinterpret_cast<OpenCLKernel *>(in_kernels[i])->SetOutKernel({in_convert_op}); | |||
| reinterpret_cast<OpenCLKernel *>(in_kernels[i])->set_out_tensors({new_tensor}); | |||
| in_convert_op->AddInKernel(in_kernels[i]); | |||
| } | |||
| out_convert_ops->emplace_back(in_convert_op); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int SubGraphOpenCLKernel::Init() { | |||
| allocator_ = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator(); | |||
| MS_LOG(DEBUG) << "input num=" << inputs_.size() << ", output num=" << outputs_.size(); | |||
| for (const auto tensor : inputs_) { | |||
| MS_LOG(DEBUG) << "input num=" << in_tensors_.size() << ", output num=" << out_tensors_.size(); | |||
| for (const auto tensor : in_tensors_) { | |||
| tensor->set_allocator(allocator_); | |||
| } | |||
| for (const auto tensor : outputs_) { | |||
| for (const auto tensor : out_tensors_) { | |||
| tensor->set_allocator(allocator_); | |||
| } | |||
| int ret = GenToFormatOp(in_tensors_, in_kernels_, &in_convert_tensors_, &in_parameters_, &in_convert_ops_, | |||
| OpenCLMemType::IMG); | |||
| if (ret != RET_OK) { | |||
| return RET_ERROR; | |||
| } | |||
| nodes_.insert(nodes_.begin(), in_convert_ops_.begin(), in_convert_ops_.end()); | |||
| ret = GenToFormatOp(out_tensors_, out_kernels_, &out_convert_tensors_, &out_parameters_, &out_convert_ops_, | |||
| OpenCLMemType::BUF); | |||
| if (ret != RET_OK) { | |||
| return RET_ERROR; | |||
| } | |||
| nodes_.insert(nodes_.end(), out_convert_ops_.begin(), out_convert_ops_.end()); | |||
| MallocTensorWithReuse(); | |||
| // Map buffer for write, it is not necessary for fine-grained | |||
| for (auto &tensor : inputs_) { | |||
| for (auto &tensor : in_tensors_) { | |||
| void *data = tensor->Data(); | |||
| // It is required with coarse-grained SVM | |||
| if (data != nullptr) { | |||
| @@ -42,44 +137,118 @@ int SubGraphOpenCLKernel::Init() { | |||
| MS_LOG(ERROR) << "SubGraphOpenCLKernel input nullptr!"; | |||
| } | |||
| } | |||
| return 0; | |||
| return RET_OK; | |||
| } | |||
| int SubGraphOpenCLKernel::MallocTensorWithReuse() { | |||
| kernel::LiteKernelUtil::InitTensorRefCount(nodes_); | |||
| for (auto *kernel : nodes_) { | |||
| MS_ASSERT(nullptr != kernel); | |||
| kernel::OpenCLKernel *op_kernel = reinterpret_cast<kernel::OpenCLKernel *>(kernel); | |||
| auto &outputs = kernel->out_tensors(); | |||
| for (auto i = 0; i < outputs.size(); ++i) { | |||
| auto *output = outputs.at(i); | |||
| MS_ASSERT(nullptr != output); | |||
| if (op_kernel->GetMemType() == OpenCLMemType::IMG) { | |||
| std::vector<size_t> img_size; | |||
| op_kernel->GetImageSize(i, &img_size); | |||
| auto data_ptr = allocator_->Malloc(output->Size(), img_size); | |||
| output->SetData(data_ptr); | |||
| } else { | |||
| output->MallocData(allocator_); | |||
| } | |||
| output->set_allocator(allocator_); | |||
| } | |||
| for (auto input_kernel : kernel->in_kernels()) { | |||
| MS_EXCEPTION_IF_NULL(input_kernel); | |||
| auto ret = input_kernel->DecOutTensorRefCount(); | |||
| if (0 != ret) { | |||
| MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed"; | |||
| } | |||
| } | |||
| } | |||
| for (auto kernel : out_kernels_) { | |||
| MS_EXCEPTION_IF_NULL(kernel); | |||
| auto ret = kernel->DecOutTensorRefCount(); | |||
| if (0 != ret) { | |||
| MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed"; | |||
| } | |||
| } | |||
| for (auto kernel : in_convert_ops_) { | |||
| MS_EXCEPTION_IF_NULL(kernel); | |||
| auto ret = kernel->DecOutTensorRefCount(); | |||
| if (0 != ret) { | |||
| MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed"; | |||
| } | |||
| } | |||
| for (auto kernel : out_convert_ops_) { | |||
| MS_EXCEPTION_IF_NULL(kernel); | |||
| auto ret = kernel->DecOutTensorRefCount(); | |||
| if (0 != ret) { | |||
| MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed"; | |||
| } | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int SubGraphOpenCLKernel::UnInit() { | |||
| for (auto &tensor : outputs_) { | |||
| for (auto &tensor : out_tensors_) { | |||
| allocator_->UnmapBuffer(tensor->Data()); | |||
| } | |||
| for (const auto tensor : in_tensors_) { | |||
| if (tensor != nullptr) { | |||
| tensor->FreeData(); | |||
| } | |||
| } | |||
| for (const auto tensor : out_tensors_) { | |||
| if (tensor != nullptr) { | |||
| tensor->FreeData(); | |||
| } | |||
| } | |||
| for (auto &tensor : out_tensors_) { | |||
| allocator_->UnmapBuffer(tensor->Data()); | |||
| } | |||
| for (const auto tensor : inputs_) { | |||
| for (const auto tensor : in_convert_tensors_) { | |||
| if (tensor != nullptr) { | |||
| tensor->FreeData(); | |||
| delete tensor; | |||
| } | |||
| } | |||
| for (const auto tensor : outputs_) { | |||
| for (const auto tensor : out_convert_tensors_) { | |||
| if (tensor != nullptr) { | |||
| tensor->FreeData(); | |||
| delete tensor; | |||
| } | |||
| } | |||
| return 0; | |||
| for (const auto parameter : in_parameters_) { | |||
| if (parameter != nullptr) { | |||
| delete parameter; | |||
| } | |||
| } | |||
| for (const auto op : in_convert_ops_) { | |||
| if (op != nullptr) { | |||
| delete op; | |||
| } | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int SubGraphOpenCLKernel::InferShape() { return 0; } | |||
| int SubGraphOpenCLKernel::InferShape() { return RET_OK; } | |||
| int SubGraphOpenCLKernel::ReSize() { return 0; } | |||
| int SubGraphOpenCLKernel::ReSize() { return RET_OK; } | |||
| int SubGraphOpenCLKernel::Run() { | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| for (auto &tensor : inputs_) { | |||
| for (auto &tensor : in_tensors_) { | |||
| allocator_->UnmapBuffer(tensor->Data()); | |||
| } | |||
| lite::opencl::OpenCLExecutor executor; | |||
| executor.Run(inputs_, outputs_, nodes_, allocator_); | |||
| ocl_runtime->SyncCommandQueue(); | |||
| for (auto &tensor : outputs_) { | |||
| executor.Run(in_tensors_, out_tensors_, nodes_, allocator_); | |||
| for (auto &tensor : out_tensors_) { | |||
| void *data = allocator_->MapBuffer(tensor->Data(), CL_MAP_READ, nullptr, true); | |||
| tensor->SetData(data); | |||
| } | |||
| return 0; | |||
| return RET_OK; | |||
| } | |||
| } // namespace mindspore::kernel | |||
| @@ -36,7 +36,7 @@ class SubGraphOpenCLKernel : public SubGraphKernel { | |||
| const std::vector<kernel::LiteKernel *> inKernels, | |||
| const std::vector<kernel::LiteKernel *> outKernels, | |||
| const std::vector<kernel::LiteKernel *> nodes) | |||
| : SubGraphKernel(inputs, outputs, inKernels, outKernels, nodes, nullptr, nullptr) {} | |||
| : SubGraphKernel(inputs, outputs, inKernels, outKernels, nodes, nullptr, nullptr) {} | |||
| ~SubGraphOpenCLKernel() override; | |||
| int Init() override; | |||
| @@ -45,11 +45,24 @@ class SubGraphOpenCLKernel : public SubGraphKernel { | |||
| int Run() override; | |||
| int UnInit(); | |||
| protected: | |||
| int MallocTensorWithReuse(); | |||
| int GenToFormatOp(const std::vector<lite::tensor::Tensor *> &in_tensors, | |||
| const std::vector<kernel::LiteKernel *> in_kernels, | |||
| std::vector<lite::tensor::Tensor *> *out_tensors, | |||
| std::vector<OpenCLToFormatParameter *> *out_parameters, std::vector<LiteKernel *> *out_convert_ops, | |||
| OpenCLMemType mem_type); | |||
| private: | |||
| SubGraphOpenCLParameter *subgraph_ocl_parameter_; | |||
| lite::opencl::OpenCLAllocator *allocator_; | |||
| std::vector<lite::tensor::Tensor *> in_convert_tensors_; | |||
| std::vector<lite::tensor::Tensor *> out_convert_tensors_; | |||
| std::vector<OpenCLToFormatParameter *> in_parameters_; | |||
| std::vector<OpenCLToFormatParameter *> out_parameters_; | |||
| std::vector<LiteKernel *> in_convert_ops_; | |||
| std::vector<LiteKernel *> out_convert_ops_; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_BACKEND_OPENCL_SUBGRAPH_OPENCL_KERNEL_H_ | |||
| @@ -18,6 +18,22 @@ | |||
| #include <algorithm> | |||
| #include <string> | |||
| #include <vector> | |||
| #include "src/kernel_registry.h" | |||
| using mindspore::lite::KernelRegistrar; | |||
| namespace mindspore::lite { | |||
| kernel::LiteKernel *GetOpenCLKernel(const std::vector<tensor::Tensor *> &in_tensors, | |||
| const std::vector<tensor::Tensor *> &out_tensors, OpParameter *parameter, | |||
| const Context *ctx, const kernel::KernelKey &key) { | |||
| auto creator = KernelRegistry::GetInstance()->GetCreator(key); | |||
| if (creator != nullptr) { | |||
| auto kernel = creator(in_tensors, out_tensors, parameter, nullptr, key, nullptr); | |||
| return kernel; | |||
| } | |||
| return nullptr; | |||
| } | |||
| } // namespace mindspore::lite | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| @@ -22,6 +22,13 @@ | |||
| #include "CL/cl2.hpp" | |||
| #include "utils/log_adapter.h" | |||
| #include "src/runtime/kernel/arm/nnacl/op_base.h" | |||
| #include "src/lite_kernel.h" | |||
| namespace mindspore::lite { | |||
| kernel::LiteKernel *GetOpenCLKernel(const std::vector<tensor::Tensor *> &in_tensors, | |||
| const std::vector<tensor::Tensor *> &out_tensors, OpParameter *parameter, | |||
| const Context *ctx, const kernel::KernelKey &key); | |||
| } | |||
| namespace mindspore::kernel { | |||
| @@ -16,10 +16,10 @@ | |||
| #include "src/runtime/opencl/opencl_allocator.h" | |||
| #include <utility> | |||
| #include "utils/log_adapter.h" | |||
| #include "src/runtime/opencl/opencl_runtime.h" | |||
| #include "include/errorcode.h" | |||
| #include "src/runtime/kernel/opencl/utils.h" | |||
| #include "utils/log_adapter.h" | |||
| #include "include/errorcode.h" | |||
| namespace mindspore::lite::opencl { | |||
| @@ -61,7 +61,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size) | |||
| } | |||
| Lock(); | |||
| auto iter = free_list_.lower_bound(size); | |||
| if (iter != free_list_.end() && (iter->second->size_ >= size) && (iter->second->size_ < (size << shift_factor_))) { | |||
| while (iter != free_list_.end() && (iter->second->size_ >= size) && (iter->second->size_ < (size << shift_factor_))) { | |||
| auto mem_buf = iter->second; | |||
| bool is_match{mem_buf->img_size.size() == img_size.size()}; | |||
| for (int i = 0; i < img_size.size() && is_match; ++i) { | |||
| @@ -75,6 +75,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size) | |||
| << ", host addr: " << mem_buf->host_ptr_ << ", device addr: " << mem_buf->device_ptr_; | |||
| return mem_buf->host_ptr_; | |||
| } | |||
| ++iter; | |||
| } | |||
| void *host_ptr = nullptr; | |||
| void *device_ptr = nullptr; | |||
| @@ -136,7 +137,7 @@ void *OpenCLAllocator::CreateImageFromHost(void *data, size_t size, const std::v | |||
| auto ocl_runtime = opencl::OpenCLRuntime::GetInstance(); | |||
| Lock(); | |||
| auto iter = free_list_.lower_bound(size); | |||
| if (iter != free_list_.end() && (iter->second->size_ >= size) && (iter->second->size_ < (size << shift_factor_))) { | |||
| while (iter != free_list_.end() && (iter->second->size_ >= size) && (iter->second->size_ < (size << shift_factor_))) { | |||
| auto mem_buf = iter->second; | |||
| bool is_match{mem_buf->img_size.size() == img_size.size()}; | |||
| for (int i = 0; i < img_size.size() && is_match; ++i) { | |||
| @@ -150,6 +151,7 @@ void *OpenCLAllocator::CreateImageFromHost(void *data, size_t size, const std::v | |||
| << ", host addr: " << mem_buf->host_ptr_ << ", device addr: " << mem_buf->device_ptr_; | |||
| return mem_buf->host_ptr_; | |||
| } | |||
| ++iter; | |||
| } | |||
| void *host_ptr = nullptr; | |||
| void *device_ptr = nullptr; | |||
| @@ -198,10 +200,13 @@ void OpenCLAllocator::Free(void *buf) { | |||
| allocated_list_.erase(iter); | |||
| free_list_.insert(std::make_pair(mem_buf->size_, mem_buf)); | |||
| UnLock(); | |||
| MS_LOG(DEBUG) << "Free a new Image2D. size: " << mem_buf->size_ << ", host addr: " << mem_buf->host_ptr_ | |||
| << ", device addr: " << mem_buf->device_ptr_ << ", image addr: " << mem_buf->image_ptr_; | |||
| return; | |||
| } | |||
| UnLock(); | |||
| free(buf); | |||
| MS_LOG(DEBUG) << "Free host ptr: " << buf; | |||
| } | |||
| size_t OpenCLAllocator::GetTotalSize() { | |||
| @@ -24,41 +24,9 @@ namespace mindspore::lite::opencl { | |||
| int OpenCLExecutor::Run(std::vector<tensor::Tensor *> &inputs, std::vector<tensor::Tensor *> &outputs, | |||
| std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator, | |||
| const session::KernelCallBack &before, const session::KernelCallBack &after) { | |||
| MS_ASSERT(nullptr != allocator); | |||
| for (auto &inTensor : inputs) { | |||
| if (inTensor == nullptr) { | |||
| MS_LOG(ERROR) << "Graph input tensor is nullptr"; | |||
| return RET_ERROR; | |||
| } | |||
| if (inTensor->GetFormat() != schema::Format_NHWC4 && inTensor->GetFormat() != schema::Format_NC4HW4 && | |||
| inTensor->GetFormat() != schema::Format_NHWC) { | |||
| MS_LOG(ERROR) << "input should be NHWC/NHWC4/NC4HW4, actual is " << schema::EnumNameFormat(inTensor->GetFormat()); | |||
| return RET_ERROR; | |||
| } else { | |||
| TransformTensorLayout(inTensor, inTensor->GetFormat(), schema::Format_NHWC4, true); | |||
| // TransformTensorLayout(inTensor, inTensor->GetFormat(), schema::Format_NC4HW4, true); | |||
| } | |||
| } | |||
| kernel::LiteKernelUtil::InitTensorRefCount(kernels); | |||
| OpenCLAllocator* op_allocator = reinterpret_cast<OpenCLAllocator*>(allocator); | |||
| for (auto *kernel : kernels) { | |||
| MS_ASSERT(nullptr != kernel); | |||
| kernel::OpenCLKernel *op_kernel = reinterpret_cast<kernel::OpenCLKernel*>(kernel); | |||
| auto &outputs = kernel->out_tensors(); | |||
| for (auto i = 0; i < outputs.size(); ++i) { | |||
| auto *output = outputs.at(i); | |||
| MS_ASSERT(nullptr != output); | |||
| if (is_image2d_out_) { | |||
| std::vector<size_t> img_size; | |||
| op_kernel->GetImageSize(i, &img_size); | |||
| auto data_ptr = op_allocator->Malloc(output->Size(), img_size); | |||
| output->SetData(data_ptr); | |||
| } else { | |||
| output->MallocData(allocator); | |||
| } | |||
| output->set_allocator(allocator); | |||
| } | |||
| session::CallBackParam callbackParam; | |||
| callbackParam.name_callback_param = kernel->name(); | |||
| @@ -67,6 +35,21 @@ int OpenCLExecutor::Run(std::vector<tensor::Tensor *> &inputs, std::vector<tenso | |||
| MS_LOG(ERROR) << "run kernel before_callback failed, name: " << kernel->name(); | |||
| } | |||
| } | |||
| kernel::OpenCLKernel *op_kernel = reinterpret_cast<kernel::OpenCLKernel *>(kernel); | |||
| auto &cur_outputs = kernel->out_tensors(); | |||
| for (auto i = 0; i < cur_outputs.size(); ++i) { | |||
| auto *output = cur_outputs.at(i); | |||
| MS_ASSERT(nullptr != output); | |||
| if (op_kernel->GetMemType() == kernel::OpenCLMemType::IMG) { | |||
| std::vector<size_t> img_size; | |||
| op_kernel->GetImageSize(i, &img_size); | |||
| auto data_ptr = allocator_->Malloc(output->Size(), img_size); | |||
| output->SetData(data_ptr); | |||
| } else { | |||
| output->MallocData(allocator_); | |||
| } | |||
| } | |||
| auto ret = kernel->Run(); | |||
| if (0 != ret) { | |||
| MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name(); | |||
| @@ -86,21 +69,11 @@ int OpenCLExecutor::Run(std::vector<tensor::Tensor *> &inputs, std::vector<tenso | |||
| } | |||
| } | |||
| } | |||
| // output format transform | |||
| for (auto &outTensor : outputs) { | |||
| if (outTensor == nullptr) { | |||
| MS_LOG(ERROR) << "Graph output tensor is nullptr"; | |||
| return RET_ERROR; | |||
| } | |||
| if (outTensor->GetFormat() != schema::Format_NHWC) { | |||
| TransformTensorLayout(outTensor, outTensor->GetFormat(), schema::Format_NHWC, false); | |||
| } | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int OpenCLExecutor::TransformTensorLayout(tensor::Tensor *tensor, schema::Format src_format, | |||
| schema::Format dst_format, bool trans_dir) { | |||
| int OpenCLExecutor::TransformTensorLayout(tensor::Tensor *tensor, schema::Format src_format, schema::Format dst_format, | |||
| bool trans_dir) { | |||
| MS_ASSERT(nullptr != tensor); | |||
| MS_ASSERT(4 == tensor->shape().size()); | |||
| auto data_type = tensor->data_type(); | |||
| @@ -114,11 +87,10 @@ int OpenCLExecutor::TransformTensorLayout(tensor::Tensor *tensor, schema::Format | |||
| << schema::EnumNameFormat(dst_format); | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int OpenCLExecutor::TransformTensorLayoutFp32(tensor::Tensor *tensor, schema::Format src_format, | |||
| schema::Format dst_format, bool trans_dir) { | |||
| schema::Format dst_format, bool trans_dir) { | |||
| MS_ASSERT(nullptr != tensor); | |||
| MS_ASSERT(nullptr != allocator_); | |||
| MS_ASSERT(4 == tensor->shape().size()); | |||
| @@ -138,11 +110,11 @@ int OpenCLExecutor::TransformTensorLayoutFp32(tensor::Tensor *tensor, schema::Fo | |||
| } | |||
| int OpenCLExecutor::TransformTensorLayoutToBuffer(tensor::Tensor *tensor, schema::Format src_format, | |||
| schema::Format dst_format) { | |||
| schema::Format dst_format) { | |||
| if (dst_format == schema::Format_NHWC4) { | |||
| auto *src_data = tensor->Data(); | |||
| size_t C4 = UP_DIV(tensor->Channel(), C4NUM); | |||
| std::vector <size_t> img_size{tensor->Width() * C4, (size_t) tensor->Height(), CL_FLOAT}; | |||
| std::vector<size_t> img_size{tensor->Width() * C4, (size_t)tensor->Height(), CL_FLOAT}; | |||
| if (src_format == schema::Format_NHWC) { | |||
| auto *dst_data = allocator_->Malloc(tensor->Size(), img_size); | |||
| if (dst_data == nullptr) { | |||
| @@ -168,7 +140,7 @@ int OpenCLExecutor::TransformTensorLayoutToBuffer(tensor::Tensor *tensor, schema | |||
| } | |||
| int OpenCLExecutor::TransformTensorLayoutToImage(tensor::Tensor *tensor, schema::Format src_format, | |||
| schema::Format dst_format) { | |||
| schema::Format dst_format) { | |||
| if (dst_format == schema::Format_NHWC4) { | |||
| tensor->SetFormat(schema::Format_NHWC4); | |||
| // convert to nhwc4 | |||
| @@ -202,15 +174,15 @@ int OpenCLExecutor::TransformTensorLayoutToImage(tensor::Tensor *tensor, schema: | |||
| } | |||
| int OpenCLExecutor::TransformTensorLayoutFromImage(tensor::Tensor *tensor, schema::Format src_format, | |||
| schema::Format dst_format) { | |||
| schema::Format dst_format) { | |||
| if (dst_format == schema::Format_NHWC) { | |||
| auto src_data = tensor->Data(); | |||
| auto dst_data = allocator_->Malloc(tensor->Size()); | |||
| cl::Image2D *out_mem = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data)); | |||
| std::vector<size_t> img_size; | |||
| allocator_->GetImageSize(src_data, &img_size); | |||
| auto origin = cl::array < cl::size_type, 3U > {0, 0, 0}; | |||
| auto region = cl::array < cl::size_type, 3U > {img_size[0], img_size[1], 1}; | |||
| auto origin = cl::array<cl::size_type, 3U>{0, 0, 0}; | |||
| auto region = cl::array<cl::size_type, 3U>{img_size[0], img_size[1], 1}; | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| ocl_runtime->GetDefaultCommandQueue()->enqueueReadImage(*out_mem, CL_TRUE, origin, region, 0, 0, dst_data); | |||
| tensor->SetData(dst_data); | |||
| @@ -224,7 +196,7 @@ int OpenCLExecutor::TransformTensorLayoutFromImage(tensor::Tensor *tensor, schem | |||
| } | |||
| int OpenCLExecutor::TransformTensorLayoutUint8(tensor::Tensor *tensor, schema::Format src_format, | |||
| schema::Format dst_format, bool is_image) { | |||
| schema::Format dst_format, bool is_image) { | |||
| MS_ASSERT(nullptr != tensor); | |||
| MS_ASSERT(4 == tensor->shape().size()); | |||
| // auto src_format = tensor->GetFormat(); | |||
| @@ -234,4 +206,3 @@ int OpenCLExecutor::TransformTensorLayoutUint8(tensor::Tensor *tensor, schema::F | |||
| return RET_ERROR; | |||
| } | |||
| } // namespace mindspore::lite::opencl | |||
| @@ -148,6 +148,7 @@ if (SUPPORT_GPU) | |||
| ${LITE_DIR}/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc | |||
| ${LITE_DIR}/src/runtime/kernel/opencl/kernel/transpose.cc | |||
| ${LITE_DIR}/src/runtime/kernel/opencl/kernel/reshape.cc | |||
| ${LITE_DIR}/src/runtime/kernel/opencl/kernel/to_format.cc | |||
| ) | |||
| endif() | |||
| ### minddata lite | |||
| @@ -323,6 +324,8 @@ if (SUPPORT_GPU) | |||
| ${TEST_DIR}/ut/src/runtime/kernel/opencl/transpose_tests.cc | |||
| ${TEST_DIR}/ut/src/runtime/kernel/opencl/convolution_tests.cc | |||
| ${TEST_DIR}/ut/src/runtime/kernel/opencl/activation_tests.cc | |||
| #${TEST_DIR}/ut/src/runtime/kernel/opencl/leakyrelu_tests.cc | |||
| ${TEST_DIR}/ut/src/runtime/kernel/opencl/to_format_tests.cc | |||
| ) | |||
| endif() | |||
| @@ -0,0 +1,79 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <iostream> | |||
| #include <memory> | |||
| #include "mindspore/core/utils/log_adapter.h" | |||
| #include "common/common_test.h" | |||
| #include "mindspore/lite/src/common/file_utils.h" | |||
| #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" | |||
| #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" | |||
| #include "mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h" | |||
| namespace mindspore { | |||
| class TestToFormatOpenCL : public mindspore::CommonTest { | |||
| public: | |||
| TestToFormatOpenCL() {} | |||
| }; | |||
| TEST_F(TestToFormatOpenCL, TransposeFp32) { | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| ocl_runtime->Init(); | |||
| auto allocator = ocl_runtime->GetAllocator(); | |||
| int h = 64; | |||
| int w = 1; | |||
| int c = 7360; | |||
| size_t input_size; | |||
| std::string input_path = "./test_data/transpose/transpose_fp32_input.bin"; | |||
| auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); | |||
| lite::tensor::Tensor *tensor_x = | |||
| new lite::tensor::Tensor(TypeId(kNumberTypeFloat32), {1, h, w, c}, schema::Format_NHWC4); | |||
| lite::tensor::Tensor *tensor_out = new lite::tensor::Tensor(TypeId(kNumberTypeFloat32), {1, c, h, w}); | |||
| std::vector<lite::tensor::Tensor *> inputs{tensor_x}; | |||
| std::vector<lite::tensor::Tensor *> outputs{tensor_out}; | |||
| auto *arith_kernel = new kernel::TransposeOpenCLKernel(nullptr, inputs, outputs); | |||
| arith_kernel->Init(); | |||
| inputs[0]->MallocData(allocator); | |||
| std::vector<kernel::LiteKernel *> kernels{arith_kernel}; | |||
| auto *pGraph = new kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); | |||
| pGraph->Init(); | |||
| memcpy(inputs[0]->Data(), input_data, input_size); | |||
| pGraph->Run(); | |||
| size_t output_size; | |||
| std::string output_path = "./test_data/transpose/transpose_fp32_output.bin"; | |||
| auto correct_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(output_path.c_str(), &output_size)); | |||
| printf("==================output data=================\n"); | |||
| float *output_data = reinterpret_cast<float *>(tensor_out->Data()); | |||
| std::cout << std::endl; | |||
| int size_n = h * w * c; | |||
| size_n = size_n > 100 ? 100 : size_n; | |||
| for (int i = 0; i < size_n; i++) { | |||
| std::cout << output_data[i] << " "; | |||
| if ((i + 1) % c == 0) { | |||
| std::cout << std::endl; | |||
| } | |||
| } | |||
| std::cout << std::endl; | |||
| // compare | |||
| CompareOutputData(output_data, correct_data, h * w * c, 0.00001); | |||
| MS_LOG(INFO) << "TestMatMulFp32 passed"; | |||
| } | |||
| } // namespace mindspore | |||