Merge pull request !4524 from wandongdong/uptags/v0.7.0-beta
| @@ -192,7 +192,8 @@ union PrimitiveType { | |||||
| SpaceToBatchND, | SpaceToBatchND, | ||||
| TopKV2, | TopKV2, | ||||
| Return, | Return, | ||||
| MakeTuple | |||||
| MakeTuple, | |||||
| ToFormat, | |||||
| } | } | ||||
| enum QuantType: int { | enum QuantType: int { | ||||
| @@ -881,5 +881,10 @@ table TopKV2 { | |||||
| table MakeTuple { | table MakeTuple { | ||||
| } | } | ||||
| table ToFormat { | |||||
| srcT: int; | |||||
| dstT: int; | |||||
| } | |||||
| table Return { | table Return { | ||||
| } | } | ||||
| @@ -131,6 +131,10 @@ class LiteKernel { | |||||
| void AddOutKernel(LiteKernel *kernel) { this->out_kernels_.emplace_back(kernel); } | void AddOutKernel(LiteKernel *kernel) { this->out_kernels_.emplace_back(kernel); } | ||||
| void SetInKernel(const std::vector<LiteKernel *> &kernel) { this->in_kernels_ = kernel; } | |||||
| void SetOutKernel(const std::vector<LiteKernel *> &kernel) { this->out_kernels_ = kernel; } | |||||
| std::vector<LiteKernel *> &in_kernels() { return this->in_kernels_; } | std::vector<LiteKernel *> &in_kernels() { return this->in_kernels_; } | ||||
| std::vector<LiteKernel *> &out_kernels() { return this->out_kernels_; } | std::vector<LiteKernel *> &out_kernels() { return this->out_kernels_; } | ||||
| @@ -167,16 +171,14 @@ class SubGraphKernel : public LiteKernel { | |||||
| public: | public: | ||||
| explicit SubGraphKernel(const std::vector<lite::tensor::Tensor *> &inputs, | explicit SubGraphKernel(const std::vector<lite::tensor::Tensor *> &inputs, | ||||
| const std::vector<lite::tensor::Tensor *> &outputs, | const std::vector<lite::tensor::Tensor *> &outputs, | ||||
| const std::vector<kernel::LiteKernel *> &inKernels, | |||||
| const std::vector<kernel::LiteKernel *> &outKernels, | |||||
| const std::vector<kernel::LiteKernel *> &in_kernels, | |||||
| const std::vector<kernel::LiteKernel *> &out_kernels, | |||||
| const std::vector<kernel::LiteKernel *> &nodes, const lite::Context *ctx, | const std::vector<kernel::LiteKernel *> &nodes, const lite::Context *ctx, | ||||
| const lite::Primitive *primitive) | const lite::Primitive *primitive) | ||||
| : LiteKernel(nullptr, inputs, outputs, ctx, primitive), | |||||
| inputs_(inputs), | |||||
| outputs_(outputs), | |||||
| inkernels_(inKernels), | |||||
| outkernels_(outKernels), | |||||
| nodes_(nodes) {} | |||||
| : LiteKernel(nullptr, inputs, outputs, ctx, primitive), nodes_(nodes) { | |||||
| in_kernels_ = in_kernels; | |||||
| out_kernels_ = out_kernels; | |||||
| } | |||||
| virtual int Init() { return -1; } | virtual int Init() { return -1; } | ||||
| virtual int InferShape() { return -1; } | virtual int InferShape() { return -1; } | ||||
| @@ -184,10 +186,6 @@ class SubGraphKernel : public LiteKernel { | |||||
| virtual int Run() { return -1; } | virtual int Run() { return -1; } | ||||
| protected: | protected: | ||||
| std::vector<lite::tensor::Tensor *> inputs_; | |||||
| std::vector<lite::tensor::Tensor *> outputs_; | |||||
| std::vector<LiteKernel *> inkernels_; | |||||
| std::vector<LiteKernel *> outkernels_; | |||||
| std::vector<LiteKernel *> nodes_; | std::vector<LiteKernel *> nodes_; | ||||
| }; | }; | ||||
| @@ -49,9 +49,6 @@ __kernel void ElementDiv(__read_only image2d_t input_a, __read_only image2d_t in | |||||
| float4 a = read_imagef(input_a, smp_none, (int2)(X, Y)); | float4 a = read_imagef(input_a, smp_none, (int2)(X, Y)); | ||||
| float4 b = read_imagef(input_b, smp_none, (int2)(X, Y)); | float4 b = read_imagef(input_b, smp_none, (int2)(X, Y)); | ||||
| if (b == 0) { | |||||
| return; | |||||
| } | |||||
| write_imagef(output, (int2)(X, Y), a / b); | write_imagef(output, (int2)(X, Y), a / b); | ||||
| } | } | ||||
| @@ -0,0 +1,235 @@ | |||||
| #define FLT float | |||||
| #define FLT4 float4 | |||||
| #define READ_IMAGE read_imagef | |||||
| #define WRITE_IMAGE write_imagef | |||||
| // enum Format { | |||||
| // Format_NCHW = 0, | |||||
| // Format_NHWC = 1, | |||||
| // Format_NHWC4 = 2, | |||||
| // Format_HWKC = 3, | |||||
| // Format_HWCK = 4, | |||||
| // Format_KCHW = 5, | |||||
| // Format_CKHW = 6, | |||||
| // Format_KHWC = 7, | |||||
| // Format_CHWK = 8, | |||||
| // Format_NC4HW4 = 100, | |||||
| // Format_NUM_OF_FORMAT = 101, | |||||
| // Format_MIN = Format_NCHW, | |||||
| // Format_MAX = Format_NUM_OF_FORMAT | |||||
| //}; | |||||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | |||||
| __kernel void to_format_NCHW_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, | |||||
| int4 shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | |||||
| return; | |||||
| } | |||||
| // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); | |||||
| } | |||||
| __kernel void to_format_NHWC_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, | |||||
| int4 shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | |||||
| return; | |||||
| } | |||||
| int offset = (X * shape.z + Y) * shape.w + Z * 4; | |||||
| __global FLT *src_addr = (__global FLT *)src_data; | |||||
| src_addr += offset; | |||||
| FLT4 data = (FLT4)(0.f); | |||||
| if ((Z + 1) * 4 <= shape.w) { | |||||
| data = ((__global FLT4 *)src_addr)[0]; | |||||
| } else { | |||||
| if ((shape.w - Z * 4) >= 1) { | |||||
| data.x = src_addr[0]; | |||||
| } | |||||
| if ((shape.w - Z * 4) >= 2) { | |||||
| data.y = src_addr[1]; | |||||
| } | |||||
| if ((shape.w - Z * 4) >= 3) { | |||||
| data.z = src_addr[2]; | |||||
| } | |||||
| } | |||||
| WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data); | |||||
| } | |||||
| __kernel void to_format_NHWC4_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, | |||||
| int4 shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | |||||
| return; | |||||
| } | |||||
| // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); | |||||
| } | |||||
| __kernel void to_format_NC4HW4_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, | |||||
| int4 shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | |||||
| return; | |||||
| } | |||||
| // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); | |||||
| } | |||||
| __kernel void to_format_NCHW_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, | |||||
| int4 shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | |||||
| return; | |||||
| } | |||||
| // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); | |||||
| } | |||||
| __kernel void to_format_NHWC_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, | |||||
| int4 shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | |||||
| return; | |||||
| } | |||||
| // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); | |||||
| } | |||||
| __kernel void to_format_NHWC4_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, | |||||
| int4 shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | |||||
| return; | |||||
| } | |||||
| // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); | |||||
| } | |||||
| __kernel void to_format_NC4HW4_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, | |||||
| int4 shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | |||||
| return; | |||||
| } | |||||
| // FLT4 src_final = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))]; | |||||
| WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), src_data[(Y * size.z + Z) * size.x + X]); | |||||
| } | |||||
| __kernel void to_format_NCHW_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, | |||||
| int4 shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | |||||
| return; | |||||
| } | |||||
| dst_data[(Z * size.y + Y) * size.x + X] = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.x + X, Z)); | |||||
| } | |||||
| __kernel void to_format_NHWC_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, | |||||
| int4 shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | |||||
| return; | |||||
| } | |||||
| // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); | |||||
| } | |||||
| __kernel void to_format_NHWC4_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, | |||||
| int4 shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | |||||
| return; | |||||
| } | |||||
| // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); | |||||
| } | |||||
| __kernel void to_format_NC4HW4_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, | |||||
| int4 shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | |||||
| return; | |||||
| } | |||||
| // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); | |||||
| } | |||||
| __kernel void to_format_NCHW_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, | |||||
| int4 shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | |||||
| return; | |||||
| } | |||||
| // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); | |||||
| } | |||||
| __kernel void to_format_NHWC_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, | |||||
| int4 shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | |||||
| return; | |||||
| } | |||||
| // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); | |||||
| } | |||||
| __kernel void to_format_NHWC4_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, | |||||
| int4 shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | |||||
| return; | |||||
| } | |||||
| FLT4 data = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)); | |||||
| int offset = (X * shape.z + Y) * shape.w + Z * 4; | |||||
| __global FLT *dst_addr = (__global FLT *)dst_data; | |||||
| dst_addr += offset; | |||||
| if ((Z + 1) * 4 <= shape.w) { | |||||
| ((__global FLT4 *)dst_addr)[0] = data; | |||||
| } else { | |||||
| if (shape.w - Z * 4 >= 1) { | |||||
| dst_addr[0] = data.x; | |||||
| } | |||||
| if (shape.w - Z * 4 >= 2) { | |||||
| dst_addr[1] = data.y; | |||||
| } | |||||
| if (shape.w - Z * 4 >= 3) { | |||||
| dst_addr[2] = data.z; | |||||
| } | |||||
| } | |||||
| } | |||||
| __kernel void to_format_NC4HW4_to_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, | |||||
| int4 shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | |||||
| return; | |||||
| } | |||||
| // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); | |||||
| } | |||||
| __kernel void to_format_NC4HW4_to_NC4HW4_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, | |||||
| int4 shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | |||||
| return; | |||||
| } | |||||
| dst_data[(Y * size.z + Z) * size.x + X] = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)); | |||||
| } | |||||
| __kernel void to_format_NHWC4_to_NHWC4_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, | |||||
| int4 shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | |||||
| return; | |||||
| } | |||||
| dst_data[(Y * size.z + Z) * size.x + X] = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)); | |||||
| } | |||||
| @@ -329,7 +329,8 @@ kernel::LiteKernel *OpenCLConvolutionKernelCreator(const std::vector<lite::tenso | |||||
| const std::vector<lite::tensor::Tensor *> &outputs, | const std::vector<lite::tensor::Tensor *> &outputs, | ||||
| OpParameter *opParameter, const lite::Context *ctx, | OpParameter *opParameter, const lite::Context *ctx, | ||||
| const kernel::KernelKey &desc, const lite::Primitive *primitive) { | const kernel::KernelKey &desc, const lite::Primitive *primitive) { | ||||
| auto *kernel = new ConvolutionOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||||
| auto *kernel = | |||||
| new (std::nothrow) ConvolutionOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||||
| if (kernel == nullptr) { | if (kernel == nullptr) { | ||||
| MS_LOG(ERROR) << "Create OpenCL Convolution kernel failed!"; | MS_LOG(ERROR) << "Create OpenCL Convolution kernel failed!"; | ||||
| return nullptr; | return nullptr; | ||||
| @@ -145,7 +145,7 @@ kernel::LiteKernel *OpenCLPooling2dKernelCreator(const std::vector<lite::tensor: | |||||
| const std::vector<lite::tensor::Tensor *> &outputs, | const std::vector<lite::tensor::Tensor *> &outputs, | ||||
| OpParameter *opParameter, const lite::Context *ctx, | OpParameter *opParameter, const lite::Context *ctx, | ||||
| const kernel::KernelKey &desc, const lite::Primitive *primitive) { | const kernel::KernelKey &desc, const lite::Primitive *primitive) { | ||||
| auto *kernel = new (std::nothrow)PoolingOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||||
| auto *kernel = new (std::nothrow) PoolingOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||||
| if (kernel == nullptr) { | if (kernel == nullptr) { | ||||
| MS_LOG(ERROR) << "Create OpenCL Pooling kernel failed!"; | MS_LOG(ERROR) << "Create OpenCL Pooling kernel failed!"; | ||||
| return nullptr; | return nullptr; | ||||
| @@ -158,7 +158,7 @@ kernel::LiteKernel *OpenCLSoftMaxKernelCreator(const std::vector<lite::tensor::T | |||||
| const std::vector<lite::tensor::Tensor *> &outputs, | const std::vector<lite::tensor::Tensor *> &outputs, | ||||
| OpParameter *opParameter, const lite::Context *ctx, | OpParameter *opParameter, const lite::Context *ctx, | ||||
| const kernel::KernelKey &desc, const lite::Primitive *primitive) { | const kernel::KernelKey &desc, const lite::Primitive *primitive) { | ||||
| auto *kernel = new (std::nothrow)SoftmaxOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||||
| auto *kernel = new (std::nothrow) SoftmaxOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||||
| if (kernel == nullptr) { | if (kernel == nullptr) { | ||||
| MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; | MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; | ||||
| return nullptr; | return nullptr; | ||||
| @@ -0,0 +1,167 @@ | |||||
| /** | |||||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "src/runtime/kernel/opencl/kernel/to_format.h" | |||||
| #include <set> | |||||
| #include <map> | |||||
| #include <string> | |||||
| #include <utility> | |||||
| #include "include/errorcode.h" | |||||
| #include "src/kernel_registry.h" | |||||
| #include "src/runtime/opencl/opencl_runtime.h" | |||||
| #include "src/runtime/kernel/opencl/cl/fp32/to_format.cl.inc" | |||||
| using mindspore::kernel::KERNEL_ARCH::kGPU; | |||||
| using mindspore::lite::KernelRegistrar; | |||||
| using mindspore::lite::RET_ERROR; | |||||
| using mindspore::lite::RET_OK; | |||||
| using mindspore::schema::PrimitiveType_ToFormat; | |||||
| namespace mindspore::kernel { | |||||
| int ToFormatOpenCLKernel::Init() { | |||||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||||
| auto parameter = reinterpret_cast<OpenCLToFormatParameter *>(op_parameter_); | |||||
| out_mem_type_ = parameter->out_mem_type; | |||||
| std::string program_name = "to_format"; | |||||
| std::map<schema::Format, std::string> format_str{{schema::Format_NCHW, "NCHW"}, | |||||
| {schema::Format_NHWC, "NHWC"}, | |||||
| {schema::Format_NC4HW4, "NC4HW4"}, | |||||
| {schema::Format_NHWC4, "NHWC4"}}; | |||||
| std::string kernel_name = | |||||
| "to_format_" + format_str[in_tensors_[0]->GetFormat()] + "_to_" + format_str[out_tensors_[0]->GetFormat()]; | |||||
| if (out_mem_type_ == OpenCLMemType::IMG) { | |||||
| kernel_name += "_IMG"; | |||||
| } else { | |||||
| kernel_name += "_BUF"; | |||||
| } | |||||
| #ifdef PROGRAM_WITH_IL | |||||
| ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name); | |||||
| #else | |||||
| std::set<std::string> build_options; | |||||
| #ifdef ENABLE_FP16 | |||||
| std::string source = to_format_source_fp16; | |||||
| #else | |||||
| std::string source = to_format_source_fp32; | |||||
| #endif | |||||
| ocl_runtime->LoadSource(program_name, source); | |||||
| ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||||
| #endif | |||||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||||
| return RET_OK; | |||||
| } | |||||
| int ToFormatOpenCLKernel::ReSize() { return RET_OK; } | |||||
| int ToFormatOpenCLKernel::GetGlobalSize(size_t idx, std::vector<size_t> *global_size) { | |||||
| std::vector<int> shapex = out_tensors_[0]->shape(); | |||||
| if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4 || out_tensors_[0]->GetFormat() == schema::Format_NHWC) { | |||||
| int h = shapex[1]; | |||||
| int w = shapex[2]; | |||||
| int c = shapex[3]; | |||||
| int c4 = UP_DIV(c, C4NUM); | |||||
| std::vector<size_t> vec = {(size_t)h, (size_t)w, (size_t)c4}; | |||||
| *global_size = std::move(vec); | |||||
| } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4 || | |||||
| out_tensors_[0]->GetFormat() == schema::Format_NCHW) { | |||||
| int h = shapex[2]; | |||||
| int w = shapex[3]; | |||||
| int c = shapex[1]; | |||||
| int c4 = UP_DIV(c, C4NUM); | |||||
| std::vector<size_t> vec = {(size_t)c4, (size_t)h, (size_t)w}; | |||||
| *global_size = std::move(vec); | |||||
| } else if (out_tensors_[0]->GetFormat() == out_tensors_[0]->GetFormat() == schema::Format_NCHW) { | |||||
| int h = shapex[2]; | |||||
| int w = shapex[3]; | |||||
| int c = shapex[1]; | |||||
| int w4 = UP_DIV(w, C4NUM); | |||||
| std::vector<size_t> vec = {(size_t)w4, (size_t)h, (size_t)c}; | |||||
| *global_size = std::move(vec); | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int ToFormatOpenCLKernel::GetLocalSize(size_t idx, const std::vector<size_t> &global_size, | |||||
| std::vector<size_t> *local_size) { | |||||
| return RET_OK; | |||||
| } | |||||
| int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) { | |||||
| size_t im_dst_x, im_dst_y; | |||||
| std::vector<int> shapex = out_tensors_[0]->shape(); | |||||
| if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) { | |||||
| int h = shapex[1]; | |||||
| int w = shapex[2]; | |||||
| int c = shapex[3]; | |||||
| im_dst_y = UP_DIV(h * c, C4NUM); | |||||
| im_dst_x = w; | |||||
| } else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) { | |||||
| int h = shapex[2]; | |||||
| int w = shapex[3]; | |||||
| int c = shapex[1]; | |||||
| im_dst_x = UP_DIV(w * c, C4NUM); | |||||
| im_dst_y = h; | |||||
| } else { | |||||
| MS_LOG(ERROR) << "Unsupported format. " << out_tensors_[0]->GetFormat(); | |||||
| } | |||||
| img_size->clear(); | |||||
| #ifdef ENABLE_FP16 | |||||
| size_t img_dtype = CL_HALF_FLOAT; | |||||
| #else | |||||
| size_t img_dtype = CL_FLOAT; | |||||
| #endif | |||||
| std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype}; | |||||
| *img_size = vec; | |||||
| return RET_OK; | |||||
| } | |||||
| int ToFormatOpenCLKernel::Run() { | |||||
| MS_LOG(DEBUG) << "ToFormat" << " Running!"; | |||||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||||
| std::vector<size_t> local = {}; | |||||
| std::vector<size_t> global; | |||||
| GetGlobalSize(0, &global); | |||||
| auto shapex = in_tensors_[0]->shape(); | |||||
| cl_int4 shape{shapex.size() > 0 ? shapex[0] : 1, shapex.size() > 1 ? shapex[1] : 1, shapex.size() > 2 ? shapex[2] : 1, | |||||
| shapex.size() > 3 ? shapex[3] : 1}; | |||||
| cl_int4 gsize{(cl_int)global[0], (cl_int)global[1], (cl_int)global[2], 1}; | |||||
| ocl_runtime->SetKernelArg(kernel_, 0, in_tensors_[0]->Data()); | |||||
| ocl_runtime->SetKernelArg(kernel_, 1, out_tensors_[0]->Data()); | |||||
| ocl_runtime->SetKernelArg(kernel_, 2, gsize); | |||||
| ocl_runtime->SetKernelArg(kernel_, 3, shape); | |||||
| ocl_runtime->RunKernel(kernel_, global, local, nullptr); | |||||
| return RET_OK; | |||||
| } | |||||
| kernel::LiteKernel *OpenCLToFormatKernelCreator(const std::vector<lite::tensor::Tensor *> &inputs, | |||||
| const std::vector<lite::tensor::Tensor *> &outputs, | |||||
| OpParameter *opParameter, const lite::Context *ctx, | |||||
| const kernel::KernelKey &desc, const lite::Primitive *primitive) { | |||||
| auto *kernel = new (std::nothrow) ToFormatOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||||
| if (kernel == nullptr) { | |||||
| MS_LOG(ERROR) << "kernel " << opParameter->name_ << " create failed."; | |||||
| return nullptr; | |||||
| } | |||||
| auto ret = kernel->Init(); | |||||
| if (ret != RET_OK) { | |||||
| delete kernel; | |||||
| return nullptr; | |||||
| } | |||||
| return kernel; | |||||
| } | |||||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_ToFormat, OpenCLToFormatKernelCreator) | |||||
| } // namespace mindspore::kernel | |||||
| @@ -0,0 +1,46 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_LITE_SRC_BACKEND_OPENCL_TO_FORMAT_H_ | |||||
| #define MINDSPORE_LITE_SRC_BACKEND_OPENCL_TO_FORMAT_H_ | |||||
| #include <vector> | |||||
| #include "src/lite_kernel.h" | |||||
| #include "src/runtime/opencl/opencl_runtime.h" | |||||
| #include "src/runtime/kernel/opencl/opencl_kernel.h" | |||||
| namespace mindspore::kernel { | |||||
| class ToFormatOpenCLKernel : public OpenCLKernel { | |||||
| public: | |||||
| explicit ToFormatOpenCLKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, | |||||
| const std::vector<lite::tensor::Tensor *> &outputs) | |||||
| : OpenCLKernel(parameter, inputs, outputs) {} | |||||
| ~ToFormatOpenCLKernel() override{}; | |||||
| int Init() override; | |||||
| int ReSize() override; | |||||
| int Run() override; | |||||
| int GetImageSize(size_t idx, std::vector<size_t> *img_size) override; | |||||
| int GetGlobalSize(size_t idx, std::vector<size_t> *global_size) override; | |||||
| int GetLocalSize(size_t idx, const std::vector<size_t> &global_size, std::vector<size_t> *local_size) override; | |||||
| private: | |||||
| cl::Kernel kernel_; | |||||
| }; | |||||
| } // namespace mindspore::kernel | |||||
| #endif // MINDSPORE_LITE_SRC_BACKEND_OPENCL_RESHAPE_H_ | |||||
| @@ -109,7 +109,8 @@ kernel::LiteKernel *OpenCLTransposeKernelCreator(const std::vector<lite::tensor: | |||||
| const std::vector<lite::tensor::Tensor *> &outputs, | const std::vector<lite::tensor::Tensor *> &outputs, | ||||
| OpParameter *opParameter, const lite::Context *ctx, | OpParameter *opParameter, const lite::Context *ctx, | ||||
| const kernel::KernelKey &desc, const lite::Primitive *primitive) { | const kernel::KernelKey &desc, const lite::Primitive *primitive) { | ||||
| auto *kernel = new (std::nothrow)TransposeOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||||
| auto *kernel = | |||||
| new (std::nothrow) TransposeOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||||
| if (kernel == nullptr) { | if (kernel == nullptr) { | ||||
| MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; | MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; | ||||
| return nullptr; | return nullptr; | ||||
| @@ -21,21 +21,37 @@ | |||||
| #include "src/lite_kernel.h" | #include "src/lite_kernel.h" | ||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| enum class OpenCLMemType { BUF, IMG }; | |||||
| struct OpenCLToFormatParameter { | |||||
| OpParameter op_parameter; | |||||
| schema::Format src_format{schema::Format_NHWC}; | |||||
| schema::Format dst_format{schema::Format_NHWC4}; | |||||
| OpenCLMemType out_mem_type{OpenCLMemType::IMG}; | |||||
| }; | |||||
| class OpenCLKernel : public LiteKernel { | class OpenCLKernel : public LiteKernel { | ||||
| public: | public: | ||||
| explicit OpenCLKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, | explicit OpenCLKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, | ||||
| const std::vector<lite::tensor::Tensor *> &outputs) | const std::vector<lite::tensor::Tensor *> &outputs) | ||||
| : LiteKernel(parameter, inputs, outputs, nullptr, nullptr) {} | |||||
| : LiteKernel(parameter, inputs, outputs, nullptr, nullptr) {} | |||||
| virtual int Init() { return -1; } | virtual int Init() { return -1; } | ||||
| virtual int Prepare() { return -1; } | virtual int Prepare() { return -1; } | ||||
| virtual int InferShape() { return -1; } | virtual int InferShape() { return -1; } | ||||
| virtual int ReSize() { return -1; } | virtual int ReSize() { return -1; } | ||||
| virtual int Run() { return -1; } | virtual int Run() { return -1; } | ||||
| virtual int GetImageSize(size_t idx, std::vector<size_t>* img_size) { return -1; } | |||||
| virtual int GetGlobalSize(size_t idx, std::vector<size_t>* global_size) { return -1; } | |||||
| virtual int GetLocalSize(size_t idx, const std::vector<size_t>& global_size, | |||||
| std::vector<size_t>* local_size) { return -1; } | |||||
| virtual int GetImageSize(size_t idx, std::vector<size_t> *img_size) { return -1; } | |||||
| virtual int GetGlobalSize(size_t idx, std::vector<size_t> *global_size) { return -1; } | |||||
| virtual int GetLocalSize(size_t idx, const std::vector<size_t> &global_size, std::vector<size_t> *local_size) { | |||||
| return -1; | |||||
| } | |||||
| OpenCLMemType GetMemType() { return out_mem_type_; } | |||||
| void SetMemType(OpenCLMemType mem_type) { out_mem_type_ = mem_type; } | |||||
| protected: | |||||
| OpenCLMemType out_mem_type_{OpenCLMemType::IMG}; | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -17,22 +17,117 @@ | |||||
| #include "src/runtime/kernel/opencl/subgraph_opencl_kernel.h" | #include "src/runtime/kernel/opencl/subgraph_opencl_kernel.h" | ||||
| #include "src/runtime/opencl/opencl_executor.h" | #include "src/runtime/opencl/opencl_executor.h" | ||||
| #include "src/runtime/opencl/opencl_runtime.h" | #include "src/runtime/opencl/opencl_runtime.h" | ||||
| #include "src/runtime/kernel/opencl/utils.h" | |||||
| #include "include/errorcode.h" | |||||
| #include "src/common/utils.h" | |||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| SubGraphOpenCLKernel::~SubGraphOpenCLKernel() { UnInit(); } | SubGraphOpenCLKernel::~SubGraphOpenCLKernel() { UnInit(); } | ||||
| int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::tensor::Tensor *> &in_tensors, | |||||
| const std::vector<kernel::LiteKernel *> in_kernels, | |||||
| std::vector<lite::tensor::Tensor *> *out_tensors, | |||||
| std::vector<OpenCLToFormatParameter *> *out_parameters, | |||||
| std::vector<LiteKernel *> *out_convert_ops, OpenCLMemType mem_type) { | |||||
| out_tensors->clear(); | |||||
| out_parameters->clear(); | |||||
| out_convert_ops->clear(); | |||||
| for (size_t i = 0; i < in_tensors.size(); ++i) { | |||||
| lite::tensor::Tensor *new_tensor = new (std::nothrow) lite::tensor::Tensor(); | |||||
| MS_ASSERT(new_tensor); | |||||
| if (new_tensor == nullptr) { | |||||
| MS_LOG(ERROR) << "SubGraphOpenCLKernel new tensor failed!"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| new_tensor->CopyTensor(*in_tensors[i]); | |||||
| auto dst_format = | |||||
| (mem_type == OpenCLMemType::IMG) ? in_kernels.back()->out_tensors()[0]->GetFormat() : in_tensors[i]->GetFormat(); | |||||
| auto src_format = | |||||
| (mem_type == OpenCLMemType::IMG) ? in_tensors[i]->GetFormat() : in_kernels.front()->out_tensors()[0]->GetFormat(); | |||||
| if ((dst_format == schema::Format_NCHW || dst_format == schema::Format_NC4HW4) && | |||||
| (src_format == schema::Format_NHWC || src_format == schema::Format_NHWC4)) { | |||||
| auto &shape = new_tensor->shape(); | |||||
| std::vector<int> dst_shape{shape[0], shape[3], shape[1], shape[2]}; | |||||
| new_tensor->set_shape(shape); | |||||
| } | |||||
| if ((dst_format == schema::Format_NHWC || dst_format == schema::Format_NHWC4) && | |||||
| (src_format == schema::Format_NCHW || src_format == schema::Format_NC4HW4)) { | |||||
| auto &shape = new_tensor->shape(); | |||||
| std::vector<int> dst_shape{shape[0], shape[2], shape[3], shape[1]}; | |||||
| new_tensor->set_shape(shape); | |||||
| } | |||||
| new_tensor->SetFormat(dst_format); | |||||
| out_tensors->emplace_back(new_tensor); | |||||
| #ifdef ENABLE_FP16 | |||||
| KernelKey desc{kGPU, kNumberTypeFloat16, schema::PrimitiveType_ToFormat}; | |||||
| #else | |||||
| KernelKey desc{kGPU, kNumberTypeFloat32, schema::PrimitiveType_ToFormat}; | |||||
| #endif | |||||
| OpenCLToFormatParameter *parameter = new (std::nothrow) OpenCLToFormatParameter; | |||||
| MS_ASSERT(parameter); | |||||
| if (parameter == nullptr) { | |||||
| MS_LOG(ERROR) << "SubGraphOpenCLKernel new parameter failed!"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| parameter->src_format = src_format; | |||||
| parameter->dst_format = dst_format; | |||||
| parameter->out_mem_type = mem_type; | |||||
| out_parameters->emplace_back(parameter); | |||||
| LiteKernel *in_convert_op; | |||||
| if (mem_type == OpenCLMemType::IMG) { | |||||
| in_convert_op = | |||||
| lite::GetOpenCLKernel({in_tensors[i]}, {new_tensor}, reinterpret_cast<OpParameter *>(parameter), nullptr, desc); | |||||
| } else { | |||||
| in_convert_op = | |||||
| lite::GetOpenCLKernel({new_tensor}, {in_tensors[i]}, reinterpret_cast<OpParameter *>(parameter), nullptr, desc); | |||||
| } | |||||
| MS_ASSERT(in_convert_op); | |||||
| if (in_convert_op == nullptr) { | |||||
| MS_LOG(ERROR) << "SubGraphOpenCLKernel create op failed!"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| auto in_opencl_op = reinterpret_cast<OpenCLKernel *>(in_convert_op); | |||||
| if (mem_type == OpenCLMemType::IMG) { | |||||
| in_opencl_op->AddOutKernel(in_kernels[i]); | |||||
| reinterpret_cast<OpenCLKernel *>(in_kernels[i])->SetInKernel({in_convert_op}); | |||||
| reinterpret_cast<OpenCLKernel *>(in_kernels[i])->set_in_tensors({new_tensor}); | |||||
| } else { | |||||
| reinterpret_cast<OpenCLKernel *>(in_kernels[i])->SetOutKernel({in_convert_op}); | |||||
| reinterpret_cast<OpenCLKernel *>(in_kernels[i])->set_out_tensors({new_tensor}); | |||||
| in_convert_op->AddInKernel(in_kernels[i]); | |||||
| } | |||||
| out_convert_ops->emplace_back(in_convert_op); | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int SubGraphOpenCLKernel::Init() { | int SubGraphOpenCLKernel::Init() { | ||||
| allocator_ = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator(); | allocator_ = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator(); | ||||
| MS_LOG(DEBUG) << "input num=" << inputs_.size() << ", output num=" << outputs_.size(); | |||||
| for (const auto tensor : inputs_) { | |||||
| MS_LOG(DEBUG) << "input num=" << in_tensors_.size() << ", output num=" << out_tensors_.size(); | |||||
| for (const auto tensor : in_tensors_) { | |||||
| tensor->set_allocator(allocator_); | tensor->set_allocator(allocator_); | ||||
| } | } | ||||
| for (const auto tensor : outputs_) { | |||||
| for (const auto tensor : out_tensors_) { | |||||
| tensor->set_allocator(allocator_); | tensor->set_allocator(allocator_); | ||||
| } | } | ||||
| int ret = GenToFormatOp(in_tensors_, in_kernels_, &in_convert_tensors_, &in_parameters_, &in_convert_ops_, | |||||
| OpenCLMemType::IMG); | |||||
| if (ret != RET_OK) { | |||||
| return RET_ERROR; | |||||
| } | |||||
| nodes_.insert(nodes_.begin(), in_convert_ops_.begin(), in_convert_ops_.end()); | |||||
| ret = GenToFormatOp(out_tensors_, out_kernels_, &out_convert_tensors_, &out_parameters_, &out_convert_ops_, | |||||
| OpenCLMemType::BUF); | |||||
| if (ret != RET_OK) { | |||||
| return RET_ERROR; | |||||
| } | |||||
| nodes_.insert(nodes_.end(), out_convert_ops_.begin(), out_convert_ops_.end()); | |||||
| MallocTensorWithReuse(); | |||||
| // Map buffer for write, it is not necessary for fine-grained | // Map buffer for write, it is not necessary for fine-grained | ||||
| for (auto &tensor : inputs_) { | |||||
| for (auto &tensor : in_tensors_) { | |||||
| void *data = tensor->Data(); | void *data = tensor->Data(); | ||||
| // It is required with coarse-grained SVM | // It is required with coarse-grained SVM | ||||
| if (data != nullptr) { | if (data != nullptr) { | ||||
| @@ -42,44 +137,118 @@ int SubGraphOpenCLKernel::Init() { | |||||
| MS_LOG(ERROR) << "SubGraphOpenCLKernel input nullptr!"; | MS_LOG(ERROR) << "SubGraphOpenCLKernel input nullptr!"; | ||||
| } | } | ||||
| } | } | ||||
| return 0; | |||||
| return RET_OK; | |||||
| } | |||||
| int SubGraphOpenCLKernel::MallocTensorWithReuse() { | |||||
| kernel::LiteKernelUtil::InitTensorRefCount(nodes_); | |||||
| for (auto *kernel : nodes_) { | |||||
| MS_ASSERT(nullptr != kernel); | |||||
| kernel::OpenCLKernel *op_kernel = reinterpret_cast<kernel::OpenCLKernel *>(kernel); | |||||
| auto &outputs = kernel->out_tensors(); | |||||
| for (auto i = 0; i < outputs.size(); ++i) { | |||||
| auto *output = outputs.at(i); | |||||
| MS_ASSERT(nullptr != output); | |||||
| if (op_kernel->GetMemType() == OpenCLMemType::IMG) { | |||||
| std::vector<size_t> img_size; | |||||
| op_kernel->GetImageSize(i, &img_size); | |||||
| auto data_ptr = allocator_->Malloc(output->Size(), img_size); | |||||
| output->SetData(data_ptr); | |||||
| } else { | |||||
| output->MallocData(allocator_); | |||||
| } | |||||
| output->set_allocator(allocator_); | |||||
| } | |||||
| for (auto input_kernel : kernel->in_kernels()) { | |||||
| MS_EXCEPTION_IF_NULL(input_kernel); | |||||
| auto ret = input_kernel->DecOutTensorRefCount(); | |||||
| if (0 != ret) { | |||||
| MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed"; | |||||
| } | |||||
| } | |||||
| } | |||||
| for (auto kernel : out_kernels_) { | |||||
| MS_EXCEPTION_IF_NULL(kernel); | |||||
| auto ret = kernel->DecOutTensorRefCount(); | |||||
| if (0 != ret) { | |||||
| MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed"; | |||||
| } | |||||
| } | |||||
| for (auto kernel : in_convert_ops_) { | |||||
| MS_EXCEPTION_IF_NULL(kernel); | |||||
| auto ret = kernel->DecOutTensorRefCount(); | |||||
| if (0 != ret) { | |||||
| MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed"; | |||||
| } | |||||
| } | |||||
| for (auto kernel : out_convert_ops_) { | |||||
| MS_EXCEPTION_IF_NULL(kernel); | |||||
| auto ret = kernel->DecOutTensorRefCount(); | |||||
| if (0 != ret) { | |||||
| MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed"; | |||||
| } | |||||
| } | |||||
| return RET_OK; | |||||
| } | } | ||||
| int SubGraphOpenCLKernel::UnInit() { | int SubGraphOpenCLKernel::UnInit() { | ||||
| for (auto &tensor : outputs_) { | |||||
| for (auto &tensor : out_tensors_) { | |||||
| allocator_->UnmapBuffer(tensor->Data()); | |||||
| } | |||||
| for (const auto tensor : in_tensors_) { | |||||
| if (tensor != nullptr) { | |||||
| tensor->FreeData(); | |||||
| } | |||||
| } | |||||
| for (const auto tensor : out_tensors_) { | |||||
| if (tensor != nullptr) { | |||||
| tensor->FreeData(); | |||||
| } | |||||
| } | |||||
| for (auto &tensor : out_tensors_) { | |||||
| allocator_->UnmapBuffer(tensor->Data()); | allocator_->UnmapBuffer(tensor->Data()); | ||||
| } | } | ||||
| for (const auto tensor : inputs_) { | |||||
| for (const auto tensor : in_convert_tensors_) { | |||||
| if (tensor != nullptr) { | if (tensor != nullptr) { | ||||
| tensor->FreeData(); | tensor->FreeData(); | ||||
| delete tensor; | |||||
| } | } | ||||
| } | } | ||||
| for (const auto tensor : outputs_) { | |||||
| for (const auto tensor : out_convert_tensors_) { | |||||
| if (tensor != nullptr) { | if (tensor != nullptr) { | ||||
| tensor->FreeData(); | tensor->FreeData(); | ||||
| delete tensor; | |||||
| } | } | ||||
| } | } | ||||
| return 0; | |||||
| for (const auto parameter : in_parameters_) { | |||||
| if (parameter != nullptr) { | |||||
| delete parameter; | |||||
| } | |||||
| } | |||||
| for (const auto op : in_convert_ops_) { | |||||
| if (op != nullptr) { | |||||
| delete op; | |||||
| } | |||||
| } | |||||
| return RET_OK; | |||||
| } | } | ||||
| int SubGraphOpenCLKernel::InferShape() { return 0; } | |||||
| int SubGraphOpenCLKernel::InferShape() { return RET_OK; } | |||||
| int SubGraphOpenCLKernel::ReSize() { return 0; } | |||||
| int SubGraphOpenCLKernel::ReSize() { return RET_OK; } | |||||
| int SubGraphOpenCLKernel::Run() { | int SubGraphOpenCLKernel::Run() { | ||||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||||
| for (auto &tensor : inputs_) { | |||||
| for (auto &tensor : in_tensors_) { | |||||
| allocator_->UnmapBuffer(tensor->Data()); | allocator_->UnmapBuffer(tensor->Data()); | ||||
| } | } | ||||
| lite::opencl::OpenCLExecutor executor; | lite::opencl::OpenCLExecutor executor; | ||||
| executor.Run(inputs_, outputs_, nodes_, allocator_); | |||||
| ocl_runtime->SyncCommandQueue(); | |||||
| for (auto &tensor : outputs_) { | |||||
| executor.Run(in_tensors_, out_tensors_, nodes_, allocator_); | |||||
| for (auto &tensor : out_tensors_) { | |||||
| void *data = allocator_->MapBuffer(tensor->Data(), CL_MAP_READ, nullptr, true); | void *data = allocator_->MapBuffer(tensor->Data(), CL_MAP_READ, nullptr, true); | ||||
| tensor->SetData(data); | tensor->SetData(data); | ||||
| } | } | ||||
| return 0; | |||||
| return RET_OK; | |||||
| } | } | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -36,7 +36,7 @@ class SubGraphOpenCLKernel : public SubGraphKernel { | |||||
| const std::vector<kernel::LiteKernel *> inKernels, | const std::vector<kernel::LiteKernel *> inKernels, | ||||
| const std::vector<kernel::LiteKernel *> outKernels, | const std::vector<kernel::LiteKernel *> outKernels, | ||||
| const std::vector<kernel::LiteKernel *> nodes) | const std::vector<kernel::LiteKernel *> nodes) | ||||
| : SubGraphKernel(inputs, outputs, inKernels, outKernels, nodes, nullptr, nullptr) {} | |||||
| : SubGraphKernel(inputs, outputs, inKernels, outKernels, nodes, nullptr, nullptr) {} | |||||
| ~SubGraphOpenCLKernel() override; | ~SubGraphOpenCLKernel() override; | ||||
| int Init() override; | int Init() override; | ||||
| @@ -45,11 +45,24 @@ class SubGraphOpenCLKernel : public SubGraphKernel { | |||||
| int Run() override; | int Run() override; | ||||
| int UnInit(); | int UnInit(); | ||||
| protected: | |||||
| int MallocTensorWithReuse(); | |||||
| int GenToFormatOp(const std::vector<lite::tensor::Tensor *> &in_tensors, | |||||
| const std::vector<kernel::LiteKernel *> in_kernels, | |||||
| std::vector<lite::tensor::Tensor *> *out_tensors, | |||||
| std::vector<OpenCLToFormatParameter *> *out_parameters, std::vector<LiteKernel *> *out_convert_ops, | |||||
| OpenCLMemType mem_type); | |||||
| private: | private: | ||||
| SubGraphOpenCLParameter *subgraph_ocl_parameter_; | SubGraphOpenCLParameter *subgraph_ocl_parameter_; | ||||
| lite::opencl::OpenCLAllocator *allocator_; | lite::opencl::OpenCLAllocator *allocator_; | ||||
| std::vector<lite::tensor::Tensor *> in_convert_tensors_; | |||||
| std::vector<lite::tensor::Tensor *> out_convert_tensors_; | |||||
| std::vector<OpenCLToFormatParameter *> in_parameters_; | |||||
| std::vector<OpenCLToFormatParameter *> out_parameters_; | |||||
| std::vector<LiteKernel *> in_convert_ops_; | |||||
| std::vector<LiteKernel *> out_convert_ops_; | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| #endif // MINDSPORE_LITE_SRC_BACKEND_OPENCL_SUBGRAPH_OPENCL_KERNEL_H_ | #endif // MINDSPORE_LITE_SRC_BACKEND_OPENCL_SUBGRAPH_OPENCL_KERNEL_H_ | ||||
| @@ -18,6 +18,22 @@ | |||||
| #include <algorithm> | #include <algorithm> | ||||
| #include <string> | #include <string> | ||||
| #include <vector> | #include <vector> | ||||
| #include "src/kernel_registry.h" | |||||
| using mindspore::lite::KernelRegistrar; | |||||
| namespace mindspore::lite { | |||||
| kernel::LiteKernel *GetOpenCLKernel(const std::vector<tensor::Tensor *> &in_tensors, | |||||
| const std::vector<tensor::Tensor *> &out_tensors, OpParameter *parameter, | |||||
| const Context *ctx, const kernel::KernelKey &key) { | |||||
| auto creator = KernelRegistry::GetInstance()->GetCreator(key); | |||||
| if (creator != nullptr) { | |||||
| auto kernel = creator(in_tensors, out_tensors, parameter, nullptr, key, nullptr); | |||||
| return kernel; | |||||
| } | |||||
| return nullptr; | |||||
| } | |||||
| } // namespace mindspore::lite | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace kernel { | namespace kernel { | ||||
| @@ -22,6 +22,13 @@ | |||||
| #include "CL/cl2.hpp" | #include "CL/cl2.hpp" | ||||
| #include "utils/log_adapter.h" | #include "utils/log_adapter.h" | ||||
| #include "src/runtime/kernel/arm/nnacl/op_base.h" | #include "src/runtime/kernel/arm/nnacl/op_base.h" | ||||
| #include "src/lite_kernel.h" | |||||
| namespace mindspore::lite { | |||||
| kernel::LiteKernel *GetOpenCLKernel(const std::vector<tensor::Tensor *> &in_tensors, | |||||
| const std::vector<tensor::Tensor *> &out_tensors, OpParameter *parameter, | |||||
| const Context *ctx, const kernel::KernelKey &key); | |||||
| } | |||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| @@ -16,10 +16,10 @@ | |||||
| #include "src/runtime/opencl/opencl_allocator.h" | #include "src/runtime/opencl/opencl_allocator.h" | ||||
| #include <utility> | #include <utility> | ||||
| #include "utils/log_adapter.h" | |||||
| #include "src/runtime/opencl/opencl_runtime.h" | #include "src/runtime/opencl/opencl_runtime.h" | ||||
| #include "include/errorcode.h" | |||||
| #include "src/runtime/kernel/opencl/utils.h" | #include "src/runtime/kernel/opencl/utils.h" | ||||
| #include "utils/log_adapter.h" | |||||
| #include "include/errorcode.h" | |||||
| namespace mindspore::lite::opencl { | namespace mindspore::lite::opencl { | ||||
| @@ -61,7 +61,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size) | |||||
| } | } | ||||
| Lock(); | Lock(); | ||||
| auto iter = free_list_.lower_bound(size); | auto iter = free_list_.lower_bound(size); | ||||
| if (iter != free_list_.end() && (iter->second->size_ >= size) && (iter->second->size_ < (size << shift_factor_))) { | |||||
| while (iter != free_list_.end() && (iter->second->size_ >= size) && (iter->second->size_ < (size << shift_factor_))) { | |||||
| auto mem_buf = iter->second; | auto mem_buf = iter->second; | ||||
| bool is_match{mem_buf->img_size.size() == img_size.size()}; | bool is_match{mem_buf->img_size.size() == img_size.size()}; | ||||
| for (int i = 0; i < img_size.size() && is_match; ++i) { | for (int i = 0; i < img_size.size() && is_match; ++i) { | ||||
| @@ -75,6 +75,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size) | |||||
| << ", host addr: " << mem_buf->host_ptr_ << ", device addr: " << mem_buf->device_ptr_; | << ", host addr: " << mem_buf->host_ptr_ << ", device addr: " << mem_buf->device_ptr_; | ||||
| return mem_buf->host_ptr_; | return mem_buf->host_ptr_; | ||||
| } | } | ||||
| ++iter; | |||||
| } | } | ||||
| void *host_ptr = nullptr; | void *host_ptr = nullptr; | ||||
| void *device_ptr = nullptr; | void *device_ptr = nullptr; | ||||
| @@ -136,7 +137,7 @@ void *OpenCLAllocator::CreateImageFromHost(void *data, size_t size, const std::v | |||||
| auto ocl_runtime = opencl::OpenCLRuntime::GetInstance(); | auto ocl_runtime = opencl::OpenCLRuntime::GetInstance(); | ||||
| Lock(); | Lock(); | ||||
| auto iter = free_list_.lower_bound(size); | auto iter = free_list_.lower_bound(size); | ||||
| if (iter != free_list_.end() && (iter->second->size_ >= size) && (iter->second->size_ < (size << shift_factor_))) { | |||||
| while (iter != free_list_.end() && (iter->second->size_ >= size) && (iter->second->size_ < (size << shift_factor_))) { | |||||
| auto mem_buf = iter->second; | auto mem_buf = iter->second; | ||||
| bool is_match{mem_buf->img_size.size() == img_size.size()}; | bool is_match{mem_buf->img_size.size() == img_size.size()}; | ||||
| for (int i = 0; i < img_size.size() && is_match; ++i) { | for (int i = 0; i < img_size.size() && is_match; ++i) { | ||||
| @@ -150,6 +151,7 @@ void *OpenCLAllocator::CreateImageFromHost(void *data, size_t size, const std::v | |||||
| << ", host addr: " << mem_buf->host_ptr_ << ", device addr: " << mem_buf->device_ptr_; | << ", host addr: " << mem_buf->host_ptr_ << ", device addr: " << mem_buf->device_ptr_; | ||||
| return mem_buf->host_ptr_; | return mem_buf->host_ptr_; | ||||
| } | } | ||||
| ++iter; | |||||
| } | } | ||||
| void *host_ptr = nullptr; | void *host_ptr = nullptr; | ||||
| void *device_ptr = nullptr; | void *device_ptr = nullptr; | ||||
| @@ -198,10 +200,13 @@ void OpenCLAllocator::Free(void *buf) { | |||||
| allocated_list_.erase(iter); | allocated_list_.erase(iter); | ||||
| free_list_.insert(std::make_pair(mem_buf->size_, mem_buf)); | free_list_.insert(std::make_pair(mem_buf->size_, mem_buf)); | ||||
| UnLock(); | UnLock(); | ||||
| MS_LOG(DEBUG) << "Free a new Image2D. size: " << mem_buf->size_ << ", host addr: " << mem_buf->host_ptr_ | |||||
| << ", device addr: " << mem_buf->device_ptr_ << ", image addr: " << mem_buf->image_ptr_; | |||||
| return; | return; | ||||
| } | } | ||||
| UnLock(); | UnLock(); | ||||
| free(buf); | free(buf); | ||||
| MS_LOG(DEBUG) << "Free host ptr: " << buf; | |||||
| } | } | ||||
| size_t OpenCLAllocator::GetTotalSize() { | size_t OpenCLAllocator::GetTotalSize() { | ||||
| @@ -24,41 +24,9 @@ namespace mindspore::lite::opencl { | |||||
| int OpenCLExecutor::Run(std::vector<tensor::Tensor *> &inputs, std::vector<tensor::Tensor *> &outputs, | int OpenCLExecutor::Run(std::vector<tensor::Tensor *> &inputs, std::vector<tensor::Tensor *> &outputs, | ||||
| std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator, | std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator, | ||||
| const session::KernelCallBack &before, const session::KernelCallBack &after) { | const session::KernelCallBack &before, const session::KernelCallBack &after) { | ||||
| MS_ASSERT(nullptr != allocator); | |||||
| for (auto &inTensor : inputs) { | |||||
| if (inTensor == nullptr) { | |||||
| MS_LOG(ERROR) << "Graph input tensor is nullptr"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| if (inTensor->GetFormat() != schema::Format_NHWC4 && inTensor->GetFormat() != schema::Format_NC4HW4 && | |||||
| inTensor->GetFormat() != schema::Format_NHWC) { | |||||
| MS_LOG(ERROR) << "input should be NHWC/NHWC4/NC4HW4, actual is " << schema::EnumNameFormat(inTensor->GetFormat()); | |||||
| return RET_ERROR; | |||||
| } else { | |||||
| TransformTensorLayout(inTensor, inTensor->GetFormat(), schema::Format_NHWC4, true); | |||||
| // TransformTensorLayout(inTensor, inTensor->GetFormat(), schema::Format_NC4HW4, true); | |||||
| } | |||||
| } | |||||
| kernel::LiteKernelUtil::InitTensorRefCount(kernels); | kernel::LiteKernelUtil::InitTensorRefCount(kernels); | ||||
| OpenCLAllocator* op_allocator = reinterpret_cast<OpenCLAllocator*>(allocator); | |||||
| for (auto *kernel : kernels) { | for (auto *kernel : kernels) { | ||||
| MS_ASSERT(nullptr != kernel); | MS_ASSERT(nullptr != kernel); | ||||
| kernel::OpenCLKernel *op_kernel = reinterpret_cast<kernel::OpenCLKernel*>(kernel); | |||||
| auto &outputs = kernel->out_tensors(); | |||||
| for (auto i = 0; i < outputs.size(); ++i) { | |||||
| auto *output = outputs.at(i); | |||||
| MS_ASSERT(nullptr != output); | |||||
| if (is_image2d_out_) { | |||||
| std::vector<size_t> img_size; | |||||
| op_kernel->GetImageSize(i, &img_size); | |||||
| auto data_ptr = op_allocator->Malloc(output->Size(), img_size); | |||||
| output->SetData(data_ptr); | |||||
| } else { | |||||
| output->MallocData(allocator); | |||||
| } | |||||
| output->set_allocator(allocator); | |||||
| } | |||||
| session::CallBackParam callbackParam; | session::CallBackParam callbackParam; | ||||
| callbackParam.name_callback_param = kernel->name(); | callbackParam.name_callback_param = kernel->name(); | ||||
| @@ -67,6 +35,21 @@ int OpenCLExecutor::Run(std::vector<tensor::Tensor *> &inputs, std::vector<tenso | |||||
| MS_LOG(ERROR) << "run kernel before_callback failed, name: " << kernel->name(); | MS_LOG(ERROR) << "run kernel before_callback failed, name: " << kernel->name(); | ||||
| } | } | ||||
| } | } | ||||
| kernel::OpenCLKernel *op_kernel = reinterpret_cast<kernel::OpenCLKernel *>(kernel); | |||||
| auto &cur_outputs = kernel->out_tensors(); | |||||
| for (auto i = 0; i < cur_outputs.size(); ++i) { | |||||
| auto *output = cur_outputs.at(i); | |||||
| MS_ASSERT(nullptr != output); | |||||
| if (op_kernel->GetMemType() == kernel::OpenCLMemType::IMG) { | |||||
| std::vector<size_t> img_size; | |||||
| op_kernel->GetImageSize(i, &img_size); | |||||
| auto data_ptr = allocator_->Malloc(output->Size(), img_size); | |||||
| output->SetData(data_ptr); | |||||
| } else { | |||||
| output->MallocData(allocator_); | |||||
| } | |||||
| } | |||||
| auto ret = kernel->Run(); | auto ret = kernel->Run(); | ||||
| if (0 != ret) { | if (0 != ret) { | ||||
| MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name(); | MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name(); | ||||
| @@ -86,21 +69,11 @@ int OpenCLExecutor::Run(std::vector<tensor::Tensor *> &inputs, std::vector<tenso | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| // output format transform | |||||
| for (auto &outTensor : outputs) { | |||||
| if (outTensor == nullptr) { | |||||
| MS_LOG(ERROR) << "Graph output tensor is nullptr"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| if (outTensor->GetFormat() != schema::Format_NHWC) { | |||||
| TransformTensorLayout(outTensor, outTensor->GetFormat(), schema::Format_NHWC, false); | |||||
| } | |||||
| } | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| int OpenCLExecutor::TransformTensorLayout(tensor::Tensor *tensor, schema::Format src_format, | |||||
| schema::Format dst_format, bool trans_dir) { | |||||
| int OpenCLExecutor::TransformTensorLayout(tensor::Tensor *tensor, schema::Format src_format, schema::Format dst_format, | |||||
| bool trans_dir) { | |||||
| MS_ASSERT(nullptr != tensor); | MS_ASSERT(nullptr != tensor); | ||||
| MS_ASSERT(4 == tensor->shape().size()); | MS_ASSERT(4 == tensor->shape().size()); | ||||
| auto data_type = tensor->data_type(); | auto data_type = tensor->data_type(); | ||||
| @@ -114,11 +87,10 @@ int OpenCLExecutor::TransformTensorLayout(tensor::Tensor *tensor, schema::Format | |||||
| << schema::EnumNameFormat(dst_format); | << schema::EnumNameFormat(dst_format); | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| return RET_OK; | |||||
| } | } | ||||
| int OpenCLExecutor::TransformTensorLayoutFp32(tensor::Tensor *tensor, schema::Format src_format, | int OpenCLExecutor::TransformTensorLayoutFp32(tensor::Tensor *tensor, schema::Format src_format, | ||||
| schema::Format dst_format, bool trans_dir) { | |||||
| schema::Format dst_format, bool trans_dir) { | |||||
| MS_ASSERT(nullptr != tensor); | MS_ASSERT(nullptr != tensor); | ||||
| MS_ASSERT(nullptr != allocator_); | MS_ASSERT(nullptr != allocator_); | ||||
| MS_ASSERT(4 == tensor->shape().size()); | MS_ASSERT(4 == tensor->shape().size()); | ||||
| @@ -138,11 +110,11 @@ int OpenCLExecutor::TransformTensorLayoutFp32(tensor::Tensor *tensor, schema::Fo | |||||
| } | } | ||||
| int OpenCLExecutor::TransformTensorLayoutToBuffer(tensor::Tensor *tensor, schema::Format src_format, | int OpenCLExecutor::TransformTensorLayoutToBuffer(tensor::Tensor *tensor, schema::Format src_format, | ||||
| schema::Format dst_format) { | |||||
| schema::Format dst_format) { | |||||
| if (dst_format == schema::Format_NHWC4) { | if (dst_format == schema::Format_NHWC4) { | ||||
| auto *src_data = tensor->Data(); | auto *src_data = tensor->Data(); | ||||
| size_t C4 = UP_DIV(tensor->Channel(), C4NUM); | size_t C4 = UP_DIV(tensor->Channel(), C4NUM); | ||||
| std::vector <size_t> img_size{tensor->Width() * C4, (size_t) tensor->Height(), CL_FLOAT}; | |||||
| std::vector<size_t> img_size{tensor->Width() * C4, (size_t)tensor->Height(), CL_FLOAT}; | |||||
| if (src_format == schema::Format_NHWC) { | if (src_format == schema::Format_NHWC) { | ||||
| auto *dst_data = allocator_->Malloc(tensor->Size(), img_size); | auto *dst_data = allocator_->Malloc(tensor->Size(), img_size); | ||||
| if (dst_data == nullptr) { | if (dst_data == nullptr) { | ||||
| @@ -168,7 +140,7 @@ int OpenCLExecutor::TransformTensorLayoutToBuffer(tensor::Tensor *tensor, schema | |||||
| } | } | ||||
| int OpenCLExecutor::TransformTensorLayoutToImage(tensor::Tensor *tensor, schema::Format src_format, | int OpenCLExecutor::TransformTensorLayoutToImage(tensor::Tensor *tensor, schema::Format src_format, | ||||
| schema::Format dst_format) { | |||||
| schema::Format dst_format) { | |||||
| if (dst_format == schema::Format_NHWC4) { | if (dst_format == schema::Format_NHWC4) { | ||||
| tensor->SetFormat(schema::Format_NHWC4); | tensor->SetFormat(schema::Format_NHWC4); | ||||
| // convert to nhwc4 | // convert to nhwc4 | ||||
| @@ -202,15 +174,15 @@ int OpenCLExecutor::TransformTensorLayoutToImage(tensor::Tensor *tensor, schema: | |||||
| } | } | ||||
| int OpenCLExecutor::TransformTensorLayoutFromImage(tensor::Tensor *tensor, schema::Format src_format, | int OpenCLExecutor::TransformTensorLayoutFromImage(tensor::Tensor *tensor, schema::Format src_format, | ||||
| schema::Format dst_format) { | |||||
| schema::Format dst_format) { | |||||
| if (dst_format == schema::Format_NHWC) { | if (dst_format == schema::Format_NHWC) { | ||||
| auto src_data = tensor->Data(); | auto src_data = tensor->Data(); | ||||
| auto dst_data = allocator_->Malloc(tensor->Size()); | auto dst_data = allocator_->Malloc(tensor->Size()); | ||||
| cl::Image2D *out_mem = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data)); | cl::Image2D *out_mem = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data)); | ||||
| std::vector<size_t> img_size; | std::vector<size_t> img_size; | ||||
| allocator_->GetImageSize(src_data, &img_size); | allocator_->GetImageSize(src_data, &img_size); | ||||
| auto origin = cl::array < cl::size_type, 3U > {0, 0, 0}; | |||||
| auto region = cl::array < cl::size_type, 3U > {img_size[0], img_size[1], 1}; | |||||
| auto origin = cl::array<cl::size_type, 3U>{0, 0, 0}; | |||||
| auto region = cl::array<cl::size_type, 3U>{img_size[0], img_size[1], 1}; | |||||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | ||||
| ocl_runtime->GetDefaultCommandQueue()->enqueueReadImage(*out_mem, CL_TRUE, origin, region, 0, 0, dst_data); | ocl_runtime->GetDefaultCommandQueue()->enqueueReadImage(*out_mem, CL_TRUE, origin, region, 0, 0, dst_data); | ||||
| tensor->SetData(dst_data); | tensor->SetData(dst_data); | ||||
| @@ -224,7 +196,7 @@ int OpenCLExecutor::TransformTensorLayoutFromImage(tensor::Tensor *tensor, schem | |||||
| } | } | ||||
| int OpenCLExecutor::TransformTensorLayoutUint8(tensor::Tensor *tensor, schema::Format src_format, | int OpenCLExecutor::TransformTensorLayoutUint8(tensor::Tensor *tensor, schema::Format src_format, | ||||
| schema::Format dst_format, bool is_image) { | |||||
| schema::Format dst_format, bool is_image) { | |||||
| MS_ASSERT(nullptr != tensor); | MS_ASSERT(nullptr != tensor); | ||||
| MS_ASSERT(4 == tensor->shape().size()); | MS_ASSERT(4 == tensor->shape().size()); | ||||
| // auto src_format = tensor->GetFormat(); | // auto src_format = tensor->GetFormat(); | ||||
| @@ -234,4 +206,3 @@ int OpenCLExecutor::TransformTensorLayoutUint8(tensor::Tensor *tensor, schema::F | |||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| } // namespace mindspore::lite::opencl | } // namespace mindspore::lite::opencl | ||||
| @@ -148,6 +148,7 @@ if (SUPPORT_GPU) | |||||
| ${LITE_DIR}/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc | ${LITE_DIR}/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc | ||||
| ${LITE_DIR}/src/runtime/kernel/opencl/kernel/transpose.cc | ${LITE_DIR}/src/runtime/kernel/opencl/kernel/transpose.cc | ||||
| ${LITE_DIR}/src/runtime/kernel/opencl/kernel/reshape.cc | ${LITE_DIR}/src/runtime/kernel/opencl/kernel/reshape.cc | ||||
| ${LITE_DIR}/src/runtime/kernel/opencl/kernel/to_format.cc | |||||
| ) | ) | ||||
| endif() | endif() | ||||
| ### minddata lite | ### minddata lite | ||||
| @@ -323,6 +324,8 @@ if (SUPPORT_GPU) | |||||
| ${TEST_DIR}/ut/src/runtime/kernel/opencl/transpose_tests.cc | ${TEST_DIR}/ut/src/runtime/kernel/opencl/transpose_tests.cc | ||||
| ${TEST_DIR}/ut/src/runtime/kernel/opencl/convolution_tests.cc | ${TEST_DIR}/ut/src/runtime/kernel/opencl/convolution_tests.cc | ||||
| ${TEST_DIR}/ut/src/runtime/kernel/opencl/activation_tests.cc | ${TEST_DIR}/ut/src/runtime/kernel/opencl/activation_tests.cc | ||||
| #${TEST_DIR}/ut/src/runtime/kernel/opencl/leakyrelu_tests.cc | |||||
| ${TEST_DIR}/ut/src/runtime/kernel/opencl/to_format_tests.cc | |||||
| ) | ) | ||||
| endif() | endif() | ||||
| @@ -0,0 +1,79 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include <iostream> | |||||
| #include <memory> | |||||
| #include "mindspore/core/utils/log_adapter.h" | |||||
| #include "common/common_test.h" | |||||
| #include "mindspore/lite/src/common/file_utils.h" | |||||
| #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" | |||||
| #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" | |||||
| #include "mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h" | |||||
| namespace mindspore { | |||||
| class TestToFormatOpenCL : public mindspore::CommonTest { | |||||
| public: | |||||
| TestToFormatOpenCL() {} | |||||
| }; | |||||
| TEST_F(TestToFormatOpenCL, TransposeFp32) { | |||||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||||
| ocl_runtime->Init(); | |||||
| auto allocator = ocl_runtime->GetAllocator(); | |||||
| int h = 64; | |||||
| int w = 1; | |||||
| int c = 7360; | |||||
| size_t input_size; | |||||
| std::string input_path = "./test_data/transpose/transpose_fp32_input.bin"; | |||||
| auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); | |||||
| lite::tensor::Tensor *tensor_x = | |||||
| new lite::tensor::Tensor(TypeId(kNumberTypeFloat32), {1, h, w, c}, schema::Format_NHWC4); | |||||
| lite::tensor::Tensor *tensor_out = new lite::tensor::Tensor(TypeId(kNumberTypeFloat32), {1, c, h, w}); | |||||
| std::vector<lite::tensor::Tensor *> inputs{tensor_x}; | |||||
| std::vector<lite::tensor::Tensor *> outputs{tensor_out}; | |||||
| auto *arith_kernel = new kernel::TransposeOpenCLKernel(nullptr, inputs, outputs); | |||||
| arith_kernel->Init(); | |||||
| inputs[0]->MallocData(allocator); | |||||
| std::vector<kernel::LiteKernel *> kernels{arith_kernel}; | |||||
| auto *pGraph = new kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); | |||||
| pGraph->Init(); | |||||
| memcpy(inputs[0]->Data(), input_data, input_size); | |||||
| pGraph->Run(); | |||||
| size_t output_size; | |||||
| std::string output_path = "./test_data/transpose/transpose_fp32_output.bin"; | |||||
| auto correct_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(output_path.c_str(), &output_size)); | |||||
| printf("==================output data=================\n"); | |||||
| float *output_data = reinterpret_cast<float *>(tensor_out->Data()); | |||||
| std::cout << std::endl; | |||||
| int size_n = h * w * c; | |||||
| size_n = size_n > 100 ? 100 : size_n; | |||||
| for (int i = 0; i < size_n; i++) { | |||||
| std::cout << output_data[i] << " "; | |||||
| if ((i + 1) % c == 0) { | |||||
| std::cout << std::endl; | |||||
| } | |||||
| } | |||||
| std::cout << std::endl; | |||||
| // compare | |||||
| CompareOutputData(output_data, correct_data, h * w * c, 0.00001); | |||||
| MS_LOG(INFO) << "TestMatMulFp32 passed"; | |||||
| } | |||||
| } // namespace mindspore | |||||