Browse Source

!4524 add to_format opencl op

Merge pull request !4524 from wandongdong/up
tags/v0.7.0-beta
mindspore-ci-bot Gitee 5 years ago
parent
commit
2808c0f8ea
20 changed files with 832 additions and 102 deletions
  1. +2
    -1
      mindspore/lite/schema/model.fbs
  2. +5
    -0
      mindspore/lite/schema/ops.fbs
  3. +10
    -12
      mindspore/lite/src/lite_kernel.h
  4. +0
    -3
      mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl
  5. +235
    -0
      mindspore/lite/src/runtime/kernel/opencl/cl/fp32/to_format.cl
  6. +2
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
  7. +1
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
  8. +1
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
  9. +167
    -0
      mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
  10. +46
    -0
      mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h
  11. +2
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
  12. +21
    -5
      mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
  13. +186
    -17
      mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc
  14. +15
    -2
      mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h
  15. +16
    -0
      mindspore/lite/src/runtime/kernel/opencl/utils.cc
  16. +7
    -0
      mindspore/lite/src/runtime/kernel/opencl/utils.h
  17. +9
    -4
      mindspore/lite/src/runtime/opencl/opencl_allocator.cc
  18. +25
    -54
      mindspore/lite/src/runtime/opencl/opencl_executor.cc
  19. +3
    -0
      mindspore/lite/test/CMakeLists.txt
  20. +79
    -0
      mindspore/lite/test/ut/src/runtime/kernel/opencl/to_format_tests.cc

+ 2
- 1
mindspore/lite/schema/model.fbs View File

@@ -192,7 +192,8 @@ union PrimitiveType {
SpaceToBatchND, SpaceToBatchND,
TopKV2, TopKV2,
Return, Return,
MakeTuple
MakeTuple,
ToFormat,
} }


enum QuantType: int { enum QuantType: int {


+ 5
- 0
mindspore/lite/schema/ops.fbs View File

@@ -881,5 +881,10 @@ table TopKV2 {
table MakeTuple { table MakeTuple {
} }


// Op that converts a tensor between memory layouts on GPU (see the OpenCL
// to_format kernels). Field names suggest src/dst format codes -- presumably
// schema::Format enum values; TODO confirm against the converter that fills them.
table ToFormat {
srcT: int;
dstT: int;
}

table Return { table Return {
} }

+ 10
- 12
mindspore/lite/src/lite_kernel.h View File

@@ -131,6 +131,10 @@ class LiteKernel {


void AddOutKernel(LiteKernel *kernel) { this->out_kernels_.emplace_back(kernel); } void AddOutKernel(LiteKernel *kernel) { this->out_kernels_.emplace_back(kernel); }


void SetInKernel(const std::vector<LiteKernel *> &kernel) { this->in_kernels_ = kernel; }

void SetOutKernel(const std::vector<LiteKernel *> &kernel) { this->out_kernels_ = kernel; }

std::vector<LiteKernel *> &in_kernels() { return this->in_kernels_; } std::vector<LiteKernel *> &in_kernels() { return this->in_kernels_; }


std::vector<LiteKernel *> &out_kernels() { return this->out_kernels_; } std::vector<LiteKernel *> &out_kernels() { return this->out_kernels_; }
@@ -167,16 +171,14 @@ class SubGraphKernel : public LiteKernel {
public: public:
explicit SubGraphKernel(const std::vector<lite::tensor::Tensor *> &inputs, explicit SubGraphKernel(const std::vector<lite::tensor::Tensor *> &inputs,
const std::vector<lite::tensor::Tensor *> &outputs, const std::vector<lite::tensor::Tensor *> &outputs,
const std::vector<kernel::LiteKernel *> &inKernels,
const std::vector<kernel::LiteKernel *> &outKernels,
const std::vector<kernel::LiteKernel *> &in_kernels,
const std::vector<kernel::LiteKernel *> &out_kernels,
const std::vector<kernel::LiteKernel *> &nodes, const lite::Context *ctx, const std::vector<kernel::LiteKernel *> &nodes, const lite::Context *ctx,
const lite::Primitive *primitive) const lite::Primitive *primitive)
: LiteKernel(nullptr, inputs, outputs, ctx, primitive),
inputs_(inputs),
outputs_(outputs),
inkernels_(inKernels),
outkernels_(outKernels),
nodes_(nodes) {}
: LiteKernel(nullptr, inputs, outputs, ctx, primitive), nodes_(nodes) {
in_kernels_ = in_kernels;
out_kernels_ = out_kernels;
}


virtual int Init() { return -1; } virtual int Init() { return -1; }
virtual int InferShape() { return -1; } virtual int InferShape() { return -1; }
@@ -184,10 +186,6 @@ class SubGraphKernel : public LiteKernel {
virtual int Run() { return -1; } virtual int Run() { return -1; }


protected: protected:
std::vector<lite::tensor::Tensor *> inputs_;
std::vector<lite::tensor::Tensor *> outputs_;
std::vector<LiteKernel *> inkernels_;
std::vector<LiteKernel *> outkernels_;
std::vector<LiteKernel *> nodes_; std::vector<LiteKernel *> nodes_;
}; };




+ 0
- 3
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl View File

@@ -49,9 +49,6 @@ __kernel void ElementDiv(__read_only image2d_t input_a, __read_only image2d_t in


float4 a = read_imagef(input_a, smp_none, (int2)(X, Y)); float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
float4 b = read_imagef(input_b, smp_none, (int2)(X, Y)); float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
if (b == 0) {
return;
}
write_imagef(output, (int2)(X, Y), a / b); write_imagef(output, (int2)(X, Y), a / b);
} }




+ 235
- 0
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/to_format.cl View File

@@ -0,0 +1,235 @@
#define FLT float
#define FLT4 float4
#define READ_IMAGE read_imagef
#define WRITE_IMAGE write_imagef
// enum Format {
// Format_NCHW = 0,
// Format_NHWC = 1,
// Format_NHWC4 = 2,
// Format_HWKC = 3,
// Format_HWCK = 4,
// Format_KCHW = 5,
// Format_CKHW = 6,
// Format_KHWC = 7,
// Format_CHWK = 8,
// Format_NC4HW4 = 100,
// Format_NUM_OF_FORMAT = 101,
// Format_MIN = Format_NCHW,
// Format_MAX = Format_NUM_OF_FORMAT
//};
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
// Convert an NCHW buffer to an NHWC4 image.
// NOTE(review): body is not implemented yet -- only the bounds check runs, so
// the destination image is left untouched. The commented-out write is a placeholder.
__kernel void to_format_NCHW_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
// Guard against the global work size being rounded up past the tensor extent.
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
// WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
// Convert an NHWC float buffer into an NHWC4 image (channels padded to a
// multiple of 4). One work-item produces one FLT4 texel.
//   size  : launched grid extent; host side passes {H, W, ceil(C/4)}
//   shape : tensor shape as (N, H, W, C), so shape.z = W and shape.w = C
__kernel void to_format_NHWC_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
// Linear offset of the first of the (up to) 4 channels this work-item loads:
// (h * W + w) * C + 4 * c4_index.
int offset = (X * shape.z + Y) * shape.w + Z * 4;
__global FLT *src_addr = (__global FLT *)src_data;
src_addr += offset;
FLT4 data = (FLT4)(0.f);
if ((Z + 1) * 4 <= shape.w) {
// Full group of 4 channels: one vector load.
data = ((__global FLT4 *)src_addr)[0];
} else {
// Channel tail (C not divisible by 4): load 1-3 scalars, rest stays 0.
if ((shape.w - Z * 4) >= 1) {
data.x = src_addr[0];
}
if ((shape.w - Z * 4) >= 2) {
data.y = src_addr[1];
}
if ((shape.w - Z * 4) >= 3) {
data.z = src_addr[2];
}
}
WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data);
}
// NOTE(review): the five kernels below are unimplemented stubs -- each performs
// only the bounds check and returns without writing the destination image.
// The commented-out WRITE_IMAGE lines are placeholders for the real copy.

// Stub: NHWC4 buffer -> NHWC4 image (not implemented).
__kernel void to_format_NHWC4_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
// WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
// Stub: NC4HW4 buffer -> NHWC4 image (not implemented).
__kernel void to_format_NC4HW4_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
// WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
// Stub: NCHW buffer -> NC4HW4 image (not implemented).
__kernel void to_format_NCHW_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
// WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
// Stub: NHWC buffer -> NC4HW4 image (not implemented).
__kernel void to_format_NHWC_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
// WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
// Stub: NHWC4 buffer -> NC4HW4 image (not implemented).
__kernel void to_format_NHWC4_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
// WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
// Copy an NC4HW4 buffer into an image, one FLT4 per work-item.
// NOTE(review): the source index (Y * size.z + Z) * size.x + X and the image
// coordinate (Y * size.z + Z, X) should be confirmed against the host launch
// dimensions for NC4HW4 ({c4, h, w} per GetGlobalSize) -- the mapping is not
// obviously the documented NC4HW4 layout.
__kernel void to_format_NC4HW4_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
// FLT4 src_final = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))];
WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), src_data[(Y * size.z + Z) * size.x + X]);
}

// Read an image back into an NCHW buffer, one FLT4 per work-item.
// Buffer index (Z * size.y + Y) * size.x + X with image coordinate
// (Y * size.x + X, Z).
__kernel void to_format_NCHW_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
dst_data[(Z * size.y + Y) * size.x + X] = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.x + X, Z));
}
// NOTE(review): the five kernels below are unimplemented stubs -- each performs
// only the bounds check and returns without writing the destination buffer.

// Stub: NHWC image -> NCHW buffer (not implemented).
__kernel void to_format_NHWC_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
// WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
// Stub: NHWC4 image -> NCHW buffer (not implemented).
__kernel void to_format_NHWC4_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
// WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
// Stub: NC4HW4 image -> NCHW buffer (not implemented).
__kernel void to_format_NC4HW4_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
// WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
// Stub: NCHW image -> NHWC buffer (not implemented).
__kernel void to_format_NCHW_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
// WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
// Stub: NHWC image -> NHWC buffer (not implemented).
__kernel void to_format_NHWC_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
// WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
// Convert an NHWC4 image back into a tightly-packed NHWC float buffer.
// Mirror of to_format_NHWC_to_NHWC4_IMG: reads one FLT4 texel and scatters it
// into the buffer, truncating the channel tail when C is not a multiple of 4.
//   shape : tensor shape as (N, H, W, C); shape.z = W, shape.w = C
__kernel void to_format_NHWC4_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
FLT4 data = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X));
// Destination offset of the first channel: (h * W + w) * C + 4 * c4_index.
int offset = (X * shape.z + Y) * shape.w + Z * 4;
__global FLT *dst_addr = (__global FLT *)dst_data;
dst_addr += offset;
if ((Z + 1) * 4 <= shape.w) {
// Full group of 4 channels: one vector store.
((__global FLT4 *)dst_addr)[0] = data;
} else {
// Channel tail: store only the 1-3 valid lanes, drop the padding.
if (shape.w - Z * 4 >= 1) {
dst_addr[0] = data.x;
}
if (shape.w - Z * 4 >= 2) {
dst_addr[1] = data.y;
}
if (shape.w - Z * 4 >= 3) {
dst_addr[2] = data.z;
}
}
}
// Convert an NC4HW4 image back to an NHWC float buffer.
// NOTE(review): body is not implemented yet -- only the bounds check runs.
// FIX: renamed from "to_format_NC4HW4_to_to_NHWC_BUF"; the duplicated "to_"
// made the kernel unreachable, because the host (ToFormatOpenCLKernel::Init)
// builds kernel names as "to_format_<SRC>_to_<DST>_BUF" and clCreateKernel
// would fail to find it under the misspelled name.
__kernel void to_format_NC4HW4_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
                                           int4 shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  // WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
// Identity copies from image memory back to buffer memory, one FLT4 per
// work-item. Both kernels use the same linear index
// (Y * size.z + Z) * size.x + X and image coordinate (Y * size.z + Z, X).

// NC4HW4 image -> NC4HW4 buffer.
__kernel void to_format_NC4HW4_to_NC4HW4_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
dst_data[(Y * size.z + Z) * size.x + X] = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X));
}
// NHWC4 image -> NHWC4 buffer.
__kernel void to_format_NHWC4_to_NHWC4_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
dst_data[(Y * size.z + Z) * size.x + X] = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X));
}

+ 2
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc View File

@@ -329,7 +329,8 @@ kernel::LiteKernel *OpenCLConvolutionKernelCreator(const std::vector<lite::tenso
const std::vector<lite::tensor::Tensor *> &outputs, const std::vector<lite::tensor::Tensor *> &outputs,
OpParameter *opParameter, const lite::Context *ctx, OpParameter *opParameter, const lite::Context *ctx,
const kernel::KernelKey &desc, const lite::Primitive *primitive) { const kernel::KernelKey &desc, const lite::Primitive *primitive) {
auto *kernel = new ConvolutionOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
auto *kernel =
new (std::nothrow) ConvolutionOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
if (kernel == nullptr) { if (kernel == nullptr) {
MS_LOG(ERROR) << "Create OpenCL Convolution kernel failed!"; MS_LOG(ERROR) << "Create OpenCL Convolution kernel failed!";
return nullptr; return nullptr;


+ 1
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc View File

@@ -145,7 +145,7 @@ kernel::LiteKernel *OpenCLPooling2dKernelCreator(const std::vector<lite::tensor:
const std::vector<lite::tensor::Tensor *> &outputs, const std::vector<lite::tensor::Tensor *> &outputs,
OpParameter *opParameter, const lite::Context *ctx, OpParameter *opParameter, const lite::Context *ctx,
const kernel::KernelKey &desc, const lite::Primitive *primitive) { const kernel::KernelKey &desc, const lite::Primitive *primitive) {
auto *kernel = new (std::nothrow)PoolingOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
auto *kernel = new (std::nothrow) PoolingOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
if (kernel == nullptr) { if (kernel == nullptr) {
MS_LOG(ERROR) << "Create OpenCL Pooling kernel failed!"; MS_LOG(ERROR) << "Create OpenCL Pooling kernel failed!";
return nullptr; return nullptr;


+ 1
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc View File

@@ -158,7 +158,7 @@ kernel::LiteKernel *OpenCLSoftMaxKernelCreator(const std::vector<lite::tensor::T
const std::vector<lite::tensor::Tensor *> &outputs, const std::vector<lite::tensor::Tensor *> &outputs,
OpParameter *opParameter, const lite::Context *ctx, OpParameter *opParameter, const lite::Context *ctx,
const kernel::KernelKey &desc, const lite::Primitive *primitive) { const kernel::KernelKey &desc, const lite::Primitive *primitive) {
auto *kernel = new (std::nothrow)SoftmaxOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
auto *kernel = new (std::nothrow) SoftmaxOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
if (kernel == nullptr) { if (kernel == nullptr) {
MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr.";
return nullptr; return nullptr;


+ 167
- 0
mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc View File

@@ -0,0 +1,167 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "src/runtime/kernel/opencl/kernel/to_format.h"
#include <set>
#include <map>
#include <string>
#include <utility>
#include "include/errorcode.h"
#include "src/kernel_registry.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/cl/fp32/to_format.cl.inc"

using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_ToFormat;

namespace mindspore::kernel {

// Builds the OpenCL conversion kernel whose name is derived from the input and
// output tensor formats plus the destination memory type, e.g.
// "to_format_NHWC_to_NHWC4_IMG". The name must match a kernel in to_format.cl.
int ToFormatOpenCLKernel::Init() {
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
// op_parameter_ is expected to be an OpenCLToFormatParameter created by
// SubGraphOpenCLKernel::GenToFormatOp.
auto parameter = reinterpret_cast<OpenCLToFormatParameter *>(op_parameter_);
out_mem_type_ = parameter->out_mem_type;
std::string program_name = "to_format";
std::map<schema::Format, std::string> format_str{{schema::Format_NCHW, "NCHW"},
{schema::Format_NHWC, "NHWC"},
{schema::Format_NC4HW4, "NC4HW4"},
{schema::Format_NHWC4, "NHWC4"}};
// NOTE(review): operator[] default-inserts an empty string for any format not
// in the map above, yielding a kernel name like "to_format__to_NHWC4_IMG".
std::string kernel_name =
"to_format_" + format_str[in_tensors_[0]->GetFormat()] + "_to_" + format_str[out_tensors_[0]->GetFormat()];
// _IMG kernels write image memory; _BUF kernels write buffer memory.
if (out_mem_type_ == OpenCLMemType::IMG) {
kernel_name += "_IMG";
} else {
kernel_name += "_BUF";
}

#ifdef PROGRAM_WITH_IL
ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
#else
std::set<std::string> build_options;
#ifdef ENABLE_FP16
std::string source = to_format_source_fp16;
#else
std::string source = to_format_source_fp32;
#endif
ocl_runtime->LoadSource(program_name, source);
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
#endif
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}

// Nothing to recompute on resize: the global work size is re-derived from the
// output tensor's shape each Run() via GetGlobalSize().
int ToFormatOpenCLKernel::ReSize() { return RET_OK; }

int ToFormatOpenCLKernel::GetGlobalSize(size_t idx, std::vector<size_t> *global_size) {
std::vector<int> shapex = out_tensors_[0]->shape();
if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4 || out_tensors_[0]->GetFormat() == schema::Format_NHWC) {
int h = shapex[1];
int w = shapex[2];
int c = shapex[3];
int c4 = UP_DIV(c, C4NUM);
std::vector<size_t> vec = {(size_t)h, (size_t)w, (size_t)c4};
*global_size = std::move(vec);
} else if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4 ||
out_tensors_[0]->GetFormat() == schema::Format_NCHW) {
int h = shapex[2];
int w = shapex[3];
int c = shapex[1];
int c4 = UP_DIV(c, C4NUM);
std::vector<size_t> vec = {(size_t)c4, (size_t)h, (size_t)w};
*global_size = std::move(vec);
} else if (out_tensors_[0]->GetFormat() == out_tensors_[0]->GetFormat() == schema::Format_NCHW) {
int h = shapex[2];
int w = shapex[3];
int c = shapex[1];
int w4 = UP_DIV(w, C4NUM);
std::vector<size_t> vec = {(size_t)w4, (size_t)h, (size_t)c};
*global_size = std::move(vec);
}
return RET_OK;
}
// Intentionally leaves *local_size untouched: Run() passes an empty local
// size to RunKernel, deferring work-group selection to the runtime.
int ToFormatOpenCLKernel::GetLocalSize(size_t idx, const std::vector<size_t> &global_size,
std::vector<size_t> *local_size) {
return RET_OK;
}

int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
size_t im_dst_x, im_dst_y;
std::vector<int> shapex = out_tensors_[0]->shape();
if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) {
int h = shapex[1];
int w = shapex[2];
int c = shapex[3];
im_dst_y = UP_DIV(h * c, C4NUM);
im_dst_x = w;
} else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) {
int h = shapex[2];
int w = shapex[3];
int c = shapex[1];
im_dst_x = UP_DIV(w * c, C4NUM);
im_dst_y = h;
} else {
MS_LOG(ERROR) << "Unsupported format. " << out_tensors_[0]->GetFormat();
}
img_size->clear();
#ifdef ENABLE_FP16
size_t img_dtype = CL_HALF_FLOAT;
#else
size_t img_dtype = CL_FLOAT;
#endif
std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
*img_size = vec;
return RET_OK;
}

// Enqueues the conversion kernel. Argument layout matches to_format.cl:
//   0: src buffer/image, 1: dst buffer/image, 2: global grid extent (int4),
//   3: input tensor shape padded to 4 dims with 1s (int4).
int ToFormatOpenCLKernel::Run() {
MS_LOG(DEBUG) << "ToFormat" << " Running!";
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
// Empty local size: let the runtime choose the work-group size.
std::vector<size_t> local = {};
std::vector<size_t> global;
// NOTE(review): return value is ignored; if the format is unsupported,
// `global` stays empty and global[0..2] below is out-of-bounds.
GetGlobalSize(0, &global);
auto shapex = in_tensors_[0]->shape();
// Pad the (possibly <4-dim) shape to int4 with trailing 1s.
cl_int4 shape{shapex.size() > 0 ? shapex[0] : 1, shapex.size() > 1 ? shapex[1] : 1, shapex.size() > 2 ? shapex[2] : 1,
shapex.size() > 3 ? shapex[3] : 1};
cl_int4 gsize{(cl_int)global[0], (cl_int)global[1], (cl_int)global[2], 1};
ocl_runtime->SetKernelArg(kernel_, 0, in_tensors_[0]->Data());
ocl_runtime->SetKernelArg(kernel_, 1, out_tensors_[0]->Data());
ocl_runtime->SetKernelArg(kernel_, 2, gsize);
ocl_runtime->SetKernelArg(kernel_, 3, shape);
ocl_runtime->RunKernel(kernel_, global, local, nullptr);
return RET_OK;
}

// Factory registered for PrimitiveType_ToFormat on GPU.
// FIXES vs. original: (1) opParameter is null-checked before its name_ is
// dereferenced in the error path; (2) the redundant
// reinterpret_cast<OpParameter *> of an already-OpParameter* argument is
// dropped; (3) an Init() failure is now logged (sibling creators in this
// directory log their failures; this one was silent).
kernel::LiteKernel *OpenCLToFormatKernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                                const std::vector<lite::tensor::Tensor *> &outputs,
                                                OpParameter *opParameter, const lite::Context *ctx,
                                                const kernel::KernelKey &desc, const lite::Primitive *primitive) {
  if (opParameter == nullptr) {
    MS_LOG(ERROR) << "Create OpenCL ToFormat kernel failed: opParameter is nullptr.";
    return nullptr;
  }
  auto *kernel = new (std::nothrow) ToFormatOpenCLKernel(opParameter, inputs, outputs);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "kernel " << opParameter->name_ << " create failed.";
    return nullptr;
  }
  auto ret = kernel->Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Init kernel " << opParameter->name_ << " failed.";
    delete kernel;
    return nullptr;
  }
  return kernel;
}

REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_ToFormat, OpenCLToFormatKernelCreator)
} // namespace mindspore::kernel

+ 46
- 0
mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h View File

@@ -0,0 +1,46 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_LITE_SRC_BACKEND_OPENCL_TO_FORMAT_H_
#define MINDSPORE_LITE_SRC_BACKEND_OPENCL_TO_FORMAT_H_

#include <vector>

#include "src/lite_kernel.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/opencl_kernel.h"

namespace mindspore::kernel {
// OpenCL kernel that converts a tensor between layouts (NCHW / NHWC / NC4HW4 /
// NHWC4) and between buffer and image memory; the concrete conversion is
// selected in Init() from the input/output tensor formats.
class ToFormatOpenCLKernel : public OpenCLKernel {
 public:
  explicit ToFormatOpenCLKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                const std::vector<lite::tensor::Tensor *> &outputs)
      : OpenCLKernel(parameter, inputs, outputs) {}
  // FIX: was `override{};` -- stray empty body plus extra semicolon.
  ~ToFormatOpenCLKernel() override = default;

  int Init() override;
  int ReSize() override;
  int Run() override;
  // 2-D image extent {x, y, channel dtype} of the output tensor.
  int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
  // 3-D global work size derived from the output tensor's shape and format.
  int GetGlobalSize(size_t idx, std::vector<size_t> *global_size) override;
  int GetLocalSize(size_t idx, const std::vector<size_t> &global_size, std::vector<size_t> *local_size) override;

 private:
  cl::Kernel kernel_;
};
}  // namespace mindspore::kernel

// FIX: trailing comment said ..._RESHAPE_H_ (copy-paste from reshape.h),
// mismatching the guard defined at the top of this file.
#endif  // MINDSPORE_LITE_SRC_BACKEND_OPENCL_TO_FORMAT_H_

+ 2
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc View File

@@ -109,7 +109,8 @@ kernel::LiteKernel *OpenCLTransposeKernelCreator(const std::vector<lite::tensor:
const std::vector<lite::tensor::Tensor *> &outputs, const std::vector<lite::tensor::Tensor *> &outputs,
OpParameter *opParameter, const lite::Context *ctx, OpParameter *opParameter, const lite::Context *ctx,
const kernel::KernelKey &desc, const lite::Primitive *primitive) { const kernel::KernelKey &desc, const lite::Primitive *primitive) {
auto *kernel = new (std::nothrow)TransposeOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
auto *kernel =
new (std::nothrow) TransposeOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
if (kernel == nullptr) { if (kernel == nullptr) {
MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr.";
return nullptr; return nullptr;


+ 21
- 5
mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h View File

@@ -21,21 +21,37 @@
#include "src/lite_kernel.h" #include "src/lite_kernel.h"


namespace mindspore::kernel { namespace mindspore::kernel {

enum class OpenCLMemType { BUF, IMG };

struct OpenCLToFormatParameter {
OpParameter op_parameter;
schema::Format src_format{schema::Format_NHWC};
schema::Format dst_format{schema::Format_NHWC4};
OpenCLMemType out_mem_type{OpenCLMemType::IMG};
};

class OpenCLKernel : public LiteKernel { class OpenCLKernel : public LiteKernel {
public: public:
explicit OpenCLKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, explicit OpenCLKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
const std::vector<lite::tensor::Tensor *> &outputs) const std::vector<lite::tensor::Tensor *> &outputs)
: LiteKernel(parameter, inputs, outputs, nullptr, nullptr) {}
: LiteKernel(parameter, inputs, outputs, nullptr, nullptr) {}


virtual int Init() { return -1; } virtual int Init() { return -1; }
virtual int Prepare() { return -1; } virtual int Prepare() { return -1; }
virtual int InferShape() { return -1; } virtual int InferShape() { return -1; }
virtual int ReSize() { return -1; } virtual int ReSize() { return -1; }
virtual int Run() { return -1; } virtual int Run() { return -1; }
virtual int GetImageSize(size_t idx, std::vector<size_t>* img_size) { return -1; }
virtual int GetGlobalSize(size_t idx, std::vector<size_t>* global_size) { return -1; }
virtual int GetLocalSize(size_t idx, const std::vector<size_t>& global_size,
std::vector<size_t>* local_size) { return -1; }
virtual int GetImageSize(size_t idx, std::vector<size_t> *img_size) { return -1; }
virtual int GetGlobalSize(size_t idx, std::vector<size_t> *global_size) { return -1; }
virtual int GetLocalSize(size_t idx, const std::vector<size_t> &global_size, std::vector<size_t> *local_size) {
return -1;
}
OpenCLMemType GetMemType() { return out_mem_type_; }
void SetMemType(OpenCLMemType mem_type) { out_mem_type_ = mem_type; }

protected:
OpenCLMemType out_mem_type_{OpenCLMemType::IMG};
}; };
} // namespace mindspore::kernel } // namespace mindspore::kernel




+ 186
- 17
mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc View File

@@ -17,22 +17,117 @@
#include "src/runtime/kernel/opencl/subgraph_opencl_kernel.h" #include "src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
#include "src/runtime/opencl/opencl_executor.h" #include "src/runtime/opencl/opencl_executor.h"
#include "src/runtime/opencl/opencl_runtime.h" #include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "include/errorcode.h"
#include "src/common/utils.h"


namespace mindspore::kernel { namespace mindspore::kernel {


SubGraphOpenCLKernel::~SubGraphOpenCLKernel() { UnInit(); } SubGraphOpenCLKernel::~SubGraphOpenCLKernel() { UnInit(); }


// Creates one layout-conversion (ToFormat) kernel per tensor in `in_tensors`
// and splices it into the kernel graph.
//   mem_type == IMG : converts subgraph-input buffers into the image tensors
//                     the first GPU kernels consume.
//   mem_type == BUF : converts the last GPU kernels' image outputs back into
//                     the subgraph-output buffers.
//   out_tensors / out_parameters / out_convert_ops collect the newly created
//   intermediate tensors, parameters and kernels (owned by the caller).
// FIX: both layout-reorder branches computed `dst_shape` and then called
// set_shape(shape) -- the *unchanged* shape -- leaving dst_shape unused, so
// tensors converted between channel-first and channel-last layouts kept the
// wrong shape. They now apply dst_shape. Also frees the freshly allocated
// tensor when the parameter allocation fails, instead of leaking it.
int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::tensor::Tensor *> &in_tensors,
                                        const std::vector<kernel::LiteKernel *> in_kernels,
                                        std::vector<lite::tensor::Tensor *> *out_tensors,
                                        std::vector<OpenCLToFormatParameter *> *out_parameters,
                                        std::vector<LiteKernel *> *out_convert_ops, OpenCLMemType mem_type) {
  out_tensors->clear();
  out_parameters->clear();
  out_convert_ops->clear();
  for (size_t i = 0; i < in_tensors.size(); ++i) {
    lite::tensor::Tensor *new_tensor = new (std::nothrow) lite::tensor::Tensor();
    MS_ASSERT(new_tensor);
    if (new_tensor == nullptr) {
      MS_LOG(ERROR) << "SubGraphOpenCLKernel new tensor failed!";
      return RET_ERROR;
    }
    new_tensor->CopyTensor(*in_tensors[i]);
    auto dst_format =
      (mem_type == OpenCLMemType::IMG) ? in_kernels.back()->out_tensors()[0]->GetFormat() : in_tensors[i]->GetFormat();
    auto src_format =
      (mem_type == OpenCLMemType::IMG) ? in_tensors[i]->GetFormat() : in_kernels.front()->out_tensors()[0]->GetFormat();
    // Reorder the shape when converting channel-last -> channel-first.
    if ((dst_format == schema::Format_NCHW || dst_format == schema::Format_NC4HW4) &&
        (src_format == schema::Format_NHWC || src_format == schema::Format_NHWC4)) {
      auto &shape = new_tensor->shape();
      std::vector<int> dst_shape{shape[0], shape[3], shape[1], shape[2]};  // NHWC -> NCHW
      new_tensor->set_shape(dst_shape);
    }
    // Reorder the shape when converting channel-first -> channel-last.
    if ((dst_format == schema::Format_NHWC || dst_format == schema::Format_NHWC4) &&
        (src_format == schema::Format_NCHW || src_format == schema::Format_NC4HW4)) {
      auto &shape = new_tensor->shape();
      std::vector<int> dst_shape{shape[0], shape[2], shape[3], shape[1]};  // NCHW -> NHWC
      new_tensor->set_shape(dst_shape);
    }
    new_tensor->SetFormat(dst_format);
    out_tensors->emplace_back(new_tensor);
#ifdef ENABLE_FP16
    KernelKey desc{kGPU, kNumberTypeFloat16, schema::PrimitiveType_ToFormat};
#else
    KernelKey desc{kGPU, kNumberTypeFloat32, schema::PrimitiveType_ToFormat};
#endif
    OpenCLToFormatParameter *parameter = new (std::nothrow) OpenCLToFormatParameter;
    MS_ASSERT(parameter);
    if (parameter == nullptr) {
      MS_LOG(ERROR) << "SubGraphOpenCLKernel new parameter failed!";
      delete new_tensor;  // don't leak the tensor allocated above
      out_tensors->pop_back();
      return RET_ERROR;
    }
    parameter->src_format = src_format;
    parameter->dst_format = dst_format;
    parameter->out_mem_type = mem_type;
    out_parameters->emplace_back(parameter);
    LiteKernel *in_convert_op = nullptr;
    if (mem_type == OpenCLMemType::IMG) {
      // IMG: external tensor is the op input, new image tensor the output.
      in_convert_op =
        lite::GetOpenCLKernel({in_tensors[i]}, {new_tensor}, reinterpret_cast<OpParameter *>(parameter), nullptr, desc);
    } else {
      // BUF: new kernel-side tensor is the op input, external buffer the output.
      in_convert_op =
        lite::GetOpenCLKernel({new_tensor}, {in_tensors[i]}, reinterpret_cast<OpParameter *>(parameter), nullptr, desc);
    }
    MS_ASSERT(in_convert_op);
    if (in_convert_op == nullptr) {
      MS_LOG(ERROR) << "SubGraphOpenCLKernel create op failed!";
      return RET_ERROR;
    }
    auto in_opencl_op = reinterpret_cast<OpenCLKernel *>(in_convert_op);
    if (mem_type == OpenCLMemType::IMG) {
      // Splice the convert op in front of the consumer kernel.
      in_opencl_op->AddOutKernel(in_kernels[i]);
      reinterpret_cast<OpenCLKernel *>(in_kernels[i])->SetInKernel({in_convert_op});
      reinterpret_cast<OpenCLKernel *>(in_kernels[i])->set_in_tensors({new_tensor});
    } else {
      // Splice the convert op behind the producer kernel.
      reinterpret_cast<OpenCLKernel *>(in_kernels[i])->SetOutKernel({in_convert_op});
      reinterpret_cast<OpenCLKernel *>(in_kernels[i])->set_out_tensors({new_tensor});
      in_convert_op->AddInKernel(in_kernels[i]);
    }
    out_convert_ops->emplace_back(in_convert_op);
  }
  return RET_OK;
}

int SubGraphOpenCLKernel::Init() { int SubGraphOpenCLKernel::Init() {
allocator_ = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator(); allocator_ = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
MS_LOG(DEBUG) << "input num=" << inputs_.size() << ", output num=" << outputs_.size();
for (const auto tensor : inputs_) {
MS_LOG(DEBUG) << "input num=" << in_tensors_.size() << ", output num=" << out_tensors_.size();
for (const auto tensor : in_tensors_) {
tensor->set_allocator(allocator_); tensor->set_allocator(allocator_);
} }
for (const auto tensor : outputs_) {
for (const auto tensor : out_tensors_) {
tensor->set_allocator(allocator_); tensor->set_allocator(allocator_);
} }
int ret = GenToFormatOp(in_tensors_, in_kernels_, &in_convert_tensors_, &in_parameters_, &in_convert_ops_,
OpenCLMemType::IMG);
if (ret != RET_OK) {
return RET_ERROR;
}
nodes_.insert(nodes_.begin(), in_convert_ops_.begin(), in_convert_ops_.end());
ret = GenToFormatOp(out_tensors_, out_kernels_, &out_convert_tensors_, &out_parameters_, &out_convert_ops_,
OpenCLMemType::BUF);
if (ret != RET_OK) {
return RET_ERROR;
}
nodes_.insert(nodes_.end(), out_convert_ops_.begin(), out_convert_ops_.end());

MallocTensorWithReuse();

// Map buffer for write, it is not necessary for fine-grained // Map buffer for write, it is not necessary for fine-grained
for (auto &tensor : inputs_) {
for (auto &tensor : in_tensors_) {
void *data = tensor->Data(); void *data = tensor->Data();
// It is required with coarse-grained SVM // It is required with coarse-grained SVM
if (data != nullptr) { if (data != nullptr) {
@@ -42,44 +137,118 @@ int SubGraphOpenCLKernel::Init() {
MS_LOG(ERROR) << "SubGraphOpenCLKernel input nullptr!"; MS_LOG(ERROR) << "SubGraphOpenCLKernel input nullptr!";
} }
} }
return 0;
return RET_OK;
}
int SubGraphOpenCLKernel::MallocTensorWithReuse() {
kernel::LiteKernelUtil::InitTensorRefCount(nodes_);
for (auto *kernel : nodes_) {
MS_ASSERT(nullptr != kernel);
kernel::OpenCLKernel *op_kernel = reinterpret_cast<kernel::OpenCLKernel *>(kernel);
auto &outputs = kernel->out_tensors();
for (auto i = 0; i < outputs.size(); ++i) {
auto *output = outputs.at(i);
MS_ASSERT(nullptr != output);
if (op_kernel->GetMemType() == OpenCLMemType::IMG) {
std::vector<size_t> img_size;
op_kernel->GetImageSize(i, &img_size);
auto data_ptr = allocator_->Malloc(output->Size(), img_size);
output->SetData(data_ptr);
} else {
output->MallocData(allocator_);
}
output->set_allocator(allocator_);
}
for (auto input_kernel : kernel->in_kernels()) {
MS_EXCEPTION_IF_NULL(input_kernel);
auto ret = input_kernel->DecOutTensorRefCount();
if (0 != ret) {
MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed";
}
}
}
for (auto kernel : out_kernels_) {
MS_EXCEPTION_IF_NULL(kernel);
auto ret = kernel->DecOutTensorRefCount();
if (0 != ret) {
MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed";
}
}
for (auto kernel : in_convert_ops_) {
MS_EXCEPTION_IF_NULL(kernel);
auto ret = kernel->DecOutTensorRefCount();
if (0 != ret) {
MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed";
}
}
for (auto kernel : out_convert_ops_) {
MS_EXCEPTION_IF_NULL(kernel);
auto ret = kernel->DecOutTensorRefCount();
if (0 != ret) {
MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed";
}
}
return RET_OK;
} }


int SubGraphOpenCLKernel::UnInit() { int SubGraphOpenCLKernel::UnInit() {
for (auto &tensor : outputs_) {
for (auto &tensor : out_tensors_) {
allocator_->UnmapBuffer(tensor->Data());
}
for (const auto tensor : in_tensors_) {
if (tensor != nullptr) {
tensor->FreeData();
}
}
for (const auto tensor : out_tensors_) {
if (tensor != nullptr) {
tensor->FreeData();
}
}
for (auto &tensor : out_tensors_) {
allocator_->UnmapBuffer(tensor->Data()); allocator_->UnmapBuffer(tensor->Data());
} }
for (const auto tensor : inputs_) {
for (const auto tensor : in_convert_tensors_) {
if (tensor != nullptr) { if (tensor != nullptr) {
tensor->FreeData(); tensor->FreeData();
delete tensor;
} }
} }
for (const auto tensor : outputs_) {
for (const auto tensor : out_convert_tensors_) {
if (tensor != nullptr) { if (tensor != nullptr) {
tensor->FreeData(); tensor->FreeData();
delete tensor;
} }
} }
return 0;
for (const auto parameter : in_parameters_) {
if (parameter != nullptr) {
delete parameter;
}
}
for (const auto op : in_convert_ops_) {
if (op != nullptr) {
delete op;
}
}
return RET_OK;
} }


int SubGraphOpenCLKernel::InferShape() { return 0; }
int SubGraphOpenCLKernel::InferShape() { return RET_OK; }


int SubGraphOpenCLKernel::ReSize() { return 0; }
int SubGraphOpenCLKernel::ReSize() { return RET_OK; }


int SubGraphOpenCLKernel::Run() { int SubGraphOpenCLKernel::Run() {
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
for (auto &tensor : inputs_) {
for (auto &tensor : in_tensors_) {
allocator_->UnmapBuffer(tensor->Data()); allocator_->UnmapBuffer(tensor->Data());
} }


lite::opencl::OpenCLExecutor executor; lite::opencl::OpenCLExecutor executor;
executor.Run(inputs_, outputs_, nodes_, allocator_);
ocl_runtime->SyncCommandQueue();
for (auto &tensor : outputs_) {
executor.Run(in_tensors_, out_tensors_, nodes_, allocator_);
for (auto &tensor : out_tensors_) {
void *data = allocator_->MapBuffer(tensor->Data(), CL_MAP_READ, nullptr, true); void *data = allocator_->MapBuffer(tensor->Data(), CL_MAP_READ, nullptr, true);
tensor->SetData(data); tensor->SetData(data);
} }
return 0;
return RET_OK;
} }


} // namespace mindspore::kernel } // namespace mindspore::kernel

+ 15
- 2
mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h View File

@@ -36,7 +36,7 @@ class SubGraphOpenCLKernel : public SubGraphKernel {
const std::vector<kernel::LiteKernel *> inKernels, const std::vector<kernel::LiteKernel *> inKernels,
const std::vector<kernel::LiteKernel *> outKernels, const std::vector<kernel::LiteKernel *> outKernels,
const std::vector<kernel::LiteKernel *> nodes) const std::vector<kernel::LiteKernel *> nodes)
: SubGraphKernel(inputs, outputs, inKernels, outKernels, nodes, nullptr, nullptr) {}
: SubGraphKernel(inputs, outputs, inKernels, outKernels, nodes, nullptr, nullptr) {}
~SubGraphOpenCLKernel() override; ~SubGraphOpenCLKernel() override;


int Init() override; int Init() override;
@@ -45,11 +45,24 @@ class SubGraphOpenCLKernel : public SubGraphKernel {
int Run() override; int Run() override;
int UnInit(); int UnInit();


protected:
int MallocTensorWithReuse();
int GenToFormatOp(const std::vector<lite::tensor::Tensor *> &in_tensors,
const std::vector<kernel::LiteKernel *> in_kernels,
std::vector<lite::tensor::Tensor *> *out_tensors,
std::vector<OpenCLToFormatParameter *> *out_parameters, std::vector<LiteKernel *> *out_convert_ops,
OpenCLMemType mem_type);

private: private:
SubGraphOpenCLParameter *subgraph_ocl_parameter_; SubGraphOpenCLParameter *subgraph_ocl_parameter_;
lite::opencl::OpenCLAllocator *allocator_; lite::opencl::OpenCLAllocator *allocator_;
std::vector<lite::tensor::Tensor *> in_convert_tensors_;
std::vector<lite::tensor::Tensor *> out_convert_tensors_;
std::vector<OpenCLToFormatParameter *> in_parameters_;
std::vector<OpenCLToFormatParameter *> out_parameters_;
std::vector<LiteKernel *> in_convert_ops_;
std::vector<LiteKernel *> out_convert_ops_;
}; };
} // namespace mindspore::kernel } // namespace mindspore::kernel


#endif // MINDSPORE_LITE_SRC_BACKEND_OPENCL_SUBGRAPH_OPENCL_KERNEL_H_ #endif // MINDSPORE_LITE_SRC_BACKEND_OPENCL_SUBGRAPH_OPENCL_KERNEL_H_


+ 16
- 0
mindspore/lite/src/runtime/kernel/opencl/utils.cc View File

@@ -18,6 +18,22 @@
#include <algorithm> #include <algorithm>
#include <string> #include <string>
#include <vector> #include <vector>
#include "src/kernel_registry.h"

using mindspore::lite::KernelRegistrar;

namespace mindspore::lite {
kernel::LiteKernel *GetOpenCLKernel(const std::vector<tensor::Tensor *> &in_tensors,
const std::vector<tensor::Tensor *> &out_tensors, OpParameter *parameter,
const Context *ctx, const kernel::KernelKey &key) {
auto creator = KernelRegistry::GetInstance()->GetCreator(key);
if (creator != nullptr) {
auto kernel = creator(in_tensors, out_tensors, parameter, nullptr, key, nullptr);
return kernel;
}
return nullptr;
}
} // namespace mindspore::lite


namespace mindspore { namespace mindspore {
namespace kernel { namespace kernel {


+ 7
- 0
mindspore/lite/src/runtime/kernel/opencl/utils.h View File

@@ -22,6 +22,13 @@
#include "CL/cl2.hpp" #include "CL/cl2.hpp"
#include "utils/log_adapter.h" #include "utils/log_adapter.h"
#include "src/runtime/kernel/arm/nnacl/op_base.h" #include "src/runtime/kernel/arm/nnacl/op_base.h"
#include "src/lite_kernel.h"

namespace mindspore::lite {
kernel::LiteKernel *GetOpenCLKernel(const std::vector<tensor::Tensor *> &in_tensors,
const std::vector<tensor::Tensor *> &out_tensors, OpParameter *parameter,
const Context *ctx, const kernel::KernelKey &key);
}


namespace mindspore::kernel { namespace mindspore::kernel {




+ 9
- 4
mindspore/lite/src/runtime/opencl/opencl_allocator.cc View File

@@ -16,10 +16,10 @@


#include "src/runtime/opencl/opencl_allocator.h" #include "src/runtime/opencl/opencl_allocator.h"
#include <utility> #include <utility>
#include "utils/log_adapter.h"
#include "src/runtime/opencl/opencl_runtime.h" #include "src/runtime/opencl/opencl_runtime.h"
#include "include/errorcode.h"
#include "src/runtime/kernel/opencl/utils.h" #include "src/runtime/kernel/opencl/utils.h"
#include "utils/log_adapter.h"
#include "include/errorcode.h"


namespace mindspore::lite::opencl { namespace mindspore::lite::opencl {


@@ -61,7 +61,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size)
} }
Lock(); Lock();
auto iter = free_list_.lower_bound(size); auto iter = free_list_.lower_bound(size);
if (iter != free_list_.end() && (iter->second->size_ >= size) && (iter->second->size_ < (size << shift_factor_))) {
while (iter != free_list_.end() && (iter->second->size_ >= size) && (iter->second->size_ < (size << shift_factor_))) {
auto mem_buf = iter->second; auto mem_buf = iter->second;
bool is_match{mem_buf->img_size.size() == img_size.size()}; bool is_match{mem_buf->img_size.size() == img_size.size()};
for (int i = 0; i < img_size.size() && is_match; ++i) { for (int i = 0; i < img_size.size() && is_match; ++i) {
@@ -75,6 +75,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size)
<< ", host addr: " << mem_buf->host_ptr_ << ", device addr: " << mem_buf->device_ptr_; << ", host addr: " << mem_buf->host_ptr_ << ", device addr: " << mem_buf->device_ptr_;
return mem_buf->host_ptr_; return mem_buf->host_ptr_;
} }
++iter;
} }
void *host_ptr = nullptr; void *host_ptr = nullptr;
void *device_ptr = nullptr; void *device_ptr = nullptr;
@@ -136,7 +137,7 @@ void *OpenCLAllocator::CreateImageFromHost(void *data, size_t size, const std::v
auto ocl_runtime = opencl::OpenCLRuntime::GetInstance(); auto ocl_runtime = opencl::OpenCLRuntime::GetInstance();
Lock(); Lock();
auto iter = free_list_.lower_bound(size); auto iter = free_list_.lower_bound(size);
if (iter != free_list_.end() && (iter->second->size_ >= size) && (iter->second->size_ < (size << shift_factor_))) {
while (iter != free_list_.end() && (iter->second->size_ >= size) && (iter->second->size_ < (size << shift_factor_))) {
auto mem_buf = iter->second; auto mem_buf = iter->second;
bool is_match{mem_buf->img_size.size() == img_size.size()}; bool is_match{mem_buf->img_size.size() == img_size.size()};
for (int i = 0; i < img_size.size() && is_match; ++i) { for (int i = 0; i < img_size.size() && is_match; ++i) {
@@ -150,6 +151,7 @@ void *OpenCLAllocator::CreateImageFromHost(void *data, size_t size, const std::v
<< ", host addr: " << mem_buf->host_ptr_ << ", device addr: " << mem_buf->device_ptr_; << ", host addr: " << mem_buf->host_ptr_ << ", device addr: " << mem_buf->device_ptr_;
return mem_buf->host_ptr_; return mem_buf->host_ptr_;
} }
++iter;
} }
void *host_ptr = nullptr; void *host_ptr = nullptr;
void *device_ptr = nullptr; void *device_ptr = nullptr;
@@ -198,10 +200,13 @@ void OpenCLAllocator::Free(void *buf) {
allocated_list_.erase(iter); allocated_list_.erase(iter);
free_list_.insert(std::make_pair(mem_buf->size_, mem_buf)); free_list_.insert(std::make_pair(mem_buf->size_, mem_buf));
UnLock(); UnLock();
MS_LOG(DEBUG) << "Free a new Image2D. size: " << mem_buf->size_ << ", host addr: " << mem_buf->host_ptr_
<< ", device addr: " << mem_buf->device_ptr_ << ", image addr: " << mem_buf->image_ptr_;
return; return;
} }
UnLock(); UnLock();
free(buf); free(buf);
MS_LOG(DEBUG) << "Free host ptr: " << buf;
} }


size_t OpenCLAllocator::GetTotalSize() { size_t OpenCLAllocator::GetTotalSize() {


+ 25
- 54
mindspore/lite/src/runtime/opencl/opencl_executor.cc View File

@@ -24,41 +24,9 @@ namespace mindspore::lite::opencl {
int OpenCLExecutor::Run(std::vector<tensor::Tensor *> &inputs, std::vector<tensor::Tensor *> &outputs, int OpenCLExecutor::Run(std::vector<tensor::Tensor *> &inputs, std::vector<tensor::Tensor *> &outputs,
std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator, std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator,
const session::KernelCallBack &before, const session::KernelCallBack &after) { const session::KernelCallBack &before, const session::KernelCallBack &after) {
MS_ASSERT(nullptr != allocator);
for (auto &inTensor : inputs) {
if (inTensor == nullptr) {
MS_LOG(ERROR) << "Graph input tensor is nullptr";
return RET_ERROR;
}
if (inTensor->GetFormat() != schema::Format_NHWC4 && inTensor->GetFormat() != schema::Format_NC4HW4 &&
inTensor->GetFormat() != schema::Format_NHWC) {
MS_LOG(ERROR) << "input should be NHWC/NHWC4/NC4HW4, actual is " << schema::EnumNameFormat(inTensor->GetFormat());
return RET_ERROR;
} else {
TransformTensorLayout(inTensor, inTensor->GetFormat(), schema::Format_NHWC4, true);
// TransformTensorLayout(inTensor, inTensor->GetFormat(), schema::Format_NC4HW4, true);
}
}
kernel::LiteKernelUtil::InitTensorRefCount(kernels); kernel::LiteKernelUtil::InitTensorRefCount(kernels);
OpenCLAllocator* op_allocator = reinterpret_cast<OpenCLAllocator*>(allocator);
for (auto *kernel : kernels) { for (auto *kernel : kernels) {
MS_ASSERT(nullptr != kernel); MS_ASSERT(nullptr != kernel);
kernel::OpenCLKernel *op_kernel = reinterpret_cast<kernel::OpenCLKernel*>(kernel);
auto &outputs = kernel->out_tensors();
for (auto i = 0; i < outputs.size(); ++i) {
auto *output = outputs.at(i);
MS_ASSERT(nullptr != output);
if (is_image2d_out_) {
std::vector<size_t> img_size;
op_kernel->GetImageSize(i, &img_size);
auto data_ptr = op_allocator->Malloc(output->Size(), img_size);

output->SetData(data_ptr);
} else {
output->MallocData(allocator);
}
output->set_allocator(allocator);
}
session::CallBackParam callbackParam; session::CallBackParam callbackParam;
callbackParam.name_callback_param = kernel->name(); callbackParam.name_callback_param = kernel->name();


@@ -67,6 +35,21 @@ int OpenCLExecutor::Run(std::vector<tensor::Tensor *> &inputs, std::vector<tenso
MS_LOG(ERROR) << "run kernel before_callback failed, name: " << kernel->name(); MS_LOG(ERROR) << "run kernel before_callback failed, name: " << kernel->name();
} }
} }
kernel::OpenCLKernel *op_kernel = reinterpret_cast<kernel::OpenCLKernel *>(kernel);
auto &cur_outputs = kernel->out_tensors();
for (auto i = 0; i < cur_outputs.size(); ++i) {
auto *output = cur_outputs.at(i);
MS_ASSERT(nullptr != output);
if (op_kernel->GetMemType() == kernel::OpenCLMemType::IMG) {
std::vector<size_t> img_size;
op_kernel->GetImageSize(i, &img_size);
auto data_ptr = allocator_->Malloc(output->Size(), img_size);
output->SetData(data_ptr);
} else {
output->MallocData(allocator_);
}
}

auto ret = kernel->Run(); auto ret = kernel->Run();
if (0 != ret) { if (0 != ret) {
MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name(); MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name();
@@ -86,21 +69,11 @@ int OpenCLExecutor::Run(std::vector<tensor::Tensor *> &inputs, std::vector<tenso
} }
} }
} }
// output format transform
for (auto &outTensor : outputs) {
if (outTensor == nullptr) {
MS_LOG(ERROR) << "Graph output tensor is nullptr";
return RET_ERROR;
}
if (outTensor->GetFormat() != schema::Format_NHWC) {
TransformTensorLayout(outTensor, outTensor->GetFormat(), schema::Format_NHWC, false);
}
}
return RET_OK; return RET_OK;
} }


int OpenCLExecutor::TransformTensorLayout(tensor::Tensor *tensor, schema::Format src_format,
schema::Format dst_format, bool trans_dir) {
int OpenCLExecutor::TransformTensorLayout(tensor::Tensor *tensor, schema::Format src_format, schema::Format dst_format,
bool trans_dir) {
MS_ASSERT(nullptr != tensor); MS_ASSERT(nullptr != tensor);
MS_ASSERT(4 == tensor->shape().size()); MS_ASSERT(4 == tensor->shape().size());
auto data_type = tensor->data_type(); auto data_type = tensor->data_type();
@@ -114,11 +87,10 @@ int OpenCLExecutor::TransformTensorLayout(tensor::Tensor *tensor, schema::Format
<< schema::EnumNameFormat(dst_format); << schema::EnumNameFormat(dst_format);
return RET_ERROR; return RET_ERROR;
} }
return RET_OK;
} }


int OpenCLExecutor::TransformTensorLayoutFp32(tensor::Tensor *tensor, schema::Format src_format, int OpenCLExecutor::TransformTensorLayoutFp32(tensor::Tensor *tensor, schema::Format src_format,
schema::Format dst_format, bool trans_dir) {
schema::Format dst_format, bool trans_dir) {
MS_ASSERT(nullptr != tensor); MS_ASSERT(nullptr != tensor);
MS_ASSERT(nullptr != allocator_); MS_ASSERT(nullptr != allocator_);
MS_ASSERT(4 == tensor->shape().size()); MS_ASSERT(4 == tensor->shape().size());
@@ -138,11 +110,11 @@ int OpenCLExecutor::TransformTensorLayoutFp32(tensor::Tensor *tensor, schema::Fo
} }


int OpenCLExecutor::TransformTensorLayoutToBuffer(tensor::Tensor *tensor, schema::Format src_format, int OpenCLExecutor::TransformTensorLayoutToBuffer(tensor::Tensor *tensor, schema::Format src_format,
schema::Format dst_format) {
schema::Format dst_format) {
if (dst_format == schema::Format_NHWC4) { if (dst_format == schema::Format_NHWC4) {
auto *src_data = tensor->Data(); auto *src_data = tensor->Data();
size_t C4 = UP_DIV(tensor->Channel(), C4NUM); size_t C4 = UP_DIV(tensor->Channel(), C4NUM);
std::vector <size_t> img_size{tensor->Width() * C4, (size_t) tensor->Height(), CL_FLOAT};
std::vector<size_t> img_size{tensor->Width() * C4, (size_t)tensor->Height(), CL_FLOAT};
if (src_format == schema::Format_NHWC) { if (src_format == schema::Format_NHWC) {
auto *dst_data = allocator_->Malloc(tensor->Size(), img_size); auto *dst_data = allocator_->Malloc(tensor->Size(), img_size);
if (dst_data == nullptr) { if (dst_data == nullptr) {
@@ -168,7 +140,7 @@ int OpenCLExecutor::TransformTensorLayoutToBuffer(tensor::Tensor *tensor, schema
} }


int OpenCLExecutor::TransformTensorLayoutToImage(tensor::Tensor *tensor, schema::Format src_format, int OpenCLExecutor::TransformTensorLayoutToImage(tensor::Tensor *tensor, schema::Format src_format,
schema::Format dst_format) {
schema::Format dst_format) {
if (dst_format == schema::Format_NHWC4) { if (dst_format == schema::Format_NHWC4) {
tensor->SetFormat(schema::Format_NHWC4); tensor->SetFormat(schema::Format_NHWC4);
// convert to nhwc4 // convert to nhwc4
@@ -202,15 +174,15 @@ int OpenCLExecutor::TransformTensorLayoutToImage(tensor::Tensor *tensor, schema:
} }


int OpenCLExecutor::TransformTensorLayoutFromImage(tensor::Tensor *tensor, schema::Format src_format, int OpenCLExecutor::TransformTensorLayoutFromImage(tensor::Tensor *tensor, schema::Format src_format,
schema::Format dst_format) {
schema::Format dst_format) {
if (dst_format == schema::Format_NHWC) { if (dst_format == schema::Format_NHWC) {
auto src_data = tensor->Data(); auto src_data = tensor->Data();
auto dst_data = allocator_->Malloc(tensor->Size()); auto dst_data = allocator_->Malloc(tensor->Size());
cl::Image2D *out_mem = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data)); cl::Image2D *out_mem = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data));
std::vector<size_t> img_size; std::vector<size_t> img_size;
allocator_->GetImageSize(src_data, &img_size); allocator_->GetImageSize(src_data, &img_size);
auto origin = cl::array < cl::size_type, 3U > {0, 0, 0};
auto region = cl::array < cl::size_type, 3U > {img_size[0], img_size[1], 1};
auto origin = cl::array<cl::size_type, 3U>{0, 0, 0};
auto region = cl::array<cl::size_type, 3U>{img_size[0], img_size[1], 1};
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
ocl_runtime->GetDefaultCommandQueue()->enqueueReadImage(*out_mem, CL_TRUE, origin, region, 0, 0, dst_data); ocl_runtime->GetDefaultCommandQueue()->enqueueReadImage(*out_mem, CL_TRUE, origin, region, 0, 0, dst_data);
tensor->SetData(dst_data); tensor->SetData(dst_data);
@@ -224,7 +196,7 @@ int OpenCLExecutor::TransformTensorLayoutFromImage(tensor::Tensor *tensor, schem
} }


int OpenCLExecutor::TransformTensorLayoutUint8(tensor::Tensor *tensor, schema::Format src_format, int OpenCLExecutor::TransformTensorLayoutUint8(tensor::Tensor *tensor, schema::Format src_format,
schema::Format dst_format, bool is_image) {
schema::Format dst_format, bool is_image) {
MS_ASSERT(nullptr != tensor); MS_ASSERT(nullptr != tensor);
MS_ASSERT(4 == tensor->shape().size()); MS_ASSERT(4 == tensor->shape().size());
// auto src_format = tensor->GetFormat(); // auto src_format = tensor->GetFormat();
@@ -234,4 +206,3 @@ int OpenCLExecutor::TransformTensorLayoutUint8(tensor::Tensor *tensor, schema::F
return RET_ERROR; return RET_ERROR;
} }
} // namespace mindspore::lite::opencl } // namespace mindspore::lite::opencl


+ 3
- 0
mindspore/lite/test/CMakeLists.txt View File

@@ -148,6 +148,7 @@ if (SUPPORT_GPU)
${LITE_DIR}/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc ${LITE_DIR}/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
${LITE_DIR}/src/runtime/kernel/opencl/kernel/transpose.cc ${LITE_DIR}/src/runtime/kernel/opencl/kernel/transpose.cc
${LITE_DIR}/src/runtime/kernel/opencl/kernel/reshape.cc ${LITE_DIR}/src/runtime/kernel/opencl/kernel/reshape.cc
${LITE_DIR}/src/runtime/kernel/opencl/kernel/to_format.cc
) )
endif() endif()
### minddata lite ### minddata lite
@@ -323,6 +324,8 @@ if (SUPPORT_GPU)
${TEST_DIR}/ut/src/runtime/kernel/opencl/transpose_tests.cc ${TEST_DIR}/ut/src/runtime/kernel/opencl/transpose_tests.cc
${TEST_DIR}/ut/src/runtime/kernel/opencl/convolution_tests.cc ${TEST_DIR}/ut/src/runtime/kernel/opencl/convolution_tests.cc
${TEST_DIR}/ut/src/runtime/kernel/opencl/activation_tests.cc ${TEST_DIR}/ut/src/runtime/kernel/opencl/activation_tests.cc
#${TEST_DIR}/ut/src/runtime/kernel/opencl/leakyrelu_tests.cc
${TEST_DIR}/ut/src/runtime/kernel/opencl/to_format_tests.cc
) )
endif() endif()




+ 79
- 0
mindspore/lite/test/ut/src/runtime/kernel/opencl/to_format_tests.cc View File

@@ -0,0 +1,79 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <iostream>
#include <memory>
#include "mindspore/core/utils/log_adapter.h"
#include "common/common_test.h"
#include "mindspore/lite/src/common/file_utils.h"
#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
#include "mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h"

namespace mindspore {
class TestToFormatOpenCL : public mindspore::CommonTest {
public:
TestToFormatOpenCL() {}
};

TEST_F(TestToFormatOpenCL, TransposeFp32) {
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
ocl_runtime->Init();
auto allocator = ocl_runtime->GetAllocator();
int h = 64;
int w = 1;
int c = 7360;
size_t input_size;
std::string input_path = "./test_data/transpose/transpose_fp32_input.bin";
auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));

lite::tensor::Tensor *tensor_x =
new lite::tensor::Tensor(TypeId(kNumberTypeFloat32), {1, h, w, c}, schema::Format_NHWC4);

lite::tensor::Tensor *tensor_out = new lite::tensor::Tensor(TypeId(kNumberTypeFloat32), {1, c, h, w});
std::vector<lite::tensor::Tensor *> inputs{tensor_x};
std::vector<lite::tensor::Tensor *> outputs{tensor_out};
auto *arith_kernel = new kernel::TransposeOpenCLKernel(nullptr, inputs, outputs);
arith_kernel->Init();

inputs[0]->MallocData(allocator);

std::vector<kernel::LiteKernel *> kernels{arith_kernel};
auto *pGraph = new kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
pGraph->Init();
memcpy(inputs[0]->Data(), input_data, input_size);
pGraph->Run();

size_t output_size;
std::string output_path = "./test_data/transpose/transpose_fp32_output.bin";
auto correct_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(output_path.c_str(), &output_size));
printf("==================output data=================\n");
float *output_data = reinterpret_cast<float *>(tensor_out->Data());
std::cout << std::endl;
int size_n = h * w * c;
size_n = size_n > 100 ? 100 : size_n;
for (int i = 0; i < size_n; i++) {
std::cout << output_data[i] << " ";
if ((i + 1) % c == 0) {
std::cout << std::endl;
}
}
std::cout << std::endl;

// compare
CompareOutputData(output_data, correct_data, h * w * c, 0.00001);
MS_LOG(INFO) << "TestMatMulFp32 passed";
}
} // namespace mindspore

Loading…
Cancel
Save