|
|
@@ -14,16 +14,12 @@ |
|
|
* limitations under the License. |
|
|
* limitations under the License. |
|
|
*/ |
|
|
*/ |
|
|
|
|
|
|
|
|
#include "src/runtime/kernel/opencl/kernel/convolution.h" |
|
|
|
|
|
#include <vector> |
|
|
|
|
|
#include <string> |
|
|
#include <string> |
|
|
#include <set> |
|
|
#include <set> |
|
|
#include "schema/model_generated.h" |
|
|
|
|
|
#include "src/kernel_registry.h" |
|
|
|
|
|
#include "src/runtime/opencl/opencl_runtime.h" |
|
|
|
|
|
#ifndef PROGRAM_WITH_IL |
|
|
|
|
|
|
|
|
#include <algorithm> |
|
|
|
|
|
#include "src/runtime/kernel/opencl/kernel/convolution.h" |
|
|
#include "src/runtime/kernel/opencl/cl/fp32/convolution.cl.inc" |
|
|
#include "src/runtime/kernel/opencl/cl/fp32/convolution.cl.inc" |
|
|
#endif |
|
|
|
|
|
|
|
|
#include "src/kernel_registry.h" |
|
|
|
|
|
|
|
|
using mindspore::kernel::KERNEL_ARCH::kGPU; |
|
|
using mindspore::kernel::KERNEL_ARCH::kGPU; |
|
|
using mindspore::lite::KernelRegistrar; |
|
|
using mindspore::lite::KernelRegistrar; |
|
|
@@ -38,27 +34,27 @@ int ConvolutionOpenCLKernel::Init() { |
|
|
MS_LOG(ERROR) << "ConvolutionOpenCLKernel only support Batch=1!"; |
|
|
MS_LOG(ERROR) << "ConvolutionOpenCLKernel only support Batch=1!"; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
outputs_[0]->SetFormat(schema::Format_NHWC4); |
|
|
|
|
|
io_dataformat_ = outputs_[0]->GetFormat(); |
|
|
|
|
|
|
|
|
auto io_NHWC = inputs_[0]->GetFormat() == schema::Format_NHWC && outputs_[0]->GetFormat() == schema::Format_NHWC; |
|
|
|
|
|
auto io_NHWC4 = inputs_[0]->GetFormat() == schema::Format_NHWC4 && outputs_[0]->GetFormat() == schema::Format_NHWC4; |
|
|
|
|
|
if (!io_NHWC && !io_NHWC4) { |
|
|
|
|
|
MS_LOG(ERROR) << "input and output data_format is invalid!"; |
|
|
|
|
|
} |
|
|
|
|
|
io_dataformat_ = inputs_[0]->GetFormat(); |
|
|
|
|
|
|
|
|
if (inputs_[1]->GetFormat() != schema::Format_KHWC) { |
|
|
if (inputs_[1]->GetFormat() != schema::Format_KHWC) { |
|
|
MS_LOG(ERROR) << "weight data_format is invalid!"; |
|
|
MS_LOG(ERROR) << "weight data_format is invalid!"; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); |
|
|
|
|
|
std::string kernel_name = "convolution_NHWC_OHWI"; |
|
|
|
|
|
#ifdef PROGRAM_WITH_IL |
|
|
|
|
|
ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name); |
|
|
|
|
|
#else |
|
|
|
|
|
std::set<std::string> build_options; |
|
|
std::set<std::string> build_options; |
|
|
std::string source = convolution_source_fp32; |
|
|
std::string source = convolution_source_fp32; |
|
|
std::string program_name = "convolution"; |
|
|
std::string program_name = "convolution"; |
|
|
|
|
|
std::string kernel_name = io_NHWC4 ? "convolution_NHWC4_OHWIIO_float8" : "convolution_NHWC_OHWI"; |
|
|
|
|
|
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); |
|
|
|
|
|
|
|
|
ocl_runtime->LoadSource(program_name, source); |
|
|
ocl_runtime->LoadSource(program_name, source); |
|
|
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); |
|
|
ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); |
|
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
this->InitBuffer(); |
|
|
this->InitBuffer(); |
|
|
MS_LOG(DEBUG) << kernel_name << " Init Done!"; |
|
|
|
|
|
|
|
|
|
|
|
return 0; |
|
|
return 0; |
|
|
} |
|
|
} |
|
|
int ConvolutionOpenCLKernel::InitBuffer() { |
|
|
int ConvolutionOpenCLKernel::InitBuffer() { |
|
|
@@ -78,35 +74,41 @@ int ConvolutionOpenCLKernel::InitBuffer() { |
|
|
memcpy_s(packed_bias_, bias_tensor->Size(), bias_tensor->Data(), bias_tensor->Size()); |
|
|
memcpy_s(packed_bias_, bias_tensor->Size(), bias_tensor->Data(), bias_tensor->Size()); |
|
|
allocator->UnmapBuffer(packed_bias_); |
|
|
allocator->UnmapBuffer(packed_bias_); |
|
|
} else if (io_dataformat_ == schema::Format_NHWC4) { |
|
|
} else if (io_dataformat_ == schema::Format_NHWC4) { |
|
|
|
|
|
// OHWI -> OHWIIO |
|
|
auto weight_shape = weight_tensor->shape(); |
|
|
auto weight_shape = weight_tensor->shape(); |
|
|
size_t CO = weight_shape[0]; |
|
|
size_t CO = weight_shape[0]; |
|
|
size_t KH = weight_shape[1]; |
|
|
size_t KH = weight_shape[1]; |
|
|
size_t KW = weight_shape[2]; |
|
|
size_t KW = weight_shape[2]; |
|
|
size_t CI = weight_shape[3]; |
|
|
size_t CI = weight_shape[3]; |
|
|
size_t CI_ALIGN = UP_DIV(CI, C4NUM) * C4NUM; |
|
|
|
|
|
size_t CO_ALIGN = UP_DIV(CO, C4NUM) * C4NUM; |
|
|
|
|
|
size_t weight_size_tiled = CO_ALIGN * KH * KW * CI_ALIGN * sizeof(float); |
|
|
|
|
|
|
|
|
size_t CI_SLICES = UP_DIV(CI, C4NUM); |
|
|
|
|
|
size_t CO_SLICES = UP_DIV(CO, C4NUM); |
|
|
|
|
|
constexpr size_t CI_TILE = C4NUM; |
|
|
|
|
|
constexpr size_t CO_TILE = C4NUM; |
|
|
|
|
|
size_t packed_weight_size = CO_SLICES * KH * KW * CI_SLICES * CI_TILE * CO_TILE * sizeof(float); |
|
|
|
|
|
|
|
|
packed_weight_ = reinterpret_cast<float *>(allocator->Malloc(weight_size_tiled)); |
|
|
|
|
|
|
|
|
packed_weight_ = reinterpret_cast<float *>(allocator->Malloc(packed_weight_size)); |
|
|
packed_weight_ = reinterpret_cast<float *>(allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true)); |
|
|
packed_weight_ = reinterpret_cast<float *>(allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true)); |
|
|
memset_s(packed_weight_, weight_size_tiled, 0x00, weight_size_tiled); |
|
|
|
|
|
|
|
|
memset_s(packed_weight_, packed_weight_size, 0x00, packed_weight_size); |
|
|
auto weight_data = reinterpret_cast<float *>(weight_tensor->Data()); |
|
|
auto weight_data = reinterpret_cast<float *>(weight_tensor->Data()); |
|
|
for (int co = 0; co < CO; ++co) { |
|
|
for (int co = 0; co < CO; ++co) { |
|
|
for (int kh = 0; kh < KH; ++kh) { |
|
|
for (int kh = 0; kh < KH; ++kh) { |
|
|
for (int kw = 0; kw < KW; ++kw) { |
|
|
for (int kw = 0; kw < KW; ++kw) { |
|
|
for (int ci = 0; ci < CI; ++ci) { |
|
|
for (int ci = 0; ci < CI; ++ci) { |
|
|
packed_weight_[co * KH * KW * CI_ALIGN + kh * KW * CI_ALIGN + kw * CI_ALIGN + ci] = |
|
|
|
|
|
weight_data[co * KH * KW * CI + kh * KW * CI + kw * CI + ci]; |
|
|
|
|
|
|
|
|
auto co_outer = co / CO_TILE; |
|
|
|
|
|
auto co_inner = co % CO_TILE; |
|
|
|
|
|
auto ci_outer = ci / CI_TILE; |
|
|
|
|
|
auto ci_inner = ci % CI_TILE; |
|
|
|
|
|
packed_weight_[((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * CI_TILE + ci_inner) * CO_TILE + |
|
|
|
|
|
co_inner] = *(weight_data++); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
allocator->UnmapBuffer(packed_weight_); |
|
|
allocator->UnmapBuffer(packed_weight_); |
|
|
|
|
|
|
|
|
size_t bias_size_tiled = CO_ALIGN * sizeof(float); |
|
|
|
|
|
packed_bias_ = reinterpret_cast<float *>(allocator->Malloc(bias_size_tiled)); |
|
|
|
|
|
|
|
|
size_t packed_bias_size = CO_SLICES * CO_TILE * sizeof(float); |
|
|
|
|
|
packed_bias_ = reinterpret_cast<float *>(allocator->Malloc(packed_bias_size)); |
|
|
packed_bias_ = reinterpret_cast<float *>(allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true)); |
|
|
packed_bias_ = reinterpret_cast<float *>(allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true)); |
|
|
memset_s(packed_bias_, bias_size_tiled, 0x00, bias_size_tiled); |
|
|
|
|
|
|
|
|
memset_s(packed_bias_, packed_bias_size, 0x00, packed_bias_size); |
|
|
auto bias_data = reinterpret_cast<float *>(bias_tensor->Data()); |
|
|
auto bias_data = reinterpret_cast<float *>(bias_tensor->Data()); |
|
|
for (int co = 0; co < CO; ++co) { |
|
|
for (int co = 0; co < CO; ++co) { |
|
|
packed_bias_[co] = bias_data[co]; |
|
|
packed_bias_[co] = bias_data[co]; |
|
|
@@ -115,47 +117,80 @@ int ConvolutionOpenCLKernel::InitBuffer() { |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
return 0; |
|
|
return 0; |
|
|
} |
|
|
|
|
|
|
|
|
} // namespace mindspore::kernel |
|
|
|
|
|
|
|
|
// ReSize performs no work in this kernel and unconditionally returns 0.
int ConvolutionOpenCLKernel::ReSize() { return 0; } |
|
|
int ConvolutionOpenCLKernel::ReSize() { return 0; } |
|
|
|
|
|
|
|
|
|
|
|
// Returns the largest integer i in [1, y] such that x % i == 0.
//
// x: value to be divided. When x is 0 every candidate divides it, so y itself
//    is returned (for y > 0).
// y: upper bound for the divider that is searched for.
//
// Falls back to 1 when no candidate in [2, y] divides x, and also when y is
// not positive. The search only visits positive candidates, so a negative y
// can neither yield a negative result nor run the counter down past INT_MIN.
static int GetBiggestDivider(int x, int y) {
  for (int i = y; i > 0; --i) {
    if (x % i == 0) {
      return i;
    }
  }
  return 1;
}
|
|
|
|
|
|
|
|
|
|
|
static void GetLocalSize(const ConvParameter *param, std::vector<size_t> *global, std::vector<size_t> *local) { |
|
|
|
|
|
constexpr size_t work_group_size[] = {4, 4, 1}; |
|
|
|
|
|
constexpr size_t max_work_item_sizes[] = {512, 512, 512}; |
|
|
|
|
|
constexpr size_t max_work_group_size = 512; |
|
|
|
|
|
const size_t max_z_size = std::min<size_t>(16, max_work_item_sizes[2]); |
|
|
|
|
|
|
|
|
|
|
|
// 先用OH OW CO_SLICES初始化global,并且441对齐 |
|
|
|
|
|
size_t global_h = UP_DIV(param->output_h_, work_group_size[0]) * work_group_size[0]; |
|
|
|
|
|
size_t global_w = UP_DIV(param->output_w_, work_group_size[1]) * work_group_size[1]; |
|
|
|
|
|
size_t global_c = UP_DIV(UP_DIV(param->output_channel_, C4NUM), work_group_size[2]) * work_group_size[2]; |
|
|
|
|
|
|
|
|
|
|
|
// 使用策略计算local |
|
|
|
|
|
size_t local_c = GetBiggestDivider(global_c, max_z_size); |
|
|
|
|
|
size_t local_hw_size = std::min<size_t>(256, max_work_group_size) / local_c; |
|
|
|
|
|
size_t local_w = std::min(global_w, local_hw_size); |
|
|
|
|
|
size_t local_h = std::min(local_hw_size / local_w, global_h); |
|
|
|
|
|
if (local_h == global_h && global_h % 2 == 0) { |
|
|
|
|
|
local_h = global_h / 2; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
global->clear(); |
|
|
|
|
|
global->push_back(UP_DIV(param->output_h_, local_h) * local_h); |
|
|
|
|
|
global->push_back(UP_DIV(param->output_w_, local_w) * local_w); |
|
|
|
|
|
global->push_back(UP_DIV(UP_DIV(param->output_channel_, C4NUM), local_c) * local_c); |
|
|
|
|
|
local->clear(); |
|
|
|
|
|
local->push_back(local_h); |
|
|
|
|
|
local->push_back(local_w); |
|
|
|
|
|
local->push_back(local_c); |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
int ConvolutionOpenCLKernel::Run() { |
|
|
int ConvolutionOpenCLKernel::Run() { |
|
|
MS_LOG(DEBUG) << this->Name() << " Running!"; |
|
|
|
|
|
|
|
|
MS_LOG(INFO) << "ConvolutionOpenCLKernel::Run()"; |
|
|
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); |
|
|
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); |
|
|
|
|
|
|
|
|
auto param = reinterpret_cast<ConvParameter *>(opParameter); |
|
|
auto param = reinterpret_cast<ConvParameter *>(opParameter); |
|
|
auto input0_shape = inputs_[0]->shape(); // NHWC |
|
|
auto input0_shape = inputs_[0]->shape(); // NHWC |
|
|
auto input1_shape = inputs_[1]->shape(); // OHWI |
|
|
auto input1_shape = inputs_[1]->shape(); // OHWI |
|
|
auto outpu0_shape = outputs_[0]->shape(); // NHWC |
|
|
auto outpu0_shape = outputs_[0]->shape(); // NHWC |
|
|
cl_uint N = input0_shape[0]; |
|
|
|
|
|
cl_uint CI = input0_shape[3]; |
|
|
|
|
|
cl_uint IH = input0_shape[1]; |
|
|
|
|
|
cl_uint IW = input0_shape[2]; |
|
|
|
|
|
cl_uint CO = outpu0_shape[3]; |
|
|
|
|
|
cl_uint OH = outpu0_shape[1]; |
|
|
|
|
|
cl_uint OW = outpu0_shape[2]; |
|
|
|
|
|
cl_uint KH = input1_shape[1]; |
|
|
|
|
|
cl_uint KW = input1_shape[2]; |
|
|
|
|
|
cl_uint CI_TILE_NUM = UP_DIV(CI, C4NUM); |
|
|
|
|
|
cl_uint CO_TILE_NUM = UP_DIV(CO, C4NUM); |
|
|
|
|
|
cl_uint CI_ALIGN = CI_TILE_NUM * C4NUM; |
|
|
|
|
|
cl_uint CO_ALIGN = CO_TILE_NUM * C4NUM; |
|
|
|
|
|
|
|
|
|
|
|
cl_uint4 input_shape; |
|
|
|
|
|
cl_uint4 weight_shape; |
|
|
|
|
|
cl_uint4 output_shape; |
|
|
|
|
|
|
|
|
cl_int N = input0_shape[0]; |
|
|
|
|
|
cl_int CI = input0_shape[3]; |
|
|
|
|
|
cl_int IH = input0_shape[1]; |
|
|
|
|
|
cl_int IW = input0_shape[2]; |
|
|
|
|
|
cl_int CO = outpu0_shape[3]; |
|
|
|
|
|
cl_int OH = outpu0_shape[1]; |
|
|
|
|
|
cl_int OW = outpu0_shape[2]; |
|
|
|
|
|
cl_int KH = input1_shape[1]; |
|
|
|
|
|
cl_int KW = input1_shape[2]; |
|
|
|
|
|
cl_int CI_ALIGN = UP_DIV(CI, C4NUM) * C4NUM; |
|
|
|
|
|
cl_int CO_ALIGN = UP_DIV(CO, C4NUM) * C4NUM; |
|
|
|
|
|
|
|
|
|
|
|
cl_int4 input_shape; |
|
|
|
|
|
cl_int4 output_shape; |
|
|
if (io_dataformat_ == schema::Format_NHWC) { |
|
|
if (io_dataformat_ == schema::Format_NHWC) { |
|
|
input_shape = {N, IH, IW, CI}; |
|
|
input_shape = {N, IH, IW, CI}; |
|
|
weight_shape = {CO, KH, KW, CI}; |
|
|
|
|
|
output_shape = {N, OH, OW, CO}; |
|
|
output_shape = {N, OH, OW, CO}; |
|
|
} else if (io_dataformat_ == schema::Format_NHWC4) { |
|
|
} else if (io_dataformat_ == schema::Format_NHWC4) { |
|
|
input_shape = {N, IH, IW, CI_ALIGN}; |
|
|
input_shape = {N, IH, IW, CI_ALIGN}; |
|
|
weight_shape = {CO_ALIGN, KH, KW, CI_ALIGN}; |
|
|
|
|
|
output_shape = {N, OH, OW, CO_ALIGN}; |
|
|
output_shape = {N, OH, OW, CO_ALIGN}; |
|
|
} |
|
|
} |
|
|
cl_uint2 stride = {static_cast<cl_uint>(param->stride_h_), static_cast<cl_uint>(param->stride_w_)}; |
|
|
|
|
|
cl_uint4 pad = {static_cast<cl_uint>(param->pad_u_), static_cast<cl_uint>(param->pad_d_), |
|
|
|
|
|
static_cast<cl_uint>(param->pad_l_), static_cast<cl_uint>(param->pad_r_)}; |
|
|
|
|
|
|
|
|
cl_int4 kernel_stride = {KH, KW, param->stride_h_, param->stride_w_}; |
|
|
|
|
|
cl_int4 pad = {param->pad_u_, param->pad_d_, param->pad_l_, param->pad_r_}; |
|
|
|
|
|
|
|
|
int arg_cn = 0; |
|
|
int arg_cn = 0; |
|
|
ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[0]->Data()); |
|
|
ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[0]->Data()); |
|
|
@@ -163,14 +198,19 @@ int ConvolutionOpenCLKernel::Run() { |
|
|
ocl_runtime->SetKernelArg(kernel_, arg_cn++, packed_bias_); |
|
|
ocl_runtime->SetKernelArg(kernel_, arg_cn++, packed_bias_); |
|
|
ocl_runtime->SetKernelArg(kernel_, arg_cn++, outputs_[0]->Data()); |
|
|
ocl_runtime->SetKernelArg(kernel_, arg_cn++, outputs_[0]->Data()); |
|
|
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape); |
|
|
ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape); |
|
|
ocl_runtime->SetKernelArg(kernel_, arg_cn++, weight_shape); |
|
|
|
|
|
ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape); |
|
|
ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape); |
|
|
ocl_runtime->SetKernelArg(kernel_, arg_cn++, stride); |
|
|
|
|
|
|
|
|
ocl_runtime->SetKernelArg(kernel_, arg_cn++, kernel_stride); |
|
|
ocl_runtime->SetKernelArg(kernel_, arg_cn++, pad); |
|
|
ocl_runtime->SetKernelArg(kernel_, arg_cn++, pad); |
|
|
|
|
|
|
|
|
std::vector<size_t> global = {OW, OH, CO_TILE_NUM}; |
|
|
|
|
|
std::vector<size_t> local = {1, 1, CO_TILE_NUM}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::vector<size_t> global; |
|
|
|
|
|
std::vector<size_t> local; |
|
|
|
|
|
GetLocalSize(reinterpret_cast<ConvParameter *>(this->opParameter), &global, &local); |
|
|
|
|
|
// float8 per thread |
|
|
|
|
|
if (io_dataformat_ == schema::Format_NHWC4) { |
|
|
|
|
|
local[2] = UP_DIV(local[2], 2); |
|
|
|
|
|
global[2] = UP_DIV(global[2], 2); |
|
|
|
|
|
global[2] = UP_DIV(global[2], global[2]) * global[2]; |
|
|
|
|
|
} |
|
|
ocl_runtime->RunKernel(kernel_, global, local, nullptr); |
|
|
ocl_runtime->RunKernel(kernel_, global, local, nullptr); |
|
|
|
|
|
|
|
|
return 0; |
|
|
return 0; |
|
|
@@ -196,4 +236,3 @@ kernel::LiteKernel *OpenCLConvolutionKernelCreator(const std::vector<lite::tenso |
|
|
|
|
|
|
|
|
REG_KERNEL(kGPU, PrimitiveType_Conv2D, OpenCLConvolutionKernelCreator) |
|
|
REG_KERNEL(kGPU, PrimitiveType_Conv2D, OpenCLConvolutionKernelCreator) |
|
|
} // namespace mindspore::kernel |
|
|
} // namespace mindspore::kernel |
|
|
|
|
|
|