@@ -54,6 +54,62 @@ __kernel void to_format_NHWC_to_NHWC4_IMG_half(__global half4 *src_data, __write
  }
  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data);
}
__kernel void to_format_NCHW_to_NHWC4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size,
                                                int4 shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  FLT4 data = (FLT4)(0.f);
  __global float *src_addr = (__global float *)src_data;
  __global float *src_addr_0 = src_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y;
  __global float *src_addr_1 = src_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y;
  __global float *src_addr_2 = src_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y;
  __global float *src_addr_3 = src_addr + ((Z * 4 + 3) * shape.y + X) * shape.z + Y;
  if ((Z + 1) * 4 <= shape.w) {
    // All four channels of this slice are in range: gather them with the NCHW
    // channel stride (shape.y * shape.z). A contiguous float4 load here would
    // fetch four neighbouring W positions of a single channel instead.
    data = TO_FLT4((float4)(src_addr_0[0], src_addr_1[0], src_addr_2[0], src_addr_3[0]));
  } else {
    if ((shape.w - Z * 4) >= 1) {
      data.x = src_addr_0[0];
    }
    if ((shape.w - Z * 4) >= 2) {
      data.y = src_addr_1[0];
    }
    if ((shape.w - Z * 4) >= 3) {
      data.z = src_addr_2[0];
    }
  }
  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data);
}
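// Editorial sketch (not part of the patch): the NCHW -> NHWC4 mapping above,
// written as a host-side reference loop so the index math is easy to verify.
// Assumes shape = {N = 1, H, W, C} and size = {H, W, UP_DIV(C, 4)}; the
// function name and signature are illustrative only.
static void NCHWToNHWC4Reference(const float *src, float *dst, int H, int W, int C) {
  const int slices = (C + 3) / 4;  // UP_DIV(C, 4)
  for (int h = 0; h < H; ++h) {
    for (int w = 0; w < W; ++w) {
      for (int z = 0; z < slices; ++z) {
        for (int i = 0; i < 4; ++i) {
          const int c = z * 4 + i;
          // Channels beyond C are zero padding, mirroring data = (FLT4)(0.f).
          dst[((h * W + w) * slices + z) * 4 + i] = (c < C) ? src[(c * H + h) * W + w] : 0.0f;
        }
      }
    }
  }
}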
__kernel void to_format_NCHW_to_NHWC4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size,
                                               int4 shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  FLT4 data = (FLT4)(0.f);
  __global half *src_addr = (__global half *)src_data;
  __global half *src_addr_0 = src_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y;
  __global half *src_addr_1 = src_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y;
  __global half *src_addr_2 = src_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y;
  __global half *src_addr_3 = src_addr + ((Z * 4 + 3) * shape.y + X) * shape.z + Y;
  if ((Z + 1) * 4 <= shape.w) {
    // Gather the four channels with the NCHW channel stride, as in the float
    // variant; a contiguous half4 load would read along W instead.
    data = TO_FLT4((half4)(src_addr_0[0], src_addr_1[0], src_addr_2[0], src_addr_3[0]));
  } else {
    if ((shape.w - Z * 4) >= 1) {
      data.x = src_addr_0[0];
    }
    if ((shape.w - Z * 4) >= 2) {
      data.y = src_addr_1[0];
    }
    if ((shape.w - Z * 4) >= 3) {
      data.z = src_addr_2[0];
    }
  }
  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data);
}
__kernel void to_format_NHWC_to_NC4HW4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size,
                                                 int4 shape) {
  int X = get_global_id(0);
@@ -198,6 +254,64 @@ __kernel void to_format_NHWC4_to_NHWC_BUF_float(__read_only image2d_t src_data,
      }
    }
  }
__kernel void to_format_NHWC4_to_NCHW_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size,
                                                int4 shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  float4 data = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
  __global float *dst_addr = (__global float *)dst_data;
  __global float *dst_addr_0 = dst_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y;
  __global float *dst_addr_1 = dst_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y;
  __global float *dst_addr_2 = dst_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y;
  __global float *dst_addr_3 = dst_addr + ((Z * 4 + 3) * shape.y + X) * shape.z + Y;
  if ((Z + 1) * 4 <= shape.w) {
    // All four channels are in range: scatter them with the NCHW channel
    // stride; a contiguous float4 store would overwrite neighbouring W
    // positions of a single channel instead.
    dst_addr_0[0] = data.x;
    dst_addr_1[0] = data.y;
    dst_addr_2[0] = data.z;
    dst_addr_3[0] = data.w;
  } else {
    if (shape.w - Z * 4 >= 1) {
      dst_addr_0[0] = data.x;
    }
    if (shape.w - Z * 4 >= 2) {
      dst_addr_1[0] = data.y;
    }
    if (shape.w - Z * 4 >= 3) {
      dst_addr_2[0] = data.z;
    }
  }
}
__kernel void to_format_NHWC4_to_NCHW_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
                                               int4 shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  half4 data = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
  __global half *dst_addr = (__global half *)dst_data;
  __global half *dst_addr_0 = dst_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y;
  __global half *dst_addr_1 = dst_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y;
  __global half *dst_addr_2 = dst_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y;
  __global half *dst_addr_3 = dst_addr + ((Z * 4 + 3) * shape.y + X) * shape.z + Y;
  if ((Z + 1) * 4 <= shape.w) {
    // Scatter the four channels with the NCHW channel stride, as in the float
    // variant above.
    dst_addr_0[0] = data.x;
    dst_addr_1[0] = data.y;
    dst_addr_2[0] = data.z;
    dst_addr_3[0] = data.w;
  } else {
    if (shape.w - Z * 4 >= 1) {
      dst_addr_0[0] = data.x;
    }
    if (shape.w - Z * 4 >= 2) {
      dst_addr_1[0] = data.y;
    }
    if (shape.w - Z * 4 >= 3) {
      dst_addr_2[0] = data.z;
    }
  }
}
__kernel void to_format_NHWC4_to_NHWC_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
                                               int4 shape) {
  int X = get_global_id(0);
@@ -40,8 +40,6 @@ using mindspore::schema::PrimitiveType_Activation;
namespace mindspore::kernel {
void ActivationOpenClKernel::InitBuffer() {}
int ActivationOpenClKernel::Init() {
  in_size_ = in_tensors_[0]->shape().size();
  out_size_ = out_tensors_[0]->shape().size();
@@ -39,7 +39,7 @@ class ActivationOpenClKernel : public OpenCLKernel {
  int Run() override;
  int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
  cl_int4 GetImg2dShape();
  void InitBuffer();
  void InitBuffer() {}
 private:
  cl::Kernel kernel_;
@@ -16,10 +16,10 @@
#include <cstring>
#include <algorithm>
#include <set>
#include <string>
| #include "src/kernel_registry.h" | |||
| #include "src/runtime/opencl/opencl_runtime.h" | |||
| #include "src/runtime/kernel/opencl/kernel/arithmetic_self.h" | |||
| #include "src/runtime/kernel/opencl/utils.h" | |||
| #include "src/runtime/kernel/opencl/cl/arithmeticself.cl.inc" | |||
| using mindspore::kernel::KERNEL_ARCH::kGPU; | |||
| @@ -145,31 +145,12 @@ int ArithmeticSelfOpenCLKernel::Init() { | |||
| int ArithmeticSelfOpenCLKernel::ReSize() { return RET_OK; } | |||
| int ArithmeticSelfGetBiggestDividerWithPriority(int number, int max_divider) { | |||
| if (number % 8 == 0 && max_divider >= 8) { | |||
| return number / 8; | |||
| } | |||
| if (number % 4 == 0 && 4 <= max_divider) { | |||
| return number / 4; | |||
| } | |||
| if (number % 2 == 0 && 2 <= max_divider) { | |||
| return number / 2; | |||
| } | |||
| for (int i = max_divider; i != 0; i--) { | |||
| if (number % i == 0) { | |||
| return i; | |||
| } | |||
| } | |||
| return RET_OK; | |||
| } | |||
| void ArithmeticSelfGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) { | |||
| const int max_divider = 8; | |||
| const int max_x = 4, max_y = 8; | |||
| int x = std::min(ArithmeticSelfGetBiggestDividerWithPriority(global[0], max_divider), max_x); | |||
| int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x); | |||
| int yz = max_size / x; | |||
| int y = std::min(std::min(ArithmeticSelfGetBiggestDividerWithPriority(global[1], max_divider), yz), max_y); | |||
| int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y); | |||
| int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2))); | |||
| local->clear(); | |||
| @@ -20,6 +20,7 @@ | |||
| #include "src/kernel_registry.h" | |||
| #include "src/runtime/opencl/opencl_runtime.h" | |||
| #include "src/runtime/kernel/opencl/kernel/batchnorm.h" | |||
| #include "src/runtime/kernel/opencl/utils.h" | |||
| #include "src/runtime/kernel/opencl/cl/batchnorm.cl.inc" | |||
| using mindspore::kernel::KERNEL_ARCH::kGPU; | |||
| @@ -49,6 +50,7 @@ int BatchNormOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_siz | |||
| *img_size = vec; | |||
| return RET_OK; | |||
| } | |||
| int BatchNormOpenCLKernel::Init() { | |||
| auto in_format = op_format_; | |||
| if (in_format != schema::Format_NHWC4 && in_format != schema::Format_NC4HW4) { | |||
| @@ -79,31 +81,12 @@ int BatchNormOpenCLKernel::Init() { | |||
| int BatchNormOpenCLKernel::ReSize() { return RET_OK; } | |||
| int BatchnormGetBiggestDividerWithPriority(int number, int max_divider) { | |||
| if (number % 8 == 0 && 8 <= max_divider) { | |||
| return number / 8; | |||
| } | |||
| if (number % 4 == 0 && 4 <= max_divider) { | |||
| return number / 4; | |||
| } | |||
| if (number % 2 == 0 && 2 <= max_divider) { | |||
| return number / 2; | |||
| } | |||
| for (int i = max_divider; i != 0; i--) { | |||
| if (number % i == 0) { | |||
| return i; | |||
| } | |||
| } | |||
| return RET_OK; | |||
| } | |||
| void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) { | |||
| const int max_divider = 8; | |||
| const int max_x = 4, max_y = 8; | |||
| int x = std::min(BatchnormGetBiggestDividerWithPriority(global[0], max_divider), max_x); | |||
| int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x); | |||
| int yz = max_size / x; | |||
| int y = std::min(std::min(BatchnormGetBiggestDividerWithPriority(global[1], max_divider), yz), max_y); | |||
| int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y); | |||
| int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2))); | |||
| local->clear(); | |||
| @@ -111,6 +94,7 @@ void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t | |||
| local->push_back(y); | |||
| local->push_back(z); | |||
| } | |||
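// Editorial walk-through (not part of the patch): how the work-group shape
// falls out of the strategy above. With global = {16, 24, 10} and
// max_size = 256 (all values illustrative):
//   x  = min(GetMaxDivisorStrategy1(16, 8), 4) = min(2, 4)           = 2
//   yz = 256 / 2                                                     = 128
//   y  = min(min(GetMaxDivisorStrategy1(24, 8), 128), 8) = min(3, 8) = 3
//   z  = min(128 / 3, UP_DIV(10, 2)) = min(42, 5)                    = 5
// so local = {2, 3, 5}, and 2 * 3 * 5 = 30 <= max_size.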
int BatchNormOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running! ";
  auto param = reinterpret_cast<BatchNormParameter *>(this->op_parameter_);
@@ -14,12 +14,12 @@
 * limitations under the License.
 */
#include <cstring>
#include <string>
#include <algorithm>
#include <set>
#include "src/kernel_registry.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/kernel/concat.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "src/runtime/kernel/opencl/cl/concat.cl.inc"
using mindspore::kernel::KERNEL_ARCH::kGPU;
@@ -131,31 +131,12 @@ int ConcatOpenCLKernel::GetSumShape(std::vector<int> *sum_shape, std::vector<int
  return RET_OK;
}
int ConcatGetBiggestDividerWithPriority(int number, int max_divider) {
  if (number % 8 == 0 && max_divider >= 8) {
    return number / 8;
  }
  if (number % 4 == 0 && 4 <= max_divider) {
    return number / 4;
  }
  if (number % 2 == 0 && 2 <= max_divider) {
    return number / 2;
  }
  for (int i = max_divider; i != 0; i--) {
    if (number % i == 0) {
      return i;
    }
  }
  return RET_OK;
}
void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
  const int max_divider = 8;
  const int max_x = 4, max_y = 8;
  int x = std::min(ConcatGetBiggestDividerWithPriority(global[0], max_divider), max_x);
  int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
  int yz = max_size / x;
  int y = std::min(std::min(ConcatGetBiggestDividerWithPriority(global[1], max_divider), yz), max_y);
  int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
  int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));
  local->clear();
@@ -163,6 +144,7 @@ void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *
  local->push_back(y);
  local->push_back(z);
}
int ConcatOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running! ";
  auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_);
@@ -19,6 +19,7 @@
#include <algorithm>
#include "src/common/utils.h"
#include "src/runtime/kernel/opencl/kernel/convolution.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
@@ -113,7 +114,7 @@ int ConvolutionOpenCLKernel::Init() {
  return RET_OK;
}
int ConvolutionOpenCLKernel::RearrangeWinogradWeight() {
int ConvolutionOpenCLKernel::GenerateWinogradWeight() {
  constexpr float Gt[] = {1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 0.0000000000,
                          0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000,
                          0.0000000000, 0.4999999702, 0.4999999702, 1.9999998808, 1.9999998808, 1.0000000000};
@@ -155,41 +156,16 @@ int ConvolutionOpenCLKernel::RearrangeWinogradWeight() {
  }
  if (use_fp16_) {
    OHWI2OHWIOGroupI4O4<float, float16_t>(encoded_weight.data(), 6, 6, 2);
    ConvertConvWeight4DTo7D<float, float16_t>(reinterpret_cast<void *>(encoded_weight.data()), packed_weight_, CO_, 6,
                                              6, CI_, 2);
  } else {
    OHWI2OHWIOGroupI4O4<float, float>(encoded_weight.data(), 6, 6, 2);
    ConvertConvWeight4DTo7D<float, float>(reinterpret_cast<void *>(encoded_weight.data()), packed_weight_, CO_, 6, 6,
                                          CI_, 2);
  }
  return RET_OK;
}
template <typename SRC_T, typename DST_T>
int ConvolutionOpenCLKernel::OHWI2OHWIOGroupI4O4(void *weight_OHWI, size_t KH, size_t KW, size_t OGroup) {
  auto origin_weight = reinterpret_cast<SRC_T *>(weight_OHWI);
  auto packed_weight = reinterpret_cast<DST_T *>(packed_weight_);
  // OHWI -> O/OGroup/4 KH KW I/4 OGroup I4 O4
  for (size_t co = 0, src_idx = 0; co < CO_; ++co) {
    for (size_t kh = 0; kh < KH; ++kh) {
      for (size_t kw = 0; kw < KW; ++kw) {
        for (size_t ci = 0; ci < CI_; ++ci) {
          size_t co_outer = co / (CO_TILE * OGroup);
          size_t group_idx = co % (CO_TILE * OGroup) / CO_TILE;
          size_t co_inner = co % CO_TILE;
          size_t ci_outer = ci / CI_TILE;
          size_t ci_inner = ci % CI_TILE;
          size_t dst_idx =
            (((((co_outer * KH + kh) * KW + kw) * CI_SLICES_ + ci_outer) * OGroup + group_idx) * CI_TILE + ci_inner) *
              CO_TILE +
            co_inner;
          packed_weight[dst_idx] = static_cast<DST_T>(origin_weight[src_idx++]);
        }
      }
    }
  }
  return RET_OK;
}
int ConvolutionOpenCLKernel::InitWeight() {
  auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
@@ -206,20 +182,20 @@ int ConvolutionOpenCLKernel::InitWeight() {
  // rearrange weight
  if (use_winograd_) {
    RearrangeWinogradWeight();
    GenerateWinogradWeight();
  } else {
    auto weight_tensor = in_tensors_[1];
    if (weight_tensor->data_type() == kNumberTypeFloat16) {
      if (use_fp16_) {
        OHWI2OHWIOGroupI4O4<float16_t, float16_t>(weight_tensor->data_c(), KH_, KW_, 1);
        ConvertConvWeight4DTo7D<float16_t, float16_t>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_);
      } else {
        OHWI2OHWIOGroupI4O4<float16_t, float>(weight_tensor->data_c(), KH_, KW_, 1);
        ConvertConvWeight4DTo7D<float16_t, float>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_);
      }
    } else {
      if (use_fp16_) {
        OHWI2OHWIOGroupI4O4<float, float16_t>(weight_tensor->data_c(), KH_, KW_, 1);
        ConvertConvWeight4DTo7D<float, float16_t>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_);
      } else {
        OHWI2OHWIOGroupI4O4<float, float>(weight_tensor->data_c(), KH_, KW_, 1);
        ConvertConvWeight4DTo7D<float, float>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_);
      }
    }
  }
@@ -635,7 +611,7 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd4x4To36() {
    " }\n"
    "\n"
    " int IH = input_shape.y, IW = input_shape.z;\n"
    " int TILE_X = IW / 4;\n"
    " int TILE_X = UP_DIV(IW, 4);\n"
    " int tile_x = tile_xy % TILE_X;\n"
    " int tile_y = tile_xy / TILE_X;\n"
    "\n"
@@ -764,6 +740,8 @@ std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() {
std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
  std::string code =
    "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
    "#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))\n"
    "\n"
    "__constant sampler_t\n"
    "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n"
    "\n"
@@ -804,6 +782,7 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
    " }\n"
    " }\n"
    "\n"
    " int TILE_X = UP_DIV(OW, 4);\n"
    " for (int x = 0; x < 4; x++)\n"
    " {\n"
    " FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
@@ -822,14 +801,15 @@
  }
  code +=
    " int TILE_X = OW / 4;\n"
    " int tile_x = tile_xy % TILE_X * 4;\n"
    " int tile_y = tile_xy / TILE_X * 4;\n";
    " int tile_x = tile_xy % TILE_X;\n"
    " int tile_y = tile_xy / TILE_X;\n"
    " int ow = tile_x * 4 + x;\n"
    " int oh = tile_y * 4 + row;\n";
  if (op_format_ == Format_NHWC4) {
    code += " WRITE_IMAGE(output, (int2)((tile_x + x) * SLICES + slice, tile_y + row), acc);\n";
    code += " if(ow < OW) { WRITE_IMAGE(output, (int2)(ow * SLICES + slice, oh), acc);}\n";
  } else if (op_format_ == Format_NC4HW4) {
    code += " WRITE_IMAGE(output, (int2)(tile_x + x, slice * OH + tile_y + row), acc);\n";
    code += " if(oh < OH) { WRITE_IMAGE(output, (int2)(ow, slice * OH + oh), acc);}\n";
  }
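// Editorial note (not part of the patch): the new ow/oh guards pair with the
// UP_DIV tiling above. With OW = 10, TILE_X = UP_DIV(10, 4) = 3; the last tile
// has tile_x = 2, so ow = tile_x * 4 + x reaches 11, and "if (ow < OW)" drops
// the writes that would land outside the 10-column output.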
  code +=
@@ -849,7 +829,7 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std
  size_t global_w = UP_DIV(OW_, work_group_size[1]) * work_group_size[1];
  size_t global_c = UP_DIV(CO_SLICES_, work_group_size[2]) * work_group_size[2];
  size_t local_c = GetBiggestDivider(global_c, max_z_size);
  size_t local_c = GetMaxDivisor(global_c, max_z_size);
  if (local_c == 0) {
    MS_LOG(ERROR) << "Divide by zero";
    return RET_ERROR;
@@ -68,9 +68,7 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
  int InitWeight();
  int InitBias();
  int RearrangeWinogradWeight();
  template <typename SRC_T, typename DST_T>
  int OHWI2OHWIOGroupI4O4(void *weight_OHWI, size_t KH, size_t KW, size_t OGroup);
  int GenerateWinogradWeight();
  std::string CodeGenConvolutionNHWC4();
  std::string CodeGenConvolutionNC4HW4();
@@ -90,29 +88,6 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
    const bool hw_good = TILES_X_ * TILES_Y_ >= 16;
    return attr_valid && channel_good && hw_good;
  }
  static std::vector<float> MatrixMultiply(const float A[], const float B[], int M, int N, int K) {
    std::vector<float> C(M * K);
    for (int i = 0; i < M; ++i) {
      for (int j = 0; j < K; ++j) {
        float s = 0.0f;
        for (int k = 0; k < N; ++k) {
          s += A[i * N + k] * B[k * K + j];
        }
        C[i * K + j] = s;
      }
    }
    return C;
  }
  static int GetBiggestDivider(int x, int y) {
    for (int i = y; i != 0; i--) {
      if (x % i == 0) {
        return i;
      }
    }
    return 1;
  }
};
} // namespace mindspore::kernel
@@ -20,6 +20,7 @@
#include "src/kernel_registry.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/kernel/slice.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "src/runtime/kernel/opencl/cl/slice.cl.inc"
using mindspore::kernel::KERNEL_ARCH::kGPU;
@@ -49,6 +50,7 @@ int SliceOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
  *img_size = vec;
  return RET_OK;
}
int SliceOpenCLKernel::Init() {
  std::string kernel_name = "slice";
  auto in_format = op_format_;
@@ -77,28 +79,12 @@ int SliceOpenCLKernel::Init() {
int SliceOpenCLKernel::ReSize() { return RET_OK; }
int SliceGetBiggestDividerWithPriority(int number, int max_divider) {
  if (number % 8 == 0 && 8 <= max_divider) {
    return number / 8;
  } else if (number % 4 == 0 && 4 <= max_divider) {
    return number / 4;
  } else if (number % 2 == 0 && 2 <= max_divider) {
    return number / 2;
  }
  for (int i = max_divider; i != 0; i--) {
    if (number % i == 0) {
      return i;
    }
  }
  return 1;
}
void SlcieGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) {
  const int max_divider = 8;
  const int max_x = 4, max_y = 8;
  int x = std::min(SliceGetBiggestDividerWithPriority(global[0], max_divider), max_x);
  int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x);
  int yz = max_size / x;
  int y = std::min(std::min(SliceGetBiggestDividerWithPriority(global[1], max_divider), yz), max_y);
  int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y);
  int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2)));
  local->clear();
@@ -106,6 +92,7 @@ void SlcieGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *l
  local->push_back(y);
  local->push_back(z);
}
int SliceOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running! ";
  auto param = reinterpret_cast<SliceParameter *>(this->op_parameter_);
@@ -154,5 +141,4 @@ kernel::LiteKernel *OpenCLSliceKernelCreator(const std::vector<lite::Tensor *> &
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Slice, OpenCLSliceKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Slice, OpenCLSliceKernelCreator);
} // namespace mindspore::kernel
@@ -16,7 +16,6 @@
#include "src/runtime/kernel/opencl/utils.h"
#include <algorithm>
#include <string>
#include <vector>
#include "src/kernel_registry.h"
@@ -34,26 +33,61 @@ kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, con
}
} // namespace mindspore::lite
namespace mindspore {
namespace kernel {
namespace mindspore::kernel {
int GetMaxDivisor(int x, int divisor) {
  int i = divisor;
  while (i > 0) {
    if (x % i == 0) {
      return i;
    }
    i--;
  }
  return 1;
}
int GetMaxDivisorStrategy0(int x, int divisor) {
  if (divisor >= 8 && x % 8 == 0) {
    return 8;
  } else if (divisor >= 4 && x % 4 == 0) {
    return 4;
  } else if (divisor >= 2 && x % 2 == 0) {
    return 2;
  } else {
    return GetMaxDivisor(x, divisor);
  }
}
int GetMaxDivisorStrategy1(int x, int divisor) {
  if (divisor >= 8 && x % 8 == 0) {
    return x / 8;
  } else if (divisor >= 4 && x % 4 == 0) {
    return x / 4;
  } else if (divisor >= 2 && x % 2 == 0) {
    return x / 2;
  } else {
    return GetMaxDivisor(x, divisor);
  }
}
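// Editorial illustration (not part of the patch): the two strategies answer
// different questions. For x = 24, divisor = 8:
//   GetMaxDivisorStrategy0(24, 8) == 8   (the divisor that evenly splits x)
//   GetMaxDivisorStrategy1(24, 8) == 3   (the quotient x / 8)
// For x = 7, divisor = 8, both fall back to GetMaxDivisor(7, 8) == 7.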
std::vector<size_t> GetCommonGlobalSize(const std::vector<size_t> &local, const std::vector<size_t> &global) {
  std::vector<size_t> result(3, 1);
  std::vector<size_t> result(3);
  for (int i = 0; i < 3; ++i) {
    result[i] = AlignByN(global[i], local[i]);
    result[i] = UP_ROUND(global[i], local[i]);
  }
  return result;
}
std::vector<size_t> GetCommonLocalSize(const std::vector<size_t> &global, int max_size) {
  size_t wg_z = GetBiggestDividerWithPriority(global[2], 8);
  if (wg_z == 0) {
  size_t local_z = GetMaxDivisorStrategy0(global[2], 8);
  if (local_z == 0) {
    MS_LOG(ERROR) << "Divide by zero";
    return {};
  }
  size_t wg_xy_size = max_size / wg_z;
  size_t wg_x = std::min(DivideRoundUp(global[0], 2), wg_xy_size);
  size_t wg_y = std::min(wg_xy_size / wg_x, global[1]);
  std::vector<size_t> local = {wg_x, wg_y, wg_z};
  size_t local_xy = max_size / local_z;
  size_t local_x = std::min(UP_DIV(global[0], 2), local_xy);
  size_t local_y = std::min(local_xy / local_x, global[1]);
  std::vector<size_t> local = {local_x, local_y, local_z};
  return local;
}
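// Editorial walk-through (not part of the patch): with global = {12, 7, 16}
// and max_size = 256 (values illustrative), GetCommonLocalSize picks
//   local_z  = GetMaxDivisorStrategy0(16, 8) = 8
//   local_xy = 256 / 8 = 32
//   local_x  = min(UP_DIV(12, 2), 32) = 6
//   local_y  = min(32 / 6, 7) = 5
// so local = {6, 5, 8}, and 6 * 5 * 8 = 240 <= max_size.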
@@ -187,5 +221,4 @@ std::string CLErrorCode(cl_int error_code) {
      return "Unknown OpenCL error code";
  }
}
} // namespace kernel
} // namespace mindspore
} // namespace mindspore::kernel
@@ -23,7 +23,7 @@
#include "utils/log_adapter.h"
#include "nnacl/op_base.h"
#include "src/lite_kernel.h"
#include "src/common//utils.h"
#include "src/common/utils.h"
namespace mindspore::lite {
kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
@@ -32,59 +32,14 @@ kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, con
namespace mindspore::kernel {
/**
 * GetLocalSize
 * @param number
 * @param max_divider
 * @return
 */
template <typename T, typename N>
T GetBiggestDividerWithPriority(T number, N max_divider) {
  if (number % 8 == 0 && 8 <= max_divider) {
    return (T)8;
  }
  if (number % 4 == 0 && 4 <= max_divider) {
    return (T)4;
  }
  if (number % 2 == 0 && 2 <= max_divider) {
    return (T)2;
  }
  for (int i = max_divider; i != 0; i--) {
    if (number % i == 0) {
      return (T)i;
    }
  }
  return (T)1;
}
int GetMaxDivisor(int x, int divisor);
/**
 * GetLocalSize
 * @param n must be non negative
 * @param divisor must be greater than zero
 * @return
 */
template <typename T, typename N>
T DivideRoundUp(T n, N divisor) {
  const T div = static_cast<T>(divisor);
  const T q = n / div;
  return n % div == 0 ? q : q + 1;
}
int GetMaxDivisorStrategy0(int x, int divisor);
/**
 * GetLocalSize
 * @param number
 * @param n
 * @return
 */
template <typename T, typename N>
T AlignByN(T number, N n) {
  return DivideRoundUp(number, n) * n;
}
int GetMaxDivisorStrategy1(int x, int divisor);
// GetGlobalSize
std::vector<size_t> GetCommonGlobalSize(const std::vector<size_t> &local, const std::vector<size_t> &global);
// GetLocalSize
std::vector<size_t> GetCommonLocalSize(const std::vector<size_t> &global, int max_size);
std::string CLErrorCode(cl_int error_code);
@@ -108,6 +63,7 @@ void PackNCHWToNC4HW4(void *src, void *dst, int batch, int plane, int channel, c
    }
  }
}
template <class T1, class T2>
void PackNHWCToNHWC4(void *src, void *dst, int batch, int plane, int channel, const std::function<T2(T1)> &to_dtype) {
  int c4 = UP_DIV(channel, C4NUM);
@@ -132,6 +88,7 @@ void PackNHWCToNHWC4(void *src, void *dst, int batch, int plane, int channel, co
    }
  }
}
template <class T1, class T2>
void PackNHWCToNC4HW4(void *src, void *dst, int batch, int plane, int channel, const std::function<T2(T1)> &to_dtype) {
  int c4 = UP_DIV(channel, C4NUM);
@@ -152,6 +109,47 @@ void PackNHWCToNC4HW4(void *src, void *dst, int batch, int plane, int channel, c
  }
}
template <class T>
std::vector<T> MatrixMultiply(const T A[], const T B[], int M, int N, int K) {
  std::vector<T> C(M * K);
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < K; ++j) {
      float s = 0.0f;
      for (int k = 0; k < N; ++k) {
        s += A[i * N + k] * B[k * K + j];
      }
      C[i * K + j] = s;
    }
  }
  return C;
}
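// Editorial usage sketch (not part of the patch): MatrixMultiply treats A as
// M x N and B as N x K, returning the M x K product row-major. For example
// (values illustrative):
//   float A[] = {1, 2, 3, 4, 5, 6};         // 2 x 3
//   float B[] = {7, 8, 9, 10, 11, 12};      // 3 x 2
//   auto C = MatrixMultiply(A, B, 2, 3, 2);  // C == {58, 64, 139, 154}
// GenerateWinogradWeight uses it for the Winograd weight transform G * W * Gt.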
template <typename SRC_T, typename DST_T>
void ConvertConvWeight4DTo7D(void *src, void *dst, size_t CO, size_t KH, size_t KW, size_t CI, size_t OGroup = 1,
                             size_t CI_TILE = 4, size_t CO_TILE = 4) {
  auto origin_weight = reinterpret_cast<SRC_T *>(src);
  auto packed_weight = reinterpret_cast<DST_T *>(dst);
  auto CI_SLICES = UP_DIV(CI, CI_TILE);
  for (size_t co = 0, src_idx = 0; co < CO; ++co) {
    for (size_t kh = 0; kh < KH; ++kh) {
      for (size_t kw = 0; kw < KW; ++kw) {
        for (size_t ci = 0; ci < CI; ++ci) {
          size_t co_outer = co / (CO_TILE * OGroup);
          size_t group_idx = co % (CO_TILE * OGroup) / CO_TILE;
          size_t co_inner = co % CO_TILE;
          size_t ci_outer = ci / CI_TILE;
          size_t ci_inner = ci % CI_TILE;
          size_t dst_idx =
            (((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * OGroup + group_idx) * CI_TILE + ci_inner) *
              CO_TILE +
            co_inner;
          packed_weight[dst_idx] = static_cast<DST_T>(origin_weight[src_idx++]);
        }
      }
    }
  }
}
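// Editorial illustration (not part of the patch): the "7D" layout is
// O/(OGroup*4), KH, KW, CI/4, OGroup, CI4, CO4 (the OHWI2OHWIOGroupI4O4
// comment removed above). For the smallest case CO = 2, KH = KW = 1, CI = 2
// with the defaults (OGroup = 1, tiles of 4), dst_idx collapses to
// ci_inner * 4 + co_inner, so an OHWI source
//   src = {w00, w01, w10, w11}   // w[co][ci]
// lands in one 4 x 4 tile: dst[0] = w00, dst[1] = w10, dst[4] = w01,
// dst[5] = w11. The padding slots are never written, so the caller should
// zero-initialize dst.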
} // namespace mindspore::kernel
#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_UTILS_H_
@@ -133,6 +133,8 @@ Format get_op_format(Format input_format) {
    case Format_NHWC:
    case Format_NHWC4:
      return Format_NHWC4;
    case Format_NCHW:
      return Format_NHWC4;
    default:
      return Format_NC4HW4;
  }
}
@@ -249,7 +251,7 @@ TEST_F(TestConvolutionOpenCL, winograd_inputNHWC_1x16x256x96_outputNHWC_1x16x256
  TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 0.6f, "testcases/test_fp32/");
}
TEST_F(TestConvolutionOpenCL, simple_test0) {
TEST_F(TestConvolutionOpenCL, simple_test0_NHWC) {
  std::string attr =
    "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1";
  float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
@@ -259,6 +261,34 @@ TEST_F(TestConvolutionOpenCL, simple_test0) {
  TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data);
  TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data);
}
TEST_F(TestConvolutionOpenCL, simple_test0_NCHW) {
  std::string attr =
    "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1";
  float input_data[] = {0.0f, 2.0f, 4.0f, 6.0f, 1.0f, 3.0f, 5.0f, 7.0f};
  float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
  float bias_data[] = {0.0f, 0.0f};
  float expect_data[] = {1.0f, 5.0f, 9.0f, 13.0f, 1.0f, 5.0f, 9.0f, 13.0f};
  TEST_MAIN(attr, Format_NCHW, Format_NCHW, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data);
  TEST_MAIN(attr, Format_NCHW, Format_NCHW, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data);
}
TEST_F(TestConvolutionOpenCL, simple_test0_NHWC4_and_NC4HW4) {
  std::string attr =
    "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1";
  float input_data[] = {0.0f, 1.0f, 0.0f, 0.0f, 2.0f, 3.0f, 0.0f, 0.0f, 4.0f, 5.0f, 0.0f, 0.0f, 6.0f, 7.0f, 0.0f, 0.0f};
  float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
  float bias_data[] = {0.0f, 0.0f};
  float expect_data[] = {1.0f, 1.0f, 0.0f, 0.0f, 5.0f, 5.0f, 0.0f, 0.0f,
                         9.0f, 9.0f, 0.0f, 0.0f, 13.0f, 13.0f, 0.0f, 0.0f};
  TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data,
            expect_data);
  TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data,
            expect_data);
  TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data,
            expect_data);
  TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data,
            expect_data);
}
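// Editorial note (not part of the tests): in the NHWC4/NC4HW4 cases the
// channel dimension (C = 2) is padded up to one slice of 4, which is why each
// pixel above carries two payload values followed by two 0.0f pads, e.g.
// {0.0f, 1.0f, 0.0f, 0.0f} encodes channels {0.0f, 1.0f} of the first pixel.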
TEST_F(TestConvolutionOpenCL, simple_test1) {
  std::string attr =