| @@ -54,6 +54,70 @@ __kernel void to_format_NHWC_to_NHWC4_IMG_half(__global half4 *src_data, __write | |||||
| } | } | ||||
| WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data); | WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data); | ||||
| } | } | ||||
| __kernel void to_format_NCHW_to_NHWC4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size, | |||||
| int4 shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | |||||
| return; | |||||
| } | |||||
| FLT4 data = (FLT4)(0.f); | |||||
| __global float *src_addr = (__global float *)src_data; | |||||
| __global float *src_addr_0 = src_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y; | |||||
| __global float *src_addr_1 = src_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y; | |||||
| __global float *src_addr_2 = src_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y; | |||||
| __global float *src_addr_3 = src_addr + ((Z * 4 + 3) * shape.y + X) * shape.z + Y; | |||||
| if ((Z + 1) * 4 <= shape.w) { | |||||
| data.x = src_addr_0[0]; | |||||
| data.y = src_addr_1[0]; | |||||
| data.z = src_addr_2[0]; | |||||
| data.w = src_addr_3[0]; | |||||
| } else { | |||||
| if ((shape.w - Z * 4) >= 1) { | |||||
| data.x = src_addr_0[0]; | |||||
| } | |||||
| if ((shape.w - Z * 4) >= 2) { | |||||
| data.y = src_addr_1[0]; | |||||
| } | |||||
| if ((shape.w - Z * 4) >= 3) { | |||||
| data.z = src_addr_2[0]; | |||||
| } | |||||
| } | |||||
| WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data); | |||||
| } | |||||
| __kernel void to_format_NCHW_to_NHWC4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size, | |||||
| int4 shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | |||||
| return; | |||||
| } | |||||
| FLT4 data = (FLT4)(0.f); | |||||
| __global half *src_addr = (__global half *)src_data; | |||||
| __global half *src_addr_0 = src_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y; | |||||
| __global half *src_addr_1 = src_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y; | |||||
| __global half *src_addr_2 = src_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y; | |||||
| __global half *src_addr_3 = src_addr + ((Z * 4 + 3) * shape.y + X) * shape.z + Y; | |||||
| if ((Z + 1) * 4 <= shape.w) { | |||||
| data.x = src_addr_0[0]; | |||||
| data.y = src_addr_1[0]; | |||||
| data.z = src_addr_2[0]; | |||||
| data.w = src_addr_3[0]; | |||||
| } else { | |||||
| if ((shape.w - Z * 4) >= 1) { | |||||
| data.x = src_addr_0[0]; | |||||
| } | |||||
| if ((shape.w - Z * 4) >= 2) { | |||||
| data.y = src_addr_1[0]; | |||||
| } | |||||
| if ((shape.w - Z * 4) >= 3) { | |||||
| data.z = src_addr_2[0]; | |||||
| } | |||||
| } | |||||
| WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data); | |||||
| } | |||||
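
The two NCHW → NHWC4 kernels above gather up to four channel values per output texel; in NCHW those values sit a full `shape.y * shape.z` (H·W) plane apart, which is why each lane is loaded through its own strided pointer rather than one contiguous vector reinterpret. Below is a minimal host-side sketch of the same index math (names and the `main` data are illustrative, not part of the patch), assuming `shape` carries (N, H, W, C) with N = 1:

```cpp
#include <cstdio>
#include <vector>

// Reference NCHW -> NHWC4 repack mirroring the kernel's addressing.
std::vector<float> NchwToNhwc4(const std::vector<float> &src, int H, int W, int C) {
  const int C4 = (C + 3) / 4;                    // UP_DIV(C, 4): one texel per 4 channels
  std::vector<float> dst(H * W * C4 * 4, 0.0f);  // tail channels stay zero-padded
  for (int h = 0; h < H; ++h)
    for (int w = 0; w < W; ++w)
      for (int c = 0; c < C; ++c)
        // NCHW source: (c * H + h) * W + w  -- the kernel's src_addr_N offsets.
        // NHWC4 destination: ((h * W + w) * C4 + c / 4) * 4 + c % 4.
        dst[((h * W + w) * C4 + c / 4) * 4 + c % 4] = src[(c * H + h) * W + w];
  return dst;
}

int main() {
  // The 1x2x2x2 NCHW tensor {0,2,4,6,1,3,5,7} equals NHWC {0,1,...,7}.
  std::vector<float> nchw = {0, 2, 4, 6, 1, 3, 5, 7};
  for (float v : NchwToNhwc4(nchw, 2, 2, 2)) printf("%g ", v);
  printf("\n");  // 0 1 0 0 2 3 0 0 4 5 0 0 6 7 0 0
}
```

The printed buffer matches the padded input used by `simple_test0_NHWC4_and_NC4HW4` further down.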
| __kernel void to_format_NHWC_to_NC4HW4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size, | __kernel void to_format_NHWC_to_NC4HW4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size, | ||||
| int4 shape) { | int4 shape) { | ||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| @@ -198,6 +262,68 @@ __kernel void to_format_NHWC4_to_NHWC_BUF_float(__read_only image2d_t src_data, | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| __kernel void to_format_NHWC4_to_NCHW_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size, | |||||
| int4 shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | |||||
| return; | |||||
| } | |||||
| float4 data = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); | |||||
| __global float *dst_addr = (__global float *)dst_data; | |||||
| __global float *dst_addr_0 = dst_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y; | |||||
| __global float *dst_addr_1 = dst_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y; | |||||
| __global float *dst_addr_2 = dst_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y; | |||||
| __global float *dst_addr_3 = dst_addr + ((Z * 4 + 3) * shape.y + X) * shape.z + Y; | |||||
| if ((Z + 1) * 4 <= shape.w) { | |||||
| dst_addr_0[0] = data.x; | |||||
| dst_addr_1[0] = data.y; | |||||
| dst_addr_2[0] = data.z; | |||||
| dst_addr_3[0] = data.w; | |||||
| } else { | |||||
| if (shape.w - Z * 4 >= 1) { | |||||
| dst_addr_0[0] = data.x; | |||||
| } | |||||
| if (shape.w - Z * 4 >= 2) { | |||||
| dst_addr_1[0] = data.y; | |||||
| } | |||||
| if (shape.w - Z * 4 >= 3) { | |||||
| dst_addr_2[0] = data.z; | |||||
| } | |||||
| } | |||||
| } | |||||
| __kernel void to_format_NHWC4_to_NCHW_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size, | |||||
| int4 shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | |||||
| return; | |||||
| } | |||||
| half4 data = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); | |||||
| __global half *dst_addr = (__global half *)dst_data; | |||||
| __global half *dst_addr_0 = dst_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y; | |||||
| __global half *dst_addr_1 = dst_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y; | |||||
| __global half *dst_addr_2 = dst_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y; | |||||
| __global half *dst_addr_3 = dst_addr + ((Z * 4 + 3) * shape.y + X) * shape.z + Y; | |||||
| if ((Z + 1) * 4 <= shape.w) { | |||||
| dst_addr_0[0] = data.x; | |||||
| dst_addr_1[0] = data.y; | |||||
| dst_addr_2[0] = data.z; | |||||
| dst_addr_3[0] = data.w; | |||||
| } else { | |||||
| if (shape.w - Z * 4 >= 1) { | |||||
| dst_addr_0[0] = data.x; | |||||
| } | |||||
| if (shape.w - Z * 4 >= 2) { | |||||
| dst_addr_1[0] = data.y; | |||||
| } | |||||
| if (shape.w - Z * 4 >= 3) { | |||||
| dst_addr_2[0] = data.z; | |||||
| } | |||||
| } | |||||
| } | |||||
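
The buffer kernels run the same addressing in reverse: each texel read from the image is scattered to four NCHW channel planes, and the `shape.w - Z * 4` remainder guard keeps the zero-padded lanes of the last slice from spilling past the real channel count. A hypothetical inverse of the sketch above:

```cpp
#include <vector>

// Reference NHWC4 -> NCHW scatter; remain plays the role of shape.w - Z * 4.
std::vector<float> Nhwc4ToNchw(const std::vector<float> &src, int H, int W, int C) {
  const int C4 = (C + 3) / 4;
  std::vector<float> dst(C * H * W);
  for (int h = 0; h < H; ++h)
    for (int w = 0; w < W; ++w)
      for (int s = 0; s < C4; ++s) {       // one iteration per texel, like Z in the kernel
        const int remain = C - s * 4;      // how many lanes are real channels
        for (int i = 0; i < 4 && i < remain; ++i)
          dst[((s * 4 + i) * H + h) * W + w] = src[((h * W + w) * C4 + s) * 4 + i];
      }
  return dst;
}
```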
| __kernel void to_format_NHWC4_to_NHWC_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size, | __kernel void to_format_NHWC4_to_NHWC_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size, | ||||
| int4 shape) { | int4 shape) { | ||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| @@ -40,8 +40,6 @@ using mindspore::schema::PrimitiveType_Activation; | |||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| void ActivationOpenClKernel::InitBuffer() {} | |||||
| int ActivationOpenClKernel::Init() { | int ActivationOpenClKernel::Init() { | ||||
| in_size_ = in_tensors_[0]->shape().size(); | in_size_ = in_tensors_[0]->shape().size(); | ||||
| out_size_ = out_tensors_[0]->shape().size(); | out_size_ = out_tensors_[0]->shape().size(); | ||||
| @@ -39,7 +39,7 @@ class ActivationOpenClKernel : public OpenCLKernel { | |||||
| int Run() override; | int Run() override; | ||||
| int GetImageSize(size_t idx, std::vector<size_t> *img_size) override; | int GetImageSize(size_t idx, std::vector<size_t> *img_size) override; | ||||
| cl_int4 GetImg2dShape(); | cl_int4 GetImg2dShape(); | ||||
| void InitBuffer(); | |||||
| void InitBuffer() {} | |||||
| private: | private: | ||||
| cl::Kernel kernel_; | cl::Kernel kernel_; | ||||
| @@ -16,10 +16,10 @@ | |||||
| #include <cstring> | #include <cstring> | ||||
| #include <algorithm> | #include <algorithm> | ||||
| #include <set> | #include <set> | ||||
| #include<string> | |||||
| #include "src/kernel_registry.h" | #include "src/kernel_registry.h" | ||||
| #include "src/runtime/opencl/opencl_runtime.h" | #include "src/runtime/opencl/opencl_runtime.h" | ||||
| #include "src/runtime/kernel/opencl/kernel/arithmetic_self.h" | #include "src/runtime/kernel/opencl/kernel/arithmetic_self.h" | ||||
| #include "src/runtime/kernel/opencl/utils.h" | |||||
| #include "src/runtime/kernel/opencl/cl/arithmeticself.cl.inc" | #include "src/runtime/kernel/opencl/cl/arithmeticself.cl.inc" | ||||
| using mindspore::kernel::KERNEL_ARCH::kGPU; | using mindspore::kernel::KERNEL_ARCH::kGPU; | ||||
| @@ -145,31 +145,12 @@ int ArithmeticSelfOpenCLKernel::Init() { | |||||
| int ArithmeticSelfOpenCLKernel::ReSize() { return RET_OK; } | int ArithmeticSelfOpenCLKernel::ReSize() { return RET_OK; } | ||||
| int ArithmeticSelfGetBiggestDividerWithPriority(int number, int max_divider) { | |||||
| if (number % 8 == 0 && max_divider >= 8) { | |||||
| return number / 8; | |||||
| } | |||||
| if (number % 4 == 0 && 4 <= max_divider) { | |||||
| return number / 4; | |||||
| } | |||||
| if (number % 2 == 0 && 2 <= max_divider) { | |||||
| return number / 2; | |||||
| } | |||||
| for (int i = max_divider; i != 0; i--) { | |||||
| if (number % i == 0) { | |||||
| return i; | |||||
| } | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| void ArithmeticSelfGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) { | void ArithmeticSelfGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) { | ||||
| const int max_divider = 8; | const int max_divider = 8; | ||||
| const int max_x = 4, max_y = 8; | const int max_x = 4, max_y = 8; | ||||
| int x = std::min(ArithmeticSelfGetBiggestDividerWithPriority(global[0], max_divider), max_x); | |||||
| int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x); | |||||
| int yz = max_size / x; | int yz = max_size / x; | ||||
| int y = std::min(std::min(ArithmeticSelfGetBiggestDividerWithPriority(global[1], max_divider), yz), max_y); | |||||
| int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y); | |||||
| int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2))); | int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2))); | ||||
| local->clear(); | local->clear(); | ||||
| @@ -20,6 +20,7 @@ | |||||
| #include "src/kernel_registry.h" | #include "src/kernel_registry.h" | ||||
| #include "src/runtime/opencl/opencl_runtime.h" | #include "src/runtime/opencl/opencl_runtime.h" | ||||
| #include "src/runtime/kernel/opencl/kernel/batchnorm.h" | #include "src/runtime/kernel/opencl/kernel/batchnorm.h" | ||||
| #include "src/runtime/kernel/opencl/utils.h" | |||||
| #include "src/runtime/kernel/opencl/cl/batchnorm.cl.inc" | #include "src/runtime/kernel/opencl/cl/batchnorm.cl.inc" | ||||
| using mindspore::kernel::KERNEL_ARCH::kGPU; | using mindspore::kernel::KERNEL_ARCH::kGPU; | ||||
| @@ -49,6 +50,7 @@ int BatchNormOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_siz | |||||
| *img_size = vec; | *img_size = vec; | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| int BatchNormOpenCLKernel::Init() { | int BatchNormOpenCLKernel::Init() { | ||||
| auto in_format = op_format_; | auto in_format = op_format_; | ||||
| if (in_format != schema::Format_NHWC4 && in_format != schema::Format_NC4HW4) { | if (in_format != schema::Format_NHWC4 && in_format != schema::Format_NC4HW4) { | ||||
| @@ -79,31 +81,12 @@ int BatchNormOpenCLKernel::Init() { | |||||
| int BatchNormOpenCLKernel::ReSize() { return RET_OK; } | int BatchNormOpenCLKernel::ReSize() { return RET_OK; } | ||||
| int BatchnormGetBiggestDividerWithPriority(int number, int max_divider) { | |||||
| if (number % 8 == 0 && 8 <= max_divider) { | |||||
| return number / 8; | |||||
| } | |||||
| if (number % 4 == 0 && 4 <= max_divider) { | |||||
| return number / 4; | |||||
| } | |||||
| if (number % 2 == 0 && 2 <= max_divider) { | |||||
| return number / 2; | |||||
| } | |||||
| for (int i = max_divider; i != 0; i--) { | |||||
| if (number % i == 0) { | |||||
| return i; | |||||
| } | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) { | void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) { | ||||
| const int max_divider = 8; | const int max_divider = 8; | ||||
| const int max_x = 4, max_y = 8; | const int max_x = 4, max_y = 8; | ||||
| int x = std::min(BatchnormGetBiggestDividerWithPriority(global[0], max_divider), max_x); | |||||
| int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x); | |||||
| int yz = max_size / x; | int yz = max_size / x; | ||||
| int y = std::min(std::min(BatchnormGetBiggestDividerWithPriority(global[1], max_divider), yz), max_y); | |||||
| int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y); | |||||
| int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2))); | int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2))); | ||||
| local->clear(); | local->clear(); | ||||
| @@ -111,6 +94,7 @@ void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t | |||||
| local->push_back(y); | local->push_back(y); | ||||
| local->push_back(z); | local->push_back(z); | ||||
| } | } | ||||
| int BatchNormOpenCLKernel::Run() { | int BatchNormOpenCLKernel::Run() { | ||||
| MS_LOG(DEBUG) << this->name() << " Running! "; | MS_LOG(DEBUG) << this->name() << " Running! "; | ||||
| auto param = reinterpret_cast<BatchNormParameter *>(this->op_parameter_); | auto param = reinterpret_cast<BatchNormParameter *>(this->op_parameter_); | ||||
| @@ -14,12 +14,12 @@ | |||||
| * limitations under the License. | * limitations under the License. | ||||
| */ | */ | ||||
| #include <cstring> | #include <cstring> | ||||
| #include <string> | |||||
| #include <algorithm> | #include <algorithm> | ||||
| #include <set> | #include <set> | ||||
| #include "src/kernel_registry.h" | #include "src/kernel_registry.h" | ||||
| #include "src/runtime/opencl/opencl_runtime.h" | #include "src/runtime/opencl/opencl_runtime.h" | ||||
| #include "src/runtime/kernel/opencl/kernel/concat.h" | #include "src/runtime/kernel/opencl/kernel/concat.h" | ||||
| #include "src/runtime/kernel/opencl/utils.h" | |||||
| #include "src/runtime/kernel/opencl/cl/concat.cl.inc" | #include "src/runtime/kernel/opencl/cl/concat.cl.inc" | ||||
| using mindspore::kernel::KERNEL_ARCH::kGPU; | using mindspore::kernel::KERNEL_ARCH::kGPU; | ||||
| @@ -131,31 +131,12 @@ int ConcatOpenCLKernel::GetSumShape(std::vector<int> *sum_shape, std::vector<int | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| int ConcatGetBiggestDividerWithPriority(int number, int max_divider) { | |||||
| if (number % 8 == 0 && max_divider >= 8) { | |||||
| return number / 8; | |||||
| } | |||||
| if (number % 4 == 0 && 4 <= max_divider) { | |||||
| return number / 4; | |||||
| } | |||||
| if (number % 2 == 0 && 2 <= max_divider) { | |||||
| return number / 2; | |||||
| } | |||||
| for (int i = max_divider; i != 0; i--) { | |||||
| if (number % i == 0) { | |||||
| return i; | |||||
| } | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) { | void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) { | ||||
| const int max_divider = 8; | const int max_divider = 8; | ||||
| const int max_x = 4, max_y = 8; | const int max_x = 4, max_y = 8; | ||||
| int x = std::min(ConcatGetBiggestDividerWithPriority(global[0], max_divider), max_x); | |||||
| int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x); | |||||
| int yz = max_size / x; | int yz = max_size / x; | ||||
| int y = std::min(std::min(ConcatGetBiggestDividerWithPriority(global[1], max_divider), yz), max_y); | |||||
| int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y); | |||||
| int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2))); | int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2))); | ||||
| local->clear(); | local->clear(); | ||||
| @@ -163,6 +144,7 @@ void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> * | |||||
| local->push_back(y); | local->push_back(y); | ||||
| local->push_back(z); | local->push_back(z); | ||||
| } | } | ||||
| int ConcatOpenCLKernel::Run() { | int ConcatOpenCLKernel::Run() { | ||||
| MS_LOG(DEBUG) << this->name() << " Running! "; | MS_LOG(DEBUG) << this->name() << " Running! "; | ||||
| auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_); | auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_); | ||||
| @@ -19,6 +19,7 @@ | |||||
| #include <algorithm> | #include <algorithm> | ||||
| #include "src/common/utils.h" | #include "src/common/utils.h" | ||||
| #include "src/runtime/kernel/opencl/kernel/convolution.h" | #include "src/runtime/kernel/opencl/kernel/convolution.h" | ||||
| #include "src/runtime/kernel/opencl/utils.h" | |||||
| #include "src/kernel_registry.h" | #include "src/kernel_registry.h" | ||||
| #include "include/errorcode.h" | #include "include/errorcode.h" | ||||
| @@ -113,7 +114,7 @@ int ConvolutionOpenCLKernel::Init() { | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| int ConvolutionOpenCLKernel::RearrangeWinogradWeight() { | |||||
| int ConvolutionOpenCLKernel::GenerateWinogradWeight() { | |||||
| constexpr float Gt[] = {1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 0.0000000000, | constexpr float Gt[] = {1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 0.0000000000, | ||||
| 0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000, | 0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000, | ||||
| 0.0000000000, 0.4999999702, 0.4999999702, 1.9999998808, 1.9999998808, 1.0000000000}; | 0.0000000000, 0.4999999702, 0.4999999702, 1.9999998808, 1.9999998808, 1.0000000000}; | ||||
| @@ -155,41 +156,16 @@ int ConvolutionOpenCLKernel::RearrangeWinogradWeight() { | |||||
| } | } | ||||
| if (use_fp16_) { | if (use_fp16_) { | ||||
| OHWI2OHWIOGroupI4O4<float, float16_t>(encoded_weight.data(), 6, 6, 2); | |||||
| ConvertConvWeight4DTo7D<float, float16_t>(reinterpret_cast<void *>(encoded_weight.data()), packed_weight_, CO_, 6, | |||||
| 6, CI_, 2); | |||||
| } else { | } else { | ||||
| OHWI2OHWIOGroupI4O4<float, float>(encoded_weight.data(), 6, 6, 2); | |||||
| ConvertConvWeight4DTo7D<float, float>(reinterpret_cast<void *>(encoded_weight.data()), packed_weight_, CO_, 6, 6, | |||||
| CI_, 2); | |||||
| } | } | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| template <typename SRC_T, typename DST_T> | |||||
| int ConvolutionOpenCLKernel::OHWI2OHWIOGroupI4O4(void *weight_OHWI, size_t KH, size_t KW, size_t OGroup) { | |||||
| auto origin_weight = reinterpret_cast<SRC_T *>(weight_OHWI); | |||||
| auto packed_weight = reinterpret_cast<DST_T *>(packed_weight_); | |||||
| // OHWI -> O/OGroup/4 KH KW I/4 OGroup I4 O4 | |||||
| for (size_t co = 0, src_idx = 0; co < CO_; ++co) { | |||||
| for (size_t kh = 0; kh < KH; ++kh) { | |||||
| for (size_t kw = 0; kw < KW; ++kw) { | |||||
| for (size_t ci = 0; ci < CI_; ++ci) { | |||||
| size_t co_outer = co / (CO_TILE * OGroup); | |||||
| size_t group_idx = co % (CO_TILE * OGroup) / CO_TILE; | |||||
| size_t co_inner = co % CO_TILE; | |||||
| size_t ci_outer = ci / CI_TILE; | |||||
| size_t ci_inner = ci % CI_TILE; | |||||
| size_t dst_idx = | |||||
| (((((co_outer * KH + kh) * KW + kw) * CI_SLICES_ + ci_outer) * OGroup + group_idx) * CI_TILE + ci_inner) * | |||||
| CO_TILE + | |||||
| co_inner; | |||||
| packed_weight[dst_idx] = static_cast<DST_T>(origin_weight[src_idx++]); | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int ConvolutionOpenCLKernel::InitWeight() { | int ConvolutionOpenCLKernel::InitWeight() { | ||||
| auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator(); | auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator(); | ||||
| @@ -206,20 +182,20 @@ int ConvolutionOpenCLKernel::InitWeight() { | |||||
| // rearrange weight | // rearrange weight | ||||
| if (use_winograd_) { | if (use_winograd_) { | ||||
| RearrangeWinogradWeight(); | |||||
| GenerateWinogradWeight(); | |||||
| } else { | } else { | ||||
| auto weight_tensor = in_tensors_[1]; | auto weight_tensor = in_tensors_[1]; | ||||
| if (weight_tensor->data_type() == kNumberTypeFloat16) { | if (weight_tensor->data_type() == kNumberTypeFloat16) { | ||||
| if (use_fp16_) { | if (use_fp16_) { | ||||
| OHWI2OHWIOGroupI4O4<float16_t, float16_t>(weight_tensor->data_c(), KH_, KW_, 1); | |||||
| ConvertConvWeight4DTo7D<float16_t, float16_t>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_); | |||||
| } else { | } else { | ||||
| OHWI2OHWIOGroupI4O4<float16_t, float>(weight_tensor->data_c(), KH_, KW_, 1); | |||||
| ConvertConvWeight4DTo7D<float16_t, float>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_); | |||||
| } | } | ||||
| } else { | } else { | ||||
| if (use_fp16_) { | if (use_fp16_) { | ||||
| OHWI2OHWIOGroupI4O4<float, float16_t>(weight_tensor->data_c(), KH_, KW_, 1); | |||||
| ConvertConvWeight4DTo7D<float, float16_t>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_); | |||||
| } else { | } else { | ||||
| OHWI2OHWIOGroupI4O4<float, float>(weight_tensor->data_c(), KH_, KW_, 1); | |||||
| ConvertConvWeight4DTo7D<float, float>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_); | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| @@ -635,7 +611,7 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd4x4To36() { | |||||
| " }\n" | " }\n" | ||||
| "\n" | "\n" | ||||
| " int IH = input_shape.y, IW = input_shape.z;\n" | " int IH = input_shape.y, IW = input_shape.z;\n" | ||||
| " int TILE_X = IW / 4;\n" | |||||
| " int TILE_X = UP_DIV(IW, 4);\n" | |||||
| " int tile_x = tile_xy % TILE_X;\n" | " int tile_x = tile_xy % TILE_X;\n" | ||||
| " int tile_y = tile_xy / TILE_X;\n" | " int tile_y = tile_xy / TILE_X;\n" | ||||
| "\n" | "\n" | ||||
| @@ -764,6 +740,8 @@ std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() { | |||||
| std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() { | std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() { | ||||
| std::string code = | std::string code = | ||||
| "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" | "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" | ||||
| "#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))\n" | |||||
| "\n" | |||||
| "__constant sampler_t\n" | "__constant sampler_t\n" | ||||
| "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n" | "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n" | ||||
| "\n" | "\n" | ||||
| @@ -804,6 +782,7 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() { | |||||
| " }\n" | " }\n" | ||||
| " }\n" | " }\n" | ||||
| "\n" | "\n" | ||||
| " int TILE_X = UP_DIV(OW, 4);\n" | |||||
| " for (int x = 0; x < 4; x++)\n" | " for (int x = 0; x < 4; x++)\n" | ||||
| " {\n" | " {\n" | ||||
| " FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" | " FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" | ||||
| @@ -822,14 +801,15 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() { | |||||
| } | } | ||||
| code += | code += | ||||
| " int TILE_X = OW / 4;\n" | |||||
| " int tile_x = tile_xy % TILE_X * 4;\n" | |||||
| " int tile_y = tile_xy / TILE_X * 4;\n"; | |||||
| " int tile_x = tile_xy % TILE_X;\n" | |||||
| " int tile_y = tile_xy / TILE_X;\n" | |||||
| " int ow = tile_x * 4 + x;\n" | |||||
| " int oh = tile_y * 4 + row;\n"; | |||||
| if (op_format_ == Format_NHWC4) { | if (op_format_ == Format_NHWC4) { | ||||
| code += " WRITE_IMAGE(output, (int2)((tile_x + x) * SLICES + slice, tile_y + row), acc);\n"; | |||||
| code += " if(ow < OW) { WRITE_IMAGE(output, (int2)(ow * SLICES + slice, oh), acc);}\n"; | |||||
| } else if (op_format_ == Format_NC4HW4) { | } else if (op_format_ == Format_NC4HW4) { | ||||
| code += " WRITE_IMAGE(output, (int2)(tile_x + x, slice * OH + tile_y + row), acc);\n"; | |||||
| code += " if(oh < OH) { WRITE_IMAGE(output, (int2)(ow, slice * OH + oh), acc);}\n"; | |||||
| } | } | ||||
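
Switching the tile count to `UP_DIV(OW, 4)` makes the final partial tile exist at all, and the `ow < OW && oh < OH` guards keep that tile from writing outside the real output. A small check of the column arithmetic, with OW assumed purely for illustration:

```cpp
#include <cstdio>
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))

int main() {
  const int OW = 10;                   // assumed output width, not a multiple of 4
  const int TILE_X = UP_DIV(OW, 4);    // 3 tiles spanning columns 0..11
  for (int tile_x = 0; tile_x < TILE_X; ++tile_x)
    for (int x = 0; x < 4; ++x) {
      const int ow = tile_x * 4 + x;
      if (ow < OW) printf("%d ", ow);  // the guard skips ow = 10 and 11
    }
  printf("\n");                        // prints 0..9; the old OW / 4 tiling lost columns 8..9
}
```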
| code += | code += | ||||
| @@ -849,7 +829,7 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std | |||||
| size_t global_w = UP_DIV(OW_, work_group_size[1]) * work_group_size[1]; | size_t global_w = UP_DIV(OW_, work_group_size[1]) * work_group_size[1]; | ||||
| size_t global_c = UP_DIV(CO_SLICES_, work_group_size[2]) * work_group_size[2]; | size_t global_c = UP_DIV(CO_SLICES_, work_group_size[2]) * work_group_size[2]; | ||||
| size_t local_c = GetBiggestDivider(global_c, max_z_size); | |||||
| size_t local_c = GetMaxDivisor(global_c, max_z_size); | |||||
| if (local_c == 0) { | if (local_c == 0) { | ||||
| MS_LOG(ERROR) << "Divide by zero"; | MS_LOG(ERROR) << "Divide by zero"; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| @@ -68,9 +68,7 @@ class ConvolutionOpenCLKernel : public OpenCLKernel { | |||||
| int InitWeight(); | int InitWeight(); | ||||
| int InitBias(); | int InitBias(); | ||||
| int RearrangeWinogradWeight(); | |||||
| template <typename SRC_T, typename DST_T> | |||||
| int OHWI2OHWIOGroupI4O4(void *weight_OHWI, size_t KH, size_t KW, size_t OGroup); | |||||
| int GenerateWinogradWeight(); | |||||
| std::string CodeGenConvolutionNHWC4(); | std::string CodeGenConvolutionNHWC4(); | ||||
| std::string CodeGenConvolutionNC4HW4(); | std::string CodeGenConvolutionNC4HW4(); | ||||
| @@ -90,29 +88,6 @@ class ConvolutionOpenCLKernel : public OpenCLKernel { | |||||
| const bool hw_good = TILES_X_ * TILES_Y_ >= 16; | const bool hw_good = TILES_X_ * TILES_Y_ >= 16; | ||||
| return attr_valid && channel_good && hw_good; | return attr_valid && channel_good && hw_good; | ||||
| } | } | ||||
| static std::vector<float> MatrixMultiply(const float A[], const float B[], int M, int N, int K) { | |||||
| std::vector<float> C(M * K); | |||||
| for (int i = 0; i < M; ++i) { | |||||
| for (int j = 0; j < K; ++j) { | |||||
| float s = 0.0f; | |||||
| for (int k = 0; k < N; ++k) { | |||||
| s += A[i * N + k] * B[k * K + j]; | |||||
| } | |||||
| C[i * K + j] = s; | |||||
| } | |||||
| } | |||||
| return C; | |||||
| } | |||||
| static int GetBiggestDivider(int x, int y) { | |||||
| for (int i = y; i != 0; i--) { | |||||
| if (x % i == 0) { | |||||
| return i; | |||||
| } | |||||
| } | |||||
| return 1; | |||||
| } | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -20,6 +20,7 @@ | |||||
| #include "src/kernel_registry.h" | #include "src/kernel_registry.h" | ||||
| #include "src/runtime/opencl/opencl_runtime.h" | #include "src/runtime/opencl/opencl_runtime.h" | ||||
| #include "src/runtime/kernel/opencl/kernel/slice.h" | #include "src/runtime/kernel/opencl/kernel/slice.h" | ||||
| #include "src/runtime/kernel/opencl/utils.h" | |||||
| #include "src/runtime/kernel/opencl/cl/slice.cl.inc" | #include "src/runtime/kernel/opencl/cl/slice.cl.inc" | ||||
| using mindspore::kernel::KERNEL_ARCH::kGPU; | using mindspore::kernel::KERNEL_ARCH::kGPU; | ||||
| @@ -49,6 +50,7 @@ int SliceOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) { | |||||
| *img_size = vec; | *img_size = vec; | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| int SliceOpenCLKernel::Init() { | int SliceOpenCLKernel::Init() { | ||||
| std::string kernel_name = "slice"; | std::string kernel_name = "slice"; | ||||
| auto in_format = op_format_; | auto in_format = op_format_; | ||||
| @@ -77,28 +79,12 @@ int SliceOpenCLKernel::Init() { | |||||
| int SliceOpenCLKernel::ReSize() { return RET_OK; } | int SliceOpenCLKernel::ReSize() { return RET_OK; } | ||||
| int SliceGetBiggestDividerWithPriority(int number, int max_divider) { | |||||
| if (number % 8 == 0 && 8 <= max_divider) { | |||||
| return number / 8; | |||||
| } else if (number % 4 == 0 && 4 <= max_divider) { | |||||
| return number / 4; | |||||
| } else if (number % 2 == 0 && 2 <= max_divider) { | |||||
| return number / 2; | |||||
| } | |||||
| for (int i = max_divider; i != 0; i--) { | |||||
| if (number % i == 0) { | |||||
| return i; | |||||
| } | |||||
| } | |||||
| return 1; | |||||
| } | |||||
| void SlcieGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) { | void SlcieGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) { | ||||
| const int max_divider = 8; | const int max_divider = 8; | ||||
| const int max_x = 4, max_y = 8; | const int max_x = 4, max_y = 8; | ||||
| int x = std::min(SliceGetBiggestDividerWithPriority(global[0], max_divider), max_x); | |||||
| int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x); | |||||
| int yz = max_size / x; | int yz = max_size / x; | ||||
| int y = std::min(std::min(SliceGetBiggestDividerWithPriority(global[1], max_divider), yz), max_y); | |||||
| int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y); | |||||
| int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2))); | int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2))); | ||||
| local->clear(); | local->clear(); | ||||
| @@ -106,6 +92,7 @@ void SlcieGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *l | |||||
| local->push_back(y); | local->push_back(y); | ||||
| local->push_back(z); | local->push_back(z); | ||||
| } | } | ||||
| int SliceOpenCLKernel::Run() { | int SliceOpenCLKernel::Run() { | ||||
| MS_LOG(DEBUG) << this->name() << " Running! "; | MS_LOG(DEBUG) << this->name() << " Running! "; | ||||
| auto param = reinterpret_cast<SliceParameter *>(this->op_parameter_); | auto param = reinterpret_cast<SliceParameter *>(this->op_parameter_); | ||||
| @@ -154,5 +141,4 @@ kernel::LiteKernel *OpenCLSliceKernelCreator(const std::vector<lite::Tensor *> & | |||||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Slice, OpenCLSliceKernelCreator); | REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Slice, OpenCLSliceKernelCreator); | ||||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Slice, OpenCLSliceKernelCreator); | REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Slice, OpenCLSliceKernelCreator); | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -16,7 +16,6 @@ | |||||
| #include "src/runtime/kernel/opencl/utils.h" | #include "src/runtime/kernel/opencl/utils.h" | ||||
| #include <algorithm> | #include <algorithm> | ||||
| #include <string> | |||||
| #include <vector> | #include <vector> | ||||
| #include "src/kernel_registry.h" | #include "src/kernel_registry.h" | ||||
| @@ -34,26 +33,61 @@ kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, con | |||||
| } | } | ||||
| } // namespace mindspore::lite | } // namespace mindspore::lite | ||||
| namespace mindspore { | |||||
| namespace kernel { | |||||
| namespace mindspore::kernel { | |||||
| int GetMaxDivisor(int x, int divisor) { | |||||
| int i = divisor; | |||||
| while (i > 0) { | |||||
| if (x % i == 0) { | |||||
| return i; | |||||
| } | |||||
| i--; | |||||
| } | |||||
| return 1; | |||||
| } | |||||
| int GetMaxDivisorStrategy0(int x, int divisor) { | |||||
| if (divisor >= 8 && x % 8 == 0) { | |||||
| return 8; | |||||
| } else if (divisor >= 4 && x % 4 == 0) { | |||||
| return 4; | |||||
| } else if (divisor >= 2 && x % 2 == 0) { | |||||
| return 2; | |||||
| } else { | |||||
| return GetMaxDivisor(x, divisor); | |||||
| } | |||||
| } | |||||
| int GetMaxDivisorStrategy1(int x, int divisor) { | |||||
| if (divisor >= 8 && x % 8 == 0) { | |||||
| return x / 8; | |||||
| } else if (divisor >= 4 && x % 4 == 0) { | |||||
| return x / 4; | |||||
| } else if (divisor >= 2 && x % 2 == 0) { | |||||
| return x / 2; | |||||
| } else { | |||||
| return GetMaxDivisor(x, divisor); | |||||
| } | |||||
| } | |||||
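
The two strategies differ only in what they return for the priority divisors 8, 4 and 2: strategy 0 returns the divisor itself, while strategy 1 returns the co-factor `x / divisor`, which the kernels' work-group helpers use directly as a local-size dimension. A quick sanity check, assuming the declarations above are visible through `opencl/utils.h`:

```cpp
#include <cassert>
#include "src/runtime/kernel/opencl/utils.h"

using mindspore::kernel::GetMaxDivisorStrategy0;
using mindspore::kernel::GetMaxDivisorStrategy1;

int main() {
  assert(GetMaxDivisorStrategy0(24, 8) == 8);  // largest of {8, 4, 2} dividing 24
  assert(GetMaxDivisorStrategy1(24, 8) == 3);  // the co-factor 24 / 8
  assert(GetMaxDivisorStrategy1(20, 8) == 5);  // 20 % 8 != 0 but 20 % 4 == 0 -> 20 / 4
  assert(GetMaxDivisorStrategy0(7, 8) == 7);   // odd x falls back to GetMaxDivisor
  return 0;
}
```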
| std::vector<size_t> GetCommonGlobalSize(const std::vector<size_t> &local, const std::vector<size_t> &global) { | std::vector<size_t> GetCommonGlobalSize(const std::vector<size_t> &local, const std::vector<size_t> &global) { | ||||
| std::vector<size_t> result(3, 1); | |||||
| std::vector<size_t> result(3); | |||||
| for (int i = 0; i < 3; ++i) { | for (int i = 0; i < 3; ++i) { | ||||
| result[i] = AlignByN(global[i], local[i]); | |||||
| result[i] = UP_ROUND(global[i], local[i]); | |||||
| } | } | ||||
| return result; | return result; | ||||
| } | } | ||||
| std::vector<size_t> GetCommonLocalSize(const std::vector<size_t> &global, int max_size) { | std::vector<size_t> GetCommonLocalSize(const std::vector<size_t> &global, int max_size) { | ||||
| size_t wg_z = GetBiggestDividerWithPriority(global[2], 8); | |||||
| if (wg_z == 0) { | |||||
| size_t local_z = GetMaxDivisorStrategy0(global[2], 8); | |||||
| if (local_z == 0) { | |||||
| MS_LOG(ERROR) << "Divide by zero"; | MS_LOG(ERROR) << "Divide by zero"; | ||||
| return {}; | return {}; | ||||
| } | } | ||||
| size_t wg_xy_size = max_size / wg_z; | |||||
| size_t wg_x = std::min(DivideRoundUp(global[0], 2), wg_xy_size); | |||||
| size_t wg_y = std::min(wg_xy_size / wg_x, global[1]); | |||||
| std::vector<size_t> local = {wg_x, wg_y, wg_z}; | |||||
| size_t local_xy = max_size / local_z; | |||||
| size_t local_x = std::min(UP_DIV(global[0], 2), local_xy); | |||||
| size_t local_y = std::min(local_xy / local_x, global[1]); | |||||
| std::vector<size_t> local = {local_x, local_y, local_z}; | |||||
| return local; | return local; | ||||
| } | } | ||||
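
Worked through with assumed numbers: for global = {16, 16, 24} and max_size = 256, local_z = GetMaxDivisorStrategy0(24, 8) = 8, local_xy = 256 / 8 = 32, local_x = min(UP_DIV(16, 2), 32) = 8 and local_y = min(32 / 8, 16) = 4, so the returned local size {8, 4, 8} fills the 256-invocation budget exactly.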
| @@ -187,5 +221,4 @@ std::string CLErrorCode(cl_int error_code) { | |||||
| return "Unknown OpenCL error code"; | return "Unknown OpenCL error code"; | ||||
| } | } | ||||
| } | } | ||||
| } // namespace kernel | |||||
| } // namespace mindspore | |||||
| } // namespace mindspore::kernel | |||||
| @@ -23,7 +23,7 @@ | |||||
| #include "utils/log_adapter.h" | #include "utils/log_adapter.h" | ||||
| #include "nnacl/op_base.h" | #include "nnacl/op_base.h" | ||||
| #include "src/lite_kernel.h" | #include "src/lite_kernel.h" | ||||
| #include "src/common//utils.h" | |||||
| #include "src/common/utils.h" | |||||
| namespace mindspore::lite { | namespace mindspore::lite { | ||||
| kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors, | kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors, | ||||
| @@ -32,59 +32,14 @@ kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, con | |||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| /** | |||||
| * GetLocalSize | |||||
| * @param number | |||||
| * @param max_divider | |||||
| * @return | |||||
| */ | |||||
| template <typename T, typename N> | |||||
| T GetBiggestDividerWithPriority(T number, N max_divider) { | |||||
| if (number % 8 == 0 && 8 <= max_divider) { | |||||
| return (T)8; | |||||
| } | |||||
| if (number % 4 == 0 && 4 <= max_divider) { | |||||
| return (T)4; | |||||
| } | |||||
| if (number % 2 == 0 && 2 <= max_divider) { | |||||
| return (T)2; | |||||
| } | |||||
| for (int i = max_divider; i != 0; i--) { | |||||
| if (number % i == 0) { | |||||
| return (T)i; | |||||
| } | |||||
| } | |||||
| return (T)1; | |||||
| } | |||||
| int GetMaxDivisor(int x, int divisor); | |||||
| /** | |||||
| * GetLocalSize | |||||
| * @param n must be non negative | |||||
| * @param divisor must be greater than zero | |||||
| * @return | |||||
| */ | |||||
| template <typename T, typename N> | |||||
| T DivideRoundUp(T n, N divisor) { | |||||
| const T div = static_cast<T>(divisor); | |||||
| const T q = n / div; | |||||
| return n % div == 0 ? q : q + 1; | |||||
| } | |||||
| int GetMaxDivisorStrategy0(int x, int divisor); | |||||
| /** | |||||
| * GetLocalSize | |||||
| * @param number | |||||
| * @param n | |||||
| * @return | |||||
| */ | |||||
| template <typename T, typename N> | |||||
| T AlignByN(T number, N n) { | |||||
| return DivideRoundUp(number, n) * n; | |||||
| } | |||||
| int GetMaxDivisorStrategy1(int x, int divisor); | |||||
| // GetGlobalSize | |||||
| std::vector<size_t> GetCommonGlobalSize(const std::vector<size_t> &local, const std::vector<size_t> &global); | std::vector<size_t> GetCommonGlobalSize(const std::vector<size_t> &local, const std::vector<size_t> &global); | ||||
| // GetLocalSize | |||||
| std::vector<size_t> GetCommonLocalSize(const std::vector<size_t> &global, int max_size); | std::vector<size_t> GetCommonLocalSize(const std::vector<size_t> &global, int max_size); | ||||
| std::string CLErrorCode(cl_int error_code); | std::string CLErrorCode(cl_int error_code); | ||||
| @@ -108,6 +63,7 @@ void PackNCHWToNC4HW4(void *src, void *dst, int batch, int plane, int channel, c | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| template <class T1, class T2> | template <class T1, class T2> | ||||
| void PackNHWCToNHWC4(void *src, void *dst, int batch, int plane, int channel, const std::function<T2(T1)> &to_dtype) { | void PackNHWCToNHWC4(void *src, void *dst, int batch, int plane, int channel, const std::function<T2(T1)> &to_dtype) { | ||||
| int c4 = UP_DIV(channel, C4NUM); | int c4 = UP_DIV(channel, C4NUM); | ||||
| @@ -132,6 +88,7 @@ void PackNHWCToNHWC4(void *src, void *dst, int batch, int plane, int channel, co | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| template <class T1, class T2> | template <class T1, class T2> | ||||
| void PackNHWCToNC4HW4(void *src, void *dst, int batch, int plane, int channel, const std::function<T2(T1)> &to_dtype) { | void PackNHWCToNC4HW4(void *src, void *dst, int batch, int plane, int channel, const std::function<T2(T1)> &to_dtype) { | ||||
| int c4 = UP_DIV(channel, C4NUM); | int c4 = UP_DIV(channel, C4NUM); | ||||
| @@ -152,6 +109,47 @@ void PackNHWCToNC4HW4(void *src, void *dst, int batch, int plane, int channel, c | |||||
| } | } | ||||
| } | } | ||||
| template <class T> | |||||
| std::vector<T> MatrixMultiply(const T A[], const T B[], int M, int N, int K) { | |||||
| std::vector<T> C(M * K); | |||||
| for (int i = 0; i < M; ++i) { | |||||
| for (int j = 0; j < K; ++j) { | |||||
| float s = 0.0f; | |||||
| for (int k = 0; k < N; ++k) { | |||||
| s += A[i * N + k] * B[k * K + j]; | |||||
| } | |||||
| C[i * K + j] = s; | |||||
| } | |||||
| } | |||||
| return C; | |||||
| } | |||||
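
`MatrixMultiply` generalizes the private helper that convolution.h used to carry; GenerateWinogradWeight presumably uses it with the Gt constants shown earlier to form the Winograd filter transform. A hypothetical standalone check of the M/N/K convention (A is M×N, B is N×K, C is M×K):

```cpp
#include <cstdio>
#include <vector>
#include "src/runtime/kernel/opencl/utils.h"  // MatrixMultiply template above

int main() {
  const float A[] = {1, 2, 3,
                     4, 5, 6};        // 2x3
  const float B[] = {1, 0,
                     0, 1,
                     1, 1};           // 3x2
  auto C = mindspore::kernel::MatrixMultiply(A, B, 2, 3, 2);
  for (float v : C) printf("%g ", v);  // 4 5 10 11
  printf("\n");
}
```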
| template <typename SRC_T, typename DST_T> | |||||
| void ConvertConvWeight4DTo7D(void *src, void *dst, size_t CO, size_t KH, size_t KW, size_t CI, size_t OGroup = 1, | |||||
| size_t CI_TILE = 4, size_t CO_TILE = 4) { | |||||
| auto origin_weight = reinterpret_cast<SRC_T *>(src); | |||||
| auto packed_weight = reinterpret_cast<DST_T *>(dst); | |||||
| auto CI_SLICES = UP_DIV(CI, CI_TILE); | |||||
| for (size_t co = 0, src_idx = 0; co < CO; ++co) { | |||||
| for (size_t kh = 0; kh < KH; ++kh) { | |||||
| for (size_t kw = 0; kw < KW; ++kw) { | |||||
| for (size_t ci = 0; ci < CI; ++ci) { | |||||
| size_t co_outer = co / (CO_TILE * OGroup); | |||||
| size_t group_idx = co % (CO_TILE * OGroup) / CO_TILE; | |||||
| size_t co_inner = co % CO_TILE; | |||||
| size_t ci_outer = ci / CI_TILE; | |||||
| size_t ci_inner = ci % CI_TILE; | |||||
| size_t dst_idx = | |||||
| (((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * OGroup + group_idx) * CI_TILE + ci_inner) * | |||||
| CO_TILE + | |||||
| co_inner; | |||||
| packed_weight[dst_idx] = static_cast<DST_T>(origin_weight[src_idx++]); | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
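
`ConvertConvWeight4DTo7D` subsumes the per-kernel `OHWI2OHWIOGroupI4O4` routine: OHWI weights are repacked into O/(OGroup·4) × KH × KW × CI/4 × OGroup × I4 × O4 order, with OGroup = 2 on the Winograd path and the default 1 elsewhere. A worked example with sizes assumed purely for illustration:

```cpp
#include <cstdio>
#include <vector>
#include "src/runtime/kernel/opencl/utils.h"  // template defined above

int main() {
  // CO = 8, KH = KW = 1, CI = 8, OGroup = 2 -> CI_SLICES = 2, and the packed
  // destination holds (8 / (2 * 4)) * 1 * 1 * 2 * 2 * 4 * 4 = 64 elements.
  std::vector<float> src(8 * 1 * 1 * 8), dst(64, 0.0f);
  for (size_t i = 0; i < src.size(); ++i) src[i] = static_cast<float>(i);
  mindspore::kernel::ConvertConvWeight4DTo7D<float, float>(src.data(), dst.data(), 8, 1, 1, 8, 2);
  // The element at co = 5, ci = 6 (src_idx = 5 * 8 + 6 = 46) lands at
  // dst_idx = (((0 * 2 + 1) * 2 + 1) * 4 + 2) * 4 + 1 = 57, since co_outer = 0,
  // group_idx = 1, co_inner = 1, ci_outer = 1, ci_inner = 2.
  printf("%g\n", dst[57]);  // prints 46
}
```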
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_UTILS_H_ | #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_UTILS_H_ | ||||
| @@ -133,6 +133,8 @@ Format get_op_format(Format input_format) { | |||||
| case Format_NHWC: | case Format_NHWC: | ||||
| case Format_NHWC4: | case Format_NHWC4: | ||||
| return Format_NHWC4; | return Format_NHWC4; | ||||
| case Format_NCHW: | |||||
| return Format_NHWC4; | |||||
| default: | default: | ||||
| return Format_NC4HW4; | return Format_NC4HW4; | ||||
| } | } | ||||
| @@ -249,7 +251,7 @@ TEST_F(TestConvolutionOpenCL, winograd_inputNHWC_1x16x256x96_outputNHWC_1x16x256 | |||||
| TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 0.6f, "testcases/test_fp32/"); | TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 0.6f, "testcases/test_fp32/"); | ||||
| } | } | ||||
| TEST_F(TestConvolutionOpenCL, simple_test0) { | |||||
| TEST_F(TestConvolutionOpenCL, simple_test0_NHWC) { | |||||
| std::string attr = | std::string attr = | ||||
| "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1"; | "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1"; | ||||
| float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; | float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; | ||||
| @@ -259,6 +261,34 @@ TEST_F(TestConvolutionOpenCL, simple_test0) { | |||||
| TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data); | TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data); | ||||
| TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data); | TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data); | ||||
| } | } | ||||
| TEST_F(TestConvolutionOpenCL, simple_test0_NCHW) { | |||||
| std::string attr = | |||||
| "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1"; | |||||
| float input_data[] = {0.0f, 2.0f, 4.0f, 6.0f, 1.0f, 3.0f, 5.0f, 7.0f}; | |||||
| float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; | |||||
| float bias_data[] = {0.0f, 0.0f}; | |||||
| float expect_data[] = {1.0f, 5.0f, 9.0f, 13.0f, 1.0f, 5.0f, 9.0f, 13.0f}; | |||||
| TEST_MAIN(attr, Format_NCHW, Format_NCHW, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data); | |||||
| TEST_MAIN(attr, Format_NCHW, Format_NCHW, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data); | |||||
| } | |||||
| TEST_F(TestConvolutionOpenCL, simple_test0_NHWC4_and_NC4HW4) { | |||||
| std::string attr = | |||||
| "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1"; | |||||
| float input_data[] = {0.0f, 1.0f, 0.0f, 0.0f, 2.0f, 3.0f, 0.0f, 0.0f, 4.0f, 5.0f, 0.0f, 0.0f, 6.0f, 7.0f, 0.0f, 0.0f}; | |||||
| float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; | |||||
| float bias_data[] = {0.0f, 0.0f}; | |||||
| float expect_data[] = {1.0f, 1.0f, 0.0f, 0.0f, 5.0f, 5.0f, 0.0f, 0.0f, | |||||
| 9.0f, 9.0f, 0.0f, 0.0f, 13.0f, 13.0f, 0.0f, 0.0f}; | |||||
| TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, | |||||
| expect_data); | |||||
| TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, | |||||
| expect_data); | |||||
| TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, | |||||
| expect_data); | |||||
| TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, | |||||
| expect_data); | |||||
| } | |||||
| TEST_F(TestConvolutionOpenCL, simple_test1) { | TEST_F(TestConvolutionOpenCL, simple_test1) { | ||||
| std::string attr = | std::string attr = | ||||