From: @pengyongrong Reviewed-by: Signed-off-by:tags/v1.2.0-rc1
| @@ -73,7 +73,7 @@ __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | | |||
| int coordinate_x = Y * input_shape0.w + Z; \ | |||
| int coordinate_y = X; \ | |||
| result = READ_IMAGE(input0, smp_none, (int2)(coordinate_x, coordinate_y)); \ | |||
| } else { \ | |||
| } else if (Y < boundary1) { \ | |||
| int coordinate_x = (Y - boundary0) * input_shape1.w + Z; \ | |||
| int coordinate_y = X; \ | |||
| result = READ_IMAGE(input1, smp_none, (int2)(coordinate_x, coordinate_y)); \ | |||
| @@ -123,7 +123,7 @@ __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | | |||
| int coordinate_x = Y * input_shape0.w + Z; \ | |||
| int coordinate_y = X; \ | |||
| result = READ_IMAGE(input0, smp_none, (int2)(coordinate_x, coordinate_y)); \ | |||
| } else { \ | |||
| } else if (Z < boundary1) { \ | |||
| int coordinate_x = Y * input_shape1.w + Z - boundary0; \ | |||
| int coordinate_y = X; \ | |||
| result = READ_IMAGE(input1, smp_none, (int2)(coordinate_x, coordinate_y)); \ | |||
| @@ -1,5 +1,5 @@ | |||
| #pragma OPENCL EXTENSION cl_khr_fp16 : enable | |||
| #define C4NUM 4 | |||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | |||
| __kernel void gather(__write_only image2d_t dst_data, __read_only image2d_t src_data, __global int *indices, | |||
| @@ -0,0 +1,114 @@ | |||
| #pragma OPENCL EXTENSION cl_khr_fp16 : enable | |||
| __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | |||
| #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) | |||
| #define C4NUM 4 | |||
| #define CHECK_IDX_ALIGN \ | |||
| const int X = get_global_id(0); \ | |||
| const int Y = get_global_id(1); \ | |||
| const int Z = get_global_id(2); \ | |||
| if (X > in_shape.x * in_shape.y || Y > in_shape.z || Z > in_shape.w || in_shape.y == 0) { \ | |||
| return; \ | |||
| } | |||
| #define ARGS_ALIGN \ | |||
| const int IN = X / in_shape.y; \ | |||
| const int IH = X % in_shape.y; \ | |||
| int coordinate_x = IN * in_shape.y + IH; \ | |||
| int coordinate_y = Y * in_shape.w + Z; \ | |||
| FLT4 result = READ_IMAGE(input, smp_none, (int2)(coordinate_y, coordinate_x)); | |||
| __kernel void split_out2_axis3(__read_only image2d_t input, __write_only image2d_t output1, | |||
| __write_only image2d_t output2, __global int *split_sizes_, int4 in_shape, | |||
| int4 out_shape1, int4 out_shape2) { | |||
| CHECK_IDX_ALIGN; | |||
| ARGS_ALIGN; | |||
| int boundary = UP_DIV(split_sizes_[0], C4NUM); | |||
| if (Z < boundary) { | |||
| coordinate_x = IN * out_shape1.y + IH; | |||
| coordinate_y = Y * out_shape1.w + Z; | |||
| WRITE_IMAGE(output1, (int2)(coordinate_y, coordinate_x), result); | |||
| } else { | |||
| coordinate_x = IN * out_shape2.y + IH; | |||
| coordinate_y = Y * out_shape2.w + Z - boundary; | |||
| WRITE_IMAGE(output2, (int2)(coordinate_y, coordinate_x), result); | |||
| } | |||
| } | |||
| __kernel void split_out2_axis2(__read_only image2d_t input, __write_only image2d_t output1, | |||
| __write_only image2d_t output2, __global int *split_sizes_, int4 in_shape, | |||
| int4 out_shape1, int4 out_shape2) { | |||
| CHECK_IDX_ALIGN; | |||
| ARGS_ALIGN; | |||
| if (Y < split_sizes_[0]) { | |||
| coordinate_x = IN * out_shape1.y + IH; | |||
| coordinate_y = Y * out_shape1.w + Z; | |||
| WRITE_IMAGE(output1, (int2)(coordinate_y, coordinate_x), result); | |||
| } else { | |||
| coordinate_x = IN * out_shape2.y + IH; | |||
| coordinate_y = (Y - split_sizes_[0]) * out_shape2.w + Z; | |||
| WRITE_IMAGE(output2, (int2)(coordinate_y, coordinate_x), result); | |||
| } | |||
| } | |||
| __kernel void split_out2_axis1(__read_only image2d_t input, __write_only image2d_t output1, | |||
| __write_only image2d_t output2, __global int *split_sizes_, int4 in_shape, | |||
| int4 out_shape1, int4 out_shape2) { | |||
| CHECK_IDX_ALIGN; | |||
| ARGS_ALIGN; | |||
| if (IH < split_sizes_[0]) { | |||
| coordinate_x = IN * out_shape1.y + IH; | |||
| coordinate_y = Y * out_shape1.w + Z; | |||
| WRITE_IMAGE(output1, (int2)(coordinate_y, coordinate_x), result); | |||
| } else { | |||
| coordinate_x = IN * out_shape2.y + IH - split_sizes_[0]; | |||
| coordinate_y = Y * out_shape2.w + Z; | |||
| WRITE_IMAGE(output2, (int2)(coordinate_y, coordinate_x), result); | |||
| } | |||
| } | |||
| // UnAlign in Axis C for concat | |||
| #define CHECK_IDX_UNALIGN \ | |||
| const int X = get_global_id(0); \ | |||
| const int Y = get_global_id(1); \ | |||
| if (X >= in_shape.x * in_shape.y || Y >= in_shape.z || in_shape.y == 0) { \ | |||
| return; \ | |||
| } | |||
| #define ARGS_UNALIGN \ | |||
| const int IN = X / in_shape.y, IH = X % in_shape.y; \ | |||
| const int IW = Y; \ | |||
| const int Align_inShape = UP_DIV(in_shape.w, C4NUM); \ | |||
| int index_input = (IN * in_shape.y + IH) * stride_w + IW * Align_inShape * C4NUM; | |||
| int dosplit(__global FLT *input, __write_only image2d_t output, int4 out_shape, int IN, int IH, int IW, | |||
| int index_input) { | |||
| int Remainder = out_shape.w % C4NUM; | |||
| int coordinate_x = IN * out_shape.y + IH; | |||
| int align_w = UP_DIV(out_shape.w, C4NUM); | |||
| for (int i = 0; i < align_w; ++i) { | |||
| int coordinate_y = IW * align_w + i; | |||
| if ((i + 1) * C4NUM <= out_shape.w) { | |||
| FLT4 result = {input[index_input], input[index_input + 1], input[index_input + 2], input[index_input + 3]}; | |||
| WRITE_IMAGE(output, (int2)(coordinate_y, coordinate_x), result); | |||
| index_input += 4; | |||
| } else { | |||
| FLT result_temp[4] = {}; | |||
| for (int j = 0; j < Remainder; ++j) { | |||
| result_temp[j] = input[index_input++]; | |||
| } | |||
| FLT4 result = {result_temp[0], result_temp[1], result_temp[2], result_temp[3]}; | |||
| WRITE_IMAGE(output, (int2)(coordinate_y, coordinate_x), result); | |||
| } | |||
| } | |||
| return index_input; | |||
| } | |||
| __kernel void split_out2_axis3_unalign(__global FLT *input, __write_only image2d_t output1, | |||
| __write_only image2d_t output2, __global int *split_sizes_, int4 in_shape, | |||
| int4 out_shape1, int4 out_shape2, int stride_w) { | |||
| CHECK_IDX_UNALIGN; | |||
| ARGS_UNALIGN; | |||
| index_input = dosplit(input, output1, out_shape1, IN, IH, IW, index_input); | |||
| index_input = dosplit(input, output2, out_shape2, IN, IH, IW, index_input); | |||
| } | |||
| @@ -161,6 +161,14 @@ void ConcatOpenCLKernel::SetGlobalLocal() { | |||
| } | |||
| int ConcatOpenCLKernel::Prepare() { | |||
| if (axis_ == 0) { | |||
| for (int i = 0; i < in_tensors_.size(); ++i) { | |||
| if (in_tensors_.at(0)->shape().size() != 1) { | |||
| return RET_OK; | |||
| } | |||
| } | |||
| axis_ = 3; | |||
| } | |||
| for (int i = 0; i < in_tensors_.size(); ++i) { | |||
| int length = in_tensors_[0]->shape().size(); | |||
| if (in_tensors_[i]->shape()[length - 1] % C4NUM != 0) { | |||
| @@ -99,6 +99,9 @@ void GatherOpenCLKernel::SetGlobalLocal() { | |||
| int GatherOpenCLKernel::Prepare() { | |||
| std::string kernel_name = "gather"; | |||
| if (in_tensors_.at(0)->shape().size() == 1 && axis_ == 0) { | |||
| axis_ = 3; | |||
| } | |||
| #ifdef PROGRAM_WITH_IL | |||
| kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); | |||
| #else | |||
| @@ -106,17 +109,13 @@ int GatherOpenCLKernel::Prepare() { | |||
| ocl_runtime_->LoadSource(program_name, gather_source); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); | |||
| #endif | |||
| if (!in_tensors_.at(1)->IsConst()) { | |||
| intensor1_is_tensor = true; | |||
| } | |||
| if (!intensor1_is_tensor) { | |||
| if (in_tensors_.at(1)->IsConst()) { | |||
| intensor1_is_tensor = false; | |||
| int ret = InitWeights(); | |||
| if (ret != RET_OK) { | |||
| return ret; | |||
| } | |||
| } | |||
| SetGlobalLocal(); | |||
| SetConstArgs(); | |||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||
| @@ -125,7 +124,6 @@ int GatherOpenCLKernel::Prepare() { | |||
| int GatherOpenCLKernel::ConvertTensorToweight() { | |||
| auto allocator = ocl_runtime_->GetAllocator(); | |||
| GpuTensorInfo img_info(in_tensors_[1]); | |||
| auto indices_tensor = in_tensors_.at(1); | |||
| auto indices_num = indices_tensor->ElementsNum(); | |||
| indices_data_ = reinterpret_cast<int32_t *>(allocator->Malloc(sizeof(int32_t) * indices_num)); | |||
| @@ -41,13 +41,10 @@ class GatherOpenCLKernel : public OpenCLKernel { | |||
| int Tune() override { return lite::RET_OK; } | |||
| int ConvertTensorToweight(); | |||
| protected: | |||
| int UpdateWeights(); | |||
| private: | |||
| int32_t *indices_data_{nullptr}; | |||
| int axis_ = {0}; | |||
| bool intensor1_is_tensor{false}; | |||
| bool intensor1_is_tensor{true}; | |||
| bool enable_fp16_{false}; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -17,17 +17,12 @@ | |||
| #include <set> | |||
| #include <string> | |||
| #include <map> | |||
| #include "nnacl/fp32/common_func_fp32.h" | |||
| #include "src/kernel_registry.h" | |||
| #include "src/runtime/kernel/opencl/utils.h" | |||
| #include "src/runtime/kernel/opencl/kernel/matmul.h" | |||
| #include "src/common/utils.h" | |||
| #include "src/runtime/kernel/opencl/kernel/strassen.h" | |||
| #ifndef PROGRAM_WITH_IL | |||
| #include "src/runtime/kernel/opencl/cl/matmul.cl.inc" | |||
| #include "src/runtime/kernel/opencl/cl/strassen.cl.inc" | |||
| #endif | |||
| using mindspore::kernel::KERNEL_ARCH::kGPU; | |||
| @@ -36,6 +31,21 @@ using mindspore::schema::PrimitiveType_MatMul; | |||
| namespace mindspore::kernel { | |||
| bool IsUseStrassenMatmul(const std::vector<lite::Tensor *> &in_tensors_) { | |||
| if (in_tensors_.at(0)->shape().size() == 2) { | |||
| auto shape0 = in_tensors_.at(0)->shape(); | |||
| auto shape1 = in_tensors_.at(1)->shape(); | |||
| if (in_tensors_.at(1)->IsConst() && (shape0[0] == shape0[1]) && (shape1[0] == shape1[1]) && | |||
| (shape0[0] == shape1[0]) && (shape0[0] % 8 == 0)) { | |||
| return true; | |||
| } else { | |||
| return false; | |||
| } | |||
| } else { | |||
| return false; | |||
| } | |||
| } | |||
| int MatMulOpenCLKernel::CheckSpecs() { | |||
| if (in_tensors_.size() != 2 || out_tensors_.size() != 1) { | |||
| MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size(); | |||
| @@ -74,35 +84,14 @@ int MatMulOpenCLKernel::Prepare() { | |||
| } | |||
| std::map<int, std::string> dims2str = {{2, "_2d"}, {3, "_4d"}, {4, "_4d"}}; | |||
| kernel_name += dims2str[dims]; | |||
| if (in_tensors_.at(0)->shape().size() == 2) { | |||
| auto shape0 = in_tensors_.at(0)->shape(); | |||
| auto shape1 = in_tensors_.at(1)->shape(); | |||
| if (in_tensors_.at(1)->IsConst() && (shape0[0] == shape0[1]) && (shape1[0] == shape1[1]) && | |||
| (shape0[0] == shape1[0]) && (shape0[0] % 8 == 0)) { | |||
| use_strassen = true; | |||
| } | |||
| } | |||
| #ifdef PROGRAM_WITH_IL | |||
| kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); | |||
| #else | |||
| std::string source = matmul_source; | |||
| if (use_strassen) { | |||
| source.clear(); | |||
| source = strassen_source; | |||
| } | |||
| std::string program_name = "MatMul"; | |||
| ocl_runtime_->LoadSource(program_name, source); | |||
| if (use_strassen) { | |||
| kernel_name = "MatMul_Strassen_NHWC4_2d"; | |||
| ocl_runtime_->BuildKernel(kernel_IMG_add_sub_2, program_name, "MatMul_IMG_Add_Sub_2"); | |||
| ocl_runtime_->BuildKernel(kernel_BUF_add_sub_2, program_name, "MatMul_BUF_Add_Sub_2"); | |||
| ocl_runtime_->BuildKernel(kernel_back_result, program_name, "Strassen_Back_Result"); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); | |||
| ocl_runtime_->BuildKernel(MatMul_StrassenBUFFilled, program_name, "MatMul_BUF_Filled"); | |||
| ocl_runtime_->BuildKernel(MatMul_StrassenIMGFilled, program_name, "MatMul_IMG_Filled"); | |||
| } else { | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); | |||
| } | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); | |||
| #endif | |||
| auto ret = InitWeights(); | |||
| if (ret != RET_OK) { | |||
| @@ -114,31 +103,6 @@ int MatMulOpenCLKernel::Prepare() { | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| void MatMulOpenCLKernel::AllocatorMemoryForStrassen(int NumA, int NumB) { | |||
| std::vector<size_t> img_size; | |||
| img_size.push_back(UP_DIV(NumA, C4NUM)); | |||
| img_size.push_back(NumA); | |||
| size_t img_dtype = enable_fp16_ ? CL_HALF_FLOAT : CL_FLOAT; | |||
| size_t dtype_size = enable_fp16_ ? sizeof(CL_HALF_FLOAT) : sizeof(CL_FLOAT); | |||
| img_size.push_back(img_dtype); | |||
| auto allocator = ocl_runtime_->GetAllocator(); | |||
| size_t memA = NumA * NumA; | |||
| size_t memB = NumB * NumB * dtype_size; | |||
| for (int depth = 0; depth < MAXDEPTH; depth++) { | |||
| B_temp[depth] = allocator->Malloc(memB); | |||
| A_temp[depth] = allocator->Malloc(memA, img_size); | |||
| M1[depth] = allocator->Malloc(memA, img_size); | |||
| M2[depth] = allocator->Malloc(memA, img_size); | |||
| M3[depth] = allocator->Malloc(memA, img_size); | |||
| M4[depth] = allocator->Malloc(memA, img_size); | |||
| M5[depth] = allocator->Malloc(memA, img_size); | |||
| M6[depth] = allocator->Malloc(memA, img_size); | |||
| M7[depth] = allocator->Malloc(memA, img_size); | |||
| } | |||
| } | |||
| int MatMulOpenCLKernel::InitWeights() { | |||
| if (act_weight_) { | |||
| return RET_OK; | |||
| @@ -165,64 +129,39 @@ int MatMulOpenCLKernel::InitWeights() { | |||
| auto originWeightFp32 = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->data_c()); | |||
| auto originWeightFp16 = reinterpret_cast<float16_t *>(in_tensors_.at(kWeightIndex)->data_c()); | |||
| bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16; | |||
| if (use_strassen) { | |||
| int NumA = in_tensors_[0]->shape()[0]; | |||
| int NumB = in_tensors_[1]->shape()[0]; | |||
| AllocatorMemoryForStrassen(NumA / 2, NumB / 2); | |||
| size_t size = NumA * NumB * dtype_size; | |||
| transposeB = false; | |||
| if (isModelFp16) { | |||
| if (enable_fp16_) { | |||
| memcpy(padWeightFp16, originWeightFp16, size); | |||
| } else { | |||
| for (int i = 0; i < NumA * NumB; ++i) { | |||
| padWeightFp32[i] = static_cast<float>(originWeightFp16[i]); | |||
| } | |||
| } | |||
| } else { | |||
| if (enable_fp16_) { | |||
| for (int i = 0; i < NumA * NumB; ++i) { | |||
| padWeightFp16[i] = static_cast<float16_t>(originWeightFp32[i]); | |||
| } | |||
| } else { | |||
| memcpy(padWeightFp32, originWeightFp32, size); | |||
| } | |||
| } | |||
| } else { | |||
| // pad weight | |||
| // ABCICO -> AB(CI4)(CO4)(4 from CO)(4 from CI) | |||
| // if tranposeB, ABCOCI -> AB(CI4)(CO4)(4 from CO)(4 from CI) | |||
| int index = 0; | |||
| for (int aa = 0; aa < a; aa++) { | |||
| for (int bb = 0; bb < b; bb++) { | |||
| int baseAB = (aa * b + bb) * ci * co; | |||
| for (int i = 0; i < ci4; ++i) { | |||
| for (int j = 0; j < co4; ++j) { | |||
| for (int k = 0; k < C4NUM; ++k) { | |||
| for (int l = 0; l < C4NUM; ++l) { | |||
| int src_ci = i * C4NUM + l; | |||
| int src_co = j * C4NUM + k; | |||
| if (src_ci < ci && src_co < co) { | |||
| int originId = baseAB + src_ci * co + src_co; | |||
| if (transposeB) { | |||
| originId = baseAB + src_co * ci + src_ci; | |||
| } | |||
| if (enable_fp16_) { | |||
| if (!isModelFp16) { | |||
| padWeightFp16[index++] = originWeightFp32[originId]; | |||
| } else { | |||
| padWeightFp16[index++] = originWeightFp16[originId]; | |||
| } | |||
| // pad weight | |||
| // ABCICO -> AB(CI4)(CO4)(4 from CO)(4 from CI) | |||
| // if tranposeB, ABCOCI -> AB(CI4)(CO4)(4 from CO)(4 from CI) | |||
| int index = 0; | |||
| for (int aa = 0; aa < a; aa++) { | |||
| for (int bb = 0; bb < b; bb++) { | |||
| int baseAB = (aa * b + bb) * ci * co; | |||
| for (int i = 0; i < ci4; ++i) { | |||
| for (int j = 0; j < co4; ++j) { | |||
| for (int k = 0; k < C4NUM; ++k) { | |||
| for (int l = 0; l < C4NUM; ++l) { | |||
| int src_ci = i * C4NUM + l; | |||
| int src_co = j * C4NUM + k; | |||
| if (src_ci < ci && src_co < co) { | |||
| int originId = baseAB + src_ci * co + src_co; | |||
| if (transposeB) { | |||
| originId = baseAB + src_co * ci + src_ci; | |||
| } | |||
| if (enable_fp16_) { | |||
| if (!isModelFp16) { | |||
| padWeightFp16[index++] = originWeightFp32[originId]; | |||
| } else { | |||
| if (!isModelFp16) { | |||
| padWeightFp32[index++] = originWeightFp32[originId]; | |||
| } else { | |||
| padWeightFp32[index++] = originWeightFp16[originId]; | |||
| } | |||
| padWeightFp16[index++] = originWeightFp16[originId]; | |||
| } | |||
| } else { | |||
| index++; | |||
| if (!isModelFp16) { | |||
| padWeightFp32[index++] = originWeightFp32[originId]; | |||
| } else { | |||
| padWeightFp32[index++] = originWeightFp16[originId]; | |||
| } | |||
| } | |||
| } else { | |||
| index++; | |||
| } | |||
| } | |||
| } | |||
| @@ -236,266 +175,67 @@ int MatMulOpenCLKernel::InitWeights() { | |||
| return RET_OK; | |||
| } | |||
| void AlignStrassenGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local, | |||
| cl::NDRange *global_range, cl::NDRange *local_range) { | |||
| *local_range = cl::NDRange(local[0], local[1], local[2]); | |||
| *global_range = | |||
| cl::NDRange(UP_ROUND(global[0], local[0]), UP_ROUND(global[1], local[1]), UP_ROUND(global[2], local[2])); | |||
| } | |||
| // 0 : global_size_, 1: global_size_add_sub | |||
| void MatMulOpenCLKernel::StrassenSetGlobalLocal(size_t strassen_size, int type_flag) { | |||
| size_t strassen_size_C4 = UP_DIV(strassen_size, C4NUM); | |||
| local_size_add_sub = {16, 1, 16}; | |||
| if (type_flag == 0) { | |||
| global_size_ = {strassen_size_C4, 1, strassen_size}; | |||
| AlignGlobalLocal(global_size_, local_size_); | |||
| } else { | |||
| global_size_add_sub = {strassen_size_C4, 1, strassen_size}; | |||
| AlignStrassenGlobalLocal(global_size_add_sub, local_size_add_sub, &global_add_sub_, &local_add_sub_); | |||
| } | |||
| } | |||
| void MatMulOpenCLKernel::SetGlobalLocal() { | |||
| // local size should less than MAX_GROUP_SIZE | |||
| local_size_ = {32, 4, 1}; | |||
| global_size_ = {1, 1, 1}; | |||
| if (use_strassen) { | |||
| size_t strassen_size = outShape[3] / 2; | |||
| StrassenSetGlobalLocal(strassen_size, 0); // set global_ and local | |||
| StrassenSetGlobalLocal(strassen_size, 1); // set global_size_add_sub | |||
| StrassenSetGlobalLocal(strassen_size, 2); // set global_size_weights | |||
| } else { | |||
| global_size_ = {UP_DIV(static_cast<size_t>(outShape[3]), C4NUM), | |||
| 4 * static_cast<size_t>(outShape[0]) * static_cast<size_t>(outShape[1]), | |||
| static_cast<size_t>(outShape[2])}; | |||
| AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| } | |||
| void MatMulOpenCLKernel::StrassenSetConstArgs(cl::Kernel *kernel, int index, int strassen_size, bool is_matmul_kernel) { | |||
| cl_int4 shape; | |||
| if (is_matmul_kernel) { | |||
| shape = {1, 1, strassen_size, strassen_size}; | |||
| } else { | |||
| shape = {strassen_size, 1, 1, UP_DIV(strassen_size, C4NUM)}; | |||
| } | |||
| ocl_runtime_->SetKernelArg(*kernel, index, shape); | |||
| global_size_ = {UP_DIV(static_cast<size_t>(outShape[3]), C4NUM), | |||
| 4 * static_cast<size_t>(outShape[0]) * static_cast<size_t>(outShape[1]), | |||
| static_cast<size_t>(outShape[2])}; | |||
| AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| void MatMulOpenCLKernel::SetConstArgs() { | |||
| int arg_count = 2; | |||
| cl_int4 in_shape = {inShape[0], inShape[1], inShape[2], inShape[3]}; | |||
| cl_int4 out_shape = {outShape[0], outShape[1], outShape[2], outShape[3]}; | |||
| cl_int4 shape_offset = {0, 0, 0, 0}; | |||
| if (use_strassen) { | |||
| int strassen_size = inShape[3] / 2; | |||
| out_shape.s[2] = in_shape.s[2] = in_shape.s[2] / 2; | |||
| out_shape.s[3] = in_shape.s[3] = in_shape.s[3] / 2; | |||
| StrassenSetConstArgs(&kernel_IMG_add_sub_2, 3, strassen_size, false); | |||
| StrassenSetConstArgs(&kernel_BUF_add_sub_2, 2, strassen_size, false); | |||
| if (act_weight_) { | |||
| arg_count++; | |||
| } else { | |||
| if (act_weight_) { | |||
| arg_count++; | |||
| } else { | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF); | |||
| } | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF); | |||
| } | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, shape_offset); | |||
| } | |||
| // OriginSize = N*H*W*C typesize = sizeof(type data) width = W * UP_DIV(C,C4NUM) size = N | |||
| void MatMulOpenCLKernel::PrintImage2d(void *IMGData, size_t typesize, size_t width, size_t size) { | |||
| auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper(); | |||
| int alignment = runtime_wrapper.GetInstance()->GetImagePitchAlignment(); | |||
| auto runtime = runtime_wrapper.GetInstance(); | |||
| runtime->SyncCommandQueue(); | |||
| MS_ASSERT(alignment); | |||
| size_t row_pitch = UP_ROUND(width, alignment) * typesize * C4NUM; | |||
| size_t OriginSize = size * size * typesize; | |||
| std::vector<char> data(OriginSize); | |||
| auto row_size = width * typesize * C4NUM; | |||
| for (int i = 0; i < size; ++i) { | |||
| memcpy(reinterpret_cast<char *>(data.data()) + i * row_size, static_cast<char *>(IMGData) + i * row_pitch, | |||
| row_size); | |||
| } | |||
| for (int i = 0; i < size * size; ++i) { | |||
| if ((i + 1) % size == 0) { | |||
| std::cout << std::endl; | |||
| } | |||
| int MatMulOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| int arg_count = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c()); | |||
| if (act_weight_) { | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[1]->data_c()); | |||
| } | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| void MatMulOpenCLKernel::StrassenDataFilled(cl::Kernel *kernel, void *input, void *output, const int size, | |||
| cl_int2 offset, lite::opencl::MemType mem_type) { | |||
| if (input == nullptr || output == nullptr) { | |||
| MS_LOG(ERROR) << "StrassenDataFilled input or output can not nullptr"; | |||
| return; | |||
| } | |||
| if (mem_type == lite::opencl::MemType::IMG) { | |||
| ocl_runtime_->SetKernelArg(*kernel, 0, input); | |||
| ocl_runtime_->SetKernelArg(*kernel, 1, output); | |||
| kernel::LiteKernel *OpenCLMatMulKernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | |||
| const lite::InnerContext *ctx, const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| kernel::OpenCLKernel *kernel; | |||
| if (IsUseStrassenMatmul(inputs)) { | |||
| MS_LOG(DEBUG) << "use_matmul_strassen"; | |||
| kernel = new (std::nothrow) StrassenOpenCLKernel(opParameter, inputs, outputs); | |||
| } else { | |||
| ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::BUF); | |||
| ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::BUF); | |||
| kernel = new (std::nothrow) MatMulOpenCLKernel(opParameter, inputs, outputs); | |||
| } | |||
| StrassenSetConstArgs(kernel, 2, size, false); | |||
| ocl_runtime_->SetKernelArg(*kernel, 3, offset); | |||
| ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_); | |||
| } | |||
| void MatMulOpenCLKernel::StrassenAddSub(cl::Kernel *kernel, void *input, void *output, const int size, cl_int4 offset, | |||
| int flag, lite::opencl::MemType mem_type) { | |||
| if (input == nullptr || output == nullptr) { | |||
| MS_LOG(ERROR) << "StrassenAddSub input or output can not nullptr"; | |||
| return; | |||
| } | |||
| if (mem_type == lite::opencl::MemType::IMG) { | |||
| ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::IMG); | |||
| ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::IMG); | |||
| } else { | |||
| ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::BUF); | |||
| ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::BUF); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; | |||
| free(opParameter); | |||
| return nullptr; | |||
| } | |||
| StrassenSetConstArgs(kernel, 2, size, false); | |||
| ocl_runtime_->SetKernelArg(*kernel, 3, offset); | |||
| ocl_runtime_->SetKernelArg(*kernel, 4, flag); | |||
| ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_); | |||
| } | |||
| void MatMulOpenCLKernel::StrassenBackResult(cl::Kernel *kernel, void *input1, void *input2, void *input3, void *input4, | |||
| void *input5, void *input6, void *input7, void *output, const int size) { | |||
| if (input1 == nullptr || input2 == nullptr || input3 == nullptr || input4 == nullptr || input5 == nullptr || | |||
| input6 == nullptr || input7 == nullptr || output == nullptr) { | |||
| MS_LOG(ERROR) << "StrassenBackResult input or output can not nullptr"; | |||
| return; | |||
| auto ret = kernel->CheckSpecs(); | |||
| if (ret != mindspore::lite::RET_OK) { | |||
| MS_LOG(ERROR) << "Check " << opParameter->name_ << " specification failed!"; | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| ocl_runtime_->SetKernelArg(*kernel, 0, input1); | |||
| ocl_runtime_->SetKernelArg(*kernel, 1, input2); | |||
| ocl_runtime_->SetKernelArg(*kernel, 2, input3); | |||
| ocl_runtime_->SetKernelArg(*kernel, 3, input4); | |||
| ocl_runtime_->SetKernelArg(*kernel, 4, input5); | |||
| ocl_runtime_->SetKernelArg(*kernel, 5, input6); | |||
| ocl_runtime_->SetKernelArg(*kernel, 6, input7); | |||
| ocl_runtime_->SetKernelArg(*kernel, 7, output); | |||
| StrassenSetConstArgs(kernel, 8, size, false); | |||
| ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_); | |||
| return kernel; | |||
| } | |||
| void MatMulOpenCLKernel::StrassenRunMmatmul(void *input, void *weight, void *output, const int size) { | |||
| if (input == nullptr || weight == nullptr || output == nullptr) { | |||
| MS_LOG(ERROR) << "StrassenRunMmatmul input ,weight or output can not nullptr"; | |||
| return; | |||
| } | |||
| ocl_runtime_->SetKernelArg(kernel_, 0, input); | |||
| ocl_runtime_->SetKernelArg(kernel_, 1, output); | |||
| ocl_runtime_->SetKernelArg(kernel_, 2, weight, lite::opencl::MemType::BUF); | |||
| StrassenSetConstArgs(&kernel_, 3, size, true); | |||
| StrassenSetConstArgs(&kernel_, 4, size, true); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| } | |||
| void MatMulOpenCLKernel::DoStrassen(void *data, void *weight, void *result, const int size, const int depth, | |||
| const int threshold) { | |||
| const int size_2 = size / 2; | |||
| int C4 = UP_DIV(size_2, C4NUM); | |||
| if (size <= threshold) { | |||
| // run matmul; | |||
| StrassenSetGlobalLocal(size, 0); | |||
| StrassenRunMmatmul(data, weight, result, size); | |||
| return; | |||
| } | |||
| // flag = 0 : add otherwise flag = 1 : sub | |||
| // M1 = A11 * ( B12- B22) | |||
| StrassenSetGlobalLocal(size_2, 1); | |||
| StrassenDataFilled(&MatMul_StrassenIMGFilled, data, A_temp[depth + 1], size_2, {0, 0}, lite::opencl::MemType::IMG); | |||
| StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {0, C4, size_2, C4}, 1, | |||
| lite::opencl::MemType::BUF); | |||
| DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M1[depth + 1], size_2, depth + 1, threshold); | |||
| // M2 = (A11 + A12) * B22 | |||
| StrassenSetGlobalLocal(size_2, 1); | |||
| StrassenDataFilled(&MatMul_StrassenBUFFilled, weight, B_temp[depth + 1], size_2, {size_2, C4}, | |||
| lite::opencl::MemType::BUF); | |||
| StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {0, 0, 0, C4}, 0, lite::opencl::MemType::IMG); | |||
| DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M2[depth + 1], size_2, depth + 1, threshold); | |||
| // M3 = (A21 + A22) * B11 | |||
| StrassenSetGlobalLocal(size_2, 1); | |||
| StrassenDataFilled(&MatMul_StrassenBUFFilled, weight, B_temp[depth + 1], size_2, {0, 0}, lite::opencl::MemType::BUF); | |||
| StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {size_2, 0, size_2, C4}, 0, | |||
| lite::opencl::MemType::IMG); | |||
| DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M3[depth + 1], size_2, depth + 1, threshold); | |||
| // M4 = A22 * (B21 - B11) | |||
| StrassenSetGlobalLocal(size_2, 1); | |||
| StrassenDataFilled(&MatMul_StrassenIMGFilled, data, A_temp[depth + 1], size_2, {size_2, C4}, | |||
| lite::opencl::MemType::IMG); | |||
| StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {size_2, 0, 0, 0}, 1, | |||
| lite::opencl::MemType::BUF); | |||
| DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M4[depth + 1], size_2, depth + 1, threshold); | |||
| // M5 = (A11 + A22) * (B11 + B22) | |||
| StrassenSetGlobalLocal(size_2, 1); | |||
| StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {0, 0, size_2, C4}, 0, | |||
| lite::opencl::MemType::IMG); | |||
| // (B11 + B22) | |||
| StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {0, 0, size_2, C4}, 0, | |||
| lite::opencl::MemType::BUF); | |||
| DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M5[depth + 1], size_2, depth + 1, threshold); | |||
| // M6 = (A12 - A22) * (B21 + B22) | |||
| StrassenSetGlobalLocal(size_2, 1); | |||
| StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {0, C4, size_2, C4}, 1, | |||
| lite::opencl::MemType::IMG); | |||
| StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {size_2, 0, size_2, C4}, 0, | |||
| lite::opencl::MemType::BUF); | |||
| DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M6[depth + 1], size_2, depth + 1, threshold); | |||
| // M7 = (A11 - A21) * (B11 + B12) | |||
| StrassenSetGlobalLocal(size_2, 1); | |||
| StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {0, 0, size_2, 0}, 1, | |||
| lite::opencl::MemType::IMG); | |||
| StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {0, 0, 0, C4}, 0, | |||
| lite::opencl::MemType::BUF); | |||
| DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M7[depth + 1], size_2, depth + 1, threshold); | |||
| // BackResult | |||
| StrassenSetGlobalLocal(size_2, 1); | |||
| StrassenBackResult(&kernel_back_result, M1[depth + 1], M2[depth + 1], M3[depth + 1], M4[depth + 1], M5[depth + 1], | |||
| M6[depth + 1], M7[depth + 1], result, size_2); | |||
| } | |||
| int MatMulOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| if (use_strassen) { | |||
| int threshold = 0; | |||
| const int up_bound = 1024; | |||
| const int down_bound = 256; | |||
| if (in_tensors_.at(0)->shape()[0] >= up_bound) { | |||
| threshold = UP_DIV(in_tensors_.at(0)->shape()[0], C4NUM) / 2; | |||
| } else if (in_tensors_.at(0)->shape()[0] <= down_bound) { | |||
| threshold = in_tensors_.at(0)->shape()[0]; | |||
| } else { | |||
| threshold = UP_DIV(in_tensors_.at(0)->shape()[0], C4NUM); | |||
| } | |||
| DoStrassen(in_tensors_.at(0)->data_c(), padWeight_, out_tensors_.at(0)->data_c(), in_tensors_.at(0)->shape()[0], 0, | |||
| threshold); | |||
| } else { | |||
| int arg_count = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c()); | |||
| if (act_weight_) { | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[1]->data_c()); | |||
| } | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| } | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
// NOTE(review): PrimitiveType_MatMul is registered twice per dtype below —
// once with OpenCLMatMulKernelCreator and once with
// OpenCLKernelCreator<MatMulOpenCLKernel>. Presumably only one creator per
// (arch, dtype, primitive) is intended; confirm which registration wins in
// the kernel registry and drop the redundant pair.
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_MatMul, OpenCLMatMulKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_MatMul, OpenCLMatMulKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_MatMul, OpenCLKernelCreator<MatMulOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_MatMul, OpenCLKernelCreator<MatMulOpenCLKernel>)
| } // namespace mindspore::kernel | |||
| @@ -41,7 +41,7 @@ class MatMulOpenCLKernel : public OpenCLKernel { | |||
| void SetGlobalLocal() override; | |||
| int Tune() override { return lite::RET_OK; } | |||
| private: | |||
| protected: | |||
| void *padWeight_{nullptr}; | |||
| bool enable_fp16_{false}; | |||
| bool transposeA{false}; | |||
| @@ -51,43 +51,6 @@ class MatMulOpenCLKernel : public OpenCLKernel { | |||
| bool act_weight_{false}; | |||
| std::vector<int> inShape{std::vector<int>(MAX_DIMS, 1)}; | |||
| std::vector<int> outShape{std::vector<int>(MAX_DIMS, 1)}; | |||
| // strassen | |||
| private: | |||
| void AllocatorMemoryForStrassen(int NumA, int NumB); | |||
| void DoStrassen(void *data, void *weight, void *result, const int size, const int depth, const int threshold); | |||
| void StrassenSetGlobalLocal(size_t strassen_size, int type_flag); | |||
| void StrassenSetConstArgs(cl::Kernel *kernel, int index, int strassen_size, bool is_matmul_kernel); | |||
| void StrassenDataFilled(cl::Kernel *kernel, void *input, void *output, const int size, cl_int2 offset, | |||
| lite::opencl::MemType mem_type); | |||
| void StrassenAddSub(cl::Kernel *kernel, void *input, void *output, const int size, cl_int4 offset, int flag, | |||
| lite::opencl::MemType mem_type); | |||
| void StrassenBackResult(cl::Kernel *kernel, void *input1, void *input2, void *input3, void *input4, void *input5, | |||
| void *input6, void *input7, void *output, const int size); | |||
| void StrassenRunMmatmul(void *input, void *weight, void *output, const int size); | |||
| void PrintImage2d(void *IMGData, size_t typesize, size_t width, size_t size); | |||
| bool use_strassen{false}; | |||
| cl::Kernel kernel_IMG_add_sub_2; | |||
| cl::Kernel MatMul_StrassenBUFFilled; | |||
| cl::Kernel MatMul_StrassenIMGFilled; | |||
| cl::Kernel kernel_BUF_add_sub_2; | |||
| cl::Kernel kernel_back_result; | |||
| cl::NDRange global_add_sub_, local_add_sub_; | |||
| std::vector<size_t> global_size_add_sub; | |||
| std::vector<size_t> local_size_add_sub; | |||
| // image 2d | |||
| void *A_temp[MAXDEPTH] = {nullptr}; | |||
| void *M1[MAXDEPTH] = {nullptr}; | |||
| void *M2[MAXDEPTH] = {nullptr}; | |||
| void *M3[MAXDEPTH] = {nullptr}; | |||
| void *M4[MAXDEPTH] = {nullptr}; | |||
| void *M5[MAXDEPTH] = {nullptr}; | |||
| void *M6[MAXDEPTH] = {nullptr}; | |||
| void *M7[MAXDEPTH] = {nullptr}; | |||
| // buffer | |||
| void *B_temp[MAXDEPTH] = {nullptr}; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -0,0 +1,206 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/runtime/kernel/opencl/kernel/split.h" | |||
| #include <cstring> | |||
| #include <string> | |||
| #include <algorithm> | |||
| #include <set> | |||
| #include "src/kernel_registry.h" | |||
| #include "src/runtime/kernel/opencl/cl/split.cl.inc" | |||
| using mindspore::kernel::KERNEL_ARCH::kGPU; | |||
| using mindspore::lite::KernelRegistrar; | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_OK; | |||
| using mindspore::schema::PrimitiveType_Split; | |||
| namespace mindspore::kernel { | |||
| int SplitOpenCLKernel::RunAxis0() { | |||
| auto allocator_ = ocl_runtime_->GetAllocator(); | |||
| std::vector<size_t> img_size; | |||
| auto src_data = in_tensors_[0]->data_c(); | |||
| cl::Image2D *in_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(src_data)); | |||
| if (in_image == nullptr) { | |||
| MS_LOG(ERROR) << "RunAxis0 in_image can not be nullptr"; | |||
| return RET_ERROR; | |||
| } | |||
| auto src_area = cl::array<cl::size_type, 3U>{0, 0, 0}; | |||
| for (int i = 0; i < out_tensors_.size(); i++) { | |||
| auto dst_data = out_tensors_[i]->data_c(); | |||
| allocator_->GetImageSize(dst_data, &img_size); | |||
| auto dst_area = cl::array<cl::size_type, 3U>{0, 0, 0}; | |||
| auto region = cl::array<cl::size_type, 3U>{img_size[0], img_size[1], 1}; | |||
| cl::Image2D *out_image = reinterpret_cast<cl::Image2D *>(allocator_->GetImage(dst_data)); | |||
| if (out_image == nullptr) { | |||
| MS_LOG(ERROR) << "RunAxis0 out_image can not be nullptr"; | |||
| return RET_ERROR; | |||
| } | |||
| ocl_runtime_->GetDefaultCommandQueue()->enqueueCopyImage(*in_image, *out_image, src_area, dst_area, region); | |||
| src_area[1] += region[1]; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int SplitOpenCLKernel::CheckSpecs() { | |||
| if (out_tensors_.size() != 2 || in_tensors_.size() != 1) { | |||
| MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size(); | |||
| return RET_ERROR; | |||
| } | |||
| if (in_tensors_.at(0)->IsConst()) { | |||
| MS_LOG(ERROR) << "in_tensors_ must be tensor"; | |||
| return RET_ERROR; | |||
| } | |||
| for (auto &out_tensor : out_tensors_) { | |||
| if (out_tensor->IsConst()) { | |||
| MS_LOG(ERROR) << "out_tensor must be tensor"; | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| auto param = reinterpret_cast<SplitParameter *>(this->op_parameter_); | |||
| if (param->num_split_ != 2 && param->num_split_ != 1) { | |||
| MS_LOG(ERROR) << "num_split_ only supported 1 or 2 yet"; | |||
| return RET_ERROR; | |||
| } | |||
| if (param->split_dim_ < 0 || param->split_dim_ > 3) { | |||
| MS_LOG(ERROR) << "split_dim_ must between 0~3"; | |||
| return RET_ERROR; | |||
| } | |||
| if (param->split_sizes_ == nullptr) { | |||
| MS_LOG(ERROR) << "split_sizes_ can not nullptr"; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| void SplitOpenCLKernel::AlignSplitSizes(SplitParameter *param, const std::vector<int> &in_shape) { | |||
| auto allocator = ocl_runtime_->GetAllocator(); | |||
| int shape_dim = in_shape.at(param->split_dim_); | |||
| if (num_split_ == 1) { | |||
| size_t num_split = UP_DIV(shape_dim, param->split_sizes_[0]); | |||
| split_sizes_ = reinterpret_cast<int *>(allocator->Malloc(num_split * sizeof(int))); | |||
| for (int i = 0; i < num_split - 1; ++i) { | |||
| split_sizes_[i] = (i + 1) * param->split_sizes_[0]; | |||
| } | |||
| } else { | |||
| int sum = 0; | |||
| split_sizes_ = reinterpret_cast<int *>(allocator->Malloc(num_split_ * sizeof(int))); | |||
| for (int i = 0; i < num_split_ - 1; ++i) { | |||
| sum += param->split_sizes_[i]; | |||
| split_sizes_[i] = sum; | |||
| } | |||
| } | |||
| } | |||
| int SplitOpenCLKernel::Prepare() { | |||
| auto param = reinterpret_cast<SplitParameter *>(this->op_parameter_); | |||
| auto in_shape = in_tensors_.at(0)->shape(); | |||
| int increment_dim = C4NUM - in_shape.size(); | |||
| split_dim_ = param->split_dim_ == 0 ? param->split_dim_ : param->split_dim_ + increment_dim; | |||
| num_split_ = param->num_split_; | |||
| if (split_dim_ == 0) { | |||
| return RET_OK; | |||
| } | |||
| for (int i = 0; i < out_tensors_.size(); ++i) { | |||
| int length = out_tensors_[0]->shape().size(); | |||
| if (split_dim_ == 3) { | |||
| if (out_tensors_[i]->shape()[length - 1] % C4NUM != 0) { | |||
| Align_ = false; | |||
| } | |||
| } | |||
| } | |||
| AlignSplitSizes(param, in_shape); | |||
| std::string kernel_name = "split_out"; | |||
| kernel_name += num_split_ == 1 ? std::to_string(out_tensors().size()) : std::to_string(num_split_); | |||
| kernel_name += "_axis" + std::to_string(split_dim_); | |||
| if (!Align_) { | |||
| kernel_name += "_unalign"; | |||
| } | |||
| MS_LOG(DEBUG) << "kernel_name=: " << kernel_name; | |||
| std::string source = split_source; | |||
| std::string program_name = "split"; | |||
| ocl_runtime_->LoadSource(program_name, source); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); | |||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||
| SetConstArgs(); | |||
| SetGlobalLocal(); | |||
| return RET_OK; | |||
| } | |||
| void SplitOpenCLKernel::SetConstArgs() { | |||
| int arg_cn = out_tensors_.size() + 2; | |||
| cl_int4 shape = {}; | |||
| for (int i = 0; i < in_tensors_[0]->shape().size(); ++i) { | |||
| shape.s[i] = in_tensors_[0]->shape()[i]; | |||
| } | |||
| Broadcast2GpuShape(in_shape_.s, shape.s, out_tensors_[0]->shape().size(), 1); | |||
| if (Align_) { | |||
| in_shape_.s[3] = UP_DIV(in_shape_.s[3], C4NUM); | |||
| } | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_shape_); | |||
| for (int i = 0; i < out_tensors_.size(); ++i) { | |||
| cl_int4 temp = {}; | |||
| for (int j = 0; j < out_tensors_[i]->shape().size(); ++j) { | |||
| temp.s[j] = out_tensors_[i]->shape()[j]; | |||
| } | |||
| Broadcast2GpuShape(out_shape_.s, temp.s, out_tensors_[i]->shape().size(), 1); | |||
| if (Align_) { | |||
| out_shape_.s[3] = UP_DIV(out_shape_.s[3], C4NUM); | |||
| } | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape_); | |||
| } | |||
| GpuTensorInfo img_info(in_tensors_.at(0)); | |||
| size_t dtype = enable_fp16_ ? sizeof(cl_half) : sizeof(cl_float); | |||
| stride_w = img_info.RowPitch() / dtype; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_w); | |||
| return; | |||
| } | |||
void SplitOpenCLKernel::SetGlobalLocal() {
  // Work size mirrors the padded NHWC shape: X = N*H, Y = W, Z = channels.
  OH = in_shape_.s[0] * in_shape_.s[1];
  OW = in_shape_.s[2];
  if (Align_) {
    // in_shape_.s[3] was already divided by C4NUM in SetConstArgs().
    OC = in_shape_.s[3];
  }
  // NOTE(review): when !Align_, OC keeps its default of 1 — presumably the
  // "_unalign" kernel walks channels internally; confirm against split.cl.
  global_size_ = {OH, OW, OC};
  local_size_ = {1, 1, 1};
  OpenCLKernel::AlignGlobalLocal(global_size_, local_size_);
  return;
}
| int SplitOpenCLKernel::Run() { | |||
| if (split_dim_ == 0) { | |||
| RunAxis0(); | |||
| return RET_OK; | |||
| } | |||
| int arg_cn = 0; | |||
| if (Align_) { | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c()); | |||
| } else { | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_.at(0)->data_c(), lite::opencl::MemType::BUF); | |||
| } | |||
| for (int i = 0; i < out_tensors_.size(); ++i) { | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_.at(i)->data_c()); | |||
| } | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, split_sizes_, lite::opencl::MemType::BUF); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return RET_OK; | |||
| } | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Split, OpenCLKernelCreator<SplitOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Split, OpenCLKernelCreator<SplitOpenCLKernel>) | |||
| } // namespace mindspore::kernel | |||
| @@ -0,0 +1,60 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_SPLIT_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_SPLIT_H_ | |||
| #include <vector> | |||
| #include "src/runtime/kernel/opencl/opencl_kernel.h" | |||
| #include "nnacl/split_parameter.h" | |||
| namespace mindspore::kernel { | |||
// OpenCL Split kernel: splits one input tensor into (currently) two outputs
// along a single axis. Axis 0 is handled with image copies; other axes run
// a generated CL kernel, with an "unalign" variant for channel counts that
// are not multiples of C4NUM.
class SplitOpenCLKernel : public OpenCLKernel {
 public:
  SplitOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                    const std::vector<lite::Tensor *> &outputs)
      : OpenCLKernel(parameter, inputs, outputs) {}
  ~SplitOpenCLKernel() override = default;
  int Prepare() override;
  int CheckSpecs() override;
  void SetConstArgs() override;
  void SetGlobalLocal() override;
  int Run() override;

 private:
  // Converts SplitParameter::split_sizes_ into cumulative boundaries stored
  // on the OpenCL allocator (split_sizes_).
  void AlignSplitSizes(SplitParameter *param, const std::vector<int> &in_shape);
  // Axis-0 split via direct image-to-image copies; no kernel launch.
  int RunAxis0();

 private:
  cl_int4 in_shape_{};         // padded NHWC input shape passed to the kernel
  cl_int4 out_shape_ = {};     // scratch for each output's padded shape
  bool Align_{true};           // false when split_dim_==3 and a channel count % C4NUM != 0
  bool enable_fp16_{false};
  size_t num_split_ = 1;       // number of splits from SplitParameter
  int *split_sizes_{nullptr};  // cumulative split boundaries (OpenCL buffer)
  int split_dim_ = 0;          // split axis mapped into the padded 4D rank
  cl_int stride_w{1};          // input image row pitch in elements
  uint32_t OH = {1};           // global work size: N*H
  uint32_t OW = {1};           // global work size: W
  uint32_t OC = {1};           // global work size: channel slices
};
| } // namespace mindspore::kernel | |||
| #endif | |||
| @@ -0,0 +1,361 @@ | |||
| /** | |||
 * Copyright 2019 Huawei Technologies Co., Ltd
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <set> | |||
| #include <string> | |||
| #include <map> | |||
| #include "src/runtime/kernel/opencl/kernel/matmul.h" | |||
| #include "src/runtime/kernel/opencl/kernel/strassen.h" | |||
| #include "src/common/utils.h" | |||
| #ifndef PROGRAM_WITH_IL | |||
| #include "src/runtime/kernel/opencl/cl/strassen.cl.inc" | |||
| #endif | |||
| namespace mindspore::kernel { | |||
| int StrassenOpenCLKernel::Prepare() { | |||
| #ifdef PROGRAM_WITH_IL | |||
| kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); | |||
| #else | |||
| std::string kernel_name = "MatMul_Strassen_NHWC4_2d"; | |||
| std::string source = strassen_source; | |||
| std::string program_name = "MatMul"; | |||
| ocl_runtime_->LoadSource(program_name, source); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); | |||
| ocl_runtime_->BuildKernel(kernel_IMG_add_sub_2, program_name, "MatMul_IMG_Add_Sub_2"); | |||
| ocl_runtime_->BuildKernel(kernel_BUF_add_sub_2, program_name, "MatMul_BUF_Add_Sub_2"); | |||
| ocl_runtime_->BuildKernel(kernel_back_result, program_name, "Strassen_Back_Result"); | |||
| ocl_runtime_->BuildKernel(MatMul_StrassenBUFFilled, program_name, "MatMul_BUF_Filled"); | |||
| ocl_runtime_->BuildKernel(MatMul_StrassenIMGFilled, program_name, "MatMul_IMG_Filled"); | |||
| #endif | |||
| auto ret = InitWeights(); | |||
| if (ret != RET_OK) { | |||
| return ret; | |||
| } | |||
| SetConstArgs(); | |||
| SetGlobalLocal(); | |||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| void StrassenOpenCLKernel::AllocatorMemoryForStrassen(int NumA, int NumB) { | |||
| std::vector<size_t> img_size; | |||
| img_size.push_back(UP_DIV(NumA, C4NUM)); | |||
| img_size.push_back(NumA); | |||
| size_t img_dtype = enable_fp16_ ? CL_HALF_FLOAT : CL_FLOAT; | |||
| size_t dtype_size = enable_fp16_ ? sizeof(CL_HALF_FLOAT) : sizeof(CL_FLOAT); | |||
| img_size.push_back(img_dtype); | |||
| auto allocator = ocl_runtime_->GetAllocator(); | |||
| size_t memA = NumA * NumA; | |||
| size_t memB = NumB * NumB * dtype_size; | |||
| for (int depth = 0; depth < MAXDEPTH; depth++) { | |||
| B_temp[depth] = allocator->Malloc(memB); | |||
| A_temp[depth] = allocator->Malloc(memA, img_size); | |||
| M1[depth] = allocator->Malloc(memA, img_size); | |||
| M2[depth] = allocator->Malloc(memA, img_size); | |||
| M3[depth] = allocator->Malloc(memA, img_size); | |||
| M4[depth] = allocator->Malloc(memA, img_size); | |||
| M5[depth] = allocator->Malloc(memA, img_size); | |||
| M6[depth] = allocator->Malloc(memA, img_size); | |||
| M7[depth] = allocator->Malloc(memA, img_size); | |||
| } | |||
| } | |||
// Copies (and converts, if the model dtype differs from the runtime dtype)
// the NumA x NumB weight matrix into padWeight_ on the OpenCL allocator,
// and pre-allocates the half-size Strassen scratch memory.
int StrassenOpenCLKernel::InitWeights() {
  // ABMCI @ ABCICO = ABMCO
  auto ret = DequantWeight();
  if (ret != RET_OK) {
    return ret;
  }
  auto allocator = ocl_runtime_->GetAllocator();
  int NumA = in_tensors_[0]->shape()[0];
  int NumB = in_tensors_[1]->shape()[0];
  size_t dtype_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
  padWeight_ = allocator->Malloc(NumA * NumB * dtype_size);
  // Map the buffer so the host can fill it; unmapped again below.
  padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
  auto padWeightFp32 = reinterpret_cast<float *>(padWeight_);
  auto padWeightFp16 = reinterpret_cast<float16_t *>(padWeight_);
  memset(padWeight_, 0x00, NumA * NumB * dtype_size);
  auto originWeightFp32 = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->data_c());
  auto originWeightFp16 = reinterpret_cast<float16_t *>(in_tensors_.at(kWeightIndex)->data_c());
  bool isModelFp16 = in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16;
  // Scratch matrices cover the half-size quadrants of the recursion.
  AllocatorMemoryForStrassen(NumA / 2, NumB / 2);
  size_t size = NumA * NumB * dtype_size;
  // Four cases: model dtype (fp16/fp32) x runtime dtype (fp16/fp32).
  // Matching dtypes take a bulk memcpy; mixed dtypes convert element-wise.
  if (isModelFp16) {
    if (enable_fp16_) {
      memcpy(padWeightFp16, originWeightFp16, size);
    } else {
      for (int i = 0; i < NumA * NumB; ++i) {
        padWeightFp32[i] = static_cast<float>(originWeightFp16[i]);
      }
    }
  } else {
    if (enable_fp16_) {
      for (int i = 0; i < NumA * NumB; ++i) {
        padWeightFp16[i] = static_cast<float16_t>(originWeightFp32[i]);
      }
    } else {
      memcpy(padWeightFp32, originWeightFp32, size);
    }
  }
  allocator->UnmapBuffer(padWeight_);
  FreeDequantedWeight();
  return RET_OK;
}
| void AlignStrassenGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local, | |||
| cl::NDRange *global_range, cl::NDRange *local_range) { | |||
| *local_range = cl::NDRange(local[0], local[1], local[2]); | |||
| *global_range = | |||
| cl::NDRange(UP_ROUND(global[0], local[0]), UP_ROUND(global[1], local[1]), UP_ROUND(global[2], local[2])); | |||
| } | |||
// 0 : global_size_, 1: global_size_add_sub
// type_flag == 0 sizes the main matmul NDRange (global_size_/local_size_);
// any other value sizes the helper-kernel NDRange
// (global_add_sub_/local_add_sub_). Both use {size/4up, 1, size}.
void StrassenOpenCLKernel::StrassenSetGlobalLocal(size_t strassen_size, int type_flag) {
  size_t strassen_size_C4 = UP_DIV(strassen_size, C4NUM);
  local_size_add_sub = {16, 1, 16};
  if (type_flag == 0) {
    global_size_ = {strassen_size_C4, 1, strassen_size};
    AlignGlobalLocal(global_size_, local_size_);
  } else {
    global_size_add_sub = {strassen_size_C4, 1, strassen_size};
    AlignStrassenGlobalLocal(global_size_add_sub, local_size_add_sub, &global_add_sub_, &local_add_sub_);
  }
}
void StrassenOpenCLKernel::SetGlobalLocal() {
  // local size should less than MAX_GROUP_SIZE
  local_size_ = {32, 4, 1};
  global_size_ = {1, 1, 1};
  size_t strassen_size = outShape[3] / 2;
  StrassenSetGlobalLocal(strassen_size, 0);  // set global_ and local
  StrassenSetGlobalLocal(strassen_size, 1);  // set global_size_add_sub
  // NOTE(review): StrassenSetGlobalLocal() has no dedicated branch for
  // type_flag == 2 — it takes the same path as 1, so this call merely
  // repeats the previous one. Confirm whether it can be dropped.
  StrassenSetGlobalLocal(strassen_size, 2);  // set global_size_weights
}
// Sets the shape argument at `index`: the matmul kernel receives an NHWC
// shape {1, 1, size, size}; the helper kernels receive
// {size, 1, 1, UP_DIV(size, C4NUM)}.
void StrassenOpenCLKernel::StrassenSetConstArgs(cl::Kernel *kernel, int index, int strassen_size,
                                                bool is_matmul_kernel) {
  cl_int4 shape;
  if (is_matmul_kernel) {
    shape = {1, 1, strassen_size, strassen_size};
  } else {
    shape = {strassen_size, 1, 1, UP_DIV(strassen_size, C4NUM)};
  }
  ocl_runtime_->SetKernelArg(*kernel, index, shape);
}
// Sets the constant args of the main matmul kernel (indices 2..4: in shape,
// out shape, offset) and the fixed shape args of the add/sub helpers.
void StrassenOpenCLKernel::SetConstArgs() {
  int arg_count = 2;
  cl_int4 in_shape = {inShape[0], inShape[1], inShape[2], inShape[3]};
  cl_int4 out_shape = {outShape[0], outShape[1], outShape[2], outShape[3]};
  cl_int4 shape_offset = {0, 0, 0, 0};
  int strassen_size = inShape[3] / 2;
  // The recursion operates on half-size quadrants, so W and C are halved.
  out_shape.s[2] = in_shape.s[2] = in_shape.s[2] / 2;
  out_shape.s[3] = in_shape.s[3] = in_shape.s[3] / 2;
  StrassenSetConstArgs(&kernel_IMG_add_sub_2, 3, strassen_size, false);
  StrassenSetConstArgs(&kernel_BUF_add_sub_2, 2, strassen_size, false);
  ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape);
  ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape);
  ocl_runtime_->SetKernelArg(kernel_, arg_count++, shape_offset);
}
| // OriginSize = N*H*W*C typesize = sizeof(type data) width = W * UP_DIV(C,C4NUM) size = N | |||
| void StrassenOpenCLKernel::PrintImage2d(void *IMGData, size_t typesize, size_t width, size_t size) { | |||
| auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper(); | |||
| int alignment = runtime_wrapper.GetInstance()->GetImagePitchAlignment(); | |||
| auto runtime = runtime_wrapper.GetInstance(); | |||
| runtime->SyncCommandQueue(); | |||
| MS_ASSERT(alignment); | |||
| size_t row_pitch = UP_ROUND(width, alignment) * typesize * C4NUM; | |||
| size_t OriginSize = size * size * typesize; | |||
| std::vector<char> data(OriginSize); | |||
| auto row_size = width * typesize * C4NUM; | |||
| for (int i = 0; i < size; ++i) { | |||
| memcpy(reinterpret_cast<char *>(data.data()) + i * row_size, static_cast<char *>(IMGData) + i * row_pitch, | |||
| row_size); | |||
| } | |||
| for (int i = 0; i < size * size; ++i) { | |||
| if ((i + 1) % size == 0) { | |||
| std::cout << std::endl; | |||
| } | |||
| } | |||
| } | |||
// Copies one half-size block of `input`, selected by `offset`, into `output`
// via the MatMul_*_Filled kernel; `mem_type` chooses the image or buffer
// argument variant. Launched on the helper NDRange (global/local_add_sub_).
void StrassenOpenCLKernel::StrassenDataFilled(cl::Kernel *kernel, void *input, void *output, const int size,
                                              cl_int2 offset, lite::opencl::MemType mem_type) {
  if (input == nullptr || output == nullptr) {
    MS_LOG(ERROR) << "StrassenDataFilled input or output can not nullptr";
    return;
  }
  if (mem_type == lite::opencl::MemType::IMG) {
    ocl_runtime_->SetKernelArg(*kernel, 0, input);
    ocl_runtime_->SetKernelArg(*kernel, 1, output);
  } else {
    ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::BUF);
    ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::BUF);
  }
  StrassenSetConstArgs(kernel, 2, size, false);
  ocl_runtime_->SetKernelArg(*kernel, 3, offset);
  ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_);
}
// Computes output = block(input, offset.s[0..1]) op block(input, offset.s[2..3])
// with the add/sub helper kernel; flag == 0 means add, flag == 1 means sub
// (see the comment in DoStrassen). `mem_type` selects image vs buffer args.
void StrassenOpenCLKernel::StrassenAddSub(cl::Kernel *kernel, void *input, void *output, const int size, cl_int4 offset,
                                          int flag, lite::opencl::MemType mem_type) {
  if (input == nullptr || output == nullptr) {
    MS_LOG(ERROR) << "StrassenAddSub input or output can not nullptr";
    return;
  }
  if (mem_type == lite::opencl::MemType::IMG) {
    ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::IMG);
    ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::IMG);
  } else {
    ocl_runtime_->SetKernelArg(*kernel, 0, input, lite::opencl::MemType::BUF);
    ocl_runtime_->SetKernelArg(*kernel, 1, output, lite::opencl::MemType::BUF);
  }
  StrassenSetConstArgs(kernel, 2, size, false);
  ocl_runtime_->SetKernelArg(*kernel, 3, offset);
  ocl_runtime_->SetKernelArg(*kernel, 4, flag);
  ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_);
}
// Recombines the seven Strassen products M1..M7 (input1..input7) into the
// final result block via the Strassen_Back_Result kernel.
void StrassenOpenCLKernel::StrassenBackResult(cl::Kernel *kernel, void *input1, void *input2, void *input3,
                                              void *input4, void *input5, void *input6, void *input7, void *output,
                                              const int size) {
  if (input1 == nullptr || input2 == nullptr || input3 == nullptr || input4 == nullptr || input5 == nullptr ||
      input6 == nullptr || input7 == nullptr || output == nullptr) {
    MS_LOG(ERROR) << "StrassenBackResult input or output can not nullptr";
    return;
  }
  ocl_runtime_->SetKernelArg(*kernel, 0, input1);
  ocl_runtime_->SetKernelArg(*kernel, 1, input2);
  ocl_runtime_->SetKernelArg(*kernel, 2, input3);
  ocl_runtime_->SetKernelArg(*kernel, 3, input4);
  ocl_runtime_->SetKernelArg(*kernel, 4, input5);
  ocl_runtime_->SetKernelArg(*kernel, 5, input6);
  ocl_runtime_->SetKernelArg(*kernel, 6, input7);
  ocl_runtime_->SetKernelArg(*kernel, 7, output);
  StrassenSetConstArgs(kernel, 8, size, false);
  ocl_runtime_->RunKernel(*kernel, global_add_sub_, local_add_sub_, nullptr, &event_);
}
// Base case of the recursion: one plain matmul of a size x size block.
// Kernel args: 0 = input image, 1 = output image, 2 = weight buffer,
// 3/4 = in/out shapes ({1, 1, size, size}).
void StrassenOpenCLKernel::StrassenRunMmatmul(void *input, void *weight, void *output, const int size) {
  if (input == nullptr || weight == nullptr || output == nullptr) {
    MS_LOG(ERROR) << "StrassenRunMmatmul input ,weight or output can not nullptr";
    return;
  }
  ocl_runtime_->SetKernelArg(kernel_, 0, input);
  ocl_runtime_->SetKernelArg(kernel_, 1, output);
  ocl_runtime_->SetKernelArg(kernel_, 2, weight, lite::opencl::MemType::BUF);
  StrassenSetConstArgs(&kernel_, 3, size, true);
  StrassenSetConstArgs(&kernel_, 4, size, true);
  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
}
// Recursive Strassen multiplication: result = data x weight for a
// size x size matrix. Above `threshold` the matrix is split into half-size
// quadrants (A blocks staged in images via A_temp, B blocks in buffers via
// B_temp) and the seven products M1..M7 are formed recursively, then
// recombined by StrassenBackResult. At or below `threshold` the plain
// matmul kernel runs directly. Offsets passed to the helpers are expressed
// in rows and C4 column blocks (C4 = UP_DIV(size_2, C4NUM)).
void StrassenOpenCLKernel::DoStrassen(void *data, void *weight, void *result, const int size, const int depth,
                                      const int threshold) {
  const int size_2 = size / 2;
  int C4 = UP_DIV(size_2, C4NUM);
  if (size <= threshold) {
    // run matmul;
    StrassenSetGlobalLocal(size, 0);
    StrassenRunMmatmul(data, weight, result, size);
    return;
  }
  // flag = 0 : add otherwise flag = 1 : sub
  // M1 = A11 * ( B12- B22)
  StrassenSetGlobalLocal(size_2, 1);
  StrassenDataFilled(&MatMul_StrassenIMGFilled, data, A_temp[depth + 1], size_2, {0, 0}, lite::opencl::MemType::IMG);
  StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {0, C4, size_2, C4}, 1,
                 lite::opencl::MemType::BUF);
  DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M1[depth + 1], size_2, depth + 1, threshold);
  // M2 = (A11 + A12) * B22
  StrassenSetGlobalLocal(size_2, 1);
  StrassenDataFilled(&MatMul_StrassenBUFFilled, weight, B_temp[depth + 1], size_2, {size_2, C4},
                     lite::opencl::MemType::BUF);
  StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {0, 0, 0, C4}, 0, lite::opencl::MemType::IMG);
  DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M2[depth + 1], size_2, depth + 1, threshold);
  // M3 = (A21 + A22) * B11
  StrassenSetGlobalLocal(size_2, 1);
  StrassenDataFilled(&MatMul_StrassenBUFFilled, weight, B_temp[depth + 1], size_2, {0, 0}, lite::opencl::MemType::BUF);
  StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {size_2, 0, size_2, C4}, 0,
                 lite::opencl::MemType::IMG);
  DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M3[depth + 1], size_2, depth + 1, threshold);
  // M4 = A22 * (B21 - B11)
  StrassenSetGlobalLocal(size_2, 1);
  StrassenDataFilled(&MatMul_StrassenIMGFilled, data, A_temp[depth + 1], size_2, {size_2, C4},
                     lite::opencl::MemType::IMG);
  StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {size_2, 0, 0, 0}, 1,
                 lite::opencl::MemType::BUF);
  DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M4[depth + 1], size_2, depth + 1, threshold);
  // M5 = (A11 + A22) * (B11 + B22)
  StrassenSetGlobalLocal(size_2, 1);
  StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {0, 0, size_2, C4}, 0,
                 lite::opencl::MemType::IMG);
  // (B11 + B22)
  StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {0, 0, size_2, C4}, 0,
                 lite::opencl::MemType::BUF);
  DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M5[depth + 1], size_2, depth + 1, threshold);
  // M6 = (A12 - A22) * (B21 + B22)
  StrassenSetGlobalLocal(size_2, 1);
  StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {0, C4, size_2, C4}, 1,
                 lite::opencl::MemType::IMG);
  StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {size_2, 0, size_2, C4}, 0,
                 lite::opencl::MemType::BUF);
  DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M6[depth + 1], size_2, depth + 1, threshold);
  // M7 = (A11 - A21) * (B11 + B12)
  StrassenSetGlobalLocal(size_2, 1);
  StrassenAddSub(&kernel_IMG_add_sub_2, data, A_temp[depth + 1], size_2, {0, 0, size_2, 0}, 1,
                 lite::opencl::MemType::IMG);
  StrassenAddSub(&kernel_BUF_add_sub_2, weight, B_temp[depth + 1], size_2, {0, 0, 0, C4}, 0,
                 lite::opencl::MemType::BUF);
  DoStrassen(A_temp[depth + 1], B_temp[depth + 1], M7[depth + 1], size_2, depth + 1, threshold);
  // BackResult
  StrassenSetGlobalLocal(size_2, 1);
  StrassenBackResult(&kernel_back_result, M1[depth + 1], M2[depth + 1], M3[depth + 1], M4[depth + 1], M5[depth + 1],
                     M6[depth + 1], M7[depth + 1], result, size_2);
}
| int StrassenOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| int threshold = 0; | |||
| const int up_bound = 1024; | |||
| const int down_bound = 256; | |||
| if (in_tensors_.at(0)->shape()[0] >= up_bound) { | |||
| threshold = UP_DIV(in_tensors_.at(0)->shape()[0], C4NUM) / 2; | |||
| } else if (in_tensors_.at(0)->shape()[0] <= down_bound) { | |||
| threshold = in_tensors_.at(0)->shape()[0]; | |||
| } else { | |||
| threshold = UP_DIV(in_tensors_.at(0)->shape()[0], C4NUM); | |||
| } | |||
| DoStrassen(in_tensors_.at(0)->data_c(), padWeight_, out_tensors_.at(0)->data_c(), in_tensors_.at(0)->shape()[0], 0, | |||
| threshold); | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| } // namespace mindspore::kernel | |||
| @@ -0,0 +1,77 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_STRASSEN_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_STRASSEN_H_ | |||
| #include <string> | |||
| #include <vector> | |||
| #include "src/runtime/kernel/opencl/kernel/matmul.h" | |||
| namespace mindspore::kernel { | |||
| class StrassenOpenCLKernel : public MatMulOpenCLKernel { | |||
| public: | |||
| StrassenOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs) | |||
| : MatMulOpenCLKernel(parameter, inputs, outputs) {} | |||
| ~StrassenOpenCLKernel() override = default; | |||
| public: | |||
| int Run() override; | |||
| int Prepare() override; | |||
| int InitWeights() override; | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| // strassen | |||
| private: | |||
| void AllocatorMemoryForStrassen(int NumA, int NumB); | |||
| void DoStrassen(void *data, void *weight, void *result, const int size, const int depth, const int threshold); | |||
| void StrassenSetGlobalLocal(size_t strassen_size, int type_flag); | |||
| void StrassenSetConstArgs(cl::Kernel *kernel, int index, int strassen_size, bool is_matmul_kernel); | |||
| void StrassenDataFilled(cl::Kernel *kernel, void *input, void *output, const int size, cl_int2 offset, | |||
| lite::opencl::MemType mem_type); | |||
| void StrassenAddSub(cl::Kernel *kernel, void *input, void *output, const int size, cl_int4 offset, int flag, | |||
| lite::opencl::MemType mem_type); | |||
| void StrassenBackResult(cl::Kernel *kernel, void *input1, void *input2, void *input3, void *input4, void *input5, | |||
| void *input6, void *input7, void *output, const int size); | |||
| void StrassenRunMmatmul(void *input, void *weight, void *output, const int size); | |||
| void PrintImage2d(void *IMGData, size_t typesize, size_t width, size_t size); | |||
| cl::Kernel kernel_IMG_add_sub_2; | |||
| cl::Kernel MatMul_StrassenBUFFilled; | |||
| cl::Kernel MatMul_StrassenIMGFilled; | |||
| cl::Kernel kernel_BUF_add_sub_2; | |||
| cl::Kernel kernel_back_result; | |||
| cl::NDRange global_add_sub_, local_add_sub_; | |||
| std::vector<size_t> global_size_add_sub; | |||
| std::vector<size_t> local_size_add_sub; | |||
| // image 2d | |||
| void *A_temp[MAXDEPTH] = {nullptr}; | |||
| void *M1[MAXDEPTH] = {nullptr}; | |||
| void *M2[MAXDEPTH] = {nullptr}; | |||
| void *M3[MAXDEPTH] = {nullptr}; | |||
| void *M4[MAXDEPTH] = {nullptr}; | |||
| void *M5[MAXDEPTH] = {nullptr}; | |||
| void *M6[MAXDEPTH] = {nullptr}; | |||
| void *M7[MAXDEPTH] = {nullptr}; | |||
| // buffer | |||
| void *B_temp[MAXDEPTH] = {nullptr}; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_WINOGRAD_H_ | |||
| @@ -26,7 +26,156 @@ using mindspore::lite::KernelRegistry; | |||
| using mindspore::schema::Format::Format_NHWC; | |||
| namespace mindspore::lite::opencl::test { | |||
| // muti-output | |||
| void TestMain(const std::vector<ArgsTuple> &input_infos, const std::vector<ArgsTupleOut> &output_info, | |||
| OpParameter *op_parameter, bool fp16_enable, float atol, float rtol, bool print_data) { | |||
| std::vector<ArgsTupleWithDtype> input_infos_new; | |||
| auto transform_fun = [](ArgsTuple in) -> ArgsTupleWithDtype { | |||
| return ArgsTupleWithDtype(std::get<0>(in), std::get<1>(in), std::get<2>(in), kNumberTypeFloat32); | |||
| }; | |||
| std::transform(input_infos.begin(), input_infos.end(), std::back_inserter(input_infos_new), transform_fun); | |||
| TestMain(input_infos_new, output_info, op_parameter, fp16_enable, atol, rtol, print_data); | |||
| } | |||
| void TestMain(const std::vector<ArgsTupleWithDtype> &input_infos, const std::vector<ArgsTupleOut> &output_info, | |||
| OpParameter *op_parameter, bool fp16_enable, float atol, float rtol, bool print_data) { | |||
| auto primitive_type = static_cast<schema::PrimitiveType>(op_parameter->type_); | |||
| static std::set<schema::PrimitiveType> packed_op = { | |||
| schema::PrimitiveType_Conv2D, schema::PrimitiveType_DeConv2D, schema::PrimitiveType_DepthwiseConv2D, | |||
| schema::PrimitiveType_DeDepthwiseConv2D, schema::PrimitiveType_MatMul}; | |||
| // simulating benchmark: session::LiteSession::CreateSession() -> session->Init() | |||
| MS_LOG(DEBUG) << "initialize OpenCLRuntime and OpenCLAllocator"; | |||
| auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper(); | |||
| auto ocl_runtime = runtime_wrapper.GetInstance(); | |||
| ocl_runtime->SetFp16Enable(fp16_enable); | |||
| EXPECT_TRUE(ocl_runtime->Init() == RET_OK); | |||
| // simulating benchmark: session_->CompileGraph() -> ConvertTensors() | |||
| MS_LOG(DEBUG) << "create Tensors & init weight data"; | |||
| std::vector<std::shared_ptr<Tensor>> in_tensors; | |||
| std::vector<std::shared_ptr<Tensor>> out_tensors; | |||
| // firstly, create all Tensors | |||
| in_tensors.reserve(input_infos.size()); // vector's capacity() is 0, so call reserve() avoiding vector re-malloc | |||
| for (auto input_info : input_infos) { | |||
| auto &shape = std::get<0>(input_info); | |||
| auto category = std::get<2>(input_info); | |||
| auto data_type = std::get<3>(input_info); | |||
| in_tensors.emplace_back(std::make_shared<Tensor>(data_type, shape, Format_NHWC, category)); | |||
| } | |||
| for (auto outout_info : output_info) { | |||
| const std::vector<int> &output_shape = std::get<0>(outout_info); | |||
| out_tensors.emplace_back(std::make_shared<Tensor>(kNumberTypeFloat32, output_shape, Format_NHWC, VAR)); | |||
| } | |||
| // secondly, init weight Tensor's data | |||
| std::vector<Tensor *> kernel_inputs; | |||
| std::vector<Tensor *> subgraph_inputs; | |||
| std::vector<Tensor *> outputs; | |||
| std::map<Tensor *, float *> subgraph_inputs_data; | |||
| for (int i = 0; i < in_tensors.size(); ++i) { | |||
| auto tensor = in_tensors[i]; | |||
| auto *input_data = std::get<1>(input_infos[i]); | |||
| kernel_inputs.push_back(tensor.get()); | |||
| if (tensor->category() != VAR) { // tensor is weight | |||
| // simulating src/lite_session.cc:WeightTensorNeedCopy() | |||
| if (packed_op.count(primitive_type)) { | |||
| tensor->set_data(input_data); | |||
| } else { | |||
| memcpy(tensor->MutableData(), input_data, tensor->Size()); | |||
| } | |||
| } else { | |||
| EXPECT_TRUE(tensor->data_type() == kNumberTypeFloat32 || tensor->data_type() == kNumberTypeInt32); | |||
| subgraph_inputs.push_back(tensor.get()); | |||
| subgraph_inputs_data[tensor.get()] = reinterpret_cast<float *>(input_data); | |||
| } | |||
| } | |||
| for (int i = 0; i < out_tensors.size(); ++i) { | |||
| auto out_tensor = out_tensors[i]; | |||
| outputs.push_back(out_tensor.get()); | |||
| } | |||
| // simulating benchmark: session_->CompileGraph() -> scheduler.Schedule() -> BuildKernels() | |||
| MS_LOG(DEBUG) << "create OpenCLKernel"; | |||
| kernel::KernelKey key{kernel::kGPU, kernel_inputs.front()->data_type(), primitive_type}; | |||
| auto creator = KernelRegistry::GetInstance()->GetCreator(key); | |||
| if (creator == nullptr) { | |||
| std::cerr << "can't get registry function for: " << schema::EnumNamePrimitiveType(primitive_type) | |||
| << ". Maybe you forget setting op_parameter_.type_ for OpParameter." << std::endl; | |||
| free(op_parameter); | |||
| FAIL(); | |||
| } | |||
| auto *kernel = creator(kernel_inputs, outputs, op_parameter, nullptr, key, nullptr); | |||
| if (kernel == nullptr) { | |||
| std::cerr << "call registry function error: " << schema::EnumNamePrimitiveType(primitive_type) << std::endl; | |||
| free(op_parameter); | |||
| FAIL(); | |||
| } | |||
| kernel->set_name(schema::EnumNamesPrimitiveType()[primitive_type]); | |||
| // simulating benchmark: session_->CompileGraph() -> scheduler.Schedule() -> ConstructSubGraphs() | |||
| MS_LOG(DEBUG) << "create SubGraph"; | |||
| std::vector<LiteKernel *> kernels{kernel}; | |||
| auto sub_graph = new (std::nothrow) OpenCLSubGraph(subgraph_inputs, outputs, kernels, kernels, kernels); | |||
| if (sub_graph == nullptr) { | |||
| return; | |||
| } | |||
| // call sub_graph->Init() after construct subgraph like scheduler.cc | |||
| MS_LOG(DEBUG) << "call sub_graph->Init()"; | |||
| EXPECT_TRUE(sub_graph->Init() == RET_OK); | |||
| // simulating benchmark: session_->CompileGraph() -> PrepareKernels() -> OpenCLSubGraph.Prepare() | |||
| MS_LOG(DEBUG) << "call sub_graph->Prepare()"; | |||
| EXPECT_TRUE(sub_graph->Prepare() == RET_OK); // will set Tensor's allocator be OpenCLAllocator | |||
| // simulating benchmark: model->Free(), clear weight data in input_infos | |||
| std::vector<std::unique_ptr<uint8_t[]>> saved_weights; | |||
| for (int i = 0; i < in_tensors.size(); ++i) { | |||
| auto &tensor = in_tensors[i]; | |||
| if (tensor->category() != VAR) { | |||
| saved_weights.emplace_back(new uint8_t[tensor->Size()]); | |||
| auto *weight_data = std::get<1>(input_infos[i]); | |||
| memcpy(saved_weights.back().get(), weight_data, tensor->Size()); | |||
| srand(time(nullptr)); | |||
| memset(weight_data, rand(), tensor->Size()); | |||
| } | |||
| } | |||
| // simulating benchmark: LoadInput() | |||
| MS_LOG(DEBUG) << "malloc and init input data"; | |||
| for (auto input : subgraph_inputs) { | |||
| EXPECT_TRUE(input->MutableData() != nullptr); // malloc Image2D & call MapBuffer() | |||
| memcpy(input->data_c(), subgraph_inputs_data[input], input->Size()); | |||
| } | |||
| // simulating benchmark: MarkAccuracy() -> session_->RunGraph() -> executor_->Run() -> OpenCLSubGraph->Run() | |||
| MS_LOG(DEBUG) << "run SubGraph & compare result"; | |||
| EXPECT_TRUE(sub_graph->Run() == RET_OK); // will call UnmapBuffer() for input | |||
| for (int i = 0; i < outputs.size(); ++i) { | |||
| ocl_runtime->GetAllocator()->MapBuffer(outputs[i]->data_c(), CL_MAP_READ, nullptr, true); | |||
| float *expect_data = reinterpret_cast<float *>(std::get<1>(output_info[i])); | |||
| CompareOutput<float>(outputs[i]->data_c(), expect_data, outputs[i]->ElementsNum(), atol, rtol, print_data); | |||
| ocl_runtime->GetAllocator()->UnmapBuffer(outputs[i]->data_c()); | |||
| } | |||
| MS_LOG(DEBUG) << "release resources"; | |||
| for (auto &tensor : in_tensors) { | |||
| if (tensor->category() != VAR && packed_op.count(primitive_type)) { | |||
| tensor->set_data(nullptr); | |||
| } | |||
| } | |||
| for (int i = 0, j = 0; i < in_tensors.size(); ++i) { // resume weight data to input_infos | |||
| auto &tensor = in_tensors[i]; | |||
| if (tensor->category() != VAR) { | |||
| auto *weight_data = std::get<1>(input_infos[i]); | |||
| memcpy(weight_data, saved_weights[j++].get(), tensor->Size()); | |||
| } | |||
| } | |||
| delete sub_graph; | |||
| } | |||
| // single-output | |||
| void TestMain(const std::vector<ArgsTupleWithDtype> &input_infos, std::tuple<std::vector<int>, float *> output_info, | |||
| OpParameter *op_parameter, bool fp16_enable, float atol, float rtol, bool print_data) { | |||
| auto primitive_type = static_cast<schema::PrimitiveType>(op_parameter->type_); | |||
| @@ -31,6 +31,7 @@ | |||
| using Tensor = mindspore::lite::Tensor; | |||
| using ArgsTuple = std::tuple<std::vector<int>, void *, Tensor::Category>; | |||
| using ArgsTupleOut = std::tuple<std::vector<int>, void *>; | |||
| using ArgsTupleWithDtype = std::tuple<std::vector<int>, void *, Tensor::Category, mindspore::TypeId>; | |||
| constexpr Tensor::Category VAR = Tensor::VAR; | |||
| constexpr Tensor::Category CONST_TENSOR = Tensor::Category::CONST_TENSOR; | |||
| @@ -89,10 +90,17 @@ T *CreateParameter(schema::PrimitiveType type) { | |||
| return param; | |||
| } | |||
| void TestMain(const std::vector<ArgsTupleWithDtype> &input_infos, std::tuple<std::vector<int>, float *> output_info, | |||
| void TestMain(const std::vector<ArgsTupleWithDtype> &input_infos, const std::vector<ArgsTupleOut> &output_info, | |||
| OpParameter *op_parameter, bool fp16_enable = false, float atol = 1e-9, float rtol = 1e-9, | |||
| bool print_output = false); | |||
| void TestMain(const std::vector<ArgsTuple> &input_infos, const std::vector<ArgsTupleOut> &output_info, | |||
| OpParameter *op_parameter, bool fp16_enable = false, float atol = 1e-9, float rtol = 1e-9, | |||
| bool print_output = false); | |||
| void TestMain(const std::vector<ArgsTupleWithDtype> &input_infos, std::tuple<std::vector<int>, float *> output_info, | |||
| OpParameter *op_parameter, bool fp16_enable = false, float atol = 1e-9, float rtol = 1e-9, | |||
| bool print_output = false); | |||
| void TestMain(const std::vector<ArgsTuple> &input_infos, std::tuple<std::vector<int>, float *> output_info, | |||
| OpParameter *op_parameter, bool fp16_enable = false, float atol = 1e-9, float rtol = 1e-9, | |||
| bool print_output = false); | |||
| @@ -44,6 +44,21 @@ TEST_F(TestOpenCL_Concat, input2_axis0) { | |||
| } | |||
| } | |||
| TEST_F(TestOpenCL_Concat, input2_axis0_shape1) { | |||
| std::vector<int> input0_shape = {1}; | |||
| std::vector<int> input1_shape = {1}; | |||
| std::vector<int> output_shape = {2}; | |||
| int axis = 0; | |||
| float input0_data[] = {0.75}; | |||
| float input1_data[] = {0.5}; | |||
| float output_data[] = {0.75, 0.5}; | |||
| for (auto fp16_enable : {false}) { | |||
| auto *param = CreateParameter(axis); | |||
| TestMain({{input0_shape, input0_data, VAR}, {input1_shape, input1_data, VAR}}, {output_shape, output_data}, param, | |||
| fp16_enable, fp16_enable ? 1e-3 : 1e-9); | |||
| } | |||
| } | |||
| TEST_F(TestOpenCL_Concat, input2_axis1_Align) { | |||
| std::vector<int> input0_shape = {2, 2, 2, 8}; | |||
| std::vector<int> input1_shape = {2, 2, 2, 8}; | |||
| @@ -32,13 +32,30 @@ OpParameter *CreateParameter(int axis) { | |||
| TEST_F(TestOpenCL_Gather, Axis0) { | |||
| int axis = 0; | |||
| std::vector<int> input_shape = {10}; | |||
| std::vector<int> indices_shape = {2}; | |||
| std::vector<int> output_shape = {2}; | |||
| std::vector<int> indices_shape = {5}; | |||
| std::vector<int> output_shape = {5}; | |||
| float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; | |||
| int32_t indices[] = {1, 3}; | |||
| float output_data[] = {1, 3}; | |||
| int32_t indices[] = {1, 3, 5, 7, 9}; | |||
| float output_data[] = {1, 3, 5, 7, 9}; | |||
| for (auto fp16_enable : {false, true}) { | |||
| for (auto fp16_enable : {false}) { | |||
| auto *param = CreateParameter(axis); | |||
| TestMain( | |||
| {{input_shape, input_data, VAR, kNumberTypeFloat32}, {indices_shape, indices, CONST_TENSOR, kNumberTypeInt32}}, | |||
| {output_shape, output_data}, param, fp16_enable); | |||
| } | |||
| } | |||
| TEST_F(TestOpenCL_Gather, Axis0ConstTensor) { | |||
| int axis = 0; | |||
| std::vector<int> input_shape = {10}; | |||
| std::vector<int> indices_shape = {1}; | |||
| std::vector<int> output_shape = {1}; | |||
| float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; | |||
| int32_t indices[] = {1}; | |||
| float output_data[] = {1}; | |||
| for (auto fp16_enable : {false}) { | |||
| auto *param = CreateParameter(axis); | |||
| TestMain( | |||
| {{input_shape, input_data, VAR, kNumberTypeFloat32}, {indices_shape, indices, CONST_TENSOR, kNumberTypeInt32}}, | |||
| @@ -49,11 +66,11 @@ TEST_F(TestOpenCL_Gather, Axis0) { | |||
| TEST_F(TestOpenCL_Gather, Axis0_Tensor) { | |||
| int axis = 0; | |||
| std::vector<int> input_shape = {10}; | |||
| std::vector<int> indices_shape = {2}; | |||
| std::vector<int> output_shape = {2}; | |||
| std::vector<int> indices_shape = {1}; | |||
| std::vector<int> output_shape = {1}; | |||
| float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; | |||
| int32_t indices[] = {1, 3}; | |||
| float output_data[] = {1, 3}; | |||
| int32_t indices[] = {1}; | |||
| float output_data[] = {1}; | |||
| for (auto fp16_enable : {false}) { | |||
| auto *param = CreateParameter(axis); | |||
| @@ -45,7 +45,7 @@ TEST_F(TestOpenCL_MatMul, 2Dfile) { | |||
| auto output_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(correctOutputPath.c_str(), &output_size)); | |||
| for (auto fp16_enable : {false}) { | |||
| auto *param = CreateParameter(); | |||
| auto *param = CreateParameter(false, false); | |||
| TestMain({{input_shape, input_data, VAR}, {weight_shape, weight_data, CONST_TENSOR}}, {output_shape, output_data}, | |||
| param, fp16_enable, fp16_enable ? 1e-3 : 1e-3); | |||
| } | |||
| @@ -0,0 +1,57 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "ut/src/runtime/kernel/opencl/common.h" | |||
| #include "nnacl/split_parameter.h" | |||
| namespace mindspore::lite::opencl::test { | |||
| class TestOpenCL_Split : public CommonTest {}; | |||
| namespace { | |||
| // PrimitiveType_Split: src/ops/populate/split_populate.cc | |||
| OpParameter *CreateParameter(int split_dim_, int num_split_, std::vector<int> split_sizes_) { | |||
| auto *param = test::CreateParameter<SplitParameter>(schema::PrimitiveType_Split); | |||
| param->split_dim_ = split_dim_; | |||
| param->num_split_ = num_split_; | |||
| param->split_sizes_ = reinterpret_cast<int *>(malloc(param->num_split_ * sizeof(int))); | |||
| for (int i = 0; i < param->num_split_; ++i) { | |||
| param->split_sizes_[i] = split_sizes_[i]; | |||
| } | |||
| return reinterpret_cast<OpParameter *>(param); | |||
| } | |||
| } // namespace | |||
| TEST_F(TestOpenCL_Split, input2_axis3) { | |||
| std::vector<int> input_shape = {2, 2, 2, 12}; | |||
| std::vector<int> output_shape1 = {2, 2, 2, 6}; | |||
| std::vector<int> output_shape2 = {2, 2, 2, 6}; | |||
| int split_dim_ = 3; | |||
| int num_split_ = 2; // len of split_sizes_ | |||
| std::vector<int> split_sizes_{6, 6}; | |||
| size_t input_size, output1_size, output2_size; | |||
| std::string inputPpath = "./test_data/splitfp32_input.bin"; | |||
| std::string output1Ppath = "./test_data/splitfp32_output1.bin"; | |||
| std::string output2Ppath = "./test_data/splitfp32_output2.bin"; | |||
| auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(inputPpath.c_str(), &input_size)); | |||
| auto output_data1 = reinterpret_cast<float *>(mindspore::lite::ReadFile(output1Ppath.c_str(), &output1_size)); | |||
| auto output_data2 = reinterpret_cast<float *>(mindspore::lite::ReadFile(output2Ppath.c_str(), &output2_size)); | |||
| for (auto fp16_enable : {false}) { | |||
| auto *param = CreateParameter(split_dim_, num_split_, split_sizes_); | |||
| TestMain({{input_shape, input_data, VAR}}, {{output_shape1, output_data1}, {output_shape2, output_data2}}, param, | |||
| fp16_enable, fp16_enable ? 1e-3 : 1e-9); | |||
| } | |||
| } | |||
| } // namespace mindspore::lite::opencl::test | |||