From e6157bf19101ab9a3c94dfaacf817241795d7325 Mon Sep 17 00:00:00 2001
From: fuzhiye
Date: Mon, 3 Aug 2020 09:09:49 +0800
Subject: [PATCH] extract layout transform func for fp16 op

---
 .../kernel/arm/base/layout_transform.cc       | 10 -----
 .../kernel/arm/fp16/convolution_3x3_fp16.cc   | 10 +++--
 .../kernel/arm/fp16/convolution_fp16.cc       | 10 +++--
 .../kernel/arm/fp16/layout_transform_fp16.cc  | 39 +++++++++++++++++
 .../kernel/arm/fp16/layout_transform_fp16.h   | 27 ++++++++++++
 .../runtime/kernel/arm/fp32/convolution.cc    | 14 ++++++-
 .../src/runtime/kernel/arm/fp32/convolution.h |  3 +-
 .../kernel/arm/fp32/convolution_3x3.cc        | 34 ++++++++++-----
 .../runtime/kernel/arm/fp32/convolution_3x3.h |  5 +--
 .../kernel/arm/fp32/convolution_winograd.cc   | 42 ++++++++++++-------
 .../kernel/arm/fp32/convolution_winograd.h    |  6 +--
 .../runtime/kernel/arm/opclib/common_func.cc  |  7 +++-
 .../runtime/kernel/arm/opclib/common_func.h   |  3 ++
 .../runtime/kernel/arm/opclib/fp32/conv.cc    | 24 ++++++-----
 .../src/runtime/kernel/arm/opclib/fp32/conv.h | 11 +++--
 .../kernel/arm/opclib/int8/conv_int8.cc       |  4 +-
 .../kernel/arm/opclib/optimized_kernel.h      |  8 ++--
 .../src/runtime/kernel/arm/opclib/pack.cc     | 21 +++++-----
 .../lite/src/runtime/kernel/arm/opclib/pack.h |  3 +-
 .../kernel/arm/opclib/winograd_transform.cc   | 15 ++++---
 .../kernel/arm/opclib/winograd_transform.h    |  4 +-
 .../runtime/kernel/arm/common/pack_tests.cc   |  2 +-
 22 files changed, 207 insertions(+), 95 deletions(-)
 create mode 100644 mindspore/lite/src/runtime/kernel/arm/fp16/layout_transform_fp16.cc
 create mode 100644 mindspore/lite/src/runtime/kernel/arm/fp16/layout_transform_fp16.h

diff --git a/mindspore/lite/src/runtime/kernel/arm/base/layout_transform.cc b/mindspore/lite/src/runtime/kernel/arm/base/layout_transform.cc
index ba8a82cc13..a97c392bf5 100644
--- a/mindspore/lite/src/runtime/kernel/arm/base/layout_transform.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/layout_transform.cc
@@ -19,12 +19,6 @@
 using mindspore::schema::Format;
 
 namespace mindspore::kernel {
-#ifdef ENABLE_FP16
-LayoutConvertor LayoutTransformFp16(schema::Format src_format, schema::Format dst_format) {
-  // todo
-  return nullptr;
-}
-#endif
 LayoutConvertor LayoutTransformFp32(schema::Format src_format, schema::Format dst_format) {
   // todo
   if (src_format == schema::Format_NHWC && dst_format == schema::Format_NC4HW4) {
@@ -58,10 +52,6 @@ LayoutConvertor LayoutTransform(TypeId data_type, schema::Format src_format, sch
   switch (data_type) {
     case kNumberTypeInt8:
       return LayoutTransformInt8(src_format, dst_format);
-#ifdef ENABLE_FP16
-    case kNumberTypeFloat16:
-      return LayoutTransformFp16(src_format, dst_format);
-#endif
     case kNumberTypeFloat32:
       return LayoutTransformFp32(src_format, dst_format);
     default:
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
index 2203932115..8d6c00e210 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
@@ -18,7 +18,7 @@
 #include "src/runtime/kernel/arm/opclib/fp16/conv_fp16.h"
 #include "src/runtime/kernel/arm/opclib/fp16/winograd_transform_fp16.h"
 #include "src/runtime/kernel/arm/opclib/fp16/pack_fp16.h"
-#include "src/runtime/kernel/arm/base/layout_transform.h"
+#include "src/runtime/kernel/arm/fp16/layout_transform_fp16.h"
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
@@ -159,9 +159,11 @@ void 
Convolution3x3FP16CPUKernel::ConfigInputOutput() { auto output_tensor = outputs_.at(kOutputIndex); output_tensor->SetFormat(schema::Format_NHWC); auto input_tensor = inputs_.at(kInputIndex); - auto ret = CheckLayout(input_tensor); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Check layout failed."; + auto input_format = input_tensor->GetFormat(); + schema::Format execute_format = schema::Format_NHWC4; + convert_func_ = LayoutTransformFp16(input_format, execute_format); + if (convert_func_ == nullptr) { + MS_LOG(ERROR) << "layout convert func is nullptr."; return; } } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc index c4a775a030..2e5e8e8f14 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc @@ -18,7 +18,7 @@ #include "src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h" #include "src/runtime/kernel/arm/opclib/fp16/conv_fp16.h" #include "src/runtime/kernel/arm/opclib/fp16/pack_fp16.h" -#include "src/runtime/kernel/arm/base/layout_transform.h" +#include "src/runtime/kernel/arm/fp16/layout_transform_fp16.h" #include "schema/model_generated.h" #include "src/kernel_registry.h" #include "include/errorcode.h" @@ -130,9 +130,11 @@ int ConvolutionFP16CPUKernel::InitTmpBuffer() { void ConvolutionFP16CPUKernel::ConfigInputOutput() { auto input_tensor = inputs_.at(kInputIndex); - auto ret = CheckLayout(input_tensor); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Check layout failed."; + auto input_format = input_tensor->GetFormat(); + schema::Format execute_format = schema::Format_NHWC4; + convert_func_ = LayoutTransformFp16(input_format, execute_format); + if (convert_func_ == nullptr) { + MS_LOG(ERROR) << "layout convert func is nullptr."; return; } auto output_tensor = outputs_.at(kOutputIndex); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/layout_transform_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/layout_transform_fp16.cc new file mode 100644 index 0000000000..dc7fd5720b --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/layout_transform_fp16.cc @@ -0,0 +1,39 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#include "src/runtime/kernel/arm/fp16/layout_transform_fp16.h"
+#include "src/runtime/kernel/arm/opclib/fp16/pack_fp16.h"
+#include "schema/ops_generated.h"
+#include "mindspore/core/utils/log_adapter.h"
+
+namespace mindspore::kernel {
+LayoutConvertor LayoutTransformFp16(schema::Format src_format, schema::Format dst_format) {
+  if (src_format == schema::Format_NHWC && dst_format == schema::Format_NC4HW4) {
+    return PackNHWCToNC4HW4Fp16;
+  } else if (src_format == schema::Format_NHWC && dst_format == schema::Format_NHWC4) {
+    return PackNHWCToNHWC4Fp16;
+  } else if (src_format == schema::Format_NC4HW4 && dst_format == schema::Format_NHWC4) {
+    return PackNC4HW4ToNHWC4Fp16;
+  } else if (src_format == schema::Format_NCHW && dst_format == schema::Format_NC4HW4) {
+    return PackNCHWToNC4HW4Fp16;
+  } else if (src_format == schema::Format_NC4HW4 && dst_format == schema::Format_NHWC) {
+    return PackNC4HW4ToNHWCFp16;
+  } else {
+    MS_LOG(ERROR) << "Unsupported transform from " << schema::EnumNameFormat(src_format) << " to "
+                  << schema::EnumNameFormat(dst_format);
+    return nullptr;
+  }
+}
+} // namespace mindspore::kernel
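The dispatcher above resolves a converter once, at kernel configure time, and returns nullptr for unsupported format pairs so the caller can fail fast instead of branching per tensor. A minimal self-contained sketch of the same pattern follows; the Format values, the Convertor signature, and the two pack stubs are illustrative stand-ins, not the real LayoutConvertor typedef from base/layout_transform.h.

#include <cstdio>

// Stand-in types; the real code uses schema::Format and LayoutConvertor.
enum class Format { NHWC, NHWC4, NC4HW4 };
using Convertor = void (*)(const void *src, void *dst, int batch, int plane, int channel);

// Stub converters standing in for the PackXxxToYyyFp16 routines.
void PackNHWCToNHWC4(const void *, void *, int, int, int) { /* pack loop elided */ }
void PackNC4HW4ToNHWC4(const void *, void *, int, int, int) { /* pack loop elided */ }

// Same shape as LayoutTransformFp16: map a (src, dst) pair to a function
// pointer, nullptr when the pair is unsupported.
Convertor PickConvertor(Format src, Format dst) {
  if (src == Format::NHWC && dst == Format::NHWC4) return PackNHWCToNHWC4;
  if (src == Format::NC4HW4 && dst == Format::NHWC4) return PackNC4HW4ToNHWC4;
  return nullptr;
}

int main() {
  Convertor convert_func = PickConvertor(Format::NHWC, Format::NHWC4);
  if (convert_func == nullptr) {  // mirrors the nullptr check in ConfigInputOutput()
    std::fprintf(stderr, "layout convert func is nullptr\n");
    return 1;
  }
  // At run time: convert_func(src_buffer, dst_buffer, batch, plane, channel);
  return 0;
}

Binding the pointer once keeps the per-inference path free of format branches; RunImpl() only touches data.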
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/layout_transform_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/layout_transform_fp16.h
new file mode 100644
index 0000000000..37e11da649
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/layout_transform_fp16.h
@@ -0,0 +1,27 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_LAYOUT_TRANSFORM_FP16_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_LAYOUT_TRANSFORM_FP16_H_
+
+#include "src/runtime/kernel/arm/base/layout_transform.h"
+#include "schema/ops_generated.h"
+
+namespace mindspore::kernel {
+LayoutConvertor LayoutTransformFp16(schema::Format src_format, schema::Format dst_format);
+} // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_LAYOUT_TRANSFORM_FP16_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
index 1fcf2fece2..94aa1a6ad9 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
@@ -19,6 +19,7 @@
 #include "src/runtime/kernel/arm/fp32/convolution_3x3.h"
 #include "src/runtime/kernel/arm/fp32/convolution_winograd.h"
 #include "src/runtime/kernel/arm/opclib/fp32/conv.h"
+#include "src/runtime/kernel/arm/opclib/common_func.h"
 #include "schema/model_generated.h"
 #include "src/kernel_factory.h"
 #include "include/errorcode.h"
@@ -56,7 +57,7 @@ int ConvolutionCPUKernel::InitWeightBias() {
     return RET_ERROR;
   }
   memset(packed_weight_, 0, pack_weight_size * sizeof(float));
-  PackWeightFp32(origin_weight, conv_param_, packed_weight_);
+  PackWeightFp32(origin_weight, conv_param_, packed_weight_, oc_block, oc_block_num);
 
   // init bias
   bias_data_ = reinterpret_cast<float *>(malloc(oc_block_num * oc_block * sizeof(float)));
@@ -125,6 +126,11 @@ void ConvolutionCPUKernel::ConfigInputOutput() {
     MS_LOG(ERROR) << "Check layout failed.";
     return;
   }
+#ifdef ENABLE_ARM32
+  gemm_func_ = IndirectGemmFp32_8x4;
+#else
+  gemm_func_ = IndirectGemmFp32_8x8;
+#endif
 }
 
 int ConvolutionCPUKernel::Init() {
@@ -175,9 +181,13 @@ int ConvolutionCPUKernel::ReSize() {
 }
 
 int ConvolutionCPUKernel::RunImpl(int task_id) {
+  if (gemm_func_ == nullptr) {
+    MS_LOG(ERROR) << "gemm_func is nullptr.";
+    return RET_ERROR;
+  }
   auto output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
   ConvFp32(reinterpret_cast<float *>(nhwc4_input_), packed_input_, packed_weight_,
-           reinterpret_cast<float *>(bias_data_), tmp_output_block_, output_addr, task_id, conv_param_);
+           reinterpret_cast<float *>(bias_data_), tmp_output_block_, output_addr, task_id, conv_param_, gemm_func_);
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h
index 688b981b22..a184c7df1c 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h
@@ -21,6 +21,7 @@
 #include "src/lite_kernel.h"
 #include "src/runtime/kernel/arm/opclib/op_base.h"
 #include "src/runtime/kernel/arm/base/convolution_base.h"
+#include "src/runtime/kernel/arm/opclib/fp32/conv.h"
 
 namespace mindspore::kernel {
 class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel {
@@ -52,8 +53,8 @@ class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel {
   float *packed_input_;
   float *packed_weight_;
   float *tmp_output_block_;
+  GEMM_FUNC_FP32 gemm_func_ = nullptr;
 };
 } // namespace mindspore::kernel
 
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_H_
-
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
index 3bf8682a37..8beb70ed62 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
@@ 
-29,14 +29,13 @@
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Conv2D;
 
 namespace mindspore::kernel {
-void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param) {
+void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param, int oc_block, int oc_block_num) {
   auto input_channel = conv_param->input_channel_;
   auto output_channel = conv_param->output_channel_;
   auto kernel_plane = conv_param->kernel_w_ * conv_param->kernel_h_;
   int iC4 = UP_DIV(input_channel, C4NUM);
-  int oc8 = UP_DIV(output_channel, C8NUM);
-  size_t tmp_size = oc8 * C8NUM * iC4 * C4NUM * kernel_plane * sizeof(float);
+  size_t tmp_size = oc_block_num * oc_block * iC4 * C4NUM * kernel_plane * sizeof(float);
   auto tmp_addr = reinterpret_cast<float *>(malloc(tmp_size));
   if (tmp_addr == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_addr failed.";
@@ -45,8 +44,7 @@ void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_
   memset(tmp_addr, 0, tmp_size);
 
   PackNHWCToNC4HW4Fp32(origin_weight, tmp_addr, output_channel, kernel_plane, input_channel);
-  Conv3x3Fp32FilterTransform(tmp_addr, dst_weight, iC4, output_channel, kernel_plane);
-
+  Conv3x3Fp32FilterTransform(tmp_addr, dst_weight, iC4, output_channel, kernel_plane, oc_block);
   free(tmp_addr);
 }
 
@@ -55,10 +53,17 @@ int Convolution3x3CPUKernel::InitWeightBias() {
   auto output_channel = conv_param_->output_channel_;
   int iC4 = UP_DIV(input_channel, C4NUM);
   int oC4 = UP_DIV(output_channel, C4NUM);
-  int oC8 = UP_DIV(output_channel, C8NUM);
+  int oc_block, oc_block_num;
+#ifdef ENABLE_ARM32
+  oc_block = C4NUM;
+  oc_block_num = UP_DIV(output_channel, C4NUM);
+#else
+  oc_block = C8NUM;
+  oc_block_num = UP_DIV(output_channel, C8NUM);
+#endif
   int k_plane = 16;
   // init weight
-  size_t transformed_size = iC4 * C4NUM * oC8 * C8NUM * k_plane * sizeof(float);
+  size_t transformed_size = iC4 * C4NUM * oc_block_num * oc_block * k_plane * sizeof(float);
   transformed_filter_addr_ = reinterpret_cast<float *>(malloc(transformed_size));
   if (transformed_filter_addr_ == nullptr) {
     MS_LOG(ERROR) << "malloc transformed filter addr failed.";
@@ -66,7 +71,7 @@ int Convolution3x3CPUKernel::InitWeightBias() {
   }
   memset(transformed_filter_addr_, 0, transformed_size);
   auto weight_data = reinterpret_cast<float *>(inputs_.at(kWeightIndex)->Data());
-  ProcessFilter(weight_data, transformed_filter_addr_, conv_param_);
+  ProcessFilter(weight_data, transformed_filter_addr_, conv_param_, oc_block, oc_block_num);
 
   // init bias
   size_t new_bias_size = oC4 * C4NUM * sizeof(float);
@@ -89,7 +94,6 @@ int Convolution3x3CPUKernel::InitTmpBuffer() {
   int iC4 = UP_DIV(conv_param_->input_channel_, C4NUM);
   int oC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
   int k_plane = 16;
-  // todo
   size_t tile_buffer_size = thread_count_ * TILE_NUM * k_plane * iC4 * C4NUM * sizeof(float);
   tile_buffer_ = reinterpret_cast<float *>(malloc(tile_buffer_size));
   if (tile_buffer_ == nullptr) {
@@ -148,6 +152,11 @@ void Convolution3x3CPUKernel::ConfigInputOutput() {
     MS_LOG(ERROR) << "Check layout failed.";
     return;
   }
+#ifdef ENABLE_ARM32
+  gemm_func_ = IndirectGemmFp32_8x4;
+#else
+  gemm_func_ = IndirectGemmFp32_8x8;
+#endif
 }
 
 int Convolution3x3CPUKernel::Init() {
@@ -201,9 +210,13 @@ int Convolution3x3CPUKernel::ReSize() {
 }
 
 int Convolution3x3CPUKernel::RunImpl(int task_id) {
+  if (gemm_func_ == nullptr) {
+    MS_LOG(ERROR) << "gemm_func is nullptr.";
+    return RET_ERROR;
+  }
   auto output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
   Conv3x3Fp32(reinterpret_cast<float *>(nhwc4_input_), 
transformed_filter_addr_, reinterpret_cast<float *>(bias_data_),
-              output_addr, tmp_buffer_address_list_, task_id, conv_param_);
+              output_addr, tmp_buffer_address_list_, task_id, conv_param_, gemm_func_);
   return RET_OK;
 }
 
@@ -234,4 +247,3 @@ int Convolution3x3CPUKernel::Run() {
   return RET_OK;
 }
 } // namespace mindspore::kernel
-
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h
index 9d9880ea58..4b70909d86 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h
@@ -19,7 +19,6 @@
 
 #include <vector>
 #include "src/lite_kernel.h"
-
 #include "src/runtime/kernel/arm/base/convolution_base.h"
 #include "src/runtime/kernel/arm/opclib/winograd_transform.h"
 
@@ -62,9 +61,9 @@ class Convolution3x3CPUKernel : public ConvolutionBaseCPUKernel {
   float *tmp_dst_buffer_;
   float *nc4hw4_out_;
   TmpBufferAddress tmp_buffer_address_list_[4];
+  GEMM_FUNC_FP32 gemm_func_ = nullptr;
 };
-void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param);
+void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param, int oc_block, int oc_block_num);
 } // namespace mindspore::kernel
 
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_3X3_H_
-
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
index 3a021d2b66..f1cf72d85d 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
@@ -29,7 +29,7 @@
 using mindspore::schema::PrimitiveType_Conv2D;
 
 namespace mindspore::kernel {
 void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit,
-                             ConvParameter *conv_param) {
+                             ConvParameter *conv_param, int oc_block) {
   // original weight format : ohwi
   auto channel_in = conv_param->input_channel_;
   auto channel_out = conv_param->output_channel_;
@@ -53,10 +53,10 @@ void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int
   int kernel_plane_stride = channel_in;
 
   for (int i = 0; i < channel_out; i++) {
-    int oc8_block = i / C8NUM;
-    int oc8_res = i % C8NUM;
+    int out_c_block = i / oc_block;
+    int out_c_res = i % oc_block;
     int input_oz_offset = i * kernel_unit * kernel_unit * channel_in;
-    int output_oz_offset = oc8_block * strides[1] * input_unit * input_unit + oc8_res;
+    int output_oz_offset = out_c_block * strides[1] * input_unit * input_unit + out_c_res;
     for (int j = 0; j < channel_in; j++) {
       int ic4_block = j / C4NUM;
       int ic4_res = j % C4NUM;
@@ -88,16 +88,24 @@ void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int
 int ConvolutionWinogradCPUKernel::InitWeightBias() {
   int output_channel = conv_param_->output_channel_;
   int oc4 = UP_DIV(output_channel, C4NUM);
+  int oc_block, oc_block_num;
+#ifdef ENABLE_ARM32
+  oc_block = C4NUM;
+  oc_block_num = UP_DIV(output_channel, C4NUM);
+#else
+  oc_block = C8NUM;
+  oc_block_num = UP_DIV(output_channel, C8NUM);
+#endif
   // init weight
-  auto ret = MallocFilterMatrix();
+  auto ret = MallocFilterMatrix(oc_block, oc_block_num);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Malloc filter matrix failed.";
     return RET_ERROR;
   }
   auto weight_tensor = inputs_.at(kWeightIndex);
   auto weight_data = reinterpret_cast<float *>(weight_tensor->Data());
-  WinogradFilterTransform(weight_data, trans_weight_, kernel_unit_, input_unit_, 
conv_param_);
+  WinogradFilterTransform(weight_data, trans_weight_, kernel_unit_, input_unit_, conv_param_, oc_block);
 
   // init bias
   size_t new_bias_size = oc4 * C4NUM * sizeof(float);
@@ -112,14 +120,12 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
   return RET_OK;
 }
 
-int ConvolutionWinogradCPUKernel::MallocFilterMatrix() {
+int ConvolutionWinogradCPUKernel::MallocFilterMatrix(int oc_block, int oc_block_num) {
   int channel_in = conv_param_->input_channel_;
-  int channel_out = conv_param_->output_channel_;
   int ic4 = UP_DIV(channel_in, BLOCK);
-  int oc8 = UP_DIV(channel_out, C8NUM);
 
   // set data
-  auto trans_matrix_data_size = input_unit_ * input_unit_ * ic4 * oc8 * C4NUM * C8NUM * sizeof(float);
+  auto trans_matrix_data_size = input_unit_ * input_unit_ * ic4 * C4NUM * oc_block_num * oc_block * sizeof(float);
   auto matrix_buffer = malloc(trans_matrix_data_size);
   if (matrix_buffer == nullptr) {
     MS_LOG(ERROR) << "malloc matrix_buffer failed.";
@@ -134,10 +140,10 @@
   std::vector<int> strides;
   // set shape
   shapes.push_back(input_unit_ * input_unit_);
-  shapes.push_back(oc8);
+  shapes.push_back(oc_block_num);
   shapes.push_back(ic4);
   shapes.push_back(C4NUM);
-  shapes.push_back(C8NUM);
+  shapes.push_back(oc_block);
   // set stride
   for (int i = 0; i < 4; i++) {
     int stride = 1;
@@ -227,6 +233,11 @@ int ConvolutionWinogradCPUKernel::ConfigInputOutput() {
     MS_LOG(ERROR) << "Get output_trans_func_ failed.";
     return RET_ERROR;
   }
+#ifdef ENABLE_ARM32
+  gemm_func_ = IndirectGemmFp32_8x4;
+#else
+  gemm_func_ = IndirectGemmFp32_8x8;
+#endif
   return RET_OK;
 }
 
@@ -301,10 +312,14 @@ int ConvolutionWinogradCPUKernel::ReSize() {
 }
 
 int ConvolutionWinogradCPUKernel::RunImpl(int task_id) {
+  if (gemm_func_ == nullptr) {
+    MS_LOG(ERROR) << "gemm_func is nullptr.";
+    return RET_ERROR;
+  }
   auto output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
   ConvWinogardFp32(reinterpret_cast<float *>(nhwc4_input_), reinterpret_cast<float *>(trans_weight_->GetData()),
                    reinterpret_cast<float *>(bias_data_), output_addr, tmp_buffer_address_list_, task_id,
-                   conv_param_, input_trans_func_, output_trans_func_);
+                   conv_param_, input_trans_func_, output_trans_func_, gemm_func_);
   return RET_OK;
 }
 
@@ -335,4 +350,3 @@ int ConvolutionWinogradCPUKernel::Run() {
   return RET_OK;
 }
 } // namespace mindspore::kernel
-
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h
index 6518db604d..1fe90cf032 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h
@@ -50,7 +50,7 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
   int Run() override;
   int RunImpl(int task_id);
   int InitWeightBias();
-  int MallocFilterMatrix();
+  int MallocFilterMatrix(int oc_block, int oc_block_num);
   int InitTmpBuffer();
   int ConfigInputOutput();
 
@@ -66,9 +66,9 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
   InputTransformUnitFunc input_trans_func_;
   OutputTransformUnitFunc output_trans_func_;
   TmpBufferAddress tmp_buffer_address_list_[5];
+  GEMM_FUNC_FP32 gemm_func_ = nullptr;
 };
 void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit,
-                             ConvParameter *conv_param);
+                             ConvParameter *conv_param, int oc_block);
 } // namespace mindspore::kernel
 
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_WINOGRAD_H_
-
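All three fp32 kernels above now share the same wiring: a GEMM_FUNC_FP32 member is bound once in ConfigInputOutput(), either to IndirectGemmFp32_8x4 under ENABLE_ARM32 (4-wide output blocking) or to IndirectGemmFp32_8x8 otherwise, and RunImpl() null-checks it before use. A compilable sketch of that flow follows; the typedef matches the one added to opclib/fp32/conv.h, while the two Gemm stubs merely stand in for the real kernels.

#include <cstddef>
#include <cstdio>
using std::size_t;

// Same shape as the GEMM_FUNC_FP32 typedef introduced in opclib/fp32/conv.h.
typedef void (*GEMM_FUNC_FP32)(float *output, const float *input, const float *weight, const float *bias,
                               size_t step, size_t ic4, size_t output_channel, size_t offset, size_t mode,
                               size_t writeC4, size_t relu, size_t relu6);

// Stubs standing in for IndirectGemmFp32_8x8 / IndirectGemmFp32_8x4.
void Gemm8x8(float *, const float *, const float *, const float *, size_t, size_t, size_t, size_t, size_t,
             size_t, size_t, size_t) { std::puts("8x8 path"); }
void Gemm8x4(float *, const float *, const float *, const float *, size_t, size_t, size_t, size_t, size_t,
             size_t, size_t, size_t) { std::puts("8x4 path"); }

int main() {
  GEMM_FUNC_FP32 gemm_func = nullptr;
#ifdef ENABLE_ARM32
  gemm_func = Gemm8x4;  // 4-wide output-channel blocking on armv7
#else
  gemm_func = Gemm8x8;  // 8-wide output-channel blocking elsewhere
#endif
  if (gemm_func == nullptr) {  // mirrors the guard added to each RunImpl()
    std::fprintf(stderr, "gemm_func is nullptr\n");
    return 1;
  }
  gemm_func(nullptr, nullptr, nullptr, nullptr, 0, 0, 0, 0, 0, 0, 0, 0);
  return 0;
}

Passing the pointer down into ConvFp32/Conv3x3Fp32/ConvWinogardFp32 keeps opclib free of per-architecture #ifdefs at the call sites; the next files wire this parameter through.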
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/common_func.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/common_func.cc
index ebc32b780b..9e6f9edd4e 100644
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/common_func.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/common_func.cc
@@ -17,7 +17,7 @@
 #include "src/runtime/kernel/arm/opclib/common_func.h"
 #include "src/runtime/kernel/arm/opclib/quantization/fixed_point.h"
 
-#ifndef __aarch64__
+#ifndef ENABLE_ARM64
 void IndirectGemmFp32(float *output, const float *input, const float *weight, const float *bias, size_t step, int ic4,
                       int output_channel, size_t offset, size_t relu, size_t relu6) {
   for (int i = 0; i < TILE_NUM; i++) {
@@ -102,6 +102,11 @@ void IndirectGemmFp32_8x8(float *output, const float *input, const float *weight
   }
 }
 #endif
+#ifndef ENABLE_ARM32
+void IndirectGemmFp32_8x4(float *output, const float *input, const float *weight, const float *bias, size_t step,
+                          size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4, size_t relu,
+                          size_t relu6) {}  // empty stub so non-ARM32 builds still link; ARM32 supplies the real kernel
+#endif
 
 int8_t MinInt8(int8_t a, int8_t b) { return b ^ ((a ^ b) & -(a < b)); }
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/common_func.h b/mindspore/lite/src/runtime/kernel/arm/opclib/common_func.h
index 895dbf0081..e8c3f587f0 100644
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/common_func.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/common_func.h
@@ -36,6 +36,9 @@ void PostFuncInt8(const int *in, const int *bias, int8_t *out, int oc, int plane
 void IndirectGemmFp32_8x8(float *output, const float *input, const float *weight, const float *bias, size_t step,
                           size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4, size_t relu,
                           size_t relu6);
+void IndirectGemmFp32_8x4(float *output, const float *input, const float *weight, const float *bias, size_t step,
+                          size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4, size_t relu,
+                          size_t relu6);
 void IndirectGemmFp32_Comm(float *output, const float *input, const float *weight, size_t ic4, size_t hw, size_t oc,
                            size_t offset);
 void IndirectGemmFp32(float *output, const float *input, const float *weight, const float *bias, size_t step, int ic4,
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv.cc
index 66fe33c862..023faa9521 100644
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv.cc
@@ -20,7 +20,8 @@
 
 // fp32 conv common
 void ConvFp32(float *input_data, float *packed_input, float *packed_weight, const float *bias_data,
-              float *tmp_out_block, float *output_data, int task_id, ConvParameter *conv_param) {
+              float *tmp_out_block, float *output_data, int task_id, ConvParameter *conv_param,
+              GEMM_FUNC_FP32 gemm_func) {
   int kernel_h = conv_param->kernel_h_;
   int kernel_w = conv_param->kernel_w_;
   int in_batch = conv_param->input_batch_;
@@ -57,12 +58,12 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, cons
       int out_offset = thread_id * TILE_NUM * out_channel + out_batch_offset;
       if (real_cal_num == TILE_NUM) {
         float *gemm_output = output_data + out_offset;
-        IndirectGemmFp32_8x8(gemm_output, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel,
-                             output_offset, 0, 0, conv_param->is_relu_, conv_param->is_relu6_);
+        gemm_func(gemm_output, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel, output_offset, 0, 0,
+                  conv_param->is_relu_, 
conv_param->is_relu6_); } else { // res part - IndirectGemmFp32_8x8(tmp_out_block, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel, - output_offset, 0, 0, conv_param->is_relu_, conv_param->is_relu6_); + gemm_func(tmp_out_block, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel, output_offset, 0, + 0, conv_param->is_relu_, conv_param->is_relu6_); memcpy(output_data + out_offset, tmp_out_block, real_cal_num * out_channel * sizeof(float)); } } @@ -78,7 +79,8 @@ int Conv1x1Fp32(const float *input_data, const float *weight_data, float *output // fp32 conv winograd void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_data, float *output_data, TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, - InputTransformUnitFunc input_trans_func, OutputTransformUnitFunc output_trans_func) { + InputTransformUnitFunc input_trans_func, OutputTransformUnitFunc output_trans_func, + GEMM_FUNC_FP32 gemm_func) { int thread_num = conv_param->thread_num_; int input_unit = conv_param->input_unit_; int in_batch = conv_param->input_batch_; @@ -111,8 +113,8 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_ WinogradInputTransform(input_data, trans_input, tmp_data, cal_num, out_tile_index, out_w_block, conv_param, input_trans_func); // step 3 : gemm - IndirectGemmFp32_8x8(gemm_out, trans_input, trans_weight, nullptr, input_unit_square, ic4, oc4 * C4NUM, - output_offset, 1, 1, 0, 0); + gemm_func(gemm_out, trans_input, trans_weight, nullptr, input_unit_square, ic4, oc4 * C4NUM, output_offset, 1, 1, + 0, 0); // step 4 : output transform WinogradOutputTransform(gemm_out, tmp_out_data, bias_data, cal_num, out_tile_index, out_w_block, conv_param, @@ -173,7 +175,7 @@ void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, i // fp32 conv3x3 void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, float *output_data, - TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param) { + TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func) { int thread_count = conv_param->thread_num_; int ic4 = UP_DIV(conv_param->input_channel_, C4NUM); int output_channel = conv_param->output_channel_; @@ -198,8 +200,8 @@ void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_dat Conv3x3Fp32InputTransform(input_data, tile_buffer, block_unit_buffer, start_index, real_cal_num, out_w_block, conv_param); - IndirectGemmFp32_8x8(tmp_dst_buffer, tile_buffer, transed_weight, nullptr, input_unit_square, ic4, oc4 * C4NUM, - oc4 * C4NUM * input_unit_square * sizeof(float), 1, 1, 0, 0); + gemm_func(tmp_dst_buffer, tile_buffer, transed_weight, nullptr, input_unit_square, ic4, oc4 * C4NUM, + oc4 * C4NUM * input_unit_square * sizeof(float), 1, 1, 0, 0); Conv3x3Fp32OutputTransform(tmp_dst_buffer, nc4hw4_out, bias_data, start_index, real_cal_num, out_w_block, conv_param); diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv.h b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv.h index 07b316587f..3a4a97791e 100644 --- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv.h +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv.h @@ -28,10 +28,14 @@ #include "src/runtime/kernel/arm/opclib/winograd_utils.h" using TmpBufferAddress = float *; +typedef void (*GEMM_FUNC_FP32)(float *output, const float *input, const float *weight, const float *bias, size_t step, + size_t ic4, size_t output_channel, size_t 
offset, size_t mode, size_t writeC4, + size_t relu, size_t relu6); // fp32 convolution common (im2col+gemm) void ConvFp32(float *input_data, float *packed_input, float *packed_weight, const float *bias_data, - float *tmp_out_block, float *output_data, int task_id, ConvParameter *conv_param); + float *tmp_out_block, float *output_data, int task_id, ConvParameter *conv_param, + GEMM_FUNC_FP32 gemm_func); // fp32 conv1x1 strassen matmul int Conv1x1Fp32(const float *input_data, const float *weight_data, float *output_data, float *tmp_ptr, @@ -40,12 +44,13 @@ int Conv1x1Fp32(const float *input_data, const float *weight_data, float *output // fp32 convolution winograd void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_data, float *output_data, TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, - InputTransformUnitFunc input_trans_func, OutputTransformUnitFunc output_trans_func); + InputTransformUnitFunc input_trans_func, OutputTransformUnitFunc output_trans_func, + GEMM_FUNC_FP32 gemm_func); void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, int width, int channel, int output_unit); // fp32 conv3x3 void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, float *output_data, - TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param); + TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func); #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_CONV_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/int8/conv_int8.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/conv_int8.cc index fe7d9f0b9c..2c460e0713 100644 --- a/mindspore/lite/src/runtime/kernel/arm/opclib/int8/conv_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/conv_int8.cc @@ -49,7 +49,9 @@ void IndirectGemmInt8(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const in #ifdef __aarch64__ IndirectGemmInt8_4x4(dst, src, weight, bias, kernel_plane, ic4, output_channel, output_channel * sizeof(int8_t), input_sum, act_min, act_max, out_zp, out_multiplier, shift_before, shift_after); - // todo arm32 +#elif defined(ENABLE_ARM32) + IndirectGemmInt8_2x4(dst, src, weight, bias, kernel_plane, ic4, output_channel, output_channel * sizeof(int8_t), + input_sum, act_min, act_max, out_zp, out_multiplier, shift_before, shift_after); #else int tile_num = conv_param->tile_num_; int plane_c4 = UP_DIV(kernel_plane, C4NUM); diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/optimized_kernel.h b/mindspore/lite/src/runtime/kernel/arm/opclib/optimized_kernel.h index b7a051d5e9..150369b110 100644 --- a/mindspore/lite/src/runtime/kernel/arm/opclib/optimized_kernel.h +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/optimized_kernel.h @@ -58,10 +58,10 @@ class OptimizeModule { if ((!support_optimize_ops) && (!support_fp16)) { return; } -// optimized_op_handler_ = dlopen(OPTIMIZE_SHARED_LIBRARY_PATH, RTLD_LAZY); -// if (optimized_op_handler_ == nullptr) { -// printf("Open optimize shared library failed.\n"); -// } + optimized_op_handler_ = dlopen(OPTIMIZE_SHARED_LIBRARY_PATH, RTLD_LAZY); + if (optimized_op_handler_ == nullptr) { + printf("Open optimize shared library failed.\n"); + } } ~OptimizeModule() = default; diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/pack.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/pack.cc index bffcb84616..c5b909c75c 100644 --- a/mindspore/lite/src/runtime/kernel/arm/opclib/pack.cc +++ 
b/mindspore/lite/src/runtime/kernel/arm/opclib/pack.cc
@@ -18,20 +18,19 @@
 #include <string.h>
 #include <stdlib.h>
 
-void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight) {
+void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight, int oc_block,
+                    int oc_block_num) {
   // original weight format : ohwi
-  // todo pack weight for arm32 platform
   int kernel_h = conv_param->kernel_h_;
   int kernel_w = conv_param->kernel_w_;
   int in_channel = conv_param->input_channel_;
   int out_channel = conv_param->output_channel_;
-  int oc8 = UP_DIV(out_channel, C8NUM);
   int ic4 = UP_DIV(in_channel, C4NUM);
   int kernel_plane = kernel_h * kernel_w;
-  int pack_weight_size = oc8 * C8NUM * ic4 * C4NUM * kernel_plane;
+  int pack_weight_size = oc_block * oc_block_num * ic4 * C4NUM * kernel_plane;
 
-  int unit_size = C8NUM * C4NUM;
-  int block_size = pack_weight_size / oc8;
+  int unit_size = oc_block * C4NUM;
+  int block_size = pack_weight_size / oc_block_num;
 
   for (int m = 0; m < kernel_plane; m++) {
     int kernel_plane_stride = m * in_channel;
@@ -43,12 +42,12 @@ void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed
       int real_ic_num = ic_remainder < C4NUM ? ic_remainder : C4NUM;
       for (int h = 0; h < real_ic_num; h++) {
         int block_stride = channel_block_stride + h;
-        int packed_block_stride = packed_channel_block_size + h * C8NUM;
-        for (int j = 0; j < oc8; j++) {
-          int kernel_block_stride = block_stride + j * C8NUM * kernel_plane * in_channel;
+        int packed_block_stride = packed_channel_block_size + h * oc_block;
+        for (int j = 0; j < oc_block_num; j++) {
+          int kernel_block_stride = block_stride + j * oc_block * kernel_plane * in_channel;
           int packed_kernel_block_size = packed_block_stride + j * block_size;
-          int oc_remainder = out_channel - j * C8NUM;
-          int real_oc_num = oc_remainder < C8NUM ? oc_remainder : C8NUM;
+          int oc_remainder = out_channel - j * oc_block;
+          int real_oc_num = oc_remainder < oc_block ? 
oc_remainder : oc_block; for (int k = 0; k < real_oc_num; k++) { float *origin_data_ptr = weight_data + kernel_block_stride + k * kernel_plane * in_channel; float *packed_data_ptr = packed_weight + packed_kernel_block_size + k; diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/pack.h b/mindspore/lite/src/runtime/kernel/arm/opclib/pack.h index c486b75e38..8930550c93 100644 --- a/mindspore/lite/src/runtime/kernel/arm/opclib/pack.h +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/pack.h @@ -40,7 +40,8 @@ void MatrixPack(const float *src, float *dst, int row, int ic4, int stride); void PackInputToC8Int8(const int8_t *input_data, int16_t *packed_input, ConvParameter *conv_param); -void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight); +void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight, int oc_block, + int oc_block_num); void PackWeightInt8(int8_t *weight_data, ConvParameter *conv_param, int8_t *packed_weight, int32_t *weight_sum); diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/winograd_transform.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/winograd_transform.cc index 571501b142..697e2f04f7 100644 --- a/mindspore/lite/src/runtime/kernel/arm/opclib/winograd_transform.cc +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/winograd_transform.cc @@ -326,18 +326,18 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa } } -void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel, - int kernel_plane) { +void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel, int kernel_plane, + int oc_block) { int input_unit = 4; - int dst_step = iC4 * C4NUM * C8NUM; + int dst_step = iC4 * C4NUM * oc_block; for (int o = 0; o < output_channel; o++) { - int oc8_block_num = o / C8NUM; - int oc8_block_rem = o % C8NUM; + int oc_block_num = o / oc_block; + int oc_block_rem = o % oc_block; int src_oc_offset = o * iC4 * C4NUM * kernel_plane; - int dst_oc_offset = oc8_block_num * C8NUM * iC4 * C4NUM * input_unit * input_unit + oc8_block_rem; + int dst_oc_offset = oc_block_num * oc_block * iC4 * C4NUM * input_unit * input_unit + oc_block_rem; for (int i = 0; i < iC4; i++) { float *src_ic4_ptr = weight_data + src_oc_offset + i * kernel_plane * C4NUM; - float *dst_ic4_ptr = trans_weight + dst_oc_offset + i * C8NUM * C4NUM; + float *dst_ic4_ptr = trans_weight + dst_oc_offset + i * oc_block * C4NUM; #ifdef ENABLE_ARM float32x4_t g00 = vld1q_f32(src_ic4_ptr); float32x4_t g01 = vld1q_f32(src_ic4_ptr + 4); @@ -1368,4 +1368,3 @@ void Conv3x3Uint8OutputTransform(const int32_t *gemm_out, int8_t *out_data, cons } } } - diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/winograd_transform.h b/mindspore/lite/src/runtime/kernel/arm/opclib/winograd_transform.h index 42e8f4a366..d251f4859c 100644 --- a/mindspore/lite/src/runtime/kernel/arm/opclib/winograd_transform.h +++ b/mindspore/lite/src/runtime/kernel/arm/opclib/winograd_transform.h @@ -43,7 +43,8 @@ void Conv3x3Fp32InputUnit(const float *tmp_data, float *trans_input_data, size_t void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, float *tmp_data, int start_index, int real_cal_num, int out_w_block, ConvParameter *conv_param); -void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel, int kernel_plane); +void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int 
output_channel, int kernel_plane,
+                                int oc_block);
 
 void Conv3x3Fp32OutputUnit(const float *gemm_out, const float *bias_data, float *output_data, bool h_not_bound,
                            bool w_not_bound, int output_w);
@@ -67,4 +68,3 @@ void Conv3x3Uint8OutputTransform(const int32_t *gemm_out, int8_t *out_data, cons
                                  int real_cal_num, int out_w_block, ConvParameter *conv_param);
 
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_WINOGRAD_TRANSFORM_H_
-
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/common/pack_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/common/pack_tests.cc
index 3a4632809f..c184f7cd02 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/common/pack_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/common/pack_tests.cc
@@ -122,7 +122,7 @@ TEST_F(TestPack, PackWeightFp32) {
   std::string weight_path = "./test_data/conv/convfp32_weight_32_3_3_3.bin";
   auto weight_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(weight_path.c_str(), &weight_size));
   auto packed_weight = reinterpret_cast<float *>(malloc(k_h * k_w * ic4 * C4NUM * oc8 * C8NUM * sizeof(float)));
-  PackWeightFp32(weight_data, conv_param, packed_weight);
+  PackWeightFp32(weight_data, conv_param, packed_weight, C8NUM, oc8);
 
   printf("==================output data=================\n");
   for (int i = 0; i < 20; i++) {
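A closing note on the oc_block arithmetic this patch threads through PackWeightFp32, ProcessFilter, and the Winograd path: output channels are rounded up to a whole number of oc_block-wide groups (C8NUM = 8 by default, C4NUM = 4 under ENABLE_ARM32) and all buffer sizes use the padded counts. The self-check below uses the same shapes as the pack test above (32 output channels, 3x3 kernel, 3 input channels) and assumes opclib's UP_DIV is ordinary ceiling division, (x + y - 1) / y.

#include <cstdio>

// Assumed to match opclib's UP_DIV: ceiling division.
#define UP_DIV(x, y) (((x) + (y) - 1) / (y))

int main() {
  const int C4NUM = 4, C8NUM = 8;
  int in_channel = 3, out_channel = 32, kernel_plane = 3 * 3;

  int ic4 = UP_DIV(in_channel, C4NUM);  // 1 group of 4 input channels
#ifdef ENABLE_ARM32
  int oc_block = C4NUM, oc_block_num = UP_DIV(out_channel, C4NUM);  // 8 groups of 4
#else
  int oc_block = C8NUM, oc_block_num = UP_DIV(out_channel, C8NUM);  // 4 groups of 8
#endif

  // Same formula as PackWeightFp32's pack_weight_size (element count, not bytes).
  int pack_weight_size = oc_block * oc_block_num * ic4 * C4NUM * kernel_plane;
  std::printf("ic4=%d oc_block=%d oc_block_num=%d pack_weight_size=%d\n",
              ic4, oc_block, oc_block_num, pack_weight_size);
  // Default build: ic4=1, oc_block=8, oc_block_num=4,
  // pack_weight_size = 8 * 4 * 1 * 4 * 9 = 1152 floats, matching the
  // k_h * k_w * ic4 * C4NUM * oc8 * C8NUM malloc in the pack test.
  return 0;
}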