Merge pull request !3864 from fuzhiye/mindspore
@@ -19,12 +19,6 @@
 using mindspore::schema::Format;

 namespace mindspore::kernel {
-#ifdef ENABLE_FP16
-LayoutConvertor LayoutTransformFp16(schema::Format src_format, schema::Format dst_format) {
-  // todo
-  return nullptr;
-}
-#endif
 LayoutConvertor LayoutTransformFp32(schema::Format src_format, schema::Format dst_format) {
   // todo
   if (src_format == schema::Format_NHWC && dst_format == schema::Format_NC4HW4) {
@@ -58,10 +52,6 @@ LayoutConvertor LayoutTransform(TypeId data_type, schema::Format src_format, schema::Format dst_format) {
   switch (data_type) {
     case kNumberTypeInt8:
      return LayoutTransformInt8(src_format, dst_format);
-#ifdef ENABLE_FP16
-    case kNumberTypeFloat16:
-      return LayoutTransformFp16(src_format, dst_format);
-#endif
     case kNumberTypeFloat32:
      return LayoutTransformFp32(src_format, dst_format);
    default:
@@ -18,7 +18,7 @@
 #include "src/runtime/kernel/arm/opclib/fp16/conv_fp16.h"
 #include "src/runtime/kernel/arm/opclib/fp16/winograd_transform_fp16.h"
 #include "src/runtime/kernel/arm/opclib/fp16/pack_fp16.h"
-#include "src/runtime/kernel/arm/base/layout_transform.h"
+#include "src/runtime/kernel/arm/fp16/layout_transform_fp16.h"
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
@@ -159,9 +159,11 @@ void Convolution3x3FP16CPUKernel::ConfigInputOutput() {
   auto output_tensor = outputs_.at(kOutputIndex);
   output_tensor->SetFormat(schema::Format_NHWC);
   auto input_tensor = inputs_.at(kInputIndex);
-  auto ret = CheckLayout(input_tensor);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Check layout failed.";
+  auto input_format = input_tensor->GetFormat();
+  schema::Format execute_format = schema::Format_NHWC4;
+  convert_func_ = LayoutTransformFp16(input_format, execute_format);
+  if (convert_func_ == nullptr) {
+    MS_LOG(ERROR) << "layout convert func is nullptr.";
     return;
   }
 }
@@ -18,7 +18,7 @@
 #include "src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h"
 #include "src/runtime/kernel/arm/opclib/fp16/conv_fp16.h"
 #include "src/runtime/kernel/arm/opclib/fp16/pack_fp16.h"
-#include "src/runtime/kernel/arm/base/layout_transform.h"
+#include "src/runtime/kernel/arm/fp16/layout_transform_fp16.h"
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
@@ -130,9 +130,11 @@ int ConvolutionFP16CPUKernel::InitTmpBuffer() {

 void ConvolutionFP16CPUKernel::ConfigInputOutput() {
   auto input_tensor = inputs_.at(kInputIndex);
-  auto ret = CheckLayout(input_tensor);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Check layout failed.";
+  auto input_format = input_tensor->GetFormat();
+  schema::Format execute_format = schema::Format_NHWC4;
+  convert_func_ = LayoutTransformFp16(input_format, execute_format);
+  if (convert_func_ == nullptr) {
+    MS_LOG(ERROR) << "layout convert func is nullptr.";
     return;
   }
   auto output_tensor = outputs_.at(kOutputIndex);
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/arm/fp16/layout_transform_fp16.h"
+#include "src/runtime/kernel/arm/opclib/fp16/pack_fp16.h"
+#include "schema/ops_generated.h"
+#include "mindspore/core/utils/log_adapter.h"
+namespace mindspore::kernel {
+LayoutConvertor LayoutTransformFp16(schema::Format src_format, schema::Format dst_format) {
+  if (src_format == schema::Format_NHWC && dst_format == schema::Format_NC4HW4) {
+    return PackNHWCToNC4HW4Fp16;
+  } else if (src_format == schema::Format_NHWC && dst_format == schema::Format_NHWC4) {
+    return PackNHWCToNHWC4Fp16;
+  } else if (src_format == schema::Format_NC4HW4 && dst_format == schema::Format_NHWC4) {
+    return PackNC4HW4ToNHWC4Fp16;
+  } else if (src_format == schema::Format_NCHW && dst_format == schema::Format_NC4HW4) {
+    return PackNCHWToNC4HW4Fp16;
+  } else if (src_format == schema::Format_NC4HW4 && dst_format == schema::Format_NHWC) {
+    return PackNC4HW4ToNHWCFp16;
+  } else {
+    MS_LOG(ERROR) << "Unsupported transform from " << schema::EnumNameFormat(src_format) << " to "
+                  << schema::EnumNameFormat(dst_format);
+    return nullptr;
+  }
+}
+} // namespace mindspore::kernel
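For orientation: the new `LayoutTransformFp16` above is a pure table lookup that maps a (source, destination) format pair to an fp16 packing routine and returns nullptr for unsupported pairs, which the convolution kernels treat as a configuration error. A minimal usage sketch follows; the `LayoutConvertor` signature below is an assumption for illustration only (the real typedef lives in base/layout_transform.h, which this diff does not touch):

```cpp
#include <cstdint>

// Assumed shape of LayoutConvertor (hypothetical; see base/layout_transform.h).
using LayoutConvertor = void (*)(const void *src, void *dst, int batch, int plane, int channel);

// Mirrors the kernel-side pattern: fetch the convertor once, treat nullptr as
// "unsupported format pair", then repack the fp16 input before execution.
bool RepackFp16Input(const uint16_t *nhwc, uint16_t *nhwc4, int batch, int plane, int channel,
                     LayoutConvertor convert_func) {
  if (convert_func == nullptr) {
    return false;  // same check the ConfigInputOutput hunks above perform
  }
  convert_func(nhwc, nhwc4, batch, plane, channel);
  return true;
}
```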
@@ -0,0 +1,27 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_LAYOUT_TRANSFORM_FP16_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_LAYOUT_TRANSFORM_FP16_H_
+
+#include "src/runtime/kernel/arm/base/layout_transform.h"
+#include "schema/ops_generated.h"
+
+namespace mindspore::kernel {
+LayoutConvertor LayoutTransformFp16(schema::Format src_format, schema::Format dst_format);
+} // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_LAYOUT_TRANSFORM_FP16_H_
@@ -19,6 +19,7 @@
 #include "src/runtime/kernel/arm/fp32/convolution_3x3.h"
 #include "src/runtime/kernel/arm/fp32/convolution_winograd.h"
 #include "src/runtime/kernel/arm/opclib/fp32/conv.h"
+#include "src/runtime/kernel/arm/opclib/common_func.h"
 #include "schema/model_generated.h"
 #include "src/kernel_factory.h"
 #include "include/errorcode.h"
@@ -56,7 +57,7 @@ int ConvolutionCPUKernel::InitWeightBias() {
     return RET_ERROR;
   }
   memset(packed_weight_, 0, pack_weight_size * sizeof(float));
-  PackWeightFp32(origin_weight, conv_param_, packed_weight_);
+  PackWeightFp32(origin_weight, conv_param_, packed_weight_, oc_block, oc_block_num);

   // init bias
   bias_data_ = reinterpret_cast<float *>(malloc(oc_block_num * oc_block * sizeof(float)));
@@ -125,6 +126,11 @@ void ConvolutionCPUKernel::ConfigInputOutput() {
     MS_LOG(ERROR) << "Check layout failed.";
     return;
   }
+#ifdef ENABLE_ARM32
+  gemm_func_ = IndirectGemmFp32_8x4;
+#else
+  gemm_func_ = IndirectGemmFp32_8x8;
+#endif
 }

 int ConvolutionCPUKernel::Init() {
@@ -175,9 +181,13 @@ int ConvolutionCPUKernel::ReSize() {
 }

 int ConvolutionCPUKernel::RunImpl(int task_id) {
+  if (gemm_func_ == nullptr) {
+    MS_LOG(ERROR) << "gemm_func is nullptr.";
+    return RET_ERROR;
+  }
   auto output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
   ConvFp32(reinterpret_cast<float *>(nhwc4_input_), packed_input_, packed_weight_,
-           reinterpret_cast<float *>(bias_data_), tmp_output_block_, output_addr, task_id, conv_param_);
+           reinterpret_cast<float *>(bias_data_), tmp_output_block_, output_addr, task_id, conv_param_, gemm_func_);
   return RET_OK;
 }
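The pattern introduced in this file is worth spelling out: instead of opclib hard-coding `IndirectGemmFp32_8x8`, the kernel now selects the GEMM tile routine once in ConfigInputOutput and threads it through `ConvFp32` as a function pointer, so arm32 can substitute an 8x4 output-channel tile without touching the conv loop. A self-contained sketch of the idea, with printf stand-ins for the real assembly kernels:

```cpp
#include <cstddef>
#include <cstdio>

// Same parameter list as the GEMM_FUNC_FP32 typedef added to opclib/fp32/conv.h.
typedef void (*GEMM_FUNC_FP32)(float *output, const float *input, const float *weight, const float *bias,
                               size_t step, size_t ic4, size_t output_channel, size_t offset, size_t mode,
                               size_t writeC4, size_t relu, size_t relu6);

// Dummy stand-ins for the real NEON kernels.
void Gemm8x4(float *, const float *, const float *, const float *, size_t, size_t, size_t, size_t, size_t, size_t,
             size_t, size_t) {
  printf("8x4 output-channel tile (arm32)\n");
}
void Gemm8x8(float *, const float *, const float *, const float *, size_t, size_t, size_t, size_t, size_t, size_t,
             size_t, size_t) {
  printf("8x8 output-channel tile (arm64 and others)\n");
}

int main() {
#ifdef ENABLE_ARM32
  GEMM_FUNC_FP32 gemm_func = Gemm8x4;
#else
  GEMM_FUNC_FP32 gemm_func = Gemm8x8;
#endif
  if (gemm_func == nullptr) return 1;  // the kernels guard RunImpl the same way
  gemm_func(nullptr, nullptr, nullptr, nullptr, 0, 0, 0, 0, 0, 0, 0, 0);
  return 0;
}
```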
@@ -21,6 +21,7 @@
 #include "src/lite_kernel.h"
 #include "src/runtime/kernel/arm/opclib/op_base.h"
 #include "src/runtime/kernel/arm/base/convolution_base.h"
+#include "src/runtime/kernel/arm/opclib/fp32/conv.h"

 namespace mindspore::kernel {
 class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel {
@@ -52,8 +53,8 @@ class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel {
   float *packed_input_;
   float *packed_weight_;
   float *tmp_output_block_;
+  GEMM_FUNC_FP32 gemm_func_ = nullptr;
 };
 } // namespace mindspore::kernel

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_H_
@@ -29,14 +29,13 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Conv2D;

 namespace mindspore::kernel {
-void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param) {
+void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param, int oc_block, int oc_block_num) {
   auto input_channel = conv_param->input_channel_;
   auto output_channel = conv_param->output_channel_;
   auto kernel_plane = conv_param->kernel_w_ * conv_param->kernel_h_;
   int iC4 = UP_DIV(input_channel, C4NUM);
-  int oc8 = UP_DIV(output_channel, C8NUM);
-  size_t tmp_size = oc8 * C8NUM * iC4 * C4NUM * kernel_plane * sizeof(float);
+  size_t tmp_size = oc_block_num * oc_block * iC4 * C4NUM * kernel_plane * sizeof(float);
   auto tmp_addr = reinterpret_cast<float *>(malloc(tmp_size));
   if (tmp_addr == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_addr failed.";
@@ -45,8 +44,7 @@ void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param) {
   memset(tmp_addr, 0, tmp_size);
   PackNHWCToNC4HW4Fp32(origin_weight, tmp_addr, output_channel, kernel_plane, input_channel);
-  Conv3x3Fp32FilterTransform(tmp_addr, dst_weight, iC4, output_channel, kernel_plane);
+  Conv3x3Fp32FilterTransform(tmp_addr, dst_weight, iC4, output_channel, kernel_plane, oc_block);
   free(tmp_addr);
 }
@@ -55,10 +53,17 @@ int Convolution3x3CPUKernel::InitWeightBias() {
   auto output_channel = conv_param_->output_channel_;
   int iC4 = UP_DIV(input_channel, C4NUM);
   int oC4 = UP_DIV(output_channel, C4NUM);
-  int oC8 = UP_DIV(output_channel, C8NUM);
+  int oc_block, oc_block_num;
+#ifdef ENABLE_ARM32
+  oc_block = C4NUM;
+  oc_block_num = UP_DIV(output_channel, C4NUM);
+#else
+  oc_block = C8NUM;
+  oc_block_num = UP_DIV(output_channel, C8NUM);
+#endif
   int k_plane = 16;
   // init weight
-  size_t transformed_size = iC4 * C4NUM * oC8 * C8NUM * k_plane * sizeof(float);
+  size_t transformed_size = iC4 * C4NUM * oc_block_num * oc_block * k_plane * sizeof(float);
   transformed_filter_addr_ = reinterpret_cast<float *>(malloc(transformed_size));
   if (transformed_filter_addr_ == nullptr) {
     MS_LOG(ERROR) << "malloc transformed filter addr failed.";
@@ -66,7 +71,7 @@ int Convolution3x3CPUKernel::InitWeightBias() {
   }
   memset(transformed_filter_addr_, 0, transformed_size);
   auto weight_data = reinterpret_cast<float *>(inputs_.at(kWeightIndex)->Data());
-  ProcessFilter(weight_data, transformed_filter_addr_, conv_param_);
+  ProcessFilter(weight_data, transformed_filter_addr_, conv_param_, oc_block, oc_block_num);

   // init bias
   size_t new_bias_size = oC4 * C4NUM * sizeof(float);
@@ -89,7 +94,6 @@ int Convolution3x3CPUKernel::InitTmpBuffer() {
   int iC4 = UP_DIV(conv_param_->input_channel_, C4NUM);
   int oC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
   int k_plane = 16;
-  // todo
   size_t tile_buffer_size = thread_count_ * TILE_NUM * k_plane * iC4 * C4NUM * sizeof(float);
   tile_buffer_ = reinterpret_cast<float *>(malloc(tile_buffer_size));
   if (tile_buffer_ == nullptr) {
@@ -148,6 +152,11 @@ void Convolution3x3CPUKernel::ConfigInputOutput() {
     MS_LOG(ERROR) << "Check layout failed.";
     return;
   }
+#ifdef ENABLE_ARM32
+  gemm_func_ = IndirectGemmFp32_8x4;
+#else
+  gemm_func_ = IndirectGemmFp32_8x8;
+#endif
 }

 int Convolution3x3CPUKernel::Init() {
@@ -201,9 +210,13 @@ int Convolution3x3CPUKernel::ReSize() {
 }

 int Convolution3x3CPUKernel::RunImpl(int task_id) {
+  if (gemm_func_ == nullptr) {
+    MS_LOG(ERROR) << "gemm_func is nullptr.";
+    return RET_ERROR;
+  }
   auto output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
   Conv3x3Fp32(reinterpret_cast<float *>(nhwc4_input_), transformed_filter_addr_, reinterpret_cast<float *>(bias_data_),
-              output_addr, tmp_buffer_address_list_, task_id, conv_param_);
+              output_addr, tmp_buffer_address_list_, task_id, conv_param_, gemm_func_);
   return RET_OK;
 }
@@ -234,4 +247,3 @@ int Convolution3x3CPUKernel::Run() {
   return RET_OK;
 }
 } // namespace mindspore::kernel
-
@@ -19,7 +19,6 @@
 #include <vector>
 #include "src/lite_kernel.h"
 #include "src/runtime/kernel/arm/base/convolution_base.h"
 #include "src/runtime/kernel/arm/opclib/winograd_transform.h"
@@ -62,9 +61,9 @@ class Convolution3x3CPUKernel : public ConvolutionBaseCPUKernel {
   float *tmp_dst_buffer_;
   float *nc4hw4_out_;
   TmpBufferAddress tmp_buffer_address_list_[4];
+  GEMM_FUNC_FP32 gemm_func_ = nullptr;
 };
-void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param);
+void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param, int oc_block, int oc_block_num);
 } // namespace mindspore::kernel

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_3X3_H_
@@ -29,7 +29,7 @@ using mindspore::schema::PrimitiveType_Conv2D;

 namespace mindspore::kernel {
 void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit,
-                             ConvParameter *conv_param) {
+                             ConvParameter *conv_param, int oc_block) {
   // original weight format : ohwi
   auto channel_in = conv_param->input_channel_;
   auto channel_out = conv_param->output_channel_;
@@ -53,10 +53,10 @@ void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit,
   int kernel_plane_stride = channel_in;
   for (int i = 0; i < channel_out; i++) {
-    int oc8_block = i / C8NUM;
-    int oc8_res = i % C8NUM;
+    int out_c_block = i / oc_block;
+    int out_c_res = i % oc_block;
     int input_oz_offset = i * kernel_unit * kernel_unit * channel_in;
-    int output_oz_offset = oc8_block * strides[1] * input_unit * input_unit + oc8_res;
+    int output_oz_offset = out_c_block * strides[1] * input_unit * input_unit + out_c_res;
     for (int j = 0; j < channel_in; j++) {
       int ic4_block = j / C4NUM;
       int ic4_res = j % C4NUM;
@@ -88,16 +88,24 @@ void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit,
 int ConvolutionWinogradCPUKernel::InitWeightBias() {
   int output_channel = conv_param_->output_channel_;
   int oc4 = UP_DIV(output_channel, C4NUM);
+  int oc_block, oc_block_num;
+#ifdef ENABLE_ARM32
+  oc_block = C4NUM;
+  oc_block_num = UP_DIV(output_channel, C4NUM);
+#else
+  oc_block = C8NUM;
+  oc_block_num = UP_DIV(output_channel, C8NUM);
+#endif
   // init weight
-  auto ret = MallocFilterMatrix();
+  auto ret = MallocFilterMatrix(oc_block, oc_block_num);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Malloc filter matrix failed.";
     return RET_ERROR;
   }
   auto weight_tensor = inputs_.at(kWeightIndex);
   auto weight_data = reinterpret_cast<float *>(weight_tensor->Data());
-  WinogradFilterTransform(weight_data, trans_weight_, kernel_unit_, input_unit_, conv_param_);
+  WinogradFilterTransform(weight_data, trans_weight_, kernel_unit_, input_unit_, conv_param_, oc_block);

   // init bias
   size_t new_bias_size = oc4 * C4NUM * sizeof(float);
@@ -112,14 +120,12 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
   return RET_OK;
 }

-int ConvolutionWinogradCPUKernel::MallocFilterMatrix() {
+int ConvolutionWinogradCPUKernel::MallocFilterMatrix(int oc_block, int oc_block_num) {
   int channel_in = conv_param_->input_channel_;
-  int channel_out = conv_param_->output_channel_;
   int ic4 = UP_DIV(channel_in, BLOCK);
-  int oc8 = UP_DIV(channel_out, C8NUM);
   // set data
-  auto trans_matrix_data_size = input_unit_ * input_unit_ * ic4 * oc8 * C4NUM * C8NUM * sizeof(float);
+  auto trans_matrix_data_size = input_unit_ * input_unit_ * ic4 * C4NUM * oc_block_num * oc_block * sizeof(float);
   auto matrix_buffer = malloc(trans_matrix_data_size);
   if (matrix_buffer == nullptr) {
     MS_LOG(ERROR) << "malloc matrix_buffer failed.";
@@ -134,10 +140,10 @@ int ConvolutionWinogradCPUKernel::MallocFilterMatrix() {
   std::vector<int> strides;
   // set shape
   shapes.push_back(input_unit_ * input_unit_);
-  shapes.push_back(oc8);
+  shapes.push_back(oc_block_num);
   shapes.push_back(ic4);
   shapes.push_back(C4NUM);
-  shapes.push_back(C8NUM);
+  shapes.push_back(oc_block);
   // set stride
   for (int i = 0; i < 4; i++) {
     int stride = 1;
@@ -227,6 +233,11 @@ int ConvolutionWinogradCPUKernel::ConfigInputOutput() {
     MS_LOG(ERROR) << "Get output_trans_func_ failed.";
     return RET_ERROR;
   }
+#ifdef ENABLE_ARM32
+  gemm_func_ = IndirectGemmFp32_8x4;
+#else
+  gemm_func_ = IndirectGemmFp32_8x8;
+#endif
   return RET_OK;
 }
@@ -301,10 +312,14 @@ int ConvolutionWinogradCPUKernel::ReSize() {
 }

 int ConvolutionWinogradCPUKernel::RunImpl(int task_id) {
+  if (gemm_func_ == nullptr) {
+    MS_LOG(ERROR) << "gemm_func is nullptr.";
+    return RET_ERROR;
+  }
   auto output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
   ConvWinogardFp32(reinterpret_cast<float *>(nhwc4_input_), reinterpret_cast<float *>(trans_weight_->GetData()),
                    reinterpret_cast<const float *>(bias_data_), output_addr, tmp_buffer_address_list_, task_id,
-                   conv_param_, input_trans_func_, output_trans_func_);
+                   conv_param_, input_trans_func_, output_trans_func_, gemm_func_);
   return RET_OK;
 }
@@ -335,4 +350,3 @@ int ConvolutionWinogradCPUKernel::Run() {
   return RET_OK;
 }
 } // namespace mindspore::kernel
-
@@ -50,7 +50,7 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
   int Run() override;
   int RunImpl(int task_id);
   int InitWeightBias();
-  int MallocFilterMatrix();
+  int MallocFilterMatrix(int oc_block, int oc_block_num);
   int InitTmpBuffer();
   int ConfigInputOutput();
@@ -66,9 +66,9 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
   InputTransformUnitFunc input_trans_func_;
   OutputTransformUnitFunc output_trans_func_;
   TmpBufferAddress tmp_buffer_address_list_[5];
+  GEMM_FUNC_FP32 gemm_func_ = nullptr;
 };
 void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit,
-                             ConvParameter *conv_param);
+                             ConvParameter *conv_param, int oc_block);
 } // namespace mindspore::kernel

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_WINOGRAD_H_
@@ -17,7 +17,7 @@
 #include "src/runtime/kernel/arm/opclib/common_func.h"
 #include "src/runtime/kernel/arm/opclib/quantization/fixed_point.h"

-#ifndef __aarch64__
+#ifndef ENABLE_ARM64
 void IndirectGemmFp32(float *output, const float *input, const float *weight, const float *bias, size_t step, int ic4,
                       int output_channel, size_t offset, size_t relu, size_t relu6) {
   for (int i = 0; i < TILE_NUM; i++) {
@@ -102,6 +102,11 @@ void IndirectGemmFp32_8x8(float *output, const float *input, const float *weight, const float *bias, size_t step,
   }
 }
 #endif

+#ifndef ENABLE_ARM32
+void IndirectGemmFp32_8x4(float *output, const float *input, const float *weight, const float *bias, size_t step,
+                          size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4, size_t relu,
+                          size_t relu6) {}
+#endif
 int8_t MinInt8(int8_t a, int8_t b) { return b ^ ((a ^ b) & -(a < b)); }
@@ -36,6 +36,9 @@ void PostFuncInt8(const int *in, const int *bias, int8_t *out, int oc, int plane,
 void IndirectGemmFp32_8x8(float *output, const float *input, const float *weight, const float *bias, size_t step,
                           size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4, size_t relu,
                           size_t relu6);
+void IndirectGemmFp32_8x4(float *output, const float *input, const float *weight, const float *bias, size_t step,
+                          size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4, size_t relu,
+                          size_t relu6);
 void IndirectGemmFp32_Comm(float *output, const float *input, const float *weight, size_t ic4, size_t hw, size_t oc,
                            size_t offset);
 void IndirectGemmFp32(float *output, const float *input, const float *weight, const float *bias, size_t step, int ic4,
@@ -20,7 +20,8 @@
 // fp32 conv common
 void ConvFp32(float *input_data, float *packed_input, float *packed_weight, const float *bias_data,
-              float *tmp_out_block, float *output_data, int task_id, ConvParameter *conv_param) {
+              float *tmp_out_block, float *output_data, int task_id, ConvParameter *conv_param,
+              GEMM_FUNC_FP32 gemm_func) {
   int kernel_h = conv_param->kernel_h_;
   int kernel_w = conv_param->kernel_w_;
   int in_batch = conv_param->input_batch_;
@@ -57,12 +58,12 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, const float *bias_data,
     int out_offset = thread_id * TILE_NUM * out_channel + out_batch_offset;
     if (real_cal_num == TILE_NUM) {
       float *gemm_output = output_data + out_offset;
-      IndirectGemmFp32_8x8(gemm_output, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel,
-                           output_offset, 0, 0, conv_param->is_relu_, conv_param->is_relu6_);
+      gemm_func(gemm_output, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel, output_offset, 0, 0,
+                conv_param->is_relu_, conv_param->is_relu6_);
     } else {
       // res part
-      IndirectGemmFp32_8x8(tmp_out_block, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel,
-                           output_offset, 0, 0, conv_param->is_relu_, conv_param->is_relu6_);
+      gemm_func(tmp_out_block, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel, output_offset, 0,
+                0, conv_param->is_relu_, conv_param->is_relu6_);
       memcpy(output_data + out_offset, tmp_out_block, real_cal_num * out_channel * sizeof(float));
     }
   }
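The branch just above is the output-tail handling: full tiles of TILE_NUM pixels are written by the GEMM directly into `output_data`, while the last partial tile is computed into the caller-provided `tmp_out_block` and only the valid `real_cal_num * out_channel` floats are copied back, so the GEMM kernel may always store a whole tile. A self-contained sketch of that pattern (names and sizes here are illustrative, not the library's):

```cpp
#include <cstring>

constexpr int kTile = 8;  // stands in for TILE_NUM

// Dummy tile kernel: always writes a full tile of kTile * channels floats.
static void ComputeTile(float *dst, int channels) {
  for (int i = 0; i < kTile * channels; ++i) dst[i] = 1.0f;
}

// Full tiles land in place; the ragged last tile goes through `scratch`
// and only the valid region is copied out.
void TiledStore(float *out, int total_pixels, int channels, float *scratch) {
  for (int start = 0; start < total_pixels; start += kTile) {
    int real = (total_pixels - start < kTile) ? (total_pixels - start) : kTile;
    float *dst = out + start * channels;
    if (real == kTile) {
      ComputeTile(dst, channels);
    } else {
      ComputeTile(scratch, channels);
      memcpy(dst, scratch, real * channels * sizeof(float));
    }
  }
}
```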
@@ -78,7 +79,8 @@ int Conv1x1Fp32(const float *input_data, const float *weight_data, float *output_data, float *tmp_ptr,
 // fp32 conv winograd
 void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_data, float *output_data,
                       TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param,
-                      InputTransformUnitFunc input_trans_func, OutputTransformUnitFunc output_trans_func) {
+                      InputTransformUnitFunc input_trans_func, OutputTransformUnitFunc output_trans_func,
+                      GEMM_FUNC_FP32 gemm_func) {
   int thread_num = conv_param->thread_num_;
   int input_unit = conv_param->input_unit_;
   int in_batch = conv_param->input_batch_;
@@ -111,8 +113,8 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_data, float *output_data,
     WinogradInputTransform(input_data, trans_input, tmp_data, cal_num, out_tile_index, out_w_block, conv_param,
                            input_trans_func);
     // step 3 : gemm
-    IndirectGemmFp32_8x8(gemm_out, trans_input, trans_weight, nullptr, input_unit_square, ic4, oc4 * C4NUM,
-                         output_offset, 1, 1, 0, 0);
+    gemm_func(gemm_out, trans_input, trans_weight, nullptr, input_unit_square, ic4, oc4 * C4NUM, output_offset, 1, 1,
+              0, 0);
     // step 4 : output transform
     WinogradOutputTransform(gemm_out, tmp_out_data, bias_data, cal_num, out_tile_index, out_w_block, conv_param,
@@ -173,7 +175,7 @@ void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, int width, int channel, int output_unit) {
 // fp32 conv3x3
 void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, float *output_data,
-                 TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param) {
+                 TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func) {
   int thread_count = conv_param->thread_num_;
   int ic4 = UP_DIV(conv_param->input_channel_, C4NUM);
   int output_channel = conv_param->output_channel_;
@@ -198,8 +200,8 @@ void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, float *output_data,
     Conv3x3Fp32InputTransform(input_data, tile_buffer, block_unit_buffer, start_index, real_cal_num, out_w_block,
                               conv_param);
-    IndirectGemmFp32_8x8(tmp_dst_buffer, tile_buffer, transed_weight, nullptr, input_unit_square, ic4, oc4 * C4NUM,
-                         oc4 * C4NUM * input_unit_square * sizeof(float), 1, 1, 0, 0);
+    gemm_func(tmp_dst_buffer, tile_buffer, transed_weight, nullptr, input_unit_square, ic4, oc4 * C4NUM,
+              oc4 * C4NUM * input_unit_square * sizeof(float), 1, 1, 0, 0);
     Conv3x3Fp32OutputTransform(tmp_dst_buffer, nc4hw4_out, bias_data, start_index, real_cal_num, out_w_block,
                                conv_param);
@@ -28,10 +28,14 @@
 #include "src/runtime/kernel/arm/opclib/winograd_utils.h"

 using TmpBufferAddress = float *;
+typedef void (*GEMM_FUNC_FP32)(float *output, const float *input, const float *weight, const float *bias, size_t step,
+                               size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4,
+                               size_t relu, size_t relu6);

 // fp32 convolution common (im2col+gemm)
 void ConvFp32(float *input_data, float *packed_input, float *packed_weight, const float *bias_data,
-              float *tmp_out_block, float *output_data, int task_id, ConvParameter *conv_param);
+              float *tmp_out_block, float *output_data, int task_id, ConvParameter *conv_param,
+              GEMM_FUNC_FP32 gemm_func);

 // fp32 conv1x1 strassen matmul
 int Conv1x1Fp32(const float *input_data, const float *weight_data, float *output_data, float *tmp_ptr,
@@ -40,12 +44,13 @@ int Conv1x1Fp32(const float *input_data, const float *weight_data, float *output_data, float *tmp_ptr,
 // fp32 convolution winograd
 void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_data, float *output_data,
                       TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param,
-                      InputTransformUnitFunc input_trans_func, OutputTransformUnitFunc output_trans_func);
+                      InputTransformUnitFunc input_trans_func, OutputTransformUnitFunc output_trans_func,
+                      GEMM_FUNC_FP32 gemm_func);

 void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, int width, int channel, int output_unit);

 // fp32 conv3x3
 void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, float *output_data,
-                 TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param);
+                 TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func);

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_CONV_H_
@@ -49,7 +49,9 @@ void IndirectGemmInt8(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const int8_t *weight,
 #ifdef __aarch64__
   IndirectGemmInt8_4x4(dst, src, weight, bias, kernel_plane, ic4, output_channel, output_channel * sizeof(int8_t),
                        input_sum, act_min, act_max, out_zp, out_multiplier, shift_before, shift_after);
-  // todo arm32
+#elif defined(ENABLE_ARM32)
+  IndirectGemmInt8_2x4(dst, src, weight, bias, kernel_plane, ic4, output_channel, output_channel * sizeof(int8_t),
+                       input_sum, act_min, act_max, out_zp, out_multiplier, shift_before, shift_after);
 #else
   int tile_num = conv_param->tile_num_;
   int plane_c4 = UP_DIV(kernel_plane, C4NUM);
@@ -58,10 +58,10 @@ class OptimizeModule {
     if ((!support_optimize_ops) && (!support_fp16)) {
       return;
     }
-    // optimized_op_handler_ = dlopen(OPTIMIZE_SHARED_LIBRARY_PATH, RTLD_LAZY);
-    // if (optimized_op_handler_ == nullptr) {
-    //   printf("Open optimize shared library failed.\n");
-    // }
+    optimized_op_handler_ = dlopen(OPTIMIZE_SHARED_LIBRARY_PATH, RTLD_LAZY);
+    if (optimized_op_handler_ == nullptr) {
+      printf("Open optimize shared library failed.\n");
+    }
   }

   ~OptimizeModule() = default;
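This hunk re-enables loading the optimized kernel library at runtime instead of leaving the dlopen path commented out. For readers unfamiliar with the mechanism, here is a generic POSIX sketch of the dlopen/dlsym pattern; the library path and symbol name below are placeholders, not MindSpore's actual ones:

```cpp
#include <dlfcn.h>
#include <cstdio>

int main() {
  // RTLD_LAZY defers symbol resolution until each symbol is first used.
  void *handle = dlopen("libexample_optimize.so", RTLD_LAZY);  // hypothetical library
  if (handle == nullptr) {
    fprintf(stderr, "dlopen failed: %s\n", dlerror());
    return 1;
  }
  // Look up an exported function by name and cast it to the expected type.
  using KernelFn = void (*)(float *, const float *, int);
  auto fn = reinterpret_cast<KernelFn>(dlsym(handle, "ExampleKernel"));  // hypothetical symbol
  if (fn == nullptr) {
    fprintf(stderr, "dlsym failed: %s\n", dlerror());
  }
  dlclose(handle);
  return 0;
}
```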
@@ -18,20 +18,19 @@
 #include <cstring>
 #include <cstdlib>

-void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight) {
+void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight, int oc_block,
+                    int oc_block_num) {
   // original weight format : ohwi
-  // todo pack weight for arm32 platform
   int kernel_h = conv_param->kernel_h_;
   int kernel_w = conv_param->kernel_w_;
   int in_channel = conv_param->input_channel_;
   int out_channel = conv_param->output_channel_;
-  int oc8 = UP_DIV(out_channel, C8NUM);
   int ic4 = UP_DIV(in_channel, C4NUM);
   int kernel_plane = kernel_h * kernel_w;
-  int pack_weight_size = oc8 * C8NUM * ic4 * C4NUM * kernel_plane;
-  int unit_size = C8NUM * C4NUM;
-  int block_size = pack_weight_size / oc8;
+  int pack_weight_size = oc_block * oc_block_num * ic4 * C4NUM * kernel_plane;
+  int unit_size = oc_block * C4NUM;
+  int block_size = pack_weight_size / oc_block_num;

   for (int m = 0; m < kernel_plane; m++) {
     int kernel_plane_stride = m * in_channel;
@@ -43,12 +42,12 @@ void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight) {
       int real_ic_num = ic_remainder < C4NUM ? ic_remainder : C4NUM;
       for (int h = 0; h < real_ic_num; h++) {
         int block_stride = channel_block_stride + h;
-        int packed_block_stride = packed_channel_block_size + h * C8NUM;
-        for (int j = 0; j < oc8; j++) {
-          int kernel_block_stride = block_stride + j * C8NUM * kernel_plane * in_channel;
+        int packed_block_stride = packed_channel_block_size + h * oc_block;
+        for (int j = 0; j < oc_block_num; j++) {
+          int kernel_block_stride = block_stride + j * oc_block * kernel_plane * in_channel;
           int packed_kernel_block_size = packed_block_stride + j * block_size;
-          int oc_remainder = out_channel - j * C8NUM;
-          int real_oc_num = oc_remainder < C8NUM ? oc_remainder : C8NUM;
+          int oc_remainder = out_channel - j * oc_block;
+          int real_oc_num = oc_remainder < oc_block ? oc_remainder : oc_block;
           for (int k = 0; k < real_oc_num; k++) {
             float *origin_data_ptr = weight_data + kernel_block_stride + k * kernel_plane * in_channel;
             float *packed_data_ptr = packed_weight + packed_kernel_block_size + k;
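Tying the new parameters to the updated PackWeightFp32 unit test at the end of this diff (a 32x3x3x3 OHWI weight), the packed-size arithmetic works out as follows on the default (non-arm32) path:

```cpp
#include <cstdio>

#define UP_DIV(x, y) (((x) + (y) - 1) / (y))  // assumed to match op_base.h
constexpr int C4NUM = 4;
constexpr int C8NUM = 8;

int main() {
  int out_channel = 32, in_channel = 3, kernel_plane = 3 * 3;
  int ic4 = UP_DIV(in_channel, C4NUM);               // 1 block of 4 input channels
  int oc_block = C8NUM;                              // non-arm32 path
  int oc_block_num = UP_DIV(out_channel, oc_block);  // 4 blocks of 8
  int pack_weight_size = oc_block * oc_block_num * ic4 * C4NUM * kernel_plane;
  int block_size = pack_weight_size / oc_block_num;  // floats per output-channel block
  printf("pack_weight_size = %d floats, block_size = %d floats\n", pack_weight_size, block_size);
  // prints: pack_weight_size = 1152 floats, block_size = 288 floats
  return 0;
}
```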
@@ -40,7 +40,8 @@ void MatrixPack(const float *src, float *dst, int row, int ic4, int stride);

 void PackInputToC8Int8(const int8_t *input_data, int16_t *packed_input, ConvParameter *conv_param);

-void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight);
+void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight, int oc_block,
+                    int oc_block_num);

 void PackWeightInt8(int8_t *weight_data, ConvParameter *conv_param, int8_t *packed_weight, int32_t *weight_sum);
@@ -326,18 +326,18 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, float *tmp_data,
   }
 }

-void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel,
-                                int kernel_plane) {
+void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel, int kernel_plane,
+                                int oc_block) {
   int input_unit = 4;
-  int dst_step = iC4 * C4NUM * C8NUM;
+  int dst_step = iC4 * C4NUM * oc_block;
   for (int o = 0; o < output_channel; o++) {
-    int oc8_block_num = o / C8NUM;
-    int oc8_block_rem = o % C8NUM;
+    int oc_block_num = o / oc_block;
+    int oc_block_rem = o % oc_block;
     int src_oc_offset = o * iC4 * C4NUM * kernel_plane;
-    int dst_oc_offset = oc8_block_num * C8NUM * iC4 * C4NUM * input_unit * input_unit + oc8_block_rem;
+    int dst_oc_offset = oc_block_num * oc_block * iC4 * C4NUM * input_unit * input_unit + oc_block_rem;
     for (int i = 0; i < iC4; i++) {
       float *src_ic4_ptr = weight_data + src_oc_offset + i * kernel_plane * C4NUM;
-      float *dst_ic4_ptr = trans_weight + dst_oc_offset + i * C8NUM * C4NUM;
+      float *dst_ic4_ptr = trans_weight + dst_oc_offset + i * oc_block * C4NUM;
 #ifdef ENABLE_ARM
       float32x4_t g00 = vld1q_f32(src_ic4_ptr);
       float32x4_t g01 = vld1q_f32(src_ic4_ptr + 4);
@@ -1368,4 +1368,3 @@ void Conv3x3Uint8OutputTransform(const int32_t *gemm_out, int8_t *out_data, const int32_t *bias_data,
     }
   }
 }
-
@@ -43,7 +43,8 @@ void Conv3x3Fp32InputUnit(const float *tmp_data, float *trans_input_data, size_t
 void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, float *tmp_data, int start_index,
                                int real_cal_num, int out_w_block, ConvParameter *conv_param);

-void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel, int kernel_plane);
+void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel, int kernel_plane,
+                                int oc_block);

 void Conv3x3Fp32OutputUnit(const float *gemm_out, const float *bias_data, float *output_data, bool h_not_bound,
                            bool w_not_bound, int output_w);
@@ -67,4 +68,3 @@ void Conv3x3Uint8OutputTransform(const int32_t *gemm_out, int8_t *out_data, const int32_t *bias_data,
                                  int real_cal_num, int out_w_block, ConvParameter *conv_param);

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_WINOGRAD_TRANSFORM_H_
-
@@ -122,7 +122,7 @@ TEST_F(TestPack, PackWeightFp32) {
   std::string weight_path = "./test_data/conv/convfp32_weight_32_3_3_3.bin";
   auto weight_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(weight_path.c_str(), &weight_size));
   auto packed_weight = reinterpret_cast<float *>(malloc(k_h * k_w * ic4 * C4NUM * oc8 * C8NUM * sizeof(float)));
-  PackWeightFp32(weight_data, conv_param, packed_weight);
+  PackWeightFp32(weight_data, conv_param, packed_weight, C8NUM, oc8);
   printf("==================output data=================\n");
   for (int i = 0; i < 20; i++) {