
!3864 fix fp16 init func bug && enable arm32 assembly code

Merge pull request !3864 from fuzhiye/mindspore
tags/v0.7.0-beta
mindspore-ci-bot committed 5 years ago
commit 4edcce367d
22 changed files with 207 additions and 95 deletions
1. +0 -10  mindspore/lite/src/runtime/kernel/arm/base/layout_transform.cc
2. +6 -4   mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
3. +6 -4   mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
4. +39 -0  mindspore/lite/src/runtime/kernel/arm/fp16/layout_transform_fp16.cc
5. +27 -0  mindspore/lite/src/runtime/kernel/arm/fp16/layout_transform_fp16.h
6. +12 -2  mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
7. +2 -1   mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h
8. +23 -11 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
9. +2 -3   mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h
10. +28 -14 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
11. +3 -3  mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h
12. +6 -1  mindspore/lite/src/runtime/kernel/arm/opclib/common_func.cc
13. +3 -0  mindspore/lite/src/runtime/kernel/arm/opclib/common_func.h
14. +13 -11 mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv.cc
15. +8 -3  mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv.h
16. +3 -1  mindspore/lite/src/runtime/kernel/arm/opclib/int8/conv_int8.cc
17. +4 -4  mindspore/lite/src/runtime/kernel/arm/opclib/optimized_kernel.h
18. +10 -11 mindspore/lite/src/runtime/kernel/arm/opclib/pack.cc
19. +2 -1  mindspore/lite/src/runtime/kernel/arm/opclib/pack.h
20. +7 -8  mindspore/lite/src/runtime/kernel/arm/opclib/winograd_transform.cc
21. +2 -2  mindspore/lite/src/runtime/kernel/arm/opclib/winograd_transform.h
22. +1 -1  mindspore/lite/test/ut/src/runtime/kernel/arm/common/pack_tests.cc

mindspore/lite/src/runtime/kernel/arm/base/layout_transform.cc (+0 -10)

@@ -19,12 +19,6 @@
 using mindspore::schema::Format;
 namespace mindspore::kernel {
-#ifdef ENABLE_FP16
-LayoutConvertor LayoutTransformFp16(schema::Format src_format, schema::Format dst_format) {
-  // todo
-  return nullptr;
-}
-#endif
 LayoutConvertor LayoutTransformFp32(schema::Format src_format, schema::Format dst_format) {
   // todo
   if (src_format == schema::Format_NHWC && dst_format == schema::Format_NC4HW4) {
@@ -58,10 +52,6 @@ LayoutConvertor LayoutTransform(TypeId data_type, schema::Format src_format, sch
   switch (data_type) {
     case kNumberTypeInt8:
       return LayoutTransformInt8(src_format, dst_format);
-#ifdef ENABLE_FP16
-    case kNumberTypeFloat16:
-      return LayoutTransformFp16(src_format, dst_format);
-#endif
     case kNumberTypeFloat32:
       return LayoutTransformFp32(src_format, dst_format);
     default:


mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc (+6 -4)

@@ -18,7 +18,7 @@
 #include "src/runtime/kernel/arm/opclib/fp16/conv_fp16.h"
 #include "src/runtime/kernel/arm/opclib/fp16/winograd_transform_fp16.h"
 #include "src/runtime/kernel/arm/opclib/fp16/pack_fp16.h"
-#include "src/runtime/kernel/arm/base/layout_transform.h"
+#include "src/runtime/kernel/arm/fp16/layout_transform_fp16.h"
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
@@ -159,9 +159,11 @@ void Convolution3x3FP16CPUKernel::ConfigInputOutput() {
   auto output_tensor = outputs_.at(kOutputIndex);
   output_tensor->SetFormat(schema::Format_NHWC);
   auto input_tensor = inputs_.at(kInputIndex);
-  auto ret = CheckLayout(input_tensor);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Check layout failed.";
+  auto input_format = input_tensor->GetFormat();
+  schema::Format execute_format = schema::Format_NHWC4;
+  convert_func_ = LayoutTransformFp16(input_format, execute_format);
+  if (convert_func_ == nullptr) {
+    MS_LOG(ERROR) << "layout convert func is nullptr.";
     return;
   }
 }


mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc (+6 -4)

@@ -18,7 +18,7 @@
 #include "src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h"
 #include "src/runtime/kernel/arm/opclib/fp16/conv_fp16.h"
 #include "src/runtime/kernel/arm/opclib/fp16/pack_fp16.h"
-#include "src/runtime/kernel/arm/base/layout_transform.h"
+#include "src/runtime/kernel/arm/fp16/layout_transform_fp16.h"
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
@@ -130,9 +130,11 @@ int ConvolutionFP16CPUKernel::InitTmpBuffer() {

 void ConvolutionFP16CPUKernel::ConfigInputOutput() {
   auto input_tensor = inputs_.at(kInputIndex);
-  auto ret = CheckLayout(input_tensor);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Check layout failed.";
+  auto input_format = input_tensor->GetFormat();
+  schema::Format execute_format = schema::Format_NHWC4;
+  convert_func_ = LayoutTransformFp16(input_format, execute_format);
+  if (convert_func_ == nullptr) {
+    MS_LOG(ERROR) << "layout convert func is nullptr.";
     return;
   }
   auto output_tensor = outputs_.at(kOutputIndex);


mindspore/lite/src/runtime/kernel/arm/fp16/layout_transform_fp16.cc (+39 -0, new file)

@@ -0,0 +1,39 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "src/runtime/kernel/arm/fp16/layout_transform_fp16.h"
+#include "src/runtime/kernel/arm/opclib/fp16/pack_fp16.h"
+#include "schema/ops_generated.h"
+#include "mindspore/core/utils/log_adapter.h"
+
+namespace mindspore::kernel {
+LayoutConvertor LayoutTransformFp16(schema::Format src_format, schema::Format dst_format) {
+  if (src_format == schema::Format_NHWC && dst_format == schema::Format_NC4HW4) {
+    return PackNHWCToNC4HW4Fp16;
+  } else if (src_format == schema::Format_NHWC && dst_format == schema::Format_NHWC4) {
+    return PackNHWCToNHWC4Fp16;
+  } else if (src_format == schema::Format_NC4HW4 && dst_format == schema::Format_NHWC4) {
+    return PackNC4HW4ToNHWC4Fp16;
+  } else if (src_format == schema::Format_NCHW && dst_format == schema::Format_NC4HW4) {
+    return PackNCHWToNC4HW4Fp16;
+  } else if (src_format == schema::Format_NC4HW4 && dst_format == schema::Format_NHWC) {
+    return PackNC4HW4ToNHWCFp16;
+  } else {
+    MS_LOG(ERROR) << "Unsupported transform from " << schema::EnumNameFormat(src_format) << " to "
+                  << schema::EnumNameFormat(dst_format);
+    return nullptr;
+  }
+}
+}  // namespace mindspore::kernel
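Note: the converter is resolved once and returned as a bare function pointer, which is why the fp16 kernels above test convert_func_ against nullptr before use. A minimal standalone sketch of the same lookup-and-check pattern (the Format enum and pack routine here are simplified stand-ins, not the repo's real types):

#include <cstdio>

// Stand-ins for schema::Format and the fp16 pack routines; only the
// lookup-and-check pattern is being illustrated.
enum Format { NHWC, NHWC4, NC4HW4, NCHW };
typedef void (*LayoutConvertor)(const void *src, void *dst, int batch, int plane, int channel);

static void PackNHWCToNHWC4Stub(const void *, void *, int, int, int) { /* repack would happen here */ }

static LayoutConvertor PickConvertor(Format src, Format dst) {
  if (src == NHWC && dst == NHWC4) return PackNHWCToNHWC4Stub;
  return nullptr;  // unsupported pair: the kernel logs and bails out
}

int main() {
  LayoutConvertor convert_func = PickConvertor(NHWC, NHWC4);
  if (convert_func == nullptr) {
    printf("layout convert func is nullptr.\n");
    return 1;
  }
  convert_func(nullptr, nullptr, 1, 16, 3);  // a real call would pass tensor buffers
  return 0;
}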

mindspore/lite/src/runtime/kernel/arm/fp16/layout_transform_fp16.h (+27 -0, new file)

@@ -0,0 +1,27 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_LAYOUT_TRANSFORM_FP16_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_LAYOUT_TRANSFORM_FP16_H_
+
+#include "src/runtime/kernel/arm/base/layout_transform.h"
+#include "schema/ops_generated.h"
+
+namespace mindspore::kernel {
+LayoutConvertor LayoutTransformFp16(schema::Format src_format, schema::Format dst_format);
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_LAYOUT_TRANSFORM_FP16_H_

mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc (+12 -2)

@@ -19,6 +19,7 @@
 #include "src/runtime/kernel/arm/fp32/convolution_3x3.h"
 #include "src/runtime/kernel/arm/fp32/convolution_winograd.h"
 #include "src/runtime/kernel/arm/opclib/fp32/conv.h"
+#include "src/runtime/kernel/arm/opclib/common_func.h"
 #include "schema/model_generated.h"
 #include "src/kernel_factory.h"
 #include "include/errorcode.h"
@@ -56,7 +57,7 @@ int ConvolutionCPUKernel::InitWeightBias() {
     return RET_ERROR;
   }
   memset(packed_weight_, 0, pack_weight_size * sizeof(float));
-  PackWeightFp32(origin_weight, conv_param_, packed_weight_);
+  PackWeightFp32(origin_weight, conv_param_, packed_weight_, oc_block, oc_block_num);

   // init bias
   bias_data_ = reinterpret_cast<float *>(malloc(oc_block_num * oc_block * sizeof(float)));
@@ -125,6 +126,11 @@ void ConvolutionCPUKernel::ConfigInputOutput() {
     MS_LOG(ERROR) << "Check layout failed.";
     return;
   }
+#ifdef ENABLE_ARM32
+  gemm_func_ = IndirectGemmFp32_8x4;
+#else
+  gemm_func_ = IndirectGemmFp32_8x8;
+#endif
 }

 int ConvolutionCPUKernel::Init() {
@@ -175,9 +181,13 @@ int ConvolutionCPUKernel::ReSize() {
 }

 int ConvolutionCPUKernel::RunImpl(int task_id) {
+  if (gemm_func_ == nullptr) {
+    MS_LOG(ERROR) << "gemm_func is nullptr.";
+    return RET_ERROR;
+  }
   auto output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
   ConvFp32(reinterpret_cast<float *>(nhwc4_input_), packed_input_, packed_weight_,
-           reinterpret_cast<float *>(bias_data_), tmp_output_block_, output_addr, task_id, conv_param_);
+           reinterpret_cast<float *>(bias_data_), tmp_output_block_, output_addr, task_id, conv_param_, gemm_func_);
   return RET_OK;
 }
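The gemm_func_ member bound in ConfigInputOutput is how the arm32 assembly gets wired in: the platform picks the kernel once at configuration time, and RunImpl only guards against a missing binding. A compressed standalone sketch of that selection, with the 12-parameter GEMM signature trimmed to three arguments purely for illustration:

#include <cstdio>

// Trimmed stand-in for GEMM_FUNC_FP32 (the real typedef is in opclib/fp32/conv.h below).
typedef void (*GemmFunc)(float *output, const float *input, const float *weight);

static void Gemm8x4(float *, const float *, const float *) { printf("8x4 (arm32) path\n"); }
static void Gemm8x8(float *, const float *, const float *) { printf("8x8 path\n"); }

int main() {
  GemmFunc gemm_func = nullptr;
#ifdef ENABLE_ARM32
  gemm_func = Gemm8x4;  // matches the 4-channel output blocks of the arm32 assembly
#else
  gemm_func = Gemm8x8;  // default 8-channel output blocks
#endif
  if (gemm_func == nullptr) {  // mirrors the guard added to RunImpl
    printf("gemm_func is nullptr.\n");
    return 1;
  }
  gemm_func(nullptr, nullptr, nullptr);
  return 0;
}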




mindspore/lite/src/runtime/kernel/arm/fp32/convolution.h (+2 -1)

@@ -21,6 +21,7 @@
 #include "src/lite_kernel.h"
 #include "src/runtime/kernel/arm/opclib/op_base.h"
 #include "src/runtime/kernel/arm/base/convolution_base.h"
+#include "src/runtime/kernel/arm/opclib/fp32/conv.h"

 namespace mindspore::kernel {
 class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel {
@@ -52,8 +53,8 @@ class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel {
   float *packed_input_;
   float *packed_weight_;
   float *tmp_output_block_;
-
+  GEMM_FUNC_FP32 gemm_func_ = nullptr;
 };
 }  // namespace mindspore::kernel

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_H_


mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc (+23 -11)

@@ -29,14 +29,13 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Conv2D;

 namespace mindspore::kernel {
-void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param) {
+void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param, int oc_block, int oc_block_num) {
   auto input_channel = conv_param->input_channel_;
   auto output_channel = conv_param->output_channel_;
   auto kernel_plane = conv_param->kernel_w_ * conv_param->kernel_h_;
   int iC4 = UP_DIV(input_channel, C4NUM);
-  int oc8 = UP_DIV(output_channel, C8NUM);

-  size_t tmp_size = oc8 * C8NUM * iC4 * C4NUM * kernel_plane * sizeof(float);
+  size_t tmp_size = oc_block_num * oc_block * iC4 * C4NUM * kernel_plane * sizeof(float);
   auto tmp_addr = reinterpret_cast<float *>(malloc(tmp_size));
   if (tmp_addr == nullptr) {
     MS_LOG(ERROR) << "malloc tmp_addr failed.";
@@ -45,8 +44,7 @@ void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_
   memset(tmp_addr, 0, tmp_size);

   PackNHWCToNC4HW4Fp32(origin_weight, tmp_addr, output_channel, kernel_plane, input_channel);
-  Conv3x3Fp32FilterTransform(tmp_addr, dst_weight, iC4, output_channel, kernel_plane);
-
+  Conv3x3Fp32FilterTransform(tmp_addr, dst_weight, iC4, output_channel, kernel_plane, oc_block);
   free(tmp_addr);
 }

@@ -55,10 +53,17 @@ int Convolution3x3CPUKernel::InitWeightBias() {
   auto output_channel = conv_param_->output_channel_;
   int iC4 = UP_DIV(input_channel, C4NUM);
   int oC4 = UP_DIV(output_channel, C4NUM);
-  int oC8 = UP_DIV(output_channel, C8NUM);
+  int oc_block, oc_block_num;
+#ifdef ENABLE_ARM32
+  oc_block = C4NUM;
+  oc_block_num = UP_DIV(output_channel, C4NUM);
+#else
+  oc_block = C8NUM;
+  oc_block_num = UP_DIV(output_channel, C8NUM);
+#endif
   int k_plane = 16;
   // init weight
-  size_t transformed_size = iC4 * C4NUM * oC8 * C8NUM * k_plane * sizeof(float);
+  size_t transformed_size = iC4 * C4NUM * oc_block_num * oc_block * k_plane * sizeof(float);
   transformed_filter_addr_ = reinterpret_cast<float *>(malloc(transformed_size));
   if (transformed_filter_addr_ == nullptr) {
     MS_LOG(ERROR) << "malloc transformed filter addr failed.";
@@ -66,7 +71,7 @@ int Convolution3x3CPUKernel::InitWeightBias() {
   }
   memset(transformed_filter_addr_, 0, transformed_size);
   auto weight_data = reinterpret_cast<float *>(inputs_.at(kWeightIndex)->Data());
-  ProcessFilter(weight_data, transformed_filter_addr_, conv_param_);
+  ProcessFilter(weight_data, transformed_filter_addr_, conv_param_, oc_block, oc_block_num);

   // init bias
   size_t new_bias_size = oC4 * C4NUM * sizeof(float);
@@ -89,7 +94,6 @@ int Convolution3x3CPUKernel::InitTmpBuffer() {
   int iC4 = UP_DIV(conv_param_->input_channel_, C4NUM);
   int oC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
   int k_plane = 16;
-  // todo
   size_t tile_buffer_size = thread_count_ * TILE_NUM * k_plane * iC4 * C4NUM * sizeof(float);
   tile_buffer_ = reinterpret_cast<float *>(malloc(tile_buffer_size));
   if (tile_buffer_ == nullptr) {
@@ -148,6 +152,11 @@ void Convolution3x3CPUKernel::ConfigInputOutput() {
     MS_LOG(ERROR) << "Check layout failed.";
     return;
   }
+#ifdef ENABLE_ARM32
+  gemm_func_ = IndirectGemmFp32_8x4;
+#else
+  gemm_func_ = IndirectGemmFp32_8x8;
+#endif
 }

 int Convolution3x3CPUKernel::Init() {
@@ -201,9 +210,13 @@ int Convolution3x3CPUKernel::ReSize() {
 }

 int Convolution3x3CPUKernel::RunImpl(int task_id) {
+  if (gemm_func_ == nullptr) {
+    MS_LOG(ERROR) << "gemm_func is nullptr.";
+    return RET_ERROR;
+  }
   auto output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
   Conv3x3Fp32(reinterpret_cast<float *>(nhwc4_input_), transformed_filter_addr_, reinterpret_cast<float *>(bias_data_),
-              output_addr, tmp_buffer_address_list_, task_id, conv_param_);
+              output_addr, tmp_buffer_address_list_, task_id, conv_param_, gemm_func_);
   return RET_OK;
 }

@@ -234,4 +247,3 @@ int Convolution3x3CPUKernel::Run() {
   return RET_OK;
 }
 }  // namespace mindspore::kernel
-


mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h (+2 -3)

@@ -19,7 +19,6 @@

 #include <vector>
 #include "src/lite_kernel.h"
-
 #include "src/runtime/kernel/arm/base/convolution_base.h"
 #include "src/runtime/kernel/arm/opclib/winograd_transform.h"

@@ -62,9 +61,9 @@ class Convolution3x3CPUKernel : public ConvolutionBaseCPUKernel {
   float *tmp_dst_buffer_;
   float *nc4hw4_out_;
   TmpBufferAddress tmp_buffer_address_list_[4];
+  GEMM_FUNC_FP32 gemm_func_ = nullptr;
 };
-
-void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param);
+void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param, int oc_block, int oc_block_num);
 }  // namespace mindspore::kernel

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_3X3_H_


mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc (+28 -14)

@@ -29,7 +29,7 @@ using mindspore::schema::PrimitiveType_Conv2D;

 namespace mindspore::kernel {
 void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit,
-                             ConvParameter *conv_param) {
+                             ConvParameter *conv_param, int oc_block) {
   // original weight format : ohwi
   auto channel_in = conv_param->input_channel_;
   auto channel_out = conv_param->output_channel_;
@@ -53,10 +53,10 @@ void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int

   int kernel_plane_stride = channel_in;
   for (int i = 0; i < channel_out; i++) {
-    int oc8_block = i / C8NUM;
-    int oc8_res = i % C8NUM;
+    int out_c_block = i / oc_block;
+    int out_c_res = i % oc_block;
     int input_oz_offset = i * kernel_unit * kernel_unit * channel_in;
-    int output_oz_offset = oc8_block * strides[1] * input_unit * input_unit + oc8_res;
+    int output_oz_offset = out_c_block * strides[1] * input_unit * input_unit + out_c_res;
     for (int j = 0; j < channel_in; j++) {
       int ic4_block = j / C4NUM;
       int ic4_res = j % C4NUM;
@@ -88,16 +88,24 @@ void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int
 int ConvolutionWinogradCPUKernel::InitWeightBias() {
   int output_channel = conv_param_->output_channel_;
   int oc4 = UP_DIV(output_channel, C4NUM);
+  int oc_block, oc_block_num;
+#ifdef ENABLE_ARM32
+  oc_block = C4NUM;
+  oc_block_num = UP_DIV(output_channel, C4NUM);
+#else
+  oc_block = C8NUM;
+  oc_block_num = UP_DIV(output_channel, C8NUM);
+#endif

   // init weight
-  auto ret = MallocFilterMatrix();
+  auto ret = MallocFilterMatrix(oc_block, oc_block_num);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Malloc filter matrix failed.";
     return RET_ERROR;
   }
   auto weight_tensor = inputs_.at(kWeightIndex);
   auto weight_data = reinterpret_cast<float *>(weight_tensor->Data());
-  WinogradFilterTransform(weight_data, trans_weight_, kernel_unit_, input_unit_, conv_param_);
+  WinogradFilterTransform(weight_data, trans_weight_, kernel_unit_, input_unit_, conv_param_, oc_block);

   // init bias
   size_t new_bias_size = oc4 * C4NUM * sizeof(float);
@@ -112,14 +120,12 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
   return RET_OK;
 }

-int ConvolutionWinogradCPUKernel::MallocFilterMatrix() {
+int ConvolutionWinogradCPUKernel::MallocFilterMatrix(int oc_block, int oc_block_num) {
   int channel_in = conv_param_->input_channel_;
-  int channel_out = conv_param_->output_channel_;
   int ic4 = UP_DIV(channel_in, BLOCK);
-  int oc8 = UP_DIV(channel_out, C8NUM);

   // set data
-  auto trans_matrix_data_size = input_unit_ * input_unit_ * ic4 * oc8 * C4NUM * C8NUM * sizeof(float);
+  auto trans_matrix_data_size = input_unit_ * input_unit_ * ic4 * C4NUM * oc_block_num * oc_block * sizeof(float);
   auto matrix_buffer = malloc(trans_matrix_data_size);
   if (matrix_buffer == nullptr) {
     MS_LOG(ERROR) << "malloc matrix_buffer failed.";
@@ -134,10 +140,10 @@ int ConvolutionWinogradCPUKernel::MallocFilterMatrix() {
   std::vector<int> strides;
   // set shape
   shapes.push_back(input_unit_ * input_unit_);
-  shapes.push_back(oc8);
+  shapes.push_back(oc_block_num);
   shapes.push_back(ic4);
   shapes.push_back(C4NUM);
-  shapes.push_back(C8NUM);
+  shapes.push_back(oc_block);
   // set stride
   for (int i = 0; i < 4; i++) {
     int stride = 1;
@@ -227,6 +233,11 @@ int ConvolutionWinogradCPUKernel::ConfigInputOutput() {
     MS_LOG(ERROR) << "Get output_trans_func_ failed.";
     return RET_ERROR;
   }
+#ifdef ENABLE_ARM32
+  gemm_func_ = IndirectGemmFp32_8x4;
+#else
+  gemm_func_ = IndirectGemmFp32_8x8;
+#endif
   return RET_OK;
 }

@@ -301,10 +312,14 @@ int ConvolutionWinogradCPUKernel::ReSize() {
 }

 int ConvolutionWinogradCPUKernel::RunImpl(int task_id) {
+  if (gemm_func_ == nullptr) {
+    MS_LOG(ERROR) << "gemm_func is nullptr.";
+    return RET_ERROR;
+  }
   auto output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
   ConvWinogardFp32(reinterpret_cast<float *>(nhwc4_input_), reinterpret_cast<float *>(trans_weight_->GetData()),
                    reinterpret_cast<const float *>(bias_data_), output_addr, tmp_buffer_address_list_, task_id,
-                   conv_param_, input_trans_func_, output_trans_func_);
+                   conv_param_, input_trans_func_, output_trans_func_, gemm_func_);
   return RET_OK;
 }

@@ -335,4 +350,3 @@ int ConvolutionWinogradCPUKernel::Run() {
   return RET_OK;
 }
 }  // namespace mindspore::kernel
-


mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h (+3 -3)

@@ -50,7 +50,7 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
   int Run() override;
   int RunImpl(int task_id);
   int InitWeightBias();
-  int MallocFilterMatrix();
+  int MallocFilterMatrix(int oc_block, int oc_block_num);
   int InitTmpBuffer();
   int ConfigInputOutput();

@@ -66,9 +66,9 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
   InputTransformUnitFunc input_trans_func_;
   OutputTransformUnitFunc output_trans_func_;
   TmpBufferAddress tmp_buffer_address_list_[5];
+  GEMM_FUNC_FP32 gemm_func_ = nullptr;
 };
-
 void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit,
-                             ConvParameter *conv_param);
+                             ConvParameter *conv_param, int oc_block);
 }  // namespace mindspore::kernel
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_WINOGRAD_H_


mindspore/lite/src/runtime/kernel/arm/opclib/common_func.cc (+6 -1)

@@ -17,7 +17,7 @@
 #include "src/runtime/kernel/arm/opclib/common_func.h"
 #include "src/runtime/kernel/arm/opclib/quantization/fixed_point.h"

-#ifndef __aarch64__
+#ifndef ENABLE_ARM64
 void IndirectGemmFp32(float *output, const float *input, const float *weight, const float *bias, size_t step, int ic4,
                       int output_channel, size_t offset, size_t relu, size_t relu6) {
   for (int i = 0; i < TILE_NUM; i++) {
@@ -102,6 +102,11 @@ void IndirectGemmFp32_8x8(float *output, const float *input, const float *weight
   }
 }
 #endif
+#ifndef ENABLE_ARM32
+void IndirectGemmFp32_8x4(float *output, const float *input, const float *weight, const float *bias, size_t step,
+                          size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4, size_t relu,
+                          size_t relu6) {}
+#endif

 int8_t MinInt8(int8_t a, int8_t b) { return b ^ ((a ^ b) & -(a < b)); }




mindspore/lite/src/runtime/kernel/arm/opclib/common_func.h (+3 -0)

@@ -36,6 +36,9 @@ void PostFuncInt8(const int *in, const int *bias, int8_t *out, int oc, int plane
 void IndirectGemmFp32_8x8(float *output, const float *input, const float *weight, const float *bias, size_t step,
                           size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4, size_t relu,
                           size_t relu6);
+void IndirectGemmFp32_8x4(float *output, const float *input, const float *weight, const float *bias, size_t step,
+                          size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4, size_t relu,
+                          size_t relu6);
 void IndirectGemmFp32_Comm(float *output, const float *input, const float *weight, size_t ic4, size_t hw, size_t oc,
                            size_t offset);
 void IndirectGemmFp32(float *output, const float *input, const float *weight, const float *bias, size_t step, int ic4,


mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv.cc (+13 -11)

@@ -20,7 +20,8 @@

 // fp32 conv common
 void ConvFp32(float *input_data, float *packed_input, float *packed_weight, const float *bias_data,
-              float *tmp_out_block, float *output_data, int task_id, ConvParameter *conv_param) {
+              float *tmp_out_block, float *output_data, int task_id, ConvParameter *conv_param,
+              GEMM_FUNC_FP32 gemm_func) {
   int kernel_h = conv_param->kernel_h_;
   int kernel_w = conv_param->kernel_w_;
   int in_batch = conv_param->input_batch_;
@@ -57,12 +58,12 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, cons
   int out_offset = thread_id * TILE_NUM * out_channel + out_batch_offset;
   if (real_cal_num == TILE_NUM) {
     float *gemm_output = output_data + out_offset;
-    IndirectGemmFp32_8x8(gemm_output, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel,
-                         output_offset, 0, 0, conv_param->is_relu_, conv_param->is_relu6_);
+    gemm_func(gemm_output, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel, output_offset, 0, 0,
+              conv_param->is_relu_, conv_param->is_relu6_);
   } else {
     // res part
-    IndirectGemmFp32_8x8(tmp_out_block, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel,
-                         output_offset, 0, 0, conv_param->is_relu_, conv_param->is_relu6_);
+    gemm_func(tmp_out_block, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel, output_offset, 0,
+              0, conv_param->is_relu_, conv_param->is_relu6_);
     memcpy(output_data + out_offset, tmp_out_block, real_cal_num * out_channel * sizeof(float));
   }
 }
@@ -78,7 +79,8 @@ int Conv1x1Fp32(const float *input_data, const float *weight_data, float *output
 // fp32 conv winograd
 void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_data, float *output_data,
                       TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param,
-                      InputTransformUnitFunc input_trans_func, OutputTransformUnitFunc output_trans_func) {
+                      InputTransformUnitFunc input_trans_func, OutputTransformUnitFunc output_trans_func,
+                      GEMM_FUNC_FP32 gemm_func) {
   int thread_num = conv_param->thread_num_;
   int input_unit = conv_param->input_unit_;
   int in_batch = conv_param->input_batch_;
@@ -111,8 +113,8 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_
       WinogradInputTransform(input_data, trans_input, tmp_data, cal_num, out_tile_index, out_w_block, conv_param,
                              input_trans_func);
       // step 3 : gemm
-      IndirectGemmFp32_8x8(gemm_out, trans_input, trans_weight, nullptr, input_unit_square, ic4, oc4 * C4NUM,
-                           output_offset, 1, 1, 0, 0);
+      gemm_func(gemm_out, trans_input, trans_weight, nullptr, input_unit_square, ic4, oc4 * C4NUM, output_offset, 1, 1,
+                0, 0);

       // step 4 : output transform
       WinogradOutputTransform(gemm_out, tmp_out_data, bias_data, cal_num, out_tile_index, out_w_block, conv_param,
@@ -173,7 +175,7 @@ void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, i

 // fp32 conv3x3
 void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, float *output_data,
-                 TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param) {
+                 TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func) {
   int thread_count = conv_param->thread_num_;
   int ic4 = UP_DIV(conv_param->input_channel_, C4NUM);
   int output_channel = conv_param->output_channel_;
@@ -198,8 +200,8 @@ void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_dat
     Conv3x3Fp32InputTransform(input_data, tile_buffer, block_unit_buffer, start_index, real_cal_num, out_w_block,
                               conv_param);

-    IndirectGemmFp32_8x8(tmp_dst_buffer, tile_buffer, transed_weight, nullptr, input_unit_square, ic4, oc4 * C4NUM,
-                         oc4 * C4NUM * input_unit_square * sizeof(float), 1, 1, 0, 0);
+    gemm_func(tmp_dst_buffer, tile_buffer, transed_weight, nullptr, input_unit_square, ic4, oc4 * C4NUM,
+              oc4 * C4NUM * input_unit_square * sizeof(float), 1, 1, 0, 0);

     Conv3x3Fp32OutputTransform(tmp_dst_buffer, nc4hw4_out, bias_data, start_index, real_cal_num, out_w_block,
                                conv_param);


mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv.h (+8 -3)

@@ -28,10 +28,14 @@
 #include "src/runtime/kernel/arm/opclib/winograd_utils.h"

 using TmpBufferAddress = float *;
+typedef void (*GEMM_FUNC_FP32)(float *output, const float *input, const float *weight, const float *bias, size_t step,
+                               size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4,
+                               size_t relu, size_t relu6);

 // fp32 convolution common (im2col+gemm)
 void ConvFp32(float *input_data, float *packed_input, float *packed_weight, const float *bias_data,
-              float *tmp_out_block, float *output_data, int task_id, ConvParameter *conv_param);
+              float *tmp_out_block, float *output_data, int task_id, ConvParameter *conv_param,
+              GEMM_FUNC_FP32 gemm_func);

 // fp32 conv1x1 strassen matmul
 int Conv1x1Fp32(const float *input_data, const float *weight_data, float *output_data, float *tmp_ptr,
@@ -40,12 +44,13 @@ int Conv1x1Fp32(const float *input_data, const float *weight_data, float *output
 // fp32 convolution winograd
 void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_data, float *output_data,
                       TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param,
-                      InputTransformUnitFunc input_trans_func, OutputTransformUnitFunc output_trans_func);
+                      InputTransformUnitFunc input_trans_func, OutputTransformUnitFunc output_trans_func,
+                      GEMM_FUNC_FP32 gemm_func);

 void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, int width, int channel, int output_unit);

 // fp32 conv3x3
 void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, float *output_data,
-                 TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param);
+                 TmpBufferAddress *buffer_list, int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func);

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_CONV_H_
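The GEMM_FUNC_FP32 typedef is the seam that lets ConvFp32, Conv3x3Fp32, and ConvWinogardFp32 stay kernel-agnostic: callers inject IndirectGemmFp32_8x8 on 64-bit builds and IndirectGemmFp32_8x4 on arm32. A sketch that exercises the full 12-parameter signature with a do-nothing kernel (argument values are arbitrary; in the diffs above the common conv path passes mode=0/writeC4=0 plus the fused relu flags, while the winograd and 3x3 paths pass mode=1/writeC4=1):

#include <cstddef>
#include <cstdio>

// Same shape as the GEMM_FUNC_FP32 typedef above.
typedef void (*GEMM_FUNC_FP32)(float *output, const float *input, const float *weight, const float *bias, size_t step,
                               size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4,
                               size_t relu, size_t relu6);

// Do-nothing kernel used only to exercise the calling convention.
static void DummyGemm(float *output, const float *input, const float *weight, const float *bias, size_t step,
                      size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4, size_t relu,
                      size_t relu6) {
  printf("step=%zu ic4=%zu output_channel=%zu\n", step, ic4, output_channel);
}

int main() {
  GEMM_FUNC_FP32 gemm_func = DummyGemm;
  float out[8] = {0};
  float in[36] = {0};
  float weight[36] = {0};
  float bias[8] = {0};
  gemm_func(out, in, weight, bias, /*step=*/9, /*ic4=*/1, /*output_channel=*/8, /*offset=*/0,
            /*mode=*/0, /*writeC4=*/0, /*relu=*/0, /*relu6=*/0);
  return 0;
}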

mindspore/lite/src/runtime/kernel/arm/opclib/int8/conv_int8.cc (+3 -1)

@@ -49,7 +49,9 @@ void IndirectGemmInt8(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const in
 #ifdef __aarch64__
   IndirectGemmInt8_4x4(dst, src, weight, bias, kernel_plane, ic4, output_channel, output_channel * sizeof(int8_t),
                        input_sum, act_min, act_max, out_zp, out_multiplier, shift_before, shift_after);
-  // todo arm32
+#elif defined(ENABLE_ARM32)
+  IndirectGemmInt8_2x4(dst, src, weight, bias, kernel_plane, ic4, output_channel, output_channel * sizeof(int8_t),
+                       input_sum, act_min, act_max, out_zp, out_multiplier, shift_before, shift_after);
 #else
   int tile_num = conv_param->tile_num_;
   int plane_c4 = UP_DIV(kernel_plane, C4NUM);


mindspore/lite/src/runtime/kernel/arm/opclib/optimized_kernel.h (+4 -4)

@@ -58,10 +58,10 @@ class OptimizeModule {
     if ((!support_optimize_ops) && (!support_fp16)) {
       return;
     }
-    // optimized_op_handler_ = dlopen(OPTIMIZE_SHARED_LIBRARY_PATH, RTLD_LAZY);
-    // if (optimized_op_handler_ == nullptr) {
-    //   printf("Open optimize shared library failed.\n");
-    // }
+    optimized_op_handler_ = dlopen(OPTIMIZE_SHARED_LIBRARY_PATH, RTLD_LAZY);
+    if (optimized_op_handler_ == nullptr) {
+      printf("Open optimize shared library failed.\n");
+    }
   }

   ~OptimizeModule() = default;
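Re-enabling the dlopen call means OptimizeModule now really loads the optimized-kernel shared object at construction. For reference, the standard dlopen/dlsym lookup such a handle is used for looks like this (the library path and symbol name below are placeholders for illustration, not values taken from the repo):

#include <dlfcn.h>
#include <cstdio>

int main() {
  // RTLD_LAZY defers symbol resolution until a symbol is first used.
  void *handle = dlopen("liboptimize.so", RTLD_LAZY);
  if (handle == nullptr) {
    printf("Open optimize shared library failed: %s\n", dlerror());
    return 1;
  }
  // Look up one exported kernel by name; the caller casts it to the real
  // function signature before invoking it.
  void *sym = dlsym(handle, "IndirectGemmFp32_8x8");
  if (sym == nullptr) {
    printf("symbol not found: %s\n", dlerror());
  }
  dlclose(handle);
  return 0;
}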


mindspore/lite/src/runtime/kernel/arm/opclib/pack.cc (+10 -11)

@@ -18,20 +18,19 @@
 #include <cstring>
 #include <cstdlib>

-void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight) {
+void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight, int oc_block,
+                    int oc_block_num) {
   // original weight format : ohwi
-  // todo pack weight for arm32 platform
   int kernel_h = conv_param->kernel_h_;
   int kernel_w = conv_param->kernel_w_;
   int in_channel = conv_param->input_channel_;
   int out_channel = conv_param->output_channel_;
-  int oc8 = UP_DIV(out_channel, C8NUM);
   int ic4 = UP_DIV(in_channel, C4NUM);
   int kernel_plane = kernel_h * kernel_w;
-  int pack_weight_size = oc8 * C8NUM * ic4 * C4NUM * kernel_plane;
+  int pack_weight_size = oc_block * oc_block_num * ic4 * C4NUM * kernel_plane;

-  int unit_size = C8NUM * C4NUM;
-  int block_size = pack_weight_size / oc8;
+  int unit_size = oc_block * C4NUM;
+  int block_size = pack_weight_size / oc_block_num;

   for (int m = 0; m < kernel_plane; m++) {
     int kernel_plane_stride = m * in_channel;
@@ -43,12 +42,12 @@ void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed
       int real_ic_num = ic_remainder < C4NUM ? ic_remainder : C4NUM;
       for (int h = 0; h < real_ic_num; h++) {
         int block_stride = channel_block_stride + h;
-        int packed_block_stride = packed_channel_block_size + h * C8NUM;
-        for (int j = 0; j < oc8; j++) {
-          int kernel_block_stride = block_stride + j * C8NUM * kernel_plane * in_channel;
+        int packed_block_stride = packed_channel_block_size + h * oc_block;
+        for (int j = 0; j < oc_block_num; j++) {
+          int kernel_block_stride = block_stride + j * oc_block * kernel_plane * in_channel;
           int packed_kernel_block_size = packed_block_stride + j * block_size;
-          int oc_remainder = out_channel - j * C8NUM;
-          int real_oc_num = oc_remainder < C8NUM ? oc_remainder : C8NUM;
+          int oc_remainder = out_channel - j * oc_block;
+          int real_oc_num = oc_remainder < oc_block ? oc_remainder : oc_block;
           for (int k = 0; k < real_oc_num; k++) {
             float *origin_data_ptr = weight_data + kernel_block_stride + k * kernel_plane * in_channel;
             float *packed_data_ptr = packed_weight + packed_kernel_block_size + k;
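The repacked-weight geometry is easiest to sanity-check with concrete numbers. A small worked example of the size arithmetic (UP_DIV and the block constants are re-declared locally, as a plain function rather than the repo's macro, so the snippet stands alone):

#include <cstdio>

constexpr int C4NUM = 4;
constexpr int C8NUM = 8;
constexpr int UP_DIV(int x, int y) { return (x + y - 1) / y; }

int main() {
  // Example: 3x3 kernel, 3 input channels, 32 output channels
  // (the same shape pack_tests.cc uses below).
  int kernel_plane = 3 * 3, in_channel = 3, out_channel = 32;
  int ic4 = UP_DIV(in_channel, C4NUM);  // 1 input-channel block of 4
#ifdef ENABLE_ARM32
  int oc_block = C4NUM;                 // 4-wide output blocks for the 8x4 kernel
#else
  int oc_block = C8NUM;                 // 8-wide output blocks for the 8x8 kernel
#endif
  int oc_block_num = UP_DIV(out_channel, oc_block);
  int pack_weight_size = oc_block * oc_block_num * ic4 * C4NUM * kernel_plane;
  // For the default branch: 8 * 4 * 1 * 4 * 9 = 1152 floats.
  printf("ic4=%d oc_block=%d oc_block_num=%d pack_weight_size=%d floats\n",
         ic4, oc_block, oc_block_num, pack_weight_size);
  return 0;
}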


mindspore/lite/src/runtime/kernel/arm/opclib/pack.h (+2 -1)

@@ -40,7 +40,8 @@ void MatrixPack(const float *src, float *dst, int row, int ic4, int stride);

 void PackInputToC8Int8(const int8_t *input_data, int16_t *packed_input, ConvParameter *conv_param);

-void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight);
+void PackWeightFp32(float *weight_data, ConvParameter *conv_param, float *packed_weight, int oc_block,
+                    int oc_block_num);

 void PackWeightInt8(int8_t *weight_data, ConvParameter *conv_param, int8_t *packed_weight, int32_t *weight_sum);




mindspore/lite/src/runtime/kernel/arm/opclib/winograd_transform.cc (+7 -8)

@@ -326,18 +326,18 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa
   }
 }

-void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel,
-                                int kernel_plane) {
+void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel, int kernel_plane,
+                                int oc_block) {
   int input_unit = 4;
-  int dst_step = iC4 * C4NUM * C8NUM;
+  int dst_step = iC4 * C4NUM * oc_block;
   for (int o = 0; o < output_channel; o++) {
-    int oc8_block_num = o / C8NUM;
-    int oc8_block_rem = o % C8NUM;
+    int oc_block_num = o / oc_block;
+    int oc_block_rem = o % oc_block;
     int src_oc_offset = o * iC4 * C4NUM * kernel_plane;
-    int dst_oc_offset = oc8_block_num * C8NUM * iC4 * C4NUM * input_unit * input_unit + oc8_block_rem;
+    int dst_oc_offset = oc_block_num * oc_block * iC4 * C4NUM * input_unit * input_unit + oc_block_rem;
     for (int i = 0; i < iC4; i++) {
       float *src_ic4_ptr = weight_data + src_oc_offset + i * kernel_plane * C4NUM;
-      float *dst_ic4_ptr = trans_weight + dst_oc_offset + i * C8NUM * C4NUM;
+      float *dst_ic4_ptr = trans_weight + dst_oc_offset + i * oc_block * C4NUM;
 #ifdef ENABLE_ARM
       float32x4_t g00 = vld1q_f32(src_ic4_ptr);
       float32x4_t g01 = vld1q_f32(src_ic4_ptr + 4);
@@ -1368,4 +1368,3 @@ void Conv3x3Uint8OutputTransform(const int32_t *gemm_out, int8_t *out_data, cons
     }
   }
 }
-


mindspore/lite/src/runtime/kernel/arm/opclib/winograd_transform.h (+2 -2)

@@ -43,7 +43,8 @@ void Conv3x3Fp32InputUnit(const float *tmp_data, float *trans_input_data, size_t
 void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, float *tmp_data, int start_index,
                                int real_cal_num, int out_w_block, ConvParameter *conv_param);

-void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel, int kernel_plane);
+void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel, int kernel_plane,
+                                int oc_block);

 void Conv3x3Fp32OutputUnit(const float *gemm_out, const float *bias_data, float *output_data, bool h_not_bound,
                            bool w_not_bound, int output_w);
@@ -67,4 +68,3 @@ void Conv3x3Uint8OutputTransform(const int32_t *gemm_out, int8_t *out_data, cons
   int real_cal_num, int out_w_block, ConvParameter *conv_param);

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_WINOGRAD_TRANSFORM_H_
-


mindspore/lite/test/ut/src/runtime/kernel/arm/common/pack_tests.cc (+1 -1)

@@ -122,7 +122,7 @@ TEST_F(TestPack, PackWeightFp32) {
   std::string weight_path = "./test_data/conv/convfp32_weight_32_3_3_3.bin";
   auto weight_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(weight_path.c_str(), &weight_size));
   auto packed_weight = reinterpret_cast<float *>(malloc(k_h * k_w * ic4 * C4NUM * oc8 * C8NUM * sizeof(float)));
-  PackWeightFp32(weight_data, conv_param, packed_weight);
+  PackWeightFp32(weight_data, conv_param, packed_weight, C8NUM, oc8);

   printf("==================output data=================\n");
   for (int i = 0; i < 20; i++) {
