add DeConv2d Coder

4 years ago · 6ea18643bd
--- a/mindspore/lite/micro/cmake/file_list.cmake
+++ b/mindspore/lite/micro/cmake/file_list.cmake
@@ -99,6 +99,7 @@ set(CODER_OPCODERS_SRC
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/transpose_fp32_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/splice_fp32_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/exp_fp32_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/deconv2d_fp32_coder.cc
        #### nnacl int8 coder
        ${MICRO_DIR}/coder/opcoders/nnacl/int8/activation_int8_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/int8/add_int8_coder.cc
@@ -188,11 +189,13 @@ set(LITE_KERNEL_SRC
        ${NNACL_DIR}/fp32/winograd_utils.c
        ${NNACL_DIR}/fp32/pack_fp32.c
        ${NNACL_DIR}/fp32/arithmetic_fp32.c
        ${NNACL_DIR}/fp32/deconv_fp32.c
        ${NNACL_DIR}/fp32/matmul_fp32.c
        ${NNACL_DIR}/fp32/common_func_fp32.c
        ${NNACL_DIR}/int8/quantize.c
        ${NNACL_DIR}/int8/pack_int8.c
        ${NNACL_DIR}/int8/matmul_int8.c
        ${NNACL_DIR}/int8/fixed_point.c
        ${NNACL_DIR}/fp32/matmul_fp32.c
        ${NNACL_DIR}/int8/arithmetic_int8.c
        ${NNACL_DIR}/int8/add_int8.c
        ${NNACL_DIR}/int8/concat_int8.c
@@ -288,6 +291,8 @@ set(LITE_KERNEL_SRC
 if("${X86_64_SIMD}" STREQUAL "sse")
    set(SSE_SRC
            ${NNACL_DIR}/intrinsics/sse/MatMul_Sse.c
            ${NNACL_DIR}/intrinsics/sse/PostFuncBiasReluC8.c
            ${NNACL_DIR}/intrinsics/sse/PostFuncBiasReluC4.c
            )
    set_property(SOURCE ${SSE_SRC} PROPERTY LANGUAGE C)
 endif()
@@ -299,6 +304,8 @@ if("${X86_64_SIMD}" STREQUAL "avx")
    set(AVX_SRC
            ${NNACL_DIR}/intrinsics/avx/common_utils.c
            ${NNACL_DIR}/intrinsics/sse/MatMul_Sse.c
            ${NNACL_DIR}/intrinsics/sse/PostFuncBiasReluC8.c
            ${NNACL_DIR}/intrinsics/sse/PostFuncBiasReluC4.c
            ${NNACL_DIR}/assembly/avx/MatmulAvx.S
            )
    set_property(SOURCE ${AVX_SRC} PROPERTY LANGUAGE C)
--- a/mindspore/lite/micro/cmake/package_wrapper.cmake
+++ b/mindspore/lite/micro/cmake/package_wrapper.cmake
@@ -7,6 +7,7 @@ set(WRAPPER_SRC
        ${WRAPPER_DIR}/base/optimize_handler_wrapper.c
        ${WRAPPER_DIR}/fp32/matmul_fp32_wrapper.c
        ${WRAPPER_DIR}/fp32/arithmetic_fp32_wrapper.c
        ${WRAPPER_DIR}/fp32/deconvolution_fp32_wrapper.c
        ${WRAPPER_DIR}/int8/matmul_int8_wrapper.c
        ${WRAPPER_DIR}/int8/add_int8_wrapper.c
        ${WRAPPER_DIR}/int8/concat_int8_wrapper.c
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/deconv2d_fp32_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/deconv2d_fp32_coder.cc
@@ -0,0 +1,196 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "coder/opcoders/nnacl/fp32/deconv2d_fp32_coder.h"
 #include <memory>
 #include <string>
 #include <vector>
 #include "coder/opcoders/nnacl/fp32/convolution_winograd_fp32_coder.h"
 #include "nnacl/fp32/winograd_utils.h"
 #include "coder/opcoders/file_collector.h"
 #include "coder/log.h"
 #include "coder/opcoders/parallel.h"
 #include "src/common/version_manager.h"
 #include "coder/opcoders/nnacl/dequant/de_quant.h"

 using mindspore::schema::PrimitiveType_Conv2dTransposeFusion;
 namespace mindspore::lite::micro::nnacl {
 // Sizes and reserves the three workspace buffers used by the generated run code:
 // the packed output, the matmul scratch buffer and the packed input.
 // Returns RET_OK once all three allocations have been checked by MS_CHECK_PTR.
 int DeConvolutionFP32Coder::InitRunBuf() {
  // Aligned row count for the current target: the 4-aligned value on ARM32, the 12-aligned one otherwise.
  const int aligned_rows = (target_ == kARM32A) ? matmul_param_.row_4_ : matmul_param_.row_12_;

  // Packed output: output channels rounded up to a multiple of C8NUM, times the output plane.
  pack_output_size_ = UP_ROUND(conv_param_->output_channel_, C8NUM) * output_plane_ * sizeof(float);
  packed_output_ = reinterpret_cast<float *>(allocator_->Malloc(kNumberTypeFloat32, pack_output_size_, kWorkspace));
  MS_CHECK_PTR(packed_output_);

  // Matmul scratch buffer: aligned rows by the C8-aligned column count.
  tmp_buffer_size_ = aligned_rows * matmul_param_.col_8_ * sizeof(float);
  tmp_buffer_ = reinterpret_cast<float *>(allocator_->Malloc(kNumberTypeFloat32, tmp_buffer_size_, kWorkspace));
  MS_CHECK_PTR(tmp_buffer_);

  // Packed input: aligned rows by the matmul depth.
  pack_input_size_ = aligned_rows * matmul_param_.deep_ * sizeof(float);
  packed_input_ = reinterpret_cast<float *>(allocator_->Malloc(kNumberTypeFloat32, pack_input_size_, kWorkspace));
  MS_CHECK_PTR(packed_input_);

  return RET_OK;
 }

 // Derives the plane sizes and the matmul dimensions from the convolution parameter.
 // Always returns RET_OK.
 int DeConvolutionFP32Coder::InitParam() {
  const auto &conv = *conv_param_;

  // Number of spatial positions in the input, the kernel and the output.
  input_plane_ = conv.input_h_ * conv.input_w_;
  kernel_plane_ = conv.kernel_w_ * conv.kernel_h_;
  output_plane_ = conv.output_h_ * conv.output_w_;

  // Matmul shape: rows = input plane, depth = input channels,
  // columns = output channels * kernel plane.
  const int rows = input_plane_;
  matmul_param_.row_ = rows;
  matmul_param_.deep_ = conv.input_channel_;
  matmul_param_.col_ = conv.output_channel_ * kernel_plane_;

  // Aligned variants used when sizing the packed buffers.
  matmul_param_.row_12_ = UP_ROUND(rows, C12NUM);
  matmul_param_.row_4_ = UP_ROUND(rows, C4NUM);
  matmul_param_.col_8_ = UP_ROUND(conv.output_channel_, C8NUM) * kernel_plane_;
  return RET_OK;
 }

 // Prepares the coder for code generation: runs the shared Conv2DBaseCoder initialisation,
 // emits the weight/bias packing code through InitWeightBias(), then returns the result of Resize().
 int DeConvolutionFP32Coder::Prepare(CoderContext *const context) {
  MS_CHECK_RET_CODE(Conv2DBaseCoder::Init(), "Conv2DBaseCoder::Init() failed.");
  MS_CHECK_RET_CODE(InitWeightBias(context), "Init weight bias failed.");
  // Resize() calls Conv2DBaseCoder::Init() again before deriving the matmul sizes and run buffers.
  return Resize();
 }

 // Recomputes everything that depends on the tensor shapes: base convolution state,
 // matmul dimensions (InitParam) and workspace buffers (InitRunBuf).
 // Returns RET_OK when all three steps pass their MS_CHECK_RET_CODE checks.
 int DeConvolutionFP32Coder::Resize() {
  MS_CHECK_RET_CODE(Conv2DBaseCoder::Init(), "init failed.");
  MS_CHECK_RET_CODE(InitParam(), "init param  failed.");
  // InitRunBuf() reads the matmul_param_ fields filled in by InitParam(), so it must run after it.
  MS_CHECK_RET_CODE(InitRunBuf(), "init run buffer failed.");
  return RET_OK;
 }

 // Emits the one-time initialisation code that allocates and fills the packed bias
 // (only when a bias input is present) and the packed weight.
 //
 // The channel counts are read from the filter tensor (Channel() -> input channels,
 // Batch() -> output channels) and written back into conv_param_.
 //
 // context: receives the generated init code via AppendInitCode().
 // Returns RET_OK on success; MS_CHECK_PTR handles a failed allocation.
 int DeConvolutionFP32Coder::InitWeightBias(CoderContext *const context) {
  int kernel_h = filter_tensor_->Height();
  int kernel_w = filter_tensor_->Width();
  int in_channel = filter_tensor_->Channel();
  int out_channel = filter_tensor_->Batch();
  conv_param_->input_channel_ = in_channel;
  conv_param_->output_channel_ = out_channel;

  const bool has_bias = input_tensors_.size() == kInputSize2;
  if (has_bias) {
    bias_data_size_ = UP_ROUND(out_channel, C4NUM) * sizeof(float);
    packed_bias_ = reinterpret_cast<float *>(allocator_->Malloc(kNumberTypeFloat32, kOnlineSize, kOnlinePackWeight));
    MS_CHECK_PTR(packed_bias_);
  }

  // Packed weight: output channels are rounded up to a multiple of C8NUM.
  int kernel_plane = kernel_h * kernel_w;
  int pack_weight_size = in_channel * kernel_plane;
  pack_weight_size_ = pack_weight_size * UP_ROUND(out_channel, C8NUM) * sizeof(float);

  packed_weight_ = reinterpret_cast<float *>(allocator_->Malloc(kNumberTypeFloat32, kOnlineSize, kOnlinePackWeight));
  MS_CHECK_PTR(packed_weight_);

  NNaclFp32Serializer init_code;
  if (has_bias) {
    init_code.CodeMallocExpression(packed_bias_, bias_data_size_);
    // Clear exactly the bytes that were allocated for the bias. Using the (larger) weight
    // size here would make the generated memset write past the end of the bias buffer.
    init_code.CodeFunction("memset", packed_bias_, 0, bias_data_size_);
    init_code.CodeFunction("memcpy", packed_bias_, bias_tensor_, out_channel * sizeof(float));
  }

  init_code.CodeMallocExpression(packed_weight_, pack_weight_size_);
  init_code.CodeFunction("memset", packed_weight_, 0, pack_weight_size_);
  init_code.CodeFunction("PackNHWCToC8HWN8Fp32", filter_tensor_, packed_weight_, in_channel, kernel_plane, out_channel);

  context->AppendInitCode(init_code.str());
  return RET_OK;
 }

 // Generates the inference code for the deconvolution operator.
 //
 // Registers the headers, C sources and (for ARM targets) assembly files the generated code
 // depends on, then emits, for every input batch: input packing, the DeConvFp32Args argument
 // struct and the call to DeConvFp32Run (direct or through the parallel launcher).
 //
 // context: receives the collected file lists and the generated code via AppendCode().
 // Always returns RET_OK.
 int DeConvolutionFP32Coder::DoCode(CoderContext *const context) {
  Collect(context,
          {
            "wrapper/fp32/deconvolution_fp32_wrapper.h",
            "nnacl/fp32/conv_common_fp32.h",
            "nnacl/pack.h",
            "nnacl/fp32/common_func_fp32.h",
            "nnacl/base/minimal_filtering_generator.h",
            "nnacl/fp32/matmul_fp32.h",
            "nnacl/conv_parameter.h",
            "nnacl/matmul_parameter.h",
            "nnacl/op_base.h",
          },
          {
            "deconvolution_fp32_wrapper.c",
            "common_func.c",
            "conv_common_fp32.c",
            "matmul_fp32.c",
            "pack_fp32.c",
            "deconv_fp32.c",
            "minimal_filter_generator.c",
          });
  if (target_ == kARM32A) {
    Collect(context, {}, {},
            {
              "MatmulFp32.S",
              "MatmulFp32Opt.S",
              "PreSum4x16Int8Peroc.S",
              "PreSum4x16Int8Pert.S",
              "IndirectGemmInt16to32_8x4.S",
              "MatmulInt8.S",
              "MatmulFp32Opt12x4.S",
            });
  } else if (target_ == kARM64) {
    // Each assembly file is listed once ("PreSum4x16Int8Peroc.S" used to appear twice).
    Collect(context, {}, {},
            {
              "MatmulFp32.S",
              "MatmulFp32Opt.S",
              "PreSum4x16Int8Peroc.S",
              "MatVecMulFp32.S",
              "PreSum4x16Int8Pert.S",
              "IndirectGemmInt16to32_8x4.S",
              "MatmulInt8.S",
            });
  }

  NNaclFp32Serializer code;
  // Clear the workspace buffers and emit the parameter structs used by the run function.
  code.CodeFunction("memset", packed_input_, "0", pack_input_size_);
  code.CodeFunction("memset", packed_output_, "0", pack_output_size_);
  code.CodeFunction("memset", tmp_buffer_, "0", tmp_buffer_size_);
  code.CodeStruct("conv_parameter", *conv_param_);
  code.CodeStruct("matmul_parameter", matmul_param_);

  std::string src_in_ptr_str = allocator_->GetRuntimeAddr(input_tensor_);
  std::string src_out_ptr_str = allocator_->GetRuntimeAddr(output_tensor_);

  // NOTE(review): the argument struct below is emitted once per batch iteration; confirm that
  // emitting it repeatedly is valid in the generated code when input_batch_ > 1.
  for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) {
    // Build "<base> + <element offset>" pointer expressions. Without the explicit '+', the
    // offset digits would be glued onto the end of the base address text.
    // Assumes GetRuntimeAddr yields a float pointer expression, so the offset counts floats — TODO confirm.
    const int input_offset = batch_index * input_plane_ * conv_param_->input_channel_;
    const int output_offset = batch_index * output_plane_ * conv_param_->output_channel_;
    input_ptr_ = src_in_ptr_str + " + " + std::to_string(input_offset);
    output_ptr_ = src_out_ptr_str + " + " + std::to_string(output_offset);

    if (target_ == kARM32A) {
      code.CodeFunction("RowMajor2Col4Major", input_ptr_, packed_input_, matmul_param_.row_, matmul_param_.deep_);
    } else {
      code.CodeFunction("RowMajor2Col12Major", input_ptr_, packed_input_, matmul_param_.row_, matmul_param_.deep_);
    }
    code.CodeBaseStruct("DeConvFp32Args", kRunArgs, packed_input_, packed_weight_, packed_bias_, packed_output_,
                        output_ptr_, tmp_buffer_, "&matmul_parameter", "&conv_parameter");
    if (!support_parallel_) {
      code.CodeFunction("DeConvFp32Run", kRunArgsAddr, kDefaultTaskId);
    } else {
      code.CodeFunction(kParallelLaunch, gThreadPool, "DeConvFp32Run", kRunArgsAddr, "conv_parameter.thread_num_");
    }
  }
  context->AppendCode(code.str());
  return RET_OK;
 }
 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Conv2dTransposeFusion,
                   CPUOpCoderCreator<DeConvolutionFP32Coder>);
 }  // namespace mindspore::lite::micro::nnacl
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/deconv2d_fp32_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/deconv2d_fp32_coder.h
@@ -0,0 +1,65 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_MICRO_CODER_OPCODERS_NNACL_FP32_DECONV2D_FP32_CODER_H_
 #define MINDSPORE_LITE_MICRO_CODER_OPCODERS_NNACL_FP32_DECONV2D_FP32_CODER_H_

 #include <vector>
 #include <string>
 #include "nnacl/conv_parameter.h"
 #include "coder/opcoders/base/conv2d_base_coder.h"
 #include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
 #include "nnacl/fp32/deconv_fp32.h"
 #include "nnacl/fp32/matmul_fp32.h"

 namespace mindspore::lite::micro::nnacl {
 // Operator coder that generates fp32 code for the Conv2dTransposeFusion (deconvolution) operator.
 // Prepare() computes sizes and emits the weight/bias packing code; DoCode() emits the run code.
 class DeConvolutionFP32Coder final : public Conv2DBaseCoder {
 public:
  DeConvolutionFP32Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
                         const Model::Node *node, size_t node_index, Target target)
      : Conv2DBaseCoder(in_tensors, out_tensors, node, node_index, target) {}

  // Initialises parameters and buffers and emits the init code for packed weight and bias.
  int Prepare(CoderContext *const context) override;

  // Emits the per-batch inference code and registers the files it depends on.
  int DoCode(CoderContext *const context) override;

  ~DeConvolutionFP32Coder() override = default;

 private:
  // Emits the init code that allocates and fills packed_bias_ / packed_weight_.
  int InitWeightBias(CoderContext *const context);
  // Re-runs the base initialisation, then InitParam() and InitRunBuf().
  int Resize();
  // Sizes and allocates packed_output_, tmp_buffer_ and packed_input_.
  int InitRunBuf();
  // Computes the plane sizes and fills matmul_param_.
  int InitParam();

  // Matmul dimensions derived from the convolution parameter in InitParam().
  MatMulParameter matmul_param_{};
  // Byte sizes of the corresponding buffers below.
  size_t pack_output_size_{0};
  size_t tmp_buffer_size_{0};
  size_t pack_input_size_{0};
  size_t bias_data_size_{0};
  size_t pack_weight_size_{0};
  // height * width of the input, the kernel and the output.
  int input_plane_{0};
  int kernel_plane_{0};
  int output_plane_{0};
  // Buffers obtained from the allocator; packed_bias_ stays nullptr when there is no bias input.
  float *packed_bias_{nullptr};
  float *packed_weight_{nullptr};
  float *packed_input_{nullptr};
  float *packed_output_{nullptr};
  float *tmp_buffer_{nullptr};
  // Per-batch source/destination pointer expressions written into the generated code by DoCode().
  std::string input_ptr_;
  std::string output_ptr_;
 };
 }  // namespace mindspore::lite::micro::nnacl
 #endif  // MINDSPORE_LITE_MICRO_CODER_OPCODERS_NNACL_FP32_DECONV2D_FP32_CODER_H_
--- a/mindspore/lite/micro/coder/wrapper/fp32/deconvolution_fp32_wrapper.c
+++ b/mindspore/lite/micro/coder/wrapper/fp32/deconvolution_fp32_wrapper.c
@@ -0,0 +1,69 @@
 /*
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "wrapper/fp32/deconvolution_fp32_wrapper.h"
 #include "nnacl/fp32/deconv_fp32.h"
 #include "nnacl/fp32/matmul_fp32.h"

 // Computes the slice of a deconvolution assigned to one task.
 //
 // The output channels are split into blocks of C8NUM and distributed over at most
 // conv_param->thread_num_ tasks. The task identified by task_id runs one matmul of the
 // packed input against its share of the packed weight into the scratch buffer, then
 // DeConvPostFp32C8 produces the final output for those channels.
 //
 // packed_input / packed_weight: packed operands of the matmul.
 // packed_bias: per-channel bias, may be NULL when the operator has no bias.
 //   NOTE(review): assumes DeConvPostFp32C8 accepts a NULL bias — confirm against nnacl.
 // packed_output, output, tmp_ori_buffer: destination and scratch buffers.
 // Returns NNACL_OK, including when this task has no channels to process.
 int DoDeconvFp32(const float *packed_input, const float *packed_weight, const float *packed_bias, float *packed_output,
                 float *output, float *tmp_ori_buffer, const MatMulParameter *matmul_param,
                 const ConvParameter *conv_param, int task_id) {
  int oc_blocks = UP_DIV(conv_param->output_channel_, C8NUM);
  int thread_count = MSMIN(conv_param->thread_num_, oc_blocks);
  if (thread_count <= 0) {
    // No channel blocks or no threads: nothing can be scheduled, and the division below
    // would otherwise divide by zero.
    return NNACL_OK;
  }
  int thread_stride = UP_DIV(oc_blocks, thread_count);
  int res_stride = oc_blocks - task_id * thread_stride;
  int oc = MSMIN(thread_stride, res_stride);
  int cur_stride = thread_stride * C8NUM;
  res_stride = conv_param->output_channel_ - task_id * thread_stride * C8NUM;
  int oc_res = MSMIN(cur_stride, res_stride);
  if (oc <= 0 || oc_res <= 0) {
    return NNACL_OK;
  }

  int kernel_plane = conv_param->kernel_w_ * conv_param->kernel_h_;
  int output_plane = conv_param->output_h_ * conv_param->output_w_;
  // First output channel handled by this task.
  int oc_start = task_id * thread_stride * C8NUM;

  // Aligned row count used for the matmul and the scratch-buffer offset; build dependent.
 #if defined(ENABLE_ARM32)
  int row_aligned = matmul_param->row_4_;
 #else
  int row_aligned = matmul_param->row_12_;
 #endif

  float *tmp_buffer = tmp_ori_buffer + oc_start * kernel_plane * row_aligned;
  MatMulOpt(packed_input, packed_weight + oc_start * kernel_plane * matmul_param->deep_, tmp_buffer, NULL, ActType_No,
            matmul_param->deep_, row_aligned, oc * C8NUM * kernel_plane, matmul_param->col_, OutType_C8);

  // Only offset the bias when there is one; offsetting NULL would yield a bogus non-NULL pointer.
  const float *bias = (packed_bias == NULL) ? NULL : packed_bias + oc_start;
  DeConvPostFp32C8(tmp_buffer, packed_output + oc_start * output_plane, bias, output + oc_start, oc_res, conv_param);
  return NNACL_OK;
 }

 // Task entry point with the (void *cdata, int task_id) signature emitted by the coder.
 //
 // cdata: pointer to a DeConvFp32Args struct describing the buffers and parameters.
 // task_id: index of the slice this invocation should compute.
 // Returns the status reported by DoDeconvFp32 instead of discarding it.
 int DeConvFp32Run(void *cdata, int task_id) {
  DeConvFp32Args *args = (DeConvFp32Args *)cdata;
  return DoDeconvFp32(args->packed_input_, args->packed_weight_, args->packed_bias_, args->packed_output_,
                      args->output_, args->tmp_buffer_, args->matmul_param_, args->conv_param_, task_id);
 }
--- a/mindspore/lite/micro/coder/wrapper/fp32/deconvolution_fp32_wrapper.h
+++ b/mindspore/lite/micro/coder/wrapper/fp32/deconvolution_fp32_wrapper.h
@@ -0,0 +1,48 @@
 /*
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_MICRO_CODER_WRAPPER_DECONVOLUTION_FP32_WRAPPER_H_
 #define MINDSPORE_LITE_MICRO_CODER_WRAPPER_DECONVOLUTION_FP32_WRAPPER_H_

 #include "nnacl/errorcode.h"
 #include "nnacl/conv_parameter.h"
 #include "nnacl/matmul_parameter.h"

 // Argument bundle handed to DeConvFp32Run through its void *cdata parameter.
 typedef struct {
  const float *packed_input_;   // packed input operand of the matmul
  const float *packed_weight_;  // packed weight operand of the matmul
  const float *packed_bias_;    // per-channel bias passed on to DeConvPostFp32C8
  float *packed_output_;        // intermediate packed output buffer
  float *output_;               // final output buffer
  float *tmp_buffer_;           // scratch buffer receiving the matmul result
  const MatMulParameter *matmul_param_;  // matmul dimensions
  const ConvParameter *conv_param_;      // convolution dimensions and thread count
 } DeConvFp32Args;

 #ifdef __cplusplus
 extern "C" {
 #endif

 // Computes the deconvolution slice assigned to task_id: a matmul of packed_input against the
 // task's share of packed_weight into tmp_ori_buffer, followed by DeConvPostFp32C8 into output.
 // Returns NNACL_OK.
 int DoDeconvFp32(const float *packed_input, const float *packed_weight, const float *packed_bias, float *packed_output,
                 float *output, float *tmp_ori_buffer, const MatMulParameter *matmul_param,
                 const ConvParameter *conv_param, int task_id);

 // Task entry point: cdata must point to a DeConvFp32Args; forwards its fields to DoDeconvFp32.
 int DeConvFp32Run(void *cdata, int task_id);

 #ifdef __cplusplus
 }
 #endif
 #endif  // MINDSPORE_LITE_MICRO_CODER_WRAPPER_DECONVOLUTION_FP32_WRAPPER_H_