diff --git a/mindspore/lite/micro/cmake/file_list.cmake b/mindspore/lite/micro/cmake/file_list.cmake
index 45e6dda842..e0842dd1d1 100644
--- a/mindspore/lite/micro/cmake/file_list.cmake
+++ b/mindspore/lite/micro/cmake/file_list.cmake
@@ -81,6 +81,7 @@ set(CODER_OPCODERS_SRC
         ${MICRO_DIR}/coder/opcoders/nnacl/int8/concat_int8_coder.cc
         ${MICRO_DIR}/coder/opcoders/nnacl/int8/fullconnection_int8_coder.cc
         ${MICRO_DIR}/coder/opcoders/nnacl/int8/matmul_int8_coder.cc
+        ${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_1x1_int8_coder.cc
         ${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_3x3_int8_coder.cc
         ${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_int8_coder.cc
         ${MICRO_DIR}/coder/opcoders/nnacl/int8/pooling_int8_coder.cc
@@ -126,13 +127,10 @@ set(LITE_KERNEL_SRC
         ${LITE_DIR}/nnacl/int8/fixed_point.c
         ${LITE_DIR}/nnacl/fp32/matmul_fp32.c
         ${LITE_DIR}/nnacl/int8/conv3x3_int8.c
-        )
-set(MICRO_ADAPTER_SRC
-        ${MICRO_DIR}/wrapper/fp32/matmul_fp32_wrapper.c
-        ${MICRO_DIR}/wrapper/int8/matmul_int8_wrapper.c
-        ${MICRO_DIR}/wrapper/int8/conv_init_int8.c
+        ${LITE_DIR}/nnacl/int8/conv1x1_int8.c
+        ${LITE_DIR}/nnacl/base/conv1x1_base.c
         )
 list(APPEND FILE_SET ${CODER_SRC} ${CODER_UTILS_SRC} ${CODER_OPCODERS_SRC} ${CODER_GENERATOR_SRC}
-        ${CODER_ALLOCATOR_SRC} ${LITE_SRC} ${LITE_KERNEL_SRC} ${MICRO_ADAPTER_SRC})
+        ${CODER_ALLOCATOR_SRC} ${LITE_SRC} ${LITE_KERNEL_SRC})
diff --git a/mindspore/lite/micro/cmake/wrapper.cmake b/mindspore/lite/micro/cmake/wrapper.cmake
new file mode 100644
index 0000000000..fa24daa627
--- /dev/null
+++ b/mindspore/lite/micro/cmake/wrapper.cmake
@@ -0,0 +1,12 @@
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
+
+set(MICRO_WRAPPER_SRC
+        ${LITE_DIR}/src/runtime/thread_pool.c
+        ${MICRO_DIR}/wrapper/fp32/matmul_fp32_wrapper.c
+        ${MICRO_DIR}/wrapper/int8/matmul_int8_wrapper.c
+        ${MICRO_DIR}/wrapper/int8/conv_init_int8.c
+        ${MICRO_DIR}/wrapper/int8/conv1x1_init_int8.c
+        ${MICRO_DIR}/wrapper/int8/conv1x1_run_int8.c
+        )
+
+list(APPEND FILE_SET ${MICRO_WRAPPER_SRC})
\ No newline at end of file
diff --git a/mindspore/lite/micro/coder/CMakeLists.txt b/mindspore/lite/micro/coder/CMakeLists.txt
index 14b61a1775..3f87bef7d7 100644
--- a/mindspore/lite/micro/coder/CMakeLists.txt
+++ b/mindspore/lite/micro/coder/CMakeLists.txt
@@ -19,6 +19,7 @@ include_directories(${TOP_DIR}/mindspore/core/)
 #include coder
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../)
 include(${MICRO_DIR}/cmake/file_list.cmake)
+include(${MICRO_DIR}/cmake/wrapper.cmake)
 add_executable(codegen main.cc ${FILE_SET})
 add_dependencies(codegen fbs_src)
 add_dependencies(codegen fbs_inner_src)
diff --git a/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_1x1_int8_coder.cc b/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_1x1_int8_coder.cc
new file mode 100644
index 0000000000..8966f4715e
--- /dev/null
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_1x1_int8_coder.cc
@@ -0,0 +1,193 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "micro/coder/opcoders/nnacl/int8/conv2d_1x1_int8_coder.h"
+#include <new>
+#include <string>
+#include "securec/include/securec.h"
+#include "src/runtime/kernel/arm/base/convolution_base.h"
+#include "micro/coder/opcoders/file_collector.h"
+#include "micro/coder/log.h"
+#include "micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.h"
+
+namespace mindspore::lite::micro::nnacl {
+
+int Conv2D1x1Int8Coder::Prepare(CoderContext *const context) {
+  matmul_param_ = new (std::nothrow) MatMulParameter();
+  MS_CHECK_PTR(matmul_param_);
+  MS_CHECK_RET_CODE(Conv2DBaseCoder::Init(), "Init failed");
+  MS_CHECK_RET_CODE(Conv2DBaseCoder::SetQuantParam(), "SetQuantParam failed");
+  filter_peroc_ = (conv_param_->conv_quant_arg_.filter_arg_num_ != kPerTensor);
+  if (filter_peroc_) {
+    MS_CHECK_RET_CODE(InitFilterPeroc(), "InitFilterPeroc failed");
+  }
+  CheckSupportOptimize();
+  MS_CHECK_RET_CODE(InitWeightBias(context), "InitWeightBias failed");
+  MS_CHECK_RET_CODE(InitParam(), "InitParam failed");
+  MS_CHECK_RET_CODE(InitRunBuf(), "InitRunBuf failed");
+  return RET_OK;
+}
+
+int Conv2D1x1Int8Coder::DoCode(CoderContext *const context) {
+  Collect(context,
+          {"nnacl/int8/conv1x1_int8.h", "nnacl/common_func.h", "wrapper/int8/conv1x1_init_int8.h",
+           "wrapper/int8/conv1x1_run_int8.h"},
+          {"common_func.c", "pack.c", "conv1x1_int8.c", "matmul_int8.c", "fixed_point.c", "conv1x1_init_int8.c",
+           "conv1x1_run_int8.c"});
+
+  nnacl::NNaclInt8Serializer code;
+
+  code.CodeStruct("conv_param", *conv_param_);
+  code.CodeStruct("matmul_param", *matmul_param_);
+
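+  // The CodeStruct/CodeBaseStruct/CodeFunction calls below paste source text into
+  // the generated file; the emitted code is expected to look roughly like this
+  // (identifiers illustrative, not the exact generated names):
+  //   ConvParameter conv_param = {...};
+  //   MatMulParameter matmul_param = {...};
+  //   Conv1x1Args args = {input_sum, filter_zp, ..., &conv_param, &matmul_param, ...};
+  //   Conv1x1Run(g_input, &args, THREAD_POOL_DEFAULT, thread_num, g_output);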
+  code.CodeBaseStruct("Conv1x1Args", "args", input_sum_, filter_zp_ptr_, left_shift_, right_shift_, multiplier_,
+                      packed_weight_, bias_data_, packed_input_, nullptr, nullptr, 0, 0, "conv_param", "matmul_param",
+                      matmul_func_, pre_trans_input_, support_optimize_, filter_peroc_);
+
+  code.CodeFunction("Conv1x1Run", input_tensor_, "args", "THREAD_POOL_DEFAULT", thread_num_s_, output_tensor_);
+
+  context->AppendCode(code.str());
+  return RET_OK;
+}
+
+void Conv2D1x1Int8Coder::CheckSupportOptimize() {
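+  // Codegen runs on the host and cannot probe the deployment CPU, so the portable
+  // 4x16 kernel name is the default. For ARM64 targets the SDOT-optimized handler
+  // is emitted instead; the generated init code still consults GetSupportOptFlag()
+  // at runtime before committing to that weight layout (see InitWeightBias).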
+  support_optimize_ = false;
+  matmul_func_ = "MatMulInt8_4x16_r";
+  if (target_ == kARM64) {
+    matmul_func_ = "MatMulDpInt8_optimize_handler";
+  }
+}
+
+int Conv2D1x1Int8Coder::InitWeightBias(CoderContext *const context) {
+  int32_t input_channel = filter_tensor_->Channel();
+  int32_t output_channel = filter_tensor_->Batch();
+  int32_t input_zp = conv_param_->conv_quant_arg_.input_quant_args_[0].zp_;
+
+  nnacl::NNaclInt8Serializer code;
+
+  packed_weight_ = static_cast<int8_t *>(allocator_->Malloc(kNumberTypeInt8, kOnlineSize, kOnlinePackWeight));
+  MS_CHECK_PTR(packed_weight_);
+  bias_data_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, kOnlineSize, kOnlinePackWeight));
+  MS_CHECK_PTR(bias_data_);
+
+  std::string packed_weight_str = "(int8_t **)&" + allocator_->GetRuntimeAddr(packed_weight_);
+  std::string bias_data_str = "(int32_t **)&" + allocator_->GetRuntimeAddr(bias_data_);
+  std::string filter_zp_str;
+  if (filter_peroc_) {
+    filter_zp_str = allocator_->GetRuntimeAddr(filter_zp_ptr_);
+  } else {
+    MS_CHECK_PTR(conv_param_->conv_quant_arg_.filter_quant_args_);
+    filter_zp_str = "filter_zp";
+    code << "int32_t filter_zp[1] = {" << conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_ << "};\n";
+  }
+
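+  // kOnlinePackWeight: the two buffers above are only declared by codegen; the
+  // Conv1x1Init call emitted below repacks the weights and folds the zero points
+  // into the bias on the target device, at model-init time.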
+  if (target_ == kARM64) {
+    code.CodeFunctionWithCheck("Conv1x1Init", filter_tensor_, bias_tensor_, filter_zp_str, input_channel,
+                               output_channel, input_zp, "GetSupportOptFlag()", filter_peroc_, packed_weight_str,
+                               bias_data_str);
+  } else {
+    code.CodeFunctionWithCheck("Conv1x1Init", filter_tensor_, bias_tensor_, filter_zp_str, input_channel,
+                               output_channel, input_zp, support_optimize_, filter_peroc_, packed_weight_str,
+                               bias_data_str);
+  }
+
+  context->AppendInitCode(code.str());
+  return RET_OK;
+}
+
+int Conv2D1x1Int8Coder::InitFilterPeroc() {
+  int32_t output_channel = filter_tensor_->Batch();
+  int round_oc;
+  if (target_ == kARM32A) {
+    round_oc = UP_ROUND(output_channel, C2NUM);
+  } else {
+    round_oc = MSMAX(UP_ROUND(output_channel, C16NUM), UP_ROUND(output_channel, C4NUM));
+  }
+
+  MS_CHECK_TRUE(conv_quant_arg_->filter_arg_num_ == static_cast<size_t>(output_channel),
+                "per-channel weight quant param count does not match the filter count; filter is not per-channel");
+  size_t output_size = output_channel * sizeof(int32_t);
+  size_t oc_size = round_oc * sizeof(int32_t);
+
+  /* filter zero points */
+  filter_zp_ptr_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, output_size, kOfflinePackWeight));
+  MS_CHECK_PTR(filter_zp_ptr_);
+  MS_CHECK_PTR(conv_param_->conv_quant_arg_.filter_quant_args_);
+  for (int fi = 0; fi < output_channel; fi++) {
+    filter_zp_ptr_[fi] = conv_param_->conv_quant_arg_.filter_quant_args_[fi].zp_;
+  }
+
+  /* left shift */
+  left_shift_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, oc_size, kOfflinePackWeight));
+  MS_CHECK_PTR(left_shift_);
+  MS_CHECK_RET_CODE(memset_s(left_shift_, oc_size, 0, oc_size), "memset left_shift_ failed");
+  MS_CHECK_RET_CODE(memcpy_s(left_shift_, oc_size, conv_param_->conv_quant_arg_.left_shift_, output_size),
+                    "memcpy_s left_shift_ failed");
+
+  /* right shift */
+  right_shift_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, oc_size, kOfflinePackWeight));
+  MS_CHECK_PTR(right_shift_);
+  MS_CHECK_RET_CODE(memset_s(right_shift_, oc_size, 0, oc_size), "memset right_shift_ failed");
+  MS_CHECK_RET_CODE(memcpy_s(right_shift_, oc_size, conv_param_->conv_quant_arg_.right_shift_, output_size),
+                    "memcpy_s right_shift_ failed");
+
+  /* multiplier */
+  multiplier_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, oc_size, kOfflinePackWeight));
+  MS_CHECK_PTR(multiplier_);
+  MS_CHECK_RET_CODE(memset_s(multiplier_, oc_size, 0, oc_size), "memset multiplier_ failed");
+  MS_CHECK_RET_CODE(memcpy_s(multiplier_, oc_size, conv_param_->conv_quant_arg_.quant_multiplier_, output_size),
+                    "memcpy_s multiplier_ failed");
+
+  return RET_OK;
+}
+
+int Conv2D1x1Int8Coder::InitParam() {
+  pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 ||
+                      conv_param_->stride_w_ != 1);
+
+  matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_;
+  matmul_param_->deep_ = conv_param_->input_channel_;
+  matmul_param_->col_ = conv_param_->output_channel_;
+  matmul_param_->row_4_ = UP_ROUND(matmul_param_->row_, C4NUM);
+  matmul_param_->deep_4_ = UP_ROUND(matmul_param_->deep_, C4NUM);
+  matmul_param_->deep_16_ = UP_ROUND(matmul_param_->deep_, C16NUM);
+
+  int row_pack_count = C4NUM;
+  /* init input sum size */
+  input_sum_size_ = UP_ROUND(matmul_param_->row_, row_pack_count);
+
+  if (pre_trans_input_) {
+    input_ptr_ = reinterpret_cast<int8_t *>(
+      allocator_->Malloc(kNumberTypeInt8, matmul_param_->row_ * matmul_param_->deep_ * sizeof(int8_t), kWorkspace));
+    MS_CHECK_PTR(input_ptr_);
+  }
+
+  return RET_OK;
+}
+
+int Conv2D1x1Int8Coder::InitRunBuf() {
+  input_sum_ =
+    reinterpret_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, input_sum_size_ * sizeof(int32_t), kWorkspace));
+  MS_CHECK_PTR(input_sum_);
+
+  size_t size = MSMAX(UP_ROUND(matmul_param_->row_, C8NUM) * UP_ROUND(matmul_param_->deep_, C4NUM),
+                      UP_ROUND(matmul_param_->row_, C4NUM) * UP_ROUND(matmul_param_->deep_, C16NUM));
+
+  packed_input_ = reinterpret_cast<int8_t *>(allocator_->Malloc(kNumberTypeInt8, size * sizeof(int8_t), kWorkspace));
+  MS_CHECK_PTR(packed_input_);
+  return RET_OK;
+}
+
+}  // namespace mindspore::lite::micro::nnacl
diff --git a/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_1x1_int8_coder.h b/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_1x1_int8_coder.h
new file mode 100644
index 0000000000..04a232484b
--- /dev/null
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_1x1_int8_coder.h
@@ -0,0 +1,67 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_MICRO_CODER_OPCODERS_CONV2D_1X1_INT8_CODER_H_
+#define MINDSPORE_LITE_MICRO_CODER_OPCODERS_CONV2D_1X1_INT8_CODER_H_
+
+#include "micro/coder/opcoders/base/conv2d_base_coder.h"
+#include <cstddef>
+#include <string>
+#include <vector>
+#include "nnacl/conv_parameter.h"
+
+namespace mindspore::lite::micro::nnacl {
+class Conv2D1x1Int8Coder final : public Conv2DBaseCoder {
+ public:
+  Conv2D1x1Int8Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
+                     const Model::Node *node, size_t node_index, Target target)
+      : Conv2DBaseCoder(in_tensors, out_tensors, node, node_index, target) {}
+
+  int Prepare(CoderContext *const context) override;
+
+  int DoCode(CoderContext *const context) override;
+
+  ~Conv2D1x1Int8Coder() override = default;
+
+ private:
+  void CheckSupportOptimize();
+
+  int InitWeightBias(CoderContext *const context);
+
+  int InitFilterPeroc();
+
+  int InitParam();
+
+  int InitRunBuf();
+
+  int32_t *input_sum_{nullptr};     /* per-oc */
+  int32_t *filter_zp_ptr_{nullptr}; /* per-oc up round */
+  int32_t *left_shift_{nullptr};    /* per-oc up round */
+  int32_t *right_shift_{nullptr};   /* per-oc up round */
+  int32_t *multiplier_{nullptr};    /* per-oc up round */
+  int8_t *packed_weight_{nullptr};
+  int32_t *bias_data_{nullptr};
+  int8_t *packed_input_{nullptr};
+  int8_t *input_ptr_{nullptr};
+  int8_t *output_ptr_{nullptr};
+  size_t input_sum_size_{0};
+  MatMulParameter *matmul_param_{nullptr};
+  std::string matmul_func_;
+  bool pre_trans_input_{false};
+  bool support_optimize_{false};
+  bool filter_peroc_{false};
+};
+}  // namespace mindspore::lite::micro::nnacl
+#endif  // MINDSPORE_LITE_MICRO_CODER_OPCODERS_CONV2D_1X1_INT8_CODER_H_
diff --git a/mindspore/lite/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.cc b/mindspore/lite/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.cc
index 51d60ed3f7..ea814afb42 100644
--- a/mindspore/lite/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.cc
+++ b/mindspore/lite/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.cc
@@ -60,6 +60,16 @@ void NNaclInt8Serializer::CodeStruct(const std::string &name, const ConvParamete
                  conv_parameter.input_unit_, conv_parameter.output_unit_, conv_parameter.pad_mode_,
                  conv_parameter.act_type_);
 }
 
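+// CodeBaseStruct serializes the fields as a positional initializer for the
+// generated struct ("MatMulParameter name = {...};"), so the argument order
+// below must track the member order of MatMulParameter in
+// nnacl/matmul_parameter.h exactly.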
+void NNaclInt8Serializer::CodeStruct(const std::string &name, const MatMulParameter &matmul_parameter) {
+  CodeBaseStruct("MatMulParameter", name, matmul_parameter.op_parameter_, matmul_parameter.has_bias_,
+                 matmul_parameter.row_, matmul_parameter.col_, matmul_parameter.row_4_, matmul_parameter.row_6_,
+                 matmul_parameter.row_12_, matmul_parameter.row_16_, matmul_parameter.row_align_,
+                 matmul_parameter.col_4_, matmul_parameter.col_8_, matmul_parameter.col_align_, matmul_parameter.deep_,
+                 matmul_parameter.deep_4_, matmul_parameter.deep_16_, matmul_parameter.batch,
+                 matmul_parameter.a_transpose_, matmul_parameter.b_transpose_, matmul_parameter.a_const_,
+                 matmul_parameter.b_const_, matmul_parameter.act_type_);
+}
+
 void NNaclInt8Serializer::CodeStruct(const std::string &name, const ArithmeticParameter &arithmetic_parameter) {
   CodeBaseStruct("ArithmeticParameter", name, arithmetic_parameter.op_parameter_, arithmetic_parameter.broadcasting_,
                  arithmetic_parameter.ndim_, arithmetic_parameter.activation_type_,
diff --git a/mindspore/lite/micro/wrapper/int8/conv1x1_init_int8.c b/mindspore/lite/micro/wrapper/int8/conv1x1_init_int8.c
new file mode 100644
index 0000000000..315efceb2b
--- /dev/null
+++ b/mindspore/lite/micro/wrapper/int8/conv1x1_init_int8.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "wrapper/int8/conv1x1_init_int8.h"
+#include <stdlib.h>
+#include <string.h>
+#include "nnacl/int8/matmul_int8.h"
+#include "nnacl/errorcode.h"
+
+int Conv1x1Init(int8_t *src_weight, int32_t *src_bias, int32_t *filter_zps, int32_t input_channel,
+                int32_t output_channel, int32_t input_zp, bool support_optimize, bool filter_peroc,
+                int8_t **packed_weight, int32_t **bias_data) {
+  if (packed_weight == NULL || bias_data == NULL) {
+    return NNACL_ERR;
+  }
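+  /* The weight is repacked into the tile layout expected by the target matmul
+   * kernel (2x16 tiles on ARM32, 4x16 for the SDOT path, 16x4 otherwise); the
+   * rounded-up regions are zero-filled so padded lanes add nothing to the
+   * accumulators. */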
+#ifdef ENABLE_ARM32
+  /* InitWeightBiasArm32 */
+  /* weight */
+  size_t size = UP_ROUND(input_channel, C16NUM) * UP_ROUND(output_channel, C2NUM) * sizeof(int8_t);
+  int8_t *packed_weight_ = (int8_t *)(malloc(size));
+  if (packed_weight_ == NULL) {
+    return NNACL_ERR;
+  }
+  memset(packed_weight_, 0, size);
+  RowMajor2Row2x16MajorInt8(src_weight, packed_weight_, output_channel, input_channel);
+  /* bias */
+  size = UP_ROUND(output_channel, C2NUM);
+  int32_t *bias_data_ = (int32_t *)malloc(size * sizeof(int32_t));
+  if (bias_data_ == NULL) {
+    free(packed_weight_);
+    return NNACL_ERR;
+  }
+  memset(bias_data_, 0, size * sizeof(int32_t));
+  if (src_bias != NULL) {
+    memcpy(bias_data_, src_bias, output_channel * sizeof(int32_t));
+  }
+#else
+  /* InitWeightBias */
+  /* weight */
+  size_t size = support_optimize ? UP_ROUND(input_channel, C4NUM) * UP_ROUND(output_channel, C16NUM) * sizeof(int8_t)
+                                 : UP_ROUND(input_channel, C16NUM) * UP_ROUND(output_channel, C4NUM) * sizeof(int8_t);
+  int8_t *packed_weight_ = (int8_t *)(malloc(size));
+  if (packed_weight_ == NULL) {
+    return NNACL_ERR;
+  }
+  memset(packed_weight_, 0, size);
+  if (support_optimize) {
+    RowMajor2Row4x16MajorInt8(src_weight, packed_weight_, output_channel, input_channel);
+  } else {
+    RowMajor2Row16x4MajorInt8(src_weight, packed_weight_, output_channel, input_channel);
+  }
+  /* bias */
+  size = support_optimize ? UP_ROUND(output_channel, C16NUM) : UP_ROUND(output_channel, C4NUM);
+  int32_t *bias_data_ = (int32_t *)malloc(size * sizeof(int32_t));
+  if (bias_data_ == NULL) {
+    free(packed_weight_);
+    return NNACL_ERR;
+  }
+  memset(bias_data_, 0, size * sizeof(int32_t));
+  if (src_bias != NULL) {
+    memcpy(bias_data_, src_bias, output_channel * sizeof(int32_t));
+  }
+#endif
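+  /* With zero points, each int8 dot product expands as
+   *   sum((x - x_zp) * (w - w_zp)) = sum(x*w) - x_zp*sum(w) - w_zp*sum(x) + ic*x_zp*w_zp
+   * The two terms that do not depend on the input data, ic*x_zp*w_zp - x_zp*sum(w),
+   * are folded into the bias here; the remaining -w_zp*sum(x) term is applied at
+   * runtime through the input_sum buffer. */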
+  /* InitBiasByzp: bias[oc] += ic * x_zp * w_zp - sum(w[oc]) * x_zp */
+  for (int oc = 0; oc < output_channel; oc++) {
+    int32_t weight_sum_value = 0;
+    int32_t filter_zp = (filter_peroc) ? filter_zps[oc] : filter_zps[0];
+    for (int ic = 0; ic < input_channel; ic++) {
+      weight_sum_value += src_weight[oc * input_channel + ic];
+    }
+    bias_data_[oc] += filter_zp * input_zp * input_channel - weight_sum_value * input_zp;
+  }
+
+  *packed_weight = packed_weight_;
+  *bias_data = bias_data_;
+  return NNACL_OK;
+}
diff --git a/mindspore/lite/micro/wrapper/int8/conv1x1_init_int8.h b/mindspore/lite/micro/wrapper/int8/conv1x1_init_int8.h
new file mode 100644
index 0000000000..462574366d
--- /dev/null
+++ b/mindspore/lite/micro/wrapper/int8/conv1x1_init_int8.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_MICRO_INT8_CONV1X1_INIT_INT8_H_
+#define MINDSPORE_LITE_MICRO_INT8_CONV1X1_INIT_INT8_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "nnacl/conv_parameter.h"
+
+int Conv1x1Init(int8_t *src_weight, int32_t *src_bias, int32_t *filter_zps, int32_t input_channel,
+                int32_t output_channel, int32_t input_zp, bool support_optimize, bool filter_peroc,
+                int8_t **packed_weight, int32_t **bias_data);
+
+#endif  // MINDSPORE_LITE_MICRO_INT8_CONV1X1_INIT_INT8_H_
diff --git a/mindspore/lite/micro/wrapper/int8/conv1x1_run_int8.c b/mindspore/lite/micro/wrapper/int8/conv1x1_run_int8.c
new file mode 100644
index 0000000000..82e42bf125
--- /dev/null
+++ b/mindspore/lite/micro/wrapper/int8/conv1x1_run_int8.c
@@ -0,0 +1,224 @@
+/*
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "wrapper/int8/conv1x1_run_int8.h"
+#include "nnacl/base/conv1x1_base.h"
+#include "nnacl/int8/matmul_int8.h"
+#include "nnacl/int8/pack_int8.h"
+#include "nnacl/int8/conv1x1_int8.h"
+#include "nnacl/errorcode.h"
+
+void Pre1x1Trans(Conv1x1Args *args, int8_t *src_input, int8_t *src_output) {
+  args->output_ptr_ = src_output;
+  if (args->pre_trans_input_) {
+    Conv1x1InputPack(src_input, args->input_ptr_, args->conv_param_, sizeof(int8_t));
+  } else {
+    args->input_ptr_ = src_input;
+  }
+}
+
+int OcOptPre(void *cdata, int task_id) {
+  Conv1x1Args *args = (Conv1x1Args *)(cdata);
+  int cur_stride = args->thread_stride_hw_ * C4NUM;
+  int res_stride = args->matmul_param_->row_ - task_id * args->thread_stride_hw_ * C4NUM;
+  int cur_hw = MSMIN(cur_stride, res_stride);
+  if (cur_hw <= 0) {
+    return NNACL_OK;
+  }
+  int8_t *hw_in = args->input_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->input_channel_;
+  int8_t *hw_packed_in = args->packed_input_ + task_id * args->thread_stride_hw_ * C4NUM * args->matmul_param_->deep_4_;
+  int32_t *hw_input_sum = args->input_sum_ + task_id * args->thread_stride_hw_ * C4NUM;
+
+  if (args->filter_peroc_) {
+    PackInput4x4AndInputSumPert(hw_in, hw_packed_in, hw_input_sum, args->matmul_param_->deep_, cur_hw, 1);
+  } else {
+    PackInput4x4AndInputSumPert(hw_in, hw_packed_in, hw_input_sum, args->matmul_param_->deep_, cur_hw,
+                                args->conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_);
+  }
+  return NNACL_OK;
+}
+
+int RunArm64OptOc(void *cdata, int task_id) {
+  Conv1x1Args *args = (Conv1x1Args *)(cdata);
+  int stride = args->thread_stride_oc_ * C16NUM;
+  int cur_stride = task_id * stride;
+  int res_stride = args->matmul_param_->col_ - cur_stride;
+  int cur_oc = MSMIN(stride, res_stride);
+  if (cur_oc <= 0) {
+    return NNACL_OK;
+  }
+
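+  // Per-channel quantization slices the shift/multiplier tables at this tile's
+  // oc offset; per-tensor quantization falls back to the shared values carried
+  // in conv_quant_arg_.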
+  bool filter_peroc = args->filter_peroc_;
+  int32_t *cur_left_shift =
+    filter_peroc ? args->left_shift_ + cur_stride : args->conv_param_->conv_quant_arg_.left_shift_;
+  int32_t *cur_right_shift =
+    filter_peroc ? args->right_shift_ + cur_stride : args->conv_param_->conv_quant_arg_.right_shift_;
+  int32_t *cur_multiplier =
+    filter_peroc ? args->multiplier_ + cur_stride : args->conv_param_->conv_quant_arg_.quant_multiplier_;
+  int32_t *cur_zp = filter_peroc ? args->filter_zp_ptr_ + cur_stride : args->filter_zp_ptr_;
+
+  Conv1x1Int8Opt(args->packed_input_, args->packed_weight_ + cur_stride * args->matmul_param_->deep_4_,
+                 args->output_ptr_ + cur_stride, args->input_sum_, args->bias_data_ + cur_stride,
+                 args->matmul_param_->row_, cur_oc, args->matmul_param_->deep_4_, cur_left_shift, cur_right_shift,
+                 cur_multiplier, args->conv_param_, args->matmul_func_, cur_zp);
+  return NNACL_OK;
+}
+
+int RunArmOc(void *cdata, int task_id) {
+  Conv1x1Args *args = (Conv1x1Args *)(cdata);
+#ifdef ENABLE_ARM32
+  int col_tile = C2NUM;
+#else
+  int col_tile = C4NUM;
+#endif
+  int stride = args->thread_stride_oc_ * col_tile;
+  int cur_stride = task_id * stride;
+  int res_stride = args->matmul_param_->col_ - cur_stride;
+  int cur_oc = MSMIN(stride, res_stride);
+  if (cur_oc <= 0) {
+    return NNACL_OK;
+  }
+
+  bool filter_peroc = args->filter_peroc_;
+  int32_t *cur_left_shift =
+    filter_peroc ? args->left_shift_ + cur_stride : args->conv_param_->conv_quant_arg_.left_shift_;
+  int32_t *cur_right_shift =
+    filter_peroc ? args->right_shift_ + cur_stride : args->conv_param_->conv_quant_arg_.right_shift_;
+  int32_t *cur_multiplier =
+    filter_peroc ? args->multiplier_ + cur_stride : args->conv_param_->conv_quant_arg_.quant_multiplier_;
+  int32_t *cur_zp = filter_peroc ? args->filter_zp_ptr_ + cur_stride : args->filter_zp_ptr_;
+
+  Conv1x1Int8(args->packed_input_, args->packed_weight_ + cur_stride * args->matmul_param_->deep_16_,
+              args->output_ptr_ + cur_stride, args->input_sum_, args->bias_data_ + cur_stride,
+              args->matmul_param_->row_, cur_oc, args->matmul_param_->deep_16_, cur_left_shift, cur_right_shift,
+              cur_multiplier, args->conv_param_, cur_zp);
+  return NNACL_OK;
+}
+
+int RunArm64OptHw(void *cdata, int task_id) {
+  Conv1x1Args *args = (Conv1x1Args *)(cdata);
+  int cur_stride = args->thread_stride_hw_ * C4NUM;
+  int res_stride = args->matmul_param_->row_ - task_id * args->thread_stride_hw_ * C4NUM;
+  int cur_hw = MSMIN(cur_stride, res_stride);
+  if (cur_hw <= 0) {
+    return NNACL_OK;
+  }
+  int8_t *hw_in = args->input_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->input_channel_;
+  int8_t *hw_out = args->output_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->output_channel_;
+  int8_t *hw_packed_in = args->packed_input_ + task_id * args->thread_stride_hw_ * C4NUM * args->matmul_param_->deep_4_;
+  int32_t *hw_input_sum = args->input_sum_ + task_id * args->thread_stride_hw_ * C4NUM;
+
+  if (args->filter_peroc_) {
+    PackInput4x4AndInputSumPert(hw_in, hw_packed_in, hw_input_sum, args->matmul_param_->deep_, cur_hw, 1);
+  } else {
+    PackInput4x4AndInputSumPert(hw_in, hw_packed_in, hw_input_sum, args->matmul_param_->deep_, cur_hw,
+                                args->conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_);
+  }
+
+  Conv1x1Int8Opt(hw_packed_in, args->packed_weight_, hw_out, hw_input_sum, args->bias_data_, cur_hw,
+                 args->matmul_param_->col_, args->matmul_param_->deep_4_, args->left_shift_, args->right_shift_,
+                 args->multiplier_, args->conv_param_, args->matmul_func_, args->filter_zp_ptr_);
+  return NNACL_OK;
+}
+
+int RunArmHw(void *cdata, int task_id) {
+  Conv1x1Args *args = (Conv1x1Args *)(cdata);
+  int cur_stride = args->thread_stride_hw_ * C4NUM;
+  int res_stride = args->matmul_param_->row_ - task_id * args->thread_stride_hw_ * C4NUM;
+  int cur_hw = MSMIN(cur_stride, res_stride);
+  if (cur_hw <= 0) {
+    return NNACL_OK;
+  }
+
+  int8_t *hw_in = args->input_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->input_channel_;
+  int8_t *hw_out = args->output_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->output_channel_;
+  int8_t *hw_packed_in =
+    args->packed_input_ + task_id * args->thread_stride_hw_ * C4NUM * args->matmul_param_->deep_16_;
+  int32_t *hw_input_sum = args->input_sum_ + task_id * args->thread_stride_hw_ * C4NUM;
+
+  RowMajor2Row16x4MajorInt8(hw_in, hw_packed_in, cur_hw, args->matmul_param_->deep_);
+
+  if (args->filter_peroc_) {
+    PackInputSum16x4PerLayer(hw_packed_in, hw_input_sum, 1, UP_ROUND(cur_hw, C4NUM), args->matmul_param_->deep_16_);
+  } else {
+    PackInputSum16x4PerLayer(hw_packed_in, hw_input_sum, args->conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_,
+                             UP_ROUND(cur_hw, C4NUM), args->matmul_param_->deep_16_);
+  }
+
+  Conv1x1Int8(hw_packed_in, args->packed_weight_, hw_out, hw_input_sum, args->bias_data_, cur_hw,
+              args->matmul_param_->col_, args->matmul_param_->deep_16_, args->left_shift_, args->right_shift_,
+              args->multiplier_, args->conv_param_, args->filter_zp_ptr_);
+  return NNACL_OK;
+}
+
+void Conv1x1Run(int8_t *src_in, Conv1x1Args *args, struct ThreadPool *thread_pool, int thread_num, int8_t *src_out) {
+  int row_pack_count = C4NUM;
+  int col_pack_count;
+
+#ifdef ENABLE_ARM32
+  col_pack_count = C2NUM;
+#else
+  if (args->support_optimize_) {
+    col_pack_count = C16NUM;
+  } else {
+    col_pack_count = C4NUM;
+  }
+#endif
+  int hw_thread_count = UP_DIV(args->matmul_param_->row_, row_pack_count);
+  int oc_thread_count = UP_DIV(args->matmul_param_->col_, col_pack_count);
+  size_t thread_count_hw = MSMIN(thread_num, hw_thread_count);
+  args->thread_stride_hw_ = UP_DIV(hw_thread_count, thread_count_hw);
+  size_t thread_count_oc = MSMIN(thread_num, oc_thread_count);
+  args->thread_stride_oc_ = UP_DIV(oc_thread_count, thread_count_oc);
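+  // Heuristic: with more oc tiles than threads, pack the whole input once and
+  // split the matmul along output channels; otherwise each thread packs and
+  // computes its own slab of output rows (hw split).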
+  bool parallel_by_oc = oc_thread_count > thread_num;
+
+  for (int batch_index = 0; batch_index < args->conv_param_->input_batch_; batch_index++) {
+    Pre1x1Trans(args,
+                src_in + batch_index * args->conv_param_->input_h_ * args->conv_param_->input_w_ *
+                           args->conv_param_->input_channel_,
+                src_out + batch_index * args->matmul_param_->row_ * args->matmul_param_->col_);
+    if (parallel_by_oc) {
+      /* input transpose and input sum */
+      if (args->support_optimize_) {
+        ParallelLaunch(thread_pool, OcOptPre, args, thread_count_hw);
+      } else {
+        RowMajor2Row16x4MajorInt8(args->input_ptr_, args->packed_input_, args->matmul_param_->row_,
+                                  args->matmul_param_->deep_);
+        if (args->filter_peroc_) {
+          PackInputSum16x4PerLayer(args->packed_input_, args->input_sum_, 1, args->matmul_param_->row_4_,
+                                   args->matmul_param_->deep_16_);
+        } else {
+          PackInputSum16x4PerLayer(args->packed_input_, args->input_sum_,
+                                   args->conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_,
+                                   args->matmul_param_->row_4_, args->matmul_param_->deep_16_);
+        }
+      }
+      /* matmul parallel by oc */
+      if (args->support_optimize_) {
+        ParallelLaunch(thread_pool, RunArm64OptOc, args, thread_count_oc);
+      } else {
+        ParallelLaunch(thread_pool, RunArmOc, args, thread_count_oc);
+      }
+    } else {
+      /* matmul parallel by hw */
+      if (args->support_optimize_) {
+        ParallelLaunch(thread_pool, RunArm64OptHw, args, thread_count_hw);
+      } else {
+        ParallelLaunch(thread_pool, RunArmHw, args, thread_count_hw);
+      }
+    }
+  }
+}
diff --git a/mindspore/lite/micro/wrapper/int8/conv1x1_run_int8.h b/mindspore/lite/micro/wrapper/int8/conv1x1_run_int8.h
new file mode 100644
index 0000000000..10c2366009
--- /dev/null
+++ b/mindspore/lite/micro/wrapper/int8/conv1x1_run_int8.h
@@ -0,0 +1,49 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_MICRO_INT8_CONV1X1_RUN_H_
+#define MINDSPORE_LITE_MICRO_INT8_CONV1X1_RUN_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include "nnacl/conv_parameter.h"
+#include "nnacl/matmul_parameter.h"
+#include "src/runtime/thread_pool.h"
+
+typedef struct {
+  int32_t *input_sum_;     /* per-oc */
+  int32_t *filter_zp_ptr_; /* per-oc up round */
+  int32_t *left_shift_;    /* per-oc up round */
+  int32_t *right_shift_;   /* per-oc up round */
+  int32_t *multiplier_;    /* per-oc up round */
+  int8_t *packed_weight_;
+  int32_t *bias_data_;
+  int8_t *packed_input_;
+  int8_t *input_ptr_;
+  int8_t *output_ptr_;
+  size_t thread_stride_hw_;
+  size_t thread_stride_oc_;
+  ConvParameter *conv_param_;
+  MatMulParameter *matmul_param_;
+  MATMUL_OPT_DP_FUNC matmul_func_;
+  bool pre_trans_input_;
+  bool support_optimize_;
+  bool filter_peroc_;
+} Conv1x1Args;
+
+void Conv1x1Run(int8_t *src_in, Conv1x1Args *args, struct ThreadPool *thread_pool, int thread_num, int8_t *src_out);
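+
+/* Usage sketch, mirroring what the generated code is expected to emit (buffer
+ * names illustrative, args populated by codegen and Conv1x1Init):
+ *   Conv1x1Args args = {...};
+ *   Conv1x1Run(input_data, &args, THREAD_POOL_DEFAULT, 4, output_data);
+ */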
+
+#endif  // MINDSPORE_LITE_MICRO_INT8_CONV1X1_RUN_H_