From: @zhujingxuan Reviewed-by: Signed-off-by: tags/v1.2.0-rc1
| @@ -81,6 +81,7 @@ set(CODER_OPCODERS_SRC | |||
| ${MICRO_DIR}/coder/opcoders/nnacl/int8/concat_int8_coder.cc | |||
| ${MICRO_DIR}/coder/opcoders/nnacl/int8/fullconnection_int8_coder.cc | |||
| ${MICRO_DIR}/coder/opcoders/nnacl/int8/matmul_int8_coder.cc | |||
| ${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_1x1_int8_coder.cc | |||
| ${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_3x3_int8_coder.cc | |||
| ${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_int8_coder.cc | |||
| ${MICRO_DIR}/coder/opcoders/nnacl/int8/pooling_int8_coder.cc | |||
| @@ -126,13 +127,10 @@ set(LITE_KERNEL_SRC | |||
| ${LITE_DIR}/nnacl/int8/fixed_point.c | |||
| ${LITE_DIR}/nnacl/fp32/matmul_fp32.c | |||
| ${LITE_DIR}/nnacl/int8/conv3x3_int8.c | |||
| ) | |||
| set(MICRO_ADAPTER_SRC | |||
| ${MICRO_DIR}/wrapper/fp32/matmul_fp32_wrapper.c | |||
| ${MICRO_DIR}/wrapper/int8/matmul_int8_wrapper.c | |||
| ${MICRO_DIR}/wrapper/int8/conv_init_int8.c | |||
| ${LITE_DIR}/nnacl/int8/conv1x1_int8.c | |||
| ${LITE_DIR}/nnacl/base/conv1x1_base.c | |||
| ) | |||
| list(APPEND FILE_SET ${CODER_SRC} ${CODER_UTILS_SRC} ${CODER_OPCODERS_SRC} ${CODER_GENERATOR_SRC} | |||
| ${CODER_ALLOCATOR_SRC} ${LITE_SRC} ${LITE_KERNEL_SRC} ${MICRO_ADAPTER_SRC}) | |||
| ${CODER_ALLOCATOR_SRC} ${LITE_SRC} ${LITE_KERNEL_SRC}) | |||
| @@ -0,0 +1,12 @@ | |||
# The wrapper sources use the lite thread pool, which needs pthreads.
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
# Runtime wrapper sources compiled into the codegen target alongside the
# generated code (thread pool + int8/fp32 conv and matmul helpers).
set(MICRO_WRAPPER_SRC
${LITE_DIR}/src/runtime/thread_pool.c
${MICRO_DIR}/wrapper/fp32/matmul_fp32_wrapper.c
${MICRO_DIR}/wrapper/int8/matmul_int8_wrapper.c
${MICRO_DIR}/wrapper/int8/conv_init_int8.c
${MICRO_DIR}/wrapper/int8/conv1x1_init_int8.c
${MICRO_DIR}/wrapper/int8/conv1x1_run_int8.c
)
list(APPEND FILE_SET ${MICRO_WRAPPER_SRC})
| @@ -19,6 +19,7 @@ include_directories(${TOP_DIR}/mindspore/core/) | |||
| #include coder | |||
| include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../) | |||
| include(${MICRO_DIR}/cmake/file_list.cmake) | |||
| include(${MICRO_DIR}/cmake/wrapper.cmake) | |||
| add_executable(codegen main.cc ${FILE_SET}) | |||
| add_dependencies(codegen fbs_src) | |||
| add_dependencies(codegen fbs_inner_src) | |||
| @@ -0,0 +1,193 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "micro/coder/opcoders/nnacl/int8/conv2d_1x1_int8_coder.h" | |||
| #include <string> | |||
| #include <vector> | |||
| #include "securec/include/securec.h" | |||
| #include "src/runtime/kernel/arm/base/convolution_base.h" | |||
| #include "micro/coder/opcoders/file_collector.h" | |||
| #include "micro/coder/log.h" | |||
| #include "micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.h" | |||
| namespace mindspore::lite::micro::nnacl { | |||
// Sets up quantization parameters and all offline buffers/tables needed to
// generate code for an int8 1x1 convolution.
int Conv2D1x1Int8Coder::Prepare(CoderContext *const context) {
  // Owned GEMM descriptor for the conv-as-matmul view; allocated here —
  // NOTE(review): confirm it is released in the destructor (the `= default`
  // destructor would leak it).
  matmul_param_ = new (std::nothrow) MatMulParameter();
  MS_CHECK_PTR(matmul_param_);
  MS_CHECK_RET_CODE(Conv2DBaseCoder::Init(), "Init failed");
  MS_CHECK_RET_CODE(Conv2DBaseCoder::SetQuantParam(), "SetQuantParam failed");
  // More than one filter quant param means per-output-channel quantization.
  filter_peroc_ = (conv_param_->conv_quant_arg_.filter_arg_num_ != kPerTensor);
  if (filter_peroc_) {
    // Expand zp/shift/multiplier into per-oc tables used by the kernels.
    MS_CHECK_RET_CODE(InitFilterPeroc(), "InitFilterPeroc failed.");
  }
  CheckSupportOptimize();
  MS_CHECK_RET_CODE(InitWeightBias(context), "InitWeightBias failed");
  MS_CHECK_RET_CODE(InitParam(), "InitParam failed");
  MS_CHECK_RET_CODE(InitRunBuf(), "InitRunBuf failed");
  return RET_OK;
}
// Emits the per-inference code: parameter structs, the Conv1x1Args aggregate,
// and the Conv1x1Run call executed by the generated model.
int Conv2D1x1Int8Coder::DoCode(CoderContext *const context) {
  // Register the headers/sources the generated target must compile against.
  Collect(context,
          {"nnacl/int8/conv1x1_int8.h", "nnacl/common_func.h", "wrapper/int8/conv1x1_init_int8.h",
           "wrapper/int8/conv1x1_run_int8.h"},
          {"common_func.c", "pack.c", "conv1x1_int8.c", "matmul_int8.c", "fixed_point.c", "conv1x1_init_int8.c",
           "conv1x1_run_int8.c"});
  nnacl::NNaclInt8Serializer code;
  code.CodeStruct("conv_param", *conv_param_);
  code.CodeStruct("matmul_param", *matmul_param_);
  // Field order must match Conv1x1Args in wrapper/int8/conv1x1_run_int8.h.
  // The two nullptr slots are input_ptr_/output_ptr_ and the two 0s the thread
  // strides, all filled at run time — NOTE(review): when pre_trans_input_ is
  // true the generated Pre1x1Trans packs into args->input_ptr_, yet nullptr is
  // emitted here even though InitParam allocates an input_ptr_ workspace;
  // confirm the generated code receives a valid buffer.
  code.CodeBaseStruct("Conv1x1Args", "args", input_sum_, filter_zp_ptr_, left_shift_, right_shift_, multiplier_,
                      packed_weight_, bias_data_, packed_input_, nullptr, nullptr, 0, 0, "conv_param", "matmul_param",
                      matmul_func_, pre_trans_input_, support_optimize_, filter_peroc_);
  // Multi-threaded entry point over the default generated thread pool.
  code.CodeFunction("Conv1x1Run", input_tensor_, "args", "THREAD_POOL_DEFAULT", thread_num_s_, output_tensor_);
  context->AppendCode(code.str());
  return RET_OK;
}
| void Conv2D1x1Int8Coder::CheckSupportOptimize() { | |||
| support_optimize_ = false; | |||
| matmul_func_ = "MatMulInt8_4x16_r"; | |||
| if (target_ == kARM64) { | |||
| matmul_func_ = "MatMulDpInt8_optimize_handler"; | |||
| } | |||
| } | |||
// Emits init-time code that packs weights and bias via Conv1x1Init.
// Packing happens in the generated program ("online"), so only pointer slots
// are reserved in the coder's allocator here.
int Conv2D1x1Int8Coder::InitWeightBias(CoderContext *const context) {
  int32_t input_channel = filter_tensor_->Channel();
  int32_t output_channel = filter_tensor_->Batch();
  int32_t input_zp = conv_param_->conv_quant_arg_.input_quant_args_[0].zp_;
  nnacl::NNaclInt8Serializer code;
  packed_weight_ = static_cast<int8_t *>(allocator_->Malloc(kNumberTypeInt8, kOnlineSize, kOnlinePackWeight));
  MS_CHECK_PTR(packed_weight_);
  bias_data_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, kOnlineSize, kOnlinePackWeight));
  MS_CHECK_PTR(bias_data_);
  // Conv1x1Init returns the packed buffers through out-parameters, hence the
  // address-of spelling in the emitted call.
  std::string packed_weight_str = "(int8_t **)&" + allocator_->GetRuntimeAddr(packed_weight_);
  std::string bias_data_str = "(int32_t **)&" + allocator_->GetRuntimeAddr(bias_data_);
  std::string filter_zp_str = "";
  if (filter_peroc_) {
    // Per-channel: zero points were materialized in InitFilterPeroc.
    filter_zp_str = allocator_->GetRuntimeAddr(filter_zp_ptr_);
  } else {
    MS_CHECK_PTR(conv_param_->conv_quant_arg_.filter_quant_args_);
    // Per-tensor: emit a 1-element array holding the single zero point.
    filter_zp_str = "filter_zp";
    code << "int32_t filter_zp[1] = {" << conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_ << "};\n";
  }
  if (target_ == kARM64) {
    // ARM64 defers the optimized-layout decision to run time.
    code.CodeFunctionWithCheck("Conv1x1Init", filter_tensor_, bias_tensor_, filter_zp_str, input_channel,
                               output_channel, input_zp, "GetSupportOptFlag()", filter_peroc_, packed_weight_str,
                               bias_data_str);
  } else {
    code.CodeFunctionWithCheck("Conv1x1Init", filter_tensor_, bias_tensor_, filter_zp_str, input_channel,
                               output_channel, input_zp, support_optimize_, filter_peroc_, packed_weight_str,
                               bias_data_str);
  }
  context->AppendInitCode(code.str());
  return RET_OK;
}
// Materializes per-output-channel quantization tables (filter zp, left/right
// shift, multiplier) as offline-packed buffers for the generated kernels.
int Conv2D1x1Int8Coder::InitFilterPeroc() {
  int32_t output_channel = filter_tensor_->Batch();
  // Shift/multiplier tables are rounded up to the kernel's oc tile so tiled
  // kernels can read past output_channel; the tail is zero-filled below.
  int round_oc;
  if (target_ == kARM32A) {
    round_oc = UP_ROUND(output_channel, C2NUM);
  } else {
    round_oc = MSMAX(UP_ROUND(output_channel, C16NUM), UP_ROUND(output_channel, C4NUM));
  }
  MS_CHECK_TRUE(conv_quant_arg_->filter_arg_num_ == static_cast<size_t>(output_channel),
                "weight per channel quant param length is not equal to filter num, filter is not PerChannel");
  size_t output_size = output_channel * sizeof(int32_t);
  size_t oc_size = round_oc * sizeof(int32_t);
  /* filter zp: exactly output_channel entries (not rounded) */
  filter_zp_ptr_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, output_size, kOfflinePackWeight));
  MS_CHECK_PTR(filter_zp_ptr_);
  MS_CHECK_PTR(conv_param_->conv_quant_arg_.filter_quant_args_);
  for (int fi = 0; fi < output_channel; fi++) {
    filter_zp_ptr_[fi] = conv_param_->conv_quant_arg_.filter_quant_args_[fi].zp_;
  }
  /* left shift: rounded table, zeroed then copied (tail stays 0) */
  left_shift_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, oc_size, kOfflinePackWeight));
  MS_CHECK_PTR(left_shift_);
  MS_CHECK_RET_CODE(memset_s(left_shift_, oc_size, 0, oc_size), "memset left_shift_ failed");
  MS_CHECK_RET_CODE(memcpy_s(left_shift_, oc_size, conv_param_->conv_quant_arg_.left_shift_, output_size),
                    "memcpy_s left_shift_ failed");
  /* right shift: same layout as left shift */
  right_shift_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, oc_size, kOfflinePackWeight));
  MS_CHECK_PTR(right_shift_);
  MS_CHECK_RET_CODE(memset_s(right_shift_, oc_size, 0, oc_size), "memset right_shift_ failed");
  MS_CHECK_RET_CODE(memcpy_s(right_shift_, oc_size, conv_param_->conv_quant_arg_.right_shift_, output_size),
                    "memcpy_s right_shift_ failed");
  /* multiplier: same layout as the shifts */
  multiplier_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, oc_size, kOfflinePackWeight));
  MS_CHECK_PTR(multiplier_);
  MS_CHECK_RET_CODE(memset_s(multiplier_, oc_size, 0, oc_size), "memset multiplier_ failed");
  MS_CHECK_RET_CODE(memcpy_s(multiplier_, oc_size, conv_param_->conv_quant_arg_.quant_multiplier_, output_size),
                    "memcpy_s multiplier_ failed");
  return RET_OK;
}
// Derives the GEMM view of the 1x1 conv and allocates the optional
// input-repack workspace.
int Conv2D1x1Int8Coder::InitParam() {
  // Any padding or stride != 1 means the input must be repacked before the GEMM.
  pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 ||
                     conv_param_->stride_w_ != 1);
  // GEMM shape: row = output spatial plane, deep = IC, col = OC.
  matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_;
  matmul_param_->deep_ = conv_param_->input_channel_;
  matmul_param_->col_ = conv_param_->output_channel_;
  matmul_param_->row_4_ = UP_ROUND(matmul_param_->row_, C4NUM);
  matmul_param_->deep_4_ = UP_ROUND(matmul_param_->deep_, C4NUM);
  matmul_param_->deep_16_ = UP_ROUND(matmul_param_->deep_, C16NUM);
  int row_pack_count = C4NUM;
  /* init input sum size: one int32 sum per (rounded) output row */
  input_sum_size_ = UP_ROUND(matmul_param_->row_, row_pack_count);
  if (pre_trans_input_) {
    // Workspace for the repacked input plane — NOTE(review): DoCode currently
    // emits nullptr for the args input slot instead of this buffer; verify.
    input_ptr_ = reinterpret_cast<int8_t *>(
      allocator_->Malloc(kNumberTypeInt8, matmul_param_->row_ * matmul_param_->deep_ * sizeof(int8_t), kWorkspace));
    MS_CHECK_PTR(input_ptr_);
  }
  return RET_OK;
}
// Allocates the run-time scratch buffers: per-row input sums and the packed
// input matrix.
int Conv2D1x1Int8Coder::InitRunBuf() {
  input_sum_ =
    reinterpret_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, input_sum_size_ * sizeof(int32_t), kWorkspace));
  MS_CHECK_PTR(input_sum_);
  // Packed-input size covers both possible layouts (8x4 and 4x16 roundings),
  // so the buffer works whichever kernel the generated code selects.
  size_t size = MSMAX(UP_ROUND(matmul_param_->row_, C8NUM) * UP_ROUND(matmul_param_->deep_, C4NUM),
                      UP_ROUND(matmul_param_->row_, C4NUM) * UP_ROUND(matmul_param_->deep_, C16NUM));
  packed_input_ = reinterpret_cast<int8_t *>(allocator_->Malloc(kNumberTypeInt8, size * sizeof(int8_t), kWorkspace));
  MS_CHECK_PTR(packed_input_);
  return RET_OK;
}
| } // namespace mindspore::lite::micro::nnacl | |||
| @@ -0,0 +1,67 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_MICRO_CODER_OPCODERS_Conv2D_1X1_INT8_CODER_H_ | |||
| #define MINDSPORE_LITE_MICRO_CODER_OPCODERS_Conv2D_1X1_INT8_CODER_H_ | |||
| #include "micro/coder/opcoders/base/conv2d_base_coder.h" | |||
| #include <memory> | |||
| #include <string> | |||
| #include <vector> | |||
| #include "nnacl/conv_parameter.h" | |||
| namespace mindspore::lite::micro::nnacl { | |||
| class Conv2D1x1Int8Coder final : public Conv2DBaseCoder { | |||
| public: | |||
| Conv2D1x1Int8Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors, | |||
| const Model::Node *node, size_t node_index, Target target) | |||
| : Conv2DBaseCoder(in_tensors, out_tensors, node, node_index, target) {} | |||
| int Prepare(CoderContext *const context) override; | |||
| int DoCode(CoderContext *const context) override; | |||
| ~Conv2D1x1Int8Coder() override = default; | |||
| private: | |||
| void CheckSupportOptimize(); | |||
| int InitWeightBias(CoderContext *const context); | |||
| int InitFilterPeroc(); | |||
| int InitParam(); | |||
| int InitRunBuf(); | |||
| int32_t *input_sum_{nullptr}; /* per-oc */ | |||
| int32_t *filter_zp_ptr_{nullptr}; /* per-oc up round */ | |||
| int32_t *left_shift_{nullptr}; /* per-oc up round */ | |||
| int32_t *right_shift_{nullptr}; /* per-oc up round */ | |||
| int32_t *multiplier_{nullptr}; /* per-oc up round */ | |||
| int8_t *packed_weight_{nullptr}; | |||
| int32_t *bias_data_{nullptr}; | |||
| int8_t *packed_input_{nullptr}; | |||
| int8_t *input_ptr_{nullptr}; | |||
| int8_t *output_ptr_{nullptr}; | |||
| size_t input_sum_size_{0}; | |||
| MatMulParameter *matmul_param_{nullptr}; | |||
| std::string matmul_func_; | |||
| bool pre_trans_input_{false}; | |||
| bool support_optimize_{false}; | |||
| bool filter_peroc_{false}; | |||
| }; | |||
| } // namespace mindspore::lite::micro::nnacl | |||
| #endif // MINDSPORE_LITE_MICRO_CODER_OPCODERS_Conv2D_1X1_INT8_CODER_H_ | |||
| @@ -60,6 +60,16 @@ void NNaclInt8Serializer::CodeStruct(const std::string &name, const ConvParamete | |||
| conv_parameter.input_unit_, conv_parameter.output_unit_, conv_parameter.pad_mode_, conv_parameter.act_type_); | |||
| } | |||
// Emits a C initializer for a MatMulParameter into the generated code.
// The field order here must match the MatMulParameter declaration in
// nnacl/matmul_parameter.h — keep the two in sync.
void NNaclInt8Serializer::CodeStruct(const std::string &name, const MatMulParameter &matmul_parameter) {
  CodeBaseStruct("MatMulParameter", name, matmul_parameter.op_parameter_, matmul_parameter.has_bias_,
                 matmul_parameter.row_, matmul_parameter.col_, matmul_parameter.row_4_, matmul_parameter.row_6_,
                 matmul_parameter.row_12_, matmul_parameter.row_16_, matmul_parameter.row_align_,
                 matmul_parameter.col_4_, matmul_parameter.col_8_, matmul_parameter.col_align_, matmul_parameter.deep_,
                 matmul_parameter.deep_4_, matmul_parameter.deep_16_, matmul_parameter.batch,
                 matmul_parameter.a_transpose_, matmul_parameter.b_transpose_, matmul_parameter.a_const_,
                 matmul_parameter.b_const_, matmul_parameter.act_type_);
}
| void NNaclInt8Serializer::CodeStruct(const std::string &name, const ArithmeticParameter &arithmetic_parameter) { | |||
| CodeBaseStruct("ArithmeticParameter", name, arithmetic_parameter.op_parameter_, arithmetic_parameter.broadcasting_, | |||
| arithmetic_parameter.ndim_, arithmetic_parameter.activation_type_, | |||
| @@ -0,0 +1,90 @@ | |||
| /* | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "wrapper/int8/conv1x1_init_int8.h" | |||
| #include <memory.h> | |||
| #include "nnacl/int8/matmul_int8.h" | |||
| #include "nnacl/errorcode.h" | |||
// Packs 1x1-conv weights/bias into the int8 GEMM layout and folds the
// quantization zero-point terms into the bias.
//
// src_weight: OC x IC row-major int8 weights; src_bias: per-oc bias (may be NULL).
// filter_zps: per-oc zero points when filter_peroc, else a 1-element array.
// packed_weight / bias_data: out-params receiving malloc'd buffers; the caller
// owns them and must free them.
// Returns NNACL_OK, or NNACL_ERR on NULL out-params or allocation failure.
int Conv1x1Init(int8_t *src_weight, int32_t *src_bias, int32_t *filter_zps, int32_t input_channel,
                int32_t output_channel, int32_t input_zp, bool support_optimize, bool filter_peroc,
                int8_t **packed_weight, int32_t **bias_data) {
  if (packed_weight == NULL || bias_data == NULL) {
    return NNACL_ERR;
  }
#ifdef ENABLE_ARM32
  /* InitWeightBiasArm32 */
  /* weight: 2x16 tile layout, zero-padded to the tile boundaries */
  size_t size = UP_ROUND(input_channel, C16NUM) * UP_ROUND(output_channel, C2NUM) * sizeof(int8_t);
  int8_t *packed_weight_ = (int8_t *)(malloc(size));
  if (packed_weight_ == NULL) {
    return NNACL_ERR;
  }
  memset(packed_weight_, 0, size);
  RowMajor2Row2x16MajorInt8(src_weight, packed_weight_, output_channel, input_channel);
  /* bias: zero-padded to the oc tile */
  size = UP_ROUND(output_channel, C2NUM);
  int32_t *bias_data_ = (int32_t *)malloc(size * sizeof(int32_t));
  if (bias_data_ == NULL) {
    free(packed_weight_);
    return NNACL_ERR;
  }
  memset(bias_data_, 0, size * sizeof(int32_t));
  if (src_bias != NULL) {
    memcpy(bias_data_, src_bias, output_channel * sizeof(int32_t));
  }
#else
  /* InitWeightBias */
  /* weight: 4x16 layout for the optimized kernel, otherwise 16x4 */
  size_t size = support_optimize ? UP_ROUND(input_channel, C4NUM) * UP_ROUND(output_channel, C16NUM) * sizeof(int8_t)
                                 : UP_ROUND(input_channel, C16NUM) * UP_ROUND(output_channel, C4NUM) * sizeof(int8_t);
  int8_t *packed_weight_ = (int8_t *)(malloc(size));
  if (packed_weight_ == NULL) {
    return NNACL_ERR;
  }
  memset(packed_weight_, 0, size);
  if (support_optimize) {
    RowMajor2Row4x16MajorInt8(src_weight, packed_weight_, output_channel, input_channel);
  } else {
    RowMajor2Row16x4MajorInt8(src_weight, packed_weight_, output_channel, input_channel);
  }
  /* bias: zero-padded to the oc tile matching the weight layout */
  size = support_optimize ? UP_ROUND(output_channel, C16NUM) : UP_ROUND(output_channel, C4NUM);
  int32_t *bias_data_ = (int32_t *)malloc(size * sizeof(int32_t));
  if (bias_data_ == NULL) {
    free(packed_weight_);
    return NNACL_ERR;
  }
  memset(bias_data_, 0, size * sizeof(int32_t));
  if (src_bias != NULL) {
    memcpy(bias_data_, src_bias, output_channel * sizeof(int32_t));
  }
#endif
  /* InitBiasByzp */
  /* bias = bias - v2 x zp1 + zp1 x zp2: fold the constant terms of
     (x - input_zp) * (w - filter_zp) into the bias, one oc at a time */
  for (int oc = 0; oc < output_channel; oc++) {
    int32_t weight_sum_value = 0;
    int32_t filter_zp = (filter_peroc) ? filter_zps[oc] : filter_zps[0];
    for (int ic = 0; ic < input_channel; ic++) {
      weight_sum_value += src_weight[oc * input_channel + ic];
    }
    bias_data_[oc] += filter_zp * input_zp * input_channel - weight_sum_value * input_zp;
  }
  *packed_weight = packed_weight_;
  *bias_data = bias_data_;
  return NNACL_OK;
}
| @@ -0,0 +1,28 @@ | |||
| /* | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_MICRO_INT8_CONV1X1_INIT_INT8_H_ | |||
| #define MINDSPORE_LITE_MICRO_INT8_CONV1X1_INIT_INT8_H_ | |||
| #include <stdint.h> | |||
| #include <stdbool.h> | |||
| #include "nnacl/conv_parameter.h" | |||
// Packs 1x1-conv weight/bias into the int8 GEMM layout and folds quantization
// zero points into the bias. *packed_weight / *bias_data receive malloc'd
// buffers owned by the caller. Returns NNACL_OK or NNACL_ERR.
int Conv1x1Init(int8_t *src_weight, int32_t *src_bias, int32_t *filter_zps, int32_t input_channel,
                int32_t output_channel, int32_t input_zp, bool support_optimize, bool filter_peroc,
                int8_t **packed_weight, int32_t **bias_data);
| #endif // MINDSPORE_LITE_MICRO_INT8_CONV1X1_INIT_INT8_H_ | |||
| @@ -0,0 +1,224 @@ | |||
| /* | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "wrapper/int8/conv1x1_run_int8.h" | |||
| #include "nnacl/base/conv1x1_base.h" | |||
| #include "nnacl/int8/matmul_int8.h" | |||
| #include "nnacl/int8/pack_int8.h" | |||
| #include "nnacl/int8/conv1x1_int8.h" | |||
| #include "nnacl/errorcode.h" | |||
| void Pre1x1Trans(Conv1x1Args *args, int8_t *src_input, int8_t *src_output) { | |||
| args->output_ptr_ = src_output; | |||
| if (args->pre_trans_input_) { | |||
| Conv1x1InputPack(src_input, args->input_ptr_, args->conv_param_, sizeof(int8_t)); | |||
| } else { | |||
| args->input_ptr_ = src_input; | |||
| } | |||
| } | |||
// Pre-pass worker for the oc-parallel optimized path: packs this task's
// C4NUM-row slice of the input into 4x4 layout and accumulates the per-row
// input sums consumed by the GEMM.
int OcOptPre(void *cdata, int task_id) {
  Conv1x1Args *args = (Conv1x1Args *)(cdata);
  int cur_stride = args->thread_stride_hw_ * C4NUM;
  int res_stride = args->matmul_param_->row_ - task_id * args->thread_stride_hw_ * C4NUM;
  int cur_hw = MSMIN(cur_stride, res_stride);
  if (cur_hw <= 0) {
    // More tasks than row tiles: nothing left for this task.
    return NNACL_OK;
  }
  int8_t *hw_in = args->input_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->input_channel_;
  int8_t *hw_packed_in = args->packed_input_ + task_id * args->thread_stride_hw_ * C4NUM * args->matmul_param_->deep_4_;
  int32_t *hw_input_sum = args->input_sum_ + task_id * args->thread_stride_hw_ * C4NUM;
  if (args->filter_peroc_) {
    // Per-oc quant: sums are scaled by 1 here; the per-channel filter zp is
    // presumably applied downstream — TODO confirm against the kernel.
    PackInput4x4AndInputSumPert(hw_in, hw_packed_in, hw_input_sum, args->matmul_param_->deep_, cur_hw, 1);
  } else {
    // Per-tensor quant: fold the single filter zp into the sums directly.
    PackInput4x4AndInputSumPert(hw_in, hw_packed_in, hw_input_sum, args->matmul_param_->deep_, cur_hw,
                                args->conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_);
  }
  return NNACL_OK;
}
// Oc-parallel matmul worker for the optimized kernel: each task computes a
// C16NUM-wide stripe of output channels over the whole packed input.
int RunArm64OptOc(void *cdata, int task_id) {
  Conv1x1Args *args = (Conv1x1Args *)(cdata);
  int stride = args->thread_stride_oc_ * C16NUM;
  int cur_stride = task_id * stride;
  int res_stride = args->matmul_param_->col_ - cur_stride;
  int cur_oc = MSMIN(stride, res_stride);
  if (cur_oc <= 0) {
    // More tasks than oc stripes: nothing left for this task.
    return NNACL_OK;
  }
  bool filter_peroc = args->filter_peroc_;
  // Per-oc quant reads the pre-expanded tables at this stripe's offset;
  // per-tensor quant reuses the single conv-level tables.
  int32_t *cur_left_shift =
    filter_peroc ? args->left_shift_ + cur_stride : args->conv_param_->conv_quant_arg_.left_shift_;
  int32_t *cur_right_shift =
    filter_peroc ? args->right_shift_ + cur_stride : args->conv_param_->conv_quant_arg_.right_shift_;
  int32_t *cur_multiplier =
    filter_peroc ? args->multiplier_ + cur_stride : args->conv_param_->conv_quant_arg_.quant_multiplier_;
  int32_t *cur_zp = filter_peroc ? args->filter_zp_ptr_ + cur_stride : args->filter_zp_ptr_;
  Conv1x1Int8Opt(args->packed_input_, args->packed_weight_ + cur_stride * args->matmul_param_->deep_4_,
                 args->output_ptr_ + cur_stride, args->input_sum_, args->bias_data_ + cur_stride,
                 args->matmul_param_->row_, cur_oc, args->matmul_param_->deep_4_, cur_left_shift, cur_right_shift,
                 cur_multiplier, args->conv_param_, args->matmul_func_, cur_zp);
  return NNACL_OK;
}
// Oc-parallel matmul worker for the baseline kernel: stripe width is the
// platform oc tile (C2NUM on ARM32, C4NUM elsewhere).
int RunArmOc(void *cdata, int task_id) {
  Conv1x1Args *args = (Conv1x1Args *)(cdata);
#ifdef ENABLE_ARM32
  int col_tile = C2NUM;
#else
  int col_tile = C4NUM;
#endif
  int stride = args->thread_stride_oc_ * col_tile;
  int cur_stride = task_id * stride;
  int res_stride = args->matmul_param_->col_ - cur_stride;
  int cur_oc = MSMIN(stride, res_stride);
  if (cur_oc <= 0) {
    // More tasks than oc stripes: nothing left for this task.
    return NNACL_OK;
  }
  bool filter_peroc = args->filter_peroc_;
  // Per-oc quant reads the pre-expanded tables at this stripe's offset;
  // per-tensor quant reuses the single conv-level tables.
  int32_t *cur_left_shift =
    filter_peroc ? args->left_shift_ + cur_stride : args->conv_param_->conv_quant_arg_.left_shift_;
  int32_t *cur_right_shift =
    filter_peroc ? args->right_shift_ + cur_stride : args->conv_param_->conv_quant_arg_.right_shift_;
  int32_t *cur_multiplier =
    filter_peroc ? args->multiplier_ + cur_stride : args->conv_param_->conv_quant_arg_.quant_multiplier_;
  int32_t *cur_zp = filter_peroc ? args->filter_zp_ptr_ + cur_stride : args->filter_zp_ptr_;
  Conv1x1Int8(args->packed_input_, args->packed_weight_ + cur_stride * args->matmul_param_->deep_16_,
              args->output_ptr_ + cur_stride, args->input_sum_, args->bias_data_ + cur_stride,
              args->matmul_param_->row_, cur_oc, args->matmul_param_->deep_16_, cur_left_shift, cur_right_shift,
              cur_multiplier, args->conv_param_, cur_zp);
  return NNACL_OK;
}
// Hw-parallel worker for the optimized kernel: each task packs its own
// C4NUM-row input slice, computes its input sums, then runs the matmul over
// all output channels for those rows.
int RunArm64OptHw(void *cdata, int task_id) {
  Conv1x1Args *args = (Conv1x1Args *)(cdata);
  int cur_stride = args->thread_stride_hw_ * C4NUM;
  int res_stride = args->matmul_param_->row_ - task_id * args->thread_stride_hw_ * C4NUM;
  int cur_hw = MSMIN(cur_stride, res_stride);
  if (cur_hw <= 0) {
    // More tasks than row tiles: nothing left for this task.
    return NNACL_OK;
  }
  int8_t *hw_in = args->input_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->input_channel_;
  int8_t *hw_out = args->output_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->output_channel_;
  int8_t *hw_packed_in = args->packed_input_ + task_id * args->thread_stride_hw_ * C4NUM * args->matmul_param_->deep_4_;
  int32_t *hw_input_sum = args->input_sum_ + task_id * args->thread_stride_hw_ * C4NUM;
  if (args->filter_peroc_) {
    // Per-oc quant: sums scaled by 1; channel zp applied via filter_zp_ptr_.
    PackInput4x4AndInputSumPert(hw_in, hw_packed_in, hw_input_sum, args->matmul_param_->deep_, cur_hw, 1);
  } else {
    // Per-tensor quant: fold the single filter zp into the sums directly.
    PackInput4x4AndInputSumPert(hw_in, hw_packed_in, hw_input_sum, args->matmul_param_->deep_, cur_hw,
                                args->conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_);
  }
  Conv1x1Int8Opt(hw_packed_in, args->packed_weight_, hw_out, hw_input_sum, args->bias_data_, cur_hw,
                 args->matmul_param_->col_, args->matmul_param_->deep_4_, args->left_shift_, args->right_shift_,
                 args->multiplier_, args->conv_param_, args->matmul_func_, args->filter_zp_ptr_);
  return NNACL_OK;
}
// Hw-parallel worker for the baseline kernel: packs this task's row slice
// into 16x4 layout, computes the per-layer input sums, then runs the matmul
// over all output channels for those rows.
int RunArmHw(void *cdata, int task_id) {
  Conv1x1Args *args = (Conv1x1Args *)(cdata);
  int cur_stride = args->thread_stride_hw_ * C4NUM;
  int res_stride = args->matmul_param_->row_ - task_id * args->thread_stride_hw_ * C4NUM;
  int cur_hw = MSMIN(cur_stride, res_stride);
  if (cur_hw <= 0) {
    // More tasks than row tiles: nothing left for this task.
    return NNACL_OK;
  }
  int8_t *hw_in = args->input_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->input_channel_;
  int8_t *hw_out = args->output_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->output_channel_;
  int8_t *hw_packed_in =
    args->packed_input_ + task_id * args->thread_stride_hw_ * C4NUM * args->matmul_param_->deep_16_;
  int32_t *hw_input_sum = args->input_sum_ + task_id * args->thread_stride_hw_ * C4NUM;
  RowMajor2Row16x4MajorInt8(hw_in, hw_packed_in, cur_hw, args->matmul_param_->deep_);
  if (args->filter_peroc_) {
    // Per-oc quant: sums scaled by 1; channel zp applied via filter_zp_ptr_.
    PackInputSum16x4PerLayer(hw_packed_in, hw_input_sum, 1, UP_ROUND(cur_hw, C4NUM), args->matmul_param_->deep_16_);
  } else {
    // Per-tensor quant: fold the single filter zp into the sums directly.
    PackInputSum16x4PerLayer(hw_packed_in, hw_input_sum, args->conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_,
                             UP_ROUND(cur_hw, C4NUM), args->matmul_param_->deep_16_);
  }
  Conv1x1Int8(hw_packed_in, args->packed_weight_, hw_out, hw_input_sum, args->bias_data_, cur_hw,
              args->matmul_param_->col_, args->matmul_param_->deep_16_, args->left_shift_, args->right_shift_,
              args->multiplier_, args->conv_param_, args->filter_zp_ptr_);
  return NNACL_OK;
}
// Generated-code entry point: runs the int8 1x1 convolution batch by batch,
// choosing between oc-parallel and hw-parallel scheduling on the thread pool.
void Conv1x1Run(int8_t *src_in, Conv1x1Args *args, struct ThreadPool *thread_pool, int thread_num, int8_t *src_out) {
  int row_pack_count = C4NUM;
  int col_pack_count;
#ifdef ENABLE_ARM32
  col_pack_count = C2NUM;
#else
  if (args->support_optimize_) {
    col_pack_count = C16NUM;
  } else {
    col_pack_count = C4NUM;
  }
#endif
  // Tile counts along rows (hw) and output channels (oc), and the per-task
  // strides the workers read back from args.
  int hw_thread_count = UP_DIV(args->matmul_param_->row_, row_pack_count);
  int oc_thread_count = UP_DIV(args->matmul_param_->col_, col_pack_count);
  size_t thread_count_hw = MSMIN(thread_num, hw_thread_count);
  args->thread_stride_hw_ = UP_DIV(hw_thread_count, thread_count_hw);
  size_t thread_count_oc = MSMIN(thread_num, oc_thread_count);
  args->thread_stride_oc_ = UP_DIV(oc_thread_count, thread_count_oc);
  // Split by oc only when there are more oc tiles than threads to fill.
  bool parallel_by_oc = oc_thread_count > thread_num;
  for (int batch_index = 0; batch_index < args->conv_param_->input_batch_; batch_index++) {
    // Rebase input/output pointers for this batch; repack input if padded/strided.
    Pre1x1Trans(args,
                src_in + batch_index * args->conv_param_->input_h_ * args->conv_param_->input_w_ *
                           args->conv_param_->input_channel_,
                src_out + batch_index * args->matmul_param_->row_ * args->matmul_param_->col_);
    if (parallel_by_oc) {
      /* input transpose and input sum: done once for all oc workers */
      if (args->support_optimize_) {
        ParallelLaunch(thread_pool, OcOptPre, args, thread_count_hw);
      } else {
        RowMajor2Row16x4MajorInt8(args->input_ptr_, args->packed_input_, args->matmul_param_->row_,
                                  args->matmul_param_->deep_);
        if (args->filter_peroc_) {
          PackInputSum16x4PerLayer(args->packed_input_, args->input_sum_, 1, args->matmul_param_->row_4_,
                                   args->matmul_param_->deep_16_);
        } else {
          PackInputSum16x4PerLayer(args->packed_input_, args->input_sum_,
                                   args->conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_,
                                   args->matmul_param_->row_4_, args->matmul_param_->deep_16_);
        }
      }
      /* matmul parallel by oc */
      if (args->support_optimize_) {
        ParallelLaunch(thread_pool, RunArm64OptOc, args, thread_count_oc);
      } else {
        ParallelLaunch(thread_pool, RunArmOc, args, thread_count_oc);
      }
    } else {
      /* matmul parallel by hw: each worker packs and multiplies its own rows */
      if (args->support_optimize_) {
        ParallelLaunch(thread_pool, RunArm64OptHw, args, thread_count_hw);
      } else {
        ParallelLaunch(thread_pool, RunArmHw, args, thread_count_hw);
      }
    }
  }
}
| @@ -0,0 +1,49 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_MICRO_INT8_CONV1X1_RUN_H_ | |||
| #define MINDSPORE_LITE_MICRO_INT8_CONV1X1_RUN_H_ | |||
| #include <stdint.h> | |||
| #include <stdbool.h> | |||
| #include "nnacl/conv_parameter.h" | |||
| #include "nnacl/matmul_parameter.h" | |||
| #include "src/runtime/thread_pool.h" | |||
// Argument bundle emitted by the micro code generator and consumed by
// Conv1x1Run and its ParallelLaunch workers. Pointer members reference
// buffers owned elsewhere (generated init code / workspace allocator).
typedef struct {
  int32_t *input_sum_;      /* per-row input sums, per-oc */
  int32_t *filter_zp_ptr_;  /* per-oc up round */
  int32_t *left_shift_;     /* per-oc up round */
  int32_t *right_shift_;    /* per-oc up round */
  int32_t *multiplier_;     /* per-oc up round */
  int8_t *packed_weight_;   /* weights packed by Conv1x1Init */
  int32_t *bias_data_;      /* bias with zp terms folded in by Conv1x1Init */
  int8_t *packed_input_;    /* packed-input scratch */
  int8_t *input_ptr_;       /* per-batch input (set by Pre1x1Trans) */
  int8_t *output_ptr_;      /* per-batch output (set by Pre1x1Trans) */
  size_t thread_stride_hw_; /* row tiles per task; set by Conv1x1Run */
  size_t thread_stride_oc_; /* col tiles per task; set by Conv1x1Run */
  ConvParameter *conv_param_;
  MatMulParameter *matmul_param_;
  MATMUL_OPT_DP_FUNC matmul_func_; /* optimized matmul entry; used when support_optimize_ */
  bool pre_trans_input_;           /* true when padding/stride forces input repack */
  bool support_optimize_;
  bool filter_peroc_;              /* per-output-channel quantization */
} Conv1x1Args;
void Conv1x1Run(int8_t *src_in, Conv1x1Args *args, struct ThreadPool *thread_pool, int thread_num, int8_t *src_out);
| #endif // MINDSPORE_LITE_MICRO_INT8_CONV1X1_RUN_H_ | |||