IR Unify

5 years ago · 7108df0d2c
--- a/mindspore/lite/CMakeLists.txt
+++ b/mindspore/lite/CMakeLists.txt
@@ -72,6 +72,7 @@ add_compile_definitions(NO_DLIB)
 add_compile_options(-fPIC)

 if(SUPPORT_TRAIN)
    set(BUILD_MINDDATA "full")
    if(PLATFORM_ARM64)
        set(RUNTIME_COMPONENT_NAME train-android-aarch64)
    elseif(PLATFORM_ARM32)
--- a/mindspore/lite/include/model.h
+++ b/mindspore/lite/include/model.h
@@ -19,14 +19,14 @@
 #include "include/lite_utils.h"

 namespace mindspore::lite {
 class PrimitiveC;
 struct MS_API Model {
  struct Node {
    String name_;
    NodeType node_type_;
    PrimitiveC *primitive_;
    const void *primitive_;
    Uint32Vector input_indices_;
    Uint32Vector output_indices_;
    int quant_type_;
  };
  using NodePtrVector = std::vector<Node *>;
  struct SubGraph {
@@ -55,7 +55,7 @@ struct MS_API Model {
  /// \brief Free meta graph temporary buffer
  virtual void Free() = 0;

  /// \brief Free all temporay buffer.EG: nodes in the model.
  /// \brief Free all temporary buffer.EG: nodes in the model.
  virtual void Destroy() = 0;

  /// \brief Model destruct, free all memory
--- a/mindspore/lite/include/version.h
+++ b/mindspore/lite/include/version.h
@@ -22,7 +22,7 @@
 namespace mindspore {
 namespace lite {
 const int ms_version_major = 1;
 const int ms_version_minor = 1;
 const int ms_version_minor = 2;
 const int ms_version_revision = 0;

 /// \brief Global method to get a version string.
--- a/mindspore/lite/micro/CMakeLists.txt
+++ b/mindspore/lite/micro/CMakeLists.txt
@@ -9,16 +9,10 @@ include_directories(${CMAKE_BINARY_DIR})
 include(${TOP_DIR}/cmake/utils.cmake)
 include(${TOP_DIR}/cmake/dependency_utils.cmake)
 include(${TOP_DIR}/cmake/dependency_securec.cmake)
 include(${TOP_DIR}/cmake/external_libs/glog.cmake)
 include(${TOP_DIR}/cmake/external_libs/flatbuffers.cmake)
 include(${TOP_DIR}/cmake/external_libs/cmsis.cmake)

 set(FBS_FILES
        ${CMAKE_CURRENT_SOURCE_DIR}/../schema/model.fbs
        ${CMAKE_CURRENT_SOURCE_DIR}/../schema/ops.fbs
        ${CMAKE_CURRENT_SOURCE_DIR}/../schema/model_v0.fbs
        ${CMAKE_CURRENT_SOURCE_DIR}/../schema/ops_v0.fbs
        )

 file(GLOB FBS_FILES ${CMAKE_CURRENT_SOURCE_DIR}/../schema/*.fbs)
 ms_build_flatbuffers_lite(FBS_FILES
        ${CMAKE_CURRENT_SOURCE_DIR}/../schema/
        fbs_src
@@ -50,6 +44,6 @@ if(ENABLE_ASAN)
 endif()

 add_subdirectory(coder)
 if(${BUILD_TESTCASES})
 if(BUILD_TESTCASES)
  add_subdirectory(test)
 endif()
--- a/mindspore/lite/micro/cmake/file_list.cmake
+++ b/mindspore/lite/micro/cmake/file_list.cmake
@@ -5,6 +5,9 @@ set(CODER_SRC
        ${MICRO_DIR}/coder/graph.cc
        ${MICRO_DIR}/coder/session.cc
        ${MICRO_DIR}/coder/train.cc
        ${MICRO_DIR}/coder/utils/coder_utils.cc
        ${MICRO_DIR}/coder/utils/dir_utils.cc
        ${MICRO_DIR}/coder/utils/type_cast.cc
        )

 set(CODER_ALLOCATOR_SRC
@@ -21,6 +24,11 @@ set(CODER_GENERATOR_SRC
        ${MICRO_DIR}/coder/generator/component/weight_component.cc
        ${MICRO_DIR}/coder/generator/component/cmake_component.cc
        ${MICRO_DIR}/coder/generator/component/train_component.cc
        ${MICRO_DIR}/coder/generator/component/parallel_component.cc
        )

 set(MINDSPORE_CORE
        ${TOP_DIR}/mindspore/core/gvar/logging_level.cc
        )

 set(CODER_OPCODERS_SRC
@@ -28,16 +36,20 @@ set(CODER_OPCODERS_SRC
        ${MICRO_DIR}/coder/opcoders/op_coder.cc
        ${MICRO_DIR}/coder/opcoders/op_coder_builder.cc
        ${MICRO_DIR}/coder/opcoders/op_coder_register.cc
        ${MICRO_DIR}/coder/opcoders/parallel.cc
        #### serializer
        ${MICRO_DIR}/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.cc
        ${MICRO_DIR}/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.cc
        ${MICRO_DIR}/coder/opcoders/serializers/nnacl_serializer/nnacl_stream_utils.cc
        #### base coder
        ${MICRO_DIR}/coder/opcoders/base/conv2d_base_coder.cc
        ${MICRO_DIR}/coder/opcoders/base/dtype_cast_coder.cc
        ${MICRO_DIR}/coder/opcoders/base/full_connection_base_coder.cc
        ${MICRO_DIR}/coder/opcoders/base/quant_dtype_cast_coder.cc
        ${MICRO_DIR}/coder/opcoders/base/reduce_base_coder.cc
        ${MICRO_DIR}/coder/opcoders/base/resize_base_coder.cc
        ${MICRO_DIR}/coder/opcoders/base/softmax_base_coder.cc
        ${MICRO_DIR}/coder/opcoders/base/detection_post_process_base_coder.cc
        #### cmsis int8 coder
        ${MICRO_DIR}/coder/opcoders/cmsis-nn/int8/add_int8_coder.cc
        ${MICRO_DIR}/coder/opcoders/cmsis-nn/int8/conv2d_base_coder.cc
@@ -55,6 +67,7 @@ set(CODER_OPCODERS_SRC
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/arithmetic_self_fp32_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/assign_add_fp32_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/batchnorm_fp32_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/biasadd_fp32_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/concat_fp32_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/convolution_fp32_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/convolution_winograd_fp32_coder.cc
@@ -64,21 +77,20 @@ set(CODER_OPCODERS_SRC
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/gather_fp32_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/matmul_fp32_base_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/matmul_fp32_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/nchw2nhwc_fp32_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/nhwc2nchw_fp32_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/pad_fp32_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/pooling_fp32_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/power_fp32_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/reduce_fp32_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/reshape_fp32_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/scale_fp32_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/slice_fp32_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/softmax_fp32_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/squeeze_dims_fp32_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/tile_fp32_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/transpose_fp32_coder.cc
        #### nnacl int8 coder
        ${MICRO_DIR}/coder/opcoders/nnacl/int8/activation_int8_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/int8/add_int8_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/int8/batchnorm_int8_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/int8/concat_int8_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/int8/fullconnection_int8_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/int8/matmul_int8_coder.cc
@@ -87,40 +99,69 @@ set(CODER_OPCODERS_SRC
        ${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_int8_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/int8/deconvolution_int8_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/int8/pooling_int8_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/int8/resize_int8_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/int8/reduce_int8_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/int8/reshape_int8_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/int8/softmax_int8_coder.cc
        )

 set(CODER_UTILS_SRC
        ${MICRO_DIR}/coder/utils/coder_utils.cc
        ${MICRO_DIR}/coder/utils/dir_utils.cc
        ${MICRO_DIR}/coder/utils/type_cast.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/int8/sub_int8_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/int8/detection_post_process_int8_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/int8/sigmoid_int8_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/int8/relux_int8_coder.cc
        ${MICRO_DIR}/coder/opcoders/nnacl/int8/div_int8_coder.cc
        #### nnacl dequant coder
        ${MICRO_DIR}/coder/opcoders/nnacl/dequant/de_quant.cc
        )

 set(LITE_SRC
        ${LITE_DIR}/src/common/file_utils.cc
        ${LITE_DIR}/src/common/graph_util.cc
        ${LITE_DIR}/src/common/string_util.cc
        ${LITE_DIR}/src/common/prim_util.cc
        ${LITE_DIR}/src/common/tensor_util.cc
        ${LITE_DIR}/src/runtime/allocator.cc
        ${LITE_DIR}/src/runtime/infer_manager.cc
        ${LITE_DIR}/src/runtime/runtime_api.cc
        ${LITE_DIR}/src/lite_model.cc
        ${LITE_DIR}/src/tensorlist.cc
        ${LITE_DIR}/src/tensor.cc
        ${LITE_DIR}/src/scheduler.cc
        ${LITE_DIR}/src/inner_context.cc
        ${LITE_DIR}/src/dequant.cc
        ${LITE_DIR}/src/kernel_registry.cc
        ${LITE_DIR}/src/lite_kernel.cc
        ${LITE_DIR}/src/sub_graph_kernel.cc
        ${LITE_DIR}/src/huffman_decode.cc
        ${LITE_DIR}/src/executor.cc
        ${LITE_DIR}/src/common/log_adapter.cc
        ### src/ops for parameter and infer shape
        ${LITE_DIR}/src/ops/batch_norm.cc
        ${LITE_DIR}/src/ops/conv2d.cc
        ${LITE_DIR}/src/ops/primitive_c.cc
        ${LITE_DIR}/src/ops/slice.cc
        ${LITE_DIR}/src/ops/while.cc
        ${LITE_DIR}/src/common/utils.cc
        ### populate operator parameter
        ${LITE_DIR}/src/ops/populate/conv2d_populate.cc
        ${LITE_DIR}/src/ops/populate/arithmetic_populate.cc
        ${LITE_DIR}/src/ops/populate/add_populate.cc
        ${LITE_DIR}/src/ops/populate/concat_populate.cc
        ${LITE_DIR}/src/ops/populate/conv2d_populate.cc
        ${LITE_DIR}/src/ops/populate/detection_post_process_populate.cc
        ${LITE_DIR}/src/ops/populate/depthwise_conv2d_populate.cc
        ${LITE_DIR}/src/ops/populate/full_connection_populate.cc
        ${LITE_DIR}/src/ops/populate/pooling_populate.cc
        ${LITE_DIR}/src/ops/populate/quant_dtype_cast_populate.cc
        ${LITE_DIR}/src/ops/populate/resize_populate.cc
        ${LITE_DIR}/src/ops/populate/reshape_populate.cc
        ${LITE_DIR}/src/ops/populate/batch_norm_populate.cc
        ${LITE_DIR}/src/ops/populate/slice_populate.cc
        ${LITE_DIR}/src/ops/populate/while_populate.cc
        ${LITE_DIR}/src/ops/populate/matmul_populate.cc
        ${LITE_DIR}/src/ops/populate/bias_add_populate.cc
        ${LITE_DIR}/src/ops/populate/activation_populate.cc
        ### tools
        ${LITE_DIR}/tools/common/flag_parser.cc
        )
 set(LITE_KERNEL_SRC
        ### nnacl
        ${LITE_DIR}/nnacl/common_func.c
        ${LITE_DIR}/nnacl/base/minimal_filtering_generator.c
        ${LITE_DIR}/nnacl/base/arithmetic_base.c
        ${LITE_DIR}/nnacl/base/slice_base.c
        ${LITE_DIR}/nnacl/fp32/winograd_utils.c
        ${LITE_DIR}/nnacl/fp32/pack_fp32.c
        ${LITE_DIR}/nnacl/int8/quantize.c
@@ -128,13 +169,138 @@ set(LITE_KERNEL_SRC
        ${LITE_DIR}/nnacl/int8/matmul_int8.c
        ${LITE_DIR}/nnacl/int8/fixed_point.c
        ${LITE_DIR}/nnacl/fp32/matmul_fp32.c
        ${LITE_DIR}/nnacl/int8/arithmetic_int8.c
        ${LITE_DIR}/nnacl/int8/add_int8.c
        ${LITE_DIR}/nnacl/int8/concat_int8.c
        ${LITE_DIR}/nnacl/int8/conv_int8.c
        ${LITE_DIR}/nnacl/int8/conv3x3_int8.c
        ${LITE_DIR}/nnacl/int8/conv1x1_int8.c
        ${LITE_DIR}/nnacl/base/conv1x1_base.c
        ${LITE_DIR}/nnacl/int8/conv_depthwise_int8.c
        ${LITE_DIR}/nnacl/int8/deconv_int8.c
        ${LITE_DIR}/nnacl/int8/common_func_int8.c
        ${LITE_DIR}/nnacl/int8/slice_int8.c
        ${LITE_DIR}/nnacl/int8/batchnorm_int8.c
        ${LITE_DIR}/nnacl/int8/sub_int8.c
        ${LITE_DIR}/nnacl/int8/quant_dtype_cast_int8.c
        ${LITE_DIR}/nnacl/int8/sigmoid_int8.c
        ${LITE_DIR}/nnacl/int8/resize_int8.c
        ### infer
        ${LITE_DIR}/nnacl/infer/adam_infer.c
        ${LITE_DIR}/nnacl/infer/add_sub_grad_infer.c
        ${LITE_DIR}/nnacl/infer/addn_infer.c
        ${LITE_DIR}/nnacl/infer/apply_momentum_infer.c
        ${LITE_DIR}/nnacl/infer/argmin_max_infer.c
        ${LITE_DIR}/nnacl/infer/arithmetic_compare_infer.c
        ${LITE_DIR}/nnacl/infer/arithmetic_grad_infer.c
        ${LITE_DIR}/nnacl/infer/arithmetic_infer.c
        ${LITE_DIR}/nnacl/infer/assert_op_infer.c
        ${LITE_DIR}/nnacl/infer/assign_add_infer.c
        ${LITE_DIR}/nnacl/infer/assign_infer.c
        ${LITE_DIR}/nnacl/infer/audio_spectrogram_infer.c
        ${LITE_DIR}/nnacl/infer/batch_to_space_infer.c
        ${LITE_DIR}/nnacl/infer/bias_grad_infer.c
        ${LITE_DIR}/nnacl/infer/binary_cross_entropy_infer.c
        ${LITE_DIR}/nnacl/infer/bn_grad_infer.c
        ${LITE_DIR}/nnacl/infer/broadcast_to_infer.c
        ${LITE_DIR}/nnacl/infer/cast_infer.c
        ${LITE_DIR}/nnacl/infer/common_infer.c
        ${LITE_DIR}/nnacl/infer/concat_infer.c
        ${LITE_DIR}/nnacl/infer/constant_of_shape_infer.c
        ${LITE_DIR}/nnacl/infer/conv2d_grad_filter_infer.c
        ${LITE_DIR}/nnacl/infer/conv2d_grad_input_infer.c
        ${LITE_DIR}/nnacl/infer/conv2d_infer.c
        ${LITE_DIR}/nnacl/infer/crop_and_resize_infer.c
        ${LITE_DIR}/nnacl/infer/crop_infer.c
        ${LITE_DIR}/nnacl/infer/custom_extract_features_infer.c
        ${LITE_DIR}/nnacl/infer/custom_normalize_infer.c
        ${LITE_DIR}/nnacl/infer/custom_predict_infer.c
        ${LITE_DIR}/nnacl/infer/deconv2d_infer.c
        ${LITE_DIR}/nnacl/infer/dedepthwise_conv2d_infer.c
        ${LITE_DIR}/nnacl/infer/depth_to_space_infer.c
        ${LITE_DIR}/nnacl/infer/depthwise_conv2d_infer.c
        ${LITE_DIR}/nnacl/infer/detection_post_process_infer.c
        ${LITE_DIR}/nnacl/infer/dropout_grad_infer.c
        ${LITE_DIR}/nnacl/infer/dropout_infer.c
        ${LITE_DIR}/nnacl/infer/embedding_lookup_infer.c
        ${LITE_DIR}/nnacl/infer/expand_dims_infer.c
        ${LITE_DIR}/nnacl/infer/fft_imag_infer.c
        ${LITE_DIR}/nnacl/infer/fft_real_infer.c
        ${LITE_DIR}/nnacl/infer/fill_infer.c
        ${LITE_DIR}/nnacl/infer/flatten_grad_infer.c
        ${LITE_DIR}/nnacl/infer/flatten_infer.c
        ${LITE_DIR}/nnacl/infer/full_connection_infer.c
        ${LITE_DIR}/nnacl/infer/fused_batchnorm_infer.c
        ${LITE_DIR}/nnacl/infer/gather_infer.c
        ${LITE_DIR}/nnacl/infer/gather_nd_infer.c
        ${LITE_DIR}/nnacl/infer/group_conv2d_grad_input_infer.c
        ${LITE_DIR}/nnacl/infer/gru_infer.c
        ${LITE_DIR}/nnacl/infer/hashtable_lookup_infer.c
        ${LITE_DIR}/nnacl/infer/invert_permutation_infer.c
        ${LITE_DIR}/nnacl/infer/layer_norm_infer.c
        ${LITE_DIR}/nnacl/infer/lin_space_infer.c
        ${LITE_DIR}/nnacl/infer/lsh_projection_infer.c
        ${LITE_DIR}/nnacl/infer/lstm_infer.c
        ${LITE_DIR}/nnacl/infer/matmul_infer.c
        ${LITE_DIR}/nnacl/infer/maximum_grad_infer.c
        ${LITE_DIR}/nnacl/infer/mean_infer.c
        ${LITE_DIR}/nnacl/infer/merge_infer.c
        ${LITE_DIR}/nnacl/infer/mfcc_infer.c
        ${LITE_DIR}/nnacl/infer/non_max_suppression_infer.c
        ${LITE_DIR}/nnacl/infer/one_hot_infer.c
        ${LITE_DIR}/nnacl/infer/pad_infer.c
        ${LITE_DIR}/nnacl/infer/partial_infer.c
        ${LITE_DIR}/nnacl/infer/pooling_grad_infer.c
        ${LITE_DIR}/nnacl/infer/pooling_infer.c
        ${LITE_DIR}/nnacl/infer/power_infer.c
        ${LITE_DIR}/nnacl/infer/prior_box_infer.c
        ${LITE_DIR}/nnacl/infer/quant_dtype_cast_infer.c
        ${LITE_DIR}/nnacl/infer/random_standard_normal_infer.c
        ${LITE_DIR}/nnacl/infer/range_infer.c
        ${LITE_DIR}/nnacl/infer/rank_infer.c
        ${LITE_DIR}/nnacl/infer/reduce_infer.c
        ${LITE_DIR}/nnacl/infer/reshape_infer.c
        ${LITE_DIR}/nnacl/infer/resize_infer.c
        ${LITE_DIR}/nnacl/infer/rfft_infer.c
        ${LITE_DIR}/nnacl/infer/roi_pooling_infer.c
        ${LITE_DIR}/nnacl/infer/scatter_nd_infer.c
        ${LITE_DIR}/nnacl/infer/select_infer.c
        ${LITE_DIR}/nnacl/infer/sgd_infer.c
        ${LITE_DIR}/nnacl/infer/shape_infer.c
        ${LITE_DIR}/nnacl/infer/size_infer.c
        ${LITE_DIR}/nnacl/infer/skip_gram_infer.c
        ${LITE_DIR}/nnacl/infer/slice_infer.c
        ${LITE_DIR}/nnacl/infer/softmax_cross_entropy_infer.c
        ${LITE_DIR}/nnacl/infer/softmax_infer.c
        ${LITE_DIR}/nnacl/infer/space_to_batch_infer.c
        ${LITE_DIR}/nnacl/infer/space_to_batch_nd_infer.c
        ${LITE_DIR}/nnacl/infer/space_to_depth_infer.c
        ${LITE_DIR}/nnacl/infer/sparse_softmax_cross_entropy_infer.c
        ${LITE_DIR}/nnacl/infer/sparse_to_dense_infer.c
        ${LITE_DIR}/nnacl/infer/split_infer.c
        ${LITE_DIR}/nnacl/infer/squeeze_infer.c
        ${LITE_DIR}/nnacl/infer/stack_infer.c
        ${LITE_DIR}/nnacl/infer/strided_slice_grad_infer.c
        ${LITE_DIR}/nnacl/infer/strided_slice_infer.c
        ${LITE_DIR}/nnacl/infer/switch_infer.c
        ${LITE_DIR}/nnacl/infer/tensorlist_fromtensor_infer.c
        ${LITE_DIR}/nnacl/infer/tensorlist_getitem_infer.c
        ${LITE_DIR}/nnacl/infer/tensorlist_reserve_infer.c
        ${LITE_DIR}/nnacl/infer/tensorlist_setitem_infer.c
        ${LITE_DIR}/nnacl/infer/tensorlist_stack_infer.c
        ${LITE_DIR}/nnacl/infer/tile_infer.c
        ${LITE_DIR}/nnacl/infer/topk_infer.c
        ${LITE_DIR}/nnacl/infer/transpose_infer.c
        ${LITE_DIR}/nnacl/infer/uniform_real_infer.c
        ${LITE_DIR}/nnacl/infer/unique_infer.c
        ${LITE_DIR}/nnacl/infer/unsorted_segment_sum_infer.c
        ${LITE_DIR}/nnacl/infer/unsqueeze_infer.c
        ${LITE_DIR}/nnacl/infer/unstack_infer.c
        ${LITE_DIR}/nnacl/infer/where_infer.c
        ${LITE_DIR}/nnacl/infer/while_infer.c
        ${LITE_DIR}/nnacl/infer/splice_infer.c
        )

 list(APPEND FILE_SET ${CODER_SRC} ${CODER_UTILS_SRC} ${CODER_OPCODERS_SRC} ${CODER_GENERATOR_SRC}
        ${CODER_ALLOCATOR_SRC} ${LITE_SRC} ${LITE_KERNEL_SRC})
 list(APPEND FILE_SET ${CODER_SRC} ${CODER_OPCODERS_SRC} ${CODER_GENERATOR_SRC}
        ${CODER_ALLOCATOR_SRC} ${LITE_SRC} ${LITE_KERNEL_SRC} ${MINDSPORE_CORE})

--- a/mindspore/lite/micro/cmake/package_cmsis.cmake
+++ b/mindspore/lite/micro/cmake/package_cmsis.cmake
@@ -0,0 +1,21 @@
 # Collects the CMSIS-NN kernel sources used when building for x86 with CMSIS enabled.
 # Reads:  LITE_DIR (not defined in this file; must be set by the including script),
 #         MICRO_CMSIS_X86 (switch that enables everything below).
 # Writes: CMSIS_DIR always; CMSIS_OPS and the CMSIS include paths only when MICRO_CMSIS_X86 is true.
 set(CMSIS_DIR ${LITE_DIR}/micro/build/cmsis)
 if(MICRO_CMSIS_X86)
    message("build cmsis kernels")
    # Header search paths for the Core, DSP and NN parts of the CMSIS tree.
    include_directories(${CMSIS_DIR}/CMSIS/Core/Include)
    include_directories(${CMSIS_DIR}/CMSIS/DSP/Include)
    include_directories(${CMSIS_DIR}/CMSIS/NN/Include)

    # Deletes this source file from disk at configure time, before the glob below runs,
    # so the NNSupportFunctions/*.c pattern cannot pick it up.
    # NOTE(review): the reason this particular file must be excluded is not visible here - confirm.
    file(REMOVE ${CMSIS_DIR}/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_no_shift.c)

    # Gather every .c file from the listed CMSIS-NN source folders into CMSIS_OPS.
    # Nothing in this file consumes CMSIS_OPS; it is left for the including script.
    file(GLOB CMSIS_OPS
            ${CMSIS_DIR}/CMSIS/NN/Source/BasicMathFunctions/*.c
            ${CMSIS_DIR}/CMSIS/NN/Source/ActivationFunctions/*.c
            ${CMSIS_DIR}/CMSIS/NN/Source/ConcatenationFunctions/*.c
            ${CMSIS_DIR}/CMSIS/NN/Source/ConvolutionFunctions/*.c
            ${CMSIS_DIR}/CMSIS/NN/Source/FullyConnectedFunctions/*.c
            ${CMSIS_DIR}/CMSIS/NN/Source/NNSupportFunctions/*.c
            ${CMSIS_DIR}/CMSIS/NN/Source/PoolingFunctions/*.c
            ${CMSIS_DIR}/CMSIS/NN/Source/ReshapeFunctions/*.c
            ${CMSIS_DIR}/CMSIS/NN/Source/SoftmaxFunctions/*.c
            )
 endif()
--- a/mindspore/lite/micro/cmake/package_micro_ops.cmake
+++ b/mindspore/lite/micro/cmake/package_micro_ops.cmake
@@ -1,32 +0,0 @@
 include_directories(${NNACL_DIR}/..)

 set(CMSIS_SRC ${NNACL_DIR}/../micro/build/cmsis)
 if(MICRO_CMSIS_X86)
    message("*****build cmsis x86 codes****")
    include_directories(${CMSIS_SRC}/CMSIS/Core/Include)
    include_directories(${CMSIS_SRC}/CMSIS/DSP/Include)
    include_directories(${CMSIS_SRC}/CMSIS/NN/Include)
    file(GLOB RUNTIME_KERNEL_CMSIS_SRC
            ${CMSIS_SRC}/CMSIS/NN/Source/BasicMathFunctions/*.c
            ${CMSIS_SRC}/CMSIS/NN/Source/ActivationFunctions/*.c
            ${CMSIS_SRC}/CMSIS/NN/Source/ConcatenationFunctions/*.c
            ${CMSIS_SRC}/CMSIS/NN/Source/ConvolutionFunctions/*.c
            ${CMSIS_SRC}/CMSIS/NN/Source/FullyConnectedFunctions/*.c
            ${CMSIS_SRC}/CMSIS/NN/Source/NNSupportFunctions/*.c
            ${CMSIS_SRC}/CMSIS/NN/Source/PoolingFunctions/*.c
            ${CMSIS_SRC}/CMSIS/NN/Source/ReshapeFunctions/*.c
            ${CMSIS_SRC}/CMSIS/NN/Source/SoftmaxFunctions/*.c
            )
 endif()

 ########################### files ###########################
 file(GLOB RUNTIME_KERNEL_SRC
        ${NNACL_DIR}/kernel/fp32/*.c
        ${NNACL_DIR}/kernel/int8/*.c
        )
 if(MICRO_CMSIS_X86)
    set(RUNTIME_OPS ${RUNTIME_KERNEL_SRC} ${RUNTIME_TRAIN_SRC} ${RUNTIME_KERNEL_CMSIS_SRC})
 else()
    set(RUNTIME_OPS ${RUNTIME_KERNEL_SRC} ${RUNTIME_TRAIN_SRC})
 endif()

--- a/mindspore/lite/micro/cmake/package_nnacl.cmake
+++ b/mindspore/lite/micro/cmake/package_nnacl.cmake
@@ -0,0 +1,20 @@
 # Builds the NNACL_OPS source list: the nnacl C kernels plus, for ARM builds, hand-written assembly.
 # Reads:  LITE_DIR (not defined in this file), MICRO_BUILD_ARM64, MICRO_BUILD_ARM32A.
 # Writes: NNACL_DIR, KERNEL_SRC, ASSEMBLY_SRC (ARM builds only), NNACL_OPS.
 include_directories(${LITE_DIR})
 set(NNACL_DIR ${LITE_DIR}/nnacl)
 # Non-recursive glob: only .c files directly inside nnacl/, base/, fp32/ and int8/ are collected.
 file(GLOB KERNEL_SRC
    ${NNACL_DIR}/*.c
    ${NNACL_DIR}/base/*.c
    ${NNACL_DIR}/fp32/*.c
    ${NNACL_DIR}/int8/*.c
 )

 if(MICRO_BUILD_ARM64)
    file(GLOB ASSEMBLY_SRC ${NNACL_DIR}/assembly/arm64/*.S)
    # Mark the .S files as language C so they are compiled through the C compiler.
    set_property(SOURCE ${ASSEMBLY_SRC} PROPERTY LANGUAGE C)
 endif()

 # If both ARM switches are on, this glob overwrites the ASSEMBLY_SRC set by the ARM64 branch above.
 if(MICRO_BUILD_ARM32A)
    file(GLOB ASSEMBLY_SRC ${NNACL_DIR}/assembly/arm32/*.S)
    set_property(SOURCE ${ASSEMBLY_SRC} PROPERTY LANGUAGE C)
 endif()

 # When neither ARM switch is set, this file never assigns ASSEMBLY_SRC, so it contributes
 # nothing here unless it was defined by an earlier script.
 set(NNACL_OPS ${KERNEL_SRC} ${ASSEMBLY_SRC})
--- a/mindspore/lite/micro/cmake/package_wrapper.cmake
+++ b/mindspore/lite/micro/cmake/package_wrapper.cmake
@@ -0,0 +1,25 @@
 # Adds the operator-library wrapper sources and the thread pool runtime source to FILE_SET.
 # Reads:  LITE_DIR (not defined in this file; must be set by the including script).
 # Writes: WRAPPER_DIR, RUNTIME_SRC, WRAPPER_SRC; appends to FILE_SET; extends CMAKE_CXX_FLAGS.
 include_directories(${LITE_DIR}/micro/coder/operator_library)
 # NOTE(review): only the C++ flags receive -pthread, while every source listed below is a .c file;
 # CMAKE_C_FLAGS is not touched in this file - confirm that is intended.
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
 # WRAPPER_DIR already ends in '/', so the entries below expand with a double slash ("wrapper//int8/...").
 set(WRAPPER_DIR ${LITE_DIR}/micro/coder/operator_library/wrapper/)

 # Thread pool implementation taken from the lite runtime sources.
 set(RUNTIME_SRC
        ${LITE_DIR}/src/runtime/thread_pool.c
        )

 # Explicit (non-globbed) list of wrapper sources, grouped by base / fp32 / int8 subdirectory.
 set(WRAPPER_SRC
        ${WRAPPER_DIR}/base/detection_post_process_base_wrapper.c
        ${WRAPPER_DIR}/fp32/matmul_fp32_wrapper.c
        ${WRAPPER_DIR}/int8/matmul_int8_wrapper.c
        ${WRAPPER_DIR}/int8/add_int8_wrapper.c
        ${WRAPPER_DIR}/int8/concat_int8_wrapper.c
        ${WRAPPER_DIR}/int8/convolution_int8_wrapper.c
        ${WRAPPER_DIR}/int8/conv_init_int8_wrapper.c
        ${WRAPPER_DIR}/int8/conv1x1_init_int8_wrapper.c
        ${WRAPPER_DIR}/int8/conv1x1_run_int8_wrapper.c
        ${WRAPPER_DIR}/int8/convolution_depthwise_int8_wrapper.c
        ${WRAPPER_DIR}/int8/resize_int8_wrapper.c
        ${WRAPPER_DIR}/int8/slice_int8_wrapper.c
        ${WRAPPER_DIR}/int8/batchnorm_int8_wrapper.c
        )

 # FILE_SET is the shared source list the including CMakeLists passes to its executable target.
 list(APPEND FILE_SET ${WRAPPER_SRC} ${RUNTIME_SRC})
--- a/mindspore/lite/micro/cmake/wrapper.cmake
+++ b/mindspore/lite/micro/cmake/wrapper.cmake
@@ -1,12 +0,0 @@
 SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")

 set(MICRO_WRAPPER_SRC
        ${LITE_DIR}/src/runtime/thread_pool.c
        ${MICRO_DIR}/wrapper/fp32/matmul_fp32_wrapper.c
        ${MICRO_DIR}/wrapper/int8/matmul_int8_wrapper.c
        ${MICRO_DIR}/wrapper/int8/conv_init_int8_wrapper.c
        ${MICRO_DIR}/wrapper/int8/conv1x1_init_int8_wrapper.c
        ${MICRO_DIR}/wrapper/int8/conv1x1_run_int8_wrapper.c
        )

 list(APPEND FILE_SET ${MICRO_WRAPPER_SRC})
--- a/mindspore/lite/micro/coder/CMakeLists.txt
+++ b/mindspore/lite/micro/coder/CMakeLists.txt
@@ -1,7 +1,10 @@
 add_definitions(-DUSE_GLOG)

 set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--gc-sections ")
 set(MICRO_DIR ${CMAKE_CURRENT_SOURCE_DIR}/..)
 set(LITE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../..)
 set(3RD_DIR ${TOP_DIR}/third_party)
 set(LITE_DIR ${TOP_DIR}/mindspore/lite)
 set(MICRO_DIR ${LITE_DIR}/micro)

 if(ENABLE_CONVERTER)
    set(CODEGEN_PATH ${CMAKE_BINARY_DIR}/micro/coder/codegen)
 else()
@@ -13,17 +16,19 @@ include_directories(${3RD_DIR})
 include_directories(${3RD_DIR}/flatbuffers/include)
 #include ms
 include_directories(${TOP_DIR}/)
 include_directories(${LITE_DIR})
 include_directories(${TOP_DIR}/mindspore/core/)

 include_directories(${LITE_DIR})
 include_directories(${MICRO_DIR})
 #include coder
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../)
 include(${TOP_DIR}/cmake/external_libs/cmsis.cmake)
 include(${MICRO_DIR}/cmake/file_list.cmake)
 include(${MICRO_DIR}/cmake/wrapper.cmake)
 include(${MICRO_DIR}/cmake/package_wrapper.cmake)
 add_subdirectory(operator_library)

 add_executable(codegen main.cc ${FILE_SET})
 add_dependencies(codegen fbs_src)
 add_dependencies(codegen fbs_inner_src)
 target_link_libraries(codegen PRIVATE ${SECUREC_LIBRARY})
 if(NOT WIN32)
  add_custom_command(TARGET codegen POST_BUILD COMMAND strip ${CODEGEN_PATH})
 target_link_libraries(codegen PRIVATE ${SECUREC_LIBRARY} mindspore::glog)
 if(NOT WIN32 AND "${CMAKE_BUILD_TYPE}" STREQUAL "Release")
    add_custom_command(TARGET codegen POST_BUILD COMMAND strip ${CODEGEN_PATH})
 endif()
--- a/mindspore/lite/micro/coder/allocator/allocator.cc
+++ b/mindspore/lite/micro/coder/allocator/allocator.cc
@@ -22,11 +22,9 @@

 namespace mindspore::lite::micro {
 void *MemoryAllocator::MallocWeightTensor(TypeId type_id, size_t size, MallocType type) {
  static const std::map<TypeId, size_t> size_map = {{kNumberTypeFloat32, sizeof(float)},
                                                    {kNumberTypeInt32, sizeof(int)},
                                                    {kNumberTypeInt32, sizeof(int32_t)},
                                                    {kNumberTypeInt16, sizeof(int16_t)},
                                                    {kNumberTypeInt8, sizeof(int8_t)}};
  static const std::map<TypeId, size_t> size_map = {
    {kNumberTypeFloat, sizeof(float)},   {kNumberTypeFloat32, sizeof(float)}, {kNumberTypeInt32, sizeof(int32_t)},
    {kNumberTypeInt16, sizeof(int16_t)}, {kNumberTypeInt8, sizeof(int8_t)},   {kNumberTypeUInt8, sizeof(uint8_t)}};
  auto item = size_map.find(type_id);
  MS_CHECK_TRUE_RET_NULL(item != size_map.end(), "unsupported type idnex");
  size_t type_size = item->second;
--- a/mindspore/lite/micro/coder/allocator/allocator.h
+++ b/mindspore/lite/micro/coder/allocator/allocator.h
@@ -73,7 +73,7 @@ class MemoryAllocator {
    if (type != kWorkspace) {
      return MallocWeightTensor(type_id, size, type);
    }
    if (size == 0 && size >= UINT_MAX) {
    if (size == 0 || size >= UINT_MAX) {
      return nullptr;
    }

@@ -94,12 +94,12 @@ class MemoryAllocator {
  template <typename T>
  std::string GetRuntimeAddr(T t, bool is_const = false) {
    if (!t) {
      return "NULL";
      return "";
    }
    std::string type_info = is_const ? "const " : "";
    std::string type_name;
    if (std::type_index(typeid(T)) == std::type_index(typeid(Tensor *))) {
      type_name = GetTensorDataType(reinterpret_cast<Tensor *>(t)->data_type()) + " *";
      type_name = GetTensorDataType(reinterpret_cast<Tensor *>(t)->data_type()) + "*";
    } else {
      type_name = GetVariableTypeName<T>();
    }
--- a/mindspore/lite/micro/coder/coder.cc
+++ b/mindspore/lite/micro/coder/coder.cc
@@ -34,19 +34,20 @@ namespace mindspore::lite::micro {
 class CoderFlags : public virtual FlagParser {
 public:
  CoderFlags() {
    AddFlag(&CoderFlags::is_weight_file_, "isWeightFile", "whether generating weight .net file, true| false", false);
    AddFlag(&CoderFlags::is_weight_file_, "isWeightFile", "whether generating weight binary file, true| false", false);
    AddFlag(&CoderFlags::model_path_, "modelPath", "Input model path", "");
    AddFlag(&CoderFlags::code_path_, "codePath", "Input code path", ".");
    AddFlag(&CoderFlags::code_module_name_, "moduleName", "Input code module name", "");
    AddFlag(&CoderFlags::target_, "target", "generateed code target, x86| ARM32M| ARM32A| ARM64", "x86");
    AddFlag(&CoderFlags::code_mode_, "codeMode", "generated code mode, Normal | Inference | Train", "Normal");
    AddFlag(&CoderFlags::debug_mode_, "debugMode", "dump perlayer's time cost and tensor, true | false", false);
    AddFlag(&CoderFlags::target_, "target", "generated code target, x86| ARM32M| ARM32A| ARM64", "x86");
    AddFlag(&CoderFlags::code_mode_, "codeMode", "generated code mode, Inference | Train", "Inference");
    AddFlag(&CoderFlags::support_parallel_, "supportParallel", "whether support parallel launch, true | false", false);
    AddFlag(&CoderFlags::debug_mode_, "debugMode", "dump the tensors data for debugging, true | false", false);
  }

  ~CoderFlags() override = default;

 public:
  std::string model_path_;
  bool support_parallel_{false};
  bool is_weight_file_{false};
  std::string code_module_name_;
  std::string code_path_;
@@ -87,8 +88,7 @@ int Coder::Run(const std::string &model_path) {
 int Coder::Init(const CoderFlags &flags) const {
  static const std::map<std::string, Target> kTargetMap = {
    {"x86", kX86}, {"ARM32M", kARM32M}, {"ARM32A", kARM32A}, {"ARM64", kARM64}, {"All", kAllTargets}};
  static const std::map<std::string, CodeMode> kCodeModeMap = {
    {"Normal", Code_Normal}, {"Inference", Code_Inference}, {"Train", Code_Train}};
  static const std::map<std::string, CodeMode> kCodeModeMap = {{"Inference", Inference}, {"Train", Train}};

  Configurator *config = Configurator::GetInstance();

@@ -112,6 +112,11 @@ int Coder::Init(const CoderFlags &flags) const {
    return true;
  });

  parsers.emplace_back([&flags, config]() -> bool {
    config->set_support_parallel(flags.support_parallel_);
    return true;
  });

  parsers.emplace_back([&flags, config]() -> bool {
    config->set_debug_mode(flags.debug_mode_);
    return true;
--- a/mindspore/lite/micro/coder/coder_config.h
+++ b/mindspore/lite/micro/coder/coder_config.h
@@ -21,7 +21,7 @@

 namespace mindspore::lite::micro {
 enum Target { kX86 = 0, kARM32M = 1, kARM32A = 2, kARM64 = 3, kAllTargets = 4, kTargetUnknown = 99 };
 enum CodeMode { Code_Normal = 0, Code_Inference = 1, Code_Train = 2, Code_Unknown = 99 };
 enum CodeMode { Inference = 0, Train = 1, Code_Unknown = 99 };

 class Configurator {
 public:
@@ -36,9 +36,6 @@ class Configurator {
  void set_code_path(const std::string &code_path) { code_path_ = code_path; }
  std::string code_path() const { return code_path_; }

  void set_subgraph_(const std::string &subgraph) { sub_graph_ = subgraph; }
  std::string sub_graph() { return sub_graph_; }

  void set_target(Target target) { target_ = target; }
  Target target() const { return target_; }

@@ -51,16 +48,19 @@ class Configurator {
  void set_is_weight_file(bool flag) { is_weight_file_ = flag; }
  bool is_weight_file() const { return is_weight_file_; }

  void set_support_parallel(bool parallel) { support_parallel_ = parallel; }
  bool support_parallel() const { return support_parallel_; }

 private:
  Configurator() = default;
  ~Configurator() = default;

  bool is_weight_file_{false};
  std::string module_name_;
  std::string code_path_;
  std::string sub_graph_;
  Target target_{kTargetUnknown};
  CodeMode code_mode_{Code_Unknown};
  bool is_weight_file_{false};
  bool support_parallel_{false};
  bool debug_mode_{false};
 };
 }  // namespace mindspore::lite::micro
--- a/mindspore/lite/micro/coder/context.cc
+++ b/mindspore/lite/micro/coder/context.cc
@@ -14,9 +14,9 @@
 * limitations under the License.
 */

 #include "micro/coder/context.h"
 #include "micro/coder/coder_config.h"
 #include "micro/coder/allocator/allocator.h"
 #include "coder/context.h"
 #include "coder/coder_config.h"
 #include "coder/allocator/allocator.h"

 namespace mindspore::lite::micro {
 CoderContext::CoderContext() {
--- a/mindspore/lite/micro/coder/generator/component/benchmark_component.cc
+++ b/mindspore/lite/micro/coder/generator/component/benchmark_component.cc
@@ -108,7 +108,7 @@ void CodeBenchmarkSetBuffer(std::ofstream &ofs, const std::string &module_name)
      << "_SetBuffer(buffer);\n"
         "  if (ret != RET_OK) {\n"
         "    MICRO_ERROR(\"set inputs failed\");\n"
         "    return RET_ERROR;"
         "    return RET_ERROR;\n"
         "  }\n";
 }

@@ -128,19 +128,6 @@ void CodeBenchmarkInitWeight(std::ofstream &ofs, const std::string &module_name)
         "  weight_buffer = NULL;\n";
 }

 void CodeBenchmarkConfigThread(std::ofstream &ofs) {
  ofs << "  int thread_num = 4;\n"
         "  BindMode bind_mode = NO_BIND_MODE;\n"
         "  if (argc >= 6) {\n"
         "    thread_num = atoi(argv[4]);\n"
         "    bind_mode = atoi(argv[5]);\n"
         "  }\n"
         "  ret = ConfigThreadPool(THREAD_POOL_DEFAULT, thread_num, bind_mode);\n"
         "  if (ret != 0) {\n"
         "    MICRO_ERROR(\"create thread pool failed\");\n"
         "  }\n";
 }

 void CodeBenchmarkInference(std::ofstream &ofs, const std::string &module_name) {
  ofs << "  if (argc >= 4) {\n"
      << "    " << module_name << "_WarmUp();\n"
@@ -170,7 +157,6 @@ void CodeBenchmarkPrintOutputs(std::ofstream &ofs, const std::string &module_nam
         "      PrintTensorData(tensor);\n"
         "  }\n";
  ofs << "  printf(\"" << module_name << " inference success.\\n\");\n";
  ofs << "  free(buffer);\n";
 }

 /**
--- a/mindspore/lite/micro/coder/generator/component/benchmark_component.h
+++ b/mindspore/lite/micro/coder/generator/component/benchmark_component.h
@@ -39,8 +39,6 @@ void CodeBenchmarkSetBuffer(std::ofstream &ofs, const std::string &module_name);

 void CodeBenchmarkInitWeight(std::ofstream &ofs, const std::string &module_name);

 void CodeBenchmarkConfigThread(std::ofstream &ofs);

 void CodeBenchmarkInference(std::ofstream &ofs, const std::string &module_name);

 void CodeBenchmarkPrintOutputs(std::ofstream &ofs, const std::string &module_name);
--- a/mindspore/lite/micro/coder/generator/component/cmake_component.cc
+++ b/mindspore/lite/micro/coder/generator/component/cmake_component.cc
@@ -24,10 +24,9 @@ void CodeCMakeNetLibrary(std::ofstream &ofs, const std::string &module_name, con
                         Target target) {
  ofs << "include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include/)\n";
  if (target == kARM32M) {
    ofs << "include_directories(${OP_HEADER_PATH}/cmsis)\n"
        << "include_directories(${OP_HEADER_PATH}/cmsis/CMSIS/NN/Include)\n"
        << "include_directories(${OP_HEADER_PATH}/cmsis/CMSIS/DSP/Include)\n"
        << "include_directories(${OP_HEADER_PATH}/cmsis/CMSIS/Core/Include)\n";
    ofs << "include_directories(${OP_HEADER_PATH}/CMSIS/NN/Include)\n"
        << "include_directories(${OP_HEADER_PATH}/CMSIS/DSP/Include)\n"
        << "include_directories(${OP_HEADER_PATH}/CMSIS/Core/Include)\n";
  }
  ofs << "set(OP_SRC\n";
  for (const std::string &c_file : ctx->c_files()) {
@@ -38,7 +37,7 @@ void CodeCMakeNetLibrary(std::ofstream &ofs, const std::string &module_name, con
      << ")\n";

  std::set<std::string> kernel_cmake_asm_set_files = ctx->asm_files();
  if (!kernel_cmake_asm_set_files.empty()) {
  if (!kernel_cmake_asm_set_files.empty() && (target == kARM32A || target == kARM64)) {
    ofs << "set(ASSEMBLY_SRC\n";
    for (const std::string &asm_file : kernel_cmake_asm_set_files) {
      ofs << "    " << asm_file << ".o\n";
--- a/mindspore/lite/micro/coder/generator/component/common_component.cc
+++ b/mindspore/lite/micro/coder/generator/component/common_component.cc
@@ -26,7 +26,7 @@ namespace mindspore::lite::micro {
 void CodeSourceFileInclude(std::ofstream &ofs, const std::string &weight_file, const std::string &header) {
  ofs << g_hwLicense << "#include \"microtensor.h\"\n"
      << "#include \"" << weight_file << "\"\n"
      << "#include \"" << header << "\"\n";
      << "#include \"" << header << "\"\n\n";
 }

 void CodeInputAndOutputState(std::ofstream &ofs, const std::string &module_name) {
@@ -53,13 +53,13 @@ void PrintMicroTensors(std::ofstream &ofs, std::vector<Tensor *> tensors, const
      MS_LOG(ERROR) << "nonexistent tensor";
      break;
    }
    ofs << "  static int dim[] = {";
    ofs << "  static int dim" << i << "[] = {";
    for (size_t j = 0; j < tensor->shape().size(); ++j) {
      ofs << tensor->shape()[j] << ", ";
    }
    ofs << "};\n"
        << "  " << name << "[" << i << "].ndim = " << tensor->shape().size() << ";\n"
        << "  " << name << "[" << i << "].dim = dim;\n"
        << "  " << name << "[" << i << "].dim = dim" << i << ";\n"
        << "  " << name << "[" << i << "].type = " << EnumMicroTensorDataType(tensor->data_type()) << ";\n"
        << "  " << name << "[" << i << "].format = " << std::to_string(tensor->format()) << ";\n"
        << "  " << name << "[" << i << "].data =" << item->second << ";\n";
@@ -69,7 +69,6 @@ void PrintMicroTensors(std::ofstream &ofs, std::vector<Tensor *> tensors, const
 void CodeInputAndOutputImplement(std::ofstream &ofs, const std::string &module_name,
                                 const std::unique_ptr<CoderContext> &ctx) {
  // input tensors
  ofs << "\n// input tensors\n";
  std::vector<Tensor *> inputs = ctx->graph_inputs();
  for (size_t i = 0; i < inputs.size(); ++i) {
    ofs << "static const unsigned char *" << ctx->input_name() + std::to_string(i) << " = 0;\n";
@@ -88,7 +87,6 @@ void CodeInputAndOutputImplement(std::ofstream &ofs, const std::string &module_n
  ofs << "  return RET_OK;\n}\n";

  // output tensors
  ofs << "\n// output tensors\n";
  std::vector<Tensor *> outputs = ctx->graph_outputs();
  size_t output_num = outputs.size();
  std::string output_name = ctx->output_name();
@@ -158,7 +156,7 @@ void CodeManageResourceState(std::ofstream &ofs, const std::string &module_name)

 void CodeInitResourceImplement(std::ofstream &ofs, const std::string &module_name,
                               const std::unique_ptr<CoderContext> &ctx) {
  ofs << "int " << module_name << "deconv_GetBufferSize() {\n"
  ofs << "int " << module_name << "_GetBufferSize() {\n"
      << "  return " << ctx->total_buffer_size() << ";\n"
      << "}\n";
  ofs << "int " << module_name << "_SetBuffer( void *buffer) {\n";
--- a/mindspore/lite/micro/coder/generator/component/const_blocks/cmake_lists.h
+++ b/mindspore/lite/micro/coder/generator/component/const_blocks/cmake_lists.h
@@ -14,10 +14,10 @@
 * limitations under the License.
 */

 #ifndef MICRO_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_CMAKE_LISTS_CODE_H_
 #ifndef MINDSPORE_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_CMAKE_LISTS_CODE_H_
 #define MICRO_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_CMAKE_LISTS_CODE_H_

 static const char bench_cmake_lists_txt[] =
 const char *bench_cmake_lists_txt =
  "cmake_minimum_required(VERSION 3.14)\n"
  "project(${PROJ_NAME})\n"
  "\n"
@@ -55,9 +55,9 @@ static const char bench_cmake_lists_txt[] =
  "link_directories(${MODEL_LIB_PATH})\n"
  "include(benchmark.cmake)\n"
  "add_executable(${PROJ_NAME}_bench ${SRC_FILES})\n"
  "target_link_libraries(${PROJ_NAME}_bench ${MODEL_LIB_NAME} -lm)\n";
  "target_link_libraries(${PROJ_NAME}_bench ${MODEL_LIB_NAME} -lm -pthread)\n";

 static const char src_cmake_lists_txt[] =
 const char *src_cmake_lists_txt =
  "cmake_minimum_required(VERSION 3.14)\n"
  "project(${PROJ_NAME})\n"
  "\n"
@@ -112,4 +112,4 @@ static const char src_cmake_lists_txt[] =
  "string(CONCAT library_name \"lib\" ${PROJ_NAME} \".a\")\n"
  "create_library()\n";

 #endif  // MICRO_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_CMAKE_LISTS_CODE_H_
 #endif  // MINDSPORE_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_CMAKE_LISTS_CODE_H_
--- a/mindspore/lite/micro/coder/generator/component/const_blocks/debug_utils.h
+++ b/mindspore/lite/micro/coder/generator/component/const_blocks/debug_utils.h
@@ -13,10 +13,10 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MICRO_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_BEN_DEBUG_UTILS_H_
 #define MICRO_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_BEN_DEBUG_UTILS_H_
 #ifndef MINDSPORE_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_DEBUG_UTILS_H_
 #define MINDSPORE_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_DEBUG_UTILS_H_

 static const char debug_utils_h[] =
 const char *debug_utils_h =
  "/**\n"
  " * Copyright 2021 Huawei Technologies Co., Ltd\n"
  " *\n"
@@ -50,7 +50,7 @@ static const char debug_utils_h[] =
  "\n"
  "#endif  // MINDSPORE_LITE_MICRO_MICRODEBUGUTIL_H_\n";

 static const char debug_utils_c[] =
 const char *debug_utils_c =
  "/**\n"
  " * Copyright 2021 Huawei Technologies Co., Ltd\n"
  " *\n"
@@ -239,7 +239,7 @@ static const char debug_utils_c[] =
  "}\n"
  "\n"
  "void PrintTensor(MicroTensor *tensor, FILE *output_file, const char *is_input) {\n"
  "  if (output_file != NULL) {\n"
  "  if (output_file == NULL) {\n"
  "    MICRO_ERROR(\"output file is NULL\");\n"
  "    return;\n"
  "  }\n"
@@ -269,4 +269,4 @@ static const char debug_utils_c[] =
  "  return retval;\n"
  "}\n";

 #endif  // MICRO_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_BEN_DEBUG_UTILS_H_
 #endif  // MINDSPORE_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_DEBUG_UTILS_H_
--- a/mindspore/lite/micro/coder/generator/component/const_blocks/license.h
+++ b/mindspore/lite/micro/coder/generator/component/const_blocks/license.h
@@ -14,12 +14,12 @@
 * limitations under the License.
 */

 #ifndef MICRO_GENERATOR_CONST_BLOCK_LICENSE_INFOS_H
 #define MICRO_GENERATOR_CONST_BLOCK_LICENSE_INFOS_H
 #ifndef MINDSPORE_LITE_MICRO_GENERATOR_CONST_BLOCK_LICENSE_INFOS_H_
 #define MINDSPORE_LITE_MICRO_GENERATOR_CONST_BLOCK_LICENSE_INFOS_H_

 namespace mindspore::lite::micro {

 const char g_hwLicense[] =
 static const char *g_hwLicense =
  "/**\n"
  " * Copyright 2021 Huawei Technologies Co., Ltd\n"
  " *\n"
@@ -37,4 +37,4 @@ const char g_hwLicense[] =
  " */\n\n";
 }  // namespace mindspore::lite::micro

 #endif  // MICRO_GENERATOR_CONST_BLOCK_LICENSE_INFOS_H
 #endif  // MINDSPORE_LITE_MICRO_GENERATOR_CONST_BLOCK_LICENSE_INFOS_H_
--- a/mindspore/lite/micro/coder/generator/component/const_blocks/load_input.h
+++ b/mindspore/lite/micro/coder/generator/component/const_blocks/load_input.h
@@ -14,9 +14,9 @@
 * limitations under the License.
 */

 #ifndef MICRO_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_BENCH_LOAD_INPUT_H_
 #define MICRO_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_BENCH_LOAD_INPUT_H_
 static const char load_input_h[] =
 #ifndef MINDSPORE_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_BENCH_LOAD_INPUT_H_
 #define MINDSPORE_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_BENCH_LOAD_INPUT_H_
 const char *load_input_h =
  "/**\n"
  " * Copyright 2021 Huawei Technologies Co., Ltd\n"
  " *\n"
@@ -43,7 +43,7 @@ static const char load_input_h[] =
  "\n"
  "#endif  // MICRO_EXAMPLE_LOAD_INPUT_LOAD_INPUT_H_\n";

 static const char load_input_c[] =
 const char *load_input_c =
  "/**\n"
  " * Copyright 2021 Huawei Technologies Co., Ltd\n"
  " *\n"
@@ -131,11 +131,11 @@ static const char load_input_c[] =
  "    int size = 0;\n"
  "    buffers[i] = ReadInputData(inputs_path[i], &size);\n"
  "    if (size != inputs_size[i] || buffers[i] == NULL) {\n"
  "      printf(\"size mismatch, %s, %d, %d\\n\", inputs_path[i], size, inputs_size[i]);\n"
  "      printf(\"size mismatch, %s, input: %d, needed: %d\\n\", inputs_path[i], size, inputs_size[i]);\n"
  "      return -1;\n"
  "    }\n"
  "  }\n"
  "  return 0;\n"
  "}\n";

 #endif  // MICRO_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_BENCH_LOAD_INPUT_H_
 #endif  // MINDSPORE_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_BENCH_LOAD_INPUT_H_
--- a/mindspore/lite/micro/coder/generator/component/const_blocks/micro_tensor.h
+++ b/mindspore/lite/micro/coder/generator/component/const_blocks/micro_tensor.h
@@ -13,10 +13,10 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MICRO_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_MICRO_TENSOR_H_
 #define MICRO_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_MICRO_TENSOR_H_
 #ifndef MINDSPORE_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_MICRO_TENSOR_H_
 #define MINDSPORE_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_MICRO_TENSOR_H_

 static const char micro_tensor_h[] =
 const char *micro_tensor_h =
  "/**\n"
  " * Copyright 2021 Huawei Technologies Co., Ltd\n"
  " *\n"
@@ -42,20 +42,8 @@ static const char micro_tensor_h[] =
  "#include <stdbool.h>\n"
  "#include <stdint.h>\n"
  "\n"
  "inline bool IsPrint() {\n"
  "  char *env = getenv(\"GLOG_v\");\n"
  "  if (env == NULL) {\n"
  "    return false;\n"
  "  }\n"
  "  return strcmp(env, \"1\") == 0;\n"
  "}\n"
  "\n"
  "#define MICRO_INFO(content, args...)                                        \\\n"
  "  {                                                                         \\\n"
  "    if (IsPrint()) {                                                        \\\n"
  "      printf(\"[INFO] %s|%d: \" #content \"\\r\\n\", __func__, __LINE__, ##args); \\\n"
  "    }                                                                       \\\n"
  "  }\n"
  "#define MICRO_INFO(content, args...) \\\n"
  "  { printf(\"[INFO] %s|%d: \" #content \"\\r\\n\", __func__, __LINE__, ##args); }\n"
  "#define MICRO_ERROR(content, args...) \\\n"
  "  { printf(\"[ERROR] %s|%d: \" #content \"\\r\\n\", __func__, __LINE__, ##args); }\n"
  "\n"
@@ -115,4 +103,4 @@ static const char micro_tensor_h[] =
  "} GraphQuantArgs;\n"
  "\n"
  "#endif  // MSMICRO_TENSOR_H\n";
 #endif  // MICRO_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_MICRO_TENSOR_H_
 #endif  // MINDSPORE_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_MICRO_TENSOR_H_
--- a/mindspore/lite/micro/coder/generator/component/const_blocks/thread_pool.h
+++ b/mindspore/lite/micro/coder/generator/component/const_blocks/thread_pool.h
@@ -0,0 +1,99 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_THREAD_POOL_H_
 #define MINDSPORE_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_THREAD_POOL_H_

 namespace mindspore::lite::micro {

 // Text of the thread_pool.h header that the generator writes next to the
 // generated network when parallel support is enabled.
 //
 // Declared `static` so that every translation unit including this header gets
 // its own internal-linkage copy. A plain `const char *` at namespace scope is a
 // non-const object with external linkage and would produce multiple-definition
 // link errors as soon as a second source file includes this header.
 static const char *thread_pool_h =
  "/**\n"
  " * Copyright 2021 Huawei Technologies Co., Ltd\n"
  " *\n"
  " * Licensed under the Apache License, Version 2.0 (the \"License\");\n"
  " * you may not use this file except in compliance with the License.\n"
  " * You may obtain a copy of the License at\n"
  " *\n"
  " * http://www.apache.org/licenses/LICENSE-2.0\n"
  " *\n"
  " * Unless required by applicable law or agreed to in writing, software\n"
  " * distributed under the License is distributed on an \"AS IS\" BASIS,\n"
  " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n"
  " * See the License for the specific language governing permissions and\n"
  " * limitations under the License.\n"
  " */\n"
  "\n"
  "#ifndef MINDSPORE_LITE_SRC_RUNTIME_THREAD_POOL_H_\n"
  "#define MINDSPORE_LITE_SRC_RUNTIME_THREAD_POOL_H_\n"
  "\n"
  "#include <stdbool.h>\n"
  "\n"
  "#define MAX_TASK_NUM (2)\n"
  "\n"
  "/// \\brief BindMode defined for holding bind cpu strategy argument.\n"
  "typedef enum {\n"
  "  NO_BIND_MODE = 0, /**< no bind */\n"
  "  HIGHER_MODE = 1,  /**< bind higher cpu first */\n"
  "  MID_MODE = 2      /**< bind middle cpu first */\n"
  "} BindMode;\n"
  "\n"
  "struct ThreadPool;\n"
  "\n"
  "struct ThreadPool *CreateThreadPool(int thread_num, int mode);\n"
  "\n"
  "/**\n"
  " *\n"
  " * @param session_index, support multi session\n"
  " * @param job\n"
  " * @param content\n"
  " * @param task_num\n"
  " */\n"
  "int ParallelLaunch(struct ThreadPool *thread_pool, int (*job)(void *, int), void *content, int task_num);\n"
  "\n"
  "/**\n"
  " * bind each thread to specified cpu core\n"
  " * @param is_bind\n"
  " * @param mode\n"
  " */\n"
  "int BindThreads(struct ThreadPool *thread_pool, bool is_bind, int mode);\n"
  "\n"
  "/**\n"
  " * activate the thread pool\n"
  " * @param thread_pool_id\n"
  " */\n"
  "void ActivateThreadPool(struct ThreadPool *thread_pool);\n"
  "\n"
  "/**\n"
  " * deactivate the thread pool\n"
  " * @param thread_pool_id\n"
  " */\n"
  "void DeactivateThreadPool(struct ThreadPool *thread_pool);\n"
  "\n"
  "/**\n"
  " *\n"
  " * @return current thread num\n"
  " */\n"
  "int GetCurrentThreadNum(struct ThreadPool *thread_pool);\n"
  "\n"
  "/**\n"
  " * destroy thread pool, and release resource\n"
  " */\n"
  "void DestroyThreadPool(struct ThreadPool *thread_pool);\n"
  "\n"
  "#endif  // MINDSPORE_LITE_SRC_RUNTIME_THREAD_POOL_H_\n";
 }  // namespace mindspore::lite::micro

 #endif  // MINDSPORE_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_THREAD_POOL_H_
--- a/mindspore/lite/micro/coder/generator/component/parallel_component.cc
+++ b/mindspore/lite/micro/coder/generator/component/parallel_component.cc
@@ -0,0 +1,61 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "coder/generator/component/parallel_component.h"
 #include <string>

 namespace mindspore::lite::micro {

 /// Streams the benchmark snippet that creates a thread pool and registers it
 /// with the generated network.
 ///
 /// The emitted C code defaults to 4 threads and NO_BIND_MODE, overrides both
 /// from argv[4] / argv[5] when at least six arguments are present, calls
 /// CreateThreadPool, hands the pool to <module_name>_SetThreadPool and returns
 /// RET_ERROR if either step fails. It refers to `argc`, `argv` and `ret`
 /// without declaring them.
 ///
 /// \param ofs output stream that receives the generated code.
 /// \param module_name prefix of the generated <module_name>_SetThreadPool call.
 void CodeCreateThreadPool(std::ofstream &ofs, const std::string &module_name) {
  // Step 1: read the thread configuration and create the pool.
  ofs << "  int thread_num = 4;\n";
  ofs << "  BindMode bind_mode = NO_BIND_MODE;\n";
  ofs << "  if (argc >= 6) {\n";
  ofs << "    thread_num = atoi(argv[4]);\n";
  ofs << "    bind_mode = atoi(argv[5]);\n";
  ofs << "  }\n";
  ofs << "  struct ThreadPool *thread_pool = CreateThreadPool(thread_num, bind_mode);\n";
  ofs << "  if (thread_pool == NULL) {\n";
  ofs << "    MICRO_ERROR(\"create thread pool failed\");\n";
  ofs << "    return RET_ERROR;\n";
  ofs << "  }\n";

  // Step 2: register the pool with the generated module.
  const std::string register_call = module_name + "_SetThreadPool(thread_pool);\n";
  ofs << "  ret = " << register_call;
  ofs << "  if (ret != RET_OK) {\n";
  ofs << "    MICRO_ERROR(\"set global thread pool failed\");\n";
  ofs << "    return RET_ERROR;\n";
  ofs << "  }\n";

  // Step 3: report the effective configuration.
  ofs << "  MICRO_INFO(\"config: ThreadNum: %d, BindMode: %d\", thread_num, bind_mode);\n";
 }

 /// Streams the single generated statement that destroys the `thread_pool`
 /// variable of the benchmark.
 ///
 /// \param ofs output stream that receives the generated code.
 void CodeDestroyThreadPool(std::ofstream &ofs) {
  static const char *const kDestroyStatement = "  DestroyThreadPool(thread_pool);\n";
  ofs << kDestroyStatement;
 }

 /// Streams the commented prototype of <module_name>_SetThreadPool, followed by
 /// an empty line.
 ///
 /// \param ofs output stream that receives the generated declaration.
 /// \param module_name prefix used to build the generated function name.
 void CodeSetGlobalThreadPoolState(std::ofstream &ofs, const std::string &module_name) {
  const std::string prototype = "int " + module_name + "_SetThreadPool(struct ThreadPool *thread_pool);";
  // Comment block that precedes the prototype in the generated header.
  ofs << "/*\n";
  ofs << " * set global thread pool, which is created by user\n";
  ofs << " */\n";
  ofs << prototype << "\n\n";
 }

 /// Streams the definition of the `g_thread_pool` global and of
 /// <module_name>_SetThreadPool.
 ///
 /// The generated setter returns RET_ERROR for a NULL pool; otherwise it stores
 /// the pool in `g_thread_pool` and returns RET_OK.
 ///
 /// \param ofs output stream that receives the generated code.
 /// \param module_name prefix used to build the generated function name.
 void CodeSetGlobalThreadPoolImplement(std::ofstream &ofs, const std::string &module_name) {
  const std::string setter_name = module_name + "_SetThreadPool";
  // Global that holds the pool handed over by the user.
  ofs << "struct ThreadPool *g_thread_pool = NULL;\n";
  // Setter: opening line carries the module-specific name, the body is fixed.
  ofs << "int " << setter_name << "(struct ThreadPool *thread_pool) {\n";
  ofs << "  if (thread_pool == NULL) {\n";
  ofs << "    return RET_ERROR;\n";
  ofs << "  }\n";
  ofs << "  g_thread_pool = thread_pool;\n";
  ofs << "  return RET_OK;\n";
  ofs << "}\n";
 }
 }  // namespace mindspore::lite::micro
--- a/mindspore/lite/micro/coder/generator/component/parallel_component.h
+++ b/mindspore/lite/micro/coder/generator/component/parallel_component.h
@@ -0,0 +1,35 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_MICRO_CODER_GENERATOR_PARALLEL_COMPONENT_H_
 #define MINDSPORE_LITE_MICRO_CODER_GENERATOR_PARALLEL_COMPONENT_H_

 #include <string>
 #include <fstream>

 namespace mindspore::lite::micro {

 /// Writes the benchmark code that creates a thread pool (thread count and bind
 /// mode optionally taken from argv[4] / argv[5]) and passes it to
 /// <module_name>_SetThreadPool. The emitted code uses `argc`, `argv` and `ret`
 /// without declaring them.
 /// \param ofs output stream that receives the generated code.
 /// \param module_name prefix of the generated <module_name>_SetThreadPool call.
 void CodeCreateThreadPool(std::ofstream &ofs, const std::string &module_name);

 /// Writes the generated statement that calls DestroyThreadPool(thread_pool).
 /// \param ofs output stream that receives the generated code.
 void CodeDestroyThreadPool(std::ofstream &ofs);

 /// Writes the commented declaration of <module_name>_SetThreadPool.
 /// \param ofs output stream that receives the generated declaration.
 /// \param module_name prefix used to build the generated function name.
 void CodeSetGlobalThreadPoolState(std::ofstream &ofs, const std::string &module_name);

 /// Writes the `g_thread_pool` global and the definition of
 /// <module_name>_SetThreadPool, which rejects a NULL pool with RET_ERROR and
 /// otherwise stores it in `g_thread_pool`.
 /// \param ofs output stream that receives the generated code.
 /// \param module_name prefix used to build the generated function name.
 void CodeSetGlobalThreadPoolImplement(std::ofstream &ofs, const std::string &module_name);

 }  // namespace mindspore::lite::micro

 #endif  // MINDSPORE_LITE_MICRO_CODER_GENERATOR_PARALLEL_COMPONENT_H_
--- a/mindspore/lite/micro/coder/generator/component/weight_component.cc
+++ b/mindspore/lite/micro/coder/generator/component/weight_component.cc
@@ -74,10 +74,10 @@ void CodeModelParamsForNet(std::ofstream &hofs, std::ofstream &cofs, const std::
      continue;
    }
    if (tensor->category() == Tensor::Category::CONST_TENSOR) {
      hofs << "extern " << GetTensorDataType(tensor->data_type()) << name << " = [];\n";
      cofs << GetTensorDataType(tensor->data_type()) << name << " = [" << tensor->ElementsNum() << "];\n";
      hofs << "extern " << GetTensorDataType(tensor->data_type()) << name << "[];\n";
      cofs << GetTensorDataType(tensor->data_type()) << name << "[" << tensor->ElementsNum() << "];\n";
    } else if (tensor->category() == Tensor::Category::VAR) {
      hofs << "extern " << GetTensorDataType(tensor->data_type()) << " *" << name << ";\n";
      hofs << "extern " << GetTensorDataType(tensor->data_type()) << "*" << name << ";\n";
      cofs << GetTensorDataType(tensor->data_type()) << "*" << name << " = NULL;\n";
    }
  }
@@ -87,7 +87,6 @@ void CodeModelParamsForNet(std::ofstream &hofs, std::ofstream &cofs, const std::
 void CodeWeightInitFunc(std::ofstream &ofs, const std::string &module_name, const std::unique_ptr<CoderContext> &ctx) {
  ofs << "int " << module_name << "_Init(void *weight_buffer, int weight_size) {\n"
      << "  if (weight_buffer == NULL) {\n"
         "    MICRO_ERROR(\"weight buffer is NULL\");\n"
      << "    return RET_ERROR;\n"
      << "  }\n";

@@ -106,8 +105,9 @@ void CodeWeightInitFunc(std::ofstream &ofs, const std::string &module_name, cons
    if (tensor->category() != Tensor::Category::CONST_TENSOR) {
      continue;
    }
    auto iter = ctx->tensors_map().find(tensor);
    if (iter != ctx->tensors_map().end()) {
    std::map<Tensor *, std::string> ctx_tensor_map = ctx->tensors_map();
    auto iter = ctx_tensor_map.find(tensor);
    if (iter != ctx_tensor_map.end()) {
      origins += "    {" + name + ", " + std::to_string(tensor->Size()) + ", " + std::to_string(offset) + "},\n";
      params_num++;
    } else {
@@ -115,14 +115,14 @@ void CodeWeightInitFunc(std::ofstream &ofs, const std::string &module_name, cons
      params +=
        "  " + GetTensorDataType(data_type) + "*" + name + " = (weight_buffer + " + std::to_string(offset) + ");\n";
    }
    offset += tensor->Size();
  }
  ofs << "  struct ModelParameter model_params[] = {\n" << origins << "  };\n";
  ofs << params << "\n";
  ofs << "  struct ModelParameter model_params[] = {\n" << origins << "  };\n";

  ofs << "\n";
  ofs << "  for(int i = 0; i < " << params_num << "; ++i) {\n"
      << "    if (model_params[i].offset + model_params[i].size > weight_size) {\n"
         "      MICRO_ERROR(\"buffer is invalid, size: %d, offset: %lu\", weight_size, model_params[i].offset);\n"
         "      return RET_ERROR;\n"
         "    }\n"
      << "    memcpy(model_params[i].addr, (weight_buffer + model_params[i].offset), model_params[i].size);\n"
--- a/mindspore/lite/micro/coder/generator/generator.cc
+++ b/mindspore/lite/micro/coder/generator/generator.cc
@@ -24,8 +24,9 @@
 #include "coder/generator/component/const_blocks/cmake_lists.h"
 #include "coder/generator/component/const_blocks/debug_utils.h"
 #include "coder/generator/component/const_blocks/load_input.h"
 #include "coder/generator/component/const_blocks/thread_pool.h"
 #include "coder/generator/component/const_blocks/license.h"
 #include "micro/coder/log.h"
 #include "coder/log.h"

 namespace mindspore::lite::micro {
 int WriteContentToFile(const std::string &file, const std::string &content) {
@@ -61,11 +62,13 @@ Generator::~Generator() { (void)umask(origin_umask_); }
 void Generator::CodeNetRunFunc(std::ofstream &ofs) {
  // generate net inference code
  ofs << "void " << config_->module_name() << "_Inference() {\n";
  if (config_->code_mode() == CodeMode::Code_Inference) {
    ofs << "int thread_num = GetCurrentThreadNum(THREAD_POOL_DEFAULT);\n";
  if (config_->support_parallel()) {
    ofs << "  const int g_thread_num = GetCurrentThreadNum(g_thread_pool);\n";
  } else {
    ofs << "  const int g_thread_num = 1;\n";
  }
  for (const auto &block : ctx_->code_blocks()) {
    ofs << "\t{\n" << block << "\t}\n";
    ofs << "  {\n" << block << "  }\n";
  }
  ofs << "}\n";
 }
@@ -98,7 +101,7 @@ int Generator::CodeSourceCMakeFile() {
 }

 int Generator::CodeStaticContent() {
  const std::vector<std::pair<std::string, std::string>> static_blocks = {
  std::vector<std::pair<std::string, std::string>> static_blocks = {
    {net_inc_file_path_ + "microtensor.h", micro_tensor_h},
    {net_src_file_path_ + "CMakeLists.txt", src_cmake_lists_txt},
    {net_main_file_path_ + "debug_utils.h", debug_utils_h},
@@ -106,12 +109,13 @@ int Generator::CodeStaticContent() {
    {net_main_file_path_ + "load_input.h", load_input_h},
    {net_main_file_path_ + "load_input.c", load_input_c},
    {net_main_file_path_ + "CMakeLists.txt", bench_cmake_lists_txt}};
  if (config_->support_parallel()) {
    static_blocks.emplace_back(net_inc_file_path_ + "thread_pool.h", thread_pool_h);
  }
  for (const auto &static_block : static_blocks) {
    std::string file_name = static_block.first;
    std::string content = static_block.second;
    if (WriteContentToFile(file_name, content) != RET_OK) {
      return RET_ERROR;
    }
    MS_CHECK_RET_CODE(WriteContentToFile(file_name, content), "write file failed");
  }
  return RET_OK;
 }
--- a/mindspore/lite/micro/coder/generator/inference/inference_generator.cc
+++ b/mindspore/lite/micro/coder/generator/inference/inference_generator.cc
@@ -18,6 +18,7 @@
 #include <vector>
 #include <string>
 #include "coder/generator/component/common_component.h"
 #include "coder/generator/component/parallel_component.h"
 #include "coder/generator/component/benchmark_component.h"
 #include "coder/generator/component/const_blocks/license.h"

@@ -28,14 +29,17 @@ int InferenceGenerator::CodeNetHFile() {
  MS_CHECK_TRUE(!ofs.bad(), "filed to open file");
  MS_LOG(INFO) << "write " << net_include_file;
  ofs << g_hwLicense;
  if (config_->code_mode() == CodeMode::Code_Inference) {
    ofs << "#include \"src/runtime/thread_pool.h\"\n";
  if (config_->support_parallel()) {
    ofs << "#include \"thread_pool.h\"\n";
  }
  ofs << "#include \"microtensor.h\"\n\n";
  CodeInputAndOutputState(ofs, config_->module_name());
  if (is_get_quant_args_) {
    CodeGraphQuantArgsState(ofs, config_->module_name());
  }
  if (config_->support_parallel()) {
    CodeSetGlobalThreadPoolState(ofs, config_->module_name());
  }
  if (config_->is_weight_file()) {
    CodeInitWeightState(ofs, config_->module_name());
  }
@@ -50,6 +54,9 @@ int InferenceGenerator::CodeNetCFile() {
  MS_CHECK_TRUE(!ofs.bad(), "filed to open file");
  MS_LOG(INFO) << "write " << net_impl_file;
  CodeSourceFileInclude(ofs, net_weight_hfile_, net_inc_hfile_);
  if (config_->support_parallel()) {
    CodeSetGlobalThreadPoolImplement(ofs, config_->module_name());
  }
  CodeInputAndOutputImplement(ofs, config_->module_name(), ctx_);
  CodeInitResourceImplement(ofs, config_->module_name(), ctx_);
  CodeFreeResourceImplement(ofs, config_->module_name(), ctx_);
@@ -78,12 +85,14 @@ int InferenceGenerator::CodeBenchmarkFile() {
  if (config_->is_weight_file()) {
    CodeBenchmarkInitWeight(ofs, config_->module_name());
  }
  if (config_->code_mode() == CodeMode::Code_Inference) {
    CodeBenchmarkConfigThread(ofs);
  if (config_->support_parallel()) {
    CodeCreateThreadPool(ofs, config_->module_name());
  }
  CodeBenchmarkInference(ofs, config_->module_name());
  CodeBenchmarkPrintOutputs(ofs, config_->module_name());

  if (config_->support_parallel()) {
    CodeDestroyThreadPool(ofs);
  }
  CodeBenchmarkFreeResourse(ofs, config_->module_name(), inputs_num);
  ofs.close();
  return RET_OK;
--- a/mindspore/lite/micro/coder/generator/inference/inference_generator.h
+++ b/mindspore/lite/micro/coder/generator/inference/inference_generator.h
@@ -19,7 +19,7 @@

 #include <utility>
 #include <memory>
 #include "micro/coder/generator/generator.h"
 #include "coder/generator/generator.h"

 namespace mindspore::lite::micro {
 class InferenceGenerator : public Generator {
--- a/mindspore/lite/micro/coder/generator/train/train_generator.cc
+++ b/mindspore/lite/micro/coder/generator/train/train_generator.cc
@@ -39,7 +39,7 @@ int TrainGenerator::CodeNetHFile() {
  MS_CHECK_TRUE(!ofs.bad(), "filed to open file");
  MS_LOG(INFO) << "write " << net_include_file;
  ofs << g_hwLicense;
  if (config_->code_mode() == CodeMode::Code_Inference) {
  if (config_->code_mode() == CodeMode::Inference) {
    ofs << "#include \"src/runtime/thread_pool.h\"\n";
  }
  ofs << "#include \"microtensor.h\"\n\n";
--- a/mindspore/lite/micro/coder/generator/train/train_generator.h
+++ b/mindspore/lite/micro/coder/generator/train/train_generator.h
@@ -19,7 +19,7 @@

 #include <utility>
 #include <memory>
 #include "micro/coder/generator/generator.h"
 #include "coder/generator/generator.h"

 namespace mindspore::lite::micro {
 class TrainGenerator : public Generator {
--- a/mindspore/lite/micro/coder/graph.cc
+++ b/mindspore/lite/micro/coder/graph.cc
@@ -14,7 +14,7 @@
 * limitations under the License.
 */

 #include "micro/coder/graph.h"
 #include "coder/graph.h"
 #include <queue>
 #include <deque>
 #include <string>
@@ -23,7 +23,6 @@
 #include <set>
 #include "coder/log.h"
 #include "schema/inner/model_generated.h"
 #include "src/ops/primitive_c.h"
 #include "securec/include/securec.h"

 namespace mindspore::lite::micro {
@@ -92,8 +91,15 @@ int CoderGraph::ConvertTensors() {
    if (quant_params != nullptr) {
      for (int j = 0; j < static_cast<int>(quant_params->size()); j++) {
        QuantArg quant_arg{};
        quant_arg.bitNum = quant_params->Get(j)->numBits();
        quant_arg.scale = quant_params->Get(j)->scale();
        quant_arg.zeroPoint = quant_params->Get(j)->zeroPoint();
        quant_arg.var_corr = quant_params->Get(j)->varCorr();
        quant_arg.mean_corr = quant_params->Get(j)->meanCorr();
        quant_arg.inited = quant_params->Get(j)->inited();
        quant_arg.roundType = quant_params->Get(j)->roundType();
        quant_arg.multiplier = quant_params->Get(j)->multiplier();
        quant_arg.dstDtype = quant_params->Get(j)->dstDtype();
        dstTensor->AddQuantParam(quant_arg);
      }
    }
--- a/mindspore/lite/micro/coder/opcoders/base/conv2d_base_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/base/conv2d_base_coder.cc
@@ -14,12 +14,12 @@
 * limitations under the License.
 */

 #include "micro/coder/opcoders/base/conv2d_base_coder.h"
 #include "coder/opcoders/base/conv2d_base_coder.h"
 #include <string>
 #include <vector>
 #include "nnacl/fp32/winograd_utils.h"
 #include "nnacl/int8/quantize.h"
 #include "micro/coder/log.h"
 #include "coder/log.h"

 namespace {
 int MallocConvQuantParams(ConvQuantArg *quant_arg, size_t input_arg_num, size_t filter_arg_num, size_t output_arg_num) {
@@ -37,8 +37,8 @@ int MallocConvQuantParams(ConvQuantArg *quant_arg, size_t input_arg_num, size_t
 }  // namespace

 namespace mindspore::lite::micro {
 string Conv2DBaseCoder::LayoutTransformFp32(schema::Format src_format, schema::Format dst_format) {
  string ret;
 std::string Conv2DBaseCoder::LayoutTransformFp32(schema::Format src_format, schema::Format dst_format) {
  std::string ret;
  if (src_format == schema::Format_NHWC && dst_format == schema::Format_NC4HW4) {
    ret = "PackNHWCToNC4HW4Fp32";
  } else if (src_format == schema::Format_NHWC && dst_format == schema::Format_NHWC4) {
@@ -56,8 +56,8 @@ string Conv2DBaseCoder::LayoutTransformFp32(schema::Format src_format, schema::F
  return ret;
 }

 string Conv2DBaseCoder::LayoutTransformInt8(schema::Format src_format, schema::Format dst_format) {
  string ret;
 std::string Conv2DBaseCoder::LayoutTransformInt8(schema::Format src_format, schema::Format dst_format) {
  std::string ret;
  if (src_format == schema::Format_NHWC && dst_format == schema::Format_NHWC4) {
    ret = "PackNHWCToNHWC4Int8";
  } else {
@@ -67,8 +67,8 @@ string Conv2DBaseCoder::LayoutTransformInt8(schema::Format src_format, schema::F
  return ret;
 }

 string Conv2DBaseCoder::LayoutTransform(TypeId data_type, schema::Format src_format, schema::Format dst_format) {
  string ret;
 std::string Conv2DBaseCoder::LayoutTransform(TypeId data_type, schema::Format src_format, schema::Format dst_format) {
  std::string ret;
  switch (data_type) {
    case kNumberTypeInt8:
      ret = LayoutTransformInt8(src_format, dst_format);
@@ -197,7 +197,7 @@ int Conv2DBaseCoder::SetQuantMultiplier() {
  return RET_OK;
 }

 int Conv2DBaseCoder::CheckResizeValid() {
 int Conv2DBaseCoder::CheckResizeValid() const {
  // ===============check in channel================= //
  int32_t filter_in_channel = filter_tensor_->Channel();
  int32_t resize_in_channel = input_tensor_->Channel();
@@ -206,12 +206,39 @@ int Conv2DBaseCoder::CheckResizeValid() {
  return RET_OK;
 }

 void Conv2DBaseCoder::SetRoundingAndMultipilerMode() {
  auto input_quant_arg = input_tensor_->quant_params().front();
  int round_type = input_quant_arg.roundType;
  switch (round_type) {
    case 1:
      conv_quant_arg_->round_mode_ = Rounding_Away_from_zero;
      break;
    case 2:
      conv_quant_arg_->round_mode_ = Rounding_Up;
      break;
    default:
      conv_quant_arg_->round_mode_ = Rounding_No;
  }
  int cal_multiplier_type = input_quant_arg.multiplier;
  switch (cal_multiplier_type) {
    case 0:
      conv_quant_arg_->quant_multiplier_mode_ = Method_SinglePrecision;
      break;
    case 1:
      conv_quant_arg_->quant_multiplier_mode_ = Method_DoublePrecision;
      break;
    default:
      conv_quant_arg_->quant_multiplier_mode_ = Method_No;
  }
 }

 int Conv2DBaseCoder::SetQuantParam() {
  MS_CHECK_RET_CODE(MallocQuantParam(), "Malloc quant param failed.");
  MS_CHECK_RET_CODE(SetInputTensorQuantParam(), "Set Input Tensor Quant Param Failed.");
  MS_CHECK_RET_CODE(SetFilterTensorQuantParam(), "Set Filter Tensor Quant Param Failed.");
  MS_CHECK_RET_CODE(SetOutputTensorQuantParam(), "Set Output Tensor Quant Param Failed.");
  MS_CHECK_RET_CODE(SetIfPerChannel(), "Set if per tensor channel failed.");
  SetRoundingAndMultipilerMode();
  MS_CHECK_RET_CODE(SetQuantMultiplier(), "Set Quant Multiplier Failed.");
  // now only consider per tensor for output
  MS_CHECK_PTR(conv_param_->conv_quant_arg_.out_act_min_);
--- a/mindspore/lite/micro/coder/opcoders/base/conv2d_base_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/base/conv2d_base_coder.h
@@ -21,13 +21,11 @@
 #include <vector>
 #include <utility>
 #include <memory>
 #include "micro/coder/opcoders/op_coder.h"
 #include "coder/opcoders/op_coder.h"
 #include "src/runtime/kernel/arm/base/layout_transform.h"
 #include "nnacl/conv_parameter.h"
 namespace mindspore::lite::micro {

 using std::string;

 class Conv2DBaseCoder : public OperatorCoder {
 public:
  Conv2DBaseCoder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
@@ -47,10 +45,14 @@ class Conv2DBaseCoder : public OperatorCoder {
    free(conv_quant_arg_->input_quant_args_);
    free(conv_quant_arg_->filter_quant_args_);
    free(conv_quant_arg_->output_quant_args_);
    conv_param_ = nullptr;
    conv_quant_arg_ = nullptr;
    filter_tensor_ = nullptr;
    bias_tensor_ = nullptr;
  }

 protected:
  int Init();
  virtual int Init();

  int SetQuantParam();

@@ -62,19 +64,21 @@ class Conv2DBaseCoder : public OperatorCoder {

  int SetOutputTensorQuantParam();

  void SetRoundingAndMultipilerMode();

  int SetQuantMultiplier();

  int CheckResizeValid();
  int CheckResizeValid() const;

  int SetIfPerChannel();

  int CheckLayout(lite::Tensor *input_tensor);

  string LayoutTransformFp32(schema::Format src_format, schema::Format dst_format);
  std::string LayoutTransformFp32(schema::Format src_format, schema::Format dst_format);

  string LayoutTransformInt8(schema::Format src_format, schema::Format dst_format);
  std::string LayoutTransformInt8(schema::Format src_format, schema::Format dst_format);

  string LayoutTransform(TypeId data_type, schema::Format src_format, schema::Format dst_format);
  std::string LayoutTransform(TypeId data_type, schema::Format src_format, schema::Format dst_format);

  ConvParameter *conv_param_{nullptr};

@@ -84,7 +88,7 @@ class Conv2DBaseCoder : public OperatorCoder {

  Tensor *bias_tensor_{nullptr};

  string convert_func_;
  std::string convert_func_;
 };
 }  // namespace mindspore::lite::micro
 #endif  // MINDSPORE_LITE_MICRO_CODER_OPCODERS_BASE_CONV2D_BASE_CODER_H_
--- a/mindspore/lite/micro/coder/opcoders/base/detection_post_process_base_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/base/detection_post_process_base_coder.cc
@@ -0,0 +1,153 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "coder/opcoders/base/detection_post_process_base_coder.h"

 #include "nnacl/int8/quant_dtype_cast_int8.h"

 #include "coder/opcoders/file_collector.h"
 #include "coder/log.h"
 #include "include/errorcode.h"

 namespace mindspore::lite::micro {

 // Prepares the coder for code generation:
 //   1. binds params_ to parameter_ and clears every buffer pointer it carries;
 //   2. converts the anchor tensor (input index 2) into a float buffer stored in params_->anchors_,
 //      dequantizing it when the tensor is int8 / uint8;
 //   3. allocates the work buffers (AllocateBuffer) and the input buffers (MallocInputsBuffer).
 // Returns RET_OK on success and RET_ERROR for an unsupported anchor data type or a quantized anchor
 // tensor without quantization parameters; pointer checks go through MS_CHECK_PTR.
 int DetectionPostProcessBaseCoder::Prepare(CoderContext *const context) {
  MS_CHECK_PTR(parameter_);
  params_ = reinterpret_cast<DetectionPostProcessParameter *>(parameter_);
  params_->anchors_ = nullptr;
  params_->decoded_boxes_ = nullptr;
  params_->nms_candidate_ = nullptr;
  params_->indexes_ = nullptr;
  params_->scores_ = nullptr;
  params_->all_class_indexes_ = nullptr;
  params_->all_class_scores_ = nullptr;
  params_->single_class_indexes_ = nullptr;
  params_->selected_ = nullptr;

  Tensor *anchor_tensor = input_tensors_.at(2);
  MS_CHECK_PTR(anchor_tensor);
  if (anchor_tensor->data_type() == kNumberTypeInt8) {
    // Reject a quantized tensor without quant params up front instead of indexing an empty container.
    if (anchor_tensor->quant_params().empty()) {
      MS_LOG(ERROR) << "int8 anchor tensor has no quant params";
      return RET_ERROR;
    }
    QuantArg quant_param = anchor_tensor->quant_params().at(0);
    auto anchor_int8 = reinterpret_cast<int8_t *>(anchor_tensor->data_c());
    MS_CHECK_PTR(anchor_int8);
    auto anchor_fp32 = static_cast<float *>(
      allocator_->Malloc(kNumberTypeFloat, anchor_tensor->ElementsNum() * sizeof(float), kOfflinePackWeight));
    MS_CHECK_PTR(anchor_fp32);
    DoDequantizeInt8ToFp32(anchor_int8, anchor_fp32, quant_param.scale, quant_param.zeroPoint,
                           anchor_tensor->ElementsNum());
    params_->anchors_ = anchor_fp32;
  } else if (anchor_tensor->data_type() == kNumberTypeUInt8) {
    // front() on an empty vector is undefined behaviour, so check first.
    if (anchor_tensor->quant_params().empty()) {
      MS_LOG(ERROR) << "uint8 anchor tensor has no quant params";
      return RET_ERROR;
    }
    QuantArg quant_param = anchor_tensor->quant_params().front();
    auto anchor_uint8 = reinterpret_cast<uint8_t *>(anchor_tensor->data_c());
    MS_CHECK_PTR(anchor_uint8);
    auto anchor_fp32 = static_cast<float *>(
      allocator_->Malloc(kNumberTypeFloat, anchor_tensor->ElementsNum() * sizeof(float), kOfflinePackWeight));
    MS_CHECK_PTR(anchor_fp32);
    DoDequantizeUInt8ToFp32(anchor_uint8, anchor_fp32, quant_param.scale, quant_param.zeroPoint,
                            anchor_tensor->ElementsNum());
    params_->anchors_ = anchor_fp32;
  } else if (anchor_tensor->data_type() == kNumberTypeFloat32 || anchor_tensor->data_type() == kNumberTypeFloat) {
    // The source data must exist before it is copied, same as in the quantized branches.
    MS_CHECK_PTR(anchor_tensor->data_c());
    params_->anchors_ = static_cast<float *>(
      allocator_->Malloc(kNumberTypeFloat, anchor_tensor->ElementsNum() * sizeof(float), kOfflinePackWeight));
    MS_CHECK_PTR(params_->anchors_);
    memcpy(params_->anchors_, anchor_tensor->data_c(), anchor_tensor->Size());
  } else {
    MS_LOG(ERROR) << "unsupported anchor data type " << anchor_tensor->data_type();
    return RET_ERROR;
  }
  MS_CHECK_RET_CODE(AllocateBuffer(), "AllocateBuffer failed");
  MS_CHECK_RET_CODE(MallocInputsBuffer(), "malloc inputs buffer failed");
  return RET_OK;
 }

 // Reads the box count (dimension 1 of input 0) and the class count including background (dimension 2 of
 // input 1), then allocates the workspace buffers referenced by params_. The score / index buffers are
 // sized differently depending on params_->use_regular_nms_. Every allocation is validated with
 // MS_CHECK_PTR; RET_OK is returned once all of them succeeded.
 int DetectionPostProcessBaseCoder::AllocateBuffer() {
  MS_CHECK_PTR(input_tensors_.at(0));
  MS_CHECK_PTR(input_tensors_.at(1));
  num_boxes_ = input_tensors_.at(0)->shape().at(1);
  num_classes_with_bg_ = input_tensors_.at(1)->shape().at(2);

  // Buffers needed by both NMS variants.
  params_->decoded_boxes_ = allocator_->Malloc(kNumberTypeFloat, num_boxes_ * 4 * sizeof(float), kWorkspace);
  MS_CHECK_PTR(params_->decoded_boxes_);
  params_->nms_candidate_ = allocator_->Malloc(kNumberTypeUInt8, num_boxes_ * sizeof(uint8_t), kWorkspace);
  MS_CHECK_PTR(params_->nms_candidate_);
  params_->selected_ = allocator_->Malloc(kNumberTypeInt, num_boxes_ * sizeof(int), kWorkspace);
  MS_CHECK_PTR(params_->selected_);
  params_->single_class_indexes_ = allocator_->Malloc(kNumberTypeInt, num_boxes_ * sizeof(int), kWorkspace);
  MS_CHECK_PTR(params_->single_class_indexes_);

  if (params_->use_regular_nms_) {
    // All four regular-NMS buffers hold (num_boxes_ + max_detections_) elements.
    const auto regular_count = num_boxes_ + params_->max_detections_;
    params_->scores_ = allocator_->Malloc(kNumberTypeFloat, regular_count * sizeof(float), kWorkspace);
    MS_CHECK_PTR(params_->scores_);
    params_->indexes_ = allocator_->Malloc(kNumberTypeInt, regular_count * sizeof(int), kWorkspace);
    MS_CHECK_PTR(params_->indexes_);
    params_->all_class_scores_ = allocator_->Malloc(kNumberTypeFloat, regular_count * sizeof(float), kWorkspace);
    MS_CHECK_PTR(params_->all_class_scores_);
    params_->all_class_indexes_ = allocator_->Malloc(kNumberTypeInt, regular_count * sizeof(int), kWorkspace);
    MS_CHECK_PTR(params_->all_class_indexes_);
  } else {
    params_->scores_ = allocator_->Malloc(kNumberTypeFloat, num_boxes_ * sizeof(float), kWorkspace);
    MS_CHECK_PTR(params_->scores_);
    // The index buffer is sized in ints, so it is tagged kNumberTypeInt like every other index buffer
    // in this function (it was tagged kNumberTypeFloat before).
    params_->indexes_ =
      allocator_->Malloc(kNumberTypeInt, num_boxes_ * params_->num_classes_ * sizeof(int), kWorkspace);
    MS_CHECK_PTR(params_->indexes_);
  }
  return RET_OK;
 }

 // Generates the C code for the detection post-process node and appends it to the coder context.
 // Emission order: input loading (subclass hook), the parameter struct, DecodeBoxes, then either the
 // regular NMS call or the fast NMS pair, selected by params_->use_regular_nms_.
 int DetectionPostProcessBaseCoder::DoCode(CoderContext *const context) {
  // Headers and C sources the emitted calls refer to.
  Collect(context, {"nnacl/detection_post_process_parameter.h", "wrapper/base/detection_post_process_base_wrapper.h"},
          {"detection_post_process_fp32.c", "detection_post_process_base_wrapper.c"});

  Serializer code;
  // Subclass hook; its implementation is not visible in this file.
  MS_CHECK_RET_CODE(GetInputData(context, &code), "GetInputData failed");

  // The node's four outputs, in output-tensor order.
  Tensor *boxes_out = output_tensors_.at(0);
  Tensor *classes_out = output_tensors_.at(1);
  Tensor *scores_out = output_tensors_.at(2);
  Tensor *count_out = output_tensors_.at(3);

  // Emit the parameter struct under the name "params"; the calls below refer to it as "&params".
  code.CodeBaseStruct("DetectionPostProcessParameter", "params", params_->op_parameter_, params_->h_scale_,
                      params_->w_scale_, params_->x_scale_, params_->y_scale_, params_->nms_iou_threshold_,
                      params_->nms_score_threshold_, params_->max_detections_, params_->detections_per_class_,
                      params_->max_classes_per_detection_, params_->num_classes_, params_->use_regular_nms_,
                      params_->out_quantized_, params_->anchors_, params_->decoded_boxes_, params_->nms_candidate_,
                      params_->indexes_, params_->scores_, params_->all_class_indexes_, params_->all_class_scores_,
                      params_->single_class_indexes_, params_->selected_);

  code.CodeFunction("DecodeBoxes", num_boxes_, input_boxes_, params_->anchors_, "&params");

  if (!params_->use_regular_nms_) {
    // Fast NMS is emitted as a single task on a single thread.
    int single_task_id = 0;
    int single_thread_num = 1;
    code.CodeFunction("NmsMultiClassesFastCore", num_boxes_, num_classes_with_bg_, input_scores_, "PartialArgSort",
                      "&params", single_task_id, single_thread_num);

    code.CodeFunction("DetectionPostProcessFast", num_boxes_, num_classes_with_bg_, input_scores_,
                      "(float *)(params.decoded_boxes_)", boxes_out, classes_out, scores_out, count_out,
                      "PartialArgSort", "&params");
  } else {
    code.CodeFunction("DetectionPostProcessRegular", num_boxes_, num_classes_with_bg_, input_scores_, boxes_out,
                      classes_out, scores_out, count_out, "PartialArgSort", "&params");
  }

  context->AppendCode(code.str());

  return RET_OK;
 }

 }  // namespace mindspore::lite::micro
--- a/mindspore/lite/micro/coder/opcoders/base/detection_post_process_base_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/base/detection_post_process_base_coder.h
@@ -0,0 +1,54 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_MICRO_CODER_OPCODERS_BASE_DETECTION_POST_PROCESS_BASE_CODER_H_
 #define MINDSPORE_LITE_MICRO_CODER_OPCODERS_BASE_DETECTION_POST_PROCESS_BASE_CODER_H_

 #include <string>
 #include <vector>
 #include <utility>
 #include <memory>
 #include "coder/opcoders/op_coder.h"
 #include "nnacl/detection_post_process_parameter.h"
 #include "coder/opcoders/serializers/serializer.h"

 namespace mindspore::lite::micro {

 // Base coder for the detection post-process operator. It converts the anchor tensor to float,
 // allocates the work buffers and emits the post-process calls; subclasses supply the two pure
 // virtual hooks that provide the input buffers.
 class DetectionPostProcessBaseCoder : public OperatorCoder {
 public:
  DetectionPostProcessBaseCoder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
                                const Model::Node *node, size_t node_index, Target target)
      : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {}

  ~DetectionPostProcessBaseCoder() override = default;

  // Binds params_, prepares the float anchors and allocates the work / input buffers.
  int Prepare(CoderContext *const context) override;

  // Emits the parameter struct and the post-process function calls into the context.
  int DoCode(CoderContext *const context) override;

 protected:
  // Allocates the workspace buffers referenced by params_; called from Prepare().
  int AllocateBuffer();
  // Subclass hook called at the start of DoCode() with the serializer that collects the emitted code.
  virtual int GetInputData(CoderContext *const context, Serializer *const coder) = 0;
  // Subclass hook called at the end of Prepare().
  // NOTE(review): presumably sets input_boxes_ / input_scores_ — confirm in the subclasses.
  virtual int MallocInputsBuffer() = 0;

  // Box count, read from dimension 1 of input 0 in AllocateBuffer().
  int num_boxes_{0};
  // Class count including background, read from dimension 2 of input 1 in AllocateBuffer().
  int num_classes_with_bg_{0};
  // Buffers passed to the emitted DecodeBoxes / NMS calls in DoCode().
  float *input_boxes_{nullptr};
  float *input_scores_{nullptr};
  // View of parameter_ as the detection post-process parameter struct; set in Prepare().
  DetectionPostProcessParameter *params_{nullptr};
 };
 }  // namespace mindspore::lite::micro
 #endif  // MINDSPORE_LITE_MICRO_CODER_OPCODERS_BASE_DETECTION_POST_PROCESS_BASE_CODER_H_
--- a/mindspore/lite/micro/coder/opcoders/base/dtype_cast_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/base/dtype_cast_coder.cc
@@ -15,7 +15,7 @@
 */

 #include <string>
 #include "micro/coder/opcoders/op_coder.h"
 #include "coder/opcoders/op_coder.h"
 #include "micro/coder/opcoders/file_collector.h"
 #include "micro/coder/opcoders/base/dtype_cast_coder.h"
 #include "micro/coder/opcoders/serializers/serializer.h"
--- a/mindspore/lite/micro/coder/opcoders/base/dtype_cast_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/base/dtype_cast_coder.h
@@ -19,7 +19,7 @@

 #include <vector>
 #include <memory>
 #include "micro/coder/opcoders/op_coder.h"
 #include "coder/opcoders/op_coder.h"
 #include "nnacl/int8/quant_dtype_cast_int8.h"

 namespace mindspore::lite::micro {
--- a/mindspore/lite/micro/coder/opcoders/base/full_connection_base_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/base/full_connection_base_coder.cc
@@ -14,10 +14,14 @@
 * limitations under the License.
 */

 #include "micro/coder/opcoders/base/full_connection_base_coder.h"
 #include "coder/opcoders/base/full_connection_base_coder.h"

 namespace mindspore::lite::micro {
 FullConnectionBaseCoder::~FullConnectionBaseCoder() { fc_param_ = nullptr; }
 FullConnectionBaseCoder::~FullConnectionBaseCoder() {
  fc_param_ = nullptr;
  filter_tensor_ = nullptr;
  bias_tensor_ = nullptr;
 }

 int FullConnectionBaseCoder::Init() {
  this->fc_param_ = reinterpret_cast<MatMulParameter *>(parameter_);
--- a/mindspore/lite/micro/coder/opcoders/base/full_connection_base_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/base/full_connection_base_coder.h
@@ -18,7 +18,7 @@
 #define MINDSPORE_LITE_MICRO_CODER_OPCODERS_BASE_FULLY_CONNECTED_BASE_CODER_H_

 #include <vector>
 #include "micro/coder/opcoders/op_coder.h"
 #include "coder/opcoders/op_coder.h"
 #include "nnacl/matmul_parameter.h"

 namespace mindspore::lite::micro {
@@ -29,7 +29,8 @@ class FullConnectionBaseCoder : public OperatorCoder {
      : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {}

  ~FullConnectionBaseCoder() override;
  int Init();

  virtual int Init();

 protected:
  MatMulParameter *fc_param_{nullptr};
--- a/mindspore/lite/micro/coder/opcoders/base/quant_dtype_cast_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/base/quant_dtype_cast_coder.cc
@@ -14,61 +14,72 @@
 * limitations under the License.
 */

 #include <string>
 #include "micro/coder/opcoders/op_coder.h"
 #include "micro/coder/opcoders/file_collector.h"
 #include "micro/coder/opcoders/base/quant_dtype_cast_coder.h"
 #include "micro/coder/opcoders/serializers/serializer.h"
 #include "coder/opcoders/op_coder.h"
 #include "coder/opcoders/file_collector.h"
 #include "coder/opcoders/base/quant_dtype_cast_coder.h"
 #include "coder/opcoders/serializers/serializer.h"
 #include "coder/utils/type_cast.h"

 using mindspore::schema::PrimitiveType_QuantDTypeCast;

 namespace mindspore::lite::micro {

 int QuantDTypeCastCoder::Prepare(CoderContext *const context) {
  this->cast_param_ = reinterpret_cast<QuantDTypeCastParameter *>(parameter_);

  if (cast_param_->srcT == kNumberTypeFloat32 && cast_param_->dstT == kNumberTypeInt8) {
    if (input_tensor_->data_type() != kNumberTypeFloat32 || output_tensor_->data_type() != kNumberTypeInt8) {
      MS_LOG(ERROR) << "cast_param_ data type and tensor data type do not match.";
      return RET_ERROR;
    }
    inverse_ = false;
  } else if (cast_param_->srcT == kNumberTypeInt8 && cast_param_->dstT == kNumberTypeFloat32) {
    if (input_tensor_->data_type() != kNumberTypeInt8 || output_tensor_->data_type() != kNumberTypeFloat32) {
      MS_LOG(ERROR) << "cast_param_ data type and tensor data type do not match.";
      return RET_ERROR;
    }
    inverse_ = true;
  } else {
    MS_LOG(ERROR) << "cast_param_ data type not supported:"
                  << " src: " << cast_param_->srcT << " dst: " << cast_param_->dstT;
    return RET_PARAM_INVALID;
  auto *param = reinterpret_cast<QuantDTypeCastParameter *>(parameter_);
  if (input_tensor_->data_type() != static_cast<TypeId>(param->srcT) ||
      output_tensor_->data_type() != static_cast<TypeId>(param->dstT)) {
    MS_LOG(ERROR) << "param data type not supported:"
                  << " src: " << param->srcT << " dst: " << param->dstT;
    return RET_ERROR;
  }
  src_dtype = static_cast<TypeId>(param->srcT);
  dst_dtype = static_cast<TypeId>(param->dstT);
  return RET_OK;
 }

 int QuantDTypeCastCoder::DoCode(CoderContext *const context) {
  // get quant params
  QuantArg in_quant_arg = input_tensor_->quant_params().at(0);

  // single thread for now
  if (input_tensor_->quant_params().empty() && output_tensor_->quant_params().empty()) {
    MS_LOG(ERROR) << "QuantDTypeCast need quantization parameters which is not found.";
    return RET_ERROR;
  }
  auto quant_arg = (!output_tensor_->quant_params().empty() && output_tensor_->quant_params().at(0).inited)
                     ? output_tensor_->quant_params().at(0)
                     : input_tensor_->quant_params().at(0);
  int num_unit_thread = input_tensor_->ElementsNum();

  // generate code .h .c
  Collect(context, {"nnacl/int8/quant_dtype_cast_int8.h"}, {"quant_dtype_cast_int8.c"});

  Serializer code;
  code.precision(kPrecision);
  std::string function = inverse_ ? "DoDequantizeInt8ToFp32" : "DoQuantizeFp32ToInt8";
  code.CodeFunction(function, input_tensor_, output_tensor_, in_quant_arg.scale, in_quant_arg.zeroPoint,
                    num_unit_thread);

  if (src_dtype == TypeId::kNumberTypeInt8 && dst_dtype == TypeId::kNumberTypeFloat32) {
    code.CodeFunction("DoDequantizeInt8ToFp32", input_tensor_, output_tensor_, quant_arg.scale, quant_arg.zeroPoint,
                      num_unit_thread);
  } else if (src_dtype == TypeId::kNumberTypeFloat32 && dst_dtype == TypeId::kNumberTypeInt8) {
    bool from_uint8_src = false;
    if (quant_arg.dstDtype == TypeId::kNumberTypeUInt8) {
      from_uint8_src = true;
    }
    code.CodeFunction("DoQuantizeFp32ToInt8", input_tensor_, output_tensor_, quant_arg.scale, quant_arg.zeroPoint,
                      num_unit_thread, from_uint8_src);
  } else if (src_dtype == TypeId::kNumberTypeInt8 && dst_dtype == TypeId::kNumberTypeUInt8) {
    code.CodeFunction("Int8ToUInt8", input_tensor_, output_tensor_, num_unit_thread);
  } else if (src_dtype == TypeId::kNumberTypeUInt8 && dst_dtype == TypeId::kNumberTypeFloat32) {
    code.CodeFunction("DoDequantizeUInt8ToFp32", input_tensor_, output_tensor_, quant_arg.scale, quant_arg.zeroPoint,
                      num_unit_thread);
  } else if (src_dtype == TypeId::kNumberTypeFloat32 && dst_dtype == TypeId::kNumberTypeUInt8) {
    code.CodeFunction("DoQuantizeFp32ToUInt8", input_tensor_, output_tensor_, quant_arg.scale, quant_arg.zeroPoint,
                      num_unit_thread);
  } else if (src_dtype == TypeId::kNumberTypeUInt8 && dst_dtype == TypeId::kNumberTypeInt8) {
    code.CodeFunction("UInt8ToInt8", input_tensor_, output_tensor_, num_unit_thread);
  } else {
    MS_LOG(INFO) << "unsupported type cast, src: " << EnumNameDataType(src_dtype)
                 << ", dst: " << EnumNameDataType(dst_dtype);
    return RET_ERROR;
  }
  context->AppendCode(code.str());

  return RET_OK;
 }

 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_QuantDTypeCast,
                   CPUOpCoderCreator<QuantDTypeCastCoder>)
 REG_OPERATOR_CODER(kAllTargets, kNumberTypeInt8, PrimitiveType_QuantDTypeCast, CPUOpCoderCreator<QuantDTypeCastCoder>)
 REG_OPERATOR_CODER(kAllTargets, kNumberTypeUInt8, PrimitiveType_QuantDTypeCast, CPUOpCoderCreator<QuantDTypeCastCoder>)
 }  // namespace mindspore::lite::micro
--- a/mindspore/lite/micro/coder/opcoders/base/quant_dtype_cast_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/base/quant_dtype_cast_coder.h
@@ -19,7 +19,7 @@

 #include <vector>
 #include <memory>
 #include "micro/coder/opcoders/op_coder.h"
 #include "coder/opcoders/op_coder.h"
 #include "nnacl/int8/quant_dtype_cast_int8.h"

 namespace mindspore::lite::micro {
@@ -36,10 +36,8 @@ class QuantDTypeCastCoder final : public OperatorCoder {
  int DoCode(CoderContext *const context) override;

 private:
  QuantDTypeCastParameter *cast_param_{nullptr};
  std::vector<Tensor *> inputs_;
  std::vector<Tensor *> outputs_;
  bool inverse_{false};
  TypeId src_dtype{kTypeUnknown};
  TypeId dst_dtype{kTypeUnknown};
  int thread_num_{0};
  int thread_n_num_{0};
  int thread_n_stride_{0};
--- a/mindspore/lite/micro/coder/opcoders/base/reduce_base_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/base/reduce_base_coder.cc
@@ -14,16 +14,16 @@
 * limitations under the License.
 */

 #include "micro/coder/opcoders/base/reduce_base_coder.h"
 #include "coder/opcoders/base/reduce_base_coder.h"
 #include <vector>
 #include "micro/coder/opcoders/op_coder.h"
 #include "coder/opcoders/op_coder.h"

 namespace mindspore::lite::micro {
 namespace {
 constexpr size_t kInputNum = 1;
 constexpr size_t kOutputNum = 1;
 }  // namespace
 int ReduceBaseCoder::CheckInputsOutputs() {
 int ReduceBaseCoder::CheckInputsOutputs() const {
  if (input_tensors_.size() < kInputNum) {
    MS_LOG(ERROR) << "Reduce inputs size should be at least " << kInputNum << " but got " << input_tensors_.size();
    return RET_ERROR;
--- a/mindspore/lite/micro/coder/opcoders/base/reduce_base_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/base/reduce_base_coder.h
@@ -19,7 +19,7 @@

 #include <vector>
 #include <memory>
 #include "micro/coder/opcoders/op_coder.h"
 #include "coder/opcoders/op_coder.h"
 #include "nnacl/reduce_parameter.h"

 namespace mindspore::lite::micro {
@@ -31,11 +31,10 @@ class ReduceBaseCoder : public OperatorCoder {

  ~ReduceBaseCoder() override = default;

  int Init();
  virtual int ReSize();
  virtual int Init();

 private:
  int CheckInputsOutputs();
  int CheckInputsOutputs() const;
  int CheckParameters();

 protected:
@@ -54,6 +53,7 @@ class ReduceBaseCoder : public OperatorCoder {
  int outer_size_{0};
  int inner_size_{0};
  int axis_size_{0};
  virtual int ReSize();
 };
 }  // namespace mindspore::lite::micro
 #endif  // MINDSPORE_LITE_MICRO_CODER_OPCODERS_REDUCE_BASE_CODER_H
--- a/mindspore/lite/micro/coder/opcoders/base/resize_base_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/base/resize_base_coder.cc
@@ -0,0 +1,104 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "coder/opcoders/base/resize_base_coder.h"
 #include "coder/opcoders/op_coder.h"

 namespace mindspore::lite::micro {
 // Tensor-count constants used by the ResizeBaseCoder validation helpers in this file.
 // NOTE(review): kMaxInputNum (2) differs from the input-count limit that CheckInputsOuputs() actually
 // enforces (kQuadrupleNum, 4) — confirm which value is intended.
 constexpr int kMaxInputNum = 2;
 constexpr int kOutputNum = 1;
 constexpr int kSingleNum = 1;
 constexpr int kDoubleNum = 2;
 constexpr int kQuadrupleNum = 4;

 int ResizeBaseCoder::CheckParameters() {
  auto parameter = reinterpret_cast<ResizeParameter *>(parameter_);
  if (parameter == nullptr) {
    MS_LOG(ERROR) << "cast ResizeParameter failed.";
    return RET_NULL_PTR;
  }
  method_ = parameter->method_;
  if (method_ != static_cast<int>(schema::ResizeMethod_LINEAR) &&
      method_ != static_cast<int>(schema::ResizeMethod_NEAREST)) {
    MS_LOG(ERROR) << "Resize method should be bilinear or nearest_neighbor, but got " << method_;
    return RET_INVALID_OP_ATTR;
  }
  if (this->input_tensors_.size() == kSingleNum) {
    new_height_ = parameter->new_height_;
    if (new_height_ < 1) {
      MS_LOG(ERROR) << "Resize new_height should >= 1, but got " << new_height_;
      return RET_INVALID_OP_ATTR;
    }
    new_width_ = parameter->new_width_;
    if (new_width_ < 1) {
      MS_LOG(ERROR) << "Resize new_width should >= 1, but got " << new_width_;
      return RET_INVALID_OP_ATTR;
    }
  } else if (this->input_tensors_.size() == kDoubleNum) {
    auto out_shape = this->input_tensors_.at(1)->data_c();
    if (out_shape == nullptr) {
      MS_LOG(INFO) << "Out shape is not assigned";
      const_shape_ = false;
    } else {
      const_shape_ = true;
    }
  }
  coordinate_transform_mode_ = parameter->coordinate_transform_mode_;
  preserve_aspect_ratio_ = parameter->preserve_aspect_ratio_;
  if (preserve_aspect_ratio_) {
    MS_LOG(ERROR) << "Resize currently not support preserve_aspect_ratio true";
    return RET_ERROR;
  }
  return RET_OK;
 }

 int ResizeBaseCoder::CheckInputsOuputs() {
  if (input_tensors_.size() <= kQuadrupleNum) {
    if (std::any_of(input_tensors_.begin(), input_tensors_.end(), [](const Tensor *t) { return t == nullptr; })) {
      return RET_NULL_PTR;
    }
  } else {
    MS_LOG(ERROR) << "Resize input num should be no more than" << kMaxInputNum << ", but got " << input_tensors_.size();
    return RET_ERROR;
  }
  if (output_tensors_.size() != kOutputNum) {
    MS_LOG(ERROR) << "Resize output num should be " << kOutputNum << ", but got " << output_tensors_.size();
    return RET_ERROR;
  }
  auto output = output_tensors_.at(0);
  if (output == nullptr) {
    return RET_NULL_PTR;
  }
  return RET_OK;
 }

 int ResizeBaseCoder::Init() {
  auto ret = CheckParameters();
  if (ret != RET_OK) {
    return ret;
  }
  ret = CheckInputsOuputs();
  if (ret != RET_OK) {
    return ret;
  }
  auto input_shape = input_tensor_->shape();
  if (!input_shape.empty() && input_shape.size() != COMM_SHAPE_SIZE) {
    MS_LOG(ERROR) << "Resize op support input rank 4, got " << input_shape.size();
    return RET_ERROR;
  }
  return RET_OK;
 }
 }  // namespace mindspore::lite::micro
--- a/mindspore/lite/micro/coder/opcoders/base/resize_base_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/base/resize_base_coder.h
@@ -0,0 +1,49 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_MICRO_CODER_OPCODERS_RESIZE_BASE_CODER_H_
 #define MINDSPORE_LITE_MICRO_CODER_OPCODERS_RESIZE_BASE_CODER_H_

 #include <vector>
 #include <memory>
 #include "coder/opcoders/op_coder.h"
 #include "nnacl/resize_parameter.h"

 namespace mindspore::lite::micro {
 // Base coder for the resize operator: validates the op parameter and the tensor lists and caches
 // the resize attributes for derived coders.
 class ResizeBaseCoder : public OperatorCoder {
 public:
  ResizeBaseCoder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
                  const Model::Node *node, size_t node_index, Target target)
      : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {}

  ~ResizeBaseCoder() override = default;

  // Runs CheckParameters() and CheckInputsOuputs(), then verifies the input rank.
  // NOTE(review): the Init() of the other base coders touched in this change is virtual — confirm
  // whether this one should be as well.
  int Init();

 protected:
  // Resize method copied from the op parameter; only LINEAR and NEAREST are accepted.
  int method_{0};
  // Target size from the op parameter; only read when the node has a single input.
  int new_height_{0};
  int new_width_{0};
  // Copied verbatim from the op parameter.
  int coordinate_transform_mode_{0};
  // Copied from the op parameter; a true value is rejected by CheckParameters().
  bool preserve_aspect_ratio_{false};
  // True when the node has two inputs and the second one already carries data.
  bool const_shape_{false};

 private:
  // Validates the op attributes and fills the members above.
  int CheckParameters();
  // Validates the number and non-nullness of the input / output tensors.
  int CheckInputsOuputs();
 };
 }  // namespace mindspore::lite::micro
 #endif  // MINDSPORE_LITE_MICRO_CODER_OPCODERS_RESIZE_BASE_CODER_H_
--- a/mindspore/lite/micro/coder/opcoders/base/softmax_base_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/base/softmax_base_coder.cc
@@ -13,7 +13,7 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "micro/coder/opcoders/base/softmax_base_coder.h"
 #include "coder/opcoders/base/softmax_base_coder.h"
 #include <vector>
 #include <type_traits>

--- a/mindspore/lite/micro/coder/opcoders/base/softmax_base_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/base/softmax_base_coder.h
@@ -19,14 +19,12 @@

 #include <vector>
 #include <string>
 #include "micro/coder/opcoders/op_coder.h"
 #include "coder/opcoders/op_coder.h"
 #include "nnacl/softmax_parameter.h"
 #include "nnacl/int8/quantize.h"

 namespace mindspore::lite::micro {

 using std::string;

 class SoftmaxBaseCoder : public OperatorCoder {
 public:
  SoftmaxBaseCoder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
--- a/mindspore/lite/micro/coder/opcoders/cmsis-nn/int8/add_int8_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/cmsis-nn/int8/add_int8_coder.cc
@@ -17,13 +17,13 @@
 #include "coder/opcoders/cmsis-nn/int8/add_int8_coder.h"
 #include <algorithm>
 #include <limits>
 #include "micro/coder/opcoders/serializers/serializer.h"
 #include "coder/opcoders/serializers/serializer.h"
 #include "nnacl/arithmetic.h"
 #include "nnacl/int8/quantize.h"
 #include "coder/opcoders/file_collector.h"
 #include "coder/log.h"

 using mindspore::schema::PrimitiveType_Add;
 using mindspore::schema::PrimitiveType_AddFusion;

 namespace mindspore::lite::micro::cmsis {

@@ -85,5 +85,5 @@ int AddInt8Coder::DoCode(CoderContext *const context) {
  context->AppendCode(code.str());
  return RET_OK;
 }
 REG_OPERATOR_CODER(kARM32M, kNumberTypeInt8, PrimitiveType_Add, CPUOpCoderCreator<AddInt8Coder>)
 REG_OPERATOR_CODER(kARM32M, kNumberTypeInt8, PrimitiveType_AddFusion, CPUOpCoderCreator<AddInt8Coder>)
 }  // namespace mindspore::lite::micro::cmsis
--- a/mindspore/lite/micro/coder/opcoders/cmsis-nn/int8/conv2d_int8_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/cmsis-nn/int8/conv2d_int8_coder.cc
@@ -15,14 +15,13 @@
 */

 #include "coder/opcoders/cmsis-nn/int8/conv2d_int8_coder.h"
 #include <memory>
 #include <string>
 #include <vector>
 #include "coder/opcoders/cmsis-nn/int8/dwconv_int8_coder.h"
 #include "coder/opcoders/serializers/serializer.h"
 #include "coder/opcoders/file_collector.h"
 #include "src/common/prim_util.h"

 using mindspore::schema::PrimitiveType_Conv2D;
 using mindspore::schema::PrimitiveType_Conv2DFusion;

 namespace mindspore::lite::micro::cmsis {

@@ -40,13 +39,11 @@ int Conv2DInt8Coder::Prepare(CoderContext *const context) {
 int Conv2DInt8Coder::DoCode(CoderContext *const context) {
  Serializer code;
  code.precision(kPrecision);
  std::vector<string> h_files;
  std::vector<string> c_files;
  std::vector<std::string> h_files;
  std::vector<std::string> c_files;
  h_files.emplace_back("CMSIS/NN/Include/arm_nnfunctions.h");
  string buffer_str = "NULL";
  if (opt_ != Convolve_1x1_fast) {
    buffer_str = allocator_->GetRuntimeAddr(buffer_);
    code << "  memset(" << buffer_str << ", 0, " << buffer_size_ << ");\n";
    code.CodeFunction("memset", buffer_, 0, buffer_size_);
  }
  code.CodeArray("output_shift", output_shift_, output_ch_);
  code.CodeArray("output_mult", output_mult_, output_ch_);
@@ -57,7 +54,7 @@ int Conv2DInt8Coder::DoCode(CoderContext *const context) {
      code.CodeFunction("arm_convolve_s8", input_tensor_, input_x_, input_y_, input_ch_, input_batches_, filter_tensor_,
                        output_ch_, kernel_x_, kernel_y_, pad_x_, pad_y_, stride_x_, stride_y_, bias_tensor_,
                        output_tensor_, "output_shift", "output_mult", out_offset_, input_offset_, out_activation_min_,
                        out_activation_max_, output_x_, output_y_, buffer_str);
                        out_activation_max_, output_x_, output_y_, buffer_);
      break;
    case Convolve_1_x_n:
      c_files = {"arm_convolve_1_x_n_s8.c", "arm_nn_mat_mul_core_1x_s8.c"};
@@ -65,7 +62,7 @@ int Conv2DInt8Coder::DoCode(CoderContext *const context) {
      code.CodeFunction("arm_convolve_1_x_n_s8", input_tensor_, input_x_, input_ch_, input_batches_, filter_tensor_,
                        output_ch_, kernel_x_, pad_x_, stride_x_, bias_tensor_, output_tensor_, "output_shift",
                        "output_mult", out_offset_, input_offset_, out_activation_min_, out_activation_max_, output_x_,
                        buffer_str);
                        buffer_);
      break;
    case Convolve_1x1_fast:
      c_files = {"arm_convolve_1x1_s8_fast.c", "arm_nn_mat_mult_nt_t_s8.c", "arm_nn_mat_mul_core_4x_s8.c",
@@ -74,7 +71,7 @@ int Conv2DInt8Coder::DoCode(CoderContext *const context) {
      code.CodeFunction("arm_convolve_1x1_s8_fast", input_tensor_, input_x_, input_y_, input_ch_, input_batches_,
                        filter_tensor_, output_ch_, pad_x_, pad_y_, stride_x_, stride_y_, bias_tensor_, output_tensor_,
                        "output_shift", "output_mult", out_offset_, input_offset_, out_activation_min_,
                        out_activation_max_, output_x_, output_y_, buffer_str);
                        out_activation_max_, output_x_, output_y_, buffer_);
      break;
    default:
      MS_LOG(ERROR) << "opt enum value is not defined";
@@ -159,5 +156,20 @@ int Conv2DInt8Coder::InitTmpBuffer() {
  return RET_OK;
 }

 REG_OPERATOR_CODER(kARM32M, kNumberTypeInt8, PrimitiveType_Conv2D, CPUOpCoderCreator<Conv2DInt8Coder>)
 // Creator for the CMSIS int8 convolution coder.
 // The node pointer is validated through MS_CHECK_PTR_RET_NULL before it is dereferenced.
 // Only nodes whose primitive type is Conv2DFusion produce a Conv2DInt8Coder; any other
 // primitive type is logged as an error and nullptr is returned.
 std::unique_ptr<OperatorCoder> CmsisConv2DInt8OpCoderCreator(const std::vector<Tensor *> &in_tensors,
                                                             const std::vector<Tensor *> &out_tensors,
                                                             const Model::Node *node, size_t node_index,
                                                             Target target) {
  MS_CHECK_PTR_RET_NULL(node);
  int primitive_type = GetPrimitiveType(node->primitive_);
  if (primitive_type == schema::PrimitiveType::PrimitiveType_Conv2DFusion) {
    // The unique_ptr<Conv2DInt8Coder> converts to unique_ptr<OperatorCoder> on return.
    return std::make_unique<Conv2DInt8Coder>(in_tensors, out_tensors, node, node_index, target);
  }
  MS_LOG(ERROR) << "unmatched primitive type " << PrimitiveTypeName(primitive_type);
  return nullptr;
 }

 REG_OPERATOR_CODER(kARM32M, kNumberTypeInt8, PrimitiveType_Conv2DFusion, CPUOpCoderCreator<Conv2DInt8Coder>)
 }  // namespace mindspore::lite::micro::cmsis
--- a/mindspore/lite/micro/coder/opcoders/cmsis-nn/int8/dwconv_int8_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/cmsis-nn/int8/dwconv_int8_coder.cc
@@ -20,8 +20,6 @@
 #include "coder/opcoders/file_collector.h"
 #include "coder/log.h"

 using mindspore::schema::PrimitiveType_DepthwiseConv2D;

 namespace mindspore::lite::micro::cmsis {

 int DWConvInt8Coder::Prepare(CoderContext *const context) {
@@ -153,6 +151,4 @@ int DWConvInt8Coder::InitTmpBuffer() {
  return 0;
 }

 REG_OPERATOR_CODER(kARM32M, kNumberTypeInt8, PrimitiveType_DepthwiseConv2D, CPUOpCoderCreator<DWConvInt8Coder>)

 }  // namespace mindspore::lite::micro::cmsis
--- a/mindspore/lite/micro/coder/opcoders/cmsis-nn/int8/fullconnection_int8_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/cmsis-nn/int8/fullconnection_int8_coder.h
@@ -19,8 +19,8 @@

 #include <string>
 #include <vector>
 #include "micro/coder/opcoders/op_coder.h"
 #include "micro/coder/opcoders/base/full_connection_base_coder.h"
 #include "coder/opcoders/op_coder.h"
 #include "coder/opcoders/base/full_connection_base_coder.h"
 #include "nnacl/int8/quantize.h"
 namespace mindspore::lite::micro::cmsis {
 class FullConnectionInt8Coder final : public FullConnectionBaseCoder {
--- a/mindspore/lite/micro/coder/opcoders/cmsis-nn/int8/mul_int8_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/cmsis-nn/int8/mul_int8_coder.cc
@@ -20,7 +20,7 @@
 #include "nnacl/int8/quantize.h"
 #include "coder/opcoders/file_collector.h"

 using mindspore::schema::PrimitiveType_Mul;
 using mindspore::schema::PrimitiveType_MulFusion;

 namespace mindspore::lite::micro::cmsis {

@@ -69,5 +69,5 @@ int MulInt8Coder::DoCode(CoderContext *const context) {
  context->AppendCode(code.str());
  return RET_OK;
 }
 REG_OPERATOR_CODER(kARM32M, kNumberTypeInt8, PrimitiveType_Mul, CPUOpCoderCreator<MulInt8Coder>)
 REG_OPERATOR_CODER(kARM32M, kNumberTypeInt8, PrimitiveType_MulFusion, CPUOpCoderCreator<MulInt8Coder>)
 }  // namespace mindspore::lite::micro::cmsis
--- a/mindspore/lite/micro/coder/opcoders/cmsis-nn/int8/pooling_int8_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/cmsis-nn/int8/pooling_int8_coder.cc
@@ -20,7 +20,8 @@
 #include "coder/opcoders/serializers/serializer.h"
 #include "coder/opcoders/file_collector.h"

 using mindspore::schema::PrimitiveType_Pooling;
 using mindspore::schema::PrimitiveType_AvgPoolFusion;
 using mindspore::schema::PrimitiveType_MaxPoolFusion;

 namespace mindspore::lite::micro::cmsis {
 int PoolingInt8Coder::Prepare(CoderContext *const context) {
@@ -39,14 +40,12 @@ int PoolingInt8Coder::Prepare(CoderContext *const context) {

 int PoolingInt8Coder::DoCode(CoderContext *const context) {
  // init struct PoolingParameters
  std::string buffer_str = "NULL";
  std::string pooling_func;

  std::vector<std::string> cFiles;
  if (pooling_parameter_->pool_mode_ == PoolMode_AvgPool) {
    cFiles = {"arm_avgpool_s8.c"};
    pooling_func = "arm_avgpool_s8";
    buffer_str = allocator_->GetRuntimeAddr(buffer_);
  } else if (pooling_parameter_->pool_mode_ == PoolMode_MaxPool) {
    cFiles = {"arm_max_pool_s8.c"};
    pooling_func = "arm_max_pool_s8";
@@ -59,11 +58,9 @@ int PoolingInt8Coder::DoCode(CoderContext *const context) {
  Serializer code;
  code.precision(kPrecision);

  code.CodeFunction(pooling_func, "&nn_context", "&pool_params", "&input_dims", input_tensor_, "&filter_dims",
                    "&output_dims", output_tensor_);
  code.CodeFunction(pooling_func, dim_src_height_, dim_src_width_, dim_dst_height_, dim_dst_width_, stride_height_,
                    stride_width_, dim_kernel_height_, dim_kernel_width_, padding_height_, padding_width_, act_min_,
                    act_max_, ch_src_, input_tensor_, buffer_str, output_tensor_);
                    act_max_, ch_src_, input_tensor_, buffer_, output_tensor_);
  context->AppendCode(code.str());
  return RET_OK;
 }
@@ -97,6 +94,7 @@ int PoolingInt8Coder::SetParameters() {
  return RET_OK;
 }

 REG_OPERATOR_CODER(kARM32M, kNumberTypeInt8, PrimitiveType_Pooling, CPUOpCoderCreator<PoolingInt8Coder>)
 REG_OPERATOR_CODER(kARM32M, kNumberTypeInt8, PrimitiveType_AvgPoolFusion, CPUOpCoderCreator<PoolingInt8Coder>)
 REG_OPERATOR_CODER(kARM32M, kNumberTypeInt8, PrimitiveType_MaxPoolFusion, CPUOpCoderCreator<PoolingInt8Coder>)

 }  // namespace mindspore::lite::micro::cmsis
--- a/mindspore/lite/micro/coder/opcoders/cmsis-nn/int8/softmax_int8_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/cmsis-nn/int8/softmax_int8_coder.cc
@@ -19,7 +19,7 @@
 #include "coder/opcoders/serializers/serializer.h"
 #include "coder/opcoders/file_collector.h"

 using mindspore::schema::PrimitiveType_SoftMax;
 using mindspore::schema::PrimitiveType_Softmax;
 namespace mindspore::lite::micro::cmsis {

 int SoftMaxInt8Coder::Prepare(CoderContext *const context) {
@@ -76,6 +76,6 @@ int SoftMaxInt8Coder::DoCode(CoderContext *const context) {
  context->AppendCode(code.str());
  return RET_OK;
 }
 REG_OPERATOR_CODER(kARM32M, kNumberTypeInt8, PrimitiveType_SoftMax, CPUOpCoderCreator<SoftMaxInt8Coder>)
 REG_OPERATOR_CODER(kARM32M, kNumberTypeInt8, PrimitiveType_Softmax, CPUOpCoderCreator<SoftMaxInt8Coder>)

 }  // namespace mindspore::lite::micro::cmsis
--- a/mindspore/lite/micro/coder/opcoders/nnacl/dequant/de_quant.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/dequant/de_quant.cc
@@ -0,0 +1,143 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "coder/opcoders/nnacl/dequant/de_quant.h"
 #include <string>
 #include <vector>
 #include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"

 static constexpr int kPerTensor = 1;
 static constexpr size_t kPerBatch = 3;
 namespace mindspore::lite::micro::nnacl {

 // Stores the dequantization buffer expression wrapped in a cast to float *,
 // i.e. "(float *)(<dequant_buffer_str>)".
 void Dequant::set_de_quant_buffer_str(const std::string &dequant_buffer_str) {
  std::string casted_buffer("(float *)(");
  casted_buffer += dequant_buffer_str;
  casted_buffer += ")";
  de_quant_buffer_str_ = casted_buffer;
 }

 // Remembers the largest workspace size seen so far; smaller or equal requests leave it unchanged.
 void Dequant::DequantRecordWorkspcae(size_t curr_workspace) {
  if (curr_workspace > de_quant_max_workspace_) {
    de_quant_max_workspace_ = curr_workspace;
  }
 }

 // Returns true only when the tensor is non-null, has at least one quant parameter,
 // its first quant parameter is marked inited, and it carries data (data_c() != nullptr).
 bool Dequant::CheckDequantFlag(const Tensor *weight_tensor) {
  if (weight_tensor == nullptr) {
    return false;
  }
  if (weight_tensor->quant_params().empty()) {
    return false;
  }
  if (!weight_tensor->quant_params().front().inited) {
    return false;
  }
  return weight_tensor->data_c() != nullptr;
 }

 void Dequant::DeQuantFunctionPerChannel(const Tensor *quant_tensor, const std::vector<DeQuantArg> &de_quant_args,
                                        const std::string &de_quant_arg_base_str,
                                        NNaclFp32Serializer *const de_quant_code) {
  int quant_arg_dims = static_cast<int>(quant_tensor->quant_params().size());
  int de_quant_nums = quant_tensor->ElementsNum();
  for (int i = 0; i < quant_arg_dims; ++i) {
    auto de_quant_arg = de_quant_args.at(i);
    std::string de_quant_arg_str = de_quant_arg_base_str + std::to_string(i);
    de_quant_code->CodeStruct(de_quant_arg_str, de_quant_arg);
  }
  std::string de_quant_args_name = "de_quant_args";
  *de_quant_code << "const DeQuantArg *" << de_quant_args_name << "[" << quant_arg_dims << "] = {\n";
  for (int i = 0; i < quant_arg_dims - 1; ++i) {
    *de_quant_code << "&" << de_quant_arg_base_str << std::to_string(i) << ", ";
  }
  *de_quant_code << "&" << de_quant_arg_base_str << std::to_string(quant_arg_dims - 1);
  *de_quant_code << "};\n";
  size_t per_batch_size = quant_tensor->shape().at(0);
  std::string quant_tensor_addr_str = "(int8_t *)(" + quant_tensor_addr_ + ")";
  de_quant_code->CodeFunction("DequantDataPerChannel", quant_tensor_addr_str, de_quant_args_name, de_quant_nums,
                              per_batch_size, de_quant_buffer_str_);
 }

 void Dequant::DeQuantFunction(const Tensor *quant_tensor, const std::vector<DeQuantArg> &de_quant_args,
                              const std::string &de_quant_arg_base_str, NNaclFp32Serializer *const de_quant_code) {
  int quant_arg_dims = static_cast<int>(quant_tensor->quant_params().size());
  int de_quant_nums = quant_tensor->ElementsNum();
  for (int i = 0; i < quant_arg_dims; ++i) {
    auto de_quant_arg = de_quant_args.at(i);
    std::string de_quant_arg_str = de_quant_arg_base_str + std::to_string(i);
    de_quant_code->CodeStruct(de_quant_arg_str, de_quant_arg);
  }
  std::string de_quant_args_name = "de_quant_args";
  *de_quant_code << "const DeQuantArg *" << de_quant_args_name << "[" << quant_arg_dims << "] = {\n";
  for (int i = 0; i < quant_arg_dims - 1; ++i) {
    *de_quant_code << "&" << de_quant_arg_base_str << std::to_string(i) << ", ";
  }
  *de_quant_code << "&" << de_quant_arg_base_str << std::to_string(quant_arg_dims - 1);
  *de_quant_code << "};\n";
  auto channels = static_cast<size_t>(quant_tensor->Batch());
  std::string quant_tensor_addr_str = "(int8_t *)(" + quant_tensor_addr_ + ")";
  de_quant_code->CodeFunction("DequantData", quant_tensor_addr_str, de_quant_args_name, de_quant_nums, channels,
                              de_quant_buffer_str_);
 }

 void Dequant::DeQuantFunctionPerTensor(const Tensor *quant_tensor, const std::vector<DeQuantArg> &de_quant_args,
                                       const std::string &de_quant_arg_base_str,
                                       NNaclFp32Serializer *const de_quant_code) {
  size_t de_quant_nums = quant_tensor->ElementsNum();
  auto de_quant_arg = de_quant_args.at(0);
  std::string de_quant_arg_str = de_quant_arg_base_str + std::to_string(0);
  de_quant_code->CodeStruct(de_quant_arg_str, de_quant_arg);
  std::string de_quant_args_name = "de_quant_args";
  *de_quant_code << "const DeQuantArg *" << de_quant_args_name << "[" << 1 << "] = {\n";
  *de_quant_code << "&" << de_quant_arg_base_str << std::to_string(0);
  *de_quant_code << "};\n";
  std::string quant_tensor_addr_str = "(int8_t *)(" + quant_tensor_addr_ + ")";
  de_quant_code->CodeFunction("DequantDataPerTensor", quant_tensor_addr_str, de_quant_args_name, de_quant_nums,
                              de_quant_buffer_str_);
 }

 std::string Dequant::GetMicroDeQuantFunction(const Tensor *quant_tensor, const std::string &quant_tensor_addr) {
  std::string de_quant_block;
  if (quant_tensor == nullptr || de_quant_buffer_str_.empty()) {
    return de_quant_block;
  }
  quant_tensor_addr_ = quant_tensor_addr;
  size_t de_quant_nums = quant_tensor->ElementsNum();
  size_t quant_arg_dims = quant_tensor->quant_params().size();
  DequantRecordWorkspcae(static_cast<size_t>(de_quant_nums * sizeof(float)));
  NNaclFp32Serializer de_quant_code;
  de_quant_code << "{\n";
  size_t quant_tensor_dims = quant_tensor->shape().size();
  std::vector<DeQuantArg> de_quant_args;
  std::string de_quant_arg_base_str = "de_quant_arg_";
  for (size_t i = 0; i < quant_arg_dims; ++i) {
    auto curr_quant_param = quant_tensor->quant_params().at(i);
    DeQuantArg de_quant_arg = {
      .scale = static_cast<float>(curr_quant_param.scale),
      .zeroPoint = curr_quant_param.zeroPoint,
      .var_corr = curr_quant_param.var_corr,
      .mean_corr = curr_quant_param.mean_corr,
      // this clusters is meaningless which will be supported in future
      .clusters = {},
      .clusters_nums = static_cast<int>(curr_quant_param.clusters.size()),
      .bitNum = quant_tensor->quant_params().at(i).bitNum,
    };
    de_quant_args.emplace_back(de_quant_arg);
  }
  de_quant_code.CodeFunction("memset", de_quant_buffer_str_, 0, de_quant_nums * sizeof(float));
  if (quant_tensor_dims == kPerBatch && quant_arg_dims == static_cast<size_t>(quant_tensor->shape().at(0))) {
    DeQuantFunctionPerChannel(quant_tensor, de_quant_args, de_quant_arg_base_str, &de_quant_code);
  } else if (quant_arg_dims != kPerTensor) {
    DeQuantFunction(quant_tensor, de_quant_args, de_quant_arg_base_str, &de_quant_code);
  } else {
    DeQuantFunctionPerTensor(quant_tensor, de_quant_args, de_quant_arg_base_str, &de_quant_code);
  }
  de_quant_code << "}\n";
  de_quant_block = de_quant_code.str();
  return de_quant_block;
 }
 }  // namespace mindspore::lite::micro::nnacl
--- a/mindspore/lite/micro/coder/opcoders/nnacl/dequant/de_quant.h
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/dequant/de_quant.h
@@ -0,0 +1,63 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MICRO_LITE_MICRO_CODER_OPCODERS_NNACL_DEQUANT_DEQUANT_H_
 #define MICRO_LITE_MICRO_CODER_OPCODERS_NNACL_DEQUANT_DEQUANT_H_

 #include <string>
 #include <vector>
 #include "src/tensor.h"
 #include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
 namespace mindspore::lite::micro::nnacl {
 // Generates the code that dequantizes int8 tensor data into a float buffer.
 // The class is a singleton: construction and destruction are private, copying is deleted,
 // and the only way to obtain the object is GetInstance().
 class Dequant {
 public:
  Dequant(const Dequant &) = delete;
  Dequant &operator=(const Dequant &) = delete;

  // Returns a pointer to the single instance, held in a function-local static.
  static Dequant *GetInstance() {
    static Dequant dequant;
    return &dequant;
  }

  // Sets the expression used as the dequantization destination buffer.
  void set_de_quant_buffer_str(const std::string &de_quant_buffer_str);

  // Largest workspace size recorded so far (0 until something is recorded).
  // Returned by value without a top-level const, which carries no meaning on a scalar return.
  size_t de_quant_max_workspace() const { return de_quant_max_workspace_; }

  // Current destination buffer expression; empty until set_de_quant_buffer_str is called.
  // Returned as a plain value so callers can move from the result.
  std::string de_quant_buffer_str() const { return de_quant_buffer_str_; }

  // Reports whether quant_tensor qualifies for dequantization.
  bool CheckDequantFlag(const Tensor *quant_tensor);

  // Returns the generated dequantization code for quant_tensor, whose data lives at the
  // address expression quant_tensor_addr.
  std::string GetMicroDeQuantFunction(const Tensor *quant_tensor, const std::string &quant_tensor_addr);

 private:
  // Emit helpers: each appends the struct definitions, the argument array and the
  // dequantization call for one quantization layout to de_quant_code.
  void DeQuantFunctionPerTensor(const Tensor *quant_tensor, const std::vector<DeQuantArg> &de_quant_args,
                                const std::string &de_quant_arg_base_str, NNaclFp32Serializer *de_quant_code);

  void DeQuantFunction(const Tensor *quant_tensor, const std::vector<DeQuantArg> &de_quant_args,
                       const std::string &de_quant_arg_base_str, NNaclFp32Serializer *de_quant_code);

  void DeQuantFunctionPerChannel(const Tensor *quant_tensor, const std::vector<DeQuantArg> &de_quant_args,
                                 const std::string &de_quant_arg_base_str, NNaclFp32Serializer *de_quant_code);

  Dequant() = default;
  ~Dequant() = default;

  // Raises de_quant_max_workspace_ to curr_workspace when that is larger.
  void DequantRecordWorkspcae(size_t curr_workspace);

  std::string de_quant_buffer_str_;   // destination buffer expression
  std::string quant_tensor_addr_;     // source address expression of the tensor being processed
  size_t de_quant_max_workspace_{0};  // largest workspace size recorded
 };
 }  // namespace mindspore::lite::micro::nnacl
 #endif  // MICRO_LITE_MICRO_CODER_OPCODERS_NNACL_DEQUANT_DEQUANT_H_
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/activation_fp32_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/activation_fp32_coder.cc
@@ -13,12 +13,12 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "micro/coder/opcoders/nnacl/fp32/activation_fp32_coder.h"
 #include "coder/opcoders/nnacl/fp32/activation_fp32_coder.h"
 #include <string>
 #include "nnacl/fp32/activation_fp32.h"
 #include "nnacl/op_base.h"
 #include "micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
 #include "micro/coder/opcoders/file_collector.h"
 #include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
 #include "coder/opcoders/file_collector.h"

 using mindspore::schema::PrimitiveType_Activation;

@@ -34,9 +34,9 @@ int ActivationFP32Coder::DoCode(CoderContext *const context) {
  int count = MSMIN(stride, length - stride * task_id);

  if (activation_parameter->type_ == schema::ActivationType_SIGMOID) {
    Collect(context, {"runtime/kernel/fp32/sigmoid.h"}, {"sigmoid.c"});
    Collect(context, {"runtime/kernel/fp32/sigmoid_fp32.h"}, {"sigmoid_fp32.c"});
  } else {
    Collect(context, {"nnacl/fp32/activation.h"}, {"activation.c"});
    Collect(context, {"nnacl/fp32/activation_fp32.h"}, {"activation_fp32.c"});
  }
  NNaclFp32Serializer code;
  switch (activation_parameter->type_) {
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/addn_fp32_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/addn_fp32_coder.cc
@@ -14,10 +14,10 @@
 * limitations under the License.
 */

 #include "micro/coder/opcoders/nnacl/fp32/addn_fp32_coder.h"
 #include "coder/opcoders/nnacl/fp32/addn_fp32_coder.h"
 #include <string>
 #include "micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
 #include "micro/coder/opcoders/file_collector.h"
 #include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
 #include "coder/opcoders/file_collector.h"

 using mindspore::schema::PrimitiveType_AddN;
 namespace mindspore::lite::micro::nnacl {
@@ -28,15 +28,12 @@ int AddNFP32Coder::DoCode(CoderContext *const context) {
  int elements_num = input0->ElementsNum();

  // Get Tensor Pointer
  std::string input0_str = allocator_->GetRuntimeAddr(input0);
  std::string input1_str = allocator_->GetRuntimeAddr(input1);
  Collect(context, {"nnacl/kernel/fp32/add_fp32_slim.h"}, {"add_fp32_slim.c"});
  Collect(context, {"nnacl/kernel/fp32/add_fp32.h"}, {"add_fp32.c"});
  NNaclFp32Serializer code;
  code.CodeFunction("ElementAdd", input0_str, input1_str, output_tensor_, elements_num);
  code.CodeFunction("ElementAdd", input0, input1, output_tensor_, elements_num);
  if (input_tensors_.size() > 2) {
    for (size_t i = 2; i < input_tensors_.size(); ++i) {
      std::string input_str = allocator_->GetRuntimeAddr(input_tensors_.at(i));
      code.CodeFunction("ElementAdd", input_str, output_tensor_, elements_num);
      code.CodeFunction("ElementAdd", input_tensors_.at(i), output_tensor_, elements_num);
    }
  }
  context->AppendCode(code.str());
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/arithmetic_fp32_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/arithmetic_fp32_coder.cc
@@ -61,7 +61,7 @@ int ArithmeticFP32Coder::Init(CoderContext *const context) {

  if (arithmetic_parameter_->in_elements_num0_ == 1 || arithmetic_parameter_->in_elements_num1_ == 1) {
    switch (arithmetic_parameter_->op_parameter_.type_) {
      case PrimitiveType_Mul:
      case PrimitiveType_MulFusion:
        switch (arithmetic_parameter_->activation_type_) {
          case schema::ActivationType_RELU:
            arithmetic_parameter_->broadcasting_ = false;
@@ -80,7 +80,7 @@ int ArithmeticFP32Coder::Init(CoderContext *const context) {
            break;
        }
        break;
      case PrimitiveType_Add:
      case PrimitiveType_AddFusion:
        switch (arithmetic_parameter_->activation_type_) {
          case schema::ActivationType_RELU:
            arithmetic_parameter_->broadcasting_ = false;
@@ -99,7 +99,7 @@ int ArithmeticFP32Coder::Init(CoderContext *const context) {
            break;
        }
        break;
      case PrimitiveType_Sub:
      case PrimitiveType_SubFusion:
        switch (arithmetic_parameter_->activation_type_) {
          case schema::ActivationType_RELU:
            arithmetic_parameter_->broadcasting_ = false;
@@ -157,7 +157,7 @@ int ArithmeticFP32Coder::Prepare(CoderContext *const context) {
  }
  arithmetic_parameter_ = reinterpret_cast<ArithmeticParameter *>(parameter_);
  std::map<int, std::function<void()>> type_setters = {
    {PrimitiveType_Mul,
    {PrimitiveType_MulFusion,
     [this]() {
       switch (arithmetic_parameter_->activation_type_) {
         case schema::ActivationType_RELU:
@@ -174,7 +174,7 @@ int ArithmeticFP32Coder::Prepare(CoderContext *const context) {
           break;
       }
     }},
    {PrimitiveType_Add,
    {PrimitiveType_AddFusion,
     [this]() {
       switch (arithmetic_parameter_->activation_type_) {
         case schema::ActivationType_RELU:
@@ -191,7 +191,7 @@ int ArithmeticFP32Coder::Prepare(CoderContext *const context) {
           break;
       }
     }},
    {PrimitiveType_Sub,
    {PrimitiveType_SubFusion,
     [this]() {
       switch (arithmetic_parameter_->activation_type_) {
         case schema::ActivationType_RELU:
@@ -205,7 +205,7 @@ int ArithmeticFP32Coder::Prepare(CoderContext *const context) {
           break;
       }
     }},
    {PrimitiveType_Div,
    {PrimitiveType_DivFusion,
     [this]() {
       switch (arithmetic_parameter_->activation_type_) {
         case schema::ActivationType_RELU:
@@ -275,15 +275,16 @@ int ArithmeticFP32Coder::DoCode(CoderContext *const context) {
   * this solution is not suitable for micro, for the size of package.
   * */
  if (arithmetic_opt_run_ == "ElementOptSub" || arithmetic_run_ == "ElementSub") {
    Collect(context, {"nnacl/kernel/fp32/sub.h"}, {"sub.c"});
    Collect(context, {"nnacl/fp32/sub_fp32.h"}, {"sub_fp32.c"});
  } else if (arithmetic_opt_run_ == "ElementOptAdd" || arithmetic_run_ == "ElementAdd") {
    Collect(context, {"nnacl/kernel/fp32/add_fp32_slim.h"}, {"add_fp32_slim.c"});
    Collect(context, {"nnacl/fp32/add_fp32.h"}, {"add_fp32.c"});
  } else if (arithmetic_opt_run_ == "ElementOptMul" || arithmetic_run_ == "ElementMul") {
    Collect(context, {"nnacl/kernel/fp32/mul.h"}, {"mul.c"});
    Collect(context, {"nnacl/fp32/mul_fp32.h"}, {"mul_fp32.c"});
  } else if (arithmetic_run_ == "ElementAddRelu") {
    Collect(context, {"nnacl/kernel/fp32/add_relu.h"}, {"add_relu.c"});
    Collect(context, {"nnacl/fp32/add_relu_fp32.h"}, {"add_relu_fp32.c"});
  } else {
    Collect(context, {"nnacl/arithmetic_common.h", "nnacl/fp32/arithmetic.h"}, {"arithmetic_common.c", "arithmetic.c"});
    Collect(context, {"nnacl/arithmetic_common.h", "nnacl/fp32/arithmetic_fp32.h"},
            {"arithmetic_common.c", "arithmetic_fp32.c"});
  }

  if (arithmetic_parameter_->broadcasting_) {
@@ -330,15 +331,15 @@ int ArithmeticFP32Coder::DoCode(CoderContext *const context) {
  return RET_OK;
 }

 REG_OPERATOR_CODER(kAllTargets, kNumberTypeInt32, PrimitiveType_Add, CPUOpCoderCreator<ArithmeticFP32Coder>)
 REG_OPERATOR_CODER(kAllTargets, kNumberTypeInt32, PrimitiveType_AddFusion, CPUOpCoderCreator<ArithmeticFP32Coder>)

 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Mul, CPUOpCoderCreator<ArithmeticFP32Coder>)
 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_MulFusion, CPUOpCoderCreator<ArithmeticFP32Coder>)

 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Add, CPUOpCoderCreator<ArithmeticFP32Coder>)
 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_AddFusion, CPUOpCoderCreator<ArithmeticFP32Coder>)

 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Sub, CPUOpCoderCreator<ArithmeticFP32Coder>)
 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_SubFusion, CPUOpCoderCreator<ArithmeticFP32Coder>)

 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Div, CPUOpCoderCreator<ArithmeticFP32Coder>)
 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_DivFusion, CPUOpCoderCreator<ArithmeticFP32Coder>)

 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_LogicalAnd, CPUOpCoderCreator<ArithmeticFP32Coder>)

--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/arithmetic_fp32_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/arithmetic_fp32_coder.h
@@ -25,9 +25,9 @@
 #define DEFAULT_ARITHMETIC_NDIMS 10
 namespace mindspore::lite::micro::nnacl {

 using mindspore::schema::PrimitiveType_Add;
 using mindspore::schema::PrimitiveType_AddFusion;

 using mindspore::schema::PrimitiveType_Div;
 using mindspore::schema::PrimitiveType_DivFusion;

 using mindspore::schema::PrimitiveType_Equal;

@@ -51,7 +51,7 @@ using mindspore::schema::PrimitiveType_Maximum;

 using mindspore::schema::PrimitiveType_Minimum;

 using mindspore::schema::PrimitiveType_Mul;
 using mindspore::schema::PrimitiveType_MulFusion;

 using mindspore::schema::PrimitiveType_NotEqual;

@@ -59,7 +59,7 @@ using mindspore::schema::PrimitiveType_RealDiv;

 using mindspore::schema::PrimitiveType_SquaredDifference;

 using mindspore::schema::PrimitiveType_Sub;
 using mindspore::schema::PrimitiveType_SubFusion;

 using mindspore::schema::PrimitiveType_Eltwise;

--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/arithmetic_self_fp32_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/arithmetic_self_fp32_coder.h
@@ -27,7 +27,7 @@ namespace mindspore::lite::micro::nnacl {

 using mindspore::schema::PrimitiveType_Abs;

 using mindspore::schema::PrimitiveType_Add;
 using mindspore::schema::PrimitiveType_AddFusion;

 using mindspore::schema::PrimitiveType_AddN;

@@ -37,7 +37,7 @@ using mindspore::schema::PrimitiveType_Ceil;

 using mindspore::schema::PrimitiveType_Cos;

 using mindspore::schema::PrimitiveType_Div;
 using mindspore::schema::PrimitiveType_DivFusion;

 using mindspore::schema::PrimitiveType_Equal;

@@ -67,7 +67,7 @@ using mindspore::schema::PrimitiveType_Maximum;

 using mindspore::schema::PrimitiveType_Minimum;

 using mindspore::schema::PrimitiveType_Mul;
 using mindspore::schema::PrimitiveType_MulFusion;

 using mindspore::schema::PrimitiveType_NotEqual;

@@ -81,7 +81,7 @@ using mindspore::schema::PrimitiveType_Sqrt;

 using mindspore::schema::PrimitiveType_SquaredDifference;

 using mindspore::schema::PrimitiveType_Sub;
 using mindspore::schema::PrimitiveType_SubFusion;

 using mindspore::schema::PrimitiveType_Sin;

--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/assign_add_fp32_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/assign_add_fp32_coder.cc
@@ -14,10 +14,10 @@
 * limitations under the License.
 */

 #include "micro/coder/opcoders/nnacl/fp32/assign_add_fp32_coder.h"
 #include "coder/opcoders/nnacl/fp32/assign_add_fp32_coder.h"
 #include <string>
 #include "schema/inner/ops_generated.h"
 #include "micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
 #include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"

 namespace mindspore::lite::micro::nnacl {

--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/batchnorm_fp32_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/batchnorm_fp32_coder.cc
@@ -17,7 +17,6 @@
 #include <string>
 #include <vector>
 #include "nnacl/fp32/batchnorm_fp32.h"
 #include "src/ops/batch_norm.h"
 #include "nnacl/op_base.h"
 #include "coder/opcoders/file_collector.h"
 #include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
@@ -27,10 +26,7 @@ using mindspore::schema::PrimitiveType_BatchNorm;
 namespace mindspore::lite::micro::nnacl {

 int BatchnormFP32Coder::Init() {
  auto bn_parameter = reinterpret_cast<BatchNormParameter *>(parameter_);
  auto bn_prim = reinterpret_cast<const mindspore::lite::BatchNorm *>(OperatorCoder::primitive());
  bn_parameter->epsilon_ = bn_prim->GetEpsilon();

  auto bn_parameter = reinterpret_cast<BatchNormParameter *>(OperatorCoder::parameter_);
  std::vector<int> input_shapes = input_tensor_->shape();
  if (input_shapes.empty()) {
    return RET_ERROR;
@@ -41,7 +37,9 @@ int BatchnormFP32Coder::Init() {
  for (int i = 0; i < n_dim - 1; i++) {
    bn_parameter->unit_ *= input_shapes.at(i);
  }
  bn_parameter->op_parameter_.thread_num_ = MSMIN(bn_parameter->op_parameter_.thread_num_, bn_parameter->unit_);
  if (default_momentum_ < 0.0f) {
    default_momentum_ = bn_parameter->momentum_;
  }
  return RET_OK;
 }

@@ -59,7 +57,7 @@ int BatchnormFP32Coder::DoCode(CoderContext *const context) {
  Collect(context, {"nnacl/fp32/batchnorm.h"}, {"nnacl/fp32/batchnorm.c"});
  NNaclFp32Serializer code;
  code.CodeStruct("bn_parameter", *bn_parameter);
  code.CodeFunction("BatchNorm", output_tensor_, input_tensor_, mean_tensor, var_tensor, task_id, "&bn_parameter");
  code.CodeFunction("BatchNormFp32", input_tensor_, mean_tensor, var_tensor, "&bn_parameter", task_id, output_tensor_);
  MS_LOG(INFO) << "BatchnormFP32Code has been called";
  context->AppendCode(code.str());
  return lite::RET_OK;
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/batchnorm_fp32_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/batchnorm_fp32_coder.h
@@ -36,6 +36,12 @@ class BatchnormFP32Coder final : public OperatorCoder {

 private:
  int Init();

  float default_momentum_{-1.0f};

  float *mean_{nullptr};

  float *variance_{nullptr};
 };

 }  // namespace mindspore::lite::micro::nnacl
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/biasadd_fp32_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/biasadd_fp32_coder.cc
@@ -0,0 +1,77 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "coder/opcoders/nnacl/fp32/biasadd_fp32_coder.h"
 #include <string>
 #include "coder/opcoders/file_collector.h"
 #include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"

 using mindspore::schema::PrimitiveType_BiasAdd;

 namespace mindspore::lite::micro::nnacl {

 /**
  * Binds the operator parameter as an ArithmeticParameter and reserves the two
  * float workspace buffers (tile_in_, tile_bias_) that DoCode passes to the
  * generated BroadcastAdd call. Each buffer holds one float per element of the
  * first input tensor.
  *
  * @param context coder context (not used by this step).
  * @return RET_OK on success; RET_ERROR when the parameter is missing, there is
  *         no input tensor, or a workspace allocation fails.
  */
 int BiasAddFP32Coder::Prepare(CoderContext *context) {
  arithmetic_parameter_ = reinterpret_cast<ArithmeticParameter *>(parameter_);
  // DoCode dereferences arithmetic_parameter_ unconditionally, so reject a missing parameter here.
  if (arithmetic_parameter_ == nullptr || input_tensors_.empty()) {
    return RET_ERROR;
  }
  size_t data_size = input_tensors_.at(0)->ElementsNum();
  tile_in_ = reinterpret_cast<float *>(allocator_->Malloc(kNumberTypeFloat32, data_size * sizeof(float), kWorkspace));
  if (tile_in_ == nullptr) {
    return RET_ERROR;
  }
  tile_bias_ = reinterpret_cast<float *>(allocator_->Malloc(kNumberTypeFloat32, data_size * sizeof(float), kWorkspace));
  if (tile_bias_ == nullptr) {
    return RET_ERROR;
  }
  return RET_OK;
 }

 /**
  * Emits the code for one BiasAdd node: registers the nnacl headers/sources,
  * fills the ArithmeticParameter shapes from the input tensor and appends a
  * BroadcastAdd call to the context.
  *
  * The bias operand is described as all-ones except for the last axis, which
  * takes the input's last dimension.
  *
  * @param ctx coder context that collects files and receives the generated code.
  * @return RET_OK on success, RET_ERROR when fewer than kBiasIndex inputs exist.
  */
 int BiasAddFP32Coder::DoCode(CoderContext *ctx) {
  if (input_tensors_.size() < kBiasIndex) {
    return RET_ERROR;
  }
  size_t element_count = input_tensor_->ElementsNum();
  std::string bias_addr = allocator_->GetRuntimeAddr(input_tensors_.at(kWeightIndex));
  Collect(ctx,
          {"nnacl/arithmetic.h", "nnacl/nnacl_utils.h", "nnacl/nnacl_common.h", "nnacl/base/arithmetic_base.h",
           "nnacl/fp32/add_fp32.h", "nnacl/fp32/arithmetic_fp32.h"},
          {"arithmetic_base.c", "arithmetic_fp32.c", "add_fp32.c"});
  NNaclFp32Serializer code;
  const std::vector<int> shape = input_tensor_->shape();
  const size_t rank = shape.size();

  arithmetic_parameter_->broadcasting_ = false;
  arithmetic_parameter_->ndim_ = rank;
  arithmetic_parameter_->activation_type_ = 0;

  // One pass fills all three shape arrays; only the bias shape differs from the input shape.
  for (size_t axis = 0; axis < rank; ++axis) {
    const bool is_last_axis = (axis + 1 == rank);
    arithmetic_parameter_->in_shape0_[axis] = shape[axis];
    arithmetic_parameter_->in_shape1_[axis] = is_last_axis ? shape[axis] : 1;
    arithmetic_parameter_->out_shape_[axis] = shape[axis];
  }
  arithmetic_parameter_->in_elements_num0_ = 0;
  arithmetic_parameter_->in_elements_num1_ = 0;
  arithmetic_parameter_->out_elements_num_ = 0;
  // The remaining ArithmeticParameter fields are not set here.

  code.CodeStruct("arith_param", *arithmetic_parameter_);
  code.CodeFunction("BroadcastAdd", input_tensor_, bias_addr, tile_in_, tile_bias_, output_tensor_, element_count,
                    "(ArithmeticParameter *)&arith_param");
  ctx->AppendCode(code.str());
  return RET_OK;
 }

 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_BiasAdd, CPUOpCoderCreator<BiasAddFP32Coder>)
 }  // namespace mindspore::lite::micro::nnacl
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/biasadd_fp32_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/biasadd_fp32_coder.h
@@ -0,0 +1,43 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_MICRO_CODER_OPCODERS_NNACL_FP32_BIASADD_FP32_CODER_H_
 #define MINDSPORE_LITE_MICRO_CODER_OPCODERS_NNACL_FP32_BIASADD_FP32_CODER_H_

 #include <vector>
 #include "coder/opcoders/op_coder.h"
 #include "nnacl/arithmetic.h"

 namespace mindspore::lite::micro::nnacl {
 // Operator coder for BiasAdd in the nnacl FP32 opcoder set.
 // Construction only forwards its arguments to OperatorCoder; the work happens
 // in the Prepare and DoCode overrides.
 class BiasAddFP32Coder final : public OperatorCoder {
 public:
  // Forwards the tensors, node, node index and target unchanged to the OperatorCoder base.
  BiasAddFP32Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
                   const Model::Node *node, size_t node_index, Target target)
      : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {}

  ~BiasAddFP32Coder() override = default;

  // Overrides OperatorCoder::Prepare; returns an int status code.
  int Prepare(CoderContext *context) override;

  // Overrides OperatorCoder::DoCode; returns an int status code.
  int DoCode(CoderContext *context) override;

 private:
  // Operator parameter viewed as an ArithmeticParameter; null until assigned.
  ArithmeticParameter *arithmetic_parameter_{nullptr};
  // Float scratch buffers passed to the generated BroadcastAdd call; null until assigned.
  float *tile_in_{nullptr};
  float *tile_bias_{nullptr};
 };
 }  // namespace mindspore::lite::micro::nnacl
 #endif  // MINDSPORE_LITE_MICRO_CODER_OPCODERS_NNACL_FP32_BIASADD_FP32_CODER_H_
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/convolution_depthwise_fp32_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/convolution_depthwise_fp32_coder.cc
@@ -14,13 +14,12 @@
 * limitations under the License.
 */

 #include "micro/coder/opcoders/nnacl/fp32/convolution_depthwise_fp32_coder.h"
 #include "coder/opcoders/nnacl/fp32/convolution_depthwise_fp32_coder.h"
 #include <string>
 #include "micro/coder/log.h"
 #include "micro/coder/opcoders/file_collector.h"
 #include "micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
 #include "coder/log.h"
 #include "coder/opcoders/file_collector.h"
 #include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"

 using mindspore::schema::PrimitiveType_DepthwiseConv2D;
 namespace mindspore::lite::micro::nnacl {
 int ConvolutionDepthwiseFP32Coder::Prepare(CoderContext *const context) {
  Conv2DBaseCoder::Init();
@@ -73,6 +72,4 @@ int ConvolutionDepthwiseFP32Coder::DoCode(CoderContext *const context) {
  return RET_OK;
 }

 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_DepthwiseConv2D,
                   CPUOpCoderCreator<ConvolutionDepthwiseFP32Coder>)
 }  // namespace mindspore::lite::micro::nnacl
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/convolution_depthwise_fp32_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/convolution_depthwise_fp32_coder.h
@@ -18,7 +18,7 @@
 #define MINDSPORE_LITE_MICRO_CODER_OPCODERS_FP32_CONVOLUTION_DEPTHWISE_FP32_CODER_H_

 #include <vector>
 #include "micro/coder/opcoders/base/conv2d_base_coder.h"
 #include "coder/opcoders/base/conv2d_base_coder.h"
 #include "src/runtime/kernel/arm/fp32/convolution_depthwise_fp32.h"

 namespace mindspore::lite::micro::nnacl {
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/convolution_fp32_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/convolution_fp32_coder.cc
@@ -14,17 +14,21 @@
 * limitations under the License.
 */

 #include "micro/coder/opcoders/nnacl/fp32/convolution_fp32_coder.h"
 #include "coder/opcoders/nnacl/fp32/convolution_fp32_coder.h"
 #include <memory>
 #include <string>
 #include <vector>
 #include "micro/coder/opcoders/nnacl/fp32/convolution_winograd_fp32_coder.h"
 #include "coder/opcoders/nnacl/fp32/convolution_winograd_fp32_coder.h"
 #include "coder/opcoders/nnacl/fp32/convolution_depthwise_fp32_coder.h"
 #include "nnacl/fp32/winograd_utils.h"
 #include "src/ops/populate/populate_register.h"
 #include "micro/coder/opcoders/file_collector.h"
 #include "micro/coder/log.h"
 #include "coder/opcoders/file_collector.h"
 #include "coder/log.h"
 #include "src/common/prim_util.h"
 #include "src/common/version_manager.h"
 #include "coder/opcoders/nnacl/dequant/de_quant.h"

 using mindspore::schema::PrimitiveType_Conv2D;
 using mindspore::schema::PrimitiveType_Conv2DFusion;
 namespace mindspore::lite::micro::nnacl {
 int ConvolutionFP32Coder::InitTmpBuffer() {
  int in_channel = conv_param_->input_channel_;
@@ -43,20 +47,16 @@ int ConvolutionFP32Coder::InitTmpBuffer() {
 }

 /**
  * Prepares the FP32 convolution coder: runs the base-class initialisation,
  * resolves whether the filter tensor needs de-quantisation, generates the
  * weight/bias init code and finishes with Resize().
  *
  * Each step runs exactly once. InitWeightBias() allocates buffers and appends
  * init code to the context, so running it twice would duplicate both.
  *
  * @param context coder context that receives the generated init code.
  * @return the result of Resize() on success, or the first failing step's code.
  */
 int ConvolutionFP32Coder::Prepare(CoderContext *const context) {
  MS_CHECK_RET_CODE(Conv2DBaseCoder::Init(), "Conv2DBaseCoder::Init() failed.");
  // InitWeightBias() branches on de_quant_flag_, so the flag must be set before it runs.
  de_quant_flag_ = Dequant::GetInstance()->CheckDequantFlag(filter_tensor_);
  MS_CHECK_RET_CODE(InitWeightBias(context), "Init weight bias failed.");
  return Resize();
 }

 /**
  * Re-validates the convolution shapes and rebuilds the shape-dependent state:
  * checks that the resize is valid, re-runs the base-class initialisation and
  * sets up the temporary buffers.
  *
  * Each step runs exactly once, so InitTmpBuffer() is not invoked a second time
  * for the same resize.
  *
  * @return RET_OK on success, or the error code of the first failing step.
  */
 int ConvolutionFP32Coder::Resize() {
  MS_CHECK_RET_CODE(Conv2DBaseCoder::CheckResizeValid(), "Resize is invalid.");
  MS_CHECK_RET_CODE(Conv2DBaseCoder::Init(), "init failed.");
  MS_CHECK_RET_CODE(InitTmpBuffer(), "init tmp buffer failed.");
  return RET_OK;
 }

@@ -71,36 +71,43 @@ int ConvolutionFP32Coder::InitWeightBias(CoderContext *const context) {
  const int oc_block = C8NUM;
  int oc_block_num = UP_DIV(out_channel, C8NUM);
  int pack_weight_size = oc_block_num * oc_block * in_channel * kernel_plane;

  pack_weight_size_ = pack_weight_size * sizeof(float);
  auto origin_weight = reinterpret_cast<float *>(filter_tensor_->MutableData());
  MS_CHECK_PTR(origin_weight);
  packed_weight_ = reinterpret_cast<float *>(
    allocator_->Malloc(kNumberTypeFloat32, pack_weight_size * sizeof(float), kOnlinePackWeight));
  packed_weight_ = reinterpret_cast<float *>(allocator_->Malloc(kNumberTypeFloat32, kOnlineSize, kOnlinePackWeight));
  MS_CHECK_PTR(packed_weight_);
  auto out_channel_size = static_cast<size_t>(out_channel);

  NNaclFp32Serializer code;
  code.CodeMallocExpression(packed_weight_, pack_weight_size * sizeof(float));
  code.CodeFunction("memset", packed_weight_, 0, pack_weight_size * sizeof(float));
  code.CodeFunction("RowMajor2Col8Major", filter_tensor_, packed_weight_, out_channel_size, in_channel * kernel_plane);
  NNaclFp32Serializer init_code;
  std::string ori_weight_addr = allocator_->GetRuntimeAddr(filter_tensor_);
  std::string init_weight_str = ori_weight_addr;
  if (de_quant_flag_) {
    init_weight_str = Dequant::GetInstance()->de_quant_buffer_str();
    std::string de_quant_function = Dequant::GetInstance()->GetMicroDeQuantFunction(filter_tensor_, ori_weight_addr);
    init_code << de_quant_function;
  }
  init_code.CodeMallocExpression(packed_weight_, pack_weight_size_);
  init_code.CodeFunction("memset", packed_weight_, 0, pack_weight_size_);
  init_code.CodeFunction("RowMajor2Col8Major", init_weight_str, packed_weight_, out_channel_size,
                         in_channel * kernel_plane);

  auto bias_data_size = static_cast<size_t>(oc_block_num * oc_block * sizeof(float));
  bias_data_ = reinterpret_cast<float *>(allocator_->Malloc(kNumberTypeFloat32, bias_data_size, kOnlinePackWeight));
  bias_data_ = reinterpret_cast<float *>(allocator_->Malloc(kNumberTypeFloat32, kOnlineSize, kOnlinePackWeight));
  MS_CHECK_PTR(bias_data_);
  if (input_tensors_.size() == kInputSize2) {
    code.CodeMallocExpression(bias_data_, bias_data_size);
    code.CodeFunction("memset", bias_data_, 0, bias_data_size);
    code.CodeFunction("memcpy", bias_data_, bias_tensor_, out_channel_size * sizeof(float));
    init_code.CodeMallocExpression(bias_data_, bias_data_size);
    init_code.CodeFunction("memset", bias_data_, 0, bias_data_size);
    init_code.CodeFunction("memcpy", bias_data_, bias_tensor_, out_channel_size * sizeof(float));
  } else {
    return RET_ERROR;
  }
  context->AppendInitCode(code.str());
  context->AppendInitCode(init_code.str());
  return RET_OK;
 }

 int ConvolutionFP32Coder::DoCode(CoderContext *const context) {
  {
    std::vector<string> asmFiles;
    std::vector<std::string> asmFiles;
    if (target_ == kARM32A) {
      asmFiles = {"MatmulFp32.S",
                  "MatmulFp32Opt.S",
@@ -112,9 +119,14 @@ int ConvolutionFP32Coder::DoCode(CoderContext *const context) {
      asmFiles = {"MatmulFp32.S",          "MatmulFp32Opt.S",      "PreSum4x16Int8Peroc.S",       "MatVecMulFp32.S",
                  "PreSum4x16Int8Peroc.S", "PreSum4x16Int8Pert.S", "IndirectGemmInt16to32_8x4.S", "MatmulInt8.S"};
    }
    Collect(context,
            {"nnacl/kernel/fp32/conv_fp32_slim.h", "nnacl/fp32/matmul.h", "nnacl/conv_parameter.h", "nnacl/op_base.h"},
            {"common_func.c", "conv_fp32_slim.c", "matmul.c"}, asmFiles);
    std::vector<std::string> h_files = {"nnacl/fp32/conv_common_fp32.h", "nnacl/fp32/matmul.h",
                                        "nnacl/conv_parameter.h", "nnacl/op_base.h"};
    std::vector<std::string> c_files = {"common_func.c", "conv_common_fp32.c", "matmul.c"};
    if (de_quant_flag_) {
      h_files.emplace_back("wrapper/fp32/dequant_int8_to_fp32_wrapper.h");
      c_files.emplace_back("dequant_int8_to_fp32_wrapper.c");
    }
    Collect(context, h_files, c_files, asmFiles);
  }
  NNaclFp32Serializer code;
  // call the op function
@@ -122,7 +134,7 @@ int ConvolutionFP32Coder::DoCode(CoderContext *const context) {
  code.CodeFunction("memset", col_major_input_, "0", col_major_input_size_);
  code.CodeStruct("conv_parameter", *conv_param_);
  int task_id = 0;
  code.CodeFunction("ConvFp32Slim", input_tensor_, packed_input_, packed_weight_, bias_data_, col_major_input_,
  code.CodeFunction("ConvFp32", input_tensor_, packed_input_, packed_weight_, bias_data_, col_major_input_,
                    output_tensor_, task_id, "(ConvParameter *)&conv_parameter");

  context->AppendCode(code.str());
@@ -135,18 +147,18 @@ std::unique_ptr<OperatorCoder> CPUConvolutionFP32CoderCreator(const std::vector<
                                                              Target target) {
  std::vector<Tensor *> inputs = in_tensors;
  std::vector<Tensor *> outputs = out_tensors;
  auto primitive = node->primitive_;
  if (!primitive) {
  const void *primitive = node->primitive_;
  if (primitive == nullptr) {
    return nullptr;
  }
  OpParameter *parameter =
    PopulateRegistry::GetInstance()->GetParameterCreator((schema::PrimitiveType(primitive->Type())))(primitive);
  if (parameter == nullptr) {
    MS_LOG(ERROR) << "PopulateParameter return nullptr, type: "
                  << schema::EnumNamePrimitiveType((schema::PrimitiveType)(primitive->Type()));
  int schema_version = VersionManager::GetInstance()->GetSchemaVersion();
  ParameterGen paramGen =
    PopulateRegistry::GetInstance()->GetParameterCreator(GetPrimitiveType(node->primitive_), schema_version);
  if (paramGen == nullptr) {
    MS_LOG(ERROR) << "parameter generator is null";
    return nullptr;
  }
  auto conv_param = reinterpret_cast<ConvParameter *>(parameter);
  auto conv_param = reinterpret_cast<ConvParameter *>(paramGen(node->primitive_));
  bool use_winograd = false;
  int out_unit = 0;
  int kernel_h = conv_param->kernel_h_;
@@ -159,7 +171,7 @@ std::unique_ptr<OperatorCoder> CPUConvolutionFP32CoderCreator(const std::vector<
  conv_param->output_channel_ = outputs.at(kOutputIndex)->Channel();
  conv_param->op_parameter_.thread_num_ = 1;
  CheckIfUseWinograd(&use_winograd, &out_unit, conv_param);
  free(parameter);
  free(conv_param);
  // weight de quant
  std::unique_ptr<OperatorCoder> coder;
  if (kernel_h == 1 && kernel_w == 1) {
@@ -175,5 +187,32 @@ std::unique_ptr<OperatorCoder> CPUConvolutionFP32CoderCreator(const std::vector<
  return coder;
 }

 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Conv2D, CPUConvolutionFP32CoderCreator)
 /**
  * Creates the FP32 coder for a Conv2DFusion node, selected by the group count
  * of the populated ConvParameter:
  *   - group == 1                                  -> CPUConvolutionFP32CoderCreator
  *   - group == input_channel == output_channel    -> ConvolutionDepthwiseFP32Coder
  *   - anything else (group convolution)           -> not supported, returns nullptr
  *
  * The ConvParameter is only used to make this decision, so the fields are
  * copied out and the parameter is released before dispatching.
  *
  * @param in_tensors  input tensors of the node.
  * @param out_tensors output tensors of the node.
  * @param node        model node whose primitive describes the convolution.
  * @param node_index  index of the node in the graph.
  * @param target      code generation target.
  * @return the created coder, or nullptr when the primitive is missing, no
  *         parameter can be populated, or the convolution kind is unsupported.
  */
 std::unique_ptr<OperatorCoder> CPUConv2DFusionFP32CoderCreator(const std::vector<Tensor *> &in_tensors,
                                                               const std::vector<Tensor *> &out_tensors,
                                                               const Model::Node *node, size_t node_index,
                                                               Target target) {
  const void *primitive = node->primitive_;
  if (primitive == nullptr) {
    return nullptr;
  }
  int schema_version = VersionManager::GetInstance()->GetSchemaVersion();
  ParameterGen paramGen =
    PopulateRegistry::GetInstance()->GetParameterCreator(GetPrimitiveType(primitive), schema_version);
  if (paramGen == nullptr) {
    MS_LOG(ERROR) << "parameter generator is null";
    return nullptr;
  }
  auto conv_param = reinterpret_cast<ConvParameter *>(paramGen(primitive));
  if (conv_param == nullptr) {
    MS_LOG(ERROR) << "populate conv parameter failed";
    return nullptr;
  }
  // Copy what the dispatch needs, then release the temporary parameter so it does not leak.
  const auto group = conv_param->group_;
  const auto input_channel = conv_param->input_channel_;
  const auto output_channel = conv_param->output_channel_;
  free(conv_param);
  conv_param = nullptr;

  if (group == 1) {
    return CPUConvolutionFP32CoderCreator(in_tensors, out_tensors, node, node_index, target);
  }
  if (group == input_channel && group == output_channel) {
    return CPUOpCoderCreator<ConvolutionDepthwiseFP32Coder>(in_tensors, out_tensors, node, node_index, target);
  }
  // GroupConv
  MS_LOG(ERROR) << "group convolution is not supported, group: " << group;
  return nullptr;
 }

 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Conv2DFusion, CPUConv2DFusionFP32CoderCreator)
 }  // namespace mindspore::lite::micro::nnacl
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/convolution_fp32_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/convolution_fp32_coder.h
@@ -14,14 +14,14 @@
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_MICRO_CODER_OPCODERS_FP32_CONVOLUTION_FP32_CODER_H_
 #define MINDSPORE_LITE_MICRO_CODER_OPCODERS_FP32_CONVOLUTION_FP32_CODER_H_
 #ifndef MINDSPORE_LITE_MICRO_CODER_OPCODERS_NNACL_FP32_CONVOLUTION_FP32_CODER_H_
 #define MINDSPORE_LITE_MICRO_CODER_OPCODERS_NNACL_FP32_CONVOLUTION_FP32_CODER_H_

 #include <vector>
 #include <string>
 #include "nnacl/conv_parameter.h"
 #include "micro/coder/opcoders/base/conv2d_base_coder.h"
 #include "micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
 #include "coder/opcoders/base/conv2d_base_coder.h"
 #include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"

 namespace mindspore::lite::micro::nnacl {
 class ConvolutionFP32Coder final : public Conv2DBaseCoder {
@@ -51,12 +51,14 @@ class ConvolutionFP32Coder final : public Conv2DBaseCoder {

  size_t packed_input_size_{0};

  int thread_stride_{0};
  bool de_quant_flag_{false};

  int thread_count_{0};

  float *col_major_input_{nullptr};
  size_t col_major_input_size_{0};

  size_t pack_weight_size_{0};
 };
 }  // namespace mindspore::lite::micro::nnacl
 #endif  // MINDSPORE_LITE_MICRO_CODER_OPCODERS_FP32_CONVOLUTION_FP32_CODER_H_
 #endif  // MINDSPORE_LITE_MICRO_CODER_OPCODERS_NNACL_FP32_CONVOLUTION_FP32_CODER_H_
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/convolution_winograd_fp32_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/convolution_winograd_fp32_coder.cc
@@ -13,12 +13,12 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "micro/coder/opcoders/nnacl/fp32/convolution_winograd_fp32_coder.h"
 #include "coder/opcoders/nnacl/fp32/convolution_winograd_fp32_coder.h"
 #include <array>
 #include "nnacl/base/minimal_filtering_generator.h"
 #include "micro/coder/log.h"
 #include "micro/coder/opcoders/file_collector.h"
 #include "micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
 #include "coder/log.h"
 #include "coder/opcoders/file_collector.h"
 #include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"

 namespace mindspore::lite::micro::nnacl {
 const std::array<std::string, 9> InputTransFuncList = {
@@ -222,10 +222,11 @@ int ConvolutionWinogradFP32Coder::DoCode(CoderContext *const context) {
    asmFiles = {"MatmulFp32.S",          "MatmulFp32Opt.S",      "PreSum4x16Int8Peroc.S",       "MatVecMulFp32.S",
                "PreSum4x16Int8Peroc.S", "PreSum4x16Int8Pert.S", "IndirectGemmInt16to32_8x4.S", "MatmulInt8.S"};
  }
  Collect(context, {"nnacl/fp32/conv.h", "nnacl/common_func.h"},
          {"common_func.c", "conv_int8.c", "matmul_int8.c", "pack.c", "conv.c", "winograd_transform.c",
           "common_func_fp32.c", "fixed_point.c", "winograd_utils.c", "minimal_filtering_generator.c"},
          asmFiles);
  Collect(
    context, {"nnacl/fp32/conv_winograd_fp32.h", "nnacl/common_func.h"},
    {"common_func.c", "conv_int8.c", "matmul_int8.c", "pack_fp32.c", "conv_winograd_fp32.c", "winograd_transform.c",
     "common_func_fp32.c", "fixed_point.c", "winograd_utils.c", "minimal_filtering_generator.c"},
    asmFiles);

  NNaclFp32Serializer code;
  // call the op function
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/convolution_winograd_fp32_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/convolution_winograd_fp32_coder.h
@@ -20,7 +20,7 @@
 #include <memory>
 #include <string>
 #include <vector>
 #include "micro/coder/opcoders/base/conv2d_base_coder.h"
 #include "coder/opcoders/base/conv2d_base_coder.h"
 #include "nnacl/conv_parameter.h"

 namespace mindspore::lite::micro::nnacl {
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/matmul_fp32_base_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/matmul_fp32_base_coder.cc
@@ -22,6 +22,7 @@
 #include "coder/opcoders/file_collector.h"
 #include "nnacl/fp32/matmul_fp32.h"
 #include "wrapper/fp32/matmul_fp32_wrapper.h"
 #include "coder/opcoders/nnacl/dequant/de_quant.h"

 using mindspore::schema::PrimitiveType_MatMul;

@@ -31,6 +32,13 @@ int MatMulFP32BaseCoder::ReSize() {
  ResizeParameter();
  thread_count_ = MSMIN(thread_num_, UP_DIV(params_->col_align_, col_tile_));
  thread_stride_ = UP_DIV(UP_DIV(params_->col_align_, col_tile_), thread_count_);
  // can not call Malloc in DoCode,so move this runtime init to final resize
  if (!params_->a_const_) {
    MS_CHECK_RET_CODE(InitBufferA(), "InitBufferA failed");
  }
  if (!params_->b_const_) {
    MS_CHECK_RET_CODE(InitBufferB(), "InitBufferB failed");
  }
  return RET_OK;
 }

@@ -45,17 +53,16 @@ int MatMulFP32BaseCoder::InitBiasData() {
 }

 // Chooses the packing tile sizes: the row tile is always C12NUM, while the
 // column tile is C4NUM on the kARM32A target and C8NUM on every other target.
 void MatMulFP32BaseCoder::InitParameter() {
  row_tile_ = C12NUM;
  col_tile_ = (target_ == kARM32A) ? C4NUM : C8NUM;
 }

 void MatMulFP32BaseCoder::ResizeParameter() {
  if (params_->row_ == 1 && !params_->b_const_) {
  if (params_->row_ == 1) {
    vec_matmul_ = true;
  }
  params_->row_align_ = vec_matmul_ ? 1 : UP_ROUND(params_->row_, row_tile_);
@@ -66,12 +73,11 @@ int MatMulFP32BaseCoder::InitBufferA() {
  if (a_pack_ptr_ != nullptr) {
    return RET_OK;
  }
  a_pack_ptr_size_ = static_cast<size_t>(params_->batch * params_->row_align_ * params_->deep_ * sizeof(float));
  if (params_->a_const_) {
    a_pack_ptr_ = reinterpret_cast<float *>(allocator_->Malloc(kNumberTypeFloat32, kOnlineSize, kOnlinePackWeight));
  } else {
    a_pack_ptr_size_ = static_cast<size_t>(params_->batch * params_->row_align_ * params_->deep_ * sizeof(float));
    a_pack_ptr_ =
      reinterpret_cast<float *>(allocator_->Malloc(kNumberTypeFloat32, a_pack_ptr_size_, kOfflinePackWeight));
    a_pack_ptr_ = reinterpret_cast<float *>(allocator_->Malloc(kNumberTypeFloat32, a_pack_ptr_size_, kWorkspace));
  }
  MS_CHECK_PTR(a_pack_ptr_);
  return RET_OK;
@@ -81,12 +87,11 @@ int MatMulFP32BaseCoder::InitBufferB() {
  if (b_pack_ptr_ != nullptr) {
    return RET_OK;
  }
  b_pack_ptr_size_ = static_cast<size_t>(params_->batch * params_->col_align_ * params_->deep_ * sizeof(float));
  if (params_->b_const_) {
    b_pack_ptr_ = reinterpret_cast<float *>(allocator_->Malloc(kNumberTypeFloat32, kOnlineSize, kOnlinePackWeight));
  } else {
    b_pack_ptr_size_ = static_cast<size_t>(params_->batch * params_->col_align_ * params_->deep_ * sizeof(float));
    b_pack_ptr_ =
      reinterpret_cast<float *>(allocator_->Malloc(kNumberTypeFloat32, b_pack_ptr_size_, kOfflinePackWeight));
    b_pack_ptr_ = reinterpret_cast<float *>(allocator_->Malloc(kNumberTypeFloat32, b_pack_ptr_size_, kWorkspace));
  }
  MS_CHECK_PTR(b_pack_ptr_);
  return RET_OK;
@@ -108,12 +113,9 @@ int MatMulFP32BaseCoder::Init() {
  MS_CHECK_RET_CODE(InitBiasData(), "InitBiasData failed");
  if (params_->a_const_) {
    MS_CHECK_RET_CODE(InitBufferA(), "InitBufferA failed");
    MS_CHECK_RET_CODE(InitMatrixA(reinterpret_cast<float *>(input_tensor_->data_c())), "InitMatrixA failed");
  }

  if (params_->b_const_) {
    MS_CHECK_RET_CODE(InitBufferB(), "InitBufferB failed");
    MS_CHECK_RET_CODE(InitMatrixB(reinterpret_cast<float *>(filter_tensor_->data_c())), "InitMatrixB failed");
  }
  return RET_OK;
 }
@@ -124,12 +126,17 @@ int MatMulFP32BaseCoder::DoCode(CoderContext *const context) {
  // generate code .h .c
  std::vector<std::string> asm_files;
  if (target_ == kARM32A) {
    asm_files = {"MatmulFp32.S", "MatmulFp32Opt.S"};
    asm_files = {"MatmulFp32.S", "MatmulFp32Opt.S", "MatmulFp32Opt12x4.S"};
  } else if (target_ == kARM64) {
    asm_files = {"arm64/MatmulFp32.S", "MatmulFp32Opt.S", "arm64/MatVecMulFp32.S"};
    asm_files = {"MatmulFp32.S", "MatmulFp32Opt.S", "MatVecMulFp32.S"};
  }
  std::vector<std::string> h_files = {"nnacl/fp32/matmul_fp32.h", "wrapper/fp32/matmul_fp32_wrapper.h"};
  std::vector<std::string> c_files = {"matmul_fp32.c", "matmul_fp32_wrapper.c"};
  if (de_quant_flag_) {
    h_files.emplace_back("wrapper/fp32/dequant_int8_to_fp32_wrapper.h");
    c_files.emplace_back("dequant_int8_to_fp32_wrapper.c");
  }
  Collect(context, {"nnacl/fp32/matmul.h", "adapter/fp32/matmul_fp32_adapter.h"}, {"matmul.c", "matmul_fp32_adapter.c"},
          asm_files);
  Collect(context, h_files, c_files, asm_files);
  NNaclFp32Serializer code;
  NNaclFp32Serializer init_code;
  code.CodeStruct("mat_mul_parameter", *params_);
@@ -137,9 +144,12 @@ int MatMulFP32BaseCoder::DoCode(CoderContext *const context) {
  // do bias packing to init
  if (bias_ptr_) {
    init_code.CodeMallocExpression(bias_ptr_, bias_pack_ptr_size_);
    init_code.CodeFunction("memcpy", bias_ptr_, bias_tensor_->data_c(), bias_pack_ptr_size_);
    init_code.CodeFunction("memcpy", bias_ptr_, bias_tensor_, bias_pack_ptr_size_);
  }

  // Get Tensor Pointer
  std::string a_str = allocator_->GetRuntimeAddr(input_tensor_);
  std::string b_str = allocator_->GetRuntimeAddr(filter_tensor_);
  std::string c_str = allocator_->GetRuntimeAddr(output_tensor_);
  std::string a_pack_str = allocator_->GetRuntimeAddr(a_pack_ptr_);
  std::string b_pack_str = allocator_->GetRuntimeAddr(b_pack_ptr_);
@@ -147,12 +157,28 @@ int MatMulFP32BaseCoder::DoCode(CoderContext *const context) {
  // do const value packing to init
  if (!params_->a_const_) {
    code.CodeFunction("InitMatrixA", input_tensor_, a_pack_ptr_, "&mat_mul_parameter", vec_matmul_);
    init_code.CodeMallocExpression(b_pack_ptr_, b_pack_ptr_size_);
    std::string b_src_str = b_str;
    if (de_quant_flag_) {
      // reuse to b_pack_str
      b_src_str = Dequant::GetInstance()->de_quant_buffer_str();
      std::string de_quant_function = Dequant::GetInstance()->GetMicroDeQuantFunction(filter_tensor_, b_str);
      init_code << de_quant_function;
    }
    // b_pack_str has been memset, no need to memset
    init_code.CodeFunction("InitMatrixB", filter_tensor_, b_pack_ptr_, "&mat_mul_parameter", vec_matmul_);
    init_code.CodeFunction("InitMatrixB", b_src_str, b_pack_ptr_, "&mat_mul_parameter", vec_matmul_);
  }
  if (!params_->b_const_) {
    init_code.CodeMallocExpression(a_pack_str, a_pack_ptr_size_);
    std::string a_src_str = a_str;
    if (de_quant_flag_) {
      // reuse to a_pack_str
      a_src_str = Dequant::GetInstance()->de_quant_buffer_str();
      std::string de_quant_function = Dequant::GetInstance()->GetMicroDeQuantFunction(input_tensor_, a_str);
      init_code << de_quant_function;
    }
    // a_pack_str has been memset, no need to memset
    init_code.CodeFunction("InitMatrixA", input_tensor_, a_pack_ptr_, "&mat_mul_parameter", vec_matmul_);
    init_code.CodeFunction("InitMatrixA", a_src_str, a_pack_ptr_, "&mat_mul_parameter", vec_matmul_);
    code.CodeFunction("InitMatrixB", filter_tensor_, b_pack_ptr_, "&mat_mul_parameter", vec_matmul_);
  }

@@ -165,13 +191,13 @@ int MatMulFP32BaseCoder::DoCode(CoderContext *const context) {
  }
  code << "for (int i = 0; i < " << params_->batch << "; ++i) {\n";
  if (vec_matmul_) {
    code << "\t\tbatch_a_ptr = " << a_pack_str << " + i * " << params_->deep_ << ";\n";
    code << "\t\tbatch_b_ptr = " << b_pack_str << " + i * " << params_->deep_ * params_->col_ << ";\n";
    code << "\t\tbatch_c_ptr = " << c_str << " + i * " << params_->row_ * params_->col_ << ";\n";
    code << "\t\tfloat *batch_a_ptr = " << a_pack_str << " + i * " << params_->deep_ << ";\n";
    code << "\t\tfloat *batch_b_ptr = " << b_pack_str << " + i * " << params_->deep_ * params_->col_ << ";\n";
    code << "\t\tfloat *batch_c_ptr = " << c_str << " + i * " << params_->row_ * params_->col_ << ";\n";
  } else {
    code << "\t\tbatch_a_ptr = " << a_pack_str << " + i * " << params_->row_align_ * params_->deep_ << ";\n";
    code << "\t\tbatch_b_ptr = " << b_pack_str << " + i * " << params_->deep_ * params_->col_align_ << ";\n";
    code << "\tbatch_c_ptr = " << c_str << " + i * " << params_->row_ * params_->col_ << ";\n";
    code << "\t\tfloat *batch_a_ptr = " << a_pack_str << " + i * " << params_->row_align_ * params_->deep_ << ";\n";
    code << "\t\tfloat *batch_b_ptr = " << b_pack_str << " + i * " << params_->deep_ * params_->col_align_ << ";\n";
    code << "\t\tfloat *batch_c_ptr = " << c_str << " + i * " << params_->row_ * params_->col_ << ";\n";
  }

  if (vec_matmul_) {
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/matmul_fp32_base_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/matmul_fp32_base_coder.h
@@ -56,6 +56,7 @@ class MatMulFP32BaseCoder : public OperatorCoder {
  float *b_pack_ptr_ = nullptr;
  float *bias_ptr_{nullptr};
  bool vec_matmul_{false};
  bool de_quant_flag_{false};

 private:
  int col_tile_{0};
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/matmul_fp32_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/matmul_fp32_coder.cc
@@ -18,6 +18,7 @@
 #include <vector>
 #include "coder/log.h"
 #include "coder/opcoders/file_collector.h"
 #include "coder/opcoders/nnacl/dequant/de_quant.h"

 using mindspore::schema::PrimitiveType_MatMul;

@@ -77,10 +78,12 @@ int MatMulFP32Coder::Prepare(CoderContext *const context) {
  params_->b_const_ = (filter_tensor_->data_c() != nullptr);
  MatMulFP32BaseCoder::InitParameter();
  if (params_->a_const_) {
    InitShapeA();
    de_quant_flag_ = Dequant::GetInstance()->CheckDequantFlag(input_tensor_);
    MS_CHECK_RET_CODE(InitShapeA(), "MatMulFP32Coder init_shape_a failed");
  }
  if (params_->b_const_) {
    InitShapeB();
    de_quant_flag_ = Dequant::GetInstance()->CheckDequantFlag(filter_tensor_);
    MS_CHECK_RET_CODE(InitShapeB(), "MatMulFP32Coder init_shape_b failed");
  }
  MS_CHECK_RET_CODE(MatMulFP32BaseCoder::Init(), "MatMulFP32Coder init failed");
  return ReSize();
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/pad_fp32_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/pad_fp32_coder.cc
@@ -14,14 +14,14 @@
 * limitations under the License.
 */

 #include "micro/coder/opcoders/nnacl/fp32/pad_fp32_coder.h"
 #include "coder/opcoders/nnacl/fp32/pad_fp32_coder.h"
 #include <string>
 #include <vector>
 #include "micro/coder/log.h"
 #include "micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
 #include "micro/coder/opcoders/file_collector.h"
 #include "coder/log.h"
 #include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
 #include "coder/opcoders/file_collector.h"

 using mindspore::schema::PrimitiveType_Pad;
 using mindspore::schema::PrimitiveType_PadFusion;

 namespace mindspore::lite::micro::nnacl {

@@ -99,5 +99,5 @@ int PadFP32Coder::DoCode(CoderContext *const context) {
  return RET_OK;
 }

 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Pad, CPUOpCoderCreator<PadFP32Coder>)
 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_PadFusion, CPUOpCoderCreator<PadFP32Coder>)
 }  // namespace mindspore::lite::micro::nnacl
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/pooling_fp32_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/pooling_fp32_coder.cc
@@ -21,7 +21,8 @@
 #include "coder/log.h"
 #include "coder/opcoders/file_collector.h"

 using mindspore::schema::PrimitiveType_Pooling;
 using mindspore::schema::PrimitiveType_AvgPoolFusion;
 using mindspore::schema::PrimitiveType_MaxPoolFusion;

 namespace mindspore::lite::micro::nnacl {

@@ -46,7 +47,7 @@ int PoolingFP32Coder::DoCode(CoderContext *const context) {
  float minf = -FLT_MAX;
  float maxf = FLT_MAX;
  if (pooling_parameter->pool_mode_ == PoolMode_MaxPool) {
    Collect(context, {"nnacl/kernel/fp32/max_pooling_fp32_slim.h"}, {"max_pooling_fp32_slim.c"});
    Collect(context, {"nnacl/fp32/pooling_fp32.h"}, {"pooling_fp32.c"});
    switch (pooling_parameter->act_type_) {
      case ActType_Relu: {
        minf = 0.f;
@@ -63,14 +64,9 @@ int PoolingFP32Coder::DoCode(CoderContext *const context) {
      }
    }

    if (thread_num_ > 1) {
      code.CodeBaseStruct("PoolingFp32Args", "args", input_tensor_, output_tensor_, "&pooling_parameter", minf, maxf);
      CODE_PARALLEL_FUNC("MaxPoolingFp32Run");
    } else {
      code.CodeFunction("MaxPooling", input_tensor_, output_tensor_, "&pooling_parameter", task_id, minf, maxf);
    }
    code.CodeFunction("MaxPooling", input_tensor_, output_tensor_, "&pooling_parameter", task_id, minf, maxf);
  } else {
    Collect(context, {"nnacl/fp32/pooling.h"}, {"pooling.c"});
    Collect(context, {"nnacl/fp32/pooling_fp32.h"}, {"pooling_fp32.c"});
    switch (pooling_parameter->act_type_) {
      case ActType_Relu: {
        minf = 0.f;
@@ -86,12 +82,7 @@ int PoolingFP32Coder::DoCode(CoderContext *const context) {
        break;
      }
    }
    if (thread_num_ > 1) {
      code.CodeBaseStruct("PoolingFp32Args", "args", input_tensor_, output_tensor_, "&pooling_parameter", minf, maxf);
      CODE_PARALLEL_FUNC("AvgPoolingFp32Run");
    } else {
      code.CodeFunction("AvgPooling", input_tensor_, output_tensor_, "&pooling_parameter", task_id, minf, maxf);
    }
    code.CodeFunction("AvgPooling", input_tensor_, output_tensor_, "&pooling_parameter", task_id, minf, maxf);
  }

  MS_LOG(INFO) << "PoolingFp32Code has been called";
@@ -99,5 +90,6 @@ int PoolingFP32Coder::DoCode(CoderContext *const context) {
  return lite::RET_OK;
 }

 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Pooling, CPUOpCoderCreator<PoolingFP32Coder>)
 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_AvgPoolFusion, CPUOpCoderCreator<PoolingFP32Coder>)
 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_MaxPoolFusion, CPUOpCoderCreator<PoolingFP32Coder>)
 }  // namespace mindspore::lite::micro::nnacl
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/pooling_fp32_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/pooling_fp32_coder.h
@@ -18,7 +18,7 @@
 #define MINDSPORE_LITE_MICRO_CODER_OPCODERS_POOLFP32_CODER_H_

 #include <vector>
 #include "micro/coder/opcoders/op_coder.h"
 #include "coder/opcoders/op_coder.h"

 namespace mindspore::lite::micro::nnacl {

--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/power_fp32_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/power_fp32_coder.cc
@@ -20,7 +20,7 @@
 #include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
 #include "coder/opcoders/file_collector.h"

 using mindspore::schema::PrimitiveType_Power;
 using mindspore::schema::PrimitiveType_PowFusion;

 namespace mindspore::lite::micro::nnacl {

@@ -55,6 +55,6 @@ int PowerFP32Coder::DoCode(CoderContext *const context) {
  return RET_OK;
 }

 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Power, CPUOpCoderCreator<PowerFP32Coder>)
 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_PowFusion, CPUOpCoderCreator<PowerFP32Coder>)

 }  // namespace mindspore::lite::micro::nnacl
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/reduce_fp32_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/reduce_fp32_coder.cc
@@ -20,7 +20,7 @@
 #include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
 #include "coder/opcoders/file_collector.h"

 using mindspore::schema::PrimitiveType_Reduce;
 using mindspore::schema::PrimitiveType_PowFusion;

 namespace mindspore::lite::micro::nnacl {
 int ReduceFP32Coder::Prepare(CoderContext *const context) {
@@ -116,6 +116,6 @@ int ReduceFP32Coder::DoCode(CoderContext *const context) {
  return RET_OK;
 }

 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Reduce, CPUOpCoderCreator<ReduceFP32Coder>)
 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_PowFusion, CPUOpCoderCreator<ReduceFP32Coder>)

 }  // namespace mindspore::lite::micro::nnacl
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/scale_fp32_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/scale_fp32_coder.cc
@@ -18,8 +18,9 @@
 #include "coder/log.h"
 #include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
 #include "coder/opcoders/file_collector.h"
 #include "coder/opcoders/parallel.h"

 using mindspore::schema::PrimitiveType_Scale;
 using mindspore::schema::PrimitiveType_ScaleFusion;

 namespace mindspore::lite::micro::nnacl {
 ScaleFP32Coder::~ScaleFP32Coder() {
@@ -131,34 +132,26 @@ int ScaleFP32Coder::DoCode(CoderContext *const context) {
  NNaclFp32Serializer code;
  code.CodeStruct("scale_parameter", *scale_param_);

  if (thread_num_ > 1) {
    code.CodeBaseStruct("ScaleFp32Args", "args", input_tensor_, output_tensor_, scale_tensor, offset_tensor,
  switch (scale_param_->activation_type_) {
    case schema::ActivationType_RELU6:
      code.CodeFunction("DoScaleRelu6", input_tensor_, output_tensor_, scale_tensor, offset_tensor, kDefaultTaskId,
                        "&scale_parameter");
    CODE_PARALLEL_FUNC("ScaleFp32Run");
  } else {
    int task_id = 0;
    switch (scale_param_->activation_type_) {
      case schema::ActivationType_RELU6:
        code.CodeFunction("DoScaleRelu6", input_tensor_, output_tensor_, scale_tensor, offset_tensor, task_id,
                          "&scale_parameter");
        break;
      case schema::ActivationType_RELU:
        code.CodeFunction("DoScaleRelu", input_tensor_, output_tensor_, scale_tensor, offset_tensor, task_id,
                          "&scale_parameter");
        break;
      case schema::ActivationType_NO_ACTIVATION:
        code.CodeFunction("DoScale", input_tensor_, output_tensor_, scale_tensor, offset_tensor, task_id,
                          "&scale_parameter");
        break;
      default:
        MS_LOG(ERROR) << "Scale does not support activation type " << scale_param_->activation_type_;
        return RET_ERROR;
    }
      break;
    case schema::ActivationType_RELU:
      code.CodeFunction("DoScaleRelu", input_tensor_, output_tensor_, scale_tensor, offset_tensor, kDefaultTaskId,
                        "&scale_parameter");
      break;
    case schema::ActivationType_NO_ACTIVATION:
      code.CodeFunction("DoScale", input_tensor_, output_tensor_, scale_tensor, offset_tensor, kDefaultTaskId,
                        "&scale_parameter");
      break;
    default:
      MS_LOG(ERROR) << "Scale does not support activation type " << scale_param_->activation_type_;
      return RET_ERROR;
  }
  MS_LOG(INFO) << "ScaleFP32Code has been called";
  context->AppendCode(code.str());
  return RET_OK;
 }

 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Scale, CPUOpCoderCreator<ScaleFP32Coder>)
 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_ScaleFusion, CPUOpCoderCreator<ScaleFP32Coder>)
 }  // namespace mindspore::lite::micro::nnacl
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/slice_fp32_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/slice_fp32_coder.cc
@@ -1,74 +0,0 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "coder/opcoders/nnacl/fp32/slice_fp32_coder.h"
 #include <string>
 #include "nnacl/slice_parameter.h"
 #include "src/ops/slice.h"
 #include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
 #include "coder/opcoders/file_collector.h"

 using mindspore::schema::PrimitiveType_Slice;
 namespace mindspore::lite::micro::nnacl {
 // Nothing to prepare for the slice coder; the slice parameter is filled in by DoCode.
 int SliceFP32Coder::Prepare(CoderContext *const context) {
  return RET_OK;
 }

 // Emits the generated C code for a float32 slice node: fills the SliceParameter from the input shape and the
 // primitive's begin/size vectors, serializes it, and appends the call to DoSliceNoParallel to the context.
 int SliceFP32Coder::DoCode(CoderContext *const context) {
  // register the headers and sources the generated code depends on
  Collect(context, {"nnacl/slice_parameter.h", "nnacl/fp32/slice.h"}, {"slice.c"});

  auto slice_param = reinterpret_cast<SliceParameter *>(parameter_);
  auto slice_primitive = reinterpret_cast<const mindspore::lite::Slice *>(OperatorCoder::primitive());
  std::vector<int> begin_values = slice_primitive->GetPostProcessBegin();
  std::vector<int> size_values = slice_primitive->GetPostProcessSize();
  std::vector<int> in_shape = input_tensor_->shape();
  NNaclFp32Serializer code;

  // one pass per dimension: shape, begin, end and size are all derived from the same three inputs
  for (int dim = 0; dim < slice_param->param_length_; ++dim) {
    const int dim_extent = in_shape.at(dim);
    const int dim_begin = begin_values.at(dim);
    const int requested = size_values.at(dim);
    // a negative requested size selects everything from begin up to the end of the dimension
    const int dim_size = requested < 0 ? dim_extent - dim_begin : requested;
    slice_param->shape_[dim] = dim_extent;
    slice_param->begin_[dim] = dim_begin;
    slice_param->end_[dim] = dim_begin + dim_size;
    slice_param->size_[dim] = dim_size;
  }

  code.CodeStruct("slice_parameter", *slice_param);

  // emit a call to PadSliceParameterTo4D when the parameter has fewer than DIMENSION_4D dimensions
  if (slice_param->param_length_ < DIMENSION_4D) {
    code.CodeFunction("PadSliceParameterTo4D", "&slice_parameter");
  }
  code.CodeFunction("DoSliceNoParallel", input_tensor_, output_tensor_, "&slice_parameter");
  context->AppendCode(code.str());
  return RET_OK;
 }

 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Slice, CPUOpCoderCreator<SliceFP32Coder>)
 }  // namespace mindspore::lite::micro::nnacl
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/slice_fp32_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/slice_fp32_coder.h
@@ -1,37 +0,0 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_MICRO_CODER_OPCODERS_NNACL_SLICE_FP32_CODER_H_
 #define MINDSPORE_LITE_MICRO_CODER_OPCODERS_NNACL_SLICE_FP32_CODER_H_

 #include <vector>
 #include "coder/opcoders/op_coder.h"

 namespace mindspore::lite::micro::nnacl {
 // Operator coder for the float32 slice op. It adds no state of its own; construction simply forwards all
 // arguments to OperatorCoder.
 class SliceFP32Coder final : public OperatorCoder {
 public:
  // Forwards the tensors, node, node index and target to the OperatorCoder base.
  SliceFP32Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
                 const Model::Node *node, size_t node_index, Target target)
      : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {}

  ~SliceFP32Coder() override = default;

  // Preparation hook; defined in slice_fp32_coder.cc.
  int Prepare(CoderContext *const context) override;

  // Code-generation hook; defined in slice_fp32_coder.cc.
  int DoCode(CoderContext *const context) override;
 };
 }  // namespace mindspore::lite::micro::nnacl
 #endif  // MINDSPORE_LITE_MICRO_CODER_OPCODERS_NNACL_SLICE_FP32_CODER_H_
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/softmax_fp32_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/softmax_fp32_coder.cc
@@ -20,7 +20,7 @@
 #include "schema/inner/ops_generated.h"
 #include "coder/opcoders/file_collector.h"

 using mindspore::schema::PrimitiveType_SoftMax;
 using mindspore::schema::PrimitiveType_Softmax;

 namespace mindspore::lite::micro::nnacl {

@@ -48,7 +48,7 @@ int SoftMaxFP32Coder::Prepare(CoderContext *const context) {
 }

 int SoftMaxFP32Coder::DoCode(CoderContext *const context) {
  Collect(context, {"nnacl/fp32/softmax.h"}, {"softmax.c"});
  Collect(context, {"nnacl/fp32/softmax_fp32.h"}, {"softmax_fp32.c", "exp_fp32.c"});
  NNaclFp32Serializer code;
  code.CodeStruct("softmax_parameter", *softmax_param_);
  code.CodeFunction("memset", sum_data_, "0", sum_data_size_);
@@ -58,6 +58,6 @@ int SoftMaxFP32Coder::DoCode(CoderContext *const context) {
  return RET_OK;
 }

 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_SoftMax, CPUOpCoderCreator<SoftMaxFP32Coder>)
 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Softmax, CPUOpCoderCreator<SoftMaxFP32Coder>)

 }  // namespace mindspore::lite::micro::nnacl
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/softmax_fp32_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/softmax_fp32_coder.h
@@ -17,7 +17,7 @@
 #define MINDSPORE_LITE_MICRO_CODER_SOFTMAX_CODER_H_

 #include <vector>
 #include "micro/coder/opcoders/base/softmax_base_coder.h"
 #include "coder/opcoders/base/softmax_base_coder.h"
 namespace mindspore::lite::micro::nnacl {

 class SoftMaxFP32Coder final : public SoftmaxBaseCoder {
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/splice_fp32_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/splice_fp32_coder.cc
@@ -0,0 +1,57 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "coder/opcoders/nnacl/fp32/splice_fp32_coder.h"
 #include <string>
 #include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
 #include "coder/opcoders/file_collector.h"
 #include "src/common/log_adapter.h"
 #include "nnacl/splice_parameter.h"
 using mindspore::schema::PrimitiveType_Splice;
 namespace mindspore::lite::micro::nnacl {
 // Generates the call to SpliceFp32 for a float32 splice node after checking that the input and output shapes
 // are consistent with each other and with the splice parameter's context_dim_.
 int SpliceFP32Coder::DoCode(CoderContext *const context) {
  auto param = reinterpret_cast<SpliceParameter *>(parameter_);
  // forward_indexes_ is reset to nullptr before the struct is serialized
  param->forward_indexes_ = nullptr;

  std::vector<int> in_shape = input_tensor_->shape();
  std::vector<int> out_shape = output_tensor_->shape();
  // both shapes must have exactly kInputSize2 dimensions, which also means their ranks are equal
  const bool rank_ok = in_shape.size() == kInputSize2 && out_shape.size() == kInputSize2;
  if (!rank_ok) {
    MS_LOG(ERROR) << "SpliceFP32Coder src_shape size not equal to dst_shape";
    return RET_ERROR;
  }

  int in_rows = in_shape.at(kInputIndex);
  int out_rows = out_shape.at(kInputIndex);
  int in_cols = in_shape.at(kBiasIndex);
  int out_cols = out_shape.at(kBiasIndex);
  if (in_rows != out_rows) {
    MS_LOG(ERROR) << "SpliceFP32Coder src_row not equal to dst_row";
    return RET_ERROR;
  }
  // the output column count must be the input column count scaled by context_dim_
  if (in_cols * param->context_dim_ != out_cols) {
    MS_LOG(ERROR) << "SpliceFP32Coder src_col not match to dst_col";
    return RET_ERROR;
  }

  Collect(context, {"nnacl/splice_parameter.h", "nnacl/fp32/splice_fp32.h"}, {"splice_fp32.c"});
  NNaclFp32Serializer code;
  code.CodeStruct("splice_parameter", *param);
  code.CodeFunction("SpliceFp32", input_tensor_, in_rows, in_cols, "&splice_parameter", output_tensor_, out_rows,
                    out_cols);
  context->AppendCode(code.str());
  MS_LOG(DEBUG) << "SpliceFP32Coder do_code ok";
  return RET_OK;
 }
 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Splice, CPUOpCoderCreator<SpliceFP32Coder>)
 }  // namespace mindspore::lite::micro::nnacl
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/splice_fp32_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/splice_fp32_coder.h
@@ -0,0 +1,35 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_MICRO_CODER_OPCODERS_NNACL_FP32_SPLICE_FP32_CODER_H_
 #define MINDSPORE_LITE_MICRO_CODER_OPCODERS_NNACL_FP32_SPLICE_FP32_CODER_H_
 #include <vector>
 #include "coder/opcoders/op_coder.h"
 namespace mindspore::lite::micro::nnacl {
 // Operator coder for the float32 splice op. It adds no state of its own; construction simply forwards all
 // arguments to OperatorCoder.
 class SpliceFP32Coder final : public OperatorCoder {
 public:
  // Forwards the tensors, node, node index and target to the OperatorCoder base.
  SpliceFP32Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
                  const Model::Node *node, size_t node_index, Target target)
      : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {}

  ~SpliceFP32Coder() override = default;

  // No preparation step: always reports success.
  int Prepare(CoderContext *const context) override { return RET_OK; }

  // Code-generation hook; defined in splice_fp32_coder.cc.
  int DoCode(CoderContext *const context) override;
 };
 }  // namespace mindspore::lite::micro::nnacl
 #endif  // MINDSPORE_LITE_MICRO_CODER_OPCODERS_NNACL_FP32_SPLICE_FP32_CODER_H_
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/tile_fp32_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/tile_fp32_coder.cc
@@ -20,7 +20,7 @@
 #include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
 #include "coder/opcoders/file_collector.h"

 using mindspore::schema::PrimitiveType_Tile;
 using mindspore::schema::PrimitiveType_TileFusion;

 namespace mindspore::lite::micro::nnacl {
 void TileFP32Coder::ComputeStrides(const int *shape, int *strides, int ndim) const {
@@ -63,6 +63,6 @@ int TileFP32Coder::DoCode(CoderContext *const context) {
  return RET_OK;
 }

 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Tile, CPUOpCoderCreator<TileFP32Coder>)
 REG_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_TileFusion, CPUOpCoderCreator<TileFP32Coder>)

 }  // namespace mindspore::lite::micro::nnacl
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/transpose_fp32_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/transpose_fp32_coder.cc
@@ -14,11 +14,11 @@
 * limitations under the License.
 */

 #include "micro/coder/opcoders/nnacl/fp32/transpose_fp32_coder.h"
 #include "coder/opcoders/nnacl/fp32/transpose_fp32_coder.h"
 #include <vector>
 #include <string>
 #include "micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
 #include "micro/coder/opcoders/file_collector.h"
 #include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
 #include "coder/opcoders/file_collector.h"

 using mindspore::schema::PrimitiveType_Transpose;
 namespace mindspore::lite::micro::nnacl {
@@ -83,8 +83,8 @@ int TransposeFp32Coder::DoCode(CoderContext *const context) {
  NNaclFp32Serializer code;
  code.CodeStruct("transpose_parameter", *transpose_parameter_);

  code.CodeFunction("DoTransposeFp32", input_tensor_, output_tensor_, in_shape_, out_shape_, "&transpose_parameter",
                    task_id, num_unit_thread, dim_size_, position_);
  code.CodeFunction("DoTransposeFp32", input_tensor_, output_tensor_, in_shape_, out_shape_,
                    "(TransposeParameter *)&transpose_parameter", task_id, num_unit_thread, dim_size_, position_);

  context->AppendCode(code.str());
  return RET_OK;
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/transpose_fp32_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/transpose_fp32_coder.h
@@ -39,14 +39,14 @@ class TransposeFp32Coder final : public OperatorCoder {

 private:
  TransposeParameter *transpose_parameter_ = nullptr;
  int thread_num_ = 1;
  int thread_h_stride_ = 0;
  int thread_h_num_ = 0;
  int num_unit_ = 0;
  int *in_shape_ = nullptr;
  int *out_shape_ = nullptr;
  int *dim_size_ = nullptr;
  int *position_ = nullptr;
  int thread_num_{1};
  int thread_h_stride_{0};
  int thread_h_num_{0};
  int num_unit_{0};
  int *in_shape_{nullptr};
  int *out_shape_{nullptr};
  int *dim_size_{nullptr};
  int *position_{nullptr};
 };

 }  // namespace mindspore::lite::micro::nnacl
--- a/mindspore/lite/micro/coder/opcoders/nnacl/int8/activation_int8_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/int8/activation_int8_coder.cc
@@ -0,0 +1,74 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "coder/opcoders/nnacl/int8/sigmoid_int8_coder.h"
 #include "coder/opcoders/nnacl/int8/relux_int8_coder.h"
 #include "src/ops/populate/populate_register.h"
 #include "nnacl/fp32/activation_fp32.h"
 #include "schema/model_generated.h"
 #include "src/common/version_manager.h"

 using mindspore::schema::PrimitiveType_Activation;

 namespace mindspore::lite::micro::nnacl {

 // Creator for int8 activation coders. It populates the node's parameter to read the activation type and
 // returns the matching coder (sigmoid, relu or relu6).
 // Returns nullptr when the primitive is missing, the parameter cannot be populated, the activation type is not
 // supported, or the coder itself cannot be created.
 std::unique_ptr<OperatorCoder> CPUActivationINT8CoderCreator(const std::vector<Tensor *> &in_tensors,
                                                             const std::vector<Tensor *> &out_tensors,
                                                             const Model::Node *node, size_t node_index,
                                                             Target target) {
  const void *primitive_c = node->primitive_;
  if (primitive_c == nullptr) {
    return nullptr;
  }
  const int schema_version = VersionManager::GetInstance()->GetSchemaVersion();
  // resolve the primitive type once; it is needed for the registry lookup and for the error message
  const auto primitive_type = GetPrimitiveType(primitive_c);
  ParameterGen parameter_gen = PopulateRegistry::GetInstance()->GetParameterCreator(primitive_type, schema_version);
  if (parameter_gen == nullptr) {
    MS_LOG(ERROR) << "parameter generator is nullptr";
    return nullptr;
  }
  OpParameter *parameter = parameter_gen(primitive_c);
  if (parameter == nullptr) {
    MS_LOG(ERROR) << "PopulateParameter return nullptr, type: "
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(primitive_type));
    return nullptr;
  }
  // NOTE(review): `parameter` is only read here and never released in this function; confirm who owns the
  // object returned by the parameter generator and whether it has to be freed once the type has been read.
  const auto type = reinterpret_cast<ActivationParameter *>(parameter)->type_;

  std::unique_ptr<OperatorCoder> coder;
  switch (static_cast<schema::ActivationType>(type)) {
    case schema::ActivationType_SIGMOID:
      coder = CPUOpCoderCreator<SigmodInt8Coder>(in_tensors, out_tensors, node, node_index, target);
      break;
    case schema::ActivationType_RELU:
      coder = CPUOpCoderCreator<ReluInt8Coder>(in_tensors, out_tensors, node, node_index, target);
      break;
    case schema::ActivationType_RELU6:
      coder = CPUOpCoderCreator<Relu6Int8Coder>(in_tensors, out_tensors, node, node_index, target);
      break;
    default:
      // report the real cause instead of falling through to the generic creation-failure message
      MS_LOG(ERROR) << "unsupported int8 activation type: " << type;
      return nullptr;
  }

  if (coder == nullptr) {
    MS_LOG(ERROR) << "create activation int8 coder failed";
    return nullptr;
  }
  return coder;
 }

 REG_OPERATOR_CODER(kAllTargets, kNumberTypeInt8, PrimitiveType_Activation, CPUActivationINT8CoderCreator)
 }  // namespace mindspore::lite::micro::nnacl
--- a/mindspore/lite/micro/coder/opcoders/nnacl/int8/add_int8_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/int8/add_int8_coder.cc
@@ -14,17 +14,18 @@
 * limitations under the License.
 */

 #include "micro/coder/opcoders/nnacl/int8/add_int8_coder.h"
 #include "coder/opcoders/nnacl/int8/add_int8_coder.h"
 #include <algorithm>
 #include <type_traits>
 #include "nnacl/int8/quantize.h"
 #include "micro/coder/log.h"
 #include "micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.h"
 #include "micro/coder/opcoders/file_collector.h"
 #include "coder/log.h"
 #include "coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.h"
 #include "coder/opcoders/file_collector.h"
 #include "coder/opcoders/parallel.h"

 using mindspore::schema::PrimitiveType_Add;
 using mindspore::schema::PrimitiveType_AddFusion;

 namespace mindspore::lite::micro {
 namespace mindspore::lite::micro::nnacl {

 int AddInt8Coder::Prepare(CoderContext *const context) {
  input0 = input_tensors().at(0);
@@ -38,26 +39,8 @@ int AddInt8Coder::Prepare(CoderContext *const context) {
  return RET_OK;
 }

 // Pre-change form of AddInt8Coder::DoCode as shown in this diff: serializes the quantization and arithmetic
 // parameters, builds an "AddArgs" struct and emits a ParallelLaunch call for either the broadcast or the plain
 // add run function.
 // NOTE(review): this version never passes code.str() to the context, unlike the later DoCode definition in
 // this diff, which calls context->AppendCode.
 int AddInt8Coder::DoCode(CoderContext *const context) {
  // collect the header and sources the generated code depends on
  Collect(context, {"wrapper/int8/conv1x1_init_int8.h"}, {"add_int8_wrapper.c", "add_int8.c", "thread_pool.c"});

  nnacl::NNaclInt8Serializer code;

  code.CodeStruct("para", para_);
  code.CodeStruct("arith_para", *arith_para_);
  code.CodeBaseStruct("AddArgs", "args", "para", "arith_para", in_size_, out_size_, thread_num_s_, elements_num_,
                      support_opt_add_, input0, input1, output_tensor_);

  // choose the run function according to whether the arithmetic parameter requests broadcasting
  if (arith_para_->broadcasting_) {
    code.CodeFunction("ParallelLaunch", "THREAD_POOL_DEFAULT", "AddBroadcastRun", "&args", thread_num_s_);
  } else {
    code.CodeFunction("ParallelLaunch", "THREAD_POOL_DEFAULT", "AddRun", "&args", thread_num_s_);
  }

  return RET_OK;
 }

 int AddInt8Coder::Init() {
  arith_para_ = reinterpret_cast<ArithmeticParameter *>(parameter_);
  para_.in0_args_.zp_ = input0->quant_params().front().zeroPoint * -1;
  para_.in1_args_.zp_ = input1->quant_params().front().zeroPoint * -1;
  para_.out_zp_ = output_tensor_->quant_params().front().zeroPoint;
@@ -152,5 +135,32 @@ int AddInt8Coder::ReSize() {
  return RET_OK;
 }

 REG_OPERATOR_CODER(kAllTargets, kNumberTypeInt8, PrimitiveType_Add, CPUOpCoderCreator<AddInt8Coder>)
 }  // namespace mindspore::lite::micro
 // Emits the int8 add kernel call: serializes the quantization and arithmetic parameters, packs them into an
 // AddInt8Args struct, and then either launches the run function through the parallel launcher or calls it
 // directly with the default task id. The generated code is appended to the context.
 int AddInt8Coder::DoCode(CoderContext *const context) {
  Collect(context, {"wrapper/int8/add_int8_wrapper.h"},
          {"add_int8_wrapper.c", "add_int8.c", "arithmetic_base.c", "arithmetic_int8.c", "thread_pool.c"});

  nnacl::NNaclInt8Serializer serializer;
  serializer.CodeStruct("para", para_);
  serializer.CodeStruct("arith_para", *arith_para_);
  serializer.CodeBaseStruct("AddInt8Args", kRunArgs, "&para", "&arith_para", in_size_, out_size_, gThreadNum,
                            elements_num_, support_opt_add_, input0, input1, output_tensor_);

  const bool is_broadcast = arith_para_->broadcasting_;
  if (!support_parallel_) {
    // no parallel support: call the run function directly
    if (is_broadcast) {
      serializer.CodeFunction("AddBroadcastInt8Run", kRunArgsAddr, kDefaultTaskId);
    } else {
      serializer.CodeFunction("AddInt8Run", kRunArgsAddr, kDefaultTaskId);
    }
  } else if (is_broadcast) {
    serializer.CodeFunction(kParallelLaunch, gThreadPool, "AddBroadcastInt8Run", kRunArgsAddr, gThreadNum);
  } else {
    serializer.CodeFunction(kParallelLaunch, gThreadPool, "AddInt8Run", kRunArgsAddr, gThreadNum);
  }

  context->AppendCode(serializer.str());
  return RET_OK;
 }

 REG_OPERATOR_CODER(kAllTargets, kNumberTypeInt8, PrimitiveType_AddFusion, CPUOpCoderCreator<AddInt8Coder>)
 }  // namespace mindspore::lite::micro::nnacl
--- a/mindspore/lite/micro/coder/opcoders/nnacl/int8/add_int8_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/int8/add_int8_coder.h
@@ -18,17 +18,15 @@
 #define MINDSPORE_LITE_MICRO_CODER_OPCODERS_NNACL_ADD_INT8_CODER_H_

 #include <vector>
 #include "micro/coder/opcoders/op_coder.h"
 #include "coder/opcoders/op_coder.h"
 #include "nnacl/int8/add_int8.h"

 namespace mindspore::lite::micro {
 class AddInt8Coder : public OperatorCoder {
 namespace mindspore::lite::micro::nnacl {
 class AddInt8Coder final : public OperatorCoder {
 public:
  AddInt8Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
               const Model::Node *node, size_t node_index, Target target)
      : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {
    arith_para_ = reinterpret_cast<ArithmeticParameter *>(parameter_);
  }
      : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {}

  ~AddInt8Coder() override = default;

@@ -49,5 +47,5 @@ class AddInt8Coder : public OperatorCoder {
  int elements_num_{0};
  bool support_opt_add_{false};
 };
 }  // namespace mindspore::lite::micro
 }  // namespace mindspore::lite::micro::nnacl
 #endif  // MINDSPORE_LITE_MICRO_CODER_OPCODERS_NNACL_ADD_INT8_CODER_H_
--- a/mindspore/lite/micro/coder/opcoders/nnacl/int8/batchnorm_int8_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/int8/batchnorm_int8_coder.cc
@@ -0,0 +1,162 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "coder/opcoders/nnacl/int8/batchnorm_int8_coder.h"
 #include <string>
 #include "coder/log.h"
 #include "coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.h"
 #include "coder/opcoders/file_collector.h"
 #include "coder/opcoders/parallel.h"

 using mindspore::schema::PrimitiveType_BatchNorm;

 namespace mindspore::lite::micro::nnacl {

 // Derives the BatchNorm geometry from the input shape and precomputes the per-channel constants.
 //
 // The last input dimension becomes channel_, the product of all leading dimensions becomes units_.
 // thread_num_ is clamped to channel_, unit_ is chosen per target, and finally the alpha/beta tables
 // are built by InitFusedConstTensor() or InitConstTensor() depending on batchnorm_param_->fused_.
 //
 // Returns RET_OK on success; the MS_CHECK_* macros return early on any failed validation or when
 // a table initializer fails.
 int BatchNormInt8Coder::Prepare(CoderContext *const context) {
  // Re-derive the typed view of parameter_ before it is dereferenced.
  // NOTE(review): presumably parameter_ can be attached after construction, in which case the
  // pointer cached by the constructor would still be null here — confirm against the coder builder.
  batchnorm_param_ = reinterpret_cast<BatchNormParameter *>(parameter_);
  MS_CHECK_PTR(batchnorm_param_);

  const std::vector<int> input_shapes = input_tensor_->shape();
  // An empty shape would make `size() - 1` wrap around and index far out of bounds.
  MS_CHECK_TRUE(!input_shapes.empty(), "BatchNormInt8 input shape is empty");
  const size_t last_dim = input_shapes.size() - 1;

  batchnorm_param_->channel_ = input_shapes[last_dim];
  batchnorm_param_->units_ = 1;
  for (size_t i = 0; i < last_dim; i++) {
    batchnorm_param_->units_ *= input_shapes[i];
  }
  batchnorm_param_->op_parameter_.thread_num_ =
    MSMIN(batchnorm_param_->op_parameter_.thread_num_, batchnorm_param_->channel_);
  // NOTE(review): for targets other than kARM32M unit_ is only a 1/kMaxThreadNumSupported slice of
  // units_; verify that the emitted call site covers every slice.
  if (target_ == kARM32M) {
    batchnorm_param_->unit_ = batchnorm_param_->units_;
  } else {
    batchnorm_param_->unit_ = UP_DIV(batchnorm_param_->units_, kMaxThreadNumSupported);
  }
  if (batchnorm_param_->fused_) {
    MS_CHECK_RET_CODE(InitFusedConstTensor(), "InitFusedConstTensor failed");
  } else {
    MS_CHECK_RET_CODE(InitConstTensor(), "InitConstTensor failed");
  }

  return RET_OK;
 }
 // Emits the code fragment for this operator: a serialized "param" struct followed by a call to
 // BatchNormInt8 with the output/input tensors, the alpha/beta tables and kDefaultTaskId.
 // Also records the header and C source the generated code depends on. Always returns RET_OK.
 int BatchNormInt8Coder::DoCode(CoderContext *context) {
  NNaclInt8Serializer serializer;
  serializer.CodeStruct("param", *batchnorm_param_);
  // NOTE(review): the call is emitted for kDefaultTaskId only — confirm this matches the unit_
  // partitioning chosen in Prepare().
  serializer.CodeFunction("BatchNormInt8", output_tensor_, input_tensor_, alpha_addr_, beta_addr_, kDefaultTaskId,
                          "&param");

  // NOTE(review): the collected header is nnacl/slice_parameter.h although the emitted call is
  // BatchNormInt8 — confirm it provides the declarations the generated code needs.
  Collect(context, {"nnacl/slice_parameter.h"}, {"batchnorm_int8.c"});
  context->AppendCode(serializer.str());

  return RET_OK;
 }

 // Builds the per-channel alpha/beta float tables for the non-fused BatchNorm.
 //
 // Inputs are read as: input_tensor_ (data), input_tensors_[1] (mean), input_tensors_[2] (variance).
 // With the first quant param (scale s, zero point zp) of each tensor, for every channel i:
 //   tmp      = s_out * sqrt(eps + s_var * (var[i] - zp_var))
 //   alpha[i] = s_in / tmp
 //   beta[i]  = zp_out - alpha[i] * zp_in - s_mean * (mean[i] - zp_mean) / tmp
 //
 // Returns RET_OK on success; the MS_CHECK_* macros return early on a failed validation.
 int BatchNormInt8Coder::InitConstTensor() {
  // NOTE(review): index 2 is accessed below, so kInputSize2 is assumed to be at least 3 — confirm.
  MS_CHECK_TRUE(input_tensors_.size() >= kInputSize2, "input tensors number not match");
  Tensor *input = input_tensor_;
  Tensor *mean = input_tensors_.at(1);
  Tensor *variance = input_tensors_.at(2);
  Tensor *output = output_tensor_;

  auto mean_ptr = reinterpret_cast<int8_t *>(mean->MutableData());
  auto var_ptr = reinterpret_cast<int8_t *>(variance->MutableData());

  MS_CHECK_PTR(mean_ptr);
  MS_CHECK_PTR(var_ptr);

  // The loop below touches channel_ entries of mean/variance and of the alpha/beta buffers, which
  // are sized from these tensors; reject shorter tensors instead of running past the buffers.
  const int channel = batchnorm_param_->channel_;
  MS_CHECK_TRUE(mean->ElementsNum() >= channel, "mean element number is less than channel");
  MS_CHECK_TRUE(variance->ElementsNum() >= channel, "variance element number is less than channel");

  alpha_addr_ = reinterpret_cast<float *>(
    allocator_->Malloc(kNumberTypeFloat, mean->ElementsNum() * sizeof(float), kOfflinePackWeight));
  MS_CHECK_PTR(alpha_addr_);
  beta_addr_ = reinterpret_cast<float *>(
    allocator_->Malloc(kNumberTypeFloat, variance->ElementsNum() * sizeof(float), kOfflinePackWeight));
  MS_CHECK_PTR(beta_addr_);
  // compute alpha, beta;
  auto eps = batchnorm_param_->epsilon_;
  int32_t zp_in = input->quant_params().at(0).zeroPoint;
  int32_t zp_mean = mean->quant_params().at(0).zeroPoint;
  int32_t zp_var = variance->quant_params().at(0).zeroPoint;
  int32_t zp_out = output->quant_params().at(0).zeroPoint;
  auto s_in = static_cast<float>(input->quant_params().at(0).scale);
  auto s_mean = static_cast<float>(mean->quant_params().at(0).scale);
  auto s_var = static_cast<float>(variance->quant_params().at(0).scale);
  auto s_out = static_cast<float>(output->quant_params().at(0).scale);

  for (int i = 0; i < channel; ++i) {
    float tmp = s_out * sqrt(eps + s_var * (var_ptr[i] - zp_var));
    float tmp_a = s_in / tmp;
    float tmp_b = zp_out - tmp_a * zp_in - (s_mean * (mean_ptr[i] - zp_mean)) / tmp;
    alpha_addr_[i] = tmp_a;
    beta_addr_[i] = tmp_b;
  }

  return RET_OK;
 }

 // Builds the per-channel alpha/beta float tables for the fused BatchNorm.
 //
 // Inputs are read as: [0] data, [1] scale, [2] offset, [3] mean, [4] variance.
 // With the first quant param (scale s, zero point zp) of each tensor, for every channel i:
 //   tmp      = s_out * sqrt(eps + s_var * (var[i] - zp_var))
 //   alpha[i] = s_in * s_scale * (scale[i] - zp_scale) / tmp
 //   beta[i]  = zp_out + (s_offset / s_out) * (offset[i] - zp_offset) - alpha[i] * zp_in
 //              - s_scale * s_mean * (scale[i] - zp_scale) * (mean[i] - zp_mean) / tmp
 //
 // Returns RET_OK on success; the MS_CHECK_* macros return early on a failed validation.
 int BatchNormInt8Coder::InitFusedConstTensor() {
  // data, scale, offset, mean, variance
  constexpr size_t kFusedInputNum = 5;
  MS_CHECK_TRUE(input_tensors_.size() >= kFusedInputNum, "input tensors number not match");
  Tensor *input = input_tensors_.at(0);
  Tensor *scale = input_tensors_.at(1);
  Tensor *offset = input_tensors_.at(2);
  Tensor *mean = input_tensors_.at(3);
  Tensor *variance = input_tensors_.at(4);
  Tensor *output = output_tensor_;

  auto scale_ptr = reinterpret_cast<int8_t *>(scale->MutableData());
  auto offset_ptr = reinterpret_cast<int8_t *>(offset->MutableData());
  auto mean_ptr = reinterpret_cast<int8_t *>(mean->MutableData());
  auto var_ptr = reinterpret_cast<int8_t *>(variance->MutableData());

  MS_CHECK_PTR(scale_ptr);
  MS_CHECK_PTR(offset_ptr);
  MS_CHECK_PTR(mean_ptr);
  MS_CHECK_PTR(var_ptr);

  // The loop below touches channel_ entries of every constant tensor and of the alpha/beta buffers
  // (sized from mean/variance); reject shorter tensors instead of running past the buffers.
  const int channel = batchnorm_param_->channel_;
  MS_CHECK_TRUE(scale->ElementsNum() >= channel, "scale element number is less than channel");
  MS_CHECK_TRUE(offset->ElementsNum() >= channel, "offset element number is less than channel");
  MS_CHECK_TRUE(mean->ElementsNum() >= channel, "mean element number is less than channel");
  MS_CHECK_TRUE(variance->ElementsNum() >= channel, "variance element number is less than channel");

  alpha_addr_ = reinterpret_cast<float *>(
    allocator_->Malloc(kNumberTypeFloat, mean->ElementsNum() * sizeof(float), kOfflinePackWeight));
  MS_CHECK_PTR(alpha_addr_);
  beta_addr_ = reinterpret_cast<float *>(
    allocator_->Malloc(kNumberTypeFloat, variance->ElementsNum() * sizeof(float), kOfflinePackWeight));
  MS_CHECK_PTR(beta_addr_);
  // compute alpha, beta;
  float eps = batchnorm_param_->epsilon_;
  int32_t zp_in = input->quant_params().at(0).zeroPoint;
  int32_t zp_scale = scale->quant_params().at(0).zeroPoint;
  int32_t zp_offset = offset->quant_params().at(0).zeroPoint;
  int32_t zp_mean = mean->quant_params().at(0).zeroPoint;
  int32_t zp_var = variance->quant_params().at(0).zeroPoint;
  int32_t zp_out = output->quant_params().at(0).zeroPoint;
  auto s_in = static_cast<float>(input->quant_params().at(0).scale);
  auto s_scale = static_cast<float>(scale->quant_params().at(0).scale);
  auto s_offset = static_cast<float>(offset->quant_params().at(0).scale);
  auto s_mean = static_cast<float>(mean->quant_params().at(0).scale);
  auto s_var = static_cast<float>(variance->quant_params().at(0).scale);
  auto s_out = static_cast<float>(output->quant_params().at(0).scale);

  // Loop-invariant scale products, hoisted out of the per-channel loop.
  float mul_12 = s_in * s_scale;
  float mul_24 = s_scale * s_mean;
  float div_36 = s_offset / s_out;
  for (int i = 0; i < channel; ++i) {
    float tmp = s_out * sqrt(eps + s_var * (var_ptr[i] - zp_var));
    float tmp_a = (mul_12 * (scale_ptr[i] - zp_scale)) / tmp;
    float tmp_b = zp_out + div_36 * (offset_ptr[i] - zp_offset) - tmp_a * zp_in -
                  (mul_24 * (scale_ptr[i] - zp_scale) * (mean_ptr[i] - zp_mean)) / tmp;
    alpha_addr_[i] = tmp_a;
    beta_addr_[i] = tmp_b;
  }

  return RET_OK;
 }

 // Registers CPUOpCoderCreator<BatchNormInt8Coder> for PrimitiveType_BatchNorm with kNumberTypeInt8
 // data under kAllTargets.
 REG_OPERATOR_CODER(kAllTargets, kNumberTypeInt8, PrimitiveType_BatchNorm, CPUOpCoderCreator<BatchNormInt8Coder>)
 }  // namespace mindspore::lite::micro::nnacl
--- a/mindspore/lite/micro/coder/opcoders/nnacl/int8/batchnorm_int8_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/int8/batchnorm_int8_coder.h
@@ -0,0 +1,49 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_MICRO_CODER_OPCODERS_NNACL_BATCHNORM_INT8_CODER_H_
 #define MINDSPORE_LITE_MICRO_CODER_OPCODERS_NNACL_BATCHNORM_INT8_CODER_H_

 #include <cstring>
 #include <vector>
 #include "coder/opcoders/op_coder.h"
 #include "nnacl/batchnorm_parameter.h"

 namespace mindspore::lite::micro::nnacl {
 // Operator coder that generates the call to the int8 BatchNorm kernel.
 // Prepare() fills the BatchNormParameter and the alpha/beta tables; DoCode() emits the call.
 class BatchNormInt8Coder final : public OperatorCoder {
 public:
  // Forwards all arguments to OperatorCoder and caches parameter_ as a BatchNormParameter pointer.
  BatchNormInt8Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
                     const Model::Node *node, size_t node_index, Target target)
      : OperatorCoder(in_tensors, out_tensors, node, node_index, target),
        batchnorm_param_(reinterpret_cast<BatchNormParameter *>(parameter_)) {}

  ~BatchNormInt8Coder() override = default;

  // Computes the parameter geometry and the per-channel constant tables.
  int Prepare(CoderContext *const context) override;

  // Appends the generated code for this operator to the context.
  int DoCode(CoderContext *context) override;

 private:
  // Builds alpha/beta for the non-fused variant.
  int InitConstTensor();
  // Builds alpha/beta for the fused variant.
  int InitFusedConstTensor();

  // Per-channel float tables obtained from allocator_ by the Init*ConstTensor() helpers.
  float *alpha_addr_{nullptr};
  float *beta_addr_{nullptr};
  // Typed view of the base-class parameter_; assigned in the constructor.
  BatchNormParameter *batchnorm_param_{nullptr};
 };
 }  // namespace mindspore::lite::micro::nnacl
 #endif  // MINDSPORE_LITE_MICRO_CODER_OPCODERS_NNACL_BATCHNORM_INT8_CODER_H_
--- a/mindspore/lite/micro/coder/opcoders/nnacl/int8/concat_int8_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/int8/concat_int8_coder.cc
@@ -21,6 +21,7 @@
 #include "nnacl/int8/quantize.h"
 #include "coder/opcoders/file_collector.h"
 #include "coder/log.h"
 #include "coder/opcoders/parallel.h"
 #include "coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.h"

 int MallocQuantArgForConcat(ConcatQuantArg *quant_arg, size_t input_num) {
@@ -37,7 +38,6 @@ int ConcatInt8Coder::Prepare(CoderContext *const context) {

  concat_param_->input_shapes_ = nullptr;
  size_t input_num = input_tensors().size();
  MS_CHECK_PTR(input_data_);
  MS_CHECK_RET_CODE(MallocQuantArgForConcat(&concat_param_->quant_arg_, input_num),
                    "Null pointer reference: quant_concat_parm_->in_quant_args_.");
  for (int i = 0; i < static_cast<int>(input_num); i++) {
@@ -60,7 +60,10 @@ int ConcatInt8Coder::Prepare(CoderContext *const context) {
  concat_param_->input_shapes_ = reinterpret_cast<int **>(malloc(sizeof(int *) * input_num));
  MS_CHECK_PTR(concat_param_->input_shapes_);
  for (int i = 0; i < static_cast<int>(input_num); i++) {
    concat_param_->input_shapes_[i] = reinterpret_cast<int *>(input_tensors().at(i)->shape().data());
    auto in_shape = input_tensors_.at(i)->shape();
    concat_param_->input_shapes_[i] = reinterpret_cast<int *>(malloc(in_shape.size() * sizeof(int)));
    MS_CHECK_PTR(concat_param_->input_shapes_[i]);
    memcpy(reinterpret_cast<void *>(concat_param_->input_shapes_[i]), in_shape.data(), sizeof(int) * in_shape.size());
  }

  before_axis_size = 1;
@@ -70,7 +73,10 @@ int ConcatInt8Coder::Prepare(CoderContext *const context) {

  int64_t after_axis_size = 1;
  int output_dim = static_cast<int>(output_tensor_->shape().size());
  concat_param_->output_shapes_ = output_tensor_->shape().data();
  concat_param_->output_shapes_ = reinterpret_cast<int *>(malloc(output_dim * sizeof(int)));
  MS_CHECK_PTR(concat_param_->output_shapes_);
  memcpy(reinterpret_cast<void *>(concat_param_->output_shapes_), output_tensor_->shape().data(),
         sizeof(int) * output_dim);
  for (int i = axis_ + 1; i < output_dim; i++) {
    after_axis_size *= concat_param_->output_shapes_[i];
  }
@@ -84,7 +90,8 @@ int ConcatInt8Coder::DoCode(CoderContext *const context) {
  count_unit_ = thread_num_ > 1 ? UP_DIV(before_axis_size, thread_num_) : before_axis_size;
  concat_param_->count_unit_ = count_unit_;

  Collect(context, {"nnacl/int8/concat_int8.h"}, {"concat_int8.c"});
  Collect(context, {"nnacl/int8/concat_int8.h", "wrapper/int8/concat_int8_wrapper.h"},
          {"concat_int8.c", "concat_int8_wrapper.c"});
  NNaclInt8Serializer code;

  int in_tensor_count = input_tensors().size();
@@ -96,15 +103,12 @@ int ConcatInt8Coder::DoCode(CoderContext *const context) {
  }
  code.CodeStruct("concat_param", *concat_param_, in_tensor_count, input_tensor_->shape().size(),
                  output_tensor_->shape().size());

  if (thread_num_ > 1) {
    code.CodeBaseStruct("ConcatInt8Args", "args", "input_data", output_tensor_, "&concat_param", axis_,
                        before_axis_size, count_unit_);
    code.CodeFunction("ParallelLaunch", "THREAD_POOL_DEFAULT", "ConcatInt8Run", "&args", "thread_num");
  code.CodeBaseStruct("ConcatInt8Args", kRunArgs, "input_data", output_tensor_, "&concat_param", axis_,
                      before_axis_size, count_unit_);
  if (support_parallel_) {
    code.CodeFunction(kParallelLaunch, gThreadPool, "ConcatInt8Run", kRunArgsAddr, gThreadNum);
  } else {
    int task_id = 0;
    int64_t real_dst_count = MSMIN(before_axis_size - task_id * count_unit_, count_unit_);
    code.CodeFunction("Int8Concat", "input_data", output_tensor_, "&concat_param", axis_, real_dst_count, task_id);
    code.CodeFunction("ConcatInt8Run", kRunArgsAddr, kDefaultTaskId);
  }
  context->AppendCode(code.str());
  return RET_OK;