sync code of micro to master

5 years ago · 4faf97f6bd
--- a/cmake/package_lite.cmake
+++ b/cmake/package_lite.cmake
@@ -136,6 +136,8 @@ if(PLATFORM_ARM64)
            COMPONENT ${RUNTIME_COMPONENT_NAME})
    install(DIRECTORY ${TOP_DIR}/include/api/ DESTINATION ${RUNTIME_INC_DIR}/api
            COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "ascend* ops*" EXCLUDE)
    install(DIRECTORY ${TOP_DIR}/mindspore/lite/build/operator_library DESTINATION ${CODEGEN_PKG_NAME}
            COMPONENT ${CODEGEN_COMPONENT_NAME})
    if(ENABLE_TOOLS)
        install(TARGETS benchmark RUNTIME DESTINATION ${RUNTIME_PKG_NAME}/benchmark COMPONENT ${RUNTIME_COMPONENT_NAME})
    endif()
@@ -157,6 +159,8 @@ elseif(PLATFORM_ARM32)
            COMPONENT ${RUNTIME_COMPONENT_NAME})
    install(DIRECTORY ${TOP_DIR}/include/api/ DESTINATION ${RUNTIME_INC_DIR}/api
            COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "ascend*" EXCLUDE)
    install(DIRECTORY ${TOP_DIR}/mindspore/lite/build/operator_library DESTINATION ${CODEGEN_PKG_NAME}
            COMPONENT ${CODEGEN_COMPONENT_NAME})
    if(ENABLE_TOOLS)
        install(TARGETS benchmark RUNTIME DESTINATION ${RUNTIME_PKG_NAME}/benchmark COMPONENT ${RUNTIME_COMPONENT_NAME})
    endif()
@@ -231,6 +235,8 @@ else()
        install(FILES ${glog_LIBPATH}/libglog.so.0.4.0
                DESTINATION ${CONVERTER_PKG_NAME}/third_party/glog/lib RENAME libglog.so.0
                COMPONENT ${CONVERTER_COMPONENT_NAME})
        install(DIRECTORY ${TOP_DIR}/mindspore/lite/build/operator_library DESTINATION ${CODEGEN_PKG_NAME}
                COMPONENT ${CODEGEN_COMPONENT_NAME})
        install(TARGETS codegen RUNTIME DESTINATION ${CODEGEN_PKG_NAME}/
                COMPONENT ${CODEGEN_COMPONENT_NAME})
    endif()
@@ -249,7 +255,7 @@ else()
 endif()
 set(CPACK_ARCHIVE_COMPONENT_INSTALL ON)
 if(PLATFORM_ARM64 OR PLATFORM_ARM32)
    set(CPACK_COMPONENTS_ALL ${RUNTIME_COMPONENT_NAME})
    set(CPACK_COMPONENTS_ALL ${RUNTIME_COMPONENT_NAME} ${CODEGEN_COMPONENT_NAME})
 else()
    set(CPACK_COMPONENTS_ALL ${RUNTIME_COMPONENT_NAME} ${CONVERTER_COMPONENT_NAME} ${CODEGEN_COMPONENT_NAME})
 endif()
--- a/mindspore/core/utils/log_adapter.h
+++ b/mindspore/core/utils/log_adapter.h
@@ -34,7 +34,7 @@
 #define LOG_HDR_FILE_REL_PATH "mindspore/core/utils/log_adapter.h"

 // Get start index of file relative path in __FILE__
 static constexpr int GetRelPathPos() noexcept {
 static constexpr size_t GetRelPathPos() noexcept {
  return sizeof(__FILE__) > sizeof(LOG_HDR_FILE_REL_PATH) ? sizeof(__FILE__) - sizeof(LOG_HDR_FILE_REL_PATH) : 0;
 }

--- a/mindspore/lite/CMakeLists.txt
+++ b/mindspore/lite/CMakeLists.txt
@@ -89,8 +89,10 @@ if(SUPPORT_TRAIN)
 else()
    if(PLATFORM_ARM64)
        set(RUNTIME_COMPONENT_NAME inference-android-aarch64)
        set(CODEGEN_COMPONENT_NAME codegen-android-aarch64)
    elseif(PLATFORM_ARM32)
        set(RUNTIME_COMPONENT_NAME inference-android-aarch32)
        set(CODEGEN_COMPONENT_NAME codegen-android-aarch32)
    elseif(WIN32)
        if("${X86_64_SIMD}" STREQUAL "off")
            set(RUNTIME_COMPONENT_NAME inference-win-x64)
@@ -218,7 +220,6 @@ if(ENABLE_CONVERTER)
    include(${TOP_DIR}/cmake/external_libs/eigen.cmake)
    include(${TOP_DIR}/cmake/external_libs/protobuf.cmake)
    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/tools/converter)
    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/micro/coder)
 endif()

 if(ENABLE_MINDRT)
@@ -272,6 +273,7 @@ endif()

 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/src)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/nnacl)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/micro/coder)
 if(ENABLE_TOOLS)
    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/tools/benchmark)
    if(SUPPORT_TRAIN)
--- a/mindspore/lite/micro/cmake/file_list.cmake
+++ b/mindspore/lite/micro/cmake/file_list.cmake
@@ -301,6 +301,30 @@ set(LITE_KERNEL_SRC
        ${LITE_DIR}/nnacl/infer/splice_infer.c
        )

 #### sse
 # SSE build: gather the nnacl SSE intrinsic sources in SSE_SRC and tag every
 # one of them as a C source file.
 if("${X86_64_SIMD}" STREQUAL "sse")
    set(SSE_SRC
        ${LITE_DIR}/nnacl/intrinsics/sse/sse_common.c
        ${LITE_DIR}/nnacl/intrinsics/sse/PackNHWCToNCHWFp32.c
        ${LITE_DIR}/nnacl/intrinsics/sse/MatMul_Sse.c)
    set_source_files_properties(${SSE_SRC} PROPERTIES LANGUAGE C)
 endif()

 #### avx
 # AVX build: extend the C and C++ compiler flags with the SSE4.1/AVX/AVX2
 # switches, then gather the AVX utility source, the SSE intrinsic sources and
 # the AVX matmul assembly file in AVX_SRC and tag all of them as LANGUAGE C.
 if("${X86_64_SIMD}" STREQUAL "avx")
    string(APPEND CMAKE_CXX_FLAGS " -msse4.1 -mavx -mavx2")
    string(APPEND CMAKE_C_FLAGS " -msse4.1 -mavx -mavx2")
    set(AVX_SRC
        ${LITE_DIR}/nnacl/intrinsics/avx/common_utils.c
        ${LITE_DIR}/nnacl/intrinsics/sse/sse_common.c
        ${LITE_DIR}/nnacl/intrinsics/sse/MatMul_Sse.c
        ${LITE_DIR}/nnacl/intrinsics/sse/PackNHWCToNCHWFp32.c
        ${LITE_DIR}/nnacl/assembly/avx/MatmulAvx.S)
    set_source_files_properties(${AVX_SRC} PROPERTIES LANGUAGE C)
 endif()

 list(APPEND FILE_SET ${CODER_SRC} ${CODER_OPCODERS_SRC} ${CODER_GENERATOR_SRC}
        ${CODER_ALLOCATOR_SRC} ${LITE_SRC} ${LITE_KERNEL_SRC} ${MINDSPORE_CORE})
        ${CODER_ALLOCATOR_SRC} ${LITE_SRC} ${LITE_KERNEL_SRC} ${MINDSPORE_CORE} ${SSE_SRC} ${AVX_SRC})

--- a/mindspore/lite/micro/coder/CMakeLists.txt
+++ b/mindspore/lite/micro/coder/CMakeLists.txt
@@ -25,10 +25,12 @@ include(${MICRO_DIR}/cmake/file_list.cmake)
 include(${MICRO_DIR}/cmake/package_wrapper.cmake)
 add_subdirectory(operator_library)

 add_executable(codegen main.cc ${FILE_SET})
 add_dependencies(codegen fbs_src)
 add_dependencies(codegen fbs_inner_src)
 target_link_libraries(codegen PRIVATE ${SECUREC_LIBRARY} mindspore::glog)
 if(NOT WIN32 AND "${CMAKE_BUILD_TYPE}" STREQUAL "Release")
    add_custom_command(TARGET codegen POST_BUILD COMMAND strip ${CODEGEN_PATH})
 if(NOT PLATFORM_ARM32 AND NOT PLATFORM_ARM64)
    add_executable(codegen main.cc ${FILE_SET})
    add_dependencies(codegen fbs_src)
    add_dependencies(codegen fbs_inner_src)
    target_link_libraries(codegen PRIVATE ${SECUREC_LIBRARY} mindspore::glog)
    if(NOT WIN32 AND "${CMAKE_BUILD_TYPE}" STREQUAL "Release")
        add_custom_command(TARGET codegen POST_BUILD COMMAND strip ${CODEGEN_PATH})
    endif()
 endif()
--- a/mindspore/lite/micro/coder/allocator/allocator.h
+++ b/mindspore/lite/micro/coder/allocator/allocator.h
@@ -92,19 +92,17 @@ class MemoryAllocator {
   * including tensor, workspace
   */
  template <typename T>
  std::string GetRuntimeAddr(T t, bool is_const = false) {
  std::string GetRuntimeAddr(T t, bool immutable = false) {
    if (!t) {
      return "";
    }
    std::string type_info = is_const ? "const " : "";
    std::string type_name;
    if (std::type_index(typeid(T)) == std::type_index(typeid(Tensor *))) {
      type_name = GetTensorDataType(reinterpret_cast<Tensor *>(t)->data_type()) + "*";
    } else {
      type_name = GetVariableTypeName<T>();
    }
    type_info = wrap(type_info + type_name);

    std::string type_info = wrap(type_name);
    void *variable = reinterpret_cast<void *>(t);
    auto item = inputs_addr_.find(variable);
    if (item != inputs_addr_.end()) {
@@ -133,6 +131,9 @@ class MemoryAllocator {
                        [&variable](const std::pair<Tensor *, std::string> &a) { return variable == a.first; });
    if (iter != origin_weights_addr_.end()) {
      saved_weights_addr_.insert(std::make_pair(iter->second, reinterpret_cast<Tensor *>(variable)));
      if (immutable) {
        malloc_weights_addr_.insert({reinterpret_cast<Tensor *>(variable), iter->second});
      }
      return iter->second;
    }
    MS_LOG(ERROR) << "uninitialized memory";
--- a/mindspore/lite/micro/coder/generator/component/benchmark_component.cc
+++ b/mindspore/lite/micro/coder/generator/component/benchmark_component.cc
@@ -134,7 +134,7 @@ void CodeBenchmarkInference(std::ofstream &ofs, const std::string &module_name)
      << "    uint64_t timeAvg = 0;\n"
      << "    int loop_count = atoi(argv[3]);\n"
      << "    printf(\"======Inference Start======\\n\");\n"
      << "    printf(\"cycles: %d\", loop_count);\n"
      << "    printf(\"cycles: %d\\n\", loop_count);\n"
      << "    for (int i = 0; i < loop_count; i++) {\n"
      << "      uint64_t runBegin = GetTimeUs();\n"
      << "      " << module_name << "_Inference();\n"
--- a/mindspore/lite/micro/coder/generator/component/cmake_component.cc
+++ b/mindspore/lite/micro/coder/generator/component/cmake_component.cc
@@ -48,7 +48,7 @@ void CodeCMakeNetLibrary(std::ofstream &ofs, const std::string &module_name, con
  }

  ofs << "file(GLOB NET_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.c)\n"
      << "add_library(${PROJ_NAME} STATIC ${NET_SRC})\n";
      << "add_library(net STATIC ${NET_SRC})\n";
 }

 }  // namespace mindspore::lite::micro
--- a/mindspore/lite/micro/coder/generator/component/const_blocks/cmake_lists.h
+++ b/mindspore/lite/micro/coder/generator/component/const_blocks/cmake_lists.h
@@ -19,9 +19,8 @@

 const char *bench_cmake_lists_txt =
  "cmake_minimum_required(VERSION 3.14)\n"
  "project(${PROJ_NAME})\n"
  "project(benchmark)\n"
  "\n"
  "message(\"project name: ${PROJ_NAME}\")\n"
  "message(\"project name: ${MODEL_LIB_PATH}\")\n"
  "message(\"architecture cmake file path: ${ARCH_CMAKE_PATH}\")\n"
  "\n"
@@ -54,14 +53,13 @@ const char *bench_cmake_lists_txt =
  "endif ()\n"
  "link_directories(${MODEL_LIB_PATH})\n"
  "include(benchmark.cmake)\n"
  "add_executable(${PROJ_NAME}_bench ${SRC_FILES})\n"
  "target_link_libraries(${PROJ_NAME}_bench ${MODEL_LIB_NAME} -lm -pthread)\n";
  "add_executable(benchmark ${SRC_FILES})\n"
  "target_link_libraries(benchmark ${MODEL_LIB_NAME} -lm -pthread)\n";

 const char *src_cmake_lists_txt =
  "cmake_minimum_required(VERSION 3.14)\n"
  "project(${PROJ_NAME})\n"
  "project(net)\n"
  "\n"
  "message(\"project name: ${PROJ_NAME}\")\n"
  "message(\"architecture cmake file path: ${ARCH_CMAKE_PATH}\")\n"
  "message(\"operator lib path: ${OP_LIB}\")\n"
  "message(\"operator header path: ${OP_HEADER_PATH}\")\n"
@@ -83,10 +81,11 @@ const char *src_cmake_lists_txt =
  "else()\n"
  "    set(CMAKE_C_FLAGS \"-fPIC -fPIE -O3 -Werror -fstack-protector-strong -fomit-frame-pointer ${CMAKE_C_FLAGS}\")\n"
  "    set(CMAKE_C_FLAGS_Release \"${CMAKE_C_FLAGS_Release} -O3 -ffunction-sections -Werror -fdata-sections\")\n"
  "    string(REPLACE \"-g\" \"\" CMAKE_C_FLAGS \"${CMAKE_C_FLAGS}\")\n"
  "endif()\n"
  "\n"
  "function(create_library)\n"
  "    add_custom_command(TARGET ${PROJ_NAME}\n"
  "    add_custom_command(TARGET net\n"
  "            POST_BUILD\n"
  "            COMMAND rm -rf tmp\n"
  "            COMMAND mkdir tmp\n"
@@ -97,9 +96,9 @@ const char *src_cmake_lists_txt =
  "            COMMENT \"unzip raw static library ${library_name}\"\n"
  "            )\n"
  "    foreach (object_file ${OP_SRC})\n"
  "        add_custom_command(TARGET ${PROJ_NAME} POST_BUILD COMMAND mv ./tmp/${object_file} .)\n"
  "        add_custom_command(TARGET net POST_BUILD COMMAND mv ./tmp/${object_file} .)\n"
  "    endforeach ()\n"
  "    add_custom_command(TARGET ${PROJ_NAME}\n"
  "    add_custom_command(TARGET net\n"
  "            POST_BUILD\n"
  "            COMMAND ar cr ${library_name} *.o\n"
  "            COMMAND ranlib ${library_name}\n"
@@ -109,7 +108,7 @@ const char *src_cmake_lists_txt =
  "            COMMENT \"generate specified static library ${library_name}\"\n"
  "            )\n"
  "endfunction(create_library)\n"
  "string(CONCAT library_name \"lib\" ${PROJ_NAME} \".a\")\n"
  "string(CONCAT library_name \"lib\" net \".a\")\n"
  "create_library()\n";

 #endif  // MINDSPORE_LITE_MICRO_CODER_GENERATOR_CONST_BLOCKS_CMAKE_LISTS_CODE_H_
--- a/mindspore/lite/micro/coder/generator/component/parallel_component.cc
+++ b/mindspore/lite/micro/coder/generator/component/parallel_component.cc
@@ -36,7 +36,7 @@ void CodeCreateThreadPool(std::ofstream &ofs, const std::string &module_name) {
         "    MICRO_ERROR(\"set global thread pool failed\");\n"
         "    return RET_ERROR;\n"
         "  }\n"
         "  MICRO_INFO(\"config: ThreadNum: %d, BindMode: %d\", thread_num, bind_mode);\n";
         "  printf(\"config: ThreadNum: %d, BindMode: %d\\n\", thread_num, bind_mode);\n";
 }

 void CodeDestroyThreadPool(std::ofstream &ofs) { ofs << "  DestroyThreadPool(thread_pool);\n"; }
--- a/mindspore/lite/micro/coder/generator/component/weight_component.cc
+++ b/mindspore/lite/micro/coder/generator/component/weight_component.cc
@@ -17,9 +17,9 @@
 #include "coder/generator/component/weight_component.h"
 #include <memory>
 #include <utility>
 #include <algorithm>
 #include "coder/generator/component/const_blocks/license.h"
 #include "coder/utils/coder_utils.h"
 #include "coder/opcoders/parallel.h"

 namespace mindspore::lite::micro {
 void CodeWeightFileHeader(std::ofstream &ofs, const std::unique_ptr<CoderContext> &ctx) {
@@ -89,7 +89,7 @@ void CodeWeightInitFunc(std::ofstream &ofs, const std::string &module_name, cons
      << "  if (weight_buffer == NULL) {\n"
      << "    return RET_ERROR;\n"
      << "  }\n";

  ofs << "  int " << gThreadNum << " = 1;\n\n";
  ofs << "  struct ModelParameter {\n"
      << "    void *addr;\n"
      << "    size_t size;\n"
--- a/mindspore/lite/micro/coder/opcoders/base/detection_post_process_base_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/base/detection_post_process_base_coder.cc
@@ -82,9 +82,9 @@ int DetectionPostProcessBaseCoder::AllocateBuffer() {
  MS_CHECK_PTR(params_->decoded_boxes_);
  params_->nms_candidate_ = allocator_->Malloc(kNumberTypeUInt8, num_boxes_ * sizeof(uint8_t), kWorkspace);
  MS_CHECK_PTR(params_->nms_candidate_);
  params_->selected_ = allocator_->Malloc(kNumberTypeInt, num_boxes_ * sizeof(int), kWorkspace);
  params_->selected_ = allocator_->Malloc(kNumberTypeInt32, num_boxes_ * sizeof(int), kWorkspace);
  MS_CHECK_PTR(params_->selected_);
  params_->single_class_indexes_ = allocator_->Malloc(kNumberTypeInt, num_boxes_ * sizeof(int), kWorkspace);
  params_->single_class_indexes_ = allocator_->Malloc(kNumberTypeInt32, num_boxes_ * sizeof(int), kWorkspace);
  MS_CHECK_PTR(params_->single_class_indexes_);

  if (params_->use_regular_nms_) {
@@ -92,13 +92,13 @@ int DetectionPostProcessBaseCoder::AllocateBuffer() {
      allocator_->Malloc(kNumberTypeFloat, (num_boxes_ + params_->max_detections_) * sizeof(float), kWorkspace);
    MS_CHECK_PTR(params_->scores_);
    params_->indexes_ =
      allocator_->Malloc(kNumberTypeInt, (num_boxes_ + params_->max_detections_) * sizeof(int), kWorkspace);
      allocator_->Malloc(kNumberTypeInt32, (num_boxes_ + params_->max_detections_) * sizeof(int), kWorkspace);
    MS_CHECK_PTR(params_->indexes_);
    params_->all_class_scores_ =
      allocator_->Malloc(kNumberTypeFloat, (num_boxes_ + params_->max_detections_) * sizeof(float), kWorkspace);
    MS_CHECK_PTR(params_->all_class_scores_);
    params_->all_class_indexes_ =
      allocator_->Malloc(kNumberTypeInt, (num_boxes_ + params_->max_detections_) * sizeof(int), kWorkspace);
      allocator_->Malloc(kNumberTypeInt32, (num_boxes_ + params_->max_detections_) * sizeof(int), kWorkspace);
    MS_CHECK_PTR(params_->all_class_indexes_);
  } else {
    params_->scores_ = allocator_->Malloc(kNumberTypeFloat, num_boxes_ * sizeof(float), kWorkspace);
--- a/mindspore/lite/micro/coder/opcoders/nnacl/fp32/biasadd_fp32_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/fp32/biasadd_fp32_coder.cc
@@ -36,7 +36,7 @@ int BiasAddFP32Coder::DoCode(CoderContext *ctx) {
    return RET_ERROR;
  }
  size_t data_size = input_tensor_->ElementsNum();
  std::string bias_str = allocator_->GetRuntimeAddr(input_tensors_.at(kWeightIndex));
  std::string bias_str = allocator_->GetRuntimeAddr(input_tensors_.at(kWeightIndex), true);
  Collect(ctx,
          {"nnacl/arithmetic.h", "nnacl/nnacl_utils.h", "nnacl/nnacl_common.h", "nnacl/base/arithmetic_base.h",
           "nnacl/fp32/add_fp32.h", "nnacl/fp32/arithmetic_fp32.h"},
--- a/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.cc
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.cc
@@ -183,13 +183,15 @@ int Conv2DINT8Coder::Resize() {
 int Conv2DINT8Coder::DoCode(CoderContext *const context) {
  std::vector<std::string> asm_files;
  if (target_ == kARM32A) {
    asm_files = {"PreSum4x16Int8Peroc.S", "PreSum4x16Int8Pert.S", "MatmulInt8Neon32.S"};
    asm_files = {"PreSum4x16Int8Peroc.S", "PreSum4x16Int8Pert.S", "MatmulInt8.S"};
  } else if (target_ == kARM64) {
    asm_files = {"PreSum4x16Int8Peroc.S", "PreSum4x16Int8Pert.S", "MatmulInt8Neon64.S"};
    asm_files = {"PreSum4x16Int8Peroc.S", "PreSum4x16Int8Pert.S", "MatmulInt8.S", "MatmulDpInt8.S"};
  }
  Collect(context, {"nnacl/int8/conv_int8.h", "nnacl/common_func.h", "wrapper/int8/convolution_int8_wrapper.h"},
  Collect(context,
          {"nnacl/int8/conv_int8.h", "nnacl/common_func.h", "wrapper/int8/convolution_int8_wrapper.h",
           "wrapper/base/common_wrapper.h", "wrapper/base/optimize_handler_wrapper.h"},
          {"common_func.c", "pack_int8.c", "conv_int8.c", "winograd_transform.c", "matmul_int8.c", "fixed_point.c",
           "convolution_int8_wrapper.c", "conv_init_int8_wrapper.c", "thread_pool.c"},
           "convolution_int8_wrapper.c", "conv_init_int8_wrapper.c", "common_wrapper.c", "optimize_handler_wrapper.c"},
          asm_files);
  // call the op function
  nnacl::NNaclInt8Serializer code;
@@ -202,7 +204,6 @@ int Conv2DINT8Coder::DoCode(CoderContext *const context) {
  code.CodeBaseStruct("ConvolutionInt8Args", kRunArgs, input_tensor_, packed_input_, matmul_packed_input_,
                      packed_weight_, bias_data_, output_tensor_, filter_zp_ptr_, input_sum_,
                      "(ConvParameter *)&conv_param", matmul_func_, support_optimize_);
  code.CodeFunction("CheckSupportOptimize", kRunArgsAddr);
  if (support_parallel_) {
    code.CodeFunction(kParallelLaunch, gThreadPool, "ConvolutionInt8Run", kRunArgsAddr, gThreadNum);
  } else {
--- a/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.h
+++ b/mindspore/lite/micro/coder/opcoders/nnacl/int8/conv2d_int8_coder.h
@@ -44,10 +44,8 @@ class Conv2DINT8Coder final : public Conv2DBaseCoder {
  }

 private:
  int InitWeightBias(CoderContext *ctx);

  void CheckSupportOptimize();

  int InitWeightBias(CoderContext *ctx);
  int InitTmpBuffer(CoderContext *ctx);

  int Resize();
@@ -70,7 +68,7 @@ class Conv2DINT8Coder final : public Conv2DBaseCoder {
  int32_t *input_sum_{nullptr};
  int8_t *matmul_packed_input_{nullptr};

  std::string matmul_func_;
  std::string matmul_func_{"NULL"};

  std::function<int(nnacl::NNaclInt8Serializer &, const std::string &, const std::string &)> pack_weight_init_{nullptr};
 };
--- a/mindspore/lite/micro/coder/opcoders/serializers/serializer.h
+++ b/mindspore/lite/micro/coder/opcoders/serializers/serializer.h
@@ -168,9 +168,13 @@ class Serializer {
   *    "int pointer_gen[4] = {1 ,3, 2, 42};\n
   *    const Foo foo_gen = {{1, 2, 3}, pointer_gen, 4};\n"
   */
  template <typename... PARAMETERS>
  template <bool immutable = true, typename... PARAMETERS>
  void CodeBaseStruct(const std::string &type, const std::string &name, PARAMETERS... parameters) {
    code << "const " << type << " " << name << " = {";
    if constexpr (immutable) {
      code << "const " << type << " " << name << " = {";
    } else {
      code << type << " " << name << " = {";
    }
    GenCode(parameters...);
    code << "};\n";
  }
--- a/mindspore/lite/micro/coder/operator_library/CMakeLists.txt
+++ b/mindspore/lite/micro/coder/operator_library/CMakeLists.txt
@@ -22,7 +22,6 @@ endif()
 set(MICRO_CMAKE_PATH ${MICRO_DIR}/cmake)
 set(OPERATOR_LIBRARY_PATH ${CMAKE_BINARY_DIR}/operator_library)
 set(HEADER_PATH "${OPERATOR_LIBRARY_PATH}/include")
 set(LIB_PATH "${OPERATOR_LIBRARY_PATH}/lib/x86")

 message("===========>start to pack operators' head file")
 file(REMOVE_RECURSE ${OPERATOR_LIBRARY_PATH})
@@ -36,14 +35,31 @@ file(REMOVE_RECURSE ${HEADER_PATH}/nnacl/assembly)
 file(REMOVE_RECURSE ${HEADER_PATH}/nnacl/fp16)
 file(REMOVE_RECURSE ${HEADER_PATH}/nnacl/fp16_grad)
 file(REMOVE_RECURSE ${HEADER_PATH}/nnacl/fp32_grad)
 file(REMOVE_RECURSE ${HEADER_PATH}/nnacl/intrinsics)
 file(REMOVE_RECURSE ${HEADER_PATH}/nnacl/optimize)

 # Translate the PLATFORM_ARM64 / PLATFORM_ARM32 switches into the matching
 # MICRO_BUILD_* switches.
 # NOTE(review): presumably read by the package_*.cmake files included right
 # after this block - confirm.
 if(PLATFORM_ARM64)
    set(MICRO_BUILD_ARM64 ON)
 endif()
 if(PLATFORM_ARM32)
    set(MICRO_BUILD_ARM32A ON)
 endif()

 include(${MICRO_CMAKE_PATH}/package_android.cmake)
 include(${MICRO_CMAKE_PATH}/package_nnacl.cmake)
 include(${MICRO_CMAKE_PATH}/package_cmsis.cmake)
 include(${MICRO_CMAKE_PATH}/package_wrapper.cmake)

 list(APPEND OP_FILES ${NNACL_OPS} ${WRAPPER_SRC} ${RUNTIME_SRC})

 # Choose the directory the operator library is installed into, by target
 # platform. Only the non-ARM branch appends the CMSIS operators to OP_FILES.
 if(PLATFORM_ARM64)
    set(LIB_PATH "${OPERATOR_LIBRARY_PATH}/lib/arm64")
 elseif(PLATFORM_ARM32)
    set(LIB_PATH "${OPERATOR_LIBRARY_PATH}/lib/arm32a")
 else()
    set(LIB_PATH "${OPERATOR_LIBRARY_PATH}/lib/x86")
    list(APPEND OP_FILES ${CMSIS_OPS})
 endif()

 # generate static library
 add_library(ops STATIC ${NNACL_OPS} ${CMSIS_OPS} ${WRAPPER_SRC} ${RUNTIME_SRC})
 add_library(ops STATIC ${OP_FILES})
 install(TARGETS ops ARCHIVE DESTINATION ${LIB_PATH})
--- a/mindspore/lite/micro/coder/operator_library/wrapper/base/common_wrapper.c
+++ b/mindspore/lite/micro/coder/operator_library/wrapper/base/common_wrapper.c
@@ -0,0 +1,36 @@
 /*
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "wrapper/base/common_wrapper.h"
 #ifdef __ANDROID__
 #include <sys/auxv.h>
 #include <asm/hwcap.h>
 #endif

 /*
  * Reports whether the CPU advertises the HWCAP_ASIMDDP capability bit.
  *
  * The hardware is only queried when ENABLE_ARM64 is defined; every other
  * build returns false without touching the auxiliary vector.
  *
  * Returns: true if the AT_HWCAP word has HWCAP_ASIMDDP set, false otherwise.
  */
 bool GetSupportOptFlag() {
 #ifdef ENABLE_ARM64
  // getauxval() yields an unsigned long; keep all 64 bits rather than narrowing to uint32_t.
  uint64_t hwcap = getauxval(AT_HWCAP);
  return (hwcap & HWCAP_ASIMDDP) != 0;
 #else
  return false;
 #endif
 }
--- a/mindspore/lite/micro/coder/operator_library/wrapper/base/common_wrapper.h
+++ b/mindspore/lite/micro/coder/operator_library/wrapper/base/common_wrapper.h
@@ -0,0 +1,24 @@
 /*
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_MICRO_CODER_OPERATOR_LIBRARY_COMMON_WRAPPER_H_
 #define MINDSPORE_LITE_MICRO_CODER_OPERATOR_LIBRARY_COMMON_WRAPPER_H_

 #include "nnacl/op_base.h"

 /*
  * Returns true when the CPU reports the HWCAP_ASIMDDP capability bit.
  * Always returns false unless the library is built with ENABLE_ARM64.
  */
 bool GetSupportOptFlag();

 #endif  // MINDSPORE_LITE_MICRO_CODER_OPERATOR_LIBRARY_COMMON_WRAPPER_H_
--- a/mindspore/lite/micro/coder/operator_library/wrapper/base/optimize_handler_wrapper.c
+++ b/mindspore/lite/micro/coder/operator_library/wrapper/base/optimize_handler_wrapper.c
@@ -0,0 +1,49 @@
 /*
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "wrapper/base/optimize_handler_wrapper.h"

 // Prototypes of the int8 matmul kernels the handlers below forward to.
 // They are not defined in this file.
 // NOTE(review): presumably provided by the int8 matmul assembly sources - confirm.

 // Takes int8 inputs a/b and writes an int destination; sizes are row4/col4/deep16.
 extern void MatMulOptR4Int8Neon64(const int8_t *a, const int8_t *b, int *dst, int row4, int col4, int deep16,
                                  const int *input_sum, const int *bias);
 // Takes both the row8/col8 sizes and the plain row/col sizes, plus the quantization arguments.
 extern void MatmulInt8DpNeon64(const int8_t *a, const int8_t *b, int8_t *dst, int row8, int col8, int deep4,
                               const int *a_sums, const int *bias, int act_min, int act_max, int out_zp,
                               int *multiplier, int *left_shift, int *right_shift, int row, int col, int stride,
                               size_t peroc);
 // Same family as above, with size_t sizes and an additional filter_zp argument.
 extern void MatmulInt8DpOpt(const int8_t *a, const int8_t *b, int8_t *dst, size_t row8, size_t col8, size_t deep4,
                            const int *a_sums, const int *bias, int act_min, int act_max, int out_zp, int *multiplier,
                            int *left_shift, int *right_shift, size_t stride, size_t peroc, int *filter_zp);

 #ifdef ENABLE_ARM64
 /*
  * Forwards to MatMulOptR4Int8Neon64 with every argument passed through unchanged.
  *
  * a, b:       int8 input matrices handed to the kernel.
  * dst:        int output buffer written by the kernel.
  * row4, col4, deep16: size arguments forwarded as-is.
  * input_sum, bias:    forwarded as-is.
  */
 void MatMulR4Int8_optimize_handler(const int8_t *a, const int8_t *b, int *dst, int row4, int col4, int deep16,
                                    const int *input_sum, const int *bias) {
  // Plain call: ISO C does not allow a void function to `return` an expression.
  MatMulOptR4Int8Neon64(a, b, dst, row4, col4, deep16, input_sum, bias);
 }

 /*
  * Adapts the handler argument order to the MatmulInt8DpNeon64 kernel.
  *
  * UP_ROUND(row, C8NUM) and UP_ROUND(col, C8NUM) are passed as the kernel's
  * row8/col8 arguments; the unrounded row/col are passed again further down
  * the argument list, followed by stride and per_channel.
  */
 void MatMulRInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
                                   size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
                                   int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini,
                                   int32_t maxi, size_t per_channel) {
  // Plain call: ISO C does not allow a void function to `return` an expression.
  MatmulInt8DpNeon64(a, b, dst, UP_ROUND(row, C8NUM), UP_ROUND(col, C8NUM), deep_4, input_sum, bias, mini, maxi,
                     output_zp, multiplier, left_shift, right_shift, row, col, stride, per_channel);
 }
 /*
  * Adapts the handler argument order to the MatmulInt8DpOpt kernel.
  *
  * row/col/deep_4 are passed as the kernel's row8/col8/deep4 arguments without
  * rounding; filter_zp is forwarded as the kernel's last argument.
  */
 void MatMulDpInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
                                    size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
                                    int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini,
                                    int32_t maxi, size_t per_channel, int32_t *filter_zp) {
  // Plain call: ISO C does not allow a void function to `return` an expression.
  MatmulInt8DpOpt(a, b, dst, row, col, deep_4, input_sum, bias, mini, maxi, output_zp, multiplier, left_shift,
                  right_shift, stride, per_channel, filter_zp);
 }
 #endif
--- a/mindspore/lite/micro/coder/operator_library/wrapper/base/optimize_handler_wrapper.h
+++ b/mindspore/lite/micro/coder/operator_library/wrapper/base/optimize_handler_wrapper.h
@@ -0,0 +1,41 @@
 /*
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_MICRO_CODER_OPERATOR_LIBRARY_OPTIMIZE_HANDLER_WRAPPER_H_
 #define MINDSPORE_LITE_MICRO_CODER_OPERATOR_LIBRARY_OPTIMIZE_HANDLER_WRAPPER_H_

 #include "nnacl/op_base.h"

 // The optimize handlers are only declared for builds that define ENABLE_ARM64.
 #ifdef ENABLE_ARM64
 // NOTE(review): no definition of this handler appears in optimize_handler_wrapper.c -
 // confirm it is defined elsewhere, otherwise any caller fails at link time.
 void IndirectGemmInt8_optimize_handler(int8_t *dst, const int8_t *src, const int8_t *weight, const int32_t *bias,
                                        size_t ksize, size_t ic4, size_t output_channel, size_t offset,
                                        const int32_t *input_sum, size_t act_min, size_t act_max, size_t out_zp,
                                        int32_t *out_multiplier, int32_t *shift_before, int32_t *shift_after,
                                        size_t asymmetric, size_t per_channel, size_t per_channel_offset);
 // Int8 matmul handler writing an int destination; sizes are row4/col4/deep16.
 void MatMulR4Int8_optimize_handler(const int8_t *a, const int8_t *b, int *dst, int row4, int col4, int deep16,
                                    const int *input_sum, const int *bias);

 // Int8 matmul handler writing an int8 destination, with quantization arguments.
 void MatMulRInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
                                   size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
                                   int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini,
                                   int32_t maxi, size_t per_channel);
 // Same argument list as MatMulRInt8_optimize_handler plus a trailing filter_zp pointer.
 void MatMulDpInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
                                    size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
                                    int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini,
                                    int32_t maxi, size_t per_channel, int32_t *filter_zp);
 #endif

 #endif  // MINDSPORE_LITE_MICRO_CODER_OPERATOR_LIBRARY_OPTIMIZE_HANDLER_WRAPPER_H_
--- a/mindspore/lite/micro/coder/operator_library/wrapper/fp32/matmul_fp32_wrapper.c
+++ b/mindspore/lite/micro/coder/operator_library/wrapper/fp32/matmul_fp32_wrapper.c
@@ -22,21 +22,12 @@ void InitMatrixA(const float *src_ptr, float *dst_ptr, const MatMulParameter *pa
  }
  for (int i = 0; i < params_->batch; i++) {
    const float *src = src_ptr + i * params_->deep_ * params_->row_;
 #ifdef ENABLE_ARM32
    float *dst = dst_ptr + i * params_->deep_ * params_->row_4_;
    if (params_->a_transpose_) {
      RowMajor2Row4Major(src, dst, params_->deep_, params_->row_);
    } else {
      RowMajor2Col4Major(src, dst, params_->row_, params_->deep_);
    }
 #else
    float *dst = dst_ptr + i * params_->deep_ * params_->row_12_;
    float *dst = dst_ptr + i * params_->deep_ * params_->row_align_;
    if (params_->a_transpose_) {
      RowMajor2Row12Major(src, dst, params_->deep_, params_->row_);
    } else {
      RowMajor2Col12Major(src, dst, params_->row_, params_->deep_);
    }
 #endif
  }
 }

@@ -55,11 +46,19 @@ void InitMatrixB(const float *src_ptr, float *dst_ptr, const MatMulParameter *pa
  }
  for (int i = 0; i < params_->batch; i++) {
    const float *src = src_ptr + i * params_->deep_ * params_->col_;
    float *dst = dst_ptr + i * params_->deep_ * params_->col_8_;
    float *dst = dst_ptr + i * params_->deep_ * params_->col_align_;
 #ifdef ENABLE_ARM32
    if (params_->b_transpose_) {
      RowMajor2Col4Major(src, dst, params_->col_, params_->deep_);
    } else {
      RowMajor2Row4Major(src, dst, params_->deep_, params_->col_);
    }
 #else
    if (params_->b_transpose_) {
      RowMajor2Col8Major(src, dst, params_->col_, params_->deep_);
    } else {
      RowMajor2Row8Major(src, dst, params_->deep_, params_->col_);
    }
 #endif
  }
 }
--- a/mindspore/lite/micro/coder/operator_library/wrapper/int8/convolution_int8_wrapper.c
+++ b/mindspore/lite/micro/coder/operator_library/wrapper/int8/convolution_int8_wrapper.c
@@ -16,24 +16,6 @@

 #include "wrapper/int8/convolution_int8_wrapper.h"

 void CheckSupportOptimize(const ConvolutionInt8Args *args) {
  int tile_num = 8;
 #ifdef ENABLE_ARM32
  tile_num = 4;
  args->is_optimize_ = false;
 #endif
 #ifdef ENABLE_ARM64
  if (mindspore::lite::IsSupportSDot()) {
    matmul_func_ = MatMulRInt8_optimize_handler;
    args->is_optimize_ = true;
  } else {
    tile_num = 4;
    args->is_optimize_ = false;
  }
 #endif
  args->conv_param_->tile_num_ = tile_num;
 }

 int ConvolutionInt8Run(void *cdata, int task_id) {
  ConvolutionInt8Args *args = (ConvolutionInt8Args *)cdata;
  ConvInt8(args->input_data_, args->packed_input_, args->matmul_input_, args->packed_weight_, args->bias_data_,
--- a/mindspore/lite/micro/coder/operator_library/wrapper/int8/convolution_int8_wrapper.h
+++ b/mindspore/lite/micro/coder/operator_library/wrapper/int8/convolution_int8_wrapper.h
@@ -36,8 +36,6 @@ typedef struct {
  bool is_optimize_;
 } ConvolutionInt8Args;

 void CheckSupportOptimize(const ConvolutionInt8Args *args);

 int ConvolutionInt8Run(void *cdata, int task_id);

 #endif  // MINDSPORE_LITE_MICRO_INT8_CONVOLUTION_WRAPPER_INT8_WRAPPER_H_