| @@ -1,9 +1,11 @@ | |||
| set(TEST_DIR ${TOP_DIR}/tests/ut/cpp) | |||
| set(TEST_DIR ${TOP_DIR}/mindspore/lite/test) | |||
| set(LITE_DIR ${TOP_DIR}/mindspore/lite) | |||
| include_directories(${TOP_DIR}) | |||
| include_directories(${TEST_DIR}) | |||
| include_directories(${LITE_DIR}) | |||
| include_directories(${LITE_DIR}/tools) | |||
| include_directories(${LITE_DIR}/lite) | |||
| include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/dependency_gtest.cmake) | |||
| include(${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/external_libs/gtest.cmake) | |||
| ### anf src | |||
| set(ANF_SRC | |||
| @@ -158,7 +160,7 @@ set(TEST_LITE_SRC | |||
| ${LITE_DIR}/tools/common/flag_parser.cc | |||
| ${LITE_DIR}/tools/common/storage.cc | |||
| ${LITE_DIR}/tools/benchmark/benchmark.cc | |||
| ${LITE_DIR}/test/benchmark_test.cc | |||
| ${LITE_DIR}/test/st/benchmark_test.cc | |||
| ) | |||
| ### gpu runtime | |||
| if (SUPPORT_GPU) | |||
| @@ -179,6 +181,7 @@ endif() | |||
| if(BUILD_CONVERTER) | |||
| set(TEST_LITE_SRC | |||
| ${TEST_LITE_SRC} | |||
| ${TOP_DIR}/mindspore/core/utils/flags.cc | |||
| ${LITE_DIR}/tools/converter/optimizer.cc | |||
| ${LITE_DIR}/src/common/anf_importer/anf_importer.cc | |||
| ${LITE_DIR}/src/common/anf_importer/import_from_meta_graphT.cc | |||
| @@ -188,7 +191,7 @@ if(BUILD_CONVERTER) | |||
| ${LITE_DIR}/tools/converter/converter_flags.cc | |||
| ${LITE_DIR}/tools/converter/converter.cc | |||
| ${LITE_DIR}/tools/converter/parser/onnx/onnx.pb.cc | |||
| ${LITE_DIR}/test/converter_test.cc | |||
| ${LITE_DIR}/test/st/converter_test.cc | |||
| ${LITE_DIR}/src/gllo/common/node_pass.cc | |||
| ${LITE_DIR}/src/gllo/common/optimizer.cc | |||
| ${LITE_DIR}/src/gllo/common/pass_manager.cc | |||
| @@ -233,59 +236,50 @@ else() | |||
| endif() | |||
| ### test src | |||
| file(GLOB_RECURSE TEST_CASE_KERNEL_SRC | |||
| ${TEST_DIR}/kernel/cpu/arm/fp32/*.cc | |||
| ${TEST_DIR}/kernel/cpu/arm/int8/*.cc | |||
| ${TEST_DIR}/ut/src/runtime/kernel/arm/fp32/*.cc | |||
| ${TEST_DIR}/ut/src/runtime/kernel/arm/int8/*.cc | |||
| ) | |||
| set(TEST_SRC | |||
| ${TEST_LITE_SRC} | |||
| ${TEST_CASE_KERNEL_SRC} | |||
| ${TEST_DIR}/common/common_test.cc | |||
| ${TEST_DIR}/common/test_lite_main.cc | |||
| ${TEST_DIR}/kernel/cpu/arm/common/pack_tests.cc | |||
| ${TEST_DIR}/device/cpu/arm/infer_test.cc | |||
| ${TEST_DIR}/main.cc | |||
| ${TEST_DIR}/ut/src/runtime/kernel/arm/common/pack_tests.cc | |||
| ${TEST_DIR}/ut/src/infer_test.cc | |||
| # ${TEST_DIR}/device/cpu/arm/graph_test.cc | |||
| ) | |||
| if (SUPPORT_TRAIN) | |||
| set(TEST_SRC | |||
| ${TEST_SRC} | |||
| ${TEST_DIR}/device/cpu/arm/train_test.cc | |||
| ${TEST_DIR}/ut/src/train_test.cc | |||
| ) | |||
| else() | |||
| set(TEST_SRC | |||
| ${TEST_SRC} | |||
| ${TEST_DIR}/device/cpu/arm/infer_test.cc | |||
| ${TEST_DIR}/ut/src/infer_test.cc | |||
| ) | |||
| endif() | |||
| if (SUPPORT_GPU) | |||
| set(TEST_SRC | |||
| ${TEST_SRC} | |||
| ${TEST_DIR}/device/opencl/opencl_infer_tests.cc | |||
| ${TEST_DIR}/kernel/opencl/utils_cl_tests.cc | |||
| ${TEST_DIR}/kernel/opencl/arithmetic_tests.cc | |||
| ${TEST_DIR}/kernel/opencl/convolution_tests.cc | |||
| ${TEST_DIR}/kernel/opencl/depthwise_conv2d_tests.cc | |||
| ${TEST_DIR}/kernel/opencl/matmul_tests.cc | |||
| ${TEST_DIR}/kernel/opencl/max_pooling_cl_tests.cc | |||
| ${TEST_DIR}/kernel/opencl/avg_pooling_cl_tests.cc | |||
| ${TEST_DIR}/kernel/opencl/softmax_cl_tests.cc | |||
| ${TEST_DIR}/kernel/opencl/concat_tests.cc | |||
| ${TEST_DIR}/kernel/opencl/conv2d_transpose_tests.cc | |||
| ${TEST_DIR}/ut/stc/runtime/kernel/opencl/matmul_tests.cc | |||
| ${TEST_DIR}/ut/stc/runtime/kernel/opencl/softmax_cl_tests.cc | |||
| ) | |||
| endif() | |||
| if (ENABLE_FP16) | |||
| set(TEST_SRC | |||
| ${TEST_SRC} | |||
| ${TEST_DIR}/kernel/cpu/arm/fp16/convolution_fp16_tests.cc) | |||
| ${TEST_DIR}/ut/src/runtime/kernel/arm/fp16/convolution_fp16_tests.cc) | |||
| endif () | |||
| add_executable(lite-test ${TEST_SRC}) | |||
| target_link_libraries(lite-test dl ${SECUREC_LIBRARY} ${GTEST_LIBRARY} mindspore::json) | |||
| target_link_libraries(lite-test dl ${SECUREC_LIBRARY} ${GTEST_LIBRARY} mindspore::json mindspore::gtest) | |||
| if (BUILD_CONVERTER) | |||
| target_link_libraries(lite-test | |||
| anf_exporter_mid | |||
| @@ -0,0 +1,41 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "common/common_test.h" | |||
| #include "mindspore/core/utils/log_adapter.h" | |||
#ifdef __cplusplus
#if __cplusplus
extern "C" {
#endif
#endif
namespace mindspore {
// Default no-op implementations of the shared gtest fixture declared in
// common/common_test.h. Individual test suites that need real setup
// override these in their own fixtures.
// NOTE(review): the surrounding extern "C" braces have no effect on C++
// member-function definitions (members cannot take C linkage) — presumably
// copied from a C-style template; confirm whether they can be removed.
void Common::SetUpTestCase() {}    // runs once before the whole suite
void Common::TearDownTestCase() {} // runs once after the whole suite
void Common::SetUp() {}            // runs before every TEST_F
void Common::TearDown() {}         // runs after every TEST_F
}  // namespace mindspore
#ifdef __cplusplus
#if __cplusplus
}
#endif
#endif
| @@ -0,0 +1,78 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef TESTS_UT_COMMON_UT_COMMON_H_ | |||
| #define TESTS_UT_COMMON_UT_COMMON_H_ | |||
| #include <cmath> | |||
| #include <fstream> | |||
| #include <iostream> | |||
| #include <string> | |||
| #include <algorithm> | |||
| #include "gtest/gtest.h" | |||
| namespace mindspore { | |||
| class Common : public testing::Test { | |||
| public: | |||
| // TestCase only enter once | |||
| static void SetUpTestCase(); | |||
| static void TearDownTestCase(); | |||
| // every TEST_F macro will enter one | |||
| virtual void SetUp(); | |||
| virtual void TearDown(); | |||
| template <typename T> | |||
| void PrintData(std::string name, T *output_data, int size) { | |||
| std::cout << "The " << name << " is as follows:" << std::endl; | |||
| if (typeid(output_data[0]) == typeid(uint8_t) || typeid(output_data[0]) == typeid(int8_t)) { | |||
| for (size_t i = 0; i < std::min(size, 100); i++) { | |||
| std::cout << static_cast<int>(output_data[i]) << " "; | |||
| } | |||
| } else { | |||
| for (size_t i = 0; i < std::min(size, 100); i++) { | |||
| std::cout << output_data[i] << " "; | |||
| } | |||
| } | |||
| std::cout << std::endl; | |||
| } | |||
| template <typename T> | |||
| static void CompareOutputData(T *output_data, T *correct_data, int size, float err_bound) { | |||
| for (size_t i = 0; i < size; i++) { | |||
| T abs = fabs(output_data[i] - correct_data[i]); | |||
| ASSERT_LE(abs, err_bound); | |||
| } | |||
| } | |||
| void ReadFile(const char *file, size_t *size, char **buf) { | |||
| ASSERT_NE(nullptr, file); | |||
| ASSERT_NE(nullptr, size); | |||
| ASSERT_NE(nullptr, buf); | |||
| std::string path = std::string(file); | |||
| std::ifstream ifs(path); | |||
| ASSERT_EQ(true, ifs.good()); | |||
| ASSERT_EQ(true, ifs.is_open()); | |||
| ifs.seekg(0, std::ios::end); | |||
| *size = ifs.tellg(); | |||
| *buf = new char[*size]; | |||
| ifs.seekg(0, std::ios::beg); | |||
| ifs.read(*buf, *size); | |||
| ifs.close(); | |||
| } | |||
| }; | |||
| } // namespace mindspore | |||
| #endif // TESTS_UT_COMMON_UT_COMMON_H_ | |||
| @@ -0,0 +1,29 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <iostream> | |||
| #include "gtest/gtest.h" | |||
| #include "mindspore/core/utils/log_adapter.h" | |||
| namespace mindspore { | |||
| extern void InitSubModulesLogLevel(); | |||
| } | |||
| GTEST_API_ int main(int argc, char** argv) { | |||
| mindspore::InitSubModulesLogLevel(); | |||
| testing::InitGoogleTest(&argc, argv); | |||
| int ret = RUN_ALL_TESTS(); | |||
| return ret; | |||
| } | |||
| @@ -15,12 +15,12 @@ | |||
| */ | |||
| #include <gtest/gtest.h> | |||
| #include <string> | |||
| #include "tests/ut/cpp/common/common_test.h" | |||
| #include "common/common_test.h" | |||
| #include "benchmark/benchmark.h" | |||
| namespace mindspore { | |||
| namespace lite { | |||
| class BenchmarkTest : public UT::Common { | |||
| class BenchmarkTest : public mindspore::Common { | |||
| public: | |||
| BenchmarkTest() {} | |||
| }; | |||
| @@ -16,11 +16,11 @@ | |||
| #include <gtest/gtest.h> | |||
| #include <string> | |||
| #include "converter/converter.h" | |||
| #include "tests/ut/cpp/common/common_test.h" | |||
| #include "common/common_test.h" | |||
| namespace mindspore { | |||
| namespace lite { | |||
| class ConverterTest : public UT::Common { | |||
| class ConverterTest : public mindspore::Common { | |||
| public: | |||
| ConverterTest() {} | |||
| }; | |||
| @@ -0,0 +1,246 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <stdio.h> | |||
| #include <stdlib.h> | |||
| #include <time.h> | |||
| #include <climits> | |||
| #include <string> | |||
| #include <iostream> | |||
| #include <memory> | |||
| #include <fstream> | |||
| #include "common/common_test.h" | |||
| #include "mindspore/core/utils/log_adapter.h" | |||
| #include "mindspore/lite/include/lite_session.h" | |||
| #include "mindspore/lite/src/executor.h" | |||
| #include "mindspore/lite/schema/inner/anf_ir_generated.h" | |||
| namespace mindspore { | |||
// Fixture for lite inference tests; all concrete tests in this file are
// currently commented out, so only the helpers below are live code.
class TestLiteInference : public mindspore::Common {
 public:
  TestLiteInference() {}
};
// Resolve `path` to an absolute canonical path via POSIX realpath().
// Returns "" on any failure: null input, over-long input, allocation
// failure, or a path that does not resolve.
std::string RealPath(const char *path) {
  if (path == nullptr) {
    return "";
  }
  if ((strlen(path)) >= PATH_MAX) {
    return "";
  }
  // Fix: the original held `new char[PATH_MAX]` in a std::shared_ptr<char>,
  // whose deleter calls scalar `delete` on array storage — undefined
  // behavior. unique_ptr<char[]> uses delete[] as required.
  std::unique_ptr<char[]> resolvedPath(new (std::nothrow) char[PATH_MAX]{0});
  if (resolvedPath == nullptr) {
    return "";
  }
  auto ret = realpath(path, resolvedPath.get());
  if (ret == nullptr) {
    return "";
  }
  return std::string(resolvedPath.get());
}
| char *ReadModelFile(const char *file, size_t *size) { | |||
| if (file == nullptr) { | |||
| return nullptr; | |||
| } | |||
| MS_ASSERT(size != nullptr); | |||
| std::ifstream ifs(RealPath(file)); | |||
| if (!ifs.good()) { | |||
| return nullptr; | |||
| } | |||
| if (!ifs.is_open()) { | |||
| return nullptr; | |||
| } | |||
| ifs.seekg(0, std::ios::end); | |||
| *size = ifs.tellg(); | |||
| std::unique_ptr<char> buf(new (std::nothrow) char[*size]); | |||
| if (buf == nullptr) { | |||
| ifs.close(); | |||
| return nullptr; | |||
| } | |||
| ifs.seekg(0, std::ios::beg); | |||
| ifs.read(buf.get(), *size); | |||
| ifs.close(); | |||
| return buf.release(); | |||
| } | |||
| // TEST_F(TestLiteInference, Net) { | |||
| // auto msGraph = std::make_shared<lite::GraphDefT>(); | |||
| // msGraph->name = "graph"; | |||
| // auto msSubgraph = std::make_unique<lite::SubGraphDefT>(); | |||
| // msSubgraph->name = "subGraph"; | |||
| // | |||
| // auto node = std::make_unique<lite::OpDefT>(); | |||
| // node->inputIndex = {0, 1}; | |||
| // node->outputIndex = {2}; | |||
| // node->attr.type = lite::OpT_Add; | |||
| // node->attr.value = new lite::AddT; | |||
| // node->name = "Add"; | |||
| // node->fmkType = lite::FmkType_CAFFE; | |||
| // msSubgraph->nodes.emplace_back(std::move(node)); | |||
| // | |||
| // msSubgraph->inputIndex = {0}; | |||
| // msSubgraph->outputIndex = {2}; | |||
| // | |||
| // auto input0 = std::make_unique<lite::TensorDefT>(); | |||
| // input0->refCount = lite::MSCONST_WEIGHT_REFCOUNT; | |||
| // input0->format = lite::Format_NCHW; | |||
| // input0->dataType = TypeId::kNumberTypeFloat; | |||
| // input0->dims = {1, 1, 2, 2}; | |||
| // input0->offset = -1; | |||
| // msSubgraph->allTensors.emplace_back(std::move(input0)); | |||
| // | |||
| // auto input1 = std::make_unique<lite::TensorDefT>(); | |||
| // input1->refCount = lite::MSCONST_WEIGHT_REFCOUNT; | |||
| // input1->format = lite::Format_NCHW; | |||
| // input1->dataType = TypeId::kNumberTypeFloat; | |||
| // input1->dims = {1, 1, 2, 2}; | |||
| // input1->offset = -1; | |||
| // input1->data.resize(16); | |||
| // msSubgraph->allTensors.emplace_back(std::move(input1)); | |||
| // | |||
| // auto output = std::make_unique<lite::TensorDefT>(); | |||
| // output->refCount = 0; | |||
| // output->format = lite::Format_NCHW; | |||
| // output->dims = {1, 1, 2, 2}; | |||
| // output->offset = -1; | |||
| // msSubgraph->allTensors.emplace_back(std::move(output)); | |||
| // msGraph->subgraphs.emplace_back(std::move(msSubgraph)); | |||
| // | |||
| // flatbuffers::FlatBufferBuilder builder(1024); | |||
| // auto offset = lite::GraphDef::Pack(builder, msGraph.get()); | |||
| // builder.Finish(offset); | |||
| // int size = builder.GetSize(); | |||
| // auto *content = builder.GetBufferPointer(); | |||
| // mindspore::lite::Context context; | |||
| // context.allocator = nullptr; | |||
| // context.deviceCtx.type = mindspore::lite::DeviceType::DT_CPU; | |||
| // #if 0 | |||
| // auto graph = mindspore::lite::inference::LoadModel((char *)content, size); | |||
| // | |||
| // auto session = mindspore::lite::inference::Session::CreateSession(&context); | |||
| // | |||
| // std::vector<float> z1 = {1.1, 2.1, 3.1, 4.1}; | |||
| // std::vector<inference::MSTensor *> inputs; | |||
| // auto t1 = inference::MSTensor::CreateTensor(TypeId::kNumberTypeFloat32, std::vector<int>({1, 1, 2, 2})); | |||
| // memcpy_s(t1->MutableData(), z1.size() * sizeof(float), z1.data(), z1.size() * sizeof(float)); | |||
| // | |||
| // auto t2 = inference::MSTensor::CreateTensor(TypeId::kNumberTypeFloat32, std::vector<int>({1, 1, 2, 2})); | |||
| // memcpy_s(t2->MutableData(), z1.size() * sizeof(float), z1.data(), z1.size() * sizeof(float)); | |||
| // | |||
| // inputs.push_back(t1); | |||
| // inputs.push_back(t1); | |||
| // // VectorRef *outputs = new VectorRef(); | |||
| // auto outputs = session->RunGraph(inputs); | |||
| // #else | |||
| // auto file = "./efficientnet_b0.ms"; | |||
| // size_t model_size; | |||
| // | |||
| // char *modelbuf = ReadModelFile(file, &model_size); | |||
| // auto graph = mindspore::lite::inference::LoadModel(modelbuf, model_size); | |||
| // auto session = mindspore::lite::inference::Session::CreateSession(&context); | |||
| // session->CompileGraph(graph); | |||
| // std::vector<inference::MSTensor *> inputs; | |||
| // auto t1 = inference::MSTensor::CreateTensor(TypeId::kNumberTypeFloat32, std::vector<int>({1, 244, 244, 3})); | |||
| // | |||
| // inputs.push_back(t1); | |||
| // auto outputs = session->RunGraph(inputs); | |||
| // #endif | |||
| // } | |||
| // TEST_F(TestLiteInference, Conv) { | |||
| // auto msGraph = std::make_shared<lite::GraphDefT>(); | |||
| // msGraph->name = "graph"; | |||
| // auto msSubgraph = std::make_unique<lite::SubGraphDefT>(); | |||
| // msSubgraph->name = "subGraph"; | |||
| // | |||
| // auto node = std::make_unique<lite::OpDefT>(); | |||
| // node->inputIndex = {0, 1}; | |||
| // node->outputIndex = {2}; | |||
| // node->attr.type = lite::OpT_Conv2D; | |||
| // auto attr = new lite::Conv2DT; | |||
| // attr->padMode = lite::PadMode_SAME; | |||
| // attr->channelIn = 1; | |||
| // attr->channelOut = 1; | |||
| // attr->format = lite::Format_NHWC; | |||
| // attr->strideH = 1; | |||
| // attr->strideW = 1; | |||
| // attr->kernelH = 2; | |||
| // attr->kernelW = 2; | |||
| // | |||
| // node->attr.value = attr; | |||
| // node->name = "Conv2D"; | |||
| // node->fmkType = lite::FmkType_CAFFE; | |||
| // msSubgraph->nodes.emplace_back(std::move(node)); | |||
| // | |||
| // msSubgraph->inputIndex = {0}; | |||
| // msSubgraph->outputIndex = {2}; | |||
| // // MS_LOG(ERROR) << "OutData"; | |||
| // | |||
| // auto input0 = std::make_unique<lite::TensorDefT>(); | |||
| // input0->refCount = lite::MSCONST_WEIGHT_REFCOUNT; | |||
| // input0->format = lite::Format_NCHW; | |||
| // input0->dataType = TypeId::kNumberTypeFloat; | |||
| // input0->dims = {1, 1, 5, 5}; | |||
| // // input0->data.resize(sizeof(float) * 25); | |||
| // // std::vector<float> input_data = {1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5}; | |||
| // // memcpy(input0->data.data(), input_data.data(), sizeof(int) * 25); | |||
| // input0->offset = -1; | |||
| // msSubgraph->allTensors.emplace_back(std::move(input0)); | |||
| // | |||
| // auto weight = std::make_unique<lite::TensorDefT>(); | |||
| // weight->refCount = lite::MSCONST_WEIGHT_REFCOUNT; | |||
| // weight->format = lite::Format_KHWC; | |||
| // weight->dataType = TypeId::kNumberTypeFloat; | |||
| // weight->dims = {1, 2, 2, 1}; | |||
| // weight->data.resize(sizeof(float) * 4); | |||
| // std::vector<float> weight_data = {1, 2, 3, 4}; | |||
| // memcpy(weight->data.data(), weight_data.data(), sizeof(int) * 4); | |||
| // weight->offset = -1; | |||
| // msSubgraph->allTensors.emplace_back(std::move(weight)); | |||
| // | |||
| // auto output = std::make_unique<lite::TensorDefT>(); | |||
| // output->refCount = 0; | |||
| // output->format = lite::Format_NCHW; | |||
| // output->dims = {1, 1, 5, 5}; | |||
| // output->offset = -1; | |||
| // msSubgraph->allTensors.emplace_back(std::move(output)); | |||
| // msGraph->subgraphs.emplace_back(std::move(msSubgraph)); | |||
| // | |||
| // flatbuffers::FlatBufferBuilder builder(1024); | |||
| // auto offset = lite::GraphDef::Pack(builder, msGraph.get()); | |||
| // builder.Finish(offset); | |||
| // int size = builder.GetSize(); | |||
| // auto *content = builder.GetBufferPointer(); | |||
| // mindspore::lite::Context context; | |||
| // context.allocator = nullptr; | |||
| // context.deviceCtx.type = mindspore::lite::DeviceType::DT_CPU; | |||
| // auto graph = mindspore::lite::inference::LoadModel((char *)content, size); | |||
| // auto session = mindspore::lite::inference::Session::CreateSession(&context); | |||
| // session->CompileGraph(graph); | |||
| // std::vector<inference::MSTensor *> inputs; | |||
| // auto t1 = inference::MSTensor::CreateTensor(TypeId::kNumberTypeFloat32, std::vector<int>({1, 3, 244, 244})); | |||
| // | |||
| // inputs.push_back(t1); | |||
| // auto outputs = session->RunGraph(inputs); | |||
| // } | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,409 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <cmath> | |||
| #include <memory> | |||
| #include "mindspore/lite/schema/inner/model_generated.h" | |||
| #include "mindspore/lite/include/model.h" | |||
| #include "common/common_test.h" | |||
| #include "include/lite_session.h" | |||
| #include "include/context.h" | |||
| #include "include/errorcode.h" | |||
| #include "mindspore/core/utils/log_adapter.h" | |||
| namespace mindspore { | |||
// Fixture for end-to-end lite inference tests; inherits the default
// no-op setup/teardown hooks from mindspore::Common.
class InferTest : public mindspore::Common {
 public:
  InferTest() {}
};
// End-to-end test of a single Conv2D node: build a MetaGraph in memory,
// serialize it with flatbuffers, import it as a lite::Model, run it on CPU
// with weights/input loaded from .bin fixture files, and compare the output
// against a golden .bin file element-for-element.
TEST_F(InferTest, TestConvNode) {
  // --- build the graph definition: one Conv2D node, tensors 0(in) 1(weight) 2(out)
  auto meta_graph = std::make_shared<schema::MetaGraphT>();
  meta_graph->name = "graph";
  auto node = std::make_unique<schema::CNodeT>();
  node->inputIndex = {0, 1};
  node->outputIndex = {2};
  node->primitive = std::make_unique<schema::PrimitiveT>();
  node->primitive->value.type = schema::PrimitiveType_Conv2D;
  // Ownership of `primitive` passes to node->primitive->value below.
  auto primitive = new schema::Conv2DT;
  primitive->padMode = schema::PadMode_SAME;
  primitive->channelIn = 3;
  primitive->channelOut = 32;
  primitive->format = schema::Format_NHWC;
  primitive->strideH = 1;
  primitive->strideW = 1;
  primitive->kernelH = 3;
  primitive->kernelW = 3;
  primitive->dilateH = 1;
  primitive->dilateW = 1;
  node->primitive->value.value = primitive;
  node->name = "Conv2D";
  meta_graph->nodes.emplace_back(std::move(node));
  meta_graph->inputIndex = {0};
  meta_graph->outputIndex = {2};
  // Tensor 0: network input, 1x28x28x3 float32 NHWC.
  auto input0 = std::make_unique<schema::TensorT>();
  input0->nodeType = schema::NodeType::NodeType_ValueNode;
  input0->format = schema::Format_NHWC;
  input0->dataType = TypeId::kNumberTypeFloat32;
  input0->dims = {1, 28, 28, 3};
  input0->offset = -1;
  meta_graph->allTensors.emplace_back(std::move(input0));
  // Tensor 1: conv weights 32x3x3x3, loaded from a binary fixture file.
  auto weight = std::make_unique<schema::TensorT>();
  weight->nodeType = schema::NodeType::NodeType_ValueNode;
  weight->format = schema::Format_KHWC;
  weight->dataType = TypeId::kNumberTypeFloat32;
  weight->dims = {32, 3, 3, 3};
  // Single-slot pointer array reused for all three ReadFile calls.
  // NOTE(review): buf and the buffers ReadFile allocates are never freed —
  // tolerable in a test, but worth cleaning up.
  auto buf = new char *[1];
  //================================================================
  size_t weight_size;
  std::string weight_path = "./convfp32_weight_32_3_3_3.bin";
  ReadFile(weight_path.c_str(), &weight_size, buf);
  ASSERT_NE(nullptr, buf[0]);
  auto weight_data_temp = reinterpret_cast<float *>(buf[0]);
  ASSERT_NE(nullptr, weight_data_temp);
  weight->data.resize(sizeof(float) * 32 * 3 * 3 * 3);
  //================================================================
  // NOTE(review): copies weight_size bytes into a buffer sized for exactly
  // 32*3*3*3 floats — assumes the fixture file has that size; confirm.
  memcpy(weight->data.data(), weight_data_temp, weight_size);
  weight->offset = -1;
  meta_graph->allTensors.emplace_back(std::move(weight));
  // Tensor 2: network output, 1x28x28x32 float32.
  auto output = std::make_unique<schema::TensorT>();
  output->nodeType = schema::NodeType::NodeType_Parameter;
  output->format = schema::Format_NHWC;
  output->dataType = TypeId::kNumberTypeFloat32;
  output->dims = {1, 28, 28, 32};
  output->offset = -1;
  meta_graph->allTensors.emplace_back(std::move(output));
  // --- serialize the graph and import it as a runnable model
  flatbuffers::FlatBufferBuilder builder(1024);
  auto offset = schema::MetaGraph::Pack(builder, meta_graph.get());
  builder.Finish(offset);
  size_t size = builder.GetSize();
  const char *content = reinterpret_cast<char *>(builder.GetBufferPointer());
  auto model = lite::Model::Import(content, size);
  ASSERT_NE(nullptr, model);
  meta_graph.reset();
  content = nullptr;
  // --- CPU session, 4 threads, no core binding
  // NOTE(review): context is heap-allocated and never deleted; confirm
  // whether the session takes ownership.
  auto context = new lite::Context;
  context->cpuBindMode = lite::NO_BIND;
  context->deviceCtx.type = lite::DT_CPU;
  context->threadNum = 4;
  auto session = session::LiteSession::CreateSession(context);
  ASSERT_NE(nullptr, session);
  auto ret = session->CompileGraph(model.get());
  ASSERT_EQ(lite::RET_OK, ret);
  auto inputs = session->GetInputs();
  ASSERT_EQ(inputs.size(), 1);
  auto inTensor = inputs.front();
  ASSERT_NE(nullptr, inTensor);
  auto data = inTensor->MutableData();
  // --- feed the input fixture
  //===================================================
  size_t input_size;
  std::string input_path = "./convfp32_input_1_28_28_3.bin";
  ReadFile(input_path.c_str(), &input_size, buf);
  ASSERT_NE(nullptr, buf[0]);
  auto input_data = reinterpret_cast<float *>(buf[0]);
  ASSERT_NE(nullptr, input_data);
  //===================================================
  ASSERT_EQ(input_size, inTensor->Size());
  memcpy(data, input_data, input_size);
  ret = session->RunGraph();
  ASSERT_EQ(lite::RET_OK, ret);
  // --- compare against the golden output fixture
  auto outputs = session->GetOutputs();
  ASSERT_EQ(outputs.size(), 1);
  auto outTensor = outputs.front();
  ASSERT_NE(nullptr, outTensor);
  ASSERT_EQ(28 * 28 * 32, outTensor->ElementsNum());
  ASSERT_EQ(TypeId::kNumberTypeFloat32, outTensor->data_type());
  auto *outData = reinterpret_cast<float *>(outTensor->MutableData());
  ASSERT_NE(nullptr, outData);
  //===================================================
  size_t output_size;
  std::string output_path = "./convfp32_out_1_28_28_32.bin";
  ReadFile(output_path.c_str(), &output_size, buf);
  ASSERT_NE(nullptr, buf[0]);
  auto output_data = reinterpret_cast<float *>(buf[0]);
  ASSERT_NE(nullptr, output_data);
  //===================================================
  ASSERT_EQ(output_size, outTensor->Size());
  // NOTE(review): exact float equality against the golden file — assumes
  // bit-identical CPU kernels; a tolerance compare may be more robust.
  for (size_t i = 0; i < outTensor->ElementsNum(); i++) {
    ASSERT_EQ(output_data[i], outData[i]);
  }
  MS_LOG(INFO) << "Passed";
}
// End-to-end test of a single element-wise Add node with two graph inputs.
// Only checks that the graph compiles and runs and that the output tensor
// has the expected shape/type; the value comparison is commented out.
TEST_F(InferTest, TestAddNode) {
  // --- build the graph: one Add node, tensors 0 and 1 as inputs, 2 as output
  auto meta_graph = std::make_shared<schema::MetaGraphT>();
  meta_graph->name = "graph";
  auto node = std::make_unique<schema::CNodeT>();
  node->inputIndex = {0, 1};
  node->outputIndex = {2};
  node->primitive = std::make_unique<schema::PrimitiveT>();
  node->primitive->value.type = schema::PrimitiveType_Add;
  // Ownership of `primitive` passes to node->primitive->value.
  auto primitive = new schema::AddT;
  node->primitive->value.value = primitive;
  node->name = "Add";
  meta_graph->nodes.emplace_back(std::move(node));
  meta_graph->inputIndex = {0, 1};
  meta_graph->outputIndex = {2};
  // Input tensor 0: 1x28x28x3 float32 NHWC.
  auto input0 = std::make_unique<schema::TensorT>();
  input0->nodeType = schema::NodeType::NodeType_ValueNode;
  input0->format = schema::Format_NHWC;
  input0->dataType = TypeId::kNumberTypeFloat32;
  input0->dims = {1, 28, 28, 3};
  input0->offset = -1;
  meta_graph->allTensors.emplace_back(std::move(input0));
  // Input tensor 1: same shape as tensor 0 (element-wise add operand).
  auto weight = std::make_unique<schema::TensorT>();
  weight->nodeType = schema::NodeType::NodeType_ValueNode;
  weight->format = schema::Format_KHWC;
  weight->dataType = TypeId::kNumberTypeFloat32;
  weight->dims = {1, 28, 28, 3};
  weight->offset = -1;
  meta_graph->allTensors.emplace_back(std::move(weight));
  // Output tensor 2: dims intentionally left empty — presumably inferred
  // during CompileGraph; confirm shape inference fills them.
  auto output = std::make_unique<schema::TensorT>();
  output->nodeType = schema::NodeType::NodeType_Parameter;
  output->format = schema::Format_NHWC;
  output->dataType = TypeId::kNumberTypeFloat32;
  output->offset = -1;
  meta_graph->allTensors.emplace_back(std::move(output));
  // --- serialize and import
  flatbuffers::FlatBufferBuilder builder(1024);
  auto offset = schema::MetaGraph::Pack(builder, meta_graph.get());
  builder.Finish(offset);
  size_t size = builder.GetSize();
  const char *content = reinterpret_cast<char *>(builder.GetBufferPointer());
  auto model = lite::Model::Import(content, size);
  ASSERT_NE(nullptr, model);
  meta_graph.reset();
  content = nullptr;
  // --- session on GPU device
  // NOTE(review): uses DT_GPU — presumably requires a SUPPORT_GPU build to
  // pass; confirm, and note `context` is never deleted.
  auto context = new lite::Context;
  context->cpuBindMode = lite::NO_BIND;
  context->deviceCtx.type = lite::DT_GPU;
  context->threadNum = 4;
  auto session = session::LiteSession::CreateSession(context);
  ASSERT_NE(nullptr, session);
  auto ret = session->CompileGraph(model.get());
  ASSERT_EQ(lite::RET_OK, ret);
  auto inputs = session->GetInputs();
  ASSERT_EQ(inputs.size(), 2);
  // Inputs are touched (MutableData allocates/returns the buffers) but not
  // filled with specific values — the run is a smoke test.
  auto inTensor = inputs.front();
  ASSERT_NE(nullptr, inTensor);
  (void)inTensor->MutableData();
  auto inTensor1 = inputs.back();
  ASSERT_NE(nullptr, inTensor1);
  (void)inTensor1->MutableData();
  ret = session->RunGraph();
  ASSERT_EQ(lite::RET_OK, ret);
  // --- shape/type checks only
  auto outputs = session->GetOutputs();
  ASSERT_EQ(outputs.size(), 1);
  auto outTensor = outputs.front();
  ASSERT_NE(nullptr, outTensor);
  ASSERT_EQ(28 * 28 * 3, outTensor->ElementsNum());
  ASSERT_EQ(TypeId::kNumberTypeFloat32, outTensor->data_type());
  auto *outData = reinterpret_cast<float *>(outTensor->MutableData());
  ASSERT_NE(nullptr, outData);
  // Golden-value comparison kept for reference but disabled (the fixture
  // path belongs to the conv test).
  // //===================================================
  // size_t output_size;
  // std::string output_path = "./convfp32_out_1_28_28_32.bin";
  // ReadFile(output_path.c_str(), &output_size, buf);
  // ASSERT_NE(nullptr, buf[0]);
  // auto output_data = reinterpret_cast<float *>(buf[0]);
  // ASSERT_NE(nullptr, output_data);
  // //===================================================
  // ASSERT_EQ(output_size, outTensor->Size());
  // for (size_t i = 0; i < outTensor->ElementsNum(); i++) {
  //   ASSERT_EQ(output_data[i], outData[i]);
  // }
  MS_LOG(INFO) << "Passed";
}
// Smoke test: load a pre-converted model file (./model.ms) from disk,
// compile it, and run it once on CPU. No output values are checked.
TEST_F(InferTest, TestModel) {
  // Single-slot pointer array filled by Common::ReadFile.
  auto buf = new char *[1];
  size_t model_size;
  std::string model_path = "./model.ms";
  ReadFile(model_path.c_str(), &model_size, buf);
  ASSERT_NE(nullptr, buf[0]);
  auto model = lite::Model::Import(buf[0], model_size);
  ASSERT_NE(nullptr, model);
  // The model owns its own copy after Import, so the raw file buffer
  // can be released here. NOTE(review): `buf` itself (the char*[1]) and
  // `context` below are never freed.
  delete[] buf[0];
  auto context = new lite::Context;
  context->cpuBindMode = lite::NO_BIND;
  context->deviceCtx.type = lite::DT_CPU;
  context->threadNum = 4;
  auto session = session::LiteSession::CreateSession(context);
  ASSERT_NE(nullptr, session);
  auto ret = session->CompileGraph(model.get());
  ASSERT_EQ(lite::RET_OK, ret);
  auto inputs = session->GetInputs();
  ASSERT_EQ(inputs.size(), 1);
  auto inTensor = inputs.front();
  ASSERT_NE(nullptr, inTensor);
  // Allocate the input buffer; contents are left uninitialized — this is
  // a does-it-run test only.
  (void)inTensor->MutableData();
  ret = session->RunGraph();
  ASSERT_EQ(lite::RET_OK, ret);
  auto outputs = session->GetOutputs();
  MS_LOG(INFO) << "Passed";
}
| // TEST_F(TrainTest, TestMultiNode) { | |||
| // auto msGraph = std::make_shared<schema::GraphDefT>(); | |||
| // msGraph->name = "graph"; | |||
| // auto msSubgraph = std::make_unique<schema::SubGraphDefT>(); | |||
| // msSubgraph->name = "subGraph"; | |||
| // | |||
| // auto conv = std::make_unique<schema::OpDefT>(); | |||
| // conv->inputIndex = {0, 1}; | |||
| // conv->outputIndex = {2}; | |||
| // conv->attr.type = schema::OpT_Conv2D; | |||
| // auto conv_attr = new schema::Conv2DT; | |||
| // conv_attr->padMode = schema::PadMode_SAME; | |||
| // conv_attr->format = schema::Format_NHWC; | |||
| // conv_attr->strideH = 1; | |||
| // conv_attr->strideW = 1; | |||
| // conv_attr->kernelH = 3; | |||
| // conv_attr->kernelW = 3; | |||
| // conv_attr->dilateH = 1; | |||
| // conv_attr->dilateW = 1; | |||
| // | |||
| // conv->attr.value = conv_attr; | |||
| // conv->name = "Conv2D"; | |||
| // conv->fmkType = schema::FmkType_CAFFE; | |||
| // msSubgraph->nodes.emplace_back(std::move(conv)); | |||
| // | |||
| // auto matMul1 = std::make_unique<schema::OpDefT>(); | |||
| // matMul1->inputIndex = {2, 3}; | |||
| // matMul1->outputIndex = {4}; | |||
| // matMul1->attr.type = schema::OpT_MatMul; | |||
| // auto matMul_attr1 = new schema::MatMulT; | |||
| // matMul_attr1->transposeA = false; | |||
| // matMul_attr1->transposeB = true; | |||
| // matMul1->attr.value = matMul_attr1; | |||
| // matMul1->name = "matmul1"; | |||
| // matMul1->fmkType = schema::FmkType_CAFFE; | |||
| // msSubgraph->nodes.emplace_back(std::move(matMul1)); | |||
| // | |||
| // auto matMul2 = std::make_unique<schema::OpDefT>(); | |||
| // matMul2->inputIndex = {4, 5}; | |||
| // matMul2->outputIndex = {6}; | |||
| // matMul2->attr.type = schema::OpT_MatMul; | |||
| // auto matMul_attr2 = new schema::MatMulT; | |||
| // matMul_attr2->transposeA = false; | |||
| // matMul_attr2->transposeB = true; | |||
| // matMul2->attr.value = matMul_attr2; | |||
| // matMul2->name = "matmul2"; | |||
| // matMul2->fmkType = schema::FmkType_CAFFE; | |||
| // msSubgraph->nodes.emplace_back(std::move(matMul2)); | |||
| // | |||
| // msSubgraph->inputIndex = {0}; | |||
| // msSubgraph->outputIndex = {6}; | |||
| // | |||
| // auto input0 = std::make_unique<schema::TensorDefT>(); | |||
| // input0->refCount = schema::MSCONST_WEIGHT_REFCOUNT; | |||
| // input0->format = schema::Format_NHWC; | |||
| // input0->dataType = TypeId::kNumberTypeFloat32; | |||
| // input0->dims = {1, 5, 5, 3}; | |||
| // input0->offset = -1; | |||
| // msSubgraph->allTensors.emplace_back(std::move(input0)); | |||
| // | |||
| // auto conv_weight = std::make_unique<schema::TensorDefT>(); | |||
| // conv_weight->refCount = schema::MSCONST_WEIGHT_REFCOUNT; | |||
| // conv_weight->format = schema::Format_KHWC; | |||
| // conv_weight->dataType = TypeId::kNumberTypeFloat32; | |||
| // conv_weight->dims = {8, 3, 3, 3}; | |||
| // conv_weight->data.resize(8*3*3*3*sizeof(float)); | |||
| // msSubgraph->allTensors.emplace_back(std::move(conv_weight)); | |||
| // | |||
| // auto conv_output = std::make_unique<schema::TensorDefT>(); | |||
| // conv_output->refCount = 0; | |||
| // conv_output->format = schema::Format_NHWC; | |||
| // conv_output->dataType = TypeId::kNumberTypeFloat32; | |||
| // conv_output->dims = {1, 5, 5, 8}; | |||
| // msSubgraph->allTensors.emplace_back(std::move(conv_output)); | |||
| // | |||
| // auto add_weight = std::make_unique<schema::TensorDefT>(); | |||
| // add_weight->refCount = schema::MSCONST_WEIGHT_REFCOUNT; | |||
| // add_weight->format = schema::Format_NHWC; | |||
| // add_weight->dataType = TypeId::kNumberTypeFloat32; | |||
| // add_weight->dims = {1, 5, 5, 8}; | |||
| // add_weight->data.resize(5*5*8*sizeof(float)); | |||
| // msSubgraph->allTensors.emplace_back(std::move(add_weight)); | |||
| // | |||
| // auto add_output = std::make_unique<schema::TensorDefT>(); | |||
| // add_output->refCount = 0; | |||
| // add_output->format = schema::Format_NHWC; | |||
| // add_output->dataType = TypeId::kNumberTypeFloat32; | |||
| // add_output->dims = {1, 5, 5, 8}; | |||
| // msSubgraph->allTensors.emplace_back(std::move(add_output)); | |||
| // | |||
| // auto mul_weight = std::make_unique<schema::TensorDefT>(); | |||
| // mul_weight->refCount = schema::MSCONST_WEIGHT_REFCOUNT; | |||
| // mul_weight->format = schema::Format_NHWC; | |||
| // mul_weight->dataType = TypeId::kNumberTypeFloat32; | |||
| // mul_weight->dims = {1, 5, 5, 8}; | |||
| // mul_weight->data.resize(5*5*8*sizeof(float)); | |||
| // msSubgraph->allTensors.emplace_back(std::move(mul_weight)); | |||
| // | |||
| // auto mul_output = std::make_unique<schema::TensorDefT>(); | |||
| // mul_output->refCount = 0; | |||
| // mul_output->format = schema::Format_NHWC; | |||
| // mul_output->dataType = TypeId::kNumberTypeFloat32; | |||
| // mul_output->dims = {1, 5, 5, 8}; | |||
| // msSubgraph->allTensors.emplace_back(std::move(mul_output)); | |||
| // msGraph->subgraphs.emplace_back(std::move(msSubgraph)); | |||
| // | |||
| // flatbuffers::FlatBufferBuilder builder(1024); | |||
| // auto offset = schema::GraphDef::Pack(builder, msGraph.get()); | |||
| // builder.Finish(offset); | |||
| // size_t size = builder.GetSize(); | |||
| // const char *content = (char *)builder.GetBufferPointer(); | |||
| // const std::string strstub = ""; | |||
| // | |||
| // auto func_graph = inference::LoadModel(content, size, strstub); | |||
| // ASSERT_NE(nullptr, func_graph); | |||
| // auto session = inference::MSSession::CreateSession(kCPUDevice, 0); | |||
| // ASSERT_NE(nullptr, session); | |||
| // auto graphId = session->CompileGraph(func_graph); | |||
| // | |||
| // auto inTensor = | |||
| // std::shared_ptr<inference::MSTensor>(inference::MSTensor::CreateTensor(TypeId::kNumberTypeFloat32, {1, 5, 5, 3})); | |||
| // ASSERT_NE(nullptr, inTensor); | |||
| // ASSERT_EQ(sizeof(float) * (5 * 5 * 3), inTensor->Size()); | |||
| // (void)inTensor->MutableData(); | |||
| // | |||
| // std::vector<std::shared_ptr<inference::MSTensor>> inputs; | |||
| // inputs.emplace_back(inTensor); | |||
| // auto outputs = session->RunGraph(graphId, inputs); | |||
| // ASSERT_EQ(1, outputs.size()); | |||
| // ASSERT_EQ(1, outputs.front().size()); | |||
| // auto runOutput = outputs.front().front(); | |||
| // ASSERT_NE(nullptr, runOutput); | |||
| // ASSERT_EQ(5 * 5 * 8, runOutput->ElementsNum()); | |||
| // ASSERT_EQ(TypeId::kNumberTypeFloat32, runOutput->data_type()); | |||
| // MS_LOG(INFO) << "Passed"; | |||
| //} | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,303 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <iostream> | |||
| #include <memory> | |||
| #include "mindspore/core/utils/log_adapter.h" | |||
| #include "common/common_test.h" | |||
| #include "mindspore/lite/src/common/file_utils.h" | |||
| #include "mindspore/lite/src/runtime/kernel/arm/opclib/pack.h" | |||
| #include "mindspore/lite/src/runtime/kernel/arm/base/convolution_base.h" | |||
| namespace mindspore { | |||
| class TestPack : public mindspore::Common { | |||
| public: | |||
| TestPack() {} | |||
| }; | |||
// Fills conv_param with the fixed convolution shape shared by every pack test
// in this file: a 1x28x28x3 input, 3x3 kernel, stride 1, dilation 1 and
// padding 1, producing a 1x28x28x32 output.
void InitConvParamPack(ConvParameter *conv_param) {
  // Input tensor: NHWC 1x28x28x3.
  conv_param->input_batch_ = 1;
  conv_param->input_h_ = 28;
  conv_param->input_w_ = 28;
  conv_param->input_channel_ = 3;
  // Output tensor: NHWC 1x28x28x32 (same spatial size — "same" padding).
  conv_param->output_batch_ = 1;
  conv_param->output_h_ = 28;
  conv_param->output_w_ = 28;
  conv_param->output_channel_ = 32;
  // 3x3 kernel, unit stride and dilation, one pixel of padding on each side.
  conv_param->kernel_h_ = 3;
  conv_param->kernel_w_ = 3;
  conv_param->stride_h_ = 1;
  conv_param->stride_w_ = 1;
  conv_param->dilation_h_ = 1;
  conv_param->dilation_w_ = 1;
  conv_param->pad_h_ = 1;
  conv_param->pad_w_ = 1;
}
| TEST_F(TestPack, PackInputFp32) { | |||
| size_t input_size; | |||
| std::string input_path = "./test_data/conv/convfp32_input_1_28_28_3.bin"; | |||
| auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); | |||
| auto conv_param = new ConvParameter; | |||
| InitConvParamPack(conv_param); | |||
| int kernel_h = conv_param->kernel_h_; | |||
| int kernel_w = conv_param->kernel_w_; | |||
| int in_batch = conv_param->input_batch_; | |||
| int in_channel = conv_param->input_channel_; | |||
| int in_h = conv_param->input_h_; | |||
| int in_w = conv_param->input_w_; | |||
| int out_h = conv_param->output_h_; | |||
| int out_w = conv_param->output_w_; | |||
| int thread_count = 1; | |||
| int tile_n = 8; | |||
| int output_count = out_h * out_w; | |||
| int output_tile_count = UP_DIV(output_count, tile_n); | |||
| int inchannel_block = 4; | |||
| int channel_block = UP_DIV(in_channel, inchannel_block); | |||
| int kernel_plane = kernel_h * kernel_w; | |||
| int unit_size = kernel_plane * channel_block * inchannel_block; | |||
| int packed_input_size = output_tile_count * tile_n * unit_size; | |||
| auto packed_input = reinterpret_cast<float *>(malloc(in_batch * packed_input_size * sizeof(float))); | |||
| memset(packed_input, 0, in_batch * packed_input_size * sizeof(float)); | |||
| for (int b = 0; b < in_batch; b++) { | |||
| int in_batch_offset = b * in_channel * in_h * in_w; | |||
| int gemm_in_batch_offset = b * packed_input_size; | |||
| for (int thread_id = 0; thread_id < output_tile_count; thread_id += thread_count) { | |||
| int start_index = thread_id * tile_n; | |||
| int real_cal_num = (output_count - start_index) < tile_n ? (output_count - tile_n) : tile_n; | |||
| float *gemm_input = | |||
| reinterpret_cast<float *>(packed_input) + thread_id * unit_size * tile_n + gemm_in_batch_offset; | |||
| Im2ColPackUnitFp32(input_data + in_batch_offset, conv_param, gemm_input, real_cal_num, start_index); | |||
| } | |||
| } | |||
| printf("==================output data=================\n"); | |||
| for (int i = 0; i < 20; i++) { | |||
| std::cout << packed_input[i] << " ,"; | |||
| } | |||
| std::cout << std::endl; | |||
| std::string file_path = "./test_data/conv/convfp32_packinput.txt"; | |||
| // mindspore::lite::WriteToTxt<float>(file_path, packed_data, in_batch * packed_input_size); | |||
| delete input_data; | |||
| delete conv_param; | |||
| free(packed_input); | |||
| MS_LOG(INFO) << "TestPackInputFp32 passed"; | |||
| } | |||
| TEST_F(TestPack, PackWeightFp32) { | |||
| auto conv_param = new ConvParameter; | |||
| InitConvParamPack(conv_param); | |||
| int k_h = conv_param->kernel_h_; | |||
| int k_w = conv_param->kernel_w_; | |||
| int in_channel = conv_param->input_channel_; | |||
| int out_channel = conv_param->output_channel_; | |||
| int ic4 = UP_DIV(in_channel, C4NUM); | |||
| int oc8 = UP_DIV(out_channel, C8NUM); | |||
| size_t weight_size; | |||
| std::string weight_path = "./test_data/conv/convfp32_weight_32_3_3_3.bin"; | |||
| auto weight_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(weight_path.c_str(), &weight_size)); | |||
| auto packed_weight = reinterpret_cast<float *>(malloc(k_h * k_w * ic4 * C4NUM * oc8 * C8NUM * sizeof(float))); | |||
| PackWeightFp32(weight_data, conv_param, packed_weight); | |||
| printf("==================output data=================\n"); | |||
| for (int i = 0; i < 20; i++) { | |||
| std::cout << packed_weight[i] << " ,"; | |||
| } | |||
| std::cout << std::endl; | |||
| free(packed_weight); | |||
| delete conv_param; | |||
| MS_LOG(INFO) << "TestPackWeightFp32 passed"; | |||
| } | |||
#ifdef ENABLE_FP16
TEST_F(TestPack, PackInputFp16) {
  // Converts the fp32 conv input fixture to fp16 element-by-element, then
  // im2col-packs it with the fp16 tiling (tile 16, channel block 8) and
  // prints the first 20 packed values.
  size_t input_size;
  std::string input_path = "./test_data/conv/convfp32_input_1_28_28_3.bin";
  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
  ASSERT_NE(nullptr, input_data);  // fail fast if the fixture file is missing
  int input_ele_size = input_size / sizeof(float);
  auto fp16_input_data = new float16_t[input_ele_size];
  for (int i = 0; i < input_ele_size; i++) {
    fp16_input_data[i] = (float16_t)input_data[i];
  }
  auto conv_param = new ConvParameter;
  InitConvParamPack(conv_param);
  int kernel_h = conv_param->kernel_h_;
  int kernel_w = conv_param->kernel_w_;
  int in_batch = conv_param->input_batch_;
  int in_channel = conv_param->input_channel_;
  int in_h = conv_param->input_h_;
  int in_w = conv_param->input_w_;
  int out_h = conv_param->output_h_;
  int out_w = conv_param->output_w_;
  int thread_count = 1;
  int tile_n = 16;
  int output_count = out_h * out_w;
  int output_tile_count = UP_DIV(output_count, tile_n);
  int inchannel_block = 8;
  int channel_block = UP_DIV(in_channel, inchannel_block);
  int kernel_plane = kernel_h * kernel_w;
  int unit_size = kernel_plane * channel_block * inchannel_block;
  int packed_input_size = output_tile_count * tile_n * unit_size;
  auto packed_input = reinterpret_cast<float *>(malloc(in_batch * packed_input_size * sizeof(float16_t)));
  ASSERT_NE(nullptr, packed_input);
  memset(packed_input, 0, in_batch * packed_input_size * sizeof(float16_t));
  for (int b = 0; b < in_batch; b++) {
    int in_batch_offset = b * in_channel * in_h * in_w;
    int gemm_in_batch_offset = b * packed_input_size;
    for (int thread_id = 0; thread_id < output_tile_count; thread_id += thread_count) {
      int start_index = thread_id * tile_n;
      // fix: a partial last tile must process the REMAINING elements
      // (output_count - start_index), not "output_count - tile_n".
      int real_cal_num = (output_count - start_index) < tile_n ? (output_count - start_index) : tile_n;
      float16_t *gemm_input =
        reinterpret_cast<float16_t *>(packed_input) + thread_id * unit_size * tile_n + gemm_in_batch_offset;
      Im2ColPackUnitFp16(fp16_input_data + in_batch_offset, conv_param, gemm_input, real_cal_num, start_index);
    }
  }
  printf("==================output data=================\n");
  for (int i = 0; i < 20; i++) {
    std::cout << packed_input[i] << " ,";
  }
  std::cout << std::endl;
  delete input_data;
  delete[] fp16_input_data;
  delete conv_param;
  // fix: packed_input comes from malloc, so releasing it with "delete" was
  // undefined behavior; it must be paired with free().
  free(packed_input);
  MS_LOG(INFO) << "TestPackInputFp16 passed";
}
#endif
| TEST_F(TestPack, PackInputUint8) { | |||
| auto conv_param = new ConvParameter; | |||
| InitConvParamPack(conv_param); | |||
| int kernel_h = conv_param->kernel_h_; | |||
| int kernel_w = conv_param->kernel_w_; | |||
| int in_batch = conv_param->input_batch_; | |||
| int in_channel = conv_param->input_channel_; | |||
| int in_h = conv_param->input_h_; | |||
| int in_w = conv_param->input_w_; | |||
| int out_h = conv_param->output_h_; | |||
| int out_w = conv_param->output_w_; | |||
| int thread_count = 1; | |||
| int tile_n = 8; | |||
| int output_count = out_h * out_w; | |||
| int output_tile_count = UP_DIV(output_count, tile_n); | |||
| int inchannel_block = 4; | |||
| int channel_block = UP_DIV(in_channel, inchannel_block); | |||
| int kernel_plane = kernel_h * kernel_w; | |||
| int unit_size = kernel_plane * channel_block * inchannel_block; | |||
| int packed_input_size = output_tile_count * tile_n * unit_size; | |||
| // input | |||
| size_t input_size; | |||
| std::string input_path = "./test_data/conv/convuint8_input_1_28_28_3.bin"; | |||
| auto input_data = reinterpret_cast<uint8_t *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); | |||
| auto int8_input = reinterpret_cast<int8_t *>(malloc(input_size)); | |||
| for (int i = 0; i < input_size; i++) { | |||
| int8_input[i] = (int8_t)(input_data[i] - 128); | |||
| } | |||
| auto packed_input = reinterpret_cast<int8_t *>(malloc(in_batch * packed_input_size)); | |||
| memset(packed_input, 0, in_batch * packed_input_size); | |||
| int32_t *input_sum = reinterpret_cast<int32_t *>(malloc(tile_n * thread_count * sizeof(int32_t))); | |||
| for (int b = 0; b < in_batch; b++) { | |||
| int in_batch_offset = b * in_channel * in_h * in_w; | |||
| int gemm_in_batch_offset = b * packed_input_size; | |||
| for (int thread_id = 0; thread_id < output_tile_count; thread_id += thread_count) { | |||
| int start_index = thread_id * tile_n; | |||
| int real_cal_num = (output_count - start_index) < tile_n ? (output_count - tile_n) : tile_n; | |||
| int8_t *gemm_input = | |||
| reinterpret_cast<int8_t *>(packed_input) + thread_id * unit_size * tile_n + gemm_in_batch_offset; | |||
| memset(input_sum, 0, tile_n * thread_count * sizeof(int32_t)); | |||
| Im2ColPackUnitInt8(int8_input + in_batch_offset, gemm_input, real_cal_num, start_index, input_sum, conv_param); | |||
| } | |||
| } | |||
| printf("==================output data=================\n"); | |||
| for (int i = 0; i < 20; i++) { | |||
| std::cout << static_cast<int>(packed_input[i]) << " ,"; | |||
| } | |||
| std::cout << std::endl; | |||
| delete input_data; | |||
| delete conv_param; | |||
| free(int8_input); | |||
| free(packed_input); | |||
| free(input_sum); | |||
| MS_LOG(INFO) << "TestPackInputUint8 passed"; | |||
| } | |||
| TEST_F(TestPack, PackWeightUint8) { | |||
| auto conv_param = new ConvParameter; | |||
| InitConvParamPack(conv_param); | |||
| int k_h = conv_param->kernel_h_; | |||
| int k_w = conv_param->kernel_w_; | |||
| int in_channel = conv_param->input_channel_; | |||
| int out_channel = conv_param->output_channel_; | |||
| int ic4 = UP_DIV(in_channel, C4NUM); | |||
| int oc4 = UP_DIV(out_channel, C4NUM); | |||
| size_t weight_size; | |||
| std::string weight_path = "./test_data/conv/convuint8_weight_32_3_3_3.bin"; | |||
| auto weight_data = reinterpret_cast<uint8_t *>(mindspore::lite::ReadFile(weight_path.c_str(), &weight_size)); | |||
| auto int8_weight = reinterpret_cast<int8_t *>(malloc(weight_size)); | |||
| for (int i = 0; i < weight_size; i++) { | |||
| int8_weight[i] = (int8_t)(weight_data[i] - 128); | |||
| } | |||
| int32_t filter_zp = 20; | |||
| int32_t *weight_sum = reinterpret_cast<int32_t *>(malloc(sizeof(int32_t) * out_channel)); | |||
| for (int i = 0; i < out_channel; i++) weight_sum[i] = filter_zp * ic4 * C4NUM * k_h * k_w; | |||
| auto packed_weight = reinterpret_cast<int8_t *>(malloc(k_h * k_w * ic4 * C4NUM * oc4 * C4NUM)); | |||
| PackWeightInt8(int8_weight, conv_param, packed_weight, weight_sum); | |||
| printf("==================output data=================\n"); | |||
| for (int i = 0; i < 20; i++) { | |||
| std::cout << static_cast<int>(packed_weight[i]) << " ,"; | |||
| } | |||
| std::cout << std::endl; | |||
| free(weight_sum); | |||
| free(int8_weight); | |||
| free(packed_weight); | |||
| delete conv_param; | |||
| MS_LOG(INFO) << "TestPackWeightUint8 passed"; | |||
| } | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,593 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <iostream> | |||
| #include <memory> | |||
| #include "mindspore/core/utils/log_adapter.h" | |||
| #include "common/common_test.h" | |||
| #include "mindspore/lite/src/common/utils.h" | |||
| #include "src/common/file_utils.h" | |||
| #include "mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h" | |||
| #include "mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h" | |||
| #include "src/runtime/kernel/arm/opclib/fp16/conv_fp16.h" | |||
| namespace mindspore { | |||
| class TestConvolutionFp16 : public mindspore::Common { | |||
| public: | |||
| TestConvolutionFp16() {} | |||
| }; | |||
// Fills conv_param with the "group 1" fp16 test shape: single-threaded
// 1x28x28x3 -> 1x28x28x32 convolution with a 3x3 kernel, stride 1,
// dilation 1 and padding 1 ("same" spatial size).
void InitConvParamGroup1Fp16(ConvParameter *conv_param) {
  // Input tensor: NHWC 1x28x28x3.
  conv_param->input_batch_ = 1;
  conv_param->input_h_ = 28;
  conv_param->input_w_ = 28;
  conv_param->input_channel_ = 3;
  // Output tensor: NHWC 1x28x28x32.
  conv_param->output_batch_ = 1;
  conv_param->output_h_ = 28;
  conv_param->output_w_ = 28;
  conv_param->output_channel_ = 32;
  // 3x3 kernel, unit stride/dilation, one pixel of padding per side.
  conv_param->kernel_h_ = 3;
  conv_param->kernel_w_ = 3;
  conv_param->stride_h_ = 1;
  conv_param->stride_w_ = 1;
  conv_param->dilation_h_ = 1;
  conv_param->dilation_w_ = 1;
  conv_param->pad_h_ = 1;
  conv_param->pad_w_ = 1;
  // All tests in this file run the kernel on a single thread.
  conv_param->thread_num_ = 1;
}
// Fills conv_param with the "group 2" fp16 test shape: single-threaded
// 1x128x128x32 -> 1x128x128x32 convolution with a 3x3 kernel, stride 1,
// dilation 1 and padding 1 — a larger workload used for timing in ConvTest2.
void InitConvParamGroup2Fp16(ConvParameter *conv_param) {
  // Input tensor: NHWC 1x128x128x32.
  conv_param->input_batch_ = 1;
  conv_param->input_h_ = 128;
  conv_param->input_w_ = 128;
  conv_param->input_channel_ = 32;
  // Output tensor: NHWC 1x128x128x32 (same spatial size).
  conv_param->output_batch_ = 1;
  conv_param->output_h_ = 128;
  conv_param->output_w_ = 128;
  conv_param->output_channel_ = 32;
  // 3x3 kernel, unit stride/dilation, one pixel of padding per side.
  conv_param->kernel_h_ = 3;
  conv_param->kernel_w_ = 3;
  conv_param->stride_h_ = 1;
  conv_param->stride_w_ = 1;
  conv_param->dilation_h_ = 1;
  conv_param->dilation_w_ = 1;
  conv_param->pad_h_ = 1;
  conv_param->pad_w_ = 1;
  // Single-threaded execution.
  conv_param->thread_num_ = 1;
}
TEST_F(TestConvolutionFp16, ConvTest1) {
  // End-to-end fp16 convolution for the 1x28x28x3 -> 1x28x28x32 shape:
  // converts fp32 fixtures to fp16, packs weight and input, times 100
  // ConvFp16 runs, converts the fp16 output back to fp32 and compares it
  // against a reference binary via lite::CompareOutput.
  // prepare stage
  auto conv_param = new ConvParameter();
  InitConvParamGroup1Fp16(conv_param);
  int tile_num = 16;
  int k_h = conv_param->kernel_h_;
  int k_w = conv_param->kernel_w_;
  int kernel_plane = k_h * k_w;
  int in_batch = conv_param->input_batch_;
  int in_channel = conv_param->input_channel_;
  int i_h = conv_param->input_h_;
  int i_w = conv_param->input_w_;
  int out_channel = conv_param->output_channel_;
  // Channel blocking: input rounded up to blocks of 4, output to blocks of 8.
  int ic4 = UP_DIV(in_channel, C4NUM);
  int oc8 = UP_DIV(out_channel, C8NUM);
  size_t weight_size;
  std::string weight_path = "./test_data/conv/convfp32_weight_32_3_3_3.bin";
  auto weight_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(weight_path.c_str(), &weight_size));
  std::cout << "==============fp32 weight data===========" << std::endl;
  for (int i = 0; i < 20; i++) {
    std::cout << weight_data[i] << ", ";
  }
  std::cout << std::endl;
  std::cout << "weight data size: " << weight_size / sizeof(float) << std::endl;
  // Element-wise narrowing of the fp32 weights to fp16.
  int weight_ele_size = weight_size / sizeof(float);
  auto fp16_weight_data = new float16_t[weight_ele_size];
  for (int i = 0; i < weight_ele_size; i++) {
    fp16_weight_data[i] = static_cast<float16_t>(weight_data[i]);
  }
  std::cout << "==============fp16 weight data===========" << std::endl;
  for (int i = 0; i < 20; i++) {
    std::cout << fp16_weight_data[i] << ", ";
  }
  std::cout << std::endl;
  auto packed_weight = reinterpret_cast<float16_t *>(malloc(k_h * k_w * ic4 * C4NUM * oc8 * C8NUM * sizeof(float16_t)));
  PackWeightFp16(fp16_weight_data, conv_param, packed_weight);
  std::cout << "==============fp16 packed weight data===========" << std::endl;
  for (int i = 0; i < 20; i++) {
    std::cout << packed_weight[i] << ", ";
  }
  std::cout << std::endl;
  size_t input_size;
  std::string input_path = "./test_data/conv/convfp32_input_1_28_28_3.bin";
  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
  std::cout << "==============fp32 input data===========" << std::endl;
  for (int i = 0; i < 20; i++) {
    std::cout << input_data[i] << ", ";
  }
  std::cout << std::endl;
  // Element-wise narrowing of the fp32 input to fp16.
  int input_ele_size = input_size / sizeof(float);
  auto fp16_input_data = new float16_t[input_ele_size];
  for (int i = 0; i < input_ele_size; i++) {
    fp16_input_data[i] = static_cast<float16_t>(input_data[i]);
  }
  auto nhwc4_input_data = reinterpret_cast<float16_t *>(malloc(i_h * i_w * ic4 * C4NUM* sizeof(float16_t)));
  // NOTE(review): this passes float16_t buffers to a *Fp32* pack routine —
  // if PackNHWCToNHWC4Fp32 assumes 4-byte elements internally, the packed
  // layout here is wrong; confirm the routine is element-size agnostic or
  // switch to an fp16 variant.
  PackNHWCToNHWC4Fp32(fp16_input_data, nhwc4_input_data, 1, i_h * i_w, in_channel);
  std::cout << "==============fp16 input data===========" << std::endl;
  for (int i = 0; i < 20; i++) {
    std::cout << fp16_input_data[i] << ", ";
  }
  std::cout << std::endl;
  // Scratch buffers for the tiled im2col + GEMM inside ConvFp16.
  int output_count = conv_param->output_h_ * conv_param->output_w_;
  int output_tile_count = UP_DIV(output_count, tile_num);
  int unit_size = kernel_plane * ic4 * C4NUM;
  int packed_input_size = output_tile_count * tile_num * unit_size;
  auto packed_input = reinterpret_cast<float16_t *>(malloc(in_batch * packed_input_size * sizeof(float16_t)));
  memset(packed_input, 0, in_batch * packed_input_size * sizeof(float16_t));
  // Zero bias: this test checks only the convolution itself.
  auto bias_data = reinterpret_cast<float16_t *>(malloc(conv_param->output_channel_ * sizeof(float16_t)));
  memset(bias_data, 0, conv_param->output_channel_ * sizeof(float16_t));
  size_t output_data_size =
    conv_param->output_batch_ * conv_param->output_channel_ * conv_param->output_h_ * conv_param->output_w_;
  auto output_data = new float16_t[output_data_size];
  auto tmp_output_block = reinterpret_cast<float16_t *>(malloc(tile_num * out_channel * sizeof(float16_t)));
  // runtime part
  printf("Calculating runtime cost...\n");
  uint64_t time_avg = 0;
  // warmup
  for (int i = 0; i < 3; i++) {
    ConvFp16(nhwc4_input_data, packed_input, packed_weight, bias_data, tmp_output_block, output_data, 0, conv_param);
  }
  int loop_count = 100;
  auto time_start = mindspore::lite::GetTimeUs();
  for (int i = 0; i < loop_count; i++) {
    ConvFp16(nhwc4_input_data, packed_input, packed_weight, bias_data, tmp_output_block, output_data, 0, conv_param);
  }
  auto time_end = mindspore::lite::GetTimeUs();
  auto cost = time_end - time_start;
  time_avg = cost / loop_count;
  printf("single thread running time : %f ms\n", time_avg / 1000.0f);
  std::cout << "==============fp16 output data===========" << std::endl;
  for (int i = 0; i < 20; i++) {
    std::cout << output_data[i] << ", ";
  }
  std::cout << std::endl;
  // Widen the fp16 result back to fp32 for comparison with the reference.
  auto fp32_output_data = new float[output_data_size];
  for (int i = 0; i < output_data_size; i++) {
    fp32_output_data[i] = static_cast<float>(output_data[i]);
  }
  printf("==================output data=================\n");
  for (int i = 0; i < 20; i++) {
    std::cout << fp32_output_data[i] << " ,";
  }
  std::cout << std::endl;
  std::string output_path = "./test_data/conv/convfp32_out_1_28_28_32.bin";
  lite::CompareOutput(fp32_output_data, output_path);
  free(nhwc4_input_data);
  free(packed_input);
  free(bias_data);
  free(packed_weight);
  free(tmp_output_block);
  delete conv_param;
  // NOTE(review): input_data / weight_data come from ReadFile; scalar delete
  // is used here — confirm ReadFile's allocation matches (delete[] may be
  // required if it uses new char[]).
  delete input_data;
  delete weight_data;
  delete[] fp16_weight_data;
  delete[] fp16_input_data;
  delete[] fp32_output_data;
  delete[] output_data;
  MS_LOG(INFO) << "TestConvolutionFp16 passed";
}
TEST_F(TestConvolutionFp16, ConvTest2) {
  // Larger fp16 convolution (1x128x128x32 -> 1x128x128x32): converts fp32
  // fixtures to fp16, packs the weight, times 100 ConvFp16 runs and compares
  // the fp32-converted output against a reference binary.
  // prepare stage
  auto conv_param = new ConvParameter();
  InitConvParamGroup2Fp16(conv_param);
  // parameter
  int tile_num = 16;
  int k_h = conv_param->kernel_h_;
  int k_w = conv_param->kernel_w_;
  int kernel_plane = k_h * k_w;
  int in_batch = conv_param->input_batch_;
  int in_channel = conv_param->input_channel_;
  int out_channel = conv_param->output_channel_;
  // Channel blocking: input rounded up to blocks of 4, output to blocks of 8.
  int ic4 = UP_DIV(in_channel, C4NUM);
  int oc8 = UP_DIV(out_channel, C8NUM);
  // weight
  size_t weight_size;
  std::string weight_path = "./test_data/conv/convfp32_weight_32_3_3_32.bin";
  auto weight_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(weight_path.c_str(), &weight_size));
  int weight_ele_size = weight_size / sizeof(float);
  auto fp16_weight_data = new float16_t[weight_ele_size];
  for (int i = 0; i < weight_ele_size; i++) {
    fp16_weight_data[i] = static_cast<float16_t>(weight_data[i]);
  }
  auto packed_weight = reinterpret_cast<float16_t *>(malloc(k_h * k_w * ic4 * C4NUM * oc8 * C8NUM * sizeof(float16_t)));
  PackWeightFp16(fp16_weight_data, conv_param, packed_weight);
  // input
  size_t input_size;
  std::string input_path = "./test_data/conv/convfp32_input_1_128_128_32.bin";
  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
  int input_ele_size = input_size / sizeof(float);
  auto fp16_input_data = new float16_t[input_ele_size];
  for (int i = 0; i < input_ele_size; i++) {
    fp16_input_data[i] = static_cast<float16_t>(input_data[i]);
  }
  // Scratch buffers for the tiled im2col + GEMM inside ConvFp16.
  int output_count = conv_param->output_h_ * conv_param->output_w_;
  int output_tile_count = UP_DIV(output_count, tile_num);
  int unit_size = kernel_plane * ic4 * C4NUM;
  int packed_input_size = output_tile_count * tile_num * unit_size;
  auto packed_input = reinterpret_cast<float16_t *>(malloc(in_batch * packed_input_size * sizeof(float16_t)));
  memset(packed_input, 0, in_batch * packed_input_size * sizeof(float16_t));
  // bias
  // Zero bias: this test checks only the convolution itself.
  auto bias_data = reinterpret_cast<float16_t *>(malloc(conv_param->output_channel_ * sizeof(float16_t)));
  memset(bias_data, 0, conv_param->output_channel_ * sizeof(float16_t));
  // output
  auto tmp_output_block = reinterpret_cast<float16_t *>(malloc(tile_num * out_channel * sizeof(float16_t)));
  size_t output_data_size =
    conv_param->output_batch_ * conv_param->output_channel_ * conv_param->output_h_ * conv_param->output_w_;
  auto output_data = new float16_t[output_data_size];
  // runtime part
  printf("Calculating runtime cost...\n");
  uint64_t time_avg = 0;
  // warmup
  // NOTE(review): unlike ConvTest1, the raw NHWC fp16 input is fed to
  // ConvFp16 directly without the NHWC4 pre-pack step — confirm which layout
  // ConvFp16 expects for its first argument.
  for (int i = 0; i < 3; i++) {
    ConvFp16(fp16_input_data, packed_input, packed_weight, bias_data, tmp_output_block, output_data, 0, conv_param);
  }
  int loop_count = 100;
  auto time_start = mindspore::lite::GetTimeUs();
  for (int i = 0; i < loop_count; i++) {
    ConvFp16(fp16_input_data, packed_input, packed_weight, bias_data, tmp_output_block, output_data, 0, conv_param);
  }
  auto time_end = mindspore::lite::GetTimeUs();
  auto cost = time_end - time_start;
  time_avg = cost / loop_count;
  printf("single thread running time : %f ms\n", time_avg / 1000.0f);
  std::cout << "==============fp16 output data===========" << std::endl;
  for (int i = 0; i < 20; i++) {
    std::cout << output_data[i] << ", ";
  }
  std::cout << std::endl;
  // Widen the fp16 result back to fp32 for comparison with the reference.
  auto fp32_output_data = new float[output_data_size];
  for (int i = 0; i < output_data_size; i++) {
    fp32_output_data[i] = static_cast<float>(output_data[i]);
  }
  printf("==================output data=================\n");
  for (int i = 0; i < 20; i++) {
    std::cout << fp32_output_data[i] << " ,";
  }
  std::cout << std::endl;
  std::string output_path = "./test_data/conv/convfp32_out_1_128_128_32.bin";
  lite::CompareOutput(fp32_output_data, output_path);
  free(packed_input);
  free(bias_data);
  free(packed_weight);
  free(tmp_output_block);
  delete conv_param;
  // NOTE(review): input_data / weight_data come from ReadFile; scalar delete
  // is used here — confirm ReadFile's allocation matches (delete[] may be
  // required if it uses new char[]).
  delete input_data;
  delete weight_data;
  delete[] fp16_weight_data;
  delete[] fp16_input_data;
  delete[] fp32_output_data;
  delete[] output_data;
  MS_LOG(INFO) << "TestConvolutionFp16 passed";
}
| TEST_F(TestConvolutionFp16, Conv3x3Test1) { | |||
| auto conv_param = new ConvParameter(); | |||
| InitConvParamGroup1Fp16(conv_param); | |||
| // todo | |||
| int thread_count = 1; | |||
| int tile_num = 16; | |||
| int output_batch = conv_param->output_batch_; | |||
| int output_h = conv_param->output_h_; | |||
| int output_w = conv_param->output_w_; | |||
| int ic4 = UP_DIV(conv_param->input_channel_, C4NUM); | |||
| int oc8 = UP_DIV(conv_param->output_channel_, C8NUM); | |||
| // tmp buffer | |||
| int k_plane = 36; | |||
| size_t tile_buffer_size = thread_count * tile_num * k_plane * ic4 * C4NUM * sizeof(float16_t); | |||
| float16_t *tile_buffer = reinterpret_cast<float16_t *>(malloc(tile_buffer_size)); | |||
| memset(tile_buffer, 0, tile_buffer_size); | |||
| size_t block_unit_buffer_size = thread_count * k_plane * C4NUM * sizeof(float16_t); | |||
| float16_t *block_unit_buffer = reinterpret_cast<float16_t *>(malloc(block_unit_buffer_size)); | |||
| memset(block_unit_buffer, 0, block_unit_buffer_size); | |||
| size_t tmp_dst_buffer_size = thread_count * tile_num * k_plane * oc8 * C8NUM * sizeof(float16_t); | |||
| float16_t *tmp_dst_buffer = reinterpret_cast<float16_t *>(malloc(tmp_dst_buffer_size)); | |||
| memset(tmp_dst_buffer, 0, tmp_dst_buffer_size); | |||
| size_t tmp_out_size = oc8 * C8NUM * output_batch * output_h * output_w * tile_num * sizeof(float16_t); | |||
| float16_t *tmp_out = reinterpret_cast<float16_t *>(malloc(tmp_out_size)); | |||
| memset(tmp_out, 0, tmp_out_size); | |||
| // weight | |||
| size_t weight_size; | |||
| std::string weight_path = "./test_data/conv/convfp32_weight_32_3_3_3.bin"; | |||
| auto weight_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(weight_path.c_str(), &weight_size)); | |||
| std::cout << "==============fp32 weight data===========" << std::endl; | |||
| for (int i = 0; i < 20; i++) { | |||
| std::cout << weight_data[i] << ", "; | |||
| } | |||
| std::cout << std::endl; | |||
| std::cout << "weight data size: " << weight_size / sizeof(float) << std::endl; | |||
| int weight_ele_size = weight_size / sizeof(float); | |||
| auto fp16_weight_data = new float16_t[weight_ele_size]; | |||
| for (int i = 0; i < weight_ele_size; i++) { | |||
| fp16_weight_data[i] = (float16_t)weight_data[i]; | |||
| } | |||
| std::cout << "==============fp16 weight data===========" << std::endl; | |||
| for (int i = 0; i < 20; i++) { | |||
| std::cout << fp16_weight_data[i] << ", "; | |||
| } | |||
| std::cout << std::endl; | |||
| size_t transformed_size = ic4 * C4NUM * oc8 * C8NUM * 36; | |||
| auto transformed_weight_data = new float16_t[transformed_size]; | |||
| memset(transformed_weight_data, 0, transformed_size * sizeof(float16_t)); | |||
| kernel::ProcessFilterFp16(fp16_weight_data, transformed_weight_data, conv_param); | |||
| // bias | |||
| auto bias_data = | |||
| reinterpret_cast<float16_t *>(malloc(UP_DIV(conv_param->output_channel_, 8) * 8 * sizeof(float16_t))); | |||
| memset(bias_data, 0, UP_DIV(conv_param->output_channel_, 8) * 8 * sizeof(float16_t)); | |||
| // input | |||
| size_t input_size; | |||
| std::string input_path = "./test_data/conv/convfp32_input_1_28_28_3.bin"; | |||
| auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); | |||
| std::cout << "==============fp32 input data===========" << std::endl; | |||
| for (int i = 0; i < 20; i++) { | |||
| std::cout << input_data[i] << ", "; | |||
| } | |||
| std::cout << std::endl; | |||
| int input_ele_size = input_size / sizeof(float); | |||
| auto fp16_input_data = new float16_t[input_ele_size]; | |||
| for (int i = 0; i < input_ele_size; i++) { | |||
| fp16_input_data[i] = static_cast<float16_t>(input_data[i]); | |||
| } | |||
| std::cout << "==============fp16 input data===========" << std::endl; | |||
| for (int i = 0; i < 20; i++) { | |||
| std::cout << fp16_input_data[i] << ", "; | |||
| } | |||
| std::cout << std::endl; | |||
| // output | |||
| size_t output_data_size = | |||
| conv_param->output_batch_ * conv_param->output_channel_ * conv_param->output_h_ * conv_param->output_w_; | |||
| auto output_data = new float16_t[output_data_size]; | |||
| // runtime part | |||
| printf("Calculating runtime cost...\n"); | |||
| uint64_t time_avg = 0; | |||
| // warmup | |||
| for (int i = 0; i < 3; i++) { | |||
| Conv3x3Fp16(fp16_input_data, transformed_weight_data, bias_data, output_data, tile_buffer, block_unit_buffer, | |||
| tmp_dst_buffer, tmp_out, 0, conv_param); | |||
| } | |||
| int loop_count = 100; | |||
| auto time_start = mindspore::lite::GetTimeUs(); | |||
| for (int i = 0; i < loop_count; i++) { | |||
| Conv3x3Fp16(fp16_input_data, transformed_weight_data, bias_data, output_data, tile_buffer, block_unit_buffer, | |||
| tmp_dst_buffer, tmp_out, 0, conv_param); | |||
| } | |||
| auto time_end = mindspore::lite::GetTimeUs(); | |||
| auto cost = time_end - time_start; | |||
| time_avg = cost / loop_count; | |||
| printf("single thread running time : %f ms\n", time_avg / 1000.0f); | |||
| std::cout << "==============fp16 output data===========" << std::endl; | |||
| for (int i = 0; i < 20; i++) { | |||
| std::cout << output_data[i] << ", "; | |||
| } | |||
| std::cout << std::endl; | |||
| auto fp32_output_data = new float[output_data_size]; | |||
| for (int i = 0; i < output_data_size; i++) { | |||
| fp32_output_data[i] = static_cast<float>(output_data[i]); | |||
| } | |||
| printf("==================output data=================\n"); | |||
| for (int i = 0; i < 20; i++) { | |||
| std::cout << fp32_output_data[i] << " ,"; | |||
| } | |||
| std::cout << std::endl; | |||
| std::string output_path = "./test_data/conv/convfp32_out_1_28_28_32.bin"; | |||
| lite::CompareOutput(fp32_output_data, output_path); | |||
| free(bias_data); | |||
| free(tile_buffer); | |||
| free(block_unit_buffer); | |||
| free(tmp_dst_buffer); | |||
| free(tmp_out); | |||
| delete input_data; | |||
| delete weight_data; | |||
| delete conv_param; | |||
| delete[] fp16_weight_data; | |||
| delete[] fp16_input_data; | |||
| delete[] fp32_output_data; | |||
| delete[] output_data; | |||
| delete[] transformed_weight_data; | |||
| MS_LOG(INFO) << "TestConvolutionFp16 Conv3x3 passed"; | |||
| } | |||
| TEST_F(TestConvolutionFp16, Conv3x3Test2) { | |||
| auto conv_param = new ConvParameter(); | |||
| InitConvParamGroup2Fp16(conv_param); | |||
| // todo | |||
| int thread_count = 1; | |||
| int tile_num = 16; | |||
| int output_batch = conv_param->output_batch_; | |||
| int output_h = conv_param->output_h_; | |||
| int output_w = conv_param->output_w_; | |||
| int ic4 = UP_DIV(conv_param->input_channel_, C4NUM); | |||
| int oc8 = UP_DIV(conv_param->output_channel_, C8NUM); | |||
| // tmp buffer | |||
| int k_plane = 36; | |||
| size_t tile_buffer_size = thread_count * tile_num * k_plane * ic4 * C4NUM * sizeof(float16_t); | |||
| float16_t *tile_buffer = reinterpret_cast<float16_t *>(malloc(tile_buffer_size)); | |||
| memset(tile_buffer, 0, tile_buffer_size); | |||
| size_t block_unit_buffer_size = thread_count * k_plane * C4NUM * sizeof(float16_t); | |||
| float16_t *block_unit_buffer = reinterpret_cast<float16_t *>(malloc(block_unit_buffer_size)); | |||
| memset(block_unit_buffer, 0, block_unit_buffer_size); | |||
| size_t tmp_dst_buffer_size = thread_count * tile_num * k_plane * oc8 * C8NUM * sizeof(float16_t); | |||
| float16_t *tmp_dst_buffer = reinterpret_cast<float16_t *>(malloc(tmp_dst_buffer_size)); | |||
| memset(tmp_dst_buffer, 0, tmp_dst_buffer_size); | |||
| size_t tmp_out_size = oc8 * C8NUM * output_batch * output_h * output_w * tile_num * sizeof(float16_t); | |||
| float16_t *tmp_out = reinterpret_cast<float16_t *>(malloc(tmp_out_size)); | |||
| memset(tmp_out, 0, tmp_out_size); | |||
| // weight | |||
| size_t weight_size; | |||
| std::string weight_path = "./test_data/conv/convfp32_weight_32_3_3_32.bin"; | |||
| auto weight_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(weight_path.c_str(), &weight_size)); | |||
| int weight_ele_size = weight_size / sizeof(float); | |||
| auto fp16_weight_data = new float16_t[weight_ele_size]; | |||
| for (int i = 0; i < weight_ele_size; i++) { | |||
| fp16_weight_data[i] = static_cast<float16_t>(weight_data[i]); | |||
| } | |||
| size_t transformed_size = ic4 * C4NUM * oc8 * C8NUM * 36; | |||
| auto transformed_weight_data = new float16_t[transformed_size]; | |||
| memset(transformed_weight_data, 0, transformed_size * sizeof(float16_t)); | |||
| kernel::ProcessFilterFp16(fp16_weight_data, transformed_weight_data, conv_param); | |||
| // bias | |||
| auto bias_data = | |||
| reinterpret_cast<float16_t *>(malloc(UP_DIV(conv_param->output_channel_, 8) * 8 * sizeof(float16_t))); | |||
| memset(bias_data, 0, UP_DIV(conv_param->output_channel_, 8) * 8 * sizeof(float16_t)); | |||
| // input | |||
| size_t input_size; | |||
| std::string input_path = "./test_data/conv/convfp32_input_1_128_128_32.bin"; | |||
| auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); | |||
| int input_ele_size = input_size / sizeof(float); | |||
| auto fp16_input_data = new float16_t[input_ele_size]; | |||
| for (int i = 0; i < input_ele_size; i++) { | |||
| fp16_input_data[i] = static_cast<float16_t>(input_data[i]); | |||
| } | |||
| // output | |||
| size_t output_data_size = | |||
| conv_param->output_batch_ * conv_param->output_channel_ * conv_param->output_h_ * conv_param->output_w_; | |||
| auto output_data = new float16_t[output_data_size]; | |||
| // runtime part | |||
| printf("Calculating runtime cost...\n"); | |||
| uint64_t time_avg = 0; | |||
| // warmup | |||
| for (int i = 0; i < 3; i++) { | |||
| Conv3x3Fp16(fp16_input_data, transformed_weight_data, bias_data, output_data, tile_buffer, block_unit_buffer, | |||
| tmp_dst_buffer, tmp_out, 0, conv_param); | |||
| } | |||
| int loop_count = 100; | |||
| auto time_start = mindspore::lite::GetTimeUs(); | |||
| for (int i = 0; i < loop_count; i++) { | |||
| Conv3x3Fp16(fp16_input_data, transformed_weight_data, bias_data, output_data, tile_buffer, block_unit_buffer, | |||
| tmp_dst_buffer, tmp_out, 0, conv_param); | |||
| } | |||
| auto time_end = mindspore::lite::GetTimeUs(); | |||
| auto cost = time_end - time_start; | |||
| time_avg = cost / loop_count; | |||
| printf("single thread running time : %f ms\n", time_avg / 1000.0f); | |||
| std::cout << "==============fp16 output data===========" << std::endl; | |||
| for (int i = 0; i < 20; i++) { | |||
| std::cout << output_data[i] << ", "; | |||
| } | |||
| std::cout << std::endl; | |||
| auto fp32_output_data = new float[output_data_size]; | |||
| for (int i = 0; i < output_data_size; i++) { | |||
| fp32_output_data[i] = static_cast<float>(output_data[i]); | |||
| } | |||
| printf("==================output data=================\n"); | |||
| for (int i = 0; i < 20; i++) { | |||
| std::cout << fp32_output_data[i] << " ,"; | |||
| } | |||
| std::cout << std::endl; | |||
| std::string output_path = "./test_data/conv/convfp32_out_1_128_128_32.bin"; | |||
| lite::CompareOutput(fp32_output_data, output_path); | |||
| free(bias_data); | |||
| free(tile_buffer); | |||
| free(block_unit_buffer); | |||
| free(tmp_dst_buffer); | |||
| free(tmp_out); | |||
| delete input_data; | |||
| delete weight_data; | |||
| delete conv_param; | |||
| delete[] fp16_weight_data; | |||
| delete[] fp16_input_data; | |||
| delete[] fp32_output_data; | |||
| delete[] output_data; | |||
| delete[] transformed_weight_data; | |||
| MS_LOG(INFO) << "TestConvolutionFp16 Conv3x3 passed"; | |||
| } | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,128 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <iostream> | |||
| #include "mindspore/core/utils/log_adapter.h" | |||
| #include "common/common_test.h" | |||
| #include "mindspore/lite/src/runtime/kernel/arm/opclib/fp32/activation.h" | |||
| #include "mindspore/lite/src/kernel_registry.h" | |||
| #include "mindspore/lite/src/lite_kernel.h" | |||
| namespace mindspore { | |||
| class TestActivationFp32 : public mindspore::Common { | |||
| public: | |||
| TestActivationFp32() {} | |||
| }; | |||
| TEST_F(TestActivationFp32, ReluFp32) { | |||
| float input[8] = {-3, -2, -1, 0, 1, 5, 6, 7}; | |||
| float output[8] = {0}; | |||
| Relu(input, 8, output); | |||
| float expect[8] = {0, 0, 0, 0, 1, 5, 6, 7}; | |||
| for (int i = 0; i < 8; ++i) { | |||
| ASSERT_EQ(output[i], expect[i]); | |||
| } | |||
| } | |||
| TEST_F(TestActivationFp32, Relu6Fp32) { | |||
| float input[8] = {-3, -2, -1, 0, 1, 5, 6, 7}; | |||
| float output[8] = {0}; | |||
| Relu6(input, 8, output); | |||
| float expect[8] = {0, 0, 0, 0, 1, 5, 6, 6}; | |||
| for (int i = 0; i < 8; ++i) { | |||
| ASSERT_EQ(output[i], expect[i]); | |||
| } | |||
| MS_LOG(INFO) << "TestActivationFp32 passed"; | |||
| } | |||
| TEST_F(TestActivationFp32, LReluFp32) { | |||
| float input[8] = {-3, -2, -1, 0, 1, 5, 6, 7}; | |||
| float output[8] = {0}; | |||
| LRelu(input, 8, output, 0.01); | |||
| float expect[8] = {-0.03, -0.02, -0.01, 0, 1, 5, 6, 7}; | |||
| for (int i = 0; i < 8; ++i) { | |||
| ASSERT_EQ(output[i], expect[i]); | |||
| } | |||
| MS_LOG(INFO) << "TestActivationFp32 passed"; | |||
| } | |||
| TEST_F(TestActivationFp32, SigmoidFp32) { | |||
| float input[8] = {0, 1, 2, 3, 4, 5, 6, 7}; | |||
| float output[8] = {0}; | |||
| Sigmoid(input, 8, output); | |||
| // expect output {0.5, 0.731059, 0.880797, 0.952574, 0.982014, 0.993307, 0.997527, 0.999089}; | |||
| printf("==================output data=================\n"); | |||
| for (int i = 0; i < 8; ++i) { | |||
| std::cout << output[i] << " "; | |||
| } | |||
| std::cout << std::endl; | |||
| MS_LOG(INFO) << "TestSigmoidFp32 passed"; | |||
| } | |||
| TEST_F(TestActivationFp32, TanhFp32) { | |||
| float input[7] = {-3, -2, -1, 0, 1, 2, 3}; | |||
| float output[7] = {0}; | |||
| Tanh(input, 7, output); | |||
| float expect[8] = {-0.995055, -0.964028, -0.761594, 0.000000, 0.761594, 0.964028, 0.995055}; | |||
| for (int i = 0; i < 8; ++i) { | |||
| EXPECT_NEAR(output[i], expect[i], 0.00001); | |||
| } | |||
| MS_LOG(INFO) << "TanhFp32 passed"; | |||
| } | |||
| TEST_F(TestActivationFp32, HSwishFp32) { | |||
| std::vector<lite::tensor::Tensor *> inputs_tensor; | |||
| std::vector<lite::tensor::Tensor *> outputs_tensor; | |||
| ActivationParameter op_param; | |||
| op_param.op_parameter_.type_ = schema::PrimitiveType_Activation; | |||
| op_param.type_ = schema::ActivationType_HSWISH; | |||
| op_param.alpha_ = 0.01; | |||
| std::vector<float> input = {-3.0, -2.0, -1.0, 0.0, 1.0, 5.0, 6.0, 7.0}; | |||
| std::vector<int> in_shape = {8}; | |||
| lite::tensor::Tensor input0_tensor; | |||
| inputs_tensor.push_back(&input0_tensor); | |||
| input0_tensor.SetData(input.data()); | |||
| input0_tensor.set_shape(in_shape); | |||
| std::vector<float> output(8); | |||
| std::vector<int> output_shape = {8}; | |||
| lite::tensor::Tensor output0_tensor; | |||
| outputs_tensor.push_back(&output0_tensor); | |||
| output0_tensor.SetData(output.data()); | |||
| kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, schema::PrimitiveType_Activation}; | |||
| auto creator = lite::KernelRegistry::GetInstance()->GetKernelCreator(desc); | |||
| ASSERT_NE(creator, nullptr); | |||
| lite::Context ctx; | |||
| ctx.threadNum = 7; | |||
| kernel::LiteKernel *kernel = | |||
| creator(inputs_tensor, outputs_tensor, reinterpret_cast<OpParameter *>(&op_param), &ctx, desc); | |||
| ASSERT_NE(kernel, nullptr); | |||
| auto output_tensor_shape = output0_tensor.shape(); | |||
| kernel->Run(); | |||
| std::vector<float> expect_output = {-0, -0.33333334, -0.33333334, 0, 0.6666667, 5, 6, 7}; | |||
| CompareOutputData(output.data(), expect_output.data(), 8, 0.00001); | |||
| input0_tensor.SetData(nullptr); | |||
| output0_tensor.SetData(nullptr); | |||
| } | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,74 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <iostream> | |||
| #include <memory> | |||
| #include "common/common_test.h" | |||
| #include "mindspore/lite/src/runtime/kernel/arm/int8/add_int8.h" | |||
| #include "mindspore/lite/src/kernel_registry.h" | |||
| #include "mindspore/lite/include/context.h" | |||
| namespace mindspore { | |||
| class TestQuantizedAdd : public mindspore::Common { | |||
| public: | |||
| TestQuantizedAdd() {} | |||
| }; | |||
| TEST_F(TestQuantizedAdd, Add) { | |||
| lite::tensor::Tensor in_tensor0(kNumberTypeInt8, {1, 1, 2, 5}); | |||
| lite::tensor::Tensor in_tensor1(kNumberTypeInt8, {1, 1, 2, 5}); | |||
| lite::tensor::Tensor out_tensor(kNumberTypeInt8, {1, 1, 2, 5}); | |||
| int8_t input_data0[] = {-102, 25, -51, 89, -102, 25, -51, 89, -102, 25}; // -0.8 0.2 -0.4 0.7 | |||
| int8_t input_data1[] = {38, 51, 64, -102, 38, 51, 64, -102, 38, 51}; // 0.3 0.4 0.5 -0.8 | |||
| int8_t output_data[10] = {0}; | |||
| in_tensor0.SetData(input_data0); | |||
| in_tensor1.SetData(input_data1); | |||
| out_tensor.SetData(output_data); | |||
| const lite::tensor::QuantArg quant_in0 = {0.00784314f, 0}; // -1.0--1.0 -> 0--255 | |||
| const lite::tensor::QuantArg quant_in1 = {0.00784314f, 0}; | |||
| const lite::tensor::QuantArg quant_out = {0.00784314f, 0}; | |||
| in_tensor0.AddQuantParam(quant_in0); | |||
| in_tensor1.AddQuantParam(quant_in1); | |||
| out_tensor.AddQuantParam(quant_out); | |||
| std::vector<lite::tensor::Tensor *> inputs = {&in_tensor0, &in_tensor1}; | |||
| std::vector<lite::tensor::Tensor *> outputs = {&out_tensor}; | |||
| OpParameter parameter = {}; | |||
| kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, schema::PrimitiveType_Add}; | |||
| auto creator = lite::KernelRegistry::GetInstance()->GetKernelCreator(desc); | |||
| ASSERT_NE(creator, nullptr); | |||
| auto ctx = std::make_shared<lite::Context>(); | |||
| auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(¶meter), ctx.get(), desc); | |||
| ASSERT_NE(kernel, nullptr); | |||
| auto ret = kernel->Run(); | |||
| EXPECT_EQ(0, ret); | |||
| int8_t expect0[10] = {-64, 76, 13, -13, -64, 76, 13, -13, -64, 76}; // -0.5 0.6 0.1 -0.1 | |||
| for (int i = 0; i < 10; ++i) { | |||
| EXPECT_EQ(output_data[i], expect0[i]); | |||
| } | |||
| in_tensor0.SetData(nullptr); | |||
| in_tensor1.SetData(nullptr); | |||
| out_tensor.SetData(nullptr); | |||
| } | |||
| } // namespace mindspore | |||
| @@ -0,0 +1 @@ | |||
| ýL[?-"R>‰qƒ>{B¸>´?yx?ó×_>JSD>Gº0? | |||
| @@ -0,0 +1 @@ | |||
| J[q? §P?¾ŸŒ>gý?õA?>oo?7G?x¸<¿”"? | |||
| @@ -0,0 +1 @@ | |||
| WÚU>X™8?*Á?!—v>›žF>0î?.ť<�C?Čd? | |||
| @@ -0,0 +1 @@ | |||
| ÜR?Ü]?žÎ>†c~?um?z1->í??Ø'?—U? | |||
| @@ -0,0 +1,2 @@ | |||
| „ë:¿eQÝ¿²c?pº@ÞE(Àoéï=ű*¿Î¢ñ=Í•†¿^C½°ç?Æþ-?ú»=@$Á?ò(ÀW!=à+> æ¿êó@•@§? -¿JP Àµï?€k¿ýüÁ¿þ“?M | |||
| ¾wq‘>3Û=RïÀ¢j @¿E%@!H￸lÀþ�¾=•©=\j/½m2¶>bâ@òB‡¾ | |||
| @@ -0,0 +1 @@ | |||
| ¦êœ¿´Xã>+Î?6Å@?Ü•�¿Çe¥?š;¿–ˆÜ¿Úþ?Æ„R?�ÇÓ?ñýèÀí¾P�¿`Ú„=¼»?æ?¨¥Ž¿ñ�C¿mÄÇ?òßH?õ¤¾<±ž¿ÀŸ€<›º?ÀŸâ<Cû»?À’�=ŽúN?Ü턾�±¿¹ÎÏ¿êà׿°ýB>ŠÚˆ¿�3‘?þ:v¿� ½?�”¾¿-Œ¨? | |||
| @@ -0,0 +1 @@ | |||
| 3:П�х?iпОМ�te?Йй?6\XОМ`^Оj6@h'П>жнМуьд>ЈЌ6П�юђ?в%РсFm?|C)@ешрПџlѕ>FЊЋ@vCП(и*ПМn6@{Ф?�@ТЙ�О,~@d6>@(R@g�?�8@ | |||
| @@ -0,0 +1,3 @@ | |||
| Model: mobilenet_v1_1.0_224_quant | |||
| Input: input | |||
| Output: MobilenetV1/Predictions/Reshape_1 | |||
| @@ -0,0 +1,134 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <vector> | |||
| #include "common/common_test.h" | |||
| #include "backend/kernel_compiler/common_utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// Fixture for backend/kernel_compiler common_utils tests
// (sparse-gradient bucket reduction).
class CommonUtilTest : public mindspore::Common {
 public:
  CommonUtilTest() = default;
};
| TEST_F(CommonUtilTest, BucketReduceSparseGradient1) { | |||
| // The indices is a vector and the grad is a tensor with shape (6, 2) | |||
| /* 0 | |||
| * 0 | |||
| * 1 | |||
| * 1 | |||
| * 0 | |||
| * 3 | |||
| */ | |||
| std::vector<int> indices{0, 0, 1, 1, 0, 3}; | |||
| /* 0 1 | |||
| * 2 3 | |||
| * 4 5 | |||
| * 6 7 | |||
| * 8 9 | |||
| * 10 11 | |||
| */ | |||
| std::vector<float> grad; | |||
| for (int i = 0; i < 6 * 2; i++) { | |||
| grad.push_back(i); | |||
| } | |||
| std::vector<int> unique_indices(6); | |||
| std::vector<float> summed_grad(12); | |||
| std::vector<int> tmp_indices(6); | |||
| std::vector<float> tmp_grad(12); | |||
| SparseGradient unique_grad({summed_grad.data(), unique_indices.data(), 6}); | |||
| SparseGradient workspace_grad({tmp_grad.data(), tmp_indices.data(), 6}); | |||
| SparseGradient input_grad({grad.data(), indices.data(), 6}); | |||
| ReduceSparseGradientParam param; | |||
| param.input_grad_ = &input_grad; | |||
| param.workspace_grad_ = &workspace_grad; | |||
| param.output_grad_ = &unique_grad; | |||
| param.max_index_ = 6; | |||
| param.value_stride_ = 2; | |||
| BucketReduceSparseGradient(param); | |||
| EXPECT_EQ(unique_grad.indices_size_, 3); | |||
| std::vector<int> expect_indices({0, 1, 3}); | |||
| for (size_t i = 0; i < unique_grad.indices_size_; ++i) { | |||
| EXPECT_EQ(unique_grad.indices_[i], expect_indices[i]); | |||
| } | |||
| /* 10 13 | |||
| * 10 12 | |||
| * 10 11 | |||
| */ | |||
| std::vector<int> expect_value({10, 13, 10, 12, 10, 11}); | |||
| for (size_t i = 0; i < unique_grad.indices_size_ * 2; ++i) { | |||
| EXPECT_EQ(unique_grad.value_[i], expect_value[i]); | |||
| } | |||
| } | |||
| TEST_F(CommonUtilTest, BucketReduceSparseGradient2) { | |||
| // The indices is a vector and the grad is a tensor with shape (6, 2) | |||
| /* 0 | |||
| * 0 | |||
| * 1 | |||
| * 1 | |||
| * 0 | |||
| * 6 | |||
| */ | |||
| std::vector<int> indices{0, 0, 1, 1, 0, 6}; | |||
| /* 0 1 | |||
| * 2 3 | |||
| * 4 5 | |||
| * 6 7 | |||
| * 8 9 | |||
| * 10 11 | |||
| */ | |||
| std::vector<float> grad; | |||
| for (int i = 0; i < 6 * 2; i++) { | |||
| grad.push_back(i); | |||
| } | |||
| std::vector<int> unique_indices(6); | |||
| std::vector<float> summed_grad(12); | |||
| std::vector<int> tmp_indices(6); | |||
| std::vector<float> tmp_grad(12); | |||
| SparseGradient unique_grad({summed_grad.data(), unique_indices.data(), 6}); | |||
| SparseGradient workspace_grad({tmp_grad.data(), tmp_indices.data(), 6}); | |||
| SparseGradient input_grad({grad.data(), indices.data(), 6}); | |||
| ReduceSparseGradientParam param; | |||
| param.input_grad_ = &input_grad; | |||
| param.workspace_grad_ = &workspace_grad; | |||
| param.output_grad_ = &unique_grad; | |||
| param.max_index_ = 6; | |||
| param.value_stride_ = 2; | |||
| BucketReduceSparseGradient(param); | |||
| EXPECT_EQ(unique_grad.indices_size_, 2); | |||
| std::vector<int> expect_indices({0, 1}); | |||
| for (size_t i = 0; i < unique_grad.indices_size_; ++i) { | |||
| EXPECT_EQ(unique_grad.indices_[i], expect_indices[i]); | |||
| } | |||
| /* 10 13 | |||
| * 10 12 | |||
| */ | |||
| std::vector<int> expect_value({10, 13, 10, 12}); | |||
| for (size_t i = 0; i < unique_grad.indices_size_ * 2; ++i) { | |||
| EXPECT_EQ(unique_grad.value_[i], expect_value[i]); | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,89 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <iostream> | |||
| #include <memory> | |||
| #include "mindspore/core/utils/log_adapter.h" | |||
| #include "common/common_test.h" | |||
| #include "mindspore/lite/src/common/file_utils.h" | |||
| #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" | |||
| #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" | |||
| #include "mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h" | |||
| // using namespace mindspore::kernel; | |||
| // using namespace mindspore::lite; | |||
| // using namespace mindspore; | |||
| namespace mindspore { | |||
| class TestMatMulOpenCL : public mindspore::Common { | |||
| public: | |||
| TestMatMulOpenCL() {} | |||
| }; | |||
| TEST_F(TestMatMulOpenCL, MatMulFp32) { | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| ocl_runtime->Init(); | |||
| size_t input_size; | |||
| int ci = 1280; | |||
| int co = 1001; | |||
| std::string input_path = "./test_data/matmul/matmul_fp32_input.bin"; | |||
| auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); | |||
| size_t weight_size; | |||
| std::string weight_path = "./test_data/matmul/matmul_fp32_weight.bin"; | |||
| auto weight_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(weight_path.c_str(), &weight_size)); | |||
| lite::tensor::Tensor *tensor_x = new lite::tensor::Tensor(TypeId(kNumberTypeFloat32), {1, ci}); | |||
| lite::tensor::Tensor *tensor_w = new lite::tensor::Tensor(TypeId(kNumberTypeFloat32), {co, ci}); | |||
| tensor_w->SetData(weight_data); | |||
| lite::tensor::Tensor *tensor_out = new lite::tensor::Tensor(TypeId(kNumberTypeFloat32), {1, co}); | |||
| std::vector<lite::tensor::Tensor *> inputs{tensor_x, tensor_w}; | |||
| std::vector<lite::tensor::Tensor *> outputs{tensor_out}; | |||
| auto *arith_kernel = new MatMulOpenCLKernel(nullptr, inputs, outputs, false); | |||
| arith_kernel->Init(); | |||
| std::vector<LiteKernel *> kernels{arith_kernel}; | |||
| auto *pGraph = new SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); | |||
| pGraph->Init(); | |||
| memcpy(inputs[0]->Data(), input_data, sizeof(float) * ci); | |||
| pGraph->Run(); | |||
| printf("==================output data=================\n"); | |||
| float *output_data = reinterpret_cast<float *>(tensor_out->Data()); | |||
| std::cout << std::endl; | |||
| for (int i = 0; i < co; i++) { | |||
| std::cout << output_data[i] << ", "; | |||
| } | |||
| std::cout << std::endl; | |||
| size_t output_size; | |||
| std::string output_path = "./test_data/matmul/matmul_fp32_output.bin"; | |||
| auto correct_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(output_path.c_str(), &output_size)); | |||
| // compare | |||
| CompareOutputData(output_data, correct_data, co * sizeof(float), 0.00001); | |||
| delete input_data; | |||
| delete weight_data; | |||
| delete tensor_x; | |||
| delete tensor_w; | |||
| delete tensor_out; | |||
| delete correct_data; | |||
| MS_LOG(INFO) << "TestMatMulFp32 passed"; | |||
| } | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,35 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <iostream> | |||
| #include "common/common_test.h" | |||
| #include "mindspore/core/utils/log_adapter.h" | |||
| #include "mindspore/lite/src/common/file_utils.h" | |||
| #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" | |||
| #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" | |||
| #ifndef TESTS_UT_OPENCL_KERNLE_TESTS_H | |||
| #define TESTS_UT_OPENCL_KERNLE_TESTS_H | |||
| namespace mindspore { | |||
| class TestOpenCLKernel : public mindspore::Common { | |||
| public: | |||
| TestOpenCLKernel() {} | |||
| }; | |||
| } // namespace mindspore | |||
| #endif // TESTS_UT_OPENCL_KERNLE_TESTS_H | |||
| @@ -0,0 +1,96 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
#include <cmath>
#include <iostream>
#include <memory>
#include "mindspore/core/utils/log_adapter.h"
#include "common/common_test.h"
#include "mindspore/lite/src/common/file_utils.h"
#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
#include "mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h"
| // using namespace mindspore::kernel; | |||
| // using namespace mindspore::lite; | |||
| // using namespace mindspore; | |||
| namespace mindspore { | |||
| class TestSoftmaxOpenCL : public mindspore::Common {}; | |||
| void InitSoftaxParam(SoftmaxParameter *param) { param->axis_ = -1; } | |||
// End-to-end test of the OpenCL softmax kernel: builds a one-kernel subgraph,
// feeds log(1..4) as input, runs on the GPU, and compares against an expected
// output. Requires a working OpenCL device at test time.
TEST_F(TestSoftmaxOpenCL, SoftmaxFp32) {
  std::cout << "======" << std::endl;
  MS_LOG(INFO) << "start TEST_F TestSoftmaxOpenCL";
  // Acquire the process-wide OpenCL runtime and initialize the device/context.
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  ocl_runtime->Init();
  MS_LOG(INFO) << "create SoftmaxParameter";
  // NOTE(review): param and the tensors/kernels below are never freed; ownership
  // may pass to the subgraph — confirm before adding deletes.
  auto param = new SoftmaxParameter();
  InitSoftaxParam(param);
  MS_LOG(INFO) << "create Tensors";
  // NHWC-shaped 1x2x2x1 input and output (4 elements each).
  std::vector<int> shape_in = {1, 2, 2, 1};
  std::vector<int> shape_out = {1, 2, 2, 1};
  auto data_type = kNumberTypeFloat32;
  auto tensorType = schema::NodeType_ValueNode;
  lite::tensor::Tensor *tensor_in = new lite::tensor::Tensor(data_type, shape_in, schema::Format_NCHW, tensorType);
  lite::tensor::Tensor *tensor_out = new lite::tensor::Tensor(data_type, shape_out, schema::Format_NCHW, tensorType);
  std::vector<lite::tensor::Tensor *> inputs{tensor_in};
  std::vector<lite::tensor::Tensor *> outputs{tensor_out};
  MS_LOG(INFO) << "create OpenCL Kernel";
  auto *Softmax_kernel = new SoftmaxOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
  Softmax_kernel->Init();
  std::vector<LiteKernel *> kernels{Softmax_kernel};
  MS_LOG(INFO) << "create SubGraphOpenCLKernel";
  // The single kernel serves as the subgraph's input, output, and node list.
  auto *pGraph = new SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
  pGraph->Init();
  MS_LOG(INFO) << "initialize data";
  std::vector<lite::tensor::Tensor *> tensor_map = {tensor_in};
  for (auto &tensor_file : tensor_map) {
    auto tensor = tensor_file;
    size_t size = tensor->Size();
    // log(1..4) so that exp() inside softmax recovers the integers 1..4.
    // NOTE(review): `size` (tensor bytes) must equal sizeof(data) = 16 bytes for
    // this memcpy to be safe; holds for the 1x2x2x1 fp32 shape above.
    const float data[4] = {std::log(1.0f), std::log(2.0f), std::log(3.0f), std::log(4.0f)};
    memcpy(tensor->Data(), data, size);
  }
  MS_LOG(INFO) << "pGraph->Run()";
  pGraph->Run();
  MS_LOG(INFO) << "==================output data=================";
  float *output_data = reinterpret_cast<float *>(tensor_out->Data());
  size_t output_size = tensor_out->Size();
  printf("output:");
  for (int i = 0; i < 4; i++) {
    printf("%.3f ", output_data[i]);
  }
  printf("\n");
  // NOTE(review): with axis_=-1 and the last dimension of size 1, a standard
  // softmax would produce all 1.0 — these expectations assume the kernel
  // normalizes over a different/flattened axis; verify against the kernel impl.
  float expect[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  for (int i = 0; i < tensor_out->ElementsNum(); ++i) {
    if (std::fabs(output_data[i] - expect[i]) > 1e-5) {
      printf("idx[%d] except=%.3f output=%.3f .", i, expect[i], output_data[i]);
    }
  }
  printf("\nTest all close OK for %zu!\n", output_size);
  lite::CompareOutputData(output_data, expect, 4);
}
| } // namespace mindspore | |||
// ===== begin new file: training session test (diff hunk boundary) =====
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <cmath> | |||
| #include <iostream> | |||
| #include <memory> | |||
| #include "utils/base_ref_utils.h" | |||
| #include "mindspore/lite/schema/inner/model_generated.h" | |||
| #include "mindspore/lite/src/train/model_impl.h" | |||
| #include "mindspore/lite/include/model.h" | |||
| #include "mindspore/lite/src/train/train_session.h" | |||
| #include "common/common_test.h" | |||
| #include "mindspore/core/utils/log_adapter.h" | |||
| namespace mindspore { | |||
| class TrainTest : public mindspore::Common { | |||
| public: | |||
| TrainTest() {} | |||
| }; | |||
| TEST_F(TrainTest, TestConvNode) { | |||
| auto meta_graph = std::make_shared<schema::MetaGraphT>(); | |||
| meta_graph->name = "graph"; | |||
| auto node = std::make_unique<schema::CNodeT>(); | |||
| node->inputIndex = {0, 1}; | |||
| node->outputIndex = {2}; | |||
| node->primitive = std::make_unique<schema::PrimitiveT>(); | |||
| node->primitive->value.type = schema::PrimitiveType_Conv2D; | |||
| auto primitive = new schema::Conv2DT; | |||
| primitive->padMode = schema::PadMode_SAME; | |||
| primitive->channelIn = 3; | |||
| primitive->channelOut = 32; | |||
| primitive->format = schema::Format_NHWC; | |||
| primitive->strideH = 1; | |||
| primitive->strideW = 1; | |||
| primitive->kernelH = 3; | |||
| primitive->kernelW = 3; | |||
| primitive->dilateH = 1; | |||
| primitive->dilateW = 1; | |||
| node->primitive->value.value = primitive; | |||
| node->name = "Conv2D"; | |||
| meta_graph->nodes.emplace_back(std::move(node)); | |||
| meta_graph->inputIndex = {0}; | |||
| meta_graph->outputIndex = {2}; | |||
| auto input0 = std::make_unique<schema::TensorT>(); | |||
| input0->nodeType = schema::NodeType::NodeType_Parameter; // todo use ValueNode? | |||
| input0->format = schema::Format_NHWC; | |||
| input0->dataType = TypeId::kNumberTypeFloat32; | |||
| input0->dims = {1, 28, 28, 3}; | |||
| input0->offset = -1; | |||
| meta_graph->allTensors.emplace_back(std::move(input0)); | |||
| auto weight = std::make_unique<schema::TensorT>(); | |||
| weight->nodeType = schema::NodeType::NodeType_ValueNode; | |||
| weight->format = schema::Format_KHWC; | |||
| weight->dataType = TypeId::kNumberTypeFloat32; | |||
| weight->dims = {32, 3, 3, 3}; | |||
| auto buf = new char *[1]; | |||
| //================================================================ | |||
| size_t weight_size; | |||
| std::string weight_path = "./convfp32_weight_32_3_3_3.bin"; | |||
| ReadFile(weight_path.c_str(), &weight_size, buf); | |||
| ASSERT_NE(nullptr, buf[0]); | |||
| auto weight_data_temp = reinterpret_cast<float *>(buf[0]); | |||
| ASSERT_NE(nullptr, weight_data_temp); | |||
| weight->data.resize(sizeof(float) * 32 * 3 * 3 * 3); | |||
| //================================================================ | |||
| memcpy(weight->data.data(), weight_data_temp, weight_size); | |||
| weight->offset = -1; | |||
| meta_graph->allTensors.emplace_back(std::move(weight)); | |||
| auto output = std::make_unique<schema::TensorT>(); | |||
| output->nodeType = schema::NodeType::NodeType_Parameter; | |||
| output->format = schema::Format_NHWC; | |||
| output->dataType = TypeId::kNumberTypeFloat32; | |||
| output->dims = {1, 28, 28, 32}; | |||
| output->offset = -1; | |||
| meta_graph->allTensors.emplace_back(std::move(output)); | |||
| flatbuffers::FlatBufferBuilder builder(1024); | |||
| auto offset = schema::MetaGraph::Pack(builder, meta_graph.get()); | |||
| builder.Finish(offset); | |||
| size_t size = builder.GetSize(); | |||
| const char *content = reinterpret_cast<char *>(builder.GetBufferPointer()); | |||
| auto model = lite::Model::Import(content, size); | |||
| ASSERT_NE(nullptr, model); | |||
| auto session = new session::TrainSession(); // inference::MSSession::CreateSession(kCPUDevice, 0); | |||
| ASSERT_NE(nullptr, session); | |||
| auto graphId = session->CompileGraph(NOT_NULL(model->GetModelImpl())); | |||
| auto inTensor = new tensor::Tensor(TypeId::kNumberTypeFloat32, {1, 28, 28, 3}); | |||
| ASSERT_NE(nullptr, inTensor); | |||
| ASSERT_EQ(sizeof(float) * (28 * 28 * 3), inTensor->Size()); | |||
| auto ret = inTensor->MallocData(); | |||
| ASSERT_EQ(0, ret); | |||
| auto data = inTensor->Data(); | |||
| //=================================================== | |||
| size_t input_size; | |||
| std::string input_path = "./convfp32_input_1_28_28_3.bin"; | |||
| ReadFile(input_path.c_str(), &input_size, buf); | |||
| ASSERT_NE(nullptr, buf[0]); | |||
| auto input_data = reinterpret_cast<float *>(buf[0]); | |||
| ASSERT_NE(nullptr, input_data); | |||
| //=================================================== | |||
| memcpy(data, input_data, input_size); | |||
| std::vector<std::shared_ptr<tensor::Tensor>> inputs; | |||
| inputs.emplace_back(inTensor); | |||
| VectorRef outputsRef; | |||
| session->RunGraph(graphId, inputs, &outputsRef); | |||
| auto outputs = TransformVectorRefToMultiTensor(outputsRef); | |||
| ASSERT_EQ(1, outputs.size()); | |||
| ASSERT_EQ(1, outputs.front().size()); | |||
| auto runOutput = outputs.front().front(); | |||
| ASSERT_NE(nullptr, runOutput); | |||
| ASSERT_EQ(28 * 28 * 32, runOutput->ElementsNum()); | |||
| ASSERT_EQ(TypeId::kNumberTypeFloat32, runOutput->data_type()); | |||
| auto *outData = reinterpret_cast<float *>(runOutput->MutableData()); | |||
| //=================================================== | |||
| size_t output_size; | |||
| std::string output_path = "./convfp32_out_1_28_28_32.bin"; | |||
| ReadFile(output_path.c_str(), &output_size, buf); | |||
| ASSERT_NE(nullptr, buf[0]); | |||
| auto output_data = reinterpret_cast<float *>(buf[0]); | |||
| ASSERT_NE(nullptr, output_data); | |||
| //=================================================== | |||
| ASSERT_EQ(output_size, runOutput->Size()); | |||
| for (size_t i = 0; i < runOutput->ElementsNum(); i++) { | |||
| ASSERT_EQ(output_data[i], outData[i]); | |||
| } | |||
| MS_LOG(INFO) << "Passed"; | |||
| } | |||
| // TEST_F(TrainTest, TestMultiNode) { | |||
| // auto msGraph = std::make_shared<schema::GraphDefT>(); | |||
| // msGraph->name = "graph"; | |||
| // auto msSubgraph = std::make_unique<schema::SubGraphDefT>(); | |||
| // msSubgraph->name = "subGraph"; | |||
| // | |||
| // auto conv = std::make_unique<schema::OpDefT>(); | |||
| // conv->inputIndex = {0, 1}; | |||
| // conv->outputIndex = {2}; | |||
| // conv->attr.type = schema::OpT_Conv2D; | |||
| // auto conv_attr = new schema::Conv2DT; | |||
| // conv_attr->padMode = schema::PadMode_SAME; | |||
| // conv_attr->format = schema::Format_NHWC; | |||
| // conv_attr->strideH = 1; | |||
| // conv_attr->strideW = 1; | |||
| // conv_attr->kernelH = 3; | |||
| // conv_attr->kernelW = 3; | |||
| // conv_attr->dilateH = 1; | |||
| // conv_attr->dilateW = 1; | |||
| // | |||
| // conv->attr.value = conv_attr; | |||
| // conv->name = "Conv2D"; | |||
| // conv->fmkType = schema::FmkType_CAFFE; | |||
| // msSubgraph->nodes.emplace_back(std::move(conv)); | |||
| // | |||
| // auto matMul1 = std::make_unique<schema::OpDefT>(); | |||
| // matMul1->inputIndex = {2, 3}; | |||
| // matMul1->outputIndex = {4}; | |||
| // matMul1->attr.type = schema::OpT_MatMul; | |||
| // auto matMul_attr1 = new schema::MatMulT; | |||
| // matMul_attr1->transposeA = false; | |||
| // matMul_attr1->transposeB = true; | |||
| // matMul1->attr.value = matMul_attr1; | |||
| // matMul1->name = "matmul1"; | |||
| // matMul1->fmkType = schema::FmkType_CAFFE; | |||
| // msSubgraph->nodes.emplace_back(std::move(matMul1)); | |||
| // | |||
| // auto matMul2 = std::make_unique<schema::OpDefT>(); | |||
| // matMul2->inputIndex = {4, 5}; | |||
| // matMul2->outputIndex = {6}; | |||
| // matMul2->attr.type = schema::OpT_MatMul; | |||
| // auto matMul_attr2 = new schema::MatMulT; | |||
| // matMul_attr2->transposeA = false; | |||
| // matMul_attr2->transposeB = true; | |||
| // matMul2->attr.value = matMul_attr2; | |||
| // matMul2->name = "matmul2"; | |||
| // matMul2->fmkType = schema::FmkType_CAFFE; | |||
| // msSubgraph->nodes.emplace_back(std::move(matMul2)); | |||
| // | |||
| // msSubgraph->inputIndex = {0}; | |||
| // msSubgraph->outputIndex = {6}; | |||
| // | |||
| // auto input0 = std::make_unique<schema::TensorDefT>(); | |||
| // input0->refCount = schema::MSCONST_WEIGHT_REFCOUNT; | |||
| // input0->format = schema::Format_NHWC; | |||
| // input0->dataType = TypeId::kNumberTypeFloat32; | |||
| // input0->dims = {1, 5, 5, 3}; | |||
| // input0->offset = -1; | |||
| // msSubgraph->allTensors.emplace_back(std::move(input0)); | |||
| // | |||
| // auto conv_weight = std::make_unique<schema::TensorDefT>(); | |||
| // conv_weight->refCount = schema::MSCONST_WEIGHT_REFCOUNT; | |||
| // conv_weight->format = schema::Format_KHWC; | |||
| // conv_weight->dataType = TypeId::kNumberTypeFloat32; | |||
| // conv_weight->dims = {8, 3, 3, 3}; | |||
| // conv_weight->data.resize(8*3*3*3*sizeof(float)); | |||
| // msSubgraph->allTensors.emplace_back(std::move(conv_weight)); | |||
| // | |||
| // auto conv_output = std::make_unique<schema::TensorDefT>(); | |||
| // conv_output->refCount = 0; | |||
| // conv_output->format = schema::Format_NHWC; | |||
| // conv_output->dataType = TypeId::kNumberTypeFloat32; | |||
| // conv_output->dims = {1, 5, 5, 8}; | |||
| // msSubgraph->allTensors.emplace_back(std::move(conv_output)); | |||
| // | |||
| // auto add_weight = std::make_unique<schema::TensorDefT>(); | |||
| // add_weight->refCount = schema::MSCONST_WEIGHT_REFCOUNT; | |||
| // add_weight->format = schema::Format_NHWC; | |||
| // add_weight->dataType = TypeId::kNumberTypeFloat32; | |||
| // add_weight->dims = {1, 5, 5, 8}; | |||
| // add_weight->data.resize(5*5*8*sizeof(float)); | |||
| // msSubgraph->allTensors.emplace_back(std::move(add_weight)); | |||
| // | |||
| // auto add_output = std::make_unique<schema::TensorDefT>(); | |||
| // add_output->refCount = 0; | |||
| // add_output->format = schema::Format_NHWC; | |||
| // add_output->dataType = TypeId::kNumberTypeFloat32; | |||
| // add_output->dims = {1, 5, 5, 8}; | |||
| // msSubgraph->allTensors.emplace_back(std::move(add_output)); | |||
| // | |||
| // auto mul_weight = std::make_unique<schema::TensorDefT>(); | |||
| // mul_weight->refCount = schema::MSCONST_WEIGHT_REFCOUNT; | |||
| // mul_weight->format = schema::Format_NHWC; | |||
| // mul_weight->dataType = TypeId::kNumberTypeFloat32; | |||
| // mul_weight->dims = {1, 5, 5, 8}; | |||
| // mul_weight->data.resize(5*5*8*sizeof(float)); | |||
| // msSubgraph->allTensors.emplace_back(std::move(mul_weight)); | |||
| // | |||
| // auto mul_output = std::make_unique<schema::TensorDefT>(); | |||
| // mul_output->refCount = 0; | |||
| // mul_output->format = schema::Format_NHWC; | |||
| // mul_output->dataType = TypeId::kNumberTypeFloat32; | |||
| // mul_output->dims = {1, 5, 5, 8}; | |||
| // msSubgraph->allTensors.emplace_back(std::move(mul_output)); | |||
| // msGraph->subgraphs.emplace_back(std::move(msSubgraph)); | |||
| // | |||
| // flatbuffers::FlatBufferBuilder builder(1024); | |||
| // auto offset = schema::GraphDef::Pack(builder, msGraph.get()); | |||
| // builder.Finish(offset); | |||
| // size_t size = builder.GetSize(); | |||
| // const char *content = (char *)builder.GetBufferPointer(); | |||
| // const std::string strstub = ""; | |||
| // | |||
| // auto func_graph = inference::LoadModel(content, size, strstub); | |||
| // ASSERT_NE(nullptr, func_graph); | |||
| // auto session = inference::MSSession::CreateSession(kCPUDevice, 0); | |||
| // ASSERT_NE(nullptr, session); | |||
| // auto graphId = session->CompileGraph(func_graph); | |||
| // | |||
| // auto inTensor = | |||
| // std::shared_ptr<inference::MSTensor>(inference::MSTensor::CreateTensor(TypeId::kNumberTypeFloat32, {1, 5, 5, 3})); | |||
| // ASSERT_NE(nullptr, inTensor); | |||
| // ASSERT_EQ(sizeof(float) * (5 * 5 * 3), inTensor->Size()); | |||
| // (void)inTensor->MutableData(); | |||
| // | |||
| // std::vector<std::shared_ptr<inference::MSTensor>> inputs; | |||
| // inputs.emplace_back(inTensor); | |||
| // auto outputs = session->RunGraph(graphId, inputs); | |||
| // ASSERT_EQ(1, outputs.size()); | |||
| // ASSERT_EQ(1, outputs.front().size()); | |||
| // auto runOutput = outputs.front().front(); | |||
| // ASSERT_NE(nullptr, runOutput); | |||
| // ASSERT_EQ(5 * 5 * 8, runOutput->ElementsNum()); | |||
| // ASSERT_EQ(TypeId::kNumberTypeFloat32, runOutput->data_type()); | |||
| // MS_LOG(INFO) << "Passed"; | |||
| // } | |||
| } // namespace mindspore | |||