Merge pull request !31 from changzherui/syn-codetags/v0.3.0-alpha
| @@ -105,11 +105,11 @@ When reporting issues, refer to this format: | |||
| * If it is a new feature that needs lots of design details, a design proposal should also be submitted. | |||
| * After reaching consensus in the issue discussions and design proposal reviews, complete the development on the forked repo and submit a PR. | |||
| * No PR is permitted to be merged until it receives **2+ LGTM** from approvers. Please NOTICE that an approver is NOT allowed to add *LGTM* to his own PR. | |||
| * After PR is sufficiently discussed, it will get merged, abondoned or rejected depending on the outcome of the discussion. | |||
| * After PR is sufficiently discussed, it will get merged, abandoned or rejected depending on the outcome of the discussion. | |||
| **PRs advisory:** | |||
| - Any irrelevant changes should be avoided. | |||
| - Make sure your commit history is ordered. | |||
| - Always keep your branch up to date with the master branch. | |||
| - For bug-fix PRs, make sure all related issues are linked. | |||
| - For bug-fix PRs, make sure all related issues are linked. | |||
| @@ -129,7 +129,7 @@ Check out how MindSpore Open Governance [works](https://gitee.com/mindspore/comm | |||
| - [MindSpore Slack](https://join.slack.com/t/mindspore/shared_invite/enQtOTcwMTIxMDI3NjM0LTNkMWM2MzI5NjIyZWU5ZWQ5M2EwMTQ5MWNiYzMxOGM4OWFhZjI4M2E5OGI2YTg3ODU1ODE2Njg1MThiNWI3YmQ) - Communication platform for developers. | |||
| - IRC channel at `#mindspore` (only for meeting minutes logging purpose) | |||
| - Video Conferencing: meet.jit.si | |||
| - Mailing-list: https://mailweb.mindspore.cn/postorius/lists | |||
| - Mailing-list: https://mailweb.mindspore.cn/postorius/lists | |||
| ## Contributing | |||
| @@ -70,4 +70,4 @@ | |||
| * [MindSpore Official Website](https://www.mindspore.cn/) | |||
| * [MindInsight Visualization Debugging and Optimization](https://gitee.com/mindspore/mindinsight) | |||
| * [MindArmour Model Security Hardening Package](https://gitee.com/mindspore/mindarmour) | |||
| * [GraphEngine Computational Graph Engine](https://gitee.com/mindspore/graphengine) | |||
| * [GraphEngine Computational Graph Engine](https://gitee.com/mindspore/graphengine) | |||
| @@ -368,7 +368,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |||
| THE SOFTWARE. | |||
| Software: MKL-DNN 1.1.2 | |||
| Software: oneDNN 1.1.2 | |||
| Copyright (c) 2009-2018 The MathJax Consortium | |||
| Copyright 2018 Intel Corporation | |||
| Copyright 2019 Intel Corporation | |||
| @@ -26,7 +26,7 @@ usage() | |||
| echo "Usage:" | |||
| echo "bash build.sh [-d] [-r] [-v] [-c on|off] [-t on|off] [-g on|off] [-h] [-s] [-b ge|cpu] [-m infer|train] \\" | |||
| echo " [-a on|off] [-g on|off] [-p on|off] [-i] [-L] [-R] [-D on|off] [-j[n]] [-e gpu|d|cpu] \\" | |||
| echo " [-P on|off] [-z] [-M on|off] [-V 9.2|10.1] [-I] [-K]" | |||
| echo " [-P on|off] [-z [on|off]] [-M on|off] [-V 9.2|10.1] [-I] [-K]" | |||
| echo "" | |||
| echo "Options:" | |||
| echo " -d Debug mode" | |||
| @@ -50,8 +50,8 @@ usage() | |||
| echo " -P Enable dump anf graph to file in ProtoBuffer format, default on" | |||
| echo " -Q Enable dump end to end, default off" | |||
| echo " -D Enable dumping of function graph ir, default on" | |||
| echo " -z Compile dataset & mindrecord, default off" | |||
| echo " -M Enable MPI and NCCL for GPU training, default off" | |||
| echo " -z Compile dataset & mindrecord, default on" | |||
| echo " -M Enable MPI and NCCL for GPU training, default on" | |||
| echo " -V Specify the minimum required cuda version, default CUDA 9.2" | |||
| echo " -I Compile predict, default off" | |||
| echo " -K Compile with AKG, default off" | |||
| @@ -88,8 +88,8 @@ checkopts() | |||
| ENABLE_DUMP2PROTO="on" | |||
| ENABLE_DUMPE2E="off" | |||
| ENABLE_DUMP_IR="on" | |||
| COMPILE_MINDDATA="off" | |||
| ENABLE_MPI="off" | |||
| COMPILE_MINDDATA="on" | |||
| ENABLE_MPI="on" | |||
| CUDA_VERSION="9.2" | |||
| COMPILE_PREDICT="off" | |||
| USE_GLOG="on" | |||
| @@ -177,7 +177,7 @@ checkopts() | |||
| if [[ "X$OPTARG" == "Xgpu" ]]; then | |||
| ENABLE_GPU="on" | |||
| ENABLE_CPU="on" | |||
| elif [[ "X$OPTARG" == "Xd" ]]; then | |||
| elif [[ "X$OPTARG" == "Xd" || "X$OPTARG" == "Xascend" ]]; then | |||
| ENABLE_D="on" | |||
| ENABLE_CPU="on" | |||
| elif [[ "X$OPTARG" == "Xcpu" ]]; then | |||
| @@ -216,7 +216,17 @@ checkopts() | |||
| echo "enable dump function graph ir" | |||
| ;; | |||
| z) | |||
| COMPILE_MINDDATA="on" | |||
| eval ARG=\$\{$OPTIND\} | |||
| if [[ -n $ARG && $ARG != -* ]]; then | |||
| OPTARG=$ARG | |||
| check_on_off $OPTARG z | |||
| OPTIND=$((OPTIND + 1)) | |||
| else | |||
| OPTARG="" | |||
| fi | |||
| if [[ "X$OPTARG" == "Xoff" ]]; then | |||
| COMPILE_MINDDATA="off" | |||
| fi | |||
| ;; | |||
| I) | |||
| COMPILE_PREDICT="on" | |||
| @@ -452,8 +462,10 @@ if [[ "X$INC_BUILD" = "Xoff" ]]; then | |||
| bash "${PROJECT_PATH}/package.sh" ge | |||
| elif [[ "X$ENABLE_GPU" = "Xon" ]]; then | |||
| bash "${PROJECT_PATH}/package.sh" ms gpu | |||
| elif [[ "X$ENABLE_D" = "Xon" ]] || [[ "X$ENABLE_CPU" = "Xon" ]]; then | |||
| bash "${PROJECT_PATH}/package.sh" ms | |||
| elif [[ "X$ENABLE_D" = "Xon" ]]; then | |||
| bash "${PROJECT_PATH}/package.sh" ms ascend | |||
| elif [[ "X$ENABLE_CPU" = "Xon" ]]; then | |||
| bash "${PROJECT_PATH}/package.sh" ms cpu | |||
| else | |||
| bash "${PROJECT_PATH}/package.sh" debug | |||
| fi | |||
| @@ -39,7 +39,11 @@ elseif (DEFINED ENV{D_LINK_PATH}) | |||
| find_library(resource libresource.so ${GE_LIB_PATH}) | |||
| else() | |||
| # Ascend mode | |||
| set(ASCEND_PATH /usr/local/Ascend) | |||
| if(DEFINED ENV{ASCEND_CUSTOM_PATH}) | |||
| set(ASCEND_PATH $ENV{ASCEND_CUSTOM_PATH}) | |||
| else() | |||
| set(ASCEND_PATH /usr/local/Ascend) | |||
| endif() | |||
| set(ASCEND_DRIVER_PATH ${ASCEND_PATH}/driver/lib64/common) | |||
| set(ASCEND_RUNTIME_PATH ${ASCEND_PATH}/fwkacllib/lib64) | |||
| find_library(c_sec libc_sec.so ${ASCEND_DRIVER_PATH}) | |||
| @@ -1,11 +1,11 @@ | |||
| set(mkl_dnn_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2") | |||
| set(mkl_dnn_CFLAGS "-D_FORTIFY_SOURCE=2 -O2") | |||
| mindspore_add_pkg(mkl_dnn | |||
| VER 1.1.1 | |||
| set(onednn_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2") | |||
| set(onednn_CFLAGS "-D_FORTIFY_SOURCE=2 -O2") | |||
| mindspore_add_pkg(onednn | |||
| VER 1.1.2 | |||
| LIBS dnnl mkldnn | |||
| URL https://github.com/intel/mkl-dnn/archive/v1.1.1.tar.gz | |||
| MD5 d6a422b00459600bdc22242590953f38 | |||
| URL https://github.com/oneapi-src/oneDNN/archive/v1.1.2.tar.gz | |||
| MD5 ab40d52230f3ad1d7a6f06ce0f6bc17a | |||
| CMAKE_OPTION -DDNNL_ARCH_OPT_FLAGS='' -DDNNL_CPU_RUNTIME='SEQ' -DDNNL_BUILD_EXAMPLES=OFF -DDNNL_BUILD_TESTS=OFF) | |||
| include_directories(${mkl_dnn_INC}) | |||
| add_library(mindspore::dnnl ALIAS mkl_dnn::dnnl) | |||
| add_library(mindspore::mkldnn ALIAS mkl_dnn::mkldnn) | |||
| include_directories(${onednn_INC}) | |||
| add_library(mindspore::dnnl ALIAS onednn::dnnl) | |||
| add_library(mindspore::mkldnn ALIAS onednn::mkldnn) | |||
| @@ -29,11 +29,11 @@ if (ENABLE_GPU) | |||
| include(${CMAKE_SOURCE_DIR}/cmake/external_libs/dmlc_core.cmake) | |||
| include(${CMAKE_SOURCE_DIR}/cmake/external_libs/rang.cmake) | |||
| include(${CMAKE_SOURCE_DIR}/cmake/external_libs/tvm_gpu.cmake) | |||
| endif() | |||
| if (ENABLE_MPI) | |||
| include(${CMAKE_SOURCE_DIR}/cmake/external_libs/nccl.cmake) | |||
| include(${CMAKE_SOURCE_DIR}/cmake/external_libs/ompi.cmake) | |||
| if (ENABLE_MPI) | |||
| include(${CMAKE_SOURCE_DIR}/cmake/external_libs/nccl.cmake) | |||
| include(${CMAKE_SOURCE_DIR}/cmake/external_libs/ompi.cmake) | |||
| endif() | |||
| endif() | |||
| if (ENABLE_GE) | |||
| @@ -40,6 +40,8 @@ else() | |||
| set(JOBS 8) | |||
| if (${JOBS} GREATER ${N}) | |||
| set(THNUM ${N}) | |||
| else() | |||
| set(THNUM ${JOBS}) | |||
| endif() | |||
| endif () | |||
| message("set make thread num: ${THNUM}") | |||
| @@ -67,7 +67,7 @@ if __name__ == '__main__': | |||
| parser.add_argument("--distribute", type=bool, default=False, help="Run distribute, default is false.") | |||
| parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.") | |||
| parser.add_argument("--device_num", type=int, default=1, help="Use device nums, default is 1.") | |||
| parser.add_argument("--mode", type=str, default="graph", help="Run graph mode or feed mode, default is graph") | |||
| parser.add_argument("--mode", type=str, default="sink", help="Run sink mode or not, default is sink") | |||
| parser.add_argument("--epoch_size", type=int, default=10, help="Epoch size, default is 10") | |||
| parser.add_argument("--batch_size", type=int, default=32, help="Batch size, default is 32.") | |||
| parser.add_argument("--checkpoint_path", type=str, default="", help="Checkpoint file path") | |||
| @@ -150,8 +150,8 @@ if __name__ == '__main__': | |||
| model = Model(net) | |||
| dataset_sink_mode = False | |||
| if args_opt.mode == "graph": | |||
| print("In graph mode, one epoch return a loss.") | |||
| if args_opt.mode == "sink": | |||
| print("In sink mode, one epoch return a loss.") | |||
| dataset_sink_mode = True | |||
| print("Start train YOLOv3, the first epoch will be slower because of the graph compilation.") | |||
| model.train(args_opt.epoch_size, dataset, callbacks=callback, dataset_sink_mode=dataset_sink_mode) | |||
| @@ -132,6 +132,7 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} | |||
| "kernel/kash/*.cc" | |||
| "device/kernel_info.cc" | |||
| "device/kernel_runtime.cc" | |||
| "device/memory_manager.cc" | |||
| "device/kernel_runtime_manager.cc" | |||
| "device/convert_tensor_utils.cc" | |||
| "pre_activate/common/*.cc" | |||
| @@ -295,7 +296,11 @@ if(ENABLE_D) | |||
| endif() | |||
| else() | |||
| MESSAGE("use system default lib") | |||
| set(ASCEND_PATH /usr/local/Ascend) | |||
| if(DEFINED ENV{ASCEND_CUSTOM_PATH}) | |||
| set(ASCEND_PATH $ENV{ASCEND_CUSTOM_PATH}) | |||
| else() | |||
| set(ASCEND_PATH /usr/local/Ascend) | |||
| endif() | |||
| set(ASCEND_DRIVER_PATH ${ASCEND_PATH}/driver/lib64/common) | |||
| set(ASCEND_RUNTIME_PATH ${ASCEND_PATH}/fwkacllib/lib64) | |||
| endif() | |||
| @@ -499,7 +504,11 @@ add_dependencies(add_ms_lib _c_expression) | |||
| if (NOT ENABLE_GE) | |||
| if (ENABLE_D) | |||
| set(ASCEND_PATH /usr/local/Ascend) | |||
| if(DEFINED ENV{ASCEND_CUSTOM_PATH}) | |||
| set(ASCEND_PATH $ENV{ASCEND_CUSTOM_PATH}) | |||
| else() | |||
| set(ASCEND_PATH /usr/local/Ascend) | |||
| endif() | |||
| set(ASCEND_DRIVER_PATH ${ASCEND_PATH}/driver/lib64/common) | |||
| add_custom_target(add_ge_lib ALL | |||
| COMMAND cp ${MS_CCSRC_BUILD_PATH}/../../graphengine/src/common/graph/libgraph.so ${MS_LIB_PATH} | |||
| @@ -542,7 +551,7 @@ endif() | |||
| if (ENABLE_CPU) | |||
| add_custom_target(add_cpu_lib ALL | |||
| COMMAND cp ${mkl_dnn_LIBPATH}/libdnnl.so.1.1 ${MS_LIB_PATH}/libdnnl.so.1 | |||
| COMMAND cp ${onednn_LIBPATH}/libdnnl.so.1.1 ${MS_LIB_PATH}/libdnnl.so.1 | |||
| ) | |||
| add_dependencies(add_cpu_lib add_ms_lib) | |||
| endif() | |||
| @@ -17,8 +17,6 @@ if (ENABLE_TDTQUE) | |||
| message(STATUS "TDT queue is enabled") | |||
| endif () | |||
| add_definitions(-D ENABLE_MINDRECORD) | |||
| # code coverage | |||
| # option(ENABLE_COVERAGE "Enable code coverage report" OFF) | |||
| # if (ENABLE_COVERAGE) | |||
| @@ -23,17 +23,14 @@ | |||
| #include "dataset/engine/datasetops/source/image_folder_op.h" | |||
| #include "dataset/engine/datasetops/source/mnist_op.h" | |||
| #include "dataset/engine/datasetops/source/voc_op.h" | |||
| #include "dataset/util/make_unique.h" | |||
| #include "dataset/core/tensor.h" | |||
| #include "dataset/engine/dataset_iterator.h" | |||
| #include "dataset/engine/datasetops/source/manifest_op.h" | |||
| #include "dataset/engine/datasetops/source/cifar_op.h" | |||
| #include "dataset/engine/datasetops/source/celeba_op.h" | |||
| #ifdef ENABLE_MINDRECORD | |||
| #include "./shard_category.h" | |||
| #include "./shard_sample.h" | |||
| #include "./shard_shuffle.h" | |||
| #endif | |||
| #include "mindrecord/include/shard_category.h" | |||
| #include "mindrecord/include/shard_sample.h" | |||
| #include "mindrecord/include/shard_shuffle.h" | |||
| #include "dataset/util/random.h" | |||
| #include "dataset/util/status.h" | |||
| @@ -46,9 +43,7 @@ using pFunction = Status (DEPipeline::*)(const py::dict &, std::shared_ptr<Datas | |||
| static std::unordered_map<uint32_t, pFunction> g_parse_op_func_ = {{kStorage, &DEPipeline::ParseStorageOp}, | |||
| {kShuffle, &DEPipeline::ParseShuffleOp}, | |||
| #ifdef ENABLE_MINDRECORD | |||
| {kMindrecord, &DEPipeline::ParseMindRecordOp}, | |||
| #endif | |||
| {kMap, &DEPipeline::ParseMapOp}, | |||
| {kBatch, &DEPipeline::ParseBatchOp}, | |||
| {kRepeat, &DEPipeline::ParseRepeatOp}, | |||
| @@ -123,7 +118,7 @@ Status DEPipeline::AssignRootNode(const DsOpPtr &dataset_op) { return (tree_->As | |||
| Status DEPipeline::LaunchTreeExec() { | |||
| RETURN_IF_NOT_OK(tree_->Prepare()); | |||
| RETURN_IF_NOT_OK(tree_->Launch()); | |||
| iterator_ = make_unique<DatasetIterator>(tree_); | |||
| iterator_ = std::make_unique<DatasetIterator>(tree_); | |||
| if (iterator_ == nullptr) RETURN_STATUS_UNEXPECTED("Cannot create an Iterator."); | |||
| return Status::OK(); | |||
| } | |||
| @@ -311,7 +306,7 @@ Status DEPipeline::ParseStorageOp(const py::dict &args, std::shared_ptr<DatasetO | |||
| if (!args["schema"].is_none()) { | |||
| (void)builder->SetSchemaFile(ToString(args["schema"])); | |||
| } else if (!args["schema_json_string"].is_none()) { | |||
| std::unique_ptr<DataSchema> schema = make_unique<DataSchema>(); | |||
| std::unique_ptr<DataSchema> schema = std::make_unique<DataSchema>(); | |||
| std::string s = ToString(args["schema_json_string"]); | |||
| RETURN_IF_NOT_OK(schema->LoadSchemaString(s, std::vector<std::string>())); | |||
| (void)builder->SetNumRows(schema->num_rows()); | |||
| @@ -364,7 +359,6 @@ Status DEPipeline::ParseShuffleOp(const py::dict &args, std::shared_ptr<DatasetO | |||
| return Status::OK(); | |||
| } | |||
| #ifdef ENABLE_MINDRECORD | |||
| Status DEPipeline::CheckMindRecordPartitionInfo(const py::dict &args, std::vector<int> *in_partitions) { | |||
| if (args["partitions"].is_none()) { | |||
| std::string err_msg = "Error: partitions is not set (None)"; | |||
| @@ -450,7 +444,6 @@ Status DEPipeline::ParseMindRecordOp(const py::dict &args, std::shared_ptr<Datas | |||
| *ptr = op; | |||
| return Status::OK(); | |||
| } | |||
| #endif | |||
| Status DEPipeline::ParseMapOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) { | |||
| std::shared_ptr<MapOp::Builder> builder = std::make_shared<MapOp::Builder>(); | |||
| @@ -689,7 +682,7 @@ Status DEPipeline::ParseTFReaderOp(const py::dict &args, std::shared_ptr<Dataset | |||
| } | |||
| } | |||
| if (schema_exists) { | |||
| std::unique_ptr<DataSchema> schema = make_unique<DataSchema>(); | |||
| std::unique_ptr<DataSchema> schema = std::make_unique<DataSchema>(); | |||
| if (args.contains("schema_file_path")) { | |||
| RETURN_IF_NOT_OK(schema->LoadSchemaFile(ToString(args["schema_file_path"]), columns_to_load)); | |||
| } else { | |||
| @@ -38,9 +38,7 @@ using DsOpPtr = std::shared_ptr<DatasetOp>; | |||
| enum OpName { | |||
| kStorage = 0, | |||
| kShuffle, | |||
| #ifdef ENABLE_MINDRECORD | |||
| kMindrecord, | |||
| #endif | |||
| kBatch, | |||
| kCache, | |||
| kRepeat, | |||
| @@ -101,11 +99,9 @@ class DEPipeline { | |||
| Status ParseShuffleOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr); | |||
| #ifdef ENABLE_MINDRECORD | |||
| Status CheckMindRecordPartitionInfo(const py::dict &args, std::vector<int> *ptr); | |||
| Status ParseMindRecordOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr); | |||
| #endif | |||
| Status ParseMapOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr); | |||
| @@ -44,9 +44,7 @@ | |||
| #include "dataset/engine/datasetops/source/io_block.h" | |||
| #include "dataset/engine/datasetops/source/mnist_op.h" | |||
| #include "dataset/engine/datasetops/source/manifest_op.h" | |||
| #ifdef ENABLE_MINDRECORD | |||
| #include "dataset/engine/datasetops/source/mindrecord_op.h" | |||
| #endif | |||
| #include "dataset/engine/datasetops/source/sampler/distributed_sampler.h" | |||
| #include "dataset/engine/datasetops/source/sampler/pk_sampler.h" | |||
| #include "dataset/engine/datasetops/source/sampler/random_sampler.h" | |||
| @@ -146,14 +144,12 @@ void bindDatasetOps(py::module *m) { | |||
| return py::make_tuple(count, num_classes); | |||
| }); | |||
| #ifdef ENABLE_MINDRECORD | |||
| (void)py::class_<MindRecordOp, DatasetOp, std::shared_ptr<MindRecordOp>>(*m, "MindRecordOp") | |||
| .def_static("get_num_rows", [](const std::string &path) { | |||
| int64_t count = 0; | |||
| THROW_IF_ERROR(MindRecordOp::CountTotalRows(path, &count)); | |||
| return count; | |||
| }); | |||
| #endif | |||
| (void)py::class_<ManifestOp, DatasetOp, std::shared_ptr<ManifestOp>>(*m, "ManifestOp") | |||
| .def_static("get_num_rows_and_classes", | |||
| @@ -424,9 +420,7 @@ PYBIND11_MODULE(_c_dataengine, m) { | |||
| .value("STORAGE", OpName::kStorage) | |||
| .value("SHUFFLE", OpName::kShuffle) | |||
| .value("BATCH", OpName::kBatch) | |||
| #ifdef ENABLE_MINDRECORD | |||
| .value("MINDRECORD", OpName::kMindrecord) | |||
| #endif | |||
| .value("CACHE", OpName::kCache) | |||
| .value("REPEAT", OpName::kRepeat) | |||
| .value("TAKE", OpName::kTake) | |||
| @@ -55,9 +55,9 @@ Status GlobalContext::Init() { | |||
| // For testing we can use Dummy pool instead | |||
| // Create some tensor allocators for the different types and hook them into the pool. | |||
| tensor_allocator_ = mindspore::make_unique<Allocator<Tensor>>(mem_pool_); | |||
| cv_tensor_allocator_ = mindspore::make_unique<Allocator<CVTensor>>(mem_pool_); | |||
| int_allocator_ = mindspore::make_unique<IntAlloc>(mem_pool_); | |||
| tensor_allocator_ = std::make_unique<Allocator<Tensor>>(mem_pool_); | |||
| cv_tensor_allocator_ = std::make_unique<Allocator<CVTensor>>(mem_pool_); | |||
| int_allocator_ = std::make_unique<IntAlloc>(mem_pool_); | |||
| return Status::OK(); | |||
| } | |||
| @@ -28,7 +28,6 @@ | |||
| #include "dataset/core/global_context.h" | |||
| #include "dataset/core/pybind_support.h" | |||
| #include "dataset/core/tensor_shape.h" | |||
| #include "dataset/util/make_unique.h" | |||
| namespace py = pybind11; | |||
| namespace mindspore { | |||
| @@ -53,7 +52,7 @@ namespace dataset { | |||
| Tensor::Tensor(const TensorShape &shape, const DataType &type) : shape_(shape), type_(type), data_(nullptr) { | |||
| // grab the mem pool from global context and create the allocator for char data area | |||
| std::shared_ptr<MemoryPool> global_pool = GlobalContext::Instance()->mem_pool(); | |||
| data_allocator_ = mindspore::make_unique<Allocator<unsigned char>>(global_pool); | |||
| data_allocator_ = std::make_unique<Allocator<unsigned char>>(global_pool); | |||
| } | |||
| Tensor::Tensor(const TensorShape &shape, const DataType &type, const unsigned char *data) : Tensor(shape, type) { | |||
| @@ -137,7 +136,7 @@ Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, py::array arr) { | |||
| if ((*ptr)->type_ == DataType::DE_UNKNOWN) RETURN_STATUS_UNEXPECTED("Invalid data type."); | |||
| std::shared_ptr<MemoryPool> global_pool = GlobalContext::Instance()->mem_pool(); | |||
| (*ptr)->data_allocator_ = mindspore::make_unique<Allocator<unsigned char>>(global_pool); | |||
| (*ptr)->data_allocator_ = std::make_unique<Allocator<unsigned char>>(global_pool); | |||
| static_cast<void>((*ptr)->StartAddr()); | |||
| int64_t byte_size = (*ptr)->SizeInBytes(); | |||
| unsigned char *data = static_cast<unsigned char *>(arr.request().ptr); | |||
| @@ -40,7 +40,7 @@ Status DataBuffer::CreateDataBuffer( | |||
| case DatasetType::kTf: { | |||
| // This type of buffer is for TF record data. | |||
| // Allocate derived class version for a TF buffers | |||
| new_data_buffer = mindspore::make_unique<TFBuffer>(id, kDeBFlagNone, storage_client); | |||
| new_data_buffer = std::make_unique<TFBuffer>(id, kDeBFlagNone, storage_client); | |||
| break; | |||
| } | |||
| default: { | |||
| @@ -26,8 +26,8 @@ | |||
| #include "common/utils.h" | |||
| #include "dataset/util/status.h" | |||
| #include "dataset/core/tensor_shape.h" | |||
| #include "dataset/util/make_unique.h" | |||
| #include "utils/log_adapter.h" | |||
| #include "dataset/util/de_error.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| @@ -58,7 +58,7 @@ ColDescriptor::ColDescriptor(const std::string &col_name, DataType col_type, Ten | |||
| // our shape. Otherwise, set our shape to be empty. | |||
| if (in_shape != nullptr) { | |||
| // Create a shape and copy construct it into our column's shape. | |||
| tensor_shape_ = mindspore::make_unique<TensorShape>(*in_shape); | |||
| tensor_shape_ = std::make_unique<TensorShape>(*in_shape); | |||
| } else { | |||
| tensor_shape_ = nullptr; | |||
| } | |||
| @@ -75,7 +75,7 @@ ColDescriptor::ColDescriptor(const std::string &col_name, DataType col_type, Ten | |||
| ColDescriptor::ColDescriptor(const ColDescriptor &in_cd) | |||
| : type_(in_cd.type_), rank_(in_cd.rank_), tensor_impl_(in_cd.tensor_impl_), col_name_(in_cd.col_name_) { | |||
| // If it has a tensor shape, make a copy of it with our own unique_ptr. | |||
| tensor_shape_ = in_cd.hasShape() ? mindspore::make_unique<TensorShape>(in_cd.shape()) : nullptr; | |||
| tensor_shape_ = in_cd.hasShape() ? std::make_unique<TensorShape>(in_cd.shape()) : nullptr; | |||
| } | |||
| // Assignment overload | |||
| @@ -86,7 +86,7 @@ ColDescriptor &ColDescriptor::operator=(const ColDescriptor &in_cd) { | |||
| tensor_impl_ = in_cd.tensor_impl_; | |||
| col_name_ = in_cd.col_name_; | |||
| // If it has a tensor shape, make a copy of it with our own unique_ptr. | |||
| tensor_shape_ = in_cd.hasShape() ? mindspore::make_unique<TensorShape>(in_cd.shape()) : nullptr; | |||
| tensor_shape_ = in_cd.hasShape() ? std::make_unique<TensorShape>(in_cd.shape()) : nullptr; | |||
| } | |||
| return *this; | |||
| } | |||
| @@ -59,8 +59,8 @@ Status BatchOp::operator()() { | |||
| TaskManager::FindMe()->Post(); | |||
| int32_t epoch_num = 0, batch_num = 0, cnt = 0; | |||
| TensorRow new_row; | |||
| std::unique_ptr<TensorQTable> table = make_unique<TensorQTable>(); | |||
| child_iterator_ = mindspore::make_unique<ChildIterator>(this, 0, 0); | |||
| std::unique_ptr<TensorQTable> table = std::make_unique<TensorQTable>(); | |||
| child_iterator_ = std::make_unique<ChildIterator>(this, 0, 0); | |||
| RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&new_row)); | |||
| column_name_map_ = child_iterator_->col_name_id_map(); | |||
| int32_t cur_batch_size = 0; | |||
| @@ -72,7 +72,7 @@ Status BatchOp::operator()() { | |||
| if (table->size() == static_cast<size_t>(cur_batch_size)) { | |||
| RETURN_IF_NOT_OK(worker_queues_[cnt++ % num_workers_]->EmplaceBack( | |||
| std::make_pair(std::move(table), CBatchInfo(epoch_num, batch_num++, cnt - epoch_num)))); | |||
| table = make_unique<TensorQTable>(); | |||
| table = std::make_unique<TensorQTable>(); | |||
| RETURN_IF_NOT_OK(GetBatchSize(&cur_batch_size, CBatchInfo(epoch_num, batch_num, cnt - epoch_num))); | |||
| } | |||
| RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&new_row)); | |||
| @@ -82,7 +82,7 @@ Status BatchOp::operator()() { | |||
| RETURN_IF_NOT_OK(worker_queues_[cnt++ % num_workers_]->EmplaceBack( | |||
| std::make_pair(std::move(table), CBatchInfo(epoch_num, batch_num++, cnt - epoch_num)))); | |||
| } | |||
| table = make_unique<TensorQTable>(); // this drops when drop == true | |||
| table = std::make_unique<TensorQTable>(); // this drops when drop == true | |||
| // end of the current epoch, batch_num should start from 0 again | |||
| batch_num = 0; | |||
| epoch_num++; | |||
| @@ -153,9 +153,9 @@ Status BatchOp::WorkerEntry(int32_t workerId) { | |||
| RETURN_IF_NOT_OK(worker_queues_[workerId]->PopFront(&table_pair)); | |||
| while (table_pair.second.ctrl_ != batchCtrl::kQuit) { | |||
| if (table_pair.second.ctrl_ == batchCtrl::kEOE) { | |||
| RETURN_IF_NOT_OK(out_connector_->Add(workerId, make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE))); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(workerId, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE))); | |||
| } else if (table_pair.second.ctrl_ == batchCtrl::kEOF) { | |||
| RETURN_IF_NOT_OK(out_connector_->Add(workerId, make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF))); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(workerId, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF))); | |||
| } else if (table_pair.second.ctrl_ == batchCtrl::kNoCtrl) { | |||
| std::unique_ptr<DataBuffer> db = nullptr; | |||
| RETURN_IF_NOT_OK(MakeBatchedBuffer(std::move(table_pair), &db)); | |||
| @@ -170,8 +170,8 @@ Status BatchOp::MakeBatchedBuffer(std::pair<std::unique_ptr<TensorQTable>, CBatc | |||
| std::unique_ptr<DataBuffer> *db) { | |||
| RETURN_UNEXPECTED_IF_NULL(table_pair.first); | |||
| if (!input_column_names_.empty()) RETURN_IF_NOT_OK(MapColumns(&table_pair)); // pass it through pyfunc | |||
| (*db) = make_unique<DataBuffer>(table_pair.second.batch_num_, DataBuffer::kDeBFlagNone); | |||
| std::unique_ptr<TensorQTable> dest_table = make_unique<TensorQTable>(); | |||
| (*db) = std::make_unique<DataBuffer>(table_pair.second.batch_num_, DataBuffer::kDeBFlagNone); | |||
| std::unique_ptr<TensorQTable> dest_table = std::make_unique<TensorQTable>(); | |||
| RETURN_IF_NOT_OK(BatchRows(&table_pair.first, &dest_table, table_pair.first->size())); | |||
| (*db)->set_tensor_table(std::move(dest_table)); | |||
| (*db)->set_column_name_map(column_name_map_); | |||
| @@ -80,9 +80,9 @@ void DatasetOp::CreateConnector(int32_t num_producers, int32_t num_consumers) { | |||
| MS_LOG(INFO) << "Creating connector in tree operator: " << operator_id_ << ". Producer: " << num_producers | |||
| << ". Consumer: " << num_consumers << "."; | |||
| if (oc_queue_size_ > 0) { | |||
| out_connector_ = mindspore::make_unique<DbConnector>(num_producers, // The number of producers | |||
| num_consumers, // Only one consumer (the training App) | |||
| oc_queue_size_); | |||
| out_connector_ = std::make_unique<DbConnector>(num_producers, // The number of producers | |||
| num_consumers, // Only one consumer (the training App) | |||
| oc_queue_size_); | |||
| } else { | |||
| // Some op's may choose not to have an output connector | |||
| MS_LOG(INFO) << "Bypassed connector creation for tree operator: " << operator_id_ << "."; | |||
| @@ -149,7 +149,7 @@ Status DatasetOp::GetNextInput(std::unique_ptr<DataBuffer> *p_buffer, int32_t wo | |||
| // The base class implementation simply flows the eoe message to output. Derived classes | |||
| // may override if they need to perform special eoe handling. | |||
| Status DatasetOp::EoeReceived(int32_t worker_id) { | |||
| std::unique_ptr<DataBuffer> eoe_buffer = mindspore::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE); | |||
| std::unique_ptr<DataBuffer> eoe_buffer = std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE); | |||
| return (out_connector_->Add(static_cast<int>(worker_id), std::move(eoe_buffer))); | |||
| } | |||
| @@ -157,7 +157,7 @@ Status DatasetOp::EoeReceived(int32_t worker_id) { | |||
| // The base class implementation simply flows the eof message to output. Derived classes | |||
| // may override if they need to perform special eof handling. | |||
| Status DatasetOp::EofReceived(int32_t worker_id) { | |||
| std::unique_ptr<DataBuffer> eof_buffer = mindspore::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF); | |||
| std::unique_ptr<DataBuffer> eof_buffer = std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF); | |||
| return (out_connector_->Add(static_cast<int>(worker_id), std::move(eof_buffer))); | |||
| } | |||
| @@ -225,7 +225,7 @@ Status DeviceQueueOp::SendDataToCPU() { | |||
| MS_LOG(INFO) << "Device queue, sending data to CPU."; | |||
| int64_t total_batch = 0; | |||
| std::unique_ptr<ChildIterator> child_iterator = mindspore::make_unique<ChildIterator>(this, 0, 0); | |||
| std::unique_ptr<ChildIterator> child_iterator = std::make_unique<ChildIterator>(this, 0, 0); | |||
| while (!(child_iterator->eof_handled())) { | |||
| TensorRow curr_row; | |||
| RETURN_IF_NOT_OK(child_iterator->FetchNextTensorRow(&curr_row)); | |||
| @@ -179,7 +179,7 @@ Status MapOp::WorkerEntry(int32_t worker_id) { | |||
| RETURN_IF_NOT_OK(WorkerEntryInit(in_buffer.get(), &keep_input_columns, &to_process_indices, &final_col_name_id_map, | |||
| &input_columns, &output_columns)); | |||
| std::unique_ptr<TensorQTable> new_tensor_table(mindspore::make_unique<TensorQTable>()); | |||
| std::unique_ptr<TensorQTable> new_tensor_table(std::make_unique<TensorQTable>()); | |||
| // Perform the compute function of TensorOp(s) and store the result in new_tensor_table. | |||
| RETURN_IF_NOT_OK(WorkerCompute(in_buffer.get(), to_process_indices, new_tensor_table.get(), keep_input_columns, | |||
| &input_columns, &output_columns)); | |||
| @@ -48,7 +48,7 @@ Status ParallelOp::CreateWorkerConnector(int32_t worker_connector_size) { | |||
| // Instantiate the worker connector. This is the internal connector, not the operators | |||
| // output connector. It has single master consuming from it (num producers is 1), and the number | |||
| // of workers is the defined count from the op. | |||
| worker_connector_ = mindspore::make_unique<DbConnector>(num_workers_, num_producers_, worker_connector_size); | |||
| worker_connector_ = std::make_unique<DbConnector>(num_workers_, num_producers_, worker_connector_size); | |||
| return Status::OK(); | |||
| } | |||
| @@ -79,7 +79,7 @@ Status ProjectOp::Project(std::unique_ptr<DataBuffer> *data_buffer) { | |||
| new_column_name_mapping[current_column] = i; | |||
| projected_column_indices.push_back(column_name_mapping[current_column]); | |||
| } | |||
| std::unique_ptr<TensorQTable> new_tensor_table = mindspore::make_unique<TensorQTable>(); | |||
| std::unique_ptr<TensorQTable> new_tensor_table = std::make_unique<TensorQTable>(); | |||
| while ((*data_buffer)->NumRows() > 0) { | |||
| TensorRow current_row; | |||
| RETURN_IF_NOT_OK((*data_buffer)->PopRow(¤t_row)); | |||
| @@ -84,13 +84,13 @@ Status RenameOp::operator()() { | |||
| // we got eoe, now try again until we get eof | |||
| MS_LOG(INFO) << "Rename operator EOE Received."; | |||
| RETURN_IF_NOT_OK(out_connector_->Add(0, std::move(mindspore::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE)))); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(0, std::move(std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE)))); | |||
| MS_LOG(DEBUG) << "Rename operator fetching buffer after EOE."; | |||
| RETURN_IF_NOT_OK(GetNextInput(&curr_buffer)); | |||
| } // end of while eof loop | |||
| MS_LOG(INFO) << "Rename opeerator EOF Received."; | |||
| RETURN_IF_NOT_OK(out_connector_->Add(0, std::move(mindspore::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF)))); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(0, std::move(std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF)))); | |||
| return Status::OK(); | |||
| } | |||
| @@ -70,7 +70,7 @@ ShuffleOp::ShuffleOp(int32_t shuffle_size, uint32_t shuffle_seed, int32_t op_con | |||
| rng_(shuffle_seed), | |||
| buffer_counter_(0), | |||
| rows_per_buffer_(rows_per_buffer), | |||
| shuffle_buffer_(mindspore::make_unique<TensorTable>()), | |||
| shuffle_buffer_(std::make_unique<TensorTable>()), | |||
| shuffle_last_row_idx_(0), | |||
| shuffle_buffer_state_(kShuffleStateInit) {} | |||
| @@ -90,7 +90,7 @@ Status ShuffleOp::SelfReset() { | |||
| shuffle_seed_ = distribution(random_device); | |||
| rng_ = std::mt19937_64(shuffle_seed_); | |||
| } | |||
| shuffle_buffer_ = mindspore::make_unique<TensorTable>(); | |||
| shuffle_buffer_ = std::make_unique<TensorTable>(); | |||
| buffer_counter_ = 0; | |||
| shuffle_last_row_idx_ = 0; | |||
| shuffle_buffer_state_ = kShuffleStateInit; | |||
| @@ -142,7 +142,7 @@ Status ShuffleOp::operator()() { | |||
| // Create the child iterator to fetch our data from. | |||
| int32_t worker_id = 0; | |||
| int32_t child_idx = 0; | |||
| child_iterator_ = mindspore::make_unique<ChildIterator>(this, worker_id, child_idx); | |||
| child_iterator_ = std::make_unique<ChildIterator>(this, worker_id, child_idx); | |||
| // Main operator loop | |||
| while (true) { | |||
| @@ -161,7 +161,7 @@ Status ShuffleOp::operator()() { | |||
| // Step 1) | |||
| // Create an output tensor table if one is not created yet. | |||
| if (!new_buffer_table) { | |||
| new_buffer_table = mindspore::make_unique<TensorQTable>(); | |||
| new_buffer_table = std::make_unique<TensorQTable>(); | |||
| } | |||
| // Step 2) | |||
| @@ -176,7 +176,7 @@ Status ShuffleOp::operator()() { | |||
| // and send this buffer on its way up the pipeline. Special case is if this is the | |||
| // last row then we also send it. | |||
| if (new_buffer_table->size() == rows_per_buffer_ || shuffle_last_row_idx_ == 0) { | |||
| auto new_buffer = mindspore::make_unique<DataBuffer>(buffer_counter_, DataBuffer::kDeBFlagNone); | |||
| auto new_buffer = std::make_unique<DataBuffer>(buffer_counter_, DataBuffer::kDeBFlagNone); | |||
| new_buffer->set_tensor_table(std::move(new_buffer_table)); | |||
| new_buffer->set_column_name_map(column_name_map_); | |||
| buffer_counter_++; | |||
| @@ -218,7 +218,7 @@ Status ShuffleOp::operator()() { | |||
| // Since we overloaded eoeReceived function, we are responsible to flow the EOE up the | |||
| // pipeline manually now that we are done draining the shuffle buffer | |||
| MS_LOG(INFO) << "Shuffle operator sending EOE."; | |||
| auto eoe_buffer = mindspore::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE); | |||
| auto eoe_buffer = std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(0, std::move(eoe_buffer))); | |||
| // Do not wait for any reset to be flown down from operators above us. | |||
| @@ -40,7 +40,7 @@ Status CelebAOp::Builder::Build(std::shared_ptr<CelebAOp> *op) { | |||
| builder_sampler_ = std::make_shared<SequentialSampler>(); | |||
| } | |||
| builder_schema_ = make_unique<DataSchema>(); | |||
| builder_schema_ = std::make_unique<DataSchema>(); | |||
| RETURN_IF_NOT_OK( | |||
| builder_schema_->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1))); | |||
| // label is like this:0 1 0 0 1...... | |||
| @@ -83,7 +83,7 @@ CelebAOp::CelebAOp(int32_t num_workers, int32_t rows_per_buffer, const std::stri | |||
| col_name_map_[data_schema_->column(index).name()] = index; | |||
| } | |||
| attr_info_queue_ = make_unique<Queue<std::vector<std::string>>>(queue_size); | |||
| attr_info_queue_ = std::make_unique<Queue<std::vector<std::string>>>(queue_size); | |||
| io_block_queues_.Init(num_workers_, queue_size); | |||
| } | |||
| @@ -311,7 +311,7 @@ Status CelebAOp::AddIOBlock(std::unique_ptr<DataBuffer> *data_buffer) { | |||
| row_count++; | |||
| if (row_count % rows_per_buffer_ == 0) { | |||
| RETURN_IF_NOT_OK(io_block_queues_[buff_count++ % num_workers_]->Add( | |||
| make_unique<IOBlock>(IOBlock(keys, IOBlock::kDeIoBlockNone)))); | |||
| std::make_unique<IOBlock>(IOBlock(keys, IOBlock::kDeIoBlockNone)))); | |||
| keys.clear(); | |||
| } | |||
| } | |||
| @@ -320,21 +320,21 @@ Status CelebAOp::AddIOBlock(std::unique_ptr<DataBuffer> *data_buffer) { | |||
| if (!keys.empty()) { | |||
| RETURN_IF_NOT_OK(io_block_queues_[(buff_count++) % num_workers_]->Add( | |||
| make_unique<IOBlock>(IOBlock(keys, IOBlock::kDeIoBlockNone)))); | |||
| std::make_unique<IOBlock>(IOBlock(keys, IOBlock::kDeIoBlockNone)))); | |||
| } | |||
| if (!BitTest(op_ctrl_flags_, kDeOpRepeated) || BitTest(op_ctrl_flags_, kDeOpLastRepeat)) { | |||
| RETURN_IF_NOT_OK( | |||
| io_block_queues_[(buff_count++) % num_workers_]->Add(make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe))); | |||
| io_block_queues_[(buff_count++) % num_workers_]->Add(std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe))); | |||
| RETURN_IF_NOT_OK( | |||
| io_block_queues_[(buff_count++) % num_workers_]->Add(make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEof))); | |||
| io_block_queues_[(buff_count++) % num_workers_]->Add(std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEof))); | |||
| for (int32_t i = 0; i < num_workers_; i++) { | |||
| RETURN_IF_NOT_OK( | |||
| io_block_queues_[i]->Add(std::move(make_unique<IOBlock>(std::vector<int64_t>(), IOBlock::kDeIoBlockNone)))); | |||
| io_block_queues_[i]->Add(std::make_unique<IOBlock>(std::vector<int64_t>(), IOBlock::kDeIoBlockNone))); | |||
| } | |||
| return Status::OK(); | |||
| } else { // not the last repeat. Acquire lock, sleeps master thread, wait for the wake-up from reset | |||
| RETURN_IF_NOT_OK( | |||
| io_block_queues_[(buff_count++) % num_workers_]->Add(make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe))); | |||
| io_block_queues_[(buff_count++) % num_workers_]->Add(std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe))); | |||
| RETURN_IF_NOT_OK(wp_.Wait()); // Master thread goes to sleep after it has made all the IOBlocks | |||
| wp_.Clear(); | |||
| RETURN_IF_NOT_OK(sampler_->GetNextBuffer(data_buffer)); | |||
| @@ -349,17 +349,17 @@ Status CelebAOp::WorkerEntry(int32_t worker_id) { | |||
| RETURN_IF_NOT_OK(io_block_queues_[worker_id]->PopFront(&io_block)); | |||
| while (io_block != nullptr) { | |||
| if (io_block->eoe() == true) { | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::move(make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE)))); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE))); | |||
| buffer_id = worker_id; | |||
| } else if (io_block->eof() == true) { | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::move(make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF)))); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF))); | |||
| } else { | |||
| std::vector<int64_t> keys; | |||
| RETURN_IF_NOT_OK(io_block->GetKeys(&keys)); | |||
| if (keys.empty()) { | |||
| return Status::OK(); // empty key is a quit signal for workers | |||
| } | |||
| std::unique_ptr<DataBuffer> db = make_unique<DataBuffer>(buffer_id, DataBuffer::kDeBFlagNone); | |||
| std::unique_ptr<DataBuffer> db = std::make_unique<DataBuffer>(buffer_id, DataBuffer::kDeBFlagNone); | |||
| RETURN_IF_NOT_OK(LoadBuffer(keys, &db)); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::move(db))); | |||
| buffer_id += num_workers_; | |||
| @@ -370,7 +370,7 @@ Status CelebAOp::WorkerEntry(int32_t worker_id) { | |||
| } | |||
| Status CelebAOp::LoadBuffer(const std::vector<int64_t> &keys, std::unique_ptr<DataBuffer> *db) { | |||
| std::unique_ptr<TensorQTable> deq = make_unique<TensorQTable>(); | |||
| std::unique_ptr<TensorQTable> deq = std::make_unique<TensorQTable>(); | |||
| for (const auto &key : keys) { | |||
| TensorRow row; | |||
| RETURN_IF_NOT_OK(LoadTensorRow(image_labels_vec_[key], &row)); | |||
| @@ -47,7 +47,7 @@ Status CifarOp::Builder::Build(std::shared_ptr<CifarOp> *ptr) { | |||
| if (sampler_ == nullptr) { | |||
| sampler_ = std::make_shared<SequentialSampler>(); | |||
| } | |||
| schema_ = make_unique<DataSchema>(); | |||
| schema_ = std::make_unique<DataSchema>(); | |||
| TensorShape scalar = TensorShape::CreateScalar(); | |||
| RETURN_IF_NOT_OK(schema_->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1))); | |||
| if (cifar_type_ == kCifar10) { | |||
| @@ -91,7 +91,7 @@ CifarOp::CifarOp(CifarType type, int32_t num_works, int32_t rows_per_buf, const | |||
| col_name_map_[data_schema_->column(i).name()] = i; | |||
| } | |||
| constexpr uint64_t kUtilQueueSize = 512; | |||
| cifar_raw_data_block_ = make_unique<Queue<std::vector<unsigned char>>>(kUtilQueueSize); | |||
| cifar_raw_data_block_ = std::make_unique<Queue<std::vector<unsigned char>>>(kUtilQueueSize); | |||
| io_block_queues_.Init(num_workers_, queue_size); | |||
| } | |||
| @@ -114,7 +114,7 @@ Status CifarOp::operator()() { | |||
| if (row_cnt_ >= num_samples_) break; // enough row read, break for loop | |||
| if (row_cnt_ % rows_per_buffer_ == 0) { | |||
| RETURN_IF_NOT_OK(io_block_queues_[buf_cnt_++ % num_workers_]->Add( | |||
| make_unique<IOBlock>(IOBlock(keys, IOBlock::kDeIoBlockNone)))); | |||
| std::make_unique<IOBlock>(IOBlock(keys, IOBlock::kDeIoBlockNone)))); | |||
| keys.clear(); | |||
| } | |||
| } | |||
| @@ -122,21 +122,21 @@ Status CifarOp::operator()() { | |||
| } | |||
| if (keys.empty() == false) { | |||
| RETURN_IF_NOT_OK(io_block_queues_[(buf_cnt_++) % num_workers_]->Add( | |||
| make_unique<IOBlock>(IOBlock(keys, IOBlock::kDeIoBlockNone)))); | |||
| std::make_unique<IOBlock>(IOBlock(keys, IOBlock::kDeIoBlockNone)))); | |||
| } | |||
| if (!BitTest(op_ctrl_flags_, kDeOpRepeated) || BitTest(op_ctrl_flags_, kDeOpLastRepeat)) { | |||
| RETURN_IF_NOT_OK( | |||
| io_block_queues_[(buf_cnt_++) % num_workers_]->Add(make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe))); | |||
| io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe))); | |||
| RETURN_IF_NOT_OK( | |||
| io_block_queues_[(buf_cnt_++) % num_workers_]->Add(make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEof))); | |||
| io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEof))); | |||
| for (int32_t i = 0; i < num_workers_; i++) { | |||
| RETURN_IF_NOT_OK( | |||
| io_block_queues_[i]->Add(make_unique<IOBlock>(std::vector<int64_t>(), IOBlock::kDeIoBlockNone))); | |||
| io_block_queues_[i]->Add(std::make_unique<IOBlock>(std::vector<int64_t>(), IOBlock::kDeIoBlockNone))); | |||
| } | |||
| return Status::OK(); | |||
| } else { // not the last repeat. Acquire lock, sleeps master thread, wait for the wake-up from reset | |||
| RETURN_IF_NOT_OK( | |||
| io_block_queues_[(buf_cnt_++) % num_workers_]->Add(make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe))); | |||
| io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe))); | |||
| RETURN_IF_NOT_OK(wp_.Wait()); // Master thread goes to sleep after it has made all the IOBlocks | |||
| wp_.Clear(); | |||
| RETURN_IF_NOT_OK(sampler_->GetNextBuffer(&sampler_buffer)); | |||
| @@ -169,17 +169,17 @@ Status CifarOp::WorkerEntry(int32_t worker_id) { | |||
| RETURN_IF_NOT_OK(io_block_queues_[worker_id]->PopFront(&io_block)); | |||
| while (io_block != nullptr) { | |||
| if (io_block->eoe() == true) { | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE))); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE))); | |||
| buffer_id = worker_id; | |||
| } else if (io_block->eof() == true) { | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF))); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF))); | |||
| } else { | |||
| std::vector<int64_t> keys; | |||
| RETURN_IF_NOT_OK(io_block->GetKeys(&keys)); | |||
| if (keys.empty() == true) { | |||
| return Status::OK(); // empty key is a quit signal for workers | |||
| } | |||
| std::unique_ptr<DataBuffer> db = make_unique<DataBuffer>(buffer_id, DataBuffer::kDeBFlagNone); | |||
| std::unique_ptr<DataBuffer> db = std::make_unique<DataBuffer>(buffer_id, DataBuffer::kDeBFlagNone); | |||
| RETURN_IF_NOT_OK(LoadBuffer(keys, &db)); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::move(db))); | |||
| buffer_id += num_workers_; | |||
| @@ -213,7 +213,7 @@ Status CifarOp::LoadTensorRow(uint64_t index, TensorRow *trow) { | |||
| // Looping over LoadTensorRow to make 1 DataBuffer. 1 function call produces 1 buffer | |||
| Status CifarOp::LoadBuffer(const std::vector<int64_t> &keys, std::unique_ptr<DataBuffer> *db) { | |||
| std::unique_ptr<TensorQTable> deq = make_unique<TensorQTable>(); | |||
| std::unique_ptr<TensorQTable> deq = std::make_unique<TensorQTable>(); | |||
| for (const int64_t &key : keys) { | |||
| TensorRow trow; | |||
| RETURN_IF_NOT_OK(LoadTensorRow(key, &trow)); | |||
| @@ -173,9 +173,9 @@ Status GeneratorOp::operator()() { | |||
| bool eof = false; | |||
| while (!eof) { | |||
| // Create new buffer each iteration | |||
| fetched_buffer = mindspore::make_unique<DataBuffer>(buffer_id_++, DataBuffer::kDeBFlagNone); | |||
| fetched_buffer = std::make_unique<DataBuffer>(buffer_id_++, DataBuffer::kDeBFlagNone); | |||
| fetched_buffer->set_column_name_map(column_names_map_); | |||
| std::unique_ptr<TensorQTable> fetched_table = mindspore::make_unique<TensorQTable>(); | |||
| std::unique_ptr<TensorQTable> fetched_table = std::make_unique<TensorQTable>(); | |||
| bool eoe = false; | |||
| { | |||
| py::gil_scoped_acquire gil_acquire; | |||
| @@ -201,12 +201,12 @@ Status GeneratorOp::operator()() { | |||
| if (eoe) { | |||
| // Push out EOE upon StopIteration exception from generator | |||
| MS_LOG(INFO) << "Generator operator sends out EOE."; | |||
| std::unique_ptr<DataBuffer> eoe_buffer = mindspore::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE); | |||
| std::unique_ptr<DataBuffer> eoe_buffer = std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(0, std::move(eoe_buffer))); | |||
| if (!BitTest(op_ctrl_flags_, kDeOpRepeated) || BitTest(op_ctrl_flags_, kDeOpLastRepeat)) { | |||
| // If last repeat or not repeated, push out EOF and exit master loop | |||
| MS_LOG(INFO) << "Generator operator sends out EOF."; | |||
| std::unique_ptr<DataBuffer> eof_buffer = mindspore::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF); | |||
| std::unique_ptr<DataBuffer> eof_buffer = std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(0, std::move(eof_buffer))); | |||
| MS_LOG(INFO) << "Generator operator main execution loop complete."; | |||
| eof = true; | |||
| @@ -39,7 +39,7 @@ Status ImageFolderOp::Builder::Build(std::shared_ptr<ImageFolderOp> *ptr) { | |||
| if (builder_sampler_ == nullptr) { | |||
| builder_sampler_ = std::make_shared<SequentialSampler>(); | |||
| } | |||
| builder_schema_ = make_unique<DataSchema>(); | |||
| builder_schema_ = std::make_unique<DataSchema>(); | |||
| TensorShape scalar = TensorShape::CreateScalar(); | |||
| RETURN_IF_NOT_OK( | |||
| builder_schema_->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1))); | |||
| @@ -82,8 +82,8 @@ ImageFolderOp::ImageFolderOp(int32_t num_wkrs, int32_t rows_per_buffer, std::str | |||
| for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) { | |||
| col_name_map_[data_schema_->column(i).name()] = i; | |||
| } | |||
| folder_name_queue_ = make_unique<Queue<std::string>>(num_wkrs * queue_size); | |||
| image_name_queue_ = make_unique<Queue<FolderImagesPair>>(num_wkrs * queue_size); | |||
| folder_name_queue_ = std::make_unique<Queue<std::string>>(num_wkrs * queue_size); | |||
| image_name_queue_ = std::make_unique<Queue<FolderImagesPair>>(num_wkrs * queue_size); | |||
| io_block_queues_.Init(num_workers_, queue_size); | |||
| } | |||
| @@ -143,7 +143,7 @@ Status ImageFolderOp::operator()() { | |||
| row_cnt_++; | |||
| if (row_cnt_ % rows_per_buffer_ == 0) { | |||
| RETURN_IF_NOT_OK( | |||
| io_block_queues_[buf_cnt_++ % num_workers_]->Add(make_unique<IOBlock>(keys, IOBlock::kDeIoBlockNone))); | |||
| io_block_queues_[buf_cnt_++ % num_workers_]->Add(std::make_unique<IOBlock>(keys, IOBlock::kDeIoBlockNone))); | |||
| keys.clear(); | |||
| } | |||
| } | |||
| @@ -151,21 +151,21 @@ Status ImageFolderOp::operator()() { | |||
| } | |||
| if (keys.empty() == false) { | |||
| RETURN_IF_NOT_OK( | |||
| io_block_queues_[(buf_cnt_++) % num_workers_]->Add(make_unique<IOBlock>(keys, IOBlock::kDeIoBlockNone))); | |||
| io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique<IOBlock>(keys, IOBlock::kDeIoBlockNone))); | |||
| } | |||
| if (!BitTest(op_ctrl_flags_, kDeOpRepeated) || BitTest(op_ctrl_flags_, kDeOpLastRepeat)) { | |||
| std::unique_ptr<IOBlock> eoe_block = make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe); | |||
| std::unique_ptr<IOBlock> eof_block = make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEof); | |||
| std::unique_ptr<IOBlock> eoe_block = std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe); | |||
| std::unique_ptr<IOBlock> eof_block = std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEof); | |||
| RETURN_IF_NOT_OK(io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::move(eoe_block))); | |||
| RETURN_IF_NOT_OK(io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::move(eof_block))); | |||
| for (int32_t i = 0; i < num_workers_; ++i) { | |||
| RETURN_IF_NOT_OK( | |||
| io_block_queues_[i]->Add(make_unique<IOBlock>(std::vector<int64_t>(), IOBlock::kDeIoBlockNone))); | |||
| io_block_queues_[i]->Add(std::make_unique<IOBlock>(std::vector<int64_t>(), IOBlock::kDeIoBlockNone))); | |||
| } | |||
| return Status::OK(); | |||
| } else { // not the last repeat. Sleep master thread, wait for the wake-up from reset | |||
| RETURN_IF_NOT_OK( | |||
| io_block_queues_[(buf_cnt_++) % num_workers_]->Add(make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe))); | |||
| io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe))); | |||
| RETURN_IF_NOT_OK(wp_.Wait()); // Master thread goes to sleep after it has made all the IOBlocks | |||
| wp_.Clear(); | |||
| RETURN_IF_NOT_OK(sampler_->GetNextBuffer(&sampler_buffer)); | |||
| @@ -182,15 +182,15 @@ Status ImageFolderOp::WorkerEntry(int32_t worker_id) { | |||
| RETURN_IF_NOT_OK(io_block_queues_[worker_id]->PopFront(&io_block)); | |||
| while (io_block != nullptr) { | |||
| if (io_block->eoe() == true) { | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE))); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE))); | |||
| buffer_id = worker_id; | |||
| } else if (io_block->eof() == true) { | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF))); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF))); | |||
| } else { | |||
| std::vector<int64_t> keys; | |||
| RETURN_IF_NOT_OK(io_block->GetKeys(&keys)); | |||
| if (keys.empty() == true) return Status::OK(); // empty key is a quit signal for workers | |||
| std::unique_ptr<DataBuffer> db = make_unique<DataBuffer>(buffer_id, DataBuffer::kDeBFlagNone); | |||
| std::unique_ptr<DataBuffer> db = std::make_unique<DataBuffer>(buffer_id, DataBuffer::kDeBFlagNone); | |||
| RETURN_IF_NOT_OK(LoadBuffer(keys, &db)); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::move(db))); | |||
| buffer_id += num_workers_; | |||
| @@ -231,7 +231,7 @@ Status ImageFolderOp::LoadTensorRow(ImageLabelPair pairPtr, TensorRow *trow) { | |||
| // Looping over LoadTensorRow to make 1 DataBuffer. 1 function call produces 1 buffer | |||
| Status ImageFolderOp::LoadBuffer(const std::vector<int64_t> &keys, std::unique_ptr<DataBuffer> *db) { | |||
| std::unique_ptr<TensorQTable> deq = make_unique<TensorQTable>(); | |||
| std::unique_ptr<TensorQTable> deq = std::make_unique<TensorQTable>(); | |||
| TensorRow trow; | |||
| for (const int64_t &key : keys) { | |||
| RETURN_IF_NOT_OK(this->LoadTensorRow(image_label_pairs_[key], &trow)); | |||
| @@ -40,7 +40,7 @@ Status ManifestOp::Builder::Build(std::shared_ptr<ManifestOp> *ptr) { | |||
| if (builder_sampler_ == nullptr) { | |||
| builder_sampler_ = std::make_shared<SequentialSampler>(); | |||
| } | |||
| builder_schema_ = make_unique<DataSchema>(); | |||
| builder_schema_ = std::make_unique<DataSchema>(); | |||
| RETURN_IF_NOT_OK( | |||
| builder_schema_->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1))); | |||
| RETURN_IF_NOT_OK( | |||
| @@ -105,7 +105,7 @@ Status ManifestOp::AddIoBlock(std::unique_ptr<DataBuffer> *sampler_buffer) { | |||
| row_cnt_++; | |||
| if (row_cnt_ % rows_per_buffer_ == 0) { | |||
| RETURN_IF_NOT_OK(io_block_queues_[buf_cnt_++ % num_workers_]->Add( | |||
| make_unique<IOBlock>(IOBlock(keys, IOBlock::kDeIoBlockNone)))); | |||
| std::make_unique<IOBlock>(IOBlock(keys, IOBlock::kDeIoBlockNone)))); | |||
| keys.clear(); | |||
| } | |||
| } | |||
| @@ -113,21 +113,21 @@ Status ManifestOp::AddIoBlock(std::unique_ptr<DataBuffer> *sampler_buffer) { | |||
| } | |||
| if (keys.empty() == false) { | |||
| RETURN_IF_NOT_OK(io_block_queues_[(buf_cnt_++) % num_workers_]->Add( | |||
| make_unique<IOBlock>(IOBlock(keys, IOBlock::kDeIoBlockNone)))); | |||
| std::make_unique<IOBlock>(IOBlock(keys, IOBlock::kDeIoBlockNone)))); | |||
| } | |||
| if (!BitTest(op_ctrl_flags_, kDeOpRepeated) || BitTest(op_ctrl_flags_, kDeOpLastRepeat)) { | |||
| RETURN_IF_NOT_OK( | |||
| io_block_queues_[(buf_cnt_++) % num_workers_]->Add(make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe))); | |||
| io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe))); | |||
| RETURN_IF_NOT_OK( | |||
| io_block_queues_[(buf_cnt_++) % num_workers_]->Add(make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEof))); | |||
| io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEof))); | |||
| for (int32_t i = 0; i < num_workers_; i++) { | |||
| RETURN_IF_NOT_OK( | |||
| io_block_queues_[i]->Add(make_unique<IOBlock>(std::vector<int64_t>(), IOBlock::kDeIoBlockNone))); | |||
| io_block_queues_[i]->Add(std::make_unique<IOBlock>(std::vector<int64_t>(), IOBlock::kDeIoBlockNone))); | |||
| } | |||
| return Status::OK(); | |||
| } else { | |||
| RETURN_IF_NOT_OK( | |||
| io_block_queues_[(buf_cnt_++) % num_workers_]->Add(make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe))); | |||
| io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe))); | |||
| RETURN_IF_NOT_OK(wp_.Wait()); // Master thread goes to sleep after it has made all the IOBlocks | |||
| wp_.Clear(); | |||
| RETURN_IF_NOT_OK(sampler_->GetNextBuffer(sampler_buffer)); | |||
| @@ -160,17 +160,17 @@ Status ManifestOp::WorkerEntry(int32_t worker_id) { | |||
| RETURN_IF_NOT_OK(io_block_queues_[worker_id]->PopFront(&io_block)); | |||
| while (io_block != nullptr) { | |||
| if (io_block->eoe() == true) { | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE))); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE))); | |||
| buffer_id = worker_id; | |||
| } else if (io_block->eof() == true) { | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF))); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF))); | |||
| } else { | |||
| std::vector<int64_t> keys; | |||
| RETURN_IF_NOT_OK(io_block->GetKeys(&keys)); | |||
| if (keys.empty()) { | |||
| return Status::OK(); // empty key is a quit signal for workers | |||
| } | |||
| std::unique_ptr<DataBuffer> db = make_unique<DataBuffer>(buffer_id, DataBuffer::kDeBFlagNone); | |||
| std::unique_ptr<DataBuffer> db = std::make_unique<DataBuffer>(buffer_id, DataBuffer::kDeBFlagNone); | |||
| RETURN_IF_NOT_OK(LoadBuffer(keys, &db)); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::move(db))); | |||
| buffer_id += num_workers_; | |||
| @@ -227,7 +227,7 @@ Status ManifestOp::LoadTensorRow(const std::pair<std::string, std::vector<std::s | |||
| // Looping over LoadTensorRow to make 1 DataBuffer. 1 function call produces 1 buffer | |||
| Status ManifestOp::LoadBuffer(const std::vector<int64_t> &keys, std::unique_ptr<DataBuffer> *db) { | |||
| std::unique_ptr<TensorQTable> deq = make_unique<TensorQTable>(); | |||
| std::unique_ptr<TensorQTable> deq = std::make_unique<TensorQTable>(); | |||
| for (const auto &key : keys) { | |||
| TensorRow trow; | |||
| RETURN_IF_NOT_OK(LoadTensorRow(image_labelname_[static_cast<size_t>(key)], &trow)); | |||
| @@ -13,8 +13,6 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifdef ENABLE_MINDRECORD | |||
| #include "dataset/engine/datasetops/source/mindrecord_op.h" | |||
| #include <algorithm> | |||
| @@ -30,7 +28,6 @@ | |||
| #include "dataset/engine/datasetops/dataset_op.h" | |||
| #include "dataset/engine/db_connector.h" | |||
| #include "dataset/engine/execution_tree.h" | |||
| #include "dataset/util/make_unique.h" | |||
| #include "utils/log_adapter.h" | |||
| namespace mindspore { | |||
| @@ -96,18 +93,19 @@ MindRecordOp::MindRecordOp(int32_t num_mind_record_workers, int32_t rows_per_buf | |||
| io_blk_queues_.Init(num_workers_, op_connector_queue_size); | |||
| if (!block_reader_) return; | |||
| for (int32_t i = 0; i < num_workers_; ++i) { | |||
| block_buffer_.emplace_back(make_unique<std::vector<ShardTuple>>(std::vector<ShardTuple>{})); | |||
| block_buffer_.emplace_back(std::make_unique<std::vector<ShardTuple>>(std::vector<ShardTuple>{})); | |||
| } | |||
| } | |||
| // Private helper method to encapsulate some common construction/reset tasks | |||
| Status MindRecordOp::Init() { | |||
| shard_reader_ = mindspore::make_unique<ShardReader>(); | |||
| shard_reader_ = std::make_unique<ShardReader>(); | |||
| auto rc = shard_reader_->Open(dataset_file_, num_mind_record_workers_, columns_to_load_, operators_, block_reader_); | |||
| CHECK_FAIL_RETURN_UNEXPECTED(rc != MSRStatus::FAILED, "MindRecordOp init failed."); | |||
| CHECK_FAIL_RETURN_UNEXPECTED(rc != MSRStatus::FAILED, | |||
| "MindRecordOp init failed. Error message: " + ErrnoToMessage(rc)); | |||
| data_schema_ = mindspore::make_unique<DataSchema>(); | |||
| data_schema_ = std::make_unique<DataSchema>(); | |||
| std::vector<std::shared_ptr<Schema>> schema_vec = shard_reader_->get_shard_header()->get_schemas(); | |||
| // check whether schema exists, if so use the first one | |||
| @@ -144,7 +142,7 @@ Status MindRecordOp::Init() { | |||
| } | |||
| if (!load_all_cols) { | |||
| std::unique_ptr<DataSchema> tmp_schema = make_unique<DataSchema>(); | |||
| std::unique_ptr<DataSchema> tmp_schema = std::make_unique<DataSchema>(); | |||
| for (std::string colname : columns_to_load_) { | |||
| CHECK_FAIL_RETURN_UNEXPECTED(colname_to_ind.find(colname) != colname_to_ind.end(), colname + ": doesn't exist"); | |||
| RETURN_IF_NOT_OK(tmp_schema->AddColumn(data_schema_->column(colname_to_ind[colname]))); | |||
| @@ -298,7 +296,7 @@ Status MindRecordOp::LoadFloat(TensorShape *new_shape, std::unique_ptr<T[]> *arr | |||
| RETURN_IF_NOT_OK(GetFloat(&value, columns_json[column_name], use_double)); | |||
| *new_shape = TensorShape::CreateScalar(); | |||
| *array_data = mindspore::make_unique<T[]>(1); | |||
| *array_data = std::make_unique<T[]>(1); | |||
| (*array_data)[0] = value; | |||
| } else { | |||
| if (column.hasShape()) { | |||
| @@ -309,7 +307,7 @@ Status MindRecordOp::LoadFloat(TensorShape *new_shape, std::unique_ptr<T[]> *arr | |||
| } | |||
| int idx = 0; | |||
| *array_data = mindspore::make_unique<T[]>(new_shape->NumOfElements()); | |||
| *array_data = std::make_unique<T[]>(new_shape->NumOfElements()); | |||
| for (auto &element : columns_json[column_name]) { | |||
| T value = 0; | |||
| RETURN_IF_NOT_OK(GetFloat(&value, element, use_double)); | |||
| @@ -350,7 +348,7 @@ Status MindRecordOp::LoadInt(TensorShape *new_shape, std::unique_ptr<T[]> *array | |||
| RETURN_IF_NOT_OK(GetInt(&value, columns_json[column_name])); | |||
| *new_shape = TensorShape::CreateScalar(); | |||
| *array_data = mindspore::make_unique<T[]>(1); | |||
| *array_data = std::make_unique<T[]>(1); | |||
| (*array_data)[0] = value; | |||
| } else { | |||
| if (column.hasShape()) { | |||
| @@ -361,7 +359,7 @@ Status MindRecordOp::LoadInt(TensorShape *new_shape, std::unique_ptr<T[]> *array | |||
| } | |||
| int idx = 0; | |||
| *array_data = mindspore::make_unique<T[]>(new_shape->NumOfElements()); | |||
| *array_data = std::make_unique<T[]>(new_shape->NumOfElements()); | |||
| for (auto &element : columns_json[column_name]) { | |||
| T value = 0; | |||
| RETURN_IF_NOT_OK(GetInt(&value, element)); | |||
| @@ -431,12 +429,14 @@ Status MindRecordOp::WorkerEntry(int32_t worker_id) { | |||
| RETURN_IF_NOT_OK(io_blk_queues_[worker_id]->PopFront(&io_block)); | |||
| while (io_block != nullptr) { | |||
| if (io_block->eoe() == true) { | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::move(make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE)))); | |||
| RETURN_IF_NOT_OK( | |||
| out_connector_->Add(worker_id, std::move(std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE)))); | |||
| RETURN_IF_NOT_OK(io_blk_queues_[worker_id]->PopFront(&io_block)); | |||
| continue; | |||
| } | |||
| if (io_block->eof() == true) { | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::move(make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF)))); | |||
| RETURN_IF_NOT_OK( | |||
| out_connector_->Add(worker_id, std::move(std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF)))); | |||
| RETURN_IF_NOT_OK(io_blk_queues_[worker_id]->PopFront(&io_block)); | |||
| continue; | |||
| } | |||
| @@ -486,9 +486,9 @@ Status MindRecordOp::WorkerEntry(int32_t worker_id) { | |||
| Status MindRecordOp::GetBufferFromReader(std::unique_ptr<DataBuffer> *fetched_buffer, int64_t buffer_id, | |||
| int32_t worker_id) { | |||
| *fetched_buffer = mindspore::make_unique<DataBuffer>(buffer_id, DataBuffer::kDeBFlagNone); | |||
| *fetched_buffer = std::make_unique<DataBuffer>(buffer_id, DataBuffer::kDeBFlagNone); | |||
| (*fetched_buffer)->set_column_name_map(column_name_mapping_); | |||
| std::unique_ptr<TensorQTable> tensor_table = mindspore::make_unique<TensorQTable>(); | |||
| std::unique_ptr<TensorQTable> tensor_table = std::make_unique<TensorQTable>(); | |||
| for (int32_t i = 0; i < rows_per_buffer_; ++i) { | |||
| ShardTuple tupled_buffer; | |||
| if (block_reader_) { | |||
| @@ -597,22 +597,22 @@ Status MindRecordOp::operator()() { | |||
| for (int32_t i = 0; i < buffers_needed_; ++i) { | |||
| if (block_reader_) RETURN_IF_NOT_OK(FetchBlockBuffer(i)); | |||
| std::vector<int64_t> keys(1, i); | |||
| RETURN_IF_NOT_OK( | |||
| io_blk_queues_[buf_cnt_++ % num_workers_]->Add(make_unique<IOBlock>(IOBlock(keys, IOBlock::kDeIoBlockNone)))); | |||
| RETURN_IF_NOT_OK(io_blk_queues_[buf_cnt_++ % num_workers_]->Add( | |||
| std::make_unique<IOBlock>(IOBlock(keys, IOBlock::kDeIoBlockNone)))); | |||
| } | |||
| if (!BitTest(op_ctrl_flags_, kDeOpRepeated) || BitTest(op_ctrl_flags_, kDeOpLastRepeat)) { | |||
| RETURN_IF_NOT_OK( | |||
| io_blk_queues_[(buf_cnt_++) % num_workers_]->Add(make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe))); | |||
| io_blk_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe))); | |||
| RETURN_IF_NOT_OK( | |||
| io_blk_queues_[(buf_cnt_++) % num_workers_]->Add(make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEof))); | |||
| io_blk_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEof))); | |||
| for (int32_t i = 0; i < num_workers_; i++) { | |||
| RETURN_IF_NOT_OK( | |||
| io_blk_queues_[i]->Add(std::move(make_unique<IOBlock>(std::vector<int64_t>(), IOBlock::kDeIoBlockNone)))); | |||
| RETURN_IF_NOT_OK(io_blk_queues_[i]->Add( | |||
| std::move(std::make_unique<IOBlock>(std::vector<int64_t>(), IOBlock::kDeIoBlockNone)))); | |||
| } | |||
| return Status::OK(); | |||
| } else { // not the last repeat. Acquire lock, sleeps master thread, wait for the wake-up from reset | |||
| RETURN_IF_NOT_OK( | |||
| io_blk_queues_[(buf_cnt_++) % num_workers_]->Add(make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe))); | |||
| io_blk_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe))); | |||
| // reset our buffer count and go to loop again. | |||
| RETURN_IF_NOT_OK(shard_reader_wait_post_.Wait()); | |||
| @@ -656,7 +656,7 @@ Status MindRecordOp::LaunchThreadAndInitOp() { | |||
| } | |||
| Status MindRecordOp::CountTotalRows(const std::string dataset_path, int64_t *count) { | |||
| std::unique_ptr<ShardReader> shard_reader = mindspore::make_unique<ShardReader>(); | |||
| std::unique_ptr<ShardReader> shard_reader = std::make_unique<ShardReader>(); | |||
| MSRStatus rc = shard_reader->CountTotalRows(dataset_path, count); | |||
| if (rc == MSRStatus::FAILED) { | |||
| RETURN_STATUS_UNEXPECTED("MindRecordOp count total rows failed."); | |||
| @@ -665,4 +665,3 @@ Status MindRecordOp::CountTotalRows(const std::string dataset_path, int64_t *cou | |||
| } | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| #endif | |||
| @@ -15,7 +15,6 @@ | |||
| */ | |||
| #ifndef DATASET_ENGINE_DATASETOPS_SOURCE_MINDRECORD_OP_H_ | |||
| #define DATASET_ENGINE_DATASETOPS_SOURCE_MINDRECORD_OP_H_ | |||
| #ifdef ENABLE_MINDRECORD | |||
| #pragma once | |||
| #include <cstdint> | |||
| @@ -33,6 +32,7 @@ | |||
| #include "dataset/engine/datasetops/source/io_block.h" | |||
| #include "dataset/util/queue.h" | |||
| #include "dataset/util/status.h" | |||
| #include "mindrecord/include/shard_error.h" | |||
| #include "mindrecord/include/shard_reader.h" | |||
| #include "mindrecord/include/common/shard_utils.h" | |||
| #include "dataset/util/wait_post.h" | |||
| @@ -276,5 +276,4 @@ class MindRecordOp : public ParallelOp { | |||
| }; | |||
| } // namespace dataset | |||
| } // namespace mindspore | |||
| #endif | |||
| #endif // DATASET_ENGINE_DATASETOPS_SOURCE_MINDRECORD_OP_H_ | |||
| @@ -43,7 +43,7 @@ Status MnistOp::Builder::Build(std::shared_ptr<MnistOp> *ptr) { | |||
| if (builder_sampler_ == nullptr) { | |||
| builder_sampler_ = std::make_shared<SequentialSampler>(); | |||
| } | |||
| builder_schema_ = make_unique<DataSchema>(); | |||
| builder_schema_ = std::make_unique<DataSchema>(); | |||
| RETURN_IF_NOT_OK( | |||
| builder_schema_->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1))); | |||
| TensorShape scalar = TensorShape::CreateScalar(); | |||
| @@ -89,7 +89,7 @@ Status MnistOp::TraversalSampleIds(const std::shared_ptr<Tensor> &sample_ids, st | |||
| row_cnt_++; | |||
| if (row_cnt_ % rows_per_buffer_ == 0) { | |||
| RETURN_IF_NOT_OK(io_block_queues_[buf_cnt_++ % num_workers_]->Add( | |||
| make_unique<IOBlock>(IOBlock(*keys, IOBlock::kDeIoBlockNone)))); | |||
| std::make_unique<IOBlock>(IOBlock(*keys, IOBlock::kDeIoBlockNone)))); | |||
| keys->clear(); | |||
| } | |||
| } | |||
| @@ -115,21 +115,21 @@ Status MnistOp::operator()() { | |||
| } | |||
| if (keys.empty() == false) { | |||
| RETURN_IF_NOT_OK(io_block_queues_[(buf_cnt_++) % num_workers_]->Add( | |||
| make_unique<IOBlock>(IOBlock(keys, IOBlock::kDeIoBlockNone)))); | |||
| std::make_unique<IOBlock>(IOBlock(keys, IOBlock::kDeIoBlockNone)))); | |||
| } | |||
| if (!BitTest(op_ctrl_flags_, kDeOpRepeated) || BitTest(op_ctrl_flags_, kDeOpLastRepeat)) { | |||
| RETURN_IF_NOT_OK( | |||
| io_block_queues_[(buf_cnt_++) % num_workers_]->Add(make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe))); | |||
| io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe))); | |||
| RETURN_IF_NOT_OK( | |||
| io_block_queues_[(buf_cnt_++) % num_workers_]->Add(make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEof))); | |||
| io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEof))); | |||
| for (int32_t i = 0; i < num_workers_; ++i) { | |||
| RETURN_IF_NOT_OK( | |||
| io_block_queues_[i]->Add(make_unique<IOBlock>(std::vector<int64_t>(), IOBlock::kDeIoBlockNone))); | |||
| io_block_queues_[i]->Add(std::make_unique<IOBlock>(std::vector<int64_t>(), IOBlock::kDeIoBlockNone))); | |||
| } | |||
| return Status::OK(); | |||
| } else { | |||
| RETURN_IF_NOT_OK( | |||
| io_block_queues_[(buf_cnt_++) % num_workers_]->Add(make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe))); | |||
| io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe))); | |||
| RETURN_IF_NOT_OK(wp_.Wait()); // Master thread goes to sleep after it has made all the IOBlocks | |||
| wp_.Clear(); | |||
| RETURN_IF_NOT_OK(sampler_->GetNextBuffer(&sampler_buffer)); | |||
| @@ -145,15 +145,15 @@ Status MnistOp::WorkerEntry(int32_t worker_id) { | |||
| RETURN_IF_NOT_OK(io_block_queues_[worker_id]->PopFront(&iOBlock)); | |||
| while (iOBlock != nullptr) { | |||
| if (iOBlock->eoe() == true) { | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE))); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE))); | |||
| buffer_id = worker_id; | |||
| } else if (iOBlock->eof() == true) { | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF))); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF))); | |||
| } else { | |||
| std::vector<int64_t> keys; | |||
| RETURN_IF_NOT_OK(iOBlock->GetKeys(&keys)); | |||
| if (keys.empty() == true) return Status::OK(); // empty key is a quit signal for workers | |||
| std::unique_ptr<DataBuffer> db = make_unique<DataBuffer>(buffer_id, DataBuffer::kDeBFlagNone); | |||
| std::unique_ptr<DataBuffer> db = std::make_unique<DataBuffer>(buffer_id, DataBuffer::kDeBFlagNone); | |||
| RETURN_IF_NOT_OK(LoadBuffer(keys, &db)); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::move(db))); | |||
| buffer_id += num_workers_; | |||
| @@ -178,7 +178,7 @@ Status MnistOp::LoadTensorRow(const MnistLabelPair &mnist_pair, TensorRow *trow) | |||
| // Looping over LoadTensorRow to make 1 DataBuffer. 1 function call produces 1 buffer | |||
| Status MnistOp::LoadBuffer(const std::vector<int64_t> &keys, std::unique_ptr<DataBuffer> *db) { | |||
| std::unique_ptr<TensorQTable> deq = make_unique<TensorQTable>(); | |||
| std::unique_ptr<TensorQTable> deq = std::make_unique<TensorQTable>(); | |||
| TensorRow trow; | |||
| for (const int64_t &key : keys) { | |||
| RETURN_IF_NOT_OK(this->LoadTensorRow(image_label_pairs_[key], &trow)); | |||
| @@ -309,8 +309,8 @@ Status MnistOp::ReadImageAndLabel(std::ifstream *image_reader, std::ifstream *la | |||
| CHECK_FAIL_RETURN_UNEXPECTED((num_images == num_labels), "num_images != num_labels"); | |||
| // The image size of the Mnist dataset is fixed at [28,28] | |||
| int64_t size = kMnistImageRows * kMnistImageCols; | |||
| auto images_buf = mindspore::make_unique<char[]>(size * num_images); | |||
| auto labels_buf = mindspore::make_unique<char[]>(num_images); | |||
| auto images_buf = std::make_unique<char[]>(size * num_images); | |||
| auto labels_buf = std::make_unique<char[]>(num_images); | |||
| if (images_buf == nullptr || labels_buf == nullptr) { | |||
| std::string err_msg = "Fail to allocate memory for MNIST Buffer."; | |||
| MS_LOG(ERROR) << err_msg.c_str(); | |||
| @@ -52,9 +52,9 @@ Status DistributedSampler::GetNextBuffer(std::unique_ptr<DataBuffer> *out_buffer | |||
| if (cnt_ > samples_per_buffer_) { | |||
| RETURN_STATUS_UNEXPECTED("Distributed Sampler Error"); | |||
| } else if (cnt_ == samples_per_buffer_) { | |||
| (*out_buffer) = mindspore::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE); | |||
| (*out_buffer) = std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE); | |||
| } else { | |||
| (*out_buffer) = mindspore::make_unique<DataBuffer>(cnt_, DataBuffer::kDeBFlagNone); | |||
| (*out_buffer) = std::make_unique<DataBuffer>(cnt_, DataBuffer::kDeBFlagNone); | |||
| std::shared_ptr<Tensor> sample_ids; | |||
| RETURN_IF_NOT_OK(CreateSamplerTensor(&sample_ids, samples_per_buffer_)); | |||
| int64_t *id_ptr = reinterpret_cast<int64_t *>(sample_ids->StartAddr()); | |||
| @@ -63,7 +63,7 @@ Status DistributedSampler::GetNextBuffer(std::unique_ptr<DataBuffer> *out_buffer | |||
| *(id_ptr++) = shuffle_ ? shuffle_vec_[static_cast<size_t>(next_id)] : next_id; | |||
| } | |||
| TensorRow row(1, sample_ids); | |||
| (*out_buffer)->set_tensor_table(make_unique<TensorQTable>(1, row)); | |||
| (*out_buffer)->set_tensor_table(std::make_unique<TensorQTable>(1, row)); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| @@ -53,9 +53,9 @@ Status PKSampler::GetNextBuffer(std::unique_ptr<DataBuffer> *out_buffer) { | |||
| if (next_id_ > num_pk_samples_ || num_pk_samples_ == 0) { | |||
| RETURN_STATUS_UNEXPECTED("Index out of bound in PKSampler"); | |||
| } else if (next_id_ == num_pk_samples_) { | |||
| (*out_buffer) = mindspore::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE); | |||
| (*out_buffer) = std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE); | |||
| } else { | |||
| (*out_buffer) = mindspore::make_unique<DataBuffer>(next_id_, DataBuffer::kDeBFlagNone); | |||
| (*out_buffer) = std::make_unique<DataBuffer>(next_id_, DataBuffer::kDeBFlagNone); | |||
| std::shared_ptr<Tensor> sample_ids; | |||
| int64_t last_id = | |||
| (samples_per_buffer_ + next_id_ > num_pk_samples_) ? num_pk_samples_ : samples_per_buffer_ + next_id_; | |||
| @@ -68,7 +68,7 @@ Status PKSampler::GetNextBuffer(std::unique_ptr<DataBuffer> *out_buffer) { | |||
| *(id_ptr++) = samples[rnd_ind]; | |||
| } | |||
| TensorRow row(1, sample_ids); | |||
| (*out_buffer)->set_tensor_table(make_unique<TensorQTable>(1, row)); | |||
| (*out_buffer)->set_tensor_table(std::make_unique<TensorQTable>(1, row)); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| @@ -32,9 +32,9 @@ Status RandomSampler::GetNextBuffer(std::unique_ptr<DataBuffer> *out_buffer) { | |||
| if (next_id_ > num_samples_) { | |||
| RETURN_STATUS_UNEXPECTED("RandomSampler Internal Error"); | |||
| } else if (next_id_ == num_samples_) { | |||
| (*out_buffer) = make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE); | |||
| (*out_buffer) = std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE); | |||
| } else { | |||
| (*out_buffer) = make_unique<DataBuffer>(next_id_, DataBuffer::kDeBFlagNone); | |||
| (*out_buffer) = std::make_unique<DataBuffer>(next_id_, DataBuffer::kDeBFlagNone); | |||
| std::shared_ptr<Tensor> sampleIds; | |||
| int64_t last_id = samples_per_buffer_ + next_id_ > num_samples_ ? num_samples_ : samples_per_buffer_ + next_id_; | |||
| RETURN_IF_NOT_OK(CreateSamplerTensor(&sampleIds, last_id - next_id_)); | |||
| @@ -44,7 +44,7 @@ Status RandomSampler::GetNextBuffer(std::unique_ptr<DataBuffer> *out_buffer) { | |||
| } | |||
| next_id_ = last_id; | |||
| TensorRow row(1, sampleIds); | |||
| (*out_buffer)->set_tensor_table(make_unique<TensorQTable>(1, row)); | |||
| (*out_buffer)->set_tensor_table(std::make_unique<TensorQTable>(1, row)); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| @@ -61,7 +61,7 @@ Status RandomSampler::Init(const RandomAccessOp *op) { | |||
| } | |||
| std::shuffle(shuffled_ids_.begin(), shuffled_ids_.end(), rnd_); | |||
| } else { | |||
| dist = make_unique<std::uniform_int_distribution<int64_t>>(0, num_rows_ - 1); | |||
| dist = std::make_unique<std::uniform_int_distribution<int64_t>>(0, num_rows_ - 1); | |||
| } | |||
| rnd_.seed(seed_++); | |||
| return Status::OK(); | |||
| @@ -35,7 +35,7 @@ Status Sampler::CreateSamplerTensor(std::shared_ptr<Tensor> *sample_ids, int64_t | |||
| } | |||
| if (col_desc_ == nullptr) { | |||
| // a ColDescriptor for Tensor that holds SampleIds | |||
| col_desc_ = make_unique<ColDescriptor>("sampleIds", DataType(DataType::DE_INT64), TensorImpl::kFlexible, 1); | |||
| col_desc_ = std::make_unique<ColDescriptor>("sampleIds", DataType(DataType::DE_INT64), TensorImpl::kFlexible, 1); | |||
| } | |||
| TensorShape shape(std::vector<dsize_t>(1, num_elements)); | |||
| RETURN_IF_NOT_OK(Tensor::CreateTensor(sample_ids, col_desc_->tensorImpl(), shape, col_desc_->type())); | |||
| @@ -27,7 +27,6 @@ | |||
| #include "dataset/engine/data_buffer.h" | |||
| #include "dataset/engine/data_schema.h" | |||
| #include "dataset/engine/datasetops/dataset_op.h" | |||
| #include "dataset/util/make_unique.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| @@ -25,9 +25,9 @@ Status SequentialSampler::GetNextBuffer(std::unique_ptr<DataBuffer> *out_buffer) | |||
| if (next_id_ > num_samples_) { | |||
| RETURN_STATUS_UNEXPECTED("Sequential Sampler Internal Error"); | |||
| } else if (next_id_ == num_samples_) { | |||
| (*out_buffer) = make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE); | |||
| (*out_buffer) = std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE); | |||
| } else { | |||
| (*out_buffer) = make_unique<DataBuffer>(next_id_, DataBuffer::kDeBFlagNone); | |||
| (*out_buffer) = std::make_unique<DataBuffer>(next_id_, DataBuffer::kDeBFlagNone); | |||
| std::shared_ptr<Tensor> sampleIds; | |||
| int64_t lastId = (samples_per_buffer_ + next_id_ > num_samples_) ? num_samples_ : samples_per_buffer_ + next_id_; | |||
| RETURN_IF_NOT_OK(CreateSamplerTensor(&sampleIds, lastId - next_id_)); | |||
| @@ -36,7 +36,7 @@ Status SequentialSampler::GetNextBuffer(std::unique_ptr<DataBuffer> *out_buffer) | |||
| *(idPtr++) = next_id_++; | |||
| } | |||
| TensorRow row(1, sampleIds); | |||
| (*out_buffer)->set_tensor_table(make_unique<TensorQTable>(1, row)); | |||
| (*out_buffer)->set_tensor_table(std::make_unique<TensorQTable>(1, row)); | |||
| } | |||
| return Status::OK(); | |||
| } | |||
| @@ -64,9 +64,9 @@ Status SubsetRandomSampler::Reset() { | |||
| Status SubsetRandomSampler::GetNextBuffer(std::unique_ptr<DataBuffer> *out_buffer) { | |||
| // All samples have been drawn | |||
| if (sample_id_ == indices_.size()) { | |||
| (*out_buffer) = make_unique<DataBuffer>(buffer_id_++, DataBuffer::kDeBFlagEOE); | |||
| (*out_buffer) = std::make_unique<DataBuffer>(buffer_id_++, DataBuffer::kDeBFlagEOE); | |||
| } else { | |||
| (*out_buffer) = make_unique<DataBuffer>(buffer_id_++, DataBuffer::kDeBFlagNone); | |||
| (*out_buffer) = std::make_unique<DataBuffer>(buffer_id_++, DataBuffer::kDeBFlagNone); | |||
| std::shared_ptr<Tensor> outputIds; | |||
| int64_t last_id = sample_id_ + samples_per_buffer_; | |||
| @@ -92,7 +92,7 @@ Status SubsetRandomSampler::GetNextBuffer(std::unique_ptr<DataBuffer> *out_buffe | |||
| } | |||
| // Create a TensorTable from that single tensor and push into DataBuffer | |||
| (*out_buffer)->set_tensor_table(make_unique<TensorQTable>(1, TensorRow(1, outputIds))); | |||
| (*out_buffer)->set_tensor_table(std::make_unique<TensorQTable>(1, TensorRow(1, outputIds))); | |||
| } | |||
| return Status::OK(); | |||
| @@ -46,10 +46,10 @@ Status WeightedRandomSampler::Init(const RandomAccessOp *op) { | |||
| CHECK_FAIL_RETURN_UNEXPECTED(num_samples_ > 0 && samples_per_buffer_ > 0, "Fail to init WeightedRandomSampler"); | |||
| if (!replacement_) { | |||
| exp_dist_ = mindspore::make_unique<std::exponential_distribution<>>(1); | |||
| exp_dist_ = std::make_unique<std::exponential_distribution<>>(1); | |||
| InitOnePassSampling(); | |||
| } else { | |||
| discrete_dist_ = mindspore::make_unique<std::discrete_distribution<int64_t>>(weights_.begin(), weights_.end()); | |||
| discrete_dist_ = std::make_unique<std::discrete_distribution<int64_t>>(weights_.begin(), weights_.end()); | |||
| } | |||
| return Status::OK(); | |||
| @@ -96,9 +96,9 @@ Status WeightedRandomSampler::GetNextBuffer(std::unique_ptr<DataBuffer> *out_buf | |||
| } | |||
| if (sample_id_ == num_samples_) { | |||
| (*out_buffer) = make_unique<DataBuffer>(buffer_id_++, DataBuffer::kDeBFlagEOE); | |||
| (*out_buffer) = std::make_unique<DataBuffer>(buffer_id_++, DataBuffer::kDeBFlagEOE); | |||
| } else { | |||
| (*out_buffer) = make_unique<DataBuffer>(buffer_id_++, DataBuffer::kDeBFlagNone); | |||
| (*out_buffer) = std::make_unique<DataBuffer>(buffer_id_++, DataBuffer::kDeBFlagNone); | |||
| std::shared_ptr<Tensor> outputIds; | |||
| int64_t last_id = sample_id_ + samples_per_buffer_; | |||
| @@ -132,7 +132,7 @@ Status WeightedRandomSampler::GetNextBuffer(std::unique_ptr<DataBuffer> *out_buf | |||
| } | |||
| // Create a TensorTable from that single tensor and push into DataBuffer | |||
| (*out_buffer)->set_tensor_table(make_unique<TensorQTable>(1, TensorRow(1, outputIds))); | |||
| (*out_buffer)->set_tensor_table(std::make_unique<TensorQTable>(1, TensorRow(1, outputIds))); | |||
| } | |||
| return Status::OK(); | |||
| @@ -24,7 +24,6 @@ | |||
| #include "dataset/engine/datasetops/source/storage_client.h" | |||
| #include "dataset/engine/datasetops/source/storage_op.h" | |||
| #include "dataset/engine/datasetops/source/tf_client.h" | |||
| #include "dataset/util/make_unique.h" | |||
| #include "dataset/util/status.h" | |||
| namespace mindspore { | |||
| @@ -57,7 +56,7 @@ static Status CreateStorageClientSwitch( | |||
| case DatasetType::kTf: { | |||
| // Construct the derived class TFClient, stored as base class StorageClient | |||
| store_op->set_rows_per_buffer(32); | |||
| *out_client = mindspore::make_unique<TFClient>(std::move(schema), store_op); | |||
| *out_client = std::make_unique<TFClient>(std::move(schema), store_op); | |||
| break; | |||
| } | |||
| case DatasetType::kUnknown: | |||
| @@ -83,7 +82,7 @@ Status StorageClient::CreateStorageClient( | |||
| std::shared_ptr<StorageClient> *out_client) { // Out: the created storage client | |||
| // Make a new schema first. This only assigns the dataset type. It does not | |||
| // create the columns yet. | |||
| auto new_schema = mindspore::make_unique<DataSchema>(); | |||
| auto new_schema = std::make_unique<DataSchema>(); | |||
| RETURN_IF_NOT_OK(new_schema->LoadDatasetType(dataset_schema_path)); | |||
| RETURN_IF_NOT_OK(CreateStorageClientSwitch(std::move(new_schema), store_op, out_client)); | |||
| return Status::OK(); | |||
| @@ -99,7 +98,7 @@ Status StorageClient::CreateStorageClient( | |||
| std::shared_ptr<StorageClient> *out_client) { // Out: the created storage client | |||
| // The dataset type is passed in by the user. Create an empty schema with only | |||
| // only the dataset type filled in and then create the client with it. | |||
| auto new_schema = mindspore::make_unique<DataSchema>(); | |||
| auto new_schema = std::make_unique<DataSchema>(); | |||
| new_schema->set_dataset_type(in_type); | |||
| RETURN_IF_NOT_OK(CreateStorageClientSwitch(std::move(new_schema), store_op, out_client)); | |||
| return Status::OK(); | |||
| @@ -147,7 +146,7 @@ Status StorageClient::AssignDatasetLayout(uint32_t num_rows, // In: Th | |||
| // The current schema was just an empty one with only the dataset field populated. | |||
| // Let's copy construct a new one that will be a copy of the input schema (releasing the old | |||
| // one) and then set the number of rows that the user requested. | |||
| data_schema_ = mindspore::make_unique<DataSchema>(schema); | |||
| data_schema_ = std::make_unique<DataSchema>(schema); | |||
| CHECK_FAIL_RETURN_UNEXPECTED(num_rows <= MAX_INTEGER_INT32, "numRows exceeds the boundary numRows>2147483647"); | |||
| num_rows_in_dataset_ = num_rows; | |||
| @@ -303,7 +303,7 @@ Status StorageOp::init() { | |||
| // For simplicity, we'll make both of them 3 so they are the same size. | |||
| int32_t action_queue_size = (buffers_needed / num_workers_) + 1; | |||
| for (int32_t i = 0; i < num_workers_; ++i) { | |||
| auto new_queue = mindspore::make_unique<Queue<int32_t>>(action_queue_size); | |||
| auto new_queue = std::make_unique<Queue<int32_t>>(action_queue_size); | |||
| action_queue_.push_back(std::move(new_queue)); | |||
| } | |||
| } | |||
| @@ -483,10 +483,10 @@ Status StorageOp::operator()() { | |||
| // Post the control message to tell the workers to stop waiting on action queue | |||
| // because we are done! | |||
| RETURN_IF_NOT_OK(this->PostEndOfData()); | |||
| std::unique_ptr<DataBuffer> eoeBuffer = mindspore::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE); | |||
| std::unique_ptr<DataBuffer> eoeBuffer = std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(0, std::move(eoeBuffer))); | |||
| MS_LOG(INFO) << "StorageOp master: Flow end-of-data eof message."; | |||
| std::unique_ptr<DataBuffer> eofBuffer = mindspore::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF); | |||
| std::unique_ptr<DataBuffer> eofBuffer = std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(0, std::move(eofBuffer))); | |||
| MS_LOG(INFO) << "StorageOp master: Main execution loop complete."; | |||
| done = true; // while loop exit | |||
| @@ -496,7 +496,7 @@ Status StorageOp::operator()() { | |||
| // RepeatOp above us somewhere in the tree will re-init us with the data to fetch again | |||
| // once it gets the end-of-epoch message. | |||
| MS_LOG(INFO) << "StorageOp master: Flow end-of-epoch eoe message."; | |||
| std::unique_ptr<DataBuffer> eoe_buffer = mindspore::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE); | |||
| std::unique_ptr<DataBuffer> eoe_buffer = std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(0, std::move(eoe_buffer))); | |||
| // reset our buffer count and go to loop again. | |||
| @@ -27,7 +27,6 @@ | |||
| #include "dataset/core/data_type.h" | |||
| #include "dataset/engine/datasetops/source/storage_client.h" | |||
| #include "dataset/engine/data_schema.h" | |||
| #include "dataset/util/make_unique.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| @@ -72,7 +71,7 @@ Status TFBuffer::Load() { | |||
| } | |||
| // Construct the Tensor table for this buffer. | |||
| tensor_table_ = mindspore::make_unique<TensorQTable>(); | |||
| tensor_table_ = std::make_unique<TensorQTable>(); | |||
| // At each position in the tensor table, instantiate the shared pointer to it's Tensor. | |||
| uint32_t row = 0; | |||
| @@ -272,7 +271,7 @@ Status TFBuffer::LoadFloatList(const ColDescriptor ¤t_col, const dataengin | |||
| // Identify how many values we have and then create a local array of these | |||
| // to deserialize into | |||
| *num_elements = float_list.value_size(); | |||
| *float_array = mindspore::make_unique<float[]>(*num_elements); | |||
| *float_array = std::make_unique<float[]>(*num_elements); | |||
| for (int i = 0; i < float_list.value_size(); i++) { | |||
| (*float_array)[i] = float_list.value(i); | |||
| } | |||
| @@ -294,7 +293,7 @@ Status TFBuffer::LoadIntList(const ColDescriptor ¤t_col, const dataengine: | |||
| // Identify how many values we have and then create a local array of these | |||
| // to deserialize into | |||
| *num_elements = int64_list.value_size(); | |||
| *int_array = mindspore::make_unique<int64_t[]>(*num_elements); | |||
| *int_array = std::make_unique<int64_t[]>(*num_elements); | |||
| for (int i = 0; i < int64_list.value_size(); i++) { | |||
| (*int_array)[i] = int64_list.value(i); | |||
| } | |||
| @@ -36,7 +36,6 @@ | |||
| #include "dataset/engine/db_connector.h" | |||
| #include "dataset/engine/execution_tree.h" | |||
| #include "dataset/engine/jagged_connector.h" | |||
| #include "dataset/util/make_unique.h" | |||
| #include "dataset/util/path.h" | |||
| #include "dataset/util/queue.h" | |||
| #include "dataset/util/random.h" | |||
| @@ -54,7 +53,7 @@ TFReaderOp::Builder::Builder() | |||
| builder_op_connector_size_ = config_manager->op_connector_size(); | |||
| builder_rows_per_buffer_ = config_manager->rows_per_buffer(); | |||
| builder_shuffle_files_ = false; | |||
| builder_data_schema_ = make_unique<DataSchema>(); | |||
| builder_data_schema_ = std::make_unique<DataSchema>(); | |||
| } | |||
| Status TFReaderOp::Builder::ValidateInputs() const { | |||
| @@ -103,7 +102,7 @@ TFReaderOp::TFReaderOp(int32_t num_workers, int32_t worker_connector_size, int64 | |||
| finished_reading_dataset_(false), | |||
| shuffle_files_(shuffle_files), | |||
| data_schema_(std::move(data_schema)), | |||
| filename_index_(make_unique<StringIndex>()), | |||
| filename_index_(std::make_unique<StringIndex>()), | |||
| load_io_block_queue_(true), | |||
| load_jagged_connector_(true), | |||
| num_rows_(0), | |||
| @@ -129,7 +128,7 @@ Status TFReaderOp::Init() { | |||
| // parallel op base. | |||
| RETURN_IF_NOT_OK(ParallelOp::CreateWorkerConnector(worker_connector_size_)); | |||
| jagged_buffer_connector_ = mindspore::make_unique<JaggedConnector>(num_workers_, 1, worker_connector_size_); | |||
| jagged_buffer_connector_ = std::make_unique<JaggedConnector>(num_workers_, 1, worker_connector_size_); | |||
| // temporary: make size large enough to hold all files + EOE to avoid hangs | |||
| int32_t safe_queue_size = static_cast<int32_t>(std::ceil(dataset_files_list_.size() / num_workers_)) + 1; | |||
| @@ -229,7 +228,7 @@ Status TFReaderOp::operator()() { | |||
| } | |||
| // all workers finished reading for this epoch, and we have read all the data from all workers | |||
| std::unique_ptr<DataBuffer> eoe_buffer = mindspore::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE); | |||
| std::unique_ptr<DataBuffer> eoe_buffer = std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(0, std::move(eoe_buffer))); | |||
| if (!BitTest(op_ctrl_flags_, kDeOpRepeated) || BitTest(op_ctrl_flags_, kDeOpLastRepeat)) { | |||
| @@ -241,7 +240,7 @@ Status TFReaderOp::operator()() { | |||
| } | |||
| } | |||
| std::unique_ptr<DataBuffer> eof_buffer = mindspore::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF); | |||
| std::unique_ptr<DataBuffer> eof_buffer = std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(0, std::move(eof_buffer))); | |||
| RETURN_IF_NOT_OK(PostEndOfData()); | |||
| @@ -274,7 +273,7 @@ Status TFReaderOp::WorkerEntry(int32_t worker_id) { | |||
| MS_LOG(INFO) << "TFReader operator worker " << worker_id << " loaded file " << filename << "."; | |||
| } | |||
| } else { | |||
| std::unique_ptr<DataBuffer> eoe_buffer = mindspore::make_unique<DataBuffer>(1, DataBuffer::kDeBFlagEOE); | |||
| std::unique_ptr<DataBuffer> eoe_buffer = std::make_unique<DataBuffer>(1, DataBuffer::kDeBFlagEOE); | |||
| RETURN_IF_NOT_OK(jagged_buffer_connector_->Add(worker_id, std::move(eoe_buffer))); | |||
| } | |||
| @@ -288,7 +287,7 @@ Status TFReaderOp::WorkerEntry(int32_t worker_id) { | |||
| // When the worker pops this control indicator, it will shut itself down gracefully. | |||
| Status TFReaderOp::PostEndOfData() { | |||
| for (int i = 0; i < num_workers_; ++i) { | |||
| std::unique_ptr<FilenameBlock> eof = mindspore::make_unique<FilenameBlock>(IOBlock::kDeIoBlockFlagEof); | |||
| std::unique_ptr<FilenameBlock> eof = std::make_unique<FilenameBlock>(IOBlock::kDeIoBlockFlagEof); | |||
| RETURN_IF_NOT_OK(PushIoBlockQueue(i, std::move(eof))); | |||
| } | |||
| @@ -299,7 +298,7 @@ Status TFReaderOp::PostEndOfData() { | |||
| // pops this control indicator, it will wait until the next epoch starts and then resume execution. | |||
| Status TFReaderOp::PostEndOfEpoch(int32_t queue_index) { | |||
| for (int i = 0; i < num_workers_; ++i) { | |||
| std::unique_ptr<FilenameBlock> eoe = mindspore::make_unique<FilenameBlock>(IOBlock::kDeIoBlockFlagEoe); | |||
| std::unique_ptr<FilenameBlock> eoe = std::make_unique<FilenameBlock>(IOBlock::kDeIoBlockFlagEoe); | |||
| RETURN_IF_NOT_OK(PushIoBlockQueue((queue_index + i) % num_workers_, std::move(eoe))); | |||
| } | |||
| @@ -358,7 +357,7 @@ Status TFReaderOp::FillIOBlockShuffle(const std::vector<int64_t> &i_keys) { | |||
| } | |||
| if (!equal_rows_per_shard_) { | |||
| if (key_index++ % num_devices_ == device_id_) { | |||
| auto ioBlock = make_unique<FilenameBlock>(*it, kInvalidOffset, kInvalidOffset, IOBlock::kDeIoBlockNone); | |||
| auto ioBlock = std::make_unique<FilenameBlock>(*it, kInvalidOffset, kInvalidOffset, IOBlock::kDeIoBlockNone); | |||
| RETURN_IF_NOT_OK(PushIoBlockQueue(queue_index, std::move(ioBlock))); | |||
| queue_index = (queue_index + 1) % num_workers_; | |||
| } | |||
| @@ -367,7 +366,7 @@ Status TFReaderOp::FillIOBlockShuffle(const std::vector<int64_t> &i_keys) { | |||
| auto file_it = filename_index_->Search(*it); | |||
| std::string file_name = file_it.value(); | |||
| if (NeedPushFileToblockQueue(file_name, &start_offset, &end_offset, pre_count)) { | |||
| auto ioBlock = make_unique<FilenameBlock>(*it, start_offset, end_offset, IOBlock::kDeIoBlockNone); | |||
| auto ioBlock = std::make_unique<FilenameBlock>(*it, start_offset, end_offset, IOBlock::kDeIoBlockNone); | |||
| RETURN_IF_NOT_OK(PushIoBlockQueue(queue_index, std::move(ioBlock))); | |||
| MS_LOG(DEBUG) << "File name " << *it << " start offset " << start_offset << " end_offset " << end_offset; | |||
| queue_index = (queue_index + 1) % num_workers_; | |||
| @@ -404,14 +403,15 @@ Status TFReaderOp::FillIOBlockNoShuffle() { | |||
| } | |||
| if (!equal_rows_per_shard_) { | |||
| if (key_index++ % num_devices_ == device_id_) { | |||
| auto ioBlock = make_unique<FilenameBlock>(it.key(), kInvalidOffset, kInvalidOffset, IOBlock::kDeIoBlockNone); | |||
| auto ioBlock = | |||
| std::make_unique<FilenameBlock>(it.key(), kInvalidOffset, kInvalidOffset, IOBlock::kDeIoBlockNone); | |||
| RETURN_IF_NOT_OK(PushIoBlockQueue(queue_index, std::move(ioBlock))); | |||
| queue_index = (queue_index + 1) % num_workers_; | |||
| } | |||
| } else { | |||
| std::string file_name = it.value(); | |||
| if (NeedPushFileToblockQueue(file_name, &start_offset, &end_offset, pre_count)) { | |||
| auto ioBlock = make_unique<FilenameBlock>(it.key(), start_offset, end_offset, IOBlock::kDeIoBlockNone); | |||
| auto ioBlock = std::make_unique<FilenameBlock>(it.key(), start_offset, end_offset, IOBlock::kDeIoBlockNone); | |||
| RETURN_IF_NOT_OK(PushIoBlockQueue(queue_index, std::move(ioBlock))); | |||
| queue_index = (queue_index + 1) % num_workers_; | |||
| } | |||
| @@ -490,14 +490,13 @@ Status TFReaderOp::LoadFile(const std::string &filename, const int64_t start_off | |||
| int64_t rows_read = 0; | |||
| int64_t rows_total = 0; | |||
| std::unique_ptr<DataBuffer> current_buffer = | |||
| mindspore::make_unique<DataBuffer>(0, DataBuffer::BufferFlags::kDeBFlagNone); | |||
| std::unique_ptr<DataBuffer> current_buffer = std::make_unique<DataBuffer>(0, DataBuffer::BufferFlags::kDeBFlagNone); | |||
| std::unordered_map<std::string, int32_t> column_name_map; | |||
| for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) { | |||
| column_name_map[data_schema_->column(i).name()] = i; | |||
| } | |||
| current_buffer->set_column_name_map(column_name_map); | |||
| std::unique_ptr<TensorQTable> new_tensor_table = make_unique<TensorQTable>(); | |||
| std::unique_ptr<TensorQTable> new_tensor_table = std::make_unique<TensorQTable>(); | |||
| while (reader.peek() != EOF) { | |||
| if (!load_jagged_connector_) { | |||
| @@ -532,9 +531,9 @@ Status TFReaderOp::LoadFile(const std::string &filename, const int64_t start_off | |||
| current_buffer->set_tensor_table(std::move(new_tensor_table)); | |||
| RETURN_IF_NOT_OK(jagged_buffer_connector_->Add(worker_id, std::move(current_buffer))); | |||
| current_buffer = make_unique<DataBuffer>(0, DataBuffer::BufferFlags::kDeBFlagNone); | |||
| current_buffer = std::make_unique<DataBuffer>(0, DataBuffer::BufferFlags::kDeBFlagNone); | |||
| current_buffer->set_column_name_map(column_name_map); | |||
| new_tensor_table = make_unique<TensorQTable>(); | |||
| new_tensor_table = std::make_unique<TensorQTable>(); | |||
| rows_read = 0; | |||
| } | |||
| } | |||
| @@ -742,7 +741,7 @@ Status TFReaderOp::LoadFloatList(const ColDescriptor ¤t_col, const dataeng | |||
| // Identify how many values we have and then create a local array of these | |||
| // to deserialize into | |||
| *num_elements = float_list.value_size(); | |||
| *float_array = mindspore::make_unique<float[]>(*num_elements); | |||
| *float_array = std::make_unique<float[]>(*num_elements); | |||
| for (int i = 0; i < float_list.value_size(); ++i) { | |||
| (*float_array)[i] = float_list.value(i); | |||
| } | |||
| @@ -38,7 +38,7 @@ Status VOCOp::Builder::Build(std::shared_ptr<VOCOp> *ptr) { | |||
| if (builder_sampler_ == nullptr) { | |||
| builder_sampler_ = std::make_shared<SequentialSampler>(); | |||
| } | |||
| builder_schema_ = make_unique<DataSchema>(); | |||
| builder_schema_ = std::make_unique<DataSchema>(); | |||
| RETURN_IF_NOT_OK( | |||
| builder_schema_->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1))); | |||
| RETURN_IF_NOT_OK( | |||
| @@ -85,7 +85,7 @@ Status VOCOp::TraverseSampleIds(const std::shared_ptr<Tensor> &sample_ids, std:: | |||
| row_cnt_++; | |||
| if (row_cnt_ % rows_per_buffer_ == 0) { | |||
| RETURN_IF_NOT_OK(io_block_queues_[buf_cnt_++ % num_workers_]->Add( | |||
| make_unique<IOBlock>(IOBlock(*keys, IOBlock::kDeIoBlockNone)))); | |||
| std::make_unique<IOBlock>(IOBlock(*keys, IOBlock::kDeIoBlockNone)))); | |||
| keys->clear(); | |||
| } | |||
| } | |||
| @@ -110,21 +110,21 @@ Status VOCOp::operator()() { | |||
| } | |||
| if (keys.empty() == false) { | |||
| RETURN_IF_NOT_OK(io_block_queues_[(buf_cnt_++) % num_workers_]->Add( | |||
| make_unique<IOBlock>(IOBlock(keys, IOBlock::kDeIoBlockNone)))); | |||
| std::make_unique<IOBlock>(IOBlock(keys, IOBlock::kDeIoBlockNone)))); | |||
| } | |||
| if (!BitTest(op_ctrl_flags_, kDeOpRepeated) || BitTest(op_ctrl_flags_, kDeOpLastRepeat)) { | |||
| std::unique_ptr<IOBlock> eoe_block = make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe); | |||
| std::unique_ptr<IOBlock> eof_block = make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEof); | |||
| std::unique_ptr<IOBlock> eoe_block = std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe); | |||
| std::unique_ptr<IOBlock> eof_block = std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEof); | |||
| RETURN_IF_NOT_OK(io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::move(eoe_block))); | |||
| RETURN_IF_NOT_OK(io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::move(eof_block))); | |||
| for (int32_t i = 0; i < num_workers_; i++) { | |||
| RETURN_IF_NOT_OK( | |||
| io_block_queues_[i]->Add(make_unique<IOBlock>(std::vector<int64_t>(), IOBlock::kDeIoBlockNone))); | |||
| io_block_queues_[i]->Add(std::make_unique<IOBlock>(std::vector<int64_t>(), IOBlock::kDeIoBlockNone))); | |||
| } | |||
| return Status::OK(); | |||
| } else { | |||
| RETURN_IF_NOT_OK( | |||
| io_block_queues_[(buf_cnt_++) % num_workers_]->Add(make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe))); | |||
| io_block_queues_[(buf_cnt_++) % num_workers_]->Add(std::make_unique<IOBlock>(IOBlock::kDeIoBlockFlagEoe))); | |||
| RETURN_IF_NOT_OK(wp_.Wait()); | |||
| wp_.Clear(); | |||
| RETURN_IF_NOT_OK(sampler_->GetNextBuffer(&sampler_buffer)); | |||
| @@ -164,7 +164,7 @@ Status VOCOp::LoadTensorRow(const std::string &image_id, TensorRow *trow) { | |||
| } | |||
| Status VOCOp::LoadBuffer(const std::vector<int64_t> &keys, std::unique_ptr<DataBuffer> *db) { | |||
| std::unique_ptr<TensorQTable> deq = make_unique<TensorQTable>(); | |||
| std::unique_ptr<TensorQTable> deq = std::make_unique<TensorQTable>(); | |||
| TensorRow trow; | |||
| for (const uint64_t &key : keys) { | |||
| RETURN_IF_NOT_OK(this->LoadTensorRow(image_ids_[key], &trow)); | |||
| @@ -182,15 +182,15 @@ Status VOCOp::WorkerEntry(int32_t worker_id) { | |||
| RETURN_IF_NOT_OK(io_block_queues_[worker_id]->PopFront(&io_block)); | |||
| while (io_block != nullptr) { | |||
| if (io_block->eoe() == true) { | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE))); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE))); | |||
| buffer_id = worker_id; | |||
| } else if (io_block->eof() == true) { | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, (make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF)))); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, (std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF)))); | |||
| } else { | |||
| std::vector<int64_t> keys; | |||
| RETURN_IF_NOT_OK(io_block->GetKeys(&keys)); | |||
| if (keys.empty() == true) return Status::OK(); | |||
| std::unique_ptr<DataBuffer> db = make_unique<DataBuffer>(buffer_id, DataBuffer::kDeBFlagNone); | |||
| std::unique_ptr<DataBuffer> db = std::make_unique<DataBuffer>(buffer_id, DataBuffer::kDeBFlagNone); | |||
| RETURN_IF_NOT_OK(LoadBuffer(keys, &db)); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::move(db))); | |||
| buffer_id += num_workers_; | |||
| @@ -65,13 +65,13 @@ Status ZipOp::operator()() { | |||
| // initialize the iterators | |||
| for (int32_t i = 0; i < children_num_; ++i) { | |||
| // magic number 0 since Zip is not a parallel Op | |||
| child_iterators_.push_back(mindspore::make_unique<ChildIterator>(this, 0, i)); | |||
| child_iterators_.push_back(std::make_unique<ChildIterator>(this, 0, i)); | |||
| } | |||
| // Loop until eof is true | |||
| while (!eof_) { | |||
| // Create tensor table and prepare it by fetching and packing the first zipped row into it. | |||
| std::unique_ptr<TensorQTable> curr_table = mindspore::make_unique<TensorQTable>(); | |||
| std::unique_ptr<TensorQTable> curr_table = std::make_unique<TensorQTable>(); | |||
| RETURN_IF_NOT_OK(prepare(curr_table.get())); | |||
| // If an eof got picked up during the above prepare, then we're done | |||
| @@ -81,7 +81,7 @@ Status ZipOp::operator()() { | |||
| while (!draining_) { | |||
| // 1. If a previous loop iteration sent the current table out, then create a new one. | |||
| if (curr_table == nullptr) { | |||
| curr_table = mindspore::make_unique<TensorQTable>(); | |||
| curr_table = std::make_unique<TensorQTable>(); | |||
| } | |||
| // 2 fill the table. Note: draining mode might get turned on if any of the child inputs were done | |||
| @@ -89,8 +89,7 @@ Status ZipOp::operator()() { | |||
| // 3 create and update buffer and send it to the out connector | |||
| if (!curr_table->empty()) { | |||
| std::unique_ptr<DataBuffer> curr_buffer = | |||
| mindspore::make_unique<DataBuffer>(buffer_id_, DataBuffer::kDeBFlagNone); | |||
| std::unique_ptr<DataBuffer> curr_buffer = std::make_unique<DataBuffer>(buffer_id_, DataBuffer::kDeBFlagNone); | |||
| curr_buffer->set_tensor_table(std::move(curr_table)); | |||
| curr_buffer->set_column_name_map(col_name_id_map_); | |||
| MS_LOG(DEBUG) << "Zip operator finished one buffer, pushing, rows " << curr_buffer->NumRows() << ", cols " | |||
| @@ -105,15 +104,14 @@ Status ZipOp::operator()() { | |||
| MS_LOG(DEBUG) << "Zip operator is now draining child inputs."; | |||
| RETURN_IF_NOT_OK(drainPipeline()); | |||
| // Now that we have drained child inputs, send the eoe up. | |||
| RETURN_IF_NOT_OK( | |||
| out_connector_->Add(0, std::move(mindspore::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE)))); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(0, std::move(std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE)))); | |||
| } | |||
| } | |||
| // 5 handle eof | |||
| // propagate eof here. | |||
| MS_LOG(INFO) << "Zip operator got EOF, propagating."; | |||
| RETURN_IF_NOT_OK(out_connector_->Add(0, std::move(mindspore::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF)))); | |||
| RETURN_IF_NOT_OK(out_connector_->Add(0, std::move(std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF)))); | |||
| return Status::OK(); | |||
| } | |||
| @@ -65,7 +65,7 @@ class DbConnector : public Connector<std::unique_ptr<DataBuffer>> { | |||
| RETURN_IF_NOT_OK(cv_.Wait(&lk, [this, worker_id]() { return expect_consumer_ == worker_id; })); | |||
| // Once an EOF message is encountered this flag will be set and we can return early. | |||
| if (end_of_file_) { | |||
| *result = mindspore::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF); | |||
| *result = std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF); | |||
| } else { | |||
| RETURN_IF_NOT_OK(queues_[pop_from_]->PopFront(result)); | |||
| if (*result == nullptr) { | |||
| @@ -24,7 +24,7 @@ namespace mindspore { | |||
| namespace dataset { | |||
| // Constructor | |||
| ExecutionTree::ExecutionTree() : id_count_(0) { | |||
| tg_ = mindspore::make_unique<TaskGroup>(); | |||
| tg_ = std::make_unique<TaskGroup>(); | |||
| tree_state_ = kDeTStateInit; | |||
| prepare_flags_ = kDePrepNone; | |||
| } | |||
| @@ -24,7 +24,6 @@ | |||
| #include "dataset/core/cv_tensor.h" | |||
| #include "dataset/core/tensor.h" | |||
| #include "dataset/core/tensor_shape.h" | |||
| #include "dataset/util/make_unique.h" | |||
| #include "dataset/util/random.h" | |||
| #define MAX_INT_PRECISION 16777216 // float int precision is 16777216 | |||
| @@ -376,7 +375,7 @@ Status HwcToChw(std::shared_ptr<Tensor> input, std::shared_ptr<Tensor> *output) | |||
| int width = input_cv->shape()[1]; | |||
| int num_channels = input_cv->shape()[2]; | |||
| auto output_cv = mindspore::make_unique<CVTensor>(TensorShape{num_channels, height, width}, input_cv->type()); | |||
| auto output_cv = std::make_unique<CVTensor>(TensorShape{num_channels, height, width}, input_cv->type()); | |||
| for (int i = 0; i < num_channels; ++i) { | |||
| cv::Mat mat; | |||
| RETURN_IF_NOT_OK(output_cv->Mat({i}, &mat)); | |||
| @@ -84,18 +84,8 @@ Status Resize(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *out | |||
| // Returns Decoded image | |||
| // Supported images: | |||
| // - Windows bitmaps - \*.bmp, \*.dib (always supported) | |||
| // - JPEG files - \*.jpeg, \*.jpg, \*.jpe (see the *Note* section) | |||
| // - JPEG 2000 files - \*.jp2 (see the *Note* section) | |||
| // - Portable Network Graphics - \*.png (see the *Note* section) | |||
| // - WebP - \*.webp (see the *Note* section) | |||
| // - Portable image format - \*.pbm, \*.pgm, \*.ppm \*.pxm, \*.pnm (always supported) | |||
| // - PFM files - \*.pfm (see the *Note* section) | |||
| // - Sun rasters - \*.sr, \*.ras (always supported) | |||
| // - TIFF files - \*.tiff, \*.tif (see the *Note* section) | |||
| // - OpenEXR Image files - \*.exr (see the *Note* section) | |||
| // - Radiance HDR - \*.hdr, \*.pic (always supported) | |||
| // - Raster and Vector geospatial data supported by GDAL (see the *Note* section) | |||
| // BMP JPEG JPG PNG TIFF | |||
| // supported by opencv, if user need more image analysis capabilities, please compile opencv particularlly. | |||
| // @param input: CVTensor containing the not decoded image 1D bytes | |||
| // @param output: Decoded image Tensor of shape <H,W,C> and type DE_UINT8. Pixel order is RGB | |||
| Status Decode(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output); | |||
| @@ -20,7 +20,6 @@ | |||
| #include "dataset/core/tensor.h" | |||
| #include "dataset/kernels/tensor_op.h" | |||
| #include "dataset/util/make_unique.h" | |||
| #include "dataset/util/status.h" | |||
| namespace mindspore { | |||
| @@ -16,7 +16,6 @@ | |||
| #include "dataset/util/arena.h" | |||
| #include <unistd.h> | |||
| #include <utility> | |||
| #include "dataset/util/make_unique.h" | |||
| #include "dataset/util/system_pool.h" | |||
| #include "dataset/util/de_error.h" | |||
| #include "./securec.h" | |||
| @@ -18,10 +18,8 @@ | |||
| #include <algorithm> | |||
| #include <limits> | |||
| #include <utility> | |||
| #include "./securec.h" | |||
| #include "dataset/util/make_unique.h" | |||
| #include "dataset/util/de_error.h" | |||
| #include "dataset/util/system_pool.h" | |||
| namespace mindspore { | |||
| @@ -16,6 +16,13 @@ | |||
| #ifndef DATASET_UTIL_DE_ERROR_H_ | |||
| #define DATASET_UTIL_DE_ERROR_H_ | |||
| #ifdef DEBUG | |||
| #include <cassert> | |||
| #define DS_ASSERT(f) assert(f) | |||
| #else | |||
| #define DS_ASSERT(f) ((void)0) | |||
| #endif | |||
| #include <map> | |||
| #include "utils/error_code.h" | |||
| @@ -18,8 +18,7 @@ | |||
| #include <iostream> | |||
| #include <iterator> | |||
| #include "dataset/util/make_unique.h" | |||
| #include "dataset/util/de_error.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| @@ -14,6 +14,7 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include "dataset/util/lock.h" | |||
| #include "dataset/util/de_error.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| @@ -19,7 +19,6 @@ | |||
| #include <atomic> | |||
| #include <condition_variable> | |||
| #include <mutex> | |||
| #include "dataset/util/make_unique.h" | |||
| namespace mindspore { | |||
| namespace dataset { | |||
| @@ -1,37 +0,0 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef DATASET_UTIL_MAKE_UNIQUE_H_ | |||
| #define DATASET_UTIL_MAKE_UNIQUE_H_ | |||
| #ifdef DEBUG | |||
| #include <cassert> | |||
| #define DS_ASSERT(f) assert(f) | |||
| #else | |||
| #define DS_ASSERT(f) ((void)0) | |||
| #endif | |||
| #include <memory> | |||
| #include <type_traits> | |||
| #include <utility> | |||
| #include "dataset/util/de_error.h" | |||
| #include "utils/log_adapter.h" | |||
| namespace mindspore { | |||
| using std::make_unique; | |||
| } // namespace mindspore | |||
| #endif // DATASET_UTIL_MAKE_UNIQUE_H_ | |||
| @@ -212,7 +212,7 @@ class QueueList { | |||
| void Init(int num_queues, int capacity) { | |||
| queue_list_.reserve(num_queues); | |||
| for (int i = 0; i < num_queues; i++) { | |||
| queue_list_.emplace_back(mindspore::make_unique<Queue<T>>(capacity)); | |||
| queue_list_.emplace_back(std::make_unique<Queue<T>>(capacity)); | |||
| } | |||
| } | |||
| @@ -27,7 +27,6 @@ | |||
| #include <string> | |||
| #include <thread> | |||
| #include "dataset/util/de_error.h" | |||
| #include "dataset/util/make_unique.h" | |||
| #include "dataset/util/intrp_resource.h" | |||
| #include "dataset/util/list.h" | |||
| #include "dataset/util/memory_pool.h" | |||
| @@ -262,8 +262,8 @@ AscendDeviceAddress::~AscendDeviceAddress() { | |||
| if (ptr_ == nullptr) { | |||
| return; | |||
| } | |||
| if (mem_dynamic_alloc_) { | |||
| AscendMemoryAllocator::GetInstance().FreeTensorMem(ptr_); | |||
| if (from_mem_pool_) { | |||
| AscendMemoryPool::GetInstance().FreeTensorMem(ptr_); | |||
| ptr_ = nullptr; | |||
| } | |||
| } | |||
| @@ -21,7 +21,7 @@ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "device/device_address.h" | |||
| #include "device/ascend/ascend_memory_allocator.h" | |||
| #include "device/ascend/ascend_memory_pool.h" | |||
| #include "ir/dtype.h" | |||
| namespace mindspore { | |||
| @@ -29,7 +29,7 @@ | |||
| #include "hccl/hcom.h" | |||
| #include "runtime/context.h" | |||
| #include "device/ascend/ascend_stream_assign.h" | |||
| #include "device/ascend/ascend_memory_allocator.h" | |||
| #include "device/ascend/ascend_memory_pool.h" | |||
| #include "framework/ge_runtime/model_runner.h" | |||
| #include "device/ascend/tasksink/task_generator.h" | |||
| #include "session/anf_runtime_algorithm.h" | |||
| @@ -37,6 +37,7 @@ | |||
| #include "kernel/tbe/tbe_utils.h" | |||
| #include "kernel/tbe/tbe_python_funcs.h" | |||
| #include "pre_activate/mem_reuse/mem_reuse_checker.h" | |||
| #include "device/ascend/ascend_memory_manager.h" | |||
| using mindspore::device::ascend::ProfilingManager; | |||
| using mindspore::device::ascend::ProfilingUtils; | |||
| @@ -47,8 +48,6 @@ using std::vector; | |||
| namespace mindspore { | |||
| namespace device { | |||
| namespace ascend { | |||
| static const uint64_t ASCEND_MEM_SIZE = 20; | |||
| static const uint64_t ASCEND_MEM_SIZE_BYTE = (ASCEND_MEM_SIZE << 30); | |||
| static const size_t PRAMATER_OUTPUT_INDEX = 0; | |||
| AscendKernelRuntime::~AscendKernelRuntime() { graph_model_map_.clear(); } | |||
| @@ -86,7 +85,8 @@ void AscendKernelRuntime::ReleaseDeviceRes() { | |||
| MS_EXCEPTION(DeviceProcessError) << "rtSetDevice, ret[" << static_cast<int>(ret) << "]"; | |||
| } | |||
| FreeDeviceMemory(); | |||
| MS_EXCEPTION_IF_NULL(mem_manager_); | |||
| mem_manager_->FreeDeviceMemory(); | |||
| (void)DestroyHccl(); | |||
| (void)ResetDevice(); | |||
| (void)ProfilingManager::GetInstance().StopProfiling(); | |||
| @@ -109,11 +109,9 @@ bool AscendKernelRuntime::Init() { | |||
| if (!ret) { | |||
| return ret; | |||
| } | |||
| ret = MallocDeviceMemory(); | |||
| if (!ret) { | |||
| return ret; | |||
| } | |||
| mem_manager_ = std::make_shared<AscendMemoryManager>(); | |||
| MS_EXCEPTION_IF_NULL(mem_manager_); | |||
| mem_manager_->MallocDeviceMemory(); | |||
| ret = ProfilingManager::GetInstance().StartupProfiling(device_id_); | |||
| if (!ret) { | |||
| @@ -239,13 +237,6 @@ DeviceAddressPtr AscendKernelRuntime::CreateDeviceAddress(void *device_ptr, size | |||
| return std::make_shared<AscendDeviceAddress>(device_ptr, device_size, format, type_id); | |||
| } | |||
| void AscendKernelRuntime::MallocOpMemory(const DeviceAddressPtr address, size_t size, int) { | |||
| auto device_ptr = AscendMemoryAllocator::GetInstance().AllocTensorMem(size); | |||
| MS_EXCEPTION_IF_NULL(device_ptr); | |||
| address->ptr_ = device_ptr; | |||
| address->mem_dynamic_alloc_ = true; | |||
| } | |||
| bool AscendKernelRuntime::GenTask(const session::KernelGraph *graph) { | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| @@ -474,42 +465,6 @@ bool AscendKernelRuntime::DestroyHccl() { | |||
| context_ptr->set_enable_hccl(false); | |||
| return true; | |||
| } | |||
| bool AscendKernelRuntime::MallocDeviceMemory() { | |||
| device_mem_size_ = ASCEND_MEM_SIZE_BYTE; | |||
| static_mem_offset_ = FloatToSize(device_mem_size_ * GRAPH_INIT_ASCEND_MEM_RATIO); | |||
| auto ret = rtMalloc(reinterpret_cast<void **>(&device_mem_base_), static_mem_offset_, RT_MEMORY_HBM); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << static_mem_offset_ << "] fail, ret[" << ret << "]"; | |||
| } | |||
| device_mem_pool_size_ = FloatToSize(device_mem_size_ * (1 - GRAPH_INIT_ASCEND_MEM_RATIO)); | |||
| ret = rtMalloc(reinterpret_cast<void **>(&device_mem_pool_base_), device_mem_pool_size_, RT_MEMORY_HBM); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << device_mem_pool_size_ << "] fail, ret[" << ret << "]"; | |||
| } | |||
| AscendMemoryAllocator::GetInstance().set_device_mem_pool_base(device_mem_pool_base_); | |||
| AscendMemoryAllocator::GetInstance().set_device_mem_pool_size(device_mem_pool_size_); | |||
| return true; | |||
| } | |||
| void AscendKernelRuntime::FreeDeviceMemory() { | |||
| if (device_mem_base_ != nullptr) { | |||
| auto ret = rtFree(device_mem_base_); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(ERROR) << "rtFree mem size[" << device_mem_size_ << "] fail, ret[" << ret << "]"; | |||
| } | |||
| device_mem_base_ = nullptr; | |||
| } | |||
| if (device_mem_pool_base_ != nullptr) { | |||
| auto ret = rtFree(device_mem_pool_base_); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(ERROR) << "rtFree mem size[" << device_mem_pool_size_ << "] fail, ret[" << ret << "]"; | |||
| } | |||
| device_mem_pool_base_ = nullptr; | |||
| } | |||
| } | |||
| void AscendKernelRuntime::FreeHostMemory() { dynamic_mem_offset_ = 0; } | |||
| } // namespace ascend | |||
| } // namespace device | |||
| } // namespace mindspore | |||
| @@ -39,13 +39,11 @@ class AscendKernelRuntime : public KernelRuntime { | |||
| bool GenTask(const session::KernelGraph *graph) override; | |||
| bool RunTask(const session::KernelGraph *graph) override; | |||
| bool LoadTask(const session::KernelGraph *graph) override; | |||
| void FreeHostMemory() override; | |||
| protected: | |||
| DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, | |||
| TypeId type_id) override; | |||
| bool SyncStream() override; | |||
| void MallocOpMemory(const DeviceAddressPtr address, size_t size, int flag) override; | |||
| private: | |||
| bool InitDevice(); | |||
| @@ -53,8 +51,7 @@ class AscendKernelRuntime : public KernelRuntime { | |||
| bool HcclInit(); | |||
| bool NeedDestroyHccl(); | |||
| bool DestroyHccl(); | |||
| bool MallocDeviceMemory(); | |||
| void FreeDeviceMemory(); | |||
| void ClearGraphModelMap(); | |||
| void ReleaseDeviceRes() override; | |||
| uint32_t GetGraphModelId(const session::KernelGraph *kernel_graph); | |||
| @@ -0,0 +1,67 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "device/ascend/ascend_memory_manager.h" | |||
| #include "device/ascend/ascend_memory_pool.h" | |||
| #include "utils/context/ms_context.h" | |||
| #include "runtime/mem.h" | |||
| namespace mindspore { | |||
| namespace device { | |||
| namespace ascend { | |||
| const uint64_t kAscendDeviceMemGB = 20; | |||
| const uint64_t kAscendMemPoolGB = 5; | |||
| const uint64_t kAscendDeviceMemSize = (kAscendDeviceMemGB << 30); | |||
| const uint64_t kAscendMemPoolSize = (kAscendMemPoolGB << 30); | |||
| void AscendMemoryManager::MallocDeviceMemory() { | |||
| device_mem_size_ = kAscendDeviceMemSize; | |||
| static_mem_offset_ = device_mem_size_; | |||
| auto ret = rtMalloc(reinterpret_cast<void **>(&device_mem_base_), static_mem_offset_, RT_MEMORY_HBM); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << static_mem_offset_ << "] fail, ret[" << ret << "]"; | |||
| } | |||
| device_mem_pool_size_ = kAscendMemPoolSize; | |||
| ret = rtMalloc(reinterpret_cast<void **>(&device_mem_pool_base_), device_mem_pool_size_, RT_MEMORY_HBM); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << device_mem_pool_size_ << "] fail, ret[" << ret << "]"; | |||
| } | |||
| AscendMemoryPool::GetInstance().set_device_mem_pool_base(device_mem_pool_base_); | |||
| AscendMemoryPool::GetInstance().set_device_mem_pool_size(device_mem_pool_size_); | |||
| } | |||
| void AscendMemoryManager::FreeDeviceMemory() { | |||
| if (device_mem_base_ != nullptr) { | |||
| auto ret = rtFree(device_mem_base_); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(ERROR) << "rtFree mem size[" << device_mem_size_ << "] fail, ret[" << ret << "]"; | |||
| } | |||
| device_mem_base_ = nullptr; | |||
| } | |||
| if (device_mem_pool_base_ != nullptr) { | |||
| auto ret = rtFree(device_mem_pool_base_); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(ERROR) << "rtFree mem size[" << device_mem_pool_size_ << "] fail, ret[" << ret << "]"; | |||
| } | |||
| device_mem_pool_base_ = nullptr; | |||
| } | |||
| } | |||
| void *AscendMemoryManager::MallocMemFromMemPool(size_t size) { | |||
| return AscendMemoryPool::GetInstance().AllocTensorMem(size); | |||
| } | |||
| } // namespace ascend | |||
| } // namespace device | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,39 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_ASCEND_MEMORY_MANAGER_H_ | |||
| #define MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_ASCEND_MEMORY_MANAGER_H_ | |||
| #include "device/memory_manager.h" | |||
| namespace mindspore { | |||
| namespace device { | |||
| namespace ascend { | |||
| class AscendMemoryManager : public MemoryManager { | |||
| public: | |||
| AscendMemoryManager() = default; | |||
| virtual ~AscendMemoryManager() = default; | |||
| void MallocDeviceMemory() override; | |||
| void FreeDeviceMemory() override; | |||
| void *MallocMemFromMemPool(size_t size) override; | |||
| private: | |||
| uint8_t *device_mem_pool_base_{nullptr}; | |||
| uint64_t device_mem_pool_size_{0}; | |||
| }; | |||
| } // namespace ascend | |||
| } // namespace device | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_ASCEND_MEMORY_MANAGER_H_ | |||
| @@ -14,24 +14,15 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include "device/ascend/ascend_memory_allocator.h" | |||
| #include "device/ascend/ascend_memory_pool.h" | |||
| #include "device/ascend/ascend_kernel_runtime.h" | |||
| #include "utils/log_adapter.h" | |||
| namespace mindspore { | |||
| namespace device { | |||
| namespace ascend { | |||
| const uint64_t MEM_SIZE = 20; | |||
| const uint64_t MEM_SIZE_BYTE = (MEM_SIZE << 30); | |||
| AscendMemoryAllocator::AscendMemoryAllocator() { | |||
| hasMalloc_ = false; | |||
| free_mem_size_ = FloatToSize(MEM_SIZE_BYTE * (1 - GRAPH_INIT_ASCEND_MEM_RATIO)); | |||
| total_mem_size_ = free_mem_size_; | |||
| } | |||
| size_t AscendMemoryAllocator::AllocDeviceMem(size_t size, DeviceMemPtr* addr) { | |||
| if (hasMalloc_) { | |||
| size_t AscendMemoryPool::AllocDeviceMem(size_t size, DeviceMemPtr* addr) { | |||
| if (has_malloc_) { | |||
| MS_LOG(EXCEPTION) << "Has alloc memory pool memory !"; | |||
| } | |||
| if (size == 0 || size > free_mem_size_) { | |||
| @@ -41,35 +32,35 @@ size_t AscendMemoryAllocator::AllocDeviceMem(size_t size, DeviceMemPtr* addr) { | |||
| if (*addr == nullptr) { | |||
| MS_LOG(EXCEPTION) << "Device memory pool base is nullptr, failed to alloc memory pool memory!"; | |||
| } | |||
| hasMalloc_ = true; | |||
| has_malloc_ = true; | |||
| free_mem_size_ -= size; | |||
| return size; | |||
| } | |||
| bool AscendMemoryAllocator::FreeDeviceMem(const DeviceMemPtr& addr) { | |||
| bool AscendMemoryPool::FreeDeviceMem(const DeviceMemPtr& addr) { | |||
| MS_EXCEPTION_IF_NULL(addr); | |||
| hasMalloc_ = false; | |||
| has_malloc_ = false; | |||
| free_mem_size_ = total_mem_size_; | |||
| return true; | |||
| } | |||
| size_t AscendMemoryAllocator::AlignMemorySize(size_t size) const { | |||
| size_t AscendMemoryPool::AlignMemorySize(size_t size) const { | |||
| if (size == 0) { | |||
| return DYNAMIC_MEM_ALIGN_SIZE; | |||
| } | |||
| return ((size + DYNAMIC_MEM_ALIGN_SIZE + 31) / DYNAMIC_MEM_ALIGN_SIZE) * DYNAMIC_MEM_ALIGN_SIZE; | |||
| } | |||
| size_t AscendMemoryAllocator::mem_alloc_unit_size() const { return free_mem_size_ - 512; } | |||
| size_t AscendMemoryPool::mem_alloc_unit_size() const { return free_mem_size_ - 512; } | |||
| void AscendMemoryAllocator::set_device_mem_pool_base(uint8_t* device_mem_pool_base) { | |||
| void AscendMemoryPool::set_device_mem_pool_base(uint8_t* device_mem_pool_base) { | |||
| MS_EXCEPTION_IF_NULL(device_mem_pool_base); | |||
| device_mem_pool_base_ = device_mem_pool_base; | |||
| } | |||
| size_t AscendMemoryAllocator::free_mem_size() { return free_mem_size_; } | |||
| size_t AscendMemoryPool::free_mem_size() { return free_mem_size_; } | |||
| size_t AscendMemoryAllocator::total_mem_size() { return total_mem_size_; } | |||
| size_t AscendMemoryPool::total_mem_size() { return total_mem_size_; } | |||
| } // namespace ascend | |||
| } // namespace device | |||
| } // namespace mindspore | |||
| @@ -14,8 +14,8 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_DEVICE_ASCEND_ASCEND_MEMORY_ALLOCATOR_H_ | |||
| #define MINDSPORE_CCSRC_DEVICE_ASCEND_ASCEND_MEMORY_ALLOCATOR_H_ | |||
| #ifndef MINDSPORE_CCSRC_DEVICE_ASCEND_ASCEND_MEMORY_POOL_H_ | |||
| #define MINDSPORE_CCSRC_DEVICE_ASCEND_ASCEND_MEMORY_POOL_H_ | |||
| #include <memory> | |||
| #include "pre_activate/mem_reuse/mem_dynamic_allocator.h" | |||
| @@ -23,22 +23,23 @@ | |||
| namespace mindspore { | |||
| namespace device { | |||
| namespace ascend { | |||
| // The fraction of total ascend memory used to compute the graph. | |||
| static const float GRAPH_INIT_ASCEND_MEM_RATIO = 0.8; | |||
| class AscendMemoryAllocator : public DynamicMemPoolBestFit { | |||
| class AscendMemoryPool : public DynamicMemPoolBestFit { | |||
| public: | |||
| ~AscendMemoryAllocator() override = default; | |||
| ~AscendMemoryPool() override = default; | |||
| size_t AllocDeviceMem(size_t size, DeviceMemPtr* addr) override; | |||
| bool FreeDeviceMem(const DeviceMemPtr& addr) override; | |||
| void set_device_mem_pool_base(uint8_t* device_mem_pool_base); | |||
| void set_device_mem_pool_size(uint64_t device_mem_pool_size) { device_mem_pool_size_ = device_mem_pool_size; } | |||
| void set_device_mem_pool_size(uint64_t device_mem_pool_size) { | |||
| device_mem_pool_size_ = device_mem_pool_size; | |||
| free_mem_size_ = device_mem_pool_size_; | |||
| total_mem_size_ = free_mem_size_; | |||
| } | |||
| size_t free_mem_size() override; | |||
| size_t total_mem_size() override; | |||
| static AscendMemoryAllocator& GetInstance() { | |||
| static AscendMemoryAllocator instance; | |||
| static AscendMemoryPool& GetInstance() { | |||
| static AscendMemoryPool instance; | |||
| return instance; | |||
| } | |||
| @@ -49,10 +50,10 @@ class AscendMemoryAllocator : public DynamicMemPoolBestFit { | |||
| size_t mem_alloc_unit_size() const override; | |||
| private: | |||
| AscendMemoryAllocator(); | |||
| AscendMemoryAllocator(const AscendMemoryAllocator&) = delete; | |||
| AscendMemoryAllocator& operator=(const AscendMemoryAllocator&) = delete; | |||
| bool hasMalloc_; | |||
| AscendMemoryPool() = default; | |||
| AscendMemoryPool(const AscendMemoryPool&) = delete; | |||
| AscendMemoryPool& operator=(const AscendMemoryPool&) = delete; | |||
| bool has_malloc_{false}; | |||
| uint8_t* device_mem_pool_base_{nullptr}; | |||
| uint64_t device_mem_pool_size_{0}; | |||
| size_t free_mem_size_; | |||
| @@ -62,4 +63,4 @@ class AscendMemoryAllocator : public DynamicMemPoolBestFit { | |||
| } // namespace device | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_DEVICE_ASCEND_ASCEND_MEMORY_ALLOCATOR_H_ | |||
| #endif // MINDSPORE_CCSRC_DEVICE_ASCEND_ASCEND_MEMORY_POOL_H_ | |||
| @@ -702,7 +702,7 @@ void AscendStreamAssign::PrintGraphExeOrders(const shared_ptr<mindspore::session | |||
| << AnfAlgo::GetStreamId(cur_cnode_ptr) << "], event_id[" | |||
| << GetValue<uint32_t>(primitive->GetAttr(kAttrEventId)) << "]"; | |||
| } else { | |||
| MS_LOG(INFO) << "node name[" << AnfAlgo::GetCNodeName(cur_cnode_ptr) << "], logic id[" | |||
| MS_LOG(INFO) << "node name[" << cur_cnode_ptr->fullname_with_scope() << "], logic id[" | |||
| << AnfAlgo::GetStreamDistinctionLabel(cur_cnode_ptr.get()) << "], stream id[" | |||
| << AnfAlgo::GetStreamId(cur_cnode_ptr) << "]"; | |||
| } | |||
| @@ -29,10 +29,6 @@ namespace ascend { | |||
| // PROFILING_CUSTOM_LOGID_START 3 | |||
| const uint64_t kProfilingFpStartLogId = 1; | |||
| const uint64_t kProfilingBpEndLogId = 2; | |||
| const uint64_t kProfilingAllReduce1Start = 3; | |||
| const uint64_t kProfilingAllReduce1End = 4; | |||
| const uint64_t kProfilingAllReduce2Start = 5; | |||
| const uint64_t kProfilingAllReduce2End = 6; | |||
| const uint64_t kProfilingIterEndLogId = 255; | |||
| class ProfilingEngineImpl; | |||
| @@ -14,10 +14,8 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include "device/ascend/profiling/profiling_utils.h" | |||
| #include <map> | |||
| #include "device/ascend/profiling/profiling_utils.h" | |||
| #include "kernel/kernel.h" | |||
| #include "device/ascend/profiling/profiling_manager.h" | |||
| #include "session/anf_runtime_algorithm.h" | |||
| @@ -27,82 +25,61 @@ | |||
| namespace mindspore { | |||
| namespace device { | |||
| namespace ascend { | |||
| const char ProfilingUtils::kProfiling[] = "Profiling"; | |||
| const char ProfilingUtils::kNotify[] = "notify"; | |||
| const char ProfilingUtils::kProfilerTraceId[] = "profiler_trace_id"; | |||
| const char ProfilingUtils::kFlags[] = "flags"; | |||
| constexpr uint32_t kMaxProfilingNodeNum = 100; | |||
| constexpr char kCustomNode[] = "PROFILING_CUSTOM_"; | |||
| constexpr char kFpStartNode[] = "PROFILING_FP_START"; | |||
| constexpr char kBpEndNode[] = "PROFILING_BP_END"; | |||
| constexpr char kIterEndNode[] = "PROFILING_ITER_END"; | |||
| std::unordered_map<uint32_t, std::vector<std::string>> ProfilingUtils::graph_kernel_name_; | |||
| bool ProfilingUtils::GetProfilingTraceInfo(const std::shared_ptr<session::KernelGraph> &graph_ptr, | |||
| ProfilingTraceInfo *profiling_trace_info) { | |||
| MS_EXCEPTION_IF_NULL(profiling_trace_info); | |||
| MS_EXCEPTION_IF_NULL(graph_ptr); | |||
| bool find_begin = false; | |||
| bool first_allreduce = true; | |||
| for (const auto &anf_node : graph_ptr->execution_order()) { | |||
| if (anf_node->isa<CNode>()) { | |||
| const std::string kernel_name = AnfAlgo::GetCNodeName(anf_node); | |||
| if ((kernel_name == "Cast" || kernel_name == "Four2Five") && !find_begin) { | |||
| profiling_trace_info->profiling_trace_begin = anf_node->fullname_with_scope(); | |||
| find_begin = true; | |||
| } | |||
| if (kernel_name == "Conv2DBackpropFilter") { | |||
| profiling_trace_info->profiling_trace_bp_end = anf_node->fullname_with_scope(); | |||
| } | |||
| if (kernel_name == kFusedMulApplyMomentumOpName || kernel_name == kApplyMomentumOpName) { | |||
| profiling_trace_info->profiling_trace_netoutput = anf_node->fullname_with_scope(); | |||
| } | |||
| if (kernel_name == kAllReduceOpName) { | |||
| if (first_allreduce) { | |||
| profiling_trace_info->profiling_allreduce1_start = anf_node->fullname_with_scope(); | |||
| profiling_trace_info->profiling_allreduce1_end = anf_node->fullname_with_scope(); | |||
| first_allreduce = false; | |||
| } else { | |||
| profiling_trace_info->profiling_allreduce2_start = anf_node->fullname_with_scope(); | |||
| profiling_trace_info->profiling_allreduce2_end = anf_node->fullname_with_scope(); | |||
| } | |||
| } | |||
| uint32_t ProfilingUtils::custom_node_index_ = 1; | |||
| ProfilingTraceInfo ProfilingUtils::GetProfilingTraceFromEnv(NotNull<session::KernelGraph *> graph_ptr) { | |||
| MS_LOG(INFO) << "get env start"; | |||
| custom_node_index_ = 1; | |||
| auto &cnode_exec_order = graph_ptr->execution_order(); | |||
| ProfilingTraceInfo profiling_trace; | |||
| profiling_trace.trace_begin = GetTraceBegin(cnode_exec_order); | |||
| profiling_trace.trace_bp_end = GetTraceBpEnd(); | |||
| profiling_trace.trace_netoutput = GetTraceNetoutput(cnode_exec_order); | |||
| MS_LOG(INFO) << "[profiling] trace_begin:" << profiling_trace.trace_begin | |||
| << " trace_bp_end:" << profiling_trace.trace_bp_end | |||
| << " trace_netoutput:" << profiling_trace.trace_netoutput; | |||
| for (uint32_t i = 1; i <= kMaxProfilingNodeNum; ++i) { | |||
| std::string env_str = std::string(kCustomNode) + std::to_string(i); | |||
| const char *node_full_name = std::getenv(env_str.c_str()); | |||
| if (node_full_name == nullptr) { | |||
| break; | |||
| } | |||
| MS_LOG(INFO) << "Get profiling node:" << node_full_name; | |||
| profiling_trace.trace_custom_node.insert(node_full_name); | |||
| } | |||
| MS_LOG(INFO) << "[profiling]begin:" << profiling_trace_info->profiling_trace_begin | |||
| << ", net_output:" << profiling_trace_info->profiling_trace_netoutput | |||
| << ", end:" << profiling_trace_info->profiling_trace_bp_end | |||
| << ", allreduce1:" << profiling_trace_info->profiling_allreduce1_start | |||
| << ", allreduce2:" << profiling_trace_info->profiling_allreduce2_start; | |||
| return profiling_trace_info->IsValid(); | |||
| MS_LOG(INFO) << "get env end"; | |||
| return profiling_trace; | |||
| } | |||
| bool ProfilingUtils::GetNetOutput(AnfNodePtr anf_node, std::string *profiling_trace_net_output) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| MS_EXCEPTION_IF_NULL(profiling_trace_net_output); | |||
| MS_LOG(INFO) << "[profiling]Anf node's full name with scope:" << anf_node->fullname_with_scope(); | |||
| if (!profiling_trace_net_output->empty()) { | |||
| MS_LOG(INFO) << "[profiling]Has got the net_output:" << profiling_trace_net_output->c_str(); | |||
| return true; | |||
| } | |||
| if (AnfAlgo::IsRealKernel(anf_node)) { | |||
| *profiling_trace_net_output = anf_node->fullname_with_scope(); | |||
| return true; | |||
| } | |||
| std::string ProfilingUtils::GetTraceBegin(const std::vector<CNodePtr> &cnode_exec_order) { | |||
| const char *trace_begin = std::getenv(kFpStartNode); | |||
| auto &first_cnode = cnode_exec_order.front(); | |||
| MS_EXCEPTION_IF_NULL(first_cnode); | |||
| return trace_begin == nullptr ? first_cnode->fullname_with_scope() : std::string(trace_begin); | |||
| } | |||
| auto cnode = anf_node->cast<CNodePtr>(); | |||
| if (cnode == nullptr) { | |||
| MS_LOG(ERROR) << "[profiling]Anf node should be a CNode"; | |||
| return false; | |||
| } | |||
| std::string ProfilingUtils::GetTraceBpEnd() { | |||
| const char *trace_bp_end = std::getenv(kBpEndNode); | |||
| return trace_bp_end == nullptr ? "" : std::string(trace_bp_end); | |||
| } | |||
| auto inputs = cnode->inputs(); | |||
| auto input_size = inputs.size(); | |||
| if (input_size < 2) { | |||
| MS_LOG(ERROR) << "[profiling]Anf node' input size(" << input_size << ") < 2, don't support get apply kernel node."; | |||
| return false; | |||
| } | |||
| return GetNetOutput(inputs[1], profiling_trace_net_output); | |||
| std::string ProfilingUtils::GetTraceNetoutput(const std::vector<CNodePtr> &cnode_exec_order) { | |||
| const char *trace_netoutput = std::getenv(kIterEndNode); | |||
| auto &last_cnode = cnode_exec_order.back(); | |||
| MS_EXCEPTION_IF_NULL(last_cnode); | |||
| return trace_netoutput == nullptr ? last_cnode->fullname_with_scope() : std::string(trace_netoutput); | |||
| } | |||
| CNodePtr ProfilingUtils::CreateProfilingCNode(const std::shared_ptr<session::KernelGraph> &graph_ptr, bool notify, | |||
| uint64_t profiler_trace_id, uint32_t flags) { | |||
| MS_EXCEPTION_IF_NULL(graph_ptr); | |||
| NotNull<CNodePtr> ProfilingUtils::CreateProfilingCNode(const ProfilingContent &profiling_content, | |||
| NotNull<session::KernelGraph *> graph_ptr) { | |||
| kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder; | |||
| selected_kernel_builder.SetInputsFormat({kOpFormat_DEFAULT, kOpFormat_DEFAULT}); | |||
| selected_kernel_builder.SetInputsDeviceType({TypeId::kNumberTypeInt32, TypeId::kNumberTypeInt32}); | |||
| @@ -118,75 +95,79 @@ CNodePtr ProfilingUtils::CreateProfilingCNode(const std::shared_ptr<session::Ker | |||
| AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), cnode_ptr.get()); | |||
| cnode_ptr->set_abstract(type_none_abstract); | |||
| // set attr | |||
| ValuePtr notify_value = MakeValue(notify); | |||
| ValuePtr trace_id_value = MakeValue(profiler_trace_id); | |||
| ValuePtr flags_value = MakeValue(flags); | |||
| ValuePtr notify_value = MakeValue(profiling_content.notify); | |||
| ValuePtr trace_id_value = MakeValue(profiling_content.profiler_trace_id); | |||
| ValuePtr flags_value = MakeValue(profiling_content.flags); | |||
| AnfAlgo::SetNodeAttr(ProfilingUtils::kNotify, notify_value, cnode_ptr); | |||
| AnfAlgo::SetNodeAttr(ProfilingUtils::kProfilerTraceId, trace_id_value, cnode_ptr); | |||
| AnfAlgo::SetNodeAttr(ProfilingUtils::kFlags, flags_value, cnode_ptr); | |||
| return cnode_ptr; | |||
| return NOT_NULL(cnode_ptr); | |||
| } | |||
| void ProfilingUtils::ProfilingTraceFpStart(const std::shared_ptr<mindspore::session::KernelGraph> &graph_ptr, | |||
| const mindspore::AnfNodePtr &anf_node, | |||
| const mindspore::device::ascend::ProfilingTraceInfo &profiling_trace_info, | |||
| std::vector<mindspore::CNodePtr> *kernel_list) { | |||
| if (profiling_trace_info.IsValid() && profiling_trace_info.profiling_trace_begin == anf_node->fullname_with_scope()) { | |||
| if (graph_ptr == nullptr || kernel_list == nullptr || anf_node == nullptr) { | |||
| MS_LOG(ERROR) << "[profiling]input param invalid"; | |||
| return; | |||
| } | |||
| void ProfilingUtils::ProfilingTraceFpStart(const mindspore::AnfNodePtr &anf_node, | |||
| const ProfilingTraceInfo &profiling_trace_info, | |||
| NotNull<session::KernelGraph *> graph_ptr, | |||
| NotNull<std::vector<mindspore::CNodePtr> *> kernel_list) { | |||
| if (profiling_trace_info.trace_begin == anf_node->fullname_with_scope()) { | |||
| auto job_id = ProfilingManager::GetInstance().GetJobId(); | |||
| // job task info | |||
| CNodePtr job_kernel_ptr = CreateProfilingCNode(graph_ptr, false, job_id, 0); | |||
| AnfAlgo::SetStreamDistinctionLabel(AnfAlgo::GetStreamDistinctionLabel(anf_node.get()), job_kernel_ptr.get()); | |||
| AnfAlgo::SetStreamId(AnfAlgo::GetStreamId(anf_node), job_kernel_ptr.get()); | |||
| // fp task info | |||
| CNodePtr start_kernel_ptr = CreateProfilingCNode(graph_ptr, false, kProfilingFpStartLogId, 0); | |||
| AnfAlgo::SetStreamDistinctionLabel(AnfAlgo::GetStreamDistinctionLabel(anf_node.get()), start_kernel_ptr.get()); | |||
| AnfAlgo::SetStreamId(AnfAlgo::GetStreamId(anf_node), start_kernel_ptr.get()); | |||
| kernel_list->emplace_back(job_kernel_ptr); | |||
| kernel_list->emplace_back(start_kernel_ptr); | |||
| ProfilingContent job_profiling_context = {false, job_id, 0}; | |||
| auto job_profiling_node = CreateProfilingCNodeWithStream(anf_node, job_profiling_context, graph_ptr); | |||
| kernel_list->emplace_back(job_profiling_node); | |||
| ProfilingContent fp_profiling_content = {false, kProfilingFpStartLogId, 0}; | |||
| auto fp_profiling_node = CreateProfilingCNodeWithStream(anf_node, fp_profiling_content, graph_ptr); | |||
| kernel_list->emplace_back(fp_profiling_node); | |||
| } | |||
| } | |||
| void ProfilingUtils::ProfilingAllReduce(const std::shared_ptr<session::KernelGraph> &graph_ptr, | |||
| const AnfNodePtr &anf_node, int job_id, const std::string &profiling_node_name, | |||
| std::vector<CNodePtr> *kernel_list) { | |||
| MS_EXCEPTION_IF_NULL(graph_ptr); | |||
| CNodePtr ProfilingUtils::CreateProfilingCNodeWithStream(const mindspore::AnfNodePtr &anf_node, | |||
| const ProfilingContent &profiling_content, | |||
| NotNull<session::KernelGraph *> graph_ptr) { | |||
| CNodePtr profiling_node = CreateProfilingCNode(profiling_content, graph_ptr); | |||
| AnfAlgo::SetStreamDistinctionLabel(AnfAlgo::GetStreamDistinctionLabel(anf_node.get()), profiling_node.get()); | |||
| AnfAlgo::SetStreamId(AnfAlgo::GetStreamId(anf_node), profiling_node.get()); | |||
| return profiling_node; | |||
| } | |||
| void ProfilingUtils::ProfilingCustomOp(const AnfNodePtr &anf_node, const ProfilingTraceInfo &profiling_trace_info, | |||
| NotNull<session::KernelGraph *> graph_ptr, | |||
| NotNull<std::vector<CNodePtr> *> kernel_list) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| MS_EXCEPTION_IF_NULL(kernel_list); | |||
| auto full_scope_name = anf_node->fullname_with_scope(); | |||
| if (profiling_node_name == full_scope_name) { | |||
| CNodePtr allreduce_kernel_ptr = CreateProfilingCNode(graph_ptr, false, job_id, 0); | |||
| AnfAlgo::SetStreamDistinctionLabel(AnfAlgo::GetStreamDistinctionLabel(anf_node.get()), allreduce_kernel_ptr.get()); | |||
| AnfAlgo::SetStreamId(AnfAlgo::GetStreamId(anf_node), allreduce_kernel_ptr.get()); | |||
| kernel_list->emplace_back(allreduce_kernel_ptr); | |||
| auto iter = profiling_trace_info.trace_custom_node.find(anf_node->fullname_with_scope()); | |||
| if (iter == profiling_trace_info.trace_custom_node.end()) { | |||
| return; | |||
| } | |||
| // custom op profiling job start from 3. | |||
| ProfilingContent front_profiling_content = {false, 2 * custom_node_index_ + 1, 0}; | |||
| CNodePtr front_node = CreateProfilingCNodeWithStream(anf_node, front_profiling_content, graph_ptr); | |||
| kernel_list->insert(kernel_list->end() - 1, front_node); | |||
| ProfilingContent back_profiling_content = {false, 2 * custom_node_index_ + 2, 0}; | |||
| CNodePtr back_node = CreateProfilingCNodeWithStream(anf_node, back_profiling_content, graph_ptr); | |||
| kernel_list->insert(kernel_list->end(), back_node); | |||
| ++custom_node_index_; | |||
| } | |||
| void ProfilingUtils::ProfilingTraceEnd(const std::shared_ptr<mindspore::session::KernelGraph> &graph_ptr, | |||
| const mindspore::AnfNodePtr &anf_node, | |||
| const mindspore::device::ascend::ProfilingTraceInfo &profiling_trace_info, | |||
| std::vector<mindspore::CNodePtr> *kernel_list) { | |||
| MS_EXCEPTION_IF_NULL(graph_ptr); | |||
| void ProfilingUtils::ProfilingTraceBpEnd(const AnfNodePtr &anf_node, const ProfilingTraceInfo &profiling_trace_info, | |||
| NotNull<session::KernelGraph *> graph_ptr, | |||
| NotNull<std::vector<CNodePtr> *> kernel_list) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| MS_EXCEPTION_IF_NULL(kernel_list); | |||
| if (profiling_trace_info.IsValid()) { | |||
| auto full_scope_name = anf_node->fullname_with_scope(); | |||
| if (profiling_trace_info.profiling_trace_netoutput == full_scope_name) { | |||
| CNodePtr bp_kernel_ptr = CreateProfilingCNode(graph_ptr, true, kProfilingIterEndLogId, 0); | |||
| AnfAlgo::SetStreamDistinctionLabel(AnfAlgo::GetStreamDistinctionLabel(anf_node.get()), bp_kernel_ptr.get()); | |||
| AnfAlgo::SetStreamId(AnfAlgo::GetStreamId(anf_node), bp_kernel_ptr.get()); | |||
| kernel_list->emplace_back(bp_kernel_ptr); | |||
| } | |||
| if (profiling_trace_info.trace_bp_end == anf_node->fullname_with_scope()) { | |||
| ProfilingContent bp_end_profiling_content = {false, kProfilingBpEndLogId, 0}; | |||
| CNodePtr bp_end_node = CreateProfilingCNodeWithStream(anf_node, bp_end_profiling_content, graph_ptr); | |||
| kernel_list->emplace_back(bp_end_node); | |||
| } | |||
| } | |||
| if (profiling_trace_info.profiling_trace_bp_end == full_scope_name) { | |||
| CNodePtr end_task_info = CreateProfilingCNode(graph_ptr, false, kProfilingBpEndLogId, 0); | |||
| AnfAlgo::SetStreamDistinctionLabel(AnfAlgo::GetStreamDistinctionLabel(anf_node.get()), end_task_info.get()); | |||
| AnfAlgo::SetStreamId(AnfAlgo::GetStreamId(anf_node), end_task_info.get()); | |||
| kernel_list->emplace_back(end_task_info); | |||
| } | |||
| void ProfilingUtils::ProfilingTraceEnd(const AnfNodePtr &anf_node, const ProfilingTraceInfo &profiling_trace_info, | |||
| NotNull<session::KernelGraph *> graph_ptr, | |||
| NotNull<std::vector<mindspore::CNodePtr> *> kernel_list) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| auto full_scope_name = anf_node->fullname_with_scope(); | |||
| if (profiling_trace_info.trace_netoutput == full_scope_name) { | |||
| ProfilingContent bp_end_profiling_content = {true, kProfilingIterEndLogId, 0}; | |||
| CNodePtr bp_kernel_ptr = CreateProfilingCNodeWithStream(anf_node, bp_end_profiling_content, graph_ptr); | |||
| kernel_list->emplace_back(bp_kernel_ptr); | |||
| } | |||
| } | |||
| @@ -19,63 +19,102 @@ | |||
| #include <memory> | |||
| #include <string> | |||
| #include <vector> | |||
| #include <set> | |||
| #include <unordered_map> | |||
| #include "session/kernel_graph.h" | |||
| #include "utils/contract.h" | |||
| namespace mindspore { | |||
| namespace device { | |||
| namespace ascend { | |||
| struct ProfilingTraceInfo { | |||
| // execute order's first execute op(like: Cast or Four2Five ...), except tdt op(GetNext ...) | |||
| std::string profiling_trace_begin; | |||
| std::string trace_begin; | |||
| // get first net_output(apply kernel) from graph outputs: fp ->net_output<- bp | |||
| std::string profiling_trace_bp_end; | |||
| std::string trace_bp_end; | |||
| // execute order's end execute (like: Conv2DBackpropFilter) | |||
| std::string profiling_trace_netoutput; | |||
| std::string trace_netoutput; | |||
| std::string profiling_allreduce1_start; | |||
| std::string profiling_allreduce1_end; | |||
| std::string profiling_allreduce2_start; | |||
| std::string profiling_allreduce2_end; | |||
| // profiling specific op, such as AllReduce; | |||
| std::set<std::string> trace_custom_node; | |||
| // 1. insert profiling_trace_begin if profiling_trace_bp_end is not empty. | |||
| // 2. op lanuch get task info with callback func. | |||
| // 3. insert profiling_trace_bp_end. | |||
| // 4. insert profiling_trace_net_output if profiling_trace_bp_end is not empty. | |||
| bool IsValid() const { return !(profiling_trace_begin.empty() || profiling_trace_bp_end.empty()); } | |||
| bool IsValid() const { return !(trace_begin.empty() || trace_bp_end.empty() || trace_netoutput.empty()); } | |||
| }; | |||
| struct ProfilingContent { | |||
| // true -send data from device to host and finish profiling | |||
| bool notify; | |||
| uint64_t profiler_trace_id; | |||
| uint32_t flags; | |||
| }; | |||
| class ProfilingUtils { | |||
| public: | |||
| ProfilingUtils() = default; | |||
| ~ProfilingUtils() = default; | |||
| static bool GetProfilingTraceInfo(const std::shared_ptr<session::KernelGraph> &graph_ptr, | |||
| ProfilingTraceInfo *profiling_trace_info); | |||
| static void ProfilingTraceFpStart(const std::shared_ptr<session::KernelGraph> &graph_ptr, const AnfNodePtr &anf_node, | |||
| const ProfilingTraceInfo &profiling_trace_info, std::vector<CNodePtr> *kernel_list); | |||
| static void ProfilingAllReduce(const std::shared_ptr<session::KernelGraph> &graph_ptr, const AnfNodePtr &anf_node, | |||
| int job_id, const std::string &profiling_node_name, | |||
| std::vector<CNodePtr> *kernel_list); | |||
| static void ProfilingTraceEnd(const std::shared_ptr<session::KernelGraph> &graph_ptr, const AnfNodePtr &anf_node, | |||
| const ProfilingTraceInfo &profiling_trace_info, std::vector<CNodePtr> *kernel_list); | |||
| // Insert job_id profiling node and fp_start profiling node. | |||
| // Job_id is got from envs, which shound be a number greater than 255 | |||
| // Fp_start node should been inserted in the start of a network, and the log_id is hard code to 1. | |||
| static void ProfilingTraceFpStart(const AnfNodePtr &anf_node, const ProfilingTraceInfo &profiling_trace_info, | |||
| NotNull<session::KernelGraph *> graph_ptr, | |||
| NotNull<std::vector<CNodePtr> *> kernel_list); | |||
| // Insert net output profiling node, which tells the device to stop profiling. | |||
| // The notify in struct ProfilingContent should be 'true', which tells the device to send data to host. | |||
| static void ProfilingTraceEnd(const AnfNodePtr &anf_node, const ProfilingTraceInfo &profiling_trace_info, | |||
| NotNull<session::KernelGraph *> graph_ptr, | |||
| NotNull<std::vector<CNodePtr> *> kernel_list); | |||
| // Insert bp_end profiling node, which should been inserted after the last backpropagation CNode in the network. | |||
| static void ProfilingTraceBpEnd(const mindspore::AnfNodePtr &anf_node, const ProfilingTraceInfo &profiling_trace_info, | |||
| NotNull<session::KernelGraph *> graph_ptr, | |||
| NotNull<std::vector<mindspore::CNodePtr> *> kernel_list); | |||
| // Mapping graph id and the kernels' name in the graph | |||
| static void SetGraphKernelName(uint32_t graph_id, const std::vector<std::string> &kernel_names); | |||
| // Mapping task_id and kernel name for device to generate the time cost of specific kernel. | |||
| // Device calculate the time cost of the task which is marked by task id. | |||
| // But we need data of (kernel name , time cost) | |||
| static void ReportProfilingData(uint32_t graph_id, const std::vector<uint32_t> &task_ids); | |||
| static const char kProfiling[]; | |||
| static const char kNotify[]; | |||
| static const char kProfilerTraceId[]; | |||
| static const char kFlags[]; | |||
| // Get profiling trace point from envs. | |||
| // export PROFILING_FP_START='full name of the first cnode to execute' | |||
| // export PROFILING_BP_END='full name of the last backpropagation cnode to execute' | |||
| // export PROFILING_ITER_END='full name of last cnode in graph to execute' | |||
| // And other cnode, like AllReduce, export PROFILING_CUSTOM_1='full name of AllReduce cnode' | |||
| // GetNext, export PROFIFLING_CUSTOM_2='full name fo GetNext cnode' | |||
| // The variable i in PROFILING_CUSTOM_i should start from 1 without interruption. | |||
| static ProfilingTraceInfo GetProfilingTraceFromEnv(NotNull<session::KernelGraph *> graph_ptr); | |||
| // Insert two profiling trace points, one in front and one behind | |||
| static void ProfilingCustomOp(const mindspore::AnfNodePtr &anf_node, const ProfilingTraceInfo &profiling_trace_info, | |||
| NotNull<session::KernelGraph *> graph_ptr, | |||
| NotNull<std::vector<mindspore::CNodePtr> *> kernel_list); | |||
| inline static constexpr char kProfiling[] = "Profiling"; | |||
| inline static constexpr char kNotify[] = "notify"; | |||
| inline static constexpr char kProfilerTraceId[] = "profiler_trace_id"; | |||
| inline static constexpr char kFlags[] = "flags"; | |||
| private: | |||
| static bool GetNetOutput(AnfNodePtr anf_node, std::string *profiling_trace_net_output); | |||
| static CNodePtr CreateProfilingCNode(const std::shared_ptr<session::KernelGraph> &graph_ptr, bool notify, | |||
| uint64_t profiler_trace_id, uint32_t flags); | |||
| static NotNull<CNodePtr> CreateProfilingCNode(const ProfilingContent &profiling_content, | |||
| NotNull<session::KernelGraph *> graph_ptr); | |||
| static CNodePtr CreateProfilingCNodeWithStream(const AnfNodePtr &anf_node, const ProfilingContent &profiling_content, | |||
| NotNull<session::KernelGraph *> graph_ptr); | |||
| static std::string GetTraceBegin(const std::vector<CNodePtr> &cnode_exec_order); | |||
| static std::string GetTraceBpEnd(); | |||
| static std::string GetTraceNetoutput(const std::vector<CNodePtr> &cnode_exec_order); | |||
| // graph id --> (kernel name list) | |||
| static std::unordered_map<uint32_t, std::vector<std::string>> graph_kernel_name_; | |||
| static uint32_t custom_node_index_; | |||
| }; | |||
| } // namespace ascend | |||
| } // namespace device | |||
| @@ -33,12 +33,14 @@ class CPUKernelRuntime; | |||
| } // namespace cpu | |||
| namespace ascend { | |||
| class AscendKernelRuntime; | |||
| class AscendMemoryManager; | |||
| namespace tasksink { | |||
| class TaskGenerator; | |||
| } // namespace tasksink | |||
| } // namespace ascend | |||
| namespace gpu { | |||
| class GPUKernelRuntime; | |||
| class GPUMemoryManager; | |||
| } // namespace gpu | |||
| } // namespace device | |||
| } // namespace mindspore | |||
| @@ -68,14 +70,17 @@ class DeviceAddress { | |||
| size_t ref_count_{0}; | |||
| string format_{"DefaultFormat"}; | |||
| TypeId type_id_{kNumberTypeFloat16}; | |||
| bool mem_dynamic_alloc_{false}; | |||
| bool from_mem_pool_{false}; | |||
| friend class KernelRuntime; | |||
| friend class MemoryManager; | |||
| friend class mindspore::device::ascend::tasksink::TaskGenerator; | |||
| friend class mindspore::device::cpu::CPUSimpleMemPlan; | |||
| friend class mindspore::device::cpu::CPUResourceManager; | |||
| friend class mindspore::device::cpu::CPUKernelRuntime; | |||
| friend class mindspore::device::gpu::GPUKernelRuntime; | |||
| friend class mindspore::device::gpu::GPUMemoryManager; | |||
| friend class mindspore::device::ascend::AscendKernelRuntime; | |||
| friend class mindspore::device::ascend::AscendMemoryManager; | |||
| }; | |||
| using DeviceAddressPtr = std::shared_ptr<DeviceAddress>; | |||
| @@ -17,7 +17,6 @@ | |||
| #include "device/gpu/blocking_queue.h" | |||
| #include <chrono> | |||
| #include "device/gpu/gpu_common.h" | |||
| #include "dataset/util/make_unique.h" | |||
| #include "common/utils.h" | |||
| namespace mindspore { | |||
| @@ -32,7 +31,7 @@ GpuQueue::GpuQueue(void *addr, size_t feature_size, size_t label_size, size_t ca | |||
| stream_(0), | |||
| node_info_(nullptr) { | |||
| CHECK_CUDA_RET_WITH_ERROR(cudaStreamCreate(&stream_), "Cuda Create Stream Failed"); | |||
| node_info_ = mindspore::make_unique<NodeInfo[]>(capacity); | |||
| node_info_ = std::make_unique<NodeInfo[]>(capacity); | |||
| } | |||
| GpuQueue::~GpuQueue() { buffer_ = nullptr; } | |||
| @@ -46,7 +46,7 @@ GPUDeviceAddress::~GPUDeviceAddress() { | |||
| } | |||
| auto ms_context = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(ms_context); | |||
| if (mem_dynamic_alloc_) { | |||
| if (from_mem_pool_) { | |||
| GPUMemoryAllocator::GetInstance().FreeTensorMem(ptr_); | |||
| ptr_ = nullptr; | |||
| } | |||
| @@ -26,6 +26,7 @@ | |||
| #include "device/kernel_runtime_manager.h" | |||
| #include "device/gpu/gpu_common.h" | |||
| #include "common/utils.h" | |||
| #include "device/gpu/gpu_memory_manager.h" | |||
| namespace mindspore { | |||
| namespace device { | |||
| @@ -36,26 +37,14 @@ bool GPUKernelRuntime::Init() { | |||
| if (device_init_ == true) { | |||
| return true; | |||
| } | |||
| auto ret = InitDevice(); | |||
| if (!ret) { | |||
| MS_LOG(ERROR) << "InitDevice error."; | |||
| return ret; | |||
| } | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| // If use the dynamic memory pool, then alloc the first memory block to init. | |||
| if (context_ptr->enable_dynamic_mem_pool()) { | |||
| auto device_addr = AllocTensorMemDynamic(1); | |||
| if (!device_addr) { | |||
| MS_LOG(ERROR) << "Dynamic memory pool init error."; | |||
| return false; | |||
| } | |||
| } else { | |||
| MallocDeviceMemory(); | |||
| } | |||
| mem_manager_ = std::make_shared<GPUMemoryManager>(); | |||
| MS_EXCEPTION_IF_NULL(mem_manager_); | |||
| mem_manager_->MallocDeviceMemory(); | |||
| const void *collective_handle_ = CollectiveInitializer::instance().collective_handle(); | |||
| bool collective_inited = CollectiveInitializer::instance().collective_inited(); | |||
| if (collective_inited && collective_handle_ != nullptr) { | |||
| @@ -101,16 +90,6 @@ bool GPUKernelRuntime::InitDevice() { | |||
| return true; | |||
| } | |||
| void GPUKernelRuntime::MallocDeviceMemory() { | |||
| // Need to reserve 20% space for dynamic memory | |||
| const float init_gpu_mem_ratio = 0.8; | |||
| size_t mem_size = FloatToSize(GPUMemoryAllocator::GetInstance().free_mem_size() * init_gpu_mem_ratio); | |||
| auto alloc_size = | |||
| GPUMemoryAllocator::GetInstance().AllocDeviceMem(mem_size, reinterpret_cast<void **>(&device_mem_base_)); | |||
| device_mem_size_ = alloc_size; | |||
| static_mem_offset_ = device_mem_size_; | |||
| } | |||
| void GPUKernelRuntime::ReleaseDeviceRes() { | |||
| // For dataset mode. | |||
| if (GpuBufferMgr::GetInstance().IsInit()) { | |||
| @@ -122,39 +101,22 @@ void GPUKernelRuntime::ReleaseDeviceRes() { | |||
| CHECK_OP_RET_WITH_EXCEPT(GpuBufferMgr::GetInstance().Destroy(), "Could not destroy gpu data queue."); | |||
| } | |||
| GPUDeviceManager::GetInstance().ReleaseDevice(); | |||
| if (device_mem_base_ != nullptr) { | |||
| if (!GPUMemoryAllocator::GetInstance().FreeDeviceMem(device_mem_base_)) { | |||
| MS_LOG(EXCEPTION) << "Could not free gpu device memory."; | |||
| } | |||
| } | |||
| GPUMemoryAllocator::GetInstance().ReleaseDeviceRes(); | |||
| } | |||
| void GPUKernelRuntime::FreeHostMemory() { dynamic_mem_offset_ = 0; } | |||
| void *GPUKernelRuntime::AllocTensorMemDynamic(size_t size) { | |||
| return GPUMemoryAllocator::GetInstance().AllocTensorMem(size); | |||
| } | |||
| void GPUKernelRuntime::FreeTensorMemDynamic(void *device_ptr) { | |||
| GPUMemoryAllocator::GetInstance().FreeTensorMem(device_ptr); | |||
| MS_EXCEPTION_IF_NULL(mem_manager_); | |||
| mem_manager_->FreeDeviceMemory(); | |||
| } | |||
| void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) { | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| MS_EXCEPTION_IF_NULL(mem_manager_); | |||
| mem_manager_->ResetDynamicMemory(); | |||
| AssignStaticMemory(graph); | |||
| bool is_enable_mem_reuse = context_ptr->enable_mem_reuse(); | |||
| bool is_enable_dynamic_mem = context_ptr->enable_dynamic_mem_pool(); | |||
| if (is_enable_dynamic_mem) { | |||
| // Use the dynamic memory pool. | |||
| InitKernelRefCount(graph); | |||
| InitKernelOutputAddress(graph); | |||
| } else if (is_enable_mem_reuse) { | |||
| // Use the memory reuse. | |||
| ReuseAssignDynamicMemory(graph); | |||
| } else { | |||
| // Normal way. | |||
| AssignDynamicMemory(graph); | |||
| } | |||
| } | |||
| @@ -179,32 +141,6 @@ bool GPUKernelRuntime::Run(session::KernelGraph *graph) { | |||
| return ret; | |||
| } | |||
| uint8_t *GPUKernelRuntime::MallocStaticMem(size_t size, bool) { | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| if (context_ptr->enable_dynamic_mem_pool()) { | |||
| auto device_ptr = AllocTensorMemDynamic(size); | |||
| MS_EXCEPTION_IF_NULL(device_ptr); | |||
| return AddressOffset(device_ptr, 0); | |||
| } | |||
| auto align_size = GetCommonAlignSize(size); | |||
| if (static_mem_offset_ < align_size) { | |||
| MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_ | |||
| << "] static[" << total_static_size_ << "])" | |||
| << " malloc [" << align_size << "] failed!"; | |||
| } | |||
| auto offset = static_mem_offset_ - align_size; | |||
| if (dynamic_mem_offset_ > offset) { | |||
| MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_ | |||
| << "] static[" << total_static_size_ << "])" | |||
| << " malloc [" << align_size << "] failed!"; | |||
| } | |||
| total_static_size_ += align_size; | |||
| static_mem_offset_ = offset; | |||
| return device_mem_base_ + offset; | |||
| } | |||
| void GPUKernelRuntime::InitKernelRefCount(const session::KernelGraph *graph) { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared<memreuse::MemReuseUtil>(); | |||
| @@ -273,6 +209,7 @@ void GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod | |||
| MS_EXCEPTION_IF_NULL(kernel_inputs); | |||
| MS_EXCEPTION_IF_NULL(kernel_workspaces); | |||
| MS_EXCEPTION_IF_NULL(kernel_outputs); | |||
| MS_EXCEPTION_IF_NULL(mem_manager_); | |||
| for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) { | |||
| auto device_address = AnfAlgo::GetPrevNodeOutputAddr(kernel, i); | |||
| MS_EXCEPTION_IF_NULL(device_address); | |||
| @@ -290,7 +227,7 @@ void GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod | |||
| MS_EXCEPTION_IF_NULL(device_address); | |||
| auto device_ptr = device_address->ptr_; | |||
| if (device_ptr == nullptr) { | |||
| device_ptr = AllocTensorMemDynamic(output_sizes[i]); | |||
| device_ptr = mem_manager_->MallocMemFromMemPool(output_sizes[i]); | |||
| MS_EXCEPTION_IF_NULL(device_ptr); | |||
| device_address->ptr_ = device_ptr; | |||
| } | |||
| @@ -307,7 +244,7 @@ void GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod | |||
| kernel_workspaces->emplace_back(nullptr); | |||
| continue; | |||
| } | |||
| auto device_ptr = AllocTensorMemDynamic(workspace_sizes[i]); | |||
| auto device_ptr = mem_manager_->MallocMemFromMemPool(workspace_sizes[i]); | |||
| MS_EXCEPTION_IF_NULL(device_ptr); | |||
| kernel::AddressPtr workspace = std::make_shared<kernel::Address>(); | |||
| MS_EXCEPTION_IF_NULL(workspace); | |||
| @@ -333,6 +270,7 @@ void GPUKernelRuntime::AllocCommunicationOpDynamicRes(const session::KernelGraph | |||
| void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel) { | |||
| MS_EXCEPTION_IF_NULL(kernel); | |||
| MS_EXCEPTION_IF_NULL(mem_manager_); | |||
| // The reference count of communication kernel input is not 0. | |||
| if (communication_op_input_ref_count_ != 0) { | |||
| MS_LOG(ERROR) << "The reference count of communication kernel input is not 0."; | |||
| @@ -354,7 +292,7 @@ void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfN | |||
| addr_size.emplace_back(device_address.get(), output_size); | |||
| } | |||
| auto device_mem_ptr = AllocTensorMemDynamic(total); | |||
| auto device_mem_ptr = mem_manager_->MallocMemFromMemPool(total); | |||
| MS_EXCEPTION_IF_NULL(device_mem_ptr); | |||
| for (const auto &iter : addr_size) { | |||
| MS_EXCEPTION_IF_NULL(iter.first); | |||
| @@ -366,6 +304,7 @@ void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfN | |||
| void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel) { | |||
| MS_EXCEPTION_IF_NULL(kernel); | |||
| MS_EXCEPTION_IF_NULL(mem_manager_); | |||
| // The reference count of communication kernel output is not 0. | |||
| if (communication_op_output_ref_count_ != 0) { | |||
| MS_LOG(ERROR) << "The reference count of communication kernel output is not 0."; | |||
| @@ -389,7 +328,7 @@ void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::Anf | |||
| addr_size.emplace_back(device_address.get(), output_sizes[i]); | |||
| } | |||
| auto device_mem_ptr = AllocTensorMemDynamic(total); | |||
| auto device_mem_ptr = mem_manager_->MallocMemFromMemPool(total); | |||
| MS_EXCEPTION_IF_NULL(device_mem_ptr); | |||
| for (const auto &iter : addr_size) { | |||
| MS_EXCEPTION_IF_NULL(iter.first); | |||
| @@ -402,6 +341,7 @@ void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::Anf | |||
| void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel, | |||
| const AddressPtrList &kernel_workspaces) { | |||
| MS_EXCEPTION_IF_NULL(kernel); | |||
| MS_EXCEPTION_IF_NULL(mem_manager_); | |||
| auto cnode = kernel->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| // Free the input of kernel by reference count. | |||
| @@ -421,7 +361,7 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel, | |||
| auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i); | |||
| MS_EXCEPTION_IF_NULL(device_address); | |||
| MS_EXCEPTION_IF_NULL(device_address->ptr_); | |||
| FreeTensorMemDynamic(device_address->ptr_); | |||
| mem_manager_->FreeMemFromMemPool(device_address->ptr_); | |||
| device_address->ptr_ = nullptr; | |||
| } | |||
| } | |||
| @@ -432,7 +372,7 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel, | |||
| auto workspace = kernel_workspaces[i]; | |||
| if (workspace != nullptr) { | |||
| MS_EXCEPTION_IF_NULL(workspace->addr); | |||
| FreeTensorMemDynamic(workspace->addr); | |||
| mem_manager_->FreeMemFromMemPool(workspace->addr); | |||
| workspace->addr = nullptr; | |||
| } | |||
| } | |||
| @@ -441,6 +381,7 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel, | |||
| void GPUKernelRuntime::FreeCommunicationOpDynamicRes(const mindspore::AnfNodePtr &kernel, size_t input_idx, | |||
| bool *is_communication_op) { | |||
| MS_EXCEPTION_IF_NULL(kernel); | |||
| MS_EXCEPTION_IF_NULL(mem_manager_); | |||
| // The inputs memory of communication kernel is one piece memory, need release together. | |||
| if (AnfAlgo::GetCNodeName(kernel) == kAllReduceOpName) { | |||
| communication_op_input_ref_count_--; | |||
| @@ -448,7 +389,7 @@ void GPUKernelRuntime::FreeCommunicationOpDynamicRes(const mindspore::AnfNodePtr | |||
| auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, 0); | |||
| MS_EXCEPTION_IF_NULL(device_address); | |||
| MS_EXCEPTION_IF_NULL(device_address->ptr_); | |||
| FreeTensorMemDynamic(device_address->ptr_); | |||
| mem_manager_->FreeMemFromMemPool(device_address->ptr_); | |||
| device_address->ptr_ = nullptr; | |||
| } | |||
| *is_communication_op = true; | |||
| @@ -470,19 +411,12 @@ void GPUKernelRuntime::FreeCommunicationOpDynamicRes(const mindspore::AnfNodePtr | |||
| auto device_address = AnfAlgo::GetMutableOutputAddr(kernel_input.first, 0); | |||
| MS_EXCEPTION_IF_NULL(device_address); | |||
| MS_EXCEPTION_IF_NULL(device_address->ptr_); | |||
| FreeTensorMemDynamic(device_address->ptr_); | |||
| mem_manager_->FreeMemFromMemPool(device_address->ptr_); | |||
| device_address->ptr_ = nullptr; | |||
| } | |||
| *is_communication_op = true; | |||
| } | |||
| } | |||
| void GPUKernelRuntime::MallocOpMemory(const DeviceAddressPtr address, size_t size, int) { | |||
| auto device_ptr = AllocTensorMemDynamic(size); | |||
| MS_EXCEPTION_IF_NULL(device_ptr); | |||
| address->ptr_ = device_ptr; | |||
| address->mem_dynamic_alloc_ = true; | |||
| } | |||
| } // namespace gpu | |||
| } // namespace device | |||
| } // namespace mindspore | |||
| @@ -33,7 +33,6 @@ class GPUKernelRuntime : public KernelRuntime { | |||
| ~GPUKernelRuntime() override = default; | |||
| bool Init() override; | |||
| void ReleaseDeviceRes() override; | |||
| void FreeHostMemory() override; | |||
| void AssignMemory(session::KernelGraph *graph) override; | |||
| bool Run(session::KernelGraph *graph) override; | |||
| @@ -41,18 +40,11 @@ class GPUKernelRuntime : public KernelRuntime { | |||
| DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, | |||
| TypeId type_id) override; | |||
| bool SyncStream() override; | |||
| // Alloc memory use the dynamic memory pool. | |||
| void *AllocTensorMemDynamic(size_t size) override; | |||
| // Free memory use the dynamic memory pool. | |||
| void FreeTensorMemDynamic(void *device_ptr) override; | |||
| void MallocOpMemory(const DeviceAddressPtr address, size_t size, int flag) override; | |||
| uint8_t *MallocStaticMem(size_t size, bool communication_mem) override; | |||
| private: | |||
| GPUKernelRuntime(const GPUKernelRuntime &); | |||
| GPUKernelRuntime &operator=(const GPUKernelRuntime &); | |||
| bool InitDevice(); | |||
| void MallocDeviceMemory(); | |||
| bool device_init_{false}; | |||
| // The related functions and members for using dynamic memory pool. | |||
| @@ -69,6 +61,7 @@ class GPUKernelRuntime : public KernelRuntime { | |||
| void FreeCommunicationOpDynamicRes(const mindspore::AnfNodePtr &kernel, size_t input_idx, bool *is_communication_op); | |||
| size_t communication_op_input_ref_count_{0}; | |||
| size_t communication_op_output_ref_count_{0}; | |||
| MemReuseUtilPtr mem_reuse_util_ptr_{nullptr}; | |||
| }; | |||
| MS_REG_KERNEL_RUNTIME(kGPUDevice, GPUKernelRuntime); | |||
| } // namespace gpu | |||
| @@ -0,0 +1,88 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "device/gpu/gpu_memory_manager.h" | |||
| #include "device/gpu/gpu_memory_allocator.h" | |||
| #include "utils/context/ms_context.h" | |||
| #include "utils/convert_utils.h" | |||
| namespace mindspore { | |||
| namespace device { | |||
| namespace gpu { | |||
| void *GPUMemoryManager::MallocMemFromMemPool(size_t size) { | |||
| return GPUMemoryAllocator::GetInstance().AllocTensorMem(size); | |||
| } | |||
| void GPUMemoryManager::FreeMemFromMemPool(void *device_ptr) { | |||
| GPUMemoryAllocator::GetInstance().FreeTensorMem(device_ptr); | |||
| } | |||
| void GPUMemoryManager::MallocDeviceMemory() { | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| // If use the dynamic memory pool, then alloc the first memory block to init. | |||
| if (context_ptr->enable_dynamic_mem_pool()) { | |||
| auto device_addr = MallocMemFromMemPool(1); | |||
| if (!device_addr) { | |||
| MS_LOG(ERROR) << "Dynamic memory pool init error."; | |||
| } | |||
| } else { | |||
| // Need to reserve 20% space for dynamic memory | |||
| const float init_gpu_mem_ratio = 0.8; | |||
| size_t mem_size = FloatToSize(GPUMemoryAllocator::GetInstance().free_mem_size() * init_gpu_mem_ratio); | |||
| auto alloc_size = | |||
| GPUMemoryAllocator::GetInstance().AllocDeviceMem(mem_size, reinterpret_cast<void **>(&device_mem_base_)); | |||
| device_mem_size_ = alloc_size; | |||
| static_mem_offset_ = device_mem_size_; | |||
| } | |||
| } | |||
| void GPUMemoryManager::FreeDeviceMemory() { | |||
| if (device_mem_base_ != nullptr) { | |||
| if (!GPUMemoryAllocator::GetInstance().FreeDeviceMem(device_mem_base_)) { | |||
| MS_LOG(EXCEPTION) << "Could not free gpu device memory."; | |||
| } | |||
| } | |||
| GPUMemoryAllocator::GetInstance().ReleaseDeviceRes(); | |||
| } | |||
| uint8_t *GPUMemoryManager::MallocStaticMem(size_t size, bool) { | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| if (context_ptr->enable_dynamic_mem_pool()) { | |||
| auto device_ptr = MallocMemFromMemPool(size); | |||
| MS_EXCEPTION_IF_NULL(device_ptr); | |||
| return AddressOffset(device_ptr, 0); | |||
| } | |||
| auto align_size = GetCommonAlignSize(size); | |||
| if (static_mem_offset_ < align_size) { | |||
| MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_ | |||
| << "] static[" << total_static_size_ << "])" | |||
| << " malloc [" << align_size << "] failed!"; | |||
| } | |||
| auto offset = static_mem_offset_ - align_size; | |||
| if (dynamic_mem_offset_ > offset) { | |||
| MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_ | |||
| << "] static[" << total_static_size_ << "])" | |||
| << " malloc [" << align_size << "] failed!"; | |||
| } | |||
| total_static_size_ += align_size; | |||
| static_mem_offset_ = offset; | |||
| return device_mem_base_ + offset; | |||
| } | |||
| } // namespace gpu | |||
| } // namespace device | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,40 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_MINDSPORE_CCSRC_DEVICE_GPU_GPU_MEMORY_MANAGER_H_ | |||
| #define MINDSPORE_MINDSPORE_CCSRC_DEVICE_GPU_GPU_MEMORY_MANAGER_H_ | |||
| #include "device/memory_manager.h" | |||
| namespace mindspore { | |||
| namespace device { | |||
| namespace gpu { | |||
| class GPUMemoryManager : public MemoryManager { | |||
| public: | |||
| GPUMemoryManager() = default; | |||
| virtual ~GPUMemoryManager() = default; | |||
| void MallocDeviceMemory() override; | |||
| void FreeDeviceMemory() override; | |||
| void *MallocMemFromMemPool(size_t size) override; | |||
| void FreeMemFromMemPool(void *device_ptr) override; | |||
| protected: | |||
| uint8_t *MallocStaticMem(size_t size, bool communication_mem) override; | |||
| }; | |||
| } // namespace gpu | |||
| } // namespace device | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_MINDSPORE_CCSRC_DEVICE_GPU_GPU_MEMORY_MANAGER_H_ | |||
| @@ -438,23 +438,22 @@ void KernelAdjust::LoadSwitchInputs(std::vector<tensor::TensorPtr> *inputs) { | |||
| MS_LOG(INFO) << "---------------- LoadSwitchInputs End--"; | |||
| } | |||
| void KernelAdjust::Profiling(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr) { | |||
| void KernelAdjust::Profiling(NotNull<session::KernelGraph *> kernel_graph_ptr) { | |||
| if (!ascend::ProfilingManager::GetInstance().IsProfiling()) { | |||
| MS_LOG(INFO) << "No need to profiling"; | |||
| return; | |||
| } | |||
| ProfilingTraceInfo profiling_trace_info; | |||
| if (ProfilingUtils::GetProfilingTraceInfo(kernel_graph_ptr, &profiling_trace_info)) { | |||
| InsertProfilingKernel(kernel_graph_ptr, profiling_trace_info); | |||
| } else { | |||
| MS_LOG(WARNING) << "[profiling] GetProfilingTraceInfo failed"; | |||
| ProfilingTraceInfo profiling_trace_info = ProfilingUtils::GetProfilingTraceFromEnv(kernel_graph_ptr); | |||
| if (!profiling_trace_info.IsValid()) { | |||
| MS_LOG(WARNING) << "[profiling] no profiling node found!"; | |||
| return; | |||
| } | |||
| InsertProfilingKernel(profiling_trace_info, kernel_graph_ptr); | |||
| } | |||
| void KernelAdjust::InsertProfilingKernel(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr, | |||
| const ProfilingTraceInfo &profiling_trace_info) { | |||
| void KernelAdjust::InsertProfilingKernel(const ProfilingTraceInfo &profiling_trace_info, | |||
| NotNull<session::KernelGraph *> kernel_graph_ptr) { | |||
| MS_LOG(INFO) << "[profiling] Insert profiling kernel start"; | |||
| MS_EXCEPTION_IF_NULL(kernel_graph_ptr); | |||
| if (!profiling_trace_info.IsValid()) { | |||
| MS_LOG(WARNING) << "Profiling trace point not found"; | |||
| return; | |||
| @@ -462,18 +461,12 @@ void KernelAdjust::InsertProfilingKernel(const std::shared_ptr<session::KernelGr | |||
| std::vector<CNodePtr> new_cnode_list; | |||
| std::vector<CNodePtr> cnode_ptr_list = kernel_graph_ptr->execution_order(); | |||
| for (const auto &cnode_ptr : cnode_ptr_list) { | |||
| ProfilingUtils::ProfilingTraceFpStart(kernel_graph_ptr, cnode_ptr, profiling_trace_info, &new_cnode_list); | |||
| ProfilingUtils::ProfilingAllReduce(kernel_graph_ptr, cnode_ptr, ascend::kProfilingAllReduce1Start, | |||
| profiling_trace_info.profiling_allreduce1_start, &new_cnode_list); | |||
| ProfilingUtils::ProfilingAllReduce(kernel_graph_ptr, cnode_ptr, ascend::kProfilingAllReduce2Start, | |||
| profiling_trace_info.profiling_allreduce2_start, &new_cnode_list); | |||
| ProfilingUtils::ProfilingTraceFpStart(cnode_ptr, profiling_trace_info, kernel_graph_ptr, NOT_NULL(&new_cnode_list)); | |||
| new_cnode_list.emplace_back(cnode_ptr); | |||
| ProfilingUtils::ProfilingAllReduce(kernel_graph_ptr, cnode_ptr, ascend::kProfilingAllReduce1End, | |||
| profiling_trace_info.profiling_allreduce1_end, &new_cnode_list); | |||
| ProfilingUtils::ProfilingAllReduce(kernel_graph_ptr, cnode_ptr, ascend::kProfilingAllReduce2End, | |||
| profiling_trace_info.profiling_allreduce2_end, &new_cnode_list); | |||
| ProfilingUtils::ProfilingTraceEnd(kernel_graph_ptr, cnode_ptr, profiling_trace_info, &new_cnode_list); | |||
| ProfilingUtils::ProfilingCustomOp(cnode_ptr, profiling_trace_info, kernel_graph_ptr, NOT_NULL(&new_cnode_list)); | |||
| ProfilingUtils::ProfilingTraceBpEnd(cnode_ptr, profiling_trace_info, kernel_graph_ptr, NOT_NULL(&new_cnode_list)); | |||
| ProfilingUtils::ProfilingTraceEnd(cnode_ptr, profiling_trace_info, kernel_graph_ptr, NOT_NULL(&new_cnode_list)); | |||
| } | |||
| kernel_graph_ptr->set_execution_order(new_cnode_list); | |||
| } | |||
| @@ -48,7 +48,7 @@ class KernelAdjust { | |||
| void SetStreamSwitchOps(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr); | |||
| bool StepLoadCtrlInputs(const std::shared_ptr<session::Context> &context, | |||
| const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr); | |||
| void Profiling(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr); | |||
| void Profiling(NotNull<session::KernelGraph *> kernel_graph_ptr); | |||
| static bool NeedInsertSwitch(); | |||
| CNodePtr CreateSteamActiveOp(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr); | |||
| @@ -66,8 +66,8 @@ class KernelAdjust { | |||
| kernel::KernelBuildInfo::KernelBuildInfoBuilder CreateMngKernelBuilder(const std::vector<std::string> &formats, | |||
| const std::vector<TypeId> &type_ids); | |||
| void LoadSwitchInputs(std::vector<tensor::TensorPtr> *inputs); | |||
| void InsertProfilingKernel(const std::shared_ptr<session::KernelGraph> &kernel_graph_ptr, | |||
| const ProfilingTraceInfo &profiling_trace_info); | |||
| void InsertProfilingKernel(const ProfilingTraceInfo &profiling_trace_info, | |||
| NotNull<session::KernelGraph *> kernel_graph_ptr); | |||
| }; | |||
| } // namespace device | |||
| } // namespace mindspore | |||
| @@ -31,18 +31,13 @@ | |||
| #include "ir/value.h" | |||
| using mindspore::kernel::Address; | |||
| using mindspore::kernel::AddressPtr; | |||
| using mindspore::memreuse::BestFitMemReuse; | |||
| using mindspore::memreuse::MemReuseUtilPtr; | |||
| namespace mindspore { | |||
| namespace device { | |||
| KernelRuntime::~KernelRuntime() { | |||
| device_mem_base_ = nullptr; | |||
| device_mem_pool_base_ = nullptr; | |||
| #ifdef ENABLE_DUMP_E2E | |||
| dump_conf_ptr_ = nullptr; | |||
| #endif | |||
| mem_reuse_util_ptr_ = nullptr; | |||
| } | |||
| bool KernelRuntime::Run(session::KernelGraph *graph) { | |||
| @@ -88,11 +83,6 @@ bool KernelRuntime::LoadTask(const session::KernelGraph *graph) { | |||
| return false; | |||
| } | |||
| void KernelRuntime::FreeHostMemory() { | |||
| dynamic_mem_offset_ = 0; | |||
| static_mem_offset_ = 0; | |||
| } | |||
| // for D to impl | |||
| bool KernelRuntime::RunTask(const session::KernelGraph *graph) { | |||
| if (graph != nullptr) { | |||
| @@ -126,13 +116,11 @@ size_t KernelRuntime::CountNodeDeviceMemorySize(const mindspore::AnfNodePtr &nod | |||
| void KernelRuntime::AssignMemory(session::KernelGraph *graph) { | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| MS_EXCEPTION_IF_NULL(mem_manager_); | |||
| mem_manager_->ResetDynamicMemory(); | |||
| AssignStaticMemory(graph); | |||
| bool is_enable_mem_reuse = context_ptr->enable_mem_reuse(); | |||
| if (is_enable_mem_reuse) { | |||
| ReuseAssignDynamicMemory(graph); | |||
| } else { | |||
| AssignDynamicMemory(graph); | |||
| } | |||
| AssignDynamicMemory(graph); | |||
| UpdateRefNodeOutputMem(graph); | |||
| } | |||
| @@ -159,6 +147,7 @@ void KernelRuntime::AssignStaticMemory(session::KernelGraph *graph) { | |||
| void KernelRuntime::RunOpAssignInputMemory(const std::vector<tensor::TensorPtr> &input_tensors, | |||
| const session::KernelGraph *graph) { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(mem_manager_); | |||
| for (size_t input_index = 0; input_index < graph->inputs().size(); ++input_index) { | |||
| auto item = graph->inputs()[input_index]; | |||
| MS_EXCEPTION_IF_NULL(item); | |||
| @@ -180,7 +169,7 @@ void KernelRuntime::RunOpAssignInputMemory(const std::vector<tensor::TensorPtr> | |||
| auto device_address = | |||
| CreateDeviceAddress(nullptr, tensor_size, AnfAlgo::GetOutputFormat(item, index), output_type_id); | |||
| MS_EXCEPTION_IF_NULL(device_address); | |||
| MallocOpMemory(device_address, tensor_size, kStaticMem); | |||
| mem_manager_->MallocMemFromMemPool(device_address, tensor_size); | |||
| AnfAlgo::SetOutputAddr(device_address, index, item.get()); | |||
| } | |||
| } | |||
| @@ -188,6 +177,7 @@ void KernelRuntime::RunOpAssignInputMemory(const std::vector<tensor::TensorPtr> | |||
| void KernelRuntime::RunOpAssignOutputMemory(const AnfNodePtr &kernel) { | |||
| MS_EXCEPTION_IF_NULL(kernel); | |||
| MS_EXCEPTION_IF_NULL(mem_manager_); | |||
| auto kernel_mod = AnfAlgo::GetKernelMod(kernel); | |||
| MS_EXCEPTION_IF_NULL(kernel_mod); | |||
| auto output_sizes = kernel_mod->GetOutputSizeList(); | |||
| @@ -208,13 +198,14 @@ void KernelRuntime::RunOpAssignOutputMemory(const AnfNodePtr &kernel) { | |||
| auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel, i); | |||
| auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type); | |||
| MS_EXCEPTION_IF_NULL(device_address); | |||
| MallocOpMemory(device_address, output_sizes[i], kDynamicMem); | |||
| mem_manager_->MallocMemFromMemPool(device_address, output_sizes[i]); | |||
| AnfAlgo::SetOutputAddr(device_address, i, kernel.get()); | |||
| } | |||
| } | |||
| void KernelRuntime::RunOpAssignWorkSpaceMemory(const AnfNodePtr &kernel) { | |||
| MS_EXCEPTION_IF_NULL(kernel); | |||
| MS_EXCEPTION_IF_NULL(mem_manager_); | |||
| if (kernel->isa<CNode>()) { | |||
| auto kernel_mod = AnfAlgo::GetKernelMod(kernel); | |||
| MS_EXCEPTION_IF_NULL(kernel_mod); | |||
| @@ -222,7 +213,7 @@ void KernelRuntime::RunOpAssignWorkSpaceMemory(const AnfNodePtr &kernel) { | |||
| for (size_t i = 0; i < workspace_lists.size(); ++i) { | |||
| auto device_address = CreateDeviceAddress(nullptr, workspace_lists[i], "", kTypeUnknown); | |||
| MS_EXCEPTION_IF_NULL(device_address); | |||
| MallocOpMemory(device_address, workspace_lists[i], kDynamicMem); | |||
| mem_manager_->MallocMemFromMemPool(device_address, workspace_lists[i]); | |||
| AnfAlgo::SetWorkspaceAddr(device_address, i, kernel.get()); | |||
| } | |||
| } | |||
| @@ -230,6 +221,7 @@ void KernelRuntime::RunOpAssignWorkSpaceMemory(const AnfNodePtr &kernel) { | |||
| void KernelRuntime::AssignStaticMemoryInput(const session::KernelGraph *graph) { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(mem_manager_); | |||
| for (auto &item : graph->inputs()) { | |||
| MS_EXCEPTION_IF_NULL(item); | |||
| if (!item->isa<Parameter>()) { | |||
| @@ -247,7 +239,7 @@ void KernelRuntime::AssignStaticMemoryInput(const session::KernelGraph *graph) { | |||
| output_type_id = AnfAlgo::GetOutputInferDataType(item, index); | |||
| } | |||
| auto tensor_size = CountNodeDeviceMemorySize(item, index); | |||
| auto ptr = MallocStaticMem(tensor_size, false); | |||
| auto ptr = mem_manager_->MallocMem(kStaticMem, tensor_size); | |||
| auto address = CreateDeviceAddress(ptr, tensor_size, AnfAlgo::GetOutputFormat(item, index), output_type_id); | |||
| AnfAlgo::SetOutputAddr(address, index, item.get()); | |||
| } | |||
| @@ -301,6 +293,7 @@ void KernelRuntime::UpdateRefNodeOutputMem(const session::KernelGraph *graph) { | |||
| void KernelRuntime::AssignCommunicationNodeOutputMem(int flag, const AnfNodePtr &node) { | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| MS_EXCEPTION_IF_NULL(mem_manager_); | |||
| auto kernel_mod = AnfAlgo::GetKernelMod(node); | |||
| MS_EXCEPTION_IF_NULL(kernel_mod); | |||
| auto output_sizes = kernel_mod->GetOutputSizeList(); | |||
| @@ -314,12 +307,12 @@ void KernelRuntime::AssignCommunicationNodeOutputMem(int flag, const AnfNodePtr | |||
| std::vector<size_t> align_size_list; | |||
| for (uint64_t mem_size : output_sizes) { | |||
| if (context_ptr->enable_hccl()) { | |||
| mem_size = GetCommonAlignSize(mem_size); | |||
| mem_size = mem_manager_->GetCommonAlignSize(mem_size); | |||
| } | |||
| total_size += mem_size; | |||
| align_size_list.emplace_back(mem_size); | |||
| } | |||
| uint8_t *output_ptr = CalDeviceMem(node, total_size, flag, 0); | |||
| uint8_t *output_ptr = mem_manager_->MallocOutputMem(node, 0, flag, total_size); | |||
| for (size_t j = 0; j < align_size_list.size(); ++j) { | |||
| std::string output_format = AnfAlgo::GetOutputFormat(node, j); | |||
| auto output_type = AnfAlgo::GetOutputDeviceDataType(node, j); | |||
| @@ -333,6 +326,7 @@ void KernelRuntime::UpdateCommunicationOpInputMem(const AnfNodePtr &node) { | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| MS_EXCEPTION_IF_NULL(mem_manager_); | |||
| size_t total_size = 0; | |||
| std::vector<std::pair<mindspore::device::DeviceAddress *, size_t>> addr_size; | |||
| for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(node); ++i) { | |||
| @@ -340,12 +334,12 @@ void KernelRuntime::UpdateCommunicationOpInputMem(const AnfNodePtr &node) { | |||
| MS_EXCEPTION_IF_NULL(address); | |||
| auto mem_size = address->size(); | |||
| if (context_ptr->enable_hccl()) { | |||
| mem_size = GetCommonAlignSize(mem_size); | |||
| mem_size = mem_manager_->GetCommonAlignSize(mem_size); | |||
| } | |||
| total_size += mem_size; | |||
| addr_size.emplace_back(address.get(), mem_size); | |||
| } | |||
| uint8_t *input_ptr = CalDeviceMem(node, total_size, kDynamicMem, 0); | |||
| uint8_t *input_ptr = mem_manager_->MallocOutputMem(node, 0, kDynamicMem, total_size); | |||
| for (const auto &iter : addr_size) { | |||
| MS_EXCEPTION_IF_NULL(iter.first); | |||
| iter.first->set_ptr(input_ptr); | |||
| @@ -355,7 +349,8 @@ void KernelRuntime::UpdateCommunicationOpInputMem(const AnfNodePtr &node) { | |||
| void KernelRuntime::AssignNodeOutputMem(int flag, const AnfNodePtr &node, int index) { | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| if (IsCommunicationOp(node)) { | |||
| MS_EXCEPTION_IF_NULL(mem_manager_); | |||
| if (AnfAlgo::IsCommunicationOp(node)) { | |||
| UpdateCommunicationOpInputMem(node); | |||
| AssignCommunicationNodeOutputMem(flag, node); | |||
| return; | |||
| @@ -375,7 +370,7 @@ void KernelRuntime::AssignNodeOutputMem(int flag, const AnfNodePtr &node, int in | |||
| MS_LOG(INFO) << "Already malloc index:" << i; | |||
| continue; | |||
| } | |||
| auto ptr = CalDeviceMem(node, output_sizes[i], flag, i); | |||
| auto ptr = mem_manager_->MallocOutputMem(node, i, flag, output_sizes[i]); | |||
| if (ptr == nullptr) { | |||
| // reused ptr, no need alloc, continue; | |||
| continue; | |||
| @@ -390,6 +385,7 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const | |||
| size_t output_idx) { | |||
| MS_EXCEPTION_IF_NULL(value_node); | |||
| MS_EXCEPTION_IF_NULL(node_value); | |||
| MS_EXCEPTION_IF_NULL(mem_manager_); | |||
| auto tensor = node_value->cast<TensorPtr>(); | |||
| if (tensor == nullptr) { | |||
| MS_LOG(WARNING) << "Tensor is null"; | |||
| @@ -397,7 +393,7 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const | |||
| } | |||
| size_t tensor_size = tensor->data().nbytes(); | |||
| auto node_size = CountNodeDeviceMemorySize(value_node, output_idx); | |||
| auto ptr = MallocStaticMem(node_size, false); | |||
| auto ptr = mem_manager_->MallocMem(kStaticMem, node_size); | |||
| TypeId output_type_id = AnfAlgo::GetOutputDeviceDataType(value_node, output_idx); | |||
| if (output_type_id == kTypeUnknown) { | |||
| output_type_id = AnfAlgo::GetOutputInferDataType(value_node, output_idx); | |||
| @@ -414,6 +410,7 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const | |||
| void KernelRuntime::AssignStaticMemoryValueNode(session::KernelGraph *graph) { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(mem_manager_); | |||
| for (auto &value_node : graph->graph_value_nodes()) { | |||
| MS_EXCEPTION_IF_NULL(value_node); | |||
| if (AnfAlgo::OutputAddrExist(value_node, 0)) { | |||
| @@ -440,7 +437,7 @@ void KernelRuntime::AssignStaticMemoryValueNode(session::KernelGraph *graph) { | |||
| } else if (node_value->isa<StringImm>()) { | |||
| auto value = GetValue<std::string>(node_value); | |||
| size_t tensor_size = value.size(); | |||
| auto ptr = MallocStaticMem(tensor_size, false); | |||
| auto ptr = mem_manager_->MallocMem(kStaticMem, tensor_size); | |||
| auto address = CreateDeviceAddress(ptr, tensor_size, kOpFormat_DEFAULT, kNumberTypeUInt8); | |||
| MS_EXCEPTION_IF_NULL(address); | |||
| AnfAlgo::SetOutputAddr(address, 0, value_node.get()); | |||
| @@ -452,103 +449,37 @@ void KernelRuntime::AssignStaticMemoryValueNode(session::KernelGraph *graph) { | |||
| } | |||
| } | |||
| void KernelRuntime::AssignDynamicMemory(const session::KernelGraph *graph) { | |||
| void KernelRuntime::AssignDynamicMemory(session::KernelGraph *graph) { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| // reset dynamic mem offset | |||
| dynamic_mem_offset_ = 0; | |||
| auto &kernels = graph->execution_order(); | |||
| for (auto &kernel : kernels) { | |||
| AssignNodeOutputMem(kDynamicMem, kernel, kGetAllOuts); | |||
| AssignWorkSpaceMem(kernel); | |||
| MS_EXCEPTION_IF_NULL(mem_manager_); | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| bool is_enable_mem_reuse = context_ptr->enable_mem_reuse(); | |||
| auto mem_flag = kDynamicMem; | |||
| if (is_enable_mem_reuse) { | |||
| mem_manager_->MallocReusedDynamicMem(graph); | |||
| mem_flag = kReuseDynamicMem; | |||
| } | |||
| } | |||
| void KernelRuntime::ReuseAssignDynamicMemory(session::KernelGraph *graph) { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| dynamic_mem_offset_ = 0; | |||
| MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared<memreuse::MemReuseUtil>(); | |||
| MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr); | |||
| // set all infos | |||
| mem_reuse_util_ptr->SetAllInfo(graph); | |||
| auto bestfit_mem_reuse = std::make_shared<BestFitMemReuse>(); | |||
| MS_EXCEPTION_IF_NULL(bestfit_mem_reuse); | |||
| bestfit_mem_reuse->Reuse(mem_reuse_util_ptr.get()); | |||
| size_t total_allocated_size = bestfit_mem_reuse->GetAllocatedSize(); | |||
| MS_LOG(INFO) << "TotalReuseDynamicSize [" << total_allocated_size << "]"; | |||
| mem_reuse_util_ptr_ = mem_reuse_util_ptr; | |||
| auto base_ptr = MallocDynamicMem(total_allocated_size, false); | |||
| mem_reuse_util_ptr_->set_mem_base(base_ptr); | |||
| auto &kernels = graph->execution_order(); | |||
| for (auto &kernel : kernels) { | |||
| AssignNodeOutputMem(kReuseDynamicMem, kernel, kGetAllOuts); | |||
| AssignReuseWorkSpaceMem(kernel); | |||
| AssignNodeOutputMem(mem_flag, kernel, kGetAllOuts); | |||
| AssignWorkSpaceMem(mem_flag, kernel); | |||
| } | |||
| } | |||
| void KernelRuntime::AssignReuseWorkSpaceMem(const AnfNodePtr &node) { | |||
| void KernelRuntime::AssignWorkSpaceMem(int flag, const AnfNodePtr &node) { | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| MS_EXCEPTION_IF_NULL(mem_manager_); | |||
| auto kernel_mod = AnfAlgo::GetKernelMod(node); | |||
| MS_EXCEPTION_IF_NULL(kernel_mod); | |||
| size_t index = 0; | |||
| for (auto &size : kernel_mod->GetWorkspaceSizeList()) { | |||
| auto wk_ptr = mem_reuse_util_ptr_->GetNodeWorkSpacePtr(node, index); | |||
| AnfAlgo::SetWorkspaceAddr(CreateDeviceAddress(wk_ptr, size, "", kTypeUnknown), index, node.get()); | |||
| auto ptr = mem_manager_->MallocWorkSpaceMem(node, flag, index, size); | |||
| AnfAlgo::SetWorkspaceAddr(CreateDeviceAddress(ptr, size, "", kTypeUnknown), index, node.get()); | |||
| index++; | |||
| } | |||
| } | |||
| void KernelRuntime::AssignWorkSpaceMem(const AnfNodePtr &node) { | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| if (node->isa<CNode>()) { | |||
| auto kernel_mod = AnfAlgo::GetKernelMod(node); | |||
| MS_EXCEPTION_IF_NULL(kernel_mod); | |||
| size_t index = 0; | |||
| for (auto &size : kernel_mod->GetWorkspaceSizeList()) { | |||
| auto ptr = MallocDynamicMem(size, false); | |||
| AnfAlgo::SetWorkspaceAddr(CreateDeviceAddress(ptr, size, "", kTypeUnknown), index, node.get()); | |||
| index++; | |||
| } | |||
| } | |||
| } | |||
| bool KernelRuntime::IsCommunicationOp(const AnfNodePtr &node) { | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| auto kernel_name = AnfAlgo::GetCNodeName(node); | |||
| auto kernel_type = AnfAlgo::GetKernelType(node); | |||
| if (kernel_name == kAllReduceOpName || kernel_type == HCCL_KERNEL) { | |||
| return true; | |||
| } | |||
| return false; | |||
| } | |||
| uint8_t *KernelRuntime::CalDeviceMem(const AnfNodePtr &node, size_t size, int flag, size_t index) { | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| uint8_t *ptr = nullptr; | |||
| if (IsCommunicationOp(node)) { | |||
| bool communication_mem = false; | |||
| if (context_ptr->enable_hccl()) { | |||
| communication_mem = true; | |||
| } | |||
| if (flag == kStaticMem) { | |||
| ptr = MallocStaticMem(size, communication_mem); | |||
| } else { | |||
| ptr = MallocDynamicMem(size, communication_mem); | |||
| } | |||
| return ptr; | |||
| } | |||
| if (flag == kStaticMem) { | |||
| ptr = MallocStaticMem(size, false); | |||
| } else if (flag == kDynamicMem) { | |||
| ptr = MallocDynamicMem(size, false); | |||
| } else if (flag == kReuseDynamicMem) { | |||
| ptr = mem_reuse_util_ptr_->GetNodeOutputPtr(node, index); | |||
| } | |||
| return ptr; | |||
| } | |||
| void KernelRuntime::GenLaunchArgs(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel, | |||
| AddressPtrList *kernel_inputs, AddressPtrList *const kernel_workspaces, | |||
| AddressPtrList *kernel_outputs) { | |||
| @@ -659,65 +590,6 @@ bool KernelRuntime::LaunchKernelMod(const session::KernelGraph &graph) { | |||
| return true; | |||
| } | |||
| size_t KernelRuntime::GetCommonAlignSize(size_t input_size) const { | |||
| return (input_size + mem_align_size_ + 31) / mem_align_size_ * mem_align_size_; | |||
| } | |||
| size_t KernelRuntime::GetCommunicationAlignSize(size_t input_size) const { | |||
| return (input_size + mem_align_size_ - 1) / mem_align_size_ * mem_align_size_ + 2 * mem_align_size_; | |||
| } | |||
| uint8_t *KernelRuntime::MallocStaticMem(size_t size, bool communication_mem) { | |||
| size_t align_size = 0; | |||
| if (communication_mem) { | |||
| align_size = GetCommunicationAlignSize(size); | |||
| } else { | |||
| align_size = GetCommonAlignSize(size); | |||
| } | |||
| if (static_mem_offset_ < align_size) { | |||
| MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_ | |||
| << "] static[" << total_static_size_ << "])" | |||
| << " malloc [" << align_size << "] failed!"; | |||
| } | |||
| total_static_size_ += align_size; | |||
| auto offset = static_mem_offset_ - align_size; | |||
| if (dynamic_mem_offset_ > offset) { | |||
| MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_ | |||
| << "] static[" << total_static_size_ << "])" | |||
| << " malloc [" << align_size << "] failed!"; | |||
| } | |||
| static_mem_offset_ = offset; | |||
| if (communication_mem) { | |||
| return device_mem_base_ + offset + mem_align_size_; | |||
| } else { | |||
| return device_mem_base_ + offset; | |||
| } | |||
| } | |||
| uint8_t *KernelRuntime::MallocDynamicMem(size_t size, bool communication_mem) { | |||
| size_t align_size = 0; | |||
| if (communication_mem) { | |||
| align_size = GetCommunicationAlignSize(size); | |||
| } else { | |||
| align_size = GetCommonAlignSize(size); | |||
| } | |||
| uint64_t offset = dynamic_mem_offset_; | |||
| auto new_offset = dynamic_mem_offset_ + align_size; | |||
| if (new_offset > static_mem_offset_) { | |||
| MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_ | |||
| << "] static[" << total_static_size_ << "])" | |||
| << " malloc [" << align_size << "] failed!"; | |||
| } | |||
| total_dynamic_size_ += align_size; | |||
| dynamic_mem_offset_ = new_offset; | |||
| if (communication_mem) { | |||
| return device_mem_base_ + offset + mem_align_size_; | |||
| } else { | |||
| return device_mem_base_ + offset; | |||
| } | |||
| } | |||
| bool KernelRuntime::LaunchKernel(const session::KernelGraph *graph) { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| if (!LaunchKernelMod(*graph)) { | |||
| @@ -731,29 +603,6 @@ bool KernelRuntime::LaunchKernel(const session::KernelGraph *graph) { | |||
| return true; | |||
| } | |||
| void KernelRuntime::MallocOpMemory(const DeviceAddressPtr address, size_t size, int flag) { | |||
| if (flag == kStaticMem) { | |||
| address->ptr_ = MallocStaticMem(size, false); | |||
| } else if (flag == kDynamicMem) { | |||
| address->ptr_ = MallocDynamicMem(size, false); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Unknown memory type!"; | |||
| } | |||
| } | |||
| void *KernelRuntime::AllocTensorMemDynamic(size_t size) { | |||
| if (size == 0) { | |||
| MS_LOG(ERROR) << "AllocTensorMemDynamic size is 0."; | |||
| } | |||
| return nullptr; | |||
| } | |||
| void KernelRuntime::FreeTensorMemDynamic(void *device_ptr) { | |||
| if (device_ptr == nullptr) { | |||
| MS_LOG(ERROR) << "FreeTensorMemDynamic device_ptr is null."; | |||
| } | |||
| } | |||
| #ifdef ENABLE_DUMP_E2E | |||
| bool KernelRuntime::SetDumpConf() { | |||
| dump_conf_ptr_ = std::make_shared<Dump>(); | |||
| @@ -20,8 +20,7 @@ | |||
| #include <memory> | |||
| #include <string> | |||
| #include <map> | |||
| #include "pre_activate/mem_reuse/mem_reuse.h" | |||
| #include "pre_activate/mem_reuse/mem_reuse_allocator.h" | |||
| #include "device/device_address.h" | |||
| #include "ir/meta_tensor.h" | |||
| #include "predict/generator/utils/ir_model_util.h" | |||
| @@ -32,21 +31,15 @@ | |||
| #include "session/anf_runtime_algorithm.h" | |||
| #include "kernel/kernel.h" | |||
| #include "utils/context/ms_context.h" | |||
| #include "device/memory_manager.h" | |||
| // using mindspore::session::KernelGraph; | |||
| using mindspore::tensor::Tensor; | |||
| using TensorPtr = std::shared_ptr<Tensor>; | |||
| using MemReuseUtilPtr = mindspore::memreuse::MemReuseUtilPtr; | |||
| using mindspore::kernel::AddressPtr; | |||
| using AddressPtrList = std::vector<mindspore::kernel::AddressPtr>; | |||
| namespace mindspore { | |||
| namespace device { | |||
| const int kStaticMem = 0; | |||
| const int kDynamicMem = 1; | |||
| const int kReuseDynamicMem = 2; | |||
| const int kGetAllOuts = -1; | |||
| class KernelRuntime { | |||
| public: | |||
| KernelRuntime() = default; | |||
| @@ -65,7 +58,6 @@ class KernelRuntime { | |||
| DumpConfPtr GetDumpConf(); | |||
| #endif | |||
| virtual bool LoadTask(const session::KernelGraph *graph); | |||
| virtual void FreeHostMemory(); | |||
| // for GPU and D to impl | |||
| virtual void ReleaseDeviceRes() {} | |||
| void set_device_id(uint32_t device_id) { device_id_ = device_id; } | |||
| @@ -75,29 +67,17 @@ class KernelRuntime { | |||
| TypeId type_id) = 0; | |||
| virtual bool SyncStream() = 0; | |||
| void AssignStaticMemory(session::KernelGraph *graph); | |||
| void AssignDynamicMemory(const session::KernelGraph *graph); | |||
| void AssignDynamicMemory(session::KernelGraph *graph); | |||
| void ReuseAssignDynamicMemory(session::KernelGraph *graph); | |||
| void AssignNodeOutputMem(int flag, const AnfNodePtr &node, int index); | |||
| void AssignWorkSpaceMem(const AnfNodePtr &node); | |||
| void AssignWorkSpaceMem(int flag, const AnfNodePtr &node); | |||
| void AssignReuseWorkSpaceMem(const AnfNodePtr &node); | |||
| void AssignCommunicationNodeOutputMem(int flag, const AnfNodePtr &node); | |||
| void UpdateRefNodeOutputMem(const session::KernelGraph *graph); | |||
| void UpdateCommunicationOpInputMem(const AnfNodePtr &node); | |||
| bool IsCommunicationOp(const AnfNodePtr &node); | |||
| size_t GetCommonAlignSize(size_t input_size) const; | |||
| size_t GetCommunicationAlignSize(size_t input_size) const; | |||
| uint8_t *CalDeviceMem(const AnfNodePtr &node, size_t size, int flag, size_t index); | |||
| virtual uint8_t *MallocStaticMem(size_t size, bool communication_mem); | |||
| uint8_t *MallocDynamicMem(size_t size, bool communication_mem); | |||
| #ifdef ENABLE_DUMP_E2E | |||
| bool SetDumpConf(); | |||
| #endif | |||
| // Alloc memory use the dynamic memory pool. | |||
| virtual void *AllocTensorMemDynamic(size_t size); | |||
| // Free memory use the dynamic memory pool. | |||
| virtual void FreeTensorMemDynamic(void *device_ptr); | |||
| virtual void MallocOpMemory(const DeviceAddressPtr address, size_t size, int flag); | |||
| private: | |||
| void AssignStaticMemoryOutput(const session::KernelGraph *graph); | |||
| @@ -114,20 +94,11 @@ class KernelRuntime { | |||
| protected: | |||
| uint32_t device_id_{0}; | |||
| uint8_t *device_mem_base_{nullptr}; | |||
| uint8_t *device_mem_pool_base_{nullptr}; | |||
| uint64_t device_mem_size_{0}; | |||
| uint64_t device_mem_pool_size_{0}; | |||
| uint64_t dynamic_mem_offset_{0}; | |||
| uint64_t static_mem_offset_{0}; | |||
| const uint64_t mem_align_size_ = 512; | |||
| #ifdef ENABLE_DUMP_E2E | |||
| DumpConfPtr dump_conf_ptr_; | |||
| #endif | |||
| void *stream_ = nullptr; | |||
| size_t total_static_size_ = 0; | |||
| size_t total_dynamic_size_ = 0; | |||
| MemReuseUtilPtr mem_reuse_util_ptr_{nullptr}; | |||
| std::shared_ptr<MemoryManager> mem_manager_{nullptr}; | |||
| }; | |||
| using KernelRuntimePtr = std::shared_ptr<KernelRuntime>; | |||
| } // namespace device | |||
| @@ -0,0 +1,164 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "device/memory_manager.h" | |||
| #include "session/anf_runtime_algorithm.h" | |||
| #include "utils/context/ms_context.h" | |||
| using mindspore::memreuse::BestFitMemReuse; | |||
| using mindspore::memreuse::MemReuseUtilPtr; | |||
| namespace mindspore { | |||
| namespace device { | |||
| size_t MemoryManager::GetCommonAlignSize(size_t input_size) const { | |||
| return (input_size + kMemAlignSize + 31) / kMemAlignSize * kMemAlignSize; | |||
| } | |||
| size_t MemoryManager::GetCommunicationAlignSize(size_t input_size) const { | |||
| return (input_size + kMemAlignSize - 1) / kMemAlignSize * kMemAlignSize + 2 * kMemAlignSize; | |||
| } | |||
| void MemoryManager::MallocReusedDynamicMem(session::KernelGraph *graph) { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared<memreuse::MemReuseUtil>(); | |||
| MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr); | |||
| // set all infos | |||
| mem_reuse_util_ptr->SetAllInfo(graph); | |||
| auto bestfit_mem_reuse = std::make_shared<BestFitMemReuse>(); | |||
| MS_EXCEPTION_IF_NULL(bestfit_mem_reuse); | |||
| bestfit_mem_reuse->Reuse(mem_reuse_util_ptr.get()); | |||
| size_t total_allocated_size = bestfit_mem_reuse->GetAllocatedSize(); | |||
| MS_LOG(INFO) << "TotalReuseDynamicSize [" << total_allocated_size << "]"; | |||
| mem_reuse_util_ptr_ = mem_reuse_util_ptr; | |||
| auto base_ptr = MallocDynamicMem(total_allocated_size, false); | |||
| mem_reuse_util_ptr_->set_mem_base(base_ptr); | |||
| } | |||
| uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, int flag, size_t size) { | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| uint8_t *ptr = nullptr; | |||
| if (AnfAlgo::IsCommunicationOp(node)) { | |||
| bool communication_mem = false; | |||
| if (context_ptr->enable_hccl()) { | |||
| communication_mem = true; | |||
| } | |||
| if (flag == kStaticMem) { | |||
| ptr = MallocStaticMem(size, communication_mem); | |||
| } else { | |||
| ptr = MallocDynamicMem(size, communication_mem); | |||
| } | |||
| return ptr; | |||
| } | |||
| if (flag == kStaticMem) { | |||
| ptr = MallocStaticMem(size, false); | |||
| } else if (flag == kDynamicMem) { | |||
| ptr = MallocDynamicMem(size, false); | |||
| } else if (flag == kReuseDynamicMem) { | |||
| ptr = mem_reuse_util_ptr_->GetNodeOutputPtr(node, index); | |||
| } | |||
| return ptr; | |||
| } | |||
| uint8_t *MemoryManager::MallocWorkSpaceMem(const AnfNodePtr &node, size_t index, int flag, size_t size) { | |||
| if (flag == kReuseDynamicMem) { | |||
| return mem_reuse_util_ptr_->GetNodeWorkSpacePtr(node, index); | |||
| } | |||
| return MallocDynamicMem(size, false); | |||
| } | |||
| uint8_t *MemoryManager::MallocMem(int flag, size_t size) { | |||
| uint8_t *ptr = nullptr; | |||
| if (flag == kStaticMem) { | |||
| ptr = MallocStaticMem(size, false); | |||
| } else if (flag == kDynamicMem) { | |||
| ptr = MallocDynamicMem(size, false); | |||
| } | |||
| return ptr; | |||
| } | |||
| uint8_t *MemoryManager::MallocStaticMem(size_t size, bool communication_mem) { | |||
| size_t align_size = 0; | |||
| if (communication_mem) { | |||
| align_size = GetCommunicationAlignSize(size); | |||
| } else { | |||
| align_size = GetCommonAlignSize(size); | |||
| } | |||
| if (static_mem_offset_ < align_size) { | |||
| MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_ | |||
| << "] static[" << total_static_size_ << "])" | |||
| << " malloc [" << align_size << "] failed!"; | |||
| } | |||
| total_static_size_ += align_size; | |||
| auto offset = static_mem_offset_ - align_size; | |||
| if (dynamic_mem_offset_ > offset) { | |||
| MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_ | |||
| << "] static[" << total_static_size_ << "])" | |||
| << " malloc [" << align_size << "] failed!"; | |||
| } | |||
| static_mem_offset_ = offset; | |||
| if (communication_mem) { | |||
| return device_mem_base_ + offset + kMemAlignSize; | |||
| } else { | |||
| return device_mem_base_ + offset; | |||
| } | |||
| } | |||
| uint8_t *MemoryManager::MallocDynamicMem(size_t size, bool communication_mem) { | |||
| size_t align_size = 0; | |||
| if (communication_mem) { | |||
| align_size = GetCommunicationAlignSize(size); | |||
| } else { | |||
| align_size = GetCommonAlignSize(size); | |||
| } | |||
| uint64_t offset = dynamic_mem_offset_; | |||
| auto new_offset = dynamic_mem_offset_ + align_size; | |||
| if (new_offset > static_mem_offset_) { | |||
| MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_ | |||
| << "] static[" << total_static_size_ << "])" | |||
| << " malloc [" << align_size << "] failed!"; | |||
| } | |||
| total_dynamic_size_ += align_size; | |||
| dynamic_mem_offset_ = new_offset; | |||
| if (communication_mem) { | |||
| return device_mem_base_ + offset + kMemAlignSize; | |||
| } else { | |||
| return device_mem_base_ + offset; | |||
| } | |||
| } | |||
| void MemoryManager::MallocMemFromMemPool(const DeviceAddressPtr address, size_t size) { | |||
| auto device_ptr = MallocMemFromMemPool(size); | |||
| MS_EXCEPTION_IF_NULL(device_ptr); | |||
| address->ptr_ = device_ptr; | |||
| address->from_mem_pool_ = true; | |||
| } | |||
| void *MemoryManager::MallocMemFromMemPool(size_t size) { | |||
| if (size == 0) { | |||
| MS_LOG(ERROR) << "MallocMemFromMemPool size is 0."; | |||
| } | |||
| return nullptr; | |||
| } | |||
| void MemoryManager::FreeMemFromMemPool(void *device_ptr) { | |||
| if (device_ptr == nullptr) { | |||
| MS_LOG(ERROR) << "FreeMemFromMemPool device_ptr is null."; | |||
| } | |||
| } | |||
| } // namespace device | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,68 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_MINDSPORE_CCSRC_DEVICE_MEMORY_MANAGER_H_ | |||
| #define MINDSPORE_MINDSPORE_CCSRC_DEVICE_MEMORY_MANAGER_H_ | |||
| #include <memory> | |||
| #include "pre_activate/mem_reuse/mem_reuse.h" | |||
| #include "pre_activate/mem_reuse/mem_reuse_allocator.h" | |||
| namespace mindspore { | |||
| namespace device { | |||
| const int kStaticMem = 0; | |||
| const int kDynamicMem = 1; | |||
| const int kReuseDynamicMem = 2; | |||
| const int kGetAllOuts = -1; | |||
| const uint64_t kMemAlignSize = 512; | |||
| using MemReuseUtilPtr = mindspore::memreuse::MemReuseUtilPtr; | |||
| class MemoryManager { | |||
| public: | |||
| MemoryManager() = default; | |||
| virtual ~MemoryManager() = default; | |||
| virtual void MallocDeviceMemory() = 0; | |||
| virtual void FreeDeviceMemory() = 0; | |||
| void ResetDynamicMemory() { | |||
| total_dynamic_size_ = 0; | |||
| dynamic_mem_offset_ = 0; | |||
| } | |||
| void MallocReusedDynamicMem(session::KernelGraph *graph); | |||
| uint8_t *MallocOutputMem(const AnfNodePtr &node, size_t index, int flag, size_t size); | |||
| uint8_t *MallocWorkSpaceMem(const AnfNodePtr &node, size_t index, int flag, size_t size); | |||
| virtual uint8_t *MallocMem(int flag, size_t size); | |||
| virtual void MallocMemFromMemPool(const DeviceAddressPtr address, size_t size); | |||
| virtual void *MallocMemFromMemPool(size_t size); | |||
| virtual void FreeMemFromMemPool(void *device_ptr); | |||
| size_t GetCommonAlignSize(size_t input_size) const; | |||
| size_t GetCommunicationAlignSize(size_t input_size) const; | |||
| protected: | |||
| virtual uint8_t *MallocStaticMem(size_t size, bool communication_mem); | |||
| virtual uint8_t *MallocDynamicMem(size_t size, bool communication_mem); | |||
| uint8_t *device_mem_base_{nullptr}; | |||
| uint64_t device_mem_size_{0}; | |||
| uint64_t dynamic_mem_offset_{0}; | |||
| uint64_t static_mem_offset_{0}; | |||
| size_t total_static_size_ = 0; | |||
| size_t total_dynamic_size_ = 0; | |||
| MemReuseUtilPtr mem_reuse_util_ptr_{nullptr}; | |||
| }; | |||
| } // namespace device | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_MINDSPORE_CCSRC_DEVICE_MEMORY_MANAGER_H_ | |||
| @@ -39,45 +39,7 @@ namespace mindspore { | |||
| namespace kernel { | |||
| using FNodeAttrHandle = std::function<void(const std::shared_ptr<AnfNode> &anf_node, mindspore::NodeDef *proto)>; | |||
| const std::vector<std::string> local_framework_op_vec = {kInitDataSetQueue, kGetNext, kDropoutGenMask, kPrint}; | |||
| void InitDataSetQueueAttr(const std::shared_ptr<AnfNode> &anf_node, mindspore::NodeDef *proto) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| MS_EXCEPTION_IF_NULL(proto); | |||
| ::google::protobuf::Map<::std::string, ::mindspore::AttrValue> *node_attr = proto->mutable_attrs(); | |||
| MS_EXCEPTION_IF_NULL(node_attr); | |||
| std::string channel_name = AnfAlgo::GetNodeAttr<std::string>(anf_node, kQueueName); | |||
| (*node_attr)[kChannelName].set_s(channel_name); | |||
| } | |||
| void GetNextAttr(const std::shared_ptr<AnfNode> &anf_node, mindspore::NodeDef *proto) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| MS_EXCEPTION_IF_NULL(proto); | |||
| ::google::protobuf::Map<::std::string, ::mindspore::AttrValue> *node_attr = proto->mutable_attrs(); | |||
| MS_EXCEPTION_IF_NULL(node_attr); | |||
| std::string shared_name = AnfAlgo::GetNodeAttr<std::string>(anf_node, kSharedName); | |||
| (*node_attr)[kChannelName].set_s(shared_name); | |||
| } | |||
| void DropoutGenMaskAttr(const std::shared_ptr<AnfNode> &anf_node, mindspore::NodeDef *proto) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| MS_EXCEPTION_IF_NULL(proto); | |||
| ::google::protobuf::Map<::std::string, ::mindspore::AttrValue> *node_attr = proto->mutable_attrs(); | |||
| MS_EXCEPTION_IF_NULL(node_attr); | |||
| int seed = AnfAlgo::GetNodeAttr<int>(anf_node, kSeed); | |||
| int seed2 = AnfAlgo::GetNodeAttr<int>(anf_node, kSeed2); | |||
| (*node_attr)["seed"].set_i(seed); | |||
| (*node_attr)["seed2"].set_i(seed2); | |||
| } | |||
| void CreateAttrFuncMap(std::map<std::string, FNodeAttrHandle> *mOpAttrFuncMap) { | |||
| (void)mOpAttrFuncMap->emplace(std::pair<std::string, FNodeAttrHandle>(kInitDataSetQueue, InitDataSetQueueAttr)); | |||
| (void)mOpAttrFuncMap->emplace(std::pair<std::string, FNodeAttrHandle>(kGetNext, GetNextAttr)); | |||
| (void)mOpAttrFuncMap->emplace(std::pair<std::string, FNodeAttrHandle>(kDropoutGenMask, DropoutGenMaskAttr)); | |||
| } | |||
| const std::vector<std::string> local_framework_op_vec = {kInitData, kGetNext, kDropoutGenMask, kPrint}; | |||
| bool SetIOIputSize(const std::shared_ptr<AnfNode> &anf_node, const size_t &input_num, | |||
| std::vector<size_t> *input_size_list) { | |||
| @@ -147,24 +109,74 @@ bool SetIOSize(const std::shared_ptr<AnfNode> &anf_node, const std::shared_ptr<A | |||
| return true; | |||
| } | |||
| void ParseAttrValue(const std::string &type, const std::string &attr_name, const mindspore::ValuePtr &value, | |||
| ::google::protobuf::Map<::std::string, ::mindspore::AttrValue> *node_attr) { | |||
| MS_EXCEPTION_IF_NULL(node_attr); | |||
| if (type == "int") { | |||
| auto attr_value = GetValue<int>(value); | |||
| (*node_attr)[attr_name].set_i(attr_value); | |||
| } else if (type == "str") { | |||
| auto attr_value = GetValue<std::string>(value); | |||
| (*node_attr)[attr_name].set_s(attr_value); | |||
| } else if (type == "bool") { | |||
| auto attr_value = GetValue<bool>(value); | |||
| (*node_attr)[attr_name].set_b(attr_value); | |||
| } else if (type == "float") { | |||
| auto attr_value = GetValue<float>(value); | |||
| (*node_attr)[attr_name].set_f(attr_value); | |||
| } else if (type == "listInt") { | |||
| std::vector<int> attr_value; | |||
| auto value_type = value->type(); | |||
| MS_EXCEPTION_IF_NULL(value_type); | |||
| auto value_type_str = value_type->ToString(); | |||
| if (value_type_str == "Int32") { | |||
| int data = GetValue<int>(value); | |||
| attr_value.push_back(data); | |||
| } else { | |||
| attr_value = GetValue<std::vector<int>>(value); | |||
| } | |||
| mindspore::AttrValue input_shape_attr; | |||
| mindspore::AttrValue_ArrayValue *input_shape_attr_list = input_shape_attr.mutable_array(); | |||
| MS_EXCEPTION_IF_NULL(input_shape_attr_list); | |||
| for (const auto shape : attr_value) { | |||
| input_shape_attr_list->add_i(shape); | |||
| } | |||
| (*node_attr)[attr_name] = input_shape_attr; | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "type: " << type << "not support"; | |||
| } | |||
| } | |||
| void SetNodeAttr(const std::shared_ptr<AnfNode> &anf_node, mindspore::NodeDef *proto) { | |||
| std::string op_name = AnfAlgo::GetCNodeName(anf_node); | |||
| if (op_name == "InitDataSetQueue") { | |||
| op_name = "InitData"; | |||
| if (op_name == kInitDataSetQueue) { | |||
| op_name = kInitData; | |||
| } | |||
| if (op_name == "Print") { | |||
| if (op_name == kPrint) { | |||
| return; | |||
| } | |||
| std::map<std::string, FNodeAttrHandle> mOpAttrFuncMap; | |||
| CreateAttrFuncMap(&mOpAttrFuncMap); | |||
| FNodeAttrHandle func_ptr = nullptr; | |||
| auto iter = mOpAttrFuncMap.find(op_name); | |||
| if (iter != mOpAttrFuncMap.end()) { | |||
| func_ptr = iter->second; | |||
| MS_EXCEPTION_IF_NULL(func_ptr); | |||
| func_ptr(anf_node, proto); | |||
| } else { | |||
| MS_LOG(ERROR) << "Don't support node [" << op_name << "] to set nodedef of attr"; | |||
| auto op_info_ptr = mindspore::kernel::OpLib::FindOp(op_name, OpImplyType::kAICPU); | |||
| MS_EXCEPTION_IF_NULL(op_info_ptr); | |||
| auto attrs_ptr = op_info_ptr->attrs_ptr(); | |||
| auto primitive = AnfAlgo::GetCNodePrimitive(anf_node); | |||
| MS_EXCEPTION_IF_NULL(primitive); | |||
| ::google::protobuf::Map<::std::string, ::mindspore::AttrValue> *node_attr = proto->mutable_attrs(); | |||
| for (const auto &attr_ptr : attrs_ptr) { | |||
| std::string attr_name = attr_ptr->name(); | |||
| std::string real_name; | |||
| auto value = primitive->GetAttr(attr_name); | |||
| if (value != nullptr) { | |||
| if (attr_name == kQueueName || attr_name == kSharedName) { | |||
| real_name = kChannelName; | |||
| } else if (attr_name == kSeed) { | |||
| real_name = "seed"; | |||
| } else if (attr_name == kSeed2) { | |||
| real_name = "seed2"; | |||
| } | |||
| std::string type = attr_ptr->type(); | |||
| ParseAttrValue(type, real_name, value, node_attr); | |||
| } | |||
| } | |||
| MS_LOG(INFO) << "Set node attr end!"; | |||
| } | |||
| @@ -17,68 +17,27 @@ | |||
| #include "kernel/aicpu/aicpu_kernel_metadata.h" | |||
| #include <memory> | |||
| #include <string> | |||
| #include "kernel/oplib/oplib.h" | |||
| #include "kernel/common_utils.h" | |||
| #include "kernel/aicpu/aicpu_util.h" | |||
| #include "session/anf_runtime_algorithm.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr auto kInitDataSetQueueOpName = "InitDataSetQueue"; | |||
| constexpr auto kGetNext = "GetNext"; | |||
| constexpr auto kDropoutGenMask = "DropoutGenMask"; | |||
| constexpr auto kPrint = "Print"; | |||
| const std::vector<std::string> AICPU_OPS = {kInitDataSetQueueOpName, kGetNext, kDropoutGenMask, kPrint}; | |||
| std::shared_ptr<KernelBuildInfo> CreateKernelInfo(const std::vector<std::string> &inputs_format, | |||
| const std::vector<TypeId> &inputs_device_type, | |||
| const std::vector<std::string> &outputs_format, | |||
| const std::vector<TypeId> &outputs_device_type) { | |||
| auto builder = KernelBuildInfo::KernelBuildInfoBuilder(); | |||
| builder.SetInputsFormat(inputs_format); | |||
| builder.SetInputsDeviceType(inputs_device_type); | |||
| builder.SetOutputsFormat(outputs_format); | |||
| builder.SetOutputsDeviceType(outputs_device_type); | |||
| builder.SetProcessor(AICPU); | |||
| builder.SetKernelType(AICPU_KERNEL); | |||
| builder.SetFusionType(OPAQUE); | |||
| return builder.Build(); | |||
| } | |||
| bool CheckIfExistAicpuMeta(const std::string &op_name) { | |||
| if (std::find(AICPU_OPS.begin(), AICPU_OPS.end(), op_name) != AICPU_OPS.end()) { | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| void AicpuMetadataInfo(const CNodePtr &kernel_node, std::vector<std::shared_ptr<KernelBuildInfo>> *kernel_info_list) { | |||
| MS_LOG(INFO) << "AicpuMetadataInfo."; | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| MS_EXCEPTION_IF_NULL(kernel_info_list); | |||
| std::string op_name = AnfAlgo::GetCNodeName(kernel_node); | |||
| if (CheckIfExistAicpuMeta(op_name)) { | |||
| MS_LOG(DEBUG) << "Aicpu doesn't have metadata of op [" << op_name << "]."; | |||
| return; | |||
| } | |||
| if (op_name == kInitDataSetQueueOpName) { | |||
| kernel_info_list->push_back(CreateKernelInfo({}, {}, {}, {})); | |||
| if (op_name == kInitDataSetQueue) { | |||
| op_name = kInitData; | |||
| } | |||
| if (op_name == kGetNext) { | |||
| std::vector<std::string> outputs_format; | |||
| std::vector<TypeId> outputs_type; | |||
| for (size_t output_index = 0; output_index < AnfAlgo::GetOutputTensorNum(kernel_node); ++output_index) { | |||
| outputs_format.emplace_back(kOpFormat_DEFAULT); | |||
| outputs_type.push_back(AnfAlgo::GetOutputInferDataType(kernel_node, output_index)); | |||
| } | |||
| kernel_info_list->push_back(CreateKernelInfo({}, {}, outputs_format, outputs_type)); | |||
| } | |||
| if (op_name == kDropoutGenMask) { | |||
| kernel_info_list->push_back(CreateKernelInfo({kOpFormat_NCHW, kOpFormat_NCHW}, | |||
| {kInt32->type_id(), kFloat16->type_id()}, {kOpFormat_NCHW}, | |||
| {kUInt8->type_id()})); | |||
| auto op_info_ptr = mindspore::kernel::OpLib::FindOp(op_name, OpImplyType::kAICPU); | |||
| if (op_info_ptr == nullptr) { | |||
| MS_LOG(WARNING) << "Aicpu doestn't have metadata of op [" << op_name << "]"; | |||
| return; | |||
| } | |||
| // For compatibility with the current framework | |||
| if (op_name == kPrint) { | |||
| std::vector<std::string> inputs_format; | |||
| std::vector<TypeId> inputs_type; | |||
| @@ -92,11 +51,20 @@ void AicpuMetadataInfo(const CNodePtr &kernel_node, std::vector<std::shared_ptr< | |||
| outputs_format.emplace_back(kOpFormat_DEFAULT); | |||
| outputs_type.push_back(AnfAlgo::GetOutputInferDataType(kernel_node, output_index)); | |||
| } | |||
| kernel_info_list->push_back(CreateKernelInfo(inputs_format, inputs_type, outputs_format, outputs_type)); | |||
| auto builder = KernelBuildInfo::KernelBuildInfoBuilder(); | |||
| builder.SetInputsFormat(inputs_format); | |||
| builder.SetInputsDeviceType(inputs_type); | |||
| builder.SetOutputsFormat(outputs_format); | |||
| builder.SetOutputsDeviceType(outputs_type); | |||
| builder.SetProcessor(AICPU); | |||
| builder.SetKernelType(AICPU_KERNEL); | |||
| builder.SetFusionType(OPAQUE); | |||
| kernel_info_list->push_back(builder.Build()); | |||
| return; | |||
| } | |||
| if (kernel_info_list->empty()) { | |||
| MS_LOG(INFO) << "Aicpu dose not has metadata of op[ " << op_name << "]."; | |||
| if (!ParseMetadata(kernel_node, op_info_ptr, AICPU, kernel_info_list)) { | |||
| MS_LOG(WARNING) << "Aicpu parsed metadata op [" << op_name << "] failed"; | |||
| return; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| @@ -24,7 +24,8 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr auto kInitDataSetQueue = "InitData"; | |||
| constexpr auto kInitDataSetQueue = "InitDataSetQueue"; | |||
| constexpr auto kInitData = "InitData"; | |||
| constexpr auto kGetNext = "GetNext"; | |||
| constexpr auto kDropoutGenMask = "DropoutGenMask"; | |||
| constexpr auto kPrint = "Print"; | |||
| @@ -417,6 +417,8 @@ void SetKernelBuildInfo(const std::shared_ptr<KernelBuildInfo::KernelBuildInfoBu | |||
| if (imply_type == kAKG) { | |||
| builder->SetKernelType(AUTO_DIFF_KERNEL); | |||
| } else if (imply_type == kAICPU) { | |||
| builder->SetKernelType(AICPU_KERNEL); | |||
| } else { | |||
| builder->SetKernelType(TBE_KERNEL); | |||
| } | |||
| @@ -471,6 +473,13 @@ bool ParseMetadata(const CNodePtr &kernel_node, const std::shared_ptr<const OpIn | |||
| return false; | |||
| } | |||
| kernel_info_list->push_back(builder->Build()); | |||
| } | |||
| } else { | |||
| if (processor == AICPU) { | |||
| auto builder = std::make_shared<KernelBuildInfo::KernelBuildInfoBuilder>(); | |||
| MS_EXCEPTION_IF_NULL(builder); | |||
| SetKernelBuildInfo(builder, processor, op_info_ptr); | |||
| kernel_info_list->push_back(builder->Build()); | |||
| } | |||
| } | |||
| @@ -23,7 +23,6 @@ | |||
| #include <vector> | |||
| #include "kernel/gpu/gpu_kernel.h" | |||
| #include "kernel/gpu/gpu_kernel_factory.h" | |||
| #include "dataset/util/make_unique.h" | |||
| #include "kernel/gpu/kernel_constants.h" | |||
| namespace mindspore { | |||
| @@ -74,8 +73,8 @@ class BiasAddGpuKernel : public GpuKernel { | |||
| // Expand to 4 dims for cudnnSetTensorNdDescriptorEx. | |||
| auto cudnn_dims = std::max(num_dims, 4UL); | |||
| std::unique_ptr<int[]> x_dims = mindspore::make_unique<int[]>(cudnn_dims); | |||
| std::unique_ptr<int[]> b_dims = mindspore::make_unique<int[]>(cudnn_dims); | |||
| std::unique_ptr<int[]> x_dims = std::make_unique<int[]>(cudnn_dims); | |||
| std::unique_ptr<int[]> b_dims = std::make_unique<int[]>(cudnn_dims); | |||
| for (size_t i = 0; i < cudnn_dims; i++) { | |||
| x_dims[i] = (i < num_dims) ? SizeToInt(x_shape[i]) : 1; | |||
| b_dims[i] = (i == pos) ? SizeToInt(x_shape[i]) : 1; | |||
| @@ -26,7 +26,6 @@ | |||
| #include "kernel/gpu/gpu_kernel.h" | |||
| #include "kernel/gpu/gpu_kernel_factory.h" | |||
| #include "kernel/gpu/kernel_constants.h" | |||
| #include "dataset/util/make_unique.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| @@ -84,8 +83,8 @@ class BiasAddGradGpuKernel : public GpuKernel { | |||
| // Expand to 4 dims for cudnnSetTensorNdDescriptorEx. | |||
| auto cudnn_dims = std::max(num_dims, 4UL); | |||
| std::unique_ptr<int[]> dy_dims = mindspore::make_unique<int[]>(cudnn_dims); | |||
| std::unique_ptr<int[]> db_dims = mindspore::make_unique<int[]>(cudnn_dims); | |||
| std::unique_ptr<int[]> dy_dims = std::make_unique<int[]>(cudnn_dims); | |||
| std::unique_ptr<int[]> db_dims = std::make_unique<int[]>(cudnn_dims); | |||
| for (size_t i = 0; i < cudnn_dims; i++) { | |||
| dy_dims[i] = (i < num_dims) ? SizeToInt(dy_shape[i]) : 1; | |||
| db_dims[i] = (i == pos) ? SizeToInt(dy_shape[i]) : 1; | |||
| @@ -22,7 +22,6 @@ | |||
| #include <memory> | |||
| #include "kernel/gpu/gpu_kernel.h" | |||
| #include "kernel/gpu/gpu_kernel_factory.h" | |||
| #include "dataset/util/make_unique.h" | |||
| #include "kernel/gpu/kernel_constants.h" | |||
| namespace mindspore { | |||
| @@ -144,8 +143,8 @@ class LstmGpuKernel : public GpuKernel { | |||
| int x_dims[3]{batch_size_, input_size_, 1}; | |||
| int y_dims[3]{batch_size_, hidden_size_ * (bidirectional_ ? 2 : 1), 1}; | |||
| x_desc_ = mindspore::make_unique<cudnnTensorDescriptor_t[]>(seq_len_); | |||
| y_desc_ = mindspore::make_unique<cudnnTensorDescriptor_t[]>(seq_len_); | |||
| x_desc_ = std::make_unique<cudnnTensorDescriptor_t[]>(seq_len_); | |||
| y_desc_ = std::make_unique<cudnnTensorDescriptor_t[]>(seq_len_); | |||
| for (size_t i = 0; i < IntToSize(seq_len_); ++i) { | |||
| CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&x_desc_[i]), "create x_desc failed"); | |||
| @@ -23,7 +23,6 @@ | |||
| #include "kernel/gpu/gpu_kernel.h" | |||
| #include "kernel/gpu/gpu_kernel_factory.h" | |||
| #include "kernel/gpu/kernel_constants.h" | |||
| #include "dataset/util/make_unique.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| @@ -212,9 +211,9 @@ class LstmGradDataGpuKernel : public GpuKernel { | |||
| int x_dims[3]{batch_size_, input_size_, 1}; | |||
| int y_dims[3]{batch_size_, hidden_size_ * (bidirectional_ ? 2 : 1), 1}; | |||
| dx_desc_ = mindspore::make_unique<cudnnTensorDescriptor_t[]>(seq_len_); | |||
| y_desc_ = mindspore::make_unique<cudnnTensorDescriptor_t[]>(seq_len_); | |||
| dy_desc_ = mindspore::make_unique<cudnnTensorDescriptor_t[]>(seq_len_); | |||
| dx_desc_ = std::make_unique<cudnnTensorDescriptor_t[]>(seq_len_); | |||
| y_desc_ = std::make_unique<cudnnTensorDescriptor_t[]>(seq_len_); | |||
| dy_desc_ = std::make_unique<cudnnTensorDescriptor_t[]>(seq_len_); | |||
| for (size_t i = 0; i < IntToSize(seq_len_); ++i) { | |||
| CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dx_desc_[i]), "create x_desc failed"); | |||
| @@ -22,7 +22,6 @@ | |||
| #include <memory> | |||
| #include "kernel/gpu/gpu_kernel.h" | |||
| #include "kernel/gpu/gpu_kernel_factory.h" | |||
| #include "dataset/util/make_unique.h" | |||
| #include "kernel/gpu/kernel_constants.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| @@ -169,8 +168,8 @@ class LstmGradWeightGpuKernel : public GpuKernel { | |||
| int x_dims[3]{batch_size_, input_size_, 1}; | |||
| int y_dims[3]{batch_size_, hidden_size_ * (bidirectional_ ? 2 : 1), 1}; | |||
| x_desc_ = mindspore::make_unique<cudnnTensorDescriptor_t[]>(seq_len_); | |||
| y_desc_ = mindspore::make_unique<cudnnTensorDescriptor_t[]>(seq_len_); | |||
| x_desc_ = std::make_unique<cudnnTensorDescriptor_t[]>(seq_len_); | |||
| y_desc_ = std::make_unique<cudnnTensorDescriptor_t[]>(seq_len_); | |||
| for (size_t i = 0; i < IntToSize(seq_len_); ++i) { | |||
| CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&x_desc_[i]), "create x_desc failed"); | |||
| @@ -24,7 +24,7 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| enum OpImplyType { kAKG = 0, kTBE }; | |||
| enum OpImplyType { kAKG = 0, kTBE = 1, kAICPU }; | |||
| enum OpIOType { kInput = 0, kOutput }; | |||
| class OpAttr { | |||
| @@ -39,6 +39,7 @@ constexpr auto kDtypeFormat = "dtype_format"; | |||
| constexpr auto kAttr = "attr"; | |||
| constexpr auto kIputs = "inputs"; | |||
| constexpr auto kOutputs = "outputs"; | |||
| constexpr auto kAiCPU = "AiCPU"; | |||
| constexpr auto kTbe = "TBE"; | |||
| constexpr auto kAkg = "akg"; | |||
| constexpr auto kAutodiff = "AutoDiff"; | |||
| @@ -60,6 +61,8 @@ std::string ImplTypeToStr(OpImplyType impl_type) { | |||
| return kTbe; | |||
| case kAKG: | |||
| return kAkg; | |||
| case kAICPU: | |||
| return kAiCPU; | |||
| default: | |||
| return "unknow"; | |||
| } | |||
| @@ -76,6 +79,9 @@ bool OpLib::RegOp(const std::string& json_string, const std::string& impl_path) | |||
| } else if (imply_type_string == kAutodiff) { | |||
| OpImplyType imply_type = kAKG; | |||
| ret = DecodeOpInfo(op_json, imply_type, impl_path); | |||
| } else if (imply_type_string == kAiCPU) { | |||
| OpImplyType imply_type = kAICPU; | |||
| ret = DecodeOpInfo(op_json, imply_type, impl_path); | |||
| } else { | |||
| MS_LOG(DEBUG) << "Not support imply_type"; | |||
| } | |||
| @@ -154,7 +160,9 @@ bool OpLib::DecodeAttr(const nlohmann::json& obj, const OpImplyType imply_type, | |||
| std::shared_ptr<OpAttr> op_attr = std::make_shared<OpAttr>(); | |||
| MS_EXCEPTION_IF_NULL(op_attr); | |||
| op_attr->set_name(obj.at(kName)); | |||
| op_attr->set_param_type(obj.at(kParamType)); | |||
| if (imply_type != kAICPU) { | |||
| op_attr->set_param_type(obj.at(kParamType)); | |||
| } | |||
| op_attr->set_type(obj.at(kType)); | |||
| if (imply_type == kTBE) { | |||
| op_attr->set_value(obj.at(kValue)); | |||
| @@ -242,9 +250,10 @@ std::shared_ptr<OpInfo> OpLib::FindOp(const std::string& op_name, OpImplyType im | |||
| auto context = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context); | |||
| bool is_gpu = (context->device_target() == kGPUDevice); | |||
| if ((is_gpu && imply_type == kTBE) || (!is_gpu && imply_type != kTBE)) { | |||
| MS_LOG(DEBUG) << "FindOp failed: opname:" << op_name << "imply_type:" << ImplTypeToStr(imply_type) | |||
| << "current op num:" << op_info_.size(); | |||
| if ((is_gpu && (imply_type == kTBE || imply_type == kAICPU)) || | |||
| (!is_gpu && (imply_type != kTBE && imply_type != kAICPU))) { | |||
| MS_LOG(ERROR) << "FindOp failed: opname:" << op_name << ", imply_type:" << ImplTypeToStr(imply_type) | |||
| << ", current op num:" << op_info_.size(); | |||
| return nullptr; | |||
| } | |||
| for (const auto& op_info : op_info_) { | |||
| @@ -253,8 +262,8 @@ std::shared_ptr<OpInfo> OpLib::FindOp(const std::string& op_name, OpImplyType im | |||
| return op_info; | |||
| } | |||
| } | |||
| MS_LOG(DEBUG) << "FindOp failed: opname:" << op_name << "imply_type:" << ImplTypeToStr(imply_type) | |||
| << "current op num:" << op_info_.size(); | |||
| MS_LOG(DEBUG) << "FindOp failed: opname:" << op_name << ", imply_type:" << ImplTypeToStr(imply_type) | |||
| << ", current op num:" << op_info_.size(); | |||
| return nullptr; | |||
| } | |||
| @@ -0,0 +1,178 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "mindrecord/include/shard_error.h" | |||
| namespace mindspore { | |||
| namespace mindrecord { | |||
| std::string ErrnoToMessage(MSRStatus status) { | |||
| switch (status) { | |||
| case FAILED: | |||
| return "operator failed"; | |||
| break; | |||
| case SUCCESS: | |||
| return "operator success"; | |||
| break; | |||
| case OPEN_FILE_FAILED: | |||
| return "open file failed"; | |||
| break; | |||
| case CLOSE_FILE_FAILED: | |||
| return "close file failed"; | |||
| break; | |||
| case WRITE_METADATA_FAILED: | |||
| return "write metadata failed"; | |||
| break; | |||
| case WRITE_RAWDATA_FAILED: | |||
| return "write rawdata failed"; | |||
| break; | |||
| case GET_SCHEMA_FAILED: | |||
| return "get schema failed"; | |||
| break; | |||
| case ILLEGAL_RAWDATA: | |||
| return "illegal raw data"; | |||
| break; | |||
| case PYTHON_TO_JSON_FAILED: | |||
| return "pybind: python object to json failed"; | |||
| break; | |||
| case DIR_CREATE_FAILED: | |||
| return "directory create failed"; | |||
| break; | |||
| case OPEN_DIR_FAILED: | |||
| return "open directory failed"; | |||
| break; | |||
| case INVALID_STATISTICS: | |||
| return "invalid statistics object"; | |||
| break; | |||
| case OPEN_DATABASE_FAILED: | |||
| return "open database failed"; | |||
| break; | |||
| case CLOSE_DATABASE_FAILED: | |||
| return "close database failed"; | |||
| break; | |||
| case DATABASE_OPERATE_FAILED: | |||
| return "database operate failed"; | |||
| break; | |||
| case BUILD_SCHEMA_FAILED: | |||
| return "build schema failed"; | |||
| break; | |||
| case DIVISOR_IS_ILLEGAL: | |||
| return "divisor is illegal"; | |||
| break; | |||
| case INVALID_FILE_PATH: | |||
| return "file path is invalid"; | |||
| break; | |||
| case SECURE_FUNC_FAILED: | |||
| return "secure function failed"; | |||
| break; | |||
| case ALLOCATE_MEM_FAILED: | |||
| return "allocate memory failed"; | |||
| break; | |||
| case ILLEGAL_FIELD_NAME: | |||
| return "illegal field name"; | |||
| break; | |||
| case ILLEGAL_FIELD_TYPE: | |||
| return "illegal field type"; | |||
| break; | |||
| case SET_METADATA_FAILED: | |||
| return "set metadata failed"; | |||
| break; | |||
| case ILLEGAL_SCHEMA_DEFINITION: | |||
| return "illegal schema definition"; | |||
| break; | |||
| case ILLEGAL_COLUMN_LIST: | |||
| return "illegal column list"; | |||
| break; | |||
| case SQL_ERROR: | |||
| return "sql error"; | |||
| break; | |||
| case ILLEGAL_SHARD_COUNT: | |||
| return "illegal shard count"; | |||
| break; | |||
| case ILLEGAL_SCHEMA_COUNT: | |||
| return "illegal schema count"; | |||
| break; | |||
| case VERSION_ERROR: | |||
| return "data version is not matched"; | |||
| break; | |||
| case ADD_SCHEMA_FAILED: | |||
| return "add schema failed"; | |||
| break; | |||
| case ILLEGAL_Header_SIZE: | |||
| return "illegal header size"; | |||
| break; | |||
| case ILLEGAL_Page_SIZE: | |||
| return "illegal page size"; | |||
| break; | |||
| case ILLEGAL_SIZE_VALUE: | |||
| return "illegal size value"; | |||
| break; | |||
| case INDEX_FIELD_ERROR: | |||
| return "add index fields failed"; | |||
| break; | |||
| case GET_CANDIDATE_CATEGORYFIELDS_FAILED: | |||
| return "get candidate category fields failed"; | |||
| break; | |||
| case GET_CATEGORY_INFO_FAILED: | |||
| return "get category information failed"; | |||
| break; | |||
| case ILLEGAL_CATEGORY_ID: | |||
| return "illegal category id"; | |||
| break; | |||
| case ILLEGAL_ROWNUMBER_OF_PAGE: | |||
| return "illegal row number of page"; | |||
| break; | |||
| case ILLEGAL_SCHEMA_ID: | |||
| return "illegal schema id"; | |||
| break; | |||
| case DESERIALIZE_SCHEMA_FAILED: | |||
| return "deserialize schema failed"; | |||
| break; | |||
| case DESERIALIZE_STATISTICS_FAILED: | |||
| return "deserialize statistics failed"; | |||
| break; | |||
| case ILLEGAL_DB_FILE: | |||
| return "illegal db file"; | |||
| break; | |||
| case OVERWRITE_DB_FILE: | |||
| return "overwrite db file"; | |||
| break; | |||
| case OVERWRITE_MINDRECORD_FILE: | |||
| return "overwrite mindrecord file"; | |||
| break; | |||
| case ILLEGAL_MINDRECORD_FILE: | |||
| return "illegal mindrecord file"; | |||
| break; | |||
| case PARSE_JSON_FAILED: | |||
| return "parse json failed"; | |||
| break; | |||
| case ILLEGAL_PARAMETERS: | |||
| return "illegal parameters"; | |||
| break; | |||
| case GET_PAGE_BY_GROUP_ID_FAILED: | |||
| return "get page by group id failed"; | |||
| break; | |||
| case GET_SYSTEM_STATE_FAILED: | |||
| return "get system state failed"; | |||
| break; | |||
| case IO_FAILED: | |||
| return "io operate failed"; | |||
| break; | |||
| default: | |||
| return "invalid error no"; | |||
| } | |||
| } | |||
| } // namespace mindrecord | |||
| } // namespace mindspore | |||