@@ -15,4 +15,4 @@
  url = https://gitee.com/mindspore/akg.git
[submodule "graphengine"]
  path = graphengine
  url = https://gitee.com/ms-incubator/graphengine.git
  url = https://gitee.com/mindspore/graphengine.git
@@ -202,10 +202,10 @@ Check out how MindSpore Open Governance [works](https://gitee.com/mindspore/comm
### Communication
- [MindSpore Slack](https://join.slack.com/t/mindspore/shared_invite/enQtOTcwMTIxMDI3NjM0LTNkMWM2MzI5NjIyZWU5ZWQ5M2EwMTQ5MWNiYzMxOGM4OWFhZjI4M2E5OGI2YTg3ODU1ODE2Njg1MThiNWI3YmQ) - Communication platform for developers.
- [MindSpore Slack](https://join.slack.com/t/mindspore/shared_invite/zt-dgk65rli-3ex4xvS4wHX7UDmsQmfu8w) - Communication platform for developers.
- IRC channel at `#mindspore` (only for meeting minutes logging purposes)
- Video Conferencing: https://meet.jit.si
- Mailing-list: https://mailweb.mindspore.cn/postorius/lists
- Video Conferencing: TBD
- Mailing-list: <https://mailweb.mindspore.cn/postorius/lists>
## Contributing
@@ -1 +1 @@
Subproject commit df57a6cf9450e347d1854687d1fe66a420ee3b35
Subproject commit f60af9df4220bf3db5de2b224418953c0dc1f625
@@ -24,7 +24,7 @@ usage()
{
  echo "Usage:"
  echo "bash build.sh [-d] [-r] [-v] [-c on|off] [-t on|off] [-g on|off] [-h] [-b ge] [-m infer|train] \\"
  echo " [-a on|off] [-Q on|off] [-S on|off] [-p on|off] [-i] [-L] [-R] [-D on|off] [-j[n]] [-e gpu|d|cpu] \\"
  echo " [-a on|off] [-Q on|off] [-p on|off] [-i] [-L] [-R] [-D on|off] [-j[n]] [-e gpu|d|cpu] \\"
  echo " [-P on|off] [-z [on|off]] [-M on|off] [-V 9.2|10.1] [-I] [-K] [-B on|off] [-E] [-l on|off]"
  echo ""
  echo "Options:"
@@ -48,7 +48,6 @@ usage()
  echo " -P Enable dump anf graph to file in ProtoBuffer format, default on"
  echo " -Q Enable dump memory, default off"
  echo " -D Enable dumping of function graph ir, default on"
  echo " -S Enable async data dump, default off"
  echo " -z Compile dataset & mindrecord, default on"
  echo " -M Enable MPI and NCCL for GPU training, gpu default on"
  echo " -V Specify the minimum required cuda version, default CUDA 10.1"
@@ -89,7 +88,6 @@ checkopts()
  ENABLE_TIMELINE="off"
  ENABLE_DUMP2PROTO="on"
  ENABLE_DUMPE2E="off"
  ENABLE_DATA_DUMP="off"
  ENABLE_DUMP_IR="on"
  COMPILE_MINDDATA="on"
  ENABLE_MPI="off"
@@ -104,7 +102,7 @@ checkopts()
  ENABLE_PYTHON="on"
  # Process the options
  while getopts 'drvj:c:t:hsb:a:g:p:ie:m:l:I:LRP:Q:S:D:zM:V:K:sB:E' opt
  while getopts 'drvj:c:t:hsb:a:g:p:ie:m:l:I:LRP:Q:D:zM:V:K:sB:E' opt
  do
    OPTARG=$(echo ${OPTARG} | tr '[A-Z]' '[a-z]')
    case "${opt}" in
@@ -186,6 +184,7 @@ checkopts()
      elif [[ "X$OPTARG" == "Xd" || "X$OPTARG" == "Xascend" ]]; then
        ENABLE_D="on"
        ENABLE_CPU="on"
        ENABLE_SERVING="on"
      elif [[ "X$OPTARG" == "Xcpu" ]]; then
        ENABLE_CPU="on"
      else
@@ -220,11 +219,6 @@ checkopts()
        ENABLE_DUMPE2E="$OPTARG"
        echo "enable dump end to end"
        ;;
      S)
        check_on_off $OPTARG S
        ENABLE_DATA_DUMP="$OPTARG"
        echo "enable data dump"
        ;;
      D)
        check_on_off $OPTARG D
        ENABLE_DUMP_IR="$OPTARG"
@@ -328,9 +322,6 @@ build_mindspore()
  if [[ "X$ENABLE_DUMPE2E" = "Xon" ]]; then
    CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DUMP_E2E=ON"
  fi
  if [[ "X$ENABLE_DATA_DUMP" = "Xon" ]]; then
    CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DATA_DUMP=ON"
  fi
  CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_DUMP_IR=${ENABLE_DUMP_IR}"
  CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_PYTHON=${ENABLE_PYTHON}"
  if [[ "X$ENABLE_MPI" = "Xon" ]]; then
@@ -1,4 +1,4 @@
set(glog_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 ${SECURE_CXX_FLAGS}")
set(glog_CXXFLAGS "-D_FORTIFY_SOURCE=2 -O2 ${SECURE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
set(glog_CFLAGS "-D_FORTIFY_SOURCE=2 -O2")
mindspore_add_pkg(glog
    VER 0.4.0
@@ -116,10 +116,10 @@ if(ENABLE_DUMP_E2E)
    add_compile_definitions(ENABLE_DUMP_E2E)
endif()
if(ENABLE_DATA_DUMP)
    add_compile_definitions(ENABLE_DATA_DUMP)
endif()
if(ENABLE_DEBUGGER)
    add_compile_definitions(ENABLE_DEBUGGER)
endif()
if(ENABLE_TESTCASES)
    add_compile_definitions(ENABLE_TESTCASES)
endif()
@@ -1,13 +1,16 @@
# find exec
find_package(Python3 3.7 COMPONENTS Interpreter Development)
if (NOT Python3_FOUND)
    message("No python3 found.")
    return ()
    message(FATAL_ERROR "No python3 found.")
endif ()
set(PYTHON ${Python3_EXECUTABLE})
set(PYTHON_VERSION ${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR})
if (NOT PYTHON_VERSION MATCHES "3.7")
    message(FATAL_ERROR "FIND PYTHON VERSION ${PYTHON_VERSION} BUT CAN NOT MATCH PYTHON VERSION 3.7")
endif ()
find_package(Git)
if (NOT GIT_FOUND)
    message("No git found.")
@@ -1 +1 @@
Subproject commit eee707935c066c16e9b9cd207f8125871b6b97cf
Subproject commit 103f2d1019dc50d781d7a964551d9f1f50b3b009
@@ -17,7 +17,7 @@
"""Resources for ast tree parse."""
import ast
import math
from mindspore import IndexedSlices
from mindspore import IndexedSlices, SparseTensor
from mindspore.ops.composite import multitype_ops
from mindspore.ops import functional as F, composite as C
from . import standard_method as M
@@ -140,4 +140,5 @@ convert_object_map = {
    # user defined
    IndexedSlices: F.make_indexed_slices,
    SparseTensor: F.make_sparse_tensor,
}
@@ -44,7 +44,7 @@ if(ENABLE_GPU)
        "backend/kernel_compiler/akg/akg_kernel_attrs_process.cc"
    )
    list(APPEND CUDA_NVCC_FLAGS -arch=sm_53)
    list(APPEND CUDA_NVCC_FLAGS -arch=sm_53 --expt-relaxed-constexpr)
    list(REMOVE_ITEM GPU_SRC_LIST "runtime/device/gpu/blocking_queue.cc" "runtime/device/gpu/gpu_buffer_mgr.cc")
    list(REMOVE_ITEM GPU_SRC_LIST "runtime/device/gpu/mpi/mpi_initializer.cc"
                                  "runtime/device/gpu/distribution/collective_wrapper.cc"
@@ -26,14 +26,6 @@ if (ENABLE_CPU)
        "cpu/*.cc"
    )
    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/push_kernel.cc"
                                  "cpu/ps/pull_kernel.cc"
                                  "cpu/ps/embedding_look_up_ps_kernel.cc"
                                  "cpu/ps/embedding_look_up_proxy_kernel.cc"
                                  "cpu/ps/apply_momentum_ps_kernel.cc"
                                  "cpu/ps/sparse_apply_adam_ps_kernel.cc"
                                  "cpu/ps/sparse_apply_ftrl_ps_kernel.cc")
    if (NOT ENABLE_MPI)
        list(REMOVE_ITEM CPU_SRC_LIST "cpu/allgather_cpu_kernel.cc")
        list(REMOVE_ITEM CPU_SRC_LIST "cpu/reduce_scatter_cpu_kernel.cc")
@@ -41,6 +33,17 @@ if (ENABLE_CPU)
    endif ()
endif ()
if (${CMAKE_SYSTEM_NAME} MATCHES "Windows" OR ENABLE_GE)
    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/apply_momentum_ps_kernel.cc")
    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/embedding_look_up_proxy_kernel.cc")
    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/embedding_look_up_ps_kernel.cc")
    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/pserver_kernel.cc")
    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/pull_kernel.cc")
    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/push_kernel.cc")
    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/sparse_apply_adam_ps_kernel.cc")
    list(REMOVE_ITEM CPU_SRC_LIST "cpu/ps/sparse_apply_ftrl_ps_kernel.cc")
endif()
if (ENABLE_GPU)
    file(GLOB_RECURSE CUDA_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
        "gpu/*.cu"
@@ -18,6 +18,7 @@
#include <algorithm>
#include "backend/session/anf_runtime_algorithm.h"
#include "backend/optimizer/common/helper.h"
#include "backend/kernel_compiler/common_utils.h"
namespace mindspore {
namespace kernel {
@@ -75,15 +76,7 @@ void SetAkgAttrsForCast(const AnfNodePtr &anf_node) {
  std::string dst_type;
  TypeId output_type = AnfAlgo::GetOutputDeviceDataType(anf_node, 0);
  if (output_type == kFloat32->type_id()) {
    dst_type = "float32";
  } else if (output_type == kFloat16->type_id()) {
    dst_type = "float16";
  } else if (output_type == kInt32->type_id()) {
    dst_type = "int32";
  } else {
    MS_LOG(WARNING) << "Unknown cast_to type: " << TypeIdToType(output_type)->ToString();
  }
  dst_type = TypeId2String(output_type);
  AnfAlgo::SetNodeAttr("dst_type", MakeValue(dst_type), anf_node);
}
@@ -21,9 +21,7 @@
#include <memory>
#include "framework/ge_runtime/task_info.h"
#include "backend/kernel_compiler/kernel.h"
#ifdef ENABLE_DATA_DUMP
#include "debug/data_dump_parser.h"
#endif
using TaskInfoPtr = std::shared_ptr<ge::model_runner::TaskInfo>;
namespace mindspore {
@@ -34,13 +32,7 @@ class AscendKernelMod : public KernelMod {
                      const std::vector<AddressPtr> &, uint32_t) = 0;
  uint32_t block_dim() { return block_dim_; }
  uint32_t stream_id() { return stream_id_; }
  virtual bool NeedDump() {
#ifdef ENABLE_DATA_DUMP
    return DataDumpParser::GetInstance().NeedDump(kernel_name_);
#else
    return false;
#endif
  }
  virtual bool NeedDump() { return DataDumpParser::GetInstance().NeedDump(kernel_name_); }
 protected:
  uint32_t block_dim_{1};
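The NeedDump change above swaps a compile-time `#ifdef ENABLE_DATA_DUMP` gate for an unconditional runtime query of the `DataDumpParser` singleton, so a single binary can turn dumping on or off per kernel. A standalone C++ sketch of that pattern, using a hypothetical `DumpConfig` stand-in rather than the real `DataDumpParser` API:

// Sketch only: DumpConfig is a hypothetical stand-in for DataDumpParser.
#include <set>
#include <string>

class DumpConfig {
 public:
  static DumpConfig &GetInstance() {  // Meyers singleton, like GetInstance() above
    static DumpConfig instance;
    return instance;
  }
  void Enable(const std::string &kernel) { enabled_.insert(kernel); }
  bool NeedDump(const std::string &kernel) const { return enabled_.count(kernel) > 0; }

 private:
  std::set<std::string> enabled_;
};

bool NeedDump(const std::string &kernel_name) {
  // before: #ifdef ENABLE_DATA_DUMP ... #else return false; #endif
  return DumpConfig::GetInstance().NeedDump(kernel_name);
}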
@@ -20,6 +20,7 @@
#include <iostream>
#include <utility>
#include <fstream>
#include <algorithm>
#include <thread>
#include "nlohmann/json.hpp"
#include "backend/session/anf_runtime_algorithm.h"
@@ -499,235 +500,329 @@ int Sign(float x) {
  return 0;
}
void DeduplicateIndexedSlices(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
                              size_t outer_dim) {
  MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_);
  MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_);
  MS_EXCEPTION_IF_NULL(unique_grad);
  MS_EXCEPTION_IF_NULL(unique_grad->value_);
  MS_EXCEPTION_IF_NULL(unique_grad->indices_);
  std::unordered_map<int, size_t> index_map;
  size_t unique_indices_size = 0;
  for (size_t i = 0; i < origin_sparse_grad.indices_size_; ++i) {
    int index = origin_sparse_grad.indices_[i];
    if (index < 0 || IntToSize(index) >= first_dim) {
      continue;
    }
    auto iter = index_map.find(index);
    if (iter == index_map.end()) {
      index_map[index] = unique_indices_size;
      unique_grad->indices_[unique_indices_size] = index;
      size_t start_index = unique_indices_size * outer_dim;
      size_t end_index = start_index + outer_dim;
      for (size_t j = start_index, k = i * outer_dim; j < end_index; ++j, ++k) {
        unique_grad->value_[j] = origin_sparse_grad.value_[k];
      }
      unique_indices_size++;
    } else {
      size_t first_index = iter->second;
      size_t start_index = first_index * outer_dim;
      size_t end_index = start_index + outer_dim;
      for (size_t j = start_index, k = i * outer_dim; j < end_index; ++j, ++k) {
        unique_grad->value_[j] += origin_sparse_grad.value_[k];
      }
namespace {
struct BucketSparseGradient {
  float *value_;
  int *indices_;
  int *global_indices_;
  size_t indices_size_;
};
struct MultiThreadReduceSparseGradientParam {
  SparseGradient *input_grad_{nullptr};
  SparseGradient *workspace_grad_{nullptr};
  SparseGradient *output_grad_{nullptr};
  size_t max_index_{0};
  size_t value_stride_{0};
  size_t thread_num_{0};
  bool use_sort_reduce_{false};
};
void CalculateEachBucketSize(const std::shared_ptr<SparseGradient> &sparse_grad, size_t max_index,
                             std::vector<size_t> *each_bucket_size) {
  MS_LOG(DEBUG) << "Start";
  MS_EXCEPTION_IF_NULL(sparse_grad);
  MS_EXCEPTION_IF_NULL(sparse_grad->indices_);
  MS_EXCEPTION_IF_NULL(each_bucket_size);
  size_t bucket_num = each_bucket_size->size();
  for (size_t i = 0; i < sparse_grad->indices_size_; ++i) {
    int index = sparse_grad->indices_[i];
    if (index >= 0 && IntToSize(index) < max_index) {
      auto bucket_id = index % bucket_num;
      each_bucket_size->at(bucket_id)++;
    }
  }
  unique_grad->indices_size_ = unique_indices_size;
  MS_LOG(DEBUG) << "End";
}
struct WorkerParamsForReduceSparseGradient {
  size_t slice_start_{0};
  size_t slice_end_{0};
  size_t max_length_{0};
  size_t outer_dim_{0};
  std::vector<std::pair<int, size_t>> *sorted_indices_{nullptr};
  std::vector<size_t> *slice_positions_{nullptr};
  float *src_value_{nullptr};
  SparseGradient *unique_grad_{nullptr};
};
void SplitAndCalculateSegmentBucketSize(const MultiThreadReduceSparseGradientParam &param,
                                        std::vector<std::shared_ptr<SparseGradient>> *segments_ptr,
                                        std::vector<std::shared_ptr<std::vector<size_t>>> *segment_bucket_sizes_ptr) {
  MS_EXCEPTION_IF_NULL(param.input_grad_);
  MS_EXCEPTION_IF_NULL(segment_bucket_sizes_ptr);
  MS_EXCEPTION_IF_NULL(segments_ptr);
  auto &segments = *segments_ptr;
  auto &segment_bucket_sizes = *segment_bucket_sizes_ptr;
  auto input_grad = param.input_grad_;
  if (param.thread_num_ < 1) {
    MS_EXCEPTION(ArgumentError) << "Input param thread num must > 0!";
  }
  size_t thread_indices_size = input_grad->indices_size_ / param.thread_num_;
  size_t left_indices_size = input_grad->indices_size_ % param.thread_num_;
  std::vector<std::thread> threads;
  threads.reserve(param.thread_num_);
  segments.reserve(param.thread_num_);
void WorkerForReduceSparseGradient(WorkerParamsForReduceSparseGradient param) {
  MS_EXCEPTION_IF_NULL(param.sorted_indices_);
  MS_EXCEPTION_IF_NULL(param.slice_positions_);
  MS_EXCEPTION_IF_NULL(param.src_value_);
  MS_EXCEPTION_IF_NULL(param.unique_grad_);
  auto outer_dim = param.outer_dim_;
  auto &sorted_indices = *(param.sorted_indices_);
  auto &slice_positions = *(param.slice_positions_);
  auto unique_grad = param.unique_grad_;
  for (size_t slice_id = param.slice_start_; slice_id < param.slice_end_; ++slice_id) {
    size_t cur_pos = slice_positions[slice_id];
    int index = sorted_indices[cur_pos].first;
    unique_grad->indices_[slice_id] = index;
    size_t start_index = slice_id * outer_dim;
    auto ret_code = memcpy_s(unique_grad->value_ + start_index, (param.max_length_ - start_index) * sizeof(float),
                             param.src_value_ + sorted_indices[cur_pos].second, outer_dim * sizeof(float));
    if (ret_code != EOK) {
      MS_LOG(EXCEPTION) << "Failed to copy data!";
    }
    cur_pos++;
    size_t end_pos;
    if (slice_id + 1 < slice_positions.size()) {
      end_pos = slice_positions[slice_id + 1];
    } else {
      end_pos = sorted_indices.size();
    }
    while (cur_pos < end_pos) {
      for (size_t i = 0; i < outer_dim; ++i) {
        unique_grad->value_[start_index + i] += param.src_value_[sorted_indices[cur_pos].second + i];
      }
      cur_pos++;
  size_t current_indices_offset = 0;
  for (size_t i = 0; i < param.thread_num_; ++i) {
    segment_bucket_sizes.emplace_back(std::make_shared<std::vector<size_t>>(param.thread_num_, 0));
    size_t indices_size = thread_indices_size;
    if (i < left_indices_size) {
      indices_size += 1;
    }
    segments.emplace_back(std::make_shared<SparseGradient>());
    segments[i]->value_ = input_grad->value_ + current_indices_offset * param.value_stride_;
    segments[i]->indices_ = input_grad->indices_ + current_indices_offset;
    segments[i]->indices_size_ = indices_size;
    threads.emplace_back(
      std::thread(CalculateEachBucketSize, segments[i], param.max_index_, segment_bucket_sizes[i].get()));
    current_indices_offset += indices_size;
  }
  for (size_t i = 0; i < param.thread_num_; ++i) {
    threads[i].join();
  }
}
void RunMultiThreadReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad,
                                        size_t outer_dim, std::vector<std::pair<int, size_t>> *sorted_indices,
                                        std::vector<size_t> *slice_positions) {
void CopySegmentIndicesToBucket(const MultiThreadReduceSparseGradientParam &param,
                                const std::shared_ptr<SparseGradient> &segment, size_t bucket_offset,
                                const std::vector<std::shared_ptr<BucketSparseGradient>> &buckets) {
  MS_LOG(DEBUG) << "Start";
  size_t thread_num = 24;
  if (slice_positions->size() < thread_num) {
    thread_num = slice_positions->size();
  MS_EXCEPTION_IF_NULL(segment);
  MS_EXCEPTION_IF_NULL(segment->indices_);
  std::vector<size_t> bucket_data_num(param.thread_num_, 0);
  for (size_t i = 0; i < segment->indices_size_; ++i) {
    int index = segment->indices_[i];
    if (index >= 0 && IntToSize(index) < param.max_index_) {
      auto bucket_id = index % param.thread_num_;
      auto bucket_index = bucket_data_num[bucket_id];
      buckets[bucket_id]->indices_[bucket_index] = index;
      buckets[bucket_id]->global_indices_[bucket_index] = bucket_offset + i;
      bucket_data_num[bucket_id]++;
    }
  }
  size_t stride = (slice_positions->size() + thread_num - 1) / thread_num;
  thread_num = (slice_positions->size() + stride - 1) / stride;
  std::vector<std::thread> threads;
  size_t max_length = sorted_indices->size() * outer_dim;
  MS_LOG(DEBUG) << "End";
}
void GatherSegmentIndicesToOutputBucket(const MultiThreadReduceSparseGradientParam &param,
                                        const std::vector<std::shared_ptr<SparseGradient>> &segments,
                                        const std::vector<std::shared_ptr<std::vector<size_t>>> &segment_bucket_sizes,
                                        std::vector<std::shared_ptr<BucketSparseGradient>> *buckets_ptr) {
  MS_EXCEPTION_IF_NULL(param.output_grad_);
  MS_EXCEPTION_IF_NULL(param.output_grad_->value_);
  MS_EXCEPTION_IF_NULL(param.output_grad_->indices_);
  MS_EXCEPTION_IF_NULL(buckets_ptr);
  auto &buckets = *buckets_ptr;
  size_t thread_num = param.thread_num_;
  if (thread_num != segment_bucket_sizes.size()) {
    MS_EXCEPTION(ArgumentError) << "Input param thread num not equal to segment size!";
  }
  std::vector<size_t> bucket_data_size(thread_num, 0);
  for (size_t i = 0; i < thread_num; ++i) {
    size_t slice_start = i * stride;
    size_t slice_end = 0;
    if (i == thread_num - 1) {
      slice_end = slice_positions->size();
    } else {
      slice_end = slice_start + stride;
    for (size_t j = 0; j < thread_num; ++j) {
      bucket_data_size[j] += segment_bucket_sizes[i]->at(j);
    }
    WorkerParamsForReduceSparseGradient params{
      slice_start, slice_end, max_length, outer_dim, sorted_indices, slice_positions, origin_sparse_grad.value_,
      unique_grad};
    threads.emplace_back(std::thread(WorkerForReduceSparseGradient, params));
  }
  size_t current_indices_offset = 0;
  for (size_t i = 0; i < thread_num; ++i) {
    buckets.emplace_back(std::make_shared<BucketSparseGradient>());
    buckets[i]->value_ = param.output_grad_->value_ + current_indices_offset * param.value_stride_;
    buckets[i]->indices_ = param.output_grad_->indices_ + current_indices_offset;
    buckets[i]->global_indices_ = param.workspace_grad_->indices_ + current_indices_offset;
    buckets[i]->indices_size_ = bucket_data_size[i];
    current_indices_offset += bucket_data_size[i];
  }
  std::vector<size_t> tmp_bucket_data_size(thread_num, 0);
  std::vector<std::vector<std::shared_ptr<BucketSparseGradient>>> each_thread_buckets;
  for (size_t i = 0; i < thread_num; ++i) {
    std::vector<std::shared_ptr<BucketSparseGradient>> thread_buckets;
    for (size_t j = 0; j < thread_num; ++j) {
      thread_buckets.emplace_back(std::make_shared<BucketSparseGradient>());
      thread_buckets[j]->indices_ = buckets[j]->indices_ + tmp_bucket_data_size[j];
      thread_buckets[j]->global_indices_ = buckets[j]->global_indices_ + tmp_bucket_data_size[j];
      thread_buckets[j]->value_ = buckets[j]->value_ + tmp_bucket_data_size[j] * param.value_stride_;
      thread_buckets[j]->indices_size_ = segment_bucket_sizes[i]->at(j);
      tmp_bucket_data_size[j] += segment_bucket_sizes[i]->at(j);
    }
    each_thread_buckets.emplace_back(thread_buckets);
  }
  std::vector<std::thread> threads;
  threads.reserve(thread_num);
  current_indices_offset = 0;
  for (size_t i = 0; i < thread_num; ++i) {
    threads.emplace_back(
      std::thread(CopySegmentIndicesToBucket, param, segments[i], current_indices_offset, each_thread_buckets[i]));
    current_indices_offset += segments[i]->indices_size_;
  }
  for (size_t i = 0; i < thread_num; ++i) {
    threads[i].join();
  }
  MS_LOG(DEBUG) << "End";
}
void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
                          size_t outer_dim, bool use_multi_threads) {
void SortAndReduceBucketSparseGradient(const MultiThreadReduceSparseGradientParam &param,
                                       const std::shared_ptr<BucketSparseGradient> &bucket,
                                       const std::shared_ptr<SparseGradient> &reduced_bucket) {
  MS_LOG(DEBUG) << "Start";
  MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_);
  MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_);
  MS_EXCEPTION_IF_NULL(unique_grad);
  MS_EXCEPTION_IF_NULL(unique_grad->value_);
  MS_EXCEPTION_IF_NULL(unique_grad->indices_);
  std::vector<std::pair<int, size_t>> sorted_indices;
  sorted_indices.reserve(origin_sparse_grad.indices_size_);
  for (size_t i = 0; i < origin_sparse_grad.indices_size_; ++i) {
    int index = origin_sparse_grad.indices_[i];
    if (index >= 0 && IntToSize(index) < first_dim) {
      sorted_indices.emplace_back(std::pair<int, size_t>(index, i * outer_dim));
    }
  }
  std::sort(
    sorted_indices.begin(), sorted_indices.end(),
    [](const std::pair<int, size_t> &left, const std::pair<int, size_t> &right) { return left.first < right.first; });
  int last_index = 0;
  std::vector<size_t> slice_positions;
  slice_positions.reserve(sorted_indices.size());
  MS_EXCEPTION_IF_NULL(bucket);
  MS_EXCEPTION_IF_NULL(bucket->value_);
  MS_EXCEPTION_IF_NULL(bucket->indices_);
  MS_EXCEPTION_IF_NULL(reduced_bucket);
  MS_EXCEPTION_IF_NULL(reduced_bucket->value_);
  MS_EXCEPTION_IF_NULL(reduced_bucket->indices_);
  std::vector<std::pair<int, int>> sorted_indices;
  sorted_indices.reserve(bucket->indices_size_);
  for (size_t i = 0; i < bucket->indices_size_; ++i) {
    int index = bucket->indices_[i];
    int global_index = bucket->global_indices_[i];
    sorted_indices.emplace_back(std::pair<int, int>(index, global_index));
  }
  std::sort(sorted_indices.begin(), sorted_indices.end());
  float *global_value = param.input_grad_->value_;
  size_t unique_indices_size = 0;
  size_t max_length = reduced_bucket->indices_size_ * param.value_stride_;
  int last_index{0};
  size_t value_offset{0};
  for (size_t i = 0; i < sorted_indices.size(); ++i) {
    if (i == 0 || last_index != sorted_indices[i].first) {
      slice_positions.emplace_back(i);
    int index = sorted_indices[i].first;
    int global_index = sorted_indices[i].second;
    int global_value_offset = global_index * param.value_stride_;
    if (i == 0 || index != last_index) {
      if (i != 0) {
        unique_indices_size++;
      }
      reduced_bucket->indices_[unique_indices_size] = index;
      value_offset = unique_indices_size * param.value_stride_;
      auto ret_code = memcpy_s(reduced_bucket->value_ + value_offset, (max_length - value_offset) * sizeof(float),
                               global_value + global_value_offset, param.value_stride_ * sizeof(float));
      if (ret_code != EOK) {
        MS_LOG(EXCEPTION) << "Failed to copy data!";
      }
    } else {
      for (size_t j = 0; j < param.value_stride_; ++j) {
        reduced_bucket->value_[value_offset + j] += global_value[global_value_offset + j];
      }
    }
    last_index = sorted_indices[i].first;
    last_index = index;
  }
  if (use_multi_threads) {
    RunMultiThreadReduceSparseGradient(origin_sparse_grad, unique_grad, outer_dim, &sorted_indices, &slice_positions);
  } else {
    size_t max_length = sorted_indices.size() * outer_dim;
    WorkerParamsForReduceSparseGradient params{0,
                                               slice_positions.size(),
                                               max_length,
                                               outer_dim,
                                               &sorted_indices,
                                               &slice_positions,
                                               origin_sparse_grad.value_,
                                               unique_grad};
    WorkerForReduceSparseGradient(params);
  }
  unique_grad->indices_size_ = slice_positions.size();
  reduced_bucket->indices_size_ = unique_indices_size;
  MS_LOG(DEBUG) << "End";
}
void ReduceMultiSparseGradient(const std::vector<std::shared_ptr<SparseGradient>> &unique_slice_grads,
                               SparseGradient *tmp_grad, SparseGradient *unique_grad, size_t first_dim,
                               size_t outer_dim) {
void ReduceBucketSparseGradient(const MultiThreadReduceSparseGradientParam &param,
                                const std::shared_ptr<BucketSparseGradient> &bucket,
                                const std::shared_ptr<SparseGradient> &reduced_bucket) {
  MS_LOG(DEBUG) << "Start";
  if (unique_slice_grads.empty()) {
    return;
  }
  size_t index_data_size = outer_dim * sizeof(float);
  MS_EXCEPTION_IF_NULL(bucket);
  MS_EXCEPTION_IF_NULL(bucket->value_);
  MS_EXCEPTION_IF_NULL(bucket->indices_);
  MS_EXCEPTION_IF_NULL(reduced_bucket);
  MS_EXCEPTION_IF_NULL(reduced_bucket->value_);
  MS_EXCEPTION_IF_NULL(reduced_bucket->indices_);
  float *global_value = param.input_grad_->value_;
  std::unordered_map<int, size_t> index_map;
  size_t unique_indices_size = 0;
  for (size_t i = 0; i < unique_slice_grads.size(); ++i) {
    auto &slice_grad = unique_slice_grads[i];
    auto ret_code = memcpy_s(tmp_grad->value_ + unique_indices_size * outer_dim,
                             (tmp_grad->indices_size_ - unique_indices_size) * index_data_size, slice_grad->value_,
                             slice_grad->indices_size_ * index_data_size);
    if (ret_code != EOK) {
      MS_LOG(EXCEPTION) << "Failed to copy data!";
    }
    ret_code =
      memcpy_s(tmp_grad->indices_ + unique_indices_size, (tmp_grad->indices_size_ - unique_indices_size) * sizeof(int),
               slice_grad->indices_, slice_grad->indices_size_ * sizeof(int));
    if (ret_code != EOK) {
      MS_LOG(EXCEPTION) << "Failed to copy data!";
  size_t max_length = reduced_bucket->indices_size_ * param.value_stride_;
  for (size_t i = 0; i < bucket->indices_size_; ++i) {
    int index = bucket->indices_[i];
    int global_index = bucket->global_indices_[i];
    auto iter = index_map.find(index);
    if (iter == index_map.end()) {
      reduced_bucket->indices_[unique_indices_size] = index;
      size_t start_index = unique_indices_size * param.value_stride_;
      index_map[index] = start_index;
      auto ret_code = memcpy_s(reduced_bucket->value_ + start_index, (max_length - start_index) * sizeof(float),
                               global_value + global_index * param.value_stride_, param.value_stride_ * sizeof(float));
      if (ret_code != EOK) {
        MS_LOG(EXCEPTION) << "Failed to copy data!";
      }
      unique_indices_size++;
    } else {
      size_t start_index = iter->second;
      size_t end_index = start_index + param.value_stride_;
      for (size_t j = start_index, k = global_index * param.value_stride_; j < end_index; ++j, ++k) {
        reduced_bucket->value_[j] += global_value[k];
      }
    }
    unique_indices_size += slice_grad->indices_size_;
  }
  tmp_grad->indices_size_ = unique_indices_size;
  ReduceSparseGradient(*tmp_grad, unique_grad, first_dim, outer_dim);
  reduced_bucket->indices_size_ = unique_indices_size;
  MS_LOG(DEBUG) << "End";
}
void TwoLevelReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *tmp_grad,
                                  SparseGradient *unique_grad, size_t first_dim, size_t outer_dim) {
  MS_LOG(DEBUG) << "Start";
  MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_);
  MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_);
  MS_EXCEPTION_IF_NULL(unique_grad);
  MS_EXCEPTION_IF_NULL(unique_grad->value_);
  MS_EXCEPTION_IF_NULL(unique_grad->indices_);
  MS_EXCEPTION_IF_NULL(tmp_grad);
  MS_EXCEPTION_IF_NULL(tmp_grad->value_);
  MS_EXCEPTION_IF_NULL(tmp_grad->indices_);
  size_t thread_num = 24;
  if (origin_sparse_grad.indices_size_ < thread_num) {
    thread_num = origin_sparse_grad.indices_size_;
  }
  size_t thread_indices_size = origin_sparse_grad.indices_size_ / thread_num;
  size_t left_indices_size = origin_sparse_grad.indices_size_ % thread_num;
void ReduceBucketSparseGradientToWorkspace(const MultiThreadReduceSparseGradientParam &param,
                                           const std::vector<std::shared_ptr<BucketSparseGradient>> &buckets,
                                           std::vector<std::shared_ptr<SparseGradient>> *reduced_buckets_ptr) {
  MS_EXCEPTION_IF_NULL(param.workspace_grad_);
  MS_EXCEPTION_IF_NULL(param.workspace_grad_->value_);
  MS_EXCEPTION_IF_NULL(param.workspace_grad_->indices_);
  MS_EXCEPTION_IF_NULL(reduced_buckets_ptr);
  auto &reduced_buckets = *reduced_buckets_ptr;
  size_t thread_num = buckets.size();
  std::vector<std::thread> threads;
  threads.reserve(thread_num);
  std::vector<std::shared_ptr<SparseGradient>> unique_slice_grads;
  size_t current_indices_offset = 0;
  for (size_t i = 0; i < thread_num; ++i) {
    size_t indices_size = thread_indices_size;
    if (i == thread_num - 1) {
      indices_size = thread_indices_size + left_indices_size;
    reduced_buckets.emplace_back(std::make_shared<SparseGradient>());
    reduced_buckets[i]->value_ = param.workspace_grad_->value_ + current_indices_offset * param.value_stride_;
    reduced_buckets[i]->indices_ = param.workspace_grad_->indices_ + current_indices_offset;
    reduced_buckets[i]->indices_size_ = buckets[i]->indices_size_;
    if (param.use_sort_reduce_) {
      threads.emplace_back(std::thread(SortAndReduceBucketSparseGradient, param, buckets[i], reduced_buckets[i]));
    } else {
      threads.emplace_back(std::thread(ReduceBucketSparseGradient, param, buckets[i], reduced_buckets[i]));
    }
    size_t value_offset = i * thread_indices_size * outer_dim;
    size_t indices_offset = i * thread_indices_size;
    auto slice_grad = SparseGradient(
      {origin_sparse_grad.value_ + value_offset, origin_sparse_grad.indices_ + indices_offset, indices_size});
    unique_slice_grads.emplace_back(std::make_shared<SparseGradient>());
    unique_slice_grads[i]->value_ = unique_grad->value_ + value_offset;
    unique_slice_grads[i]->indices_ = unique_grad->indices_ + indices_offset;
    unique_slice_grads[i]->indices_size_ = indices_size;
    threads.emplace_back(
      std::thread(ReduceSparseGradient, slice_grad, unique_slice_grads[i].get(), first_dim, outer_dim, false));
    current_indices_offset += buckets[i]->indices_size_;
  }
  for (size_t i = 0; i < thread_num; ++i) {
    threads[i].join();
  }
  ReduceMultiSparseGradient(unique_slice_grads, tmp_grad, unique_grad, first_dim, outer_dim);
}
void MergeReduceSparseGradient(const MultiThreadReduceSparseGradientParam &param,
                               const std::vector<std::shared_ptr<SparseGradient>> &reduced_buckets) {
  MS_EXCEPTION_IF_NULL(param.output_grad_);
  auto output_grad = param.output_grad_;
  MS_EXCEPTION_IF_NULL(output_grad->value_);
  MS_EXCEPTION_IF_NULL(output_grad->indices_);
  size_t stride_data_size = param.value_stride_ * sizeof(float);
  size_t unique_indices_size = 0;
  for (size_t i = 0; i < reduced_buckets.size(); ++i) {
    auto &bucket = reduced_buckets[i];
    MS_EXCEPTION_IF_NULL(bucket);
    if (bucket->indices_size_ == 0) {
      continue;
    }
    auto ret_code = memcpy_s(output_grad->value_ + unique_indices_size * param.value_stride_,
                             (output_grad->indices_size_ - unique_indices_size) * stride_data_size, bucket->value_,
                             bucket->indices_size_ * stride_data_size);
    if (ret_code != EOK) {
      MS_LOG(EXCEPTION) << "Failed to copy data!";
    }
    ret_code = memcpy_s(output_grad->indices_ + unique_indices_size,
                        (output_grad->indices_size_ - unique_indices_size) * sizeof(int), bucket->indices_,
                        bucket->indices_size_ * sizeof(int));
    if (ret_code != EOK) {
      MS_LOG(EXCEPTION) << "Failed to copy data!";
    }
    unique_indices_size += bucket->indices_size_;
  }
  output_grad->indices_size_ = unique_indices_size;
}
}  // namespace
void BucketReduceSparseGradient(const ReduceSparseGradientParam &param) {
  MS_LOG(DEBUG) << "Start";
  MS_EXCEPTION_IF_NULL(param.input_grad_);
  size_t thread_num = 23;
  if (param.input_grad_->indices_size_ < thread_num) {
    thread_num = param.input_grad_->indices_size_;
  }
  MultiThreadReduceSparseGradientParam multi_thread_param({param.input_grad_, param.workspace_grad_, param.output_grad_,
                                                           param.max_index_, param.value_stride_, thread_num,
                                                           param.use_sort_reduce_});
  std::vector<std::shared_ptr<SparseGradient>> segments;
  std::vector<std::shared_ptr<std::vector<size_t>>> segment_bucket_sizes;
  SplitAndCalculateSegmentBucketSize(multi_thread_param, &segments, &segment_bucket_sizes);
  std::vector<std::shared_ptr<BucketSparseGradient>> buckets;
  GatherSegmentIndicesToOutputBucket(multi_thread_param, segments, segment_bucket_sizes, &buckets);
  std::vector<std::shared_ptr<SparseGradient>> reduced_buckets;
  ReduceBucketSparseGradientToWorkspace(multi_thread_param, buckets, &reduced_buckets);
  MergeReduceSparseGradient(multi_thread_param, reduced_buckets);
  MS_LOG(DEBUG) << "End";
}
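The hunk above replaces the old sort-based two-level reduction with a bucketed pipeline: split the input into per-thread segments, scatter each row into one of `thread_num` buckets keyed by `index % thread_num`, reduce every bucket in its own thread, then concatenate the reduced buckets. The key invariant is that rows with the same index always land in the same bucket, so buckets can be reduced independently with no cross-bucket duplicates. A minimal single-threaded C++ sketch of that invariant, using hypothetical standalone types rather than the kernel's `SparseGradient`:

// Standalone sketch (not MindSpore code) of the bucket-reduce idea above.
#include <cstdio>
#include <unordered_map>
#include <vector>

struct Row { int index; std::vector<float> value; };

std::vector<Row> BucketReduce(const std::vector<Row> &grad, size_t bucket_num) {
  std::vector<std::vector<Row>> buckets(bucket_num);
  for (const auto &row : grad) {
    buckets[row.index % bucket_num].push_back(row);  // same index -> same bucket
  }
  std::vector<Row> out;
  for (auto &bucket : buckets) {            // each bucket reduces independently
    std::unordered_map<int, size_t> pos;    // index -> slot in `out`
    for (auto &row : bucket) {
      auto it = pos.find(row.index);
      if (it == pos.end()) {
        pos[row.index] = out.size();
        out.push_back(row);                 // first occurrence: copy the row
      } else {
        auto &acc = out[it->second].value;  // duplicate index: accumulate values
        for (size_t j = 0; j < acc.size(); ++j) acc[j] += row.value[j];
      }
    }
  }
  return out;
}

int main() {
  std::vector<Row> grad = {{5, {1, 1}}, {2, {2, 2}}, {5, {3, 3}}, {0, {4, 4}}};
  for (const auto &row : BucketReduce(grad, 3)) {
    std::printf("index %d -> %.0f %.0f\n", row.index, row.value[0], row.value[1]);
  }
  return 0;  // index 5 appears once, with accumulated value {4, 4}
}

The production code additionally tracks `global_indices_` so values can be copied from the original gradient buffer instead of being moved through the buckets, and offers a sort-based per-bucket reducer (`use_sort_reduce_`) as an alternative to the hash map.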
@@ -73,9 +73,18 @@ class KernelMeta {
};
struct SparseGradient {
  float *value_;
  int *indices_;
  size_t indices_size_;
  float *value_{nullptr};
  int *indices_{nullptr};
  size_t indices_size_{0};
};
struct ReduceSparseGradientParam {
  SparseGradient *input_grad_{nullptr};
  SparseGradient *workspace_grad_{nullptr};
  SparseGradient *output_grad_{nullptr};
  size_t max_index_{0};
  size_t value_stride_{0};
  bool use_sort_reduce_{false};
};
struct MultiThreadComputeParams {
@@ -112,10 +121,6 @@ void SaveJsonInfo(const std::string &json_name, const std::string &info);
std::string GetProcessor(const AnfNodePtr &anf_node);
bool IsSameShape(const std::vector<size_t> &shape_a, const std::vector<size_t> &shape_b);
int Sign(float x);
void DeduplicateIndexedSlices(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
                              size_t outer_dim);
void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
                          size_t outer_dim, bool use_multi_threads = true);
std::pair<AnfNodePtr, size_t> GetKernelInput(const AnfNodePtr &anf_node, size_t index);
std::vector<std::pair<AnfNodePtr, std::pair<size_t, size_t>>> GetInputIndex(const std::vector<AnfNodePtr> &node_list,
                                                                            const std::vector<AnfNodePtr> &input_list);
@@ -130,14 +135,7 @@ void GetGraphRealOutput(const FuncGraphPtr &func_graph, std::vector<std::pair<An
bool IsWeightBoundary(const AnfNodePtr &node);
void MultiThreadCompute(const MultiThreadComputeFunc &func, MultiThreadComputeParams *params,
                        size_t total_compute_size);
void RunMultiThreadReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad,
                                        size_t outer_dim, std::vector<std::pair<int, size_t>> *sorted_indices,
                                        std::vector<size_t> *slice_positions);
void ReduceMultiSparseGradient(const std::vector<std::shared_ptr<SparseGradient>> &unique_slice_grads,
                               SparseGradient *tmp_grad, SparseGradient *unique_grad, size_t first_dim,
                               size_t outer_dim);
void TwoLevelReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *tmp_grad,
                                  SparseGradient *unique_grad, size_t first_dim, size_t outer_dim);
void BucketReduceSparseGradient(const ReduceSparseGradientParam &param);
std::vector<int> GetReduceAttrAxis(const CNodePtr &cnode);
}  // namespace kernel
}  // namespace mindspore
@@ -46,7 +46,7 @@ class EmbeddingLookUpCPUKernel : public CPUKernel {
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 private:
 protected:
  void LookUpTable(const std::vector<kernel::AddressPtr> &inputs, size_t dim0, size_t dim1, size_t dim2,
                   float **output_addr);
  void CheckParam(const CNodePtr &kernel_node);
@@ -53,15 +53,15 @@ bool EmbeddingLookUpProxyKernel::Launch(const std::vector<kernel::AddressPtr> &i
  size_t output_size = outputs[0]->size;
  size_t size = input_size / sizeof(float);
  ::ps::SArray<float> lookup_ids(size, 0);
  ::ps::SArray<int> lookup_ids(size, 0);
  ::ps::SArray<int> lengths{size};
  ::ps::SArray<float> lookup_result;
  ::ps::SArray<float> lookup_result(output_size / sizeof(float), 0);
  auto ret = memcpy_s(lookup_ids.data(), input_size, indices_addr, input_size);
  if (ret != EOK) {
    MS_LOG(EXCEPTION) << "Lookup id memcpy failed.";
  }
  parallel::ps::Worker<float>::GetInstance().DoPSEmbeddingLookup({key_}, lookup_ids, lengths, lookup_result,
  parallel::ps::Worker<float>::GetInstance().DoPSEmbeddingLookup({key_}, lookup_ids, lengths, &lookup_result,
                                                                 parallel::ps::kEmbeddingLookupCmd);
  auto ret2 = memcpy_s(output_addr, output_size, lookup_result.data(), output_size);
@@ -50,7 +50,7 @@ void EmbeddingLookUpPSKernel::InitKernel(
  split_num_ = pserver_num_;
  // input shape should be sharded after computing offset_;
  Shard(input_shape_, axis_);
  Shard(&input_shape_, axis_);
  size_t output_size =
    std::accumulate(output_shape_.begin(), output_shape_.end(), sizeof(float), std::multiplies<size_t>());
@@ -34,5 +34,13 @@ MS_REG_CPU_KERNEL_T(Push,
MS_REG_CPU_KERNEL_T(
  Push, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeUInt64),
  PushKernel, float);
MS_REG_CPU_KERNEL_T(Push,
                    KernelAttr()
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddOutputAttr(kNumberTypeUInt64),
                    PushKernel, float);
}  // namespace kernel
}  // namespace mindspore
@@ -43,7 +43,7 @@ class PushKernel : public CPUKernel {
      sizes.push_back(SizeToInt(input->size) / sizeof(T));
    }
    parallel::ps::Worker<T>::GetInstance().Push(keys, addrs, sizes);
    memcpy(outputs[0]->addr, &key_, sizeof(size_t));
    memcpy_s(outputs[0]->addr, sizeof(size_t), &key_, sizeof(size_t));
    return true;
  }
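The hunk above swaps an unbounded `memcpy` for the bounded `memcpy_s`; elsewhere in this change set the `memcpy_s` return value is also checked against `EOK`. A hedged standalone sketch of that defensive pattern (the `WriteKey` helper is hypothetical, not part of the kernel):

// securec's memcpy_s(dest, destMax, src, count) fails instead of overflowing
// when count > destMax; plain memcpy would silently write past the buffer.
#include <securec.h>  // assumption: the Huawei securec library MindSpore links against

bool WriteKey(void *out_addr, size_t out_size, size_t key) {
  auto ret = memcpy_s(out_addr, out_size, &key, sizeof(key));
  return ret == EOK;  // caller can log or raise on failure, as the other kernels do
}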
@@ -75,7 +75,7 @@ void SparseApplyAdamPSKernel::ReInit(const std::shared_ptr<std::vector<std::shar
void SparseApplyAdamPSKernel::ReInit(const std::vector<AddressPtr> &inputs) {
  const auto &indices_addr = inputs[10];
  indices_size_ = indices_addr->size;
  indices_size_ = indices_addr->size / sizeof(int);
  workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float);
  workspace_size_list_[1] = indices_size_ * sizeof(int);
}
@@ -64,7 +64,7 @@ void SparseApplyFtrlPSKernel::ReInit(const std::shared_ptr<std::vector<std::shar
void SparseApplyFtrlPSKernel::ReInit(const std::vector<AddressPtr> &inputs) {
  const auto &indices_addr = inputs[4];
  indices_size_ = indices_addr->size;
  indices_size_ = indices_addr->size / sizeof(int);
  workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float);
  workspace_size_list_[1] = indices_size_ * sizeof(int);
}
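Both ReInit fixes above address the same bug class: `AddressPtr::size` is a byte count, while `indices_size_` is consumed as an element count, so the old code inflated the workspace sizes by a factor of `sizeof(int)`. A tiny standalone illustration:

// Standalone illustration of the byte-count vs element-count fix above.
#include <cassert>
#include <cstddef>

int main() {
  size_t buffer_bytes = 40;                           // e.g. AddressPtr::size for 10 int32 indices
  size_t wrong_count = buffer_bytes;                  // old code: treats bytes as elements
  size_t element_count = buffer_bytes / sizeof(int);  // fixed code
  assert(element_count == 10 && wrong_count == 40);
  return 0;
}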
@@ -81,6 +81,8 @@ void SparseApplyAdamCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node)
  MS_EXCEPTION_IF_NULL(kernel_node);
  workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
  workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
  workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
  workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
  workspace_size_list_.emplace_back(var_first_dim_size_ * var_outer_dim_size_ * sizeof(float));
}
@@ -142,11 +144,21 @@ bool SparseApplyAdamCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inp
  auto indices = reinterpret_cast<int *>(inputs[10]->addr);
  auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
  auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
  auto m_t = reinterpret_cast<float *>(workspace[2]->addr);
  auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
  auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
  auto m_t = reinterpret_cast<float *>(workspace[4]->addr);
  SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
  ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_,
                       var_outer_dim_size_);
  SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
  SparseGradient input_sparse_grad({grad, indices, indices_size_});
  ReduceSparseGradientParam param;
  param.input_grad_ = &input_sparse_grad;
  param.workspace_grad_ = &workspace_sparse_grad;
  param.output_grad_ = &unique_sparse_grad;
  param.max_index_ = var_first_dim_size_;
  param.value_stride_ = var_outer_dim_size_;
  BucketReduceSparseGradient(param);
  size_t total_dim_size = var_first_dim_size_ * var_outer_dim_size_;
  lr = lr * std::sqrt(1 - beta2_power) / (1 - beta1_power);
@@ -132,12 +132,19 @@ bool SparseApplyFtrlCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inp
  auto indices = reinterpret_cast<int *>(inputs[4]->addr);
  auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
  auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
  auto tmp_grad = reinterpret_cast<float *>(workspace[2]->addr);
  auto tmp_indices = reinterpret_cast<int *>(workspace[3]->addr);
  auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
  auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
  SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
  SparseGradient tmp_sparse_grad({tmp_grad, tmp_indices, indices_size_});
  TwoLevelReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &tmp_sparse_grad, &unique_sparse_grad,
                               var_first_dim_size_, var_outer_dim_size_);
  SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
  SparseGradient input_sparse_grad({grad, indices, indices_size_});
  ReduceSparseGradientParam param;
  param.input_grad_ = &input_sparse_grad;
  param.workspace_grad_ = &workspace_sparse_grad;
  param.output_grad_ = &unique_sparse_grad;
  param.max_index_ = var_first_dim_size_;
  param.value_stride_ = var_outer_dim_size_;
  BucketReduceSparseGradient(param);
  MultiThreadComputeParams input_params;
  input_params.var_ = var;
@@ -123,13 +123,19 @@ bool SparseApplyLazyAdamCPUKernel::Launch(const std::vector<kernel::AddressPtr>
  auto indices = reinterpret_cast<int *>(inputs[10]->addr);
  auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
  auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
  auto tmp_grad = reinterpret_cast<float *>(workspace[2]->addr);
  auto tmp_indices = reinterpret_cast<int *>(workspace[3]->addr);
  auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
  auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
  SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
  SparseGradient tmp_sparse_grad({tmp_grad, tmp_indices, indices_size_});
  TwoLevelReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &tmp_sparse_grad, &unique_sparse_grad,
                               var_first_dim_size_, var_outer_dim_size_);
  SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
  SparseGradient input_sparse_grad({grad, indices, indices_size_});
  ReduceSparseGradientParam param;
  param.input_grad_ = &input_sparse_grad;
  param.workspace_grad_ = &workspace_sparse_grad;
  param.output_grad_ = &unique_sparse_grad;
  param.max_index_ = var_first_dim_size_;
  param.value_stride_ = var_outer_dim_size_;
  BucketReduceSparseGradient(param);
  lr = lr * std::sqrt(1 - beta2_power) / (1 - beta1_power);
  MultiThreadComputeParams input_params;
@@ -61,6 +61,8 @@ void SparseApplyProximalAdagradCPUKernel::InitInputOutputSize(const CNodePtr &ke
  MS_EXCEPTION_IF_NULL(kernel_node);
  workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
  workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
  workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float));
  workspace_size_list_.emplace_back(indices_size_ * sizeof(int));
}
void SparseApplyProximalAdagradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
@@ -119,9 +121,19 @@ bool SparseApplyProximalAdagradCPUKernel::Launch(const std::vector<kernel::Addre
  auto indices = reinterpret_cast<int *>(inputs[6]->addr);
  auto new_grad = reinterpret_cast<float *>(workspace[0]->addr);
  auto new_indices = reinterpret_cast<int *>(workspace[1]->addr);
  auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr);
  auto workspace_indices = reinterpret_cast<int *>(workspace[3]->addr);
  SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_});
  ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_,
                       var_outer_dim_size_);
  SparseGradient workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_});
  SparseGradient input_sparse_grad({grad, indices, indices_size_});
  ReduceSparseGradientParam param;
  param.input_grad_ = &input_sparse_grad;
  param.workspace_grad_ = &workspace_sparse_grad;
  param.output_grad_ = &unique_sparse_grad;
  param.max_index_ = var_first_dim_size_;
  param.value_stride_ = var_outer_dim_size_;
  BucketReduceSparseGradient(param);
  MultiThreadComputeParams input_params;
  input_params.var_ = var;
@@ -0,0 +1,26 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "backend/kernel_compiler/gpu/arrays/broadcast_to_gpu_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
                      BroadcastToGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
                      BroadcastToGpuKernel, half)
}  // namespace kernel
}  // namespace mindspore
@@ -0,0 +1,83 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_BROADCAST_TO_GPU_KERNEL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_BROADCAST_TO_GPU_KERNEL_H_
#include <vector>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T>
class BroadcastToGpuKernel : public GpuKernel {
 public:
  BroadcastToGpuKernel() {}
  ~BroadcastToGpuKernel() = default;
  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    T *input_addr = GetDeviceAddress<T>(inputs, 0);
    T *output_addr = GetDeviceAddress<T>(outputs, 0);
    BroadcastTo(input_shape_[0], input_shape_[1], input_shape_[2], input_shape_[3], output_shape_[0], output_shape_[1],
                output_shape_[2], output_shape_[3], input_addr, output_addr,
                reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }
  bool Init(const CNodePtr &kernel_node) override {
    auto input_shapes = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto output_shapes = AnfAlgo::GetOutputInferShape(kernel_node, 0);
    if (input_shapes.size() > 4 || output_shapes.size() > 4) {
      MS_LOG(EXCEPTION) << "BroadcastTo operation not support dim greater than 4";
    }
    for (int i = input_shapes.size() - 1; i >= 0; i--) {
      input_shape_[i] = input_shapes[i];
    }
    for (int j = output_shapes.size() - 1; j >= 0; j--) {
      output_shape_[j] = output_shapes[j];
    }
    InitSizeLists();
    return true;
  }
 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(input_shape_[0] * input_shape_[1] * input_shape_[2] * input_shape_[3] * sizeof(T));
    output_size_list_.push_back(output_shape_[0] * output_shape_[1] * output_shape_[2] * output_shape_[3] * sizeof(T));
  }
 private:
  int input_shape_[4] = {1, 1, 1, 1};
  int output_shape_[4] = {1, 1, 1, 1};
  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
};
}  // namespace kernel
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_KERNEL_GPU_BROADCAST_TO_GPU_KERNEL_H_
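Init() above copies each shape element to its own index, so a shape of rank < 4 occupies the leading slots of the fixed int[4] buffers and the remaining trailing dimensions keep their default of 1; the CUDA kernel can then always be launched with exactly four extents. A standalone sketch of that padding (the `PadTo4d` helper is hypothetical, not part of the kernel API):

// Standalone sketch of the shape padding done in Init() above.
#include <cstdio>
#include <vector>

void PadTo4d(const std::vector<size_t> &shape, int out[4]) {
  // same effect as the backwards loop in Init(): element i keeps position i,
  // untouched trailing slots stay at their default of 1
  for (int i = static_cast<int>(shape.size()) - 1; i >= 0; i--) {
    out[i] = static_cast<int>(shape[i]);
  }
}

int main() {
  int dims[4] = {1, 1, 1, 1};
  PadTo4d({5, 3}, dims);  // rank-2 input
  std::printf("%d %d %d %d\n", dims[0], dims[1], dims[2], dims[3]);  // prints: 5 3 1 1
  return 0;
}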
@@ -18,6 +18,7 @@
#define MINDSPORE_CCSRC_KERNEL_GPU_CONCATV2_GPU_KERNEL_H
#include <vector>
#include <memory>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cuh"
@@ -27,40 +28,35 @@ namespace kernel {
template <typename T>
class ConcatV2GpuFwdKernel : public GpuKernel {
 public:
  ConcatV2GpuFwdKernel() : axis_(0), output_size_(0) {}
  ConcatV2GpuFwdKernel()
      : axis_(0),
        input_num_(1),
        output_size_(0),
        all_size_before_axis_(1),
        all_size_axis_(1),
        inputs_host_(nullptr),
        len_axis_(nullptr) {}
  ~ConcatV2GpuFwdKernel() override = default;
  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    if (inputs.size() == 2) {
      T *input_0 = GetDeviceAddress<T>(inputs, 0);
      T *input_1 = GetDeviceAddress<T>(inputs, 1);
      T *output = GetDeviceAddress<T>(outputs, 0);
      ConcatKernel(output_size_ / sizeof(T), w_[0], w_[1], input_0, input_1, output,
                   reinterpret_cast<cudaStream_t>(stream_ptr));
    }
    if (inputs.size() == 3) {
      T *input_0 = GetDeviceAddress<T>(inputs, 0);
      T *input_1 = GetDeviceAddress<T>(inputs, 1);
      T *input_2 = GetDeviceAddress<T>(inputs, 2);
      T *output = GetDeviceAddress<T>(outputs, 0);
      ConcatKernel(output_size_ / sizeof(T), w_[0], w_[1], w_[2], input_0, input_1, input_2, output,
                   reinterpret_cast<cudaStream_t>(stream_ptr));
    }
    if (inputs.size() == 4) {
      T *input_0 = GetDeviceAddress<T>(inputs, 0);
      T *input_1 = GetDeviceAddress<T>(inputs, 1);
      T *input_2 = GetDeviceAddress<T>(inputs, 2);
      T *input_3 = GetDeviceAddress<T>(inputs, 3);
      T *output = GetDeviceAddress<T>(outputs, 0);
      ConcatKernel(output_size_ / sizeof(T), w_[0], w_[1], w_[2], w_[3], input_0, input_1, input_2, input_3, output,
                   reinterpret_cast<cudaStream_t>(stream_ptr));
    T *output = GetDeviceAddress<T>(outputs, 0);
    T **inputs_device = GetDeviceAddress<T *>(workspace, 0);
    int *len_axis_device = GetDeviceAddress<int>(workspace, 1);
    for (size_t i = 0; i < inputs.size(); i++) {
      inputs_host_[i] = GetDeviceAddress<T>(inputs, i);
    }
    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(inputs_device, inputs_host_.get(), sizeof(T *) * input_num_,
                                               cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
                               "ConcatV2 opt cudaMemcpyAsync inputs failed");
    CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(len_axis_device, len_axis_.get(), sizeof(int) * input_num_,
                                               cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
                               "ConcatV2 opt cudaMemcpyAsync length on axis failed");
    ConcatKernel(output_size_, input_num_, all_size_before_axis_, all_size_axis_, len_axis_device, inputs_device,
                 output, reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }
  bool Init(const CNodePtr &kernel_node) override {
@@ -74,25 +70,34 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
      axis_ += SizeToInt(input_shape.size());
    }
    auto input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    for (size_t i = 0; i < input_num; i++) {
      auto input_size = sizeof(T);
    input_num_ = SizeToInt(AnfAlgo::GetInputTensorNum(kernel_node));
    inputs_host_ = std::make_unique<T *[]>(input_num_);
    len_axis_ = std::make_unique<int[]>(input_num_);
    for (int i = 0; i < input_num_; i++) {
      int input_size = 1;
      auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i);
      for (size_t j = 0; j < input_shape.size(); j++) {
        input_size *= SizeToInt(input_shape[j]);
        if (j >= IntToSize(axis_)) {
          w_[i] *= SizeToInt(input_shape[j]);
        }
        input_size_list_.push_back(input_size);
      }
      input_size_list_.push_back(IntToSize(input_size * sizeof(T)));
      len_axis_[i] = SizeToInt(input_shape[axis_]);
    }
    workspace_size_list_.push_back(sizeof(T *) * input_num_);
    workspace_size_list_.push_back(sizeof(int) * input_num_);
    auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
    output_size_ = sizeof(T);
    for (size_t i = 0; i < output_shape.size(); i++) {
    output_size_ = 1;
    for (int i = 0; i < SizeToInt(output_shape.size()); i++) {
      output_size_ *= output_shape[i];
      if (i > axis_) {
        all_size_before_axis_ *= output_shape[i];
        all_size_axis_ *= output_shape[i];
      }
      if (i == axis_) {
        all_size_before_axis_ *= output_shape[i];
      }
    }
    output_size_list_.push_back(output_size_);
    output_size_list_.push_back(IntToSize(output_size_ * sizeof(T)));
    InitSizeLists();
    return true;
| @@ -103,11 +108,6 @@ class ConcatV2GpuFwdKernel : public GpuKernel { | |||
| private: | |||
| bool CheckParam(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num < 2 || input_num > 4) { | |||
| MS_LOG(ERROR) << "Input number is " << input_num << ", but ConcatV2GpuFwdKernel needs inputs between 2 and 4."; | |||
| return false; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(ERROR) << "Output number is " << output_num << ", but ConcatV2GpuFwdKernel needs 1 output."; | |||
| @@ -115,9 +115,13 @@ class ConcatV2GpuFwdKernel : public GpuKernel { | |||
| } | |||
| return true; | |||
| } | |||
| int w_[4] = {1, 1, 1, 1}; | |||
| int axis_; | |||
| size_t output_size_; | |||
| int input_num_; | |||
| int output_size_; | |||
| int all_size_before_axis_; | |||
| int all_size_axis_; | |||
| std::unique_ptr<T *[]> inputs_host_; | |||
| std::unique_ptr<int[]> len_axis_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| std::vector<size_t> workspace_size_list_; | |||
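The rewritten ConcatV2 kernel flattens each output position into three coordinates: everything before the concat axis, the coordinate along the axis, and everything after it. The sketch below is a minimal host-side C++ reference of that mapping (not part of the patch), assuming `all_size_axis` is the product of the dimensions after the axis and `all_size_before_axis` additionally multiplies in the output length along the axis:

    #include <vector>

    // Reference concat over flattened tensors; len_axis[i] is input i's
    // length along the concat axis.
    void ConcatReference(const std::vector<std::vector<float>> &inputs,
                         const std::vector<int> &len_axis,
                         int all_size_before_axis, int all_size_axis,
                         std::vector<float> *output) {
      for (int pos = 0; pos < static_cast<int>(output->size()); ++pos) {
        int num = pos % all_size_before_axis / all_size_axis;  // coordinate along the axis
        int block = -1;
        int axis_inc = 0;
        // Walk the inputs until the axis coordinate falls inside one of them.
        while (block + 1 < static_cast<int>(inputs.size()) && axis_inc <= num) {
          ++block;
          axis_inc += len_axis[block];
        }
        int block_len = len_axis[block];
        axis_inc -= len_axis[block];
        int block_pos = pos / all_size_before_axis * block_len * all_size_axis +
                        (num - axis_inc) * all_size_axis + pos % all_size_axis;
        (*output)[pos] = inputs[block][block_pos];
      }
    }

For example, concatenating shapes (2,2) and (2,3) along axis 1 gives all_size_axis = 1, all_size_before_axis = 5, len_axis = {2, 3}; output position 7 yields num = 2, block = 1, block_pos = 3, i.e. row 1, column 0 of the second input, as expected.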
| @@ -0,0 +1,31 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| MS_REG_GPU_KERNEL_ONE( | |||
| Split, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| SplitGpuFwdKernel, float) | |||
| MS_REG_GPU_KERNEL_ONE(Split, | |||
| KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), | |||
| SplitGpuFwdKernel, int) | |||
| MS_REG_GPU_KERNEL_ONE( | |||
| Split, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), | |||
| SplitGpuFwdKernel, half) | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,153 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_GPU_SPLIT_GPU_KERNEL_H | |||
| #define MINDSPORE_CCSRC_KERNEL_GPU_SPLIT_GPU_KERNEL_H | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/gpu/gpu_kernel.h" | |||
| #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h" | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/split_impl.cuh" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T> | |||
| class SplitGpuFwdKernel : public GpuKernel { | |||
| public: | |||
| SplitGpuFwdKernel() | |||
| : axis_(0), | |||
| output_num_(1), | |||
| input_size_(1), | |||
| axis_step_(1), | |||
| all_size_before_axis_(1), | |||
| all_size_axis_(1), | |||
| outputs_host_(nullptr) {} | |||
| ~SplitGpuFwdKernel() override = default; | |||
| const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } | |||
| const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } | |||
| const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| T *input = GetDeviceAddress<T>(inputs, 0); | |||
| T **outputs_device = GetDeviceAddress<T *>(workspace, 0); | |||
| for (size_t i = 0; i < outputs.size(); i++) { | |||
| outputs_host_[i] = GetDeviceAddress<T>(outputs, i); | |||
| } | |||
| CHECK_CUDA_RET_WITH_EXCEPT(cudaMemcpyAsync(outputs_device, outputs_host_.get(), sizeof(T *) * output_num_, | |||
| cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)), | |||
| "Split opt cudaMemcpyAsync outputs failed"); | |||
| SplitKernel(input_size_, axis_step_, all_size_before_axis_, all_size_axis_, input, outputs_device, | |||
| reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| return true; | |||
| } | |||
| bool Init(const CNodePtr &kernel_node) override { | |||
| axis_ = GetAttr<int>(kernel_node, "axis"); | |||
| if (axis_ < 0) { | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| axis_ += SizeToInt(input_shape.size()); | |||
| } | |||
| output_num_ = GetAttr<int>(kernel_node, "output_num"); | |||
| if (!CheckParam(kernel_node)) { | |||
| return false; | |||
| } | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| input_size_ = 1; | |||
| all_size_before_axis_ = 1; | |||
| all_size_axis_ = 1; | |||
| for (int i = 0; i < SizeToInt(input_shape.size()); i++) { | |||
| input_size_ *= input_shape[i]; | |||
| if (i > axis_) { | |||
| all_size_before_axis_ *= input_shape[i]; | |||
| all_size_axis_ *= input_shape[i]; | |||
| } | |||
| if (i == axis_) { | |||
| all_size_before_axis_ *= input_shape[i]; | |||
| } | |||
| } | |||
| input_size_list_.push_back(IntToSize(input_size_ * sizeof(T))); | |||
| axis_step_ = input_shape[axis_] / output_num_; | |||
| for (int i = 0; i < output_num_; i++) { | |||
| size_t output_size = 1; | |||
| auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, i); | |||
| for (size_t j = 0; j < output_shape.size(); j++) { | |||
| output_size *= output_shape[j]; | |||
| } | |||
| output_size_list_.push_back(output_size * sizeof(T)); | |||
| } | |||
| workspace_size_list_.push_back(sizeof(T *) * output_num_); | |||
| InitSizeLists(); | |||
| outputs_host_ = std::make_unique<T *[]>(output_num_); | |||
| return true; | |||
| } | |||
| protected: | |||
| void InitSizeLists() override {} | |||
| private: | |||
| bool CheckParam(const CNodePtr &kernel_node) { | |||
| auto input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| int dims = SizeToInt(input_shape.size()); | |||
| int output_num = SizeToInt(AnfAlgo::GetOutputTensorNum(kernel_node)); | |||
| if (input_num != 1) { | |||
| MS_LOG(ERROR) << "Input number is " << input_num << ", but Split needs 1 input."; | |||
| return false; | |||
| } | |||
| if (dims == 0) { | |||
| MS_LOG(ERROR) << "Input dims is " << dims << ", scalar is not supported."; | |||
| return false; | |||
| } | |||
| if (axis_ < -dims || axis_ >= dims) { | |||
| MS_LOG(ERROR) << "Attr axis " << axis_ << " must be in " << -dims << "~" << dims; | |||
| return false; | |||
| } | |||
| if (output_num_ > SizeToInt(input_shape[axis_])) { | |||
| MS_LOG(ERROR) << "Attr output_num " << output_num_ << "must less than" << input_shape[axis_]; | |||
| return false; | |||
| } | |||
| if (input_shape[axis_] % output_num_ != 0) { | |||
| MS_LOG(ERROR) << "Attr output_num " << output_num_ << "must be divided by" << input_shape[axis_]; | |||
| return false; | |||
| } | |||
| if (output_num_ != output_num) { | |||
| MS_LOG(ERROR) << "Output num is " << output_num << ", but need " << output_num_; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| int axis_; | |||
| int output_num_; | |||
| int input_size_; | |||
| int axis_step_; | |||
| int all_size_before_axis_; | |||
| int all_size_axis_; | |||
| std::unique_ptr<T *[]> outputs_host_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| std::vector<size_t> workspace_size_list_; | |||
| }; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_GPU_SPLIT_GPU_KERNEL_H | |||
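Split inverts the concat mapping: `axis_step` is each output's length along the split axis, so `num / axis_step` picks the target output tensor directly, with no per-input scan. A host-side sketch of the same arithmetic, assuming an even split (`input_shape[axis] % output_num == 0`, which CheckParam enforces):

    #include <vector>

    // CPU reference of the Split index mapping used by the kernel above.
    void SplitReference(const std::vector<float> &input, int axis_step,
                        int all_size_before_axis, int all_size_axis,
                        std::vector<std::vector<float>> *outputs) {
      for (int pos = 0; pos < static_cast<int>(input.size()); ++pos) {
        int num = pos % all_size_before_axis / all_size_axis;  // coordinate along the axis
        int block = num / axis_step;                           // target output tensor
        int block_pos = pos / all_size_before_axis * axis_step * all_size_axis +
                        num % axis_step * all_size_axis + pos % all_size_axis;
        (*outputs)[block][block_pos] = input[pos];
      }
    }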
| @@ -0,0 +1,29 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/gpu/arrays/topk_gpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| MS_REG_GPU_KERNEL_TWO(TopK, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeInt32) | |||
| .AddOutputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeInt32), | |||
| TopKGpuKernel, float, int) | |||
| }  // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,110 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_GPU_TOPK_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_GPU_TOPK_H_ | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/gpu/gpu_kernel.h" | |||
| #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h" | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/topk_impl.cuh" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T, typename S> | |||
| class TopKGpuKernel : public GpuKernel { | |||
| public: | |||
| TopKGpuKernel() : sorted_(false), outer_size_(1), inner_size_(1), k_(1), use_share_mem_(true), ceil_power2_(0) {} | |||
| ~TopKGpuKernel() override = default; | |||
| const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } | |||
| const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } | |||
| const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspaces, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| T *input_addr = GetDeviceAddress<T>(inputs, 0); | |||
| S *k = GetDeviceAddress<S>(inputs, 1); | |||
| T *output_addr = GetDeviceAddress<T>(outputs, 0); | |||
| S *indices = GetDeviceAddress<S>(outputs, 1); | |||
| T *data_buff = nullptr; | |||
| S *index_buff = nullptr; | |||
| if (use_share_mem_ == false) { | |||
| data_buff = GetDeviceAddress<T>(workspaces, 0); | |||
| index_buff = GetDeviceAddress<S>(workspaces, 1); | |||
| } | |||
| TopK(outer_size_, inner_size_, input_addr, k, output_addr, indices, data_buff, index_buff, | |||
| reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| if (sorted_ == false) { | |||
| BitonicSortByKey(outer_size_, k_, output_addr, indices, data_buff, index_buff, | |||
| reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| } | |||
| return true; | |||
| } | |||
| bool Init(const CNodePtr &kernel_node) override { | |||
| auto input_shapes = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| auto output_shapes = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| for (size_t i = 0; i < input_shapes.size() - 1; i++) { | |||
| outer_size_ *= input_shapes[i]; | |||
| } | |||
| inner_size_ = input_shapes[input_shapes.size() - 1]; | |||
| k_ = output_shapes[output_shapes.size() - 1]; | |||
| sorted_ = GetAttr<bool>(kernel_node, "sorted"); | |||
| ceil_power2_ = RoundUpPower2(inner_size_); | |||
| size_t buffer_size = ceil_power2_ * (sizeof(T) + sizeof(S)); | |||
| if (buffer_size > SHARED_MEM_PER_BLOCK) { | |||
| use_share_mem_ = false; | |||
| MS_LOG(WARNING) << "CUDA share memory not enough, sort with RAM"; | |||
| } | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| protected: | |||
| void InitSizeLists() override { | |||
| input_size_list_.push_back(outer_size_ * inner_size_ * sizeof(T)); | |||
| input_size_list_.push_back(sizeof(S)); | |||
| output_size_list_.push_back(outer_size_ * k_ * sizeof(T)); | |||
| output_size_list_.push_back(outer_size_ * k_ * sizeof(S)); | |||
| if (use_share_mem_ == false) { | |||
| workspace_size_list_.push_back(outer_size_ * ceil_power2_ * sizeof(T)); | |||
| workspace_size_list_.push_back(outer_size_ * ceil_power2_ * sizeof(S)); | |||
| } | |||
| } | |||
| private: | |||
| bool sorted_; | |||
| int outer_size_; | |||
| int inner_size_; | |||
| int k_; | |||
| bool use_share_mem_; | |||
| int ceil_power2_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| std::vector<size_t> workspace_size_list_; | |||
| }; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_GPU_TOPK_H_ | |||
| @@ -116,16 +116,16 @@ __global__ void BroadcastKernel(const int l0, const int l1, const int l2, const | |||
| output); | |||
| case BROADCAST_TYPE_REALDIV: | |||
| return BroadcastOperator<T, S, RealDivFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1, | |||
| output); | |||
| output); | |||
| case BROADCAST_TYPE_MUL: | |||
| return BroadcastOperator<T, S, MulFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1, | |||
| output); | |||
| output); | |||
| case BROADCAST_TYPE_SUB: | |||
| return BroadcastOperator<T, S, SubFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1, | |||
| output); | |||
| output); | |||
| case BROADCAST_TYPE_ADD: | |||
| return BroadcastOperator<T, S, AddFunc<T, S>>(l0, l1, l2, l3, r0, r1, r2, r3, d0, d1, d2, d3, input0, input1, | |||
| output); | |||
| output); | |||
| } | |||
| } | |||
| @@ -176,6 +176,28 @@ void NoBroadcast(const int &nums, enum BroadcastOpType op, const T *input0, cons | |||
| NoBroadcastKernel<<<GET_BLOCKS(nums), GET_THREADS, 0, stream>>>(nums, op, input0, input1, output); | |||
| } | |||
| template <typename T> | |||
| __global__ void BroadcastToKernel(const int i0, const int i1, const int i2, const int i3, const int o0, | |||
| const int o1, const int o2, const int o3, const T *input_addr, T *output_addr) { | |||
| for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < o0 * o1 * o2 * o3; pos += blockDim.x * gridDim.x) { | |||
| int i = pos / (o1 * o2 * o3) % o0; | |||
| int j = pos / (o2 * o3) % o1; | |||
| int k = pos / o3 % o2; | |||
| int l = pos % o3; | |||
| int input_idx = Index(i, i0) * i1 * i2 * i3 + Index(j, i1) * i2 * i3 + Index(k, i2) * i3 + Index(l, i3); | |||
| output_addr[pos] = input_addr[input_idx]; | |||
| } | |||
| } | |||
| template <typename T> | |||
| void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1, | |||
| const int &o2, const int &o3, const T *input_addr, T *output_addr, cudaStream_t stream) { | |||
| int nums = o0 * o1 * o2 * o3; | |||
| BroadcastToKernel<<<GET_BLOCKS(nums), GET_THREADS, 0, stream>>>(i0, i1, i2, i3, o0, o1, o2, o3, input_addr, | |||
| output_addr); | |||
| } | |||
| template void Broadcast(const int &l0, const int &l1, const int &l2, const int &l3, const int &r0, const int &r1, | |||
| const int &r2, const int &r3, const int &d0, const int &d1, const int &d2, const int &d3, | |||
| enum BroadcastOpType op, const float *input0, const float *input1, bool *output, | |||
| @@ -204,5 +226,11 @@ template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half * | |||
| bool *output, cudaStream_t stream); | |||
| template void NoBroadcast(const int &nums, enum BroadcastOpType op, const half *input0, const half *input1, | |||
| half *output, cudaStream_t stream); | |||
| template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1, | |||
| int *output, cudaStream_t stream); | |||
| template void NoBroadcast(const int &nums, enum BroadcastOpType op, const int *input0, const int *input1, int *output, | |||
| cudaStream_t stream); | |||
| template void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1, | |||
| const int &o2, const int &o3, const float *input_addr, float *output_addr, | |||
| cudaStream_t stream); | |||
| template void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1, | |||
| const int &o2, const int &o3, const half *input_addr, half *output_addr, cudaStream_t stream); | |||
| @@ -41,4 +41,8 @@ template <typename T, typename S> | |||
| void NoBroadcast(const int &size, enum BroadcastOpType op, const T *input0, const T *input1, S *output, | |||
| cudaStream_t stream); | |||
| template <typename T> | |||
| void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, const int &o0, const int &o1, | |||
| const int &o2, const int &o3, const T *input_addr, T *output_addr, cudaStream_t stream); | |||
| #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_ | |||
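BroadcastToKernel decomposes each output offset into 4-D coordinates and remaps them through the `Index` helper, which in this codebase's broadcast utilities collapses size-1 input dimensions. A CPU sketch under that assumption (`Index(i, d)` returns 0 when d == 1, else i — the usual NumPy-style rule):

    #include <vector>

    inline int IndexRef(int i, int d) { return d == 1 ? 0 : i; }

    // Host-side reference of the BroadcastTo mapping above.
    void BroadcastToReference(int i0, int i1, int i2, int i3, int o0, int o1, int o2, int o3,
                              const std::vector<float> &input, std::vector<float> *output) {
      for (int pos = 0; pos < o0 * o1 * o2 * o3; ++pos) {
        int i = pos / (o1 * o2 * o3) % o0;
        int j = pos / (o2 * o3) % o1;
        int k = pos / o3 % o2;
        int l = pos % o3;
        int src = IndexRef(i, i0) * i1 * i2 * i3 + IndexRef(j, i1) * i2 * i3 +
                  IndexRef(k, i2) * i3 + IndexRef(l, i3);
        (*output)[pos] = input[src];
      }
    }

For instance, broadcasting a (1,1,1,4) tensor to (2,1,1,4) maps both values of i to the same four source elements.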
| @@ -19,90 +19,51 @@ | |||
| #include <cuda_runtime.h> | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cuh" | |||
| template <typename T> | |||
| __global__ void Concat(const size_t size, const int w1, const int w2, const T* input_1, const T* input_2, T* output) { | |||
| for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) { | |||
| int n = pos / (w1 + w2); | |||
| int m = pos % (w1 + w2); | |||
| output[pos] = m >= w1 ? input_2[n * w2 + m - w1] : input_1[n * w1 + m]; | |||
| __global__ void Concat(const int size, const int input_num, | |||
| const int all_size_before_axis, const int all_size_axis, | |||
| int* len_axis, T** inputs, T* output) { | |||
| for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) { | |||
| int num = pos % all_size_before_axis / all_size_axis; | |||
| int block = -1; | |||
| int axis_inc = 0; | |||
| int block_len = 0; | |||
| for (int i = 0; i < input_num; i++) { | |||
| if (axis_inc <= num) { | |||
| block++; | |||
| axis_inc += len_axis[i]; | |||
| } else { | |||
| break; | |||
| } | |||
| } | |||
| block_len = len_axis[block]; | |||
| axis_inc -= len_axis[block]; | |||
| int block_pos = pos / all_size_before_axis * block_len * all_size_axis + | |||
| (num - axis_inc) * all_size_axis + pos % all_size_axis; | |||
| output[pos] = inputs[block][block_pos]; | |||
| } | |||
| return; | |||
| } | |||
| template <typename T> | |||
| __global__ void Concat(const size_t size, const int w1, const int w2, const int w3, | |||
| const T* input_1, const T* input_2, const T* input_3, T* output) { | |||
| for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) { | |||
| int n = pos / (w1 + w2 + w3); | |||
| int m = pos % (w1 + w2 + w3); | |||
| output[pos] = m < w1 ? input_1[n * w1 + m] : | |||
| m < w1 + w2 ? input_2[n * w2 + m - w1] : | |||
| input_3[n * w3 + m - w1 - w2]; | |||
| } | |||
| return; | |||
| } | |||
| template <typename T> | |||
| __global__ void Concat(const size_t size, const int w1, const int w2, const int w3, const int w4, | |||
| const T* input_1, const T* input_2, const T* input_3, const T* input_4, T* output) { | |||
| for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) { | |||
| int n = pos / (w1 + w2 + w3 + w4); | |||
| int m = pos % (w1 + w2 + w3 + w4); | |||
| output[pos] = m < w1 ? input_1[n * w1 + m] : | |||
| m < w1 + w2 ? input_2[n * w2 + m - w1]: | |||
| m < w1 + w2 + w3 ? input_3[n * w3 + m - w1 - w2]: | |||
| input_4[n * w4 + m - w1 - w2 - w3]; | |||
| } | |||
| return; | |||
| } | |||
| template <typename T> | |||
| void ConcatKernel(const size_t size, const int w1, const int w2, const T* input_1, const T* input_2, T* output, | |||
| cudaStream_t cuda_stream) { | |||
| Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, w1, w2, input_1, input_2, output); | |||
| return; | |||
| } | |||
| template <typename T> | |||
| void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, | |||
| const T* input_1, const T* input_2, const T* input_3, T* output, | |||
| void ConcatKernel(const int size, const int input_num, | |||
| const int all_size_before_axis, const int all_size_axis, | |||
| int* len_axis, T** inputs, T* output, | |||
| cudaStream_t cuda_stream) { | |||
| Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, w1, w2, w3, input_1, input_2, input_3, output); | |||
| Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, input_num, | |||
| all_size_before_axis, all_size_axis, | |||
| len_axis, inputs, output); | |||
| return; | |||
| } | |||
| template <typename T> | |||
| void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4, | |||
| const T* input_1, const T* input_2, const T* input_3, const T* input_4, T* output, | |||
| cudaStream_t cuda_stream) { | |||
| Concat<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, w1, w2, w3, w4, input_1, | |||
| input_2, input_3, input_4, output); | |||
| return; | |||
| } | |||
| template void ConcatKernel(const size_t size, const int w1, const int w2, const float* input_1, const float* input_2, | |||
| float* output, cudaStream_t cuda_stream); | |||
| template void ConcatKernel(const size_t size, const int w1, const int w2, const int* input_1, const int* input_2, | |||
| int* output, cudaStream_t cuda_stream); | |||
| template void ConcatKernel(const size_t size, const int w1, const int w2, const half* input_1, const half* input_2, | |||
| half* output, cudaStream_t cuda_stream); | |||
| template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, | |||
| const float* input_1, const float* input_2, const float* input_3, | |||
| float* output, cudaStream_t cuda_stream); | |||
| template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, | |||
| const int* input_1, const int* input_2, const int* input_3, | |||
| int* output, cudaStream_t cuda_stream); | |||
| template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, | |||
| const half* input_1, const half* input_2, const half* input_3, | |||
| half* output, cudaStream_t cuda_stream); | |||
| template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4, | |||
| const float* input_1, const float* input_2, const float* input_3, const float* input_4, | |||
| float* output, cudaStream_t cuda_stream); | |||
| template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4, | |||
| const int* input_1, const int* input_2, const int* input_3, const int* input_4, | |||
| int* output, cudaStream_t cuda_stream); | |||
| template void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4, | |||
| const half* input_1, const half* input_2, const half* input_3, const half* input_4, | |||
| half* output, cudaStream_t cuda_stream); | |||
| template void ConcatKernel(const int size, const int input_num, | |||
| const int all_size_before_axis, const int all_size_axis, | |||
| int* len_axis, float** inputs, float* output, | |||
| cudaStream_t cuda_stream); | |||
| template void ConcatKernel(const int size, const int input_num, | |||
| const int all_size_before_axis, const int all_size_axis, | |||
| int* len_axis, int** inputs, int* output, | |||
| cudaStream_t cuda_stream); | |||
| template void ConcatKernel(const int size, const int input_num, | |||
| const int all_size_before_axis, const int all_size_axis, | |||
| int* len_axis, half** inputs, half* output, | |||
| cudaStream_t cuda_stream); | |||
| @@ -19,13 +19,8 @@ | |||
| #include "runtime/device/gpu/cuda_common.h" | |||
| template <typename T> | |||
| void ConcatKernel(const size_t size, const int w1, const int w2, const T* input_1, const T* input_2, T* output, | |||
| cudaStream_t cuda_stream); | |||
| template <typename T> | |||
| void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, | |||
| const T* input_1, const T* input_2, const T* input_3, T* output, cudaStream_t cuda_stream); | |||
| template <typename T> | |||
| void ConcatKernel(const size_t size, const int w1, const int w2, const int w3, const int w4, | |||
| const T* input_1, const T* input_2, const T* input_3, const T* input_4, T* output, | |||
| void ConcatKernel(const int size, const int input_num, | |||
| const int all_size_before_axis, const int all_size_axis, | |||
| int* len_axis, T** inputs, T* output, | |||
| cudaStream_t cuda_stream); | |||
| #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CONCATV2IMPL_H_ | |||
| @@ -15,9 +15,9 @@ | |||
| */ | |||
| #include "momentum_impl.cuh" | |||
| template <typename T, typename S> | |||
| template <typename T, typename S, typename G> | |||
| __global__ void MomentumUpdateVariableKernel(const size_t size, T *variable, T *accumulation, const S *learning_rate, | |||
| const T *gradient, const S *momentum) { | |||
| const G *gradient, const S *momentum) { | |||
| for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) { | |||
| accumulation[i] = momentum[0] * accumulation[i] + gradient[i]; | |||
| variable[i] -= learning_rate[0] * accumulation[i]; | |||
| @@ -34,19 +34,32 @@ __global__ void MomentumUpdateVariableKernel(const size_t size, half *variable, | |||
| } | |||
| return; | |||
| } | |||
| template <typename T, typename S> | |||
| void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const T *gradient, | |||
| template <> | |||
| __global__ void MomentumUpdateVariableKernel(const size_t size, float *variable, float *accumulation, | |||
| const float *learning_rate, const half *gradient, | |||
| const float *momentum) { | |||
| for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) { | |||
| accumulation[i] = momentum[0] * accumulation[i] + __half2float(gradient[i]); | |||
| variable[i] -= learning_rate[0] * accumulation[i]; | |||
| } | |||
| return; | |||
| } | |||
| template <typename T, typename S, typename G> | |||
| void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const G *gradient, | |||
| const S *momentum, cudaStream_t cuda_stream) { | |||
| MomentumUpdateVariableKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, variable, accumulation, | |||
| learning_rate, gradient, momentum); | |||
| return; | |||
| } | |||
| template void MomentumUpdateVariable<float, float>(const size_t size, float *variable, float *accumulation, | |||
| template void MomentumUpdateVariable<float, float, float>(const size_t size, float *variable, float *accumulation, | |||
| const float *learning_rate, const float *gradient, | |||
| const float *momentum, cudaStream_t cuda_stream); | |||
| template void MomentumUpdateVariable<half, half>(const size_t size, half *variable, half *accumulation, | |||
| template void MomentumUpdateVariable<half, half, half>(const size_t size, half *variable, half *accumulation, | |||
| const half *learning_rate, const half *gradient, | |||
| const half *momentum, cudaStream_t cuda_stream); | |||
| template void MomentumUpdateVariable<half, float>(const size_t size, half *variable, half *accumulation, | |||
| template void MomentumUpdateVariable<half, float, half>(const size_t size, half *variable, half *accumulation, | |||
| const float *learning_rate, const half *gradient, | |||
| const float *momentum, cudaStream_t cuda_stream); | |||
| template void MomentumUpdateVariable<float, float, half>(const size_t size, float *variable, float *accumulation, | |||
| const float *learning_rate, const half *gradient, | |||
| const float *momentum, cudaStream_t cuda_stream); | |||
| @@ -18,8 +18,8 @@ | |||
| #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MOMENTUMIMPL_H_ | |||
| #include "runtime/device/gpu/cuda_common.h" | |||
| template <typename T, typename S> | |||
| void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const T *gradient, | |||
| template <typename T, typename S, typename G> | |||
| void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const G *gradient, | |||
| const S *momentum, cudaStream_t cuda_stream); | |||
| #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_MOMENTUMIMPL_H_ | |||
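All instantiations implement the same two-line update; the new G parameter only changes the gradient's storage type, and the float/half specialization widens via __half2float before accumulating. A scalar host-side sketch of one step, for reference only:

    // One scalar step of ApplyMomentum, mirroring the kernel body above.
    inline void MomentumStepRef(float *variable, float *accumulation,
                                float learning_rate, float gradient, float momentum) {
      *accumulation = momentum * *accumulation + gradient;  // accumulate gradient
      *variable -= learning_rate * *accumulation;           // apply update
    }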
| @@ -0,0 +1,50 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <stdio.h> | |||
| #include <stdint.h> | |||
| #include <cuda_runtime.h> | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/split_impl.cuh" | |||
| template <typename T> | |||
| __global__ void Split(const int size, const int axis_step, const int all_size_before_axis, | |||
| const int all_size_axis, const T* input, T** outputs) { | |||
| for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) { | |||
| int num = pos % all_size_before_axis / all_size_axis; | |||
| int block = num / axis_step; | |||
| int block_pos = pos / all_size_before_axis * axis_step * all_size_axis + | |||
| num % axis_step * all_size_axis + pos % all_size_axis; | |||
| outputs[block][block_pos] = input[pos]; | |||
| } | |||
| return; | |||
| } | |||
| template <typename T> | |||
| void SplitKernel(const int size, const int axis_step, const int all_size_before_axis, | |||
| const int all_size_axis, const T* input, T** outputs, cudaStream_t cuda_stream) { | |||
| Split<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, axis_step, all_size_before_axis, | |||
| all_size_axis, input, outputs); | |||
| return; | |||
| } | |||
| template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis, | |||
| const int all_size_axis, const float* input, float** outputs, | |||
| cudaStream_t cuda_stream); | |||
| template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis, | |||
| const int all_size_axis, const int* input, int** outputs, | |||
| cudaStream_t cuda_stream); | |||
| template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis, | |||
| const int all_size_axis, const half* input, half** outputs, | |||
| cudaStream_t cuda_stream); | |||
| @@ -0,0 +1,24 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_ | |||
| #include "runtime/device/gpu/cuda_common.h" | |||
| template <typename T> | |||
| void SplitKernel(const int size, const int axis_step, const int all_size_before_axis, | |||
| const int all_size_axis, const T* input, T** outputs, cudaStream_t cuda_stream); | |||
| #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_ | |||
| @@ -0,0 +1,162 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/topk_impl.cuh" | |||
| #include <limits> | |||
| #include <algorithm> | |||
| int RoundUpPower2(int v) { | |||
| v--; | |||
| v |= v >> 1; | |||
| v |= v >> 2; | |||
| v |= v >> 4; | |||
| v |= v >> 8; | |||
| v |= v >> 16; | |||
| v++; | |||
| return v; | |||
| } | |||
| template <typename T> | |||
| __inline__ __device__ void Swap(T *lhs, T *rhs) { | |||
| T tmp = lhs[0]; | |||
| lhs[0] = rhs[0]; | |||
| rhs[0] = tmp; | |||
| } | |||
| template <typename T, typename S> | |||
| __global__ void TopkKernel(const int outer, const int inner, const int ceil_power2, const T *input, const S *k, | |||
| T *output, S *indices, T *data_buff, S *index_buff) { | |||
| // default: sort in shared memory | |||
| extern __shared__ T share_mem[]; | |||
| T *data_arr = share_mem; | |||
| S *index_arr = reinterpret_cast<S *>(data_arr + ceil_power2); | |||
| // fall back to global-memory (workspace) buffers | |||
| if (data_buff != nullptr && index_buff != nullptr) { | |||
| data_arr = data_buff + blockIdx.x * ceil_power2; | |||
| index_arr = index_buff + blockIdx.x * ceil_power2; | |||
| } | |||
| for (int i = threadIdx.x; i < ceil_power2; i += blockDim.x) { | |||
| data_arr[i] = (i < inner) ? input[blockIdx.x * inner + i] : std::numeric_limits<T>::max(); | |||
| index_arr[i] = i; | |||
| } | |||
| __syncthreads(); | |||
| for (size_t i = 2; i <= ceil_power2; i <<= 1) { | |||
| for (size_t j = (i >> 1); j > 0; j >>= 1) { | |||
| for (size_t tid = threadIdx.x; tid < ceil_power2; tid += blockDim.x) { | |||
| size_t tid_comp = tid ^ j; | |||
| if (tid_comp > tid) { | |||
| if ((tid & i) == 0) { | |||
| if (data_arr[tid] > data_arr[tid_comp]) { | |||
| Swap(&data_arr[tid], &data_arr[tid_comp]); | |||
| Swap(&index_arr[tid], &index_arr[tid_comp]); | |||
| } | |||
| } else { | |||
| if (data_arr[tid] < data_arr[tid_comp]) { | |||
| Swap(&data_arr[tid], &data_arr[tid_comp]); | |||
| Swap(&index_arr[tid], &index_arr[tid_comp]); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| __syncthreads(); | |||
| } | |||
| } | |||
| for (size_t tid = threadIdx.x; tid < k[0]; tid += blockDim.x) { | |||
| output[blockIdx.x * k[0] + tid] = data_arr[inner - tid - 1]; | |||
| indices[blockIdx.x * k[0] + tid] = index_arr[inner - tid - 1]; | |||
| } | |||
| } | |||
| template <typename T, typename S> | |||
| void TopK(const int &outer, const int &inner, const T *input, const S *k, T *output, S *indices, T *data_buff, | |||
| S *index_buff, cudaStream_t stream) { | |||
| int ceil_power2 = RoundUpPower2(inner); | |||
| int share_mem = (data_buff == nullptr) ? ceil_power2 * (sizeof(T) + sizeof(S)) : 0; | |||
| int thread = std::min(ceil_power2, GET_THREADS); | |||
| TopkKernel<<<outer, thread, share_mem, stream>>>(outer, inner, ceil_power2, input, k, output, indices, data_buff, | |||
| index_buff); | |||
| } | |||
| template <typename T, typename S> | |||
| __global__ void BitonicSortByKeyKernel(const int outer, const int inner, const int ceil_power2, T *input, | |||
| S *indices, T *data_buff, S *index_buff) { | |||
| // default: sort in shared memory | |||
| extern __shared__ T share_mem[]; | |||
| T *data_arr = share_mem; | |||
| S *index_arr = reinterpret_cast<S *>(data_arr + ceil_power2); | |||
| // fall back to global-memory (workspace) buffers | |||
| if (data_buff != nullptr && index_buff != nullptr) { | |||
| data_arr = data_buff + blockIdx.x * ceil_power2; | |||
| index_arr = index_buff + blockIdx.x * ceil_power2; | |||
| } | |||
| for (int i = threadIdx.x; i < ceil_power2; i += blockDim.x) { | |||
| data_arr[i] = (i < inner) ? input[blockIdx.x * inner + i] : std::numeric_limits<T>::max(); | |||
| index_arr[i] = (i < inner) ? indices[blockIdx.x * inner + i] : std::numeric_limits<S>::max(); | |||
| } | |||
| __syncthreads(); | |||
| for (size_t i = 2; i <= ceil_power2; i <<= 1) { | |||
| for (size_t j = (i >> 1); j > 0; j >>= 1) { | |||
| for (size_t tid = threadIdx.x; tid < ceil_power2; tid += blockDim.x) { | |||
| size_t tid_comp = tid ^ j; | |||
| if (tid_comp > tid) { | |||
| if ((tid & i) == 0) { | |||
| if (index_arr[tid] > index_arr[tid_comp]) { | |||
| Swap(&data_arr[tid], &data_arr[tid_comp]); | |||
| Swap(&index_arr[tid], &index_arr[tid_comp]); | |||
| } | |||
| } else { | |||
| if (index_arr[tid] < index_arr[tid_comp]) { | |||
| Swap(&data_arr[tid], &data_arr[tid_comp]); | |||
| Swap(&index_arr[tid], &index_arr[tid_comp]); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| __syncthreads(); | |||
| } | |||
| } | |||
| for (size_t tid = threadIdx.x; tid < inner; tid += blockDim.x) { | |||
| input[blockIdx.x * inner + tid] = data_arr[tid]; | |||
| indices[blockIdx.x * inner + tid] = index_arr[tid]; | |||
| } | |||
| } | |||
| template <typename T, typename S> | |||
| void BitonicSortByKey(const int &outer, const int &inner, T *input, S *indices, T *data_buff, S *index_buff, | |||
| cudaStream_t stream) { | |||
| int ceil_power2 = RoundUpPower2(inner); | |||
| size_t share_mem = ceil_power2 * (sizeof(T) + sizeof(S)); | |||
| if (share_mem > SHARED_MEM_PER_BLOCK) { | |||
| share_mem = 0; | |||
| } else { | |||
| data_buff = nullptr; | |||
| index_buff = nullptr; | |||
| } | |||
| int thread = std::min(ceil_power2, GET_THREADS); | |||
| BitonicSortByKeyKernel<<<outer, thread, share_mem, stream>>>(outer, inner, ceil_power2, input, indices, data_buff, | |||
| index_buff); | |||
| } | |||
| template void TopK(const int &outer, const int &inner, const float *input_addr, const int *k, float *output, | |||
| int *indices, float *data_buff, int *index_buff, cudaStream_t stream); | |||
| template void BitonicSortByKey(const int &outer, const int &inner, float *input, int *indices, float *data_buff, | |||
| int *index_buff, cudaStream_t stream); | |||
| @@ -0,0 +1,32 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TOPK_H_ | |||
| #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TOPK_H_ | |||
| #include <cuda_runtime.h> | |||
| #include "runtime/device/gpu/cuda_common.h" | |||
| template <typename T, typename S> | |||
| void TopK(const int &outer, const int &inner, const T *input_addr, const S *k, T *output, S *indices, T *data_buff, | |||
| S *index_buff, cudaStream_t stream); | |||
| template <typename T, typename S> | |||
| void BitonicSortByKey(const int &outer, const int &inner, T *input, S *indices, T *data_buff, S *index_buff, | |||
| cudaStream_t stream); | |||
| int RoundUpPower2(int v); | |||
| #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_TOPK_H_ | |||
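RoundUpPower2 is the classic bit-smearing trick: it propagates the highest set bit into every lower bit, then adds one. The bitonic network above requires a power-of-two element count, and the padding slots are initialized to numeric_limits::max() so they sort past all real values. A few worked values, for reference:

    // RoundUpPower2(5)    == 8
    // RoundUpPower2(8)    == 8
    // RoundUpPower2(1000) == 1024
    // so TopK/BitonicSortByKey always operate on ceil_power2 elements,
    // with out-of-range slots padded by numeric_limits::max().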
| @@ -103,6 +103,35 @@ __global__ void ZeroslikeKernel(T *output, size_t count) { | |||
| return; | |||
| } | |||
| template <typename T> | |||
| __global__ void AbsKernel(T *input, T *output, size_t count) { | |||
| for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { | |||
| output[i] = abs(input[i]); | |||
| } | |||
| return; | |||
| } | |||
| template <> | |||
| __global__ void AbsKernel(half *input, half *output, size_t count) { | |||
| half zero = 0.0; | |||
| for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { | |||
| output[i] = input[i] < zero ? -input[i] : input[i]; | |||
| } | |||
| return; | |||
| } | |||
| template <typename T> | |||
| __global__ void FloorKernel(T *input, T *output, size_t count) { | |||
| for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { | |||
| output[i] = floor(input[i]); | |||
| } | |||
| return; | |||
| } | |||
| template <> | |||
| __global__ void FloorKernel(half *input, half *output, size_t count) { | |||
| for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { | |||
| output[i] = hfloor(input[i]); | |||
| } | |||
| return; | |||
| } | |||
| template <typename T> | |||
| void Exponential(T *input, T *output, size_t count, cudaStream_t cuda_stream) { | |||
| ExponentialKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); | |||
| return; | |||
| @@ -147,6 +176,16 @@ void Zeroslike(T *output, size_t count, cudaStream_t cuda_stream) { | |||
| ZeroslikeKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(output, count); | |||
| return; | |||
| } | |||
| template <typename T> | |||
| void Abs(T *input, T *output, size_t count, cudaStream_t cuda_stream) { | |||
| AbsKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); | |||
| return; | |||
| } | |||
| template <typename T> | |||
| void Floor(T *input, T *output, size_t count, cudaStream_t cuda_stream) { | |||
| FloorKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); | |||
| return; | |||
| } | |||
| template void Exponential<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Logarithm<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream); | |||
| @@ -156,6 +195,8 @@ template void Square<float>(float *input, float *output, size_t count, cudaStrea | |||
| template void Sqrt<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Rsqrt<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Zeroslike<float>(float *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Abs<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Floor<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Exponential<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Logarithm<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Negative<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream); | |||
| @@ -164,3 +205,5 @@ template void Square<half>(half *input, half *output, size_t count, cudaStream_t | |||
| template void Sqrt<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Rsqrt<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Zeroslike<half>(half *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Abs<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Floor<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream); | |||
| @@ -34,5 +34,9 @@ template <typename T> | |||
| void Rsqrt(T *input, T *output, size_t count, cudaStream_t cuda_stream); | |||
| template <typename T> | |||
| void Zeroslike(T *output, size_t count, cudaStream_t cuda_stream); | |||
| template <typename T> | |||
| void Abs(T *input, T *output, size_t count, cudaStream_t cuda_stream); | |||
| template <typename T> | |||
| void Floor(T *input, T *output, size_t count, cudaStream_t cuda_stream); | |||
| #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNARYOPIMPL_H_ | |||
| @@ -88,6 +88,12 @@ class GpuKernelRegister { | |||
| static_assert(std::is_base_of<GpuKernel, OPCLASS<T, S>>::value, " must be base of GpuKernel"); \ | |||
| static const GpuKernelRegister g_##OPNAME##_##T##_##S##_gpu_kernel_reg(#OPNAME, ATTR, \ | |||
| []() { return new OPCLASS<T, S>(); }); | |||
| // Register of mixed-precision kernels whose class template takes three typenames. | |||
| #define MS_REG_GPU_KERNEL_THREE(OPNAME, ATTR, OPCLASS, T, S, G) \ | |||
| static_assert(std::is_base_of<GpuKernel, OPCLASS<T, S, G>>::value, " must be base of GpuKernel"); \ | |||
| static const GpuKernelRegister g_##OPNAME##_##T##_##S##_##G##_gpu_kernel_reg( \ | |||
| #OPNAME, ATTR, []() { return new OPCLASS<T, S, G>(); }); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_KERNEL_GPU_GPUKERNELFACTORY_H_ | |||
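For illustration (the real usages are the ApplyMomentum registrations later in this diff), the three-typename macro expands roughly as follows:

    // MS_REG_GPU_KERNEL_THREE(ApplyMomentum, attr, MomentumGpuKernel, float, float, half)
    // expands (roughly) to:
    //   static_assert(std::is_base_of<GpuKernel, MomentumGpuKernel<float, float, half>>::value,
    //                 " must be base of GpuKernel");
    //   static const GpuKernelRegister g_ApplyMomentum_float_float_half_gpu_kernel_reg(
    //       "ApplyMomentum", attr, []() { return new MomentumGpuKernel<float, float, half>(); });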
| @@ -46,5 +46,13 @@ MS_REG_GPU_KERNEL_ONE(Sqrt, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOut | |||
| UnaryOpGpuKernel, float) | |||
| MS_REG_GPU_KERNEL_ONE(Rsqrt, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| UnaryOpGpuKernel, float) | |||
| MS_REG_GPU_KERNEL_ONE(Abs, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| UnaryOpGpuKernel, float) | |||
| MS_REG_GPU_KERNEL_ONE(Abs, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), | |||
| UnaryOpGpuKernel, half) | |||
| MS_REG_GPU_KERNEL_ONE(Floor, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| UnaryOpGpuKernel, float) | |||
| MS_REG_GPU_KERNEL_ONE(Floor, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), | |||
| UnaryOpGpuKernel, half) | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -36,6 +36,8 @@ enum UnaryOptype { | |||
| UNARY_OP_SQUARE, | |||
| UNARY_OP_SQRT, | |||
| UNARY_OP_RSQRT, | |||
| UNARY_OP_ABS, | |||
| UNARY_OP_FLOOR, | |||
| UNARY_OP_INVALID_TYPE = 255 | |||
| }; | |||
| static const std::map<std::string, UnaryOptype> kUnaryOpTypeMap = {{"Exp", UNARY_OP_EXP}, | |||
| @@ -45,7 +47,9 @@ static const std::map<std::string, UnaryOptype> kUnaryOpTypeMap = {{"Exp", UNARY | |||
| {"ZerosLike", UNARY_OP_ZEROSLIKE}, | |||
| {"Square", UNARY_OP_SQUARE}, | |||
| {"Sqrt", UNARY_OP_SQRT}, | |||
| {"Rsqrt", UNARY_OP_RSQRT}}; | |||
| {"Rsqrt", UNARY_OP_RSQRT}, | |||
| {"Abs", UNARY_OP_ABS}, | |||
| {"Floor", UNARY_OP_FLOOR}}; | |||
| template <typename T> | |||
| class UnaryOpGpuKernel : public GpuKernel { | |||
| public: | |||
| @@ -100,6 +104,14 @@ class UnaryOpGpuKernel : public GpuKernel { | |||
| Zeroslike(output_addr, output_size_ / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| return true; | |||
| } | |||
| case UNARY_OP_ABS: { | |||
| Abs(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| break; | |||
| } | |||
| case UNARY_OP_FLOOR: { | |||
| Floor(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| break; | |||
| } | |||
| default: { | |||
| MS_LOG(EXCEPTION) << "Unary operation " << unary_op_type_ << " is not supported."; | |||
| } | |||
| @@ -34,15 +34,15 @@ MS_REG_GPU_KERNEL_ONE(FusedBatchNorm, | |||
| MS_REG_GPU_KERNEL_ONE(FusedBatchNorm, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddOutputAttr(kNumberTypeFloat16) | |||
| .AddOutputAttr(kNumberTypeFloat16) | |||
| .AddOutputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat16) | |||
| .AddOutputAttr(kNumberTypeFloat16), | |||
| .AddOutputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32), | |||
| FusedBatchNormGpuKernel, half) | |||
| MS_REG_GPU_KERNEL_ONE(BatchNorm, | |||
| KernelAttr() | |||
| @@ -60,15 +60,15 @@ MS_REG_GPU_KERNEL_ONE(BatchNorm, | |||
| MS_REG_GPU_KERNEL_ONE(BatchNorm, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddOutputAttr(kNumberTypeFloat16) | |||
| .AddOutputAttr(kNumberTypeFloat16) | |||
| .AddOutputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat16) | |||
| .AddOutputAttr(kNumberTypeFloat16), | |||
| .AddOutputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32), | |||
| FusedBatchNormGpuKernel, half) | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -56,17 +56,17 @@ class FusedBatchNormGpuKernel : public GpuKernel { | |||
| return true; | |||
| } | |||
| auto x = GetDeviceAddress<T>(inputs, 0); | |||
| auto scale = GetDeviceAddress<T>(inputs, 1); | |||
| auto bias = GetDeviceAddress<T>(inputs, 2); | |||
| auto runing_mean = GetDeviceAddress<T>(inputs, 3); | |||
| auto runnig_variance = GetDeviceAddress<T>(inputs, 4); | |||
| auto scale = GetDeviceAddress<float>(inputs, 1); | |||
| auto bias = GetDeviceAddress<float>(inputs, 2); | |||
| auto runing_mean = GetDeviceAddress<float>(inputs, 3); | |||
| auto runnig_variance = GetDeviceAddress<float>(inputs, 4); | |||
| auto y = GetDeviceAddress<T>(outputs, 0); | |||
| const float alpha = 1; | |||
| const float beta = 0; | |||
| if (is_train_) { | |||
| auto save_mean = GetDeviceAddress<T>(outputs, 3); | |||
| auto save_variance = GetDeviceAddress<T>(outputs, 4); | |||
| auto save_mean = GetDeviceAddress<float>(outputs, 3); | |||
| auto save_variance = GetDeviceAddress<float>(outputs, 4); | |||
| CHECK_CUDNN_RET_WITH_EXCEPT( | |||
| cudnnBatchNormalizationForwardTraining(handle_, mode_, &alpha, &beta, x_desc_, x, y_desc_, y, | |||
| scale_bias_mean_var_desc_, scale, bias, exp_avg_factor_, runing_mean, | |||
| @@ -33,12 +33,12 @@ MS_REG_GPU_KERNEL_ONE(FusedBatchNormGrad, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddOutputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat16) | |||
| .AddOutputAttr(kNumberTypeFloat16), | |||
| .AddOutputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32), | |||
| FusedBatchNormGradGpuKernel, half) | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -55,12 +55,12 @@ class FusedBatchNormGradGpuKernel : public GpuKernel { | |||
| } | |||
| auto dy = GetDeviceAddress<T>(inputs, 0); | |||
| auto x = GetDeviceAddress<T>(inputs, 1); | |||
| auto scale = GetDeviceAddress<T>(inputs, 2); | |||
| auto save_mean = GetDeviceAddress<T>(inputs, 3); | |||
| auto save_variance = GetDeviceAddress<T>(inputs, 4); | |||
| auto scale = GetDeviceAddress<float>(inputs, 2); | |||
| auto save_mean = GetDeviceAddress<float>(inputs, 3); | |||
| auto save_variance = GetDeviceAddress<float>(inputs, 4); | |||
| auto dx = GetDeviceAddress<T>(outputs, 0); | |||
| auto bn_scale = GetDeviceAddress<T>(outputs, 1); | |||
| auto bn_bias = GetDeviceAddress<T>(outputs, 2); | |||
| auto bn_scale = GetDeviceAddress<float>(outputs, 1); | |||
| auto bn_bias = GetDeviceAddress<float>(outputs, 2); | |||
| const float alpha_data_diff = 1; | |||
| const float beta_data_diff = 0; | |||
| @@ -18,32 +18,41 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| MS_REG_GPU_KERNEL_TWO(ApplyMomentum, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32), | |||
| MomentumGpuKernel, float, float) | |||
| MS_REG_GPU_KERNEL_TWO(ApplyMomentum, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddOutputAttr(kNumberTypeFloat16), | |||
| MomentumGpuKernel, half, half) | |||
| MS_REG_GPU_KERNEL_TWO(ApplyMomentum, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat16), | |||
| MomentumGpuKernel, half, float) | |||
| MS_REG_GPU_KERNEL_THREE(ApplyMomentum, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32), | |||
| MomentumGpuKernel, float, float, float) | |||
| MS_REG_GPU_KERNEL_THREE(ApplyMomentum, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddOutputAttr(kNumberTypeFloat16), | |||
| MomentumGpuKernel, half, half, half) | |||
| MS_REG_GPU_KERNEL_THREE(ApplyMomentum, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat16), | |||
| MomentumGpuKernel, half, float, half) | |||
| MS_REG_GPU_KERNEL_THREE(ApplyMomentum, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddOutputAttr(kNumberTypeFloat32), | |||
| MomentumGpuKernel, float, float, half) | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
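| Note: moving from MS_REG_GPU_KERNEL_TWO to MS_REG_GPU_KERNEL_THREE adds a third dtype axis for the gradient, so a parameter in one precision can consume a gradient in another without an explicit Cast. The four registrations above pin these instantiations (T = variable/accumulation dtype, S = learning-rate/momentum dtype, G = gradient dtype): | |||
| // MomentumGpuKernel<float, float, float> - pure fp32 | |||
| // MomentumGpuKernel<half, half, half>    - pure fp16 | |||
| // MomentumGpuKernel<half, float, half>   - fp16 params, fp32 lr/momentum | |||
| // MomentumGpuKernel<float, float, half>  - fp32 params, fp16 gradient | |||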
| @@ -23,7 +23,7 @@ | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/momentum_impl.cuh" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T, typename S> | |||
| template <typename T, typename S, typename G> | |||
| class MomentumGpuKernel : public GpuKernel { | |||
| public: | |||
| MomentumGpuKernel() | |||
| @@ -38,7 +38,7 @@ class MomentumGpuKernel : public GpuKernel { | |||
| T *variable = GetDeviceAddress<T>(inputs, 0); | |||
| T *accumulation = GetDeviceAddress<T>(inputs, 1); | |||
| S *learning_rate = GetDeviceAddress<S>(inputs, 2); | |||
| T *gradient = GetDeviceAddress<T>(inputs, 3); | |||
| G *gradient = GetDeviceAddress<G>(inputs, 3); | |||
| S *momentum = GetDeviceAddress<S>(inputs, 4); | |||
| MomentumUpdateVariable(inputs[0]->size / sizeof(T), variable, accumulation, learning_rate, gradient, momentum, | |||
| reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| @@ -54,7 +54,7 @@ class MomentumGpuKernel : public GpuKernel { | |||
| variable_size_ = sizeof(T); | |||
| accumulation_size_ = sizeof(T); | |||
| learning_rate_size_ = sizeof(S); | |||
| gradient_size_ = sizeof(T); | |||
| gradient_size_ = sizeof(G); | |||
| momentum_size_ = sizeof(S); | |||
| auto variable_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
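| Note: for reference, a scalar sketch of the per-element update that MomentumUpdateVariable performs on device, assuming the standard (non-Nesterov) ApplyMomentum semantics; the names here are illustrative, not the CUDA implementation: | |||
| template <typename T, typename S, typename G> | |||
| void MomentumStep(T *var, T *accum, S lr, const G *grad, S momentum, size_t n) { | |||
| for (size_t i = 0; i < n; ++i) { | |||
| accum[i] = static_cast<T>(momentum) * accum[i] + static_cast<T>(grad[i]); | |||
| var[i] -= static_cast<T>(lr) * accum[i]; | |||
| } | |||
| } | |||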
| @@ -81,6 +81,7 @@ static std::map<string, string> tbe_func_adapter_map = { | |||
| {"sparse_apply_proximal_adagrad", "sparse_apply_proximal_adagrad_d"}, | |||
| {"apply_add_sign", "apply_add_sign_d"}, | |||
| {"apply_power_sign", "apply_power_sign_d"}, | |||
| {"apply_centered_rms_prop", "apply_centered_rms_prop_d"}, | |||
| {"transpose", "transpose_d"}, | |||
| {"fill", "fill_d"}, | |||
| {"unsorted_segment_sum", "unsorted_segment_sum_d"}, | |||
| @@ -43,6 +43,7 @@ constexpr auto kJInputs = "inputs"; | |||
| constexpr auto kJOutputs = "outputs"; | |||
| constexpr auto kJAttrs = "attrs"; | |||
| constexpr auto kJKernelName = "kernel_name"; | |||
| constexpr auto kJFullName = "full_name"; | |||
| constexpr auto kJOpInfo = "op_info"; | |||
| constexpr auto kJDtype = "dtype"; | |||
| constexpr auto kJtype = "type"; | |||
| @@ -125,6 +126,7 @@ bool TbeKernelJsonCreator::GenTbeSingleKernelJson(const std::shared_ptr<mindspor | |||
| op_info_json[kJKernelName] = json_name_; | |||
| } | |||
| (*kernel_json)[kJOpInfo] = op_info_json; | |||
| (*kernel_json)[kJFullName] = anf_node->fullname_with_scope(); | |||
| if (creater_type_ == SINGLE_BUILD) { | |||
| TbeUtils::SaveJsonInfo(json_name_, json_info_); | |||
| } | |||
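| Note: with the added line, every generated TBE kernel JSON also records the node's full scope name next to the op_info block, which makes it possible to map a compiled kernel back to its graph node. Illustratively (field names come from the constants above; the values are made up): | |||
| // { "op_info": { "kernel_name": "te_relu_1", ... }, | |||
| //   "full_name": "Default/network/relu-ReLU" } | |||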
| @@ -97,6 +97,7 @@ | |||
| #include "backend/optimizer/ascend/format_type/modify_ops_attrs.h" | |||
| #include "backend/optimizer/ascend/format_type/remove_no_use_reshape_op.h" | |||
| #include "backend/optimizer/ascend/ir_fusion/add_input_to_output.h" | |||
| #include "backend/optimizer/ascend/format_type/remove_internal_output.h" | |||
| #include "utils/context/ms_context.h" | |||
| #include "utils/config_manager.h" | |||
| #include "debug/anf_ir_dump.h" | |||
| @@ -201,6 +202,7 @@ void AscendDataLayout(const std::shared_ptr<session::KernelGraph> &kernel_graph) | |||
| data_layout_pm->AddPass(std::make_shared<OptimizeDependence>()); | |||
| data_layout_pm->AddPass(std::make_shared<TransDataSplit>()); | |||
| data_layout_pm->AddPass(std::make_shared<EraseVisitAttr>()); | |||
| data_layout_pm->AddPass(std::make_shared<RemoveInternalOutputTransOp>()); | |||
| optimizer->AddPassManager(data_layout_pm); | |||
| (void)optimizer->Optimize(kernel_graph); | |||
| kernel_graph->SetExecOrderByDefault(); | |||
| @@ -222,6 +224,7 @@ void AscendMixPrecision(const std::shared_ptr<session::KernelGraph> &kernel_grap | |||
| mixed_precision_pm->AddPass(std::make_shared<LayerNormBetaGammaBackpropFusion>()); | |||
| mixed_precision_pm->AddPass(std::make_shared<EraseVisitAttr>()); | |||
| mixed_precision_pm->AddPass(std::make_shared<ConvertUnSupportNodeToAICPU>()); | |||
| mixed_precision_pm->AddPass(std::make_shared<RemoveInternalOutputCast>()); | |||
| optimizer->AddPassManager(mixed_precision_pm); | |||
| (void)optimizer->Optimize(kernel_graph); | |||
| kernel_graph->SetExecOrderByDefault(); | |||
| @@ -142,6 +142,7 @@ AnfNodePtr InsertTransOpForMultipleOutput(const FuncGraphPtr &func_graph, const | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| std::vector<AnfNodePtr> make_tuple_inputs; | |||
| make_tuple_inputs.push_back(NewValueNode(prim::kPrimMakeTuple)); | |||
| auto kernel_graph = func_graph->cast<KernelGraphPtr>(); | |||
| for (size_t output_idx = 0; output_idx < AnfAlgo::GetOutputTensorNum(node); ++output_idx) { | |||
| std::string output_format = AnfAlgo::GetOutputFormat(node, output_idx); | |||
| if (output_format == kOpFormat_NC1KHKWHWC0) { | |||
| @@ -151,7 +152,11 @@ AnfNodePtr InsertTransOpForMultipleOutput(const FuncGraphPtr &func_graph, const | |||
| auto tuple_getitem = CreatTupleGetItemNode(func_graph, node, output_idx); | |||
| std::vector<size_t> origin_shape = AnfAlgo::GetOutputInferShape(node, output_idx); | |||
| if (kCommonFormatSet.find(output_format) == kCommonFormatSet.end() && origin_shape.size() > 1) { | |||
| make_tuple_inputs.emplace_back(AddTransOpNodeToGraph(func_graph, tuple_getitem, kernel_select, 0, false)); | |||
| auto trans_op = AddTransOpNodeToGraph(func_graph, tuple_getitem, kernel_select, 0, false); | |||
| if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) { | |||
| kernel_graph->ReplaceInternalOutput(node, trans_op, output_idx, 0); | |||
| } | |||
| make_tuple_inputs.emplace_back(trans_op); | |||
| } else { | |||
| // No need insert trans op. | |||
| make_tuple_inputs.push_back(tuple_getitem); | |||
| @@ -249,9 +254,14 @@ AnfNodePtr InsertTransOpForOutput(const FuncGraphPtr &func_graph, const AnfNodeP | |||
| if (outputs_num == 0) { | |||
| return node; | |||
| } | |||
| auto kernel_graph = func_graph->cast<KernelGraphPtr>(); | |||
| // Single output | |||
| if (outputs_num == 1 && (!AnfAlgo::IsTupleOutput(node))) { | |||
| return InsertTransOpForSingleOutput(func_graph, node, kernel_select); | |||
| auto new_node = InsertTransOpForSingleOutput(func_graph, node, kernel_select); | |||
| if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) { | |||
| kernel_graph->ReplaceInternalOutput(node, new_node); | |||
| } | |||
| return new_node; | |||
| } | |||
| // Multiple output | |||
| return InsertTransOpForMultipleOutput(func_graph, node, kernel_select); | |||
| @@ -40,6 +40,7 @@ AnfNodePtr InsertCastForMultipleOutput(const FuncGraphPtr &func_graph, const CNo | |||
| std::vector<AnfNodePtr> make_tuple_inputs; | |||
| AbstractBasePtrList abstract_list; | |||
| make_tuple_inputs.push_back(NewValueNode(prim::kPrimMakeTuple)); | |||
| auto kernel_graph = func_graph->cast<KernelGraphPtr>(); | |||
| for (size_t output_idx = 0; output_idx < AnfAlgo::GetOutputTensorNum(cnode); ++output_idx) { | |||
| AnfNodePtr replace_node = nullptr; | |||
| const auto origin_shape = AnfAlgo::GetOutputInferShape(cnode, output_idx); | |||
| @@ -64,6 +65,9 @@ AnfNodePtr InsertCastForMultipleOutput(const FuncGraphPtr &func_graph, const CNo | |||
| MS_EXCEPTION_IF_NULL(replace_node); | |||
| replace_node->set_scope(cnode->scope()); | |||
| AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node); | |||
| if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(cnode)) { | |||
| kernel_graph->ReplaceInternalOutput(cnode, replace_node, output_idx, 0); | |||
| } | |||
| } else { | |||
| replace_node = getitem; | |||
| } | |||
| @@ -87,6 +91,7 @@ AnfNodePtr InsertCastForOutput(const FuncGraphPtr &func_graph, const CNodePtr &c | |||
| return cnode; | |||
| } | |||
| MS_EXCEPTION_IF_NULL(cnode->Type()); | |||
| auto kernel_graph = func_graph->cast<KernelGraphPtr>(); | |||
| // Single output | |||
| if (!cnode->Type()->isa<Tuple>()) { | |||
| if (!need_insert_cast[0]) { | |||
| @@ -109,6 +114,9 @@ AnfNodePtr InsertCastForOutput(const FuncGraphPtr &func_graph, const CNodePtr &c | |||
| MS_EXCEPTION_IF_NULL(replace_node); | |||
| replace_node->set_scope(cnode->scope()); | |||
| AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), replace_node); | |||
| if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(cnode)) { | |||
| kernel_graph->ReplaceInternalOutput(cnode, replace_node); | |||
| } | |||
| } | |||
| return replace_node; | |||
| } | |||
| @@ -188,6 +196,10 @@ const AnfNodePtr InsertCast::Process(const FuncGraphPtr &func_graph, const AnfNo | |||
| CNodePtr cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| auto new_node = InsertCastForInput(func_graph, cnode); | |||
| auto kernel_graph = func_graph->cast<std::shared_ptr<session::KernelGraph>>(); | |||
| if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) { | |||
| kernel_graph->ReplaceInternalOutput(node, new_node); | |||
| } | |||
| // process output | |||
| return InsertCastForOutput(func_graph, new_node, std::vector<bool>(AnfAlgo::GetOutputTensorNum(new_node), true)); | |||
| } | |||
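| Note: the hunks above all preserve one invariant: whenever a pass replaces a node that the kernel graph exposes as an internal output, the graph's internal-output map must be re-pointed at the replacement. Schematically, using the calls as they appear above (the four-argument form re-points a single tuple slot): | |||
| if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(old_node)) { | |||
| kernel_graph->ReplaceInternalOutput(old_node, new_node); // single output | |||
| // kernel_graph->ReplaceInternalOutput(old_node, new_node, output_idx, 0); // one tuple slot | |||
| } | |||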
| @@ -46,14 +46,13 @@ const AnfNodePtr InsertTransOp::Process(const FuncGraphPtr &func_graph, const An | |||
| if (node == nullptr || !AnfAlgo::IsRealKernel(node)) { | |||
| return nullptr; | |||
| } | |||
| AnfNodePtr front_node; | |||
| AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), node); | |||
| MS_LOG(DEBUG) << "process op: " << node->DebugString(); | |||
| AnfNodePtr new_node = InsertTransOpForInput(func_graph, node, kernel_select_); | |||
| auto kernel_graph = func_graph->cast<std::shared_ptr<session::KernelGraph>>(); | |||
| if (kernel_graph != nullptr && kernel_graph->IsInternalOutput(node)) { | |||
| front_node = kernel_graph->GetFrontNodeByInternalOutput(node); | |||
| kernel_graph->ReplaceInternalOutput(node, new_node); | |||
| } | |||
| AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), node); | |||
| MS_LOG(DEBUG) << "====process op: " << node->DebugString(); | |||
| AnfNodePtr new_node = InsertTransOpForInput(func_graph, node, kernel_select_); | |||
| auto ms_context = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(ms_context); | |||
| if (ms_context->execution_mode() == kPynativeMode && !ms_context->enable_pynative_hook()) { | |||
| @@ -61,12 +60,7 @@ const AnfNodePtr InsertTransOp::Process(const FuncGraphPtr &func_graph, const An | |||
| return new_node; | |||
| } | |||
| } | |||
| auto final_node = InsertTransOpForOutput(func_graph, new_node, kernel_select_); | |||
| if (kernel_graph != nullptr && front_node != nullptr) { | |||
| auto old_node = kernel_graph->GetInternalOutputByFrontNode(front_node); | |||
| kernel_graph->ReplaceInternalOutput(old_node, final_node); | |||
| } | |||
| return final_node; | |||
| return InsertTransOpForOutput(func_graph, new_node, kernel_select_); | |||
| } | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,83 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/optimizer/ascend/format_type/remove_internal_output.h" | |||
| #include <memory> | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| namespace { | |||
| bool UsedForOutputOnly(const FuncGraphPtr &func_graph, const AnfNodePtr &node) { | |||
| MS_EXCEPTION_IF_NULL(func_graph); | |||
| auto manager = func_graph->manager(); | |||
| MS_EXCEPTION_IF_NULL(manager); | |||
| auto &node_users = manager->node_users(); | |||
| auto iter = node_users.find(node); | |||
| if (iter == node_users.end()) { | |||
| return false; | |||
| } | |||
| const auto &node_set = iter->second; | |||
| for (const auto &node_index : node_set) { | |||
| if (!AnfAlgo::CheckPrimitiveType(node_index.first, prim::kPrimMakeTuple)) { | |||
| return false; | |||
| } | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace | |||
| const BaseRef RemoveInternalOutputTransOp::DefinePattern() const { | |||
| VarPtr X = std::make_shared<Var>(); | |||
| auto prim = std::make_shared<Primitive>(kTransDataOpName); | |||
| return VectorRef({prim, X}); | |||
| } | |||
| const BaseRef RemoveInternalOutputCast::DefinePattern() const { | |||
| VarPtr X = std::make_shared<Var>(); | |||
| return VectorRef({prim::kPrimCast, X}); | |||
| } | |||
| const AnfNodePtr RemoveInternalOutput::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, | |||
| const EquivPtr &) const { | |||
| MS_EXCEPTION_IF_NULL(func_graph); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| auto kernel_graph = func_graph->cast<KernelGraphPtr>(); | |||
| if (kernel_graph == nullptr) { | |||
| return nullptr; | |||
| } | |||
| if (!kernel_graph->IsInternalOutput(node)) { | |||
| return nullptr; | |||
| } | |||
| if (!UsedForOutputOnly(func_graph, node)) { | |||
| return nullptr; | |||
| } | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| CheckCNodeInputSize(cnode, kTransOpInputNum); | |||
| auto input_node = cnode->input(1); | |||
| if (!AnfAlgo::CheckPrimitiveType(input_node, prim::kPrimTupleGetItem)) { | |||
| kernel_graph->ReplaceInternalOutput(node, input_node); | |||
| } else { | |||
| auto tuple_getitem = input_node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(tuple_getitem); | |||
| int idx = AnfAlgo::GetTupleGetItemOutIndex(tuple_getitem); | |||
| AnfNodePtr real_input_node = AnfAlgo::GetTupleGetItemRealInput(tuple_getitem); | |||
| kernel_graph->ReplaceInternalOutput(node, real_input_node, 0, idx); | |||
| } | |||
| return input_node; | |||
| } | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
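| Note: the new pass strips a TransData or Cast that sits between an internal output and the graph's MakeTuple, re-pointing the internal-output map at the producer instead. A minimal sketch of wiring it up standalone, assuming the GraphOptimizer/PassManager plumbing used in the pass-manager hunks above (the pass-manager name is ours): | |||
| auto optimizer = std::make_shared<opt::GraphOptimizer>(); | |||
| auto pm = std::make_shared<opt::PassManager>("remove_internal_output_pm"); | |||
| pm->AddPass(std::make_shared<opt::RemoveInternalOutputTransOp>()); | |||
| pm->AddPass(std::make_shared<opt::RemoveInternalOutputCast>()); | |||
| optimizer->AddPassManager(pm); | |||
| (void)optimizer->Optimize(kernel_graph); // kernel_graph: KernelGraphPtr | |||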
| @@ -0,0 +1,51 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_INTERNAL_OUTPUT_H_ | |||
| #define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_INTERNAL_OUTPUT_H_ | |||
| #include <string> | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| class RemoveInternalOutput : public PatternProcessPass { | |||
| public: | |||
| explicit RemoveInternalOutput(const std::string &name, bool multigraph = true) | |||
| : PatternProcessPass(name, multigraph) {} | |||
| ~RemoveInternalOutput() override = default; | |||
| const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; | |||
| }; | |||
| class RemoveInternalOutputTransOp : public RemoveInternalOutput { | |||
| public: | |||
| explicit RemoveInternalOutputTransOp(bool multigraph = true) | |||
| : RemoveInternalOutput("remove_internal_output_trans_op", multigraph) {} | |||
| ~RemoveInternalOutputTransOp() override = default; | |||
| const BaseRef DefinePattern() const override; | |||
| }; | |||
| class RemoveInternalOutputCast : public RemoveInternalOutput { | |||
| public: | |||
| explicit RemoveInternalOutputCast(bool multigraph = true) | |||
| : RemoveInternalOutput("remove_internal_output_cast", multigraph) {} | |||
| ~RemoveInternalOutputCast() override = default; | |||
| const BaseRef DefinePattern() const override; | |||
| }; | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_FORMAT_TYPE_REMOVE_INTERNAL_OUTPUT_H_ | |||
| @@ -13,8 +13,8 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_FUSION_H_ | |||
| #define MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_FUSION_H_ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_FUSION_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_FUSION_H_ | |||
| #include <memory> | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| @@ -53,4 +53,4 @@ class AdamFusion : public PatternProcessPass { | |||
| }; | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_FUSION_H_ | |||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_FUSION_H_ | |||
| @@ -13,8 +13,8 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_WEIGHT_DECAY_FUSION_H_ | |||
| #define MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_WEIGHT_DECAY_FUSION_H_ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_WEIGHT_DECAY_FUSION_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_WEIGHT_DECAY_FUSION_H_ | |||
| #include <memory> | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| @@ -55,4 +55,4 @@ class AdamWeightDecayFusion : public PatternProcessPass { | |||
| }; | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_PRE_ACTIVATE_GPU_IR_FUSION_ADAM_WEIGHT_DECAY_FUSION_H_ | |||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_ADAM_WEIGHT_DECAY_FUSION_H_ | |||
| @@ -0,0 +1,65 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/optimizer/gpu/replace_addn_fusion.h" | |||
| #include <memory> | |||
| #include <vector> | |||
| #include <string> | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "ir/primitive.h" | |||
| #include "utils/utils.h" | |||
| #include "backend/optimizer/common/helper.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| const BaseRef ReplaceAddNFusion::DefinePattern() const { | |||
| VectorRef addn = VectorRef({prim::kPrimAddN, A, B}); | |||
| return addn; | |||
| } | |||
| const AnfNodePtr ReplaceAddNFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node, | |||
| const EquivPtr &equiv) const { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| MS_EXCEPTION_IF_NULL(equiv); | |||
| auto A = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0); | |||
| auto B = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 1); | |||
| MS_EXCEPTION_IF_NULL(A); | |||
| MS_EXCEPTION_IF_NULL(B); | |||
| int num_input = AnfAlgo::GetNodeAttr<int>(node, "n"); | |||
| if (num_input == 2) { | |||
| auto prim = std::make_shared<Primitive>(prim::kPrimTensorAdd->name()); | |||
| MS_EXCEPTION_IF_NULL(prim); | |||
| std::vector<AnfNodePtr> inputs = {NewValueNode(prim), A, B}; | |||
| auto add_new = graph->NewCNode(inputs); | |||
| std::vector<TypeId> outputs_type; | |||
| std::vector<std::vector<size_t>> outputs_shape; | |||
| outputs_type.push_back(AnfAlgo::GetOutputInferDataType(A, 0)); | |||
| outputs_shape.push_back(AnfAlgo::GetOutputInferShape(A, 0)); | |||
| AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, add_new.get()); | |||
| auto manager = graph->manager(); | |||
| MS_EXCEPTION_IF_NULL(manager); | |||
| manager->Replace(utils::cast<CNodePtr>(node), utils::cast<CNodePtr>(add_new)); | |||
| return add_new; | |||
| } else { | |||
| return nullptr; | |||
| } | |||
| } | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
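| Note: the pass only fires when AddN carries attr n == 2, in which case a two-input AddN is exactly a TensorAdd; the new node inherits its infer type and shape from input A. Schematically: | |||
| // before: out = AddN(A, B)      // attr n = 2 | |||
| // after:  out = TensorAdd(A, B) // same infer type/shape as A's output | |||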
| @@ -0,0 +1,40 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_ADDN_FUSION_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_ADDN_FUSION_H_ | |||
| #include <memory> | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| class ReplaceAddNFusion : public PatternProcessPass { | |||
| public: | |||
| explicit ReplaceAddNFusion(bool multigraph = true) : PatternProcessPass("replace_addn", multigraph) { | |||
| A = std::make_shared<Var>(); | |||
| B = std::make_shared<Var>(); | |||
| } | |||
| ~ReplaceAddNFusion() override = default; | |||
| const BaseRef DefinePattern() const override; | |||
| const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; | |||
| private: | |||
| VarPtr A; | |||
| VarPtr B; | |||
| }; | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_ADDN_FUSION_H_ | |||
| @@ -0,0 +1,92 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/optimizer/gpu/replace_bn_cast_fusion.h" | |||
| #include <memory> | |||
| #include <vector> | |||
| #include <string> | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "ir/primitive.h" | |||
| #include "utils/utils.h" | |||
| #include "backend/optimizer/common/helper.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| const BaseRef ReplaceBNCastFusion::DefinePattern() const { | |||
| VectorRef in_cast = VectorRef({prim::kPrimCast, x_}); | |||
| VectorRef fbn2 = VectorRef({prim::kPrimFusedBatchNorm, in_cast, scale_, bias_, mean_, var_}); | |||
| VectorRef tupleget = VectorRef({prim::kPrimTupleGetItem, fbn2, index_}); | |||
| VectorRef out_cast = VectorRef({prim::kPrimCast, tupleget}); | |||
| return out_cast; | |||
| } | |||
| const AnfNodePtr ReplaceBNCastFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node, | |||
| const EquivPtr &equiv) const { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| MS_EXCEPTION_IF_NULL(equiv); | |||
| auto tuple = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0); | |||
| auto index_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 1); | |||
| MS_EXCEPTION_IF_NULL(index_node); | |||
| auto value_node = index_node->cast<ValueNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(value_node); | |||
| int item_idx = GetValue<int>(value_node->value()); | |||
| auto fbn2 = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 0); | |||
| auto x_after = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 0); | |||
| auto x_before = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(x_after), 0); | |||
| if (item_idx != 0) { | |||
| return nullptr; | |||
| } | |||
| auto scale = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 1); | |||
| auto bias = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 2); | |||
| auto mean = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 3); | |||
| auto var = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 4); | |||
| MS_EXCEPTION_IF_NULL(fbn2); | |||
| MS_EXCEPTION_IF_NULL(x_after); | |||
| MS_EXCEPTION_IF_NULL(x_before); | |||
| MS_EXCEPTION_IF_NULL(scale); | |||
| MS_EXCEPTION_IF_NULL(bias); | |||
| MS_EXCEPTION_IF_NULL(mean); | |||
| MS_EXCEPTION_IF_NULL(var); | |||
| auto manager = graph->manager(); | |||
| MS_EXCEPTION_IF_NULL(manager); | |||
| manager->Replace(utils::cast<CNodePtr>(x_after), utils::cast<CNodePtr>(x_before)); | |||
| manager->Replace(utils::cast<CNodePtr>(node), utils::cast<CNodePtr>(tuple)); | |||
| std::vector<TypeId> outputs_type; | |||
| std::vector<std::vector<size_t>> outputs_shape; | |||
| auto output_num = AnfAlgo::GetOutputTensorNum(fbn2); | |||
| for (size_t i = 0; i < output_num; i++) { | |||
| outputs_type.push_back(AnfAlgo::GetOutputInferDataType(fbn2, i)); | |||
| outputs_shape.push_back(AnfAlgo::GetOutputInferShape(fbn2, i)); | |||
| } | |||
| outputs_type[0] = kNumberTypeFloat16; | |||
| AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fbn2.get()); | |||
| outputs_type.clear(); | |||
| outputs_shape.clear(); | |||
| outputs_type.push_back(kNumberTypeFloat16); | |||
| outputs_shape.push_back(AnfAlgo::GetOutputInferShape(tuple, 0)); | |||
| AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, tuple.get()); | |||
| return tuple; | |||
| } | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
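| Note: this fusion elides the fp16-to-fp32 Cast feeding FusedBatchNorm and the fp32-to-fp16 Cast on its first output, then retypes that output slot to float16; that is exactly the mixed-dtype signature the FusedBatchNorm GPU kernel registration earlier in this change accepts. Schematically (dtypes in brackets): | |||
| // before: y[f16] = Cast( TupleGetItem( FusedBatchNorm( Cast(x[f16])->[f32], scale, bias, mean, var ), 0 )->[f16] ) | |||
| // after:  y[f16] = TupleGetItem( FusedBatchNorm( x[f16], scale, bias, mean, var ), 0 ) | |||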
| @@ -0,0 +1,58 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_CAST_FUSION_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_CAST_FUSION_H_ | |||
| #include <memory> | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| class ReplaceBNCastFusion : public PatternProcessPass { | |||
| public: | |||
| explicit ReplaceBNCastFusion(bool multigraph = true) : PatternProcessPass("replace_bn_cast", multigraph) { | |||
| x_ = std::make_shared<Var>(); | |||
| scale_ = std::make_shared<Var>(); | |||
| bias_ = std::make_shared<Var>(); | |||
| mean_ = std::make_shared<Var>(); | |||
| var_ = std::make_shared<Var>(); | |||
| y_ = std::make_shared<Var>(); | |||
| running_mean_ = std::make_shared<Var>(); | |||
| running_var_ = std::make_shared<Var>(); | |||
| save_mean_ = std::make_shared<Var>(); | |||
| save_var_ = std::make_shared<Var>(); | |||
| index_ = std::make_shared<Var>(); | |||
| } | |||
| ~ReplaceBNCastFusion() override = default; | |||
| const BaseRef DefinePattern() const override; | |||
| const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; | |||
| private: | |||
| VarPtr x_; | |||
| VarPtr scale_; | |||
| VarPtr bias_; | |||
| VarPtr mean_; | |||
| VarPtr var_; | |||
| VarPtr y_; | |||
| VarPtr running_mean_; | |||
| VarPtr running_var_; | |||
| VarPtr save_mean_; | |||
| VarPtr save_var_; | |||
| VarPtr index_; | |||
| }; | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_CAST_FUSION_H_ | |||
| @@ -0,0 +1,88 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/optimizer/gpu/replace_bn_grad_cast2_fusion.h" | |||
| #include <memory> | |||
| #include <vector> | |||
| #include <string> | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "ir/primitive.h" | |||
| #include "utils/utils.h" | |||
| #include "backend/optimizer/common/helper.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| const BaseRef ReplaceBNGradCast2Fusion::DefinePattern() const { | |||
| VectorRef fbn2g = VectorRef({prim::kPrimFusedBatchNormGrad, dy_, x_, scale_, mean_, var_}); | |||
| VectorRef tupleget = VectorRef({prim::kPrimTupleGetItem, fbn2g, index_}); | |||
| VectorRef out_cast = VectorRef({prim::kPrimCast, tupleget}); | |||
| return out_cast; | |||
| } | |||
| const AnfNodePtr ReplaceBNGradCast2Fusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node, | |||
| const EquivPtr &equiv) const { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| MS_EXCEPTION_IF_NULL(equiv); | |||
| auto tuple = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0); | |||
| auto index_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 1); | |||
| MS_EXCEPTION_IF_NULL(index_node); | |||
| auto value_node = index_node->cast<ValueNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(value_node); | |||
| int item_idx = GetValue<int>(value_node->value()); | |||
| if (item_idx != 0) { | |||
| return nullptr; | |||
| } | |||
| auto fbn2g = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 0); | |||
| auto dy_ = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 0); | |||
| auto x_ = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 1); | |||
| auto scale = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 2); | |||
| auto mean = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 3); | |||
| auto var = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 4); | |||
| MS_EXCEPTION_IF_NULL(fbn2g); | |||
| MS_EXCEPTION_IF_NULL(dy_); | |||
| MS_EXCEPTION_IF_NULL(scale); | |||
| MS_EXCEPTION_IF_NULL(x_); | |||
| MS_EXCEPTION_IF_NULL(mean); | |||
| MS_EXCEPTION_IF_NULL(var); | |||
| auto manager = graph->manager(); | |||
| MS_EXCEPTION_IF_NULL(manager); | |||
| manager->Replace(utils::cast<CNodePtr>(node), utils::cast<CNodePtr>(tuple)); | |||
| std::vector<TypeId> outputs_type; | |||
| std::vector<std::vector<size_t>> outputs_shape; | |||
| auto output_num = AnfAlgo::GetOutputTensorNum(fbn2g); | |||
| for (size_t i = 0; i < output_num; i++) { | |||
| outputs_type.push_back(AnfAlgo::GetOutputInferDataType(fbn2g, i)); | |||
| outputs_shape.push_back(AnfAlgo::GetOutputInferShape(fbn2g, i)); | |||
| } | |||
| outputs_type[0] = AnfAlgo::GetPrevNodeOutputInferDataType(fbn2g, 0); | |||
| AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fbn2g.get()); | |||
| outputs_type.clear(); | |||
| outputs_shape.clear(); | |||
| outputs_type.push_back(AnfAlgo::GetPrevNodeOutputInferDataType(fbn2g, 0)); | |||
| outputs_shape.push_back(AnfAlgo::GetOutputInferShape(tuple, 0)); | |||
| AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, tuple.get()); | |||
| return tuple; | |||
| } | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,54 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_ | |||
| #include <memory> | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| class ReplaceBNGradCast2Fusion : public PatternProcessPass { | |||
| public: | |||
| explicit ReplaceBNGradCast2Fusion(bool multigraph = true) : PatternProcessPass("replace_grad_cast2", multigraph) { | |||
| dy_ = std::make_shared<Var>(); | |||
| x_ = std::make_shared<Var>(); | |||
| scale_ = std::make_shared<Var>(); | |||
| mean_ = std::make_shared<Var>(); | |||
| var_ = std::make_shared<Var>(); | |||
| dx_ = std::make_shared<Var>(); | |||
| bn_scale_ = std::make_shared<Var>(); | |||
| bn_bias_ = std::make_shared<Var>(); | |||
| index_ = std::make_shared<Var>(); | |||
| } | |||
| ~ReplaceBNGradCast2Fusion() override = default; | |||
| const BaseRef DefinePattern() const override; | |||
| const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; | |||
| private: | |||
| VarPtr dy_; | |||
| VarPtr x_; | |||
| VarPtr scale_; | |||
| VarPtr mean_; | |||
| VarPtr var_; | |||
| VarPtr dx_; | |||
| VarPtr bn_scale_; | |||
| VarPtr bn_bias_; | |||
| VarPtr index_; | |||
| }; | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_ | |||
| @@ -0,0 +1,91 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/optimizer/gpu/replace_bn_grad_cast_fusion.h" | |||
| #include <memory> | |||
| #include <vector> | |||
| #include <string> | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "ir/primitive.h" | |||
| #include "utils/utils.h" | |||
| #include "backend/optimizer/common/helper.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| const BaseRef ReplaceBNGradCastFusion::DefinePattern() const { | |||
| VectorRef dy_cast = VectorRef({prim::kPrimCast, dy_}); | |||
| VectorRef fbn2g = VectorRef({prim::kPrimFusedBatchNormGrad, dy_cast, x_, scale_, mean_, var_}); | |||
| VectorRef tupleget = VectorRef({prim::kPrimTupleGetItem, fbn2g, index_}); | |||
| VectorRef out_cast = VectorRef({prim::kPrimCast, tupleget}); | |||
| return out_cast; | |||
| } | |||
| const AnfNodePtr ReplaceBNGradCastFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node, | |||
| const EquivPtr &equiv) const { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| MS_EXCEPTION_IF_NULL(equiv); | |||
| auto tuple = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0); | |||
| auto index_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 1); | |||
| MS_EXCEPTION_IF_NULL(index_node); | |||
| auto value_node = index_node->cast<ValueNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(value_node); | |||
| int item_idx = GetValue<int>(value_node->value()); | |||
| if (item_idx != 0) { | |||
| return nullptr; | |||
| } | |||
| auto fbn2g = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 0); | |||
| auto dy_after = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 0); | |||
| auto dy_before = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(dy_after), 0); | |||
| auto x_ = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 1); | |||
| auto scale = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 2); | |||
| auto mean = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 3); | |||
| auto var = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 4); | |||
| MS_EXCEPTION_IF_NULL(fbn2g); | |||
| MS_EXCEPTION_IF_NULL(dy_after); | |||
| MS_EXCEPTION_IF_NULL(dy_before); | |||
| MS_EXCEPTION_IF_NULL(scale); | |||
| MS_EXCEPTION_IF_NULL(x_); | |||
| MS_EXCEPTION_IF_NULL(mean); | |||
| MS_EXCEPTION_IF_NULL(var); | |||
| auto manager = graph->manager(); | |||
| MS_EXCEPTION_IF_NULL(manager); | |||
| manager->Replace(utils::cast<CNodePtr>(dy_after), utils::cast<CNodePtr>(dy_before)); | |||
| manager->Replace(utils::cast<CNodePtr>(node), utils::cast<CNodePtr>(tuple)); | |||
| std::vector<TypeId> outputs_type; | |||
| std::vector<std::vector<size_t>> outputs_shape; | |||
| auto output_num = AnfAlgo::GetOutputTensorNum(fbn2g); | |||
| for (size_t i = 0; i < output_num; i++) { | |||
| outputs_type.push_back(AnfAlgo::GetOutputInferDataType(fbn2g, i)); | |||
| outputs_shape.push_back(AnfAlgo::GetOutputInferShape(fbn2g, i)); | |||
| } | |||
| outputs_type[0] = kNumberTypeFloat16; | |||
| AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fbn2g.get()); | |||
| outputs_type.clear(); | |||
| outputs_shape.clear(); | |||
| outputs_type.push_back(kNumberTypeFloat16); | |||
| outputs_shape.push_back(AnfAlgo::GetOutputInferShape(tuple, 0)); | |||
| AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, tuple.get()); | |||
| return tuple; | |||
| } | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,54 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST_FUSION_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST_FUSION_H_ | |||
| #include <memory> | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| class ReplaceBNGradCastFusion : public PatternProcessPass { | |||
| public: | |||
| explicit ReplaceBNGradCastFusion(bool multigraph = true) : PatternProcessPass("replace_bn_grad_cast", multigraph) { | |||
| dy_ = std::make_shared<Var>(); | |||
| x_ = std::make_shared<Var>(); | |||
| scale_ = std::make_shared<Var>(); | |||
| mean_ = std::make_shared<Var>(); | |||
| var_ = std::make_shared<Var>(); | |||
| dx_ = std::make_shared<Var>(); | |||
| bn_scale_ = std::make_shared<Var>(); | |||
| bn_bias_ = std::make_shared<Var>(); | |||
| index_ = std::make_shared<Var>(); | |||
| } | |||
| ~ReplaceBNGradCastFusion() override = default; | |||
| const BaseRef DefinePattern() const override; | |||
| const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; | |||
| private: | |||
| VarPtr dy_; | |||
| VarPtr x_; | |||
| VarPtr scale_; | |||
| VarPtr mean_; | |||
| VarPtr var_; | |||
| VarPtr dx_; | |||
| VarPtr bn_scale_; | |||
| VarPtr bn_bias_; | |||
| VarPtr index_; | |||
| }; | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST_FUSION_H_ | |||
| @@ -0,0 +1,63 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/optimizer/gpu/replace_momentum_cast_fusion.h" | |||
| #include <memory> | |||
| #include <vector> | |||
| #include <string> | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "ir/primitive.h" | |||
| #include "utils/utils.h" | |||
| #include "backend/optimizer/common/helper.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| const BaseRef ReplaceMomentumCastFusion::DefinePattern() const { | |||
| VectorRef grad_cast = VectorRef({prim::kPrimCast, grad_}); | |||
| VectorRef momentum = VectorRef({prim::kPrimApplyMomentum, var_, acc_, lr_, grad_cast, mom_}); | |||
| return momentum; | |||
| } | |||
| const AnfNodePtr ReplaceMomentumCastFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node, | |||
| const EquivPtr &equiv) const { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| MS_EXCEPTION_IF_NULL(equiv); | |||
| auto grad_cast = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 3); | |||
| auto grad = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(grad_cast), 0); | |||
| MS_EXCEPTION_IF_NULL(grad_cast); | |||
| MS_EXCEPTION_IF_NULL(grad); | |||
| auto manager = graph->manager(); | |||
| MS_EXCEPTION_IF_NULL(manager); | |||
| manager->Replace(utils::cast<CNodePtr>(grad_cast), utils::cast<CNodePtr>(grad)); | |||
| std::vector<TypeId> outputs_type; | |||
| std::vector<std::vector<size_t>> outputs_shape; | |||
| auto output_num = AnfAlgo::GetOutputTensorNum(node); | |||
| for (size_t i = 0; i < output_num; i++) { | |||
| outputs_type.push_back(AnfAlgo::GetOutputInferDataType(node, i)); | |||
| outputs_shape.push_back(AnfAlgo::GetOutputInferShape(node, i)); | |||
| } | |||
| outputs_type[3] = AnfAlgo::GetPrevNodeOutputInferDataType(grad_cast, 0); | |||
| AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, node.get()); | |||
| return node; | |||
| } | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
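| Note: here the Cast on ApplyMomentum's gradient input (input index 3) is removed and the matching dtype slot is retyped, which is only legal because of the <T, S, G> ApplyMomentum registrations earlier in this change that accept an fp16 gradient alongside fp32 parameters. Schematically: | |||
| // before: ApplyMomentum(var[f32], accum[f32], lr[f32], Cast(grad[f16])->[f32], mom[f32]) | |||
| // after:  ApplyMomentum(var[f32], accum[f32], lr[f32], grad[f16], mom[f32]) | |||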
| @@ -0,0 +1,46 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_MOMENTUM_CAST_FUSION_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_MOMENTUM_CAST_FUSION_H_ | |||
| #include <memory> | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| class ReplaceMomentumCastFusion : public PatternProcessPass { | |||
| public: | |||
| explicit ReplaceMomentumCastFusion(bool multigraph = true) : PatternProcessPass("replace_momentum_cast", multigraph) { | |||
| var_ = std::make_shared<Var>(); | |||
| acc_ = std::make_shared<Var>(); | |||
| lr_ = std::make_shared<Var>(); | |||
| grad_ = std::make_shared<Var>(); | |||
| mom_ = std::make_shared<Var>(); | |||
| } | |||
| ~ReplaceMomentumCastFusion() override = default; | |||
| const BaseRef DefinePattern() const override; | |||
| const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; | |||
| private: | |||
| VarPtr var_; | |||
| VarPtr acc_; | |||
| VarPtr lr_; | |||
| VarPtr grad_; | |||
| VarPtr mom_; | |||
| }; | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_MOMENTUM_CAST_FUSION_H_ | |||
| @@ -25,7 +25,8 @@ | |||
| namespace mindspore { | |||
| namespace memreuse { | |||
| enum RefCountType { kDynamicRefCount, kStaticRefCount }; | |||
| enum NodeType { NORMAL, SPECIAL }; | |||
| enum NodeType { COMMON_NODE, COMMUNICATION_NODE }; | |||
| enum KernelRefType { COMMON, REFNODE_OUTPUT, COMM_NOTREUSE, COMM_REUSE, SUMMARY }; | |||
| static constexpr int kInitIndex = -1; | |||
| class KernelRefCount { | |||
| public: | |||
| @@ -36,6 +37,7 @@ class KernelRefCount { | |||
| size_t offset_; | |||
| size_t size_; | |||
| int index_; | |||
| KernelRefType type_; | |||
| // remember to reset offset | |||
| KernelRefCount() | |||
| : stream_id_(0), | |||
| @@ -44,6 +46,7 @@ class KernelRefCount { | |||
| offset_(0), | |||
| size_(0), | |||
| index_(kInitIndex), | |||
| type_(COMMON), | |||
| reftype_(kStaticRefCount) {} | |||
| ~KernelRefCount() = default; | |||
| void SetKernelRefCountInfo(int index, size_t size, RefCountType reftype); | |||
| @@ -65,7 +68,7 @@ class KernelDef { | |||
| KernelMap inputs_; | |||
| KernelMap outputs_; | |||
| KernelMap wk_space_; | |||
| NodeType dirty = NORMAL; | |||
| NodeType type_ = COMMON_NODE; | |||
| KernelDef() = default; | |||
| ~KernelDef() = default; | |||
| void set_input_refs(const KernelRefCountPtrList &kernelRefPtrList) { input_refs_ = kernelRefPtrList; } | |||
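| Note: the old NORMAL/SPECIAL flag becomes an explicit node taxonomy plus a per-tensor KernelRefType that drives the allocator's reuse policy. As inferred from the hunks below (our summary, not source comments): | |||
| // COMMON         - ordinary tensor, standard best-fit reuse | |||
| // REFNODE_OUTPUT - output aliased through the ref-output map; skipped by the allocator | |||
| // COMM_REUSE     - communication tensor laid out with 512-byte alignment borders | |||
| // COMM_NOTREUSE  - input of a multi-input communication op, tracked but not border-reused | |||
| // SUMMARY        - summary node output, pinned alive by a max ref count | |||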
| @@ -46,6 +46,8 @@ bool MemReuseUtil::InitDynamicOutputKernelRef() { | |||
| if (iter == kernel_output_refs_.end()) { | |||
| auto output_sizes = kernel_mod->GetOutputSizeList(); | |||
| KernelRefCountPtrList kernel_refs; | |||
| bool is_comm_op = AnfAlgo::IsCommunicationOp(kernel_cnode); | |||
| size_t output_index = 0; | |||
| for (auto size : output_sizes) { | |||
| total_dy_size_ += size; | |||
| // do not MallocDynamicMem; just record it | |||
| @@ -54,9 +56,20 @@ bool MemReuseUtil::InitDynamicOutputKernelRef() { | |||
| auto curr_stream_id = AnfAlgo::GetStreamId(kernel_cnode); | |||
| kernel_ref->stream_id_ = curr_stream_id; | |||
| kernel_ref->SetKernelRefCountInfo(index, size, kDynamicRefCount); | |||
| if (is_comm_op) { | |||
| kernel_ref->type_ = COMM_REUSE; | |||
| } else { | |||
| session::AnfWithOutIndex out_pair(kernel_cnode, output_index); | |||
| if (graph_->IsInRefOutputMap(out_pair)) { | |||
| kernel_ref->type_ = REFNODE_OUTPUT; | |||
| } else { | |||
| kernel_ref->type_ = COMMON; | |||
| } | |||
| } | |||
| kernel_refs.push_back(kernel_ref); | |||
| kernel_out_ref_num++; | |||
| total_refs_list_.push_back(kernel_ref); | |||
| output_index++; | |||
| } | |||
| if (!kernel_refs.empty()) { | |||
| kernel_output_refs_[key] = kernel_refs; | |||
| @@ -155,9 +168,19 @@ void MemReuseUtil::SetInputMap(const CNodePtr &kernel, KernelDef *kernel_def_ptr | |||
| MS_EXCEPTION_IF_NULL(kernel); | |||
| MS_EXCEPTION_IF_NULL(kernel_def_ptr); | |||
| auto key = kernel.get(); | |||
| for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) { | |||
| bool is_comm_op = AnfAlgo::IsCommunicationOp(kernel); | |||
| size_t input_tensor_num = AnfAlgo::GetInputTensorNum(kernel); | |||
| for (size_t i = 0; i < input_tensor_num; ++i) { | |||
| auto ref_ptr = GetKernelInputRef(kernel, i); | |||
| if (ref_ptr != nullptr) { | |||
| if (is_comm_op) { | |||
| if (input_tensor_num == 1) { | |||
| ref_ptr->type_ = COMM_REUSE; | |||
| } else { | |||
| ref_ptr->type_ = COMM_NOTREUSE; | |||
| } | |||
| } | |||
| if (ref_ptr->reftype() == kStaticRefCount) { | |||
| continue; | |||
| } else if (ref_ptr->reftype() == kDynamicRefCount) { | |||
| @@ -258,6 +281,11 @@ void MemReuseUtil::SetKernelDefMap() { | |||
| auto key = kernel.get(); | |||
| kernel_def_ptr->set_input_refs(kernel_def_ptr->inputs_[key]); | |||
| kernel_def_ptr->set_output_refs(kernel_def_ptr->outputs_[key]); | |||
| if (AnfAlgo::IsCommunicationOp(kernel)) { | |||
| kernel_def_ptr->type_ = COMMUNICATION_NODE; | |||
| } else { | |||
| kernel_def_ptr->type_ = COMMON_NODE; | |||
| } | |||
| kernel_def_ptr_list_.push_back(kernel_def_ptr); | |||
| kernel_map_[key] = kernel_def_ptr; | |||
| } | |||
| @@ -337,6 +365,7 @@ void MemReuseUtil::SetSummaryNodesRefCount() { | |||
| KernelRefCountPtr kernel_ref = kernel_output_refs_[node.get()][index]; | |||
| kernel_ref->ref_count_ = kMaxRefCount; | |||
| kernel_ref->ref_count_dynamic_use_ = kMaxRefCount; | |||
| kernel_ref->type_ = SUMMARY; | |||
| total_summary_size += kernel_ref->size_; | |||
| MS_LOG(INFO) << "Set summary node's ref count, node: " << node->fullname_with_scope() << " index: " << index; | |||
| } else { | |||
| @@ -83,6 +83,7 @@ class MemReuseUtil { | |||
| void set_mem_base(uint8_t *mem_base) { mem_base_ = mem_base; } | |||
| uint8_t *GetNodeOutputPtr(const AnfNodePtr &node, size_t index) const; | |||
| uint8_t *GetNodeWorkSpacePtr(const AnfNodePtr &node, size_t index) const; | |||
| bool is_all_nop_node() const { return is_all_nop_node_; } | |||
| private: | |||
| int util_index_; | |||
| @@ -33,11 +33,11 @@ void BestFitMemReuse::InitMemReuseInfo(const MemReuseUtil *mem_reuse_util_ptr) { | |||
| set_op_ptr_list(mem_reuse_util_ptr->kernel_def_ptr_list()); | |||
| // check info correctness | |||
| for (auto &tensor : tensor_ptr_list_) { | |||
| tensor->size_ = AlignMemorySize(tensor->size_); | |||
| tensor->size_ = AlignCommonMemorySize(tensor->size_); | |||
| } | |||
| // align workspace sizes to 512 and force refcount == 1 | |||
| for (auto &wk : wk_tensor_list_) { | |||
| wk->size_ = AlignMemorySize(wk->size_); | |||
| wk->size_ = AlignCommonMemorySize(wk->size_); | |||
| wk->ref_count_ = 1; | |||
| } | |||
| #ifdef ENABLE_D | |||
| @@ -135,11 +135,23 @@ bool BestFitMemReuse::IsUsable(const KernelDefPtr &kernel_curr, const MembufPtr | |||
| return false; | |||
| } | |||
| void BestFitMemReuse::AssignNodeOutputOffset() { | |||
| void BestFitMemReuse::AssignCommonNodeOutputOffset() { | |||
| MS_EXCEPTION_IF_NULL(current_kernel_); | |||
| for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) { | |||
| size_t index = GetTensorIndex(tensor_idx); | |||
| auto tensor_desc = tensor_ptr_list_[index]; | |||
| MS_EXCEPTION_IF_NULL(tensor_desc); | |||
| if (tensor_desc->type_ == REFNODE_OUTPUT) { | |||
| total_refoutput_size += tensor_desc->size_; | |||
| continue; | |||
| } else if (tensor_desc->type_ == COMM_NOTREUSE) { | |||
| total_comm_not_reuse_size += tensor_desc->size_; | |||
| } else if (tensor_desc->type_ == COMM_REUSE) { | |||
| // get align size for communication op's single input | |||
| tensor_desc->size_ = AlignCommunicationMemorySize(tensor_desc->size_); | |||
| total_comm_reuse_size += tensor_desc->size_; | |||
| } | |||
| auto reusable_membuf_map = GetReusableMembufMap(tensor_desc->size_); | |||
| if (!reusable_membuf_map.empty()) { | |||
| auto membuf_index = reusable_membuf_map.begin()->second; | |||
| @@ -152,6 +164,93 @@ void BestFitMemReuse::AssignNodeOutputOffset() { | |||
| MemReuseChecker::GetInstance().IsAddNewMembuf_ = true; | |||
| #endif | |||
| } | |||
| // skip the left alignment border so the communication op's single input starts at usable memory | |||
| if (tensor_desc->type_ == COMM_REUSE) { | |||
| tensor_desc->offset_ += kDefaultMemAlignSize; | |||
| } | |||
| } | |||
| } | |||
| void BestFitMemReuse::AssignCommunicationNodeOutputOffset() { | |||
| size_t total_kernel_output_size = 0; | |||
| size_t output_num = 0; | |||
| // get all output size | |||
| MS_EXCEPTION_IF_NULL(current_kernel_); | |||
| for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) { | |||
| size_t index = GetTensorIndex(tensor_idx); | |||
| auto tensor_desc = tensor_ptr_list_[index]; | |||
| MS_EXCEPTION_IF_NULL(tensor_desc); | |||
| output_num++;  // count outputs here; the first/last border logic below relies on it | |||
| if (tensor_desc->type_ == COMM_REUSE) { | |||
| total_comm_reuse_size += tensor_desc->size_; | |||
| total_comm_output_reuse_size += tensor_desc->size_; | |||
| total_kernel_output_size += tensor_desc->size_; | |||
| } else { | |||
| MS_LOG(ERROR) << "All communication op's outputs should be memory reuse, Kernel:" | |||
| << current_kernel_->scope_full_name(); | |||
| continue; | |||
| } | |||
| } | |||
| total_kernel_output_size = AlignCommunicationMemorySize(total_kernel_output_size); | |||
| // add a left align border to the first output and a right align border to the last output so the border memory gets allocated | |||
| size_t output_index = 0; | |||
| auto output_ref_indexes = current_kernel_->GetOutputRefIndexs(); | |||
| for (auto &tensor_idx : output_ref_indexes) { | |||
| size_t index = GetTensorIndex(tensor_idx); | |||
| auto tensor_desc = tensor_ptr_list_[index]; | |||
| MS_EXCEPTION_IF_NULL(tensor_desc); | |||
| if (output_index == 0 || output_index == output_num - 1) { | |||
| tensor_desc->size_ += kDefaultMemAlignSize; | |||
| } | |||
| if ((output_index == 0) && (output_ref_indexes.size() == 1)) { | |||
| // add right align border for single output | |||
| tensor_desc->size_ += kDefaultMemAlignSize; | |||
| } | |||
| output_index++; | |||
| } | |||
| auto reusable_membuf_map = GetReusableMembufMap(total_kernel_output_size); | |||
| if (!reusable_membuf_map.empty()) { | |||
| auto membuf_index = reusable_membuf_map.begin()->second; | |||
| output_index = 0; | |||
| for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) { | |||
| size_t index = GetTensorIndex(tensor_idx); | |||
| auto tensor_desc = tensor_ptr_list_[index]; | |||
| MS_EXCEPTION_IF_NULL(tensor_desc); | |||
| ReuseExistMembuf(tensor_desc.get(), membuf_index + output_index, kDynamicMem); | |||
| // skip the left align border so the communication op's first output can be used | |||
| if (output_index == 0) { | |||
| tensor_desc->offset_ += kDefaultMemAlignSize; | |||
| } | |||
| output_index++; | |||
| } | |||
| } else { | |||
| // no membuf can be reused; append a new membuf to membuf_ptr_list | |||
| output_index = 0; | |||
| for (auto &tensor_idx : current_kernel_->GetOutputRefIndexs()) { | |||
| size_t index = GetTensorIndex(tensor_idx); | |||
| auto tensor_desc = tensor_ptr_list_[index]; | |||
| MS_EXCEPTION_IF_NULL(tensor_desc); | |||
| AddNewMembufPtr(tensor_desc.get(), kDynamicMem); | |||
| // skip the left align border offset so the first output can be used | |||
| if (output_index == 0) { | |||
| tensor_desc->offset_ += kDefaultMemAlignSize; | |||
| } | |||
| output_index++; | |||
| #ifdef MEM_REUSE_DEBUG | |||
| MemReuseChecker::GetInstance().IsAddNewMembuf_ = true; | |||
| #endif | |||
| } | |||
| } | |||
| } | |||
| void BestFitMemReuse::AssignNodeOutputOffset() { | |||
| if (current_kernel_->type_ == COMMUNICATION_NODE) { | |||
| AssignCommunicationNodeOutputOffset(); | |||
| } else { | |||
| AssignCommonNodeOutputOffset(); | |||
| } | |||
| } | |||
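AssignNodeOutputOffset now dispatches on the kernel type: a communication kernel gets all of its outputs packed into one contiguous block with a 512-byte align border on each side, while common kernels keep per-tensor placement. A minimal standalone sketch of the resulting layout (constants and sizes are assumed for illustration, not taken from this diff):

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

// Assumed to be 512, mirroring kDefaultMemAlignSize in the diff.
constexpr std::size_t kAlign = 512;

std::size_t AlignUp(std::size_t size) { return (size + kAlign - 1) / kAlign * kAlign; }

int main() {
  // Toy output sizes of one fused communication kernel.
  const std::vector<std::size_t> outputs = {1000, 300, 2048};
  std::size_t total = 0;
  for (std::size_t s : outputs) total += s;
  // One contiguous block: left border + aligned payload + right border.
  const std::size_t block = kAlign + AlignUp(total) + kAlign;
  std::printf("block size: %zu bytes\n", block);
  // The first output starts just past the left border; the rest follow back to back.
  std::size_t offset = kAlign;
  for (std::size_t i = 0; i < outputs.size(); ++i) {
    std::printf("output %zu at offset %zu\n", i, offset);
    offset += outputs[i];
  }
  return 0;
}
```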
| @@ -319,11 +418,17 @@ void BestFitMemReuse::ReleaseMembuf(size_t tensor_index, int flag) { | |||
| } | |||
| } | |||
| size_t BestFitMemReuse::AlignMemorySize(size_t size) const { | |||
| size_t BestFitMemReuse::AlignCommonMemorySize(size_t size) const { | |||
| // align memory size to 512 bytes | |||
| return (size + kDefaultMemAlignSize + kAttAlignSize) / kDefaultMemAlignSize * kDefaultMemAlignSize; | |||
| } | |||
| size_t BestFitMemReuse::AlignCommunicationMemorySize(size_t size) const { | |||
| // align memory size to 512 bytes and add the communication borders: left align border + data + right align border | |||
| return kDefaultMemAlignSize + (size + kDefaultMemAlignSize - 1) / kDefaultMemAlignSize * kDefaultMemAlignSize + | |||
| kDefaultMemAlignSize; | |||
| } | |||
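The two helpers differ in shape: the common variant over-allocates by a full alignment unit plus kAttAlignSize before rounding, while the communication variant rounds the payload up and then reserves one border on each side. A worked example, assuming both constants are 512 (their definitions sit outside this diff):

```cpp
#include <cassert>
#include <cstddef>

// Assumed values; the real definitions are not part of this diff.
constexpr std::size_t kDefaultMemAlignSize = 512;
constexpr std::size_t kAttAlignSize = 512;

std::size_t AlignCommonMemorySize(std::size_t size) {
  return (size + kDefaultMemAlignSize + kAttAlignSize) / kDefaultMemAlignSize * kDefaultMemAlignSize;
}

std::size_t AlignCommunicationMemorySize(std::size_t size) {
  return kDefaultMemAlignSize +
         (size + kDefaultMemAlignSize - 1) / kDefaultMemAlignSize * kDefaultMemAlignSize +
         kDefaultMemAlignSize;
}

int main() {
  assert(AlignCommonMemorySize(100) == 1024);         // (100 + 512 + 512) / 512 * 512
  assert(AlignCommunicationMemorySize(100) == 1536);  // 512 border + 512 payload + 512 border
  assert(AlignCommunicationMemorySize(513) == 2048);  // payload rounds up to 1024
  return 0;
}
```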
| size_t BestFitMemReuse::GetAllocatedSize() { | |||
| size_t AllocatedSize = kTotalSize; | |||
| if (membuf_ptr_list_.empty()) { | |||
| @@ -412,6 +517,9 @@ void BestFitMemReuse::Reuse(const MemReuseUtil *mem_reuse_util_ptr) { | |||
| ++op_num; | |||
| #endif | |||
| } | |||
| MS_LOG(INFO) << "Special Tensor total size: RefOutput: " << total_refoutput_size | |||
| << " CommReuse: " << total_comm_reuse_size << " CommOutputReuse: " << total_comm_output_reuse_size | |||
| << " CommNotReuse: " << total_comm_not_reuse_size; | |||
| #ifdef MEM_REUSE_DEBUG | |||
| MemReuseChecker::GetInstance().ExportMembufInfoIR(); | |||
| MemReuseChecker::GetInstance().ExportAddNewMmebufIR(); | |||
| @@ -74,6 +74,14 @@ class BestFitMemReuse { | |||
| * Assign output tensor memory offset of current kernel | |||
| */ | |||
| void AssignNodeOutputOffset(); | |||
| /** | |||
| * Assign output tensor memory offset of common kernel | |||
| */ | |||
| void AssignCommonNodeOutputOffset(); | |||
| /** | |||
| * Assign output tensor memory offset of communication kernel | |||
| */ | |||
| void AssignCommunicationNodeOutputOffset(); | |||
| /** | |||
| * Update input tensor's status of current kernel, and the status of membuf used by current kernel | |||
| */ | |||
| @@ -110,8 +118,10 @@ class BestFitMemReuse { | |||
| void AddNewMembufPtr(KernelRefCount *tensor_desc, int flag); | |||
| // Merge unused membuf | |||
| void ReleaseMembuf(size_t tensor_index, int flag); | |||
| // Memory address alignment 512 | |||
| size_t AlignMemorySize(size_t size) const; | |||
| // Memory address alignment for common memory | |||
| size_t AlignCommonMemorySize(size_t size) const; | |||
| // Memory address alignment for communication used memory | |||
| size_t AlignCommunicationMemorySize(size_t size) const; | |||
| int GetRealIndex(size_t index, int flag = kDynamicMem) const; | |||
| size_t GetTensorIndex(int index) const; | |||
| size_t GetWorkspaceIndex(int index) const; | |||
| @@ -153,6 +163,10 @@ class BestFitMemReuse { | |||
| // kernel_front_map_, key: the kernel_def, value: kernels before this kernel_def | |||
| std::map<KernelDefPtr, std::set<KernelDefPtr>> kernel_front_map_; | |||
| std::vector<std::vector<uint32_t>> stream_groups_; | |||
| size_t total_refoutput_size{0}; | |||
| size_t total_comm_reuse_size{0}; | |||
| size_t total_comm_output_reuse_size{0}; | |||
| size_t total_comm_not_reuse_size{0}; | |||
| }; | |||
| } // namespace memreuse | |||
| } // namespace mindspore | |||
| @@ -170,12 +170,14 @@ void MemReuseChecker::CheckMemReuseIR(const KernelRefCountPtrList &total_refs_li | |||
| ofs << "all_tensor_refs:\n"; | |||
| ofs << "index:" | |||
| << "\tsize:" | |||
| << "\trefcount:\n"; | |||
| << "\trefcount:" | |||
| << "\ttype:\n"; | |||
| for (auto &ref : total_refs_list) { | |||
| ofs << "%" << ref->index_ << "T" | |||
| << "\t" | |||
| << "#" << ref->size_ << "S" | |||
| << "\t" << ref->ref_count_ << "C" | |||
| << "\t" << ref->type_ << "t" | |||
| << "\n"; | |||
| } | |||
| ofs << "kernel_def exc_order:\n"; | |||
| @@ -241,7 +243,7 @@ bool MemReuseChecker::CheckGraphOutputAssigned(const session::KernelGraph *graph | |||
| void MemReuseChecker::ExportMemOpIr(const KernelDef *def, std::ofstream &ofs, int def_idx) { | |||
| auto scope_name = def->scope_full_name(); | |||
| std::string split_name = GetSplitName(scope_name); | |||
| ofs << "$" << def_idx << "\t" << split_name << "\t"; | |||
| ofs << "$" << def_idx << "\t" << split_name << "\t" << static_cast<int>(def->type_) << "\t"; | |||
| ofs << "inputs["; | |||
| for (auto &in : def->inputs_) { | |||
| for (auto &in_ref : in.second) { | |||
| @@ -100,7 +100,10 @@ bool CommunicationOpFusion::GetSplitSegments(const CommunicationOpInfo &communic | |||
| auto parallel_context = parallel::ParallelContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(parallel_context); | |||
| const auto &split_indices = parallel_context->GetAllReduceFusionSplitIndices(group); | |||
| std::vector<uint32_t> split_indices; | |||
| if (!parallel_context->enable_parallel_optimizer()) { | |||
| split_indices = parallel_context->GetAllReduceFusionSplitIndices(group); | |||
| } | |||
| size_t segments = 0; | |||
| if (split_indices.size() != 0) { | |||
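With the parallel optimizer enabled, split_indices stays empty and the communication ops collapse into a single fused segment; otherwise the user-provided indices partition them. A toy sketch of that mapping (CountSegments is a hypothetical helper, not code from this diff):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical helper: count the fusion segments produced by sorted split
// indices over n communication ops; an empty list fuses everything into one.
std::size_t CountSegments(const std::vector<std::uint32_t> &split_indices, std::size_t n) {
  if (split_indices.empty()) return n == 0 ? 0 : 1;
  std::size_t segments = split_indices.size();
  // Ops left over after the last split index form one more segment.
  if (split_indices.back() + 1 < n) ++segments;
  return segments;
}

int main() {
  std::printf("%zu\n", CountSegments({9, 19}, 30));  // ops 0-9, 10-19, 20-29 -> 3
  std::printf("%zu\n", CountSegments({}, 30));       // parallel optimizer on -> 1
  return 0;
}
```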
| @@ -71,7 +71,6 @@ bool ReplaceNodeByProxy::Run(const FuncGraphPtr &func_graph) { | |||
| AbstractBasePtrList abstract_list; | |||
| AnfAlgo::CopyNodeAttr(kAttrPsKey, cnode, proxy_node); | |||
| AnfAlgo::CopyNodeAttr("reduce_scatter_flag", cnode, proxy_node); | |||
| AnfAlgo::CopyNodeAttr("offset", cnode, proxy_node); | |||
| abstract_list.push_back(cnode->abstract()); | |||
| auto abstract_tuple = std::make_shared<abstract::AbstractTuple>(abstract_list); | |||
| @@ -18,9 +18,12 @@ | |||
| #include <utility> | |||
| #include <memory> | |||
| #include <algorithm> | |||
| #include <string> | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "utils/union_find_set.h" | |||
| #include "runtime/device/ascend/ascend_label_assign.h" | |||
| #include "utils/context/ms_context.h" | |||
| #include "debug/anf_ir_dump.h" | |||
| static constexpr size_t kCNodePrim = 0; | |||
| static constexpr size_t kCNodeCallArg = 1; | |||
| @@ -104,7 +107,7 @@ static void ReuseParameter(NotNull<KernelGraphPtr> root_kg, | |||
| static CNodePtr GetNextRealKernel(const std::vector<CNodePtr> &list, size_t start) { | |||
| for (size_t i = start; i < list.size() - 1; ++i) { | |||
| if (!IsPrimitiveCNode(list[i], prim::kPrimPartial) && AnfAlgo::IsRealKernel(list[i])) { | |||
| if (AnfAlgo::IsRealKernel(list[i])) { | |||
| return list[i]; | |||
| } | |||
| } | |||
| @@ -168,18 +171,43 @@ static void EraseNodeFromExecOrder(const AnfNodePtr &node, const NotNull<std::ve | |||
| exec_order->erase(exec_iter); | |||
| } | |||
| void AscendControlParser::AttachChildGraphToReturnNode(NotNull<KernelGraphPtr> graph, | |||
| const NotNull<std::set<KernelGraphPtr> *> memo) { | |||
| if (memo->find(graph) != memo->end()) { | |||
| return; | |||
| } | |||
| memo->insert(graph.get()); | |||
| const std::vector<std::shared_ptr<KernelGraph>> &child_graph_order = graph->child_graph_order(); | |||
| if (child_graph_order.empty()) { | |||
| return; | |||
| } | |||
| std::vector<AnfNodePtr> depend_inputs = {NewValueNode(std::make_shared<Primitive>(prim::kPrimPartial->name()))}; | |||
| for (auto &cg : child_graph_order) { | |||
| MS_EXCEPTION_IF_NULL(cg); | |||
| auto fg = cg->cast<FuncGraphPtr>(); | |||
| MS_EXCEPTION_IF_NULL(fg); | |||
| depend_inputs.emplace_back(NewValueNode(fg)); | |||
| AttachChildGraphToReturnNode(NOT_NULL(cg), memo); | |||
| } | |||
| auto child_graphs = graph->NewCNode(depend_inputs); | |||
| InsertDependToGraph(graph, NOT_NULL(child_graphs)); | |||
| } | |||
| void AscendControlParser::LinkGraph(NotNull<KernelGraphPtr> kg) { | |||
| std::set<KernelGraphPtr> memo; | |||
| std::vector<std::pair<AnfNodePtr, AnfNodePtr>> link_list; | |||
| // Insert Assign | |||
| ChildGraphDataAssign(kg, NOT_NULL(&link_list), NOT_NULL(&memo)); | |||
| memo.clear(); | |||
| // Reuse Parameter | |||
| ReuseParameter(kg, link_list); | |||
| // replace call by label goto / label switch | |||
| memo.clear(); | |||
| (void)ProcessKernelGraph(kg, nullptr, nullptr, NOT_NULL(&memo)); | |||
| memo.clear(); | |||
| // assign label resource | |||
| device::ascend::AscendLabelAssign::GetInstance().AssignLabel(kg); | |||
| AttachChildGraphToReturnNode(kg, NOT_NULL(&memo)); | |||
| } | |||
| void AscendControlParser::EraseParameter(NotNull<KernelGraphPtr> root_graph, | |||
| @@ -248,10 +276,14 @@ void AscendControlParser::EraseParameter(NotNull<KernelGraphPtr> root_graph, | |||
| } | |||
| MS_LOG(INFO) << "Erase " << assign_node->DebugString(5); | |||
| EraseNodeFromExecOrder(assign_node, NOT_NULL(&exec_order)); | |||
| auto source = AnfAlgo::VisitKernelWithReturnType(assign_node->input(kCNodeAssignSource), 0).first; | |||
| parameter_count.AddReadCount(source, -1); | |||
| auto source = assign_node->input(kCNodeAssignSource); | |||
| MS_EXCEPTION_IF_NULL(source); | |||
| auto visit_source = AnfAlgo::VisitKernelWithReturnType(source, 0).first; | |||
| parameter_count.AddWriteCount(para, -1); | |||
| parameter_count.AddReadCount(para, -1); | |||
| if (visit_source->isa<Parameter>()) { | |||
| parameter_count.AddReadCount(visit_source, read - 1); | |||
| } | |||
| for (auto &node : all_nodes) { | |||
| for (size_t i = 0; i < node->size(); ++i) { | |||
| if (node->input(i) == para) { | |||
| @@ -260,8 +292,6 @@ void AscendControlParser::EraseParameter(NotNull<KernelGraphPtr> root_graph, | |||
| } | |||
| } | |||
| } | |||
| parameter_count.AddReadCount(source, 1); | |||
| parameter_count.AddReadCount(para, -1); | |||
| } | |||
| root_graph->set_execution_order(exec_order); | |||
| } | |||
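The reworked bookkeeping erases an Assign(para, source) node and rewires para's readers to the source: para loses its one write and one read, and the source gains read - 1 (the rewired readers, minus the Assign's own read of source that disappears with it). A toy counter mirroring the net adjustments applied in the diff, with illustrative names:

```cpp
#include <map>
#include <string>

// Stand-in for the parameter read/write counters used in EraseParameter.
struct ParameterCount {
  std::map<std::string, int> reads, writes;
  void AddReadCount(const std::string &n, int d) { reads[n] += d; }
  void AddWriteCount(const std::string &n, int d) { writes[n] += d; }
};

int main() {
  ParameterCount pc;
  const int read = 3;            // para is read 3 times before the rewrite
  pc.AddWriteCount("para", 1);   // written once, by the Assign
  pc.AddReadCount("para", read);
  pc.AddReadCount("source", 1);  // the Assign itself reads source once
  // Erase Assign(para, source): net adjustments as in the diff.
  pc.AddWriteCount("para", -1);
  pc.AddReadCount("para", -1);
  pc.AddReadCount("source", read - 1);  // rewired readers (+3) minus the erased read (-1)
  return 0;
}
```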
| @@ -318,6 +348,17 @@ void AscendControlParser::ExecutorValidate(NotNull<KernelGraphPtr> root_graph) { | |||
| (void)RecurseGraph(root_graph, NOT_NULL(&memo)); | |||
| EraseParameter(root_graph, memo); | |||
| EraseLabel(root_graph); | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||
| auto save_graphs_path = context_ptr->save_graphs_path(); | |||
| if (save_graphs_path.empty()) { | |||
| save_graphs_path = "."; | |||
| } | |||
| if (context_ptr->save_graphs_flag()) { | |||
| std::string file_path = save_graphs_path + "/after_erase_label_and_parameter.ir"; | |||
| DumpIR(file_path, root_graph.get()); | |||
| } | |||
| } | |||
| std::vector<std::pair<KernelGraphPtr, std::vector<AnfNodePtr>>> AscendControlParser::ParseCallNode( | |||
| @@ -66,7 +66,8 @@ class AscendControlParser { | |||
| static AnfNodePtr InsertAssignToGraph(NotNull<KernelGraphPtr> kg, NotNull<AnfNodePtr> from, NotNull<AnfNodePtr> to); | |||
| static std::vector<std::pair<KernelGraphPtr, std::vector<AnfNodePtr>>> ParseCallNode(NotNull<CNodePtr> call_node); | |||
| static std::tuple<KernelGraphPtr, std::vector<AnfNodePtr>> ParsePartial(NotNull<AnfNodePtr> node); | |||
| static void AttachChildGraphToReturnNode(NotNull<KernelGraphPtr> graph, | |||
| const NotNull<std::set<KernelGraphPtr> *> memo); | |||
| // root graph order | |||
| static bool CheckLabelIndex(uint32_t order_index, uint32_t label_index, const CNodePtr &cnode, | |||
| NotNull<KernelGraphPtr> graph); | |||
| @@ -353,6 +353,10 @@ GraphId AscendSession::CompileGraph(NotNull<FuncGraphPtr> func_graph) { | |||
| RootGraphExecutorValidate(NOT_NULL(root_graph)); | |||
| // adjust kernel | |||
| AdjustKernel(root_graph); | |||
| #if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES) | |||
| // Assign parameter keys. | |||
| AssignParamKey(root_graph); | |||
| #endif | |||
| // assign stream | |||
| AssignStream(NOT_NULL(root_graph)); | |||
| // insert profiling point | |||
| @@ -511,6 +515,12 @@ void AscendSession::RunGraph(const GraphId &graph_id, const std::vector<tensor:: | |||
| } | |||
| // load input data from user input | |||
| LoadInputData(kernel_graph, inputs); | |||
| #if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES) | |||
| // Initialize parameter server | |||
| if (!ps_init_) { | |||
| InitPSParamAndOptim(kernel_graph, inputs); | |||
| } | |||
| #endif | |||
| // convert inputs to model | |||
| predictmodel::StepConvertWeight(inputs); | |||
| { | |||
| @@ -16,6 +16,7 @@ | |||
| #include "backend/session/cpu_session.h" | |||
| #include <algorithm> | |||
| #include <sstream> | |||
| #include "ir/tensor.h" | |||
| #include "ir/anf.h" | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| @@ -25,9 +26,15 @@ | |||
| #include "predict/predict.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| #include "runtime/device/cpu/kernel_select_cpu.h" | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| #include "backend/optimizer/common/pass_manager.h" | |||
| #include "backend/optimizer/pass/replace_node_by_proxy.h" | |||
| #ifdef ENABLE_DEBUGGER | |||
| #include "debug/debugger/debugger.h" | |||
| #endif | |||
| #if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES) | |||
| #include "frontend/parallel/ps/util.h" | |||
| #endif | |||
| namespace mindspore { | |||
| namespace session { | |||
| @@ -49,12 +56,29 @@ ParameterPtr CPUSession::CreateNewParameterFromParameter(const AnfNodePtr &anf, | |||
| return new_parameter; | |||
| } | |||
| void CPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) { | |||
| auto optimizer = std::make_shared<opt::GraphOptimizer>(); | |||
| auto pm = std::make_shared<opt::PassManager>(); | |||
| std::string pass_name = "replace_node_by_proxy"; | |||
| pass_name.append(std::to_string(graph_sum_)); | |||
| pm->AddPass(std::make_shared<opt::ReplaceNodeByProxy>(pass_name)); | |||
| optimizer->AddPassManager(pm); | |||
| (void)optimizer->Optimize(kernel_graph); | |||
| kernel_graph->SetExecOrderByDefault(); | |||
| } | |||
| GraphId CPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) { | |||
| auto graph_id = graph_sum_; | |||
| auto graph = ConstructKernelGraph(lst, outputs); | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_LOG(INFO) << "Set kernel info"; | |||
| SetKernelInfo(graph.get()); | |||
| #if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES) | |||
| AssignParamKey(graph); | |||
| if (parallel::ps::Util::IsRoleOfWorker()) { | |||
| Optimize(graph); | |||
| } | |||
| #endif | |||
| predictmodel::StepConvertGraph(graph); | |||
| MS_LOG(INFO) << "Build kernel"; | |||
| BuildKernel(graph.get()); | |||
| @@ -66,6 +90,12 @@ GraphId CPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList | |||
| void CPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) { | |||
| auto &kernel_graph = graphs_[graph_id]; | |||
| MS_EXCEPTION_IF_NULL(kernel_graph); | |||
| #if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES) | |||
| // Initialize parameter server | |||
| if (!ps_init_) { | |||
| InitPSParamAndOptim(kernel_graph, inputs); | |||
| } | |||
| #endif | |||
| MS_LOG(INFO) << "Bind input output address"; | |||
| std::vector<tensor::TensorPtr> need_sync_outputs; | |||
| runtime_.BindInputOutput(kernel_graph.get(), inputs, outputs, &need_sync_outputs); | |||
| @@ -119,6 +149,48 @@ void CPUSession::SetKernelInfo(const KernelGraph *kernel_graph) { | |||
| } | |||
| } | |||
| namespace { | |||
| void KernelNotSupportException(const AnfNodePtr &kernel_node) { | |||
| std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node); | |||
| std::stringstream operator_info; | |||
| operator_info << "Operator[" << kernel_name << "] "; | |||
| auto kernel_info = dynamic_cast<device::KernelInfo *>(kernel_node->kernel_info()); | |||
| if (kernel_info == nullptr) { | |||
| operator_info << "is not support."; | |||
| MS_LOG(EXCEPTION) << operator_info.str(); | |||
| } | |||
| auto kernel_build_Info = kernel_info->select_kernel_build_info(); | |||
| if (kernel_build_Info == nullptr) { | |||
| operator_info << "is not support."; | |||
| MS_LOG(EXCEPTION) << operator_info.str(); | |||
| } | |||
| size_t input_num = kernel_build_Info->GetInputNum(); | |||
| if (input_num > 0) { | |||
| operator_info << " input("; | |||
| for (size_t i = 0; i < input_num; ++i) { | |||
| operator_info << TypeIdLabel(kernel_build_Info->GetInputDeviceType(i)); | |||
| if (i != input_num - 1) { | |||
| operator_info << ","; | |||
| } | |||
| } | |||
| operator_info << ") "; | |||
| } | |||
| size_t output_num = kernel_build_Info->GetOutputNum(); | |||
| if (output_num > 0) { | |||
| operator_info << "output("; | |||
| for (size_t i = 0; i < output_num; ++i) { | |||
| operator_info << TypeIdLabel(kernel_build_Info->GetOutputDeviceType(i)); | |||
| if (i != output_num - 1) { | |||
| operator_info << ","; | |||
| } | |||
| } | |||
| operator_info << ") "; | |||
| } | |||
| operator_info << "is not support."; | |||
| MS_LOG(EXCEPTION) << operator_info.str(); | |||
| } | |||
| } // namespace | |||
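With this helper in place, an unsupported CPU kernel now reports the device types of the kernel build info that was selected, producing a message of the form `Operator[Conv2D] input(float32,float32) output(float32) is not supported.` (operator and type names here are hypothetical), instead of the bare operator name the old exception carried.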
| void CPUSession::BuildKernel(const KernelGraph *kernel_graph) { | |||
| MS_EXCEPTION_IF_NULL(kernel_graph); | |||
| auto &kernel_nodes = kernel_graph->execution_order(); | |||
| @@ -129,7 +201,7 @@ void CPUSession::BuildKernel(const KernelGraph *kernel_graph) { | |||
| std::shared_ptr<kernel::CPUKernel> cpu_kernel = | |||
| kernel::CPUKernelFactory::GetInstance().Create(kernel_name, kernel_node); | |||
| if (cpu_kernel == nullptr) { | |||
| MS_LOG(EXCEPTION) << "Operator[" << kernel_name << "] is not support."; | |||
| KernelNotSupportException(kernel_node); | |||
| } | |||
| cpu_kernel->Init(kernel_node); | |||
| AnfAlgo::SetKernelMod(cpu_kernel, kernel_node.get()); | |||
| @@ -37,6 +37,7 @@ class CPUSession : public SessionBasic { | |||
| protected: | |||
| ParameterPtr CreateNewParameterFromParameter(const AnfNodePtr &anf, bool valid_input, KernelGraph *graph) override; | |||
| void Optimize(const std::shared_ptr<KernelGraph> &kernel_graph); | |||
| private: | |||
| void SetKernelInfo(const KernelGraph *kernel_graph); | |||
| @@ -25,6 +25,11 @@ | |||
| #include "backend/optimizer/pass/getitem_tuple.h" | |||
| #include "backend/optimizer/gpu/adam_weight_decay_fusion.h" | |||
| #include "backend/optimizer/gpu/adam_fusion.h" | |||
| #include "backend/optimizer/gpu/replace_bn_cast_fusion.h" | |||
| #include "backend/optimizer/gpu/replace_bn_grad_cast_fusion.h" | |||
| #include "backend/optimizer/gpu/replace_bn_grad_cast2_fusion.h" | |||
| #include "backend/optimizer/gpu/replace_momentum_cast_fusion.h" | |||
| #include "backend/optimizer/gpu/replace_addn_fusion.h" | |||
| #include "runtime/device/kernel_runtime_manager.h" | |||
| #include "predict/predict.h" | |||
| #include "common/utils.h" | |||
| @@ -59,6 +64,11 @@ void GPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) { | |||
| auto pm = std::make_shared<opt::PassManager>(); | |||
| pm->AddPass(std::make_shared<opt::AdamWeightDecayFusion>()); | |||
| pm->AddPass(std::make_shared<opt::AdamFusion>()); | |||
| pm->AddPass(std::make_shared<opt::ReplaceBNCastFusion>()); | |||
| pm->AddPass(std::make_shared<opt::ReplaceBNGradCastFusion>()); | |||
| pm->AddPass(std::make_shared<opt::ReplaceBNGradCast2Fusion>()); | |||
| pm->AddPass(std::make_shared<opt::ReplaceMomentumCastFusion>()); | |||
| pm->AddPass(std::make_shared<opt::ReplaceAddNFusion>()); | |||
| optimizer->AddPassManager(pm); | |||
| (void)optimizer->Optimize(kernel_graph); | |||
| kernel_graph->SetExecOrderByDefault(); | |||
| @@ -167,6 +177,10 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList | |||
| Optimize(graph); | |||
| // Select kernel build info | |||
| SelectKernel(graph); | |||
| #if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES) | |||
| // Assign parameter keys. | |||
| AssignParamKey(graph); | |||
| #endif | |||
| // Convert kernel Graph to model | |||
| predictmodel::StepConvertGraph(graph); | |||
| // Start gpu kernel runtime | |||
| @@ -204,6 +218,10 @@ void GPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten | |||
| auto &kernel_graph = graphs_[graph_id]; | |||
| // Load input data from user input | |||
| LoadInputData(kernel_graph, inputs); | |||
| // Initialize parameter server | |||
| if (!ps_init_) { | |||
| InitPSParamAndOptim(kernel_graph, inputs); | |||
| } | |||
| MS_EXCEPTION_IF_NULL(kernel_graph); | |||
| // Convert inputs to model | |||
| predictmodel::StepConvertWeight(inputs); | |||
| @@ -307,7 +307,7 @@ CNodePtr KernelGraph::NewCNode(const std::vector<AnfNodePtr> &inputs) { | |||
| if (inputs.size() == 1 || !feature_map_input_indexs.empty()) { | |||
| kernel_info->SetFeatureMapFlag(true); | |||
| } | |||
| if (AnfAlgo::IsRealCNodeKernel(cnode)) { | |||
| if (AnfAlgo::IsRealKernel(cnode)) { | |||
| AnfAlgo::SetNodeAttr(kIsFeatureMapOutput, MakeValue(kernel_info->is_feature_map()), cnode); | |||
| AnfAlgo::SetNodeAttr(kIsFeatureMapInputList, MakeValue(feature_map_input_indexs), cnode); | |||
| } | |||
| @@ -929,10 +929,15 @@ void KernelGraph::AddInternalOutput(const AnfNodePtr &front_node, const AnfNodeP | |||
| } | |||
| MS_LOG(INFO) << "Add internal node " << node->DebugString() << " with front node " << front_node->DebugString(); | |||
| front_to_internal_outputs_map_[front_node] = node; | |||
| internal_outputs_to_front_map_[node] = front_node; | |||
| int output_idx = 0; | |||
| if (AnfAlgo::CheckPrimitiveType(front_node, prim::kPrimTupleGetItem)) { | |||
| output_idx = AnfAlgo::GetTupleGetItemOutIndex(front_node->cast<CNodePtr>()); | |||
| } | |||
| internal_outputs_to_front_map_[node][output_idx] = front_node; | |||
| } | |||
| void KernelGraph::ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node) { | |||
| void KernelGraph::ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node, int src_output_idx, | |||
| int dst_output_idx) { | |||
| if (new_node == nullptr || node == nullptr) { | |||
| MS_LOG(INFO) << "New node or node is nullptr"; | |||
| return; | |||
| @@ -947,9 +952,30 @@ void KernelGraph::ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Replace internal node " << node->DebugString() << " To " << new_node->DebugString(); | |||
| internal_outputs_to_front_map_[new_node] = iter->second; | |||
| front_to_internal_outputs_map_[iter->second] = new_node; | |||
| internal_outputs_to_front_map_.erase(iter); | |||
| auto &front_nodes = iter->second; | |||
| // Move all front nodes to new node mapping | |||
| if (src_output_idx == -1) { | |||
| internal_outputs_to_front_map_[new_node] = front_nodes; | |||
| for (const auto &front_node_iter : front_nodes) { | |||
| front_to_internal_outputs_map_[front_node_iter.second] = new_node; | |||
| } | |||
| internal_outputs_to_front_map_.erase(iter); | |||
| return; | |||
| } | |||
| // Move specified front node to new node mapping | |||
| int index = SizeToInt(src_output_idx); | |||
| auto front_node_iter = front_nodes.find(index); | |||
| if (front_node_iter == front_nodes.end()) { | |||
| MS_LOG(INFO) << "The output " << src_output_idx << " of node " << node->DebugString() << " is not an internal node"; | |||
| return; | |||
| } | |||
| auto front_node = front_node_iter->second; | |||
| internal_outputs_to_front_map_[new_node][dst_output_idx] = front_node; | |||
| front_to_internal_outputs_map_[front_node] = new_node; | |||
| front_nodes.erase(index); | |||
| if (front_nodes.empty()) { | |||
| internal_outputs_to_front_map_.erase(iter); | |||
| } | |||
| } | |||
| AnfNodePtr KernelGraph::GetInternalOutputByFrontNode(const AnfNodePtr &front_node) const { | |||
| @@ -967,14 +993,6 @@ bool KernelGraph::IsInternalOutput(const AnfNodePtr &node) const { | |||
| return false; | |||
| } | |||
| AnfNodePtr KernelGraph::GetFrontNodeByInternalOutput(const AnfNodePtr &node) const { | |||
| auto iter = internal_outputs_to_front_map_.find(node); | |||
| if (iter != internal_outputs_to_front_map_.end()) { | |||
| return iter->second; | |||
| } | |||
| return nullptr; | |||
| } | |||
| void KernelGraph::AddFinalOutputKernel(const AnfNodePtr &node) { | |||
| if (node == nullptr) { | |||
| return; | |||
| @@ -148,10 +148,10 @@ class KernelGraph : public FuncGraph { | |||
| const std::map<std::string, std::pair<AnfNodePtr, int>> &summary_nodes() const { return summary_nodes_; } | |||
| void set_summary_nodes(const std::map<std::string, std::pair<AnfNodePtr, int>> &nodes) { summary_nodes_ = nodes; } | |||
| void AddInternalOutput(const AnfNodePtr &front_node, const AnfNodePtr &node); | |||
| void ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node); | |||
| void ReplaceInternalOutput(const AnfNodePtr &node, const AnfNodePtr &new_node, int src_output_idx = -1, | |||
| int dst_output_idx = -1); | |||
| AnfNodePtr GetInternalOutputByFrontNode(const AnfNodePtr &front_node) const; | |||
| bool IsInternalOutput(const AnfNodePtr &node) const; | |||
| AnfNodePtr GetFrontNodeByInternalOutput(const AnfNodePtr &node) const; | |||
| void AddFinalOutputKernel(const AnfNodePtr &node); | |||
| bool IsFinalOutputKernel(const AnfNodePtr &node) const; | |||
| uint32_t current_epoch() const { return current_epoch_; } | |||
| @@ -223,7 +223,7 @@ class KernelGraph : public FuncGraph { | |||
| CNodePtr end_goto_; | |||
| bool null_output_; | |||
| std::unordered_map<AnfNodePtr, AnfNodePtr> front_to_internal_outputs_map_; | |||
| std::unordered_map<AnfNodePtr, AnfNodePtr> internal_outputs_to_front_map_; | |||
| std::unordered_map<AnfNodePtr, std::unordered_map<int, AnfNodePtr>> internal_outputs_to_front_map_; | |||
| std::set<AnfNodePtr> final_output_kernels_; | |||
| uint32_t current_epoch_; | |||
| }; | |||
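internal_outputs_to_front_map_ changes from a flat node-to-node map into a two-level map keyed by output index, so one backend kernel can back several front-end TupleGetItem nodes at once. A minimal sketch of the new shape and of the move-one-output path that ReplaceInternalOutput takes when src_output_idx != -1 (strings stand in for AnfNodePtr):

```cpp
#include <string>
#include <unordered_map>

using Node = std::string;  // illustrative stand-in for AnfNodePtr

int main() {
  std::unordered_map<Node, std::unordered_map<int, Node>> internal_outputs_to_front;
  // A two-output backend kernel backing two front-end getitem nodes.
  internal_outputs_to_front["fused_kernel"][0] = "getitem_0";
  internal_outputs_to_front["fused_kernel"][1] = "getitem_1";
  // Move only output 1 to a new backend node (dst_output_idx = 0).
  const Node moved = internal_outputs_to_front["fused_kernel"][1];
  internal_outputs_to_front["new_kernel"][0] = moved;
  internal_outputs_to_front["fused_kernel"].erase(1);
  // Drop the old entry once no outputs remain mapped.
  if (internal_outputs_to_front["fused_kernel"].empty()) {
    internal_outputs_to_front.erase("fused_kernel");
  }
  return 0;
}
```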
| @@ -35,6 +35,11 @@ | |||
| #include "ir/dtype.h" | |||
| #include "ir/anf.h" | |||
| #include "ir/func_graph_cloner.h" | |||
| #if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES) | |||
| #include "frontend/parallel/ps/worker.h" | |||
| #include "frontend/parallel/ps/common.h" | |||
| #include "frontend/parallel/ps/util.h" | |||
| #endif | |||
| namespace mindspore { | |||
| namespace session { | |||
| @@ -295,7 +300,11 @@ void SessionBasic::InitInternalOutputParameter(const AnfNodePtr &out_node, const | |||
| MS_LOG(INFO) << "No corresponding internal output for output node"; | |||
| return; | |||
| } | |||
| auto real_kernel = AnfAlgo::VisitKernel(ref_node, 0); | |||
| size_t output_idx = 0; | |||
| if (AnfAlgo::CheckPrimitiveType(out_node, prim::kPrimTupleGetItem)) { | |||
| output_idx = AnfAlgo::GetTupleGetItemOutIndex(out_node->cast<CNodePtr>()); | |||
| } | |||
| auto real_kernel = AnfAlgo::VisitKernel(ref_node, output_idx); | |||
| auto ref_real_node = real_kernel.first; | |||
| auto ref_real_node_index = real_kernel.second; | |||
| if (ref_real_node->isa<CNode>() && node_graph->IsInternalOutput(ref_real_node) && | |||
| @@ -320,6 +329,7 @@ void SessionBasic::InitInternalOutputParameter(const AnfNodePtr &out_node, const | |||
| builder.SetOutputsFormat({format}); | |||
| d_kernel_info->set_select_kernel_build_info(builder.Build()); | |||
| AnfAlgo::SetOutputAddr(address, 0, parameter.get()); | |||
| AnfAlgo::SetOutputInferTypeAndShape({type}, {AnfAlgo::GetOutputInferShape(parameter, 0)}, parameter.get()); | |||
| } | |||
| } | |||
| @@ -973,6 +983,16 @@ CNodePtr SessionBasic::ConstructOutput(const AnfNodePtrList &outputs, const std: | |||
| bool internal_output = true; | |||
| std::string kernel_target = GetCNodeTarget(front_real_kernel.first); | |||
| for (auto user : users) { | |||
| auto cnode = user.first->cast<CNodePtr>(); | |||
| if (cnode == nullptr) { | |||
| internal_output = false; | |||
| break; | |||
| } | |||
| auto prim = cnode->input(kAnfPrimitiveIndex); | |||
| if (prim == nullptr || !prim->isa<ValueNode>()) { | |||
| internal_output = false; | |||
| break; | |||
| } | |||
| if (!AnfAlgo::IsRealKernel(user.first) || kernel_target != GetCNodeTarget(user.first)) { | |||
| internal_output = false; | |||
| break; | |||
| @@ -1097,5 +1117,92 @@ KernelGraphPtr SessionBasic::NewKernelGraph() { | |||
| graphs_[graph_sum_++] = graph; | |||
| return graph; | |||
| } | |||
| AnfNodePtr SessionBasic::FindPullNode(const AnfNodePtr &push_node, const std::vector<AnfNodePtr> &node_list) { | |||
| MS_EXCEPTION_IF_NULL(push_node); | |||
| for (auto &node : node_list) { | |||
| if (node != nullptr && node->isa<CNode>()) { | |||
| for (auto input : node->cast<CNodePtr>()->inputs()) { | |||
| if (push_node == AnfAlgo::VisitKernel(input, 0).first) { | |||
| if (AnfAlgo::GetCNodeName(node) != kPullOpName) { | |||
| MS_LOG(EXCEPTION) << "The edge between Push and Pull node is invalid."; | |||
| } | |||
| return node; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| return nullptr; | |||
| } | |||
| #if (!_WIN32 && !ENABLE_GE && !ENABLE_TESTCASES) | |||
| void SessionBasic::AssignParamKey(const KernelGraphPtr &kernel_graph) { | |||
| if (!parallel::ps::Util::IsRoleOfWorker()) { | |||
| MS_LOG(INFO) << "Not parameter server mode."; | |||
| return; | |||
| } | |||
| MS_EXCEPTION_IF_NULL(kernel_graph); | |||
| std::vector<AnfNodePtr> node_list = TopoSort(kernel_graph->get_return()); | |||
| for (auto &node : node_list) { | |||
| if (node != nullptr && node->isa<CNode>()) { | |||
| // Assign key for forward kernel EmbeddingLookup. | |||
| // The key will be assigned to the embedding table and the Push kernel as well. | |||
| if (AnfAlgo::GetCNodeName(node) == kEmbeddingLookupOpName) { | |||
| size_t embedding_table_idx = 0; | |||
| auto embedding_table = AnfAlgo::GetInputNode(node->cast<CNodePtr>(), embedding_table_idx); | |||
| size_t key = parallel::ps::Worker<float>::GetInstance().SetParamKey(embedding_table->fullname_with_scope()); | |||
| AnfAlgo::SetNodeAttr(kAttrPsKey, MakeValue(key), node); | |||
| } else if (AnfAlgo::GetCNodeName(node) == kPushOpName) { | |||
| auto pull_node = FindPullNode(node, node_list); | |||
| if (!pull_node) { | |||
| MS_LOG(EXCEPTION) << "Assigning parameter key failed: can't find Pull node of the Push node."; | |||
| } | |||
| // Second input of Pull node is the trainable parameter. | |||
| size_t parameter_index = 1; | |||
| auto parameter_node = AnfAlgo::GetInputNode(pull_node->cast<CNodePtr>(), parameter_index); | |||
| size_t key = parallel::ps::Worker<float>::GetInstance().SetParamKey(parameter_node->fullname_with_scope()); | |||
| AnfAlgo::SetNodeAttr(kAttrPsKey, MakeValue(key), node); | |||
| AnfAlgo::SetNodeAttr(kAttrPsKey, MakeValue(key), pull_node); | |||
| std::string optimizer_name = AnfAlgo::GetNodeAttr<std::string>(node, kAttrOptimizerType); | |||
| parallel::ps::Worker<float>::GetInstance().SetKeyOptimId(key, optimizer_name); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| void SessionBasic::InitPSParamAndOptim(const KernelGraphPtr &kernel_graph, | |||
| const std::vector<tensor::TensorPtr> &inputs_const) { | |||
| if (!parallel::ps::Util::IsRoleOfWorker()) { | |||
| return; | |||
| } | |||
| std::vector<tensor::TensorPtr> inputs(inputs_const); | |||
| size_t input_ctrl_size = 1; | |||
| MS_EXCEPTION_IF_NULL(kernel_graph); | |||
| if (kernel_graph->input_ctrl_tensors()) { | |||
| input_ctrl_size = LoadCtrlInputTensor(kernel_graph, &inputs); | |||
| } | |||
| auto input_nodes = kernel_graph->inputs(); | |||
| if ((inputs.size() + input_ctrl_size) - 1 != input_nodes.size()) { | |||
| MS_LOG(EXCEPTION) << "Tensor input:" << inputs.size() << " is not equal graph inputs:" << input_nodes.size() | |||
| << ", input_ctrl_size:" << input_ctrl_size; | |||
| } | |||
| auto ms_context = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(ms_context); | |||
| for (size_t i = 0; i < inputs.size(); ++i) { | |||
| auto tensor = inputs[i]; | |||
| MS_EXCEPTION_IF_NULL(tensor); | |||
| auto input_node = input_nodes[i]; | |||
| MS_EXCEPTION_IF_NULL(input_node); | |||
| if (input_node->isa<Parameter>() && AnfAlgo::OutputAddrExist(input_node, 0)) { | |||
| auto pk_node = input_node->cast<ParameterPtr>(); | |||
| mindspore::parallel::ps::Worker<float>::GetInstance().InitPSParamAndOptim( | |||
| pk_node->fullname_with_scope(), tensor->data_c(), LongToSize(tensor->data().nbytes())); | |||
| } | |||
| } | |||
| ps_init_ = true; | |||
| } | |||
| #endif | |||
| } // namespace session | |||
| } // namespace mindspore | |||
| @@ -51,7 +51,7 @@ using OpRunInfoPtr = std::shared_ptr<OpRunInfo>; | |||
| class SessionBasic { | |||
| public: | |||
| SessionBasic() : context_(nullptr), summary_callback_(nullptr), device_id_(0) { | |||
| SessionBasic() : context_(nullptr), summary_callback_(nullptr), device_id_(0), ps_init_(false) { | |||
| #ifdef ENABLE_DEBUGGER | |||
| debugger_ = nullptr; | |||
| #endif | |||
| @@ -104,6 +104,8 @@ class SessionBasic { | |||
| virtual GraphId GetFinalRunGraph() const { return kInvalidGraphId; } | |||
| virtual void SetActive(GraphId, GraphId) {} | |||
| virtual void GetSummaryNodes(KernelGraph *graph); | |||
| void AssignParamKey(const KernelGraphPtr &kernel_graph); | |||
| void InitPSParamAndOptim(const KernelGraphPtr &kernel_graph, const std::vector<tensor::TensorPtr> &inputs_const); | |||
| #ifdef ENABLE_DEBUGGER | |||
| // set debugger | |||
| @@ -140,6 +142,7 @@ class SessionBasic { | |||
| AnfNodePtr CreateNewParameterFromCNode(const AnfNodePtr &anf, bool valid_input, KernelGraph *graph); | |||
| void AddParameterToGraphInputs(const std::vector<AnfNodePtr> ¶meters, KernelGraph *graph); | |||
| void InitInternalOutputParameter(const AnfNodePtr &out_node, const AnfNodePtr ¶meter); | |||
| AnfNodePtr FindPullNode(const AnfNodePtr &push_node, const std::vector<AnfNodePtr> &node_list); | |||
| std::unordered_map<GraphId, std::shared_ptr<KernelGraph>> graphs_; | |||
| std::unordered_map<GraphInfo, std::shared_ptr<KernelGraph>> run_op_graphs_; | |||
| @@ -148,6 +151,7 @@ class SessionBasic { | |||
| CallBackFunc summary_callback_; | |||
| static GraphId graph_sum_; | |||
| uint32_t device_id_; | |||
| bool ps_init_; | |||
| #ifdef ENABLE_DEBUGGER | |||
| std::shared_ptr<Debugger> debugger_; | |||
| #endif | |||
| @@ -23,9 +23,7 @@ if (ENABLE_D) | |||
| list(APPEND _DEBUG_SRC_LIST | |||
| "${CMAKE_CURRENT_SOURCE_DIR}/common.cc" | |||
| ) | |||
| if (ENABLE_DATA_DUMP) | |||
| list(APPEND _DEBUG_SRC_LIST "${CMAKE_CURRENT_SOURCE_DIR}/data_dump_parser.cc") | |||
| endif(ENABLE_DATA_DUMP) | |||
| list(APPEND _DEBUG_SRC_LIST "${CMAKE_CURRENT_SOURCE_DIR}/data_dump_parser.cc") | |||
| endif() | |||
| if (ENABLE_DUMP_E2E) | |||
| @@ -120,6 +120,10 @@ std::optional<std::string> Common::GetConfigFile(const std::string &env) { | |||
| MS_LOG(ERROR) << dump_config_file << " not exist."; | |||
| return {}; | |||
| } | |||
| auto suffix = dump_config_file.substr(dump_config_file.find_last_of('.') + 1); | |||
| if (suffix != "json") { | |||
| MS_LOG(EXCEPTION) << "[DataDump] dump config file suffix only support json! But got:." << suffix; | |||
| } | |||
| return dump_config_file; | |||
| } | |||
| } // namespace mindspore | |||
| @@ -29,13 +29,13 @@ void DataDumpParser::ResetParam() { | |||
| net_name_.clear(); | |||
| dump_mode_ = 0; | |||
| dump_step_ = 0; | |||
| kernel_set_.clear(); | |||
| kernel_map_.clear(); | |||
| } | |||
| bool DataDumpParser::DumpEnabled() const { | |||
| auto enable_dump = std::getenv(kEnableDataDump); | |||
| if (!enable_dump) { | |||
| MS_LOG(WARNING) << "[DataDump] enable dump is null. Please export ENABLE_DATA_DUMP"; | |||
| MS_LOG(INFO) << "[DataDump] enable dump is null. Please export ENABLE_DATA_DUMP"; | |||
| return false; | |||
| } | |||
| @@ -60,9 +60,18 @@ std::optional<std::string> DataDumpParser::GetDumpPath() const { | |||
| return {}; | |||
| } | |||
| std::string dump_path_str(dump_path); | |||
| if (!std::all_of(dump_path_str.begin(), dump_path_str.end(), ::isalpha)) { | |||
| MS_LOG(EXCEPTION) << "[DataDump] dump path only support alphas, but got:" << dump_path_str; | |||
| } | |||
| return dump_path_str; | |||
| } | |||
| std::string GetIfstreamString(const std::ifstream &ifstream) { | |||
| std::stringstream buffer; | |||
| buffer << ifstream.rdbuf(); | |||
| return buffer.str(); | |||
| } | |||
| void DataDumpParser::ParseDumpConfig() { | |||
| std::lock_guard<std::mutex> guard(lock_); | |||
| MS_LOG(INFO) << "[DataDump] parse start"; | |||
| @@ -84,7 +93,12 @@ void DataDumpParser::ParseDumpConfig() { | |||
| } | |||
| nlohmann::json j; | |||
| json_file >> j; | |||
| try { | |||
| json_file >> j; | |||
| } catch (nlohmann::json::parse_error &e) { | |||
| MS_LOG(ERROR) << "[DataDump] json contents:" << GetIfstreamString(json_file); | |||
| MS_LOG(EXCEPTION) << "[DataDump] parse json failed, error:" << e.what(); | |||
| } | |||
| if (j.find("DumpSettings") == j.end()) { | |||
| MS_LOG(EXCEPTION) << "[DataDump] DumpSettings is not exist."; | |||
| } | |||
| @@ -111,8 +125,8 @@ bool DataDumpParser::NeedDump(const std::string &op_full_name) const { | |||
| if (dump_mode_ == 0) { | |||
| return true; | |||
| } | |||
| auto iter = kernel_set_.find(op_full_name); | |||
| return iter != kernel_set_.end(); | |||
| auto iter = kernel_map_.find(op_full_name); | |||
| return iter != kernel_map_.end(); | |||
| } | |||
| bool DataDumpParser::IsConfigExist(const nlohmann::json &dump_settings) const { | |||
| @@ -145,8 +159,25 @@ bool DataDumpParser::ParseDumpSetting(const nlohmann::json &dump_settings) { | |||
| auto kernel_str = kernel.dump(); | |||
| kernel_str.erase(std::remove(kernel_str.begin(), kernel_str.end(), '\"'), kernel_str.end()); | |||
| MS_LOG(INFO) << "[DataDump] Need dump kernel:" << kernel_str; | |||
| kernel_set_.insert(kernel_str); | |||
| kernel_map_.insert({kernel_str, 0}); | |||
| } | |||
| return true; | |||
| } | |||
| void DataDumpParser::MatchKernel(const std::string &kernel_name) { | |||
| auto iter = kernel_map_.find(kernel_name); | |||
| if (iter == kernel_map_.end()) { | |||
| return; | |||
| } | |||
| iter->second = iter->second + 1; | |||
| MS_LOG(INFO) << "Match dump kernel:" << iter->first << " match times:" << iter->second; | |||
| } | |||
| void DataDumpParser::PrintUnusedKernel() { | |||
| for (const auto &iter : kernel_map_) { | |||
| if (iter.second == 0) { | |||
| MS_LOG(WARNING) << "[DataDump] Unused Kernel in json:" << iter.first; | |||
| } | |||
| } | |||
| } | |||
| } // namespace mindspore | |||
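Replacing kernel_set_ with kernel_map_ lets the parser count how often each configured kernel actually matched, so PrintUnusedKernel can flag config entries that never appeared in the graph. A self-contained sketch of that counting scheme (kernel names are made up):

```cpp
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>

std::map<std::string, std::uint32_t> kernel_map;  // name -> match count

void MatchKernel(const std::string &name) {
  auto it = kernel_map.find(name);
  if (it == kernel_map.end()) return;  // not in the dump config: ignore
  ++it->second;
}

void PrintUnusedKernel() {
  for (const auto &kv : kernel_map) {
    if (kv.second == 0) std::printf("Unused kernel in json: %s\n", kv.first.c_str());
  }
}

int main() {
  kernel_map = {{"Conv2D-op1", 0}, {"MatMul-op2", 0}};  // as parsed from the config
  MatchKernel("Conv2D-op1");  // present in the executed graph
  MatchKernel("ReLU-op3");    // never configured: silently skipped
  PrintUnusedKernel();        // warns about MatMul-op2 only
  return 0;
}
```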
| @@ -18,7 +18,7 @@ | |||
| #define MINDSPORE_MINDSPORE_CCSRC_DEBUG_ASYNC_DUMP_JSON_PARE_H_ | |||
| #include <string> | |||
| #include <set> | |||
| #include <map> | |||
| #include <mutex> | |||
| #include <optional> | |||
| #include "nlohmann/json.hpp" | |||
| @@ -39,7 +39,8 @@ class DataDumpParser { | |||
| const std::string &net_name() const { return net_name_; } | |||
| uint32_t dump_mode() const { return dump_mode_; } | |||
| uint32_t dump_step() const { return dump_step_; } | |||
| const std::set<std::string> &kernel_set() const { return kernel_set_; } | |||
| void MatchKernel(const std::string &kernel_name); | |||
| void PrintUnusedKernel(); | |||
| private: | |||
| DataDumpParser() = default; | |||
| @@ -55,7 +56,7 @@ class DataDumpParser { | |||
| std::string net_name_; | |||
| uint32_t dump_mode_{0}; | |||
| uint32_t dump_step_{0}; | |||
| std::set<std::string> kernel_set_; | |||
| std::map<std::string, uint32_t> kernel_map_; | |||
| }; | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_MINDSPORE_CCSRC_DEBUG_ASYNC_DUMP_JSON_PARE_H_ | |||
| @@ -124,6 +124,8 @@ void ProtoExporter::SetNodeOutputType(const TypePtr &type, const BaseShapePtr &s | |||
| // Do Nothing | |||
| } else if (type->isa<UndeterminedType>()) { | |||
| // Do Nothing | |||
| } else if (type->isa<SparseTensorType>()) { | |||
| // Do Nothing | |||
| } else if (type->isa<Tuple>()) { | |||
| TuplePtr tuple_type = dyn_cast<Tuple>(type); | |||
| type_proto->set_data_type(irpb::DT_TUPLE); | |||