| @@ -144,6 +144,30 @@ std::vector<int64_t> CalDimOffset(const std::vector<int64_t> &input_shape); | |||
| size_t GetCopySize(const std::vector<int64_t> &dim_offset, const std::vector<int64_t> &start, | |||
| const std::vector<int64_t> &stop); | |||
| size_t UnitSizeInBytes(const mindspore::TypeId &t); | |||
| #define CHECK_KERNEL_INPUTS_NUM(actual_inputs_num, expect_inputs_num, kernel_name) \ | |||
| do { \ | |||
| if ((actual_inputs_num) != (expect_inputs_num)) { \ | |||
| MS_LOG(EXCEPTION) << (kernel_name) << " requires " << (expect_inputs_num) << " inputs, but got " \ | |||
| << (actual_inputs_num) << "."; \ | |||
| } \ | |||
| } while (0) | |||
| #define CHECK_KERNEL_OUTPUTS_NUM(actual_outputs_num, expect_outputs_num, kernel_name) \ | |||
| do { \ | |||
| if ((actual_outputs_num) != (expect_outputs_num)) { \ | |||
| MS_LOG(EXCEPTION) << (kernel_name) << " should have " << (expect_outputs_num) << " outputs, but got " \ | |||
| << (actual_outputs_num) << "."; \ | |||
| } \ | |||
| } while (0) | |||
| #define CHECK_KERNEL_WORKSPACE_SIZE(actual_size, expect_size, kernel_name) \ | |||
| do { \ | |||
| if ((actual_size) != (expect_size)) { \ | |||
| MS_LOG(EXCEPTION) << (kernel_name) << " requires " << (expect_size) << " workspace, but got " << (actual_size) \ | |||
| << "."; \ | |||
| } \ | |||
| } while (0) | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -44,7 +44,6 @@ class FusedPullWeightKernel : public CPUKernel { | |||
| if (inputs.size() != weight_full_names_.size()) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << inputs.size() << ", but FusedPullWeightKernel needs " | |||
| << weight_full_names_.size() << " weights as inputs."; | |||
| return false; | |||
| } | |||
| std::shared_ptr<fl::FBBuilder> fbb = std::make_shared<fl::FBBuilder>(); | |||
| @@ -67,7 +66,6 @@ class FusedPullWeightKernel : public CPUKernel { | |||
| MS_LOG(INFO) << "Launching pulling weight for federated learning iteration " << fl_iteration_; | |||
| if (!BuildPullWeightReq(fbb)) { | |||
| MS_LOG(EXCEPTION) << "Building request for FusedPullWeight failed."; | |||
| return false; | |||
| } | |||
| std::shared_ptr<std::vector<unsigned char>> pull_weight_rsp_msg = nullptr; | |||
| @@ -98,13 +96,11 @@ class FusedPullWeightKernel : public CPUKernel { | |||
| fbb = std::make_shared<fl::FBBuilder>(); | |||
| if (!BuildPullWeightReq(fbb)) { | |||
| MS_LOG(EXCEPTION) << "Building request for FusedDownloadWeightsByKeys failed."; | |||
| return false; | |||
| } | |||
| continue; | |||
| } else if (retcode != schema::ResponseCode_SUCCEED) { | |||
| MS_LOG(EXCEPTION) << "FusedPullWeight failed. Server return code: " << pull_weight_rsp->retcode() | |||
| << ", reason: " << pull_weight_rsp->reason()->str(); | |||
| return false; | |||
| } else { | |||
| MS_LOG(DEBUG) << "FusedPullWeight succeed."; | |||
| } | |||
| @@ -115,13 +111,11 @@ class FusedPullWeightKernel : public CPUKernel { | |||
| const std::string &weight_name = weight_full_names_[i]; | |||
| if (feature_map.count(weight_name) == 0) { | |||
| MS_LOG(EXCEPTION) << "The weights for " << weight_name << " is not pulled from server."; | |||
| return false; | |||
| } | |||
| int ret = | |||
| memcpy_s(inputs[i]->addr, inputs[i]->size, feature_map[weight_name].addr, feature_map[weight_name].size); | |||
| if (ret != 0) { | |||
| MS_LOG(EXCEPTION) << "memcpy_s error, errorno(" << ret << ")"; | |||
| return false; | |||
| } | |||
| } | |||
| MS_LOG(INFO) << "Pull weights for " << weight_full_names_ << " success. Iteration: " << fl_iteration_; | |||
| @@ -147,7 +141,6 @@ class FusedPullWeightKernel : public CPUKernel { | |||
| MS_LOG(EXCEPTION) | |||
| << "Attributes of FusedPullWeightKernel are invalid: server number is 0 or weight_full_names_ is " | |||
| "empty or indices_ is UINT32_MAX."; | |||
| return; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| @@ -186,7 +179,6 @@ class FusedPullWeightKernel : public CPUKernel { | |||
| if (fbs_feature_map->size() != weight_full_names_.size()) { | |||
| MS_LOG(EXCEPTION) << "FusedPullWeightKernel should get " << weight_full_names_.size() << " weights, but got " | |||
| << fbs_feature_map->size() << " weights."; | |||
| return {}; | |||
| } | |||
| std::map<std::string, Address> feature_map; | |||
| @@ -42,7 +42,6 @@ class FusedPushWeightKernel : public CPUKernel { | |||
| if (inputs.size() != weight_full_names_.size()) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << inputs.size() << ", but FusedPushWeightKernel needs " | |||
| << weight_full_names_.size() << " weights as inputs."; | |||
| return false; | |||
| } | |||
| std::shared_ptr<fl::FBBuilder> fbb = std::make_shared<fl::FBBuilder>(); | |||
| @@ -65,7 +64,6 @@ class FusedPushWeightKernel : public CPUKernel { | |||
| MS_LOG(INFO) << "Launching pushing weight for federated learning iteration " << fl_iteration_; | |||
| if (!BuildPushWeightReq(fbb, inputs)) { | |||
| MS_LOG(EXCEPTION) << "Building request for FusedPushWeight failed."; | |||
| return false; | |||
| } | |||
| // The server number may change after scaling in/out. | |||
| @@ -97,13 +95,11 @@ class FusedPushWeightKernel : public CPUKernel { | |||
| << ". Retry later."; | |||
| if (!BuildPushWeightReq(fbb, inputs)) { | |||
| MS_LOG(EXCEPTION) << "Building request for FusedPushWeight failed."; | |||
| return false; | |||
| } | |||
| continue; | |||
| } else if (retcode != schema::ResponseCode_SUCCEED) { | |||
| MS_LOG(EXCEPTION) << "FusedPushWeight failed. Server return code: " << push_weight_rsp->retcode() | |||
| << ", reason: " << push_weight_rsp->reason()->str(); | |||
| return false; | |||
| } else { | |||
| MS_LOG(DEBUG) << "FusedPushWeight succeed."; | |||
| } | |||
| @@ -132,7 +128,6 @@ class FusedPushWeightKernel : public CPUKernel { | |||
| MS_LOG(EXCEPTION) | |||
| << "Attributes of FusedPushWeightKernel are invalid: server number is 0 or weight_full_names_ is " | |||
| "empty or indices_ is UINT32_MAX."; | |||
| return; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -24,17 +24,26 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kAddNInputsMinNum = 2; | |||
| constexpr size_t kAddNOutputsNum = 1; | |||
| void AddInt(const int *in_0, const int *in_1, int *out, int start, int end) { | |||
| int ret = ElementAddInt(in_0 + start, in_1 + start, out + start, end - start); | |||
| if (ret != NNACL_OK) { | |||
| MS_LOG(EXCEPTION) << "Add failed."; | |||
| } | |||
| } | |||
| } // namespace | |||
| void AddNCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| CheckParam(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| input_num_ = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num_ < kAddNInputsMinNum) { | |||
| MS_LOG(EXCEPTION) << "Input numbers should not less " << kAddNInputsMinNum << ", but got " << input_num_; | |||
| } | |||
| CheckParam(kernel_node); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| std::vector<size_t> src0_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> src1_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| @@ -52,6 +61,8 @@ void AddNCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool AddNCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), input_num_, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kAddNOutputsNum, kernel_name_); | |||
| if (dtype_ == kNumberTypeFloat32) { | |||
| SetArgumentHandle(DNNL_ARG_SRC_0, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_SRC_1, inputs[1]->addr); | |||
| @@ -93,10 +104,6 @@ void AddNCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| MS_LOG(EXCEPTION) << "AddN input shapes must be equal."; | |||
| } | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but AddNCPUKernel needs 1 output."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/assignadd_cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| @@ -20,13 +21,19 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kAssignAddInputsNum = 2; | |||
| constexpr size_t kAssignAddOutputsNum = 1; | |||
| } // namespace | |||
| void AssignAddCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| std::vector<size_t> src0_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> src1_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| if (src1_shape.size() == 0 && src0_shape.size() == 0) { | |||
| src0_shape.insert(src0_shape.begin(), 1); | |||
| src1_shape.insert(src1_shape.begin(), 1); | |||
| (void)src0_shape.insert(src0_shape.begin(), 1); | |||
| (void)src1_shape.insert(src1_shape.begin(), 1); | |||
| } | |||
| if (src0_shape.size() != src1_shape.size() && src1_shape.size() > 1) { | |||
| MS_LOG(EXCEPTION) << "AssignAdd only support same dim input or tensor * scalar " << src0_shape.size() << " vs " | |||
| @@ -49,9 +56,8 @@ void AssignAddCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool AssignAddCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() < 2) { | |||
| MS_LOG(EXCEPTION) << "AssignAdd error input output size!"; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kAssignAddInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kAssignAddOutputsNum, kernel_name_); | |||
| SetArgumentHandle(DNNL_ARG_SRC_0, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_SRC_1, inputs[1]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr); | |||
| @@ -59,7 +65,6 @@ bool AssignAddCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, c | |||
| auto ret = memcpy_s(inputs[0]->addr, inputs[0]->size, outputs[0]->addr, outputs[0]->size); | |||
| if (ret != 0) { | |||
| MS_LOG(EXCEPTION) << "Memcpy_s error, errorno " << ret; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/batch_norm_cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| @@ -20,9 +21,15 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kBatchNormInputsNum = 5; | |||
| constexpr size_t kBatchNormOutputsNum = 5; | |||
| constexpr size_t kBatchNormInputShapeSize = 4; | |||
| constexpr size_t kBatchNormInputShapeSize2 = 2; | |||
| } // namespace | |||
| void BatchNormCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { | |||
| CPUKernel::InitInputOutputSize(kernel_node); | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| size_t type_size = sizeof(float); | |||
| std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| size_t tensor_size = shape[1] * 2 * type_size; // [2, c] to store scale and bias | |||
| @@ -31,12 +38,13 @@ void BatchNormCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { | |||
| void BatchNormCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| is_train = AnfAlgo::GetNodeAttr<bool>(kernel_node, "is_training"); | |||
| momentum = AnfAlgo::GetNodeAttr<float>(kernel_node, "momentum"); | |||
| std::vector<size_t> x_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| if (x_shape.size() == 2) { | |||
| (void)x_shape.insert(x_shape.end(), 2, 1); // expand 2 dim: NC -> NCHW | |||
| } else if (x_shape.size() != 4) { | |||
| if (x_shape.size() == kBatchNormInputShapeSize2) { | |||
| (void)x_shape.insert(x_shape.end(), kBatchNormInputShapeSize - kBatchNormInputShapeSize2, 1); | |||
| } else if (x_shape.size() != kBatchNormInputShapeSize) { | |||
| MS_LOG(EXCEPTION) << "Batchnorm only support nchw input!"; | |||
| } | |||
| batch_size = x_shape[0]; | |||
| @@ -67,9 +75,8 @@ void BatchNormCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool BatchNormCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() < 5 || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "Error input output size!"; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kBatchNormInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kBatchNormOutputsNum, kernel_name_); | |||
| auto wksp = reinterpret_cast<float *>(workspace[0]->addr); | |||
| auto scale_ret = memcpy_s(wksp, workspace[0]->size, inputs[1]->addr, inputs[1]->size); | |||
| auto max_size = workspace[0]->size - inputs[1]->size; | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/batch_norm_grad_cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| @@ -20,9 +21,15 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kBatchNormGradInputsNum = 6; | |||
| constexpr size_t kBatchNormGradOutputsNum = 3; | |||
| constexpr size_t kBatchNormGradInputShapeSize = 4; | |||
| constexpr size_t kBatchNormGradInputShapeSize2 = 2; | |||
| } // namespace | |||
| void BatchNormGradCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { | |||
| CPUKernel::InitInputOutputSize(kernel_node); | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| size_t type_size = sizeof(float); | |||
| std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, Y_BACKPROP); | |||
| size_t tensor_size = shape[C] * SCALE_SHIFT_NUM * type_size; | |||
| @@ -35,6 +42,7 @@ void BatchNormGradCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { | |||
| void BatchNormGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| std::vector<size_t> x_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| if (x_shape.size() == NC) { | |||
| (void)x_shape.insert(x_shape.end(), (NCHW - NC), 1); | |||
| @@ -76,10 +84,9 @@ void BatchNormGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool BatchNormGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| constexpr size_t INPUT_NUM = 5; | |||
| if (inputs.size() < INPUT_NUM || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "Error input output size!"; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kBatchNormGradInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kBatchNormGradOutputsNum, kernel_name_); | |||
| auto wksp_in = reinterpret_cast<float *>(workspace[SCALE_BIAS]->addr); | |||
| auto scale_ret = memcpy_s(wksp_in, workspace[SCALE_BIAS]->size, inputs[SCALE]->addr, inputs[SCALE]->size); | |||
| if (scale_ret != 0) { | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/conv2d_grad_filter_cpu_kernel.h" | |||
| #include <string> | |||
| #include <algorithm> | |||
| @@ -22,12 +23,20 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kShapeSize2D = 2; | |||
| constexpr size_t kShapeSize4D = 4; | |||
| constexpr size_t kConv2dGradFilterInputsNum = 2; | |||
| constexpr size_t kConv2dGradFilterOutputsNum = 1; | |||
| } // namespace | |||
| void Conv2dGradFilterCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| std::vector<size_t> weight_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> dst_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| if (src_shape.size() != 4 || weight_shape.size() != 4) { | |||
| if (src_shape.size() != kShapeSize4D || weight_shape.size() != kShapeSize4D) { | |||
| MS_LOG(EXCEPTION) << ("Conv2d grad filter only support nchw input!"); | |||
| } | |||
| std::vector<size_t> kernel_size({weight_shape[2], weight_shape[3]}); | |||
| @@ -36,7 +45,7 @@ void Conv2dGradFilterCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| if (src_shape[1] % group != 0) { | |||
| MS_LOG(EXCEPTION) << "Conv2d channels should be divided by group!"; | |||
| } | |||
| weight_shape.insert(weight_shape.begin(), group); | |||
| (void)weight_shape.insert(weight_shape.begin(), group); | |||
| weight_shape[1] = weight_shape[1] / group; | |||
| } | |||
| dnnl::memory::desc src_desc = GetDefaultMemDesc(src_shape); | |||
| @@ -47,16 +56,19 @@ void Conv2dGradFilterCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| auto stride_me = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, STRIDE); | |||
| auto dilation_me = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, DILATION); | |||
| (void)std::transform(stride_me.begin(), stride_me.end(), std::back_inserter(stride_ori), | |||
| [](const int64_t &value) { return static_cast<int>(value); }); | |||
| [](const int64_t &value) { return LongToInt(value); }); | |||
| (void)std::transform(dilation_me.begin(), dilation_me.end(), std::back_inserter(dilation_ori), | |||
| [](const int64_t &value) { return static_cast<int>(value); }); | |||
| [](const int64_t &value) { return LongToInt(value); }); | |||
| if (dilation_ori.size() != 4) { | |||
| if (dilation_ori.size() != kShapeSize4D) { | |||
| MS_LOG(EXCEPTION) << "Conv2dGradFilterCPUKernel dilation must be 4d!"; | |||
| } | |||
| if (dilation_ori[0] != 1 || dilation_ori[1] != 1) { | |||
| MS_LOG(EXCEPTION) << "Conv2dGradFilterCPUKernel dilation only support 1 in N axis and C axis!"; | |||
| } | |||
| if (stride_ori.size() < kShapeSize2D) { | |||
| MS_LOG(EXCEPTION) << "Conv2dGradFilterCPUKernel stride_ori should not less than 2d!"; | |||
| } | |||
| std::vector<int> stride{stride_ori[0], stride_ori[1]}; | |||
| std::vector<int> dilation{dilation_ori[2], dilation_ori[3]}; | |||
| dnnl::memory::dims strides{stride_ori[0], stride_ori[1]}; | |||
| @@ -91,9 +103,8 @@ void Conv2dGradFilterCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool Conv2dGradFilterCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() < 2 || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "Error input output size!"; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kConv2dGradFilterInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kConv2dGradFilterOutputsNum, kernel_name_); | |||
| SetArgumentHandle(DNNL_ARG_SRC, inputs[1]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_DST, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS, outputs[0]->addr); | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CONV2D_GRAD_FILTER_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CONV2D_GRAD_FILTER_CPU_KERNEL_H_ | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/conv2d_grad_input_cpu_kernel.h" | |||
| #include <string> | |||
| #include <map> | |||
| @@ -23,13 +24,21 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kConv2dGradInputInputsNum = 2; | |||
| constexpr size_t kConv2dGradInputOutputsNum = 1; | |||
| constexpr size_t kShapeSize2D = 2; | |||
| constexpr size_t kShapeSize4D = 4; | |||
| const std::map<std::string, size_t> kFormatIndexMap = {{"NCHW", 2}, {"HWCN", 0}, {"NHWC", 1}}; | |||
| } // namespace | |||
| void Conv2dGradInputCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| std::vector<size_t> src_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> weight_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| std::vector<size_t> dst_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| if (src_shape.size() != 4 || weight_shape.size() != 4) { | |||
| if (src_shape.size() != kShapeSize4D || weight_shape.size() != kShapeSize4D) { | |||
| MS_LOG(EXCEPTION) << "Conv2d grad filter only support nchw input!"; | |||
| } | |||
| std::vector<size_t> kernel_size({weight_shape[2], weight_shape[3]}); | |||
| @@ -38,7 +47,7 @@ void Conv2dGradInputCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| if (src_shape[1] % group != 0) { | |||
| MS_LOG(EXCEPTION) << "Conv2d channels should be divided by group!"; | |||
| } | |||
| weight_shape.insert(weight_shape.begin(), group); | |||
| (void)weight_shape.insert(weight_shape.begin(), group); | |||
| weight_shape[1] = weight_shape[1] / group; | |||
| } | |||
| dnnl::memory::desc src_desc = GetDefaultMemDesc(src_shape); | |||
| @@ -64,13 +73,15 @@ void Conv2dGradInputCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| (void)std::transform(dilation_me.begin(), dilation_me.end(), std::back_inserter(dilation_ori), | |||
| [](const int64_t &value) { return static_cast<int>(value); }); | |||
| if (dilation_ori.size() != 4) { | |||
| if (dilation_ori.size() != kShapeSize4D) { | |||
| MS_LOG(EXCEPTION) << "Conv2dGradInputCPUKernel dilation must be 4d!"; | |||
| } | |||
| if (dilation_ori[0] != 1 || dilation_ori[1] != 1) { | |||
| MS_LOG(EXCEPTION) << "Conv2dGradInputCPUKernel dilation only support 1 in N axis and C axis!"; | |||
| } | |||
| if (stride_ori.size() < kShapeSize2D) { | |||
| MS_LOG(EXCEPTION) << "Conv2dGradInputCPUKernel stride_ori should not less than 2d!"; | |||
| } | |||
| std::vector<int> stride{stride_ori[0], stride_ori[1]}; | |||
| std::vector<int> dilation{dilation_ori[2], dilation_ori[3]}; | |||
| dnnl::memory::dims strides{stride_ori[0], stride_ori[1]}; | |||
| @@ -105,9 +116,8 @@ void Conv2dGradInputCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool Conv2dGradInputCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() < 2 || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "Error input output size!"; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kConv2dGradInputInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kConv2dGradInputOutputsNum, kernel_name_); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_DST, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_WEIGHTS, inputs[1]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_SRC, outputs[0]->addr); | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CONV2D_GRAD_INPUT_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CONV2D_GRAD_INPUT_CPU_KERNEL_H_ | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/conv_cpu_kernel.h" | |||
| #include <string> | |||
| #include <algorithm> | |||
| @@ -22,13 +23,17 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr size_t kConvInputTensorNum = 2; | |||
| namespace { | |||
| constexpr size_t kConvInputsNum = 2; | |||
| constexpr size_t kConvOutputsNum = 1; | |||
| constexpr size_t kShapeSize4D = 4; | |||
| constexpr size_t kShapeSize5D = 5; | |||
| constexpr size_t kKernelStartAxis = 2; | |||
| } // namespace | |||
| void ConvCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> weight_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| std::vector<size_t> dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| @@ -59,9 +64,9 @@ void ConvCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| auto stride_me = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, stride_attr); | |||
| auto dilation_me = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, dilation_attr); | |||
| (void)std::transform(stride_me.begin(), stride_me.end(), std::back_inserter(stride_ori), | |||
| [](const int64_t &value) { return static_cast<int>(value); }); | |||
| [](const int64_t &value) { return LongToInt(value); }); | |||
| (void)std::transform(dilation_me.begin(), dilation_me.end(), std::back_inserter(dilation_ori), | |||
| [](const int64_t &value) { return static_cast<int>(value); }); | |||
| [](const int64_t &value) { return LongToInt(value); }); | |||
| if (stride_ori.size() != src_dim) { | |||
| MS_LOG(EXCEPTION) << "Conv stride size must be " << src_dim << "D!"; | |||
| } | |||
| @@ -111,9 +116,8 @@ void ConvCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool ConvCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() < kConvInputTensorNum || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "Error input output size!"; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kConvInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kConvOutputsNum, kernel_name_); | |||
| SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_WEIGHTS, inputs[1]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr); | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CONV_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CONV_CPU_KERNEL_H_ | |||
| @@ -35,7 +36,6 @@ class ConvCPUKernel : public MKLCPUKernel { | |||
| MS_REG_CPU_KERNEL(Conv2D, KernelAttr(), ConvCPUKernel); | |||
| MS_REG_CPU_KERNEL(Conv3D, KernelAttr(), ConvCPUKernel); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -23,8 +23,17 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kAvgPoolingGradInputsNum = 3; | |||
| constexpr size_t kAvgPoolingGradOutputsNum = 1; | |||
| constexpr size_t kAvgPoolingGradKernelSize = 4; | |||
| constexpr size_t kAvgPoolingGradStrideSize = 4; | |||
| constexpr size_t kAvgPoolingGradPadSize = 2; | |||
| } // namespace | |||
| void AvgPoolingGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> dst_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| dnnl::memory::desc src_desc = GetDefaultMemDesc(src_shape); | |||
| @@ -34,10 +43,10 @@ void AvgPoolingGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| std::vector<int64_t> kernel_sizes_me = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, KERNEL_SIZE); | |||
| std::vector<int64_t> strides_me = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, STRIDES); | |||
| (void)std::transform(kernel_sizes_me.begin(), kernel_sizes_me.end(), std::back_inserter(origin_kernel_sizes), | |||
| [](const int64_t &value) { return static_cast<int>(value); }); | |||
| [](const int64_t &value) { return LongToInt(value); }); | |||
| (void)std::transform(strides_me.begin(), strides_me.end(), std::back_inserter(strides), | |||
| [](const int64_t &value) { return static_cast<int>(value); }); | |||
| if (origin_kernel_sizes.size() != 4 || strides.size() != 4) { | |||
| [](const int64_t &value) { return LongToInt(value); }); | |||
| if (origin_kernel_sizes.size() != kAvgPoolingGradKernelSize || strides.size() != kAvgPoolingGradStrideSize) { | |||
| MS_LOG(EXCEPTION) << "Invalid kernel size " << origin_kernel_sizes.size() << " or stride size " << strides.size(); | |||
| } | |||
| std::vector<int> stride{strides[2], strides[3]}; | |||
| @@ -49,7 +58,7 @@ void AvgPoolingGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| std::vector<size_t> kernel_size({IntToSize(origin_kernel_sizes[2]), IntToSize(origin_kernel_sizes[3])}); | |||
| std::vector<int> dummy_dilation{1, 1}; | |||
| GetPadding(kernel_node, pad_mode, src_shape, kernel_size, stride, &int_padding_l, &int_padding_r, dummy_dilation); | |||
| if (int_padding_l.size() != 2 || int_padding_r.size() != 2) { | |||
| if (int_padding_l.size() != kAvgPoolingGradPadSize || int_padding_r.size() != kAvgPoolingGradPadSize) { | |||
| MS_LOG(EXCEPTION) << "Pooling avg get padding failed"; | |||
| } | |||
| dnnl::memory::dims padding_l{int_padding_l[0], int_padding_l[1]}; | |||
| @@ -77,9 +86,8 @@ void AvgPoolingGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool AvgPoolingGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() < 3 || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "Pooling avg grad error input output size!"; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kAvgPoolingGradInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kAvgPoolingGradOutputsNum, kernel_name_); | |||
| SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST, inputs[1]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_DST, inputs[2]->addr); | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_POOLING_AVG_GRAD_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_POOLING_AVG_GRAD_CPU_KERNEL_H_ | |||
| @@ -23,8 +23,17 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kMaxPoolingGradInputsNum = 3; | |||
| constexpr size_t kMaxPoolingGradOutputsNum = 1; | |||
| constexpr size_t kMaxPoolingGradKernelSize = 4; | |||
| constexpr size_t kMaxPoolingGradStrideSize = 4; | |||
| constexpr size_t kMaxPoolingGradInputShapeSize = 4; | |||
| } // namespace | |||
| void MaxPoolingGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| src_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| dst_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| std::vector<int> kernel_sizes; | |||
| @@ -32,10 +41,11 @@ void MaxPoolingGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| auto kernel_sizes_me = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, KERNEL_SIZE); | |||
| auto strides_me = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, STRIDES); | |||
| (void)std::transform(kernel_sizes_me.begin(), kernel_sizes_me.end(), std::back_inserter(kernel_sizes), | |||
| [](const int64_t &value) { return static_cast<int>(value); }); | |||
| [](const int64_t &value) { return LongToInt(value); }); | |||
| (void)std::transform(strides_me.begin(), strides_me.end(), std::back_inserter(strides), | |||
| [](const int64_t &value) { return static_cast<int>(value); }); | |||
| if (kernel_sizes.size() != 4 || strides.size() != 4 || src_shape_.size() != 4 || dst_shape_.size() != 4) { | |||
| [](const int64_t &value) { return LongToInt(value); }); | |||
| if (kernel_sizes.size() != kMaxPoolingGradKernelSize || strides.size() != kMaxPoolingGradStrideSize || | |||
| src_shape_.size() != kMaxPoolingGradInputShapeSize || dst_shape_.size() != kMaxPoolingGradInputShapeSize) { | |||
| MS_LOG(EXCEPTION) << "Pooling grad invalid input size!"; | |||
| } | |||
| std::vector<int> padding_r; | |||
| @@ -105,9 +115,8 @@ void MaxPoolingGradCPUKernel::ChannelPoolingGrad(const float *input, const float | |||
| bool MaxPoolingGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() < 3 || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "Pooling grad error input output size!"; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kMaxPoolingGradInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kMaxPoolingGradOutputsNum, kernel_name_); | |||
| auto input = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto diff = reinterpret_cast<float *>(inputs[2]->addr); | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_POOLING_MAX_GRAD_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_POOLING_MAX_GRAD_CPU_KERNEL_H_ | |||
| @@ -174,35 +174,14 @@ | |||
| } \ | |||
| } | |||
| #define TRANSPOSE_MULTI_DIMS(TYPE, NAME) \ | |||
| int Transpose##NAME(const TYPE *in_data, TYPE *out_data, const int *strides, const int *out_strides, \ | |||
| const int *perm, const int *output_shape, int dims, int *size, int *position) { \ | |||
| if (size == NULL || position == NULL) { \ | |||
| return NNACL_ERR; \ | |||
| } \ | |||
| *(size + dims - 1) = 1; \ | |||
| for (int i = dims - 1; i > 0; --i) { \ | |||
| *(size + i - 1) = *(size + i) * output_shape[i]; \ | |||
| } \ | |||
| for (int idx = 0; idx < (*size) * output_shape[0]; ++idx) { \ | |||
| int pos = idx; \ | |||
| int output_idx = 0; \ | |||
| int input_idx = 0; \ | |||
| for (int i = 0; i < dims; ++i) { \ | |||
| *(position + i) = pos / *(size + i); \ | |||
| int out_stride = i < dims - 1 ? out_strides[i] : 1; \ | |||
| output_idx += (*(position + i) * out_stride); \ | |||
| input_idx += (*(position + i) * strides[perm[i]]); \ | |||
| pos -= *(position + i) * (*(size + i)); \ | |||
| } \ | |||
| out_data[output_idx] = in_data[input_idx]; \ | |||
| } \ | |||
| return NNACL_OK; \ | |||
| } | |||
| #define TRANSPOSE_DIMS(TYPE, NAME) \ | |||
| void TransposeDims##NAME(const TYPE *in_data, TYPE *out_data, const int *output_shape, \ | |||
| const TransposeParameter *transpose_param, int task_id, int thread_num) { \ | |||
| NNACL_CHECK_NULL_RETURN_VOID(in_data); \ | |||
| NNACL_CHECK_NULL_RETURN_VOID(out_data); \ | |||
| NNACL_CHECK_NULL_RETURN_VOID(output_shape); \ | |||
| NNACL_CHECK_NULL_RETURN_VOID(transpose_param); \ | |||
| NNACL_CHECK_ZERO_RETURN(thread_num); \ | |||
| const int *perm = transpose_param->perm_; \ | |||
| const int *strides = transpose_param->strides_; \ | |||
| const int *out_strides = transpose_param->out_strides_; \ | |||
| @@ -220,6 +199,7 @@ | |||
| int output_idx = 0; \ | |||
| int input_idx = 0; \ | |||
| for (int i = 0; i < num_axes; ++i) { \ | |||
| NNACL_CHECK_ZERO_RETURN(*(out_strides + i)); \ | |||
| int position = pos / *(out_strides + i); \ | |||
| int out_stride = i < num_axes - 1 ? out_strides[i] : 1; \ | |||
| output_idx += (position * out_stride); \ | |||
| @@ -230,69 +210,48 @@ | |||
| } \ | |||
| } | |||
| #define DOTRANSPOSE(TYPE, NAME) \ | |||
| int DoTranspose##NAME(const TYPE *in_data, TYPE *out_data, const int *output_shape, \ | |||
| const TransposeParameter *transpose_param) { \ | |||
| if (in_data == NULL || out_data == NULL) { \ | |||
| return NNACL_ERR; \ | |||
| } \ | |||
| const int *perm = transpose_param->perm_; \ | |||
| const int *strides = transpose_param->strides_; \ | |||
| const int *out_strides = transpose_param->out_strides_; \ | |||
| int data_size = transpose_param->data_num_ * sizeof(TYPE); \ | |||
| int num_axes = transpose_param->num_axes_; \ | |||
| bool needTranspose = false; \ | |||
| for (int i = 1; i < num_axes; ++i) { \ | |||
| if (perm[i] - perm[i - 1] != 1) { \ | |||
| needTranspose = true; \ | |||
| break; \ | |||
| } \ | |||
| } \ | |||
| if (!needTranspose) { \ | |||
| (void)memcpy(out_data, in_data, data_size); \ | |||
| return NNACL_OK; \ | |||
| } \ | |||
| for (int i = 0; i < num_axes; ++i) { \ | |||
| if (perm[i] < 0) { \ | |||
| return NNACL_PARAM_INVALID; \ | |||
| } \ | |||
| } \ | |||
| if (num_axes == 2) { \ | |||
| TransposeDim2##NAME(in_data, out_data, strides, out_strides, perm, output_shape); \ | |||
| } else if (num_axes == 3) { \ | |||
| TransposeDim3##NAME(in_data, out_data, strides, out_strides, perm, output_shape); \ | |||
| } else if (num_axes == 4) { \ | |||
| TransposeDim4##NAME(in_data, out_data, strides, out_strides, perm, output_shape); \ | |||
| } else if (num_axes == 5) { \ | |||
| TransposeDim5##NAME(in_data, out_data, strides, out_strides, perm, output_shape); \ | |||
| } else if (num_axes == 6) { \ | |||
| TransposeDim6##NAME(in_data, out_data, strides, out_strides, perm, output_shape); \ | |||
| } else { \ | |||
| int *size = (int *)(malloc(num_axes * sizeof(int))); \ | |||
| if (size == NULL) { \ | |||
| return NNACL_ERR; \ | |||
| } \ | |||
| int *position = (int *)(malloc(num_axes * sizeof(int))); \ | |||
| if (position == NULL) { \ | |||
| free(size); \ | |||
| size = NULL; \ | |||
| return NNACL_ERR; \ | |||
| } \ | |||
| int ret = \ | |||
| Transpose##NAME(in_data, out_data, strides, out_strides, perm, output_shape, num_axes, size, position); \ | |||
| if (size != NULL) { \ | |||
| free(size); \ | |||
| size = NULL; \ | |||
| } \ | |||
| if (position != NULL) { \ | |||
| free(position); \ | |||
| position = NULL; \ | |||
| } \ | |||
| if (ret != NNACL_OK) { \ | |||
| return NNACL_ERR; \ | |||
| } \ | |||
| } \ | |||
| return NNACL_OK; \ | |||
| #define DOTRANSPOSE(TYPE, NAME) \ | |||
| int DoTranspose##NAME(const TYPE *in_data, TYPE *out_data, const int *output_shape, \ | |||
| const TransposeParameter *transpose_param) { \ | |||
| NNACL_CHECK_NULL_RETURN_ERR(in_data); \ | |||
| NNACL_CHECK_NULL_RETURN_ERR(out_data); \ | |||
| NNACL_CHECK_NULL_RETURN_ERR(output_shape); \ | |||
| NNACL_CHECK_NULL_RETURN_ERR(transpose_param); \ | |||
| const int *perm = transpose_param->perm_; \ | |||
| const int *strides = transpose_param->strides_; \ | |||
| const int *out_strides = transpose_param->out_strides_; \ | |||
| int data_size = transpose_param->data_num_ * sizeof(TYPE); \ | |||
| int num_axes = transpose_param->num_axes_; \ | |||
| bool needTranspose = false; \ | |||
| for (int i = 1; i < num_axes; ++i) { \ | |||
| if (perm[i] - perm[i - 1] != 1) { \ | |||
| needTranspose = true; \ | |||
| break; \ | |||
| } \ | |||
| } \ | |||
| if (!needTranspose) { \ | |||
| (void)memcpy(out_data, in_data, data_size); \ | |||
| return NNACL_OK; \ | |||
| } \ | |||
| for (int i = 0; i < num_axes; ++i) { \ | |||
| if (perm[i] < 0) { \ | |||
| return NNACL_PARAM_INVALID; \ | |||
| } \ | |||
| } \ | |||
| if (num_axes == 2) { \ | |||
| TransposeDim2##NAME(in_data, out_data, strides, out_strides, perm, output_shape); \ | |||
| } else if (num_axes == 3) { \ | |||
| TransposeDim3##NAME(in_data, out_data, strides, out_strides, perm, output_shape); \ | |||
| } else if (num_axes == 4) { \ | |||
| TransposeDim4##NAME(in_data, out_data, strides, out_strides, perm, output_shape); \ | |||
| } else if (num_axes == 5) { \ | |||
| TransposeDim5##NAME(in_data, out_data, strides, out_strides, perm, output_shape); \ | |||
| } else if (num_axes == 6) { \ | |||
| TransposeDim6##NAME(in_data, out_data, strides, out_strides, perm, output_shape); \ | |||
| } else { \ | |||
| return NNACL_ERR; \ | |||
| } \ | |||
| return NNACL_OK; \ | |||
| } | |||
| #define TRANSPOSE_TEMPLATE(TYPE, NAME) \ | |||
| @@ -301,7 +260,6 @@ | |||
| TRANSPOSE_FOUR_DIMS(TYPE, NAME) \ | |||
| TRANSPOSE_FIVE_DIMS(TYPE, NAME) \ | |||
| TRANSPOSE_SIX_DIMS(TYPE, NAME) \ | |||
| TRANSPOSE_MULTI_DIMS(TYPE, NAME) \ | |||
| TRANSPOSE_DIMS(TYPE, NAME) \ | |||
| DOTRANSPOSE(TYPE, NAME) | |||
| @@ -19,6 +19,9 @@ | |||
| #define UNSORTEDSEGMENTSUM(type, type1) \ | |||
| int UnsortedSegmentSum_##type##_##type1(const type *input, int unit_num, int input_dim1, const type1 *indices, \ | |||
| type *output, int output_dim0, int output_dim1) { \ | |||
| NNACL_CHECK_NULL_RETURN_ERR(input); \ | |||
| NNACL_CHECK_NULL_RETURN_ERR(indices); \ | |||
| NNACL_CHECK_NULL_RETURN_ERR(output); \ | |||
| if (input_dim1 == 0) { \ | |||
| return NNACL_ERR; \ | |||
| } \ | |||
| @@ -17,6 +17,9 @@ | |||
| #include "nnacl/base/unstack_base.h" | |||
| void Unstack(const void *input, void **output, const UnstackParameter *para, int data_size) { | |||
| NNACL_CHECK_NULL_RETURN_VOID(input); | |||
| NNACL_CHECK_NULL_RETURN_VOID(output); | |||
| NNACL_CHECK_NULL_RETURN_VOID(para); | |||
| const int8_t *in_addr = (int8_t *)input; | |||
| for (int j = 0; j < para->num_; j++) { | |||
| int8_t *out_addr = (int8_t *)output[j]; | |||
| @@ -175,6 +175,11 @@ void Fp16TransposeDim6(const float16_t *in_data, float16_t *out_data, const int | |||
| void TransposeDimsFp16(const float16_t *in_data, float16_t *out_data, const int *output_shape, | |||
| const TransposeParameter *param, int task_id, int thread_num) { | |||
| NNACL_CHECK_NULL_RETURN_VOID(in_data); | |||
| NNACL_CHECK_NULL_RETURN_VOID(out_data); | |||
| NNACL_CHECK_NULL_RETURN_VOID(output_shape); | |||
| NNACL_CHECK_NULL_RETURN_VOID(param); | |||
| NNACL_CHECK_ZERO_RETURN(thread_num); | |||
| const int *perm = param->perm_; | |||
| const int *strides = param->strides_; | |||
| const int *out_strides = param->out_strides_; | |||
| @@ -192,6 +197,7 @@ void TransposeDimsFp16(const float16_t *in_data, float16_t *out_data, const int | |||
| int output_idx = 0; | |||
| int input_idx = 0; | |||
| for (int i = 0; i < num_axes; ++i) { | |||
| NNACL_CHECK_ZERO_RETURN(*(out_strides + i)); | |||
| int position = pos / *(out_strides + i); | |||
| int out_stride = i < num_axes - 1 ? out_strides[i] : 1; | |||
| output_idx += (position * out_stride); | |||
| @@ -204,9 +210,10 @@ void TransposeDimsFp16(const float16_t *in_data, float16_t *out_data, const int | |||
| int DoTransposeFp16(const float16_t *in_data, float16_t *out_data, const int *output_shape, | |||
| const TransposeParameter *param) { | |||
| if (in_data == NULL || out_data == NULL) { | |||
| return NNACL_ERR; | |||
| } | |||
| NNACL_CHECK_NULL_RETURN_ERR(in_data); | |||
| NNACL_CHECK_NULL_RETURN_ERR(out_data); | |||
| NNACL_CHECK_NULL_RETURN_ERR(output_shape); | |||
| NNACL_CHECK_NULL_RETURN_ERR(param); | |||
| const int *perm = param->perm_; | |||
| const int *strides = param->strides_; | |||
| const int *out_strides = param->out_strides_; | |||
| @@ -173,9 +173,11 @@ void TransposeDim6Fp32(const float *in_data, float *out_data, const int *strides | |||
| void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_shape, | |||
| const TransposeParameter *transpose_param, int task_id, int thread_num) { | |||
| if (thread_num == 0) { | |||
| return; | |||
| } | |||
| NNACL_CHECK_NULL_RETURN_VOID(in_data); | |||
| NNACL_CHECK_NULL_RETURN_VOID(out_data); | |||
| NNACL_CHECK_NULL_RETURN_VOID(output_shape); | |||
| NNACL_CHECK_NULL_RETURN_VOID(transpose_param); | |||
| NNACL_CHECK_ZERO_RETURN(thread_num); | |||
| int *perm = (int *)(transpose_param->perm_); | |||
| int *strides = (int *)(transpose_param->strides_); | |||
| int *out_strides = (int *)(transpose_param->out_strides_); | |||
| @@ -206,9 +208,10 @@ void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_ | |||
| int DoTransposeFp32(const float *in_data, float *out_data, const int *output_shape, | |||
| const TransposeParameter *transpose_param) { | |||
| if (in_data == NULL || out_data == NULL) { | |||
| return NNACL_ERR; | |||
| } | |||
| NNACL_CHECK_NULL_RETURN_ERR(in_data); | |||
| NNACL_CHECK_NULL_RETURN_ERR(out_data); | |||
| NNACL_CHECK_NULL_RETURN_ERR(output_shape); | |||
| NNACL_CHECK_NULL_RETURN_ERR(transpose_param); | |||
| int *perm = (int *)(transpose_param->perm_); | |||
| int *strides = (int *)(transpose_param->strides_); | |||
| int *out_strides = (int *)(transpose_param->out_strides_); | |||
| @@ -174,9 +174,10 @@ void TransposeDim6Int8(const int8_t *in_data, int8_t *out_data, const int *strid | |||
| int DoTransposeInt8(const int8_t *in_data, int8_t *out_data, const int *output_shape, | |||
| const TransposeParameter *transpose_param) { | |||
| if (in_data == NULL || out_data == NULL) { | |||
| return NNACL_NULL_PTR; | |||
| } | |||
| NNACL_CHECK_NULL_RETURN_ERR(in_data); | |||
| NNACL_CHECK_NULL_RETURN_ERR(out_data); | |||
| NNACL_CHECK_NULL_RETURN_ERR(output_shape); | |||
| NNACL_CHECK_NULL_RETURN_ERR(transpose_param); | |||
| const int *perm = transpose_param->perm_; | |||
| const int *strides = transpose_param->strides_; | |||
| @@ -222,6 +223,11 @@ int DoTransposeInt8(const int8_t *in_data, int8_t *out_data, const int *output_s | |||
| void TransposeDimsInt8(const int8_t *in_data, int8_t *out_data, const int *output_shape, | |||
| const TransposeParameter *transpose_param, int task_id, int thread_num) { | |||
| NNACL_CHECK_NULL_RETURN_VOID(in_data); | |||
| NNACL_CHECK_NULL_RETURN_VOID(out_data); | |||
| NNACL_CHECK_NULL_RETURN_VOID(output_shape); | |||
| NNACL_CHECK_NULL_RETURN_VOID(transpose_param); | |||
| NNACL_CHECK_ZERO_RETURN(thread_num); | |||
| const int *perm = transpose_param->perm_; | |||
| const int *strides = transpose_param->strides_; | |||
| const int *out_strides = transpose_param->out_strides_; | |||
| @@ -239,6 +245,7 @@ void TransposeDimsInt8(const int8_t *in_data, int8_t *out_data, const int *outpu | |||
| int output_idx = 0; | |||
| int input_idx = 0; | |||
| for (int i = 0; i < num_axes; ++i) { | |||
| NNACL_CHECK_ZERO_RETURN(*(out_strides + i)); | |||
| int position = pos / *(out_strides + i); | |||
| int out_stride = i < num_axes - 1 ? out_strides[i] : 1; | |||
| output_idx += (position * out_stride); | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/print_cpu_kernel.h" | |||
| #include <algorithm> | |||
| #include "ir/tensor.h" | |||
| @@ -24,6 +25,7 @@ namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T> | |||
| void PrintCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| size_t input_tensor_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| for (size_t i = 0; i < input_tensor_num; ++i) { | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i); | |||
| @@ -51,7 +53,7 @@ bool PrintCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| } else { | |||
| ShapeVector shape; | |||
| (void)std::transform(input_shapes_[i].begin(), input_shapes_[i].end(), std::back_inserter(shape), | |||
| [](const size_t &value) { return static_cast<int64_t>(value); }); | |||
| [](const size_t &value) { return SizeToLong(value); }); | |||
| Tensor tensor(data_type, shape, inputs[i]->addr, input_sizes_[i] * sizeof(T)); | |||
| std::cout << tensor.ToStringNoLimit() << std::endl; | |||
| } | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,8 +13,10 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_PRINT_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_PRINT_CPU_KERNEL_H_ | |||
| #include <memory> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| @@ -23,7 +23,11 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace ps { | |||
| constexpr size_t kEmbeddingLookUpProxyInputsNum = 2; | |||
| constexpr size_t kEmbeddingLookUpProxyOutputsNum = 1; | |||
| void EmbeddingLookUpProxyKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| EmbeddingLookUpCPUKernel::InitKernel(kernel_node); | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| auto indices_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| @@ -46,12 +50,12 @@ void EmbeddingLookUpProxyKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| key_ = AnfAlgo::GetNodeAttr<size_t>(kernel_node, kAttrPsKey); | |||
| } | |||
| std::vector<float> values; | |||
| std::transform(input_shape.begin(), input_shape.end(), std::back_inserter(values), | |||
| [](size_t dim) -> float { return SizeToFloat(dim); }); | |||
| std::transform(indices_shape.begin(), indices_shape.end(), std::back_inserter(values), | |||
| [](size_t dim) -> float { return SizeToFloat(dim); }); | |||
| std::transform(output_shape.begin(), output_shape.end(), std::back_inserter(values), | |||
| [](size_t dim) -> float { return SizeToFloat(dim); }); | |||
| (void)std::transform(input_shape.begin(), input_shape.end(), std::back_inserter(values), | |||
| [](size_t dim) -> float { return SizeToFloat(dim); }); | |||
| (void)std::transform(indices_shape.begin(), indices_shape.end(), std::back_inserter(values), | |||
| [](size_t dim) -> float { return SizeToFloat(dim); }); | |||
| (void)std::transform(output_shape.begin(), output_shape.end(), std::back_inserter(values), | |||
| [](size_t dim) -> float { return SizeToFloat(dim); }); | |||
| MS_LOG(INFO) << "Init embedding lookup proxy kernel, input shape:" << input_shape | |||
| << ", indices_shape:" << indices_shape << ", output_shape:" << output_shape; | |||
| std::vector<int64_t> lens{SizeToLong(input_shape.size()), SizeToLong(indices_shape.size()), | |||
| @@ -66,12 +70,8 @@ void EmbeddingLookUpProxyKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool EmbeddingLookUpProxyKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() != 2) { | |||
| MS_LOG(EXCEPTION) << "Inputs size is " << inputs.size() << ", but EmbeddingLookUpProxyKernel needs 2."; | |||
| } | |||
| if (outputs.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "Outputs size is " << outputs.size() << ", but EmbeddingLookUpProxyKernel needs 1."; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kEmbeddingLookUpProxyInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kEmbeddingLookUpProxyOutputsNum, kernel_name_); | |||
| auto indices_addr = reinterpret_cast<int *>(inputs[1]->addr); | |||
| auto output_addr = reinterpret_cast<float *>(outputs[0]->addr); | |||
| size_t input_size = inputs[1]->size; | |||
| @@ -84,7 +84,6 @@ bool EmbeddingLookUpProxyKernel::Launch(const std::vector<kernel::AddressPtr> &i | |||
| auto ret = memcpy_s(lookup_ids.data(), lookup_ids.size() * sizeof(int), indices_addr, input_size); | |||
| if (ret != EOK) { | |||
| MS_LOG(EXCEPTION) << "Lookup id memcpy failed."; | |||
| return false; | |||
| } | |||
| mindspore::ps::Worker::GetInstance().DoPSEmbeddingLookup(key_, lookup_ids, &lookup_result, | |||
| mindspore::ps::kEmbeddingLookupCmd); | |||
| @@ -92,7 +91,6 @@ bool EmbeddingLookUpProxyKernel::Launch(const std::vector<kernel::AddressPtr> &i | |||
| auto ret2 = memcpy_s(output_addr, outputs[0]->size, lookup_result.data(), output_size); | |||
| if (ret2 != EOK) { | |||
| MS_LOG(EXCEPTION) << "Lookup result memcpy failed."; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_EMBEDDING_LOOK_UP_PROXY_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_EMBEDDING_LOOK_UP_PROXY_KERNEL_H_ | |||
| @@ -93,15 +93,14 @@ void EmbeddingLookUpPSKernel::UpdateEmbeddings(float *embedding_table, const siz | |||
| size_t dest_len = copy_len; | |||
| for (size_t i = 0; i < ids_size; ++i) { | |||
| int index = SizeToInt(lookup_ids[i]) - LongToInt(offset_); | |||
| if (index >= 0 && index < SizeToInt(first_dim_size_)) { | |||
| auto ret = memcpy_s(embedding_table + IntToSize(index) * outer_dim_size_, dest_len, | |||
| update_vals + i * outer_dim_size_, copy_len); | |||
| if (ret != EOK) { | |||
| MS_LOG(EXCEPTION) << "LookUpTable task memcpy failed."; | |||
| } | |||
| } else { | |||
| if (index < 0 || index >= SizeToInt(first_dim_size_)) { | |||
| MS_LOG(EXCEPTION) << "UpdateEmbeddings index invalid."; | |||
| } | |||
| auto ret = memcpy_s(embedding_table + IntToSize(index) * outer_dim_size_, dest_len, | |||
| update_vals + i * outer_dim_size_, copy_len); | |||
| if (ret != EOK) { | |||
| MS_LOG(EXCEPTION) << "LookUpTable task memcpy failed."; | |||
| } | |||
| } | |||
| } | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_EMBEDDING_LOOK_UP_PS_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_EMBEDDING_LOOK_UP_PS_KERNEL_H_ | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -44,6 +44,7 @@ class PullKernel : public CPUKernel { | |||
| return true; | |||
| } | |||
| void Init(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 2) { | |||
| MS_LOG(ERROR) << "Input number is " << input_num << ", but pull needs 2 inputs."; | |||
| @@ -49,7 +49,6 @@ class PushKernel : public CPUKernel { | |||
| auto ret = memcpy_s(outputs[0]->addr, outputs[0]->size, &key_, sizeof(size_t)); | |||
| if (ret != EOK) { | |||
| MS_LOG(EXCEPTION) << "Lookup id memcpy failed."; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| @@ -27,6 +27,7 @@ constexpr size_t kSparseApplyAdamPSInputsShapeSize = 11; | |||
| void SparseApplyAdamPSKernel::InitKernel( | |||
| const CNodePtr &cnode, const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes) { | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| MS_EXCEPTION_IF_NULL(shapes); | |||
| const std::vector<std::shared_ptr<std::vector<size_t>>> &shape_vec = *shapes; | |||
| if (shape_vec.size() < kSparseApplyAdamPSInputsShapeSize) { | |||
| @@ -68,7 +69,7 @@ void SparseApplyAdamPSKernel::InitKernel( | |||
| MS_LOG(ERROR) << "The first dimension of grad shape must be equal to indices"; | |||
| } | |||
| if (AnfAlgo::HasNodeAttr(USE_NESTEROV, cnode)) { | |||
| use_nesterov_ = AnfAlgo::GetNodeAttr<bool>(cnode, "use_nesterov"); | |||
| use_nesterov_ = AnfAlgo::GetNodeAttr<bool>(cnode, USE_NESTEROV); | |||
| } | |||
| (void)workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float) * worker_num_); | |||
| (void)workspace_size_list_.emplace_back(indices_size_ * sizeof(int) * worker_num_); | |||
| @@ -79,7 +80,7 @@ void SparseApplyAdamPSKernel::InitKernel( | |||
| void SparseApplyAdamPSKernel::ReInit(const std::vector<std::vector<size_t>> &shapes) { | |||
| if (shapes.empty() || shapes[0].empty()) { | |||
| MS_LOG(EXCEPTION) << "Shape should not empty"; | |||
| MS_LOG(EXCEPTION) << "Shape is empty"; | |||
| } | |||
| const std::vector<size_t> &indices_shape = shapes[0]; | |||
| indices_size_ = indices_shape[0]; | |||
| @@ -24,6 +24,7 @@ constexpr size_t kSparseApplyFtrlPSInputSize = 5; | |||
| void SparseApplyFtrlPSKernel::InitKernel( | |||
| const CNodePtr &cnode, const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes) { | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| MS_EXCEPTION_IF_NULL(shapes); | |||
| const std::vector<std::shared_ptr<std::vector<size_t>>> &shape_vec = *shapes; | |||
| if (shape_vec.size() < kSparseApplyFtrlPSInputSize) { | |||
| @@ -46,7 +46,7 @@ class SparseApplyFtrlPSKernel : public SparseApplyFtrlCPUKernel, public PServerK | |||
| protected: | |||
| void ReInit(const std::vector<AddressPtr> &) override; | |||
| float init_accum_; | |||
| float init_accum_{0.1}; | |||
| }; | |||
| } // namespace ps | |||
| } // namespace kernel | |||
| @@ -23,14 +23,15 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace ps { | |||
| constexpr size_t kSparseApplyLazyAdamPSInputSize = 5; | |||
| constexpr size_t kSparseApplyLazyAdamPSInputsSize = 11; | |||
| void SparseApplyLazyAdamPSKernel::InitKernel( | |||
| const CNodePtr &cnode, const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &shapes) { | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| MS_EXCEPTION_IF_NULL(shapes); | |||
| const std::vector<std::shared_ptr<std::vector<size_t>>> &shape_vec = *shapes; | |||
| if (shape_vec.size() < kSparseApplyLazyAdamPSInputSize) { | |||
| MS_LOG(EXCEPTION) << "SparseApplyLazyAdamPSKernel needs " << kSparseApplyLazyAdamPSInputSize | |||
| if (shape_vec.size() < kSparseApplyLazyAdamPSInputsSize) { | |||
| MS_LOG(EXCEPTION) << "SparseApplyLazyAdamPSKernel needs " << kSparseApplyLazyAdamPSInputsSize | |||
| << " input shapes, but got " << shape_vec.size(); | |||
| } | |||
| std::vector<size_t> &var_shape = *(shape_vec[0]); | |||
| @@ -70,7 +71,7 @@ void SparseApplyLazyAdamPSKernel::InitKernel( | |||
| MS_LOG(ERROR) << "The first dimension of grad shape must be equal to indices"; | |||
| } | |||
| if (AnfAlgo::HasNodeAttr(USE_NESTEROV, cnode)) { | |||
| use_nesterov_ = AnfAlgo::GetNodeAttr<bool>(cnode, "use_nesterov"); | |||
| use_nesterov_ = AnfAlgo::GetNodeAttr<bool>(cnode, USE_NESTEROV); | |||
| } | |||
| (void)workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float) * worker_num_); | |||
| (void)workspace_size_list_.emplace_back(indices_size_ * sizeof(int) * worker_num_); | |||
| @@ -89,6 +90,10 @@ void SparseApplyLazyAdamPSKernel::ReInit(const std::vector<std::vector<size_t>> | |||
| } | |||
| void SparseApplyLazyAdamPSKernel::ReInit(const std::vector<AddressPtr> &inputs) { | |||
| if (inputs.size() < kSparseApplyLazyAdamPSInputsSize) { | |||
| MS_LOG(EXCEPTION) << "Input shape size should not less than " << kSparseApplyLazyAdamPSInputsSize << ", but got " | |||
| << inputs.size(); | |||
| } | |||
| const auto &indices_addr = inputs[10]; | |||
| indices_size_ = indices_addr->size / sizeof(int); | |||
| workspace_size_list_[0] = indices_size_ * var_outer_dim_size_ * sizeof(float) * worker_num_; | |||
| @@ -20,6 +20,13 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kUniformIntInputsNum = 3; | |||
| constexpr size_t kUniformRealInputsNum = 1; | |||
| constexpr size_t kUniformIntOutputsNum = 1; | |||
| constexpr size_t kUniformRealOutputsNum = 1; | |||
| constexpr size_t kStandardNormalOutputsNum = 1; | |||
| } // namespace | |||
| void StandardNormal(float *output, std::normal_distribution<float> distribution, | |||
| std::default_random_engine random_generator, size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| @@ -60,12 +67,6 @@ void LaunchStandardNormal(unsigned int seed, const std::vector<AddressPtr> &outp | |||
| void LaunchUniformInt(unsigned int seed, const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| if (inputs.size() != 3) { | |||
| MS_LOG(EXCEPTION) << "Expect input number 3, actual got input number " << inputs.size(); | |||
| } | |||
| if (outputs.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "Expect output number 1, actual got output number " << outputs.size(); | |||
| } | |||
| // Init min/max values. | |||
| int min_val = reinterpret_cast<int *>(inputs[1]->addr)[0]; | |||
| int max_val = reinterpret_cast<int *>(inputs[2]->addr)[0]; | |||
| @@ -75,7 +76,6 @@ void LaunchUniformInt(unsigned int seed, const std::vector<AddressPtr> &inputs, | |||
| // Init output address. | |||
| auto output = reinterpret_cast<int *>(outputs[0]->addr); | |||
| MS_EXCEPTION_IF_NULL(output); | |||
| // Init sample number. | |||
| size_t num_sample = outputs[0]->size / sizeof(int); | |||
| @@ -92,15 +92,8 @@ void LaunchUniformInt(unsigned int seed, const std::vector<AddressPtr> &inputs, | |||
| void LaunchUniformReal(unsigned int seed, const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| if (inputs.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "Expect input number 1, actual got input number " << inputs.size(); | |||
| } | |||
| if (outputs.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "Expect output number 1, actual got output number " << outputs.size(); | |||
| } | |||
| // Init output address. | |||
| auto output = reinterpret_cast<float *>(outputs[0]->addr); | |||
| MS_EXCEPTION_IF_NULL(output); | |||
| // Init sample number. | |||
| size_t num_sample = outputs[0]->size / sizeof(int); | |||
| @@ -117,24 +110,14 @@ void LaunchUniformReal(unsigned int seed, const std::vector<AddressPtr> &inputs, | |||
| void RandomCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node); | |||
| auto iter = kRandomOpTypeMap.find(kernel_name); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| auto iter = kRandomOpTypeMap.find(kernel_name_); | |||
| if (iter == kRandomOpTypeMap.end()) { | |||
| MS_LOG(EXCEPTION) << "Random operation " << kernel_name << " is not supported."; | |||
| MS_LOG(EXCEPTION) << "Random operation " << kernel_name_ << " is not supported."; | |||
| } else { | |||
| random_op_type_ = iter->second; | |||
| } | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if ((random_op_type_ == RANDOM_OP_NORMAL) && input_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but random op needs 1 input."; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but random op needs 1 output."; | |||
| } | |||
| seed_ = LongToInt(GetValue<int64_t>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("seed"))); | |||
| seed2_ = LongToInt(GetValue<int64_t>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("seed2"))); | |||
| } | |||
| @@ -152,10 +135,15 @@ bool RandomCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, cons | |||
| } | |||
| if (random_op_type_ == RANDOM_OP_NORMAL) { | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kStandardNormalOutputsNum, kernel_name_); | |||
| LaunchStandardNormal(RNG_seed, outputs); | |||
| } else if (random_op_type_ == RANDOM_OP_UNIFORM_INT) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kUniformIntInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kUniformIntOutputsNum, kernel_name_); | |||
| LaunchUniformInt(RNG_seed, inputs, outputs); | |||
| } else if (random_op_type_ == RANDOM_OP_UNIFORM_REAL) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kUniformRealInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kUniformRealOutputsNum, kernel_name_); | |||
| LaunchUniformReal(RNG_seed, inputs, outputs); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Random operation " << random_op_type_ << " is not supported."; | |||
| @@ -13,8 +13,10 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_RANDOM_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_RANDOM_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <string> | |||
| #include <map> | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/reduce_scatter_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "runtime/device/cpu/mpi/mpi_interface.h" | |||
| @@ -22,12 +23,15 @@ namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr auto kRanksGroup = "group"; | |||
| constexpr size_t kReduceScatterInputsNum = 1; | |||
| constexpr size_t kReduceScatterOutputsNum = 1; | |||
| } // namespace | |||
| ReduceScatterCPUKernel::ReduceScatterCPUKernel() : op_type_(kMPIOpTypeSum) {} | |||
| void ReduceScatterCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| auto primitive = AnfAlgo::GetCNodePrimitive(kernel_node); | |||
| MS_EXCEPTION_IF_NULL(primitive); | |||
| auto op = primitive->GetAttr("op"); | |||
| @@ -46,8 +50,10 @@ void ReduceScatterCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool ReduceScatterCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| auto input_addr = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto output_addr = reinterpret_cast<float *>(outputs[0]->addr); | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kReduceScatterInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kReduceScatterOutputsNum, kernel_name_); | |||
| auto *input_addr = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto *output_addr = reinterpret_cast<float *>(outputs[0]->addr); | |||
| auto output_data_num = outputs[0]->size / sizeof(float); | |||
| return MPIReduceScatter(input_addr, output_addr, ranks_group_, output_data_num, op_type_); | |||
| } | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,8 +13,10 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_SCATTER_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_SCATTER_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| @@ -19,37 +19,41 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kReshapeInputsNum = 1; | |||
| constexpr size_t kReshapeOutputsNum = 1; | |||
| } // namespace | |||
| void ReshapeCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| node_wpt_ = kernel_node; | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| x_data_type_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| type_size_ = GetTypeByte(TypeIdToType(x_data_type_)); | |||
| } | |||
| bool ReshapeCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| auto node_ = node_wpt_.lock(); | |||
| if (!node_) { | |||
| MS_LOG(EXCEPTION) << "node_wpt_ is expired."; | |||
| } | |||
| auto x_shape = AnfAlgo::GetPrevNodeOutputInferShape(node_, 0); | |||
| if (inputs.empty() || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "Input or output empty!"; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kReshapeInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kReshapeOutputsNum, kernel_name_); | |||
| if (inputs[0]->size != outputs[0]->size) { | |||
| return false; | |||
| } | |||
| if (inputs[0]->addr == outputs[0]->addr) { | |||
| return true; | |||
| } | |||
| auto node = node_wpt_.lock(); | |||
| if (!node) { | |||
| MS_LOG(EXCEPTION) << "node_wpt_ is expired."; | |||
| } | |||
| auto x_shape = AnfAlgo::GetPrevNodeOutputInferShape(node, 0); | |||
| size_t mem_bits = type_size_; | |||
| for (size_t i = 0; i < x_shape.size(); ++i) { | |||
| mem_bits *= x_shape[i]; | |||
| } | |||
| auto ret = memcpy_s(outputs[0]->addr, mem_bits, inputs[0]->addr, mem_bits); | |||
| if (ret != 0) { | |||
| if (ret != EOK) { | |||
| MS_LOG(EXCEPTION) << "memcpy_s error, errno: " << ret; | |||
| } | |||
| return true; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * Copyright 2021-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -21,24 +21,26 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kResizeBilinearInputSize = 4; | |||
| constexpr size_t kResizeBilinearInputsNum = 1; | |||
| constexpr size_t kResizeBilinearOutputsNum = 1; | |||
| constexpr size_t kResizeBilinearInputsShapeSize = 4; | |||
| constexpr size_t kResizeBilinearAttrSize = 2; | |||
| } // namespace | |||
| void ResizeBilinearCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| CheckParam(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| size_ = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, SIZE); | |||
| align_corners_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "align_corners"); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| if (shape_.size() < kResizeBilinearInputSize) { | |||
| MS_LOG(EXCEPTION) << "Input shape size should be " << kResizeBilinearInputSize << ", but got " << shape_.size(); | |||
| if (shape_.size() != kResizeBilinearInputsShapeSize) { | |||
| MS_LOG(EXCEPTION) << "Input shape size should be " << kResizeBilinearInputsShapeSize << ", but got " | |||
| << shape_.size(); | |||
| } | |||
| if (size_.size() < kResizeBilinearAttrSize) { | |||
| MS_LOG(EXCEPTION) << "Attr SIZE shape size should be " << kResizeBilinearAttrSize << ", but got " << size_.size(); | |||
| if (size_.size() != kResizeBilinearAttrSize) { | |||
| MS_LOG(EXCEPTION) << "Size attr requires " << kResizeBilinearAttrSize << " elements, but got " << size_.size(); | |||
| } | |||
| size_t in_height = shape_[2]; | |||
| size_t in_width = shape_[3]; | |||
| size_t out_height = size_[0]; | |||
| @@ -50,6 +52,8 @@ void ResizeBilinearCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool ResizeBilinearCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kResizeBilinearInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kResizeBilinearOutputsNum, kernel_name_); | |||
| if (dtype_ == kNumberTypeFloat16) { | |||
| LaunchKernel<float16, float16>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat32) { | |||
| @@ -62,10 +66,9 @@ bool ResizeBilinearCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inpu | |||
| template <typename T1, typename T2> | |||
| void ResizeBilinearCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| auto input_addr = reinterpret_cast<T1 *>(inputs[0]->addr); | |||
| auto output_addr = reinterpret_cast<T2 *>(outputs[0]->addr); | |||
| const std::vector<AddressPtr> &outputs) const { | |||
| const auto *input_addr = reinterpret_cast<T1 *>(inputs[0]->addr); | |||
| auto *output_addr = reinterpret_cast<T2 *>(outputs[0]->addr); | |||
| size_t batch_size = shape_[0]; | |||
| size_t channel = shape_[1]; | |||
| size_t in_height = shape_[2]; | |||
| @@ -84,7 +87,6 @@ void ResizeBilinearCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs | |||
| std::vector<CachedInterpolation> ys(out_height + 1); | |||
| std::vector<CachedInterpolation> xs(out_width + 1); | |||
| ComputeInterpolationWeights(out_height, in_height, height_scale, ys.data()); | |||
| ComputeInterpolationWeights(out_width, in_width, width_scale, xs.data()); | |||
| @@ -111,16 +113,5 @@ void ResizeBilinearCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs | |||
| } | |||
| } | |||
| } | |||
| void ResizeBilinearCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 1) { | |||
| MS_LOG(EXCEPTION) << "ResizeBilinear needs 1 inputs, but gets " << input_num; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "ResizeBilinear expects 1 output, but gets" << output_num; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -36,11 +36,10 @@ class ResizeBilinearCPUKernel : public CPUKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| template <typename T1, typename T2> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const; | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| TypeId dtype_{kTypeUnknown}; | |||
| bool align_corners_{false}; | |||
| float height_scale{1.0}; | |||
| @@ -21,23 +21,25 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kResizeBilinearGradInput0Size = 4; | |||
| constexpr size_t kResizeBilinearGradInput1Size = 4; | |||
| constexpr size_t kResizeBilinearGradInputsNum = 2; | |||
| constexpr size_t kResizeBilinearGradOutputNum = 1; | |||
| constexpr size_t kResizeBilinearGradInputsDoutShapeSize = 4; | |||
| constexpr size_t kResizeBilinearGradInputsXShapeSize = 4; | |||
| } // namespace | |||
| void ResizeBilinearGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| CheckParam(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| size_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| align_corners_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "align_corners"); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| if (shape_.size() < kResizeBilinearGradInput0Size) { | |||
| MS_LOG(EXCEPTION) << "Input_0 shape size should be " << kResizeBilinearGradInput0Size << ", but got " | |||
| dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0); | |||
| if (shape_.size() < kResizeBilinearGradInputsDoutShapeSize) { | |||
| MS_LOG(EXCEPTION) << "Input dout shape size should not be less than " << kResizeBilinearGradInputsDoutShapeSize << ", but got " | |||
| << shape_.size(); | |||
| } | |||
| if (size_.size() < kResizeBilinearGradInput1Size) { | |||
| MS_LOG(EXCEPTION) << "Input_1 shape size should be " << kResizeBilinearGradInput1Size << ", but got " | |||
| if (size_.size() < kResizeBilinearGradInputsXShapeSize) { | |||
| MS_LOG(EXCEPTION) << "Input x shape size should not be less than " << kResizeBilinearGradInputsXShapeSize << ", but got " | |||
| << size_.size(); | |||
| } | |||
| @@ -45,7 +47,6 @@ void ResizeBilinearGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| size_t in_width = shape_[3]; | |||
| size_t out_height = size_[2]; | |||
| size_t out_width = size_[3]; | |||
| height_scale = Scaling(out_height, in_height, align_corners_); | |||
| width_scale = Scaling(out_width, in_width, align_corners_); | |||
| } | |||
| @@ -53,6 +54,8 @@ void ResizeBilinearGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool ResizeBilinearGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kResizeBilinearGradInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kResizeBilinearGradOutputNum, kernel_name_); | |||
| if (dtype_ == kNumberTypeFloat16) { | |||
| LaunchKernel<float16>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat32) { | |||
| @@ -65,9 +68,9 @@ bool ResizeBilinearGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> & | |||
| template <typename T> | |||
| void ResizeBilinearGradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| auto dloss_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| const std::vector<AddressPtr> &outputs) const { | |||
| const auto *dloss_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| auto ret = memset_s(output_addr, outputs[0]->size, 0, outputs[0]->size); | |||
| if (ret != EOK) { | |||
| @@ -111,16 +114,5 @@ void ResizeBilinearGradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &in | |||
| } | |||
| } | |||
| } | |||
| void ResizeBilinearGradCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 2) { | |||
| MS_LOG(EXCEPTION) << "ResizeBilinearGrad needs 2 inputs, but gets " << input_num; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "ResizeBilinear Gradexpects 1 output, but gets" << output_num; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -36,15 +36,14 @@ class ResizeBilinearGradCPUKernel : public CPUKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const; | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| TypeId dtype_{kTypeUnknown}; | |||
| bool align_corners_ = false; | |||
| float height_scale = 1.; | |||
| float width_scale = 1.; | |||
| bool align_corners_{false}; | |||
| float height_scale{1.0}; | |||
| float width_scale{1.0}; | |||
| std::vector<size_t> size_; | |||
| std::vector<size_t> shape_; | |||
| }; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -21,24 +21,26 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kResizeNearestNeighborInputSize = 4; | |||
| constexpr size_t kResizeNearestNeighborOutputSize = 2; | |||
| constexpr size_t kResizeNearestNeighborInputsNum = 1; | |||
| constexpr size_t kResizeNearestNeighborOutputNum = 1; | |||
| constexpr size_t kResizeNearestNeighborInputsShapeSize = 4; | |||
| constexpr size_t kResizeNearestNeighborAttrSize = 2; | |||
| } // namespace | |||
| void ResizeNearestNeighborCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| CheckParam(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| std::vector<size_t> input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| std::vector<int64_t> output_size = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, SIZE); | |||
| align_corners_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "align_corners"); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| if (input_shape.size() < kResizeNearestNeighborInputSize) { | |||
| MS_LOG(EXCEPTION) << "Input_0 shape size should be " << kResizeNearestNeighborInputSize << ", but got " | |||
| if (input_shape.size() != kResizeNearestNeighborInputsShapeSize) { | |||
| MS_LOG(EXCEPTION) << "Input shape size should be " << kResizeNearestNeighborInputsShapeSize << ", but got " | |||
| << input_shape.size(); | |||
| } | |||
| if (output_size.size() < kResizeNearestNeighborOutputSize) { | |||
| MS_LOG(EXCEPTION) << "Output shape size should be " << kResizeNearestNeighborOutputSize << ", but got " | |||
| << output_size.size(); | |||
| if (output_size.size() != kResizeNearestNeighborAttrSize) { | |||
| MS_LOG(EXCEPTION) << "Size attr should be " << kResizeNearestNeighborAttrSize << ", but got " << output_size.size(); | |||
| } | |||
| batch_size_ = input_shape[0]; | |||
| @@ -55,6 +57,8 @@ void ResizeNearestNeighborCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool ResizeNearestNeighborCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kResizeNearestNeighborInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kResizeNearestNeighborOutputNum, kernel_name_); | |||
| if (dtype_ == kNumberTypeFloat16) { | |||
| LaunchKernel<float16>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat32) { | |||
| @@ -74,8 +78,8 @@ bool ResizeNearestNeighborCPUKernel::Launch(const std::vector<kernel::AddressPtr | |||
| template <typename T> | |||
| void ResizeNearestNeighborCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| auto input_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| if (out_height_ == in_height_ && out_width_ == in_width_) { | |||
| for (size_t i = 0; i < output_size_; ++i) { | |||
| @@ -99,16 +103,5 @@ void ResizeNearestNeighborCPUKernel::LaunchKernel(const std::vector<AddressPtr> | |||
| output_addr[i] = input_addr[input_pos]; | |||
| } | |||
| } | |||
| void ResizeNearestNeighborCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 1) { | |||
| MS_LOG(EXCEPTION) << "ResizeBilinear needs 1 inputs, but gets " << input_num; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "ResizeBilinear expects 1 output, but gets" << output_num; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -36,11 +36,10 @@ class ResizeNearestNeighborCPUKernel : public CPUKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| TypeId dtype_{kTypeUnknown}; | |||
| bool align_corners_{false}; | |||
| size_t batch_size_{0}; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -21,23 +21,27 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kResizeNearestNeighborGradInputSize = 4; | |||
| constexpr size_t kResizeNearestNeighborGradOutputSize = 4; | |||
| constexpr size_t kResizeNearestNeighborGradInputsNum = 1; | |||
| constexpr size_t kResizeNearestNeighborGradOutputNum = 1; | |||
| constexpr size_t kResizeNearestNeighborGradInputsShapeSize = 4; | |||
| constexpr size_t kResizeNearestNeighborGradOutputsShapeSize = 4; | |||
| } // namespace | |||
| void ResizeNearestNeighborGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| CheckParam(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| std::vector<size_t> input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| std::vector<size_t> output_size = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| align_corners_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "align_corners"); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| if (input_shape.size() < kResizeNearestNeighborGradInputSize) { | |||
| MS_LOG(EXCEPTION) << "Input_0 shape size should be " << kResizeNearestNeighborGradInputSize << ", but got " | |||
| dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0); | |||
| if (input_shape.size() != kResizeNearestNeighborGradInputsShapeSize) { | |||
| MS_LOG(EXCEPTION) << "Input shape size should be " << kResizeNearestNeighborGradInputsShapeSize << ", but got " | |||
| << input_shape.size(); | |||
| } | |||
| if (output_size.size() < kResizeNearestNeighborGradOutputSize) { | |||
| MS_LOG(EXCEPTION) << "Output shape size should be " << kResizeNearestNeighborGradOutputSize << ", but got " | |||
| if (output_size.size() != kResizeNearestNeighborGradOutputsShapeSize) { | |||
| MS_LOG(EXCEPTION) << "Output shape size should be " << kResizeNearestNeighborGradOutputsShapeSize << ", but got " | |||
| << output_size.size(); | |||
| } | |||
| @@ -54,6 +58,8 @@ void ResizeNearestNeighborGradCPUKernel::InitKernel(const CNodePtr &kernel_node) | |||
| bool ResizeNearestNeighborGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kResizeNearestNeighborGradInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kResizeNearestNeighborGradOutputNum, kernel_name_); | |||
| if (dtype_ == kNumberTypeFloat16) { | |||
| LaunchKernel<float16>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat32) { | |||
| @@ -73,9 +79,8 @@ bool ResizeNearestNeighborGradCPUKernel::Launch(const std::vector<kernel::Addres | |||
| template <typename T> | |||
| void ResizeNearestNeighborGradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| auto dloss_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| const auto *dloss_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| auto ret = memset_s(output_addr, outputs[0]->size, 0, outputs[0]->size); | |||
| if (ret != EOK) { | |||
| MS_LOG(EXCEPTION) << "Output buffer memset failed, ret:" << ret; | |||
| @@ -83,7 +88,6 @@ void ResizeNearestNeighborGradCPUKernel::LaunchKernel(const std::vector<AddressP | |||
| size_t in_hw_size = in_width_ * in_height_; | |||
| size_t out_hw_size = out_width_ * out_height_; | |||
| for (size_t b = 0; b < batch_size_; ++b) { | |||
| for (size_t c = 0; c < channel_; ++c) { | |||
| for (size_t h = 0; h < in_height_; ++h) { | |||
| @@ -102,16 +106,5 @@ void ResizeNearestNeighborGradCPUKernel::LaunchKernel(const std::vector<AddressP | |||
| } | |||
| } | |||
| } | |||
| void ResizeNearestNeighborGradCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 1) { | |||
| MS_LOG(EXCEPTION) << "ResizeBilinearGrad needs 1 inputs, but gets " << input_num; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "ResizeBilinear Gradexpects 1 output, but gets" << output_num; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -36,11 +36,10 @@ class ResizeNearestNeighborGradCPUKernel : public CPUKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| TypeId dtype_{kTypeUnknown}; | |||
| bool align_corners_{false}; | |||
| size_t batch_size_{0}; | |||
| @@ -21,6 +21,11 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kCenteredRMSPropInputsNum = 9; | |||
| constexpr size_t kRMSPropInputsNum = 5; | |||
| } // namespace | |||
| template <typename T> | |||
| void RMSPropCPUKernel<T>::LaunchRMSPropUnuseCenter(T *variable, T *mean_square, T *moment, T *gradients, | |||
| float *learning_rate) { | |||
| @@ -71,6 +76,7 @@ void RMSPropCPUKernel<T>::LaunchRMSPropUseCenter(T *variable, T *mean_square, T | |||
| template <typename T> | |||
| void RMSPropCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| auto node_name = AnfAlgo::GetCNodeName(kernel_node); | |||
| if (node_name == "ApplyCenteredRMSProp") { | |||
| use_center_ = true; | |||
| @@ -92,6 +98,7 @@ template <typename T> | |||
| bool RMSPropCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &) { | |||
| if (!use_center_) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kRMSPropInputsNum, kernel_name_); | |||
| float *variable = reinterpret_cast<float *>(inputs[0]->addr); | |||
| float *mean_square = reinterpret_cast<float *>(inputs[1]->addr); | |||
| float *moment = reinterpret_cast<float *>(inputs[2]->addr); | |||
| @@ -102,6 +109,7 @@ bool RMSPropCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| MS_LOG(INFO) << "RMSPropCPUKernel lens:" << lens << " size_:" << size_; | |||
| LaunchRMSPropUnuseCenter(variable, mean_square, moment, gradients, learning_rate); | |||
| } else { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kCenteredRMSPropInputsNum, kernel_name_); | |||
| T *variable = reinterpret_cast<float *>(inputs[0]->addr); | |||
| T *mean_gradients = reinterpret_cast<float *>(inputs[1]->addr); | |||
| T *mean_square = reinterpret_cast<float *>(inputs[2]->addr); | |||
| @@ -27,7 +27,7 @@ namespace kernel { | |||
| template <typename T> | |||
| class RMSPropCPUKernel : public CPUKernel { | |||
| public: | |||
| RMSPropCPUKernel() {} | |||
| RMSPropCPUKernel() = default; | |||
| ~RMSPropCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| @@ -22,15 +22,34 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kInputNum = 3; | |||
| constexpr size_t kOutputNum = 1; | |||
| constexpr size_t kScatterArithmeticInputsNum = 3; | |||
| constexpr size_t kScatterArithmeticOutputsNum = 1; | |||
| } // namespace | |||
| template <typename T> | |||
| void ScatterArithmeticCPUKernel<T>::InitComputeFunc() { | |||
| static const std::map<std::string, TypeComputeFunc> scatterArithmeticFuncMap{ | |||
| {prim::kPrimScatterAdd->name(), &ScatterArithmeticCPUKernel<T>::ScatterAdd}, | |||
| {prim::kPrimScatterSub->name(), &ScatterArithmeticCPUKernel<T>::ScatterSub}, | |||
| {prim::kPrimScatterMul->name(), &ScatterArithmeticCPUKernel<T>::ScatterMul}, | |||
| {prim::kPrimScatterDiv->name(), &ScatterArithmeticCPUKernel<T>::ScatterDiv}, | |||
| {prim::kPrimScatterMax->name(), &ScatterArithmeticCPUKernel<T>::ScatterMax}, | |||
| {prim::kPrimScatterMin->name(), &ScatterArithmeticCPUKernel<T>::ScatterMin}, | |||
| {prim::kPrimScatterUpdate->name(), &ScatterArithmeticCPUKernel<T>::ScatterUpdate}}; | |||
| if (scatterArithmeticFuncMap.find(kernel_name_) == scatterArithmeticFuncMap.end()) { | |||
| MS_LOG(EXCEPTION) << "ScatterArithmeticCPUKernel does not support " << kernel_name_; | |||
| } | |||
| compute_func_ = scatterArithmeticFuncMap.at(kernel_name_); | |||
| } | |||
| template <typename T> | |||
| void ScatterArithmeticCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| CheckParam(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| if (input_shape.size() < 1) { | |||
| MS_LOG(EXCEPTION) << "Input shape size should not less than 1"; | |||
| } | |||
| input_size_ = 1; | |||
| inner_size_ = 1; | |||
| if (input_shape.empty()) { | |||
| @@ -46,52 +65,30 @@ void ScatterArithmeticCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| for (size_t i = 0; i < indices_shape.size(); i++) { | |||
| indices_size_ *= indices_shape[i]; | |||
| } | |||
| } | |||
template <typename T>
// Validates the node's input/output arity and then initializes the dispatch
// function. Called from InitKernel before any shape information is read.
// NOTE(review): the error messages hard-code "ScatterAdd" and the literals 3/1
// although this kernel also serves ScatterSub/Mul/Div/Max/Min/Update — consider
// reporting the actual node name and the kInputNum/kOutputNum constants.
// NOTE(review): InitComputeFunc() mutates compute_func_, yet this member is
// declared const — looks like a leftover of an in-flight refactor; confirm.
void ScatterArithmeticCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) const {
  MS_EXCEPTION_IF_NULL(kernel_node);
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  if (input_num != kInputNum) {
    MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but ScatterAdd needs 3 inputs.";
  }
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  if (output_num != kOutputNum) {
    MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but ScatterAdd has 1 output.";
  }
  InitComputeFunc();
}
| template <typename T> | |||
| bool ScatterArithmeticCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace, | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| static const std::map<std::string, std::function<void(ScatterArithmeticCPUKernel *, T *, const int *, const T *)>> | |||
| kScatterArithmeticBinOpFuncMap{{"ScatterAdd", &ScatterArithmeticCPUKernel<T>::ScatterAdd}, | |||
| {"ScatterSub", &ScatterArithmeticCPUKernel<T>::ScatterSub}, | |||
| {"ScatterMul", &ScatterArithmeticCPUKernel<T>::ScatterMul}, | |||
| {"ScatterDiv", &ScatterArithmeticCPUKernel<T>::ScatterDiv}, | |||
| {"ScatterMax", &ScatterArithmeticCPUKernel<T>::ScatterMax}, | |||
| {"ScatterMin", &ScatterArithmeticCPUKernel<T>::ScatterMin}, | |||
| {"ScatterUpdate", &ScatterArithmeticCPUKernel<T>::ScatterUpdate}}; | |||
| if (kScatterArithmeticBinOpFuncMap.find(kernel_name_) != kScatterArithmeticBinOpFuncMap.end()) { | |||
| T *input = reinterpret_cast<T *>(inputs[INPUT]->addr); | |||
| int *indices = reinterpret_cast<int *>(inputs[INDICES]->addr); | |||
| T *updates = reinterpret_cast<T *>(inputs[UPDATES]->addr); | |||
| T *output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| kScatterArithmeticBinOpFuncMap.at(kernel_name_)(this, input, indices, updates); | |||
| auto bufferSize = outputs[0]->size; | |||
| auto ret = memcpy_s(output, bufferSize, input, input_size_ * sizeof(T)); | |||
| if (ret != EOK) { | |||
| MS_LOG(EXCEPTION) << "Memory copy failed!"; | |||
| } | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Not support operator:" << kernel_name_; | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kScatterArithmeticInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kScatterArithmeticOutputsNum, kernel_name_); | |||
| auto *input = reinterpret_cast<T *>(inputs[INPUT_INDEX_]->addr); | |||
| auto *indices = reinterpret_cast<int *>(inputs[INDICES_INDEX_]->addr); | |||
| auto *updates = reinterpret_cast<T *>(inputs[UPDATES_INDEX_]->addr); | |||
| auto *output = reinterpret_cast<T *>(outputs[OUTPUT_INDEX_]->addr); | |||
| compute_func_(this, input, indices, updates); | |||
| auto bufferSize = outputs[OUTPUT_INDEX_]->size; | |||
| auto ret = memcpy_s(output, bufferSize, input, input_size_ * sizeof(T)); | |||
| if (ret != EOK) { | |||
| MS_LOG(EXCEPTION) << "Memory copy failed!"; | |||
| } | |||
| return true; | |||
| } | |||
| template <typename T> | |||
| void ScatterArithmeticCPUKernel<T>::ScatterAdd(T *input, const int *indices, const T *updates) { | |||
| void ScatterArithmeticCPUKernel<T>::ScatterAdd(T *input, const int *indices, const T *updates) const { | |||
| for (size_t i = 0; i < indices_size_; i++) { | |||
| auto base_index_updates = i * inner_size_; | |||
| auto base_index_input = indices[i] * inner_size_; | |||
| @@ -102,7 +99,7 @@ void ScatterArithmeticCPUKernel<T>::ScatterAdd(T *input, const int *indices, con | |||
| } | |||
| template <typename T> | |||
| void ScatterArithmeticCPUKernel<T>::ScatterSub(T *input, const int *indices, const T *updates) { | |||
| void ScatterArithmeticCPUKernel<T>::ScatterSub(T *input, const int *indices, const T *updates) const { | |||
| for (size_t i = 0; i < indices_size_; i++) { | |||
| auto base_index_updates = i * inner_size_; | |||
| auto base_index_input = indices[i] * inner_size_; | |||
| @@ -113,7 +110,7 @@ void ScatterArithmeticCPUKernel<T>::ScatterSub(T *input, const int *indices, con | |||
| } | |||
| template <typename T> | |||
| void ScatterArithmeticCPUKernel<T>::ScatterMul(T *input, const int *indices, const T *updates) { | |||
| void ScatterArithmeticCPUKernel<T>::ScatterMul(T *input, const int *indices, const T *updates) const { | |||
| for (size_t i = 0; i < indices_size_; i++) { | |||
| auto base_index_updates = i * inner_size_; | |||
| auto base_index_input = indices[i] * inner_size_; | |||
| @@ -124,32 +121,32 @@ void ScatterArithmeticCPUKernel<T>::ScatterMul(T *input, const int *indices, con | |||
| } | |||
| template <typename T> | |||
| void ScatterArithmeticCPUKernel<T>::ScatterDiv(T *input, const int *indices, const T *updates) { | |||
| void ScatterArithmeticCPUKernel<T>::ScatterDiv(T *input, const int *indices, const T *updates) const { | |||
| for (size_t i = 0; i < indices_size_; i++) { | |||
| for (size_t j = 0; j < inner_size_; j++) { | |||
| auto dividend = input[indices[i] * inner_size_ + j]; | |||
| auto divisor = updates[i * inner_size_ + j]; | |||
| if (divisor == 0) { | |||
| if (dividend == 0) { | |||
| input[indices[i] * inner_size_ + j] = std::numeric_limits<T>::quiet_NaN(); | |||
| continue; | |||
| } | |||
| if (std::numeric_limits<T>::has_infinity) { | |||
| input[indices[i] * inner_size_ + j] = | |||
| dividend > 0 ? std::numeric_limits<T>::infinity() : -std::numeric_limits<T>::infinity(); | |||
| } else { | |||
| input[indices[i] * inner_size_ + j] = | |||
| dividend > 0 ? std::numeric_limits<T>::max() : std::numeric_limits<T>::min(); | |||
| } | |||
| if (divisor != 0) { | |||
| input[indices[i] * inner_size_ + j] = dividend / divisor; | |||
| continue; | |||
| } | |||
| if (dividend == 0) { | |||
| input[indices[i] * inner_size_ + j] = std::numeric_limits<T>::quiet_NaN(); | |||
| continue; | |||
| } | |||
| input[indices[i] * inner_size_ + j] = dividend / divisor; | |||
| if (std::numeric_limits<T>::has_infinity) { | |||
| input[indices[i] * inner_size_ + j] = | |||
| dividend > 0 ? std::numeric_limits<T>::infinity() : -std::numeric_limits<T>::infinity(); | |||
| } else { | |||
| input[indices[i] * inner_size_ + j] = | |||
| dividend > 0 ? std::numeric_limits<T>::max() : std::numeric_limits<T>::min(); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| template <typename T> | |||
| void ScatterArithmeticCPUKernel<T>::ScatterMax(T *input, const int *indices, const T *updates) { | |||
| void ScatterArithmeticCPUKernel<T>::ScatterMax(T *input, const int *indices, const T *updates) const { | |||
| for (size_t i = 0; i < indices_size_; i++) { | |||
| auto base_index_updates = i * inner_size_; | |||
| auto base_index_input = indices[i] * inner_size_; | |||
| @@ -162,7 +159,7 @@ void ScatterArithmeticCPUKernel<T>::ScatterMax(T *input, const int *indices, con | |||
| } | |||
| template <typename T> | |||
| void ScatterArithmeticCPUKernel<T>::ScatterMin(T *input, const int *indices, const T *updates) { | |||
| void ScatterArithmeticCPUKernel<T>::ScatterMin(T *input, const int *indices, const T *updates) const { | |||
| for (size_t i = 0; i < indices_size_; i++) { | |||
| auto base_index_updates = i * inner_size_; | |||
| auto base_index_input = indices[i] * inner_size_; | |||
| @@ -175,7 +172,7 @@ void ScatterArithmeticCPUKernel<T>::ScatterMin(T *input, const int *indices, con | |||
| } | |||
| template <typename T> | |||
| void ScatterArithmeticCPUKernel<T>::ScatterUpdate(T *input, const int *indices, const T *updates) { | |||
| void ScatterArithmeticCPUKernel<T>::ScatterUpdate(T *input, const int *indices, const T *updates) const { | |||
| for (size_t i = 0; i < indices_size_; i++) { | |||
| auto base_index_updates = i * inner_size_; | |||
| auto base_index_input = indices[i] * inner_size_; | |||
| @@ -37,27 +37,25 @@ class ScatterArithmeticCPUKernel : public CPUKernel { | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node) const; | |||
| void ScatterAdd(T *input, const int *indices, const T *updates); | |||
| void ScatterSub(T *input, const int *indices, const T *updates); | |||
| void ScatterMul(T *input, const int *indices, const T *updates); | |||
| void ScatterDiv(T *input, const int *indices, const T *updates); | |||
| void ScatterMax(T *input, const int *indices, const T *updates); | |||
| void ScatterMin(T *input, const int *indices, const T *updates); | |||
| void ScatterUpdate(T *input, const int *indices, const T *updates); | |||
| size_t input_size_{1}; | |||
| size_t inner_size_{1}; | |||
| size_t indices_size_{1}; | |||
| std::string kernel_name_; | |||
| enum input_list_ { INPUT, INDICES, UPDATES }; | |||
| void InitComputeFunc(); | |||
| void ScatterAdd(T *input, const int *indices, const T *updates) const; | |||
| void ScatterSub(T *input, const int *indices, const T *updates) const; | |||
| void ScatterMul(T *input, const int *indices, const T *updates) const; | |||
| void ScatterDiv(T *input, const int *indices, const T *updates) const; | |||
| void ScatterMax(T *input, const int *indices, const T *updates) const; | |||
| void ScatterMin(T *input, const int *indices, const T *updates) const; | |||
| void ScatterUpdate(T *input, const int *indices, const T *updates) const; | |||
| using TypeComputeFunc = std::function<void(ScatterArithmeticCPUKernel *, T *, const int *, const T *)>; | |||
| TypeComputeFunc compute_func_; | |||
| size_t input_size_{0}; | |||
| size_t inner_size_{0}; | |||
| size_t indices_size_{0}; | |||
| const size_t INPUT_INDEX_{0}; | |||
| const size_t INDICES_INDEX_{1}; | |||
| const size_t UPDATES_INDEX_{2}; | |||
| const size_t OUTPUT_INDEX_{0}; | |||
| }; | |||
| MS_REG_CPU_KERNEL_T(ScatterAdd, | |||
| @@ -22,14 +22,21 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kScatterNdUpdateInputsNum = 3; | |||
| constexpr size_t kScatterNdUpdateOutputsNum = 1; | |||
| constexpr size_t kMinIndiceRank = 2; | |||
| template <typename T> | |||
| void Compute(const ComputeParams<T> *params, const size_t start, const size_t end) { | |||
| MS_EXCEPTION_IF_NULL(params); | |||
| T *x = params->x_; | |||
| int *indices = params->indices_; | |||
| T *updates = params->updates_; | |||
| std::vector<int> *out_strides = params->out_strides_; | |||
| MS_EXCEPTION_IF_NULL(x); | |||
| MS_EXCEPTION_IF_NULL(indices); | |||
| MS_EXCEPTION_IF_NULL(updates); | |||
| MS_EXCEPTION_IF_NULL(out_strides); | |||
| for (int i = SizeToInt(start); i < SizeToInt(end); ++i) { | |||
| int offset = 0; | |||
| @@ -51,7 +58,7 @@ void Compute(const ComputeParams<T> *params, const size_t start, const size_t en | |||
| void ScatterNdUpdateCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| Check(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| auto shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| auto indices_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| auto updates_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2); | |||
| @@ -93,6 +100,8 @@ void ScatterNdUpdateCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool ScatterNdUpdateCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kScatterNdUpdateInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kScatterNdUpdateOutputsNum, kernel_name_); | |||
| if (dtype_ == kNumberTypeFloat16) { | |||
| LaunchKernel<float16>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat32) { | |||
| @@ -136,16 +145,5 @@ void ScatterNdUpdateCPUKernel::LaunchKernel(const std::vector<AddressPtr> &input | |||
| MS_LOG(EXCEPTION) << "memcpy_s error, errorno" << ret; | |||
| } | |||
| } | |||
| void ScatterNdUpdateCPUKernel::Check(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 3) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but ScatterNdUpdate needs 3 input."; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but ScatterNdUpdate needs 1 output."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -46,11 +46,10 @@ class ScatterNdUpdateCPUKernel : public CPUKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs); | |||
| private: | |||
| void Check(const CNodePtr &kernel_node); | |||
| TypeId dtype_{kTypeUnknown}; | |||
| int unit_size_{0}; | |||
| size_t num_units_{0}; | |||
| @@ -23,13 +23,14 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kInputSize = 2; | |||
| constexpr size_t kOutputSize = 1; | |||
| constexpr size_t kSearchSortedInputsNum = 2; | |||
| constexpr size_t kSearchSortedOutputsNum = 1; | |||
| } // namespace | |||
| template <typename S, typename T> | |||
| void SearchSortedCPUKernel<S, T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| right_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "right"); | |||
| sequence_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| values_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| @@ -76,16 +77,8 @@ bool SearchSortedCPUKernel<S, T>::Launch(const std::vector<kernel::AddressPtr> & | |||
| template <typename S, typename T> | |||
| void SearchSortedCPUKernel<S, T>::CheckParam(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| // inputs: sequence, values | |||
| if (inputs.size() != kInputSize) { | |||
| MS_LOG(EXCEPTION) << "Input number is: " << inputs.size() << ", but SearchSorted needs" << kInputSize << " inputs."; | |||
| } | |||
| // outputs: positions | |||
| if (outputs.size() != kOutputSize) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << outputs.size() << ", but SearchSorted needs " << kOutputSize | |||
| << " outputs"; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kSearchSortedInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kSearchSortedOutputsNum, kernel_name_); | |||
| if (outputs[0]->size / sizeof(T) != inputs[1]->size / sizeof(S)) { | |||
| MS_LOG(EXCEPTION) << "The output dimensions " << outputs[0]->size << " must match the dimensions of input values " | |||
| @@ -39,10 +39,10 @@ class SearchSortedCPUKernel : public CPUKernel { | |||
| void CheckParam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| bool right_{false}; | |||
| size_t search_len{0}; | |||
| std::vector<size_t> sequence_shape_; | |||
| std::vector<size_t> values_shape_; | |||
| std::vector<size_t> output_shape_; | |||
| size_t search_len{0}; | |||
| }; | |||
| MS_REG_CPU_KERNEL_T_S( | |||
| @@ -19,31 +19,30 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kSelectInputsNum = 3; | |||
| constexpr size_t kSelectOutputsNum = 1; | |||
| } // namespace | |||
| template <typename T> | |||
| void SelectCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 3) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but SelectCpuKernel needs 3 input."; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but SelectCpuKernel needs 1 output."; | |||
| } | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| auto shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| for (size_t x : shape) { | |||
| element_num_ *= x; | |||
| } | |||
| return; | |||
| } | |||
| template <typename T> | |||
| bool SelectCPUKernel<T>::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| auto input_cond = reinterpret_cast<bool *>(inputs[0]->addr); | |||
| auto input_x = reinterpret_cast<T *>(inputs[1]->addr); | |||
| auto input_y = reinterpret_cast<T *>(inputs[2]->addr); | |||
| auto output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kSelectInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kSelectOutputsNum, kernel_name_); | |||
| auto *input_cond = reinterpret_cast<bool *>(inputs[0]->addr); | |||
| auto *input_x = reinterpret_cast<T *>(inputs[1]->addr); | |||
| auto *input_y = reinterpret_cast<T *>(inputs[2]->addr); | |||
| auto *output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| for (size_t pos = 0; pos < element_num_; pos++) { | |||
| output[pos] = input_cond[pos] ? input_x[pos] : input_y[pos]; | |||
| } | |||
| @@ -15,42 +15,29 @@ | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/sgd_cpu_kernel.h" | |||
| #include <thread> | |||
| #include <vector> | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kInputSize = 6; | |||
| constexpr size_t kOutputSize = 1; | |||
| constexpr size_t kSGDInputsNum = 6; | |||
| constexpr size_t kSGDOutputsNum = 1; | |||
| } // namespace | |||
| template <typename T> | |||
| void SGDCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| dampening_ = AnfAlgo::GetNodeAttr<float>(kernel_node, "dampening"); | |||
| weight_decay_ = AnfAlgo::GetNodeAttr<float>(kernel_node, "weight_decay"); | |||
| nesterov_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "nesterov"); | |||
| } | |||
| template <typename T> | |||
| void SGDCPUKernel<T>::CheckParam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) { | |||
| // inputs: param, grad, lr, accum, momentum, stat | |||
| if (inputs.size() != kInputSize) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << inputs.size() << ", but SGD needs 6 inputs."; | |||
| } | |||
| // output: output_param | |||
| if (outputs.size() != kOutputSize) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << outputs.size() << ", but SGD needs 1 outputs."; | |||
| } | |||
| } | |||
| template <typename T> | |||
| bool SGDCPUKernel<T>::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| CheckParam(inputs, outputs); | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kSGDInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kSGDOutputsNum, kernel_name_); | |||
| auto param = reinterpret_cast<T *>(inputs[PARAM]->addr); | |||
| auto grad = reinterpret_cast<T *>(inputs[GRAD]->addr); | |||
| auto lr = reinterpret_cast<T *>(inputs[LR]->addr); | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SGD_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SGD_CPU_KERNEL_H_ | |||
| @@ -35,7 +36,6 @@ class SGDCPUKernel : public CPUKernel { | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| static void CheckParam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| float dampening_{0.0}; | |||
| float weight_decay_{0.0}; | |||
| bool nesterov_{true}; | |||
| @@ -19,9 +19,14 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kSigmoidCrossEntropyWithLogitsInputsNum = 2; | |||
| constexpr size_t kSigmoidCrossEntropyWithLogitsOutputsNum = 1; | |||
| } // namespace | |||
| void SigmoidCrossEntropyWithLogitsCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| CheckParam(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| std::vector<size_t> x_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| for (const uint64_t &d : x_shape) { | |||
| @@ -45,12 +50,14 @@ bool SigmoidCrossEntropyWithLogitsCPUKernel::Launch(const std::vector<kernel::Ad | |||
| template <typename T> | |||
| void SigmoidCrossEntropyWithLogitsCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| auto logits_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto labels_addr = reinterpret_cast<T *>(inputs[1]->addr); | |||
| auto output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| T zero = (T)0.0; | |||
| T one = (T)1.0; | |||
| T two = (T)2.0; | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kSigmoidCrossEntropyWithLogitsInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kSigmoidCrossEntropyWithLogitsOutputsNum, kernel_name_); | |||
| auto *logits_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *labels_addr = reinterpret_cast<T *>(inputs[1]->addr); | |||
| auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| auto zero = static_cast<T>(0.0); | |||
| auto one = static_cast<T>(1.0); | |||
| auto two = static_cast<T>(2.0); | |||
| for (uint64_t i = 0; i < tensor_size_; ++i) { | |||
| if (logits_addr[i] >= zero) { | |||
| output_addr[i] = static_cast<T>(log1p(static_cast<float>(exp(logits_addr[i] - two * logits_addr[i])))) - | |||
| @@ -60,16 +67,5 @@ void SigmoidCrossEntropyWithLogitsCPUKernel::LaunchKernel(const std::vector<Addr | |||
| } | |||
| } | |||
| } | |||
| void SigmoidCrossEntropyWithLogitsCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 2) { | |||
| MS_LOG(EXCEPTION) << "SigmoidCrossEntropyWithLogitsCPUKernel needs 2 inputs, but gets " << input_num; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "SigmoidCrossEntropyWithLogitsCPUKernel expects 1 output, but gets" << output_num; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -16,6 +16,7 @@ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_ | |||
| #include <memory> | |||
| #include <unordered_map> | |||
| #include <vector> | |||
| @@ -34,11 +35,10 @@ class SigmoidCrossEntropyWithLogitsCPUKernel : public CPUKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| TypeId dtype_{kTypeUnknown}; | |||
| uint64_t tensor_size_{1}; | |||
| }; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -19,9 +19,14 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kSigmoidCrossEntropyWithLogitsGradInputsNum = 3; | |||
| constexpr size_t kSigmoidCrossEntropyWithLogitsGradOutputsNum = 1; | |||
| } // namespace | |||
| void SigmoidCrossEntropyWithLogitsGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| CheckParam(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| std::vector<size_t> x_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| for (const uint64_t &d : x_shape) { | |||
| @@ -32,6 +37,8 @@ void SigmoidCrossEntropyWithLogitsGradCPUKernel::InitKernel(const CNodePtr &kern | |||
| bool SigmoidCrossEntropyWithLogitsGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kSigmoidCrossEntropyWithLogitsGradInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kSigmoidCrossEntropyWithLogitsGradOutputsNum, kernel_name_); | |||
| if (dtype_ == kNumberTypeFloat16) { | |||
| LaunchKernel<float16>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat32 || dtype_ == kNumberTypeFloat64) { | |||
| @@ -45,12 +52,12 @@ bool SigmoidCrossEntropyWithLogitsGradCPUKernel::Launch(const std::vector<kernel | |||
| template <typename T> | |||
| void SigmoidCrossEntropyWithLogitsGradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| auto logits_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto labels_addr = reinterpret_cast<T *>(inputs[1]->addr); | |||
| auto dloss_addr = reinterpret_cast<T *>(inputs[2]->addr); | |||
| auto output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| T zero = (T)0.0; | |||
| T one = (T)1.0; | |||
| auto *logits_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *labels_addr = reinterpret_cast<T *>(inputs[1]->addr); | |||
| auto *dloss_addr = reinterpret_cast<T *>(inputs[2]->addr); | |||
| auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| auto zero = static_cast<T>(0.0); | |||
| auto one = static_cast<T>(1.0); | |||
| for (uint64_t i = 0; i < tensor_size_; ++i) { | |||
| if (logits_addr[i] >= zero) { | |||
| output_addr[i] = (one / (one + static_cast<T>(exp(-logits_addr[i]))) - labels_addr[i]) * dloss_addr[i]; | |||
| @@ -60,16 +67,5 @@ void SigmoidCrossEntropyWithLogitsGradCPUKernel::LaunchKernel(const std::vector< | |||
| } | |||
| } | |||
| } | |||
| void SigmoidCrossEntropyWithLogitsGradCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 3) { | |||
| MS_LOG(EXCEPTION) << "SigmoidCrossEntropyWithLogitsCPUKernel needs 2 inputs, but gets " << input_num; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "SigmoidCrossEntropyWithLogitsCPUKernel expects 1 output, but gets" << output_num; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -16,6 +16,7 @@ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_GRAD_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SIGMOID_CROSS_ENTROPY_WITH_LOGITS_GRAD_CPU_KERNEL_H_ | |||
| #include <memory> | |||
| #include <unordered_map> | |||
| #include <vector> | |||
| @@ -34,11 +35,10 @@ class SigmoidCrossEntropyWithLogitsGradCPUKernel : public CPUKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| TypeId dtype_{kTypeUnknown}; | |||
| uint64_t tensor_size_{1}; | |||
| }; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -15,15 +15,18 @@ | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/slice_cpu_kernel.h" | |||
| #include <algorithm> | |||
| #include <unordered_map> | |||
| #include "common/thread_pool.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kSliceInputsNum = 1; | |||
| constexpr size_t kSliceOutputsNum = 1; | |||
| } // namespace | |||
| int NormalizeBeginPos(int begin_pos, int dim_len) { | |||
| if (begin_pos < 0) { | |||
| int normal_pos = begin_pos + dim_len; | |||
| @@ -34,6 +37,7 @@ int NormalizeBeginPos(int begin_pos, int dim_len) { | |||
| void SliceCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| static const std::unordered_map<TypeId, int> type_size_map = {{kNumberTypeBool, sizeof(bool)}, | |||
| {kNumberTypeInt32, sizeof(int)}, | |||
| {kNumberTypeFloat32, sizeof(float)}, | |||
| @@ -84,29 +88,29 @@ void SliceCPUKernel::InitSliceParam(const std::vector<size_t> &input_shape, cons | |||
| slice_param_.param_length_ = DIMENSION_8D; | |||
| } | |||
| void SliceSimpleDim2(const int8_t *input, int8_t *output, SliceParameter *param, int data_size, size_t row_size) { | |||
| size_t copy_size = data_size * param->size_[1]; | |||
| void SliceSimpleDim2(const int8_t *input, int8_t *output, const SliceParameter *param, int data_size, size_t row_size) { | |||
| size_t copy_size = IntToSize(data_size * param->size_[1]); | |||
| for (size_t i = 0; i < row_size; ++i) { | |||
| auto dst = output + data_size * param->size_[1] * i; | |||
| auto src = input + data_size * (param->shape_[1] * i + param->begin_[1]); | |||
| (void)memcpy_s(dst, copy_size, src, copy_size); | |||
| auto ret = memcpy_s(dst, copy_size, src, copy_size); | |||
| if (ret != EOK) { | |||
| MS_LOG(EXCEPTION) << "Memcpy failed."; | |||
| } | |||
| } | |||
| } | |||
| bool SliceCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() != 1 || outputs.size() != 1) { | |||
| MS_LOG(ERROR) << "Slice requires 1 input and 1 output, but got " << inputs.size() << " input and " << outputs.size() | |||
| << " output."; | |||
| return false; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kSliceInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kSliceOutputsNum, kernel_name_); | |||
| if (outputs[0]->size == 0) { | |||
| MS_LOG(WARNING) << "Slice output memory size should be greater than 0, but got 0."; | |||
| return true; | |||
| } | |||
| auto input_addr = inputs[0]->addr; | |||
| auto output_addr = outputs[0]->addr; | |||
| if (origin_dim_size_ == 2) { | |||
| auto task = [this, &input_addr, &output_addr](size_t start, size_t end) { | |||
| auto src = | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -19,7 +19,6 @@ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| #include "nnacl/base/slice_base.h" | |||
| @@ -39,7 +38,6 @@ class SliceCPUKernel : public CPUKernel { | |||
| private: | |||
| void InitSliceParam(const std::vector<size_t> &input_shape, const std::vector<int64_t> &begin, | |||
| const std::vector<int64_t> &size); | |||
| size_t origin_dim_size_{0}; | |||
| int data_size_{4}; | |||
| SliceParameter slice_param_; | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/slice_grad_cpu_kernel.h" | |||
| #include <algorithm> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| @@ -20,11 +21,22 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kSliceGradInputsNum = 2; | |||
| constexpr size_t kStridedSliceGradInputsNum = 1; | |||
| constexpr size_t kOutputsNum = 1; | |||
| } // namespace | |||
| void SliceGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| CheckParam(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| if (input_shape.empty() || input_shape.size() > 4) { | |||
| MS_LOG(EXCEPTION) << "Input dims is " << input_shape.size() << ", but SliceGradCPUKernel only supports 1-4D."; | |||
| } | |||
| std::vector<int64_t> begin_me = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, BEGIN); | |||
| (void)std::transform(begin_me.begin(), begin_me.end(), std::back_inserter(begin_), | |||
| [](const int64_t &value) { return LongToInt(value); }); | |||
| @@ -51,6 +63,7 @@ void SliceGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| } | |||
| FormatArgs(false); | |||
| } | |||
| ExpandAllMemberDims(); | |||
| CPUKernelUtils::GetElementNumEveryDim(input_shape_, &input_element_num_); | |||
| CPUKernelUtils::GetElementNumEveryDim(output_shape_, &output_element_num_); | |||
| @@ -60,10 +73,10 @@ void SliceGradCPUKernel::ExpandAllMemberDims() { | |||
| auto output_len = output_shape_.size(); | |||
| if (output_len < 4) { | |||
| for (size_t i = 0; i < 4 - output_len; ++i) { | |||
| output_shape_.insert(output_shape_.begin(), 1); | |||
| begin_.insert(begin_.begin(), 0); | |||
| strides_.insert(strides_.begin(), 1); | |||
| end_.insert(end_.begin(), 1); | |||
| (void)output_shape_.insert(output_shape_.begin(), 1); | |||
| (void)begin_.insert(begin_.begin(), 0); | |||
| (void)strides_.insert(strides_.begin(), 1); | |||
| (void)end_.insert(end_.begin(), 1); | |||
| } | |||
| } | |||
| for (size_t i = 0; i < 4; ++i) { | |||
| @@ -79,7 +92,12 @@ void SliceGradCPUKernel::ExpandAllMemberDims() { | |||
| bool SliceGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| bool ret{true}; | |||
| size_t expect_inputs_num = | |||
| kernel_name_ == prim::kPrimSliceGrad->name() ? kSliceGradInputsNum : kStridedSliceGradInputsNum; | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), expect_inputs_num, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kOutputsNum, kernel_name_); | |||
| bool ret = true; | |||
| if (dtype_ == kNumberTypeInt32) { | |||
| ret = LaunchKernel<int>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat32) { | |||
| @@ -96,9 +114,9 @@ bool SliceGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, c | |||
| template <typename T> | |||
| bool SliceGradCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| T *input_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| T *output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| const std::vector<kernel::AddressPtr> &outputs) const { | |||
| auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| auto ret = memset_s(output_addr, outputs[0]->size, 0, outputs[0]->size); | |||
| if (ret != EOK) { | |||
| @@ -113,16 +131,17 @@ bool SliceGradCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr> &inp | |||
| size_t out_step_size[3] = {IntToSize(strides_[0]) * output_element_num_[0], | |||
| IntToSize(strides_[1]) * output_element_num_[1], | |||
| IntToSize(strides_[2]) * output_element_num_[2]}; | |||
| auto in_n_offset = 0; | |||
| auto out_n_offset = out_start_offset[0]; | |||
| size_t in_n_offset = 0; | |||
| size_t out_n_offset = out_start_offset[0]; | |||
| size_t input_index = 0; | |||
| for (int i = begin_[0]; stride_signs[0] * i < stride_signs[0] * end_[0]; | |||
| i += strides_[0], in_n_offset += input_element_num_[0], out_n_offset += out_step_size[0]) { | |||
| if (can_copy_memory[0]) { | |||
| CopyDataToOutput<T>(inputs, in_n_offset, outputs, out_n_offset, input_element_num_[0], 0); | |||
| continue; | |||
| } | |||
| auto in_c_offset = 0; | |||
| auto out_c_offset = out_start_offset[1]; | |||
| size_t in_c_offset = 0; | |||
| size_t out_c_offset = out_start_offset[1]; | |||
| for (int j = begin_[1]; stride_signs[1] * j < stride_signs[1] * end_[1]; | |||
| j += strides_[1], in_c_offset += input_element_num_[1], out_c_offset += out_step_size[1]) { | |||
| if (can_copy_memory[1]) { | |||
| @@ -130,8 +149,8 @@ bool SliceGradCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr> &inp | |||
| input_element_num_[1], 1); | |||
| continue; | |||
| } | |||
| auto in_h_offset = 0; | |||
| auto out_h_offset = out_start_offset[2]; | |||
| size_t in_h_offset = 0; | |||
| size_t out_h_offset = out_start_offset[2]; | |||
| for (int k = begin_[2]; stride_signs[2] * k < stride_signs[2] * end_[2]; | |||
| k += strides_[2], in_h_offset += input_element_num_[2], out_h_offset += out_step_size[2]) { | |||
| if (can_copy_memory[2]) { | |||
| @@ -140,7 +159,7 @@ bool SliceGradCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr> &inp | |||
| continue; | |||
| } | |||
| for (int m = begin_[3]; stride_signs[3] * m < stride_signs[3] * end_[3]; m += strides_[3]) { | |||
| output_addr[out_n_offset + out_c_offset + out_h_offset + IntToSize(m)] = *input_addr++; | |||
| output_addr[out_n_offset + out_c_offset + out_h_offset + IntToSize(m)] = input_addr[input_index++]; | |||
| } | |||
| } | |||
| } | |||
| @@ -223,19 +242,5 @@ void SliceGradCPUKernel::FormatArgs(bool stride) { | |||
| } | |||
| } | |||
| } | |||
| void SliceGradCPUKernel::CheckParam(const CNodePtr &kernel_node) const { | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but SliceGradGpuKernel needs 1 output."; | |||
| } | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| if (input_shape.size() > 4) { | |||
| MS_LOG(EXCEPTION) << "Input dims is " << input_shape.size() << ", but SliceGradGpuKernel only support 4d or lower."; | |||
| } | |||
| if (input_shape.size() == 0) { | |||
| MS_LOG(EXCEPTION) << "Input dims is " << input_shape.size() << ", scalar is not supported."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,8 +13,10 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SLICE_GRAD_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SLICE_GRAD_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| @@ -34,16 +36,16 @@ class SliceGradCPUKernel : public CPUKernel { | |||
| private: | |||
| template <typename T> | |||
| bool LaunchKernel(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs); | |||
| bool LaunchKernel(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &outputs) const; | |||
| template <typename T> | |||
| void CopyDataToOutput(const std::vector<kernel::AddressPtr> &inputs, size_t in_offset, | |||
| const std::vector<kernel::AddressPtr> &outputs, size_t out_offset, size_t copy_num, | |||
| int id) const; | |||
| void ExpandAllMemberDims(); | |||
| bool CanCopyMemoryOnAxis(size_t dim) const; | |||
| int SignOfStride(size_t axis) const; | |||
| void CheckParam(const CNodePtr &kernel_node) const; | |||
| void FormatArgs(bool stride); | |||
| std::vector<int> begin_; | |||
| std::vector<int> end_; | |||
| @@ -19,11 +19,19 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kSmoothL1LossInputsNum = 2; | |||
| constexpr size_t kSmoothL1LossOutputsNum = 1; | |||
| } // namespace | |||
| template <typename T> | |||
| void SmoothL1LossCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| beta_ = AnfAlgo::GetNodeAttr<float>(kernel_node, "beta"); | |||
| CheckParam(kernel_node); | |||
| if (beta_ == 0.0) { | |||
| MS_LOG(EXCEPTION) << "Attr beta can not be zero."; | |||
| } | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| std::vector<size_t> x_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| for (const uint64_t &d : x_shape) { | |||
| tensor_size_ *= d; | |||
| @@ -34,9 +42,11 @@ template <typename T> | |||
| bool SmoothL1LossCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| auto predict_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto target_addr = reinterpret_cast<T *>(inputs[1]->addr); | |||
| auto result_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kSmoothL1LossInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kSmoothL1LossOutputsNum, kernel_name_); | |||
| const auto *predict_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| const auto *target_addr = reinterpret_cast<T *>(inputs[1]->addr); | |||
| auto *result_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| T zero = (T)0.0; | |||
| T half = (T)0.5; | |||
| T beta = (T)beta_; | |||
| @@ -56,20 +66,5 @@ bool SmoothL1LossCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inp | |||
| CPUKernelUtils::ParallelFor(task, tensor_size_); | |||
| return true; | |||
| } | |||
| template <typename T> | |||
| void SmoothL1LossCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 2) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but SmoothL1LossCPUKernel needs 2 input."; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but SmoothL1LossCPUKernel needs 1 output."; | |||
| } | |||
| if (beta_ == 0.0) { | |||
| MS_LOG(EXCEPTION) << "Attr beta can not be zero."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -37,10 +37,9 @@ class SmoothL1LossCPUKernel : public CPUKernel { | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| float beta_ = 1.0; | |||
| float beta_{1.0}; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| uint64_t tensor_size_ = 1; | |||
| uint64_t tensor_size_{1}; | |||
| }; | |||
| MS_REG_CPU_KERNEL_T( | |||
| @@ -19,11 +19,19 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kSmoothL1LossGradInputsNum = 3; | |||
| constexpr size_t kSmoothL1LossGradOutputsNum = 1; | |||
| } // namespace | |||
| template <typename T> | |||
| void SmoothL1LossGradCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| beta_ = AnfAlgo::GetNodeAttr<float>(kernel_node, "beta"); | |||
| CheckParam(kernel_node); | |||
| if (beta_ == 0.0) { | |||
| MS_LOG(EXCEPTION) << "Attr beta can not be zero."; | |||
| } | |||
| std::vector<size_t> x_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| for (const uint64_t &d : x_shape) { | |||
| tensor_size_ *= d; | |||
| @@ -34,10 +42,12 @@ template <typename T> | |||
| bool SmoothL1LossGradCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| auto predict_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto target_addr = reinterpret_cast<T *>(inputs[1]->addr); | |||
| auto dloss_addr = reinterpret_cast<T *>(inputs[2]->addr); | |||
| auto result_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kSmoothL1LossGradInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kSmoothL1LossGradOutputsNum, kernel_name_); | |||
| const auto *predict_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| const auto *target_addr = reinterpret_cast<T *>(inputs[1]->addr); | |||
| const auto *dloss_addr = reinterpret_cast<T *>(inputs[2]->addr); | |||
| auto *result_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| T beta = (T)beta_; | |||
| for (uint64_t i = 0; i < tensor_size_; ++i) { | |||
| T diff = predict_addr[i] - target_addr[i]; | |||
| @@ -51,20 +61,5 @@ bool SmoothL1LossGradCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> | |||
| } | |||
| return true; | |||
| } | |||
| template <typename T> | |||
| void SmoothL1LossGradCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 3) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but SmoothL1LossGradCPUKernel needs 3 input."; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but SmoothL1LossGradCPUKernel needs 1 output."; | |||
| } | |||
| if (beta_ == 0.0) { | |||
| MS_LOG(EXCEPTION) << "Attr beta can not be zero."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -37,7 +37,6 @@ class SmoothL1LossGradCPUKernel : public CPUKernel { | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| float beta_{1.0}; | |||
| uint64_t tensor_size_{1}; | |||
| }; | |||
| @@ -15,27 +15,39 @@ | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/spacetodepth_cpu_kernel.h" | |||
| #include <vector> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kSpaceToDepthInputsNum = 1; | |||
| constexpr size_t kSpaceToDepthOutputsNum = 1; | |||
| constexpr size_t kSpaceToDepthInputShapeSize = 4; | |||
| constexpr size_t kSpaceToDepthMinBlockSize = 2; | |||
| } // namespace | |||
| template <typename T> | |||
| void SpaceToDepthCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| CheckParam(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| block_size_ = LongToSize(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "block_size")); | |||
| if (input_shape_.size() != kSpaceToDepthInputShapeSize) { | |||
| MS_LOG(EXCEPTION) << "Input shape must be a 4-D tensor, but got " << input_shape_.size() << "-D"; | |||
| } | |||
| if (block_size_ < kSpaceToDepthMinBlockSize) { | |||
| MS_LOG(EXCEPTION) << "The block size must be >= " << kSpaceToDepthMinBlockSize << ", but got " << block_size_; | |||
| } | |||
| } | |||
| template <typename T> | |||
| bool SpaceToDepthCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /* workspace */, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kSpaceToDepthInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kSpaceToDepthOutputsNum, kernel_name_); | |||
| auto input_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| size_t size = inputs[0]->size / sizeof(T); | |||
| @@ -75,17 +87,5 @@ bool SpaceToDepthCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inp | |||
| CPUKernelUtils::ParallelFor(task, size); | |||
| return true; | |||
| } | |||
| template <typename T> | |||
| void SpaceToDepthCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but DepthToSpaceCPUKerrnel needs 1 input."; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but DepthToSpaceCPUKernel needs 1 output."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -13,11 +13,12 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_ | |||
| #include <string> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| @@ -33,10 +34,9 @@ class SpaceToDepthCPUKernel : public CPUKernel { | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| size_t block_size_{0}; | |||
| std::vector<size_t> input_shape_; | |||
| std::vector<size_t> output_shape_; | |||
| size_t block_size_{0}; | |||
| }; | |||
| MS_REG_CPU_KERNEL_T( | |||
| @@ -21,7 +21,8 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kSparseApplyAdamInputSize = 11; | |||
| constexpr size_t kSparseApplyAdamInputsNum = 11; | |||
| constexpr size_t kSparseApplyAdamWorkspaceSize = 5; | |||
| template <typename T> | |||
| void ComputeAdam(MultiThreadComputeParams<T> *input_params, size_t start, size_t end) { | |||
| @@ -100,6 +101,7 @@ void SparseApplyAdamCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) | |||
| void SparseApplyAdamCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| std::vector<size_t> var_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| std::vector<size_t> m_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| std::vector<size_t> v_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2); | |||
| @@ -140,9 +142,9 @@ void SparseApplyAdamCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| template <typename T> | |||
| void SparseApplyAdamCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace) const { | |||
| auto var = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto m = reinterpret_cast<float *>(inputs[1]->addr); | |||
| auto v = reinterpret_cast<float *>(inputs[2]->addr); | |||
| auto *var = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto *m = reinterpret_cast<float *>(inputs[1]->addr); | |||
| auto *v = reinterpret_cast<float *>(inputs[2]->addr); | |||
| auto beta1_power = reinterpret_cast<float *>(inputs[3]->addr)[0]; | |||
| if (beta1_power == 1) { | |||
| MS_LOG(EXCEPTION) << "The beta1_power should not be 1"; | |||
| @@ -152,13 +154,13 @@ void SparseApplyAdamCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr | |||
| auto beta1 = reinterpret_cast<float *>(inputs[6]->addr)[0]; | |||
| auto beta2 = reinterpret_cast<float *>(inputs[7]->addr)[0]; | |||
| auto epsilon = reinterpret_cast<float *>(inputs[8]->addr)[0]; | |||
| auto grad = reinterpret_cast<float *>(inputs[9]->addr); | |||
| auto indices = reinterpret_cast<T *>(inputs[10]->addr); | |||
| auto new_grad = reinterpret_cast<float *>(workspace[0]->addr); | |||
| auto new_indices = reinterpret_cast<T *>(workspace[1]->addr); | |||
| auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr); | |||
| auto workspace_indices = reinterpret_cast<T *>(workspace[3]->addr); | |||
| auto m_t = reinterpret_cast<float *>(workspace[4]->addr); | |||
| auto *grad = reinterpret_cast<float *>(inputs[9]->addr); | |||
| auto *indices = reinterpret_cast<T *>(inputs[10]->addr); | |||
| auto *new_grad = reinterpret_cast<float *>(workspace[0]->addr); | |||
| auto *new_indices = reinterpret_cast<T *>(workspace[1]->addr); | |||
| auto *workspace_grad = reinterpret_cast<float *>(workspace[2]->addr); | |||
| auto *workspace_indices = reinterpret_cast<T *>(workspace[3]->addr); | |||
| auto *m_t = reinterpret_cast<float *>(workspace[4]->addr); | |||
| SparseGradient<T> unique_sparse_grad({new_grad, new_indices, indices_size_}); | |||
| SparseGradient<T> workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_}); | |||
| @@ -180,7 +182,6 @@ void SparseApplyAdamCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr | |||
| input_params.beta1_ = beta1; | |||
| input_params.beta2_ = beta2; | |||
| MultiThreadCompute<T>(ComputeMomentum<T>, &input_params, total_dim_size); | |||
| input_params.m_t_ = m_t; | |||
| input_params.use_nesterov_ = use_nesterov_; | |||
| input_params.sparse_grad_ = unique_sparse_grad; | |||
| @@ -200,9 +201,8 @@ void SparseApplyAdamCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr | |||
| bool SparseApplyAdamCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace, | |||
| const std::vector<kernel::AddressPtr> &) { | |||
| if (inputs.size() < kSparseApplyAdamInputSize) { | |||
| MS_LOG(EXCEPTION) << "Error input size!"; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kSparseApplyAdamInputsNum, kernel_name_); | |||
| CHECK_KERNEL_WORKSPACE_SIZE(workspace.size(), kSparseApplyAdamWorkspaceSize, kernel_name_); | |||
| if (indices_data_type_ == kNumberTypeInt32) { | |||
| LaunchKernel<int>(inputs, workspace); | |||
| } else if (indices_data_type_ == kNumberTypeInt64) { | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPARSE_APPLY_ADAM_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPARSE_APPLY_ADAM_CPU_KERNEL_H_ | |||
| @@ -27,17 +28,21 @@ class SparseApplyAdamCPUKernel : public SparseOptimizerCPUKernel { | |||
| ~SparseApplyAdamCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| void InitInputOutputSize(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| protected: | |||
| bool use_nesterov_{false}; | |||
| private: | |||
| void InitInputOutputSize(const CNodePtr &kernel_node) override; | |||
| template <typename T> | |||
| void InitWorkspaceSize(); | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace) const; | |||
| protected: | |||
| bool use_nesterov_{false}; | |||
| }; | |||
| MS_REG_CPU_KERNEL(FusedSparseAdam, | |||
| @@ -21,7 +21,9 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kSparseApplyFtrlInputSize = 5; | |||
| constexpr size_t kSparseApplyFtrlInputsNum = 5; | |||
| constexpr size_t kSparseApplyFtrlWorkspaceSize = 4; | |||
| template <typename T> | |||
| void ComputeFtrl(MultiThreadComputeParams<T> *input_params, size_t start, size_t end) { | |||
| MS_EXCEPTION_IF_NULL(input_params); | |||
| @@ -74,8 +76,10 @@ void SparseApplyFtrlCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) | |||
| CPUKernel::InitInputOutputSize(kernel_node); | |||
| if (indices_data_type_ == kNumberTypeInt32) { | |||
| InitWorkspaceSize<int>(); | |||
| } else { | |||
| } else if (indices_data_type_ == kNumberTypeInt64) { | |||
| InitWorkspaceSize<int64_t>(); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Input data type " << indices_data_type_ << " is unsupported"; | |||
| } | |||
| } | |||
| @@ -135,15 +139,15 @@ void SparseApplyFtrlCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| template <typename T> | |||
| void SparseApplyFtrlCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace) const { | |||
| auto var = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto accum = reinterpret_cast<float *>(inputs[1]->addr); | |||
| auto linear = reinterpret_cast<float *>(inputs[2]->addr); | |||
| auto grad = reinterpret_cast<float *>(inputs[3]->addr); | |||
| auto indices = reinterpret_cast<T *>(inputs[4]->addr); | |||
| auto new_grad = reinterpret_cast<float *>(workspace[0]->addr); | |||
| auto new_indices = reinterpret_cast<T *>(workspace[1]->addr); | |||
| auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr); | |||
| auto workspace_indices = reinterpret_cast<T *>(workspace[3]->addr); | |||
| auto *var = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto *accum = reinterpret_cast<float *>(inputs[1]->addr); | |||
| auto *linear = reinterpret_cast<float *>(inputs[2]->addr); | |||
| auto *grad = reinterpret_cast<float *>(inputs[3]->addr); | |||
| auto *indices = reinterpret_cast<T *>(inputs[4]->addr); | |||
| auto *new_grad = reinterpret_cast<float *>(workspace[0]->addr); | |||
| auto *new_indices = reinterpret_cast<T *>(workspace[1]->addr); | |||
| auto *workspace_grad = reinterpret_cast<float *>(workspace[2]->addr); | |||
| auto *workspace_indices = reinterpret_cast<T *>(workspace[3]->addr); | |||
| SparseGradient<T> unique_sparse_grad({new_grad, new_indices, indices_size_}); | |||
| SparseGradient<T> workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_}); | |||
| @@ -173,10 +177,8 @@ void SparseApplyFtrlCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr | |||
| bool SparseApplyFtrlCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace, | |||
| const std::vector<kernel::AddressPtr> &) { | |||
| if (inputs.size() < kSparseApplyFtrlInputSize) { | |||
| MS_LOG(EXCEPTION) << "error input output size!"; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kSparseApplyFtrlInputsNum, kernel_name_); | |||
| CHECK_KERNEL_WORKSPACE_SIZE(workspace.size(), kSparseApplyFtrlWorkspaceSize, kernel_name_); | |||
| if (indices_data_type_ == kNumberTypeInt32) { | |||
| LaunchKernel<int>(inputs, workspace); | |||
| } else if (indices_data_type_ == kNumberTypeInt64) { | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPARSE_APPLY_FTRL_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPARSE_APPLY_FTRL_CPU_KERNEL_H_ | |||
| @@ -27,20 +28,24 @@ class SparseApplyFtrlCPUKernel : public SparseOptimizerCPUKernel { | |||
| ~SparseApplyFtrlCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| void InitInputOutputSize(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| protected: | |||
| float lr_{0.0}; | |||
| float l1_{0.0}; | |||
| float l2_{0.0}; | |||
| float lr_power_{0.0}; | |||
| private: | |||
| void InitInputOutputSize(const CNodePtr &kernel_node) override; | |||
| template <typename T> | |||
| void InitWorkspaceSize(); | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace) const; | |||
| protected: | |||
| float lr_{0}; | |||
| float l1_{0}; | |||
| float l2_{0}; | |||
| float lr_power_{0}; | |||
| }; | |||
| MS_REG_CPU_KERNEL(FusedSparseFtrl, | |||
| @@ -21,7 +21,8 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kSparseApplyLazyAdamInputSize = 11; | |||
| constexpr size_t kSparseApplyLazyAdamInputsNum = 11; | |||
| constexpr size_t kSparseApplyLazyAdamWorkspaceSize = 4; | |||
| template <typename T> | |||
| void ComputeLazyAdam(MultiThreadComputeParams<T> *input_params, size_t start, size_t end) { | |||
| @@ -70,13 +71,16 @@ void SparseApplyLazyAdamCPUKernel::InitInputOutputSize(const CNodePtr &kernel_no | |||
| CPUKernel::InitInputOutputSize(kernel_node); | |||
| if (indices_data_type_ == kNumberTypeInt32) { | |||
| InitWorkspaceSize<int>(); | |||
| } else { | |||
| } else if (indices_data_type_ == kNumberTypeInt64) { | |||
| InitWorkspaceSize<int64_t>(); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Input data type " << indices_data_type_ << " is unsupported"; | |||
| } | |||
| } | |||
| void SparseApplyLazyAdamCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| std::vector<size_t> var_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| std::vector<size_t> m_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| std::vector<size_t> v_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2); | |||
| @@ -103,14 +107,14 @@ void SparseApplyLazyAdamCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| var_outer_dim_size_ *= var_shape[i]; | |||
| } | |||
| if (indices_shape.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "Indices must be 1D!"; | |||
| MS_LOG(EXCEPTION) << "Indices must be 1D"; | |||
| } | |||
| indices_size_ = indices_shape[0]; | |||
| if (grad_shape[0] != indices_size_) { | |||
| MS_LOG(EXCEPTION) << "The first dimension of grad shape must be equal to indices"; | |||
| } | |||
| if (AnfAlgo::HasNodeAttr(USE_NESTEROV, kernel_node)) { | |||
| use_nesterov_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "use_nesterov"); | |||
| use_nesterov_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, USE_NESTEROV); | |||
| } | |||
| indices_data_type_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 10); | |||
| } | |||
| @@ -118,9 +122,9 @@ void SparseApplyLazyAdamCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| template <typename T> | |||
| void SparseApplyLazyAdamCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace) const { | |||
| auto var = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto m = reinterpret_cast<float *>(inputs[1]->addr); | |||
| auto v = reinterpret_cast<float *>(inputs[2]->addr); | |||
| auto *var = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto *m = reinterpret_cast<float *>(inputs[1]->addr); | |||
| auto *v = reinterpret_cast<float *>(inputs[2]->addr); | |||
| auto beta1_power = reinterpret_cast<float *>(inputs[3]->addr)[0]; | |||
| if (beta1_power == 1) { | |||
| MS_LOG(EXCEPTION) << "The beta1_power should not be 1"; | |||
| @@ -130,12 +134,12 @@ void SparseApplyLazyAdamCPUKernel::LaunchKernel(const std::vector<kernel::Addres | |||
| auto beta1 = reinterpret_cast<float *>(inputs[6]->addr)[0]; | |||
| auto beta2 = reinterpret_cast<float *>(inputs[7]->addr)[0]; | |||
| auto epsilon = reinterpret_cast<float *>(inputs[8]->addr)[0]; | |||
| auto grad = reinterpret_cast<float *>(inputs[9]->addr); | |||
| auto indices = reinterpret_cast<T *>(inputs[10]->addr); | |||
| auto new_grad = reinterpret_cast<float *>(workspace[0]->addr); | |||
| auto new_indices = reinterpret_cast<T *>(workspace[1]->addr); | |||
| auto workspace_grad = reinterpret_cast<float *>(workspace[2]->addr); | |||
| auto workspace_indices = reinterpret_cast<T *>(workspace[3]->addr); | |||
| auto *grad = reinterpret_cast<float *>(inputs[9]->addr); | |||
| auto *indices = reinterpret_cast<T *>(inputs[10]->addr); | |||
| auto *new_grad = reinterpret_cast<float *>(workspace[0]->addr); | |||
| auto *new_indices = reinterpret_cast<T *>(workspace[1]->addr); | |||
| auto *workspace_grad = reinterpret_cast<float *>(workspace[2]->addr); | |||
| auto *workspace_indices = reinterpret_cast<T *>(workspace[3]->addr); | |||
| SparseGradient<T> unique_sparse_grad({new_grad, new_indices, indices_size_}); | |||
| SparseGradient<T> workspace_sparse_grad({workspace_grad, workspace_indices, indices_size_}); | |||
| @@ -167,10 +171,8 @@ void SparseApplyLazyAdamCPUKernel::LaunchKernel(const std::vector<kernel::Addres | |||
| bool SparseApplyLazyAdamCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace, | |||
| const std::vector<kernel::AddressPtr> &) { | |||
| if (inputs.size() < kSparseApplyLazyAdamInputSize) { | |||
| MS_LOG(EXCEPTION) << "Error input size!"; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kSparseApplyLazyAdamInputsNum, kernel_name_); | |||
| CHECK_KERNEL_WORKSPACE_SIZE(workspace.size(), kSparseApplyLazyAdamWorkspaceSize, kernel_name_); | |||
| if (indices_data_type_ == kNumberTypeInt32) { | |||
| LaunchKernel<int>(inputs, workspace); | |||
| } else if (indices_data_type_ == kNumberTypeInt64) { | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPARSE_APPLY_LAZY_ADAM_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPARSE_APPLY_LAZY_ADAM_CPU_KERNEL_H_ | |||
| @@ -27,17 +28,20 @@ class SparseApplyLazyAdamCPUKernel : public SparseOptimizerCPUKernel { | |||
| ~SparseApplyLazyAdamCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| void InitInputOutputSize(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| protected: | |||
| void InitInputOutputSize(const CNodePtr &kernel_node) override; | |||
| bool use_nesterov_{false}; | |||
| private: | |||
| template <typename T> | |||
| void InitWorkspaceSize(); | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace) const; | |||
| protected: | |||
| bool use_nesterov_{false}; | |||
| }; | |||
| MS_REG_CPU_KERNEL(FusedSparseLazyAdam, | |||
| @@ -21,7 +21,8 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kSparseApplyProximalAdagradInputSize = 7; | |||
| constexpr size_t kSparseApplyProximalAdagradInputsNum = 7; | |||
| constexpr size_t kSparseApplyProximalAdagradWorkspaceSize = 4; | |||
| template <typename T> | |||
| void ComputeProximalAdagrad(MultiThreadComputeParams<T> *input_params, size_t start, size_t end) { | |||
| @@ -70,13 +71,16 @@ void SparseApplyProximalAdagradCPUKernel::InitInputOutputSize(const CNodePtr &ke | |||
| CPUKernel::InitInputOutputSize(kernel_node); | |||
| if (indices_data_type_ == kNumberTypeInt32) { | |||
| InitWorkspaceSize<int>(); | |||
| } else { | |||
| } else if (indices_data_type_ == kNumberTypeInt64) { | |||
| InitWorkspaceSize<int64_t>(); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Input data type " << indices_data_type_ << " is unsupported"; | |||
| } | |||
| } | |||
| void SparseApplyProximalAdagradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| std::vector<size_t> var_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| std::vector<size_t> accum_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| std::vector<size_t> lr_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2); | |||
| @@ -160,9 +164,8 @@ void SparseApplyProximalAdagradCPUKernel::LaunchKernel(const std::vector<kernel: | |||
| bool SparseApplyProximalAdagradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace, | |||
| const std::vector<kernel::AddressPtr> &) { | |||
| if (inputs.size() < kSparseApplyProximalAdagradInputSize) { | |||
| MS_LOG(EXCEPTION) << "Wrong input size!"; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kSparseApplyProximalAdagradInputsNum, kernel_name_); | |||
| CHECK_KERNEL_WORKSPACE_SIZE(workspace.size(), kSparseApplyProximalAdagradWorkspaceSize, kernel_name_); | |||
| if (indices_data_type_ == kNumberTypeInt32) { | |||
| LaunchKernel<int>(inputs, workspace); | |||
| } else if (indices_data_type_ == kNumberTypeInt64) { | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPARSE_APPLY_PROXIMAL_ADAGRAD_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPARSE_APPLY_PROXIMAL_ADAGRAD_CPU_KERNEL_H_ | |||
| @@ -27,11 +28,16 @@ class SparseApplyProximalAdagradCPUKernel : public SparseOptimizerCPUKernel { | |||
| ~SparseApplyProximalAdagradCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| void InitInputOutputSize(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| protected: | |||
| void InitInputOutputSize(const CNodePtr &kernel_node) override; | |||
| private: | |||
| template <typename T> | |||
| void InitWorkspaceSize(); | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace) const; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPARSE_OPTIMIZER_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPARSE_OPTIMIZER_CPU_KERNEL_H_ | |||
| @@ -63,6 +64,7 @@ struct MultiThreadComputeParams { | |||
| size_t var_outer_dim_size_{0}; | |||
| bool use_nesterov_; | |||
| }; | |||
| template <typename T> | |||
| using MultiThreadComputeFunc = std::function<void(MultiThreadComputeParams<T> *param, size_t start, size_t end)>; | |||
| @@ -205,7 +207,7 @@ class SparseOptimizerCPUKernel : public CPUKernel { | |||
| MS_LOG(DEBUG) << "Start"; | |||
| MS_EXCEPTION_IF_NULL(segment); | |||
| MS_EXCEPTION_IF_NULL(segment->indices_); | |||
| if (param.thread_num_ < 1) { | |||
| if (param.thread_num_ == 0) { | |||
| MS_EXCEPTION(ArgumentError) << "Input param thread num must > 0!"; | |||
| } | |||
| std::vector<size_t> bucket_data_num(param.thread_num_, 0); | |||
| @@ -20,12 +20,18 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kSparseTensorDenseMatmulInputsNum = 4; | |||
| constexpr size_t kSparseTensorDenseMatmulOutputsNum = 1; | |||
| constexpr size_t kSparseTensorDenseMatmulOutputShapeSize = 2; | |||
| constexpr size_t kSparseTensorDenseMatmulDenseShapeSize = 2; | |||
| constexpr size_t kIndicesSizeNum = 2; | |||
| constexpr size_t kIndices2rdDimNum = 2; | |||
| } // namespace | |||
| template <typename I, typename T> | |||
| void SparseTensorDenseMatmulCPUKernel<I, T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| adj_st_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, ADJ_ST); | |||
| adj_dt_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, ADJ_dT); | |||
| auto indices_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, INDICES); | |||
| @@ -59,11 +65,8 @@ template <typename I, typename T> | |||
| bool SparseTensorDenseMatmulCPUKernel<I, T>::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /* workspace */, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() != kInputNum || outputs.size() != kOutputNum) { | |||
| MS_LOG(ERROR) << "SparseTensorDenseMatmul requires 4 inputs and 1 output, but got " << inputs.size() | |||
| << " inputs and " << outputs.size() << " output."; | |||
| return false; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kSparseTensorDenseMatmulInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kSparseTensorDenseMatmulOutputsNum, kernel_name_); | |||
| if (outputs[0]->size == 0) { | |||
| MS_LOG(WARNING) << "SparseTensorDenseMatmul output memory size should be greater than 0, but got 0."; | |||
| return true; | |||
| @@ -72,13 +75,16 @@ bool SparseTensorDenseMatmulCPUKernel<I, T>::Launch(const std::vector<kernel::Ad | |||
| MS_LOG(EXCEPTION) << "SparseTensorDenseMatmul memset output failed!"; | |||
| } | |||
| const size_t b_index = 3; | |||
| const auto *a_indices = reinterpret_cast<I *>(inputs[0]->addr); | |||
| const auto *a_values = reinterpret_cast<T *>(inputs[1]->addr); | |||
| const auto *b = reinterpret_cast<T *>(inputs[3]->addr); | |||
| const auto *b = reinterpret_cast<T *>(inputs[b_index]->addr); | |||
| auto *out = reinterpret_cast<T *>(outputs[0]->addr); | |||
| const size_t indices_length = inputs[0]->size / sizeof(I); | |||
| const size_t values_length = inputs[1]->size / sizeof(T); | |||
| const size_t b_length = inputs[3]->size / sizeof(T); | |||
| const size_t b_length = inputs[b_index]->size / sizeof(T); | |||
| const size_t dim_num = 2; | |||
| const size_t out_dim_0 = output_shape_[0]; | |||
| const size_t out_dim_1 = output_shape_[1]; | |||
| const size_t b_dim_0 = b_shape_[0]; | |||
| @@ -86,14 +92,14 @@ bool SparseTensorDenseMatmulCPUKernel<I, T>::Launch(const std::vector<kernel::Ad | |||
| const size_t same_dim = adj_dt_ ? b_dim_1 : b_dim_0; | |||
| for (size_t i = 0; i < values_size_; ++i) { | |||
| if (i * 2 + 1 >= indices_length) { // the interval is 2 | |||
| if (i * dim_num + 1 >= indices_length) { | |||
| MS_LOG(EXCEPTION) << "The index of a_indices out of bounds."; | |||
| } | |||
| if (i >= values_length) { | |||
| MS_LOG(EXCEPTION) << "The index of a_values out of bounds."; | |||
| } | |||
| const int row = adj_st_ ? a_indices[i * 2 + 1] : a_indices[i * 2]; | |||
| const int col = adj_st_ ? a_indices[i * 2] : a_indices[i * 2 + 1]; | |||
| const int row = adj_st_ ? a_indices[i * dim_num + 1] : a_indices[i * dim_num]; | |||
| const int col = adj_st_ ? a_indices[i * dim_num] : a_indices[i * dim_num + 1]; | |||
| if (row >= SizeToInt(out_dim_0) || row < 0 || col >= SizeToInt(same_dim) || col < 0) { | |||
| MS_EXCEPTION(ValueError) << "The indices including out of bounds index, row range: [0, " << out_dim_0 | |||
| << "), col range: [0, " << same_dim << "), but got row: " << row << ", col: " << col; | |||
| @@ -23,10 +23,6 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr size_t kInputNum = 4; | |||
| constexpr size_t kOutputNum = 1; | |||
| constexpr size_t kIndicesSizeNum = 2; | |||
| constexpr size_t kIndices2rdDimNum = 2; | |||
| template <typename I, typename T> | |||
| class SparseTensorDenseMatmulCPUKernel : public CPUKernel { | |||
| public: | |||
| @@ -22,12 +22,14 @@ namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kIndicesShapeSize = 2; | |||
| constexpr size_t kSparseToDenseInputsNum = 3; | |||
| constexpr size_t kSparseToDenseOutputsNum = 1; | |||
| } // namespace | |||
| template <typename I, typename T> | |||
| void SparseToDenseCPUKernel<I, T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| CheckParam(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| auto indices_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| if (indices_shape.size() != kIndicesShapeSize) { | |||
| MS_LOG(EXCEPTION) << "SparseToDense requires 'indices' should be a " << kIndicesShapeSize << "-D Tensor, but got " | |||
| @@ -48,11 +50,8 @@ template <typename I, typename T> | |||
| bool SparseToDenseCPUKernel<I, T>::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /*workspace*/, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() != 3 || outputs.size() != 1) { | |||
| MS_LOG(ERROR) << "SparseToDense requires 3 inputs and 1 output, but got " << inputs.size() << " inputs and " | |||
| << outputs.size() << " output."; | |||
| return false; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kSparseToDenseInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kSparseToDenseOutputsNum, kernel_name_); | |||
| if (outputs[0]->size == 0) { | |||
| MS_LOG(WARNING) << "SparseToDense output memory size should be greater than 0, but got 0."; | |||
| return true; | |||
| @@ -92,17 +91,5 @@ bool SparseToDenseCPUKernel<I, T>::Launch(const std::vector<kernel::AddressPtr> | |||
| } | |||
| return true; | |||
| } | |||
| template <typename I, typename T> | |||
| void SparseToDenseCPUKernel<I, T>::CheckParam(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 3) { | |||
| MS_LOG(EXCEPTION) << "SparseToDense needs 3 inputs, but got " << input_num; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "SparseToDense should have 2 outputs, but got " << output_num; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -37,7 +37,6 @@ class SparseToDenseCPUKernel : public CPUKernel { | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| std::vector<size_t> output_shape_; | |||
| size_t values_size_{0}; | |||
| }; | |||
| @@ -21,11 +21,16 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kSplitInputsNum = 1; | |||
| } // namespace | |||
| template <typename T> | |||
| void SplitCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| axis_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "axis"); | |||
| output_num_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "output_num"); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| axis_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS); | |||
| output_num_ = LongToSize(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "output_num")); | |||
| if (output_num_ == 0) { | |||
| MS_LOG(EXCEPTION) << "Attr output_num is equal to 0"; | |||
| } | |||
| @@ -49,6 +54,8 @@ template <typename T> | |||
| bool SplitCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kSplitInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), output_num_, kernel_name_); | |||
| LaunchKernel(inputs, workspace, outputs); | |||
| return true; | |||
| } | |||
| @@ -56,7 +63,7 @@ bool SplitCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| template <typename T> | |||
| void SplitCPUKernel<T>::LaunchSplit(T *input, T **output, size_t /* size */) { | |||
| SplitParameter param; | |||
| param.num_split_ = LongToInt(output_num_); | |||
| param.num_split_ = SizeToInt(output_num_); | |||
| param.split_dim_ = LongToInt(axis_); | |||
| param.strides_[input_shape_.size() - 1] = 1; | |||
| for (int i = SizeToInt(input_shape_.size()) - 2; i >= 0; i--) { // from -2 to 0 dim | |||
| @@ -64,7 +71,7 @@ void SplitCPUKernel<T>::LaunchSplit(T *input, T **output, size_t /* size */) { | |||
| } | |||
| auto split_sizes = std::make_unique<int[]>(IntToSize(param.num_split_)); | |||
| param.split_sizes_ = split_sizes.get(); | |||
| int split_size = input_shape_[param.split_dim_] / output_num_; | |||
| int split_size = input_shape_[param.split_dim_] / SizeToInt(output_num_); | |||
| for (int i = 0; i < param.num_split_; i++) { | |||
| param.split_sizes_[i] = split_size; | |||
| } | |||
| @@ -96,13 +103,7 @@ void SplitCPUKernel<T>::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| template <typename T> | |||
| void SplitCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) { | |||
| auto input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| int64_t dims = SizeToLong(input_shape_.size()); | |||
| int64_t output_num = SizeToLong(AnfAlgo::GetOutputTensorNum(kernel_node)); | |||
| if (input_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but Split needs 1 input."; | |||
| } | |||
| if (dims == 0 || dims > SPLIT_STRIDES_SIZE) { | |||
| MS_LOG(EXCEPTION) << "Input dims is " << dims << ", scalar is not supported."; | |||
| } | |||
| @@ -110,14 +111,11 @@ void SplitCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) { | |||
| MS_LOG(EXCEPTION) << "Attr axis_ " << axis_ << " must be in " << -dims << "~" << dims; | |||
| } | |||
| if (axis_ < 0) { | |||
| axis_ += SizeToInt(input_shape_.size()); | |||
| axis_ += SizeToLong(input_shape_.size()); | |||
| } | |||
| if (output_num_ > IntToLong(input_shape_[LongToUlong(axis_)])) { | |||
| if (output_num_ > IntToSize(input_shape_[LongToUlong(axis_)])) { | |||
| MS_LOG(EXCEPTION) << "Attr output_num " << output_num_ << " must less than " << input_shape_[axis_]; | |||
| } | |||
| if (output_num_ != output_num) { | |||
| MS_LOG(EXCEPTION) << "Output num is " << output_num << ", but need " << output_num_; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -37,25 +37,19 @@ class SplitCPUKernel : public CPUKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs); | |||
| void InitInputOutputSize(const CNodePtr &kernel_node) override; | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| void LaunchSplit(T *input, T **output, size_t size); | |||
| int64_t axis_{1}; | |||
| int64_t output_num_{1}; | |||
| int64_t axis_step_{1}; | |||
| size_t input_size_{1}; | |||
| size_t dims_after_axis_{1}; | |||
| size_t dims_current_after_axis_{1}; | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs); | |||
| void InitInputOutputSize(const CNodePtr &kernel_node) override; | |||
| std::vector<std::vector<size_t>> output_shape_list_; | |||
| int64_t axis_{0}; | |||
| size_t output_num_{1}; | |||
| std::vector<int> input_shape_; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| }; | |||
| MS_REG_CPU_KERNEL_T(Split, KernelAttr(), SplitCPUKernel, float); | |||
| @@ -24,21 +24,25 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kStridedSliceInputsNum = 1; | |||
| constexpr size_t kStridedSliceOutputsNum = 1; | |||
| } // namespace | |||
| enum PosType { kBegin, kEnd }; | |||
| int NormalizePos(int pos, int dim_len, PosType pos_type) { | |||
| if (pos < 0) { | |||
| int normal_pos = pos + dim_len; | |||
| int threshold = pos_type == kBegin ? 0 : -1; | |||
| normal_pos = std::max(normal_pos, threshold); | |||
| return normal_pos; | |||
| if (pos >= 0) { | |||
| int max_pos = pos_type == kBegin ? dim_len - 1 : dim_len; | |||
| return std::min(pos, max_pos); | |||
| } | |||
| int max_pos = pos_type == kBegin ? dim_len - 1 : dim_len; | |||
| return std::min(pos, max_pos); | |||
| int min_pos = pos_type == kBegin ? 0 : -1; | |||
| return std::max(pos + dim_len, min_pos); | |||
| } | |||
| void StridedSliceCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| if (input_shape_.size() > DIMENSION_8D || input_shape_.empty()) { | |||
| @@ -70,18 +74,17 @@ bool StridedSliceCPUKernel::MatchParallelPattern() { | |||
| // Example 2: | |||
| // input shape info: [1, 46, 40] | |||
| // output shape info: [1, 20, 40] | |||
| if (input_shape_.size() != output_shape_.size()) { | |||
| return false; | |||
| } | |||
| std::vector<int> axis_list; | |||
| for (size_t i = 0; i < input_shape_.size(); ++i) { | |||
| if (input_shape_[i] != output_shape_[i]) { | |||
| (void)axis_list.emplace_back(i); | |||
| if (input_shape_.size() == output_shape_.size()) { | |||
| std::vector<int> axis_list; | |||
| for (size_t i = 0; i < input_shape_.size(); ++i) { | |||
| if (input_shape_[i] != output_shape_[i]) { | |||
| (void)axis_list.emplace_back(i); | |||
| } | |||
| } | |||
| if (axis_list.size() == 1) { | |||
| split_axis_ = axis_list.front(); | |||
| return true; | |||
| } | |||
| } | |||
| if (axis_list.size() == 1) { | |||
| split_axis_ = axis_list.front(); | |||
| return true; | |||
| } | |||
| return false; | |||
| } | |||
| @@ -123,8 +126,9 @@ void StridedSliceCPUKernel::InitSliceParam(const std::vector<int64_t> &begin, co | |||
| slice_param_.data_type = type_pair->second.first; | |||
| for (size_t i = 0; i < DIMENSION_8D; i++) { | |||
| int dim_len; | |||
| if (i < begin.size()) { | |||
| int dim_len = SizeToInt(input_shape_[i]); | |||
| dim_len = SizeToInt(input_shape_[i]); | |||
| int begin_pos = LongToInt(begin[i]); | |||
| int end_pos = LongToInt(end[i]); | |||
| int stride_size = LongToInt(stride[i]); | |||
| @@ -142,7 +146,7 @@ void StridedSliceCPUKernel::InitSliceParam(const std::vector<int64_t> &begin, co | |||
| slice_param_.ends_[i] = slice_param_.begins_[i] - 1; | |||
| } | |||
| } else if (i < input_shape_.size()) { | |||
| int dim_len = SizeToInt(input_shape_[i]); | |||
| dim_len = SizeToInt(input_shape_[i]); | |||
| slice_param_.in_shape_[i] = dim_len; | |||
| slice_param_.begins_[i] = 0; | |||
| slice_param_.ends_[i] = dim_len; | |||
| @@ -158,10 +162,10 @@ void StridedSliceCPUKernel::InitSliceParam(const std::vector<int64_t> &begin, co | |||
| slice_param_.num_axes_ = DIMENSION_8D; | |||
| } | |||
| int StridedSliceCPUKernel::RunTaskOnOuter(uint8_t *input_addr, uint8_t *output_addr, int start_pos) { | |||
| int StridedSliceCPUKernel::RunTaskOnOuter(const uint8_t *input_addr, uint8_t *output_addr, int start_pos) { | |||
| int begin_index = slice_param_.begins_[split_axis_]; | |||
| int inner_size = inner_ * data_size_; | |||
| uint8_t *cur_in_ptr = input_addr + (start_pos * input_shape_[split_axis_] + begin_index) * inner_size; | |||
| const uint8_t *cur_in_ptr = input_addr + (start_pos * input_shape_[split_axis_] + begin_index) * inner_size; | |||
| uint8_t *cur_out_ptr = output_addr + start_pos * output_shape_[split_axis_] * inner_size; | |||
| int cur_outer = outer_ - start_pos; | |||
| if (cur_outer <= 0) { | |||
| @@ -173,10 +177,10 @@ int StridedSliceCPUKernel::RunTaskOnOuter(uint8_t *input_addr, uint8_t *output_a | |||
| return common::SUCCESS; | |||
| } | |||
| int StridedSliceCPUKernel::RunTaskOnSplitAxis(uint8_t *input_addr, uint8_t *output_addr, int start_pos) { | |||
| int StridedSliceCPUKernel::RunTaskOnSplitAxis(const uint8_t *input_addr, uint8_t *output_addr, int start_pos) { | |||
| int begin_index = slice_param_.begins_[split_axis_]; | |||
| int inner_size = inner_ * data_size_; | |||
| uint8_t *cur_in_ptr = input_addr + (start_pos * slice_param_.strides_[split_axis_] + begin_index) * inner_size; | |||
| const uint8_t *cur_in_ptr = input_addr + (start_pos * slice_param_.strides_[split_axis_] + begin_index) * inner_size; | |||
| uint8_t *cur_out_ptr = output_addr + start_pos * inner_size; | |||
| int cal_axis_num = output_shape_[split_axis_] - start_pos; | |||
| if (cal_axis_num <= 0) { | |||
| @@ -187,10 +191,10 @@ int StridedSliceCPUKernel::RunTaskOnSplitAxis(uint8_t *input_addr, uint8_t *outp | |||
| return common::SUCCESS; | |||
| } | |||
| void StridedSliceCPUKernel::ParallelRun(uint8_t *input_addr, uint8_t *output_addr, int thread_num) { | |||
| void StridedSliceCPUKernel::ParallelRun(const uint8_t *input_addr, uint8_t *output_addr, int thread_num) { | |||
| int thread_index = 0; | |||
| std::vector<common::Task> tasks; | |||
| std::function<int(StridedSliceCPUKernel *, uint8_t *, uint8_t *, int)> execute_func; | |||
| std::function<int(StridedSliceCPUKernel *, const uint8_t *, uint8_t *, int)> execute_func; | |||
| if (parallel_strategy_ == kOnOuter) { | |||
| execute_func = &StridedSliceCPUKernel::RunTaskOnOuter; | |||
| } else if (parallel_strategy_ == kOnSplitAxis) { | |||
| @@ -208,13 +212,10 @@ void StridedSliceCPUKernel::ParallelRun(uint8_t *input_addr, uint8_t *output_add | |||
| } | |||
| bool StridedSliceCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /*workspace*/, | |||
| const std::vector<kernel::AddressPtr> & /* workspace */, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() != 1 || outputs.size() != 1) { | |||
| MS_LOG(ERROR) << "StridedSlice requires 1 input and 1 output, but got " << inputs.size() << " input and " | |||
| << outputs.size() << " output."; | |||
| return false; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kStridedSliceInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kStridedSliceOutputsNum, kernel_name_); | |||
| if (outputs[0]->size == 0) { | |||
| MS_LOG(WARNING) << "StridedSlice output memory size should be greater than 0, but got 0."; | |||
| return true; | |||
| @@ -14,8 +14,8 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SLICE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SLICE_CPU_KERNEL_H_ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_STRIDESLICE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_STRIDESLICE_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| @@ -37,14 +37,13 @@ class StridedSliceCPUKernel : public CPUKernel { | |||
| private: | |||
| enum ParallelStrategy { kOnSplitAxis, kOnOuter }; | |||
| void InitSliceParam(const std::vector<int64_t> &begin, const std::vector<int64_t> &end, | |||
| const std::vector<int64_t> &stride); | |||
| bool MatchParallelPattern(); | |||
| void InitParallelParam(); | |||
| void ParallelRun(uint8_t *input_addr, uint8_t *output_addr, int thread_num); | |||
| int RunTaskOnOuter(uint8_t *input_addr, uint8_t *output_addr, int start_pos); | |||
| int RunTaskOnSplitAxis(uint8_t *input_addr, uint8_t *output_addr, int start_pos); | |||
| void ParallelRun(const uint8_t *input_addr, uint8_t *output_addr, int thread_num); | |||
| int RunTaskOnOuter(const uint8_t *input_addr, uint8_t *output_addr, int start_pos); | |||
| int RunTaskOnSplitAxis(const uint8_t *input_addr, uint8_t *output_addr, int start_pos); | |||
| TypeId dtype_; | |||
| int data_size_{4}; | |||
| @@ -70,4 +69,4 @@ MS_REG_CPU_KERNEL(StridedSlice, KernelAttr().AddInputAttr(kNumberTypeFloat64).Ad | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SLICE_CPU_KERNEL_H_ | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_STRIDESLICE_CPU_KERNEL_H_ | |||
| @@ -20,8 +20,14 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kSubAndFilterInputsNum = 3; | |||
| constexpr size_t kSubAndFilterOutputNum = 2; | |||
| } // namespace | |||
| void SubAndFilterCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| node_wpt_ = kernel_node; | |||
| input_x_dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| } | |||
| @@ -29,6 +35,8 @@ void SubAndFilterCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool SubAndFilterCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kSubAndFilterInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kSubAndFilterOutputNum, kernel_name_); | |||
| if (input_x_dtype_ == kNumberTypeInt32) { | |||
| LaunchKernel<int>(inputs, outputs); | |||
| } else if (input_x_dtype_ == kNumberTypeInt64) { | |||
| @@ -42,11 +50,9 @@ bool SubAndFilterCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs | |||
| template <typename T> | |||
| void SubAndFilterCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| auto node_ = node_wpt_.lock(); | |||
| if (!node_) { | |||
| MS_LOG(EXCEPTION) << "node_wpt_ is expired."; | |||
| } | |||
| auto indices_shape = AnfAlgo::GetPrevNodeOutputInferShape(node_, 0); | |||
| auto node = node_wpt_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| auto indices_shape = AnfAlgo::GetPrevNodeOutputInferShape(node, 0); | |||
| batch_size_ = 1; | |||
| for (size_t i = 0; i < indices_shape.size(); ++i) { | |||
| @@ -71,12 +77,12 @@ void SubAndFilterCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| MS_LOG(INFO) << "SubAndFilter output count is " << count; | |||
| std::vector<size_t> out_shape; | |||
| (void)out_shape.emplace_back(count); | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(node_); | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(node); | |||
| std::vector<TypeId> dtypes(output_num); | |||
| for (size_t i = 0; i < output_num; i++) { | |||
| dtypes[i] = AnfAlgo::GetOutputDeviceDataType(node_, i); | |||
| dtypes[i] = AnfAlgo::GetOutputDeviceDataType(node, i); | |||
| } | |||
| AnfAlgo::SetOutputInferTypeAndShape(dtypes, {out_shape, out_shape}, node_.get()); | |||
| AnfAlgo::SetOutputInferTypeAndShape(dtypes, {out_shape, out_shape}, node.get()); | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -35,10 +35,10 @@ class SubAndFilterCPUKernel : public CPUKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs); | |||
| private: | |||
| size_t batch_size_{1}; | |||
| TypeId input_x_dtype_{kTypeUnknown}; | |||
| CNodeWeakPtr node_wpt_; | |||
| @@ -23,8 +23,14 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kTensorCopySlicesInputsNum = 2; | |||
| constexpr size_t kTensorCopySlicesOutputsNum = 1; | |||
| } // namespace | |||
| void TensorCopySlicesCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| auto update_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| @@ -48,11 +54,8 @@ void TensorCopySlicesCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool TensorCopySlicesCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> & /* workspace */, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() != 2 || outputs.size() != 1) { | |||
| MS_LOG(ERROR) << "TensorCopySlices requires 1 input and 1 output, but got " << inputs.size() << " input and " | |||
| << outputs.size() << " output."; | |||
| return false; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kTensorCopySlicesInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kTensorCopySlicesOutputsNum, kernel_name_); | |||
| auto input_addr = reinterpret_cast<uint8_t *>(inputs[0]->addr); | |||
| auto update_addr = reinterpret_cast<uint8_t *>(inputs[1]->addr); | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -20,9 +20,15 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kTensorAddInputsSize = 2; | |||
| constexpr size_t kTensorAddOutputsSize = 1; | |||
| } // namespace | |||
| template <typename T> | |||
| void TensorAddCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| // Init shape ans strides | |||
| input_shape_a_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| input_shape_b_ = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| @@ -33,6 +39,8 @@ template <typename T> | |||
| bool TensorAddCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kTensorAddInputsSize, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kTensorAddOutputsSize, kernel_name_); | |||
| T *input_addr_a = reinterpret_cast<T *>(inputs[0]->addr); | |||
| T *input_addr_b = reinterpret_cast<T *>(inputs[1]->addr); | |||
| T *output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -20,10 +20,15 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void TileCPUKernel::TileMultipleCompute(void) { | |||
| namespace { | |||
| constexpr size_t kTileInputsNum = 1; | |||
| constexpr size_t kTileOutputsNum = 1; | |||
| } // namespace | |||
| void TileCPUKernel::TileMultipleCompute() { | |||
| int large_one_multiple_count_ = 0; | |||
| int multiple = 0; | |||
| int mul_index = 0; | |||
| size_t mul_index = 0; | |||
| for (size_t i = 0; i < multiples_.size(); i++) { | |||
| tile_parameter_.multiples_[i] = multiples_[i]; | |||
| if (tile_parameter_.multiples_[i] > 1) { | |||
| @@ -47,6 +52,10 @@ void TileCPUKernel::TileMultipleCompute(void) { | |||
| void TileCPUKernel::TileTensorParamrInit(const CNodePtr &kernel_node) { | |||
| x_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| y_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| if (x_shape_.size() > MAX_TILE_DIM_SIZE || x_shape_.size() > y_shape_.size()) { | |||
| MS_LOG(EXCEPTION) << "Tile input shape should not be greater than default max size :" << MAX_TILE_DIM_SIZE | |||
| << " and output shape : " << y_shape_.size() << ", but got input shape " << x_shape_.size(); | |||
| } | |||
| std::vector<int64_t> multiples_me = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, "multiples"); | |||
| (void)std::transform(multiples_me.begin(), multiples_me.end(), std::back_inserter(multiples_), | |||
| [](const int64_t &value) { return LongToInt(value); }); | |||
| @@ -54,17 +63,9 @@ void TileCPUKernel::TileTensorParamrInit(const CNodePtr &kernel_node) { | |||
| size_t ones = multiples_.size() - x_shape_.size(); | |||
| if (ones > 0) { | |||
| for (size_t i = 0; i < ones; ++i) { | |||
| x_shape_.insert(x_shape_.begin(), 1); | |||
| (void)x_shape_.insert(x_shape_.begin(), 1); | |||
| } | |||
| } | |||
| if (x_shape_.size() > MAX_TILE_DIM_SIZE) { | |||
| MS_LOG(EXCEPTION) << "Input shape size should not greater than " << MAX_TILE_DIM_SIZE << ", but got " | |||
| << x_shape_.size(); | |||
| } | |||
| if (y_shape_.size() < x_shape_.size()) { | |||
| MS_LOG(EXCEPTION) << "Output shape size should not less than input shape size, but got output shape: " << y_shape_ | |||
| << ", input shape: " << x_shape_; | |||
| } | |||
| input_size_ = 1; | |||
| tile_parameter_.in_dim_ = x_shape_.size(); | |||
| @@ -88,7 +89,7 @@ void TileCPUKernel::TileTensorParamrInit(const CNodePtr &kernel_node) { | |||
| void TileCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| CheckParam(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| TileTensorParamrInit(kernel_node); | |||
| launch_map_[kNumberTypeInt8] = &TileCPUKernel::LaunchKernel<int8_t>; | |||
| @@ -112,6 +113,8 @@ void TileCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool TileCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kTileInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kTileOutputsNum, kernel_name_); | |||
| launch_func_(this, inputs, outputs); | |||
| return true; | |||
| } | |||
| @@ -132,16 +135,5 @@ void TileCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const st | |||
| Tile(x_addr, y_addr, &tile_parameter_); | |||
| } | |||
| void TileCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but TileCPUKernel needs 1 input."; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but TileCPUKernel needs 1 output."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -36,6 +36,7 @@ class TileCPUKernel : public CPUKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| @@ -43,8 +44,6 @@ class TileCPUKernel : public CPUKernel { | |||
| void TileMultipleCompute(void); | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| std::vector<size_t> x_shape_; | |||
| std::vector<size_t> y_shape_; | |||
| std::vector<int> multiples_; | |||
| @@ -54,8 +53,8 @@ class TileCPUKernel : public CPUKernel { | |||
| std::unordered_map<TypeId, TypeKernel> launch_map_; | |||
| TypeKernel launch_func_; | |||
| TileParameter tile_parameter_; | |||
| bool one_dim_tile_; | |||
| size_t input_size_; | |||
| bool one_dim_tile_{false}; | |||
| size_t input_size_{0}; | |||
| }; | |||
| MS_REG_CPU_KERNEL(Tile, KernelAttr(), TileCPUKernel); | |||
| @@ -21,6 +21,11 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kTopKInputsNum = 2; | |||
| constexpr size_t kTopKOutputsNum = 2; | |||
| } // namespace | |||
| template <typename T> | |||
| void TopKCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspaces, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| @@ -87,8 +92,8 @@ void TopKCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const st | |||
| void TopKCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| auto x_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| if (x_shape_.size() < 1) { | |||
| MS_LOG(EXCEPTION) << "Input shape size should not less than 1"; | |||
| if (x_shape_.empty()) { | |||
| MS_LOG(EXCEPTION) << "Input shape is empty"; | |||
| } | |||
| for (size_t i = 0; i < x_shape_.size() - 1; ++i) { | |||
| outer_size_ *= x_shape_[i]; | |||
| @@ -107,6 +112,8 @@ void TopKCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { | |||
| bool TopKCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspaces, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kTopKInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kTopKOutputsNum, kernel_name_); | |||
| if (dtype_ == kNumberTypeFloat16) { | |||
| LaunchKernel<float16>(inputs, workspaces, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat32) { | |||