Browse Source

sync cpu op review on master

tags/v1.6.0
zuochuanyong 4 years ago
parent
commit
0e882f44cc
73 changed files with 754 additions and 699 deletions
  1. +13
    -4
      mindspore/ccsrc/backend/kernel_compiler/cpu/elu_grad_cpu_kernel.cc
  2. +2
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/elu_grad_cpu_kernel.h
  3. +20
    -12
      mindspore/ccsrc/backend/kernel_compiler/cpu/embedding_look_up_comm_grad_cpu_kernel.cc
  4. +3
    -2
      mindspore/ccsrc/backend/kernel_compiler/cpu/embedding_look_up_comm_grad_cpu_kernel.h
  5. +24
    -27
      mindspore/ccsrc/backend/kernel_compiler/cpu/embedding_look_up_cpu_kernel.cc
  6. +5
    -3
      mindspore/ccsrc/backend/kernel_compiler/cpu/embedding_look_up_cpu_kernel.h
  7. +13
    -5
      mindspore/ccsrc/backend/kernel_compiler/cpu/equal_count_cpu_kernel.cc
  8. +3
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/equal_count_cpu_kernel.h
  9. +13
    -16
      mindspore/ccsrc/backend/kernel_compiler/cpu/expm1_cpu_kernel.cc
  10. +2
    -3
      mindspore/ccsrc/backend/kernel_compiler/cpu/expm1_cpu_kernel.h
  11. +29
    -25
      mindspore/ccsrc/backend/kernel_compiler/cpu/gather_cpu_kernel.cc
  12. +4
    -4
      mindspore/ccsrc/backend/kernel_compiler/cpu/gather_cpu_kernel.h
  13. +11
    -7
      mindspore/ccsrc/backend/kernel_compiler/cpu/gather_d_cpu_kernel.cc
  14. +4
    -4
      mindspore/ccsrc/backend/kernel_compiler/cpu/gather_d_cpu_kernel.h
  15. +11
    -8
      mindspore/ccsrc/backend/kernel_compiler/cpu/gather_d_grad_cpu_kernel.cc
  16. +6
    -4
      mindspore/ccsrc/backend/kernel_compiler/cpu/gather_d_grad_cpu_kernel.h
  17. +18
    -4
      mindspore/ccsrc/backend/kernel_compiler/cpu/gathernd_cpu_kernel.cc
  18. +3
    -2
      mindspore/ccsrc/backend/kernel_compiler/cpu/gathernd_cpu_kernel.h
  19. +20
    -19
      mindspore/ccsrc/backend/kernel_compiler/cpu/hsigmoid_cpu_kernel.cc
  20. +4
    -4
      mindspore/ccsrc/backend/kernel_compiler/cpu/hsigmoid_cpu_kernel.h
  21. +20
    -19
      mindspore/ccsrc/backend/kernel_compiler/cpu/hsigmoid_grad_cpu_kernel.cc
  22. +4
    -4
      mindspore/ccsrc/backend/kernel_compiler/cpu/hsigmoid_grad_cpu_kernel.h
  23. +19
    -19
      mindspore/ccsrc/backend/kernel_compiler/cpu/hswish_cpu_kernel.cc
  24. +4
    -4
      mindspore/ccsrc/backend/kernel_compiler/cpu/hswish_cpu_kernel.h
  25. +22
    -20
      mindspore/ccsrc/backend/kernel_compiler/cpu/hswish_grad_cpu_kernel.cc
  26. +4
    -4
      mindspore/ccsrc/backend/kernel_compiler/cpu/hswish_grad_cpu_kernel.h
  27. +15
    -18
      mindspore/ccsrc/backend/kernel_compiler/cpu/isfinite_cpu_kernel.cc
  28. +1
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/isfinite_cpu_kernel.h
  29. +14
    -17
      mindspore/ccsrc/backend/kernel_compiler/cpu/isnan_cpu_kernel.cc
  30. +1
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/isnan_cpu_kernel.h
  31. +10
    -14
      mindspore/ccsrc/backend/kernel_compiler/cpu/l2loss_cpu_kernel.cc
  32. +3
    -2
      mindspore/ccsrc/backend/kernel_compiler/cpu/l2loss_cpu_kernel.h
  33. +11
    -13
      mindspore/ccsrc/backend/kernel_compiler/cpu/layer_norm_cpu_kernel.cc
  34. +2
    -2
      mindspore/ccsrc/backend/kernel_compiler/cpu/layer_norm_cpu_kernel.h
  35. +18
    -21
      mindspore/ccsrc/backend/kernel_compiler/cpu/layer_norm_grad_cpu_kernel.cc
  36. +2
    -2
      mindspore/ccsrc/backend/kernel_compiler/cpu/layer_norm_grad_cpu_kernel.h
  37. +25
    -18
      mindspore/ccsrc/backend/kernel_compiler/cpu/map_cache_idx_cpu_kernel.cc
  38. +2
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/map_cache_idx_cpu_kernel.h
  39. +16
    -6
      mindspore/ccsrc/backend/kernel_compiler/cpu/map_uniform_cpu_kernel.cc
  40. +3
    -2
      mindspore/ccsrc/backend/kernel_compiler/cpu/map_uniform_cpu_kernel.h
  41. +20
    -33
      mindspore/ccsrc/backend/kernel_compiler/cpu/maximum_cpu_kernel.cc
  42. +11
    -12
      mindspore/ccsrc/backend/kernel_compiler/cpu/maximum_cpu_kernel.h
  43. +10
    -13
      mindspore/ccsrc/backend/kernel_compiler/cpu/maximum_grad_cpu_kernel.cc
  44. +7
    -6
      mindspore/ccsrc/backend/kernel_compiler/cpu/maximum_grad_cpu_kernel.h
  45. +20
    -33
      mindspore/ccsrc/backend/kernel_compiler/cpu/minimum_cpu_kernel.cc
  46. +11
    -12
      mindspore/ccsrc/backend/kernel_compiler/cpu/minimum_cpu_kernel.h
  47. +13
    -18
      mindspore/ccsrc/backend/kernel_compiler/cpu/minimum_grad_cpu_kernel.cc
  48. +4
    -4
      mindspore/ccsrc/backend/kernel_compiler/cpu/minimum_grad_cpu_kernel.h
  49. +14
    -17
      mindspore/ccsrc/backend/kernel_compiler/cpu/mirror_pad_cpu_kernel.cc
  50. +8
    -8
      mindspore/ccsrc/backend/kernel_compiler/cpu/mirror_pad_cpu_kernel.h
  51. +35
    -45
      mindspore/ccsrc/backend/kernel_compiler/cpu/mirror_pad_grad_cpu_kernel.cc
  52. +11
    -12
      mindspore/ccsrc/backend/kernel_compiler/cpu/mirror_pad_grad_cpu_kernel.h
  53. +2
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/pooling_avg_grad_cpu_kernel.cc
  54. +7
    -7
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/pooling_cpu_kernel.cc
  55. +2
    -3
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/pooling_cpu_kernel.h
  56. +4
    -3
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/pooling_max_grad_cpu_kernel.cc
  57. +1
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/pooling_max_grad_cpu_kernel.h
  58. +10
    -4
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/softmax_cpu_kernel.cc
  59. +2
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/softmax_cpu_kernel.h
  60. +19
    -9
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.cc
  61. +2
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.h
  62. +19
    -10
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/sparse_softmax_cross_entropy_with_logits_cpu_kernel.cc
  63. +3
    -4
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/sparse_softmax_cross_entropy_with_logits_cpu_kernel.h
  64. +13
    -6
      mindspore/ccsrc/backend/kernel_compiler/cpu/one_hot_cpu_kernel.cc
  65. +6
    -4
      mindspore/ccsrc/backend/kernel_compiler/cpu/one_hot_cpu_kernel.h
  66. +10
    -22
      mindspore/ccsrc/backend/kernel_compiler/cpu/pack_cpu_kernel.cc
  67. +12
    -12
      mindspore/ccsrc/backend/kernel_compiler/cpu/pack_cpu_kernel.h
  68. +16
    -22
      mindspore/ccsrc/backend/kernel_compiler/cpu/pad_cpu_kernel.cc
  69. +7
    -7
      mindspore/ccsrc/backend/kernel_compiler/cpu/pad_cpu_kernel.h
  70. +17
    -8
      mindspore/ccsrc/backend/kernel_compiler/cpu/range_cpu_kernel.cc
  71. +6
    -3
      mindspore/ccsrc/backend/kernel_compiler/cpu/range_cpu_kernel.h
  72. +25
    -17
      mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.cc
  73. +3
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.h

+ 13
- 4
mindspore/ccsrc/backend/kernel_compiler/cpu/elu_grad_cpu_kernel.cc View File

@@ -13,16 +13,23 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/elu_grad_cpu_kernel.h"
#include <cmath>
#include <string>
#include <thread>
#include "backend/kernel_compiler/cpu/elu_grad_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kEleGradInputsNum = 2;
constexpr size_t kEleGradOutputsNum = 1;
} // namespace

void EluGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
if (dtype_ != AnfAlgo::GetInputDeviceDataType(kernel_node, 1)) {
MS_LOG(EXCEPTION) << "Input0 and input1 must have the same data type";
@@ -31,6 +38,8 @@ void EluGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {

bool EluGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kEleGradInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kEleGradOutputsNum, kernel_name_);
if (dtype_ == kNumberTypeFloat32 || dtype_ == kNumberTypeFloat) {
LaunchKernel<float>(inputs, outputs);
} else if (dtype_ == kNumberTypeFloat16) {
@@ -44,9 +53,9 @@ bool EluGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, con
template <typename T>
void EluGradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &outputs) const {
T *input0 = reinterpret_cast<T *>(inputs[0]->addr);
T *input1 = reinterpret_cast<T *>(inputs[1]->addr);
T *output = reinterpret_cast<T *>(outputs[0]->addr);
const auto *input0 = reinterpret_cast<T *>(inputs[0]->addr);
const auto *input1 = reinterpret_cast<T *>(inputs[1]->addr);
auto *output = reinterpret_cast<T *>(outputs[0]->addr);

size_t lens = outputs[0]->size > 0 ? static_cast<size_t>(outputs[0]->size / sizeof(T)) : 1;
auto task = [input0, input1, output](const size_t start, const size_t end) {


+ 2
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/elu_grad_cpu_kernel.h View File

@@ -13,8 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ELU_GRAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ELU_GRAD_CPU_KERNEL_H_

#include <memory>
#include <vector>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"


+ 20
- 12
mindspore/ccsrc/backend/kernel_compiler/cpu/embedding_look_up_comm_grad_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,18 +13,31 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <thread>
#include "backend/kernel_compiler/cpu/embedding_look_up_comm_grad_cpu_kernel.h"
#include <thread>
#include "runtime/device/cpu/cpu_device_address.h"
#include "runtime/device/cpu/mpi/mpi_interface.h"

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kEmbeddingLookupCommGradInputsNum = 1;
constexpr size_t kEmbeddingLookupCommGradOutputsNum = 1;
} // namespace

void EmbeddingLookUpCommGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
CheckParam(kernel_node);
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
split_num_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "split_num");
MS_LOG(INFO) << "split_num: " << split_num_;
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
if (split_num_ == 0) {
MS_LOG(EXCEPTION) << "The split_num_ must be larger than 0.";
}
if (input_shape.size() < 1) {
MS_LOG(EXCEPTION) << "The size of input's shape must be at least 1.";
}
if (input_shape[0] % split_num_ != 0) {
MS_LOG(EXCEPTION) << "Input shape[0] is " << input_shape[0] << ", but it must be multiple of split_num.";
}
@@ -33,14 +46,16 @@ void EmbeddingLookUpCommGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
bool EmbeddingLookUpCommGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kEmbeddingLookupCommGradInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kEmbeddingLookupCommGradOutputsNum, kernel_name_);
#if defined(_WIN32) || defined(_WIN64)
auto start_time = std::chrono::steady_clock::now();
#else
struct timeval start_time, end_time;
(void)gettimeofday(&start_time, nullptr);
#endif
auto input_addr = reinterpret_cast<float *>(inputs[0]->addr);
auto output_addr = reinterpret_cast<float *>(outputs[0]->addr);
auto *input_addr = reinterpret_cast<float *>(inputs[0]->addr);
auto *output_addr = reinterpret_cast<float *>(outputs[0]->addr);
size_t input_size = inputs[0]->size;
size_t output_size = outputs[0]->size;
MS_LOG(DEBUG) << "input addr: " << input_addr << "input size: " << input_size;
@@ -67,12 +82,5 @@ bool EmbeddingLookUpCommGradCPUKernel::Launch(const std::vector<kernel::AddressP
#endif
return true;
}

void EmbeddingLookUpCommGradCPUKernel::CheckParam(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 1) {
MS_LOG(EXCEPTION) << "Argument number is " << input_num << ", but EmbeddingLookUpCommGradCPUKernel needs 1.";
}
}
} // namespace kernel
} // namespace mindspore

+ 3
- 2
mindspore/ccsrc/backend/kernel_compiler/cpu/embedding_look_up_comm_grad_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,8 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_EMBEDDING_LOOK_UP_COMM_GRAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_EMBEDDING_LOOK_UP_COMM_GRAD_CPU_KERNEL_H_

#include <vector>
#include <memory>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
@@ -33,7 +35,6 @@ class EmbeddingLookUpCommGradCPUKernel : public CPUKernel {
const std::vector<AddressPtr> &outputs) override;

private:
void CheckParam(const CNodePtr &kernel_node);
int64_t split_num_;
};



+ 24
- 27
mindspore/ccsrc/backend/kernel_compiler/cpu/embedding_look_up_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,9 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/embedding_look_up_cpu_kernel.h"
#include <thread>
#include <string>
#include "backend/kernel_compiler/cpu/embedding_look_up_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "ir/primitive.h"
#include "common/thread_pool.h"
@@ -23,6 +24,11 @@
namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kBlockSize = 10000;
constexpr size_t kEmbeddingLookupInputsNum = 2;
constexpr size_t kEmbeddingLookupOutputsNum = 1;
constexpr size_t kEmbeddingLookupInputParamsMaxDim = 2;

template <typename T>
void LookUpTableTask(const float *input_addr, const T *indices_addr, float *output_addr, size_t indices_lens,
size_t outer_dim_size, T offset, size_t first_dim_size) {
@@ -48,11 +54,13 @@ void LookUpTableTask(const float *input_addr, const T *indices_addr, float *outp
} // namespace

void EmbeddingLookUpCPUKernel::InitKernel(const CNodePtr &kernel_node) {
CheckParam(kernel_node);
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
node_wpt_ = kernel_node;
std::vector<size_t> input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
if (input_shape.empty()) {
MS_LOG(EXCEPTION) << "Param must be at least 1D";
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
if (input_shape.empty() || input_shape.size() > kEmbeddingLookupInputParamsMaxDim) {
MS_LOG(EXCEPTION) << "EmbeddingLookUpCPUKernel support 1-" << kEmbeddingLookupInputParamsMaxDim
<< "D input tensor, but got " << input_shape.size() << "D.";
}
first_dim_size_ = input_shape[0];
outer_dim_size_ = 1;
@@ -74,11 +82,11 @@ template <typename T>
void EmbeddingLookUpCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &outputs) {
if (!node_wpt_.expired()) {
auto node_ = node_wpt_.lock();
if (!node_) {
auto node = node_wpt_.lock();
if (!node) {
MS_LOG(EXCEPTION) << "node_wpt_ is expired.";
}
std::vector<size_t> input_shape = AnfAlgo::GetPrevNodeOutputInferShape(node_, 0);
std::vector<size_t> input_shape = AnfAlgo::GetPrevNodeOutputInferShape(node, 0);
if (input_shape.empty()) {
MS_LOG(EXCEPTION) << "Param must be at least 1D";
}
@@ -89,15 +97,15 @@ void EmbeddingLookUpCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr
}

indices_lens_ = 1;
std::vector<size_t> indices_shape = AnfAlgo::GetPrevNodeOutputInferShape(node_, 1);
std::vector<size_t> indices_shape = AnfAlgo::GetPrevNodeOutputInferShape(node, 1);
for (const auto &shape : indices_shape) {
indices_lens_ *= shape;
}
}
auto input_addr = reinterpret_cast<float *>(inputs[0]->addr);
auto indices_addr = reinterpret_cast<T *>(inputs[1]->addr);
auto output_addr = reinterpret_cast<float *>(outputs[0]->addr);
size_t thread_num = indices_lens_ / 10000 + 1;
const auto *input_addr = reinterpret_cast<float *>(inputs[0]->addr);
const auto *indices_addr = reinterpret_cast<T *>(inputs[1]->addr);
auto *output_addr = reinterpret_cast<float *>(outputs[0]->addr);
size_t thread_num = indices_lens_ / kBlockSize + 1;
auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
thread_num = thread_num > max_thread_num ? max_thread_num : thread_num;
std::vector<common::Task> tasks;
@@ -127,6 +135,8 @@ void EmbeddingLookUpCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr
bool EmbeddingLookUpCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kEmbeddingLookupInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kEmbeddingLookupOutputsNum, kernel_name_);
if (indices_data_type_ == kNumberTypeInt32) {
LaunchKernel<int>(inputs, outputs);
} else {
@@ -134,18 +144,5 @@ bool EmbeddingLookUpCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inp
}
return true;
}

void EmbeddingLookUpCPUKernel::CheckParam(const CNodePtr &kernel_node) {
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
if (input_shape.size() > 4) {
MS_LOG(EXCEPTION) << "Input dims is " << input_shape.size()
<< ", but EmbeddingLookUpCPUKernel only support 4d or lower.";
}

size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 2) {
MS_LOG(EXCEPTION) << "Argument number is " << input_num << ", but EmbeddingLookUpCPUKernel needs 2.";
}
}
} // namespace kernel
} // namespace mindspore

+ 5
- 3
mindspore/ccsrc/backend/kernel_compiler/cpu/embedding_look_up_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,8 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_EMBEDDING_LOOK_UP_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_EMBEDDING_LOOK_UP_CPU_KERNEL_H_

#include <vector>
#include <memory>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
@@ -31,11 +33,11 @@ class EmbeddingLookUpCPUKernel : public CPUKernel {

bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;

protected:
template <typename T>
void LaunchKernel(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs);

protected:
void CheckParam(const CNodePtr &kernel_node);
int64_t offset_{0};
size_t indices_lens_{1};
size_t first_dim_size_{1};


+ 13
- 5
mindspore/ccsrc/backend/kernel_compiler/cpu/equal_count_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,18 +13,26 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/equal_count_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"

namespace mindspore {
namespace kernel {
void EqualCountCPUKernel::InitKernel(const CNodePtr &) {}
namespace {
constexpr size_t kEqualCountInputsNum = 2;
constexpr size_t kEqualCountOutputsNum = 1;
} // namespace

void EqualCountCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
}

bool EqualCountCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
if (inputs.size() < 2 || outputs.empty()) {
MS_LOG(EXCEPTION) << "Input or output empty!";
}
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kEqualCountInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kEqualCountOutputsNum, kernel_name_);
if (inputs[0]->size != inputs[1]->size) {
MS_LOG(EXCEPTION) << "The size of input0 must be equal to the size of input1!";
}


+ 3
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/equal_count_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,8 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_EQUAL_COUNT_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_EQUAL_COUNT_CPU_KERNEL_H_

#include <vector>
#include <memory>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"


+ 13
- 16
mindspore/ccsrc/backend/kernel_compiler/cpu/expm1_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -20,28 +20,27 @@

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kExpm1InputsNum = 1;
constexpr size_t kExpm1OutputsNum = 1;
} // namespace

void Expm1CPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 1) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but Expm1CPUKernel needs 1 inputs.";
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but Expm1CPUKernel needs 1 output.";
}

kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
input_dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
if (input_dtype_ != kNumberTypeFloat16 && input_dtype_ != kNumberTypeFloat32 && input_dtype_ != kNumberTypeFloat) {
if (input_dtype_ != kNumberTypeFloat16 && input_dtype_ != kNumberTypeFloat32) {
MS_LOG(EXCEPTION) << "Unsupported input type found.";
}
}

bool Expm1CPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kExpm1InputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kExpm1OutputsNum, kernel_name_);
if (input_dtype_ == kNumberTypeFloat16) {
LaunchKernel<float16>(inputs, outputs);
} else if (input_dtype_ == kNumberTypeFloat32 || input_dtype_ == kNumberTypeFloat) {
} else if (input_dtype_ == kNumberTypeFloat32) {
LaunchKernel<float>(inputs, outputs);
} else {
MS_LOG(EXCEPTION) << "Only support float, half, but actual data type is " << TypeIdLabel(input_dtype_);
@@ -52,11 +51,9 @@ bool Expm1CPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const
template <typename T>
void Expm1CPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &outputs) {
T *input = reinterpret_cast<T *>(inputs[0]->addr);
T *output = reinterpret_cast<T *>(outputs[0]->addr);

const auto *input = reinterpret_cast<T *>(inputs[0]->addr);
auto *output = reinterpret_cast<T *>(outputs[0]->addr);
size_t elem_num = inputs[0]->size / sizeof(T);

for (size_t i = 0; i < elem_num; i++) {
output[i] = exp(input[i]) - T(1);
}


+ 2
- 3
mindspore/ccsrc/backend/kernel_compiler/cpu/expm1_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_EXPM1_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_EXPM1_CPU_KERNEL_H_

@@ -45,8 +46,6 @@ MS_REG_CPU_KERNEL(Expm1, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutput

MS_REG_CPU_KERNEL(Expm1, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
Expm1CPUKernel);

MS_REG_CPU_KERNEL(Expm1, KernelAttr().AddInputAttr(kNumberTypeFloat).AddOutputAttr(kNumberTypeFloat32), Expm1CPUKernel);
} // namespace kernel
} // namespace mindspore



+ 29
- 25
mindspore/ccsrc/backend/kernel_compiler/cpu/gather_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/gather_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "nnacl/gather_parameter.h"
@@ -21,12 +22,23 @@

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kGatherInputsNum = 2;
constexpr size_t kGatherOutputsNum = 1;
constexpr size_t kGatherInputParamsMaxDim = 4;
} // namespace

template <typename T>
void GatherV2CPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
CheckParam(kernel_node);
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
indices_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0);
if (input_shape_.size() > kGatherInputParamsMaxDim) {
MS_LOG(EXCEPTION) << "Input dims is " << input_shape_.size() << ", but GatherV2CPUKernel only support "
<< kGatherInputParamsMaxDim << "D or lower.";
}
if (!is_dynamic_shape_) {
axis_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS);
}
@@ -36,9 +48,11 @@ template <typename T>
bool GatherV2CPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
auto input_tensor = reinterpret_cast<int8_t *>(inputs[0]->addr);
indices_data_ = reinterpret_cast<int32_t *>(inputs[1]->addr);
auto output_addr = reinterpret_cast<int8_t *>(outputs[0]->addr);
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kGatherInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kGatherOutputsNum, kernel_name_);
const auto *input_tensor = reinterpret_cast<int8_t *>(inputs[0]->addr);
const auto *indices_data = reinterpret_cast<int32_t *>(inputs[1]->addr);
auto *output_addr = reinterpret_cast<int8_t *>(outputs[0]->addr);
if (is_dynamic_shape_) {
axis_ = reinterpret_cast<int64_t *>(inputs[2]->addr)[0];
}
@@ -51,13 +65,14 @@ bool GatherV2CPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
axis_ = axis_ + dims;
}

int max_thread_num = static_cast<int>(common::ThreadPool::GetInstance().GetSyncRunThreadNum());
ParallelRun(input_tensor, output_addr, max_thread_num);
int max_thread_num = SizeToInt(common::ThreadPool::GetInstance().GetSyncRunThreadNum());
ParallelRun(input_tensor, indices_data, output_addr, max_thread_num);
return true;
}

template <typename T>
void GatherV2CPUKernel<T>::ParallelRun(int8_t *input_addr, int8_t *output_addr, int thread_num) {
void GatherV2CPUKernel<T>::ParallelRun(const int8_t *input_addr, const int *indices_data, int8_t *output_addr,
int thread_num) {
size_t outer_size = 1, inner_size = 1;
auto axis = static_cast<size_t>(axis_);
for (size_t i = 0; i < axis; ++i) {
@@ -76,12 +91,14 @@ void GatherV2CPUKernel<T>::ParallelRun(int8_t *input_addr, int8_t *output_addr,
int thread_index = 0;
while (thread_index < thread_num) {
int count = SizeToInt(MSMIN(stride, outer_size - stride * IntToSize(thread_index)));
if (count <= 0) break;
if (count <= 0) {
break;
}
auto thread_stride = static_cast<size_t>(stride * thread_index);
int8_t *in = input_addr + thread_stride * limit * inner_size * sizeof(T);
const int8_t *in = input_addr + thread_stride * limit * inner_size * sizeof(T);
int8_t *out = output_addr + thread_stride * indices_element_size * inner_size * sizeof(T);
auto block = [this, in, count, inner_size, limit, indices_element_size, out, thread_index]() {
int ret = Gather(in, count, inner_size, limit, indices_data_, indices_element_size, out, sizeof(T));
auto block = [this, in, indices_data, count, inner_size, limit, indices_element_size, out, thread_index]() {
int ret = Gather(in, count, inner_size, limit, indices_data, indices_element_size, out, sizeof(T));
if (ret != 0) {
MS_LOG(ERROR) << "GatherRun error task_id[" << thread_index << "] error_code[" << ret << "]";
return common::FAIL;
@@ -95,18 +112,5 @@ void GatherV2CPUKernel<T>::ParallelRun(int8_t *input_addr, int8_t *output_addr,
MS_LOG(EXCEPTION) << "SyncRun error!";
}
}

template <typename T>
void GatherV2CPUKernel<T>::CheckParam(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num == 3) {
is_dynamic_shape_ = true;
MS_LOG(DEBUG) << " GatherV2CPUKernel running in Dynamic Mode.";
} else if (input_num == 2) {
MS_LOG(DEBUG) << " GatherV2CPUKernel running in Normal Mode.";
} else {
MS_LOG(EXCEPTION) << "Argument number is " << input_num << ", but GatherV2CPUKernel needs 2.";
}
}
} // namespace kernel
} // namespace mindspore

+ 4
- 4
mindspore/ccsrc/backend/kernel_compiler/cpu/gather_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,8 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_GATHER_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_GATHER_CPU_KERNEL_H_

#include <vector>
#include <memory>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
@@ -35,12 +37,10 @@ class GatherV2CPUKernel : public CPUKernel {
const std::vector<AddressPtr> &outputs) override;

private:
void CheckParam(const CNodePtr &kernel_node);
void ParallelRun(int8_t *input_addr, int8_t *output_addr, int thread_num);
void ParallelRun(const int8_t *input_addr, const int *indices_data, int8_t *output_addr, int thread_num);
std::vector<size_t> input_shape_;
std::vector<size_t> indices_shape_;
std::vector<size_t> output_shape_;
int *indices_data_ = nullptr;
int64_t axis_{0};
bool is_dynamic_shape_{false};
};


+ 11
- 7
mindspore/ccsrc/backend/kernel_compiler/cpu/gather_d_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -19,6 +19,9 @@
namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kGatherDInputsNum = 3;
constexpr size_t kGatherDOutputsNum = 1;

size_t get_element_num(const std::vector<size_t> &shape) {
size_t size = 1;
for (size_t i = 0; i < shape.size(); i++) {
@@ -63,6 +66,8 @@ void CopyTask(size_t cur, std::vector<size_t> *pos, T *input, const I *index, co

template <typename T, typename I>
void GatherDCPUKernel<T, I>::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
index_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 2);
if (input_shape_.size() != index_shape_.size()) {
@@ -76,6 +81,8 @@ template <typename T, typename I>
bool GatherDCPUKernel<T, I>::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kGatherDInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kGatherDOutputsNum, kernel_name_);
size_t input_size = get_element_num(input_shape_) * sizeof(T);
size_t index_size = get_element_num(index_shape_) * sizeof(I);
size_t dim_size = sizeof(int);
@@ -83,17 +90,15 @@ bool GatherDCPUKernel<T, I>::Launch(const std::vector<kernel::AddressPtr> &input
if (inputs[0]->size != input_size || inputs[1]->size != dim_size || inputs[2]->size != index_size ||
outputs[0]->size != output_size) {
MS_LOG(EXCEPTION) << "invalid input or output data size!";
return false;
}
auto input = reinterpret_cast<T *>(inputs[0]->addr);
auto dim = reinterpret_cast<int32_t *>(inputs[1]->addr);
auto index = reinterpret_cast<I *>(inputs[2]->addr);
auto *input = reinterpret_cast<T *>(inputs[0]->addr);
auto *dim = reinterpret_cast<int32_t *>(inputs[1]->addr);
auto *index = reinterpret_cast<I *>(inputs[2]->addr);
auto output = reinterpret_cast<T *>(outputs[0]->addr);
int32_t input_rank = SizeToInt(input_shape_.size());
if (dim[0] >= input_rank || dim[0] < -input_rank) {
MS_LOG(EXCEPTION) << "The value of 'dim' should be in [" << -input_rank << ", " << input_rank
<< "], but got: " << dim[0];
return false;
}
if (dim[0] < 0) {
dim[0] = static_cast<int>(dim[0] + input_rank);
@@ -105,7 +110,6 @@ bool GatherDCPUKernel<T, I>::Launch(const std::vector<kernel::AddressPtr> &input
if (index[i] >= max_index || index[i] < -max_index) {
MS_LOG(EXCEPTION) << "The value of index should be in [" << -max_index << ", " << max_index
<< "], but got: " << index[i];
return false;
}
if (index[i] < 0) {
index[i] = max_index + index[i];


+ 4
- 4
mindspore/ccsrc/backend/kernel_compiler/cpu/gather_d_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,8 +13,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_GATHERD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_GATHERD_CPU_KERNEL_H_
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_GATHER_D_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_GATHER_D_CPU_KERNEL_H_
#include <vector>
#include <memory>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
@@ -52,4 +52,4 @@ MS_REG_CPU_KERNEL_T_S(GatherD, KernelAttr(), GatherDCPUKernel, bool, int64_t);
} // namespace kernel
} // namespace mindspore

#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_GATHERD_CPU_KERNEL_H_
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_GATHER_D_CPU_KERNEL_H_

+ 11
- 8
mindspore/ccsrc/backend/kernel_compiler/cpu/gather_d_grad_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,12 +13,16 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/gather_d_grad_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kGatherDGradInputsNum = 2;
constexpr size_t kGatherDGradOutputsNum = 1;

size_t get_element_num(const std::vector<size_t> &shape) {
size_t size = 1;
for (size_t i = 0; i < shape.size(); i++) {
@@ -59,6 +63,8 @@ void GatherDGradCopyTask(size_t cur, std::vector<size_t> *pos, T *input, I *inde

template <typename I, typename T>
void GatherDGradCPUKernel<I, T>::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
index_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
if (input_shape_ != index_shape_) {
@@ -72,25 +78,23 @@ template <typename I, typename T>
bool GatherDGradCPUKernel<I, T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kGatherDGradInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kGatherDGradOutputsNum, kernel_name_);
size_t input_size = get_element_num(input_shape_) * sizeof(T);
size_t index_size = get_element_num(index_shape_) * sizeof(I);
size_t output_size = get_element_num(output_shape_) * sizeof(T);
if (inputs[0]->size != index_size || inputs[1]->size != input_size || outputs[0]->size != output_size) {
MS_LOG(EXCEPTION) << "invalid input or output data size!";
return false;
}

auto index = reinterpret_cast<I *>(inputs[0]->addr);
auto input = reinterpret_cast<T *>(inputs[1]->addr);
auto *index = reinterpret_cast<I *>(inputs[0]->addr);
auto *input = reinterpret_cast<T *>(inputs[1]->addr);
auto out = reinterpret_cast<T *>(outputs[0]->addr);

int output_rank = SizeToInt(output_shape_.size());
if (axis_ >= output_rank || axis_ < -output_rank) {
MS_LOG(EXCEPTION) << "The value of 'axis_' should be in [" << -output_rank << ", " << output_rank
<< "], but got: " << axis_;
return false;
}

if (axis_ < 0) {
axis_ = axis_ + SizeToInt(output_shape_.size());
}
@@ -102,7 +106,6 @@ bool GatherDGradCPUKernel<I, T>::Launch(const std::vector<kernel::AddressPtr> &i
if (index[i] >= max_index || index[i] < -max_index) {
MS_LOG(EXCEPTION) << "The value of index should be in [" << -max_index << ", " << max_index
<< "], but got: " << index[i];
return false;
}
if (index[i] < 0) {
index[i] = max_index + index[i];


+ 6
- 4
mindspore/ccsrc/backend/kernel_compiler/cpu/gather_d_grad_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,8 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_GATHERDGRAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_GATHERDGRAD_CPU_KERNEL_H_

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_GATHER_D_GRAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_GATHER_D_GRAD_CPU_KERNEL_H_

#include <vector>
#include <memory>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
@@ -37,7 +39,7 @@ class GatherDGradCPUKernel : public CPUKernel {
std::vector<size_t> input_shape_;
std::vector<size_t> index_shape_;
std::vector<size_t> output_shape_;
int32_t axis_;
int32_t axis_{1};
};

MS_REG_CPU_KERNEL_T_S(GatherDGrad, KernelAttr(), GatherDGradCPUKernel, int32_t, int32_t);


+ 18
- 4
mindspore/ccsrc/backend/kernel_compiler/cpu/gathernd_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,14 +13,23 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/gathernd_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
#define MAX_INT (((unsigned int)(-1)) >> 1)

namespace mindspore {
namespace kernel {
namespace {
#define MAX_INT (((unsigned int)(-1)) >> 1)

constexpr size_t kGatherNdInputsNum = 2;
constexpr size_t kGatherNdOutputsNum = 1;
} // namespace

template <typename T>
void GatherNdCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
input_shapes_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
indices_shapes_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
output_shapes_ = AnfAlgo::GetOutputInferShape(kernel_node, 0);
@@ -35,6 +44,9 @@ void GatherNdCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {

size_t dim_after_indices = 1;
size_t dim_indices_last = indices_shapes_[indices_shapes_.size() - IntToSize(1)];
if (dim_indices_last == 0) {
MS_LOG(EXCEPTION) << "Value of indices_shapes_[" << indices_shapes_.size() << " - 1] should not be 0";
}
for (size_t i = dim_indices_last; i < input_shapes_.size(); i++) {
dim_after_indices *= input_shapes_[i];
}
@@ -61,8 +73,10 @@ template <typename T>
bool GatherNdCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
auto indices_addr = reinterpret_cast<int *>(inputs[1]->addr);
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kGatherNdInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kGatherNdOutputsNum, kernel_name_);
const auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr);
const auto *indices_addr = reinterpret_cast<int *>(inputs[1]->addr);
auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);

size_t output_dim0 = dims_[0];


+ 3
- 2
mindspore/ccsrc/backend/kernel_compiler/cpu/gathernd_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,8 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_GATHERND_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_GATHERND_CPU_KERNEL_H_

#include <vector>
#include <memory>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
@@ -56,7 +58,6 @@ MS_REG_CPU_KERNEL_T(GatherNd, KernelAttr(), GatherNdCPUKernel, uint32_t);
MS_REG_CPU_KERNEL_T(GatherNd, KernelAttr(), GatherNdCPUKernel, uint64_t);
MS_REG_CPU_KERNEL_T(GatherNd, KernelAttr(), GatherNdCPUKernel, float);
MS_REG_CPU_KERNEL_T(GatherNd, KernelAttr(), GatherNdCPUKernel, double);

} // namespace kernel
} // namespace mindspore



+ 20
- 19
mindspore/ccsrc/backend/kernel_compiler/cpu/hsigmoid_cpu_kernel.cc View File

@@ -20,9 +20,15 @@

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kHSigmoidInputsNum = 1;
constexpr size_t kHSigmoidOutputsNum = 1;
} // namespace

template <typename T>
void HSigmoidCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
CheckParam(kernel_node);
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
x_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
for (const uint64_t &d : x_shape_) {
tensor_size_ *= d;
@@ -33,33 +39,28 @@ template <typename T>
bool HSigmoidCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
auto x = reinterpret_cast<T *>(inputs[0]->addr);
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kHSigmoidInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kHSigmoidOutputsNum, kernel_name_);
const auto *x = reinterpret_cast<T *>(inputs[0]->addr);
auto y = reinterpret_cast<T *>(outputs[0]->addr);
auto zero = static_cast<T>(0);
auto one = static_cast<T>(1);
auto three = static_cast<T>(3);
auto six = static_cast<T>(6);

auto task = [&](size_t start, size_t end) {
for (uint64_t i = start; i < end; ++i) {
if (x[i] <= -3) {
y[i] = 0;
} else if (x[i] >= 3) {
y[i] = 1;
if (x[i] + three <= zero) {
y[i] = zero;
} else if (x[i] >= three) {
y[i] = one;
} else {
y[i] = (x[i] + 3) / 6;
y[i] = (x[i] + three) / six;
}
}
};
CPUKernelUtils::ParallelFor(task, tensor_size_);
return true;
}

// Checks the kernel node's I/O arity for HSigmoid, a unary elementwise op:
// exactly 1 input tensor and 1 output tensor are required.
// MS_LOG(EXCEPTION) is fatal, so a mismatch aborts kernel initialization.
template <typename T>
void HSigmoidCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) {
// Verify input count (the x tensor).
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 1) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but HSigmoidCPUKernel needs 1 input.";
}
// Verify output count (the y tensor).
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but HSigmoidCPUKernel needs 1 output.";
}
}
} // namespace kernel
} // namespace mindspore

+ 4
- 4
mindspore/ccsrc/backend/kernel_compiler/cpu/hsigmoid_cpu_kernel.h View File

@@ -14,8 +14,9 @@
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TILE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TILE_CPU_KERNEL_H_
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_HSIGMOID_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_HSIGMOID_CPU_KERNEL_H_

#include <memory>
#include <unordered_map>
#include <vector>
@@ -36,7 +37,6 @@ class HSigmoidCPUKernel : public CPUKernel {
const std::vector<AddressPtr> &outputs) override;

private:
void CheckParam(const CNodePtr &kernel_node);
std::vector<size_t> x_shape_;
uint64_t tensor_size_ = 1;
};
@@ -52,4 +52,4 @@ MS_REG_CPU_KERNEL_T(HSigmoid, KernelAttr(), HSigmoidCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(HSigmoid, KernelAttr(), HSigmoidCPUKernel, float);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TILE_CPU_KERNEL_H_
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_HSIGMOID_CPU_KERNEL_H_

+ 20
- 19
mindspore/ccsrc/backend/kernel_compiler/cpu/hsigmoid_grad_cpu_kernel.cc View File

@@ -20,9 +20,15 @@

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kHSigmoidGradInputsNum = 2;
constexpr size_t kHSigmoidGradOutputsNum = 1;
} // namespace

template <typename T>
void HSigmoidGradCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
CheckParam(kernel_node);
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
x_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
for (const uint64_t &d : x_shape_) {
tensor_size_ *= d;
@@ -33,32 +39,27 @@ template <typename T>
bool HSigmoidGradCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
auto dy = reinterpret_cast<T *>(inputs[0]->addr);
auto x = reinterpret_cast<T *>(inputs[1]->addr);
auto out = reinterpret_cast<T *>(outputs[0]->addr);
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kHSigmoidGradInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kHSigmoidGradOutputsNum, kernel_name_);
const auto *dy = reinterpret_cast<T *>(inputs[0]->addr);
const auto *x = reinterpret_cast<T *>(inputs[1]->addr);
auto *out = reinterpret_cast<T *>(outputs[0]->addr);

auto zero = static_cast<T>(0);
auto three = static_cast<T>(3);
auto six = static_cast<T>(6);

auto task = [&](size_t start, size_t end) {
for (uint64_t i = start; i < end; ++i) {
if (x[i] <= -3 || x[i] >= 3) {
out[i] = 0;
if (x[i] + three <= zero || x[i] >= three) {
out[i] = zero;
} else {
out[i] = dy[i] / 6;
out[i] = dy[i] / six;
}
}
};
CPUKernelUtils::ParallelFor(task, tensor_size_);
return true;
}

// Checks the kernel node's I/O arity for HSigmoidGrad: exactly 2 input
// tensors (upstream gradient dy and forward input x) and 1 output tensor.
// MS_LOG(EXCEPTION) is fatal, so a mismatch aborts kernel initialization.
template <typename T>
void HSigmoidGradCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) {
// Verify input count (dy and x).
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 2) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but HSigmoidGradCPUKernel needs 2 input.";
}
// Verify output count (the gradient w.r.t. x).
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but HSigmoidGradCPUKernel needs 1 output.";
}
}
} // namespace kernel
} // namespace mindspore

+ 4
- 4
mindspore/ccsrc/backend/kernel_compiler/cpu/hsigmoid_grad_cpu_kernel.h View File

@@ -14,8 +14,9 @@
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TILE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TILE_CPU_KERNEL_H_
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_HSIGMOID_GRAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_HSIGMOID_GRAD_CPU_KERNEL_H_

#include <memory>
#include <unordered_map>
#include <vector>
@@ -36,7 +37,6 @@ class HSigmoidGradCPUKernel : public CPUKernel {
const std::vector<AddressPtr> &outputs) override;

private:
void CheckParam(const CNodePtr &kernel_node);
std::vector<size_t> x_shape_;
uint64_t tensor_size_ = 1;
};
@@ -48,4 +48,4 @@ MS_REG_CPU_KERNEL_T(HSigmoidGrad, KernelAttr(), HSigmoidGradCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(HSigmoidGrad, KernelAttr(), HSigmoidGradCPUKernel, float);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TILE_CPU_KERNEL_H_
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_HSIGMOID_GRAD_CPU_KERNEL_H_

+ 19
- 19
mindspore/ccsrc/backend/kernel_compiler/cpu/hswish_cpu_kernel.cc View File

@@ -20,9 +20,15 @@

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kHSwishInputsNum = 1;
constexpr size_t kHSwishOutputsNum = 1;
} // namespace

template <typename T>
void HSwishCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
CheckParam(kernel_node);
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
x_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
for (const uint64_t &d : x_shape_) {
tensor_size_ *= d;
@@ -32,33 +38,27 @@ void HSwishCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
template <typename T>
bool HSwishCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
auto x = reinterpret_cast<T *>(inputs[0]->addr);
auto y = reinterpret_cast<T *>(outputs[0]->addr);
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kHSwishInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kHSwishOutputsNum, kernel_name_);
const auto *x = reinterpret_cast<T *>(inputs[0]->addr);
auto *y = reinterpret_cast<T *>(outputs[0]->addr);
auto zero = static_cast<T>(0);
auto three = static_cast<T>(3);
auto six = static_cast<T>(6);

auto task = [&](size_t start, size_t end) {
for (uint64_t i = start; i < end; ++i) {
if (x[i] <= -3) {
y[i] = 0;
} else if (x[i] >= 3) {
if (x[i] + three <= zero) {
y[i] = zero;
} else if (x[i] >= three) {
y[i] = x[i];
} else {
y[i] = x[i] * (x[i] + 3) / 6;
y[i] = x[i] * (x[i] + three) / six;
}
}
};
CPUKernelUtils::ParallelFor(task, tensor_size_);
return true;
}

// Checks the kernel node's I/O arity for HSwish, a unary elementwise op:
// exactly 1 input tensor and 1 output tensor are required.
// MS_LOG(EXCEPTION) is fatal, so a mismatch aborts kernel initialization.
template <typename T>
void HSwishCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) {
// Verify input count (the x tensor).
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 1) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but HSwishCPUKernel needs 1 input.";
}
// Verify output count (the y tensor).
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but HSwishCPUKernel needs 1 output.";
}
}
} // namespace kernel
} // namespace mindspore

+ 4
- 4
mindspore/ccsrc/backend/kernel_compiler/cpu/hswish_cpu_kernel.h View File

@@ -14,8 +14,9 @@
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TILE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TILE_CPU_KERNEL_H_
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_HSWISH_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_HSWISH_CPU_KERNEL_H_

#include <memory>
#include <unordered_map>
#include <vector>
@@ -36,7 +37,6 @@ class HSwishCPUKernel : public CPUKernel {
const std::vector<AddressPtr> &outputs) override;

private:
void CheckParam(const CNodePtr &kernel_node);
std::vector<size_t> x_shape_;
uint64_t tensor_size_ = 1;
};
@@ -48,4 +48,4 @@ MS_REG_CPU_KERNEL_T(HSwish, KernelAttr(), HSwishCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(HSwish, KernelAttr(), HSwishCPUKernel, float);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TILE_CPU_KERNEL_H_
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_HSWISH_CPU_KERNEL_H_

+ 22
- 20
mindspore/ccsrc/backend/kernel_compiler/cpu/hswish_grad_cpu_kernel.cc View File

@@ -20,9 +20,15 @@

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kHSwishGradInputsNum = 2;
constexpr size_t kHSwishGradOutputsNum = 1;
} // namespace

template <typename T>
void HSwishGradCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
CheckParam(kernel_node);
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
x_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
for (const uint64_t &d : x_shape_) {
tensor_size_ *= d;
@@ -33,34 +39,30 @@ template <typename T>
bool HSwishGradCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
auto dy = reinterpret_cast<T *>(inputs[0]->addr);
auto x = reinterpret_cast<T *>(inputs[1]->addr);
auto out = reinterpret_cast<T *>(outputs[0]->addr);
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kHSwishGradInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kHSwishGradOutputsNum, kernel_name_);
const auto *dy = reinterpret_cast<T *>(inputs[0]->addr);
const auto *x = reinterpret_cast<T *>(inputs[1]->addr);
auto *out = reinterpret_cast<T *>(outputs[0]->addr);

auto zero = static_cast<T>(0);
auto two = static_cast<T>(2);
auto three = static_cast<T>(3);
auto six = static_cast<T>(6);

auto task = [&](size_t start, size_t end) {
for (uint64_t i = start; i < end; ++i) {
if (x[i] <= -3) {
out[i] = 0;
} else if (x[i] >= 3) {
if (x[i] + three <= zero) {
out[i] = zero;
} else if (x[i] >= three) {
out[i] = dy[i];
} else {
out[i] = dy[i] * (2 * x[i] + 3) / 6;
out[i] = dy[i] * (two * x[i] + three) / six;
}
}
};
CPUKernelUtils::ParallelFor(task, tensor_size_);
return true;
}

// Checks the kernel node's I/O arity for HSwishGrad: exactly 2 input
// tensors (upstream gradient dy and forward input x) and 1 output tensor.
// MS_LOG(EXCEPTION) is fatal, so a mismatch aborts kernel initialization.
template <typename T>
void HSwishGradCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) {
// Verify input count (dy and x).
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 2) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but HSwishGradCPUKernel needs 2 input.";
}
// Verify output count (the gradient w.r.t. x).
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but HSwishGradCPUKernel needs 1 output.";
}
}
} // namespace kernel
} // namespace mindspore

+ 4
- 4
mindspore/ccsrc/backend/kernel_compiler/cpu/hswish_grad_cpu_kernel.h View File

@@ -14,8 +14,9 @@
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TILE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TILE_CPU_KERNEL_H_
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_HSWISH_GRAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_HSWISH_GRAD_CPU_KERNEL_H_

#include <memory>
#include <unordered_map>
#include <vector>
@@ -36,7 +37,6 @@ class HSwishGradCPUKernel : public CPUKernel {
const std::vector<AddressPtr> &outputs) override;

private:
void CheckParam(const CNodePtr &kernel_node);
std::vector<size_t> x_shape_;
uint64_t tensor_size_ = 1;
};
@@ -48,4 +48,4 @@ MS_REG_CPU_KERNEL_T(HSwishGrad, KernelAttr(), HSwishGradCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(HSwishGrad, KernelAttr(), HSwishGradCPUKernel, float);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TILE_CPU_KERNEL_H_
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_HSWISH_GRAD_CPU_KERNEL_H_

+ 15
- 18
mindspore/ccsrc/backend/kernel_compiler/cpu/isfinite_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -21,18 +21,15 @@

namespace mindspore {
namespace kernel {
void IsFiniteCPUKernel::InitKernel(const CNodePtr &kernelNode) {
MS_EXCEPTION_IF_NULL(kernelNode);
size_t input_num = AnfAlgo::GetInputTensorNum(kernelNode);
if (input_num != 1) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but IsFiniteCPUKernel needs 1 inputs.";
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernelNode);
if (output_num != 1) {
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but IsFiniteCPUKernel needs 1 output.";
}
namespace {
constexpr size_t kIsFiniteInputsNum = 1;
constexpr size_t kIsFiniteOutputsNum = 1;
} // namespace

input_dtype_ = AnfAlgo::GetInputDeviceDataType(kernelNode, 0);
void IsFiniteCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
input_dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
if (dtype_map_.find(input_dtype_) == dtype_map_.end()) {
MS_LOG(EXCEPTION) << "Unsupported input type found.";
}
@@ -40,24 +37,24 @@ void IsFiniteCPUKernel::InitKernel(const CNodePtr &kernelNode) {

bool IsFiniteCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kIsFiniteInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kIsFiniteOutputsNum, kernel_name_);
if (input_dtype_ == kNumberTypeFloat16) {
LaunchKernelFloat16(inputs, outputs);
} else if (input_dtype_ == kNumberTypeFloat32 || input_dtype_ == kNumberTypeFloat) {
} else if (input_dtype_ == kNumberTypeFloat32) {
LaunchKernelFloat<float>(inputs, outputs);
} else if (input_dtype_ == kNumberTypeFloat64) {
LaunchKernelFloat<double>(inputs, outputs);
} else if (dtype_map_.find(input_dtype_) != dtype_map_.end()) {
LaunchKernelOther(inputs, outputs);
} else {
MS_LOG(EXCEPTION) << "Only support bool, int, uint, float, but actual data type is " << TypeIdLabel(input_dtype_);
LaunchKernelOther(inputs, outputs);
}
return true;
}

void IsFiniteCPUKernel::LaunchKernelFloat16(const std::vector<AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &outputs) const {
float16 *input = reinterpret_cast<float16 *>(inputs[0]->addr);
bool *output = reinterpret_cast<bool *>(outputs[0]->addr);
const auto *input = reinterpret_cast<float16 *>(inputs[0]->addr);
auto *output = reinterpret_cast<bool *>(outputs[0]->addr);

size_t elem_num = inputs[0]->size / sizeof(float16);



+ 1
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/isfinite_cpu_kernel.h View File

@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ISFINITE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ISFINITE_CPU_KERNEL_H_



+ 14
- 17
mindspore/ccsrc/backend/kernel_compiler/cpu/isnan_cpu_kernel.cc View File

@@ -21,18 +21,15 @@

namespace mindspore {
namespace kernel {
void IsNanCPUKernel::InitKernel(const CNodePtr &kernelNode) {
MS_EXCEPTION_IF_NULL(kernelNode);
size_t input_num = AnfAlgo::GetInputTensorNum(kernelNode);
if (input_num != 1) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but IsNanCPUKernel needs 1 inputs.";
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernelNode);
if (output_num != 1) {
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but IsNanCPUKernel needs 1 output.";
}
namespace {
constexpr size_t kIsNanInputsNum = 1;
constexpr size_t kIsNanOutputsNum = 1;
} // namespace

input_dtype_ = AnfAlgo::GetInputDeviceDataType(kernelNode, 0);
void IsNanCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
input_dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
if (dtype_map_.find(input_dtype_) == dtype_map_.end()) {
MS_LOG(EXCEPTION) << "Unsupported input type found.";
}
@@ -40,24 +37,24 @@ void IsNanCPUKernel::InitKernel(const CNodePtr &kernelNode) {

bool IsNanCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kIsNanInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kIsNanOutputsNum, kernel_name_);
if (input_dtype_ == kNumberTypeFloat16) {
LaunchKernelFloat16(inputs, outputs);
} else if (input_dtype_ == kNumberTypeFloat32 || input_dtype_ == kNumberTypeFloat) {
} else if (input_dtype_ == kNumberTypeFloat32) {
LaunchKernelFloat<float>(inputs, outputs);
} else if (input_dtype_ == kNumberTypeFloat64) {
LaunchKernelFloat<double>(inputs, outputs);
} else if (dtype_map_.find(input_dtype_) != dtype_map_.end()) {
LaunchKernelOther(inputs, outputs);
} else {
MS_LOG(EXCEPTION) << "Only support bool, int, uint, float, but actual data type is " << TypeIdLabel(input_dtype_);
LaunchKernelOther(inputs, outputs);
}
return true;
}

void IsNanCPUKernel::LaunchKernelFloat16(const std::vector<AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &outputs) {
float16 *input = reinterpret_cast<float16 *>(inputs[0]->addr);
bool *output = reinterpret_cast<bool *>(outputs[0]->addr);
const auto *input = reinterpret_cast<float16 *>(inputs[0]->addr);
auto *output = reinterpret_cast<bool *>(outputs[0]->addr);

size_t elem_num = inputs[0]->size / sizeof(float16);



+ 1
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/isnan_cpu_kernel.h View File

@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ISNAN_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ISNAN_CPU_KERNEL_H_



+ 10
- 14
mindspore/ccsrc/backend/kernel_compiler/cpu/l2loss_cpu_kernel.cc View File

@@ -19,9 +19,15 @@

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kL2LossInputsNum = 1;
constexpr size_t kL2LossOutputsNum = 1;
} // namespace

template <typename T>
void L2LossCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
CheckParam(kernel_node);
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
std::vector<size_t> x_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
for (const size_t &d : x_shape) {
tensor_size_ *= d;
@@ -31,26 +37,16 @@ void L2LossCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
template <typename T>
bool L2LossCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kL2LossInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kL2LossOutputsNum, kernel_name_);
auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
auto result_addr = reinterpret_cast<T *>(outputs[0]->addr);
*result_addr = (T)0;
*result_addr = static_cast<T>(0);
for (size_t i = 0; i < tensor_size_; i++) {
*result_addr += input_addr[i] * input_addr[i];
}
*result_addr = *result_addr / 2;
return true;
}

template <typename T>
void L2LossCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 1) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but L2LossCPUKernel needs 1 input.";
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but L2LossCPUKernel needs 1 output.";
}
}
} // namespace kernel
} // namespace mindspore

+ 3
- 2
mindspore/ccsrc/backend/kernel_compiler/cpu/l2loss_cpu_kernel.h View File

@@ -16,6 +16,7 @@

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_L2_LOSS_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_L2_LOSS_CPU_KERNEL_H_

#include <memory>
#include <unordered_map>
#include <vector>
@@ -36,8 +37,8 @@ class L2LossCPUKernel : public CPUKernel {
const std::vector<AddressPtr> &outputs) override;

private:
void CheckParam(const CNodePtr &kernel_node);
size_t tensor_size_{1};
TypeId dtype_{kTypeUnknown};
size_t tensor_size_ = 1;
};

MS_REG_CPU_KERNEL_T(L2Loss, KernelAttr(), L2LossCPUKernel, float16);


+ 11
- 13
mindspore/ccsrc/backend/kernel_compiler/cpu/layer_norm_cpu_kernel.cc View File

@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/layer_norm_cpu_kernel.h"
#include "backend/kernel_compiler/common_utils.h"
#include "runtime/device/cpu/cpu_device_address.h"
@@ -20,8 +21,14 @@

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kLayerNormInputsNum = 3;
constexpr size_t kLayerNormOutputsNum = 3;
} // namespace

void LayerNormCPUKernel::InitKernel(const CNodePtr &kernel_node) {
CheckParam(kernel_node);
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
std::vector<size_t> x_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto begin_norm_axis = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "begin_norm_axis");
@@ -48,12 +55,14 @@ void LayerNormCPUKernel::InitKernel(const CNodePtr &kernel_node) {

bool LayerNormCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kLayerNormInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kLayerNormOutputsNum, kernel_name_);
if (dtype_ == kNumberTypeFloat16) {
LaunchKernel<float16>(inputs, outputs);
} else if (dtype_ == kNumberTypeFloat32 || dtype_ == kNumberTypeFloat64) {
LaunchKernel<float>(inputs, outputs);
} else {
MS_LOG(EXCEPTION) << "Input dtype only support float16, float32, float64!";
MS_LOG(EXCEPTION) << "Input dtype only support float16, float32, float64";
}
return true;
}
@@ -111,16 +120,5 @@ void LayerNormCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, con
}
(void)common::ThreadPool::GetInstance().SyncRun(tasks);
}

void LayerNormCPUKernel::CheckParam(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 3) {
MS_LOG(EXCEPTION) << "LayerNormCPUKernel needs 3 inputs, but gets " << input_num;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 3) {
MS_LOG(EXCEPTION) << "LayerNormCPUKernel expects 3 output, but gets" << output_num;
}
}
} // namespace kernel
} // namespace mindspore

+ 2
- 2
mindspore/ccsrc/backend/kernel_compiler/cpu/layer_norm_cpu_kernel.h View File

@@ -16,6 +16,7 @@

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LAYER_NORM_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LAYER_NORM_CPU_KERNEL_H_

#include <memory>
#include <unordered_map>
#include <vector>
@@ -34,11 +35,10 @@ class LayerNormCPUKernel : public CPUKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;

private:
template <typename T>
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);

private:
void CheckParam(const CNodePtr &kernel_node);
TypeId dtype_{kTypeUnknown};
float eps_{1e-12};
size_t block_num_{1};


+ 18
- 21
mindspore/ccsrc/backend/kernel_compiler/cpu/layer_norm_grad_cpu_kernel.cc View File

@@ -21,8 +21,14 @@

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kLayerNormGradInputsNum = 5;
constexpr size_t kLayerNormGradOutputsNum = 3;
} // namespace

void LayerNormGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
CheckParam(kernel_node);
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
std::vector<size_t> x_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto begin_norm_axis = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "begin_norm_axis");
@@ -53,6 +59,8 @@ void LayerNormGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
bool LayerNormGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kLayerNormGradInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kLayerNormGradOutputsNum, kernel_name_);
if (dtype_ == kNumberTypeFloat16) {
LaunchKernel<float16>(inputs, outputs);
} else if (dtype_ == kNumberTypeFloat32 || dtype_ == kNumberTypeFloat64) {
@@ -66,14 +74,14 @@ bool LayerNormGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &input
template <typename T>
void LayerNormGradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &outputs) {
auto x = reinterpret_cast<T *>(inputs[0]->addr);
auto dy = reinterpret_cast<T *>(inputs[1]->addr);
auto var = reinterpret_cast<T *>(inputs[2]->addr);
auto mean = reinterpret_cast<T *>(inputs[3]->addr);
auto gamma = reinterpret_cast<T *>(inputs[4]->addr);
auto dx = reinterpret_cast<T *>(outputs[0]->addr);
auto dg = reinterpret_cast<T *>(outputs[1]->addr);
auto db = reinterpret_cast<T *>(outputs[2]->addr);
auto *x = reinterpret_cast<T *>(inputs[0]->addr);
auto *dy = reinterpret_cast<T *>(inputs[1]->addr);
auto *var = reinterpret_cast<T *>(inputs[2]->addr);
auto *mean = reinterpret_cast<T *>(inputs[3]->addr);
auto *gamma = reinterpret_cast<T *>(inputs[4]->addr);
auto *dx = reinterpret_cast<T *>(outputs[0]->addr);
auto *dg = reinterpret_cast<T *>(outputs[1]->addr);
auto *db = reinterpret_cast<T *>(outputs[2]->addr);
size_t thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
auto thread_num1 = param_num_ < thread_num ? param_num_ : thread_num;
std::vector<common::Task> tasks1;
@@ -121,7 +129,7 @@ void LayerNormGradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
auto norm_shift = static_cast<int>(j / block_size_);
auto var_sqrt = (T)std::pow(static_cast<double>(var[norm_shift]) + eps_, -0.5);
auto dx1 = dy[j] * gamma[param_shift] * var_sqrt;
auto dx2 = sum1 * (T)2.0 / block_size_ * (x[j] - mean[norm_shift]);
auto dx2 = sum1 * (T)2.0 / (T)(block_size_) * (x[j] - mean[norm_shift]);
auto dx3 = ((T)(-1.0) * var_sqrt * sum2 + ((T)1.0 / (T)block_size_) * sum1 * sum3) * ((T)1.0 / (T)block_size_);
dx[j] = dx1 + dx2 + dx3;
}
@@ -144,16 +152,5 @@ void LayerNormGradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
}
(void)common::ThreadPool::GetInstance().SyncRun(tasks2);
}

void LayerNormGradCPUKernel::CheckParam(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 5) {
MS_LOG(EXCEPTION) << "LayerNormGradCPUKernel needs 5 inputs, but gets " << input_num;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 3) {
MS_LOG(EXCEPTION) << "LayerNormGradCPUKernel expects 3 output, but gets" << output_num;
}
}
} // namespace kernel
} // namespace mindspore

+ 2
- 2
mindspore/ccsrc/backend/kernel_compiler/cpu/layer_norm_grad_cpu_kernel.h View File

@@ -16,6 +16,7 @@

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LAYER_NORM_GRAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LAYER_NORM_GRAD_CPU_KERNEL_H_

#include <memory>
#include <unordered_map>
#include <vector>
@@ -34,11 +35,10 @@ class LayerNormGradCPUKernel : public CPUKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;

private:
template <typename T>
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);

private:
void CheckParam(const CNodePtr &kernel_node);
TypeId dtype_{kTypeUnknown};
float eps_{1e-12};
size_t block_num_{1};


+ 25
- 18
mindspore/ccsrc/backend/kernel_compiler/cpu/map_cache_idx_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -23,12 +23,17 @@

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kMapCacheIdxInputsNum = 5;
constexpr size_t kMapCacheIdxOutputsNum = 4;
} // namespace

template <typename T>
int Compress(HashmapEntry<T> *entry_p, const size_t &length, T entry) {
T i = (entry + 1) % length;
int64_t off = 1;
T i = (entry + 1) % static_cast<T>(length);
T off = 1;
int compress_count = 0;
for (; !entry_p[i].IsEmpty(); i = (i + 1) % length, off++) {
for (; !entry_p[i].IsEmpty(); i = (i + 1) % static_cast<T>(length), off++) {
if (entry_p[i].tag_ > off) {
entry_p[entry].key_ = entry_p[i].key_;
entry_p[entry].value_ = entry_p[i].value_;
@@ -43,28 +48,29 @@ int Compress(HashmapEntry<T> *entry_p, const size_t &length, T entry) {
return compress_count;
}

void UpdateShape(size_t miss_count, const CNodePtr &node_) {
void UpdateShape(size_t miss_count, const CNodePtr &node) {
std::vector<size_t> out_shape;
(void)out_shape.emplace_back(miss_count);
size_t output_num = AnfAlgo::GetOutputTensorNum(node_);
size_t output_num = AnfAlgo::GetOutputTensorNum(node);
std::vector<TypeId> dtypes(output_num);
for (size_t i = 0; i < output_num; i++) {
dtypes[i] = AnfAlgo::GetOutputDeviceDataType(node_, i);
dtypes[i] = AnfAlgo::GetOutputDeviceDataType(node, i);
}
AnfAlgo::SetOutputInferTypeAndShape(dtypes, {AnfAlgo::GetOutputInferShape(node_, 0), out_shape, out_shape, out_shape},
node_.get());
AnfAlgo::SetOutputInferTypeAndShape(dtypes, {AnfAlgo::GetOutputInferShape(node, 0), out_shape, out_shape, out_shape},
node.get());
}

void MapCacheIdxCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
node_wpt_ = kernel_node;
auto hashmap_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
if (hashmap_shape.size() != 2) {
MS_LOG(EXCEPTION) << "Dimension of HashMap must be 2, (n, 4)";
}
hashmap_length_ = hashmap_shape[0];
if (hashmap_length_ <= 0) {
MS_LOG(INFO) << "Value of hashmap_length_ must > 0!";
if (hashmap_length_ == 0) {
MS_LOG(EXCEPTION) << "Value of hashmap_length_ must > 0!";
}
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
}
@@ -72,13 +78,14 @@ void MapCacheIdxCPUKernel::InitKernel(const CNodePtr &kernel_node) {
bool MapCacheIdxCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kMapCacheIdxInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kMapCacheIdxOutputsNum, kernel_name_);
if (dtype_ == kNumberTypeInt32) {
LaunchKernel<int>(inputs, outputs);
} else if (dtype_ == kNumberTypeInt64) {
LaunchKernel<int64_t>(inputs, outputs);
} else {
MS_LOG(ERROR) << "Only support int32, int64";
return false;
MS_LOG(EXCEPTION) << "Only support int32, int64";
}
return true;
}
@@ -86,8 +93,8 @@ bool MapCacheIdxCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
template <typename T>
void MapCacheIdxCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &outputs) {
auto node_ = node_wpt_.lock();
auto emb_idx_shape = AnfAlgo::GetPrevNodeOutputInferShape(node_, 1);
auto node = node_wpt_.lock();
auto emb_idx_shape = AnfAlgo::GetPrevNodeOutputInferShape(node, 1);
batch_size_ = 1;
for (size_t i = 0; i < emb_idx_shape.size(); ++i) {
batch_size_ *= emb_idx_shape[i];
@@ -157,8 +164,8 @@ void MapCacheIdxCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
tag_count++;
}
hashmap[entry].key_ = emb_idx;
hashmap[entry].step_ = SizeToLong(step_[0]);
hashmap[entry].tag_ = SizeToLong(tag_count);
hashmap[entry].step_ = step_[0];
hashmap[entry].tag_ = static_cast<T>(tag_count);
T tmp_entry = (entry + 1) % static_cast<T>(hashmap_length_);
size_t delete_count = 1;
while (hashmap[tmp_entry].IsEmpty() || hashmap[tmp_entry].IsUsing(step_[0])) {
@@ -184,7 +191,7 @@ void MapCacheIdxCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
for (size_t i = 0; i < miss_count; ++i) {
output_cache_idx[miss_idx[i]] = output_swap_cache_idx[i];
}
UpdateShape(miss_count, node_);
UpdateShape(miss_count, node);
}
} // namespace kernel
} // namespace mindspore

+ 2
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/map_cache_idx_cpu_kernel.h View File

@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MAP_CACHE_IDX_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MAP_CACHE_IDX_CPU_KERNEL_H_

@@ -35,10 +36,10 @@ class MapCacheIdxCPUKernel : public CPUKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;

private:
template <typename T>
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs);

private:
size_t batch_size_{1};
size_t hashmap_length_{1};
TypeId dtype_{kTypeUnknown};


+ 16
- 6
mindspore/ccsrc/backend/kernel_compiler/cpu/map_uniform_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -22,21 +22,28 @@

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kMapUniformInputsNum = 3;
constexpr size_t kMapUniformOutputsNum = 1;
} // namespace

void MapUniformCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
node_wpt_ = kernel_node;
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
}

bool MapUniformCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kMapUniformInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kMapUniformOutputsNum, kernel_name_);
if (dtype_ == kNumberTypeInt32) {
LaunchKernel<int>(inputs, outputs);
} else if (dtype_ == kNumberTypeInt64) {
LaunchKernel<int64_t>(inputs, outputs);
} else {
MS_LOG(ERROR) << "Only support int32, int64";
return false;
MS_LOG(EXCEPTION) << "Only support int32, int64";
}
return true;
}
@@ -44,11 +51,11 @@ bool MapUniformCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
template <typename T>
void MapUniformCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &outputs) {
auto node_ = node_wpt_.lock();
if (!node_) {
auto node = node_wpt_.lock();
if (!node) {
MS_LOG(EXCEPTION) << "node_wpt_ is expired.";
}
auto input_x_shape = AnfAlgo::GetPrevNodeOutputInferShape(node_, 0);
auto input_x_shape = AnfAlgo::GetPrevNodeOutputInferShape(node, 0);
batch_size_ = 1;
for (size_t i = 0; i < input_x_shape.size(); ++i) {
batch_size_ *= input_x_shape[i];
@@ -58,6 +65,9 @@ void MapUniformCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
auto per_group_size = *reinterpret_cast<T *>(inputs[1]->addr);
auto group_num = *reinterpret_cast<T *>(inputs[2]->addr);
auto output_x = reinterpret_cast<T *>(outputs[0]->addr);
if (group_num <= 0) {
MS_LOG(EXCEPTION) << "Group num should be greater than 0";
}
T max_num = group_num * per_group_size;
for (size_t i = 0; i < batch_size_; ++i) {
output_x[i] = input_x[i] % group_num * per_group_size + input_x[i] / group_num;


+ 3
- 2
mindspore/ccsrc/backend/kernel_compiler/cpu/map_uniform_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MAP_UNIFORM_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MAP_UNIFORM_CPU_KERNEL_H_

@@ -35,10 +36,10 @@ class MapUniformCPUKernel : public CPUKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;

private:
template <typename T>
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs);

private:
size_t batch_size_{1};
TypeId dtype_{kTypeUnknown};
CNodeWeakPtr node_wpt_;


+ 20
- 33
mindspore/ccsrc/backend/kernel_compiler/cpu/maximum_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -19,9 +19,15 @@

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kMaximumInputsNum = 2;
constexpr size_t kMaximumOutputsNum = 1;
} // namespace

template <typename T>
void MaximumCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
CheckParam(kernel_node);
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
input_x_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
input_y_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
@@ -42,18 +48,6 @@ void MaximumCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
}
}

template <typename T>
void MaximumCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 2) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but MaximumCPUKernel needs 2 input.";
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but MaximumCPUKernel needs 1 output.";
}
}

template <typename T>
void MaximumCPUKernel<T>::InitInputTensorAndScalar(size_t max_input_shape_size) {
if (max_input_shape_size != output_shape_.size()) {
@@ -77,6 +71,8 @@ void MaximumCPUKernel<T>::InitInputTensors(TypeId input_x_dtype, TypeId input_y_
template <typename T>
bool MaximumCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kMaximumInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kMaximumOutputsNum, kernel_name_);
T *input_x_ = reinterpret_cast<T *>(inputs[0]->addr);
T *input_y_ = reinterpret_cast<T *>(inputs[1]->addr);
T *output_ = reinterpret_cast<T *>(outputs[0]->addr);
@@ -85,7 +81,7 @@ bool MaximumCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
}

template <typename T>
void MaximumCPUKernel<T>::BroadcastArith(const T *input_x, const T *input_y, T *output) {
void MaximumCPUKernel<T>::BroadcastArith(const T *input_x, const T *input_y, T *output) const {
MS_EXCEPTION_IF_NULL(input_x);
MS_EXCEPTION_IF_NULL(input_y);
MS_EXCEPTION_IF_NULL(output);
@@ -108,7 +104,7 @@ void MaximumCPUKernel<T>::BroadcastArith(const T *input_x, const T *input_y, T *
}

template <typename T>
bool MaximumCPUKernel<T>::IsBroadcast() {
bool MaximumCPUKernel<T>::IsBroadcast() const {
if (input_x_shape_.size() != input_y_shape_.size()) {
return true;
}
@@ -122,12 +118,12 @@ bool MaximumCPUKernel<T>::IsBroadcast() {

template <typename T>
void MaximumCPUKernel<T>::InitTensorBroadcastShape() {
if (output_shape_.size() > max_dims) {
if (output_shape_.size() > max_dims_) {
MS_LOG(EXCEPTION) << "Broadcast operation not support dim greater than 7";
}
broadcast_input_x_shape_.resize(max_dims, 1);
broadcast_input_y_shape_.resize(max_dims, 1);
broadcast_output_shape_.resize(max_dims, 1);
broadcast_input_x_shape_.resize(max_dims_, 1);
broadcast_input_y_shape_.resize(max_dims_, 1);
broadcast_output_shape_.resize(max_dims_, 1);
for (size_t i = 0; i < output_shape_.size(); i++) {
broadcast_output_shape_[i] = output_shape_[i];
}
@@ -147,7 +143,7 @@ void MaximumCPUKernel<T>::InitTensorBroadcastShape() {

// Broadcast comparison
template <typename T>
size_t MaximumCPUKernel<T>::Index(const size_t &index, const size_t &dim) {
size_t MaximumCPUKernel<T>::Index(const size_t &index, const size_t &dim) const {
return dim == 1 ? 0 : index;
}

@@ -158,10 +154,7 @@ void MaximumCPUKernel<T>::BroadcastArithKernel(const size_t l0, const size_t l1,
const size_t r1, const size_t r2, const size_t r3, const size_t r4,
const size_t r5, const size_t r6, const size_t d0, const size_t d1,
const size_t d2, const size_t d3, const size_t d4, const size_t d5,
const size_t d6, const T *input_x, const T *input_y, T *output) {
MS_EXCEPTION_IF_NULL(input_x);
MS_EXCEPTION_IF_NULL(input_y);
MS_EXCEPTION_IF_NULL(output);
const size_t d6, const T *input_x, const T *input_y, T *output) const {
for (size_t pos = 0; pos < output_num_; pos++) {
size_t i = pos / (d1 * d2 * d3 * d4 * d5 * d6) % d0;
size_t j = pos / (d2 * d3 * d4 * d5 * d6) % d1;
@@ -190,10 +183,7 @@ void MaximumCPUKernel<T>::BroadcastArithKernel(const size_t l0, const size_t l1,
}

template <typename T>
void MaximumCPUKernel<T>::BroadcastArithOneScalarOneTensor(const T *input_x, const T *input_y, T *output) {
MS_EXCEPTION_IF_NULL(input_x);
MS_EXCEPTION_IF_NULL(input_y);
MS_EXCEPTION_IF_NULL(output);
void MaximumCPUKernel<T>::BroadcastArithOneScalarOneTensor(const T *input_x, const T *input_y, T *output) const {
if (input_x_shape_.size() == 0) {
for (size_t i = 0; i < output_num_; ++i) {
output[i] = MaximumFunc(input_x[0], input_y[i]);
@@ -206,10 +196,7 @@ void MaximumCPUKernel<T>::BroadcastArithOneScalarOneTensor(const T *input_x, con
}

template <typename T>
void MaximumCPUKernel<T>::BroadcastArithTensors(const T *input_x, const T *input_y, T *output) {
MS_EXCEPTION_IF_NULL(input_x);
MS_EXCEPTION_IF_NULL(input_y);
MS_EXCEPTION_IF_NULL(output);
void MaximumCPUKernel<T>::BroadcastArithTensors(const T *input_x, const T *input_y, T *output) const {
for (size_t i = 0; i < output_num_; ++i) {
output[i] = MaximumFunc(input_x[i], input_y[i]);
}


+ 11
- 12
mindspore/ccsrc/backend/kernel_compiler/cpu/maximum_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MAXIMUM_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MAXIMUM_CPU_KERNEL_H_

@@ -34,11 +35,9 @@ class MaximumCPUKernel : public CPUKernel {
const std::vector<AddressPtr> &outputs) override;

private:
void CheckParam(const CNodePtr &kernel_node);

bool IsBroadcast();
bool IsBroadcast() const;

size_t Index(const size_t &index, const size_t &dim);
size_t Index(const size_t &index, const size_t &dim) const;

void InitTensorBroadcastShape();

@@ -51,15 +50,15 @@ class MaximumCPUKernel : public CPUKernel {
const size_t l5, const size_t l6, const size_t r0, const size_t r1, const size_t r2,
const size_t r3, const size_t r4, const size_t r5, const size_t r6, const size_t d0,
const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5,
const size_t d6, const T *input_x, const T *input_y, T *output);
const size_t d6, const T *input_x, const T *input_y, T *output) const;

T MaximumFunc(const T &lhs, const T &rhs) { return lhs > rhs ? lhs : rhs; }
T MaximumFunc(const T &lhs, const T &rhs) const { return lhs > rhs ? lhs : rhs; }

void BroadcastArithOneScalarOneTensor(const T *input_x, const T *input_y, T *output);
void BroadcastArithOneScalarOneTensor(const T *input_x, const T *input_y, T *output) const;

void BroadcastArithTensors(const T *input_x, const T *input_y, T *output);
void BroadcastArithTensors(const T *input_x, const T *input_y, T *output) const;

void BroadcastArith(const T *input_x, const T *input_y, T *output);
void BroadcastArith(const T *input_x, const T *input_y, T *output) const;

private:
bool need_broadcast_{false};
@@ -72,7 +71,7 @@ class MaximumCPUKernel : public CPUKernel {
std::vector<size_t> broadcast_input_x_shape_;
std::vector<size_t> broadcast_input_y_shape_;
std::vector<size_t> broadcast_output_shape_;
const size_t max_dims{7};
const size_t max_dims_{7};
};

MS_REG_CPU_KERNEL_T(Maximum, KernelAttr(), MaximumCPUKernel, int32_t);
@@ -84,4 +83,4 @@ MS_REG_CPU_KERNEL_T(Maximum, KernelAttr(), MaximumCPUKernel, double);
} // namespace kernel
} // namespace mindspore

#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_UPDATE_CACHE_CPU_KERNEL_H_
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MAXIMUM_CPU_KERNEL_H_

+ 10
- 13
mindspore/ccsrc/backend/kernel_compiler/cpu/maximum_grad_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -21,6 +21,9 @@
namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kMaximumGradInputsNum = 3;
constexpr size_t kMaximumGradOutputsNum = 2;

void CheckShape(std::vector<size_t> *shape) {
MS_EXCEPTION_IF_NULL(shape);
if (shape->empty()) {
@@ -30,7 +33,8 @@ void CheckShape(std::vector<size_t> *shape) {
} // namespace

void MaximumGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
CheckParam(kernel_node);
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
x_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
y_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
dout_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
@@ -45,6 +49,8 @@ void MaximumGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
bool MaximumGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kMaximumGradInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kMaximumGradOutputsNum, kernel_name_);
if (dtype_ == kNumberTypeInt32) {
LaunchKernel<int>(inputs, outputs);
} else if (dtype_ == kNumberTypeUInt32) {
@@ -57,6 +63,8 @@ bool MaximumGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
LaunchKernel<uint64_t>(inputs, outputs);
} else if (dtype_ == kNumberTypeFloat64) {
LaunchKernel<double>(inputs, outputs);
} else if (dtype_ == kNumberTypeFloat16) {
LaunchKernel<float16>(inputs, outputs);
}
return true;
}
@@ -145,16 +153,5 @@ void MaximumGradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, c
MaximumGradRecTask<T>(x_addr, y_addr, dout_addr, dx_addr, dy_addr, 0, 0, 0, 0, x_cargo, y_cargo, dout_cargo, x_shape,
y_shape, dout_shape);
}

void MaximumGradCPUKernel::CheckParam(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 3) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but MaximumGradCPUKernel needs 3 input.";
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 2) {
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but MaximumGradCPUKernel needs 2 output.";
}
}
} // namespace kernel
} // namespace mindspore

+ 7
- 6
mindspore/ccsrc/backend/kernel_compiler/cpu/maximum_grad_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,8 +14,9 @@
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MAXIMUMGRAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MAXIMUMGRAD_CPU_KERNEL_H_
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MAXIMUM_GRAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MAXIMUM_GRAD_CPU_KERNEL_H_

#include <memory>
#include <unordered_map>
#include <vector>
@@ -34,11 +35,10 @@ class MaximumGradCPUKernel : public CPUKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;

private:
template <typename T>
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);

private:
void CheckParam(const CNodePtr &kernel_node);
std::vector<size_t> x_shape_;
std::vector<size_t> y_shape_;
std::vector<size_t> dout_shape;
@@ -50,4 +50,5 @@ class MaximumGradCPUKernel : public CPUKernel {
MS_REG_CPU_KERNEL(MaximumGrad, KernelAttr(), MaximumGradCPUKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MaximumGrad_CPU_KERNEL_H_

#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MAXIMUM_GRAD_CPU_KERNEL_H_

+ 20
- 33
mindspore/ccsrc/backend/kernel_compiler/cpu/minimum_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -19,9 +19,15 @@

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kMinimumInputsNum = 2;
constexpr size_t kMinimumOutputsNum = 1;
} // namespace

template <typename T>
void MinimumCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
CheckParam(kernel_node);
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
input_x_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
input_y_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
@@ -42,18 +48,6 @@ void MinimumCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
}
}

template <typename T>
void MinimumCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 2) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but MinimumCPUKernel needs 2 input.";
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but MinimumCPUKernel needs 1 output.";
}
}

template <typename T>
void MinimumCPUKernel<T>::InitInputTensorAndScalar(size_t max_input_shape_size) {
if (max_input_shape_size != output_shape_.size()) {
@@ -77,6 +71,8 @@ void MinimumCPUKernel<T>::InitInputTensors(TypeId input_x_dtype, TypeId input_y_
template <typename T>
bool MinimumCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kMinimumInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kMinimumOutputsNum, kernel_name_);
T *input_x_ = reinterpret_cast<T *>(inputs[0]->addr);
T *input_y_ = reinterpret_cast<T *>(inputs[1]->addr);
T *output_ = reinterpret_cast<T *>(outputs[0]->addr);
@@ -85,7 +81,7 @@ bool MinimumCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
}

template <typename T>
void MinimumCPUKernel<T>::BroadcastArith(const T *input_x, const T *input_y, T *output) {
void MinimumCPUKernel<T>::BroadcastArith(const T *input_x, const T *input_y, T *output) const {
MS_EXCEPTION_IF_NULL(input_x);
MS_EXCEPTION_IF_NULL(input_y);
MS_EXCEPTION_IF_NULL(output);
@@ -108,7 +104,7 @@ void MinimumCPUKernel<T>::BroadcastArith(const T *input_x, const T *input_y, T *
}

template <typename T>
bool MinimumCPUKernel<T>::IsBroadcast() {
bool MinimumCPUKernel<T>::IsBroadcast() const {
if (input_x_shape_.size() != input_y_shape_.size()) {
return true;
}
@@ -122,12 +118,12 @@ bool MinimumCPUKernel<T>::IsBroadcast() {

template <typename T>
void MinimumCPUKernel<T>::InitTensorBroadcastShape() {
if (output_shape_.size() > max_dims) {
if (output_shape_.size() > max_dims_) {
MS_LOG(EXCEPTION) << "Broadcast operation not support dim greater than 7";
}
broadcast_input_x_shape_.resize(max_dims, 1);
broadcast_input_y_shape_.resize(max_dims, 1);
broadcast_output_shape_.resize(max_dims, 1);
broadcast_input_x_shape_.resize(max_dims_, 1);
broadcast_input_y_shape_.resize(max_dims_, 1);
broadcast_output_shape_.resize(max_dims_, 1);
for (size_t i = 0; i < output_shape_.size(); i++) {
broadcast_output_shape_[i] = output_shape_[i];
}
@@ -147,7 +143,7 @@ void MinimumCPUKernel<T>::InitTensorBroadcastShape() {

// Broadcast comparison
template <typename T>
size_t MinimumCPUKernel<T>::Index(const size_t &index, const size_t &dim) {
size_t MinimumCPUKernel<T>::Index(const size_t &index, const size_t &dim) const {
return dim == 1 ? 0 : index;
}

@@ -158,10 +154,7 @@ void MinimumCPUKernel<T>::BroadcastArithKernel(const size_t l0, const size_t l1,
const size_t r1, const size_t r2, const size_t r3, const size_t r4,
const size_t r5, const size_t r6, const size_t d0, const size_t d1,
const size_t d2, const size_t d3, const size_t d4, const size_t d5,
const size_t d6, const T *input_x, const T *input_y, T *output) {
MS_EXCEPTION_IF_NULL(input_x);
MS_EXCEPTION_IF_NULL(input_y);
MS_EXCEPTION_IF_NULL(output);
const size_t d6, const T *input_x, const T *input_y, T *output) const {
for (size_t pos = 0; pos < output_num_; pos++) {
size_t i = pos / (d1 * d2 * d3 * d4 * d5 * d6) % d0;
size_t j = pos / (d2 * d3 * d4 * d5 * d6) % d1;
@@ -190,10 +183,7 @@ void MinimumCPUKernel<T>::BroadcastArithKernel(const size_t l0, const size_t l1,
}

template <typename T>
void MinimumCPUKernel<T>::BroadcastArithOneScalarOneTensor(const T *input_x, const T *input_y, T *output) {
MS_EXCEPTION_IF_NULL(input_x);
MS_EXCEPTION_IF_NULL(input_y);
MS_EXCEPTION_IF_NULL(output);
void MinimumCPUKernel<T>::BroadcastArithOneScalarOneTensor(const T *input_x, const T *input_y, T *output) const {
if (input_x_shape_.size() == 0) {
for (size_t i = 0; i < output_num_; ++i) {
output[i] = MinimumFunc(input_x[0], input_y[i]);
@@ -206,10 +196,7 @@ void MinimumCPUKernel<T>::BroadcastArithOneScalarOneTensor(const T *input_x, con
}

template <typename T>
void MinimumCPUKernel<T>::BroadcastArithTensors(const T *input_x, const T *input_y, T *output) {
MS_EXCEPTION_IF_NULL(input_x);
MS_EXCEPTION_IF_NULL(input_y);
MS_EXCEPTION_IF_NULL(output);
void MinimumCPUKernel<T>::BroadcastArithTensors(const T *input_x, const T *input_y, T *output) const {
for (size_t i = 0; i < output_num_; ++i) {
output[i] = MinimumFunc(input_x[i], input_y[i]);
}


+ 11
- 12
mindspore/ccsrc/backend/kernel_compiler/cpu/minimum_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MINIMUM_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MINIMUM_CPU_KERNEL_H_

@@ -34,11 +35,9 @@ class MinimumCPUKernel : public CPUKernel {
const std::vector<AddressPtr> &outputs) override;

private:
void CheckParam(const CNodePtr &kernel_node);

bool IsBroadcast();
bool IsBroadcast() const;

size_t Index(const size_t &index, const size_t &dim);
size_t Index(const size_t &index, const size_t &dim) const;

void InitTensorBroadcastShape();

@@ -51,15 +50,15 @@ class MinimumCPUKernel : public CPUKernel {
const size_t l5, const size_t l6, const size_t r0, const size_t r1, const size_t r2,
const size_t r3, const size_t r4, const size_t r5, const size_t r6, const size_t d0,
const size_t d1, const size_t d2, const size_t d3, const size_t d4, const size_t d5,
const size_t d6, const T *input_x, const T *input_y, T *output);
const size_t d6, const T *input_x, const T *input_y, T *output) const;

T MinimumFunc(const T &lhs, const T &rhs) { return lhs < rhs ? lhs : rhs; }
T MinimumFunc(const T &lhs, const T &rhs) const { return lhs < rhs ? lhs : rhs; }

void BroadcastArithOneScalarOneTensor(const T *input_x, const T *input_y, T *output);
void BroadcastArithOneScalarOneTensor(const T *input_x, const T *input_y, T *output) const;

void BroadcastArithTensors(const T *input_x, const T *input_y, T *output);
void BroadcastArithTensors(const T *input_x, const T *input_y, T *output) const;

void BroadcastArith(const T *input_x, const T *input_y, T *output);
void BroadcastArith(const T *input_x, const T *input_y, T *output) const;

private:
bool need_broadcast_{false};
@@ -72,7 +71,7 @@ class MinimumCPUKernel : public CPUKernel {
std::vector<size_t> broadcast_input_x_shape_;
std::vector<size_t> broadcast_input_y_shape_;
std::vector<size_t> broadcast_output_shape_;
const size_t max_dims{7};
const size_t max_dims_{7};
};

MS_REG_CPU_KERNEL_T(Minimum, KernelAttr(), MinimumCPUKernel, int32_t);
@@ -84,4 +83,4 @@ MS_REG_CPU_KERNEL_T(Minimum, KernelAttr(), MinimumCPUKernel, double);
} // namespace kernel
} // namespace mindspore

#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_UPDATE_CACHE_CPU_KERNEL_H_
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MINIMUM_CPU_KERNEL_H_

+ 13
- 18
mindspore/ccsrc/backend/kernel_compiler/cpu/minimum_grad_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -21,6 +21,9 @@
namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kMinimumGradInputsNum = 3;
constexpr size_t kMinimumGradOutputsNum = 2;

void GetCargo(std::vector<size_t> *cargo, const std::vector<size_t> &shape, const std::vector<size_t> &dout_shape) {
int i = dout_shape.size() - 1;
int j = shape.size() - 1;
@@ -58,7 +61,8 @@ void CheckShape(std::vector<size_t> *shape) {
} // namespace

void MinimumGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
CheckParam(kernel_node);
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
x_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
y_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
dout_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
@@ -73,6 +77,8 @@ void MinimumGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
bool MinimumGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kMinimumGradInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kMinimumGradOutputsNum, kernel_name_);
if (dtype_ == kNumberTypeInt32) {
LaunchKernel<int>(inputs, outputs);
} else if (dtype_ == kNumberTypeUInt32) {
@@ -115,11 +121,11 @@ void MinimumGradRecTask(const T *x, const T *y, const T *dout, T *dx, T *dy, con

template <typename T>
void MinimumGradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
auto x_addr = reinterpret_cast<T *>(inputs[0]->addr);
auto y_addr = reinterpret_cast<T *>(inputs[1]->addr);
auto dout_addr = reinterpret_cast<T *>(inputs[2]->addr);
auto dx_addr = reinterpret_cast<T *>(outputs[0]->addr);
auto dy_addr = reinterpret_cast<T *>(outputs[1]->addr);
auto *x_addr = reinterpret_cast<T *>(inputs[0]->addr);
auto *y_addr = reinterpret_cast<T *>(inputs[1]->addr);
auto *dout_addr = reinterpret_cast<T *>(inputs[2]->addr);
auto *dx_addr = reinterpret_cast<T *>(outputs[0]->addr);
auto *dy_addr = reinterpret_cast<T *>(outputs[1]->addr);

size_t x_tensor_len = GetTensorLen(x_shape_);
size_t y_tensor_len = GetTensorLen(y_shape_);
@@ -146,16 +152,5 @@ void MinimumGradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, c
MinimumGradRecTask<T>(x_addr, y_addr, dout_addr, dx_addr, dy_addr, 0, 0, 0, 0, x_cargo, y_cargo, dout_cargo, x_shape,
y_shape, dout_shape);
}

void MinimumGradCPUKernel::CheckParam(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 3) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but MinimumGradCPUKernel needs 3 input.";
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 2) {
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but MinimumGradCPUKernel needs 2 output.";
}
}
} // namespace kernel
} // namespace mindspore

+ 4
- 4
mindspore/ccsrc/backend/kernel_compiler/cpu/minimum_grad_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MINIMUMGRAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MINIMUMGRAD_CPU_KERNEL_H_

#include <memory>
#include <unordered_map>
#include <vector>
@@ -34,11 +35,10 @@ class MinimumGradCPUKernel : public CPUKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;

private:
template <typename T>
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);

private:
void CheckParam(const CNodePtr &kernel_node);
std::vector<size_t> x_shape_;
std::vector<size_t> y_shape_;
std::vector<size_t> dout_shape;
@@ -50,4 +50,4 @@ class MinimumGradCPUKernel : public CPUKernel {
MS_REG_CPU_KERNEL(MinimumGrad, KernelAttr(), MinimumGradCPUKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MinimumGrad_CPU_KERNEL_H_
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MINIMUMGRAD_CPU_KERNEL_H_

+ 14
- 17
mindspore/ccsrc/backend/kernel_compiler/cpu/mirror_pad_cpu_kernel.cc View File

@@ -33,9 +33,13 @@ constexpr int TOP = 0;
constexpr int BOTTOM = 1;
constexpr int LEFT = 0;
constexpr int RIGHT = 1;
constexpr size_t kMirrorPadInputsNum = 2;
constexpr size_t kMirrorPadOutputsNum = 1;
} // namespace

void MirrorPadCPUKernel::InitKernel(const CNodePtr &kernel_node) {
CheckParam(kernel_node);
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
std::string mode = AnfAlgo::GetNodeAttr<std::string>(kernel_node, "mode");
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
if (mode == "REFLECT") {
@@ -50,12 +54,10 @@ void MirrorPadCPUKernel::InitKernel(const CNodePtr &kernel_node) {
shape_size_ = input_shape.size();
if (shape_size_ == 4) { // shape adjustment from 2d/3d to 4d
} else if (shape_size_ == 3) {
auto it = input_shape.begin();
input_shape.insert(it, 1); // batch padding
(void)input_shape.insert(input_shape.begin(), 1); // batch padding
shape_size_ = 4;
} else if (shape_size_ == 2) {
auto it = input_shape.begin();
input_shape.insert(it, 2, 1); // channel padding
(void)input_shape.insert(input_shape.begin(), 2, 1); // channel padding
shape_size_ = 4;
}

@@ -63,6 +65,7 @@ void MirrorPadCPUKernel::InitKernel(const CNodePtr &kernel_node) {
tensor_size_ *= input_shape[i];
input_shape_.push_back(SizeToLong(input_shape[i]));
}

std::vector<size_t> padding_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
num_paddings_ = SizeToLong(padding_shape[0]);

@@ -74,6 +77,7 @@ void MirrorPadCPUKernel::InitKernel(const CNodePtr &kernel_node) {

int64_t max_width = input_shape_[3];
int64_t max_height = input_shape_[2];

if (mode_ == 1) { // symmetric
max_width = max_width + (2 * max_width);
max_height = max_height + (2 * max_height);
@@ -97,6 +101,8 @@ void extract_paddings(const int64_t *paddings_arg, int64_t padd_dim, int64_t *ex

bool MirrorPadCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kMirrorPadInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kMirrorPadOutputsNum, kernel_name_);
if (dtype_ == kNumberTypeFloat16) {
LaunchKernel<float16>(inputs, outputs);
} else if (dtype_ == kNumberTypeFloat32) {
@@ -112,7 +118,8 @@ bool MirrorPadCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, c
}

template <typename T>
void MirrorPadCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
void MirrorPadCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &outputs) const {
auto inputs_addr = reinterpret_cast<T *>(inputs[0]->addr);
int64_t *paddings_arg = reinterpret_cast<int64_t *>(inputs[1]->addr);
auto outputs_addr = reinterpret_cast<T *>(outputs[0]->addr);
@@ -126,6 +133,7 @@ void MirrorPadCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, con
const int64_t padded_height = output_shape_[dim_offset];
const int64_t padded_width = output_shape_[dim_offset + 1];
const int64_t padd_dim = num_paddings_;

const int64_t mode = mode_;

int64_t paddings[MAX_PADDINGS * PADDING_SIZE]; // local and fixed size to keep in registers
@@ -190,16 +198,5 @@ void MirrorPadCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, con
outputs_addr[pos] = inputs_addr[pos_index];
}
}

void MirrorPadCPUKernel::CheckParam(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 2) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but MirrorPadCPUKernel needs 2 inputs.";
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but MirrorPadCPUKernel needs 1 output.";
}
}
} // namespace kernel
} // namespace mindspore

+ 8
- 8
mindspore/ccsrc/backend/kernel_compiler/cpu/mirror_pad_cpu_kernel.h View File

@@ -16,6 +16,7 @@

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MIRROR_PAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MIRROR_PAD_CPU_KERNEL_H_

#include <memory>
#include <unordered_map>
#include <vector>
@@ -35,19 +36,18 @@ class MirrorPadCPUKernel : public CPUKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;

private:
template <typename T>
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const;

private:
void CheckParam(const CNodePtr &kernel_node);
TypeId dtype_{kTypeUnknown};
size_t tensor_size_ = 1;
size_t shape_size_;
size_t output_size_ = 1;
size_t tensor_size_{1};
size_t shape_size_{0};
size_t output_size_{1};
int64_t mode_{0};
int64_t num_paddings_{0};
std::vector<int64_t> input_shape_;
std::vector<int64_t> output_shape_;
int64_t mode_;
int64_t num_paddings_;
};

MS_REG_CPU_KERNEL(


+ 35
- 45
mindspore/ccsrc/backend/kernel_compiler/cpu/mirror_pad_grad_cpu_kernel.cc View File

@@ -33,8 +33,28 @@ constexpr int TOP = 0;
constexpr int BOTTOM = 1;
constexpr int LEFT = 0;
constexpr int RIGHT = 1;
constexpr size_t kMirrorPadGradInputsNum = 2;
constexpr size_t kMirrorPadGradOutputsNum = 1;

void extract_paddings(const int64_t *paddings_arg, int64_t padd_dim, int64_t *extracted_paddings) {
const int64_t paddings_offset = MAX_PADDINGS - padd_dim;
for (int64_t i = 0; i < padd_dim; i++) {
extracted_paddings[(paddings_offset + i) * PADDING_SIZE] = paddings_arg[i * PADDING_SIZE];
extracted_paddings[(paddings_offset + i) * PADDING_SIZE + 1] = paddings_arg[i * PADDING_SIZE + 1];
}
}

bool range_check(int64_t x, int64_t y, int64_t padded_width, int64_t padded_height) {
if (((x >= 0) && (x <= padded_width - 1)) && ((y >= 0) && (y <= padded_height - 1))) {
return true;
}
return false;
}
} // namespace

void MirrorPadGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
std::string mode = AnfAlgo::GetNodeAttr<std::string>(kernel_node, "mode");
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
if (mode == "REFLECT") {
@@ -49,12 +69,10 @@ void MirrorPadGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
shape_size_ = input_shape.size();
if (shape_size_ == 4) { // shape adjustment from 2d/3d to 4d
} else if (shape_size_ == 3) {
auto it = input_shape.begin();
input_shape.insert(it, 1); // batch padding
(void)input_shape.insert(input_shape.begin(), 1); // batch padding
shape_size_ = 4;
} else if (shape_size_ == 2) {
auto it = input_shape.begin();
input_shape.insert(it, 2, 1); // channel padding
(void)input_shape.insert(input_shape.begin(), 2, 1); // channel padding
shape_size_ = 4;
}

@@ -70,11 +88,9 @@ void MirrorPadGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {

if (output_shape.size() == 4) {
} else if (output_shape.size() == 3) {
auto it = output_shape.begin();
output_shape.insert(it, 1); // batch padding
(void)output_shape.insert(output_shape.begin(), 1); // batch padding
} else if (output_shape.size() == 2) {
auto it = output_shape.begin();
output_shape.insert(it, 2, 1); // channel padding
(void)output_shape.insert(output_shape.begin(), 2, 1); // channel padding
}
for (auto x : output_shape) {
output_size_ *= x;
@@ -103,24 +119,11 @@ void MirrorPadGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
}
}

void extract_paddings_(const int64_t *paddings_arg, int64_t padd_dim, int64_t *extracted_paddings) {
const int64_t paddings_offset = MAX_PADDINGS - padd_dim;
for (int64_t i = 0; i < padd_dim; i++) {
extracted_paddings[(paddings_offset + i) * PADDING_SIZE] = paddings_arg[i * PADDING_SIZE];
extracted_paddings[(paddings_offset + i) * PADDING_SIZE + 1] = paddings_arg[i * PADDING_SIZE + 1];
}
}

bool range_check(int64_t x, int64_t y, int64_t padded_width, int64_t padded_height) {
if (((x >= 0) && (x <= padded_width - 1)) && ((y >= 0) && (y <= padded_height - 1))) {
return true;
}
return false;
}

bool MirrorPadGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &workspace,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kMirrorPadGradInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kMirrorPadGradOutputsNum, kernel_name_);
if (dtype_ == kNumberTypeFloat16) {
LaunchKernel<float16>(inputs, workspace, outputs);
} else if (dtype_ == kNumberTypeFloat32) {
@@ -158,12 +161,12 @@ template <typename T>
void MirrorPadGradCPUKernel::MirrorPadGrad_Width_Height(const size_t size, const T *interim_dy, const int64_t dx_height,
const int64_t dx_width, const int64_t dy_height,
const int64_t dy_width, const int64_t padd_dim,
const int64_t *paddings_arg, int64_t mode, T *dx) {
const int64_t *paddings_arg, int64_t mode, T *dx) const {
int64_t paddings[MAX_PADDINGS * PADDING_SIZE]; // local and fixed size to keep in registers
for (int i = 0; i < MAX_PADDINGS * PADDING_SIZE; i++) {
paddings[i] = 0; // init all to 0
}
extract_paddings_(paddings_arg, padd_dim, paddings);
extract_paddings(paddings_arg, padd_dim, paddings);
// Create required anchor points for non-mirrored data inside new tensor
int64_t ap1_x = paddings[WIDTH];
int64_t ap2_x = paddings[WIDTH] + dx_width - 1;
@@ -216,7 +219,6 @@ void MirrorPadGradCPUKernel::MirrorPadGrad_Width_Height(const size_t size, const
}
}
}
return;
}

template <typename T>
@@ -224,12 +226,12 @@ void MirrorPadGradCPUKernel::MirrorPadGradBatchChannel(const size_t size, T *dy,
const int64_t dx_batches, const int64_t dx_channels,
const int64_t dy_height, const int64_t dy_width,
const int64_t padd_dim, const int64_t *paddings_arg,
int64_t mode) {
int64_t mode) const {
int64_t paddings[MAX_PADDINGS * PADDING_SIZE]; // local and fixed size to keep in registers
for (int i = 0; i < MAX_PADDINGS * PADDING_SIZE; i++) {
paddings[i] = 0; // init all to 0
}
extract_paddings_(paddings_arg, padd_dim, paddings);
extract_paddings(paddings_arg, padd_dim, paddings);
// Create anchor points for non mirrored data inside new tensor
int64_t ap1_channel = paddings[CHANNEL];
int64_t ap2_channel = paddings[CHANNEL] + dx_channels - 1;
@@ -273,17 +275,16 @@ void MirrorPadGradCPUKernel::MirrorPadGradBatchChannel(const size_t size, T *dy,
}
}
}
return;
}

template <typename T>
void MirrorPadGradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) {
auto inputs_addr = reinterpret_cast<T *>(inputs[0]->addr);
int64_t *paddings = reinterpret_cast<int64_t *>(inputs[1]->addr);
auto interim = reinterpret_cast<T *>(workspace[0]->addr);
auto outputs_addr = reinterpret_cast<T *>(outputs[0]->addr);
const std::vector<AddressPtr> &outputs) const {
auto *inputs_addr = reinterpret_cast<T *>(inputs[0]->addr);
auto *paddings = reinterpret_cast<int64_t *>(inputs[1]->addr);
auto *interim = reinterpret_cast<T *>(workspace[0]->addr);
auto *outputs_addr = reinterpret_cast<T *>(outputs[0]->addr);

MirrorPadGradBatchChannel(workspace_size_, inputs_addr, interim, output_shape_[0], output_shape_[1], input_shape_[2],
input_shape_[3], num_paddings_, paddings, mode_);
@@ -291,16 +292,5 @@ void MirrorPadGradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
MirrorPadGrad_Width_Height(output_size_, interim, output_shape_[2], output_shape_[3], input_shape_[2],
input_shape_[3], num_paddings_, paddings, mode_, outputs_addr);
}

void MirrorPadGradCPUKernel::CheckParam(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 2) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but MirrorPadGradCPUKernel needs 2 inputs.";
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but MirrorPadGradCPUKernel needs 1 output.";
}
}
} // namespace kernel
} // namespace mindspore

+ 11
- 12
mindspore/ccsrc/backend/kernel_compiler/cpu/mirror_pad_grad_cpu_kernel.h View File

@@ -16,13 +16,13 @@

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MIRROR_PAD_GRAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MIRROR_PAD_GRAD_CPU_KERNEL_H_

#include <memory>
#include <unordered_map>
#include <vector>
#include <string>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

namespace mindspore {
namespace kernel {
class MirrorPadGradCPUKernel : public CPUKernel {
@@ -36,34 +36,33 @@ class MirrorPadGradCPUKernel : public CPUKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;

private:
template <typename T>
void InitWorkspaceSize();

template <typename T>
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs);
const std::vector<AddressPtr> &outputs) const;

template <typename T>
void MirrorPadGrad_Width_Height(const size_t size, const T *interim_dy, const int64_t dx_height,
const int64_t dx_width, const int64_t dy_height, const int64_t dy_width,
const int64_t padd_dim, const int64_t *paddings_arg, int64_t mode, T *dx);
const int64_t padd_dim, const int64_t *paddings_arg, int64_t mode, T *dx) const;

template <typename T>
void MirrorPadGradBatchChannel(const size_t size, T *dy, T *interim_dy, const int64_t dx_batches,
const int64_t dx_channels, const int64_t dy_height, const int64_t dy_width,
const int64_t padd_dim, const int64_t *paddings_arg, int64_t mode);
const int64_t padd_dim, const int64_t *paddings_arg, int64_t mode) const;

private:
void CheckParam(const CNodePtr &kernel_node);
TypeId dtype_{kTypeUnknown};
size_t tensor_size_ = 1;
size_t shape_size_;
size_t output_size_ = 1;
size_t workspace_size_ = 1;
size_t tensor_size_{1};
size_t shape_size_{1};
size_t output_size_{1};
size_t workspace_size_{1};
int mode_{0};
int64_t num_paddings_{0};
std::vector<int64_t> input_shape_;
std::vector<int64_t> output_shape_;
int64_t mode_;
int64_t num_paddings_;
};

MS_REG_CPU_KERNEL(


+ 2
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/pooling_avg_grad_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/mkldnn/pooling_avg_grad_cpu_kernel.h"
#include <string>
#include <utility>


+ 7
- 7
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/pooling_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -25,7 +25,8 @@ namespace kernel {
constexpr size_t kPoolingMinDim = 4;
constexpr size_t kPoolingMaxDim = 5;
constexpr size_t kPoolingOffsetDim = 2;

constexpr size_t kPoolingInputsNum = 1;
constexpr size_t kPoolingOutputsNum = 1;
void PoolingCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
CPUKernel::InitInputOutputSize(kernel_node);
(void)workspace_size_list_.emplace_back(workspace_size_);
@@ -33,6 +34,7 @@ void PoolingCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {

void PoolingCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
std::vector<size_t> dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
dnnl::memory::desc src_desc = GetDefaultMemDesc(src_shape);
@@ -78,8 +80,7 @@ void PoolingCPUKernel::InitKernel(const CNodePtr &kernel_node) {
dnnl::pooling_forward::desc desc =
dnnl::pooling_forward::desc(dnnl::prop_kind::forward_training, dnnl::algorithm::pooling_max, src_desc, dst_desc,
strides_dims, kernels_dims, padding_l, padding_r);
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
if (kernel_name == prim::kPrimAvgPool->name() || kernel_name == prim::kPrimAvgPool3D->name()) {
if (kernel_name_ == prim::kPrimAvgPool->name()) {
desc = dnnl::pooling_forward::desc(dnnl::prop_kind::forward_training, dnnl::algorithm::pooling_avg, src_desc,
dst_desc, strides_dims, kernels_dims, padding_l, padding_r);
}
@@ -94,9 +95,8 @@ void PoolingCPUKernel::InitKernel(const CNodePtr &kernel_node) {
bool PoolingCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &workspace,
const std::vector<kernel::AddressPtr> &outputs) {
if (inputs.empty() || outputs.empty()) {
MS_LOG(EXCEPTION) << "Error input output size!";
}
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kPoolingInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kPoolingOutputsNum, kernel_name_);
SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr);
SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr);
SetArgumentHandle(DNNL_ARG_WORKSPACE, workspace[0]->addr);


+ 2
- 3
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/pooling_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_POOLING_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_POOLING_CPU_KERNEL_H_

@@ -45,8 +46,6 @@ MS_REG_CPU_KERNEL(MaxPool3D, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOu
PoolingCPUKernel);
MS_REG_CPU_KERNEL(AvgPool, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
PoolingCPUKernel);
MS_REG_CPU_KERNEL(AvgPool3D, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
PoolingCPUKernel);
} // namespace kernel
} // namespace mindspore



+ 4
- 3
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/pooling_max_grad_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/mkldnn/pooling_max_grad_cpu_kernel.h"
#include <string>
#include <utility>
@@ -117,13 +118,13 @@ bool MaxPoolingGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inpu
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kMaxPoolingGradInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kMaxPoolingGradOutputsNum, kernel_name_);

auto input = reinterpret_cast<float *>(inputs[0]->addr);
auto diff = reinterpret_cast<float *>(inputs[2]->addr);
auto output = reinterpret_cast<float *>(outputs[0]->addr);
auto ret = memset_s(output, outputs[0]->size, 0, outputs[0]->size);
if (ret != 0) {
MS_LOG(EXCEPTION) << "Pooling grad memset error!";
MS_LOG(EXCEPTION) << "Pooling grad memset error, ret value:" << ret << ", output address: " << output
<< ", memset size: " << outputs[0]->size;
}
size_t src_wh = src_shape_[2] * src_shape_[3];
size_t dst_wh = dst_shape_[2] * dst_shape_[3];


+ 1
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/pooling_max_grad_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 10
- 4
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/softmax_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/mkldnn/softmax_cpu_kernel.h"
#include <algorithm>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
@@ -21,8 +22,14 @@

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kSoftmaxInputsNum = 1;
constexpr size_t kSoftmaxOutputsNum = 1;
} // namespace

void SoftmaxCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
std::vector<int> axis_list;
std::vector<int64_t> axis_list_me = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, AXIS);
@@ -48,9 +55,8 @@ void SoftmaxCPUKernel::InitKernel(const CNodePtr &kernel_node) {

bool SoftmaxCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
if (inputs.empty() || outputs.empty()) {
MS_LOG(EXCEPTION) << "Softmax error input output size!";
}
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kSoftmaxInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kSoftmaxOutputsNum, kernel_name_);
SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr);
SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr);
ExecutePrimitive();


+ 2
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/softmax_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CPU_KERNEL_H_



+ 19
- 9
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,8 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.h"
#include <numeric>
#include <limits>
#include <functional>
#include <cmath>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
@@ -23,6 +25,12 @@

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kSoftmaxCrossEntropyWithLogitsInputsNum = 2;
constexpr size_t kSoftmaxCrossEntropyWithLogitsOutputsNum = 2;
constexpr size_t kSoftmaxCrossEntropyWithLogitsWorkspaceSize = 1;
} // namespace

void SoftmaxCrossEntropyWithLogitsCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
CPUKernel::InitInputOutputSize(kernel_node);
MS_EXCEPTION_IF_NULL(kernel_node);
@@ -34,9 +42,10 @@ void SoftmaxCrossEntropyWithLogitsCPUKernel::InitInputOutputSize(const CNodePtr

void SoftmaxCrossEntropyWithLogitsCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
dnnl::memory::dims mem_dims;
mem_dims.insert(mem_dims.end(), shape.begin(), shape.end());
(void)mem_dims.insert(mem_dims.end(), shape.begin(), shape.end());
if (mem_dims.size() != 2) {
MS_LOG(EXCEPTION) << "SoftmaxCrossEntropyWithLogits kernel dims invalid " << mem_dims.size();
}
@@ -73,9 +82,10 @@ void SoftmaxCrossEntropyWithLogitsCPUKernel::ForwardPostExecute(const float *log
bool SoftmaxCrossEntropyWithLogitsCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &workspace,
const std::vector<kernel::AddressPtr> &outputs) {
if (inputs.empty() || workspace.empty() || outputs.empty()) {
MS_LOG(EXCEPTION) << "Error input output size!";
}
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kSoftmaxCrossEntropyWithLogitsInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kSoftmaxCrossEntropyWithLogitsOutputsNum, kernel_name_);
CHECK_KERNEL_WORKSPACE_SIZE(workspace.size(), kSoftmaxCrossEntropyWithLogitsWorkspaceSize, kernel_name_);

size_t batch_float_size = batch_size_ * sizeof(float);
size_t batch_class_float_size = class_num_ * batch_float_size;
if (inputs[0]->size != workspace[0]->size || inputs[0]->size != batch_class_float_size ||
@@ -88,10 +98,10 @@ bool SoftmaxCrossEntropyWithLogitsCPUKernel::Launch(const std::vector<kernel::Ad
SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr);
SetArgumentHandle(DNNL_ARG_DST, workspace[0]->addr);
ExecutePrimitive();
auto labels = reinterpret_cast<float *>(inputs[1]->addr);
auto logits = reinterpret_cast<float *>(workspace[0]->addr);
auto output1 = reinterpret_cast<float *>(outputs[0]->addr);
auto output2 = reinterpret_cast<float *>(outputs[1]->addr);
const auto *labels = reinterpret_cast<float *>(inputs[1]->addr);
const auto *logits = reinterpret_cast<float *>(workspace[0]->addr);
auto *output1 = reinterpret_cast<float *>(outputs[0]->addr);
auto *output2 = reinterpret_cast<float *>(outputs[1]->addr);
ForwardPostExecute(logits, labels, output1, output2);
return true;
}


+ 2
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_



+ 19
- 10
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/sparse_softmax_cross_entropy_with_logits_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,8 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/mkldnn/sparse_softmax_cross_entropy_with_logits_cpu_kernel.h"
#include <numeric>
#include <limits>
#include <functional>
#include <cmath>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
@@ -23,6 +25,12 @@

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kSparseSoftmaxCrossEntropyWithLogitsInputsNum = 2;
constexpr size_t kSparseSoftmaxCrossEntropyWithLogitsOutputsNum = 1;
constexpr size_t kSparseSoftmaxCrossEntropyWithLogitsWorkspaceSize = 1;
} // namespace

void SparseSoftmaxCrossEntropyWithLogitsCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
CPUKernel::InitInputOutputSize(kernel_node);
MS_EXCEPTION_IF_NULL(kernel_node);
@@ -34,13 +42,14 @@ void SparseSoftmaxCrossEntropyWithLogitsCPUKernel::InitInputOutputSize(const CNo

void SparseSoftmaxCrossEntropyWithLogitsCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
std::vector<size_t> label_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
if (label_shape.size() > 1) {
MS_LOG(EXCEPTION) << "Labels shape length should be equal to Logits shape length minus 1";
}
dnnl::memory::dims mem_dims;
mem_dims.insert(mem_dims.end(), shape.begin(), shape.end());
(void)mem_dims.insert(mem_dims.end(), shape.begin(), shape.end());
if (mem_dims.size() != 2) {
MS_LOG(EXCEPTION) << "SparseSoftmaxCrossEntropyWithLogits kernel dims invalid " << mem_dims.size();
}
@@ -66,7 +75,7 @@ void SparseSoftmaxCrossEntropyWithLogitsCPUKernel::ForwardPostExecute(const int
float epsilon = std::numeric_limits<float>::min();
for (size_t i = 0; i < batch_size_; ++i) {
if (labels[i] < 0) {
MS_LOG(EXCEPTION) << "Label value must >= 0!";
MS_LOG(EXCEPTION) << "Label value must >= 0";
}
size_t label = IntToSize(labels[i]);
if (label > class_num_) {
@@ -82,7 +91,7 @@ void SparseSoftmaxCrossEntropyWithLogitsCPUKernel::GradPostExecute(const int *la
size_t row_start = 0;
for (size_t i = 0; i < batch_size_; ++i) {
if (labels[i] < 0) {
MS_LOG(EXCEPTION) << "Label value must >= 0!";
MS_LOG(EXCEPTION) << "Label value must >= 0";
}
size_t label = IntToSize(labels[i]);
if (label > class_num_) {
@@ -103,9 +112,9 @@ void SparseSoftmaxCrossEntropyWithLogitsCPUKernel::GradPostExecute(const int *la
bool SparseSoftmaxCrossEntropyWithLogitsCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &workspace,
const std::vector<kernel::AddressPtr> &outputs) {
if (inputs.empty() || workspace.empty() || outputs.empty()) {
MS_LOG(EXCEPTION) << "Error input output size!";
}
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kSparseSoftmaxCrossEntropyWithLogitsInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kSparseSoftmaxCrossEntropyWithLogitsOutputsNum, kernel_name_);
CHECK_KERNEL_WORKSPACE_SIZE(workspace.size(), kSparseSoftmaxCrossEntropyWithLogitsWorkspaceSize, kernel_name_);
size_t batch_float_size = batch_size_ * sizeof(float);
size_t batch_class_float_size = class_num_ * batch_float_size;
if (inputs[0]->size != workspace[0]->size || inputs[0]->size != batch_class_float_size ||
@@ -120,9 +129,9 @@ bool SparseSoftmaxCrossEntropyWithLogitsCPUKernel::Launch(const std::vector<kern
SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr);
SetArgumentHandle(DNNL_ARG_DST, workspace[0]->addr);
ExecutePrimitive();
auto labels = reinterpret_cast<int *>(inputs[1]->addr);
auto losses = reinterpret_cast<float *>(workspace[0]->addr);
auto output = reinterpret_cast<float *>(outputs[0]->addr);
const auto *labels = reinterpret_cast<int *>(inputs[1]->addr);
const auto *losses = reinterpret_cast<float *>(workspace[0]->addr);
auto *output = reinterpret_cast<float *>(outputs[0]->addr);
if (is_grad_) {
GradPostExecute(labels, losses, output);
} else {


+ 3
- 4
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/sparse_softmax_cross_entropy_with_logits_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPARSE_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPARSE_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_

@@ -32,10 +33,8 @@ class SparseSoftmaxCrossEntropyWithLogitsCPUKernel : public MKLCPUKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;

protected:
void InitInputOutputSize(const CNodePtr &kernel_node) override;

private:
void InitInputOutputSize(const CNodePtr &kernel_node) override;
void ForwardPostExecute(const int *labels, const float *losses, float *output) const;
void GradPostExecute(const int *labels, const float *losses, float *output) const;
bool is_grad_{false};


+ 13
- 6
mindspore/ccsrc/backend/kernel_compiler/cpu/one_hot_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,13 +13,20 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/one_hot_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kOneHotInputsNum = 3;
constexpr size_t kOneHotOutputsNum = 1;
} // namespace

void OneHotCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
if (output_shape.size() < 2) {
MS_LOG(EXCEPTION) << "Invalid output shape size: " << output_shape.size();
@@ -28,6 +35,7 @@ void OneHotCPUKernel::InitKernel(const CNodePtr &kernel_node) {
if (axis != -1 && LongToSize(axis) >= output_shape.size()) {
MS_LOG(EXCEPTION) << "Invalid axis: " << axis;
}

if (axis == -1) {
axis_ = output_shape.size() - 1;
} else {
@@ -42,13 +50,12 @@ void OneHotCPUKernel::InitKernel(const CNodePtr &kernel_node) {

bool OneHotCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
if (inputs.size() < 3 || outputs.empty()) {
MS_LOG(EXCEPTION) << "Input or output invalid!";
}
auto indices = reinterpret_cast<int *>(inputs[0]->addr);
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kOneHotInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kOneHotOutputsNum, kernel_name_);
const auto *indices = reinterpret_cast<int *>(inputs[0]->addr);
auto on_value = reinterpret_cast<float *>(inputs[1]->addr)[0];
auto off_value = reinterpret_cast<float *>(inputs[2]->addr)[0];
auto output = reinterpret_cast<float *>(outputs[0]->addr);
auto *output = reinterpret_cast<float *>(outputs[0]->addr);
size_t elem_num = inputs[0]->size / sizeof(int);

auto task = [this, &indices, &on_value, &off_value, &output](size_t start, size_t end) {


+ 6
- 4
mindspore/ccsrc/backend/kernel_compiler/cpu/one_hot_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,8 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ONE_HOT_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ONE_HOT_CPU_KERNEL_H_

#include <vector>
#include <memory>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
@@ -33,9 +35,9 @@ class OneHotCPUKernel : public CPUKernel {
const std::vector<AddressPtr> &outputs) override;

private:
size_t depth_;
size_t stride_;
size_t axis_;
size_t depth_{0};
size_t stride_{0};
size_t axis_{0};
};

MS_REG_CPU_KERNEL(OneHot, KernelAttr(), OneHotCPUKernel);


+ 10
- 22
mindspore/ccsrc/backend/kernel_compiler/cpu/pack_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -20,17 +20,16 @@

namespace mindspore {
namespace kernel {
template <typename T>
PackCpuFwdKernel<T>::PackCpuFwdKernel()
: axis_(0), input_num_(1), output_size_(0), dims_behind_axis_(1), inputs_host_(nullptr) {}
namespace {
constexpr size_t kPackOutputsNum = 1;
} // namespace

template <typename T>
void PackCpuFwdKernel<T>::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);

axis_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
input_num_ = AnfAlgo::GetInputTensorNum(kernel_node);
axis_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS);
if (axis_ < 0) {
auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
axis_ += (SizeToInt(input_shape.size()) + 1);
@@ -52,11 +51,9 @@ void PackCpuFwdKernel<T>::InitKernel(const CNodePtr &kernel_node) {
template <typename T>
bool PackCpuFwdKernel<T>::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) {
if (!CheckParam(outputs)) {
return false;
}
auto output = reinterpret_cast<T *>(outputs[0]->addr);

CHECK_KERNEL_INPUTS_NUM(inputs.size(), input_num_, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kPackOutputsNum, kernel_name_);
auto *output = reinterpret_cast<T *>(outputs[0]->addr);
inputs_host_ = std::make_unique<T *[]>(input_num_);
for (size_t i = 0; i < inputs.size(); i++) {
inputs_host_[i] = reinterpret_cast<T *>(inputs[i]->addr);
@@ -90,16 +87,7 @@ bool PackCpuFwdKernel<T>::Launch(const std::vector<AddressPtr> &inputs, const st
}

template <typename T>
bool PackCpuFwdKernel<T>::CheckParam(const std::vector<AddressPtr> &outputs) const {
if (outputs.size() != 1) {
MS_LOG(EXCEPTION) << "Output number is " << outputs.size() << ", but PackGpuFwdKernel needs 1 output.";
return false;
}
return true;
}

template <typename T>
void PackCpuFwdKernel<T>::PackTensor(T *output, size_t start, size_t end) {
void PackCpuFwdKernel<T>::PackTensor(T *output, size_t start, size_t end) const {
for (size_t pos = start; pos < end; ++pos) {
size_t cur_input_index = pos / dims_behind_axis_ % input_num_;
size_t cycle_len = input_num_ * dims_behind_axis_;


+ 12
- 12
mindspore/ccsrc/backend/kernel_compiler/cpu/pack_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,8 +13,9 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_PACK_CPU_KERNEL_H
#define MINDSPORE_PACK_CPU_KERNEL_H

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_PACK_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_PACK_CPU_KERNEL_H_

#include <vector>
#include <memory>
@@ -26,7 +27,7 @@ namespace kernel {
template <typename T>
class PackCpuFwdKernel : public CPUKernel {
public:
PackCpuFwdKernel();
PackCpuFwdKernel() = default;
~PackCpuFwdKernel() override = default;

void InitKernel(const CNodePtr &kernel_node) override;
@@ -34,14 +35,13 @@ class PackCpuFwdKernel : public CPUKernel {
const std::vector<AddressPtr> &outputs) override;

private:
bool CheckParam(const std::vector<AddressPtr> &outputs) const;
void PackTensor(T *output, size_t start, size_t end);
void PackTensor(T *output, size_t start, size_t end) const;

int axis_;
size_t input_num_;
size_t output_size_;
size_t dims_behind_axis_;
std::unique_ptr<T *[]> inputs_host_;
int axis_{0};
size_t input_num_{1};
size_t output_size_{0};
size_t dims_behind_axis_{1};
std::unique_ptr<T *[]> inputs_host_ { nullptr };
};

MS_REG_CPU_KERNEL_T(Stack, KernelAttr(), PackCpuFwdKernel, int8_t)
@@ -57,4 +57,4 @@ MS_REG_CPU_KERNEL_T(Stack, KernelAttr(), PackCpuFwdKernel, float)
MS_REG_CPU_KERNEL_T(Stack, KernelAttr(), PackCpuFwdKernel, bool)
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_PACK_CPU_KERNEL_H
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_PACK_CPU_KERNEL_H_

+ 16
- 22
mindspore/ccsrc/backend/kernel_compiler/cpu/pad_cpu_kernel.cc View File

@@ -19,7 +19,14 @@

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kPadInputsNum = 1;
constexpr size_t kPadOutputsNum = 1;
} // namespace

void PadCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
paddings_ = AnfAlgo::GetNodeAttr<std::vector<std::vector<int64_t>>>(kernel_node, "paddings");
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
std::vector<size_t> input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
@@ -27,12 +34,10 @@ void PadCPUKernel::InitKernel(const CNodePtr &kernel_node) {
shape_size_ = input_shape.size();
if (shape_size_ == 4) { // shape adjustment from 2d/3d to 4d
} else if (shape_size_ == 3) {
auto it = input_shape.begin();
input_shape.insert(it, 1); // batch padding
(void)input_shape.insert(input_shape.begin(), 1); // batch padding
shape_size_ = 4;
} else if (shape_size_ == 2) {
auto it = input_shape.begin();
input_shape.insert(it, 2, 1); // channel padding
(void)input_shape.insert(input_shape.begin(), 2, 1); // channel padding
shape_size_ = 4;
}

@@ -43,11 +48,9 @@ void PadCPUKernel::InitKernel(const CNodePtr &kernel_node) {

if (paddings_.size() == 4) { // shape adjustment from 2d/3d to 4d
} else if (paddings_.size() == 3) {
auto it = paddings_.begin();
paddings_.insert(it, 1, {0, 0}); // batch padding
(void)paddings_.insert(paddings_.begin(), 1, {0, 0}); // batch padding
} else if (paddings_.size() == 2) {
auto it = paddings_.begin();
paddings_.insert(it, 2, {0, 0}); // channel padding
(void)paddings_.insert(paddings_.begin(), 2, {0, 0}); // channel padding
}

for (size_t i = 0; i < shape_size_; i++) {
@@ -59,6 +62,8 @@ void PadCPUKernel::InitKernel(const CNodePtr &kernel_node) {

bool PadCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kPadInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kPadOutputsNum, kernel_name_);
if (dtype_ == kNumberTypeFloat16) {
LaunchKernel<float16>(inputs, outputs);
} else if (dtype_ == kNumberTypeFloat32) {
@@ -74,9 +79,9 @@ bool PadCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const s
}

template <typename T>
void PadCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
auto inputs_addr = reinterpret_cast<T *>(inputs[0]->addr);
auto outputs_addr = reinterpret_cast<T *>(outputs[0]->addr);
void PadCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const {
const auto *inputs_addr = reinterpret_cast<T *>(inputs[0]->addr);
auto *outputs_addr = reinterpret_cast<T *>(outputs[0]->addr);

const int pad_left = paddings_[3][0];
const int pad_top = paddings_[2][0];
@@ -112,16 +117,5 @@ void PadCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std
}
}
}

void PadCPUKernel::CheckParam(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 1) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but PadCPUKernel needs 1 input.";
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but PadCPUKernel needs 1 output.";
}
}
} // namespace kernel
} // namespace mindspore

+ 7
- 7
mindspore/ccsrc/backend/kernel_compiler/cpu/pad_cpu_kernel.h View File

@@ -16,6 +16,7 @@

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_PAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_PAD_CPU_KERNEL_H_

#include <memory>
#include <unordered_map>
#include <vector>
@@ -34,16 +35,15 @@ class PadCPUKernel : public CPUKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;

private:
template <typename T>
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const;

private:
void CheckParam(const CNodePtr &kernel_node);
std::vector<std::vector<int64_t>> paddings_;
TypeId dtype_{kTypeUnknown};
uint64_t tensor_size_ = 1;
size_t shape_size_ = 1;
uint64_t output_size_ = 1;
uint64_t tensor_size_{1};
size_t shape_size_{1};
uint64_t output_size_{1};
std::vector<std::vector<int64_t>> paddings_;
std::vector<size_t> input_shape_;
std::vector<size_t> output_shape_;
};


+ 17
- 8
mindspore/ccsrc/backend/kernel_compiler/cpu/range_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,18 +13,27 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/range_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kRangeInputsNum = 3;
constexpr size_t kRangeOutputsNum = 1;
} // namespace

void RangeCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
}

bool RangeCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kRangeInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kRangeOutputsNum, kernel_name_);
if (dtype_ == kNumberTypeInt32) {
return LaunchKernel<int32_t>(inputs, outputs);
} else if (dtype_ == kNumberTypeFloat32) {
@@ -35,19 +44,19 @@ bool RangeCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const
}

template <typename T>
bool RangeCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
T start_ = reinterpret_cast<T *>(inputs[0]->addr)[0];
T limit_ = reinterpret_cast<T *>(inputs[1]->addr)[0];
T delta_ = reinterpret_cast<T *>(inputs[2]->addr)[0];
bool RangeCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const {
auto start = reinterpret_cast<T *>(inputs[0]->addr)[0];
auto limit = reinterpret_cast<T *>(inputs[1]->addr)[0];
auto delta = reinterpret_cast<T *>(inputs[2]->addr)[0];

auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
size_t elem_num = outputs[0]->size / sizeof(T);
for (size_t i = 0; i < elem_num; i++) {
T val_ = start_ + static_cast<T>(i) * delta_;
if (val_ > limit_) {
T val = start + static_cast<T>(i) * delta;
if (val > limit) {
break;
}
output_addr[i] = val_;
output_addr[i] = val;
}
return true;
}


+ 6
- 3
mindspore/ccsrc/backend/kernel_compiler/cpu/range_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,8 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_RANGE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_RANGE_CPU_KERNEL_H_

#include <vector>
#include <memory>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
@@ -31,10 +33,11 @@ class RangeCPUKernel : public CPUKernel {

bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;
template <typename T>
bool LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);

private:
template <typename T>
bool LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const;

TypeId dtype_{kTypeUnknown};
};



+ 25
- 17
mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.cc View File

@@ -23,13 +23,23 @@

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kReduceSmallVectorSize = 200000;
constexpr size_t kReduceInputsNum = 1;
constexpr size_t kReduceOutputsNum = 1;
} // namespace

template <typename T>
void ReduceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
auto axis_addr = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(AXIS);
auto prim = AnfAlgo::GetCNodePrimitive(kernel_node);
MS_EXCEPTION_IF_NULL(prim);
auto axis_addr = prim->GetAttr(AXIS);
if (axis_addr == nullptr) {
MS_LOG(EXCEPTION) << "Miss attribute " << AXIS;
}
if (axis_addr->isa<ValueTuple>() || axis_addr->isa<ValueList>()) {
axis_ = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, AXIS);
} else if (axis_addr->isa<Int64Imm>()) {
@@ -39,8 +49,8 @@ void ReduceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
}

int dimension = input_shape_.size();
std::transform(axis_.begin(), axis_.end(), axis_.begin(),
[dimension](const auto &a) { return a < 0 ? dimension + a : a; });
(void)std::transform(axis_.begin(), axis_.end(), axis_.begin(),
[dimension](const auto &a) { return a < 0 ? dimension + a : a; });
sort(axis_.begin(), axis_.end());
// Delete the duplicate axis.
auto last = std::unique(axis_.begin(), axis_.end());
@@ -48,30 +58,30 @@ void ReduceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);

if constexpr (std::is_same<T, bool>::value) {
if (kernel_name == "ReduceAll") {
if (kernel_name_ == prim::kPrimReduceAll->name()) {
reduce_type_ = kReduceAll;
reduce_func_ = [](const T *input, size_t pos, T *out) { *out &= input[pos]; };
} else if (kernel_name == "ReduceAny") {
} else if (kernel_name_ == prim::kPrimReduceAny->name()) {
reduce_type_ = kReduceAny;
reduce_func_ = [](const T *input, size_t pos, T *out) { *out |= input[pos]; };
} else {
MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << fullname_ << " for bool.";
MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << kernel_name_ << " for bool.";
}
} else {
if (kernel_name == "ReduceMax") {
if (kernel_name_ == prim::kPrimReduceMax->name()) {
reduce_type_ = kReduceMax;
reduce_func_ = [](const T *input, size_t pos, T *out) { *out = std::max(input[pos], *out); };
} else if (kernel_name == "ReduceMin") {
} else if (kernel_name_ == prim::kPrimReduceMin->name()) {
reduce_type_ = kReduceMin;
reduce_func_ = [](const T *input, size_t pos, T *out) { *out = std::min(input[pos], *out); };
} else if (kernel_name == "ReduceSum") {
} else if (kernel_name_ == prim::kPrimReduceSum->name()) {
reduce_type_ = kReduceSum;
reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; };
} else if (kernel_name == "ReduceMean") {
} else if (kernel_name_ == prim::kPrimReduceMean->name()) {
reduce_type_ = kReduceMean;
reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; };
} else {
MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << kernel_name;
MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << kernel_name_;
}
}

@@ -87,13 +97,11 @@ void ReduceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
template <typename T>
bool ReduceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kReduceInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kReduceOutputsNum, kernel_name_);
size_t input_size = inputs[0]->size / sizeof(T);
if (input_size == 0) {
MS_LOG(EXCEPTION) << "Input data size is 0.";
}

auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr);
auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr);
if (axis_.empty() || input_shape_.empty() || input_shape_.size() == 1) {
if (input_size < kReduceSmallVectorSize) {
// Get one ret


+ 3
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,8 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_

#include <vector>
#include <memory>
#include <string>


Loading…
Cancel
Save