Merge pull request !23733 from zhangbuxue/code_check_fixtags/v1.6.0
| @@ -13,26 +13,32 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "backend/kernel_compiler/cpu/adam_cpu_kernel.h" | |||
| #include "nnacl/errorcode.h" | |||
| #include "nnacl/fp32/adam_fp32.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/errorcode.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/fp32/adam_fp32.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "utils/ms_utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kAdamInputsNum = 10; | |||
| constexpr size_t kAdamOutputsNum = 3; | |||
| constexpr size_t kScalarIndex = 0; | |||
| } // namespace | |||
| template <typename T> | |||
| void AdamCPUKernel::LaunchAdam(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &) { | |||
| T *var = reinterpret_cast<T *>(inputs[VAR]->addr); | |||
| T *m = reinterpret_cast<T *>(inputs[M]->addr); | |||
| T *v = reinterpret_cast<T *>(inputs[V]->addr); | |||
| float beta1_power = reinterpret_cast<float *>(inputs[BETA1_POWER]->addr)[SCALAR_INDEX]; | |||
| float beta2_power = reinterpret_cast<float *>(inputs[BETA2_POWER]->addr)[SCALAR_INDEX]; | |||
| float lr = reinterpret_cast<float *>(inputs[LR]->addr)[SCALAR_INDEX]; | |||
| T beta1 = static_cast<T>(reinterpret_cast<float *>(inputs[BETA1]->addr)[SCALAR_INDEX]); | |||
| T beta2 = static_cast<T>(reinterpret_cast<float *>(inputs[BETA1]->addr)[SCALAR_INDEX]); | |||
| T epsilon = static_cast<T>(reinterpret_cast<float *>(inputs[EPSILON]->addr)[SCALAR_INDEX]); | |||
| float beta1_power = reinterpret_cast<float *>(inputs[BETA1_POWER]->addr)[kScalarIndex]; | |||
| float beta2_power = reinterpret_cast<float *>(inputs[BETA2_POWER]->addr)[kScalarIndex]; | |||
| float lr = reinterpret_cast<float *>(inputs[LR]->addr)[kScalarIndex]; | |||
| T beta1 = static_cast<T>(reinterpret_cast<float *>(inputs[BETA1]->addr)[kScalarIndex]); | |||
| T beta2 = static_cast<T>(reinterpret_cast<float *>(inputs[BETA1]->addr)[kScalarIndex]); | |||
| T epsilon = static_cast<T>(reinterpret_cast<float *>(inputs[EPSILON]->addr)[kScalarIndex]); | |||
| T *gradient = reinterpret_cast<T *>(inputs[GRAD]->addr); | |||
| constexpr float ONE = 1.0; | |||
| if (beta1_power - ONE == 0) { | |||
| @@ -62,12 +68,12 @@ void AdamCPUKernel::LaunchAdamNnacl(const std::vector<kernel::AddressPtr> &input | |||
| float *var = reinterpret_cast<float *>(inputs[VAR]->addr); | |||
| float *m = reinterpret_cast<float *>(inputs[M]->addr); | |||
| float *v = reinterpret_cast<float *>(inputs[V]->addr); | |||
| float beta1_power = reinterpret_cast<float *>(inputs[BETA1_POWER]->addr)[SCALAR_INDEX]; | |||
| float beta2_power = reinterpret_cast<float *>(inputs[BETA2_POWER]->addr)[SCALAR_INDEX]; | |||
| float lr = reinterpret_cast<float *>(inputs[LR]->addr)[SCALAR_INDEX]; | |||
| float beta1 = reinterpret_cast<float *>(inputs[BETA1]->addr)[SCALAR_INDEX]; | |||
| float beta2 = reinterpret_cast<float *>(inputs[BETA2]->addr)[SCALAR_INDEX]; | |||
| float epsilon = reinterpret_cast<float *>(inputs[EPSILON]->addr)[SCALAR_INDEX]; | |||
| float beta1_power = reinterpret_cast<float *>(inputs[BETA1_POWER]->addr)[kScalarIndex]; | |||
| float beta2_power = reinterpret_cast<float *>(inputs[BETA2_POWER]->addr)[kScalarIndex]; | |||
| float lr = reinterpret_cast<float *>(inputs[LR]->addr)[kScalarIndex]; | |||
| float beta1 = reinterpret_cast<float *>(inputs[BETA1]->addr)[kScalarIndex]; | |||
| float beta2 = reinterpret_cast<float *>(inputs[BETA2]->addr)[kScalarIndex]; | |||
| float epsilon = reinterpret_cast<float *>(inputs[EPSILON]->addr)[kScalarIndex]; | |||
| float *gradient = reinterpret_cast<float *>(inputs[GRAD]->addr); | |||
| constexpr float ONE = 1.0; | |||
| if (beta1_power - ONE == 0) { | |||
| @@ -88,26 +94,20 @@ void AdamCPUKernel::LaunchAdamNnacl(const std::vector<kernel::AddressPtr> &input | |||
| void AdamCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| if (input_num != INPUT_NUMS) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but Adam needs 10 inputs."; | |||
| } | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| CHECK_KERNEL_INPUTS_NUM(input_num, kAdamInputsNum, kernel_name_); | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != OUTPUT_NUMS) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but Adam needs 3 outputs."; | |||
| } | |||
| use_nesterov_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "use_nesterov"); | |||
| CHECK_KERNEL_OUTPUTS_NUM(output_num, kAdamOutputsNum, kernel_name_); | |||
| use_nesterov_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, USE_NESTEROV); | |||
| } | |||
| bool AdamCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() != INPUT_NUMS) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << inputs.size() << ", but Adam needs 10 inputs."; | |||
| } | |||
| if (outputs.size() != OUTPUT_NUMS) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << outputs.size() << ", but Adam needs 3 outputs."; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kAdamInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kAdamOutputsNum, kernel_name_); | |||
| if (inputs[VAR]->size != inputs[M]->size || inputs[VAR]->size != inputs[V]->size || | |||
| inputs[VAR]->size != inputs[GRAD]->size) { | |||
| MS_LOG(EXCEPTION) << "Error input data size!"; | |||
| @@ -124,7 +124,6 @@ bool AdamCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const | |||
| LaunchAdam<float16>(inputs, outputs); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Adam not support " << dtype_; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| @@ -13,33 +13,33 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADAM_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADAM_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr size_t SCALAR_INDEX = 0; | |||
| constexpr size_t INPUT_NUMS = 10; | |||
| constexpr size_t OUTPUT_NUMS = 3; | |||
| class AdamCPUKernel : public CPUKernel { | |||
| public: | |||
| AdamCPUKernel() = default; | |||
| ~AdamCPUKernel() override = default; | |||
| template <typename T> | |||
| void LaunchAdam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| void LaunchAdamNnacl(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| template <typename T> | |||
| void LaunchAdam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| void LaunchAdamNnacl(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| bool use_nesterov_{false}; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| enum input_list_ { VAR, M, V, BETA1_POWER, BETA2_POWER, LR, BETA1, BETA2, EPSILON, GRAD }; | |||
| @@ -13,20 +13,24 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <thread> | |||
| #include "backend/kernel_compiler/cpu/adam_delta_cpu_kernel.h" | |||
| #include <vector> | |||
| #include <string> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/common_utils.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/fp32/adam_fp32.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "backend/kernel_compiler/cpu/adam_delta_cpu_kernel.h" | |||
| #include "nnacl/errorcode.h" | |||
| #include "nnacl/fp32/adam_fp32.h" | |||
| #include "utils/ms_utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr size_t kAdamDeltaInputSize = 9; | |||
| namespace { | |||
| constexpr size_t kAdamDeltaInputsNum = 9; | |||
| constexpr size_t kAdamDeltaOutputsNum = 1; | |||
| } // namespace | |||
| template <typename T> | |||
| void AdamDeltaCPUKernel::LaunchAdamDelta(T *delta, T *m, T *v, float lr, float beta1, float beta2, float epsilon, | |||
| const T *gradient, size_t size) { | |||
| @@ -55,6 +59,7 @@ void AdamDeltaCPUKernel::LaunchAdamDelta(T *delta, T *m, T *v, float lr, float b | |||
| void AdamDeltaCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| std::vector<size_t> delta_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> m_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> v_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| @@ -86,14 +91,14 @@ void AdamDeltaCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| void AdamDeltaCPUKernel::CheckParams(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &outputs) const { | |||
| if (inputs.size() != kAdamDeltaInputSize) { | |||
| MS_LOG(EXCEPTION) << "Error input size!"; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kAdamDeltaInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kAdamDeltaOutputsNum, kernel_name_); | |||
| size_t elem_size = elem_num_ * 4; | |||
| std::vector<size_t> expect_sizes = {elem_size, elem_size, 4, 4, 4, 4, 4, 4, elem_size}; | |||
| std::vector<std::string> input_names = {"m", "v", "beta1_power", "beta2_power", "lr", | |||
| "beta1", "beta2", "epsilon", "grad"}; | |||
| for (size_t i = 0; i < kAdamDeltaInputSize; ++i) { | |||
| for (size_t i = 0; i < kAdamDeltaInputsNum; ++i) { | |||
| if (inputs[i]->size != expect_sizes[i]) { | |||
| MS_LOG(EXCEPTION) << "Error input " << input_names[i] << " size!"; | |||
| } | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,9 +13,12 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADAM_DELTA_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADAM_DELTA_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -29,8 +32,9 @@ class AdamDeltaCPUKernel : public CPUKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| protected: | |||
| private: | |||
| void CheckParams(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const; | |||
| template <typename T> | |||
| void LaunchAdamDelta(T *delta, T *m, T *v, float lr, float beta1, float beta2, float epsilon, const T *gradient, | |||
| size_t size); | |||
| @@ -13,12 +13,14 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/adam_weight_decay_cpu_kernel.h" | |||
| #include <cmath> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/errorcode.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/fp32/adam_fp32.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "nnacl/errorcode.h" | |||
| #include "nnacl/fp32/adam_fp32.h" | |||
| #include "utils/ms_utils.h" | |||
| namespace mindspore { | |||
| @@ -13,11 +13,13 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADAM_WEIGHT_DECAY_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADAM_WEIGHT_DECAY_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/allgather_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "runtime/device/cpu/mpi/mpi_interface.h" | |||
| @@ -21,28 +22,25 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kAllGatherInputsNum = 1; | |||
| constexpr size_t kAllGatherOutputsNum = 1; | |||
| constexpr auto kRanksGroup = "group"; | |||
| constexpr auto kAllGatherInputNum = 1; | |||
| } // namespace | |||
| void AllGatherCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != kAllGatherInputNum) { | |||
| MS_LOG(EXCEPTION) << "Allgather input num:" << input_num; | |||
| } | |||
| auto ranks_group = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(kRanksGroup); | |||
| if (ranks_group != nullptr) { | |||
| ranks_group_ = GetValue<std::vector<int>>(ranks_group); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Miss attribute " << kRanksGroup; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(input_num, kAllGatherInputsNum, kernel_name_); | |||
| ranks_group_ = AnfAlgo::GetNodeAttr<std::vector<int>>(kernel_node, kRanksGroup); | |||
| } | |||
| bool AllGatherCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| auto input_addr = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto output_addr = reinterpret_cast<float *>(outputs[0]->addr); | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kAllGatherInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kAllGatherOutputsNum, kernel_name_); | |||
| auto *input_addr = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto *output_addr = reinterpret_cast<float *>(outputs[0]->addr); | |||
| auto input_data_num = inputs[0]->size / sizeof(float); | |||
| return MPIAllGather(input_addr, output_addr, ranks_group_, input_data_num); | |||
| } | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,10 +13,13 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_SCATTER_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_SCATTER_CPU_KERNEL_H_ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ALLGATHER_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ALLGATHER_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -41,4 +44,4 @@ MS_REG_CPU_KERNEL(_HostAllGather, KernelAttr().AddInputAttr(kNumberTypeFloat32). | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_SCATTER_CPU_KERNEL_H_ | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ALLGATHER_CPU_KERNEL_H_ | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -24,11 +24,13 @@ namespace kernel { | |||
| namespace { | |||
| constexpr size_t kSizeFloat16 = 2; | |||
| constexpr size_t kSizeFloat32 = 4; | |||
| constexpr size_t kInputSize = 4; | |||
| constexpr size_t kOutputSize = 2; | |||
| constexpr size_t kApplyAdagradInputsNum = 4; | |||
| constexpr size_t kApplyAdagradOutputsNum = 2; | |||
| } // namespace | |||
| void ApplyAdagradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| update_slots_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "update_slots"); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| } | |||
| @@ -36,47 +38,41 @@ void ApplyAdagradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool ApplyAdagradCPUKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| CheckParam(inputs, outputs); | |||
| if (dtype_ == kNumberTypeFloat16) { | |||
| LaunchKernel<float16>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat32) { | |||
| LaunchKernel<float>(inputs, outputs); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << kernel_name_ << " only support float16 and float32 on CPU, but got " | |||
| << TypeIdToType(dtype_)->ToString(); | |||
| } | |||
| return true; | |||
| } | |||
| void ApplyAdagradCPUKernel::CheckParam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) { | |||
| void ApplyAdagradCPUKernel::CheckParam(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) const { | |||
| // inputs: var, accum, lr, gradient | |||
| if (inputs.size() != kInputSize) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << inputs.size() << ", but ApplyAdagrad needs 4 inputs."; | |||
| } | |||
| // outputs: var, accum | |||
| if (outputs.size() != kOutputSize) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << outputs.size() << ", but ApplyAdagrad needs 2 outputs."; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kApplyAdagradInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kApplyAdagradOutputsNum, kernel_name_); | |||
| if (inputs[0]->size != inputs[1]->size || inputs[0]->size != inputs[3]->size) { | |||
| MS_LOG(EXCEPTION) << "Error input data size!"; | |||
| } | |||
| if (inputs[2]->size != kSizeFloat16 && inputs[2]->size != kSizeFloat32) { | |||
| MS_LOG(EXCEPTION) << "The attribute lr and grad must be float16 or float32!"; | |||
| MS_LOG(EXCEPTION) << kernel_name_ << " requires the attribute lr and grad must be float16 or float32!"; | |||
| } | |||
| } | |||
| template <typename T> | |||
| void ApplyAdagradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| auto var = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto accum = reinterpret_cast<T *>(inputs[1]->addr); | |||
| auto lr = reinterpret_cast<T *>(inputs[2]->addr); | |||
| auto gradient = reinterpret_cast<T *>(inputs[3]->addr); | |||
| auto *var = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *accum = reinterpret_cast<T *>(inputs[1]->addr); | |||
| const auto *lr = reinterpret_cast<T *>(inputs[2]->addr); | |||
| const auto *gradient = reinterpret_cast<T *>(inputs[3]->addr); | |||
| // multithreading | |||
| size_t length = inputs[0]->size / sizeof(T); | |||
| auto task = [this, &var, &accum, lr, gradient](size_t start, size_t end) { | |||
| auto task = [this, &var, &accum, &lr, &gradient](size_t start, size_t end) { | |||
| LaunchApplyAdagrad(var, accum, lr, gradient, start, end); | |||
| }; | |||
| CPUKernelUtils::ParallelForAutoSearch(task, length, ¶llel_search_info_); | |||
| @@ -87,19 +83,17 @@ void ApplyAdagradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| if (memcpy_s(output_var, outputs[0]->size, var, inputs[0]->size) != EOK) { | |||
| MS_LOG(EXCEPTION) << "Launch kernel error: memcpy failed."; | |||
| } | |||
| if (memcpy_s(output_accum, outputs[1]->size, accum, inputs[1]->size) != EOK) { | |||
| MS_LOG(EXCEPTION) << "Launch kernel error: memcpy failed."; | |||
| } | |||
| } | |||
| template <typename T> | |||
| void ApplyAdagradCPUKernel::LaunchApplyAdagrad(T const var, T const accum, const T lr, const T gradient, size_t start, | |||
| size_t end) { | |||
| void ApplyAdagradCPUKernel::LaunchApplyAdagrad(T *var, T *accum, const T *lr, const T *gradient, size_t start, | |||
| size_t end) const { | |||
| // DataType can only be float32 or float16, so eps will not be zero. | |||
| using DataType = typename std::iterator_traits<T>::value_type; | |||
| const DataType one = DataType(1); | |||
| const DataType eps = DataType(1e-6); | |||
| auto one = static_cast<T>(1); | |||
| auto eps = static_cast<T>(1e-6); | |||
| for (size_t i = start; i < end; ++i) { | |||
| // update accum: accum += grad * grad | |||
| if (update_slots_) { | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,11 +13,13 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_APPLY_ADAGRAD_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_APPLY_ADAGRAD_CPU_KERNEL_H_ | |||
| #include <thread> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -34,11 +36,14 @@ class ApplyAdagradCPUKernel : public CPUKernel { | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| static void CheckParam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| void CheckParam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const; | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| template <typename T> | |||
| void LaunchApplyAdagrad(T const var, T const accum, const T lr, const T gradient, size_t start, size_t end); | |||
| void LaunchApplyAdagrad(T *var, T *accum, const T *lr, const T *gradient, size_t start, size_t end) const; | |||
| bool update_slots_{true}; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| }; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/apply_momentum_cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| @@ -20,20 +21,25 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void ApplyMomentumCPUKernel::InitKernel(const CNodePtr &) {} | |||
| namespace { | |||
| constexpr size_t kApplyMomentumInputsNum = 5; | |||
| } // namespace | |||
| void ApplyMomentumCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| } | |||
| bool ApplyMomentumCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &, const std::vector<kernel::AddressPtr> &) { | |||
| if (inputs.size() < 5) { | |||
| MS_LOG(EXCEPTION) << "Error input output size!"; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kApplyMomentumInputsNum, kernel_name_); | |||
| if (inputs[0]->size != inputs[1]->size || inputs[0]->size != inputs[3]->size) { | |||
| MS_LOG(EXCEPTION) << "Error input data size!"; | |||
| } | |||
| auto weight = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto accumulate = reinterpret_cast<float *>(inputs[1]->addr); | |||
| auto *weight = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto *accumulate = reinterpret_cast<float *>(inputs[1]->addr); | |||
| float learning_rate = reinterpret_cast<float *>(inputs[2]->addr)[0]; | |||
| auto gradient = reinterpret_cast<float *>(inputs[3]->addr); | |||
| const auto *gradient = reinterpret_cast<float *>(inputs[3]->addr); | |||
| float moment = reinterpret_cast<float *>(inputs[4]->addr)[0]; | |||
| size_t elem_num = inputs[0]->size / sizeof(float); | |||
| for (size_t i = 0; i < elem_num; ++i) { | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,16 +13,19 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_APPLY_MOMENTUM_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_APPLY_MOMENTUM_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class ApplyMomentumCPUKernel : public MKLCPUKernel { | |||
| class ApplyMomentumCPUKernel : public CPUKernel { | |||
| public: | |||
| ApplyMomentumCPUKernel() = default; | |||
| ~ApplyMomentumCPUKernel() override = default; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,12 +13,20 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/argmax_cpu_kernel.h" | |||
| #include <string> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kArgMaxInputsNum = 1; | |||
| constexpr size_t kArgMaxOutputsNum = 1; | |||
| constexpr char kKernelName[] = "ArgMax"; | |||
| size_t get_element_num(const std::vector<size_t> &shape) { | |||
| size_t size = 1; | |||
| for (size_t i = 0; i < shape.size(); i++) { | |||
| @@ -30,17 +38,14 @@ size_t get_element_num(const std::vector<size_t> &shape) { | |||
| template <typename T> | |||
| bool check_validation(const std::vector<size_t> &shape, const size_t num_before_axis, const size_t num_after_axis, | |||
| const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() != 1 || outputs.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "Wrong number of inputs or outputs!"; | |||
| return false; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kArgMaxInputsNum, kKernelName); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kArgMaxOutputsNum, kKernelName); | |||
| size_t data_size = sizeof(T); | |||
| size_t input_size = get_element_num(shape) * data_size; | |||
| size_t output_num = num_before_axis * num_after_axis; | |||
| size_t output_size = output_num * sizeof(int); | |||
| if (inputs[0]->size != input_size || outputs[0]->size != output_size) { | |||
| MS_LOG(EXCEPTION) << "Invalid input or output data size!"; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| @@ -49,24 +54,28 @@ bool check_validation(const std::vector<size_t> &shape, const size_t num_before_ | |||
| template <typename T> | |||
| void ArgmaxCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| size_t shape_len = shape_.size(); | |||
| if (shape_len == 0) { | |||
| MS_LOG(EXCEPTION) << "Shape size should be greater than 0"; | |||
| } | |||
| int64_t axis = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS); | |||
| axis += SizeToLong(shape_len); | |||
| if (axis < 0) { | |||
| MS_LOG(EXCEPTION) << "Invalid axis:" << axis << ", should in range [-1, " << (shape_len - 1) << "]"; | |||
| } | |||
| axis = axis % static_cast<int64_t>(shape_len); | |||
| axis = axis % SizeToLong(shape_len); | |||
| num_before_axis_ = 1; | |||
| num_after_axis_ = 1; | |||
| for (size_t i = 0; i < shape_len; i++) { | |||
| if (static_cast<int64_t>(i) < axis) { | |||
| if (SizeToLong(i) < axis) { | |||
| num_before_axis_ *= shape_[i]; | |||
| } else if (static_cast<int64_t>(i) > axis) { | |||
| } else if (SizeToLong(i) > axis) { | |||
| num_after_axis_ *= shape_[i]; | |||
| } | |||
| } | |||
| dim_axis_ = shape_[axis]; | |||
| dim_axis_ = shape_[LongToSize(axis)]; | |||
| } | |||
| template <typename T> | |||
| @@ -76,8 +85,8 @@ bool ArgmaxCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, c | |||
| return false; | |||
| } | |||
| auto input = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto output = reinterpret_cast<int32_t *>(outputs[0]->addr); | |||
| const auto *input = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *output = reinterpret_cast<int32_t *>(outputs[0]->addr); | |||
| std::vector<float> array_axis(dim_axis_); | |||
| for (size_t i = 0; i < num_before_axis_; i++) { | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,10 +13,13 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAX_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAX_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -35,9 +38,9 @@ class ArgmaxCPUKernel : public CPUKernel { | |||
| private: | |||
| std::vector<size_t> shape_; | |||
| size_t num_before_axis_; | |||
| size_t num_after_axis_; | |||
| size_t dim_axis_; | |||
| size_t num_before_axis_{0}; | |||
| size_t num_after_axis_{0}; | |||
| size_t dim_axis_{0}; | |||
| }; | |||
| MS_REG_CPU_KERNEL_T(Argmax, KernelAttr(), ArgmaxCPUKernel, float); | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/argmax_with_value_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| @@ -13,12 +13,15 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAXWITHVALUE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAXWITHVALUE_CPU_KERNEL_H_ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAX_WITH_VALUE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAX_WITH_VALUE_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <map> | |||
| #include <memory> | |||
| #include <algorithm> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -47,4 +50,4 @@ MS_REG_CPU_KERNEL_T(ArgMaxWithValue, KernelAttr(), ArgMaxWithValueCPUKernel, flo | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAXWITHVALUE_CPU_KERNEL_H_ | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAX_WITH_VALUE_CPU_KERNEL_H_ | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,12 +13,20 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/argmin_with_value_cpu_kernel.h" | |||
| #include <string> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kArgMinWithValueInputsNum = 1; | |||
| constexpr size_t kArgMinWithValueOutputsNum = 2; | |||
| constexpr char kKernelName[] = "ArgMinWithValue"; | |||
| size_t get_element_num(const std::vector<size_t> &shape) { | |||
| size_t size = 1; | |||
| for (size_t i = 0; i < shape.size(); i++) { | |||
| @@ -30,10 +38,8 @@ size_t get_element_num(const std::vector<size_t> &shape) { | |||
| template <typename T> | |||
| bool check_validation(const std::vector<size_t> &shape, const size_t num_before_axis, const size_t num_after_axis, | |||
| const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() != 1 || outputs.size() != 2) { | |||
| MS_LOG(EXCEPTION) << "Wrong number of inputs or outputs!"; | |||
| return false; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kArgMinWithValueInputsNum, kKernelName); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kArgMinWithValueOutputsNum, kKernelName); | |||
| size_t data_size = sizeof(T); | |||
| size_t input_size = get_element_num(shape) * data_size; | |||
| size_t output_num = num_before_axis * num_after_axis; | |||
| @@ -41,7 +47,6 @@ bool check_validation(const std::vector<size_t> &shape, const size_t num_before_ | |||
| size_t out1_size = output_num * data_size; | |||
| if (inputs[0]->size != input_size || outputs[0]->size != out0_size || outputs[1]->size != out1_size) { | |||
| MS_LOG(EXCEPTION) << "Invalid input or output data size!"; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| @@ -50,8 +55,12 @@ bool check_validation(const std::vector<size_t> &shape, const size_t num_before_ | |||
| template <typename T> | |||
| void ArgMinWithValueCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| size_t shape_len = shape_.size(); | |||
| if (shape_len == 0) { | |||
| MS_LOG(EXCEPTION) << "Shape size should be greater than 0"; | |||
| } | |||
| int64_t axis = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS); | |||
| axis += static_cast<int64_t>(shape_len); | |||
| if (axis < 0) { | |||
| @@ -78,10 +87,9 @@ bool ArgMinWithValueCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> & | |||
| return false; | |||
| } | |||
| auto input = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto output0 = reinterpret_cast<int32_t *>(outputs[0]->addr); | |||
| auto output1 = reinterpret_cast<T *>(outputs[1]->addr); | |||
| const auto *input = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *output0 = reinterpret_cast<int32_t *>(outputs[0]->addr); | |||
| auto *output1 = reinterpret_cast<T *>(outputs[1]->addr); | |||
| std::vector<float> array_axis(dim_axis_); | |||
| for (size_t i = 0; i < num_before_axis_; i++) { | |||
| size_t src_index_i = i * dim_axis_ * num_after_axis_; | |||
| @@ -93,9 +101,9 @@ bool ArgMinWithValueCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> & | |||
| } | |||
| auto min_ops = std::min_element(array_axis.begin(), array_axis.end()); | |||
| auto min_index = static_cast<int32_t>(std::distance(array_axis.begin(), min_ops)); | |||
| auto dst_index = i * num_after_axis_ + j; | |||
| size_t dst_index = i * num_after_axis_ + j; | |||
| output0[dst_index] = min_index; | |||
| auto src_index = IntToSize(min_index) * num_after_axis_ + src_index_j; | |||
| size_t src_index = IntToSize(min_index) * num_after_axis_ + src_index_j; | |||
| output1[dst_index] = input[src_index]; | |||
| } | |||
| } | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,12 +13,15 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMINWITHVALUE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMINWITHVALUE_CPU_KERNEL_H_ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMIN_WITH_VALUE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMIN_WITH_VALUE_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <map> | |||
| #include <memory> | |||
| #include <algorithm> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -37,9 +40,9 @@ class ArgMinWithValueCPUKernel : public CPUKernel { | |||
| private: | |||
| std::vector<size_t> shape_; | |||
| size_t num_before_axis_; | |||
| size_t num_after_axis_; | |||
| size_t dim_axis_; | |||
| size_t num_before_axis_{0}; | |||
| size_t num_after_axis_{0}; | |||
| size_t dim_axis_{0}; | |||
| }; | |||
| MS_REG_CPU_KERNEL_T(ArgMinWithValue, KernelAttr(), ArgMinWithValueCPUKernel, float); | |||
| @@ -47,4 +50,4 @@ MS_REG_CPU_KERNEL_T(ArgMinWithValue, KernelAttr(), ArgMinWithValueCPUKernel, flo | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMINWITHVALUE_CPU_KERNEL_H_ | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMIN_WITH_VALUE_CPU_KERNEL_H_ | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,18 +13,56 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <string> | |||
| #include <map> | |||
| #include "backend/kernel_compiler/cpu/arithmetic_cpu_kernel.h" | |||
| #include <cmath> | |||
| #include <string> | |||
| #include <unordered_map> | |||
| #include <limits> | |||
| #include "backend/kernel_compiler/cpu/nnacl/fp32/power_fp32.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/fp32/sub_fp32.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/fp32/mul_fp32.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "nnacl/fp32/power_fp32.h" | |||
| #include "nnacl/fp32/sub_fp32.h" | |||
| #include "nnacl/fp32/mul_fp32.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kInputsNum = 2; | |||
| constexpr size_t kOutputsNum = 1; | |||
| constexpr float kMaxSubSerialSize = 10000.0; | |||
| constexpr float kMaxPowSerialSize = 700.0; | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::AssignAdd(T *input1, const T *input2, T *out) { | |||
| void ElementRealDiv(const T *input1, const T *input2, T *out, size_t size, size_t delta_1, size_t delta_2) { | |||
| size_t idx_1 = 0; | |||
| size_t idx_2 = 0; | |||
| auto zero = static_cast<T>(0); | |||
| for (size_t i = 0; i < size; ++i) { | |||
| auto dividend = input1[idx_1]; | |||
| auto divisor = input2[idx_2]; | |||
| idx_1 += delta_1; | |||
| idx_2 += delta_2; | |||
| if (divisor == zero) { | |||
| if (dividend == zero) { | |||
| out[i] = std::numeric_limits<T>::quiet_NaN(); | |||
| continue; | |||
| } | |||
| if (std::numeric_limits<T>::has_infinity) { | |||
| out[i] = dividend > zero ? std::numeric_limits<T>::infinity() : -std::numeric_limits<T>::infinity(); | |||
| } else { | |||
| out[i] = dividend > zero ? std::numeric_limits<T>::max() : std::numeric_limits<T>::min(); | |||
| } | |||
| continue; | |||
| } | |||
| out[i] = dividend / divisor; | |||
| } | |||
| } | |||
| } // namespace | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::AssignAdd(T *input1, const T *input2, T *out) const { | |||
| auto task = [&input1, &input2, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| out[i] = input1[i] + input2[i]; | |||
| @@ -35,7 +73,7 @@ void ArithmeticCPUKernel<T>::AssignAdd(T *input1, const T *input2, T *out) { | |||
| } | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::Add(const T *input1, const T *input2, T *out) { | |||
| void ArithmeticCPUKernel<T>::Add(const T *input1, const T *input2, T *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| @@ -58,12 +96,12 @@ void ArithmeticCPUKernel<T>::Sub(const T *input1, const T *input2, T *out) { | |||
| ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_); | |||
| return; | |||
| } | |||
| if (op_para.in_elements_num0_ == 1 || op_para.in_elements_num1_ == 1) { | |||
| if (op_para_.in_elements_num0_ == 1 || op_para_.in_elements_num1_ == 1) { | |||
| auto task = [this, input1, input2, out](size_t start, size_t end) { | |||
| if (op_para.in_elements_num0_ == 1) { | |||
| (void)ElementOptSub(input1, input2 + start, out + start, end - start, &op_para); | |||
| if (op_para_.in_elements_num0_ == 1) { | |||
| (void)ElementOptSub(input1, input2 + start, out + start, end - start, &op_para_); | |||
| } else { | |||
| (void)ElementOptSub(input1 + start, input2, out + start, end - start, &op_para); | |||
| (void)ElementOptSub(input1 + start, input2, out + start, end - start, &op_para_); | |||
| } | |||
| }; | |||
| ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_); | |||
| @@ -80,7 +118,7 @@ void ArithmeticCPUKernel<T>::Sub(const T *input1, const T *input2, T *out) { | |||
| iter.GenNextPos(); | |||
| } | |||
| }; | |||
| CPUKernelUtils::ParallelFor(task, output_size_, MAX_SUB_SERIAL_SIZE); | |||
| CPUKernelUtils::ParallelFor(task, output_size_, kMaxSubSerialSize); | |||
| } | |||
| template <typename T> | |||
| @@ -93,12 +131,12 @@ void ArithmeticCPUKernel<T>::Mul(const T *input1, const T *input2, T *out) { | |||
| ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_); | |||
| return; | |||
| } | |||
| if (op_para.in_elements_num0_ == 1 || op_para.in_elements_num1_ == 1) { | |||
| if (op_para_.in_elements_num0_ == 1 || op_para_.in_elements_num1_ == 1) { | |||
| auto task = [this, input1, input2, out](size_t start, size_t end) { | |||
| if (op_para.in_elements_num0_ == 1) { | |||
| (void)ElementOptMul(input1, input2 + start, out + start, end - start, &op_para); | |||
| if (op_para_.in_elements_num0_ == 1) { | |||
| (void)ElementOptMul(input1, input2 + start, out + start, end - start, &op_para_); | |||
| } else { | |||
| (void)ElementOptMul(input1 + start, input2, out + start, end - start, &op_para); | |||
| (void)ElementOptMul(input1 + start, input2, out + start, end - start, &op_para_); | |||
| } | |||
| }; | |||
| ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_); | |||
| @@ -110,39 +148,13 @@ void ArithmeticCPUKernel<T>::Mul(const T *input1, const T *input2, T *out) { | |||
| auto iter = base_iter; | |||
| iter.SetPos(start); | |||
| for (size_t i = start; i < end; i++) { | |||
| out[i] = input1[iter.GetInputPosA()] * input2[iter.GetInputPosB()]; | |||
| out[i] = static_cast<T>(input1[iter.GetInputPosA()] * input2[iter.GetInputPosB()]); | |||
| iter.GenNextPos(); | |||
| } | |||
| }; | |||
| CPUKernelUtils::ParallelFor(task, output_size_); | |||
| } | |||
| template <typename T> | |||
| void ElementRealDiv(const T *input1, const T *input2, T *out, size_t size, size_t delta_1, size_t delta_2) { | |||
| size_t idx_1 = 0; | |||
| size_t idx_2 = 0; | |||
| auto zero = (T)0; | |||
| for (size_t i = 0; i < size; ++i) { | |||
| auto dividend = input1[idx_1]; | |||
| auto divisor = input2[idx_2]; | |||
| idx_1 += delta_1; | |||
| idx_2 += delta_2; | |||
| if (divisor == zero) { | |||
| if (dividend == zero) { | |||
| out[i] = std::numeric_limits<T>::quiet_NaN(); | |||
| continue; | |||
| } | |||
| if (std::numeric_limits<T>::has_infinity) { | |||
| out[i] = dividend > zero ? std::numeric_limits<T>::infinity() : -std::numeric_limits<T>::infinity(); | |||
| } else { | |||
| out[i] = dividend > zero ? std::numeric_limits<T>::max() : std::numeric_limits<T>::min(); | |||
| } | |||
| continue; | |||
| } | |||
| out[i] = dividend / divisor; | |||
| } | |||
| } | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::RealDiv(const T *input1, const T *input2, T *out) { | |||
| if (input_shape1_ == input_shape2_) { | |||
| @@ -152,14 +164,14 @@ void ArithmeticCPUKernel<T>::RealDiv(const T *input1, const T *input2, T *out) { | |||
| ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_); | |||
| return; | |||
| } | |||
| if (op_para.in_elements_num0_ == 1) { | |||
| if (op_para_.in_elements_num0_ == 1) { | |||
| auto task = [&](size_t start, size_t end) { | |||
| ElementRealDiv<T>(input1, input2 + start, out + start, end - start, 0, 1); | |||
| }; | |||
| ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_); | |||
| return; | |||
| } | |||
| if (op_para.in_elements_num1_ == 1) { | |||
| if (op_para_.in_elements_num1_ == 1) { | |||
| auto task = [&](size_t start, size_t end) { | |||
| ElementRealDiv<T>(input1 + start, input2, out + start, end - start, 1, 0); | |||
| }; | |||
| @@ -195,7 +207,7 @@ void ArithmeticCPUKernel<T>::RealDiv(const T *input1, const T *input2, T *out) { | |||
| } | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::Div(const T *input1, const T *input2, T *out) { | |||
| void ArithmeticCPUKernel<T>::Div(const T *input1, const T *input2, T *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| @@ -224,7 +236,7 @@ void ArithmeticCPUKernel<T>::Div(const T *input1, const T *input2, T *out) { | |||
| } | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::FloorDiv(const T *input1, const T *input2, T *out) { | |||
| void ArithmeticCPUKernel<T>::FloorDiv(const T *input1, const T *input2, T *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| @@ -233,7 +245,7 @@ void ArithmeticCPUKernel<T>::FloorDiv(const T *input1, const T *input2, T *out) | |||
| auto dividend = input1[iter.GetInputPosA()]; | |||
| auto divisor = input2[iter.GetInputPosB()]; | |||
| iter.GenNextPos(); | |||
| auto zero = (T)0; | |||
| auto zero = static_cast<T>(0); | |||
| if (divisor == zero) { | |||
| if (dividend == zero) { | |||
| out[i] = std::numeric_limits<T>::quiet_NaN(); | |||
| @@ -246,14 +258,14 @@ void ArithmeticCPUKernel<T>::FloorDiv(const T *input1, const T *input2, T *out) | |||
| } | |||
| continue; | |||
| } | |||
| out[i] = (T)floor(static_cast<double>(dividend) / static_cast<double>(divisor)); | |||
| out[i] = static_cast<T>(floor(static_cast<double>(dividend) / static_cast<double>(divisor))); | |||
| } | |||
| }; | |||
| CPUKernelUtils::ParallelFor(task, output_size_); | |||
| } | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::Mod(const T *input1, const T *input2, T *out) { | |||
| void ArithmeticCPUKernel<T>::Mod(const T *input1, const T *input2, T *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| @@ -275,7 +287,7 @@ void ArithmeticCPUKernel<T>::Mod(const T *input1, const T *input2, T *out) { | |||
| } | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::FloorMod(const T *input1, const T *input2, T *out) { | |||
| void ArithmeticCPUKernel<T>::FloorMod(const T *input1, const T *input2, T *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| @@ -292,7 +304,7 @@ void ArithmeticCPUKernel<T>::FloorMod(const T *input1, const T *input2, T *out) | |||
| } | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::Pow(const T *input1, const T *input2, T *out) { | |||
| void ArithmeticCPUKernel<T>::Pow(const T *input1, const T *input2, T *out) const { | |||
| if constexpr (std::is_same_v<T, float>) { | |||
| auto is_power_single = [this]() { | |||
| bool is_power_single = false; | |||
| @@ -308,7 +320,7 @@ void ArithmeticCPUKernel<T>::Pow(const T *input1, const T *input2, T *out) { | |||
| return is_power_single; | |||
| }; | |||
| if (op_para.in_elements_num1_ == 1) { | |||
| if (op_para_.in_elements_num1_ == 1) { | |||
| auto task = [&](size_t start, size_t end) { | |||
| (void)Power(input1 + start, input2, out + start, end - start, 1, 0, true); | |||
| }; | |||
| @@ -325,7 +337,7 @@ void ArithmeticCPUKernel<T>::Pow(const T *input1, const T *input2, T *out) { | |||
| } | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| if (output_size_ > MAX_POW_SERIAL_SIZE) { | |||
| if (output_size_ > kMaxPowSerialSize) { | |||
| auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| iter.SetPos(start); | |||
| @@ -356,7 +368,7 @@ void ArithmeticCPUKernel<T>::SquaredDifference(const T *input1, const T *input2, | |||
| iter.SetPos(start); | |||
| for (size_t i = start; i < end; i++) { | |||
| T diff = input1[iter.GetInputPosA()] - input2[iter.GetInputPosB()]; | |||
| out[i] = diff * diff; | |||
| out[i] = static_cast<T>(diff * diff); | |||
| iter.GenNextPos(); | |||
| } | |||
| }; | |||
| @@ -364,44 +376,47 @@ void ArithmeticCPUKernel<T>::SquaredDifference(const T *input1, const T *input2, | |||
| } | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::Atan2(const T *input1, const T *input2, T *out) { | |||
| void ArithmeticCPUKernel<T>::Atan2(const T *input1, const T *input2, T *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| iter.SetPos(start); | |||
| for (size_t i = start; i < end; i++) { | |||
| out[i] = | |||
| (T)atan2(static_cast<double>(input1[iter.GetInputPosA()]), static_cast<double>(input2[iter.GetInputPosB()])); | |||
| out[i] = static_cast<T>( | |||
| atan2(static_cast<double>(input1[iter.GetInputPosA()]), static_cast<double>(input2[iter.GetInputPosB()]))); | |||
| iter.GenNextPos(); | |||
| } | |||
| }; | |||
| CPUKernelUtils::ParallelFor(task, output_size_); | |||
| } | |||
| static const std::map<std::string, OperateType> kArithmeticBinOpTypeMap = { | |||
| {prim::kPrimAdd->name(), ADD}, | |||
| {prim::kPrimSub->name(), SUB}, | |||
| {prim::kPrimMul->name(), MUL}, | |||
| {prim::kPrimDiv->name(), DIV}, | |||
| {prim::kPrimMod->name(), MOD}, | |||
| {prim::kPrimAssignAdd->name(), ASSIGNADD}, | |||
| {prim::kPrimPow->name(), POW}, | |||
| {prim::kPrimFloorDiv->name(), FLOORDIV}, | |||
| {prim::kPrimAtan2->name(), ATAN2}, | |||
| {prim::kPrimRealDiv->name(), REALDIV}, | |||
| {prim::kPrimSquaredDifference->name(), SQUAREDDIFFERENCE}, | |||
| {prim::kPrimFloorMod->name(), FLOORMOD}}; | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::InitComputeFunc() { | |||
| if (kernel_name_ == prim::kPrimAssignAdd->name()) { | |||
| return; | |||
| } | |||
| static const std::unordered_map<std::string, TypeComputeFunc> arithmeticMathFuncMap{ | |||
| {prim::kPrimAdd->name(), &ArithmeticCPUKernel<T>::Add}, | |||
| {prim::kPrimSub->name(), &ArithmeticCPUKernel<T>::Sub}, | |||
| {prim::kPrimMul->name(), &ArithmeticCPUKernel<T>::Mul}, | |||
| {prim::kPrimDiv->name(), &ArithmeticCPUKernel<T>::Div}, | |||
| {prim::kPrimMod->name(), &ArithmeticCPUKernel<T>::Mod}, | |||
| {prim::kPrimFloorMod->name(), &ArithmeticCPUKernel<T>::FloorMod}, | |||
| {prim::kPrimPow->name(), &ArithmeticCPUKernel<T>::Pow}, | |||
| {prim::kPrimFloorDiv->name(), &ArithmeticCPUKernel<T>::FloorDiv}, | |||
| {prim::kPrimAtan2->name(), &ArithmeticCPUKernel<T>::Atan2}, | |||
| {prim::kPrimRealDiv->name(), &ArithmeticCPUKernel<T>::RealDiv}, | |||
| {prim::kPrimSquaredDifference->name(), &ArithmeticCPUKernel<T>::SquaredDifference}}; | |||
| if (arithmeticMathFuncMap.find(kernel_name_) == arithmeticMathFuncMap.end()) { | |||
| MS_LOG(EXCEPTION) << "ArithmeticCPUKernel does not support " << kernel_name_; | |||
| } | |||
| compute_func_ = arithmeticMathFuncMap.at(kernel_name_); | |||
| } | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node); | |||
| if (kArithmeticBinOpTypeMap.find(kernel_name) != kArithmeticBinOpTypeMap.end()) { | |||
| operate_type_ = kArithmeticBinOpTypeMap.at(kernel_name); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Not support " << kernel_name; | |||
| } | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| input_shape1_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| input_shape2_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| @@ -414,14 +429,14 @@ void ArithmeticCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| output_size_ *= output_shape_[i]; | |||
| } | |||
| op_para.in_elements_num0_ = 1; | |||
| op_para_.in_elements_num0_ = 1; | |||
| for (size_t i = 0; i < input_shape1_.size(); ++i) { | |||
| op_para.in_elements_num0_ *= input_shape1_[i]; | |||
| op_para_.in_elements_num0_ *= input_shape1_[i]; | |||
| } | |||
| op_para.in_elements_num1_ = 1; | |||
| op_para_.in_elements_num1_ = 1; | |||
| for (size_t i = 0; i < input_shape2_.size(); ++i) { | |||
| op_para.in_elements_num1_ *= input_shape2_[i]; | |||
| op_para_.in_elements_num1_ *= input_shape2_[i]; | |||
| } | |||
| size_t l = input_shape1_.size(); | |||
| @@ -435,47 +450,21 @@ void ArithmeticCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| CPUKernelUtils::GetElementNumEveryDim(input_shape1_, &input_element_num1_); | |||
| CPUKernelUtils::GetElementNumEveryDim(input_shape2_, &input_element_num2_); | |||
| CPUKernelUtils::GetElementNumEveryDim(output_shape_, &output_element_num_); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| if (dtype_ != AnfAlgo::GetInputDeviceDataType(kernel_node, 1)) { | |||
| MS_LOG(EXCEPTION) << "Input0 and input1 must has the same data type"; | |||
| } | |||
| target_dtype_ = AnfAlgo::GetOutputDeviceDataType(kernel_node, 0); | |||
| InitComputeFunc(); | |||
| } | |||
| template <typename T> | |||
| bool ArithmeticCPUKernel<T>::Launch(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> & /* workspace */, | |||
| bool ArithmeticCPUKernel<T>::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| T *input1 = reinterpret_cast<T *>(inputs[0]->addr); | |||
| T *input2 = reinterpret_cast<T *>(inputs[1]->addr); | |||
| T *output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| if (operate_type_ == ADD) { | |||
| Add(input1, input2, output); | |||
| } else if (operate_type_ == SUB) { | |||
| Sub(input1, input2, output); | |||
| } else if (operate_type_ == MUL) { | |||
| Mul(input1, input2, output); | |||
| } else if (operate_type_ == REALDIV) { | |||
| RealDiv(input1, input2, output); | |||
| } else if (operate_type_ == DIV) { | |||
| Div(input1, input2, output); | |||
| } else if (operate_type_ == FLOORDIV) { | |||
| FloorDiv(input1, input2, output); | |||
| } else if (operate_type_ == MOD) { | |||
| Mod(input1, input2, output); | |||
| } else if (operate_type_ == FLOORMOD) { | |||
| FloorMod(input1, input2, output); | |||
| } else if (operate_type_ == POW) { | |||
| Pow(input1, input2, output); | |||
| } else if (operate_type_ == ASSIGNADD) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kOutputsNum, kernel_name_); | |||
| auto *input1 = reinterpret_cast<T *>(inputs[0]->addr); | |||
| const auto *input2 = reinterpret_cast<T *>(inputs[1]->addr); | |||
| auto *output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| if (kernel_name_ == prim::kPrimAssignAdd->name()) { | |||
| AssignAdd(input1, input2, output); | |||
| } else if (operate_type_ == ATAN2) { | |||
| Atan2(input1, input2, output); | |||
| } else if (operate_type_ == SQUAREDDIFFERENCE) { | |||
| SquaredDifference(input1, input2, output); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Not support " << operate_type_; | |||
| compute_func_(this, input1, input2, output); | |||
| } | |||
| return true; | |||
| } | |||
| @@ -13,18 +13,15 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_CPU_KERNEL_H_ | |||
| #include <memory> | |||
| #include <vector> | |||
| #include <limits> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| #include "nnacl/arithmetic.h" | |||
| const float MAX_SUB_SERIAL_SIZE = 10000; | |||
| const float MAX_DIV_SERIAL_SIZE = 10000; | |||
| const float MAX_POW_SERIAL_SIZE = 700; | |||
| #include "backend/kernel_compiler/cpu/nnacl/arithmetic.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| @@ -40,29 +37,31 @@ class ArithmeticCPUKernel : public CPUKernel { | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| void InitComputeFunc(); | |||
| void Sub(const T *input1, const T *input2, T *out); | |||
| void Add(const T *input1, const T *input2, T *out); | |||
| void Add(const T *input1, const T *input2, T *out) const; | |||
| void Mul(const T *input1, const T *input2, T *out); | |||
| void RealDiv(const T *input1, const T *input2, T *out); | |||
| void Div(const T *input1, const T *input2, T *out); | |||
| void FloorDiv(const T *input1, const T *input2, T *out); | |||
| void Mod(const T *input1, const T *input2, T *out); | |||
| void FloorMod(const T *input1, const T *input2, T *out); | |||
| void Pow(const T *input1, const T *input2, T *out); | |||
| void AssignAdd(T *input1, const T *input2, T *out); | |||
| void Atan2(const T *input1, const T *input2, T *out); | |||
| void Div(const T *input1, const T *input2, T *out) const; | |||
| void FloorDiv(const T *input1, const T *input2, T *out) const; | |||
| void Mod(const T *input1, const T *input2, T *out) const; | |||
| void FloorMod(const T *input1, const T *input2, T *out) const; | |||
| void Pow(const T *input1, const T *input2, T *out) const; | |||
| void AssignAdd(T *input1, const T *input2, T *out) const; | |||
| void Atan2(const T *input1, const T *input2, T *out) const; | |||
| void SquaredDifference(const T *input1, const T *input2, T *out); | |||
| using TypeComputeFunc = std::function<void(ArithmeticCPUKernel *, const T *in_x, const T *in_y, T *out)>; | |||
| TypeComputeFunc compute_func_{nullptr}; | |||
| size_t output_size_{1}; | |||
| ArithmeticParameter op_para_{}; | |||
| std::vector<size_t> input_shape1_; | |||
| std::vector<size_t> input_shape2_; | |||
| std::vector<size_t> input_element_num1_; | |||
| std::vector<size_t> input_element_num2_; | |||
| std::vector<size_t> output_shape_; | |||
| std::vector<size_t> output_element_num_; | |||
| size_t output_size_{1}; | |||
| ArithmeticParameter op_para; | |||
| OperateType operate_type_{ADD}; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| TypeId target_dtype_{kTypeUnknown}; | |||
| }; | |||
| MS_REG_CPU_KERNEL_T(Sub, KernelAttr(), ArithmeticCPUKernel, int32_t); | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -15,18 +15,26 @@ | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/arithmetic_logic_cpu_kernel.h" | |||
| #include <cmath> | |||
| #include <string> | |||
| #include <map> | |||
| #include <cmath> | |||
| #include <unordered_map> | |||
| #include <functional> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kMaxLessSerialSize = 15000; | |||
| constexpr size_t kInputsNum = 2; | |||
| constexpr size_t kOutputsNum = 1; | |||
| } // namespace | |||
| template <typename T> | |||
| void ArithmeticLogicCPUKernel<T>::Less(const T *input1, const T *input2, bool *out) { | |||
| void ArithmeticLogicCPUKernel<T>::Less(const T *input1, const T *input2, bool *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| if (output_size_ > MAX_LESS_SERIAL_SIZE) { | |||
| if (output_size_ > kMaxLessSerialSize) { | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| iter.SetPos(start); | |||
| @@ -50,7 +58,7 @@ void ArithmeticLogicCPUKernel<T>::Less(const T *input1, const T *input2, bool *o | |||
| } | |||
| template <typename T> | |||
| void ArithmeticLogicCPUKernel<T>::Equal(const T *input1, const T *input2, bool *out) { | |||
| void ArithmeticLogicCPUKernel<T>::Equal(const T *input1, const T *input2, bool *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| @@ -66,7 +74,7 @@ void ArithmeticLogicCPUKernel<T>::Equal(const T *input1, const T *input2, bool * | |||
| } | |||
| template <typename T> | |||
| void ArithmeticLogicCPUKernel<T>::NotEqual(const T *input1, const T *input2, bool *out) { | |||
| void ArithmeticLogicCPUKernel<T>::NotEqual(const T *input1, const T *input2, bool *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| @@ -82,7 +90,7 @@ void ArithmeticLogicCPUKernel<T>::NotEqual(const T *input1, const T *input2, boo | |||
| } | |||
| template <typename T> | |||
| void ArithmeticLogicCPUKernel<T>::LogicalAnd(const T *input1, const T *input2, bool *out) { | |||
| void ArithmeticLogicCPUKernel<T>::LogicalAnd(const T *input1, const T *input2, bool *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| @@ -96,7 +104,7 @@ void ArithmeticLogicCPUKernel<T>::LogicalAnd(const T *input1, const T *input2, b | |||
| } | |||
| template <typename T> | |||
| void ArithmeticLogicCPUKernel<T>::LogicalOr(const T *input1, const T *input2, bool *out) { | |||
| void ArithmeticLogicCPUKernel<T>::LogicalOr(const T *input1, const T *input2, bool *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| @@ -110,7 +118,7 @@ void ArithmeticLogicCPUKernel<T>::LogicalOr(const T *input1, const T *input2, bo | |||
| } | |||
| template <typename T> | |||
| void ArithmeticLogicCPUKernel<T>::Greater(const T *input1, const T *input2, bool *out) { | |||
| void ArithmeticLogicCPUKernel<T>::Greater(const T *input1, const T *input2, bool *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| @@ -126,7 +134,7 @@ void ArithmeticLogicCPUKernel<T>::Greater(const T *input1, const T *input2, bool | |||
| } | |||
| template <typename T> | |||
| void ArithmeticLogicCPUKernel<T>::GreaterEqual(const T *input1, const T *input2, bool *out) { | |||
| void ArithmeticLogicCPUKernel<T>::GreaterEqual(const T *input1, const T *input2, bool *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| @@ -142,7 +150,7 @@ void ArithmeticLogicCPUKernel<T>::GreaterEqual(const T *input1, const T *input2, | |||
| } | |||
| template <typename T> | |||
| void ArithmeticLogicCPUKernel<T>::LessEqual(const T *input1, const T *input2, bool *out) { | |||
| void ArithmeticLogicCPUKernel<T>::LessEqual(const T *input1, const T *input2, bool *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| @@ -157,26 +165,31 @@ void ArithmeticLogicCPUKernel<T>::LessEqual(const T *input1, const T *input2, bo | |||
| CPUKernelUtils::ParallelFor(task, output_size_); | |||
| } | |||
| static const std::map<std::string, OperateType> kArithmeticBinOpTypeMap = { | |||
| {prim::kPrimGreater->name(), GREATER}, {prim::kPrimGreaterEqual->name(), GREATEREQUAL}, | |||
| {prim::kPrimLogicalAnd->name(), LOGICALAND}, {prim::kPrimLessEqual->name(), LESSEQUAL}, | |||
| {prim::kPrimLogicalOr->name(), LOGICALOR}, {prim::kPrimLess->name(), LESS}, | |||
| {prim::kPrimNotEqual->name(), NOTEQUAL}, {prim::kPrimEqual->name(), EQUAL}}; | |||
| template <typename T> | |||
| void ArithmeticLogicCPUKernel<T>::InitComputeFunc() { | |||
| static const std::unordered_map<std::string, TypeComputeFunc> arithmeticLogicFuncMap{ | |||
| {prim::kPrimGreater->name(), &ArithmeticLogicCPUKernel<T>::Greater}, | |||
| {prim::kPrimGreaterEqual->name(), &ArithmeticLogicCPUKernel<T>::GreaterEqual}, | |||
| {prim::kPrimLogicalAnd->name(), &ArithmeticLogicCPUKernel<T>::LogicalAnd}, | |||
| {prim::kPrimLessEqual->name(), &ArithmeticLogicCPUKernel<T>::LessEqual}, | |||
| {prim::kPrimLogicalOr->name(), &ArithmeticLogicCPUKernel<T>::LogicalOr}, | |||
| {prim::kPrimLess->name(), &ArithmeticLogicCPUKernel<T>::Less}, | |||
| {prim::kPrimNotEqual->name(), &ArithmeticLogicCPUKernel<T>::NotEqual}, | |||
| {prim::kPrimEqual->name(), &ArithmeticLogicCPUKernel<T>::Equal}}; | |||
| if (arithmeticLogicFuncMap.find(kernel_name_) == arithmeticLogicFuncMap.end()) { | |||
| MS_LOG(EXCEPTION) << "ArithmeticLogicCPUKernel does not support " << kernel_name_; | |||
| } | |||
| compute_func_ = arithmeticLogicFuncMap.at(kernel_name_); | |||
| } | |||
| template <typename T> | |||
| void ArithmeticLogicCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node); | |||
| if (kArithmeticBinOpTypeMap.find(kernel_name) != kArithmeticBinOpTypeMap.end()) { | |||
| operate_type_ = kArithmeticBinOpTypeMap.at(kernel_name); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Not support " << kernel_name; | |||
| } | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| input_shape1_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| input_shape2_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| if (output_shape_.size() == 0) { | |||
| if (output_shape_.empty()) { | |||
| (void)output_shape_.insert(output_shape_.begin(), 1); | |||
| } | |||
| @@ -200,36 +213,19 @@ void ArithmeticLogicCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| if (dtype_ != AnfAlgo::GetInputDeviceDataType(kernel_node, 1)) { | |||
| MS_LOG(EXCEPTION) << "Input0 and input1 must has the same data type"; | |||
| } | |||
| target_dtype_ = AnfAlgo::GetOutputDeviceDataType(kernel_node, 0); | |||
| InitComputeFunc(); | |||
| } | |||
| template <typename T> | |||
| bool ArithmeticLogicCPUKernel<T>::Launch(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> & /* workspace */, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| T *input1 = reinterpret_cast<T *>(inputs[0]->addr); | |||
| T *input2 = reinterpret_cast<T *>(inputs[1]->addr); | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kOutputsNum, kernel_name_); | |||
| const auto *input1 = reinterpret_cast<T *>(inputs[0]->addr); | |||
| const auto *input2 = reinterpret_cast<T *>(inputs[1]->addr); | |||
| bool *output = reinterpret_cast<bool *>(outputs[0]->addr); | |||
| if (operate_type_ == LESS) { | |||
| Less(input1, input2, output); | |||
| } else if (operate_type_ == EQUAL) { | |||
| Equal(input1, input2, output); | |||
| } else if (operate_type_ == NOTEQUAL) { | |||
| NotEqual(input1, input2, output); | |||
| } else if (operate_type_ == GREATER) { | |||
| Greater(input1, input2, output); | |||
| } else if (operate_type_ == GREATEREQUAL) { | |||
| GreaterEqual(input1, input2, output); | |||
| } else if (operate_type_ == LESSEQUAL) { | |||
| LessEqual(input1, input2, output); | |||
| } else if (operate_type_ == LOGICALAND) { | |||
| LogicalAnd(input1, input2, output); | |||
| } else if (operate_type_ == LOGICALOR) { | |||
| LogicalOr(input1, input2, output); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Not support " << operate_type_; | |||
| } | |||
| compute_func_(this, input1, input2, output); | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| @@ -13,16 +13,17 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_LOGIC_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_LOGIC_CPU_KERNEL_H_ | |||
| #include <memory> | |||
| #include <vector> | |||
| #include <limits> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| #define MAX_LESS_SERIAL_SIZE 15000 | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T> | |||
| @@ -37,25 +38,27 @@ class ArithmeticLogicCPUKernel : public CPUKernel { | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| void GenIndex(size_t num, std::vector<size_t> *idx); | |||
| void Less(const T *input1, const T *input2, bool *out); | |||
| void Equal(const T *input1, const T *input2, bool *out); | |||
| void NotEqual(const T *input1, const T *input2, bool *out); | |||
| void Greater(const T *input1, const T *input2, bool *out); | |||
| void GreaterEqual(const T *input1, const T *input2, bool *out); | |||
| void LessEqual(const T *input1, const T *input2, bool *out); | |||
| void LogicalAnd(const T *input1, const T *input2, bool *out); | |||
| void LogicalOr(const T *input1, const T *input2, bool *out); | |||
| void InitComputeFunc(); | |||
| void Less(const T *input1, const T *input2, bool *out) const; | |||
| void Equal(const T *input1, const T *input2, bool *out) const; | |||
| void NotEqual(const T *input1, const T *input2, bool *out) const; | |||
| void Greater(const T *input1, const T *input2, bool *out) const; | |||
| void GreaterEqual(const T *input1, const T *input2, bool *out) const; | |||
| void LessEqual(const T *input1, const T *input2, bool *out) const; | |||
| void LogicalAnd(const T *input1, const T *input2, bool *out) const; | |||
| void LogicalOr(const T *input1, const T *input2, bool *out) const; | |||
| using TypeComputeFunc = std::function<void(ArithmeticLogicCPUKernel *, const T *, const T *, bool *)>; | |||
| TypeComputeFunc compute_func_{nullptr}; | |||
| size_t output_size_{1}; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| std::vector<size_t> input_shape1_; | |||
| std::vector<size_t> input_shape2_; | |||
| std::vector<size_t> input_element_num1_; | |||
| std::vector<size_t> input_element_num2_; | |||
| std::vector<size_t> output_shape_; | |||
| std::vector<size_t> output_element_num_; | |||
| size_t output_size_{1}; | |||
| OperateType operate_type_{ADD}; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| TypeId target_dtype_{kTypeUnknown}; | |||
| }; | |||
| MS_REG_CPU_KERNEL_T( | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,17 +13,25 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <algorithm> | |||
| #include "backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.h" | |||
| #include <cmath> | |||
| #include <string> | |||
| #include <thread> | |||
| #include <map> | |||
| #include "backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.h" | |||
| #include <algorithm> | |||
| #include <unordered_map> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr float kMaxNegSerialSize = 5000.0f; | |||
| constexpr float kMaxSquareSerialSize = 5000.0f; | |||
| constexpr size_t kInputsNum = 1; | |||
| constexpr size_t kOutputsNum = 1; | |||
| template <typename T> | |||
| void Square(const T *in, T *out, size_t size) { | |||
| auto task = [&in, &out](size_t start, size_t end) { | |||
| @@ -31,7 +39,7 @@ void Square(const T *in, T *out, size_t size) { | |||
| out[i] = in[i] * in[i]; | |||
| } | |||
| }; | |||
| ParallelLaunch(task, size, MAX_SQUARE_SERIAL_SIZE); | |||
| ParallelLaunch(task, size, kMaxSquareSerialSize); | |||
| } | |||
| template <typename T> | |||
| @@ -57,11 +65,10 @@ void Neg(const T *in, T *out, size_t size) { | |||
| out[i] = -in[i]; | |||
| } | |||
| }; | |||
| ParallelLaunch(task, size, MAX_NEG_SERIAL_SIZE); | |||
| ParallelLaunch(task, size, kMaxNegSerialSize); | |||
| } | |||
| template <typename T> | |||
| void LogicalNot(const T *in, T *out, size_t size) { | |||
| void LogicalNot(const bool *in, bool *out, size_t size) { | |||
| auto task = [&in, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| out[i] = !in[i]; | |||
| @@ -133,10 +140,12 @@ void Reciprocal(const T *in, T *out, size_t size) { | |||
| template <typename T> | |||
| void Gelu(const T *in, T *out, size_t size) { | |||
| auto task = [&in, &out](size_t start, size_t end) { | |||
| auto factor_a = static_cast<T>(0.7978845608); | |||
| auto factor_b = static_cast<T>(0.044715); | |||
| for (size_t i = start; i < end; i++) { | |||
| T x = in[i]; | |||
| auto double_x = static_cast<T>(x); | |||
| T tanh_res = static_cast<T>(std::tanh(0.7978845608 * (double_x + 0.044715 * double_x * double_x * double_x))); | |||
| T tanh_res = static_cast<T>(std::tanh(factor_a * (double_x + factor_b * double_x * double_x * double_x))); | |||
| out[i] = x * (static_cast<T>(1.0) + tanh_res) / static_cast<T>(2.0); | |||
| } | |||
| }; | |||
| @@ -259,40 +268,17 @@ void Identity(const T *in, T *out, size_t size) { | |||
| } | |||
| } // namespace | |||
| static const std::map<std::string, OperateType> kArithmeticOpTypeMap = {{prim::kPrimNeg->name(), NEG}, | |||
| {prim::kPrimSquare->name(), SQUARE}, | |||
| {prim::kPrimOnesLike->name(), ONESLIKE}, | |||
| {prim::kPrimZerosLike->name(), ZEROSLIKE}, | |||
| {prim::kPrimLogicalNot->name(), LOGICALNOT}, | |||
| {prim::kPrimSign->name(), SIGN}, | |||
| {prim::kPrimFloor->name(), FLOOR}, | |||
| {prim::kPrimRint->name(), RINT}, | |||
| {prim::kPrimRound->name(), ROUND}, | |||
| {prim::kPrimReciprocal->name(), RECIPROCAL}, | |||
| {prim::kPrimGeLU->name(), GELU}, | |||
| {prim::kPrimAsin->name(), ASIN}, | |||
| {prim::kPrimACos->name(), ACOS}, | |||
| {prim::kPrimAtan->name(), ATAN}, | |||
| {prim::kPrimSin->name(), SIN}, | |||
| {prim::kPrimCos->name(), COS}, | |||
| {prim::kPrimTan->name(), TAN}, | |||
| {prim::kPrimSinh->name(), SINH}, | |||
| {prim::kPrimCosh->name(), COSH}, | |||
| {prim::kPrimAsinh->name(), ASINH}, | |||
| {prim::kPrimAcosh->name(), ACOSH}, | |||
| {prim::kPrimAtanh->name(), ATANH}, | |||
| {prim::kPrimIdentityMath->name(), IDENTITY}}; | |||
| void ArithmeticSelfCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node); | |||
| operate_type_ = kArithmeticOpTypeMap.at(kernel_name); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| } | |||
| bool ArithmeticSelfCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kOutputsNum, kernel_name_); | |||
| if (dtype_ == kNumberTypeFloat32 || dtype_ == kNumberTypeFloat16 || dtype_ == kNumberTypeFloat64) { | |||
| LaunchKernel<float>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeInt32 || dtype_ == kNumberTypeInt16) { | |||
| @@ -300,52 +286,63 @@ bool ArithmeticSelfCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inpu | |||
| } else if (dtype_ == kNumberTypeInt64) { | |||
| LaunchKernel<int64_t>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeBool) { | |||
| LaunchKernelLogic<bool>(inputs, outputs); | |||
| LaunchLogicalNot(inputs, outputs); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Data type is " << TypeIdLabel(dtype_) << "is not support."; | |||
| } | |||
| return true; | |||
| } | |||
| template <typename T> | |||
| void ArithmeticSelfCPUKernel::LaunchKernelLogic(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| T *input = reinterpret_cast<T *>(inputs[0]->addr); | |||
| T *output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| size_t lens = outputs[0]->size > 0 ? static_cast<size_t>(outputs[0]->size / sizeof(T)) : 1; | |||
| LogicalNot<T>(input, output, lens); | |||
| return; | |||
| void ArithmeticSelfCPUKernel::LaunchLogicalNot(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) const { | |||
| auto *input = reinterpret_cast<bool *>(inputs[0]->addr); | |||
| auto *output = reinterpret_cast<bool *>(outputs[0]->addr); | |||
| size_t lens = outputs[0]->size / sizeof(bool); | |||
| LogicalNot(input, output, lens); | |||
| } | |||
| template <typename T> | |||
| void ArithmeticSelfCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| T *input = reinterpret_cast<T *>(inputs[0]->addr); | |||
| T *output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| size_t lens = outputs[0]->size > 0 ? static_cast<size_t>(outputs[0]->size / sizeof(T)) : 1; | |||
| static const std::map<OperateType, std::function<void(const T *in, T *out, size_t size)>> kArithmeticOpFuncMap = { | |||
| {SQUARE, Square<T>}, {SIGN, Sign<T>}, | |||
| {NEG, Neg<T>}, {LOGICALNOT, LogicalNot<T>}, | |||
| {ONESLIKE, OnesLike<T>}, {ZEROSLIKE, ZerosLike<T>}, | |||
| {FLOOR, Floor<T>}, {RECIPROCAL, Reciprocal<T>}, | |||
| {GELU, Gelu<T>}, {SIN, Sin<T>}, | |||
| {COS, Cos<T>}, {TAN, Tan<T>}, | |||
| {ASIN, Asin<T>}, {ACOS, ACos<T>}, | |||
| {ATAN, Atan<T>}, {SINH, Sinh<T>}, | |||
| {COSH, Cosh<T>}, {ASINH, Asinh<T>}, | |||
| {ACOSH, Acosh<T>}, {ATANH, Atanh<T>}, | |||
| {RINT, Rint<T>}, {ROUND, Round<T>}}; | |||
| if (kArithmeticOpFuncMap.find(operate_type_) != kArithmeticOpFuncMap.end()) { | |||
| kArithmeticOpFuncMap.at(operate_type_)(input, output, lens); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Not support " << operate_type_; | |||
| const std::vector<AddressPtr> &outputs) const { | |||
| const auto *input = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| const size_t lens = outputs[0]->size / sizeof(T); | |||
| static const std::unordered_map<std::string, std::function<void(const T *, T *, size_t)>> arithmeticSelfFuncMap{ | |||
| {prim::kPrimSquare->name(), Square<T>}, | |||
| {prim::kPrimSign->name(), Sign<T>}, | |||
| {prim::kPrimNeg->name(), Neg<T>}, | |||
| {prim::kPrimAtanh->name(), Atanh<T>}, | |||
| {prim::kPrimAcosh->name(), Acosh<T>}, | |||
| {prim::kPrimFloor->name(), Floor<T>}, | |||
| {prim::kPrimSin->name(), Sin<T>}, | |||
| {prim::kPrimGeLU->name(), Gelu<T>}, | |||
| {prim::kPrimCos->name(), Cos<T>}, | |||
| {prim::kPrimTan->name(), Tan<T>}, | |||
| {prim::kPrimAsin->name(), Asin<T>}, | |||
| {prim::kPrimACos->name(), ACos<T>}, | |||
| {prim::kPrimAtan->name(), Atan<T>}, | |||
| {prim::kPrimSinh->name(), Sinh<T>}, | |||
| {prim::kPrimCosh->name(), Cosh<T>}, | |||
| {prim::kPrimAsinh->name(), Asinh<T>}, | |||
| {prim::kPrimZerosLike->name(), ZerosLike<T>}, | |||
| {prim::kPrimOnesLike->name(), OnesLike<T>}, | |||
| {prim::kPrimReciprocal->name(), Reciprocal<T>}, | |||
| {prim::kPrimRint->name(), Rint<T>}, | |||
| {prim::kPrimRound->name(), Round<T>}}; | |||
| const auto func_pair = arithmeticSelfFuncMap.find(kernel_name_); | |||
| if (arithmeticSelfFuncMap.find(kernel_name_) == arithmeticSelfFuncMap.end()) { | |||
| MS_LOG(EXCEPTION) << "ArithmeticSelfCPUKernel does not support " << kernel_name_; | |||
| } | |||
| func_pair->second(input, output, lens); | |||
| } | |||
| template <typename T> | |||
| bool IdentityCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kOutputsNum, kernel_name_); | |||
| T *input = reinterpret_cast<T *>(inputs[0]->addr); | |||
| T *output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| size_t lens = outputs[0]->size > 0 ? static_cast<size_t>(outputs[0]->size / sizeof(T)) : 1; | |||
| @@ -13,16 +13,16 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_SELF_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_SELF_CPU_KERNEL_H_ | |||
| #include <memory> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| const float MAX_NEG_SERIAL_SIZE = 5000; | |||
| const float MAX_SQUARE_SERIAL_SIZE = 5000; | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class ArithmeticSelfCPUKernel : public CPUKernel { | |||
| @@ -35,13 +35,12 @@ class ArithmeticSelfCPUKernel : public CPUKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| template <typename T> | |||
| void LaunchKernelLogic(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const; | |||
| void LaunchLogicalNot(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const; | |||
| private: | |||
| OperateType operate_type_{SQUARE}; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| TypeId target_dtype_{kTypeUnknown}; | |||
| }; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -15,23 +15,34 @@ | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/assign_cpu_kernel.h" | |||
| #include <string> | |||
| #include <map> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "common/thread_pool.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| static std::map<TypeId, size_t> input_x_dtype_size_map = { | |||
| {kNumberTypeBool, sizeof(bool)}, {kNumberTypeInt8, 1}, {kNumberTypeInt16, 2}, {kNumberTypeInt32, 4}, | |||
| {kNumberTypeInt64, 8}, {kNumberTypeUInt8, 1}, {kNumberTypeUInt16, 2}, {kNumberTypeUInt32, 4}, | |||
| {kNumberTypeUInt64, 8}, {kNumberTypeFloat16, 2}, {kNumberTypeFloat32, 4}, {kNumberTypeFloat64, 8}}; | |||
| namespace { | |||
| constexpr size_t kAssignInputsNum = 2; | |||
| constexpr size_t kAssignOutputsNum = 1; | |||
| const std::map<TypeId, size_t> input_x_dtype_size_map = { | |||
| {kNumberTypeBool, sizeof(bool)}, {kNumberTypeInt8, sizeof(int8_t)}, {kNumberTypeInt16, sizeof(int16_t)}, | |||
| {kNumberTypeInt32, sizeof(int32_t)}, {kNumberTypeInt64, sizeof(int64_t)}, {kNumberTypeUInt8, sizeof(uint8_t)}, | |||
| {kNumberTypeUInt16, sizeof(uint16_t)}, {kNumberTypeUInt32, sizeof(uint32_t)}, {kNumberTypeUInt64, sizeof(uint64_t)}, | |||
| {kNumberTypeFloat16, sizeof(float16)}, {kNumberTypeFloat32, sizeof(float)}, {kNumberTypeFloat64, sizeof(double)}}; | |||
| } // namespace | |||
| void AssignCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| auto input_x_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| auto input_y_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| if (input_x_shape.size() != input_y_shape.size()) MS_LOG(EXCEPTION) << "X and y must be same shape!"; | |||
| if (input_x_shape.size() != input_y_shape.size()) { | |||
| MS_LOG(EXCEPTION) << "X and y must be same shape!"; | |||
| } | |||
| for (size_t i = 0; i < input_x_shape.size(); ++i) { | |||
| if (input_x_shape[i] != input_y_shape[i]) { | |||
| MS_LOG(EXCEPTION) << "X and y must be same shape!"; | |||
| @@ -39,14 +50,17 @@ void AssignCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| batch_size_ *= input_x_shape[i]; | |||
| } | |||
| input_x_dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| if (input_x_dtype_size_map.find(input_x_dtype_) == input_x_dtype_size_map.end()) { | |||
| auto type_len = input_x_dtype_size_map.find(input_x_dtype_); | |||
| if (type_len == input_x_dtype_size_map.end()) { | |||
| MS_LOG(EXCEPTION) << "Unsupported input_x dtype!"; | |||
| } | |||
| input_x_dtype_size_ = input_x_dtype_size_map[input_x_dtype_]; | |||
| input_x_dtype_size_ = type_len->second; | |||
| } | |||
| bool AssignCPUKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kAssignInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kAssignOutputsNum, kernel_name_); | |||
| auto max_size = inputs[0]->size; | |||
| size_t total_size = input_x_dtype_size_ * batch_size_; | |||
| if (total_size > max_size) { | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,12 +13,14 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ASSIGN_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ASSIGN_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include <unordered_map> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -36,8 +38,8 @@ class AssignCPUKernel : public CPUKernel { | |||
| private: | |||
| size_t batch_size_{1}; | |||
| size_t input_x_dtype_size_{4}; | |||
| TypeId input_x_dtype_{kTypeUnknown}; | |||
| size_t input_x_dtype_size_ = 4; | |||
| }; | |||
| MS_REG_CPU_KERNEL( | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -15,16 +15,21 @@ | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/bias_add_cpu_kernel.h" | |||
| #include "nnacl/fp32/add_fp32.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/fp32/add_fp32.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/errorcode.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kBiasAddMinDim = 2; | |||
| constexpr size_t kBiasAddMaxDim = 5; | |||
| constexpr size_t kBiasAddInputNum = 2; | |||
| constexpr size_t kBiasAddInputsNum = 2; | |||
| constexpr size_t kBiasAddOutputsNum = 1; | |||
| } // namespace | |||
| void BiasAddCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| bias_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| data_shape_ = input_shape_.size(); | |||
| @@ -44,13 +49,11 @@ void BiasAddCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool BiasAddCPUKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| if (inputs.size() != kBiasAddInputNum || outputs.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "Inputs outputs size not supoort"; | |||
| } | |||
| auto src_addr = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto bias_addr = reinterpret_cast<float *>(inputs[1]->addr); | |||
| auto output_addr = reinterpret_cast<float *>(outputs[0]->addr); | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kBiasAddInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kBiasAddOutputsNum, kernel_name_); | |||
| const auto *src_addr = reinterpret_cast<float *>(inputs[0]->addr); | |||
| const auto *bias_addr = reinterpret_cast<float *>(inputs[1]->addr); | |||
| auto *output_addr = reinterpret_cast<float *>(outputs[0]->addr); | |||
| if (input_shape_.size() > 2) { | |||
| size_t hw_size = 1; | |||
| @@ -87,11 +90,14 @@ bool BiasAddCPUKernel::Launch(const std::vector<AddressPtr> &inputs, const std:: | |||
| auto task = [&](size_t start, size_t end) { | |||
| for (size_t n = start; n < end; ++n) { | |||
| size_t n_offset = input_shape_[1] * n; | |||
| ElementAdd(src_addr + n_offset, bias_addr, output_addr + n_offset, input_shape_[1]); | |||
| if (ElementAdd(src_addr + n_offset, bias_addr, output_addr + n_offset, input_shape_[1]) != NNACL_OK) { | |||
| MS_LOG(EXCEPTION) << "ElementAdd failed."; | |||
| } | |||
| } | |||
| }; | |||
| ParallelLaunchAutoSearch(task, input_shape_[0], this, ¶llel_search_info_); | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,11 +13,13 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BIAS_ADD_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BIAS_ADD_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -15,11 +15,19 @@ | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/bias_add_grad_cpu_kernel.h" | |||
| #include "nnacl/fp32/reduce_fp32.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/fp32/reduce_fp32.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/errorcode.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kBiasAddGradInputsNum = 1; | |||
| constexpr size_t kBiasAddGradOutputsNum = 1; | |||
| } // namespace | |||
| void BiasAddGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| if (input_shape_.size() < 2) { | |||
| MS_LOG(EXCEPTION) << "Input tensor's rank must be at least 2 for 'BiasAddGrad' Op, but input tensor's rank is " | |||
| @@ -29,11 +37,10 @@ void BiasAddGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool BiasAddGradCPUKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| if (inputs.size() != 1 || outputs.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "input output size not support"; | |||
| } | |||
| auto output_addr = reinterpret_cast<float *>(outputs[0]->addr); | |||
| auto input_addr = reinterpret_cast<float *>(inputs[0]->addr); | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kBiasAddGradInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kBiasAddGradOutputsNum, kernel_name_); | |||
| const auto *input_addr = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto *output_addr = reinterpret_cast<float *>(outputs[0]->addr); | |||
| if (input_shape_.size() > 2) { | |||
| size_t hw_size = 1; | |||
| @@ -53,7 +60,11 @@ bool BiasAddGradCPUKernel::Launch(const std::vector<AddressPtr> &inputs, const s | |||
| } | |||
| } else if (input_shape_.size() == 2) { | |||
| auto task = [this, input_addr, output_addr](size_t start, size_t end) { | |||
| (void)ReduceSumDim2Axis0(end - start, input_shape_[1], input_shape_[0], input_addr + start, output_addr + start); | |||
| int ret = | |||
| ReduceSumDim2Axis0(end - start, input_shape_[1], input_shape_[0], input_addr + start, output_addr + start); | |||
| if (ret != NNACL_OK) { | |||
| MS_LOG(EXCEPTION) << "ReduceSumDim2Axis0 failed."; | |||
| } | |||
| }; | |||
| ParallelLaunchAutoSearch(task, input_shape_[1], this, ¶llel_search_info_); | |||
| } | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -14,11 +14,12 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BIASADDGRADCPUKERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BIASADDGRADCPUKERNEL_H_ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BIAS_ADD_GRAD_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BIAS_ADD_GRAD_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -39,4 +40,4 @@ class BiasAddGradCPUKernel : public CPUKernel { | |||
| MS_REG_CPU_KERNEL(BiasAddGrad, KernelAttr(), BiasAddGradCPUKernel); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BIASADDGRADCPUKERNEL_H_ | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BIAS_ADD_GRAD_CPU_KERNEL_H_ | |||
| @@ -13,14 +13,19 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr size_t kBceInputNumWithWeight = 3; | |||
| namespace { | |||
| constexpr size_t kBceInputsNumWithWeight = 3; | |||
| constexpr size_t kBceOutputsNum = 1; | |||
| } // namespace | |||
| template <typename T> | |||
| void BinaryCrossEntropyCpuKernel::LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss) { | |||
| void BinaryCrossEntropyCpuKernel::LaunchToScalar(const int &input_size, const int &reduction, T *loss, | |||
| T *tmp_loss) const { | |||
| if (input_size % 2 == 1) { | |||
| tmp_loss[0] += tmp_loss[input_size - 1]; | |||
| } | |||
| @@ -35,83 +40,94 @@ void BinaryCrossEntropyCpuKernel::LaunchToScalar(const int &input_size, const in | |||
| } | |||
| loss[0] = tmp_loss[0]; | |||
| if (reduction == 1) { | |||
| if (reduction == kMean) { | |||
| loss[0] /= static_cast<T>(input_size); | |||
| } | |||
| } | |||
| template <typename T> | |||
| void BinaryCrossEntropyCpuKernel::Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| T *input_x = reinterpret_cast<T *>(inputs[0]->addr); | |||
| T *input_y = reinterpret_cast<T *>(inputs[1]->addr); | |||
| T *weight = nullptr; | |||
| if (weight_defined_) { | |||
| weight = reinterpret_cast<T *>(inputs[2]->addr); | |||
| } | |||
| T *loss = reinterpret_cast<T *>(outputs[0]->addr); | |||
| void BinaryCrossEntropyCpuKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) const { | |||
| const auto *input_x = reinterpret_cast<T *>(inputs[0]->addr); | |||
| const auto *input_y = reinterpret_cast<T *>(inputs[1]->addr); | |||
| const T *weight = weight_defined_ ? reinterpret_cast<T *>(inputs[2]->addr) : nullptr; | |||
| auto *loss = reinterpret_cast<T *>(outputs[0]->addr); | |||
| std::vector<T> tmp_loss(input_size_); | |||
| auto epsilon = static_cast<T>(1e-12); | |||
| auto one = static_cast<T>(1); | |||
| T epsilon = static_cast<T>(1e-12); | |||
| T one = static_cast<T>(1); | |||
| if (reduction_ == 0 && weight_defined_) { | |||
| for (size_t i = 0; i < input_size_; i++) { | |||
| T value = | |||
| -weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon)); | |||
| loss[i] = value; | |||
| } | |||
| } else if (reduction_ == 0 && (!weight_defined_)) { | |||
| for (size_t i = 0; i < input_size_; i++) { | |||
| T value = static_cast<T>( | |||
| -(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon))); | |||
| loss[i] = value; | |||
| } | |||
| } else if ((reduction_ != 0) && weight_defined_) { | |||
| for (size_t i = 0; i < input_size_; i++) { | |||
| T value = | |||
| -weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon)); | |||
| tmp_loss[i] = value; | |||
| if (reduction_ == kNone) { | |||
| if (weight_defined_) { | |||
| for (size_t i = 0; i < input_size_; i++) { | |||
| auto value = static_cast<T>( | |||
| -weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon))); | |||
| loss[i] = value; | |||
| } | |||
| } else { | |||
| for (size_t i = 0; i < input_size_; i++) { | |||
| auto value = static_cast<T>( | |||
| -(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon))); | |||
| loss[i] = value; | |||
| } | |||
| } | |||
| } else { | |||
| for (size_t i = 0; i < input_size_; i++) { | |||
| T value = static_cast<T>( | |||
| -(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon))); | |||
| tmp_loss[i] = value; | |||
| if (weight_defined_) { | |||
| for (size_t i = 0; i < input_size_; i++) { | |||
| auto value = static_cast<T>( | |||
| -weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon))); | |||
| tmp_loss[i] = value; | |||
| } | |||
| } else { | |||
| for (size_t i = 0; i < input_size_; i++) { | |||
| auto value = static_cast<T>( | |||
| -(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon))); | |||
| tmp_loss[i] = value; | |||
| } | |||
| } | |||
| } | |||
| if (reduction_ != 0) { | |||
| if (reduction_ != kNone) { | |||
| LaunchToScalar<T>(input_size_, reduction_, loss, tmp_loss.data()); | |||
| } | |||
| } | |||
| bool BinaryCrossEntropyCpuKernel::Launch(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &workspace, | |||
| bool BinaryCrossEntropyCpuKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| if (input_size_ > 0) { | |||
| if (dtype_ == kNumberTypeFloat32) { | |||
| Launchkernel<float>(inputs, workspace, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat16) { | |||
| Launchkernel<float16>(inputs, workspace, outputs); | |||
| } | |||
| const size_t expect_inputs_num = weight_defined_ ? kBceInputsNumWithWeight : kBceInputsNumWithWeight - 1; | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), expect_inputs_num, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kBceOutputsNum, kernel_name_); | |||
| if (dtype_ == kNumberTypeFloat32) { | |||
| LaunchKernel<float>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat16) { | |||
| LaunchKernel<float16>(inputs, outputs); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << kernel_name_ << " only support float16 and float32 on CPU, but got " | |||
| << TypeIdToType(dtype_)->ToString(); | |||
| } | |||
| return true; | |||
| } | |||
| void BinaryCrossEntropyCpuKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| weight_defined_ = (input_num == kBceInputsNumWithWeight); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| for (size_t i = 0; i < input_shape.size(); i++) { | |||
| input_size_ *= input_shape[i]; | |||
| } | |||
| string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, "reduction"); | |||
| if (reduction == "none") { | |||
| reduction_ = 0; | |||
| } else if (reduction == "sum") { | |||
| reduction_ = 2; | |||
| const std::string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, REDUCTION); | |||
| if (reduction == NONE) { | |||
| reduction_ = kNone; | |||
| } else if (reduction == MEAN) { | |||
| reduction_ = kMean; | |||
| } else if (reduction == SUM) { | |||
| reduction_ = kSum; | |||
| } else { | |||
| MS_LOG(EXCEPTION) << kernel_name_ << "only support the reduction is 'none', 'mean', or 'sum', but got " | |||
| << reduction; | |||
| } | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| weight_defined_ = (input_num == kBceInputNumWithWeight); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -13,19 +13,23 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H | |||
| #include <vector> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| enum ReductionType { kNone, kMean, kSum }; | |||
| class BinaryCrossEntropyCpuKernel : public CPUKernel { | |||
| public: | |||
| BinaryCrossEntropyCpuKernel() : input_size_(1), reduction_(1), weight_defined_(false) {} | |||
| BinaryCrossEntropyCpuKernel() = default; | |||
| ~BinaryCrossEntropyCpuKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| @@ -34,15 +38,14 @@ class BinaryCrossEntropyCpuKernel : public CPUKernel { | |||
| private: | |||
| template <typename T> | |||
| void LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss); | |||
| void LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss) const; | |||
| template <typename T> | |||
| void Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs); | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| size_t input_size_; | |||
| int reduction_; | |||
| bool weight_defined_; // true: there are 3 inputs, false: there are 2 inputs(no [weight]) | |||
| size_t input_size_{1}; | |||
| ReductionType reduction_{kNone}; | |||
| bool weight_defined_{false}; // true: there are 3 inputs, false: there are 2 inputs(no [weight]) | |||
| }; | |||
| MS_REG_CPU_KERNEL(BinaryCrossEntropy, | |||
| KernelAttr() | |||
| @@ -13,28 +13,28 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/binary_cross_entropy_grad_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr size_t kBceGradInputNumWithWeight = 4; | |||
| namespace { | |||
| constexpr size_t kBceGradInputsNumWithWeight = 4; | |||
| constexpr size_t kBceGradOutputsNum = 1; | |||
| } // namespace | |||
| template <typename T> | |||
| void BinaryCrossEntropyGradCpuKernel::Launchkernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| T *input_x = reinterpret_cast<T *>(inputs[0]->addr); | |||
| T *input_y = reinterpret_cast<T *>(inputs[1]->addr); | |||
| T *dloss = reinterpret_cast<T *>(inputs[2]->addr); | |||
| T *weight = nullptr; | |||
| if (weight_defined_) { | |||
| weight = reinterpret_cast<T *>(inputs[3]->addr); | |||
| } | |||
| T *dx = reinterpret_cast<T *>(outputs[0]->addr); | |||
| void BinaryCrossEntropyGradCpuKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) const { | |||
| const auto *input_x = reinterpret_cast<T *>(inputs[0]->addr); | |||
| const auto *input_y = reinterpret_cast<T *>(inputs[1]->addr); | |||
| const auto *dloss = reinterpret_cast<T *>(inputs[2]->addr); | |||
| const T *weight = weight_defined_ ? reinterpret_cast<T *>(inputs[3]->addr) : nullptr; | |||
| auto *dx = reinterpret_cast<T *>(outputs[0]->addr); | |||
| auto epsilon = static_cast<T>(1e-12); | |||
| auto one = static_cast<T>(1); | |||
| T epsilon = static_cast<T>(1e-12); | |||
| T one = static_cast<T>(1); | |||
| if (reduction_ == 0) { | |||
| if (reduction_ == kNone) { | |||
| if (weight_defined_) { | |||
| for (size_t i = 0; i < input_size_; i++) { | |||
| T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon; | |||
| @@ -50,7 +50,7 @@ void BinaryCrossEntropyGradCpuKernel::Launchkernel(const std::vector<AddressPtr> | |||
| } | |||
| } else { | |||
| T dloss1 = dloss[0]; | |||
| if (reduction_ == 1) { | |||
| if (reduction_ == kMean) { | |||
| dloss1 = dloss[0] / static_cast<T>(input_size_); | |||
| } | |||
| if (weight_defined_) { | |||
| @@ -69,34 +69,44 @@ void BinaryCrossEntropyGradCpuKernel::Launchkernel(const std::vector<AddressPtr> | |||
| } | |||
| } | |||
| bool BinaryCrossEntropyGradCpuKernel::Launch(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &workspace, | |||
| bool BinaryCrossEntropyGradCpuKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| if (input_size_ > 0) { | |||
| if (dtype_ == kNumberTypeFloat32) { | |||
| Launchkernel<float>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat16) { | |||
| Launchkernel<float16>(inputs, outputs); | |||
| } | |||
| const size_t expect_inputs_num = weight_defined_ ? kBceGradInputsNumWithWeight : kBceGradInputsNumWithWeight - 1; | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), expect_inputs_num, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kBceGradOutputsNum, kernel_name_); | |||
| if (dtype_ == kNumberTypeFloat32) { | |||
| LaunchKernel<float>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat16) { | |||
| LaunchKernel<float16>(inputs, outputs); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << kernel_name_ << " only support float16 and float32 on CPU, but got " | |||
| << TypeIdToType(dtype_)->ToString(); | |||
| } | |||
| return true; | |||
| } | |||
| void BinaryCrossEntropyGradCpuKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| weight_defined_ = (input_num == kBceGradInputsNumWithWeight); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| for (size_t i = 0; i < input_shape.size(); i++) { | |||
| input_size_ *= input_shape[i]; | |||
| } | |||
| string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, "reduction"); | |||
| if (reduction == "none") { | |||
| reduction_ = 0; | |||
| } else if (reduction == "sum") { | |||
| reduction_ = 2; | |||
| } | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| weight_defined_ = (input_num == kBceGradInputNumWithWeight); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| const std::string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, REDUCTION); | |||
| if (reduction == NONE) { | |||
| reduction_ = kNone; | |||
| } else if (reduction == MEAN) { | |||
| reduction_ = kMean; | |||
| } else if (reduction == SUM) { | |||
| reduction_ = kSum; | |||
| } else { | |||
| MS_LOG(EXCEPTION) << kernel_name_ << "only support the reduction is 'none', 'mean', or 'sum', but got " | |||
| << reduction; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -13,19 +13,22 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H | |||
| #include <vector> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| #include "backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class BinaryCrossEntropyGradCpuKernel : public CPUKernel { | |||
| public: | |||
| BinaryCrossEntropyGradCpuKernel() : input_size_(1), reduction_(1), weight_defined_(false) {} | |||
| BinaryCrossEntropyGradCpuKernel() = default; | |||
| ~BinaryCrossEntropyGradCpuKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| @@ -34,12 +37,12 @@ class BinaryCrossEntropyGradCpuKernel : public CPUKernel { | |||
| private: | |||
| template <typename T> | |||
| void Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| size_t input_size_; | |||
| int reduction_; | |||
| bool weight_defined_; // true: there are 4 inputs, false: there are 3 inputs(no [weight]) | |||
| size_t input_size_{1}; | |||
| ReductionType reduction_{kNone}; | |||
| bool weight_defined_{false}; // true: there are 4 inputs, false: there are 3 inputs(no [weight]) | |||
| }; | |||
| MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad, | |||
| KernelAttr() | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/boundingbox_decode_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| @@ -13,10 +13,13 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BOUNDINGBOX_DECODE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BOUNDINGBOX_DECODE_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <algorithm> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/boundingbox_encode_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| @@ -13,10 +13,13 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BOUNDINGBOX_ENCODE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BOUNDINGBOX_ENCODE_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <algorithm> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -15,13 +15,19 @@ | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/broadcast_to_cpu_kernel.h" | |||
| #include "nnacl/errorcode.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/errorcode.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kBroadcastToInputsNum = 1; | |||
| constexpr size_t kBroadcastToOutputsNum = 1; | |||
| } // namespace | |||
| template <typename T> | |||
| void BroadcastToCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| size_t input_shape_size = input_shape_.size(); | |||
| @@ -55,35 +61,26 @@ void BroadcastToCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| template <typename T> | |||
| bool BroadcastToCPUKernel<T>::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| if (inputs.size() != 1 || outputs.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "Wrong number of inputs or outputs!"; | |||
| } | |||
| if ((inputs[0] == nullptr) || (inputs[0]->size == 0)) { | |||
| MS_LOG(EXCEPTION) << "Input data is NULL!"; | |||
| } | |||
| if ((outputs[0] == nullptr) || (outputs[0]->size == 0)) { | |||
| MS_LOG(EXCEPTION) << "Output data is NULL!"; | |||
| } | |||
| const auto input_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| int ret = static_cast<int>(NNACL_ERR); | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kBroadcastToInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kBroadcastToOutputsNum, kernel_name_); | |||
| const auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| int status = static_cast<int>(NNACL_OK); | |||
| if constexpr (std::is_same_v<T, bool>) { | |||
| ret = BroadcastTo(bool, input_addr, &shape_info_, output_addr); | |||
| status = BROADCAST_TO(bool, input_addr, &shape_info_, output_addr); | |||
| } else if constexpr (std::is_same_v<T, int>) { | |||
| ret = BroadcastTo(int, input_addr, &shape_info_, output_addr); | |||
| status = BROADCAST_TO(int, input_addr, &shape_info_, output_addr); | |||
| } else if constexpr (std::is_same_v<T, float>) { | |||
| ret = BroadcastTo(float, input_addr, &shape_info_, output_addr); | |||
| status = BROADCAST_TO(float, input_addr, &shape_info_, output_addr); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Not supported data type for BroadcastTo."; | |||
| } | |||
| if (ret == NNACL_OK) { | |||
| return true; | |||
| if (status != static_cast<int>(NNACL_OK)) { | |||
| MS_LOG(EXCEPTION) << "Broadcast tensor with shape " << input_shape_ << " to shape " << output_shape_ | |||
| << " execute failed, error code: " << status; | |||
| } | |||
| MS_LOG(ERROR) << "Broadcast tensor with shape " << input_shape_ << " to shape " << output_shape_ | |||
| << " execute failed."; | |||
| return false; | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -14,14 +14,15 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_BROADCAST_TO_CPU_KERNEL_H | |||
| #define MINDSPORE_BROADCAST_TO_CPU_KERNEL_H | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BROADCAST_TO_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BROADCAST_TO_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| #include "nnacl/base/broadcast_to.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/base/broadcast_to.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| @@ -38,7 +39,7 @@ class BroadcastToCPUKernel : public CPUKernel { | |||
| private: | |||
| std::vector<size_t> input_shape_; | |||
| std::vector<size_t> output_shape_; | |||
| BroadcastShapeInfo shape_info_; | |||
| BroadcastShapeInfo shape_info_{}; | |||
| }; | |||
| MS_REG_CPU_KERNEL_T(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| @@ -50,4 +51,4 @@ MS_REG_CPU_KERNEL_T(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeBool).AddO | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_BROADCAST_TO_CPU_KERNEL_H | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BROADCAST_TO_CPU_KERNEL_H_ | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,14 +13,22 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/cast_cpu_kernel.h" | |||
| #include <cmath> | |||
| #include <map> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/cpu/cast_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kCastInputsNum = 1; | |||
| constexpr size_t kCastOutputsNum = 1; | |||
| } // namespace | |||
| template <typename S, typename T> | |||
| void Cast(const S *in, T *out, size_t size) { | |||
| auto task = [&in, &out](size_t start, size_t end) { | |||
| @@ -34,6 +42,7 @@ void Cast(const S *in, T *out, size_t size) { | |||
| template <typename S, typename T> | |||
| void CastCPUKernel<S, T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| source_dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| target_dtype_ = AnfAlgo::GetOutputDeviceDataType(kernel_node, 0); | |||
| } | |||
| @@ -41,17 +50,14 @@ void CastCPUKernel<S, T>::InitKernel(const CNodePtr &kernel_node) { | |||
| template <typename S, typename T> | |||
| bool CastCPUKernel<S, T>::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() != 1 || outputs.size() != 1) { | |||
| MS_LOG(ERROR) << "Cast requires 1 input and 1 output, but got " << inputs.size() << " input and " << outputs.size() | |||
| << " output."; | |||
| return false; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kCastInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kCastOutputsNum, kernel_name_); | |||
| if (outputs[0]->size == 0) { | |||
| MS_LOG(WARNING) << "Cast output memory size should be greater than 0, but got 0."; | |||
| return true; | |||
| } | |||
| const auto input = reinterpret_cast<S *>(inputs[0]->addr); | |||
| const auto output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| const auto *input = reinterpret_cast<S *>(inputs[0]->addr); | |||
| auto *output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| MS_LOG(DEBUG) << "Type source: " << typeid(S).name() << "; target: " << typeid(T).name(); | |||
| Cast<S, T>(input, output, outputs[0]->size / sizeof(T)); | |||
| return true; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,11 +13,14 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CAST_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CAST_CPU_KERNEL_H_ | |||
| #include <functional> | |||
| #include <memory> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -13,6 +13,9 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <functional> | |||
| #include "backend/kernel_compiler/cpu/check_valid_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| @@ -13,9 +13,12 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CHECK_VALID_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CHECK_VALID_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -19,11 +19,15 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kConcatOutputsNum = 1; | |||
| } // namespace | |||
| template <typename T> | |||
| void ConcatCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| node_wpt_ = kernel_node; | |||
| CheckParam(kernel_node); | |||
| axis_ = LongToInt(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS)); | |||
| auto input_1_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| if (axis_ < 0) { | |||
| @@ -34,15 +38,18 @@ void ConcatCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| template <typename T> | |||
| bool ConcatCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| auto node_ = node_wpt_.lock(); | |||
| if (!node_) { | |||
| auto node = node_wpt_.lock(); | |||
| if (!node) { | |||
| MS_LOG(EXCEPTION) << "node_wpt_ is expired."; | |||
| } | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(node_); | |||
| const size_t input_num = AnfAlgo::GetInputTensorNum(node); | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), input_num, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kConcatOutputsNum, kernel_name_); | |||
| std::vector<std::vector<size_t>> input_flat_shape_list; | |||
| input_flat_shape_list.reserve(input_num); | |||
| for (size_t i = 0; i < input_num; i++) { | |||
| auto input_shape_i = AnfAlgo::GetPrevNodeOutputInferShape(node_, i); | |||
| auto input_shape_i = AnfAlgo::GetPrevNodeOutputInferShape(node, i); | |||
| auto flat_shape = CPUKernelUtils::FlatShapeByAxis(input_shape_i, axis_); | |||
| (void)input_flat_shape_list.emplace_back(flat_shape); | |||
| } | |||
| @@ -51,10 +58,10 @@ bool ConcatCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, c | |||
| for (size_t j = 0; j < input_num; ++j) { | |||
| output_dim_1 += input_flat_shape_list[j][1]; | |||
| } | |||
| auto output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| std::vector<T *> input_addr_list; | |||
| for (size_t j = 0; j < input_num; ++j) { | |||
| auto tmp_addr = reinterpret_cast<T *>(inputs[j]->addr); | |||
| auto *tmp_addr = reinterpret_cast<T *>(inputs[j]->addr); | |||
| (void)input_addr_list.emplace_back(tmp_addr); | |||
| } | |||
| // each input's row of shape after flat are same | |||
| @@ -69,7 +76,10 @@ bool ConcatCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, c | |||
| auto copy_num = input_flat_shape_list[j][1]; | |||
| auto copy_size = copy_num * sizeof(T); | |||
| auto offset = copy_num * i; | |||
| (void)memcpy_s(output_ptr, copy_size, input_addr_list[j] + offset, copy_size); | |||
| auto ret = memcpy_s(output_ptr, copy_size, input_addr_list[j] + offset, copy_size); | |||
| if (ret != EOK) { | |||
| MS_LOG(EXCEPTION) << "Memcpy failed."; | |||
| } | |||
| output_ptr += copy_num; | |||
| } | |||
| } | |||
| @@ -77,13 +87,5 @@ bool ConcatCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, c | |||
| ParallelLaunchAutoSearch(task, before_axis, this, ¶llel_search_info_); | |||
| return true; | |||
| } | |||
| template <typename T> | |||
| void ConcatCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) const { | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but ConcatCPUKernel needs 1 output."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,10 +13,13 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CONCAT_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CONCAT_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -34,8 +37,7 @@ class ConcatCPUKernel : public CPUKernel { | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node) const; | |||
| int axis_ = 0; | |||
| int axis_{0}; | |||
| CNodeWeakPtr node_wpt_; | |||
| }; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,10 +13,13 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include <algorithm> | |||
| #include <utility> | |||
| #include <cmath> | |||
| #include "common/thread_pool.h" | |||
| #include "utils/profile.h" | |||
| @@ -52,10 +55,11 @@ void CPUKernel::Init(const CNodePtr &kernel_node) { | |||
| } | |||
| void CPUKernelUtils::ExpandDimsTo4(std::vector<size_t> *shape) { | |||
| MS_EXCEPTION_IF_NULL(shape); | |||
| auto len = shape->size(); | |||
| if (len < 4) { | |||
| for (size_t i = 0; i < 4 - len; ++i) { | |||
| shape->insert(shape->begin(), 1); | |||
| (void)shape->insert(shape->begin(), 1); | |||
| } | |||
| } | |||
| } | |||
| @@ -79,6 +83,7 @@ size_t CPUKernelUtils::GetElementNumOnAxis(const std::vector<size_t> &shape, int | |||
| void CPUKernelUtils::GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num) { | |||
| size_t accumulation = 1; | |||
| MS_EXCEPTION_IF_NULL(element_num); | |||
| (void)element_num->emplace_back(1); | |||
| for (size_t i = shape.size() - 1; i > 0; --i) { | |||
| accumulation *= shape[i]; | |||
| @@ -112,6 +117,7 @@ void CPUKernelUtils::ParallelFor(const CTask &task, size_t count, float block_si | |||
| void CPUKernelUtils::ParallelForAutoSearch(const CTask &task, size_t count, ParallelSearchInfo *parallel_search_info) { | |||
| const size_t MAX_POW = 6; | |||
| const size_t AVG_COUNT = 5; | |||
| MS_EXCEPTION_IF_NULL(parallel_search_info); | |||
| size_t current_pow = parallel_search_info->search_count / AVG_COUNT; | |||
| if (current_pow < MAX_POW) { | |||
| if (parallel_search_info->search_count % AVG_COUNT == 0) { | |||
| @@ -276,12 +282,12 @@ void BroadcastIterator::GenNextPos() { | |||
| void BroadcastIterator::BroadcastShape() { | |||
| int input_dimension_a = input_shape_a_.size(); | |||
| if (input_dimension_a < output_dimension_) { | |||
| input_shape_a_.insert(input_shape_a_.begin(), output_dimension_ - input_dimension_a, 1); | |||
| (void)input_shape_a_.insert(input_shape_a_.begin(), IntToSize(output_dimension_ - input_dimension_a), 1); | |||
| } | |||
| int input_dimension_b = input_shape_b_.size(); | |||
| if (input_dimension_b < output_dimension_) { | |||
| input_shape_b_.insert(input_shape_b_.begin(), output_dimension_ - input_dimension_b, 1); | |||
| (void)input_shape_b_.insert(input_shape_b_.begin(), IntToSize(output_dimension_ - input_dimension_b), 1); | |||
| } | |||
| } | |||
| @@ -297,10 +303,10 @@ void BroadcastIterator::InitStrides() { | |||
| // Update strides for broadcast | |||
| // While the axis value is 1, the stride is 0 | |||
| std::transform(input_strides_a_.begin(), input_strides_a_.end(), input_shape_a_.begin(), input_strides_a_.begin(), | |||
| [](const auto &a, const auto &b) { return b == 1 ? 0 : a; }); | |||
| std::transform(input_strides_b_.begin(), input_strides_b_.end(), input_shape_b_.begin(), input_strides_b_.begin(), | |||
| [](const auto &a, const auto &b) { return b == 1 ? 0 : a; }); | |||
| (void)std::transform(input_strides_a_.begin(), input_strides_a_.end(), input_shape_a_.begin(), | |||
| input_strides_a_.begin(), [](const auto &a, const auto &b) { return b == 1 ? 0 : a; }); | |||
| (void)std::transform(input_strides_b_.begin(), input_strides_b_.end(), input_shape_b_.begin(), | |||
| input_strides_b_.begin(), [](const auto &a, const auto &b) { return b == 1 ? 0 : a; }); | |||
| } | |||
| TransposeIterator::TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes, | |||
| @@ -13,14 +13,17 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_ | |||
| #include <functional> | |||
| #include <memory> | |||
| #include <numeric> | |||
| #include <string> | |||
| #include <thread> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "backend/kernel_compiler/common_utils.h" | |||
| @@ -33,106 +36,61 @@ using mindspore::kernel::AddressPtr; | |||
| using CTask = std::function<void(size_t, size_t)>; | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| const char KERNEL_SIZE[] = "kernel_size"; | |||
| const char STRIDE[] = "stride"; | |||
| const char STRIDES[] = "strides"; | |||
| const char DILATION[] = "dilation"; | |||
| const char DILATIONS[] = "dilations"; | |||
| const char FORMAT[] = "format"; | |||
| const char PAD[] = "pad"; | |||
| const char PAD_LIST[] = "pad_list"; | |||
| const char PAD_MODE[] = "pad_mode"; | |||
| const char PAD_MODE_LOWER_SAME[] = "same"; | |||
| const char PAD_MODE_LOWER_VALID[] = "valid"; | |||
| const char PAD_MODE_UPPER_SAME[] = "SAME"; | |||
| const char PAD_MODE_UPPER_VALID[] = "VALID"; | |||
| const char TRANSPOSE_A[] = "transpose_a"; | |||
| const char TRANSPOSE_B[] = "transpose_b"; | |||
| const char IS_GRAD[] = "is_grad"; | |||
| const char TRANSPOSE_NO = 'N'; | |||
| const char TRANSPOSE_YES = 'T'; | |||
| const char AXIS[] = "axis"; | |||
| const char DIM[] = "dim"; | |||
| const char BEGIN[] = "begin"; | |||
| const char END[] = "end"; | |||
| const char SIZE[] = "size"; | |||
| const char USE_NESTEROV[] = "use_nesterov"; | |||
| const char GROUP[] = "group"; | |||
| const char START[] = "start"; | |||
| const char LIMIT[] = "limit"; | |||
| const char DELTA[] = "delta"; | |||
| const char SORTED[] = "sorted"; | |||
| const char ADJ_ST[] = "adjoint_st"; | |||
| const char ADJ_dT[] = "adjoint_dt"; | |||
| const char PERIODS[] = "periods"; | |||
| const char WINDOW[] = "window"; | |||
| const char MIN_PERIODS[] = "min_periods"; | |||
| const char CENTER[] = "center"; | |||
| const char METHOD[] = "method"; | |||
| const char CLOSED[] = "closed"; | |||
| const char NA_OPTION[] = "na_option"; | |||
| const char ASCENDING[] = "ascending"; | |||
| const char PCT[] = "pct"; | |||
| enum OperateType { | |||
| ADD = 0, | |||
| SUB, | |||
| MUL, | |||
| DIV, | |||
| SQUARE, | |||
| SQRT, | |||
| POW, | |||
| REALDIV, | |||
| FLOORDIV, | |||
| MOD, | |||
| FLOORMOD, | |||
| NEG, | |||
| LESS, | |||
| ASSIGNADD, | |||
| RELUGRAD, | |||
| RELU6GRAD, | |||
| ABSGRAD, | |||
| TANHGRAD, | |||
| SQRTGRAD, | |||
| SIGMOIDGRAD, | |||
| ONESLIKE, | |||
| ZEROSLIKE, | |||
| SIGN, | |||
| EQUAL, | |||
| NOTEQUAL, | |||
| LESSEQUAL, | |||
| LOGICALAND, | |||
| LOGICALOR, | |||
| LOGICALNOT, | |||
| FLOOR, | |||
| SQUAREDDIFFERENCE, | |||
| GREATER, | |||
| GREATEREQUAL, | |||
| RECIPROCAL, | |||
| GELU, | |||
| GELUGRAD, | |||
| ASIN, | |||
| ACOS, | |||
| ATAN, | |||
| ASINGRAD, | |||
| ACOSGRAD, | |||
| ATANGRAD, | |||
| SIN, | |||
| COS, | |||
| TAN, | |||
| SINH, | |||
| COSH, | |||
| ASINH, | |||
| ACOSH, | |||
| ATANH, | |||
| ASINHGRAD, | |||
| ACOSHGRAD, | |||
| ATAN2, | |||
| RINT, | |||
| ROUND, | |||
| EXP, | |||
| IDENTITY, | |||
| }; | |||
| constexpr char KERNEL_SIZE[] = "kernel_size"; | |||
| constexpr char STRIDE[] = "stride"; | |||
| constexpr char STRIDES[] = "strides"; | |||
| constexpr char DILATION[] = "dilation"; | |||
| constexpr char DILATIONS[] = "dilations"; | |||
| constexpr char FORMAT[] = "format"; | |||
| constexpr char PAD[] = "pad"; | |||
| constexpr char PAD_LIST[] = "pad_list"; | |||
| constexpr char PAD_MODE[] = "pad_mode"; | |||
| constexpr char PAD_MODE_LOWER_SAME[] = "same"; | |||
| constexpr char PAD_MODE_LOWER_VALID[] = "valid"; | |||
| constexpr char PAD_MODE_UPPER_SAME[] = "SAME"; | |||
| constexpr char PAD_MODE_UPPER_VALID[] = "VALID"; | |||
| constexpr char TRANSPOSE_A[] = "transpose_a"; | |||
| constexpr char TRANSPOSE_B[] = "transpose_b"; | |||
| constexpr char IS_GRAD[] = "is_grad"; | |||
| constexpr char TRANSPOSE_NO = 'N'; | |||
| constexpr char TRANSPOSE_YES = 'T'; | |||
| constexpr char AXIS[] = "axis"; | |||
| constexpr char DIM[] = "dim"; | |||
| constexpr char NUM[] = "num"; | |||
| constexpr char BEGIN[] = "begin"; | |||
| constexpr char END[] = "end"; | |||
| constexpr char SIZE[] = "size"; | |||
| constexpr char USE_NESTEROV[] = "use_nesterov"; | |||
| constexpr char GROUP[] = "group"; | |||
| constexpr char START[] = "start"; | |||
| constexpr char LIMIT[] = "limit"; | |||
| constexpr char DELTA[] = "delta"; | |||
| constexpr char SORTED[] = "sorted"; | |||
| constexpr char ADJ_ST[] = "adjoint_st"; | |||
| constexpr char ADJ_dT[] = "adjoint_dt"; | |||
| constexpr char REDUCTION[] = "reduction"; | |||
| constexpr char NONE[] = "none"; | |||
| constexpr char SUM[] = "sum"; | |||
| constexpr char MEAN[] = "mean"; | |||
| constexpr char BETA[] = "beta"; | |||
| constexpr char EXCLUSIVE[] = "exclusive"; | |||
| constexpr char REVERSE[] = "reverse"; | |||
| constexpr char PCR[] = "preprocess_collapse_repeated"; | |||
| constexpr char CTR[] = "ctc_merge_repeated"; | |||
| constexpr char ILOTI[] = "ignore_longer_outputs_than_inputs"; | |||
| constexpr char MOMENTUM[] = "momentum"; | |||
| constexpr char RHO[] = "rho"; | |||
| constexpr char EPSILON[] = "epsilon"; | |||
| constexpr char ALIGN_CORNERS[] = "align_corners"; | |||
| constexpr char PERIODS[] = "periods"; | |||
| constexpr char WINDOW[] = "window"; | |||
| constexpr char MIN_PERIODS[] = "min_periods"; | |||
| constexpr char CENTER[] = "center"; | |||
| constexpr char METHOD[] = "method"; | |||
| constexpr char CLOSED[] = "closed"; | |||
| constexpr char NA_OPTION[] = "na_option"; | |||
| constexpr char ASCENDING[] = "ascending"; | |||
| constexpr char PCT[] = "pct"; | |||
| struct ParallelSearchInfo { | |||
| double min_cost_time{DBL_MAX}; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -25,7 +25,10 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| const std::set<std::string> same_op_name = {"Concat", "Pack", "Stack", "Split", "Transpose", "Unpack", "AddN"}; | |||
| } // namespace | |||
| CPUKernelFactory &CPUKernelFactory::GetInstance() { | |||
| static CPUKernelFactory instance; | |||
| return instance; | |||
| @@ -40,6 +43,7 @@ void CPUKernelFactory::Register(const std::string &kernel_name, const KernelAttr | |||
| } | |||
| std::shared_ptr<CPUKernel> CPUKernelFactory::Create(const std::string &kernel_name, const CNodePtr &apply_kernel) { | |||
| MS_EXCEPTION_IF_NULL(apply_kernel); | |||
| auto kernel_info = dynamic_cast<device::KernelInfo *>(apply_kernel->kernel_info()); | |||
| MS_EXCEPTION_IF_NULL(kernel_info); | |||
| const KernelBuildInfo *kernel_build_Info = kernel_info->select_kernel_build_info(); | |||
| @@ -53,6 +57,8 @@ std::shared_ptr<CPUKernel> CPUKernelFactory::Create(const std::string &kernel_na | |||
| void CPUKernelFactory::SetKernelAttrs(const std::shared_ptr<kernel::OpInfo> op_info, | |||
| std::vector<KernelAttr> *kernel_attrs) { | |||
| MS_EXCEPTION_IF_NULL(kernel_attrs); | |||
| MS_EXCEPTION_IF_NULL(op_info); | |||
| auto inputs_ptr = op_info->inputs_ptr(); | |||
| auto outputs_ptr = op_info->outputs_ptr(); | |||
| if (inputs_ptr.empty()) { | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_FACTORY_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_FACTORY_H_ | |||
| @@ -23,15 +24,16 @@ | |||
| #include <utility> | |||
| #include <vector> | |||
| #include "utils/ms_utils.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/oplib/oplib.h" | |||
| #include "runtime/device/cpu/kernel_select_cpu.h" | |||
| #include "utils/ms_utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| using mindspore::device::cpu::KernelAttr; | |||
| using CPUKernelCreator = std::function<std::shared_ptr<CPUKernel>()>; | |||
| class CPUKernelFactory { | |||
| public: | |||
| static CPUKernelFactory &GetInstance(); | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/crop_and_resize_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| @@ -13,11 +13,14 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CROP_AND_RESIZE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CROP_AND_RESIZE_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <string> | |||
| #include <algorithm> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -19,10 +19,62 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kCTCLossInputsNum = 4; | |||
| constexpr size_t kCTCLossOutputsNum = 2; | |||
| template <typename T> | |||
| inline T LogSumExp(const T logprob1, const T logprob2) { | |||
| T kLogZero_ = -std::numeric_limits<T>::infinity(); | |||
| if (logprob1 <= kLogZero_) { | |||
| return logprob2; | |||
| } | |||
| if (logprob2 <= kLogZero_) { | |||
| return logprob1; | |||
| } | |||
| return (logprob1 > logprob2) ? logprob1 + static_cast<T>(log1p(exp(logprob2 - logprob1))) | |||
| : logprob2 + static_cast<T>(log1p(exp(logprob1 - logprob2))); | |||
| } | |||
| template <typename T> | |||
| void InnerSoftMax(const T *inputs_addr, std::vector<std::vector<T>> *softmax_probs, const uint32_t sequence_length, | |||
| size_t num_class, size_t batch_size, size_t b) { | |||
| for (size_t t = 0; t < sequence_length; ++t) { | |||
| auto maxCoeff = static_cast<T>(0); | |||
| auto sumCoeff = static_cast<T>(0); | |||
| for (size_t c = 0; c < num_class; ++c) { | |||
| if (inputs_addr[t * batch_size * num_class + b * num_class + c] > maxCoeff) { | |||
| maxCoeff = inputs_addr[t * batch_size * num_class + b * num_class + c]; | |||
| } | |||
| } | |||
| for (size_t c = 0; c < num_class; ++c) { | |||
| sumCoeff += static_cast<T>(exp(inputs_addr[t * batch_size * num_class + b * num_class + c] - maxCoeff)); | |||
| (*softmax_probs)[c][t] = | |||
| static_cast<T>(exp(inputs_addr[t * batch_size * num_class + b * num_class + c] - maxCoeff)); | |||
| } | |||
| for (size_t c = 0; c < num_class; ++c) { | |||
| (*softmax_probs)[c][t] /= sumCoeff; | |||
| } | |||
| } | |||
| } | |||
| template <typename T> | |||
| void MatrixFromVector(uint32_t row, uint32_t col, std::vector<std::vector<T>> *array2D, const T init_value) { | |||
| array2D->resize(row); | |||
| for (size_t i = 0; i < row; ++i) { | |||
| (*array2D)[i].resize(col, init_value); | |||
| } | |||
| } | |||
| } // namespace | |||
| void CTCLossCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| CheckParam(kernel_node); | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| probs_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| indice_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| indices_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| labels_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| @@ -32,14 +84,13 @@ void CTCLossCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| if (labels_dims_.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "Labels dims: " << labels_dims_.size() << " not support."; | |||
| } | |||
| if (indice_dims_.size() != 2) { | |||
| MS_LOG(EXCEPTION) << "Labels indice dims: " << indice_dims_.size() << " not support."; | |||
| if (indices_dims_.size() != 2) { | |||
| MS_LOG(EXCEPTION) << "Labels indice dims: " << indices_dims_.size() << " not support."; | |||
| } | |||
| preprocess_collapse_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "preprocess_collapse_repeated"); | |||
| ctc_merge_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "ctc_merge_repeated"); | |||
| ignore_longer_outputs_than_inputs_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "ignore_longer_outputs_than_inputs"); | |||
| preprocess_collapse_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, PCR); | |||
| ctc_merge_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, CTR); | |||
| ignore_longer_outputs_than_inputs_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, ILOTI); | |||
| max_time_ = probs_shape_[0]; | |||
| batch_size_ = probs_shape_[1]; | |||
| num_class_ = probs_shape_[2]; | |||
| @@ -48,31 +99,23 @@ void CTCLossCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool CTCLossCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kCTCLossInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kCTCLossOutputsNum, kernel_name_); | |||
| if (dtype_ == kNumberTypeFloat16) { | |||
| LaunchKernel<float16>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat32) { | |||
| LaunchKernel<float>(inputs, outputs); | |||
| } | |||
| return true; | |||
| } | |||
| template <typename T> | |||
| inline T LogSumExp(const T logprob1, const T logprob2) { | |||
| T kLogZero_ = -std::numeric_limits<T>::infinity(); | |||
| if (logprob1 <= kLogZero_) { | |||
| return logprob2; | |||
| } else if (logprob2 <= kLogZero_) { | |||
| return logprob1; | |||
| } else { | |||
| return (logprob1 > logprob2) ? logprob1 + static_cast<T>(log1p(exp(logprob2 - logprob1))) | |||
| : logprob2 + static_cast<T>(log1p(exp(logprob1 - logprob2))); | |||
| MS_LOG(EXCEPTION) << kernel_name_ << " only support float16 and float32 on CPU, but got " | |||
| << TypeIdToType(dtype_)->ToString(); | |||
| } | |||
| return true; | |||
| } | |||
| template <typename TT> | |||
| void CTCLossCPUKernel::CalculateFwdVar(const std::vector<uint32_t> &label_with_blank, | |||
| const std::vector<std::vector<TT>> &y, | |||
| std::vector<std::vector<TT>> *log_alpha_b) { | |||
| std::vector<std::vector<TT>> *log_alpha_b) const { | |||
| int U = label_with_blank.size(); | |||
| int T = (*log_alpha_b)[0].size(); | |||
| TT kLogZero_ = -std::numeric_limits<TT>::infinity(); | |||
| @@ -112,7 +155,7 @@ void CTCLossCPUKernel::CalculateFwdVar(const std::vector<uint32_t> &label_with_b | |||
| template <typename TT> | |||
| void CTCLossCPUKernel::CalculateBwdVar(const std::vector<uint32_t> &label_with_blank, | |||
| const std::vector<std::vector<TT>> &y, | |||
| std::vector<std::vector<TT>> *log_beta_b) { | |||
| std::vector<std::vector<TT>> *log_beta_b) const { | |||
| int T = (*log_beta_b)[0].size(); | |||
| int U = label_with_blank.size(); | |||
| if (U > 1) { | |||
| @@ -154,7 +197,7 @@ void CTCLossCPUKernel::CalculateGrad(const std::vector<uint32_t> &label_with_bla | |||
| const std::vector<std::vector<TT>> &y, | |||
| const std::vector<std::vector<TT>> &log_alpha_b, | |||
| const std::vector<std::vector<TT>> &log_beta_b, const TT log_pzx, | |||
| std::vector<std::vector<TT>> *dy) { | |||
| std::vector<std::vector<TT>> *dy) const { | |||
| auto dy_b = dy; | |||
| TT kLogZero_ = -std::numeric_limits<TT>::infinity(); | |||
| if (log_pzx <= kLogZero_) { | |||
| @@ -179,8 +222,8 @@ void CTCLossCPUKernel::CalculateGrad(const std::vector<uint32_t> &label_with_bla | |||
| } | |||
| } | |||
| void CTCLossCPUKernel::GenLableWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label, | |||
| std::vector<std::vector<uint32_t>> *label_with_blank) { | |||
| void CTCLossCPUKernel::GenLabelWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label, | |||
| std::vector<std::vector<uint32_t>> *label_with_blank) const { | |||
| for (size_t b = 0; b < batch_size_; ++b) { | |||
| std::vector<uint32_t> l; | |||
| const std::vector<uint32_t> &label = batch_label[b]; | |||
| @@ -197,11 +240,9 @@ void CTCLossCPUKernel::GenLableWithBlank(const uint32_t *seq_len, const std::vec | |||
| } | |||
| } | |||
| } | |||
| if (!ignore_longer_outputs_than_inputs_) { | |||
| if (l.size() > seq_len[b]) { | |||
| MS_LOG(EXCEPTION) << "Input time(sequence length) should greater than output size(label length), but gets " | |||
| << seq_len[b] << "< " << l.size(); | |||
| } | |||
| if (!ignore_longer_outputs_than_inputs_ && l.size() > seq_len[b]) { | |||
| MS_LOG(EXCEPTION) << "Input time(sequence length) should greater than output size(label length), but gets " | |||
| << seq_len[b] << "< " << l.size(); | |||
| } | |||
| (*label_with_blank)[b].reserve(2 * l.size() + 1); | |||
| @@ -214,46 +255,14 @@ void CTCLossCPUKernel::GenLableWithBlank(const uint32_t *seq_len, const std::vec | |||
| } | |||
| template <typename T> | |||
| void InnerSoftMax(const T *inputs_addr, std::vector<std::vector<T>> *softmax_probs, const uint32_t sequence_length, | |||
| size_t num_class, size_t batch_size, size_t b) { | |||
| for (size_t t = 0; t < sequence_length; ++t) { | |||
| T maxCoeff(T(0)); | |||
| T sumCoeff(T(0)); | |||
| for (size_t c = 0; c < num_class; ++c) { | |||
| if (inputs_addr[t * batch_size * num_class + b * num_class + c] > maxCoeff) { | |||
| maxCoeff = inputs_addr[t * batch_size * num_class + b * num_class + c]; | |||
| } | |||
| } | |||
| for (size_t c = 0; c < num_class; ++c) { | |||
| sumCoeff += static_cast<T>(exp(inputs_addr[t * batch_size * num_class + b * num_class + c] - maxCoeff)); | |||
| (*softmax_probs)[c][t] = | |||
| static_cast<T>(exp(inputs_addr[t * batch_size * num_class + b * num_class + c] - maxCoeff)); | |||
| } | |||
| for (size_t c = 0; c < num_class; ++c) { | |||
| (*softmax_probs)[c][t] /= sumCoeff; | |||
| } | |||
| } | |||
| } | |||
| template <typename T> | |||
| void MatrixfromVector(uint32_t row, uint32_t col, std::vector<std::vector<T>> *array2D, const T init_value) { | |||
| array2D->resize(row); | |||
| for (size_t i = 0; i < row; ++i) { | |||
| (*array2D)[i].resize(col, init_value); | |||
| } | |||
| } | |||
| template <typename T> | |||
| void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) { | |||
| auto inputs_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto labels_indices_addr = reinterpret_cast<uint64_t *>(inputs[1]->addr); | |||
| auto labels_values_addr = reinterpret_cast<uint32_t *>(inputs[2]->addr); | |||
| auto sequence_length_addr = reinterpret_cast<uint32_t *>(inputs[3]->addr); | |||
| auto loss_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| auto gradient_addr = reinterpret_cast<T *>(outputs[1]->addr); | |||
| void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) const { | |||
| const auto *inputs_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| const auto *labels_indices_addr = reinterpret_cast<uint64_t *>(inputs[1]->addr); | |||
| const auto *labels_values_addr = reinterpret_cast<uint32_t *>(inputs[2]->addr); | |||
| const auto *sequence_length_addr = reinterpret_cast<uint32_t *>(inputs[3]->addr); | |||
| auto *loss_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| auto *gradient_addr = reinterpret_cast<T *>(outputs[1]->addr); | |||
| std::vector<std::vector<uint32_t>> label_batch; | |||
| std::vector<std::vector<uint32_t>> labels_with_blank; | |||
| @@ -266,18 +275,21 @@ void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const | |||
| T kLogZero_ = -std::numeric_limits<T>::infinity(); | |||
| // check validation of sequence length | |||
| for (size_t b = 0; b < batch_size_; ++b) { | |||
| if (sequence_length_addr[b] == uint32_t(0)) { | |||
| if (sequence_length_addr[b] == static_cast<uint32_t>(0)) { | |||
| MS_LOG(EXCEPTION) << "Sequence length should > 0, but gets " << sequence_length_addr[b]; | |||
| } | |||
| if (sequence_length_addr[b] > max_time_) { | |||
| MS_LOG(EXCEPTION) << "Max time should be greater than sequence length, but gets " << max_time_ << " < " | |||
| << sequence_length_addr[b]; | |||
| } | |||
| } | |||
| for (size_t i = 0; i < indice_dims_[0]; ++i) { | |||
| each_label_length[labels_indices_addr[i * 2]]++; | |||
| for (size_t i = 0; i < indices_dims_[0]; ++i) { | |||
| const size_t factor = 2; | |||
| auto index = labels_indices_addr[i * factor]; | |||
| if (index >= SizeToUlong(each_label_length.size())) { | |||
| MS_LOG(EXCEPTION) << "Index: " << index << "out of the bounds of the vector."; | |||
| } | |||
| each_label_length[index]++; | |||
| } | |||
| // convert label format of label_value and label_indices to batch_label | |||
| @@ -291,7 +303,7 @@ void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const | |||
| } | |||
| // convert label to label with blank | |||
| GenLableWithBlank(sequence_length_addr, label_batch, &labels_with_blank); | |||
| GenLabelWithBlank(sequence_length_addr, label_batch, &labels_with_blank); | |||
| for (size_t b = 0; b < batch_size_; ++b) { | |||
| std::vector<uint32_t> label_with_blank = labels_with_blank[b]; | |||
| @@ -300,12 +312,11 @@ void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const | |||
| std::vector<std::vector<T>> dy; | |||
| std::vector<std::vector<T>> log_alpha_b; | |||
| std::vector<std::vector<T>> log_beta_b; | |||
| MatrixfromVector(num_class_, sequence_length_addr[b], &y_b, kLogZero_); | |||
| MatrixfromVector(y_b.size(), y_b[0].size(), &dy, T(0)); | |||
| MatrixfromVector(label_with_blank.size(), sequence_length_addr[b], &log_alpha_b, kLogZero_); | |||
| MatrixfromVector(label_with_blank.size(), sequence_length_addr[b], &log_beta_b, kLogZero_); | |||
| MatrixFromVector(num_class_, sequence_length_addr[b], &y_b, kLogZero_); | |||
| MatrixFromVector(y_b.size(), y_b[0].size(), &dy, T(0)); | |||
| MatrixFromVector(label_with_blank.size(), sequence_length_addr[b], &log_alpha_b, kLogZero_); | |||
| MatrixFromVector(label_with_blank.size(), sequence_length_addr[b], &log_beta_b, kLogZero_); | |||
| InnerSoftMax(inputs_addr, &y_b, sequence_length_addr[b], num_class_, batch_size_, b); | |||
| CalculateFwdVar(label_with_blank, y_b, &log_alpha_b); | |||
| CalculateBwdVar(label_with_blank, y_b, &log_beta_b); | |||
| @@ -313,9 +324,7 @@ void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const | |||
| for (size_t u = 0; u < label_with_blank.size(); ++u) { | |||
| log_pzx = LogSumExp(log_pzx, log_alpha_b[u][0] + log_beta_b[u][0]); | |||
| } | |||
| loss_addr[b] = -log_pzx; | |||
| CalculateGrad(label_with_blank, y_b, log_alpha_b, log_beta_b, log_pzx, &dy); | |||
| for (size_t t = 0; t < sequence_length_addr[b]; ++t) { | |||
| @@ -325,16 +334,5 @@ void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const | |||
| } | |||
| } | |||
| } | |||
| void CTCLossCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 4) { | |||
| MS_LOG(EXCEPTION) << "CTCLossCPUKernel needs 4 inputs, but gets " << input_num; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 2) { | |||
| MS_LOG(EXCEPTION) << "CTCLossCPUKernel expects 2 outputs, but gets" << output_num; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -16,11 +16,13 @@ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_ | |||
| #include <memory> | |||
| #include <unordered_map> | |||
| #include <vector> | |||
| #include <algorithm> | |||
| #include <limits> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -36,36 +38,35 @@ class CTCLossCPUKernel : public CPUKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| void GenLableWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label, | |||
| std::vector<std::vector<uint32_t>> *label_with_blank); | |||
| private: | |||
| void GenLabelWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label, | |||
| std::vector<std::vector<uint32_t>> *label_with_blank) const; | |||
| template <typename T> | |||
| void CalculateFwdVar(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y, | |||
| std::vector<std::vector<T>> *log_alpha_b); | |||
| std::vector<std::vector<T>> *log_alpha_b) const; | |||
| template <typename T> | |||
| void CalculateBwdVar(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y, | |||
| std::vector<std::vector<T>> *log_beta_b); | |||
| std::vector<std::vector<T>> *log_beta_b) const; | |||
| template <typename T> | |||
| void CalculateGrad(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y, | |||
| const std::vector<std::vector<T>> &log_alpha_b, const std::vector<std::vector<T>> &log_beta_b, | |||
| const T log_pzx, std::vector<std::vector<T>> *dy); | |||
| const T log_pzx, std::vector<std::vector<T>> *dy) const; | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const; | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| std::vector<size_t> probs_shape_; | |||
| std::vector<size_t> indice_dims_; | |||
| std::vector<size_t> indices_dims_; | |||
| std::vector<size_t> labels_dims_; | |||
| size_t num_class_; | |||
| size_t max_time_; | |||
| size_t batch_size_; | |||
| uint32_t blank_index_; | |||
| size_t num_class_{0}; | |||
| size_t max_time_{0}; | |||
| size_t batch_size_{0}; | |||
| uint32_t blank_index_{0}; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| bool preprocess_collapse_repeated_; | |||
| bool ctc_merge_repeated_; | |||
| bool ignore_longer_outputs_than_inputs_; | |||
| bool preprocess_collapse_repeated_{false}; | |||
| bool ctc_merge_repeated_{false}; | |||
| bool ignore_longer_outputs_than_inputs_{false}; | |||
| }; | |||
| MS_REG_CPU_KERNEL(CTCLoss, | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,20 +13,29 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <thread> | |||
| #include "backend/kernel_compiler/cpu/cumsum_cpu_kernel.h" | |||
| #include <thread> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kCumSumInputsNum = 1; | |||
| constexpr size_t kCumSumOutputsNum = 1; | |||
| } // namespace | |||
| void CumSumCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| CheckParam(kernel_node); | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| axis_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "axis")); | |||
| axis_ = LongToInt(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS)); | |||
| dst_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| exclusive_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "exclusive"); | |||
| reverse_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "reverse"); | |||
| exclusive_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, EXCLUSIVE); | |||
| reverse_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, REVERSE); | |||
| int input_dim_length = SizeToInt(shape_.size()); | |||
| if (axis_ >= input_dim_length) { | |||
| MS_LOG(EXCEPTION) << "Axis out of bounds."; | |||
| @@ -57,12 +66,17 @@ void CumSumCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { | |||
| InitWorkspaceSize<int8_t>(); | |||
| } else if (dtype_ == kNumberTypeUInt8) { | |||
| InitWorkspaceSize<uint8_t>(); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << kernel_name_ << " supports (float16, float32, uint8, int8, int32) on CPU, but got " | |||
| << TypeIdToType(dtype_)->ToString(); | |||
| } | |||
| } | |||
| bool CumSumCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kCumSumInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kCumSumOutputsNum, kernel_name_); | |||
| Reshape(); | |||
| if (dtype_ == kNumberTypeFloat32) { | |||
| LaunchKernel<float_t>(inputs, workspace, outputs); | |||
| @@ -74,6 +88,9 @@ bool CumSumCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| LaunchKernel<int8_t>(inputs, workspace, outputs); | |||
| } else if (dtype_ == kNumberTypeUInt8) { | |||
| LaunchKernel<uint8_t>(inputs, workspace, outputs); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << kernel_name_ << " supports (float16, float32, uint8, int8, int32) on CPU, but got " | |||
| << TypeIdToType(dtype_)->ToString(); | |||
| } | |||
| return true; | |||
| } | |||
| @@ -90,12 +107,11 @@ void CumSumCPUKernel::Reshape() { | |||
| } | |||
| stride_ = dims_[1] * dims_[2]; | |||
| stride2_ = dims_[2]; | |||
| return; | |||
| } | |||
| template <typename T> | |||
| void CumSumCPUKernel::LeftMove(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, | |||
| size_t stride2, size_t start, size_t end) { | |||
| size_t stride2, size_t start, size_t end) const { | |||
| for (size_t i = start; i < end; i++) { | |||
| size_t k1 = i / dim2 % dim0; | |||
| size_t k2 = i % dim2; | |||
| @@ -114,7 +130,7 @@ void CumSumCPUKernel::LeftMove(const T *input, T *output, size_t dim0, size_t di | |||
| template <typename T> | |||
| void CumSumCPUKernel::RightMove(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, | |||
| size_t stride2, size_t start, size_t end) { | |||
| size_t stride2, size_t start, size_t end) const { | |||
| for (size_t i = start; i < end; i++) { | |||
| size_t k1 = i / dim2 % dim0; | |||
| size_t k2 = i % dim2; | |||
| @@ -133,7 +149,7 @@ void CumSumCPUKernel::RightMove(const T *input, T *output, size_t dim0, size_t d | |||
| template <typename T> | |||
| void CumSumCPUKernel::Copy(T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2, | |||
| size_t start, size_t end) { | |||
| size_t start, size_t end) const { | |||
| for (size_t i = start; i < end; i++) { | |||
| size_t k1 = i / dim2 % dim0; | |||
| size_t k2 = i % dim2; | |||
| @@ -147,7 +163,7 @@ void CumSumCPUKernel::Copy(T *input, T *output, size_t dim0, size_t dim1, size_t | |||
| template <typename T> | |||
| void CumSumCPUKernel::CumSumKernelReverse(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, | |||
| size_t stride, size_t stride2, size_t start, size_t end) { | |||
| size_t stride, size_t stride2, size_t start, size_t end) const { | |||
| for (size_t i = start; i < end; i++) { | |||
| size_t k1 = i / dim2 % dim0; | |||
| size_t k2 = i % dim2; | |||
| @@ -166,7 +182,7 @@ void CumSumCPUKernel::CumSumKernelReverse(const T *input, T *output, size_t dim0 | |||
| template <typename T> | |||
| void CumSumCPUKernel::CumSumKernel(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, | |||
| size_t stride2, size_t start, size_t end) { | |||
| size_t stride2, size_t start, size_t end) const { | |||
| for (size_t i = start; i < end; i++) { | |||
| size_t k1 = i / dim2 % dim0; | |||
| size_t k2 = i % dim2; | |||
| @@ -184,7 +200,7 @@ void CumSumCPUKernel::CumSumKernel(const T *input, T *output, size_t dim0, size_ | |||
| } | |||
| template <typename T> | |||
| void CumSumCPUKernel::LaunchCumSum(const T *input, T *output, T *workspace, size_t start, size_t end) { | |||
| void CumSumCPUKernel::LaunchCumSum(const T *input, T *output, T *workspace, size_t start, size_t end) const { | |||
| start = start / dims_[1]; | |||
| end = end / dims_[1]; | |||
| if (exclusive_) { | |||
| @@ -204,15 +220,14 @@ void CumSumCPUKernel::LaunchCumSum(const T *input, T *output, T *workspace, size | |||
| CumSumKernel(input, output, dims_[0], dims_[1], dims_[2], stride_, stride2_, start, end); | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| template <typename T> | |||
| void CumSumCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| auto input = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto ws = reinterpret_cast<T *>(workspace[0]->addr); | |||
| const std::vector<kernel::AddressPtr> &outputs) const { | |||
| const auto *input = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *ws = reinterpret_cast<T *>(workspace[0]->addr); | |||
| auto output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| // multithreading | |||
| size_t lens = inputs[0]->size > 0 ? static_cast<size_t>(inputs[0]->size / sizeof(T)) : 1; | |||
| @@ -239,14 +254,6 @@ void CumSumCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs | |||
| for (size_t i = 0; i < threads.size(); ++i) { | |||
| threads[i].join(); | |||
| } | |||
| return; | |||
| } | |||
| void CumSumCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Argument number is " << input_num << ", but CumSumGpuKernel needs 1."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -19,6 +19,7 @@ | |||
| #include <memory> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -31,55 +32,53 @@ class CumSumCPUKernel : public CPUKernel { | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| void InitInputOutputSize(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| template <typename T> | |||
| void InitWorkspaceSize(); | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs); | |||
| private: | |||
| void Reshape(); | |||
| template <typename T> | |||
| void LaunchCumSum(const T *input_addr, T *output_addr, T *ws_addr, size_t start, size_t end); | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| void InitWorkspaceSize(); | |||
| void Reshape(); | |||
| void InitInputOutputSize(const CNodePtr &kernel_node) override; | |||
| template <typename T> | |||
| void LeftMove(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2, | |||
| size_t start, size_t end); | |||
| size_t start, size_t end) const; | |||
| template <typename T> | |||
| void RightMove(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2, | |||
| size_t start, size_t end); | |||
| size_t start, size_t end) const; | |||
| template <typename T> | |||
| void Copy(T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2, size_t start, | |||
| size_t end); | |||
| size_t end) const; | |||
| template <typename T> | |||
| void CumSumKernelReverse(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, | |||
| size_t stride2, size_t start, size_t end); | |||
| size_t stride2, size_t start, size_t end) const; | |||
| template <typename T> | |||
| void CumSumKernel(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2, | |||
| size_t start, size_t end); | |||
| size_t start, size_t end) const; | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) const; | |||
| template <typename T> | |||
| void LaunchCumSum(const T *input_addr, T *output_addr, T *ws_addr, size_t start, size_t end) const; | |||
| std::vector<size_t> shape_; | |||
| std::vector<size_t> dst_shape; | |||
| size_t input_size_0_; | |||
| size_t stride_; | |||
| size_t stride2_; | |||
| size_t dims_[3] = {}; | |||
| int exclusive_; | |||
| int reverse_; | |||
| int axis_; | |||
| size_t input_size_0_{0}; | |||
| size_t stride_{0}; | |||
| size_t stride2_{0}; | |||
| size_t dims_[3]{0}; | |||
| int exclusive_{0}; | |||
| int reverse_{0}; | |||
| int axis_{0}; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| }; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,28 +13,35 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/debug_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "utils/ms_utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void DebugCPUKernel::InitKernel(const CNodePtr &kernel_node) { MS_EXCEPTION_IF_NULL(kernel_node); } | |||
| namespace { | |||
| constexpr size_t kDebugInputsNum = 1; | |||
| constexpr size_t kDebugOutputsNum = 1; | |||
| } // namespace | |||
| void DebugCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| } | |||
| bool DebugCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() < 1 || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "Input or output empty!"; | |||
| } | |||
| auto val = reinterpret_cast<int *>(inputs[0]->addr); | |||
| MS_LOG(DEBUG) << " launch DebugCountCPUKernel val " << *val; | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kDebugInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kDebugOutputsNum, kernel_name_); | |||
| const auto *val = reinterpret_cast<int *>(inputs[0]->addr); | |||
| MS_LOG(DEBUG) << " launch DebugCountCPUKernel"; | |||
| auto output = reinterpret_cast<int *>(outputs[0]->addr); | |||
| size_t elem_num = inputs[0]->size / sizeof(int); | |||
| for (size_t i = 0; i < elem_num; i++) { | |||
| output[i] = static_cast<int>(val[i]); | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,11 +13,13 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEBUG_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEBUG_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -13,14 +13,17 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_ | |||
| #include <memory> | |||
| #include <string> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T> | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -14,24 +14,29 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/dropout_cpu_kernel.h" | |||
| #include <algorithm> | |||
| #include <random> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "backend/kernel_compiler/cpu/dropout_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kDropoutInputsNum = 1; | |||
| constexpr size_t kDropoutOutputsNum = 2; | |||
| } // namespace | |||
| void DropoutCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| CheckParam(kernel_node); | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| mask_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 1); | |||
| keep_prob_ = AnfAlgo::GetNodeAttr<float>(kernel_node, "keep_prob"); | |||
| if (keep_prob_ <= 0.0) { | |||
| MS_LOG(EXCEPTION) << "Keep_prob is smaller or equal to zero but DropoutCPUKernel needs greater than 0"; | |||
| } | |||
| if (keep_prob_ > 1.0) { | |||
| MS_LOG(EXCEPTION) << "Keep_prob greater than one but DropoutCPUKernel needs smaller or equal to one"; | |||
| if (keep_prob_ <= 0.0 || keep_prob_ > 1.0) { | |||
| MS_LOG(EXCEPTION) << kernel_name_ << "requires keep_prob should be in (0.0, 1.0], but got " << keep_prob_; | |||
| } | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| for (const uint64_t &d : input_shape_) { | |||
| @@ -41,18 +46,24 @@ void DropoutCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool DropoutCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kDropoutInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kDropoutOutputsNum, kernel_name_); | |||
| if (dtype_ == kNumberTypeFloat16) { | |||
| LaunchKernel<float16>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat32) { | |||
| LaunchKernel<float>(inputs, outputs); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << kernel_name_ << " only support float16 and float32 on CPU, but got " | |||
| << TypeIdToType(dtype_)->ToString(); | |||
| } | |||
| return true; | |||
| } | |||
| template <typename T> | |||
| void DropoutCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) { | |||
| auto input_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| void DropoutCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) const { | |||
| const auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| auto mask_addr = reinterpret_cast<T *>(outputs[1]->addr); | |||
| std::random_device rd; | |||
| std::mt19937 gen(rd()); | |||
| @@ -63,17 +74,5 @@ void DropoutCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const | |||
| output_addr[i] = mask_addr[i] * input_addr[i] * scale; | |||
| } | |||
| } | |||
| void DropoutCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but DropoutCPUKernel needs 1 input."; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 2) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but DropoutCPUKernel needs 1 output."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -16,8 +16,10 @@ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DROPOUT_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DROPOUT_CPU_KERNEL_H_ | |||
| #include <memory> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -33,17 +35,16 @@ class DropoutCPUKernel : public CPUKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const; | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| std::vector<size_t> input_shape_; | |||
| std::vector<size_t> output_shape_; | |||
| std::vector<size_t> mask_shape_; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| float keep_prob_ = 0.0; | |||
| uint64_t tensor_size_ = 1; | |||
| float keep_prob_{0.0}; | |||
| uint64_t tensor_size_{1}; | |||
| }; | |||
| MS_REG_CPU_KERNEL(Dropout, KernelAttr(), DropoutCPUKernel); | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,16 +13,24 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/dropout_grad_kernel.h" | |||
| #include <vector> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "backend/kernel_compiler/cpu/dropout_grad_kernel.h" | |||
| #include "nnacl/fp32_grad/dropout_grad.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/fp32_grad/dropout_grad.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kDropoutGradInputsNum = 2; | |||
| constexpr size_t kDropoutGradOutputsNum = 1; | |||
| } // namespace | |||
| void DropoutGradCpuBwdKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| auto input_mask_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| if (input_shape.size() != input_mask_shape.size()) { | |||
| @@ -35,8 +43,8 @@ void DropoutGradCpuBwdKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| } | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| keep_prob_ = AnfAlgo::GetNodeAttr<float>(kernel_node, "keep_prob"); | |||
| if (keep_prob_ == 0) { | |||
| MS_LOG(EXCEPTION) << "The keep_prob is zero."; | |||
| if (keep_prob_ <= 0.0 || keep_prob_ > 1.0) { | |||
| MS_LOG(EXCEPTION) << kernel_name_ << "requires keep_prob should be in (0.0, 1.0], but got " << keep_prob_; | |||
| } | |||
| } | |||
| @@ -51,12 +59,15 @@ void DropoutGradCpuBwdKernel::InitInputOutputSize(const CNodePtr &kernel_node) { | |||
| bool DropoutGradCpuBwdKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kDropoutGradInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kDropoutGradOutputsNum, kernel_name_); | |||
| if (dtype_ == kNumberTypeFloat16) { | |||
| DropoutBackwardKernel<float16>(inputs, workspace, outputs, keep_prob_); | |||
| } else if (dtype_ == kNumberTypeFloat32) { | |||
| DropoutBackwardKernel<float>(inputs, workspace, outputs, keep_prob_); | |||
| } else { | |||
| MS_LOG(ERROR) << "Input data type: " << dtype_ << " is not supported for DropoutGrad kernel for CPU."; | |||
| MS_LOG(EXCEPTION) << kernel_name_ << " only support float16 and float32 on CPU, but got " | |||
| << TypeIdToType(dtype_)->ToString(); | |||
| } | |||
| return true; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -21,6 +21,7 @@ | |||
| #include <memory> | |||
| #include <string> | |||
| #include <unordered_map> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -36,12 +37,12 @@ class DropoutGradCpuBwdKernel : public CPUKernel { | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| float keep_prob_{1.0}; | |||
| size_t num_count_{1}; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| template <typename T> | |||
| void DropoutBackwardKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, float keep_prob); | |||
| float keep_prob_{1.0}; | |||
| size_t num_count_{1}; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| }; | |||
| MS_REG_CPU_KERNEL(DropoutGrad, KernelAttr(), DropoutGradCpuBwdKernel); | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,13 +13,23 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <algorithm> | |||
| #include "backend/kernel_compiler/cpu/dynamic_assign_cpu_kernel.h" | |||
| #include <algorithm> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kDynamicAssignInputsNum = 2; | |||
| constexpr size_t kDynamicAssignOutputsNum = 1; | |||
| } // namespace | |||
| void DynamicAssignCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| node_wpt_ = kernel_node; | |||
| input_x_dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| input_x_dtype_size_ = GetTypeByte(TypeIdToType(input_x_dtype_)); | |||
| @@ -28,6 +38,8 @@ void DynamicAssignCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool DynamicAssignCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kDynamicAssignInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kDynamicAssignOutputsNum, kernel_name_); | |||
| if (input_x_dtype_ == kNumberTypeInt32) { | |||
| LaunchKernel<int>(inputs, outputs); | |||
| } else if (input_x_dtype_ == kNumberTypeInt64) { | |||
| @@ -37,8 +49,8 @@ bool DynamicAssignCPUKernel::Launch(const std::vector<kernel::AddressPtr> &input | |||
| } else if (input_x_dtype_ == kNumberTypeFloat64) { | |||
| LaunchKernel<double>(inputs, outputs); | |||
| } else { | |||
| MS_LOG(ERROR) << "Dtype of indices only support float32, float64, int32, int64"; | |||
| return false; | |||
| MS_LOG(EXCEPTION) << kernel_name_ << " support (int32, int64, float32, float64) on CPU , but got " | |||
| << TypeIdToType(input_x_dtype_)->ToString(); | |||
| } | |||
| return true; | |||
| } | |||
| @@ -46,25 +58,27 @@ bool DynamicAssignCPUKernel::Launch(const std::vector<kernel::AddressPtr> &input | |||
| template <typename T> | |||
| void DynamicAssignCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| auto node_ = node_wpt_.lock(); | |||
| if (!node_) { | |||
| MS_LOG(EXCEPTION) << "node_wpt_ is expired."; | |||
| auto node = node_wpt_.lock(); | |||
| if (!node) { | |||
| MS_LOG(EXCEPTION) << kernel_name_ << " node_wpt_ is expired."; | |||
| } | |||
| auto input_x_shape = AnfAlgo::GetPrevNodeOutputInferShape(node_, 0); | |||
| auto input_y_shape = AnfAlgo::GetPrevNodeOutputInferShape(node_, 1); | |||
| auto input_x_shape = AnfAlgo::GetPrevNodeOutputInferShape(node, 0); | |||
| auto input_y_shape = AnfAlgo::GetPrevNodeOutputInferShape(node, 1); | |||
| batch_size_ = 1; | |||
| for (size_t i = 0; i < input_x_shape.size(); ++i) { | |||
| batch_size_ *= input_x_shape[i]; | |||
| } | |||
| if (input_x_shape.size() != input_y_shape.size()) MS_LOG(EXCEPTION) << "X and y must be same shape!"; | |||
| if (input_x_shape.size() != input_y_shape.size()) { | |||
| MS_LOG(EXCEPTION) << "X and y must be same shape"; | |||
| } | |||
| for (size_t i = 0; i < input_x_shape.size(); ++i) { | |||
| if (input_x_shape[i] != input_y_shape[i]) { | |||
| MS_LOG(EXCEPTION) << "X and y must be same shape!"; | |||
| MS_LOG(EXCEPTION) << "x and y must be same shape!"; | |||
| } | |||
| } | |||
| T *input_x = reinterpret_cast<T *>(inputs[0]->addr); | |||
| T *input_y = reinterpret_cast<T *>(inputs[1]->addr); | |||
| auto *input_x = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *input_y = reinterpret_cast<T *>(inputs[1]->addr); | |||
| auto max_size = inputs[0]->size; | |||
| size_t total_size = input_x_dtype_size_ * batch_size_; | |||
| if (total_size > max_size) { | |||
| @@ -76,10 +90,10 @@ void DynamicAssignCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| MS_LOG(EXCEPTION) << "Memcpy_s error, errorno" << ret; | |||
| } | |||
| auto node_with_idx = AnfAlgo::GetPrevNodeOutput(node_, 0); | |||
| auto node = node_with_idx.first; | |||
| if (node->isa<Parameter>()) { | |||
| auto node_ptr = node->cast<ParameterPtr>(); | |||
| auto node_with_idx = AnfAlgo::GetPrevNodeOutput(node, 0); | |||
| auto out_node = node_with_idx.first; | |||
| if (out_node->isa<Parameter>()) { | |||
| auto node_ptr = out_node->cast<ParameterPtr>(); | |||
| auto value = node_ptr->default_param(); | |||
| auto tensor = value->cast<std::shared_ptr<tensor::Tensor>>(); | |||
| ShapeVector shape_tmp; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,12 +13,14 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DYNAMIC_ASSIGN_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DYNAMIC_ASSIGN_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include <unordered_map> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -34,13 +36,13 @@ class DynamicAssignCPUKernel : public CPUKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs); | |||
| private: | |||
| size_t batch_size_{1}; | |||
| TypeId input_x_dtype_{kTypeUnknown}; | |||
| size_t input_x_dtype_size_ = 4; | |||
| size_t input_x_dtype_size_{4}; | |||
| CNodeWeakPtr node_wpt_; | |||
| }; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -24,59 +24,57 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kInputsNum = 1; | |||
| constexpr size_t kOutputsNum = 1; | |||
| struct DescParam { | |||
| dnnl::algorithm algorithm; | |||
| float alpha = 0.f; | |||
| float beta = 0.f; | |||
| dnnl::algorithm algorithm{dnnl::algorithm::undef}; | |||
| float alpha{0.0f}; | |||
| float beta{0.0f}; | |||
| }; | |||
| } // namespace | |||
| dnnl::eltwise_forward::desc EltWiseCPUKernel::GetForwardEltwiseDesc(const CNodePtr &kernel_node, | |||
| const dnnl::memory::desc src_desc) { | |||
| dnnl::eltwise_forward::desc EltWiseCPUKernel::GetForwardEltwiseDesc(const dnnl::memory::desc src_desc) { | |||
| static const std::unordered_map<std::string, DescParam> eltWiseOpDescMap{ | |||
| {prim::kPrimRelu->name(), DescParam{dnnl::algorithm::eltwise_relu}}, | |||
| {prim::kPrimRelu6->name(), DescParam{dnnl::algorithm::eltwise_clip, 0.f, 6.f}}, | |||
| {prim::kPrimRelu6->name(), DescParam{dnnl::algorithm::eltwise_clip, 0.0f, 6.0f}}, | |||
| {prim::kPrimAbs->name(), DescParam{dnnl::algorithm::eltwise_abs}}, | |||
| {prim::kPrimExp->name(), DescParam{dnnl::algorithm::eltwise_exp}}, | |||
| {prim::kPrimLog->name(), DescParam{dnnl::algorithm::eltwise_log}}, | |||
| {prim::kPrimSigmoid->name(), DescParam{dnnl::algorithm::eltwise_logistic}}, | |||
| {prim::kPrimSqrt->name(), DescParam{dnnl::algorithm::eltwise_sqrt}}, | |||
| {prim::kPrimSquare->name(), DescParam{dnnl::algorithm::eltwise_square}}, | |||
| {prim::kPrimTanh->name(), DescParam{dnnl::algorithm::eltwise_tanh}}, | |||
| {prim::kPrimElu->name(), DescParam{dnnl::algorithm::eltwise_elu, 1.f, 0.f}}, | |||
| {prim::kPrimElu->name(), DescParam{dnnl::algorithm::eltwise_elu, 1.0f, 0.0f}}, | |||
| {prim::kPrimSoftplus->name(), DescParam{dnnl::algorithm::eltwise_soft_relu}}, | |||
| }; | |||
| std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node); | |||
| const auto desc_pair = eltWiseOpDescMap.find(kernel_name); | |||
| const auto desc_pair = eltWiseOpDescMap.find(kernel_name_); | |||
| if (desc_pair == eltWiseOpDescMap.end()) { | |||
| MS_LOG(EXCEPTION) << "EltWiseCPUKernel does not support " << kernel_name; | |||
| MS_LOG(EXCEPTION) << "EltWiseCPUKernel does not support " << kernel_name_; | |||
| } | |||
| return dnnl::eltwise_forward::desc(DnnlForward, desc_pair->second.algorithm, src_desc, desc_pair->second.alpha, | |||
| return dnnl::eltwise_forward::desc(dnnl_forward_, desc_pair->second.algorithm, src_desc, desc_pair->second.alpha, | |||
| desc_pair->second.beta); | |||
| } | |||
| void EltWiseCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| if (src_shape.size() == 0) { | |||
| src_shape.insert(src_shape.begin(), 1); | |||
| if (src_shape.empty()) { | |||
| (void)src_shape.insert(src_shape.begin(), 1); | |||
| } | |||
| dnnl::memory::desc src_desc = GetDefaultMemDesc(src_shape); | |||
| auto desc = GetForwardEltwiseDesc(kernel_node, src_desc); | |||
| auto desc = GetForwardEltwiseDesc(src_desc); | |||
| auto prim_desc = dnnl::eltwise_forward::primitive_desc(desc, MKLKernelEngine::Get().engine()); | |||
| primitive_ = std::make_shared<dnnl::eltwise_forward>(prim_desc); | |||
| AddArgument(DNNL_ARG_SRC, src_desc); | |||
| AddArgument(DNNL_ARG_DST, src_desc); | |||
| } | |||
| bool EltWiseCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.empty() || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "Error input output size!"; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kOutputsNum, kernel_name_); | |||
| SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr); | |||
| ExecutePrimitive(); | |||
| @@ -13,8 +13,10 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ELTWISE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ELTWISE_CPU_KERNEL_H_ | |||
| #include <memory> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| @@ -32,8 +34,9 @@ class EltWiseCPUKernel : public MKLCPUKernel { | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| dnnl::eltwise_forward::desc GetForwardEltwiseDesc(const CNodePtr &kernel_node, const dnnl::memory::desc src_desc); | |||
| dnnl::prop_kind DnnlForward = dnnl::prop_kind::forward_training; | |||
| dnnl::eltwise_forward::desc GetForwardEltwiseDesc(const dnnl::memory::desc src_desc); | |||
| dnnl::prop_kind dnnl_forward_{dnnl::prop_kind::forward_training}; | |||
| }; | |||
| MS_REG_CPU_KERNEL(Elu, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| @@ -13,15 +13,23 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/log_softmax_cpu_kernel.h" | |||
| #include <algorithm> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "utils/ms_utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kLogSoftmaxInputsNum = 1; | |||
| constexpr size_t kLogSoftmaxOutputsNum = 1; | |||
| } // namespace | |||
| void LogSoftmaxCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| int axis = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS); | |||
| if (axis >= SizeToInt(src_shape.size())) { | |||
| @@ -41,9 +49,8 @@ void LogSoftmaxCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool LogSoftmaxCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.empty() || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "Log softmax error input output size!"; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kLogSoftmaxInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kLogSoftmaxOutputsNum, kernel_name_); | |||
| SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr); | |||
| ExecutePrimitive(); | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LOG_SOFTMAX_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LOG_SOFTMAX_CPU_KERNEL_H_ | |||
| @@ -13,15 +13,23 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/log_softmax_grad_cpu_kernel.h" | |||
| #include <algorithm> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "utils/ms_utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kLogSoftmaxGradInputsNum = 2; | |||
| constexpr size_t kLogSoftmaxGradOutputsNum = 1; | |||
| } // namespace | |||
| void LogSoftmaxGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| int axis = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS); | |||
| if (axis >= SizeToInt(src_shape.size())) { | |||
| @@ -47,9 +55,8 @@ void LogSoftmaxGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool LogSoftmaxGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() < 2 || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "LogSoftmaxGrad error input output size!"; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kLogSoftmaxGradInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kLogSoftmaxGradOutputsNum, kernel_name_); | |||
| SetArgumentHandle(DNNL_ARG_DST, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_DST, inputs[1]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_SRC, outputs[0]->addr); | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LOG_SOFTMAX_GRAD_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LOG_SOFTMAX_GRAD_CPU_KERNEL_H_ | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/lstm_cpu_kernel.h" | |||
| #include <string> | |||
| #include "utils/ms_utils.h" | |||
| @@ -21,9 +22,18 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| const int kMaxLSTMLayer = 100; | |||
| const int kOutputWorkSpaceIndex = 3; | |||
| const int kGateNum = 4; | |||
| namespace { | |||
| constexpr size_t kLstmInputsNum = 4; | |||
| constexpr size_t kLstmOutputsNum = 5; | |||
| constexpr int kMaxLSTMLayer = 100; | |||
| constexpr int kOutputWorkSpaceIndex = 3; | |||
| constexpr int kGateNum = 4; | |||
| using tag = dnnl::memory::format_tag; | |||
| using dim = dnnl::memory::dims; | |||
| using dt = dnnl::memory::data_type; | |||
| } // namespace | |||
| void LstmCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { | |||
| CPUKernel::InitInputOutputSize(kernel_node); | |||
| output_size_list_[kOutputWorkSpaceIndex] = reserve_size_; | |||
| @@ -46,8 +56,7 @@ void LstmCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); | |||
| #endif | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| using tag = dnnl::memory::format_tag; | |||
| using dim = dnnl::memory::dims; | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| CheckParam(kernel_node); | |||
| auto eng = MKLKernelEngine::Get().engine(); | |||
| dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional; | |||
| @@ -70,10 +79,10 @@ void LstmCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc); | |||
| dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc); | |||
| dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc); | |||
| if (!kernel_node->HasAttr(kAttrIsTraining)) { | |||
| is_training = true; | |||
| } else { | |||
| if (kernel_node->HasAttr(kAttrIsTraining)) { | |||
| is_training = GetValue<bool>(kernel_node->GetAttr(kAttrIsTraining)); | |||
| } else { | |||
| is_training = true; | |||
| } | |||
| auto prop_kind = dnnl::prop_kind::forward_training; | |||
| if (!is_training) { | |||
| @@ -106,9 +115,9 @@ void LstmCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| std::vector<size_t> src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| std::vector<size_t> src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2); | |||
| bidirectional_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "bidirectional"); | |||
| input_size_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "input_size")); | |||
| hidden_size_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "hidden_size")); | |||
| num_layers_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "num_layers")); | |||
| input_size_ = LongToInt(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "input_size")); | |||
| hidden_size_ = LongToInt(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "hidden_size")); | |||
| num_layers_ = LongToInt(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "num_layers")); | |||
| has_bias_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "has_bias"); | |||
| batch_size_ = SizeToInt(src_shape[1]); | |||
| seq_len_ = SizeToInt(src_shape[0]); | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -16,15 +16,18 @@ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_ | |||
| #if defined(__x86_64__) || defined(__amd64__) || defined(_M_IX86) || defined(_M_X64) | |||
| #define PLATFORM_86 | |||
| #endif | |||
| #ifdef PLATFORM_86 | |||
| #include <pmmintrin.h> | |||
| #endif | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class LstmCPUKernel : public MKLCPUKernel { | |||
| @@ -41,18 +44,20 @@ class LstmCPUKernel : public MKLCPUKernel { | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| int weight_size_ = 0; | |||
| int weight_h_size_ = 0; | |||
| int input_size_; | |||
| int hidden_size_; | |||
| int num_layers_; | |||
| int batch_size_; | |||
| int seq_len_; | |||
| int num_directions_; | |||
| bool bidirectional_; | |||
| bool has_bias_; | |||
| size_t reserve_size_; | |||
| bool is_training; | |||
| int weight_size_{0}; | |||
| int weight_h_size_{0}; | |||
| int input_size_{0}; | |||
| int hidden_size_{0}; | |||
| int num_layers_{0}; | |||
| int batch_size_{0}; | |||
| int seq_len_{0}; | |||
| int num_directions_{0}; | |||
| bool bidirectional_{false}; | |||
| bool has_bias_{false}; | |||
| bool is_training{false}; | |||
| size_t reserve_size_{0}; | |||
| dnnl::memory::dims weights_dims_; | |||
| dnnl::memory::dims weights_h_dims_; | |||
| dnnl::memory::dims bias_dims_; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/lstm_grad_cpu_kernel.h" | |||
| #include <cstring> | |||
| #include <string> | |||
| @@ -22,8 +23,17 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| const int kMaxLSTMLayer = 100; | |||
| const int kInputWorkSpaceIndex = 10; | |||
| namespace { | |||
| constexpr size_t kLstmGradInputsNum = 11; | |||
| constexpr size_t kLstmGradOutputsNum = 4; | |||
| constexpr int kMaxLSTMLayer = 100; | |||
| constexpr int kInputWorkSpaceIndex = 10; | |||
| using tag = dnnl::memory::format_tag; | |||
| using dim = dnnl::memory::dims; | |||
| using dt = dnnl::memory::data_type; | |||
| } // namespace | |||
| void LSTMGradCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { | |||
| CPUKernel::InitInputOutputSize(kernel_node); | |||
| input_size_list_[kInputWorkSpaceIndex] = reserve_size_; | |||
| @@ -31,8 +41,7 @@ void LSTMGradCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { | |||
| void LSTMGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| using tag = dnnl::memory::format_tag; | |||
| using dim = dnnl::memory::dims; | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| CheckParam(kernel_node); | |||
| auto eng = MKLKernelEngine::Get().engine(); | |||
| dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional; | |||
| @@ -167,8 +176,8 @@ void LSTMGradCPUKernel::ResetMemory(const dnnl::memory &mem, const string name) | |||
| bool LSTMGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| using dt = dnnl::memory::data_type; | |||
| using tag = dnnl::memory::format_tag; | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kLstmGradInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kLstmGradOutputsNum, kernel_name_); | |||
| auto eng = MKLKernelEngine::Get().engine(); | |||
| // construct fw memory | |||
| auto user_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng); | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_ | |||
| @@ -47,17 +48,19 @@ class LSTMGradCPUKernel : public MKLCPUKernel { | |||
| const dnnl::memory &diff_bias_memory); | |||
| void ResetMemory(const dnnl::memory &mem, const string name) const; | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| int64_t weight_size_ = 0; | |||
| int64_t weight_h_size_ = 0; | |||
| int64_t input_size_; | |||
| int64_t hidden_size_; | |||
| int64_t num_layers_; | |||
| int64_t batch_size_; | |||
| int64_t seq_len_; | |||
| int num_directions_; | |||
| bool bidirectional_; | |||
| bool has_bias_; | |||
| size_t reserve_size_; | |||
| int num_directions_{0}; | |||
| bool bidirectional_{false}; | |||
| bool has_bias_{false}; | |||
| int64_t weight_size_{0}; | |||
| int64_t weight_h_size_{0}; | |||
| int64_t input_size_{0}; | |||
| int64_t hidden_size_{0}; | |||
| int64_t num_layers_{0}; | |||
| int64_t batch_size_{0}; | |||
| int64_t seq_len_{0}; | |||
| size_t reserve_size_{0}; | |||
| dnnl::memory::dims weights_dims_; | |||
| dnnl::memory::dims weights_h_dims_; | |||
| dnnl::memory::dims bias_dims_; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,10 +13,9 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/matmul_cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/matmul_cpu_kernel.h" | |||
| #include <utility> | |||
| #include "common/thread_pool.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/op_base.h" | |||
| @@ -26,8 +25,10 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kMatMulInputsNum = 2; | |||
| constexpr size_t kMatMulOutputsNum = 1; | |||
| const size_t kIndexOffset = 2; | |||
| } | |||
| } // namespace | |||
| void MatMulCPUKernel::InitTile() { | |||
| #ifdef ENABLE_AVX | |||
| @@ -47,13 +48,16 @@ void MatMulCPUKernel::InitTile() { | |||
| void MatMulCPUKernel::InitMatrixA(const float *src_ptr) { | |||
| const size_t size = param_.batch * param_.row_align_ * param_.deep_; | |||
| a_pack_ptr_ = new float[size]; | |||
| a_pack_ptr_ = new (std::nothrow) float[size]; | |||
| if (a_pack_ptr_ == nullptr) { | |||
| MS_LOG(EXCEPTION) << "MatMul new a_pack_ptr_ failed."; | |||
| } | |||
| if (vec_matmul_) { | |||
| const size_t count = size * sizeof(float); | |||
| if (memcpy_s(a_pack_ptr_, count, src_ptr, count) != EOK) { | |||
| FreeBuffer(); | |||
| MS_LOG(EXCEPTION) << "Memcpy a_pack_ptr_ failed."; | |||
| MS_LOG(EXCEPTION) << "MatMul memcpy a_pack_ptr_ failed."; | |||
| } | |||
| return; | |||
| } | |||
| @@ -88,14 +92,14 @@ void MatMulCPUKernel::InitMatrixB(const float *src_ptr) { | |||
| b_pack_ptr_ = new (std::nothrow) float[size]; | |||
| if (b_pack_ptr_ == nullptr) { | |||
| FreeBuffer(); | |||
| MS_LOG(EXCEPTION) << "Malloc b_pack_ptr_ failed"; | |||
| MS_LOG(EXCEPTION) << "MatMul new b_pack_ptr_ failed"; | |||
| } | |||
| if (vec_matmul_) { | |||
| if (param_.b_transpose_) { | |||
| const size_t count = size * sizeof(float); | |||
| if (memcpy_s(b_pack_ptr_, count, src_ptr, count) != EOK) { | |||
| FreeBuffer(); | |||
| MS_LOG(EXCEPTION) << "Memcpy b_pack_ptr_ failed."; | |||
| MS_LOG(EXCEPTION) << "MatMul memcpy b_pack_ptr_ failed."; | |||
| } | |||
| } else { | |||
| for (int i = 0; i < param_.batch; i++) { | |||
| @@ -169,6 +173,7 @@ void MatMulCPUKernel::InitX64Kernel(bool trans_a, bool trans_b, const std::vecto | |||
| void MatMulCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| std::vector<size_t> a_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> b_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| std::vector<size_t> o_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| @@ -190,7 +195,7 @@ void MatMulCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| #endif | |||
| } | |||
| int MatMulCPUKernel::FloatRun(size_t task_id) { | |||
| int MatMulCPUKernel::FloatRun(size_t task_id) const { | |||
| size_t current_stride_oc = thread_stride_ * col_tile_; | |||
| if (IntToSize(param_.col_) <= task_id * current_stride_oc) { | |||
| return common::SUCCESS; | |||
| @@ -238,7 +243,7 @@ void MatMulCPUKernel::LaunchARM(const float *input_a, const float *input_b, floa | |||
| FreeBuffer(); | |||
| } | |||
| void MatMulCPUKernel::LaunchX64(const float *input_a, const float *input_b, float *output) { | |||
| void MatMulCPUKernel::LaunchX64(const float *input_a, const float *input_b, float *output) const { | |||
| dnnl_dim_t lda = (trans_a_ == TRANSPOSE_YES ? dim_m_ : dim_k_); | |||
| dnnl_dim_t ldb = (trans_b_ == TRANSPOSE_YES ? dim_k_ : dim_n_); | |||
| dnnl_dim_t ldc = dim_n_; | |||
| @@ -252,9 +257,8 @@ void MatMulCPUKernel::LaunchX64(const float *input_a, const float *input_b, floa | |||
| bool MatMulCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() < 2 || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "matmul error input output size!"; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kMatMulInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kMatMulOutputsNum, kernel_name_); | |||
| const auto input_a = reinterpret_cast<float *>(inputs[0]->addr); | |||
| const auto input_b = reinterpret_cast<float *>(inputs[1]->addr); | |||
| auto output = reinterpret_cast<float *>(outputs[0]->addr); | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MATMUL_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MATMUL_CPU_KERNEL_H_ | |||
| @@ -42,14 +43,12 @@ class MatMulCPUKernel : public MKLCPUKernel { | |||
| const std::vector<size_t> &o_shape); | |||
| void InitX64Kernel(bool trans_a, bool trans_b, const std::vector<size_t> &a_shape, const std::vector<size_t> &b_shape, | |||
| const std::vector<size_t> &o_shape); | |||
| void LaunchX64(const float *input_a, const float *input_b, float *output); | |||
| void LaunchX64(const float *input_a, const float *input_b, float *output) const; | |||
| void LaunchARM(const float *input_a, const float *input_b, float *output); | |||
| void ParallelRun(float *output); | |||
| int FloatRun(size_t task_id); | |||
| int FloatRun(size_t task_id) const; | |||
| void FreeBuffer(); | |||
| char trans_a_{TRANSPOSE_NO}; | |||
| char trans_b_{TRANSPOSE_NO}; | |||
| dnnl_dim_t dim_m_{0}; | |||
| dnnl_dim_t dim_n_{0}; | |||
| dnnl_dim_t dim_k_{0}; | |||
| @@ -62,6 +61,8 @@ class MatMulCPUKernel : public MKLCPUKernel { | |||
| size_t size_mat_a_{0}; | |||
| size_t size_mat_b_{0}; | |||
| size_t size_mat_o_{0}; | |||
| char trans_a_{TRANSPOSE_NO}; | |||
| char trans_b_{TRANSPOSE_NO}; | |||
| bool vec_matmul_{false}; | |||
| float *a_pack_ptr_{nullptr}; | |||
| float *b_pack_ptr_{nullptr}; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,9 +13,11 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| #include <vector> | |||
| #include <string> | |||
| #include <algorithm> | |||
| #include "utils/ms_utils.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| @@ -24,8 +26,10 @@ namespace kernel { | |||
| void MKLCPUKernel::GetPadding(const CNodePtr &kernel_node, const std::string &pad_mode, | |||
| const std::vector<size_t> &src_shape, const std::vector<size_t> &kernel_size, | |||
| const std::vector<int> &stride, std::vector<int> *padding_l, std::vector<int> *padding_r, | |||
| const std::vector<int> &dilation) { | |||
| const std::vector<int> &dilation) const { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| MS_EXCEPTION_IF_NULL(padding_l); | |||
| MS_EXCEPTION_IF_NULL(padding_r); | |||
| auto dim = src_shape.size(); | |||
| if (dim < 2) { | |||
| MS_LOG(EXCEPTION) << "Set pad only support src dim >= 2!"; | |||
| @@ -65,7 +69,7 @@ void MKLCPUKernel::GetPadding(const CNodePtr &kernel_node, const std::string &pa | |||
| } | |||
| bool MKLCPUKernel::BinaryBroadCast(std::vector<size_t> *src0_shape, std::vector<size_t> *src1_shape, | |||
| std::vector<size_t> *dst_shape) { | |||
| std::vector<size_t> *dst_shape) const { | |||
| MS_EXCEPTION_IF_NULL(src0_shape); | |||
| MS_EXCEPTION_IF_NULL(src1_shape); | |||
| MS_EXCEPTION_IF_NULL(dst_shape); | |||
| @@ -115,20 +119,19 @@ dnnl::memory::format_tag MKLCPUKernel::GetDefaultFormatTag(const dnnl::memory::d | |||
| dnnl::memory::format_tag::a, dnnl::memory::format_tag::ab, dnnl::memory::format_tag::abc, | |||
| dnnl::memory::format_tag::abcd, dnnl::memory::format_tag::abcde, dnnl::memory::format_tag::abcdef, | |||
| dnnl::memory::format_tag::abcdefg}; | |||
| auto rank = dims.size(); | |||
| size_t rank = dims.size(); | |||
| if (rank > tag_vec.size()) { | |||
| MS_LOG(EXCEPTION) << "The kernel does not support construct " << rank << "-D tensor dnnl memory format_tag."; | |||
| } | |||
| return tag_vec[rank - 1]; | |||
| } | |||
| dnnl::memory::desc MKLCPUKernel::GetDefaultMemDesc(const std::vector<size_t> &shape) { | |||
| dnnl::memory::desc MKLCPUKernel::GetDefaultMemDesc(const std::vector<size_t> &shape) const { | |||
| dnnl::memory::dims dims; | |||
| if (shape.size() == 0) { | |||
| dims.insert(dims.end(), 1); | |||
| if (shape.empty()) { | |||
| (void)dims.insert(dims.end(), 1); | |||
| } else { | |||
| dims.insert(dims.end(), shape.begin(), shape.end()); | |||
| (void)dims.insert(dims.end(), shape.begin(), shape.end()); | |||
| } | |||
| dnnl::memory::format_tag mem_tag = GetDefaultFormatTag(dims); | |||
| dnnl::memory::desc mem_desc(dims, dnnl::memory::data_type::f32, mem_tag); | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MKL_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MKL_CPU_KERNEL_H_ | |||
| @@ -33,21 +34,22 @@ class MKLCPUKernel : public CPUKernel { | |||
| protected: | |||
| bool BinaryBroadCast(std::vector<size_t> *src0_shape, std::vector<size_t> *src1_shape, | |||
| std::vector<size_t> *dst_shape); | |||
| std::vector<size_t> *dst_shape) const; | |||
| void GetPadding(const CNodePtr &kernel_node, const std::string &pad_mode, const std::vector<size_t> &src_shape, | |||
| const std::vector<size_t> &kernel_size, const std::vector<int> &stride, std::vector<int> *padding_l, | |||
| std::vector<int> *padding_r, const std::vector<int> &dilation); | |||
| std::vector<int> *padding_r, const std::vector<int> &dilation) const; | |||
| void AddArgument(int arg_key, const dnnl::memory::desc &mem_desc, bool alloc = false); | |||
| void SetArgumentHandle(int arg_key, void *ptr); | |||
| dnnl::memory::format_tag GetDefaultFormatTag(const dnnl::memory::dims &dims) const; | |||
| dnnl::memory::desc GetDefaultMemDesc(const std::vector<size_t> &shape); | |||
| dnnl::memory::desc GetDefaultMemDesc(const std::vector<size_t> &shape) const; | |||
| void ExecutePrimitive(); | |||
| std::unordered_map<int, dnnl::memory> arguments_; | |||
| std::shared_ptr<dnnl::primitive> primitive_{nullptr}; | |||
| inline dnnl::memory::desc formatted_md(const dnnl::memory::dims &dimensions, dnnl::memory::format_tag layout) { | |||
| return dnnl::memory::desc{{dimensions}, dnnl::memory::data_type::f32, layout}; | |||
| } | |||
| void Reorder(dnnl::memory *src_mem, dnnl::memory *dst_mem); | |||
| std::unordered_map<int, dnnl::memory> arguments_; | |||
| std::shared_ptr<dnnl::primitive> primitive_{nullptr}; | |||
| }; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "utils/log_adapter.h" | |||
| #include "dnnl.hpp" | |||
| @@ -33,6 +34,7 @@ dnnl::memory MKLKernelEngine::CreateMemory(const dnnl::memory::desc &mem_desc, b | |||
| return dnnl::memory(mem_desc, engine_, nullptr); | |||
| } | |||
| } | |||
| void MKLKernelEngine::Reorder(dnnl::memory *src_mem, dnnl::memory *dst_mem) { | |||
| dnnl::reorder(*src_mem, *dst_mem).execute(stream_, *src_mem, *dst_mem); | |||
| } | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,8 +13,10 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_MKL_KERNEL_ENGINE_H_ | |||
| #define MINDSPORE_MKL_KERNEL_ENGINE_H_ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MKL_KERNEL_ENGINE_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MKL_KERNEL_ENGINE_H_ | |||
| #include <cstdlib> | |||
| #include <algorithm> | |||
| #include <iostream> | |||
| @@ -46,10 +48,11 @@ class MKLKernelEngine { | |||
| private: | |||
| MKLKernelEngine() : engine_(dnnl::engine::kind::cpu, 0), stream_(engine_) {} | |||
| ~MKLKernelEngine() = default; | |||
| dnnl::engine engine_; | |||
| dnnl::stream stream_; | |||
| }; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_MKL_KERNEL_ENGINE_H_ | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MKL_KERNEL_ENGINE_H_ | |||
| @@ -1,65 +0,0 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mul_cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "utils/ms_utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void MulCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| std::vector<size_t> src0_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> src1_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| std::vector<size_t> dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| need_swap_ = BinaryBroadCast(&src0_shape, &src1_shape, &dst_shape); | |||
| dnnl::memory::desc src0_desc; | |||
| dnnl::memory::desc src1_desc; | |||
| if (need_swap_) { | |||
| src0_desc = GetDefaultMemDesc(src1_shape); | |||
| src1_desc = GetDefaultMemDesc(src0_shape); | |||
| } else { | |||
| src0_desc = GetDefaultMemDesc(src0_shape); | |||
| src1_desc = GetDefaultMemDesc(src1_shape); | |||
| } | |||
| dnnl::memory::desc dst_desc = GetDefaultMemDesc(dst_shape); | |||
| dnnl::binary::desc desc = dnnl::binary::desc(dnnl::algorithm::binary_mul, src0_desc, src1_desc, dst_desc); | |||
| auto prim_desc = dnnl::binary::primitive_desc(desc, MKLKernelEngine::Get().engine()); | |||
| primitive_ = std::make_shared<dnnl::binary>(prim_desc); | |||
| AddArgument(DNNL_ARG_SRC_0, src0_desc); | |||
| AddArgument(DNNL_ARG_SRC_1, src1_desc); | |||
| AddArgument(DNNL_ARG_DST, dst_desc); | |||
| } | |||
| bool MulCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() < 2 || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "mul error input output size!"; | |||
| } | |||
| if (need_swap_) { | |||
| SetArgumentHandle(DNNL_ARG_SRC_0, inputs[1]->addr); | |||
| SetArgumentHandle(DNNL_ARG_SRC_1, inputs[0]->addr); | |||
| } else { | |||
| SetArgumentHandle(DNNL_ARG_SRC_0, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_SRC_1, inputs[1]->addr); | |||
| } | |||
| SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr); | |||
| ExecutePrimitive(); | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,42 +0,0 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MUL_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MUL_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/arithmetic_cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class MulCPUKernel : public MKLCPUKernel { | |||
| public: | |||
| MulCPUKernel() = default; | |||
| ~MulCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| bool need_swap_{false}; | |||
| }; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MUL_CPU_KERNEL_H_ | |||
| @@ -39,7 +39,7 @@ void pad_input_shape(int *input_shape, int input_shape_len, int output_shape_len | |||
| } | |||
| } | |||
| #define BROADCAST_TO(type) \ | |||
| #define BROADCAST_TO_IMPL(type) \ | |||
| int broadcast_to_##type(const type *input, BroadcastShapeInfo *shape_info, type *output) { \ | |||
| if (input == NULL || output == NULL) { \ | |||
| return NNACL_NULL_PTR; \ | |||
| @@ -96,9 +96,9 @@ void pad_input_shape(int *input_shape, int input_shape_len, int output_shape_len | |||
| return NNACL_OK; \ | |||
| } | |||
| BROADCAST_TO(int) | |||
| BROADCAST_TO(float) | |||
| BROADCAST_TO(bool) | |||
| BROADCAST_TO_IMPL(int) | |||
| BROADCAST_TO_IMPL(float) | |||
| BROADCAST_TO_IMPL(bool) | |||
| #ifdef ENABLE_FP16 | |||
| BROADCAST_TO(float16_t) | |||
| BROADCAST_TO_IMPL(float16_t) | |||
| #endif | |||
| @@ -21,7 +21,7 @@ | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| #endif | |||
| #define BroadcastTo(type, input, shape_info, output) broadcast_to_##type(input, shape_info, output) | |||
| #define BROADCAST_TO(type, input, shape_info, output) broadcast_to_##type(input, shape_info, output) | |||
| int broadcast_to_int(const int *input, BroadcastShapeInfo *shape_info, int *output); | |||
| int broadcast_to_float(const float *input, BroadcastShapeInfo *shape_info, float *output); | |||
| int broadcast_to_bool(const bool *input, BroadcastShapeInfo *shape_info, bool *output); | |||
| @@ -59,17 +59,17 @@ int BroadcastToCPUKernel::Run() { | |||
| switch (data_type_) { | |||
| case kNumberTypeFloat32: | |||
| return BroadcastTo(float, reinterpret_cast<const float *>(input_data), &shape_info_, | |||
| reinterpret_cast<float *>(output_data)); | |||
| return BROADCAST_TO(float, reinterpret_cast<const float *>(input_data), &shape_info_, | |||
| reinterpret_cast<float *>(output_data)); | |||
| #ifdef ENABLE_FP16 | |||
| case kNumberTypeFloat16: | |||
| return BroadcastTo(float16_t, reinterpret_cast<const float16_t *>(input_data), &shape_info_, | |||
| reinterpret_cast<float16_t *>(output_data)); | |||
| return BROADCAST_TO(float16_t, reinterpret_cast<const float16_t *>(input_data), &shape_info_, | |||
| reinterpret_cast<float16_t *>(output_data)); | |||
| #endif | |||
| case kNumberTypeInt32: | |||
| case kNumberTypeInt: | |||
| return BroadcastTo(int, reinterpret_cast<const int *>(input_data), &shape_info_, | |||
| reinterpret_cast<int *>(output_data)); | |||
| return BROADCAST_TO(int, reinterpret_cast<const int *>(input_data), &shape_info_, | |||
| reinterpret_cast<int *>(output_data)); | |||
| default: | |||
| MS_LOG(ERROR) << "UnSupported data type: " << data_type_; | |||
| return RET_ERROR; | |||