| @@ -1,116 +1,116 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr size_t kBceInputNumWithWeight = 3; | |||
| template <typename T> | |||
| void BinaryCrossEntropyCpuKernel::LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss) { | |||
| if (input_size % 2 == 1) { | |||
| tmp_loss[0] += tmp_loss[input_size - 1]; | |||
| } | |||
| for (int stride = input_size / 2; stride > 0; stride = stride / 2) { | |||
| for (int i = 0; i < stride; i++) { | |||
| tmp_loss[i] += tmp_loss[i + stride]; | |||
| } | |||
| if (stride > 2 && stride % 2 == 1) { | |||
| tmp_loss[0] += tmp_loss[stride - 1]; | |||
| } | |||
| } | |||
| loss[0] += tmp_loss[0]; | |||
| if (reduction == 1) { | |||
| loss[0] /= static_cast<T>(input_size); | |||
| } | |||
| } | |||
// Element-wise binary cross entropy:
//   loss_i = -w_i * (y_i * log(x_i + eps) + (1 - y_i) * log(1 - x_i + eps))
// with w_i == 1 when no weight input was supplied. eps = 1e-12 keeps log()
// finite at x == 0 and x == 1.
// reduction_ == 0 ("none") writes per-element losses straight to the output;
// otherwise losses go to a scratch buffer that LaunchToScalar reduces.
// inputs: [0] predictions x, [1] targets y, [2] optional weight.
template <typename T>
void BinaryCrossEntropyCpuKernel::Launchkernel(const std::vector<AddressPtr> &inputs,
                                               const std::vector<AddressPtr> &workspace,
                                               const std::vector<AddressPtr> &outputs) {
  T *input_x = reinterpret_cast<T *>(inputs[0]->addr);
  T *input_y = reinterpret_cast<T *>(inputs[1]->addr);
  T *weight = nullptr;
  if (weight_defined_) {
    weight = reinterpret_cast<T *>(inputs[2]->addr);
  }
  T *loss = reinterpret_cast<T *>(outputs[0]->addr);
  std::vector<T> tmp_loss(input_size_);
  T epsilon = static_cast<T>(1e-12);
  T one = static_cast<T>(1);
  // Four near-identical loops: {none / reduce} x {weighted / unweighted}.
  // The branches are deliberately hoisted out of the per-element loop body.
  if (reduction_ == 0 && weight_defined_) {
    for (size_t i = 0; i < input_size_; i++) {
      T value =
        -weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      loss[i] = value;
    }
  } else if (reduction_ == 0 && (!weight_defined_)) {
    for (size_t i = 0; i < input_size_; i++) {
      T value = -(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      loss[i] = value;
    }
  } else if ((reduction_ != 0) && weight_defined_) {
    for (size_t i = 0; i < input_size_; i++) {
      T value =
        -weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      tmp_loss[i] = value;
    }
  } else {
    for (size_t i = 0; i < input_size_; i++) {
      T value = -(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      tmp_loss[i] = value;
    }
  }
  // Collapse the scratch losses to a scalar for "mean"/"sum".
  if (reduction_ != 0) {
    LaunchToScalar<T>(input_size_, reduction_, loss, tmp_loss.data());
  }
}
| bool BinaryCrossEntropyCpuKernel::Launch(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| if (input_size_ > 0) { | |||
| if (dtype_ == kNumberTypeFloat32) { | |||
| Launchkernel<float>(inputs, workspace, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat16) { | |||
| Launchkernel<float16>(inputs, workspace, outputs); | |||
| } | |||
| } | |||
| return true; | |||
| } | |||
// Caches shape and attribute information from the graph node.
// input_size_ becomes the total element count (it starts at 1 in the ctor).
// reduction_ mapping: 0 = "none", 1 = "mean", 2 = "sum"; "mean" is the ctor
// default, which is why it needs no explicit branch below.
void BinaryCrossEntropyCpuKernel::InitKernel(const CNodePtr &kernel_node) {
  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  for (size_t i = 0; i < input_shape.size(); i++) {
    input_size_ *= input_shape[i];
  }
  string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, "reduction");
  if (reduction == "none") {
    reduction_ = 0;
  } else if (reduction == "sum") {
    reduction_ = 2;
  }
  // A third input tensor means an element-wise weight was supplied.
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  weight_defined_ = (input_num == kBceInputNumWithWeight);
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
}
| } // namespace kernel | |||
| } // namespace mindspore | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr size_t kBceInputNumWithWeight = 3; | |||
| template <typename T> | |||
| void BinaryCrossEntropyCpuKernel::LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss) { | |||
| if (input_size % 2 == 1) { | |||
| tmp_loss[0] += tmp_loss[input_size - 1]; | |||
| } | |||
| for (int stride = input_size / 2; stride > 0; stride = stride / 2) { | |||
| for (int i = 0; i < stride; i++) { | |||
| tmp_loss[i] += tmp_loss[i + stride]; | |||
| } | |||
| if (stride > 2 && stride % 2 == 1) { | |||
| tmp_loss[0] += tmp_loss[stride - 1]; | |||
| } | |||
| } | |||
| loss[0] += tmp_loss[0]; | |||
| if (reduction == 1) { | |||
| loss[0] /= static_cast<T>(input_size); | |||
| } | |||
| } | |||
// Element-wise binary cross entropy:
//   loss_i = -w_i * (y_i * log(x_i + eps) + (1 - y_i) * log(1 - x_i + eps))
// with w_i == 1 when no weight input was supplied. eps = 1e-12 keeps log()
// finite at x == 0 and x == 1.
// reduction_ == 0 ("none") writes per-element losses straight to the output;
// otherwise losses go to a scratch buffer that LaunchToScalar reduces.
// inputs: [0] predictions x, [1] targets y, [2] optional weight.
template <typename T>
void BinaryCrossEntropyCpuKernel::Launchkernel(const std::vector<AddressPtr> &inputs,
                                               const std::vector<AddressPtr> &workspace,
                                               const std::vector<AddressPtr> &outputs) {
  T *input_x = reinterpret_cast<T *>(inputs[0]->addr);
  T *input_y = reinterpret_cast<T *>(inputs[1]->addr);
  T *weight = nullptr;
  if (weight_defined_) {
    weight = reinterpret_cast<T *>(inputs[2]->addr);
  }
  T *loss = reinterpret_cast<T *>(outputs[0]->addr);
  std::vector<T> tmp_loss(input_size_);
  T epsilon = static_cast<T>(1e-12);
  T one = static_cast<T>(1);
  // Four near-identical loops: {none / reduce} x {weighted / unweighted}.
  // The branches are deliberately hoisted out of the per-element loop body.
  if (reduction_ == 0 && weight_defined_) {
    for (size_t i = 0; i < input_size_; i++) {
      T value =
        -weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      loss[i] = value;
    }
  } else if (reduction_ == 0 && (!weight_defined_)) {
    for (size_t i = 0; i < input_size_; i++) {
      T value = -(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      loss[i] = value;
    }
  } else if ((reduction_ != 0) && weight_defined_) {
    for (size_t i = 0; i < input_size_; i++) {
      T value =
        -weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      tmp_loss[i] = value;
    }
  } else {
    for (size_t i = 0; i < input_size_; i++) {
      T value = -(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      tmp_loss[i] = value;
    }
  }
  // Collapse the scratch losses to a scalar for "mean"/"sum".
  if (reduction_ != 0) {
    LaunchToScalar<T>(input_size_, reduction_, loss, tmp_loss.data());
  }
}
| bool BinaryCrossEntropyCpuKernel::Launch(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| if (input_size_ > 0) { | |||
| if (dtype_ == kNumberTypeFloat32) { | |||
| Launchkernel<float>(inputs, workspace, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat16) { | |||
| Launchkernel<float16>(inputs, workspace, outputs); | |||
| } | |||
| } | |||
| return true; | |||
| } | |||
// Caches shape and attribute information from the graph node.
// input_size_ becomes the total element count (it starts at 1 in the ctor).
// reduction_ mapping: 0 = "none", 1 = "mean", 2 = "sum"; "mean" is the ctor
// default, which is why it needs no explicit branch below.
void BinaryCrossEntropyCpuKernel::InitKernel(const CNodePtr &kernel_node) {
  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  for (size_t i = 0; i < input_shape.size(); i++) {
    input_size_ *= input_shape[i];
  }
  string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, "reduction");
  if (reduction == "none") {
    reduction_ = 0;
  } else if (reduction == "sum") {
    reduction_ = 2;
  }
  // A third input tensor means an element-wise weight was supplied.
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  weight_defined_ = (input_num == kBceInputNumWithWeight);
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
}
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,71 +1,71 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H | |||
| #include <vector> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU implementation of the BinaryCrossEntropy forward operator.
// Inputs: predictions x, targets y, and an optional element-wise weight.
// The node's "reduction" attribute selects the output form:
// "none" (per-element), "mean" (ctor default) or "sum" (scalar).
class BinaryCrossEntropyCpuKernel : public CPUKernel {
 public:
  BinaryCrossEntropyCpuKernel() : input_size_(1), reduction_(1), weight_defined_(false) {}
  ~BinaryCrossEntropyCpuKernel() override = default;
  // Reads shape, dtype and the "reduction" attribute from the graph node.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Dispatches the typed implementation (float / float16).
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 private:
  // Pairwise-reduces tmp_loss into loss[0]; reduction == 1 also divides by count.
  template <typename T>
  void LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss);
  // Element-wise BCE; writes per-element losses or a reduced scalar.
  template <typename T>
  void Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
                    const std::vector<AddressPtr> &outputs);
  TypeId dtype_{kTypeUnknown};
  size_t input_size_;  // total number of elements in the input tensor
  int reduction_;      // 0: none, 1: mean (default), 2: sum
  bool weight_defined_;  // true: there are 3 inputs, false: there are 2 inputs(no [weight])
};
// Factory registrations: one 3-input form (x, y, weight) and one 2-input
// form (x, y — no weight) for each supported dtype (float16, float32).
MS_REG_CPU_KERNEL(BinaryCrossEntropy,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  BinaryCrossEntropyCpuKernel);
MS_REG_CPU_KERNEL(BinaryCrossEntropy,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  BinaryCrossEntropyCpuKernel);
// Weightless variants.
MS_REG_CPU_KERNEL(
  BinaryCrossEntropy,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BinaryCrossEntropyCpuKernel);
MS_REG_CPU_KERNEL(
  BinaryCrossEntropy,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BinaryCrossEntropyCpuKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H | |||
| #include <vector> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU implementation of the BinaryCrossEntropy forward operator.
// Inputs: predictions x, targets y, and an optional element-wise weight.
// The node's "reduction" attribute selects the output form:
// "none" (per-element), "mean" (ctor default) or "sum" (scalar).
class BinaryCrossEntropyCpuKernel : public CPUKernel {
 public:
  BinaryCrossEntropyCpuKernel() : input_size_(1), reduction_(1), weight_defined_(false) {}
  ~BinaryCrossEntropyCpuKernel() override = default;
  // Reads shape, dtype and the "reduction" attribute from the graph node.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Dispatches the typed implementation (float / float16).
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 private:
  // Pairwise-reduces tmp_loss into loss[0]; reduction == 1 also divides by count.
  template <typename T>
  void LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss);
  // Element-wise BCE; writes per-element losses or a reduced scalar.
  template <typename T>
  void Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
                    const std::vector<AddressPtr> &outputs);
  TypeId dtype_{kTypeUnknown};
  size_t input_size_;  // total number of elements in the input tensor
  int reduction_;      // 0: none, 1: mean (default), 2: sum
  bool weight_defined_;  // true: there are 3 inputs, false: there are 2 inputs(no [weight])
};
// Factory registrations: one 3-input form (x, y, weight) and one 2-input
// form (x, y — no weight) for each supported dtype (float16, float32).
MS_REG_CPU_KERNEL(BinaryCrossEntropy,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  BinaryCrossEntropyCpuKernel);
MS_REG_CPU_KERNEL(BinaryCrossEntropy,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  BinaryCrossEntropyCpuKernel);
// Weightless variants.
MS_REG_CPU_KERNEL(
  BinaryCrossEntropy,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BinaryCrossEntropyCpuKernel);
MS_REG_CPU_KERNEL(
  BinaryCrossEntropy,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BinaryCrossEntropyCpuKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H | |||
| @@ -1,102 +1,102 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/binary_cross_entropy_grad_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr size_t kBceGradInputNumWithWeight = 4; | |||
// Gradient of binary cross entropy with respect to the prediction x:
//   dBCE/dx_i = w_i * (x_i - y_i) / (x_i * (1 - x_i))
// The denominator is clamped to eps = 1e-12 to avoid division by zero at
// x == 0 or x == 1. For "none" reduction the incoming dloss is per-element;
// for "mean"/"sum" it is a scalar ("mean" additionally divides it by the
// element count, mirroring the forward mean).
// inputs: [0] x, [1] y, [2] dloss, [3] optional weight.
template <typename T>
void BinaryCrossEntropyGradCpuKernel::Launchkernel(const std::vector<AddressPtr> &inputs,
                                                   const std::vector<AddressPtr> &outputs) {
  T *input_x = reinterpret_cast<T *>(inputs[0]->addr);
  T *input_y = reinterpret_cast<T *>(inputs[1]->addr);
  T *dloss = reinterpret_cast<T *>(inputs[2]->addr);
  T *weight = nullptr;
  if (weight_defined_) {
    weight = reinterpret_cast<T *>(inputs[3]->addr);
  }
  T *dx = reinterpret_cast<T *>(outputs[0]->addr);
  T epsilon = static_cast<T>(1e-12);
  T one = static_cast<T>(1);
  if (reduction_ == 0) {
    // "none": each output element scales by its own incoming gradient.
    if (weight_defined_) {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = weight[i] * (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss[i];
      }
    } else {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss[i];
      }
    }
  } else {
    // "mean"/"sum": a single scalar gradient is broadcast to every element.
    T dloss1 = dloss[0];
    if (reduction_ == 1) {
      dloss1 = dloss[0] / static_cast<T>(input_size_);
    }
    if (weight_defined_) {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = weight[i] * (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss1;
      }
    } else {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss1;
      }
    }
  }
}
| bool BinaryCrossEntropyGradCpuKernel::Launch(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| if (input_size_ > 0) { | |||
| if (dtype_ == kNumberTypeFloat32) { | |||
| Launchkernel<float>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat16) { | |||
| Launchkernel<float16>(inputs, outputs); | |||
| } | |||
| } | |||
| return true; | |||
| } | |||
// Caches shape and attribute information from the graph node.
// input_size_ becomes the total element count (it starts at 1 in the ctor).
// reduction_ mapping: 0 = "none", 1 = "mean", 2 = "sum"; "mean" is the ctor
// default, which is why it needs no explicit branch below.
void BinaryCrossEntropyGradCpuKernel::InitKernel(const CNodePtr &kernel_node) {
  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  for (size_t i = 0; i < input_shape.size(); i++) {
    input_size_ *= input_shape[i];
  }
  string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, "reduction");
  if (reduction == "none") {
    reduction_ = 0;
  } else if (reduction == "sum") {
    reduction_ = 2;
  }
  // A fourth input tensor means an element-wise weight was supplied.
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  weight_defined_ = (input_num == kBceGradInputNumWithWeight);
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
}
| } // namespace kernel | |||
| } // namespace mindspore | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/binary_cross_entropy_grad_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr size_t kBceGradInputNumWithWeight = 4; | |||
// Gradient of binary cross entropy with respect to the prediction x:
//   dBCE/dx_i = w_i * (x_i - y_i) / (x_i * (1 - x_i))
// The denominator is clamped to eps = 1e-12 to avoid division by zero at
// x == 0 or x == 1. For "none" reduction the incoming dloss is per-element;
// for "mean"/"sum" it is a scalar ("mean" additionally divides it by the
// element count, mirroring the forward mean).
// inputs: [0] x, [1] y, [2] dloss, [3] optional weight.
template <typename T>
void BinaryCrossEntropyGradCpuKernel::Launchkernel(const std::vector<AddressPtr> &inputs,
                                                   const std::vector<AddressPtr> &outputs) {
  T *input_x = reinterpret_cast<T *>(inputs[0]->addr);
  T *input_y = reinterpret_cast<T *>(inputs[1]->addr);
  T *dloss = reinterpret_cast<T *>(inputs[2]->addr);
  T *weight = nullptr;
  if (weight_defined_) {
    weight = reinterpret_cast<T *>(inputs[3]->addr);
  }
  T *dx = reinterpret_cast<T *>(outputs[0]->addr);
  T epsilon = static_cast<T>(1e-12);
  T one = static_cast<T>(1);
  if (reduction_ == 0) {
    // "none": each output element scales by its own incoming gradient.
    if (weight_defined_) {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = weight[i] * (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss[i];
      }
    } else {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss[i];
      }
    }
  } else {
    // "mean"/"sum": a single scalar gradient is broadcast to every element.
    T dloss1 = dloss[0];
    if (reduction_ == 1) {
      dloss1 = dloss[0] / static_cast<T>(input_size_);
    }
    if (weight_defined_) {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = weight[i] * (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss1;
      }
    } else {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss1;
      }
    }
  }
}
| bool BinaryCrossEntropyGradCpuKernel::Launch(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| if (input_size_ > 0) { | |||
| if (dtype_ == kNumberTypeFloat32) { | |||
| Launchkernel<float>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat16) { | |||
| Launchkernel<float16>(inputs, outputs); | |||
| } | |||
| } | |||
| return true; | |||
| } | |||
// Caches shape and attribute information from the graph node.
// input_size_ becomes the total element count (it starts at 1 in the ctor).
// reduction_ mapping: 0 = "none", 1 = "mean", 2 = "sum"; "mean" is the ctor
// default, which is why it needs no explicit branch below.
void BinaryCrossEntropyGradCpuKernel::InitKernel(const CNodePtr &kernel_node) {
  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  for (size_t i = 0; i < input_shape.size(); i++) {
    input_size_ *= input_shape[i];
  }
  string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, "reduction");
  if (reduction == "none") {
    reduction_ = 0;
  } else if (reduction == "sum") {
    reduction_ = 2;
  }
  // A fourth input tensor means an element-wise weight was supplied.
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  weight_defined_ = (input_num == kBceGradInputNumWithWeight);
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
}
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,76 +1,76 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H | |||
| #include <vector> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU implementation of the BinaryCrossEntropy backward operator.
// Inputs: predictions x, targets y, incoming gradient dloss, and an optional
// element-wise weight. The "reduction" node attribute must match the forward
// op: "none" (per-element dloss), "mean" (ctor default) or "sum" (scalar dloss).
class BinaryCrossEntropyGradCpuKernel : public CPUKernel {
 public:
  BinaryCrossEntropyGradCpuKernel() : input_size_(1), reduction_(1), weight_defined_(false) {}
  ~BinaryCrossEntropyGradCpuKernel() override = default;
  // Reads shape, dtype and the "reduction" attribute from the graph node.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Dispatches the typed implementation (float / float16).
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 private:
  // Element-wise BCE gradient; writes dx for every element.
  template <typename T>
  void Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
  TypeId dtype_{kTypeUnknown};
  size_t input_size_;  // total number of elements in the input tensor
  int reduction_;      // 0: none, 1: mean (default), 2: sum
  bool weight_defined_;  // true: there are 4 inputs, false: there are 3 inputs(no [weight])
};
// Factory registrations: one 4-input form (x, y, dloss, weight) and one
// 3-input form (x, y, dloss — no weight) for each supported dtype.
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  BinaryCrossEntropyGradCpuKernel);
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  BinaryCrossEntropyGradCpuKernel);
// Weightless variants.
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  BinaryCrossEntropyGradCpuKernel);
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  BinaryCrossEntropyGradCpuKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H | |||
| #include <vector> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU implementation of the BinaryCrossEntropy backward operator.
// Inputs: predictions x, targets y, incoming gradient dloss, and an optional
// element-wise weight. The "reduction" node attribute must match the forward
// op: "none" (per-element dloss), "mean" (ctor default) or "sum" (scalar dloss).
class BinaryCrossEntropyGradCpuKernel : public CPUKernel {
 public:
  BinaryCrossEntropyGradCpuKernel() : input_size_(1), reduction_(1), weight_defined_(false) {}
  ~BinaryCrossEntropyGradCpuKernel() override = default;
  // Reads shape, dtype and the "reduction" attribute from the graph node.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Dispatches the typed implementation (float / float16).
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 private:
  // Element-wise BCE gradient; writes dx for every element.
  template <typename T>
  void Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
  TypeId dtype_{kTypeUnknown};
  size_t input_size_;  // total number of elements in the input tensor
  int reduction_;      // 0: none, 1: mean (default), 2: sum
  bool weight_defined_;  // true: there are 4 inputs, false: there are 3 inputs(no [weight])
};
// Factory registrations: one 4-input form (x, y, dloss, weight) and one
// 3-input form (x, y, dloss — no weight) for each supported dtype.
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  BinaryCrossEntropyGradCpuKernel);
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  BinaryCrossEntropyGradCpuKernel);
// Weightless variants.
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  BinaryCrossEntropyGradCpuKernel);
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  BinaryCrossEntropyGradCpuKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H | |||
| @@ -1,271 +1,271 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include <algorithm> | |||
| #include <utility> | |||
| #include "common/thread_pool.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// Pre-computes the byte size of every input and output tensor of the node
// and records them in input_size_list_ / output_size_list_ for buffer
// allocation. An empty shape (scalar) contributes one element's worth of
// bytes, and the std::max guard keeps a zero dimension from yielding a
// 0-byte buffer.
// NOTE(review): std::accumulate / std::multiplies are used below but this
// file includes neither <numeric> nor <functional> — presumably they arrive
// transitively via cpu_kernel.h; confirm.
void CPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  for (size_t input_index = 0; input_index < input_num; ++input_index) {
    TypeId type_id = AnfAlgo::GetInputDeviceDataType(kernel_node, input_index);
    size_t type_size = GetTypeByte(TypeIdToType(type_id));
    std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, input_index);
    // Seeding accumulate with type_size folds the per-element size into the
    // product, giving bytes directly.
    size_t tensor_size =
      shape.empty() ? type_size : std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
    tensor_size = std::max(tensor_size, type_size);
    input_size_list_.emplace_back(tensor_size);
  }
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  for (size_t output_index = 0; output_index < output_num; ++output_index) {
    TypeId type_id = AnfAlgo::GetOutputDeviceDataType(kernel_node, output_index);
    size_t type_size = GetTypeByte(TypeIdToType(type_id));
    std::vector<size_t> shape = AnfAlgo::GetOutputDeviceShape(kernel_node, output_index);
    size_t tensor_size =
      shape.empty() ? type_size : std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
    tensor_size = std::max(tensor_size, type_size);
    output_size_list_.emplace_back(tensor_size);
  }
}
// Template method: kernel-specific setup first (virtual InitKernel), then the
// generic input/output byte-size bookkeeping shared by all CPU kernels.
void CPUKernel::Init(const CNodePtr &kernel_node) {
  InitKernel(kernel_node);
  InitInputOutputSize(kernel_node);
}
| void CPUKernelUtils::ExpandDimsTo4(std::vector<size_t> *shape) { | |||
| auto len = shape->size(); | |||
| if (len < 4) { | |||
| for (size_t i = 0; i < 4 - len; ++i) { | |||
| shape->insert(shape->begin(), 1); | |||
| } | |||
| } | |||
| } | |||
| size_t CPUKernelUtils::CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2, | |||
| size_t dim3) { | |||
| size_t offset = dim0 * shape[1] * shape[2] * shape[3] + dim1 * shape[2] * shape[3] + dim2 * shape[3] + dim3; | |||
| return offset; | |||
| } | |||
| size_t CPUKernelUtils::GetElementNumOnAxis(const std::vector<size_t> &shape, int axis) { | |||
| if (axis < 0) { | |||
| axis = axis + SizeToInt(shape.size()); | |||
| } | |||
| size_t result = 1; | |||
| for (int j = 3; j > axis; --j) { | |||
| result *= shape[j]; | |||
| } | |||
| return result; | |||
| } | |||
| void CPUKernelUtils::GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num) { | |||
| size_t accumulation = 1; | |||
| element_num->emplace_back(1); | |||
| for (size_t i = shape.size() - 1; i > 0; --i) { | |||
| accumulation *= shape[i]; | |||
| element_num->emplace_back(accumulation); | |||
| } | |||
| std::reverse(element_num->begin(), element_num->end()); | |||
| } | |||
| void CPUKernelUtils::ParallelFor(const CTask &task, size_t count) { | |||
| auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum(); | |||
| const float block_size = 128.0; | |||
| size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num; | |||
| std::vector<common::Task> tasks; | |||
| size_t start = 0; | |||
| size_t once_compute_size = (count + thread_num - 1) / thread_num; | |||
| while (start < count) { | |||
| size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size); | |||
| auto block = [&, start, end]() { | |||
| task(start, end); | |||
| return common::SUCCESS; | |||
| }; | |||
| tasks.emplace_back(block); | |||
| start += once_compute_size; | |||
| } | |||
| common::ThreadPool::GetInstance().SyncRun(tasks); | |||
| } | |||
| std::vector<size_t> CPUKernelUtils::FlatShapeByAxis(const std::vector<size_t> &shape, int axis) { | |||
| if (axis < 0) { | |||
| axis = axis + SizeToInt(shape.size()); | |||
| } | |||
| size_t dim_row = 1; | |||
| size_t dim_col = 1; | |||
| std::vector<size_t> flat_shape; | |||
| for (size_t i = 0; i < shape.size(); ++i) { | |||
| if (SizeToInt(i) < axis) { | |||
| dim_row *= shape[i]; | |||
| } else { | |||
| dim_col *= shape[i]; | |||
| } | |||
| } | |||
| flat_shape.push_back(dim_row); | |||
| flat_shape.push_back(dim_col); | |||
| return flat_shape; | |||
| } | |||
// Builds an iterator that walks two (possibly broadcast) input tensors in
// lock-step with their broadcast output: the inputs are padded with leading
// 1s to the output rank, then per-dimension strides are derived (stride 0 on
// broadcast dimensions).  Order matters: BroadcastShape() must run before
// InitStrides(), which reads the padded shapes.
BroadcastIterator::BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
                                     std::vector<size_t> output_shape)
    : input_shape_a_(std::move(input_shape_a)),
      input_shape_b_(std::move(input_shape_b)),
      output_shape_(std::move(output_shape)) {
  output_dimension_ = SizeToInt(output_shape_.size());  // Assign dimension to int for iterator
  BroadcastShape();
  // Allocate strides memory
  input_strides_a_.resize(output_dimension_);
  input_strides_b_.resize(output_dimension_);
  input_back_strides_a_.resize(output_dimension_);
  input_back_strides_b_.resize(output_dimension_);
  coordinates_.resize(output_dimension_);
  InitStrides();
}
| void BroadcastIterator::SetPos(size_t pos) { | |||
| for (int i = output_dimension_ - 1; i >= 0 && pos != 0; --i) { | |||
| coordinates_[i] = pos % output_shape_[i]; | |||
| input_pos_[0] += coordinates_[i] * input_strides_a_[i]; | |||
| input_pos_[1] += coordinates_[i] * input_strides_b_[i]; | |||
| pos /= output_shape_[i]; | |||
| } | |||
| } | |||
// Advances to the next output element: an odometer-style increment over the
// output coordinates.  Input offsets are updated incrementally — stepping
// forward along a dimension adds its stride, wrapping a dimension back to 0
// subtracts the pre-computed back-stride for that dimension.
void BroadcastIterator::GenNextPos() {
  // Calculate output next coordinate
  for (int i = output_dimension_ - 1; i >= 0; --i) {
    if (coordinates_[i] + 1 == output_shape_[i]) {
      coordinates_[i] = 0;
      input_pos_[0] -= input_back_strides_a_[i];
      input_pos_[1] -= input_back_strides_b_[i];
    } else {
      ++coordinates_[i];
      input_pos_[0] += input_strides_a_[i];
      input_pos_[1] += input_strides_b_[i];
      break;  // no carry into the next dimension
    }
  }
}
| void BroadcastIterator::BroadcastShape() { | |||
| int input_dimension_a = input_shape_a_.size(); | |||
| if (input_dimension_a < output_dimension_) { | |||
| input_shape_a_.insert(input_shape_a_.begin(), output_dimension_ - input_dimension_a, 1); | |||
| } | |||
| int input_dimension_b = input_shape_b_.size(); | |||
| if (input_dimension_b < output_dimension_) { | |||
| input_shape_b_.insert(input_shape_b_.begin(), output_dimension_ - input_dimension_b, 1); | |||
| } | |||
| } | |||
| void BroadcastIterator::InitStrides() { | |||
| input_strides_a_[output_dimension_ - 1] = 1; | |||
| input_strides_b_[output_dimension_ - 1] = 1; | |||
| for (int i = output_dimension_ - 2; i >= 0; --i) { | |||
| input_strides_a_[i] = input_shape_a_[i + 1] * input_strides_a_[i + 1]; | |||
| input_strides_b_[i] = input_shape_b_[i + 1] * input_strides_b_[i + 1]; | |||
| input_back_strides_a_[i + 1] = (input_shape_a_[i + 1] - 1) * input_strides_a_[i + 1]; | |||
| input_back_strides_b_[i + 1] = (input_shape_b_[i + 1] - 1) * input_strides_b_[i + 1]; | |||
| } | |||
| // Update strides for broadcast | |||
| // While the axis value is 1, the stride is 0 | |||
| std::transform(input_strides_a_.begin(), input_strides_a_.end(), input_shape_a_.begin(), input_strides_a_.begin(), | |||
| [](const auto &a, const auto &b) { return b == 1 ? 0 : a; }); | |||
| std::transform(input_strides_b_.begin(), input_strides_b_.end(), input_shape_b_.begin(), input_strides_b_.begin(), | |||
| [](const auto &a, const auto &b) { return b == 1 ? 0 : a; }); | |||
| } | |||
// Iterates in the order of the TRANSPOSED (output) shape while producing
// linear positions into the ORIGINAL input buffer.
// `output_shape` is `input_shape` permuted by `axes`; `axes` is the
// permutation applied.
TransposeIterator::TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes,
                                     const std::vector<size_t> &input_shape)
    : shape_(std::move(output_shape)), axes_(std::move(axes)) {
  // Row-major strides of the original input shape.
  dimension_ = shape_.size();
  std::vector<uint32_t> strides(dimension_, 1);
  for (int i = dimension_ - 2; i >= 0; --i) {
    strides[i] = input_shape[i + 1] * strides[i + 1];
  }
  // Permute the strides into output order and pre-compute back strides.
  strides_.resize(dimension_);
  back_strides_.resize(dimension_);
  for (int i = dimension_ - 1; i >= 0; --i) {
    strides_[i] = strides[axes_[i]];
    back_strides_[i] = (shape_[i] - 1) * strides_[i];
  }
  // Coordinates are filled in later by SetPos / GenNextPos.
  coordinates_.resize(dimension_);
}
| void TransposeIterator::SetPos(size_t pos) { | |||
| for (int i = dimension_ - 1; i >= 0 && pos != 0; --i) { | |||
| coordinates_[i] = pos % shape_[i]; | |||
| pos_ += coordinates_[i] * strides_[i]; | |||
| pos /= shape_[i]; | |||
| } | |||
| } | |||
// Odometer-style increment over the transposed coordinates; pos_ tracks the
// corresponding input position incrementally (add a stride when stepping
// forward, subtract the back-stride when a dimension wraps to 0).
void TransposeIterator::GenNextPos() {
  for (int i = dimension_ - 1; i >= 0; --i) {
    if (coordinates_[i] + 1 == shape_[i]) {
      coordinates_[i] = 0;
      pos_ -= back_strides_[i];
    } else {
      coordinates_[i]++;
      pos_ += strides_[i];
      break;  // no carry into the next dimension
    }
  }
}
| std::vector<size_t> CPUKernelUtils::GetBroadcastShape(const std::vector<size_t> &x, const std::vector<size_t> &y) { | |||
| size_t x_len = x.size(); | |||
| size_t y_len = y.size(); | |||
| size_t length = x_len < y_len ? x_len : y_len; | |||
| std::vector<size_t> broadcast_shape; | |||
| std::vector<size_t> broadcast_shape_back; | |||
| for (int i = -length; i < 0; ++i) { | |||
| if (x[x_len + i] == 1) { | |||
| broadcast_shape_back.push_back(y[y_len + i]); | |||
| } else if (y[y_len + i] == 1) { | |||
| broadcast_shape_back.push_back(x[x_len + i]); | |||
| } else if (x[x_len + i] == y[y_len + i]) { | |||
| broadcast_shape_back.push_back(x[x_len + i]); | |||
| } | |||
| } | |||
| if (length == x_len) { | |||
| for (size_t i = 0; i < y_len - length; ++i) { | |||
| broadcast_shape.push_back(y[i]); | |||
| } | |||
| } else { | |||
| for (size_t i = 0; i < x_len - length; ++i) { | |||
| broadcast_shape.push_back(x[i]); | |||
| } | |||
| } | |||
| for (size_t i = 0; i < length; ++i) { | |||
| broadcast_shape.push_back(broadcast_shape_back[i]); | |||
| } | |||
| return broadcast_shape; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include <algorithm> | |||
| #include <utility> | |||
| #include "common/thread_pool.h" | |||
namespace mindspore {
namespace kernel {
// NOTE(review): everything below is a byte-for-byte duplicate of the
// cpu_kernel.cc definitions that appear earlier in this extracted chunk
// (diff-view residue).  Kept unchanged here; the two copies must be kept in
// sync or deduplicated.
// Pre-computes the byte size of every input/output tensor of the node.
void CPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  for (size_t input_index = 0; input_index < input_num; ++input_index) {
    TypeId type_id = AnfAlgo::GetInputDeviceDataType(kernel_node, input_index);
    size_t type_size = GetTypeByte(TypeIdToType(type_id));
    std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, input_index);
    // Scalars (empty shape) still occupy one element; never report 0 bytes.
    size_t tensor_size =
      shape.empty() ? type_size : std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
    tensor_size = std::max(tensor_size, type_size);
    input_size_list_.emplace_back(tensor_size);
  }
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  for (size_t output_index = 0; output_index < output_num; ++output_index) {
    TypeId type_id = AnfAlgo::GetOutputDeviceDataType(kernel_node, output_index);
    size_t type_size = GetTypeByte(TypeIdToType(type_id));
    std::vector<size_t> shape = AnfAlgo::GetOutputDeviceShape(kernel_node, output_index);
    size_t tensor_size =
      shape.empty() ? type_size : std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
    tensor_size = std::max(tensor_size, type_size);
    output_size_list_.emplace_back(tensor_size);
  }
}
// Kernel-specific setup first, then generic size bookkeeping.
void CPUKernel::Init(const CNodePtr &kernel_node) {
  InitKernel(kernel_node);
  InitInputOutputSize(kernel_node);
}
// Left-pads *shape with 1s until it is 4-D.
void CPUKernelUtils::ExpandDimsTo4(std::vector<size_t> *shape) {
  auto len = shape->size();
  if (len < 4) {
    for (size_t i = 0; i < 4 - len; ++i) {
      shape->insert(shape->begin(), 1);
    }
  }
}
// Flattens a 4-D coordinate into a linear row-major offset.
size_t CPUKernelUtils::CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2,
                                  size_t dim3) {
  size_t offset = dim0 * shape[1] * shape[2] * shape[3] + dim1 * shape[2] * shape[3] + dim2 * shape[3] + dim3;
  return offset;
}
// Product of the dimensions after `axis` (j starts at 3: assumes a 4-D shape).
size_t CPUKernelUtils::GetElementNumOnAxis(const std::vector<size_t> &shape, int axis) {
  if (axis < 0) {
    axis = axis + SizeToInt(shape.size());
  }
  size_t result = 1;
  for (int j = 3; j > axis; --j) {
    result *= shape[j];
  }
  return result;
}
// Row-major element strides for every dimension, in shape order.
// NOTE(review): underflows its unsigned loop index for an empty shape.
void CPUKernelUtils::GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num) {
  size_t accumulation = 1;
  element_num->emplace_back(1);
  for (size_t i = shape.size() - 1; i > 0; --i) {
    accumulation *= shape[i];
    element_num->emplace_back(accumulation);
  }
  std::reverse(element_num->begin(), element_num->end());
}
// Runs `task` over [0, count) in ~128-element chunks on the shared pool.
// NOTE(review): divides by zero when count == 0 (thread_num becomes 0).
void CPUKernelUtils::ParallelFor(const CTask &task, size_t count) {
  auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
  const float block_size = 128.0;
  size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
  std::vector<common::Task> tasks;
  size_t start = 0;
  size_t once_compute_size = (count + thread_num - 1) / thread_num;
  while (start < count) {
    size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size);
    auto block = [&, start, end]() {
      task(start, end);
      return common::SUCCESS;
    };
    tasks.emplace_back(block);
    start += once_compute_size;
  }
  common::ThreadPool::GetInstance().SyncRun(tasks);
}
// Collapses `shape` to {rows, cols} around `axis` (negative axis counts from the end).
std::vector<size_t> CPUKernelUtils::FlatShapeByAxis(const std::vector<size_t> &shape, int axis) {
  if (axis < 0) {
    axis = axis + SizeToInt(shape.size());
  }
  size_t dim_row = 1;
  size_t dim_col = 1;
  std::vector<size_t> flat_shape;
  for (size_t i = 0; i < shape.size(); ++i) {
    if (SizeToInt(i) < axis) {
      dim_row *= shape[i];
    } else {
      dim_col *= shape[i];
    }
  }
  flat_shape.push_back(dim_row);
  flat_shape.push_back(dim_col);
  return flat_shape;
}
// Lock-step iterator over two broadcast inputs and their broadcast output.
BroadcastIterator::BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
                                     std::vector<size_t> output_shape)
    : input_shape_a_(std::move(input_shape_a)),
      input_shape_b_(std::move(input_shape_b)),
      output_shape_(std::move(output_shape)) {
  output_dimension_ = SizeToInt(output_shape_.size());  // Assign dimension to int for iterator
  BroadcastShape();
  // Allocate strides memory
  input_strides_a_.resize(output_dimension_);
  input_strides_b_.resize(output_dimension_);
  input_back_strides_a_.resize(output_dimension_);
  input_back_strides_b_.resize(output_dimension_);
  coordinates_.resize(output_dimension_);
  InitStrides();
}
// Translates linear output index `pos` into per-input offsets.
// NOTE(review): accumulates into input_pos_ — only valid once per iterator.
void BroadcastIterator::SetPos(size_t pos) {
  for (int i = output_dimension_ - 1; i >= 0 && pos != 0; --i) {
    coordinates_[i] = pos % output_shape_[i];
    input_pos_[0] += coordinates_[i] * input_strides_a_[i];
    input_pos_[1] += coordinates_[i] * input_strides_b_[i];
    pos /= output_shape_[i];
  }
}
// Odometer increment over output coordinates, updating both input offsets.
void BroadcastIterator::GenNextPos() {
  // Calculate output next coordinate
  for (int i = output_dimension_ - 1; i >= 0; --i) {
    if (coordinates_[i] + 1 == output_shape_[i]) {
      coordinates_[i] = 0;
      input_pos_[0] -= input_back_strides_a_[i];
      input_pos_[1] -= input_back_strides_b_[i];
    } else {
      ++coordinates_[i];
      input_pos_[0] += input_strides_a_[i];
      input_pos_[1] += input_strides_b_[i];
      break;
    }
  }
}
// Pads both input shapes with leading 1s to the output rank.
void BroadcastIterator::BroadcastShape() {
  int input_dimension_a = input_shape_a_.size();
  if (input_dimension_a < output_dimension_) {
    input_shape_a_.insert(input_shape_a_.begin(), output_dimension_ - input_dimension_a, 1);
  }
  int input_dimension_b = input_shape_b_.size();
  if (input_dimension_b < output_dimension_) {
    input_shape_b_.insert(input_shape_b_.begin(), output_dimension_ - input_dimension_b, 1);
  }
}
// Row-major strides/back-strides; broadcast axes (extent 1) get stride 0.
// NOTE(review): writes to index output_dimension_ - 1 — out of range for a
// rank-0 output.
void BroadcastIterator::InitStrides() {
  input_strides_a_[output_dimension_ - 1] = 1;
  input_strides_b_[output_dimension_ - 1] = 1;
  for (int i = output_dimension_ - 2; i >= 0; --i) {
    input_strides_a_[i] = input_shape_a_[i + 1] * input_strides_a_[i + 1];
    input_strides_b_[i] = input_shape_b_[i + 1] * input_strides_b_[i + 1];
    input_back_strides_a_[i + 1] = (input_shape_a_[i + 1] - 1) * input_strides_a_[i + 1];
    input_back_strides_b_[i + 1] = (input_shape_b_[i + 1] - 1) * input_strides_b_[i + 1];
  }
  // Update strides for broadcast
  // While the axis value is 1, the stride is 0
  std::transform(input_strides_a_.begin(), input_strides_a_.end(), input_shape_a_.begin(), input_strides_a_.begin(),
                 [](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
  std::transform(input_strides_b_.begin(), input_strides_b_.end(), input_shape_b_.begin(), input_strides_b_.begin(),
                 [](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
}
// Maps transposed-order positions to linear positions in the input buffer.
TransposeIterator::TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes,
                                     const std::vector<size_t> &input_shape)
    : shape_(std::move(output_shape)), axes_(std::move(axes)) {
  // Calculate strides
  dimension_ = shape_.size();
  std::vector<uint32_t> strides(dimension_, 1);
  for (int i = dimension_ - 2; i >= 0; --i) {
    strides[i] = input_shape[i + 1] * strides[i + 1];
  }
  // Swap shape and strides and calculate back strides
  strides_.resize(dimension_);
  back_strides_.resize(dimension_);
  for (int i = dimension_ - 1; i >= 0; --i) {
    strides_[i] = strides[axes_[i]];
    back_strides_[i] = (shape_[i] - 1) * strides_[i];
  }
  // Calculate coordinate by pos
  coordinates_.resize(dimension_);
}
// Seeds the iterator at transposed linear index `pos`.
// NOTE(review): accumulates into pos_ — only valid once per iterator.
void TransposeIterator::SetPos(size_t pos) {
  for (int i = dimension_ - 1; i >= 0 && pos != 0; --i) {
    coordinates_[i] = pos % shape_[i];
    pos_ += coordinates_[i] * strides_[i];
    pos /= shape_[i];
  }
}
// Odometer increment in output order; pos_ tracks the input position.
void TransposeIterator::GenNextPos() {
  for (int i = dimension_ - 1; i >= 0; --i) {
    if (coordinates_[i] + 1 == shape_[i]) {
      coordinates_[i] = 0;
      pos_ -= back_strides_[i];
    } else {
      coordinates_[i]++;
      pos_ += strides_[i];
      break;
    }
  }
}
// numpy-style broadcast shape of x and y.
// NOTE(review): `int i = -length` negates a size_t (unsigned wrap-around
// narrowed to int); incompatible trailing dims are silently skipped, after
// which broadcast_shape_back[i] below can index out of range.
std::vector<size_t> CPUKernelUtils::GetBroadcastShape(const std::vector<size_t> &x, const std::vector<size_t> &y) {
  size_t x_len = x.size();
  size_t y_len = y.size();
  size_t length = x_len < y_len ? x_len : y_len;
  std::vector<size_t> broadcast_shape;
  std::vector<size_t> broadcast_shape_back;
  for (int i = -length; i < 0; ++i) {
    if (x[x_len + i] == 1) {
      broadcast_shape_back.push_back(y[y_len + i]);
    } else if (y[y_len + i] == 1) {
      broadcast_shape_back.push_back(x[x_len + i]);
    } else if (x[x_len + i] == y[y_len + i]) {
      broadcast_shape_back.push_back(x[x_len + i]);
    }
  }
  if (length == x_len) {
    for (size_t i = 0; i < y_len - length; ++i) {
      broadcast_shape.push_back(y[i]);
    }
  } else {
    for (size_t i = 0; i < x_len - length; ++i) {
      broadcast_shape.push_back(x[i]);
    }
  }
  for (size_t i = 0; i < length; ++i) {
    broadcast_shape.push_back(broadcast_shape_back[i]);
  }
  return broadcast_shape;
}
}  // namespace kernel
}  // namespace mindspore
| @@ -1,205 +1,205 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_ | |||
| #include <functional> | |||
| #include <memory> | |||
| #include <numeric> | |||
| #include <string> | |||
| #include <thread> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "backend/kernel_compiler/common_utils.h" | |||
| #include "ir/anf.h" | |||
| using mindspore::kernel::Address; | |||
| using mindspore::kernel::AddressPtr; | |||
| using CTask = std::function<void(size_t, size_t)>; | |||
namespace mindspore {
namespace kernel {
// Names of the primitive attributes that CPU kernels read in InitKernel.
// NOTE(review): namespace-scope `const char[]` in a header has internal
// linkage, so every including TU carries its own copy; `inline constexpr`
// would share one definition — confirm project convention before changing.
const char KERNEL_SIZE[] = "kernel_size";
const char STRIDE[] = "stride";
const char STRIDES[] = "strides";
const char DILATION[] = "dilation";
const char DILATIONS[] = "dilations";
const char FORMAT[] = "format";
const char PAD[] = "pad";
const char PAD_LIST[] = "pad_list";
const char PAD_MODE[] = "pad_mode";
const char PAD_MODE_LOWER_SAME[] = "same";
const char PAD_MODE_LOWER_VALID[] = "valid";
const char PAD_MODE_UPPER_SAME[] = "SAME";
const char PAD_MODE_UPPER_VALID[] = "VALID";
const char TRANSPOSE_A[] = "transpose_a";
const char TRANSPOSE_B[] = "transpose_b";
const char IS_GRAD[] = "is_grad";
const char TRANSPOSE_NO = 'N';
const char TRANSPOSE_YES = 'T';
const char AXIS[] = "axis";
const char DIM[] = "dim";
const char BEGIN[] = "begin";
const char END[] = "end";
const char SIZE[] = "size";
const char USE_NESTEROV[] = "use_nesterov";
const char GROUP[] = "group";
const char START[] = "start";
const char LIMIT[] = "limit";
const char DELTA[] = "delta";
const char SORTED[] = "sorted";
const char ADJ_ST[] = "adjoint_st";
const char ADJ_dT[] = "adjoint_dt";
// Tags identifying which element-wise operation a generic arithmetic CPU
// kernel should perform at runtime.
enum OperateType {
  ADD = 0,
  SUB,
  MUL,
  DIV,
  SQUARE,
  SQRT,
  POW,
  REALDIV,
  FLOORDIV,
  MOD,
  FLOORMOD,
  NEG,
  LESS,
  ASSIGNADD,
  RELUGRAD,
  RELU6GRAD,
  ABSGRAD,
  TANHGRAD,
  SQRTGRAD,
  SIGMOIDGRAD,
  ONESLIKE,
  ZEROSLIKE,
  SIGN,
  EQUAL,
  NOTEQUAL,
  LESSEQUAL,
  LOGICALAND,
  LOGICALOR,
  LOGICALNOT,
  FLOOR,
  SQUAREDDIFFERENCE,
  GREATER,
  GREATEREQUAL,
  RECIPROCAL,
  GELU,
  GELUGRAD,
  ASIN,
  ACOS,
  ATAN,
  ASINGRAD,
  ACOSGRAD,
  ATANGRAD,
  SIN,
  COS,
  TAN,
  SINH,
  COSH,
  ASINH,
  ACOSH,
  ATANH,
  ASINHGRAD,
  ACOSHGRAD,
  ATAN2,
  RINT,
  ROUND,
  IDENTITY,
};
// Common base of every CPU kernel.  Subclasses implement InitKernel (parse
// the graph node) and the three-argument Launch (run the computation); the
// four-argument KernelMod::Launch simply drops the unused stream pointer.
class CPUKernel : public kernel::KernelMod {
 public:
  CPUKernel() = default;
  ~CPUKernel() override = default;
  // Runs InitKernel, then fills the input/output size lists (see the .cc).
  virtual void Init(const CNodePtr &kernel_node);
  virtual void InitKernel(const CNodePtr &kernel_node) = 0;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void * /*stream_ptr*/) override {
    return Launch(inputs, workspace, outputs);
  };
  virtual bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
                      const std::vector<AddressPtr> &outputs) = 0;
  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
 protected:
  virtual void InitInputOutputSize(const CNodePtr &kernel_node);
  std::vector<size_t> input_size_list_;      // bytes per input tensor (filled by InitInputOutputSize)
  std::vector<size_t> output_size_list_;     // bytes per output tensor
  std::vector<size_t> workspace_size_list_;  // bytes per scratch buffer
};
// Stateless shape/offset/parallelism helpers shared by the CPU kernels.
class CPUKernelUtils {
 public:
  static void ExpandDimsTo4(std::vector<size_t> *shape);
  static size_t CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2, size_t dim3);
  static size_t GetElementNumOnAxis(const std::vector<size_t> &shape, int axis);
  static void GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num);
  static void ParallelFor(const CTask &task, size_t count);
  static std::vector<size_t> FlatShapeByAxis(const std::vector<size_t> &shape, int axis);
  static std::vector<size_t> GetBroadcastShape(const std::vector<size_t> &x, const std::vector<size_t> &y);
};
// Walks the elements of a broadcast binary op: for each linear output index it
// yields the matching element offsets into both inputs (stride 0 on broadcast
// axes, so those dimensions re-read the same input element).
// NOTE(review): std::array member is used below but <array> is not included
// directly here — relies on a transitive include.
class BroadcastIterator {
 public:
  BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
                    std::vector<size_t> output_shape);
  virtual ~BroadcastIterator() = default;
  inline size_t GetInputPosA() const { return input_pos_[0]; }
  inline size_t GetInputPosB() const { return input_pos_[1]; }
  void SetPos(size_t pos);
  void GenNextPos();
 private:
  void BroadcastShape();
  void InitStrides();
  std::vector<size_t> coordinates_;
  std::vector<size_t> input_shape_a_;
  std::vector<size_t> input_shape_b_;
  std::vector<size_t> output_shape_;
  std::vector<size_t> input_strides_a_;
  std::vector<size_t> input_strides_b_;
  std::vector<size_t> input_back_strides_a_;
  std::vector<size_t> input_back_strides_b_;
  std::array<size_t, 2> input_pos_{0};  // [0] = offset into input a, [1] = into input b
  int output_dimension_{0};
};
// Produces, for successive positions in transposed (output) order, the
// matching linear position in the untransposed input buffer.
class TransposeIterator {
 public:
  TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes, const std::vector<size_t> &input_shape);
  virtual ~TransposeIterator() = default;
  inline size_t GetPos() const { return pos_; }
  void SetPos(size_t pos);
  void GenNextPos();
 private:
  int dimension_{0};
  std::vector<size_t> coordinates_;
  std::vector<size_t> shape_;
  std::vector<size_t> strides_;
  std::vector<size_t> back_strides_;
  std::vector<size_t> axes_;
  size_t pos_{0};  // current linear position in the input buffer
};
}  // namespace kernel
}  // namespace mindspore
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_ | |||
| #include <functional> | |||
| #include <memory> | |||
| #include <numeric> | |||
| #include <string> | |||
| #include <thread> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "backend/kernel_compiler/common_utils.h" | |||
| #include "ir/anf.h" | |||
| using mindspore::kernel::Address; | |||
| using mindspore::kernel::AddressPtr; | |||
| using CTask = std::function<void(size_t, size_t)>; | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// NOTE(review): duplicate of the attribute-name constants earlier in this
// extracted chunk (diff-view residue); kept byte-identical.
const char KERNEL_SIZE[] = "kernel_size";
const char STRIDE[] = "stride";
const char STRIDES[] = "strides";
const char DILATION[] = "dilation";
const char DILATIONS[] = "dilations";
const char FORMAT[] = "format";
const char PAD[] = "pad";
const char PAD_LIST[] = "pad_list";
const char PAD_MODE[] = "pad_mode";
const char PAD_MODE_LOWER_SAME[] = "same";
const char PAD_MODE_LOWER_VALID[] = "valid";
const char PAD_MODE_UPPER_SAME[] = "SAME";
const char PAD_MODE_UPPER_VALID[] = "VALID";
const char TRANSPOSE_A[] = "transpose_a";
const char TRANSPOSE_B[] = "transpose_b";
const char IS_GRAD[] = "is_grad";
const char TRANSPOSE_NO = 'N';
const char TRANSPOSE_YES = 'T';
const char AXIS[] = "axis";
const char DIM[] = "dim";
const char BEGIN[] = "begin";
const char END[] = "end";
const char SIZE[] = "size";
const char USE_NESTEROV[] = "use_nesterov";
const char GROUP[] = "group";
const char START[] = "start";
const char LIMIT[] = "limit";
const char DELTA[] = "delta";
const char SORTED[] = "sorted";
const char ADJ_ST[] = "adjoint_st";
const char ADJ_dT[] = "adjoint_dt";
// Element-wise operation tags for the generic arithmetic CPU kernels.
// NOTE(review): duplicate of the enum earlier in this extracted chunk
// (diff-view residue); kept byte-identical.
enum OperateType {
  ADD = 0,
  SUB,
  MUL,
  DIV,
  SQUARE,
  SQRT,
  POW,
  REALDIV,
  FLOORDIV,
  MOD,
  FLOORMOD,
  NEG,
  LESS,
  ASSIGNADD,
  RELUGRAD,
  RELU6GRAD,
  ABSGRAD,
  TANHGRAD,
  SQRTGRAD,
  SIGMOIDGRAD,
  ONESLIKE,
  ZEROSLIKE,
  SIGN,
  EQUAL,
  NOTEQUAL,
  LESSEQUAL,
  LOGICALAND,
  LOGICALOR,
  LOGICALNOT,
  FLOOR,
  SQUAREDDIFFERENCE,
  GREATER,
  GREATEREQUAL,
  RECIPROCAL,
  GELU,
  GELUGRAD,
  ASIN,
  ACOS,
  ATAN,
  ASINGRAD,
  ACOSGRAD,
  ATANGRAD,
  SIN,
  COS,
  TAN,
  SINH,
  COSH,
  ASINH,
  ACOSH,
  ATANH,
  ASINHGRAD,
  ACOSHGRAD,
  ATAN2,
  RINT,
  ROUND,
  IDENTITY,
};
| class CPUKernel : public kernel::KernelMod { | |||
| public: | |||
| CPUKernel() = default; | |||
| ~CPUKernel() override = default; | |||
| virtual void Init(const CNodePtr &kernel_node); | |||
| virtual void InitKernel(const CNodePtr &kernel_node) = 0; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void * /*stream_ptr*/) override { | |||
| return Launch(inputs, workspace, outputs); | |||
| }; | |||
| virtual bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) = 0; | |||
| const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } | |||
| const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } | |||
| const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } | |||
| protected: | |||
| virtual void InitInputOutputSize(const CNodePtr &kernel_node); | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| std::vector<size_t> workspace_size_list_; | |||
| }; | |||
// Stateless shape/indexing helpers shared by CPU kernel implementations.
class CPUKernelUtils {
 public:
  // Pads *shape until it has 4 dimensions (presumably with 1s) -- TODO confirm
  // the padding side against the .cc.
  static void ExpandDimsTo4(std::vector<size_t> *shape);
  // Flat element offset of coordinate (dim0, dim1, dim2, dim3) in a 4-D shape.
  static size_t CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2, size_t dim3);
  // Element count associated with `axis` -- NOTE(review): likely the product
  // of the trailing dimensions after `axis`; confirm in the .cc.
  static size_t GetElementNumOnAxis(const std::vector<size_t> &shape, int axis);
  // Fills *element_num with per-dimension element counts/strides for `shape`.
  static void GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num);
  // Runs `task` over the index range [0, count), possibly split across threads.
  static void ParallelFor(const CTask &task, size_t count);
  // Collapses `shape` into a lower-rank view split around `axis`.
  static std::vector<size_t> FlatShapeByAxis(const std::vector<size_t> &shape, int axis);
  // Broadcast result shape of x and y -- presumably numpy-style right-aligned
  // broadcasting; verify against the implementation.
  static std::vector<size_t> GetBroadcastShape(const std::vector<size_t> &x, const std::vector<size_t> &y);
};
// Walks the flattened broadcast output space of two input shapes while
// maintaining the corresponding flat element index into each input, so binary
// element-wise kernels can read both operands without recomputing coordinates.
class BroadcastIterator {
 public:
  BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
                    std::vector<size_t> output_shape);
  virtual ~BroadcastIterator() = default;
  // Flat index of the current element in input a / input b.
  inline size_t GetInputPosA() const { return input_pos_[0]; }
  inline size_t GetInputPosB() const { return input_pos_[1]; }
  // Repositions the iterator at flat output position `pos`.
  void SetPos(size_t pos);
  // Advances to the next flat output position.
  void GenNextPos();
 private:
  void BroadcastShape();
  void InitStrides();
  std::vector<size_t> coordinates_;  // current multi-dim output coordinate
  std::vector<size_t> input_shape_a_;
  std::vector<size_t> input_shape_b_;
  std::vector<size_t> output_shape_;
  std::vector<size_t> input_strides_a_;
  std::vector<size_t> input_strides_b_;
  // Back-strides -- presumably used by GenNextPos to rewind an axis in O(1)
  // when a coordinate wraps; confirm in the .cc.
  std::vector<size_t> input_back_strides_a_;
  std::vector<size_t> input_back_strides_b_;
  std::array<size_t, 2> input_pos_{0};  // {pos in input a, pos in input b}
  int output_dimension_{0};
};
// Iterates the output positions of a transpose, exposing (via GetPos) a flat
// position computed from `axes`-permuted strides -- presumably the index into
// the untransposed input buffer; confirm against the .cc.
class TransposeIterator {
 public:
  TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes, const std::vector<size_t> &input_shape);
  virtual ~TransposeIterator() = default;
  // Flat position for the current output element.
  inline size_t GetPos() const { return pos_; }
  // Repositions the iterator at flat output position `pos`.
  void SetPos(size_t pos);
  // Advances to the next output element.
  void GenNextPos();
 private:
  int dimension_{0};
  std::vector<size_t> coordinates_;  // current multi-dim output coordinate
  std::vector<size_t> shape_;
  std::vector<size_t> strides_;
  std::vector<size_t> back_strides_;
  std::vector<size_t> axes_;  // permutation applied to the input dimensions
  size_t pos_{0};
};
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_ | |||
| @@ -1,340 +1,340 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/ctcloss_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void CTCLossCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| CheckParam(kernel_node); | |||
| probs_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| indice_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| labels_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| if (probs_shape_.size() != 3) { | |||
| MS_LOG(EXCEPTION) << "Probs dims: " << probs_shape_.size() << " not support."; | |||
| } | |||
| if (labels_dims_.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "Labels dims: " << labels_dims_.size() << " not support."; | |||
| } | |||
| if (indice_dims_.size() != 2) { | |||
| MS_LOG(EXCEPTION) << "Labels indice dims: " << indice_dims_.size() << " not support."; | |||
| } | |||
| preprocess_collapse_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "preprocess_collapse_repeated"); | |||
| ctc_merge_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "ctc_merge_repeated"); | |||
| ignore_longer_outputs_than_inputs_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "ignore_longer_outputs_than_inputs"); | |||
| max_time_ = probs_shape_[0]; | |||
| batch_size_ = probs_shape_[1]; | |||
| num_class_ = probs_shape_[2]; | |||
| blank_index_ = num_class_ - 1; | |||
| } | |||
| bool CTCLossCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (dtype_ == kNumberTypeFloat16) { | |||
| LaunchKernel<float16>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat32) { | |||
| LaunchKernel<float>(inputs, outputs); | |||
| } | |||
| return true; | |||
| } | |||
// Numerically stable log(exp(logprob1) + exp(logprob2)).  Treats -inf as the
// log-domain zero: if either operand is log-zero the other is returned
// unchanged.  Always subtracts the larger operand before exponentiating so
// exp() never overflows.
template <typename T>
inline T LogSumExp(const T logprob1, const T logprob2) {
  const T log_zero = -std::numeric_limits<T>::infinity();
  if (logprob1 <= log_zero) {
    return logprob2;
  }
  if (logprob2 <= log_zero) {
    return logprob1;
  }
  const T hi = (logprob1 > logprob2) ? logprob1 : logprob2;
  const T lo = (logprob1 > logprob2) ? logprob2 : logprob1;
  return hi + static_cast<T>(log1p(exp(lo - hi)));
}
// Forward pass of the CTC dynamic program: fills log_alpha_b[u][t] with the
// log-probability of having emitted the first u+1 symbols of the
// blank-interleaved label within the first t+1 time steps (alpha in the CTC
// paper's notation).
//   label_with_blank: target interleaved with blanks, size U = 2*|l| + 1.
//   y:                per-class softmax probabilities, indexed y[class][time].
//   log_alpha_b:      output DP table [U][T]; caller pre-fills it with -inf.
template <typename TT>
void CTCLossCPUKernel::CalculateFwdVar(const std::vector<uint32_t> &label_with_blank,
                                       const std::vector<std::vector<TT>> &y,
                                       std::vector<std::vector<TT>> *log_alpha_b) {
  int U = label_with_blank.size();
  int T = (*log_alpha_b)[0].size();
  TT kLogZero_ = -std::numeric_limits<TT>::infinity();
  // t == 0: a valid path can start with the leading blank ...
  (*log_alpha_b)[0][0] = static_cast<TT>(log(y[blank_index_][0]));
  auto label_0 = (label_with_blank.size() > 1) ? label_with_blank[1] : blank_index_;
  if (label_with_blank.size() > 1) {
    // ... or directly with the first real label.
    (*log_alpha_b)[1][0] = static_cast<TT>(log(y[label_0][0]));
  }
  for (int t = 1; t < T; ++t) {
    // Restrict u to the band of positions reachable at time t that can still
    // finish the label by time T.
    int low = std::max(0, U - (2 * (T - t)));
    int high = std::min(U, 2 * (t + 1));
    for (int u = low; u < high; ++u) {
      auto sum_log_alpha_b = kLogZero_;
      // Self-transition: always allowed on blanks; on labels only when
      // repeated emissions are merged.
      if (ctc_merge_repeated_ || label_with_blank[u] == blank_index_) {
        sum_log_alpha_b = (*log_alpha_b)[u][t - 1];
      }
      // Transition from the previous symbol.
      if (u > 0) {
        sum_log_alpha_b = LogSumExp(sum_log_alpha_b, (*log_alpha_b)[u - 1][t - 1]);
      }
      // Skip the intervening blank -- disallowed when that would merge two
      // identical consecutive labels.
      if (u > 1) {
        bool matching_labels_merge = ctc_merge_repeated_ && (label_with_blank[u] == label_with_blank[u - 2]);
        if (label_with_blank[u] != blank_index_ && !matching_labels_merge) {
          sum_log_alpha_b = LogSumExp(sum_log_alpha_b, (*log_alpha_b)[u - 2][t - 1]);
        }
      }
      // Multiply (add in log space) by the emission probability at time t.
      (*log_alpha_b)[u][t] =
        static_cast<TT>(log(static_cast<TT>(y[label_with_blank[IntToSize(u)]][IntToSize(t)]))) + sum_log_alpha_b;
    }
  }
}
| template <typename TT> | |||
| void CTCLossCPUKernel::CalculateBwdVar(const std::vector<uint32_t> &label_with_blank, | |||
| const std::vector<std::vector<TT>> &y, | |||
| std::vector<std::vector<TT>> *log_beta_b) { | |||
| int T = (*log_beta_b)[0].size(); | |||
| int U = label_with_blank.size(); | |||
| if (U > 1) { | |||
| for (int u = U - 2; u < U; ++u) { | |||
| (*log_beta_b)[u][T - 1] = TT(0); | |||
| } | |||
| } else { | |||
| (*log_beta_b)[0][T - 1] = TT(0); | |||
| (*log_beta_b)[0][T - 2] = TT(0); | |||
| } | |||
| for (int t = T - 2; t >= 0; --t) { | |||
| int low = std::max(0, U - (2 * (T - t))); | |||
| int high = std::min(U, 2 * (t + 1)); | |||
| for (int u = low; u < high; ++u) { | |||
| if (ctc_merge_repeated_ || label_with_blank[u] == blank_index_) { | |||
| (*log_beta_b)[u][t] = | |||
| LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u][t + 1] + TT(log(y[label_with_blank[u]][t + 1]))); | |||
| } | |||
| if (u + 1 < U) { | |||
| (*log_beta_b)[u][t] = | |||
| LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u + 1][t + 1] + TT(log(y[label_with_blank[u + 1]][t + 1]))); | |||
| } | |||
| if (u + 2 < U) { | |||
| bool matching_labels_merge = ctc_merge_repeated_ && (label_with_blank[u] == label_with_blank[u + 2]); | |||
| if (label_with_blank[u] != blank_index_ && !matching_labels_merge) { | |||
| (*log_beta_b)[u][t] = | |||
| LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u + 2][t + 1] + TT(log(y[label_with_blank[u + 2]][t + 1]))); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| template <typename TT> | |||
| void CTCLossCPUKernel::CalculateGrad(const std::vector<uint32_t> &label_with_blank, | |||
| const std::vector<std::vector<TT>> &y, | |||
| const std::vector<std::vector<TT>> &log_alpha_b, | |||
| const std::vector<std::vector<TT>> &log_beta_b, const TT log_pzx, | |||
| std::vector<std::vector<TT>> *dy) { | |||
| auto dy_b = dy; | |||
| TT kLogZero_ = -std::numeric_limits<TT>::infinity(); | |||
| if (log_pzx <= kLogZero_) { | |||
| MS_LOG(INFO) << "No valid path found"; | |||
| return; | |||
| } | |||
| size_t L = y.size(); | |||
| size_t T = y[0].size(); | |||
| size_t U = label_with_blank.size(); | |||
| for (size_t t = 0; t < T; ++t) { | |||
| std::vector<TT> prob_sum(L, kLogZero_); | |||
| for (size_t u = 0; u < U; ++u) { | |||
| uint32_t l = label_with_blank[u]; | |||
| prob_sum[l] = LogSumExp(prob_sum[l], log_alpha_b[u][t] + log_beta_b[u][t]); | |||
| } | |||
| for (size_t l = 0; l < L; ++l) { | |||
| (*dy_b)[l][t] = y[l][t] - static_cast<TT>(exp(prob_sum[l] - log_pzx)); | |||
| } | |||
| } | |||
| } | |||
// Converts each batch element's raw label into the blank-interleaved form the
// CTC recursions operate on: {blank, l1, blank, l2, ..., blank}.
// Label values >= num_class_ - 1 are treated as the blank/invalid marker and
// are only tolerated at the tail of a label sequence.
//   seq_len:          per-batch input sequence lengths (size batch_size_).
//   batch_label:      per-batch raw label sequences.
//   label_with_blank: output; (*label_with_blank)[b] is filled here.
void CTCLossCPUKernel::GenLableWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label,
                                         std::vector<std::vector<uint32_t>> *label_with_blank) {
  for (size_t b = 0; b < batch_size_; ++b) {
    std::vector<uint32_t> l;
    const std::vector<uint32_t> &label = batch_label[b];
    bool has_blank = false;
    for (size_t i = 0; i < label.size(); ++i) {
      // With preprocess_collapse_repeated_, consecutive duplicate labels are
      // collapsed to a single occurrence before blank insertion.
      if (i == 0 || !preprocess_collapse_repeated_ || label[i] != label[i - 1]) {
        if (label[i] >= num_class_ - 1) {
          has_blank = true;
        } else {
          // A valid label after a blank marker means the blank sat between
          // two valid labels -- rejected.
          if (has_blank) {
            MS_LOG(EXCEPTION) << "Invalid labels(index >= num_class - 1) should not appear between two valid labels";
          }
          l.push_back(label[i]);
        }
      }
    }
    if (!ignore_longer_outputs_than_inputs_) {
      // CTC needs at least one time step per output symbol.
      if (l.size() > seq_len[b]) {
        MS_LOG(EXCEPTION) << "Input time(sequence length) should greater than output size(label length), but gets "
                          << seq_len[b] << "< " << l.size();
      }
    }
    // Interleave: blank before every label, plus one trailing blank
    // (final size 2 * l.size() + 1).
    (*label_with_blank)[b].reserve(2 * l.size() + 1);
    for (auto l_i : l) {
      (*label_with_blank)[b].push_back(blank_index_);
      (*label_with_blank)[b].push_back(l_i);
    }
    (*label_with_blank)[b].push_back(blank_index_);
  }
}
// Column-wise softmax over classes for batch element b.
//   inputs_addr:   activations laid out [time, batch, class].
//   softmax_probs: output [num_class][sequence_length] for batch element b.
// Fixes vs. the original: the stability shift started from T(0), so when all
// logits were very negative every exp() underflowed and the normalization
// divided 0 by 0 (NaN).  The max is now seeded from the first logit.  Each
// exp() is also computed once instead of twice.
template <typename T>
void InnerSoftMax(const T *inputs_addr, std::vector<std::vector<T>> *softmax_probs, const uint32_t sequence_length,
                  size_t num_class, size_t batch_size, size_t b) {
  if (num_class == 0) {
    return;  // preserve the original's no-op behavior for an empty class axis
  }
  for (size_t t = 0; t < sequence_length; ++t) {
    const size_t base = t * batch_size * num_class + b * num_class;
    // True maximum logit of this column -- the shift that keeps exp() finite.
    T maxCoeff = inputs_addr[base];
    for (size_t c = 1; c < num_class; ++c) {
      if (inputs_addr[base + c] > maxCoeff) {
        maxCoeff = inputs_addr[base + c];
      }
    }
    // Compute each exp once, accumulating the normalizer as we go.
    T sumCoeff(T(0));
    for (size_t c = 0; c < num_class; ++c) {
      const T e = static_cast<T>(exp(inputs_addr[base + c] - maxCoeff));
      sumCoeff += e;
      (*softmax_probs)[c][t] = e;
    }
    for (size_t c = 0; c < num_class; ++c) {
      (*softmax_probs)[c][t] /= sumCoeff;
    }
  }
}
// Resizes *array2D to row x col, filling any newly created cells with
// init_value (existing cells within range are left untouched).
template <typename T>
void MatrixfromVector(uint32_t row, uint32_t col, std::vector<std::vector<T>> *array2D, const T init_value) {
  array2D->resize(row);
  for (auto &line : *array2D) {
    line.resize(col, init_value);
  }
}
// Computes the CTC loss and its gradient for every batch element.
// Inputs (by index): 0 = activations [max_time, batch, num_class],
// 1 = sparse label indices [n, 2] (pairs of batch index, intra-label pos),
// 2 = sparse label values [n], 3 = sequence lengths [batch].
// Outputs: 0 = loss [batch], 1 = gradient with the same layout as input 0.
template <typename T>
void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
  auto inputs_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto labels_indices_addr = reinterpret_cast<uint64_t *>(inputs[1]->addr);
  auto labels_values_addr = reinterpret_cast<uint32_t *>(inputs[2]->addr);
  auto sequence_length_addr = reinterpret_cast<uint32_t *>(inputs[3]->addr);
  auto loss_addr = reinterpret_cast<T *>(outputs[0]->addr);
  auto gradient_addr = reinterpret_cast<T *>(outputs[1]->addr);
  std::vector<std::vector<uint32_t>> label_batch;
  std::vector<std::vector<uint32_t>> labels_with_blank;
  std::vector<uint64_t> each_label_length;
  label_batch.resize(batch_size_);
  labels_with_blank.resize(batch_size_);
  each_label_length.resize(batch_size_, 0);
  T kLogZero_ = -std::numeric_limits<T>::infinity();
  // check validation of sequence length
  for (size_t b = 0; b < batch_size_; ++b) {
    if (sequence_length_addr[b] == uint32_t(0)) {
      MS_LOG(EXCEPTION) << "Sequence length should > 0, but gets " << sequence_length_addr[b];
    }
    if (sequence_length_addr[b] > max_time_) {
      MS_LOG(EXCEPTION) << "Max time should be greater than sequence length, but gets " << max_time_ << " < "
                        << sequence_length_addr[b];
    }
  }
  // Count labels per batch element; labels_indices_addr[i * 2] is the batch
  // index of the i-th sparse entry.
  for (size_t i = 0; i < indice_dims_[0]; ++i) {
    each_label_length[labels_indices_addr[i * 2]]++;
  }
  // convert label format of label_value and label_indices to batch_label
  // (assumes the sparse entries are sorted by batch index -- TODO confirm).
  uint64_t cum_sum = 0;
  for (size_t b = 0; b < batch_size_; ++b) {
    std::vector<uint32_t> *b_value = &label_batch[b];
    for (size_t l = 0; l < each_label_length[b]; ++l) {
      b_value->push_back(labels_values_addr[cum_sum + l]);
    }
    cum_sum += each_label_length[b];
  }
  // convert label to label with blank
  GenLableWithBlank(sequence_length_addr, label_batch, &labels_with_blank);
  for (size_t b = 0; b < batch_size_; ++b) {
    std::vector<uint32_t> label_with_blank = labels_with_blank[b];
    // y_b [num_class, sequence_length]
    std::vector<std::vector<T>> y_b;
    std::vector<std::vector<T>> dy;
    std::vector<std::vector<T>> log_alpha_b;
    std::vector<std::vector<T>> log_beta_b;
    MatrixfromVector(num_class_, sequence_length_addr[b], &y_b, kLogZero_);
    MatrixfromVector(y_b.size(), y_b[0].size(), &dy, T(0));
    MatrixfromVector(label_with_blank.size(), sequence_length_addr[b], &log_alpha_b, kLogZero_);
    MatrixfromVector(label_with_blank.size(), sequence_length_addr[b], &log_beta_b, kLogZero_);
    // Softmax over classes, then the forward/backward CTC recursions.
    InnerSoftMax(inputs_addr, &y_b, sequence_length_addr[b], num_class_, batch_size_, b);
    CalculateFwdVar(label_with_blank, y_b, &log_alpha_b);
    CalculateBwdVar(label_with_blank, y_b, &log_beta_b);
    // Total alignment probability p(z|x): sum of alpha*beta at t == 0.
    T log_pzx = kLogZero_;
    for (size_t u = 0; u < label_with_blank.size(); ++u) {
      log_pzx = LogSumExp(log_pzx, log_alpha_b[u][0] + log_beta_b[u][0]);
    }
    loss_addr[b] = -log_pzx;
    CalculateGrad(label_with_blank, y_b, log_alpha_b, log_beta_b, log_pzx, &dy);
    // Scatter dy [class, time] back into the [time, batch, class] gradient.
    for (size_t t = 0; t < sequence_length_addr[b]; ++t) {
      for (size_t c = 0; c < num_class_; ++c) {
        gradient_addr[t * batch_size_ * num_class_ + b * num_class_ + c] = dy[c][t];
      }
    }
  }
}
// Validates the node arity: CTCLoss takes exactly 4 inputs (activations,
// labels_indices, labels_values, sequence_length) and yields 2 outputs
// (loss, gradient).  Throws via MS_LOG(EXCEPTION) on mismatch.
void CTCLossCPUKernel::CheckParam(const CNodePtr &kernel_node) {
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  if (input_num != 4) {
    MS_LOG(EXCEPTION) << "CTCLossCPUKernel needs 4 inputs, but gets " << input_num;
  }
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  if (output_num != 2) {
    MS_LOG(EXCEPTION) << "CTCLossCPUKernel expects 2 outputs, but gets" << output_num;
  }
}
| } // namespace kernel | |||
| } // namespace mindspore | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/ctcloss_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void CTCLossCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| CheckParam(kernel_node); | |||
| probs_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| indice_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| labels_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| if (probs_shape_.size() != 3) { | |||
| MS_LOG(EXCEPTION) << "Probs dims: " << probs_shape_.size() << " not support."; | |||
| } | |||
| if (labels_dims_.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "Labels dims: " << labels_dims_.size() << " not support."; | |||
| } | |||
| if (indice_dims_.size() != 2) { | |||
| MS_LOG(EXCEPTION) << "Labels indice dims: " << indice_dims_.size() << " not support."; | |||
| } | |||
| preprocess_collapse_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "preprocess_collapse_repeated"); | |||
| ctc_merge_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "ctc_merge_repeated"); | |||
| ignore_longer_outputs_than_inputs_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "ignore_longer_outputs_than_inputs"); | |||
| max_time_ = probs_shape_[0]; | |||
| batch_size_ = probs_shape_[1]; | |||
| num_class_ = probs_shape_[2]; | |||
| blank_index_ = num_class_ - 1; | |||
| } | |||
| bool CTCLossCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (dtype_ == kNumberTypeFloat16) { | |||
| LaunchKernel<float16>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat32) { | |||
| LaunchKernel<float>(inputs, outputs); | |||
| } | |||
| return true; | |||
| } | |||
// Numerically stable log(exp(logprob1) + exp(logprob2)).  Treats -inf as the
// log-domain zero: if either operand is log-zero the other is returned
// unchanged.  Always subtracts the larger operand before exponentiating so
// exp() never overflows.
template <typename T>
inline T LogSumExp(const T logprob1, const T logprob2) {
  const T log_zero = -std::numeric_limits<T>::infinity();
  if (logprob1 <= log_zero) {
    return logprob2;
  }
  if (logprob2 <= log_zero) {
    return logprob1;
  }
  const T hi = (logprob1 > logprob2) ? logprob1 : logprob2;
  const T lo = (logprob1 > logprob2) ? logprob2 : logprob1;
  return hi + static_cast<T>(log1p(exp(lo - hi)));
}
// Forward pass of the CTC dynamic program: fills log_alpha_b[u][t] with the
// log-probability of having emitted the first u+1 symbols of the
// blank-interleaved label within the first t+1 time steps (alpha in the CTC
// paper's notation).
//   label_with_blank: target interleaved with blanks, size U = 2*|l| + 1.
//   y:                per-class softmax probabilities, indexed y[class][time].
//   log_alpha_b:      output DP table [U][T]; caller pre-fills it with -inf.
template <typename TT>
void CTCLossCPUKernel::CalculateFwdVar(const std::vector<uint32_t> &label_with_blank,
                                       const std::vector<std::vector<TT>> &y,
                                       std::vector<std::vector<TT>> *log_alpha_b) {
  int U = label_with_blank.size();
  int T = (*log_alpha_b)[0].size();
  TT kLogZero_ = -std::numeric_limits<TT>::infinity();
  // t == 0: a valid path can start with the leading blank ...
  (*log_alpha_b)[0][0] = static_cast<TT>(log(y[blank_index_][0]));
  auto label_0 = (label_with_blank.size() > 1) ? label_with_blank[1] : blank_index_;
  if (label_with_blank.size() > 1) {
    // ... or directly with the first real label.
    (*log_alpha_b)[1][0] = static_cast<TT>(log(y[label_0][0]));
  }
  for (int t = 1; t < T; ++t) {
    // Restrict u to the band of positions reachable at time t that can still
    // finish the label by time T.
    int low = std::max(0, U - (2 * (T - t)));
    int high = std::min(U, 2 * (t + 1));
    for (int u = low; u < high; ++u) {
      auto sum_log_alpha_b = kLogZero_;
      // Self-transition: always allowed on blanks; on labels only when
      // repeated emissions are merged.
      if (ctc_merge_repeated_ || label_with_blank[u] == blank_index_) {
        sum_log_alpha_b = (*log_alpha_b)[u][t - 1];
      }
      // Transition from the previous symbol.
      if (u > 0) {
        sum_log_alpha_b = LogSumExp(sum_log_alpha_b, (*log_alpha_b)[u - 1][t - 1]);
      }
      // Skip the intervening blank -- disallowed when that would merge two
      // identical consecutive labels.
      if (u > 1) {
        bool matching_labels_merge = ctc_merge_repeated_ && (label_with_blank[u] == label_with_blank[u - 2]);
        if (label_with_blank[u] != blank_index_ && !matching_labels_merge) {
          sum_log_alpha_b = LogSumExp(sum_log_alpha_b, (*log_alpha_b)[u - 2][t - 1]);
        }
      }
      // Multiply (add in log space) by the emission probability at time t.
      (*log_alpha_b)[u][t] =
        static_cast<TT>(log(static_cast<TT>(y[label_with_blank[IntToSize(u)]][IntToSize(t)]))) + sum_log_alpha_b;
    }
  }
}
| template <typename TT> | |||
| void CTCLossCPUKernel::CalculateBwdVar(const std::vector<uint32_t> &label_with_blank, | |||
| const std::vector<std::vector<TT>> &y, | |||
| std::vector<std::vector<TT>> *log_beta_b) { | |||
| int T = (*log_beta_b)[0].size(); | |||
| int U = label_with_blank.size(); | |||
| if (U > 1) { | |||
| for (int u = U - 2; u < U; ++u) { | |||
| (*log_beta_b)[u][T - 1] = TT(0); | |||
| } | |||
| } else { | |||
| (*log_beta_b)[0][T - 1] = TT(0); | |||
| (*log_beta_b)[0][T - 2] = TT(0); | |||
| } | |||
| for (int t = T - 2; t >= 0; --t) { | |||
| int low = std::max(0, U - (2 * (T - t))); | |||
| int high = std::min(U, 2 * (t + 1)); | |||
| for (int u = low; u < high; ++u) { | |||
| if (ctc_merge_repeated_ || label_with_blank[u] == blank_index_) { | |||
| (*log_beta_b)[u][t] = | |||
| LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u][t + 1] + TT(log(y[label_with_blank[u]][t + 1]))); | |||
| } | |||
| if (u + 1 < U) { | |||
| (*log_beta_b)[u][t] = | |||
| LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u + 1][t + 1] + TT(log(y[label_with_blank[u + 1]][t + 1]))); | |||
| } | |||
| if (u + 2 < U) { | |||
| bool matching_labels_merge = ctc_merge_repeated_ && (label_with_blank[u] == label_with_blank[u + 2]); | |||
| if (label_with_blank[u] != blank_index_ && !matching_labels_merge) { | |||
| (*log_beta_b)[u][t] = | |||
| LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u + 2][t + 1] + TT(log(y[label_with_blank[u + 2]][t + 1]))); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| template <typename TT> | |||
| void CTCLossCPUKernel::CalculateGrad(const std::vector<uint32_t> &label_with_blank, | |||
| const std::vector<std::vector<TT>> &y, | |||
| const std::vector<std::vector<TT>> &log_alpha_b, | |||
| const std::vector<std::vector<TT>> &log_beta_b, const TT log_pzx, | |||
| std::vector<std::vector<TT>> *dy) { | |||
| auto dy_b = dy; | |||
| TT kLogZero_ = -std::numeric_limits<TT>::infinity(); | |||
| if (log_pzx <= kLogZero_) { | |||
| MS_LOG(INFO) << "No valid path found"; | |||
| return; | |||
| } | |||
| size_t L = y.size(); | |||
| size_t T = y[0].size(); | |||
| size_t U = label_with_blank.size(); | |||
| for (size_t t = 0; t < T; ++t) { | |||
| std::vector<TT> prob_sum(L, kLogZero_); | |||
| for (size_t u = 0; u < U; ++u) { | |||
| uint32_t l = label_with_blank[u]; | |||
| prob_sum[l] = LogSumExp(prob_sum[l], log_alpha_b[u][t] + log_beta_b[u][t]); | |||
| } | |||
| for (size_t l = 0; l < L; ++l) { | |||
| (*dy_b)[l][t] = y[l][t] - static_cast<TT>(exp(prob_sum[l] - log_pzx)); | |||
| } | |||
| } | |||
| } | |||
// Converts each batch element's raw label into the blank-interleaved form the
// CTC recursions operate on: {blank, l1, blank, l2, ..., blank}.
// Label values >= num_class_ - 1 are treated as the blank/invalid marker and
// are only tolerated at the tail of a label sequence.
//   seq_len:          per-batch input sequence lengths (size batch_size_).
//   batch_label:      per-batch raw label sequences.
//   label_with_blank: output; (*label_with_blank)[b] is filled here.
void CTCLossCPUKernel::GenLableWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label,
                                         std::vector<std::vector<uint32_t>> *label_with_blank) {
  for (size_t b = 0; b < batch_size_; ++b) {
    std::vector<uint32_t> l;
    const std::vector<uint32_t> &label = batch_label[b];
    bool has_blank = false;
    for (size_t i = 0; i < label.size(); ++i) {
      // With preprocess_collapse_repeated_, consecutive duplicate labels are
      // collapsed to a single occurrence before blank insertion.
      if (i == 0 || !preprocess_collapse_repeated_ || label[i] != label[i - 1]) {
        if (label[i] >= num_class_ - 1) {
          has_blank = true;
        } else {
          // A valid label after a blank marker means the blank sat between
          // two valid labels -- rejected.
          if (has_blank) {
            MS_LOG(EXCEPTION) << "Invalid labels(index >= num_class - 1) should not appear between two valid labels";
          }
          l.push_back(label[i]);
        }
      }
    }
    if (!ignore_longer_outputs_than_inputs_) {
      // CTC needs at least one time step per output symbol.
      if (l.size() > seq_len[b]) {
        MS_LOG(EXCEPTION) << "Input time(sequence length) should greater than output size(label length), but gets "
                          << seq_len[b] << "< " << l.size();
      }
    }
    // Interleave: blank before every label, plus one trailing blank
    // (final size 2 * l.size() + 1).
    (*label_with_blank)[b].reserve(2 * l.size() + 1);
    for (auto l_i : l) {
      (*label_with_blank)[b].push_back(blank_index_);
      (*label_with_blank)[b].push_back(l_i);
    }
    (*label_with_blank)[b].push_back(blank_index_);
  }
}
// Column-wise softmax over classes for batch element b.
//   inputs_addr:   activations laid out [time, batch, class].
//   softmax_probs: output [num_class][sequence_length] for batch element b.
// Fixes vs. the original: the stability shift started from T(0), so when all
// logits were very negative every exp() underflowed and the normalization
// divided 0 by 0 (NaN).  The max is now seeded from the first logit.  Each
// exp() is also computed once instead of twice.
template <typename T>
void InnerSoftMax(const T *inputs_addr, std::vector<std::vector<T>> *softmax_probs, const uint32_t sequence_length,
                  size_t num_class, size_t batch_size, size_t b) {
  if (num_class == 0) {
    return;  // preserve the original's no-op behavior for an empty class axis
  }
  for (size_t t = 0; t < sequence_length; ++t) {
    const size_t base = t * batch_size * num_class + b * num_class;
    // True maximum logit of this column -- the shift that keeps exp() finite.
    T maxCoeff = inputs_addr[base];
    for (size_t c = 1; c < num_class; ++c) {
      if (inputs_addr[base + c] > maxCoeff) {
        maxCoeff = inputs_addr[base + c];
      }
    }
    // Compute each exp once, accumulating the normalizer as we go.
    T sumCoeff(T(0));
    for (size_t c = 0; c < num_class; ++c) {
      const T e = static_cast<T>(exp(inputs_addr[base + c] - maxCoeff));
      sumCoeff += e;
      (*softmax_probs)[c][t] = e;
    }
    for (size_t c = 0; c < num_class; ++c) {
      (*softmax_probs)[c][t] /= sumCoeff;
    }
  }
}
// Resizes *array2D to row x col, filling any newly created cells with
// init_value (existing cells within range are left untouched).
template <typename T>
void MatrixfromVector(uint32_t row, uint32_t col, std::vector<std::vector<T>> *array2D, const T init_value) {
  array2D->resize(row);
  for (auto &line : *array2D) {
    line.resize(col, init_value);
  }
}
// Computes the CTC loss and its gradient for every batch element.
// Inputs (by index): 0 = activations [max_time, batch, num_class],
// 1 = sparse label indices [n, 2] (pairs of batch index, intra-label pos),
// 2 = sparse label values [n], 3 = sequence lengths [batch].
// Outputs: 0 = loss [batch], 1 = gradient with the same layout as input 0.
template <typename T>
void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
  auto inputs_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto labels_indices_addr = reinterpret_cast<uint64_t *>(inputs[1]->addr);
  auto labels_values_addr = reinterpret_cast<uint32_t *>(inputs[2]->addr);
  auto sequence_length_addr = reinterpret_cast<uint32_t *>(inputs[3]->addr);
  auto loss_addr = reinterpret_cast<T *>(outputs[0]->addr);
  auto gradient_addr = reinterpret_cast<T *>(outputs[1]->addr);
  std::vector<std::vector<uint32_t>> label_batch;
  std::vector<std::vector<uint32_t>> labels_with_blank;
  std::vector<uint64_t> each_label_length;
  label_batch.resize(batch_size_);
  labels_with_blank.resize(batch_size_);
  each_label_length.resize(batch_size_, 0);
  T kLogZero_ = -std::numeric_limits<T>::infinity();
  // check validation of sequence length
  for (size_t b = 0; b < batch_size_; ++b) {
    if (sequence_length_addr[b] == uint32_t(0)) {
      MS_LOG(EXCEPTION) << "Sequence length should > 0, but gets " << sequence_length_addr[b];
    }
    if (sequence_length_addr[b] > max_time_) {
      MS_LOG(EXCEPTION) << "Max time should be greater than sequence length, but gets " << max_time_ << " < "
                        << sequence_length_addr[b];
    }
  }
  // Count labels per batch element; labels_indices_addr[i * 2] is the batch
  // index of the i-th sparse entry.
  for (size_t i = 0; i < indice_dims_[0]; ++i) {
    each_label_length[labels_indices_addr[i * 2]]++;
  }
  // convert label format of label_value and label_indices to batch_label
  // (assumes the sparse entries are sorted by batch index -- TODO confirm).
  uint64_t cum_sum = 0;
  for (size_t b = 0; b < batch_size_; ++b) {
    std::vector<uint32_t> *b_value = &label_batch[b];
    for (size_t l = 0; l < each_label_length[b]; ++l) {
      b_value->push_back(labels_values_addr[cum_sum + l]);
    }
    cum_sum += each_label_length[b];
  }
  // convert label to label with blank
  GenLableWithBlank(sequence_length_addr, label_batch, &labels_with_blank);
  for (size_t b = 0; b < batch_size_; ++b) {
    std::vector<uint32_t> label_with_blank = labels_with_blank[b];
    // y_b [num_class, sequence_length]
    std::vector<std::vector<T>> y_b;
    std::vector<std::vector<T>> dy;
    std::vector<std::vector<T>> log_alpha_b;
    std::vector<std::vector<T>> log_beta_b;
    MatrixfromVector(num_class_, sequence_length_addr[b], &y_b, kLogZero_);
    MatrixfromVector(y_b.size(), y_b[0].size(), &dy, T(0));
    MatrixfromVector(label_with_blank.size(), sequence_length_addr[b], &log_alpha_b, kLogZero_);
    MatrixfromVector(label_with_blank.size(), sequence_length_addr[b], &log_beta_b, kLogZero_);
    // Softmax over classes, then the forward/backward CTC recursions.
    InnerSoftMax(inputs_addr, &y_b, sequence_length_addr[b], num_class_, batch_size_, b);
    CalculateFwdVar(label_with_blank, y_b, &log_alpha_b);
    CalculateBwdVar(label_with_blank, y_b, &log_beta_b);
    // Total alignment probability p(z|x): sum of alpha*beta at t == 0.
    T log_pzx = kLogZero_;
    for (size_t u = 0; u < label_with_blank.size(); ++u) {
      log_pzx = LogSumExp(log_pzx, log_alpha_b[u][0] + log_beta_b[u][0]);
    }
    loss_addr[b] = -log_pzx;
    CalculateGrad(label_with_blank, y_b, log_alpha_b, log_beta_b, log_pzx, &dy);
    // Scatter dy [class, time] back into the [time, batch, class] gradient.
    for (size_t t = 0; t < sequence_length_addr[b]; ++t) {
      for (size_t c = 0; c < num_class_; ++c) {
        gradient_addr[t * batch_size_ * num_class_ + b * num_class_ + c] = dy[c][t];
      }
    }
  }
}
// Validates the node arity: CTCLoss takes exactly 4 inputs (activations,
// labels_indices, labels_values, sequence_length) and yields 2 outputs
// (loss, gradient).  Throws via MS_LOG(EXCEPTION) on mismatch.
void CTCLossCPUKernel::CheckParam(const CNodePtr &kernel_node) {
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  if (input_num != 4) {
    MS_LOG(EXCEPTION) << "CTCLossCPUKernel needs 4 inputs, but gets " << input_num;
  }
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  if (output_num != 2) {
    MS_LOG(EXCEPTION) << "CTCLossCPUKernel expects 2 outputs, but gets" << output_num;
  }
}
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,92 +1,92 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_ | |||
| #include <memory> | |||
| #include <unordered_map> | |||
| #include <vector> | |||
| #include <algorithm> | |||
| #include <limits> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU kernel for the CTCLoss operator. The .cc implementation computes the
// Connectionist Temporal Classification loss and its gradient per batch using
// forward/backward (log_alpha / log_beta) dynamic programming in log space.
class CTCLossCPUKernel : public CPUKernel {
 public:
  CTCLossCPUKernel() = default;
  ~CTCLossCPUKernel() override = default;
  // Reads shapes/attributes from kernel_node; validates I/O counts.
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
  // Builds the per-batch label sequences interleaved with blanks.
  // (Name keeps the historical spelling "Lable"; renaming would break the
  // out-of-line definition.)
  void GenLableWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label,
                         std::vector<std::vector<uint32_t>> *label_with_blank);
  // Forward pass: fills *log_alpha_b with forward log-probabilities.
  template <typename T>
  void CalculateFwdVar(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
                       std::vector<std::vector<T>> *log_alpha_b);
  // Backward pass: fills *log_beta_b with backward log-probabilities.
  template <typename T>
  void CalculateBwdVar(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
                       std::vector<std::vector<T>> *log_beta_b);
  // Combines alpha/beta and the total log-probability log_pzx into *dy.
  template <typename T>
  void CalculateGrad(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
                     const std::vector<std::vector<T>> &log_alpha_b, const std::vector<std::vector<T>> &log_beta_b,
                     const T log_pzx, std::vector<std::vector<T>> *dy);
  // Type-dispatched implementation invoked by Launch.
  template <typename T>
  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);

 private:
  // Verifies the node has exactly 4 inputs and 2 outputs.
  void CheckParam(const CNodePtr &kernel_node);
  std::vector<size_t> probs_shape_;
  std::vector<size_t> indice_dims_;
  std::vector<size_t> labels_dims_;
  size_t num_class_;   // class-axis size of the probs input
  size_t max_time_;    // time-axis size
  size_t batch_size_;  // batch-axis size
  uint32_t blank_index_;  // class index used as the CTC blank — confirm semantics in .cc
  TypeId dtype_{kTypeUnknown};  // input dtype, selects LaunchKernel<T>
  // CTC behavior flags — presumably matching the TF CTCLoss attributes of the
  // same names; confirm against the .cc implementation.
  bool preprocess_collapse_repeated_;
  bool ctc_merge_repeated_;
  bool ignore_longer_outputs_than_inputs_;
};
// Register CTCLoss for float16 data (loss and gradient outputs are float16);
// the second input is int64 and the remaining two inputs are int32.
MS_REG_CPU_KERNEL(CTCLoss,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeInt64)
                    .AddInputAttr(kNumberTypeInt32)
                    .AddInputAttr(kNumberTypeInt32)
                    .AddOutputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  CTCLossCPUKernel);
// Register CTCLoss for float32 data with the same integer input layout.
MS_REG_CPU_KERNEL(CTCLoss,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeInt64)
                    .AddInputAttr(kNumberTypeInt32)
                    .AddInputAttr(kNumberTypeInt32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  CTCLossCPUKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_ | |||
| #include <memory> | |||
| #include <unordered_map> | |||
| #include <vector> | |||
| #include <algorithm> | |||
| #include <limits> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU kernel for the CTCLoss operator. The .cc implementation computes the
// Connectionist Temporal Classification loss and its gradient per batch using
// forward/backward (log_alpha / log_beta) dynamic programming in log space.
class CTCLossCPUKernel : public CPUKernel {
 public:
  CTCLossCPUKernel() = default;
  ~CTCLossCPUKernel() override = default;
  // Reads shapes/attributes from kernel_node; validates I/O counts.
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
  // Builds the per-batch label sequences interleaved with blanks.
  // (Name keeps the historical spelling "Lable"; renaming would break the
  // out-of-line definition.)
  void GenLableWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label,
                         std::vector<std::vector<uint32_t>> *label_with_blank);
  // Forward pass: fills *log_alpha_b with forward log-probabilities.
  template <typename T>
  void CalculateFwdVar(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
                       std::vector<std::vector<T>> *log_alpha_b);
  // Backward pass: fills *log_beta_b with backward log-probabilities.
  template <typename T>
  void CalculateBwdVar(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
                       std::vector<std::vector<T>> *log_beta_b);
  // Combines alpha/beta and the total log-probability log_pzx into *dy.
  template <typename T>
  void CalculateGrad(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
                     const std::vector<std::vector<T>> &log_alpha_b, const std::vector<std::vector<T>> &log_beta_b,
                     const T log_pzx, std::vector<std::vector<T>> *dy);
  // Type-dispatched implementation invoked by Launch.
  template <typename T>
  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);

 private:
  // Verifies the node has exactly 4 inputs and 2 outputs.
  void CheckParam(const CNodePtr &kernel_node);
  std::vector<size_t> probs_shape_;
  std::vector<size_t> indice_dims_;
  std::vector<size_t> labels_dims_;
  size_t num_class_;   // class-axis size of the probs input
  size_t max_time_;    // time-axis size
  size_t batch_size_;  // batch-axis size
  uint32_t blank_index_;  // class index used as the CTC blank — confirm semantics in .cc
  TypeId dtype_{kTypeUnknown};  // input dtype, selects LaunchKernel<T>
  // CTC behavior flags — presumably matching the TF CTCLoss attributes of the
  // same names; confirm against the .cc implementation.
  bool preprocess_collapse_repeated_;
  bool ctc_merge_repeated_;
  bool ignore_longer_outputs_than_inputs_;
};
// Register CTCLoss for float16 data (loss and gradient outputs are float16);
// the second input is int64 and the remaining two inputs are int32.
MS_REG_CPU_KERNEL(CTCLoss,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeInt64)
                    .AddInputAttr(kNumberTypeInt32)
                    .AddInputAttr(kNumberTypeInt32)
                    .AddOutputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  CTCLossCPUKernel);
// Register CTCLoss for float32 data with the same integer input layout.
MS_REG_CPU_KERNEL(CTCLoss,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeInt64)
                    .AddInputAttr(kNumberTypeInt32)
                    .AddInputAttr(kNumberTypeInt32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  CTCLossCPUKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_ | |||
| @@ -1,89 +1,89 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/depthtospace_cpu_kernel.h" | |||
| #include <vector> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T> | |||
| void DepthToSpaceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| CheckParam(kernel_node); | |||
| input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| block_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "block_size"); | |||
| } | |||
template <typename T>
bool DepthToSpaceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                      const std::vector<kernel::AddressPtr> & /*workspace*/,
                                      const std::vector<kernel::AddressPtr> &outputs) {
  // For every output element: decompose its linear index into per-dimension
  // coordinates, map those to the source element (moving data from the depth
  // dimension into block_size x block_size spatial blocks), and copy it.
  // The index math below hard-codes a 4-D layout (indices 0..3 are used),
  // consistent with NCHW — TODO confirm against the registered shapes.
  auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
  size_t size = IntToSize(inputs[0]->size / sizeof(T));
  // Local copies so the parallel lambda reads plain locals, not members.
  std::vector<size_t> input_shape = input_shape_;
  std::vector<size_t> output_shape = output_shape_;
  size_t block_size = block_size_;
  size_t input_dimension = input_shape.size();
  // output_strides[j] = product of output dims after j (row-major strides for
  // the first three dimensions). NOTE(review): the array is fixed at 3
  // entries, so input_dimension > 4 would index out of bounds — verify the
  // tensors are always rank 4.
  size_t output_strides[3] = {1, 1, 1};
  for (size_t i = input_dimension - 1; i >= 1; --i) {
    for (size_t j = 0; j < i; ++j) {
      output_strides[j] *= output_shape[i];
    }
  }
  auto task = [&, input_addr, output_addr](size_t start, size_t end) {
    std::vector<size_t> output_pos_array(input_dimension, 0);
    for (size_t i = start; i < end; ++i) {
      // Decompose linear output index i into output coordinates.
      size_t tmp_pos = i;
      for (size_t j = 0; j < input_dimension - 1; ++j) {
        output_pos_array[j] = tmp_pos / output_strides[j];
        tmp_pos %= output_strides[j];
      }
      output_pos_array.back() = tmp_pos;
      // Rebuild the linear input index: the source channel combines the
      // output channel with the intra-block offsets of the two spatial
      // coordinates; the spatial coordinates shrink by block_size.
      size_t input_pos = output_pos_array[0];
      input_pos =
        (input_pos * input_shape[1]) +
        (output_pos_array[1] +
         (block_size * (output_pos_array[2] % block_size) + output_pos_array[3] % block_size) * output_shape[1]);
      input_pos = (input_pos * input_shape[2]) + (output_pos_array[2] / block_size);
      input_pos = (input_pos * input_shape[3]) + (output_pos_array[3] / block_size);
      output_addr[i] = input_addr[input_pos];
    }
  };
  CPUKernelUtils::ParallelFor(task, size);
  return true;
}
| template <typename T> | |||
| void DepthToSpaceCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but DepthToSpaceCPUKerrnel needs 1 input."; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but DepthToSpaceCPUKernel needs 1 output."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/depthtospace_cpu_kernel.h" | |||
| #include <vector> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T> | |||
| void DepthToSpaceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| CheckParam(kernel_node); | |||
| input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| block_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "block_size"); | |||
| } | |||
template <typename T>
bool DepthToSpaceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                      const std::vector<kernel::AddressPtr> & /*workspace*/,
                                      const std::vector<kernel::AddressPtr> &outputs) {
  // For every output element: decompose its linear index into per-dimension
  // coordinates, map those to the source element (moving data from the depth
  // dimension into block_size x block_size spatial blocks), and copy it.
  // The index math below hard-codes a 4-D layout (indices 0..3 are used),
  // consistent with NCHW — TODO confirm against the registered shapes.
  auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
  size_t size = IntToSize(inputs[0]->size / sizeof(T));
  // Local copies so the parallel lambda reads plain locals, not members.
  std::vector<size_t> input_shape = input_shape_;
  std::vector<size_t> output_shape = output_shape_;
  size_t block_size = block_size_;
  size_t input_dimension = input_shape.size();
  // output_strides[j] = product of output dims after j (row-major strides for
  // the first three dimensions). NOTE(review): the array is fixed at 3
  // entries, so input_dimension > 4 would index out of bounds — verify the
  // tensors are always rank 4.
  size_t output_strides[3] = {1, 1, 1};
  for (size_t i = input_dimension - 1; i >= 1; --i) {
    for (size_t j = 0; j < i; ++j) {
      output_strides[j] *= output_shape[i];
    }
  }
  auto task = [&, input_addr, output_addr](size_t start, size_t end) {
    std::vector<size_t> output_pos_array(input_dimension, 0);
    for (size_t i = start; i < end; ++i) {
      // Decompose linear output index i into output coordinates.
      size_t tmp_pos = i;
      for (size_t j = 0; j < input_dimension - 1; ++j) {
        output_pos_array[j] = tmp_pos / output_strides[j];
        tmp_pos %= output_strides[j];
      }
      output_pos_array.back() = tmp_pos;
      // Rebuild the linear input index: the source channel combines the
      // output channel with the intra-block offsets of the two spatial
      // coordinates; the spatial coordinates shrink by block_size.
      size_t input_pos = output_pos_array[0];
      input_pos =
        (input_pos * input_shape[1]) +
        (output_pos_array[1] +
         (block_size * (output_pos_array[2] % block_size) + output_pos_array[3] % block_size) * output_shape[1]);
      input_pos = (input_pos * input_shape[2]) + (output_pos_array[2] / block_size);
      input_pos = (input_pos * input_shape[3]) + (output_pos_array[3] / block_size);
      output_addr[i] = input_addr[input_pos];
    }
  };
  CPUKernelUtils::ParallelFor(task, size);
  return true;
}
| template <typename T> | |||
| void DepthToSpaceCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but DepthToSpaceCPUKerrnel needs 1 input."; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but DepthToSpaceCPUKernel needs 1 output."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,85 +1,85 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_ | |||
| #include <memory> | |||
| #include <string> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU kernel for the DepthToSpace operator: moves data from the depth
// (channel) dimension into block_size x block_size spatial blocks.
template <typename T>
class DepthToSpaceCPUKernel : public CPUKernel {
 public:
  DepthToSpaceCPUKernel() = default;
  ~DepthToSpaceCPUKernel() override = default;
  // Caches input/output device shapes and the "block_size" attribute.
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 private:
  // Verifies exactly one input and one output.
  void CheckParam(const CNodePtr &kernel_node);
  std::vector<size_t> input_shape_;   // input device shape, cached at init
  std::vector<size_t> output_shape_;  // output device shape, cached at init
  size_t block_size_;                 // spatial block edge length ("block_size" attr)
};
// DepthToSpace is registered for every numeric dtype; input and output share
// the same type (SetAllSameAttr).
MS_REG_CPU_KERNEL_T(
  DepthToSpace, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  DepthToSpaceCPUKernel, float);
MS_REG_CPU_KERNEL_T(
  DepthToSpace, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  DepthToSpaceCPUKernel, float16);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
                    DepthToSpaceCPUKernel, int8_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
                    DepthToSpaceCPUKernel, int16_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
                    DepthToSpaceCPUKernel, int);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
                    DepthToSpaceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
                    DepthToSpaceCPUKernel, uint8_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16),
                    DepthToSpaceCPUKernel, uint16_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32),
                    DepthToSpaceCPUKernel, uint32_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64),
                    DepthToSpaceCPUKernel, uint64_t);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_ | |||
| #include <memory> | |||
| #include <string> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU kernel for the DepthToSpace operator: moves data from the depth
// (channel) dimension into block_size x block_size spatial blocks.
template <typename T>
class DepthToSpaceCPUKernel : public CPUKernel {
 public:
  DepthToSpaceCPUKernel() = default;
  ~DepthToSpaceCPUKernel() override = default;
  // Caches input/output device shapes and the "block_size" attribute.
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 private:
  // Verifies exactly one input and one output.
  void CheckParam(const CNodePtr &kernel_node);
  std::vector<size_t> input_shape_;   // input device shape, cached at init
  std::vector<size_t> output_shape_;  // output device shape, cached at init
  size_t block_size_;                 // spatial block edge length ("block_size" attr)
};
// DepthToSpace is registered for every numeric dtype; input and output share
// the same type (SetAllSameAttr).
MS_REG_CPU_KERNEL_T(
  DepthToSpace, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  DepthToSpaceCPUKernel, float);
MS_REG_CPU_KERNEL_T(
  DepthToSpace, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  DepthToSpaceCPUKernel, float16);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
                    DepthToSpaceCPUKernel, int8_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
                    DepthToSpaceCPUKernel, int16_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
                    DepthToSpaceCPUKernel, int);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
                    DepthToSpaceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
                    DepthToSpaceCPUKernel, uint8_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16),
                    DepthToSpaceCPUKernel, uint16_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32),
                    DepthToSpaceCPUKernel, uint32_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64),
                    DepthToSpaceCPUKernel, uint64_t);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_ | |||
| @@ -1,102 +1,102 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/addn_cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/fp32/add_fp32.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/errorcode.h" | |||
| #include "utils/ms_utils.h" | |||
| #include "common/thread_pool.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void AddInt(const int *in_0, const int *in_1, int *out, int start, int end) { | |||
| int ret = ElementAddInt(in_0 + start, in_1 + start, out + start, end - start); | |||
| if (ret != NNACL_OK) { | |||
| MS_LOG(EXCEPTION) << "Add failed."; | |||
| } | |||
| } | |||
void AddNCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  // NOTE(review): CheckParam reads the input_num_ member, but input_num_ is
  // only assigned after this call — verify the intended initialization order.
  CheckParam(kernel_node);
  input_num_ = AnfAlgo::GetInputTensorNum(kernel_node);
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
  // Build a oneDNN binary-add primitive over the first two inputs; Launch
  // reuses it (with outputs[0] as accumulator) to fold in remaining inputs.
  std::vector<size_t> src0_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  std::vector<size_t> src1_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
  std::vector<size_t> dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
  dnnl::memory::desc src0_mem_desc = GetDefaultMemDesc(src0_shape);
  dnnl::memory::desc src1_mem_desc = GetDefaultMemDesc(src1_shape);
  dnnl::memory::desc dst_mem_desc = GetDefaultMemDesc(dst_shape);
  dnnl::binary::desc desc = dnnl::binary::desc(dnnl::algorithm::binary_add, src0_mem_desc, src1_mem_desc, dst_mem_desc);
  auto prim_desc = dnnl::binary::primitive_desc(desc, MKLKernelEngine::Get().engine());
  primitive_ = std::make_shared<dnnl::binary>(prim_desc);
  AddArgument(DNNL_ARG_SRC_0, src0_mem_desc);
  AddArgument(DNNL_ARG_SRC_1, src1_mem_desc);
  AddArgument(DNNL_ARG_DST, dst_mem_desc);
}
bool AddNCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                           const std::vector<kernel::AddressPtr> &outputs) {
  // Folds all inputs into outputs[0]:
  // out = inputs[0] + inputs[1]; then out += inputs[i] for each i >= 2.
  if (dtype_ == kNumberTypeFloat32) {
    // oneDNN path: run the binary-add primitive built in InitKernel; the
    // accumulation steps use outputs[0] as both SRC_0 and DST (in-place add).
    SetArgumentHandle(DNNL_ARG_SRC_0, inputs[0]->addr);
    SetArgumentHandle(DNNL_ARG_SRC_1, inputs[1]->addr);
    SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr);
    ExecutePrimitive();
    for (size_t index = 2; index < input_num_; ++index) {
      SetArgumentHandle(DNNL_ARG_SRC_0, outputs[0]->addr);
      SetArgumentHandle(DNNL_ARG_SRC_1, inputs[index]->addr);
      SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr);
      ExecutePrimitive();
    }
  } else if (dtype_ == kNumberTypeInt32) {
    // int32 path: nnacl element-wise add, parallelized over element ranges.
    size_t elements_num = outputs[0]->size / sizeof(int);
    const auto input_0 = reinterpret_cast<int *>(inputs[0]->addr);
    const auto input_1 = reinterpret_cast<int *>(inputs[1]->addr);
    auto output = reinterpret_cast<int *>(outputs[0]->addr);
    auto task_0 = std::bind(AddInt, input_0, input_1, output, std::placeholders::_1, std::placeholders::_2);
    CPUKernelUtils::ParallelFor(task_0, elements_num);
    for (size_t index = 2; index < input_num_; ++index) {
      const auto input = reinterpret_cast<int *>(inputs[index]->addr);
      auto task = std::bind(AddInt, input, output, output, std::placeholders::_1, std::placeholders::_2);
      CPUKernelUtils::ParallelFor(task, elements_num);
    }
  } else {
    MS_LOG(EXCEPTION) << "AddN only support float32 and int32, but got " << TypeIdToType(dtype_)->ToString();
  }
  return true;
}
| void AddNCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| auto src0_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| auto dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| if (src0_shape != dst_shape) { | |||
| MS_LOG(EXCEPTION) << "AddN output shape must be equal to input shape."; | |||
| } | |||
| for (size_t index = 1; index < input_num_; ++index) { | |||
| auto src_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, index); | |||
| if (src0_shape != src_shape) { | |||
| MS_LOG(EXCEPTION) << "AddN input shapes must be equal."; | |||
| } | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but AddNCPUKernel needs 1 output."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/addn_cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/fp32/add_fp32.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/errorcode.h" | |||
| #include "utils/ms_utils.h" | |||
| #include "common/thread_pool.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void AddInt(const int *in_0, const int *in_1, int *out, int start, int end) { | |||
| int ret = ElementAddInt(in_0 + start, in_1 + start, out + start, end - start); | |||
| if (ret != NNACL_OK) { | |||
| MS_LOG(EXCEPTION) << "Add failed."; | |||
| } | |||
| } | |||
void AddNCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  // NOTE(review): CheckParam reads the input_num_ member, but input_num_ is
  // only assigned after this call — verify the intended initialization order.
  CheckParam(kernel_node);
  input_num_ = AnfAlgo::GetInputTensorNum(kernel_node);
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
  // Build a oneDNN binary-add primitive over the first two inputs; Launch
  // reuses it (with outputs[0] as accumulator) to fold in remaining inputs.
  std::vector<size_t> src0_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  std::vector<size_t> src1_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
  std::vector<size_t> dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
  dnnl::memory::desc src0_mem_desc = GetDefaultMemDesc(src0_shape);
  dnnl::memory::desc src1_mem_desc = GetDefaultMemDesc(src1_shape);
  dnnl::memory::desc dst_mem_desc = GetDefaultMemDesc(dst_shape);
  dnnl::binary::desc desc = dnnl::binary::desc(dnnl::algorithm::binary_add, src0_mem_desc, src1_mem_desc, dst_mem_desc);
  auto prim_desc = dnnl::binary::primitive_desc(desc, MKLKernelEngine::Get().engine());
  primitive_ = std::make_shared<dnnl::binary>(prim_desc);
  AddArgument(DNNL_ARG_SRC_0, src0_mem_desc);
  AddArgument(DNNL_ARG_SRC_1, src1_mem_desc);
  AddArgument(DNNL_ARG_DST, dst_mem_desc);
}
bool AddNCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                           const std::vector<kernel::AddressPtr> &outputs) {
  // Folds all inputs into outputs[0]:
  // out = inputs[0] + inputs[1]; then out += inputs[i] for each i >= 2.
  if (dtype_ == kNumberTypeFloat32) {
    // oneDNN path: run the binary-add primitive built in InitKernel; the
    // accumulation steps use outputs[0] as both SRC_0 and DST (in-place add).
    SetArgumentHandle(DNNL_ARG_SRC_0, inputs[0]->addr);
    SetArgumentHandle(DNNL_ARG_SRC_1, inputs[1]->addr);
    SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr);
    ExecutePrimitive();
    for (size_t index = 2; index < input_num_; ++index) {
      SetArgumentHandle(DNNL_ARG_SRC_0, outputs[0]->addr);
      SetArgumentHandle(DNNL_ARG_SRC_1, inputs[index]->addr);
      SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr);
      ExecutePrimitive();
    }
  } else if (dtype_ == kNumberTypeInt32) {
    // int32 path: nnacl element-wise add, parallelized over element ranges.
    size_t elements_num = outputs[0]->size / sizeof(int);
    const auto input_0 = reinterpret_cast<int *>(inputs[0]->addr);
    const auto input_1 = reinterpret_cast<int *>(inputs[1]->addr);
    auto output = reinterpret_cast<int *>(outputs[0]->addr);
    auto task_0 = std::bind(AddInt, input_0, input_1, output, std::placeholders::_1, std::placeholders::_2);
    CPUKernelUtils::ParallelFor(task_0, elements_num);
    for (size_t index = 2; index < input_num_; ++index) {
      const auto input = reinterpret_cast<int *>(inputs[index]->addr);
      auto task = std::bind(AddInt, input, output, output, std::placeholders::_1, std::placeholders::_2);
      CPUKernelUtils::ParallelFor(task, elements_num);
    }
  } else {
    MS_LOG(EXCEPTION) << "AddN only support float32 and int32, but got " << TypeIdToType(dtype_)->ToString();
  }
  return true;
}
| void AddNCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| auto src0_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| auto dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| if (src0_shape != dst_shape) { | |||
| MS_LOG(EXCEPTION) << "AddN output shape must be equal to input shape."; | |||
| } | |||
| for (size_t index = 1; index < input_num_; ++index) { | |||
| auto src_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, index); | |||
| if (src0_shape != src_shape) { | |||
| MS_LOG(EXCEPTION) << "AddN input shapes must be equal."; | |||
| } | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but AddNCPUKernel needs 1 output."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,51 +1,51 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU kernel for AddN: elementwise sum of N same-shaped tensors.
// float32 uses a dnnl binary_add primitive; int32 uses a parallel scalar loop.
class AddNCPUKernel : public MKLCPUKernel {
 public:
  AddNCPUKernel() = default;
  ~AddNCPUKernel() override = default;
  // Builds the dnnl binary_add primitive and records memory descriptors.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Accumulates all inputs into outputs[0]; throws for unsupported dtypes.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 private:
  // Verifies all input shapes match each other and the single output's shape.
  void CheckParam(const CNodePtr &kernel_node);
  size_t input_num_{0};               // number of tensors being summed (set during init — not shown here)
  std::vector<size_t> output_shape_;  // output shape cache (populated during init — not shown here)
  TypeId dtype_{kNumberTypeFloat32};  // element type; selects the Launch code path
};
// Register AddN for float32 and int32. SetAllSameAttr(true) presumably lets the
// single input attr cover AddN's variable input count — confirm against KernelAttr.
MS_REG_CPU_KERNEL(AddN,
                  KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
                  AddNCPUKernel);
MS_REG_CPU_KERNEL(AddN,
                  KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
                  AddNCPUKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU kernel for AddN: elementwise sum of N same-shaped tensors.
// float32 uses a dnnl binary_add primitive; int32 uses a parallel scalar loop.
class AddNCPUKernel : public MKLCPUKernel {
 public:
  AddNCPUKernel() = default;
  ~AddNCPUKernel() override = default;
  // Builds the dnnl binary_add primitive and records memory descriptors.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Accumulates all inputs into outputs[0]; throws for unsupported dtypes.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 private:
  // Verifies all input shapes match each other and the single output's shape.
  void CheckParam(const CNodePtr &kernel_node);
  size_t input_num_{0};               // number of tensors being summed (set during init — not shown here)
  std::vector<size_t> output_shape_;  // output shape cache (populated during init — not shown here)
  TypeId dtype_{kNumberTypeFloat32};  // element type; selects the Launch code path
};
// Register AddN for float32 and int32. SetAllSameAttr(true) presumably lets the
// single input attr cover AddN's variable input count — confirm against KernelAttr.
MS_REG_CPU_KERNEL(AddN,
                  KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
                  AddNCPUKernel);
MS_REG_CPU_KERNEL(AddN,
                  KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
                  AddNCPUKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_ | |||
| @@ -1,178 +1,178 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/lstm_cpu_kernel.h" | |||
| #include <string> | |||
| #include "utils/ms_utils.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| const int kMaxLSTMLayer = 100; | |||
| const int kOutputWorkSpaceIndex = 3; | |||
void LstmCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
  // After default sizing, override the workspace output slot: its size is the
  // reserve size chosen by the dnnl primitive, which shape inference cannot know.
  CPUKernel::InitInputOutputSize(kernel_node);
  output_size_list_[kOutputWorkSpaceIndex] = reserve_size_;
  auto output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  auto output_type = AnfAlgo::GetOutputInferDataType(kernel_node, 0);
  auto output_types = std::vector<TypeId>(output_num, output_type);
  std::vector<std::vector<size_t>> output_shapes;
  for (size_t output_index = 0; output_index < output_num; ++output_index) {
    std::vector<size_t> shape = AnfAlgo::GetOutputInferShape(kernel_node, output_index);
    output_shapes.emplace_back(shape);
  }
  // reserve_size_ is in bytes; 4 is presumably sizeof(float) since outputs are
  // float32 — TODO confirm. Note any remainder bytes are truncated by the division.
  size_t len = reserve_size_ / 4;
  output_shapes[kOutputWorkSpaceIndex] = {len, 1};
  // Rewrite the node's inferred types/shapes so downstream passes see the
  // workspace tensor with its true runtime shape.
  AnfAlgo::SetOutputInferTypeAndShape(output_types, output_shapes, kernel_node.get());
}
// Builds the dnnl (oneDNN) lstm_forward primitive for this node: reads attrs,
// constructs memory descriptors, and registers the primitive's arguments.
void LstmCPUKernel::InitKernel(const CNodePtr &kernel_node) {
#ifdef PLATFORM_86
  // Flush denormal floats to zero to avoid very slow subnormal arithmetic on x86.
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#endif
  MS_EXCEPTION_IF_NULL(kernel_node);
  using tag = dnnl::memory::format_tag;
  using dim = dnnl::memory::dims;
  // CheckParam also populates seq_len_/batch_size_/hidden_size_/... used below.
  CheckParam(kernel_node);
  auto eng = MKLKernelEngine::Get().engine();
  // Bidirectional mode concatenates the two directions' outputs along the feature dim.
  dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional;
  if (bidirectional_) {
    direction = dnnl::rnn_direction::bidirectional_concat;
  }
  // The literal 4 in the weight/bias dims is the LSTM gate count.
  // NOTE(review): weights_dims_ uses input_size_ for every layer; for
  // num_layers_ > 1 the deeper layers' input width is hidden_size_*num_directions_
  // — verify this matches how the flat weight tensor is packed.
  dim src_dims = {seq_len_, batch_size_, input_size_};
  dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  weights_dims_ = {num_layers_, num_directions_, input_size_, 4, hidden_size_};
  weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_};
  bias_dims_ = {num_layers_, num_directions_, 4, hidden_size_};
  dim dst_dims = {seq_len_, batch_size_, static_cast<int64_t>(hidden_size_) * num_directions_};
  dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc);
  dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc);
  dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc);
  dnnl::memory::desc bias_desc = formatted_md(bias_dims_, tag::ldgo);
  dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc);
  dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc);
  dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc);
  // Nodes without the attr default to training mode (workspace is produced).
  if (!kernel_node->HasAttr(kAttrIsTraining)) {
    is_training = true;
  } else {
    is_training = GetValue<bool>(kernel_node->GetAttr(kAttrIsTraining));
  }
  auto prop_kind = dnnl::prop_kind::forward_training;
  if (!is_training) {
    prop_kind = dnnl::prop_kind::forward_inference;
  }
  // Weight descriptors use tag::any so dnnl may pick its preferred blocked
  // layout; Launch reorders user weights into that layout.
  auto desc = std::make_shared<dnnl::lstm_forward::desc>(
    prop_kind, direction, src_desc, src_h_desc, src_c_desc, formatted_md(weights_dims_, tag::any),
    formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc, dst_c_desc);
  prim_desc_ = dnnl::lstm_forward::primitive_desc(*desc, eng);
  primitive_ = std::make_shared<dnnl::lstm_forward>(prim_desc_);
  if (is_training) {
    // Training needs the workspace preserved for the backward pass.
    reserve_size_ = static_cast<size_t>(prim_desc_.workspace_desc().get_size());
    AddArgument(DNNL_ARG_WORKSPACE, prim_desc_.workspace_desc());
  } else {
    // Dummy non-zero size so the workspace output slot is still well-formed.
    reserve_size_ = 1;
  }
  AddArgument(DNNL_ARG_SRC_LAYER, src_desc);
  AddArgument(DNNL_ARG_SRC_ITER, src_h_desc);
  AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc);
  AddArgument(DNNL_ARG_WEIGHTS_LAYER, prim_desc_.weights_layer_desc());
  AddArgument(DNNL_ARG_WEIGHTS_ITER, prim_desc_.weights_iter_desc());
  AddArgument(DNNL_ARG_BIAS, bias_desc);
  AddArgument(DNNL_ARG_DST_LAYER, dst_desc);
  AddArgument(DNNL_ARG_DST_ITER, dst_h_desc);
  AddArgument(DNNL_ARG_DST_ITER_C, dst_c_desc);
}
| void LstmCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| std::vector<size_t> src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2); | |||
| bidirectional_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "bidirectional"); | |||
| input_size_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "input_size")); | |||
| hidden_size_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "hidden_size")); | |||
| num_layers_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "num_layers")); | |||
| has_bias_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "has_bias"); | |||
| batch_size_ = SizeToInt(src_shape[1]); | |||
| seq_len_ = SizeToInt(src_shape[0]); | |||
| num_directions_ = 1; | |||
| if (bidirectional_) { | |||
| num_directions_ = 2; | |||
| } | |||
| const int gate_size = 4 * hidden_size_; | |||
| if (num_layers_ <= 0) { | |||
| MS_LOG(EXCEPTION) << "Layers must be greater than zero!"; | |||
| } | |||
| if (num_layers_ > kMaxLSTMLayer) { | |||
| MS_LOG(EXCEPTION) << "Layers must be lower than 100!"; | |||
| } | |||
| for (int i = 0; i < num_layers_; ++i) { | |||
| weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_); | |||
| weight_h_size_ += gate_size * hidden_size_; | |||
| } | |||
| weight_size_ = weight_size_ * num_directions_; | |||
| weight_h_size_ = weight_h_size_ * num_directions_; | |||
| if (num_directions_ * num_layers_ != SizeToInt(src_h_shape[0])) { | |||
| MS_LOG(EXCEPTION) << "Error iteration shape!"; | |||
| } | |||
| if (src_shape.size() != 3 || src_h_shape.size() != 3 || src_c_shape.size() != 3) { | |||
| MS_LOG(EXCEPTION) << "Lstm only support 3-D input!"; | |||
| } | |||
| } | |||
bool LstmCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                           const std::vector<kernel::AddressPtr> &outputs) {
  // Runs the prebuilt lstm_forward primitive.
  // inputs:  [0]=x, [1]=h0, [2]=c0, [3]=flat weights packed as
  //          [layer weights | recurrent weights | bias] (offsets via
  //          weight_size_/weight_h_size_).
  // outputs: [0]=y, [1]=hn, [2]=cn, [3]=workspace (training only).
  using dt = dnnl::memory::data_type;
  using tag = dnnl::memory::format_tag;
  auto eng = MKLKernelEngine::Get().engine();
  // User weights arrive in ldgoi layout; reorder into whatever blocked layout
  // the primitive descriptor chose (tag::any at init time).
  auto user_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng);
  auto user_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng);
  auto weights_memory = dnnl::memory(prim_desc_.weights_layer_desc(), eng);
  auto weights_h_memory = dnnl::memory(prim_desc_.weights_iter_desc(), eng);
  user_weights_memory.set_data_handle(inputs[3]->addr);
  user_weights_h_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_);
  Reorder(&user_weights_memory, &weights_memory);
  Reorder(&user_weights_h_memory, &weights_h_memory);
  auto bias_memory = dnnl::memory(prim_desc_.bias_desc(), eng);
  if (has_bias_) {
    // Bias follows both weight sections in the flat tensor.
    bias_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_ + weight_h_size_);
  } else {
    // No bias provided: zero the primitive's own bias buffer.
    if (memset_s(bias_memory.get_data_handle(), prim_desc_.bias_desc().get_size(), 0,
                 prim_desc_.bias_desc().get_size())) {
      MS_LOG(EXCEPTION) << "Bias memset error";
    }
  }
  // set handle
  SetArgumentHandle(DNNL_ARG_SRC_LAYER, inputs[0]->addr);
  SetArgumentHandle(DNNL_ARG_SRC_ITER, inputs[1]->addr);
  SetArgumentHandle(DNNL_ARG_SRC_ITER_C, inputs[2]->addr);
  SetArgumentHandle(DNNL_ARG_WEIGHTS_LAYER, weights_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_WEIGHTS_ITER, weights_h_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_BIAS, bias_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_DST_LAYER, outputs[0]->addr);
  SetArgumentHandle(DNNL_ARG_DST_ITER, outputs[1]->addr);
  SetArgumentHandle(DNNL_ARG_DST_ITER_C, outputs[2]->addr);
  if (is_training) {
    // Expose the workspace so the backward LSTM kernel can consume it.
    SetArgumentHandle(DNNL_ARG_WORKSPACE, outputs[3]->addr);
  }
  ExecutePrimitive();
  return true;
}
| } // namespace kernel | |||
| } // namespace mindspore | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/lstm_cpu_kernel.h" | |||
| #include <string> | |||
| #include "utils/ms_utils.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| const int kMaxLSTMLayer = 100; | |||
| const int kOutputWorkSpaceIndex = 3; | |||
void LstmCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
  // After default sizing, override the workspace output slot: its size is the
  // reserve size chosen by the dnnl primitive, which shape inference cannot know.
  CPUKernel::InitInputOutputSize(kernel_node);
  output_size_list_[kOutputWorkSpaceIndex] = reserve_size_;
  auto output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  auto output_type = AnfAlgo::GetOutputInferDataType(kernel_node, 0);
  auto output_types = std::vector<TypeId>(output_num, output_type);
  std::vector<std::vector<size_t>> output_shapes;
  for (size_t output_index = 0; output_index < output_num; ++output_index) {
    std::vector<size_t> shape = AnfAlgo::GetOutputInferShape(kernel_node, output_index);
    output_shapes.emplace_back(shape);
  }
  // reserve_size_ is in bytes; 4 is presumably sizeof(float) since outputs are
  // float32 — TODO confirm. Note any remainder bytes are truncated by the division.
  size_t len = reserve_size_ / 4;
  output_shapes[kOutputWorkSpaceIndex] = {len, 1};
  // Rewrite the node's inferred types/shapes so downstream passes see the
  // workspace tensor with its true runtime shape.
  AnfAlgo::SetOutputInferTypeAndShape(output_types, output_shapes, kernel_node.get());
}
// Builds the dnnl (oneDNN) lstm_forward primitive for this node: reads attrs,
// constructs memory descriptors, and registers the primitive's arguments.
void LstmCPUKernel::InitKernel(const CNodePtr &kernel_node) {
#ifdef PLATFORM_86
  // Flush denormal floats to zero to avoid very slow subnormal arithmetic on x86.
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#endif
  MS_EXCEPTION_IF_NULL(kernel_node);
  using tag = dnnl::memory::format_tag;
  using dim = dnnl::memory::dims;
  // CheckParam also populates seq_len_/batch_size_/hidden_size_/... used below.
  CheckParam(kernel_node);
  auto eng = MKLKernelEngine::Get().engine();
  // Bidirectional mode concatenates the two directions' outputs along the feature dim.
  dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional;
  if (bidirectional_) {
    direction = dnnl::rnn_direction::bidirectional_concat;
  }
  // The literal 4 in the weight/bias dims is the LSTM gate count.
  // NOTE(review): weights_dims_ uses input_size_ for every layer; for
  // num_layers_ > 1 the deeper layers' input width is hidden_size_*num_directions_
  // — verify this matches how the flat weight tensor is packed.
  dim src_dims = {seq_len_, batch_size_, input_size_};
  dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  weights_dims_ = {num_layers_, num_directions_, input_size_, 4, hidden_size_};
  weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_};
  bias_dims_ = {num_layers_, num_directions_, 4, hidden_size_};
  dim dst_dims = {seq_len_, batch_size_, static_cast<int64_t>(hidden_size_) * num_directions_};
  dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc);
  dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc);
  dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc);
  dnnl::memory::desc bias_desc = formatted_md(bias_dims_, tag::ldgo);
  dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc);
  dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc);
  dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc);
  // Nodes without the attr default to training mode (workspace is produced).
  if (!kernel_node->HasAttr(kAttrIsTraining)) {
    is_training = true;
  } else {
    is_training = GetValue<bool>(kernel_node->GetAttr(kAttrIsTraining));
  }
  auto prop_kind = dnnl::prop_kind::forward_training;
  if (!is_training) {
    prop_kind = dnnl::prop_kind::forward_inference;
  }
  // Weight descriptors use tag::any so dnnl may pick its preferred blocked
  // layout; Launch reorders user weights into that layout.
  auto desc = std::make_shared<dnnl::lstm_forward::desc>(
    prop_kind, direction, src_desc, src_h_desc, src_c_desc, formatted_md(weights_dims_, tag::any),
    formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc, dst_c_desc);
  prim_desc_ = dnnl::lstm_forward::primitive_desc(*desc, eng);
  primitive_ = std::make_shared<dnnl::lstm_forward>(prim_desc_);
  if (is_training) {
    // Training needs the workspace preserved for the backward pass.
    reserve_size_ = static_cast<size_t>(prim_desc_.workspace_desc().get_size());
    AddArgument(DNNL_ARG_WORKSPACE, prim_desc_.workspace_desc());
  } else {
    // Dummy non-zero size so the workspace output slot is still well-formed.
    reserve_size_ = 1;
  }
  AddArgument(DNNL_ARG_SRC_LAYER, src_desc);
  AddArgument(DNNL_ARG_SRC_ITER, src_h_desc);
  AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc);
  AddArgument(DNNL_ARG_WEIGHTS_LAYER, prim_desc_.weights_layer_desc());
  AddArgument(DNNL_ARG_WEIGHTS_ITER, prim_desc_.weights_iter_desc());
  AddArgument(DNNL_ARG_BIAS, bias_desc);
  AddArgument(DNNL_ARG_DST_LAYER, dst_desc);
  AddArgument(DNNL_ARG_DST_ITER, dst_h_desc);
  AddArgument(DNNL_ARG_DST_ITER_C, dst_c_desc);
}
| void LstmCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| std::vector<size_t> src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2); | |||
| bidirectional_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "bidirectional"); | |||
| input_size_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "input_size")); | |||
| hidden_size_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "hidden_size")); | |||
| num_layers_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "num_layers")); | |||
| has_bias_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "has_bias"); | |||
| batch_size_ = SizeToInt(src_shape[1]); | |||
| seq_len_ = SizeToInt(src_shape[0]); | |||
| num_directions_ = 1; | |||
| if (bidirectional_) { | |||
| num_directions_ = 2; | |||
| } | |||
| const int gate_size = 4 * hidden_size_; | |||
| if (num_layers_ <= 0) { | |||
| MS_LOG(EXCEPTION) << "Layers must be greater than zero!"; | |||
| } | |||
| if (num_layers_ > kMaxLSTMLayer) { | |||
| MS_LOG(EXCEPTION) << "Layers must be lower than 100!"; | |||
| } | |||
| for (int i = 0; i < num_layers_; ++i) { | |||
| weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_); | |||
| weight_h_size_ += gate_size * hidden_size_; | |||
| } | |||
| weight_size_ = weight_size_ * num_directions_; | |||
| weight_h_size_ = weight_h_size_ * num_directions_; | |||
| if (num_directions_ * num_layers_ != SizeToInt(src_h_shape[0])) { | |||
| MS_LOG(EXCEPTION) << "Error iteration shape!"; | |||
| } | |||
| if (src_shape.size() != 3 || src_h_shape.size() != 3 || src_c_shape.size() != 3) { | |||
| MS_LOG(EXCEPTION) << "Lstm only support 3-D input!"; | |||
| } | |||
| } | |||
bool LstmCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                           const std::vector<kernel::AddressPtr> &outputs) {
  // Runs the prebuilt lstm_forward primitive.
  // inputs:  [0]=x, [1]=h0, [2]=c0, [3]=flat weights packed as
  //          [layer weights | recurrent weights | bias] (offsets via
  //          weight_size_/weight_h_size_).
  // outputs: [0]=y, [1]=hn, [2]=cn, [3]=workspace (training only).
  using dt = dnnl::memory::data_type;
  using tag = dnnl::memory::format_tag;
  auto eng = MKLKernelEngine::Get().engine();
  // User weights arrive in ldgoi layout; reorder into whatever blocked layout
  // the primitive descriptor chose (tag::any at init time).
  auto user_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng);
  auto user_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng);
  auto weights_memory = dnnl::memory(prim_desc_.weights_layer_desc(), eng);
  auto weights_h_memory = dnnl::memory(prim_desc_.weights_iter_desc(), eng);
  user_weights_memory.set_data_handle(inputs[3]->addr);
  user_weights_h_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_);
  Reorder(&user_weights_memory, &weights_memory);
  Reorder(&user_weights_h_memory, &weights_h_memory);
  auto bias_memory = dnnl::memory(prim_desc_.bias_desc(), eng);
  if (has_bias_) {
    // Bias follows both weight sections in the flat tensor.
    bias_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_ + weight_h_size_);
  } else {
    // No bias provided: zero the primitive's own bias buffer.
    if (memset_s(bias_memory.get_data_handle(), prim_desc_.bias_desc().get_size(), 0,
                 prim_desc_.bias_desc().get_size())) {
      MS_LOG(EXCEPTION) << "Bias memset error";
    }
  }
  // set handle
  SetArgumentHandle(DNNL_ARG_SRC_LAYER, inputs[0]->addr);
  SetArgumentHandle(DNNL_ARG_SRC_ITER, inputs[1]->addr);
  SetArgumentHandle(DNNL_ARG_SRC_ITER_C, inputs[2]->addr);
  SetArgumentHandle(DNNL_ARG_WEIGHTS_LAYER, weights_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_WEIGHTS_ITER, weights_h_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_BIAS, bias_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_DST_LAYER, outputs[0]->addr);
  SetArgumentHandle(DNNL_ARG_DST_ITER, outputs[1]->addr);
  SetArgumentHandle(DNNL_ARG_DST_ITER_C, outputs[2]->addr);
  if (is_training) {
    // Expose the workspace so the backward LSTM kernel can consume it.
    SetArgumentHandle(DNNL_ARG_WORKSPACE, outputs[3]->addr);
  }
  ExecutePrimitive();
  return true;
}
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,76 +1,76 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_ | |||
| #if defined(__x86_64__) || defined(__amd64__) || defined(_M_IX86) || defined(_M_X64) | |||
| #define PLATFORM_86 | |||
| #endif | |||
| #ifdef PLATFORM_86 | |||
| #include <pmmintrin.h> | |||
| #endif | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU LSTM forward kernel backed by the dnnl (oneDNN) lstm_forward primitive.
class LstmCPUKernel : public MKLCPUKernel {
 public:
  LstmCPUKernel() = default;
  ~LstmCPUKernel() override = default;
  // Reads node attrs/shapes and builds the dnnl lstm_forward primitive.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Executes the primitive; reorders user weights into dnnl's chosen layout.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 protected:
  // Overrides the workspace output's size with the dnnl reserve size.
  void InitInputOutputSize(const CNodePtr &kernel_node) override;

 private:
  // Validates attrs/shapes and derives the derived sizes below.
  void CheckParam(const CNodePtr &kernel_node);
  int weight_size_ = 0;    // total layer-weight element count (all layers/directions)
  int weight_h_size_ = 0;  // total recurrent-weight element count
  int input_size_;
  int hidden_size_;
  int num_layers_;
  int batch_size_;
  int seq_len_;
  int num_directions_;     // 1, or 2 when bidirectional
  bool bidirectional_;
  bool has_bias_;          // whether inputs[3] carries a bias section
  size_t reserve_size_;    // dnnl workspace size in bytes (1 when inferring)
  bool is_training;        // training produces/exposes the dnnl workspace
  dnnl::memory::dims weights_dims_;    // layer weights dims (l, d, i, 4, h)
  dnnl::memory::dims weights_h_dims_;  // recurrent weights dims (l, d, h, 4, h)
  dnnl::memory::dims bias_dims_;       // bias dims (l, d, 4, h)
  dnnl::lstm_forward::primitive_desc prim_desc_;
};
// Register LSTM: 4 float32 inputs (x, h0, c0, flat weights) and 5 float32
// outputs (per the attrs below; the extra outputs beyond y/hn/cn include the
// workspace — see kOutputWorkSpaceIndex in the .cc).
MS_REG_CPU_KERNEL(LSTM,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  LstmCPUKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_ | |||
| #if defined(__x86_64__) || defined(__amd64__) || defined(_M_IX86) || defined(_M_X64) | |||
| #define PLATFORM_86 | |||
| #endif | |||
| #ifdef PLATFORM_86 | |||
| #include <pmmintrin.h> | |||
| #endif | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU LSTM forward kernel backed by the dnnl (oneDNN) lstm_forward primitive.
class LstmCPUKernel : public MKLCPUKernel {
 public:
  LstmCPUKernel() = default;
  ~LstmCPUKernel() override = default;
  // Reads node attrs/shapes and builds the dnnl lstm_forward primitive.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Executes the primitive; reorders user weights into dnnl's chosen layout.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 protected:
  // Overrides the workspace output's size with the dnnl reserve size.
  void InitInputOutputSize(const CNodePtr &kernel_node) override;

 private:
  // Validates attrs/shapes and derives the derived sizes below.
  void CheckParam(const CNodePtr &kernel_node);
  int weight_size_ = 0;    // total layer-weight element count (all layers/directions)
  int weight_h_size_ = 0;  // total recurrent-weight element count
  int input_size_;
  int hidden_size_;
  int num_layers_;
  int batch_size_;
  int seq_len_;
  int num_directions_;     // 1, or 2 when bidirectional
  bool bidirectional_;
  bool has_bias_;          // whether inputs[3] carries a bias section
  size_t reserve_size_;    // dnnl workspace size in bytes (1 when inferring)
  bool is_training;        // training produces/exposes the dnnl workspace
  dnnl::memory::dims weights_dims_;    // layer weights dims (l, d, i, 4, h)
  dnnl::memory::dims weights_h_dims_;  // recurrent weights dims (l, d, h, 4, h)
  dnnl::memory::dims bias_dims_;       // bias dims (l, d, 4, h)
  dnnl::lstm_forward::primitive_desc prim_desc_;
};
// Register LSTM: 4 float32 inputs (x, h0, c0, flat weights) and 5 float32
// outputs (per the attrs below; the extra outputs beyond y/hn/cn include the
// workspace — see kOutputWorkSpaceIndex in the .cc).
MS_REG_CPU_KERNEL(LSTM,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  LstmCPUKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H | |||
| @@ -1,218 +1,218 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/lstm_grad_cpu_kernel.h" | |||
| #include <cstring> | |||
| #include <string> | |||
| #include "utils/ms_utils.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| const int kMaxLSTMLayer = 100; | |||
| const int kInputWorkSpaceIndex = 10; | |||
void LSTMGradCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
  // The forward pass's dnnl workspace ("reserve") arrives as input
  // kInputWorkSpaceIndex; size it to reserve_size_ (computed in InitKernel —
  // not fully visible here) rather than the shape-inferred default.
  CPUKernel::InitInputOutputSize(kernel_node);
  input_size_list_[kInputWorkSpaceIndex] = reserve_size_;
}
// Builds the oneDNN LSTM backward primitive for this node.
// A forward primitive descriptor (prop_kind::forward_training) is created
// first because the backward descriptor requires it as a hint, and its
// workspace size must be known to size the workspace input.
void LSTMGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  using tag = dnnl::memory::format_tag;
  using dim = dnnl::memory::dims;
  CheckParam(kernel_node);
  auto eng = MKLKernelEngine::Get().engine();
  dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional;
  if (bidirectional_) {
    direction = dnnl::rnn_direction::bidirectional_concat;
  }
  // Dimension letters follow oneDNN RNN conventions:
  // t = time step, n = batch, c = channels, l = layers, d = directions,
  // g = gates (4 for LSTM: i, f, g~, o), o = output channels.
  dim src_dims = {seq_len_, batch_size_, input_size_};
  dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  weights_dims_ = {num_layers_, num_directions_, input_size_, 4, hidden_size_};
  weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_};
  bias_dims_ = {num_layers_, num_directions_, 4, hidden_size_};
  dim dst_dims = {seq_len_, batch_size_, static_cast<int64_t>(hidden_size_) * num_directions_};
  dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc);
  dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc);
  dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc);
  dnnl::memory::desc bias_desc = formatted_md(bias_dims_, tag::ldgo);
  dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc);
  dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc);
  dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc);
  // tag::any lets oneDNN choose the optimal weight layout; the user-format
  // weights are reordered into that layout at Launch() time.
  auto forward_desc = std::make_shared<dnnl::lstm_forward::desc>(
    dnnl::prop_kind::forward_training, direction, src_desc, src_h_desc, src_c_desc,
    formatted_md(weights_dims_, tag::any), formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc,
    dst_c_desc);
  auto prim_forward_desc = dnnl::lstm_forward::primitive_desc(*forward_desc, eng);
  // The backward descriptor takes the forward tensor descs followed by the
  // matching diff (gradient) descs in the same order.
  auto backward_desc = std::make_shared<dnnl::lstm_backward::desc>(
    dnnl::prop_kind::backward, direction, src_desc, src_h_desc, src_c_desc, formatted_md(weights_dims_, tag::any),
    formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc, dst_c_desc, src_desc, src_h_desc,
    src_c_desc, formatted_md(weights_dims_, tag::any), formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc,
    dst_h_desc, dst_c_desc);
  prim_backward_desc_ = dnnl::lstm_backward::primitive_desc(*backward_desc, eng, prim_forward_desc);
  primitive_ = std::make_shared<dnnl::lstm_backward>(prim_backward_desc_);
  // The forward pass's workspace must be replayed into backward; its size is
  // exported via InitInputOutputSize().
  reserve_size_ = static_cast<size_t>(prim_forward_desc.workspace_desc().get_size());
  AddArgument(DNNL_ARG_WORKSPACE, prim_forward_desc.workspace_desc());
  AddArgumentOp(src_desc, src_h_desc, src_c_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc);
}
// Registers the memory descriptor for every argument of the backward
// primitive. Weight descriptors are taken from prim_backward_desc_ because
// their layout was left to oneDNN (tag::any) in InitKernel().
void LSTMGradCPUKernel::AddArgumentOp(const dnnl::memory::desc &src_desc, const dnnl::memory::desc &src_h_desc,
                                      const dnnl::memory::desc &src_c_desc, const dnnl::memory::desc &bias_desc,
                                      const dnnl::memory::desc &dst_desc, const dnnl::memory::desc &dst_h_desc,
                                      const dnnl::memory::desc &dst_c_desc) {
  // Forward-pass tensors (re-used by the backward computation).
  AddArgument(DNNL_ARG_SRC_LAYER, src_desc);
  AddArgument(DNNL_ARG_SRC_ITER, src_h_desc);
  AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc);
  AddArgument(DNNL_ARG_WEIGHTS_LAYER, prim_backward_desc_.weights_layer_desc());
  AddArgument(DNNL_ARG_WEIGHTS_ITER, prim_backward_desc_.weights_iter_desc());
  AddArgument(DNNL_ARG_BIAS, bias_desc);
  AddArgument(DNNL_ARG_DST_LAYER, dst_desc);
  AddArgument(DNNL_ARG_DST_ITER, dst_h_desc);
  AddArgument(DNNL_ARG_DST_ITER_C, dst_c_desc);
  // Gradient tensors mirror the forward descriptors one-to-one.
  AddArgument(DNNL_ARG_DIFF_SRC_LAYER, src_desc);
  AddArgument(DNNL_ARG_DIFF_SRC_ITER, src_h_desc);
  AddArgument(DNNL_ARG_DIFF_SRC_ITER_C, src_c_desc);
  AddArgument(DNNL_ARG_DIFF_WEIGHTS_LAYER, prim_backward_desc_.diff_weights_layer_desc());
  AddArgument(DNNL_ARG_DIFF_WEIGHTS_ITER, prim_backward_desc_.diff_weights_iter_desc());
  AddArgument(DNNL_ARG_DIFF_BIAS, bias_desc);
  AddArgument(DNNL_ARG_DIFF_DST_LAYER, dst_desc);
  AddArgument(DNNL_ARG_DIFF_DST_ITER, dst_h_desc);
  AddArgument(DNNL_ARG_DIFF_DST_ITER_C, dst_c_desc);
}
| void LSTMGradCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| std::vector<size_t> src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2); | |||
| bidirectional_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "bidirectional"); | |||
| input_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "input_size"); | |||
| hidden_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "hidden_size"); | |||
| num_layers_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "num_layers"); | |||
| has_bias_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "has_bias"); | |||
| batch_size_ = SizeToInt(src_shape[1]); | |||
| seq_len_ = SizeToInt(src_shape[0]); | |||
| num_directions_ = 1; | |||
| if (bidirectional_) { | |||
| num_directions_ = 2; | |||
| } | |||
| const int64_t gate_size = 4 * hidden_size_; | |||
| if (num_layers_ <= 0) { | |||
| MS_LOG(EXCEPTION) << "Layers must be greater than zero!"; | |||
| } | |||
| if (num_layers_ > kMaxLSTMLayer) { | |||
| MS_LOG(EXCEPTION) << "Layers must be lower than 100!"; | |||
| } | |||
| for (int64_t i = 0; i < num_layers_; ++i) { | |||
| weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_); | |||
| weight_h_size_ += gate_size * hidden_size_; | |||
| } | |||
| weight_size_ = weight_size_ * num_directions_; | |||
| weight_h_size_ = weight_h_size_ * num_directions_; | |||
| if (num_directions_ * num_layers_ != SizeToLong(src_h_shape[0])) { | |||
| MS_LOG(EXCEPTION) << "Error iteration shape!"; | |||
| } | |||
| if (src_shape.size() != 3 || src_h_shape.size() != 3 || src_c_shape.size() != 3) { | |||
| MS_LOG(EXCEPTION) << "Lstm only support 3-D input!"; | |||
| } | |||
| } | |||
| void LSTMGradCPUKernel::SetArgumentHandleOp(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &outputs, | |||
| const dnnl::memory &weights_memory, const dnnl::memory &weights_h_memory, | |||
| const dnnl::memory &bias_memory, const dnnl::memory &diff_weights_memory, | |||
| const dnnl::memory &diff_weights_h_memory, | |||
| const dnnl::memory &diff_bias_memory) { | |||
| SetArgumentHandle(DNNL_ARG_SRC_LAYER, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_SRC_ITER, inputs[1]->addr); | |||
| SetArgumentHandle(DNNL_ARG_SRC_ITER_C, inputs[2]->addr); | |||
| SetArgumentHandle(DNNL_ARG_WEIGHTS_LAYER, weights_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_WEIGHTS_ITER, weights_h_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_BIAS, bias_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_DST_LAYER, inputs[4]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST_ITER, inputs[5]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST_ITER_C, inputs[6]->addr); | |||
| SetArgumentHandle(DNNL_ARG_WORKSPACE, inputs[10]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_SRC_LAYER, outputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_SRC_ITER, outputs[1]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_SRC_ITER_C, outputs[2]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS_LAYER, diff_weights_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS_ITER, diff_weights_h_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_BIAS, diff_bias_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_DST_LAYER, inputs[7]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_DST_ITER, inputs[8]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_DST_ITER_C, inputs[9]->addr); | |||
| } | |||
| void LSTMGradCPUKernel::ResetMemory(const dnnl::memory &mem, const string name) const { | |||
| if (memset_s(mem.get_data_handle(), mem.get_desc().get_size(), 0, mem.get_desc().get_size())) { | |||
| MS_LOG(EXCEPTION) << name << " memset error"; | |||
| } | |||
| } | |||
// Executes the LSTM backward primitive.
// inputs[3] holds one flat float32 buffer laid out as
// [layer weights | recurrent weights | bias] (sizes weight_size_,
// weight_h_size_ from CheckParam); outputs[3] receives the gradients in the
// same layout. User buffers use tag::ldgoi and are reordered into the layout
// oneDNN selected (tag::any) before execution, then back afterwards.
bool LSTMGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                               const std::vector<kernel::AddressPtr> &outputs) {
  using dt = dnnl::memory::data_type;
  using tag = dnnl::memory::format_tag;
  auto eng = MKLKernelEngine::Get().engine();
  // construct fw memory
  auto user_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng);
  auto user_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng);
  auto weights_memory = dnnl::memory(prim_backward_desc_.weights_layer_desc(), eng);
  auto weights_h_memory = dnnl::memory(prim_backward_desc_.weights_iter_desc(), eng);
  auto bias_memory = dnnl::memory(prim_backward_desc_.bias_desc(), eng);
  // Slice the flat weight buffer: weights at offset 0, recurrent weights at
  // weight_size_, bias (if present) after both.
  user_weights_memory.set_data_handle(inputs[3]->addr);
  user_weights_h_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_);
  Reorder(&user_weights_memory, &weights_memory);
  Reorder(&user_weights_h_memory, &weights_h_memory);
  if (has_bias_) {
    bias_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_ + weight_h_size_);
  } else {
    // No bias supplied: feed the primitive a zeroed bias buffer.
    if (memset_s(bias_memory.get_data_handle(), prim_backward_desc_.bias_desc().get_size(), 0,
                 prim_backward_desc_.bias_desc().get_size())) {
      MS_LOG(EXCEPTION) << "Bias memset error";
    }
  }
  // construct bw memory
  auto diff_weights_memory = dnnl::memory(prim_backward_desc_.diff_weights_layer_desc(), eng);
  auto diff_weights_h_memory = dnnl::memory(prim_backward_desc_.diff_weights_iter_desc(), eng);
  auto diff_bias_memory = dnnl::memory(prim_backward_desc_.diff_bias_desc(), eng);
  auto user_diff_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng);
  auto user_diff_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng);
  user_diff_weights_memory.set_data_handle(outputs[3]->addr);
  user_diff_weights_h_memory.set_data_handle(reinterpret_cast<float *>(outputs[3]->addr) + weight_size_);
  // Gradient buffers are zeroed up front so the result is well-defined.
  ResetMemory(user_diff_weights_memory, "user weights grad");
  ResetMemory(user_diff_weights_h_memory, "user weights iter grad");
  ResetMemory(diff_weights_memory, "weights grad");
  ResetMemory(diff_weights_h_memory, "weights iter grad");
  if (has_bias_) {
    // Bias gradient is written directly into the tail of outputs[3].
    diff_bias_memory.set_data_handle(reinterpret_cast<float *>(outputs[3]->addr) + weight_size_ + weight_h_size_);
  }
  // Zeroed unconditionally: when has_bias_ is false this clears the
  // internally allocated scratch diff-bias buffer.
  if (memset_s(diff_bias_memory.get_data_handle(), prim_backward_desc_.diff_bias_desc().get_size(), 0,
               prim_backward_desc_.diff_bias_desc().get_size())) {
    MS_LOG(EXCEPTION) << "Bias grad memset error";
  }
  SetArgumentHandleOp(inputs, outputs, weights_memory, weights_h_memory, bias_memory, diff_weights_memory,
                      diff_weights_h_memory, diff_bias_memory);
  ExecutePrimitive();
  // Reorder the opaque-layout weight gradients back into the user's
  // ldgoi-format output buffer.
  Reorder(&diff_weights_memory, &user_diff_weights_memory);
  Reorder(&diff_weights_h_memory, &user_diff_weights_h_memory);
  return true;
}
| } // namespace kernel | |||
| } // namespace mindspore | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/lstm_grad_cpu_kernel.h" | |||
| #include <cstring> | |||
| #include <string> | |||
| #include "utils/ms_utils.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| const int kMaxLSTMLayer = 100; | |||
| const int kInputWorkSpaceIndex = 10; | |||
// Extends the base-class size initialization: the workspace input
// (index kInputWorkSpaceIndex) has no static shape on the node, so its byte
// size is patched in with reserve_size_, computed from the forward primitive
// descriptor in InitKernel().
void LSTMGradCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
  CPUKernel::InitInputOutputSize(kernel_node);
  input_size_list_[kInputWorkSpaceIndex] = reserve_size_;
}
// Builds the oneDNN LSTM backward primitive for this node.
// A forward primitive descriptor (prop_kind::forward_training) is created
// first because the backward descriptor requires it as a hint, and its
// workspace size must be known to size the workspace input.
void LSTMGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  using tag = dnnl::memory::format_tag;
  using dim = dnnl::memory::dims;
  CheckParam(kernel_node);
  auto eng = MKLKernelEngine::Get().engine();
  dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional;
  if (bidirectional_) {
    direction = dnnl::rnn_direction::bidirectional_concat;
  }
  // Dimension letters follow oneDNN RNN conventions:
  // t = time step, n = batch, c = channels, l = layers, d = directions,
  // g = gates (4 for LSTM: i, f, g~, o), o = output channels.
  dim src_dims = {seq_len_, batch_size_, input_size_};
  dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  weights_dims_ = {num_layers_, num_directions_, input_size_, 4, hidden_size_};
  weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_};
  bias_dims_ = {num_layers_, num_directions_, 4, hidden_size_};
  dim dst_dims = {seq_len_, batch_size_, static_cast<int64_t>(hidden_size_) * num_directions_};
  dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc);
  dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc);
  dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc);
  dnnl::memory::desc bias_desc = formatted_md(bias_dims_, tag::ldgo);
  dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc);
  dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc);
  dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc);
  // tag::any lets oneDNN choose the optimal weight layout; the user-format
  // weights are reordered into that layout at Launch() time.
  auto forward_desc = std::make_shared<dnnl::lstm_forward::desc>(
    dnnl::prop_kind::forward_training, direction, src_desc, src_h_desc, src_c_desc,
    formatted_md(weights_dims_, tag::any), formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc,
    dst_c_desc);
  auto prim_forward_desc = dnnl::lstm_forward::primitive_desc(*forward_desc, eng);
  // The backward descriptor takes the forward tensor descs followed by the
  // matching diff (gradient) descs in the same order.
  auto backward_desc = std::make_shared<dnnl::lstm_backward::desc>(
    dnnl::prop_kind::backward, direction, src_desc, src_h_desc, src_c_desc, formatted_md(weights_dims_, tag::any),
    formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc, dst_c_desc, src_desc, src_h_desc,
    src_c_desc, formatted_md(weights_dims_, tag::any), formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc,
    dst_h_desc, dst_c_desc);
  prim_backward_desc_ = dnnl::lstm_backward::primitive_desc(*backward_desc, eng, prim_forward_desc);
  primitive_ = std::make_shared<dnnl::lstm_backward>(prim_backward_desc_);
  // The forward pass's workspace must be replayed into backward; its size is
  // exported via InitInputOutputSize().
  reserve_size_ = static_cast<size_t>(prim_forward_desc.workspace_desc().get_size());
  AddArgument(DNNL_ARG_WORKSPACE, prim_forward_desc.workspace_desc());
  AddArgumentOp(src_desc, src_h_desc, src_c_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc);
}
// Registers the memory descriptor for every argument of the backward
// primitive. Weight descriptors are taken from prim_backward_desc_ because
// their layout was left to oneDNN (tag::any) in InitKernel().
void LSTMGradCPUKernel::AddArgumentOp(const dnnl::memory::desc &src_desc, const dnnl::memory::desc &src_h_desc,
                                      const dnnl::memory::desc &src_c_desc, const dnnl::memory::desc &bias_desc,
                                      const dnnl::memory::desc &dst_desc, const dnnl::memory::desc &dst_h_desc,
                                      const dnnl::memory::desc &dst_c_desc) {
  // Forward-pass tensors (re-used by the backward computation).
  AddArgument(DNNL_ARG_SRC_LAYER, src_desc);
  AddArgument(DNNL_ARG_SRC_ITER, src_h_desc);
  AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc);
  AddArgument(DNNL_ARG_WEIGHTS_LAYER, prim_backward_desc_.weights_layer_desc());
  AddArgument(DNNL_ARG_WEIGHTS_ITER, prim_backward_desc_.weights_iter_desc());
  AddArgument(DNNL_ARG_BIAS, bias_desc);
  AddArgument(DNNL_ARG_DST_LAYER, dst_desc);
  AddArgument(DNNL_ARG_DST_ITER, dst_h_desc);
  AddArgument(DNNL_ARG_DST_ITER_C, dst_c_desc);
  // Gradient tensors mirror the forward descriptors one-to-one.
  AddArgument(DNNL_ARG_DIFF_SRC_LAYER, src_desc);
  AddArgument(DNNL_ARG_DIFF_SRC_ITER, src_h_desc);
  AddArgument(DNNL_ARG_DIFF_SRC_ITER_C, src_c_desc);
  AddArgument(DNNL_ARG_DIFF_WEIGHTS_LAYER, prim_backward_desc_.diff_weights_layer_desc());
  AddArgument(DNNL_ARG_DIFF_WEIGHTS_ITER, prim_backward_desc_.diff_weights_iter_desc());
  AddArgument(DNNL_ARG_DIFF_BIAS, bias_desc);
  AddArgument(DNNL_ARG_DIFF_DST_LAYER, dst_desc);
  AddArgument(DNNL_ARG_DIFF_DST_ITER, dst_h_desc);
  AddArgument(DNNL_ARG_DIFF_DST_ITER_C, dst_c_desc);
}
| void LSTMGradCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| std::vector<size_t> src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2); | |||
| bidirectional_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "bidirectional"); | |||
| input_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "input_size"); | |||
| hidden_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "hidden_size"); | |||
| num_layers_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "num_layers"); | |||
| has_bias_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "has_bias"); | |||
| batch_size_ = SizeToInt(src_shape[1]); | |||
| seq_len_ = SizeToInt(src_shape[0]); | |||
| num_directions_ = 1; | |||
| if (bidirectional_) { | |||
| num_directions_ = 2; | |||
| } | |||
| const int64_t gate_size = 4 * hidden_size_; | |||
| if (num_layers_ <= 0) { | |||
| MS_LOG(EXCEPTION) << "Layers must be greater than zero!"; | |||
| } | |||
| if (num_layers_ > kMaxLSTMLayer) { | |||
| MS_LOG(EXCEPTION) << "Layers must be lower than 100!"; | |||
| } | |||
| for (int64_t i = 0; i < num_layers_; ++i) { | |||
| weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_); | |||
| weight_h_size_ += gate_size * hidden_size_; | |||
| } | |||
| weight_size_ = weight_size_ * num_directions_; | |||
| weight_h_size_ = weight_h_size_ * num_directions_; | |||
| if (num_directions_ * num_layers_ != SizeToLong(src_h_shape[0])) { | |||
| MS_LOG(EXCEPTION) << "Error iteration shape!"; | |||
| } | |||
| if (src_shape.size() != 3 || src_h_shape.size() != 3 || src_c_shape.size() != 3) { | |||
| MS_LOG(EXCEPTION) << "Lstm only support 3-D input!"; | |||
| } | |||
| } | |||
| void LSTMGradCPUKernel::SetArgumentHandleOp(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &outputs, | |||
| const dnnl::memory &weights_memory, const dnnl::memory &weights_h_memory, | |||
| const dnnl::memory &bias_memory, const dnnl::memory &diff_weights_memory, | |||
| const dnnl::memory &diff_weights_h_memory, | |||
| const dnnl::memory &diff_bias_memory) { | |||
| SetArgumentHandle(DNNL_ARG_SRC_LAYER, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_SRC_ITER, inputs[1]->addr); | |||
| SetArgumentHandle(DNNL_ARG_SRC_ITER_C, inputs[2]->addr); | |||
| SetArgumentHandle(DNNL_ARG_WEIGHTS_LAYER, weights_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_WEIGHTS_ITER, weights_h_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_BIAS, bias_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_DST_LAYER, inputs[4]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST_ITER, inputs[5]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST_ITER_C, inputs[6]->addr); | |||
| SetArgumentHandle(DNNL_ARG_WORKSPACE, inputs[10]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_SRC_LAYER, outputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_SRC_ITER, outputs[1]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_SRC_ITER_C, outputs[2]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS_LAYER, diff_weights_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS_ITER, diff_weights_h_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_BIAS, diff_bias_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_DST_LAYER, inputs[7]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_DST_ITER, inputs[8]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_DST_ITER_C, inputs[9]->addr); | |||
| } | |||
| void LSTMGradCPUKernel::ResetMemory(const dnnl::memory &mem, const string name) const { | |||
| if (memset_s(mem.get_data_handle(), mem.get_desc().get_size(), 0, mem.get_desc().get_size())) { | |||
| MS_LOG(EXCEPTION) << name << " memset error"; | |||
| } | |||
| } | |||
// Executes the LSTM backward primitive.
// inputs[3] holds one flat float32 buffer laid out as
// [layer weights | recurrent weights | bias] (sizes weight_size_,
// weight_h_size_ from CheckParam); outputs[3] receives the gradients in the
// same layout. User buffers use tag::ldgoi and are reordered into the layout
// oneDNN selected (tag::any) before execution, then back afterwards.
bool LSTMGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                               const std::vector<kernel::AddressPtr> &outputs) {
  using dt = dnnl::memory::data_type;
  using tag = dnnl::memory::format_tag;
  auto eng = MKLKernelEngine::Get().engine();
  // construct fw memory
  auto user_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng);
  auto user_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng);
  auto weights_memory = dnnl::memory(prim_backward_desc_.weights_layer_desc(), eng);
  auto weights_h_memory = dnnl::memory(prim_backward_desc_.weights_iter_desc(), eng);
  auto bias_memory = dnnl::memory(prim_backward_desc_.bias_desc(), eng);
  // Slice the flat weight buffer: weights at offset 0, recurrent weights at
  // weight_size_, bias (if present) after both.
  user_weights_memory.set_data_handle(inputs[3]->addr);
  user_weights_h_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_);
  Reorder(&user_weights_memory, &weights_memory);
  Reorder(&user_weights_h_memory, &weights_h_memory);
  if (has_bias_) {
    bias_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_ + weight_h_size_);
  } else {
    // No bias supplied: feed the primitive a zeroed bias buffer.
    if (memset_s(bias_memory.get_data_handle(), prim_backward_desc_.bias_desc().get_size(), 0,
                 prim_backward_desc_.bias_desc().get_size())) {
      MS_LOG(EXCEPTION) << "Bias memset error";
    }
  }
  // construct bw memory
  auto diff_weights_memory = dnnl::memory(prim_backward_desc_.diff_weights_layer_desc(), eng);
  auto diff_weights_h_memory = dnnl::memory(prim_backward_desc_.diff_weights_iter_desc(), eng);
  auto diff_bias_memory = dnnl::memory(prim_backward_desc_.diff_bias_desc(), eng);
  auto user_diff_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng);
  auto user_diff_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng);
  user_diff_weights_memory.set_data_handle(outputs[3]->addr);
  user_diff_weights_h_memory.set_data_handle(reinterpret_cast<float *>(outputs[3]->addr) + weight_size_);
  // Gradient buffers are zeroed up front so the result is well-defined.
  ResetMemory(user_diff_weights_memory, "user weights grad");
  ResetMemory(user_diff_weights_h_memory, "user weights iter grad");
  ResetMemory(diff_weights_memory, "weights grad");
  ResetMemory(diff_weights_h_memory, "weights iter grad");
  if (has_bias_) {
    // Bias gradient is written directly into the tail of outputs[3].
    diff_bias_memory.set_data_handle(reinterpret_cast<float *>(outputs[3]->addr) + weight_size_ + weight_h_size_);
  }
  // Zeroed unconditionally: when has_bias_ is false this clears the
  // internally allocated scratch diff-bias buffer.
  if (memset_s(diff_bias_memory.get_data_handle(), prim_backward_desc_.diff_bias_desc().get_size(), 0,
               prim_backward_desc_.diff_bias_desc().get_size())) {
    MS_LOG(EXCEPTION) << "Bias grad memset error";
  }
  SetArgumentHandleOp(inputs, outputs, weights_memory, weights_h_memory, bias_memory, diff_weights_memory,
                      diff_weights_h_memory, diff_bias_memory);
  ExecutePrimitive();
  // Reorder the opaque-layout weight gradients back into the user's
  // ldgoi-format output buffer.
  Reorder(&diff_weights_memory, &user_diff_weights_memory);
  Reorder(&diff_weights_h_memory, &user_diff_weights_h_memory);
  return true;
}
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,87 +1,87 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_ | |||
| #include <string> | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU kernel for the LSTMGrad op, built on the oneDNN (MKL-DNN)
// lstm_backward primitive. Computes gradients with respect to the input
// sequence, initial hidden/cell states, and the flat weight tensor.
class LSTMGradCPUKernel : public MKLCPUKernel {
 public:
  LSTMGradCPUKernel() = default;
  ~LSTMGradCPUKernel() override = default;
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 protected:
  // Patches the workspace input size, which is only known after InitKernel.
  void InitInputOutputSize(const CNodePtr &kernel_node) override;

 private:
  // Registers memory descriptors for all primitive arguments.
  void AddArgumentOp(const dnnl::memory::desc &src_desc, const dnnl::memory::desc &src_h_desc,
                     const dnnl::memory::desc &src_c_desc, const dnnl::memory::desc &bias_desc,
                     const dnnl::memory::desc &dst_desc, const dnnl::memory::desc &dst_h_desc,
                     const dnnl::memory::desc &dst_c_desc);
  // Binds runtime buffer addresses to the primitive arguments.
  void SetArgumentHandleOp(const std::vector<kernel::AddressPtr> &inputs,
                           const std::vector<kernel::AddressPtr> &outputs, const dnnl::memory &weights_memory,
                           const dnnl::memory &weights_h_memory, const dnnl::memory &bias_memory,
                           const dnnl::memory &diff_weights_memory, const dnnl::memory &diff_weights_h_memory,
                           const dnnl::memory &diff_bias_memory);
  // Zero-fills a dnnl memory buffer; throws on memset_s failure.
  void ResetMemory(const dnnl::memory &mem, const string name) const;
  // Validates node attributes/shapes and computes the flat-weight sizes.
  void CheckParam(const CNodePtr &kernel_node);
  int64_t weight_size_ = 0;    // bytes-in-floats of all layer weights
  int64_t weight_h_size_ = 0;  // bytes-in-floats of all recurrent weights
  int64_t input_size_;
  int64_t hidden_size_;
  int64_t num_layers_;
  int64_t batch_size_;
  int64_t seq_len_;
  int num_directions_;  // 1, or 2 when bidirectional_
  bool bidirectional_;
  bool has_bias_;
  size_t reserve_size_;  // forward workspace byte size, set in InitKernel
  dnnl::memory::dims weights_dims_;
  dnnl::memory::dims weights_h_dims_;
  dnnl::memory::dims bias_dims_;
  dnnl::lstm_backward::primitive_desc prim_backward_desc_;
};
// Register the float32 LSTMGrad CPU kernel: 11 float32 inputs
// (x, hx, cx, weights, y, hy, cy, dy, dhy, dcy, workspace) and
// 4 float32 outputs (dx, dhx, dcx, dweights).
MS_REG_CPU_KERNEL(LSTMGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  LSTMGradCPUKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_ | |||
| #include <string> | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU kernel for the LSTMGrad op, built on the oneDNN (MKL-DNN)
// lstm_backward primitive. Computes gradients with respect to the input
// sequence, initial hidden/cell states, and the flat weight tensor.
class LSTMGradCPUKernel : public MKLCPUKernel {
 public:
  LSTMGradCPUKernel() = default;
  ~LSTMGradCPUKernel() override = default;
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 protected:
  // Patches the workspace input size, which is only known after InitKernel.
  void InitInputOutputSize(const CNodePtr &kernel_node) override;

 private:
  // Registers memory descriptors for all primitive arguments.
  void AddArgumentOp(const dnnl::memory::desc &src_desc, const dnnl::memory::desc &src_h_desc,
                     const dnnl::memory::desc &src_c_desc, const dnnl::memory::desc &bias_desc,
                     const dnnl::memory::desc &dst_desc, const dnnl::memory::desc &dst_h_desc,
                     const dnnl::memory::desc &dst_c_desc);
  // Binds runtime buffer addresses to the primitive arguments.
  void SetArgumentHandleOp(const std::vector<kernel::AddressPtr> &inputs,
                           const std::vector<kernel::AddressPtr> &outputs, const dnnl::memory &weights_memory,
                           const dnnl::memory &weights_h_memory, const dnnl::memory &bias_memory,
                           const dnnl::memory &diff_weights_memory, const dnnl::memory &diff_weights_h_memory,
                           const dnnl::memory &diff_bias_memory);
  // Zero-fills a dnnl memory buffer; throws on memset_s failure.
  void ResetMemory(const dnnl::memory &mem, const string name) const;
  // Validates node attributes/shapes and computes the flat-weight sizes.
  void CheckParam(const CNodePtr &kernel_node);
  int64_t weight_size_ = 0;    // bytes-in-floats of all layer weights
  int64_t weight_h_size_ = 0;  // bytes-in-floats of all recurrent weights
  int64_t input_size_;
  int64_t hidden_size_;
  int64_t num_layers_;
  int64_t batch_size_;
  int64_t seq_len_;
  int num_directions_;  // 1, or 2 when bidirectional_
  bool bidirectional_;
  bool has_bias_;
  size_t reserve_size_;  // forward workspace byte size, set in InitKernel
  dnnl::memory::dims weights_dims_;
  dnnl::memory::dims weights_h_dims_;
  dnnl::memory::dims bias_dims_;
  dnnl::lstm_backward::primitive_desc prim_backward_desc_;
};
// Register the float32 LSTMGrad CPU kernel: 11 float32 inputs
// (x, hx, cx, weights, y, hy, cy, dy, dhy, dcy, workspace) and
// 4 float32 outputs (dx, dhx, dcx, dweights).
MS_REG_CPU_KERNEL(LSTMGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  LSTMGradCPUKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_ | |||
| @@ -1,99 +1,99 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.h" | |||
| #include <numeric> | |||
| #include <functional> | |||
| #include <cmath> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "utils/ms_utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void SoftmaxCrossEntropyWithLogitsCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { | |||
| CPUKernel::InitInputOutputSize(kernel_node); | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| size_t type_size = sizeof(float); | |||
| std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| size_t tensor_size = std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>()); | |||
| workspace_size_list_.emplace_back(tensor_size); | |||
| } | |||
// Parses the logits shape and builds the MKL-DNN softmax primitive used by Launch.
void SoftmaxCrossEntropyWithLogitsCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  dnnl::memory::dims mem_dims;
  mem_dims.insert(mem_dims.end(), shape.begin(), shape.end());
  // Only rank-2 (batch, class) logits are supported.
  if (mem_dims.size() != 2) {
    MS_LOG(EXCEPTION) << "SoftmaxCrossEntropyWithLogits kernel dims invalid " << mem_dims.size();
  }
  batch_size_ = shape[0];
  class_num_ = shape[1];
  if (batch_size_ == 0 || class_num_ == 0) {
    MS_LOG(EXCEPTION) << "Invalid batch size or class num input!";
  }
  dnnl::memory::desc mem_desc(mem_dims, dnnl::memory::data_type::f32, dnnl::memory::format_tag::nc);
  // Softmax over axis 1 (the class dimension).
  dnnl::softmax_forward::desc desc = dnnl::softmax_forward::desc(dnnl::prop_kind::forward_training, mem_desc, 1);
  auto prim_desc = dnnl::softmax_forward::primitive_desc(desc, MKLKernelEngine::Get().engine());
  primitive_ = std::make_shared<dnnl::softmax_forward>(prim_desc);
  // Source and destination share the same (batch, class) f32 layout.
  AddArgument(DNNL_ARG_SRC, mem_desc);
  AddArgument(DNNL_ARG_DST, mem_desc);
}
| void SoftmaxCrossEntropyWithLogitsCPUKernel::ForwardPostExecute(const float *logits, const float *labels, | |||
| float *output1, float *output2) const { | |||
| float epsilon = 1e-6; | |||
| for (size_t i = 0; i < batch_size_; ++i) { | |||
| output1[i] = 0; | |||
| float loss = 0.0; | |||
| for (size_t j = 0; j < class_num_; ++j) { | |||
| float logit = logf(logits[i * class_num_ + j] <= 0.0 ? epsilon : logits[i * class_num_ + j]); | |||
| output2[i * class_num_ + j] = logits[i * class_num_ + j] - labels[i * class_num_ + j]; | |||
| loss += labels[i * class_num_ + j] * logit; | |||
| } | |||
| output1[i] = -loss; | |||
| } | |||
| } | |||
| bool SoftmaxCrossEntropyWithLogitsCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.empty() || workspace.empty() || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "Error input output size!"; | |||
| } | |||
| size_t batch_float_size = batch_size_ * sizeof(float); | |||
| size_t batch_class_float_size = class_num_ * batch_float_size; | |||
| if (inputs[0]->size != workspace[0]->size || inputs[0]->size != batch_class_float_size || | |||
| inputs[1]->size != batch_class_float_size) { | |||
| MS_LOG(EXCEPTION) << "Error input data size!"; | |||
| } | |||
| if (outputs[1]->size != batch_class_float_size || outputs[0]->size != batch_float_size) { | |||
| MS_LOG(EXCEPTION) << "Error output data size!"; | |||
| } | |||
| SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST, workspace[0]->addr); | |||
| ExecutePrimitive(); | |||
| auto labels = reinterpret_cast<float *>(inputs[1]->addr); | |||
| auto logits = reinterpret_cast<float *>(workspace[0]->addr); | |||
| auto output1 = reinterpret_cast<float *>(outputs[0]->addr); | |||
| auto output2 = reinterpret_cast<float *>(outputs[1]->addr); | |||
| ForwardPostExecute(logits, labels, output1, output2); | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.h" | |||
| #include <numeric> | |||
| #include <functional> | |||
| #include <cmath> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "utils/ms_utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void SoftmaxCrossEntropyWithLogitsCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { | |||
| CPUKernel::InitInputOutputSize(kernel_node); | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| size_t type_size = sizeof(float); | |||
| std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| size_t tensor_size = std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>()); | |||
| workspace_size_list_.emplace_back(tensor_size); | |||
| } | |||
// Parses the logits shape and builds the MKL-DNN softmax primitive used by Launch.
void SoftmaxCrossEntropyWithLogitsCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  dnnl::memory::dims mem_dims;
  mem_dims.insert(mem_dims.end(), shape.begin(), shape.end());
  // Only rank-2 (batch, class) logits are supported.
  if (mem_dims.size() != 2) {
    MS_LOG(EXCEPTION) << "SoftmaxCrossEntropyWithLogits kernel dims invalid " << mem_dims.size();
  }
  batch_size_ = shape[0];
  class_num_ = shape[1];
  if (batch_size_ == 0 || class_num_ == 0) {
    MS_LOG(EXCEPTION) << "Invalid batch size or class num input!";
  }
  dnnl::memory::desc mem_desc(mem_dims, dnnl::memory::data_type::f32, dnnl::memory::format_tag::nc);
  // Softmax over axis 1 (the class dimension).
  dnnl::softmax_forward::desc desc = dnnl::softmax_forward::desc(dnnl::prop_kind::forward_training, mem_desc, 1);
  auto prim_desc = dnnl::softmax_forward::primitive_desc(desc, MKLKernelEngine::Get().engine());
  primitive_ = std::make_shared<dnnl::softmax_forward>(prim_desc);
  // Source and destination share the same (batch, class) f32 layout.
  AddArgument(DNNL_ARG_SRC, mem_desc);
  AddArgument(DNNL_ARG_DST, mem_desc);
}
| void SoftmaxCrossEntropyWithLogitsCPUKernel::ForwardPostExecute(const float *logits, const float *labels, | |||
| float *output1, float *output2) const { | |||
| float epsilon = 1e-6; | |||
| for (size_t i = 0; i < batch_size_; ++i) { | |||
| output1[i] = 0; | |||
| float loss = 0.0; | |||
| for (size_t j = 0; j < class_num_; ++j) { | |||
| float logit = logf(logits[i * class_num_ + j] <= 0.0 ? epsilon : logits[i * class_num_ + j]); | |||
| output2[i * class_num_ + j] = logits[i * class_num_ + j] - labels[i * class_num_ + j]; | |||
| loss += labels[i * class_num_ + j] * logit; | |||
| } | |||
| output1[i] = -loss; | |||
| } | |||
| } | |||
| bool SoftmaxCrossEntropyWithLogitsCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.empty() || workspace.empty() || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "Error input output size!"; | |||
| } | |||
| size_t batch_float_size = batch_size_ * sizeof(float); | |||
| size_t batch_class_float_size = class_num_ * batch_float_size; | |||
| if (inputs[0]->size != workspace[0]->size || inputs[0]->size != batch_class_float_size || | |||
| inputs[1]->size != batch_class_float_size) { | |||
| MS_LOG(EXCEPTION) << "Error input data size!"; | |||
| } | |||
| if (outputs[1]->size != batch_class_float_size || outputs[0]->size != batch_float_size) { | |||
| MS_LOG(EXCEPTION) << "Error output data size!"; | |||
| } | |||
| SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST, workspace[0]->addr); | |||
| ExecutePrimitive(); | |||
| auto labels = reinterpret_cast<float *>(inputs[1]->addr); | |||
| auto logits = reinterpret_cast<float *>(workspace[0]->addr); | |||
| auto output1 = reinterpret_cast<float *>(outputs[0]->addr); | |||
| auto output2 = reinterpret_cast<float *>(outputs[1]->addr); | |||
| ForwardPostExecute(logits, labels, output1, output2); | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,53 +1,53 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU kernel for SoftmaxCrossEntropyWithLogits: runs the MKL-DNN softmax
// primitive, then a hand-written pass producing the per-sample loss
// (output 0) and the gradient softmax - labels (output 1).
class SoftmaxCrossEntropyWithLogitsCPUKernel : public MKLCPUKernel {
 public:
  SoftmaxCrossEntropyWithLogitsCPUKernel() = default;
  ~SoftmaxCrossEntropyWithLogitsCPUKernel() override = default;
  // Parses the (batch, class) logits shape and builds the softmax primitive.
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 protected:
  // Adds a workspace buffer sized to hold the softmax of input 0.
  void InitInputOutputSize(const CNodePtr &kernel_node) override;
 private:
  // Computes loss and gradient from the softmax probabilities.
  void ForwardPostExecute(const float *logits, const float *labels, float *output1, float *output2) const;
  size_t class_num_{0};
  size_t batch_size_{0};
};
// Registers the kernel: float32 logits and labels in, float32 loss and gradient out.
MS_REG_CPU_KERNEL(SoftmaxCrossEntropyWithLogits,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  SoftmaxCrossEntropyWithLogitsCPUKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU kernel for SoftmaxCrossEntropyWithLogits: runs the MKL-DNN softmax
// primitive, then a hand-written pass producing the per-sample loss
// (output 0) and the gradient softmax - labels (output 1).
class SoftmaxCrossEntropyWithLogitsCPUKernel : public MKLCPUKernel {
 public:
  SoftmaxCrossEntropyWithLogitsCPUKernel() = default;
  ~SoftmaxCrossEntropyWithLogitsCPUKernel() override = default;
  // Parses the (batch, class) logits shape and builds the softmax primitive.
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 protected:
  // Adds a workspace buffer sized to hold the softmax of input 0.
  void InitInputOutputSize(const CNodePtr &kernel_node) override;
 private:
  // Computes loss and gradient from the softmax probabilities.
  void ForwardPostExecute(const float *logits, const float *labels, float *output1, float *output2) const;
  size_t class_num_{0};
  size_t batch_size_{0};
};
// Registers the kernel: float32 logits and labels in, float32 loss and gradient out.
MS_REG_CPU_KERNEL(SoftmaxCrossEntropyWithLogits,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  SoftmaxCrossEntropyWithLogitsCPUKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_ | |||
| @@ -1,59 +1,59 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| #include "ps/util.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace ps { | |||
| using mindspore::ps::Util; | |||
| class PServerKernel { | |||
| public: | |||
| PServerKernel(size_t rank_id, size_t pserver_num, size_t worker_num) | |||
| : rank_id_(rank_id), pserver_num_(pserver_num), worker_num_(worker_num) {} | |||
| ~PServerKernel() = default; | |||
| PServerKernel(const PServerKernel &) = delete; | |||
| PServerKernel &operator=(const PServerKernel &) = delete; | |||
| virtual void InitKernel(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) {} | |||
| virtual void InitKernel(const CNodePtr &cnode, | |||
| const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) {} | |||
| virtual void ReInit(const std::vector<std::vector<size_t>> &) {} | |||
| virtual bool Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) = 0; | |||
| virtual void UpdateEmbeddings(float *embedding_table, const size_t *lookup_ids, const float *update_vals, | |||
| size_t ids_size) {} | |||
| virtual const std::vector<size_t> &input_sizes() const = 0; | |||
| virtual const std::vector<size_t> &output_sizes() const = 0; | |||
| virtual const std::vector<size_t> &workspace_sizes() const = 0; | |||
| protected: | |||
| virtual void ReInit(const std::vector<AddressPtr> &) {} | |||
| void Shard(std::vector<size_t> *shape, int axis); | |||
| size_t rank_id_; | |||
| size_t pserver_num_; | |||
| size_t worker_num_; | |||
| }; | |||
| } // namespace ps | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| #include "ps/util.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace ps { | |||
| using mindspore::ps::Util; | |||
| class PServerKernel { | |||
| public: | |||
| PServerKernel(size_t rank_id, size_t pserver_num, size_t worker_num) | |||
| : rank_id_(rank_id), pserver_num_(pserver_num), worker_num_(worker_num) {} | |||
| ~PServerKernel() = default; | |||
| PServerKernel(const PServerKernel &) = delete; | |||
| PServerKernel &operator=(const PServerKernel &) = delete; | |||
| virtual void InitKernel(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) {} | |||
| virtual void InitKernel(const CNodePtr &cnode, | |||
| const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) {} | |||
| virtual void ReInit(const std::vector<std::vector<size_t>> &) {} | |||
| virtual bool Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) = 0; | |||
| virtual void UpdateEmbeddings(float *embedding_table, const size_t *lookup_ids, const float *update_vals, | |||
| size_t ids_size) {} | |||
| virtual const std::vector<size_t> &input_sizes() const = 0; | |||
| virtual const std::vector<size_t> &output_sizes() const = 0; | |||
| virtual const std::vector<size_t> &workspace_sizes() const = 0; | |||
| protected: | |||
| virtual void ReInit(const std::vector<AddressPtr> &) {} | |||
| void Shard(std::vector<size_t> *shape, int axis); | |||
| size_t rank_id_; | |||
| size_t pserver_num_; | |||
| size_t worker_num_; | |||
| }; | |||
| } // namespace ps | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_ | |||
| @@ -1,138 +1,138 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/reduce_cpu_kernel.h" | |||
| #include <string> | |||
| #include <vector> | |||
| #include <algorithm> | |||
| #include <utility> | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T> | |||
| void ReduceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| auto axis_addr = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(AXIS); | |||
| if (axis_addr->isa<ValueTuple>() || axis_addr->isa<ValueList>()) { | |||
| axis_ = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, AXIS); | |||
| } else if (axis_addr->isa<Int64Imm>()) { | |||
| axis_.emplace_back(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS)); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Attribute is invalid"; | |||
| } | |||
| int dimension = input_shape_.size(); | |||
| std::transform(axis_.begin(), axis_.end(), axis_.begin(), | |||
| [dimension](const auto &a) { return a < 0 ? dimension + a : a; }); | |||
| sort(axis_.begin(), axis_.end()); | |||
| // Delete the duplicate axis. | |||
| auto last = std::unique(axis_.begin(), axis_.end()); | |||
| axis_.erase(last, axis_.end()); | |||
| auto kernel_name = AnfAlgo::GetCNodeName(kernel_node); | |||
| if constexpr (std::is_same<T, bool>::value) { | |||
| if (kernel_name == "ReduceAll") { | |||
| reduce_type_ = kReduceAll; | |||
| reduce_func_ = [](const T *input, size_t pos, T *out) { *out &= input[pos]; }; | |||
| } else if (kernel_name == "ReduceAny") { | |||
| reduce_type_ = kReduceAny; | |||
| reduce_func_ = [](const T *input, size_t pos, T *out) { *out |= input[pos]; }; | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << fullname_ << " for bool."; | |||
| } | |||
| } else { | |||
| if (kernel_name == "ReduceMax") { | |||
| reduce_type_ = kReduceMax; | |||
| reduce_func_ = [](const T *input, size_t pos, T *out) { *out = std::max(input[pos], *out); }; | |||
| } else if (kernel_name == "ReduceMin") { | |||
| reduce_type_ = kReduceMin; | |||
| reduce_func_ = [](const T *input, size_t pos, T *out) { *out = std::min(input[pos], *out); }; | |||
| } else if (kernel_name == "ReduceSum") { | |||
| reduce_type_ = kReduceSum; | |||
| reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; }; | |||
| } else if (kernel_name == "ReduceMean") { | |||
| reduce_type_ = kReduceMean; | |||
| reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; }; | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << kernel_name; | |||
| } | |||
| } | |||
| } | |||
// Runs the reduction.  Two paths:
//  - Full reduction (no axis, or rank <= 1): fold every element into one output.
//  - Axis reduction: conceptually transpose so the reduced axes are innermost,
//    then each output element is a fold over a contiguous run of `stride`
//    transposed positions.  Runs per-output in parallel.
template <typename T>
bool ReduceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                                const std::vector<kernel::AddressPtr> &outputs) {
  size_t input_size = inputs[0]->size / sizeof(T);
  auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
  if (axis_.empty() || input_shape_.empty() || input_shape_.size() == 1) {
    // Get one ret
    *output_addr = input_addr[0];
    for (size_t i = 1; i < input_size; ++i) {
      reduce_func_(input_addr, i, output_addr);
    }
    if (reduce_type_ == kReduceMean) {
      // Mean accumulated as a sum; divide by the element count here.
      *output_addr /= input_size;
    }
  } else {
    // Calculate transpose axes and stride
    // Kept axes come first, reduced axes last; `stride` is the product of the
    // reduced dimensions, i.e. the number of inputs folded per output element.
    int dimension = input_shape_.size();
    size_t stride = 1;
    std::vector<size_t> axes(input_shape_.size());
    size_t j = 0;
    size_t k = 0;
    for (int i = 0; i < dimension; ++i) {
      if (j == axis_.size() || i != axis_[j]) {
        axes[k] = i;
        ++k;
      } else {
        stride *= input_shape_[i];
        ++j;
      }
    }
    for (auto &it : axis_) {
      axes[k] = it;
      ++k;
    }
    // Calculate transpose shape
    std::vector<size_t> transpose_shape(input_shape_.size());
    for (int i = 0; i < dimension; ++i) {
      transpose_shape[i] = input_shape_[axes[i]];
    }
    size_t output_size = outputs[0]->size / sizeof(T);
    TransposeIterator base_iter(std::move(transpose_shape), std::move(axes), input_shape_);
    // Each task copies the iterator and seeks to its starting position, so
    // chunks can run independently.
    auto task = [this, &base_iter, input_addr, output_addr, stride](size_t start, size_t end) {
      auto iter = base_iter;
      iter.SetPos(start * stride);
      for (size_t i = start; i < end; ++i) {
        // Seed with the first element of the run, then fold the rest.
        output_addr[i] = input_addr[iter.GetPos()];
        iter.GenNextPos();
        for (size_t j = 1; j < stride; ++j) {
          reduce_func_(input_addr, iter.GetPos(), &output_addr[i]);
          iter.GenNextPos();
        }
        if (reduce_type_ == kReduceMean) {
          output_addr[i] /= stride;
        }
      }
    };
    CPUKernelUtils::ParallelFor(task, output_size);
  }
  return true;
}
| } // namespace kernel | |||
| } // namespace mindspore | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/reduce_cpu_kernel.h" | |||
| #include <string> | |||
| #include <vector> | |||
| #include <algorithm> | |||
| #include <utility> | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T> | |||
| void ReduceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| auto axis_addr = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(AXIS); | |||
| if (axis_addr->isa<ValueTuple>() || axis_addr->isa<ValueList>()) { | |||
| axis_ = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, AXIS); | |||
| } else if (axis_addr->isa<Int64Imm>()) { | |||
| axis_.emplace_back(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS)); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Attribute is invalid"; | |||
| } | |||
| int dimension = input_shape_.size(); | |||
| std::transform(axis_.begin(), axis_.end(), axis_.begin(), | |||
| [dimension](const auto &a) { return a < 0 ? dimension + a : a; }); | |||
| sort(axis_.begin(), axis_.end()); | |||
| // Delete the duplicate axis. | |||
| auto last = std::unique(axis_.begin(), axis_.end()); | |||
| axis_.erase(last, axis_.end()); | |||
| auto kernel_name = AnfAlgo::GetCNodeName(kernel_node); | |||
| if constexpr (std::is_same<T, bool>::value) { | |||
| if (kernel_name == "ReduceAll") { | |||
| reduce_type_ = kReduceAll; | |||
| reduce_func_ = [](const T *input, size_t pos, T *out) { *out &= input[pos]; }; | |||
| } else if (kernel_name == "ReduceAny") { | |||
| reduce_type_ = kReduceAny; | |||
| reduce_func_ = [](const T *input, size_t pos, T *out) { *out |= input[pos]; }; | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << fullname_ << " for bool."; | |||
| } | |||
| } else { | |||
| if (kernel_name == "ReduceMax") { | |||
| reduce_type_ = kReduceMax; | |||
| reduce_func_ = [](const T *input, size_t pos, T *out) { *out = std::max(input[pos], *out); }; | |||
| } else if (kernel_name == "ReduceMin") { | |||
| reduce_type_ = kReduceMin; | |||
| reduce_func_ = [](const T *input, size_t pos, T *out) { *out = std::min(input[pos], *out); }; | |||
| } else if (kernel_name == "ReduceSum") { | |||
| reduce_type_ = kReduceSum; | |||
| reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; }; | |||
| } else if (kernel_name == "ReduceMean") { | |||
| reduce_type_ = kReduceMean; | |||
| reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; }; | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << kernel_name; | |||
| } | |||
| } | |||
| } | |||
// Runs the reduction.  Two paths:
//  - Full reduction (no axis, or rank <= 1): fold every element into one output.
//  - Axis reduction: conceptually transpose so the reduced axes are innermost,
//    then each output element is a fold over a contiguous run of `stride`
//    transposed positions.  Runs per-output in parallel.
template <typename T>
bool ReduceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                                const std::vector<kernel::AddressPtr> &outputs) {
  size_t input_size = inputs[0]->size / sizeof(T);
  auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
  if (axis_.empty() || input_shape_.empty() || input_shape_.size() == 1) {
    // Get one ret
    *output_addr = input_addr[0];
    for (size_t i = 1; i < input_size; ++i) {
      reduce_func_(input_addr, i, output_addr);
    }
    if (reduce_type_ == kReduceMean) {
      // Mean accumulated as a sum; divide by the element count here.
      *output_addr /= input_size;
    }
  } else {
    // Calculate transpose axes and stride
    // Kept axes come first, reduced axes last; `stride` is the product of the
    // reduced dimensions, i.e. the number of inputs folded per output element.
    int dimension = input_shape_.size();
    size_t stride = 1;
    std::vector<size_t> axes(input_shape_.size());
    size_t j = 0;
    size_t k = 0;
    for (int i = 0; i < dimension; ++i) {
      if (j == axis_.size() || i != axis_[j]) {
        axes[k] = i;
        ++k;
      } else {
        stride *= input_shape_[i];
        ++j;
      }
    }
    for (auto &it : axis_) {
      axes[k] = it;
      ++k;
    }
    // Calculate transpose shape
    std::vector<size_t> transpose_shape(input_shape_.size());
    for (int i = 0; i < dimension; ++i) {
      transpose_shape[i] = input_shape_[axes[i]];
    }
    size_t output_size = outputs[0]->size / sizeof(T);
    TransposeIterator base_iter(std::move(transpose_shape), std::move(axes), input_shape_);
    // Each task copies the iterator and seeks to its starting position, so
    // chunks can run independently.
    auto task = [this, &base_iter, input_addr, output_addr, stride](size_t start, size_t end) {
      auto iter = base_iter;
      iter.SetPos(start * stride);
      for (size_t i = start; i < end; ++i) {
        // Seed with the first element of the run, then fold the rest.
        output_addr[i] = input_addr[iter.GetPos()];
        iter.GenNextPos();
        for (size_t j = 1; j < stride; ++j) {
          reduce_func_(input_addr, iter.GetPos(), &output_addr[i]);
          iter.GenNextPos();
        }
        if (reduce_type_ == kReduceMean) {
          output_addr[i] /= stride;
        }
      }
    };
    CPUKernelUtils::ParallelFor(task, output_size);
  }
  return true;
}
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,69 +1,69 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include <string> | |||
| #include <functional> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU kernel implementing the Reduce* family of ops
// (ReduceAll/Any/Max/Min/Sum/Mean); the concrete reduction is realized by
// reduce_func_, which accumulates one input element into the output.
template <typename T>
class ReduceCPUKernel : public CPUKernel {
 public:
  ReduceCPUKernel() = default;
  ~ReduceCPUKernel() override = default;
  // Parses the node's attributes (shape, axes, reduce kind) into the members below.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Runs the reduction over inputs[0] into outputs[0].
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 private:
  enum ReduceType { kReduceAll, kReduceAny, kReduceMax, kReduceMin, kReduceSum, kReduceMean };
  std::vector<size_t> input_shape_;  // shape of input 0
  std::vector<int64_t> axis_;        // axes to reduce; empty means reduce to a scalar
  ReduceType reduce_type_{kReduceAll};
  // Accumulator: folds input[pos] (args: base pointer, flat index) into *out.
  std::function<void(const T *, size_t, T *)> reduce_func_;
};
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, float);
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, double);
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, int32_t);
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, float);
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, double);
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, int32_t);
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, float);
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, double);
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, int32_t);
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, float);
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, double);
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, int32_t);
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(ReduceAll, KernelAttr(), ReduceCPUKernel, bool);
MS_REG_CPU_KERNEL_T(ReduceAny, KernelAttr(), ReduceCPUKernel, bool);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include <string> | |||
| #include <functional> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU kernel implementing the Reduce* family of ops
// (ReduceAll/Any/Max/Min/Sum/Mean); the concrete reduction is realized by
// reduce_func_, which accumulates one input element into the output.
template <typename T>
class ReduceCPUKernel : public CPUKernel {
 public:
  ReduceCPUKernel() = default;
  ~ReduceCPUKernel() override = default;
  // Parses the node's attributes (shape, axes, reduce kind) into the members below.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Runs the reduction over inputs[0] into outputs[0].
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 private:
  enum ReduceType { kReduceAll, kReduceAny, kReduceMax, kReduceMin, kReduceSum, kReduceMean };
  std::vector<size_t> input_shape_;  // shape of input 0
  std::vector<int64_t> axis_;        // axes to reduce; empty means reduce to a scalar
  ReduceType reduce_type_{kReduceAll};
  // Accumulator: folds input[pos] (args: base pointer, flat index) into *out.
  std::function<void(const T *, size_t, T *)> reduce_func_;
};
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, float);
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, double);
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, int32_t);
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, float);
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, double);
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, int32_t);
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, float);
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, double);
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, int32_t);
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, float);
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, double);
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, int32_t);
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(ReduceAll, KernelAttr(), ReduceCPUKernel, bool);
MS_REG_CPU_KERNEL_T(ReduceAny, KernelAttr(), ReduceCPUKernel, bool);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_ | |||
| @@ -1,91 +1,91 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/spacetodepth_cpu_kernel.h" | |||
| #include <vector> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// Caches the node's input/output device shapes and its "block_size" attribute.
// CheckParam runs first so the queries below only see a node with exactly one
// input and one output tensor.
template <typename T>
void SpaceToDepthCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  CheckParam(kernel_node);
  input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
  block_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "block_size");
}
// Scatters each input element to its SpaceToDepth output position: spatial
// block_size x block_size tiles are packed into the channel dimension.
// The indexing below reads input_pos_array[0..3], so exactly 4 input
// dimensions (batch, channel, height, width) are assumed.
template <typename T>
bool SpaceToDepthCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                      const std::vector<kernel::AddressPtr> & /*workspace*/,
                                      const std::vector<kernel::AddressPtr> &outputs) {
  auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
  size_t size = IntToSize(inputs[0]->size / sizeof(T));
  // Members are copied to locals, which the worker lambda captures by reference.
  std::vector<size_t> input_shape = input_shape_;
  std::vector<size_t> output_shape = output_shape_;
  size_t block_size = block_size_;
  size_t input_dimension = input_shape.size();
  // Row-major strides for dims 0..2 (stride of the last dim is 1).
  // NOTE(review): input_dimension == 0 would underflow the loop bound below;
  // presumably ruled out by shape inference — confirm.
  size_t input_strides[3] = {1, 1, 1};
  for (size_t i = input_dimension - 1; i >= 1; --i) {
    for (size_t j = 0; j < i; ++j) {
      input_strides[j] *= input_shape[i];
    }
  }
  auto task = [&, input_addr, output_addr](size_t start, size_t end) {
    std::vector<size_t> input_pos_array(input_dimension, 0);
    for (size_t i = start; i < end; ++i) {
      // Decompose the flat input index i into per-dimension coordinates.
      size_t tmp_pos = i;
      for (size_t j = 0; j < input_dimension - 1; ++j) {
        input_pos_array[j] = tmp_pos / input_strides[j];
        tmp_pos %= input_strides[j];
      }
      input_pos_array.back() = tmp_pos;
      // Output coordinate: batch unchanged; channel becomes
      // c + (block_size * (h % block_size) + w % block_size) * C;
      // spatial coordinates shrink to h / block_size and w / block_size.
      size_t output_pos = input_pos_array[0];
      output_pos =
        (output_pos * output_shape[1]) +
        (input_pos_array[1] +
         (block_size * (input_pos_array[2] % block_size) + input_pos_array[3] % block_size) * input_shape[1]);
      output_pos = (output_pos * output_shape[2]) + (input_pos_array[2] / block_size);
      output_pos = (output_pos * output_shape[3]) + (input_pos_array[3] / block_size);
      output_addr[output_pos] = input_addr[i];
    }
  };
  CPUKernelUtils::ParallelFor(task, size);
  return true;
}
| template <typename T> | |||
| void SpaceToDepthCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but DepthToSpaceCPUKerrnel needs 1 input."; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but DepthToSpaceCPUKernel needs 1 output."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/spacetodepth_cpu_kernel.h" | |||
| #include <vector> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// Caches the node's input/output device shapes and its "block_size" attribute.
// CheckParam runs first so the queries below only see a node with exactly one
// input and one output tensor.
template <typename T>
void SpaceToDepthCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  CheckParam(kernel_node);
  input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
  block_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "block_size");
}
// Scatters each input element to its SpaceToDepth output position: spatial
// block_size x block_size tiles are packed into the channel dimension.
// The indexing below reads input_pos_array[0..3], so exactly 4 input
// dimensions (batch, channel, height, width) are assumed.
template <typename T>
bool SpaceToDepthCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                      const std::vector<kernel::AddressPtr> & /*workspace*/,
                                      const std::vector<kernel::AddressPtr> &outputs) {
  auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
  size_t size = IntToSize(inputs[0]->size / sizeof(T));
  // Members are copied to locals, which the worker lambda captures by reference.
  std::vector<size_t> input_shape = input_shape_;
  std::vector<size_t> output_shape = output_shape_;
  size_t block_size = block_size_;
  size_t input_dimension = input_shape.size();
  // Row-major strides for dims 0..2 (stride of the last dim is 1).
  // NOTE(review): input_dimension == 0 would underflow the loop bound below;
  // presumably ruled out by shape inference — confirm.
  size_t input_strides[3] = {1, 1, 1};
  for (size_t i = input_dimension - 1; i >= 1; --i) {
    for (size_t j = 0; j < i; ++j) {
      input_strides[j] *= input_shape[i];
    }
  }
  auto task = [&, input_addr, output_addr](size_t start, size_t end) {
    std::vector<size_t> input_pos_array(input_dimension, 0);
    for (size_t i = start; i < end; ++i) {
      // Decompose the flat input index i into per-dimension coordinates.
      size_t tmp_pos = i;
      for (size_t j = 0; j < input_dimension - 1; ++j) {
        input_pos_array[j] = tmp_pos / input_strides[j];
        tmp_pos %= input_strides[j];
      }
      input_pos_array.back() = tmp_pos;
      // Output coordinate: batch unchanged; channel becomes
      // c + (block_size * (h % block_size) + w % block_size) * C;
      // spatial coordinates shrink to h / block_size and w / block_size.
      size_t output_pos = input_pos_array[0];
      output_pos =
        (output_pos * output_shape[1]) +
        (input_pos_array[1] +
         (block_size * (input_pos_array[2] % block_size) + input_pos_array[3] % block_size) * input_shape[1]);
      output_pos = (output_pos * output_shape[2]) + (input_pos_array[2] / block_size);
      output_pos = (output_pos * output_shape[3]) + (input_pos_array[3] / block_size);
      output_addr[output_pos] = input_addr[i];
    }
  };
  CPUKernelUtils::ParallelFor(task, size);
  return true;
}
| template <typename T> | |||
| void SpaceToDepthCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but DepthToSpaceCPUKerrnel needs 1 input."; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but DepthToSpaceCPUKernel needs 1 output."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,84 +1,84 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_ | |||
| #include <string> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T> | |||
| class SpaceToDepthCPUKernel : public CPUKernel { | |||
| public: | |||
| SpaceToDepthCPUKernel() = default; | |||
| ~SpaceToDepthCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| std::vector<size_t> input_shape_; | |||
| std::vector<size_t> output_shape_; | |||
| size_t block_size_; | |||
| }; | |||
| MS_REG_CPU_KERNEL_T( | |||
| SpaceToDepth, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| SpaceToDepthCPUKernel, float); | |||
| MS_REG_CPU_KERNEL_T( | |||
| SpaceToDepth, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), | |||
| SpaceToDepthCPUKernel, float16); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8), | |||
| SpaceToDepthCPUKernel, int8_t); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16), | |||
| SpaceToDepthCPUKernel, int16_t); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), | |||
| SpaceToDepthCPUKernel, int); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64), | |||
| SpaceToDepthCPUKernel, int64_t); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8), | |||
| SpaceToDepthCPUKernel, uint8_t); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16), | |||
| SpaceToDepthCPUKernel, uint16_t); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32), | |||
| SpaceToDepthCPUKernel, uint32_t); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64), | |||
| SpaceToDepthCPUKernel, uint64_t); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_ | |||
| #include <string> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T> | |||
| class SpaceToDepthCPUKernel : public CPUKernel { | |||
| public: | |||
| SpaceToDepthCPUKernel() = default; | |||
| ~SpaceToDepthCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| std::vector<size_t> input_shape_; | |||
| std::vector<size_t> output_shape_; | |||
| size_t block_size_; | |||
| }; | |||
| MS_REG_CPU_KERNEL_T( | |||
| SpaceToDepth, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| SpaceToDepthCPUKernel, float); | |||
| MS_REG_CPU_KERNEL_T( | |||
| SpaceToDepth, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), | |||
| SpaceToDepthCPUKernel, float16); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8), | |||
| SpaceToDepthCPUKernel, int8_t); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16), | |||
| SpaceToDepthCPUKernel, int16_t); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), | |||
| SpaceToDepthCPUKernel, int); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64), | |||
| SpaceToDepthCPUKernel, int64_t); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8), | |||
| SpaceToDepthCPUKernel, uint8_t); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16), | |||
| SpaceToDepthCPUKernel, uint16_t); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32), | |||
| SpaceToDepthCPUKernel, uint32_t); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64), | |||
| SpaceToDepthCPUKernel, uint64_t); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_ | |||
| @@ -1,87 +1,87 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
#include "backend/kernel_compiler/cpu/topk_cpu_kernel.h"

#include <algorithm>
#include <map>
#include <numeric>
#include <string>
#include <vector>

#include "runtime/device/cpu/cpu_device_address.h"
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T> | |||
| void TopKCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) { | |||
| if (inputs.size() != 2 || outputs.size() != 2) { | |||
| MS_LOG(EXCEPTION) << "TopK needs 2 inputs and 2 outputs, but get inputs: " << inputs.size() | |||
| << "outputs: " << outputs.size(); | |||
| } | |||
| if (inputs[0]->size != outer_size_ * inner_size_ * sizeof(T)) { | |||
| MS_LOG(EXCEPTION) << "Error input data size!"; | |||
| } | |||
| if (inputs[1]->size != sizeof(int)) { | |||
| MS_LOG(EXCEPTION) << "Input K must be int!"; | |||
| } | |||
| auto input = reinterpret_cast<T *>(inputs[0]->addr); | |||
| int k = reinterpret_cast<int *>(inputs[1]->addr)[0]; | |||
| auto output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| auto indices = reinterpret_cast<int *>(outputs[1]->addr); | |||
| if (k < 1) { | |||
| MS_LOG(EXCEPTION) << "Input k must > 0!"; | |||
| } | |||
| size_t k_num = IntToSize(std::min<int>(inner_size_, k)); | |||
| if (outputs[0]->size != outer_size_ * k_num * sizeof(T)) { | |||
| MS_LOG(EXCEPTION) << "Error output data size!"; | |||
| } | |||
| for (size_t i = 0; i < outer_size_; ++i) { | |||
| std::vector<size_t> idx(inner_size_); | |||
| auto base_input = i * inner_size_; | |||
| std::iota(idx.begin(), idx.end(), base_input); | |||
| std::stable_sort(idx.begin(), idx.end(), | |||
| [&input](size_t index_1, size_t index_2) { return input[index_1] > input[index_2]; }); | |||
| auto base_output = i * k_num; | |||
| if (!sorted_) { | |||
| std::stable_sort(idx.begin(), idx.begin() + SizeToLong(k_num)); | |||
| } | |||
| for (size_t j = 0; j < k_num; ++j) { | |||
| indices[base_output + j] = SizeToInt(idx[j]) - SizeToInt(base_input); | |||
| output[base_output + j] = input[idx[j]]; | |||
| } | |||
| } | |||
| } | |||
| void TopKCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| auto x_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| for (size_t i = 0; i < x_shape_.size() - 1; ++i) { | |||
| outer_size_ *= x_shape_[i]; | |||
| } | |||
| inner_size_ = x_shape_[x_shape_.size() - 1]; | |||
| sorted_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "sorted"); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| } | |||
| bool TopKCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (dtype_ == kNumberTypeFloat16) { | |||
| LaunchKernel<float16>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat32) { | |||
| LaunchKernel<float>(inputs, outputs); | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <string> | |||
| #include <vector> | |||
| #include <algorithm> | |||
| #include <map> | |||
| #include "backend/kernel_compiler/cpu/topk_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T> | |||
| void TopKCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) { | |||
| if (inputs.size() != 2 || outputs.size() != 2) { | |||
| MS_LOG(EXCEPTION) << "TopK needs 2 inputs and 2 outputs, but get inputs: " << inputs.size() | |||
| << "outputs: " << outputs.size(); | |||
| } | |||
| if (inputs[0]->size != outer_size_ * inner_size_ * sizeof(T)) { | |||
| MS_LOG(EXCEPTION) << "Error input data size!"; | |||
| } | |||
| if (inputs[1]->size != sizeof(int)) { | |||
| MS_LOG(EXCEPTION) << "Input K must be int!"; | |||
| } | |||
| auto input = reinterpret_cast<T *>(inputs[0]->addr); | |||
| int k = reinterpret_cast<int *>(inputs[1]->addr)[0]; | |||
| auto output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| auto indices = reinterpret_cast<int *>(outputs[1]->addr); | |||
| if (k < 1) { | |||
| MS_LOG(EXCEPTION) << "Input k must > 0!"; | |||
| } | |||
| size_t k_num = IntToSize(std::min<int>(inner_size_, k)); | |||
| if (outputs[0]->size != outer_size_ * k_num * sizeof(T)) { | |||
| MS_LOG(EXCEPTION) << "Error output data size!"; | |||
| } | |||
| for (size_t i = 0; i < outer_size_; ++i) { | |||
| std::vector<size_t> idx(inner_size_); | |||
| auto base_input = i * inner_size_; | |||
| std::iota(idx.begin(), idx.end(), base_input); | |||
| std::stable_sort(idx.begin(), idx.end(), | |||
| [&input](size_t index_1, size_t index_2) { return input[index_1] > input[index_2]; }); | |||
| auto base_output = i * k_num; | |||
| if (!sorted_) { | |||
| std::stable_sort(idx.begin(), idx.begin() + SizeToLong(k_num)); | |||
| } | |||
| for (size_t j = 0; j < k_num; ++j) { | |||
| indices[base_output + j] = SizeToInt(idx[j]) - SizeToInt(base_input); | |||
| output[base_output + j] = input[idx[j]]; | |||
| } | |||
| } | |||
| } | |||
| void TopKCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| auto x_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| for (size_t i = 0; i < x_shape_.size() - 1; ++i) { | |||
| outer_size_ *= x_shape_[i]; | |||
| } | |||
| inner_size_ = x_shape_[x_shape_.size() - 1]; | |||
| sorted_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "sorted"); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| } | |||
| bool TopKCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (dtype_ == kNumberTypeFloat16) { | |||
| LaunchKernel<float16>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat32) { | |||
| LaunchKernel<float>(inputs, outputs); | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,46 +1,46 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU kernel for the TopK op: selects the k largest values (and their
// indices) along the last axis of the input; float16/float32 are dispatched
// in Launch.
class TopKCPUKernel : public CPUKernel {
 public:
  TopKCPUKernel() = default;
  ~TopKCPUKernel() override = default;
  // Caches sizes, the "sorted" attribute and the input dtype from the node.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Type-dispatches to LaunchKernel<T>.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 private:
  // Typed implementation: per-row top-k selection.
  template <typename T>
  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
  size_t outer_size_{1};  // product of all dims except the last
  size_t inner_size_{1};  // size of the last dim (the top-k axis)
  bool sorted_{false};    // true: emit results largest-first; false: ascending index order
  TypeId dtype_{kTypeUnknown};
};
MS_REG_CPU_KERNEL(TopK, KernelAttr(), TopKCPUKernel)
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// Declaration of the TopK CPU kernel (the implementation lives in the
// matching .cc file, which is not visible from this header).
class TopKCPUKernel : public CPUKernel {
 public:
  TopKCPUKernel() = default;
  ~TopKCPUKernel() override = default;

  // Initializes the kernel from the graph node (defined in the .cc).
  void InitKernel(const CNodePtr &kernel_node) override;

  // Kernel entry point invoked by the CPU runtime.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 private:
  // Type-specific implementation, dispatched on dtype_.
  template <typename T>
  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);

  size_t outer_size_{1};        // presumably product of leading (batch) dims -- TODO confirm in the .cc
  size_t inner_size_{1};        // presumably size of the axis top-k runs over -- TODO confirm in the .cc
  bool sorted_{false};          // presumably the "sorted" attribute of the TopK primitive -- TODO confirm
  TypeId dtype_{kTypeUnknown};  // input device dtype
};
MS_REG_CPU_KERNEL(TopK, KernelAttr(), TopKCPUKernel)
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_ | |||
| @@ -1,159 +1,159 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/transpose_cpu_kernel.h" | |||
| #include <algorithm> | |||
| #include <vector> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "common/thread_pool.h" | |||
| #include "nnacl/fp32/transpose_fp32.h" | |||
| #include "nnacl/int8/transpose_int8.h" | |||
| #include "nnacl/errorcode.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void TransposeCPUFwdKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| auto tmp = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, "perm"); | |||
| axes_ = {tmp.begin(), tmp.end()}; | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| if (axes_.size() > MAX_TRANSPOSE_DIM_SIZE) { | |||
| MS_LOG(EXCEPTION) << "Transpose support max dimension is " << MAX_TRANSPOSE_DIM_SIZE << "D, but got " | |||
| << axes_.size() << "D."; | |||
| } | |||
| for (size_t i = 0; i < axes_.size(); ++i) { | |||
| transpose_param_.perm_[i] = SizeToInt(axes_[i]); | |||
| } | |||
| int num_axes = SizeToInt(input_shape_.size()); | |||
| transpose_param_.perm_size_ = axes_.size(); | |||
| transpose_param_.num_axes_ = num_axes; | |||
| transpose_param_.strides_[num_axes - 1] = 1; | |||
| transpose_param_.out_strides_[num_axes - 1] = 1; | |||
| for (int i = num_axes - 2; i >= 0; i--) { | |||
| transpose_param_.strides_[i] = input_shape_[i + 1] * transpose_param_.strides_[i + 1]; | |||
| transpose_param_.out_strides_[i] = output_shape_[i + 1] * transpose_param_.out_strides_[i + 1]; | |||
| } | |||
| launch_map_[kNumberTypeInt8] = &TransposeCPUFwdKernel::LaunchKernel<int8_t>; | |||
| launch_map_[kNumberTypeInt16] = &TransposeCPUFwdKernel::LaunchKernel<int16_t>; | |||
| launch_map_[kNumberTypeInt32] = &TransposeCPUFwdKernel::LaunchKernel<int>; | |||
| launch_map_[kNumberTypeInt64] = &TransposeCPUFwdKernel::LaunchKernel<int64_t>; | |||
| launch_map_[kNumberTypeUInt8] = &TransposeCPUFwdKernel::LaunchKernel<uint8_t>; | |||
| launch_map_[kNumberTypeUInt16] = &TransposeCPUFwdKernel::LaunchKernel<uint16_t>; | |||
| launch_map_[kNumberTypeUInt32] = &TransposeCPUFwdKernel::LaunchKernel<uint32_t>; | |||
| launch_map_[kNumberTypeUInt64] = &TransposeCPUFwdKernel::LaunchKernel<uint64_t>; | |||
| launch_map_[kNumberTypeFloat32] = &TransposeCPUFwdKernel::LaunchKernel<float>; | |||
| launch_map_[kNumberTypeBool] = &TransposeCPUFwdKernel::LaunchKernel<bool>; | |||
| auto iter = launch_map_.find(dtype_); | |||
| if (iter != launch_map_.end()) { | |||
| launch_func_ = iter->second; | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Input data type: " << dtype_ << "is not supported for Transpose kernel on CPU."; | |||
| } | |||
| } | |||
// Kernel entry point; the workspace argument is unused.
bool TransposeCPUFwdKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                   const std::vector<kernel::AddressPtr> &,
                                   const std::vector<kernel::AddressPtr> &outputs) {
  // launch_func_ was bound to the dtype-matching LaunchKernel<T> in
  // InitKernel, which throws for unsupported dtypes, so it is non-null here.
  launch_func_(this, inputs, outputs);
  return true;
}
| template <typename T> | |||
| void TransposeCPUFwdKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| const auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| transpose_param_.data_num_ = inputs[0]->size / sizeof(T); | |||
| int output_shape[SizeToInt(output_shape_.size())]; | |||
| for (size_t i = 0; i < output_shape_.size(); ++i) { | |||
| output_shape[i] = SizeToInt(output_shape_[i]); | |||
| } | |||
| size_t data_count = (inputs[0]->size) / sizeof(T); | |||
| if (axes_.size() <= DIMENSION_6D && data_count < MAX_TRANSPOSE_SERIAL_SIZE) { | |||
| int res = NNACL_ERR; | |||
| if constexpr (std::is_same_v<T, int8_t>) { | |||
| res = DoTransposeInt8(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, int16_t>) { | |||
| res = DoTransposeInt16(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, int32_t>) { | |||
| res = DoTransposeInt32(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, int64_t>) { | |||
| res = DoTransposeInt64(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, uint8_t>) { | |||
| res = DoTransposeUInt8(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, uint16_t>) { | |||
| res = DoTransposeUInt16(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, uint32_t>) { | |||
| res = DoTransposeUInt32(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, uint64_t>) { | |||
| res = DoTransposeUInt64(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, float>) { | |||
| res = DoTransposeFp32(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, bool>) { | |||
| res = DoTransposeBool(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } | |||
| if (res != NNACL_OK) { | |||
| MS_LOG(ERROR) << "Transpose run failed"; | |||
| } | |||
| } else { | |||
| ParallelRun(input_addr, output_addr, output_shape, data_count); | |||
| } | |||
| } | |||
// Multi-threaded transpose of `count` elements using the sync thread pool.
// Work is split into up to max_thread_num tasks; the per-task partitioning
// is done inside the nnacl TransposeDims* routine via (task_id, thread_num).
template <typename T>
void TransposeCPUFwdKernel::ParallelRun(const T *input_addr, T *output_addr, const int *output_shape, size_t count) {
  auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
  // Target roughly 128 elements per task, capped at the pool size.
  const float block_size = 128.0;
  size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
  std::vector<common::Task> tasks;
  // Select the dtype-specific nnacl multi-dimensional transpose routine.
  std::function<void(const T *, T *, const int *, TransposeParameter *, int, int)> TransposeDims;
  if constexpr (std::is_same_v<T, int8_t>) {
    TransposeDims = &TransposeDimsInt8;
  } else if constexpr (std::is_same_v<T, int16_t>) {
    TransposeDims = &TransposeDimsInt16;
  } else if constexpr (std::is_same_v<T, int32_t>) {
    TransposeDims = &TransposeDimsInt32;
  } else if constexpr (std::is_same_v<T, int64_t>) {
    TransposeDims = &TransposeDimsInt64;
  } else if constexpr (std::is_same_v<T, uint8_t>) {
    TransposeDims = &TransposeDimsUInt8;
  } else if constexpr (std::is_same_v<T, uint16_t>) {
    TransposeDims = &TransposeDimsUInt16;
  } else if constexpr (std::is_same_v<T, uint32_t>) {
    TransposeDims = &TransposeDimsUInt32;
  } else if constexpr (std::is_same_v<T, uint64_t>) {
    TransposeDims = &TransposeDimsUInt64;
  } else if constexpr (std::is_same_v<T, float>) {
    TransposeDims = &TransposeDimsFp32;
  } else if constexpr (std::is_same_v<T, bool>) {
    TransposeDims = &TransposeDimsBool;
  }
  // NOTE(review): if count == 0 then thread_num == 0 and no task is queued,
  // leaving the (empty) output untouched -- presumably intended.
  for (int task_id = 0; task_id < SizeToInt(thread_num); ++task_id) {
    // Capture task_id/thread_num by value: they change per iteration and the
    // lambdas run after this loop finishes.
    auto task = [&, task_id, thread_num]() {
      TransposeDims(input_addr, output_addr, output_shape, &transpose_param_, task_id, SizeToInt(thread_num));
      return common::SUCCESS;
    };
    tasks.emplace_back(task);
  }
  common::ThreadPool::GetInstance().SyncRun(tasks);
}
| } // namespace kernel | |||
| } // namespace mindspore | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/transpose_cpu_kernel.h" | |||
| #include <algorithm> | |||
| #include <vector> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "common/thread_pool.h" | |||
| #include "nnacl/fp32/transpose_fp32.h" | |||
| #include "nnacl/int8/transpose_int8.h" | |||
| #include "nnacl/errorcode.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void TransposeCPUFwdKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| auto tmp = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, "perm"); | |||
| axes_ = {tmp.begin(), tmp.end()}; | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| if (axes_.size() > MAX_TRANSPOSE_DIM_SIZE) { | |||
| MS_LOG(EXCEPTION) << "Transpose support max dimension is " << MAX_TRANSPOSE_DIM_SIZE << "D, but got " | |||
| << axes_.size() << "D."; | |||
| } | |||
| for (size_t i = 0; i < axes_.size(); ++i) { | |||
| transpose_param_.perm_[i] = SizeToInt(axes_[i]); | |||
| } | |||
| int num_axes = SizeToInt(input_shape_.size()); | |||
| transpose_param_.perm_size_ = axes_.size(); | |||
| transpose_param_.num_axes_ = num_axes; | |||
| transpose_param_.strides_[num_axes - 1] = 1; | |||
| transpose_param_.out_strides_[num_axes - 1] = 1; | |||
| for (int i = num_axes - 2; i >= 0; i--) { | |||
| transpose_param_.strides_[i] = input_shape_[i + 1] * transpose_param_.strides_[i + 1]; | |||
| transpose_param_.out_strides_[i] = output_shape_[i + 1] * transpose_param_.out_strides_[i + 1]; | |||
| } | |||
| launch_map_[kNumberTypeInt8] = &TransposeCPUFwdKernel::LaunchKernel<int8_t>; | |||
| launch_map_[kNumberTypeInt16] = &TransposeCPUFwdKernel::LaunchKernel<int16_t>; | |||
| launch_map_[kNumberTypeInt32] = &TransposeCPUFwdKernel::LaunchKernel<int>; | |||
| launch_map_[kNumberTypeInt64] = &TransposeCPUFwdKernel::LaunchKernel<int64_t>; | |||
| launch_map_[kNumberTypeUInt8] = &TransposeCPUFwdKernel::LaunchKernel<uint8_t>; | |||
| launch_map_[kNumberTypeUInt16] = &TransposeCPUFwdKernel::LaunchKernel<uint16_t>; | |||
| launch_map_[kNumberTypeUInt32] = &TransposeCPUFwdKernel::LaunchKernel<uint32_t>; | |||
| launch_map_[kNumberTypeUInt64] = &TransposeCPUFwdKernel::LaunchKernel<uint64_t>; | |||
| launch_map_[kNumberTypeFloat32] = &TransposeCPUFwdKernel::LaunchKernel<float>; | |||
| launch_map_[kNumberTypeBool] = &TransposeCPUFwdKernel::LaunchKernel<bool>; | |||
| auto iter = launch_map_.find(dtype_); | |||
| if (iter != launch_map_.end()) { | |||
| launch_func_ = iter->second; | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Input data type: " << dtype_ << "is not supported for Transpose kernel on CPU."; | |||
| } | |||
| } | |||
// Kernel entry point; the workspace argument is unused.
bool TransposeCPUFwdKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                   const std::vector<kernel::AddressPtr> &,
                                   const std::vector<kernel::AddressPtr> &outputs) {
  // launch_func_ was bound to the dtype-matching LaunchKernel<T> in
  // InitKernel, which throws for unsupported dtypes, so it is non-null here.
  launch_func_(this, inputs, outputs);
  return true;
}
| template <typename T> | |||
| void TransposeCPUFwdKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| const auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| transpose_param_.data_num_ = inputs[0]->size / sizeof(T); | |||
| int output_shape[SizeToInt(output_shape_.size())]; | |||
| for (size_t i = 0; i < output_shape_.size(); ++i) { | |||
| output_shape[i] = SizeToInt(output_shape_[i]); | |||
| } | |||
| size_t data_count = (inputs[0]->size) / sizeof(T); | |||
| if (axes_.size() <= DIMENSION_6D && data_count < MAX_TRANSPOSE_SERIAL_SIZE) { | |||
| int res = NNACL_ERR; | |||
| if constexpr (std::is_same_v<T, int8_t>) { | |||
| res = DoTransposeInt8(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, int16_t>) { | |||
| res = DoTransposeInt16(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, int32_t>) { | |||
| res = DoTransposeInt32(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, int64_t>) { | |||
| res = DoTransposeInt64(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, uint8_t>) { | |||
| res = DoTransposeUInt8(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, uint16_t>) { | |||
| res = DoTransposeUInt16(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, uint32_t>) { | |||
| res = DoTransposeUInt32(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, uint64_t>) { | |||
| res = DoTransposeUInt64(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, float>) { | |||
| res = DoTransposeFp32(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, bool>) { | |||
| res = DoTransposeBool(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } | |||
| if (res != NNACL_OK) { | |||
| MS_LOG(ERROR) << "Transpose run failed"; | |||
| } | |||
| } else { | |||
| ParallelRun(input_addr, output_addr, output_shape, data_count); | |||
| } | |||
| } | |||
// Multi-threaded transpose of `count` elements using the sync thread pool.
// Work is split into up to max_thread_num tasks; the per-task partitioning
// is done inside the nnacl TransposeDims* routine via (task_id, thread_num).
template <typename T>
void TransposeCPUFwdKernel::ParallelRun(const T *input_addr, T *output_addr, const int *output_shape, size_t count) {
  auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
  // Target roughly 128 elements per task, capped at the pool size.
  const float block_size = 128.0;
  size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
  std::vector<common::Task> tasks;
  // Select the dtype-specific nnacl multi-dimensional transpose routine.
  std::function<void(const T *, T *, const int *, TransposeParameter *, int, int)> TransposeDims;
  if constexpr (std::is_same_v<T, int8_t>) {
    TransposeDims = &TransposeDimsInt8;
  } else if constexpr (std::is_same_v<T, int16_t>) {
    TransposeDims = &TransposeDimsInt16;
  } else if constexpr (std::is_same_v<T, int32_t>) {
    TransposeDims = &TransposeDimsInt32;
  } else if constexpr (std::is_same_v<T, int64_t>) {
    TransposeDims = &TransposeDimsInt64;
  } else if constexpr (std::is_same_v<T, uint8_t>) {
    TransposeDims = &TransposeDimsUInt8;
  } else if constexpr (std::is_same_v<T, uint16_t>) {
    TransposeDims = &TransposeDimsUInt16;
  } else if constexpr (std::is_same_v<T, uint32_t>) {
    TransposeDims = &TransposeDimsUInt32;
  } else if constexpr (std::is_same_v<T, uint64_t>) {
    TransposeDims = &TransposeDimsUInt64;
  } else if constexpr (std::is_same_v<T, float>) {
    TransposeDims = &TransposeDimsFp32;
  } else if constexpr (std::is_same_v<T, bool>) {
    TransposeDims = &TransposeDimsBool;
  }
  // NOTE(review): if count == 0 then thread_num == 0 and no task is queued,
  // leaving the (empty) output untouched -- presumably intended.
  for (int task_id = 0; task_id < SizeToInt(thread_num); ++task_id) {
    // Capture task_id/thread_num by value: they change per iteration and the
    // lambdas run after this loop finishes.
    auto task = [&, task_id, thread_num]() {
      TransposeDims(input_addr, output_addr, output_shape, &transpose_param_, task_id, SizeToInt(thread_num));
      return common::SUCCESS;
    };
    tasks.emplace_back(task);
  }
  common::ThreadPool::GetInstance().SyncRun(tasks);
}
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,58 +1,58 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <unordered_map> | |||
| #include <memory> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| #include "nnacl/base/transpose_base.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU forward kernel for the Transpose operator: permutes the axes of a
// tensor according to the node's "perm" attribute (see
// transpose_cpu_kernel.cc for the implementation).
class TransposeCPUFwdKernel : public CPUKernel {
 public:
  TransposeCPUFwdKernel() = default;
  ~TransposeCPUFwdKernel() override = default;

  // Caches shapes, the permutation, and strides from the graph node, and
  // binds launch_func_ to the dtype-matching LaunchKernel<T>.
  void InitKernel(const CNodePtr &kernel_node) override;

  // Runs the kernel; the workspace argument is unused.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 private:
  // Serial nnacl transpose for small tensors; defers to ParallelRun for
  // large or high-rank inputs.
  template <typename T>
  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);

  // Multi-threaded transpose: splits `count` elements across the pool.
  template <typename T>
  void ParallelRun(const T *input_addr, T *output_addr, const int *output_shape, size_t count);

  TransposeParameter transpose_param_;  // perm/strides handed to nnacl
  std::vector<size_t> input_shape_;
  std::vector<size_t> output_shape_;
  std::vector<size_t> axes_;            // permutation from the "perm" attr
  TypeId dtype_{kTypeUnknown};          // input device dtype
  // Per-dtype dispatch table populated in InitKernel.
  using TypeKernel =
    std::function<void(TransposeCPUFwdKernel *, const std::vector<AddressPtr> &, const std::vector<AddressPtr> &)>;
  std::unordered_map<TypeId, TypeKernel> launch_map_;
  TypeKernel launch_func_;
};
MS_REG_CPU_KERNEL(Transpose, KernelAttr(), TransposeCPUFwdKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <unordered_map> | |||
| #include <memory> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| #include "nnacl/base/transpose_base.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU forward kernel for the Transpose operator: permutes the axes of a
// tensor according to the node's "perm" attribute (see
// transpose_cpu_kernel.cc for the implementation).
class TransposeCPUFwdKernel : public CPUKernel {
 public:
  TransposeCPUFwdKernel() = default;
  ~TransposeCPUFwdKernel() override = default;

  // Caches shapes, the permutation, and strides from the graph node, and
  // binds launch_func_ to the dtype-matching LaunchKernel<T>.
  void InitKernel(const CNodePtr &kernel_node) override;

  // Runs the kernel; the workspace argument is unused.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 private:
  // Serial nnacl transpose for small tensors; defers to ParallelRun for
  // large or high-rank inputs.
  template <typename T>
  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);

  // Multi-threaded transpose: splits `count` elements across the pool.
  template <typename T>
  void ParallelRun(const T *input_addr, T *output_addr, const int *output_shape, size_t count);

  TransposeParameter transpose_param_;  // perm/strides handed to nnacl
  std::vector<size_t> input_shape_;
  std::vector<size_t> output_shape_;
  std::vector<size_t> axes_;            // permutation from the "perm" attr
  TypeId dtype_{kTypeUnknown};          // input device dtype
  // Per-dtype dispatch table populated in InitKernel.
  using TypeKernel =
    std::function<void(TransposeCPUFwdKernel *, const std::vector<AddressPtr> &, const std::vector<AddressPtr> &)>;
  std::unordered_map<TypeId, TypeKernel> launch_map_;
  TypeKernel launch_func_;
};
MS_REG_CPU_KERNEL(Transpose, KernelAttr(), TransposeCPUFwdKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_ | |||