Browse Source

!23733 code check fix

Merge pull request !23733 from zhangbuxue/code_check_fix
tags/v1.6.0
i-robot Gitee 4 years ago
parent
commit
2010d79336
86 changed files with 1280 additions and 1184 deletions
  1. +30
    -31
      mindspore/ccsrc/backend/kernel_compiler/cpu/adam_cpu_kernel.cc
  2. +7
    -7
      mindspore/ccsrc/backend/kernel_compiler/cpu/adam_cpu_kernel.h
  3. +15
    -10
      mindspore/ccsrc/backend/kernel_compiler/cpu/adam_delta_cpu_kernel.cc
  4. +6
    -2
      mindspore/ccsrc/backend/kernel_compiler/cpu/adam_delta_cpu_kernel.h
  5. +5
    -3
      mindspore/ccsrc/backend/kernel_compiler/cpu/adam_weight_decay_cpu_kernel.cc
  6. +2
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/adam_weight_decay_cpu_kernel.h
  7. +11
    -13
      mindspore/ccsrc/backend/kernel_compiler/cpu/allgather_cpu_kernel.cc
  8. +7
    -4
      mindspore/ccsrc/backend/kernel_compiler/cpu/allgather_cpu_kernel.h
  9. +22
    -28
      mindspore/ccsrc/backend/kernel_compiler/cpu/apply_adagrad_cpu_kernel.cc
  10. +8
    -3
      mindspore/ccsrc/backend/kernel_compiler/cpu/apply_adagrad_cpu_kernel.h
  11. +14
    -8
      mindspore/ccsrc/backend/kernel_compiler/cpu/apply_momentum_cpu_kernel.cc
  12. +6
    -3
      mindspore/ccsrc/backend/kernel_compiler/cpu/apply_momentum_cpu_kernel.h
  13. +21
    -12
      mindspore/ccsrc/backend/kernel_compiler/cpu/argmax_cpu_kernel.cc
  14. +7
    -4
      mindspore/ccsrc/backend/kernel_compiler/cpu/argmax_cpu_kernel.h
  15. +1
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/argmax_with_value_cpu_kernel.cc
  16. +6
    -3
      mindspore/ccsrc/backend/kernel_compiler/cpu/argmax_with_value_cpu_kernel.h
  17. +20
    -12
      mindspore/ccsrc/backend/kernel_compiler/cpu/argmin_with_value_cpu_kernel.cc
  18. +10
    -7
      mindspore/ccsrc/backend/kernel_compiler/cpu/argmin_with_value_cpu_kernel.h
  19. +107
    -118
      mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_cpu_kernel.cc
  20. +19
    -20
      mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_cpu_kernel.h
  21. +44
    -48
      mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_logic_cpu_kernel.cc
  22. +18
    -15
      mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_logic_cpu_kernel.h
  23. +61
    -64
      mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.cc
  24. +7
    -8
      mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.h
  25. +22
    -8
      mindspore/ccsrc/backend/kernel_compiler/cpu/assign_cpu_kernel.cc
  26. +4
    -2
      mindspore/ccsrc/backend/kernel_compiler/cpu/assign_cpu_kernel.h
  27. +17
    -11
      mindspore/ccsrc/backend/kernel_compiler/cpu/bias_add_cpu_kernel.cc
  28. +3
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/bias_add_cpu_kernel.h
  29. +19
    -8
      mindspore/ccsrc/backend/kernel_compiler/cpu/bias_add_grad_cpu_kernel.cc
  30. +5
    -4
      mindspore/ccsrc/backend/kernel_compiler/cpu/bias_add_grad_cpu_kernel.h
  31. +69
    -53
      mindspore/ccsrc/backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.cc
  32. +10
    -7
      mindspore/ccsrc/backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.h
  33. +43
    -33
      mindspore/ccsrc/backend/kernel_compiler/cpu/binary_cross_entropy_grad_kernel.cc
  34. +8
    -5
      mindspore/ccsrc/backend/kernel_compiler/cpu/binary_cross_entropy_grad_kernel.h
  35. +1
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/boundingbox_decode_cpu_kernel.cc
  36. +3
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/boundingbox_decode_cpu_kernel.h
  37. +1
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/boundingbox_encode_cpu_kernel.cc
  38. +3
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/boundingbox_encode_cpu_kernel.h
  39. +19
    -22
      mindspore/ccsrc/backend/kernel_compiler/cpu/broadcast_to_cpu_kernel.cc
  40. +6
    -5
      mindspore/ccsrc/backend/kernel_compiler/cpu/broadcast_to_cpu_kernel.h
  41. +15
    -9
      mindspore/ccsrc/backend/kernel_compiler/cpu/cast_cpu_kernel.cc
  42. +4
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/cast_cpu_kernel.h
  43. +3
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/check_valid_cpu_kernel.cc
  44. +3
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/check_valid_cpu_kernel.h
  45. +19
    -17
      mindspore/ccsrc/backend/kernel_compiler/cpu/concat_cpu_kernel.cc
  46. +5
    -3
      mindspore/ccsrc/backend/kernel_compiler/cpu/concat_cpu_kernel.h
  47. +14
    -8
      mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.cc
  48. +58
    -100
      mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.h
  49. +7
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel_factory.cc
  50. +4
    -2
      mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel_factory.h
  51. +1
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/crop_and_resize_cpu_kernel.cc
  52. +3
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/crop_and_resize_cpu_kernel.h
  53. +93
    -95
      mindspore/ccsrc/backend/kernel_compiler/cpu/ctcloss_cpu_kernel.cc
  54. +17
    -16
      mindspore/ccsrc/backend/kernel_compiler/cpu/ctcloss_cpu_kernel.h
  55. +32
    -25
      mindspore/ccsrc/backend/kernel_compiler/cpu/cumsum_cpu_kernel.cc
  56. +25
    -26
      mindspore/ccsrc/backend/kernel_compiler/cpu/cumsum_cpu_kernel.h
  57. +15
    -8
      mindspore/ccsrc/backend/kernel_compiler/cpu/debug_cpu_kernel.cc
  58. +3
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/debug_cpu_kernel.h
  59. +3
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/depthtospace_cpu_kernel.h
  60. +22
    -23
      mindspore/ccsrc/backend/kernel_compiler/cpu/dropout_cpu_kernel.cc
  61. +7
    -6
      mindspore/ccsrc/backend/kernel_compiler/cpu/dropout_cpu_kernel.h
  62. +18
    -7
      mindspore/ccsrc/backend/kernel_compiler/cpu/dropout_grad_kernel.cc
  63. +5
    -4
      mindspore/ccsrc/backend/kernel_compiler/cpu/dropout_grad_kernel.h
  64. +31
    -17
      mindspore/ccsrc/backend/kernel_compiler/cpu/dynamic_assign_cpu_kernel.cc
  65. +5
    -3
      mindspore/ccsrc/backend/kernel_compiler/cpu/dynamic_assign_cpu_kernel.h
  66. +19
    -21
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/eltwise_cpu_kernel.cc
  67. +5
    -2
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/eltwise_cpu_kernel.h
  68. +10
    -3
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/log_softmax_cpu_kernel.cc
  69. +1
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/log_softmax_cpu_kernel.h
  70. +10
    -3
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/log_softmax_grad_cpu_kernel.cc
  71. +1
    -0
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/log_softmax_grad_cpu_kernel.h
  72. +21
    -12
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/lstm_cpu_kernel.cc
  73. +18
    -13
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/lstm_cpu_kernel.h
  74. +16
    -7
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/lstm_grad_cpu_kernel.cc
  75. +15
    -12
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/lstm_grad_cpu_kernel.h
  76. +17
    -13
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/matmul_cpu_kernel.cc
  77. +6
    -5
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/matmul_cpu_kernel.h
  78. +12
    -9
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.cc
  79. +8
    -6
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h
  80. +3
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.cc
  81. +7
    -4
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h
  82. +0
    -65
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/mul_cpu_kernel.cc
  83. +0
    -42
      mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/mul_cpu_kernel.h
  84. +5
    -5
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/broadcast_to.c
  85. +1
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/broadcast_to.h
  86. +6
    -6
      mindspore/lite/src/runtime/kernel/arm/fp32/broadcast_to_fp32.cc

+ 30
- 31
mindspore/ccsrc/backend/kernel_compiler/cpu/adam_cpu_kernel.cc View File

@@ -13,26 +13,32 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
#include "runtime/device/cpu/cpu_device_address.h"

#include "backend/kernel_compiler/cpu/adam_cpu_kernel.h"
#include "nnacl/errorcode.h"
#include "nnacl/fp32/adam_fp32.h"
#include "backend/kernel_compiler/cpu/nnacl/errorcode.h"
#include "backend/kernel_compiler/cpu/nnacl/fp32/adam_fp32.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "utils/ms_utils.h"

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kAdamInputsNum = 10;
constexpr size_t kAdamOutputsNum = 3;
constexpr size_t kScalarIndex = 0;
} // namespace

template <typename T>
void AdamCPUKernel::LaunchAdam(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &) {
T *var = reinterpret_cast<T *>(inputs[VAR]->addr);
T *m = reinterpret_cast<T *>(inputs[M]->addr);
T *v = reinterpret_cast<T *>(inputs[V]->addr);
float beta1_power = reinterpret_cast<float *>(inputs[BETA1_POWER]->addr)[SCALAR_INDEX];
float beta2_power = reinterpret_cast<float *>(inputs[BETA2_POWER]->addr)[SCALAR_INDEX];
float lr = reinterpret_cast<float *>(inputs[LR]->addr)[SCALAR_INDEX];
T beta1 = static_cast<T>(reinterpret_cast<float *>(inputs[BETA1]->addr)[SCALAR_INDEX]);
T beta2 = static_cast<T>(reinterpret_cast<float *>(inputs[BETA1]->addr)[SCALAR_INDEX]);
T epsilon = static_cast<T>(reinterpret_cast<float *>(inputs[EPSILON]->addr)[SCALAR_INDEX]);
float beta1_power = reinterpret_cast<float *>(inputs[BETA1_POWER]->addr)[kScalarIndex];
float beta2_power = reinterpret_cast<float *>(inputs[BETA2_POWER]->addr)[kScalarIndex];
float lr = reinterpret_cast<float *>(inputs[LR]->addr)[kScalarIndex];
T beta1 = static_cast<T>(reinterpret_cast<float *>(inputs[BETA1]->addr)[kScalarIndex]);
T beta2 = static_cast<T>(reinterpret_cast<float *>(inputs[BETA1]->addr)[kScalarIndex]);
T epsilon = static_cast<T>(reinterpret_cast<float *>(inputs[EPSILON]->addr)[kScalarIndex]);
T *gradient = reinterpret_cast<T *>(inputs[GRAD]->addr);
constexpr float ONE = 1.0;
if (beta1_power - ONE == 0) {
@@ -62,12 +68,12 @@ void AdamCPUKernel::LaunchAdamNnacl(const std::vector<kernel::AddressPtr> &input
float *var = reinterpret_cast<float *>(inputs[VAR]->addr);
float *m = reinterpret_cast<float *>(inputs[M]->addr);
float *v = reinterpret_cast<float *>(inputs[V]->addr);
float beta1_power = reinterpret_cast<float *>(inputs[BETA1_POWER]->addr)[SCALAR_INDEX];
float beta2_power = reinterpret_cast<float *>(inputs[BETA2_POWER]->addr)[SCALAR_INDEX];
float lr = reinterpret_cast<float *>(inputs[LR]->addr)[SCALAR_INDEX];
float beta1 = reinterpret_cast<float *>(inputs[BETA1]->addr)[SCALAR_INDEX];
float beta2 = reinterpret_cast<float *>(inputs[BETA2]->addr)[SCALAR_INDEX];
float epsilon = reinterpret_cast<float *>(inputs[EPSILON]->addr)[SCALAR_INDEX];
float beta1_power = reinterpret_cast<float *>(inputs[BETA1_POWER]->addr)[kScalarIndex];
float beta2_power = reinterpret_cast<float *>(inputs[BETA2_POWER]->addr)[kScalarIndex];
float lr = reinterpret_cast<float *>(inputs[LR]->addr)[kScalarIndex];
float beta1 = reinterpret_cast<float *>(inputs[BETA1]->addr)[kScalarIndex];
float beta2 = reinterpret_cast<float *>(inputs[BETA2]->addr)[kScalarIndex];
float epsilon = reinterpret_cast<float *>(inputs[EPSILON]->addr)[kScalarIndex];
float *gradient = reinterpret_cast<float *>(inputs[GRAD]->addr);
constexpr float ONE = 1.0;
if (beta1_power - ONE == 0) {
@@ -88,26 +94,20 @@ void AdamCPUKernel::LaunchAdamNnacl(const std::vector<kernel::AddressPtr> &input

void AdamCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
if (input_num != INPUT_NUMS) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but Adam needs 10 inputs.";
}
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
CHECK_KERNEL_INPUTS_NUM(input_num, kAdamInputsNum, kernel_name_);
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != OUTPUT_NUMS) {
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but Adam needs 3 outputs.";
}
use_nesterov_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "use_nesterov");
CHECK_KERNEL_OUTPUTS_NUM(output_num, kAdamOutputsNum, kernel_name_);
use_nesterov_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, USE_NESTEROV);
}

bool AdamCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
if (inputs.size() != INPUT_NUMS) {
MS_LOG(EXCEPTION) << "Input number is " << inputs.size() << ", but Adam needs 10 inputs.";
}
if (outputs.size() != OUTPUT_NUMS) {
MS_LOG(EXCEPTION) << "Output number is " << outputs.size() << ", but Adam needs 3 outputs.";
}
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kAdamInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kAdamOutputsNum, kernel_name_);

if (inputs[VAR]->size != inputs[M]->size || inputs[VAR]->size != inputs[V]->size ||
inputs[VAR]->size != inputs[GRAD]->size) {
MS_LOG(EXCEPTION) << "Error input data size!";
@@ -124,7 +124,6 @@ bool AdamCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const
LaunchAdam<float16>(inputs, outputs);
} else {
MS_LOG(EXCEPTION) << "Adam not support " << dtype_;
return false;
}
return true;
}


+ 7
- 7
mindspore/ccsrc/backend/kernel_compiler/cpu/adam_cpu_kernel.h View File

@@ -13,33 +13,33 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADAM_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADAM_CPU_KERNEL_H_

#include <vector>
#include <memory>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

namespace mindspore {
namespace kernel {
constexpr size_t SCALAR_INDEX = 0;
constexpr size_t INPUT_NUMS = 10;
constexpr size_t OUTPUT_NUMS = 3;

class AdamCPUKernel : public CPUKernel {
public:
AdamCPUKernel() = default;
~AdamCPUKernel() override = default;
template <typename T>
void LaunchAdam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);

void LaunchAdamNnacl(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
void InitKernel(const CNodePtr &kernel_node) override;
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;

private:
template <typename T>
void LaunchAdam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);

void LaunchAdamNnacl(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);

bool use_nesterov_{false};
TypeId dtype_{kTypeUnknown};
enum input_list_ { VAR, M, V, BETA1_POWER, BETA2_POWER, LR, BETA1, BETA2, EPSILON, GRAD };


+ 15
- 10
mindspore/ccsrc/backend/kernel_compiler/cpu/adam_delta_cpu_kernel.cc View File

@@ -13,20 +13,24 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <thread>

#include "backend/kernel_compiler/cpu/adam_delta_cpu_kernel.h"

#include <vector>
#include <string>
#include <memory>

#include "backend/kernel_compiler/common_utils.h"
#include "backend/kernel_compiler/cpu/nnacl/fp32/adam_fp32.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "backend/kernel_compiler/cpu/adam_delta_cpu_kernel.h"
#include "nnacl/errorcode.h"
#include "nnacl/fp32/adam_fp32.h"
#include "utils/ms_utils.h"

namespace mindspore {
namespace kernel {
constexpr size_t kAdamDeltaInputSize = 9;
namespace {
constexpr size_t kAdamDeltaInputsNum = 9;
constexpr size_t kAdamDeltaOutputsNum = 1;
} // namespace

template <typename T>
void AdamDeltaCPUKernel::LaunchAdamDelta(T *delta, T *m, T *v, float lr, float beta1, float beta2, float epsilon,
const T *gradient, size_t size) {
@@ -55,6 +59,7 @@ void AdamDeltaCPUKernel::LaunchAdamDelta(T *delta, T *m, T *v, float lr, float b

void AdamDeltaCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
std::vector<size_t> delta_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
std::vector<size_t> m_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
std::vector<size_t> v_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
@@ -86,14 +91,14 @@ void AdamDeltaCPUKernel::InitKernel(const CNodePtr &kernel_node) {

void AdamDeltaCPUKernel::CheckParams(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &outputs) const {
if (inputs.size() != kAdamDeltaInputSize) {
MS_LOG(EXCEPTION) << "Error input size!";
}
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kAdamDeltaInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kAdamDeltaOutputsNum, kernel_name_);
size_t elem_size = elem_num_ * 4;
std::vector<size_t> expect_sizes = {elem_size, elem_size, 4, 4, 4, 4, 4, 4, elem_size};
std::vector<std::string> input_names = {"m", "v", "beta1_power", "beta2_power", "lr",
"beta1", "beta2", "epsilon", "grad"};
for (size_t i = 0; i < kAdamDeltaInputSize; ++i) {
for (size_t i = 0; i < kAdamDeltaInputsNum; ++i) {
if (inputs[i]->size != expect_sizes[i]) {
MS_LOG(EXCEPTION) << "Error input " << input_names[i] << " size!";
}


+ 6
- 2
mindspore/ccsrc/backend/kernel_compiler/cpu/adam_delta_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,9 +13,12 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADAM_DELTA_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADAM_DELTA_CPU_KERNEL_H_

#include <vector>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

@@ -29,8 +32,9 @@ class AdamDeltaCPUKernel : public CPUKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;

protected:
private:
void CheckParams(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const;

template <typename T>
void LaunchAdamDelta(T *delta, T *m, T *v, float lr, float beta1, float beta2, float epsilon, const T *gradient,
size_t size);


+ 5
- 3
mindspore/ccsrc/backend/kernel_compiler/cpu/adam_weight_decay_cpu_kernel.cc View File

@@ -13,12 +13,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/adam_weight_decay_cpu_kernel.h"

#include <cmath>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"

#include "backend/kernel_compiler/cpu/nnacl/errorcode.h"
#include "backend/kernel_compiler/cpu/nnacl/fp32/adam_fp32.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "nnacl/errorcode.h"
#include "nnacl/fp32/adam_fp32.h"
#include "utils/ms_utils.h"

namespace mindspore {


+ 2
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/adam_weight_decay_cpu_kernel.h View File

@@ -13,11 +13,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADAM_WEIGHT_DECAY_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADAM_WEIGHT_DECAY_CPU_KERNEL_H_

#include <vector>
#include <memory>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"



+ 11
- 13
mindspore/ccsrc/backend/kernel_compiler/cpu/allgather_cpu_kernel.cc View File

@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/allgather_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "runtime/device/cpu/mpi/mpi_interface.h"
@@ -21,28 +22,25 @@
namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kAllGatherInputsNum = 1;
constexpr size_t kAllGatherOutputsNum = 1;
constexpr auto kRanksGroup = "group";
constexpr auto kAllGatherInputNum = 1;
} // namespace

void AllGatherCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != kAllGatherInputNum) {
MS_LOG(EXCEPTION) << "Allgather input num:" << input_num;
}

auto ranks_group = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(kRanksGroup);
if (ranks_group != nullptr) {
ranks_group_ = GetValue<std::vector<int>>(ranks_group);
} else {
MS_LOG(EXCEPTION) << "Miss attribute " << kRanksGroup;
}
CHECK_KERNEL_INPUTS_NUM(input_num, kAllGatherInputsNum, kernel_name_);
ranks_group_ = AnfAlgo::GetNodeAttr<std::vector<int>>(kernel_node, kRanksGroup);
}

bool AllGatherCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
auto input_addr = reinterpret_cast<float *>(inputs[0]->addr);
auto output_addr = reinterpret_cast<float *>(outputs[0]->addr);
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kAllGatherInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kAllGatherOutputsNum, kernel_name_);
auto *input_addr = reinterpret_cast<float *>(inputs[0]->addr);
auto *output_addr = reinterpret_cast<float *>(outputs[0]->addr);
auto input_data_num = inputs[0]->size / sizeof(float);
return MPIAllGather(input_addr, output_addr, ranks_group_, input_data_num);
}


+ 7
- 4
mindspore/ccsrc/backend/kernel_compiler/cpu/allgather_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,10 +13,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_SCATTER_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_SCATTER_CPU_KERNEL_H_

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ALLGATHER_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ALLGATHER_CPU_KERNEL_H_

#include <vector>
#include <memory>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

@@ -41,4 +44,4 @@ MS_REG_CPU_KERNEL(_HostAllGather, KernelAttr().AddInputAttr(kNumberTypeFloat32).
} // namespace kernel
} // namespace mindspore

#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_SCATTER_CPU_KERNEL_H_
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ALLGATHER_CPU_KERNEL_H_

+ 22
- 28
mindspore/ccsrc/backend/kernel_compiler/cpu/apply_adagrad_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -24,11 +24,13 @@ namespace kernel {
namespace {
constexpr size_t kSizeFloat16 = 2;
constexpr size_t kSizeFloat32 = 4;
constexpr size_t kInputSize = 4;
constexpr size_t kOutputSize = 2;
constexpr size_t kApplyAdagradInputsNum = 4;
constexpr size_t kApplyAdagradOutputsNum = 2;
} // namespace

void ApplyAdagradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
update_slots_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "update_slots");
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
}
@@ -36,47 +38,41 @@ void ApplyAdagradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
bool ApplyAdagradCPUKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs) {
CheckParam(inputs, outputs);

if (dtype_ == kNumberTypeFloat16) {
LaunchKernel<float16>(inputs, outputs);
} else if (dtype_ == kNumberTypeFloat32) {
LaunchKernel<float>(inputs, outputs);
} else {
MS_LOG(EXCEPTION) << kernel_name_ << " only support float16 and float32 on CPU, but got "
<< TypeIdToType(dtype_)->ToString();
}

return true;
}

void ApplyAdagradCPUKernel::CheckParam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
void ApplyAdagradCPUKernel::CheckParam(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &outputs) const {
// inputs: var, accum, lr, gradient
if (inputs.size() != kInputSize) {
MS_LOG(EXCEPTION) << "Input number is " << inputs.size() << ", but ApplyAdagrad needs 4 inputs.";
}

// outputs: var, accum
if (outputs.size() != kOutputSize) {
MS_LOG(EXCEPTION) << "Output number is " << outputs.size() << ", but ApplyAdagrad needs 2 outputs.";
}

CHECK_KERNEL_INPUTS_NUM(inputs.size(), kApplyAdagradInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kApplyAdagradOutputsNum, kernel_name_);
if (inputs[0]->size != inputs[1]->size || inputs[0]->size != inputs[3]->size) {
MS_LOG(EXCEPTION) << "Error input data size!";
}

if (inputs[2]->size != kSizeFloat16 && inputs[2]->size != kSizeFloat32) {
MS_LOG(EXCEPTION) << "The attribute lr and grad must be float16 or float32!";
MS_LOG(EXCEPTION) << kernel_name_ << " requires the attribute lr and grad must be float16 or float32!";
}
}

template <typename T>
void ApplyAdagradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &outputs) {
auto var = reinterpret_cast<T *>(inputs[0]->addr);
auto accum = reinterpret_cast<T *>(inputs[1]->addr);
auto lr = reinterpret_cast<T *>(inputs[2]->addr);
auto gradient = reinterpret_cast<T *>(inputs[3]->addr);
auto *var = reinterpret_cast<T *>(inputs[0]->addr);
auto *accum = reinterpret_cast<T *>(inputs[1]->addr);
const auto *lr = reinterpret_cast<T *>(inputs[2]->addr);
const auto *gradient = reinterpret_cast<T *>(inputs[3]->addr);

// multithreading
size_t length = inputs[0]->size / sizeof(T);
auto task = [this, &var, &accum, lr, gradient](size_t start, size_t end) {
auto task = [this, &var, &accum, &lr, &gradient](size_t start, size_t end) {
LaunchApplyAdagrad(var, accum, lr, gradient, start, end);
};
CPUKernelUtils::ParallelForAutoSearch(task, length, &parallel_search_info_);
@@ -87,19 +83,17 @@ void ApplyAdagradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
if (memcpy_s(output_var, outputs[0]->size, var, inputs[0]->size) != EOK) {
MS_LOG(EXCEPTION) << "Launch kernel error: memcpy failed.";
}

if (memcpy_s(output_accum, outputs[1]->size, accum, inputs[1]->size) != EOK) {
MS_LOG(EXCEPTION) << "Launch kernel error: memcpy failed.";
}
}

template <typename T>
void ApplyAdagradCPUKernel::LaunchApplyAdagrad(T const var, T const accum, const T lr, const T gradient, size_t start,
size_t end) {
void ApplyAdagradCPUKernel::LaunchApplyAdagrad(T *var, T *accum, const T *lr, const T *gradient, size_t start,
size_t end) const {
// DataType can only be float32 or float16, so eps will not be zero.
using DataType = typename std::iterator_traits<T>::value_type;
const DataType one = DataType(1);
const DataType eps = DataType(1e-6);
auto one = static_cast<T>(1);
auto eps = static_cast<T>(1e-6);
for (size_t i = start; i < end; ++i) {
// update accum: accum += grad * grad
if (update_slots_) {


+ 8
- 3
mindspore/ccsrc/backend/kernel_compiler/cpu/apply_adagrad_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,11 +13,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_APPLY_ADAGRAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_APPLY_ADAGRAD_CPU_KERNEL_H_

#include <thread>
#include <vector>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

@@ -34,11 +36,14 @@ class ApplyAdagradCPUKernel : public CPUKernel {
const std::vector<AddressPtr> &outputs) override;

private:
static void CheckParam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
void CheckParam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const;

template <typename T>
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);

template <typename T>
void LaunchApplyAdagrad(T const var, T const accum, const T lr, const T gradient, size_t start, size_t end);
void LaunchApplyAdagrad(T *var, T *accum, const T *lr, const T *gradient, size_t start, size_t end) const;

bool update_slots_{true};
TypeId dtype_{kTypeUnknown};
};


+ 14
- 8
mindspore/ccsrc/backend/kernel_compiler/cpu/apply_momentum_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/apply_momentum_cpu_kernel.h"
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
#include "runtime/device/cpu/cpu_device_address.h"
@@ -20,20 +21,25 @@

namespace mindspore {
namespace kernel {
void ApplyMomentumCPUKernel::InitKernel(const CNodePtr &) {}
namespace {
constexpr size_t kApplyMomentumInputsNum = 5;
} // namespace

void ApplyMomentumCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
}

bool ApplyMomentumCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &, const std::vector<kernel::AddressPtr> &) {
if (inputs.size() < 5) {
MS_LOG(EXCEPTION) << "Error input output size!";
}
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kApplyMomentumInputsNum, kernel_name_);
if (inputs[0]->size != inputs[1]->size || inputs[0]->size != inputs[3]->size) {
MS_LOG(EXCEPTION) << "Error input data size!";
}
auto weight = reinterpret_cast<float *>(inputs[0]->addr);
auto accumulate = reinterpret_cast<float *>(inputs[1]->addr);
auto *weight = reinterpret_cast<float *>(inputs[0]->addr);
auto *accumulate = reinterpret_cast<float *>(inputs[1]->addr);
float learning_rate = reinterpret_cast<float *>(inputs[2]->addr)[0];
auto gradient = reinterpret_cast<float *>(inputs[3]->addr);
const auto *gradient = reinterpret_cast<float *>(inputs[3]->addr);
float moment = reinterpret_cast<float *>(inputs[4]->addr)[0];
size_t elem_num = inputs[0]->size / sizeof(float);
for (size_t i = 0; i < elem_num; ++i) {


+ 6
- 3
mindspore/ccsrc/backend/kernel_compiler/cpu/apply_momentum_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,16 +13,19 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_APPLY_MOMENTUM_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_APPLY_MOMENTUM_CPU_KERNEL_H_

#include <vector>
#include <memory>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

namespace mindspore {
namespace kernel {
class ApplyMomentumCPUKernel : public MKLCPUKernel {
class ApplyMomentumCPUKernel : public CPUKernel {
public:
ApplyMomentumCPUKernel() = default;
~ApplyMomentumCPUKernel() override = default;


+ 21
- 12
mindspore/ccsrc/backend/kernel_compiler/cpu/argmax_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,12 +13,20 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/argmax_cpu_kernel.h"

#include <string>

#include "runtime/device/cpu/cpu_device_address.h"

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kArgMaxInputsNum = 1;
constexpr size_t kArgMaxOutputsNum = 1;
constexpr char kKernelName[] = "ArgMax";

size_t get_element_num(const std::vector<size_t> &shape) {
size_t size = 1;
for (size_t i = 0; i < shape.size(); i++) {
@@ -30,17 +38,14 @@ size_t get_element_num(const std::vector<size_t> &shape) {
template <typename T>
bool check_validation(const std::vector<size_t> &shape, const size_t num_before_axis, const size_t num_after_axis,
const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs) {
if (inputs.size() != 1 || outputs.size() != 1) {
MS_LOG(EXCEPTION) << "Wrong number of inputs or outputs!";
return false;
}
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kArgMaxInputsNum, kKernelName);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kArgMaxOutputsNum, kKernelName);
size_t data_size = sizeof(T);
size_t input_size = get_element_num(shape) * data_size;
size_t output_num = num_before_axis * num_after_axis;
size_t output_size = output_num * sizeof(int);
if (inputs[0]->size != input_size || outputs[0]->size != output_size) {
MS_LOG(EXCEPTION) << "Invalid input or output data size!";
return false;
}
return true;
}
@@ -49,24 +54,28 @@ bool check_validation(const std::vector<size_t> &shape, const size_t num_before_
template <typename T>
void ArgmaxCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
size_t shape_len = shape_.size();
if (shape_len == 0) {
MS_LOG(EXCEPTION) << "Shape size should be greater than 0";
}
int64_t axis = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS);
axis += SizeToLong(shape_len);
if (axis < 0) {
MS_LOG(EXCEPTION) << "Invalid axis:" << axis << ", should in range [-1, " << (shape_len - 1) << "]";
}
axis = axis % static_cast<int64_t>(shape_len);
axis = axis % SizeToLong(shape_len);
num_before_axis_ = 1;
num_after_axis_ = 1;
for (size_t i = 0; i < shape_len; i++) {
if (static_cast<int64_t>(i) < axis) {
if (SizeToLong(i) < axis) {
num_before_axis_ *= shape_[i];
} else if (static_cast<int64_t>(i) > axis) {
} else if (SizeToLong(i) > axis) {
num_after_axis_ *= shape_[i];
}
}
dim_axis_ = shape_[axis];
dim_axis_ = shape_[LongToSize(axis)];
}

template <typename T>
@@ -76,8 +85,8 @@ bool ArgmaxCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, c
return false;
}

auto input = reinterpret_cast<T *>(inputs[0]->addr);
auto output = reinterpret_cast<int32_t *>(outputs[0]->addr);
const auto *input = reinterpret_cast<T *>(inputs[0]->addr);
auto *output = reinterpret_cast<int32_t *>(outputs[0]->addr);

std::vector<float> array_axis(dim_axis_);
for (size_t i = 0; i < num_before_axis_; i++) {


+ 7
- 4
mindspore/ccsrc/backend/kernel_compiler/cpu/argmax_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,10 +13,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAX_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAX_CPU_KERNEL_H_

#include <vector>
#include <memory>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

@@ -35,9 +38,9 @@ class ArgmaxCPUKernel : public CPUKernel {

private:
std::vector<size_t> shape_;
size_t num_before_axis_;
size_t num_after_axis_;
size_t dim_axis_;
size_t num_before_axis_{0};
size_t num_after_axis_{0};
size_t dim_axis_{0};
};

MS_REG_CPU_KERNEL_T(Argmax, KernelAttr(), ArgmaxCPUKernel, float);


+ 1
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/argmax_with_value_cpu_kernel.cc View File

@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/argmax_with_value_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"



+ 6
- 3
mindspore/ccsrc/backend/kernel_compiler/cpu/argmax_with_value_cpu_kernel.h View File

@@ -13,12 +13,15 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAXWITHVALUE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAXWITHVALUE_CPU_KERNEL_H_

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAX_WITH_VALUE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAX_WITH_VALUE_CPU_KERNEL_H_

#include <vector>
#include <map>
#include <memory>
#include <algorithm>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

@@ -47,4 +50,4 @@ MS_REG_CPU_KERNEL_T(ArgMaxWithValue, KernelAttr(), ArgMaxWithValueCPUKernel, flo
} // namespace kernel
} // namespace mindspore

#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAXWITHVALUE_CPU_KERNEL_H_
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAX_WITH_VALUE_CPU_KERNEL_H_

+ 20
- 12
mindspore/ccsrc/backend/kernel_compiler/cpu/argmin_with_value_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,12 +13,20 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/argmin_with_value_cpu_kernel.h"

#include <string>

#include "runtime/device/cpu/cpu_device_address.h"

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kArgMinWithValueInputsNum = 1;
constexpr size_t kArgMinWithValueOutputsNum = 2;
constexpr char kKernelName[] = "ArgMaxWithValue";

size_t get_element_num(const std::vector<size_t> &shape) {
size_t size = 1;
for (size_t i = 0; i < shape.size(); i++) {
@@ -30,10 +38,8 @@ size_t get_element_num(const std::vector<size_t> &shape) {
template <typename T>
bool check_validation(const std::vector<size_t> &shape, const size_t num_before_axis, const size_t num_after_axis,
const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs) {
if (inputs.size() != 1 || outputs.size() != 2) {
MS_LOG(EXCEPTION) << "Wrong number of inputs or outputs!";
return false;
}
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kArgMinWithValueInputsNum, kKernelName);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kArgMinWithValueOutputsNum, kKernelName);
size_t data_size = sizeof(T);
size_t input_size = get_element_num(shape) * data_size;
size_t output_num = num_before_axis * num_after_axis;
@@ -41,7 +47,6 @@ bool check_validation(const std::vector<size_t> &shape, const size_t num_before_
size_t out1_size = output_num * data_size;
if (inputs[0]->size != input_size || outputs[0]->size != out0_size || outputs[1]->size != out1_size) {
MS_LOG(EXCEPTION) << "Invalid input or output data size!";
return false;
}
return true;
}
@@ -50,8 +55,12 @@ bool check_validation(const std::vector<size_t> &shape, const size_t num_before_
template <typename T>
void ArgMinWithValueCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
size_t shape_len = shape_.size();
if (shape_len == 0) {
MS_LOG(EXCEPTION) << "Shape size should be greater than 0";
}
int64_t axis = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS);
axis += static_cast<int64_t>(shape_len);
if (axis < 0) {
@@ -78,10 +87,9 @@ bool ArgMinWithValueCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &
return false;
}

auto input = reinterpret_cast<T *>(inputs[0]->addr);
auto output0 = reinterpret_cast<int32_t *>(outputs[0]->addr);
auto output1 = reinterpret_cast<T *>(outputs[1]->addr);

const auto *input = reinterpret_cast<T *>(inputs[0]->addr);
auto *output0 = reinterpret_cast<int32_t *>(outputs[0]->addr);
auto *output1 = reinterpret_cast<T *>(outputs[1]->addr);
std::vector<float> array_axis(dim_axis_);
for (size_t i = 0; i < num_before_axis_; i++) {
size_t src_index_i = i * dim_axis_ * num_after_axis_;
@@ -93,9 +101,9 @@ bool ArgMinWithValueCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &
}
auto min_ops = std::min_element(array_axis.begin(), array_axis.end());
auto min_index = static_cast<int32_t>(std::distance(array_axis.begin(), min_ops));
auto dst_index = i * num_after_axis_ + j;
size_t dst_index = i * num_after_axis_ + j;
output0[dst_index] = min_index;
auto src_index = IntToSize(min_index) * num_after_axis_ + src_index_j;
size_t src_index = IntToSize(min_index) * num_after_axis_ + src_index_j;
output1[dst_index] = input[src_index];
}
}


+ 10
- 7
mindspore/ccsrc/backend/kernel_compiler/cpu/argmin_with_value_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,12 +13,15 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMINWITHVALUE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMINWITHVALUE_CPU_KERNEL_H_

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMIN_WITH_VALUE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMIN_WITH_VALUE_CPU_KERNEL_H_

#include <vector>
#include <map>
#include <memory>
#include <algorithm>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

@@ -37,9 +40,9 @@ class ArgMinWithValueCPUKernel : public CPUKernel {

private:
std::vector<size_t> shape_;
size_t num_before_axis_;
size_t num_after_axis_;
size_t dim_axis_;
size_t num_before_axis_{0};
size_t num_after_axis_{0};
size_t dim_axis_{0};
};

MS_REG_CPU_KERNEL_T(ArgMinWithValue, KernelAttr(), ArgMinWithValueCPUKernel, float);
@@ -47,4 +50,4 @@ MS_REG_CPU_KERNEL_T(ArgMinWithValue, KernelAttr(), ArgMinWithValueCPUKernel, flo
} // namespace kernel
} // namespace mindspore

#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMINWITHVALUE_CPU_KERNEL_H_
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMIN_WITH_VALUE_CPU_KERNEL_H_

+ 107
- 118
mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,18 +13,56 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <string>
#include <map>

#include "backend/kernel_compiler/cpu/arithmetic_cpu_kernel.h"

#include <cmath>
#include <string>
#include <unordered_map>
#include <limits>

#include "backend/kernel_compiler/cpu/nnacl/fp32/power_fp32.h"
#include "backend/kernel_compiler/cpu/nnacl/fp32/sub_fp32.h"
#include "backend/kernel_compiler/cpu/nnacl/fp32/mul_fp32.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "nnacl/fp32/power_fp32.h"
#include "nnacl/fp32/sub_fp32.h"
#include "nnacl/fp32/mul_fp32.h"

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kInputsNum = 2;
constexpr size_t kOutputsNum = 1;
constexpr float kMaxSubSerialSize = 10000.0;
constexpr float kMaxPowSerialSize = 700.0;

template <typename T>
void ArithmeticCPUKernel<T>::AssignAdd(T *input1, const T *input2, T *out) {
void ElementRealDiv(const T *input1, const T *input2, T *out, size_t size, size_t delta_1, size_t delta_2) {
size_t idx_1 = 0;
size_t idx_2 = 0;
auto zero = (T)0;
for (size_t i = 0; i < size; ++i) {
auto dividend = input1[idx_1];
auto divisor = input2[idx_2];
idx_1 += delta_1;
idx_2 += delta_2;
if (divisor == zero) {
if (dividend == zero) {
out[i] = std::numeric_limits<T>::quiet_NaN();
continue;
}
if (std::numeric_limits<T>::has_infinity) {
out[i] = dividend > zero ? std::numeric_limits<T>::infinity() : -std::numeric_limits<T>::infinity();
} else {
out[i] = dividend > zero ? std::numeric_limits<T>::max() : std::numeric_limits<T>::min();
}
continue;
}
out[i] = dividend / divisor;
}
}
} // namespace

template <typename T>
void ArithmeticCPUKernel<T>::AssignAdd(T *input1, const T *input2, T *out) const {
auto task = [&input1, &input2, &out](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
out[i] = input1[i] + input2[i];
@@ -35,7 +73,7 @@ void ArithmeticCPUKernel<T>::AssignAdd(T *input1, const T *input2, T *out) {
}

template <typename T>
void ArithmeticCPUKernel<T>::Add(const T *input1, const T *input2, T *out) {
void ArithmeticCPUKernel<T>::Add(const T *input1, const T *input2, T *out) const {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
@@ -58,12 +96,12 @@ void ArithmeticCPUKernel<T>::Sub(const T *input1, const T *input2, T *out) {
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
return;
}
if (op_para.in_elements_num0_ == 1 || op_para.in_elements_num1_ == 1) {
if (op_para_.in_elements_num0_ == 1 || op_para_.in_elements_num1_ == 1) {
auto task = [this, input1, input2, out](size_t start, size_t end) {
if (op_para.in_elements_num0_ == 1) {
(void)ElementOptSub(input1, input2 + start, out + start, end - start, &op_para);
if (op_para_.in_elements_num0_ == 1) {
(void)ElementOptSub(input1, input2 + start, out + start, end - start, &op_para_);
} else {
(void)ElementOptSub(input1 + start, input2, out + start, end - start, &op_para);
(void)ElementOptSub(input1 + start, input2, out + start, end - start, &op_para_);
}
};
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
@@ -80,7 +118,7 @@ void ArithmeticCPUKernel<T>::Sub(const T *input1, const T *input2, T *out) {
iter.GenNextPos();
}
};
CPUKernelUtils::ParallelFor(task, output_size_, MAX_SUB_SERIAL_SIZE);
CPUKernelUtils::ParallelFor(task, output_size_, kMaxSubSerialSize);
}

template <typename T>
@@ -93,12 +131,12 @@ void ArithmeticCPUKernel<T>::Mul(const T *input1, const T *input2, T *out) {
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
return;
}
if (op_para.in_elements_num0_ == 1 || op_para.in_elements_num1_ == 1) {
if (op_para_.in_elements_num0_ == 1 || op_para_.in_elements_num1_ == 1) {
auto task = [this, input1, input2, out](size_t start, size_t end) {
if (op_para.in_elements_num0_ == 1) {
(void)ElementOptMul(input1, input2 + start, out + start, end - start, &op_para);
if (op_para_.in_elements_num0_ == 1) {
(void)ElementOptMul(input1, input2 + start, out + start, end - start, &op_para_);
} else {
(void)ElementOptMul(input1 + start, input2, out + start, end - start, &op_para);
(void)ElementOptMul(input1 + start, input2, out + start, end - start, &op_para_);
}
};
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
@@ -110,39 +148,13 @@ void ArithmeticCPUKernel<T>::Mul(const T *input1, const T *input2, T *out) {
auto iter = base_iter;
iter.SetPos(start);
for (size_t i = start; i < end; i++) {
out[i] = input1[iter.GetInputPosA()] * input2[iter.GetInputPosB()];
out[i] = static_cast<T>(input1[iter.GetInputPosA()] * input2[iter.GetInputPosB()]);
iter.GenNextPos();
}
};
CPUKernelUtils::ParallelFor(task, output_size_);
}

template <typename T>
void ElementRealDiv(const T *input1, const T *input2, T *out, size_t size, size_t delta_1, size_t delta_2) {
size_t idx_1 = 0;
size_t idx_2 = 0;
auto zero = (T)0;
for (size_t i = 0; i < size; ++i) {
auto dividend = input1[idx_1];
auto divisor = input2[idx_2];
idx_1 += delta_1;
idx_2 += delta_2;
if (divisor == zero) {
if (dividend == zero) {
out[i] = std::numeric_limits<T>::quiet_NaN();
continue;
}
if (std::numeric_limits<T>::has_infinity) {
out[i] = dividend > zero ? std::numeric_limits<T>::infinity() : -std::numeric_limits<T>::infinity();
} else {
out[i] = dividend > zero ? std::numeric_limits<T>::max() : std::numeric_limits<T>::min();
}
continue;
}
out[i] = dividend / divisor;
}
}

template <typename T>
void ArithmeticCPUKernel<T>::RealDiv(const T *input1, const T *input2, T *out) {
if (input_shape1_ == input_shape2_) {
@@ -152,14 +164,14 @@ void ArithmeticCPUKernel<T>::RealDiv(const T *input1, const T *input2, T *out) {
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
return;
}
if (op_para.in_elements_num0_ == 1) {
if (op_para_.in_elements_num0_ == 1) {
auto task = [&](size_t start, size_t end) {
ElementRealDiv<T>(input1, input2 + start, out + start, end - start, 0, 1);
};
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
return;
}
if (op_para.in_elements_num1_ == 1) {
if (op_para_.in_elements_num1_ == 1) {
auto task = [&](size_t start, size_t end) {
ElementRealDiv<T>(input1 + start, input2, out + start, end - start, 1, 0);
};
@@ -195,7 +207,7 @@ void ArithmeticCPUKernel<T>::RealDiv(const T *input1, const T *input2, T *out) {
}

template <typename T>
void ArithmeticCPUKernel<T>::Div(const T *input1, const T *input2, T *out) {
void ArithmeticCPUKernel<T>::Div(const T *input1, const T *input2, T *out) const {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
@@ -224,7 +236,7 @@ void ArithmeticCPUKernel<T>::Div(const T *input1, const T *input2, T *out) {
}

template <typename T>
void ArithmeticCPUKernel<T>::FloorDiv(const T *input1, const T *input2, T *out) {
void ArithmeticCPUKernel<T>::FloorDiv(const T *input1, const T *input2, T *out) const {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
@@ -233,7 +245,7 @@ void ArithmeticCPUKernel<T>::FloorDiv(const T *input1, const T *input2, T *out)
auto dividend = input1[iter.GetInputPosA()];
auto divisor = input2[iter.GetInputPosB()];
iter.GenNextPos();
auto zero = (T)0;
auto zero = static_cast<T>(0);
if (divisor == zero) {
if (dividend == zero) {
out[i] = std::numeric_limits<T>::quiet_NaN();
@@ -246,14 +258,14 @@ void ArithmeticCPUKernel<T>::FloorDiv(const T *input1, const T *input2, T *out)
}
continue;
}
out[i] = (T)floor(static_cast<double>(dividend) / static_cast<double>(divisor));
out[i] = static_cast<T>(floor(static_cast<double>(dividend) / static_cast<double>(divisor)));
}
};
CPUKernelUtils::ParallelFor(task, output_size_);
}

template <typename T>
void ArithmeticCPUKernel<T>::Mod(const T *input1, const T *input2, T *out) {
void ArithmeticCPUKernel<T>::Mod(const T *input1, const T *input2, T *out) const {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
@@ -275,7 +287,7 @@ void ArithmeticCPUKernel<T>::Mod(const T *input1, const T *input2, T *out) {
}

template <typename T>
void ArithmeticCPUKernel<T>::FloorMod(const T *input1, const T *input2, T *out) {
void ArithmeticCPUKernel<T>::FloorMod(const T *input1, const T *input2, T *out) const {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
@@ -292,7 +304,7 @@ void ArithmeticCPUKernel<T>::FloorMod(const T *input1, const T *input2, T *out)
}

template <typename T>
void ArithmeticCPUKernel<T>::Pow(const T *input1, const T *input2, T *out) {
void ArithmeticCPUKernel<T>::Pow(const T *input1, const T *input2, T *out) const {
if constexpr (std::is_same_v<T, float>) {
auto is_power_single = [this]() {
bool is_power_single = false;
@@ -308,7 +320,7 @@ void ArithmeticCPUKernel<T>::Pow(const T *input1, const T *input2, T *out) {
return is_power_single;
};

if (op_para.in_elements_num1_ == 1) {
if (op_para_.in_elements_num1_ == 1) {
auto task = [&](size_t start, size_t end) {
(void)Power(input1 + start, input2, out + start, end - start, 1, 0, true);
};
@@ -325,7 +337,7 @@ void ArithmeticCPUKernel<T>::Pow(const T *input1, const T *input2, T *out) {
}

BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
if (output_size_ > MAX_POW_SERIAL_SIZE) {
if (output_size_ > kMaxPowSerialSize) {
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
@@ -356,7 +368,7 @@ void ArithmeticCPUKernel<T>::SquaredDifference(const T *input1, const T *input2,
iter.SetPos(start);
for (size_t i = start; i < end; i++) {
T diff = input1[iter.GetInputPosA()] - input2[iter.GetInputPosB()];
out[i] = diff * diff;
out[i] = static_cast<T>(diff * diff);
iter.GenNextPos();
}
};
@@ -364,44 +376,47 @@ void ArithmeticCPUKernel<T>::SquaredDifference(const T *input1, const T *input2,
}

template <typename T>
void ArithmeticCPUKernel<T>::Atan2(const T *input1, const T *input2, T *out) {
void ArithmeticCPUKernel<T>::Atan2(const T *input1, const T *input2, T *out) const {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
for (size_t i = start; i < end; i++) {
out[i] =
(T)atan2(static_cast<double>(input1[iter.GetInputPosA()]), static_cast<double>(input2[iter.GetInputPosB()]));
out[i] = static_cast<T>(
atan2(static_cast<double>(input1[iter.GetInputPosA()]), static_cast<double>(input2[iter.GetInputPosB()])));
iter.GenNextPos();
}
};
CPUKernelUtils::ParallelFor(task, output_size_);
}

static const std::map<std::string, OperateType> kArithmeticBinOpTypeMap = {
{prim::kPrimAdd->name(), ADD},
{prim::kPrimSub->name(), SUB},
{prim::kPrimMul->name(), MUL},
{prim::kPrimDiv->name(), DIV},
{prim::kPrimMod->name(), MOD},
{prim::kPrimAssignAdd->name(), ASSIGNADD},
{prim::kPrimPow->name(), POW},
{prim::kPrimFloorDiv->name(), FLOORDIV},
{prim::kPrimAtan2->name(), ATAN2},
{prim::kPrimRealDiv->name(), REALDIV},
{prim::kPrimSquaredDifference->name(), SQUAREDDIFFERENCE},
{prim::kPrimFloorMod->name(), FLOORMOD}};
template <typename T>
void ArithmeticCPUKernel<T>::InitComputeFunc() {
if (kernel_name_ == prim::kPrimAssignAdd->name()) {
return;
}
static const std::unordered_map<std::string, TypeComputeFunc> arithmeticMathFuncMap{
{prim::kPrimAdd->name(), &ArithmeticCPUKernel<T>::Add},
{prim::kPrimSub->name(), &ArithmeticCPUKernel<T>::Sub},
{prim::kPrimMul->name(), &ArithmeticCPUKernel<T>::Mul},
{prim::kPrimDiv->name(), &ArithmeticCPUKernel<T>::Div},
{prim::kPrimMod->name(), &ArithmeticCPUKernel<T>::Mod},
{prim::kPrimFloorMod->name(), &ArithmeticCPUKernel<T>::FloorMod},
{prim::kPrimPow->name(), &ArithmeticCPUKernel<T>::Pow},
{prim::kPrimFloorDiv->name(), &ArithmeticCPUKernel<T>::FloorDiv},
{prim::kPrimAtan2->name(), &ArithmeticCPUKernel<T>::Atan2},
{prim::kPrimRealDiv->name(), &ArithmeticCPUKernel<T>::RealDiv},
{prim::kPrimSquaredDifference->name(), &ArithmeticCPUKernel<T>::SquaredDifference}};
if (arithmeticMathFuncMap.find(kernel_name_) == arithmeticMathFuncMap.end()) {
MS_LOG(EXCEPTION) << "ArithmeticCPUKernel does not support " << kernel_name_;
}
compute_func_ = arithmeticMathFuncMap.at(kernel_name_);
}

template <typename T>
void ArithmeticCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
if (kArithmeticBinOpTypeMap.find(kernel_name) != kArithmeticBinOpTypeMap.end()) {
operate_type_ = kArithmeticBinOpTypeMap.at(kernel_name);
} else {
MS_LOG(EXCEPTION) << "Not support " << kernel_name;
}

kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
input_shape1_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
input_shape2_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0);
@@ -414,14 +429,14 @@ void ArithmeticCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
output_size_ *= output_shape_[i];
}

op_para.in_elements_num0_ = 1;
op_para_.in_elements_num0_ = 1;
for (size_t i = 0; i < input_shape1_.size(); ++i) {
op_para.in_elements_num0_ *= input_shape1_[i];
op_para_.in_elements_num0_ *= input_shape1_[i];
}

op_para.in_elements_num1_ = 1;
op_para_.in_elements_num1_ = 1;
for (size_t i = 0; i < input_shape2_.size(); ++i) {
op_para.in_elements_num1_ *= input_shape2_[i];
op_para_.in_elements_num1_ *= input_shape2_[i];
}

size_t l = input_shape1_.size();
@@ -435,47 +450,21 @@ void ArithmeticCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
CPUKernelUtils::GetElementNumEveryDim(input_shape1_, &input_element_num1_);
CPUKernelUtils::GetElementNumEveryDim(input_shape2_, &input_element_num2_);
CPUKernelUtils::GetElementNumEveryDim(output_shape_, &output_element_num_);
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
if (dtype_ != AnfAlgo::GetInputDeviceDataType(kernel_node, 1)) {
MS_LOG(EXCEPTION) << "Input0 and input1 must has the same data type";
}
target_dtype_ = AnfAlgo::GetOutputDeviceDataType(kernel_node, 0);
InitComputeFunc();
}

template <typename T>
bool ArithmeticCPUKernel<T>::Launch(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> & /* workspace */,
bool ArithmeticCPUKernel<T>::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs) {
T *input1 = reinterpret_cast<T *>(inputs[0]->addr);
T *input2 = reinterpret_cast<T *>(inputs[1]->addr);
T *output = reinterpret_cast<T *>(outputs[0]->addr);

if (operate_type_ == ADD) {
Add(input1, input2, output);
} else if (operate_type_ == SUB) {
Sub(input1, input2, output);
} else if (operate_type_ == MUL) {
Mul(input1, input2, output);
} else if (operate_type_ == REALDIV) {
RealDiv(input1, input2, output);
} else if (operate_type_ == DIV) {
Div(input1, input2, output);
} else if (operate_type_ == FLOORDIV) {
FloorDiv(input1, input2, output);
} else if (operate_type_ == MOD) {
Mod(input1, input2, output);
} else if (operate_type_ == FLOORMOD) {
FloorMod(input1, input2, output);
} else if (operate_type_ == POW) {
Pow(input1, input2, output);
} else if (operate_type_ == ASSIGNADD) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kOutputsNum, kernel_name_);
auto *input1 = reinterpret_cast<T *>(inputs[0]->addr);
const auto *input2 = reinterpret_cast<T *>(inputs[1]->addr);
auto *output = reinterpret_cast<T *>(outputs[0]->addr);
if (kernel_name_ == prim::kPrimAssignAdd->name()) {
AssignAdd(input1, input2, output);
} else if (operate_type_ == ATAN2) {
Atan2(input1, input2, output);
} else if (operate_type_ == SQUAREDDIFFERENCE) {
SquaredDifference(input1, input2, output);
} else {
MS_LOG(EXCEPTION) << "Not support " << operate_type_;
compute_func_(this, input1, input2, output);
}
return true;
}


+ 19
- 20
mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_cpu_kernel.h View File

@@ -13,18 +13,15 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_CPU_KERNEL_H_
#include <memory>
#include <vector>
#include <limits>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
#include "nnacl/arithmetic.h"

const float MAX_SUB_SERIAL_SIZE = 10000;
const float MAX_DIV_SERIAL_SIZE = 10000;
const float MAX_POW_SERIAL_SIZE = 700;
#include "backend/kernel_compiler/cpu/nnacl/arithmetic.h"

namespace mindspore {
namespace kernel {
@@ -40,29 +37,31 @@ class ArithmeticCPUKernel : public CPUKernel {
const std::vector<AddressPtr> &outputs) override;

private:
void InitComputeFunc();
void Sub(const T *input1, const T *input2, T *out);
void Add(const T *input1, const T *input2, T *out);
void Add(const T *input1, const T *input2, T *out) const;
void Mul(const T *input1, const T *input2, T *out);
void RealDiv(const T *input1, const T *input2, T *out);
void Div(const T *input1, const T *input2, T *out);
void FloorDiv(const T *input1, const T *input2, T *out);
void Mod(const T *input1, const T *input2, T *out);
void FloorMod(const T *input1, const T *input2, T *out);
void Pow(const T *input1, const T *input2, T *out);
void AssignAdd(T *input1, const T *input2, T *out);
void Atan2(const T *input1, const T *input2, T *out);
void Div(const T *input1, const T *input2, T *out) const;
void FloorDiv(const T *input1, const T *input2, T *out) const;
void Mod(const T *input1, const T *input2, T *out) const;
void FloorMod(const T *input1, const T *input2, T *out) const;
void Pow(const T *input1, const T *input2, T *out) const;
void AssignAdd(T *input1, const T *input2, T *out) const;
void Atan2(const T *input1, const T *input2, T *out) const;
void SquaredDifference(const T *input1, const T *input2, T *out);

using TypeComputeFunc = std::function<void(ArithmeticCPUKernel *, const T *in_x, const T *in_y, T *out)>;
TypeComputeFunc compute_func_{nullptr};
size_t output_size_{1};
ArithmeticParameter op_para_{};

std::vector<size_t> input_shape1_;
std::vector<size_t> input_shape2_;
std::vector<size_t> input_element_num1_;
std::vector<size_t> input_element_num2_;
std::vector<size_t> output_shape_;
std::vector<size_t> output_element_num_;
size_t output_size_{1};
ArithmeticParameter op_para;
OperateType operate_type_{ADD};
TypeId dtype_{kTypeUnknown};
TypeId target_dtype_{kTypeUnknown};
};

MS_REG_CPU_KERNEL_T(Sub, KernelAttr(), ArithmeticCPUKernel, int32_t);


+ 44
- 48
mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_logic_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -15,18 +15,26 @@
*/

#include "backend/kernel_compiler/cpu/arithmetic_logic_cpu_kernel.h"
#include <cmath>
#include <string>
#include <map>
#include <cmath>
#include <unordered_map>
#include <functional>

#include "runtime/device/cpu/cpu_device_address.h"

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kMaxLessSerialSize = 15000;
constexpr size_t kInputsNum = 2;
constexpr size_t kOutputsNum = 1;
} // namespace

template <typename T>
void ArithmeticLogicCPUKernel<T>::Less(const T *input1, const T *input2, bool *out) {
void ArithmeticLogicCPUKernel<T>::Less(const T *input1, const T *input2, bool *out) const {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
if (output_size_ > MAX_LESS_SERIAL_SIZE) {
if (output_size_ > kMaxLessSerialSize) {
auto task = [&](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
@@ -50,7 +58,7 @@ void ArithmeticLogicCPUKernel<T>::Less(const T *input1, const T *input2, bool *o
}

template <typename T>
void ArithmeticLogicCPUKernel<T>::Equal(const T *input1, const T *input2, bool *out) {
void ArithmeticLogicCPUKernel<T>::Equal(const T *input1, const T *input2, bool *out) const {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&](size_t start, size_t end) {
auto iter = base_iter;
@@ -66,7 +74,7 @@ void ArithmeticLogicCPUKernel<T>::Equal(const T *input1, const T *input2, bool *
}

template <typename T>
void ArithmeticLogicCPUKernel<T>::NotEqual(const T *input1, const T *input2, bool *out) {
void ArithmeticLogicCPUKernel<T>::NotEqual(const T *input1, const T *input2, bool *out) const {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&](size_t start, size_t end) {
auto iter = base_iter;
@@ -82,7 +90,7 @@ void ArithmeticLogicCPUKernel<T>::NotEqual(const T *input1, const T *input2, boo
}

template <typename T>
void ArithmeticLogicCPUKernel<T>::LogicalAnd(const T *input1, const T *input2, bool *out) {
void ArithmeticLogicCPUKernel<T>::LogicalAnd(const T *input1, const T *input2, bool *out) const {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&](size_t start, size_t end) {
auto iter = base_iter;
@@ -96,7 +104,7 @@ void ArithmeticLogicCPUKernel<T>::LogicalAnd(const T *input1, const T *input2, b
}

template <typename T>
void ArithmeticLogicCPUKernel<T>::LogicalOr(const T *input1, const T *input2, bool *out) {
void ArithmeticLogicCPUKernel<T>::LogicalOr(const T *input1, const T *input2, bool *out) const {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&](size_t start, size_t end) {
auto iter = base_iter;
@@ -110,7 +118,7 @@ void ArithmeticLogicCPUKernel<T>::LogicalOr(const T *input1, const T *input2, bo
}

template <typename T>
void ArithmeticLogicCPUKernel<T>::Greater(const T *input1, const T *input2, bool *out) {
void ArithmeticLogicCPUKernel<T>::Greater(const T *input1, const T *input2, bool *out) const {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&](size_t start, size_t end) {
auto iter = base_iter;
@@ -126,7 +134,7 @@ void ArithmeticLogicCPUKernel<T>::Greater(const T *input1, const T *input2, bool
}

template <typename T>
void ArithmeticLogicCPUKernel<T>::GreaterEqual(const T *input1, const T *input2, bool *out) {
void ArithmeticLogicCPUKernel<T>::GreaterEqual(const T *input1, const T *input2, bool *out) const {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&](size_t start, size_t end) {
auto iter = base_iter;
@@ -142,7 +150,7 @@ void ArithmeticLogicCPUKernel<T>::GreaterEqual(const T *input1, const T *input2,
}

template <typename T>
void ArithmeticLogicCPUKernel<T>::LessEqual(const T *input1, const T *input2, bool *out) {
void ArithmeticLogicCPUKernel<T>::LessEqual(const T *input1, const T *input2, bool *out) const {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&](size_t start, size_t end) {
auto iter = base_iter;
@@ -157,26 +165,31 @@ void ArithmeticLogicCPUKernel<T>::LessEqual(const T *input1, const T *input2, bo
CPUKernelUtils::ParallelFor(task, output_size_);
}

static const std::map<std::string, OperateType> kArithmeticBinOpTypeMap = {
{prim::kPrimGreater->name(), GREATER}, {prim::kPrimGreaterEqual->name(), GREATEREQUAL},
{prim::kPrimLogicalAnd->name(), LOGICALAND}, {prim::kPrimLessEqual->name(), LESSEQUAL},
{prim::kPrimLogicalOr->name(), LOGICALOR}, {prim::kPrimLess->name(), LESS},
{prim::kPrimNotEqual->name(), NOTEQUAL}, {prim::kPrimEqual->name(), EQUAL}};
template <typename T>
void ArithmeticLogicCPUKernel<T>::InitComputeFunc() {
static const std::unordered_map<std::string, TypeComputeFunc> arithmeticLogicFuncMap{
{prim::kPrimGreater->name(), &ArithmeticLogicCPUKernel<T>::Greater},
{prim::kPrimGreaterEqual->name(), &ArithmeticLogicCPUKernel<T>::GreaterEqual},
{prim::kPrimLogicalAnd->name(), &ArithmeticLogicCPUKernel<T>::LogicalAnd},
{prim::kPrimLessEqual->name(), &ArithmeticLogicCPUKernel<T>::LessEqual},
{prim::kPrimLogicalOr->name(), &ArithmeticLogicCPUKernel<T>::LogicalOr},
{prim::kPrimLess->name(), &ArithmeticLogicCPUKernel<T>::Less},
{prim::kPrimNotEqual->name(), &ArithmeticLogicCPUKernel<T>::NotEqual},
{prim::kPrimEqual->name(), &ArithmeticLogicCPUKernel<T>::Equal}};
if (arithmeticLogicFuncMap.find(kernel_name_) == arithmeticLogicFuncMap.end()) {
MS_LOG(EXCEPTION) << "ArithmeticLogicCPUKernel does not support " << kernel_name_;
}
compute_func_ = arithmeticLogicFuncMap.at(kernel_name_);
}

template <typename T>
void ArithmeticLogicCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
if (kArithmeticBinOpTypeMap.find(kernel_name) != kArithmeticBinOpTypeMap.end()) {
operate_type_ = kArithmeticBinOpTypeMap.at(kernel_name);
} else {
MS_LOG(EXCEPTION) << "Not support " << kernel_name;
}

kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
input_shape1_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
input_shape2_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0);
if (output_shape_.size() == 0) {
if (output_shape_.empty()) {
(void)output_shape_.insert(output_shape_.begin(), 1);
}

@@ -200,36 +213,19 @@ void ArithmeticLogicCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
if (dtype_ != AnfAlgo::GetInputDeviceDataType(kernel_node, 1)) {
MS_LOG(EXCEPTION) << "Input0 and input1 must has the same data type";
}
target_dtype_ = AnfAlgo::GetOutputDeviceDataType(kernel_node, 0);
InitComputeFunc();
}

template <typename T>
bool ArithmeticLogicCPUKernel<T>::Launch(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> & /* workspace */,
const std::vector<AddressPtr> &outputs) {
T *input1 = reinterpret_cast<T *>(inputs[0]->addr);
T *input2 = reinterpret_cast<T *>(inputs[1]->addr);
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kOutputsNum, kernel_name_);
const auto *input1 = reinterpret_cast<T *>(inputs[0]->addr);
const auto *input2 = reinterpret_cast<T *>(inputs[1]->addr);
bool *output = reinterpret_cast<bool *>(outputs[0]->addr);

if (operate_type_ == LESS) {
Less(input1, input2, output);
} else if (operate_type_ == EQUAL) {
Equal(input1, input2, output);
} else if (operate_type_ == NOTEQUAL) {
NotEqual(input1, input2, output);
} else if (operate_type_ == GREATER) {
Greater(input1, input2, output);
} else if (operate_type_ == GREATEREQUAL) {
GreaterEqual(input1, input2, output);
} else if (operate_type_ == LESSEQUAL) {
LessEqual(input1, input2, output);
} else if (operate_type_ == LOGICALAND) {
LogicalAnd(input1, input2, output);
} else if (operate_type_ == LOGICALOR) {
LogicalOr(input1, input2, output);
} else {
MS_LOG(EXCEPTION) << "Not support " << operate_type_;
}
compute_func_(this, input1, input2, output);
return true;
}
} // namespace kernel


+ 18
- 15
mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_logic_cpu_kernel.h View File

@@ -13,16 +13,17 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_LOGIC_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_LOGIC_CPU_KERNEL_H_

#include <memory>
#include <vector>
#include <limits>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

#define MAX_LESS_SERIAL_SIZE 15000

namespace mindspore {
namespace kernel {
template <typename T>
@@ -37,25 +38,27 @@ class ArithmeticLogicCPUKernel : public CPUKernel {
const std::vector<AddressPtr> &outputs) override;

private:
void GenIndex(size_t num, std::vector<size_t> *idx);
void Less(const T *input1, const T *input2, bool *out);
void Equal(const T *input1, const T *input2, bool *out);
void NotEqual(const T *input1, const T *input2, bool *out);
void Greater(const T *input1, const T *input2, bool *out);
void GreaterEqual(const T *input1, const T *input2, bool *out);
void LessEqual(const T *input1, const T *input2, bool *out);
void LogicalAnd(const T *input1, const T *input2, bool *out);
void LogicalOr(const T *input1, const T *input2, bool *out);
void InitComputeFunc();
void Less(const T *input1, const T *input2, bool *out) const;
void Equal(const T *input1, const T *input2, bool *out) const;
void NotEqual(const T *input1, const T *input2, bool *out) const;
void Greater(const T *input1, const T *input2, bool *out) const;
void GreaterEqual(const T *input1, const T *input2, bool *out) const;
void LessEqual(const T *input1, const T *input2, bool *out) const;
void LogicalAnd(const T *input1, const T *input2, bool *out) const;
void LogicalOr(const T *input1, const T *input2, bool *out) const;

using TypeComputeFunc = std::function<void(ArithmeticLogicCPUKernel *, const T *, const T *, bool *)>;
TypeComputeFunc compute_func_{nullptr};
size_t output_size_{1};
TypeId dtype_{kTypeUnknown};

std::vector<size_t> input_shape1_;
std::vector<size_t> input_shape2_;
std::vector<size_t> input_element_num1_;
std::vector<size_t> input_element_num2_;
std::vector<size_t> output_shape_;
std::vector<size_t> output_element_num_;
size_t output_size_{1};
OperateType operate_type_{ADD};
TypeId dtype_{kTypeUnknown};
TypeId target_dtype_{kTypeUnknown};
};

MS_REG_CPU_KERNEL_T(


+ 61
- 64
mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,17 +13,25 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <algorithm>

#include "backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.h"

#include <cmath>
#include <string>
#include <thread>
#include <map>
#include "backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.h"
#include <algorithm>
#include <unordered_map>

#include "runtime/device/cpu/cpu_device_address.h"

namespace mindspore {
namespace kernel {
namespace {
constexpr float kMaxNegSerialSize = 5000.0f;
constexpr float kMaxSquareSerialSize = 5000.0f;
constexpr size_t kInputsNum = 1;
constexpr size_t kOutputsNum = 1;

template <typename T>
void Square(const T *in, T *out, size_t size) {
auto task = [&in, &out](size_t start, size_t end) {
@@ -31,7 +39,7 @@ void Square(const T *in, T *out, size_t size) {
out[i] = in[i] * in[i];
}
};
ParallelLaunch(task, size, MAX_SQUARE_SERIAL_SIZE);
ParallelLaunch(task, size, kMaxSquareSerialSize);
}

template <typename T>
@@ -57,11 +65,10 @@ void Neg(const T *in, T *out, size_t size) {
out[i] = -in[i];
}
};
ParallelLaunch(task, size, MAX_NEG_SERIAL_SIZE);
ParallelLaunch(task, size, kMaxNegSerialSize);
}

template <typename T>
void LogicalNot(const T *in, T *out, size_t size) {
void LogicalNot(const bool *in, bool *out, size_t size) {
auto task = [&in, &out](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
out[i] = !in[i];
@@ -133,10 +140,12 @@ void Reciprocal(const T *in, T *out, size_t size) {
template <typename T>
void Gelu(const T *in, T *out, size_t size) {
auto task = [&in, &out](size_t start, size_t end) {
auto factor_a = static_cast<T>(0.7978845608);
auto factor_b = static_cast<T>(0.044715);
for (size_t i = start; i < end; i++) {
T x = in[i];
auto double_x = static_cast<T>(x);
T tanh_res = static_cast<T>(std::tanh(0.7978845608 * (double_x + 0.044715 * double_x * double_x * double_x)));
T tanh_res = static_cast<T>(std::tanh(factor_a * (double_x + factor_b * double_x * double_x * double_x)));
out[i] = x * (static_cast<T>(1.0) + tanh_res) / static_cast<T>(2.0);
}
};
@@ -259,40 +268,17 @@ void Identity(const T *in, T *out, size_t size) {
}
} // namespace

static const std::map<std::string, OperateType> kArithmeticOpTypeMap = {{prim::kPrimNeg->name(), NEG},
{prim::kPrimSquare->name(), SQUARE},
{prim::kPrimOnesLike->name(), ONESLIKE},
{prim::kPrimZerosLike->name(), ZEROSLIKE},
{prim::kPrimLogicalNot->name(), LOGICALNOT},
{prim::kPrimSign->name(), SIGN},
{prim::kPrimFloor->name(), FLOOR},
{prim::kPrimRint->name(), RINT},
{prim::kPrimRound->name(), ROUND},
{prim::kPrimReciprocal->name(), RECIPROCAL},
{prim::kPrimGeLU->name(), GELU},
{prim::kPrimAsin->name(), ASIN},
{prim::kPrimACos->name(), ACOS},
{prim::kPrimAtan->name(), ATAN},
{prim::kPrimSin->name(), SIN},
{prim::kPrimCos->name(), COS},
{prim::kPrimTan->name(), TAN},
{prim::kPrimSinh->name(), SINH},
{prim::kPrimCosh->name(), COSH},
{prim::kPrimAsinh->name(), ASINH},
{prim::kPrimAcosh->name(), ACOSH},
{prim::kPrimAtanh->name(), ATANH},
{prim::kPrimIdentityMath->name(), IDENTITY}};

void ArithmeticSelfCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
operate_type_ = kArithmeticOpTypeMap.at(kernel_name);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
}

bool ArithmeticSelfCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kOutputsNum, kernel_name_);
if (dtype_ == kNumberTypeFloat32 || dtype_ == kNumberTypeFloat16 || dtype_ == kNumberTypeFloat64) {
LaunchKernel<float>(inputs, outputs);
} else if (dtype_ == kNumberTypeInt32 || dtype_ == kNumberTypeInt16) {
@@ -300,52 +286,63 @@ bool ArithmeticSelfCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inpu
} else if (dtype_ == kNumberTypeInt64) {
LaunchKernel<int64_t>(inputs, outputs);
} else if (dtype_ == kNumberTypeBool) {
LaunchKernelLogic<bool>(inputs, outputs);
LaunchLogicalNot(inputs, outputs);
} else {
MS_LOG(EXCEPTION) << "Data type is " << TypeIdLabel(dtype_) << "is not support.";
}
return true;
}

template <typename T>
void ArithmeticSelfCPUKernel::LaunchKernelLogic(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &outputs) {
T *input = reinterpret_cast<T *>(inputs[0]->addr);
T *output = reinterpret_cast<T *>(outputs[0]->addr);
size_t lens = outputs[0]->size > 0 ? static_cast<size_t>(outputs[0]->size / sizeof(T)) : 1;
LogicalNot<T>(input, output, lens);
return;
void ArithmeticSelfCPUKernel::LaunchLogicalNot(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &outputs) const {
auto *input = reinterpret_cast<bool *>(inputs[0]->addr);
auto *output = reinterpret_cast<bool *>(outputs[0]->addr);
size_t lens = outputs[0]->size / sizeof(bool);
LogicalNot(input, output, lens);
}

template <typename T>
void ArithmeticSelfCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &outputs) {
T *input = reinterpret_cast<T *>(inputs[0]->addr);
T *output = reinterpret_cast<T *>(outputs[0]->addr);
size_t lens = outputs[0]->size > 0 ? static_cast<size_t>(outputs[0]->size / sizeof(T)) : 1;
static const std::map<OperateType, std::function<void(const T *in, T *out, size_t size)>> kArithmeticOpFuncMap = {
{SQUARE, Square<T>}, {SIGN, Sign<T>},
{NEG, Neg<T>}, {LOGICALNOT, LogicalNot<T>},
{ONESLIKE, OnesLike<T>}, {ZEROSLIKE, ZerosLike<T>},
{FLOOR, Floor<T>}, {RECIPROCAL, Reciprocal<T>},
{GELU, Gelu<T>}, {SIN, Sin<T>},
{COS, Cos<T>}, {TAN, Tan<T>},
{ASIN, Asin<T>}, {ACOS, ACos<T>},
{ATAN, Atan<T>}, {SINH, Sinh<T>},
{COSH, Cosh<T>}, {ASINH, Asinh<T>},
{ACOSH, Acosh<T>}, {ATANH, Atanh<T>},
{RINT, Rint<T>}, {ROUND, Round<T>}};
if (kArithmeticOpFuncMap.find(operate_type_) != kArithmeticOpFuncMap.end()) {
kArithmeticOpFuncMap.at(operate_type_)(input, output, lens);
} else {
MS_LOG(EXCEPTION) << "Not support " << operate_type_;
const std::vector<AddressPtr> &outputs) const {
const auto *input = reinterpret_cast<T *>(inputs[0]->addr);
auto *output = reinterpret_cast<T *>(outputs[0]->addr);
const size_t lens = outputs[0]->size / sizeof(T);
static const std::unordered_map<std::string, std::function<void(const T *, T *, size_t)>> arithmeticSelfFuncMap{
{prim::kPrimSquare->name(), Square<T>},
{prim::kPrimSign->name(), Sign<T>},
{prim::kPrimNeg->name(), Neg<T>},
{prim::kPrimAtanh->name(), Atanh<T>},
{prim::kPrimAcosh->name(), Acosh<T>},
{prim::kPrimFloor->name(), Floor<T>},
{prim::kPrimSin->name(), Sin<T>},
{prim::kPrimGeLU->name(), Gelu<T>},
{prim::kPrimCos->name(), Cos<T>},
{prim::kPrimTan->name(), Tan<T>},
{prim::kPrimAsin->name(), Asin<T>},
{prim::kPrimACos->name(), ACos<T>},
{prim::kPrimAtan->name(), Atan<T>},
{prim::kPrimSinh->name(), Sinh<T>},
{prim::kPrimCosh->name(), Cosh<T>},
{prim::kPrimAsinh->name(), Asinh<T>},
{prim::kPrimZerosLike->name(), ZerosLike<T>},
{prim::kPrimOnesLike->name(), OnesLike<T>},
{prim::kPrimReciprocal->name(), Reciprocal<T>},
{prim::kPrimRint->name(), Rint<T>},
{prim::kPrimRound->name(), Round<T>}};

const auto func_pair = arithmeticSelfFuncMap.find(kernel_name_);
if (arithmeticSelfFuncMap.find(kernel_name_) == arithmeticSelfFuncMap.end()) {
MS_LOG(EXCEPTION) << "ArithmeticSelfCPUKernel does not support " << kernel_name_;
}
func_pair->second(input, output, lens);
}

template <typename T>
bool IdentityCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kOutputsNum, kernel_name_);
T *input = reinterpret_cast<T *>(inputs[0]->addr);
T *output = reinterpret_cast<T *>(outputs[0]->addr);
size_t lens = outputs[0]->size > 0 ? static_cast<size_t>(outputs[0]->size / sizeof(T)) : 1;


+ 7
- 8
mindspore/ccsrc/backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.h View File

@@ -13,16 +13,16 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_SELF_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_SELF_CPU_KERNEL_H_

#include <memory>
#include <vector>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

const float MAX_NEG_SERIAL_SIZE = 5000;
const float MAX_SQUARE_SERIAL_SIZE = 5000;

namespace mindspore {
namespace kernel {
class ArithmeticSelfCPUKernel : public CPUKernel {
@@ -35,13 +35,12 @@ class ArithmeticSelfCPUKernel : public CPUKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;

private:
template <typename T>
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
template <typename T>
void LaunchKernelLogic(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const;
void LaunchLogicalNot(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const;

private:
OperateType operate_type_{SQUARE};
TypeId dtype_{kTypeUnknown};
TypeId target_dtype_{kTypeUnknown};
};


+ 22
- 8
mindspore/ccsrc/backend/kernel_compiler/cpu/assign_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -15,23 +15,34 @@
*/

#include "backend/kernel_compiler/cpu/assign_cpu_kernel.h"

#include <string>
#include <map>

#include "runtime/device/cpu/cpu_device_address.h"
#include "common/thread_pool.h"

namespace mindspore {
namespace kernel {
static std::map<TypeId, size_t> input_x_dtype_size_map = {
{kNumberTypeBool, sizeof(bool)}, {kNumberTypeInt8, 1}, {kNumberTypeInt16, 2}, {kNumberTypeInt32, 4},
{kNumberTypeInt64, 8}, {kNumberTypeUInt8, 1}, {kNumberTypeUInt16, 2}, {kNumberTypeUInt32, 4},
{kNumberTypeUInt64, 8}, {kNumberTypeFloat16, 2}, {kNumberTypeFloat32, 4}, {kNumberTypeFloat64, 8}};
namespace {
constexpr size_t kAssignInputsNum = 2;
constexpr size_t kAssignOutputsNum = 1;

const std::map<TypeId, size_t> input_x_dtype_size_map = {
{kNumberTypeBool, sizeof(bool)}, {kNumberTypeInt8, sizeof(int8_t)}, {kNumberTypeInt16, sizeof(int16_t)},
{kNumberTypeInt32, sizeof(int32_t)}, {kNumberTypeInt64, sizeof(int64_t)}, {kNumberTypeUInt8, sizeof(uint8_t)},
{kNumberTypeUInt16, sizeof(uint16_t)}, {kNumberTypeUInt32, sizeof(uint32_t)}, {kNumberTypeUInt64, sizeof(uint64_t)},
{kNumberTypeFloat16, sizeof(float16)}, {kNumberTypeFloat32, sizeof(float)}, {kNumberTypeFloat64, sizeof(double)}};
} // namespace

void AssignCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
auto input_x_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
auto input_y_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
if (input_x_shape.size() != input_y_shape.size()) MS_LOG(EXCEPTION) << "X and y must be same shape!";
if (input_x_shape.size() != input_y_shape.size()) {
MS_LOG(EXCEPTION) << "X and y must be same shape!";
}
for (size_t i = 0; i < input_x_shape.size(); ++i) {
if (input_x_shape[i] != input_y_shape[i]) {
MS_LOG(EXCEPTION) << "X and y must be same shape!";
@@ -39,14 +50,17 @@ void AssignCPUKernel::InitKernel(const CNodePtr &kernel_node) {
batch_size_ *= input_x_shape[i];
}
input_x_dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
if (input_x_dtype_size_map.find(input_x_dtype_) == input_x_dtype_size_map.end()) {
auto type_len = input_x_dtype_size_map.find(input_x_dtype_);
if (type_len == input_x_dtype_size_map.end()) {
MS_LOG(EXCEPTION) << "Unsupported input_x dtype!";
}
input_x_dtype_size_ = input_x_dtype_size_map[input_x_dtype_];
input_x_dtype_size_ = type_len->second;
}

bool AssignCPUKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kAssignInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kAssignOutputsNum, kernel_name_);
auto max_size = inputs[0]->size;
size_t total_size = input_x_dtype_size_ * batch_size_;
if (total_size > max_size) {


+ 4
- 2
mindspore/ccsrc/backend/kernel_compiler/cpu/assign_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,12 +13,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ASSIGN_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ASSIGN_CPU_KERNEL_H_

#include <vector>
#include <memory>
#include <unordered_map>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

@@ -36,8 +38,8 @@ class AssignCPUKernel : public CPUKernel {

private:
size_t batch_size_{1};
size_t input_x_dtype_size_{4};
TypeId input_x_dtype_{kTypeUnknown};
size_t input_x_dtype_size_ = 4;
};

MS_REG_CPU_KERNEL(


+ 17
- 11
mindspore/ccsrc/backend/kernel_compiler/cpu/bias_add_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -15,16 +15,21 @@
*/

#include "backend/kernel_compiler/cpu/bias_add_cpu_kernel.h"
#include "nnacl/fp32/add_fp32.h"
#include "backend/kernel_compiler/cpu/nnacl/fp32/add_fp32.h"
#include "backend/kernel_compiler/cpu/nnacl/errorcode.h"

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kBiasAddMinDim = 2;
constexpr size_t kBiasAddMaxDim = 5;
constexpr size_t kBiasAddInputNum = 2;
constexpr size_t kBiasAddInputsNum = 2;
constexpr size_t kBiasAddOutputsNum = 1;
} // namespace

void BiasAddCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
bias_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
data_shape_ = input_shape_.size();
@@ -44,13 +49,11 @@ void BiasAddCPUKernel::InitKernel(const CNodePtr &kernel_node) {

bool BiasAddCPUKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs) {
if (inputs.size() != kBiasAddInputNum || outputs.size() != 1) {
MS_LOG(EXCEPTION) << "Inputs outputs size not supoort";
}

auto src_addr = reinterpret_cast<float *>(inputs[0]->addr);
auto bias_addr = reinterpret_cast<float *>(inputs[1]->addr);
auto output_addr = reinterpret_cast<float *>(outputs[0]->addr);
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kBiasAddInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kBiasAddOutputsNum, kernel_name_);
const auto *src_addr = reinterpret_cast<float *>(inputs[0]->addr);
const auto *bias_addr = reinterpret_cast<float *>(inputs[1]->addr);
auto *output_addr = reinterpret_cast<float *>(outputs[0]->addr);

if (input_shape_.size() > 2) {
size_t hw_size = 1;
@@ -87,11 +90,14 @@ bool BiasAddCPUKernel::Launch(const std::vector<AddressPtr> &inputs, const std::
auto task = [&](size_t start, size_t end) {
for (size_t n = start; n < end; ++n) {
size_t n_offset = input_shape_[1] * n;
ElementAdd(src_addr + n_offset, bias_addr, output_addr + n_offset, input_shape_[1]);
if (ElementAdd(src_addr + n_offset, bias_addr, output_addr + n_offset, input_shape_[1]) != NNACL_OK) {
MS_LOG(EXCEPTION) << "ElementAdd failed.";
}
}
};
ParallelLaunchAutoSearch(task, input_shape_[0], this, &parallel_search_info_);
}

return true;
}
} // namespace kernel


+ 3
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/bias_add_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,11 +13,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BIAS_ADD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BIAS_ADD_CPU_KERNEL_H_

#include <vector>
#include <memory>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"



+ 19
- 8
mindspore/ccsrc/backend/kernel_compiler/cpu/bias_add_grad_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -15,11 +15,19 @@
*/

#include "backend/kernel_compiler/cpu/bias_add_grad_cpu_kernel.h"
#include "nnacl/fp32/reduce_fp32.h"
#include "backend/kernel_compiler/cpu/nnacl/fp32/reduce_fp32.h"
#include "backend/kernel_compiler/cpu/nnacl/errorcode.h"

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kBiasAddGradInputsNum = 1;
constexpr size_t kBiasAddGradOutputsNum = 1;
} // namespace

void BiasAddGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
if (input_shape_.size() < 2) {
MS_LOG(EXCEPTION) << "Input tensor's rank must be at least 2 for 'BiasAddGrad' Op, but input tensor's rank is "
@@ -29,11 +37,10 @@ void BiasAddGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {

bool BiasAddGradCPUKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs) {
if (inputs.size() != 1 || outputs.size() != 1) {
MS_LOG(EXCEPTION) << "input output size not support";
}
auto output_addr = reinterpret_cast<float *>(outputs[0]->addr);
auto input_addr = reinterpret_cast<float *>(inputs[0]->addr);
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kBiasAddGradInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kBiasAddGradOutputsNum, kernel_name_);
const auto *input_addr = reinterpret_cast<float *>(inputs[0]->addr);
auto *output_addr = reinterpret_cast<float *>(outputs[0]->addr);

if (input_shape_.size() > 2) {
size_t hw_size = 1;
@@ -53,7 +60,11 @@ bool BiasAddGradCPUKernel::Launch(const std::vector<AddressPtr> &inputs, const s
}
} else if (input_shape_.size() == 2) {
auto task = [this, input_addr, output_addr](size_t start, size_t end) {
(void)ReduceSumDim2Axis0(end - start, input_shape_[1], input_shape_[0], input_addr + start, output_addr + start);
int ret =
ReduceSumDim2Axis0(end - start, input_shape_[1], input_shape_[0], input_addr + start, output_addr + start);
if (ret != NNACL_OK) {
MS_LOG(EXCEPTION) << "ReduceSumDim2Axis0 failed.";
}
};
ParallelLaunchAutoSearch(task, input_shape_[1], this, &parallel_search_info_);
}


+ 5
- 4
mindspore/ccsrc/backend/kernel_compiler/cpu/bias_add_grad_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,11 +14,12 @@
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BIASADDGRADCPUKERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BIASADDGRADCPUKERNEL_H_
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BIAS_ADD_GRAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BIAS_ADD_GRAD_CPU_KERNEL_H_

#include <vector>
#include <memory>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

@@ -39,4 +40,4 @@ class BiasAddGradCPUKernel : public CPUKernel {
MS_REG_CPU_KERNEL(BiasAddGrad, KernelAttr(), BiasAddGradCPUKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BIASADDGRADCPUKERNEL_H_
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BIAS_ADD_GRAD_CPU_KERNEL_H_

+ 69
- 53
mindspore/ccsrc/backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.cc View File

@@ -13,14 +13,19 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.h"

namespace mindspore {
namespace kernel {
constexpr size_t kBceInputNumWithWeight = 3;
namespace {
constexpr size_t kBceInputsNumWithWeight = 3;
constexpr size_t kBceOutputsNum = 1;
} // namespace

template <typename T>
void BinaryCrossEntropyCpuKernel::LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss) {
void BinaryCrossEntropyCpuKernel::LaunchToScalar(const int &input_size, const int &reduction, T *loss,
T *tmp_loss) const {
if (input_size % 2 == 1) {
tmp_loss[0] += tmp_loss[input_size - 1];
}
@@ -35,83 +40,94 @@ void BinaryCrossEntropyCpuKernel::LaunchToScalar(const int &input_size, const in
}

loss[0] = tmp_loss[0];
if (reduction == 1) {
if (reduction == kMean) {
loss[0] /= static_cast<T>(input_size);
}
}

template <typename T>
void BinaryCrossEntropyCpuKernel::Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs) {
T *input_x = reinterpret_cast<T *>(inputs[0]->addr);
T *input_y = reinterpret_cast<T *>(inputs[1]->addr);
T *weight = nullptr;
if (weight_defined_) {
weight = reinterpret_cast<T *>(inputs[2]->addr);
}
T *loss = reinterpret_cast<T *>(outputs[0]->addr);
void BinaryCrossEntropyCpuKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &outputs) const {
const auto *input_x = reinterpret_cast<T *>(inputs[0]->addr);
const auto *input_y = reinterpret_cast<T *>(inputs[1]->addr);
const T *weight = weight_defined_ ? reinterpret_cast<T *>(inputs[2]->addr) : nullptr;
auto *loss = reinterpret_cast<T *>(outputs[0]->addr);
std::vector<T> tmp_loss(input_size_);
auto epsilon = static_cast<T>(1e-12);
auto one = static_cast<T>(1);

T epsilon = static_cast<T>(1e-12);
T one = static_cast<T>(1);
if (reduction_ == 0 && weight_defined_) {
for (size_t i = 0; i < input_size_; i++) {
T value =
-weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
loss[i] = value;
}
} else if (reduction_ == 0 && (!weight_defined_)) {
for (size_t i = 0; i < input_size_; i++) {
T value = static_cast<T>(
-(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon)));
loss[i] = value;
}
} else if ((reduction_ != 0) && weight_defined_) {
for (size_t i = 0; i < input_size_; i++) {
T value =
-weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
tmp_loss[i] = value;
if (reduction_ == kNone) {
if (weight_defined_) {
for (size_t i = 0; i < input_size_; i++) {
auto value = static_cast<T>(
-weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon)));
loss[i] = value;
}
} else {
for (size_t i = 0; i < input_size_; i++) {
auto value = static_cast<T>(
-(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon)));
loss[i] = value;
}
}
} else {
for (size_t i = 0; i < input_size_; i++) {
T value = static_cast<T>(
-(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon)));
tmp_loss[i] = value;
if (weight_defined_) {
for (size_t i = 0; i < input_size_; i++) {
auto value = static_cast<T>(
-weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon)));
tmp_loss[i] = value;
}
} else {
for (size_t i = 0; i < input_size_; i++) {
auto value = static_cast<T>(
-(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon)));
tmp_loss[i] = value;
}
}
}

if (reduction_ != 0) {
if (reduction_ != kNone) {
LaunchToScalar<T>(input_size_, reduction_, loss, tmp_loss.data());
}
}

bool BinaryCrossEntropyCpuKernel::Launch(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace,
bool BinaryCrossEntropyCpuKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs) {
if (input_size_ > 0) {
if (dtype_ == kNumberTypeFloat32) {
Launchkernel<float>(inputs, workspace, outputs);
} else if (dtype_ == kNumberTypeFloat16) {
Launchkernel<float16>(inputs, workspace, outputs);
}
const size_t expect_inputs_num = weight_defined_ ? kBceInputsNumWithWeight : kBceInputsNumWithWeight - 1;
CHECK_KERNEL_INPUTS_NUM(inputs.size(), expect_inputs_num, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kBceOutputsNum, kernel_name_);
if (dtype_ == kNumberTypeFloat32) {
LaunchKernel<float>(inputs, outputs);
} else if (dtype_ == kNumberTypeFloat16) {
LaunchKernel<float16>(inputs, outputs);
} else {
MS_LOG(EXCEPTION) << kernel_name_ << " only support float16 and float32 on CPU, but got "
<< TypeIdToType(dtype_)->ToString();
}
return true;
}

void BinaryCrossEntropyCpuKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
weight_defined_ = (input_num == kBceInputsNumWithWeight);
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
for (size_t i = 0; i < input_shape.size(); i++) {
input_size_ *= input_shape[i];
}
string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, "reduction");
if (reduction == "none") {
reduction_ = 0;
} else if (reduction == "sum") {
reduction_ = 2;

const std::string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, REDUCTION);
if (reduction == NONE) {
reduction_ = kNone;
} else if (reduction == MEAN) {
reduction_ = kMean;
} else if (reduction == SUM) {
reduction_ = kSum;
} else {
MS_LOG(EXCEPTION) << kernel_name_ << "only support the reduction is 'none', 'mean', or 'sum', but got "
<< reduction;
}
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
weight_defined_ = (input_num == kBceInputNumWithWeight);
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
}
} // namespace kernel
} // namespace mindspore

+ 10
- 7
mindspore/ccsrc/backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.h View File

@@ -13,19 +13,23 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H

#include <vector>
#include <string>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

namespace mindspore {
namespace kernel {
enum ReductionType { kNone, kMean, kSum };

class BinaryCrossEntropyCpuKernel : public CPUKernel {
public:
BinaryCrossEntropyCpuKernel() : input_size_(1), reduction_(1), weight_defined_(false) {}
BinaryCrossEntropyCpuKernel() = default;
~BinaryCrossEntropyCpuKernel() override = default;

void InitKernel(const CNodePtr &kernel_node) override;
@@ -34,15 +38,14 @@ class BinaryCrossEntropyCpuKernel : public CPUKernel {

private:
template <typename T>
void LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss);
void LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss) const;
template <typename T>
void Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs);
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const;

TypeId dtype_{kTypeUnknown};
size_t input_size_;
int reduction_;
bool weight_defined_; // true: there are 3 inputs, false: there are 2 inputs(no [weight])
size_t input_size_{1};
ReductionType reduction_{kNone};
bool weight_defined_{false}; // true: there are 3 inputs, false: there are 2 inputs(no [weight])
};
MS_REG_CPU_KERNEL(BinaryCrossEntropy,
KernelAttr()


+ 43
- 33
mindspore/ccsrc/backend/kernel_compiler/cpu/binary_cross_entropy_grad_kernel.cc View File

@@ -13,28 +13,28 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/binary_cross_entropy_grad_kernel.h"

namespace mindspore {
namespace kernel {
constexpr size_t kBceGradInputNumWithWeight = 4;
namespace {
constexpr size_t kBceGradInputsNumWithWeight = 4;
constexpr size_t kBceGradOutputsNum = 1;
} // namespace

template <typename T>
void BinaryCrossEntropyGradCpuKernel::Launchkernel(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &outputs) {
T *input_x = reinterpret_cast<T *>(inputs[0]->addr);
T *input_y = reinterpret_cast<T *>(inputs[1]->addr);
T *dloss = reinterpret_cast<T *>(inputs[2]->addr);
T *weight = nullptr;
if (weight_defined_) {
weight = reinterpret_cast<T *>(inputs[3]->addr);
}

T *dx = reinterpret_cast<T *>(outputs[0]->addr);
void BinaryCrossEntropyGradCpuKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &outputs) const {
const auto *input_x = reinterpret_cast<T *>(inputs[0]->addr);
const auto *input_y = reinterpret_cast<T *>(inputs[1]->addr);
const auto *dloss = reinterpret_cast<T *>(inputs[2]->addr);
const T *weight = weight_defined_ ? reinterpret_cast<T *>(inputs[3]->addr) : nullptr;
auto *dx = reinterpret_cast<T *>(outputs[0]->addr);
auto epsilon = static_cast<T>(1e-12);
auto one = static_cast<T>(1);

T epsilon = static_cast<T>(1e-12);
T one = static_cast<T>(1);
if (reduction_ == 0) {
if (reduction_ == kNone) {
if (weight_defined_) {
for (size_t i = 0; i < input_size_; i++) {
T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
@@ -50,7 +50,7 @@ void BinaryCrossEntropyGradCpuKernel::Launchkernel(const std::vector<AddressPtr>
}
} else {
T dloss1 = dloss[0];
if (reduction_ == 1) {
if (reduction_ == kMean) {
dloss1 = dloss[0] / static_cast<T>(input_size_);
}
if (weight_defined_) {
@@ -69,34 +69,44 @@ void BinaryCrossEntropyGradCpuKernel::Launchkernel(const std::vector<AddressPtr>
}
}

bool BinaryCrossEntropyGradCpuKernel::Launch(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace,
bool BinaryCrossEntropyGradCpuKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs) {
if (input_size_ > 0) {
if (dtype_ == kNumberTypeFloat32) {
Launchkernel<float>(inputs, outputs);
} else if (dtype_ == kNumberTypeFloat16) {
Launchkernel<float16>(inputs, outputs);
}
const size_t expect_inputs_num = weight_defined_ ? kBceGradInputsNumWithWeight : kBceGradInputsNumWithWeight - 1;
CHECK_KERNEL_INPUTS_NUM(inputs.size(), expect_inputs_num, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kBceGradOutputsNum, kernel_name_);
if (dtype_ == kNumberTypeFloat32) {
LaunchKernel<float>(inputs, outputs);
} else if (dtype_ == kNumberTypeFloat16) {
LaunchKernel<float16>(inputs, outputs);
} else {
MS_LOG(EXCEPTION) << kernel_name_ << " only support float16 and float32 on CPU, but got "
<< TypeIdToType(dtype_)->ToString();
}
return true;
}

void BinaryCrossEntropyGradCpuKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
weight_defined_ = (input_num == kBceGradInputsNumWithWeight);
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
for (size_t i = 0; i < input_shape.size(); i++) {
input_size_ *= input_shape[i];
}
string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, "reduction");
if (reduction == "none") {
reduction_ = 0;
} else if (reduction == "sum") {
reduction_ = 2;
}

size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
weight_defined_ = (input_num == kBceGradInputNumWithWeight);
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
const std::string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, REDUCTION);
if (reduction == NONE) {
reduction_ = kNone;
} else if (reduction == MEAN) {
reduction_ = kMean;
} else if (reduction == SUM) {
reduction_ = kSum;
} else {
MS_LOG(EXCEPTION) << kernel_name_ << "only support the reduction is 'none', 'mean', or 'sum', but got "
<< reduction;
}
}
} // namespace kernel
} // namespace mindspore

+ 8
- 5
mindspore/ccsrc/backend/kernel_compiler/cpu/binary_cross_entropy_grad_kernel.h View File

@@ -13,19 +13,22 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H

#include <vector>
#include <string>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
#include "backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.h"

namespace mindspore {
namespace kernel {
class BinaryCrossEntropyGradCpuKernel : public CPUKernel {
public:
BinaryCrossEntropyGradCpuKernel() : input_size_(1), reduction_(1), weight_defined_(false) {}
BinaryCrossEntropyGradCpuKernel() = default;
~BinaryCrossEntropyGradCpuKernel() override = default;

void InitKernel(const CNodePtr &kernel_node) override;
@@ -34,12 +37,12 @@ class BinaryCrossEntropyGradCpuKernel : public CPUKernel {

private:
template <typename T>
void Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const;

TypeId dtype_{kTypeUnknown};
size_t input_size_;
int reduction_;
bool weight_defined_; // true: there are 4 inputs, false: there are 3 inputs(no [weight])
size_t input_size_{1};
ReductionType reduction_{kNone};
bool weight_defined_{false}; // true: there are 4 inputs, false: there are 3 inputs(no [weight])
};
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
KernelAttr()


+ 1
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/boundingbox_decode_cpu_kernel.cc View File

@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/boundingbox_decode_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"



+ 3
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/boundingbox_decode_cpu_kernel.h View File

@@ -13,10 +13,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BOUNDINGBOX_DECODE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BOUNDINGBOX_DECODE_CPU_KERNEL_H_

#include <vector>
#include <algorithm>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"



+ 1
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/boundingbox_encode_cpu_kernel.cc View File

@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/boundingbox_encode_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"



+ 3
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/boundingbox_encode_cpu_kernel.h View File

@@ -13,10 +13,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BOUNDINGBOX_ENCODE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BOUNDINGBOX_ENCODE_CPU_KERNEL_H_

#include <vector>
#include <algorithm>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"



+ 19
- 22
mindspore/ccsrc/backend/kernel_compiler/cpu/broadcast_to_cpu_kernel.cc View File

@@ -15,13 +15,19 @@
*/

#include "backend/kernel_compiler/cpu/broadcast_to_cpu_kernel.h"
#include "nnacl/errorcode.h"
#include "backend/kernel_compiler/cpu/nnacl/errorcode.h"

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kBroadcastToInputsNum = 1;
constexpr size_t kBroadcastToOutputsNum = 1;
} // namespace

template <typename T>
void BroadcastToCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0);
size_t input_shape_size = input_shape_.size();
@@ -55,35 +61,26 @@ void BroadcastToCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
template <typename T>
bool BroadcastToCPUKernel<T>::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
const std::vector<AddressPtr> &outputs) {
if (inputs.size() != 1 || outputs.size() != 1) {
MS_LOG(EXCEPTION) << "Wrong number of inputs or outputs!";
}
if ((inputs[0] == nullptr) || (inputs[0]->size == 0)) {
MS_LOG(EXCEPTION) << "Input data is NULL!";
}
if ((outputs[0] == nullptr) || (outputs[0]->size == 0)) {
MS_LOG(EXCEPTION) << "Output data is NULL!";
}

const auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
int ret = static_cast<int>(NNACL_ERR);
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kBroadcastToInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kBroadcastToOutputsNum, kernel_name_);
const auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr);
auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr);
int status = static_cast<int>(NNACL_OK);
if constexpr (std::is_same_v<T, bool>) {
ret = BroadcastTo(bool, input_addr, &shape_info_, output_addr);
status = BROADCAST_TO(bool, input_addr, &shape_info_, output_addr);
} else if constexpr (std::is_same_v<T, int>) {
ret = BroadcastTo(int, input_addr, &shape_info_, output_addr);
status = BROADCAST_TO(int, input_addr, &shape_info_, output_addr);
} else if constexpr (std::is_same_v<T, float>) {
ret = BroadcastTo(float, input_addr, &shape_info_, output_addr);
status = BROADCAST_TO(float, input_addr, &shape_info_, output_addr);
} else {
MS_LOG(EXCEPTION) << "Not supported data type for BroadcastTo.";
}

if (ret == NNACL_OK) {
return true;
if (status != static_cast<int>(NNACL_OK)) {
MS_LOG(EXCEPTION) << "Broadcast tensor with shape " << input_shape_ << " to shape " << output_shape_
<< " execute failed, error code: " << status;
}
MS_LOG(ERROR) << "Broadcast tensor with shape " << input_shape_ << " to shape " << output_shape_
<< " execute failed.";
return false;
return true;
}
} // namespace kernel
} // namespace mindspore

+ 6
- 5
mindspore/ccsrc/backend/kernel_compiler/cpu/broadcast_to_cpu_kernel.h View File

@@ -14,14 +14,15 @@
* limitations under the License.
*/

#ifndef MINDSPORE_BROADCAST_TO_CPU_KERNEL_H
#define MINDSPORE_BROADCAST_TO_CPU_KERNEL_H
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BROADCAST_TO_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BROADCAST_TO_CPU_KERNEL_H_

#include <vector>
#include <memory>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
#include "nnacl/base/broadcast_to.h"
#include "backend/kernel_compiler/cpu/nnacl/base/broadcast_to.h"

namespace mindspore {
namespace kernel {
@@ -38,7 +39,7 @@ class BroadcastToCPUKernel : public CPUKernel {
private:
std::vector<size_t> input_shape_;
std::vector<size_t> output_shape_;
BroadcastShapeInfo shape_info_;
BroadcastShapeInfo shape_info_{};
};

MS_REG_CPU_KERNEL_T(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
@@ -50,4 +51,4 @@ MS_REG_CPU_KERNEL_T(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeBool).AddO
} // namespace kernel
} // namespace mindspore

#endif // MINDSPORE_BROADCAST_TO_CPU_KERNEL_H
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BROADCAST_TO_CPU_KERNEL_H_

+ 15
- 9
mindspore/ccsrc/backend/kernel_compiler/cpu/cast_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,14 +13,22 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/cast_cpu_kernel.h"

#include <cmath>
#include <map>
#include <string>
#include "backend/kernel_compiler/cpu/cast_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kCastInputsNum = 1;
constexpr size_t kCastOutputsNum = 1;
} // namespace

template <typename S, typename T>
void Cast(const S *in, T *out, size_t size) {
auto task = [&in, &out](size_t start, size_t end) {
@@ -34,6 +42,7 @@ void Cast(const S *in, T *out, size_t size) {
template <typename S, typename T>
void CastCPUKernel<S, T>::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
source_dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
target_dtype_ = AnfAlgo::GetOutputDeviceDataType(kernel_node, 0);
}
@@ -41,17 +50,14 @@ void CastCPUKernel<S, T>::InitKernel(const CNodePtr &kernel_node) {
template <typename S, typename T>
bool CastCPUKernel<S, T>::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
if (inputs.size() != 1 || outputs.size() != 1) {
MS_LOG(ERROR) << "Cast requires 1 input and 1 output, but got " << inputs.size() << " input and " << outputs.size()
<< " output.";
return false;
}
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kCastInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kCastOutputsNum, kernel_name_);
if (outputs[0]->size == 0) {
MS_LOG(WARNING) << "Cast output memory size should be greater than 0, but got 0.";
return true;
}
const auto input = reinterpret_cast<S *>(inputs[0]->addr);
const auto output = reinterpret_cast<T *>(outputs[0]->addr);
const auto *input = reinterpret_cast<S *>(inputs[0]->addr);
auto *output = reinterpret_cast<T *>(outputs[0]->addr);
MS_LOG(DEBUG) << "Type source: " << typeid(S).name() << "; target: " << typeid(T).name();
Cast<S, T>(input, output, outputs[0]->size / sizeof(T));
return true;


+ 4
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/cast_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,11 +13,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CAST_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CAST_CPU_KERNEL_H_

#include <functional>
#include <memory>
#include <vector>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"



+ 3
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/check_valid_cpu_kernel.cc View File

@@ -13,6 +13,9 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <functional>

#include "backend/kernel_compiler/cpu/check_valid_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"



+ 3
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/check_valid_cpu_kernel.h View File

@@ -13,9 +13,12 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CHECK_VALID_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CHECK_VALID_CPU_KERNEL_H_

#include <vector>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"



+ 19
- 17
mindspore/ccsrc/backend/kernel_compiler/cpu/concat_cpu_kernel.cc View File

@@ -19,11 +19,15 @@

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kConcatOutputsNum = 1;
} // namespace

template <typename T>
void ConcatCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
node_wpt_ = kernel_node;
CheckParam(kernel_node);

axis_ = LongToInt(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS));
auto input_1_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
if (axis_ < 0) {
@@ -34,15 +38,18 @@ void ConcatCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
template <typename T>
bool ConcatCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
auto node_ = node_wpt_.lock();
if (!node_) {
auto node = node_wpt_.lock();
if (!node) {
MS_LOG(EXCEPTION) << "node_wpt_ is expired.";
}
size_t input_num = AnfAlgo::GetInputTensorNum(node_);
const size_t input_num = AnfAlgo::GetInputTensorNum(node);
CHECK_KERNEL_INPUTS_NUM(inputs.size(), input_num, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kConcatOutputsNum, kernel_name_);

std::vector<std::vector<size_t>> input_flat_shape_list;
input_flat_shape_list.reserve(input_num);
for (size_t i = 0; i < input_num; i++) {
auto input_shape_i = AnfAlgo::GetPrevNodeOutputInferShape(node_, i);
auto input_shape_i = AnfAlgo::GetPrevNodeOutputInferShape(node, i);
auto flat_shape = CPUKernelUtils::FlatShapeByAxis(input_shape_i, axis_);
(void)input_flat_shape_list.emplace_back(flat_shape);
}
@@ -51,10 +58,10 @@ bool ConcatCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, c
for (size_t j = 0; j < input_num; ++j) {
output_dim_1 += input_flat_shape_list[j][1];
}
auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr);
std::vector<T *> input_addr_list;
for (size_t j = 0; j < input_num; ++j) {
auto tmp_addr = reinterpret_cast<T *>(inputs[j]->addr);
auto *tmp_addr = reinterpret_cast<T *>(inputs[j]->addr);
(void)input_addr_list.emplace_back(tmp_addr);
}
// each input's row of shape after flat are same
@@ -69,7 +76,10 @@ bool ConcatCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, c
auto copy_num = input_flat_shape_list[j][1];
auto copy_size = copy_num * sizeof(T);
auto offset = copy_num * i;
(void)memcpy_s(output_ptr, copy_size, input_addr_list[j] + offset, copy_size);
auto ret = memcpy_s(output_ptr, copy_size, input_addr_list[j] + offset, copy_size);
if (ret != EOK) {
MS_LOG(EXCEPTION) << "Memcpy failed.";
}
output_ptr += copy_num;
}
}
@@ -77,13 +87,5 @@ bool ConcatCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, c
ParallelLaunchAutoSearch(task, before_axis, this, &parallel_search_info_);
return true;
}

template <typename T>
void ConcatCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) const {
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but ConcatCPUKernel needs 1 output.";
}
}
} // namespace kernel
} // namespace mindspore

+ 5
- 3
mindspore/ccsrc/backend/kernel_compiler/cpu/concat_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,10 +13,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CONCAT_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CONCAT_CPU_KERNEL_H_

#include <vector>
#include <memory>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

@@ -34,8 +37,7 @@ class ConcatCPUKernel : public CPUKernel {
const std::vector<AddressPtr> &outputs) override;

private:
void CheckParam(const CNodePtr &kernel_node) const;
int axis_ = 0;
int axis_{0};
CNodeWeakPtr node_wpt_;
};



+ 14
- 8
mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,10 +13,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/cpu_kernel.h"

#include <algorithm>
#include <utility>
#include <cmath>

#include "common/thread_pool.h"
#include "utils/profile.h"

@@ -52,10 +55,11 @@ void CPUKernel::Init(const CNodePtr &kernel_node) {
}

void CPUKernelUtils::ExpandDimsTo4(std::vector<size_t> *shape) {
MS_EXCEPTION_IF_NULL(shape);
auto len = shape->size();
if (len < 4) {
for (size_t i = 0; i < 4 - len; ++i) {
shape->insert(shape->begin(), 1);
(void)shape->insert(shape->begin(), 1);
}
}
}
@@ -79,6 +83,7 @@ size_t CPUKernelUtils::GetElementNumOnAxis(const std::vector<size_t> &shape, int

void CPUKernelUtils::GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num) {
size_t accumulation = 1;
MS_EXCEPTION_IF_NULL(element_num);
(void)element_num->emplace_back(1);
for (size_t i = shape.size() - 1; i > 0; --i) {
accumulation *= shape[i];
@@ -112,6 +117,7 @@ void CPUKernelUtils::ParallelFor(const CTask &task, size_t count, float block_si
void CPUKernelUtils::ParallelForAutoSearch(const CTask &task, size_t count, ParallelSearchInfo *parallel_search_info) {
const size_t MAX_POW = 6;
const size_t AVG_COUNT = 5;
MS_EXCEPTION_IF_NULL(parallel_search_info);
size_t current_pow = parallel_search_info->search_count / AVG_COUNT;
if (current_pow < MAX_POW) {
if (parallel_search_info->search_count % AVG_COUNT == 0) {
@@ -276,12 +282,12 @@ void BroadcastIterator::GenNextPos() {
void BroadcastIterator::BroadcastShape() {
int input_dimension_a = input_shape_a_.size();
if (input_dimension_a < output_dimension_) {
input_shape_a_.insert(input_shape_a_.begin(), output_dimension_ - input_dimension_a, 1);
(void)input_shape_a_.insert(input_shape_a_.begin(), IntToSize(output_dimension_ - input_dimension_a), 1);
}

int input_dimension_b = input_shape_b_.size();
if (input_dimension_b < output_dimension_) {
input_shape_b_.insert(input_shape_b_.begin(), output_dimension_ - input_dimension_b, 1);
(void)input_shape_b_.insert(input_shape_b_.begin(), IntToSize(output_dimension_ - input_dimension_b), 1);
}
}

@@ -297,10 +303,10 @@ void BroadcastIterator::InitStrides() {

// Update strides for broadcast
// While the axis value is 1, the stride is 0
std::transform(input_strides_a_.begin(), input_strides_a_.end(), input_shape_a_.begin(), input_strides_a_.begin(),
[](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
std::transform(input_strides_b_.begin(), input_strides_b_.end(), input_shape_b_.begin(), input_strides_b_.begin(),
[](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
(void)std::transform(input_strides_a_.begin(), input_strides_a_.end(), input_shape_a_.begin(),
input_strides_a_.begin(), [](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
(void)std::transform(input_strides_b_.begin(), input_strides_b_.end(), input_shape_b_.begin(),
input_strides_b_.begin(), [](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
}

TransposeIterator::TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes,


+ 58
- 100
mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.h View File

@@ -13,14 +13,17 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_

#include <functional>
#include <memory>
#include <numeric>
#include <string>
#include <thread>
#include <vector>

#include "backend/kernel_compiler/kernel.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "backend/kernel_compiler/common_utils.h"
@@ -33,106 +36,61 @@ using mindspore::kernel::AddressPtr;
using CTask = std::function<void(size_t, size_t)>;
namespace mindspore {
namespace kernel {
const char KERNEL_SIZE[] = "kernel_size";
const char STRIDE[] = "stride";
const char STRIDES[] = "strides";
const char DILATION[] = "dilation";
const char DILATIONS[] = "dilations";
const char FORMAT[] = "format";
const char PAD[] = "pad";
const char PAD_LIST[] = "pad_list";
const char PAD_MODE[] = "pad_mode";
const char PAD_MODE_LOWER_SAME[] = "same";
const char PAD_MODE_LOWER_VALID[] = "valid";
const char PAD_MODE_UPPER_SAME[] = "SAME";
const char PAD_MODE_UPPER_VALID[] = "VALID";
const char TRANSPOSE_A[] = "transpose_a";
const char TRANSPOSE_B[] = "transpose_b";
const char IS_GRAD[] = "is_grad";
const char TRANSPOSE_NO = 'N';
const char TRANSPOSE_YES = 'T';
const char AXIS[] = "axis";
const char DIM[] = "dim";
const char BEGIN[] = "begin";
const char END[] = "end";
const char SIZE[] = "size";
const char USE_NESTEROV[] = "use_nesterov";
const char GROUP[] = "group";
const char START[] = "start";
const char LIMIT[] = "limit";
const char DELTA[] = "delta";
const char SORTED[] = "sorted";
const char ADJ_ST[] = "adjoint_st";
const char ADJ_dT[] = "adjoint_dt";
const char PERIODS[] = "periods";
const char WINDOW[] = "window";
const char MIN_PERIODS[] = "min_periods";
const char CENTER[] = "center";
const char METHOD[] = "method";
const char CLOSED[] = "closed";
const char NA_OPTION[] = "na_option";
const char ASCENDING[] = "ascending";
const char PCT[] = "pct";

enum OperateType {
ADD = 0,
SUB,
MUL,
DIV,
SQUARE,
SQRT,
POW,
REALDIV,
FLOORDIV,
MOD,
FLOORMOD,
NEG,
LESS,
ASSIGNADD,
RELUGRAD,
RELU6GRAD,
ABSGRAD,
TANHGRAD,
SQRTGRAD,
SIGMOIDGRAD,
ONESLIKE,
ZEROSLIKE,
SIGN,
EQUAL,
NOTEQUAL,
LESSEQUAL,
LOGICALAND,
LOGICALOR,
LOGICALNOT,
FLOOR,
SQUAREDDIFFERENCE,
GREATER,
GREATEREQUAL,
RECIPROCAL,
GELU,
GELUGRAD,
ASIN,
ACOS,
ATAN,
ASINGRAD,
ACOSGRAD,
ATANGRAD,
SIN,
COS,
TAN,
SINH,
COSH,
ASINH,
ACOSH,
ATANH,
ASINHGRAD,
ACOSHGRAD,
ATAN2,
RINT,
ROUND,
EXP,
IDENTITY,
};
constexpr char KERNEL_SIZE[] = "kernel_size";
constexpr char STRIDE[] = "stride";
constexpr char STRIDES[] = "strides";
constexpr char DILATION[] = "dilation";
constexpr char DILATIONS[] = "dilations";
constexpr char FORMAT[] = "format";
constexpr char PAD[] = "pad";
constexpr char PAD_LIST[] = "pad_list";
constexpr char PAD_MODE[] = "pad_mode";
constexpr char PAD_MODE_LOWER_SAME[] = "same";
constexpr char PAD_MODE_LOWER_VALID[] = "valid";
constexpr char PAD_MODE_UPPER_SAME[] = "SAME";
constexpr char PAD_MODE_UPPER_VALID[] = "VALID";
constexpr char TRANSPOSE_A[] = "transpose_a";
constexpr char TRANSPOSE_B[] = "transpose_b";
constexpr char IS_GRAD[] = "is_grad";
constexpr char TRANSPOSE_NO = 'N';
constexpr char TRANSPOSE_YES = 'T';
constexpr char AXIS[] = "axis";
constexpr char DIM[] = "dim";
constexpr char NUM[] = "num";
constexpr char BEGIN[] = "begin";
constexpr char END[] = "end";
constexpr char SIZE[] = "size";
constexpr char USE_NESTEROV[] = "use_nesterov";
constexpr char GROUP[] = "group";
constexpr char START[] = "start";
constexpr char LIMIT[] = "limit";
constexpr char DELTA[] = "delta";
constexpr char SORTED[] = "sorted";
constexpr char ADJ_ST[] = "adjoint_st";
constexpr char ADJ_dT[] = "adjoint_dt";
constexpr char REDUCTION[] = "reduction";
constexpr char NONE[] = "none";
constexpr char SUM[] = "sum";
constexpr char MEAN[] = "mean";
constexpr char BETA[] = "beta";
constexpr char EXCLUSIVE[] = "exclusive";
constexpr char REVERSE[] = "reverse";
constexpr char PCR[] = "preprocess_collapse_repeated";
constexpr char CTR[] = "ctc_merge_repeated";
constexpr char ILOTI[] = "ignore_longer_outputs_than_inputs";
constexpr char MOMENTUM[] = "momentum";
constexpr char RHO[] = "rho";
constexpr char EPSILON[] = "epsilon";
constexpr char ALIGN_CORNERS[] = "align_corners";
constexpr char PERIODS[] = "periods";
constexpr char WINDOW[] = "window";
constexpr char MIN_PERIODS[] = "min_periods";
constexpr char CENTER[] = "center";
constexpr char METHOD[] = "method";
constexpr char CLOSED[] = "closed";
constexpr char NA_OPTION[] = "na_option";
constexpr char ASCENDING[] = "ascending";
constexpr char PCT[] = "pct";

struct ParallelSearchInfo {
double min_cost_time{DBL_MAX};


+ 7
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel_factory.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -25,7 +25,10 @@

namespace mindspore {
namespace kernel {
namespace {
const std::set<std::string> same_op_name = {"Concat", "Pack", "Stack", "Split", "Transpose", "Unpack", "AddN"};
} // namespace

CPUKernelFactory &CPUKernelFactory::GetInstance() {
static CPUKernelFactory instance;
return instance;
@@ -40,6 +43,7 @@ void CPUKernelFactory::Register(const std::string &kernel_name, const KernelAttr
}

std::shared_ptr<CPUKernel> CPUKernelFactory::Create(const std::string &kernel_name, const CNodePtr &apply_kernel) {
MS_EXCEPTION_IF_NULL(apply_kernel);
auto kernel_info = dynamic_cast<device::KernelInfo *>(apply_kernel->kernel_info());
MS_EXCEPTION_IF_NULL(kernel_info);
const KernelBuildInfo *kernel_build_Info = kernel_info->select_kernel_build_info();
@@ -53,6 +57,8 @@ std::shared_ptr<CPUKernel> CPUKernelFactory::Create(const std::string &kernel_na

void CPUKernelFactory::SetKernelAttrs(const std::shared_ptr<kernel::OpInfo> op_info,
std::vector<KernelAttr> *kernel_attrs) {
MS_EXCEPTION_IF_NULL(kernel_attrs);
MS_EXCEPTION_IF_NULL(op_info);
auto inputs_ptr = op_info->inputs_ptr();
auto outputs_ptr = op_info->outputs_ptr();
if (inputs_ptr.empty()) {


+ 4
- 2
mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel_factory.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_FACTORY_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_FACTORY_H_

@@ -23,15 +24,16 @@
#include <utility>
#include <vector>

#include "utils/ms_utils.h"
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/oplib/oplib.h"
#include "runtime/device/cpu/kernel_select_cpu.h"
#include "utils/ms_utils.h"

namespace mindspore {
namespace kernel {
using mindspore::device::cpu::KernelAttr;
using CPUKernelCreator = std::function<std::shared_ptr<CPUKernel>()>;

class CPUKernelFactory {
public:
static CPUKernelFactory &GetInstance();


+ 1
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/crop_and_resize_cpu_kernel.cc View File

@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/crop_and_resize_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"



+ 3
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/crop_and_resize_cpu_kernel.h View File

@@ -13,11 +13,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CROP_AND_RESIZE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CROP_AND_RESIZE_CPU_KERNEL_H_

#include <vector>
#include <string>
#include <algorithm>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"



+ 93
- 95
mindspore/ccsrc/backend/kernel_compiler/cpu/ctcloss_cpu_kernel.cc View File

@@ -19,10 +19,62 @@

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kCTCLossInputsNum = 4;
constexpr size_t kCTCLossOutputsNum = 2;

template <typename T>
inline T LogSumExp(const T logprob1, const T logprob2) {
T kLogZero_ = -std::numeric_limits<T>::infinity();
if (logprob1 <= kLogZero_) {
return logprob2;
}
if (logprob2 <= kLogZero_) {
return logprob1;
}
return (logprob1 > logprob2) ? logprob1 + static_cast<T>(log1p(exp(logprob2 - logprob1)))
: logprob2 + static_cast<T>(log1p(exp(logprob1 - logprob2)));
}

template <typename T>
void InnerSoftMax(const T *inputs_addr, std::vector<std::vector<T>> *softmax_probs, const uint32_t sequence_length,
size_t num_class, size_t batch_size, size_t b) {
for (size_t t = 0; t < sequence_length; ++t) {
auto maxCoeff = static_cast<T>(0);
auto sumCoeff = static_cast<T>(0);

for (size_t c = 0; c < num_class; ++c) {
if (inputs_addr[t * batch_size * num_class + b * num_class + c] > maxCoeff) {
maxCoeff = inputs_addr[t * batch_size * num_class + b * num_class + c];
}
}

for (size_t c = 0; c < num_class; ++c) {
sumCoeff += static_cast<T>(exp(inputs_addr[t * batch_size * num_class + b * num_class + c] - maxCoeff));
(*softmax_probs)[c][t] =
static_cast<T>(exp(inputs_addr[t * batch_size * num_class + b * num_class + c] - maxCoeff));
}

for (size_t c = 0; c < num_class; ++c) {
(*softmax_probs)[c][t] /= sumCoeff;
}
}
}

template <typename T>
void MatrixFromVector(uint32_t row, uint32_t col, std::vector<std::vector<T>> *array2D, const T init_value) {
array2D->resize(row);
for (size_t i = 0; i < row; ++i) {
(*array2D)[i].resize(col, init_value);
}
}
} // namespace

void CTCLossCPUKernel::InitKernel(const CNodePtr &kernel_node) {
CheckParam(kernel_node);
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
probs_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
indice_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
indices_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
labels_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);

@@ -32,14 +84,13 @@ void CTCLossCPUKernel::InitKernel(const CNodePtr &kernel_node) {
if (labels_dims_.size() != 1) {
MS_LOG(EXCEPTION) << "Labels dims: " << labels_dims_.size() << " not support.";
}
if (indice_dims_.size() != 2) {
MS_LOG(EXCEPTION) << "Labels indice dims: " << indice_dims_.size() << " not support.";
if (indices_dims_.size() != 2) {
MS_LOG(EXCEPTION) << "Labels indice dims: " << indices_dims_.size() << " not support.";
}

preprocess_collapse_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "preprocess_collapse_repeated");
ctc_merge_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "ctc_merge_repeated");
ignore_longer_outputs_than_inputs_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "ignore_longer_outputs_than_inputs");

preprocess_collapse_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, PCR);
ctc_merge_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, CTR);
ignore_longer_outputs_than_inputs_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, ILOTI);
max_time_ = probs_shape_[0];
batch_size_ = probs_shape_[1];
num_class_ = probs_shape_[2];
@@ -48,31 +99,23 @@ void CTCLossCPUKernel::InitKernel(const CNodePtr &kernel_node) {

bool CTCLossCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kCTCLossInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kCTCLossOutputsNum, kernel_name_);
if (dtype_ == kNumberTypeFloat16) {
LaunchKernel<float16>(inputs, outputs);
} else if (dtype_ == kNumberTypeFloat32) {
LaunchKernel<float>(inputs, outputs);
}
return true;
}

template <typename T>
inline T LogSumExp(const T logprob1, const T logprob2) {
T kLogZero_ = -std::numeric_limits<T>::infinity();
if (logprob1 <= kLogZero_) {
return logprob2;
} else if (logprob2 <= kLogZero_) {
return logprob1;
} else {
return (logprob1 > logprob2) ? logprob1 + static_cast<T>(log1p(exp(logprob2 - logprob1)))
: logprob2 + static_cast<T>(log1p(exp(logprob1 - logprob2)));
MS_LOG(EXCEPTION) << kernel_name_ << " only support float16 and float32 on CPU, but got "
<< TypeIdToType(dtype_)->ToString();
}
return true;
}

template <typename TT>
void CTCLossCPUKernel::CalculateFwdVar(const std::vector<uint32_t> &label_with_blank,
const std::vector<std::vector<TT>> &y,
std::vector<std::vector<TT>> *log_alpha_b) {
std::vector<std::vector<TT>> *log_alpha_b) const {
int U = label_with_blank.size();
int T = (*log_alpha_b)[0].size();
TT kLogZero_ = -std::numeric_limits<TT>::infinity();
@@ -112,7 +155,7 @@ void CTCLossCPUKernel::CalculateFwdVar(const std::vector<uint32_t> &label_with_b
template <typename TT>
void CTCLossCPUKernel::CalculateBwdVar(const std::vector<uint32_t> &label_with_blank,
const std::vector<std::vector<TT>> &y,
std::vector<std::vector<TT>> *log_beta_b) {
std::vector<std::vector<TT>> *log_beta_b) const {
int T = (*log_beta_b)[0].size();
int U = label_with_blank.size();
if (U > 1) {
@@ -154,7 +197,7 @@ void CTCLossCPUKernel::CalculateGrad(const std::vector<uint32_t> &label_with_bla
const std::vector<std::vector<TT>> &y,
const std::vector<std::vector<TT>> &log_alpha_b,
const std::vector<std::vector<TT>> &log_beta_b, const TT log_pzx,
std::vector<std::vector<TT>> *dy) {
std::vector<std::vector<TT>> *dy) const {
auto dy_b = dy;
TT kLogZero_ = -std::numeric_limits<TT>::infinity();
if (log_pzx <= kLogZero_) {
@@ -179,8 +222,8 @@ void CTCLossCPUKernel::CalculateGrad(const std::vector<uint32_t> &label_with_bla
}
}

void CTCLossCPUKernel::GenLableWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label,
std::vector<std::vector<uint32_t>> *label_with_blank) {
void CTCLossCPUKernel::GenLabelWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label,
std::vector<std::vector<uint32_t>> *label_with_blank) const {
for (size_t b = 0; b < batch_size_; ++b) {
std::vector<uint32_t> l;
const std::vector<uint32_t> &label = batch_label[b];
@@ -197,11 +240,9 @@ void CTCLossCPUKernel::GenLableWithBlank(const uint32_t *seq_len, const std::vec
}
}
}
if (!ignore_longer_outputs_than_inputs_) {
if (l.size() > seq_len[b]) {
MS_LOG(EXCEPTION) << "Input time(sequence length) should greater than output size(label length), but gets "
<< seq_len[b] << "< " << l.size();
}
if (!ignore_longer_outputs_than_inputs_ && l.size() > seq_len[b]) {
MS_LOG(EXCEPTION) << "Input time(sequence length) should greater than output size(label length), but gets "
<< seq_len[b] << "< " << l.size();
}

(*label_with_blank)[b].reserve(2 * l.size() + 1);
@@ -214,46 +255,14 @@ void CTCLossCPUKernel::GenLableWithBlank(const uint32_t *seq_len, const std::vec
}

template <typename T>
void InnerSoftMax(const T *inputs_addr, std::vector<std::vector<T>> *softmax_probs, const uint32_t sequence_length,
size_t num_class, size_t batch_size, size_t b) {
for (size_t t = 0; t < sequence_length; ++t) {
T maxCoeff(T(0));
T sumCoeff(T(0));

for (size_t c = 0; c < num_class; ++c) {
if (inputs_addr[t * batch_size * num_class + b * num_class + c] > maxCoeff) {
maxCoeff = inputs_addr[t * batch_size * num_class + b * num_class + c];
}
}

for (size_t c = 0; c < num_class; ++c) {
sumCoeff += static_cast<T>(exp(inputs_addr[t * batch_size * num_class + b * num_class + c] - maxCoeff));
(*softmax_probs)[c][t] =
static_cast<T>(exp(inputs_addr[t * batch_size * num_class + b * num_class + c] - maxCoeff));
}

for (size_t c = 0; c < num_class; ++c) {
(*softmax_probs)[c][t] /= sumCoeff;
}
}
}

template <typename T>
void MatrixfromVector(uint32_t row, uint32_t col, std::vector<std::vector<T>> *array2D, const T init_value) {
array2D->resize(row);
for (size_t i = 0; i < row; ++i) {
(*array2D)[i].resize(col, init_value);
}
}

template <typename T>
void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
auto inputs_addr = reinterpret_cast<T *>(inputs[0]->addr);
auto labels_indices_addr = reinterpret_cast<uint64_t *>(inputs[1]->addr);
auto labels_values_addr = reinterpret_cast<uint32_t *>(inputs[2]->addr);
auto sequence_length_addr = reinterpret_cast<uint32_t *>(inputs[3]->addr);
auto loss_addr = reinterpret_cast<T *>(outputs[0]->addr);
auto gradient_addr = reinterpret_cast<T *>(outputs[1]->addr);
void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &outputs) const {
const auto *inputs_addr = reinterpret_cast<T *>(inputs[0]->addr);
const auto *labels_indices_addr = reinterpret_cast<uint64_t *>(inputs[1]->addr);
const auto *labels_values_addr = reinterpret_cast<uint32_t *>(inputs[2]->addr);
const auto *sequence_length_addr = reinterpret_cast<uint32_t *>(inputs[3]->addr);
auto *loss_addr = reinterpret_cast<T *>(outputs[0]->addr);
auto *gradient_addr = reinterpret_cast<T *>(outputs[1]->addr);

std::vector<std::vector<uint32_t>> label_batch;
std::vector<std::vector<uint32_t>> labels_with_blank;
@@ -266,18 +275,21 @@ void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const
T kLogZero_ = -std::numeric_limits<T>::infinity();
// check validation of sequence length
for (size_t b = 0; b < batch_size_; ++b) {
if (sequence_length_addr[b] == uint32_t(0)) {
if (sequence_length_addr[b] == static_cast<uint32_t>(0)) {
MS_LOG(EXCEPTION) << "Sequence length should > 0, but gets " << sequence_length_addr[b];
}

if (sequence_length_addr[b] > max_time_) {
MS_LOG(EXCEPTION) << "Max time should be greater than sequence length, but gets " << max_time_ << " < "
<< sequence_length_addr[b];
}
}

for (size_t i = 0; i < indice_dims_[0]; ++i) {
each_label_length[labels_indices_addr[i * 2]]++;
for (size_t i = 0; i < indices_dims_[0]; ++i) {
const size_t factor = 2;
auto index = labels_indices_addr[i * factor];
if (index >= SizeToUlong(each_label_length.size())) {
MS_LOG(EXCEPTION) << "Index: " << index << "out of the bounds of the vector.";
}
each_label_length[index]++;
}

// convert label format of label_value and label_indices to batch_label
@@ -291,7 +303,7 @@ void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const
}

// convert label to label with blank
GenLableWithBlank(sequence_length_addr, label_batch, &labels_with_blank);
GenLabelWithBlank(sequence_length_addr, label_batch, &labels_with_blank);

for (size_t b = 0; b < batch_size_; ++b) {
std::vector<uint32_t> label_with_blank = labels_with_blank[b];
@@ -300,12 +312,11 @@ void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const
std::vector<std::vector<T>> dy;
std::vector<std::vector<T>> log_alpha_b;
std::vector<std::vector<T>> log_beta_b;
MatrixfromVector(num_class_, sequence_length_addr[b], &y_b, kLogZero_);
MatrixfromVector(y_b.size(), y_b[0].size(), &dy, T(0));
MatrixfromVector(label_with_blank.size(), sequence_length_addr[b], &log_alpha_b, kLogZero_);
MatrixfromVector(label_with_blank.size(), sequence_length_addr[b], &log_beta_b, kLogZero_);
MatrixFromVector(num_class_, sequence_length_addr[b], &y_b, kLogZero_);
MatrixFromVector(y_b.size(), y_b[0].size(), &dy, T(0));
MatrixFromVector(label_with_blank.size(), sequence_length_addr[b], &log_alpha_b, kLogZero_);
MatrixFromVector(label_with_blank.size(), sequence_length_addr[b], &log_beta_b, kLogZero_);
InnerSoftMax(inputs_addr, &y_b, sequence_length_addr[b], num_class_, batch_size_, b);

CalculateFwdVar(label_with_blank, y_b, &log_alpha_b);
CalculateBwdVar(label_with_blank, y_b, &log_beta_b);

@@ -313,9 +324,7 @@ void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const
for (size_t u = 0; u < label_with_blank.size(); ++u) {
log_pzx = LogSumExp(log_pzx, log_alpha_b[u][0] + log_beta_b[u][0]);
}

loss_addr[b] = -log_pzx;

CalculateGrad(label_with_blank, y_b, log_alpha_b, log_beta_b, log_pzx, &dy);

for (size_t t = 0; t < sequence_length_addr[b]; ++t) {
@@ -325,16 +334,5 @@ void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const
}
}
}

void CTCLossCPUKernel::CheckParam(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 4) {
MS_LOG(EXCEPTION) << "CTCLossCPUKernel needs 4 inputs, but gets " << input_num;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 2) {
MS_LOG(EXCEPTION) << "CTCLossCPUKernel expects 2 outputs, but gets" << output_num;
}
}
} // namespace kernel
} // namespace mindspore

+ 17
- 16
mindspore/ccsrc/backend/kernel_compiler/cpu/ctcloss_cpu_kernel.h View File

@@ -16,11 +16,13 @@

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_

#include <memory>
#include <unordered_map>
#include <vector>
#include <algorithm>
#include <limits>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

@@ -36,36 +38,35 @@ class CTCLossCPUKernel : public CPUKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;

void GenLableWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label,
std::vector<std::vector<uint32_t>> *label_with_blank);
private:
void GenLabelWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label,
std::vector<std::vector<uint32_t>> *label_with_blank) const;

template <typename T>
void CalculateFwdVar(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
std::vector<std::vector<T>> *log_alpha_b);
std::vector<std::vector<T>> *log_alpha_b) const;
template <typename T>
void CalculateBwdVar(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
std::vector<std::vector<T>> *log_beta_b);
std::vector<std::vector<T>> *log_beta_b) const;
template <typename T>
void CalculateGrad(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
const std::vector<std::vector<T>> &log_alpha_b, const std::vector<std::vector<T>> &log_beta_b,
const T log_pzx, std::vector<std::vector<T>> *dy);
const T log_pzx, std::vector<std::vector<T>> *dy) const;

template <typename T>
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const;

private:
void CheckParam(const CNodePtr &kernel_node);
std::vector<size_t> probs_shape_;
std::vector<size_t> indice_dims_;
std::vector<size_t> indices_dims_;
std::vector<size_t> labels_dims_;
size_t num_class_;
size_t max_time_;
size_t batch_size_;
uint32_t blank_index_;
size_t num_class_{0};
size_t max_time_{0};
size_t batch_size_{0};
uint32_t blank_index_{0};
TypeId dtype_{kTypeUnknown};
bool preprocess_collapse_repeated_;
bool ctc_merge_repeated_;
bool ignore_longer_outputs_than_inputs_;
bool preprocess_collapse_repeated_{false};
bool ctc_merge_repeated_{false};
bool ignore_longer_outputs_than_inputs_{false};
};

MS_REG_CPU_KERNEL(CTCLoss,


+ 32
- 25
mindspore/ccsrc/backend/kernel_compiler/cpu/cumsum_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,20 +13,29 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <thread>
#include "backend/kernel_compiler/cpu/cumsum_cpu_kernel.h"

#include <thread>

#include "runtime/device/cpu/cpu_device_address.h"

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kCumSumInputsNum = 1;
constexpr size_t kCumSumOutputsNum = 1;
} // namespace

void CumSumCPUKernel::InitKernel(const CNodePtr &kernel_node) {
CheckParam(kernel_node);
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
axis_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "axis"));
axis_ = LongToInt(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS));
dst_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
exclusive_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "exclusive");
reverse_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "reverse");
exclusive_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, EXCLUSIVE);
reverse_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, REVERSE);
int input_dim_length = SizeToInt(shape_.size());
if (axis_ >= input_dim_length) {
MS_LOG(EXCEPTION) << "Axis out of bounds.";
@@ -57,12 +66,17 @@ void CumSumCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
InitWorkspaceSize<int8_t>();
} else if (dtype_ == kNumberTypeUInt8) {
InitWorkspaceSize<uint8_t>();
} else {
MS_LOG(EXCEPTION) << kernel_name_ << " supports (float16, float32, uint8, int8, int32) on CPU, but got "
<< TypeIdToType(dtype_)->ToString();
}
}

bool CumSumCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &workspace,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kCumSumInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kCumSumOutputsNum, kernel_name_);
Reshape();
if (dtype_ == kNumberTypeFloat32) {
LaunchKernel<float_t>(inputs, workspace, outputs);
@@ -74,6 +88,9 @@ bool CumSumCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
LaunchKernel<int8_t>(inputs, workspace, outputs);
} else if (dtype_ == kNumberTypeUInt8) {
LaunchKernel<uint8_t>(inputs, workspace, outputs);
} else {
MS_LOG(EXCEPTION) << kernel_name_ << " supports (float16, float32, uint8, int8, int32) on CPU, but got "
<< TypeIdToType(dtype_)->ToString();
}
return true;
}
@@ -90,12 +107,11 @@ void CumSumCPUKernel::Reshape() {
}
stride_ = dims_[1] * dims_[2];
stride2_ = dims_[2];
return;
}

template <typename T>
void CumSumCPUKernel::LeftMove(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride,
size_t stride2, size_t start, size_t end) {
size_t stride2, size_t start, size_t end) const {
for (size_t i = start; i < end; i++) {
size_t k1 = i / dim2 % dim0;
size_t k2 = i % dim2;
@@ -114,7 +130,7 @@ void CumSumCPUKernel::LeftMove(const T *input, T *output, size_t dim0, size_t di

template <typename T>
void CumSumCPUKernel::RightMove(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride,
size_t stride2, size_t start, size_t end) {
size_t stride2, size_t start, size_t end) const {
for (size_t i = start; i < end; i++) {
size_t k1 = i / dim2 % dim0;
size_t k2 = i % dim2;
@@ -133,7 +149,7 @@ void CumSumCPUKernel::RightMove(const T *input, T *output, size_t dim0, size_t d

template <typename T>
void CumSumCPUKernel::Copy(T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2,
size_t start, size_t end) {
size_t start, size_t end) const {
for (size_t i = start; i < end; i++) {
size_t k1 = i / dim2 % dim0;
size_t k2 = i % dim2;
@@ -147,7 +163,7 @@ void CumSumCPUKernel::Copy(T *input, T *output, size_t dim0, size_t dim1, size_t

template <typename T>
void CumSumCPUKernel::CumSumKernelReverse(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2,
size_t stride, size_t stride2, size_t start, size_t end) {
size_t stride, size_t stride2, size_t start, size_t end) const {
for (size_t i = start; i < end; i++) {
size_t k1 = i / dim2 % dim0;
size_t k2 = i % dim2;
@@ -166,7 +182,7 @@ void CumSumCPUKernel::CumSumKernelReverse(const T *input, T *output, size_t dim0

template <typename T>
void CumSumCPUKernel::CumSumKernel(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride,
size_t stride2, size_t start, size_t end) {
size_t stride2, size_t start, size_t end) const {
for (size_t i = start; i < end; i++) {
size_t k1 = i / dim2 % dim0;
size_t k2 = i % dim2;
@@ -184,7 +200,7 @@ void CumSumCPUKernel::CumSumKernel(const T *input, T *output, size_t dim0, size_
}

template <typename T>
void CumSumCPUKernel::LaunchCumSum(const T *input, T *output, T *workspace, size_t start, size_t end) {
void CumSumCPUKernel::LaunchCumSum(const T *input, T *output, T *workspace, size_t start, size_t end) const {
start = start / dims_[1];
end = end / dims_[1];
if (exclusive_) {
@@ -204,15 +220,14 @@ void CumSumCPUKernel::LaunchCumSum(const T *input, T *output, T *workspace, size
CumSumKernel(input, output, dims_[0], dims_[1], dims_[2], stride_, stride2_, start, end);
}
}
return;
}

template <typename T>
void CumSumCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &workspace,
const std::vector<kernel::AddressPtr> &outputs) {
auto input = reinterpret_cast<T *>(inputs[0]->addr);
auto ws = reinterpret_cast<T *>(workspace[0]->addr);
const std::vector<kernel::AddressPtr> &outputs) const {
const auto *input = reinterpret_cast<T *>(inputs[0]->addr);
auto *ws = reinterpret_cast<T *>(workspace[0]->addr);
auto output = reinterpret_cast<T *>(outputs[0]->addr);
// multithreading
size_t lens = inputs[0]->size > 0 ? static_cast<size_t>(inputs[0]->size / sizeof(T)) : 1;
@@ -239,14 +254,6 @@ void CumSumCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs
for (size_t i = 0; i < threads.size(); ++i) {
threads[i].join();
}
return;
}

void CumSumCPUKernel::CheckParam(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 1) {
MS_LOG(EXCEPTION) << "Argument number is " << input_num << ", but CumSumGpuKernel needs 1.";
}
}
} // namespace kernel
} // namespace mindspore

+ 25
- 26
mindspore/ccsrc/backend/kernel_compiler/cpu/cumsum_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@

#include <memory>
#include <vector>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

@@ -31,55 +32,53 @@ class CumSumCPUKernel : public CPUKernel {

void InitKernel(const CNodePtr &kernel_node) override;

void InitInputOutputSize(const CNodePtr &kernel_node) override;

bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;

template <typename T>
void InitWorkspaceSize();

template <typename T>
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs);
private:
void Reshape();

template <typename T>
void LaunchCumSum(const T *input_addr, T *output_addr, T *ws_addr, size_t start, size_t end);

private:
void CheckParam(const CNodePtr &kernel_node);
void InitWorkspaceSize();

void Reshape();
void InitInputOutputSize(const CNodePtr &kernel_node) override;

template <typename T>
void LeftMove(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2,
size_t start, size_t end);
size_t start, size_t end) const;

template <typename T>
void RightMove(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2,
size_t start, size_t end);
size_t start, size_t end) const;

template <typename T>
void Copy(T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2, size_t start,
size_t end);
size_t end) const;

template <typename T>
void CumSumKernelReverse(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride,
size_t stride2, size_t start, size_t end);
size_t stride2, size_t start, size_t end) const;

template <typename T>
void CumSumKernel(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2,
size_t start, size_t end);
size_t start, size_t end) const;

template <typename T>
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) const;

template <typename T>
void LaunchCumSum(const T *input_addr, T *output_addr, T *ws_addr, size_t start, size_t end) const;

std::vector<size_t> shape_;
std::vector<size_t> dst_shape;
size_t input_size_0_;
size_t stride_;
size_t stride2_;
size_t dims_[3] = {};
int exclusive_;
int reverse_;
int axis_;
size_t input_size_0_{0};
size_t stride_{0};
size_t stride2_{0};
size_t dims_[3]{0};
int exclusive_{0};
int reverse_{0};
int axis_{0};
TypeId dtype_{kTypeUnknown};
};



+ 15
- 8
mindspore/ccsrc/backend/kernel_compiler/cpu/debug_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,28 +13,35 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/debug_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "utils/ms_utils.h"

namespace mindspore {
namespace kernel {
void DebugCPUKernel::InitKernel(const CNodePtr &kernel_node) { MS_EXCEPTION_IF_NULL(kernel_node); }
namespace {
constexpr size_t kDebugInputsNum = 1;
constexpr size_t kDebugOutputsNum = 1;
} // namespace

void DebugCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
}

bool DebugCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
if (inputs.size() < 1 || outputs.empty()) {
MS_LOG(EXCEPTION) << "Input or output empty!";
}
auto val = reinterpret_cast<int *>(inputs[0]->addr);
MS_LOG(DEBUG) << " launch DebugCountCPUKernel val " << *val;
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kDebugInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kDebugOutputsNum, kernel_name_);
const auto *val = reinterpret_cast<int *>(inputs[0]->addr);
MS_LOG(DEBUG) << " launch DebugCountCPUKernel";

auto output = reinterpret_cast<int *>(outputs[0]->addr);
size_t elem_num = inputs[0]->size / sizeof(int);
for (size_t i = 0; i < elem_num; i++) {
output[i] = static_cast<int>(val[i]);
}

return true;
}
} // namespace kernel


+ 3
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/debug_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,11 +13,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEBUG_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEBUG_CPU_KERNEL_H_

#include <vector>
#include <memory>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"



+ 3
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/depthtospace_cpu_kernel.h View File

@@ -13,14 +13,17 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_

#include <memory>
#include <string>
#include <vector>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

namespace mindspore {
namespace kernel {
template <typename T>


+ 22
- 23
mindspore/ccsrc/backend/kernel_compiler/cpu/dropout_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,24 +14,29 @@
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/dropout_cpu_kernel.h"

#include <algorithm>
#include <random>

#include "runtime/device/cpu/cpu_device_address.h"
#include "backend/kernel_compiler/cpu/dropout_cpu_kernel.h"

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kDropoutInputsNum = 1;
constexpr size_t kDropoutOutputsNum = 2;
} // namespace

void DropoutCPUKernel::InitKernel(const CNodePtr &kernel_node) {
CheckParam(kernel_node);
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0);
mask_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 1);
keep_prob_ = AnfAlgo::GetNodeAttr<float>(kernel_node, "keep_prob");
if (keep_prob_ <= 0.0) {
MS_LOG(EXCEPTION) << "Keep_prob is smaller or equal to zero but DropoutCPUKernel needs greater than 0";
}
if (keep_prob_ > 1.0) {
MS_LOG(EXCEPTION) << "Keep_prob greater than one but DropoutCPUKernel needs smaller or equal to one";
if (keep_prob_ <= 0.0 || keep_prob_ > 1.0) {
MS_LOG(EXCEPTION) << kernel_name_ << " requires keep_prob to be in (0.0, 1.0], but got " << keep_prob_;
}
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
for (const uint64_t &d : input_shape_) {
@@ -41,18 +46,24 @@ void DropoutCPUKernel::InitKernel(const CNodePtr &kernel_node) {

bool DropoutCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kDropoutInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kDropoutOutputsNum, kernel_name_);
if (dtype_ == kNumberTypeFloat16) {
LaunchKernel<float16>(inputs, outputs);
} else if (dtype_ == kNumberTypeFloat32) {
LaunchKernel<float>(inputs, outputs);
} else {
MS_LOG(EXCEPTION) << kernel_name_ << " only supports float16 and float32 on CPU, but got "
<< TypeIdToType(dtype_)->ToString();
}
return true;
}

template <typename T>
void DropoutCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
void DropoutCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &outputs) const {
const auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr);
auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr);
auto mask_addr = reinterpret_cast<T *>(outputs[1]->addr);
std::random_device rd;
std::mt19937 gen(rd());
@@ -63,17 +74,5 @@ void DropoutCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const
output_addr[i] = mask_addr[i] * input_addr[i] * scale;
}
}

void DropoutCPUKernel::CheckParam(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 1) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but DropoutCPUKernel needs 1 input.";
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 2) {
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but DropoutCPUKernel needs 1 output.";
}
}
} // namespace kernel
} // namespace mindspore

+ 7
- 6
mindspore/ccsrc/backend/kernel_compiler/cpu/dropout_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -16,8 +16,10 @@

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DROPOUT_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DROPOUT_CPU_KERNEL_H_

#include <memory>
#include <vector>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

@@ -33,17 +35,16 @@ class DropoutCPUKernel : public CPUKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;

private:
template <typename T>
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const;

private:
void CheckParam(const CNodePtr &kernel_node);
std::vector<size_t> input_shape_;
std::vector<size_t> output_shape_;
std::vector<size_t> mask_shape_;
TypeId dtype_{kTypeUnknown};
float keep_prob_ = 0.0;
uint64_t tensor_size_ = 1;
float keep_prob_{0.0};
uint64_t tensor_size_{1};
};

MS_REG_CPU_KERNEL(Dropout, KernelAttr(), DropoutCPUKernel);


+ 18
- 7
mindspore/ccsrc/backend/kernel_compiler/cpu/dropout_grad_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,16 +13,24 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/dropout_grad_kernel.h"

#include <vector>
#include "runtime/device/cpu/cpu_device_address.h"
#include "backend/kernel_compiler/cpu/dropout_grad_kernel.h"
#include "nnacl/fp32_grad/dropout_grad.h"
#include "backend/kernel_compiler/cpu/nnacl/fp32_grad/dropout_grad.h"

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kDropoutGradInputsNum = 2;
constexpr size_t kDropoutGradOutputsNum = 1;
} // namespace

void DropoutGradCpuBwdKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);

kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
auto input_mask_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
if (input_shape.size() != input_mask_shape.size()) {
@@ -35,8 +43,8 @@ void DropoutGradCpuBwdKernel::InitKernel(const CNodePtr &kernel_node) {
}
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
keep_prob_ = AnfAlgo::GetNodeAttr<float>(kernel_node, "keep_prob");
if (keep_prob_ == 0) {
MS_LOG(EXCEPTION) << "The keep_prob is zero.";
if (keep_prob_ <= 0.0 || keep_prob_ > 1.0) {
MS_LOG(EXCEPTION) << kernel_name_ << " requires keep_prob to be in (0.0, 1.0], but got " << keep_prob_;
}
}

@@ -51,12 +59,15 @@ void DropoutGradCpuBwdKernel::InitInputOutputSize(const CNodePtr &kernel_node) {

bool DropoutGradCpuBwdKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kDropoutGradInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kDropoutGradOutputsNum, kernel_name_);
if (dtype_ == kNumberTypeFloat16) {
DropoutBackwardKernel<float16>(inputs, workspace, outputs, keep_prob_);
} else if (dtype_ == kNumberTypeFloat32) {
DropoutBackwardKernel<float>(inputs, workspace, outputs, keep_prob_);
} else {
MS_LOG(ERROR) << "Input data type: " << dtype_ << " is not supported for DropoutGrad kernel for CPU.";
MS_LOG(EXCEPTION) << kernel_name_ << " only supports float16 and float32 on CPU, but got "
<< TypeIdToType(dtype_)->ToString();
}

return true;


+ 5
- 4
mindspore/ccsrc/backend/kernel_compiler/cpu/dropout_grad_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
#include <memory>
#include <string>
#include <unordered_map>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

@@ -36,12 +37,12 @@ class DropoutGradCpuBwdKernel : public CPUKernel {
const std::vector<AddressPtr> &outputs) override;

private:
float keep_prob_{1.0};
size_t num_count_{1};
TypeId dtype_{kTypeUnknown};
template <typename T>
void DropoutBackwardKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs, float keep_prob);
float keep_prob_{1.0};
size_t num_count_{1};
TypeId dtype_{kTypeUnknown};
};

MS_REG_CPU_KERNEL(DropoutGrad, KernelAttr(), DropoutGradCpuBwdKernel);


+ 31
- 17
mindspore/ccsrc/backend/kernel_compiler/cpu/dynamic_assign_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,13 +13,23 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <algorithm>
#include "backend/kernel_compiler/cpu/dynamic_assign_cpu_kernel.h"

#include <algorithm>

#include "runtime/device/cpu/cpu_device_address.h"

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kDynamicAssignInputsNum = 2;
constexpr size_t kDynamicAssignOutputsNum = 1;
} // namespace

void DynamicAssignCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
node_wpt_ = kernel_node;
input_x_dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
input_x_dtype_size_ = GetTypeByte(TypeIdToType(input_x_dtype_));
@@ -28,6 +38,8 @@ void DynamicAssignCPUKernel::InitKernel(const CNodePtr &kernel_node) {
bool DynamicAssignCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kDynamicAssignInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kDynamicAssignOutputsNum, kernel_name_);
if (input_x_dtype_ == kNumberTypeInt32) {
LaunchKernel<int>(inputs, outputs);
} else if (input_x_dtype_ == kNumberTypeInt64) {
@@ -37,8 +49,8 @@ bool DynamicAssignCPUKernel::Launch(const std::vector<kernel::AddressPtr> &input
} else if (input_x_dtype_ == kNumberTypeFloat64) {
LaunchKernel<double>(inputs, outputs);
} else {
MS_LOG(ERROR) << "Dtype of indices only support float32, float64, int32, int64";
return false;
MS_LOG(EXCEPTION) << kernel_name_ << " supports (int32, int64, float32, float64) on CPU, but got "
<< TypeIdToType(input_x_dtype_)->ToString();
}
return true;
}
@@ -46,25 +58,27 @@ bool DynamicAssignCPUKernel::Launch(const std::vector<kernel::AddressPtr> &input
template <typename T>
void DynamicAssignCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &outputs) {
auto node_ = node_wpt_.lock();
if (!node_) {
MS_LOG(EXCEPTION) << "node_wpt_ is expired.";
auto node = node_wpt_.lock();
if (!node) {
MS_LOG(EXCEPTION) << kernel_name_ << " node_wpt_ is expired.";
}
auto input_x_shape = AnfAlgo::GetPrevNodeOutputInferShape(node_, 0);
auto input_y_shape = AnfAlgo::GetPrevNodeOutputInferShape(node_, 1);
auto input_x_shape = AnfAlgo::GetPrevNodeOutputInferShape(node, 0);
auto input_y_shape = AnfAlgo::GetPrevNodeOutputInferShape(node, 1);
batch_size_ = 1;
for (size_t i = 0; i < input_x_shape.size(); ++i) {
batch_size_ *= input_x_shape[i];
}

if (input_x_shape.size() != input_y_shape.size()) MS_LOG(EXCEPTION) << "X and y must be same shape!";
if (input_x_shape.size() != input_y_shape.size()) {
MS_LOG(EXCEPTION) << "x and y must be same shape!";
}
for (size_t i = 0; i < input_x_shape.size(); ++i) {
if (input_x_shape[i] != input_y_shape[i]) {
MS_LOG(EXCEPTION) << "X and y must be same shape!";
MS_LOG(EXCEPTION) << "x and y must be same shape!";
}
}
T *input_x = reinterpret_cast<T *>(inputs[0]->addr);
T *input_y = reinterpret_cast<T *>(inputs[1]->addr);
auto *input_x = reinterpret_cast<T *>(inputs[0]->addr);
auto *input_y = reinterpret_cast<T *>(inputs[1]->addr);
auto max_size = inputs[0]->size;
size_t total_size = input_x_dtype_size_ * batch_size_;
if (total_size > max_size) {
@@ -76,10 +90,10 @@ void DynamicAssignCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
MS_LOG(EXCEPTION) << "Memcpy_s error, errno: " << ret;
}

auto node_with_idx = AnfAlgo::GetPrevNodeOutput(node_, 0);
auto node = node_with_idx.first;
if (node->isa<Parameter>()) {
auto node_ptr = node->cast<ParameterPtr>();
auto node_with_idx = AnfAlgo::GetPrevNodeOutput(node, 0);
auto out_node = node_with_idx.first;
if (out_node->isa<Parameter>()) {
auto node_ptr = out_node->cast<ParameterPtr>();
auto value = node_ptr->default_param();
auto tensor = value->cast<std::shared_ptr<tensor::Tensor>>();
ShapeVector shape_tmp;


+ 5
- 3
mindspore/ccsrc/backend/kernel_compiler/cpu/dynamic_assign_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,12 +13,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DYNAMIC_ASSIGN_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DYNAMIC_ASSIGN_CPU_KERNEL_H_

#include <vector>
#include <memory>
#include <unordered_map>

#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

@@ -34,13 +36,13 @@ class DynamicAssignCPUKernel : public CPUKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;

private:
template <typename T>
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs);

private:
size_t batch_size_{1};
TypeId input_x_dtype_{kTypeUnknown};
size_t input_x_dtype_size_ = 4;
size_t input_x_dtype_size_{4};
CNodeWeakPtr node_wpt_;
};



+ 19
- 21
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/eltwise_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -24,59 +24,57 @@
namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kInputsNum = 1;
constexpr size_t kOutputsNum = 1;

struct DescParam {
dnnl::algorithm algorithm;
float alpha = 0.f;
float beta = 0.f;
dnnl::algorithm algorithm{dnnl::algorithm::undef};
float alpha{0.0f};
float beta{0.0f};
};
} // namespace

dnnl::eltwise_forward::desc EltWiseCPUKernel::GetForwardEltwiseDesc(const CNodePtr &kernel_node,
const dnnl::memory::desc src_desc) {
dnnl::eltwise_forward::desc EltWiseCPUKernel::GetForwardEltwiseDesc(const dnnl::memory::desc src_desc) {
static const std::unordered_map<std::string, DescParam> eltWiseOpDescMap{
{prim::kPrimRelu->name(), DescParam{dnnl::algorithm::eltwise_relu}},
{prim::kPrimRelu6->name(), DescParam{dnnl::algorithm::eltwise_clip, 0.f, 6.f}},
{prim::kPrimRelu6->name(), DescParam{dnnl::algorithm::eltwise_clip, 0.0f, 6.0f}},
{prim::kPrimAbs->name(), DescParam{dnnl::algorithm::eltwise_abs}},
{prim::kPrimExp->name(), DescParam{dnnl::algorithm::eltwise_exp}},
{prim::kPrimLog->name(), DescParam{dnnl::algorithm::eltwise_log}},
{prim::kPrimSigmoid->name(), DescParam{dnnl::algorithm::eltwise_logistic}},
{prim::kPrimSqrt->name(), DescParam{dnnl::algorithm::eltwise_sqrt}},
{prim::kPrimSquare->name(), DescParam{dnnl::algorithm::eltwise_square}},
{prim::kPrimTanh->name(), DescParam{dnnl::algorithm::eltwise_tanh}},
{prim::kPrimElu->name(), DescParam{dnnl::algorithm::eltwise_elu, 1.f, 0.f}},
{prim::kPrimElu->name(), DescParam{dnnl::algorithm::eltwise_elu, 1.0f, 0.0f}},
{prim::kPrimSoftplus->name(), DescParam{dnnl::algorithm::eltwise_soft_relu}},
};

std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
const auto desc_pair = eltWiseOpDescMap.find(kernel_name);
const auto desc_pair = eltWiseOpDescMap.find(kernel_name_);
if (desc_pair == eltWiseOpDescMap.end()) {
MS_LOG(EXCEPTION) << "EltWiseCPUKernel does not support " << kernel_name;
MS_LOG(EXCEPTION) << "EltWiseCPUKernel does not support " << kernel_name_;
}
return dnnl::eltwise_forward::desc(DnnlForward, desc_pair->second.algorithm, src_desc, desc_pair->second.alpha,
return dnnl::eltwise_forward::desc(dnnl_forward_, desc_pair->second.algorithm, src_desc, desc_pair->second.alpha,
desc_pair->second.beta);
}

void EltWiseCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
if (src_shape.size() == 0) {
src_shape.insert(src_shape.begin(), 1);
if (src_shape.empty()) {
(void)src_shape.insert(src_shape.begin(), 1);
}
dnnl::memory::desc src_desc = GetDefaultMemDesc(src_shape);

auto desc = GetForwardEltwiseDesc(kernel_node, src_desc);
auto desc = GetForwardEltwiseDesc(src_desc);
auto prim_desc = dnnl::eltwise_forward::primitive_desc(desc, MKLKernelEngine::Get().engine());
primitive_ = std::make_shared<dnnl::eltwise_forward>(prim_desc);

AddArgument(DNNL_ARG_SRC, src_desc);
AddArgument(DNNL_ARG_DST, src_desc);
}

bool EltWiseCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
if (inputs.empty() || outputs.empty()) {
MS_LOG(EXCEPTION) << "Error input output size!";
}
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kOutputsNum, kernel_name_);
SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr);
SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr);
ExecutePrimitive();


+ 5
- 2
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/eltwise_cpu_kernel.h View File

@@ -13,8 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ELTWISE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ELTWISE_CPU_KERNEL_H_

#include <memory>
#include <vector>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"
@@ -32,8 +34,9 @@ class EltWiseCPUKernel : public MKLCPUKernel {
const std::vector<AddressPtr> &outputs) override;

private:
dnnl::eltwise_forward::desc GetForwardEltwiseDesc(const CNodePtr &kernel_node, const dnnl::memory::desc src_desc);
dnnl::prop_kind DnnlForward = dnnl::prop_kind::forward_training;
dnnl::eltwise_forward::desc GetForwardEltwiseDesc(const dnnl::memory::desc src_desc);

dnnl::prop_kind dnnl_forward_{dnnl::prop_kind::forward_training};
};

MS_REG_CPU_KERNEL(Elu, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),


+ 10
- 3
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/log_softmax_cpu_kernel.cc View File

@@ -13,15 +13,23 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/mkldnn/log_softmax_cpu_kernel.h"
#include <algorithm>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "utils/ms_utils.h"

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kLogSoftmaxInputsNum = 1;
constexpr size_t kLogSoftmaxOutputsNum = 1;
} // namespace

void LogSoftmaxCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
int axis = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS);
if (axis >= SizeToInt(src_shape.size())) {
@@ -41,9 +49,8 @@ void LogSoftmaxCPUKernel::InitKernel(const CNodePtr &kernel_node) {

bool LogSoftmaxCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
if (inputs.empty() || outputs.empty()) {
MS_LOG(EXCEPTION) << "Log softmax error input output size!";
}
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kLogSoftmaxInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kLogSoftmaxOutputsNum, kernel_name_);
SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr);
SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr);
ExecutePrimitive();


+ 1
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/log_softmax_cpu_kernel.h View File

@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LOG_SOFTMAX_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LOG_SOFTMAX_CPU_KERNEL_H_



+ 10
- 3
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/log_softmax_grad_cpu_kernel.cc View File

@@ -13,15 +13,23 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/mkldnn/log_softmax_grad_cpu_kernel.h"
#include <algorithm>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "utils/ms_utils.h"

namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kLogSoftmaxGradInputsNum = 2;
constexpr size_t kLogSoftmaxGradOutputsNum = 1;
} // namespace

void LogSoftmaxGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
int axis = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS);
if (axis >= SizeToInt(src_shape.size())) {
@@ -47,9 +55,8 @@ void LogSoftmaxGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
bool LogSoftmaxGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
if (inputs.size() < 2 || outputs.empty()) {
MS_LOG(EXCEPTION) << "LogSoftmaxGrad error input output size!";
}
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kLogSoftmaxGradInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kLogSoftmaxGradOutputsNum, kernel_name_);
SetArgumentHandle(DNNL_ARG_DST, inputs[0]->addr);
SetArgumentHandle(DNNL_ARG_DIFF_DST, inputs[1]->addr);
SetArgumentHandle(DNNL_ARG_DIFF_SRC, outputs[0]->addr);


+ 1
- 0
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/log_softmax_grad_cpu_kernel.h View File

@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LOG_SOFTMAX_GRAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LOG_SOFTMAX_GRAD_CPU_KERNEL_H_



+ 21
- 12
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/lstm_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/mkldnn/lstm_cpu_kernel.h"
#include <string>
#include "utils/ms_utils.h"
@@ -21,9 +22,18 @@

namespace mindspore {
namespace kernel {
const int kMaxLSTMLayer = 100;
const int kOutputWorkSpaceIndex = 3;
const int kGateNum = 4;
namespace {
constexpr size_t kLstmInputsNum = 4;
constexpr size_t kLstmOutputsNum = 5;
constexpr int kMaxLSTMLayer = 100;
constexpr int kOutputWorkSpaceIndex = 3;
constexpr int kGateNum = 4;

using tag = dnnl::memory::format_tag;
using dim = dnnl::memory::dims;
using dt = dnnl::memory::data_type;
} // namespace

void LstmCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
CPUKernel::InitInputOutputSize(kernel_node);
output_size_list_[kOutputWorkSpaceIndex] = reserve_size_;
@@ -46,8 +56,7 @@ void LstmCPUKernel::InitKernel(const CNodePtr &kernel_node) {
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#endif
MS_EXCEPTION_IF_NULL(kernel_node);
using tag = dnnl::memory::format_tag;
using dim = dnnl::memory::dims;
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
CheckParam(kernel_node);
auto eng = MKLKernelEngine::Get().engine();
dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional;
@@ -70,10 +79,10 @@ void LstmCPUKernel::InitKernel(const CNodePtr &kernel_node) {
dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc);
dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc);
dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc);
if (!kernel_node->HasAttr(kAttrIsTraining)) {
is_training = true;
} else {
if (kernel_node->HasAttr(kAttrIsTraining)) {
is_training = GetValue<bool>(kernel_node->GetAttr(kAttrIsTraining));
} else {
is_training = true;
}
auto prop_kind = dnnl::prop_kind::forward_training;
if (!is_training) {
@@ -106,9 +115,9 @@ void LstmCPUKernel::CheckParam(const CNodePtr &kernel_node) {
std::vector<size_t> src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
std::vector<size_t> src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2);
bidirectional_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "bidirectional");
input_size_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "input_size"));
hidden_size_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "hidden_size"));
num_layers_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "num_layers"));
input_size_ = LongToInt(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "input_size"));
hidden_size_ = LongToInt(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "hidden_size"));
num_layers_ = LongToInt(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "num_layers"));
has_bias_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "has_bias");
batch_size_ = SizeToInt(src_shape[1]);
seq_len_ = SizeToInt(src_shape[0]);


+ 18
- 13
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/lstm_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -16,15 +16,18 @@

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_

#if defined(__x86_64__) || defined(__amd64__) || defined(_M_IX86) || defined(_M_X64)
#define PLATFORM_86
#endif
#ifdef PLATFORM_86
#include <pmmintrin.h>
#endif

#include <vector>
#include <memory>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"

namespace mindspore {
namespace kernel {
class LstmCPUKernel : public MKLCPUKernel {
@@ -41,18 +44,20 @@ class LstmCPUKernel : public MKLCPUKernel {

private:
void CheckParam(const CNodePtr &kernel_node);
int weight_size_ = 0;
int weight_h_size_ = 0;
int input_size_;
int hidden_size_;
int num_layers_;
int batch_size_;
int seq_len_;
int num_directions_;
bool bidirectional_;
bool has_bias_;
size_t reserve_size_;
bool is_training;

int weight_size_{0};
int weight_h_size_{0};
int input_size_{0};
int hidden_size_{0};
int num_layers_{0};
int batch_size_{0};
int seq_len_{0};
int num_directions_{0};
bool bidirectional_{false};
bool has_bias_{false};
bool is_training{false};
size_t reserve_size_{0};

dnnl::memory::dims weights_dims_;
dnnl::memory::dims weights_h_dims_;
dnnl::memory::dims bias_dims_;


+ 16
- 7
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/lstm_grad_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/mkldnn/lstm_grad_cpu_kernel.h"
#include <cstring>
#include <string>
@@ -22,8 +23,17 @@

namespace mindspore {
namespace kernel {
const int kMaxLSTMLayer = 100;
const int kInputWorkSpaceIndex = 10;
namespace {
constexpr size_t kLstmGradInputsNum = 11;
constexpr size_t kLstmGradOutputsNum = 4;
constexpr int kMaxLSTMLayer = 100;
constexpr int kInputWorkSpaceIndex = 10;

using tag = dnnl::memory::format_tag;
using dim = dnnl::memory::dims;
using dt = dnnl::memory::data_type;
} // namespace

void LSTMGradCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
CPUKernel::InitInputOutputSize(kernel_node);
input_size_list_[kInputWorkSpaceIndex] = reserve_size_;
@@ -31,8 +41,7 @@ void LSTMGradCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {

void LSTMGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
using tag = dnnl::memory::format_tag;
using dim = dnnl::memory::dims;
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
CheckParam(kernel_node);
auto eng = MKLKernelEngine::Get().engine();
dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional;
@@ -167,8 +176,8 @@ void LSTMGradCPUKernel::ResetMemory(const dnnl::memory &mem, const string name)

bool LSTMGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
using dt = dnnl::memory::data_type;
using tag = dnnl::memory::format_tag;
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kLstmGradInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kLstmGradOutputsNum, kernel_name_);
auto eng = MKLKernelEngine::Get().engine();
// construct fw memory
auto user_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng);


+ 15
- 12
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/lstm_grad_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_

@@ -47,17 +48,19 @@ class LSTMGradCPUKernel : public MKLCPUKernel {
const dnnl::memory &diff_bias_memory);
void ResetMemory(const dnnl::memory &mem, const string name) const;
void CheckParam(const CNodePtr &kernel_node);
int64_t weight_size_ = 0;
int64_t weight_h_size_ = 0;
int64_t input_size_;
int64_t hidden_size_;
int64_t num_layers_;
int64_t batch_size_;
int64_t seq_len_;
int num_directions_;
bool bidirectional_;
bool has_bias_;
size_t reserve_size_;

int num_directions_{0};
bool bidirectional_{false};
bool has_bias_{false};
int64_t weight_size_{0};
int64_t weight_h_size_{0};
int64_t input_size_{0};
int64_t hidden_size_{0};
int64_t num_layers_{0};
int64_t batch_size_{0};
int64_t seq_len_{0};
size_t reserve_size_{0};

dnnl::memory::dims weights_dims_;
dnnl::memory::dims weights_h_dims_;
dnnl::memory::dims bias_dims_;


+ 17
- 13
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/matmul_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,10 +13,9 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/mkldnn/matmul_cpu_kernel.h"

#include "backend/kernel_compiler/cpu/mkldnn/matmul_cpu_kernel.h"
#include <utility>

#include "common/thread_pool.h"
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
#include "backend/kernel_compiler/cpu/nnacl/op_base.h"
@@ -26,8 +25,10 @@
namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kMatMulInputsNum = 2;
constexpr size_t kMatMulOutputsNum = 1;
const size_t kIndexOffset = 2;
}
} // namespace

void MatMulCPUKernel::InitTile() {
#ifdef ENABLE_AVX
@@ -47,13 +48,16 @@ void MatMulCPUKernel::InitTile() {

void MatMulCPUKernel::InitMatrixA(const float *src_ptr) {
const size_t size = param_.batch * param_.row_align_ * param_.deep_;
a_pack_ptr_ = new float[size];
a_pack_ptr_ = new (std::nothrow) float[size];
if (a_pack_ptr_ == nullptr) {
MS_LOG(EXCEPTION) << "MatMul new a_pack_ptr_ failed.";
}

if (vec_matmul_) {
const size_t count = size * sizeof(float);
if (memcpy_s(a_pack_ptr_, count, src_ptr, count) != EOK) {
FreeBuffer();
MS_LOG(EXCEPTION) << "Memcpy a_pack_ptr_ failed.";
MS_LOG(EXCEPTION) << "MatMul memcpy a_pack_ptr_ failed.";
}
return;
}
@@ -88,14 +92,14 @@ void MatMulCPUKernel::InitMatrixB(const float *src_ptr) {
b_pack_ptr_ = new (std::nothrow) float[size];
if (b_pack_ptr_ == nullptr) {
FreeBuffer();
MS_LOG(EXCEPTION) << "Malloc b_pack_ptr_ failed";
MS_LOG(EXCEPTION) << "MatMul new b_pack_ptr_ failed";
}
if (vec_matmul_) {
if (param_.b_transpose_) {
const size_t count = size * sizeof(float);
if (memcpy_s(b_pack_ptr_, count, src_ptr, count) != EOK) {
FreeBuffer();
MS_LOG(EXCEPTION) << "Memcpy b_pack_ptr_ failed.";
MS_LOG(EXCEPTION) << "MatMul memcpy b_pack_ptr_ failed.";
}
} else {
for (int i = 0; i < param_.batch; i++) {
@@ -169,6 +173,7 @@ void MatMulCPUKernel::InitX64Kernel(bool trans_a, bool trans_b, const std::vecto

void MatMulCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
std::vector<size_t> a_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
std::vector<size_t> b_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
std::vector<size_t> o_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
@@ -190,7 +195,7 @@ void MatMulCPUKernel::InitKernel(const CNodePtr &kernel_node) {
#endif
}

int MatMulCPUKernel::FloatRun(size_t task_id) {
int MatMulCPUKernel::FloatRun(size_t task_id) const {
size_t current_stride_oc = thread_stride_ * col_tile_;
if (IntToSize(param_.col_) <= task_id * current_stride_oc) {
return common::SUCCESS;
@@ -238,7 +243,7 @@ void MatMulCPUKernel::LaunchARM(const float *input_a, const float *input_b, floa
FreeBuffer();
}

void MatMulCPUKernel::LaunchX64(const float *input_a, const float *input_b, float *output) {
void MatMulCPUKernel::LaunchX64(const float *input_a, const float *input_b, float *output) const {
dnnl_dim_t lda = (trans_a_ == TRANSPOSE_YES ? dim_m_ : dim_k_);
dnnl_dim_t ldb = (trans_b_ == TRANSPOSE_YES ? dim_k_ : dim_n_);
dnnl_dim_t ldc = dim_n_;
@@ -252,9 +257,8 @@ void MatMulCPUKernel::LaunchX64(const float *input_a, const float *input_b, floa

bool MatMulCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
if (inputs.size() < 2 || outputs.empty()) {
MS_LOG(EXCEPTION) << "matmul error input output size!";
}
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kMatMulInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kMatMulOutputsNum, kernel_name_);
const auto input_a = reinterpret_cast<float *>(inputs[0]->addr);
const auto input_b = reinterpret_cast<float *>(inputs[1]->addr);
auto output = reinterpret_cast<float *>(outputs[0]->addr);


+ 6
- 5
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/matmul_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MATMUL_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MATMUL_CPU_KERNEL_H_

@@ -42,14 +43,12 @@ class MatMulCPUKernel : public MKLCPUKernel {
const std::vector<size_t> &o_shape);
void InitX64Kernel(bool trans_a, bool trans_b, const std::vector<size_t> &a_shape, const std::vector<size_t> &b_shape,
const std::vector<size_t> &o_shape);
void LaunchX64(const float *input_a, const float *input_b, float *output);
void LaunchX64(const float *input_a, const float *input_b, float *output) const;
void LaunchARM(const float *input_a, const float *input_b, float *output);
void ParallelRun(float *output);
int FloatRun(size_t task_id);
int FloatRun(size_t task_id) const;
void FreeBuffer();

char trans_a_{TRANSPOSE_NO};
char trans_b_{TRANSPOSE_NO};
dnnl_dim_t dim_m_{0};
dnnl_dim_t dim_n_{0};
dnnl_dim_t dim_k_{0};
@@ -62,6 +61,8 @@ class MatMulCPUKernel : public MKLCPUKernel {
size_t size_mat_a_{0};
size_t size_mat_b_{0};
size_t size_mat_o_{0};
char trans_a_{TRANSPOSE_NO};
char trans_b_{TRANSPOSE_NO};
bool vec_matmul_{false};
float *a_pack_ptr_{nullptr};
float *b_pack_ptr_{nullptr};


+ 12
- 9
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,9 +13,11 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"
#include <vector>
#include <string>
#include <algorithm>
#include "utils/ms_utils.h"
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"

@@ -24,8 +26,10 @@ namespace kernel {
void MKLCPUKernel::GetPadding(const CNodePtr &kernel_node, const std::string &pad_mode,
const std::vector<size_t> &src_shape, const std::vector<size_t> &kernel_size,
const std::vector<int> &stride, std::vector<int> *padding_l, std::vector<int> *padding_r,
const std::vector<int> &dilation) {
const std::vector<int> &dilation) const {
MS_EXCEPTION_IF_NULL(kernel_node);
MS_EXCEPTION_IF_NULL(padding_l);
MS_EXCEPTION_IF_NULL(padding_r);
auto dim = src_shape.size();
if (dim < 2) {
MS_LOG(EXCEPTION) << "Set pad only support src dim >= 2!";
@@ -65,7 +69,7 @@ void MKLCPUKernel::GetPadding(const CNodePtr &kernel_node, const std::string &pa
}

bool MKLCPUKernel::BinaryBroadCast(std::vector<size_t> *src0_shape, std::vector<size_t> *src1_shape,
std::vector<size_t> *dst_shape) {
std::vector<size_t> *dst_shape) const {
MS_EXCEPTION_IF_NULL(src0_shape);
MS_EXCEPTION_IF_NULL(src1_shape);
MS_EXCEPTION_IF_NULL(dst_shape);
@@ -115,20 +119,19 @@ dnnl::memory::format_tag MKLCPUKernel::GetDefaultFormatTag(const dnnl::memory::d
dnnl::memory::format_tag::a, dnnl::memory::format_tag::ab, dnnl::memory::format_tag::abc,
dnnl::memory::format_tag::abcd, dnnl::memory::format_tag::abcde, dnnl::memory::format_tag::abcdef,
dnnl::memory::format_tag::abcdefg};

auto rank = dims.size();
size_t rank = dims.size();
if (rank > tag_vec.size()) {
MS_LOG(EXCEPTION) << "The kernel does not support construct " << rank << "-D tensor dnnl memory format_tag.";
}
return tag_vec[rank - 1];
}

dnnl::memory::desc MKLCPUKernel::GetDefaultMemDesc(const std::vector<size_t> &shape) {
dnnl::memory::desc MKLCPUKernel::GetDefaultMemDesc(const std::vector<size_t> &shape) const {
dnnl::memory::dims dims;
if (shape.size() == 0) {
dims.insert(dims.end(), 1);
if (shape.empty()) {
(void)dims.insert(dims.end(), 1);
} else {
dims.insert(dims.end(), shape.begin(), shape.end());
(void)dims.insert(dims.end(), shape.begin(), shape.end());
}
dnnl::memory::format_tag mem_tag = GetDefaultFormatTag(dims);
dnnl::memory::desc mem_desc(dims, dnnl::memory::data_type::f32, mem_tag);


+ 8
- 6
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MKL_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MKL_CPU_KERNEL_H_

@@ -33,21 +34,22 @@ class MKLCPUKernel : public CPUKernel {

protected:
bool BinaryBroadCast(std::vector<size_t> *src0_shape, std::vector<size_t> *src1_shape,
std::vector<size_t> *dst_shape);
std::vector<size_t> *dst_shape) const;
void GetPadding(const CNodePtr &kernel_node, const std::string &pad_mode, const std::vector<size_t> &src_shape,
const std::vector<size_t> &kernel_size, const std::vector<int> &stride, std::vector<int> *padding_l,
std::vector<int> *padding_r, const std::vector<int> &dilation);
std::vector<int> *padding_r, const std::vector<int> &dilation) const;
void AddArgument(int arg_key, const dnnl::memory::desc &mem_desc, bool alloc = false);
void SetArgumentHandle(int arg_key, void *ptr);
dnnl::memory::format_tag GetDefaultFormatTag(const dnnl::memory::dims &dims) const;
dnnl::memory::desc GetDefaultMemDesc(const std::vector<size_t> &shape);
dnnl::memory::desc GetDefaultMemDesc(const std::vector<size_t> &shape) const;
void ExecutePrimitive();
std::unordered_map<int, dnnl::memory> arguments_;
std::shared_ptr<dnnl::primitive> primitive_{nullptr};
inline dnnl::memory::desc formatted_md(const dnnl::memory::dims &dimensions, dnnl::memory::format_tag layout) {
return dnnl::memory::desc{{dimensions}, dnnl::memory::data_type::f32, layout};
}
void Reorder(dnnl::memory *src_mem, dnnl::memory *dst_mem);

std::unordered_map<int, dnnl::memory> arguments_;
std::shared_ptr<dnnl::primitive> primitive_{nullptr};
};
} // namespace kernel
} // namespace mindspore


+ 3
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
#include "utils/log_adapter.h"
#include "dnnl.hpp"
@@ -33,6 +34,7 @@ dnnl::memory MKLKernelEngine::CreateMemory(const dnnl::memory::desc &mem_desc, b
return dnnl::memory(mem_desc, engine_, nullptr);
}
}

void MKLKernelEngine::Reorder(dnnl::memory *src_mem, dnnl::memory *dst_mem) {
dnnl::reorder(*src_mem, *dst_mem).execute(stream_, *src_mem, *dst_mem);
}


+ 7
- 4
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,8 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_MKL_KERNEL_ENGINE_H_
#define MINDSPORE_MKL_KERNEL_ENGINE_H_

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MKL_KERNEL_ENGINE_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MKL_KERNEL_ENGINE_H_

#include <cstdlib>
#include <algorithm>
#include <iostream>
@@ -46,10 +48,11 @@ class MKLKernelEngine {
private:
MKLKernelEngine() : engine_(dnnl::engine::kind::cpu, 0), stream_(engine_) {}
~MKLKernelEngine() = default;

dnnl::engine engine_;
dnnl::stream stream_;
};
} // namespace kernel
} // namespace mindspore

#endif // MINDSPORE_MKL_KERNEL_ENGINE_H_
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MKL_KERNEL_ENGINE_H_

+ 0
- 65
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/mul_cpu_kernel.cc View File

@@ -1,65 +0,0 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/mkldnn/mul_cpu_kernel.h"
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "utils/ms_utils.h"

namespace mindspore {
namespace kernel {
void MulCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
std::vector<size_t> src0_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
std::vector<size_t> src1_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
std::vector<size_t> dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
need_swap_ = BinaryBroadCast(&src0_shape, &src1_shape, &dst_shape);
dnnl::memory::desc src0_desc;
dnnl::memory::desc src1_desc;
if (need_swap_) {
src0_desc = GetDefaultMemDesc(src1_shape);
src1_desc = GetDefaultMemDesc(src0_shape);
} else {
src0_desc = GetDefaultMemDesc(src0_shape);
src1_desc = GetDefaultMemDesc(src1_shape);
}
dnnl::memory::desc dst_desc = GetDefaultMemDesc(dst_shape);
dnnl::binary::desc desc = dnnl::binary::desc(dnnl::algorithm::binary_mul, src0_desc, src1_desc, dst_desc);
auto prim_desc = dnnl::binary::primitive_desc(desc, MKLKernelEngine::Get().engine());
primitive_ = std::make_shared<dnnl::binary>(prim_desc);

AddArgument(DNNL_ARG_SRC_0, src0_desc);
AddArgument(DNNL_ARG_SRC_1, src1_desc);
AddArgument(DNNL_ARG_DST, dst_desc);
}

bool MulCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
if (inputs.size() < 2 || outputs.empty()) {
MS_LOG(EXCEPTION) << "mul error input output size!";
}
if (need_swap_) {
SetArgumentHandle(DNNL_ARG_SRC_0, inputs[1]->addr);
SetArgumentHandle(DNNL_ARG_SRC_1, inputs[0]->addr);
} else {
SetArgumentHandle(DNNL_ARG_SRC_0, inputs[0]->addr);
SetArgumentHandle(DNNL_ARG_SRC_1, inputs[1]->addr);
}
SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr);
ExecutePrimitive();
return true;
}
} // namespace kernel
} // namespace mindspore

+ 0
- 42
mindspore/ccsrc/backend/kernel_compiler/cpu/mkldnn/mul_cpu_kernel.h View File

@@ -1,42 +0,0 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MUL_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MUL_CPU_KERNEL_H_

#include <vector>
#include <memory>
#include "backend/kernel_compiler/cpu/arithmetic_cpu_kernel.h"
#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"

namespace mindspore {
namespace kernel {
class MulCPUKernel : public MKLCPUKernel {
public:
MulCPUKernel() = default;
~MulCPUKernel() override = default;

void InitKernel(const CNodePtr &kernel_node) override;

bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;

private:
bool need_swap_{false};
};
} // namespace kernel
} // namespace mindspore

#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MUL_CPU_KERNEL_H_

+ 5
- 5
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/broadcast_to.c View File

@@ -39,7 +39,7 @@ void pad_input_shape(int *input_shape, int input_shape_len, int output_shape_len
}
}

#define BROADCAST_TO(type) \
#define BROADCAST_TO_IMPL(type) \
int broadcast_to_##type(const type *input, BroadcastShapeInfo *shape_info, type *output) { \
if (input == NULL || output == NULL) { \
return NNACL_NULL_PTR; \
@@ -96,9 +96,9 @@ void pad_input_shape(int *input_shape, int input_shape_len, int output_shape_len
return NNACL_OK; \
}

BROADCAST_TO(int)
BROADCAST_TO(float)
BROADCAST_TO(bool)
BROADCAST_TO_IMPL(int)
BROADCAST_TO_IMPL(float)
BROADCAST_TO_IMPL(bool)
#ifdef ENABLE_FP16
BROADCAST_TO(float16_t)
BROADCAST_TO_IMPL(float16_t)
#endif

+ 1
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/broadcast_to.h View File

@@ -21,7 +21,7 @@
#ifdef __cplusplus
extern "C" {
#endif
#define BroadcastTo(type, input, shape_info, output) broadcast_to_##type(input, shape_info, output)
#define BROADCAST_TO(type, input, shape_info, output) broadcast_to_##type(input, shape_info, output)
int broadcast_to_int(const int *input, BroadcastShapeInfo *shape_info, int *output);
int broadcast_to_float(const float *input, BroadcastShapeInfo *shape_info, float *output);
int broadcast_to_bool(const bool *input, BroadcastShapeInfo *shape_info, bool *output);


+ 6
- 6
mindspore/lite/src/runtime/kernel/arm/fp32/broadcast_to_fp32.cc View File

@@ -59,17 +59,17 @@ int BroadcastToCPUKernel::Run() {

switch (data_type_) {
case kNumberTypeFloat32:
return BroadcastTo(float, reinterpret_cast<const float *>(input_data), &shape_info_,
reinterpret_cast<float *>(output_data));
return BROADCAST_TO(float, reinterpret_cast<const float *>(input_data), &shape_info_,
reinterpret_cast<float *>(output_data));
#ifdef ENABLE_FP16
case kNumberTypeFloat16:
return BroadcastTo(float16_t, reinterpret_cast<const float16_t *>(input_data), &shape_info_,
reinterpret_cast<float16_t *>(output_data));
return BROADCAST_TO(float16_t, reinterpret_cast<const float16_t *>(input_data), &shape_info_,
reinterpret_cast<float16_t *>(output_data));
#endif
case kNumberTypeInt32:
case kNumberTypeInt:
return BroadcastTo(int, reinterpret_cast<const int *>(input_data), &shape_info_,
reinterpret_cast<int *>(output_data));
return BROADCAST_TO(int, reinterpret_cast<const int *>(input_data), &shape_info_,
reinterpret_cast<int *>(output_data));
default:
MS_LOG(ERROR) << "UnSupported data type: " << data_type_;
return RET_ERROR;


Loading…
Cancel
Save