Merge pull request !23733 from zhangbuxue/code_check_fixtags/v1.6.0
| @@ -13,26 +13,32 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "backend/kernel_compiler/cpu/adam_cpu_kernel.h" | |||
| #include "nnacl/errorcode.h" | |||
| #include "nnacl/fp32/adam_fp32.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/errorcode.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/fp32/adam_fp32.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "utils/ms_utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kAdamInputsNum = 10; | |||
| constexpr size_t kAdamOutputsNum = 3; | |||
| constexpr size_t kScalarIndex = 0; | |||
| } // namespace | |||
| template <typename T> | |||
| void AdamCPUKernel::LaunchAdam(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &) { | |||
| T *var = reinterpret_cast<T *>(inputs[VAR]->addr); | |||
| T *m = reinterpret_cast<T *>(inputs[M]->addr); | |||
| T *v = reinterpret_cast<T *>(inputs[V]->addr); | |||
| float beta1_power = reinterpret_cast<float *>(inputs[BETA1_POWER]->addr)[SCALAR_INDEX]; | |||
| float beta2_power = reinterpret_cast<float *>(inputs[BETA2_POWER]->addr)[SCALAR_INDEX]; | |||
| float lr = reinterpret_cast<float *>(inputs[LR]->addr)[SCALAR_INDEX]; | |||
| T beta1 = static_cast<T>(reinterpret_cast<float *>(inputs[BETA1]->addr)[SCALAR_INDEX]); | |||
| T beta2 = static_cast<T>(reinterpret_cast<float *>(inputs[BETA1]->addr)[SCALAR_INDEX]); | |||
| T epsilon = static_cast<T>(reinterpret_cast<float *>(inputs[EPSILON]->addr)[SCALAR_INDEX]); | |||
| float beta1_power = reinterpret_cast<float *>(inputs[BETA1_POWER]->addr)[kScalarIndex]; | |||
| float beta2_power = reinterpret_cast<float *>(inputs[BETA2_POWER]->addr)[kScalarIndex]; | |||
| float lr = reinterpret_cast<float *>(inputs[LR]->addr)[kScalarIndex]; | |||
| T beta1 = static_cast<T>(reinterpret_cast<float *>(inputs[BETA1]->addr)[kScalarIndex]); | |||
| T beta2 = static_cast<T>(reinterpret_cast<float *>(inputs[BETA1]->addr)[kScalarIndex]); | |||
| T epsilon = static_cast<T>(reinterpret_cast<float *>(inputs[EPSILON]->addr)[kScalarIndex]); | |||
| T *gradient = reinterpret_cast<T *>(inputs[GRAD]->addr); | |||
| constexpr float ONE = 1.0; | |||
| if (beta1_power - ONE == 0) { | |||
| @@ -62,12 +68,12 @@ void AdamCPUKernel::LaunchAdamNnacl(const std::vector<kernel::AddressPtr> &input | |||
| float *var = reinterpret_cast<float *>(inputs[VAR]->addr); | |||
| float *m = reinterpret_cast<float *>(inputs[M]->addr); | |||
| float *v = reinterpret_cast<float *>(inputs[V]->addr); | |||
| float beta1_power = reinterpret_cast<float *>(inputs[BETA1_POWER]->addr)[SCALAR_INDEX]; | |||
| float beta2_power = reinterpret_cast<float *>(inputs[BETA2_POWER]->addr)[SCALAR_INDEX]; | |||
| float lr = reinterpret_cast<float *>(inputs[LR]->addr)[SCALAR_INDEX]; | |||
| float beta1 = reinterpret_cast<float *>(inputs[BETA1]->addr)[SCALAR_INDEX]; | |||
| float beta2 = reinterpret_cast<float *>(inputs[BETA2]->addr)[SCALAR_INDEX]; | |||
| float epsilon = reinterpret_cast<float *>(inputs[EPSILON]->addr)[SCALAR_INDEX]; | |||
| float beta1_power = reinterpret_cast<float *>(inputs[BETA1_POWER]->addr)[kScalarIndex]; | |||
| float beta2_power = reinterpret_cast<float *>(inputs[BETA2_POWER]->addr)[kScalarIndex]; | |||
| float lr = reinterpret_cast<float *>(inputs[LR]->addr)[kScalarIndex]; | |||
| float beta1 = reinterpret_cast<float *>(inputs[BETA1]->addr)[kScalarIndex]; | |||
| float beta2 = reinterpret_cast<float *>(inputs[BETA2]->addr)[kScalarIndex]; | |||
| float epsilon = reinterpret_cast<float *>(inputs[EPSILON]->addr)[kScalarIndex]; | |||
| float *gradient = reinterpret_cast<float *>(inputs[GRAD]->addr); | |||
| constexpr float ONE = 1.0; | |||
| if (beta1_power - ONE == 0) { | |||
| @@ -88,26 +94,20 @@ void AdamCPUKernel::LaunchAdamNnacl(const std::vector<kernel::AddressPtr> &input | |||
| void AdamCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| if (input_num != INPUT_NUMS) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but Adam needs 10 inputs."; | |||
| } | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| CHECK_KERNEL_INPUTS_NUM(input_num, kAdamInputsNum, kernel_name_); | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != OUTPUT_NUMS) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but Adam needs 3 outputs."; | |||
| } | |||
| use_nesterov_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "use_nesterov"); | |||
| CHECK_KERNEL_OUTPUTS_NUM(output_num, kAdamOutputsNum, kernel_name_); | |||
| use_nesterov_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, USE_NESTEROV); | |||
| } | |||
| bool AdamCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() != INPUT_NUMS) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << inputs.size() << ", but Adam needs 10 inputs."; | |||
| } | |||
| if (outputs.size() != OUTPUT_NUMS) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << outputs.size() << ", but Adam needs 3 outputs."; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kAdamInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kAdamOutputsNum, kernel_name_); | |||
| if (inputs[VAR]->size != inputs[M]->size || inputs[VAR]->size != inputs[V]->size || | |||
| inputs[VAR]->size != inputs[GRAD]->size) { | |||
| MS_LOG(EXCEPTION) << "Error input data size!"; | |||
| @@ -124,7 +124,6 @@ bool AdamCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const | |||
| LaunchAdam<float16>(inputs, outputs); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Adam not support " << dtype_; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| @@ -13,33 +13,33 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADAM_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADAM_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr size_t SCALAR_INDEX = 0; | |||
| constexpr size_t INPUT_NUMS = 10; | |||
| constexpr size_t OUTPUT_NUMS = 3; | |||
| class AdamCPUKernel : public CPUKernel { | |||
| public: | |||
| AdamCPUKernel() = default; | |||
| ~AdamCPUKernel() override = default; | |||
| template <typename T> | |||
| void LaunchAdam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| void LaunchAdamNnacl(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| template <typename T> | |||
| void LaunchAdam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| void LaunchAdamNnacl(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| bool use_nesterov_{false}; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| enum input_list_ { VAR, M, V, BETA1_POWER, BETA2_POWER, LR, BETA1, BETA2, EPSILON, GRAD }; | |||
| @@ -13,20 +13,24 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <thread> | |||
| #include "backend/kernel_compiler/cpu/adam_delta_cpu_kernel.h" | |||
| #include <vector> | |||
| #include <string> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/common_utils.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/fp32/adam_fp32.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "backend/kernel_compiler/cpu/adam_delta_cpu_kernel.h" | |||
| #include "nnacl/errorcode.h" | |||
| #include "nnacl/fp32/adam_fp32.h" | |||
| #include "utils/ms_utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr size_t kAdamDeltaInputSize = 9; | |||
| namespace { | |||
| constexpr size_t kAdamDeltaInputsNum = 9; | |||
| constexpr size_t kAdamDeltaOutputsNum = 1; | |||
| } // namespace | |||
| template <typename T> | |||
| void AdamDeltaCPUKernel::LaunchAdamDelta(T *delta, T *m, T *v, float lr, float beta1, float beta2, float epsilon, | |||
| const T *gradient, size_t size) { | |||
| @@ -55,6 +59,7 @@ void AdamDeltaCPUKernel::LaunchAdamDelta(T *delta, T *m, T *v, float lr, float b | |||
| void AdamDeltaCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| std::vector<size_t> delta_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> m_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> v_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| @@ -86,14 +91,14 @@ void AdamDeltaCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| void AdamDeltaCPUKernel::CheckParams(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &outputs) const { | |||
| if (inputs.size() != kAdamDeltaInputSize) { | |||
| MS_LOG(EXCEPTION) << "Error input size!"; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kAdamDeltaInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kAdamDeltaOutputsNum, kernel_name_); | |||
| size_t elem_size = elem_num_ * 4; | |||
| std::vector<size_t> expect_sizes = {elem_size, elem_size, 4, 4, 4, 4, 4, 4, elem_size}; | |||
| std::vector<std::string> input_names = {"m", "v", "beta1_power", "beta2_power", "lr", | |||
| "beta1", "beta2", "epsilon", "grad"}; | |||
| for (size_t i = 0; i < kAdamDeltaInputSize; ++i) { | |||
| for (size_t i = 0; i < kAdamDeltaInputsNum; ++i) { | |||
| if (inputs[i]->size != expect_sizes[i]) { | |||
| MS_LOG(EXCEPTION) << "Error input " << input_names[i] << " size!"; | |||
| } | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,9 +13,12 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADAM_DELTA_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADAM_DELTA_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -29,8 +32,9 @@ class AdamDeltaCPUKernel : public CPUKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| protected: | |||
| private: | |||
| void CheckParams(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const; | |||
| template <typename T> | |||
| void LaunchAdamDelta(T *delta, T *m, T *v, float lr, float beta1, float beta2, float epsilon, const T *gradient, | |||
| size_t size); | |||
| @@ -13,12 +13,14 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/adam_weight_decay_cpu_kernel.h" | |||
| #include <cmath> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/errorcode.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/fp32/adam_fp32.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "nnacl/errorcode.h" | |||
| #include "nnacl/fp32/adam_fp32.h" | |||
| #include "utils/ms_utils.h" | |||
| namespace mindspore { | |||
| @@ -13,11 +13,13 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADAM_WEIGHT_DECAY_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADAM_WEIGHT_DECAY_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/allgather_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "runtime/device/cpu/mpi/mpi_interface.h" | |||
| @@ -21,28 +22,25 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kAllGatherInputsNum = 1; | |||
| constexpr size_t kAllGatherOutputsNum = 1; | |||
| constexpr auto kRanksGroup = "group"; | |||
| constexpr auto kAllGatherInputNum = 1; | |||
| } // namespace | |||
| void AllGatherCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != kAllGatherInputNum) { | |||
| MS_LOG(EXCEPTION) << "Allgather input num:" << input_num; | |||
| } | |||
| auto ranks_group = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(kRanksGroup); | |||
| if (ranks_group != nullptr) { | |||
| ranks_group_ = GetValue<std::vector<int>>(ranks_group); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Miss attribute " << kRanksGroup; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(input_num, kAllGatherInputsNum, kernel_name_); | |||
| ranks_group_ = AnfAlgo::GetNodeAttr<std::vector<int>>(kernel_node, kRanksGroup); | |||
| } | |||
| bool AllGatherCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| auto input_addr = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto output_addr = reinterpret_cast<float *>(outputs[0]->addr); | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kAllGatherInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kAllGatherOutputsNum, kernel_name_); | |||
| auto *input_addr = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto *output_addr = reinterpret_cast<float *>(outputs[0]->addr); | |||
| auto input_data_num = inputs[0]->size / sizeof(float); | |||
| return MPIAllGather(input_addr, output_addr, ranks_group_, input_data_num); | |||
| } | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,10 +13,13 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_SCATTER_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_SCATTER_CPU_KERNEL_H_ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ALLGATHER_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ALLGATHER_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -41,4 +44,4 @@ MS_REG_CPU_KERNEL(_HostAllGather, KernelAttr().AddInputAttr(kNumberTypeFloat32). | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_SCATTER_CPU_KERNEL_H_ | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ALLGATHER_CPU_KERNEL_H_ | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -24,11 +24,13 @@ namespace kernel { | |||
| namespace { | |||
| constexpr size_t kSizeFloat16 = 2; | |||
| constexpr size_t kSizeFloat32 = 4; | |||
| constexpr size_t kInputSize = 4; | |||
| constexpr size_t kOutputSize = 2; | |||
| constexpr size_t kApplyAdagradInputsNum = 4; | |||
| constexpr size_t kApplyAdagradOutputsNum = 2; | |||
| } // namespace | |||
| void ApplyAdagradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| update_slots_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "update_slots"); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| } | |||
| @@ -36,47 +38,41 @@ void ApplyAdagradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool ApplyAdagradCPUKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| CheckParam(inputs, outputs); | |||
| if (dtype_ == kNumberTypeFloat16) { | |||
| LaunchKernel<float16>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat32) { | |||
| LaunchKernel<float>(inputs, outputs); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << kernel_name_ << " only support float16 and float32 on CPU, but got " | |||
| << TypeIdToType(dtype_)->ToString(); | |||
| } | |||
| return true; | |||
| } | |||
| void ApplyAdagradCPUKernel::CheckParam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) { | |||
| void ApplyAdagradCPUKernel::CheckParam(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) const { | |||
| // inputs: var, accum, lr, gradient | |||
| if (inputs.size() != kInputSize) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << inputs.size() << ", but ApplyAdagrad needs 4 inputs."; | |||
| } | |||
| // outputs: var, accum | |||
| if (outputs.size() != kOutputSize) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << outputs.size() << ", but ApplyAdagrad needs 2 outputs."; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kApplyAdagradInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kApplyAdagradOutputsNum, kernel_name_); | |||
| if (inputs[0]->size != inputs[1]->size || inputs[0]->size != inputs[3]->size) { | |||
| MS_LOG(EXCEPTION) << "Error input data size!"; | |||
| } | |||
| if (inputs[2]->size != kSizeFloat16 && inputs[2]->size != kSizeFloat32) { | |||
| MS_LOG(EXCEPTION) << "The attribute lr and grad must be float16 or float32!"; | |||
| MS_LOG(EXCEPTION) << kernel_name_ << " requires the attribute lr and grad must be float16 or float32!"; | |||
| } | |||
| } | |||
| template <typename T> | |||
| void ApplyAdagradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| auto var = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto accum = reinterpret_cast<T *>(inputs[1]->addr); | |||
| auto lr = reinterpret_cast<T *>(inputs[2]->addr); | |||
| auto gradient = reinterpret_cast<T *>(inputs[3]->addr); | |||
| auto *var = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *accum = reinterpret_cast<T *>(inputs[1]->addr); | |||
| const auto *lr = reinterpret_cast<T *>(inputs[2]->addr); | |||
| const auto *gradient = reinterpret_cast<T *>(inputs[3]->addr); | |||
| // multithreading | |||
| size_t length = inputs[0]->size / sizeof(T); | |||
| auto task = [this, &var, &accum, lr, gradient](size_t start, size_t end) { | |||
| auto task = [this, &var, &accum, &lr, &gradient](size_t start, size_t end) { | |||
| LaunchApplyAdagrad(var, accum, lr, gradient, start, end); | |||
| }; | |||
| CPUKernelUtils::ParallelForAutoSearch(task, length, ¶llel_search_info_); | |||
| @@ -87,19 +83,17 @@ void ApplyAdagradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| if (memcpy_s(output_var, outputs[0]->size, var, inputs[0]->size) != EOK) { | |||
| MS_LOG(EXCEPTION) << "Launch kernel error: memcpy failed."; | |||
| } | |||
| if (memcpy_s(output_accum, outputs[1]->size, accum, inputs[1]->size) != EOK) { | |||
| MS_LOG(EXCEPTION) << "Launch kernel error: memcpy failed."; | |||
| } | |||
| } | |||
| template <typename T> | |||
| void ApplyAdagradCPUKernel::LaunchApplyAdagrad(T const var, T const accum, const T lr, const T gradient, size_t start, | |||
| size_t end) { | |||
| void ApplyAdagradCPUKernel::LaunchApplyAdagrad(T *var, T *accum, const T *lr, const T *gradient, size_t start, | |||
| size_t end) const { | |||
| // DataType can only be float32 or float16, so eps will not be zero. | |||
| using DataType = typename std::iterator_traits<T>::value_type; | |||
| const DataType one = DataType(1); | |||
| const DataType eps = DataType(1e-6); | |||
| auto one = static_cast<T>(1); | |||
| auto eps = static_cast<T>(1e-6); | |||
| for (size_t i = start; i < end; ++i) { | |||
| // update accum: accum += grad * grad | |||
| if (update_slots_) { | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,11 +13,13 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_APPLY_ADAGRAD_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_APPLY_ADAGRAD_CPU_KERNEL_H_ | |||
| #include <thread> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -34,11 +36,14 @@ class ApplyAdagradCPUKernel : public CPUKernel { | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| static void CheckParam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| void CheckParam(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const; | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| template <typename T> | |||
| void LaunchApplyAdagrad(T const var, T const accum, const T lr, const T gradient, size_t start, size_t end); | |||
| void LaunchApplyAdagrad(T *var, T *accum, const T *lr, const T *gradient, size_t start, size_t end) const; | |||
| bool update_slots_{true}; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| }; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/apply_momentum_cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| @@ -20,20 +21,25 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void ApplyMomentumCPUKernel::InitKernel(const CNodePtr &) {} | |||
| namespace { | |||
| constexpr size_t kApplyMomentumInputsNum = 5; | |||
| } // namespace | |||
| void ApplyMomentumCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| } | |||
| bool ApplyMomentumCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &, const std::vector<kernel::AddressPtr> &) { | |||
| if (inputs.size() < 5) { | |||
| MS_LOG(EXCEPTION) << "Error input output size!"; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kApplyMomentumInputsNum, kernel_name_); | |||
| if (inputs[0]->size != inputs[1]->size || inputs[0]->size != inputs[3]->size) { | |||
| MS_LOG(EXCEPTION) << "Error input data size!"; | |||
| } | |||
| auto weight = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto accumulate = reinterpret_cast<float *>(inputs[1]->addr); | |||
| auto *weight = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto *accumulate = reinterpret_cast<float *>(inputs[1]->addr); | |||
| float learning_rate = reinterpret_cast<float *>(inputs[2]->addr)[0]; | |||
| auto gradient = reinterpret_cast<float *>(inputs[3]->addr); | |||
| const auto *gradient = reinterpret_cast<float *>(inputs[3]->addr); | |||
| float moment = reinterpret_cast<float *>(inputs[4]->addr)[0]; | |||
| size_t elem_num = inputs[0]->size / sizeof(float); | |||
| for (size_t i = 0; i < elem_num; ++i) { | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,16 +13,19 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_APPLY_MOMENTUM_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_APPLY_MOMENTUM_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class ApplyMomentumCPUKernel : public MKLCPUKernel { | |||
| class ApplyMomentumCPUKernel : public CPUKernel { | |||
| public: | |||
| ApplyMomentumCPUKernel() = default; | |||
| ~ApplyMomentumCPUKernel() override = default; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,12 +13,20 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/argmax_cpu_kernel.h" | |||
| #include <string> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kArgMaxInputsNum = 1; | |||
| constexpr size_t kArgMaxOutputsNum = 1; | |||
| constexpr char kKernelName[] = "ArgMax"; | |||
| size_t get_element_num(const std::vector<size_t> &shape) { | |||
| size_t size = 1; | |||
| for (size_t i = 0; i < shape.size(); i++) { | |||
| @@ -30,17 +38,14 @@ size_t get_element_num(const std::vector<size_t> &shape) { | |||
| template <typename T> | |||
| bool check_validation(const std::vector<size_t> &shape, const size_t num_before_axis, const size_t num_after_axis, | |||
| const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() != 1 || outputs.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "Wrong number of inputs or outputs!"; | |||
| return false; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kArgMaxInputsNum, kKernelName); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kArgMaxOutputsNum, kKernelName); | |||
| size_t data_size = sizeof(T); | |||
| size_t input_size = get_element_num(shape) * data_size; | |||
| size_t output_num = num_before_axis * num_after_axis; | |||
| size_t output_size = output_num * sizeof(int); | |||
| if (inputs[0]->size != input_size || outputs[0]->size != output_size) { | |||
| MS_LOG(EXCEPTION) << "Invalid input or output data size!"; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| @@ -49,24 +54,28 @@ bool check_validation(const std::vector<size_t> &shape, const size_t num_before_ | |||
| template <typename T> | |||
| void ArgmaxCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| size_t shape_len = shape_.size(); | |||
| if (shape_len == 0) { | |||
| MS_LOG(EXCEPTION) << "Shape size should be greater than 0"; | |||
| } | |||
| int64_t axis = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS); | |||
| axis += SizeToLong(shape_len); | |||
| if (axis < 0) { | |||
| MS_LOG(EXCEPTION) << "Invalid axis:" << axis << ", should in range [-1, " << (shape_len - 1) << "]"; | |||
| } | |||
| axis = axis % static_cast<int64_t>(shape_len); | |||
| axis = axis % SizeToLong(shape_len); | |||
| num_before_axis_ = 1; | |||
| num_after_axis_ = 1; | |||
| for (size_t i = 0; i < shape_len; i++) { | |||
| if (static_cast<int64_t>(i) < axis) { | |||
| if (SizeToLong(i) < axis) { | |||
| num_before_axis_ *= shape_[i]; | |||
| } else if (static_cast<int64_t>(i) > axis) { | |||
| } else if (SizeToLong(i) > axis) { | |||
| num_after_axis_ *= shape_[i]; | |||
| } | |||
| } | |||
| dim_axis_ = shape_[axis]; | |||
| dim_axis_ = shape_[LongToSize(axis)]; | |||
| } | |||
| template <typename T> | |||
| @@ -76,8 +85,8 @@ bool ArgmaxCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, c | |||
| return false; | |||
| } | |||
| auto input = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto output = reinterpret_cast<int32_t *>(outputs[0]->addr); | |||
| const auto *input = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *output = reinterpret_cast<int32_t *>(outputs[0]->addr); | |||
| std::vector<float> array_axis(dim_axis_); | |||
| for (size_t i = 0; i < num_before_axis_; i++) { | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,10 +13,13 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAX_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAX_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -35,9 +38,9 @@ class ArgmaxCPUKernel : public CPUKernel { | |||
| private: | |||
| std::vector<size_t> shape_; | |||
| size_t num_before_axis_; | |||
| size_t num_after_axis_; | |||
| size_t dim_axis_; | |||
| size_t num_before_axis_{0}; | |||
| size_t num_after_axis_{0}; | |||
| size_t dim_axis_{0}; | |||
| }; | |||
| MS_REG_CPU_KERNEL_T(Argmax, KernelAttr(), ArgmaxCPUKernel, float); | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/argmax_with_value_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| @@ -13,12 +13,15 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAXWITHVALUE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAXWITHVALUE_CPU_KERNEL_H_ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAX_WITH_VALUE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAX_WITH_VALUE_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <map> | |||
| #include <memory> | |||
| #include <algorithm> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -47,4 +50,4 @@ MS_REG_CPU_KERNEL_T(ArgMaxWithValue, KernelAttr(), ArgMaxWithValueCPUKernel, flo | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAXWITHVALUE_CPU_KERNEL_H_ | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMAX_WITH_VALUE_CPU_KERNEL_H_ | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,12 +13,20 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/argmin_with_value_cpu_kernel.h" | |||
| #include <string> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kArgMinWithValueInputsNum = 1; | |||
| constexpr size_t kArgMinWithValueOutputsNum = 2; | |||
| constexpr char kKernelName[] = "ArgMinWithValue"; | |||
| size_t get_element_num(const std::vector<size_t> &shape) { | |||
| size_t size = 1; | |||
| for (size_t i = 0; i < shape.size(); i++) { | |||
| @@ -30,10 +38,8 @@ size_t get_element_num(const std::vector<size_t> &shape) { | |||
| template <typename T> | |||
| bool check_validation(const std::vector<size_t> &shape, const size_t num_before_axis, const size_t num_after_axis, | |||
| const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() != 1 || outputs.size() != 2) { | |||
| MS_LOG(EXCEPTION) << "Wrong number of inputs or outputs!"; | |||
| return false; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kArgMinWithValueInputsNum, kKernelName); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kArgMinWithValueOutputsNum, kKernelName); | |||
| size_t data_size = sizeof(T); | |||
| size_t input_size = get_element_num(shape) * data_size; | |||
| size_t output_num = num_before_axis * num_after_axis; | |||
| @@ -41,7 +47,6 @@ bool check_validation(const std::vector<size_t> &shape, const size_t num_before_ | |||
| size_t out1_size = output_num * data_size; | |||
| if (inputs[0]->size != input_size || outputs[0]->size != out0_size || outputs[1]->size != out1_size) { | |||
| MS_LOG(EXCEPTION) << "Invalid input or output data size!"; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| @@ -50,8 +55,12 @@ bool check_validation(const std::vector<size_t> &shape, const size_t num_before_ | |||
| template <typename T> | |||
| void ArgMinWithValueCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| size_t shape_len = shape_.size(); | |||
| if (shape_len == 0) { | |||
| MS_LOG(EXCEPTION) << "Shape size should be greater than 0"; | |||
| } | |||
| int64_t axis = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS); | |||
| axis += static_cast<int64_t>(shape_len); | |||
| if (axis < 0) { | |||
| @@ -78,10 +87,9 @@ bool ArgMinWithValueCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> & | |||
| return false; | |||
| } | |||
| auto input = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto output0 = reinterpret_cast<int32_t *>(outputs[0]->addr); | |||
| auto output1 = reinterpret_cast<T *>(outputs[1]->addr); | |||
| const auto *input = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *output0 = reinterpret_cast<int32_t *>(outputs[0]->addr); | |||
| auto *output1 = reinterpret_cast<T *>(outputs[1]->addr); | |||
| std::vector<float> array_axis(dim_axis_); | |||
| for (size_t i = 0; i < num_before_axis_; i++) { | |||
| size_t src_index_i = i * dim_axis_ * num_after_axis_; | |||
| @@ -93,9 +101,9 @@ bool ArgMinWithValueCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> & | |||
| } | |||
| auto min_ops = std::min_element(array_axis.begin(), array_axis.end()); | |||
| auto min_index = static_cast<int32_t>(std::distance(array_axis.begin(), min_ops)); | |||
| auto dst_index = i * num_after_axis_ + j; | |||
| size_t dst_index = i * num_after_axis_ + j; | |||
| output0[dst_index] = min_index; | |||
| auto src_index = IntToSize(min_index) * num_after_axis_ + src_index_j; | |||
| size_t src_index = IntToSize(min_index) * num_after_axis_ + src_index_j; | |||
| output1[dst_index] = input[src_index]; | |||
| } | |||
| } | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,12 +13,15 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMINWITHVALUE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMINWITHVALUE_CPU_KERNEL_H_ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMIN_WITH_VALUE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMIN_WITH_VALUE_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <map> | |||
| #include <memory> | |||
| #include <algorithm> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -37,9 +40,9 @@ class ArgMinWithValueCPUKernel : public CPUKernel { | |||
| private: | |||
| std::vector<size_t> shape_; | |||
| size_t num_before_axis_; | |||
| size_t num_after_axis_; | |||
| size_t dim_axis_; | |||
| size_t num_before_axis_{0}; | |||
| size_t num_after_axis_{0}; | |||
| size_t dim_axis_{0}; | |||
| }; | |||
| MS_REG_CPU_KERNEL_T(ArgMinWithValue, KernelAttr(), ArgMinWithValueCPUKernel, float); | |||
| @@ -47,4 +50,4 @@ MS_REG_CPU_KERNEL_T(ArgMinWithValue, KernelAttr(), ArgMinWithValueCPUKernel, flo | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMINWITHVALUE_CPU_KERNEL_H_ | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARGMIN_WITH_VALUE_CPU_KERNEL_H_ | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,18 +13,56 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <string> | |||
| #include <map> | |||
| #include "backend/kernel_compiler/cpu/arithmetic_cpu_kernel.h" | |||
| #include <cmath> | |||
| #include <string> | |||
| #include <unordered_map> | |||
| #include <limits> | |||
| #include "backend/kernel_compiler/cpu/nnacl/fp32/power_fp32.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/fp32/sub_fp32.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/fp32/mul_fp32.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "nnacl/fp32/power_fp32.h" | |||
| #include "nnacl/fp32/sub_fp32.h" | |||
| #include "nnacl/fp32/mul_fp32.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kInputsNum = 2; | |||
| constexpr size_t kOutputsNum = 1; | |||
| constexpr float kMaxSubSerialSize = 10000.0; | |||
| constexpr float kMaxPowSerialSize = 700.0; | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::AssignAdd(T *input1, const T *input2, T *out) { | |||
| void ElementRealDiv(const T *input1, const T *input2, T *out, size_t size, size_t delta_1, size_t delta_2) { | |||
| size_t idx_1 = 0; | |||
| size_t idx_2 = 0; | |||
| auto zero = static_cast<T>(0); | |||
| for (size_t i = 0; i < size; ++i) { | |||
| auto dividend = input1[idx_1]; | |||
| auto divisor = input2[idx_2]; | |||
| idx_1 += delta_1; | |||
| idx_2 += delta_2; | |||
| if (divisor == zero) { | |||
| if (dividend == zero) { | |||
| out[i] = std::numeric_limits<T>::quiet_NaN(); | |||
| continue; | |||
| } | |||
| if (std::numeric_limits<T>::has_infinity) { | |||
| out[i] = dividend > zero ? std::numeric_limits<T>::infinity() : -std::numeric_limits<T>::infinity(); | |||
| } else { | |||
| out[i] = dividend > zero ? std::numeric_limits<T>::max() : std::numeric_limits<T>::min(); | |||
| } | |||
| continue; | |||
| } | |||
| out[i] = dividend / divisor; | |||
| } | |||
| } | |||
| } // namespace | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::AssignAdd(T *input1, const T *input2, T *out) const { | |||
| auto task = [&input1, &input2, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| out[i] = input1[i] + input2[i]; | |||
| @@ -35,7 +73,7 @@ void ArithmeticCPUKernel<T>::AssignAdd(T *input1, const T *input2, T *out) { | |||
| } | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::Add(const T *input1, const T *input2, T *out) { | |||
| void ArithmeticCPUKernel<T>::Add(const T *input1, const T *input2, T *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| @@ -58,12 +96,12 @@ void ArithmeticCPUKernel<T>::Sub(const T *input1, const T *input2, T *out) { | |||
| ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_); | |||
| return; | |||
| } | |||
| if (op_para.in_elements_num0_ == 1 || op_para.in_elements_num1_ == 1) { | |||
| if (op_para_.in_elements_num0_ == 1 || op_para_.in_elements_num1_ == 1) { | |||
| auto task = [this, input1, input2, out](size_t start, size_t end) { | |||
| if (op_para.in_elements_num0_ == 1) { | |||
| (void)ElementOptSub(input1, input2 + start, out + start, end - start, &op_para); | |||
| if (op_para_.in_elements_num0_ == 1) { | |||
| (void)ElementOptSub(input1, input2 + start, out + start, end - start, &op_para_); | |||
| } else { | |||
| (void)ElementOptSub(input1 + start, input2, out + start, end - start, &op_para); | |||
| (void)ElementOptSub(input1 + start, input2, out + start, end - start, &op_para_); | |||
| } | |||
| }; | |||
| ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_); | |||
| @@ -80,7 +118,7 @@ void ArithmeticCPUKernel<T>::Sub(const T *input1, const T *input2, T *out) { | |||
| iter.GenNextPos(); | |||
| } | |||
| }; | |||
| CPUKernelUtils::ParallelFor(task, output_size_, MAX_SUB_SERIAL_SIZE); | |||
| CPUKernelUtils::ParallelFor(task, output_size_, kMaxSubSerialSize); | |||
| } | |||
| template <typename T> | |||
| @@ -93,12 +131,12 @@ void ArithmeticCPUKernel<T>::Mul(const T *input1, const T *input2, T *out) { | |||
| ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_); | |||
| return; | |||
| } | |||
| if (op_para.in_elements_num0_ == 1 || op_para.in_elements_num1_ == 1) { | |||
| if (op_para_.in_elements_num0_ == 1 || op_para_.in_elements_num1_ == 1) { | |||
| auto task = [this, input1, input2, out](size_t start, size_t end) { | |||
| if (op_para.in_elements_num0_ == 1) { | |||
| (void)ElementOptMul(input1, input2 + start, out + start, end - start, &op_para); | |||
| if (op_para_.in_elements_num0_ == 1) { | |||
| (void)ElementOptMul(input1, input2 + start, out + start, end - start, &op_para_); | |||
| } else { | |||
| (void)ElementOptMul(input1 + start, input2, out + start, end - start, &op_para); | |||
| (void)ElementOptMul(input1 + start, input2, out + start, end - start, &op_para_); | |||
| } | |||
| }; | |||
| ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_); | |||
| @@ -110,39 +148,13 @@ void ArithmeticCPUKernel<T>::Mul(const T *input1, const T *input2, T *out) { | |||
| auto iter = base_iter; | |||
| iter.SetPos(start); | |||
| for (size_t i = start; i < end; i++) { | |||
| out[i] = input1[iter.GetInputPosA()] * input2[iter.GetInputPosB()]; | |||
| out[i] = static_cast<T>(input1[iter.GetInputPosA()] * input2[iter.GetInputPosB()]); | |||
| iter.GenNextPos(); | |||
| } | |||
| }; | |||
| CPUKernelUtils::ParallelFor(task, output_size_); | |||
| } | |||
| template <typename T> | |||
| void ElementRealDiv(const T *input1, const T *input2, T *out, size_t size, size_t delta_1, size_t delta_2) { | |||
| size_t idx_1 = 0; | |||
| size_t idx_2 = 0; | |||
| auto zero = (T)0; | |||
| for (size_t i = 0; i < size; ++i) { | |||
| auto dividend = input1[idx_1]; | |||
| auto divisor = input2[idx_2]; | |||
| idx_1 += delta_1; | |||
| idx_2 += delta_2; | |||
| if (divisor == zero) { | |||
| if (dividend == zero) { | |||
| out[i] = std::numeric_limits<T>::quiet_NaN(); | |||
| continue; | |||
| } | |||
| if (std::numeric_limits<T>::has_infinity) { | |||
| out[i] = dividend > zero ? std::numeric_limits<T>::infinity() : -std::numeric_limits<T>::infinity(); | |||
| } else { | |||
| out[i] = dividend > zero ? std::numeric_limits<T>::max() : std::numeric_limits<T>::min(); | |||
| } | |||
| continue; | |||
| } | |||
| out[i] = dividend / divisor; | |||
| } | |||
| } | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::RealDiv(const T *input1, const T *input2, T *out) { | |||
| if (input_shape1_ == input_shape2_) { | |||
| @@ -152,14 +164,14 @@ void ArithmeticCPUKernel<T>::RealDiv(const T *input1, const T *input2, T *out) { | |||
| ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_); | |||
| return; | |||
| } | |||
| if (op_para.in_elements_num0_ == 1) { | |||
| if (op_para_.in_elements_num0_ == 1) { | |||
| auto task = [&](size_t start, size_t end) { | |||
| ElementRealDiv<T>(input1, input2 + start, out + start, end - start, 0, 1); | |||
| }; | |||
| ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_); | |||
| return; | |||
| } | |||
| if (op_para.in_elements_num1_ == 1) { | |||
| if (op_para_.in_elements_num1_ == 1) { | |||
| auto task = [&](size_t start, size_t end) { | |||
| ElementRealDiv<T>(input1 + start, input2, out + start, end - start, 1, 0); | |||
| }; | |||
| @@ -195,7 +207,7 @@ void ArithmeticCPUKernel<T>::RealDiv(const T *input1, const T *input2, T *out) { | |||
| } | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::Div(const T *input1, const T *input2, T *out) { | |||
| void ArithmeticCPUKernel<T>::Div(const T *input1, const T *input2, T *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| @@ -224,7 +236,7 @@ void ArithmeticCPUKernel<T>::Div(const T *input1, const T *input2, T *out) { | |||
| } | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::FloorDiv(const T *input1, const T *input2, T *out) { | |||
| void ArithmeticCPUKernel<T>::FloorDiv(const T *input1, const T *input2, T *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| @@ -233,7 +245,7 @@ void ArithmeticCPUKernel<T>::FloorDiv(const T *input1, const T *input2, T *out) | |||
| auto dividend = input1[iter.GetInputPosA()]; | |||
| auto divisor = input2[iter.GetInputPosB()]; | |||
| iter.GenNextPos(); | |||
| auto zero = (T)0; | |||
| auto zero = static_cast<T>(0); | |||
| if (divisor == zero) { | |||
| if (dividend == zero) { | |||
| out[i] = std::numeric_limits<T>::quiet_NaN(); | |||
| @@ -246,14 +258,14 @@ void ArithmeticCPUKernel<T>::FloorDiv(const T *input1, const T *input2, T *out) | |||
| } | |||
| continue; | |||
| } | |||
| out[i] = (T)floor(static_cast<double>(dividend) / static_cast<double>(divisor)); | |||
| out[i] = static_cast<T>(floor(static_cast<double>(dividend) / static_cast<double>(divisor))); | |||
| } | |||
| }; | |||
| CPUKernelUtils::ParallelFor(task, output_size_); | |||
| } | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::Mod(const T *input1, const T *input2, T *out) { | |||
| void ArithmeticCPUKernel<T>::Mod(const T *input1, const T *input2, T *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| @@ -275,7 +287,7 @@ void ArithmeticCPUKernel<T>::Mod(const T *input1, const T *input2, T *out) { | |||
| } | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::FloorMod(const T *input1, const T *input2, T *out) { | |||
| void ArithmeticCPUKernel<T>::FloorMod(const T *input1, const T *input2, T *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| @@ -292,7 +304,7 @@ void ArithmeticCPUKernel<T>::FloorMod(const T *input1, const T *input2, T *out) | |||
| } | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::Pow(const T *input1, const T *input2, T *out) { | |||
| void ArithmeticCPUKernel<T>::Pow(const T *input1, const T *input2, T *out) const { | |||
| if constexpr (std::is_same_v<T, float>) { | |||
| auto is_power_single = [this]() { | |||
| bool is_power_single = false; | |||
| @@ -308,7 +320,7 @@ void ArithmeticCPUKernel<T>::Pow(const T *input1, const T *input2, T *out) { | |||
| return is_power_single; | |||
| }; | |||
| if (op_para.in_elements_num1_ == 1) { | |||
| if (op_para_.in_elements_num1_ == 1) { | |||
| auto task = [&](size_t start, size_t end) { | |||
| (void)Power(input1 + start, input2, out + start, end - start, 1, 0, true); | |||
| }; | |||
| @@ -325,7 +337,7 @@ void ArithmeticCPUKernel<T>::Pow(const T *input1, const T *input2, T *out) { | |||
| } | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| if (output_size_ > MAX_POW_SERIAL_SIZE) { | |||
| if (output_size_ > kMaxPowSerialSize) { | |||
| auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| iter.SetPos(start); | |||
| @@ -356,7 +368,7 @@ void ArithmeticCPUKernel<T>::SquaredDifference(const T *input1, const T *input2, | |||
| iter.SetPos(start); | |||
| for (size_t i = start; i < end; i++) { | |||
| T diff = input1[iter.GetInputPosA()] - input2[iter.GetInputPosB()]; | |||
| out[i] = diff * diff; | |||
| out[i] = static_cast<T>(diff * diff); | |||
| iter.GenNextPos(); | |||
| } | |||
| }; | |||
| @@ -364,44 +376,47 @@ void ArithmeticCPUKernel<T>::SquaredDifference(const T *input1, const T *input2, | |||
| } | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::Atan2(const T *input1, const T *input2, T *out) { | |||
| void ArithmeticCPUKernel<T>::Atan2(const T *input1, const T *input2, T *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| iter.SetPos(start); | |||
| for (size_t i = start; i < end; i++) { | |||
| out[i] = | |||
| (T)atan2(static_cast<double>(input1[iter.GetInputPosA()]), static_cast<double>(input2[iter.GetInputPosB()])); | |||
| out[i] = static_cast<T>( | |||
| atan2(static_cast<double>(input1[iter.GetInputPosA()]), static_cast<double>(input2[iter.GetInputPosB()]))); | |||
| iter.GenNextPos(); | |||
| } | |||
| }; | |||
| CPUKernelUtils::ParallelFor(task, output_size_); | |||
| } | |||
| static const std::map<std::string, OperateType> kArithmeticBinOpTypeMap = { | |||
| {prim::kPrimAdd->name(), ADD}, | |||
| {prim::kPrimSub->name(), SUB}, | |||
| {prim::kPrimMul->name(), MUL}, | |||
| {prim::kPrimDiv->name(), DIV}, | |||
| {prim::kPrimMod->name(), MOD}, | |||
| {prim::kPrimAssignAdd->name(), ASSIGNADD}, | |||
| {prim::kPrimPow->name(), POW}, | |||
| {prim::kPrimFloorDiv->name(), FLOORDIV}, | |||
| {prim::kPrimAtan2->name(), ATAN2}, | |||
| {prim::kPrimRealDiv->name(), REALDIV}, | |||
| {prim::kPrimSquaredDifference->name(), SQUAREDDIFFERENCE}, | |||
| {prim::kPrimFloorMod->name(), FLOORMOD}}; | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::InitComputeFunc() { | |||
| if (kernel_name_ == prim::kPrimAssignAdd->name()) { | |||
| return; | |||
| } | |||
| static const std::unordered_map<std::string, TypeComputeFunc> arithmeticMathFuncMap{ | |||
| {prim::kPrimAdd->name(), &ArithmeticCPUKernel<T>::Add}, | |||
| {prim::kPrimSub->name(), &ArithmeticCPUKernel<T>::Sub}, | |||
| {prim::kPrimMul->name(), &ArithmeticCPUKernel<T>::Mul}, | |||
| {prim::kPrimDiv->name(), &ArithmeticCPUKernel<T>::Div}, | |||
| {prim::kPrimMod->name(), &ArithmeticCPUKernel<T>::Mod}, | |||
| {prim::kPrimFloorMod->name(), &ArithmeticCPUKernel<T>::FloorMod}, | |||
| {prim::kPrimPow->name(), &ArithmeticCPUKernel<T>::Pow}, | |||
| {prim::kPrimFloorDiv->name(), &ArithmeticCPUKernel<T>::FloorDiv}, | |||
| {prim::kPrimAtan2->name(), &ArithmeticCPUKernel<T>::Atan2}, | |||
| {prim::kPrimRealDiv->name(), &ArithmeticCPUKernel<T>::RealDiv}, | |||
| {prim::kPrimSquaredDifference->name(), &ArithmeticCPUKernel<T>::SquaredDifference}}; | |||
| if (arithmeticMathFuncMap.find(kernel_name_) == arithmeticMathFuncMap.end()) { | |||
| MS_LOG(EXCEPTION) << "ArithmeticCPUKernel does not support " << kernel_name_; | |||
| } | |||
| compute_func_ = arithmeticMathFuncMap.at(kernel_name_); | |||
| } | |||
| template <typename T> | |||
| void ArithmeticCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node); | |||
| if (kArithmeticBinOpTypeMap.find(kernel_name) != kArithmeticBinOpTypeMap.end()) { | |||
| operate_type_ = kArithmeticBinOpTypeMap.at(kernel_name); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Not support " << kernel_name; | |||
| } | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| input_shape1_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| input_shape2_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| @@ -414,14 +429,14 @@ void ArithmeticCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| output_size_ *= output_shape_[i]; | |||
| } | |||
| op_para.in_elements_num0_ = 1; | |||
| op_para_.in_elements_num0_ = 1; | |||
| for (size_t i = 0; i < input_shape1_.size(); ++i) { | |||
| op_para.in_elements_num0_ *= input_shape1_[i]; | |||
| op_para_.in_elements_num0_ *= input_shape1_[i]; | |||
| } | |||
| op_para.in_elements_num1_ = 1; | |||
| op_para_.in_elements_num1_ = 1; | |||
| for (size_t i = 0; i < input_shape2_.size(); ++i) { | |||
| op_para.in_elements_num1_ *= input_shape2_[i]; | |||
| op_para_.in_elements_num1_ *= input_shape2_[i]; | |||
| } | |||
| size_t l = input_shape1_.size(); | |||
| @@ -435,47 +450,21 @@ void ArithmeticCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| CPUKernelUtils::GetElementNumEveryDim(input_shape1_, &input_element_num1_); | |||
| CPUKernelUtils::GetElementNumEveryDim(input_shape2_, &input_element_num2_); | |||
| CPUKernelUtils::GetElementNumEveryDim(output_shape_, &output_element_num_); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| if (dtype_ != AnfAlgo::GetInputDeviceDataType(kernel_node, 1)) { | |||
| MS_LOG(EXCEPTION) << "Input0 and input1 must has the same data type"; | |||
| } | |||
| target_dtype_ = AnfAlgo::GetOutputDeviceDataType(kernel_node, 0); | |||
| InitComputeFunc(); | |||
| } | |||
| template <typename T> | |||
| bool ArithmeticCPUKernel<T>::Launch(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> & /* workspace */, | |||
| bool ArithmeticCPUKernel<T>::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| T *input1 = reinterpret_cast<T *>(inputs[0]->addr); | |||
| T *input2 = reinterpret_cast<T *>(inputs[1]->addr); | |||
| T *output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| if (operate_type_ == ADD) { | |||
| Add(input1, input2, output); | |||
| } else if (operate_type_ == SUB) { | |||
| Sub(input1, input2, output); | |||
| } else if (operate_type_ == MUL) { | |||
| Mul(input1, input2, output); | |||
| } else if (operate_type_ == REALDIV) { | |||
| RealDiv(input1, input2, output); | |||
| } else if (operate_type_ == DIV) { | |||
| Div(input1, input2, output); | |||
| } else if (operate_type_ == FLOORDIV) { | |||
| FloorDiv(input1, input2, output); | |||
| } else if (operate_type_ == MOD) { | |||
| Mod(input1, input2, output); | |||
| } else if (operate_type_ == FLOORMOD) { | |||
| FloorMod(input1, input2, output); | |||
| } else if (operate_type_ == POW) { | |||
| Pow(input1, input2, output); | |||
| } else if (operate_type_ == ASSIGNADD) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kOutputsNum, kernel_name_); | |||
| auto *input1 = reinterpret_cast<T *>(inputs[0]->addr); | |||
| const auto *input2 = reinterpret_cast<T *>(inputs[1]->addr); | |||
| auto *output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| if (kernel_name_ == prim::kPrimAssignAdd->name()) { | |||
| AssignAdd(input1, input2, output); | |||
| } else if (operate_type_ == ATAN2) { | |||
| Atan2(input1, input2, output); | |||
| } else if (operate_type_ == SQUAREDDIFFERENCE) { | |||
| SquaredDifference(input1, input2, output); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Not support " << operate_type_; | |||
| compute_func_(this, input1, input2, output); | |||
| } | |||
| return true; | |||
| } | |||
| @@ -13,18 +13,15 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_CPU_KERNEL_H_ | |||
| #include <memory> | |||
| #include <vector> | |||
| #include <limits> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| #include "nnacl/arithmetic.h" | |||
| const float MAX_SUB_SERIAL_SIZE = 10000; | |||
| const float MAX_DIV_SERIAL_SIZE = 10000; | |||
| const float MAX_POW_SERIAL_SIZE = 700; | |||
| #include "backend/kernel_compiler/cpu/nnacl/arithmetic.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| @@ -40,29 +37,31 @@ class ArithmeticCPUKernel : public CPUKernel { | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| void InitComputeFunc(); | |||
| void Sub(const T *input1, const T *input2, T *out); | |||
| void Add(const T *input1, const T *input2, T *out); | |||
| void Add(const T *input1, const T *input2, T *out) const; | |||
| void Mul(const T *input1, const T *input2, T *out); | |||
| void RealDiv(const T *input1, const T *input2, T *out); | |||
| void Div(const T *input1, const T *input2, T *out); | |||
| void FloorDiv(const T *input1, const T *input2, T *out); | |||
| void Mod(const T *input1, const T *input2, T *out); | |||
| void FloorMod(const T *input1, const T *input2, T *out); | |||
| void Pow(const T *input1, const T *input2, T *out); | |||
| void AssignAdd(T *input1, const T *input2, T *out); | |||
| void Atan2(const T *input1, const T *input2, T *out); | |||
| void Div(const T *input1, const T *input2, T *out) const; | |||
| void FloorDiv(const T *input1, const T *input2, T *out) const; | |||
| void Mod(const T *input1, const T *input2, T *out) const; | |||
| void FloorMod(const T *input1, const T *input2, T *out) const; | |||
| void Pow(const T *input1, const T *input2, T *out) const; | |||
| void AssignAdd(T *input1, const T *input2, T *out) const; | |||
| void Atan2(const T *input1, const T *input2, T *out) const; | |||
| void SquaredDifference(const T *input1, const T *input2, T *out); | |||
| using TypeComputeFunc = std::function<void(ArithmeticCPUKernel *, const T *in_x, const T *in_y, T *out)>; | |||
| TypeComputeFunc compute_func_{nullptr}; | |||
| size_t output_size_{1}; | |||
| ArithmeticParameter op_para_{}; | |||
| std::vector<size_t> input_shape1_; | |||
| std::vector<size_t> input_shape2_; | |||
| std::vector<size_t> input_element_num1_; | |||
| std::vector<size_t> input_element_num2_; | |||
| std::vector<size_t> output_shape_; | |||
| std::vector<size_t> output_element_num_; | |||
| size_t output_size_{1}; | |||
| ArithmeticParameter op_para; | |||
| OperateType operate_type_{ADD}; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| TypeId target_dtype_{kTypeUnknown}; | |||
| }; | |||
| MS_REG_CPU_KERNEL_T(Sub, KernelAttr(), ArithmeticCPUKernel, int32_t); | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -15,18 +15,26 @@ | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/arithmetic_logic_cpu_kernel.h" | |||
| #include <cmath> | |||
| #include <string> | |||
| #include <map> | |||
| #include <cmath> | |||
| #include <unordered_map> | |||
| #include <functional> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kMaxLessSerialSize = 15000; | |||
| constexpr size_t kInputsNum = 2; | |||
| constexpr size_t kOutputsNum = 1; | |||
| } // namespace | |||
| template <typename T> | |||
| void ArithmeticLogicCPUKernel<T>::Less(const T *input1, const T *input2, bool *out) { | |||
| void ArithmeticLogicCPUKernel<T>::Less(const T *input1, const T *input2, bool *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| if (output_size_ > MAX_LESS_SERIAL_SIZE) { | |||
| if (output_size_ > kMaxLessSerialSize) { | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| iter.SetPos(start); | |||
| @@ -50,7 +58,7 @@ void ArithmeticLogicCPUKernel<T>::Less(const T *input1, const T *input2, bool *o | |||
| } | |||
| template <typename T> | |||
| void ArithmeticLogicCPUKernel<T>::Equal(const T *input1, const T *input2, bool *out) { | |||
| void ArithmeticLogicCPUKernel<T>::Equal(const T *input1, const T *input2, bool *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| @@ -66,7 +74,7 @@ void ArithmeticLogicCPUKernel<T>::Equal(const T *input1, const T *input2, bool * | |||
| } | |||
| template <typename T> | |||
| void ArithmeticLogicCPUKernel<T>::NotEqual(const T *input1, const T *input2, bool *out) { | |||
| void ArithmeticLogicCPUKernel<T>::NotEqual(const T *input1, const T *input2, bool *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| @@ -82,7 +90,7 @@ void ArithmeticLogicCPUKernel<T>::NotEqual(const T *input1, const T *input2, boo | |||
| } | |||
| template <typename T> | |||
| void ArithmeticLogicCPUKernel<T>::LogicalAnd(const T *input1, const T *input2, bool *out) { | |||
| void ArithmeticLogicCPUKernel<T>::LogicalAnd(const T *input1, const T *input2, bool *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| @@ -96,7 +104,7 @@ void ArithmeticLogicCPUKernel<T>::LogicalAnd(const T *input1, const T *input2, b | |||
| } | |||
| template <typename T> | |||
| void ArithmeticLogicCPUKernel<T>::LogicalOr(const T *input1, const T *input2, bool *out) { | |||
| void ArithmeticLogicCPUKernel<T>::LogicalOr(const T *input1, const T *input2, bool *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| @@ -110,7 +118,7 @@ void ArithmeticLogicCPUKernel<T>::LogicalOr(const T *input1, const T *input2, bo | |||
| } | |||
| template <typename T> | |||
| void ArithmeticLogicCPUKernel<T>::Greater(const T *input1, const T *input2, bool *out) { | |||
| void ArithmeticLogicCPUKernel<T>::Greater(const T *input1, const T *input2, bool *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| @@ -126,7 +134,7 @@ void ArithmeticLogicCPUKernel<T>::Greater(const T *input1, const T *input2, bool | |||
| } | |||
| template <typename T> | |||
| void ArithmeticLogicCPUKernel<T>::GreaterEqual(const T *input1, const T *input2, bool *out) { | |||
| void ArithmeticLogicCPUKernel<T>::GreaterEqual(const T *input1, const T *input2, bool *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| @@ -142,7 +150,7 @@ void ArithmeticLogicCPUKernel<T>::GreaterEqual(const T *input1, const T *input2, | |||
| } | |||
| template <typename T> | |||
| void ArithmeticLogicCPUKernel<T>::LessEqual(const T *input1, const T *input2, bool *out) { | |||
| void ArithmeticLogicCPUKernel<T>::LessEqual(const T *input1, const T *input2, bool *out) const { | |||
| BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_); | |||
| auto task = [&](size_t start, size_t end) { | |||
| auto iter = base_iter; | |||
| @@ -157,26 +165,31 @@ void ArithmeticLogicCPUKernel<T>::LessEqual(const T *input1, const T *input2, bo | |||
| CPUKernelUtils::ParallelFor(task, output_size_); | |||
| } | |||
| static const std::map<std::string, OperateType> kArithmeticBinOpTypeMap = { | |||
| {prim::kPrimGreater->name(), GREATER}, {prim::kPrimGreaterEqual->name(), GREATEREQUAL}, | |||
| {prim::kPrimLogicalAnd->name(), LOGICALAND}, {prim::kPrimLessEqual->name(), LESSEQUAL}, | |||
| {prim::kPrimLogicalOr->name(), LOGICALOR}, {prim::kPrimLess->name(), LESS}, | |||
| {prim::kPrimNotEqual->name(), NOTEQUAL}, {prim::kPrimEqual->name(), EQUAL}}; | |||
| template <typename T> | |||
| void ArithmeticLogicCPUKernel<T>::InitComputeFunc() { | |||
| static const std::unordered_map<std::string, TypeComputeFunc> arithmeticLogicFuncMap{ | |||
| {prim::kPrimGreater->name(), &ArithmeticLogicCPUKernel<T>::Greater}, | |||
| {prim::kPrimGreaterEqual->name(), &ArithmeticLogicCPUKernel<T>::GreaterEqual}, | |||
| {prim::kPrimLogicalAnd->name(), &ArithmeticLogicCPUKernel<T>::LogicalAnd}, | |||
| {prim::kPrimLessEqual->name(), &ArithmeticLogicCPUKernel<T>::LessEqual}, | |||
| {prim::kPrimLogicalOr->name(), &ArithmeticLogicCPUKernel<T>::LogicalOr}, | |||
| {prim::kPrimLess->name(), &ArithmeticLogicCPUKernel<T>::Less}, | |||
| {prim::kPrimNotEqual->name(), &ArithmeticLogicCPUKernel<T>::NotEqual}, | |||
| {prim::kPrimEqual->name(), &ArithmeticLogicCPUKernel<T>::Equal}}; | |||
| if (arithmeticLogicFuncMap.find(kernel_name_) == arithmeticLogicFuncMap.end()) { | |||
| MS_LOG(EXCEPTION) << "ArithmeticLogicCPUKernel does not support " << kernel_name_; | |||
| } | |||
| compute_func_ = arithmeticLogicFuncMap.at(kernel_name_); | |||
| } | |||
| template <typename T> | |||
| void ArithmeticLogicCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node); | |||
| if (kArithmeticBinOpTypeMap.find(kernel_name) != kArithmeticBinOpTypeMap.end()) { | |||
| operate_type_ = kArithmeticBinOpTypeMap.at(kernel_name); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Not support " << kernel_name; | |||
| } | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| input_shape1_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| input_shape2_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| if (output_shape_.size() == 0) { | |||
| if (output_shape_.empty()) { | |||
| (void)output_shape_.insert(output_shape_.begin(), 1); | |||
| } | |||
| @@ -200,36 +213,19 @@ void ArithmeticLogicCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| if (dtype_ != AnfAlgo::GetInputDeviceDataType(kernel_node, 1)) { | |||
| MS_LOG(EXCEPTION) << "Input0 and input1 must has the same data type"; | |||
| } | |||
| target_dtype_ = AnfAlgo::GetOutputDeviceDataType(kernel_node, 0); | |||
| InitComputeFunc(); | |||
| } | |||
| template <typename T> | |||
| bool ArithmeticLogicCPUKernel<T>::Launch(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> & /* workspace */, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| T *input1 = reinterpret_cast<T *>(inputs[0]->addr); | |||
| T *input2 = reinterpret_cast<T *>(inputs[1]->addr); | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kOutputsNum, kernel_name_); | |||
| const auto *input1 = reinterpret_cast<T *>(inputs[0]->addr); | |||
| const auto *input2 = reinterpret_cast<T *>(inputs[1]->addr); | |||
| bool *output = reinterpret_cast<bool *>(outputs[0]->addr); | |||
| if (operate_type_ == LESS) { | |||
| Less(input1, input2, output); | |||
| } else if (operate_type_ == EQUAL) { | |||
| Equal(input1, input2, output); | |||
| } else if (operate_type_ == NOTEQUAL) { | |||
| NotEqual(input1, input2, output); | |||
| } else if (operate_type_ == GREATER) { | |||
| Greater(input1, input2, output); | |||
| } else if (operate_type_ == GREATEREQUAL) { | |||
| GreaterEqual(input1, input2, output); | |||
| } else if (operate_type_ == LESSEQUAL) { | |||
| LessEqual(input1, input2, output); | |||
| } else if (operate_type_ == LOGICALAND) { | |||
| LogicalAnd(input1, input2, output); | |||
| } else if (operate_type_ == LOGICALOR) { | |||
| LogicalOr(input1, input2, output); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Not support " << operate_type_; | |||
| } | |||
| compute_func_(this, input1, input2, output); | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| @@ -13,16 +13,17 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_LOGIC_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_LOGIC_CPU_KERNEL_H_ | |||
| #include <memory> | |||
| #include <vector> | |||
| #include <limits> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| #define MAX_LESS_SERIAL_SIZE 15000 | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T> | |||
| @@ -37,25 +38,27 @@ class ArithmeticLogicCPUKernel : public CPUKernel { | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| void GenIndex(size_t num, std::vector<size_t> *idx); | |||
| void Less(const T *input1, const T *input2, bool *out); | |||
| void Equal(const T *input1, const T *input2, bool *out); | |||
| void NotEqual(const T *input1, const T *input2, bool *out); | |||
| void Greater(const T *input1, const T *input2, bool *out); | |||
| void GreaterEqual(const T *input1, const T *input2, bool *out); | |||
| void LessEqual(const T *input1, const T *input2, bool *out); | |||
| void LogicalAnd(const T *input1, const T *input2, bool *out); | |||
| void LogicalOr(const T *input1, const T *input2, bool *out); | |||
| void InitComputeFunc(); | |||
| void Less(const T *input1, const T *input2, bool *out) const; | |||
| void Equal(const T *input1, const T *input2, bool *out) const; | |||
| void NotEqual(const T *input1, const T *input2, bool *out) const; | |||
| void Greater(const T *input1, const T *input2, bool *out) const; | |||
| void GreaterEqual(const T *input1, const T *input2, bool *out) const; | |||
| void LessEqual(const T *input1, const T *input2, bool *out) const; | |||
| void LogicalAnd(const T *input1, const T *input2, bool *out) const; | |||
| void LogicalOr(const T *input1, const T *input2, bool *out) const; | |||
| using TypeComputeFunc = std::function<void(ArithmeticLogicCPUKernel *, const T *, const T *, bool *)>; | |||
| TypeComputeFunc compute_func_{nullptr}; | |||
| size_t output_size_{1}; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| std::vector<size_t> input_shape1_; | |||
| std::vector<size_t> input_shape2_; | |||
| std::vector<size_t> input_element_num1_; | |||
| std::vector<size_t> input_element_num2_; | |||
| std::vector<size_t> output_shape_; | |||
| std::vector<size_t> output_element_num_; | |||
| size_t output_size_{1}; | |||
| OperateType operate_type_{ADD}; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| TypeId target_dtype_{kTypeUnknown}; | |||
| }; | |||
| MS_REG_CPU_KERNEL_T( | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,17 +13,25 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <algorithm> | |||
| #include "backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.h" | |||
| #include <cmath> | |||
| #include <string> | |||
| #include <thread> | |||
| #include <map> | |||
| #include "backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.h" | |||
| #include <algorithm> | |||
| #include <unordered_map> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr float kMaxNegSerialSize = 5000.0f; | |||
| constexpr float kMaxSquareSerialSize = 5000.0f; | |||
| constexpr size_t kInputsNum = 1; | |||
| constexpr size_t kOutputsNum = 1; | |||
| template <typename T> | |||
| void Square(const T *in, T *out, size_t size) { | |||
| auto task = [&in, &out](size_t start, size_t end) { | |||
| @@ -31,7 +39,7 @@ void Square(const T *in, T *out, size_t size) { | |||
| out[i] = in[i] * in[i]; | |||
| } | |||
| }; | |||
| ParallelLaunch(task, size, MAX_SQUARE_SERIAL_SIZE); | |||
| ParallelLaunch(task, size, kMaxSquareSerialSize); | |||
| } | |||
| template <typename T> | |||
| @@ -57,11 +65,10 @@ void Neg(const T *in, T *out, size_t size) { | |||
| out[i] = -in[i]; | |||
| } | |||
| }; | |||
| ParallelLaunch(task, size, MAX_NEG_SERIAL_SIZE); | |||
| ParallelLaunch(task, size, kMaxNegSerialSize); | |||
| } | |||
| template <typename T> | |||
| void LogicalNot(const T *in, T *out, size_t size) { | |||
| void LogicalNot(const bool *in, bool *out, size_t size) { | |||
| auto task = [&in, &out](size_t start, size_t end) { | |||
| for (size_t i = start; i < end; i++) { | |||
| out[i] = !in[i]; | |||
| @@ -133,10 +140,12 @@ void Reciprocal(const T *in, T *out, size_t size) { | |||
| template <typename T> | |||
| void Gelu(const T *in, T *out, size_t size) { | |||
| auto task = [&in, &out](size_t start, size_t end) { | |||
| auto factor_a = static_cast<T>(0.7978845608); | |||
| auto factor_b = static_cast<T>(0.044715); | |||
| for (size_t i = start; i < end; i++) { | |||
| T x = in[i]; | |||
| auto double_x = static_cast<T>(x); | |||
| T tanh_res = static_cast<T>(std::tanh(0.7978845608 * (double_x + 0.044715 * double_x * double_x * double_x))); | |||
| T tanh_res = static_cast<T>(std::tanh(factor_a * (double_x + factor_b * double_x * double_x * double_x))); | |||
| out[i] = x * (static_cast<T>(1.0) + tanh_res) / static_cast<T>(2.0); | |||
| } | |||
| }; | |||
| @@ -259,40 +268,17 @@ void Identity(const T *in, T *out, size_t size) { | |||
| } | |||
| } // namespace | |||
| static const std::map<std::string, OperateType> kArithmeticOpTypeMap = {{prim::kPrimNeg->name(), NEG}, | |||
| {prim::kPrimSquare->name(), SQUARE}, | |||
| {prim::kPrimOnesLike->name(), ONESLIKE}, | |||
| {prim::kPrimZerosLike->name(), ZEROSLIKE}, | |||
| {prim::kPrimLogicalNot->name(), LOGICALNOT}, | |||
| {prim::kPrimSign->name(), SIGN}, | |||
| {prim::kPrimFloor->name(), FLOOR}, | |||
| {prim::kPrimRint->name(), RINT}, | |||
| {prim::kPrimRound->name(), ROUND}, | |||
| {prim::kPrimReciprocal->name(), RECIPROCAL}, | |||
| {prim::kPrimGeLU->name(), GELU}, | |||
| {prim::kPrimAsin->name(), ASIN}, | |||
| {prim::kPrimACos->name(), ACOS}, | |||
| {prim::kPrimAtan->name(), ATAN}, | |||
| {prim::kPrimSin->name(), SIN}, | |||
| {prim::kPrimCos->name(), COS}, | |||
| {prim::kPrimTan->name(), TAN}, | |||
| {prim::kPrimSinh->name(), SINH}, | |||
| {prim::kPrimCosh->name(), COSH}, | |||
| {prim::kPrimAsinh->name(), ASINH}, | |||
| {prim::kPrimAcosh->name(), ACOSH}, | |||
| {prim::kPrimAtanh->name(), ATANH}, | |||
| {prim::kPrimIdentityMath->name(), IDENTITY}}; | |||
| void ArithmeticSelfCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node); | |||
| operate_type_ = kArithmeticOpTypeMap.at(kernel_name); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| } | |||
| bool ArithmeticSelfCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kOutputsNum, kernel_name_); | |||
| if (dtype_ == kNumberTypeFloat32 || dtype_ == kNumberTypeFloat16 || dtype_ == kNumberTypeFloat64) { | |||
| LaunchKernel<float>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeInt32 || dtype_ == kNumberTypeInt16) { | |||
| @@ -300,52 +286,63 @@ bool ArithmeticSelfCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inpu | |||
| } else if (dtype_ == kNumberTypeInt64) { | |||
| LaunchKernel<int64_t>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeBool) { | |||
| LaunchKernelLogic<bool>(inputs, outputs); | |||
| LaunchLogicalNot(inputs, outputs); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Data type is " << TypeIdLabel(dtype_) << "is not support."; | |||
| } | |||
| return true; | |||
| } | |||
| template <typename T> | |||
| void ArithmeticSelfCPUKernel::LaunchKernelLogic(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| T *input = reinterpret_cast<T *>(inputs[0]->addr); | |||
| T *output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| size_t lens = outputs[0]->size > 0 ? static_cast<size_t>(outputs[0]->size / sizeof(T)) : 1; | |||
| LogicalNot<T>(input, output, lens); | |||
| return; | |||
| void ArithmeticSelfCPUKernel::LaunchLogicalNot(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) const { | |||
| auto *input = reinterpret_cast<bool *>(inputs[0]->addr); | |||
| auto *output = reinterpret_cast<bool *>(outputs[0]->addr); | |||
| size_t lens = outputs[0]->size / sizeof(bool); | |||
| LogicalNot(input, output, lens); | |||
| } | |||
| template <typename T> | |||
| void ArithmeticSelfCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| T *input = reinterpret_cast<T *>(inputs[0]->addr); | |||
| T *output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| size_t lens = outputs[0]->size > 0 ? static_cast<size_t>(outputs[0]->size / sizeof(T)) : 1; | |||
| static const std::map<OperateType, std::function<void(const T *in, T *out, size_t size)>> kArithmeticOpFuncMap = { | |||
| {SQUARE, Square<T>}, {SIGN, Sign<T>}, | |||
| {NEG, Neg<T>}, {LOGICALNOT, LogicalNot<T>}, | |||
| {ONESLIKE, OnesLike<T>}, {ZEROSLIKE, ZerosLike<T>}, | |||
| {FLOOR, Floor<T>}, {RECIPROCAL, Reciprocal<T>}, | |||
| {GELU, Gelu<T>}, {SIN, Sin<T>}, | |||
| {COS, Cos<T>}, {TAN, Tan<T>}, | |||
| {ASIN, Asin<T>}, {ACOS, ACos<T>}, | |||
| {ATAN, Atan<T>}, {SINH, Sinh<T>}, | |||
| {COSH, Cosh<T>}, {ASINH, Asinh<T>}, | |||
| {ACOSH, Acosh<T>}, {ATANH, Atanh<T>}, | |||
| {RINT, Rint<T>}, {ROUND, Round<T>}}; | |||
| if (kArithmeticOpFuncMap.find(operate_type_) != kArithmeticOpFuncMap.end()) { | |||
| kArithmeticOpFuncMap.at(operate_type_)(input, output, lens); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Not support " << operate_type_; | |||
| const std::vector<AddressPtr> &outputs) const { | |||
| const auto *input = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| const size_t lens = outputs[0]->size / sizeof(T); | |||
| static const std::unordered_map<std::string, std::function<void(const T *, T *, size_t)>> arithmeticSelfFuncMap{ | |||
| {prim::kPrimSquare->name(), Square<T>}, | |||
| {prim::kPrimSign->name(), Sign<T>}, | |||
| {prim::kPrimNeg->name(), Neg<T>}, | |||
| {prim::kPrimAtanh->name(), Atanh<T>}, | |||
| {prim::kPrimAcosh->name(), Acosh<T>}, | |||
| {prim::kPrimFloor->name(), Floor<T>}, | |||
| {prim::kPrimSin->name(), Sin<T>}, | |||
| {prim::kPrimGeLU->name(), Gelu<T>}, | |||
| {prim::kPrimCos->name(), Cos<T>}, | |||
| {prim::kPrimTan->name(), Tan<T>}, | |||
| {prim::kPrimAsin->name(), Asin<T>}, | |||
| {prim::kPrimACos->name(), ACos<T>}, | |||
| {prim::kPrimAtan->name(), Atan<T>}, | |||
| {prim::kPrimSinh->name(), Sinh<T>}, | |||
| {prim::kPrimCosh->name(), Cosh<T>}, | |||
| {prim::kPrimAsinh->name(), Asinh<T>}, | |||
| {prim::kPrimZerosLike->name(), ZerosLike<T>}, | |||
| {prim::kPrimOnesLike->name(), OnesLike<T>}, | |||
| {prim::kPrimReciprocal->name(), Reciprocal<T>}, | |||
| {prim::kPrimRint->name(), Rint<T>}, | |||
| {prim::kPrimRound->name(), Round<T>}}; | |||
| const auto func_pair = arithmeticSelfFuncMap.find(kernel_name_); | |||
| if (arithmeticSelfFuncMap.find(kernel_name_) == arithmeticSelfFuncMap.end()) { | |||
| MS_LOG(EXCEPTION) << "ArithmeticSelfCPUKernel does not support " << kernel_name_; | |||
| } | |||
| func_pair->second(input, output, lens); | |||
| } | |||
| template <typename T> | |||
| bool IdentityCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kOutputsNum, kernel_name_); | |||
| T *input = reinterpret_cast<T *>(inputs[0]->addr); | |||
| T *output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| size_t lens = outputs[0]->size > 0 ? static_cast<size_t>(outputs[0]->size / sizeof(T)) : 1; | |||
| @@ -13,16 +13,16 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_SELF_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_SELF_CPU_KERNEL_H_ | |||
| #include <memory> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| const float MAX_NEG_SERIAL_SIZE = 5000; | |||
| const float MAX_SQUARE_SERIAL_SIZE = 5000; | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class ArithmeticSelfCPUKernel : public CPUKernel { | |||
| @@ -35,13 +35,12 @@ class ArithmeticSelfCPUKernel : public CPUKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| template <typename T> | |||
| void LaunchKernelLogic(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const; | |||
| void LaunchLogicalNot(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const; | |||
| private: | |||
| OperateType operate_type_{SQUARE}; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| TypeId target_dtype_{kTypeUnknown}; | |||
| }; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -15,23 +15,34 @@ | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/assign_cpu_kernel.h" | |||
| #include <string> | |||
| #include <map> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "common/thread_pool.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| static std::map<TypeId, size_t> input_x_dtype_size_map = { | |||
| {kNumberTypeBool, sizeof(bool)}, {kNumberTypeInt8, 1}, {kNumberTypeInt16, 2}, {kNumberTypeInt32, 4}, | |||
| {kNumberTypeInt64, 8}, {kNumberTypeUInt8, 1}, {kNumberTypeUInt16, 2}, {kNumberTypeUInt32, 4}, | |||
| {kNumberTypeUInt64, 8}, {kNumberTypeFloat16, 2}, {kNumberTypeFloat32, 4}, {kNumberTypeFloat64, 8}}; | |||
| namespace { | |||
| constexpr size_t kAssignInputsNum = 2; | |||
| constexpr size_t kAssignOutputsNum = 1; | |||
| const std::map<TypeId, size_t> input_x_dtype_size_map = { | |||
| {kNumberTypeBool, sizeof(bool)}, {kNumberTypeInt8, sizeof(int8_t)}, {kNumberTypeInt16, sizeof(int16_t)}, | |||
| {kNumberTypeInt32, sizeof(int32_t)}, {kNumberTypeInt64, sizeof(int64_t)}, {kNumberTypeUInt8, sizeof(uint8_t)}, | |||
| {kNumberTypeUInt16, sizeof(uint16_t)}, {kNumberTypeUInt32, sizeof(uint32_t)}, {kNumberTypeUInt64, sizeof(uint64_t)}, | |||
| {kNumberTypeFloat16, sizeof(float16)}, {kNumberTypeFloat32, sizeof(float)}, {kNumberTypeFloat64, sizeof(double)}}; | |||
| } // namespace | |||
| void AssignCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| auto input_x_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| auto input_y_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| if (input_x_shape.size() != input_y_shape.size()) MS_LOG(EXCEPTION) << "X and y must be same shape!"; | |||
| if (input_x_shape.size() != input_y_shape.size()) { | |||
| MS_LOG(EXCEPTION) << "X and y must be same shape!"; | |||
| } | |||
| for (size_t i = 0; i < input_x_shape.size(); ++i) { | |||
| if (input_x_shape[i] != input_y_shape[i]) { | |||
| MS_LOG(EXCEPTION) << "X and y must be same shape!"; | |||
| @@ -39,14 +50,17 @@ void AssignCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| batch_size_ *= input_x_shape[i]; | |||
| } | |||
| input_x_dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| if (input_x_dtype_size_map.find(input_x_dtype_) == input_x_dtype_size_map.end()) { | |||
| auto type_len = input_x_dtype_size_map.find(input_x_dtype_); | |||
| if (type_len == input_x_dtype_size_map.end()) { | |||
| MS_LOG(EXCEPTION) << "Unsupported input_x dtype!"; | |||
| } | |||
| input_x_dtype_size_ = input_x_dtype_size_map[input_x_dtype_]; | |||
| input_x_dtype_size_ = type_len->second; | |||
| } | |||
| bool AssignCPUKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kAssignInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kAssignOutputsNum, kernel_name_); | |||
| auto max_size = inputs[0]->size; | |||
| size_t total_size = input_x_dtype_size_ * batch_size_; | |||
| if (total_size > max_size) { | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,12 +13,14 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ASSIGN_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ASSIGN_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include <unordered_map> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -36,8 +38,8 @@ class AssignCPUKernel : public CPUKernel { | |||
| private: | |||
| size_t batch_size_{1}; | |||
| size_t input_x_dtype_size_{4}; | |||
| TypeId input_x_dtype_{kTypeUnknown}; | |||
| size_t input_x_dtype_size_ = 4; | |||
| }; | |||
| MS_REG_CPU_KERNEL( | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -15,16 +15,21 @@ | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/bias_add_cpu_kernel.h" | |||
| #include "nnacl/fp32/add_fp32.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/fp32/add_fp32.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/errorcode.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kBiasAddMinDim = 2; | |||
| constexpr size_t kBiasAddMaxDim = 5; | |||
| constexpr size_t kBiasAddInputNum = 2; | |||
| constexpr size_t kBiasAddInputsNum = 2; | |||
| constexpr size_t kBiasAddOutputsNum = 1; | |||
| } // namespace | |||
| void BiasAddCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| bias_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| data_shape_ = input_shape_.size(); | |||
| @@ -44,13 +49,11 @@ void BiasAddCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool BiasAddCPUKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| if (inputs.size() != kBiasAddInputNum || outputs.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "Inputs outputs size not supoort"; | |||
| } | |||
| auto src_addr = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto bias_addr = reinterpret_cast<float *>(inputs[1]->addr); | |||
| auto output_addr = reinterpret_cast<float *>(outputs[0]->addr); | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kBiasAddInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kBiasAddOutputsNum, kernel_name_); | |||
| const auto *src_addr = reinterpret_cast<float *>(inputs[0]->addr); | |||
| const auto *bias_addr = reinterpret_cast<float *>(inputs[1]->addr); | |||
| auto *output_addr = reinterpret_cast<float *>(outputs[0]->addr); | |||
| if (input_shape_.size() > 2) { | |||
| size_t hw_size = 1; | |||
| @@ -87,11 +90,14 @@ bool BiasAddCPUKernel::Launch(const std::vector<AddressPtr> &inputs, const std:: | |||
| auto task = [&](size_t start, size_t end) { | |||
| for (size_t n = start; n < end; ++n) { | |||
| size_t n_offset = input_shape_[1] * n; | |||
| ElementAdd(src_addr + n_offset, bias_addr, output_addr + n_offset, input_shape_[1]); | |||
| if (ElementAdd(src_addr + n_offset, bias_addr, output_addr + n_offset, input_shape_[1]) != NNACL_OK) { | |||
| MS_LOG(EXCEPTION) << "ElementAdd failed."; | |||
| } | |||
| } | |||
| }; | |||
| ParallelLaunchAutoSearch(task, input_shape_[0], this, ¶llel_search_info_); | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,11 +13,13 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BIAS_ADD_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BIAS_ADD_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -15,11 +15,19 @@ | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/bias_add_grad_cpu_kernel.h" | |||
| #include "nnacl/fp32/reduce_fp32.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/fp32/reduce_fp32.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/errorcode.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kBiasAddGradInputsNum = 1; | |||
| constexpr size_t kBiasAddGradOutputsNum = 1; | |||
| } // namespace | |||
| void BiasAddGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| if (input_shape_.size() < 2) { | |||
| MS_LOG(EXCEPTION) << "Input tensor's rank must be at least 2 for 'BiasAddGrad' Op, but input tensor's rank is " | |||
| @@ -29,11 +37,10 @@ void BiasAddGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool BiasAddGradCPUKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| if (inputs.size() != 1 || outputs.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "input output size not support"; | |||
| } | |||
| auto output_addr = reinterpret_cast<float *>(outputs[0]->addr); | |||
| auto input_addr = reinterpret_cast<float *>(inputs[0]->addr); | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kBiasAddGradInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kBiasAddGradOutputsNum, kernel_name_); | |||
| const auto *input_addr = reinterpret_cast<float *>(inputs[0]->addr); | |||
| auto *output_addr = reinterpret_cast<float *>(outputs[0]->addr); | |||
| if (input_shape_.size() > 2) { | |||
| size_t hw_size = 1; | |||
| @@ -53,7 +60,11 @@ bool BiasAddGradCPUKernel::Launch(const std::vector<AddressPtr> &inputs, const s | |||
| } | |||
| } else if (input_shape_.size() == 2) { | |||
| auto task = [this, input_addr, output_addr](size_t start, size_t end) { | |||
| (void)ReduceSumDim2Axis0(end - start, input_shape_[1], input_shape_[0], input_addr + start, output_addr + start); | |||
| int ret = | |||
| ReduceSumDim2Axis0(end - start, input_shape_[1], input_shape_[0], input_addr + start, output_addr + start); | |||
| if (ret != NNACL_OK) { | |||
| MS_LOG(EXCEPTION) << "ReduceSumDim2Axis0 failed."; | |||
| } | |||
| }; | |||
| ParallelLaunchAutoSearch(task, input_shape_[1], this, ¶llel_search_info_); | |||
| } | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -14,11 +14,12 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BIASADDGRADCPUKERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BIASADDGRADCPUKERNEL_H_ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BIAS_ADD_GRAD_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BIAS_ADD_GRAD_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -39,4 +40,4 @@ class BiasAddGradCPUKernel : public CPUKernel { | |||
| MS_REG_CPU_KERNEL(BiasAddGrad, KernelAttr(), BiasAddGradCPUKernel); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BIASADDGRADCPUKERNEL_H_ | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BIAS_ADD_GRAD_CPU_KERNEL_H_ | |||
| @@ -13,14 +13,19 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr size_t kBceInputNumWithWeight = 3; | |||
| namespace { | |||
| constexpr size_t kBceInputsNumWithWeight = 3; | |||
| constexpr size_t kBceOutputsNum = 1; | |||
| } // namespace | |||
| template <typename T> | |||
| void BinaryCrossEntropyCpuKernel::LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss) { | |||
| void BinaryCrossEntropyCpuKernel::LaunchToScalar(const int &input_size, const int &reduction, T *loss, | |||
| T *tmp_loss) const { | |||
| if (input_size % 2 == 1) { | |||
| tmp_loss[0] += tmp_loss[input_size - 1]; | |||
| } | |||
| @@ -35,83 +40,94 @@ void BinaryCrossEntropyCpuKernel::LaunchToScalar(const int &input_size, const in | |||
| } | |||
| loss[0] = tmp_loss[0]; | |||
| if (reduction == 1) { | |||
| if (reduction == kMean) { | |||
| loss[0] /= static_cast<T>(input_size); | |||
| } | |||
| } | |||
| template <typename T> | |||
| void BinaryCrossEntropyCpuKernel::Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| T *input_x = reinterpret_cast<T *>(inputs[0]->addr); | |||
| T *input_y = reinterpret_cast<T *>(inputs[1]->addr); | |||
| T *weight = nullptr; | |||
| if (weight_defined_) { | |||
| weight = reinterpret_cast<T *>(inputs[2]->addr); | |||
| } | |||
| T *loss = reinterpret_cast<T *>(outputs[0]->addr); | |||
| void BinaryCrossEntropyCpuKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) const { | |||
| const auto *input_x = reinterpret_cast<T *>(inputs[0]->addr); | |||
| const auto *input_y = reinterpret_cast<T *>(inputs[1]->addr); | |||
| const T *weight = weight_defined_ ? reinterpret_cast<T *>(inputs[2]->addr) : nullptr; | |||
| auto *loss = reinterpret_cast<T *>(outputs[0]->addr); | |||
| std::vector<T> tmp_loss(input_size_); | |||
| auto epsilon = static_cast<T>(1e-12); | |||
| auto one = static_cast<T>(1); | |||
| T epsilon = static_cast<T>(1e-12); | |||
| T one = static_cast<T>(1); | |||
| if (reduction_ == 0 && weight_defined_) { | |||
| for (size_t i = 0; i < input_size_; i++) { | |||
| T value = | |||
| -weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon)); | |||
| loss[i] = value; | |||
| } | |||
| } else if (reduction_ == 0 && (!weight_defined_)) { | |||
| for (size_t i = 0; i < input_size_; i++) { | |||
| T value = static_cast<T>( | |||
| -(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon))); | |||
| loss[i] = value; | |||
| } | |||
| } else if ((reduction_ != 0) && weight_defined_) { | |||
| for (size_t i = 0; i < input_size_; i++) { | |||
| T value = | |||
| -weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon)); | |||
| tmp_loss[i] = value; | |||
| if (reduction_ == kNone) { | |||
| if (weight_defined_) { | |||
| for (size_t i = 0; i < input_size_; i++) { | |||
| auto value = static_cast<T>( | |||
| -weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon))); | |||
| loss[i] = value; | |||
| } | |||
| } else { | |||
| for (size_t i = 0; i < input_size_; i++) { | |||
| auto value = static_cast<T>( | |||
| -(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon))); | |||
| loss[i] = value; | |||
| } | |||
| } | |||
| } else { | |||
| for (size_t i = 0; i < input_size_; i++) { | |||
| T value = static_cast<T>( | |||
| -(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon))); | |||
| tmp_loss[i] = value; | |||
| if (weight_defined_) { | |||
| for (size_t i = 0; i < input_size_; i++) { | |||
| auto value = static_cast<T>( | |||
| -weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon))); | |||
| tmp_loss[i] = value; | |||
| } | |||
| } else { | |||
| for (size_t i = 0; i < input_size_; i++) { | |||
| auto value = static_cast<T>( | |||
| -(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon))); | |||
| tmp_loss[i] = value; | |||
| } | |||
| } | |||
| } | |||
| if (reduction_ != 0) { | |||
| if (reduction_ != kNone) { | |||
| LaunchToScalar<T>(input_size_, reduction_, loss, tmp_loss.data()); | |||
| } | |||
| } | |||
| bool BinaryCrossEntropyCpuKernel::Launch(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &workspace, | |||
| bool BinaryCrossEntropyCpuKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| if (input_size_ > 0) { | |||
| if (dtype_ == kNumberTypeFloat32) { | |||
| Launchkernel<float>(inputs, workspace, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat16) { | |||
| Launchkernel<float16>(inputs, workspace, outputs); | |||
| } | |||
| const size_t expect_inputs_num = weight_defined_ ? kBceInputsNumWithWeight : kBceInputsNumWithWeight - 1; | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), expect_inputs_num, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kBceOutputsNum, kernel_name_); | |||
| if (dtype_ == kNumberTypeFloat32) { | |||
| LaunchKernel<float>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat16) { | |||
| LaunchKernel<float16>(inputs, outputs); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << kernel_name_ << " only support float16 and float32 on CPU, but got " | |||
| << TypeIdToType(dtype_)->ToString(); | |||
| } | |||
| return true; | |||
| } | |||
| void BinaryCrossEntropyCpuKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| weight_defined_ = (input_num == kBceInputsNumWithWeight); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| for (size_t i = 0; i < input_shape.size(); i++) { | |||
| input_size_ *= input_shape[i]; | |||
| } | |||
| string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, "reduction"); | |||
| if (reduction == "none") { | |||
| reduction_ = 0; | |||
| } else if (reduction == "sum") { | |||
| reduction_ = 2; | |||
| const std::string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, REDUCTION); | |||
| if (reduction == NONE) { | |||
| reduction_ = kNone; | |||
| } else if (reduction == MEAN) { | |||
| reduction_ = kMean; | |||
| } else if (reduction == SUM) { | |||
| reduction_ = kSum; | |||
| } else { | |||
| MS_LOG(EXCEPTION) << kernel_name_ << "only support the reduction is 'none', 'mean', or 'sum', but got " | |||
| << reduction; | |||
| } | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| weight_defined_ = (input_num == kBceInputNumWithWeight); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -13,19 +13,23 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H | |||
| #include <vector> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| enum ReductionType { kNone, kMean, kSum }; | |||
| class BinaryCrossEntropyCpuKernel : public CPUKernel { | |||
| public: | |||
| BinaryCrossEntropyCpuKernel() : input_size_(1), reduction_(1), weight_defined_(false) {} | |||
| BinaryCrossEntropyCpuKernel() = default; | |||
| ~BinaryCrossEntropyCpuKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| @@ -34,15 +38,14 @@ class BinaryCrossEntropyCpuKernel : public CPUKernel { | |||
| private: | |||
| template <typename T> | |||
| void LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss); | |||
| void LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss) const; | |||
| template <typename T> | |||
| void Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs); | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| size_t input_size_; | |||
| int reduction_; | |||
| bool weight_defined_; // true: there are 3 inputs, false: there are 2 inputs(no [weight]) | |||
| size_t input_size_{1}; | |||
| ReductionType reduction_{kNone}; | |||
| bool weight_defined_{false}; // true: there are 3 inputs, false: there are 2 inputs(no [weight]) | |||
| }; | |||
| MS_REG_CPU_KERNEL(BinaryCrossEntropy, | |||
| KernelAttr() | |||
| @@ -13,28 +13,28 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/binary_cross_entropy_grad_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr size_t kBceGradInputNumWithWeight = 4; | |||
| namespace { | |||
| constexpr size_t kBceGradInputsNumWithWeight = 4; | |||
| constexpr size_t kBceGradOutputsNum = 1; | |||
| } // namespace | |||
| template <typename T> | |||
| void BinaryCrossEntropyGradCpuKernel::Launchkernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| T *input_x = reinterpret_cast<T *>(inputs[0]->addr); | |||
| T *input_y = reinterpret_cast<T *>(inputs[1]->addr); | |||
| T *dloss = reinterpret_cast<T *>(inputs[2]->addr); | |||
| T *weight = nullptr; | |||
| if (weight_defined_) { | |||
| weight = reinterpret_cast<T *>(inputs[3]->addr); | |||
| } | |||
| T *dx = reinterpret_cast<T *>(outputs[0]->addr); | |||
| void BinaryCrossEntropyGradCpuKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) const { | |||
| const auto *input_x = reinterpret_cast<T *>(inputs[0]->addr); | |||
| const auto *input_y = reinterpret_cast<T *>(inputs[1]->addr); | |||
| const auto *dloss = reinterpret_cast<T *>(inputs[2]->addr); | |||
| const T *weight = weight_defined_ ? reinterpret_cast<T *>(inputs[3]->addr) : nullptr; | |||
| auto *dx = reinterpret_cast<T *>(outputs[0]->addr); | |||
| auto epsilon = static_cast<T>(1e-12); | |||
| auto one = static_cast<T>(1); | |||
| T epsilon = static_cast<T>(1e-12); | |||
| T one = static_cast<T>(1); | |||
| if (reduction_ == 0) { | |||
| if (reduction_ == kNone) { | |||
| if (weight_defined_) { | |||
| for (size_t i = 0; i < input_size_; i++) { | |||
| T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon; | |||
| @@ -50,7 +50,7 @@ void BinaryCrossEntropyGradCpuKernel::Launchkernel(const std::vector<AddressPtr> | |||
| } | |||
| } else { | |||
| T dloss1 = dloss[0]; | |||
| if (reduction_ == 1) { | |||
| if (reduction_ == kMean) { | |||
| dloss1 = dloss[0] / static_cast<T>(input_size_); | |||
| } | |||
| if (weight_defined_) { | |||
| @@ -69,34 +69,44 @@ void BinaryCrossEntropyGradCpuKernel::Launchkernel(const std::vector<AddressPtr> | |||
| } | |||
| } | |||
| bool BinaryCrossEntropyGradCpuKernel::Launch(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &workspace, | |||
| bool BinaryCrossEntropyGradCpuKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| if (input_size_ > 0) { | |||
| if (dtype_ == kNumberTypeFloat32) { | |||
| Launchkernel<float>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat16) { | |||
| Launchkernel<float16>(inputs, outputs); | |||
| } | |||
| const size_t expect_inputs_num = weight_defined_ ? kBceGradInputsNumWithWeight : kBceGradInputsNumWithWeight - 1; | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), expect_inputs_num, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kBceGradOutputsNum, kernel_name_); | |||
| if (dtype_ == kNumberTypeFloat32) { | |||
| LaunchKernel<float>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat16) { | |||
| LaunchKernel<float16>(inputs, outputs); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << kernel_name_ << " only support float16 and float32 on CPU, but got " | |||
| << TypeIdToType(dtype_)->ToString(); | |||
| } | |||
| return true; | |||
| } | |||
| void BinaryCrossEntropyGradCpuKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| weight_defined_ = (input_num == kBceGradInputsNumWithWeight); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| for (size_t i = 0; i < input_shape.size(); i++) { | |||
| input_size_ *= input_shape[i]; | |||
| } | |||
| string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, "reduction"); | |||
| if (reduction == "none") { | |||
| reduction_ = 0; | |||
| } else if (reduction == "sum") { | |||
| reduction_ = 2; | |||
| } | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| weight_defined_ = (input_num == kBceGradInputNumWithWeight); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| const std::string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, REDUCTION); | |||
| if (reduction == NONE) { | |||
| reduction_ = kNone; | |||
| } else if (reduction == MEAN) { | |||
| reduction_ = kMean; | |||
| } else if (reduction == SUM) { | |||
| reduction_ = kSum; | |||
| } else { | |||
| MS_LOG(EXCEPTION) << kernel_name_ << "only support the reduction is 'none', 'mean', or 'sum', but got " | |||
| << reduction; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -13,19 +13,22 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H | |||
| #include <vector> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| #include "backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class BinaryCrossEntropyGradCpuKernel : public CPUKernel { | |||
| public: | |||
| BinaryCrossEntropyGradCpuKernel() : input_size_(1), reduction_(1), weight_defined_(false) {} | |||
| BinaryCrossEntropyGradCpuKernel() = default; | |||
| ~BinaryCrossEntropyGradCpuKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| @@ -34,12 +37,12 @@ class BinaryCrossEntropyGradCpuKernel : public CPUKernel { | |||
| private: | |||
| template <typename T> | |||
| void Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| size_t input_size_; | |||
| int reduction_; | |||
| bool weight_defined_; // true: there are 4 inputs, false: there are 3 inputs(no [weight]) | |||
| size_t input_size_{1}; | |||
| ReductionType reduction_{kNone}; | |||
| bool weight_defined_{false}; // true: there are 4 inputs, false: there are 3 inputs(no [weight]) | |||
| }; | |||
| MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad, | |||
| KernelAttr() | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/boundingbox_decode_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| @@ -13,10 +13,13 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BOUNDINGBOX_DECODE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BOUNDINGBOX_DECODE_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <algorithm> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/boundingbox_encode_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| @@ -13,10 +13,13 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BOUNDINGBOX_ENCODE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BOUNDINGBOX_ENCODE_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <algorithm> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -15,13 +15,19 @@ | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/broadcast_to_cpu_kernel.h" | |||
| #include "nnacl/errorcode.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/errorcode.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kBroadcastToInputsNum = 1; | |||
| constexpr size_t kBroadcastToOutputsNum = 1; | |||
| } // namespace | |||
| template <typename T> | |||
| void BroadcastToCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| size_t input_shape_size = input_shape_.size(); | |||
| @@ -55,35 +61,26 @@ void BroadcastToCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| template <typename T> | |||
| bool BroadcastToCPUKernel<T>::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| if (inputs.size() != 1 || outputs.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "Wrong number of inputs or outputs!"; | |||
| } | |||
| if ((inputs[0] == nullptr) || (inputs[0]->size == 0)) { | |||
| MS_LOG(EXCEPTION) << "Input data is NULL!"; | |||
| } | |||
| if ((outputs[0] == nullptr) || (outputs[0]->size == 0)) { | |||
| MS_LOG(EXCEPTION) << "Output data is NULL!"; | |||
| } | |||
| const auto input_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| int ret = static_cast<int>(NNACL_ERR); | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kBroadcastToInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kBroadcastToOutputsNum, kernel_name_); | |||
| const auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| int status = static_cast<int>(NNACL_OK); | |||
| if constexpr (std::is_same_v<T, bool>) { | |||
| ret = BroadcastTo(bool, input_addr, &shape_info_, output_addr); | |||
| status = BROADCAST_TO(bool, input_addr, &shape_info_, output_addr); | |||
| } else if constexpr (std::is_same_v<T, int>) { | |||
| ret = BroadcastTo(int, input_addr, &shape_info_, output_addr); | |||
| status = BROADCAST_TO(int, input_addr, &shape_info_, output_addr); | |||
| } else if constexpr (std::is_same_v<T, float>) { | |||
| ret = BroadcastTo(float, input_addr, &shape_info_, output_addr); | |||
| status = BROADCAST_TO(float, input_addr, &shape_info_, output_addr); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Not supported data type for BroadcastTo."; | |||
| } | |||
| if (ret == NNACL_OK) { | |||
| return true; | |||
| if (status != static_cast<int>(NNACL_OK)) { | |||
| MS_LOG(EXCEPTION) << "Broadcast tensor with shape " << input_shape_ << " to shape " << output_shape_ | |||
| << " execute failed, error code: " << status; | |||
| } | |||
| MS_LOG(ERROR) << "Broadcast tensor with shape " << input_shape_ << " to shape " << output_shape_ | |||
| << " execute failed."; | |||
| return false; | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -14,14 +14,15 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_BROADCAST_TO_CPU_KERNEL_H | |||
| #define MINDSPORE_BROADCAST_TO_CPU_KERNEL_H | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BROADCAST_TO_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BROADCAST_TO_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| #include "nnacl/base/broadcast_to.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/base/broadcast_to.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| @@ -38,7 +39,7 @@ class BroadcastToCPUKernel : public CPUKernel { | |||
| private: | |||
| std::vector<size_t> input_shape_; | |||
| std::vector<size_t> output_shape_; | |||
| BroadcastShapeInfo shape_info_; | |||
| BroadcastShapeInfo shape_info_{}; | |||
| }; | |||
| MS_REG_CPU_KERNEL_T(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| @@ -50,4 +51,4 @@ MS_REG_CPU_KERNEL_T(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeBool).AddO | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_BROADCAST_TO_CPU_KERNEL_H | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_BROADCAST_TO_CPU_KERNEL_H_ | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,14 +13,22 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/cast_cpu_kernel.h" | |||
| #include <cmath> | |||
| #include <map> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/cpu/cast_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kCastInputsNum = 1; | |||
| constexpr size_t kCastOutputsNum = 1; | |||
| } // namespace | |||
| template <typename S, typename T> | |||
| void Cast(const S *in, T *out, size_t size) { | |||
| auto task = [&in, &out](size_t start, size_t end) { | |||
| @@ -34,6 +42,7 @@ void Cast(const S *in, T *out, size_t size) { | |||
| template <typename S, typename T> | |||
| void CastCPUKernel<S, T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| source_dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| target_dtype_ = AnfAlgo::GetOutputDeviceDataType(kernel_node, 0); | |||
| } | |||
| @@ -41,17 +50,14 @@ void CastCPUKernel<S, T>::InitKernel(const CNodePtr &kernel_node) { | |||
| template <typename S, typename T> | |||
| bool CastCPUKernel<S, T>::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() != 1 || outputs.size() != 1) { | |||
| MS_LOG(ERROR) << "Cast requires 1 input and 1 output, but got " << inputs.size() << " input and " << outputs.size() | |||
| << " output."; | |||
| return false; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kCastInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kCastOutputsNum, kernel_name_); | |||
| if (outputs[0]->size == 0) { | |||
| MS_LOG(WARNING) << "Cast output memory size should be greater than 0, but got 0."; | |||
| return true; | |||
| } | |||
| const auto input = reinterpret_cast<S *>(inputs[0]->addr); | |||
| const auto output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| const auto *input = reinterpret_cast<S *>(inputs[0]->addr); | |||
| auto *output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| MS_LOG(DEBUG) << "Type source: " << typeid(S).name() << "; target: " << typeid(T).name(); | |||
| Cast<S, T>(input, output, outputs[0]->size / sizeof(T)); | |||
| return true; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,11 +13,14 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CAST_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CAST_CPU_KERNEL_H_ | |||
| #include <functional> | |||
| #include <memory> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -13,6 +13,9 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <functional> | |||
| #include "backend/kernel_compiler/cpu/check_valid_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| @@ -13,9 +13,12 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CHECK_VALID_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CHECK_VALID_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -19,11 +19,15 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kConcatOutputsNum = 1; | |||
| } // namespace | |||
| template <typename T> | |||
| void ConcatCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| node_wpt_ = kernel_node; | |||
| CheckParam(kernel_node); | |||
| axis_ = LongToInt(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS)); | |||
| auto input_1_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| if (axis_ < 0) { | |||
| @@ -34,15 +38,18 @@ void ConcatCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| template <typename T> | |||
| bool ConcatCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| auto node_ = node_wpt_.lock(); | |||
| if (!node_) { | |||
| auto node = node_wpt_.lock(); | |||
| if (!node) { | |||
| MS_LOG(EXCEPTION) << "node_wpt_ is expired."; | |||
| } | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(node_); | |||
| const size_t input_num = AnfAlgo::GetInputTensorNum(node); | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), input_num, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kConcatOutputsNum, kernel_name_); | |||
| std::vector<std::vector<size_t>> input_flat_shape_list; | |||
| input_flat_shape_list.reserve(input_num); | |||
| for (size_t i = 0; i < input_num; i++) { | |||
| auto input_shape_i = AnfAlgo::GetPrevNodeOutputInferShape(node_, i); | |||
| auto input_shape_i = AnfAlgo::GetPrevNodeOutputInferShape(node, i); | |||
| auto flat_shape = CPUKernelUtils::FlatShapeByAxis(input_shape_i, axis_); | |||
| (void)input_flat_shape_list.emplace_back(flat_shape); | |||
| } | |||
| @@ -51,10 +58,10 @@ bool ConcatCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, c | |||
| for (size_t j = 0; j < input_num; ++j) { | |||
| output_dim_1 += input_flat_shape_list[j][1]; | |||
| } | |||
| auto output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| std::vector<T *> input_addr_list; | |||
| for (size_t j = 0; j < input_num; ++j) { | |||
| auto tmp_addr = reinterpret_cast<T *>(inputs[j]->addr); | |||
| auto *tmp_addr = reinterpret_cast<T *>(inputs[j]->addr); | |||
| (void)input_addr_list.emplace_back(tmp_addr); | |||
| } | |||
| // each input's row of shape after flat are same | |||
| @@ -69,7 +76,10 @@ bool ConcatCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, c | |||
| auto copy_num = input_flat_shape_list[j][1]; | |||
| auto copy_size = copy_num * sizeof(T); | |||
| auto offset = copy_num * i; | |||
| (void)memcpy_s(output_ptr, copy_size, input_addr_list[j] + offset, copy_size); | |||
| auto ret = memcpy_s(output_ptr, copy_size, input_addr_list[j] + offset, copy_size); | |||
| if (ret != EOK) { | |||
| MS_LOG(EXCEPTION) << "Memcpy failed."; | |||
| } | |||
| output_ptr += copy_num; | |||
| } | |||
| } | |||
| @@ -77,13 +87,5 @@ bool ConcatCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, c | |||
| ParallelLaunchAutoSearch(task, before_axis, this, ¶llel_search_info_); | |||
| return true; | |||
| } | |||
| template <typename T> | |||
| void ConcatCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) const { | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but ConcatCPUKernel needs 1 output."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,10 +13,13 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CONCAT_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CONCAT_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -34,8 +37,7 @@ class ConcatCPUKernel : public CPUKernel { | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node) const; | |||
| int axis_ = 0; | |||
| int axis_{0}; | |||
| CNodeWeakPtr node_wpt_; | |||
| }; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,10 +13,13 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include <algorithm> | |||
| #include <utility> | |||
| #include <cmath> | |||
| #include "common/thread_pool.h" | |||
| #include "utils/profile.h" | |||
| @@ -52,10 +55,11 @@ void CPUKernel::Init(const CNodePtr &kernel_node) { | |||
| } | |||
| void CPUKernelUtils::ExpandDimsTo4(std::vector<size_t> *shape) { | |||
| MS_EXCEPTION_IF_NULL(shape); | |||
| auto len = shape->size(); | |||
| if (len < 4) { | |||
| for (size_t i = 0; i < 4 - len; ++i) { | |||
| shape->insert(shape->begin(), 1); | |||
| (void)shape->insert(shape->begin(), 1); | |||
| } | |||
| } | |||
| } | |||
| @@ -79,6 +83,7 @@ size_t CPUKernelUtils::GetElementNumOnAxis(const std::vector<size_t> &shape, int | |||
| void CPUKernelUtils::GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num) { | |||
| size_t accumulation = 1; | |||
| MS_EXCEPTION_IF_NULL(element_num); | |||
| (void)element_num->emplace_back(1); | |||
| for (size_t i = shape.size() - 1; i > 0; --i) { | |||
| accumulation *= shape[i]; | |||
| @@ -112,6 +117,7 @@ void CPUKernelUtils::ParallelFor(const CTask &task, size_t count, float block_si | |||
| void CPUKernelUtils::ParallelForAutoSearch(const CTask &task, size_t count, ParallelSearchInfo *parallel_search_info) { | |||
| const size_t MAX_POW = 6; | |||
| const size_t AVG_COUNT = 5; | |||
| MS_EXCEPTION_IF_NULL(parallel_search_info); | |||
| size_t current_pow = parallel_search_info->search_count / AVG_COUNT; | |||
| if (current_pow < MAX_POW) { | |||
| if (parallel_search_info->search_count % AVG_COUNT == 0) { | |||
| @@ -276,12 +282,12 @@ void BroadcastIterator::GenNextPos() { | |||
| void BroadcastIterator::BroadcastShape() { | |||
| int input_dimension_a = input_shape_a_.size(); | |||
| if (input_dimension_a < output_dimension_) { | |||
| input_shape_a_.insert(input_shape_a_.begin(), output_dimension_ - input_dimension_a, 1); | |||
| (void)input_shape_a_.insert(input_shape_a_.begin(), IntToSize(output_dimension_ - input_dimension_a), 1); | |||
| } | |||
| int input_dimension_b = input_shape_b_.size(); | |||
| if (input_dimension_b < output_dimension_) { | |||
| input_shape_b_.insert(input_shape_b_.begin(), output_dimension_ - input_dimension_b, 1); | |||
| (void)input_shape_b_.insert(input_shape_b_.begin(), IntToSize(output_dimension_ - input_dimension_b), 1); | |||
| } | |||
| } | |||
| @@ -297,10 +303,10 @@ void BroadcastIterator::InitStrides() { | |||
| // Update strides for broadcast | |||
| // While the axis value is 1, the stride is 0 | |||
| std::transform(input_strides_a_.begin(), input_strides_a_.end(), input_shape_a_.begin(), input_strides_a_.begin(), | |||
| [](const auto &a, const auto &b) { return b == 1 ? 0 : a; }); | |||
| std::transform(input_strides_b_.begin(), input_strides_b_.end(), input_shape_b_.begin(), input_strides_b_.begin(), | |||
| [](const auto &a, const auto &b) { return b == 1 ? 0 : a; }); | |||
| (void)std::transform(input_strides_a_.begin(), input_strides_a_.end(), input_shape_a_.begin(), | |||
| input_strides_a_.begin(), [](const auto &a, const auto &b) { return b == 1 ? 0 : a; }); | |||
| (void)std::transform(input_strides_b_.begin(), input_strides_b_.end(), input_shape_b_.begin(), | |||
| input_strides_b_.begin(), [](const auto &a, const auto &b) { return b == 1 ? 0 : a; }); | |||
| } | |||
| TransposeIterator::TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes, | |||
| @@ -13,14 +13,17 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_ | |||
| #include <functional> | |||
| #include <memory> | |||
| #include <numeric> | |||
| #include <string> | |||
| #include <thread> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "backend/kernel_compiler/common_utils.h" | |||
| @@ -33,106 +36,61 @@ using mindspore::kernel::AddressPtr; | |||
| using CTask = std::function<void(size_t, size_t)>; | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| const char KERNEL_SIZE[] = "kernel_size"; | |||
| const char STRIDE[] = "stride"; | |||
| const char STRIDES[] = "strides"; | |||
| const char DILATION[] = "dilation"; | |||
| const char DILATIONS[] = "dilations"; | |||
| const char FORMAT[] = "format"; | |||
| const char PAD[] = "pad"; | |||
| const char PAD_LIST[] = "pad_list"; | |||
| const char PAD_MODE[] = "pad_mode"; | |||
| const char PAD_MODE_LOWER_SAME[] = "same"; | |||
| const char PAD_MODE_LOWER_VALID[] = "valid"; | |||
| const char PAD_MODE_UPPER_SAME[] = "SAME"; | |||
| const char PAD_MODE_UPPER_VALID[] = "VALID"; | |||
| const char TRANSPOSE_A[] = "transpose_a"; | |||
| const char TRANSPOSE_B[] = "transpose_b"; | |||
| const char IS_GRAD[] = "is_grad"; | |||
| const char TRANSPOSE_NO = 'N'; | |||
| const char TRANSPOSE_YES = 'T'; | |||
| const char AXIS[] = "axis"; | |||
| const char DIM[] = "dim"; | |||
| const char BEGIN[] = "begin"; | |||
| const char END[] = "end"; | |||
| const char SIZE[] = "size"; | |||
| const char USE_NESTEROV[] = "use_nesterov"; | |||
| const char GROUP[] = "group"; | |||
| const char START[] = "start"; | |||
| const char LIMIT[] = "limit"; | |||
| const char DELTA[] = "delta"; | |||
| const char SORTED[] = "sorted"; | |||
| const char ADJ_ST[] = "adjoint_st"; | |||
| const char ADJ_dT[] = "adjoint_dt"; | |||
| const char PERIODS[] = "periods"; | |||
| const char WINDOW[] = "window"; | |||
| const char MIN_PERIODS[] = "min_periods"; | |||
| const char CENTER[] = "center"; | |||
| const char METHOD[] = "method"; | |||
| const char CLOSED[] = "closed"; | |||
| const char NA_OPTION[] = "na_option"; | |||
| const char ASCENDING[] = "ascending"; | |||
| const char PCT[] = "pct"; | |||
| enum OperateType { | |||
| ADD = 0, | |||
| SUB, | |||
| MUL, | |||
| DIV, | |||
| SQUARE, | |||
| SQRT, | |||
| POW, | |||
| REALDIV, | |||
| FLOORDIV, | |||
| MOD, | |||
| FLOORMOD, | |||
| NEG, | |||
| LESS, | |||
| ASSIGNADD, | |||
| RELUGRAD, | |||
| RELU6GRAD, | |||
| ABSGRAD, | |||
| TANHGRAD, | |||
| SQRTGRAD, | |||
| SIGMOIDGRAD, | |||
| ONESLIKE, | |||
| ZEROSLIKE, | |||
| SIGN, | |||
| EQUAL, | |||
| NOTEQUAL, | |||
| LESSEQUAL, | |||
| LOGICALAND, | |||
| LOGICALOR, | |||
| LOGICALNOT, | |||
| FLOOR, | |||
| SQUAREDDIFFERENCE, | |||
| GREATER, | |||
| GREATEREQUAL, | |||
| RECIPROCAL, | |||
| GELU, | |||
| GELUGRAD, | |||
| ASIN, | |||
| ACOS, | |||
| ATAN, | |||
| ASINGRAD, | |||
| ACOSGRAD, | |||
| ATANGRAD, | |||
| SIN, | |||
| COS, | |||
| TAN, | |||
| SINH, | |||
| COSH, | |||
| ASINH, | |||
| ACOSH, | |||
| ATANH, | |||
| ASINHGRAD, | |||
| ACOSHGRAD, | |||
| ATAN2, | |||
| RINT, | |||
| ROUND, | |||
| EXP, | |||
| IDENTITY, | |||
| }; | |||
| constexpr char KERNEL_SIZE[] = "kernel_size"; | |||
| constexpr char STRIDE[] = "stride"; | |||
| constexpr char STRIDES[] = "strides"; | |||
| constexpr char DILATION[] = "dilation"; | |||
| constexpr char DILATIONS[] = "dilations"; | |||
| constexpr char FORMAT[] = "format"; | |||
| constexpr char PAD[] = "pad"; | |||
| constexpr char PAD_LIST[] = "pad_list"; | |||
| constexpr char PAD_MODE[] = "pad_mode"; | |||
| constexpr char PAD_MODE_LOWER_SAME[] = "same"; | |||
| constexpr char PAD_MODE_LOWER_VALID[] = "valid"; | |||
| constexpr char PAD_MODE_UPPER_SAME[] = "SAME"; | |||
| constexpr char PAD_MODE_UPPER_VALID[] = "VALID"; | |||
| constexpr char TRANSPOSE_A[] = "transpose_a"; | |||
| constexpr char TRANSPOSE_B[] = "transpose_b"; | |||
| constexpr char IS_GRAD[] = "is_grad"; | |||
| constexpr char TRANSPOSE_NO = 'N'; | |||
| constexpr char TRANSPOSE_YES = 'T'; | |||
| constexpr char AXIS[] = "axis"; | |||
| constexpr char DIM[] = "dim"; | |||
| constexpr char NUM[] = "num"; | |||
| constexpr char BEGIN[] = "begin"; | |||
| constexpr char END[] = "end"; | |||
| constexpr char SIZE[] = "size"; | |||
| constexpr char USE_NESTEROV[] = "use_nesterov"; | |||
| constexpr char GROUP[] = "group"; | |||
| constexpr char START[] = "start"; | |||
| constexpr char LIMIT[] = "limit"; | |||
| constexpr char DELTA[] = "delta"; | |||
| constexpr char SORTED[] = "sorted"; | |||
| constexpr char ADJ_ST[] = "adjoint_st"; | |||
| constexpr char ADJ_dT[] = "adjoint_dt"; | |||
| constexpr char REDUCTION[] = "reduction"; | |||
| constexpr char NONE[] = "none"; | |||
| constexpr char SUM[] = "sum"; | |||
| constexpr char MEAN[] = "mean"; | |||
| constexpr char BETA[] = "beta"; | |||
| constexpr char EXCLUSIVE[] = "exclusive"; | |||
| constexpr char REVERSE[] = "reverse"; | |||
| constexpr char PCR[] = "preprocess_collapse_repeated"; | |||
| constexpr char CTR[] = "ctc_merge_repeated"; | |||
| constexpr char ILOTI[] = "ignore_longer_outputs_than_inputs"; | |||
| constexpr char MOMENTUM[] = "momentum"; | |||
| constexpr char RHO[] = "rho"; | |||
| constexpr char EPSILON[] = "epsilon"; | |||
| constexpr char ALIGN_CORNERS[] = "align_corners"; | |||
| constexpr char PERIODS[] = "periods"; | |||
| constexpr char WINDOW[] = "window"; | |||
| constexpr char MIN_PERIODS[] = "min_periods"; | |||
| constexpr char CENTER[] = "center"; | |||
| constexpr char METHOD[] = "method"; | |||
| constexpr char CLOSED[] = "closed"; | |||
| constexpr char NA_OPTION[] = "na_option"; | |||
| constexpr char ASCENDING[] = "ascending"; | |||
| constexpr char PCT[] = "pct"; | |||
| struct ParallelSearchInfo { | |||
| double min_cost_time{DBL_MAX}; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -25,7 +25,10 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| const std::set<std::string> same_op_name = {"Concat", "Pack", "Stack", "Split", "Transpose", "Unpack", "AddN"}; | |||
| } // namespace | |||
| CPUKernelFactory &CPUKernelFactory::GetInstance() { | |||
| static CPUKernelFactory instance; | |||
| return instance; | |||
| @@ -40,6 +43,7 @@ void CPUKernelFactory::Register(const std::string &kernel_name, const KernelAttr | |||
| } | |||
| std::shared_ptr<CPUKernel> CPUKernelFactory::Create(const std::string &kernel_name, const CNodePtr &apply_kernel) { | |||
| MS_EXCEPTION_IF_NULL(apply_kernel); | |||
| auto kernel_info = dynamic_cast<device::KernelInfo *>(apply_kernel->kernel_info()); | |||
| MS_EXCEPTION_IF_NULL(kernel_info); | |||
| const KernelBuildInfo *kernel_build_Info = kernel_info->select_kernel_build_info(); | |||
| @@ -53,6 +57,8 @@ std::shared_ptr<CPUKernel> CPUKernelFactory::Create(const std::string &kernel_na | |||
| void CPUKernelFactory::SetKernelAttrs(const std::shared_ptr<kernel::OpInfo> op_info, | |||
| std::vector<KernelAttr> *kernel_attrs) { | |||
| MS_EXCEPTION_IF_NULL(kernel_attrs); | |||
| MS_EXCEPTION_IF_NULL(op_info); | |||
| auto inputs_ptr = op_info->inputs_ptr(); | |||
| auto outputs_ptr = op_info->outputs_ptr(); | |||
| if (inputs_ptr.empty()) { | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_FACTORY_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_FACTORY_H_ | |||
| @@ -23,15 +24,16 @@ | |||
| #include <utility> | |||
| #include <vector> | |||
| #include "utils/ms_utils.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/oplib/oplib.h" | |||
| #include "runtime/device/cpu/kernel_select_cpu.h" | |||
| #include "utils/ms_utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| using mindspore::device::cpu::KernelAttr; | |||
| using CPUKernelCreator = std::function<std::shared_ptr<CPUKernel>()>; | |||
| class CPUKernelFactory { | |||
| public: | |||
| static CPUKernelFactory &GetInstance(); | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/crop_and_resize_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| @@ -13,11 +13,14 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CROP_AND_RESIZE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CROP_AND_RESIZE_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <string> | |||
| #include <algorithm> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -19,10 +19,62 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kCTCLossInputsNum = 4; | |||
| constexpr size_t kCTCLossOutputsNum = 2; | |||
| template <typename T> | |||
| inline T LogSumExp(const T logprob1, const T logprob2) { | |||
| T kLogZero_ = -std::numeric_limits<T>::infinity(); | |||
| if (logprob1 <= kLogZero_) { | |||
| return logprob2; | |||
| } | |||
| if (logprob2 <= kLogZero_) { | |||
| return logprob1; | |||
| } | |||
| return (logprob1 > logprob2) ? logprob1 + static_cast<T>(log1p(exp(logprob2 - logprob1))) | |||
| : logprob2 + static_cast<T>(log1p(exp(logprob1 - logprob2))); | |||
| } | |||
| template <typename T> | |||
| void InnerSoftMax(const T *inputs_addr, std::vector<std::vector<T>> *softmax_probs, const uint32_t sequence_length, | |||
| size_t num_class, size_t batch_size, size_t b) { | |||
| for (size_t t = 0; t < sequence_length; ++t) { | |||
| auto maxCoeff = static_cast<T>(0); | |||
| auto sumCoeff = static_cast<T>(0); | |||
| for (size_t c = 0; c < num_class; ++c) { | |||
| if (inputs_addr[t * batch_size * num_class + b * num_class + c] > maxCoeff) { | |||
| maxCoeff = inputs_addr[t * batch_size * num_class + b * num_class + c]; | |||
| } | |||
| } | |||
| for (size_t c = 0; c < num_class; ++c) { | |||
| sumCoeff += static_cast<T>(exp(inputs_addr[t * batch_size * num_class + b * num_class + c] - maxCoeff)); | |||
| (*softmax_probs)[c][t] = | |||
| static_cast<T>(exp(inputs_addr[t * batch_size * num_class + b * num_class + c] - maxCoeff)); | |||
| } | |||
| for (size_t c = 0; c < num_class; ++c) { | |||
| (*softmax_probs)[c][t] /= sumCoeff; | |||
| } | |||
| } | |||
| } | |||
| template <typename T> | |||
| void MatrixFromVector(uint32_t row, uint32_t col, std::vector<std::vector<T>> *array2D, const T init_value) { | |||
| array2D->resize(row); | |||
| for (size_t i = 0; i < row; ++i) { | |||
| (*array2D)[i].resize(col, init_value); | |||
| } | |||
| } | |||
| } // namespace | |||
| void CTCLossCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| CheckParam(kernel_node); | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| probs_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| indice_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| indices_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| labels_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| @@ -32,14 +84,13 @@ void CTCLossCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| if (labels_dims_.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "Labels dims: " << labels_dims_.size() << " not support."; | |||
| } | |||
| if (indice_dims_.size() != 2) { | |||
| MS_LOG(EXCEPTION) << "Labels indice dims: " << indice_dims_.size() << " not support."; | |||
| if (indices_dims_.size() != 2) { | |||
| MS_LOG(EXCEPTION) << "Labels indice dims: " << indices_dims_.size() << " not support."; | |||
| } | |||
| preprocess_collapse_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "preprocess_collapse_repeated"); | |||
| ctc_merge_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "ctc_merge_repeated"); | |||
| ignore_longer_outputs_than_inputs_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "ignore_longer_outputs_than_inputs"); | |||
| preprocess_collapse_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, PCR); | |||
| ctc_merge_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, CTR); | |||
| ignore_longer_outputs_than_inputs_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, ILOTI); | |||
| max_time_ = probs_shape_[0]; | |||
| batch_size_ = probs_shape_[1]; | |||
| num_class_ = probs_shape_[2]; | |||
| @@ -48,31 +99,23 @@ void CTCLossCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool CTCLossCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kCTCLossInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kCTCLossOutputsNum, kernel_name_); | |||
| if (dtype_ == kNumberTypeFloat16) { | |||
| LaunchKernel<float16>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat32) { | |||
| LaunchKernel<float>(inputs, outputs); | |||
| } | |||
| return true; | |||
| } | |||
| template <typename T> | |||
| inline T LogSumExp(const T logprob1, const T logprob2) { | |||
| T kLogZero_ = -std::numeric_limits<T>::infinity(); | |||
| if (logprob1 <= kLogZero_) { | |||
| return logprob2; | |||
| } else if (logprob2 <= kLogZero_) { | |||
| return logprob1; | |||
| } else { | |||
| return (logprob1 > logprob2) ? logprob1 + static_cast<T>(log1p(exp(logprob2 - logprob1))) | |||
| : logprob2 + static_cast<T>(log1p(exp(logprob1 - logprob2))); | |||
| MS_LOG(EXCEPTION) << kernel_name_ << " only support float16 and float32 on CPU, but got " | |||
| << TypeIdToType(dtype_)->ToString(); | |||
| } | |||
| return true; | |||
| } | |||
| template <typename TT> | |||
| void CTCLossCPUKernel::CalculateFwdVar(const std::vector<uint32_t> &label_with_blank, | |||
| const std::vector<std::vector<TT>> &y, | |||
| std::vector<std::vector<TT>> *log_alpha_b) { | |||
| std::vector<std::vector<TT>> *log_alpha_b) const { | |||
| int U = label_with_blank.size(); | |||
| int T = (*log_alpha_b)[0].size(); | |||
| TT kLogZero_ = -std::numeric_limits<TT>::infinity(); | |||
| @@ -112,7 +155,7 @@ void CTCLossCPUKernel::CalculateFwdVar(const std::vector<uint32_t> &label_with_b | |||
| template <typename TT> | |||
| void CTCLossCPUKernel::CalculateBwdVar(const std::vector<uint32_t> &label_with_blank, | |||
| const std::vector<std::vector<TT>> &y, | |||
| std::vector<std::vector<TT>> *log_beta_b) { | |||
| std::vector<std::vector<TT>> *log_beta_b) const { | |||
| int T = (*log_beta_b)[0].size(); | |||
| int U = label_with_blank.size(); | |||
| if (U > 1) { | |||
| @@ -154,7 +197,7 @@ void CTCLossCPUKernel::CalculateGrad(const std::vector<uint32_t> &label_with_bla | |||
| const std::vector<std::vector<TT>> &y, | |||
| const std::vector<std::vector<TT>> &log_alpha_b, | |||
| const std::vector<std::vector<TT>> &log_beta_b, const TT log_pzx, | |||
| std::vector<std::vector<TT>> *dy) { | |||
| std::vector<std::vector<TT>> *dy) const { | |||
| auto dy_b = dy; | |||
| TT kLogZero_ = -std::numeric_limits<TT>::infinity(); | |||
| if (log_pzx <= kLogZero_) { | |||
| @@ -179,8 +222,8 @@ void CTCLossCPUKernel::CalculateGrad(const std::vector<uint32_t> &label_with_bla | |||
| } | |||
| } | |||
| void CTCLossCPUKernel::GenLableWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label, | |||
| std::vector<std::vector<uint32_t>> *label_with_blank) { | |||
| void CTCLossCPUKernel::GenLabelWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label, | |||
| std::vector<std::vector<uint32_t>> *label_with_blank) const { | |||
| for (size_t b = 0; b < batch_size_; ++b) { | |||
| std::vector<uint32_t> l; | |||
| const std::vector<uint32_t> &label = batch_label[b]; | |||
| @@ -197,11 +240,9 @@ void CTCLossCPUKernel::GenLableWithBlank(const uint32_t *seq_len, const std::vec | |||
| } | |||
| } | |||
| } | |||
| if (!ignore_longer_outputs_than_inputs_) { | |||
| if (l.size() > seq_len[b]) { | |||
| MS_LOG(EXCEPTION) << "Input time(sequence length) should greater than output size(label length), but gets " | |||
| << seq_len[b] << "< " << l.size(); | |||
| } | |||
| if (!ignore_longer_outputs_than_inputs_ && l.size() > seq_len[b]) { | |||
| MS_LOG(EXCEPTION) << "Input time(sequence length) should greater than output size(label length), but gets " | |||
| << seq_len[b] << "< " << l.size(); | |||
| } | |||
| (*label_with_blank)[b].reserve(2 * l.size() + 1); | |||
| @@ -214,46 +255,14 @@ void CTCLossCPUKernel::GenLableWithBlank(const uint32_t *seq_len, const std::vec | |||
| } | |||
| template <typename T> | |||
| void InnerSoftMax(const T *inputs_addr, std::vector<std::vector<T>> *softmax_probs, const uint32_t sequence_length, | |||
| size_t num_class, size_t batch_size, size_t b) { | |||
| for (size_t t = 0; t < sequence_length; ++t) { | |||
| T maxCoeff(T(0)); | |||
| T sumCoeff(T(0)); | |||
| for (size_t c = 0; c < num_class; ++c) { | |||
| if (inputs_addr[t * batch_size * num_class + b * num_class + c] > maxCoeff) { | |||
| maxCoeff = inputs_addr[t * batch_size * num_class + b * num_class + c]; | |||
| } | |||
| } | |||
| for (size_t c = 0; c < num_class; ++c) { | |||
| sumCoeff += static_cast<T>(exp(inputs_addr[t * batch_size * num_class + b * num_class + c] - maxCoeff)); | |||
| (*softmax_probs)[c][t] = | |||
| static_cast<T>(exp(inputs_addr[t * batch_size * num_class + b * num_class + c] - maxCoeff)); | |||
| } | |||
| for (size_t c = 0; c < num_class; ++c) { | |||
| (*softmax_probs)[c][t] /= sumCoeff; | |||
| } | |||
| } | |||
| } | |||
| template <typename T> | |||
| void MatrixfromVector(uint32_t row, uint32_t col, std::vector<std::vector<T>> *array2D, const T init_value) { | |||
| array2D->resize(row); | |||
| for (size_t i = 0; i < row; ++i) { | |||
| (*array2D)[i].resize(col, init_value); | |||
| } | |||
| } | |||
| template <typename T> | |||
| void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) { | |||
| auto inputs_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto labels_indices_addr = reinterpret_cast<uint64_t *>(inputs[1]->addr); | |||
| auto labels_values_addr = reinterpret_cast<uint32_t *>(inputs[2]->addr); | |||
| auto sequence_length_addr = reinterpret_cast<uint32_t *>(inputs[3]->addr); | |||
| auto loss_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| auto gradient_addr = reinterpret_cast<T *>(outputs[1]->addr); | |||
| void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) const { | |||
| const auto *inputs_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| const auto *labels_indices_addr = reinterpret_cast<uint64_t *>(inputs[1]->addr); | |||
| const auto *labels_values_addr = reinterpret_cast<uint32_t *>(inputs[2]->addr); | |||
| const auto *sequence_length_addr = reinterpret_cast<uint32_t *>(inputs[3]->addr); | |||
| auto *loss_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| auto *gradient_addr = reinterpret_cast<T *>(outputs[1]->addr); | |||
| std::vector<std::vector<uint32_t>> label_batch; | |||
| std::vector<std::vector<uint32_t>> labels_with_blank; | |||
| @@ -266,18 +275,21 @@ void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const | |||
| T kLogZero_ = -std::numeric_limits<T>::infinity(); | |||
| // check validation of sequence length | |||
| for (size_t b = 0; b < batch_size_; ++b) { | |||
| if (sequence_length_addr[b] == uint32_t(0)) { | |||
| if (sequence_length_addr[b] == static_cast<uint32_t>(0)) { | |||
| MS_LOG(EXCEPTION) << "Sequence length should > 0, but gets " << sequence_length_addr[b]; | |||
| } | |||
| if (sequence_length_addr[b] > max_time_) { | |||
| MS_LOG(EXCEPTION) << "Max time should be greater than sequence length, but gets " << max_time_ << " < " | |||
| << sequence_length_addr[b]; | |||
| } | |||
| } | |||
| for (size_t i = 0; i < indice_dims_[0]; ++i) { | |||
| each_label_length[labels_indices_addr[i * 2]]++; | |||
| for (size_t i = 0; i < indices_dims_[0]; ++i) { | |||
| const size_t factor = 2; | |||
| auto index = labels_indices_addr[i * factor]; | |||
| if (index >= SizeToUlong(each_label_length.size())) { | |||
| MS_LOG(EXCEPTION) << "Index: " << index << "out of the bounds of the vector."; | |||
| } | |||
| each_label_length[index]++; | |||
| } | |||
| // convert label format of label_value and label_indices to batch_label | |||
| @@ -291,7 +303,7 @@ void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const | |||
| } | |||
| // convert label to label with blank | |||
| GenLableWithBlank(sequence_length_addr, label_batch, &labels_with_blank); | |||
| GenLabelWithBlank(sequence_length_addr, label_batch, &labels_with_blank); | |||
| for (size_t b = 0; b < batch_size_; ++b) { | |||
| std::vector<uint32_t> label_with_blank = labels_with_blank[b]; | |||
| @@ -300,12 +312,11 @@ void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const | |||
| std::vector<std::vector<T>> dy; | |||
| std::vector<std::vector<T>> log_alpha_b; | |||
| std::vector<std::vector<T>> log_beta_b; | |||
| MatrixfromVector(num_class_, sequence_length_addr[b], &y_b, kLogZero_); | |||
| MatrixfromVector(y_b.size(), y_b[0].size(), &dy, T(0)); | |||
| MatrixfromVector(label_with_blank.size(), sequence_length_addr[b], &log_alpha_b, kLogZero_); | |||
| MatrixfromVector(label_with_blank.size(), sequence_length_addr[b], &log_beta_b, kLogZero_); | |||
| MatrixFromVector(num_class_, sequence_length_addr[b], &y_b, kLogZero_); | |||
| MatrixFromVector(y_b.size(), y_b[0].size(), &dy, T(0)); | |||
| MatrixFromVector(label_with_blank.size(), sequence_length_addr[b], &log_alpha_b, kLogZero_); | |||
| MatrixFromVector(label_with_blank.size(), sequence_length_addr[b], &log_beta_b, kLogZero_); | |||
| InnerSoftMax(inputs_addr, &y_b, sequence_length_addr[b], num_class_, batch_size_, b); | |||
| CalculateFwdVar(label_with_blank, y_b, &log_alpha_b); | |||
| CalculateBwdVar(label_with_blank, y_b, &log_beta_b); | |||
| @@ -313,9 +324,7 @@ void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const | |||
| for (size_t u = 0; u < label_with_blank.size(); ++u) { | |||
| log_pzx = LogSumExp(log_pzx, log_alpha_b[u][0] + log_beta_b[u][0]); | |||
| } | |||
| loss_addr[b] = -log_pzx; | |||
| CalculateGrad(label_with_blank, y_b, log_alpha_b, log_beta_b, log_pzx, &dy); | |||
| for (size_t t = 0; t < sequence_length_addr[b]; ++t) { | |||
| @@ -325,16 +334,5 @@ void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const | |||
| } | |||
| } | |||
| } | |||
| void CTCLossCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 4) { | |||
| MS_LOG(EXCEPTION) << "CTCLossCPUKernel needs 4 inputs, but gets " << input_num; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 2) { | |||
| MS_LOG(EXCEPTION) << "CTCLossCPUKernel expects 2 outputs, but gets" << output_num; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -16,11 +16,13 @@ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_ | |||
| #include <memory> | |||
| #include <unordered_map> | |||
| #include <vector> | |||
| #include <algorithm> | |||
| #include <limits> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -36,36 +38,35 @@ class CTCLossCPUKernel : public CPUKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| void GenLableWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label, | |||
| std::vector<std::vector<uint32_t>> *label_with_blank); | |||
| private: | |||
| void GenLabelWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label, | |||
| std::vector<std::vector<uint32_t>> *label_with_blank) const; | |||
| template <typename T> | |||
| void CalculateFwdVar(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y, | |||
| std::vector<std::vector<T>> *log_alpha_b); | |||
| std::vector<std::vector<T>> *log_alpha_b) const; | |||
| template <typename T> | |||
| void CalculateBwdVar(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y, | |||
| std::vector<std::vector<T>> *log_beta_b); | |||
| std::vector<std::vector<T>> *log_beta_b) const; | |||
| template <typename T> | |||
| void CalculateGrad(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y, | |||
| const std::vector<std::vector<T>> &log_alpha_b, const std::vector<std::vector<T>> &log_beta_b, | |||
| const T log_pzx, std::vector<std::vector<T>> *dy); | |||
| const T log_pzx, std::vector<std::vector<T>> *dy) const; | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const; | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| std::vector<size_t> probs_shape_; | |||
| std::vector<size_t> indice_dims_; | |||
| std::vector<size_t> indices_dims_; | |||
| std::vector<size_t> labels_dims_; | |||
| size_t num_class_; | |||
| size_t max_time_; | |||
| size_t batch_size_; | |||
| uint32_t blank_index_; | |||
| size_t num_class_{0}; | |||
| size_t max_time_{0}; | |||
| size_t batch_size_{0}; | |||
| uint32_t blank_index_{0}; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| bool preprocess_collapse_repeated_; | |||
| bool ctc_merge_repeated_; | |||
| bool ignore_longer_outputs_than_inputs_; | |||
| bool preprocess_collapse_repeated_{false}; | |||
| bool ctc_merge_repeated_{false}; | |||
| bool ignore_longer_outputs_than_inputs_{false}; | |||
| }; | |||
| MS_REG_CPU_KERNEL(CTCLoss, | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,20 +13,29 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <thread> | |||
| #include "backend/kernel_compiler/cpu/cumsum_cpu_kernel.h" | |||
| #include <thread> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kCumSumInputsNum = 1; | |||
| constexpr size_t kCumSumOutputsNum = 1; | |||
| } // namespace | |||
| void CumSumCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| CheckParam(kernel_node); | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| axis_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "axis")); | |||
| axis_ = LongToInt(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS)); | |||
| dst_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| exclusive_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "exclusive"); | |||
| reverse_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "reverse"); | |||
| exclusive_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, EXCLUSIVE); | |||
| reverse_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, REVERSE); | |||
| int input_dim_length = SizeToInt(shape_.size()); | |||
| if (axis_ >= input_dim_length) { | |||
| MS_LOG(EXCEPTION) << "Axis out of bounds."; | |||
| @@ -57,12 +66,17 @@ void CumSumCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { | |||
| InitWorkspaceSize<int8_t>(); | |||
| } else if (dtype_ == kNumberTypeUInt8) { | |||
| InitWorkspaceSize<uint8_t>(); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << kernel_name_ << " supports (float16, float32, uint8, int8, int32) on CPU, but got " | |||
| << TypeIdToType(dtype_)->ToString(); | |||
| } | |||
| } | |||
| bool CumSumCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kCumSumInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kCumSumOutputsNum, kernel_name_); | |||
| Reshape(); | |||
| if (dtype_ == kNumberTypeFloat32) { | |||
| LaunchKernel<float_t>(inputs, workspace, outputs); | |||
| @@ -74,6 +88,9 @@ bool CumSumCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| LaunchKernel<int8_t>(inputs, workspace, outputs); | |||
| } else if (dtype_ == kNumberTypeUInt8) { | |||
| LaunchKernel<uint8_t>(inputs, workspace, outputs); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << kernel_name_ << " supports (float16, float32, uint8, int8, int32) on CPU, but got " | |||
| << TypeIdToType(dtype_)->ToString(); | |||
| } | |||
| return true; | |||
| } | |||
| @@ -90,12 +107,11 @@ void CumSumCPUKernel::Reshape() { | |||
| } | |||
| stride_ = dims_[1] * dims_[2]; | |||
| stride2_ = dims_[2]; | |||
| return; | |||
| } | |||
| template <typename T> | |||
| void CumSumCPUKernel::LeftMove(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, | |||
| size_t stride2, size_t start, size_t end) { | |||
| size_t stride2, size_t start, size_t end) const { | |||
| for (size_t i = start; i < end; i++) { | |||
| size_t k1 = i / dim2 % dim0; | |||
| size_t k2 = i % dim2; | |||
| @@ -114,7 +130,7 @@ void CumSumCPUKernel::LeftMove(const T *input, T *output, size_t dim0, size_t di | |||
| template <typename T> | |||
| void CumSumCPUKernel::RightMove(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, | |||
| size_t stride2, size_t start, size_t end) { | |||
| size_t stride2, size_t start, size_t end) const { | |||
| for (size_t i = start; i < end; i++) { | |||
| size_t k1 = i / dim2 % dim0; | |||
| size_t k2 = i % dim2; | |||
| @@ -133,7 +149,7 @@ void CumSumCPUKernel::RightMove(const T *input, T *output, size_t dim0, size_t d | |||
| template <typename T> | |||
| void CumSumCPUKernel::Copy(T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2, | |||
| size_t start, size_t end) { | |||
| size_t start, size_t end) const { | |||
| for (size_t i = start; i < end; i++) { | |||
| size_t k1 = i / dim2 % dim0; | |||
| size_t k2 = i % dim2; | |||
| @@ -147,7 +163,7 @@ void CumSumCPUKernel::Copy(T *input, T *output, size_t dim0, size_t dim1, size_t | |||
| template <typename T> | |||
| void CumSumCPUKernel::CumSumKernelReverse(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, | |||
| size_t stride, size_t stride2, size_t start, size_t end) { | |||
| size_t stride, size_t stride2, size_t start, size_t end) const { | |||
| for (size_t i = start; i < end; i++) { | |||
| size_t k1 = i / dim2 % dim0; | |||
| size_t k2 = i % dim2; | |||
| @@ -166,7 +182,7 @@ void CumSumCPUKernel::CumSumKernelReverse(const T *input, T *output, size_t dim0 | |||
| template <typename T> | |||
| void CumSumCPUKernel::CumSumKernel(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, | |||
| size_t stride2, size_t start, size_t end) { | |||
| size_t stride2, size_t start, size_t end) const { | |||
| for (size_t i = start; i < end; i++) { | |||
| size_t k1 = i / dim2 % dim0; | |||
| size_t k2 = i % dim2; | |||
| @@ -184,7 +200,7 @@ void CumSumCPUKernel::CumSumKernel(const T *input, T *output, size_t dim0, size_ | |||
| } | |||
| template <typename T> | |||
| void CumSumCPUKernel::LaunchCumSum(const T *input, T *output, T *workspace, size_t start, size_t end) { | |||
| void CumSumCPUKernel::LaunchCumSum(const T *input, T *output, T *workspace, size_t start, size_t end) const { | |||
| start = start / dims_[1]; | |||
| end = end / dims_[1]; | |||
| if (exclusive_) { | |||
| @@ -204,15 +220,14 @@ void CumSumCPUKernel::LaunchCumSum(const T *input, T *output, T *workspace, size | |||
| CumSumKernel(input, output, dims_[0], dims_[1], dims_[2], stride_, stride2_, start, end); | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| template <typename T> | |||
| void CumSumCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| auto input = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto ws = reinterpret_cast<T *>(workspace[0]->addr); | |||
| const std::vector<kernel::AddressPtr> &outputs) const { | |||
| const auto *input = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *ws = reinterpret_cast<T *>(workspace[0]->addr); | |||
| auto output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| // multithreading | |||
| size_t lens = inputs[0]->size > 0 ? static_cast<size_t>(inputs[0]->size / sizeof(T)) : 1; | |||
| @@ -239,14 +254,6 @@ void CumSumCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs | |||
| for (size_t i = 0; i < threads.size(); ++i) { | |||
| threads[i].join(); | |||
| } | |||
| return; | |||
| } | |||
| void CumSumCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Argument number is " << input_num << ", but CumSumGpuKernel needs 1."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -19,6 +19,7 @@ | |||
| #include <memory> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -31,55 +32,53 @@ class CumSumCPUKernel : public CPUKernel { | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| void InitInputOutputSize(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| template <typename T> | |||
| void InitWorkspaceSize(); | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs); | |||
| private: | |||
| void Reshape(); | |||
| template <typename T> | |||
| void LaunchCumSum(const T *input_addr, T *output_addr, T *ws_addr, size_t start, size_t end); | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| void InitWorkspaceSize(); | |||
| void Reshape(); | |||
| void InitInputOutputSize(const CNodePtr &kernel_node) override; | |||
| template <typename T> | |||
| void LeftMove(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2, | |||
| size_t start, size_t end); | |||
| size_t start, size_t end) const; | |||
| template <typename T> | |||
| void RightMove(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2, | |||
| size_t start, size_t end); | |||
| size_t start, size_t end) const; | |||
| template <typename T> | |||
| void Copy(T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2, size_t start, | |||
| size_t end); | |||
| size_t end) const; | |||
| template <typename T> | |||
| void CumSumKernelReverse(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, | |||
| size_t stride2, size_t start, size_t end); | |||
| size_t stride2, size_t start, size_t end) const; | |||
| template <typename T> | |||
| void CumSumKernel(const T *input, T *output, size_t dim0, size_t dim1, size_t dim2, size_t stride, size_t stride2, | |||
| size_t start, size_t end); | |||
| size_t start, size_t end) const; | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) const; | |||
| template <typename T> | |||
| void LaunchCumSum(const T *input_addr, T *output_addr, T *ws_addr, size_t start, size_t end) const; | |||
| std::vector<size_t> shape_; | |||
| std::vector<size_t> dst_shape; | |||
| size_t input_size_0_; | |||
| size_t stride_; | |||
| size_t stride2_; | |||
| size_t dims_[3] = {}; | |||
| int exclusive_; | |||
| int reverse_; | |||
| int axis_; | |||
| size_t input_size_0_{0}; | |||
| size_t stride_{0}; | |||
| size_t stride2_{0}; | |||
| size_t dims_[3]{0}; | |||
| int exclusive_{0}; | |||
| int reverse_{0}; | |||
| int axis_{0}; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| }; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,28 +13,35 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/debug_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "utils/ms_utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void DebugCPUKernel::InitKernel(const CNodePtr &kernel_node) { MS_EXCEPTION_IF_NULL(kernel_node); } | |||
| namespace { | |||
| constexpr size_t kDebugInputsNum = 1; | |||
| constexpr size_t kDebugOutputsNum = 1; | |||
| } // namespace | |||
| void DebugCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| } | |||
| bool DebugCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() < 1 || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "Input or output empty!"; | |||
| } | |||
| auto val = reinterpret_cast<int *>(inputs[0]->addr); | |||
| MS_LOG(DEBUG) << " launch DebugCountCPUKernel val " << *val; | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kDebugInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kDebugOutputsNum, kernel_name_); | |||
| const auto *val = reinterpret_cast<int *>(inputs[0]->addr); | |||
| MS_LOG(DEBUG) << " launch DebugCountCPUKernel"; | |||
| auto output = reinterpret_cast<int *>(outputs[0]->addr); | |||
| size_t elem_num = inputs[0]->size / sizeof(int); | |||
| for (size_t i = 0; i < elem_num; i++) { | |||
| output[i] = static_cast<int>(val[i]); | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,11 +13,13 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEBUG_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEBUG_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -13,14 +13,17 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_ | |||
| #include <memory> | |||
| #include <string> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T> | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -14,24 +14,29 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/dropout_cpu_kernel.h" | |||
| #include <algorithm> | |||
| #include <random> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "backend/kernel_compiler/cpu/dropout_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kDropoutInputsNum = 1; | |||
| constexpr size_t kDropoutOutputsNum = 2; | |||
| } // namespace | |||
| void DropoutCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| CheckParam(kernel_node); | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| mask_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 1); | |||
| keep_prob_ = AnfAlgo::GetNodeAttr<float>(kernel_node, "keep_prob"); | |||
| if (keep_prob_ <= 0.0) { | |||
| MS_LOG(EXCEPTION) << "Keep_prob is smaller or equal to zero but DropoutCPUKernel needs greater than 0"; | |||
| } | |||
| if (keep_prob_ > 1.0) { | |||
| MS_LOG(EXCEPTION) << "Keep_prob greater than one but DropoutCPUKernel needs smaller or equal to one"; | |||
| if (keep_prob_ <= 0.0 || keep_prob_ > 1.0) { | |||
| MS_LOG(EXCEPTION) << kernel_name_ << "requires keep_prob should be in (0.0, 1.0], but got " << keep_prob_; | |||
| } | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| for (const uint64_t &d : input_shape_) { | |||
| @@ -41,18 +46,24 @@ void DropoutCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool DropoutCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kDropoutInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kDropoutOutputsNum, kernel_name_); | |||
| if (dtype_ == kNumberTypeFloat16) { | |||
| LaunchKernel<float16>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat32) { | |||
| LaunchKernel<float>(inputs, outputs); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << kernel_name_ << " only support float16 and float32 on CPU, but got " | |||
| << TypeIdToType(dtype_)->ToString(); | |||
| } | |||
| return true; | |||
| } | |||
| template <typename T> | |||
| void DropoutCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) { | |||
| auto input_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| void DropoutCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) const { | |||
| const auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| auto mask_addr = reinterpret_cast<T *>(outputs[1]->addr); | |||
| std::random_device rd; | |||
| std::mt19937 gen(rd()); | |||
| @@ -63,17 +74,5 @@ void DropoutCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const | |||
| output_addr[i] = mask_addr[i] * input_addr[i] * scale; | |||
| } | |||
| } | |||
| void DropoutCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but DropoutCPUKernel needs 1 input."; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 2) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but DropoutCPUKernel needs 1 output."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -16,8 +16,10 @@ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DROPOUT_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DROPOUT_CPU_KERNEL_H_ | |||
| #include <memory> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -33,17 +35,16 @@ class DropoutCPUKernel : public CPUKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs); | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const; | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| std::vector<size_t> input_shape_; | |||
| std::vector<size_t> output_shape_; | |||
| std::vector<size_t> mask_shape_; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| float keep_prob_ = 0.0; | |||
| uint64_t tensor_size_ = 1; | |||
| float keep_prob_{0.0}; | |||
| uint64_t tensor_size_{1}; | |||
| }; | |||
| MS_REG_CPU_KERNEL(Dropout, KernelAttr(), DropoutCPUKernel); | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,16 +13,24 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/dropout_grad_kernel.h" | |||
| #include <vector> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "backend/kernel_compiler/cpu/dropout_grad_kernel.h" | |||
| #include "nnacl/fp32_grad/dropout_grad.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/fp32_grad/dropout_grad.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kDropoutGradInputsNum = 2; | |||
| constexpr size_t kDropoutGradOutputsNum = 1; | |||
| } // namespace | |||
| void DropoutGradCpuBwdKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| auto input_mask_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| if (input_shape.size() != input_mask_shape.size()) { | |||
| @@ -35,8 +43,8 @@ void DropoutGradCpuBwdKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| } | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| keep_prob_ = AnfAlgo::GetNodeAttr<float>(kernel_node, "keep_prob"); | |||
| if (keep_prob_ == 0) { | |||
| MS_LOG(EXCEPTION) << "The keep_prob is zero."; | |||
| if (keep_prob_ <= 0.0 || keep_prob_ > 1.0) { | |||
| MS_LOG(EXCEPTION) << kernel_name_ << "requires keep_prob should be in (0.0, 1.0], but got " << keep_prob_; | |||
| } | |||
| } | |||
| @@ -51,12 +59,15 @@ void DropoutGradCpuBwdKernel::InitInputOutputSize(const CNodePtr &kernel_node) { | |||
| bool DropoutGradCpuBwdKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kDropoutGradInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kDropoutGradOutputsNum, kernel_name_); | |||
| if (dtype_ == kNumberTypeFloat16) { | |||
| DropoutBackwardKernel<float16>(inputs, workspace, outputs, keep_prob_); | |||
| } else if (dtype_ == kNumberTypeFloat32) { | |||
| DropoutBackwardKernel<float>(inputs, workspace, outputs, keep_prob_); | |||
| } else { | |||
| MS_LOG(ERROR) << "Input data type: " << dtype_ << " is not supported for DropoutGrad kernel for CPU."; | |||
| MS_LOG(EXCEPTION) << kernel_name_ << " only support float16 and float32 on CPU, but got " | |||
| << TypeIdToType(dtype_)->ToString(); | |||
| } | |||
| return true; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -21,6 +21,7 @@ | |||
| #include <memory> | |||
| #include <string> | |||
| #include <unordered_map> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -36,12 +37,12 @@ class DropoutGradCpuBwdKernel : public CPUKernel { | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| float keep_prob_{1.0}; | |||
| size_t num_count_{1}; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| template <typename T> | |||
| void DropoutBackwardKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, float keep_prob); | |||
| float keep_prob_{1.0}; | |||
| size_t num_count_{1}; | |||
| TypeId dtype_{kTypeUnknown}; | |||
| }; | |||
| MS_REG_CPU_KERNEL(DropoutGrad, KernelAttr(), DropoutGradCpuBwdKernel); | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,13 +13,23 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <algorithm> | |||
| #include "backend/kernel_compiler/cpu/dynamic_assign_cpu_kernel.h" | |||
| #include <algorithm> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kDynamicAssignInputsNum = 2; | |||
| constexpr size_t kDynamicAssignOutputsNum = 1; | |||
| } // namespace | |||
| void DynamicAssignCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| node_wpt_ = kernel_node; | |||
| input_x_dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| input_x_dtype_size_ = GetTypeByte(TypeIdToType(input_x_dtype_)); | |||
| @@ -28,6 +38,8 @@ void DynamicAssignCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool DynamicAssignCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kDynamicAssignInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kDynamicAssignOutputsNum, kernel_name_); | |||
| if (input_x_dtype_ == kNumberTypeInt32) { | |||
| LaunchKernel<int>(inputs, outputs); | |||
| } else if (input_x_dtype_ == kNumberTypeInt64) { | |||
| @@ -37,8 +49,8 @@ bool DynamicAssignCPUKernel::Launch(const std::vector<kernel::AddressPtr> &input | |||
| } else if (input_x_dtype_ == kNumberTypeFloat64) { | |||
| LaunchKernel<double>(inputs, outputs); | |||
| } else { | |||
| MS_LOG(ERROR) << "Dtype of indices only support float32, float64, int32, int64"; | |||
| return false; | |||
| MS_LOG(EXCEPTION) << kernel_name_ << " support (int32, int64, float32, float64) on CPU , but got " | |||
| << TypeIdToType(input_x_dtype_)->ToString(); | |||
| } | |||
| return true; | |||
| } | |||
| @@ -46,25 +58,27 @@ bool DynamicAssignCPUKernel::Launch(const std::vector<kernel::AddressPtr> &input | |||
| template <typename T> | |||
| void DynamicAssignCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| auto node_ = node_wpt_.lock(); | |||
| if (!node_) { | |||
| MS_LOG(EXCEPTION) << "node_wpt_ is expired."; | |||
| auto node = node_wpt_.lock(); | |||
| if (!node) { | |||
| MS_LOG(EXCEPTION) << kernel_name_ << " node_wpt_ is expired."; | |||
| } | |||
| auto input_x_shape = AnfAlgo::GetPrevNodeOutputInferShape(node_, 0); | |||
| auto input_y_shape = AnfAlgo::GetPrevNodeOutputInferShape(node_, 1); | |||
| auto input_x_shape = AnfAlgo::GetPrevNodeOutputInferShape(node, 0); | |||
| auto input_y_shape = AnfAlgo::GetPrevNodeOutputInferShape(node, 1); | |||
| batch_size_ = 1; | |||
| for (size_t i = 0; i < input_x_shape.size(); ++i) { | |||
| batch_size_ *= input_x_shape[i]; | |||
| } | |||
| if (input_x_shape.size() != input_y_shape.size()) MS_LOG(EXCEPTION) << "X and y must be same shape!"; | |||
| if (input_x_shape.size() != input_y_shape.size()) { | |||
| MS_LOG(EXCEPTION) << "X and y must be same shape"; | |||
| } | |||
| for (size_t i = 0; i < input_x_shape.size(); ++i) { | |||
| if (input_x_shape[i] != input_y_shape[i]) { | |||
| MS_LOG(EXCEPTION) << "X and y must be same shape!"; | |||
| MS_LOG(EXCEPTION) << "x and y must be same shape!"; | |||
| } | |||
| } | |||
| T *input_x = reinterpret_cast<T *>(inputs[0]->addr); | |||
| T *input_y = reinterpret_cast<T *>(inputs[1]->addr); | |||
| auto *input_x = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *input_y = reinterpret_cast<T *>(inputs[1]->addr); | |||
| auto max_size = inputs[0]->size; | |||
| size_t total_size = input_x_dtype_size_ * batch_size_; | |||
| if (total_size > max_size) { | |||
| @@ -76,10 +90,10 @@ void DynamicAssignCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| MS_LOG(EXCEPTION) << "Memcpy_s error, errorno" << ret; | |||
| } | |||
| auto node_with_idx = AnfAlgo::GetPrevNodeOutput(node_, 0); | |||
| auto node = node_with_idx.first; | |||
| if (node->isa<Parameter>()) { | |||
| auto node_ptr = node->cast<ParameterPtr>(); | |||
| auto node_with_idx = AnfAlgo::GetPrevNodeOutput(node, 0); | |||
| auto out_node = node_with_idx.first; | |||
| if (out_node->isa<Parameter>()) { | |||
| auto node_ptr = out_node->cast<ParameterPtr>(); | |||
| auto value = node_ptr->default_param(); | |||
| auto tensor = value->cast<std::shared_ptr<tensor::Tensor>>(); | |||
| ShapeVector shape_tmp; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,12 +13,14 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DYNAMIC_ASSIGN_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DYNAMIC_ASSIGN_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include <unordered_map> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| @@ -34,13 +36,13 @@ class DynamicAssignCPUKernel : public CPUKernel { | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| template <typename T> | |||
| void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs); | |||
| private: | |||
| size_t batch_size_{1}; | |||
| TypeId input_x_dtype_{kTypeUnknown}; | |||
| size_t input_x_dtype_size_ = 4; | |||
| size_t input_x_dtype_size_{4}; | |||
| CNodeWeakPtr node_wpt_; | |||
| }; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -24,59 +24,57 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kInputsNum = 1; | |||
| constexpr size_t kOutputsNum = 1; | |||
| struct DescParam { | |||
| dnnl::algorithm algorithm; | |||
| float alpha = 0.f; | |||
| float beta = 0.f; | |||
| dnnl::algorithm algorithm{dnnl::algorithm::undef}; | |||
| float alpha{0.0f}; | |||
| float beta{0.0f}; | |||
| }; | |||
| } // namespace | |||
| dnnl::eltwise_forward::desc EltWiseCPUKernel::GetForwardEltwiseDesc(const CNodePtr &kernel_node, | |||
| const dnnl::memory::desc src_desc) { | |||
| dnnl::eltwise_forward::desc EltWiseCPUKernel::GetForwardEltwiseDesc(const dnnl::memory::desc src_desc) { | |||
| static const std::unordered_map<std::string, DescParam> eltWiseOpDescMap{ | |||
| {prim::kPrimRelu->name(), DescParam{dnnl::algorithm::eltwise_relu}}, | |||
| {prim::kPrimRelu6->name(), DescParam{dnnl::algorithm::eltwise_clip, 0.f, 6.f}}, | |||
| {prim::kPrimRelu6->name(), DescParam{dnnl::algorithm::eltwise_clip, 0.0f, 6.0f}}, | |||
| {prim::kPrimAbs->name(), DescParam{dnnl::algorithm::eltwise_abs}}, | |||
| {prim::kPrimExp->name(), DescParam{dnnl::algorithm::eltwise_exp}}, | |||
| {prim::kPrimLog->name(), DescParam{dnnl::algorithm::eltwise_log}}, | |||
| {prim::kPrimSigmoid->name(), DescParam{dnnl::algorithm::eltwise_logistic}}, | |||
| {prim::kPrimSqrt->name(), DescParam{dnnl::algorithm::eltwise_sqrt}}, | |||
| {prim::kPrimSquare->name(), DescParam{dnnl::algorithm::eltwise_square}}, | |||
| {prim::kPrimTanh->name(), DescParam{dnnl::algorithm::eltwise_tanh}}, | |||
| {prim::kPrimElu->name(), DescParam{dnnl::algorithm::eltwise_elu, 1.f, 0.f}}, | |||
| {prim::kPrimElu->name(), DescParam{dnnl::algorithm::eltwise_elu, 1.0f, 0.0f}}, | |||
| {prim::kPrimSoftplus->name(), DescParam{dnnl::algorithm::eltwise_soft_relu}}, | |||
| }; | |||
| std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node); | |||
| const auto desc_pair = eltWiseOpDescMap.find(kernel_name); | |||
| const auto desc_pair = eltWiseOpDescMap.find(kernel_name_); | |||
| if (desc_pair == eltWiseOpDescMap.end()) { | |||
| MS_LOG(EXCEPTION) << "EltWiseCPUKernel does not support " << kernel_name; | |||
| MS_LOG(EXCEPTION) << "EltWiseCPUKernel does not support " << kernel_name_; | |||
| } | |||
| return dnnl::eltwise_forward::desc(DnnlForward, desc_pair->second.algorithm, src_desc, desc_pair->second.alpha, | |||
| return dnnl::eltwise_forward::desc(dnnl_forward_, desc_pair->second.algorithm, src_desc, desc_pair->second.alpha, | |||
| desc_pair->second.beta); | |||
| } | |||
| void EltWiseCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| if (src_shape.size() == 0) { | |||
| src_shape.insert(src_shape.begin(), 1); | |||
| if (src_shape.empty()) { | |||
| (void)src_shape.insert(src_shape.begin(), 1); | |||
| } | |||
| dnnl::memory::desc src_desc = GetDefaultMemDesc(src_shape); | |||
| auto desc = GetForwardEltwiseDesc(kernel_node, src_desc); | |||
| auto desc = GetForwardEltwiseDesc(src_desc); | |||
| auto prim_desc = dnnl::eltwise_forward::primitive_desc(desc, MKLKernelEngine::Get().engine()); | |||
| primitive_ = std::make_shared<dnnl::eltwise_forward>(prim_desc); | |||
| AddArgument(DNNL_ARG_SRC, src_desc); | |||
| AddArgument(DNNL_ARG_DST, src_desc); | |||
| } | |||
| bool EltWiseCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.empty() || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "Error input output size!"; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kOutputsNum, kernel_name_); | |||
| SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr); | |||
| ExecutePrimitive(); | |||
| @@ -13,8 +13,10 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ELTWISE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ELTWISE_CPU_KERNEL_H_ | |||
| #include <memory> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| @@ -32,8 +34,9 @@ class EltWiseCPUKernel : public MKLCPUKernel { | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| dnnl::eltwise_forward::desc GetForwardEltwiseDesc(const CNodePtr &kernel_node, const dnnl::memory::desc src_desc); | |||
| dnnl::prop_kind DnnlForward = dnnl::prop_kind::forward_training; | |||
| dnnl::eltwise_forward::desc GetForwardEltwiseDesc(const dnnl::memory::desc src_desc); | |||
| dnnl::prop_kind dnnl_forward_{dnnl::prop_kind::forward_training}; | |||
| }; | |||
| MS_REG_CPU_KERNEL(Elu, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| @@ -13,15 +13,23 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/log_softmax_cpu_kernel.h" | |||
| #include <algorithm> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "utils/ms_utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kLogSoftmaxInputsNum = 1; | |||
| constexpr size_t kLogSoftmaxOutputsNum = 1; | |||
| } // namespace | |||
| void LogSoftmaxCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| int axis = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS); | |||
| if (axis >= SizeToInt(src_shape.size())) { | |||
| @@ -41,9 +49,8 @@ void LogSoftmaxCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool LogSoftmaxCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.empty() || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "Log softmax error input output size!"; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kLogSoftmaxInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kLogSoftmaxOutputsNum, kernel_name_); | |||
| SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr); | |||
| ExecutePrimitive(); | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LOG_SOFTMAX_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LOG_SOFTMAX_CPU_KERNEL_H_ | |||
| @@ -13,15 +13,23 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/log_softmax_grad_cpu_kernel.h" | |||
| #include <algorithm> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "utils/ms_utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kLogSoftmaxGradInputsNum = 2; | |||
| constexpr size_t kLogSoftmaxGradOutputsNum = 1; | |||
| } // namespace | |||
| void LogSoftmaxGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| int axis = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS); | |||
| if (axis >= SizeToInt(src_shape.size())) { | |||
| @@ -47,9 +55,8 @@ void LogSoftmaxGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| bool LogSoftmaxGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() < 2 || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "LogSoftmaxGrad error input output size!"; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kLogSoftmaxGradInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kLogSoftmaxGradOutputsNum, kernel_name_); | |||
| SetArgumentHandle(DNNL_ARG_DST, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_DST, inputs[1]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_SRC, outputs[0]->addr); | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LOG_SOFTMAX_GRAD_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LOG_SOFTMAX_GRAD_CPU_KERNEL_H_ | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/lstm_cpu_kernel.h" | |||
| #include <string> | |||
| #include "utils/ms_utils.h" | |||
| @@ -21,9 +22,18 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| const int kMaxLSTMLayer = 100; | |||
| const int kOutputWorkSpaceIndex = 3; | |||
| const int kGateNum = 4; | |||
| namespace { | |||
| constexpr size_t kLstmInputsNum = 4; | |||
| constexpr size_t kLstmOutputsNum = 5; | |||
| constexpr int kMaxLSTMLayer = 100; | |||
| constexpr int kOutputWorkSpaceIndex = 3; | |||
| constexpr int kGateNum = 4; | |||
| using tag = dnnl::memory::format_tag; | |||
| using dim = dnnl::memory::dims; | |||
| using dt = dnnl::memory::data_type; | |||
| } // namespace | |||
| void LstmCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { | |||
| CPUKernel::InitInputOutputSize(kernel_node); | |||
| output_size_list_[kOutputWorkSpaceIndex] = reserve_size_; | |||
| @@ -46,8 +56,7 @@ void LstmCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); | |||
| #endif | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| using tag = dnnl::memory::format_tag; | |||
| using dim = dnnl::memory::dims; | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| CheckParam(kernel_node); | |||
| auto eng = MKLKernelEngine::Get().engine(); | |||
| dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional; | |||
| @@ -70,10 +79,10 @@ void LstmCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc); | |||
| dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc); | |||
| dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc); | |||
| if (!kernel_node->HasAttr(kAttrIsTraining)) { | |||
| is_training = true; | |||
| } else { | |||
| if (kernel_node->HasAttr(kAttrIsTraining)) { | |||
| is_training = GetValue<bool>(kernel_node->GetAttr(kAttrIsTraining)); | |||
| } else { | |||
| is_training = true; | |||
| } | |||
| auto prop_kind = dnnl::prop_kind::forward_training; | |||
| if (!is_training) { | |||
| @@ -106,9 +115,9 @@ void LstmCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| std::vector<size_t> src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| std::vector<size_t> src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2); | |||
| bidirectional_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "bidirectional"); | |||
| input_size_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "input_size")); | |||
| hidden_size_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "hidden_size")); | |||
| num_layers_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "num_layers")); | |||
| input_size_ = LongToInt(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "input_size")); | |||
| hidden_size_ = LongToInt(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "hidden_size")); | |||
| num_layers_ = LongToInt(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "num_layers")); | |||
| has_bias_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "has_bias"); | |||
| batch_size_ = SizeToInt(src_shape[1]); | |||
| seq_len_ = SizeToInt(src_shape[0]); | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -16,15 +16,18 @@ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_ | |||
| #if defined(__x86_64__) || defined(__amd64__) || defined(_M_IX86) || defined(_M_X64) | |||
| #define PLATFORM_86 | |||
| #endif | |||
| #ifdef PLATFORM_86 | |||
| #include <pmmintrin.h> | |||
| #endif | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class LstmCPUKernel : public MKLCPUKernel { | |||
| @@ -41,18 +44,20 @@ class LstmCPUKernel : public MKLCPUKernel { | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| int weight_size_ = 0; | |||
| int weight_h_size_ = 0; | |||
| int input_size_; | |||
| int hidden_size_; | |||
| int num_layers_; | |||
| int batch_size_; | |||
| int seq_len_; | |||
| int num_directions_; | |||
| bool bidirectional_; | |||
| bool has_bias_; | |||
| size_t reserve_size_; | |||
| bool is_training; | |||
| int weight_size_{0}; | |||
| int weight_h_size_{0}; | |||
| int input_size_{0}; | |||
| int hidden_size_{0}; | |||
| int num_layers_{0}; | |||
| int batch_size_{0}; | |||
| int seq_len_{0}; | |||
| int num_directions_{0}; | |||
| bool bidirectional_{false}; | |||
| bool has_bias_{false}; | |||
| bool is_training{false}; | |||
| size_t reserve_size_{0}; | |||
| dnnl::memory::dims weights_dims_; | |||
| dnnl::memory::dims weights_h_dims_; | |||
| dnnl::memory::dims bias_dims_; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/lstm_grad_cpu_kernel.h" | |||
| #include <cstring> | |||
| #include <string> | |||
| @@ -22,8 +23,17 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| const int kMaxLSTMLayer = 100; | |||
| const int kInputWorkSpaceIndex = 10; | |||
| namespace { | |||
| constexpr size_t kLstmGradInputsNum = 11; | |||
| constexpr size_t kLstmGradOutputsNum = 4; | |||
| constexpr int kMaxLSTMLayer = 100; | |||
| constexpr int kInputWorkSpaceIndex = 10; | |||
| using tag = dnnl::memory::format_tag; | |||
| using dim = dnnl::memory::dims; | |||
| using dt = dnnl::memory::data_type; | |||
| } // namespace | |||
| void LSTMGradCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { | |||
| CPUKernel::InitInputOutputSize(kernel_node); | |||
| input_size_list_[kInputWorkSpaceIndex] = reserve_size_; | |||
| @@ -31,8 +41,7 @@ void LSTMGradCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { | |||
| void LSTMGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| using tag = dnnl::memory::format_tag; | |||
| using dim = dnnl::memory::dims; | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| CheckParam(kernel_node); | |||
| auto eng = MKLKernelEngine::Get().engine(); | |||
| dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional; | |||
| @@ -167,8 +176,8 @@ void LSTMGradCPUKernel::ResetMemory(const dnnl::memory &mem, const string name) | |||
| bool LSTMGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| using dt = dnnl::memory::data_type; | |||
| using tag = dnnl::memory::format_tag; | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kLstmGradInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kLstmGradOutputsNum, kernel_name_); | |||
| auto eng = MKLKernelEngine::Get().engine(); | |||
| // construct fw memory | |||
| auto user_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng); | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_ | |||
| @@ -47,17 +48,19 @@ class LSTMGradCPUKernel : public MKLCPUKernel { | |||
| const dnnl::memory &diff_bias_memory); | |||
| void ResetMemory(const dnnl::memory &mem, const string name) const; | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| int64_t weight_size_ = 0; | |||
| int64_t weight_h_size_ = 0; | |||
| int64_t input_size_; | |||
| int64_t hidden_size_; | |||
| int64_t num_layers_; | |||
| int64_t batch_size_; | |||
| int64_t seq_len_; | |||
| int num_directions_; | |||
| bool bidirectional_; | |||
| bool has_bias_; | |||
| size_t reserve_size_; | |||
| int num_directions_{0}; | |||
| bool bidirectional_{false}; | |||
| bool has_bias_{false}; | |||
| int64_t weight_size_{0}; | |||
| int64_t weight_h_size_{0}; | |||
| int64_t input_size_{0}; | |||
| int64_t hidden_size_{0}; | |||
| int64_t num_layers_{0}; | |||
| int64_t batch_size_{0}; | |||
| int64_t seq_len_{0}; | |||
| size_t reserve_size_{0}; | |||
| dnnl::memory::dims weights_dims_; | |||
| dnnl::memory::dims weights_h_dims_; | |||
| dnnl::memory::dims bias_dims_; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,10 +13,9 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/matmul_cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/matmul_cpu_kernel.h" | |||
| #include <utility> | |||
| #include "common/thread_pool.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/op_base.h" | |||
| @@ -26,8 +25,10 @@ | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace { | |||
| constexpr size_t kMatMulInputsNum = 2; | |||
| constexpr size_t kMatMulOutputsNum = 1; | |||
| const size_t kIndexOffset = 2; | |||
| } | |||
| } // namespace | |||
| void MatMulCPUKernel::InitTile() { | |||
| #ifdef ENABLE_AVX | |||
| @@ -47,13 +48,16 @@ void MatMulCPUKernel::InitTile() { | |||
| void MatMulCPUKernel::InitMatrixA(const float *src_ptr) { | |||
| const size_t size = param_.batch * param_.row_align_ * param_.deep_; | |||
| a_pack_ptr_ = new float[size]; | |||
| a_pack_ptr_ = new (std::nothrow) float[size]; | |||
| if (a_pack_ptr_ == nullptr) { | |||
| MS_LOG(EXCEPTION) << "MatMul new a_pack_ptr_ failed."; | |||
| } | |||
| if (vec_matmul_) { | |||
| const size_t count = size * sizeof(float); | |||
| if (memcpy_s(a_pack_ptr_, count, src_ptr, count) != EOK) { | |||
| FreeBuffer(); | |||
| MS_LOG(EXCEPTION) << "Memcpy a_pack_ptr_ failed."; | |||
| MS_LOG(EXCEPTION) << "MatMul memcpy a_pack_ptr_ failed."; | |||
| } | |||
| return; | |||
| } | |||
| @@ -88,14 +92,14 @@ void MatMulCPUKernel::InitMatrixB(const float *src_ptr) { | |||
| b_pack_ptr_ = new (std::nothrow) float[size]; | |||
| if (b_pack_ptr_ == nullptr) { | |||
| FreeBuffer(); | |||
| MS_LOG(EXCEPTION) << "Malloc b_pack_ptr_ failed"; | |||
| MS_LOG(EXCEPTION) << "MatMul new b_pack_ptr_ failed"; | |||
| } | |||
| if (vec_matmul_) { | |||
| if (param_.b_transpose_) { | |||
| const size_t count = size * sizeof(float); | |||
| if (memcpy_s(b_pack_ptr_, count, src_ptr, count) != EOK) { | |||
| FreeBuffer(); | |||
| MS_LOG(EXCEPTION) << "Memcpy b_pack_ptr_ failed."; | |||
| MS_LOG(EXCEPTION) << "MatMul memcpy b_pack_ptr_ failed."; | |||
| } | |||
| } else { | |||
| for (int i = 0; i < param_.batch; i++) { | |||
| @@ -169,6 +173,7 @@ void MatMulCPUKernel::InitX64Kernel(bool trans_a, bool trans_b, const std::vecto | |||
| void MatMulCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); | |||
| std::vector<size_t> a_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> b_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| std::vector<size_t> o_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| @@ -190,7 +195,7 @@ void MatMulCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| #endif | |||
| } | |||
| int MatMulCPUKernel::FloatRun(size_t task_id) { | |||
| int MatMulCPUKernel::FloatRun(size_t task_id) const { | |||
| size_t current_stride_oc = thread_stride_ * col_tile_; | |||
| if (IntToSize(param_.col_) <= task_id * current_stride_oc) { | |||
| return common::SUCCESS; | |||
| @@ -238,7 +243,7 @@ void MatMulCPUKernel::LaunchARM(const float *input_a, const float *input_b, floa | |||
| FreeBuffer(); | |||
| } | |||
| void MatMulCPUKernel::LaunchX64(const float *input_a, const float *input_b, float *output) { | |||
| void MatMulCPUKernel::LaunchX64(const float *input_a, const float *input_b, float *output) const { | |||
| dnnl_dim_t lda = (trans_a_ == TRANSPOSE_YES ? dim_m_ : dim_k_); | |||
| dnnl_dim_t ldb = (trans_b_ == TRANSPOSE_YES ? dim_k_ : dim_n_); | |||
| dnnl_dim_t ldc = dim_n_; | |||
| @@ -252,9 +257,8 @@ void MatMulCPUKernel::LaunchX64(const float *input_a, const float *input_b, floa | |||
| bool MatMulCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() < 2 || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "matmul error input output size!"; | |||
| } | |||
| CHECK_KERNEL_INPUTS_NUM(inputs.size(), kMatMulInputsNum, kernel_name_); | |||
| CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kMatMulOutputsNum, kernel_name_); | |||
| const auto input_a = reinterpret_cast<float *>(inputs[0]->addr); | |||
| const auto input_b = reinterpret_cast<float *>(inputs[1]->addr); | |||
| auto output = reinterpret_cast<float *>(outputs[0]->addr); | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * Copyright 2019-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MATMUL_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MATMUL_CPU_KERNEL_H_ | |||
| @@ -42,14 +43,12 @@ class MatMulCPUKernel : public MKLCPUKernel { | |||
| const std::vector<size_t> &o_shape); | |||
| void InitX64Kernel(bool trans_a, bool trans_b, const std::vector<size_t> &a_shape, const std::vector<size_t> &b_shape, | |||
| const std::vector<size_t> &o_shape); | |||
| void LaunchX64(const float *input_a, const float *input_b, float *output); | |||
| void LaunchX64(const float *input_a, const float *input_b, float *output) const; | |||
| void LaunchARM(const float *input_a, const float *input_b, float *output); | |||
| void ParallelRun(float *output); | |||
| int FloatRun(size_t task_id); | |||
| int FloatRun(size_t task_id) const; | |||
| void FreeBuffer(); | |||
| char trans_a_{TRANSPOSE_NO}; | |||
| char trans_b_{TRANSPOSE_NO}; | |||
| dnnl_dim_t dim_m_{0}; | |||
| dnnl_dim_t dim_n_{0}; | |||
| dnnl_dim_t dim_k_{0}; | |||
| @@ -62,6 +61,8 @@ class MatMulCPUKernel : public MKLCPUKernel { | |||
| size_t size_mat_a_{0}; | |||
| size_t size_mat_b_{0}; | |||
| size_t size_mat_o_{0}; | |||
| char trans_a_{TRANSPOSE_NO}; | |||
| char trans_b_{TRANSPOSE_NO}; | |||
| bool vec_matmul_{false}; | |||
| float *a_pack_ptr_{nullptr}; | |||
| float *b_pack_ptr_{nullptr}; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,9 +13,11 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| #include <vector> | |||
| #include <string> | |||
| #include <algorithm> | |||
| #include "utils/ms_utils.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| @@ -24,8 +26,10 @@ namespace kernel { | |||
| void MKLCPUKernel::GetPadding(const CNodePtr &kernel_node, const std::string &pad_mode, | |||
| const std::vector<size_t> &src_shape, const std::vector<size_t> &kernel_size, | |||
| const std::vector<int> &stride, std::vector<int> *padding_l, std::vector<int> *padding_r, | |||
| const std::vector<int> &dilation) { | |||
| const std::vector<int> &dilation) const { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| MS_EXCEPTION_IF_NULL(padding_l); | |||
| MS_EXCEPTION_IF_NULL(padding_r); | |||
| auto dim = src_shape.size(); | |||
| if (dim < 2) { | |||
| MS_LOG(EXCEPTION) << "Set pad only support src dim >= 2!"; | |||
| @@ -65,7 +69,7 @@ void MKLCPUKernel::GetPadding(const CNodePtr &kernel_node, const std::string &pa | |||
| } | |||
| bool MKLCPUKernel::BinaryBroadCast(std::vector<size_t> *src0_shape, std::vector<size_t> *src1_shape, | |||
| std::vector<size_t> *dst_shape) { | |||
| std::vector<size_t> *dst_shape) const { | |||
| MS_EXCEPTION_IF_NULL(src0_shape); | |||
| MS_EXCEPTION_IF_NULL(src1_shape); | |||
| MS_EXCEPTION_IF_NULL(dst_shape); | |||
| @@ -115,20 +119,19 @@ dnnl::memory::format_tag MKLCPUKernel::GetDefaultFormatTag(const dnnl::memory::d | |||
| dnnl::memory::format_tag::a, dnnl::memory::format_tag::ab, dnnl::memory::format_tag::abc, | |||
| dnnl::memory::format_tag::abcd, dnnl::memory::format_tag::abcde, dnnl::memory::format_tag::abcdef, | |||
| dnnl::memory::format_tag::abcdefg}; | |||
| auto rank = dims.size(); | |||
| size_t rank = dims.size(); | |||
| if (rank > tag_vec.size()) { | |||
| MS_LOG(EXCEPTION) << "The kernel does not support construct " << rank << "-D tensor dnnl memory format_tag."; | |||
| } | |||
| return tag_vec[rank - 1]; | |||
| } | |||
| dnnl::memory::desc MKLCPUKernel::GetDefaultMemDesc(const std::vector<size_t> &shape) { | |||
| dnnl::memory::desc MKLCPUKernel::GetDefaultMemDesc(const std::vector<size_t> &shape) const { | |||
| dnnl::memory::dims dims; | |||
| if (shape.size() == 0) { | |||
| dims.insert(dims.end(), 1); | |||
| if (shape.empty()) { | |||
| (void)dims.insert(dims.end(), 1); | |||
| } else { | |||
| dims.insert(dims.end(), shape.begin(), shape.end()); | |||
| (void)dims.insert(dims.end(), shape.begin(), shape.end()); | |||
| } | |||
| dnnl::memory::format_tag mem_tag = GetDefaultFormatTag(dims); | |||
| dnnl::memory::desc mem_desc(dims, dnnl::memory::data_type::f32, mem_tag); | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MKL_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MKL_CPU_KERNEL_H_ | |||
| @@ -33,21 +34,22 @@ class MKLCPUKernel : public CPUKernel { | |||
| protected: | |||
| bool BinaryBroadCast(std::vector<size_t> *src0_shape, std::vector<size_t> *src1_shape, | |||
| std::vector<size_t> *dst_shape); | |||
| std::vector<size_t> *dst_shape) const; | |||
| void GetPadding(const CNodePtr &kernel_node, const std::string &pad_mode, const std::vector<size_t> &src_shape, | |||
| const std::vector<size_t> &kernel_size, const std::vector<int> &stride, std::vector<int> *padding_l, | |||
| std::vector<int> *padding_r, const std::vector<int> &dilation); | |||
| std::vector<int> *padding_r, const std::vector<int> &dilation) const; | |||
| void AddArgument(int arg_key, const dnnl::memory::desc &mem_desc, bool alloc = false); | |||
| void SetArgumentHandle(int arg_key, void *ptr); | |||
| dnnl::memory::format_tag GetDefaultFormatTag(const dnnl::memory::dims &dims) const; | |||
| dnnl::memory::desc GetDefaultMemDesc(const std::vector<size_t> &shape); | |||
| dnnl::memory::desc GetDefaultMemDesc(const std::vector<size_t> &shape) const; | |||
| void ExecutePrimitive(); | |||
| std::unordered_map<int, dnnl::memory> arguments_; | |||
| std::shared_ptr<dnnl::primitive> primitive_{nullptr}; | |||
| inline dnnl::memory::desc formatted_md(const dnnl::memory::dims &dimensions, dnnl::memory::format_tag layout) { | |||
| return dnnl::memory::desc{{dimensions}, dnnl::memory::data_type::f32, layout}; | |||
| } | |||
| void Reorder(dnnl::memory *src_mem, dnnl::memory *dst_mem); | |||
| std::unordered_map<int, dnnl::memory> arguments_; | |||
| std::shared_ptr<dnnl::primitive> primitive_{nullptr}; | |||
| }; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,6 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "utils/log_adapter.h" | |||
| #include "dnnl.hpp" | |||
| @@ -33,6 +34,7 @@ dnnl::memory MKLKernelEngine::CreateMemory(const dnnl::memory::desc &mem_desc, b | |||
| return dnnl::memory(mem_desc, engine_, nullptr); | |||
| } | |||
| } | |||
| void MKLKernelEngine::Reorder(dnnl::memory *src_mem, dnnl::memory *dst_mem) { | |||
| dnnl::reorder(*src_mem, *dst_mem).execute(stream_, *src_mem, *dst_mem); | |||
| } | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -13,8 +13,10 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_MKL_KERNEL_ENGINE_H_ | |||
| #define MINDSPORE_MKL_KERNEL_ENGINE_H_ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MKL_KERNEL_ENGINE_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MKL_KERNEL_ENGINE_H_ | |||
| #include <cstdlib> | |||
| #include <algorithm> | |||
| #include <iostream> | |||
| @@ -46,10 +48,11 @@ class MKLKernelEngine { | |||
| private: | |||
| MKLKernelEngine() : engine_(dnnl::engine::kind::cpu, 0), stream_(engine_) {} | |||
| ~MKLKernelEngine() = default; | |||
| dnnl::engine engine_; | |||
| dnnl::stream stream_; | |||
| }; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_MKL_KERNEL_ENGINE_H_ | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MKL_KERNEL_ENGINE_H_ | |||
| @@ -1,65 +0,0 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mul_cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "utils/ms_utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void MulCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| std::vector<size_t> src0_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> src1_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| std::vector<size_t> dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| need_swap_ = BinaryBroadCast(&src0_shape, &src1_shape, &dst_shape); | |||
| dnnl::memory::desc src0_desc; | |||
| dnnl::memory::desc src1_desc; | |||
| if (need_swap_) { | |||
| src0_desc = GetDefaultMemDesc(src1_shape); | |||
| src1_desc = GetDefaultMemDesc(src0_shape); | |||
| } else { | |||
| src0_desc = GetDefaultMemDesc(src0_shape); | |||
| src1_desc = GetDefaultMemDesc(src1_shape); | |||
| } | |||
| dnnl::memory::desc dst_desc = GetDefaultMemDesc(dst_shape); | |||
| dnnl::binary::desc desc = dnnl::binary::desc(dnnl::algorithm::binary_mul, src0_desc, src1_desc, dst_desc); | |||
| auto prim_desc = dnnl::binary::primitive_desc(desc, MKLKernelEngine::Get().engine()); | |||
| primitive_ = std::make_shared<dnnl::binary>(prim_desc); | |||
| AddArgument(DNNL_ARG_SRC_0, src0_desc); | |||
| AddArgument(DNNL_ARG_SRC_1, src1_desc); | |||
| AddArgument(DNNL_ARG_DST, dst_desc); | |||
| } | |||
| bool MulCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.size() < 2 || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "mul error input output size!"; | |||
| } | |||
| if (need_swap_) { | |||
| SetArgumentHandle(DNNL_ARG_SRC_0, inputs[1]->addr); | |||
| SetArgumentHandle(DNNL_ARG_SRC_1, inputs[0]->addr); | |||
| } else { | |||
| SetArgumentHandle(DNNL_ARG_SRC_0, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_SRC_1, inputs[1]->addr); | |||
| } | |||
| SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr); | |||
| ExecutePrimitive(); | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,42 +0,0 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MUL_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MUL_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/arithmetic_cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class MulCPUKernel : public MKLCPUKernel { | |||
| public: | |||
| MulCPUKernel() = default; | |||
| ~MulCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| bool need_swap_{false}; | |||
| }; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MUL_CPU_KERNEL_H_ | |||
| @@ -39,7 +39,7 @@ void pad_input_shape(int *input_shape, int input_shape_len, int output_shape_len | |||
| } | |||
| } | |||
| #define BROADCAST_TO(type) \ | |||
| #define BROADCAST_TO_IMPL(type) \ | |||
| int broadcast_to_##type(const type *input, BroadcastShapeInfo *shape_info, type *output) { \ | |||
| if (input == NULL || output == NULL) { \ | |||
| return NNACL_NULL_PTR; \ | |||
| @@ -96,9 +96,9 @@ void pad_input_shape(int *input_shape, int input_shape_len, int output_shape_len | |||
| return NNACL_OK; \ | |||
| } | |||
| BROADCAST_TO(int) | |||
| BROADCAST_TO(float) | |||
| BROADCAST_TO(bool) | |||
| BROADCAST_TO_IMPL(int) | |||
| BROADCAST_TO_IMPL(float) | |||
| BROADCAST_TO_IMPL(bool) | |||
| #ifdef ENABLE_FP16 | |||
| BROADCAST_TO(float16_t) | |||
| BROADCAST_TO_IMPL(float16_t) | |||
| #endif | |||
| @@ -21,7 +21,7 @@ | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| #endif | |||
| #define BroadcastTo(type, input, shape_info, output) broadcast_to_##type(input, shape_info, output) | |||
| #define BROADCAST_TO(type, input, shape_info, output) broadcast_to_##type(input, shape_info, output) | |||
| int broadcast_to_int(const int *input, BroadcastShapeInfo *shape_info, int *output); | |||
| int broadcast_to_float(const float *input, BroadcastShapeInfo *shape_info, float *output); | |||
| int broadcast_to_bool(const bool *input, BroadcastShapeInfo *shape_info, bool *output); | |||
| @@ -59,17 +59,17 @@ int BroadcastToCPUKernel::Run() { | |||
| switch (data_type_) { | |||
| case kNumberTypeFloat32: | |||
| return BroadcastTo(float, reinterpret_cast<const float *>(input_data), &shape_info_, | |||
| reinterpret_cast<float *>(output_data)); | |||
| return BROADCAST_TO(float, reinterpret_cast<const float *>(input_data), &shape_info_, | |||
| reinterpret_cast<float *>(output_data)); | |||
| #ifdef ENABLE_FP16 | |||
| case kNumberTypeFloat16: | |||
| return BroadcastTo(float16_t, reinterpret_cast<const float16_t *>(input_data), &shape_info_, | |||
| reinterpret_cast<float16_t *>(output_data)); | |||
| return BROADCAST_TO(float16_t, reinterpret_cast<const float16_t *>(input_data), &shape_info_, | |||
| reinterpret_cast<float16_t *>(output_data)); | |||
| #endif | |||
| case kNumberTypeInt32: | |||
| case kNumberTypeInt: | |||
| return BroadcastTo(int, reinterpret_cast<const int *>(input_data), &shape_info_, | |||
| reinterpret_cast<int *>(output_data)); | |||
| return BROADCAST_TO(int, reinterpret_cast<const int *>(input_data), &shape_info_, | |||
| reinterpret_cast<int *>(output_data)); | |||
| default: | |||
| MS_LOG(ERROR) << "UnSupported data type: " << data_type_; | |||
| return RET_ERROR; | |||