| @@ -1,116 +1,116 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr size_t kBceInputNumWithWeight = 3; | |||
| template <typename T> | |||
| void BinaryCrossEntropyCpuKernel::LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss) { | |||
| if (input_size % 2 == 1) { | |||
| tmp_loss[0] += tmp_loss[input_size - 1]; | |||
| } | |||
| for (int stride = input_size / 2; stride > 0; stride = stride / 2) { | |||
| for (int i = 0; i < stride; i++) { | |||
| tmp_loss[i] += tmp_loss[i + stride]; | |||
| } | |||
| if (stride > 2 && stride % 2 == 1) { | |||
| tmp_loss[0] += tmp_loss[stride - 1]; | |||
| } | |||
| } | |||
| loss[0] += tmp_loss[0]; | |||
| if (reduction == 1) { | |||
| loss[0] /= static_cast<T>(input_size); | |||
| } | |||
| } | |||
// Element-wise binary cross entropy:
//   loss_i = -w_i * (y_i * log(x_i + eps) + (1 - y_i) * log(1 - x_i + eps))
// with w_i == 1 when no weight input was supplied. eps = 1e-12 keeps log()
// finite at x == 0 and x == 1.
// reduction_ == 0 ("none") writes per-element losses straight to the output;
// otherwise losses go to a scratch buffer that LaunchToScalar reduces.
// inputs: [0] predictions x, [1] targets y, [2] optional weight.
template <typename T>
void BinaryCrossEntropyCpuKernel::Launchkernel(const std::vector<AddressPtr> &inputs,
                                               const std::vector<AddressPtr> &workspace,
                                               const std::vector<AddressPtr> &outputs) {
  T *input_x = reinterpret_cast<T *>(inputs[0]->addr);
  T *input_y = reinterpret_cast<T *>(inputs[1]->addr);
  T *weight = nullptr;
  if (weight_defined_) {
    weight = reinterpret_cast<T *>(inputs[2]->addr);
  }
  T *loss = reinterpret_cast<T *>(outputs[0]->addr);
  std::vector<T> tmp_loss(input_size_);
  T epsilon = static_cast<T>(1e-12);
  T one = static_cast<T>(1);
  // Four near-identical loops: {none / reduce} x {weighted / unweighted}.
  // The branches are deliberately hoisted out of the per-element loop body.
  if (reduction_ == 0 && weight_defined_) {
    for (size_t i = 0; i < input_size_; i++) {
      T value =
        -weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      loss[i] = value;
    }
  } else if (reduction_ == 0 && (!weight_defined_)) {
    for (size_t i = 0; i < input_size_; i++) {
      T value = -(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      loss[i] = value;
    }
  } else if ((reduction_ != 0) && weight_defined_) {
    for (size_t i = 0; i < input_size_; i++) {
      T value =
        -weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      tmp_loss[i] = value;
    }
  } else {
    for (size_t i = 0; i < input_size_; i++) {
      T value = -(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      tmp_loss[i] = value;
    }
  }
  // Collapse the scratch losses to a scalar for "mean"/"sum".
  if (reduction_ != 0) {
    LaunchToScalar<T>(input_size_, reduction_, loss, tmp_loss.data());
  }
}
| bool BinaryCrossEntropyCpuKernel::Launch(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| if (input_size_ > 0) { | |||
| if (dtype_ == kNumberTypeFloat32) { | |||
| Launchkernel<float>(inputs, workspace, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat16) { | |||
| Launchkernel<float16>(inputs, workspace, outputs); | |||
| } | |||
| } | |||
| return true; | |||
| } | |||
// Caches shape and attribute information from the graph node.
// input_size_ becomes the total element count (it starts at 1 in the ctor).
// reduction_ mapping: 0 = "none", 1 = "mean", 2 = "sum"; "mean" is the ctor
// default, which is why it needs no explicit branch below.
void BinaryCrossEntropyCpuKernel::InitKernel(const CNodePtr &kernel_node) {
  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  for (size_t i = 0; i < input_shape.size(); i++) {
    input_size_ *= input_shape[i];
  }
  string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, "reduction");
  if (reduction == "none") {
    reduction_ = 0;
  } else if (reduction == "sum") {
    reduction_ = 2;
  }
  // A third input tensor means an element-wise weight was supplied.
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  weight_defined_ = (input_num == kBceInputNumWithWeight);
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
}
| } // namespace kernel | |||
| } // namespace mindspore | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/binary_cross_entropy_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr size_t kBceInputNumWithWeight = 3; | |||
| template <typename T> | |||
| void BinaryCrossEntropyCpuKernel::LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss) { | |||
| if (input_size % 2 == 1) { | |||
| tmp_loss[0] += tmp_loss[input_size - 1]; | |||
| } | |||
| for (int stride = input_size / 2; stride > 0; stride = stride / 2) { | |||
| for (int i = 0; i < stride; i++) { | |||
| tmp_loss[i] += tmp_loss[i + stride]; | |||
| } | |||
| if (stride > 2 && stride % 2 == 1) { | |||
| tmp_loss[0] += tmp_loss[stride - 1]; | |||
| } | |||
| } | |||
| loss[0] += tmp_loss[0]; | |||
| if (reduction == 1) { | |||
| loss[0] /= static_cast<T>(input_size); | |||
| } | |||
| } | |||
// Element-wise binary cross entropy:
//   loss_i = -w_i * (y_i * log(x_i + eps) + (1 - y_i) * log(1 - x_i + eps))
// with w_i == 1 when no weight input was supplied. eps = 1e-12 keeps log()
// finite at x == 0 and x == 1.
// reduction_ == 0 ("none") writes per-element losses straight to the output;
// otherwise losses go to a scratch buffer that LaunchToScalar reduces.
// inputs: [0] predictions x, [1] targets y, [2] optional weight.
template <typename T>
void BinaryCrossEntropyCpuKernel::Launchkernel(const std::vector<AddressPtr> &inputs,
                                               const std::vector<AddressPtr> &workspace,
                                               const std::vector<AddressPtr> &outputs) {
  T *input_x = reinterpret_cast<T *>(inputs[0]->addr);
  T *input_y = reinterpret_cast<T *>(inputs[1]->addr);
  T *weight = nullptr;
  if (weight_defined_) {
    weight = reinterpret_cast<T *>(inputs[2]->addr);
  }
  T *loss = reinterpret_cast<T *>(outputs[0]->addr);
  std::vector<T> tmp_loss(input_size_);
  T epsilon = static_cast<T>(1e-12);
  T one = static_cast<T>(1);
  // Four near-identical loops: {none / reduce} x {weighted / unweighted}.
  // The branches are deliberately hoisted out of the per-element loop body.
  if (reduction_ == 0 && weight_defined_) {
    for (size_t i = 0; i < input_size_; i++) {
      T value =
        -weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      loss[i] = value;
    }
  } else if (reduction_ == 0 && (!weight_defined_)) {
    for (size_t i = 0; i < input_size_; i++) {
      T value = -(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      loss[i] = value;
    }
  } else if ((reduction_ != 0) && weight_defined_) {
    for (size_t i = 0; i < input_size_; i++) {
      T value =
        -weight[i] * (input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      tmp_loss[i] = value;
    }
  } else {
    for (size_t i = 0; i < input_size_; i++) {
      T value = -(input_y[i] * log(input_x[i] + epsilon) + (one - input_y[i]) * log(one - input_x[i] + epsilon));
      tmp_loss[i] = value;
    }
  }
  // Collapse the scratch losses to a scalar for "mean"/"sum".
  if (reduction_ != 0) {
    LaunchToScalar<T>(input_size_, reduction_, loss, tmp_loss.data());
  }
}
| bool BinaryCrossEntropyCpuKernel::Launch(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| if (input_size_ > 0) { | |||
| if (dtype_ == kNumberTypeFloat32) { | |||
| Launchkernel<float>(inputs, workspace, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat16) { | |||
| Launchkernel<float16>(inputs, workspace, outputs); | |||
| } | |||
| } | |||
| return true; | |||
| } | |||
// Caches shape and attribute information from the graph node.
// input_size_ becomes the total element count (it starts at 1 in the ctor).
// reduction_ mapping: 0 = "none", 1 = "mean", 2 = "sum"; "mean" is the ctor
// default, which is why it needs no explicit branch below.
void BinaryCrossEntropyCpuKernel::InitKernel(const CNodePtr &kernel_node) {
  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  for (size_t i = 0; i < input_shape.size(); i++) {
    input_size_ *= input_shape[i];
  }
  string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, "reduction");
  if (reduction == "none") {
    reduction_ = 0;
  } else if (reduction == "sum") {
    reduction_ = 2;
  }
  // A third input tensor means an element-wise weight was supplied.
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  weight_defined_ = (input_num == kBceInputNumWithWeight);
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
}
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,71 +1,71 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H | |||
| #include <vector> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU implementation of the BinaryCrossEntropy forward operator.
// Inputs: predictions x, targets y, and an optional element-wise weight.
// The node's "reduction" attribute selects the output form:
// "none" (per-element), "mean" (ctor default) or "sum" (scalar).
class BinaryCrossEntropyCpuKernel : public CPUKernel {
 public:
  BinaryCrossEntropyCpuKernel() : input_size_(1), reduction_(1), weight_defined_(false) {}
  ~BinaryCrossEntropyCpuKernel() override = default;
  // Reads shape, dtype and the "reduction" attribute from the graph node.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Dispatches the typed implementation (float / float16).
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 private:
  // Pairwise-reduces tmp_loss into loss[0]; reduction == 1 also divides by count.
  template <typename T>
  void LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss);
  // Element-wise BCE; writes per-element losses or a reduced scalar.
  template <typename T>
  void Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
                    const std::vector<AddressPtr> &outputs);
  TypeId dtype_{kTypeUnknown};
  size_t input_size_;  // total number of elements in the input tensor
  int reduction_;      // 0: none, 1: mean (default), 2: sum
  bool weight_defined_;  // true: there are 3 inputs, false: there are 2 inputs(no [weight])
};
// Factory registrations: one 3-input form (x, y, weight) and one 2-input
// form (x, y — no weight) for each supported dtype (float16, float32).
MS_REG_CPU_KERNEL(BinaryCrossEntropy,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  BinaryCrossEntropyCpuKernel);
MS_REG_CPU_KERNEL(BinaryCrossEntropy,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  BinaryCrossEntropyCpuKernel);
// Weightless variants.
MS_REG_CPU_KERNEL(
  BinaryCrossEntropy,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BinaryCrossEntropyCpuKernel);
MS_REG_CPU_KERNEL(
  BinaryCrossEntropy,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BinaryCrossEntropyCpuKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H | |||
| #include <vector> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU implementation of the BinaryCrossEntropy forward operator.
// Inputs: predictions x, targets y, and an optional element-wise weight.
// The node's "reduction" attribute selects the output form:
// "none" (per-element), "mean" (ctor default) or "sum" (scalar).
class BinaryCrossEntropyCpuKernel : public CPUKernel {
 public:
  BinaryCrossEntropyCpuKernel() : input_size_(1), reduction_(1), weight_defined_(false) {}
  ~BinaryCrossEntropyCpuKernel() override = default;
  // Reads shape, dtype and the "reduction" attribute from the graph node.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Dispatches the typed implementation (float / float16).
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 private:
  // Pairwise-reduces tmp_loss into loss[0]; reduction == 1 also divides by count.
  template <typename T>
  void LaunchToScalar(const int &input_size, const int &reduction, T *loss, T *tmp_loss);
  // Element-wise BCE; writes per-element losses or a reduced scalar.
  template <typename T>
  void Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
                    const std::vector<AddressPtr> &outputs);
  TypeId dtype_{kTypeUnknown};
  size_t input_size_;  // total number of elements in the input tensor
  int reduction_;      // 0: none, 1: mean (default), 2: sum
  bool weight_defined_;  // true: there are 3 inputs, false: there are 2 inputs(no [weight])
};
// Factory registrations: one 3-input form (x, y, weight) and one 2-input
// form (x, y — no weight) for each supported dtype (float16, float32).
MS_REG_CPU_KERNEL(BinaryCrossEntropy,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  BinaryCrossEntropyCpuKernel);
MS_REG_CPU_KERNEL(BinaryCrossEntropy,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  BinaryCrossEntropyCpuKernel);
// Weightless variants.
MS_REG_CPU_KERNEL(
  BinaryCrossEntropy,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  BinaryCrossEntropyCpuKernel);
MS_REG_CPU_KERNEL(
  BinaryCrossEntropy,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  BinaryCrossEntropyCpuKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_KERNEL_H | |||
| @@ -1,102 +1,102 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/binary_cross_entropy_grad_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr size_t kBceGradInputNumWithWeight = 4; | |||
// Gradient of binary cross entropy with respect to the prediction x:
//   dBCE/dx_i = w_i * (x_i - y_i) / (x_i * (1 - x_i))
// The denominator is clamped to eps = 1e-12 to avoid division by zero at
// x == 0 or x == 1. For "none" reduction the incoming dloss is per-element;
// for "mean"/"sum" it is a scalar ("mean" additionally divides it by the
// element count, mirroring the forward mean).
// inputs: [0] x, [1] y, [2] dloss, [3] optional weight.
template <typename T>
void BinaryCrossEntropyGradCpuKernel::Launchkernel(const std::vector<AddressPtr> &inputs,
                                                   const std::vector<AddressPtr> &outputs) {
  T *input_x = reinterpret_cast<T *>(inputs[0]->addr);
  T *input_y = reinterpret_cast<T *>(inputs[1]->addr);
  T *dloss = reinterpret_cast<T *>(inputs[2]->addr);
  T *weight = nullptr;
  if (weight_defined_) {
    weight = reinterpret_cast<T *>(inputs[3]->addr);
  }
  T *dx = reinterpret_cast<T *>(outputs[0]->addr);
  T epsilon = static_cast<T>(1e-12);
  T one = static_cast<T>(1);
  if (reduction_ == 0) {
    // "none": each output element scales by its own incoming gradient.
    if (weight_defined_) {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = weight[i] * (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss[i];
      }
    } else {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss[i];
      }
    }
  } else {
    // "mean"/"sum": a single scalar gradient is broadcast to every element.
    T dloss1 = dloss[0];
    if (reduction_ == 1) {
      dloss1 = dloss[0] / static_cast<T>(input_size_);
    }
    if (weight_defined_) {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = weight[i] * (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss1;
      }
    } else {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss1;
      }
    }
  }
}
| bool BinaryCrossEntropyGradCpuKernel::Launch(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| if (input_size_ > 0) { | |||
| if (dtype_ == kNumberTypeFloat32) { | |||
| Launchkernel<float>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat16) { | |||
| Launchkernel<float16>(inputs, outputs); | |||
| } | |||
| } | |||
| return true; | |||
| } | |||
// Caches shape and attribute information from the graph node.
// input_size_ becomes the total element count (it starts at 1 in the ctor).
// reduction_ mapping: 0 = "none", 1 = "mean", 2 = "sum"; "mean" is the ctor
// default, which is why it needs no explicit branch below.
void BinaryCrossEntropyGradCpuKernel::InitKernel(const CNodePtr &kernel_node) {
  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  for (size_t i = 0; i < input_shape.size(); i++) {
    input_size_ *= input_shape[i];
  }
  string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, "reduction");
  if (reduction == "none") {
    reduction_ = 0;
  } else if (reduction == "sum") {
    reduction_ = 2;
  }
  // A fourth input tensor means an element-wise weight was supplied.
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  weight_defined_ = (input_num == kBceGradInputNumWithWeight);
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
}
| } // namespace kernel | |||
| } // namespace mindspore | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/binary_cross_entropy_grad_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr size_t kBceGradInputNumWithWeight = 4; | |||
// Gradient of binary cross entropy with respect to the prediction x:
//   dBCE/dx_i = w_i * (x_i - y_i) / (x_i * (1 - x_i))
// The denominator is clamped to eps = 1e-12 to avoid division by zero at
// x == 0 or x == 1. For "none" reduction the incoming dloss is per-element;
// for "mean"/"sum" it is a scalar ("mean" additionally divides it by the
// element count, mirroring the forward mean).
// inputs: [0] x, [1] y, [2] dloss, [3] optional weight.
template <typename T>
void BinaryCrossEntropyGradCpuKernel::Launchkernel(const std::vector<AddressPtr> &inputs,
                                                   const std::vector<AddressPtr> &outputs) {
  T *input_x = reinterpret_cast<T *>(inputs[0]->addr);
  T *input_y = reinterpret_cast<T *>(inputs[1]->addr);
  T *dloss = reinterpret_cast<T *>(inputs[2]->addr);
  T *weight = nullptr;
  if (weight_defined_) {
    weight = reinterpret_cast<T *>(inputs[3]->addr);
  }
  T *dx = reinterpret_cast<T *>(outputs[0]->addr);
  T epsilon = static_cast<T>(1e-12);
  T one = static_cast<T>(1);
  if (reduction_ == 0) {
    // "none": each output element scales by its own incoming gradient.
    if (weight_defined_) {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = weight[i] * (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss[i];
      }
    } else {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss[i];
      }
    }
  } else {
    // "mean"/"sum": a single scalar gradient is broadcast to every element.
    T dloss1 = dloss[0];
    if (reduction_ == 1) {
      dloss1 = dloss[0] / static_cast<T>(input_size_);
    }
    if (weight_defined_) {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = weight[i] * (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss1;
      }
    } else {
      for (size_t i = 0; i < input_size_; i++) {
        T denominator = ((input_x[i] * (one - input_x[i])) > epsilon) ? (input_x[i] * (one - input_x[i])) : epsilon;
        T value = (input_x[i] - input_y[i]) / denominator;
        dx[i] = value * dloss1;
      }
    }
  }
}
| bool BinaryCrossEntropyGradCpuKernel::Launch(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| if (input_size_ > 0) { | |||
| if (dtype_ == kNumberTypeFloat32) { | |||
| Launchkernel<float>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat16) { | |||
| Launchkernel<float16>(inputs, outputs); | |||
| } | |||
| } | |||
| return true; | |||
| } | |||
// Caches shape and attribute information from the graph node.
// input_size_ becomes the total element count (it starts at 1 in the ctor).
// reduction_ mapping: 0 = "none", 1 = "mean", 2 = "sum"; "mean" is the ctor
// default, which is why it needs no explicit branch below.
void BinaryCrossEntropyGradCpuKernel::InitKernel(const CNodePtr &kernel_node) {
  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
  for (size_t i = 0; i < input_shape.size(); i++) {
    input_size_ *= input_shape[i];
  }
  string reduction = AnfAlgo::GetNodeAttr<string>(kernel_node, "reduction");
  if (reduction == "none") {
    reduction_ = 0;
  } else if (reduction == "sum") {
    reduction_ = 2;
  }
  // A fourth input tensor means an element-wise weight was supplied.
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  weight_defined_ = (input_num == kBceGradInputNumWithWeight);
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
}
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,76 +1,76 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H | |||
| #include <vector> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU implementation of the BinaryCrossEntropy backward operator.
// Inputs: predictions x, targets y, incoming gradient dloss, and an optional
// element-wise weight. The "reduction" node attribute must match the forward
// op: "none" (per-element dloss), "mean" (ctor default) or "sum" (scalar dloss).
class BinaryCrossEntropyGradCpuKernel : public CPUKernel {
 public:
  BinaryCrossEntropyGradCpuKernel() : input_size_(1), reduction_(1), weight_defined_(false) {}
  ~BinaryCrossEntropyGradCpuKernel() override = default;
  // Reads shape, dtype and the "reduction" attribute from the graph node.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Dispatches the typed implementation (float / float16).
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 private:
  // Element-wise BCE gradient; writes dx for every element.
  template <typename T>
  void Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
  TypeId dtype_{kTypeUnknown};
  size_t input_size_;  // total number of elements in the input tensor
  int reduction_;      // 0: none, 1: mean (default), 2: sum
  bool weight_defined_;  // true: there are 4 inputs, false: there are 3 inputs(no [weight])
};
// Factory registrations: one 4-input form (x, y, dloss, weight) and one
// 3-input form (x, y, dloss — no weight) for each supported dtype.
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  BinaryCrossEntropyGradCpuKernel);
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  BinaryCrossEntropyGradCpuKernel);
// Weightless variants.
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  BinaryCrossEntropyGradCpuKernel);
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  BinaryCrossEntropyGradCpuKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H | |||
| #include <vector> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU implementation of the BinaryCrossEntropy backward operator.
// Inputs: predictions x, targets y, incoming gradient dloss, and an optional
// element-wise weight. The "reduction" node attribute must match the forward
// op: "none" (per-element dloss), "mean" (ctor default) or "sum" (scalar dloss).
class BinaryCrossEntropyGradCpuKernel : public CPUKernel {
 public:
  BinaryCrossEntropyGradCpuKernel() : input_size_(1), reduction_(1), weight_defined_(false) {}
  ~BinaryCrossEntropyGradCpuKernel() override = default;
  // Reads shape, dtype and the "reduction" attribute from the graph node.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Dispatches the typed implementation (float / float16).
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 private:
  // Element-wise BCE gradient; writes dx for every element.
  template <typename T>
  void Launchkernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
  TypeId dtype_{kTypeUnknown};
  size_t input_size_;  // total number of elements in the input tensor
  int reduction_;      // 0: none, 1: mean (default), 2: sum
  bool weight_defined_;  // true: there are 4 inputs, false: there are 3 inputs(no [weight])
};
// Factory registrations: one 4-input form (x, y, dloss, weight) and one
// 3-input form (x, y, dloss — no weight) for each supported dtype.
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  BinaryCrossEntropyGradCpuKernel);
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  BinaryCrossEntropyGradCpuKernel);
// Weightless variants.
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  BinaryCrossEntropyGradCpuKernel);
MS_REG_CPU_KERNEL(BinaryCrossEntropyGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  BinaryCrossEntropyGradCpuKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_NN_BINARY_CROSS_ENTROPY_GRAD_KERNEL_H | |||
| @@ -1,271 +1,271 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include <algorithm> | |||
| #include <utility> | |||
| #include "common/thread_pool.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// Pre-computes the byte size of every input and output tensor of the node
// and records them in input_size_list_ / output_size_list_ for buffer
// allocation. An empty shape (scalar) contributes one element's worth of
// bytes, and the std::max guard keeps a zero dimension from yielding a
// 0-byte buffer.
// NOTE(review): std::accumulate / std::multiplies are used below but this
// file includes neither <numeric> nor <functional> — presumably they arrive
// transitively via cpu_kernel.h; confirm.
void CPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  for (size_t input_index = 0; input_index < input_num; ++input_index) {
    TypeId type_id = AnfAlgo::GetInputDeviceDataType(kernel_node, input_index);
    size_t type_size = GetTypeByte(TypeIdToType(type_id));
    std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, input_index);
    // Seeding accumulate with type_size folds the per-element size into the
    // product, giving bytes directly.
    size_t tensor_size =
      shape.empty() ? type_size : std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
    tensor_size = std::max(tensor_size, type_size);
    input_size_list_.emplace_back(tensor_size);
  }
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  for (size_t output_index = 0; output_index < output_num; ++output_index) {
    TypeId type_id = AnfAlgo::GetOutputDeviceDataType(kernel_node, output_index);
    size_t type_size = GetTypeByte(TypeIdToType(type_id));
    std::vector<size_t> shape = AnfAlgo::GetOutputDeviceShape(kernel_node, output_index);
    size_t tensor_size =
      shape.empty() ? type_size : std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
    tensor_size = std::max(tensor_size, type_size);
    output_size_list_.emplace_back(tensor_size);
  }
}
// Template method: kernel-specific setup first (virtual InitKernel), then the
// generic input/output byte-size bookkeeping shared by all CPU kernels.
void CPUKernel::Init(const CNodePtr &kernel_node) {
  InitKernel(kernel_node);
  InitInputOutputSize(kernel_node);
}
| void CPUKernelUtils::ExpandDimsTo4(std::vector<size_t> *shape) { | |||
| auto len = shape->size(); | |||
| if (len < 4) { | |||
| for (size_t i = 0; i < 4 - len; ++i) { | |||
| shape->insert(shape->begin(), 1); | |||
| } | |||
| } | |||
| } | |||
| size_t CPUKernelUtils::CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2, | |||
| size_t dim3) { | |||
| size_t offset = dim0 * shape[1] * shape[2] * shape[3] + dim1 * shape[2] * shape[3] + dim2 * shape[3] + dim3; | |||
| return offset; | |||
| } | |||
| size_t CPUKernelUtils::GetElementNumOnAxis(const std::vector<size_t> &shape, int axis) { | |||
| if (axis < 0) { | |||
| axis = axis + SizeToInt(shape.size()); | |||
| } | |||
| size_t result = 1; | |||
| for (int j = 3; j > axis; --j) { | |||
| result *= shape[j]; | |||
| } | |||
| return result; | |||
| } | |||
| void CPUKernelUtils::GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num) { | |||
| size_t accumulation = 1; | |||
| element_num->emplace_back(1); | |||
| for (size_t i = shape.size() - 1; i > 0; --i) { | |||
| accumulation *= shape[i]; | |||
| element_num->emplace_back(accumulation); | |||
| } | |||
| std::reverse(element_num->begin(), element_num->end()); | |||
| } | |||
| void CPUKernelUtils::ParallelFor(const CTask &task, size_t count) { | |||
| auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum(); | |||
| const float block_size = 128.0; | |||
| size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num; | |||
| std::vector<common::Task> tasks; | |||
| size_t start = 0; | |||
| size_t once_compute_size = (count + thread_num - 1) / thread_num; | |||
| while (start < count) { | |||
| size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size); | |||
| auto block = [&, start, end]() { | |||
| task(start, end); | |||
| return common::SUCCESS; | |||
| }; | |||
| tasks.emplace_back(block); | |||
| start += once_compute_size; | |||
| } | |||
| common::ThreadPool::GetInstance().SyncRun(tasks); | |||
| } | |||
| std::vector<size_t> CPUKernelUtils::FlatShapeByAxis(const std::vector<size_t> &shape, int axis) { | |||
| if (axis < 0) { | |||
| axis = axis + SizeToInt(shape.size()); | |||
| } | |||
| size_t dim_row = 1; | |||
| size_t dim_col = 1; | |||
| std::vector<size_t> flat_shape; | |||
| for (size_t i = 0; i < shape.size(); ++i) { | |||
| if (SizeToInt(i) < axis) { | |||
| dim_row *= shape[i]; | |||
| } else { | |||
| dim_col *= shape[i]; | |||
| } | |||
| } | |||
| flat_shape.push_back(dim_row); | |||
| flat_shape.push_back(dim_col); | |||
| return flat_shape; | |||
| } | |||
// Builds an iterator that walks two (possibly broadcast) input tensors in
// lock-step with their broadcast output: the inputs are padded with leading
// 1s to the output rank, then per-dimension strides are derived (stride 0 on
// broadcast dimensions).  Order matters: BroadcastShape() must run before
// InitStrides(), which reads the padded shapes.
BroadcastIterator::BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
                                     std::vector<size_t> output_shape)
    : input_shape_a_(std::move(input_shape_a)),
      input_shape_b_(std::move(input_shape_b)),
      output_shape_(std::move(output_shape)) {
  output_dimension_ = SizeToInt(output_shape_.size());  // Assign dimension to int for iterator
  BroadcastShape();
  // Allocate strides memory
  input_strides_a_.resize(output_dimension_);
  input_strides_b_.resize(output_dimension_);
  input_back_strides_a_.resize(output_dimension_);
  input_back_strides_b_.resize(output_dimension_);
  coordinates_.resize(output_dimension_);
  InitStrides();
}
| void BroadcastIterator::SetPos(size_t pos) { | |||
| for (int i = output_dimension_ - 1; i >= 0 && pos != 0; --i) { | |||
| coordinates_[i] = pos % output_shape_[i]; | |||
| input_pos_[0] += coordinates_[i] * input_strides_a_[i]; | |||
| input_pos_[1] += coordinates_[i] * input_strides_b_[i]; | |||
| pos /= output_shape_[i]; | |||
| } | |||
| } | |||
// Advances to the next output element: an odometer-style increment over the
// output coordinates.  Input offsets are updated incrementally — stepping
// forward along a dimension adds its stride, wrapping a dimension back to 0
// subtracts the pre-computed back-stride for that dimension.
void BroadcastIterator::GenNextPos() {
  // Calculate output next coordinate
  for (int i = output_dimension_ - 1; i >= 0; --i) {
    if (coordinates_[i] + 1 == output_shape_[i]) {
      coordinates_[i] = 0;
      input_pos_[0] -= input_back_strides_a_[i];
      input_pos_[1] -= input_back_strides_b_[i];
    } else {
      ++coordinates_[i];
      input_pos_[0] += input_strides_a_[i];
      input_pos_[1] += input_strides_b_[i];
      break;  // no carry into the next dimension
    }
  }
}
| void BroadcastIterator::BroadcastShape() { | |||
| int input_dimension_a = input_shape_a_.size(); | |||
| if (input_dimension_a < output_dimension_) { | |||
| input_shape_a_.insert(input_shape_a_.begin(), output_dimension_ - input_dimension_a, 1); | |||
| } | |||
| int input_dimension_b = input_shape_b_.size(); | |||
| if (input_dimension_b < output_dimension_) { | |||
| input_shape_b_.insert(input_shape_b_.begin(), output_dimension_ - input_dimension_b, 1); | |||
| } | |||
| } | |||
| void BroadcastIterator::InitStrides() { | |||
| input_strides_a_[output_dimension_ - 1] = 1; | |||
| input_strides_b_[output_dimension_ - 1] = 1; | |||
| for (int i = output_dimension_ - 2; i >= 0; --i) { | |||
| input_strides_a_[i] = input_shape_a_[i + 1] * input_strides_a_[i + 1]; | |||
| input_strides_b_[i] = input_shape_b_[i + 1] * input_strides_b_[i + 1]; | |||
| input_back_strides_a_[i + 1] = (input_shape_a_[i + 1] - 1) * input_strides_a_[i + 1]; | |||
| input_back_strides_b_[i + 1] = (input_shape_b_[i + 1] - 1) * input_strides_b_[i + 1]; | |||
| } | |||
| // Update strides for broadcast | |||
| // While the axis value is 1, the stride is 0 | |||
| std::transform(input_strides_a_.begin(), input_strides_a_.end(), input_shape_a_.begin(), input_strides_a_.begin(), | |||
| [](const auto &a, const auto &b) { return b == 1 ? 0 : a; }); | |||
| std::transform(input_strides_b_.begin(), input_strides_b_.end(), input_shape_b_.begin(), input_strides_b_.begin(), | |||
| [](const auto &a, const auto &b) { return b == 1 ? 0 : a; }); | |||
| } | |||
// Iterates in the order of the TRANSPOSED (output) shape while producing
// linear positions into the ORIGINAL input buffer.
// `output_shape` is `input_shape` permuted by `axes`; `axes` is the
// permutation applied.
TransposeIterator::TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes,
                                     const std::vector<size_t> &input_shape)
    : shape_(std::move(output_shape)), axes_(std::move(axes)) {
  // Row-major strides of the original input shape.
  dimension_ = shape_.size();
  std::vector<uint32_t> strides(dimension_, 1);
  for (int i = dimension_ - 2; i >= 0; --i) {
    strides[i] = input_shape[i + 1] * strides[i + 1];
  }
  // Permute the strides into output order and pre-compute back strides.
  strides_.resize(dimension_);
  back_strides_.resize(dimension_);
  for (int i = dimension_ - 1; i >= 0; --i) {
    strides_[i] = strides[axes_[i]];
    back_strides_[i] = (shape_[i] - 1) * strides_[i];
  }
  // Coordinates are filled in later by SetPos / GenNextPos.
  coordinates_.resize(dimension_);
}
| void TransposeIterator::SetPos(size_t pos) { | |||
| for (int i = dimension_ - 1; i >= 0 && pos != 0; --i) { | |||
| coordinates_[i] = pos % shape_[i]; | |||
| pos_ += coordinates_[i] * strides_[i]; | |||
| pos /= shape_[i]; | |||
| } | |||
| } | |||
// Odometer-style increment over the transposed coordinates; pos_ tracks the
// corresponding input position incrementally (add a stride when stepping
// forward, subtract the back-stride when a dimension wraps to 0).
void TransposeIterator::GenNextPos() {
  for (int i = dimension_ - 1; i >= 0; --i) {
    if (coordinates_[i] + 1 == shape_[i]) {
      coordinates_[i] = 0;
      pos_ -= back_strides_[i];
    } else {
      coordinates_[i]++;
      pos_ += strides_[i];
      break;  // no carry into the next dimension
    }
  }
}
| std::vector<size_t> CPUKernelUtils::GetBroadcastShape(const std::vector<size_t> &x, const std::vector<size_t> &y) { | |||
| size_t x_len = x.size(); | |||
| size_t y_len = y.size(); | |||
| size_t length = x_len < y_len ? x_len : y_len; | |||
| std::vector<size_t> broadcast_shape; | |||
| std::vector<size_t> broadcast_shape_back; | |||
| for (int i = -length; i < 0; ++i) { | |||
| if (x[x_len + i] == 1) { | |||
| broadcast_shape_back.push_back(y[y_len + i]); | |||
| } else if (y[y_len + i] == 1) { | |||
| broadcast_shape_back.push_back(x[x_len + i]); | |||
| } else if (x[x_len + i] == y[y_len + i]) { | |||
| broadcast_shape_back.push_back(x[x_len + i]); | |||
| } | |||
| } | |||
| if (length == x_len) { | |||
| for (size_t i = 0; i < y_len - length; ++i) { | |||
| broadcast_shape.push_back(y[i]); | |||
| } | |||
| } else { | |||
| for (size_t i = 0; i < x_len - length; ++i) { | |||
| broadcast_shape.push_back(x[i]); | |||
| } | |||
| } | |||
| for (size_t i = 0; i < length; ++i) { | |||
| broadcast_shape.push_back(broadcast_shape_back[i]); | |||
| } | |||
| return broadcast_shape; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include <algorithm> | |||
| #include <utility> | |||
| #include "common/thread_pool.h" | |||
namespace mindspore {
namespace kernel {
// NOTE(review): everything below is a byte-for-byte duplicate of the
// cpu_kernel.cc definitions that appear earlier in this extracted chunk
// (diff-view residue).  Kept unchanged here; the two copies must be kept in
// sync or deduplicated.
// Pre-computes the byte size of every input/output tensor of the node.
void CPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  for (size_t input_index = 0; input_index < input_num; ++input_index) {
    TypeId type_id = AnfAlgo::GetInputDeviceDataType(kernel_node, input_index);
    size_t type_size = GetTypeByte(TypeIdToType(type_id));
    std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, input_index);
    // Scalars (empty shape) still occupy one element; never report 0 bytes.
    size_t tensor_size =
      shape.empty() ? type_size : std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
    tensor_size = std::max(tensor_size, type_size);
    input_size_list_.emplace_back(tensor_size);
  }
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  for (size_t output_index = 0; output_index < output_num; ++output_index) {
    TypeId type_id = AnfAlgo::GetOutputDeviceDataType(kernel_node, output_index);
    size_t type_size = GetTypeByte(TypeIdToType(type_id));
    std::vector<size_t> shape = AnfAlgo::GetOutputDeviceShape(kernel_node, output_index);
    size_t tensor_size =
      shape.empty() ? type_size : std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>());
    tensor_size = std::max(tensor_size, type_size);
    output_size_list_.emplace_back(tensor_size);
  }
}
// Kernel-specific setup first, then generic size bookkeeping.
void CPUKernel::Init(const CNodePtr &kernel_node) {
  InitKernel(kernel_node);
  InitInputOutputSize(kernel_node);
}
// Left-pads *shape with 1s until it is 4-D.
void CPUKernelUtils::ExpandDimsTo4(std::vector<size_t> *shape) {
  auto len = shape->size();
  if (len < 4) {
    for (size_t i = 0; i < 4 - len; ++i) {
      shape->insert(shape->begin(), 1);
    }
  }
}
// Flattens a 4-D coordinate into a linear row-major offset.
size_t CPUKernelUtils::CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2,
                                  size_t dim3) {
  size_t offset = dim0 * shape[1] * shape[2] * shape[3] + dim1 * shape[2] * shape[3] + dim2 * shape[3] + dim3;
  return offset;
}
// Product of the dimensions after `axis` (j starts at 3: assumes a 4-D shape).
size_t CPUKernelUtils::GetElementNumOnAxis(const std::vector<size_t> &shape, int axis) {
  if (axis < 0) {
    axis = axis + SizeToInt(shape.size());
  }
  size_t result = 1;
  for (int j = 3; j > axis; --j) {
    result *= shape[j];
  }
  return result;
}
// Row-major element strides for every dimension, in shape order.
// NOTE(review): underflows its unsigned loop index for an empty shape.
void CPUKernelUtils::GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num) {
  size_t accumulation = 1;
  element_num->emplace_back(1);
  for (size_t i = shape.size() - 1; i > 0; --i) {
    accumulation *= shape[i];
    element_num->emplace_back(accumulation);
  }
  std::reverse(element_num->begin(), element_num->end());
}
// Runs `task` over [0, count) in ~128-element chunks on the shared pool.
// NOTE(review): divides by zero when count == 0 (thread_num becomes 0).
void CPUKernelUtils::ParallelFor(const CTask &task, size_t count) {
  auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
  const float block_size = 128.0;
  size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
  std::vector<common::Task> tasks;
  size_t start = 0;
  size_t once_compute_size = (count + thread_num - 1) / thread_num;
  while (start < count) {
    size_t end = (start + once_compute_size) > count ? count : (start + once_compute_size);
    auto block = [&, start, end]() {
      task(start, end);
      return common::SUCCESS;
    };
    tasks.emplace_back(block);
    start += once_compute_size;
  }
  common::ThreadPool::GetInstance().SyncRun(tasks);
}
// Collapses `shape` to {rows, cols} around `axis` (negative axis counts from the end).
std::vector<size_t> CPUKernelUtils::FlatShapeByAxis(const std::vector<size_t> &shape, int axis) {
  if (axis < 0) {
    axis = axis + SizeToInt(shape.size());
  }
  size_t dim_row = 1;
  size_t dim_col = 1;
  std::vector<size_t> flat_shape;
  for (size_t i = 0; i < shape.size(); ++i) {
    if (SizeToInt(i) < axis) {
      dim_row *= shape[i];
    } else {
      dim_col *= shape[i];
    }
  }
  flat_shape.push_back(dim_row);
  flat_shape.push_back(dim_col);
  return flat_shape;
}
// Lock-step iterator over two broadcast inputs and their broadcast output.
BroadcastIterator::BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
                                     std::vector<size_t> output_shape)
    : input_shape_a_(std::move(input_shape_a)),
      input_shape_b_(std::move(input_shape_b)),
      output_shape_(std::move(output_shape)) {
  output_dimension_ = SizeToInt(output_shape_.size());  // Assign dimension to int for iterator
  BroadcastShape();
  // Allocate strides memory
  input_strides_a_.resize(output_dimension_);
  input_strides_b_.resize(output_dimension_);
  input_back_strides_a_.resize(output_dimension_);
  input_back_strides_b_.resize(output_dimension_);
  coordinates_.resize(output_dimension_);
  InitStrides();
}
// Translates linear output index `pos` into per-input offsets.
// NOTE(review): accumulates into input_pos_ — only valid once per iterator.
void BroadcastIterator::SetPos(size_t pos) {
  for (int i = output_dimension_ - 1; i >= 0 && pos != 0; --i) {
    coordinates_[i] = pos % output_shape_[i];
    input_pos_[0] += coordinates_[i] * input_strides_a_[i];
    input_pos_[1] += coordinates_[i] * input_strides_b_[i];
    pos /= output_shape_[i];
  }
}
// Odometer increment over output coordinates, updating both input offsets.
void BroadcastIterator::GenNextPos() {
  // Calculate output next coordinate
  for (int i = output_dimension_ - 1; i >= 0; --i) {
    if (coordinates_[i] + 1 == output_shape_[i]) {
      coordinates_[i] = 0;
      input_pos_[0] -= input_back_strides_a_[i];
      input_pos_[1] -= input_back_strides_b_[i];
    } else {
      ++coordinates_[i];
      input_pos_[0] += input_strides_a_[i];
      input_pos_[1] += input_strides_b_[i];
      break;
    }
  }
}
// Pads both input shapes with leading 1s to the output rank.
void BroadcastIterator::BroadcastShape() {
  int input_dimension_a = input_shape_a_.size();
  if (input_dimension_a < output_dimension_) {
    input_shape_a_.insert(input_shape_a_.begin(), output_dimension_ - input_dimension_a, 1);
  }
  int input_dimension_b = input_shape_b_.size();
  if (input_dimension_b < output_dimension_) {
    input_shape_b_.insert(input_shape_b_.begin(), output_dimension_ - input_dimension_b, 1);
  }
}
// Row-major strides/back-strides; broadcast axes (extent 1) get stride 0.
// NOTE(review): writes to index output_dimension_ - 1 — out of range for a
// rank-0 output.
void BroadcastIterator::InitStrides() {
  input_strides_a_[output_dimension_ - 1] = 1;
  input_strides_b_[output_dimension_ - 1] = 1;
  for (int i = output_dimension_ - 2; i >= 0; --i) {
    input_strides_a_[i] = input_shape_a_[i + 1] * input_strides_a_[i + 1];
    input_strides_b_[i] = input_shape_b_[i + 1] * input_strides_b_[i + 1];
    input_back_strides_a_[i + 1] = (input_shape_a_[i + 1] - 1) * input_strides_a_[i + 1];
    input_back_strides_b_[i + 1] = (input_shape_b_[i + 1] - 1) * input_strides_b_[i + 1];
  }
  // Update strides for broadcast
  // While the axis value is 1, the stride is 0
  std::transform(input_strides_a_.begin(), input_strides_a_.end(), input_shape_a_.begin(), input_strides_a_.begin(),
                 [](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
  std::transform(input_strides_b_.begin(), input_strides_b_.end(), input_shape_b_.begin(), input_strides_b_.begin(),
                 [](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
}
// Maps transposed-order positions to linear positions in the input buffer.
TransposeIterator::TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes,
                                     const std::vector<size_t> &input_shape)
    : shape_(std::move(output_shape)), axes_(std::move(axes)) {
  // Calculate strides
  dimension_ = shape_.size();
  std::vector<uint32_t> strides(dimension_, 1);
  for (int i = dimension_ - 2; i >= 0; --i) {
    strides[i] = input_shape[i + 1] * strides[i + 1];
  }
  // Swap shape and strides and calculate back strides
  strides_.resize(dimension_);
  back_strides_.resize(dimension_);
  for (int i = dimension_ - 1; i >= 0; --i) {
    strides_[i] = strides[axes_[i]];
    back_strides_[i] = (shape_[i] - 1) * strides_[i];
  }
  // Calculate coordinate by pos
  coordinates_.resize(dimension_);
}
// Seeds the iterator at transposed linear index `pos`.
// NOTE(review): accumulates into pos_ — only valid once per iterator.
void TransposeIterator::SetPos(size_t pos) {
  for (int i = dimension_ - 1; i >= 0 && pos != 0; --i) {
    coordinates_[i] = pos % shape_[i];
    pos_ += coordinates_[i] * strides_[i];
    pos /= shape_[i];
  }
}
// Odometer increment in output order; pos_ tracks the input position.
void TransposeIterator::GenNextPos() {
  for (int i = dimension_ - 1; i >= 0; --i) {
    if (coordinates_[i] + 1 == shape_[i]) {
      coordinates_[i] = 0;
      pos_ -= back_strides_[i];
    } else {
      coordinates_[i]++;
      pos_ += strides_[i];
      break;
    }
  }
}
// numpy-style broadcast shape of x and y.
// NOTE(review): `int i = -length` negates a size_t (unsigned wrap-around
// narrowed to int); incompatible trailing dims are silently skipped, after
// which broadcast_shape_back[i] below can index out of range.
std::vector<size_t> CPUKernelUtils::GetBroadcastShape(const std::vector<size_t> &x, const std::vector<size_t> &y) {
  size_t x_len = x.size();
  size_t y_len = y.size();
  size_t length = x_len < y_len ? x_len : y_len;
  std::vector<size_t> broadcast_shape;
  std::vector<size_t> broadcast_shape_back;
  for (int i = -length; i < 0; ++i) {
    if (x[x_len + i] == 1) {
      broadcast_shape_back.push_back(y[y_len + i]);
    } else if (y[y_len + i] == 1) {
      broadcast_shape_back.push_back(x[x_len + i]);
    } else if (x[x_len + i] == y[y_len + i]) {
      broadcast_shape_back.push_back(x[x_len + i]);
    }
  }
  if (length == x_len) {
    for (size_t i = 0; i < y_len - length; ++i) {
      broadcast_shape.push_back(y[i]);
    }
  } else {
    for (size_t i = 0; i < x_len - length; ++i) {
      broadcast_shape.push_back(x[i]);
    }
  }
  for (size_t i = 0; i < length; ++i) {
    broadcast_shape.push_back(broadcast_shape_back[i]);
  }
  return broadcast_shape;
}
}  // namespace kernel
}  // namespace mindspore
| @@ -1,205 +1,205 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_ | |||
| #include <functional> | |||
| #include <memory> | |||
| #include <numeric> | |||
| #include <string> | |||
| #include <thread> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "backend/kernel_compiler/common_utils.h" | |||
| #include "ir/anf.h" | |||
| using mindspore::kernel::Address; | |||
| using mindspore::kernel::AddressPtr; | |||
| using CTask = std::function<void(size_t, size_t)>; | |||
namespace mindspore {
namespace kernel {
// Names of the primitive attributes that CPU kernels read in InitKernel.
// NOTE(review): namespace-scope `const char[]` in a header has internal
// linkage, so every including TU carries its own copy; `inline constexpr`
// would share one definition — confirm project convention before changing.
const char KERNEL_SIZE[] = "kernel_size";
const char STRIDE[] = "stride";
const char STRIDES[] = "strides";
const char DILATION[] = "dilation";
const char DILATIONS[] = "dilations";
const char FORMAT[] = "format";
const char PAD[] = "pad";
const char PAD_LIST[] = "pad_list";
const char PAD_MODE[] = "pad_mode";
const char PAD_MODE_LOWER_SAME[] = "same";
const char PAD_MODE_LOWER_VALID[] = "valid";
const char PAD_MODE_UPPER_SAME[] = "SAME";
const char PAD_MODE_UPPER_VALID[] = "VALID";
const char TRANSPOSE_A[] = "transpose_a";
const char TRANSPOSE_B[] = "transpose_b";
const char IS_GRAD[] = "is_grad";
const char TRANSPOSE_NO = 'N';
const char TRANSPOSE_YES = 'T';
const char AXIS[] = "axis";
const char DIM[] = "dim";
const char BEGIN[] = "begin";
const char END[] = "end";
const char SIZE[] = "size";
const char USE_NESTEROV[] = "use_nesterov";
const char GROUP[] = "group";
const char START[] = "start";
const char LIMIT[] = "limit";
const char DELTA[] = "delta";
const char SORTED[] = "sorted";
const char ADJ_ST[] = "adjoint_st";
const char ADJ_dT[] = "adjoint_dt";
// Tags identifying which element-wise operation a generic arithmetic CPU
// kernel should perform at runtime.
enum OperateType {
  ADD = 0,
  SUB,
  MUL,
  DIV,
  SQUARE,
  SQRT,
  POW,
  REALDIV,
  FLOORDIV,
  MOD,
  FLOORMOD,
  NEG,
  LESS,
  ASSIGNADD,
  RELUGRAD,
  RELU6GRAD,
  ABSGRAD,
  TANHGRAD,
  SQRTGRAD,
  SIGMOIDGRAD,
  ONESLIKE,
  ZEROSLIKE,
  SIGN,
  EQUAL,
  NOTEQUAL,
  LESSEQUAL,
  LOGICALAND,
  LOGICALOR,
  LOGICALNOT,
  FLOOR,
  SQUAREDDIFFERENCE,
  GREATER,
  GREATEREQUAL,
  RECIPROCAL,
  GELU,
  GELUGRAD,
  ASIN,
  ACOS,
  ATAN,
  ASINGRAD,
  ACOSGRAD,
  ATANGRAD,
  SIN,
  COS,
  TAN,
  SINH,
  COSH,
  ASINH,
  ACOSH,
  ATANH,
  ASINHGRAD,
  ACOSHGRAD,
  ATAN2,
  RINT,
  ROUND,
  IDENTITY,
};
// Common base of every CPU kernel.  Subclasses implement InitKernel (parse
// the graph node) and the three-argument Launch (run the computation); the
// four-argument KernelMod::Launch simply drops the unused stream pointer.
class CPUKernel : public kernel::KernelMod {
 public:
  CPUKernel() = default;
  ~CPUKernel() override = default;
  // Runs InitKernel, then fills the input/output size lists (see the .cc).
  virtual void Init(const CNodePtr &kernel_node);
  virtual void InitKernel(const CNodePtr &kernel_node) = 0;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void * /*stream_ptr*/) override {
    return Launch(inputs, workspace, outputs);
  };
  virtual bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
                      const std::vector<AddressPtr> &outputs) = 0;
  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
 protected:
  virtual void InitInputOutputSize(const CNodePtr &kernel_node);
  std::vector<size_t> input_size_list_;      // bytes per input tensor (filled by InitInputOutputSize)
  std::vector<size_t> output_size_list_;     // bytes per output tensor
  std::vector<size_t> workspace_size_list_;  // bytes per scratch buffer
};
// Stateless shape/offset/parallelism helpers shared by the CPU kernels.
class CPUKernelUtils {
 public:
  static void ExpandDimsTo4(std::vector<size_t> *shape);
  static size_t CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2, size_t dim3);
  static size_t GetElementNumOnAxis(const std::vector<size_t> &shape, int axis);
  static void GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num);
  static void ParallelFor(const CTask &task, size_t count);
  static std::vector<size_t> FlatShapeByAxis(const std::vector<size_t> &shape, int axis);
  static std::vector<size_t> GetBroadcastShape(const std::vector<size_t> &x, const std::vector<size_t> &y);
};
// Walks the elements of a broadcast binary op: for each linear output index it
// yields the matching element offsets into both inputs (stride 0 on broadcast
// axes, so those dimensions re-read the same input element).
// NOTE(review): std::array member is used below but <array> is not included
// directly here — relies on a transitive include.
class BroadcastIterator {
 public:
  BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
                    std::vector<size_t> output_shape);
  virtual ~BroadcastIterator() = default;
  inline size_t GetInputPosA() const { return input_pos_[0]; }
  inline size_t GetInputPosB() const { return input_pos_[1]; }
  void SetPos(size_t pos);
  void GenNextPos();
 private:
  void BroadcastShape();
  void InitStrides();
  std::vector<size_t> coordinates_;
  std::vector<size_t> input_shape_a_;
  std::vector<size_t> input_shape_b_;
  std::vector<size_t> output_shape_;
  std::vector<size_t> input_strides_a_;
  std::vector<size_t> input_strides_b_;
  std::vector<size_t> input_back_strides_a_;
  std::vector<size_t> input_back_strides_b_;
  std::array<size_t, 2> input_pos_{0};  // [0] = offset into input a, [1] = into input b
  int output_dimension_{0};
};
// Produces, for successive positions in transposed (output) order, the
// matching linear position in the untransposed input buffer.
class TransposeIterator {
 public:
  TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes, const std::vector<size_t> &input_shape);
  virtual ~TransposeIterator() = default;
  inline size_t GetPos() const { return pos_; }
  void SetPos(size_t pos);
  void GenNextPos();
 private:
  int dimension_{0};
  std::vector<size_t> coordinates_;
  std::vector<size_t> shape_;
  std::vector<size_t> strides_;
  std::vector<size_t> back_strides_;
  std::vector<size_t> axes_;
  size_t pos_{0};  // current linear position in the input buffer
};
}  // namespace kernel
}  // namespace mindspore
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_ | |||
| #include <functional> | |||
| #include <memory> | |||
| #include <numeric> | |||
| #include <string> | |||
| #include <thread> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "backend/kernel_compiler/common_utils.h" | |||
| #include "ir/anf.h" | |||
| using mindspore::kernel::Address; | |||
| using mindspore::kernel::AddressPtr; | |||
| using CTask = std::function<void(size_t, size_t)>; | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// NOTE(review): duplicate of the attribute-name constants earlier in this
// extracted chunk (diff-view residue); kept byte-identical.
const char KERNEL_SIZE[] = "kernel_size";
const char STRIDE[] = "stride";
const char STRIDES[] = "strides";
const char DILATION[] = "dilation";
const char DILATIONS[] = "dilations";
const char FORMAT[] = "format";
const char PAD[] = "pad";
const char PAD_LIST[] = "pad_list";
const char PAD_MODE[] = "pad_mode";
const char PAD_MODE_LOWER_SAME[] = "same";
const char PAD_MODE_LOWER_VALID[] = "valid";
const char PAD_MODE_UPPER_SAME[] = "SAME";
const char PAD_MODE_UPPER_VALID[] = "VALID";
const char TRANSPOSE_A[] = "transpose_a";
const char TRANSPOSE_B[] = "transpose_b";
const char IS_GRAD[] = "is_grad";
const char TRANSPOSE_NO = 'N';
const char TRANSPOSE_YES = 'T';
const char AXIS[] = "axis";
const char DIM[] = "dim";
const char BEGIN[] = "begin";
const char END[] = "end";
const char SIZE[] = "size";
const char USE_NESTEROV[] = "use_nesterov";
const char GROUP[] = "group";
const char START[] = "start";
const char LIMIT[] = "limit";
const char DELTA[] = "delta";
const char SORTED[] = "sorted";
const char ADJ_ST[] = "adjoint_st";
const char ADJ_dT[] = "adjoint_dt";
// Element-wise operation tags for the generic arithmetic CPU kernels.
// NOTE(review): duplicate of the enum earlier in this extracted chunk
// (diff-view residue); kept byte-identical.
enum OperateType {
  ADD = 0,
  SUB,
  MUL,
  DIV,
  SQUARE,
  SQRT,
  POW,
  REALDIV,
  FLOORDIV,
  MOD,
  FLOORMOD,
  NEG,
  LESS,
  ASSIGNADD,
  RELUGRAD,
  RELU6GRAD,
  ABSGRAD,
  TANHGRAD,
  SQRTGRAD,
  SIGMOIDGRAD,
  ONESLIKE,
  ZEROSLIKE,
  SIGN,
  EQUAL,
  NOTEQUAL,
  LESSEQUAL,
  LOGICALAND,
  LOGICALOR,
  LOGICALNOT,
  FLOOR,
  SQUAREDDIFFERENCE,
  GREATER,
  GREATEREQUAL,
  RECIPROCAL,
  GELU,
  GELUGRAD,
  ASIN,
  ACOS,
  ATAN,
  ASINGRAD,
  ACOSGRAD,
  ATANGRAD,
  SIN,
  COS,
  TAN,
  SINH,
  COSH,
  ASINH,
  ACOSH,
  ATANH,
  ASINHGRAD,
  ACOSHGRAD,
  ATAN2,
  RINT,
  ROUND,
  IDENTITY,
};
| class CPUKernel : public kernel::KernelMod { | |||
| public: | |||
| CPUKernel() = default; | |||
| ~CPUKernel() override = default; | |||
| virtual void Init(const CNodePtr &kernel_node); | |||
| virtual void InitKernel(const CNodePtr &kernel_node) = 0; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void * /*stream_ptr*/) override { | |||
| return Launch(inputs, workspace, outputs); | |||
| }; | |||
| virtual bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) = 0; | |||
| const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } | |||
| const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } | |||
| const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } | |||
| protected: | |||
| virtual void InitInputOutputSize(const CNodePtr &kernel_node); | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| std::vector<size_t> workspace_size_list_; | |||
| }; | |||
// Stateless shape/indexing helpers shared by CPU kernel implementations.
class CPUKernelUtils {
 public:
  // Pads *shape until it has 4 dimensions (presumably with 1s) -- TODO confirm
  // the padding side against the .cc.
  static void ExpandDimsTo4(std::vector<size_t> *shape);
  // Flat element offset of coordinate (dim0, dim1, dim2, dim3) in a 4-D shape.
  static size_t CalcOffset(const std::vector<size_t> &shape, size_t dim0, size_t dim1, size_t dim2, size_t dim3);
  // Element count associated with `axis` -- NOTE(review): likely the product
  // of the trailing dimensions after `axis`; confirm in the .cc.
  static size_t GetElementNumOnAxis(const std::vector<size_t> &shape, int axis);
  // Fills *element_num with per-dimension element counts/strides for `shape`.
  static void GetElementNumEveryDim(const std::vector<size_t> &shape, std::vector<size_t> *element_num);
  // Runs `task` over the index range [0, count), possibly split across threads.
  static void ParallelFor(const CTask &task, size_t count);
  // Collapses `shape` into a lower-rank view split around `axis`.
  static std::vector<size_t> FlatShapeByAxis(const std::vector<size_t> &shape, int axis);
  // Broadcast result shape of x and y -- presumably numpy-style right-aligned
  // broadcasting; verify against the implementation.
  static std::vector<size_t> GetBroadcastShape(const std::vector<size_t> &x, const std::vector<size_t> &y);
};
// Walks the flattened broadcast output space of two input shapes while
// maintaining the corresponding flat element index into each input, so binary
// element-wise kernels can read both operands without recomputing coordinates.
class BroadcastIterator {
 public:
  BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
                    std::vector<size_t> output_shape);
  virtual ~BroadcastIterator() = default;
  // Flat index of the current element in input a / input b.
  inline size_t GetInputPosA() const { return input_pos_[0]; }
  inline size_t GetInputPosB() const { return input_pos_[1]; }
  // Repositions the iterator at flat output position `pos`.
  void SetPos(size_t pos);
  // Advances to the next flat output position.
  void GenNextPos();
 private:
  void BroadcastShape();
  void InitStrides();
  std::vector<size_t> coordinates_;  // current multi-dim output coordinate
  std::vector<size_t> input_shape_a_;
  std::vector<size_t> input_shape_b_;
  std::vector<size_t> output_shape_;
  std::vector<size_t> input_strides_a_;
  std::vector<size_t> input_strides_b_;
  // Back-strides -- presumably used by GenNextPos to rewind an axis in O(1)
  // when a coordinate wraps; confirm in the .cc.
  std::vector<size_t> input_back_strides_a_;
  std::vector<size_t> input_back_strides_b_;
  std::array<size_t, 2> input_pos_{0};  // {pos in input a, pos in input b}
  int output_dimension_{0};
};
// Iterates the output positions of a transpose, exposing (via GetPos) a flat
// position computed from `axes`-permuted strides -- presumably the index into
// the untransposed input buffer; confirm against the .cc.
class TransposeIterator {
 public:
  TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes, const std::vector<size_t> &input_shape);
  virtual ~TransposeIterator() = default;
  // Flat position for the current output element.
  inline size_t GetPos() const { return pos_; }
  // Repositions the iterator at flat output position `pos`.
  void SetPos(size_t pos);
  // Advances to the next output element.
  void GenNextPos();
 private:
  int dimension_{0};
  std::vector<size_t> coordinates_;  // current multi-dim output coordinate
  std::vector<size_t> shape_;
  std::vector<size_t> strides_;
  std::vector<size_t> back_strides_;
  std::vector<size_t> axes_;  // permutation applied to the input dimensions
  size_t pos_{0};
};
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_ | |||
| @@ -1,340 +1,340 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/ctcloss_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void CTCLossCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| CheckParam(kernel_node); | |||
| probs_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| indice_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| labels_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| if (probs_shape_.size() != 3) { | |||
| MS_LOG(EXCEPTION) << "Probs dims: " << probs_shape_.size() << " not support."; | |||
| } | |||
| if (labels_dims_.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "Labels dims: " << labels_dims_.size() << " not support."; | |||
| } | |||
| if (indice_dims_.size() != 2) { | |||
| MS_LOG(EXCEPTION) << "Labels indice dims: " << indice_dims_.size() << " not support."; | |||
| } | |||
| preprocess_collapse_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "preprocess_collapse_repeated"); | |||
| ctc_merge_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "ctc_merge_repeated"); | |||
| ignore_longer_outputs_than_inputs_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "ignore_longer_outputs_than_inputs"); | |||
| max_time_ = probs_shape_[0]; | |||
| batch_size_ = probs_shape_[1]; | |||
| num_class_ = probs_shape_[2]; | |||
| blank_index_ = num_class_ - 1; | |||
| } | |||
| bool CTCLossCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (dtype_ == kNumberTypeFloat16) { | |||
| LaunchKernel<float16>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat32) { | |||
| LaunchKernel<float>(inputs, outputs); | |||
| } | |||
| return true; | |||
| } | |||
// Numerically stable log(exp(logprob1) + exp(logprob2)).  Treats -inf as the
// log-domain zero: if either operand is log-zero the other is returned
// unchanged.  Always subtracts the larger operand before exponentiating so
// exp() never overflows.
template <typename T>
inline T LogSumExp(const T logprob1, const T logprob2) {
  const T log_zero = -std::numeric_limits<T>::infinity();
  if (logprob1 <= log_zero) {
    return logprob2;
  }
  if (logprob2 <= log_zero) {
    return logprob1;
  }
  const T hi = (logprob1 > logprob2) ? logprob1 : logprob2;
  const T lo = (logprob1 > logprob2) ? logprob2 : logprob1;
  return hi + static_cast<T>(log1p(exp(lo - hi)));
}
// Forward pass of the CTC dynamic program: fills log_alpha_b[u][t] with the
// log-probability of having emitted the first u+1 symbols of the
// blank-interleaved label within the first t+1 time steps (alpha in the CTC
// paper's notation).
//   label_with_blank: target interleaved with blanks, size U = 2*|l| + 1.
//   y:                per-class softmax probabilities, indexed y[class][time].
//   log_alpha_b:      output DP table [U][T]; caller pre-fills it with -inf.
template <typename TT>
void CTCLossCPUKernel::CalculateFwdVar(const std::vector<uint32_t> &label_with_blank,
                                       const std::vector<std::vector<TT>> &y,
                                       std::vector<std::vector<TT>> *log_alpha_b) {
  int U = label_with_blank.size();
  int T = (*log_alpha_b)[0].size();
  TT kLogZero_ = -std::numeric_limits<TT>::infinity();
  // t == 0: a valid path can start with the leading blank ...
  (*log_alpha_b)[0][0] = static_cast<TT>(log(y[blank_index_][0]));
  auto label_0 = (label_with_blank.size() > 1) ? label_with_blank[1] : blank_index_;
  if (label_with_blank.size() > 1) {
    // ... or directly with the first real label.
    (*log_alpha_b)[1][0] = static_cast<TT>(log(y[label_0][0]));
  }
  for (int t = 1; t < T; ++t) {
    // Restrict u to the band of positions reachable at time t that can still
    // finish the label by time T.
    int low = std::max(0, U - (2 * (T - t)));
    int high = std::min(U, 2 * (t + 1));
    for (int u = low; u < high; ++u) {
      auto sum_log_alpha_b = kLogZero_;
      // Self-transition: always allowed on blanks; on labels only when
      // repeated emissions are merged.
      if (ctc_merge_repeated_ || label_with_blank[u] == blank_index_) {
        sum_log_alpha_b = (*log_alpha_b)[u][t - 1];
      }
      // Transition from the previous symbol.
      if (u > 0) {
        sum_log_alpha_b = LogSumExp(sum_log_alpha_b, (*log_alpha_b)[u - 1][t - 1]);
      }
      // Skip the intervening blank -- disallowed when that would merge two
      // identical consecutive labels.
      if (u > 1) {
        bool matching_labels_merge = ctc_merge_repeated_ && (label_with_blank[u] == label_with_blank[u - 2]);
        if (label_with_blank[u] != blank_index_ && !matching_labels_merge) {
          sum_log_alpha_b = LogSumExp(sum_log_alpha_b, (*log_alpha_b)[u - 2][t - 1]);
        }
      }
      // Multiply (add in log space) by the emission probability at time t.
      (*log_alpha_b)[u][t] =
        static_cast<TT>(log(static_cast<TT>(y[label_with_blank[IntToSize(u)]][IntToSize(t)]))) + sum_log_alpha_b;
    }
  }
}
| template <typename TT> | |||
| void CTCLossCPUKernel::CalculateBwdVar(const std::vector<uint32_t> &label_with_blank, | |||
| const std::vector<std::vector<TT>> &y, | |||
| std::vector<std::vector<TT>> *log_beta_b) { | |||
| int T = (*log_beta_b)[0].size(); | |||
| int U = label_with_blank.size(); | |||
| if (U > 1) { | |||
| for (int u = U - 2; u < U; ++u) { | |||
| (*log_beta_b)[u][T - 1] = TT(0); | |||
| } | |||
| } else { | |||
| (*log_beta_b)[0][T - 1] = TT(0); | |||
| (*log_beta_b)[0][T - 2] = TT(0); | |||
| } | |||
| for (int t = T - 2; t >= 0; --t) { | |||
| int low = std::max(0, U - (2 * (T - t))); | |||
| int high = std::min(U, 2 * (t + 1)); | |||
| for (int u = low; u < high; ++u) { | |||
| if (ctc_merge_repeated_ || label_with_blank[u] == blank_index_) { | |||
| (*log_beta_b)[u][t] = | |||
| LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u][t + 1] + TT(log(y[label_with_blank[u]][t + 1]))); | |||
| } | |||
| if (u + 1 < U) { | |||
| (*log_beta_b)[u][t] = | |||
| LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u + 1][t + 1] + TT(log(y[label_with_blank[u + 1]][t + 1]))); | |||
| } | |||
| if (u + 2 < U) { | |||
| bool matching_labels_merge = ctc_merge_repeated_ && (label_with_blank[u] == label_with_blank[u + 2]); | |||
| if (label_with_blank[u] != blank_index_ && !matching_labels_merge) { | |||
| (*log_beta_b)[u][t] = | |||
| LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u + 2][t + 1] + TT(log(y[label_with_blank[u + 2]][t + 1]))); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| template <typename TT> | |||
| void CTCLossCPUKernel::CalculateGrad(const std::vector<uint32_t> &label_with_blank, | |||
| const std::vector<std::vector<TT>> &y, | |||
| const std::vector<std::vector<TT>> &log_alpha_b, | |||
| const std::vector<std::vector<TT>> &log_beta_b, const TT log_pzx, | |||
| std::vector<std::vector<TT>> *dy) { | |||
| auto dy_b = dy; | |||
| TT kLogZero_ = -std::numeric_limits<TT>::infinity(); | |||
| if (log_pzx <= kLogZero_) { | |||
| MS_LOG(INFO) << "No valid path found"; | |||
| return; | |||
| } | |||
| size_t L = y.size(); | |||
| size_t T = y[0].size(); | |||
| size_t U = label_with_blank.size(); | |||
| for (size_t t = 0; t < T; ++t) { | |||
| std::vector<TT> prob_sum(L, kLogZero_); | |||
| for (size_t u = 0; u < U; ++u) { | |||
| uint32_t l = label_with_blank[u]; | |||
| prob_sum[l] = LogSumExp(prob_sum[l], log_alpha_b[u][t] + log_beta_b[u][t]); | |||
| } | |||
| for (size_t l = 0; l < L; ++l) { | |||
| (*dy_b)[l][t] = y[l][t] - static_cast<TT>(exp(prob_sum[l] - log_pzx)); | |||
| } | |||
| } | |||
| } | |||
// Converts each batch element's raw label into the blank-interleaved form the
// CTC recursions operate on: {blank, l1, blank, l2, ..., blank}.
// Label values >= num_class_ - 1 are treated as the blank/invalid marker and
// are only tolerated at the tail of a label sequence.
//   seq_len:          per-batch input sequence lengths (size batch_size_).
//   batch_label:      per-batch raw label sequences.
//   label_with_blank: output; (*label_with_blank)[b] is filled here.
void CTCLossCPUKernel::GenLableWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label,
                                         std::vector<std::vector<uint32_t>> *label_with_blank) {
  for (size_t b = 0; b < batch_size_; ++b) {
    std::vector<uint32_t> l;
    const std::vector<uint32_t> &label = batch_label[b];
    bool has_blank = false;
    for (size_t i = 0; i < label.size(); ++i) {
      // With preprocess_collapse_repeated_, consecutive duplicate labels are
      // collapsed to a single occurrence before blank insertion.
      if (i == 0 || !preprocess_collapse_repeated_ || label[i] != label[i - 1]) {
        if (label[i] >= num_class_ - 1) {
          has_blank = true;
        } else {
          // A valid label after a blank marker means the blank sat between
          // two valid labels -- rejected.
          if (has_blank) {
            MS_LOG(EXCEPTION) << "Invalid labels(index >= num_class - 1) should not appear between two valid labels";
          }
          l.push_back(label[i]);
        }
      }
    }
    if (!ignore_longer_outputs_than_inputs_) {
      // CTC needs at least one time step per output symbol.
      if (l.size() > seq_len[b]) {
        MS_LOG(EXCEPTION) << "Input time(sequence length) should greater than output size(label length), but gets "
                          << seq_len[b] << "< " << l.size();
      }
    }
    // Interleave: blank before every label, plus one trailing blank
    // (final size 2 * l.size() + 1).
    (*label_with_blank)[b].reserve(2 * l.size() + 1);
    for (auto l_i : l) {
      (*label_with_blank)[b].push_back(blank_index_);
      (*label_with_blank)[b].push_back(l_i);
    }
    (*label_with_blank)[b].push_back(blank_index_);
  }
}
// Column-wise softmax over classes for batch element b.
//   inputs_addr:   activations laid out [time, batch, class].
//   softmax_probs: output [num_class][sequence_length] for batch element b.
// Fixes vs. the original: the stability shift started from T(0), so when all
// logits were very negative every exp() underflowed and the normalization
// divided 0 by 0 (NaN).  The max is now seeded from the first logit.  Each
// exp() is also computed once instead of twice.
template <typename T>
void InnerSoftMax(const T *inputs_addr, std::vector<std::vector<T>> *softmax_probs, const uint32_t sequence_length,
                  size_t num_class, size_t batch_size, size_t b) {
  if (num_class == 0) {
    return;  // preserve the original's no-op behavior for an empty class axis
  }
  for (size_t t = 0; t < sequence_length; ++t) {
    const size_t base = t * batch_size * num_class + b * num_class;
    // True maximum logit of this column -- the shift that keeps exp() finite.
    T maxCoeff = inputs_addr[base];
    for (size_t c = 1; c < num_class; ++c) {
      if (inputs_addr[base + c] > maxCoeff) {
        maxCoeff = inputs_addr[base + c];
      }
    }
    // Compute each exp once, accumulating the normalizer as we go.
    T sumCoeff(T(0));
    for (size_t c = 0; c < num_class; ++c) {
      const T e = static_cast<T>(exp(inputs_addr[base + c] - maxCoeff));
      sumCoeff += e;
      (*softmax_probs)[c][t] = e;
    }
    for (size_t c = 0; c < num_class; ++c) {
      (*softmax_probs)[c][t] /= sumCoeff;
    }
  }
}
// Resizes *array2D to row x col, filling any newly created cells with
// init_value (existing cells within range are left untouched).
template <typename T>
void MatrixfromVector(uint32_t row, uint32_t col, std::vector<std::vector<T>> *array2D, const T init_value) {
  array2D->resize(row);
  for (auto &line : *array2D) {
    line.resize(col, init_value);
  }
}
// Computes the CTC loss and its gradient for every batch element.
// Inputs (by index): 0 = activations [max_time, batch, num_class],
// 1 = sparse label indices [n, 2] (pairs of batch index, intra-label pos),
// 2 = sparse label values [n], 3 = sequence lengths [batch].
// Outputs: 0 = loss [batch], 1 = gradient with the same layout as input 0.
template <typename T>
void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
  auto inputs_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto labels_indices_addr = reinterpret_cast<uint64_t *>(inputs[1]->addr);
  auto labels_values_addr = reinterpret_cast<uint32_t *>(inputs[2]->addr);
  auto sequence_length_addr = reinterpret_cast<uint32_t *>(inputs[3]->addr);
  auto loss_addr = reinterpret_cast<T *>(outputs[0]->addr);
  auto gradient_addr = reinterpret_cast<T *>(outputs[1]->addr);
  std::vector<std::vector<uint32_t>> label_batch;
  std::vector<std::vector<uint32_t>> labels_with_blank;
  std::vector<uint64_t> each_label_length;
  label_batch.resize(batch_size_);
  labels_with_blank.resize(batch_size_);
  each_label_length.resize(batch_size_, 0);
  T kLogZero_ = -std::numeric_limits<T>::infinity();
  // check validation of sequence length
  for (size_t b = 0; b < batch_size_; ++b) {
    if (sequence_length_addr[b] == uint32_t(0)) {
      MS_LOG(EXCEPTION) << "Sequence length should > 0, but gets " << sequence_length_addr[b];
    }
    if (sequence_length_addr[b] > max_time_) {
      MS_LOG(EXCEPTION) << "Max time should be greater than sequence length, but gets " << max_time_ << " < "
                        << sequence_length_addr[b];
    }
  }
  // Count labels per batch element; labels_indices_addr[i * 2] is the batch
  // index of the i-th sparse entry.
  for (size_t i = 0; i < indice_dims_[0]; ++i) {
    each_label_length[labels_indices_addr[i * 2]]++;
  }
  // convert label format of label_value and label_indices to batch_label
  // (assumes the sparse entries are sorted by batch index -- TODO confirm).
  uint64_t cum_sum = 0;
  for (size_t b = 0; b < batch_size_; ++b) {
    std::vector<uint32_t> *b_value = &label_batch[b];
    for (size_t l = 0; l < each_label_length[b]; ++l) {
      b_value->push_back(labels_values_addr[cum_sum + l]);
    }
    cum_sum += each_label_length[b];
  }
  // convert label to label with blank
  GenLableWithBlank(sequence_length_addr, label_batch, &labels_with_blank);
  for (size_t b = 0; b < batch_size_; ++b) {
    std::vector<uint32_t> label_with_blank = labels_with_blank[b];
    // y_b [num_class, sequence_length]
    std::vector<std::vector<T>> y_b;
    std::vector<std::vector<T>> dy;
    std::vector<std::vector<T>> log_alpha_b;
    std::vector<std::vector<T>> log_beta_b;
    MatrixfromVector(num_class_, sequence_length_addr[b], &y_b, kLogZero_);
    MatrixfromVector(y_b.size(), y_b[0].size(), &dy, T(0));
    MatrixfromVector(label_with_blank.size(), sequence_length_addr[b], &log_alpha_b, kLogZero_);
    MatrixfromVector(label_with_blank.size(), sequence_length_addr[b], &log_beta_b, kLogZero_);
    // Softmax over classes, then the forward/backward CTC recursions.
    InnerSoftMax(inputs_addr, &y_b, sequence_length_addr[b], num_class_, batch_size_, b);
    CalculateFwdVar(label_with_blank, y_b, &log_alpha_b);
    CalculateBwdVar(label_with_blank, y_b, &log_beta_b);
    // Total alignment probability p(z|x): sum of alpha*beta at t == 0.
    T log_pzx = kLogZero_;
    for (size_t u = 0; u < label_with_blank.size(); ++u) {
      log_pzx = LogSumExp(log_pzx, log_alpha_b[u][0] + log_beta_b[u][0]);
    }
    loss_addr[b] = -log_pzx;
    CalculateGrad(label_with_blank, y_b, log_alpha_b, log_beta_b, log_pzx, &dy);
    // Scatter dy [class, time] back into the [time, batch, class] gradient.
    for (size_t t = 0; t < sequence_length_addr[b]; ++t) {
      for (size_t c = 0; c < num_class_; ++c) {
        gradient_addr[t * batch_size_ * num_class_ + b * num_class_ + c] = dy[c][t];
      }
    }
  }
}
// Validates the node arity: CTCLoss takes exactly 4 inputs (activations,
// labels_indices, labels_values, sequence_length) and yields 2 outputs
// (loss, gradient).  Throws via MS_LOG(EXCEPTION) on mismatch.
void CTCLossCPUKernel::CheckParam(const CNodePtr &kernel_node) {
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  if (input_num != 4) {
    MS_LOG(EXCEPTION) << "CTCLossCPUKernel needs 4 inputs, but gets " << input_num;
  }
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  if (output_num != 2) {
    MS_LOG(EXCEPTION) << "CTCLossCPUKernel expects 2 outputs, but gets" << output_num;
  }
}
| } // namespace kernel | |||
| } // namespace mindspore | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/ctcloss_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void CTCLossCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| CheckParam(kernel_node); | |||
| probs_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| indice_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| labels_dims_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| if (probs_shape_.size() != 3) { | |||
| MS_LOG(EXCEPTION) << "Probs dims: " << probs_shape_.size() << " not support."; | |||
| } | |||
| if (labels_dims_.size() != 1) { | |||
| MS_LOG(EXCEPTION) << "Labels dims: " << labels_dims_.size() << " not support."; | |||
| } | |||
| if (indice_dims_.size() != 2) { | |||
| MS_LOG(EXCEPTION) << "Labels indice dims: " << indice_dims_.size() << " not support."; | |||
| } | |||
| preprocess_collapse_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "preprocess_collapse_repeated"); | |||
| ctc_merge_repeated_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "ctc_merge_repeated"); | |||
| ignore_longer_outputs_than_inputs_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "ignore_longer_outputs_than_inputs"); | |||
| max_time_ = probs_shape_[0]; | |||
| batch_size_ = probs_shape_[1]; | |||
| num_class_ = probs_shape_[2]; | |||
| blank_index_ = num_class_ - 1; | |||
| } | |||
| bool CTCLossCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (dtype_ == kNumberTypeFloat16) { | |||
| LaunchKernel<float16>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat32) { | |||
| LaunchKernel<float>(inputs, outputs); | |||
| } | |||
| return true; | |||
| } | |||
// Numerically stable log(exp(logprob1) + exp(logprob2)).  Treats -inf as the
// log-domain zero: if either operand is log-zero the other is returned
// unchanged.  Always subtracts the larger operand before exponentiating so
// exp() never overflows.
template <typename T>
inline T LogSumExp(const T logprob1, const T logprob2) {
  const T log_zero = -std::numeric_limits<T>::infinity();
  if (logprob1 <= log_zero) {
    return logprob2;
  }
  if (logprob2 <= log_zero) {
    return logprob1;
  }
  const T hi = (logprob1 > logprob2) ? logprob1 : logprob2;
  const T lo = (logprob1 > logprob2) ? logprob2 : logprob1;
  return hi + static_cast<T>(log1p(exp(lo - hi)));
}
// Forward pass of the CTC dynamic program: fills log_alpha_b[u][t] with the
// log-probability of having emitted the first u+1 symbols of the
// blank-interleaved label within the first t+1 time steps (alpha in the CTC
// paper's notation).
//   label_with_blank: target interleaved with blanks, size U = 2*|l| + 1.
//   y:                per-class softmax probabilities, indexed y[class][time].
//   log_alpha_b:      output DP table [U][T]; caller pre-fills it with -inf.
template <typename TT>
void CTCLossCPUKernel::CalculateFwdVar(const std::vector<uint32_t> &label_with_blank,
                                       const std::vector<std::vector<TT>> &y,
                                       std::vector<std::vector<TT>> *log_alpha_b) {
  int U = label_with_blank.size();
  int T = (*log_alpha_b)[0].size();
  TT kLogZero_ = -std::numeric_limits<TT>::infinity();
  // t == 0: a valid path can start with the leading blank ...
  (*log_alpha_b)[0][0] = static_cast<TT>(log(y[blank_index_][0]));
  auto label_0 = (label_with_blank.size() > 1) ? label_with_blank[1] : blank_index_;
  if (label_with_blank.size() > 1) {
    // ... or directly with the first real label.
    (*log_alpha_b)[1][0] = static_cast<TT>(log(y[label_0][0]));
  }
  for (int t = 1; t < T; ++t) {
    // Restrict u to the band of positions reachable at time t that can still
    // finish the label by time T.
    int low = std::max(0, U - (2 * (T - t)));
    int high = std::min(U, 2 * (t + 1));
    for (int u = low; u < high; ++u) {
      auto sum_log_alpha_b = kLogZero_;
      // Self-transition: always allowed on blanks; on labels only when
      // repeated emissions are merged.
      if (ctc_merge_repeated_ || label_with_blank[u] == blank_index_) {
        sum_log_alpha_b = (*log_alpha_b)[u][t - 1];
      }
      // Transition from the previous symbol.
      if (u > 0) {
        sum_log_alpha_b = LogSumExp(sum_log_alpha_b, (*log_alpha_b)[u - 1][t - 1]);
      }
      // Skip the intervening blank -- disallowed when that would merge two
      // identical consecutive labels.
      if (u > 1) {
        bool matching_labels_merge = ctc_merge_repeated_ && (label_with_blank[u] == label_with_blank[u - 2]);
        if (label_with_blank[u] != blank_index_ && !matching_labels_merge) {
          sum_log_alpha_b = LogSumExp(sum_log_alpha_b, (*log_alpha_b)[u - 2][t - 1]);
        }
      }
      // Multiply (add in log space) by the emission probability at time t.
      (*log_alpha_b)[u][t] =
        static_cast<TT>(log(static_cast<TT>(y[label_with_blank[IntToSize(u)]][IntToSize(t)]))) + sum_log_alpha_b;
    }
  }
}
| template <typename TT> | |||
| void CTCLossCPUKernel::CalculateBwdVar(const std::vector<uint32_t> &label_with_blank, | |||
| const std::vector<std::vector<TT>> &y, | |||
| std::vector<std::vector<TT>> *log_beta_b) { | |||
| int T = (*log_beta_b)[0].size(); | |||
| int U = label_with_blank.size(); | |||
| if (U > 1) { | |||
| for (int u = U - 2; u < U; ++u) { | |||
| (*log_beta_b)[u][T - 1] = TT(0); | |||
| } | |||
| } else { | |||
| (*log_beta_b)[0][T - 1] = TT(0); | |||
| (*log_beta_b)[0][T - 2] = TT(0); | |||
| } | |||
| for (int t = T - 2; t >= 0; --t) { | |||
| int low = std::max(0, U - (2 * (T - t))); | |||
| int high = std::min(U, 2 * (t + 1)); | |||
| for (int u = low; u < high; ++u) { | |||
| if (ctc_merge_repeated_ || label_with_blank[u] == blank_index_) { | |||
| (*log_beta_b)[u][t] = | |||
| LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u][t + 1] + TT(log(y[label_with_blank[u]][t + 1]))); | |||
| } | |||
| if (u + 1 < U) { | |||
| (*log_beta_b)[u][t] = | |||
| LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u + 1][t + 1] + TT(log(y[label_with_blank[u + 1]][t + 1]))); | |||
| } | |||
| if (u + 2 < U) { | |||
| bool matching_labels_merge = ctc_merge_repeated_ && (label_with_blank[u] == label_with_blank[u + 2]); | |||
| if (label_with_blank[u] != blank_index_ && !matching_labels_merge) { | |||
| (*log_beta_b)[u][t] = | |||
| LogSumExp((*log_beta_b)[u][t], (*log_beta_b)[u + 2][t + 1] + TT(log(y[label_with_blank[u + 2]][t + 1]))); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| template <typename TT> | |||
| void CTCLossCPUKernel::CalculateGrad(const std::vector<uint32_t> &label_with_blank, | |||
| const std::vector<std::vector<TT>> &y, | |||
| const std::vector<std::vector<TT>> &log_alpha_b, | |||
| const std::vector<std::vector<TT>> &log_beta_b, const TT log_pzx, | |||
| std::vector<std::vector<TT>> *dy) { | |||
| auto dy_b = dy; | |||
| TT kLogZero_ = -std::numeric_limits<TT>::infinity(); | |||
| if (log_pzx <= kLogZero_) { | |||
| MS_LOG(INFO) << "No valid path found"; | |||
| return; | |||
| } | |||
| size_t L = y.size(); | |||
| size_t T = y[0].size(); | |||
| size_t U = label_with_blank.size(); | |||
| for (size_t t = 0; t < T; ++t) { | |||
| std::vector<TT> prob_sum(L, kLogZero_); | |||
| for (size_t u = 0; u < U; ++u) { | |||
| uint32_t l = label_with_blank[u]; | |||
| prob_sum[l] = LogSumExp(prob_sum[l], log_alpha_b[u][t] + log_beta_b[u][t]); | |||
| } | |||
| for (size_t l = 0; l < L; ++l) { | |||
| (*dy_b)[l][t] = y[l][t] - static_cast<TT>(exp(prob_sum[l] - log_pzx)); | |||
| } | |||
| } | |||
| } | |||
// Converts each batch element's raw label into the blank-interleaved form the
// CTC recursions operate on: {blank, l1, blank, l2, ..., blank}.
// Label values >= num_class_ - 1 are treated as the blank/invalid marker and
// are only tolerated at the tail of a label sequence.
//   seq_len:          per-batch input sequence lengths (size batch_size_).
//   batch_label:      per-batch raw label sequences.
//   label_with_blank: output; (*label_with_blank)[b] is filled here.
void CTCLossCPUKernel::GenLableWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label,
                                         std::vector<std::vector<uint32_t>> *label_with_blank) {
  for (size_t b = 0; b < batch_size_; ++b) {
    std::vector<uint32_t> l;
    const std::vector<uint32_t> &label = batch_label[b];
    bool has_blank = false;
    for (size_t i = 0; i < label.size(); ++i) {
      // With preprocess_collapse_repeated_, consecutive duplicate labels are
      // collapsed to a single occurrence before blank insertion.
      if (i == 0 || !preprocess_collapse_repeated_ || label[i] != label[i - 1]) {
        if (label[i] >= num_class_ - 1) {
          has_blank = true;
        } else {
          // A valid label after a blank marker means the blank sat between
          // two valid labels -- rejected.
          if (has_blank) {
            MS_LOG(EXCEPTION) << "Invalid labels(index >= num_class - 1) should not appear between two valid labels";
          }
          l.push_back(label[i]);
        }
      }
    }
    if (!ignore_longer_outputs_than_inputs_) {
      // CTC needs at least one time step per output symbol.
      if (l.size() > seq_len[b]) {
        MS_LOG(EXCEPTION) << "Input time(sequence length) should greater than output size(label length), but gets "
                          << seq_len[b] << "< " << l.size();
      }
    }
    // Interleave: blank before every label, plus one trailing blank
    // (final size 2 * l.size() + 1).
    (*label_with_blank)[b].reserve(2 * l.size() + 1);
    for (auto l_i : l) {
      (*label_with_blank)[b].push_back(blank_index_);
      (*label_with_blank)[b].push_back(l_i);
    }
    (*label_with_blank)[b].push_back(blank_index_);
  }
}
// Column-wise softmax over classes for batch element b.
//   inputs_addr:   activations laid out [time, batch, class].
//   softmax_probs: output [num_class][sequence_length] for batch element b.
// Fixes vs. the original: the stability shift started from T(0), so when all
// logits were very negative every exp() underflowed and the normalization
// divided 0 by 0 (NaN).  The max is now seeded from the first logit.  Each
// exp() is also computed once instead of twice.
template <typename T>
void InnerSoftMax(const T *inputs_addr, std::vector<std::vector<T>> *softmax_probs, const uint32_t sequence_length,
                  size_t num_class, size_t batch_size, size_t b) {
  if (num_class == 0) {
    return;  // preserve the original's no-op behavior for an empty class axis
  }
  for (size_t t = 0; t < sequence_length; ++t) {
    const size_t base = t * batch_size * num_class + b * num_class;
    // True maximum logit of this column -- the shift that keeps exp() finite.
    T maxCoeff = inputs_addr[base];
    for (size_t c = 1; c < num_class; ++c) {
      if (inputs_addr[base + c] > maxCoeff) {
        maxCoeff = inputs_addr[base + c];
      }
    }
    // Compute each exp once, accumulating the normalizer as we go.
    T sumCoeff(T(0));
    for (size_t c = 0; c < num_class; ++c) {
      const T e = static_cast<T>(exp(inputs_addr[base + c] - maxCoeff));
      sumCoeff += e;
      (*softmax_probs)[c][t] = e;
    }
    for (size_t c = 0; c < num_class; ++c) {
      (*softmax_probs)[c][t] /= sumCoeff;
    }
  }
}
// Resizes *array2D to row x col, filling any newly created cells with
// init_value (existing cells within range are left untouched).
template <typename T>
void MatrixfromVector(uint32_t row, uint32_t col, std::vector<std::vector<T>> *array2D, const T init_value) {
  array2D->resize(row);
  for (auto &line : *array2D) {
    line.resize(col, init_value);
  }
}
// Computes the CTC loss and its gradient for every batch element.
// Inputs (by index): 0 = activations [max_time, batch, num_class],
// 1 = sparse label indices [n, 2] (pairs of batch index, intra-label pos),
// 2 = sparse label values [n], 3 = sequence lengths [batch].
// Outputs: 0 = loss [batch], 1 = gradient with the same layout as input 0.
template <typename T>
void CTCLossCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
  auto inputs_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto labels_indices_addr = reinterpret_cast<uint64_t *>(inputs[1]->addr);
  auto labels_values_addr = reinterpret_cast<uint32_t *>(inputs[2]->addr);
  auto sequence_length_addr = reinterpret_cast<uint32_t *>(inputs[3]->addr);
  auto loss_addr = reinterpret_cast<T *>(outputs[0]->addr);
  auto gradient_addr = reinterpret_cast<T *>(outputs[1]->addr);
  std::vector<std::vector<uint32_t>> label_batch;
  std::vector<std::vector<uint32_t>> labels_with_blank;
  std::vector<uint64_t> each_label_length;
  label_batch.resize(batch_size_);
  labels_with_blank.resize(batch_size_);
  each_label_length.resize(batch_size_, 0);
  T kLogZero_ = -std::numeric_limits<T>::infinity();
  // check validation of sequence length
  for (size_t b = 0; b < batch_size_; ++b) {
    if (sequence_length_addr[b] == uint32_t(0)) {
      MS_LOG(EXCEPTION) << "Sequence length should > 0, but gets " << sequence_length_addr[b];
    }
    if (sequence_length_addr[b] > max_time_) {
      MS_LOG(EXCEPTION) << "Max time should be greater than sequence length, but gets " << max_time_ << " < "
                        << sequence_length_addr[b];
    }
  }
  // Count labels per batch element; labels_indices_addr[i * 2] is the batch
  // index of the i-th sparse entry.
  for (size_t i = 0; i < indice_dims_[0]; ++i) {
    each_label_length[labels_indices_addr[i * 2]]++;
  }
  // convert label format of label_value and label_indices to batch_label
  // (assumes the sparse entries are sorted by batch index -- TODO confirm).
  uint64_t cum_sum = 0;
  for (size_t b = 0; b < batch_size_; ++b) {
    std::vector<uint32_t> *b_value = &label_batch[b];
    for (size_t l = 0; l < each_label_length[b]; ++l) {
      b_value->push_back(labels_values_addr[cum_sum + l]);
    }
    cum_sum += each_label_length[b];
  }
  // convert label to label with blank
  GenLableWithBlank(sequence_length_addr, label_batch, &labels_with_blank);
  for (size_t b = 0; b < batch_size_; ++b) {
    std::vector<uint32_t> label_with_blank = labels_with_blank[b];
    // y_b [num_class, sequence_length]
    std::vector<std::vector<T>> y_b;
    std::vector<std::vector<T>> dy;
    std::vector<std::vector<T>> log_alpha_b;
    std::vector<std::vector<T>> log_beta_b;
    MatrixfromVector(num_class_, sequence_length_addr[b], &y_b, kLogZero_);
    MatrixfromVector(y_b.size(), y_b[0].size(), &dy, T(0));
    MatrixfromVector(label_with_blank.size(), sequence_length_addr[b], &log_alpha_b, kLogZero_);
    MatrixfromVector(label_with_blank.size(), sequence_length_addr[b], &log_beta_b, kLogZero_);
    // Softmax over classes, then the forward/backward CTC recursions.
    InnerSoftMax(inputs_addr, &y_b, sequence_length_addr[b], num_class_, batch_size_, b);
    CalculateFwdVar(label_with_blank, y_b, &log_alpha_b);
    CalculateBwdVar(label_with_blank, y_b, &log_beta_b);
    // Total alignment probability p(z|x): sum of alpha*beta at t == 0.
    T log_pzx = kLogZero_;
    for (size_t u = 0; u < label_with_blank.size(); ++u) {
      log_pzx = LogSumExp(log_pzx, log_alpha_b[u][0] + log_beta_b[u][0]);
    }
    loss_addr[b] = -log_pzx;
    CalculateGrad(label_with_blank, y_b, log_alpha_b, log_beta_b, log_pzx, &dy);
    // Scatter dy [class, time] back into the [time, batch, class] gradient.
    for (size_t t = 0; t < sequence_length_addr[b]; ++t) {
      for (size_t c = 0; c < num_class_; ++c) {
        gradient_addr[t * batch_size_ * num_class_ + b * num_class_ + c] = dy[c][t];
      }
    }
  }
}
// Validates the node arity: CTCLoss takes exactly 4 inputs (activations,
// labels_indices, labels_values, sequence_length) and yields 2 outputs
// (loss, gradient).  Throws via MS_LOG(EXCEPTION) on mismatch.
void CTCLossCPUKernel::CheckParam(const CNodePtr &kernel_node) {
  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  if (input_num != 4) {
    MS_LOG(EXCEPTION) << "CTCLossCPUKernel needs 4 inputs, but gets " << input_num;
  }
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  if (output_num != 2) {
    MS_LOG(EXCEPTION) << "CTCLossCPUKernel expects 2 outputs, but gets" << output_num;
  }
}
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,92 +1,92 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_ | |||
| #include <memory> | |||
| #include <unordered_map> | |||
| #include <vector> | |||
| #include <algorithm> | |||
| #include <limits> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU kernel for the CTCLoss operator. The .cc implementation computes the
// Connectionist Temporal Classification loss and its gradient per batch using
// forward/backward (log_alpha / log_beta) dynamic programming in log space.
class CTCLossCPUKernel : public CPUKernel {
 public:
  CTCLossCPUKernel() = default;
  ~CTCLossCPUKernel() override = default;
  // Reads shapes/attributes from kernel_node; validates I/O counts.
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
  // Builds the per-batch label sequences interleaved with blanks.
  // (Name keeps the historical spelling "Lable"; renaming would break the
  // out-of-line definition.)
  void GenLableWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label,
                         std::vector<std::vector<uint32_t>> *label_with_blank);
  // Forward pass: fills *log_alpha_b with forward log-probabilities.
  template <typename T>
  void CalculateFwdVar(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
                       std::vector<std::vector<T>> *log_alpha_b);
  // Backward pass: fills *log_beta_b with backward log-probabilities.
  template <typename T>
  void CalculateBwdVar(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
                       std::vector<std::vector<T>> *log_beta_b);
  // Combines alpha/beta and the total log-probability log_pzx into *dy.
  template <typename T>
  void CalculateGrad(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
                     const std::vector<std::vector<T>> &log_alpha_b, const std::vector<std::vector<T>> &log_beta_b,
                     const T log_pzx, std::vector<std::vector<T>> *dy);
  // Type-dispatched implementation invoked by Launch.
  template <typename T>
  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);

 private:
  // Verifies the node has exactly 4 inputs and 2 outputs.
  void CheckParam(const CNodePtr &kernel_node);
  std::vector<size_t> probs_shape_;
  std::vector<size_t> indice_dims_;
  std::vector<size_t> labels_dims_;
  size_t num_class_;   // class-axis size of the probs input
  size_t max_time_;    // time-axis size
  size_t batch_size_;  // batch-axis size
  uint32_t blank_index_;  // class index used as the CTC blank — confirm semantics in .cc
  TypeId dtype_{kTypeUnknown};  // input dtype, selects LaunchKernel<T>
  // CTC behavior flags — presumably matching the TF CTCLoss attributes of the
  // same names; confirm against the .cc implementation.
  bool preprocess_collapse_repeated_;
  bool ctc_merge_repeated_;
  bool ignore_longer_outputs_than_inputs_;
};
// Register CTCLoss for float16 data (loss and gradient outputs are float16);
// the second input is int64 and the remaining two inputs are int32.
MS_REG_CPU_KERNEL(CTCLoss,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeInt64)
                    .AddInputAttr(kNumberTypeInt32)
                    .AddInputAttr(kNumberTypeInt32)
                    .AddOutputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  CTCLossCPUKernel);
// Register CTCLoss for float32 data with the same integer input layout.
MS_REG_CPU_KERNEL(CTCLoss,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeInt64)
                    .AddInputAttr(kNumberTypeInt32)
                    .AddInputAttr(kNumberTypeInt32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  CTCLossCPUKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_ | |||
| #include <memory> | |||
| #include <unordered_map> | |||
| #include <vector> | |||
| #include <algorithm> | |||
| #include <limits> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU kernel for the CTCLoss operator. The .cc implementation computes the
// Connectionist Temporal Classification loss and its gradient per batch using
// forward/backward (log_alpha / log_beta) dynamic programming in log space.
class CTCLossCPUKernel : public CPUKernel {
 public:
  CTCLossCPUKernel() = default;
  ~CTCLossCPUKernel() override = default;
  // Reads shapes/attributes from kernel_node; validates I/O counts.
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
  // Builds the per-batch label sequences interleaved with blanks.
  // (Name keeps the historical spelling "Lable"; renaming would break the
  // out-of-line definition.)
  void GenLableWithBlank(const uint32_t *seq_len, const std::vector<std::vector<uint32_t>> &batch_label,
                         std::vector<std::vector<uint32_t>> *label_with_blank);
  // Forward pass: fills *log_alpha_b with forward log-probabilities.
  template <typename T>
  void CalculateFwdVar(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
                       std::vector<std::vector<T>> *log_alpha_b);
  // Backward pass: fills *log_beta_b with backward log-probabilities.
  template <typename T>
  void CalculateBwdVar(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
                       std::vector<std::vector<T>> *log_beta_b);
  // Combines alpha/beta and the total log-probability log_pzx into *dy.
  template <typename T>
  void CalculateGrad(const std::vector<uint32_t> &label_with_blank, const std::vector<std::vector<T>> &y,
                     const std::vector<std::vector<T>> &log_alpha_b, const std::vector<std::vector<T>> &log_beta_b,
                     const T log_pzx, std::vector<std::vector<T>> *dy);
  // Type-dispatched implementation invoked by Launch.
  template <typename T>
  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);

 private:
  // Verifies the node has exactly 4 inputs and 2 outputs.
  void CheckParam(const CNodePtr &kernel_node);
  std::vector<size_t> probs_shape_;
  std::vector<size_t> indice_dims_;
  std::vector<size_t> labels_dims_;
  size_t num_class_;   // class-axis size of the probs input
  size_t max_time_;    // time-axis size
  size_t batch_size_;  // batch-axis size
  uint32_t blank_index_;  // class index used as the CTC blank — confirm semantics in .cc
  TypeId dtype_{kTypeUnknown};  // input dtype, selects LaunchKernel<T>
  // CTC behavior flags — presumably matching the TF CTCLoss attributes of the
  // same names; confirm against the .cc implementation.
  bool preprocess_collapse_repeated_;
  bool ctc_merge_repeated_;
  bool ignore_longer_outputs_than_inputs_;
};
// Register CTCLoss for float16 data (loss and gradient outputs are float16);
// the second input is int64 and the remaining two inputs are int32.
MS_REG_CPU_KERNEL(CTCLoss,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat16)
                    .AddInputAttr(kNumberTypeInt64)
                    .AddInputAttr(kNumberTypeInt32)
                    .AddInputAttr(kNumberTypeInt32)
                    .AddOutputAttr(kNumberTypeFloat16)
                    .AddOutputAttr(kNumberTypeFloat16),
                  CTCLossCPUKernel);
// Register CTCLoss for float32 data with the same integer input layout.
MS_REG_CPU_KERNEL(CTCLoss,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeInt64)
                    .AddInputAttr(kNumberTypeInt32)
                    .AddInputAttr(kNumberTypeInt32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  CTCLossCPUKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CTCLOSS_CPU_KERNEL_H_ | |||
| @@ -1,89 +1,89 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/depthtospace_cpu_kernel.h" | |||
| #include <vector> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T> | |||
| void DepthToSpaceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| CheckParam(kernel_node); | |||
| input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| block_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "block_size"); | |||
| } | |||
template <typename T>
bool DepthToSpaceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                      const std::vector<kernel::AddressPtr> & /*workspace*/,
                                      const std::vector<kernel::AddressPtr> &outputs) {
  // For every output element: decompose its linear index into per-dimension
  // coordinates, map those to the source element (moving data from the depth
  // dimension into block_size x block_size spatial blocks), and copy it.
  // The index math below hard-codes a 4-D layout (indices 0..3 are used),
  // consistent with NCHW — TODO confirm against the registered shapes.
  auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
  size_t size = IntToSize(inputs[0]->size / sizeof(T));
  // Local copies so the parallel lambda reads plain locals, not members.
  std::vector<size_t> input_shape = input_shape_;
  std::vector<size_t> output_shape = output_shape_;
  size_t block_size = block_size_;
  size_t input_dimension = input_shape.size();
  // output_strides[j] = product of output dims after j (row-major strides for
  // the first three dimensions). NOTE(review): the array is fixed at 3
  // entries, so input_dimension > 4 would index out of bounds — verify the
  // tensors are always rank 4.
  size_t output_strides[3] = {1, 1, 1};
  for (size_t i = input_dimension - 1; i >= 1; --i) {
    for (size_t j = 0; j < i; ++j) {
      output_strides[j] *= output_shape[i];
    }
  }
  auto task = [&, input_addr, output_addr](size_t start, size_t end) {
    std::vector<size_t> output_pos_array(input_dimension, 0);
    for (size_t i = start; i < end; ++i) {
      // Decompose linear output index i into output coordinates.
      size_t tmp_pos = i;
      for (size_t j = 0; j < input_dimension - 1; ++j) {
        output_pos_array[j] = tmp_pos / output_strides[j];
        tmp_pos %= output_strides[j];
      }
      output_pos_array.back() = tmp_pos;
      // Rebuild the linear input index: the source channel combines the
      // output channel with the intra-block offsets of the two spatial
      // coordinates; the spatial coordinates shrink by block_size.
      size_t input_pos = output_pos_array[0];
      input_pos =
        (input_pos * input_shape[1]) +
        (output_pos_array[1] +
         (block_size * (output_pos_array[2] % block_size) + output_pos_array[3] % block_size) * output_shape[1]);
      input_pos = (input_pos * input_shape[2]) + (output_pos_array[2] / block_size);
      input_pos = (input_pos * input_shape[3]) + (output_pos_array[3] / block_size);
      output_addr[i] = input_addr[input_pos];
    }
  };
  CPUKernelUtils::ParallelFor(task, size);
  return true;
}
| template <typename T> | |||
| void DepthToSpaceCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but DepthToSpaceCPUKerrnel needs 1 input."; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but DepthToSpaceCPUKernel needs 1 output."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/depthtospace_cpu_kernel.h" | |||
| #include <vector> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T> | |||
| void DepthToSpaceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| CheckParam(kernel_node); | |||
| input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| block_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "block_size"); | |||
| } | |||
template <typename T>
bool DepthToSpaceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                      const std::vector<kernel::AddressPtr> & /*workspace*/,
                                      const std::vector<kernel::AddressPtr> &outputs) {
  // For every output element: decompose its linear index into per-dimension
  // coordinates, map those to the source element (moving data from the depth
  // dimension into block_size x block_size spatial blocks), and copy it.
  // The index math below hard-codes a 4-D layout (indices 0..3 are used),
  // consistent with NCHW — TODO confirm against the registered shapes.
  auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
  size_t size = IntToSize(inputs[0]->size / sizeof(T));
  // Local copies so the parallel lambda reads plain locals, not members.
  std::vector<size_t> input_shape = input_shape_;
  std::vector<size_t> output_shape = output_shape_;
  size_t block_size = block_size_;
  size_t input_dimension = input_shape.size();
  // output_strides[j] = product of output dims after j (row-major strides for
  // the first three dimensions). NOTE(review): the array is fixed at 3
  // entries, so input_dimension > 4 would index out of bounds — verify the
  // tensors are always rank 4.
  size_t output_strides[3] = {1, 1, 1};
  for (size_t i = input_dimension - 1; i >= 1; --i) {
    for (size_t j = 0; j < i; ++j) {
      output_strides[j] *= output_shape[i];
    }
  }
  auto task = [&, input_addr, output_addr](size_t start, size_t end) {
    std::vector<size_t> output_pos_array(input_dimension, 0);
    for (size_t i = start; i < end; ++i) {
      // Decompose linear output index i into output coordinates.
      size_t tmp_pos = i;
      for (size_t j = 0; j < input_dimension - 1; ++j) {
        output_pos_array[j] = tmp_pos / output_strides[j];
        tmp_pos %= output_strides[j];
      }
      output_pos_array.back() = tmp_pos;
      // Rebuild the linear input index: the source channel combines the
      // output channel with the intra-block offsets of the two spatial
      // coordinates; the spatial coordinates shrink by block_size.
      size_t input_pos = output_pos_array[0];
      input_pos =
        (input_pos * input_shape[1]) +
        (output_pos_array[1] +
         (block_size * (output_pos_array[2] % block_size) + output_pos_array[3] % block_size) * output_shape[1]);
      input_pos = (input_pos * input_shape[2]) + (output_pos_array[2] / block_size);
      input_pos = (input_pos * input_shape[3]) + (output_pos_array[3] / block_size);
      output_addr[i] = input_addr[input_pos];
    }
  };
  CPUKernelUtils::ParallelFor(task, size);
  return true;
}
| template <typename T> | |||
| void DepthToSpaceCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but DepthToSpaceCPUKerrnel needs 1 input."; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but DepthToSpaceCPUKernel needs 1 output."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,85 +1,85 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_ | |||
| #include <memory> | |||
| #include <string> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU kernel for the DepthToSpace operator: moves data from the depth
// (channel) dimension into block_size x block_size spatial blocks.
template <typename T>
class DepthToSpaceCPUKernel : public CPUKernel {
 public:
  DepthToSpaceCPUKernel() = default;
  ~DepthToSpaceCPUKernel() override = default;
  // Caches input/output device shapes and the "block_size" attribute.
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 private:
  // Verifies exactly one input and one output.
  void CheckParam(const CNodePtr &kernel_node);
  std::vector<size_t> input_shape_;   // input device shape, cached at init
  std::vector<size_t> output_shape_;  // output device shape, cached at init
  size_t block_size_;                 // spatial block edge length ("block_size" attr)
};
// DepthToSpace is registered for every numeric dtype; input and output share
// the same type (SetAllSameAttr).
MS_REG_CPU_KERNEL_T(
  DepthToSpace, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  DepthToSpaceCPUKernel, float);
MS_REG_CPU_KERNEL_T(
  DepthToSpace, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  DepthToSpaceCPUKernel, float16);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
                    DepthToSpaceCPUKernel, int8_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
                    DepthToSpaceCPUKernel, int16_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
                    DepthToSpaceCPUKernel, int);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
                    DepthToSpaceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
                    DepthToSpaceCPUKernel, uint8_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16),
                    DepthToSpaceCPUKernel, uint16_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32),
                    DepthToSpaceCPUKernel, uint32_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64),
                    DepthToSpaceCPUKernel, uint64_t);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_ | |||
| #include <memory> | |||
| #include <string> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU kernel for the DepthToSpace operator: moves data from the depth
// (channel) dimension into block_size x block_size spatial blocks.
template <typename T>
class DepthToSpaceCPUKernel : public CPUKernel {
 public:
  DepthToSpaceCPUKernel() = default;
  ~DepthToSpaceCPUKernel() override = default;
  // Caches input/output device shapes and the "block_size" attribute.
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 private:
  // Verifies exactly one input and one output.
  void CheckParam(const CNodePtr &kernel_node);
  std::vector<size_t> input_shape_;   // input device shape, cached at init
  std::vector<size_t> output_shape_;  // output device shape, cached at init
  size_t block_size_;                 // spatial block edge length ("block_size" attr)
};
// DepthToSpace is registered for every numeric dtype; input and output share
// the same type (SetAllSameAttr).
MS_REG_CPU_KERNEL_T(
  DepthToSpace, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  DepthToSpaceCPUKernel, float);
MS_REG_CPU_KERNEL_T(
  DepthToSpace, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
  DepthToSpaceCPUKernel, float16);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
                    DepthToSpaceCPUKernel, int8_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
                    DepthToSpaceCPUKernel, int16_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
                    DepthToSpaceCPUKernel, int);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
                    DepthToSpaceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
                    DepthToSpaceCPUKernel, uint8_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16),
                    DepthToSpaceCPUKernel, uint16_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32),
                    DepthToSpaceCPUKernel, uint32_t);
MS_REG_CPU_KERNEL_T(DepthToSpace,
                    KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64),
                    DepthToSpaceCPUKernel, uint64_t);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_ | |||
| @@ -1,102 +1,102 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/addn_cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/fp32/add_fp32.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/errorcode.h" | |||
| #include "utils/ms_utils.h" | |||
| #include "common/thread_pool.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void AddInt(const int *in_0, const int *in_1, int *out, int start, int end) { | |||
| int ret = ElementAddInt(in_0 + start, in_1 + start, out + start, end - start); | |||
| if (ret != NNACL_OK) { | |||
| MS_LOG(EXCEPTION) << "Add failed."; | |||
| } | |||
| } | |||
void AddNCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  // NOTE(review): CheckParam reads the input_num_ member, but input_num_ is
  // only assigned after this call — verify the intended initialization order.
  CheckParam(kernel_node);
  input_num_ = AnfAlgo::GetInputTensorNum(kernel_node);
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
  // Build a oneDNN binary-add primitive over the first two inputs; Launch
  // reuses it (with outputs[0] as accumulator) to fold in remaining inputs.
  std::vector<size_t> src0_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  std::vector<size_t> src1_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
  std::vector<size_t> dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
  dnnl::memory::desc src0_mem_desc = GetDefaultMemDesc(src0_shape);
  dnnl::memory::desc src1_mem_desc = GetDefaultMemDesc(src1_shape);
  dnnl::memory::desc dst_mem_desc = GetDefaultMemDesc(dst_shape);
  dnnl::binary::desc desc = dnnl::binary::desc(dnnl::algorithm::binary_add, src0_mem_desc, src1_mem_desc, dst_mem_desc);
  auto prim_desc = dnnl::binary::primitive_desc(desc, MKLKernelEngine::Get().engine());
  primitive_ = std::make_shared<dnnl::binary>(prim_desc);
  AddArgument(DNNL_ARG_SRC_0, src0_mem_desc);
  AddArgument(DNNL_ARG_SRC_1, src1_mem_desc);
  AddArgument(DNNL_ARG_DST, dst_mem_desc);
}
bool AddNCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                           const std::vector<kernel::AddressPtr> &outputs) {
  // Folds all inputs into outputs[0]:
  // out = inputs[0] + inputs[1]; then out += inputs[i] for each i >= 2.
  if (dtype_ == kNumberTypeFloat32) {
    // oneDNN path: run the binary-add primitive built in InitKernel; the
    // accumulation steps use outputs[0] as both SRC_0 and DST (in-place add).
    SetArgumentHandle(DNNL_ARG_SRC_0, inputs[0]->addr);
    SetArgumentHandle(DNNL_ARG_SRC_1, inputs[1]->addr);
    SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr);
    ExecutePrimitive();
    for (size_t index = 2; index < input_num_; ++index) {
      SetArgumentHandle(DNNL_ARG_SRC_0, outputs[0]->addr);
      SetArgumentHandle(DNNL_ARG_SRC_1, inputs[index]->addr);
      SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr);
      ExecutePrimitive();
    }
  } else if (dtype_ == kNumberTypeInt32) {
    // int32 path: nnacl element-wise add, parallelized over element ranges.
    size_t elements_num = outputs[0]->size / sizeof(int);
    const auto input_0 = reinterpret_cast<int *>(inputs[0]->addr);
    const auto input_1 = reinterpret_cast<int *>(inputs[1]->addr);
    auto output = reinterpret_cast<int *>(outputs[0]->addr);
    auto task_0 = std::bind(AddInt, input_0, input_1, output, std::placeholders::_1, std::placeholders::_2);
    CPUKernelUtils::ParallelFor(task_0, elements_num);
    for (size_t index = 2; index < input_num_; ++index) {
      const auto input = reinterpret_cast<int *>(inputs[index]->addr);
      auto task = std::bind(AddInt, input, output, output, std::placeholders::_1, std::placeholders::_2);
      CPUKernelUtils::ParallelFor(task, elements_num);
    }
  } else {
    MS_LOG(EXCEPTION) << "AddN only support float32 and int32, but got " << TypeIdToType(dtype_)->ToString();
  }
  return true;
}
| void AddNCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| auto src0_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| auto dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| if (src0_shape != dst_shape) { | |||
| MS_LOG(EXCEPTION) << "AddN output shape must be equal to input shape."; | |||
| } | |||
| for (size_t index = 1; index < input_num_; ++index) { | |||
| auto src_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, index); | |||
| if (src0_shape != src_shape) { | |||
| MS_LOG(EXCEPTION) << "AddN input shapes must be equal."; | |||
| } | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but AddNCPUKernel needs 1 output."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/addn_cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/fp32/add_fp32.h" | |||
| #include "backend/kernel_compiler/cpu/nnacl/errorcode.h" | |||
| #include "utils/ms_utils.h" | |||
| #include "common/thread_pool.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void AddInt(const int *in_0, const int *in_1, int *out, int start, int end) { | |||
| int ret = ElementAddInt(in_0 + start, in_1 + start, out + start, end - start); | |||
| if (ret != NNACL_OK) { | |||
| MS_LOG(EXCEPTION) << "Add failed."; | |||
| } | |||
| } | |||
void AddNCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  // NOTE(review): CheckParam reads the input_num_ member, but input_num_ is
  // only assigned after this call — verify the intended initialization order.
  CheckParam(kernel_node);
  input_num_ = AnfAlgo::GetInputTensorNum(kernel_node);
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
  // Build a oneDNN binary-add primitive over the first two inputs; Launch
  // reuses it (with outputs[0] as accumulator) to fold in remaining inputs.
  std::vector<size_t> src0_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  std::vector<size_t> src1_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
  std::vector<size_t> dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
  dnnl::memory::desc src0_mem_desc = GetDefaultMemDesc(src0_shape);
  dnnl::memory::desc src1_mem_desc = GetDefaultMemDesc(src1_shape);
  dnnl::memory::desc dst_mem_desc = GetDefaultMemDesc(dst_shape);
  dnnl::binary::desc desc = dnnl::binary::desc(dnnl::algorithm::binary_add, src0_mem_desc, src1_mem_desc, dst_mem_desc);
  auto prim_desc = dnnl::binary::primitive_desc(desc, MKLKernelEngine::Get().engine());
  primitive_ = std::make_shared<dnnl::binary>(prim_desc);
  AddArgument(DNNL_ARG_SRC_0, src0_mem_desc);
  AddArgument(DNNL_ARG_SRC_1, src1_mem_desc);
  AddArgument(DNNL_ARG_DST, dst_mem_desc);
}
bool AddNCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                           const std::vector<kernel::AddressPtr> &outputs) {
  // Folds all inputs into outputs[0]:
  // out = inputs[0] + inputs[1]; then out += inputs[i] for each i >= 2.
  if (dtype_ == kNumberTypeFloat32) {
    // oneDNN path: run the binary-add primitive built in InitKernel; the
    // accumulation steps use outputs[0] as both SRC_0 and DST (in-place add).
    SetArgumentHandle(DNNL_ARG_SRC_0, inputs[0]->addr);
    SetArgumentHandle(DNNL_ARG_SRC_1, inputs[1]->addr);
    SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr);
    ExecutePrimitive();
    for (size_t index = 2; index < input_num_; ++index) {
      SetArgumentHandle(DNNL_ARG_SRC_0, outputs[0]->addr);
      SetArgumentHandle(DNNL_ARG_SRC_1, inputs[index]->addr);
      SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr);
      ExecutePrimitive();
    }
  } else if (dtype_ == kNumberTypeInt32) {
    // int32 path: nnacl element-wise add, parallelized over element ranges.
    size_t elements_num = outputs[0]->size / sizeof(int);
    const auto input_0 = reinterpret_cast<int *>(inputs[0]->addr);
    const auto input_1 = reinterpret_cast<int *>(inputs[1]->addr);
    auto output = reinterpret_cast<int *>(outputs[0]->addr);
    auto task_0 = std::bind(AddInt, input_0, input_1, output, std::placeholders::_1, std::placeholders::_2);
    CPUKernelUtils::ParallelFor(task_0, elements_num);
    for (size_t index = 2; index < input_num_; ++index) {
      const auto input = reinterpret_cast<int *>(inputs[index]->addr);
      auto task = std::bind(AddInt, input, output, output, std::placeholders::_1, std::placeholders::_2);
      CPUKernelUtils::ParallelFor(task, elements_num);
    }
  } else {
    MS_LOG(EXCEPTION) << "AddN only support float32 and int32, but got " << TypeIdToType(dtype_)->ToString();
  }
  return true;
}
| void AddNCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| auto src0_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| auto dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| if (src0_shape != dst_shape) { | |||
| MS_LOG(EXCEPTION) << "AddN output shape must be equal to input shape."; | |||
| } | |||
| for (size_t index = 1; index < input_num_; ++index) { | |||
| auto src_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, index); | |||
| if (src0_shape != src_shape) { | |||
| MS_LOG(EXCEPTION) << "AddN input shapes must be equal."; | |||
| } | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but AddNCPUKernel needs 1 output."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,51 +1,51 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU kernel for AddN: elementwise sum of N same-shaped tensors.
// float32 uses a dnnl binary_add primitive; int32 uses a parallel scalar loop.
class AddNCPUKernel : public MKLCPUKernel {
 public:
  AddNCPUKernel() = default;
  ~AddNCPUKernel() override = default;
  // Builds the dnnl binary_add primitive and records memory descriptors.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Accumulates all inputs into outputs[0]; throws for unsupported dtypes.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 private:
  // Verifies all input shapes match each other and the single output's shape.
  void CheckParam(const CNodePtr &kernel_node);
  size_t input_num_{0};               // number of tensors being summed (set during init — not shown here)
  std::vector<size_t> output_shape_;  // output shape cache (populated during init — not shown here)
  TypeId dtype_{kNumberTypeFloat32};  // element type; selects the Launch code path
};
// Register AddN for float32 and int32. SetAllSameAttr(true) presumably lets the
// single input attr cover AddN's variable input count — confirm against KernelAttr.
MS_REG_CPU_KERNEL(AddN,
                  KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
                  AddNCPUKernel);
MS_REG_CPU_KERNEL(AddN,
                  KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
                  AddNCPUKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU kernel for AddN: elementwise sum of N same-shaped tensors.
// float32 uses a dnnl binary_add primitive; int32 uses a parallel scalar loop.
class AddNCPUKernel : public MKLCPUKernel {
 public:
  AddNCPUKernel() = default;
  ~AddNCPUKernel() override = default;
  // Builds the dnnl binary_add primitive and records memory descriptors.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Accumulates all inputs into outputs[0]; throws for unsupported dtypes.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 private:
  // Verifies all input shapes match each other and the single output's shape.
  void CheckParam(const CNodePtr &kernel_node);
  size_t input_num_{0};               // number of tensors being summed (set during init — not shown here)
  std::vector<size_t> output_shape_;  // output shape cache (populated during init — not shown here)
  TypeId dtype_{kNumberTypeFloat32};  // element type; selects the Launch code path
};
// Register AddN for float32 and int32. SetAllSameAttr(true) presumably lets the
// single input attr cover AddN's variable input count — confirm against KernelAttr.
MS_REG_CPU_KERNEL(AddN,
                  KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
                  AddNCPUKernel);
MS_REG_CPU_KERNEL(AddN,
                  KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
                  AddNCPUKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ADDN_CPU_KERNEL_H_ | |||
| @@ -1,178 +1,178 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/lstm_cpu_kernel.h" | |||
| #include <string> | |||
| #include "utils/ms_utils.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| const int kMaxLSTMLayer = 100; | |||
| const int kOutputWorkSpaceIndex = 3; | |||
void LstmCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
  // After default sizing, override the workspace output slot: its size is the
  // reserve size chosen by the dnnl primitive, which shape inference cannot know.
  CPUKernel::InitInputOutputSize(kernel_node);
  output_size_list_[kOutputWorkSpaceIndex] = reserve_size_;
  auto output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  auto output_type = AnfAlgo::GetOutputInferDataType(kernel_node, 0);
  auto output_types = std::vector<TypeId>(output_num, output_type);
  std::vector<std::vector<size_t>> output_shapes;
  for (size_t output_index = 0; output_index < output_num; ++output_index) {
    std::vector<size_t> shape = AnfAlgo::GetOutputInferShape(kernel_node, output_index);
    output_shapes.emplace_back(shape);
  }
  // reserve_size_ is in bytes; 4 is presumably sizeof(float) since outputs are
  // float32 — TODO confirm. Note any remainder bytes are truncated by the division.
  size_t len = reserve_size_ / 4;
  output_shapes[kOutputWorkSpaceIndex] = {len, 1};
  // Rewrite the node's inferred types/shapes so downstream passes see the
  // workspace tensor with its true runtime shape.
  AnfAlgo::SetOutputInferTypeAndShape(output_types, output_shapes, kernel_node.get());
}
// Builds the dnnl (oneDNN) lstm_forward primitive for this node: reads attrs,
// constructs memory descriptors, and registers the primitive's arguments.
void LstmCPUKernel::InitKernel(const CNodePtr &kernel_node) {
#ifdef PLATFORM_86
  // Flush denormal floats to zero to avoid very slow subnormal arithmetic on x86.
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#endif
  MS_EXCEPTION_IF_NULL(kernel_node);
  using tag = dnnl::memory::format_tag;
  using dim = dnnl::memory::dims;
  // CheckParam also populates seq_len_/batch_size_/hidden_size_/... used below.
  CheckParam(kernel_node);
  auto eng = MKLKernelEngine::Get().engine();
  // Bidirectional mode concatenates the two directions' outputs along the feature dim.
  dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional;
  if (bidirectional_) {
    direction = dnnl::rnn_direction::bidirectional_concat;
  }
  // The literal 4 in the weight/bias dims is the LSTM gate count.
  // NOTE(review): weights_dims_ uses input_size_ for every layer; for
  // num_layers_ > 1 the deeper layers' input width is hidden_size_*num_directions_
  // — verify this matches how the flat weight tensor is packed.
  dim src_dims = {seq_len_, batch_size_, input_size_};
  dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  weights_dims_ = {num_layers_, num_directions_, input_size_, 4, hidden_size_};
  weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_};
  bias_dims_ = {num_layers_, num_directions_, 4, hidden_size_};
  dim dst_dims = {seq_len_, batch_size_, static_cast<int64_t>(hidden_size_) * num_directions_};
  dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc);
  dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc);
  dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc);
  dnnl::memory::desc bias_desc = formatted_md(bias_dims_, tag::ldgo);
  dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc);
  dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc);
  dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc);
  // Nodes without the attr default to training mode (workspace is produced).
  if (!kernel_node->HasAttr(kAttrIsTraining)) {
    is_training = true;
  } else {
    is_training = GetValue<bool>(kernel_node->GetAttr(kAttrIsTraining));
  }
  auto prop_kind = dnnl::prop_kind::forward_training;
  if (!is_training) {
    prop_kind = dnnl::prop_kind::forward_inference;
  }
  // Weight descriptors use tag::any so dnnl may pick its preferred blocked
  // layout; Launch reorders user weights into that layout.
  auto desc = std::make_shared<dnnl::lstm_forward::desc>(
    prop_kind, direction, src_desc, src_h_desc, src_c_desc, formatted_md(weights_dims_, tag::any),
    formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc, dst_c_desc);
  prim_desc_ = dnnl::lstm_forward::primitive_desc(*desc, eng);
  primitive_ = std::make_shared<dnnl::lstm_forward>(prim_desc_);
  if (is_training) {
    // Training needs the workspace preserved for the backward pass.
    reserve_size_ = static_cast<size_t>(prim_desc_.workspace_desc().get_size());
    AddArgument(DNNL_ARG_WORKSPACE, prim_desc_.workspace_desc());
  } else {
    // Dummy non-zero size so the workspace output slot is still well-formed.
    reserve_size_ = 1;
  }
  AddArgument(DNNL_ARG_SRC_LAYER, src_desc);
  AddArgument(DNNL_ARG_SRC_ITER, src_h_desc);
  AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc);
  AddArgument(DNNL_ARG_WEIGHTS_LAYER, prim_desc_.weights_layer_desc());
  AddArgument(DNNL_ARG_WEIGHTS_ITER, prim_desc_.weights_iter_desc());
  AddArgument(DNNL_ARG_BIAS, bias_desc);
  AddArgument(DNNL_ARG_DST_LAYER, dst_desc);
  AddArgument(DNNL_ARG_DST_ITER, dst_h_desc);
  AddArgument(DNNL_ARG_DST_ITER_C, dst_c_desc);
}
| void LstmCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| std::vector<size_t> src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2); | |||
| bidirectional_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "bidirectional"); | |||
| input_size_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "input_size")); | |||
| hidden_size_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "hidden_size")); | |||
| num_layers_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "num_layers")); | |||
| has_bias_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "has_bias"); | |||
| batch_size_ = SizeToInt(src_shape[1]); | |||
| seq_len_ = SizeToInt(src_shape[0]); | |||
| num_directions_ = 1; | |||
| if (bidirectional_) { | |||
| num_directions_ = 2; | |||
| } | |||
| const int gate_size = 4 * hidden_size_; | |||
| if (num_layers_ <= 0) { | |||
| MS_LOG(EXCEPTION) << "Layers must be greater than zero!"; | |||
| } | |||
| if (num_layers_ > kMaxLSTMLayer) { | |||
| MS_LOG(EXCEPTION) << "Layers must be lower than 100!"; | |||
| } | |||
| for (int i = 0; i < num_layers_; ++i) { | |||
| weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_); | |||
| weight_h_size_ += gate_size * hidden_size_; | |||
| } | |||
| weight_size_ = weight_size_ * num_directions_; | |||
| weight_h_size_ = weight_h_size_ * num_directions_; | |||
| if (num_directions_ * num_layers_ != SizeToInt(src_h_shape[0])) { | |||
| MS_LOG(EXCEPTION) << "Error iteration shape!"; | |||
| } | |||
| if (src_shape.size() != 3 || src_h_shape.size() != 3 || src_c_shape.size() != 3) { | |||
| MS_LOG(EXCEPTION) << "Lstm only support 3-D input!"; | |||
| } | |||
| } | |||
bool LstmCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                           const std::vector<kernel::AddressPtr> &outputs) {
  // Runs the prebuilt lstm_forward primitive.
  // inputs:  [0]=x, [1]=h0, [2]=c0, [3]=flat weights packed as
  //          [layer weights | recurrent weights | bias] (offsets via
  //          weight_size_/weight_h_size_).
  // outputs: [0]=y, [1]=hn, [2]=cn, [3]=workspace (training only).
  using dt = dnnl::memory::data_type;
  using tag = dnnl::memory::format_tag;
  auto eng = MKLKernelEngine::Get().engine();
  // User weights arrive in ldgoi layout; reorder into whatever blocked layout
  // the primitive descriptor chose (tag::any at init time).
  auto user_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng);
  auto user_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng);
  auto weights_memory = dnnl::memory(prim_desc_.weights_layer_desc(), eng);
  auto weights_h_memory = dnnl::memory(prim_desc_.weights_iter_desc(), eng);
  user_weights_memory.set_data_handle(inputs[3]->addr);
  user_weights_h_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_);
  Reorder(&user_weights_memory, &weights_memory);
  Reorder(&user_weights_h_memory, &weights_h_memory);
  auto bias_memory = dnnl::memory(prim_desc_.bias_desc(), eng);
  if (has_bias_) {
    // Bias follows both weight sections in the flat tensor.
    bias_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_ + weight_h_size_);
  } else {
    // No bias provided: zero the primitive's own bias buffer.
    if (memset_s(bias_memory.get_data_handle(), prim_desc_.bias_desc().get_size(), 0,
                 prim_desc_.bias_desc().get_size())) {
      MS_LOG(EXCEPTION) << "Bias memset error";
    }
  }
  // set handle
  SetArgumentHandle(DNNL_ARG_SRC_LAYER, inputs[0]->addr);
  SetArgumentHandle(DNNL_ARG_SRC_ITER, inputs[1]->addr);
  SetArgumentHandle(DNNL_ARG_SRC_ITER_C, inputs[2]->addr);
  SetArgumentHandle(DNNL_ARG_WEIGHTS_LAYER, weights_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_WEIGHTS_ITER, weights_h_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_BIAS, bias_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_DST_LAYER, outputs[0]->addr);
  SetArgumentHandle(DNNL_ARG_DST_ITER, outputs[1]->addr);
  SetArgumentHandle(DNNL_ARG_DST_ITER_C, outputs[2]->addr);
  if (is_training) {
    // Expose the workspace so the backward LSTM kernel can consume it.
    SetArgumentHandle(DNNL_ARG_WORKSPACE, outputs[3]->addr);
  }
  ExecutePrimitive();
  return true;
}
| } // namespace kernel | |||
| } // namespace mindspore | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/lstm_cpu_kernel.h" | |||
| #include <string> | |||
| #include "utils/ms_utils.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| const int kMaxLSTMLayer = 100; | |||
| const int kOutputWorkSpaceIndex = 3; | |||
void LstmCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
  // After default sizing, override the workspace output slot: its size is the
  // reserve size chosen by the dnnl primitive, which shape inference cannot know.
  CPUKernel::InitInputOutputSize(kernel_node);
  output_size_list_[kOutputWorkSpaceIndex] = reserve_size_;
  auto output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  auto output_type = AnfAlgo::GetOutputInferDataType(kernel_node, 0);
  auto output_types = std::vector<TypeId>(output_num, output_type);
  std::vector<std::vector<size_t>> output_shapes;
  for (size_t output_index = 0; output_index < output_num; ++output_index) {
    std::vector<size_t> shape = AnfAlgo::GetOutputInferShape(kernel_node, output_index);
    output_shapes.emplace_back(shape);
  }
  // reserve_size_ is in bytes; 4 is presumably sizeof(float) since outputs are
  // float32 — TODO confirm. Note any remainder bytes are truncated by the division.
  size_t len = reserve_size_ / 4;
  output_shapes[kOutputWorkSpaceIndex] = {len, 1};
  // Rewrite the node's inferred types/shapes so downstream passes see the
  // workspace tensor with its true runtime shape.
  AnfAlgo::SetOutputInferTypeAndShape(output_types, output_shapes, kernel_node.get());
}
// Builds the dnnl (oneDNN) lstm_forward primitive for this node: reads attrs,
// constructs memory descriptors, and registers the primitive's arguments.
void LstmCPUKernel::InitKernel(const CNodePtr &kernel_node) {
#ifdef PLATFORM_86
  // Flush denormal floats to zero to avoid very slow subnormal arithmetic on x86.
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#endif
  MS_EXCEPTION_IF_NULL(kernel_node);
  using tag = dnnl::memory::format_tag;
  using dim = dnnl::memory::dims;
  // CheckParam also populates seq_len_/batch_size_/hidden_size_/... used below.
  CheckParam(kernel_node);
  auto eng = MKLKernelEngine::Get().engine();
  // Bidirectional mode concatenates the two directions' outputs along the feature dim.
  dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional;
  if (bidirectional_) {
    direction = dnnl::rnn_direction::bidirectional_concat;
  }
  // The literal 4 in the weight/bias dims is the LSTM gate count.
  // NOTE(review): weights_dims_ uses input_size_ for every layer; for
  // num_layers_ > 1 the deeper layers' input width is hidden_size_*num_directions_
  // — verify this matches how the flat weight tensor is packed.
  dim src_dims = {seq_len_, batch_size_, input_size_};
  dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  weights_dims_ = {num_layers_, num_directions_, input_size_, 4, hidden_size_};
  weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_};
  bias_dims_ = {num_layers_, num_directions_, 4, hidden_size_};
  dim dst_dims = {seq_len_, batch_size_, static_cast<int64_t>(hidden_size_) * num_directions_};
  dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc);
  dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc);
  dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc);
  dnnl::memory::desc bias_desc = formatted_md(bias_dims_, tag::ldgo);
  dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc);
  dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc);
  dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc);
  // Nodes without the attr default to training mode (workspace is produced).
  if (!kernel_node->HasAttr(kAttrIsTraining)) {
    is_training = true;
  } else {
    is_training = GetValue<bool>(kernel_node->GetAttr(kAttrIsTraining));
  }
  auto prop_kind = dnnl::prop_kind::forward_training;
  if (!is_training) {
    prop_kind = dnnl::prop_kind::forward_inference;
  }
  // Weight descriptors use tag::any so dnnl may pick its preferred blocked
  // layout; Launch reorders user weights into that layout.
  auto desc = std::make_shared<dnnl::lstm_forward::desc>(
    prop_kind, direction, src_desc, src_h_desc, src_c_desc, formatted_md(weights_dims_, tag::any),
    formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc, dst_c_desc);
  prim_desc_ = dnnl::lstm_forward::primitive_desc(*desc, eng);
  primitive_ = std::make_shared<dnnl::lstm_forward>(prim_desc_);
  if (is_training) {
    // Training needs the workspace preserved for the backward pass.
    reserve_size_ = static_cast<size_t>(prim_desc_.workspace_desc().get_size());
    AddArgument(DNNL_ARG_WORKSPACE, prim_desc_.workspace_desc());
  } else {
    // Dummy non-zero size so the workspace output slot is still well-formed.
    reserve_size_ = 1;
  }
  AddArgument(DNNL_ARG_SRC_LAYER, src_desc);
  AddArgument(DNNL_ARG_SRC_ITER, src_h_desc);
  AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc);
  AddArgument(DNNL_ARG_WEIGHTS_LAYER, prim_desc_.weights_layer_desc());
  AddArgument(DNNL_ARG_WEIGHTS_ITER, prim_desc_.weights_iter_desc());
  AddArgument(DNNL_ARG_BIAS, bias_desc);
  AddArgument(DNNL_ARG_DST_LAYER, dst_desc);
  AddArgument(DNNL_ARG_DST_ITER, dst_h_desc);
  AddArgument(DNNL_ARG_DST_ITER_C, dst_c_desc);
}
| void LstmCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| std::vector<size_t> src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2); | |||
| bidirectional_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "bidirectional"); | |||
| input_size_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "input_size")); | |||
| hidden_size_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "hidden_size")); | |||
| num_layers_ = static_cast<int>(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "num_layers")); | |||
| has_bias_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "has_bias"); | |||
| batch_size_ = SizeToInt(src_shape[1]); | |||
| seq_len_ = SizeToInt(src_shape[0]); | |||
| num_directions_ = 1; | |||
| if (bidirectional_) { | |||
| num_directions_ = 2; | |||
| } | |||
| const int gate_size = 4 * hidden_size_; | |||
| if (num_layers_ <= 0) { | |||
| MS_LOG(EXCEPTION) << "Layers must be greater than zero!"; | |||
| } | |||
| if (num_layers_ > kMaxLSTMLayer) { | |||
| MS_LOG(EXCEPTION) << "Layers must be lower than 100!"; | |||
| } | |||
| for (int i = 0; i < num_layers_; ++i) { | |||
| weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_); | |||
| weight_h_size_ += gate_size * hidden_size_; | |||
| } | |||
| weight_size_ = weight_size_ * num_directions_; | |||
| weight_h_size_ = weight_h_size_ * num_directions_; | |||
| if (num_directions_ * num_layers_ != SizeToInt(src_h_shape[0])) { | |||
| MS_LOG(EXCEPTION) << "Error iteration shape!"; | |||
| } | |||
| if (src_shape.size() != 3 || src_h_shape.size() != 3 || src_c_shape.size() != 3) { | |||
| MS_LOG(EXCEPTION) << "Lstm only support 3-D input!"; | |||
| } | |||
| } | |||
bool LstmCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                           const std::vector<kernel::AddressPtr> &outputs) {
  // Runs the prebuilt lstm_forward primitive.
  // inputs:  [0]=x, [1]=h0, [2]=c0, [3]=flat weights packed as
  //          [layer weights | recurrent weights | bias] (offsets via
  //          weight_size_/weight_h_size_).
  // outputs: [0]=y, [1]=hn, [2]=cn, [3]=workspace (training only).
  using dt = dnnl::memory::data_type;
  using tag = dnnl::memory::format_tag;
  auto eng = MKLKernelEngine::Get().engine();
  // User weights arrive in ldgoi layout; reorder into whatever blocked layout
  // the primitive descriptor chose (tag::any at init time).
  auto user_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng);
  auto user_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng);
  auto weights_memory = dnnl::memory(prim_desc_.weights_layer_desc(), eng);
  auto weights_h_memory = dnnl::memory(prim_desc_.weights_iter_desc(), eng);
  user_weights_memory.set_data_handle(inputs[3]->addr);
  user_weights_h_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_);
  Reorder(&user_weights_memory, &weights_memory);
  Reorder(&user_weights_h_memory, &weights_h_memory);
  auto bias_memory = dnnl::memory(prim_desc_.bias_desc(), eng);
  if (has_bias_) {
    // Bias follows both weight sections in the flat tensor.
    bias_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_ + weight_h_size_);
  } else {
    // No bias provided: zero the primitive's own bias buffer.
    if (memset_s(bias_memory.get_data_handle(), prim_desc_.bias_desc().get_size(), 0,
                 prim_desc_.bias_desc().get_size())) {
      MS_LOG(EXCEPTION) << "Bias memset error";
    }
  }
  // set handle
  SetArgumentHandle(DNNL_ARG_SRC_LAYER, inputs[0]->addr);
  SetArgumentHandle(DNNL_ARG_SRC_ITER, inputs[1]->addr);
  SetArgumentHandle(DNNL_ARG_SRC_ITER_C, inputs[2]->addr);
  SetArgumentHandle(DNNL_ARG_WEIGHTS_LAYER, weights_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_WEIGHTS_ITER, weights_h_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_BIAS, bias_memory.get_data_handle());
  SetArgumentHandle(DNNL_ARG_DST_LAYER, outputs[0]->addr);
  SetArgumentHandle(DNNL_ARG_DST_ITER, outputs[1]->addr);
  SetArgumentHandle(DNNL_ARG_DST_ITER_C, outputs[2]->addr);
  if (is_training) {
    // Expose the workspace so the backward LSTM kernel can consume it.
    SetArgumentHandle(DNNL_ARG_WORKSPACE, outputs[3]->addr);
  }
  ExecutePrimitive();
  return true;
}
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,76 +1,76 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_ | |||
| #if defined(__x86_64__) || defined(__amd64__) || defined(_M_IX86) || defined(_M_X64) | |||
| #define PLATFORM_86 | |||
| #endif | |||
| #ifdef PLATFORM_86 | |||
| #include <pmmintrin.h> | |||
| #endif | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU LSTM forward kernel backed by the dnnl (oneDNN) lstm_forward primitive.
class LstmCPUKernel : public MKLCPUKernel {
 public:
  LstmCPUKernel() = default;
  ~LstmCPUKernel() override = default;
  // Reads node attrs/shapes and builds the dnnl lstm_forward primitive.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Executes the primitive; reorders user weights into dnnl's chosen layout.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 protected:
  // Overrides the workspace output's size with the dnnl reserve size.
  void InitInputOutputSize(const CNodePtr &kernel_node) override;

 private:
  // Validates attrs/shapes and derives the derived sizes below.
  void CheckParam(const CNodePtr &kernel_node);
  int weight_size_ = 0;    // total layer-weight element count (all layers/directions)
  int weight_h_size_ = 0;  // total recurrent-weight element count
  int input_size_;
  int hidden_size_;
  int num_layers_;
  int batch_size_;
  int seq_len_;
  int num_directions_;     // 1, or 2 when bidirectional
  bool bidirectional_;
  bool has_bias_;          // whether inputs[3] carries a bias section
  size_t reserve_size_;    // dnnl workspace size in bytes (1 when inferring)
  bool is_training;        // training produces/exposes the dnnl workspace
  dnnl::memory::dims weights_dims_;    // layer weights dims (l, d, i, 4, h)
  dnnl::memory::dims weights_h_dims_;  // recurrent weights dims (l, d, h, 4, h)
  dnnl::memory::dims bias_dims_;       // bias dims (l, d, 4, h)
  dnnl::lstm_forward::primitive_desc prim_desc_;
};
// Register LSTM: 4 float32 inputs (x, h0, c0, flat weights) and 5 float32
// outputs (per the attrs below; the extra outputs beyond y/hn/cn include the
// workspace — see kOutputWorkSpaceIndex in the .cc).
MS_REG_CPU_KERNEL(LSTM,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  LstmCPUKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H_ | |||
| #if defined(__x86_64__) || defined(__amd64__) || defined(_M_IX86) || defined(_M_X64) | |||
| #define PLATFORM_86 | |||
| #endif | |||
| #ifdef PLATFORM_86 | |||
| #include <pmmintrin.h> | |||
| #endif | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU LSTM forward kernel backed by the dnnl (oneDNN) lstm_forward primitive.
class LstmCPUKernel : public MKLCPUKernel {
 public:
  LstmCPUKernel() = default;
  ~LstmCPUKernel() override = default;
  // Reads node attrs/shapes and builds the dnnl lstm_forward primitive.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Executes the primitive; reorders user weights into dnnl's chosen layout.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 protected:
  // Overrides the workspace output's size with the dnnl reserve size.
  void InitInputOutputSize(const CNodePtr &kernel_node) override;

 private:
  // Validates attrs/shapes and derives the derived sizes below.
  void CheckParam(const CNodePtr &kernel_node);
  int weight_size_ = 0;    // total layer-weight element count (all layers/directions)
  int weight_h_size_ = 0;  // total recurrent-weight element count
  int input_size_;
  int hidden_size_;
  int num_layers_;
  int batch_size_;
  int seq_len_;
  int num_directions_;     // 1, or 2 when bidirectional
  bool bidirectional_;
  bool has_bias_;          // whether inputs[3] carries a bias section
  size_t reserve_size_;    // dnnl workspace size in bytes (1 when inferring)
  bool is_training;        // training produces/exposes the dnnl workspace
  dnnl::memory::dims weights_dims_;    // layer weights dims (l, d, i, 4, h)
  dnnl::memory::dims weights_h_dims_;  // recurrent weights dims (l, d, h, 4, h)
  dnnl::memory::dims bias_dims_;       // bias dims (l, d, 4, h)
  dnnl::lstm_forward::primitive_desc prim_desc_;
};
// Register LSTM: 4 float32 inputs (x, h0, c0, flat weights) and 5 float32
// outputs (per the attrs below; the extra outputs beyond y/hn/cn include the
// workspace — see kOutputWorkSpaceIndex in the .cc).
MS_REG_CPU_KERNEL(LSTM,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  LstmCPUKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_CPU_KERNEL_H | |||
| @@ -1,218 +1,218 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/lstm_grad_cpu_kernel.h" | |||
| #include <cstring> | |||
| #include <string> | |||
| #include "utils/ms_utils.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| const int kMaxLSTMLayer = 100; | |||
| const int kInputWorkSpaceIndex = 10; | |||
void LSTMGradCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
  // The forward pass's dnnl workspace ("reserve") arrives as input
  // kInputWorkSpaceIndex; size it to reserve_size_ (computed in InitKernel —
  // not fully visible here) rather than the shape-inferred default.
  CPUKernel::InitInputOutputSize(kernel_node);
  input_size_list_[kInputWorkSpaceIndex] = reserve_size_;
}
// Builds the oneDNN LSTM backward primitive for this node.
// A forward primitive descriptor (prop_kind::forward_training) is created
// first because the backward descriptor requires it as a hint, and its
// workspace size must be known to size the workspace input.
void LSTMGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  using tag = dnnl::memory::format_tag;
  using dim = dnnl::memory::dims;
  CheckParam(kernel_node);
  auto eng = MKLKernelEngine::Get().engine();
  dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional;
  if (bidirectional_) {
    direction = dnnl::rnn_direction::bidirectional_concat;
  }
  // Dimension letters follow oneDNN RNN conventions:
  // t = time step, n = batch, c = channels, l = layers, d = directions,
  // g = gates (4 for LSTM: i, f, g~, o), o = output channels.
  dim src_dims = {seq_len_, batch_size_, input_size_};
  dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  weights_dims_ = {num_layers_, num_directions_, input_size_, 4, hidden_size_};
  weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_};
  bias_dims_ = {num_layers_, num_directions_, 4, hidden_size_};
  dim dst_dims = {seq_len_, batch_size_, static_cast<int64_t>(hidden_size_) * num_directions_};
  dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc);
  dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc);
  dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc);
  dnnl::memory::desc bias_desc = formatted_md(bias_dims_, tag::ldgo);
  dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc);
  dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc);
  dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc);
  // tag::any lets oneDNN choose the optimal weight layout; the user-format
  // weights are reordered into that layout at Launch() time.
  auto forward_desc = std::make_shared<dnnl::lstm_forward::desc>(
    dnnl::prop_kind::forward_training, direction, src_desc, src_h_desc, src_c_desc,
    formatted_md(weights_dims_, tag::any), formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc,
    dst_c_desc);
  auto prim_forward_desc = dnnl::lstm_forward::primitive_desc(*forward_desc, eng);
  // The backward descriptor takes the forward tensor descs followed by the
  // matching diff (gradient) descs in the same order.
  auto backward_desc = std::make_shared<dnnl::lstm_backward::desc>(
    dnnl::prop_kind::backward, direction, src_desc, src_h_desc, src_c_desc, formatted_md(weights_dims_, tag::any),
    formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc, dst_c_desc, src_desc, src_h_desc,
    src_c_desc, formatted_md(weights_dims_, tag::any), formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc,
    dst_h_desc, dst_c_desc);
  prim_backward_desc_ = dnnl::lstm_backward::primitive_desc(*backward_desc, eng, prim_forward_desc);
  primitive_ = std::make_shared<dnnl::lstm_backward>(prim_backward_desc_);
  // The forward pass's workspace must be replayed into backward; its size is
  // exported via InitInputOutputSize().
  reserve_size_ = static_cast<size_t>(prim_forward_desc.workspace_desc().get_size());
  AddArgument(DNNL_ARG_WORKSPACE, prim_forward_desc.workspace_desc());
  AddArgumentOp(src_desc, src_h_desc, src_c_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc);
}
// Registers the memory descriptor for every argument of the backward
// primitive. Weight descriptors are taken from prim_backward_desc_ because
// their layout was left to oneDNN (tag::any) in InitKernel().
void LSTMGradCPUKernel::AddArgumentOp(const dnnl::memory::desc &src_desc, const dnnl::memory::desc &src_h_desc,
                                      const dnnl::memory::desc &src_c_desc, const dnnl::memory::desc &bias_desc,
                                      const dnnl::memory::desc &dst_desc, const dnnl::memory::desc &dst_h_desc,
                                      const dnnl::memory::desc &dst_c_desc) {
  // Forward-pass tensors (re-used by the backward computation).
  AddArgument(DNNL_ARG_SRC_LAYER, src_desc);
  AddArgument(DNNL_ARG_SRC_ITER, src_h_desc);
  AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc);
  AddArgument(DNNL_ARG_WEIGHTS_LAYER, prim_backward_desc_.weights_layer_desc());
  AddArgument(DNNL_ARG_WEIGHTS_ITER, prim_backward_desc_.weights_iter_desc());
  AddArgument(DNNL_ARG_BIAS, bias_desc);
  AddArgument(DNNL_ARG_DST_LAYER, dst_desc);
  AddArgument(DNNL_ARG_DST_ITER, dst_h_desc);
  AddArgument(DNNL_ARG_DST_ITER_C, dst_c_desc);
  // Gradient tensors mirror the forward descriptors one-to-one.
  AddArgument(DNNL_ARG_DIFF_SRC_LAYER, src_desc);
  AddArgument(DNNL_ARG_DIFF_SRC_ITER, src_h_desc);
  AddArgument(DNNL_ARG_DIFF_SRC_ITER_C, src_c_desc);
  AddArgument(DNNL_ARG_DIFF_WEIGHTS_LAYER, prim_backward_desc_.diff_weights_layer_desc());
  AddArgument(DNNL_ARG_DIFF_WEIGHTS_ITER, prim_backward_desc_.diff_weights_iter_desc());
  AddArgument(DNNL_ARG_DIFF_BIAS, bias_desc);
  AddArgument(DNNL_ARG_DIFF_DST_LAYER, dst_desc);
  AddArgument(DNNL_ARG_DIFF_DST_ITER, dst_h_desc);
  AddArgument(DNNL_ARG_DIFF_DST_ITER_C, dst_c_desc);
}
| void LSTMGradCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| std::vector<size_t> src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2); | |||
| bidirectional_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "bidirectional"); | |||
| input_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "input_size"); | |||
| hidden_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "hidden_size"); | |||
| num_layers_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "num_layers"); | |||
| has_bias_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "has_bias"); | |||
| batch_size_ = SizeToInt(src_shape[1]); | |||
| seq_len_ = SizeToInt(src_shape[0]); | |||
| num_directions_ = 1; | |||
| if (bidirectional_) { | |||
| num_directions_ = 2; | |||
| } | |||
| const int64_t gate_size = 4 * hidden_size_; | |||
| if (num_layers_ <= 0) { | |||
| MS_LOG(EXCEPTION) << "Layers must be greater than zero!"; | |||
| } | |||
| if (num_layers_ > kMaxLSTMLayer) { | |||
| MS_LOG(EXCEPTION) << "Layers must be lower than 100!"; | |||
| } | |||
| for (int64_t i = 0; i < num_layers_; ++i) { | |||
| weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_); | |||
| weight_h_size_ += gate_size * hidden_size_; | |||
| } | |||
| weight_size_ = weight_size_ * num_directions_; | |||
| weight_h_size_ = weight_h_size_ * num_directions_; | |||
| if (num_directions_ * num_layers_ != SizeToLong(src_h_shape[0])) { | |||
| MS_LOG(EXCEPTION) << "Error iteration shape!"; | |||
| } | |||
| if (src_shape.size() != 3 || src_h_shape.size() != 3 || src_c_shape.size() != 3) { | |||
| MS_LOG(EXCEPTION) << "Lstm only support 3-D input!"; | |||
| } | |||
| } | |||
| void LSTMGradCPUKernel::SetArgumentHandleOp(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &outputs, | |||
| const dnnl::memory &weights_memory, const dnnl::memory &weights_h_memory, | |||
| const dnnl::memory &bias_memory, const dnnl::memory &diff_weights_memory, | |||
| const dnnl::memory &diff_weights_h_memory, | |||
| const dnnl::memory &diff_bias_memory) { | |||
| SetArgumentHandle(DNNL_ARG_SRC_LAYER, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_SRC_ITER, inputs[1]->addr); | |||
| SetArgumentHandle(DNNL_ARG_SRC_ITER_C, inputs[2]->addr); | |||
| SetArgumentHandle(DNNL_ARG_WEIGHTS_LAYER, weights_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_WEIGHTS_ITER, weights_h_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_BIAS, bias_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_DST_LAYER, inputs[4]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST_ITER, inputs[5]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST_ITER_C, inputs[6]->addr); | |||
| SetArgumentHandle(DNNL_ARG_WORKSPACE, inputs[10]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_SRC_LAYER, outputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_SRC_ITER, outputs[1]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_SRC_ITER_C, outputs[2]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS_LAYER, diff_weights_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS_ITER, diff_weights_h_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_BIAS, diff_bias_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_DST_LAYER, inputs[7]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_DST_ITER, inputs[8]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_DST_ITER_C, inputs[9]->addr); | |||
| } | |||
| void LSTMGradCPUKernel::ResetMemory(const dnnl::memory &mem, const string name) const { | |||
| if (memset_s(mem.get_data_handle(), mem.get_desc().get_size(), 0, mem.get_desc().get_size())) { | |||
| MS_LOG(EXCEPTION) << name << " memset error"; | |||
| } | |||
| } | |||
// Executes the LSTM backward primitive.
// inputs[3] holds one flat float32 buffer laid out as
// [layer weights | recurrent weights | bias] (sizes weight_size_,
// weight_h_size_ from CheckParam); outputs[3] receives the gradients in the
// same layout. User buffers use tag::ldgoi and are reordered into the layout
// oneDNN selected (tag::any) before execution, then back afterwards.
bool LSTMGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                               const std::vector<kernel::AddressPtr> &outputs) {
  using dt = dnnl::memory::data_type;
  using tag = dnnl::memory::format_tag;
  auto eng = MKLKernelEngine::Get().engine();
  // construct fw memory
  auto user_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng);
  auto user_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng);
  auto weights_memory = dnnl::memory(prim_backward_desc_.weights_layer_desc(), eng);
  auto weights_h_memory = dnnl::memory(prim_backward_desc_.weights_iter_desc(), eng);
  auto bias_memory = dnnl::memory(prim_backward_desc_.bias_desc(), eng);
  // Slice the flat weight buffer: weights at offset 0, recurrent weights at
  // weight_size_, bias (if present) after both.
  user_weights_memory.set_data_handle(inputs[3]->addr);
  user_weights_h_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_);
  Reorder(&user_weights_memory, &weights_memory);
  Reorder(&user_weights_h_memory, &weights_h_memory);
  if (has_bias_) {
    bias_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_ + weight_h_size_);
  } else {
    // No bias supplied: feed the primitive a zeroed bias buffer.
    if (memset_s(bias_memory.get_data_handle(), prim_backward_desc_.bias_desc().get_size(), 0,
                 prim_backward_desc_.bias_desc().get_size())) {
      MS_LOG(EXCEPTION) << "Bias memset error";
    }
  }
  // construct bw memory
  auto diff_weights_memory = dnnl::memory(prim_backward_desc_.diff_weights_layer_desc(), eng);
  auto diff_weights_h_memory = dnnl::memory(prim_backward_desc_.diff_weights_iter_desc(), eng);
  auto diff_bias_memory = dnnl::memory(prim_backward_desc_.diff_bias_desc(), eng);
  auto user_diff_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng);
  auto user_diff_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng);
  user_diff_weights_memory.set_data_handle(outputs[3]->addr);
  user_diff_weights_h_memory.set_data_handle(reinterpret_cast<float *>(outputs[3]->addr) + weight_size_);
  // Gradient buffers are zeroed up front so the result is well-defined.
  ResetMemory(user_diff_weights_memory, "user weights grad");
  ResetMemory(user_diff_weights_h_memory, "user weights iter grad");
  ResetMemory(diff_weights_memory, "weights grad");
  ResetMemory(diff_weights_h_memory, "weights iter grad");
  if (has_bias_) {
    // Bias gradient is written directly into the tail of outputs[3].
    diff_bias_memory.set_data_handle(reinterpret_cast<float *>(outputs[3]->addr) + weight_size_ + weight_h_size_);
  }
  // Zeroed unconditionally: when has_bias_ is false this clears the
  // internally allocated scratch diff-bias buffer.
  if (memset_s(diff_bias_memory.get_data_handle(), prim_backward_desc_.diff_bias_desc().get_size(), 0,
               prim_backward_desc_.diff_bias_desc().get_size())) {
    MS_LOG(EXCEPTION) << "Bias grad memset error";
  }
  SetArgumentHandleOp(inputs, outputs, weights_memory, weights_h_memory, bias_memory, diff_weights_memory,
                      diff_weights_h_memory, diff_bias_memory);
  ExecutePrimitive();
  // Reorder the opaque-layout weight gradients back into the user's
  // ldgoi-format output buffer.
  Reorder(&diff_weights_memory, &user_diff_weights_memory);
  Reorder(&diff_weights_h_memory, &user_diff_weights_h_memory);
  return true;
}
| } // namespace kernel | |||
| } // namespace mindspore | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/lstm_grad_cpu_kernel.h" | |||
| #include <cstring> | |||
| #include <string> | |||
| #include "utils/ms_utils.h" | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| const int kMaxLSTMLayer = 100; | |||
| const int kInputWorkSpaceIndex = 10; | |||
// Extends the base-class size initialization: the workspace input
// (index kInputWorkSpaceIndex) has no static shape on the node, so its byte
// size is patched in with reserve_size_, computed from the forward primitive
// descriptor in InitKernel().
void LSTMGradCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
  CPUKernel::InitInputOutputSize(kernel_node);
  input_size_list_[kInputWorkSpaceIndex] = reserve_size_;
}
// Builds the oneDNN LSTM backward primitive for this node.
// A forward primitive descriptor (prop_kind::forward_training) is created
// first because the backward descriptor requires it as a hint, and its
// workspace size must be known to size the workspace input.
void LSTMGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  using tag = dnnl::memory::format_tag;
  using dim = dnnl::memory::dims;
  CheckParam(kernel_node);
  auto eng = MKLKernelEngine::Get().engine();
  dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional;
  if (bidirectional_) {
    direction = dnnl::rnn_direction::bidirectional_concat;
  }
  // Dimension letters follow oneDNN RNN conventions:
  // t = time step, n = batch, c = channels, l = layers, d = directions,
  // g = gates (4 for LSTM: i, f, g~, o), o = output channels.
  dim src_dims = {seq_len_, batch_size_, input_size_};
  dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  weights_dims_ = {num_layers_, num_directions_, input_size_, 4, hidden_size_};
  weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_};
  bias_dims_ = {num_layers_, num_directions_, 4, hidden_size_};
  dim dst_dims = {seq_len_, batch_size_, static_cast<int64_t>(hidden_size_) * num_directions_};
  dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
  dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc);
  dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc);
  dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc);
  dnnl::memory::desc bias_desc = formatted_md(bias_dims_, tag::ldgo);
  dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc);
  dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc);
  dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc);
  // tag::any lets oneDNN choose the optimal weight layout; the user-format
  // weights are reordered into that layout at Launch() time.
  auto forward_desc = std::make_shared<dnnl::lstm_forward::desc>(
    dnnl::prop_kind::forward_training, direction, src_desc, src_h_desc, src_c_desc,
    formatted_md(weights_dims_, tag::any), formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc,
    dst_c_desc);
  auto prim_forward_desc = dnnl::lstm_forward::primitive_desc(*forward_desc, eng);
  // The backward descriptor takes the forward tensor descs followed by the
  // matching diff (gradient) descs in the same order.
  auto backward_desc = std::make_shared<dnnl::lstm_backward::desc>(
    dnnl::prop_kind::backward, direction, src_desc, src_h_desc, src_c_desc, formatted_md(weights_dims_, tag::any),
    formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc, dst_h_desc, dst_c_desc, src_desc, src_h_desc,
    src_c_desc, formatted_md(weights_dims_, tag::any), formatted_md(weights_h_dims_, tag::any), bias_desc, dst_desc,
    dst_h_desc, dst_c_desc);
  prim_backward_desc_ = dnnl::lstm_backward::primitive_desc(*backward_desc, eng, prim_forward_desc);
  primitive_ = std::make_shared<dnnl::lstm_backward>(prim_backward_desc_);
  // The forward pass's workspace must be replayed into backward; its size is
  // exported via InitInputOutputSize().
  reserve_size_ = static_cast<size_t>(prim_forward_desc.workspace_desc().get_size());
  AddArgument(DNNL_ARG_WORKSPACE, prim_forward_desc.workspace_desc());
  AddArgumentOp(src_desc, src_h_desc, src_c_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc);
}
// Registers the memory descriptor for every argument of the backward
// primitive. Weight descriptors are taken from prim_backward_desc_ because
// their layout was left to oneDNN (tag::any) in InitKernel().
void LSTMGradCPUKernel::AddArgumentOp(const dnnl::memory::desc &src_desc, const dnnl::memory::desc &src_h_desc,
                                      const dnnl::memory::desc &src_c_desc, const dnnl::memory::desc &bias_desc,
                                      const dnnl::memory::desc &dst_desc, const dnnl::memory::desc &dst_h_desc,
                                      const dnnl::memory::desc &dst_c_desc) {
  // Forward-pass tensors (re-used by the backward computation).
  AddArgument(DNNL_ARG_SRC_LAYER, src_desc);
  AddArgument(DNNL_ARG_SRC_ITER, src_h_desc);
  AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc);
  AddArgument(DNNL_ARG_WEIGHTS_LAYER, prim_backward_desc_.weights_layer_desc());
  AddArgument(DNNL_ARG_WEIGHTS_ITER, prim_backward_desc_.weights_iter_desc());
  AddArgument(DNNL_ARG_BIAS, bias_desc);
  AddArgument(DNNL_ARG_DST_LAYER, dst_desc);
  AddArgument(DNNL_ARG_DST_ITER, dst_h_desc);
  AddArgument(DNNL_ARG_DST_ITER_C, dst_c_desc);
  // Gradient tensors mirror the forward descriptors one-to-one.
  AddArgument(DNNL_ARG_DIFF_SRC_LAYER, src_desc);
  AddArgument(DNNL_ARG_DIFF_SRC_ITER, src_h_desc);
  AddArgument(DNNL_ARG_DIFF_SRC_ITER_C, src_c_desc);
  AddArgument(DNNL_ARG_DIFF_WEIGHTS_LAYER, prim_backward_desc_.diff_weights_layer_desc());
  AddArgument(DNNL_ARG_DIFF_WEIGHTS_ITER, prim_backward_desc_.diff_weights_iter_desc());
  AddArgument(DNNL_ARG_DIFF_BIAS, bias_desc);
  AddArgument(DNNL_ARG_DIFF_DST_LAYER, dst_desc);
  AddArgument(DNNL_ARG_DIFF_DST_ITER, dst_h_desc);
  AddArgument(DNNL_ARG_DIFF_DST_ITER_C, dst_c_desc);
}
| void LSTMGradCPUKernel::CheckParam(const CNodePtr &kernel_node) { | |||
| std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| std::vector<size_t> src_h_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1); | |||
| std::vector<size_t> src_c_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2); | |||
| bidirectional_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "bidirectional"); | |||
| input_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "input_size"); | |||
| hidden_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "hidden_size"); | |||
| num_layers_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "num_layers"); | |||
| has_bias_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "has_bias"); | |||
| batch_size_ = SizeToInt(src_shape[1]); | |||
| seq_len_ = SizeToInt(src_shape[0]); | |||
| num_directions_ = 1; | |||
| if (bidirectional_) { | |||
| num_directions_ = 2; | |||
| } | |||
| const int64_t gate_size = 4 * hidden_size_; | |||
| if (num_layers_ <= 0) { | |||
| MS_LOG(EXCEPTION) << "Layers must be greater than zero!"; | |||
| } | |||
| if (num_layers_ > kMaxLSTMLayer) { | |||
| MS_LOG(EXCEPTION) << "Layers must be lower than 100!"; | |||
| } | |||
| for (int64_t i = 0; i < num_layers_; ++i) { | |||
| weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_); | |||
| weight_h_size_ += gate_size * hidden_size_; | |||
| } | |||
| weight_size_ = weight_size_ * num_directions_; | |||
| weight_h_size_ = weight_h_size_ * num_directions_; | |||
| if (num_directions_ * num_layers_ != SizeToLong(src_h_shape[0])) { | |||
| MS_LOG(EXCEPTION) << "Error iteration shape!"; | |||
| } | |||
| if (src_shape.size() != 3 || src_h_shape.size() != 3 || src_c_shape.size() != 3) { | |||
| MS_LOG(EXCEPTION) << "Lstm only support 3-D input!"; | |||
| } | |||
| } | |||
| void LSTMGradCPUKernel::SetArgumentHandleOp(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &outputs, | |||
| const dnnl::memory &weights_memory, const dnnl::memory &weights_h_memory, | |||
| const dnnl::memory &bias_memory, const dnnl::memory &diff_weights_memory, | |||
| const dnnl::memory &diff_weights_h_memory, | |||
| const dnnl::memory &diff_bias_memory) { | |||
| SetArgumentHandle(DNNL_ARG_SRC_LAYER, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_SRC_ITER, inputs[1]->addr); | |||
| SetArgumentHandle(DNNL_ARG_SRC_ITER_C, inputs[2]->addr); | |||
| SetArgumentHandle(DNNL_ARG_WEIGHTS_LAYER, weights_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_WEIGHTS_ITER, weights_h_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_BIAS, bias_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_DST_LAYER, inputs[4]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST_ITER, inputs[5]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST_ITER_C, inputs[6]->addr); | |||
| SetArgumentHandle(DNNL_ARG_WORKSPACE, inputs[10]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_SRC_LAYER, outputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_SRC_ITER, outputs[1]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_SRC_ITER_C, outputs[2]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS_LAYER, diff_weights_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS_ITER, diff_weights_h_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_BIAS, diff_bias_memory.get_data_handle()); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_DST_LAYER, inputs[7]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_DST_ITER, inputs[8]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DIFF_DST_ITER_C, inputs[9]->addr); | |||
| } | |||
| void LSTMGradCPUKernel::ResetMemory(const dnnl::memory &mem, const string name) const { | |||
| if (memset_s(mem.get_data_handle(), mem.get_desc().get_size(), 0, mem.get_desc().get_size())) { | |||
| MS_LOG(EXCEPTION) << name << " memset error"; | |||
| } | |||
| } | |||
// Executes the LSTM backward primitive.
// inputs[3] holds one flat float32 buffer laid out as
// [layer weights | recurrent weights | bias] (sizes weight_size_,
// weight_h_size_ from CheckParam); outputs[3] receives the gradients in the
// same layout. User buffers use tag::ldgoi and are reordered into the layout
// oneDNN selected (tag::any) before execution, then back afterwards.
bool LSTMGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                               const std::vector<kernel::AddressPtr> &outputs) {
  using dt = dnnl::memory::data_type;
  using tag = dnnl::memory::format_tag;
  auto eng = MKLKernelEngine::Get().engine();
  // construct fw memory
  auto user_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng);
  auto user_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng);
  auto weights_memory = dnnl::memory(prim_backward_desc_.weights_layer_desc(), eng);
  auto weights_h_memory = dnnl::memory(prim_backward_desc_.weights_iter_desc(), eng);
  auto bias_memory = dnnl::memory(prim_backward_desc_.bias_desc(), eng);
  // Slice the flat weight buffer: weights at offset 0, recurrent weights at
  // weight_size_, bias (if present) after both.
  user_weights_memory.set_data_handle(inputs[3]->addr);
  user_weights_h_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_);
  Reorder(&user_weights_memory, &weights_memory);
  Reorder(&user_weights_h_memory, &weights_h_memory);
  if (has_bias_) {
    bias_memory.set_data_handle(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_ + weight_h_size_);
  } else {
    // No bias supplied: feed the primitive a zeroed bias buffer.
    if (memset_s(bias_memory.get_data_handle(), prim_backward_desc_.bias_desc().get_size(), 0,
                 prim_backward_desc_.bias_desc().get_size())) {
      MS_LOG(EXCEPTION) << "Bias memset error";
    }
  }
  // construct bw memory
  auto diff_weights_memory = dnnl::memory(prim_backward_desc_.diff_weights_layer_desc(), eng);
  auto diff_weights_h_memory = dnnl::memory(prim_backward_desc_.diff_weights_iter_desc(), eng);
  auto diff_bias_memory = dnnl::memory(prim_backward_desc_.diff_bias_desc(), eng);
  auto user_diff_weights_memory = dnnl::memory(dnnl::memory::desc{{weights_dims_}, dt::f32, tag::ldgoi}, eng);
  auto user_diff_weights_h_memory = dnnl::memory(dnnl::memory::desc{{weights_h_dims_}, dt::f32, tag::ldgoi}, eng);
  user_diff_weights_memory.set_data_handle(outputs[3]->addr);
  user_diff_weights_h_memory.set_data_handle(reinterpret_cast<float *>(outputs[3]->addr) + weight_size_);
  // Gradient buffers are zeroed up front so the result is well-defined.
  ResetMemory(user_diff_weights_memory, "user weights grad");
  ResetMemory(user_diff_weights_h_memory, "user weights iter grad");
  ResetMemory(diff_weights_memory, "weights grad");
  ResetMemory(diff_weights_h_memory, "weights iter grad");
  if (has_bias_) {
    // Bias gradient is written directly into the tail of outputs[3].
    diff_bias_memory.set_data_handle(reinterpret_cast<float *>(outputs[3]->addr) + weight_size_ + weight_h_size_);
  }
  // Zeroed unconditionally: when has_bias_ is false this clears the
  // internally allocated scratch diff-bias buffer.
  if (memset_s(diff_bias_memory.get_data_handle(), prim_backward_desc_.diff_bias_desc().get_size(), 0,
               prim_backward_desc_.diff_bias_desc().get_size())) {
    MS_LOG(EXCEPTION) << "Bias grad memset error";
  }
  SetArgumentHandleOp(inputs, outputs, weights_memory, weights_h_memory, bias_memory, diff_weights_memory,
                      diff_weights_h_memory, diff_bias_memory);
  ExecutePrimitive();
  // Reorder the opaque-layout weight gradients back into the user's
  // ldgoi-format output buffer.
  Reorder(&diff_weights_memory, &user_diff_weights_memory);
  Reorder(&diff_weights_h_memory, &user_diff_weights_h_memory);
  return true;
}
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,87 +1,87 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_ | |||
| #include <string> | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU kernel for the LSTMGrad op, built on the oneDNN (MKL-DNN)
// lstm_backward primitive. Computes gradients with respect to the input
// sequence, initial hidden/cell states, and the flat weight tensor.
class LSTMGradCPUKernel : public MKLCPUKernel {
 public:
  LSTMGradCPUKernel() = default;
  ~LSTMGradCPUKernel() override = default;
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 protected:
  // Patches the workspace input size, which is only known after InitKernel.
  void InitInputOutputSize(const CNodePtr &kernel_node) override;

 private:
  // Registers memory descriptors for all primitive arguments.
  void AddArgumentOp(const dnnl::memory::desc &src_desc, const dnnl::memory::desc &src_h_desc,
                     const dnnl::memory::desc &src_c_desc, const dnnl::memory::desc &bias_desc,
                     const dnnl::memory::desc &dst_desc, const dnnl::memory::desc &dst_h_desc,
                     const dnnl::memory::desc &dst_c_desc);
  // Binds runtime buffer addresses to the primitive arguments.
  void SetArgumentHandleOp(const std::vector<kernel::AddressPtr> &inputs,
                           const std::vector<kernel::AddressPtr> &outputs, const dnnl::memory &weights_memory,
                           const dnnl::memory &weights_h_memory, const dnnl::memory &bias_memory,
                           const dnnl::memory &diff_weights_memory, const dnnl::memory &diff_weights_h_memory,
                           const dnnl::memory &diff_bias_memory);
  // Zero-fills a dnnl memory buffer; throws on memset_s failure.
  void ResetMemory(const dnnl::memory &mem, const string name) const;
  // Validates node attributes/shapes and computes the flat-weight sizes.
  void CheckParam(const CNodePtr &kernel_node);
  int64_t weight_size_ = 0;    // bytes-in-floats of all layer weights
  int64_t weight_h_size_ = 0;  // bytes-in-floats of all recurrent weights
  int64_t input_size_;
  int64_t hidden_size_;
  int64_t num_layers_;
  int64_t batch_size_;
  int64_t seq_len_;
  int num_directions_;  // 1, or 2 when bidirectional_
  bool bidirectional_;
  bool has_bias_;
  size_t reserve_size_;  // forward workspace byte size, set in InitKernel
  dnnl::memory::dims weights_dims_;
  dnnl::memory::dims weights_h_dims_;
  dnnl::memory::dims bias_dims_;
  dnnl::lstm_backward::primitive_desc prim_backward_desc_;
};
// Register the float32 LSTMGrad CPU kernel: 11 float32 inputs
// (x, hx, cx, weights, y, hy, cy, dy, dhy, dcy, workspace) and
// 4 float32 outputs (dx, dhx, dcx, dweights).
MS_REG_CPU_KERNEL(LSTMGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  LSTMGradCPUKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_ | |||
| #include <string> | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU kernel for the LSTMGrad op, built on the oneDNN (MKL-DNN)
// lstm_backward primitive. Computes gradients with respect to the input
// sequence, initial hidden/cell states, and the flat weight tensor.
class LSTMGradCPUKernel : public MKLCPUKernel {
 public:
  LSTMGradCPUKernel() = default;
  ~LSTMGradCPUKernel() override = default;
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 protected:
  // Patches the workspace input size, which is only known after InitKernel.
  void InitInputOutputSize(const CNodePtr &kernel_node) override;

 private:
  // Registers memory descriptors for all primitive arguments.
  void AddArgumentOp(const dnnl::memory::desc &src_desc, const dnnl::memory::desc &src_h_desc,
                     const dnnl::memory::desc &src_c_desc, const dnnl::memory::desc &bias_desc,
                     const dnnl::memory::desc &dst_desc, const dnnl::memory::desc &dst_h_desc,
                     const dnnl::memory::desc &dst_c_desc);
  // Binds runtime buffer addresses to the primitive arguments.
  void SetArgumentHandleOp(const std::vector<kernel::AddressPtr> &inputs,
                           const std::vector<kernel::AddressPtr> &outputs, const dnnl::memory &weights_memory,
                           const dnnl::memory &weights_h_memory, const dnnl::memory &bias_memory,
                           const dnnl::memory &diff_weights_memory, const dnnl::memory &diff_weights_h_memory,
                           const dnnl::memory &diff_bias_memory);
  // Zero-fills a dnnl memory buffer; throws on memset_s failure.
  void ResetMemory(const dnnl::memory &mem, const string name) const;
  // Validates node attributes/shapes and computes the flat-weight sizes.
  void CheckParam(const CNodePtr &kernel_node);
  int64_t weight_size_ = 0;    // bytes-in-floats of all layer weights
  int64_t weight_h_size_ = 0;  // bytes-in-floats of all recurrent weights
  int64_t input_size_;
  int64_t hidden_size_;
  int64_t num_layers_;
  int64_t batch_size_;
  int64_t seq_len_;
  int num_directions_;  // 1, or 2 when bidirectional_
  bool bidirectional_;
  bool has_bias_;
  size_t reserve_size_;  // forward workspace byte size, set in InitKernel
  dnnl::memory::dims weights_dims_;
  dnnl::memory::dims weights_h_dims_;
  dnnl::memory::dims bias_dims_;
  dnnl::lstm_backward::primitive_desc prim_backward_desc_;
};
// Register the float32 LSTMGrad CPU kernel: 11 float32 inputs
// (x, hx, cx, weights, y, hy, cy, dy, dhy, dcy, workspace) and
// 4 float32 outputs (dx, dhx, dcx, dweights).
MS_REG_CPU_KERNEL(LSTMGrad,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  LSTMGradCPUKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_LSTM_GRAD_CPU_KERNEL_H_ | |||
| @@ -1,99 +1,99 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.h" | |||
| #include <numeric> | |||
| #include <functional> | |||
| #include <cmath> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "utils/ms_utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void SoftmaxCrossEntropyWithLogitsCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { | |||
| CPUKernel::InitInputOutputSize(kernel_node); | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| size_t type_size = sizeof(float); | |||
| std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| size_t tensor_size = std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>()); | |||
| workspace_size_list_.emplace_back(tensor_size); | |||
| } | |||
// Parses the logits shape and builds the MKL-DNN softmax primitive used by Launch.
void SoftmaxCrossEntropyWithLogitsCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  dnnl::memory::dims mem_dims;
  mem_dims.insert(mem_dims.end(), shape.begin(), shape.end());
  // Only rank-2 (batch, class) logits are supported.
  if (mem_dims.size() != 2) {
    MS_LOG(EXCEPTION) << "SoftmaxCrossEntropyWithLogits kernel dims invalid " << mem_dims.size();
  }
  batch_size_ = shape[0];
  class_num_ = shape[1];
  if (batch_size_ == 0 || class_num_ == 0) {
    MS_LOG(EXCEPTION) << "Invalid batch size or class num input!";
  }
  dnnl::memory::desc mem_desc(mem_dims, dnnl::memory::data_type::f32, dnnl::memory::format_tag::nc);
  // Softmax over axis 1 (the class dimension).
  dnnl::softmax_forward::desc desc = dnnl::softmax_forward::desc(dnnl::prop_kind::forward_training, mem_desc, 1);
  auto prim_desc = dnnl::softmax_forward::primitive_desc(desc, MKLKernelEngine::Get().engine());
  primitive_ = std::make_shared<dnnl::softmax_forward>(prim_desc);
  // Source and destination share the same (batch, class) f32 layout.
  AddArgument(DNNL_ARG_SRC, mem_desc);
  AddArgument(DNNL_ARG_DST, mem_desc);
}
| void SoftmaxCrossEntropyWithLogitsCPUKernel::ForwardPostExecute(const float *logits, const float *labels, | |||
| float *output1, float *output2) const { | |||
| float epsilon = 1e-6; | |||
| for (size_t i = 0; i < batch_size_; ++i) { | |||
| output1[i] = 0; | |||
| float loss = 0.0; | |||
| for (size_t j = 0; j < class_num_; ++j) { | |||
| float logit = logf(logits[i * class_num_ + j] <= 0.0 ? epsilon : logits[i * class_num_ + j]); | |||
| output2[i * class_num_ + j] = logits[i * class_num_ + j] - labels[i * class_num_ + j]; | |||
| loss += labels[i * class_num_ + j] * logit; | |||
| } | |||
| output1[i] = -loss; | |||
| } | |||
| } | |||
| bool SoftmaxCrossEntropyWithLogitsCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.empty() || workspace.empty() || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "Error input output size!"; | |||
| } | |||
| size_t batch_float_size = batch_size_ * sizeof(float); | |||
| size_t batch_class_float_size = class_num_ * batch_float_size; | |||
| if (inputs[0]->size != workspace[0]->size || inputs[0]->size != batch_class_float_size || | |||
| inputs[1]->size != batch_class_float_size) { | |||
| MS_LOG(EXCEPTION) << "Error input data size!"; | |||
| } | |||
| if (outputs[1]->size != batch_class_float_size || outputs[0]->size != batch_float_size) { | |||
| MS_LOG(EXCEPTION) << "Error output data size!"; | |||
| } | |||
| SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST, workspace[0]->addr); | |||
| ExecutePrimitive(); | |||
| auto labels = reinterpret_cast<float *>(inputs[1]->addr); | |||
| auto logits = reinterpret_cast<float *>(workspace[0]->addr); | |||
| auto output1 = reinterpret_cast<float *>(outputs[0]->addr); | |||
| auto output2 = reinterpret_cast<float *>(outputs[1]->addr); | |||
| ForwardPostExecute(logits, labels, output1, output2); | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/mkldnn/softmax_cross_entropy_with_logits_cpu_kernel.h" | |||
| #include <numeric> | |||
| #include <functional> | |||
| #include <cmath> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "utils/ms_utils.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void SoftmaxCrossEntropyWithLogitsCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) { | |||
| CPUKernel::InitInputOutputSize(kernel_node); | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| size_t type_size = sizeof(float); | |||
| std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| size_t tensor_size = std::accumulate(shape.begin(), shape.end(), type_size, std::multiplies<size_t>()); | |||
| workspace_size_list_.emplace_back(tensor_size); | |||
| } | |||
// Parses the logits shape and builds the MKL-DNN softmax primitive used by Launch.
void SoftmaxCrossEntropyWithLogitsCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  dnnl::memory::dims mem_dims;
  mem_dims.insert(mem_dims.end(), shape.begin(), shape.end());
  // Only rank-2 (batch, class) logits are supported.
  if (mem_dims.size() != 2) {
    MS_LOG(EXCEPTION) << "SoftmaxCrossEntropyWithLogits kernel dims invalid " << mem_dims.size();
  }
  batch_size_ = shape[0];
  class_num_ = shape[1];
  if (batch_size_ == 0 || class_num_ == 0) {
    MS_LOG(EXCEPTION) << "Invalid batch size or class num input!";
  }
  dnnl::memory::desc mem_desc(mem_dims, dnnl::memory::data_type::f32, dnnl::memory::format_tag::nc);
  // Softmax over axis 1 (the class dimension).
  dnnl::softmax_forward::desc desc = dnnl::softmax_forward::desc(dnnl::prop_kind::forward_training, mem_desc, 1);
  auto prim_desc = dnnl::softmax_forward::primitive_desc(desc, MKLKernelEngine::Get().engine());
  primitive_ = std::make_shared<dnnl::softmax_forward>(prim_desc);
  // Source and destination share the same (batch, class) f32 layout.
  AddArgument(DNNL_ARG_SRC, mem_desc);
  AddArgument(DNNL_ARG_DST, mem_desc);
}
| void SoftmaxCrossEntropyWithLogitsCPUKernel::ForwardPostExecute(const float *logits, const float *labels, | |||
| float *output1, float *output2) const { | |||
| float epsilon = 1e-6; | |||
| for (size_t i = 0; i < batch_size_; ++i) { | |||
| output1[i] = 0; | |||
| float loss = 0.0; | |||
| for (size_t j = 0; j < class_num_; ++j) { | |||
| float logit = logf(logits[i * class_num_ + j] <= 0.0 ? epsilon : logits[i * class_num_ + j]); | |||
| output2[i * class_num_ + j] = logits[i * class_num_ + j] - labels[i * class_num_ + j]; | |||
| loss += labels[i * class_num_ + j] * logit; | |||
| } | |||
| output1[i] = -loss; | |||
| } | |||
| } | |||
| bool SoftmaxCrossEntropyWithLogitsCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, | |||
| const std::vector<kernel::AddressPtr> &workspace, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (inputs.empty() || workspace.empty() || outputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "Error input output size!"; | |||
| } | |||
| size_t batch_float_size = batch_size_ * sizeof(float); | |||
| size_t batch_class_float_size = class_num_ * batch_float_size; | |||
| if (inputs[0]->size != workspace[0]->size || inputs[0]->size != batch_class_float_size || | |||
| inputs[1]->size != batch_class_float_size) { | |||
| MS_LOG(EXCEPTION) << "Error input data size!"; | |||
| } | |||
| if (outputs[1]->size != batch_class_float_size || outputs[0]->size != batch_float_size) { | |||
| MS_LOG(EXCEPTION) << "Error output data size!"; | |||
| } | |||
| SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr); | |||
| SetArgumentHandle(DNNL_ARG_DST, workspace[0]->addr); | |||
| ExecutePrimitive(); | |||
| auto labels = reinterpret_cast<float *>(inputs[1]->addr); | |||
| auto logits = reinterpret_cast<float *>(workspace[0]->addr); | |||
| auto output1 = reinterpret_cast<float *>(outputs[0]->addr); | |||
| auto output2 = reinterpret_cast<float *>(outputs[1]->addr); | |||
| ForwardPostExecute(logits, labels, output1, output2); | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,53 +1,53 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU kernel for SoftmaxCrossEntropyWithLogits: runs the MKL-DNN softmax
// primitive, then a hand-written pass producing the per-sample loss
// (output 0) and the gradient softmax - labels (output 1).
class SoftmaxCrossEntropyWithLogitsCPUKernel : public MKLCPUKernel {
 public:
  SoftmaxCrossEntropyWithLogitsCPUKernel() = default;
  ~SoftmaxCrossEntropyWithLogitsCPUKernel() override = default;
  // Parses the (batch, class) logits shape and builds the softmax primitive.
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 protected:
  // Adds a workspace buffer sized to hold the softmax of input 0.
  void InitInputOutputSize(const CNodePtr &kernel_node) override;
 private:
  // Computes loss and gradient from the softmax probabilities.
  void ForwardPostExecute(const float *logits, const float *labels, float *output1, float *output2) const;
  size_t class_num_{0};
  size_t batch_size_{0};
};
// Registers the kernel: float32 logits and labels in, float32 loss and gradient out.
MS_REG_CPU_KERNEL(SoftmaxCrossEntropyWithLogits,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  SoftmaxCrossEntropyWithLogitsCPUKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU kernel for SoftmaxCrossEntropyWithLogits: runs the MKL-DNN softmax
// primitive, then a hand-written pass producing the per-sample loss
// (output 0) and the gradient softmax - labels (output 1).
class SoftmaxCrossEntropyWithLogitsCPUKernel : public MKLCPUKernel {
 public:
  SoftmaxCrossEntropyWithLogitsCPUKernel() = default;
  ~SoftmaxCrossEntropyWithLogitsCPUKernel() override = default;
  // Parses the (batch, class) logits shape and builds the softmax primitive.
  void InitKernel(const CNodePtr &kernel_node) override;
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 protected:
  // Adds a workspace buffer sized to hold the softmax of input 0.
  void InitInputOutputSize(const CNodePtr &kernel_node) override;
 private:
  // Computes loss and gradient from the softmax probabilities.
  void ForwardPostExecute(const float *logits, const float *labels, float *output1, float *output2) const;
  size_t class_num_{0};
  size_t batch_size_{0};
};
// Registers the kernel: float32 logits and labels in, float32 loss and gradient out.
MS_REG_CPU_KERNEL(SoftmaxCrossEntropyWithLogits,
                  KernelAttr()
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddInputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32)
                    .AddOutputAttr(kNumberTypeFloat32),
                  SoftmaxCrossEntropyWithLogitsCPUKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_CPU_KERNEL_H_ | |||
| @@ -1,59 +1,59 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| #include "ps/util.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace ps { | |||
| using mindspore::ps::Util; | |||
| class PServerKernel { | |||
| public: | |||
| PServerKernel(size_t rank_id, size_t pserver_num, size_t worker_num) | |||
| : rank_id_(rank_id), pserver_num_(pserver_num), worker_num_(worker_num) {} | |||
| ~PServerKernel() = default; | |||
| PServerKernel(const PServerKernel &) = delete; | |||
| PServerKernel &operator=(const PServerKernel &) = delete; | |||
| virtual void InitKernel(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) {} | |||
| virtual void InitKernel(const CNodePtr &cnode, | |||
| const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) {} | |||
| virtual void ReInit(const std::vector<std::vector<size_t>> &) {} | |||
| virtual bool Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) = 0; | |||
| virtual void UpdateEmbeddings(float *embedding_table, const size_t *lookup_ids, const float *update_vals, | |||
| size_t ids_size) {} | |||
| virtual const std::vector<size_t> &input_sizes() const = 0; | |||
| virtual const std::vector<size_t> &output_sizes() const = 0; | |||
| virtual const std::vector<size_t> &workspace_sizes() const = 0; | |||
| protected: | |||
| virtual void ReInit(const std::vector<AddressPtr> &) {} | |||
| void Shard(std::vector<size_t> *shape, int axis); | |||
| size_t rank_id_; | |||
| size_t pserver_num_; | |||
| size_t worker_num_; | |||
| }; | |||
| } // namespace ps | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| #include "ps/util.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| namespace ps { | |||
| using mindspore::ps::Util; | |||
| class PServerKernel { | |||
| public: | |||
| PServerKernel(size_t rank_id, size_t pserver_num, size_t worker_num) | |||
| : rank_id_(rank_id), pserver_num_(pserver_num), worker_num_(worker_num) {} | |||
| ~PServerKernel() = default; | |||
| PServerKernel(const PServerKernel &) = delete; | |||
| PServerKernel &operator=(const PServerKernel &) = delete; | |||
| virtual void InitKernel(const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) {} | |||
| virtual void InitKernel(const CNodePtr &cnode, | |||
| const std::shared_ptr<std::vector<std::shared_ptr<std::vector<size_t>>>> &) {} | |||
| virtual void ReInit(const std::vector<std::vector<size_t>> &) {} | |||
| virtual bool Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) = 0; | |||
| virtual void UpdateEmbeddings(float *embedding_table, const size_t *lookup_ids, const float *update_vals, | |||
| size_t ids_size) {} | |||
| virtual const std::vector<size_t> &input_sizes() const = 0; | |||
| virtual const std::vector<size_t> &output_sizes() const = 0; | |||
| virtual const std::vector<size_t> &workspace_sizes() const = 0; | |||
| protected: | |||
| virtual void ReInit(const std::vector<AddressPtr> &) {} | |||
| void Shard(std::vector<size_t> *shape, int axis); | |||
| size_t rank_id_; | |||
| size_t pserver_num_; | |||
| size_t worker_num_; | |||
| }; | |||
| } // namespace ps | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_PS_PSERVER_KERNEL_H_ | |||
| @@ -1,138 +1,138 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/reduce_cpu_kernel.h" | |||
| #include <string> | |||
| #include <vector> | |||
| #include <algorithm> | |||
| #include <utility> | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T> | |||
| void ReduceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| auto axis_addr = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(AXIS); | |||
| if (axis_addr->isa<ValueTuple>() || axis_addr->isa<ValueList>()) { | |||
| axis_ = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, AXIS); | |||
| } else if (axis_addr->isa<Int64Imm>()) { | |||
| axis_.emplace_back(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS)); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Attribute is invalid"; | |||
| } | |||
| int dimension = input_shape_.size(); | |||
| std::transform(axis_.begin(), axis_.end(), axis_.begin(), | |||
| [dimension](const auto &a) { return a < 0 ? dimension + a : a; }); | |||
| sort(axis_.begin(), axis_.end()); | |||
| // Delete the duplicate axis. | |||
| auto last = std::unique(axis_.begin(), axis_.end()); | |||
| axis_.erase(last, axis_.end()); | |||
| auto kernel_name = AnfAlgo::GetCNodeName(kernel_node); | |||
| if constexpr (std::is_same<T, bool>::value) { | |||
| if (kernel_name == "ReduceAll") { | |||
| reduce_type_ = kReduceAll; | |||
| reduce_func_ = [](const T *input, size_t pos, T *out) { *out &= input[pos]; }; | |||
| } else if (kernel_name == "ReduceAny") { | |||
| reduce_type_ = kReduceAny; | |||
| reduce_func_ = [](const T *input, size_t pos, T *out) { *out |= input[pos]; }; | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << fullname_ << " for bool."; | |||
| } | |||
| } else { | |||
| if (kernel_name == "ReduceMax") { | |||
| reduce_type_ = kReduceMax; | |||
| reduce_func_ = [](const T *input, size_t pos, T *out) { *out = std::max(input[pos], *out); }; | |||
| } else if (kernel_name == "ReduceMin") { | |||
| reduce_type_ = kReduceMin; | |||
| reduce_func_ = [](const T *input, size_t pos, T *out) { *out = std::min(input[pos], *out); }; | |||
| } else if (kernel_name == "ReduceSum") { | |||
| reduce_type_ = kReduceSum; | |||
| reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; }; | |||
| } else if (kernel_name == "ReduceMean") { | |||
| reduce_type_ = kReduceMean; | |||
| reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; }; | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << kernel_name; | |||
| } | |||
| } | |||
| } | |||
// Runs the reduction.  Two paths:
//  - Full reduction (no axis, or rank <= 1): fold every element into one output.
//  - Axis reduction: conceptually transpose so the reduced axes are innermost,
//    then each output element is a fold over a contiguous run of `stride`
//    transposed positions.  Runs per-output in parallel.
template <typename T>
bool ReduceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                                const std::vector<kernel::AddressPtr> &outputs) {
  size_t input_size = inputs[0]->size / sizeof(T);
  auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
  if (axis_.empty() || input_shape_.empty() || input_shape_.size() == 1) {
    // Get one ret
    *output_addr = input_addr[0];
    for (size_t i = 1; i < input_size; ++i) {
      reduce_func_(input_addr, i, output_addr);
    }
    if (reduce_type_ == kReduceMean) {
      // Mean accumulated as a sum; divide by the element count here.
      *output_addr /= input_size;
    }
  } else {
    // Calculate transpose axes and stride
    // Kept axes come first, reduced axes last; `stride` is the product of the
    // reduced dimensions, i.e. the number of inputs folded per output element.
    int dimension = input_shape_.size();
    size_t stride = 1;
    std::vector<size_t> axes(input_shape_.size());
    size_t j = 0;
    size_t k = 0;
    for (int i = 0; i < dimension; ++i) {
      if (j == axis_.size() || i != axis_[j]) {
        axes[k] = i;
        ++k;
      } else {
        stride *= input_shape_[i];
        ++j;
      }
    }
    for (auto &it : axis_) {
      axes[k] = it;
      ++k;
    }
    // Calculate transpose shape
    std::vector<size_t> transpose_shape(input_shape_.size());
    for (int i = 0; i < dimension; ++i) {
      transpose_shape[i] = input_shape_[axes[i]];
    }
    size_t output_size = outputs[0]->size / sizeof(T);
    TransposeIterator base_iter(std::move(transpose_shape), std::move(axes), input_shape_);
    // Each task copies the iterator and seeks to its starting position, so
    // chunks can run independently.
    auto task = [this, &base_iter, input_addr, output_addr, stride](size_t start, size_t end) {
      auto iter = base_iter;
      iter.SetPos(start * stride);
      for (size_t i = start; i < end; ++i) {
        // Seed with the first element of the run, then fold the rest.
        output_addr[i] = input_addr[iter.GetPos()];
        iter.GenNextPos();
        for (size_t j = 1; j < stride; ++j) {
          reduce_func_(input_addr, iter.GetPos(), &output_addr[i]);
          iter.GenNextPos();
        }
        if (reduce_type_ == kReduceMean) {
          output_addr[i] /= stride;
        }
      }
    };
    CPUKernelUtils::ParallelFor(task, output_size);
  }
  return true;
}
| } // namespace kernel | |||
| } // namespace mindspore | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/reduce_cpu_kernel.h" | |||
| #include <string> | |||
| #include <vector> | |||
| #include <algorithm> | |||
| #include <utility> | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T> | |||
| void ReduceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| auto axis_addr = AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr(AXIS); | |||
| if (axis_addr->isa<ValueTuple>() || axis_addr->isa<ValueList>()) { | |||
| axis_ = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, AXIS); | |||
| } else if (axis_addr->isa<Int64Imm>()) { | |||
| axis_.emplace_back(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS)); | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Attribute is invalid"; | |||
| } | |||
| int dimension = input_shape_.size(); | |||
| std::transform(axis_.begin(), axis_.end(), axis_.begin(), | |||
| [dimension](const auto &a) { return a < 0 ? dimension + a : a; }); | |||
| sort(axis_.begin(), axis_.end()); | |||
| // Delete the duplicate axis. | |||
| auto last = std::unique(axis_.begin(), axis_.end()); | |||
| axis_.erase(last, axis_.end()); | |||
| auto kernel_name = AnfAlgo::GetCNodeName(kernel_node); | |||
| if constexpr (std::is_same<T, bool>::value) { | |||
| if (kernel_name == "ReduceAll") { | |||
| reduce_type_ = kReduceAll; | |||
| reduce_func_ = [](const T *input, size_t pos, T *out) { *out &= input[pos]; }; | |||
| } else if (kernel_name == "ReduceAny") { | |||
| reduce_type_ = kReduceAny; | |||
| reduce_func_ = [](const T *input, size_t pos, T *out) { *out |= input[pos]; }; | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << fullname_ << " for bool."; | |||
| } | |||
| } else { | |||
| if (kernel_name == "ReduceMax") { | |||
| reduce_type_ = kReduceMax; | |||
| reduce_func_ = [](const T *input, size_t pos, T *out) { *out = std::max(input[pos], *out); }; | |||
| } else if (kernel_name == "ReduceMin") { | |||
| reduce_type_ = kReduceMin; | |||
| reduce_func_ = [](const T *input, size_t pos, T *out) { *out = std::min(input[pos], *out); }; | |||
| } else if (kernel_name == "ReduceSum") { | |||
| reduce_type_ = kReduceSum; | |||
| reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; }; | |||
| } else if (kernel_name == "ReduceMean") { | |||
| reduce_type_ = kReduceMean; | |||
| reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; }; | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Unsupported reduce operation: " << kernel_name; | |||
| } | |||
| } | |||
| } | |||
// Runs the reduction.  Two paths:
//  - Full reduction (no axis, or rank <= 1): fold every element into one output.
//  - Axis reduction: conceptually transpose so the reduced axes are innermost,
//    then each output element is a fold over a contiguous run of `stride`
//    transposed positions.  Runs per-output in parallel.
template <typename T>
bool ReduceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                                const std::vector<kernel::AddressPtr> &outputs) {
  size_t input_size = inputs[0]->size / sizeof(T);
  auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
  if (axis_.empty() || input_shape_.empty() || input_shape_.size() == 1) {
    // Get one ret
    *output_addr = input_addr[0];
    for (size_t i = 1; i < input_size; ++i) {
      reduce_func_(input_addr, i, output_addr);
    }
    if (reduce_type_ == kReduceMean) {
      // Mean accumulated as a sum; divide by the element count here.
      *output_addr /= input_size;
    }
  } else {
    // Calculate transpose axes and stride
    // Kept axes come first, reduced axes last; `stride` is the product of the
    // reduced dimensions, i.e. the number of inputs folded per output element.
    int dimension = input_shape_.size();
    size_t stride = 1;
    std::vector<size_t> axes(input_shape_.size());
    size_t j = 0;
    size_t k = 0;
    for (int i = 0; i < dimension; ++i) {
      if (j == axis_.size() || i != axis_[j]) {
        axes[k] = i;
        ++k;
      } else {
        stride *= input_shape_[i];
        ++j;
      }
    }
    for (auto &it : axis_) {
      axes[k] = it;
      ++k;
    }
    // Calculate transpose shape
    std::vector<size_t> transpose_shape(input_shape_.size());
    for (int i = 0; i < dimension; ++i) {
      transpose_shape[i] = input_shape_[axes[i]];
    }
    size_t output_size = outputs[0]->size / sizeof(T);
    TransposeIterator base_iter(std::move(transpose_shape), std::move(axes), input_shape_);
    // Each task copies the iterator and seeks to its starting position, so
    // chunks can run independently.
    auto task = [this, &base_iter, input_addr, output_addr, stride](size_t start, size_t end) {
      auto iter = base_iter;
      iter.SetPos(start * stride);
      for (size_t i = start; i < end; ++i) {
        // Seed with the first element of the run, then fold the rest.
        output_addr[i] = input_addr[iter.GetPos()];
        iter.GenNextPos();
        for (size_t j = 1; j < stride; ++j) {
          reduce_func_(input_addr, iter.GetPos(), &output_addr[i]);
          iter.GenNextPos();
        }
        if (reduce_type_ == kReduceMean) {
          output_addr[i] /= stride;
        }
      }
    };
    CPUKernelUtils::ParallelFor(task, output_size);
  }
  return true;
}
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,69 +1,69 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include <string> | |||
| #include <functional> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU kernel implementing the Reduce* family of ops
// (ReduceAll/Any/Max/Min/Sum/Mean); the concrete reduction is realized by
// reduce_func_, which accumulates one input element into the output.
template <typename T>
class ReduceCPUKernel : public CPUKernel {
 public:
  ReduceCPUKernel() = default;
  ~ReduceCPUKernel() override = default;
  // Parses the node's attributes (shape, axes, reduce kind) into the members below.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Runs the reduction over inputs[0] into outputs[0].
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 private:
  enum ReduceType { kReduceAll, kReduceAny, kReduceMax, kReduceMin, kReduceSum, kReduceMean };
  std::vector<size_t> input_shape_;  // shape of input 0
  std::vector<int64_t> axis_;        // axes to reduce; empty means reduce to a scalar
  ReduceType reduce_type_{kReduceAll};
  // Accumulator: folds input[pos] (args: base pointer, flat index) into *out.
  std::function<void(const T *, size_t, T *)> reduce_func_;
};
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, float);
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, double);
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, int32_t);
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, float);
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, double);
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, int32_t);
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, float);
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, double);
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, int32_t);
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, float);
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, double);
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, int32_t);
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(ReduceAll, KernelAttr(), ReduceCPUKernel, bool);
MS_REG_CPU_KERNEL_T(ReduceAny, KernelAttr(), ReduceCPUKernel, bool);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include <string> | |||
| #include <functional> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU kernel implementing the Reduce* family of ops
// (ReduceAll/Any/Max/Min/Sum/Mean); the concrete reduction is realized by
// reduce_func_, which accumulates one input element into the output.
template <typename T>
class ReduceCPUKernel : public CPUKernel {
 public:
  ReduceCPUKernel() = default;
  ~ReduceCPUKernel() override = default;
  // Parses the node's attributes (shape, axes, reduce kind) into the members below.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Runs the reduction over inputs[0] into outputs[0].
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 private:
  enum ReduceType { kReduceAll, kReduceAny, kReduceMax, kReduceMin, kReduceSum, kReduceMean };
  std::vector<size_t> input_shape_;  // shape of input 0
  std::vector<int64_t> axis_;        // axes to reduce; empty means reduce to a scalar
  ReduceType reduce_type_{kReduceAll};
  // Accumulator: folds input[pos] (args: base pointer, flat index) into *out.
  std::function<void(const T *, size_t, T *)> reduce_func_;
};
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, float);
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, double);
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, int32_t);
MS_REG_CPU_KERNEL_T(ReduceMean, KernelAttr(), ReduceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, float);
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, double);
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, int32_t);
MS_REG_CPU_KERNEL_T(ReduceMax, KernelAttr(), ReduceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, float);
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, double);
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, int32_t);
MS_REG_CPU_KERNEL_T(ReduceSum, KernelAttr(), ReduceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, float);
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, double);
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, int32_t);
MS_REG_CPU_KERNEL_T(ReduceMin, KernelAttr(), ReduceCPUKernel, int64_t);
MS_REG_CPU_KERNEL_T(ReduceAll, KernelAttr(), ReduceCPUKernel, bool);
MS_REG_CPU_KERNEL_T(ReduceAny, KernelAttr(), ReduceCPUKernel, bool);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_REDUCE_CPU_KERNEL_H_ | |||
| @@ -1,91 +1,91 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/spacetodepth_cpu_kernel.h" | |||
| #include <vector> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// Caches the node's input/output device shapes and its "block_size" attribute.
// CheckParam runs first so the queries below only see a node with exactly one
// input and one output tensor.
template <typename T>
void SpaceToDepthCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  CheckParam(kernel_node);
  input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
  block_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "block_size");
}
// Scatters each input element to its SpaceToDepth output position: spatial
// block_size x block_size tiles are packed into the channel dimension.
// The indexing below reads input_pos_array[0..3], so exactly 4 input
// dimensions (batch, channel, height, width) are assumed.
template <typename T>
bool SpaceToDepthCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                      const std::vector<kernel::AddressPtr> & /*workspace*/,
                                      const std::vector<kernel::AddressPtr> &outputs) {
  auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
  size_t size = IntToSize(inputs[0]->size / sizeof(T));
  // Members are copied to locals, which the worker lambda captures by reference.
  std::vector<size_t> input_shape = input_shape_;
  std::vector<size_t> output_shape = output_shape_;
  size_t block_size = block_size_;
  size_t input_dimension = input_shape.size();
  // Row-major strides for dims 0..2 (stride of the last dim is 1).
  // NOTE(review): input_dimension == 0 would underflow the loop bound below;
  // presumably ruled out by shape inference — confirm.
  size_t input_strides[3] = {1, 1, 1};
  for (size_t i = input_dimension - 1; i >= 1; --i) {
    for (size_t j = 0; j < i; ++j) {
      input_strides[j] *= input_shape[i];
    }
  }
  auto task = [&, input_addr, output_addr](size_t start, size_t end) {
    std::vector<size_t> input_pos_array(input_dimension, 0);
    for (size_t i = start; i < end; ++i) {
      // Decompose the flat input index i into per-dimension coordinates.
      size_t tmp_pos = i;
      for (size_t j = 0; j < input_dimension - 1; ++j) {
        input_pos_array[j] = tmp_pos / input_strides[j];
        tmp_pos %= input_strides[j];
      }
      input_pos_array.back() = tmp_pos;
      // Output coordinate: batch unchanged; channel becomes
      // c + (block_size * (h % block_size) + w % block_size) * C;
      // spatial coordinates shrink to h / block_size and w / block_size.
      size_t output_pos = input_pos_array[0];
      output_pos =
        (output_pos * output_shape[1]) +
        (input_pos_array[1] +
         (block_size * (input_pos_array[2] % block_size) + input_pos_array[3] % block_size) * input_shape[1]);
      output_pos = (output_pos * output_shape[2]) + (input_pos_array[2] / block_size);
      output_pos = (output_pos * output_shape[3]) + (input_pos_array[3] / block_size);
      output_addr[output_pos] = input_addr[i];
    }
  };
  CPUKernelUtils::ParallelFor(task, size);
  return true;
}
| template <typename T> | |||
| void SpaceToDepthCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but DepthToSpaceCPUKerrnel needs 1 input."; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but DepthToSpaceCPUKernel needs 1 output."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/spacetodepth_cpu_kernel.h" | |||
| #include <vector> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// Caches the node's input/output device shapes and its "block_size" attribute.
// CheckParam runs first so the queries below only see a node with exactly one
// input and one output tensor.
template <typename T>
void SpaceToDepthCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  CheckParam(kernel_node);
  input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
  block_size_ = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "block_size");
}
// Scatters each input element to its SpaceToDepth output position: spatial
// block_size x block_size tiles are packed into the channel dimension.
// The indexing below reads input_pos_array[0..3], so exactly 4 input
// dimensions (batch, channel, height, width) are assumed.
template <typename T>
bool SpaceToDepthCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                      const std::vector<kernel::AddressPtr> & /*workspace*/,
                                      const std::vector<kernel::AddressPtr> &outputs) {
  auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
  size_t size = IntToSize(inputs[0]->size / sizeof(T));
  // Members are copied to locals, which the worker lambda captures by reference.
  std::vector<size_t> input_shape = input_shape_;
  std::vector<size_t> output_shape = output_shape_;
  size_t block_size = block_size_;
  size_t input_dimension = input_shape.size();
  // Row-major strides for dims 0..2 (stride of the last dim is 1).
  // NOTE(review): input_dimension == 0 would underflow the loop bound below;
  // presumably ruled out by shape inference — confirm.
  size_t input_strides[3] = {1, 1, 1};
  for (size_t i = input_dimension - 1; i >= 1; --i) {
    for (size_t j = 0; j < i; ++j) {
      input_strides[j] *= input_shape[i];
    }
  }
  auto task = [&, input_addr, output_addr](size_t start, size_t end) {
    std::vector<size_t> input_pos_array(input_dimension, 0);
    for (size_t i = start; i < end; ++i) {
      // Decompose the flat input index i into per-dimension coordinates.
      size_t tmp_pos = i;
      for (size_t j = 0; j < input_dimension - 1; ++j) {
        input_pos_array[j] = tmp_pos / input_strides[j];
        tmp_pos %= input_strides[j];
      }
      input_pos_array.back() = tmp_pos;
      // Output coordinate: batch unchanged; channel becomes
      // c + (block_size * (h % block_size) + w % block_size) * C;
      // spatial coordinates shrink to h / block_size and w / block_size.
      size_t output_pos = input_pos_array[0];
      output_pos =
        (output_pos * output_shape[1]) +
        (input_pos_array[1] +
         (block_size * (input_pos_array[2] % block_size) + input_pos_array[3] % block_size) * input_shape[1]);
      output_pos = (output_pos * output_shape[2]) + (input_pos_array[2] / block_size);
      output_pos = (output_pos * output_shape[3]) + (input_pos_array[3] / block_size);
      output_addr[output_pos] = input_addr[i];
    }
  };
  CPUKernelUtils::ParallelFor(task, size);
  return true;
}
| template <typename T> | |||
| void SpaceToDepthCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but DepthToSpaceCPUKerrnel needs 1 input."; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but DepthToSpaceCPUKernel needs 1 output."; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,84 +1,84 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_ | |||
| #include <string> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T> | |||
| class SpaceToDepthCPUKernel : public CPUKernel { | |||
| public: | |||
| SpaceToDepthCPUKernel() = default; | |||
| ~SpaceToDepthCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| std::vector<size_t> input_shape_; | |||
| std::vector<size_t> output_shape_; | |||
| size_t block_size_; | |||
| }; | |||
| MS_REG_CPU_KERNEL_T( | |||
| SpaceToDepth, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| SpaceToDepthCPUKernel, float); | |||
| MS_REG_CPU_KERNEL_T( | |||
| SpaceToDepth, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), | |||
| SpaceToDepthCPUKernel, float16); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8), | |||
| SpaceToDepthCPUKernel, int8_t); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16), | |||
| SpaceToDepthCPUKernel, int16_t); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), | |||
| SpaceToDepthCPUKernel, int); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64), | |||
| SpaceToDepthCPUKernel, int64_t); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8), | |||
| SpaceToDepthCPUKernel, uint8_t); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16), | |||
| SpaceToDepthCPUKernel, uint16_t); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32), | |||
| SpaceToDepthCPUKernel, uint32_t); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64), | |||
| SpaceToDepthCPUKernel, uint64_t); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_ | |||
| #include <string> | |||
| #include <vector> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T> | |||
| class SpaceToDepthCPUKernel : public CPUKernel { | |||
| public: | |||
| SpaceToDepthCPUKernel() = default; | |||
| ~SpaceToDepthCPUKernel() override = default; | |||
| void InitKernel(const CNodePtr &kernel_node) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs) override; | |||
| private: | |||
| void CheckParam(const CNodePtr &kernel_node); | |||
| std::vector<size_t> input_shape_; | |||
| std::vector<size_t> output_shape_; | |||
| size_t block_size_; | |||
| }; | |||
| MS_REG_CPU_KERNEL_T( | |||
| SpaceToDepth, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| SpaceToDepthCPUKernel, float); | |||
| MS_REG_CPU_KERNEL_T( | |||
| SpaceToDepth, KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), | |||
| SpaceToDepthCPUKernel, float16); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8), | |||
| SpaceToDepthCPUKernel, int8_t); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16), | |||
| SpaceToDepthCPUKernel, int16_t); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), | |||
| SpaceToDepthCPUKernel, int); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64), | |||
| SpaceToDepthCPUKernel, int64_t); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8), | |||
| SpaceToDepthCPUKernel, uint8_t); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16), | |||
| SpaceToDepthCPUKernel, uint16_t); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32), | |||
| SpaceToDepthCPUKernel, uint32_t); | |||
| MS_REG_CPU_KERNEL_T(SpaceToDepth, | |||
| KernelAttr().SetAllSameAttr(true).AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64), | |||
| SpaceToDepthCPUKernel, uint64_t); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_SPACETODEPTH_CPU_KERNEL_H_ | |||
| @@ -1,87 +1,87 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
#include "backend/kernel_compiler/cpu/topk_cpu_kernel.h"

#include <algorithm>
#include <map>
#include <numeric>
#include <string>
#include <vector>

#include "runtime/device/cpu/cpu_device_address.h"
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T> | |||
| void TopKCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) { | |||
| if (inputs.size() != 2 || outputs.size() != 2) { | |||
| MS_LOG(EXCEPTION) << "TopK needs 2 inputs and 2 outputs, but get inputs: " << inputs.size() | |||
| << "outputs: " << outputs.size(); | |||
| } | |||
| if (inputs[0]->size != outer_size_ * inner_size_ * sizeof(T)) { | |||
| MS_LOG(EXCEPTION) << "Error input data size!"; | |||
| } | |||
| if (inputs[1]->size != sizeof(int)) { | |||
| MS_LOG(EXCEPTION) << "Input K must be int!"; | |||
| } | |||
| auto input = reinterpret_cast<T *>(inputs[0]->addr); | |||
| int k = reinterpret_cast<int *>(inputs[1]->addr)[0]; | |||
| auto output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| auto indices = reinterpret_cast<int *>(outputs[1]->addr); | |||
| if (k < 1) { | |||
| MS_LOG(EXCEPTION) << "Input k must > 0!"; | |||
| } | |||
| size_t k_num = IntToSize(std::min<int>(inner_size_, k)); | |||
| if (outputs[0]->size != outer_size_ * k_num * sizeof(T)) { | |||
| MS_LOG(EXCEPTION) << "Error output data size!"; | |||
| } | |||
| for (size_t i = 0; i < outer_size_; ++i) { | |||
| std::vector<size_t> idx(inner_size_); | |||
| auto base_input = i * inner_size_; | |||
| std::iota(idx.begin(), idx.end(), base_input); | |||
| std::stable_sort(idx.begin(), idx.end(), | |||
| [&input](size_t index_1, size_t index_2) { return input[index_1] > input[index_2]; }); | |||
| auto base_output = i * k_num; | |||
| if (!sorted_) { | |||
| std::stable_sort(idx.begin(), idx.begin() + SizeToLong(k_num)); | |||
| } | |||
| for (size_t j = 0; j < k_num; ++j) { | |||
| indices[base_output + j] = SizeToInt(idx[j]) - SizeToInt(base_input); | |||
| output[base_output + j] = input[idx[j]]; | |||
| } | |||
| } | |||
| } | |||
| void TopKCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| auto x_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| for (size_t i = 0; i < x_shape_.size() - 1; ++i) { | |||
| outer_size_ *= x_shape_[i]; | |||
| } | |||
| inner_size_ = x_shape_[x_shape_.size() - 1]; | |||
| sorted_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "sorted"); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| } | |||
| bool TopKCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (dtype_ == kNumberTypeFloat16) { | |||
| LaunchKernel<float16>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat32) { | |||
| LaunchKernel<float>(inputs, outputs); | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <string> | |||
| #include <vector> | |||
| #include <algorithm> | |||
| #include <map> | |||
| #include "backend/kernel_compiler/cpu/topk_cpu_kernel.h" | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T> | |||
| void TopKCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) { | |||
| if (inputs.size() != 2 || outputs.size() != 2) { | |||
| MS_LOG(EXCEPTION) << "TopK needs 2 inputs and 2 outputs, but get inputs: " << inputs.size() | |||
| << "outputs: " << outputs.size(); | |||
| } | |||
| if (inputs[0]->size != outer_size_ * inner_size_ * sizeof(T)) { | |||
| MS_LOG(EXCEPTION) << "Error input data size!"; | |||
| } | |||
| if (inputs[1]->size != sizeof(int)) { | |||
| MS_LOG(EXCEPTION) << "Input K must be int!"; | |||
| } | |||
| auto input = reinterpret_cast<T *>(inputs[0]->addr); | |||
| int k = reinterpret_cast<int *>(inputs[1]->addr)[0]; | |||
| auto output = reinterpret_cast<T *>(outputs[0]->addr); | |||
| auto indices = reinterpret_cast<int *>(outputs[1]->addr); | |||
| if (k < 1) { | |||
| MS_LOG(EXCEPTION) << "Input k must > 0!"; | |||
| } | |||
| size_t k_num = IntToSize(std::min<int>(inner_size_, k)); | |||
| if (outputs[0]->size != outer_size_ * k_num * sizeof(T)) { | |||
| MS_LOG(EXCEPTION) << "Error output data size!"; | |||
| } | |||
| for (size_t i = 0; i < outer_size_; ++i) { | |||
| std::vector<size_t> idx(inner_size_); | |||
| auto base_input = i * inner_size_; | |||
| std::iota(idx.begin(), idx.end(), base_input); | |||
| std::stable_sort(idx.begin(), idx.end(), | |||
| [&input](size_t index_1, size_t index_2) { return input[index_1] > input[index_2]; }); | |||
| auto base_output = i * k_num; | |||
| if (!sorted_) { | |||
| std::stable_sort(idx.begin(), idx.begin() + SizeToLong(k_num)); | |||
| } | |||
| for (size_t j = 0; j < k_num; ++j) { | |||
| indices[base_output + j] = SizeToInt(idx[j]) - SizeToInt(base_input); | |||
| output[base_output + j] = input[idx[j]]; | |||
| } | |||
| } | |||
| } | |||
| void TopKCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| auto x_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| for (size_t i = 0; i < x_shape_.size() - 1; ++i) { | |||
| outer_size_ *= x_shape_[i]; | |||
| } | |||
| inner_size_ = x_shape_[x_shape_.size() - 1]; | |||
| sorted_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "sorted"); | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| } | |||
| bool TopKCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &, | |||
| const std::vector<kernel::AddressPtr> &outputs) { | |||
| if (dtype_ == kNumberTypeFloat16) { | |||
| LaunchKernel<float16>(inputs, outputs); | |||
| } else if (dtype_ == kNumberTypeFloat32) { | |||
| LaunchKernel<float>(inputs, outputs); | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,46 +1,46 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU kernel for the TopK op: selects the k largest values (and their
// indices) along the last axis of the input; float16/float32 are dispatched
// in Launch.
class TopKCPUKernel : public CPUKernel {
 public:
  TopKCPUKernel() = default;
  ~TopKCPUKernel() override = default;
  // Caches sizes, the "sorted" attribute and the input dtype from the node.
  void InitKernel(const CNodePtr &kernel_node) override;
  // Type-dispatches to LaunchKernel<T>.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;
 private:
  // Typed implementation: per-row top-k selection.
  template <typename T>
  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
  size_t outer_size_{1};  // product of all dims except the last
  size_t inner_size_{1};  // size of the last dim (the top-k axis)
  bool sorted_{false};    // true: emit results largest-first; false: ascending index order
  TypeId dtype_{kTypeUnknown};
};
MS_REG_CPU_KERNEL(TopK, KernelAttr(), TopKCPUKernel)
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// Declaration of the TopK CPU kernel (the implementation lives in the
// matching .cc file, which is not visible from this header).
class TopKCPUKernel : public CPUKernel {
 public:
  TopKCPUKernel() = default;
  ~TopKCPUKernel() override = default;

  // Initializes the kernel from the graph node (defined in the .cc).
  void InitKernel(const CNodePtr &kernel_node) override;

  // Kernel entry point invoked by the CPU runtime.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 private:
  // Type-specific implementation, dispatched on dtype_.
  template <typename T>
  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);

  size_t outer_size_{1};        // presumably product of leading (batch) dims -- TODO confirm in the .cc
  size_t inner_size_{1};        // presumably size of the axis top-k runs over -- TODO confirm in the .cc
  bool sorted_{false};          // presumably the "sorted" attribute of the TopK primitive -- TODO confirm
  TypeId dtype_{kTypeUnknown};  // input device dtype
};
MS_REG_CPU_KERNEL(TopK, KernelAttr(), TopKCPUKernel)
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TOPK_CPU_KERNEL_H_ | |||
| @@ -1,159 +1,159 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/transpose_cpu_kernel.h" | |||
| #include <algorithm> | |||
| #include <vector> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "common/thread_pool.h" | |||
| #include "nnacl/fp32/transpose_fp32.h" | |||
| #include "nnacl/int8/transpose_int8.h" | |||
| #include "nnacl/errorcode.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void TransposeCPUFwdKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| auto tmp = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, "perm"); | |||
| axes_ = {tmp.begin(), tmp.end()}; | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| if (axes_.size() > MAX_TRANSPOSE_DIM_SIZE) { | |||
| MS_LOG(EXCEPTION) << "Transpose support max dimension is " << MAX_TRANSPOSE_DIM_SIZE << "D, but got " | |||
| << axes_.size() << "D."; | |||
| } | |||
| for (size_t i = 0; i < axes_.size(); ++i) { | |||
| transpose_param_.perm_[i] = SizeToInt(axes_[i]); | |||
| } | |||
| int num_axes = SizeToInt(input_shape_.size()); | |||
| transpose_param_.perm_size_ = axes_.size(); | |||
| transpose_param_.num_axes_ = num_axes; | |||
| transpose_param_.strides_[num_axes - 1] = 1; | |||
| transpose_param_.out_strides_[num_axes - 1] = 1; | |||
| for (int i = num_axes - 2; i >= 0; i--) { | |||
| transpose_param_.strides_[i] = input_shape_[i + 1] * transpose_param_.strides_[i + 1]; | |||
| transpose_param_.out_strides_[i] = output_shape_[i + 1] * transpose_param_.out_strides_[i + 1]; | |||
| } | |||
| launch_map_[kNumberTypeInt8] = &TransposeCPUFwdKernel::LaunchKernel<int8_t>; | |||
| launch_map_[kNumberTypeInt16] = &TransposeCPUFwdKernel::LaunchKernel<int16_t>; | |||
| launch_map_[kNumberTypeInt32] = &TransposeCPUFwdKernel::LaunchKernel<int>; | |||
| launch_map_[kNumberTypeInt64] = &TransposeCPUFwdKernel::LaunchKernel<int64_t>; | |||
| launch_map_[kNumberTypeUInt8] = &TransposeCPUFwdKernel::LaunchKernel<uint8_t>; | |||
| launch_map_[kNumberTypeUInt16] = &TransposeCPUFwdKernel::LaunchKernel<uint16_t>; | |||
| launch_map_[kNumberTypeUInt32] = &TransposeCPUFwdKernel::LaunchKernel<uint32_t>; | |||
| launch_map_[kNumberTypeUInt64] = &TransposeCPUFwdKernel::LaunchKernel<uint64_t>; | |||
| launch_map_[kNumberTypeFloat32] = &TransposeCPUFwdKernel::LaunchKernel<float>; | |||
| launch_map_[kNumberTypeBool] = &TransposeCPUFwdKernel::LaunchKernel<bool>; | |||
| auto iter = launch_map_.find(dtype_); | |||
| if (iter != launch_map_.end()) { | |||
| launch_func_ = iter->second; | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Input data type: " << dtype_ << "is not supported for Transpose kernel on CPU."; | |||
| } | |||
| } | |||
// Kernel entry point; the workspace argument is unused.
bool TransposeCPUFwdKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                   const std::vector<kernel::AddressPtr> &,
                                   const std::vector<kernel::AddressPtr> &outputs) {
  // launch_func_ was bound to the dtype-matching LaunchKernel<T> in
  // InitKernel, which throws for unsupported dtypes, so it is non-null here.
  launch_func_(this, inputs, outputs);
  return true;
}
| template <typename T> | |||
| void TransposeCPUFwdKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| const auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| transpose_param_.data_num_ = inputs[0]->size / sizeof(T); | |||
| int output_shape[SizeToInt(output_shape_.size())]; | |||
| for (size_t i = 0; i < output_shape_.size(); ++i) { | |||
| output_shape[i] = SizeToInt(output_shape_[i]); | |||
| } | |||
| size_t data_count = (inputs[0]->size) / sizeof(T); | |||
| if (axes_.size() <= DIMENSION_6D && data_count < MAX_TRANSPOSE_SERIAL_SIZE) { | |||
| int res = NNACL_ERR; | |||
| if constexpr (std::is_same_v<T, int8_t>) { | |||
| res = DoTransposeInt8(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, int16_t>) { | |||
| res = DoTransposeInt16(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, int32_t>) { | |||
| res = DoTransposeInt32(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, int64_t>) { | |||
| res = DoTransposeInt64(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, uint8_t>) { | |||
| res = DoTransposeUInt8(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, uint16_t>) { | |||
| res = DoTransposeUInt16(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, uint32_t>) { | |||
| res = DoTransposeUInt32(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, uint64_t>) { | |||
| res = DoTransposeUInt64(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, float>) { | |||
| res = DoTransposeFp32(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, bool>) { | |||
| res = DoTransposeBool(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } | |||
| if (res != NNACL_OK) { | |||
| MS_LOG(ERROR) << "Transpose run failed"; | |||
| } | |||
| } else { | |||
| ParallelRun(input_addr, output_addr, output_shape, data_count); | |||
| } | |||
| } | |||
// Multi-threaded transpose of `count` elements using the sync thread pool.
// Work is split into up to max_thread_num tasks; the per-task partitioning
// is done inside the nnacl TransposeDims* routine via (task_id, thread_num).
template <typename T>
void TransposeCPUFwdKernel::ParallelRun(const T *input_addr, T *output_addr, const int *output_shape, size_t count) {
  auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
  // Target roughly 128 elements per task, capped at the pool size.
  const float block_size = 128.0;
  size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
  std::vector<common::Task> tasks;
  // Select the dtype-specific nnacl multi-dimensional transpose routine.
  std::function<void(const T *, T *, const int *, TransposeParameter *, int, int)> TransposeDims;
  if constexpr (std::is_same_v<T, int8_t>) {
    TransposeDims = &TransposeDimsInt8;
  } else if constexpr (std::is_same_v<T, int16_t>) {
    TransposeDims = &TransposeDimsInt16;
  } else if constexpr (std::is_same_v<T, int32_t>) {
    TransposeDims = &TransposeDimsInt32;
  } else if constexpr (std::is_same_v<T, int64_t>) {
    TransposeDims = &TransposeDimsInt64;
  } else if constexpr (std::is_same_v<T, uint8_t>) {
    TransposeDims = &TransposeDimsUInt8;
  } else if constexpr (std::is_same_v<T, uint16_t>) {
    TransposeDims = &TransposeDimsUInt16;
  } else if constexpr (std::is_same_v<T, uint32_t>) {
    TransposeDims = &TransposeDimsUInt32;
  } else if constexpr (std::is_same_v<T, uint64_t>) {
    TransposeDims = &TransposeDimsUInt64;
  } else if constexpr (std::is_same_v<T, float>) {
    TransposeDims = &TransposeDimsFp32;
  } else if constexpr (std::is_same_v<T, bool>) {
    TransposeDims = &TransposeDimsBool;
  }
  // NOTE(review): if count == 0 then thread_num == 0 and no task is queued,
  // leaving the (empty) output untouched -- presumably intended.
  for (int task_id = 0; task_id < SizeToInt(thread_num); ++task_id) {
    // Capture task_id/thread_num by value: they change per iteration and the
    // lambdas run after this loop finishes.
    auto task = [&, task_id, thread_num]() {
      TransposeDims(input_addr, output_addr, output_shape, &transpose_param_, task_id, SizeToInt(thread_num));
      return common::SUCCESS;
    };
    tasks.emplace_back(task);
  }
  common::ThreadPool::GetInstance().SyncRun(tasks);
}
| } // namespace kernel | |||
| } // namespace mindspore | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/cpu/transpose_cpu_kernel.h" | |||
| #include <algorithm> | |||
| #include <vector> | |||
| #include "runtime/device/cpu/cpu_device_address.h" | |||
| #include "common/thread_pool.h" | |||
| #include "nnacl/fp32/transpose_fp32.h" | |||
| #include "nnacl/int8/transpose_int8.h" | |||
| #include "nnacl/errorcode.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void TransposeCPUFwdKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); | |||
| output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); | |||
| auto tmp = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, "perm"); | |||
| axes_ = {tmp.begin(), tmp.end()}; | |||
| dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); | |||
| if (axes_.size() > MAX_TRANSPOSE_DIM_SIZE) { | |||
| MS_LOG(EXCEPTION) << "Transpose support max dimension is " << MAX_TRANSPOSE_DIM_SIZE << "D, but got " | |||
| << axes_.size() << "D."; | |||
| } | |||
| for (size_t i = 0; i < axes_.size(); ++i) { | |||
| transpose_param_.perm_[i] = SizeToInt(axes_[i]); | |||
| } | |||
| int num_axes = SizeToInt(input_shape_.size()); | |||
| transpose_param_.perm_size_ = axes_.size(); | |||
| transpose_param_.num_axes_ = num_axes; | |||
| transpose_param_.strides_[num_axes - 1] = 1; | |||
| transpose_param_.out_strides_[num_axes - 1] = 1; | |||
| for (int i = num_axes - 2; i >= 0; i--) { | |||
| transpose_param_.strides_[i] = input_shape_[i + 1] * transpose_param_.strides_[i + 1]; | |||
| transpose_param_.out_strides_[i] = output_shape_[i + 1] * transpose_param_.out_strides_[i + 1]; | |||
| } | |||
| launch_map_[kNumberTypeInt8] = &TransposeCPUFwdKernel::LaunchKernel<int8_t>; | |||
| launch_map_[kNumberTypeInt16] = &TransposeCPUFwdKernel::LaunchKernel<int16_t>; | |||
| launch_map_[kNumberTypeInt32] = &TransposeCPUFwdKernel::LaunchKernel<int>; | |||
| launch_map_[kNumberTypeInt64] = &TransposeCPUFwdKernel::LaunchKernel<int64_t>; | |||
| launch_map_[kNumberTypeUInt8] = &TransposeCPUFwdKernel::LaunchKernel<uint8_t>; | |||
| launch_map_[kNumberTypeUInt16] = &TransposeCPUFwdKernel::LaunchKernel<uint16_t>; | |||
| launch_map_[kNumberTypeUInt32] = &TransposeCPUFwdKernel::LaunchKernel<uint32_t>; | |||
| launch_map_[kNumberTypeUInt64] = &TransposeCPUFwdKernel::LaunchKernel<uint64_t>; | |||
| launch_map_[kNumberTypeFloat32] = &TransposeCPUFwdKernel::LaunchKernel<float>; | |||
| launch_map_[kNumberTypeBool] = &TransposeCPUFwdKernel::LaunchKernel<bool>; | |||
| auto iter = launch_map_.find(dtype_); | |||
| if (iter != launch_map_.end()) { | |||
| launch_func_ = iter->second; | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "Input data type: " << dtype_ << "is not supported for Transpose kernel on CPU."; | |||
| } | |||
| } | |||
// Kernel entry point; the workspace argument is unused.
bool TransposeCPUFwdKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                   const std::vector<kernel::AddressPtr> &,
                                   const std::vector<kernel::AddressPtr> &outputs) {
  // launch_func_ was bound to the dtype-matching LaunchKernel<T> in
  // InitKernel, which throws for unsupported dtypes, so it is non-null here.
  launch_func_(this, inputs, outputs);
  return true;
}
| template <typename T> | |||
| void TransposeCPUFwdKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, | |||
| const std::vector<AddressPtr> &outputs) { | |||
| const auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr); | |||
| auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr); | |||
| transpose_param_.data_num_ = inputs[0]->size / sizeof(T); | |||
| int output_shape[SizeToInt(output_shape_.size())]; | |||
| for (size_t i = 0; i < output_shape_.size(); ++i) { | |||
| output_shape[i] = SizeToInt(output_shape_[i]); | |||
| } | |||
| size_t data_count = (inputs[0]->size) / sizeof(T); | |||
| if (axes_.size() <= DIMENSION_6D && data_count < MAX_TRANSPOSE_SERIAL_SIZE) { | |||
| int res = NNACL_ERR; | |||
| if constexpr (std::is_same_v<T, int8_t>) { | |||
| res = DoTransposeInt8(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, int16_t>) { | |||
| res = DoTransposeInt16(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, int32_t>) { | |||
| res = DoTransposeInt32(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, int64_t>) { | |||
| res = DoTransposeInt64(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, uint8_t>) { | |||
| res = DoTransposeUInt8(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, uint16_t>) { | |||
| res = DoTransposeUInt16(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, uint32_t>) { | |||
| res = DoTransposeUInt32(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, uint64_t>) { | |||
| res = DoTransposeUInt64(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, float>) { | |||
| res = DoTransposeFp32(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } else if constexpr (std::is_same_v<T, bool>) { | |||
| res = DoTransposeBool(input_addr, output_addr, output_shape, &transpose_param_); | |||
| } | |||
| if (res != NNACL_OK) { | |||
| MS_LOG(ERROR) << "Transpose run failed"; | |||
| } | |||
| } else { | |||
| ParallelRun(input_addr, output_addr, output_shape, data_count); | |||
| } | |||
| } | |||
// Multi-threaded transpose of `count` elements using the sync thread pool.
// Work is split into up to max_thread_num tasks; the per-task partitioning
// is done inside the nnacl TransposeDims* routine via (task_id, thread_num).
template <typename T>
void TransposeCPUFwdKernel::ParallelRun(const T *input_addr, T *output_addr, const int *output_shape, size_t count) {
  auto max_thread_num = common::ThreadPool::GetInstance().GetSyncRunThreadNum();
  // Target roughly 128 elements per task, capped at the pool size.
  const float block_size = 128.0;
  size_t thread_num = count < block_size * max_thread_num ? std::ceil(count / block_size) : max_thread_num;
  std::vector<common::Task> tasks;
  // Select the dtype-specific nnacl multi-dimensional transpose routine.
  std::function<void(const T *, T *, const int *, TransposeParameter *, int, int)> TransposeDims;
  if constexpr (std::is_same_v<T, int8_t>) {
    TransposeDims = &TransposeDimsInt8;
  } else if constexpr (std::is_same_v<T, int16_t>) {
    TransposeDims = &TransposeDimsInt16;
  } else if constexpr (std::is_same_v<T, int32_t>) {
    TransposeDims = &TransposeDimsInt32;
  } else if constexpr (std::is_same_v<T, int64_t>) {
    TransposeDims = &TransposeDimsInt64;
  } else if constexpr (std::is_same_v<T, uint8_t>) {
    TransposeDims = &TransposeDimsUInt8;
  } else if constexpr (std::is_same_v<T, uint16_t>) {
    TransposeDims = &TransposeDimsUInt16;
  } else if constexpr (std::is_same_v<T, uint32_t>) {
    TransposeDims = &TransposeDimsUInt32;
  } else if constexpr (std::is_same_v<T, uint64_t>) {
    TransposeDims = &TransposeDimsUInt64;
  } else if constexpr (std::is_same_v<T, float>) {
    TransposeDims = &TransposeDimsFp32;
  } else if constexpr (std::is_same_v<T, bool>) {
    TransposeDims = &TransposeDimsBool;
  }
  // NOTE(review): if count == 0 then thread_num == 0 and no task is queued,
  // leaving the (empty) output untouched -- presumably intended.
  for (int task_id = 0; task_id < SizeToInt(thread_num); ++task_id) {
    // Capture task_id/thread_num by value: they change per iteration and the
    // lambdas run after this loop finishes.
    auto task = [&, task_id, thread_num]() {
      TransposeDims(input_addr, output_addr, output_shape, &transpose_param_, task_id, SizeToInt(thread_num));
      return common::SUCCESS;
    };
    tasks.emplace_back(task);
  }
  common::ThreadPool::GetInstance().SyncRun(tasks);
}
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -1,58 +1,58 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <unordered_map> | |||
| #include <memory> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| #include "nnacl/base/transpose_base.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU forward kernel for the Transpose operator: permutes the axes of a
// tensor according to the node's "perm" attribute (see
// transpose_cpu_kernel.cc for the implementation).
class TransposeCPUFwdKernel : public CPUKernel {
 public:
  TransposeCPUFwdKernel() = default;
  ~TransposeCPUFwdKernel() override = default;

  // Caches shapes, the permutation, and strides from the graph node, and
  // binds launch_func_ to the dtype-matching LaunchKernel<T>.
  void InitKernel(const CNodePtr &kernel_node) override;

  // Runs the kernel; the workspace argument is unused.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 private:
  // Serial nnacl transpose for small tensors; defers to ParallelRun for
  // large or high-rank inputs.
  template <typename T>
  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);

  // Multi-threaded transpose: splits `count` elements across the pool.
  template <typename T>
  void ParallelRun(const T *input_addr, T *output_addr, const int *output_shape, size_t count);

  TransposeParameter transpose_param_;  // perm/strides handed to nnacl
  std::vector<size_t> input_shape_;
  std::vector<size_t> output_shape_;
  std::vector<size_t> axes_;            // permutation from the "perm" attr
  TypeId dtype_{kTypeUnknown};          // input device dtype
  // Per-dtype dispatch table populated in InitKernel.
  using TypeKernel =
    std::function<void(TransposeCPUFwdKernel *, const std::vector<AddressPtr> &, const std::vector<AddressPtr> &)>;
  std::unordered_map<TypeId, TypeKernel> launch_map_;
  TypeKernel launch_func_;
};
MS_REG_CPU_KERNEL(Transpose, KernelAttr(), TransposeCPUFwdKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_ | |||
| #include <vector> | |||
| #include <unordered_map> | |||
| #include <memory> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel.h" | |||
| #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" | |||
| #include "nnacl/base/transpose_base.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// CPU forward kernel for the Transpose operator: permutes the axes of a
// tensor according to the node's "perm" attribute (see
// transpose_cpu_kernel.cc for the implementation).
class TransposeCPUFwdKernel : public CPUKernel {
 public:
  TransposeCPUFwdKernel() = default;
  ~TransposeCPUFwdKernel() override = default;

  // Caches shapes, the permutation, and strides from the graph node, and
  // binds launch_func_ to the dtype-matching LaunchKernel<T>.
  void InitKernel(const CNodePtr &kernel_node) override;

  // Runs the kernel; the workspace argument is unused.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override;

 private:
  // Serial nnacl transpose for small tensors; defers to ParallelRun for
  // large or high-rank inputs.
  template <typename T>
  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);

  // Multi-threaded transpose: splits `count` elements across the pool.
  template <typename T>
  void ParallelRun(const T *input_addr, T *output_addr, const int *output_shape, size_t count);

  TransposeParameter transpose_param_;  // perm/strides handed to nnacl
  std::vector<size_t> input_shape_;
  std::vector<size_t> output_shape_;
  std::vector<size_t> axes_;            // permutation from the "perm" attr
  TypeId dtype_{kTypeUnknown};          // input device dtype
  // Per-dtype dispatch table populated in InitKernel.
  using TypeKernel =
    std::function<void(TransposeCPUFwdKernel *, const std::vector<AddressPtr> &, const std::vector<AddressPtr> &)>;
  std::unordered_map<TypeId, TypeKernel> launch_map_;
  TypeKernel launch_func_;
};
MS_REG_CPU_KERNEL(Transpose, KernelAttr(), TransposeCPUFwdKernel);
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TRANSPOSE_CPU_KERNEL_H_ | |||