@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 #include "backend/kernel_compiler/cpu/cpu_kernel.h"
+#include <algorithm>
+#include <utility>
 #include "common/thread_pool.h"
 
 namespace mindspore {
@@ -119,5 +121,118 @@ std::vector<size_t> CPUKernelUtils::FlatShapeByAxis(const std::vector<size_t> &s
   return flat_shape;
 }
+
+BroadcastIterator::BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
+                                     std::vector<size_t> output_shape)
+    : input_shape_a_(std::move(input_shape_a)),
+      input_shape_b_(std::move(input_shape_b)),
+      output_shape_(std::move(output_shape)) {
+  output_dimension_ = SizeToInt(output_shape_.size());  // Assign dimension to int for iterator
+  BroadcastShape();
+  // Allocate strides memory
+  input_strides_a_.resize(output_dimension_);
+  input_strides_b_.resize(output_dimension_);
+  input_back_strides_a_.resize(output_dimension_);
+  input_back_strides_b_.resize(output_dimension_);
+  coordinates_.resize(output_dimension_);
+  InitStrides();
+}
+
+void BroadcastIterator::SetPos(size_t pos) {
+  for (int i = output_dimension_ - 1; i >= 0 && pos != 0; --i) {
+    coordinates_[i] = pos % output_shape_[i];
+    input_pos_[0] += coordinates_[i] * input_strides_a_[i];
+    input_pos_[1] += coordinates_[i] * input_strides_b_[i];
+    pos /= output_shape_[i];
+  }
+}
+
+void BroadcastIterator::GenNextPos() {
+  // Calculate output next coordinate
+  for (int i = output_dimension_ - 1; i >= 0; --i) {
+    if (coordinates_[i] + 1 == output_shape_[i]) {
+      coordinates_[i] = 0;
+      input_pos_[0] -= input_back_strides_a_[i];
+      input_pos_[1] -= input_back_strides_b_[i];
+    } else {
+      ++coordinates_[i];
+      input_pos_[0] += input_strides_a_[i];
+      input_pos_[1] += input_strides_b_[i];
+      break;
+    }
+  }
+}
+
+void BroadcastIterator::BroadcastShape() {
+  int input_dimension_a = input_shape_a_.size();
+  if (input_dimension_a < output_dimension_) {
+    input_shape_a_.insert(input_shape_a_.begin(), output_dimension_ - input_dimension_a, 1);
+  }
+  int input_dimension_b = input_shape_b_.size();
+  if (input_dimension_b < output_dimension_) {
+    input_shape_b_.insert(input_shape_b_.begin(), output_dimension_ - input_dimension_b, 1);
+  }
+}
+
+void BroadcastIterator::InitStrides() {
+  input_strides_a_[output_dimension_ - 1] = 1;
+  input_strides_b_[output_dimension_ - 1] = 1;
+  for (int i = output_dimension_ - 2; i >= 0; --i) {
+    input_strides_a_[i] = input_shape_a_[i + 1] * input_strides_a_[i + 1];
+    input_strides_b_[i] = input_shape_b_[i + 1] * input_strides_b_[i + 1];
+    input_back_strides_a_[i + 1] = (input_shape_a_[i + 1] - 1) * input_strides_a_[i + 1];
+    input_back_strides_b_[i + 1] = (input_shape_b_[i + 1] - 1) * input_strides_b_[i + 1];
+  }
+  // Update strides for broadcast
+  // Where the axis value is 1, the stride is 0
+  std::transform(input_strides_a_.begin(), input_strides_a_.end(), input_shape_a_.begin(), input_strides_a_.begin(),
+                 [](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
+  std::transform(input_strides_b_.begin(), input_strides_b_.end(), input_shape_b_.begin(), input_strides_b_.begin(),
+                 [](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
+}
+
+TransposeIterator::TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes,
+                                     const std::vector<size_t> &input_shape)
+    : shape_(std::move(output_shape)), axes_(std::move(axes)) {
+  // Calculate strides
+  dimension_ = shape_.size();
+  std::vector<uint32_t> strides(dimension_, 1);
+  for (int i = dimension_ - 2; i >= 0; --i) {
+    strides[i] = input_shape[i + 1] * strides[i + 1];
+  }
+  // Swap shape and strides and calculate back strides
+  strides_.resize(dimension_);
+  back_strides_.resize(dimension_);
+  for (int i = dimension_ - 1; i >= 0; --i) {
+    strides_[i] = strides[axes_[i]];
+    back_strides_[i] = (shape_[i] - 1) * strides_[i];
+  }
+  // Calculate coordinate by pos
+  coordinates_.resize(dimension_);
+}
+
+void TransposeIterator::SetPos(size_t pos) {
+  for (int i = dimension_ - 1; i >= 0 && pos != 0; --i) {
+    coordinates_[i] = pos % shape_[i];
+    pos_ += coordinates_[i] * strides_[i];
+    pos /= shape_[i];
+  }
+}
+
+void TransposeIterator::GenNextPos() {
+  for (int i = dimension_ - 1; i >= 0; --i) {
+    if (coordinates_[i] + 1 == shape_[i]) {
+      coordinates_[i] = 0;
+      pos_ -= back_strides_[i];
+    } else {
+      coordinates_[i]++;
+      pos_ += strides_[i];
+      break;
+    }
+  }
+}
 } // namespace kernel
 } // namespace mindspore
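A quick hand trace helps sanity-check the stride setup above. The snippet below is an illustrative sketch (the function name and shapes are mine, not part of the patch) and uses only the BroadcastIterator API added here:

#include "backend/kernel_compiler/cpu/cpu_kernel.h"

// For a = {1, 3} and b = {2, 3} broadcast to {2, 3}, the raw row-major strides are
// {3, 1} for both inputs; zeroing the stride of every length-1 axis then gives
// input_strides_a_ = {0, 1} and input_strides_b_ = {3, 1}.
void BroadcastIteratorTrace() {
  mindspore::kernel::BroadcastIterator it({1, 3}, {2, 3}, {2, 3});
  it.SetPos(4);  // output element (1, 1)
  // it.GetInputPosA() == 1: input a is broadcast along dim 0, so only the column matters.
  // it.GetInputPosB() == 4: input b is indexed exactly like the output.
}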
@@ -145,6 +145,48 @@ class CPUKernelUtils {
   static void ParallelFor(const CTask &task, size_t count);
   static std::vector<size_t> FlatShapeByAxis(const std::vector<size_t> &shape, int axis);
 };
+
+class BroadcastIterator {
+ public:
+  BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
+                    std::vector<size_t> output_shape);
+  inline size_t GetInputPosA() const { return input_pos_[0]; }
+  inline size_t GetInputPosB() const { return input_pos_[1]; }
+  void SetPos(size_t pos);
+  void GenNextPos();
+
+ private:
+  void BroadcastShape();
+  void InitStrides();
+
+  std::vector<size_t> coordinates_;
+  std::vector<size_t> input_shape_a_;
+  std::vector<size_t> input_shape_b_;
+  std::vector<size_t> output_shape_;
+  std::vector<size_t> input_strides_a_;
+  std::vector<size_t> input_strides_b_;
+  std::vector<size_t> input_back_strides_a_;
+  std::vector<size_t> input_back_strides_b_;
+  std::array<size_t, 2> input_pos_{0};
+  int output_dimension_{0};
+};
+
+class TransposeIterator {
+ public:
+  TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes, const std::vector<size_t> &input_shape);
+  inline size_t GetPos() const { return pos_; }
+  void SetPos(size_t pos);
+  void GenNextPos();
+
+ private:
+  int dimension_{0};
+  std::vector<size_t> coordinates_;
+  std::vector<size_t> shape_;
+  std::vector<size_t> strides_;
+  std::vector<size_t> back_strides_;
+  std::vector<size_t> axes_;
+  size_t pos_{0};
+};
 } // namespace kernel
 } // namespace mindspore
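Another hand trace, this time for the transpose iterator; again an illustrative sketch built only from the declarations above, not code taken from the patch:

#include "backend/kernel_compiler/cpu/cpu_kernel.h"

// Transposing a 2x3 row-major buffer with perm {1, 0}: the output shape is {3, 2},
// and a linear walk over the output reads input positions 0, 3, 1, 4, 2, 5.
void TransposeIteratorTrace() {
  mindspore::kernel::TransposeIterator it({3, 2}, {1, 0}, {2, 3});
  for (int i = 0; i < 6; ++i) {
    // it.GetPos() yields 0, 3, 1, 4, 2, 5 on successive iterations.
    it.GenNextPos();
  }
}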
@@ -18,13 +18,10 @@
 #include <string>
 #include <vector>
 #include <algorithm>
-#include <unordered_set>
+#include <utility>
 
 namespace mindspore {
 namespace kernel {
-namespace {
-const size_t kMaxDim = 10;
-} // namespace
 template <typename T>
 void ReduceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
   MS_EXCEPTION_IF_NULL(kernel_node);
@@ -37,10 +34,14 @@ void ReduceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
   } else {
     MS_LOG(EXCEPTION) << "Attribute is invalid";
   }
   int dimension = input_shape_.size();
   std::transform(axis_.begin(), axis_.end(), axis_.begin(),
                  [dimension](const auto &a) { return a < 0 ? dimension + a : a; });
   sort(axis_.begin(), axis_.end());
+  // Delete duplicate axes.
+  auto last = std::unique(axis_.begin(), axis_.end());
+  axis_.erase(last, axis_.end());
   auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
   if (kernel_name == "ReduceMax") {
     reduce_type_ = 1;
@@ -55,10 +56,8 @@ void ReduceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
     reduce_type_ = 4;
     reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; };
   } else {
     MS_LOG(EXCEPTION) << "unsupported reduce type: " << reduce_type_;
   }
-  CheckParameter();
 }
 
 template <typename T>
@@ -68,7 +67,7 @@ bool ReduceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
   size_t input_size = inputs[0]->size / sizeof(T);
   auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
   auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
-  if (axis_.empty()) {
+  if (axis_.empty() || input_shape_.empty() || input_shape_.size() == 1) {
     // Get one ret
     *output_addr = input_addr[0];
     for (size_t i = 1; i < input_size; ++i) {
@@ -78,107 +77,50 @@
       *output_addr /= input_size;
     }
   } else {
-    // transpose->calculate strides->calculate ret
-    std::vector<size_t> out_shape;
-    std::vector<size_t> strides;
-    std::vector<size_t> back_strides;
-    size_t stride;
-    CalculateTransposeInfo(&out_shape, &strides, &back_strides, &stride);
+    // Calculate transpose axes and stride
     int dimension = input_shape_.size();
-    std::vector<size_t> coordinates(dimension);
-    auto get_next_pos = [&coordinates, &out_shape, &strides, &back_strides, &dimension](size_t &curr_pos) {
-      for (int i = dimension - 1; i >= 0; --i) {
-        if (coordinates[i] + 1 == out_shape[i]) {
-          coordinates[i] = 0;
-          curr_pos -= back_strides[i];
-        } else {
-          coordinates[i]++;
-          curr_pos += strides[i];
-          break;
-        }
-      }
-    };
-    size_t output_size = outputs[0]->size / sizeof(T);
-    size_t pos = 0;
-    for (size_t i = 0; i < output_size; ++i) {
-      if (i != 0) {
-        get_next_pos(pos);
-      }
-      output_addr[i] = input_addr[pos];
-      for (size_t j = 1; j < stride; ++j) {
-        get_next_pos(pos);
-        reduce_func_(input_addr, pos, &output_addr[i]);
-      }
-      if (reduce_type_ == 4) {  // 4 is reduce mean
-        output_addr[i] /= stride;
-      }
-    }
-  }
-  return true;
-}
-
-template <typename T>
-void ReduceCPUKernel<T>::CalculateTransposeInfo(std::vector<size_t> *new_shape, std::vector<size_t> *strides,
-                                                std::vector<size_t> *back_strides, size_t *stride) const {
-  int dimension = input_shape_.size();
-  std::vector<size_t> input_strides(dimension);
-  input_strides[dimension - 1] = 1;
-  for (int i = dimension - 2; i >= 0; --i) {
-    input_strides[i] = input_shape_[i + 1] * input_strides[i + 1];
-  }
-  // Calculate transpose axes and stride
-  std::vector<size_t> axes(dimension);
-  int j = 0;
-  int k = 0;
-  *stride = 1;
-  for (int i = 0; i < dimension; ++i) {
-    if (i != axis_[j]) {
-      axes[k] = i;
-      ++k;
-    } else {
-      *stride *= input_shape_[i];
-      ++j;
-    }
-  }
-  for (auto &it : axis_) {
-    axes[k] = it;
-    ++k;
-  }
-  // Calculate strides, new_shape, back strides
-  strides->resize(dimension);
-  new_shape->resize(dimension);
-  back_strides->resize(dimension);
-  for (int i = dimension - 1; i >= 0; --i) {
-    (*strides)[i] = input_strides[axes[i]];
-    (*new_shape)[i] = input_shape_[axes[i]];
-    (*back_strides)[i] = ((*new_shape)[i] - 1) * (*strides)[i];
-  }
-}
-
-template <typename T>
-void ReduceCPUKernel<T>::CheckParameter() const {
-  if (input_shape_.empty() || input_shape_.size() > kMaxDim) {
-    MS_LOG(EXCEPTION) << "Invalid input tensor of dimension: " << input_shape_.size();
-  }
-  if (axis_.empty()) {
-    MS_LOG(INFO) << "axis is empty";
-    return;
-  }
-  std::unordered_set<int> checker(axis_.begin(), axis_.end());
-  if (checker.size() != axis_.size()) {
-    MS_LOG(EXCEPTION) << "Duplicate value in axis";
-  }
-  int maxDimension = input_shape_.size();
-  for (auto &axis : axis_) {
-    if (axis >= maxDimension) {
-      MS_LOG(EXCEPTION) << "Invalid value in axis: " << axis;
-    }
-  }
-}
+    size_t stride = 1;
+    std::vector<size_t> axes(input_shape_.size());
+    size_t j = 0;
+    size_t k = 0;
+    for (int i = 0; i < dimension; ++i) {
+      if (j == axis_.size() || i != axis_[j]) {
+        axes[k] = i;
+        ++k;
+      } else {
+        stride *= input_shape_[i];
+        ++j;
+      }
+    }
+    for (auto &it : axis_) {
+      axes[k] = it;
+      ++k;
+    }
+    // Calculate transpose shape
+    std::vector<size_t> transpose_shape(input_shape_.size());
+    for (int i = 0; i < dimension; ++i) {
+      transpose_shape[i] = input_shape_[axes[i]];
+    }
+    size_t output_size = outputs[0]->size / sizeof(T);
+    TransposeIterator base_iter(std::move(transpose_shape), std::move(axes), input_shape_);
+    auto task = [this, &base_iter, input_addr, output_addr, stride](size_t start, size_t end) {
+      auto iter = base_iter;
+      iter.SetPos(start * stride);
+      for (size_t i = start; i < end; ++i) {
+        output_addr[i] = input_addr[iter.GetPos()];
+        iter.GenNextPos();
+        for (size_t j = 1; j < stride; ++j) {
+          reduce_func_(input_addr, iter.GetPos(), &output_addr[i]);
+          iter.GenNextPos();
+        }
+        if (reduce_type_ == 4) {  // 4 is reduce mean
+          output_addr[i] /= stride;
+        }
+      }
+    };
+    CPUKernelUtils::ParallelFor(task, output_size);
+  }
+  return true;
+}
 } // namespace kernel
 } // namespace mindspore
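To make the new Launch path concrete, here is a minimal sketch (my example, not from the patch) of the same transpose-then-accumulate walk for a sum over axis 1 of a 2x3x4 tensor; the derived values match what the loop above computes: axes = {0, 2, 1}, stride = 3, transpose_shape = {2, 4, 3}, output_size = 8.

#include <vector>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"

// ReduceSum over axis 1 of a {2, 3, 4} input: every output element is accumulated
// from stride = 3 consecutive iterator positions; ReduceMean would divide by stride.
void ReduceAxis1Sketch(const float *input, float *output) {
  std::vector<size_t> input_shape{2, 3, 4};
  std::vector<size_t> axes{0, 2, 1};             // kept axes first, reduced axes last
  std::vector<size_t> transpose_shape{2, 4, 3};  // input_shape permuted by axes
  const size_t stride = 3;                       // product of the reduced dimensions
  const size_t output_size = 8;                  // 2 * 4
  mindspore::kernel::TransposeIterator iter(transpose_shape, axes, input_shape);
  for (size_t i = 0; i < output_size; ++i) {
    output[i] = input[iter.GetPos()];
    iter.GenNextPos();
    for (size_t j = 1; j < stride; ++j) {
      output[i] += input[iter.GetPos()];
      iter.GenNextPos();
    }
  }
}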
@@ -34,9 +34,6 @@ class ReduceCPUKernel : public CPUKernel {
               const std::vector<AddressPtr> &outputs) override;
 
  private:
-  void CheckParameter() const;
-  void CalculateTransposeInfo(std::vector<size_t> *new_shape, std::vector<size_t> *strides,
-                              std::vector<size_t> *back_strides, size_t *stride) const;
   std::vector<size_t> input_shape_;
   std::vector<int64_t> axis_;
   int reduce_type_{0};
@@ -14,71 +14,11 @@
  * limitations under the License.
  */
 #include "backend/kernel_compiler/cpu/tensoradd_cpu_kernel.h"
-#include <functional>
 #include <vector>
 
 namespace mindspore {
 namespace kernel {
-namespace {
-struct Iterator {
-  std::vector<size_t> coordinates_;
-  std::vector<size_t> input_shape_a_;
-  std::vector<size_t> input_shape_b_;
-  std::vector<size_t> output_shape_;
-  std::vector<size_t> input_strides_a_;
-  std::vector<size_t> input_strides_b_;
-  int output_dimension_pos_{0};
-  size_t pos_{0};
-
-  Iterator(const std::vector<size_t> &input_shape_a, const std::vector<size_t> &input_shape_b,
-           const std::vector<size_t> &output_shape, const std::vector<size_t> &input_strides_a,
-           const std::vector<size_t> &input_strides_b, size_t pos)
-      : input_shape_a_(input_shape_a),
-        input_shape_b_(input_shape_b),
-        output_shape_(output_shape),
-        input_strides_a_(input_strides_a),
-        input_strides_b_(input_strides_b),
-        pos_{pos} {
-    output_dimension_pos_ = output_shape.size() - 1;
-    // Calculate coordinate with pos
-    coordinates_.resize(output_dimension_pos_ + 1);
-    int tmp = pos_;
-    for (int i = output_dimension_pos_; i >= 0 && tmp != 0; --i) {
-      coordinates_[i] = tmp % output_shape_[i];
-      tmp /= output_shape_[i];
-    }
-  }
-
-  void UpdateCoordinates() {
-    // Calculate output next coordinate
-    for (int i = output_dimension_pos_; i >= 0; --i) {
-      if (coordinates_[i] + 1 == output_shape_[i]) {
-        coordinates_[i] = 0;
-      } else {
-        ++coordinates_[i];
-        break;
-      }
-    }
-  }
-
-  void GenPoints(std::array<size_t, 2> *position) {
-    auto &idx = *position;
-    idx = {0, 0};
-    for (int k = 0; k < output_dimension_pos_; ++k) {
-      if (input_shape_a_[k] > 1) {
-        idx[0] += coordinates_[k] * input_strides_a_[k];
-      }
-      if (input_shape_b_[k] > 1) {
-        idx[1] += coordinates_[k] * input_strides_b_[k];
-      }
-    }
-    if (input_shape_a_[output_dimension_pos_] > 1) {
-      idx[0] += coordinates_[output_dimension_pos_];
-    }
-    if (input_shape_b_[output_dimension_pos_] > 1) {
-      idx[1] += coordinates_[output_dimension_pos_];
-    }
-  }
-};
-} // namespace
 
 void TensorAddCPUKernel::InitKernel(const CNodePtr &kernel_node) {
   MS_EXCEPTION_IF_NULL(kernel_node);
@@ -96,55 +36,25 @@ bool TensorAddCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
   auto output_addr = reinterpret_cast<float *>(outputs[0]->addr);
   auto output_size = outputs[0]->size / sizeof(float);
   if (input_shape_a_ == input_shape_b_) {
-    NormalProcess(input_addr_a, input_addr_b, output_addr, output_size);
+    auto task = [output_addr, input_addr_a, input_addr_b](size_t start, size_t end) {
+      for (size_t i = start; i < end; ++i) {
+        output_addr[i] = input_addr_a[i] + input_addr_b[i];
+      }
+    };
+    CPUKernelUtils::ParallelFor(task, output_size);
   } else {  // Broadcast
-    BroadcastProcess(input_addr_a, input_addr_b, output_addr, output_size);
+    BroadcastIterator base_iter(input_shape_a_, input_shape_b_, output_shape_);
+    auto task = [&base_iter, output_addr, input_addr_a, input_addr_b](size_t start, size_t end) {
+      auto iter = base_iter;
+      iter.SetPos(start);
+      for (size_t i = start; i < end; ++i) {
+        output_addr[i] = input_addr_a[iter.GetInputPosA()] + input_addr_b[iter.GetInputPosB()];
+        iter.GenNextPos();
+      }
+    };
+    CPUKernelUtils::ParallelFor(task, output_size);
   }
   return true;
 }
-
-void TensorAddCPUKernel::NormalProcess(const float *input_a, const float *input_b, float *output, size_t size) {
-  auto task = [output, input_a, input_b](size_t start, size_t end) {
-    for (size_t i = start; i < end; ++i) {
-      output[i] = input_a[i] + input_b[i];
-    }
-  };
-  CPUKernelUtils::ParallelFor(task, size);
-}
-
-void TensorAddCPUKernel::BroadcastProcess(const float *input_a, const float *input_b, float *output, size_t size) {
-  // Broadcast shape
-  int dimension = output_shape_.size();
-  int input_dimension_a = input_shape_a_.size();
-  if (input_dimension_a < dimension) {
-    input_shape_a_.insert(input_shape_a_.begin(), dimension - input_dimension_a, 1);
-  }
-  int input_dimension_b = input_shape_b_.size();
-  if (input_dimension_b < dimension) {
-    input_shape_b_.insert(input_shape_b_.begin(), dimension - input_dimension_b, 1);
-  }
-  // Calculate strides
-  CalculateStrides(input_shape_a_, &input_strides_a_);
-  CalculateStrides(input_shape_b_, &input_strides_b_);
-  auto task = [this, input_a, input_b, output](size_t start, size_t end) {
-    Iterator iter(input_shape_a_, input_shape_b_, output_shape_, input_strides_a_, input_strides_b_, start);
-    std::array<size_t, 2> position{0};
-    for (size_t i = start; i < end; ++i) {
-      iter.GenPoints(&position);
-      output[i] = input_a[position[0]] + input_b[position[1]];
-      iter.UpdateCoordinates();
-    }
-  };
-  CPUKernelUtils::ParallelFor(task, size);
-}
-
-void TensorAddCPUKernel::CalculateStrides(const std::vector<size_t> &shape, std::vector<size_t> *strides) {
-  strides->resize(shape.size(), 1);
-  for (int i = shape.size() - 2; i >= 0; --i) {
-    (*strides)[i] = shape[i + 1] * (*strides)[i + 1];
-  }
-}
 } // namespace kernel
 } // namespace mindspore
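Note that the broadcast branch copies base_iter inside the lambda (auto iter = base_iter), so each ParallelFor chunk keeps its own coordinate state and only needs a single SetPos seek. A serial sketch of the same walk, with illustrative names and shapes that are not from the patch:

#include "backend/kernel_compiler/cpu/cpu_kernel.h"

// Adding a {3, 1} tensor to a {1, 4} tensor, output {3, 4}: for output element
// (r, c) the iterator yields a-index r and b-index c.
void BroadcastAddSketch(const float *a, const float *b, float *out) {
  mindspore::kernel::BroadcastIterator iter({3, 1}, {1, 4}, {3, 4});
  for (size_t i = 0; i < 12; ++i) {
    out[i] = a[iter.GetInputPosA()] + b[iter.GetInputPosB()];
    iter.GenNextPos();
  }
}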
@@ -34,15 +34,9 @@ class TensorAddCPUKernel : public CPUKernel {
               const std::vector<AddressPtr> &outputs) override;
 
  private:
-  static void NormalProcess(const float *input_a, const float *input_b, float *output, size_t size);
-  void BroadcastProcess(const float *input_a, const float *input_b, float *output, size_t size);
-  static void CalculateStrides(const std::vector<size_t> &, std::vector<size_t> *);
   std::vector<size_t> input_shape_a_;
   std::vector<size_t> input_shape_b_;
-  // Define follow var for Broadcast
   std::vector<size_t> output_shape_;
-  std::vector<size_t> input_strides_a_;
-  std::vector<size_t> input_strides_b_;
 };
 
 MS_REG_CPU_KERNEL(
@@ -17,21 +17,16 @@
 #include "backend/kernel_compiler/cpu/transpose_cpu_kernel.h"
 #include <algorithm>
 #include <vector>
-#include <unordered_set>
 #include "runtime/device/cpu/cpu_device_address.h"
 
 namespace mindspore {
 namespace kernel {
-namespace {
-const size_t kMaxDim = 10;
-}
 void TransposeCPUFwdKernel::InitKernel(const CNodePtr &kernel_node) {
   MS_EXCEPTION_IF_NULL(kernel_node);
   input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
   output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
-  axes_ = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, "perm");
-  CheckParameter();
+  auto tmp = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, "perm");
+  axes_ = {tmp.begin(), tmp.end()};
   dtype_ = AnfAlgo ::GetPrevNodeOutputDeviceDataType(kernel_node, 0);
   if (dtype_ == kTypeUnknown) {
     dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0);
@@ -63,77 +58,22 @@ bool TransposeCPUFwdKernel::Launch(const std::vector<kernel::AddressPtr> &inputs
   return true;
 }
 
-void TransposeCPUFwdKernel::CheckParameter() const {
-  if (input_shape_.size() > kMaxDim) {
-    MS_LOG(EXCEPTION) << "Input tensor is " << input_shape_.size() << ", out of bound max dimension 10";
-  }
-  if (input_shape_.empty()) {
-    MS_LOG(EXCEPTION) << "Input tensor is empty";
-  }
-  if (input_shape_.size() != axes_.size()) {
-    MS_LOG(EXCEPTION) << "Input perm size is not equal with input shape";
-  }
-  // Input axes include the same axis
-  std::unordered_set<int64_t> unique_axes{axes_.begin(), axes_.end()};
-  if (unique_axes.size() != axes_.size()) {
-    MS_LOG(EXCEPTION) << "Input perm is illegal, it has the same axis";
-  }
-  // Input axes not in ture range(input_shape_.size())
-  int64_t shape_size = input_shape_.size();
-  for (auto &axis : axes_) {
-    if (axis < 0 || axis >= shape_size) {
-      MS_LOG(EXCEPTION) << "Input perm axis is out of bound input shape size";
-    }
-  }
-}
-
 template <typename T>
 void TransposeCPUFwdKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
                                          const std::vector<AddressPtr> &outputs) {
-  int dimension = input_shape_.size();
-  // Calculate input tensor strides
-  std::array<uint32_t, kMaxDim> input_strides{0};
-  input_strides[dimension - 1] = 1;
-  for (int i = dimension - 2; i >= 0; --i) {
-    input_strides[i] = input_shape_[i + 1] * input_strides[i + 1];
-  }
-  // Calculate output strides and back strides
-  std::array<uint32_t, kMaxDim> strides{0};
-  std::array<uint32_t, kMaxDim> back_strides{0};
-  for (int i = dimension - 1; i >= 0; --i) {
-    strides[i] = input_strides[axes_[i]];
-    back_strides[i] = (output_shape_[i] - 1) * strides[i];
-  }
-  std::array<uint32_t, kMaxDim> coordinates{0};
-  auto get_next_pos = [&coordinates, &strides, &back_strides, &dimension, this](int curr_pos) {
-    for (int i = dimension - 1; i >= 0; --i) {
-      if (coordinates[i] + 1 == output_shape_[i]) {
-        coordinates[i] = 0;
-        curr_pos -= back_strides[i];
-      } else {
-        coordinates[i]++;
-        curr_pos += strides[i];
-        break;
-      }
-    }
-    return curr_pos;
-  };
-  auto input = reinterpret_cast<T *>(inputs[0]->addr);
-  auto output = reinterpret_cast<T *>(outputs[0]->addr);
-  size_t size = IntToSize(inputs[0]->size / sizeof(T));
-  output[0] = input[0];
-  int pos = 0;
-  for (size_t i = 1; i < size; ++i) {
-    pos = get_next_pos(pos);
-    output[i] = input[pos];
-  }
+  auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
+  auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
+  size_t size = IntToSize(inputs[0]->size / sizeof(T));
+  TransposeIterator base_iter(output_shape_, axes_, input_shape_);
+  auto task = [&base_iter, input_addr, output_addr](size_t start, size_t end) {
+    auto iter = base_iter;
+    iter.SetPos(start);
+    for (size_t i = start; i < end; ++i) {
+      output_addr[i] = input_addr[iter.GetPos()];
+      iter.GenNextPos();
+    }
+  };
+  CPUKernelUtils::ParallelFor(task, size);
 }
 } // namespace kernel
 } // namespace mindspore
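Splitting the output across ParallelFor chunks works because SetPos(start) reproduces the state a serial walk would have reached at that offset. A small illustrative check (not part of the patch), reusing the 2x3 input with perm {1, 0}:

#include "backend/kernel_compiler/cpu/cpu_kernel.h"

void TransposeChunkCheck() {
  mindspore::kernel::TransposeIterator iter({3, 2}, {1, 0}, {2, 3});
  iter.SetPos(3);  // start of the second half of a 6-element output split at 3
  // iter.GetPos() now yields 4, then 2, then 5 after each GenNextPos() call,
  // the same tail that the full serial walk (0, 3, 1, 4, 2, 5) produces.
  iter.GenNextPos();
  iter.GenNextPos();
}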
@@ -34,13 +34,12 @@ class TransposeCPUFwdKernel : public CPUKernel {
               const std::vector<AddressPtr> &outputs) override;
 
  private:
-  void CheckParameter() const;
   template <typename T>
   void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
   std::vector<size_t> input_shape_;
   std::vector<size_t> output_shape_;
-  std::vector<int64_t> axes_;
+  std::vector<size_t> axes_;
   TypeId dtype_{kTypeUnknown};
   using TypeKernel =
     std::function<void(TransposeCPUFwdKernel *, const std::vector<AddressPtr> &, const std::vector<AddressPtr> &)>;