transpose opt for server part2

4 years ago · e115dd744a
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/transpose_server_fp32.c
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/transpose_server_fp32.c
@@ -0,0 +1,239 @@
 #ifdef BFC_MEMORY
 /**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "nnacl/fp32/transpose_server_fp32.h"

 /*
  * JUDGEPART(NUM): one carry step for axis NUM of a multi-dimensional index; must be expanded
  * directly inside a loop body because it contains `continue`.
  *
  * Requires the locals dim_start<NUM>, overflow_point<NUM>, stride<NUM> and in_offset to be in scope.
  *  - If dim_start<NUM> equals overflow_point<NUM>, the counter wraps to 0 and control falls
  *    through to whatever follows the macro (normally the carry step of the next outer axis).
  *  - Otherwise the counter is incremented, stride<NUM> is added to in_offset, and the enclosing
  *    loop proceeds to its next iteration.
  */
 #define JUDGEPART(NUM)                         \
  if (dim_start##NUM == overflow_point##NUM) { \
    dim_start##NUM = 0;                        \
  } else {                                     \
    ++dim_start##NUM;                          \
    in_offset += stride##NUM;                  \
    continue;                                  \
  }

 // Transposes one output block of a 3-axis tensor.
 //
 // The block is written contiguously starting at out_data + boundary_info->out_start_offset and is
 // processed in three pieces: a head of sizes[0] elements, a body of sizes[1] elements made of whole
 // rows of the last output axis, and a tail of sizes[2] elements.
 //
 // in_data          source buffer, indexed by element offset.
 // out_data         destination buffer.
 // overflow_points  per output axis, the index value at which that axis wraps back to 0.
 // strides          per output axis, the amount added to the input offset when that axis advances
 //                  (for the last axis: the step between consecutive elements of a row).
 // boundary_info    start offsets, piece sizes and starting axis counters of this block.
 void DoTransposeServerDim3(const float *in_data, float *out_data, const int64_t *overflow_points,
                            const int64_t *strides, const TransposeBlockBoundaryInfo *boundary_info) {
  const int64_t step0 = strides[0];
  const int64_t step1 = strides[1];
  const int64_t step2 = strides[THIRD_INPUT];
  const int64_t limit1 = overflow_points[1];
  const int64_t limit2 = overflow_points[THIRD_INPUT];
  const int64_t row_len = limit2 + 1;

  // Head: a strided run starting at in_offsets[0].
  float *dst = out_data + boundary_info->out_start_offset;
  const int64_t head_len = boundary_info->sizes[0];
  const int64_t head_src = boundary_info->in_offsets[0];
  for (int64_t k = 0; k < head_len; ++k) {
    dst[k] = in_data[head_src + k * step2];
  }
  dst += head_len;

  // Body: whole rows, starting at in_offsets[1] with axis counters taken from start_dim.
  const int64_t body_len = boundary_info->sizes[1];
  int64_t src = boundary_info->in_offsets[1];
  int64_t pos1 = boundary_info->start_dim[1];
  for (int64_t row = 0; row < body_len; row += row_len) {
    float *out_row = dst + row;
    for (int64_t col = 0; col < limit2; ++col) {
      out_row[col] = in_data[src];
      src += step2;
    }
    out_row[limit2] = in_data[src];
    // Advance axis 1; when it wraps, advance axis 0 instead.
    if (pos1 != limit1) {
      ++pos1;
      src += step1;
      continue;
    }
    pos1 = 0;
    src += step0;
  }
  dst += body_len;

  // Tail: continues from wherever the body left the input offset.
  const int64_t tail_len = boundary_info->sizes[THIRD_INPUT];
  for (int64_t k = 0; k < tail_len; ++k) {
    dst[k] = in_data[src + k * step2];
  }
 }

 // Transposes one output block of a 4-axis tensor.
 //
 // The block is written contiguously starting at out_data + boundary_info->out_start_offset and is
 // processed in three pieces: a head of sizes[0] elements, a body of sizes[1] elements made of whole
 // rows of the last output axis, and a tail of sizes[2] elements.
 //
 // in_data          source buffer, indexed by element offset.
 // out_data         destination buffer.
 // overflow_points  per output axis, the index value at which that axis wraps back to 0.
 // strides          per output axis, the amount added to the input offset when that axis advances
 //                  (for the last axis: the step between consecutive elements of a row).
 // boundary_info    start offsets, piece sizes and starting axis counters of this block.
 void DoTransposeServerDim4(const float *in_data, float *out_data, const int64_t *overflow_points,
                            const int64_t *strides, const TransposeBlockBoundaryInfo *boundary_info) {
  const int64_t step0 = strides[0];
  const int64_t step1 = strides[1];
  const int64_t step2 = strides[THIRD_INPUT];
  const int64_t step3 = strides[FOURTH_INPUT];
  const int64_t limit1 = overflow_points[1];
  const int64_t limit2 = overflow_points[THIRD_INPUT];
  const int64_t limit3 = overflow_points[FOURTH_INPUT];
  const int64_t row_len = limit3 + 1;

  // Head: a strided run starting at in_offsets[0].
  float *dst = out_data + boundary_info->out_start_offset;
  const int64_t head_len = boundary_info->sizes[0];
  const int64_t head_src = boundary_info->in_offsets[0];
  for (int64_t k = 0; k < head_len; ++k) {
    dst[k] = in_data[head_src + k * step3];
  }
  dst += head_len;

  // Body: whole rows, starting at in_offsets[1] with axis counters taken from start_dim.
  const int64_t body_len = boundary_info->sizes[1];
  int64_t src = boundary_info->in_offsets[1];
  int64_t pos1 = boundary_info->start_dim[1];
  int64_t pos2 = boundary_info->start_dim[THIRD_INPUT];
  for (int64_t row = 0; row < body_len; row += row_len) {
    float *out_row = dst + row;
    for (int64_t col = 0; col < limit3; ++col) {
      out_row[col] = in_data[src];
      src += step3;
    }
    out_row[limit3] = in_data[src];
    // Carry from the innermost outer axis outwards; the first axis that does not wrap ends the step.
    if (pos2 != limit2) {
      ++pos2;
      src += step2;
      continue;
    }
    pos2 = 0;
    if (pos1 != limit1) {
      ++pos1;
      src += step1;
      continue;
    }
    pos1 = 0;
    src += step0;
  }
  dst += body_len;

  // Tail: continues from wherever the body left the input offset.
  const int64_t tail_len = boundary_info->sizes[THIRD_INPUT];
  for (int64_t k = 0; k < tail_len; ++k) {
    dst[k] = in_data[src + k * step3];
  }
 }

 // Transposes one output block of a 5-axis tensor.
 //
 // The block is written contiguously starting at out_data + boundary_info->out_start_offset and is
 // processed in three pieces: a head of sizes[0] elements, a body of sizes[1] elements made of whole
 // rows of the last output axis, and a tail of sizes[2] elements.
 //
 // in_data          source buffer, indexed by element offset.
 // out_data         destination buffer.
 // overflow_points  per output axis, the index value at which that axis wraps back to 0.
 // strides          per output axis, the amount added to the input offset when that axis advances
 //                  (for the last axis: the step between consecutive elements of a row).
 // boundary_info    start offsets, piece sizes and starting axis counters of this block.
 void DoTransposeServerDim5(const float *in_data, float *out_data, const int64_t *overflow_points,
                            const int64_t *strides, const TransposeBlockBoundaryInfo *boundary_info) {
  const int64_t step0 = strides[0];
  const int64_t step1 = strides[1];
  const int64_t step2 = strides[THIRD_INPUT];
  const int64_t step3 = strides[FOURTH_INPUT];
  const int64_t step4 = strides[FIFTH_INPUT];
  const int64_t limit1 = overflow_points[1];
  const int64_t limit2 = overflow_points[THIRD_INPUT];
  const int64_t limit3 = overflow_points[FOURTH_INPUT];
  const int64_t limit4 = overflow_points[FIFTH_INPUT];
  const int64_t row_len = limit4 + 1;

  // Head: a strided run starting at in_offsets[0].
  float *dst = out_data + boundary_info->out_start_offset;
  const int64_t head_len = boundary_info->sizes[0];
  const int64_t head_src = boundary_info->in_offsets[0];
  for (int64_t k = 0; k < head_len; ++k) {
    dst[k] = in_data[head_src + k * step4];
  }
  dst += head_len;

  // Body: whole rows, starting at in_offsets[1] with axis counters taken from start_dim.
  const int64_t body_len = boundary_info->sizes[1];
  int64_t src = boundary_info->in_offsets[1];
  int64_t pos1 = boundary_info->start_dim[1];
  int64_t pos2 = boundary_info->start_dim[THIRD_INPUT];
  int64_t pos3 = boundary_info->start_dim[FOURTH_INPUT];
  for (int64_t row = 0; row < body_len; row += row_len) {
    float *out_row = dst + row;
    for (int64_t col = 0; col < limit4; ++col) {
      out_row[col] = in_data[src];
      src += step4;
    }
    out_row[limit4] = in_data[src];
    // Carry from the innermost outer axis outwards; the first axis that does not wrap ends the step.
    if (pos3 != limit3) {
      ++pos3;
      src += step3;
      continue;
    }
    pos3 = 0;
    if (pos2 != limit2) {
      ++pos2;
      src += step2;
      continue;
    }
    pos2 = 0;
    if (pos1 != limit1) {
      ++pos1;
      src += step1;
      continue;
    }
    pos1 = 0;
    src += step0;
  }
  dst += body_len;

  // Tail: continues from wherever the body left the input offset.
  const int64_t tail_len = boundary_info->sizes[THIRD_INPUT];
  for (int64_t k = 0; k < tail_len; ++k) {
    dst[k] = in_data[src + k * step4];
  }
 }

 // Transposes one output block of a 6-axis tensor.
 //
 // The block is written contiguously starting at out_data + boundary_info->out_start_offset and is
 // processed in three pieces: a head of sizes[0] elements, a body of sizes[1] elements made of whole
 // rows of the last output axis, and a tail of sizes[2] elements.
 //
 // in_data          source buffer, indexed by element offset.
 // out_data         destination buffer.
 // overflow_points  per output axis, the index value at which that axis wraps back to 0.
 // strides          per output axis, the amount added to the input offset when that axis advances
 //                  (for the last axis: the step between consecutive elements of a row).
 // boundary_info    start offsets, piece sizes and starting axis counters of this block.
 void DoTransposeServerDim6(const float *in_data, float *out_data, const int64_t *overflow_points,
                            const int64_t *strides, const TransposeBlockBoundaryInfo *boundary_info) {
  const int64_t step0 = strides[0];
  const int64_t step1 = strides[1];
  const int64_t step2 = strides[THIRD_INPUT];
  const int64_t step3 = strides[FOURTH_INPUT];
  const int64_t step4 = strides[FIFTH_INPUT];
  const int64_t step5 = strides[SIXTH_INPUT];
  const int64_t limit1 = overflow_points[1];
  const int64_t limit2 = overflow_points[THIRD_INPUT];
  const int64_t limit3 = overflow_points[FOURTH_INPUT];
  const int64_t limit4 = overflow_points[FIFTH_INPUT];
  const int64_t limit5 = overflow_points[SIXTH_INPUT];
  const int64_t row_len = limit5 + 1;

  // Head: a strided run starting at in_offsets[0].
  float *dst = out_data + boundary_info->out_start_offset;
  const int64_t head_len = boundary_info->sizes[0];
  const int64_t head_src = boundary_info->in_offsets[0];
  for (int64_t k = 0; k < head_len; ++k) {
    dst[k] = in_data[head_src + k * step5];
  }
  dst += head_len;

  // Body: whole rows, starting at in_offsets[1] with axis counters taken from start_dim.
  const int64_t body_len = boundary_info->sizes[1];
  int64_t src = boundary_info->in_offsets[1];
  int64_t pos1 = boundary_info->start_dim[1];
  int64_t pos2 = boundary_info->start_dim[THIRD_INPUT];
  int64_t pos3 = boundary_info->start_dim[FOURTH_INPUT];
  int64_t pos4 = boundary_info->start_dim[FIFTH_INPUT];
  for (int64_t row = 0; row < body_len; row += row_len) {
    float *out_row = dst + row;
    for (int64_t col = 0; col < limit5; ++col) {
      out_row[col] = in_data[src];
      src += step5;
    }
    out_row[limit5] = in_data[src];
    // Carry from the innermost outer axis outwards; the first axis that does not wrap ends the step.
    if (pos4 != limit4) {
      ++pos4;
      src += step4;
      continue;
    }
    pos4 = 0;
    if (pos3 != limit3) {
      ++pos3;
      src += step3;
      continue;
    }
    pos3 = 0;
    if (pos2 != limit2) {
      ++pos2;
      src += step2;
      continue;
    }
    pos2 = 0;
    if (pos1 != limit1) {
      ++pos1;
      src += step1;
      continue;
    }
    pos1 = 0;
    src += step0;
  }
  dst += body_len;

  // Tail: continues from wherever the body left the input offset.
  const int64_t tail_len = boundary_info->sizes[THIRD_INPUT];
  for (int64_t k = 0; k < tail_len; ++k) {
    dst[k] = in_data[src + k * step5];
  }
 }

 // Transposes one output block of a tensor with axis_num axes.
 //
 // 3- to 6-axis tensors are dispatched to the unrolled DoTransposeServerDimN routines; every other
 // rank is handled by the generic loop below, which keeps the per-axis counters in an array.
 //
 // The block is written contiguously starting at out_data + boundary_info->out_start_offset and is
 // processed in three pieces: a head of sizes[0] elements read from in_offsets[0], a body of
 // sizes[1] elements made of whole rows of the last output axis read from in_offsets[1], and a tail
 // of sizes[2] elements that continues where the body stopped.
 //
 // in_data          source buffer, indexed by element offset.
 // out_data         destination buffer.
 // overflow_points  per output axis, the index value at which that axis wraps back to 0.
 // strides          per output axis, the amount added to the input offset when that axis advances
 //                  (for the last axis: the step between consecutive elements of a row).
 // axis_num         number of axes; the generic path indexes axis_num - 2 and copies axis_num
 //                  counters into a MAX_TRANSPOSE_DIM_SIZE array, so it expects
 //                  2 <= axis_num <= MAX_TRANSPOSE_DIM_SIZE.
 // boundary_info    start offsets, piece sizes and starting axis counters of this block.
 void DoTransposeServer(const float *in_data, float *out_data, const int64_t *overflow_points, const int64_t *strides,
                        int axis_num, const TransposeBlockBoundaryInfo *boundary_info) {
  if (axis_num == DIMENSION_3D) {
    DoTransposeServerDim3(in_data, out_data, overflow_points, strides, boundary_info);
    return;
  } else if (axis_num == DIMENSION_4D) {
    DoTransposeServerDim4(in_data, out_data, overflow_points, strides, boundary_info);
    return;
  } else if (axis_num == DIMENSION_5D) {
    DoTransposeServerDim5(in_data, out_data, overflow_points, strides, boundary_info);
    return;
  } else if (axis_num == DIMENSION_6D) {
    DoTransposeServerDim6(in_data, out_data, overflow_points, strides, boundary_info);
    return;
  }
  out_data += boundary_info->out_start_offset;
  int64_t stride = strides[axis_num - 1];
  // Head: a strided run starting at in_offsets[0].
  int64_t size = boundary_info->sizes[0];
  int64_t in_offset = boundary_info->in_offsets[0];
  for (int64_t i = 0; i < size; ++i) {
    out_data[i] = in_data[in_offset + i * stride];
  }
  // `{0}` rather than `{}`: the empty initializer is not valid C before C23.
  int64_t dim_info[MAX_TRANSPOSE_DIM_SIZE] = {0};
  for (int i = 0; i < axis_num; ++i) {
    dim_info[i] = boundary_info->start_dim[i];
  }
  int64_t last_overflow_point = overflow_points[axis_num - 1];
  int64_t last_dim = last_overflow_point + 1;
  out_data += size;
  size = boundary_info->sizes[1];
  // The body starts at its own recorded input offset, exactly as in the DimN routines. Without
  // this reset the body and the tail would read from the head's starting position.
  in_offset = boundary_info->in_offsets[1];
  for (int64_t i = 0; i < size; i += last_dim) {
    for (int64_t j = 0; j < last_overflow_point; ++j) {
      out_data[i + j] = in_data[in_offset];
      in_offset += stride;
    }
    out_data[i + last_overflow_point] = in_data[in_offset];
    // Carry from the innermost outer axis outwards. Axis 0 is never tested for wrap-around (the
    // DimN routines step it unconditionally); stopping at j == 0 also keeps j from reaching -1
    // after the final row of the tensor, where every outer axis sits at its overflow point.
    int j = axis_num - 2;
    while (j > 0 && dim_info[j] == overflow_points[j]) {
      dim_info[j] = 0;
      --j;
    }
    ++dim_info[j];
    in_offset += strides[j];
  }
  // Tail: continues from wherever the body left the input offset.
  out_data += size;
  size = boundary_info->sizes[THIRD_INPUT];
  for (int64_t i = 0; i < size; ++i) {
    out_data[i] = in_data[in_offset + i * stride];
  }
 }
 #endif
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/transpose_server_fp32.h
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/transpose_server_fp32.h
@@ -0,0 +1,40 @@
 /**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_NNACL_FP32_TRANSPOSE_SERVER_FP32_H_
 #define MINDSPORE_NNACL_FP32_TRANSPOSE_SERVER_FP32_H_

 #ifdef BFC_MEMORY
 #include "nnacl/transpose.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

 // Describes one contiguous block of the transposed output that a single DoTransposeServer call
 // produces. The block is handled as three consecutive pieces; throughout this header they are
 // called head, body and tail.
 typedef struct TransposeBlockBoundaryInfo {
  // Element offset into the output buffer where this block begins.
  int64_t out_start_offset;
  // Element counts of the three pieces, in order: head, body, tail.
  int64_t sizes[C3NUM];
  // Input element offsets: [0] where the head starts reading, [1] where the body starts reading.
  int64_t in_offsets[C2NUM];
  // Initial per-axis counters used while stepping through the body.
  int64_t start_dim[MAX_TRANSPOSE_DIM_SIZE];
 } TransposeBlockBoundaryInfo;

 // Transposes the single output block described by boundary_info from in_data into out_data.
 // overflow_points and strides are per-output-axis arrays with axis_num entries each; axis_num is
 // the tensor rank.
 void DoTransposeServer(const float *in_data, float *out_data, const int64_t *overflow_points, const int64_t *strides,
                        int axis_num, const TransposeBlockBoundaryInfo *boundary_info);
 #ifdef __cplusplus
 };
 #endif

 #endif  // MINDSPORE_NNACL_FP32_TRANSPOSE_SERVER_FP32_H_
 #endif
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.cc
@@ -1,5 +1,5 @@
 /**
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 * Copyright 2020-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.h
@@ -1,5 +1,5 @@
 /**
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 * Copyright 2020-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.cc
@@ -1,5 +1,6 @@
 #ifndef BFC_MEMORY
 /**
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 * Copyright 2020-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -55,3 +56,4 @@ int TransposeCPUKernel::DoTransposeMultiThread(int task_id) {
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Transpose, LiteKernelCreator<TransposeCPUKernel>)
 REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_Transpose, LiteKernelCreator<TransposeCPUKernel>)
 }  // namespace mindspore::kernel
 #endif
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.h
@@ -1,5 +1,5 @@
 /**
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 * Copyright 2020-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 #ifndef MINDSPORE_CCSRC_KERNEL_CPU_ARM_FP32_TRANSPOSE_FP32_H_
 #define MINDSPORE_CCSRC_KERNEL_CPU_ARM_FP32_TRANSPOSE_FP32_H_

 #ifndef BFC_MEMORY
 #include <vector>
 #include "src/runtime/kernel/arm/base/transpose_base.h"

@@ -36,3 +37,4 @@ class TransposeCPUKernel : public TransposeBaseCPUKernel {
 }  // namespace mindspore::kernel

 #endif  // MINDSPORE_CCSRC_KERNEL_CPU_ARM_FP32_TRANSPOSE_FP32_H_
 #endif
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/transpose_server_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/transpose_server_fp32.cc
@@ -0,0 +1,134 @@
 #ifdef BFC_MEMORY
 /**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "src/runtime/kernel/arm/fp32/transpose_server_fp32.h"
 #include "src/kernel_registry.h"
 #include "nnacl/fp32/pack_fp32.h"

 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Transpose;

 namespace mindspore::kernel {
 namespace {
 // Element-count threshold (1 << 18 = 262144) for threading in ChooseThreadCuttingStrategy():
 // tensors with at most this many elements run on one thread, and larger ones get at most
 // ceil(element_num / kMinCostPerThread) threads.
 constexpr int64_t kMinCostPerThread = 1 << 18;
 }
 // Runs the base-class resize, then, unless the kernel is invalid or the optimized path is
 // selected, precomputes the stride tables and the per-thread block layout.
 // Returns the base-class error code on failure, otherwise the result of the layout step or RET_OK.
 int TransposeServerCPUKernel::ReSize() {
  const auto base_status = TransposeBaseCPUKernel::ReSize();
  if (base_status != RET_OK) {
    MS_LOG(ERROR) << "Do transpose resize failed.";
    return base_status;
  }
  // The generic server path is only prepared when it will actually be used.
  const bool use_server_path = is_valid_ && !opt_run_;
  if (use_server_path) {
    ComputeIndividualOfflineInfo();
    return ChooseThreadCuttingStrategy();
  }
  return RET_OK;
 }

 // Fills overflow_points_ and strides_ for the current shapes.
 //  - overflow_points_[i] is the last valid index of output axis i (out_shape_[i] - 1).
 //  - strides_[i] starts as the input stride of the axis permuted to output position i and is then
 //    rewritten, for every axis but the last, into a "carry" stride: the amount to add to the input
 //    offset when axis i advances after all inner axes have run up to their last index.
 void TransposeServerCPUKernel::ComputeIndividualOfflineInfo() {
  MS_ASSERT(param_->num_axes_ >= C3NUM);
  overflow_points_.resize(param_->num_axes_);
  strides_.resize(param_->num_axes_);
  for (int axis = 0; axis < param_->num_axes_; ++axis) {
    overflow_points_[axis] = (out_shape_[axis] - 1);
    strides_[axis] = param_->strides_[param_->perm_[axis]];
  }
  // Snapshot of the plain permuted strides; strides_ is overwritten in place below.
  const std::vector<int64_t> raw_strides = strides_;
  for (int axis = param_->num_axes_ - C2NUM; axis >= 0; --axis) {
    const int64_t inner_raw = raw_strides[axis + 1];
    // Iterating inner-to-outer means strides_[axis + 1] already holds its carry stride here.
    strides_[axis] = raw_strides[axis] - inner_raw - inner_raw * overflow_points_[axis + 1] + strides_[axis + 1];
  }
 }

 int TransposeServerCPUKernel::ChooseThreadCuttingStrategy() {
  block_boundary_infos_.clear();
  int64_t element_num = in_tensors_.front()->ElementsNum();
  if (element_num <= kMinCostPerThread) {
    thread_num_ = 1;
  } else {
    thread_num_ = MSMIN(op_parameter_->thread_num_, UP_DIV(element_num, kMinCostPerThread));
  }
  if (thread_num_ < 1) {
    thread_num_ = 1;
  }
  if (thread_num_ > C4NUM) {
    thread_num_ = C4NUM;
  }
  int64_t block_size = element_num / thread_num_;
  int64_t remain_data = element_num - block_size * thread_num_;
  int64_t split_point = 0;
  block_boundary_infos_.clear();
  std::vector<int64_t> post_multi(param_->num_axes_, 1);
  for (int i = param_->num_axes_ - C2NUM; i >= 0; --i) {
    post_multi[i] = post_multi[i + 1] * out_shape_[i + 1];
  }
  while (split_point < element_num) {
    TransposeBlockBoundaryInfo block_boundary_info;
    int64_t in_offset = 0;
    block_boundary_info.out_start_offset = split_point;
    for (int i = 0; i < param_->num_axes_; ++i) {
      block_boundary_info.start_dim[i] = split_point / post_multi[i] % out_shape_[i];
      in_offset += block_boundary_info.start_dim[i] * param_->strides_[param_->perm_[i]];
    }
    block_boundary_info.in_offsets[0] = in_offset;
    split_point += block_size;
    if (remain_data > 0) {
      ++split_point;
      --remain_data;
    }
    if (split_point > element_num) {
      split_point = element_num;
    }
    int64_t size = split_point - block_boundary_info.out_start_offset;
    int last_axis_index = param_->num_axes_ - 1;
    block_boundary_info.sizes[0] =
      MSMIN(size, out_shape_[last_axis_index] - block_boundary_info.start_dim[last_axis_index]);
    size -= block_boundary_info.sizes[0];
    block_boundary_info.sizes[1] = DOWN_ROUND(size, out_shape_[last_axis_index]);
    block_boundary_info.sizes[C2NUM] = size - block_boundary_info.sizes[1];
    int64_t out_offset = block_boundary_info.out_start_offset + block_boundary_info.sizes[0];
    in_offset = 0;
    for (int i = 0; i < param_->num_axes_; ++i) {
      block_boundary_info.start_dim[i] = out_offset / post_multi[i] % out_shape_[i];
      in_offset += block_boundary_info.start_dim[i] * param_->strides_[param_->perm_[i]];
    }
    block_boundary_info.in_offsets[1] = in_offset;
    block_boundary_infos_.push_back(block_boundary_info);
  }
  thread_num_ = block_boundary_infos_.size();
  return RET_OK;
 }

 // Single-thread entry point: runs the multi-thread routine as task 0.
 int TransposeServerCPUKernel::DoTransposeSingleThread() { return DoTransposeMultiThread(0); }

 // Executes the share of the transpose belonging to task_id.
 // On the generic path, task_id selects the precomputed block in block_boundary_infos_; on the
 // optimized path it is forwarded to PackNHWCToNCHWFp32 together with thread_num_.
 // Always returns RET_OK.
 int TransposeServerCPUKernel::DoTransposeMultiThread(int task_id) {
  if (!opt_run_) {
    DoTransposeServer(static_cast<float *>(in_data_), static_cast<float *>(out_data_), overflow_points_.data(),
                      strides_.data(), param_->num_axes_, &block_boundary_infos_[task_id]);
    return RET_OK;
  }
  PackNHWCToNCHWFp32(in_data_, out_data_, opt_param_[FIRST_INPUT], opt_param_[SECOND_INPUT], opt_param_[THIRD_INPUT],
                     task_id, thread_num_);
  return RET_OK;
 }

 // Register TransposeServerCPUKernel for Transpose with float32 and int32 data types.
 // NOTE(review): the int32 registration goes through the same float-pointer copy path
 // (static_cast<float *> in DoTransposeMultiThread) — presumably relied on as a same-width
 // element copy; confirm.
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Transpose, LiteKernelCreator<TransposeServerCPUKernel>)
 REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_Transpose, LiteKernelCreator<TransposeServerCPUKernel>)
 }  // namespace mindspore::kernel
 #endif
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/transpose_server_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/transpose_server_fp32.h
@@ -0,0 +1,47 @@
 /**
 * Copyright 2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_CCSRC_KERNEL_CPU_ARM_FP32_TRANSPOSE_SERVER_FP32_H_
 #define MINDSPORE_CCSRC_KERNEL_CPU_ARM_FP32_TRANSPOSE_SERVER_FP32_H_

 #ifdef BFC_MEMORY
 #include <vector>
 #include "src/runtime/kernel/arm/base/transpose_base.h"
 #include "nnacl/fp32/transpose_server_fp32.h"

 namespace mindspore::kernel {
 // Transpose kernel compiled only when BFC_MEMORY is defined. It extends TransposeBaseCPUKernel
 // with precomputed stride tables and a per-thread block layout that are passed to the nnacl
 // DoTransposeServer routine.
 class TransposeServerCPUKernel : public TransposeBaseCPUKernel {
 public:
  explicit TransposeServerCPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs,
                                    const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
      : TransposeBaseCPUKernel(param, inputs, outputs, ctx) {}
  ~TransposeServerCPUKernel() override = default;

  // Re-derives the stride tables and block layout after the base class has resized.
  int ReSize() override;

 private:
  // Fills overflow_points_ and strides_.
  void ComputeIndividualOfflineInfo();
  // Fills block_boundary_infos_ and sets the thread count to the number of blocks.
  int ChooseThreadCuttingStrategy();
  int DoTransposeSingleThread() override;
  int DoTransposeMultiThread(int task_id) override;

  // Per output axis: last valid index (extent - 1).
  std::vector<int64_t> overflow_points_;
  // Per output axis: input-offset increment applied when that axis advances.
  std::vector<int64_t> strides_;
  // One entry per task; indexed by task_id in DoTransposeMultiThread.
  std::vector<TransposeBlockBoundaryInfo> block_boundary_infos_;
 };
 }  // namespace mindspore::kernel

 #endif  // MINDSPORE_CCSRC_KERNEL_CPU_ARM_FP32_TRANSPOSE_SERVER_FP32_H_
 #endif