Browse Source

transpose opt for server part2

r1.7
xuanyue 4 years ago
parent
commit
e115dd744a
8 changed files with 468 additions and 4 deletions
  1. +239
    -0
      mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/transpose_server_fp32.c
  2. +40
    -0
      mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/transpose_server_fp32.h
  3. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.cc
  4. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.h
  5. +3
    -1
      mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.cc
  6. +3
    -1
      mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.h
  7. +134
    -0
      mindspore/lite/src/runtime/kernel/arm/fp32/transpose_server_fp32.cc
  8. +47
    -0
      mindspore/lite/src/runtime/kernel/arm/fp32/transpose_server_fp32.h

+ 239
- 0
mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/transpose_server_fp32.c View File

@@ -0,0 +1,239 @@
#ifdef BFC_MEMORY
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "nnacl/fp32/transpose_server_fp32.h"

/*
 * JUDGEPART(NUM): carry step for output dimension NUM, written to be pasted inside the per-row
 * `for` loop of the DoTransposeServerDimN functions.
 *
 * It expects these locals to be in scope at the expansion site (names are built by token pasting):
 *   dim_start<NUM>, overflow_point<NUM>, stride<NUM>, in_offset.
 *
 * - If dim_start<NUM> already equals overflow_point<NUM>, the counter is reset to 0 and control falls
 *   through to whatever statement follows the macro (normally JUDGEPART for the next outer dimension).
 * - Otherwise the counter is incremented, stride<NUM> is added to in_offset, and `continue` jumps to the
 *   next iteration of the ENCLOSING loop, skipping every statement placed after the macro.
 *
 * Because of the embedded `continue`, the macro is only valid directly inside a loop body.
 */
#define JUDGEPART(NUM) \
if (dim_start##NUM == overflow_point##NUM) { \
dim_start##NUM = 0; \
} else { \
++dim_start##NUM; \
in_offset += stride##NUM; \
continue; \
}

/*
 * Copies one block of a 3-axis transpose from in_data to out_data.
 *
 * Output is written contiguously starting at out_data + boundary_info->out_start_offset, in three pieces
 * whose lengths come from boundary_info->sizes:
 *   sizes[0]           leading elements, read starting at in_offsets[0] with step strides[THIRD_INPUT];
 *   sizes[1]           middle part, walked in chunks of (overflow_points[THIRD_INPUT] + 1) elements,
 *                      starting at in_offsets[1];
 *   sizes[THIRD_INPUT] trailing elements, read from wherever the middle walk left in_offset
 *                      (in_offsets[1] if the middle part is empty).
 *
 * strides[k] is the amount added to in_offset when dimension k advances.
 * NOTE(review): strides[0] and strides[1] look like pre-adjusted "jump" values that already account for
 * the inner dimensions wrapping - confirm against the caller that builds them.
 */
void DoTransposeServerDim3(const float *in_data, float *out_data, const int64_t *overflow_points,
const int64_t *strides, const TransposeBlockBoundaryInfo *boundary_info) {
int64_t stride2 = strides[THIRD_INPUT];
int64_t size = boundary_info->sizes[0];
int64_t in_offset = boundary_info->in_offsets[0];
out_data += boundary_info->out_start_offset;
// Piece 1: leading elements.
for (int64_t i = 0; i < size; ++i) {
out_data[i] = in_data[in_offset + i * stride2];
}
// dim_start1 / overflow_point1 / stride1 are consumed by JUDGEPART(1) through token pasting.
int64_t dim_start1 = boundary_info->start_dim[1];
int64_t overflow_point1 = overflow_points[1];
int64_t overflow_point2 = overflow_points[THIRD_INPUT];
int64_t stride0 = strides[0];
int64_t stride1 = strides[1];
int64_t last_dim = overflow_point2 + 1;
out_data += size;
size = boundary_info->sizes[1];
in_offset = boundary_info->in_offsets[1];
// Piece 2: one chunk of last_dim output elements per outer iteration.
for (int64_t i = 0; i < size; i += last_dim) {
// The first overflow_point2 elements advance in_offset; the last one is copied without advancing,
// so in_offset still addresses the chunk's final element when the carry step runs.
for (int64_t j = 0; j < overflow_point2; ++j) {
out_data[i + j] = in_data[in_offset];
in_offset += stride2;
}
out_data[i + overflow_point2] = in_data[in_offset];
// Advance dimension 1; unless it wrapped, the macro adds stride1 and `continue`s.
JUDGEPART(1)
// Reached only when dimension 1 wrapped to 0: advance dimension 0.
in_offset += stride0;
}
out_data += size;
size = boundary_info->sizes[THIRD_INPUT];
// Piece 3: trailing elements.
for (int64_t i = 0; i < size; ++i) {
out_data[i] = in_data[in_offset + i * stride2];
}
}

/*
 * Copies one block of a 4-axis transpose from in_data to out_data.
 *
 * Output is written contiguously starting at out_data + boundary_info->out_start_offset, in three pieces
 * whose lengths come from boundary_info->sizes:
 *   sizes[0]           leading elements, read starting at in_offsets[0] with step strides[FOURTH_INPUT];
 *   sizes[1]           middle part, walked in chunks of (overflow_points[FOURTH_INPUT] + 1) elements,
 *                      starting at in_offsets[1];
 *   sizes[THIRD_INPUT] trailing elements, read from wherever the middle walk left in_offset
 *                      (in_offsets[1] if the middle part is empty).
 *
 * strides[k] is the amount added to in_offset when dimension k advances.
 * NOTE(review): strides[0..2] look like pre-adjusted "jump" values that already account for the inner
 * dimensions wrapping - confirm against the caller that builds them.
 */
void DoTransposeServerDim4(const float *in_data, float *out_data, const int64_t *overflow_points,
const int64_t *strides, const TransposeBlockBoundaryInfo *boundary_info) {
int64_t stride3 = strides[FOURTH_INPUT];
int64_t size = boundary_info->sizes[0];
int64_t in_offset = boundary_info->in_offsets[0];
out_data += boundary_info->out_start_offset;
// Piece 1: leading elements.
for (int64_t i = 0; i < size; ++i) {
out_data[i] = in_data[in_offset + i * stride3];
}
// dim_startN / overflow_pointN / strideN are consumed by JUDGEPART(N) through token pasting.
int64_t dim_start1 = boundary_info->start_dim[1];
int64_t dim_start2 = boundary_info->start_dim[THIRD_INPUT];
int64_t overflow_point1 = overflow_points[1];
int64_t overflow_point2 = overflow_points[THIRD_INPUT];
int64_t overflow_point3 = overflow_points[FOURTH_INPUT];
int64_t stride0 = strides[0];
int64_t stride1 = strides[1];
int64_t stride2 = strides[THIRD_INPUT];
int64_t last_dim = overflow_point3 + 1;
out_data += size;
size = boundary_info->sizes[1];
in_offset = boundary_info->in_offsets[1];
// Piece 2: one chunk of last_dim output elements per outer iteration.
for (int64_t i = 0; i < size; i += last_dim) {
// The last element of the chunk is copied without advancing in_offset.
for (int64_t j = 0; j < overflow_point3; ++j) {
out_data[i + j] = in_data[in_offset];
in_offset += stride3;
}
out_data[i + overflow_point3] = in_data[in_offset];
// Carry from the innermost tracked dimension outwards; the first dimension that has not
// reached its overflow point is advanced and the macro `continue`s.
JUDGEPART(2)
JUDGEPART(1)
// Reached only when dimensions 2 and 1 both wrapped to 0: advance dimension 0.
in_offset += stride0;
}
out_data += size;
size = boundary_info->sizes[THIRD_INPUT];
// Piece 3: trailing elements.
for (int64_t i = 0; i < size; ++i) {
out_data[i] = in_data[in_offset + i * stride3];
}
}

/*
 * Copies one block of a 5-axis transpose from in_data to out_data.
 *
 * Output is written contiguously starting at out_data + boundary_info->out_start_offset, in three pieces
 * whose lengths come from boundary_info->sizes:
 *   sizes[0]           leading elements, read starting at in_offsets[0] with step strides[FIFTH_INPUT];
 *   sizes[1]           middle part, walked in chunks of (overflow_points[FIFTH_INPUT] + 1) elements,
 *                      starting at in_offsets[1];
 *   sizes[THIRD_INPUT] trailing elements, read from wherever the middle walk left in_offset
 *                      (in_offsets[1] if the middle part is empty).
 *
 * strides[k] is the amount added to in_offset when dimension k advances.
 * NOTE(review): strides[0..3] look like pre-adjusted "jump" values that already account for the inner
 * dimensions wrapping - confirm against the caller that builds them.
 */
void DoTransposeServerDim5(const float *in_data, float *out_data, const int64_t *overflow_points,
const int64_t *strides, const TransposeBlockBoundaryInfo *boundary_info) {
int64_t stride4 = strides[FIFTH_INPUT];
int64_t size = boundary_info->sizes[0];
int64_t in_offset = boundary_info->in_offsets[0];
out_data += boundary_info->out_start_offset;
// Piece 1: leading elements.
for (int64_t i = 0; i < size; ++i) {
out_data[i] = in_data[in_offset + i * stride4];
}
// dim_startN / overflow_pointN / strideN are consumed by JUDGEPART(N) through token pasting.
int64_t dim_start1 = boundary_info->start_dim[1];
int64_t dim_start2 = boundary_info->start_dim[THIRD_INPUT];
int64_t dim_start3 = boundary_info->start_dim[FOURTH_INPUT];
int64_t overflow_point1 = overflow_points[1];
int64_t overflow_point2 = overflow_points[THIRD_INPUT];
int64_t overflow_point3 = overflow_points[FOURTH_INPUT];
int64_t overflow_point4 = overflow_points[FIFTH_INPUT];
int64_t stride0 = strides[0];
int64_t stride1 = strides[1];
int64_t stride2 = strides[THIRD_INPUT];
int64_t stride3 = strides[FOURTH_INPUT];
int64_t last_dim = overflow_point4 + 1;
out_data += size;
size = boundary_info->sizes[1];
in_offset = boundary_info->in_offsets[1];
// Piece 2: one chunk of last_dim output elements per outer iteration.
for (int64_t i = 0; i < size; i += last_dim) {
// The last element of the chunk is copied without advancing in_offset.
for (int64_t j = 0; j < overflow_point4; ++j) {
out_data[i + j] = in_data[in_offset];
in_offset += stride4;
}
out_data[i + overflow_point4] = in_data[in_offset];
// Carry from the innermost tracked dimension outwards; the first dimension that has not
// reached its overflow point is advanced and the macro `continue`s.
JUDGEPART(3)
JUDGEPART(2)
JUDGEPART(1)
// Reached only when dimensions 3, 2 and 1 all wrapped to 0: advance dimension 0.
in_offset += stride0;
}
out_data += size;
size = boundary_info->sizes[THIRD_INPUT];
// Piece 3: trailing elements.
for (int64_t i = 0; i < size; ++i) {
out_data[i] = in_data[in_offset + i * stride4];
}
}

/*
 * Copies one block of a 6-axis transpose from in_data to out_data.
 *
 * Output is written contiguously starting at out_data + boundary_info->out_start_offset, in three pieces
 * whose lengths come from boundary_info->sizes:
 *   sizes[0]           leading elements, read starting at in_offsets[0] with step strides[SIXTH_INPUT];
 *   sizes[1]           middle part, walked in chunks of (overflow_points[SIXTH_INPUT] + 1) elements,
 *                      starting at in_offsets[1];
 *   sizes[THIRD_INPUT] trailing elements, read from wherever the middle walk left in_offset
 *                      (in_offsets[1] if the middle part is empty).
 *
 * strides[k] is the amount added to in_offset when dimension k advances.
 * NOTE(review): strides[0..4] look like pre-adjusted "jump" values that already account for the inner
 * dimensions wrapping - confirm against the caller that builds them.
 */
void DoTransposeServerDim6(const float *in_data, float *out_data, const int64_t *overflow_points,
const int64_t *strides, const TransposeBlockBoundaryInfo *boundary_info) {
int64_t stride5 = strides[SIXTH_INPUT];
int64_t size = boundary_info->sizes[0];
int64_t in_offset = boundary_info->in_offsets[0];
out_data += boundary_info->out_start_offset;
// Piece 1: leading elements.
for (int64_t i = 0; i < size; ++i) {
out_data[i] = in_data[in_offset + i * stride5];
}
// dim_startN / overflow_pointN / strideN are consumed by JUDGEPART(N) through token pasting.
int64_t dim_start1 = boundary_info->start_dim[1];
int64_t dim_start2 = boundary_info->start_dim[THIRD_INPUT];
int64_t dim_start3 = boundary_info->start_dim[FOURTH_INPUT];
int64_t dim_start4 = boundary_info->start_dim[FIFTH_INPUT];
int64_t overflow_point1 = overflow_points[1];
int64_t overflow_point2 = overflow_points[THIRD_INPUT];
int64_t overflow_point3 = overflow_points[FOURTH_INPUT];
int64_t overflow_point4 = overflow_points[FIFTH_INPUT];
int64_t overflow_point5 = overflow_points[SIXTH_INPUT];
int64_t stride0 = strides[0];
int64_t stride1 = strides[1];
int64_t stride2 = strides[THIRD_INPUT];
int64_t stride3 = strides[FOURTH_INPUT];
int64_t stride4 = strides[FIFTH_INPUT];
int64_t last_dim = overflow_point5 + 1;
out_data += size;
size = boundary_info->sizes[1];
in_offset = boundary_info->in_offsets[1];
// Piece 2: one chunk of last_dim output elements per outer iteration.
for (int64_t i = 0; i < size; i += last_dim) {
// The last element of the chunk is copied without advancing in_offset.
for (int64_t j = 0; j < overflow_point5; ++j) {
out_data[i + j] = in_data[in_offset];
in_offset += stride5;
}
out_data[i + overflow_point5] = in_data[in_offset];
// Carry from the innermost tracked dimension outwards; the first dimension that has not
// reached its overflow point is advanced and the macro `continue`s.
JUDGEPART(4)
JUDGEPART(3)
JUDGEPART(2)
JUDGEPART(1)
// Reached only when dimensions 4, 3, 2 and 1 all wrapped to 0: advance dimension 0.
in_offset += stride0;
}
out_data += size;
size = boundary_info->sizes[THIRD_INPUT];
// Piece 3: trailing elements.
for (int64_t i = 0; i < size; ++i) {
out_data[i] = in_data[in_offset + i * stride5];
}
}

/*
 * Copies one block of an N-axis transpose from in_data to out_data.
 *
 * 3- to 6-axis inputs are dispatched to the unrolled DoTransposeServerDimN helpers; every other axis
 * count is handled by the generic loop below, which follows the same three-piece scheme:
 *   sizes[0]           leading elements, read starting at in_offsets[0] with step strides[axis_num - 1];
 *   sizes[1]           middle part, walked in chunks of (overflow_points[axis_num - 1] + 1) elements,
 *                      starting at in_offsets[1];
 *   sizes[THIRD_INPUT] trailing elements, read from wherever the middle walk left in_offset.
 *
 * @param in_data         source buffer.
 * @param out_data        destination buffer; writing starts at out_data + boundary_info->out_start_offset.
 * @param overflow_points per-dimension value at which that dimension's counter wraps back to 0.
 * @param strides         per-dimension amount added to in_offset when that dimension advances.
 * @param axis_num        number of axes. The generic path copies axis_num entries into a local array of
 *                        MAX_TRANSPOSE_DIM_SIZE elements, so axis_num must not exceed that bound.
 * @param boundary_info   start offsets, piece sizes and starting coordinates of the block to copy.
 */
void DoTransposeServer(const float *in_data, float *out_data, const int64_t *overflow_points, const int64_t *strides,
                       int axis_num, const TransposeBlockBoundaryInfo *boundary_info) {
  if (axis_num == DIMENSION_3D) {
    DoTransposeServerDim3(in_data, out_data, overflow_points, strides, boundary_info);
    return;
  } else if (axis_num == DIMENSION_4D) {
    DoTransposeServerDim4(in_data, out_data, overflow_points, strides, boundary_info);
    return;
  } else if (axis_num == DIMENSION_5D) {
    DoTransposeServerDim5(in_data, out_data, overflow_points, strides, boundary_info);
    return;
  } else if (axis_num == DIMENSION_6D) {
    DoTransposeServerDim6(in_data, out_data, overflow_points, strides, boundary_info);
    return;
  }
  out_data += boundary_info->out_start_offset;
  int64_t stride = strides[axis_num - 1];

  // Piece 1: leading elements.
  int64_t size = boundary_info->sizes[0];
  int64_t in_offset = boundary_info->in_offsets[0];
  for (int64_t i = 0; i < size; ++i) {
    out_data[i] = in_data[in_offset + i * stride];
  }

  // Running coordinates of the chunk currently being copied. `{0}` rather than `{}`: an empty
  // initializer list is not valid C before C23.
  int64_t dim_info[MAX_TRANSPOSE_DIM_SIZE] = {0};
  for (int i = 0; i < axis_num; ++i) {
    dim_info[i] = boundary_info->start_dim[i];
  }
  int64_t last_overflow_point = overflow_points[axis_num - 1];
  int64_t last_dim = last_overflow_point + 1;

  // Piece 2: one chunk of last_dim output elements per outer iteration.
  out_data += size;
  size = boundary_info->sizes[1];
  // The middle part has its own input start offset, exactly as in the DimN helpers. Without this
  // reload the walk would continue from in_offsets[0] and read the wrong input elements.
  in_offset = boundary_info->in_offsets[1];
  for (int64_t i = 0; i < size; i += last_dim) {
    // The last element of the chunk is copied without advancing in_offset.
    for (int64_t j = 0; j < last_overflow_point; ++j) {
      out_data[i + j] = in_data[in_offset];
      in_offset += stride;
    }
    out_data[i + last_overflow_point] = in_data[in_offset];

    // Carry from the second-to-last dimension outwards. The `j >= 0` bound keeps the search inside
    // dim_info / overflow_points when every outer dimension is already at its overflow point.
    int j = axis_num - 2;
    while (j >= 0 && dim_info[j] == overflow_points[j]) {
      dim_info[j] = 0;
      --j;
    }
    if (j < 0) {
      // Every outer dimension wrapped: there is no further chunk to step to.
      break;
    }
    ++dim_info[j];
    in_offset += strides[j];
  }

  // Piece 3: trailing elements.
  out_data += size;
  size = boundary_info->sizes[THIRD_INPUT];
  for (int64_t i = 0; i < size; ++i) {
    out_data[i] = in_data[in_offset + i * stride];
  }
}
#endif

+ 40
- 0
mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/transpose_server_fp32.h View File

@@ -0,0 +1,40 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_NNACL_FP32_TRANSPOSE_SERVER_FP32_H_
#define MINDSPORE_NNACL_FP32_TRANSPOSE_SERVER_FP32_H_

#ifdef BFC_MEMORY
#include "nnacl/transpose.h"

#ifdef __cplusplus
extern "C" {
#endif

/* Describes the slice of the flat output that one DoTransposeServer call copies. */
typedef struct TransposeBlockBoundaryInfo {
// Index of the first output element of this block.
int64_t out_start_offset;
// Element counts of the block's three consecutive pieces (leading, middle, trailing).
int64_t sizes[C3NUM];
// Input offsets at which the leading piece ([0]) and the middle piece ([1]) start.
int64_t in_offsets[C2NUM];
// Per-dimension starting coordinates used by the kernel's carry logic for the middle piece.
int64_t start_dim[MAX_TRANSPOSE_DIM_SIZE];
} TransposeBlockBoundaryInfo;

/*
 * Copies the block described by boundary_info from in_data into out_data for an axis_num-axis transpose.
 * overflow_points and strides are per-dimension arrays indexed by axis (at least axis_num entries each).
 */
void DoTransposeServer(const float *in_data, float *out_data, const int64_t *overflow_points, const int64_t *strides,
int axis_num, const TransposeBlockBoundaryInfo *boundary_info);
#ifdef __cplusplus
};  // extern "C"
#endif

#endif  // BFC_MEMORY
#endif  // MINDSPORE_NNACL_FP32_TRANSPOSE_SERVER_FP32_H_

+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020-2021 Huawei Technologies Co., Ltd
* Copyright 2020-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020-2021 Huawei Technologies Co., Ltd
* Copyright 2020-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 3
- 1
mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.cc View File

@@ -1,5 +1,6 @@
#ifndef BFC_MEMORY
/**
* Copyright 2020-2021 Huawei Technologies Co., Ltd
* Copyright 2020-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -55,3 +56,4 @@ int TransposeCPUKernel::DoTransposeMultiThread(int task_id) {
REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Transpose, LiteKernelCreator<TransposeCPUKernel>)
REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_Transpose, LiteKernelCreator<TransposeCPUKernel>)
} // namespace mindspore::kernel
#endif

+ 3
- 1
mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020-2021 Huawei Technologies Co., Ltd
* Copyright 2020-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
#ifndef MINDSPORE_CCSRC_KERNEL_CPU_ARM_FP32_TRANSPOSE_FP32_H_
#define MINDSPORE_CCSRC_KERNEL_CPU_ARM_FP32_TRANSPOSE_FP32_H_

#ifndef BFC_MEMORY
#include <vector>
#include "src/runtime/kernel/arm/base/transpose_base.h"

@@ -36,3 +37,4 @@ class TransposeCPUKernel : public TransposeBaseCPUKernel {
} // namespace mindspore::kernel

#endif  // BFC_MEMORY
#endif  // MINDSPORE_CCSRC_KERNEL_CPU_ARM_FP32_TRANSPOSE_FP32_H_

+ 134
- 0
mindspore/lite/src/runtime/kernel/arm/fp32/transpose_server_fp32.cc View File

@@ -0,0 +1,134 @@
#ifdef BFC_MEMORY
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp32/transpose_server_fp32.h"
#include "src/kernel_registry.h"
#include "nnacl/fp32/pack_fp32.h"

using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Transpose;

namespace mindspore::kernel {
namespace {
// Element-count threshold (2^18) used when choosing the thread count: tensors with at most this many
// elements run on one thread, larger ones get at most one thread per this many elements.
constexpr int64_t kMinCostPerThread = 1 << 18;
}
// Runs the base-class resize, then - only when the kernel is valid and not on the optimized
// pack path - precomputes the per-axis tables and the per-thread block split.
int TransposeServerCPUKernel::ReSize() {
  const auto base_status = TransposeBaseCPUKernel::ReSize();
  if (base_status != RET_OK) {
    MS_LOG(ERROR) << "Do transpose resize failed.";
    return base_status;
  }
  const bool needs_block_plan = is_valid_ && !opt_run_;
  if (needs_block_plan) {
    ComputeIndividualOfflineInfo();
    return ChooseThreadCuttingStrategy();
  }
  return RET_OK;
}

// Builds the two per-axis tables consumed by DoTransposeServer:
//   overflow_points_[i] = out_shape_[i] - 1, the last valid coordinate of output axis i;
//   strides_[i]         = the input-offset delta applied when output axis i advances by one.
// For the last axis the delta is the plain permuted input stride. For every outer axis it is computed
// from the permuted strides and the already-adjusted delta of the next inner axis.
void TransposeServerCPUKernel::ComputeIndividualOfflineInfo() {
  MS_ASSERT(param_->num_axes_ >= C3NUM);
  const int axis_count = param_->num_axes_;
  overflow_points_.resize(axis_count);
  strides_.resize(axis_count);
  for (int axis = 0; axis < axis_count; ++axis) {
    overflow_points_[axis] = (out_shape_[axis] - 1);
    strides_[axis] = param_->strides_[param_->perm_[axis]];
  }
  // Snapshot of the unadjusted permuted strides; strides_ is rewritten in place below.
  const std::vector<int64_t> permuted_strides = strides_;
  for (int axis = axis_count - C2NUM; axis >= 0; --axis) {
    const int inner = axis + 1;
    const int64_t inner_stride = permuted_strides[inner];
    strides_[axis] =
      permuted_strides[axis] - inner_stride - inner_stride * overflow_points_[inner] + strides_[inner];
  }
}

// Splits the flat output range [0, element_num) into contiguous blocks, one per worker, and records
// for each block what DoTransposeServer needs to copy it.
//
// Thread count: 1 when the tensor has at most kMinCostPerThread elements, otherwise the smaller of the
// configured thread count and ceil(element_num / kMinCostPerThread), then clamped to [1, C4NUM].
// Blocks get element_num / thread_num_ elements each, with the remainder handed out one extra element
// at a time to the first blocks. thread_num_ is finally set to the number of blocks actually produced.
//
// Each block is cut into three pieces:
//   sizes[0]     - elements up to the end of the last-axis row in which the block starts;
//   sizes[1]     - the following whole last-axis rows;
//   sizes[C2NUM] - the remaining partial row.
// in_offsets[0] is the input offset of the block's first element. in_offsets[1] and start_dim describe
// the first element after the leading piece.
//
// Always returns RET_OK.
int TransposeServerCPUKernel::ChooseThreadCuttingStrategy() {
  block_boundary_infos_.clear();
  int64_t element_num = in_tensors_.front()->ElementsNum();
  if (element_num <= kMinCostPerThread) {
    thread_num_ = 1;
  } else {
    thread_num_ = MSMIN(op_parameter_->thread_num_, UP_DIV(element_num, kMinCostPerThread));
  }
  if (thread_num_ < 1) {
    thread_num_ = 1;
  }
  if (thread_num_ > C4NUM) {
    thread_num_ = C4NUM;
  }
  int64_t block_size = element_num / thread_num_;
  int64_t remain_data = element_num - block_size * thread_num_;
  int64_t split_point = 0;
  block_boundary_infos_.reserve(thread_num_);

  // post_multi[i] = number of output elements spanned by one step of axis i.
  const int axis_count = param_->num_axes_;
  const int last_axis_index = axis_count - 1;
  std::vector<int64_t> post_multi(axis_count, 1);
  for (int i = axis_count - C2NUM; i >= 0; --i) {
    post_multi[i] = post_multi[i + 1] * out_shape_[i + 1];
  }

  // Writes the output coordinates of `flat_index` into `coords` and returns the matching input offset.
  auto locate = [&](int64_t flat_index, int64_t *coords) -> int64_t {
    int64_t offset = 0;
    for (int i = 0; i < axis_count; ++i) {
      coords[i] = flat_index / post_multi[i] % out_shape_[i];
      offset += coords[i] * param_->strides_[param_->perm_[i]];
    }
    return offset;
  };

  while (split_point < element_num) {
    // Value-initialized so the start_dim slots beyond axis_count hold zeros instead of indeterminate
    // values when the struct is copied into the vector.
    TransposeBlockBoundaryInfo block_boundary_info{};
    block_boundary_info.out_start_offset = split_point;
    block_boundary_info.in_offsets[0] = locate(split_point, block_boundary_info.start_dim);

    split_point += block_size;
    if (remain_data > 0) {
      ++split_point;
      --remain_data;
    }
    if (split_point > element_num) {
      split_point = element_num;
    }

    int64_t size = split_point - block_boundary_info.out_start_offset;
    // start_dim still holds the coordinates of the block's first element here.
    block_boundary_info.sizes[0] =
      MSMIN(size, out_shape_[last_axis_index] - block_boundary_info.start_dim[last_axis_index]);
    size -= block_boundary_info.sizes[0];
    block_boundary_info.sizes[1] = DOWN_ROUND(size, out_shape_[last_axis_index]);
    block_boundary_info.sizes[C2NUM] = size - block_boundary_info.sizes[1];

    // Re-anchor start_dim / in_offsets[1] at the first element after the leading piece.
    const int64_t out_offset = block_boundary_info.out_start_offset + block_boundary_info.sizes[0];
    block_boundary_info.in_offsets[1] = locate(out_offset, block_boundary_info.start_dim);
    block_boundary_infos_.push_back(block_boundary_info);
  }
  thread_num_ = block_boundary_infos_.size();
  return RET_OK;
}

// Single-threaded execution is the multi-threaded path run as task 0.
int TransposeServerCPUKernel::DoTransposeSingleThread() {
  constexpr int kOnlyTask = 0;
  return DoTransposeMultiThread(kOnlyTask);
}

int TransposeServerCPUKernel::DoTransposeMultiThread(int task_id) {
if (opt_run_) {
PackNHWCToNCHWFp32(in_data_, out_data_, opt_param_[FIRST_INPUT], opt_param_[SECOND_INPUT], opt_param_[THIRD_INPUT],
task_id, thread_num_);
return RET_OK;
}
DoTransposeServer(static_cast<float *>(in_data_), static_cast<float *>(out_data_), overflow_points_.data(),
strides_.data(), param_->num_axes_, &block_boundary_infos_[task_id]);
return RET_OK;
}

// Register TransposeServerCPUKernel as the CPU Transpose kernel for float32 and int32 tensors.
// NOTE(review): the int32 registration goes through the same float-typed copy routine
// (DoTransposeServer takes float pointers) - presumably relied on as a bit-for-bit 4-byte copy; confirm.
REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Transpose, LiteKernelCreator<TransposeServerCPUKernel>)
REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_Transpose, LiteKernelCreator<TransposeServerCPUKernel>)
} // namespace mindspore::kernel
#endif

+ 47
- 0
mindspore/lite/src/runtime/kernel/arm/fp32/transpose_server_fp32.h View File

@@ -0,0 +1,47 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_CPU_ARM_FP32_TRANSPOSE_SERVER_FP32_H_
#define MINDSPORE_CCSRC_KERNEL_CPU_ARM_FP32_TRANSPOSE_SERVER_FP32_H_

#ifdef BFC_MEMORY
#include <vector>
#include "src/runtime/kernel/arm/base/transpose_base.h"
#include "nnacl/fp32/transpose_server_fp32.h"

namespace mindspore::kernel {
// Transpose kernel built on TransposeBaseCPUKernel that precomputes, at resize time, per-axis tables
// and a list of output blocks, and then copies one block per task via DoTransposeServer.
class TransposeServerCPUKernel : public TransposeBaseCPUKernel {
public:
// Forwards all arguments to the base-class constructor.
explicit TransposeServerCPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: TransposeBaseCPUKernel(param, inputs, outputs, ctx) {}
~TransposeServerCPUKernel() override = default;

// Re-runs the base resize and refreshes the precomputed tables declared below.
int ReSize() override;

private:
// Fills overflow_points_ and strides_.
void ComputeIndividualOfflineInfo();
// Chooses the thread count and fills block_boundary_infos_.
int ChooseThreadCuttingStrategy();
int DoTransposeSingleThread() override;
int DoTransposeMultiThread(int task_id) override;

// Per output axis: the coordinate value at which that axis wraps back to 0.
std::vector<int64_t> overflow_points_;
// Per output axis: the input-offset delta applied when that axis advances.
std::vector<int64_t> strides_;
// One entry per task, indexed by task id.
std::vector<TransposeBlockBoundaryInfo> block_boundary_infos_;
};
} // namespace mindspore::kernel

#endif  // BFC_MEMORY
#endif  // MINDSPORE_CCSRC_KERNEL_CPU_ARM_FP32_TRANSPOSE_SERVER_FP32_H_

Loading…
Cancel
Save