Browse Source

Add multi-threading support for Stack

pull/15091/head
lixian 4 years ago
parent
commit
bb7df15c19
7 changed files with 61 additions and 17 deletions
  1. +3
    -5
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/stack_base.c
  2. +1
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/stack_base.h
  3. +28
    -8
      mindspore/lite/src/runtime/kernel/arm/base/stack_base.cc
  4. +4
    -1
      mindspore/lite/src/runtime/kernel/arm/base/stack_base.h
  5. +23
    -1
      mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.cc
  6. +1
    -0
      mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.h
  7. +1
    -1
      mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/stack_fp32_test.cc

+ 3
- 5
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/stack_base.c View File

@@ -15,14 +15,12 @@
*/
#include "nnacl/base/stack_base.h"

// Interleaves `input_num` inputs into `output`: for every outer index, one `copy_size`-byte piece
// is copied from each input in turn.
// This listing shows both versions of the change. Lines that mention `outter_size` or `in_offset`
// belong to the earlier form; lines that mention `outer_start`/`outer_end` or `i * copy_size`
// are the replacement, which processes only the outer indices in [outer_start, outer_end).
void Stack(char **inputs, char *output, size_t input_num, size_t copy_size, size_t outter_size) {
size_t in_offset = 0;
void Stack(char **inputs, char *output, size_t input_num, size_t copy_size, int outer_start, int outer_end) {
// The write offset starts at 0 regardless of outer_start, so `output` must already point at the
// destination of outer index `outer_start`.
size_t out_offset = 0;
for (size_t i = 0; i < outter_size; ++i) {
// NOTE(review): `i` is size_t while outer_start/outer_end are int, so the bounds are converted to
// unsigned here; a negative bound would become a very large value.
for (size_t i = outer_start; i < outer_end; ++i) {
for (size_t j = 0; j < input_num; ++j) {
memcpy(output + out_offset, inputs[j] + in_offset, copy_size);
// The read offset is derived from the outer index instead of the running in_offset.
memcpy(output + out_offset, inputs[j] + i * copy_size, copy_size);
out_offset += copy_size;
}
in_offset += copy_size;
}
}

+ 1
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/stack_base.h View File

@@ -23,7 +23,7 @@
#ifdef __cplusplus
extern "C" {
#endif
// Earlier prototype: stacks outer indices [0, outter_size).
void Stack(char **inputs, char *output, size_t input_num, size_t copy_size, size_t outter_size);
// Replacement prototype: stacks only outer indices [outer_start, outer_end); `output` is expected
// to point at the destination of outer index `outer_start`.
void Stack(char **inputs, char *output, size_t input_num, size_t copy_size, int outer_start, int outer_end);
#ifdef __cplusplus
}
#endif


+ 28
- 8
mindspore/lite/src/runtime/kernel/arm/base/stack_base.cc View File

@@ -41,8 +41,8 @@ static inline int GetCopyNum(const std::vector<int> &in_shape, int axis, int n_d
return copy_num;
}

static inline size_t GetOuterSize(const std::vector<int> &in_shape, int axis) {
size_t outer_size = 1;
static inline int GetOuterSize(const std::vector<int> &in_shape, int axis) {
int outer_size = 1;
for (int i = 0; i < axis; ++i) {
outer_size *= in_shape[i];
}
@@ -72,23 +72,43 @@ int StackBaseCPUKernel::Init() {
return ReSize();
}

// Stacks the range of outer indices that belongs to task `task_id`.
void StackBaseCPUKernel::Execute(int task_id) {
  // Split outer_size_ into num_threads_ chunks; the last chunk is clamped to outer_size_.
  auto chunk = UP_DIV(outer_size_, num_threads_);
  auto first = task_id * chunk;
  auto last = MSMIN(first + chunk, outer_size_);

  // Every outer index produces one copy_size_-byte piece per input, so this task's output starts
  // tensor_count * first * copy_size_ bytes into the output buffer.
  auto tensor_count = in_tensors_.size();
  auto dst = reinterpret_cast<char *>(out_tensors_.at(0)->data_c()) + tensor_count * first * copy_size_;
  Stack(all_inputs_, dst, tensor_count, copy_size_, first, last);
}

static int StackRun(void *cdata, int task_id) {
auto stack = reinterpret_cast<StackBaseCPUKernel *>(cdata);
stack->Execute(task_id);
return RET_OK;
}

// Collects the data pointer of every input tensor into a temporary array, runs the stack, then
// frees the array.
// This listing shows both versions of the change. Lines using the local `all_inputs`, the local
// `output_data`, or the direct Stack(...) call are the earlier single-call form; lines using the
// member `all_inputs_` and ParallelLaunch are the replacement.
int StackBaseCPUKernel::Run() {
// malloc temporary memory to store all the inputs
size_t inputs_num = in_tensors_.size();
char **all_inputs = static_cast<char **>(context_->allocator->Malloc(inputs_num * sizeof(char *)));
if (all_inputs == nullptr) {
// The array is kept in a member so that Execute() can read it when called through StackRun.
all_inputs_ = static_cast<char **>(context_->allocator->Malloc(inputs_num * sizeof(char *)));
if (all_inputs_ == nullptr) {
MS_LOG(ERROR) << "malloc all_inputs failed.";
return RET_ERROR;
}
for (size_t j = 0; j < inputs_num; ++j) {
all_inputs[j] = reinterpret_cast<char *>(in_tensors_.at(j)->data_c());
all_inputs_[j] = reinterpret_cast<char *>(in_tensors_.at(j)->data_c());
}
// run stack
auto output_data = reinterpret_cast<char *>(out_tensors_.at(0)->data_c());
Stack(all_inputs, output_data, in_tensors_.size(), copy_size_, outer_size_);
// Task count: outer_size_ divided by 64 via UP_DIV (presumably rounding up), capped at the
// context's thread_num_.
num_threads_ = MSMIN(UP_DIV(outer_size_, 64), this->context_->thread_num_);
auto ret = ParallelLaunch(this->context_->thread_pool_, StackRun, this, num_threads_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "StackBaseCPUKernel Run error: error_code[" << ret << "]";
// NOTE(review): this early return skips the Free() below, so all_inputs_ stays allocated and
// non-null when ParallelLaunch fails.
return RET_ERROR;
}

// free temporary variable all_inputs
context_->allocator->Free(all_inputs);
context_->allocator->Free(all_inputs_);
all_inputs_ = nullptr;
return RET_OK;
}



+ 4
- 1
mindspore/lite/src/runtime/kernel/arm/base/stack_base.h View File

@@ -32,12 +32,15 @@ class StackBaseCPUKernel : public LiteKernel {
int Init() override;
int ReSize() override;
int Run() override;
void Execute(int task_id);

protected:
int axis_ = 0;
size_t data_type_size_ = 0;
size_t copy_size_ = 0;
size_t outer_size_ = 1;
int outer_size_ = 1;
int num_threads_ = 1;
char **all_inputs_ = nullptr;
};
} // namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_STACK_BASE_H_

+ 23
- 1
mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.cc View File

@@ -75,6 +75,22 @@ int StackFp16CPUKernel::Init() {
return ReSize();
}

// Stacks the range of outer indices that belongs to task `task_id`, reading from buffers_ and
// writing into out_buffer_.
void StackFp16CPUKernel::Execute(int task_id) {
  // Split outer_size_ into num_threads_ chunks; the last chunk is clamped to outer_size_.
  auto chunk = UP_DIV(outer_size_, num_threads_);
  auto first = task_id * chunk;
  auto last = MSMIN(first + chunk, outer_size_);

  // Every outer index produces one copy_size_-byte piece per input, so this task's output starts
  // tensor_count * first * copy_size_ bytes into out_buffer_.
  auto tensor_count = in_tensors_.size();
  char *dst = reinterpret_cast<char *>(out_buffer_) + tensor_count * first * copy_size_;
  Stack(buffers_.data(), dst, tensor_count, copy_size_, first, last);
}

static int StackRun(void *cdata, int task_id) {
auto stack = reinterpret_cast<StackFp16CPUKernel *>(cdata);
stack->Execute(task_id);
return RET_OK;
}

int StackFp16CPUKernel::Run() {
InitMallocFlags();
auto ret = MallocAssignBuffer();
@@ -82,7 +98,13 @@ int StackFp16CPUKernel::Run() {
FreeBuffer();
return ret;
}
Stack(buffers_.data(), reinterpret_cast<char *>(out_buffer_), in_tensors_.size(), copy_size_, outer_size_);
// run stack
num_threads_ = MSMIN(UP_DIV(outer_size_, 64), this->context_->thread_num_);
ret = ParallelLaunch(this->context_->thread_pool_, StackRun, this, num_threads_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "StackBaseCPUKernel Run error: error_code[" << ret << "]";
return RET_ERROR;
}
// if output tensor is fp32, we need to transform
if (malloc_out_) {
auto out_tensor = out_tensors_.at(0);


+ 1
- 0
mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.h View File

@@ -29,6 +29,7 @@ class StackFp16CPUKernel : public StackBaseCPUKernel {
~StackFp16CPUKernel() override = default;
int Init() override;
int Run() override;
void Execute(int task_id);

private:
void InitMallocFlags();


+ 1
- 1
mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/stack_fp32_test.cc View File

@@ -34,7 +34,7 @@ TEST_F(StackTestFp32, StackTest1) {
constexpr int kOutSize = 18;
float expect_out[kOutSize] = {1, 4, 7, 2, 5, 8, 3, 6, 9, 10, 40, 70, 20, 50, 80, 30, 60, 90};
float output[kOutSize];
Stack(input, reinterpret_cast<char *>(output), 3, 4, 6);
Stack(input, reinterpret_cast<char *>(output), 3, 4, 0, 6);
for (float i : output) {
std::cout << i << " ";
}


Loading…
Cancel
Save