| @@ -15,14 +15,12 @@ | |||||
| */ | */ | ||||
| #include "nnacl/base/stack_base.h" | #include "nnacl/base/stack_base.h" | ||||
| void Stack(char **inputs, char *output, size_t input_num, size_t copy_size, size_t outter_size) { | |||||
| size_t in_offset = 0; | |||||
| void Stack(char **inputs, char *output, size_t input_num, size_t copy_size, int outer_start, int outer_end) { | |||||
| size_t out_offset = 0; | size_t out_offset = 0; | ||||
| for (size_t i = 0; i < outter_size; ++i) { | |||||
| for (size_t i = outer_start; i < outer_end; ++i) { | |||||
| for (size_t j = 0; j < input_num; ++j) { | for (size_t j = 0; j < input_num; ++j) { | ||||
| memcpy(output + out_offset, inputs[j] + in_offset, copy_size); | |||||
| memcpy(output + out_offset, inputs[j] + i * copy_size, copy_size); | |||||
| out_offset += copy_size; | out_offset += copy_size; | ||||
| } | } | ||||
| in_offset += copy_size; | |||||
| } | } | ||||
| } | } | ||||
| @@ -23,7 +23,7 @@ | |||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||
| extern "C" { | extern "C" { | ||||
| #endif | #endif | ||||
| void Stack(char **inputs, char *output, size_t input_num, size_t copy_size, size_t outter_size); | |||||
| void Stack(char **inputs, char *output, size_t input_num, size_t copy_size, int outer_start, int outer_end); | |||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||
| } | } | ||||
| #endif | #endif | ||||
| @@ -41,8 +41,8 @@ static inline int GetCopyNum(const std::vector<int> &in_shape, int axis, int n_d | |||||
| return copy_num; | return copy_num; | ||||
| } | } | ||||
| static inline size_t GetOuterSize(const std::vector<int> &in_shape, int axis) { | |||||
| size_t outer_size = 1; | |||||
| static inline int GetOuterSize(const std::vector<int> &in_shape, int axis) { | |||||
| int outer_size = 1; | |||||
| for (int i = 0; i < axis; ++i) { | for (int i = 0; i < axis; ++i) { | ||||
| outer_size *= in_shape[i]; | outer_size *= in_shape[i]; | ||||
| } | } | ||||
| @@ -72,23 +72,43 @@ int StackBaseCPUKernel::Init() { | |||||
| return ReSize(); | return ReSize(); | ||||
| } | } | ||||
| void StackBaseCPUKernel::Execute(int task_id) { | |||||
| auto output_data = reinterpret_cast<char *>(out_tensors_.at(0)->data_c()); | |||||
| auto step = UP_DIV(outer_size_, num_threads_); | |||||
| auto start = task_id * step; | |||||
| auto end = MSMIN(start + step, outer_size_); | |||||
| auto input_num = in_tensors_.size(); | |||||
| Stack(all_inputs_, output_data + input_num * start * copy_size_, input_num, copy_size_, start, end); | |||||
| } | |||||
| static int StackRun(void *cdata, int task_id) { | |||||
| auto stack = reinterpret_cast<StackBaseCPUKernel *>(cdata); | |||||
| stack->Execute(task_id); | |||||
| return RET_OK; | |||||
| } | |||||
| int StackBaseCPUKernel::Run() { | int StackBaseCPUKernel::Run() { | ||||
| // malloc temporary memory to store all the inputs | // malloc temporary memory to store all the inputs | ||||
| size_t inputs_num = in_tensors_.size(); | size_t inputs_num = in_tensors_.size(); | ||||
| char **all_inputs = static_cast<char **>(context_->allocator->Malloc(inputs_num * sizeof(char *))); | |||||
| if (all_inputs == nullptr) { | |||||
| all_inputs_ = static_cast<char **>(context_->allocator->Malloc(inputs_num * sizeof(char *))); | |||||
| if (all_inputs_ == nullptr) { | |||||
| MS_LOG(ERROR) << "malloc all_inputs failed."; | MS_LOG(ERROR) << "malloc all_inputs failed."; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| for (size_t j = 0; j < inputs_num; ++j) { | for (size_t j = 0; j < inputs_num; ++j) { | ||||
| all_inputs[j] = reinterpret_cast<char *>(in_tensors_.at(j)->data_c()); | |||||
| all_inputs_[j] = reinterpret_cast<char *>(in_tensors_.at(j)->data_c()); | |||||
| } | } | ||||
| // run stack | // run stack | ||||
| auto output_data = reinterpret_cast<char *>(out_tensors_.at(0)->data_c()); | |||||
| Stack(all_inputs, output_data, in_tensors_.size(), copy_size_, outer_size_); | |||||
| num_threads_ = MSMIN(UP_DIV(outer_size_, 64), this->context_->thread_num_); | |||||
| auto ret = ParallelLaunch(this->context_->thread_pool_, StackRun, this, num_threads_); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "StackBaseCPUKernel Run error: error_code[" << ret << "]"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| // free temporary variable all_inputs | // free temporary variable all_inputs | ||||
| context_->allocator->Free(all_inputs); | |||||
| context_->allocator->Free(all_inputs_); | |||||
| all_inputs_ = nullptr; | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -32,12 +32,15 @@ class StackBaseCPUKernel : public LiteKernel { | |||||
| int Init() override; | int Init() override; | ||||
| int ReSize() override; | int ReSize() override; | ||||
| int Run() override; | int Run() override; | ||||
| void Execute(int task_id); | |||||
| protected: | protected: | ||||
| int axis_ = 0; | int axis_ = 0; | ||||
| size_t data_type_size_ = 0; | size_t data_type_size_ = 0; | ||||
| size_t copy_size_ = 0; | size_t copy_size_ = 0; | ||||
| size_t outer_size_ = 1; | |||||
| int outer_size_ = 1; | |||||
| int num_threads_ = 1; | |||||
| char **all_inputs_ = nullptr; | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_STACK_BASE_H_ | #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_STACK_BASE_H_ | ||||
| @@ -75,6 +75,22 @@ int StackFp16CPUKernel::Init() { | |||||
| return ReSize(); | return ReSize(); | ||||
| } | } | ||||
| void StackFp16CPUKernel::Execute(int task_id) { | |||||
| auto inputs = buffers_.data(); | |||||
| char *output = reinterpret_cast<char *>(out_buffer_); | |||||
| auto step = UP_DIV(outer_size_, num_threads_); | |||||
| auto start = task_id * step; | |||||
| auto end = MSMIN(start + step, outer_size_); | |||||
| auto input_num = in_tensors_.size(); | |||||
| Stack(inputs, output + input_num * start * copy_size_, input_num, copy_size_, start, end); | |||||
| } | |||||
| static int StackRun(void *cdata, int task_id) { | |||||
| auto stack = reinterpret_cast<StackFp16CPUKernel *>(cdata); | |||||
| stack->Execute(task_id); | |||||
| return RET_OK; | |||||
| } | |||||
| int StackFp16CPUKernel::Run() { | int StackFp16CPUKernel::Run() { | ||||
| InitMallocFlags(); | InitMallocFlags(); | ||||
| auto ret = MallocAssignBuffer(); | auto ret = MallocAssignBuffer(); | ||||
| @@ -82,7 +98,13 @@ int StackFp16CPUKernel::Run() { | |||||
| FreeBuffer(); | FreeBuffer(); | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| Stack(buffers_.data(), reinterpret_cast<char *>(out_buffer_), in_tensors_.size(), copy_size_, outer_size_); | |||||
| // run stack | |||||
| num_threads_ = MSMIN(UP_DIV(outer_size_, 64), this->context_->thread_num_); | |||||
| ret = ParallelLaunch(this->context_->thread_pool_, StackRun, this, num_threads_); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "StackBaseCPUKernel Run error: error_code[" << ret << "]"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| // if output tensor is fp32, we need to transform | // if output tensor is fp32, we need to transform | ||||
| if (malloc_out_) { | if (malloc_out_) { | ||||
| auto out_tensor = out_tensors_.at(0); | auto out_tensor = out_tensors_.at(0); | ||||
| @@ -29,6 +29,7 @@ class StackFp16CPUKernel : public StackBaseCPUKernel { | |||||
| ~StackFp16CPUKernel() override = default; | ~StackFp16CPUKernel() override = default; | ||||
| int Init() override; | int Init() override; | ||||
| int Run() override; | int Run() override; | ||||
| void Execute(int task_id); | |||||
| private: | private: | ||||
| void InitMallocFlags(); | void InitMallocFlags(); | ||||
| @@ -34,7 +34,7 @@ TEST_F(StackTestFp32, StackTest1) { | |||||
| constexpr int kOutSize = 18; | constexpr int kOutSize = 18; | ||||
| float expect_out[kOutSize] = {1, 4, 7, 2, 5, 8, 3, 6, 9, 10, 40, 70, 20, 50, 80, 30, 60, 90}; | float expect_out[kOutSize] = {1, 4, 7, 2, 5, 8, 3, 6, 9, 10, 40, 70, 20, 50, 80, 30, 60, 90}; | ||||
| float output[kOutSize]; | float output[kOutSize]; | ||||
| Stack(input, reinterpret_cast<char *>(output), 3, 4, 6); | |||||
| Stack(input, reinterpret_cast<char *>(output), 3, 4, 0, 6); | |||||
| for (float i : output) { | for (float i : output) { | ||||
| std::cout << i << " "; | std::cout << i << " "; | ||||
| } | } | ||||