| @@ -61,6 +61,36 @@ int ArithmeticFP16CPUKernel::CheckDataType() { | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| // Tells whether the scalar path applies: one of the two inputs holds exactly one element | |||||
| // (e.g. shapes 2 32 240 240 and 1 1 1 1) and arithmetic_opt_func_ has been set. | |||||
| bool ArithmeticFP16CPUKernel::IsScalarClac() { | |||||
| const bool has_opt_func = (arithmetic_opt_func_ != nullptr); | |||||
| const bool one_input_is_scalar = (param_->in_elements_num0_ == 1) || (param_->in_elements_num1_ == 1); | |||||
| return one_input_is_scalar && has_opt_func; | |||||
| } | |||||
| // Tells whether the batch-scalar path applies. arithmetic_opt_func_ must be set, and every | |||||
| // entry of in_shape1_ from the first axis where the two input shapes differ (axis 0 when they | |||||
| // never differ) through the last axis must equal 1. On success that axis is stored in | |||||
| // break_pos_; on failure break_pos_ is left untouched. | |||||
| bool ArithmeticFP16CPUKernel::IsBatchScalarCalc() { | |||||
| if (arithmetic_opt_func_ == nullptr) { | |||||
| return false; | |||||
| } | |||||
| // Find the first axis on which the two input shapes disagree; stays 0 if none does. | |||||
| size_t first_diff = 0; | |||||
| for (size_t axis = 0; axis < param_->ndim_; ++axis) { | |||||
| if (param_->in_shape0_[axis] != param_->in_shape1_[axis]) { | |||||
| first_diff = axis; | |||||
| break; | |||||
| } | |||||
| } | |||||
| // From that axis to the end, the second input must be all ones. | |||||
| for (size_t axis = first_diff; axis < param_->ndim_; ++axis) { | |||||
| if (param_->in_shape1_[axis] != 1) { | |||||
| return false; | |||||
| } | |||||
| } | |||||
| break_pos_ = first_diff; | |||||
| return true; | |||||
| } | |||||
| void ArithmeticFP16CPUKernel::InitRunFunction(int primitive_type) { | void ArithmeticFP16CPUKernel::InitRunFunction(int primitive_type) { | ||||
| ARITHMETIC_FUNC_INFO_FP16 fun_table[] = { | ARITHMETIC_FUNC_INFO_FP16 fun_table[] = { | ||||
| {PrimitiveType_MulFusion, schema::ActivationType_RELU, ElementMulReluFp16, ElementOptMulReluFp16}, | {PrimitiveType_MulFusion, schema::ActivationType_RELU, ElementMulReluFp16, ElementOptMulReluFp16}, | ||||
| @@ -40,6 +40,8 @@ class ArithmeticFP16CPUKernel : public ArithmeticCPUKernel { | |||||
| ~ArithmeticFP16CPUKernel() = default; | ~ArithmeticFP16CPUKernel() = default; | ||||
| int ReSize() override; | int ReSize() override; | ||||
| int Run() override; | int Run() override; | ||||
| bool IsBatchScalarCalc() override; | |||||
| bool IsScalarClac() override; | |||||
| private: | private: | ||||
| void InitRunFunction(int primitive_type) override; | void InitRunFunction(int primitive_type) override; | ||||
| @@ -61,7 +61,7 @@ int ArithmeticCPUKernel::ReSize() { | |||||
| } | } | ||||
| } | } | ||||
| int ret = RET_OK; | int ret = RET_OK; | ||||
| if (!isScalarClac() && !isBatchScalarCalc() && !isBiasCalc()) { | |||||
| if (!IsScalarClac() && !IsBatchScalarCalc() && !IsBiasCalc()) { | |||||
| ret = ConstTensorBroadCast(); | ret = ConstTensorBroadCast(); | ||||
| } | } | ||||
| return ret; | return ret; | ||||
| @@ -77,7 +77,7 @@ int ArithmeticCPUKernel::CheckDataType() { | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| bool ArithmeticCPUKernel::isScalarClac() { // 2 32 240 240, 1 1 1 1 | |||||
| bool ArithmeticCPUKernel::IsScalarClac() { // 2 32 240 240, 1 1 1 1 | |||||
| if ((param_->in_elements_num0_ == 1 || param_->in_elements_num1_ == 1) && (arithmetic_opt_run_ != nullptr)) { | if ((param_->in_elements_num0_ == 1 || param_->in_elements_num1_ == 1) && (arithmetic_opt_run_ != nullptr)) { | ||||
| return true; | return true; | ||||
| } else { | } else { | ||||
| @@ -85,7 +85,7 @@ bool ArithmeticCPUKernel::isScalarClac() { // 2 32 240 240, 1 1 1 1 | |||||
| } | } | ||||
| } | } | ||||
| bool ArithmeticCPUKernel::isBatchScalarCalc() { // 2 32 240 240, 2 32 1 1 | |||||
| bool ArithmeticCPUKernel::IsBatchScalarCalc() { // 2 32 240 240, 2 32 1 1 | |||||
| if (arithmetic_opt_run_ == nullptr) { | if (arithmetic_opt_run_ == nullptr) { | ||||
| return false; | return false; | ||||
| } | } | ||||
| @@ -107,7 +107,7 @@ bool ArithmeticCPUKernel::isBatchScalarCalc() { // 2 32 240 240, 2 32 1 1 | |||||
| return true; | return true; | ||||
| } | } | ||||
| bool ArithmeticCPUKernel::isBiasCalc() { // 2 240 240 32, 1 1 1 32 | |||||
| bool ArithmeticCPUKernel::IsBiasCalc() { // 2 240 240 32, 1 1 1 32 | |||||
| int last_shape0 = param_->in_shape0_[param_->ndim_ - 1]; | int last_shape0 = param_->in_shape0_[param_->ndim_ - 1]; | ||||
| int last_shape1 = param_->in_shape1_[param_->ndim_ - 1]; | int last_shape1 = param_->in_shape1_[param_->ndim_ - 1]; | ||||
| if (param_->in_elements_num0_ > param_->in_elements_num1_) { | if (param_->in_elements_num0_ > param_->in_elements_num1_) { | ||||
| @@ -365,7 +365,7 @@ int ArithmeticCPUKernel::DoArithmetic(int task_id) { | |||||
| } | } | ||||
| int offset = stride * task_id * data_type_len_; | int offset = stride * task_id * data_type_len_; | ||||
| /* run opt function, one of input is scalar */ | /* run opt function, one of input is scalar */ | ||||
| if (isScalarClac()) { // 2 32 240 240, 1 1 1 1 | |||||
| if (IsScalarClac()) { // 2 32 240 240, 1 1 1 1 | |||||
| if (param_->in_elements_num0_ == 1) { | if (param_->in_elements_num0_ == 1) { | ||||
| return Execute(input0_ptr_, static_cast<uint8_t *>(input1_ptr_) + offset, | return Execute(input0_ptr_, static_cast<uint8_t *>(input1_ptr_) + offset, | ||||
| static_cast<uint8_t *>(output_ptr_) + offset, count, true); | static_cast<uint8_t *>(output_ptr_) + offset, count, true); | ||||
| @@ -375,11 +375,11 @@ int ArithmeticCPUKernel::DoArithmetic(int task_id) { | |||||
| } | } | ||||
| } | } | ||||
| /* run opt function, every batch one of input is scalar */ | /* run opt function, every batch one of input is scalar */ | ||||
| if (isBatchScalarCalc()) { // 2 32 240 240, 2 32 1 1 | |||||
| if (IsBatchScalarCalc()) { // 2 32 240 240, 2 32 1 1 | |||||
| return BatchScalarCalc(task_id); | return BatchScalarCalc(task_id); | ||||
| } | } | ||||
| /* each batch is eltwise calculation */ | /* each batch is eltwise calculation */ | ||||
| if (isBiasCalc()) { // 2 240 240 32, 1 1 1 32 | |||||
| if (IsBiasCalc()) { // 2 240 240 32, 1 1 1 32 | |||||
| return BiasCalc(task_id); | return BiasCalc(task_id); | ||||
| } | } | ||||
| /* need broadcast in runtime */ | /* need broadcast in runtime */ | ||||
| @@ -97,6 +97,8 @@ class ArithmeticCPUKernel : public LiteKernel { | |||||
| virtual void TileConstTensor(const void *in_data, void *out_data, size_t ndim, const int *in_shape, | virtual void TileConstTensor(const void *in_data, void *out_data, size_t ndim, const int *in_shape, | ||||
| const int *in_strides, const int *out_strides, const int *multiple); | const int *in_strides, const int *out_strides, const int *multiple); | ||||
| virtual int Execute(const void *input0, const void *input1, void *output, int size, bool is_opt); | virtual int Execute(const void *input0, const void *input1, void *output, int size, bool is_opt); | ||||
| virtual bool IsBatchScalarCalc(); | |||||
| virtual bool IsScalarClac(); | |||||
| bool input0_broadcast_ = false; | bool input0_broadcast_ = false; | ||||
| bool input1_broadcast_ = false; | bool input1_broadcast_ = false; | ||||
| void *input0_ptr_ = nullptr; | void *input0_ptr_ = nullptr; | ||||
| @@ -111,9 +113,7 @@ class ArithmeticCPUKernel : public LiteKernel { | |||||
| int BatchScalarCalc(int task_id); | int BatchScalarCalc(int task_id); | ||||
| int BiasCalc(int task_id); | int BiasCalc(int task_id); | ||||
| void FreeConstTileBuff(); | void FreeConstTileBuff(); | ||||
| bool isScalarClac(); | |||||
| bool isBatchScalarCalc(); | |||||
| bool isBiasCalc(); | |||||
| bool IsBiasCalc(); | |||||
| ArithmeticRun arithmetic_run_ = nullptr; | ArithmeticRun arithmetic_run_ = nullptr; | ||||
| ArithmeticOptRun arithmetic_opt_run_ = nullptr; | ArithmeticOptRun arithmetic_opt_run_ = nullptr; | ||||
| ArithmeticIntRun arithmetic_run_int_ = nullptr; | ArithmeticIntRun arithmetic_run_int_ = nullptr; | ||||
| @@ -18,11 +18,10 @@ | |||||
| #include "src/runtime/parallel_executor.h" | #include "src/runtime/parallel_executor.h" | ||||
| #include "src/runtime/runtime_api.h" | #include "src/runtime/runtime_api.h" | ||||
| #define MAX_THREAD_NUM 8 | |||||
| namespace mindspore::lite { | namespace mindspore::lite { | ||||
| ParallelExecutor::~ParallelExecutor() { DestroyThreadPool(thread_pool_); } | ParallelExecutor::~ParallelExecutor() { DestroyThreadPool(thread_pool_); } | ||||
| int ParallelExecutor::Prepare(const std::vector<mindspore::kernel::LiteKernel *> &kernels) { | int ParallelExecutor::Prepare(const std::vector<mindspore::kernel::LiteKernel *> &kernels) { | ||||
| thread_pool_ = CreateLiteThreadPool(MAX_THREAD_NUM, NO_BIND); | |||||
| thread_pool_ = CreateLiteThreadPool(max_thread_num_, NO_BIND); | |||||
| if (thread_pool_ == nullptr) { | if (thread_pool_ == nullptr) { | ||||
| MS_LOG(ERROR) << "Memory error: fail to new ThreadPool"; | MS_LOG(ERROR) << "Memory error: fail to new ThreadPool"; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| @@ -18,6 +18,7 @@ | |||||
| #define MINDSPORE_LITE_SRC_RUNTIME_PARALLEL_EXECUTOR_H_ | #define MINDSPORE_LITE_SRC_RUNTIME_PARALLEL_EXECUTOR_H_ | ||||
| #include <vector> | #include <vector> | ||||
| #include <thread> | |||||
| #include <unordered_map> | #include <unordered_map> | ||||
| #include "src/runtime/allocator.h" | #include "src/runtime/allocator.h" | ||||
| #include "src/lite_kernel.h" | #include "src/lite_kernel.h" | ||||
| @@ -43,6 +44,7 @@ class ParallelExecutor : public Executor { | |||||
| std::vector<kernel::LiteKernel *> readyKernels; | std::vector<kernel::LiteKernel *> readyKernels; | ||||
| std::vector<int> results; | std::vector<int> results; | ||||
| struct ThreadPool *thread_pool_ = nullptr; | struct ThreadPool *thread_pool_ = nullptr; | ||||
| int max_thread_num_ = std::thread::hardware_concurrency(); | |||||
| }; | }; | ||||
| } // namespace mindspore::lite | } // namespace mindspore::lite | ||||
| @@ -21,10 +21,14 @@ | |||||
| #include <semaphore.h> | #include <semaphore.h> | ||||
| #include <string.h> | #include <string.h> | ||||
| #include <stdlib.h> | #include <stdlib.h> | ||||
| #include <unistd.h> | |||||
| #ifdef __WIN32__ | |||||
| #include <windows.h> | |||||
| #endif | |||||
| #ifdef __ANDROID__ | #ifdef __ANDROID__ | ||||
| #define BIND_CORE | #define BIND_CORE | ||||
| #include <unistd.h> | |||||
| #include <sched.h> | #include <sched.h> | ||||
| #endif | #endif | ||||
| #ifdef MS_COMPILE_IOS | #ifdef MS_COMPILE_IOS | ||||
| @@ -48,7 +52,6 @@ | |||||
| #define RET_TP_ERROR (-8) | #define RET_TP_ERROR (-8) | ||||
| #define RET_TP_SYSTEM_ERROR (-1) | #define RET_TP_SYSTEM_ERROR (-1) | ||||
| #define MAX_THREAD_NUM (200) | |||||
| #define DEFAULT_SPIN_COUNT (30000) | #define DEFAULT_SPIN_COUNT (30000) | ||||
| typedef struct { | typedef struct { | ||||
| @@ -831,8 +834,15 @@ int CreateNewThread(struct ThreadPool *thread_pool, int thread_id) { | |||||
| } | } | ||||
| ThreadPool *CreateThreadPool(int thread_num, int mode) { | ThreadPool *CreateThreadPool(int thread_num, int mode) { | ||||
| #ifdef __WIN32__ | |||||
| SYSTEM_INFO sys_info; | |||||
| GetSystemInfo(&sys_info); | |||||
| long max_thread_num = sys_info.dwNumberOfProcessors; | |||||
| #else | |||||
| long max_thread_num = sysconf(_SC_NPROCESSORS_ONLN); | |||||
| #endif | |||||
| LOG_INFO("create thread pool, thread_num: %d, mode: %d", thread_num, mode); | LOG_INFO("create thread pool, thread_num: %d, mode: %d", thread_num, mode); | ||||
| if (thread_num <= 0 || thread_num > MAX_THREAD_NUM) { | |||||
| if (thread_num <= 0 || thread_num > max_thread_num) { | |||||
| LOG_ERROR("invalid thread num: %d", thread_num); | LOG_ERROR("invalid thread num: %d", thread_num); | ||||
| return NULL; | return NULL; | ||||
| } | } | ||||
| @@ -851,7 +861,7 @@ ThreadPool *CreateThreadPool(int thread_num, int mode) { | |||||
| LOG_ERROR("Malloc ThreadPool failed"); | LOG_ERROR("Malloc ThreadPool failed"); | ||||
| return NULL; | return NULL; | ||||
| } | } | ||||
| thread_pool->thread_num = thread_num > MAX_THREAD_NUM ? MAX_THREAD_NUM : thread_num; | |||||
| thread_pool->thread_num = thread_num > max_thread_num ? max_thread_num : thread_num; | |||||
| thread_pool->is_alive = ATOMIC_VAR_INIT(true); | thread_pool->is_alive = ATOMIC_VAR_INIT(true); | ||||
| thread_pool->mode = mode; | thread_pool->mode = mode; | ||||
| thread_pool->thread_list = NULL; | thread_pool->thread_list = NULL; | ||||