| @@ -530,15 +530,6 @@ typedef struct QuantMulArg { | |||
| int right_shift_; | |||
| } QuantMulArg; | |||
| #ifdef SERVER_INFERENCE | |||
| typedef struct ThreadCostContext { | |||
| int64_t total_unit_num_; | |||
| int64_t per_unit_load_num_; | |||
| int64_t per_unit_store_num_; | |||
| float per_unit_compute_cost_; | |||
| } ThreadCostContext; | |||
| #endif | |||
| typedef enum ActType { ActType_No, ActType_Relu, ActType_Sigmod, ActType_Relu6, ActType_Prelu } ActType; | |||
| typedef enum PadMode { Pad_pad, Pad_same, Pad_valid } PadMode; | |||
| typedef enum RoundingMode { Rounding_No, Rounding_Away_from_zero, Rounding_Up } RoundingMode; | |||
| @@ -138,6 +138,7 @@ set(LITE_SRC | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/runtime/dynamic_mem_manager.cc | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/runtime/numa_adapter.cc | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/pack_weight_manager.cc | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/thread_cost_model.cc | |||
| ) | |||
| endif() | |||
| @@ -402,67 +402,6 @@ void InnerContext::ReplaceLinkInfoSenderWithNewOne(void *new_sender, void *old_s | |||
| } | |||
| } | |||
| #ifdef SERVER_INFERENCE | |||
| float DtCostModel::per_unit_load_cost_ = 1.0 / 64 * 11; // 64: L2 cache size, 11 : L2 cache latency on Haswell | |||
| float DtCostModel::per_unit_store_cost_ = 1.0 / 64 * 11; // 64: L2 cache size, 11 : L2 cache latency on Haswell | |||
| int64_t DtCostModel::per_unit_compute_num_ = 1; // 1 : per unit compute num | |||
| float DtCostModel::thread_startup_cost_ = 100000.0f; // 100000 : thread startup inherent cost | |||
| float DtCostModel::single_thread_cost_ = 100000.0f; // 100000 : Minimum cost of single-threaded | |||
| float DtCostModel::parallel_thread_cost_ = 40000.0f; // 40000 : Minimum cost of per thread in parallel-thread | |||
| int DtCostModel::get_optimal_thread_num(const ThreadCostContext *dt_cost_context, const int thread_num) { | |||
| const int64_t max_oversharding_factor = 4; | |||
| int64_t block_size = | |||
| MSVALID(max_oversharding_factor * thread_num, thread_block_size(dt_cost_context), dt_cost_context->total_unit_num_); | |||
| int64_t block_count = UP_DIV(dt_cost_context->total_unit_num_, block_size); | |||
| int64_t max_block_size = MSMIN(dt_cost_context->total_unit_num_, 2 * block_size); | |||
| double max_efficiency = static_cast<double>(block_count) / (UP_DIV(block_count, thread_num) * thread_num); | |||
| for (int64_t prev_block_count = block_count; max_efficiency < 1.0 && prev_block_count > 1;) { | |||
| int64_t cur_block_size = UP_DIV(dt_cost_context->total_unit_num_, prev_block_count - 1); | |||
| if (cur_block_size > max_block_size) { | |||
| break; | |||
| } | |||
| const int64_t cur_block_count = UP_DIV(dt_cost_context->total_unit_num_, cur_block_size); | |||
| MS_ASSERT(cur_block_count < prev_block_count); | |||
| prev_block_count = cur_block_count; | |||
| const double cur_efficiency = | |||
| static_cast<double>(cur_block_count) / (UP_DIV(cur_block_count, thread_num) * thread_num); | |||
| if (cur_efficiency + 0.01 >= max_efficiency) { // update threshold : 0.01 | |||
| block_size = cur_block_size; | |||
| block_count = cur_block_count; | |||
| if (max_efficiency < cur_efficiency) { | |||
| max_efficiency = cur_efficiency; | |||
| } | |||
| } | |||
| } | |||
| return block_count; | |||
| } | |||
| int UpdateThreadNum(const Context *context, const ThreadCostContext *dt_cost_context, int task_num) { | |||
| if (task_num <= 1) { | |||
| return task_num; | |||
| } | |||
| ThreadPool *pool = static_cast<const lite::InnerContext *>(context)->thread_pool(); | |||
| if (pool == nullptr) { | |||
| MS_LOG(ERROR) << "thread pool is nullptr"; | |||
| return RET_NULL_PTR; | |||
| } | |||
| if (dt_cost_context != nullptr) { | |||
| if (DtCostModel::thread_num(dt_cost_context) == 1) { | |||
| return 1; | |||
| } | |||
| int opt_thread = static_cast<int>(DtCostModel::parallel_degree(dt_cost_context)); | |||
| task_num = MSVALID(1, opt_thread, task_num); | |||
| } | |||
| return task_num; | |||
| } | |||
| #endif | |||
| int ParallelLaunch(const Context *context, const Func &func, Content content, int task_num) { | |||
| ThreadPool *pool = static_cast<const lite::InnerContext *>(context)->thread_pool(); | |||
| if (pool == nullptr) { | |||
| @@ -120,45 +120,6 @@ struct InnerContext : public Context { | |||
| std::unordered_map<void *, std::set<void *>> link_info_{}; | |||
| }; | |||
| #ifdef SERVER_INFERENCE | |||
| struct DtCostModel { | |||
| static float unit_cost(const ThreadCostContext *dt_cost_context) { | |||
| return per_unit_load_cost_ * dt_cost_context->per_unit_load_num_ + | |||
| per_unit_store_cost_ * dt_cost_context->per_unit_store_num_ + | |||
| dt_cost_context->per_unit_compute_cost_ * per_unit_compute_num_; | |||
| } | |||
| static float total_cost(const ThreadCostContext *dt_cost_context) { | |||
| return dt_cost_context->total_unit_num_ * unit_cost(dt_cost_context); | |||
| } | |||
| // thread_num assesses parallel thread num. Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task | |||
| // granularity needs to be increased to mitigate parallelization overheads. | |||
| static float parallel_degree(const ThreadCostContext *dt_cost_context) { | |||
| return total_cost(dt_cost_context) / parallel_thread_cost_; | |||
| } | |||
| static int thread_num(const ThreadCostContext *dt_cost_context) { | |||
| return MSMAX(1, static_cast<int>((total_cost(dt_cost_context) - thread_startup_cost_) / single_thread_cost_ + 0.9)); | |||
| } | |||
| static int64_t thread_block_size(const ThreadCostContext *dt_cost_context) { | |||
| return static_cast<int64_t>(parallel_thread_cost_ / unit_cost(dt_cost_context)); | |||
| } | |||
| static int get_optimal_thread_num(const ThreadCostContext *dt_cost_context, const int thread_num); | |||
| static float per_unit_load_cost_; // per unit load cost | |||
| static float per_unit_store_cost_; // per unit store cost | |||
| static int64_t per_unit_compute_num_; // per unit compute num | |||
| static float thread_startup_cost_; // thread startup inherent cost | |||
| static float single_thread_cost_; // Minimum cost of single-threaded | |||
| static float parallel_thread_cost_; // Minimum cost of per thread in parallel-thread | |||
| }; | |||
| int UpdateThreadNum(const Context *context, const ThreadCostContext *dt_cost_context, int task_num); | |||
| #endif | |||
| int ParallelLaunch(const Context *context, const Func &func, Content content, int task_num); | |||
| } // namespace mindspore::lite | |||
| @@ -32,6 +32,10 @@ | |||
| #include "include/api/context.h" | |||
| #include "include/api/kernel.h" | |||
| #ifdef SERVER_INFERENCE | |||
| #include "src/thread_cost_model.h" | |||
| #endif | |||
| namespace mindspore::kernel { | |||
| class InnerKernel : public Kernel { | |||
| public: | |||
| @@ -54,6 +58,13 @@ class InnerKernel : public Kernel { | |||
| op_parameter_ = nullptr; | |||
| FreeWorkspace(); | |||
| } | |||
| #ifdef SERVER_INFERENCE | |||
| if (thread_cost_context_ != nullptr) { | |||
| delete thread_cost_context_; | |||
| thread_cost_context_ = nullptr; | |||
| } | |||
| #endif | |||
| } | |||
| int Execute() override; | |||
| @@ -197,7 +208,7 @@ class InnerKernel : public Kernel { | |||
| int thread_num_ = 1; | |||
| #ifdef SERVER_INFERENCE | |||
| std::unique_ptr<ThreadCostContext> thread_cost_context = nullptr; | |||
| lite::ThreadCostContext *thread_cost_context_ = nullptr; | |||
| #endif | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -25,6 +25,23 @@ using mindspore::lite::RET_OK; | |||
| using mindspore::schema::PrimitiveType_Split; | |||
| namespace mindspore::kernel { | |||
| #ifdef SERVER_INFERENCE | |||
| int SplitBaseCPUKernel::UpdateThreadNumPass() { | |||
| if (thread_cost_context_ == nullptr) { | |||
| thread_cost_context_ = new lite::ThreadCostContext(); | |||
| thread_cost_context_->per_unit_load_num_ = in_tensors_.at(0)->ElementsNum() / num_unit_; | |||
| thread_cost_context_->per_unit_store_num_ = in_tensors_.at(0)->ElementsNum() / num_unit_; | |||
| thread_cost_context_->per_unit_compute_cost_ = 17.573; // 17.573 : split per unit compute cost | |||
| } | |||
| if (thread_cost_context_ != nullptr) { | |||
| thread_cost_context_->total_unit_num_ = in_tensors_.at(0)->ElementsNum(); | |||
| thread_num_ = UpdateThreadNum(this->ms_context_, thread_cost_context_, op_parameter_->thread_num_); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| #endif | |||
| int SplitBaseCPUKernel::Prepare() { | |||
| CHECK_LESS_RETURN(in_tensors_.size(), 1); | |||
| CHECK_LESS_RETURN(out_tensors_.size(), 1); | |||
| @@ -102,10 +119,17 @@ int SplitBaseCPUKernel::ReSize() { | |||
| // e.g. input dims is [1, 3, 4, 8], split axis is 2, num_split is 2, so split_count_ is 1*3, num_unit_ is 1*3*2 | |||
| MS_CHECK_FALSE(INT_MUL_OVERFLOW(param->split_count_, param->num_split_), RET_ERROR); | |||
| num_unit_ = param->split_count_ * param->num_split_; | |||
| thread_n_num_ = MSMIN(op_parameter_->thread_num_, num_unit_); | |||
| if (thread_n_num_ != 0) { | |||
| thread_n_stride_ = UP_DIV(num_unit_, thread_n_num_); | |||
| #ifdef SERVER_INFERENCE | |||
| if (UpdateThreadNumPass() != RET_OK) { | |||
| return RET_ERROR; | |||
| } | |||
| #else | |||
| thread_num_ = MSMIN(thread_num_, num_unit_); | |||
| #endif | |||
| CHECK_LESS_RETURN(thread_num_, 1); | |||
| thread_n_stride_ = UP_DIV(num_unit_, thread_num_); | |||
| return RET_OK; | |||
| } | |||
| @@ -152,7 +176,7 @@ int SplitBaseCPUKernel::Run() { | |||
| } | |||
| } | |||
| auto ret = ParallelLaunch(this->ms_context_, SplitRun, this, thread_n_num_); | |||
| auto ret = ParallelLaunch(this->ms_context_, SplitRun, this, thread_num_); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "split error error_code[" << ret << "]"; | |||
| } | |||
| @@ -43,10 +43,10 @@ class SplitBaseCPUKernel : public InnerKernel { | |||
| int Run() override; | |||
| virtual int Split(int task_id); | |||
| static int CheckAndInitSplitParam(const lite::Tensor &in_tensor, SplitParameter *param); | |||
| int UpdateThreadNumPass(); | |||
| protected: | |||
| int thread_n_stride_ = 0; | |||
| int thread_n_num_ = 0; | |||
| int num_unit_ = 0; | |||
| SplitParameter *param = nullptr; | |||
| void *input_ptr_ = nullptr; | |||
| @@ -35,7 +35,7 @@ using mindspore::schema::PrimitiveType_Activation; | |||
| namespace mindspore::kernel { | |||
| namespace { | |||
| #ifdef SERVER_INFERENCE | |||
| const std::map<int, float> dt_activation_cost_map_ = { | |||
| const std::map<int, float> activation_compute_cost_map_ = { | |||
| {schema::ActivationType_RELU, 1.806f}, | |||
| {schema::ActivationType_RELU6, 1.806f}, | |||
| {schema::ActivationType_LEAKY_RELU, 1.806f}, | |||
| @@ -48,12 +48,17 @@ const std::map<int, float> dt_activation_cost_map_ = { | |||
| } // namespace | |||
| #ifdef SERVER_INFERENCE | |||
| int ActivationCPUKernel::SetThreadCostContext() { | |||
| if (dt_activation_cost_map_.count(type_) > 0) { | |||
| thread_cost_context = std::make_unique<ThreadCostContext>(); | |||
| thread_cost_context->per_unit_load_num_ = 1; | |||
| thread_cost_context->per_unit_store_num_ = 1; | |||
| thread_cost_context->per_unit_compute_cost_ = dt_activation_cost_map_.at(type_); | |||
| int ActivationCPUKernel::UpdateThreadNumPass() { | |||
| if (thread_cost_context_ == nullptr && activation_compute_cost_map_.count(type_) > 0) { | |||
| thread_cost_context_ = new lite::ThreadCostContext(); | |||
| thread_cost_context_->per_unit_load_num_ = 1; | |||
| thread_cost_context_->per_unit_store_num_ = 1; | |||
| thread_cost_context_->per_unit_compute_cost_ = activation_compute_cost_map_.at(type_); | |||
| } | |||
| if (thread_cost_context_ != nullptr) { | |||
| thread_cost_context_->total_unit_num_ = in_tensors_.at(0)->ElementsNum(); | |||
| thread_num_ = UpdateThreadNum(this->ms_context_, thread_cost_context_, op_parameter_->thread_num_); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| @@ -63,12 +68,6 @@ int ActivationCPUKernel::Prepare() { | |||
| CHECK_LESS_RETURN(in_tensors_.size(), 1); | |||
| CHECK_LESS_RETURN(out_tensors_.size(), 1); | |||
| #ifdef SERVER_INFERENCE | |||
| if (SetThreadCostContext() != RET_OK) { | |||
| return RET_ERROR; | |||
| } | |||
| #endif | |||
| if (in_tensors().front()->data_type() == kNumberTypeInt32) { | |||
| if (type_ != schema::ActivationType_RELU) { | |||
| MS_LOG(ERROR) << "Activation int32 not support type: " << type_; | |||
| @@ -96,9 +95,8 @@ int ActivationCPUKernel::Prepare() { | |||
| int ActivationCPUKernel::ReSize() { | |||
| #ifdef SERVER_INFERENCE | |||
| if (thread_cost_context != nullptr) { | |||
| thread_cost_context->total_unit_num_ = in_tensors_.at(0)->ElementsNum(); | |||
| thread_num_ = UpdateThreadNum(this->ms_context_, thread_cost_context.get(), op_parameter_->thread_num_); | |||
| if (UpdateThreadNumPass() != RET_OK) { | |||
| return RET_ERROR; | |||
| } | |||
| #endif | |||
| @@ -36,7 +36,7 @@ class ActivationCPUKernel : public InnerKernel { | |||
| } | |||
| ~ActivationCPUKernel() override = default; | |||
| int SetThreadCostContext(); | |||
| int UpdateThreadNumPass(); | |||
| int Prepare() override; | |||
| int ReSize() override; | |||
| int Run() override; | |||
| @@ -26,7 +26,7 @@ using mindspore::schema::PrimitiveType_Eltwise; | |||
| namespace mindspore::kernel { | |||
| namespace { | |||
| #ifdef SERVER_INFERENCE | |||
| const std::map<std::pair<int, int>, float> dt_arithmetic_cost_map_ = { | |||
| const std::map<std::pair<int, int>, float> arithmetic_compute_cost_map_ = { | |||
| // {{PrimitiveType_MulFusion, schema::ActivationType_RELU}, 1.0f}, | |||
| // {{PrimitiveType_MulFusion, schema::ActivationType_RELU6}, 1.0f}, | |||
| // {{PrimitiveType_MulFusion, schema::ActivationType_NO_ACTIVATION}, 1.0f}, | |||
| @@ -60,13 +60,18 @@ const std::map<std::pair<int, int>, float> dt_arithmetic_cost_map_ = { | |||
| } // namespace | |||
| #ifdef SERVER_INFERENCE | |||
| int ArithmeticCPUKernel::SetThreadCostContext() { | |||
| int ArithmeticCPUKernel::UpdateThreadNumPass() { | |||
| std::pair<int, int> fusion_type = std::make_pair(param_->op_parameter_.type_, param_->activation_type_); | |||
| if (dt_arithmetic_cost_map_.count(fusion_type) > 0) { | |||
| thread_cost_context = std::make_unique<ThreadCostContext>(); | |||
| thread_cost_context->per_unit_load_num_ = 1; | |||
| thread_cost_context->per_unit_store_num_ = 1; | |||
| thread_cost_context->per_unit_compute_cost_ = dt_arithmetic_cost_map_.at(fusion_type); | |||
| if (thread_cost_context_ == nullptr && arithmetic_compute_cost_map_.count(fusion_type) > 0) { | |||
| thread_cost_context_ = new lite::ThreadCostContext(); | |||
| thread_cost_context_->per_unit_load_num_ = 1; | |||
| thread_cost_context_->per_unit_store_num_ = 1; | |||
| thread_cost_context_->per_unit_compute_cost_ = arithmetic_compute_cost_map_.at(fusion_type); | |||
| } | |||
| if (thread_cost_context_ != nullptr) { | |||
| thread_cost_context_->total_unit_num_ = in_tensors_.at(0)->ElementsNum(); | |||
| thread_num_ = UpdateThreadNum(this->ms_context_, thread_cost_context_, op_parameter_->thread_num_); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| @@ -76,12 +81,6 @@ int ArithmeticCPUKernel::Prepare() { | |||
| CHECK_LESS_RETURN(in_tensors_.size(), C2NUM); | |||
| CHECK_LESS_RETURN(out_tensors_.size(), 1); | |||
| #ifdef SERVER_INFERENCE | |||
| if (SetThreadCostContext() != RET_OK) { | |||
| return RET_ERROR; | |||
| } | |||
| #endif | |||
| auto primitive_type = param_->op_parameter_.type_; | |||
| if (primitive_type == schema::PrimitiveType_Eltwise) { | |||
| switch (param_->eltwise_mode_) { | |||
| @@ -113,11 +112,11 @@ bool ArithmeticCPUKernel::IsScalarClac() { | |||
| } | |||
| int ArithmeticCPUKernel::ReSize() { | |||
| #ifdef SERVER_INFERENCE | |||
| if (thread_cost_context != nullptr) { | |||
| thread_cost_context->total_unit_num_ = in_tensors_.at(0)->ElementsNum(); | |||
| thread_num_ = UpdateThreadNum(this->ms_context_, thread_cost_context.get(), op_parameter_->thread_num_); | |||
| if (UpdateThreadNumPass() != RET_OK) { | |||
| return RET_ERROR; | |||
| } | |||
| #endif | |||
| CalcMultiplesAndStrides(param_); | |||
| scalar_ = IsScalarClac(); | |||
| int ret = RET_OK; | |||
| @@ -117,7 +117,7 @@ class ArithmeticCPUKernel : public InnerKernel { | |||
| int BiasCalc(int task_id); | |||
| void FreeConstTileBuff(); | |||
| bool IsBiasCalc() const; | |||
| int SetThreadCostContext(); | |||
| int UpdateThreadNumPass(); | |||
| ArithmeticRun arithmetic_run_ = nullptr; | |||
| ArithmeticOptRun arithmetic_opt_run_ = nullptr; | |||
| ArithmeticIntRun arithmetic_run_int_ = nullptr; | |||
| @@ -29,7 +29,7 @@ struct TYPE_FUNC_INFO { | |||
| }; | |||
| #ifdef SERVER_INFERENCE | |||
| const std::map<int, float> dt_arithmetic_self_cost_map_ = { | |||
| const std::map<int, float> arithmetic_self_compute_cost_map_ = { | |||
| // {schema::PrimitiveType_Abs, 0.5f}, | |||
| // {schema::PrimitiveType_Cos, 1.0f}, | |||
| // {schema::PrimitiveType_Log, 1.0f}, | |||
| @@ -49,12 +49,17 @@ const std::map<int, float> dt_arithmetic_self_cost_map_ = { | |||
| } // namespace | |||
| #ifdef SERVER_INFERENCE | |||
| int ArithmeticSelfCPUKernel::SetThreadCostContext() { | |||
| if (thread_cost_context == nullptr && dt_arithmetic_self_cost_map_.count(type_) > 0) { | |||
| thread_cost_context = std::make_unique<ThreadCostContext>(); | |||
| thread_cost_context->per_unit_load_num_ = 1; | |||
| thread_cost_context->per_unit_store_num_ = 1; | |||
| thread_cost_context->per_unit_compute_cost_ = dt_arithmetic_self_cost_map_.at(type_); | |||
| int ArithmeticSelfCPUKernel::UpdateThreadNumPass() { | |||
| if (thread_cost_context_ == nullptr && arithmetic_self_compute_cost_map_.count(type_) > 0) { | |||
| thread_cost_context_ = new lite::ThreadCostContext(); | |||
| thread_cost_context_->per_unit_load_num_ = 1; | |||
| thread_cost_context_->per_unit_store_num_ = 1; | |||
| thread_cost_context_->per_unit_compute_cost_ = arithmetic_self_compute_cost_map_.at(type_); | |||
| } | |||
| if (thread_cost_context_ != nullptr) { | |||
| thread_cost_context_->total_unit_num_ = in_tensors_.at(0)->ElementsNum(); | |||
| thread_num_ = UpdateThreadNum(this->ms_context_, thread_cost_context_, op_parameter_->thread_num_); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| @@ -94,12 +99,6 @@ int ArithmeticSelfCPUKernel::Prepare() { | |||
| CHECK_NOT_EQUAL_RETURN(in_tensors_.size(), 1); | |||
| CHECK_NOT_EQUAL_RETURN(out_tensors_.size(), 1); | |||
| #ifdef SERVER_INFERENCE | |||
| if (SetThreadCostContext() != RET_OK) { | |||
| return RET_ERROR; | |||
| } | |||
| #endif | |||
| if (!InferShapeDone()) { | |||
| return RET_OK; | |||
| } | |||
| @@ -108,9 +107,8 @@ int ArithmeticSelfCPUKernel::Prepare() { | |||
| int ArithmeticSelfCPUKernel::ReSize() { | |||
| #ifdef SERVER_INFERENCE | |||
| if (thread_cost_context != nullptr) { | |||
| thread_cost_context->total_unit_num_ = in_tensors_.at(0)->ElementsNum(); | |||
| thread_num_ = UpdateThreadNum(this->ms_context_, thread_cost_context.get(), op_parameter_->thread_num_); | |||
| if (UpdateThreadNumPass() != RET_OK) { | |||
| return RET_ERROR; | |||
| } | |||
| #endif | |||
| return RET_OK; | |||
| @@ -49,7 +49,7 @@ class ArithmeticSelfCPUKernel : public InnerKernel { | |||
| } | |||
| ~ArithmeticSelfCPUKernel() override = default; | |||
| int SetThreadCostContext(); | |||
| int UpdateThreadNumPass(); | |||
| int Prepare() override; | |||
| int ReSize() override; | |||
| int Run() override; | |||
| @@ -43,6 +43,23 @@ int SoftmaxCPUKernel::Prepare() { | |||
| return ReSize(); | |||
| } | |||
| #ifdef SERVER_INFERENCE | |||
| int SoftmaxCPUKernel::UpdateThreadNumPass() { | |||
| if (thread_cost_context_ == nullptr) { | |||
| thread_cost_context_ = new lite::ThreadCostContext(); | |||
| thread_cost_context_->per_unit_load_num_ = softmax_param_->input_shape_[softmax_param_->axis_]; | |||
| thread_cost_context_->per_unit_store_num_ = softmax_param_->input_shape_[softmax_param_->axis_]; | |||
| thread_cost_context_->per_unit_compute_cost_ = 42.042; // 42.042 : softmax per unit compute cost | |||
| } | |||
| if (thread_cost_context_ != nullptr) { | |||
| thread_cost_context_->total_unit_num_ = in_tensors_.at(0)->ElementsNum(); | |||
| thread_num_ = UpdateThreadNum(this->ms_context_, thread_cost_context_, op_parameter_->thread_num_); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| #endif | |||
| int SoftmaxCPUKernel::ReSize() { | |||
| auto ret = SoftmaxBaseCPUKernel::ReSize(); | |||
| if (ret != RET_OK) { | |||
| @@ -73,11 +90,17 @@ int SoftmaxCPUKernel::ReSize() { | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| #ifdef SERVER_INFERENCE | |||
| if (UpdateThreadNumPass() != RET_OK) { | |||
| return RET_ERROR; | |||
| } | |||
| #endif | |||
| return RET_OK; | |||
| } | |||
| int SoftmaxCPUKernel::DoSoftmaxLastAxis(int task_id) { | |||
| int unit = UP_DIV(out_plane_size_, op_parameter_->thread_num_); | |||
| int unit = UP_DIV(out_plane_size_, thread_num_); | |||
| if (INT_MUL_OVERFLOW(task_id, unit)) { | |||
| MS_LOG(ERROR) << "int mul overflow."; | |||
| return RET_ERROR; | |||
| @@ -109,7 +132,7 @@ int SoftmaxLastAxisRun(void *cdata, int task_id, float lhs_scale, float rhs_scal | |||
| int SoftmaxCPUKernel::Run() { | |||
| int ret = RET_OK; | |||
| if (in_plane_size_ == 1) { | |||
| ret = ParallelLaunch(this->ms_context_, SoftmaxLastAxisRun, this, op_parameter_->thread_num_); | |||
| ret = ParallelLaunch(this->ms_context_, SoftmaxLastAxisRun, this, thread_num_); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "SoftmaxCPUKernel ParallelLaunch failed, ret: " << ret; | |||
| } | |||
| @@ -37,6 +37,7 @@ class SoftmaxCPUKernel : public SoftmaxBaseCPUKernel { | |||
| int ReSize() override; | |||
| int Run() override; | |||
| int DoSoftmaxLastAxis(int task_id); | |||
| int UpdateThreadNumPass(); | |||
| private: | |||
| float *sum_data_ = nullptr; | |||
| @@ -105,7 +105,7 @@ int SplitInt8CPUKernel::Run() { | |||
| output_ptr_[i] = reinterpret_cast<int8_t *>(out_tensors_.at(i)->data()); | |||
| } | |||
| auto ret = ParallelLaunch(this->ms_context_, SplitInt8Run, this, thread_n_num_); | |||
| auto ret = ParallelLaunch(this->ms_context_, SplitInt8Run, this, thread_num_); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Scale error error_code[" << ret << "]"; | |||
| return RET_ERROR; | |||
| @@ -0,0 +1,81 @@ | |||
| /** | |||
| * Copyright 2022 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/thread_cost_model.h" | |||
| #include "src/common/log_util.h" | |||
| #include "src/inner_context.h" | |||
| #include "thread/threadpool.h" | |||
| namespace mindspore::lite { | |||
| float ThreadCostModel::per_unit_load_cost_ = 1.0 / 64 * 11; // 64: L2 cache size, 11 : L2 cache latency on Haswell | |||
| float ThreadCostModel::per_unit_store_cost_ = 1.0 / 64 * 11; // 64: L2 cache size, 11 : L2 cache latency on Haswell | |||
| int64_t ThreadCostModel::per_unit_compute_num_ = 1; // 1 : per unit compute num | |||
| float ThreadCostModel::thread_startup_cost_ = 100000.0f; // 100000 : thread startup inherent cost | |||
| float ThreadCostModel::single_thread_cost_ = 100000.0f; // 100000 : Minimum cost of single-threaded | |||
| float ThreadCostModel::parallel_thread_cost_ = 40000.0f; // 40000 : Minimum cost of per thread in parallel-thread | |||
| int ThreadCostModel::get_optimal_thread_num(const ThreadCostContext *thread_cost_context, const int thread_num) { | |||
| const int64_t max_oversharding_factor = 4; | |||
| int64_t block_size = MSVALID(max_oversharding_factor * thread_num, thread_block_size(thread_cost_context), | |||
| thread_cost_context->total_unit_num_); | |||
| int64_t block_count = UP_DIV(thread_cost_context->total_unit_num_, block_size); | |||
| int64_t max_block_size = MSMIN(thread_cost_context->total_unit_num_, 2 * block_size); | |||
| double max_efficiency = static_cast<double>(block_count) / (UP_DIV(block_count, thread_num) * thread_num); | |||
| for (int64_t prev_block_count = block_count; max_efficiency < 1.0 && prev_block_count > 1;) { | |||
| int64_t cur_block_size = UP_DIV(thread_cost_context->total_unit_num_, prev_block_count - 1); | |||
| if (cur_block_size > max_block_size) { | |||
| break; | |||
| } | |||
| const int64_t cur_block_count = UP_DIV(thread_cost_context->total_unit_num_, cur_block_size); | |||
| MS_ASSERT(cur_block_count < prev_block_count); | |||
| prev_block_count = cur_block_count; | |||
| const double cur_efficiency = | |||
| static_cast<double>(cur_block_count) / (UP_DIV(cur_block_count, thread_num) * thread_num); | |||
| if (cur_efficiency + 0.01 >= max_efficiency) { // update threshold : 0.01 | |||
| block_size = cur_block_size; | |||
| block_count = cur_block_count; | |||
| if (max_efficiency < cur_efficiency) { | |||
| max_efficiency = cur_efficiency; | |||
| } | |||
| } | |||
| } | |||
| return block_count; | |||
| } | |||
| int UpdateThreadNum(const Context *context, const ThreadCostContext *thread_cost_context, int task_num) { | |||
| if (task_num <= 1) { | |||
| return task_num; | |||
| } | |||
| ThreadPool *pool = static_cast<const lite::InnerContext *>(context)->thread_pool(); | |||
| if (pool == nullptr) { | |||
| MS_LOG(ERROR) << "thread pool is nullptr"; | |||
| return RET_NULL_PTR; | |||
| } | |||
| if (thread_cost_context != nullptr) { | |||
| if (ThreadCostModel::thread_num(thread_cost_context) == 1) { | |||
| return 1; | |||
| } | |||
| int opt_thread = static_cast<int>(ThreadCostModel::parallel_degree(thread_cost_context)); | |||
| task_num = MSVALID(1, opt_thread, task_num); | |||
| task_num = MSMIN(task_num, thread_cost_context->total_unit_num_); | |||
| } | |||
| return task_num; | |||
| } | |||
| } // namespace mindspore::lite | |||
| @@ -0,0 +1,71 @@ | |||
| /** | |||
| * Copyright 2022 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_THREAD_COST_MODEL_H | |||
| #define MINDSPORE_LITE_SRC_THREAD_COST_MODEL_H | |||
| #include <stdint.h> | |||
| #include "nnacl/op_base.h" | |||
| #include "include/api/context.h" | |||
| namespace mindspore::lite { | |||
| typedef struct ThreadCostContext { | |||
| int64_t total_unit_num_; | |||
| int64_t per_unit_load_num_; | |||
| int64_t per_unit_store_num_; | |||
| float per_unit_compute_cost_; | |||
| } ThreadCostContext; | |||
| struct ThreadCostModel { | |||
| static float unit_cost(const ThreadCostContext *thread_cost_context) { | |||
| return per_unit_load_cost_ * thread_cost_context->per_unit_load_num_ + | |||
| per_unit_store_cost_ * thread_cost_context->per_unit_store_num_ + | |||
| thread_cost_context->per_unit_compute_cost_ * per_unit_compute_num_; | |||
| } | |||
| static float total_cost(const ThreadCostContext *thread_cost_context) { | |||
| return thread_cost_context->total_unit_num_ * unit_cost(thread_cost_context); | |||
| } | |||
| // thread_num assesses parallel thread num. Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task | |||
| // granularity needs to be increased to mitigate parallelization overheads. | |||
| static float parallel_degree(const ThreadCostContext *thread_cost_context) { | |||
| return total_cost(thread_cost_context) / parallel_thread_cost_; | |||
| } | |||
| static int thread_num(const ThreadCostContext *thread_cost_context) { | |||
| return MSMAX( | |||
| 1, static_cast<int>((total_cost(thread_cost_context) - thread_startup_cost_) / single_thread_cost_ + 0.9)); | |||
| } | |||
| static int64_t thread_block_size(const ThreadCostContext *thread_cost_context) { | |||
| return static_cast<int64_t>(parallel_thread_cost_ / unit_cost(thread_cost_context)); | |||
| } | |||
| static int get_optimal_thread_num(const ThreadCostContext *thread_cost_context, const int thread_num); | |||
| static float per_unit_load_cost_; // per unit load cost | |||
| static float per_unit_store_cost_; // per unit store cost | |||
| static int64_t per_unit_compute_num_; // per unit compute num | |||
| static float thread_startup_cost_; // thread startup inherent cost | |||
| static float single_thread_cost_; // Minimum cost of single-threaded | |||
| static float parallel_thread_cost_; // Minimum cost of per thread in parallel-thread | |||
| }; | |||
| int UpdateThreadNum(const Context *context, const ThreadCostContext *thread_cost_context, int task_num); | |||
| } // namespace mindspore::lite | |||
| #endif // MINDSPORE_LITE_SRC_THREAD_COST_MODEL_H | |||
| @@ -132,6 +132,7 @@ set(LITE_SRC | |||
| ${SRC_DIR}/runtime/dynamic_mem_allocator.cc | |||
| ${SRC_DIR}/runtime/dynamic_mem_manager.cc | |||
| ${SRC_DIR}/runtime/numa_adapter.cc | |||
| ${SRC_DIR}/thread_cost_model.cc | |||
| ) | |||
| endif() | |||