Browse Source

Refactor and optimize dynamic thread selection; add dynamic thread-count selection for the split and softmax ops

r1.7
greatpanc 4 years ago
parent
commit
07255c8939
19 changed files with 268 additions and 169 deletions
  1. +0
    -9
      mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h
  2. +1
    -0
      mindspore/lite/src/CMakeLists.txt
  3. +0
    -61
      mindspore/lite/src/inner_context.cc
  4. +0
    -39
      mindspore/lite/src/inner_context.h
  5. +12
    -1
      mindspore/lite/src/inner_kernel.h
  6. +28
    -4
      mindspore/lite/src/runtime/kernel/arm/base/split_base.cc
  7. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/base/split_base.h
  8. +14
    -16
      mindspore/lite/src/runtime/kernel/arm/fp32/activation_fp32.cc
  9. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/fp32/activation_fp32.h
  10. +15
    -16
      mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_fp32.cc
  11. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_fp32.h
  12. +14
    -16
      mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self_fp32.cc
  13. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self_fp32.h
  14. +25
    -2
      mindspore/lite/src/runtime/kernel/arm/fp32/softmax_fp32.cc
  15. +1
    -0
      mindspore/lite/src/runtime/kernel/arm/fp32/softmax_fp32.h
  16. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/int8/split_int8.cc
  17. +81
    -0
      mindspore/lite/src/thread_cost_model.cc
  18. +71
    -0
      mindspore/lite/src/thread_cost_model.h
  19. +1
    -0
      mindspore/lite/tools/converter/CMakeLists.txt

+ 0
- 9
mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h View File

@@ -530,15 +530,6 @@ typedef struct QuantMulArg {
int right_shift_;
} QuantMulArg;

#ifdef SERVER_INFERENCE
typedef struct ThreadCostContext {
int64_t total_unit_num_;
int64_t per_unit_load_num_;
int64_t per_unit_store_num_;
float per_unit_compute_cost_;
} ThreadCostContext;
#endif

typedef enum ActType { ActType_No, ActType_Relu, ActType_Sigmod, ActType_Relu6, ActType_Prelu } ActType;
typedef enum PadMode { Pad_pad, Pad_same, Pad_valid } PadMode;
typedef enum RoundingMode { Rounding_No, Rounding_Away_from_zero, Rounding_Up } RoundingMode;


+ 1
- 0
mindspore/lite/src/CMakeLists.txt View File

@@ -138,6 +138,7 @@ set(LITE_SRC
${CMAKE_CURRENT_SOURCE_DIR}/runtime/dynamic_mem_manager.cc
${CMAKE_CURRENT_SOURCE_DIR}/runtime/numa_adapter.cc
${CMAKE_CURRENT_SOURCE_DIR}/pack_weight_manager.cc
${CMAKE_CURRENT_SOURCE_DIR}/thread_cost_model.cc
)
endif()



+ 0
- 61
mindspore/lite/src/inner_context.cc View File

@@ -402,67 +402,6 @@ void InnerContext::ReplaceLinkInfoSenderWithNewOne(void *new_sender, void *old_s
}
}

#ifdef SERVER_INFERENCE
float DtCostModel::per_unit_load_cost_ = 1.0 / 64 * 11; // 64: L2 cache size, 11 : L2 cache latency on Haswell
float DtCostModel::per_unit_store_cost_ = 1.0 / 64 * 11; // 64: L2 cache size, 11 : L2 cache latency on Haswell
int64_t DtCostModel::per_unit_compute_num_ = 1; // 1 : per unit compute num

float DtCostModel::thread_startup_cost_ = 100000.0f; // 100000 : thread startup inherent cost
float DtCostModel::single_thread_cost_ = 100000.0f; // 100000 : Minimum cost of single-threaded
float DtCostModel::parallel_thread_cost_ = 40000.0f; // 40000 : Minimum cost of per thread in parallel-thread

int DtCostModel::get_optimal_thread_num(const ThreadCostContext *dt_cost_context, const int thread_num) {
const int64_t max_oversharding_factor = 4;

int64_t block_size =
MSVALID(max_oversharding_factor * thread_num, thread_block_size(dt_cost_context), dt_cost_context->total_unit_num_);
int64_t block_count = UP_DIV(dt_cost_context->total_unit_num_, block_size);

int64_t max_block_size = MSMIN(dt_cost_context->total_unit_num_, 2 * block_size);
double max_efficiency = static_cast<double>(block_count) / (UP_DIV(block_count, thread_num) * thread_num);
for (int64_t prev_block_count = block_count; max_efficiency < 1.0 && prev_block_count > 1;) {
int64_t cur_block_size = UP_DIV(dt_cost_context->total_unit_num_, prev_block_count - 1);
if (cur_block_size > max_block_size) {
break;
}
const int64_t cur_block_count = UP_DIV(dt_cost_context->total_unit_num_, cur_block_size);
MS_ASSERT(cur_block_count < prev_block_count);
prev_block_count = cur_block_count;
const double cur_efficiency =
static_cast<double>(cur_block_count) / (UP_DIV(cur_block_count, thread_num) * thread_num);
if (cur_efficiency + 0.01 >= max_efficiency) { // update threshold : 0.01
block_size = cur_block_size;
block_count = cur_block_count;
if (max_efficiency < cur_efficiency) {
max_efficiency = cur_efficiency;
}
}
}

return block_count;
}

int UpdateThreadNum(const Context *context, const ThreadCostContext *dt_cost_context, int task_num) {
if (task_num <= 1) {
return task_num;
}
ThreadPool *pool = static_cast<const lite::InnerContext *>(context)->thread_pool();
if (pool == nullptr) {
MS_LOG(ERROR) << "thread pool is nullptr";
return RET_NULL_PTR;
}

if (dt_cost_context != nullptr) {
if (DtCostModel::thread_num(dt_cost_context) == 1) {
return 1;
}
int opt_thread = static_cast<int>(DtCostModel::parallel_degree(dt_cost_context));
task_num = MSVALID(1, opt_thread, task_num);
}
return task_num;
}
#endif

int ParallelLaunch(const Context *context, const Func &func, Content content, int task_num) {
ThreadPool *pool = static_cast<const lite::InnerContext *>(context)->thread_pool();
if (pool == nullptr) {


+ 0
- 39
mindspore/lite/src/inner_context.h View File

@@ -120,45 +120,6 @@ struct InnerContext : public Context {
std::unordered_map<void *, std::set<void *>> link_info_{};
};

#ifdef SERVER_INFERENCE
struct DtCostModel {
static float unit_cost(const ThreadCostContext *dt_cost_context) {
return per_unit_load_cost_ * dt_cost_context->per_unit_load_num_ +
per_unit_store_cost_ * dt_cost_context->per_unit_store_num_ +
dt_cost_context->per_unit_compute_cost_ * per_unit_compute_num_;
}

static float total_cost(const ThreadCostContext *dt_cost_context) {
return dt_cost_context->total_unit_num_ * unit_cost(dt_cost_context);
}

// thread_num assesses parallel thread num. Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
// granularity needs to be increased to mitigate parallelization overheads.
static float parallel_degree(const ThreadCostContext *dt_cost_context) {
return total_cost(dt_cost_context) / parallel_thread_cost_;
}

static int thread_num(const ThreadCostContext *dt_cost_context) {
return MSMAX(1, static_cast<int>((total_cost(dt_cost_context) - thread_startup_cost_) / single_thread_cost_ + 0.9));
}

static int64_t thread_block_size(const ThreadCostContext *dt_cost_context) {
return static_cast<int64_t>(parallel_thread_cost_ / unit_cost(dt_cost_context));
}
static int get_optimal_thread_num(const ThreadCostContext *dt_cost_context, const int thread_num);

static float per_unit_load_cost_; // per unit load cost
static float per_unit_store_cost_; // per unit store cost
static int64_t per_unit_compute_num_; // per unit compute num

static float thread_startup_cost_; // thread startup inherent cost
static float single_thread_cost_; // Minimum cost of single-threaded
static float parallel_thread_cost_; // Minimum cost of per thread in parallel-thread
};

int UpdateThreadNum(const Context *context, const ThreadCostContext *dt_cost_context, int task_num);
#endif

int ParallelLaunch(const Context *context, const Func &func, Content content, int task_num);
} // namespace mindspore::lite



+ 12
- 1
mindspore/lite/src/inner_kernel.h View File

@@ -32,6 +32,10 @@
#include "include/api/context.h"
#include "include/api/kernel.h"

#ifdef SERVER_INFERENCE
#include "src/thread_cost_model.h"
#endif

namespace mindspore::kernel {
class InnerKernel : public Kernel {
public:
@@ -54,6 +58,13 @@ class InnerKernel : public Kernel {
op_parameter_ = nullptr;
FreeWorkspace();
}

#ifdef SERVER_INFERENCE
if (thread_cost_context_ != nullptr) {
free(thread_cost_context_);
thread_cost_context_ = nullptr;
}
#endif
}

int Execute() override;
@@ -197,7 +208,7 @@ class InnerKernel : public Kernel {

int thread_num_ = 1;
#ifdef SERVER_INFERENCE
std::unique_ptr<ThreadCostContext> thread_cost_context = nullptr;
lite::ThreadCostContext *thread_cost_context_ = nullptr;
#endif
};
} // namespace mindspore::kernel


+ 28
- 4
mindspore/lite/src/runtime/kernel/arm/base/split_base.cc View File

@@ -25,6 +25,23 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Split;

namespace mindspore::kernel {
#ifdef SERVER_INFERENCE
// Choose thread_num_ for the split op via the thread cost model.
// The cost context is created lazily on the first call and its total workload
// is refreshed on every resize. It is allocated with malloc() because
// ~InnerKernel releases thread_cost_context_ with free(); pairing new with
// free() is undefined behavior.
// Returns RET_OK on success, RET_ERROR on allocation failure or empty split.
int SplitBaseCPUKernel::UpdateThreadNumPass() {
  // num_unit_ is the divisor below; a zero value would be a divide-by-zero.
  CHECK_LESS_RETURN(num_unit_, 1);
  if (thread_cost_context_ == nullptr) {
    thread_cost_context_ = static_cast<lite::ThreadCostContext *>(malloc(sizeof(lite::ThreadCostContext)));
    if (thread_cost_context_ == nullptr) {
      MS_LOG(ERROR) << "malloc ThreadCostContext failed.";
      return RET_ERROR;
    }
    thread_cost_context_->per_unit_load_num_ = in_tensors_.at(0)->ElementsNum() / num_unit_;
    thread_cost_context_->per_unit_store_num_ = in_tensors_.at(0)->ElementsNum() / num_unit_;
    thread_cost_context_->per_unit_compute_cost_ = 17.573f;  // 17.573 : split per unit compute cost
  }
  thread_cost_context_->total_unit_num_ = in_tensors_.at(0)->ElementsNum();
  thread_num_ = UpdateThreadNum(this->ms_context_, thread_cost_context_, op_parameter_->thread_num_);
  return RET_OK;
}
#endif

int SplitBaseCPUKernel::Prepare() {
CHECK_LESS_RETURN(in_tensors_.size(), 1);
CHECK_LESS_RETURN(out_tensors_.size(), 1);
@@ -102,10 +119,17 @@ int SplitBaseCPUKernel::ReSize() {
// e.g. input dims is [1, 3, 4, 8], split axis is 2, num_split is 2, so split_count_ is 1*3, num_unit_ is 1*3*2
MS_CHECK_FALSE(INT_MUL_OVERFLOW(param->split_count_, param->num_split_), RET_ERROR);
num_unit_ = param->split_count_ * param->num_split_;
thread_n_num_ = MSMIN(op_parameter_->thread_num_, num_unit_);
if (thread_n_num_ != 0) {
thread_n_stride_ = UP_DIV(num_unit_, thread_n_num_);

#ifdef SERVER_INFERENCE
if (UpdateThreadNumPass() != RET_OK) {
return RET_ERROR;
}
#else
thread_num_ = MSMIN(thread_num_, num_unit_);
#endif

CHECK_LESS_RETURN(thread_num_, 1);
thread_n_stride_ = UP_DIV(num_unit_, thread_num_);
return RET_OK;
}

@@ -152,7 +176,7 @@ int SplitBaseCPUKernel::Run() {
}
}

auto ret = ParallelLaunch(this->ms_context_, SplitRun, this, thread_n_num_);
auto ret = ParallelLaunch(this->ms_context_, SplitRun, this, thread_num_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "split error error_code[" << ret << "]";
}


+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/base/split_base.h View File

@@ -43,10 +43,10 @@ class SplitBaseCPUKernel : public InnerKernel {
int Run() override;
virtual int Split(int task_id);
static int CheckAndInitSplitParam(const lite::Tensor &in_tensor, SplitParameter *param);
int UpdateThreadNumPass();

protected:
int thread_n_stride_ = 0;
int thread_n_num_ = 0;
int num_unit_ = 0;
SplitParameter *param = nullptr;
void *input_ptr_ = nullptr;


+ 14
- 16
mindspore/lite/src/runtime/kernel/arm/fp32/activation_fp32.cc View File

@@ -35,7 +35,7 @@ using mindspore::schema::PrimitiveType_Activation;
namespace mindspore::kernel {
namespace {
#ifdef SERVER_INFERENCE
const std::map<int, float> dt_activation_cost_map_ = {
const std::map<int, float> activation_compute_cost_map_ = {
{schema::ActivationType_RELU, 1.806f},
{schema::ActivationType_RELU6, 1.806f},
{schema::ActivationType_LEAKY_RELU, 1.806f},
@@ -48,12 +48,17 @@ const std::map<int, float> dt_activation_cost_map_ = {
} // namespace

#ifdef SERVER_INFERENCE
int ActivationCPUKernel::SetThreadCostContext() {
if (dt_activation_cost_map_.count(type_) > 0) {
thread_cost_context = std::make_unique<ThreadCostContext>();
thread_cost_context->per_unit_load_num_ = 1;
thread_cost_context->per_unit_store_num_ = 1;
thread_cost_context->per_unit_compute_cost_ = dt_activation_cost_map_.at(type_);
int ActivationCPUKernel::UpdateThreadNumPass() {
if (thread_cost_context_ == nullptr && activation_compute_cost_map_.count(type_) > 0) {
thread_cost_context_ = new lite::ThreadCostContext();
thread_cost_context_->per_unit_load_num_ = 1;
thread_cost_context_->per_unit_store_num_ = 1;
thread_cost_context_->per_unit_compute_cost_ = activation_compute_cost_map_.at(type_);
}

if (thread_cost_context_ != nullptr) {
thread_cost_context_->total_unit_num_ = in_tensors_.at(0)->ElementsNum();
thread_num_ = UpdateThreadNum(this->ms_context_, thread_cost_context_, op_parameter_->thread_num_);
}
return RET_OK;
}
@@ -63,12 +68,6 @@ int ActivationCPUKernel::Prepare() {
CHECK_LESS_RETURN(in_tensors_.size(), 1);
CHECK_LESS_RETURN(out_tensors_.size(), 1);

#ifdef SERVER_INFERENCE
if (SetThreadCostContext() != RET_OK) {
return RET_ERROR;
}
#endif

if (in_tensors().front()->data_type() == kNumberTypeInt32) {
if (type_ != schema::ActivationType_RELU) {
MS_LOG(ERROR) << "Activation int32 not support type: " << type_;
@@ -96,9 +95,8 @@ int ActivationCPUKernel::Prepare() {

int ActivationCPUKernel::ReSize() {
#ifdef SERVER_INFERENCE
if (thread_cost_context != nullptr) {
thread_cost_context->total_unit_num_ = in_tensors_.at(0)->ElementsNum();
thread_num_ = UpdateThreadNum(this->ms_context_, thread_cost_context.get(), op_parameter_->thread_num_);
if (UpdateThreadNumPass() != RET_OK) {
return RET_ERROR;
}
#endif



+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/fp32/activation_fp32.h View File

@@ -36,7 +36,7 @@ class ActivationCPUKernel : public InnerKernel {
}
~ActivationCPUKernel() override = default;

int SetThreadCostContext();
int UpdateThreadNumPass();
int Prepare() override;
int ReSize() override;
int Run() override;


+ 15
- 16
mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_fp32.cc View File

@@ -26,7 +26,7 @@ using mindspore::schema::PrimitiveType_Eltwise;
namespace mindspore::kernel {
namespace {
#ifdef SERVER_INFERENCE
const std::map<std::pair<int, int>, float> dt_arithmetic_cost_map_ = {
const std::map<std::pair<int, int>, float> arithmetic_compute_cost_map_ = {
// {{PrimitiveType_MulFusion, schema::ActivationType_RELU}, 1.0f},
// {{PrimitiveType_MulFusion, schema::ActivationType_RELU6}, 1.0f},
// {{PrimitiveType_MulFusion, schema::ActivationType_NO_ACTIVATION}, 1.0f},
@@ -60,13 +60,18 @@ const std::map<std::pair<int, int>, float> dt_arithmetic_cost_map_ = {
} // namespace

#ifdef SERVER_INFERENCE
int ArithmeticCPUKernel::SetThreadCostContext() {
int ArithmeticCPUKernel::UpdateThreadNumPass() {
std::pair<int, int> fusion_type = std::make_pair(param_->op_parameter_.type_, param_->activation_type_);
if (dt_arithmetic_cost_map_.count(fusion_type) > 0) {
thread_cost_context = std::make_unique<ThreadCostContext>();
thread_cost_context->per_unit_load_num_ = 1;
thread_cost_context->per_unit_store_num_ = 1;
thread_cost_context->per_unit_compute_cost_ = dt_arithmetic_cost_map_.at(fusion_type);
if (thread_cost_context_ == nullptr && arithmetic_compute_cost_map_.count(fusion_type) > 0) {
thread_cost_context_ = new lite::ThreadCostContext();
thread_cost_context_->per_unit_load_num_ = 1;
thread_cost_context_->per_unit_store_num_ = 1;
thread_cost_context_->per_unit_compute_cost_ = arithmetic_compute_cost_map_.at(fusion_type);
}

if (thread_cost_context_ != nullptr) {
thread_cost_context_->total_unit_num_ = in_tensors_.at(0)->ElementsNum();
thread_num_ = UpdateThreadNum(this->ms_context_, thread_cost_context_, op_parameter_->thread_num_);
}
return RET_OK;
}
@@ -76,12 +81,6 @@ int ArithmeticCPUKernel::Prepare() {
CHECK_LESS_RETURN(in_tensors_.size(), C2NUM);
CHECK_LESS_RETURN(out_tensors_.size(), 1);

#ifdef SERVER_INFERENCE
if (SetThreadCostContext() != RET_OK) {
return RET_ERROR;
}
#endif

auto primitive_type = param_->op_parameter_.type_;
if (primitive_type == schema::PrimitiveType_Eltwise) {
switch (param_->eltwise_mode_) {
@@ -113,11 +112,11 @@ bool ArithmeticCPUKernel::IsScalarClac() {
}
int ArithmeticCPUKernel::ReSize() {
#ifdef SERVER_INFERENCE
if (thread_cost_context != nullptr) {
thread_cost_context->total_unit_num_ = in_tensors_.at(0)->ElementsNum();
thread_num_ = UpdateThreadNum(this->ms_context_, thread_cost_context.get(), op_parameter_->thread_num_);
if (UpdateThreadNumPass() != RET_OK) {
return RET_ERROR;
}
#endif

CalcMultiplesAndStrides(param_);
scalar_ = IsScalarClac();
int ret = RET_OK;


+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_fp32.h View File

@@ -117,7 +117,7 @@ class ArithmeticCPUKernel : public InnerKernel {
int BiasCalc(int task_id);
void FreeConstTileBuff();
bool IsBiasCalc() const;
int SetThreadCostContext();
int UpdateThreadNumPass();
ArithmeticRun arithmetic_run_ = nullptr;
ArithmeticOptRun arithmetic_opt_run_ = nullptr;
ArithmeticIntRun arithmetic_run_int_ = nullptr;


+ 14
- 16
mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self_fp32.cc View File

@@ -29,7 +29,7 @@ struct TYPE_FUNC_INFO {
};

#ifdef SERVER_INFERENCE
const std::map<int, float> dt_arithmetic_self_cost_map_ = {
const std::map<int, float> arithmetic_self_compute_cost_map_ = {
// {schema::PrimitiveType_Abs, 0.5f},
// {schema::PrimitiveType_Cos, 1.0f},
// {schema::PrimitiveType_Log, 1.0f},
@@ -49,12 +49,17 @@ const std::map<int, float> dt_arithmetic_self_cost_map_ = {
} // namespace

#ifdef SERVER_INFERENCE
int ArithmeticSelfCPUKernel::SetThreadCostContext() {
if (thread_cost_context == nullptr && dt_arithmetic_self_cost_map_.count(type_) > 0) {
thread_cost_context = std::make_unique<ThreadCostContext>();
thread_cost_context->per_unit_load_num_ = 1;
thread_cost_context->per_unit_store_num_ = 1;
thread_cost_context->per_unit_compute_cost_ = dt_arithmetic_self_cost_map_.at(type_);
int ArithmeticSelfCPUKernel::UpdateThreadNumPass() {
if (thread_cost_context_ == nullptr && arithmetic_self_compute_cost_map_.count(type_) > 0) {
thread_cost_context_ = new lite::ThreadCostContext();
thread_cost_context_->per_unit_load_num_ = 1;
thread_cost_context_->per_unit_store_num_ = 1;
thread_cost_context_->per_unit_compute_cost_ = arithmetic_self_compute_cost_map_.at(type_);
}

if (thread_cost_context_ != nullptr) {
thread_cost_context_->total_unit_num_ = in_tensors_.at(0)->ElementsNum();
thread_num_ = UpdateThreadNum(this->ms_context_, thread_cost_context_, op_parameter_->thread_num_);
}
return RET_OK;
}
@@ -94,12 +99,6 @@ int ArithmeticSelfCPUKernel::Prepare() {
CHECK_NOT_EQUAL_RETURN(in_tensors_.size(), 1);
CHECK_NOT_EQUAL_RETURN(out_tensors_.size(), 1);

#ifdef SERVER_INFERENCE
if (SetThreadCostContext() != RET_OK) {
return RET_ERROR;
}
#endif

if (!InferShapeDone()) {
return RET_OK;
}
@@ -108,9 +107,8 @@ int ArithmeticSelfCPUKernel::Prepare() {

int ArithmeticSelfCPUKernel::ReSize() {
#ifdef SERVER_INFERENCE
if (thread_cost_context != nullptr) {
thread_cost_context->total_unit_num_ = in_tensors_.at(0)->ElementsNum();
thread_num_ = UpdateThreadNum(this->ms_context_, thread_cost_context.get(), op_parameter_->thread_num_);
if (UpdateThreadNumPass() != RET_OK) {
return RET_ERROR;
}
#endif
return RET_OK;


+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self_fp32.h View File

@@ -49,7 +49,7 @@ class ArithmeticSelfCPUKernel : public InnerKernel {
}
~ArithmeticSelfCPUKernel() override = default;

int SetThreadCostContext();
int UpdateThreadNumPass();
int Prepare() override;
int ReSize() override;
int Run() override;


+ 25
- 2
mindspore/lite/src/runtime/kernel/arm/fp32/softmax_fp32.cc View File

@@ -43,6 +43,23 @@ int SoftmaxCPUKernel::Prepare() {
return ReSize();
}

#ifdef SERVER_INFERENCE
// Choose thread_num_ for softmax via the thread cost model.
// Per-unit load/store is sized by the softmax axis length; the total workload
// is refreshed on every resize. The cost context is allocated with malloc()
// because ~InnerKernel releases thread_cost_context_ with free(); pairing
// new with free() is undefined behavior.
// Returns RET_OK on success, RET_ERROR on allocation failure.
int SoftmaxCPUKernel::UpdateThreadNumPass() {
  if (thread_cost_context_ == nullptr) {
    thread_cost_context_ = static_cast<lite::ThreadCostContext *>(malloc(sizeof(lite::ThreadCostContext)));
    if (thread_cost_context_ == nullptr) {
      MS_LOG(ERROR) << "malloc ThreadCostContext failed.";
      return RET_ERROR;
    }
    thread_cost_context_->per_unit_load_num_ = softmax_param_->input_shape_[softmax_param_->axis_];
    thread_cost_context_->per_unit_store_num_ = softmax_param_->input_shape_[softmax_param_->axis_];
    thread_cost_context_->per_unit_compute_cost_ = 42.042f;  // 42.042 : softmax per unit compute cost
  }
  thread_cost_context_->total_unit_num_ = in_tensors_.at(0)->ElementsNum();
  thread_num_ = UpdateThreadNum(this->ms_context_, thread_cost_context_, op_parameter_->thread_num_);
  return RET_OK;
}
#endif

int SoftmaxCPUKernel::ReSize() {
auto ret = SoftmaxBaseCPUKernel::ReSize();
if (ret != RET_OK) {
@@ -73,11 +90,17 @@ int SoftmaxCPUKernel::ReSize() {
return RET_ERROR;
}
}

#ifdef SERVER_INFERENCE
if (UpdateThreadNumPass() != RET_OK) {
return RET_ERROR;
}
#endif
return RET_OK;
}

int SoftmaxCPUKernel::DoSoftmaxLastAxis(int task_id) {
int unit = UP_DIV(out_plane_size_, op_parameter_->thread_num_);
int unit = UP_DIV(out_plane_size_, thread_num_);
if (INT_MUL_OVERFLOW(task_id, unit)) {
MS_LOG(ERROR) << "int mul overflow.";
return RET_ERROR;
@@ -109,7 +132,7 @@ int SoftmaxLastAxisRun(void *cdata, int task_id, float lhs_scale, float rhs_scal
int SoftmaxCPUKernel::Run() {
int ret = RET_OK;
if (in_plane_size_ == 1) {
ret = ParallelLaunch(this->ms_context_, SoftmaxLastAxisRun, this, op_parameter_->thread_num_);
ret = ParallelLaunch(this->ms_context_, SoftmaxLastAxisRun, this, thread_num_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "SoftmaxCPUKernel ParallelLaunch failed, ret: " << ret;
}


+ 1
- 0
mindspore/lite/src/runtime/kernel/arm/fp32/softmax_fp32.h View File

@@ -37,6 +37,7 @@ class SoftmaxCPUKernel : public SoftmaxBaseCPUKernel {
int ReSize() override;
int Run() override;
int DoSoftmaxLastAxis(int task_id);
int UpdateThreadNumPass();

private:
float *sum_data_ = nullptr;


+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/int8/split_int8.cc View File

@@ -105,7 +105,7 @@ int SplitInt8CPUKernel::Run() {
output_ptr_[i] = reinterpret_cast<int8_t *>(out_tensors_.at(i)->data());
}

auto ret = ParallelLaunch(this->ms_context_, SplitInt8Run, this, thread_n_num_);
auto ret = ParallelLaunch(this->ms_context_, SplitInt8Run, this, thread_num_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Scale error error_code[" << ret << "]";
return RET_ERROR;


+ 81
- 0
mindspore/lite/src/thread_cost_model.cc View File

@@ -0,0 +1,81 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "src/thread_cost_model.h"
#include "src/common/log_util.h"
#include "src/inner_context.h"
#include "thread/threadpool.h"

namespace mindspore::lite {
// Memory-cost constants: cost per unit moved, modelled on L2 cache behavior.
float ThreadCostModel::per_unit_load_cost_ = 1.0 / 64 * 11;   // 64: L2 cache size, 11 : L2 cache latency on Haswell
float ThreadCostModel::per_unit_store_cost_ = 1.0 / 64 * 11;  // 64: L2 cache size, 11 : L2 cache latency on Haswell
int64_t ThreadCostModel::per_unit_compute_num_ = 1;           // 1 : per unit compute num

// Empirical thread-cost thresholds used by thread_num()/parallel_degree().
float ThreadCostModel::thread_startup_cost_ = 100000.0f;  // 100000 : thread startup inherent cost
float ThreadCostModel::single_thread_cost_ = 100000.0f;   // 100000 : Minimum cost of single-threaded
float ThreadCostModel::parallel_thread_cost_ = 40000.0f;  // 40000 : Minimum cost of per thread in parallel-thread

// Searches for a partition of total_unit_num_ work units into blocks that
// spreads as evenly as possible over `thread_num` threads, and returns the
// resulting block count (the effective parallel task count). Starting from the
// cost-model's preferred block size, it repeatedly tries coarser partitions
// (fewer, larger blocks) and keeps the one with the best load-balance
// efficiency (efficiency 1.0 == block count divides evenly by thread_num).
int ThreadCostModel::get_optimal_thread_num(const ThreadCostContext *thread_cost_context, const int thread_num) {
  // Cap the number of blocks at 4x the thread count so per-block scheduling
  // overhead stays bounded.
  const int64_t max_oversharding_factor = 4;

  int64_t block_size = MSVALID(max_oversharding_factor * thread_num, thread_block_size(thread_cost_context),
                               thread_cost_context->total_unit_num_);
  int64_t block_count = UP_DIV(thread_cost_context->total_unit_num_, block_size);

  // Never let a candidate block grow past twice the initial size (or the
  // whole workload, whichever is smaller).
  int64_t max_block_size = MSMIN(thread_cost_context->total_unit_num_, 2 * block_size);
  double max_efficiency = static_cast<double>(block_count) / (UP_DIV(block_count, thread_num) * thread_num);
  for (int64_t prev_block_count = block_count; max_efficiency < 1.0 && prev_block_count > 1;) {
    // Candidate: one fewer block than last round => larger blocks.
    int64_t cur_block_size = UP_DIV(thread_cost_context->total_unit_num_, prev_block_count - 1);
    if (cur_block_size > max_block_size) {
      break;
    }
    const int64_t cur_block_count = UP_DIV(thread_cost_context->total_unit_num_, cur_block_size);
    MS_ASSERT(cur_block_count < prev_block_count);
    prev_block_count = cur_block_count;
    const double cur_efficiency =
      static_cast<double>(cur_block_count) / (UP_DIV(cur_block_count, thread_num) * thread_num);
    // Accept near-ties as well, preferring coarser partitions.
    if (cur_efficiency + 0.01 >= max_efficiency) {  // update threshold : 0.01
      block_size = cur_block_size;
      block_count = cur_block_count;
      if (max_efficiency < cur_efficiency) {
        max_efficiency = cur_efficiency;
      }
    }
  }
  return block_count;
}

// Decides the task count a kernel should actually launch with: the requested
// task_num, clamped by the cost model's recommended parallel degree and by the
// total number of work units. A null thread_cost_context leaves task_num as-is.
int UpdateThreadNum(const Context *context, const ThreadCostContext *thread_cost_context, int task_num) {
  if (task_num <= 1) {
    return task_num;
  }
  ThreadPool *pool = static_cast<const lite::InnerContext *>(context)->thread_pool();
  if (pool == nullptr) {
    MS_LOG(ERROR) << "thread pool is nullptr";
    // NOTE(review): callers treat the return value as a task count, but this
    // returns the error code RET_NULL_PTR — confirm this is intentional.
    return RET_NULL_PTR;
  }

  if (thread_cost_context != nullptr) {
    // Workload too small to amortize thread startup: run single-threaded.
    if (ThreadCostModel::thread_num(thread_cost_context) == 1) {
      return 1;
    }
    int opt_thread = static_cast<int>(ThreadCostModel::parallel_degree(thread_cost_context));
    task_num = MSVALID(1, opt_thread, task_num);
    // No more tasks than work units (int vs int64_t compare — presumably
    // total_unit_num_ fits in int here; verify against callers).
    task_num = MSMIN(task_num, thread_cost_context->total_unit_num_);
  }
  return task_num;
}
}  // namespace mindspore::lite

+ 71
- 0
mindspore/lite/src/thread_cost_model.h View File

@@ -0,0 +1,71 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_LITE_SRC_THREAD_COST_MODEL_H
#define MINDSPORE_LITE_SRC_THREAD_COST_MODEL_H

#include <stdint.h>
#include "nnacl/op_base.h"
#include "include/api/context.h"

namespace mindspore::lite {
// Describes one kernel's workload for the thread cost model.
typedef struct ThreadCostContext {
  int64_t total_unit_num_;       // total number of work units in the kernel
  int64_t per_unit_load_num_;    // elements loaded per work unit
  int64_t per_unit_store_num_;   // elements stored per work unit
  float per_unit_compute_cost_;  // empirical compute cost of one work unit
} ThreadCostContext;

// Static cost model estimating whether (and how widely) a kernel workload is
// worth parallelizing, from per-unit load/store/compute costs.
struct ThreadCostModel {
  // Estimated cost of processing a single work unit.
  static float unit_cost(const ThreadCostContext *thread_cost_context) {
    return per_unit_load_cost_ * thread_cost_context->per_unit_load_num_ +
           per_unit_store_cost_ * thread_cost_context->per_unit_store_num_ +
           thread_cost_context->per_unit_compute_cost_ * per_unit_compute_num_;
  }

  // Estimated cost of the entire workload.
  static float total_cost(const ThreadCostContext *thread_cost_context) {
    return thread_cost_context->total_unit_num_ * unit_cost(thread_cost_context);
  }

  // thread_num assesses parallel thread num. Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
  // granularity needs to be increased to mitigate parallelization overheads.
  static float parallel_degree(const ThreadCostContext *thread_cost_context) {
    return total_cost(thread_cost_context) / parallel_thread_cost_;
  }

  // Number of threads the workload can amortize, never below 1.
  // The + 0.9 biases the truncating cast toward rounding up.
  static int thread_num(const ThreadCostContext *thread_cost_context) {
    return MSMAX(
      1, static_cast<int>((total_cost(thread_cost_context) - thread_startup_cost_) / single_thread_cost_ + 0.9));
  }

  // Preferred number of work units per parallel block.
  static int64_t thread_block_size(const ThreadCostContext *thread_cost_context) {
    return static_cast<int64_t>(parallel_thread_cost_ / unit_cost(thread_cost_context));
  }
  // Returns the effective parallel task count for the workload (see .cc).
  static int get_optimal_thread_num(const ThreadCostContext *thread_cost_context, const int thread_num);

  static float per_unit_load_cost_;      // per unit load cost
  static float per_unit_store_cost_;     // per unit store cost
  static int64_t per_unit_compute_num_;  // per unit compute num

  static float thread_startup_cost_;   // thread startup inherent cost
  static float single_thread_cost_;    // Minimum cost of single-threaded
  static float parallel_thread_cost_;  // Minimum cost of per thread in parallel-thread
};

// Clamps task_num by the cost model's recommendation for this workload.
int UpdateThreadNum(const Context *context, const ThreadCostContext *thread_cost_context, int task_num);
}  // namespace mindspore::lite

#endif  // MINDSPORE_LITE_SRC_THREAD_COST_MODEL_H

+ 1
- 0
mindspore/lite/tools/converter/CMakeLists.txt View File

@@ -132,6 +132,7 @@ set(LITE_SRC
${SRC_DIR}/runtime/dynamic_mem_allocator.cc
${SRC_DIR}/runtime/dynamic_mem_manager.cc
${SRC_DIR}/runtime/numa_adapter.cc
${SRC_DIR}/thread_cost_model.cc
)
endif()



Loading…
Cancel
Save