Browse Source

Refactor and optimize dynamic thread selection; add dynamic thread-count selection for the split and softmax ops

r1.7
greatpanc 4 years ago
parent
commit
07255c8939
19 changed files with 268 additions and 169 deletions
  1. +0
    -9
      mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h
  2. +1
    -0
      mindspore/lite/src/CMakeLists.txt
  3. +0
    -61
      mindspore/lite/src/inner_context.cc
  4. +0
    -39
      mindspore/lite/src/inner_context.h
  5. +12
    -1
      mindspore/lite/src/inner_kernel.h
  6. +28
    -4
      mindspore/lite/src/runtime/kernel/arm/base/split_base.cc
  7. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/base/split_base.h
  8. +14
    -16
      mindspore/lite/src/runtime/kernel/arm/fp32/activation_fp32.cc
  9. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/fp32/activation_fp32.h
  10. +15
    -16
      mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_fp32.cc
  11. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_fp32.h
  12. +14
    -16
      mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self_fp32.cc
  13. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self_fp32.h
  14. +25
    -2
      mindspore/lite/src/runtime/kernel/arm/fp32/softmax_fp32.cc
  15. +1
    -0
      mindspore/lite/src/runtime/kernel/arm/fp32/softmax_fp32.h
  16. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/int8/split_int8.cc
  17. +81
    -0
      mindspore/lite/src/thread_cost_model.cc
  18. +71
    -0
      mindspore/lite/src/thread_cost_model.h
  19. +1
    -0
      mindspore/lite/tools/converter/CMakeLists.txt

+ 0
- 9
mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h View File

@@ -530,15 +530,6 @@ typedef struct QuantMulArg {
int right_shift_;
} QuantMulArg;

#ifdef SERVER_INFERENCE
typedef struct ThreadCostContext {
int64_t total_unit_num_;
int64_t per_unit_load_num_;
int64_t per_unit_store_num_;
float per_unit_compute_cost_;
} ThreadCostContext;
#endif

typedef enum ActType { ActType_No, ActType_Relu, ActType_Sigmod, ActType_Relu6, ActType_Prelu } ActType;
typedef enum PadMode { Pad_pad, Pad_same, Pad_valid } PadMode;
typedef enum RoundingMode { Rounding_No, Rounding_Away_from_zero, Rounding_Up } RoundingMode;


+ 1
- 0
mindspore/lite/src/CMakeLists.txt View File

@@ -138,6 +138,7 @@ set(LITE_SRC
${CMAKE_CURRENT_SOURCE_DIR}/runtime/dynamic_mem_manager.cc
${CMAKE_CURRENT_SOURCE_DIR}/runtime/numa_adapter.cc
${CMAKE_CURRENT_SOURCE_DIR}/pack_weight_manager.cc
${CMAKE_CURRENT_SOURCE_DIR}/thread_cost_model.cc
)
endif()



+ 0
- 61
mindspore/lite/src/inner_context.cc View File

@@ -402,67 +402,6 @@ void InnerContext::ReplaceLinkInfoSenderWithNewOne(void *new_sender, void *old_s
}
}

#ifdef SERVER_INFERENCE
float DtCostModel::per_unit_load_cost_ = 1.0 / 64 * 11; // 64: L2 cache size, 11 : L2 cache latency on Haswell
float DtCostModel::per_unit_store_cost_ = 1.0 / 64 * 11; // 64: L2 cache size, 11 : L2 cache latency on Haswell
int64_t DtCostModel::per_unit_compute_num_ = 1; // 1 : per unit compute num

float DtCostModel::thread_startup_cost_ = 100000.0f; // 100000 : thread startup inherent cost
float DtCostModel::single_thread_cost_ = 100000.0f; // 100000 : Minimum cost of single-threaded
float DtCostModel::parallel_thread_cost_ = 40000.0f; // 40000 : Minimum cost of per thread in parallel-thread

int DtCostModel::get_optimal_thread_num(const ThreadCostContext *dt_cost_context, const int thread_num) {
const int64_t max_oversharding_factor = 4;

int64_t block_size =
MSVALID(max_oversharding_factor * thread_num, thread_block_size(dt_cost_context), dt_cost_context->total_unit_num_);
int64_t block_count = UP_DIV(dt_cost_context->total_unit_num_, block_size);

int64_t max_block_size = MSMIN(dt_cost_context->total_unit_num_, 2 * block_size);
double max_efficiency = static_cast<double>(block_count) / (UP_DIV(block_count, thread_num) * thread_num);
for (int64_t prev_block_count = block_count; max_efficiency < 1.0 && prev_block_count > 1;) {
int64_t cur_block_size = UP_DIV(dt_cost_context->total_unit_num_, prev_block_count - 1);
if (cur_block_size > max_block_size) {
break;
}
const int64_t cur_block_count = UP_DIV(dt_cost_context->total_unit_num_, cur_block_size);
MS_ASSERT(cur_block_count < prev_block_count);
prev_block_count = cur_block_count;
const double cur_efficiency =
static_cast<double>(cur_block_count) / (UP_DIV(cur_block_count, thread_num) * thread_num);
if (cur_efficiency + 0.01 >= max_efficiency) { // update threshold : 0.01
block_size = cur_block_size;
block_count = cur_block_count;
if (max_efficiency < cur_efficiency) {
max_efficiency = cur_efficiency;
}
}
}

return block_count;
}

int UpdateThreadNum(const Context *context, const ThreadCostContext *dt_cost_context, int task_num) {
if (task_num <= 1) {
return task_num;
}
ThreadPool *pool = static_cast<const lite::InnerContext *>(context)->thread_pool();
if (pool == nullptr) {
MS_LOG(ERROR) << "thread pool is nullptr";
return RET_NULL_PTR;
}

if (dt_cost_context != nullptr) {
if (DtCostModel::thread_num(dt_cost_context) == 1) {
return 1;
}
int opt_thread = static_cast<int>(DtCostModel::parallel_degree(dt_cost_context));
task_num = MSVALID(1, opt_thread, task_num);
}
return task_num;
}
#endif

int ParallelLaunch(const Context *context, const Func &func, Content content, int task_num) {
ThreadPool *pool = static_cast<const lite::InnerContext *>(context)->thread_pool();
if (pool == nullptr) {


+ 0
- 39
mindspore/lite/src/inner_context.h View File

@@ -120,45 +120,6 @@ struct InnerContext : public Context {
std::unordered_map<void *, std::set<void *>> link_info_{};
};

#ifdef SERVER_INFERENCE
struct DtCostModel {
static float unit_cost(const ThreadCostContext *dt_cost_context) {
return per_unit_load_cost_ * dt_cost_context->per_unit_load_num_ +
per_unit_store_cost_ * dt_cost_context->per_unit_store_num_ +
dt_cost_context->per_unit_compute_cost_ * per_unit_compute_num_;
}

static float total_cost(const ThreadCostContext *dt_cost_context) {
return dt_cost_context->total_unit_num_ * unit_cost(dt_cost_context);
}

// thread_num assesses parallel thread num. Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
// granularity needs to be increased to mitigate parallelization overheads.
static float parallel_degree(const ThreadCostContext *dt_cost_context) {
return total_cost(dt_cost_context) / parallel_thread_cost_;
}

static int thread_num(const ThreadCostContext *dt_cost_context) {
return MSMAX(1, static_cast<int>((total_cost(dt_cost_context) - thread_startup_cost_) / single_thread_cost_ + 0.9));
}

static int64_t thread_block_size(const ThreadCostContext *dt_cost_context) {
return static_cast<int64_t>(parallel_thread_cost_ / unit_cost(dt_cost_context));
}
static int get_optimal_thread_num(const ThreadCostContext *dt_cost_context, const int thread_num);

static float per_unit_load_cost_; // per unit load cost
static float per_unit_store_cost_; // per unit store cost
static int64_t per_unit_compute_num_; // per unit compute num

static float thread_startup_cost_; // thread startup inherent cost
static float single_thread_cost_; // Minimum cost of single-threaded
static float parallel_thread_cost_; // Minimum cost of per thread in parallel-thread
};

int UpdateThreadNum(const Context *context, const ThreadCostContext *dt_cost_context, int task_num);
#endif

int ParallelLaunch(const Context *context, const Func &func, Content content, int task_num);
} // namespace mindspore::lite



+ 12
- 1
mindspore/lite/src/inner_kernel.h View File

@@ -32,6 +32,10 @@
#include "include/api/context.h"
#include "include/api/kernel.h"

#ifdef SERVER_INFERENCE
#include "src/thread_cost_model.h"
#endif

namespace mindspore::kernel {
class InnerKernel : public Kernel {
public:
@@ -54,6 +58,13 @@ class InnerKernel : public Kernel {
op_parameter_ = nullptr;
FreeWorkspace();
}

#ifdef SERVER_INFERENCE
if (thread_cost_context_ != nullptr) {
free(thread_cost_context_);
thread_cost_context_ = nullptr;
}
#endif
}

int Execute() override;
@@ -197,7 +208,7 @@ class InnerKernel : public Kernel {

int thread_num_ = 1;
#ifdef SERVER_INFERENCE
std::unique_ptr<ThreadCostContext> thread_cost_context = nullptr;
lite::ThreadCostContext *thread_cost_context_ = nullptr;
#endif
};
} // namespace mindspore::kernel


+ 28
- 4
mindspore/lite/src/runtime/kernel/arm/base/split_base.cc View File

@@ -25,6 +25,23 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Split;

namespace mindspore::kernel {
#ifdef SERVER_INFERENCE
// Choose thread_num_ for the split op via the thread cost model.
// The cost context is created lazily on the first call and its total workload
// is refreshed on every resize. It is allocated with malloc() because
// ~InnerKernel releases thread_cost_context_ with free(); pairing new with
// free() is undefined behavior.
// Returns RET_OK on success, RET_ERROR on allocation failure or empty split.
int SplitBaseCPUKernel::UpdateThreadNumPass() {
  // num_unit_ is the divisor below; a zero value would be a divide-by-zero.
  CHECK_LESS_RETURN(num_unit_, 1);
  if (thread_cost_context_ == nullptr) {
    thread_cost_context_ = static_cast<lite::ThreadCostContext *>(malloc(sizeof(lite::ThreadCostContext)));
    if (thread_cost_context_ == nullptr) {
      MS_LOG(ERROR) << "malloc ThreadCostContext failed.";
      return RET_ERROR;
    }
    thread_cost_context_->per_unit_load_num_ = in_tensors_.at(0)->ElementsNum() / num_unit_;
    thread_cost_context_->per_unit_store_num_ = in_tensors_.at(0)->ElementsNum() / num_unit_;
    thread_cost_context_->per_unit_compute_cost_ = 17.573f;  // 17.573 : split per unit compute cost
  }
  thread_cost_context_->total_unit_num_ = in_tensors_.at(0)->ElementsNum();
  thread_num_ = UpdateThreadNum(this->ms_context_, thread_cost_context_, op_parameter_->thread_num_);
  return RET_OK;
}
#endif

int SplitBaseCPUKernel::Prepare() {
CHECK_LESS_RETURN(in_tensors_.size(), 1);
CHECK_LESS_RETURN(out_tensors_.size(), 1);
@@ -102,10 +119,17 @@ int SplitBaseCPUKernel::ReSize() {
// e.g. input dims is [1, 3, 4, 8], split axis is 2, num_split is 2, so split_count_ is 1*3, num_unit_ is 1*3*2
MS_CHECK_FALSE(INT_MUL_OVERFLOW(param->split_count_, param->num_split_), RET_ERROR);
num_unit_ = param->split_count_ * param->num_split_;
thread_n_num_ = MSMIN(op_parameter_->thread_num_, num_unit_);
if (thread_n_num_ != 0) {
thread_n_stride_ = UP_DIV(num_unit_, thread_n_num_);

#ifdef SERVER_INFERENCE
if (UpdateThreadNumPass() != RET_OK) {
return RET_ERROR;
}
#else
thread_num_ = MSMIN(thread_num_, num_unit_);
#endif

CHECK_LESS_RETURN(thread_num_, 1);
thread_n_stride_ = UP_DIV(num_unit_, thread_num_);
return RET_OK;
}

@@ -152,7 +176,7 @@ int SplitBaseCPUKernel::Run() {
}
}

auto ret = ParallelLaunch(this->ms_context_, SplitRun, this, thread_n_num_);
auto ret = ParallelLaunch(this->ms_context_, SplitRun, this, thread_num_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "split error error_code[" << ret << "]";
}


+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/base/split_base.h View File

@@ -43,10 +43,10 @@ class SplitBaseCPUKernel : public InnerKernel {
int Run() override;
virtual int Split(int task_id);
static int CheckAndInitSplitParam(const lite::Tensor &in_tensor, SplitParameter *param);
int UpdateThreadNumPass();

protected:
int thread_n_stride_ = 0;
int thread_n_num_ = 0;
int num_unit_ = 0;
SplitParameter *param = nullptr;
void *input_ptr_ = nullptr;


+ 14
- 16
mindspore/lite/src/runtime/kernel/arm/fp32/activation_fp32.cc View File

@@ -35,7 +35,7 @@ using mindspore::schema::PrimitiveType_Activation;
namespace mindspore::kernel {
namespace {
#ifdef SERVER_INFERENCE
const std::map<int, float> dt_activation_cost_map_ = {
const std::map<int, float> activation_compute_cost_map_ = {
{schema::ActivationType_RELU, 1.806f},
{schema::ActivationType_RELU6, 1.806f},
{schema::ActivationType_LEAKY_RELU, 1.806f},
@@ -48,12 +48,17 @@ const std::map<int, float> dt_activation_cost_map_ = {
} // namespace

#ifdef SERVER_INFERENCE
int ActivationCPUKernel::SetThreadCostContext() {
if (dt_activation_cost_map_.count(type_) > 0) {
thread_cost_context = std::make_unique<ThreadCostContext>();
thread_cost_context->per_unit_load_num_ = 1;
thread_cost_context->per_unit_store_num_ = 1;
thread_cost_context->per_unit_compute_cost_ = dt_activation_cost_map_.at(type_);
int ActivationCPUKernel::UpdateThreadNumPass() {
if (thread_cost_context_ == nullptr && activation_compute_cost_map_.count(type_) > 0) {
thread_cost_context_ = new lite::ThreadCostContext();
thread_cost_context_->per_unit_load_num_ = 1;
thread_cost_context_->per_unit_store_num_ = 1;
thread_cost_context_->per_unit_compute_cost_ = activation_compute_cost_map_.at(type_);
}

if (thread_cost_context_ != nullptr) {
thread_cost_context_->total_unit_num_ = in_tensors_.at(0)->ElementsNum();
thread_num_ = UpdateThreadNum(this->ms_context_, thread_cost_context_, op_parameter_->thread_num_);
}
return RET_OK;
}
@@ -63,12 +68,6 @@ int ActivationCPUKernel::Prepare() {
CHECK_LESS_RETURN(in_tensors_.size(), 1);
CHECK_LESS_RETURN(out_tensors_.size(), 1);

#ifdef SERVER_INFERENCE
if (SetThreadCostContext() != RET_OK) {
return RET_ERROR;
}
#endif

if (in_tensors().front()->data_type() == kNumberTypeInt32) {
if (type_ != schema::ActivationType_RELU) {
MS_LOG(ERROR) << "Activation int32 not support type: " << type_;
@@ -96,9 +95,8 @@ int ActivationCPUKernel::Prepare() {

int ActivationCPUKernel::ReSize() {
#ifdef SERVER_INFERENCE
if (thread_cost_context != nullptr) {
thread_cost_context->total_unit_num_ = in_tensors_.at(0)->ElementsNum();
thread_num_ = UpdateThreadNum(this->ms_context_, thread_cost_context.get(), op_parameter_->thread_num_);
if (UpdateThreadNumPass() != RET_OK) {
return RET_ERROR;
}
#endif



+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/fp32/activation_fp32.h View File

@@ -36,7 +36,7 @@ class ActivationCPUKernel : public InnerKernel {
}
~ActivationCPUKernel() override = default;

int SetThreadCostContext();
int UpdateThreadNumPass();
int Prepare() override;
int ReSize() override;
int Run() override;


+ 15
- 16
mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_fp32.cc View File

@@ -26,7 +26,7 @@ using mindspore::schema::PrimitiveType_Eltwise;
namespace mindspore::kernel {
namespace {
#ifdef SERVER_INFERENCE
const std::map<std::pair<int, int>, float> dt_arithmetic_cost_map_ = {
const std::map<std::pair<int, int>, float> arithmetic_compute_cost_map_ = {
// {{PrimitiveType_MulFusion, schema::ActivationType_RELU}, 1.0f},
// {{PrimitiveType_MulFusion, schema::ActivationType_RELU6}, 1.0f},
// {{PrimitiveType_MulFusion, schema::ActivationType_NO_ACTIVATION}, 1.0f},
@@ -60,13 +60,18 @@ const std::map<std::pair<int, int>, float> dt_arithmetic_cost_map_ = {
} // namespace

#ifdef SERVER_INFERENCE
int ArithmeticCPUKernel::SetThreadCostContext() {
int ArithmeticCPUKernel::UpdateThreadNumPass() {
std::pair<int, int> fusion_type = std::make_pair(param_->op_parameter_.type_, param_->activation_type_);
if (dt_arithmetic_cost_map_.count(fusion_type) > 0) {
thread_cost_context = std::make_unique<ThreadCostContext>();
thread_cost_context->per_unit_load_num_ = 1;
thread_cost_context->per_unit_store_num_ = 1;
thread_cost_context->per_unit_compute_cost_ = dt_arithmetic_cost_map_.at(fusion_type);
if (thread_cost_context_ == nullptr && arithmetic_compute_cost_map_.count(fusion_type) > 0) {
thread_cost_context_ = new lite::ThreadCostContext();
thread_cost_context_->per_unit_load_num_ = 1;
thread_cost_context_->per_unit_store_num_ = 1;
thread_cost_context_->per_unit_compute_cost_ = arithmetic_compute_cost_map_.at(fusion_type);
}

if (thread_cost_context_ != nullptr) {
thread_cost_context_->total_unit_num_ = in_tensors_.at(0)->ElementsNum();
thread_num_ = UpdateThreadNum(this->ms_context_, thread_cost_context_, op_parameter_->thread_num_);
}
return RET_OK;
}
@@ -76,12 +81,6 @@ int ArithmeticCPUKernel::Prepare() {
CHECK_LESS_RETURN(in_tensors_.size(), C2NUM);
CHECK_LESS_RETURN(out_tensors_.size(), 1);

#ifdef SERVER_INFERENCE
if (SetThreadCostContext() != RET_OK) {
return RET_ERROR;
}
#endif

auto primitive_type = param_->op_parameter_.type_;
if (primitive_type == schema::PrimitiveType_Eltwise) {
switch (param_->eltwise_mode_) {
@@ -113,11 +112,11 @@ bool ArithmeticCPUKernel::IsScalarClac() {
}
int ArithmeticCPUKernel::ReSize() {
#ifdef SERVER_INFERENCE
if (thread_cost_context != nullptr) {
thread_cost_context->total_unit_num_ = in_tensors_.at(0)->ElementsNum();
thread_num_ = UpdateThreadNum(this->ms_context_, thread_cost_context.get(), op_parameter_->thread_num_);
if (UpdateThreadNumPass() != RET_OK) {
return RET_ERROR;
}
#endif

CalcMultiplesAndStrides(param_);
scalar_ = IsScalarClac();
int ret = RET_OK;


+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_fp32.h View File

@@ -117,7 +117,7 @@ class ArithmeticCPUKernel : public InnerKernel {
int BiasCalc(int task_id);
void FreeConstTileBuff();
bool IsBiasCalc() const;
int SetThreadCostContext();
int UpdateThreadNumPass();
ArithmeticRun arithmetic_run_ = nullptr;
ArithmeticOptRun arithmetic_opt_run_ = nullptr;
ArithmeticIntRun arithmetic_run_int_ = nullptr;


+ 14
- 16
mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self_fp32.cc View File

@@ -29,7 +29,7 @@ struct TYPE_FUNC_INFO {
};

#ifdef SERVER_INFERENCE
const std::map<int, float> dt_arithmetic_self_cost_map_ = {
const std::map<int, float> arithmetic_self_compute_cost_map_ = {
// {schema::PrimitiveType_Abs, 0.5f},
// {schema::PrimitiveType_Cos, 1.0f},
// {schema::PrimitiveType_Log, 1.0f},
@@ -49,12 +49,17 @@ const std::map<int, float> dt_arithmetic_self_cost_map_ = {
} // namespace

#ifdef SERVER_INFERENCE
int ArithmeticSelfCPUKernel::SetThreadCostContext() {
if (thread_cost_context == nullptr && dt_arithmetic_self_cost_map_.count(type_) > 0) {
thread_cost_context = std::make_unique<ThreadCostContext>();
thread_cost_context->per_unit_load_num_ = 1;
thread_cost_context->per_unit_store_num_ = 1;
thread_cost_context->per_unit_compute_cost_ = dt_arithmetic_self_cost_map_.at(type_);
int ArithmeticSelfCPUKernel::UpdateThreadNumPass() {
if (thread_cost_context_ == nullptr && arithmetic_self_compute_cost_map_.count(type_) > 0) {
thread_cost_context_ = new lite::ThreadCostContext();
thread_cost_context_->per_unit_load_num_ = 1;
thread_cost_context_->per_unit_store_num_ = 1;
thread_cost_context_->per_unit_compute_cost_ = arithmetic_self_compute_cost_map_.at(type_);
}

if (thread_cost_context_ != nullptr) {
thread_cost_context_->total_unit_num_ = in_tensors_.at(0)->ElementsNum();
thread_num_ = UpdateThreadNum(this->ms_context_, thread_cost_context_, op_parameter_->thread_num_);
}
return RET_OK;
}
@@ -94,12 +99,6 @@ int ArithmeticSelfCPUKernel::Prepare() {
CHECK_NOT_EQUAL_RETURN(in_tensors_.size(), 1);
CHECK_NOT_EQUAL_RETURN(out_tensors_.size(), 1);

#ifdef SERVER_INFERENCE
if (SetThreadCostContext() != RET_OK) {
return RET_ERROR;
}
#endif

if (!InferShapeDone()) {
return RET_OK;
}
@@ -108,9 +107,8 @@ int ArithmeticSelfCPUKernel::Prepare() {

int ArithmeticSelfCPUKernel::ReSize() {
#ifdef SERVER_INFERENCE
if (thread_cost_context != nullptr) {
thread_cost_context->total_unit_num_ = in_tensors_.at(0)->ElementsNum();
thread_num_ = UpdateThreadNum(this->ms_context_, thread_cost_context.get(), op_parameter_->thread_num_);
if (UpdateThreadNumPass() != RET_OK) {
return RET_ERROR;
}
#endif
return RET_OK;


+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self_fp32.h View File

@@ -49,7 +49,7 @@ class ArithmeticSelfCPUKernel : public InnerKernel {
}
~ArithmeticSelfCPUKernel() override = default;

int SetThreadCostContext();
int UpdateThreadNumPass();
int Prepare() override;
int ReSize() override;
int Run() override;


+ 25
- 2
mindspore/lite/src/runtime/kernel/arm/fp32/softmax_fp32.cc View File

@@ -43,6 +43,23 @@ int SoftmaxCPUKernel::Prepare() {
return ReSize();
}

#ifdef SERVER_INFERENCE
// Choose thread_num_ for softmax via the thread cost model.
// Per-unit load/store is sized by the softmax axis length; the total workload
// is refreshed on every resize. The cost context is allocated with malloc()
// because ~InnerKernel releases thread_cost_context_ with free(); pairing
// new with free() is undefined behavior.
// Returns RET_OK on success, RET_ERROR on allocation failure.
int SoftmaxCPUKernel::UpdateThreadNumPass() {
  if (thread_cost_context_ == nullptr) {
    thread_cost_context_ = static_cast<lite::ThreadCostContext *>(malloc(sizeof(lite::ThreadCostContext)));
    if (thread_cost_context_ == nullptr) {
      MS_LOG(ERROR) << "malloc ThreadCostContext failed.";
      return RET_ERROR;
    }
    thread_cost_context_->per_unit_load_num_ = softmax_param_->input_shape_[softmax_param_->axis_];
    thread_cost_context_->per_unit_store_num_ = softmax_param_->input_shape_[softmax_param_->axis_];
    thread_cost_context_->per_unit_compute_cost_ = 42.042f;  // 42.042 : softmax per unit compute cost
  }
  thread_cost_context_->total_unit_num_ = in_tensors_.at(0)->ElementsNum();
  thread_num_ = UpdateThreadNum(this->ms_context_, thread_cost_context_, op_parameter_->thread_num_);
  return RET_OK;
}
#endif

int SoftmaxCPUKernel::ReSize() {
auto ret = SoftmaxBaseCPUKernel::ReSize();
if (ret != RET_OK) {
@@ -73,11 +90,17 @@ int SoftmaxCPUKernel::ReSize() {
return RET_ERROR;
}
}

#ifdef SERVER_INFERENCE
if (UpdateThreadNumPass() != RET_OK) {
return RET_ERROR;
}
#endif
return RET_OK;
}

int SoftmaxCPUKernel::DoSoftmaxLastAxis(int task_id) {
int unit = UP_DIV(out_plane_size_, op_parameter_->thread_num_);
int unit = UP_DIV(out_plane_size_, thread_num_);
if (INT_MUL_OVERFLOW(task_id, unit)) {
MS_LOG(ERROR) << "int mul overflow.";
return RET_ERROR;
@@ -109,7 +132,7 @@ int SoftmaxLastAxisRun(void *cdata, int task_id, float lhs_scale, float rhs_scal
int SoftmaxCPUKernel::Run() {
int ret = RET_OK;
if (in_plane_size_ == 1) {
ret = ParallelLaunch(this->ms_context_, SoftmaxLastAxisRun, this, op_parameter_->thread_num_);
ret = ParallelLaunch(this->ms_context_, SoftmaxLastAxisRun, this, thread_num_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "SoftmaxCPUKernel ParallelLaunch failed, ret: " << ret;
}


+ 1
- 0
mindspore/lite/src/runtime/kernel/arm/fp32/softmax_fp32.h View File

@@ -37,6 +37,7 @@ class SoftmaxCPUKernel : public SoftmaxBaseCPUKernel {
int ReSize() override;
int Run() override;
int DoSoftmaxLastAxis(int task_id);
int UpdateThreadNumPass();

private:
float *sum_data_ = nullptr;


+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/int8/split_int8.cc View File

@@ -105,7 +105,7 @@ int SplitInt8CPUKernel::Run() {
output_ptr_[i] = reinterpret_cast<int8_t *>(out_tensors_.at(i)->data());
}

auto ret = ParallelLaunch(this->ms_context_, SplitInt8Run, this, thread_n_num_);
auto ret = ParallelLaunch(this->ms_context_, SplitInt8Run, this, thread_num_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "Scale error error_code[" << ret << "]";
return RET_ERROR;


+ 81
- 0
mindspore/lite/src/thread_cost_model.cc View File

@@ -0,0 +1,81 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "src/thread_cost_model.h"
#include "src/common/log_util.h"
#include "src/inner_context.h"
#include "thread/threadpool.h"

namespace mindspore::lite {
// Memory-cost constants: cost per unit moved, modelled on L2 cache behavior.
float ThreadCostModel::per_unit_load_cost_ = 1.0 / 64 * 11;   // 64: L2 cache size, 11 : L2 cache latency on Haswell
float ThreadCostModel::per_unit_store_cost_ = 1.0 / 64 * 11;  // 64: L2 cache size, 11 : L2 cache latency on Haswell
int64_t ThreadCostModel::per_unit_compute_num_ = 1;           // 1 : per unit compute num

// Empirical thread-cost thresholds used by thread_num()/parallel_degree().
float ThreadCostModel::thread_startup_cost_ = 100000.0f;  // 100000 : thread startup inherent cost
float ThreadCostModel::single_thread_cost_ = 100000.0f;   // 100000 : Minimum cost of single-threaded
float ThreadCostModel::parallel_thread_cost_ = 40000.0f;  // 40000 : Minimum cost of per thread in parallel-thread

// Searches for a partition of total_unit_num_ work units into blocks that
// spreads as evenly as possible over `thread_num` threads, and returns the
// resulting block count (the effective parallel task count). Starting from the
// cost-model's preferred block size, it repeatedly tries coarser partitions
// (fewer, larger blocks) and keeps the one with the best load-balance
// efficiency (efficiency 1.0 == block count divides evenly by thread_num).
int ThreadCostModel::get_optimal_thread_num(const ThreadCostContext *thread_cost_context, const int thread_num) {
  // Cap the number of blocks at 4x the thread count so per-block scheduling
  // overhead stays bounded.
  const int64_t max_oversharding_factor = 4;

  int64_t block_size = MSVALID(max_oversharding_factor * thread_num, thread_block_size(thread_cost_context),
                               thread_cost_context->total_unit_num_);
  int64_t block_count = UP_DIV(thread_cost_context->total_unit_num_, block_size);

  // Never let a candidate block grow past twice the initial size (or the
  // whole workload, whichever is smaller).
  int64_t max_block_size = MSMIN(thread_cost_context->total_unit_num_, 2 * block_size);
  double max_efficiency = static_cast<double>(block_count) / (UP_DIV(block_count, thread_num) * thread_num);
  for (int64_t prev_block_count = block_count; max_efficiency < 1.0 && prev_block_count > 1;) {
    // Candidate: one fewer block than last round => larger blocks.
    int64_t cur_block_size = UP_DIV(thread_cost_context->total_unit_num_, prev_block_count - 1);
    if (cur_block_size > max_block_size) {
      break;
    }
    const int64_t cur_block_count = UP_DIV(thread_cost_context->total_unit_num_, cur_block_size);
    MS_ASSERT(cur_block_count < prev_block_count);
    prev_block_count = cur_block_count;
    const double cur_efficiency =
      static_cast<double>(cur_block_count) / (UP_DIV(cur_block_count, thread_num) * thread_num);
    // Accept near-ties as well, preferring coarser partitions.
    if (cur_efficiency + 0.01 >= max_efficiency) {  // update threshold : 0.01
      block_size = cur_block_size;
      block_count = cur_block_count;
      if (max_efficiency < cur_efficiency) {
        max_efficiency = cur_efficiency;
      }
    }
  }
  return block_count;
}

// Decides the task count a kernel should actually launch with: the requested
// task_num, clamped by the cost model's recommended parallel degree and by the
// total number of work units. A null thread_cost_context leaves task_num as-is.
int UpdateThreadNum(const Context *context, const ThreadCostContext *thread_cost_context, int task_num) {
  if (task_num <= 1) {
    return task_num;
  }
  ThreadPool *pool = static_cast<const lite::InnerContext *>(context)->thread_pool();
  if (pool == nullptr) {
    MS_LOG(ERROR) << "thread pool is nullptr";
    // NOTE(review): callers treat the return value as a task count, but this
    // returns the error code RET_NULL_PTR — confirm this is intentional.
    return RET_NULL_PTR;
  }

  if (thread_cost_context != nullptr) {
    // Workload too small to amortize thread startup: run single-threaded.
    if (ThreadCostModel::thread_num(thread_cost_context) == 1) {
      return 1;
    }
    int opt_thread = static_cast<int>(ThreadCostModel::parallel_degree(thread_cost_context));
    task_num = MSVALID(1, opt_thread, task_num);
    // No more tasks than work units (int vs int64_t compare — presumably
    // total_unit_num_ fits in int here; verify against callers).
    task_num = MSMIN(task_num, thread_cost_context->total_unit_num_);
  }
  return task_num;
}
}  // namespace mindspore::lite

+ 71
- 0
mindspore/lite/src/thread_cost_model.h View File

@@ -0,0 +1,71 @@
/**
* Copyright 2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_LITE_SRC_THREAD_COST_MODEL_H
#define MINDSPORE_LITE_SRC_THREAD_COST_MODEL_H

#include <stdint.h>
#include "nnacl/op_base.h"
#include "include/api/context.h"

namespace mindspore::lite {
// Describes one kernel's workload for the thread cost model.
typedef struct ThreadCostContext {
  int64_t total_unit_num_;       // total number of work units in the kernel
  int64_t per_unit_load_num_;    // elements loaded per work unit
  int64_t per_unit_store_num_;   // elements stored per work unit
  float per_unit_compute_cost_;  // empirical compute cost of one work unit
} ThreadCostContext;

// Static cost model estimating whether (and how widely) a kernel workload is
// worth parallelizing, from per-unit load/store/compute costs.
struct ThreadCostModel {
  // Estimated cost of processing a single work unit.
  static float unit_cost(const ThreadCostContext *thread_cost_context) {
    return per_unit_load_cost_ * thread_cost_context->per_unit_load_num_ +
           per_unit_store_cost_ * thread_cost_context->per_unit_store_num_ +
           thread_cost_context->per_unit_compute_cost_ * per_unit_compute_num_;
  }

  // Estimated cost of the entire workload.
  static float total_cost(const ThreadCostContext *thread_cost_context) {
    return thread_cost_context->total_unit_num_ * unit_cost(thread_cost_context);
  }

  // thread_num assesses parallel thread num. Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
  // granularity needs to be increased to mitigate parallelization overheads.
  static float parallel_degree(const ThreadCostContext *thread_cost_context) {
    return total_cost(thread_cost_context) / parallel_thread_cost_;
  }

  // Number of threads the workload can amortize, never below 1.
  // The + 0.9 biases the truncating cast toward rounding up.
  static int thread_num(const ThreadCostContext *thread_cost_context) {
    return MSMAX(
      1, static_cast<int>((total_cost(thread_cost_context) - thread_startup_cost_) / single_thread_cost_ + 0.9));
  }

  // Preferred number of work units per parallel block.
  static int64_t thread_block_size(const ThreadCostContext *thread_cost_context) {
    return static_cast<int64_t>(parallel_thread_cost_ / unit_cost(thread_cost_context));
  }
  // Returns the effective parallel task count for the workload (see .cc).
  static int get_optimal_thread_num(const ThreadCostContext *thread_cost_context, const int thread_num);

  static float per_unit_load_cost_;      // per unit load cost
  static float per_unit_store_cost_;     // per unit store cost
  static int64_t per_unit_compute_num_;  // per unit compute num

  static float thread_startup_cost_;   // thread startup inherent cost
  static float single_thread_cost_;    // Minimum cost of single-threaded
  static float parallel_thread_cost_;  // Minimum cost of per thread in parallel-thread
};

// Clamps task_num by the cost model's recommendation for this workload.
int UpdateThreadNum(const Context *context, const ThreadCostContext *thread_cost_context, int task_num);
}  // namespace mindspore::lite

#endif  // MINDSPORE_LITE_SRC_THREAD_COST_MODEL_H

+ 1
- 0
mindspore/lite/tools/converter/CMakeLists.txt View File

@@ -132,6 +132,7 @@ set(LITE_SRC
${SRC_DIR}/runtime/dynamic_mem_allocator.cc
${SRC_DIR}/runtime/dynamic_mem_manager.cc
${SRC_DIR}/runtime/numa_adapter.cc
${SRC_DIR}/thread_cost_model.cc
)
endif()



Loading…
Cancel
Save