dynamic thread cut, version 2

4 years ago · c90ead5d0f
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h
@@ -1,5 +1,5 @@
 /**
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 * Copyright 2020-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -530,6 +530,13 @@ typedef struct QuantMulArg {
  int right_shift_;
 } QuantMulArg;

 /* Per-kernel inputs of the dynamic-thread cost model (consumed by DtCostModel and UpdateThreadNum). */
 typedef struct DtCostContext {
  /* Number of work items; the kernels set it to the element count of their first input tensor before each run. */
  int64_t total_num_;
  /* Per-item load amount; multiplied by DtCostModel::load_cost_ in unit_cost(). */
  float bytes_loaded_;
  /* Per-item store amount; multiplied by DtCostModel::store_cost_ in unit_cost(). */
  float bytes_stored_;
  /* Per-item compute weight; multiplied by DtCostModel::compute_cycles_ in unit_cost(). */
  float compute_cost_;
 } DtCostContext;

 typedef enum ActType { ActType_No, ActType_Relu, ActType_Sigmod, ActType_Relu6, ActType_Prelu } ActType;
 typedef enum PadMode { Pad_pad, Pad_same, Pad_valid } PadMode;
 typedef enum RoundingMode { Rounding_No, Rounding_Away_from_zero, Rounding_Up } RoundingMode;
--- a/mindspore/core/mindrt/src/thread/core_affinity.cc
+++ b/mindspore/core/mindrt/src/thread/core_affinity.cc
@@ -1,5 +1,5 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 * Copyright 2021-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -215,6 +215,32 @@ int GetMaxFrequency(int core_id) {
  return max_freq;
 }

 // Returns the highest "cpu MHz" value listed in /proc/cpuinfo, in MHz.
 // Returns -1.0f when SERVER_INFERENCE is not defined, when the helper command cannot be started, or when no
 // frequency value could be parsed from its output.
 float CoreAffinity::GetServerFrequency() {
  float max_freq = -1.0f;
 #ifdef SERVER_INFERENCE
  // Every "cpu MHz" line is scanned and the maximum is kept, so the result does not depend on which core is
  // listed first.
  FILE *fp = popen("cat /proc/cpuinfo|grep cpu\\ MHz | sed -e 's/.*:[^0-9]//'", "r");
  if (fp == nullptr) {
    THREAD_ERROR("get system cpuinfo frequency failed");
    return max_freq;
  }

  // fscanf() stops returning 1 both at end of input and on the first token that is not a number.
  float freq = 0;
  while (fscanf(fp, "%f", &freq) == 1) {
    if (max_freq < freq) {
      max_freq = freq;
    }
  }
  // A stream obtained from popen() must be released with pclose(): fclose() is undefined for it and does not
  // wait for the child process.
  (void)pclose(fp);
 #endif
  return max_freq;  // MHz
 }

 #ifdef _WIN32
 void SetWindowsAffinity(HANDLE thread, DWORD_PTR mask) {
  THREAD_INFO("Bind thread[%ld] to core[%lld].", GetThreadId(thread), mask);
--- a/mindspore/core/mindrt/src/thread/core_affinity.h
+++ b/mindspore/core/mindrt/src/thread/core_affinity.h
@@ -1,5 +1,5 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 * Copyright 2021-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -58,6 +58,7 @@ class CoreAffinity {
  int BindProcess(BindMode bind_mode);
  std::vector<int> GetCoreId(size_t thread_num, BindMode bind_mode);
  void SetCoreId(const std::vector<int> &core_list);
  static float GetServerFrequency();

 private:
 #ifdef _WIN32
--- a/mindspore/core/mindrt/src/thread/threadpool.cc
+++ b/mindspore/core/mindrt/src/thread/threadpool.cc
@@ -324,6 +324,11 @@ int ThreadPool::InitAffinityInfo() {
    return THREAD_ERROR;
  }
 #endif

 #ifdef SERVER_INFERENCE
  server_cpu_frequence = CoreAffinity::GetServerFrequency() / 1000.0f;  // 1GHz = 1000MHz
 #endif

  return THREAD_OK;
 }

--- a/mindspore/core/mindrt/src/thread/threadpool.h
+++ b/mindspore/core/mindrt/src/thread/threadpool.h
@@ -1,5 +1,5 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 * Copyright 2021-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -149,6 +149,7 @@ class MS_CORE_API ThreadPool {
  void ActiveWorkers() const;
  void SetWorkerIdMap();
  const std::unordered_map<std::thread::id, size_t> &GetWorkerIdMap() const { return worker_ids_; }
  // Returns the stored server CPU frequency. The backing member is declared in GHz and defaults to -1.0f.
  float GetServerCpuFrequence() const { return server_cpu_frequence; }

 protected:
  ThreadPool() = default;
@@ -174,6 +175,7 @@ class MS_CORE_API ThreadPool {
  bool occupied_actor_thread_{true};
  int max_spin_count_{kDefaultSpinCount};
  int min_spin_count_{kMinSpinCount};
  float server_cpu_frequence = -1.0f;  // Unit : GHz
 };
 }  // namespace mindspore
 #endif  // MINDSPORE_CORE_MINDRT_RUNTIME_THREADPOOL_H_
--- a/mindspore/lite/src/inner_context.cc
+++ b/mindspore/lite/src/inner_context.cc
@@ -1,5 +1,5 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 * Copyright 2020-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -399,6 +399,67 @@ void InnerContext::ReplaceLinkInfoSenderWithNewOne(void *new_sender, void *old_s
  }
 }

 #ifdef SERVER_INFERENCE
 // Estimated cost of loading one byte. 11 is the L2 cache latency on Haswell; 64 is presumably the cache-line
 // size in bytes rather than the L2 cache size (TODO confirm).
 float DtCostModel::load_cost_ = 1.0 / 64 * 11;
 // Estimated cost of storing one byte; built from the same two constants as load_cost_.
 float DtCostModel::store_cost_ = 1.0 / 64 * 11;
 // Multiplier applied to DtCostContext::compute_cost_ in unit_cost().
 float DtCostModel::compute_cycles_ = 1.0f;

 // Subtracted from the total cost in thread_num() before the division by per_thread_cycles_.
 int DtCostModel::startup_cycles_ = 100000;
 // Divisor used by thread_num() to turn the remaining cost into a thread count.
 int DtCostModel::per_thread_cycles_ = 100000;
 // Cost budget of one task: divisor in parallel_degree() and numerator in thread_block_size().
 int DtCostModel::task_size_ = 40000;

 // Picks how many blocks the total_num_ work items should be split into for thread_num threads.
 //
 // The starting block size is thread_block_size() clamped to [4 * thread_num, total_num_]. The loop then tries
 // progressively fewer, larger blocks (never more than twice the starting size) and keeps a candidate when its
 // efficiency is within 0.01 of the best seen so far. Efficiency is
 //   block_count / (UP_DIV(block_count, thread_num) * thread_num),
 // i.e. the fraction of thread slots that receive a block when blocks are handed out in rounds of thread_num.
 //
 // Returns the chosen block count; this function does not cap it at thread_num. Returns 1 when there is nothing
 // to split (null context, total_num_ <= 0) or thread_num <= 0.
 int DtCostModel::get_optimal_thread_num(const DtCostContext *dt_cost_context, const int thread_num) {
  // Without this guard block_size can become 0 (total_num_ == 0) or thread_num can be 0, and the UP_DIV calls
  // below would divide by zero.
  if (dt_cost_context == nullptr || dt_cost_context->total_num_ <= 0 || thread_num <= 0) {
    return 1;
  }

  const int64_t max_oversharding_factor = 4;

  int64_t block_size =
    MSVALID(max_oversharding_factor * thread_num, thread_block_size(dt_cost_context), dt_cost_context->total_num_);
  int64_t block_count = UP_DIV(dt_cost_context->total_num_, block_size);

  // Candidate blocks may grow up to twice the starting size, but never beyond the whole workload.
  int64_t max_block_size = MSMIN(dt_cost_context->total_num_, 2 * block_size);
  double max_efficiency = static_cast<double>(block_count) / (UP_DIV(block_count, thread_num) * thread_num);
  for (int64_t prev_block_count = block_count; max_efficiency < 1.0 && prev_block_count > 1;) {
    // Smallest block size that yields at most (prev_block_count - 1) blocks.
    int64_t cur_block_size = UP_DIV(dt_cost_context->total_num_, prev_block_count - 1);
    if (cur_block_size > max_block_size) {
      break;
    }
    // Rounding may drop the count by more than one, so recompute it from the block size.
    const int64_t cur_block_count = UP_DIV(dt_cost_context->total_num_, cur_block_size);
    MS_ASSERT(cur_block_count < prev_block_count);
    prev_block_count = cur_block_count;
    const double cur_efficiency =
      static_cast<double>(cur_block_count) / (UP_DIV(cur_block_count, thread_num) * thread_num);
    if (cur_efficiency + 0.01 >= max_efficiency) {  // update threshold : 0.01
      // Accept the coarser split when it is at most 0.01 less efficient than the best one seen.
      block_size = cur_block_size;
      block_count = cur_block_count;
      if (max_efficiency < cur_efficiency) {
        max_efficiency = cur_efficiency;
      }
    }
  }

  return static_cast<int>(block_count);
 }

 // Reduces task_num according to the cost model described by dt_cost_context.
 //
 // context         : lite context; it is cast to lite::InnerContext to reach the thread pool.
 // dt_cost_context : cost description of the kernel; nullptr leaves task_num untouched.
 // task_num        : thread count requested by the caller.
 //
 // Returns a thread count: task_num itself when it is <= 1, when no cost context is given or when the thread pool
 // is missing; 1 when DtCostModel::thread_num() reports 1; otherwise the truncated parallel degree clamped to
 // [1, task_num]. The return value is never larger than task_num.
 int UpdateThreadNum(const Context *context, const DtCostContext *dt_cost_context, int task_num) {
  if (task_num <= 1) {
    return task_num;
  }
  ThreadPool *pool = static_cast<const lite::InnerContext *>(context)->thread_pool();
  if (pool == nullptr) {
    // Callers store the return value directly as their thread count, so an error code must not be returned
    // here. The requested count is passed through unchanged instead.
    MS_LOG(ERROR) << "thread pool is nullptr";
    return task_num;
  }

  if (dt_cost_context == nullptr) {
    return task_num;
  }
  if (DtCostModel::thread_num(dt_cost_context) == 1) {
    return 1;
  }
  const int opt_thread = static_cast<int>(DtCostModel::parallel_degree(dt_cost_context));
  return MSVALID(1, opt_thread, task_num);
 }
 #endif

 int ParallelLaunch(const Context *context, const Func &func, Content content, int task_num) {
  ThreadPool *pool = static_cast<const lite::InnerContext *>(context)->thread_pool();
  if (pool == nullptr) {
--- a/mindspore/lite/src/inner_context.h
+++ b/mindspore/lite/src/inner_context.h
@@ -1,5 +1,5 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 * Copyright 2020-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -120,6 +120,44 @@ struct InnerContext : public Context {
  std::unordered_map<void *, std::set<void *>> link_info_{};
 };

 #ifdef SERVER_INFERENCE
 // Static cost model used to decide how many threads an element-wise kernel should use.
 // All functions dereference dt_cost_context without checking it for nullptr.
 struct DtCostModel {
  // Estimated cost of one work item: weighted sum of its load, store and compute amounts.
  static float unit_cost(const DtCostContext *dt_cost_context) {
    return load_cost_ * dt_cost_context->bytes_loaded_ + store_cost_ * dt_cost_context->bytes_stored_ +
           dt_cost_context->compute_cost_ * compute_cycles_;
  }

  // Estimated cost of the whole workload: total_num_ items times unit_cost().
  static float total_cost(const DtCostContext *dt_cost_context) {
    return dt_cost_context->total_num_ * unit_cost(dt_cost_context);
  }

  // parallel_degree is the total cost expressed in units of task_size_. A value of 1.0 means the workload is
  // exactly one ideally sized task. Values < 1.0 mean that task granularity needs to be increased to mitigate
  // parallelization overheads.
  static float parallel_degree(const DtCostContext *dt_cost_context) {
    return total_cost(dt_cost_context) / task_size_;
  }

  // Thread count suggested by the cost left after subtracting startup_cycles_, in units of per_thread_cycles_.
  // Adding 0.9 before truncation rounds most fractional results up; the result is never below 1.
  static int thread_num(const DtCostContext *dt_cost_context) {
    return MSMAX(1, static_cast<int>((total_cost(dt_cost_context) - startup_cycles_) / per_thread_cycles_ + 0.9));
  }

  // Number of work items whose combined cost is about task_size_ (truncated towards zero).
  // NOTE(review): divides by unit_cost(); a context with zero unit cost is not handled here.
  static int64_t thread_block_size(const DtCostContext *dt_cost_context) {
    return static_cast<int64_t>(task_size_ / unit_cost(dt_cost_context));
  }
  static int get_optimal_thread_num(const DtCostContext *dt_cost_context, const int thread_num);

  static float load_cost_;   // weight of DtCostContext::bytes_loaded_ in unit_cost()
  static float store_cost_;  // weight of DtCostContext::bytes_stored_ in unit_cost()
  static float compute_cycles_;  // weight of DtCostContext::compute_cost_ in unit_cost()

  static int startup_cycles_;     // cost subtracted in thread_num() before dividing
  static int per_thread_cycles_;  // divisor in thread_num()
  static int task_size_;          // cost of one task; used by parallel_degree() and thread_block_size()
 };

 int UpdateThreadNum(const Context *context, const DtCostContext *dt_cost_context, int task_num);
 #endif

 int ParallelLaunch(const Context *context, const Func &func, Content content, int task_num);
 }  // namespace mindspore::lite

--- a/mindspore/lite/src/inner_kernel.h
+++ b/mindspore/lite/src/inner_kernel.h
@@ -1,5 +1,5 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 * Copyright 2021-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -190,6 +190,7 @@ class InnerKernel : public Kernel {
  size_t workspace_size_ = 0;
  void *workspace_ = nullptr;
  const lite::Context *ms_context_ = nullptr;
  std::unique_ptr<DtCostContext> dt_cost_context_ = nullptr;
 };
 }  // namespace mindspore::kernel

--- a/mindspore/lite/src/runtime/kernel/arm/fp32/activation_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/activation_fp32.cc
@@ -1,5 +1,5 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 * Copyright 2020-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -33,6 +33,30 @@ using mindspore::schema::ActivationType_SWISH;
 using mindspore::schema::PrimitiveType_Activation;

 namespace mindspore::kernel {
 namespace {
 // Compute cost per activation type, used as DtCostContext::compute_cost_ by SetDtCostContext().
 // Only the types listed here get a cost context; for every other type dt_cost_context_ stays unset and Run()
 // does not call UpdateThreadNum(). The commented-out entries are not enabled.
 const std::map<int, float> dt_activation_cost_map_ = {
  {schema::ActivationType_RELU, 1.806f},
  {schema::ActivationType_RELU6, 1.806f},
  {schema::ActivationType_LEAKY_RELU, 1.806f},
  // {schema::ActivationType_SIGMOID, 10.0f}, {schema::ActivationType_TANH, 10.0f},
  // {schema::ActivationType_SWISH, 1.0f}, {schema::ActivationType_HSWISH, 1.0f},
  // {schema::ActivationType_HSIGMOID, 1.0f}, {schema::ActivationType_HARD_TANH, 1.0f},
  // {schema::ActivationType_GELU, 1.0f}, {schema::ActivationType_SOFTPLUS, 1.0f},   {schema::ActivationType_ELU, 1.0f},
 };
 }  // namespace

 #ifdef SERVER_INFERENCE
 // Creates dt_cost_context_ when the kernel's activation type has an entry in dt_activation_cost_map_.
 // Types without an entry leave dt_cost_context_ as it is. Always returns RET_OK.
 int ActivationCPUKernel::SetDtCostContext() {
  const auto cost_iter = dt_activation_cost_map_.find(type_);
  if (cost_iter == dt_activation_cost_map_.end()) {
    return RET_OK;
  }
  dt_cost_context_ = std::make_unique<DtCostContext>();
  dt_cost_context_->compute_cost_ = cost_iter->second;
  dt_cost_context_->bytes_stored_ = 1;
  dt_cost_context_->bytes_loaded_ = 1;
  return RET_OK;
 }
 #endif

 int ActivationCPUKernel::Prepare() {
  CHECK_LESS_RETURN(in_tensors_.size(), 1);
  CHECK_LESS_RETURN(out_tensors_.size(), 1);
@@ -55,6 +79,11 @@ int ActivationCPUKernel::Prepare() {
      return RET_ERROR;
    }
  }
 #ifdef SERVER_INFERENCE
  if (SetDtCostContext() != RET_OK) {
    return RET_ERROR;
  }
 #endif
  return RET_OK;
 }

@@ -163,6 +192,12 @@ int ActivationRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
 }

 int ActivationCPUKernel::Run() {
 #ifdef SERVER_INFERENCE
  if (dt_cost_context_ != nullptr) {
    dt_cost_context_->total_num_ = in_tensors_.at(0)->ElementsNum();
    thread_count_ = UpdateThreadNum(this->ms_context_, dt_cost_context_.get(), thread_count_);
  }
 #endif
  int error_code = ParallelLaunch(this->ms_context_, ActivationRun, this, thread_count_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "Activation function error error_code[" << error_code << "]";
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/activation_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/activation_fp32.h
@@ -1,5 +1,5 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 * Copyright 2020-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_ACTIVATION_H_

 #include <vector>
 #include <map>
 #include <memory>
 #include "src/inner_kernel.h"
 #include "nnacl/fp32/activation_fp32.h"

@@ -34,6 +36,7 @@ class ActivationCPUKernel : public InnerKernel {
  }
  ~ActivationCPUKernel() override = default;

  int SetDtCostContext();
  int Prepare() override;
  int ReSize() override;
  int Run() override;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_fp32.cc
@@ -1,5 +1,5 @@
 /**
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 * Copyright 2020-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -24,10 +24,61 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Eltwise;

 namespace mindspore::kernel {
 namespace {
 // Compute cost keyed by (primitive type, fused activation type), used as DtCostContext::compute_cost_ by
 // SetDtCostContext(). Only the combinations listed here get a cost context; the commented-out entries are not
 // enabled.
 const std::map<std::pair<int, int>, float> dt_arithmetic_cost_map_ = {
  // {{PrimitiveType_MulFusion, schema::ActivationType_RELU}, 1.0f},
  // {{PrimitiveType_MulFusion, schema::ActivationType_RELU6}, 1.0f},
  // {{PrimitiveType_MulFusion, schema::ActivationType_NO_ACTIVATION}, 1.0f},

  {{PrimitiveType_AddFusion, schema::ActivationType_RELU}, 1.806f},
  {{PrimitiveType_AddFusion, schema::ActivationType_RELU6}, 1.806f},
  {{PrimitiveType_AddFusion, schema::ActivationType_NO_ACTIVATION}, 1.275f},

  {{PrimitiveType_SubFusion, schema::ActivationType_RELU}, 1.806f},
  {{PrimitiveType_SubFusion, schema::ActivationType_RELU6}, 1.806f},
  {{PrimitiveType_SubFusion, schema::ActivationType_NO_ACTIVATION}, 1.275f},

  // {{PrimitiveType_DivFusion, schema::ActivationType_RELU}, 1.0f},
  // {{PrimitiveType_DivFusion, schema::ActivationType_RELU6}, 1.0f},
  // {{PrimitiveType_DivFusion, schema::ActivationType_NO_ACTIVATION}, 1.0f},

  // {{PrimitiveType_RealDiv, schema::ActivationType_RELU}, 1.0f},
  // {{PrimitiveType_RealDiv, schema::ActivationType_RELU6}, 1.0f},
  // {{PrimitiveType_RealDiv, schema::ActivationType_NO_ACTIVATION}, 1.0f},

  // {{PrimitiveType_LogicalAnd, schema::ActivationType_NO_ACTIVATION}, 1.0f},
  // {{PrimitiveType_LogicalOr, schema::ActivationType_NO_ACTIVATION}, 1.0f},
  // {{PrimitiveType_Maximum, schema::ActivationType_NO_ACTIVATION}, 1.0f},
  // {{PrimitiveType_Minimum, schema::ActivationType_NO_ACTIVATION}, 1.0f},
  // {{PrimitiveType_FloorMod, schema::ActivationType_NO_ACTIVATION}, 1.0f},
  // {{PrimitiveType_FloorDiv, schema::ActivationType_NO_ACTIVATION}, 1.0f},
  // {{PrimitiveType_Mod, schema::ActivationType_NO_ACTIVATION}, 1.0f},
  // {{PrimitiveType_SquaredDifference, schema::ActivationType_NO_ACTIVATION}, 1.0f},
 };
 }  // namespace

 #ifdef SERVER_INFERENCE
 // Creates dt_cost_context_ when the (primitive type, activation type) pair of this kernel has an entry in
 // dt_arithmetic_cost_map_. Pairs without an entry leave dt_cost_context_ as it is. Always returns RET_OK.
 int ArithmeticCPUKernel::SetDtCostContext() {
  const std::pair<int, int> fusion_key(param_->op_parameter_.type_, param_->activation_type_);
  const auto cost_iter = dt_arithmetic_cost_map_.find(fusion_key);
  if (cost_iter == dt_arithmetic_cost_map_.end()) {
    return RET_OK;
  }
  dt_cost_context_ = std::make_unique<DtCostContext>();
  dt_cost_context_->compute_cost_ = cost_iter->second;
  dt_cost_context_->bytes_stored_ = 1;
  dt_cost_context_->bytes_loaded_ = 1;
  return RET_OK;
 }
 #endif

 int ArithmeticCPUKernel::Prepare() {
  CHECK_LESS_RETURN(in_tensors_.size(), C2NUM);
  CHECK_LESS_RETURN(out_tensors_.size(), 1);
  auto primitive_type = param_->op_parameter_.type_;
 #ifdef SERVER_INFERENCE
  if (SetDtCostContext() != RET_OK) {
    return RET_ERROR;
  }
 #endif
  if (primitive_type == schema::PrimitiveType_Eltwise) {
    switch (param_->eltwise_mode_) {
      case schema::EltwiseMode_PROD:
@@ -437,9 +488,17 @@ int ArithmeticCPUKernel::Run() {
  }
  output_ptr_ = out_tensors_[0]->data();
  CHECK_NULL_RETURN(output_ptr_);

  batch_a_ptr_ = static_cast<uint8_t *>(input0_ptr_);
  batch_b_ptr_ = static_cast<uint8_t *>(input1_ptr_);
  batch_c_ptr_ = static_cast<uint8_t *>(output_ptr_);

 #ifdef SERVER_INFERENCE
  if (dt_cost_context_ != nullptr) {
    dt_cost_context_->total_num_ = in_tensors_.at(0)->ElementsNum();
    op_parameter_->thread_num_ = UpdateThreadNum(this->ms_context_, dt_cost_context_.get(), op_parameter_->thread_num_);
  }
 #endif
  auto ret = ParallelLaunch(this->ms_context_, ArithmeticsRun, this, op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "arithmetic failed";
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_fp32.h
@@ -1,5 +1,5 @@
 /**
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 * Copyright 2020-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,6 +17,9 @@
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_ARITHMETIC_FP32_H_

 #include <vector>
 #include <map>
 #include <memory>
 #include <utility>
 #include "src/inner_kernel.h"
 #include "nnacl/fp32/arithmetic_fp32.h"

@@ -114,6 +117,7 @@ class ArithmeticCPUKernel : public InnerKernel {
  int BiasCalc(int task_id);
  void FreeConstTileBuff();
  bool IsBiasCalc() const;
  int SetDtCostContext();
  ArithmeticRun arithmetic_run_ = nullptr;
  ArithmeticOptRun arithmetic_opt_run_ = nullptr;
  ArithmeticIntRun arithmetic_run_int_ = nullptr;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self_fp32.cc
@@ -1,5 +1,5 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 * Copyright 2020-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -27,8 +27,37 @@ struct TYPE_FUNC_INFO {
  int primitive_type_ = 0;
  ArithmeticSelfFunc func_ = nullptr;
 };

 // Compute cost per unary primitive type, used as DtCostContext::compute_cost_ by SetDtCostContext().
 // Only the types listed here get a cost context; the commented-out entries are not enabled.
 const std::map<int, float> dt_arithmetic_self_cost_map_ = {
  // {schema::PrimitiveType_Abs, 0.5f},
  // {schema::PrimitiveType_Cos, 1.0f},
  // {schema::PrimitiveType_Log, 1.0f},
  // {schema::PrimitiveType_Square, 10.0f},
  {schema::PrimitiveType_Sqrt, 1.806f},
  // {schema::PrimitiveType_Rsqrt, 1.0f},
  // {schema::PrimitiveType_Sin, 1.0f},
  // {schema::PrimitiveType_LogicalNot, 1.0f},
  // {schema::PrimitiveType_Floor, 1.0f},
  // {schema::PrimitiveType_Ceil, 1.0f},
  // {schema::PrimitiveType_Round, 1.0f},
  // {schema::PrimitiveType_Neg, 1.0f},
  // {schema::PrimitiveType_Reciprocal, 1.0f},
  // {schema::PrimitiveType_Erf, 1.0f},
 };
 }  // namespace

 #ifdef SERVER_INFERENCE
 // Creates dt_cost_context_ when the kernel's primitive type has an entry in dt_arithmetic_self_cost_map_.
 // Types without an entry leave dt_cost_context_ as it is. Always returns RET_OK.
 int ArithmeticSelfCPUKernel::SetDtCostContext() {
  const auto cost_iter = dt_arithmetic_self_cost_map_.find(type_);
  if (cost_iter == dt_arithmetic_self_cost_map_.end()) {
    return RET_OK;
  }
  dt_cost_context_ = std::make_unique<DtCostContext>();
  dt_cost_context_->compute_cost_ = cost_iter->second;
  dt_cost_context_->bytes_stored_ = 1;
  dt_cost_context_->bytes_loaded_ = 1;
  return RET_OK;
 }
 #endif

 ArithmeticSelfFunc ArithmeticSelfCPUKernel::GetArithmeticSelfFun(int primitive_type) const {
  TYPE_FUNC_INFO type_func_table[] = {{mindspore::schema::PrimitiveType_Abs, ElementAbs},
                                      {mindspore::schema::PrimitiveType_Cos, ElementCos},
@@ -62,6 +91,11 @@ ArithmeticSelfBoolFunc ArithmeticSelfCPUKernel::GetArithmeticSelfBoolFun(int pri
 int ArithmeticSelfCPUKernel::Prepare() {
  CHECK_NOT_EQUAL_RETURN(in_tensors_.size(), 1);
  CHECK_NOT_EQUAL_RETURN(out_tensors_.size(), 1);
 #ifdef SERVER_INFERENCE
  if (SetDtCostContext() != RET_OK) {
    return RET_ERROR;
  }
 #endif
  if (!InferShapeDone()) {
    return RET_OK;
  }
@@ -117,6 +151,12 @@ int ArithmeticSelfRun(void *cdata, int task_id, float lhs_scale, float rhs_scale
 }

 int ArithmeticSelfCPUKernel::Run() {
 #ifdef SERVER_INFERENCE
  if (dt_cost_context_ != nullptr) {
    dt_cost_context_->total_num_ = in_tensors_.at(0)->ElementsNum();
    op_parameter_->thread_num_ = UpdateThreadNum(this->ms_context_, dt_cost_context_.get(), op_parameter_->thread_num_);
  }
 #endif
  auto ret = ParallelLaunch(this->ms_context_, ArithmeticSelfRun, this, op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ArithmeticSelfRun error error_code[" << ret << "]";
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self_fp32.h
@@ -1,5 +1,5 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 * Copyright 2020-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,6 +17,8 @@
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_ARITHMETIC_SELF_H_

 #include <vector>
 #include <map>
 #include <memory>
 #include "src/inner_kernel.h"

 using mindspore::schema::PrimitiveType_Abs;
@@ -47,6 +49,7 @@ class ArithmeticSelfCPUKernel : public InnerKernel {
  }
  ~ArithmeticSelfCPUKernel() override = default;

  int SetDtCostContext();
  int Prepare() override;
  int ReSize() override;
  int Run() override;
--- a/mindspore/lite/src/runtime/kernel/arm/int8/dynamic_quant.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/dynamic_quant.h
@@ -50,8 +50,8 @@ class DynamicQuantCPUKernel : public InnerKernel {

  float real_min_array_[8];
  float real_max_array_[8];
  float real_min_;
  float real_max_;
  // NOTE(review): assumes these two members are running min/max accumulators - confirm against the kernel source.
  // Seed for the minimum: FLT_MAX is the largest finite float, so any finite sample compares smaller or equal.
  float real_min_ = FLT_MAX;
  // Seed for the maximum: FLT_MIN is the smallest positive normalized float, not the most negative value, so it
  // would compare greater than every negative sample. -FLT_MAX is the most negative finite float.
  float real_max_ = -FLT_MAX;
  int32_t src_dtype_{0};
  int32_t dst_dtype_{0};
  bool symmetric_ = false;