!15176 [MS][LITE]Change lite_kernel.h interface of InnerContext

From: @gongdaguo Reviewed-by: @jpc_chenjianping,@zhanghaibo5 Signed-off-by: @jpc_chenjianping
5 years ago · c71ae4e831
--- a/mindspore/lite/schema/ops.fbs
+++ b/mindspore/lite/schema/ops.fbs
@@ -841,7 +841,7 @@ table Rsqrt {
 }
 table QuantDTypeCast {
    src_t: long; // deprecated
    src_t: long;
    dst_t: long;
 }
--- a/mindspore/lite/src/lite_kernel.h
+++ b/mindspore/lite/src/lite_kernel.h
@@ -30,6 +30,7 @@
 #include "src/tensor.h"
 #include "include/errorcode.h"
 #include "schema/model_generated.h"
 #include "include/context.h"
 namespace mindspore::kernel {
 enum KERNEL_ARCH {
@@ -64,7 +65,7 @@ class LiteKernel {
 public:
  LiteKernel() = default;
  LiteKernel(OpParameter *parameter, std::vector<lite::Tensor *> in_tensors, std::vector<lite::Tensor *> out_tensors,
             const lite::InnerContext *ctx)
             const lite::Context *ctx)
      : op_parameter_(parameter),
        in_tensors_(std::move(in_tensors)),
        out_tensors_(std::move(out_tensors)),
@@ -175,7 +176,7 @@ class LiteKernel {
  SubGraphType subgraph_type() const { return this->subgraph_type_; }
  const lite::InnerContext *context() const { return this->context_; }
  const lite::Context *context() const { return this->context_; }
  virtual std::string ToString() const;
@@ -202,7 +203,7 @@ class LiteKernel {
  // tensor will free in ~lite_session()
  std::vector<lite::Tensor *> in_tensors_;
  std::vector<lite::Tensor *> out_tensors_;
  const lite::InnerContext *context_ = nullptr;
  const lite::Context *context_ = nullptr;
  std::vector<LiteKernel *> in_kernels_;
  std::vector<LiteKernel *> out_kernels_;
  bool train_mode_ = false;
@@ -217,13 +218,13 @@ class LiteKernel {
 typedef LiteKernel *(*KernelCreator)(const std::vector<lite::Tensor *> &inputs,
                                     const std::vector<lite::Tensor *> &outputs, OpParameter *parameter,
                                     const lite::InnerContext *ctx, const KernelKey &desc);
                                     const lite::Context *ctx, const KernelKey &desc);
 template <class T>
 kernel::LiteKernel *LiteKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                      const std::vector<lite::Tensor *> &outputs, OpParameter *parameter,
                                      const lite::InnerContext *ctx, const kernel::KernelKey &desc) {
  auto *kernel = new (std::nothrow) T(parameter, inputs, outputs, ctx);
                                      const lite::Context *ctx, const kernel::KernelKey &desc) {
  auto *kernel = new (std::nothrow) T(parameter, inputs, outputs, static_cast<const lite::InnerContext *>(ctx));
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "kernel: " << parameter->name_ << "is nullptr.";
    free(parameter);
--- a/mindspore/lite/src/runtime/agent/npu/subgraph_npu_kernel.cc
+++ b/mindspore/lite/src/runtime/agent/npu/subgraph_npu_kernel.cc
@@ -206,7 +206,8 @@ int SubGraphNpuKernel::Init() {
    MS_ASSERT(npu_manager_ != nullptr);
    npu_manager_->AddModel(model_buffer_data, GetOMModelName(), context_->GetNpuInfo().frequency_);
    npu_manager_->AddModel(model_buffer_data, GetOMModelName(),
                           static_cast<const lite::InnerContext *>(context_)->GetNpuInfo().frequency_);
    executor_ = new (std::nothrow) mindspore::lite::NPUExecutor(GetOMModelName(), npu_manager_);
--- a/mindspore/lite/src/runtime/kernel/arm/base/constant_of_shape.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/constant_of_shape.cc
@@ -73,7 +73,8 @@ int ConstantOfShapeCPUKernel::Run() {
  int thread_count = MSMIN(op_parameter_->thread_num_, param_->element_size_);
  thread_stride_ = UP_DIV(param_->element_size_, thread_count);
  auto ret = ParallelLaunch(this->context_->thread_pool_, ConstantOfShapeRun, this, thread_count);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ConstantOfShapeRun,
                            this, thread_count);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConstantOfShapeRun error error_code[" << ret << "]";
    return ret;
--- a/mindspore/lite/src/runtime/kernel/arm/base/detection_post_process_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/detection_post_process_base.cc
@@ -144,17 +144,7 @@ void DetectionPostProcessBaseCPUKernel::FreeAllocatedBuffer() {
  }
 }
 int DetectionPostProcessBaseCPUKernel::Run() {
  MS_ASSERT(context_->allocator != nullptr);
  int status = GetInputData();
  if (status != RET_OK) {
    return status;
  }
  auto output_boxes = reinterpret_cast<float *>(out_tensors_.at(0)->data_c());
  auto output_classes = reinterpret_cast<float *>(out_tensors_.at(1)->data_c());
  auto output_scores = reinterpret_cast<float *>(out_tensors_.at(2)->data_c());
  auto output_num = reinterpret_cast<float *>(out_tensors_.at(3)->data_c());
 int DetectionPostProcessBaseCPUKernel::ParamInit() {
  num_boxes_ = in_tensors_.at(0)->shape().at(1);
  num_classes_with_bg_ = in_tensors_.at(1)->shape().at(2);
  params_->decoded_boxes_ = context_->allocator->Malloc(num_boxes_ * 4 * sizeof(float));
@@ -221,6 +211,24 @@ int DetectionPostProcessBaseCPUKernel::Run() {
      return RET_ERROR;
    }
  }
  return RET_OK;
 }
 int DetectionPostProcessBaseCPUKernel::Run() {
  MS_ASSERT(context_->allocator != nullptr);
  int status = GetInputData();
  if (status != RET_OK) {
    return status;
  }
  auto output_boxes = reinterpret_cast<float *>(out_tensors_.at(0)->data_c());
  auto output_classes = reinterpret_cast<float *>(out_tensors_.at(1)->data_c());
  auto output_scores = reinterpret_cast<float *>(out_tensors_.at(2)->data_c());
  auto output_num = reinterpret_cast<float *>(out_tensors_.at(3)->data_c());
  if (ParamInit() != RET_OK) {
    MS_LOG(ERROR) << "ParamInit error";
    return status;
  }
  status = DecodeBoxes(num_boxes_, input_boxes_, params_->anchors_, params_);
  if (status != RET_OK) {
@@ -238,7 +246,8 @@ int DetectionPostProcessBaseCPUKernel::Run() {
      return status;
    }
  } else {
    status = ParallelLaunch(this->context_->thread_pool_, NmsMultiClassesFastCoreRun, this, op_parameter_->thread_num_);
    status = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                            NmsMultiClassesFastCoreRun, this, op_parameter_->thread_num_);
    if (status != RET_OK) {
      MS_LOG(ERROR) << "NmsMultiClassesFastCoreRun error error_code[" << status << "]";
      FreeAllocatedBuffer();
--- a/mindspore/lite/src/runtime/kernel/arm/base/detection_post_process_base.h
+++ b/mindspore/lite/src/runtime/kernel/arm/base/detection_post_process_base.h
@@ -47,6 +47,7 @@ class DetectionPostProcessBaseCPUKernel : public LiteKernel {
 protected:
  virtual int GetInputData() = 0;
  int ParamInit();
 private:
  void FreeAllocatedBuffer();
--- a/mindspore/lite/src/runtime/kernel/arm/base/prior_box.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/prior_box.cc
@@ -166,7 +166,8 @@ int RunPriorBox(void *cdata, int task_id) {
 }
 int PriorBoxCPUKernel::Run() {
  int error_code = ParallelLaunch(this->context_->thread_pool_, RunPriorBox, this, thread_count_);
  int error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, RunPriorBox,
                                  this, thread_count_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "PriorBox run error, error_code[" << error_code << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/base/quant_dtype_cast.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/quant_dtype_cast.cc
@@ -172,7 +172,8 @@ int QuantDTypeCastCPUKernel::Run() {
    uint8_ptr_ = reinterpret_cast<uint8_t *>(out_tensors_[0]->data_c());
  }
  auto ret = ParallelLaunch(this->context_->thread_pool_, QuantDTypeCastRun, this, thread_n_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, QuantDTypeCastRun,
                            this, thread_n_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Scale error error_code[" << ret << "]";
    if (in_tensors_[0]->data_type() == TypeId::kNumberTypeInt8 &&
--- a/mindspore/lite/src/runtime/kernel/arm/base/reshape_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/reshape_base.cc
@@ -66,7 +66,8 @@ int ReshapeRun(void *cdata, int task_id) {
 int ReshapeBaseCPUKernel::Run() {
  input_ptr_ = reinterpret_cast<uint8_t *>(in_tensors_.at(kInputIndex)->data_c());
  output_ptr_ = reinterpret_cast<uint8_t *>(out_tensors_.at(kOutputIndex)->data_c());
  auto ret = ParallelLaunch(this->context_->thread_pool_, ReshapeRun, this, context_->thread_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ReshapeRun, this,
                            context_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Reshape run error error_code[" << ret << "]";
    return ret;
--- a/mindspore/lite/src/runtime/kernel/arm/base/slice_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/slice_base.cc
@@ -81,7 +81,8 @@ int SliceCPUKernel::Run() {
                      lite::DataTypeSize(in_tensors_.at(0)->data_type()));
    return RET_OK;
  }
  auto ret = ParallelLaunch(this->context_->thread_pool_, SliceLaunch, this, op_parameter_->thread_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, SliceLaunch, this,
                            op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "slice launch fail!ret: " << ret;
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/base/split_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/split_base.cc
@@ -120,7 +120,8 @@ int SplitBaseCPUKernel::Run() {
    output_ptr_.at(i) = output_tensor->data_c();
  }
  auto ret = ParallelLaunch(this->context_->thread_pool_, SplitRun, this, thread_n_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, SplitRun, this,
                            thread_n_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "split error error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/base/stack_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/stack_base.cc
@@ -100,7 +100,8 @@ int StackBaseCPUKernel::Run() {
  }
  // run stack
  num_threads_ = MSMIN(UP_DIV(outer_size_, 64), this->context_->thread_num_);
  auto ret = ParallelLaunch(this->context_->thread_pool_, StackRun, this, num_threads_);
  auto ret =
    ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, StackRun, this, num_threads_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "StackBaseCPUKernel Run error: error_code[" << ret << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/base/strided_slice.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/strided_slice.cc
@@ -157,7 +157,8 @@ int StridedSliceCPUKernel::FastRun() {
  }
  input_ptr_ = reinterpret_cast<uint8_t *>(in_tensors_.front()->data_c());
  output_ptr_ = reinterpret_cast<uint8_t *>(out_tensors_.front()->data_c());
  auto ret = ParallelLaunch(this->context_->thread_pool_, StrideRun, this, context_->thread_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, StrideRun, this,
                            context_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Stride run error error_code[" << ret << "]";
    return ret;
--- a/mindspore/lite/src/runtime/kernel/arm/base/tile_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/tile_base.cc
@@ -127,7 +127,8 @@ int TileCPUKernel::SimpleTileImpl(int task_id) {
 }
 int TileCPUKernel::RunSimpleTile() {
  auto ret = ParallelLaunch(context_->thread_pool_, SimpleTile, this, context_->thread_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, SimpleTile, this,
                            context_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "RunSimpleTile error code[" << ret << "]";
    return ret;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/activation_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/activation_fp16.cc
@@ -100,7 +100,8 @@ int ActivationFp16CPUKernel::Run() {
  fp16_input_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
  fp16_output_ = reinterpret_cast<float16_t *>(output_tensor->data_c());
  int error_code = ParallelLaunch(this->context_->thread_pool_, ActivationFp16Run, this, thread_count_);
  int error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                                  ActivationFp16Run, this, thread_count_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "Activation function error error_code[" << error_code << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_compare_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_compare_fp16.cc
@@ -160,15 +160,16 @@ int ArithmeticCompareFP16CPUKernel::Run() {
  is_input0_fp32_ = in_tensors_.at(0)->data_type() == kNumberTypeFloat32;
  is_input1_fp32_ = in_tensors_.at(1)->data_type() == kNumberTypeFloat32;
  input0_fp16_ = ConvertInputFp32toFp16(in_tensors_.at(0), context_);
  input1_fp16_ = ConvertInputFp32toFp16(in_tensors_.at(1), context_);
  input0_fp16_ = ConvertInputFp32toFp16(in_tensors_.at(0), static_cast<const lite::InnerContext *>(this->context_));
  input1_fp16_ = ConvertInputFp32toFp16(in_tensors_.at(1), static_cast<const lite::InnerContext *>(this->context_));
  output_fp16_ = reinterpret_cast<uint8_t *>(output_tensor->MutableData());
  if (input0_fp16_ == nullptr || input1_fp16_ == nullptr || output_fp16_ == nullptr) {
    MS_LOG(ERROR) << "Memory allocation failed";
    FreeTmpBuffer();
    return RET_ERROR;
  }
  auto ret = ParallelLaunch(this->context_->thread_pool_, ArithmeticsRunFp16, this, context_->thread_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ArithmeticsRunFp16,
                            this, context_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ArithmeticsRunFp16 run error error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_fp16.cc
@@ -127,13 +127,13 @@ void ArithmeticFP16CPUKernel::InitRunFunction(int primitive_type) {
 int ArithmeticFP16CPUKernel::ConstTensorBroadCast() {
  int ret;
  if (in_tensors_[0]->data_c() != nullptr) {
    ret = ConvertFp32TensorToFp16(in_tensors_[0], context_);
    ret = ConvertFp32TensorToFp16(in_tensors_[0], static_cast<const lite::InnerContext *>(this->context_));
    if (ret != RET_OK) {
      return ret;
    }
  }
  if (in_tensors_[1]->data_c() != nullptr) {
    ret = ConvertFp32TensorToFp16(in_tensors_[1], context_);
    ret = ConvertFp32TensorToFp16(in_tensors_[1], static_cast<const lite::InnerContext *>(this->context_));
    if (ret != RET_OK) {
      return ret;
    }
@@ -167,18 +167,19 @@ int ArithmeticFP16CPUKernel::Run() {
    return RET_ERROR;
  }
  if (!input0_broadcast_) {
    input0_ptr_ = ConvertInputFp32toFp16(in_tensors_.at(0), context_);
    input0_ptr_ = ConvertInputFp32toFp16(in_tensors_.at(0), static_cast<const lite::InnerContext *>(this->context_));
  }
  if (!input1_broadcast_) {
    input1_ptr_ = ConvertInputFp32toFp16(in_tensors_.at(1), context_);
    input1_ptr_ = ConvertInputFp32toFp16(in_tensors_.at(1), static_cast<const lite::InnerContext *>(this->context_));
  }
  auto output_tensor = out_tensors_.at(0);
  output_ptr_ = MallocOutputFp16(output_tensor, context_);
  output_ptr_ = MallocOutputFp16(output_tensor, static_cast<const lite::InnerContext *>(this->context_));
  if (input0_ptr_ == nullptr || input1_ptr_ == nullptr || output_ptr_ == nullptr) {
    FreeFp16Buffer();
    return RET_ERROR;
  }
  auto ret = ParallelLaunch(this->context_->thread_pool_, ArithmeticsRun, this, context_->thread_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ArithmeticsRun, this,
                            context_->thread_num_);
  if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
    Float16ToFloat32(static_cast<float16_t *>(output_ptr_), reinterpret_cast<float *>(output_tensor->MutableData()),
                     output_tensor->ElementsNum());
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_self_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_self_fp16.cc
@@ -77,13 +77,14 @@ int ArithmeticSelfFp16CPUKernel::Run() {
  auto output_tensor = out_tensors_.at(0);
  if (input_tensor->data_type() == kNumberTypeFloat32) {
    input_fp16_ptr_ = ConvertInputFp32toFp16(input_tensor, context_);
    input_fp16_ptr_ = ConvertInputFp32toFp16(input_tensor, static_cast<const lite::InnerContext *>(this->context_));
  } else {
    input_fp16_ptr_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
  }
  output_fp16_ptr_ = reinterpret_cast<float16_t *>(output_tensor->data_c());
  auto ret = ParallelLaunch(this->context_->thread_pool_, ArithmeticSelfRun, this, op_parameter_->thread_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ArithmeticSelfRun,
                            this, op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ArithmeticSelfRun error error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/batchnorm_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/batchnorm_fp16.cc
@@ -51,15 +51,16 @@ int BatchnormFp16CPUKernel::InitConstTensor() {
 int BatchnormFp16CPUKernel::Run() {
  auto input_tensor = in_tensors_.at(0);
  auto output_tensor = out_tensors_.at(0);
  input_ = ConvertInputFp32toFp16(input_tensor, context_);
  output_ = MallocOutputFp16(output_tensor, context_);
  input_ = ConvertInputFp32toFp16(input_tensor, static_cast<const lite::InnerContext *>(this->context_));
  output_ = MallocOutputFp16(output_tensor, static_cast<const lite::InnerContext *>(this->context_));
  if (input_ == nullptr || output_ == nullptr) {
    FreeInputAndOutput();
    MS_LOG(ERROR) << "input or output is nullptr";
    return RET_ERROR;
  }
  auto ret = ParallelLaunch(this->context_->thread_pool_, BatchNormRun, this, op_parameter_->thread_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, BatchNormRun, this,
                            op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "BatchnormRun error error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/cast_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/cast_fp16.cc
@@ -132,7 +132,8 @@ int CastFp16CPUKernel::Run() {
  if (data_num_ == 0) {
    return RET_OK;
  }
  return ParallelLaunch(this->context_->thread_pool_, CastFp16Run, this, op_parameter_->thread_num_);
  return ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, CastFp16Run, this,
                        op_parameter_->thread_num_);
 }
 REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Cast, LiteKernelCreator<CastFp16CPUKernel>)
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
@@ -236,14 +236,16 @@ int Convolution1x1FP16CPUKernel::Run() {
    int ret = RET_ERROR;
    if (multi_thread_by_hw_) {
      ret = ParallelLaunch(this->context_->thread_pool_, Convolution1x1Fp16RunHw, this, thread_count_);
      ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                           Convolution1x1Fp16RunHw, this, thread_count_);
    } else {
 #ifdef ENABLE_ARM64
      RowMajor2Col16MajorFp16Opt(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
 #else
      RowMajor2Col12MajorFp16Opt(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
 #endif
      ret = ParallelLaunch(this->context_->thread_pool_, Convolution1x1Fp16RunOc, this, thread_count_);
      ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                           Convolution1x1Fp16RunOc, this, thread_count_);
    }
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "ParallelLaunch failed.";
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.cc
@@ -95,10 +95,11 @@ static void SetInputOutputShapeInfo(ConvParameter *conv_param, lite::Tensor *inp
 int ConvolutionDelegateFP16CPUKernel::ReSize() {
  // Update shape info of input and output
  kernel::SetInputOutputShapeInfo(reinterpret_cast<ConvParameter *>(op_parameter_), in_tensors_.front(),
                                  out_tensors_.front(), context_);
                                  out_tensors_.front(), static_cast<const lite::InnerContext *>(this->context_));
  if (fp16_conv_kernel_ == nullptr) {
    fp16_conv_kernel_ =
      CpuConvFp16KernelSelect(in_tensors_, out_tensors_, op_parameter_, context_, origin_weight_, origin_bias_);
      CpuConvFp16KernelSelect(in_tensors_, out_tensors_, op_parameter_,
                              static_cast<const lite::InnerContext *>(context_), origin_weight_, origin_bias_);
    if (fp16_conv_kernel_ == nullptr) {
      MS_LOG(ERROR) << "Selecting execute kernel failed for conv_kernel, got a nullptr.";
      return RET_ERROR;
@@ -184,7 +185,7 @@ kernel::LiteKernel *CpuGroupConvFp16KernelCreator(const std::vector<lite::Tensor
 /* creator func */
 kernel::LiteKernel *CpuConvFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
                                             const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                             const InnerContext *ctx, const kernel::KernelKey &desc) {
                                             const lite::Context *ctx, const kernel::KernelKey &desc) {
  MS_ASSERT(opParameter != nullptr);
  MS_ASSERT(desc.type == schema::PrimitiveType_Conv2DFusion);
@@ -200,11 +201,12 @@ kernel::LiteKernel *CpuConvFp16KernelCreator(const std::vector<lite::Tensor *> &
  auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
  kernel::LiteKernel *kernel = nullptr;
  if (conv_param->group_ == 1) {
    kernel = new (std::nothrow) kernel::ConvolutionDelegateFP16CPUKernel(opParameter, inputs, outputs, ctx);
    kernel = new (std::nothrow) kernel::ConvolutionDelegateFP16CPUKernel(opParameter, inputs, outputs,
                                                                         static_cast<const lite::InnerContext *>(ctx));
  } else if (conv_param->group_ == conv_param->input_channel_ && conv_param->group_ == conv_param->output_channel_) {
    kernel = CpuConvDwFp16KernelCreator(inputs, outputs, opParameter, ctx);
    kernel = CpuConvDwFp16KernelCreator(inputs, outputs, opParameter, static_cast<const lite::InnerContext *>(ctx));
  } else {
    kernel = CpuGroupConvFp16KernelCreator(inputs, outputs, opParameter, ctx);
    kernel = CpuGroupConvFp16KernelCreator(inputs, outputs, opParameter, static_cast<const lite::InnerContext *>(ctx));
  }
  if (kernel == nullptr) {
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
@@ -104,7 +104,8 @@ static int ConvDwFp16Run(void *cdata, int task_id) {
 }
 int ConvolutionDepthwiseFp16CPUKernel::Run() {
  auto ret = ParallelLaunch(this->context_->thread_pool_, ConvDwFp16Run, this, conv_param_->thread_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ConvDwFp16Run, this,
                            conv_param_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvDwFp16Run error: error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc
@@ -155,7 +155,8 @@ int ConvolutionDepthwiseSWFp16CPUKernel::Run() {
    packed_output_ = output_ptr;
  }
  ret = ParallelLaunch(this->context_->thread_pool_, ConvDwSWFp16Run, this, conv_param_->thread_num_);
  ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ConvDwSWFp16Run, this,
                       conv_param_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvDwSWFp16Run error: error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
@@ -144,7 +144,8 @@ int ConvolutionFP16CPUKernel::Run() {
    return RET_ERROR;
  }
  ret = ParallelLaunch(this->context_->thread_pool_, ConvolutionFp16Impl, this, thread_count_);
  ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ConvolutionFp16Impl, this,
                       thread_count_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "conv fp16 error ret[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
@@ -213,7 +213,8 @@ int ConvolutionWinogradFP16CPUKernel::Run() {
    return RET_ERROR;
  }
  ret = ParallelLaunch(this->context_->thread_pool_, ConvolutionWinogradFp16Impl, this, thread_count_);
  ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                       ConvolutionWinogradFp16Impl, this, thread_count_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "conv winograd error error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/crop_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/crop_fp16.cc
@@ -53,7 +53,8 @@ int CropFp16CPUKernel::Run() {
  input_ptr_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
  output_ptr_ = reinterpret_cast<float16_t *>(output_tensor->data_c());
  auto ret = ParallelLaunch(this->context_->thread_pool_, CropFp16Run, this, crop_para_->thread_count_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, CropFp16Run, this,
                            crop_para_->thread_count_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ParallelLaunch failed: " << ret;
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
@@ -173,7 +173,8 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
    memset(output_ptr, 0, out_tensors_.at(kOutputIndex)->ElementsNum() * sizeof(float16_t));
    packed_output_ = output_ptr;
  }
  ret = ParallelLaunch(this->context_->thread_pool_, DeconvDwFp16Run, this, conv_param_->thread_num_);
  ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, DeconvDwFp16Run, this,
                       conv_param_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "DeconvDwFp16Run error: error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc
@@ -217,7 +217,8 @@ int DeConvolutionFp16CPUKernel::Run() {
    RowMajor2Col16MajorFp16Opt(batch_input_, pack_input_, input_plane_, conv_param_->input_channel_);
    error_code = ParallelLaunch(this->context_->thread_pool_, DeConvFp16Run, this, thread_count_);
    error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, DeConvFp16Run,
                                this, thread_count_);
    if (error_code != RET_OK) {
      MS_LOG(ERROR) << "deconv fp16 run error! error_code[" << error_code << "]";
    }
@@ -229,7 +230,7 @@ int DeConvolutionFp16CPUKernel::Run() {
 kernel::LiteKernel *CpuDeConvFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
                                               const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
                                               const lite::InnerContext *ctx, const kernel::KernelKey &desc) {
                                               const lite::Context *ctx, const kernel::KernelKey &desc) {
  MS_ASSERT(op_parameter != nullptr);
  MS_ASSERT(desc.type == schema::PrimitiveType_Conv2dTransposeFusion);
@@ -238,12 +239,15 @@ kernel::LiteKernel *CpuDeConvFp16KernelCreator(const std::vector<lite::Tensor *>
  if (conv_param->group_ == 1) {
    if ((conv_param->stride_h_ != 1 || conv_param->stride_w_ != 1) &&
        (conv_param->dilation_h_ == 1 && conv_param->dilation_w_ == 1)) {
      kernel = new (std::nothrow) kernel::DeConvWinogradFp16CPUKernel(op_parameter, inputs, outputs, ctx);
      kernel = new (std::nothrow) kernel::DeConvWinogradFp16CPUKernel(op_parameter, inputs, outputs,
                                                                      static_cast<const lite::InnerContext *>(ctx));
    } else {
      kernel = new (std::nothrow) kernel::DeConvolutionFp16CPUKernel(op_parameter, inputs, outputs, ctx);
      kernel = new (std::nothrow)
        kernel::DeConvolutionFp16CPUKernel(op_parameter, inputs, outputs, static_cast<const lite::InnerContext *>(ctx));
    }
  } else if (conv_param->group_ == conv_param->input_channel_ && conv_param->group_ == conv_param->output_channel_) {
    kernel = new (std::nothrow) DeconvolutionDepthwiseFp16CPUKernel(op_parameter, inputs, outputs, ctx);
    kernel = new (std::nothrow)
      DeconvolutionDepthwiseFp16CPUKernel(op_parameter, inputs, outputs, static_cast<const lite::InnerContext *>(ctx));
  }
  if (kernel == nullptr) {
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc
@@ -392,10 +392,12 @@ int DeConvWinogradFp16CPUKernel::Run() {
    nhwc_output_ = output_ptr + batch_index * deconv_param_->output_plane_ * conv_param_->output_channel_;
    ::memset(nc4hw4_output_, 0, deconv_param_->output_plane_ * deconv_param_->oc_div4_ * C4NUM * sizeof(float16_t));
    ParallelLaunch(this->context_->thread_pool_, DeConvWgFp16Run, this, deconv_param_->thread_num_);
    ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, DeConvWgFp16Run, this,
                   deconv_param_->thread_num_);
    /*post bias activate and nhwc */
    ParallelLaunch(this->context_->thread_pool_, DeConvWgPostFp16Run, this, thread_num_hw_);
    ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, DeConvWgPostFp16Run, this,
                   thread_num_hw_);
  }
  return RET_OK;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/gather_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/gather_fp16.cc
@@ -151,7 +151,8 @@ int GatherFp16CPUKernel::Run() {
      Float32ToFloat16(reinterpret_cast<float *>(input_tensor->data_c()), input_data_, input_tensor->ElementsNum());
    }
  }
  ret = ParallelLaunch(this->context_->thread_pool_, GatherRunFp16, this, op_parameter_->thread_num_);
  ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, GatherRunFp16, this,
                       op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Gather function error error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/instance_norm_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/instance_norm_fp16.cc
@@ -109,7 +109,8 @@ int InstanceNormFp16Run(void *cdata, int task_id) {
 int InstanceNormFp16CPUKernel::Run() {
  src_data_ = reinterpret_cast<float16_t *>(in_tensors_[0]->data_c());
  dst_data_ = reinterpret_cast<float16_t *>(out_tensors_[0]->data_c());
  auto ret = ParallelLaunch(this->context_->thread_pool_, InstanceNormFp16Run, this, op_parameter_->thread_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, InstanceNormFp16Run,
                            this, op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "InstanceNormFp16Run error error_code[" << ret << "]";
    return ret;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/log_softmax_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/log_softmax_fp16.cc
@@ -95,7 +95,8 @@ int LogSoftmaxLastAxisFp16Run(void *cdata, int task_id) {
 int LogSoftmaxFp16CPUKernel::Run() {
  if (in_plane_size_ == 1) {
    auto ret = ParallelLaunch(this->context_->thread_pool_, LogSoftmaxLastAxisFp16Run, this, context_->thread_num_);
    auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                              LogSoftmaxLastAxisFp16Run, this, context_->thread_num_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "LogSoftmaxFp16CPUKernel ParallelLaunch failed, ret: " << ret;
    }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.cc
@@ -286,7 +286,8 @@ int MatmulBaseFP16CPUKernel::Run() {
      batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_align_;
      batch_c_ptr_ = c_ptr + i * params_->row_ * params_->col_;
    }
    auto ret = ParallelLaunch(this->context_->thread_pool_, MatmulBaseFP16Run, this, thread_count_);
    auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, MatmulBaseFP16Run,
                              this, thread_count_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "MatmulBaseFloatRun failed";
      return ret;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.cc
@@ -89,7 +89,8 @@ int PadFp16CPUKernel::Run() {
        output_[i] = pad_param_->constant_value_;
      }
    }
    ret = ParallelLaunch(this->context_->thread_pool_, PadImpl, this, op_parameter_->thread_num_);
    ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, PadImpl, this,
                         op_parameter_->thread_num_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "BatchnormRun error error_code[" << ret << "]";
    }
@@ -101,7 +102,8 @@ int PadFp16CPUKernel::Run() {
      return ret;
    }
    ret = ParallelLaunch(this->context_->thread_pool_, MirrorPadImpl, this, context_->thread_num_);
    ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, MirrorPadImpl, this,
                         context_->thread_num_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Pad Reflect or Symmetric mode run error, error_code[" << ret << "]";
    }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc
@@ -90,7 +90,8 @@ int PoolingFp16CPUKernel::Run() {
  fp16_input_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
  fp16_output_ = reinterpret_cast<float16_t *>(output_tensor->data_c());
  int error_code = ParallelLaunch(this->context_->thread_pool_, PoolingFp16Impl, this, thread_count_);
  int error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                                  PoolingFp16Impl, this, thread_count_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "pooling error error_code[" << error_code << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/power_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/power_fp16.cc
@@ -87,7 +87,8 @@ int PowerFp16CPUKernel::Run() {
      return ret;
    }
  }
  auto ret = ParallelLaunch(this->context_->thread_pool_, PowerImplFp16, this, thread_count_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, PowerImplFp16, this,
                            thread_count_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "PowerFp16CPUKernel error: " << ret;
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/quant_dtype_cast_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/quant_dtype_cast_fp16.cc
@@ -164,7 +164,8 @@ int QuantDTypeCastFp16CPUKernel::Run() {
    return RET_ERROR;
  }
  auto ret = ParallelLaunch(this->context_->thread_pool_, QuantDTypeCastFP16Run, this, thread_n_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                            QuantDTypeCastFP16Run, this, thread_n_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Scale error error_code[" << ret << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.cc
@@ -93,7 +93,8 @@ int ReduceFp16CPUKernel::Run() {
    outer_size_ = outer_sizes_.at(i);
    inner_size_ = inner_sizes_.at(i);
    axis_size_ = axis_sizes_.at(i);
    auto error_code = ParallelLaunch(this->context_->thread_pool_, ReduceFp16Impl, this, context_->thread_num_);
    auto error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                                     ReduceFp16Impl, this, context_->thread_num_);
    if (error_code != RET_OK) {
      FreeTmpBuffer();
      MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]";
@@ -108,7 +109,8 @@ int ReduceFp16CPUKernel::Run() {
  outer_size_ = outer_sizes_.back();
  inner_size_ = inner_sizes_.back();
  axis_size_ = axis_sizes_.back();
  auto error_code = ParallelLaunch(this->context_->thread_pool_, ReduceFp16Impl, this, context_->thread_num_);
  auto error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                                   ReduceFp16Impl, this, context_->thread_num_);
  if (error_code != RET_OK) {
    FreeTmpBuffer();
    MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]";
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.cc
@@ -115,7 +115,8 @@ int ScaleFp16CPUKernel::Run() {
    return ret;
  }
  ret = ParallelLaunch(this->context_->thread_pool_, ScaleFp16Run, this, op_parameter_->thread_num_);
  ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ScaleFp16Run, this,
                       op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Scale error error_code[" << ret << "]";
    FreeTmpBuffer();
@@ -127,12 +128,12 @@ int ScaleFp16CPUKernel::Run() {
 }
 int ScaleFp16CPUKernel::MallocAssignTmpBuffer() {
  scale_ = ConvertInputFp32toFp16(in_tensors_.at(1), context_);
  scale_ = ConvertInputFp32toFp16(in_tensors_.at(1), static_cast<const lite::InnerContext *>(this->context_));
  if (scale_ == nullptr) {
    return RET_ERROR;
  }
  if (in_tensors_.size() == 3) {
    offset_ = ConvertInputFp32toFp16(in_tensors_.at(2), context_);
    offset_ = ConvertInputFp32toFp16(in_tensors_.at(2), static_cast<const lite::InnerContext *>(this->context_));
    if (offset_ == nullptr) {
      return RET_ERROR;
    }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/slice_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/slice_fp16.cc
@@ -63,7 +63,8 @@ int SliceFp16CPUKernel::Run() {
    DoSliceNoParallel(input_data, out_tensors_.at(0)->data_c(), param_, lite::DataTypeSize(kNumberTypeFloat16));
    return RET_OK;
  }
  auto ret = ParallelLaunch(this->context_->thread_pool_, SliceFp16Launch, this, op_parameter_->thread_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, SliceFp16Launch,
                            this, op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "fp16 slice launch fail!ret: " << ret;
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/softmax_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/softmax_fp16.cc
@@ -95,7 +95,8 @@ int SoftmaxLastAxisFp16Run(void *cdata, int task_id) {
 int SoftmaxFp16CPUKernel::Run() {
  if (in_plane_size_ == 1) {
    auto ret = ParallelLaunch(this->context_->thread_pool_, SoftmaxLastAxisFp16Run, this, context_->thread_num_);
    auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                              SoftmaxLastAxisFp16Run, this, context_->thread_num_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "SoftmaxFp16CPUKernel ParallelLaunch failed, ret: " << ret;
    }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.cc
@@ -40,14 +40,15 @@ void StackFp16CPUKernel::InitMallocFlags() {
 int StackFp16CPUKernel::MallocAssignBuffer() {
  buffers_.resize(in_tensors_.size(), nullptr);
  for (size_t i = 0; i < in_tensors_.size(); ++i) {
    buffers_.at(i) = reinterpret_cast<char *>(ConvertInputFp32toFp16(in_tensors_.at(i), context_));
    buffers_.at(i) = reinterpret_cast<char *>(
      ConvertInputFp32toFp16(in_tensors_.at(i), static_cast<const lite::InnerContext *>(context_)));
    if (buffers_.at(i) == nullptr) {
      return RET_ERROR;
    }
  }
  out_buffer_ = nullptr;
  out_buffer_ = MallocOutputFp16(out_tensors_.at(0), context_);
  out_buffer_ = MallocOutputFp16(out_tensors_.at(0), static_cast<const lite::InnerContext *>(this->context_));
  if (out_buffer_ == nullptr) {
    return RET_ERROR;
  }
@@ -100,7 +101,8 @@ int StackFp16CPUKernel::Run() {
  }
  // run stack
  num_threads_ = MSMIN(UP_DIV(outer_size_, 64), this->context_->thread_num_);
  ret = ParallelLaunch(this->context_->thread_pool_, StackRun, this, num_threads_);
  ret =
    ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, StackRun, this, num_threads_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "StackBaseCPUKernel Run error: error_code[" << ret << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/activation_fp16_grad.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/activation_fp16_grad.cc
@@ -79,7 +79,8 @@ int ActivationGradRunFp16(void *cdata, int task_id) {
 }
 int ActivationGradCPUKernelFp16::Run() {
  int error_code = ParallelLaunch(this->context_->thread_pool_, ActivationGradRunFp16, this, thread_count_);
  int error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                                  ActivationGradRunFp16, this, thread_count_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "Activation Grad function error error_code[" << error_code << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/arithmetic_fp16_self_grad.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/arithmetic_fp16_self_grad.cc
@@ -73,7 +73,8 @@ int ArithmeticSelfGradFp16Run(void *cdata, int task_id) {
 }
 int ArithmeticSelfGradFp16CPUKernel::Run() {
  int error_code = ParallelLaunch(this->context_->thread_pool_, ArithmeticSelfGradFp16Run, this, thread_count_);
  int error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                                  ArithmeticSelfGradFp16Run, this, thread_count_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "Activation Grad function error error_code[" << error_code << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/activation_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/activation_fp32.cc
@@ -101,7 +101,8 @@ int ActivationRun(void *cdata, int task_id) {
 }
 int ActivationCPUKernel::Run() {
  int error_code = ParallelLaunch(this->context_->thread_pool_, ActivationRun, this, thread_count_);
  int error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ActivationRun,
                                  this, thread_count_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "Activation function error error_code[" << error_code << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/adder_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/adder_fp32.cc
@@ -122,7 +122,8 @@ int AdderCPUKernel::Run() {
    return RET_ERROR;
  }
  int error_code = ParallelLaunch(this->context_->thread_pool_, AdderImpl, this, thread_count_);
  int error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, AdderImpl,
                                  this, thread_count_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "adder error error_code[" << error_code << "]";
    FreeTmpBuffer();
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/addn_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/addn_fp32.cc
@@ -89,7 +89,8 @@ int AddNCPUKernel::Run() {
  in1_addr_ = input0_data;
  in2_addr_ = input1_data;
  out_addr_ = output_data;
  auto ret = ParallelLaunch(this->context_->thread_pool_, AddNLaunch, this, op_parameter_->thread_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, AddNLaunch, this,
                            op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "addn launch fail!ret: " << ret;
    return RET_ERROR;
@@ -97,7 +98,8 @@ int AddNCPUKernel::Run() {
  for (size_t i = 2; i < in_tensors_.size(); ++i) {
    in1_addr_ = reinterpret_cast<float *>(in_tensors_[i]->MutableData());
    in2_addr_ = output_data;
    ret = ParallelLaunch(this->context_->thread_pool_, AddNLaunch, this, op_parameter_->thread_num_);
    ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, AddNLaunch, this,
                         op_parameter_->thread_num_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "addn launch fail!ret: " << ret << ", input index: " << i;
      return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_fp32.cc
@@ -418,7 +418,8 @@ int ArithmeticCPUKernel::Run() {
    input1_ptr_ = in_tensors_[1]->data_c();
  }
  output_ptr_ = out_tensors_[0]->data_c();
  return ParallelLaunch(this->context_->thread_pool_, ArithmeticsRun, this, context_->thread_num_);
  return ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ArithmeticsRun, this,
                        context_->thread_num_);
 }
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_MulFusion, LiteKernelCreator<ArithmeticCPUKernel>)
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self_fp32.cc
@@ -113,7 +113,8 @@ int ArithmeticSelfRun(void *cdata, int task_id) {
 }
 int ArithmeticSelfCPUKernel::Run() {
  auto ret = ParallelLaunch(this->context_->thread_pool_, ArithmeticSelfRun, this, op_parameter_->thread_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ArithmeticSelfRun,
                            this, op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ArithmeticSelfRun error error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm_fp32.cc
@@ -75,7 +75,8 @@ int BatchnormCPUKernel::InitConstTensor() {
 }
 int BatchnormCPUKernel::Run() {
  auto ret = ParallelLaunch(this->context_->thread_pool_, BatchNormRun, this, op_parameter_->thread_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, BatchNormRun, this,
                            op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "BatchnormRun error error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/cast_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/cast_fp32.cc
@@ -141,7 +141,8 @@ int CastCPUKernel::Run() {
  if (data_num_ == 0) {
    return RET_OK;
  }
  return ParallelLaunch(this->context_->thread_pool_, CastRun, this, op_parameter_->thread_num_);
  return ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, CastRun, this,
                        op_parameter_->thread_num_);
 }
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Cast, LiteKernelCreator<CastCPUKernel>)
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/concat_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/concat_fp32.cc
@@ -69,7 +69,8 @@ int ConcatRun(void *cdata, int task_id) {
 }
 // Runs the concat kernel: hands ConcatRun and this kernel object to ParallelLaunch on the
 // context's thread pool and returns ParallelLaunch's status code unchanged.
 // op_parameter_->thread_num_ is passed as the last argument (presumably the task count; confirm
 // against ParallelLaunch's declaration).
 // NOTE(review): this listing is a diff hunk shown without +/- markers, so the launch statement
 // appears twice below: first in its pre-change form, then in its post-change form.
 int ConcatCPUKernel::Run() {
  // Pre-change form: reads thread_pool_ directly through context_.
  int error_code = ParallelLaunch(this->context_->thread_pool_, ConcatRun, this, op_parameter_->thread_num_);
  // Post-change form: context_ is downcast to lite::InnerContext before thread_pool_ is read.
  // NOTE(review): the static_cast assumes the object behind context_ really is an InnerContext; verify at the
  // construction sites.
  int error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ConcatRun,
                                  this, op_parameter_->thread_num_);
  return error_code;
 }
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc
@@ -247,10 +247,12 @@ int Convolution1x1CPUKernel::Run() {
    }
    if (multi_thread_by_hw_) {
      ParallelLaunch(this->context_->thread_pool_, Convolution1x1RunHw, this, thread_count_);
      ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, Convolution1x1RunHw, this,
                     thread_count_);
    } else {
      PackMatmulInput(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
      ParallelLaunch(this->context_->thread_pool_, Convolution1x1Run, this, thread_count_);
      ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, Convolution1x1Run, this,
                     thread_count_);
    }
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.cc
@@ -138,16 +138,19 @@ kernel::LiteKernel *ConvolutionDelegateCPUKernel::CpuConvFp32KernelSelect() {
  kernel::LiteKernel *kernel = nullptr;
  auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter_);
  if (conv_param->kernel_h_ == 1 && conv_param->kernel_w_ == 1) {
    kernel = new (std::nothrow)
      kernel::Convolution1x1CPUKernel(op_parameter_, in_tensors_, out_tensors_, context_, origin_weight_, origin_bias_);
    kernel = new (std::nothrow) kernel::Convolution1x1CPUKernel(op_parameter_, in_tensors_, out_tensors_,
                                                                static_cast<const lite::InnerContext *>(this->context_),
                                                                origin_weight_, origin_bias_);
  } else {
    int out_unit;
    if (CheckIfUseWinograd(&out_unit, conv_param)) {
      kernel = new (std::nothrow) kernel::ConvolutionWinogradCPUKernel(
        op_parameter_, in_tensors_, out_tensors_, context_, out_unit, origin_weight_, origin_bias_);
        op_parameter_, in_tensors_, out_tensors_, static_cast<const lite::InnerContext *>(this->context_), out_unit,
        origin_weight_, origin_bias_);
    } else {
      kernel = new (std::nothrow)
        kernel::ConvolutionCPUKernel(op_parameter_, in_tensors_, out_tensors_, context_, origin_weight_, origin_bias_);
      kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(op_parameter_, in_tensors_, out_tensors_,
                                                               static_cast<const lite::InnerContext *>(this->context_),
                                                               origin_weight_, origin_bias_);
    }
  }
@@ -214,7 +217,7 @@ kernel::LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor
 /* creator func */
 kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
                                             const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
                                             const InnerContext *ctx, const kernel::KernelKey &desc) {
                                             const lite::Context *ctx, const kernel::KernelKey &desc) {
  MS_ASSERT(op_parameter != nullptr);
  MS_ASSERT(desc.type == schema::PrimitiveType_Conv2DFusion);
  MS_ASSERT(desc.data_type == kNumberTypeFloat32);
@@ -222,11 +225,12 @@ kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::Tensor *> &
  auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
  kernel::LiteKernel *kernel = nullptr;
  if (conv_param->group_ == 1) {
    kernel = new (std::nothrow) kernel::ConvolutionDelegateCPUKernel(op_parameter, inputs, outputs, ctx);
    kernel = new (std::nothrow)
      kernel::ConvolutionDelegateCPUKernel(op_parameter, inputs, outputs, static_cast<const lite::InnerContext *>(ctx));
  } else if (conv_param->group_ == conv_param->input_channel_ && conv_param->group_ == conv_param->output_channel_) {
    kernel = CpuConvDwFp32KernelCreator(inputs, outputs, op_parameter, ctx);
    kernel = CpuConvDwFp32KernelCreator(inputs, outputs, op_parameter, static_cast<const lite::InnerContext *>(ctx));
  } else {
    kernel = CpuGroupConvFp32KernelCreator(inputs, outputs, op_parameter, ctx);
    kernel = CpuGroupConvFp32KernelCreator(inputs, outputs, op_parameter, static_cast<const lite::InnerContext *>(ctx));
  }
  if (kernel == nullptr) {
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3_fp32.cc
@@ -126,7 +126,8 @@ int ConvolutionDepthwise3x3CPUKernel::Run() {
  auto output_tensor = out_tensors_.at(kOutputIndex);
  output_ptr_ = reinterpret_cast<float *>(output_tensor->data_c());
  auto ret = ParallelLaunch(this->context_->thread_pool_, ConvDw3x3Run, this, conv_param_->thread_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ConvDw3x3Run, this,
                            conv_param_->thread_num_);
  ctx_->allocator->Free(buffer_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvDw3x3Run error: error_code[" << ret << "]";
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32.cc
@@ -107,7 +107,8 @@ int ConvolutionDepthwiseCPUKernel::Run() {
  auto output_tensor = out_tensors_.at(kOutputIndex);
  output_ptr_ = reinterpret_cast<float *>(output_tensor->MutableData());
  auto ret = ParallelLaunch(this->context_->thread_pool_, ConvDwRun, this, conv_param_->thread_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ConvDwRun, this,
                            conv_param_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvDwRun error: error_code[" << ret << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_indirect_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_indirect_fp32.cc
@@ -194,7 +194,8 @@ int ConvolutionDepthwiseIndirectCPUKernel::Run() {
  ConvDwInitIndirection(indirect_buffer_, packed_input_, zero_ptr_, conv_param_, step_h, step_w);
  auto ret = ParallelLaunch(this->context_->thread_pool_, ConvDwIndirectRun, this, conv_param_->thread_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ConvDwIndirectRun,
                            this, conv_param_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvDwIndirectRun error: error_code[" << ret << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow_fp32.cc
@@ -163,7 +163,8 @@ int ConvolutionDepthwiseSWCPUKernel::Run() {
    packed_output_ = output_ptr;
  }
  ret = ParallelLaunch(this->context_->thread_pool_, ConvDwSWRun, this, conv_param_->thread_num_);
  ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ConvDwSWRun, this,
                       conv_param_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvDwSWRun error: error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.cc
@@ -152,7 +152,8 @@ int ConvolutionCPUKernel::Run() {
    PackWeight();
  }
  ret = ParallelLaunch(this->context_->thread_pool_, ConvolutionImpl, this, thread_count_);
  ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ConvolutionImpl, this,
                       thread_count_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "conv error error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd_fp32.cc
@@ -219,7 +219,8 @@ int ConvolutionWinogradCPUKernel::Run() {
    InitWeightBias();
  }
  ret = ParallelLaunch(this->context_->thread_pool_, ConvolutionWinogradImpl, this, thread_count_);
  ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ConvolutionWinogradImpl,
                       this, thread_count_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "conv winograd error error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/crop_and_resize_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/crop_and_resize_fp32.cc
@@ -151,7 +151,8 @@ int CropAndResizeCPUKernel::Run() {
    return ret;
  }
  int error_code = ParallelLaunch(this->context_->thread_pool_, CropAndResizeImpl, this, context_->thread_num_);
  int error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                                  CropAndResizeImpl, this, context_->thread_num_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "CropAndResize run error, error_code[" << error_code << "]";
    FreeTmpBuffer();
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/crop_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/crop_fp32.cc
@@ -62,7 +62,8 @@ int CropCPUKernel::Run() {
    return RET_OK;
  }
  auto ret = ParallelLaunch(this->context_->thread_pool_, CropLaunch, this, crop_para_->thread_count_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, CropLaunch, this,
                            crop_para_->thread_count_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Crop launch fail!ret: " << ret;
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise_fp32.cc
@@ -168,7 +168,8 @@ int DeconvolutionDepthwiseCPUKernel::Run() {
    packed_output_ = output_addr;
  }
  ret = ParallelLaunch(this->context_->thread_pool_, DeconvDwRun, this, conv_param_->thread_num_);
  ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, DeconvDwRun, this,
                       conv_param_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "DeconvDwRun error: error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_fp32.cc
@@ -222,7 +222,8 @@ int DeConvolutionCPUKernel::Run() {
    RowMajor2Col12Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
 #endif
    error_code = ParallelLaunch(this->context_->thread_pool_, DeConvFp32Run, this, thread_count_);
    error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, DeConvFp32Run,
                                this, thread_count_);
    if (error_code != RET_OK) {
      MS_LOG(ERROR) << "deconv fp32 run error! error_code[" << error_code << "]";
      FreeRunBuf();
@@ -236,7 +237,7 @@ int DeConvolutionCPUKernel::Run() {
 kernel::LiteKernel *CpuDeConvFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
                                               const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
                                               const lite::InnerContext *ctx, const kernel::KernelKey &desc) {
                                               const lite::Context *ctx, const kernel::KernelKey &desc) {
  MS_ASSERT(op_parameter != nullptr);
  MS_ASSERT(desc.type == schema::PrimitiveType_Conv2dTransposeFusion);
@@ -245,12 +246,15 @@ kernel::LiteKernel *CpuDeConvFp32KernelCreator(const std::vector<lite::Tensor *>
  if (conv_param->group_ == 1) {
    if ((conv_param->stride_h_ != 1 || conv_param->stride_w_ != 1) &&
        (conv_param->dilation_w_ == 1 && conv_param->dilation_h_ == 1)) {
      kernel = new (std::nothrow) kernel::DeConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx);
      kernel = new (std::nothrow) kernel::DeConvolutionWinogradCPUKernel(op_parameter, inputs, outputs,
                                                                         static_cast<const lite::InnerContext *>(ctx));
    } else {
      kernel = new (std::nothrow) kernel::DeConvolutionCPUKernel(op_parameter, inputs, outputs, ctx);
      kernel = new (std::nothrow)
        kernel::DeConvolutionCPUKernel(op_parameter, inputs, outputs, static_cast<const lite::InnerContext *>(ctx));
    }
  } else if (conv_param->group_ == conv_param->input_channel_ && conv_param->group_ == conv_param->output_channel_) {
    kernel = new (std::nothrow) kernel::DeconvolutionDepthwiseCPUKernel(op_parameter, inputs, outputs, ctx);
    kernel = new (std::nothrow) kernel::DeconvolutionDepthwiseCPUKernel(op_parameter, inputs, outputs,
                                                                        static_cast<const lite::InnerContext *>(ctx));
  } else {
    MS_LOG(ERROR) << "deconv do not support group deconv!";
    kernel = nullptr;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_winograd_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_winograd_fp32.cc
@@ -411,10 +411,12 @@ int DeConvolutionWinogradCPUKernel::Run() {
    nhwc_output_ = src_out + batch_index * deconv_param_->output_plane_ * conv_param_->output_channel_;
    ::memset(nc4hw4_output_, 0, deconv_param_->output_plane_ * deconv_param_->oc_div4_ * C4NUM * sizeof(float));
    ParallelLaunch(this->context_->thread_pool_, DeConvWgFp32Run, this, deconv_param_->thread_num_);
    ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, DeConvWgFp32Run, this,
                   deconv_param_->thread_num_);
    /*post bias activate and nhwc */
    ParallelLaunch(this->context_->thread_pool_, DeConvWgPostFp32Run, this, thread_num_hw_);
    ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, DeConvWgPostFp32Run, this,
                   thread_num_hw_);
  }
  FreeRunBuf();
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/elu_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/elu_fp32.cc
@@ -55,7 +55,8 @@ int EluRun(void *cdata, int task_id) {
 }
 int EluCPUKernel::Run() {
  auto ret = ParallelLaunch(this->context_->thread_pool_, EluRun, this, op_parameter_->thread_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, EluRun, this,
                            op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Elu error: error_code[" << ret << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/embedding_lookup_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/embedding_lookup_fp32.cc
@@ -87,7 +87,8 @@ int EmbeddingLookupCPUKernel::Run() {
    memcpy(input_addr_ + dest_loc, input_t, sizeof(float) * in_tensors_.at(i)->ElementsNum());
    dest_loc += in_tensors_.at(i)->ElementsNum();
  }
  auto ret = ParallelLaunch(this->context_->thread_pool_, EmbeddingLookupRun, this, op_parameter_->thread_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, EmbeddingLookupRun,
                            this, op_parameter_->thread_num_);
  FreeRunBuff();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "EmbeddingLookup error: error_code[" << ret << "]";
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/exp_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/exp_fp32.cc
@@ -73,7 +73,8 @@ int ExpCPUKernel::Run() {
  output_addr_ = reinterpret_cast<float *>(out_tensors_.front()->MutableData());
  exp_parameter_->element_num_ = in_tensors_.front()->ElementsNum();
  auto ret = ParallelLaunch(this->context_->thread_pool_, ExpRun, this, exp_parameter_->thread_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ExpRun, this,
                            exp_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Exp error: error_code[" << ret << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/fill_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/fill_fp32.cc
@@ -91,7 +91,8 @@ int FillCPUKernel::Run() {
    MS_LOG(ERROR) << "unsupported fill data type " << fill_input->data_type();
    return RET_ERROR;
  }
  auto ret = ParallelLaunch(this->context_->thread_pool_, FillRun, this, thread_sz_count_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, FillRun, this,
                            thread_sz_count_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "FillRun error error_code[" << ret << "]";
    return ret;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/fused_batchnorm_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/fused_batchnorm_fp32.cc
@@ -93,7 +93,8 @@ int FusedBatchnormCPUKernel::Run() {
    trained_ = true;  // trained at least once
  }
  auto ret = ParallelLaunch(this->context_->thread_pool_, BatchNormRun, this, op_parameter_->thread_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, BatchNormRun, this,
                            op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "BatchnormRun error error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/gatherNd_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/gatherNd_fp32.cc
@@ -128,7 +128,8 @@ int GatherNdCPUKernel::Run() {
  in_ptr_ = reinterpret_cast<float *>(in_tensors_.front()->MutableData());
  out_ptr_ = reinterpret_cast<float *>(out_tensors_.front()->MutableData());
  InitOffset();
  auto ret = ParallelLaunch(this->context_->thread_pool_, GatherNdRun, this, thread_sz_count_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, GatherNdRun, this,
                            thread_sz_count_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "gatherNd error error_code[" << ret << "]";
    return ret;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/gather_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/gather_fp32.cc
@@ -92,7 +92,8 @@ int GatherCPUKernel::Run() {
    return ret;
  }
  ret = ParallelLaunch(this->context_->thread_pool_, GatherRun, this, op_parameter_->thread_num_);
  ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, GatherRun, this,
                       op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Gather function error error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/instance_norm_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/instance_norm_fp32.cc
@@ -66,7 +66,8 @@ int InstanceNormCPUKernel::Run() {
  gamma_data_ = reinterpret_cast<float *>(in_tensors_.at(1)->data_c());
  beta_data_ = reinterpret_cast<float *>(in_tensors_.at(2)->data_c());
  dst_data_ = reinterpret_cast<float *>(out_tensors_.at(0)->data_c());
  auto ret = ParallelLaunch(this->context_->thread_pool_, InstanceNormRun, this, op_parameter_->thread_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, InstanceNormRun,
                            this, op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "InstanceNormRun error error_code[" << ret << "]";
    return ret;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/l2_norm_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/l2_norm_fp32.cc
@@ -146,7 +146,8 @@ int L2NormCPUKernel::Run() {
  output_ptr_ = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->MutableData());
  if (l2_norm_param_->axis_num_ == 0 || l2_norm_param_->axis_num_ == input_shape.size()) {
    // all axis
    auto ret = ParallelLaunch(this->context_->thread_pool_, SquareSumRun, this, context_->thread_num_);
    auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, SquareSumRun, this,
                              context_->thread_num_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "L2Norm error: error_code[" << ret << "]";
      return RET_ERROR;
@@ -156,13 +157,15 @@ int L2NormCPUKernel::Run() {
      sum += tmp_sum_[i];
    }
    sqrt_sum_ = sqrt(sum > l2_norm_param_->epsilon_ ? sum : l2_norm_param_->epsilon_);
    ret = ParallelLaunch(this->context_->thread_pool_, L2NormRun, this, context_->thread_num_);
    ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, L2NormRun, this,
                         context_->thread_num_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "L2Norm error: error_code[" << ret << "]";
      return RET_ERROR;
    }
  } else if (l2_norm_param_->axis_num_ == 1 && l2_norm_param_->axis_[0] == static_cast<int>(input_shape.size()) - 1) {
    auto ret = ParallelLaunch(this->context_->thread_pool_, L2NormTrailingAxisRun, this, context_->thread_num_);
    auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                              L2NormTrailingAxisRun, this, context_->thread_num_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "L2Norm error: error_code[" << ret << "]";
      return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/layer_norm_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/layer_norm_fp32.cc
@@ -92,7 +92,8 @@ int LayerNormCPUKernel::Run() {
    mean_data_ = reinterpret_cast<float *>(context_->allocator->Malloc(param_->norm_outer_size_ * sizeof(float)));
    var_data_ = reinterpret_cast<float *>(context_->allocator->Malloc(param_->norm_outer_size_ * sizeof(float)));
  }
  ret = ParallelLaunch(this->context_->thread_pool_, LayerNormRun, this, op_parameter_->thread_num_);
  ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, LayerNormRun, this,
                       op_parameter_->thread_num_);
  if (out_tensors_.size() != 3) {
    context_->allocator->Free(mean_data_);
    context_->allocator->Free(var_data_);
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/local_response_norm_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/local_response_norm_fp32.cc
@@ -74,7 +74,8 @@ int LocalResponseNormRun(void *cdata, int task_id) {
 }
 int LocalResponseNormCPUKernel::Run() {
  int error_code = ParallelLaunch(this->context_->thread_pool_, LocalResponseNormRun, this, thread_count_);
  int error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                                  LocalResponseNormRun, this, thread_count_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "LocalResponseNorm function error error_code[" << error_code << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/log_softmax_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/log_softmax_fp32.cc
@@ -96,7 +96,8 @@ int LogSoftmaxLastAxisRun(void *cdata, int task_id) {
 int LogSoftmaxCPUKernel::Run() {
  int ret = RET_OK;
  if (in_plane_size_ == 1) {
    ret = ParallelLaunch(this->context_->thread_pool_, LogSoftmaxLastAxisRun, this, context_->thread_num_);
    ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, LogSoftmaxLastAxisRun,
                         this, context_->thread_num_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "LogSoftmaxCPUKernel ParallelLaunch failed, ret: " << ret;
    }
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/lsh_projection_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/lsh_projection_fp32.cc
@@ -61,7 +61,8 @@ int LshProjectionCPUKernel::Run() {
  if (ret != RET_OK) {
    return ret;
  }
  ret = ParallelLaunch(this->context_->thread_pool_, LshProjectionRun, this, op_parameter_->thread_num_);
  ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, LshProjectionRun, this,
                       op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "LshProjection kernel parallel launch failed";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc
@@ -332,7 +332,8 @@ int MatmulFp32BaseCPUKernel::Run() {
      batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_align_;
      batch_c_ptr_ = c_ptr + i * params_->row_ * params_->col_;
    }
    auto ret = ParallelLaunch(this->context_->thread_pool_, MatmulBaseFloatRun, this, thread_count_);
    auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, MatmulBaseFloatRun,
                              this, thread_count_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "MatmulBaseFloatRun failed";
      return ret;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/one_hot_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/one_hot_fp32.cc
@@ -181,7 +181,8 @@ int OneHotCPUKernel::GetParams() {
 }
 int OneHotCPUKernel::Run() {
  int error_code = ParallelLaunch(this->context_->thread_pool_, RunOneHot, this, context_->thread_num_);
  int error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, RunOneHot,
                                  this, context_->thread_num_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "OneHot function error error_code[" << error_code << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/pad_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/pad_fp32.cc
@@ -395,7 +395,8 @@ int PadCPUKernel::Run() {
        output_data[i] = pad_param_->constant_value_;
      }
    }
    error_code = ParallelLaunch(this->context_->thread_pool_, PadImpl, this, context_->thread_num_);
    error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, PadImpl, this,
                                context_->thread_num_);
    if (error_code != RET_OK) {
      MS_LOG(ERROR) << "Pad run error, error_code[" << error_code << "]";
      return RET_ERROR;
@@ -408,7 +409,8 @@ int PadCPUKernel::Run() {
      return error_code;
    }
    error_code = ParallelLaunch(this->context_->thread_pool_, MirrorPadImpl, this, context_->thread_num_);
    error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, MirrorPadImpl,
                                this, context_->thread_num_);
    if (error_code != RET_OK) {
      MS_LOG(ERROR) << "Pad Reflect or Symmetric mode run error, error_code[" << error_code << "]";
      return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/pooling_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/pooling_fp32.cc
@@ -85,7 +85,8 @@ int PoolingImpl(void *cdata, int task_id) {
 }
 int PoolingCPUKernel::Run() {
  int error_code = ParallelLaunch(this->context_->thread_pool_, PoolingImpl, this, thread_count_);
  int error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, PoolingImpl,
                                  this, thread_count_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "pooling error error_code[" << error_code << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/power_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/power_fp32.cc
@@ -41,7 +41,8 @@ int PowerImpl(void *cdata, int task_id) {
 }
 int PowerCPUKernel::Run() {
  auto ret = ParallelLaunch(this->context_->thread_pool_, PowerImpl, this, thread_count_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, PowerImpl, this,
                            thread_count_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "PowerCPUKernel error: " << ret;
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/prelu_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/prelu_fp32.cc
@@ -93,7 +93,8 @@ int PReluCPUKernel::Run() {
  auto negative_slope_tensor = in_tensors_.at(1);
  prelu_param_->slope_ = reinterpret_cast<float *>(negative_slope_tensor->data_c());
  auto ret = ParallelLaunch(this->context_->thread_pool_, PReluRun, this, prelu_param_->op_parameter_.thread_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, PReluRun, this,
                            prelu_param_->op_parameter_.thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "PRelu Run error: error_code[" << ret << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/reduce_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/reduce_fp32.cc
@@ -117,7 +117,8 @@ int ReduceCPUKernel::Run() {
    outer_size_ = outer_sizes_.at(i);
    inner_size_ = inner_sizes_.at(i);
    axis_size_ = axis_sizes_.at(i);
    auto error_code = ParallelLaunch(this->context_->thread_pool_, ReduceImpl, this, context_->thread_num_);
    auto error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ReduceImpl,
                                     this, context_->thread_num_);
    if (error_code != RET_OK) {
      MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]";
      FreeTmpBuffer();
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/resize_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/resize_fp32.cc
@@ -205,7 +205,8 @@ int ResizeCPUKernel::RunImpl(int task_id) {
 }
 int ResizeCPUKernel::Run() {
  int error_code = ParallelLaunch(this->context_->thread_pool_, ResizeImpl, this, context_->thread_num_);
  int error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ResizeImpl,
                                  this, context_->thread_num_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "Resize run error, error_code[" << error_code << "]";
    FreeTmpBuffer();
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/reverse_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/reverse_fp32.cc
@@ -129,7 +129,8 @@ int ReverseCPUKernel::DoReverse(int task_id) {
 int ReverseCPUKernel::Run() {
  in_ptr_ = reinterpret_cast<float *>(in_tensors_[0]->MutableData());
  out_ptr_ = reinterpret_cast<float *>(out_tensors_[0]->MutableData());
  auto ret = ParallelLaunch(this->context_->thread_pool_, ReverseRun, this, thread_sz_count_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ReverseRun, this,
                            thread_sz_count_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Reverse run error error_code[" << ret << "]";
    return ret;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/roi_pooling_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/roi_pooling_fp32.cc
@@ -101,7 +101,8 @@ int ROIPoolingCPUKernel::Run() {
  in_ptr_ = reinterpret_cast<float *>(in_tensors_.front()->MutableData());
  out_ptr_ = reinterpret_cast<float *>(out_tensors_.front()->MutableData());
  roi_ptr_ = reinterpret_cast<float *>(in_tensors_.at(1)->MutableData());
  auto ret = ParallelLaunch(this->context_->thread_pool_, ROIPoolingRun, this, param_->thread_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ROIPoolingRun, this,
                            param_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ROIPooling error: error_code[" << ret << "]";
    return ret;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/scale_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/scale_fp32.cc
@@ -188,7 +188,8 @@ int ScaleCPUKernel::Run() {
  auto out_tensor = out_tensors_.front();
  output_ptr_ = reinterpret_cast<float *>(out_tensor->MutableData());
  auto ret = ParallelLaunch(this->context_->thread_pool_, ScaleRun, this, op_parameter_->thread_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ScaleRun, this,
                            op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Scale error error_code[" << ret << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/scatter_nd_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/scatter_nd_fp32.cc
@@ -149,7 +149,8 @@ int ScatterNDRun(void *cdata, int task_id) {
 }
 int ScatterNDCPUKernel::Run() {
  auto ret = ParallelLaunch(this->context_->thread_pool_, ScatterNDRun, this, thread_n_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ScatterNDRun, this,
                            thread_n_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ScatterND error error_code[" << ret << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/softmax_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/softmax_fp32.cc
@@ -96,7 +96,8 @@ int SoftmaxLastAxisRun(void *cdata, int task_id) {
 int SoftmaxCPUKernel::Run() {
  int ret = RET_OK;
  if (in_plane_size_ == 1) {
    ret = ParallelLaunch(this->context_->thread_pool_, SoftmaxLastAxisRun, this, context_->thread_num_);
    ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, SoftmaxLastAxisRun,
                         this, context_->thread_num_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "SoftmaxCPUKernel ParallelLaunch failed, ret: " << ret;
    }
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/space_to_batch_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/space_to_batch_fp32.cc
@@ -102,7 +102,8 @@ int SpaceToBatchCPUKernel::Run() {
    }
  }
  ParallelLaunch(this->context_->thread_pool_, SpaceToBatchFp32Run, this, op_parameter_->thread_num_);
  ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, SpaceToBatchFp32Run, this,
                 op_parameter_->thread_num_);
  return RET_OK;
 }
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/space_to_depth_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/space_to_depth_fp32.cc
@@ -93,7 +93,8 @@ int SpaceToDepthCPUKernel::Run() {
  input_ptr_ = reinterpret_cast<float *>(in_tensors_.at(0)->data_c());
  output_ptr_ = reinterpret_cast<float *>(out_tensors_.at(0)->data_c());
  if (in_tensors_.at(0)->format() == schema::Format::Format_NHWC) {
    auto ret = ParallelLaunch(this->context_->thread_pool_, SpaceToDepthRun, this, thread_h_num_);
    auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, SpaceToDepthRun,
                              this, thread_h_num_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "SpaceToDepth error error_code[" << ret << "]";
      return ret;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/sparse_to_dense_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/sparse_to_dense_fp32.cc
@@ -175,7 +175,8 @@ int SparseToDenseCPUKernel::Run() {
  }
  output_data = reinterpret_cast<float *>(out_tensors_.at(0)->MutableData());
  count_unit_ = thread_count_ > 1 ? UP_DIV(index_num, thread_count_) : index_num;
  ret = ParallelLaunch(this->context_->thread_pool_, SparseToDenseRun, this, s2d_param->thread_num_);
  ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, SparseToDenseRun, this,
                       s2d_param->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "SparseToDenseRun error: error_code[" << ret << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.cc
@@ -159,7 +159,8 @@ int TransposeCPUKernel::Run() {
  thread_count_ = op_parameter_->thread_num_;
  GetNHNCTransposeFunc(in_tensor, out_tensor, param_);
  if (NHNCTransposeFunc_ != nullptr) {
    auto ret = ParallelLaunch(this->context_->thread_pool_, TransposeImpl, this, thread_count_);
    auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, TransposeImpl,
                              this, thread_count_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "NHNCTransposeFunc_ is error!";
    }
@@ -187,7 +188,8 @@ int TransposeCPUKernel::Run() {
  }
  int ret;
  if (dims_ > MAX_TRANSPOSE_DIM_SIZE) {
    ret = ParallelLaunch(this->context_->thread_pool_, TransposeImpl, this, thread_count_);
    ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, TransposeImpl, this,
                         thread_count_);
  } else {
    ret = DoTransposeFp32(in_data_, out_data_, out_shape_, param_);
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/where_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/where_fp32.cc
@@ -133,7 +133,8 @@ int WhereCPUKernel::RunWithTripleInputs() {
    MS_LOG(ERROR) << "Error, inputs' length are zero !!!";
    return RET_ERROR;
  }
  auto ret = ParallelLaunch(this->context_->thread_pool_, WhereRun, this, where_param_->thread_num_);
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, WhereRun, this,
                            where_param_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "WhereDwRun error: error_code[" << ret << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32_grad/activation_grad.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32_grad/activation_grad.cc
@@ -98,7 +98,8 @@ int ActivationGradRun(void *cdata, int task_id) {
 }
 int ActivationGradCPUKernel::Run() {
  int error_code = ParallelLaunch(this->context_->thread_pool_, ActivationGradRun, this, thread_count_);
  int error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                                  ActivationGradRun, this, thread_count_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "Activation Grad function error error_code[" << error_code << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32_grad/adam.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32_grad/adam.cc
@@ -102,7 +102,8 @@ int AdamRun(void *cdata, int task_id) {
 }
 int AdamCPUKernel::Run() {
  int error_code = ParallelLaunch(this->context_->thread_pool_, AdamRun, this, thread_count_);
  int error_code =
    ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, AdamRun, this, thread_count_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "Adam function error error_code[" << error_code << "]";
    return RET_ERROR;
@@ -145,9 +146,10 @@ int AdamCPUKernel::OptimizerStep() {
 kernel::LiteKernel *CpuAdamFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
                                             const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                             const lite::InnerContext *ctx, const kernel::KernelKey &desc) {
                                             const lite::Context *ctx, const kernel::KernelKey &desc) {
  MS_ASSERT(desc.type == schema::PrimitiveType_Adam);
  auto *kernel = new (std::nothrow) AdamCPUKernel(opParameter, inputs, outputs, ctx);
  auto *kernel =
    new (std::nothrow) AdamCPUKernel(opParameter, inputs, outputs, static_cast<const lite::InnerContext *>(ctx));
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "new AdamCPUKernel fail!";
    free(opParameter);
--- a/mindspore/lite/src/runtime/kernel/arm/fp32_grad/apply_momentum.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32_grad/apply_momentum.cc
@@ -82,7 +82,8 @@ int ApplyMomentumRun(void *cdata, int task_id) {
 }
 int ApplyMomentumCPUKernel::Run() {
  int error_code = ParallelLaunch(this->context_->thread_pool_, ApplyMomentumRun, this, thread_count_);
  int error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                                  ApplyMomentumRun, this, thread_count_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "Apply Momentum function error error_code[" << error_code << "]";
    return RET_ERROR;
@@ -119,10 +120,11 @@ int ApplyMomentumCPUKernel::OptimizerStep() {
 kernel::LiteKernel *CpuApplyMomentumFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
                                                      const std::vector<lite::Tensor *> &outputs,
                                                      OpParameter *opParameter, const lite::InnerContext *ctx,
                                                      OpParameter *opParameter, const lite::Context *ctx,
                                                      const kernel::KernelKey &desc) {
  MS_ASSERT(desc.type == schema::PrimitiveType_ApplyMomentum);
  auto *kernel = new (std::nothrow) ApplyMomentumCPUKernel(opParameter, inputs, outputs, ctx);
  auto *kernel = new (std::nothrow)
    ApplyMomentumCPUKernel(opParameter, inputs, outputs, static_cast<const lite::InnerContext *>(ctx));
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "new ApplyMomentumCPUKernel fail!";
    free(opParameter);