!11265 conv fp16 cast delete

From: @ling_qiao_min Reviewed-by: @zhang_xue_tong Signed-off-by: @zhang_xue_tong
4 years ago · ffa92acdef
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.h
@@ -36,13 +36,9 @@ class ConcatFp16CPUKernel : public LiteKernel {
      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
    concat_param_ = reinterpret_cast<ConcatParameter *>(op_parameter_);
  }

  ~ConcatFp16CPUKernel() = default;

  int Init() override;

  int ReSize() override;

  int Run() override;

 private:
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
@@ -207,18 +207,12 @@ static int Convolution1x1Fp16RunHw(void *cdata, int task_id) {
 }

 int Convolution1x1FP16CPUKernel::Run() {
  auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Get executor tensor failed.";
    ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
    return ret;
  }
  ConvolutionBaseFP16CPUKernel::GetExecuteTensor();

  pack_input_ = reinterpret_cast<float16_t *>(
    ctx_->allocator->Malloc(matmul_param_->row_16_ * matmul_param_->deep_ * sizeof(float16_t)));
  if (pack_input_ == nullptr) {
    MS_LOG(ERROR) << "Conv1x1 Malloc pack_input_ error!";
    ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
    return RET_MEMORY_FAILED;
  }

@@ -232,6 +226,7 @@ int Convolution1x1FP16CPUKernel::Run() {
      input_ptr_ = batch_in;
    }

    int ret = RET_ERROR;
    if (multi_thread_by_hw_) {
      ret = ParallelLaunch(this->context_->thread_pool_, Convolution1x1Fp16RunHw, this, thread_count_);
    } else {
@@ -240,16 +235,12 @@ int Convolution1x1FP16CPUKernel::Run() {
    }
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "ParallelLaunch failed.";
      ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
      ctx_->allocator->Free(pack_input_);
      pack_input_ = nullptr;
      return ret;
    }
  }

  ConvolutionBaseFP16CPUKernel::IfCastOutput();
  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();

  ctx_->allocator->Free(pack_input_);
  pack_input_ = nullptr;
  return RET_OK;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc
@@ -33,19 +33,10 @@ ConvolutionBaseFP16CPUKernel::~ConvolutionBaseFP16CPUKernel() {
 }

 int ConvolutionBaseFP16CPUKernel::GetExecuteTensor() {
  // ===================input====================//
  auto input_tensor = in_tensors_.at(kInputIndex);
  in_data_type_ = input_tensor->data_type();
  MS_ASSERT(in_data_type_ == kNumberTypeFloat32 || in_data_type_ == kNumberTypeFloat16);

  execute_input_ = ConvertInputFp32toFp16(input_tensor, context_);

  // ==================output====================//
  auto out_tensor = out_tensors_.at(kOutputIndex);
  out_data_type_ = out_tensor->data_type();
  MS_ASSERT(out_data_type_ == kNumberTypeFloat32 || out_data_type_ == kNumberTypeFloat16);

  execute_output_ = MallocOutputFp16(out_tensor, context_);
  auto input_tensor = in_tensors_.at(0);
  auto output_tensor = out_tensors_.at(0);
  execute_input_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
  execute_output_ = reinterpret_cast<float16_t *>(output_tensor->data_c());
  return RET_OK;
 }

@@ -78,25 +69,4 @@ int ConvolutionBaseFP16CPUKernel::GetExecuteFilter() {
  }
  return RET_OK;
 }

 void ConvolutionBaseFP16CPUKernel::IfCastOutput() {
  if (out_data_type_ == kNumberTypeFloat32) {
    auto out_tensor = out_tensors_.at(kOutputIndex);
    auto out_ele_num = out_tensor->ElementsNum();
    auto output_addr = reinterpret_cast<float *>(out_tensor->MutableData());
    Float16ToFloat32(execute_output_, output_addr, out_ele_num);
  }
 }

 void ConvolutionBaseFP16CPUKernel::FreeTmpBuffer() {
  if (in_data_type_ == kNumberTypeFloat32) {
    context_->allocator->Free(execute_input_);
    execute_input_ = nullptr;
  }
  if (out_data_type_ == kNumberTypeFloat32) {
    context_->allocator->Free(execute_output_);
    execute_output_ = nullptr;
  }
 }

 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h
@@ -38,16 +38,12 @@ class ConvolutionBaseFP16CPUKernel : public ConvolutionBaseCPUKernel {
  int RunImpl(int task_id) { return mindspore::lite::RET_OK; }
  virtual int GetExecuteTensor();
  virtual int GetExecuteFilter();
  virtual void IfCastOutput();
  void FreeTmpBuffer();

 protected:
  float16_t *fp16_weight_ = nullptr;
  float16_t *execute_input_ = nullptr;
  float16_t *execute_weight_ = nullptr;
  float16_t *execute_output_ = nullptr;
  TypeId in_data_type_;
  TypeId out_data_type_;
 };
 }  // namespace mindspore::kernel

--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
@@ -114,19 +114,13 @@ static int ConvDwFp16Run(void *cdata, int task_id) {
 }

 int ConvolutionDepthwiseFp16CPUKernel::Run() {
  auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Get Execute tensor failed.";
    return ret;
  }
  ConvolutionBaseFP16CPUKernel::GetExecuteTensor();

  ret = ParallelLaunch(this->context_->thread_pool_, ConvDwFp16Run, this, conv_param_->thread_num_);
  auto ret = ParallelLaunch(this->context_->thread_pool_, ConvDwFp16Run, this, conv_param_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvDwFp16Run error: error_code[" << ret << "]";
  }

  ConvolutionBaseFP16CPUKernel::IfCastOutput();
  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
  return ret;
 }

--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc
@@ -149,13 +149,8 @@ int ConvolutionDepthwiseSWFp16CPUKernel::Run() {
    return ret;
  }

  ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Get Execute tensor failed.";
    FreePackedInputOutput();
    ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
    return ret;
  }
  ConvolutionBaseFP16CPUKernel::GetExecuteTensor();

  if (need_align_) {
    PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
                        conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
@@ -172,8 +167,7 @@ int ConvolutionDepthwiseSWFp16CPUKernel::Run() {
    PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
                        conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
  }
  ConvolutionBaseFP16CPUKernel::IfCastOutput();
  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();

  FreePackedInputOutput();
  return ret;
 }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
@@ -128,17 +128,11 @@ static int ConvolutionFp16Impl(void *cdata, int task_id) {
 }

 int ConvolutionFP16CPUKernel::Run() {
  auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Get Execute tensor failed.";
    ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
    return ret;
  }
  ConvolutionBaseFP16CPUKernel::GetExecuteTensor();

  ret = InitTmpBuffer();
  auto ret = InitTmpBuffer();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Init tmp buffer failed.";
    ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
    FreeTmpBuffer();
    return RET_ERROR;
  }
@@ -147,8 +141,7 @@ int ConvolutionFP16CPUKernel::Run() {
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "conv fp16 error ret[" << ret << "]";
  }
  ConvolutionBaseFP16CPUKernel::IfCastOutput();
  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();

  FreeTmpBuffer();
  return ret;
 }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
@@ -195,17 +195,11 @@ static int ConvolutionWinogradFp16Impl(void *cdata, int task_id) {
 }

 int ConvolutionWinogradFP16CPUKernel::Run() {
  auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Get Execute tensor failed.";
    ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
    return ret;
  }
  ConvolutionBaseFP16CPUKernel::GetExecuteTensor();

  ret = InitTmpBuffer();
  auto ret = InitTmpBuffer();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Init tmp buffer failed.";
    ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
    FreeTmpBuffer();
    return RET_ERROR;
  }
@@ -215,8 +209,6 @@ int ConvolutionWinogradFP16CPUKernel::Run() {
    MS_LOG(ERROR) << "conv winograd error error_code[" << ret << "]";
  }

  ConvolutionBaseFP16CPUKernel::IfCastOutput();
  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
  FreeTmpBuffer();
  return ret;
 }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
@@ -162,13 +162,8 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
    return RET_ERROR;
  }

  ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Get Execute tensor failed.";
    FreePackedInputOutput();
    ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
    return ret;
  }
  ConvolutionBaseFP16CPUKernel::GetExecuteTensor();

  if (need_align_) {
    PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
                        conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
@@ -189,8 +184,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
    PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
                        conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
  }
  ConvolutionBaseFP16CPUKernel::IfCastOutput();
  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();

  FreePackedInputOutput();
  return ret;
 }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc
@@ -189,7 +189,6 @@ int DeConvolutionFp16CPUKernel::Run() {
  int error_code = InitRunBuf();
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "deconv fp16 InitRunBuf error! error_code[" << error_code << "]";
    ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
    FreeRunBuf();
    return RET_ERROR;
  }
@@ -206,8 +205,6 @@ int DeConvolutionFp16CPUKernel::Run() {
    }
  }

  ConvolutionBaseFP16CPUKernel::IfCastOutput();
  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
  FreeRunBuf();
  return error_code;
 }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc
@@ -405,9 +405,6 @@ int DeConvWinogradFp16CPUKernel::Run() {
    ParallelLaunch(this->context_->thread_pool_, DeConvWgPostFp16Run, this, thread_num_hw_);
  }

  ConvolutionBaseFP16CPUKernel::IfCastOutput();
  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();

  return RET_OK;
 }
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.cc
@@ -33,9 +33,6 @@ using mindspore::schema::PrimitiveType_Scale;
 namespace mindspore::kernel {

 int ScaleFp16CPUKernel::InitScaleOffset() {
  auto input_tensor = in_tensors_.at(0);
  malloc_input_ = input_tensor->data_type() == kNumberTypeFloat32;

  auto scale_tensor = in_tensors_.at(1);
  malloc_scale_ = scale_tensor->data_type() == kNumberTypeFloat32;

@@ -45,9 +42,6 @@ int ScaleFp16CPUKernel::InitScaleOffset() {
    auto offset_tensor = in_tensors_.at(2);
    malloc_offset_ = offset_tensor->data_type() == kNumberTypeFloat32;
  }

  auto output_tensor = out_tensors_.at(0);
  malloc_output_ = output_tensor->data_type() == kNumberTypeFloat32;
  return RET_OK;
 }

@@ -103,6 +97,11 @@ int ScaleFp16Run(void *cdata, int task_id) {
 }

 int ScaleFp16CPUKernel::Run() {
  auto input_tensor = in_tensors_.at(0);
  auto output_tensor = out_tensors_.at(0);
  input_ = reinterpret_cast<float16_t *>(input_tensor->MutableData());
  output_ = reinterpret_cast<float16_t *>(output_tensor->MutableData());

  auto ret = InitScaleOffset();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Scale fp16 InitScaleOffset failed.";
@@ -123,20 +122,11 @@ int ScaleFp16CPUKernel::Run() {
    return RET_ERROR;
  }

  // if output tensor is fp32, we need to transform
  if (malloc_output_) {
    auto out_tensor = out_tensors_.at(0);
    Float16ToFloat32(output_, reinterpret_cast<float *>(out_tensor->MutableData()), out_tensor->ElementsNum());
  }
  FreeTmpBuffer();
  return RET_OK;
 }

 int ScaleFp16CPUKernel::MallocAssignTmpBuffer() {
  input_ = ConvertInputFp32toFp16(in_tensors_.at(0), context_);
  if (input_ == nullptr) {
    return RET_ERROR;
  }
  scale_ = ConvertInputFp32toFp16(in_tensors_.at(1), context_);
  if (scale_ == nullptr) {
    return RET_ERROR;
@@ -155,18 +145,10 @@ int ScaleFp16CPUKernel::MallocAssignTmpBuffer() {
    }
    memset(offset_, 0, in_tensors_.at(1)->ElementsNum() * sizeof(float16_t));
  }
  output_ = MallocOutputFp16(out_tensors_.at(0), context_);
  if (output_ == nullptr) {
    return RET_ERROR;
  }
  return RET_OK;
 }

 void ScaleFp16CPUKernel::FreeTmpBuffer() {
  if (malloc_input_ && input_ != nullptr) {
    context_->allocator->Free(input_);
    input_ = nullptr;
  }
  if (malloc_scale_ && scale_ != nullptr) {
    context_->allocator->Free(scale_);
    scale_ = nullptr;
@@ -175,10 +157,6 @@ void ScaleFp16CPUKernel::FreeTmpBuffer() {
    context_->allocator->Free(offset_);
    offset_ = nullptr;
  }
  if (malloc_output_ && output_ != nullptr) {
    context_->allocator->Free(output_);
    output_ = nullptr;
  }
 }

 REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Scale, LiteKernelCreator<ScaleFp16CPUKernel>)
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.h
@@ -43,10 +43,8 @@ class ScaleFp16CPUKernel : public ScaleCPUKernel {
  void FreeTmpBuffer();

 private:
  bool malloc_input_ = false;
  bool malloc_scale_ = false;
  bool malloc_offset_ = false;
  bool malloc_output_ = false;

  float16_t *input_ = nullptr;
  float16_t *scale_ = nullptr;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.cc
@@ -29,7 +29,6 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Stack;

 namespace mindspore::kernel {

 int StackFp16CPUKernel::Init() {
  if (!InferShapeDone()) {
    return RET_OK;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.h
@@ -27,9 +27,7 @@ class StackFp16CPUKernel : public StackCPUKernel {
                     const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
                     const mindspore::lite::PrimitiveC *primitive)
      : StackCPUKernel(parameter, inputs, outputs, ctx, primitive) {}

  ~StackFp16CPUKernel() = default;

  int Init() override;
  int Run() override;