Browse Source

[MS][LITE] optimize arm cpu fp16 conv op: add common converter functions for input and output

tags/v0.7.0-beta
yangruoqi713 5 years ago
parent
commit
1b89036bea
19 changed files with 130 additions and 222 deletions
  1. +46
    -0
      mindspore/lite/src/runtime/kernel/arm/fp16/common_fp16.cc
  2. +28
    -0
      mindspore/lite/src/runtime/kernel/arm/fp16/common_fp16.h
  3. +1
    -39
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
  4. +0
    -7
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
  5. +1
    -24
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
  6. +0
    -6
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h
  7. +20
    -21
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc
  8. +4
    -4
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h
  9. +8
    -19
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
  10. +3
    -3
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
  11. +1
    -23
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
  12. +0
    -6
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h
  13. +1
    -23
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc
  14. +0
    -6
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h
  15. +1
    -23
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
  16. +0
    -6
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
  17. +11
    -7
      mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
  18. +3
    -3
      mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h
  19. +2
    -2
      mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pack_fp16.c

+ 46
- 0
mindspore/lite/src/runtime/kernel/arm/fp16/common_fp16.cc View File

@@ -0,0 +1,46 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "src/runtime/kernel/arm/fp16/common_fp16.h"
#include "src/runtime/kernel/arm/nnacl/fp16/cast_fp16.h"

namespace mindspore::kernel {
// Returns an fp16 view of |input|'s data for kernel execution.
//  - If the tensor holds fp32 data, a temporary fp16 buffer is taken from the
//    context allocator and filled by casting every element; the caller owns the
//    buffer and must release it via ctx->allocator->Free().
//  - If the tensor already holds fp16 data, the tensor's own buffer is returned
//    directly (no allocation, must NOT be freed by the caller).
// Returns nullptr when the temporary buffer cannot be allocated.
float16_t *ConvertInputFp32toFp16(lite::tensor::Tensor *input, const lite::Context *ctx) {
  float16_t *fp16_data = nullptr;
  auto data_type = input->data_type();
  if (data_type == kNumberTypeFloat32) {
    auto ele_num = input->ElementsNum();
    fp16_data = reinterpret_cast<float16_t *>(ctx->allocator->Malloc(ele_num * sizeof(float16_t)));
    if (fp16_data == nullptr) {
      // Allocation failed: bail out before Float32ToFloat16 writes through a
      // null destination pointer. Caller must check the returned pointer.
      return nullptr;
    }
    auto ori_data = reinterpret_cast<float *>(input->Data());
    Float32ToFloat16(ori_data, fp16_data, ele_num);
  } else {
    fp16_data = reinterpret_cast<float16_t *>(input->Data());
  }
  return fp16_data;
}

// Returns an fp16 buffer suitable for writing |output|'s result into.
//  - For an fp32 output tensor, a fresh (uninitialized) fp16 scratch buffer is
//    obtained from the context allocator; the caller later casts it back to
//    fp32 and must release it via ctx->allocator->Free().
//  - For an fp16 output tensor, the tensor's own buffer is returned so the
//    kernel writes results in place (no allocation, must NOT be freed).
// May return nullptr if the allocator fails; caller is responsible for checking.
float16_t *MallocOutputFp16(lite::tensor::Tensor *output, const lite::Context *ctx) {
  if (output->data_type() != kNumberTypeFloat32) {
    // Already fp16: reuse the tensor's storage directly.
    return reinterpret_cast<float16_t *>(output->Data());
  }
  // Fp32 output: hand back a temporary fp16 buffer sized to the element count.
  auto element_count = output->ElementsNum();
  return reinterpret_cast<float16_t *>(ctx->allocator->Malloc(element_count * sizeof(float16_t)));
}
} // namespace mindspore::kernel

+ 28
- 0
mindspore/lite/src/runtime/kernel/arm/fp16/common_fp16.h View File

@@ -0,0 +1,28 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_COMMON_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_COMMON_FP16_H_

#include "src/lite_kernel.h"

namespace mindspore::kernel {
// Returns an fp16 pointer for |input|: if the tensor is fp32, allocates a
// temporary buffer from |ctx|'s allocator and converts the data into it
// (caller frees via the allocator); if already fp16, returns the tensor's
// own data pointer (caller must not free).
float16_t *ConvertInputFp32toFp16(lite::tensor::Tensor *input, const lite::Context *ctx);

// Returns an fp16 pointer for writing |output|: if the tensor is fp32,
// allocates an uninitialized fp16 buffer from |ctx|'s allocator (caller
// casts back and frees); if already fp16, returns the tensor's own data
// pointer so results are written in place.
float16_t *MallocOutputFp16(lite::tensor::Tensor *output, const lite::Context *ctx);

} // namespace mindspore::kernel

#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_COMMON_FP16_H_

+ 1
- 39
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc View File

@@ -98,28 +98,6 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() {
return RET_OK; return RET_OK;
} }


int Convolution1x1FP16CPUKernel::InitBuffer() {
/*=============================fp16_input_============================*/
size_t fp16_input_size = conv_param_->input_channel_ * conv_param_->input_batch_ * conv_param_->input_h_ *
conv_param_->input_w_ * sizeof(float16_t);
fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size));
if (fp16_input_ == nullptr) {
MS_LOG(ERROR) << "malloc fp16_input_ failed.";
return RET_ERROR;
}
memset(fp16_input_, 0, fp16_input_size);

/*=============================fp16_out_============================*/
size_t fp16_output_size = conv_param_->output_channel_ * conv_param_->output_batch_ * conv_param_->output_h_ *
conv_param_->output_w_ * sizeof(float16_t);
fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size));
if (fp16_out_ == nullptr) {
MS_LOG(ERROR) << "malloc fp16_out_ failed.";
return RET_ERROR;
}
return RET_OK;
}

int Convolution1x1FP16CPUKernel::Init() { int Convolution1x1FP16CPUKernel::Init() {
auto ret = ConvolutionBaseCPUKernel::Init(); auto ret = ConvolutionBaseCPUKernel::Init();
if (ret != RET_OK) { if (ret != RET_OK) {
@@ -136,11 +114,6 @@ int Convolution1x1FP16CPUKernel::Init() {
MS_LOG(ERROR) << "Init conv1x1 param failed."; MS_LOG(ERROR) << "Init conv1x1 param failed.";
return ret; return ret;
} }
ret = InitBuffer();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init buffer failed.";
return ret;
}
ret = InitWeightBias(); ret = InitWeightBias();
if (ret != RET_OK) { if (ret != RET_OK) {
MS_LOG(ERROR) << "Init weight bias failed."; MS_LOG(ERROR) << "Init weight bias failed.";
@@ -150,12 +123,6 @@ int Convolution1x1FP16CPUKernel::Init() {
} }


int Convolution1x1FP16CPUKernel::ReSize() { int Convolution1x1FP16CPUKernel::ReSize() {
if (fp16_out_ != nullptr) {
free(fp16_out_);
}
if (fp16_input_ != nullptr) {
free(fp16_input_);
}
if (fp16_weight_ != nullptr) { if (fp16_weight_ != nullptr) {
free(fp16_weight_); free(fp16_weight_);
} }
@@ -181,12 +148,6 @@ int Convolution1x1FP16CPUKernel::ReSize() {
MS_LOG(ERROR) << "Init conv1x1 param failed."; MS_LOG(ERROR) << "Init conv1x1 param failed.";
return ret; return ret;
} }
ret = InitBuffer();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init buffer failed.";
return ret;
}

return RET_OK; return RET_OK;
} }


@@ -253,6 +214,7 @@ int Convolution1x1FP16CPUKernel::Run() {
} }


ConvolutionBaseFP16CPUKernel::IfCastOutput(); ConvolutionBaseFP16CPUKernel::IfCastOutput();
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
return RET_OK; return RET_OK;
} }
} // namespace mindspore::kernel } // namespace mindspore::kernel

+ 0
- 7
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h View File

@@ -35,15 +35,9 @@ class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
matmul_param_ = new MatMulParameter(); matmul_param_ = new MatMulParameter();
} }
~Convolution1x1FP16CPUKernel() override { ~Convolution1x1FP16CPUKernel() override {
if (fp16_input_ != nullptr) {
free(fp16_input_);
}
if (fp16_weight_ != nullptr) { if (fp16_weight_ != nullptr) {
free(fp16_weight_); free(fp16_weight_);
} }
if (fp16_out_ != nullptr) {
free(fp16_out_);
}
if (input_ptr_ != nullptr) { if (input_ptr_ != nullptr) {
free(input_ptr_); free(input_ptr_);
} }
@@ -57,7 +51,6 @@ class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
int ReSize() override; int ReSize() override;
int Run() override; int Run() override;
int RunImpl(int task_id); int RunImpl(int task_id);
int InitBuffer();
int InitConv1x1Param(); int InitConv1x1Param();
int InitMatmulParam(); int InitMatmulParam();
int InitWeightBias(); int InitWeightBias();


+ 1
- 24
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc View File

@@ -132,16 +132,6 @@ int Convolution3x3FP16CPUKernel::InitTmpBuffer() {
} }
memset(tmp_out_, 0, tmp_out_size); memset(tmp_out_, 0, tmp_out_size);


/*=============================fp16_input_============================*/
size_t fp16_input_size = conv_param_->input_channel_ * conv_param_->input_batch_ * conv_param_->input_h_ *
conv_param_->input_w_ * sizeof(float16_t);
fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size));
if (fp16_input_ == nullptr) {
MS_LOG(ERROR) << "malloc fp16_input_ failed.";
return RET_ERROR;
}
memset(fp16_input_, 0, fp16_input_size);

/*=============================nhwc4_input_============================*/ /*=============================nhwc4_input_============================*/
size_t nhwc4_input_size = size_t nhwc4_input_size =
iC4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t); iC4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
@@ -152,14 +142,6 @@ int Convolution3x3FP16CPUKernel::InitTmpBuffer() {
} }
memset(nhwc4_input_, 0, nhwc4_input_size); memset(nhwc4_input_, 0, nhwc4_input_size);


/*=============================fp16_out_============================*/
size_t fp16_output_size = conv_param_->output_channel_ * conv_param_->output_batch_ * conv_param_->output_h_ *
conv_param_->output_w_ * sizeof(float16_t);
fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size));
if (fp16_out_ == nullptr) {
MS_LOG(ERROR) << "malloc fp16_out_ failed.";
return RET_ERROR;
}
return RET_OK; return RET_OK;
} }


@@ -207,12 +189,6 @@ int Convolution3x3FP16CPUKernel::ReSize() {
if (tmp_out_ != nullptr) { if (tmp_out_ != nullptr) {
free(tmp_out_); free(tmp_out_);
} }
if (fp16_out_ != nullptr) {
free(fp16_out_);
}
if (fp16_input_ != nullptr) {
free(fp16_input_);
}
if (nhwc4_input_ != nullptr) { if (nhwc4_input_ != nullptr) {
free(nhwc4_input_); free(nhwc4_input_);
} }
@@ -304,6 +280,7 @@ int Convolution3x3FP16CPUKernel::Run() {
} }


ConvolutionBaseFP16CPUKernel::IfCastOutput(); ConvolutionBaseFP16CPUKernel::IfCastOutput();
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
return RET_OK; return RET_OK;
} }
} // namespace mindspore::kernel } // namespace mindspore::kernel

+ 0
- 6
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h View File

@@ -31,15 +31,9 @@ class Convolution3x3FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
const lite::Primitive *primitive) const lite::Primitive *primitive)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {} : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~Convolution3x3FP16CPUKernel() override { ~Convolution3x3FP16CPUKernel() override {
if (fp16_input_ != nullptr) {
free(fp16_input_);
}
if (fp16_weight_ != nullptr) { if (fp16_weight_ != nullptr) {
free(fp16_weight_); free(fp16_weight_);
} }
if (fp16_out_ != nullptr) {
free(fp16_out_);
}
if (transformed_filter_addr_ != nullptr) { if (transformed_filter_addr_ != nullptr) {
free(transformed_filter_addr_); free(transformed_filter_addr_);
} }


+ 20
- 21
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc View File

@@ -16,6 +16,7 @@


#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h" #include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
#include "src/runtime/kernel/arm/nnacl/fp16/cast_fp16.h" #include "src/runtime/kernel/arm/nnacl/fp16/cast_fp16.h"
#include "src/runtime/kernel/arm/fp16/common_fp16.h"
#include "schema/model_generated.h" #include "schema/model_generated.h"
#include "src/kernel_factory.h" #include "src/kernel_factory.h"
#include "include/errorcode.h" #include "include/errorcode.h"
@@ -25,28 +26,17 @@ namespace mindspore::kernel {
int ConvolutionBaseFP16CPUKernel::GetExecuteTensor() { int ConvolutionBaseFP16CPUKernel::GetExecuteTensor() {
// ===================input====================// // ===================input====================//
auto input_tensor = in_tensors_.at(kInputIndex); auto input_tensor = in_tensors_.at(kInputIndex);
auto input_data_type = input_tensor->data_type();
MS_ASSERT(input_data_type == kNumberTypeFloat32 || input_data_type == kNumberTypeFloat16);
if (input_data_type == kNumberTypeFloat32) {
auto input_ele_num = input_tensor->ElementsNum();
auto ori_input_data = reinterpret_cast<float *>(input_tensor->Data());
Float32ToFloat16(ori_input_data, fp16_input_, input_ele_num);
execute_input_ = fp16_input_;
} else {
auto ori_input_data = reinterpret_cast<float16_t *>(input_tensor->Data());
execute_input_ = ori_input_data;
}
in_data_type_ = input_tensor->data_type();
MS_ASSERT(in_data_type_ == kNumberTypeFloat32 || in_data_type_ == kNumberTypeFloat16);

execute_input_ = ConvertInputFp32toFp16(input_tensor, context_);

// ==================output====================// // ==================output====================//
auto out_tensor = out_tensors_.at(kOutputIndex); auto out_tensor = out_tensors_.at(kOutputIndex);
auto out_data_type = out_tensor->data_type();
MS_ASSERT(out_data_type == kNumberTypeFloat32 || out_data_type == kNumberTypeFloat16);
out_data_type_ = out_data_type;
if (out_data_type == kNumberTypeFloat32) {
execute_output_ = fp16_out_;
} else {
auto out_ptr = reinterpret_cast<float16_t *>(out_tensor->Data());
execute_output_ = out_ptr;
}
out_data_type_ = out_tensor->data_type();
MS_ASSERT(out_data_type_ == kNumberTypeFloat32 || out_data_type_ == kNumberTypeFloat16);

execute_output_ = MallocOutputFp16(out_tensor, context_);
return RET_OK; return RET_OK;
} }


@@ -79,7 +69,16 @@ void ConvolutionBaseFP16CPUKernel::IfCastOutput() {
auto out_tensor = out_tensors_.at(kOutputIndex); auto out_tensor = out_tensors_.at(kOutputIndex);
auto out_ele_num = out_tensor->ElementsNum(); auto out_ele_num = out_tensor->ElementsNum();
auto output_addr = reinterpret_cast<float *>(out_tensor->Data()); auto output_addr = reinterpret_cast<float *>(out_tensor->Data());
Float16ToFloat32(fp16_out_, output_addr, out_ele_num);
Float16ToFloat32(execute_output_, output_addr, out_ele_num);
}
}

void ConvolutionBaseFP16CPUKernel::FreeTmpBuffer() {
if (in_data_type_ == kNumberTypeFloat32) {
context_->allocator->Free(execute_input_);
}
if (out_data_type_ == kNumberTypeFloat32) {
context_->allocator->Free(execute_output_);
} }
} }




+ 4
- 4
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h View File

@@ -39,14 +39,14 @@ class ConvolutionBaseFP16CPUKernel : public ConvolutionBaseCPUKernel {
virtual int GetExecuteTensor(); virtual int GetExecuteTensor();
virtual int GetExecuteFilter(); virtual int GetExecuteFilter();
virtual void IfCastOutput(); virtual void IfCastOutput();
void FreeTmpBuffer();


protected: protected:
float16_t *fp16_input_ = nullptr;
float16_t *fp16_weight_ = nullptr; float16_t *fp16_weight_ = nullptr;
float16_t *fp16_out_ = nullptr;
float16_t *execute_input_;
float16_t *execute_input_; // ctx allocator malloc and free
float16_t *execute_weight_; float16_t *execute_weight_;
float16_t *execute_output_;
float16_t *execute_output_; // ctx allocator malloc and free
TypeId in_data_type_;
TypeId out_data_type_; TypeId out_data_type_;
}; };
} // namespace mindspore::kernel } // namespace mindspore::kernel


+ 8
- 19
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc View File

@@ -173,22 +173,13 @@ int ConvolutionDepthwiseFp16CPUKernel::Run() {
return RET_ERROR; return RET_ERROR;
} }


auto input_tensor = in_tensors_.at(kInputIndex);
float16_t *input_addr;
if (input_tensor->data_type() == kNumberTypeFloat32) {
input_addr =
reinterpret_cast<float16_t *>(context_->allocator->Malloc(input_tensor->ElementsNum() * sizeof(float16_t)));
if (input_addr == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
Float32ToFloat16(reinterpret_cast<float *>(input_tensor->Data()), input_addr, input_tensor->ElementsNum());
} else {
input_addr = reinterpret_cast<float16_t *>(input_tensor->Data());
ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get Execute tensor failed.";
return ret;
} }

// pack input: to nhwc8 // pack input: to nhwc8
PackNHWCToNHWC8Fp16(input_addr, packed_input_, conv_param_->input_batch_,
PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);


ret = LiteBackendParallelLaunch(ConvDwFp16Run, this, conv_param_->thread_num_); ret = LiteBackendParallelLaunch(ConvDwFp16Run, this, conv_param_->thread_num_);
@@ -197,13 +188,11 @@ int ConvolutionDepthwiseFp16CPUKernel::Run() {
return RET_ERROR; return RET_ERROR;
} }


auto output_addr = reinterpret_cast<float16_t *>(out_tensors_.at(kOutputIndex)->Data());
PackNHWC8ToNHWCFp16(packed_output_, output_addr, conv_param_->output_batch_,
PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);


if (input_tensor->data_type() == kNumberTypeFloat32) {
context_->allocator->Free(input_addr);
}
ConvolutionBaseFP16CPUKernel::IfCastOutput();
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
return RET_OK; return RET_OK;
} }




+ 3
- 3
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h View File

@@ -19,7 +19,7 @@


#include <vector> #include <vector>
#include "src/lite_kernel.h" #include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/base/convolution_base.h"
#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
#include "src/runtime/kernel/arm/nnacl/fp16/conv_depthwise_fp16.h" #include "src/runtime/kernel/arm/nnacl/fp16/conv_depthwise_fp16.h"


#ifdef __cplusplus #ifdef __cplusplus
@@ -34,12 +34,12 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo




namespace mindspore::kernel { namespace mindspore::kernel {
class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel {
class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
public: public:
ConvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, ConvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx, const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
const lite::Primitive *primitive) const lite::Primitive *primitive)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~ConvolutionDepthwiseFp16CPUKernel() override; ~ConvolutionDepthwiseFp16CPUKernel() override;


int Init() override; int Init() override;


+ 1
- 23
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc View File

@@ -103,15 +103,6 @@ int ConvolutionFP16CPUKernel::InitTmpBuffer() {
} }
memset(packed_input_, 0, in_batch * packed_input_size * sizeof(float16_t)); memset(packed_input_, 0, in_batch * packed_input_size * sizeof(float16_t));


/*=============================fp16_input_============================*/
size_t fp16_input_size =
in_channel * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size));
if (fp16_input_ == nullptr) {
MS_LOG(ERROR) << "malloc fp16_input_ failed.";
return RET_ERROR;
}

/*=============================nhwc4_input_============================*/ /*=============================nhwc4_input_============================*/
size_t nhwc4_input_size = channel_block * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * size_t nhwc4_input_size = channel_block * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ *
conv_param_->input_w_ * sizeof(float16_t); conv_param_->input_w_ * sizeof(float16_t);
@@ -129,14 +120,6 @@ int ConvolutionFP16CPUKernel::InitTmpBuffer() {
return RET_ERROR; return RET_ERROR;
} }


/*=============================fp16_out_============================*/
size_t fp16_output_size =
out_channel * conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * sizeof(float16_t);
fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size));
if (fp16_out_ == nullptr) {
MS_LOG(ERROR) << "malloc fp16_out_ failed.";
return RET_ERROR;
}
return RET_OK; return RET_OK;
} }


@@ -181,12 +164,6 @@ int ConvolutionFP16CPUKernel::ReSize() {
if (nhwc4_input_ != nullptr) { if (nhwc4_input_ != nullptr) {
free(nhwc4_input_); free(nhwc4_input_);
} }
if (fp16_input_ != nullptr) {
free(fp16_input_);
}
if (fp16_out_ != nullptr) {
free(fp16_out_);
}


auto ret = ConvolutionBaseCPUKernel::Init(); auto ret = ConvolutionBaseCPUKernel::Init();
if (ret != RET_OK) { if (ret != RET_OK) {
@@ -242,6 +219,7 @@ int ConvolutionFP16CPUKernel::Run() {
return RET_ERROR; return RET_ERROR;
} }


ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
ConvolutionBaseFP16CPUKernel::IfCastOutput(); ConvolutionBaseFP16CPUKernel::IfCastOutput();
return RET_OK; return RET_OK;
} }


+ 0
- 6
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h View File

@@ -30,15 +30,9 @@ class ConvolutionFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
const lite::Primitive *primitive) const lite::Primitive *primitive)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {} : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~ConvolutionFP16CPUKernel() override { ~ConvolutionFP16CPUKernel() override {
if (fp16_input_ != nullptr) {
free(fp16_input_);
}
if (fp16_weight_ != nullptr) { if (fp16_weight_ != nullptr) {
free(fp16_weight_); free(fp16_weight_);
} }
if (fp16_out_ != nullptr) {
free(fp16_out_);
}
if (packed_input_ != nullptr) { if (packed_input_ != nullptr) {
free(packed_input_); free(packed_input_);
} }


+ 1
- 23
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc View File

@@ -106,15 +106,6 @@ int ConvolutionSWFP16CPUKernel::InitTmpBuffer() {
int channel_block = UP_DIV(in_channel, C4NUM); int channel_block = UP_DIV(in_channel, C4NUM);
int oc4 = UP_DIV(out_channel, C4NUM); int oc4 = UP_DIV(out_channel, C4NUM);


/*=============================fp16_input_============================*/
size_t fp16_input_size =
in_channel * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size));
if (fp16_input_ == nullptr) {
MS_LOG(ERROR) << "malloc fp16_input_ failed.";
return RET_ERROR;
}

/*=============================nhwc4_input_============================*/ /*=============================nhwc4_input_============================*/
size_t nhwc4_input_size = channel_block * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * size_t nhwc4_input_size = channel_block * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ *
conv_param_->input_w_ * sizeof(float16_t); conv_param_->input_w_ * sizeof(float16_t);
@@ -133,14 +124,6 @@ int ConvolutionSWFP16CPUKernel::InitTmpBuffer() {
return RET_ERROR; return RET_ERROR;
} }


/*=============================fp16_out_============================*/
size_t fp16_output_size =
out_channel * conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * sizeof(float16_t);
fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size));
if (fp16_out_ == nullptr) {
MS_LOG(ERROR) << "malloc fp16_out_ failed.";
return RET_ERROR;
}
return RET_OK; return RET_OK;
} }


@@ -186,12 +169,6 @@ int ConvolutionSWFP16CPUKernel::ReSize() {
if (nhwc4_input_ != nullptr) { if (nhwc4_input_ != nullptr) {
free(nhwc4_input_); free(nhwc4_input_);
} }
if (fp16_input_ != nullptr) {
free(fp16_input_);
}
if (fp16_out_ != nullptr) {
free(fp16_out_);
}
delete slidingWindow_param_; delete slidingWindow_param_;


auto ret = ConvolutionBaseCPUKernel::Init(); auto ret = ConvolutionBaseCPUKernel::Init();
@@ -258,6 +235,7 @@ int ConvolutionSWFP16CPUKernel::Run() {
conv_param_->output_channel_); conv_param_->output_channel_);
} }
ConvolutionBaseFP16CPUKernel::IfCastOutput(); ConvolutionBaseFP16CPUKernel::IfCastOutput();
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
return RET_OK; return RET_OK;
} }
} // namespace mindspore::kernel } // namespace mindspore::kernel

+ 0
- 6
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h View File

@@ -29,15 +29,9 @@ class ConvolutionSWFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
const lite::Primitive *primitive) const lite::Primitive *primitive)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {} : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~ConvolutionSWFP16CPUKernel() override { ~ConvolutionSWFP16CPUKernel() override {
if (fp16_input_ != nullptr) {
free(fp16_input_);
}
if (fp16_weight_ != nullptr) { if (fp16_weight_ != nullptr) {
free(fp16_weight_); free(fp16_weight_);
} }
if (fp16_out_ != nullptr) {
free(fp16_out_);
}
if (packed_weight_ != nullptr) { if (packed_weight_ != nullptr) {
free(packed_weight_); free(packed_weight_);
} }


+ 1
- 23
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc View File

@@ -187,15 +187,6 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() {
int ic4 = UP_DIV(channel_in, C4NUM); int ic4 = UP_DIV(channel_in, C4NUM);
int oc8 = UP_DIV(channel_out, C8NUM); int oc8 = UP_DIV(channel_out, C8NUM);


/*=============================fp16_input_============================*/
size_t fp16_input_size = conv_param_->input_channel_ * conv_param_->input_batch_ * conv_param_->input_h_ *
conv_param_->input_w_ * sizeof(float16_t);
fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size));
if (fp16_input_ == nullptr) {
MS_LOG(ERROR) << "malloc fp16_input_ failed.";
return RET_ERROR;
}

/*=============================trans_input_============================*/ /*=============================trans_input_============================*/
size_t tile_buffer_size = thread_count_ * cal_num * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float16_t); size_t tile_buffer_size = thread_count_ * cal_num * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float16_t);
trans_input_ = reinterpret_cast<float16_t *>(malloc(tile_buffer_size)); trans_input_ = reinterpret_cast<float16_t *>(malloc(tile_buffer_size));
@@ -222,14 +213,6 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() {
MS_LOG(ERROR) << "malloc tmp_out_data_ failed."; MS_LOG(ERROR) << "malloc tmp_out_data_ failed.";
return RET_ERROR; return RET_ERROR;
} }
/*=============================fp16_out_============================*/
size_t fp16_output_size = conv_param_->output_channel_ * conv_param_->output_batch_ * conv_param_->output_h_ *
conv_param_->output_w_ * sizeof(float16_t);
fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size));
if (fp16_out_ == nullptr) {
MS_LOG(ERROR) << "malloc fp16_out_ failed.";
return RET_ERROR;
}


/*=============================tmp_data_============================*/ /*=============================tmp_data_============================*/
tmp_data_ = tmp_data_ =
@@ -327,12 +310,6 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() {
if (nhwc4_input_ != nullptr) { if (nhwc4_input_ != nullptr) {
free(nhwc4_input_); free(nhwc4_input_);
} }
if (fp16_input_ != nullptr) {
free(fp16_input_);
}
if (fp16_out_ != nullptr) {
free(fp16_out_);
}


auto ret = ConvolutionBaseCPUKernel::Init(); auto ret = ConvolutionBaseCPUKernel::Init();
if (ret != RET_OK) { if (ret != RET_OK) {
@@ -412,6 +389,7 @@ int ConvolutionWinogradFP16CPUKernel::Run() {
// do nothing // do nothing
} }
ConvolutionBaseFP16CPUKernel::IfCastOutput(); ConvolutionBaseFP16CPUKernel::IfCastOutput();
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
return RET_OK; return RET_OK;
} }
} // namespace mindspore::kernel } // namespace mindspore::kernel

+ 0
- 6
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h View File

@@ -34,15 +34,9 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
const lite::Primitive *primitive) const lite::Primitive *primitive)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {} : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~ConvolutionWinogradFP16CPUKernel() override { ~ConvolutionWinogradFP16CPUKernel() override {
if (fp16_input_ != nullptr) {
free(fp16_input_);
}
if (fp16_weight_ != nullptr) { if (fp16_weight_ != nullptr) {
free(fp16_weight_); free(fp16_weight_);
} }
if (fp16_out_ != nullptr) {
free(fp16_out_);
}
if (tmp_data_ != nullptr) { if (tmp_data_ != nullptr) {
free(tmp_data_); free(tmp_data_);
} }


+ 11
- 7
mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc View File

@@ -185,11 +185,14 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
return RET_ERROR; return RET_ERROR;
} }


auto input_tensor = in_tensors_.at(kInputIndex);
auto input_addr = reinterpret_cast<float *>(input_tensor->Data());
ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get Execute tensor failed.";
return ret;
}
// pack input: to nhwc8 // pack input: to nhwc8
PackNHWCFp32ToNHWC8Fp16(input_addr, packed_input_, conv_param_->input_batch_,
conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);


ret = LiteBackendParallelLaunch(DeconvDwFp16Run, this, conv_param_->thread_num_); ret = LiteBackendParallelLaunch(DeconvDwFp16Run, this, conv_param_->thread_num_);
if (ret != RET_OK) { if (ret != RET_OK) {
@@ -197,9 +200,10 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
return RET_ERROR; return RET_ERROR;
} }


auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data());
PackNHWC8Fp16ToNHWCFp32(packed_output_, output_addr, conv_param_->output_batch_,
conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
ConvolutionBaseFP16CPUKernel::IfCastOutput();
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
return RET_OK; return RET_OK;
} }




+ 3
- 3
mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h View File

@@ -19,7 +19,7 @@


#include <vector> #include <vector>
#include "src/lite_kernel.h" #include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/base/convolution_base.h"
#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
#include "src/runtime/kernel/arm/nnacl/fp16/conv_depthwise_fp16.h" #include "src/runtime/kernel/arm/nnacl/fp16/conv_depthwise_fp16.h"


#ifdef __cplusplus #ifdef __cplusplus
@@ -34,12 +34,12 @@ void ComputeStrides(int *shape, int *strides, int ndim);
#endif #endif


namespace mindspore::kernel { namespace mindspore::kernel {
class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel {
class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
public: public:
DeconvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, DeconvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx, const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
const lite::Primitive *primitive) const lite::Primitive *primitive)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~DeconvolutionDepthwiseFp16CPUKernel() override; ~DeconvolutionDepthwiseFp16CPUKernel() override;


int Init() override; int Init() override;


+ 2
- 2
mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pack_fp16.c View File

@@ -392,7 +392,7 @@ void PackNHWCToNHWC8Fp16(float16_t *src, float16_t *dst, int batch, int plane, i
for (int i = 0; i < plane; i++) { for (int i = 0; i < plane; i++) {
float16_t *dst_plane = dst_batch + i * c8_channel; float16_t *dst_plane = dst_batch + i * c8_channel;
float16_t *src_plane = src_batch + i * channel; float16_t *src_plane = src_batch + i * channel;
memcpy(dst_plane, src_batch, channel * sizeof(float16_t));
memcpy(dst_plane, src_plane, channel * sizeof(float16_t));
} }
} }
} }
@@ -405,7 +405,7 @@ void PackNHWC8ToNHWCFp16(float16_t *src, float16_t *dst, int batch, int plane, i
for (int i = 0; i < plane; i++) { for (int i = 0; i < plane; i++) {
float16_t *src_plane = src_batch + i * c8_channel; float16_t *src_plane = src_batch + i * c8_channel;
float16_t *dst_plane = dst_batch + i * channel; float16_t *dst_plane = dst_batch + i * channel;
memcpy(dst_plane, src_batch, channel * sizeof(float16_t));
memcpy(dst_plane, src_plane, channel * sizeof(float16_t));
} }
} }
} }

Loading…
Cancel
Save