Browse Source

[MS][LITE] optimize arm cpu fp16 conv op: add common converter functions for input and output

tags/v0.7.0-beta
yangruoqi713 5 years ago
parent
commit
1b89036bea
19 changed files with 130 additions and 222 deletions
  1. +46
    -0
      mindspore/lite/src/runtime/kernel/arm/fp16/common_fp16.cc
  2. +28
    -0
      mindspore/lite/src/runtime/kernel/arm/fp16/common_fp16.h
  3. +1
    -39
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
  4. +0
    -7
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
  5. +1
    -24
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
  6. +0
    -6
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h
  7. +20
    -21
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc
  8. +4
    -4
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h
  9. +8
    -19
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
  10. +3
    -3
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
  11. +1
    -23
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
  12. +0
    -6
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h
  13. +1
    -23
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc
  14. +0
    -6
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h
  15. +1
    -23
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
  16. +0
    -6
      mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
  17. +11
    -7
      mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
  18. +3
    -3
      mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h
  19. +2
    -2
      mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pack_fp16.c

+ 46
- 0
mindspore/lite/src/runtime/kernel/arm/fp16/common_fp16.cc View File

@@ -0,0 +1,46 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "src/runtime/kernel/arm/fp16/common_fp16.h"
#include "src/runtime/kernel/arm/nnacl/fp16/cast_fp16.h"

namespace mindspore::kernel {
// Returns an fp16 view of `input`.
//  - If the tensor holds fp32 data: allocates a new fp16 buffer from the ctx
//    allocator, converts the values into it, and returns the new buffer. The
//    caller is responsible for releasing it via ctx->allocator->Free (see
//    ConvolutionBaseFP16CPUKernel::FreeTmpBuffer).
//  - Otherwise: returns the tensor's own data pointer (no allocation).
// Returns nullptr if the allocation fails; callers must check.
float16_t *ConvertInputFp32toFp16(lite::tensor::Tensor *input, const lite::Context *ctx) {
  float16_t *fp16_data = nullptr;
  auto data_type = input->data_type();
  if (data_type == kNumberTypeFloat32) {
    auto ele_num = input->ElementsNum();
    fp16_data = reinterpret_cast<float16_t *>(ctx->allocator->Malloc(ele_num * sizeof(float16_t)));
    if (fp16_data == nullptr) {
      // Allocation failed: bail out instead of letting Float32ToFloat16 write
      // through a null pointer.
      return nullptr;
    }
    auto ori_data = reinterpret_cast<float *>(input->Data());
    Float32ToFloat16(ori_data, fp16_data, ele_num);
  } else {
    fp16_data = reinterpret_cast<float16_t *>(input->Data());
  }
  return fp16_data;
}

// Returns an fp16 buffer to hold the kernel's output for `output`.
//  - If the tensor holds fp32 data: allocates an uninitialized fp16 scratch
//    buffer from the ctx allocator (the fp16 result is cast back into the
//    tensor later — see IfCastOutput — and the scratch buffer is released via
//    ctx->allocator->Free in FreeTmpBuffer).
//  - Otherwise: despite the function's name, nothing is allocated — the
//    tensor's own data pointer is returned and written in place.
// Returns nullptr if the allocation fails; callers must check.
float16_t *MallocOutputFp16(lite::tensor::Tensor *output, const lite::Context *ctx) {
  float16_t *fp16_data = nullptr;
  auto data_type = output->data_type();
  if (data_type == kNumberTypeFloat32) {
    auto ele_num = output->ElementsNum();
    fp16_data = reinterpret_cast<float16_t *>(ctx->allocator->Malloc(ele_num * sizeof(float16_t)));
  } else {
    fp16_data = reinterpret_cast<float16_t *>(output->Data());
  }
  return fp16_data;
}
} // namespace mindspore::kernel

+ 28
- 0
mindspore/lite/src/runtime/kernel/arm/fp16/common_fp16.h View File

@@ -0,0 +1,28 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_COMMON_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_COMMON_FP16_H_

#include "src/lite_kernel.h"

namespace mindspore::kernel {
// Shared fp16 tensor helpers for the ARM fp16 convolution kernels.

// Returns an fp16 view of `input`: converts into a ctx-allocated buffer when
// the tensor is fp32, otherwise returns the tensor's own data pointer.
// Ownership of the buffer (fp32 case only) lies with the caller.
float16_t *ConvertInputFp32toFp16(lite::tensor::Tensor *input, const lite::Context *ctx);

// Returns an fp16 output buffer: a ctx-allocated scratch buffer when the
// tensor is fp32 (caller frees), otherwise the tensor's own data pointer.
float16_t *MallocOutputFp16(lite::tensor::Tensor *output, const lite::Context *ctx);

} // namespace mindspore::kernel

#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_COMMON_FP16_H_

+ 1
- 39
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc View File

@@ -98,28 +98,6 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() {
return RET_OK;
}

int Convolution1x1FP16CPUKernel::InitBuffer() {
/*=============================fp16_input_============================*/
size_t fp16_input_size = conv_param_->input_channel_ * conv_param_->input_batch_ * conv_param_->input_h_ *
conv_param_->input_w_ * sizeof(float16_t);
fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size));
if (fp16_input_ == nullptr) {
MS_LOG(ERROR) << "malloc fp16_input_ failed.";
return RET_ERROR;
}
memset(fp16_input_, 0, fp16_input_size);

/*=============================fp16_out_============================*/
size_t fp16_output_size = conv_param_->output_channel_ * conv_param_->output_batch_ * conv_param_->output_h_ *
conv_param_->output_w_ * sizeof(float16_t);
fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size));
if (fp16_out_ == nullptr) {
MS_LOG(ERROR) << "malloc fp16_out_ failed.";
return RET_ERROR;
}
return RET_OK;
}

int Convolution1x1FP16CPUKernel::Init() {
auto ret = ConvolutionBaseCPUKernel::Init();
if (ret != RET_OK) {
@@ -136,11 +114,6 @@ int Convolution1x1FP16CPUKernel::Init() {
MS_LOG(ERROR) << "Init conv1x1 param failed.";
return ret;
}
ret = InitBuffer();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init buffer failed.";
return ret;
}
ret = InitWeightBias();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init weight bias failed.";
@@ -150,12 +123,6 @@ int Convolution1x1FP16CPUKernel::Init() {
}

int Convolution1x1FP16CPUKernel::ReSize() {
if (fp16_out_ != nullptr) {
free(fp16_out_);
}
if (fp16_input_ != nullptr) {
free(fp16_input_);
}
if (fp16_weight_ != nullptr) {
free(fp16_weight_);
}
@@ -181,12 +148,6 @@ int Convolution1x1FP16CPUKernel::ReSize() {
MS_LOG(ERROR) << "Init conv1x1 param failed.";
return ret;
}
ret = InitBuffer();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init buffer failed.";
return ret;
}

return RET_OK;
}

@@ -253,6 +214,7 @@ int Convolution1x1FP16CPUKernel::Run() {
}

ConvolutionBaseFP16CPUKernel::IfCastOutput();
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
return RET_OK;
}
} // namespace mindspore::kernel

+ 0
- 7
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h View File

@@ -35,15 +35,9 @@ class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
matmul_param_ = new MatMulParameter();
}
~Convolution1x1FP16CPUKernel() override {
if (fp16_input_ != nullptr) {
free(fp16_input_);
}
if (fp16_weight_ != nullptr) {
free(fp16_weight_);
}
if (fp16_out_ != nullptr) {
free(fp16_out_);
}
if (input_ptr_ != nullptr) {
free(input_ptr_);
}
@@ -57,7 +51,6 @@ class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
int ReSize() override;
int Run() override;
int RunImpl(int task_id);
int InitBuffer();
int InitConv1x1Param();
int InitMatmulParam();
int InitWeightBias();


+ 1
- 24
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc View File

@@ -132,16 +132,6 @@ int Convolution3x3FP16CPUKernel::InitTmpBuffer() {
}
memset(tmp_out_, 0, tmp_out_size);

/*=============================fp16_input_============================*/
size_t fp16_input_size = conv_param_->input_channel_ * conv_param_->input_batch_ * conv_param_->input_h_ *
conv_param_->input_w_ * sizeof(float16_t);
fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size));
if (fp16_input_ == nullptr) {
MS_LOG(ERROR) << "malloc fp16_input_ failed.";
return RET_ERROR;
}
memset(fp16_input_, 0, fp16_input_size);

/*=============================nhwc4_input_============================*/
size_t nhwc4_input_size =
iC4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
@@ -152,14 +142,6 @@ int Convolution3x3FP16CPUKernel::InitTmpBuffer() {
}
memset(nhwc4_input_, 0, nhwc4_input_size);

/*=============================fp16_out_============================*/
size_t fp16_output_size = conv_param_->output_channel_ * conv_param_->output_batch_ * conv_param_->output_h_ *
conv_param_->output_w_ * sizeof(float16_t);
fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size));
if (fp16_out_ == nullptr) {
MS_LOG(ERROR) << "malloc fp16_out_ failed.";
return RET_ERROR;
}
return RET_OK;
}

@@ -207,12 +189,6 @@ int Convolution3x3FP16CPUKernel::ReSize() {
if (tmp_out_ != nullptr) {
free(tmp_out_);
}
if (fp16_out_ != nullptr) {
free(fp16_out_);
}
if (fp16_input_ != nullptr) {
free(fp16_input_);
}
if (nhwc4_input_ != nullptr) {
free(nhwc4_input_);
}
@@ -304,6 +280,7 @@ int Convolution3x3FP16CPUKernel::Run() {
}

ConvolutionBaseFP16CPUKernel::IfCastOutput();
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
return RET_OK;
}
} // namespace mindspore::kernel

+ 0
- 6
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h View File

@@ -31,15 +31,9 @@ class Convolution3x3FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
const lite::Primitive *primitive)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~Convolution3x3FP16CPUKernel() override {
if (fp16_input_ != nullptr) {
free(fp16_input_);
}
if (fp16_weight_ != nullptr) {
free(fp16_weight_);
}
if (fp16_out_ != nullptr) {
free(fp16_out_);
}
if (transformed_filter_addr_ != nullptr) {
free(transformed_filter_addr_);
}


+ 20
- 21
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc View File

@@ -16,6 +16,7 @@

#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
#include "src/runtime/kernel/arm/nnacl/fp16/cast_fp16.h"
#include "src/runtime/kernel/arm/fp16/common_fp16.h"
#include "schema/model_generated.h"
#include "src/kernel_factory.h"
#include "include/errorcode.h"
@@ -25,28 +26,17 @@ namespace mindspore::kernel {
int ConvolutionBaseFP16CPUKernel::GetExecuteTensor() {
// ===================input====================//
auto input_tensor = in_tensors_.at(kInputIndex);
auto input_data_type = input_tensor->data_type();
MS_ASSERT(input_data_type == kNumberTypeFloat32 || input_data_type == kNumberTypeFloat16);
if (input_data_type == kNumberTypeFloat32) {
auto input_ele_num = input_tensor->ElementsNum();
auto ori_input_data = reinterpret_cast<float *>(input_tensor->Data());
Float32ToFloat16(ori_input_data, fp16_input_, input_ele_num);
execute_input_ = fp16_input_;
} else {
auto ori_input_data = reinterpret_cast<float16_t *>(input_tensor->Data());
execute_input_ = ori_input_data;
}
in_data_type_ = input_tensor->data_type();
MS_ASSERT(in_data_type_ == kNumberTypeFloat32 || in_data_type_ == kNumberTypeFloat16);

execute_input_ = ConvertInputFp32toFp16(input_tensor, context_);

// ==================output====================//
auto out_tensor = out_tensors_.at(kOutputIndex);
auto out_data_type = out_tensor->data_type();
MS_ASSERT(out_data_type == kNumberTypeFloat32 || out_data_type == kNumberTypeFloat16);
out_data_type_ = out_data_type;
if (out_data_type == kNumberTypeFloat32) {
execute_output_ = fp16_out_;
} else {
auto out_ptr = reinterpret_cast<float16_t *>(out_tensor->Data());
execute_output_ = out_ptr;
}
out_data_type_ = out_tensor->data_type();
MS_ASSERT(out_data_type_ == kNumberTypeFloat32 || out_data_type_ == kNumberTypeFloat16);

execute_output_ = MallocOutputFp16(out_tensor, context_);
return RET_OK;
}

@@ -79,7 +69,16 @@ void ConvolutionBaseFP16CPUKernel::IfCastOutput() {
auto out_tensor = out_tensors_.at(kOutputIndex);
auto out_ele_num = out_tensor->ElementsNum();
auto output_addr = reinterpret_cast<float *>(out_tensor->Data());
Float16ToFloat32(fp16_out_, output_addr, out_ele_num);
Float16ToFloat32(execute_output_, output_addr, out_ele_num);
}
}

// Releases the temporary fp16 buffers that GetExecuteTensor() obtained from
// the ctx allocator. A buffer was allocated only when the corresponding tensor
// holds fp32 data; in the fp16 case execute_input_/execute_output_ alias the
// tensor's own storage and must not be freed.
// NOTE(review): this must run AFTER IfCastOutput(), which still reads
// execute_output_ when the output tensor is fp32 — calling it first is a
// use-after-free.
void ConvolutionBaseFP16CPUKernel::FreeTmpBuffer() {
  if (in_data_type_ == kNumberTypeFloat32) {
    context_->allocator->Free(execute_input_);
  }
  if (out_data_type_ == kNumberTypeFloat32) {
    context_->allocator->Free(execute_output_);
  }
}



+ 4
- 4
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h View File

@@ -39,14 +39,14 @@ class ConvolutionBaseFP16CPUKernel : public ConvolutionBaseCPUKernel {
virtual int GetExecuteTensor();
virtual int GetExecuteFilter();
virtual void IfCastOutput();
void FreeTmpBuffer();

protected:
float16_t *fp16_input_ = nullptr;
float16_t *fp16_weight_ = nullptr;
float16_t *fp16_out_ = nullptr;
float16_t *execute_input_;
float16_t *execute_input_; // ctx allocator malloc and free
float16_t *execute_weight_;
float16_t *execute_output_;
float16_t *execute_output_; // ctx allocator malloc and free
TypeId in_data_type_;
TypeId out_data_type_;
};
} // namespace mindspore::kernel


+ 8
- 19
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc View File

@@ -173,22 +173,13 @@ int ConvolutionDepthwiseFp16CPUKernel::Run() {
return RET_ERROR;
}

auto input_tensor = in_tensors_.at(kInputIndex);
float16_t *input_addr;
if (input_tensor->data_type() == kNumberTypeFloat32) {
input_addr =
reinterpret_cast<float16_t *>(context_->allocator->Malloc(input_tensor->ElementsNum() * sizeof(float16_t)));
if (input_addr == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
Float32ToFloat16(reinterpret_cast<float *>(input_tensor->Data()), input_addr, input_tensor->ElementsNum());
} else {
input_addr = reinterpret_cast<float16_t *>(input_tensor->Data());
ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get Execute tensor failed.";
return ret;
}

// pack input: to nhwc8
PackNHWCToNHWC8Fp16(input_addr, packed_input_, conv_param_->input_batch_,
PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);

ret = LiteBackendParallelLaunch(ConvDwFp16Run, this, conv_param_->thread_num_);
@@ -197,13 +188,11 @@ int ConvolutionDepthwiseFp16CPUKernel::Run() {
return RET_ERROR;
}

auto output_addr = reinterpret_cast<float16_t *>(out_tensors_.at(kOutputIndex)->Data());
PackNHWC8ToNHWCFp16(packed_output_, output_addr, conv_param_->output_batch_,
PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);

if (input_tensor->data_type() == kNumberTypeFloat32) {
context_->allocator->Free(input_addr);
}
ConvolutionBaseFP16CPUKernel::IfCastOutput();
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
return RET_OK;
}



+ 3
- 3
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h View File

@@ -19,7 +19,7 @@

#include <vector>
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/base/convolution_base.h"
#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
#include "src/runtime/kernel/arm/nnacl/fp16/conv_depthwise_fp16.h"

#ifdef __cplusplus
@@ -34,12 +34,12 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo


namespace mindspore::kernel {
class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel {
class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
public:
ConvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
const lite::Primitive *primitive)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~ConvolutionDepthwiseFp16CPUKernel() override;

int Init() override;


+ 1
- 23
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc View File

@@ -103,15 +103,6 @@ int ConvolutionFP16CPUKernel::InitTmpBuffer() {
}
memset(packed_input_, 0, in_batch * packed_input_size * sizeof(float16_t));

/*=============================fp16_input_============================*/
size_t fp16_input_size =
in_channel * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size));
if (fp16_input_ == nullptr) {
MS_LOG(ERROR) << "malloc fp16_input_ failed.";
return RET_ERROR;
}

/*=============================nhwc4_input_============================*/
size_t nhwc4_input_size = channel_block * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ *
conv_param_->input_w_ * sizeof(float16_t);
@@ -129,14 +120,6 @@ int ConvolutionFP16CPUKernel::InitTmpBuffer() {
return RET_ERROR;
}

/*=============================fp16_out_============================*/
size_t fp16_output_size =
out_channel * conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * sizeof(float16_t);
fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size));
if (fp16_out_ == nullptr) {
MS_LOG(ERROR) << "malloc fp16_out_ failed.";
return RET_ERROR;
}
return RET_OK;
}

@@ -181,12 +164,6 @@ int ConvolutionFP16CPUKernel::ReSize() {
if (nhwc4_input_ != nullptr) {
free(nhwc4_input_);
}
if (fp16_input_ != nullptr) {
free(fp16_input_);
}
if (fp16_out_ != nullptr) {
free(fp16_out_);
}

auto ret = ConvolutionBaseCPUKernel::Init();
if (ret != RET_OK) {
@@ -242,6 +219,7 @@ int ConvolutionFP16CPUKernel::Run() {
return RET_ERROR;
}

ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
ConvolutionBaseFP16CPUKernel::IfCastOutput();
return RET_OK;
}


+ 0
- 6
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h View File

@@ -30,15 +30,9 @@ class ConvolutionFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
const lite::Primitive *primitive)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~ConvolutionFP16CPUKernel() override {
if (fp16_input_ != nullptr) {
free(fp16_input_);
}
if (fp16_weight_ != nullptr) {
free(fp16_weight_);
}
if (fp16_out_ != nullptr) {
free(fp16_out_);
}
if (packed_input_ != nullptr) {
free(packed_input_);
}


+ 1
- 23
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc View File

@@ -106,15 +106,6 @@ int ConvolutionSWFP16CPUKernel::InitTmpBuffer() {
int channel_block = UP_DIV(in_channel, C4NUM);
int oc4 = UP_DIV(out_channel, C4NUM);

/*=============================fp16_input_============================*/
size_t fp16_input_size =
in_channel * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size));
if (fp16_input_ == nullptr) {
MS_LOG(ERROR) << "malloc fp16_input_ failed.";
return RET_ERROR;
}

/*=============================nhwc4_input_============================*/
size_t nhwc4_input_size = channel_block * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ *
conv_param_->input_w_ * sizeof(float16_t);
@@ -133,14 +124,6 @@ int ConvolutionSWFP16CPUKernel::InitTmpBuffer() {
return RET_ERROR;
}

/*=============================fp16_out_============================*/
size_t fp16_output_size =
out_channel * conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * sizeof(float16_t);
fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size));
if (fp16_out_ == nullptr) {
MS_LOG(ERROR) << "malloc fp16_out_ failed.";
return RET_ERROR;
}
return RET_OK;
}

@@ -186,12 +169,6 @@ int ConvolutionSWFP16CPUKernel::ReSize() {
if (nhwc4_input_ != nullptr) {
free(nhwc4_input_);
}
if (fp16_input_ != nullptr) {
free(fp16_input_);
}
if (fp16_out_ != nullptr) {
free(fp16_out_);
}
delete slidingWindow_param_;

auto ret = ConvolutionBaseCPUKernel::Init();
@@ -258,6 +235,7 @@ int ConvolutionSWFP16CPUKernel::Run() {
conv_param_->output_channel_);
}
ConvolutionBaseFP16CPUKernel::IfCastOutput();
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
return RET_OK;
}
} // namespace mindspore::kernel

+ 0
- 6
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h View File

@@ -29,15 +29,9 @@ class ConvolutionSWFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
const lite::Primitive *primitive)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~ConvolutionSWFP16CPUKernel() override {
if (fp16_input_ != nullptr) {
free(fp16_input_);
}
if (fp16_weight_ != nullptr) {
free(fp16_weight_);
}
if (fp16_out_ != nullptr) {
free(fp16_out_);
}
if (packed_weight_ != nullptr) {
free(packed_weight_);
}


+ 1
- 23
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc View File

@@ -187,15 +187,6 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() {
int ic4 = UP_DIV(channel_in, C4NUM);
int oc8 = UP_DIV(channel_out, C8NUM);

/*=============================fp16_input_============================*/
size_t fp16_input_size = conv_param_->input_channel_ * conv_param_->input_batch_ * conv_param_->input_h_ *
conv_param_->input_w_ * sizeof(float16_t);
fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size));
if (fp16_input_ == nullptr) {
MS_LOG(ERROR) << "malloc fp16_input_ failed.";
return RET_ERROR;
}

/*=============================trans_input_============================*/
size_t tile_buffer_size = thread_count_ * cal_num * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float16_t);
trans_input_ = reinterpret_cast<float16_t *>(malloc(tile_buffer_size));
@@ -222,14 +213,6 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() {
MS_LOG(ERROR) << "malloc tmp_out_data_ failed.";
return RET_ERROR;
}
/*=============================fp16_out_============================*/
size_t fp16_output_size = conv_param_->output_channel_ * conv_param_->output_batch_ * conv_param_->output_h_ *
conv_param_->output_w_ * sizeof(float16_t);
fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size));
if (fp16_out_ == nullptr) {
MS_LOG(ERROR) << "malloc fp16_out_ failed.";
return RET_ERROR;
}

/*=============================tmp_data_============================*/
tmp_data_ =
@@ -327,12 +310,6 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() {
if (nhwc4_input_ != nullptr) {
free(nhwc4_input_);
}
if (fp16_input_ != nullptr) {
free(fp16_input_);
}
if (fp16_out_ != nullptr) {
free(fp16_out_);
}

auto ret = ConvolutionBaseCPUKernel::Init();
if (ret != RET_OK) {
@@ -412,6 +389,7 @@ int ConvolutionWinogradFP16CPUKernel::Run() {
// do nothing
}
ConvolutionBaseFP16CPUKernel::IfCastOutput();
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
return RET_OK;
}
} // namespace mindspore::kernel

+ 0
- 6
mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h View File

@@ -34,15 +34,9 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
const lite::Primitive *primitive)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~ConvolutionWinogradFP16CPUKernel() override {
if (fp16_input_ != nullptr) {
free(fp16_input_);
}
if (fp16_weight_ != nullptr) {
free(fp16_weight_);
}
if (fp16_out_ != nullptr) {
free(fp16_out_);
}
if (tmp_data_ != nullptr) {
free(tmp_data_);
}


+ 11
- 7
mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc View File

@@ -185,11 +185,14 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
return RET_ERROR;
}

auto input_tensor = in_tensors_.at(kInputIndex);
auto input_addr = reinterpret_cast<float *>(input_tensor->Data());
ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get Execute tensor failed.";
return ret;
}
// pack input: to nhwc8
PackNHWCFp32ToNHWC8Fp16(input_addr, packed_input_, conv_param_->input_batch_,
conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);

ret = LiteBackendParallelLaunch(DeconvDwFp16Run, this, conv_param_->thread_num_);
if (ret != RET_OK) {
@@ -197,9 +200,10 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
return RET_ERROR;
}

auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data());
PackNHWC8Fp16ToNHWCFp32(packed_output_, output_addr, conv_param_->output_batch_,
conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
ConvolutionBaseFP16CPUKernel::IfCastOutput();
ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
return RET_OK;
}



+ 3
- 3
mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h View File

@@ -19,7 +19,7 @@

#include <vector>
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/base/convolution_base.h"
#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
#include "src/runtime/kernel/arm/nnacl/fp16/conv_depthwise_fp16.h"

#ifdef __cplusplus
@@ -34,12 +34,12 @@ void ComputeStrides(int *shape, int *strides, int ndim);
#endif

namespace mindspore::kernel {
class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel {
class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
public:
DeconvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
const lite::Primitive *primitive)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
~DeconvolutionDepthwiseFp16CPUKernel() override;

int Init() override;


+ 2
- 2
mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pack_fp16.c View File

@@ -392,7 +392,7 @@ void PackNHWCToNHWC8Fp16(float16_t *src, float16_t *dst, int batch, int plane, i
for (int i = 0; i < plane; i++) {
float16_t *dst_plane = dst_batch + i * c8_channel;
float16_t *src_plane = src_batch + i * channel;
memcpy(dst_plane, src_batch, channel * sizeof(float16_t));
memcpy(dst_plane, src_plane, channel * sizeof(float16_t));
}
}
}
@@ -405,7 +405,7 @@ void PackNHWC8ToNHWCFp16(float16_t *src, float16_t *dst, int batch, int plane, i
for (int i = 0; i < plane; i++) {
float16_t *src_plane = src_batch + i * c8_channel;
float16_t *dst_plane = dst_batch + i * channel;
memcpy(dst_plane, src_batch, channel * sizeof(float16_t));
memcpy(dst_plane, src_plane, channel * sizeof(float16_t));
}
}
}

Loading…
Cancel
Save