
!15142 [MSLITE][DEVELOP] rectify group conv register

From: @yangruoqi713
Reviewed-by: @zhanghaibo5,@hangangqiang
Signed-off-by: @zhanghaibo5
Merged from pull/15142/MERGE by mindspore-ci-bot (Gitee), 4 years ago
Commit 5b008a0949
24 changed files with 273 additions and 598 deletions
  1. +31 -78  mindspore/lite/src/runtime/kernel/arm/base/group_convolution_creator.cc
  2. +11 -17  mindspore/lite/src/runtime/kernel/arm/base/group_convolution_creator.h
  3. +10 -12  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
  4. +4 -4  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
  5. +0 -52  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc
  6. +0 -57  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h
  7. +39 -196  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.cc
  8. +3 -9  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.h
  9. +11 -20  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
  10. +4 -10  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
  11. +15 -13  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc
  12. +4 -10  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h
  13. +10 -14  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
  14. +4 -4  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h
  15. +12 -26  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
  16. +4 -5  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
  17. +14 -11  mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
  18. +4 -5  mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h
  19. +20 -23  mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc
  20. +5 -5  mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.h
  21. +12 -18  mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc
  22. +4 -5  mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.h
  23. +28 -4  mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.cc
  24. +24 -0  mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8_creator.cc
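
The thrust of the change: the fp32, fp16, and int8 paths each used to carry their own copy of the group-convolution assembly code (splitting tensors per group, cloning the ConvParameter, collecting sub-kernels). This commit folds them onto one shared GroupConvCreator that takes the target data type explicitly and exposes a per-group helper, GetSingleConvParam. A minimal sketch of the new flow, reconstructed from the fp16 hunks below (error handling abbreviated):

    // Sketch of the unified per-group kernel assembly (fp16 flavor).
    GroupConvCreator creator(inputs, outputs, op_parameter, ctx, /*is_quant=*/false, kNumberTypeFloat16);
    creator.SetShapeOfTensors();
    auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
    for (int i = 0; i < conv_param->group_; ++i) {
      ConvParameter *new_conv_param = CreateNewConvParameter(conv_param);
      std::vector<lite::Tensor *> new_inputs;
      std::vector<lite::Tensor *> new_outputs;
      // Splits input/weight/bias for group i and allocates the group's output tensors.
      if (creator.GetSingleConvParam(new_conv_param, &new_inputs, &new_outputs, i) != lite::RET_OK) {
        return nullptr;  // the split tensors are already released via FreeMemory() inside
      }
      creator.get_group_conv()->emplace_back(new (std::nothrow) ConvolutionDelegateFP16CPUKernel(
        reinterpret_cast<OpParameter *>(new_conv_param), new_inputs, new_outputs, ctx));
    }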

+31 -78  mindspore/lite/src/runtime/kernel/arm/base/group_convolution_creator.cc

@@ -15,10 +15,6 @@
*/

#include "src/runtime/kernel/arm/base/group_convolution_creator.h"
#include "src/runtime/kernel/arm/base/group_convolution.h"
#include "src/runtime/kernel/arm/int8/convolution_int8_creator.h"
#include "src/runtime/kernel/arm/fp32/convolution_delegate_fp32.h"
#include "src/runtime/kernel/arm/int8/group_convolution_int8.h"

namespace mindspore::kernel {
void CopyTensorQuantParam(lite::Tensor *dst, lite::Tensor *src) {
@@ -37,15 +33,11 @@ ConvParameter *CreateNewConvParameter(ConvParameter *parameter) {
return conv_parameter;
}

-void FreeMemory(ConvParameter *conv_param, const std::vector<lite::Tensor *> &new_inputs,
-const std::vector<lite::Tensor *> &new_outputs) {
-if (conv_param != nullptr) {
-free(conv_param);
-}
-for (auto &in_tensor : new_inputs) {
+void FreeMemory(const std::vector<lite::Tensor *> *new_inputs, const std::vector<lite::Tensor *> *new_outputs) {
+for (auto &in_tensor : *new_inputs) {
delete in_tensor;
}
-for (auto &out_tensor : new_outputs) {
+for (auto &out_tensor : *new_outputs) {
delete out_tensor;
}
}
@@ -106,6 +98,7 @@ void GroupConvCreator::CopyQuantParam(std::vector<lite::Tensor *> *tensors) {
CopyTensorQuantParam(tensors->at(j), origin_inputs_.at(j));
}
}

bool GroupConvCreator::CheckIfValidPoint(void *ptr) {
if (ptr == nullptr) {
for (auto &sub_conv : group_convs_) {
@@ -117,18 +110,17 @@ bool GroupConvCreator::CheckIfValidPoint(void *ptr) {
}

int GroupConvCreator::NewInputTensor(std::vector<lite::Tensor *> *tensors) {
-auto in_tensor = CreateVarTensor(
-{input_shape_, schema::Format_NHWC, origin_inputs_.at(0)->data_type(), lite::Tensor::Category::VAR, true},
-infered_);
+auto in_tensor =
+CreateVarTensor({input_shape_, schema::Format_NHWC, data_type_, lite::Tensor::Category::VAR, true}, infered_);
if (!CheckIfValidPoint(in_tensor)) {
return lite::RET_ERROR;
}
tensors->emplace_back(in_tensor);
return lite::RET_OK;
}

int GroupConvCreator::NewOutputTensor(std::vector<lite::Tensor *> *tensors, lite::Tensor *output) {
-auto out_tensor =
-CreateVarTensor({output_shape_, output->format(), output->data_type(), output->category(), false}, infered_);
+auto out_tensor = CreateVarTensor({output_shape_, output->format(), data_type_, output->category(), false}, infered_);
if (!CheckIfValidPoint(out_tensor)) {
return lite::RET_ERROR;
}
@@ -153,6 +145,7 @@ int GroupConvCreator::NewConstTensor(std::vector<lite::Tensor *> *tensors, int g
}
return lite::RET_OK;
}

void GroupConvCreator::SetShapeOfTensors() {
int new_in_channel = origin_inputs_.at(kWeightIndex)->Channel();
int new_out_channel;
@@ -176,71 +169,31 @@ void GroupConvCreator::SetShapeOfTensors() {
}
}

-int GroupConvCreator::CreatGroupConv() {
-for (int i = 0; i < conv_param_->group_; ++i) {
-auto new_conv_parameter = CreateNewConvParameter(conv_param_);
-if (!CheckIfValidPoint(new_conv_parameter)) {
-return lite::RET_ERROR;
-}
-// create new input for each group
-std::vector<lite::Tensor *> new_inputs;
-if (NewInputTensor(&new_inputs) != lite::RET_OK) {
-MS_LOG(ERROR) << "new input tensor failed.";
-FreeMemory(new_conv_parameter, new_inputs, {});
-return lite::RET_ERROR;
-}
-// const tensor
-if (NewConstTensor(&new_inputs, i) != lite::RET_OK) {
-MS_LOG(ERROR) << "new const tensor failed.";
-FreeMemory(new_conv_parameter, new_inputs, {});
-return lite::RET_ERROR;
-}
-// create new output tensor
-std::vector<lite::Tensor *> new_outputs;
-for (auto &output : origin_outputs_) {
-if (NewOutputTensor(&new_outputs, output) != lite::RET_OK) {
-MS_LOG(ERROR) << "new output tensor failed.";
-FreeMemory(new_conv_parameter, new_inputs, new_outputs);
-return lite::RET_ERROR;
-}
-}
-
-if (is_quant_) {
-CopyQuantParam(&new_inputs);
-group_convs_.emplace_back(CpuConvInt8KernelSelect(new_inputs, new_outputs,
-reinterpret_cast<OpParameter *>(new_conv_parameter), context_));
-} else {
-group_convs_.emplace_back(new (std::nothrow) kernel::ConvolutionDelegateCPUKernel(
-reinterpret_cast<OpParameter *>(new_conv_parameter), new_inputs, new_outputs, context_));
-}
-}
-return lite::RET_OK;
-}
-
-kernel::LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
-const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
-const lite::InnerContext *ctx) {
-GroupConvCreator group_conv_creator(inputs, outputs, op_parameter, ctx, false);
-group_conv_creator.SetShapeOfTensors();
-if (group_conv_creator.CreatGroupConv() != lite::RET_OK) {
-MS_LOG(ERROR) << "Create fp32 group conv failed.";
-return nullptr;
-}
-return new (std::nothrow)
-GroupConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, group_conv_creator.get_group_conv(),
-reinterpret_cast<ConvParameter *>(op_parameter)->group_);
-}
-
-kernel::LiteKernel *CpuGroupConvInt8KernelCreator(const std::vector<lite::Tensor *> &inputs,
-const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
-const lite::InnerContext *ctx, int group) {
-GroupConvCreator group_conv_creator(inputs, outputs, op_parameter, ctx, true);
-group_conv_creator.SetShapeOfTensors();
-if (group_conv_creator.CreatGroupConv() != lite::RET_OK) {
-MS_LOG(ERROR) << "Create int8 group conv failed.";
-return nullptr;
-}
-return new (std::nothrow)
-GroupConvolutionInt8CPUKernel(op_parameter, inputs, outputs, ctx, group_conv_creator.get_group_conv(), group);
-}
+int GroupConvCreator::GetSingleConvParam(ConvParameter *conv_param, std::vector<lite::Tensor *> *new_inputs,
+std::vector<lite::Tensor *> *new_outputs, int group_id) {
+if (!CheckIfValidPoint(conv_param)) {
+return lite::RET_ERROR;
+}
+// create new input for each group
+if (NewInputTensor(new_inputs) != lite::RET_OK) {
+MS_LOG(ERROR) << "new input tensor failed.";
+FreeMemory(new_inputs, {});
+return lite::RET_ERROR;
+}
+// const tensor
+if (NewConstTensor(new_inputs, group_id) != lite::RET_OK) {
+MS_LOG(ERROR) << "new const tensor failed.";
+FreeMemory(new_inputs, {});
+return lite::RET_ERROR;
+}
+// create new output tensor
+for (auto &output : origin_outputs_) {
+if (NewOutputTensor(new_outputs, output) != lite::RET_OK) {
+MS_LOG(ERROR) << "new output tensor failed.";
+FreeMemory(new_inputs, new_outputs);
+return lite::RET_ERROR;
+}
+}
+return lite::RET_OK;
+}
} // namespace mindspore::kernel

+11 -17  mindspore/lite/src/runtime/kernel/arm/base/group_convolution_creator.h

@@ -34,12 +34,12 @@ struct TensorInfo {
class GroupConvCreator {
public:
GroupConvCreator(std::vector<lite::Tensor *> inputs, std::vector<lite::Tensor *> outputs, OpParameter *op_parameter,
-const lite::InnerContext *ctx, bool is_quant)
+const lite::InnerContext *ctx, bool is_quant, TypeId data_type)
: origin_inputs_(std::move(inputs)),
origin_outputs_(std::move(outputs)),
context_(ctx),
infered_(op_parameter->infer_flag_),
-is_quant_(is_quant) {
+is_quant_(is_quant),
+data_type_(data_type) {
conv_param_ = reinterpret_cast<ConvParameter *>(op_parameter);
}

@@ -47,15 +47,16 @@ class GroupConvCreator {

public:
void SetShapeOfTensors();
-int CreatGroupConv();
-std::vector<kernel::LiteKernel *> get_group_conv() { return group_convs_; }
+std::vector<kernel::LiteKernel *> *get_group_conv() { return &group_convs_; }
+void CopyQuantParam(std::vector<lite::Tensor *> *tensors);
+int GetSingleConvParam(ConvParameter *conv_param, std::vector<lite::Tensor *> *new_inputs,
+std::vector<lite::Tensor *> *new_outputs, int group_id);

protected:
void set_input_shape(const std::vector<int> &shape) { input_shape_ = shape; }
void set_output_shape(const std::vector<int> &shape) { output_shape_ = shape; }
void set_filter_shape(const std::vector<int> &shape) { filter_shape_ = shape; }
void set_bias_shape(const std::vector<int> &shape) { bias_shape_ = shape; }
-void CopyQuantParam(std::vector<lite::Tensor *> *tensors);
bool CheckIfValidPoint(void *ptr);
int NewInputTensor(std::vector<lite::Tensor *> *tensors);
int NewConstTensor(std::vector<lite::Tensor *> *tensors, int group_id);
@@ -69,20 +70,13 @@ class GroupConvCreator {
std::vector<int> output_shape_;
std::vector<int> filter_shape_;
std::vector<int> bias_shape_;
const lite::InnerContext *context_;
ConvParameter *conv_param_;
-bool infered_;
-bool is_quant_;
+bool infered_ = false;
+bool is_quant_ = false;
+TypeId data_type_;
};

-LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
-const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
-const lite::InnerContext *ctx);
-
-LiteKernel *CpuGroupConvInt8KernelCreator(const std::vector<lite::Tensor *> &inputs,
-const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
-const lite::InnerContext *ctx, int group);

ConvParameter *CreateNewConvParameter(ConvParameter *parameter);
} // namespace mindspore::kernel

#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_GROUP_CONVOLUTION_CREATOR_H_
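
The constructor now receives the target TypeId explicitly instead of deriving it from origin_inputs_.at(0)->data_type(). The fp16 call below appears verbatim in convolution_delegate_fp16.cc; the fp32 and int8 variants are assumptions inferred from the other touched creators (convolution_delegate_fp32.cc and convolution_int8_creator.cc), whose hunks are not shown in full here:

    GroupConvCreator fp16_creator(inputs, outputs, op_parameter, ctx, false, kNumberTypeFloat16);
    GroupConvCreator fp32_creator(inputs, outputs, op_parameter, ctx, false, kNumberTypeFloat32);  // assumed
    GroupConvCreator int8_creator(inputs, outputs, op_parameter, ctx, true, kNumberTypeInt8);      // assumed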

+10 -12  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc

@@ -88,12 +88,7 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() {
MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!";
return RET_ERROR;
}
-if (origin_bias_data_type_ == kNumberTypeFloat16) {
-memcpy(bias_data_, origin_bias_, output_channel * sizeof(float16_t));
-} else {
-MS_LOG(ERROR) << "Conv1x1 only support fp16 weight";
-return RET_ERROR;
-}
+memcpy(bias_data_, origin_bias_, output_channel * sizeof(float16_t));
memset(reinterpret_cast<char *>(bias_data_) + bias_size, 0, size - bias_size);
}

@@ -105,8 +100,7 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() {
return RET_ERROR;
}
memset(reinterpret_cast<char *>(weight_ptr_) + down_size, 0, size - down_size);
-ColMajor2Row8MajorFp16(origin_weight_, weight_ptr_, input_channel, output_channel,
-origin_weight_data_type_ == kNumberTypeFloat16);
+ColMajor2Row8MajorFp16(origin_weight_, weight_ptr_, input_channel, output_channel, true);
return RET_OK;
}

@@ -217,8 +211,12 @@ static int Convolution1x1Fp16RunHw(void *cdata, int task_id) {
}

int Convolution1x1FP16CPUKernel::Run() {
-ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
-
+auto input_data = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
+auto output_data = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
+if (input_data == nullptr || output_data == nullptr) {
+MS_LOG(ERROR) << "Convolution1x1 Fp16 get null tensor data!";
+return RET_ERROR;
+}
pack_input_ = reinterpret_cast<float16_t *>(
ctx_->allocator->Malloc(matmul_param_->row_align_ * matmul_param_->deep_ * sizeof(float16_t)));
if (pack_input_ == nullptr) {
@@ -227,9 +225,9 @@ int Convolution1x1FP16CPUKernel::Run() {
}

for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) {
-output_ptr_ = execute_output_ + batch_index * matmul_param_->row_ * matmul_param_->col_;
+output_ptr_ = output_data + batch_index * matmul_param_->row_ * matmul_param_->col_;
float16_t *batch_in =
-execute_input_ + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_;
+input_data + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_;
if (pre_trans_input_) {
Conv1x1InputPack(batch_in, input_ptr_, conv_param_, sizeof(float16_t));
} else {
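
A pattern worth noting, since it repeats in the depthwise, plain, winograd, and deconvolution kernels below: instead of routing through the now-deleted ConvolutionBaseFP16CPUKernel::GetExecuteTensor(), each Run()/RunImpl() reads the tensor buffers directly and fails fast on null data:

    // Recurring replacement for GetExecuteTensor() throughout this PR:
    auto input_data = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
    auto output_data = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
    if (input_data == nullptr || output_data == nullptr) {
      MS_LOG(ERROR) << "Convolution1x1 Fp16 get null tensor data!";
      return RET_ERROR;
    }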


+4 -4  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h

@@ -20,18 +20,18 @@
#include <arm_neon.h>
#include <vector>
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
#include "src/runtime/kernel/arm/base/convolution_base.h"
#include "src/common/utils.h"
#include "nnacl/matmul_parameter.h"
#include "nnacl/fp16/matmul_fp16.h"

namespace mindspore::kernel {
-class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
+class Convolution1x1FP16CPUKernel : public ConvolutionBaseCPUKernel {
public:
Convolution1x1FP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx, void *origin_weight,
-void *origin_bias, TypeId origin_weight_data_type, TypeId origin_bias_data_type)
-: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, origin_weight_data_type, origin_bias_data_type),
+void *origin_bias)
+: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx),
origin_weight_(origin_weight),
origin_bias_(origin_bias) {}
~Convolution1x1FP16CPUKernel() override;


+0 -52  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc

@@ -1,52 +0,0 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
#include "nnacl/fp16/cast_fp16.h"
#include "src/runtime/kernel/arm/fp16/common_fp16.h"
#include "include/errorcode.h"
#include "src/runtime/runtime_api.h"

namespace mindspore::kernel {
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
ConvolutionBaseFP16CPUKernel::~ConvolutionBaseFP16CPUKernel() {
if (fp16_weight_ != nullptr) {
free(fp16_weight_);
fp16_weight_ = nullptr;
}
}

int ConvolutionBaseFP16CPUKernel::GetExecuteTensor() {
auto input_tensor = in_tensors_.at(0);
auto output_tensor = out_tensors_.at(0);
execute_input_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
execute_output_ = reinterpret_cast<float16_t *>(output_tensor->data_c());
return RET_OK;
}

int ConvolutionBaseFP16CPUKernel::GetExecuteFilter(lite::Tensor *weight_tensor, void *origin_data) {
MS_ASSERT(origin_weight_data_type_ == kNumberTypeFloat32 || origin_weight_data_type_ == kNumberTypeFloat16);
if (origin_weight_data_type_ == kNumberTypeFloat32) {
MS_LOG(ERROR) << "Conv fp16 only support fp16 weight";
return RET_ERROR;
} else {
execute_weight_ = reinterpret_cast<float16_t *>(origin_data);
fp16_weight_ = nullptr;
}
return RET_OK;
}
} // namespace mindspore::kernel

+0 -57  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h

@@ -1,57 +0,0 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_BASE_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_BASE_FP16_H_

#include <arm_neon.h>
#include <vector>
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/base/convolution_base.h"
#include "src/common/utils.h"

namespace mindspore::kernel {
class ConvolutionBaseFP16CPUKernel : public ConvolutionBaseCPUKernel {
public:
ConvolutionBaseFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
TypeId origin_weight_data_type, TypeId origin_bias_data_type)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx),
origin_weight_data_type_(origin_weight_data_type),
origin_bias_data_type_(origin_bias_data_type) {}
~ConvolutionBaseFP16CPUKernel() override;

int Init() override { return mindspore::lite::RET_OK; }
int ReSize() override { return mindspore::lite::RET_OK; }
int Run() override { return mindspore::lite::RET_OK; }
int RunImpl(int task_id) { return mindspore::lite::RET_OK; }
virtual int GetExecuteTensor();
// origin_data may not be the same as the data in the weight tensor,
// because weight tensor has released data already. In this situation,
// origin_data is the pointer of another memory block.
virtual int GetExecuteFilter(lite::Tensor *weight_tensor, void *origin_data);

protected:
float16_t *fp16_weight_ = nullptr;
float16_t *execute_input_ = nullptr;
float16_t *execute_weight_ = nullptr;
float16_t *execute_output_ = nullptr;
TypeId origin_weight_data_type_;
TypeId origin_bias_data_type_;
};
} // namespace mindspore::kernel

#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_BASE_FP16_H_
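
With the weight/bias TypeId plumbing gone, this intermediate class has no remaining responsibility, so every fp16 (de)convolution kernel in the headers below is rebased directly onto the common base, along the lines of:

    // Before: class ConvolutionFP16CPUKernel : public ConvolutionBaseFP16CPUKernel
    // After:
    class ConvolutionFP16CPUKernel : public ConvolutionBaseCPUKernel { /* ... */ };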

+39 -196  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.cc

@@ -22,6 +22,7 @@
#include "src/runtime/kernel/arm/fp16/group_convolution_fp16.h"
#include "src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h"
#include "src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h"
#include "src/runtime/kernel/arm/base/group_convolution_creator.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
@@ -96,8 +97,8 @@ int ConvolutionDelegateFP16CPUKernel::ReSize() {
kernel::SetInputOutputShapeInfo(reinterpret_cast<ConvParameter *>(op_parameter_), in_tensors_.front(),
out_tensors_.front(), context_);
if (fp16_conv_kernel_ == nullptr) {
-fp16_conv_kernel_ = CpuConvFp16KernelSelect(in_tensors_, out_tensors_, op_parameter_, context_, origin_weight_,
-origin_bias_, origin_weight_data_type_, origin_bias_data_type_);
+fp16_conv_kernel_ =
+CpuConvFp16KernelSelect(in_tensors_, out_tensors_, op_parameter_, context_, origin_weight_, origin_bias_);
if (fp16_conv_kernel_ == nullptr) {
MS_LOG(ERROR) << "Selecting execute kernel failed for conv_kernel, got a nullptr.";
return RET_ERROR;
@@ -108,29 +109,16 @@ int ConvolutionDelegateFP16CPUKernel::ReSize() {
return fp16_conv_kernel_->ReSize();
}

-ConvParameter *CreateNewConvParameterFp16(ConvParameter *parameter) {
-auto conv_parameter = reinterpret_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
-if (conv_parameter == nullptr) {
-MS_LOG(ERROR) << "Malloc new conv parameter failed.";
-return nullptr;
-}
-memcpy(conv_parameter, parameter, sizeof(ConvParameter));
-return conv_parameter;
-}

kernel::LiteKernel *CpuConvDwFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
-const InnerContext *ctx, void *origin_weight, void *origin_bias,
-TypeId origin_weight_data_type, TypeId origin_bias_data_type) {
+const InnerContext *ctx) {
MS_ASSERT(opParameter != nullptr);
auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
kernel::LiteKernel *kernel = nullptr;
if (conv_param->input_channel_ < 32) {
-kernel = new (std::nothrow) kernel::ConvolutionDepthwiseSWFp16CPUKernel(
-opParameter, inputs, outputs, ctx, origin_weight, origin_bias, origin_weight_data_type, origin_bias_data_type);
+kernel = new (std::nothrow) kernel::ConvolutionDepthwiseSWFp16CPUKernel(opParameter, inputs, outputs, ctx);
} else {
-kernel = new (std::nothrow) kernel::ConvolutionDepthwiseFp16CPUKernel(
-opParameter, inputs, outputs, ctx, origin_weight, origin_bias, origin_weight_data_type, origin_bias_data_type);
+kernel = new (std::nothrow) kernel::ConvolutionDepthwiseFp16CPUKernel(opParameter, inputs, outputs, ctx);
}
if (kernel == nullptr) {
MS_LOG(ERROR) << "kernel is nullptr.";
@@ -142,27 +130,22 @@ kernel::LiteKernel *CpuConvDwFp16KernelCreator(const std::vector<lite::Tensor *>

kernel::LiteKernel *CpuConvFp16KernelSelect(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
-const lite::InnerContext *ctx, void *origin_weight, void *origin_bias,
-TypeId origin_weight_data_type, TypeId origin_bias_data_type) {
+const lite::InnerContext *ctx, void *origin_weight, void *origin_bias) {
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
bool use_winograd = false;
int out_unit;
CheckIfUseWinogradFp16(&use_winograd, &out_unit, conv_param);
kernel::LiteKernel *kernel = nullptr;

-if (conv_param->group_ == conv_param->input_channel_ && conv_param->group_ == conv_param->output_channel_) {
-kernel = CpuConvDwFp16KernelCreator(inputs, outputs, op_parameter, ctx, origin_weight, origin_bias,
-origin_weight_data_type, origin_bias_data_type);
-} else if (conv_param->kernel_h_ == 1 && conv_param->kernel_w_ == 1) {
-kernel = new (std::nothrow) kernel::Convolution1x1FP16CPUKernel(
-op_parameter, inputs, outputs, ctx, origin_weight, origin_bias, origin_weight_data_type, origin_bias_data_type);
-} else if (use_winograd) {
-kernel = new (std::nothrow)
-kernel::ConvolutionWinogradFP16CPUKernel(op_parameter, inputs, outputs, ctx, out_unit, origin_weight, origin_bias,
-origin_weight_data_type, origin_bias_data_type);
-} else {
-kernel = new (std::nothrow) kernel::ConvolutionFP16CPUKernel(
-op_parameter, inputs, outputs, ctx, origin_weight, origin_bias, origin_weight_data_type, origin_bias_data_type);
+if (conv_param->kernel_h_ == 1 && conv_param->kernel_w_ == 1) {
+kernel = new (std::nothrow)
+kernel::Convolution1x1FP16CPUKernel(op_parameter, inputs, outputs, ctx, origin_weight, origin_bias);
+} else if (use_winograd) {
+kernel = new (std::nothrow) kernel::ConvolutionWinogradFP16CPUKernel(op_parameter, inputs, outputs, ctx, out_unit,
+origin_weight, origin_bias);
+} else {
+kernel = new (std::nothrow)
+kernel::ConvolutionFP16CPUKernel(op_parameter, inputs, outputs, ctx, origin_weight, origin_bias);
}
// Once kernel is selected, init func will invoke InitWeightAndBias
auto ret = kernel->Init();
@@ -174,194 +157,54 @@ kernel::LiteKernel *CpuConvFp16KernelSelect(const std::vector<lite::Tensor *> &i
return kernel;
}

-void FreeMemoryFp16(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
-const std::vector<lite::Tensor *> &new_outputs) {
-for (auto sub_conv : group_convs) {
-delete sub_conv;
-}
-for (auto in_tensor : new_inputs) {
-delete in_tensor;
-}
-for (auto out_tensor : new_outputs) {
-delete out_tensor;
-}
-}
-
-static lite::Tensor *CreateInputTensorFp16(TypeId data_type, const std::vector<int> &in_shape, bool infered_flag) {
-auto in_tensor = new (std::nothrow) lite::Tensor(data_type, in_shape, Format_NHWC, lite::Tensor::Category::VAR);
-if (in_tensor == nullptr) {
-MS_LOG(ERROR) << "new in_tensor failed.";
-return nullptr;
-}
-if (infered_flag) {
-auto ret = in_tensor->MallocData();
-if (ret != RET_OK) {
-delete in_tensor;
-MS_LOG(ERROR) << "in tensor malloc failed.";
-return nullptr;
-}
-}
-return in_tensor;
-}
-
-static lite::Tensor *CreateConstTensorFp16(lite::Tensor *tensor, const std::vector<int> &shape, const int index) {
-auto new_tensor =
-new (std::nothrow) lite::Tensor(tensor->data_type(), shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
-if (new_tensor == nullptr) {
-MS_LOG(ERROR) << "Create new_tensor failed.";
-return nullptr;
-}
-auto ret = new_tensor->MallocData();
-if (ret != RET_OK) {
-delete new_tensor;
-MS_LOG(ERROR) << "Malloc new_tensor failed.";
-return nullptr;
-}
-memcpy(new_tensor->data_c(), reinterpret_cast<char *>(tensor->data_c()) + index * new_tensor->Size(),
-new_tensor->Size());
-return new_tensor;
-}
-
-static lite::Tensor *CreateOutputTensorFp16(const std::vector<int> &out_shape,
-const std::vector<lite::Tensor *> &outputs, bool infered_flag, int index) {
-auto out_tensor = new (std::nothrow) lite::Tensor();
-if (out_tensor == nullptr) {
-MS_LOG(ERROR) << "new tmp_out_tensor failed.";
-return nullptr;
-}
-out_tensor->set_data_type(mindspore::kNumberTypeFloat16);
-out_tensor->set_format(outputs.at(index)->format());
-if (infered_flag) {
-out_tensor->set_shape(out_shape);
-auto ret = out_tensor->MallocData();
-if (ret != RET_OK) {
-delete out_tensor;
-MS_LOG(ERROR) << "out_tensor malloc data failed.";
-return nullptr;
-}
-}
-return out_tensor;
-}
-
-kernel::LiteKernel *CreateDelegateConvFp16(const std::vector<lite::Tensor *> &inputs,
-const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
-const InnerContext *ctx) {
-auto weight_data_type = inputs.at(1)->data_type();
-if (weight_data_type != kNumberTypeFloat16) {
-MS_LOG(ERROR) << "Convfp16 only support fp16 weight";
-return nullptr;
-}
-TypeId bias_data_type = kTypeUnknown;
-if (inputs.size() == 3) {
-bias_data_type = inputs.at(2)->data_type();
-}
-return new (std::nothrow)
-kernel::ConvolutionDelegateFP16CPUKernel(op_parameter, inputs, outputs, ctx, weight_data_type, bias_data_type);
-}

kernel::LiteKernel *CpuGroupConvFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx) {
-bool infer_flag = op_parameter->infer_flag_;
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
-// update new shape info for each sub kernel
-int new_in_channel = inputs.at(kWeightIndex)->Channel();
-int new_out_channel = 0;
-if (conv_param->group_ == 0) {
-MS_LOG(ERROR) << "Divisor 'group' cannot be 0.";
-return nullptr;
-} else {
-new_out_channel = inputs.at(kWeightIndex)->Batch() / conv_param->group_;
-}
-
-std::vector<int> in_shape;
-std::vector<int> out_shape;
-if (infer_flag) {
-conv_param->input_channel_ = new_in_channel;
-conv_param->output_channel_ = new_out_channel;
-in_shape = {inputs.front()->Batch(), inputs.front()->Height(), inputs.front()->Width(), new_in_channel};
-out_shape = {inputs.front()->Batch(), outputs.front()->Height(), outputs.front()->Width(), new_out_channel};
-}
-std::vector<int> filter_shape = {new_out_channel, conv_param->kernel_h_, conv_param->kernel_w_, new_in_channel};
-std::vector<int> bias_shape = {new_out_channel};
-
-// new group conv op
-std::vector<kernel::LiteKernel *> group_convs;
-// create tensors for every sub conv kernel
+GroupConvCreator group_conv_creator(inputs, outputs, op_parameter, ctx, false, kNumberTypeFloat16);
+group_conv_creator.SetShapeOfTensors();
for (int i = 0; i < conv_param->group_; ++i) {
+ConvParameter *new_conv_param = CreateNewConvParameter(conv_param);
std::vector<lite::Tensor *> new_inputs;
std::vector<lite::Tensor *> new_outputs;
-auto new_conv_parameter = CreateNewConvParameterFp16(conv_param);
-if (new_conv_parameter == nullptr) {
-FreeMemoryFp16(group_convs, new_inputs, new_outputs);
-MS_LOG(ERROR) << "Get new conv parameter failed.";
-return nullptr;
-}
-// create new input for each group
-auto in_tensor = CreateInputTensorFp16(mindspore::kNumberTypeFloat16, in_shape, infer_flag);
-if (in_tensor == nullptr) {
-delete new_conv_parameter;
-FreeMemoryFp16(group_convs, new_inputs, new_outputs);
-MS_LOG(ERROR) << "create input tensor failed.";
-return nullptr;
-}
-new_inputs.emplace_back(in_tensor);
-
-// create new weight
-auto filter_tensor = CreateConstTensorFp16(inputs.at(kWeightIndex), filter_shape, i);
-if (filter_tensor == nullptr) {
-delete new_conv_parameter;
-FreeMemoryFp16(group_convs, new_inputs, new_outputs);
-MS_LOG(ERROR) << "create filter tensor failed.";
-return nullptr;
-}
-new_inputs.emplace_back(filter_tensor);
-
-// if has bias, create new bias
-if (inputs.size() == 3) {
-auto bias_tensor = CreateConstTensorFp16(inputs.at(kBiasIndex), bias_shape, i);
-if (bias_tensor == nullptr) {
-delete new_conv_parameter;
-FreeMemoryFp16(group_convs, new_inputs, new_outputs);
-MS_LOG(ERROR) << "create bias_tensor failed.";
-return nullptr;
-}
-new_inputs.emplace_back(bias_tensor);
-}
-
-// create new output tensors
-for (size_t j = 0; j < outputs.size(); ++j) {
-auto out_tensor = CreateOutputTensorFp16(out_shape, outputs, infer_flag, j);
-if (out_tensor == nullptr) {
-delete new_conv_parameter;
-FreeMemoryFp16(group_convs, new_inputs, new_outputs);
-MS_LOG(ERROR) << "new out_tensor failed.";
-return nullptr;
-}
-new_outputs.emplace_back(out_tensor);
-}
-group_convs.emplace_back(
-CreateDelegateConvFp16(new_inputs, new_outputs, reinterpret_cast<OpParameter *>(new_conv_parameter), ctx));
+auto ret = group_conv_creator.GetSingleConvParam(new_conv_param, &new_inputs, &new_outputs, i);
+if (ret != RET_OK) {
+MS_LOG(ERROR) << "GetSingleConv for fp16 group conv failed.";
+return nullptr;
+}
+group_conv_creator.get_group_conv()->emplace_back(new (std::nothrow) ConvolutionDelegateFP16CPUKernel(
+reinterpret_cast<OpParameter *>(new_conv_param), new_inputs, new_outputs, ctx));
}
return new (std::nothrow)
-GroupConvolutionFP16CPUKernel(op_parameter, inputs, outputs, ctx, group_convs, conv_param->group_);
+GroupConvolutionFP16CPUKernel(op_parameter, inputs, outputs, ctx, *(group_conv_creator.get_group_conv()),
+reinterpret_cast<ConvParameter *>(op_parameter)->group_);
}

/* creator func */
kernel::LiteKernel *CpuConvFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
const InnerContext *ctx, const kernel::KernelKey &desc) {
MS_ASSERT(opParameter != nullptr);
MS_ASSERT(desc.type == schema::PrimitiveType_Conv2DFusion);

+auto weight_data_type = inputs.at(1)->data_type();
+TypeId bias_data_type = weight_data_type;
+if (inputs.size() == 3) {
+bias_data_type = inputs.at(2)->data_type();
+}
+if (weight_data_type != kNumberTypeFloat16 || bias_data_type != kNumberTypeFloat16) {
+MS_LOG(ERROR) << "Convfp16 only support fp16 weight and fp16 bias.";
+return nullptr;
+}
auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
kernel::LiteKernel *kernel = nullptr;
-bool is_depthwise =
-(conv_param->group_ == conv_param->input_channel_ && conv_param->group_ == conv_param->output_channel_);
-
-if (conv_param->group_ > 1 && !is_depthwise) {
-kernel = CpuGroupConvFp16KernelCreator(inputs, outputs, opParameter, ctx);
-} else {
-kernel = CreateDelegateConvFp16(inputs, outputs, opParameter, ctx);
-}
+if (conv_param->group_ == 1) {
+kernel = new (std::nothrow) kernel::ConvolutionDelegateFP16CPUKernel(opParameter, inputs, outputs, ctx);
+} else if (conv_param->group_ == conv_param->input_channel_ && conv_param->group_ == conv_param->output_channel_) {
+kernel = CpuConvDwFp16KernelCreator(inputs, outputs, opParameter, ctx);
+} else {
+kernel = CpuGroupConvFp16KernelCreator(inputs, outputs, opParameter, ctx);
+}

if (kernel == nullptr) {
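
To summarize the registration fix in this file: CpuConvFp16KernelCreator now rejects non-fp16 weight/bias up front and dispatches purely on the group count, per the hunk above:

    if (conv_param->group_ == 1) {
      kernel = new (std::nothrow) kernel::ConvolutionDelegateFP16CPUKernel(opParameter, inputs, outputs, ctx);
    } else if (conv_param->group_ == conv_param->input_channel_ && conv_param->group_ == conv_param->output_channel_) {
      kernel = CpuConvDwFp16KernelCreator(inputs, outputs, opParameter, ctx);  // depthwise
    } else {
      kernel = CpuGroupConvFp16KernelCreator(inputs, outputs, opParameter, ctx);  // generic grouped conv
    }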


+3 -9  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.h

@@ -29,11 +29,8 @@ namespace mindspore::kernel {
class ConvolutionDelegateFP16CPUKernel : public LiteKernel {
public:
ConvolutionDelegateFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
-TypeId origin_weight_data_type, TypeId origin_bias_data_type)
-: LiteKernel(parameter, inputs, outputs, ctx),
-origin_weight_data_type_(origin_weight_data_type),
-origin_bias_data_type_(origin_bias_data_type) {}
+const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
+: LiteKernel(parameter, inputs, outputs, ctx) {}
~ConvolutionDelegateFP16CPUKernel() override {
FreeCopiedData();
if (fp16_conv_kernel_ != nullptr) {
@@ -56,14 +53,11 @@ class ConvolutionDelegateFP16CPUKernel : public LiteKernel {
void *origin_weight_ = nullptr;
void *origin_bias_ = nullptr;
kernel::LiteKernel *fp16_conv_kernel_ = nullptr;
-TypeId origin_weight_data_type_;
-TypeId origin_bias_data_type_;
};

kernel::LiteKernel *CpuConvFp16KernelSelect(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
-const lite::InnerContext *ctx, void *origin_weight, void *origin_bias,
-TypeId origin_weight_data_type, TypeId origin_bias_data_type);
+const lite::InnerContext *ctx, void *origin_weight, void *origin_bias);
} // namespace mindspore::kernel

#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DELEGATE_FP16_H_

+11 -20  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc

@@ -36,23 +36,15 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
auto weight_tensor = in_tensors_.at(kWeightIndex);
int channel = weight_tensor->Batch();
int pack_weight_size = channel * weight_tensor->Height() * weight_tensor->Width();
+auto origin_weight = reinterpret_cast<float16_t *>(weight_tensor->data_c());

packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
if (packed_weight_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
-auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteFilter(weight_tensor, origin_weight_);
-if (ret != RET_OK) {
-MS_LOG(ERROR) << "get execute filter data failed.";
-return ret;
-}
-PackNCHWToNHWCFp16(execute_weight_, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
+PackNCHWToNHWCFp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
weight_tensor->Batch());
-if (fp16_weight_ != nullptr) {
-free(fp16_weight_);
-fp16_weight_ = nullptr;
-}

bias_data_ = reinterpret_cast<float16_t *>(malloc(channel * sizeof(float16_t)));
if (bias_data_ == nullptr) {
@@ -60,14 +52,10 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
return RET_ERROR;
}
memset(bias_data_, 0, channel * sizeof(float16_t));
-auto bias_fp16 = reinterpret_cast<float16_t *>(bias_data_);
if (in_tensors_.size() == kInputSize2) {
auto bias_tensor = in_tensors_.at(kBiasIndex);
-MS_ASSERT(origin_bias_);
-auto ori_bias = reinterpret_cast<float16_t *>(origin_bias_);
-for (int i = 0; i < bias_tensor->ElementsNum(); i++) {
-bias_fp16[i] = (float16_t)ori_bias[i];
-}
+auto ori_bias = reinterpret_cast<float16_t *>(bias_tensor->data_c());
+memcpy(bias_data_, ori_bias, bias_tensor->Size());
}
return RET_OK;
}
@@ -95,8 +83,13 @@ int ConvolutionDepthwiseFp16CPUKernel::ReSize() {
}

int ConvolutionDepthwiseFp16CPUKernel::Execute(int task_id) {
-ConvDwFp16(execute_output_, execute_input_, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), conv_param_,
-task_id);
+auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
+auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
+if (input_ptr == nullptr || output_ptr == nullptr) {
+MS_LOG(ERROR) << "Convolution depthwise Fp16 get null tensor data!";
+return RET_ERROR;
+}
+ConvDwFp16(output_ptr, input_ptr, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), conv_param_, task_id);
return RET_OK;
}

@@ -111,8 +104,6 @@ static int ConvDwFp16Run(void *cdata, int task_id) {
}

int ConvolutionDepthwiseFp16CPUKernel::Run() {
-ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
-
auto ret = ParallelLaunch(this->context_->thread_pool_, ConvDwFp16Run, this, conv_param_->thread_num_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvDwFp16Run error: error_code[" << ret << "]";


+4 -10  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h

@@ -19,7 +19,7 @@

#include <vector>
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
#include "src/runtime/kernel/arm/base/convolution_base.h"
#include "nnacl/fp16/conv_depthwise_fp16.h"

#ifdef __cplusplus
@@ -32,15 +32,11 @@ void ConvDwFp16(float16_t *output_data, const float16_t *input_data, const float
#endif

namespace mindspore::kernel {
-class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
+class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel {
public:
ConvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
-void *origin_weight, void *origin_bias, TypeId origin_weight_data_type,
-TypeId origin_bias_data_type)
-: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, origin_weight_data_type, origin_bias_data_type),
-origin_weight_(origin_weight),
-origin_bias_(origin_bias) {}
+const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx)
+: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {}
~ConvolutionDepthwiseFp16CPUKernel() override;

int Init() override;
@@ -51,8 +47,6 @@ class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
int Execute(int task_id);

private:
-void *origin_weight_; // do not free
-void *origin_bias_; // do not free
float16_t *packed_weight_ = nullptr;
};
} // namespace mindspore::kernel


+15 -13  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc

@@ -62,14 +62,15 @@ int ConvolutionDepthwiseSWFp16CPUKernel::InitWeightBias() {
auto weight_tensor = in_tensors_.at(kWeightIndex);
int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM);
int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width();
+auto origin_weight = reinterpret_cast<float16_t *>(weight_tensor->data_c());

packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
if (packed_weight_ == nullptr) {
MS_LOG(ERROR) << "Malloc buffer failed.";
return RET_ERROR;
}
-PackNCHWFp16ToNC8HW8Fp16(reinterpret_cast<float16_t *>(origin_weight_), packed_weight_, 1,
-weight_tensor->Height() * weight_tensor->Width(), weight_tensor->Batch());
+PackNCHWFp16ToNC8HW8Fp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
+weight_tensor->Batch());

bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t)));
if (bias_data_ == nullptr) {
@@ -77,14 +78,10 @@ int ConvolutionDepthwiseSWFp16CPUKernel::InitWeightBias() {
return RET_ERROR;
}
memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t));
-auto bias_fp16 = reinterpret_cast<float16_t *>(bias_data_);
if (in_tensors_.size() == kInputSize2) {
auto bias_tensor = in_tensors_.at(kBiasIndex);
-MS_ASSERT(origin_bias_);
-auto ori_bias = reinterpret_cast<float16_t *>(origin_bias_);
-for (int i = 0; i < bias_tensor->ElementsNum(); i++) {
-bias_fp16[i] = (float16_t)ori_bias[i];
-}
+auto ori_bias = reinterpret_cast<float16_t *>(bias_tensor->data_c());
+memcpy(bias_data_, ori_bias, bias_tensor->Size());
}

conv_param_->thread_num_ = MSMIN(thread_count_, OC8);
@@ -143,14 +140,19 @@ int ConvolutionDepthwiseSWFp16CPUKernel::Run() {
return ret;
}

-ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
+auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
+auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
+if (input_ptr == nullptr || output_ptr == nullptr) {
+MS_LOG(ERROR) << "Convolution depthwise Fp16 get null tensor data!";
+return RET_ERROR;
+}

if (need_align_) {
-PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
+PackNHWCToNHWC8Fp16(input_ptr, packed_input_, conv_param_->input_batch_,
conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
} else {
-packed_input_ = execute_input_;
-packed_output_ = execute_output_;
+packed_input_ = input_ptr;
+packed_output_ = output_ptr;
}

ret = ParallelLaunch(this->context_->thread_pool_, ConvDwSWFp16Run, this, conv_param_->thread_num_);
@@ -158,7 +160,7 @@ int ConvolutionDepthwiseSWFp16CPUKernel::Run() {
MS_LOG(ERROR) << "ConvDwSWFp16Run error: error_code[" << ret << "]";
}
if (need_align_) {
-PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
+PackNHWC8ToNHWCFp16(packed_output_, output_ptr, conv_param_->output_batch_,
conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
}



+4 -10  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h

@@ -19,7 +19,7 @@

#include <vector>
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
#include "src/runtime/kernel/arm/base/convolution_base.h"
#include "nnacl/fp16/conv_depthwise_fp16.h"

#ifdef __cplusplus
@@ -33,15 +33,11 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo
#endif

namespace mindspore::kernel {
-class ConvolutionDepthwiseSWFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
+class ConvolutionDepthwiseSWFp16CPUKernel : public ConvolutionBaseCPUKernel {
public:
ConvolutionDepthwiseSWFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
-void *origin_weight, void *origin_bias, TypeId origin_weight_data_type,
-TypeId origin_bias_data_type)
-: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, origin_weight_data_type, origin_bias_data_type),
-origin_weight_(origin_weight),
-origin_bias_(origin_bias) {}
+const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx)
+: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {}
~ConvolutionDepthwiseSWFp16CPUKernel() override;

int Init() override;
@@ -54,8 +50,6 @@ class ConvolutionDepthwiseSWFp16CPUKernel : public ConvolutionBaseFP16CPUKernel

private:
void FreePackedInputOutput();
-void *origin_weight_; // do not free
-void *origin_bias_; // do not free
SlidingWindowParam *sliding_ = nullptr;
float16_t *packed_weight_ = nullptr;
float16_t *packed_input_ = nullptr;


+10 -14  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc

@@ -45,8 +45,7 @@ int ConvolutionFP16CPUKernel::InitWeightBias() {
return RET_ERROR;
}
memset(packed_weight_, 0, pack_weight_size * sizeof(float16_t));
-RowMajor2Col8MajorFp16(origin_weight_, packed_weight_, out_channel, in_channel * kernel_plane,
-origin_weight_data_type_ == kNumberTypeFloat32);
+RowMajor2Col8MajorFp16(origin_weight_, packed_weight_, out_channel, in_channel * kernel_plane, false);

// init bias
bias_data_ = malloc(oc8 * sizeof(float16_t));
@@ -56,14 +55,7 @@ int ConvolutionFP16CPUKernel::InitWeightBias() {
}
memset(bias_data_, 0, oc8 * sizeof(float16_t));
if (in_tensors_.size() == kInputSize2) {
-if (origin_bias_data_type_ == kNumberTypeFloat16) {
-memcpy(bias_data_, origin_bias_, out_channel * sizeof(float16_t));
-} else {
-MS_LOG(ERROR) << "Conv fp16 only support fp16 bias";
-return RET_ERROR;
-}
-} else {
-MS_ASSERT(in_tensors_.size() == kInputSize1);
+memcpy(bias_data_, origin_bias_, out_channel * sizeof(float16_t));
}
return RET_OK;
}
@@ -123,8 +115,14 @@ int ConvolutionFP16CPUKernel::ReSize() {
}

int ConvolutionFP16CPUKernel::RunImpl(int task_id) {
-ConvFp16(execute_input_, packed_input_, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), col_major_input_,
-execute_output_, task_id, conv_param_);
+auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
+auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
+if (input_ptr == nullptr || output_ptr == nullptr) {
+MS_LOG(ERROR) << "Convolution Fp16 get null tensor data!";
+return RET_ERROR;
+}
+ConvFp16(input_ptr, packed_input_, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), col_major_input_,
+output_ptr, task_id, conv_param_);
return RET_OK;
}

@@ -139,8 +137,6 @@ static int ConvolutionFp16Impl(void *cdata, int task_id) {
}

int ConvolutionFP16CPUKernel::Run() {
-ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
-
auto ret = InitTmpBuffer();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init tmp buffer failed.";


+4 -4  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h

@@ -20,15 +20,15 @@
#include <arm_neon.h>
#include <vector>
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
#include "src/runtime/kernel/arm/base/convolution_base.h"

namespace mindspore::kernel {
-class ConvolutionFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
+class ConvolutionFP16CPUKernel : public ConvolutionBaseCPUKernel {
public:
ConvolutionFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx, void *origin_weight,
-void *origin_bias, TypeId origin_weight_data_type, TypeId origin_bias_data_type)
-: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, origin_weight_data_type, origin_bias_data_type),
+void *origin_bias)
+: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx),
origin_weight_(origin_weight),
origin_bias_(origin_bias) {}
~ConvolutionFP16CPUKernel() override {


+12 -26  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc

@@ -33,9 +33,9 @@ int ConvolutionWinogradFP16CPUKernel::WinogradFilterTransformFp16(const float16_
}

int ConvolutionWinogradFP16CPUKernel::InitWeightBias() {
-auto filter_tensor = in_tensors_.at(kWeightIndex);
-int in_channel = filter_tensor->Channel();
-int out_channel = filter_tensor->Batch();
+auto weight_tensor = in_tensors_.at(kWeightIndex);
+int in_channel = weight_tensor->Channel();
+int out_channel = weight_tensor->Batch();
conv_param_->input_channel_ = in_channel;
conv_param_->output_channel_ = out_channel;
int oc_block_num = UP_DIV(out_channel, col_tile_);
@@ -65,21 +65,11 @@ int ConvolutionWinogradFP16CPUKernel::InitWeightBias() {
MS_LOG(ERROR) << "get matrix g from CookToomFilter failed.";
return ret;
}
-ret = GetExecuteFilter(filter_tensor, origin_weight_);
-if (ret != RET_OK) {
-MS_LOG(ERROR) << "get execute filter failed.";
-return ret;
-}
-ret = WinogradFilterTransformFp16(execute_weight_, matrix_g, matrix_gt, col_tile_);
+ret = WinogradFilterTransformFp16(reinterpret_cast<float16_t *>(origin_weight_), matrix_g, matrix_gt, col_tile_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "winograd filter transform failed.";
return ret;
}
-// if fp16_weight is malloced, free it. It will not be used in runtime anymore.
-if (fp16_weight_ != nullptr) {
-free(fp16_weight_);
-fp16_weight_ = nullptr;
-}

// init bias
bias_data_ = malloc(oc_block_num * col_tile_ * sizeof(float16_t));
@@ -88,16 +78,8 @@ int ConvolutionWinogradFP16CPUKernel::InitWeightBias() {
return RET_ERROR;
}
memset(bias_data_, 0, oc_block_num * col_tile_ * sizeof(float16_t));

-if (in_tensors_.size() == kInputSize2) {
-if (origin_bias_data_type_ == kNumberTypeFloat16) {
-memcpy(bias_data_, origin_bias_, out_channel * sizeof(float16_t));
-} else {
-MS_LOG(ERROR) << "Conv winograd fp16 only support fp16 bias";
-return RET_ERROR;
-}
-} else {
-MS_ASSERT(in_tensors_.size() == kInputSize1);
+if (in_tensors_.size() == kInputSize2) {
+memcpy(bias_data_, origin_bias_, out_channel * sizeof(float16_t));
}
return RET_OK;
}
@@ -202,7 +184,13 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() {
}

int ConvolutionWinogradFP16CPUKernel::RunImpl(int task_id) {
-ConvWinogardFp16(execute_input_, trans_weight_, reinterpret_cast<const float16_t *>(bias_data_), execute_output_,
+auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
+auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
+if (input_ptr == nullptr || output_ptr == nullptr) {
+MS_LOG(ERROR) << "Convolution Winograd Fp16 get null tensor data!";
+return RET_ERROR;
+}
+ConvWinogardFp16(input_ptr, trans_weight_, reinterpret_cast<const float16_t *>(bias_data_), output_ptr,
tmp_buffer_address_list_, task_id, conv_param_, in_func_, out_func_);
return RET_OK;
}
@@ -218,8 +206,6 @@ static int ConvolutionWinogradFp16Impl(void *cdata, int task_id) {
}

int ConvolutionWinogradFP16CPUKernel::Run() {
-ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
-
auto ret = InitTmpBuffer();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init tmp buffer failed.";


+4 -5  mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h

@@ -20,20 +20,19 @@
#include <arm_neon.h>
#include <vector>
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
#include "src/runtime/kernel/arm/base/convolution_base.h"
#include "nnacl/fp16/conv_fp16.h"
#include "nnacl/fp16/winograd_utils_fp16.h"
#include "src/common/utils.h"
#include "nnacl/base/minimal_filtering_generator.h"

namespace mindspore::kernel {
-class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
+class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseCPUKernel {
public:
ConvolutionWinogradFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx, int out_unit,
-void *origin_weight, void *origin_bias, TypeId origin_weight_data_type,
-TypeId origin_bias_data_type)
-: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, origin_weight_data_type, origin_bias_data_type),
+void *origin_weight, void *origin_bias)
+: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx),
output_unit_(out_unit),
origin_weight_(origin_weight),
origin_bias_(origin_bias) {}


+14 -11  mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc

@@ -73,7 +73,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
// init weight: o, h, w, i; o == group, i == 1
auto weight_tensor = in_tensors_.at(kWeightIndex);
int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM);
-auto origin_weight = reinterpret_cast<float16_t *>(weight_tensor->MutableData());
+auto origin_weight = reinterpret_cast<float16_t *>(weight_tensor->data_c());
int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width();

packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
@@ -92,10 +92,8 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t));
if (in_tensors_.size() == kInputSize2) {
auto bias_tensor = in_tensors_.at(kBiasIndex);
-auto ori_bias = reinterpret_cast<float16_t *>(bias_tensor->MutableData());
-for (int i = 0; i < bias_tensor->ElementsNum(); i++) {
-reinterpret_cast<float16_t *>(bias_data_)[i] = ori_bias[i];
-}
+auto ori_bias = reinterpret_cast<float16_t *>(bias_tensor->data_c());
+memcpy(bias_data_, ori_bias, bias_tensor->Size());
}

conv_param_->thread_num_ = MSMIN(thread_count_, OC8);
@@ -157,18 +155,23 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
return RET_ERROR;
}

-ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
+auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
+auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
+if (input_ptr == nullptr || output_ptr == nullptr) {
+MS_LOG(ERROR) << "Deconvolution depthwise Fp16 get null tensor data!";
+return RET_ERROR;
+}

if (need_align_) {
-PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
+PackNHWCToNHWC8Fp16(input_ptr, packed_input_, conv_param_->input_batch_,
conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
} else {
-packed_input_ = execute_input_;
+packed_input_ = input_ptr;
}

if (!need_align_) {
-memset(execute_output_, 0, out_tensors_.at(kOutputIndex)->ElementsNum() * sizeof(float16_t));
-packed_output_ = execute_output_;
+memset(output_ptr, 0, out_tensors_.at(kOutputIndex)->ElementsNum() * sizeof(float16_t));
+packed_output_ = output_ptr;
}
ret = ParallelLaunch(this->context_->thread_pool_, DeconvDwFp16Run, this, conv_param_->thread_num_);
if (ret != RET_OK) {
@@ -176,7 +179,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
}

if (need_align_) {
-PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
+PackNHWC8ToNHWCFp16(packed_output_, output_ptr, conv_param_->output_batch_,
conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
}



+4 -5  mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h

@@ -19,7 +19,7 @@

#include <vector>
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
#include "src/runtime/kernel/arm/base/convolution_base.h"
#include "nnacl/fp16/conv_depthwise_fp16.h"

#ifdef __cplusplus
@@ -34,12 +34,11 @@ void ComputeStrides(int *shape, int *strides, int ndim);
#endif

namespace mindspore::kernel {
-class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
+class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel {
public:
DeconvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
-TypeId origin_weight_data_type, TypeId origin_bias_data_type)
-: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, origin_weight_data_type, origin_bias_data_type) {}
+const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx)
+: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {}
~DeconvolutionDepthwiseFp16CPUKernel() override;

int Init() override;


+20 -23  mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc

@@ -32,9 +32,9 @@ DeConvolutionFp16CPUKernel::~DeConvolutionFp16CPUKernel() {
delete matmul_param_;
matmul_param_ = nullptr;
}
-if (execute_weight_ != nullptr) {
-free(execute_weight_);
-execute_weight_ = nullptr;
+if (pack_weight_ != nullptr) {
+free(pack_weight_);
+pack_weight_ = nullptr;
}
return;
}
@@ -78,17 +78,17 @@ int DeConvolutionFp16CPUKernel::InitWeightBias() {
}

size_t weight_pack_size = input_channel * kernel_w * kernel_h * UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);
-execute_weight_ = reinterpret_cast<float16_t *>(malloc(weight_pack_size));
-if (execute_weight_ == nullptr) {
-MS_LOG(ERROR) << "deconv malloc execute_weight_ error!";
+pack_weight_ = reinterpret_cast<float16_t *>(malloc(weight_pack_size));
+if (pack_weight_ == nullptr) {
+MS_LOG(ERROR) << "deconv malloc pack_weight_ error!";
return RET_ERROR;
}
-memset(execute_weight_, 0, weight_pack_size);
+memset(pack_weight_, 0, weight_pack_size);
if (in_tensors_.at(1)->data_type() != kNumberTypeFloat16) {
MS_LOG(ERROR) << "deconv fp16 kernel require fp16 weight";
return RET_ERROR;
}
-PackNHWCFp16ToC8HWN8Fp16(reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()), execute_weight_, input_channel,
+PackNHWCFp16ToC8HWN8Fp16(reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()), pack_weight_, input_channel,
kernel_w * kernel_h, output_channel);
return RET_OK;
}
@@ -169,7 +169,7 @@ int DeConvolutionFp16CPUKernel::DoDeconv(int task_id) {
}

auto tmp_buf = tmp_buffer_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->row_16_;
-MatMulFp16(pack_input_, execute_weight_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->deep_,
+MatMulFp16(pack_input_, pack_weight_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->deep_,
tmp_buf, nullptr, ActType_No, matmul_param_->deep_, matmul_param_->row_, oc * C8NUM * kernel_plane_, 0,
OutType_C8);

@@ -197,7 +197,12 @@ int DeConvolutionFp16CPUKernel::Init() {
}

int DeConvolutionFp16CPUKernel::Run() {
-ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
+auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
+auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
+if (input_ptr == nullptr || output_ptr == nullptr) {
+MS_LOG(ERROR) << "DeConvolution Fp16 get null tensor data!";
+return RET_ERROR;
+}

int error_code = InitRunBuf();
if (error_code != RET_OK) {
@@ -207,8 +212,8 @@ int DeConvolutionFp16CPUKernel::Run() {
}

for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) {
batch_input_ = execute_input_ + batch_index * conv_param_->input_channel_ * input_plane_;
batch_output_ = execute_output_ + batch_index * conv_param_->output_channel_ * output_plane_;
batch_input_ = input_ptr + batch_index * conv_param_->input_channel_ * input_plane_;
batch_output_ = output_ptr + batch_index * conv_param_->output_channel_ * output_plane_;

RowMajor2Col16MajorFp16Opt(batch_input_, pack_input_, input_plane_, conv_param_->input_channel_);

@@ -228,25 +233,17 @@ kernel::LiteKernel *CpuDeConvFp16KernelCreator(const std::vector<lite::Tensor *>
MS_ASSERT(op_parameter != nullptr);
MS_ASSERT(desc.type == schema::PrimitiveType_Conv2dTransposeFusion);

auto weight_data_type = inputs.at(1)->data_type();
TypeId bias_data_type = kTypeUnknown;
if (inputs.size() == 3) {
bias_data_type = inputs.at(2)->data_type();
}
kernel::LiteKernel *kernel = nullptr;
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
if (conv_param->group_ == 1) {
if ((conv_param->stride_h_ != 1 || conv_param->stride_w_ != 1) &&
(conv_param->dilation_h_ == 1 && conv_param->dilation_w_ == 1)) {
kernel = new (std::nothrow)
kernel::DeConvWinogradFp16CPUKernel(op_parameter, inputs, outputs, ctx, weight_data_type, bias_data_type);
kernel = new (std::nothrow) kernel::DeConvWinogradFp16CPUKernel(op_parameter, inputs, outputs, ctx);
} else {
kernel = new (std::nothrow)
kernel::DeConvolutionFp16CPUKernel(op_parameter, inputs, outputs, ctx, weight_data_type, bias_data_type);
kernel = new (std::nothrow) kernel::DeConvolutionFp16CPUKernel(op_parameter, inputs, outputs, ctx);
}
} else if (conv_param->group_ == conv_param->input_channel_ && conv_param->group_ == conv_param->output_channel_) {
kernel = new (std::nothrow)
DeconvolutionDepthwiseFp16CPUKernel(op_parameter, inputs, outputs, ctx, weight_data_type, bias_data_type);
kernel = new (std::nothrow) DeconvolutionDepthwiseFp16CPUKernel(op_parameter, inputs, outputs, ctx);
}

if (kernel == nullptr) {
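
The creators allocate with new (std::nothrow), so a failed allocation yields nullptr instead of throwing; the tail of the hunk, cut off above, is the usual guard. A sketch of the pattern (the exact log text is an assumption, since it is truncated here):

kernel::LiteKernel *kernel =
    new (std::nothrow) kernel::DeConvolutionFp16CPUKernel(op_parameter, inputs, outputs, ctx);
if (kernel == nullptr) {
  // Allocation failed: report and bail out rather than dereferencing nullptr.
  MS_LOG(ERROR) << "new DeConvolutionFp16CPUKernel failed.";  // assumed message
  return nullptr;
}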


+ 5
- 5
mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.h View File

@@ -21,15 +21,14 @@
#include "nnacl/fp16/deconv_fp16.h"
#include "nnacl/fp16/matmul_fp16.h"
#include "src/kernel_registry.h"
#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
#include "src/runtime/kernel/arm/base/convolution_base.h"

namespace mindspore::kernel {
class DeConvolutionFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
class DeConvolutionFp16CPUKernel : public ConvolutionBaseCPUKernel {
public:
DeConvolutionFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
TypeId origin_weight_data_type, TypeId origin_bias_data_type)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, origin_weight_data_type, origin_bias_data_type) {}
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {}
~DeConvolutionFp16CPUKernel() override;
int Init() override;
int Run() override;
@@ -52,6 +51,7 @@ class DeConvolutionFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
int thread_count_;
int thread_stride_;
float16_t *pack_input_;
float16_t *pack_weight_;
float16_t *pack_output_;
float16_t *tmp_buffer_;
float16_t *batch_input_;


+ 12
- 18
mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc View File

@@ -317,15 +317,11 @@ int DeConvWinogradFp16CPUKernel::InitComputeParam() {
int DeConvWinogradFp16CPUKernel::InitDataParam() {
/* unit data : weight & winograd data*/
auto weight_tensor = in_tensors_.at(kWeightIndex);
auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteFilter(weight_tensor, weight_tensor->data_c());
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get Execute filter failed.";
return ret;
}
auto origin_weight = reinterpret_cast<float16_t *>(weight_tensor->data_c());

for (int i = 0; i < deconv_param_->compute_size_; i++) {
DeConvComputeUnit *unit = &deconv_param_->compute_units_[i];
ret = PackDeConvWgDataFp16(execute_weight_, unit, conv_param_, deconv_param_);
auto ret = PackDeConvWgDataFp16(origin_weight, unit, conv_param_, deconv_param_);
if (ret != RET_OK) {
return ret;
}
@@ -338,18 +334,11 @@ int DeConvWinogradFp16CPUKernel::InitDataParam() {
return RET_ERROR;
}
memset(bias_data_, 0, deconv_param_->oc_up4_ * sizeof(float16_t));
auto fp16_bias_data = reinterpret_cast<float16_t *>(bias_data_);
if (in_tensors_.size() == 3 && in_tensors_.at(kBiasIndex)->shape().size() == 1 &&
in_tensors_.at(kBiasIndex)->DimensionSize(0) == conv_param_->output_channel_) {
auto src_bias = reinterpret_cast<float16_t *>(in_tensors_.at(kBiasIndex)->MutableData());
MS_ASSERT(src_bias);
for (int i = 0; i < conv_param_->output_channel_; ++i) {
fp16_bias_data[i] = (float16_t)src_bias[i];
}
} else {
MS_ASSERT(in_tensors_.size() == kInputSize1);
auto src_bias = reinterpret_cast<float16_t *>(in_tensors_.at(kBiasIndex)->data_c());
memcpy(bias_data_, src_bias, in_tensors_.at(kBiasIndex)->Size());
}

return RET_OK;
}
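
The removed branch above cast an fp32 bias to fp16 element by element; after this change the kernel expects an fp16 bias tensor and copies it verbatim. A minimal sketch of the resulting initialization, with names taken from the hunk (the exact guard condition is an assumption):

memset(bias_data_, 0, deconv_param_->oc_up4_ * sizeof(float16_t));  // zero the padded tail up to oc_up4_
if (in_tensors_.size() == 3) {  // assumed guard: a bias tensor is present
  auto src_bias = reinterpret_cast<float16_t *>(in_tensors_.at(kBiasIndex)->data_c());
  memcpy(bias_data_, src_bias, in_tensors_.at(kBiasIndex)->Size());  // raw fp16 copy, no cast loop
}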

@@ -391,11 +380,16 @@ int DeConvWinogradFp16CPUKernel::Init() {
}

int DeConvWinogradFp16CPUKernel::Run() {
ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
if (input_ptr == nullptr || output_ptr == nullptr) {
MS_LOG(ERROR) << "Deconvolution Winograd Fp16 get null tensor data!";
return RET_ERROR;
}

for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) {
nhwc_input_ = execute_input_ + batch_index * deconv_param_->input_plane_ * conv_param_->input_channel_;
nhwc_output_ = execute_output_ + batch_index * deconv_param_->output_plane_ * conv_param_->output_channel_;
nhwc_input_ = input_ptr + batch_index * deconv_param_->input_plane_ * conv_param_->input_channel_;
nhwc_output_ = output_ptr + batch_index * deconv_param_->output_plane_ * conv_param_->output_channel_;

::memset(nc4hw4_output_, 0, deconv_param_->output_plane_ * deconv_param_->oc_div4_ * C4NUM * sizeof(float16_t));
ParallelLaunch(this->context_->thread_pool_, DeConvWgFp16Run, this, deconv_param_->thread_num_);


+ 4
- 5
mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.h View File

@@ -22,15 +22,14 @@
#include "nnacl/fp16/common_func_fp16.h"
#include "nnacl/fp16/deconv_winograd_fp16.h"
#include "nnacl/fp16/pack_fp16.h"
#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
#include "src/runtime/kernel/arm/base/convolution_base.h"

namespace mindspore::kernel {
class DeConvWinogradFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
class DeConvWinogradFp16CPUKernel : public ConvolutionBaseCPUKernel {
public:
DeConvWinogradFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
TypeId origin_weight_data_type, TypeId origin_bias_data_type)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, origin_weight_data_type, origin_bias_data_type) {}
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {}
~DeConvWinogradFp16CPUKernel() override;
int Init() override;
int Run() override;


+ 28
- 4
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.cc View File

@@ -24,6 +24,7 @@
#include "src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow_fp32.h"
#include "src/runtime/kernel/arm/fp32/convolution_depthwise_indirect_fp32.h"
#include "src/runtime/kernel/arm/base/group_convolution_creator.h"
#include "src/runtime/kernel/arm/base/group_convolution.h"
#include "schema/model_generated.h"
#include "include/errorcode.h"

@@ -161,9 +162,9 @@ kernel::LiteKernel *ConvolutionDelegateCPUKernel::CpuConvFp32KernelSelect() {
return kernel;
}

kernel::LiteKernel *DispatchConvDw(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
const InnerContext *ctx) {
kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
const InnerContext *ctx) {
auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
kernel::LiteKernel *kernel = nullptr;
if (opParameter != nullptr && opParameter->infer_flag_) {
@@ -187,6 +188,29 @@ kernel::LiteKernel *DispatchConvDw(const std::vector<lite::Tensor *> &inputs,
return kernel;
}

kernel::LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const lite::InnerContext *ctx) {
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
GroupConvCreator group_conv_creator(inputs, outputs, op_parameter, ctx, false, kNumberTypeFloat32);
group_conv_creator.SetShapeOfTensors();
for (int i = 0; i < conv_param->group_; ++i) {
ConvParameter *new_conv_param = CreateNewConvParameter(conv_param);
std::vector<lite::Tensor *> new_inputs;
std::vector<lite::Tensor *> new_outputs;
auto ret = group_conv_creator.GetSingleConvParam(new_conv_param, &new_inputs, &new_outputs, i);
if (ret != RET_OK) {
MS_LOG(ERROR) << "GetSingleConv for fp32 group conv failed.";
return nullptr;
}
group_conv_creator.get_group_conv()->emplace_back(new (std::nothrow) ConvolutionDelegateCPUKernel(
reinterpret_cast<OpParameter *>(new_conv_param), new_inputs, new_outputs, ctx));
}
return new (std::nothrow)
GroupConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, *(group_conv_creator.get_group_conv()),
reinterpret_cast<ConvParameter *>(op_parameter)->group_);
}

/* creator func */
kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
@@ -200,7 +224,7 @@ kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::Tensor *> &
if (conv_param->group_ == 1) {
kernel = new (std::nothrow) kernel::ConvolutionDelegateCPUKernel(op_parameter, inputs, outputs, ctx);
} else if (conv_param->group_ == conv_param->input_channel_ && conv_param->group_ == conv_param->output_channel_) {
kernel = DispatchConvDw(inputs, outputs, op_parameter, ctx);
kernel = CpuConvDwFp32KernelCreator(inputs, outputs, op_parameter, ctx);
} else {
kernel = CpuGroupConvFp32KernelCreator(inputs, outputs, op_parameter, ctx);
}
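
For orientation: CpuGroupConvFp32KernelCreator above splits one grouped convolution into group_ ordinary convolutions, each seeing a 1/group_ slice of the channels, with the actual splitting delegated to GroupConvCreator::GetSingleConvParam. A hedged sketch of the arithmetic that split implies (these field updates are our illustration, not the creator's code):

// Illustrative per-group view of a grouped conv; the real work happens
// inside GroupConvCreator::GetSingleConvParam.
new_conv_param->group_ = 1;  // each sub-kernel runs as an ordinary conv
new_conv_param->input_channel_ = conv_param->input_channel_ / conv_param->group_;    // assumption
new_conv_param->output_channel_ = conv_param->output_channel_ / conv_param->group_;  // assumption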


+ 24
- 0
mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8_creator.cc View File

@@ -21,6 +21,7 @@
#include "src/runtime/kernel/arm/int8/convolution_depthwise_int8.h"
#include "src/runtime/kernel/arm/int8/convolution_depthwise_3x3_int8.h"
#include "src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h"
#include "src/runtime/kernel/arm/int8/group_convolution_int8.h"
#include "src/runtime/kernel/arm/base/group_convolution_creator.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
@@ -83,6 +84,29 @@ kernel::LiteKernel *CpuConvInt8KernelSelect(const std::vector<lite::Tensor *> &i
return kernel;
}

kernel::LiteKernel *CpuGroupConvInt8KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const lite::InnerContext *ctx, int group) {
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
GroupConvCreator group_conv_creator(inputs, outputs, op_parameter, ctx, true, kNumberTypeInt8);
group_conv_creator.SetShapeOfTensors();
for (int i = 0; i < conv_param->group_; ++i) {
ConvParameter *new_conv_param = CreateNewConvParameter(conv_param);
std::vector<lite::Tensor *> new_inputs;
std::vector<lite::Tensor *> new_outputs;
auto ret = group_conv_creator.GetSingleConvParam(new_conv_param, &new_inputs, &new_outputs, i);
if (ret != RET_OK) {
MS_LOG(ERROR) << "GetSingleConv for int8 group conv failed.";
return nullptr;
}
group_conv_creator.CopyQuantParam(&new_inputs);
group_conv_creator.get_group_conv()->emplace_back(
CpuConvInt8KernelSelect(new_inputs, new_outputs, reinterpret_cast<OpParameter *>(new_conv_param), ctx));
}
return new (std::nothrow)
GroupConvolutionInt8CPUKernel(op_parameter, inputs, outputs, ctx, *(group_conv_creator.get_group_conv()), group);
}

kernel::LiteKernel *CpuConvInt8KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx, const kernel::KernelKey &desc) {
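
One int8-specific step in the group creator above stands apart from the fp32 path: CopyQuantParam propagates each input tensor's quantization metadata onto the freshly split sub-tensors, since an int8 kernel cannot run without its scale and zero-point. A hedged sketch of what such a copy amounts to (the lite::Tensor accessors used here are assumptions):

// Illustrative only: carry quant params from a source tensor to its split
// counterpart so the int8 sub-kernel sees the same scale/zero-point.
for (size_t i = 0; i < src_tensor->quant_params().size(); ++i) {
  dst_tensor->AddQuantParam(src_tensor->quant_params().at(i));  // assumed lite::Tensor API
}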

