
!11105 [MS][LITE][CPU] dispatch convolution op through delegate

From: @fuzhiye
Reviewed-by: @hangangqiang,@zhang_xue_tong
Signed-off-by: @zhang_xue_tong
tags/v1.2.0-rc1
mindspore-ci-bot committed 5 years ago (parent commit 3f03b83257)
23 changed files with 1108 additions and 929 deletions
  1. +0 -3 mindspore/lite/nnacl/int8/quantize.h
  2. +3 -14 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
  3. +7 -2 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
  4. +418 -0 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.cc
  5. +65 -0 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.h
  6. +10 -335 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
  7. +6 -6 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h
  8. +6 -27 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
  9. +8 -6 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
  10. +8 -12 mindspore/lite/src/runtime/kernel/arm/fp16/group_convolution_fp16.cc
  11. +27 -0 mindspore/lite/src/runtime/kernel/arm/fp32/adder_fp32.cc
  12. +3 -1 mindspore/lite/src/runtime/kernel/arm/fp32/adder_fp32.h
  13. +12 -15 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc
  14. +6 -2 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.h
  15. +416 -0 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.cc
  16. +77 -0 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.h
  17. +10 -319 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.cc
  18. +6 -12 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.h
  19. +5 -17 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd_fp32.cc
  20. +6 -2 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd_fp32.h
  21. +8 -12 mindspore/lite/src/runtime/kernel/arm/fp32/group_convolution_fp32.cc
  22. +1 -2 mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc
  23. +0 -142 mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/conv1x1_fp32_tests.cc

+0 -3 mindspore/lite/nnacl/int8/quantize.h

@@ -21,9 +21,6 @@
#include <limits.h>
#include "nnacl/op_base.h"

#define INPUT_ASYMMETRIC 0b001
#define FILTER_ASYMMETRIC 0b010
#define OUTPUT_ASYMMETRIC 0b100
#define INPUT_PER_CHANNEL 0b001
#define FILTER_PER_CHANNEL 0b010
#define OUTPUT_PER_CHANNEL 0b100
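// These are combinable bit flags: OR them together to describe which of the
// input / filter / output tensors are asymmetric or per-channel quantized.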


+3 -14 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc

@@ -93,13 +93,7 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() {
MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!";
return RET_ERROR;
}
auto bias_tensor = in_tensors_.at(kBiasIndex);
if (bias_tensor->data_type() == kNumberTypeFloat16) {
memcpy(bias_data_, bias_tensor->MutableData(), output_channel * sizeof(float16_t));
} else {
Float32ToFloat16(reinterpret_cast<float *>(bias_tensor->MutableData()), reinterpret_cast<float16_t *>(bias_data_),
output_channel);
}
memcpy(bias_data_, fp16_bias_, output_channel * sizeof(float16_t));
memset(reinterpret_cast<char *>(bias_data_) + weight_size, 0, size - weight_size);
}

@@ -111,8 +105,7 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() {
return RET_ERROR;
}
memset(reinterpret_cast<char *>(weight_ptr_) + down_size, 0, size - down_size);
ColMajor2Row8MajorFp16(weight_tensor->MutableData(), weight_ptr_, input_channel, output_channel,
weight_tensor->data_type() == kNumberTypeFloat16);
ColMajor2Row8MajorFp16(fp16_weight_, weight_ptr_, input_channel, output_channel, true);
return RET_OK;
}

@@ -127,10 +120,7 @@ int Convolution1x1FP16CPUKernel::Init() {
MS_LOG(ERROR) << "Init weight bias failed.";
return ret;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
return RET_OK;
}

void Convolution1x1FP16CPUKernel::FreeTmpBuffer() {
@@ -143,7 +133,6 @@ void Convolution1x1FP16CPUKernel::FreeTmpBuffer() {

int Convolution1x1FP16CPUKernel::ReSize() {
FreeTmpBuffer();

auto ret = ConvolutionBaseCPUKernel::Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvolutionBase init failed.";


+7 -2 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h

@@ -30,8 +30,11 @@ class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
public:
Convolution1x1FP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
const mindspore::lite::PrimitiveC *primitive, float16_t *fp16_weight,
float16_t *fp16_bias)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive),
fp16_weight_(fp16_weight),
fp16_bias_(fp16_bias) {}
~Convolution1x1FP16CPUKernel() override;

int Init() override;
@@ -53,6 +56,8 @@ class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
bool multi_thread_by_hw_ = false;
int thread_count_ = 1;
int thread_stride_ = 0;
float16_t *fp16_weight_; // do not free
float16_t *fp16_bias_; // do not free
float16_t *weight_ptr_ = nullptr;
float16_t *input_ptr_ = nullptr;
float16_t *pack_input_ = nullptr;


+418 -0 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.cc

@@ -0,0 +1,418 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "src/runtime/kernel/arm/fp16/convolution_delegate_fp16.h"
#include <vector>
#include "src/runtime/kernel/arm/fp32/convolution_delegate_fp32.h"
#include "src/runtime/kernel/arm/fp16/convolution_fp16.h"
#include "src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h"
#include "src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h"
#include "src/runtime/kernel/arm/fp16/group_convolution_fp16.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "src/runtime/runtime_api.h"
#include "src/runtime/kernel/arm/base/dequant.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Conv2D;
using mindspore::schema::Format::Format_NHWC;

namespace mindspore::kernel {
void ConvolutionDelegateFP16CPUKernel::FreeCopiedData() {
if ((fp16_weight_ != nullptr) && (need_free_ & WEIGHT_NEED_FREE)) {
free(fp16_weight_);
fp16_weight_ = nullptr;
}
if ((fp16_bias_ != nullptr) && (need_free_ & BIAS_NEED_FREE)) {
free(fp16_bias_);
fp16_bias_ = nullptr;
}
}

int ConvolutionDelegateFP16CPUKernel::GetFp16WeightAndBias() {
auto ret = GetFp16Weight();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get Fp16 Weight failed.";
return RET_ERROR;
}

ret = GetFp16Bias();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get Fp16 Bias failed.";
return RET_ERROR;
}
return RET_OK;
}

int ConvolutionDelegateFP16CPUKernel::GetFp16Weight() {
auto weight_tensor = in_tensors_.at(kWeightIndex);
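// Reuse the tensor's own buffer only when it already holds fp16 data and
// shapes are inferred; otherwise make an fp16 copy (converting from fp32
// when necessary) and mark it to be freed later via need_free_.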
if (weight_tensor->data_type() == kNumberTypeFloat16 && InferShapeDone()) {
// no need to malloc new memory to store the origin data
fp16_weight_ = reinterpret_cast<float16_t *>(weight_tensor->data_c());
return RET_OK;
} else {
fp16_weight_ = CopyData(weight_tensor);
if (fp16_weight_ == nullptr) {
MS_LOG(ERROR) << "Generate fp16_weight failed.";
return RET_ERROR;
}
need_free_ = need_free_ | WEIGHT_NEED_FREE;
return RET_OK;
}
return RET_OK;
}

int ConvolutionDelegateFP16CPUKernel::GetFp16Bias() {
if (in_tensors_.size() == 3) {
// bias tensor is present
auto bias_tensor = in_tensors_.at(kBiasIndex);
if (bias_tensor->data_type() == kNumberTypeFloat16 && InferShapeDone()) {
// no need to malloc new memory to store the origin data
fp16_bias_ = reinterpret_cast<float16_t *>(bias_tensor->data_c());
return RET_OK;
} else {
fp16_bias_ = CopyData(bias_tensor);
if (fp16_bias_ == nullptr) {
MS_LOG(ERROR) << "Generate fp16_bias failed.";
return RET_ERROR;
}
need_free_ = need_free_ | BIAS_NEED_FREE;
return RET_OK;
}
}
return RET_OK;
}

float16_t *ConvolutionDelegateFP16CPUKernel::CopyData(lite::Tensor *tensor) {
auto data_type = tensor->data_type();
MS_ASSERT(data_type == kNumberTypeFloat32 || data_type == kNumberTypeFloat16);
auto fp16_data = reinterpret_cast<float16_t *>(malloc(tensor->ElementsNum() * sizeof(float16_t)));
if (fp16_data == nullptr) {
MS_LOG(ERROR) << "Malloc fp16_data failed.";
return nullptr;
}
if (data_type == kNumberTypeFloat32) {
float *origin_data = reinterpret_cast<float *>(tensor->data_c());
for (size_t i = 0; i < tensor->ElementsNum(); ++i) {
fp16_data[i] = (float16_t)origin_data[i];
}
} else {
auto *origin_data = reinterpret_cast<float16_t *>(tensor->data_c());
memcpy(fp16_data, origin_data, tensor->Size());
}
return fp16_data;
}

int ConvolutionDelegateFP16CPUKernel::Init() {
auto ret = GetFp16WeightAndBias();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get fp16 weight and bias failed.";
return ret;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
}

int ConvolutionDelegateFP16CPUKernel::ReSize() {
// Update shape info of input and output
SetInputOutputShapeInfo(reinterpret_cast<ConvParameter *>(op_parameter_), in_tensors_.front(), out_tensors_.front(),
context_);
if (fp16_conv_kernel_ == nullptr) {
fp16_conv_kernel_ =
CpuConvFp16KernelSelect(in_tensors_, out_tensors_, op_parameter_, context_, primitive_, fp16_weight_, fp16_bias_);
if (fp16_conv_kernel_ == nullptr) {
MS_LOG(ERROR) << "Selecting execute kernel failed for conv_kernel, got a nullptr.";
return RET_ERROR;
}
}
// the copied weight and bias are no longer used; free them.
FreeCopiedData();
return fp16_conv_kernel_->ReSize();
}

ConvParameter *CreateNewConvParameterFp16(ConvParameter *parameter) {
auto conv_parameter = reinterpret_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
if (conv_parameter == nullptr) {
MS_LOG(ERROR) << "Malloc new conv parameter failed.";
return nullptr;
}
memcpy(conv_parameter, parameter, sizeof(ConvParameter));
return conv_parameter;
}

kernel::LiteKernel *CpuConvFp16KernelSelect(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const lite::InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
float16_t *fp16_weight, float16_t *fp16_bias) {
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
bool use_winograd = false;
int out_unit;
CheckIfUseWinogradFp16(&use_winograd, &out_unit, conv_param);
kernel::LiteKernel *kernel = nullptr;
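// Dispatch: 1x1 kernels take the dedicated 1x1 path, winograd-eligible
// shapes the winograd kernel, everything else the generic kernel.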
if (conv_param->kernel_h_ == 1 && conv_param->kernel_w_ == 1) {
kernel = new (std::nothrow)
kernel::Convolution1x1FP16CPUKernel(op_parameter, inputs, outputs, ctx, primitive, fp16_weight, fp16_bias);
} else if (use_winograd) {
kernel = new (std::nothrow) kernel::ConvolutionWinogradFP16CPUKernel(op_parameter, inputs, outputs, ctx, primitive,
out_unit, fp16_weight, fp16_bias);
} else {
kernel = new (std::nothrow)
kernel::ConvolutionFP16CPUKernel(op_parameter, inputs, outputs, ctx, primitive, fp16_weight, fp16_bias);
}
// Once a kernel is selected, its Init() will invoke InitWeightBias().
auto ret = kernel->Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "kernel init failed.";
delete kernel;
return nullptr;
}
return kernel;
}

void FreeMemoryFp16(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
const std::vector<lite::Tensor *> &new_outputs) {
for (auto sub_conv : group_convs) {
delete sub_conv;
}
for (auto in_tensor : new_inputs) {
delete in_tensor;
}
for (auto out_tensor : new_outputs) {
delete out_tensor;
}
}

static lite::Tensor *CreateInputTensorFp16(TypeId data_type, const std::vector<int> &in_shape, bool infered_flag) {
auto in_tensor = new (std::nothrow) lite::Tensor(data_type, in_shape, Format_NHWC, lite::Tensor::Category::VAR);
if (in_tensor == nullptr) {
MS_LOG(ERROR) << "new in_tensor failed.";
return nullptr;
}
if (infered_flag) {
auto ret = in_tensor->MallocData();
if (ret != RET_OK) {
delete in_tensor;
MS_LOG(ERROR) << "in tensor malloc failed.";
return nullptr;
}
}
return in_tensor;
}

static lite::Tensor *CreateConstTensorFp16(lite::Tensor *tensor, const std::vector<int> &shape, const int index) {
auto new_tensor =
new (std::nothrow) lite::Tensor(tensor->data_type(), shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
if (new_tensor == nullptr) {
MS_LOG(ERROR) << "Create new_tensor failed.";
return nullptr;
}
auto ret = new_tensor->MallocData();
if (ret != RET_OK) {
delete new_tensor;
MS_LOG(ERROR) << "Malloc new_tensor failed.";
return nullptr;
}
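// copy the index-th per-group slice of the original constant data
// (weight or bias) into the new tensor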
memcpy(new_tensor->data_c(), reinterpret_cast<char *>(tensor->data_c()) + index * new_tensor->Size(),
new_tensor->Size());
return new_tensor;
}

static lite::Tensor *CreateOutputTensorFp16(const std::vector<int> &out_shape,
const std::vector<lite::Tensor *> &outputs, bool infered_flag, int index) {
auto out_tensor = new (std::nothrow) lite::Tensor();
if (out_tensor == nullptr) {
MS_LOG(ERROR) << "new tmp_out_tensor failed.";
return nullptr;
}
out_tensor->set_data_type(mindspore::kNumberTypeFloat16);
out_tensor->set_format(outputs.at(index)->format());
if (infered_flag) {
out_tensor->set_shape(out_shape);
auto ret = out_tensor->MallocData();
if (ret != RET_OK) {
delete out_tensor;
MS_LOG(ERROR) << "out_tensor malloc data failed.";
return nullptr;
}
}
return out_tensor;
}

kernel::LiteKernel *CreateDelegateConvFp16(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive) {
return new (std::nothrow) kernel::ConvolutionDelegateFP16CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
}

kernel::LiteKernel *CpuGroupConvFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive) {
bool infer_flag = (primitive != nullptr && primitive->infer_flag());
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
// update new shape info for each sub kernel
int new_in_channel = inputs.at(kWeightIndex)->Channel();
int new_out_channel = 0;
if (conv_param->group_ == 0) {
MS_LOG(ERROR) << "Divisor 'group' cannot be 0.";
return nullptr;
} else {
new_out_channel = inputs.at(kWeightIndex)->Batch() / conv_param->group_;
}

std::vector<int> in_shape;
std::vector<int> out_shape;
if (infer_flag) {
conv_param->input_channel_ = new_in_channel;
conv_param->output_channel_ = new_out_channel;
in_shape = {inputs.front()->Batch(), inputs.front()->Height(), inputs.front()->Width(), new_in_channel};
out_shape = {inputs.front()->Batch(), outputs.front()->Height(), outputs.front()->Width(), new_out_channel};
}
std::vector<int> filter_shape = {new_out_channel, conv_param->kernel_h_, conv_param->kernel_w_, new_in_channel};
std::vector<int> bias_shape = {new_out_channel};

// new group conv op
std::vector<kernel::LiteKernel *> group_convs;
// create tensors for every sub conv kernel
for (int i = 0; i < conv_param->group_; ++i) {
std::vector<lite::Tensor *> new_inputs;
std::vector<lite::Tensor *> new_outputs;
auto new_conv_parameter = CreateNewConvParameterFp16(conv_param);
if (new_conv_parameter == nullptr) {
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "Get new conv parameter failed.";
return nullptr;
}
// create new input for each group
auto in_tensor = CreateInputTensorFp16(mindspore::kNumberTypeFloat16, in_shape, infer_flag);
if (in_tensor == nullptr) {
delete new_conv_parameter;
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "create input tensor failed.";
return nullptr;
}
new_inputs.emplace_back(in_tensor);

// create new weight
auto filter_tensor = CreateConstTensorFp16(inputs.at(kWeightIndex), filter_shape, i);
if (filter_tensor == nullptr) {
delete new_conv_parameter;
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "create filter tensor failed.";
return nullptr;
}
new_inputs.emplace_back(filter_tensor);

// if has bias, create new bias
if (inputs.size() == 3) {
auto bias_tensor = CreateConstTensorFp16(inputs.at(kBiasIndex), bias_shape, i);
if (bias_tensor == nullptr) {
delete new_conv_parameter;
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "create bias_tensor failed.";
return nullptr;
}
new_inputs.emplace_back(bias_tensor);
}

// create new output tensors
for (size_t j = 0; j < outputs.size(); ++j) {
auto out_tensor = CreateOutputTensorFp16(out_shape, outputs, infer_flag, j);
if (out_tensor == nullptr) {
delete new_conv_parameter;
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "new out_tensor failed.";
return nullptr;
}
new_outputs.emplace_back(out_tensor);
}
group_convs.emplace_back(CreateDelegateConvFp16(
new_inputs, new_outputs, reinterpret_cast<OpParameter *>(new_conv_parameter), ctx, primitive));
}
return new (std::nothrow)
GroupConvolutionFP16CPUKernel(op_parameter, inputs, outputs, ctx, primitive, group_convs, conv_param->group_);
}
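The shape arithmetic above splits one grouped convolution into group_ independent sub-convolutions, each seeing 1/group of the input and output channels. A self-contained illustration of the split (hypothetical helper, not code from this PR):

#include <array>

// NHWC activations, OHWI filters. in_c is the full input channel count,
// weight_batch the filter tensor's batch (= total output channels).
struct SubConvShapes {
  std::array<int, 4> in;      // input slice for one group
  std::array<int, 4> filter;  // filter slice for one group
  std::array<int, 4> out;     // output slice for one group
};

SubConvShapes SplitGroup(int batch, int in_h, int in_w, int in_c, int out_h,
                         int out_w, int weight_batch, int kernel_h,
                         int kernel_w, int group) {
  const int sub_in_c = in_c / group;           // e.g. 16 channels, group 4 -> 4
  const int sub_out_c = weight_batch / group;  // output channels split the same way
  return {{batch, in_h, in_w, sub_in_c},
          {sub_out_c, kernel_h, kernel_w, sub_in_c},
          {batch, out_h, out_w, sub_out_c}};
}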

kernel::LiteKernel *CpuConvFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
const InnerContext *ctx, const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
MS_ASSERT(opParameter != nullptr);
MS_ASSERT(desc.type == schema::PrimitiveType_Conv2D);

auto *weight_tensor = inputs.at(kWeightIndex);
auto *restore_data = weight_tensor->data_c();
auto restore_type = weight_tensor->data_type();
bool dequant_flag =
!weight_tensor->quant_params().empty() && weight_tensor->quant_params().front().inited && restore_data != nullptr;
if (dequant_flag) {
auto *dequant_weight = kernel::DequantUtil::DequantWeight(weight_tensor);
if (dequant_weight == nullptr) {
MS_LOG(ERROR) << "dequant data is nullptr.";
free(opParameter);
return nullptr;
}
weight_tensor->set_data_type(kNumberTypeFloat32);
weight_tensor->set_data(dequant_weight);
}
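// weight_tensor now temporarily holds the fp32 dequantized copy; the
// original quantized buffer and data type are restored after the kernel
// is created below.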

auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
kernel::LiteKernel *kernel = nullptr;
if (conv_param->group_ == 1) {
kernel = CreateDelegateConvFp16(inputs, outputs, opParameter, ctx, primitive);
} else {
kernel = CpuGroupConvFp16KernelCreator(inputs, outputs, opParameter, ctx, primitive);
}

if (kernel == nullptr) {
MS_LOG(DEBUG) << "Create conv fp16 kernel failed.";
if (dequant_flag) {
weight_tensor->FreeData();
weight_tensor->set_data(restore_data);
weight_tensor->set_data_type(restore_type);
}
free(opParameter);
return nullptr;
}

auto ret = kernel->Init();
if (ret != RET_OK) {
MS_LOG(INFO) << "Init fp16 kernel failed, name: " << opParameter->name_
<< ", type: " << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
if (dequant_flag) {
weight_tensor->FreeData();
weight_tensor->set_data(restore_data);
weight_tensor->set_data_type(restore_type);
}
delete kernel;
return nullptr;
}

if (dequant_flag) {
weight_tensor->FreeData();
weight_tensor->set_data(restore_data);
weight_tensor->set_data_type(restore_type);
}
return kernel;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Conv2D, CpuConvFp16KernelCreator)
} // namespace mindspore::kernel

+65 -0 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.h

@@ -0,0 +1,65 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DELEGATE_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DELEGATE_FP16_H_

#include <arm_neon.h>
#include <vector>
#include "src/lite_kernel.h"
#include "nnacl/conv_parameter.h"
#include "nnacl/op_base.h"

#define WEIGHT_NEED_FREE 0b01
#define BIAS_NEED_FREE 0b10
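// need_free_ bit flags: record which of the copied fp16 buffers the
// delegate owns and must release in FreeCopiedData().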

namespace mindspore::kernel {
class ConvolutionDelegateFP16CPUKernel : public LiteKernel {
public:
ConvolutionDelegateFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: LiteKernel(parameter, inputs, outputs, ctx, primitive) {}
~ConvolutionDelegateFP16CPUKernel() override {
FreeCopiedData();
if (fp16_conv_kernel_ != nullptr) {
op_parameter_ = nullptr; // set op_parameter of delegate to nullptr, avoiding double free
delete fp16_conv_kernel_;
fp16_conv_kernel_ = nullptr;
}
}
int GetFp16WeightAndBias();
int GetFp16Weight();
int GetFp16Bias();
float16_t *CopyData(lite::Tensor *tensor);
void FreeCopiedData();
int Init() override;
int ReSize() override;
int Run() override { return fp16_conv_kernel_->Run(); }

private:
uint8_t need_free_ = 0b00;
kernel::LiteKernel *fp16_conv_kernel_ = nullptr;
float16_t *fp16_weight_ = nullptr;
float16_t *fp16_bias_ = nullptr;
};

kernel::LiteKernel *CpuConvFp16KernelSelect(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const lite::InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
float16_t *fp16_weight, float16_t *fp16_bias);
} // namespace mindspore::kernel

#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DELEGATE_FP16_H_
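Together, the .cc and .h above implement the dispatch this PR is named for: the registered Conv2D kernel is now a thin delegate that caches the origin weight/bias and postpones choosing the concrete convolution kernel (1x1, winograd, or generic) until ReSize, when shapes are known. A minimal sketch of the pattern, with illustrative names (Kernel, ConvDelegate, SelectKernel) rather than the real MindSpore classes:

#include <memory>

struct Kernel {
  virtual ~Kernel() = default;
  virtual int ReSize() = 0;
  virtual int Run() = 0;
};

// Stand-in for the three concrete kernels (1x1 / winograd / generic).
struct GenericConv : Kernel {
  int ReSize() override { return 0; }
  int Run() override { return 0; }
};

struct ConvShape {
  int kernel_h = 3;
  int kernel_w = 3;
  bool winograd_ok = false;
};

std::unique_ptr<Kernel> SelectKernel(const ConvShape &s) {
  if (s.kernel_h == 1 && s.kernel_w == 1) {
    return std::make_unique<GenericConv>();  // Convolution1x1 in the real code
  }
  if (s.winograd_ok) {
    return std::make_unique<GenericConv>();  // ConvolutionWinograd in the real code
  }
  return std::make_unique<GenericConv>();    // generic im2col kernel
}

struct ConvDelegate : Kernel {
  ConvShape shape;                // reliable only after shape inference
  std::unique_ptr<Kernel> inner;  // chosen lazily, exactly once
  int ReSize() override {
    if (inner == nullptr) inner = SelectKernel(shape);
    return inner->ReSize();       // later resizes reuse the same kernel
  }
  int Run() override { return inner->Run(); }
};

The payoff is that Init() no longer has to guess: models whose shapes are only known at execution time still get the right specialized kernel, and the temporary fp16 copies of weight and bias can be freed as soon as the chosen kernel has repacked them.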

+10 -335 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc

@@ -16,19 +16,16 @@

#include "src/runtime/kernel/arm/fp16/convolution_fp16.h"
#include <vector>
#include "src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h"
#include "src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h"
#include "src/runtime/kernel/arm/fp16/group_convolution_fp16.h"
#include "nnacl/fp16/conv_fp16.h"
#include "nnacl/fp16/cast_fp16.h"
#include "nnacl/fp16/pack_fp16.h"
#include "src/runtime/kernel/arm/fp16/layout_transform_fp16.h"
#include "include/errorcode.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "src/runtime/runtime_api.h"
#include "nnacl/fp16/winograd_utils_fp16.h"
#include "src/runtime/kernel/arm/base/dequant.h"
#include "nnacl/fp16/conv_fp16.h"
#include "nnacl/fp16/matmul_fp16.h"
#include "nnacl/fp16/cast_fp16.h"
#include "nnacl/fp16/pack_fp16.h"
#include "nnacl/fp16/winograd_utils_fp16.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
@@ -49,23 +46,13 @@ int ConvolutionFP16CPUKernel::InitWeightBias() {
int pack_weight_size = oc8 * in_channel * kernel_plane;

// init weight
auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteFilter();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get Execute filter failed.";
return ret;
}
packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
if (packed_weight_ == nullptr) {
MS_LOG(ERROR) << "malloc packed_weight_ failed.";
return RET_ERROR;
}
memset(packed_weight_, 0, pack_weight_size * sizeof(float16_t));
RowMajor2Col8MajorFp16(execute_weight_, packed_weight_, out_channel, in_channel * kernel_plane, false);
if (fp16_weight_ != nullptr) {
free(fp16_weight_);
fp16_weight_ = nullptr;
execute_weight_ = nullptr;
}
RowMajor2Col8MajorFp16(fp16_weight_, packed_weight_, out_channel, in_channel * kernel_plane, false);

// init bias
bias_data_ = malloc(oc8 * sizeof(float16_t));
@@ -74,12 +61,9 @@ int ConvolutionFP16CPUKernel::InitWeightBias() {
return RET_ERROR;
}
memset(bias_data_, 0, oc8 * sizeof(float16_t));
auto fp16_bias_data = reinterpret_cast<float16_t *>(bias_data_);
if (in_tensors_.size() == kInputSize2) {
auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->data_c());
for (int i = 0; i < out_channel; ++i) {
fp16_bias_data[i] = (float16_t)ori_bias[i];
}
auto fp16_bias_data = reinterpret_cast<float16_t *>(bias_data_);
memcpy(fp16_bias_data, fp16_bias_, out_channel * sizeof(float16_t));
} else {
MS_ASSERT(in_tensors_.size() == kInputSize1);
}
@@ -111,10 +95,7 @@ int ConvolutionFP16CPUKernel::Init() {
MS_LOG(ERROR) << "Init weight bias failed.";
return RET_ERROR;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
return RET_OK;
}

int ConvolutionFP16CPUKernel::ReSize() {
@@ -123,7 +104,6 @@ int ConvolutionFP16CPUKernel::ReSize() {
MS_LOG(ERROR) << "Resize is invalid.";
return ret;
}

ret = ConvolutionBaseCPUKernel::Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvolutionBase init fail!ret: " << ret;
@@ -173,309 +153,4 @@ int ConvolutionFP16CPUKernel::Run() {
FreeTmpBuffer();
return ret;
}

ConvParameter *CreateNewConvParameterFp16(ConvParameter *parameter) {
auto conv_parameter = reinterpret_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
if (conv_parameter == nullptr) {
MS_LOG(ERROR) << "Malloc new conv parameter failed.";
return nullptr;
}
memcpy(conv_parameter, parameter, sizeof(ConvParameter));
return conv_parameter;
}

kernel::LiteKernel *CpuConvFp16KernelSelect(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
bool use_winograd, int out_unit) {
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
if (conv_param->kernel_h_ == 1 && conv_param->kernel_w_ == 1) {
return new (std::nothrow) kernel::Convolution1x1FP16CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
} else if (use_winograd) {
return new (std::nothrow)
kernel::ConvolutionWinogradFP16CPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit);
} else {
return new (std::nothrow) kernel::ConvolutionFP16CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
}
return nullptr;
}

void FreeMemoryFp16(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
const std::vector<lite::Tensor *> &new_outputs) {
for (auto sub_conv : group_convs) {
delete sub_conv;
}
for (auto in_tensor : new_inputs) {
delete in_tensor;
}
for (auto out_tensor : new_outputs) {
delete out_tensor;
}
}

lite::Tensor *CreateInputTensorFp16(TypeId data_type, std::vector<int> in_shape, bool infered_flag) {
auto in_tensor = new (std::nothrow) lite::Tensor(data_type, in_shape, Format_NHWC, lite::Tensor::Category::VAR);
if (in_tensor == nullptr) {
MS_LOG(ERROR) << "new in_tensor failed.";
return nullptr;
}
if (infered_flag) {
auto ret = in_tensor->MallocData();
if (ret != RET_OK) {
delete in_tensor;
MS_LOG(ERROR) << "in tensor malloc failed.";
return nullptr;
}
}
return in_tensor;
}

lite::Tensor *CreateFilterTensorFp16(TypeId data_type, std::vector<int> filter_shape,
const std::vector<lite::Tensor *> &inputs, int copy_length, int index) {
auto filter_tensor =
new (std::nothrow) lite::Tensor(data_type, filter_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
if (filter_tensor == nullptr) {
MS_LOG(ERROR) << "new filter_tensor failed.";
return nullptr;
}
auto ret = filter_tensor->MallocData();
if (ret != RET_OK) {
delete filter_tensor;
MS_LOG(ERROR) << "filter_tensor malloc failed.";
return nullptr;
}
if (data_type == kNumberTypeFloat16) {
auto *origin_weight = reinterpret_cast<float16_t *>(inputs.at(kWeightIndex)->data_c());
memcpy(filter_tensor->data_c(), origin_weight + index * copy_length, copy_length * sizeof(float16_t));
} else {
MS_ASSERT(data_type == kNumberTypeFloat32);
auto *origin_weight = reinterpret_cast<float *>(inputs.at(kWeightIndex)->data_c());
memcpy(filter_tensor->data_c(), origin_weight + index * copy_length, copy_length * sizeof(float));
}
return filter_tensor;
}

lite::Tensor *CreateBiasTensorFp16(TypeId data_type, std::vector<int> bias_shape,
const std::vector<lite::Tensor *> &inputs, int new_out_channel, int index) {
auto *origin_bias = inputs.at(kBiasIndex)->data_c();
auto bias_tensor =
new (std::nothrow) lite::Tensor(data_type, bias_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
if (bias_tensor == nullptr) {
MS_LOG(ERROR) << "new bias_tensor failed.";
return nullptr;
}
auto ret = bias_tensor->MallocData();
if (ret != RET_OK) {
delete bias_tensor;
MS_LOG(ERROR) << "bias_tensor malloc failed.";
return nullptr;
}
if (data_type == kNumberTypeFloat16) {
auto bias_data = reinterpret_cast<float16_t *>(origin_bias);
memcpy(bias_tensor->data_c(), bias_data + index * new_out_channel, new_out_channel * sizeof(float16_t));
} else {
MS_ASSERT(data_type == kNumberTypeFloat32);
auto bias_data = reinterpret_cast<float *>(origin_bias);
memcpy(bias_tensor->data_c(), bias_data + index * new_out_channel, new_out_channel * sizeof(float));
}
return bias_tensor;
}

lite::Tensor *CreateOutputTensorFp16(std::vector<int> out_shape, const std::vector<lite::Tensor *> &outputs,
bool infered_flag, int index) {
auto out_tensor = new (std::nothrow) lite::Tensor();
if (out_tensor == nullptr) {
MS_LOG(ERROR) << "new tmp_out_tensor failed.";
return nullptr;
}
out_tensor->set_data_type(mindspore::kNumberTypeFloat16);
out_tensor->set_format(outputs.at(index)->format());
if (infered_flag) {
out_tensor->set_shape(out_shape);
auto ret = out_tensor->MallocData();
if (ret != RET_OK) {
delete out_tensor;
MS_LOG(ERROR) << "out_tensor malloc data failed.";
return nullptr;
}
}
return out_tensor;
}

kernel::LiteKernel *CpuGroupConvFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
int group) {
int out_unit;
bool has_bias = inputs.size() == 3;
bool use_winograd = false;
bool infered_flag = (primitive != nullptr && primitive->infer_flag());
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);

// update new shape info for each sub kernel
int new_in_channel = inputs.at(kWeightIndex)->Channel();
int new_out_channel = 0;
if (group == 0) {
MS_LOG(ERROR) << "Divisor 'group' cannot be 0.";
return nullptr;
} else {
new_out_channel = inputs.at(kWeightIndex)->Batch() / group;
}

std::vector<int> in_shape;
std::vector<int> out_shape;
int batch = inputs.front()->Batch();
conv_param->input_batch_ = batch;
conv_param->output_batch_ = batch;
if (infered_flag) {
conv_param->input_channel_ = new_in_channel;
conv_param->output_channel_ = new_out_channel;
CheckIfUseWinogradFp16(&use_winograd, &out_unit, conv_param);
in_shape = {batch, inputs.front()->Height(), inputs.front()->Width(), new_in_channel};
out_shape = {batch, conv_param->output_h_, conv_param->output_w_, new_out_channel};
}
std::vector<int> filter_shape = {new_out_channel, conv_param->kernel_h_, conv_param->kernel_w_, new_in_channel};
std::vector<int> bias_shape = {new_out_channel};

// new group conv op
std::vector<kernel::LiteKernel *> group_convs;
// create tensors for every sub conv kernel
for (int i = 0; i < group; ++i) {
std::vector<lite::Tensor *> new_inputs;
std::vector<lite::Tensor *> new_outputs;
auto new_conv_parameter = CreateNewConvParameterFp16(conv_param);
if (new_conv_parameter == nullptr) {
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "Get new conv parameter failed.";
return nullptr;
}
// create new input for each group
auto in_tensor = CreateInputTensorFp16(mindspore::kNumberTypeFloat16, in_shape, infered_flag);
if (in_tensor == nullptr) {
delete new_conv_parameter;
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "create input tensor failed.";
return nullptr;
}
new_inputs.emplace_back(in_tensor);

// create new weight
int copy_length = conv_param->kernel_h_ * conv_param->kernel_w_ * new_in_channel * new_out_channel;
auto filter_tensor =
CreateFilterTensorFp16(inputs.at(kWeightIndex)->data_type(), filter_shape, inputs, copy_length, i);
if (filter_tensor == nullptr) {
delete new_conv_parameter;
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "create filter tensor failed.";
return nullptr;
}
new_inputs.emplace_back(filter_tensor);

// if has bias, create new bias
if (has_bias) {
auto bias_tensor =
CreateBiasTensorFp16(inputs.at(kBiasIndex)->data_type(), bias_shape, inputs, new_out_channel, i);
if (bias_tensor == nullptr) {
delete new_conv_parameter;
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "create bias_tensor failed.";
return nullptr;
}
new_inputs.emplace_back(bias_tensor);
}

// create new output tensors
for (size_t j = 0; j < outputs.size(); ++j) {
auto out_tensor = CreateOutputTensorFp16(out_shape, outputs, infered_flag, j);
if (out_tensor == nullptr) {
delete new_conv_parameter;
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "new out_tensor failed.";
return nullptr;
}
new_outputs.emplace_back(out_tensor);
}
group_convs.emplace_back(CpuConvFp16KernelSelect(new_inputs, new_outputs,
reinterpret_cast<OpParameter *>(new_conv_parameter), ctx,
primitive, use_winograd, out_unit));
}

return new (std::nothrow)
GroupConvolutionFP16CPUKernel(op_parameter, inputs, outputs, ctx, primitive, group_convs, group);
}

kernel::LiteKernel *CpuConvFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
const InnerContext *ctx, const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
MS_ASSERT(opParameter != nullptr);
MS_ASSERT(desc.type == schema::PrimitiveType_Conv2D);

auto *weight_tensor = inputs.at(kWeightIndex);
auto *restore_data = weight_tensor->data_c();
auto restore_type = weight_tensor->data_type();
bool dequant_flag =
!weight_tensor->quant_params().empty() && weight_tensor->quant_params().front().inited && restore_data != nullptr;
if (dequant_flag) {
auto *dequant_weight = kernel::DequantUtil::DequantWeight(weight_tensor);
if (dequant_weight == nullptr) {
MS_LOG(ERROR) << "dequant data is nullptr.";
free(opParameter);
return nullptr;
}
weight_tensor->set_data_type(kNumberTypeFloat32);
weight_tensor->set_data(dequant_weight);
}

auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
bool use_winograd = false;
int out_unit;
if (primitive != nullptr && primitive->infer_flag()) {
conv_param->input_h_ = inputs.front()->Height();
conv_param->input_w_ = inputs.front()->Width();
conv_param->input_channel_ = inputs.front()->Channel();
conv_param->output_h_ = outputs.front()->Height();
conv_param->output_w_ = outputs.front()->Width();
conv_param->output_channel_ = outputs.front()->Channel();
conv_param->op_parameter_.thread_num_ = ctx->thread_num_;
CheckIfUseWinogradFp16(&use_winograd, &out_unit, conv_param);
}
int group = conv_param->group_;
kernel::LiteKernel *kernel = nullptr;
if (group == 1) {
kernel = CpuConvFp16KernelSelect(inputs, outputs, opParameter, ctx, primitive, use_winograd, out_unit);
} else {
kernel = CpuGroupConvFp16KernelCreator(inputs, outputs, opParameter, ctx, primitive, group);
}

if (kernel == nullptr) {
MS_LOG(DEBUG) << "Create conv fp16 kernel failed.";
if (dequant_flag) {
weight_tensor->FreeData();
weight_tensor->set_data(restore_data);
weight_tensor->set_data_type(restore_type);
}
free(opParameter);
return nullptr;
}
auto ret = kernel->Init();
if (ret != RET_OK) {
MS_LOG(INFO) << "Init fp16 kernel failed, name: " << opParameter->name_
<< ", type: " << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
if (dequant_flag) {
weight_tensor->FreeData();
weight_tensor->set_data(restore_data);
weight_tensor->set_data_type(restore_type);
}
delete kernel;
return nullptr;
}
if (dequant_flag) {
weight_tensor->FreeData();
weight_tensor->set_data(restore_data);
weight_tensor->set_data_type(restore_type);
}
return kernel;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Conv2D, CpuConvFp16KernelCreator)
} // namespace mindspore::kernel

+6 -6 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h

@@ -27,13 +27,11 @@ class ConvolutionFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
public:
ConvolutionFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
const mindspore::lite::PrimitiveC *primitive, float16_t *fp16_weight, float16_t *fp16_bias)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive),
fp16_weight_(fp16_weight),
fp16_bias_(fp16_bias) {}
~ConvolutionFP16CPUKernel() override {
if (fp16_weight_ != nullptr) {
free(fp16_weight_);
fp16_weight_ = nullptr;
}
if (packed_weight_ != nullptr) {
free(packed_weight_);
packed_weight_ = nullptr;
@@ -58,6 +56,8 @@ class ConvolutionFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
col_major_input_ = nullptr;
}
}
float16_t *fp16_weight_; // do not free
float16_t *fp16_bias_; // do not free
float16_t *packed_input_ = nullptr;
float16_t *packed_weight_ = nullptr;
float16_t *col_major_input_ = nullptr;


+6 -27 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc

@@ -43,12 +43,6 @@ int ConvolutionWinogradFP16CPUKernel::InitWeightBias() {
int oc_block_num = UP_DIV(out_channel, C8NUM);

// init weight
auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteFilter();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get Execute filter failed.";
return ret;
}

// set data
auto trans_matrix_data_size = input_unit_ * input_unit_ * in_channel * oc_block_num * oc_block * sizeof(float16_t);
trans_weight_ = reinterpret_cast<float16_t *>(malloc(trans_matrix_data_size));
@@ -68,21 +62,17 @@ int ConvolutionWinogradFP16CPUKernel::InitWeightBias() {
if (input_unit_ == 8) {
coef = 0.5f;
}
ret = CookToomFilter(matrix_a, matrix_at, matrix_b, matrix_bt, matrix_g, matrix_gt, coef, output_unit_, kernel_unit_);
auto ret =
CookToomFilter(matrix_a, matrix_at, matrix_b, matrix_bt, matrix_g, matrix_gt, coef, output_unit_, kernel_unit_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "get matrix g from CookToomFilter failed.";
return ret;
}
ret = WinogradFilterTransformFp16(execute_weight_, matrix_g, matrix_gt, oc_block);
ret = WinogradFilterTransformFp16(fp16_origin_weight_, matrix_g, matrix_gt, oc_block);
if (ret != RET_OK) {
MS_LOG(ERROR) << "winograd filter transfrom failed.";
return ret;
}
if (fp16_weight_ != nullptr) {
free(fp16_weight_);
fp16_weight_ = nullptr;
execute_weight_ = nullptr;
}

// init bias
bias_data_ = malloc(oc_block_num * oc_block * sizeof(float16_t));
@@ -93,10 +83,7 @@ int ConvolutionWinogradFP16CPUKernel::InitWeightBias() {
memset(bias_data_, 0, oc_block_num * oc_block * sizeof(float16_t));
auto fp16_bias_data = reinterpret_cast<float16_t *>(bias_data_);
if (in_tensors_.size() == kInputSize2) {
auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->MutableData());
for (int i = 0; i < out_channel; ++i) {
fp16_bias_data[i] = (float16_t)ori_bias[i];
}
memcpy(fp16_bias_data, fp16_bias_, out_channel * sizeof(float16_t));
} else {
MS_ASSERT(in_tensors_.size() == kInputSize1);
}
@@ -163,15 +150,13 @@ int ConvolutionWinogradFP16CPUKernel::Init() {
input_unit_ = output_unit_ + kernel_unit_ - 1;
conv_param_->input_unit_ = input_unit_;
conv_param_->output_unit_ = output_unit_;

auto ret = InitWeightBias();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init weight bias failed.";
return RET_ERROR;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
return RET_OK;
}

int ConvolutionWinogradFP16CPUKernel::ReSize() {
@@ -180,17 +165,11 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() {
MS_LOG(ERROR) << "Resize is invalid.";
return ret;
}

ret = ConvolutionBaseCPUKernel::Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvolutionBase init failed.";
return RET_ERROR;
}
kernel_unit_ = conv_param_->kernel_h_;
input_unit_ = output_unit_ + kernel_unit_ - 1;
conv_param_->input_unit_ = input_unit_;
conv_param_->output_unit_ = output_unit_;

ret = ConfigInputOutput();
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConfigInputOutput failed.";


+8 -6 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h

@@ -31,13 +31,13 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
public:
ConvolutionWinogradFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive, int out_unit)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive), output_unit_(out_unit) {}
const mindspore::lite::PrimitiveC *primitive, int out_unit, float16_t *fp16_weight,
float16_t *fp16_bias)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive),
output_unit_(out_unit),
fp16_origin_weight_(fp16_weight),
fp16_bias_(fp16_bias) {}
~ConvolutionWinogradFP16CPUKernel() override {
if (fp16_weight_ != nullptr) {
free(fp16_weight_);
fp16_weight_ = nullptr;
}
if (trans_weight_ != nullptr) {
free(trans_weight_);
trans_weight_ = nullptr;
@@ -75,6 +75,8 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
int kernel_unit_;
int input_unit_;
int output_unit_;
float16_t *fp16_origin_weight_; // do not free
float16_t *fp16_bias_; // do not free
float16_t *tmp_data_ = nullptr;
float16_t *trans_input_ = nullptr;
float16_t *gemm_out_ = nullptr;


+8 -12 mindspore/lite/src/runtime/kernel/arm/fp16/group_convolution_fp16.cc

@@ -87,11 +87,8 @@ int GroupConvolutionFP16CPUKernel::PreProcess() {
std::vector<int> out_shape;
for (int i = 0; i < group_num_; ++i) {
// in
int in_batch = conv_param_->input_batch_;
int in_h = conv_param_->input_h_;
int in_w = conv_param_->input_w_;
int in_c = conv_param_->input_channel_;
in_shape = {in_batch, in_h, in_w, in_c};
auto in_tensor = in_tensors_.front();
in_shape = {in_tensor->Batch(), in_tensor->Height(), in_tensor->Width(), conv_param_->input_channel_};
auto sub_kernel_in_tensor = group_convs_.at(i)->in_tensors().front();
sub_kernel_in_tensor->set_shape(in_shape);
ret = sub_kernel_in_tensor->MallocData();
@@ -101,11 +98,8 @@ int GroupConvolutionFP16CPUKernel::PreProcess() {
return ret;
}
// out
int out_batch = conv_param_->output_batch_;
int out_h = conv_param_->output_h_;
int out_w = conv_param_->output_w_;
int out_c = conv_param_->output_channel_;
out_shape = {out_batch, out_h, out_w, out_c};
auto out_tensor = out_tensors_.front();
out_shape = {out_tensor->Batch(), out_tensor->Height(), out_tensor->Width(), conv_param_->output_channel_};
auto sub_kernel_out_tensors = group_convs_[i]->out_tensors();
for (auto tensor : sub_kernel_out_tensors) {
tensor->set_shape(out_shape);
@@ -139,7 +133,8 @@ int GroupConvolutionFP16CPUKernel::PreProcess() {

int GroupConvolutionFP16CPUKernel::SeparateInput(int group_id) {
// input may either be float32 or float16
int in_plane = conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_batch_;
auto in_tensor = in_tensors_.front();
int in_plane = in_tensor->Height() * in_tensor->Width() * in_tensor->Batch();
int sub_in_channel = conv_param_->input_channel_;
int ori_in_channel = sub_in_channel * group_num_;
auto sub_in_data = group_convs_.at(group_id)->in_tensors().front()->data_c();
@@ -179,7 +174,8 @@ int GroupConvolutionFP16CPUKernel::SeparateInput(int group_id) {

void GroupConvolutionFP16CPUKernel::PostConcat(int group_id) {
// output is must float16 data type
int out_plane = conv_param_->output_h_ * conv_param_->output_w_ * conv_param_->output_batch_;
auto out_tensor = out_tensors_.front();
int out_plane = out_tensor->Height() * out_tensor->Width() * out_tensor->Batch();
int sub_out_channel = conv_param_->output_channel_;
int ori_out_channel = sub_out_channel * group_num_;
auto sub_out_data = reinterpret_cast<float16_t *>(group_convs_.at(group_id)->out_tensors().front()->data_c());


+27 -0 mindspore/lite/src/runtime/kernel/arm/fp32/adder_fp32.cc

@@ -31,6 +31,33 @@ using mindspore::schema::PrimitiveType_Adder;
using mindspore::schema::Format::Format_NHWC;

namespace mindspore::kernel {
int AdderCPUKernel::Init() {
auto ret = InitWeightBias();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init weight bias failed.";
return RET_ERROR;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
}

int AdderCPUKernel::ReSize() {
auto ret = ConvolutionBaseCPUKernel::CheckResizeValid();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Resize is invalid.";
return ret;
}

ret = ConvolutionBaseCPUKernel::Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvolutionBase init failed.";
return ret;
}
return RET_OK;
}

int AdderCPUKernel::InitWeightBias() {
auto filter_tensor = in_tensors_.at(kWeightIndex);
int kernel_h = filter_tensor->Height();


+3 -1 mindspore/lite/src/runtime/kernel/arm/fp32/adder_fp32.h

@@ -29,10 +29,12 @@ class AdderCPUKernel : public ConvolutionCPUKernel {
AdderCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: ConvolutionCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
: ConvolutionCPUKernel(parameter, inputs, outputs, ctx, primitive, nullptr, nullptr) {}
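// Adder overrides InitWeightBias() and reads the weight/bias tensors
// directly, so it passes nullptr for the new origin-data parameters.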
~AdderCPUKernel() override = default;

int InitWeightBias() override;
int Init() override;
int ReSize() override;
int Run() override;
int RunImpl(int task_id) override;
};


+12 -15 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc

@@ -44,10 +44,13 @@ void Convolution1x1CPUKernel::FreeTmpBuffer() {

int Convolution1x1CPUKernel::ReSize() {
FreeTmpBuffer();
ConvolutionBaseCPUKernel::Init();
auto error_code = ConvolutionBaseCPUKernel::Init();
if (error_code != RET_OK) {
MS_LOG(ERROR) << "conv base init failed.";
return error_code;
}
InitConv1x1MatmulParam();

int error_code = InitConv1x1Param();
error_code = InitConv1x1Param();
if (error_code != RET_OK) {
MS_LOG(ERROR) << "Convolution base init failed.";
return error_code;
@@ -95,7 +98,7 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!";
return RET_ERROR;
}
memcpy(bias_data_, in_tensors_[kBiasIndex]->MutableData(), weight_size);
memcpy(bias_data_, origin_bias_, weight_size);
memset(reinterpret_cast<char *>(bias_data_) + weight_size, 0, size - weight_size);
}

@@ -108,14 +111,11 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
}
memset(reinterpret_cast<char *>(weight_ptr_) + down_size, 0, size - down_size);
#ifdef ENABLE_AVX
RowMajor2Col16Major(reinterpret_cast<float *>(filter_tensor->MutableData()), weight_ptr_, output_channel,
input_channel);
RowMajor2Col16Major(origin_weight_, weight_ptr_, output_channel, input_channel);
#elif defined(ENABLE_ARM32)
RowMajor2Col4Major(reinterpret_cast<float *>(filter_tensor->MutableData()), weight_ptr_, output_channel,
input_channel);
RowMajor2Col4Major(origin_weight_, weight_ptr_, output_channel, input_channel);
#else
RowMajor2Col8Major(reinterpret_cast<float *>(filter_tensor->MutableData()), weight_ptr_, output_channel,
input_channel);
RowMajor2Col8Major(origin_weight_, weight_ptr_, output_channel, input_channel);
#endif
return RET_OK;
}
@@ -153,13 +153,10 @@ int Convolution1x1CPUKernel::Init() {
}
int error_code = InitConv1x1BiasWeight();
if (error_code != RET_OK) {
MS_LOG(ERROR) << "Convolution base init failed.";
MS_LOG(ERROR) << "Convolution1x1 init weight and bias failed.";
return error_code;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
return RET_OK;
}

void Convolution1x1CPUKernel::PackMatmulInput(const float *src_ptr, float *dst_ptr, int row, int col) {


+6 -2 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.h

@@ -34,8 +34,10 @@ class Convolution1x1CPUKernel : public ConvolutionBaseCPUKernel {
public:
Convolution1x1CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
const mindspore::lite::PrimitiveC *primitive, float *origin_weight, float *origin_bias)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive),
origin_weight_(origin_weight),
origin_bias_(origin_bias) {}
~Convolution1x1CPUKernel();
int Init() override;
int Run() override;
@@ -58,6 +60,8 @@ class Convolution1x1CPUKernel : public ConvolutionBaseCPUKernel {
bool multi_thread_by_hw_ = false;
int thread_count_ = 0;
int thread_stride_ = 0;
float *origin_weight_; // do not free
float *origin_bias_; // do not free
float *weight_ptr_ = nullptr;
float *pack_input_ = nullptr;
float *input_ptr_ = nullptr;


+416 -0 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.cc

@@ -0,0 +1,416 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp32/convolution_delegate_fp32.h"
#include "src/runtime/kernel/arm/fp32/convolution_fp32.h"
#include "src/runtime/kernel/arm/fp32/convolution_1x1_fp32.h"
#include "src/runtime/kernel/arm/fp32/convolution_winograd_fp32.h"
#include "src/runtime/kernel/arm/fp32/group_convolution_fp32.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "src/runtime/runtime_api.h"
#include "src/runtime/kernel/arm/base/dequant.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_INFER_INVALID;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Conv2D;
using mindspore::schema::Format::Format_NHWC;

namespace mindspore::kernel {
float *ConvolutionDelegateCPUKernel::CopyData(lite::Tensor *tensor) {
auto data = reinterpret_cast<float *>(malloc(tensor->Size()));
if (data == nullptr) {
MS_LOG(ERROR) << "Malloc data failed.";
return nullptr;
}
memcpy(data, tensor->data_c(), tensor->Size());
return data;
}

void ConvolutionDelegateCPUKernel::FreeCopiedData() {
if (origin_weight_ != nullptr && need_free_weight_) {
free(origin_weight_);
origin_weight_ = nullptr;
}
if (origin_bias_ != nullptr && need_free_bias_) {
free(origin_bias_);
origin_bias_ = nullptr;
}
}

int ConvolutionDelegateCPUKernel::GetWeightAndBias() {
auto ret = GetWeightData();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get weight data failed.";
return ret;
}
ret = GetBiasData();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get bias data failed.";
return ret;
}
return RET_OK;
}

int ConvolutionDelegateCPUKernel::GetWeightData() {
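// Same rule as the fp16 delegate: use the tensor's buffer directly once
// shapes are inferred, otherwise keep a private copy until the concrete
// kernel has repacked it.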
if (InferShapeDone()) {
origin_weight_ = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->data_c());
return RET_OK;
} else {
origin_weight_ = CopyData(in_tensors_.at(kWeightIndex));
if (origin_weight_ == nullptr) {
MS_LOG(ERROR) << "Copy weight data failed.";
return RET_ERROR;
}
need_free_weight_ = true;
return RET_OK;
}
return RET_OK;
}

int ConvolutionDelegateCPUKernel::GetBiasData() {
if (in_tensors_.size() == 3) {
if (InferShapeDone()) {
origin_bias_ = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->data_c());
return RET_OK;
} else {
origin_bias_ = CopyData(in_tensors_.at(kBiasIndex));
if (origin_bias_ == nullptr) {
MS_LOG(ERROR) << "Copy bias data failed.";
return RET_ERROR;
}
need_free_bias_ = true;
return RET_OK;
}
}
return RET_OK;
}

int ConvolutionDelegateCPUKernel::Init() {
auto ret = GetWeightAndBias();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get weight and bias failed.";
return ret;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
}

int ConvolutionDelegateCPUKernel::ReSize() {
// Update shape info of input and output
SetInputOutputShapeInfo(reinterpret_cast<ConvParameter *>(op_parameter_), in_tensors_.front(), out_tensors_.front(),
context_);
if (conv_kernel_ == nullptr) {
// need to select actual execute kernel here
conv_kernel_ = CpuConvFp32KernelSelect(in_tensors_, out_tensors_, op_parameter_, context_, primitive_,
origin_weight_, origin_bias_);
if (conv_kernel_ == nullptr) {
MS_LOG(ERROR) << "Selecting execute kernel failed for conv_kernel, got a nullptr.";
return RET_ERROR;
}
}
FreeCopiedData();
return conv_kernel_->ReSize();
}

void SetInputOutputShapeInfo(ConvParameter *conv_param, const lite::Tensor *input, const lite::Tensor *output,
const InnerContext *ctx) {
conv_param->input_batch_ = input->Batch();
conv_param->input_h_ = input->Height();
conv_param->input_w_ = input->Width();
conv_param->input_channel_ = input->Channel();
conv_param->output_batch_ = output->Batch();
conv_param->output_h_ = output->Height();
conv_param->output_w_ = output->Width();
conv_param->output_channel_ = output->Channel();
conv_param->op_parameter_.thread_num_ = ctx->thread_num_;
}

ConvParameter *CreateNewConvParameter(ConvParameter *parameter) {
auto conv_parameter = new (std::nothrow) ConvParameter;
if (conv_parameter == nullptr) {
MS_LOG(ERROR) << "Malloc new conv parameter failed.";
return nullptr;
}
memcpy(conv_parameter, parameter, sizeof(ConvParameter));
return conv_parameter;
}

void FreeMemory(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
const std::vector<lite::Tensor *> &new_outputs) {
for (auto sub_conv : group_convs) {
delete sub_conv;
}
for (auto in_tensor : new_inputs) {
delete in_tensor;
}
for (auto out_tensor : new_outputs) {
delete out_tensor;
}
}

lite::Tensor *CreateInputTensor(TypeId data_type, const std::vector<int> &in_shape, bool infered_flag) {
auto in_tensor = new (std::nothrow) lite::Tensor(data_type, in_shape, Format_NHWC, lite::Tensor::Category::VAR);
if (in_tensor == nullptr) {
MS_LOG(ERROR) << "new in_tensor failed.";
return nullptr;
}
if (infered_flag) {
auto ret = in_tensor->MallocData();
if (ret != RET_OK) {
delete in_tensor;
MS_LOG(ERROR) << "in tensor malloc failed.";
return nullptr;
}
}
return in_tensor;
}

// weight and bias are const
static lite::Tensor *CreateConstTensorFp32(lite::Tensor *tensor, const std::vector<int> &shape, const int index) {
auto new_tensor =
new (std::nothrow) lite::Tensor(tensor->data_type(), shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
if (new_tensor == nullptr) {
MS_LOG(ERROR) << "Create new_tensor failed.";
return nullptr;
}
auto ret = new_tensor->MallocData();
if (ret != RET_OK) {
delete new_tensor;
MS_LOG(ERROR) << "Malloc new_tensor failed.";
return nullptr;
}
MS_ASSERT(tensor->data_type() == kNumberTypeFloat32);
memcpy(new_tensor->data_c(), reinterpret_cast<char *>(tensor->data_c()) + index * new_tensor->Size(),
new_tensor->Size());
return new_tensor;
}

lite::Tensor *CreateOutputTensor(const std::vector<int> &out_shape, const std::vector<lite::Tensor *> &outputs,
bool infered_flag, int index) {
auto out_tensor = new (std::nothrow) lite::Tensor();
if (out_tensor == nullptr) {
MS_LOG(ERROR) << "new tmp_out_tensor failed.";
return nullptr;
}
out_tensor->set_data_type(outputs.at(index)->data_type());
out_tensor->set_format(outputs.at(index)->format());
if (infered_flag) {
out_tensor->set_shape(out_shape);
auto ret = out_tensor->MallocData();
if (ret != RET_OK) {
delete out_tensor;
MS_LOG(ERROR) << "out_tensor malloc data failed.";
return nullptr;
}
}
return out_tensor;
}

kernel::LiteKernel *CpuConvFp32KernelSelect(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
float *origin_weight, float *origin_bias) {
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
bool use_winograd = false;
int out_unit;
CheckIfUseWinograd(&use_winograd, &out_unit, conv_param);
kernel::LiteKernel *kernel = nullptr;
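// Same dispatch as the fp16 path: 1x1, winograd-eligible, or generic.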
if (conv_param->kernel_h_ == 1 && conv_param->kernel_w_ == 1) {
kernel = new (std::nothrow)
kernel::Convolution1x1CPUKernel(op_parameter, inputs, outputs, ctx, primitive, origin_weight, origin_bias);
} else if (use_winograd) {
kernel = new (std::nothrow) kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive,
out_unit, origin_weight, origin_bias);
} else {
kernel = new (std::nothrow)
kernel::ConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive, origin_weight, origin_bias);
}
if (kernel != nullptr) {
auto ret = kernel->Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "conv kernel init failed.";
delete kernel;
return nullptr;
}
}
return kernel;
}

static kernel::LiteKernel *CreateDelegateConv(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive) {
return new (std::nothrow) kernel::ConvolutionDelegateCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
}

kernel::LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive) {
bool infer_flag = primitive != nullptr && primitive->infer_flag();
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
int new_in_channel = inputs.at(kWeightIndex)->Channel();
int new_out_channel;
if (conv_param->group_ == 0) {
MS_LOG(ERROR) << "Divisor 'group' cannot be 0.";
return nullptr;
} else {
new_out_channel = inputs.at(kWeightIndex)->Batch() / conv_param->group_;
}
std::vector<int> in_shape;
std::vector<int> out_shape;
if (infer_flag) {
conv_param->input_channel_ = new_in_channel;
conv_param->output_channel_ = new_out_channel;
in_shape = {inputs.front()->Batch(), inputs.front()->Height(), inputs.front()->Width(), new_in_channel};
out_shape = {inputs.front()->Batch(), outputs.front()->Height(), outputs.front()->Width(), new_out_channel};
}
std::vector<int> filter_shape = {new_out_channel, conv_param->kernel_h_, conv_param->kernel_w_, new_in_channel};
std::vector<int> bias_shape = {new_out_channel};

// create sub kernels
std::vector<kernel::LiteKernel *> group_convs;
for (int i = 0; i < conv_param->group_; ++i) {
std::vector<lite::Tensor *> new_inputs;
std::vector<lite::Tensor *> new_outputs;
auto new_conv_parameter = CreateNewConvParameter(conv_param);
if (new_conv_parameter == nullptr) {
FreeMemory(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "Get new conv parameter failed.";
return nullptr;
}

// create new input for each group
auto in_tensor = CreateInputTensor(inputs.front()->data_type(), in_shape, infer_flag);
if (in_tensor == nullptr) {
delete new_conv_parameter;
FreeMemory(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "create input tensor failed.";
return nullptr;
}
new_inputs.emplace_back(in_tensor);

// create new weight
auto filter_tensor = CreateConstTensorFp32(inputs.at(kWeightIndex), filter_shape, i);
if (filter_tensor == nullptr) {
delete new_conv_parameter;
FreeMemory(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "create filter tensor failed.";
return nullptr;
}
new_inputs.emplace_back(filter_tensor);

    // if a bias exists, create a new bias tensor for this group
if (inputs.size() == 3) {
auto bias_tensor = CreateConstTensorFp32(inputs.at(kBiasIndex), bias_shape, i);
if (bias_tensor == nullptr) {
delete new_conv_parameter;
FreeMemory(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "create bias_tensor failed.";
return nullptr;
}
new_inputs.emplace_back(bias_tensor);
}

// create new output tensor
for (size_t j = 0; j < outputs.size(); ++j) {
auto out_tensor = CreateOutputTensor(out_shape, outputs, infer_flag, j);
if (out_tensor == nullptr) {
delete new_conv_parameter;
FreeMemory(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "new out_tensor failed.";
return nullptr;
}
new_outputs.emplace_back(out_tensor);
}
group_convs.emplace_back(
CreateDelegateConv(new_inputs, new_outputs, reinterpret_cast<OpParameter *>(new_conv_parameter), ctx, primitive));
}
return new (std::nothrow)
GroupConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive, group_convs, conv_param->group_);
}

kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx, const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
MS_ASSERT(op_parameter != nullptr);
MS_ASSERT(desc.type == schema::PrimitiveType_Conv2D);
MS_ASSERT(desc.data_type == kNumberTypeFloat32);

  // if the weight is quantized, dequantize it to float32 data.
auto *weight_tensor = inputs.at(kWeightIndex);
auto *restore_data = weight_tensor->data_c();
auto restore_type = weight_tensor->data_type();
bool dequant_flag =
!weight_tensor->quant_params().empty() && weight_tensor->quant_params().front().inited && restore_data != nullptr;
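  // Temporarily swap in dequantized fp32 weights so Init can pack them; the
  // original quantized buffer is restored on every exit path below.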
if (dequant_flag) {
auto *dequant_weight = kernel::DequantUtil::DequantWeight(weight_tensor);
if (dequant_weight == nullptr) {
MS_LOG(ERROR) << "dequant data is nullptr.";
free(op_parameter);
return nullptr;
}
weight_tensor->set_data(dequant_weight);
}

auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
kernel::LiteKernel *kernel = nullptr;
if (conv_param->group_ == 1) {
kernel = CreateDelegateConv(inputs, outputs, op_parameter, ctx, primitive);
} else {
kernel = CpuGroupConvFp32KernelCreator(inputs, outputs, op_parameter, ctx, primitive);
}

if (kernel == nullptr) {
MS_LOG(ERROR) << "kernel is nullptr.";
if (dequant_flag) {
weight_tensor->FreeData();
weight_tensor->set_data(restore_data);
weight_tensor->set_data_type(restore_type);
}
free(op_parameter);
return nullptr;
}

auto ret = kernel->Init();
if (ret != RET_OK && ret != RET_INFER_INVALID) {
MS_LOG(ERROR) << "Init kernel failed, name: " << op_parameter->name_ << ", type: "
<< schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(op_parameter->type_));
if (dequant_flag) {
weight_tensor->FreeData();
weight_tensor->set_data(restore_data);
weight_tensor->set_data_type(restore_type);
}
delete kernel;
return nullptr;
}

if (dequant_flag) {
weight_tensor->FreeData();
weight_tensor->set_data(restore_data);
weight_tensor->set_data_type(restore_type);
}
return kernel;
}

REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Conv2D, CpuConvFp32KernelCreator)
} // namespace mindspore::kernel
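
Note: taken together, the helpers above let the delegate defer choosing a concrete kernel until tensor shapes are known. The committed ConvolutionDelegateCPUKernel::Init/ReSize bodies are not shown in this excerpt; the sketch below illustrates the wiring only, and base-class member names such as context_ and primitive_ are assumptions.

// Hedged sketch, not the committed code: build the concrete kernel on the
// first ReSize that sees valid shapes, then forward subsequent resizes.
int ConvolutionDelegateCPUKernel::ReSize() {
  // Refresh shape-dependent fields now that inference has run.
  SetInputOutputShapeInfo(reinterpret_cast<ConvParameter *>(op_parameter_),
                          in_tensors_.front(), out_tensors_.front(), context_);
  if (conv_kernel_ == nullptr) {
    // First valid resize: pick 1x1 / winograd / generic; the selector also
    // runs the new kernel's Init().
    conv_kernel_ = CpuConvFp32KernelSelect(in_tensors_, out_tensors_, op_parameter_,
                                           context_, primitive_, origin_weight_,
                                           origin_bias_);
    if (conv_kernel_ == nullptr) {
      MS_LOG(ERROR) << "Select conv kernel failed.";
      return RET_ERROR;
    }
  }
  return conv_kernel_->ReSize();
}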

+ 77
- 0
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.h View File

@@ -0,0 +1,77 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_DELEGATE_FP32_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_DELEGATE_FP32_H_

#include <vector>
#include "src/ops/conv2d.h"
#include "src/lite_kernel.h"
#include "nnacl/conv_parameter.h"
#include "nnacl/op_base.h"

using mindspore::lite::InnerContext;
namespace mindspore::kernel {
class ConvolutionDelegateCPUKernel : public LiteKernel {
public:
ConvolutionDelegateCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: LiteKernel(parameter, inputs, outputs, ctx, primitive) {}
~ConvolutionDelegateCPUKernel() override {
FreeCopiedData();
if (conv_kernel_ != nullptr) {
op_parameter_ = nullptr; // op_parameter will be freed in conv_kernel
delete conv_kernel_;
conv_kernel_ = nullptr;
}
};
int Init() override;
int ReSize() override;
int Run() override { return conv_kernel_->Run(); }
int GetWeightAndBias();
int GetWeightData();
int GetBiasData();
static float *CopyData(lite::Tensor *tensor);
void FreeCopiedData();

protected:
bool need_free_weight_ = false;
bool need_free_bias_ = false;
kernel::LiteKernel *conv_kernel_ = nullptr;
float *origin_weight_ = nullptr;
float *origin_bias_ = nullptr;
};

void SetInputOutputShapeInfo(ConvParameter *conv_param, const lite::Tensor *input, const lite::Tensor *output,
const InnerContext *ctx);

void FreeMemory(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
const std::vector<lite::Tensor *> &new_outputs);

ConvParameter *CreateNewConvParameter(ConvParameter *parameter);

lite::Tensor *CreateInputTensor(TypeId data_type, const std::vector<int> &in_shape, bool infered_flag);

lite::Tensor *CreateOutputTensor(const std::vector<int> &out_shape, const std::vector<lite::Tensor *> &outputs,
bool infered_flag, int index);

kernel::LiteKernel *CpuConvFp32KernelSelect(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
float *origin_weight, float *origin_bias);
} // namespace mindspore::kernel

#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_DELEGATE_FP32_H_
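
A likely rationale for the copied-data members above: if shape inference has not completed at Init time, the delegate cannot build the concrete kernel yet, and the const weight/bias buffers may be released or repacked before ReSize runs, so GetWeightData/GetBiasData presumably fall back to private fp32 copies (with need_free_weight_/need_free_bias_ recording what FreeCopiedData must release). A minimal sketch of the copy step, consistent with the static declaration above but not necessarily the committed body:

float *ConvolutionDelegateCPUKernel::CopyData(lite::Tensor *tensor) {
  auto data = reinterpret_cast<float *>(malloc(tensor->Size()));
  if (data == nullptr) {
    MS_LOG(ERROR) << "malloc copied tensor data failed.";
    return nullptr;
  }
  memcpy(data, tensor->data_c(), tensor->Size());
  return data;
}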

+ 10
- 319
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.cc View File

@@ -15,16 +15,13 @@
*/

#include "src/runtime/kernel/arm/fp32/convolution_fp32.h"
#include "src/runtime/kernel/arm/fp32/convolution_1x1_fp32.h"
#include "src/runtime/kernel/arm/fp32/convolution_winograd_fp32.h"
#include "src/runtime/kernel/arm/fp32/group_convolution_fp32.h"
#include "nnacl/fp32/conv_fp32.h"
#include "nnacl/common_func.h"
#include "include/errorcode.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "src/runtime/runtime_api.h"
#include "src/runtime/kernel/arm/base/dequant.h"
#include "nnacl/fp32/conv_fp32.h"
#include "nnacl/fp32/matmul_fp32.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
@@ -52,7 +49,6 @@ int ConvolutionCPUKernel::InitWeightBias() {
int oc_block_num = UP_ROUND(out_channel, oc_block);
int pack_weight_size = oc_block_num * in_channel * kernel_plane;

auto origin_weight = reinterpret_cast<float *>(filter_tensor->data_c());
packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float)));
if (packed_weight_ == nullptr) {
MS_LOG(ERROR) << "malloc packed weight failed.";
@@ -60,11 +56,11 @@ int ConvolutionCPUKernel::InitWeightBias() {
}
memset(packed_weight_, 0, pack_weight_size * sizeof(float));
#ifdef ENABLE_AVX
RowMajor2Col16Major(origin_weight, packed_weight_, out_channel, in_channel * kernel_plane);
RowMajor2Col16Major(origin_weight_, packed_weight_, out_channel, in_channel * kernel_plane);
#elif ENABLE_ARM32
RowMajor2Col4Major(origin_weight, packed_weight_, out_channel, in_channel * kernel_plane);
RowMajor2Col4Major(origin_weight_, packed_weight_, out_channel, in_channel * kernel_plane);
#else
RowMajor2Col8Major(origin_weight, packed_weight_, out_channel, in_channel * kernel_plane);
RowMajor2Col8Major(origin_weight_, packed_weight_, out_channel, in_channel * kernel_plane);
#endif

bias_data_ = reinterpret_cast<float *>(malloc(oc_block_num * sizeof(float)));
@@ -75,8 +71,7 @@ int ConvolutionCPUKernel::InitWeightBias() {
memset(bias_data_, 0, oc_block_num * sizeof(float));

if (in_tensors_.size() == kInputSize2) {
auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->data_c());
memcpy(bias_data_, ori_bias, out_channel * sizeof(float));
memcpy(bias_data_, origin_bias_, out_channel * sizeof(float));
} else {
MS_ASSERT(in_tensors_.size() == kInputSize1);
}
@@ -114,10 +109,7 @@ int ConvolutionCPUKernel::Init() {
MS_LOG(ERROR) << "Init weight bias failed.";
return RET_ERROR;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
return RET_OK;
}

int ConvolutionCPUKernel::ReSize() {
@@ -126,11 +118,10 @@ int ConvolutionCPUKernel::ReSize() {
MS_LOG(ERROR) << "Resize is invalid.";
return ret;
}

ret = ConvolutionBaseCPUKernel::Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvolutionBase init failed.";
return RET_ERROR;
MS_LOG(ERROR) << "conv base init failed.";
return ret;
}
return RET_OK;
}
@@ -168,304 +159,4 @@ int ConvolutionCPUKernel::Run() {
FreeTmpBuffer();
return ret;
}

ConvParameter *CreateNewConvParameter(ConvParameter *parameter) {
auto conv_parameter = new (std::nothrow) ConvParameter;
if (conv_parameter == nullptr) {
MS_LOG(ERROR) << "Malloc new conv parameter failed.";
return nullptr;
}
memcpy(conv_parameter, parameter, sizeof(ConvParameter));
return conv_parameter;
}

void FreeMemory(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
const std::vector<lite::Tensor *> &new_outputs) {
for (auto sub_conv : group_convs) {
delete sub_conv;
}
for (auto in_tensor : new_inputs) {
delete in_tensor;
}
for (auto out_tensor : new_outputs) {
delete out_tensor;
}
}

lite::Tensor *CreateInputTensor(TypeId data_type, std::vector<int> in_shape, bool infered_flag) {
auto in_tensor = new (std::nothrow) lite::Tensor(data_type, in_shape, Format_NHWC, lite::Tensor::Category::VAR);
if (in_tensor == nullptr) {
MS_LOG(ERROR) << "new in_tensor failed.";
return nullptr;
}
if (infered_flag) {
auto ret = in_tensor->MallocData();
if (ret != RET_OK) {
delete in_tensor;
MS_LOG(ERROR) << "in tensor malloc failed.";
return nullptr;
}
}
return in_tensor;
}

lite::Tensor *CreateFilterTensorFp32(TypeId data_type, std::vector<int> filter_shape,
const std::vector<lite::Tensor *> &inputs, int copy_length, int index) {
auto filter_tensor =
new (std::nothrow) lite::Tensor(data_type, filter_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
if (filter_tensor == nullptr) {
MS_LOG(ERROR) << "new filter_tensor failed.";
return nullptr;
}
auto ret = filter_tensor->MallocData();
if (ret != RET_OK) {
delete filter_tensor;
MS_LOG(ERROR) << "filter_tensor malloc failed.";
return nullptr;
}

MS_ASSERT(data_type == kNumberTypeFloat32);
auto *origin_weight = reinterpret_cast<float *>(inputs.at(kWeightIndex)->data_c());
memcpy(filter_tensor->data_c(), origin_weight + index * copy_length, copy_length * sizeof(float));
return filter_tensor;
}

lite::Tensor *CreateBiasTensorFp32(TypeId data_type, std::vector<int> bias_shape,
const std::vector<lite::Tensor *> &inputs, int new_out_channel, int index) {
auto *origin_bias = inputs.at(kBiasIndex)->data_c();
auto bias_tensor =
new (std::nothrow) lite::Tensor(data_type, bias_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
if (bias_tensor == nullptr) {
MS_LOG(ERROR) << "new bias_tensor failed.";
return nullptr;
}
auto ret = bias_tensor->MallocData();
if (ret != RET_OK) {
delete bias_tensor;
MS_LOG(ERROR) << "bias_tensor malloc failed.";
return nullptr;
}
MS_ASSERT(data_type == kNumberTypeFloat32);
auto bias_data = reinterpret_cast<float *>(origin_bias);
memcpy(bias_tensor->data_c(), bias_data + index * new_out_channel, new_out_channel * sizeof(float));

return bias_tensor;
}

lite::Tensor *CreateOutputTensor(std::vector<int> out_shape, const std::vector<lite::Tensor *> &outputs,
bool infered_flag, int index) {
auto out_tensor = new (std::nothrow) lite::Tensor();
if (out_tensor == nullptr) {
MS_LOG(ERROR) << "new tmp_out_tensor failed.";
return nullptr;
}
out_tensor->set_data_type(outputs.at(index)->data_type());
out_tensor->set_format(outputs.at(index)->format());
if (infered_flag) {
out_tensor->set_shape(out_shape);
auto ret = out_tensor->MallocData();
if (ret != RET_OK) {
delete out_tensor;
MS_LOG(ERROR) << "out_tensor malloc data failed.";
return nullptr;
}
}
return out_tensor;
}

kernel::LiteKernel *CpuConvFp32KernelSelect(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
bool use_winograd, int out_unit) {
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
if (conv_param->kernel_h_ == 1 && conv_param->kernel_w_ == 1) {
return new (std::nothrow) kernel::Convolution1x1CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
} else if (use_winograd) {
return new (std::nothrow)
kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit);
} else {
return new (std::nothrow) kernel::ConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
}
return nullptr;
}

kernel::LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
int group) {
int out_unit;
bool has_bias = inputs.size() == 3;
bool use_winograd = false;
bool infered_flag = primitive != nullptr && primitive->infer_flag();
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);

std::vector<int> in_shape;
std::vector<int> out_shape;
int new_in_channel = inputs.at(kWeightIndex)->Channel();
int new_out_channel = 0;
if (group == 0) {
MS_LOG(ERROR) << "Divisor 'group' cannot be 0.";
return nullptr;
} else {
new_out_channel = inputs.at(kWeightIndex)->Batch() / group;
}
int batch = inputs.front()->Batch();
conv_param->input_batch_ = batch;
conv_param->output_batch_ = batch;
if (infered_flag) {
int in_h = inputs.front()->Height();
int in_w = inputs.front()->Width();
conv_param->input_channel_ = new_in_channel;
conv_param->output_channel_ = new_out_channel;
CheckIfUseWinograd(&use_winograd, &out_unit, conv_param);
in_shape = {batch, in_h, in_w, new_in_channel};
out_shape = {batch, conv_param->output_h_, conv_param->output_w_, new_out_channel};
}
std::vector<int> filter_shape = {new_out_channel, conv_param->kernel_h_, conv_param->kernel_w_, new_in_channel};
std::vector<int> bias_shape = {new_out_channel};

// create sub kernels
std::vector<kernel::LiteKernel *> group_convs;
for (int i = 0; i < group; ++i) {
std::vector<lite::Tensor *> new_inputs;
std::vector<lite::Tensor *> new_outputs;
auto new_conv_parameter = CreateNewConvParameter(conv_param);
if (new_conv_parameter == nullptr) {
FreeMemory(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "Get new conv parameter failed.";
return nullptr;
}

// create new input for each group
auto in_tensor = CreateInputTensor(inputs.front()->data_type(), in_shape, infered_flag);
if (in_tensor == nullptr) {
delete new_conv_parameter;
FreeMemory(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "create input tensor failed.";
return nullptr;
}
new_inputs.emplace_back(in_tensor);

// create new weight
int copy_length = conv_param->kernel_h_ * conv_param->kernel_w_ * new_in_channel * new_out_channel;
auto filter_tensor =
CreateFilterTensorFp32(inputs.at(kWeightIndex)->data_type(), filter_shape, inputs, copy_length, i);
if (filter_tensor == nullptr) {
delete new_conv_parameter;
FreeMemory(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "create filter tensor failed.";
return nullptr;
}
new_inputs.emplace_back(filter_tensor);

// if has bias, create new bias
if (has_bias) {
auto bias_tensor =
CreateBiasTensorFp32(inputs.at(kBiasIndex)->data_type(), bias_shape, inputs, new_out_channel, i);
if (bias_tensor == nullptr) {
delete new_conv_parameter;
FreeMemory(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "create bias_tensor failed.";
return nullptr;
}
new_inputs.emplace_back(bias_tensor);
}

// create new output tensor
for (size_t j = 0; j < outputs.size(); ++j) {
auto out_tensor = CreateOutputTensor(out_shape, outputs, infered_flag, j);
if (out_tensor == nullptr) {
delete new_conv_parameter;
FreeMemory(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "new out_tensor failed.";
return nullptr;
}
new_outputs.emplace_back(out_tensor);
}
group_convs.emplace_back(CpuConvFp32KernelSelect(new_inputs, new_outputs,
reinterpret_cast<OpParameter *>(new_conv_parameter), ctx,
primitive, use_winograd, out_unit));
}

return new (std::nothrow)
GroupConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive, group_convs, group);
}

kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx, const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
MS_ASSERT(op_parameter != nullptr);
MS_ASSERT(desc.type == schema::PrimitiveType_Conv2D);
MS_ASSERT(desc.data_type == kNumberTypeFloat32);
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
int group = conv_param->group_;
bool use_winograd = false;
int out_unit;
if (primitive != nullptr && primitive->infer_flag()) {
conv_param->input_h_ = inputs.front()->Height();
conv_param->input_w_ = inputs.front()->Width();
conv_param->input_channel_ = inputs.front()->Channel();
conv_param->output_h_ = outputs.front()->Height();
conv_param->output_w_ = outputs.front()->Width();
conv_param->output_channel_ = outputs.front()->Channel();
conv_param->op_parameter_.thread_num_ = ctx->thread_num_;
CheckIfUseWinograd(&use_winograd, &out_unit, conv_param);
}

auto *weight_tensor = inputs.at(kWeightIndex);
auto *restore_data = weight_tensor->data_c();
auto restore_type = weight_tensor->data_type();
bool dequant_flag =
!weight_tensor->quant_params().empty() && weight_tensor->quant_params().front().inited && restore_data != nullptr;
if (dequant_flag) {
auto *dequant_weight = kernel::DequantUtil::DequantWeight(weight_tensor);
if (dequant_weight == nullptr) {
MS_LOG(ERROR) << "dequant data is nullptr.";
free(op_parameter);
return nullptr;
}
weight_tensor->set_data(dequant_weight);
}

kernel::LiteKernel *kernel;
if (group == 1) {
kernel = CpuConvFp32KernelSelect(inputs, outputs, op_parameter, ctx, primitive, use_winograd, out_unit);
} else {
kernel = CpuGroupConvFp32KernelCreator(inputs, outputs, op_parameter, ctx, primitive, group);
}

if (kernel == nullptr) {
MS_LOG(ERROR) << "kernel is nullptr.";
if (dequant_flag) {
weight_tensor->FreeData();
weight_tensor->set_data(restore_data);
weight_tensor->set_data_type(restore_type);
}
free(op_parameter);
return nullptr;
}
auto ret = kernel->Init();
if (ret != RET_OK && ret != RET_INFER_INVALID) {
MS_LOG(ERROR) << "Init kernel failed, name: " << op_parameter->name_ << ", type: "
<< schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(op_parameter->type_));
if (dequant_flag) {
weight_tensor->FreeData();
weight_tensor->set_data(restore_data);
weight_tensor->set_data_type(restore_type);
}
delete kernel;
return nullptr;
}

if (dequant_flag) {
weight_tensor->FreeData();
weight_tensor->set_data(restore_data);
weight_tensor->set_data_type(restore_type);
}

return kernel;
}

REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Conv2D, CpuConvFp32KernelCreator)
} // namespace mindspore::kernel

+ 6
- 12
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.h View File

@@ -28,8 +28,10 @@ class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel {
public:
ConvolutionCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
const mindspore::lite::PrimitiveC *primitive, float *origin_weight, float *origin_bias)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive),
origin_weight_(origin_weight),
origin_bias_(origin_bias) {}
~ConvolutionCPUKernel() override {
if (packed_weight_ != nullptr) {
free(packed_weight_);
@@ -57,20 +59,12 @@ class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel {
}

protected:
float *origin_weight_; // do not free
float *origin_bias_; // do not free
float *packed_weight_ = nullptr;
float *packed_input_ = nullptr;
float *col_major_input_ = nullptr;
};

void FreeMemory(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
const std::vector<lite::Tensor *> &new_outputs);

ConvParameter *CreateNewConvParameter(ConvParameter *parameter);

lite::Tensor *CreateInputTensor(TypeId data_type, std::vector<int> in_shape, bool infered_flag);

lite::Tensor *CreateOutputTensor(std::vector<int> out_shape, const std::vector<lite::Tensor *> &outputs,
bool infered_flag, int index);
} // namespace mindspore::kernel

#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_H_

+ 5
- 17
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd_fp32.cc View File

@@ -81,8 +81,7 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
MS_LOG(ERROR) << "get matrix g from CookToomFilter failed.";
return ret;
}
auto weight_data = reinterpret_cast<float *>(filter_tensor->MutableData());
ret = WinogradFilterTransform(weight_data, matrix_g, matrix_gt, oc_block);
ret = WinogradFilterTransform(origin_weight_, matrix_g, matrix_gt, oc_block);
if (ret != RET_OK) {
MS_LOG(ERROR) << "winograd filter transfrom failed.";
return ret;
@@ -97,8 +96,7 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
}
memset(bias_data_, 0, new_bias_size);
if (in_tensors_.size() == kInputSize2) {
auto ori_bias_addr = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->MutableData());
memcpy(bias_data_, ori_bias_addr, out_channel * sizeof(float));
memcpy(bias_data_, origin_bias_, out_channel * sizeof(float));
} else {
MS_ASSERT(in_tensors_.size() == kInputSize1);
}
@@ -171,10 +169,7 @@ int ConvolutionWinogradCPUKernel::Init() {
MS_LOG(ERROR) << "Init weight bias failed.";
return RET_ERROR;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
return RET_OK;
}

int ConvolutionWinogradCPUKernel::ReSize() {
@@ -183,18 +178,11 @@ int ConvolutionWinogradCPUKernel::ReSize() {
MS_LOG(ERROR) << "Resize is invalid.";
return ret;
}

ret = ConvolutionBaseCPUKernel::Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvolutionBase init failed.";
return RET_ERROR;
MS_LOG(ERROR) << "conv base init failed.";
return ret;
}

kernel_unit_ = conv_param_->kernel_h_;
input_unit_ = output_unit_ + kernel_unit_ - 1;
conv_param_->input_unit_ = input_unit_;
conv_param_->output_unit_ = output_unit_;

ret = ConfigInputOutput();
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConfigInputOutput failed.";


+ 6
- 2
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd_fp32.h View File

@@ -28,10 +28,12 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
public:
ConvolutionWinogradCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive, int output_unit)
const mindspore::lite::PrimitiveC *primitive, int output_unit, float *origin_weight,
float *origin_bias)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive),
output_unit_(output_unit),
trans_weight_(nullptr) {}
origin_weight_(origin_weight),
origin_bias_(origin_bias) {}
~ConvolutionWinogradCPUKernel() override {
if (trans_weight_ != nullptr) {
free(trans_weight_);
@@ -69,6 +71,8 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
int kernel_unit_;
int input_unit_;
int output_unit_;
float *origin_weight_; // do not free
float *origin_bias_; // do not free
float *tmp_data_ = nullptr;
float *trans_input_ = nullptr;
float *gemm_out_ = nullptr;


+ 8
- 12
mindspore/lite/src/runtime/kernel/arm/fp32/group_convolution_fp32.cc View File

@@ -92,11 +92,8 @@ int GroupConvolutionCPUKernel::PreProcess() {
std::vector<int> out_shape;
for (int i = 0; i < group_num_; ++i) {
// in
int in_batch = conv_param_->input_batch_;
int in_h = conv_param_->input_h_;
int in_w = conv_param_->input_w_;
int in_c = conv_param_->input_channel_;
in_shape = {in_batch, in_h, in_w, in_c};
auto in_tensor = in_tensors_.front();
in_shape = {in_tensor->Batch(), in_tensor->Height(), in_tensor->Width(), conv_param_->input_channel_};
auto sub_kernel_in_tensor = group_convs_.at(i)->in_tensors().front();
sub_kernel_in_tensor->set_shape(in_shape);
ret = sub_kernel_in_tensor->MallocData();
@@ -106,11 +103,8 @@ int GroupConvolutionCPUKernel::PreProcess() {
return ret;
}
// out
int out_batch = conv_param_->output_batch_;
int out_h = conv_param_->output_h_;
int out_w = conv_param_->output_w_;
int out_c = conv_param_->output_channel_;
out_shape = {out_batch, out_h, out_w, out_c};
auto out_tensor = out_tensors_.front();
out_shape = {out_tensor->Batch(), out_tensor->Height(), out_tensor->Width(), conv_param_->output_channel_};
auto sub_kernel_out_tensors = group_convs_.at(i)->out_tensors();
for (auto tensor : sub_kernel_out_tensors) {
tensor->set_shape(out_shape);
@@ -143,7 +137,8 @@ int GroupConvolutionCPUKernel::PreProcess() {
}

void GroupConvolutionCPUKernel::SeparateInput(int group_id) {
int in_plane = conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_batch_;
auto in_tensor = in_tensors_.front();
int in_plane = in_tensor->Height() * in_tensor->Width() * in_tensor->Batch();
int sub_in_channel = conv_param_->input_channel_;
int ori_in_channel = sub_in_channel * group_num_;
auto sub_in_data = reinterpret_cast<float *>(group_convs_.at(group_id)->in_tensors().front()->data_c());
@@ -157,7 +152,8 @@ void GroupConvolutionCPUKernel::SeparateInput(int group_id) {
}

void GroupConvolutionCPUKernel::PostConcat(int group_id) {
int out_plane = conv_param_->output_h_ * conv_param_->output_w_ * conv_param_->output_batch_;
auto out_tensor = out_tensors_.front();
int out_plane = out_tensor->Height() * out_tensor->Width() * out_tensor->Batch();
int sub_out_channel = conv_param_->output_channel_;
int ori_out_channel = sub_out_channel * group_num_;
auto sub_out_data = reinterpret_cast<float *>(group_convs_.at(group_id)->out_tensors().front()->data_c());
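
SeparateInput and PostConcat are mirror-image strided copies over NHWC data: for each pixel, group group_id owns the contiguous sub-channel slice starting at offset group_id * sub_channel within the full channel stride. A standalone illustration with hypothetical sizes (split direction shown; the concat direction swaps src and dst):

#include <cstdio>
#include <cstring>
#include <vector>

int main() {
  // Hypothetical sizes: 4 pixels (batch * H * W), 2 groups, 3 channels each.
  const int in_plane = 4, group_num = 2, sub_channel = 3;
  const int ori_channel = sub_channel * group_num;
  std::vector<float> ori(in_plane * ori_channel);
  for (size_t i = 0; i < ori.size(); ++i) ori[i] = static_cast<float>(i);

  for (int g = 0; g < group_num; ++g) {
    std::vector<float> sub(in_plane * sub_channel);
    const float *src = ori.data() + g * sub_channel;  // per-pixel slice offset
    float *dst = sub.data();
    for (int p = 0; p < in_plane; ++p) {
      std::memcpy(dst, src, sub_channel * sizeof(float));
      src += ori_channel;  // advance by the full channel stride
      dst += sub_channel;
    }
    std::printf("group %d starts with %g\n", g, sub[0]);
  }
  return 0;
}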


+ 1
- 2
mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc View File

@@ -19,8 +19,7 @@
#include "nnacl/int8/conv_int8.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "src/runtime/kernel/arm/base/layout_transform.h"
#include "src/runtime/kernel/arm/fp32/convolution_fp32.h"
#include "src/runtime/kernel/arm/fp32/convolution_delegate_fp32.h"
#include "src/runtime/kernel/arm/int8/convolution_1x1_int8.h"
#include "src/runtime/kernel/arm/int8/convolution_3x3_int8.h"
#include "src/runtime/kernel/arm/int8/group_convolution_int8.h"


+ 0
- 142
mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/conv1x1_fp32_tests.cc View File

@@ -139,146 +139,4 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack4) {
EXPECT_EQ(0, CompareOutputData(out, correct, 54));
delete conv_param;
}

int Conv1x1TestInit1(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tensor *> *outputs_,
ConvParameter *conv_param, float **correct) {
auto *in_t = new lite::Tensor(kNumberTypeFloat, {1, 2, 3, 4}, schema::Format_NHWC, lite::Tensor::VAR);
in_t->MallocData();
float in[] = {12.216284, 3.3466918, 15.327419, 5.234958, 0.804376, 9.952188, 14.727955, -8.080715,
13.71383, 8.055829, 6.5845337, -9.25232, -4.24519, 11.550042, 9.262012, 1.2780352,
6.7263746, -3.9301445, 3.764492, -8.602078, -3.3558068, 13.619035, -2.6694393, 3.2008505};
memcpy(in_t->MutableData(), in, sizeof(float) * 24);
inputs_->push_back(in_t);

auto *weight_t = new lite::Tensor(kNumberTypeFloat, {3, 1, 1, 4}, schema::Format_NHWC, lite::Tensor::CONST_TENSOR);
weight_t->MallocData();
float weight[] = {-0.7308652, 0.5257509, -0.87825793, -1.123181, -1.2206168, 0.562695,
1.5382664, -0.5020635, 0.8591602, -0.26410004, 1.1262615, 0.073132955}; /* nhwc */
memcpy(weight_t->MutableData(), weight, sizeof(float) * 12);
inputs_->push_back(weight_t);

auto *bias_t = new lite::Tensor(kNumberTypeFloat, {3}, schema::Format_NHWC, lite::Tensor::CONST_TENSOR);
bias_t->MallocData();
float bias[] = {2, 2, 2};
memcpy(bias_t->MutableData(), bias, sizeof(float) * 3);
inputs_->push_back(bias_t);

auto *out_t = new lite::Tensor(kNumberTypeFloat, {1, 2, 3, 3}, schema::Format_NHWC, lite::Tensor::VAR);
out_t->MallocData();
outputs_->push_back(out_t);

*correct = reinterpret_cast<float *>(malloc(out_t->ElementsNum() * sizeof(float)));
float co[] = {2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1.3731456, 1.6877825, 12.427691, 2., 2., 2.};
memcpy(*correct, co, out_t->ElementsNum() * sizeof(float));

conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
conv_param->stride_h_ = conv_param->stride_w_ = 2;
conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
conv_param->pad_u_ = conv_param->pad_l_ = 1;
conv_param->act_type_ = ActType_No;
return out_t->ElementsNum();
}

TEST_F(TestConv1x1Fp32, Conv1x1Test1) {
std::vector<lite::Tensor *> inputs_;
std::vector<lite::Tensor *> outputs_;
auto conv_param = new ConvParameter();
auto *ctx = new lite::InnerContext();
ctx->thread_num_ = 1;
ASSERT_EQ(lite::RET_OK, ctx->Init());
float *correct;
int total_size = Conv1x1TestInit1(&inputs_, &outputs_, conv_param, &correct);
auto *conv1x1 =
new kernel::Convolution1x1CPUKernel(reinterpret_cast<OpParameter *>(conv_param), inputs_, outputs_, ctx, nullptr);

conv1x1->Init();
conv1x1->Run();

ASSERT_EQ(0, CompareOutputData(reinterpret_cast<float *>(outputs_[0]->MutableData()), correct, total_size, 0.0001));
delete conv_param;
delete conv1x1;
for (auto t : inputs_) delete t;
for (auto t : outputs_) delete t;
free(correct);
}

int Conv1x1TestInit2(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tensor *> *outputs_,
ConvParameter *conv_param, float **correct) {
size_t buffer_size;
auto *in_t = new lite::Tensor(kNumberTypeFloat, {1, 300, 300, 24}, schema::Format_NHWC, lite::Tensor::VAR);
in_t->MallocData();
std::string input_path = "./conv/conv1x1fp32_input1_nhwc.bin";
auto in = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &buffer_size));
memcpy(in_t->MutableData(), in, buffer_size);
inputs_->push_back(in_t);

auto *weight_t = new lite::Tensor(kNumberTypeFloat, {40, 1, 1, 24}, schema::Format_NHWC, lite::Tensor::CONST_TENSOR);
weight_t->MallocData();
std::string weight_path = "./conv/conv1x1fp32_weight1_nhwc.bin";
auto weight = reinterpret_cast<float *>(mindspore::lite::ReadFile(weight_path.c_str(), &buffer_size));
memcpy(weight_t->MutableData(), weight, buffer_size);
inputs_->push_back(weight_t);

auto *bias_t = new lite::Tensor(kNumberTypeFloat, {40}, schema::Format_NHWC, lite::Tensor::CONST_TENSOR);
bias_t->MallocData();
std::string bias_path = "./conv/conv1x1fp32_bias1_nhwc.bin";
auto bias = mindspore::lite::ReadFile(bias_path.c_str(), &buffer_size);
memcpy(bias_t->MutableData(), bias, buffer_size);
inputs_->push_back(bias_t);

auto *out_t = new lite::Tensor(kNumberTypeFloat, {1, 300, 300, 40}, schema::Format_NHWC, lite::Tensor::VAR);
out_t->MallocData();
outputs_->push_back(out_t);

std::string out_path = "./conv/conv1x1fp32_output1_nhwc.bin";
auto out_nhwc = mindspore::lite::ReadFile(out_path.c_str(), &buffer_size);
*correct = reinterpret_cast<float *>(malloc(buffer_size));
memcpy(*correct, out_nhwc, buffer_size);

conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
conv_param->stride_h_ = conv_param->stride_w_ = 1;
conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
conv_param->pad_u_ = conv_param->pad_l_ = 0;
conv_param->act_type_ = ActType_No;
return out_t->ElementsNum();
}

TEST_F(TestConv1x1Fp32, Conv1x1Test2) {
std::vector<lite::Tensor *> inputs_;
std::vector<lite::Tensor *> outputs_;
auto conv_param = new ConvParameter();
auto *ctx = new lite::InnerContext();
ctx->thread_num_ = 2;
ASSERT_EQ(lite::RET_OK, ctx->Init());
float *correct;
int total_size = Conv1x1TestInit2(&inputs_, &outputs_, conv_param, &correct);
auto *conv1x1 =
new kernel::Convolution1x1CPUKernel(reinterpret_cast<OpParameter *>(conv_param), inputs_, outputs_, ctx, nullptr);

conv1x1->Init();
conv1x1->Run();
ASSERT_EQ(0, CompareOutputData(reinterpret_cast<float *>(outputs_[0]->MutableData()), correct, total_size, 0.0001));

/* running warm up */
for (int i = 0; i < 0; i++) {
conv1x1->Run();
}

/* running time cost */
int loop_count = 1;
auto time_start = mindspore::lite::GetTimeUs();
for (int i = 0; i < loop_count; i++) {
conv1x1->Run();
}
auto time_end = mindspore::lite::GetTimeUs();
auto cost = time_end - time_start;
uint64_t time_avg = cost / loop_count;
printf("1x1 average time : %f ms\n", time_avg / 1000.0f);

delete conv_param;
delete conv1x1;
for (auto t : inputs_) delete t;
for (auto t : outputs_) delete t;
free(correct);
}
} // namespace mindspore
