From: @yangruoqi713 Reviewed-by: @zhanghaibo5,@hangangqiang Signed-off-by: @zhanghaibo5pull/15142/MERGE
| @@ -15,10 +15,6 @@ | |||
| */ | |||
| #include "src/runtime/kernel/arm/base/group_convolution_creator.h" | |||
| #include "src/runtime/kernel/arm/base/group_convolution.h" | |||
| #include "src/runtime/kernel/arm/int8/convolution_int8_creator.h" | |||
| #include "src/runtime/kernel/arm/fp32/convolution_delegate_fp32.h" | |||
| #include "src/runtime/kernel/arm/int8/group_convolution_int8.h" | |||
| namespace mindspore::kernel { | |||
| void CopyTensorQuantParam(lite::Tensor *dst, lite::Tensor *src) { | |||
| @@ -37,15 +33,11 @@ ConvParameter *CreateNewConvParameter(ConvParameter *parameter) { | |||
| return conv_parameter; | |||
| } | |||
| void FreeMemory(ConvParameter *conv_param, const std::vector<lite::Tensor *> &new_inputs, | |||
| const std::vector<lite::Tensor *> &new_outputs) { | |||
| if (conv_param != nullptr) { | |||
| free(conv_param); | |||
| } | |||
| for (auto &in_tensor : new_inputs) { | |||
| void FreeMemory(const std::vector<lite::Tensor *> *new_inputs, const std::vector<lite::Tensor *> *new_outputs) { | |||
| for (auto &in_tensor : *new_inputs) { | |||
| delete in_tensor; | |||
| } | |||
| for (auto &out_tensor : new_outputs) { | |||
| for (auto &out_tensor : *new_outputs) { | |||
| delete out_tensor; | |||
| } | |||
| } | |||
| @@ -106,6 +98,7 @@ void GroupConvCreator::CopyQuantParam(std::vector<lite::Tensor *> *tensors) { | |||
| CopyTensorQuantParam(tensors->at(j), origin_inputs_.at(j)); | |||
| } | |||
| } | |||
| bool GroupConvCreator::CheckIfValidPoint(void *ptr) { | |||
| if (ptr == nullptr) { | |||
| for (auto &sub_conv : group_convs_) { | |||
| @@ -117,18 +110,17 @@ bool GroupConvCreator::CheckIfValidPoint(void *ptr) { | |||
| } | |||
| int GroupConvCreator::NewInputTensor(std::vector<lite::Tensor *> *tensors) { | |||
| auto in_tensor = CreateVarTensor( | |||
| {input_shape_, schema::Format_NHWC, origin_inputs_.at(0)->data_type(), lite::Tensor::Category::VAR, true}, | |||
| infered_); | |||
| auto in_tensor = | |||
| CreateVarTensor({input_shape_, schema::Format_NHWC, data_type_, lite::Tensor::Category::VAR, true}, infered_); | |||
| if (!CheckIfValidPoint(in_tensor)) { | |||
| return lite::RET_ERROR; | |||
| } | |||
| tensors->emplace_back(in_tensor); | |||
| return lite::RET_OK; | |||
| } | |||
| int GroupConvCreator::NewOutputTensor(std::vector<lite::Tensor *> *tensors, lite::Tensor *output) { | |||
| auto out_tensor = | |||
| CreateVarTensor({output_shape_, output->format(), output->data_type(), output->category(), false}, infered_); | |||
| auto out_tensor = CreateVarTensor({output_shape_, output->format(), data_type_, output->category(), false}, infered_); | |||
| if (!CheckIfValidPoint(out_tensor)) { | |||
| return lite::RET_ERROR; | |||
| } | |||
| @@ -153,6 +145,7 @@ int GroupConvCreator::NewConstTensor(std::vector<lite::Tensor *> *tensors, int g | |||
| } | |||
| return lite::RET_OK; | |||
| } | |||
| void GroupConvCreator::SetShapeOfTensors() { | |||
| int new_in_channel = origin_inputs_.at(kWeightIndex)->Channel(); | |||
| int new_out_channel; | |||
| @@ -176,71 +169,31 @@ void GroupConvCreator::SetShapeOfTensors() { | |||
| } | |||
| } | |||
| int GroupConvCreator::CreatGroupConv() { | |||
| for (int i = 0; i < conv_param_->group_; ++i) { | |||
| auto new_conv_parameter = CreateNewConvParameter(conv_param_); | |||
| if (!CheckIfValidPoint(new_conv_parameter)) { | |||
| return lite::RET_ERROR; | |||
| } | |||
| // create new input for each group | |||
| std::vector<lite::Tensor *> new_inputs; | |||
| if (NewInputTensor(&new_inputs) != lite::RET_OK) { | |||
| MS_LOG(ERROR) << "new input tensor failed."; | |||
| FreeMemory(new_conv_parameter, new_inputs, {}); | |||
| return lite::RET_ERROR; | |||
| } | |||
| // const tensor | |||
| if (NewConstTensor(&new_inputs, i) != lite::RET_OK) { | |||
| MS_LOG(ERROR) << "new const tensor failed."; | |||
| FreeMemory(new_conv_parameter, new_inputs, {}); | |||
| int GroupConvCreator::GetSingleConvParam(ConvParameter *conv_param, std::vector<lite::Tensor *> *new_inputs, | |||
| std::vector<lite::Tensor *> *new_outputs, int group_id) { | |||
| if (!CheckIfValidPoint(conv_param)) { | |||
| return lite::RET_ERROR; | |||
| } | |||
| // create new input for each group | |||
| if (NewInputTensor(new_inputs) != lite::RET_OK) { | |||
| MS_LOG(ERROR) << "new input tensor failed."; | |||
| FreeMemory(new_inputs, {}); | |||
| return lite::RET_ERROR; | |||
| } | |||
| // const tensor | |||
| if (NewConstTensor(new_inputs, group_id) != lite::RET_OK) { | |||
| MS_LOG(ERROR) << "new const tensor failed."; | |||
| FreeMemory(new_inputs, {}); | |||
| return lite::RET_ERROR; | |||
| } | |||
| // create new output tensor | |||
| for (auto &output : origin_outputs_) { | |||
| if (NewOutputTensor(new_outputs, output) != lite::RET_OK) { | |||
| MS_LOG(ERROR) << "new output tensor failed."; | |||
| FreeMemory(new_inputs, new_outputs); | |||
| return lite::RET_ERROR; | |||
| } | |||
| // create new output tensor | |||
| std::vector<lite::Tensor *> new_outputs; | |||
| for (auto &output : origin_outputs_) { | |||
| if (NewOutputTensor(&new_outputs, output) != lite::RET_OK) { | |||
| MS_LOG(ERROR) << "new output tensor failed."; | |||
| FreeMemory(new_conv_parameter, new_inputs, new_outputs); | |||
| return lite::RET_ERROR; | |||
| } | |||
| } | |||
| if (is_quant_) { | |||
| CopyQuantParam(&new_inputs); | |||
| group_convs_.emplace_back(CpuConvInt8KernelSelect(new_inputs, new_outputs, | |||
| reinterpret_cast<OpParameter *>(new_conv_parameter), context_)); | |||
| } else { | |||
| group_convs_.emplace_back(new (std::nothrow) kernel::ConvolutionDelegateCPUKernel( | |||
| reinterpret_cast<OpParameter *>(new_conv_parameter), new_inputs, new_outputs, context_)); | |||
| } | |||
| } | |||
| return lite::RET_OK; | |||
| } | |||
| kernel::LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter, | |||
| const lite::InnerContext *ctx) { | |||
| GroupConvCreator group_conv_creator(inputs, outputs, op_parameter, ctx, false); | |||
| group_conv_creator.SetShapeOfTensors(); | |||
| if (group_conv_creator.CreatGroupConv() != lite::RET_OK) { | |||
| MS_LOG(ERROR) << "Create fp32 group conv failed."; | |||
| return nullptr; | |||
| } | |||
| return new (std::nothrow) | |||
| GroupConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, group_conv_creator.get_group_conv(), | |||
| reinterpret_cast<ConvParameter *>(op_parameter)->group_); | |||
| } | |||
| kernel::LiteKernel *CpuGroupConvInt8KernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter, | |||
| const lite::InnerContext *ctx, int group) { | |||
| GroupConvCreator group_conv_creator(inputs, outputs, op_parameter, ctx, true); | |||
| group_conv_creator.SetShapeOfTensors(); | |||
| if (group_conv_creator.CreatGroupConv() != lite::RET_OK) { | |||
| MS_LOG(ERROR) << "Create int8 group conv failed."; | |||
| return nullptr; | |||
| } | |||
| return new (std::nothrow) | |||
| GroupConvolutionInt8CPUKernel(op_parameter, inputs, outputs, ctx, group_conv_creator.get_group_conv(), group); | |||
| } | |||
| } // namespace mindspore::kernel | |||
| @@ -34,12 +34,12 @@ struct TensorInfo { | |||
| class GroupConvCreator { | |||
| public: | |||
| GroupConvCreator(std::vector<lite::Tensor *> inputs, std::vector<lite::Tensor *> outputs, OpParameter *op_parameter, | |||
| const lite::InnerContext *ctx, bool is_quant) | |||
| const lite::InnerContext *ctx, bool is_quant, TypeId data_type) | |||
| : origin_inputs_(std::move(inputs)), | |||
| origin_outputs_(std::move(outputs)), | |||
| context_(ctx), | |||
| infered_(op_parameter->infer_flag_), | |||
| is_quant_(is_quant) { | |||
| is_quant_(is_quant), | |||
| data_type_(data_type) { | |||
| conv_param_ = reinterpret_cast<ConvParameter *>(op_parameter); | |||
| } | |||
| @@ -47,15 +47,16 @@ class GroupConvCreator { | |||
| public: | |||
| void SetShapeOfTensors(); | |||
| int CreatGroupConv(); | |||
| std::vector<kernel::LiteKernel *> get_group_conv() { return group_convs_; } | |||
| std::vector<kernel::LiteKernel *> *get_group_conv() { return &group_convs_; } | |||
| void CopyQuantParam(std::vector<lite::Tensor *> *tensors); | |||
| int GetSingleConvParam(ConvParameter *conv_param, std::vector<lite::Tensor *> *new_inputs, | |||
| std::vector<lite::Tensor *> *new_outputs, int group_id); | |||
| protected: | |||
| void set_input_shape(const std::vector<int> &shape) { input_shape_ = shape; } | |||
| void set_output_shape(const std::vector<int> &shape) { output_shape_ = shape; } | |||
| void set_filter_shape(const std::vector<int> &shape) { filter_shape_ = shape; } | |||
| void set_bias_shape(const std::vector<int> &shape) { bias_shape_ = shape; } | |||
| void CopyQuantParam(std::vector<lite::Tensor *> *tensors); | |||
| bool CheckIfValidPoint(void *ptr); | |||
| int NewInputTensor(std::vector<lite::Tensor *> *tensors); | |||
| int NewConstTensor(std::vector<lite::Tensor *> *tensors, int group_id); | |||
| @@ -69,20 +70,13 @@ class GroupConvCreator { | |||
| std::vector<int> output_shape_; | |||
| std::vector<int> filter_shape_; | |||
| std::vector<int> bias_shape_; | |||
| const lite::InnerContext *context_; | |||
| ConvParameter *conv_param_; | |||
| bool infered_; | |||
| bool is_quant_; | |||
| bool infered_ = false; | |||
| bool is_quant_ = false; | |||
| TypeId data_type_; | |||
| }; | |||
| LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter, | |||
| const lite::InnerContext *ctx); | |||
| LiteKernel *CpuGroupConvInt8KernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter, | |||
| const lite::InnerContext *ctx, int group); | |||
| ConvParameter *CreateNewConvParameter(ConvParameter *parameter); | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_GROUP_CONVOLUTION_CREATOR_H_ | |||
| @@ -88,12 +88,7 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() { | |||
| MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!"; | |||
| return RET_ERROR; | |||
| } | |||
| if (origin_bias_data_type_ == kNumberTypeFloat16) { | |||
| memcpy(bias_data_, origin_bias_, output_channel * sizeof(float16_t)); | |||
| } else { | |||
| MS_LOG(ERROR) << "Conv1x1 only support fp16 weight"; | |||
| return RET_ERROR; | |||
| } | |||
| memcpy(bias_data_, origin_bias_, output_channel * sizeof(float16_t)); | |||
| memset(reinterpret_cast<char *>(bias_data_) + bias_size, 0, size - bias_size); | |||
| } | |||
| @@ -105,8 +100,7 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() { | |||
| return RET_ERROR; | |||
| } | |||
| memset(reinterpret_cast<char *>(weight_ptr_) + down_size, 0, size - down_size); | |||
| ColMajor2Row8MajorFp16(origin_weight_, weight_ptr_, input_channel, output_channel, | |||
| origin_weight_data_type_ == kNumberTypeFloat16); | |||
| ColMajor2Row8MajorFp16(origin_weight_, weight_ptr_, input_channel, output_channel, true); | |||
| return RET_OK; | |||
| } | |||
| @@ -217,8 +211,12 @@ static int Convolution1x1Fp16RunHw(void *cdata, int task_id) { | |||
| } | |||
| int Convolution1x1FP16CPUKernel::Run() { | |||
| ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); | |||
| auto input_data = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c()); | |||
| auto output_data = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c()); | |||
| if (input_data == nullptr || output_data == nullptr) { | |||
| MS_LOG(ERROR) << "Convolution1x1 Fp16 get null tensor data!"; | |||
| return RET_ERROR; | |||
| } | |||
| pack_input_ = reinterpret_cast<float16_t *>( | |||
| ctx_->allocator->Malloc(matmul_param_->row_align_ * matmul_param_->deep_ * sizeof(float16_t))); | |||
| if (pack_input_ == nullptr) { | |||
| @@ -227,9 +225,9 @@ int Convolution1x1FP16CPUKernel::Run() { | |||
| } | |||
| for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) { | |||
| output_ptr_ = execute_output_ + batch_index * matmul_param_->row_ * matmul_param_->col_; | |||
| output_ptr_ = output_data + batch_index * matmul_param_->row_ * matmul_param_->col_; | |||
| float16_t *batch_in = | |||
| execute_input_ + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_; | |||
| input_data + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_; | |||
| if (pre_trans_input_) { | |||
| Conv1x1InputPack(batch_in, input_ptr_, conv_param_, sizeof(float16_t)); | |||
| } else { | |||
| @@ -20,18 +20,18 @@ | |||
| #include <arm_neon.h> | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h" | |||
| #include "src/runtime/kernel/arm/base/convolution_base.h" | |||
| #include "src/common/utils.h" | |||
| #include "nnacl/matmul_parameter.h" | |||
| #include "nnacl/fp16/matmul_fp16.h" | |||
| namespace mindspore::kernel { | |||
| class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel { | |||
| class Convolution1x1FP16CPUKernel : public ConvolutionBaseCPUKernel { | |||
| public: | |||
| Convolution1x1FP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx, void *origin_weight, | |||
| void *origin_bias, TypeId origin_weight_data_type, TypeId origin_bias_data_type) | |||
| : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, origin_weight_data_type, origin_bias_data_type), | |||
| void *origin_bias) | |||
| : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx), | |||
| origin_weight_(origin_weight), | |||
| origin_bias_(origin_bias) {} | |||
| ~Convolution1x1FP16CPUKernel() override; | |||
| @@ -1,52 +0,0 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h" | |||
| #include "nnacl/fp16/cast_fp16.h" | |||
| #include "src/runtime/kernel/arm/fp16/common_fp16.h" | |||
| #include "include/errorcode.h" | |||
| #include "src/runtime/runtime_api.h" | |||
| namespace mindspore::kernel { | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_OK; | |||
| ConvolutionBaseFP16CPUKernel::~ConvolutionBaseFP16CPUKernel() { | |||
| if (fp16_weight_ != nullptr) { | |||
| free(fp16_weight_); | |||
| fp16_weight_ = nullptr; | |||
| } | |||
| } | |||
| int ConvolutionBaseFP16CPUKernel::GetExecuteTensor() { | |||
| auto input_tensor = in_tensors_.at(0); | |||
| auto output_tensor = out_tensors_.at(0); | |||
| execute_input_ = reinterpret_cast<float16_t *>(input_tensor->data_c()); | |||
| execute_output_ = reinterpret_cast<float16_t *>(output_tensor->data_c()); | |||
| return RET_OK; | |||
| } | |||
| int ConvolutionBaseFP16CPUKernel::GetExecuteFilter(lite::Tensor *weight_tensor, void *origin_data) { | |||
| MS_ASSERT(origin_weight_data_type_ == kNumberTypeFloat32 || origin_weight_data_type_ == kNumberTypeFloat16); | |||
| if (origin_weight_data_type_ == kNumberTypeFloat32) { | |||
| MS_LOG(ERROR) << "Conv fp16 only support fp16 weight"; | |||
| return RET_ERROR; | |||
| } else { | |||
| execute_weight_ = reinterpret_cast<float16_t *>(origin_data); | |||
| fp16_weight_ = nullptr; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| } // namespace mindspore::kernel | |||
| @@ -1,57 +0,0 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_BASE_FP16_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_BASE_FP16_H_ | |||
| #include <arm_neon.h> | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "src/runtime/kernel/arm/base/convolution_base.h" | |||
| #include "src/common/utils.h" | |||
| namespace mindspore::kernel { | |||
| class ConvolutionBaseFP16CPUKernel : public ConvolutionBaseCPUKernel { | |||
| public: | |||
| ConvolutionBaseFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx, | |||
| TypeId origin_weight_data_type, TypeId origin_bias_data_type) | |||
| : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx), | |||
| origin_weight_data_type_(origin_weight_data_type), | |||
| origin_bias_data_type_(origin_bias_data_type) {} | |||
| ~ConvolutionBaseFP16CPUKernel() override; | |||
| int Init() override { return mindspore::lite::RET_OK; } | |||
| int ReSize() override { return mindspore::lite::RET_OK; } | |||
| int Run() override { return mindspore::lite::RET_OK; } | |||
| int RunImpl(int task_id) { return mindspore::lite::RET_OK; } | |||
| virtual int GetExecuteTensor(); | |||
| // origin_data may not be the same as the data in the weight tensor, | |||
| // because weight tensor has released data already. In this situation, | |||
| // origin_data is the pointer of another memory block. | |||
| virtual int GetExecuteFilter(lite::Tensor *weight_tensor, void *origin_data); | |||
| protected: | |||
| float16_t *fp16_weight_ = nullptr; | |||
| float16_t *execute_input_ = nullptr; | |||
| float16_t *execute_weight_ = nullptr; | |||
| float16_t *execute_output_ = nullptr; | |||
| TypeId origin_weight_data_type_; | |||
| TypeId origin_bias_data_type_; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_BASE_FP16_H_ | |||
| @@ -22,6 +22,7 @@ | |||
| #include "src/runtime/kernel/arm/fp16/group_convolution_fp16.h" | |||
| #include "src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h" | |||
| #include "src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h" | |||
| #include "src/runtime/kernel/arm/base/group_convolution_creator.h" | |||
| #include "schema/model_generated.h" | |||
| #include "src/kernel_registry.h" | |||
| #include "include/errorcode.h" | |||
| @@ -96,8 +97,8 @@ int ConvolutionDelegateFP16CPUKernel::ReSize() { | |||
| kernel::SetInputOutputShapeInfo(reinterpret_cast<ConvParameter *>(op_parameter_), in_tensors_.front(), | |||
| out_tensors_.front(), context_); | |||
| if (fp16_conv_kernel_ == nullptr) { | |||
| fp16_conv_kernel_ = CpuConvFp16KernelSelect(in_tensors_, out_tensors_, op_parameter_, context_, origin_weight_, | |||
| origin_bias_, origin_weight_data_type_, origin_bias_data_type_); | |||
| fp16_conv_kernel_ = | |||
| CpuConvFp16KernelSelect(in_tensors_, out_tensors_, op_parameter_, context_, origin_weight_, origin_bias_); | |||
| if (fp16_conv_kernel_ == nullptr) { | |||
| MS_LOG(ERROR) << "Selecting execute kernel failed for conv_kernel, got a nullptr."; | |||
| return RET_ERROR; | |||
| @@ -108,29 +109,16 @@ int ConvolutionDelegateFP16CPUKernel::ReSize() { | |||
| return fp16_conv_kernel_->ReSize(); | |||
| } | |||
| ConvParameter *CreateNewConvParameterFp16(ConvParameter *parameter) { | |||
| auto conv_parameter = reinterpret_cast<ConvParameter *>(malloc(sizeof(ConvParameter))); | |||
| if (conv_parameter == nullptr) { | |||
| MS_LOG(ERROR) << "Malloc new conv parameter failed."; | |||
| return nullptr; | |||
| } | |||
| memcpy(conv_parameter, parameter, sizeof(ConvParameter)); | |||
| return conv_parameter; | |||
| } | |||
| kernel::LiteKernel *CpuConvDwFp16KernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | |||
| const InnerContext *ctx, void *origin_weight, void *origin_bias, | |||
| TypeId origin_weight_data_type, TypeId origin_bias_data_type) { | |||
| const InnerContext *ctx) { | |||
| MS_ASSERT(opParameter != nullptr); | |||
| auto conv_param = reinterpret_cast<ConvParameter *>(opParameter); | |||
| kernel::LiteKernel *kernel = nullptr; | |||
| if (conv_param->input_channel_ < 32) { | |||
| kernel = new (std::nothrow) kernel::ConvolutionDepthwiseSWFp16CPUKernel( | |||
| opParameter, inputs, outputs, ctx, origin_weight, origin_bias, origin_weight_data_type, origin_bias_data_type); | |||
| kernel = new (std::nothrow) kernel::ConvolutionDepthwiseSWFp16CPUKernel(opParameter, inputs, outputs, ctx); | |||
| } else { | |||
| kernel = new (std::nothrow) kernel::ConvolutionDepthwiseFp16CPUKernel( | |||
| opParameter, inputs, outputs, ctx, origin_weight, origin_bias, origin_weight_data_type, origin_bias_data_type); | |||
| kernel = new (std::nothrow) kernel::ConvolutionDepthwiseFp16CPUKernel(opParameter, inputs, outputs, ctx); | |||
| } | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "kernel is nullptr."; | |||
| @@ -142,27 +130,22 @@ kernel::LiteKernel *CpuConvDwFp16KernelCreator(const std::vector<lite::Tensor *> | |||
| kernel::LiteKernel *CpuConvFp16KernelSelect(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter, | |||
| const lite::InnerContext *ctx, void *origin_weight, void *origin_bias, | |||
| TypeId origin_weight_data_type, TypeId origin_bias_data_type) { | |||
| const lite::InnerContext *ctx, void *origin_weight, void *origin_bias) { | |||
| auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter); | |||
| bool use_winograd = false; | |||
| int out_unit; | |||
| CheckIfUseWinogradFp16(&use_winograd, &out_unit, conv_param); | |||
| kernel::LiteKernel *kernel = nullptr; | |||
| if (conv_param->group_ == conv_param->input_channel_ && conv_param->group_ == conv_param->output_channel_) { | |||
| kernel = CpuConvDwFp16KernelCreator(inputs, outputs, op_parameter, ctx, origin_weight, origin_bias, | |||
| origin_weight_data_type, origin_bias_data_type); | |||
| } else if (conv_param->kernel_h_ == 1 && conv_param->kernel_w_ == 1) { | |||
| kernel = new (std::nothrow) kernel::Convolution1x1FP16CPUKernel( | |||
| op_parameter, inputs, outputs, ctx, origin_weight, origin_bias, origin_weight_data_type, origin_bias_data_type); | |||
| } else if (use_winograd) { | |||
| if (conv_param->kernel_h_ == 1 && conv_param->kernel_w_ == 1) { | |||
| kernel = new (std::nothrow) | |||
| kernel::ConvolutionWinogradFP16CPUKernel(op_parameter, inputs, outputs, ctx, out_unit, origin_weight, origin_bias, | |||
| origin_weight_data_type, origin_bias_data_type); | |||
| kernel::Convolution1x1FP16CPUKernel(op_parameter, inputs, outputs, ctx, origin_weight, origin_bias); | |||
| } else if (use_winograd) { | |||
| kernel = new (std::nothrow) kernel::ConvolutionWinogradFP16CPUKernel(op_parameter, inputs, outputs, ctx, out_unit, | |||
| origin_weight, origin_bias); | |||
| } else { | |||
| kernel = new (std::nothrow) kernel::ConvolutionFP16CPUKernel( | |||
| op_parameter, inputs, outputs, ctx, origin_weight, origin_bias, origin_weight_data_type, origin_bias_data_type); | |||
| kernel = new (std::nothrow) | |||
| kernel::ConvolutionFP16CPUKernel(op_parameter, inputs, outputs, ctx, origin_weight, origin_bias); | |||
| } | |||
| // Once kernel is selected, init func will invoke InitWeightAndBias | |||
| auto ret = kernel->Init(); | |||
| @@ -174,194 +157,54 @@ kernel::LiteKernel *CpuConvFp16KernelSelect(const std::vector<lite::Tensor *> &i | |||
| return kernel; | |||
| } | |||
| void FreeMemoryFp16(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs, | |||
| const std::vector<lite::Tensor *> &new_outputs) { | |||
| for (auto sub_conv : group_convs) { | |||
| delete sub_conv; | |||
| } | |||
| for (auto in_tensor : new_inputs) { | |||
| delete in_tensor; | |||
| } | |||
| for (auto out_tensor : new_outputs) { | |||
| delete out_tensor; | |||
| } | |||
| } | |||
| static lite::Tensor *CreateInputTensorFp16(TypeId data_type, const std::vector<int> &in_shape, bool infered_flag) { | |||
| auto in_tensor = new (std::nothrow) lite::Tensor(data_type, in_shape, Format_NHWC, lite::Tensor::Category::VAR); | |||
| if (in_tensor == nullptr) { | |||
| MS_LOG(ERROR) << "new in_tensor failed."; | |||
| return nullptr; | |||
| } | |||
| if (infered_flag) { | |||
| auto ret = in_tensor->MallocData(); | |||
| if (ret != RET_OK) { | |||
| delete in_tensor; | |||
| MS_LOG(ERROR) << "in tensor malloc failed."; | |||
| return nullptr; | |||
| } | |||
| } | |||
| return in_tensor; | |||
| } | |||
| static lite::Tensor *CreateConstTensorFp16(lite::Tensor *tensor, const std::vector<int> &shape, const int index) { | |||
| auto new_tensor = | |||
| new (std::nothrow) lite::Tensor(tensor->data_type(), shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR); | |||
| if (new_tensor == nullptr) { | |||
| MS_LOG(ERROR) << "Create new_tensor failed."; | |||
| return nullptr; | |||
| } | |||
| auto ret = new_tensor->MallocData(); | |||
| if (ret != RET_OK) { | |||
| delete new_tensor; | |||
| MS_LOG(ERROR) << "Malloc new_tensor failed."; | |||
| return nullptr; | |||
| } | |||
| memcpy(new_tensor->data_c(), reinterpret_cast<char *>(tensor->data_c()) + index * new_tensor->Size(), | |||
| new_tensor->Size()); | |||
| return new_tensor; | |||
| } | |||
| static lite::Tensor *CreateOutputTensorFp16(const std::vector<int> &out_shape, | |||
| const std::vector<lite::Tensor *> &outputs, bool infered_flag, int index) { | |||
| auto out_tensor = new (std::nothrow) lite::Tensor(); | |||
| if (out_tensor == nullptr) { | |||
| MS_LOG(ERROR) << "new tmp_out_tensor failed."; | |||
| return nullptr; | |||
| } | |||
| out_tensor->set_data_type(mindspore::kNumberTypeFloat16); | |||
| out_tensor->set_format(outputs.at(index)->format()); | |||
| if (infered_flag) { | |||
| out_tensor->set_shape(out_shape); | |||
| auto ret = out_tensor->MallocData(); | |||
| if (ret != RET_OK) { | |||
| delete out_tensor; | |||
| MS_LOG(ERROR) << "out_tensor malloc data failed."; | |||
| return nullptr; | |||
| } | |||
| } | |||
| return out_tensor; | |||
| } | |||
| kernel::LiteKernel *CreateDelegateConvFp16(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter, | |||
| const InnerContext *ctx) { | |||
| auto weight_data_type = inputs.at(1)->data_type(); | |||
| if (weight_data_type != kNumberTypeFloat16) { | |||
| MS_LOG(ERROR) << "Convfp16 only support fp16 weight"; | |||
| return nullptr; | |||
| } | |||
| TypeId bias_data_type = kTypeUnknown; | |||
| if (inputs.size() == 3) { | |||
| bias_data_type = inputs.at(2)->data_type(); | |||
| } | |||
| return new (std::nothrow) | |||
| kernel::ConvolutionDelegateFP16CPUKernel(op_parameter, inputs, outputs, ctx, weight_data_type, bias_data_type); | |||
| } | |||
| kernel::LiteKernel *CpuGroupConvFp16KernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter, | |||
| const InnerContext *ctx) { | |||
| bool infer_flag = op_parameter->infer_flag_; | |||
| auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter); | |||
| // update new shape info for each sub kernel | |||
| int new_in_channel = inputs.at(kWeightIndex)->Channel(); | |||
| int new_out_channel = 0; | |||
| if (conv_param->group_ == 0) { | |||
| MS_LOG(ERROR) << "Divisor 'group' cannot be 0."; | |||
| return nullptr; | |||
| } else { | |||
| new_out_channel = inputs.at(kWeightIndex)->Batch() / conv_param->group_; | |||
| } | |||
| GroupConvCreator group_conv_creator(inputs, outputs, op_parameter, ctx, false, kNumberTypeFloat16); | |||
| group_conv_creator.SetShapeOfTensors(); | |||
| std::vector<int> in_shape; | |||
| std::vector<int> out_shape; | |||
| if (infer_flag) { | |||
| conv_param->input_channel_ = new_in_channel; | |||
| conv_param->output_channel_ = new_out_channel; | |||
| in_shape = {inputs.front()->Batch(), inputs.front()->Height(), inputs.front()->Width(), new_in_channel}; | |||
| out_shape = {inputs.front()->Batch(), outputs.front()->Height(), outputs.front()->Width(), new_out_channel}; | |||
| } | |||
| std::vector<int> filter_shape = {new_out_channel, conv_param->kernel_h_, conv_param->kernel_w_, new_in_channel}; | |||
| std::vector<int> bias_shape = {new_out_channel}; | |||
| // new group conv op | |||
| std::vector<kernel::LiteKernel *> group_convs; | |||
| // create tensors for every sub conv kernel | |||
| for (int i = 0; i < conv_param->group_; ++i) { | |||
| ConvParameter *new_conv_param = CreateNewConvParameter(conv_param); | |||
| std::vector<lite::Tensor *> new_inputs; | |||
| std::vector<lite::Tensor *> new_outputs; | |||
| auto new_conv_parameter = CreateNewConvParameterFp16(conv_param); | |||
| if (new_conv_parameter == nullptr) { | |||
| FreeMemoryFp16(group_convs, new_inputs, new_outputs); | |||
| MS_LOG(ERROR) << "Get new conv parameter failed."; | |||
| return nullptr; | |||
| } | |||
| // create new input for each group | |||
| auto in_tensor = CreateInputTensorFp16(mindspore::kNumberTypeFloat16, in_shape, infer_flag); | |||
| if (in_tensor == nullptr) { | |||
| delete new_conv_parameter; | |||
| FreeMemoryFp16(group_convs, new_inputs, new_outputs); | |||
| MS_LOG(ERROR) << "create input tensor failed."; | |||
| return nullptr; | |||
| } | |||
| new_inputs.emplace_back(in_tensor); | |||
| // create new weight | |||
| auto filter_tensor = CreateConstTensorFp16(inputs.at(kWeightIndex), filter_shape, i); | |||
| if (filter_tensor == nullptr) { | |||
| delete new_conv_parameter; | |||
| FreeMemoryFp16(group_convs, new_inputs, new_outputs); | |||
| MS_LOG(ERROR) << "create filter tensor failed."; | |||
| auto ret = group_conv_creator.GetSingleConvParam(new_conv_param, &new_inputs, &new_outputs, i); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "GetSingleConv for fp16 group conv failed."; | |||
| return nullptr; | |||
| } | |||
| new_inputs.emplace_back(filter_tensor); | |||
| // if has bias, create new bias | |||
| if (inputs.size() == 3) { | |||
| auto bias_tensor = CreateConstTensorFp16(inputs.at(kBiasIndex), bias_shape, i); | |||
| if (bias_tensor == nullptr) { | |||
| delete new_conv_parameter; | |||
| FreeMemoryFp16(group_convs, new_inputs, new_outputs); | |||
| MS_LOG(ERROR) << "create bias_tensor failed."; | |||
| return nullptr; | |||
| } | |||
| new_inputs.emplace_back(bias_tensor); | |||
| } | |||
| // create new output tensors | |||
| for (size_t j = 0; j < outputs.size(); ++j) { | |||
| auto out_tensor = CreateOutputTensorFp16(out_shape, outputs, infer_flag, j); | |||
| if (out_tensor == nullptr) { | |||
| delete new_conv_parameter; | |||
| FreeMemoryFp16(group_convs, new_inputs, new_outputs); | |||
| MS_LOG(ERROR) << "new out_tensor failed."; | |||
| return nullptr; | |||
| } | |||
| new_outputs.emplace_back(out_tensor); | |||
| } | |||
| group_convs.emplace_back( | |||
| CreateDelegateConvFp16(new_inputs, new_outputs, reinterpret_cast<OpParameter *>(new_conv_parameter), ctx)); | |||
| group_conv_creator.get_group_conv()->emplace_back(new (std::nothrow) ConvolutionDelegateFP16CPUKernel( | |||
| reinterpret_cast<OpParameter *>(new_conv_param), new_inputs, new_outputs, ctx)); | |||
| } | |||
| return new (std::nothrow) | |||
| GroupConvolutionFP16CPUKernel(op_parameter, inputs, outputs, ctx, group_convs, conv_param->group_); | |||
| GroupConvolutionFP16CPUKernel(op_parameter, inputs, outputs, ctx, *(group_conv_creator.get_group_conv()), | |||
| reinterpret_cast<ConvParameter *>(op_parameter)->group_); | |||
| } | |||
| /* creator func */ | |||
| kernel::LiteKernel *CpuConvFp16KernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | |||
| const InnerContext *ctx, const kernel::KernelKey &desc) { | |||
| MS_ASSERT(opParameter != nullptr); | |||
| MS_ASSERT(desc.type == schema::PrimitiveType_Conv2DFusion); | |||
| auto weight_data_type = inputs.at(1)->data_type(); | |||
| TypeId bias_data_type = weight_data_type; | |||
| if (inputs.size() == 3) { | |||
| bias_data_type = inputs.at(2)->data_type(); | |||
| } | |||
| if (weight_data_type != kNumberTypeFloat16 || bias_data_type != kNumberTypeFloat16) { | |||
| MS_LOG(ERROR) << "Convfp16 only support fp16 weight and fp16 bias."; | |||
| return nullptr; | |||
| } | |||
| auto conv_param = reinterpret_cast<ConvParameter *>(opParameter); | |||
| kernel::LiteKernel *kernel = nullptr; | |||
| bool is_depthwise = | |||
| (conv_param->group_ == conv_param->input_channel_ && conv_param->group_ == conv_param->output_channel_); | |||
| if (conv_param->group_ > 1 && !is_depthwise) { | |||
| kernel = CpuGroupConvFp16KernelCreator(inputs, outputs, opParameter, ctx); | |||
| if (conv_param->group_ == 1) { | |||
| kernel = new (std::nothrow) kernel::ConvolutionDelegateFP16CPUKernel(opParameter, inputs, outputs, ctx); | |||
| } else if (conv_param->group_ == conv_param->input_channel_ && conv_param->group_ == conv_param->output_channel_) { | |||
| kernel = CpuConvDwFp16KernelCreator(inputs, outputs, opParameter, ctx); | |||
| } else { | |||
| kernel = CreateDelegateConvFp16(inputs, outputs, opParameter, ctx); | |||
| kernel = CpuGroupConvFp16KernelCreator(inputs, outputs, opParameter, ctx); | |||
| } | |||
| if (kernel == nullptr) { | |||
| @@ -29,11 +29,8 @@ namespace mindspore::kernel { | |||
| class ConvolutionDelegateFP16CPUKernel : public LiteKernel { | |||
| public: | |||
| ConvolutionDelegateFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx, | |||
| TypeId origin_weight_data_type, TypeId origin_bias_data_type) | |||
| : LiteKernel(parameter, inputs, outputs, ctx), | |||
| origin_weight_data_type_(origin_weight_data_type), | |||
| origin_bias_data_type_(origin_bias_data_type) {} | |||
| const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx) | |||
| : LiteKernel(parameter, inputs, outputs, ctx) {} | |||
| ~ConvolutionDelegateFP16CPUKernel() override { | |||
| FreeCopiedData(); | |||
| if (fp16_conv_kernel_ != nullptr) { | |||
| @@ -56,14 +53,11 @@ class ConvolutionDelegateFP16CPUKernel : public LiteKernel { | |||
| void *origin_weight_ = nullptr; | |||
| void *origin_bias_ = nullptr; | |||
| kernel::LiteKernel *fp16_conv_kernel_ = nullptr; | |||
| TypeId origin_weight_data_type_; | |||
| TypeId origin_bias_data_type_; | |||
| }; | |||
| kernel::LiteKernel *CpuConvFp16KernelSelect(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter, | |||
| const lite::InnerContext *ctx, void *origin_weight, void *origin_bias, | |||
| TypeId origin_weight_data_type, TypeId origin_bias_data_type); | |||
| const lite::InnerContext *ctx, void *origin_weight, void *origin_bias); | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DELEGATE_FP16_H_ | |||
| @@ -36,23 +36,15 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() { | |||
| auto weight_tensor = in_tensors_.at(kWeightIndex); | |||
| int channel = weight_tensor->Batch(); | |||
| int pack_weight_size = channel * weight_tensor->Height() * weight_tensor->Width(); | |||
| auto origin_weight = reinterpret_cast<float16_t *>(weight_tensor->data_c()); | |||
| packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t))); | |||
| if (packed_weight_ == nullptr) { | |||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||
| return RET_ERROR; | |||
| } | |||
| auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteFilter(weight_tensor, origin_weight_); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "get execute filter data failed."; | |||
| return ret; | |||
| } | |||
| PackNCHWToNHWCFp16(execute_weight_, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), | |||
| PackNCHWToNHWCFp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), | |||
| weight_tensor->Batch()); | |||
| if (fp16_weight_ != nullptr) { | |||
| free(fp16_weight_); | |||
| fp16_weight_ = nullptr; | |||
| } | |||
| bias_data_ = reinterpret_cast<float16_t *>(malloc(channel * sizeof(float16_t))); | |||
| if (bias_data_ == nullptr) { | |||
| @@ -60,14 +52,10 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() { | |||
| return RET_ERROR; | |||
| } | |||
| memset(bias_data_, 0, channel * sizeof(float16_t)); | |||
| auto bias_fp16 = reinterpret_cast<float16_t *>(bias_data_); | |||
| if (in_tensors_.size() == kInputSize2) { | |||
| auto bias_tensor = in_tensors_.at(kBiasIndex); | |||
| MS_ASSERT(origin_bias_); | |||
| auto ori_bias = reinterpret_cast<float16_t *>(origin_bias_); | |||
| for (int i = 0; i < bias_tensor->ElementsNum(); i++) { | |||
| bias_fp16[i] = (float16_t)ori_bias[i]; | |||
| } | |||
| auto ori_bias = reinterpret_cast<float16_t *>(bias_tensor->data_c()); | |||
| memcpy(bias_data_, ori_bias, bias_tensor->Size()); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| @@ -95,8 +83,13 @@ int ConvolutionDepthwiseFp16CPUKernel::ReSize() { | |||
| } | |||
| int ConvolutionDepthwiseFp16CPUKernel::Execute(int task_id) { | |||
| ConvDwFp16(execute_output_, execute_input_, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), conv_param_, | |||
| task_id); | |||
| auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c()); | |||
| auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c()); | |||
| if (input_ptr == nullptr || output_ptr == nullptr) { | |||
| MS_LOG(ERROR) << "Convolution depthwise Fp16 get null tensor data!"; | |||
| return RET_ERROR; | |||
| } | |||
| ConvDwFp16(output_ptr, input_ptr, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), conv_param_, task_id); | |||
| return RET_OK; | |||
| } | |||
| @@ -111,8 +104,6 @@ static int ConvDwFp16Run(void *cdata, int task_id) { | |||
| } | |||
| int ConvolutionDepthwiseFp16CPUKernel::Run() { | |||
| ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); | |||
| auto ret = ParallelLaunch(this->context_->thread_pool_, ConvDwFp16Run, this, conv_param_->thread_num_); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "ConvDwFp16Run error: error_code[" << ret << "]"; | |||
| @@ -19,7 +19,7 @@ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h" | |||
| #include "src/runtime/kernel/arm/base/convolution_base.h" | |||
| #include "nnacl/fp16/conv_depthwise_fp16.h" | |||
| #ifdef __cplusplus | |||
| @@ -32,15 +32,11 @@ void ConvDwFp16(float16_t *output_data, const float16_t *input_data, const float | |||
| #endif | |||
| namespace mindspore::kernel { | |||
| class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel { | |||
| class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel { | |||
| public: | |||
| ConvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx, | |||
| void *origin_weight, void *origin_bias, TypeId origin_weight_data_type, | |||
| TypeId origin_bias_data_type) | |||
| : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, origin_weight_data_type, origin_bias_data_type), | |||
| origin_weight_(origin_weight), | |||
| origin_bias_(origin_bias) {} | |||
| const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx) | |||
| : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {} | |||
| ~ConvolutionDepthwiseFp16CPUKernel() override; | |||
| int Init() override; | |||
| @@ -51,8 +47,6 @@ class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel { | |||
| int Execute(int task_id); | |||
| private: | |||
| void *origin_weight_; // do not free | |||
| void *origin_bias_; // do not free | |||
| float16_t *packed_weight_ = nullptr; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -62,14 +62,15 @@ int ConvolutionDepthwiseSWFp16CPUKernel::InitWeightBias() { | |||
| auto weight_tensor = in_tensors_.at(kWeightIndex); | |||
| int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM); | |||
| int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width(); | |||
| auto origin_weight = reinterpret_cast<float16_t *>(weight_tensor->data_c()); | |||
| packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t))); | |||
| if (packed_weight_ == nullptr) { | |||
| MS_LOG(ERROR) << "Malloc buffer failed."; | |||
| return RET_ERROR; | |||
| } | |||
| PackNCHWFp16ToNC8HW8Fp16(reinterpret_cast<float16_t *>(origin_weight_), packed_weight_, 1, | |||
| weight_tensor->Height() * weight_tensor->Width(), weight_tensor->Batch()); | |||
| PackNCHWFp16ToNC8HW8Fp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), | |||
| weight_tensor->Batch()); | |||
| bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t))); | |||
| if (bias_data_ == nullptr) { | |||
| @@ -77,14 +78,10 @@ int ConvolutionDepthwiseSWFp16CPUKernel::InitWeightBias() { | |||
| return RET_ERROR; | |||
| } | |||
| memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t)); | |||
| auto bias_fp16 = reinterpret_cast<float16_t *>(bias_data_); | |||
| if (in_tensors_.size() == kInputSize2) { | |||
| auto bias_tensor = in_tensors_.at(kBiasIndex); | |||
| MS_ASSERT(origin_bias_); | |||
| auto ori_bias = reinterpret_cast<float16_t *>(origin_bias_); | |||
| for (int i = 0; i < bias_tensor->ElementsNum(); i++) { | |||
| bias_fp16[i] = (float16_t)ori_bias[i]; | |||
| } | |||
| auto ori_bias = reinterpret_cast<float16_t *>(bias_tensor->data_c()); | |||
| memcpy(bias_data_, ori_bias, bias_tensor->Size()); | |||
| } | |||
| conv_param_->thread_num_ = MSMIN(thread_count_, OC8); | |||
| @@ -143,14 +140,19 @@ int ConvolutionDepthwiseSWFp16CPUKernel::Run() { | |||
| return ret; | |||
| } | |||
| ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); | |||
| auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c()); | |||
| auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c()); | |||
| if (input_ptr == nullptr || output_ptr == nullptr) { | |||
| MS_LOG(ERROR) << "Convolution depthwise Fp16 get null tensor data!"; | |||
| return RET_ERROR; | |||
| } | |||
| if (need_align_) { | |||
| PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_, | |||
| PackNHWCToNHWC8Fp16(input_ptr, packed_input_, conv_param_->input_batch_, | |||
| conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); | |||
| } else { | |||
| packed_input_ = execute_input_; | |||
| packed_output_ = execute_output_; | |||
| packed_input_ = input_ptr; | |||
| packed_output_ = output_ptr; | |||
| } | |||
| ret = ParallelLaunch(this->context_->thread_pool_, ConvDwSWFp16Run, this, conv_param_->thread_num_); | |||
| @@ -158,7 +160,7 @@ int ConvolutionDepthwiseSWFp16CPUKernel::Run() { | |||
| MS_LOG(ERROR) << "ConvDwSWFp16Run error: error_code[" << ret << "]"; | |||
| } | |||
| if (need_align_) { | |||
| PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_, | |||
| PackNHWC8ToNHWCFp16(packed_output_, output_ptr, conv_param_->output_batch_, | |||
| conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); | |||
| } | |||
| @@ -19,7 +19,7 @@ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h" | |||
| #include "src/runtime/kernel/arm/base/convolution_base.h" | |||
| #include "nnacl/fp16/conv_depthwise_fp16.h" | |||
| #ifdef __cplusplus | |||
| @@ -33,15 +33,11 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo | |||
| #endif | |||
| namespace mindspore::kernel { | |||
| class ConvolutionDepthwiseSWFp16CPUKernel : public ConvolutionBaseFP16CPUKernel { | |||
| class ConvolutionDepthwiseSWFp16CPUKernel : public ConvolutionBaseCPUKernel { | |||
| public: | |||
| ConvolutionDepthwiseSWFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx, | |||
| void *origin_weight, void *origin_bias, TypeId origin_weight_data_type, | |||
| TypeId origin_bias_data_type) | |||
| : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, origin_weight_data_type, origin_bias_data_type), | |||
| origin_weight_(origin_weight), | |||
| origin_bias_(origin_bias) {} | |||
| const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx) | |||
| : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {} | |||
| ~ConvolutionDepthwiseSWFp16CPUKernel() override; | |||
| int Init() override; | |||
| @@ -54,8 +50,6 @@ class ConvolutionDepthwiseSWFp16CPUKernel : public ConvolutionBaseFP16CPUKernel | |||
| private: | |||
| void FreePackedInputOutput(); | |||
| void *origin_weight_; // do not free | |||
| void *origin_bias_; // do not free | |||
| SlidingWindowParam *sliding_ = nullptr; | |||
| float16_t *packed_weight_ = nullptr; | |||
| float16_t *packed_input_ = nullptr; | |||
| @@ -45,8 +45,7 @@ int ConvolutionFP16CPUKernel::InitWeightBias() { | |||
| return RET_ERROR; | |||
| } | |||
| memset(packed_weight_, 0, pack_weight_size * sizeof(float16_t)); | |||
| RowMajor2Col8MajorFp16(origin_weight_, packed_weight_, out_channel, in_channel * kernel_plane, | |||
| origin_weight_data_type_ == kNumberTypeFloat32); | |||
| RowMajor2Col8MajorFp16(origin_weight_, packed_weight_, out_channel, in_channel * kernel_plane, false); | |||
| // init bias | |||
| bias_data_ = malloc(oc8 * sizeof(float16_t)); | |||
| @@ -56,14 +55,7 @@ int ConvolutionFP16CPUKernel::InitWeightBias() { | |||
| } | |||
| memset(bias_data_, 0, oc8 * sizeof(float16_t)); | |||
| if (in_tensors_.size() == kInputSize2) { | |||
| if (origin_bias_data_type_ == kNumberTypeFloat16) { | |||
| memcpy(bias_data_, origin_bias_, out_channel * sizeof(float16_t)); | |||
| } else { | |||
| MS_LOG(ERROR) << "Conv fp16 only support fp16 bias"; | |||
| return RET_ERROR; | |||
| } | |||
| } else { | |||
| MS_ASSERT(in_tensors_.size() == kInputSize1); | |||
| memcpy(bias_data_, origin_bias_, out_channel * sizeof(float16_t)); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| @@ -123,8 +115,14 @@ int ConvolutionFP16CPUKernel::ReSize() { | |||
| } | |||
| int ConvolutionFP16CPUKernel::RunImpl(int task_id) { | |||
| ConvFp16(execute_input_, packed_input_, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), col_major_input_, | |||
| execute_output_, task_id, conv_param_); | |||
| auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c()); | |||
| auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c()); | |||
| if (input_ptr == nullptr || output_ptr == nullptr) { | |||
| MS_LOG(ERROR) << "Convolution Fp16 get null tensor data!"; | |||
| return RET_ERROR; | |||
| } | |||
| ConvFp16(input_ptr, packed_input_, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), col_major_input_, | |||
| output_ptr, task_id, conv_param_); | |||
| return RET_OK; | |||
| } | |||
| @@ -139,8 +137,6 @@ static int ConvolutionFp16Impl(void *cdata, int task_id) { | |||
| } | |||
| int ConvolutionFP16CPUKernel::Run() { | |||
| ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); | |||
| auto ret = InitTmpBuffer(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init tmp buffer failed."; | |||
| @@ -20,15 +20,15 @@ | |||
| #include <arm_neon.h> | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h" | |||
| #include "src/runtime/kernel/arm/base/convolution_base.h" | |||
| namespace mindspore::kernel { | |||
| class ConvolutionFP16CPUKernel : public ConvolutionBaseFP16CPUKernel { | |||
| class ConvolutionFP16CPUKernel : public ConvolutionBaseCPUKernel { | |||
| public: | |||
| ConvolutionFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx, void *origin_weight, | |||
| void *origin_bias, TypeId origin_weight_data_type, TypeId origin_bias_data_type) | |||
| : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, origin_weight_data_type, origin_bias_data_type), | |||
| void *origin_bias) | |||
| : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx), | |||
| origin_weight_(origin_weight), | |||
| origin_bias_(origin_bias) {} | |||
| ~ConvolutionFP16CPUKernel() override { | |||
| @@ -33,9 +33,9 @@ int ConvolutionWinogradFP16CPUKernel::WinogradFilterTransformFp16(const float16_ | |||
| } | |||
| int ConvolutionWinogradFP16CPUKernel::InitWeightBias() { | |||
| auto filter_tensor = in_tensors_.at(kWeightIndex); | |||
| int in_channel = filter_tensor->Channel(); | |||
| int out_channel = filter_tensor->Batch(); | |||
| auto weight_tensor = in_tensors_.at(kWeightIndex); | |||
| int in_channel = weight_tensor->Channel(); | |||
| int out_channel = weight_tensor->Batch(); | |||
| conv_param_->input_channel_ = in_channel; | |||
| conv_param_->output_channel_ = out_channel; | |||
| int oc_block_num = UP_DIV(out_channel, col_tile_); | |||
| @@ -65,21 +65,11 @@ int ConvolutionWinogradFP16CPUKernel::InitWeightBias() { | |||
| MS_LOG(ERROR) << "get matrix g from CookToomFilter failed."; | |||
| return ret; | |||
| } | |||
| ret = GetExecuteFilter(filter_tensor, origin_weight_); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "get execute filter failed."; | |||
| return ret; | |||
| } | |||
| ret = WinogradFilterTransformFp16(execute_weight_, matrix_g, matrix_gt, col_tile_); | |||
| ret = WinogradFilterTransformFp16(reinterpret_cast<float16_t *>(origin_weight_), matrix_g, matrix_gt, col_tile_); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "winograd filter transform failed."; | |||
| return ret; | |||
| } | |||
| // if fp16_weight is malloced, free it. It will not be used in runtime anymore. | |||
| if (fp16_weight_ != nullptr) { | |||
| free(fp16_weight_); | |||
| fp16_weight_ = nullptr; | |||
| } | |||
| // init bias | |||
| bias_data_ = malloc(oc_block_num * col_tile_ * sizeof(float16_t)); | |||
| @@ -88,16 +78,8 @@ int ConvolutionWinogradFP16CPUKernel::InitWeightBias() { | |||
| return RET_ERROR; | |||
| } | |||
| memset(bias_data_, 0, oc_block_num * col_tile_ * sizeof(float16_t)); | |||
| if (in_tensors_.size() == kInputSize2) { | |||
| if (origin_bias_data_type_ == kNumberTypeFloat16) { | |||
| memcpy(bias_data_, origin_bias_, out_channel * sizeof(float16_t)); | |||
| } else { | |||
| MS_LOG(ERROR) << "Conv winograd fp16 only support fp16 bias"; | |||
| return RET_ERROR; | |||
| } | |||
| } else { | |||
| MS_ASSERT(in_tensors_.size() == kInputSize1); | |||
| memcpy(bias_data_, origin_bias_, out_channel * sizeof(float16_t)); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| @@ -202,7 +184,13 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() { | |||
| } | |||
| int ConvolutionWinogradFP16CPUKernel::RunImpl(int task_id) { | |||
| ConvWinogardFp16(execute_input_, trans_weight_, reinterpret_cast<const float16_t *>(bias_data_), execute_output_, | |||
| auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c()); | |||
| auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c()); | |||
| if (input_ptr == nullptr || output_ptr == nullptr) { | |||
| MS_LOG(ERROR) << "Convolution Winograd Fp16 get null tensor data!"; | |||
| return RET_ERROR; | |||
| } | |||
| ConvWinogardFp16(input_ptr, trans_weight_, reinterpret_cast<const float16_t *>(bias_data_), output_ptr, | |||
| tmp_buffer_address_list_, task_id, conv_param_, in_func_, out_func_); | |||
| return RET_OK; | |||
| } | |||
| @@ -218,8 +206,6 @@ static int ConvolutionWinogradFp16Impl(void *cdata, int task_id) { | |||
| } | |||
| int ConvolutionWinogradFP16CPUKernel::Run() { | |||
| ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); | |||
| auto ret = InitTmpBuffer(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init tmp buffer failed."; | |||
| @@ -20,20 +20,19 @@ | |||
| #include <arm_neon.h> | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h" | |||
| #include "src/runtime/kernel/arm/base/convolution_base.h" | |||
| #include "nnacl/fp16/conv_fp16.h" | |||
| #include "nnacl/fp16/winograd_utils_fp16.h" | |||
| #include "src/common/utils.h" | |||
| #include "nnacl/base/minimal_filtering_generator.h" | |||
| namespace mindspore::kernel { | |||
| class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel { | |||
| class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseCPUKernel { | |||
| public: | |||
| ConvolutionWinogradFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx, int out_unit, | |||
| void *origin_weight, void *origin_bias, TypeId origin_weight_data_type, | |||
| TypeId origin_bias_data_type) | |||
| : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, origin_weight_data_type, origin_bias_data_type), | |||
| void *origin_weight, void *origin_bias) | |||
| : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx), | |||
| output_unit_(out_unit), | |||
| origin_weight_(origin_weight), | |||
| origin_bias_(origin_bias) {} | |||
| @@ -73,7 +73,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() { | |||
| // init weight: o, h, w, i; o == group, i == 1 | |||
| auto weight_tensor = in_tensors_.at(kWeightIndex); | |||
| int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM); | |||
| auto origin_weight = reinterpret_cast<float16_t *>(weight_tensor->MutableData()); | |||
| auto origin_weight = reinterpret_cast<float16_t *>(weight_tensor->data_c()); | |||
| int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width(); | |||
| packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t))); | |||
| @@ -92,10 +92,8 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() { | |||
| memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t)); | |||
| if (in_tensors_.size() == kInputSize2) { | |||
| auto bias_tensor = in_tensors_.at(kBiasIndex); | |||
| auto ori_bias = reinterpret_cast<float16_t *>(bias_tensor->MutableData()); | |||
| for (int i = 0; i < bias_tensor->ElementsNum(); i++) { | |||
| reinterpret_cast<float16_t *>(bias_data_)[i] = ori_bias[i]; | |||
| } | |||
| auto ori_bias = reinterpret_cast<float16_t *>(bias_tensor->data_c()); | |||
| memcpy(bias_data_, ori_bias, bias_tensor->Size()); | |||
| } | |||
| conv_param_->thread_num_ = MSMIN(thread_count_, OC8); | |||
| @@ -157,18 +155,23 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() { | |||
| return RET_ERROR; | |||
| } | |||
| ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); | |||
| auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c()); | |||
| auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c()); | |||
| if (input_ptr == nullptr || output_ptr == nullptr) { | |||
| MS_LOG(ERROR) << "Deconvolution depthwise Fp16 get null tensor data!"; | |||
| return RET_ERROR; | |||
| } | |||
| if (need_align_) { | |||
| PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_, | |||
| PackNHWCToNHWC8Fp16(input_ptr, packed_input_, conv_param_->input_batch_, | |||
| conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); | |||
| } else { | |||
| packed_input_ = execute_input_; | |||
| packed_input_ = input_ptr; | |||
| } | |||
| if (!need_align_) { | |||
| memset(execute_output_, 0, out_tensors_.at(kOutputIndex)->ElementsNum() * sizeof(float16_t)); | |||
| packed_output_ = execute_output_; | |||
| memset(output_ptr, 0, out_tensors_.at(kOutputIndex)->ElementsNum() * sizeof(float16_t)); | |||
| packed_output_ = output_ptr; | |||
| } | |||
| ret = ParallelLaunch(this->context_->thread_pool_, DeconvDwFp16Run, this, conv_param_->thread_num_); | |||
| if (ret != RET_OK) { | |||
| @@ -176,7 +179,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() { | |||
| } | |||
| if (need_align_) { | |||
| PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_, | |||
| PackNHWC8ToNHWCFp16(packed_output_, output_ptr, conv_param_->output_batch_, | |||
| conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); | |||
| } | |||
| @@ -19,7 +19,7 @@ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h" | |||
| #include "src/runtime/kernel/arm/base/convolution_base.h" | |||
| #include "nnacl/fp16/conv_depthwise_fp16.h" | |||
| #ifdef __cplusplus | |||
| @@ -34,12 +34,11 @@ void ComputeStrides(int *shape, int *strides, int ndim); | |||
| #endif | |||
| namespace mindspore::kernel { | |||
| class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel { | |||
| class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel { | |||
| public: | |||
| DeconvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx, | |||
| TypeId origin_weight_data_type, TypeId origin_bias_data_type) | |||
| : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, origin_weight_data_type, origin_bias_data_type) {} | |||
| const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx) | |||
| : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {} | |||
| ~DeconvolutionDepthwiseFp16CPUKernel() override; | |||
| int Init() override; | |||
| @@ -32,9 +32,9 @@ DeConvolutionFp16CPUKernel::~DeConvolutionFp16CPUKernel() { | |||
| delete matmul_param_; | |||
| matmul_param_ = nullptr; | |||
| } | |||
| if (execute_weight_ != nullptr) { | |||
| free(execute_weight_); | |||
| execute_weight_ = nullptr; | |||
| if (pack_weight_ != nullptr) { | |||
| free(pack_weight_); | |||
| pack_weight_ = nullptr; | |||
| } | |||
| return; | |||
| } | |||
| @@ -78,17 +78,17 @@ int DeConvolutionFp16CPUKernel::InitWeightBias() { | |||
| } | |||
| size_t weight_pack_size = input_channel * kernel_w * kernel_h * UP_ROUND(output_channel, C8NUM) * sizeof(float16_t); | |||
| execute_weight_ = reinterpret_cast<float16_t *>(malloc(weight_pack_size)); | |||
| if (execute_weight_ == nullptr) { | |||
| MS_LOG(ERROR) << "deconv malloc execute_weight_ error!"; | |||
| pack_weight_ = reinterpret_cast<float16_t *>(malloc(weight_pack_size)); | |||
| if (pack_weight_ == nullptr) { | |||
| MS_LOG(ERROR) << "deconv malloc pack_weight_ error!"; | |||
| return RET_ERROR; | |||
| } | |||
| memset(execute_weight_, 0, weight_pack_size); | |||
| memset(pack_weight_, 0, weight_pack_size); | |||
| if (in_tensors_.at(1)->data_type() != kNumberTypeFloat16) { | |||
| MS_LOG(ERROR) << "deconv fp16 kernel require fp16 weight"; | |||
| return RET_ERROR; | |||
| } | |||
| PackNHWCFp16ToC8HWN8Fp16(reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()), execute_weight_, input_channel, | |||
| PackNHWCFp16ToC8HWN8Fp16(reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()), pack_weight_, input_channel, | |||
| kernel_w * kernel_h, output_channel); | |||
| return RET_OK; | |||
| } | |||
| @@ -169,7 +169,7 @@ int DeConvolutionFp16CPUKernel::DoDeconv(int task_id) { | |||
| } | |||
| auto tmp_buf = tmp_buffer_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->row_16_; | |||
| MatMulFp16(pack_input_, execute_weight_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->deep_, | |||
| MatMulFp16(pack_input_, pack_weight_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->deep_, | |||
| tmp_buf, nullptr, ActType_No, matmul_param_->deep_, matmul_param_->row_, oc * C8NUM * kernel_plane_, 0, | |||
| OutType_C8); | |||
| @@ -197,7 +197,12 @@ int DeConvolutionFp16CPUKernel::Init() { | |||
| } | |||
| int DeConvolutionFp16CPUKernel::Run() { | |||
| ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); | |||
| auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c()); | |||
| auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c()); | |||
| if (input_ptr == nullptr || output_ptr == nullptr) { | |||
| MS_LOG(ERROR) << "DeConvolution Fp16 get null tensor data!"; | |||
| return RET_ERROR; | |||
| } | |||
| int error_code = InitRunBuf(); | |||
| if (error_code != RET_OK) { | |||
| @@ -207,8 +212,8 @@ int DeConvolutionFp16CPUKernel::Run() { | |||
| } | |||
| for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) { | |||
| batch_input_ = execute_input_ + batch_index * conv_param_->input_channel_ * input_plane_; | |||
| batch_output_ = execute_output_ + batch_index * conv_param_->output_channel_ * output_plane_; | |||
| batch_input_ = input_ptr + batch_index * conv_param_->input_channel_ * input_plane_; | |||
| batch_output_ = output_ptr + batch_index * conv_param_->output_channel_ * output_plane_; | |||
| RowMajor2Col16MajorFp16Opt(batch_input_, pack_input_, input_plane_, conv_param_->input_channel_); | |||
| @@ -228,25 +233,17 @@ kernel::LiteKernel *CpuDeConvFp16KernelCreator(const std::vector<lite::Tensor *> | |||
| MS_ASSERT(op_parameter != nullptr); | |||
| MS_ASSERT(desc.type == schema::PrimitiveType_Conv2dTransposeFusion); | |||
| auto weight_data_type = inputs.at(1)->data_type(); | |||
| TypeId bias_data_type = kTypeUnknown; | |||
| if (inputs.size() == 3) { | |||
| bias_data_type = inputs.at(2)->data_type(); | |||
| } | |||
| kernel::LiteKernel *kernel = nullptr; | |||
| auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter); | |||
| if (conv_param->group_ == 1) { | |||
| if ((conv_param->stride_h_ != 1 || conv_param->stride_w_ != 1) && | |||
| (conv_param->dilation_h_ == 1 && conv_param->dilation_w_ == 1)) { | |||
| kernel = new (std::nothrow) | |||
| kernel::DeConvWinogradFp16CPUKernel(op_parameter, inputs, outputs, ctx, weight_data_type, bias_data_type); | |||
| kernel = new (std::nothrow) kernel::DeConvWinogradFp16CPUKernel(op_parameter, inputs, outputs, ctx); | |||
| } else { | |||
| kernel = new (std::nothrow) | |||
| kernel::DeConvolutionFp16CPUKernel(op_parameter, inputs, outputs, ctx, weight_data_type, bias_data_type); | |||
| kernel = new (std::nothrow) kernel::DeConvolutionFp16CPUKernel(op_parameter, inputs, outputs, ctx); | |||
| } | |||
| } else if (conv_param->group_ == conv_param->input_channel_ && conv_param->group_ == conv_param->output_channel_) { | |||
| kernel = new (std::nothrow) | |||
| DeconvolutionDepthwiseFp16CPUKernel(op_parameter, inputs, outputs, ctx, weight_data_type, bias_data_type); | |||
| kernel = new (std::nothrow) DeconvolutionDepthwiseFp16CPUKernel(op_parameter, inputs, outputs, ctx); | |||
| } | |||
| if (kernel == nullptr) { | |||
| @@ -21,15 +21,14 @@ | |||
| #include "nnacl/fp16/deconv_fp16.h" | |||
| #include "nnacl/fp16/matmul_fp16.h" | |||
| #include "src/kernel_registry.h" | |||
| #include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h" | |||
| #include "src/runtime/kernel/arm/base/convolution_base.h" | |||
| namespace mindspore::kernel { | |||
| class DeConvolutionFp16CPUKernel : public ConvolutionBaseFP16CPUKernel { | |||
| class DeConvolutionFp16CPUKernel : public ConvolutionBaseCPUKernel { | |||
| public: | |||
| DeConvolutionFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx, | |||
| TypeId origin_weight_data_type, TypeId origin_bias_data_type) | |||
| : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, origin_weight_data_type, origin_bias_data_type) {} | |||
| const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx) | |||
| : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {} | |||
| ~DeConvolutionFp16CPUKernel() override; | |||
| int Init() override; | |||
| int Run() override; | |||
| @@ -52,6 +51,7 @@ class DeConvolutionFp16CPUKernel : public ConvolutionBaseFP16CPUKernel { | |||
| int thread_count_; | |||
| int thread_stride_; | |||
| float16_t *pack_input_; | |||
| float16_t *pack_weight_; | |||
| float16_t *pack_output_; | |||
| float16_t *tmp_buffer_; | |||
| float16_t *batch_input_; | |||
| @@ -317,15 +317,11 @@ int DeConvWinogradFp16CPUKernel::InitComputeParam() { | |||
| int DeConvWinogradFp16CPUKernel::InitDataParam() { | |||
| /* unit data : weight & winograd data*/ | |||
| auto weight_tensor = in_tensors_.at(kWeightIndex); | |||
| auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteFilter(weight_tensor, weight_tensor->data_c()); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Get Execute filter failed."; | |||
| return ret; | |||
| } | |||
| auto origin_weight = reinterpret_cast<float16_t *>(weight_tensor->data_c()); | |||
| for (int i = 0; i < deconv_param_->compute_size_; i++) { | |||
| DeConvComputeUnit *unit = &deconv_param_->compute_units_[i]; | |||
| ret = PackDeConvWgDataFp16(execute_weight_, unit, conv_param_, deconv_param_); | |||
| auto ret = PackDeConvWgDataFp16(origin_weight, unit, conv_param_, deconv_param_); | |||
| if (ret != RET_OK) { | |||
| return ret; | |||
| } | |||
| @@ -338,18 +334,11 @@ int DeConvWinogradFp16CPUKernel::InitDataParam() { | |||
| return RET_ERROR; | |||
| } | |||
| memset(bias_data_, 0, deconv_param_->oc_up4_ * sizeof(float16_t)); | |||
| auto fp16_bias_data = reinterpret_cast<float16_t *>(bias_data_); | |||
| if (in_tensors_.size() == 3 && in_tensors_.at(kBiasIndex)->shape().size() == 1 && | |||
| in_tensors_.at(kBiasIndex)->DimensionSize(0) == conv_param_->output_channel_) { | |||
| auto src_bias = reinterpret_cast<float16_t *>(in_tensors_.at(kBiasIndex)->MutableData()); | |||
| MS_ASSERT(src_bias); | |||
| for (int i = 0; i < conv_param_->output_channel_; ++i) { | |||
| fp16_bias_data[i] = (float16_t)src_bias[i]; | |||
| } | |||
| } else { | |||
| MS_ASSERT(in_tensors_.size() == kInputSize1); | |||
| auto src_bias = reinterpret_cast<float16_t *>(in_tensors_.at(kBiasIndex)->data_c()); | |||
| memcpy(bias_data_, src_bias, in_tensors_.at(kBiasIndex)->Size()); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| @@ -391,11 +380,16 @@ int DeConvWinogradFp16CPUKernel::Init() { | |||
| } | |||
| int DeConvWinogradFp16CPUKernel::Run() { | |||
| ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); | |||
| auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c()); | |||
| auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c()); | |||
| if (input_ptr == nullptr || output_ptr == nullptr) { | |||
| MS_LOG(ERROR) << "Deconvolution Winograd Fp16 get null tensor data!"; | |||
| return RET_ERROR; | |||
| } | |||
| for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) { | |||
| nhwc_input_ = execute_input_ + batch_index * deconv_param_->input_plane_ * conv_param_->input_channel_; | |||
| nhwc_output_ = execute_output_ + batch_index * deconv_param_->output_plane_ * conv_param_->output_channel_; | |||
| nhwc_input_ = input_ptr + batch_index * deconv_param_->input_plane_ * conv_param_->input_channel_; | |||
| nhwc_output_ = output_ptr + batch_index * deconv_param_->output_plane_ * conv_param_->output_channel_; | |||
| ::memset(nc4hw4_output_, 0, deconv_param_->output_plane_ * deconv_param_->oc_div4_ * C4NUM * sizeof(float16_t)); | |||
| ParallelLaunch(this->context_->thread_pool_, DeConvWgFp16Run, this, deconv_param_->thread_num_); | |||
| @@ -22,15 +22,14 @@ | |||
| #include "nnacl/fp16/common_func_fp16.h" | |||
| #include "nnacl/fp16/deconv_winograd_fp16.h" | |||
| #include "nnacl/fp16/pack_fp16.h" | |||
| #include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h" | |||
| #include "src/runtime/kernel/arm/base/convolution_base.h" | |||
| namespace mindspore::kernel { | |||
| class DeConvWinogradFp16CPUKernel : public ConvolutionBaseFP16CPUKernel { | |||
| class DeConvWinogradFp16CPUKernel : public ConvolutionBaseCPUKernel { | |||
| public: | |||
| DeConvWinogradFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx, | |||
| TypeId origin_weight_data_type, TypeId origin_bias_data_type) | |||
| : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, origin_weight_data_type, origin_bias_data_type) {} | |||
| const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx) | |||
| : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {} | |||
| ~DeConvWinogradFp16CPUKernel() override; | |||
| int Init() override; | |||
| int Run() override; | |||
| @@ -24,6 +24,7 @@ | |||
| #include "src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow_fp32.h" | |||
| #include "src/runtime/kernel/arm/fp32/convolution_depthwise_indirect_fp32.h" | |||
| #include "src/runtime/kernel/arm/base/group_convolution_creator.h" | |||
| #include "src/runtime/kernel/arm/base/group_convolution.h" | |||
| #include "schema/model_generated.h" | |||
| #include "include/errorcode.h" | |||
| @@ -161,9 +162,9 @@ kernel::LiteKernel *ConvolutionDelegateCPUKernel::CpuConvFp32KernelSelect() { | |||
| return kernel; | |||
| } | |||
| kernel::LiteKernel *DispatchConvDw(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | |||
| const InnerContext *ctx) { | |||
| kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | |||
| const InnerContext *ctx) { | |||
| auto conv_param = reinterpret_cast<ConvParameter *>(opParameter); | |||
| kernel::LiteKernel *kernel = nullptr; | |||
| if (opParameter != nullptr && opParameter->infer_flag_) { | |||
| @@ -187,6 +188,29 @@ kernel::LiteKernel *DispatchConvDw(const std::vector<lite::Tensor *> &inputs, | |||
| return kernel; | |||
| } | |||
| kernel::LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter, | |||
| const lite::InnerContext *ctx) { | |||
| auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter); | |||
| GroupConvCreator group_conv_creator(inputs, outputs, op_parameter, ctx, false, kNumberTypeFloat32); | |||
| group_conv_creator.SetShapeOfTensors(); | |||
| for (int i = 0; i < conv_param->group_; ++i) { | |||
| ConvParameter *new_conv_param = CreateNewConvParameter(conv_param); | |||
| std::vector<lite::Tensor *> new_inputs; | |||
| std::vector<lite::Tensor *> new_outputs; | |||
| auto ret = group_conv_creator.GetSingleConvParam(new_conv_param, &new_inputs, &new_outputs, i); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "GetSingleConv for fp32 group conv failed."; | |||
| return nullptr; | |||
| } | |||
| group_conv_creator.get_group_conv()->emplace_back(new (std::nothrow) ConvolutionDelegateCPUKernel( | |||
| reinterpret_cast<OpParameter *>(new_conv_param), new_inputs, new_outputs, ctx)); | |||
| } | |||
| return new (std::nothrow) | |||
| GroupConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, *(group_conv_creator.get_group_conv()), | |||
| reinterpret_cast<ConvParameter *>(op_parameter)->group_); | |||
| } | |||
| /* creator func */ | |||
| kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter, | |||
| @@ -200,7 +224,7 @@ kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::Tensor *> & | |||
| if (conv_param->group_ == 1) { | |||
| kernel = new (std::nothrow) kernel::ConvolutionDelegateCPUKernel(op_parameter, inputs, outputs, ctx); | |||
| } else if (conv_param->group_ == conv_param->input_channel_ && conv_param->group_ == conv_param->output_channel_) { | |||
| kernel = DispatchConvDw(inputs, outputs, op_parameter, ctx); | |||
| kernel = CpuConvDwFp32KernelCreator(inputs, outputs, op_parameter, ctx); | |||
| } else { | |||
| kernel = CpuGroupConvFp32KernelCreator(inputs, outputs, op_parameter, ctx); | |||
| } | |||
| @@ -21,6 +21,7 @@ | |||
| #include "src/runtime/kernel/arm/int8/convolution_depthwise_int8.h" | |||
| #include "src/runtime/kernel/arm/int8/convolution_depthwise_3x3_int8.h" | |||
| #include "src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h" | |||
| #include "src/runtime/kernel/arm/int8/group_convolution_int8.h" | |||
| #include "src/runtime/kernel/arm/base/group_convolution_creator.h" | |||
| #include "schema/model_generated.h" | |||
| #include "src/kernel_registry.h" | |||
| @@ -83,6 +84,29 @@ kernel::LiteKernel *CpuConvInt8KernelSelect(const std::vector<lite::Tensor *> &i | |||
| return kernel; | |||
| } | |||
| kernel::LiteKernel *CpuGroupConvInt8KernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter, | |||
| const lite::InnerContext *ctx, int group) { | |||
| auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter); | |||
| GroupConvCreator group_conv_creator(inputs, outputs, op_parameter, ctx, true, kNumberTypeInt8); | |||
| group_conv_creator.SetShapeOfTensors(); | |||
| for (int i = 0; i < conv_param->group_; ++i) { | |||
| ConvParameter *new_conv_param = CreateNewConvParameter(conv_param); | |||
| std::vector<lite::Tensor *> new_inputs; | |||
| std::vector<lite::Tensor *> new_outputs; | |||
| auto ret = group_conv_creator.GetSingleConvParam(new_conv_param, &new_inputs, &new_outputs, i); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "GetSingleConv for int8 group conv failed."; | |||
| return nullptr; | |||
| } | |||
| group_conv_creator.CopyQuantParam(&new_inputs); | |||
| group_conv_creator.get_group_conv()->emplace_back( | |||
| CpuConvInt8KernelSelect(new_inputs, new_outputs, reinterpret_cast<OpParameter *>(new_conv_param), ctx)); | |||
| } | |||
| return new (std::nothrow) | |||
| GroupConvolutionInt8CPUKernel(op_parameter, inputs, outputs, ctx, *(group_conv_creator.get_group_conv()), group); | |||
| } | |||
| kernel::LiteKernel *CpuConvInt8KernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter, | |||
| const InnerContext *ctx, const kernel::KernelKey &desc) { | |||