Merge pull request !4179 from ling/conv1x1tags/v0.7.0-beta
| @@ -43,17 +43,22 @@ int Convolution1x1FP16CPUKernel::InitMatmulParam() { | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| Convolution1x1FP16CPUKernel::~Convolution1x1FP16CPUKernel() { | |||||
| FreeTmpBuffer(); | |||||
| if (weight_ptr_ != nullptr) { | |||||
| free(weight_ptr_); | |||||
| weight_ptr_ = nullptr; | |||||
| } | |||||
| if (matmul_param_ != nullptr) { | |||||
| delete matmul_param_; | |||||
| matmul_param_ = nullptr; | |||||
| } | |||||
| return; | |||||
| } | |||||
| int Convolution1x1FP16CPUKernel::InitConv1x1Param() { | int Convolution1x1FP16CPUKernel::InitConv1x1Param() { | ||||
| pre_trans_input_ = (conv_param_->pad_h_ != 0 || conv_param_->pad_w_ != 0 || conv_param_->stride_h_ != 1 || | pre_trans_input_ = (conv_param_->pad_h_ != 0 || conv_param_->pad_w_ != 0 || conv_param_->stride_h_ != 1 || | ||||
| conv_param_->stride_w_ != 1); | conv_param_->stride_w_ != 1); | ||||
| if (pre_trans_input_) { | |||||
| input_ptr_ = reinterpret_cast<float16_t *>(malloc(matmul_param_->row_ * matmul_param_->deep_ * sizeof(float16_t))); | |||||
| if (input_ptr_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Conv1x1 Malloc input_ptr_ error!"; | |||||
| return RET_MEMORY_FAILED; | |||||
| } | |||||
| memset(input_ptr_, 0, matmul_param_->row_ * matmul_param_->deep_ * sizeof(float16_t)); | |||||
| } | |||||
| thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C8NUM)); | thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C8NUM)); | ||||
| thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, C8NUM), thread_count_) * C8NUM; | thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, C8NUM), thread_count_) * C8NUM; | ||||
| @@ -74,17 +79,16 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() { | |||||
| MS_LOG(ERROR) << "Get Execute filter failed."; | MS_LOG(ERROR) << "Get Execute filter failed."; | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| bias_data_ = malloc(matmul_param_->col_8_ * sizeof(float16_t)); | |||||
| if (bias_data_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| memset(bias_data_, 0, matmul_param_->col_8_ * sizeof(float16_t)); | |||||
| if (in_tensors_.size() == 3) { | if (in_tensors_.size() == 3) { | ||||
| bias_data_ = malloc(matmul_param_->col_8_ * sizeof(float16_t)); | |||||
| if (bias_data_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| memset(bias_data_, 0, matmul_param_->col_8_ * sizeof(float16_t)); | |||||
| Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->Data()), reinterpret_cast<float16_t *>(bias_data_), | Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->Data()), reinterpret_cast<float16_t *>(bias_data_), | ||||
| conv_param_->output_channel_); | conv_param_->output_channel_); | ||||
| } else { | |||||
| bias_data_ = nullptr; | |||||
| } | } | ||||
| weight_ptr_ = reinterpret_cast<float16_t *>(malloc(matmul_param_->deep_ * matmul_param_->col_8_ * sizeof(float16_t))); | weight_ptr_ = reinterpret_cast<float16_t *>(malloc(matmul_param_->deep_ * matmul_param_->col_8_ * sizeof(float16_t))); | ||||
| @@ -102,22 +106,19 @@ int Convolution1x1FP16CPUKernel::Init() { | |||||
| if (!InferShapeDone()) { | if (!InferShapeDone()) { | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| int ret = InitWeightBias(); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "Init weight bias failed."; | |||||
| return ret; | |||||
| } | |||||
| return ReSize(); | return ReSize(); | ||||
| } | } | ||||
| void Convolution1x1FP16CPUKernel::FreeTmpBuffer() { | void Convolution1x1FP16CPUKernel::FreeTmpBuffer() { | ||||
| if (weight_ptr_ != nullptr) { | |||||
| free(weight_ptr_); | |||||
| weight_ptr_ = nullptr; | |||||
| } | |||||
| if (pack_input_ != nullptr) { | if (pack_input_ != nullptr) { | ||||
| free(pack_input_); | free(pack_input_); | ||||
| pack_input_ = nullptr; | pack_input_ = nullptr; | ||||
| } | } | ||||
| if (pre_trans_input_ && input_ptr_ != nullptr) { | |||||
| free(input_ptr_); | |||||
| input_ptr_ = nullptr; | |||||
| } | |||||
| return; | return; | ||||
| } | } | ||||
| @@ -139,11 +140,6 @@ int Convolution1x1FP16CPUKernel::ReSize() { | |||||
| MS_LOG(ERROR) << "Init conv1x1 param failed."; | MS_LOG(ERROR) << "Init conv1x1 param failed."; | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| ret = InitWeightBias(); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "Init weight bias failed."; | |||||
| return ret; | |||||
| } | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -197,6 +193,15 @@ int Convolution1x1FP16CPUKernel::Run() { | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| if (pre_trans_input_) { | |||||
| input_ptr_ = reinterpret_cast<float16_t *>( | |||||
| ctx_->allocator->Malloc(matmul_param_->row_ * matmul_param_->deep_ * sizeof(float16_t))); | |||||
| if (input_ptr_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Conv1x1 Malloc input_ptr_ error!"; | |||||
| return RET_MEMORY_FAILED; | |||||
| } | |||||
| } | |||||
| for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) { | for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) { | ||||
| Pre1x1Trans( | Pre1x1Trans( | ||||
| execute_input_ + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_, | execute_input_ + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_, | ||||
| @@ -211,6 +216,11 @@ int Convolution1x1FP16CPUKernel::Run() { | |||||
| ConvolutionBaseFP16CPUKernel::IfCastOutput(); | ConvolutionBaseFP16CPUKernel::IfCastOutput(); | ||||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | ||||
| if (pre_trans_input_ && input_ptr_ != nullptr) { | |||||
| ctx_->allocator->Free(input_ptr_); | |||||
| input_ptr_ = nullptr; | |||||
| } | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -34,13 +34,7 @@ class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel { | |||||
| : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) { | : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) { | ||||
| matmul_param_ = new MatMulParameter(); | matmul_param_ = new MatMulParameter(); | ||||
| } | } | ||||
| ~Convolution1x1FP16CPUKernel() override { | |||||
| FreeTmpBuffer(); | |||||
| if (matmul_param_ != nullptr) { | |||||
| delete matmul_param_; | |||||
| matmul_param_ = nullptr; | |||||
| } | |||||
| } | |||||
| ~Convolution1x1FP16CPUKernel() override; | |||||
| int Init() override; | int Init() override; | ||||
| int ReSize() override; | int ReSize() override; | ||||
| @@ -43,11 +43,16 @@ int ConvolutionBaseFP16CPUKernel::GetExecuteTensor() { | |||||
| int ConvolutionBaseFP16CPUKernel::GetExecuteFilter() { | int ConvolutionBaseFP16CPUKernel::GetExecuteFilter() { | ||||
| auto weight_tensor = in_tensors_.at(kWeightIndex); | auto weight_tensor = in_tensors_.at(kWeightIndex); | ||||
| auto weight_data_type = weight_tensor->data_type(); | auto weight_data_type = weight_tensor->data_type(); | ||||
| auto input_channel = weight_tensor->Channel(); | |||||
| auto output_channel = weight_tensor->Batch(); | |||||
| auto kernel_h = weight_tensor->Height(); | |||||
| auto kernel_w = weight_tensor->Width(); | |||||
| MS_ASSERT(weight_data_type == kNumberTypeFloat32 || weight_data_type == kNumberTypeFloat16); | MS_ASSERT(weight_data_type == kNumberTypeFloat32 || weight_data_type == kNumberTypeFloat16); | ||||
| if (weight_data_type == kNumberTypeFloat32) { | if (weight_data_type == kNumberTypeFloat32) { | ||||
| float *origin_weight = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->Data()); | float *origin_weight = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->Data()); | ||||
| size_t fp16_weight_size = conv_param_->input_channel_ * conv_param_->output_channel_ * conv_param_->kernel_h_ * | |||||
| conv_param_->kernel_w_ * sizeof(float16_t); | |||||
| size_t fp16_weight_size = input_channel * output_channel * kernel_h * kernel_w * sizeof(float16_t); | |||||
| fp16_weight_ = reinterpret_cast<float16_t *>(malloc(fp16_weight_size)); | fp16_weight_ = reinterpret_cast<float16_t *>(malloc(fp16_weight_size)); | ||||
| if (fp16_weight_ == nullptr) { | if (fp16_weight_ == nullptr) { | ||||
| MS_LOG(ERROR) << "malloc fp16_weight_ failed."; | MS_LOG(ERROR) << "malloc fp16_weight_ failed."; | ||||
| @@ -53,18 +53,10 @@ int DeConvolutionFp16CPUKernel::ReSize() { | |||||
| } | } | ||||
| void DeConvolutionFp16CPUKernel::FreeParam() { | void DeConvolutionFp16CPUKernel::FreeParam() { | ||||
| if (tmp_buffer_ != nullptr) { | |||||
| free(tmp_buffer_); | |||||
| tmp_buffer_ = nullptr; | |||||
| } | |||||
| if (pack_input_ != nullptr) { | if (pack_input_ != nullptr) { | ||||
| free(pack_input_); | free(pack_input_); | ||||
| pack_input_ = nullptr; | pack_input_ = nullptr; | ||||
| } | } | ||||
| if (pack_output_ != nullptr) { | |||||
| free(pack_output_); | |||||
| pack_output_ = nullptr; | |||||
| } | |||||
| return; | return; | ||||
| } | } | ||||
| @@ -107,28 +99,44 @@ int DeConvolutionFp16CPUKernel::InitParam() { | |||||
| thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(conv_param_->output_channel_, C8NUM)); | thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(conv_param_->output_channel_, C8NUM)); | ||||
| thread_stride_ = UP_DIV(UP_DIV(conv_param_->output_channel_, C8NUM), thread_count_); | thread_stride_ = UP_DIV(UP_DIV(conv_param_->output_channel_, C8NUM), thread_count_); | ||||
| pack_input_ = reinterpret_cast<float16_t *>(malloc(row16_ * matmul_param_->deep_ * sizeof(float16_t))); | |||||
| size_t size = row16_ * matmul_param_->deep_ * sizeof(float16_t); | |||||
| pack_input_ = reinterpret_cast<float16_t *>(malloc(size)); | |||||
| if (pack_input_ == nullptr) { | if (pack_input_ == nullptr) { | ||||
| MS_LOG(ERROR) << "deconv Malloc pack_input_ error!"; | MS_LOG(ERROR) << "deconv Malloc pack_input_ error!"; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| memset(pack_input_, 0, size); | |||||
| return RET_OK; | |||||
| } | |||||
| int DeConvolutionFp16CPUKernel::InitRunBuf() { | |||||
| pack_output_ = reinterpret_cast<float16_t *>( | pack_output_ = reinterpret_cast<float16_t *>( | ||||
| malloc(UP_ROUND(conv_param_->output_channel_, C8NUM) * output_plane_ * sizeof(float16_t))); | |||||
| ctx_->allocator->Malloc(UP_ROUND(conv_param_->output_channel_, C8NUM) * output_plane_ * sizeof(float16_t))); | |||||
| if (pack_output_ == nullptr) { | if (pack_output_ == nullptr) { | ||||
| MS_LOG(ERROR) << "deconv Malloc pack_output_ error!"; | MS_LOG(ERROR) << "deconv Malloc pack_output_ error!"; | ||||
| return RET_NULL_PTR; | return RET_NULL_PTR; | ||||
| } | } | ||||
| tmp_buffer_ = reinterpret_cast<float16_t *>(malloc(row16_ * col8_ * sizeof(float16_t))); | |||||
| tmp_buffer_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(row16_ * col8_ * sizeof(float16_t))); | |||||
| if (tmp_buffer_ == nullptr) { | if (tmp_buffer_ == nullptr) { | ||||
| MS_LOG(ERROR) << "deconv Malloc tmp_buffer_ error!"; | MS_LOG(ERROR) << "deconv Malloc tmp_buffer_ error!"; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| void DeConvolutionFp16CPUKernel::FreeRunBuf() { | |||||
| if (tmp_buffer_ != nullptr) { | |||||
| ctx_->allocator->Free(tmp_buffer_); | |||||
| tmp_buffer_ = nullptr; | |||||
| } | |||||
| if (pack_output_ != nullptr) { | |||||
| ctx_->allocator->Free(pack_output_); | |||||
| pack_output_ = nullptr; | |||||
| } | |||||
| return; | |||||
| } | |||||
| int DeConvFp16Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { | int DeConvFp16Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { | ||||
| auto deconv = reinterpret_cast<DeConvolutionFp16CPUKernel *>(cdata); | auto deconv = reinterpret_cast<DeConvolutionFp16CPUKernel *>(cdata); | ||||
| auto error_code = deconv->DoDeconv(task_id); | auto error_code = deconv->DoDeconv(task_id); | ||||
| @@ -171,10 +179,16 @@ int DeConvolutionFp16CPUKernel::Run() { | |||||
| } | } | ||||
| ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); | ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); | ||||
| int error_code = InitRunBuf(); | |||||
| if (error_code != RET_OK) { | |||||
| MS_LOG(ERROR) << "deconv fp32 InitRunBuf error! error_code[" << error_code << "]"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) { | for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) { | ||||
| RowMajor2Col16MajorFp16(execute_input_, pack_input_, input_plane_, conv_param_->input_channel_); | RowMajor2Col16MajorFp16(execute_input_, pack_input_, input_plane_, conv_param_->input_channel_); | ||||
| int error_code = LiteBackendParallelLaunch(DeConvFp16Run, this, thread_count_); | |||||
| error_code = LiteBackendParallelLaunch(DeConvFp16Run, this, thread_count_); | |||||
| if (error_code != RET_OK) { | if (error_code != RET_OK) { | ||||
| MS_LOG(ERROR) << "deconv fp32 run error! error_code[" << error_code << "]"; | MS_LOG(ERROR) << "deconv fp32 run error! error_code[" << error_code << "]"; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| @@ -183,6 +197,7 @@ int DeConvolutionFp16CPUKernel::Run() { | |||||
| ConvolutionBaseFP16CPUKernel::IfCastOutput(); | ConvolutionBaseFP16CPUKernel::IfCastOutput(); | ||||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | ||||
| FreeRunBuf(); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -47,6 +47,8 @@ class DeConvolutionFp16CPUKernel : public ConvolutionBaseFP16CPUKernel { | |||||
| int DoDeconv(int task_id); | int DoDeconv(int task_id); | ||||
| private: | private: | ||||
| int InitRunBuf(); | |||||
| void FreeRunBuf(); | |||||
| void FreeParam(); | void FreeParam(); | ||||
| int InitParam(); | int InitParam(); | ||||
| int InitWeightBias(); | int InitWeightBias(); | ||||
| @@ -24,6 +24,10 @@ using mindspore::lite::RET_OK; | |||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| Convolution1x1CPUKernel::~Convolution1x1CPUKernel() { | Convolution1x1CPUKernel::~Convolution1x1CPUKernel() { | ||||
| FreeTmpBuffer(); | FreeTmpBuffer(); | ||||
| if (weight_ptr_ != nullptr) { | |||||
| free(weight_ptr_); | |||||
| weight_ptr_ = nullptr; | |||||
| } | |||||
| if (matmul_param_ != nullptr) { | if (matmul_param_ != nullptr) { | ||||
| delete matmul_param_; | delete matmul_param_; | ||||
| matmul_param_ = nullptr; | matmul_param_ = nullptr; | ||||
| @@ -31,18 +35,10 @@ Convolution1x1CPUKernel::~Convolution1x1CPUKernel() { | |||||
| } | } | ||||
| void Convolution1x1CPUKernel::FreeTmpBuffer() { | void Convolution1x1CPUKernel::FreeTmpBuffer() { | ||||
| if (weight_ptr_ != nullptr) { | |||||
| free(weight_ptr_); | |||||
| weight_ptr_ = nullptr; | |||||
| } | |||||
| if (pack_input_ != nullptr) { | if (pack_input_ != nullptr) { | ||||
| free(pack_input_); | free(pack_input_); | ||||
| pack_input_ = nullptr; | pack_input_ = nullptr; | ||||
| } | } | ||||
| if (pre_trans_input_ && input_ptr_ != nullptr) { | |||||
| free(input_ptr_); | |||||
| input_ptr_ = nullptr; | |||||
| } | |||||
| return; | return; | ||||
| } | } | ||||
| @@ -51,12 +47,7 @@ int Convolution1x1CPUKernel::ReSize() { | |||||
| ConvolutionBaseCPUKernel::Init(); | ConvolutionBaseCPUKernel::Init(); | ||||
| InitConv1x1MatmulParam(); | InitConv1x1MatmulParam(); | ||||
| int error_code = InitConv1x1BiasWeight(); | |||||
| if (error_code != RET_OK) { | |||||
| MS_LOG(ERROR) << "Convolution base init failed."; | |||||
| return error_code; | |||||
| } | |||||
| error_code = InitConv1x1Param(); | |||||
| int error_code = InitConv1x1Param(); | |||||
| if (error_code != RET_OK) { | if (error_code != RET_OK) { | ||||
| MS_LOG(ERROR) << "Convolution base init failed."; | MS_LOG(ERROR) << "Convolution base init failed."; | ||||
| return error_code; | return error_code; | ||||
| @@ -76,40 +67,35 @@ void Convolution1x1CPUKernel::InitConv1x1MatmulParam() { | |||||
| } | } | ||||
| int Convolution1x1CPUKernel::InitConv1x1BiasWeight() { | int Convolution1x1CPUKernel::InitConv1x1BiasWeight() { | ||||
| auto filter_tensor = in_tensors_.at(kWeightIndex); | |||||
| auto input_channel = filter_tensor->Channel(); | |||||
| auto output_channel = filter_tensor->Batch(); | |||||
| int size = UP_ROUND(output_channel, C8NUM) * sizeof(float); | |||||
| bias_data_ = malloc(size); | |||||
| if (bias_data_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| memset(bias_data_, 0, size); | |||||
| if (in_tensors_.size() == 3) { | if (in_tensors_.size() == 3) { | ||||
| bias_data_ = malloc(matmul_param_->col_8_ * sizeof(float)); | |||||
| if (bias_data_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| memset(bias_data_, 0, matmul_param_->col_8_ * sizeof(float)); | |||||
| memcpy(bias_data_, in_tensors_[2]->Data(), conv_param_->output_channel_ * sizeof(float)); | |||||
| } else { | |||||
| bias_data_ = nullptr; | |||||
| memcpy(bias_data_, in_tensors_[kBiasIndex]->Data(), output_channel * sizeof(float)); | |||||
| } | } | ||||
| weight_ptr_ = reinterpret_cast<float *>(malloc(matmul_param_->deep_ * matmul_param_->col_8_ * sizeof(float))); | |||||
| size = input_channel * UP_ROUND(output_channel, C8NUM) * sizeof(float); | |||||
| weight_ptr_ = reinterpret_cast<float *>(malloc(size)); | |||||
| if (weight_ptr_ == nullptr) { | if (weight_ptr_ == nullptr) { | ||||
| MS_LOG(ERROR) << "Conv1x1 Malloc weight_ptr_ error!"; | MS_LOG(ERROR) << "Conv1x1 Malloc weight_ptr_ error!"; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| memset(weight_ptr_, 0, matmul_param_->deep_ * matmul_param_->col_8_ * sizeof(float)); | |||||
| RowMajor2Col8Major(reinterpret_cast<float *>(in_tensors_[1]->Data()), weight_ptr_, matmul_param_->col_, | |||||
| matmul_param_->deep_); | |||||
| memset(weight_ptr_, 0, size); | |||||
| RowMajor2Col8Major(reinterpret_cast<float *>(filter_tensor->Data()), weight_ptr_, output_channel, input_channel); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| int Convolution1x1CPUKernel::InitConv1x1Param() { | int Convolution1x1CPUKernel::InitConv1x1Param() { | ||||
| pre_trans_input_ = (conv_param_->pad_h_ != 0 || conv_param_->pad_w_ != 0 || conv_param_->stride_h_ != 1 || | pre_trans_input_ = (conv_param_->pad_h_ != 0 || conv_param_->pad_w_ != 0 || conv_param_->stride_h_ != 1 || | ||||
| conv_param_->stride_w_ != 1); | conv_param_->stride_w_ != 1); | ||||
| if (pre_trans_input_) { | |||||
| input_ptr_ = reinterpret_cast<float *>(malloc(matmul_param_->row_ * matmul_param_->deep_ * sizeof(float))); | |||||
| if (input_ptr_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Conv1x1 Malloc input_ptr_ error!"; | |||||
| return RET_MEMORY_FAILED; | |||||
| } | |||||
| memset(input_ptr_, 0, matmul_param_->row_ * matmul_param_->deep_ * sizeof(float)); | |||||
| } | |||||
| thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C8NUM)); | thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C8NUM)); | ||||
| thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, C8NUM), thread_count_) * C8NUM; | thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, C8NUM), thread_count_) * C8NUM; | ||||
| @@ -140,6 +126,12 @@ int Convolution1x1CPUKernel::Init() { | |||||
| if (!InferShapeDone()) { | if (!InferShapeDone()) { | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| int error_code = InitConv1x1BiasWeight(); | |||||
| if (error_code != RET_OK) { | |||||
| MS_LOG(ERROR) << "Convolution base init failed."; | |||||
| return error_code; | |||||
| } | |||||
| return ReSize(); | return ReSize(); | ||||
| } | } | ||||
| @@ -177,6 +169,15 @@ int Convolution1x1CPUKernel::Run() { | |||||
| auto src_in = reinterpret_cast<float *>(in_tensors_[0]->Data()); | auto src_in = reinterpret_cast<float *>(in_tensors_[0]->Data()); | ||||
| auto src_out = reinterpret_cast<float *>(out_tensors_[0]->Data()); | auto src_out = reinterpret_cast<float *>(out_tensors_[0]->Data()); | ||||
| if (pre_trans_input_) { | |||||
| input_ptr_ = | |||||
| reinterpret_cast<float *>(ctx_->allocator->Malloc(matmul_param_->row_ * matmul_param_->deep_ * sizeof(float))); | |||||
| if (input_ptr_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Conv1x1 Malloc input_ptr_ error!"; | |||||
| return RET_MEMORY_FAILED; | |||||
| } | |||||
| } | |||||
| for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) { | for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) { | ||||
| Pre1x1Trans(src_in + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_, | Pre1x1Trans(src_in + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_, | ||||
| src_out + batch_index * matmul_param_->row_ * matmul_param_->col_); | src_out + batch_index * matmul_param_->row_ * matmul_param_->col_); | ||||
| @@ -187,6 +188,11 @@ int Convolution1x1CPUKernel::Run() { | |||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| } | } | ||||
| if (pre_trans_input_) { | |||||
| ctx_->allocator->Free(input_ptr_); | |||||
| input_ptr_ = nullptr; | |||||
| } | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -38,18 +38,10 @@ void DeConvolutionCPUKernel::FreeTmpBuffer() { | |||||
| free(weight_ptr_); | free(weight_ptr_); | ||||
| weight_ptr_ = nullptr; | weight_ptr_ = nullptr; | ||||
| } | } | ||||
| if (tmp_buffer_ != nullptr) { | |||||
| free(tmp_buffer_); | |||||
| tmp_buffer_ = nullptr; | |||||
| } | |||||
| if (pack_input_ != nullptr) { | if (pack_input_ != nullptr) { | ||||
| free(pack_input_); | free(pack_input_); | ||||
| pack_input_ = nullptr; | pack_input_ = nullptr; | ||||
| } | } | ||||
| if (pack_output_ != nullptr) { | |||||
| free(pack_output_); | |||||
| pack_output_ = nullptr; | |||||
| } | |||||
| return; | return; | ||||
| } | } | ||||
| @@ -114,19 +106,6 @@ int DeConvolutionCPUKernel::InitParam() { | |||||
| MS_LOG(ERROR) << "deconv Malloc pack_input_ error!"; | MS_LOG(ERROR) << "deconv Malloc pack_input_ error!"; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| pack_output_ = | |||||
| reinterpret_cast<float *>(malloc(UP_ROUND(conv_param_->output_channel_, C8NUM) * output_plane_ * sizeof(float))); | |||||
| if (pack_output_ == nullptr) { | |||||
| MS_LOG(ERROR) << "deconv Malloc pack_output_ error!"; | |||||
| return RET_NULL_PTR; | |||||
| } | |||||
| tmp_buffer_ = reinterpret_cast<float *>(malloc(matmul_param_->row_8_ * matmul_param_->col_8_ * sizeof(float))); | |||||
| if (tmp_buffer_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Conv1x1 Malloc tmp_buffer_ error!"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -165,6 +144,35 @@ int DeConvolutionCPUKernel::Init() { | |||||
| return ReSize(); | return ReSize(); | ||||
| } | } | ||||
| void DeConvolutionCPUKernel::FreeRunBuf() { | |||||
| if (pack_output_ != nullptr) { | |||||
| ctx_->allocator->Free(pack_output_); | |||||
| pack_output_ = nullptr; | |||||
| } | |||||
| if (tmp_buffer_ != nullptr) { | |||||
| ctx_->allocator->Free(tmp_buffer_); | |||||
| tmp_buffer_ = nullptr; | |||||
| } | |||||
| return; | |||||
| } | |||||
| int DeConvolutionCPUKernel::InitRunBuf() { | |||||
| pack_output_ = reinterpret_cast<float *>( | |||||
| ctx_->allocator->Malloc(UP_ROUND(conv_param_->output_channel_, C8NUM) * output_plane_ * sizeof(float))); | |||||
| if (pack_output_ == nullptr) { | |||||
| MS_LOG(ERROR) << "deconv Malloc pack_output_ error!"; | |||||
| return RET_NULL_PTR; | |||||
| } | |||||
| tmp_buffer_ = | |||||
| reinterpret_cast<float *>(ctx_->allocator->Malloc(matmul_param_->row_8_ * matmul_param_->col_8_ * sizeof(float))); | |||||
| if (tmp_buffer_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Conv1x1 Malloc tmp_buffer_ error!"; | |||||
| return RET_NULL_PTR; | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int DeConvolutionCPUKernel::Run() { | int DeConvolutionCPUKernel::Run() { | ||||
| auto prepare_ret = Prepare(); | auto prepare_ret = Prepare(); | ||||
| if (prepare_ret != RET_OK) { | if (prepare_ret != RET_OK) { | ||||
| @@ -174,18 +182,26 @@ int DeConvolutionCPUKernel::Run() { | |||||
| float *src_in = reinterpret_cast<float *>(in_tensors_[0]->Data()); | float *src_in = reinterpret_cast<float *>(in_tensors_[0]->Data()); | ||||
| float *src_out = reinterpret_cast<float *>(out_tensors_[0]->Data()); | float *src_out = reinterpret_cast<float *>(out_tensors_[0]->Data()); | ||||
| int error_code = InitRunBuf(); | |||||
| if (error_code != RET_OK) { | |||||
| MS_LOG(ERROR) << "deconv fp32 InitRunBuf error! error_code[" << error_code << "]"; | |||||
| return error_code; | |||||
| } | |||||
| for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) { | for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) { | ||||
| input_ptr_ = src_in + batch_index * input_plane_ * conv_param_->input_channel_; | input_ptr_ = src_in + batch_index * input_plane_ * conv_param_->input_channel_; | ||||
| output_ptr_ = src_out + batch_index * output_plane_ * conv_param_->output_channel_; | output_ptr_ = src_out + batch_index * output_plane_ * conv_param_->output_channel_; | ||||
| RowMajor2Col8Major(input_ptr_, pack_input_, input_plane_, conv_param_->input_channel_); | RowMajor2Col8Major(input_ptr_, pack_input_, input_plane_, conv_param_->input_channel_); | ||||
| int error_code = LiteBackendParallelLaunch(DeConvFp32Run, this, thread_count_); | |||||
| error_code = LiteBackendParallelLaunch(DeConvFp32Run, this, thread_count_); | |||||
| if (error_code != RET_OK) { | if (error_code != RET_OK) { | ||||
| MS_LOG(ERROR) << "deconv fp32 run error! error_code[" << error_code << "]"; | MS_LOG(ERROR) << "deconv fp32 run error! error_code[" << error_code << "]"; | ||||
| return RET_ERROR; | |||||
| return error_code; | |||||
| } | } | ||||
| } | } | ||||
| FreeRunBuf(); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -45,6 +45,8 @@ class DeConvolutionCPUKernel : public ConvolutionBaseCPUKernel { | |||||
| int DoDeconv(int task_id); | int DoDeconv(int task_id); | ||||
| private: | private: | ||||
| int InitRunBuf(); | |||||
| void FreeRunBuf(); | |||||
| int InitParam(); | int InitParam(); | ||||
| int InitWeightBias(); | int InitWeightBias(); | ||||
| void FreeTmpBuffer(); | void FreeTmpBuffer(); | ||||
| @@ -37,21 +37,13 @@ void DeConvInt8CPUKernel::FreeTmpBuffer() { | |||||
| free(weight_ptr_); | free(weight_ptr_); | ||||
| weight_ptr_ = nullptr; | weight_ptr_ = nullptr; | ||||
| } | } | ||||
| if (tmp_buffer_ != nullptr) { | |||||
| free(tmp_buffer_); | |||||
| tmp_buffer_ = nullptr; | |||||
| } | |||||
| if (input_ptr_ != nullptr) { | if (input_ptr_ != nullptr) { | ||||
| free(input_ptr_); | free(input_ptr_); | ||||
| input_ptr_ = nullptr; | input_ptr_ = nullptr; | ||||
| } | } | ||||
| if (tmp_output_ != nullptr) { | |||||
| free(tmp_output_); | |||||
| tmp_output_ = nullptr; | |||||
| } | |||||
| if (input_sum_ != nullptr) { | |||||
| free(input_sum_); | |||||
| input_sum_ = nullptr; | |||||
| if (weight_sum_ != nullptr) { | |||||
| free(weight_sum_); | |||||
| weight_sum_ = nullptr; | |||||
| } | } | ||||
| return; | return; | ||||
| } | } | ||||
| @@ -176,21 +168,24 @@ int DeConvInt8CPUKernel::InitData() { | |||||
| } | } | ||||
| memset(input_ptr_, static_cast<int8_t>(conv_param_->conv_quant_arg_.input_quant_args_[0].zp_), size * sizeof(int8_t)); | memset(input_ptr_, static_cast<int8_t>(conv_param_->conv_quant_arg_.input_quant_args_[0].zp_), size * sizeof(int8_t)); | ||||
| size = UP_ROUND(conv_param_->input_h_ * conv_param_->input_w_, C4NUM) * | |||||
| UP_ROUND(conv_param_->output_channel_, C4NUM) * conv_param_->kernel_w_ * conv_param_->kernel_h_; | |||||
| tmp_buffer_ = reinterpret_cast<int32_t *>(malloc(size * sizeof(int32_t))); | |||||
| return RET_OK; | |||||
| } | |||||
| int DeConvInt8CPUKernel::InitRunBuf() { | |||||
| int size = UP_ROUND(conv_param_->input_h_ * conv_param_->input_w_, C4NUM) * | |||||
| UP_ROUND(conv_param_->output_channel_, C4NUM) * conv_param_->kernel_w_ * conv_param_->kernel_h_; | |||||
| tmp_buffer_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(size * sizeof(int32_t))); | |||||
| if (tmp_buffer_ == nullptr) { | if (tmp_buffer_ == nullptr) { | ||||
| return RET_MEMORY_FAILED; | return RET_MEMORY_FAILED; | ||||
| } | } | ||||
| size = UP_ROUND(conv_param_->output_channel_, C4NUM) * conv_param_->output_h_ * conv_param_->output_w_; | size = UP_ROUND(conv_param_->output_channel_, C4NUM) * conv_param_->output_h_ * conv_param_->output_w_; | ||||
| tmp_output_ = reinterpret_cast<int32_t *>(malloc(size * sizeof(int32_t))); | |||||
| tmp_output_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(size * sizeof(int32_t))); | |||||
| if (tmp_output_ == nullptr) { | if (tmp_output_ == nullptr) { | ||||
| return RET_MEMORY_FAILED; | return RET_MEMORY_FAILED; | ||||
| } | } | ||||
| size = UP_ROUND(matmul_param_->row_, C4NUM); | size = UP_ROUND(matmul_param_->row_, C4NUM); | ||||
| input_sum_ = reinterpret_cast<int32_t *>(malloc(size * sizeof(int32_t))); | |||||
| input_sum_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(size * sizeof(int32_t))); | |||||
| if (input_sum_ == nullptr) { | if (input_sum_ == nullptr) { | ||||
| return RET_MEMORY_FAILED; | return RET_MEMORY_FAILED; | ||||
| } | } | ||||
| @@ -198,6 +193,22 @@ int DeConvInt8CPUKernel::InitData() { | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| void DeConvInt8CPUKernel::FreeRunBuf() { | |||||
| if (tmp_buffer_ != nullptr) { | |||||
| ctx_->allocator->Free(tmp_buffer_); | |||||
| tmp_buffer_ = nullptr; | |||||
| } | |||||
| if (tmp_output_ != nullptr) { | |||||
| ctx_->allocator->Free(tmp_output_); | |||||
| tmp_output_ = nullptr; | |||||
| } | |||||
| if (input_sum_ != nullptr) { | |||||
| ctx_->allocator->Free(input_sum_); | |||||
| input_sum_ = nullptr; | |||||
| } | |||||
| return; | |||||
| } | |||||
| int DeConvInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { | int DeConvInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { | ||||
| auto deconv = reinterpret_cast<DeConvInt8CPUKernel *>(cdata); | auto deconv = reinterpret_cast<DeConvInt8CPUKernel *>(cdata); | ||||
| auto error_code = deconv->DoDeconv(task_id); | auto error_code = deconv->DoDeconv(task_id); | ||||
| @@ -240,6 +251,12 @@ int DeConvInt8CPUKernel::Run() { | |||||
| int8_t *src_in = reinterpret_cast<int8_t *>(in_tensors_[0]->Data()); | int8_t *src_in = reinterpret_cast<int8_t *>(in_tensors_[0]->Data()); | ||||
| int8_t *src_out = reinterpret_cast<int8_t *>(out_tensors_[0]->Data()); | int8_t *src_out = reinterpret_cast<int8_t *>(out_tensors_[0]->Data()); | ||||
| int error_code = InitRunBuf(); | |||||
| if (error_code != RET_OK) { | |||||
| MS_LOG(ERROR) << "deconv int8 InitRunBuf error! error_code[" << error_code << "]"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) { | for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) { | ||||
| input_trans_func_(src_in + batch_index * matmul_param_->row_ * conv_param_->input_channel_, input_ptr_, | input_trans_func_(src_in + batch_index * matmul_param_->row_ * conv_param_->input_channel_, input_ptr_, | ||||
| matmul_param_->row_, matmul_param_->deep_); | matmul_param_->row_, matmul_param_->deep_); | ||||
| @@ -248,13 +265,14 @@ int DeConvInt8CPUKernel::Run() { | |||||
| DeConvPackInputSum(input_ptr_, input_sum_, conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_, | DeConvPackInputSum(input_ptr_, input_sum_, conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_, | ||||
| UP_ROUND(matmul_param_->row_, C4NUM), UP_ROUND(matmul_param_->deep_, C16NUM), support_optimize_); | UP_ROUND(matmul_param_->row_, C4NUM), UP_ROUND(matmul_param_->deep_, C16NUM), support_optimize_); | ||||
| int error_code = LiteBackendParallelLaunch(DeConvInt8Run, this, thread_count_); | |||||
| error_code = LiteBackendParallelLaunch(DeConvInt8Run, this, thread_count_); | |||||
| if (error_code != RET_OK) { | if (error_code != RET_OK) { | ||||
| MS_LOG(ERROR) << "deconv int8 run error! error_code[" << error_code << "]"; | MS_LOG(ERROR) << "deconv int8 run error! error_code[" << error_code << "]"; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| } | } | ||||
| FreeRunBuf(); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -51,6 +51,8 @@ class DeConvInt8CPUKernel : public ConvolutionBaseCPUKernel { | |||||
| int InitParam(); | int InitParam(); | ||||
| int InitBiasWeight(); | int InitBiasWeight(); | ||||
| void CheckSupportOptimize(); | void CheckSupportOptimize(); | ||||
| int InitRunBuf(); | |||||
| void FreeRunBuf(); | |||||
| private: | private: | ||||
| int32_t *tmp_buffer_ = nullptr; /* record matmul result */ | int32_t *tmp_buffer_ = nullptr; /* record matmul result */ | ||||