| @@ -20,8 +20,12 @@ | |||
| namespace mindspore::kernel { | |||
| // Creates a Matrix, attaches a freshly malloc'ed buffer of m * k floats and records (m, k). | |||
| // Returns nullptr after logging an error when the buffer cannot be allocated; on success the | |||
| // returned Matrix is handed to the caller (the callers visible in this change `delete` it). | |||
| Matrix *TransformMatrixGenerator(int m, int k) { | |||
| auto matrix = new Matrix; | |||
| // Widen before multiplying so the byte count is not computed in int arithmetic. | |||
| auto data = malloc(static_cast<size_t>(m) * k * sizeof(float)); | |||
| if (data == nullptr) { | |||
| MS_LOG(ERROR) << "Malloc matrix data failed."; | |||
| // No buffer has been attached yet; release the Matrix so this failure path does not leak it. | |||
| delete matrix; | |||
| return nullptr; | |||
| } | |||
| matrix->SetData(data); | |||
| matrix->SetNum(m, k); | |||
| return matrix; | |||
| } | |||
| @@ -57,7 +57,7 @@ class Matrix { | |||
| int GetK() { return this->k_; } | |||
| protected: | |||
| void *data_; | |||
| void *data_ = nullptr; | |||
| std::vector<int> shape_; | |||
| std::vector<int> stride_; | |||
| int m_; | |||
| @@ -57,7 +57,7 @@ int Convolution3x3FP16CPUKernel::InitWeightBias() { | |||
| conv_param_->output_channel_ = output_channel; | |||
| int iC8 = UP_DIV(input_channel, C8NUM); | |||
| int oC8 = UP_DIV(output_channel, C8NUM); | |||
| // ===========================init weight========================== // | |||
| size_t transformed_size = iC8 * C8NUM * oC8 * C8NUM * 36 * sizeof(float16_t); | |||
| transformed_filter_addr_ = reinterpret_cast<float16_t *>(malloc(transformed_size)); | |||
| if (transformed_filter_addr_ == nullptr) { | |||
| @@ -72,7 +72,6 @@ int Convolution3x3FP16CPUKernel::InitWeightBias() { | |||
| } | |||
| ProcessFilterFp16(execute_weight_, transformed_filter_addr_, conv_param_); | |||
| // =============================init bias========================= // | |||
| size_t new_bias_size = oC8 * C8NUM * sizeof(float16_t); | |||
| bias_data_ = malloc(new_bias_size); | |||
| if (bias_data_ == nullptr) { | |||
| @@ -97,7 +96,7 @@ int Convolution3x3FP16CPUKernel::InitTmpBuffer() { | |||
| const int k_plane = 36; | |||
| int oC8 = UP_DIV(conv_param_->output_channel_, C8NUM); | |||
| MS_ASSERT(ctx_->allocator != nullptr); | |||
| /*=============================block_unit_buffer_============================*/ | |||
| size_t block_unit_buffer_size = thread_count_ * k_plane * C8NUM * sizeof(float16_t); | |||
| block_unit_buffer_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(block_unit_buffer_size)); | |||
| if (block_unit_buffer_ == nullptr) { | |||
| @@ -105,7 +104,6 @@ int Convolution3x3FP16CPUKernel::InitTmpBuffer() { | |||
| return RET_ERROR; | |||
| } | |||
| /*=============================tmp_dst_buffer_============================*/ | |||
| size_t tmp_dst_buffer_size = thread_count_ * tile_num * k_plane * oC8 * C8NUM * sizeof(float16_t); | |||
| tmp_dst_buffer_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(tmp_dst_buffer_size)); | |||
| if (tmp_dst_buffer_ == nullptr) { | |||
| @@ -113,7 +111,6 @@ int Convolution3x3FP16CPUKernel::InitTmpBuffer() { | |||
| return RET_ERROR; | |||
| } | |||
| /*=============================tmp_out_============================*/ | |||
| int new_out_plane = UP_DIV(conv_param_->output_h_, C4NUM) * UP_DIV(conv_param_->output_w_, C4NUM) * C4NUM * C4NUM; | |||
| size_t tmp_out_size = oC8 * C8NUM * conv_param_->output_batch_ * new_out_plane * sizeof(float16_t); | |||
| tmp_out_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(tmp_out_size)); | |||
| @@ -155,7 +152,6 @@ int Convolution3x3FP16CPUKernel::ReSize() { | |||
| return ret; | |||
| } | |||
| FreeTmpBuffer(); | |||
| if (tile_buffer_ != nullptr) { | |||
| free(tile_buffer_); | |||
| tile_buffer_ = nullptr; | |||
| @@ -174,7 +170,6 @@ int Convolution3x3FP16CPUKernel::ReSize() { | |||
| const int k_plane = 36; | |||
| int iC8 = UP_DIV(conv_param_->input_channel_, C8NUM); | |||
| /*=============================nhwc4_input_============================*/ | |||
| size_t nhwc8_input_size = | |||
| iC8 * C8NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t); | |||
| nhwc4_input_ = malloc(nhwc8_input_size); | |||
| @@ -184,7 +179,6 @@ int Convolution3x3FP16CPUKernel::ReSize() { | |||
| } | |||
| memset(nhwc4_input_, 0, nhwc8_input_size); | |||
| /*=============================tile_buffer_============================*/ | |||
| size_t tile_buffer_size = thread_count_ * tile_num * k_plane * iC8 * C8NUM * sizeof(float16_t); | |||
| tile_buffer_ = reinterpret_cast<float16_t *>(malloc(tile_buffer_size)); | |||
| if (tile_buffer_ == nullptr) { | |||
| @@ -96,7 +96,6 @@ int ConvolutionFP16CPUKernel::InitTmpBuffer() { | |||
| int unit_size = kernel_plane * channel_block * C4NUM; | |||
| int packed_input_size = output_tile_count * cal_num * unit_size; | |||
| /*=============================packed_input_============================*/ | |||
| packed_input_ = reinterpret_cast<float16_t *>(malloc(in_batch * packed_input_size * sizeof(float16_t))); | |||
| if (packed_input_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc packed_input_ failed."; | |||
| @@ -104,7 +103,6 @@ int ConvolutionFP16CPUKernel::InitTmpBuffer() { | |||
| } | |||
| memset(packed_input_, 0, in_batch * packed_input_size * sizeof(float16_t)); | |||
| /*=============================nhwc4_input_============================*/ | |||
| size_t nhwc4_input_size = channel_block * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * | |||
| conv_param_->input_w_ * sizeof(float16_t); | |||
| nhwc4_input_ = malloc(nhwc4_input_size); | |||
| @@ -114,7 +112,6 @@ int ConvolutionFP16CPUKernel::InitTmpBuffer() { | |||
| } | |||
| memset(nhwc4_input_, 0, nhwc4_input_size); | |||
| /*=============================tmp_output_block_============================*/ | |||
| tmp_output_block_ = reinterpret_cast<float16_t *>(malloc(cal_num * out_channel * sizeof(float16_t))); | |||
| if (tmp_output_block_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc tmp_output_block_ failed."; | |||
| @@ -71,7 +71,6 @@ int ConvolutionSWFP16CPUKernel::InitWeightBias() { | |||
| int kernel_plane = kernel_h * kernel_w; | |||
| int pack_weight_size = oc4 * ic4 * C4NUM * C4NUM * kernel_plane; | |||
| // ========================init weight==================== // | |||
| packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t))); | |||
| if (packed_weight_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc packed_weight_ failed."; | |||
| @@ -84,7 +83,6 @@ int ConvolutionSWFP16CPUKernel::InitWeightBias() { | |||
| return ret; | |||
| } | |||
| // =======================init bias====================== // | |||
| bias_data_ = malloc(oc4 * C4NUM * sizeof(float16_t)); | |||
| if (bias_data_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc bias_data_ failed."; | |||
| @@ -107,7 +105,6 @@ int ConvolutionSWFP16CPUKernel::InitTmpBuffer() { | |||
| int out_channel = conv_param_->output_channel_; | |||
| int oc4 = UP_DIV(out_channel, C4NUM); | |||
| /*=============================tmp_output_block_============================*/ | |||
| tmp_output_block_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc( | |||
| conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * oc4 * C4NUM * sizeof(float16_t))); | |||
| if (tmp_output_block_ == nullptr) { | |||
| @@ -148,11 +145,14 @@ int ConvolutionSWFP16CPUKernel::ReSize() { | |||
| return ret; | |||
| } | |||
| FreeTmpBuffer(); | |||
| if (nhwc4_input_ != nullptr) { | |||
| free(nhwc4_input_); | |||
| nhwc4_input_ = nullptr; | |||
| } | |||
| if (slidingWindow_param_ != nullptr) { | |||
| delete slidingWindow_param_; | |||
| slidingWindow_param_ = nullptr; | |||
| } | |||
| ret = ConvolutionBaseCPUKernel::Init(); | |||
| if (ret != RET_OK) { | |||
| @@ -160,10 +160,9 @@ int ConvolutionSWFP16CPUKernel::ReSize() { | |||
| return ret; | |||
| } | |||
| /*=============================nhwc4_input_============================*/ | |||
| int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM); | |||
| size_t nhwc4_input_size = ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * | |||
| conv_param_->input_w_ * sizeof(float16_t); | |||
| size_t nhwc4_input_size = | |||
| ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t); | |||
| nhwc4_input_ = malloc(nhwc4_input_size); | |||
| if (nhwc4_input_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc nhwc4_input_ failed."; | |||
| @@ -37,6 +37,10 @@ class ConvolutionSWFP16CPUKernel : public ConvolutionBaseFP16CPUKernel { | |||
| free(packed_weight_); | |||
| packed_weight_ = nullptr; | |||
| } | |||
| if (slidingWindow_param_ != nullptr) { | |||
| delete slidingWindow_param_; | |||
| slidingWindow_param_ = nullptr; | |||
| } | |||
| } | |||
| int Init() override; | |||
| @@ -54,10 +58,6 @@ class ConvolutionSWFP16CPUKernel : public ConvolutionBaseFP16CPUKernel { | |||
| ctx_->allocator->Free(tmp_output_block_); | |||
| tmp_output_block_ = nullptr; | |||
| } | |||
| if (slidingWindow_param_ != nullptr) { | |||
| delete slidingWindow_param_; | |||
| slidingWindow_param_ = nullptr; | |||
| } | |||
| } | |||
| float16_t *packed_weight_ = nullptr; | |||
| float16_t *tmp_output_block_ = nullptr; | |||
| @@ -35,8 +35,8 @@ using mindspore::lite::RET_OK; | |||
| using mindspore::schema::PrimitiveType_Conv2D; | |||
| namespace mindspore::kernel { | |||
| void WinogradFilterTransformFp16(const float16_t *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit, | |||
| ConvParameter *conv_param, int oc_block) { | |||
| int WinogradFilterTransformFp16(const float16_t *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit, | |||
| ConvParameter *conv_param, int oc_block) { | |||
| // original weight format : ohwi | |||
| auto channel_in = conv_param->input_channel_; | |||
| auto channel_out = conv_param->output_channel_; | |||
| @@ -44,7 +44,18 @@ void WinogradFilterTransformFp16(const float16_t *weight_data, Matrix *trans_wei | |||
| // generate matrix_G && matrix_GT | |||
| auto matrix_g = TransformMatrixGenerator(input_unit, kernel_unit); | |||
| if (matrix_g == nullptr) { | |||
| MS_LOG(ERROR) << "matrix_g is null."; | |||
| delete matrix_g; | |||
| return RET_ERROR; | |||
| } | |||
| auto matrix_gt = TransformMatrixGenerator(kernel_unit, input_unit); | |||
| if (matrix_gt == nullptr) { | |||
| MS_LOG(ERROR) << "matrix_gt is null."; | |||
| delete matrix_g; | |||
| delete matrix_gt; | |||
| return RET_ERROR; | |||
| } | |||
| ChooseMatrixG(matrix_g, matrix_gt); | |||
| auto matrix_g_data = reinterpret_cast<float *>(matrix_g->GetData()); | |||
| auto matrix_gt_data = reinterpret_cast<float *>(matrix_gt->GetData()); | |||
| @@ -72,7 +83,7 @@ void WinogradFilterTransformFp16(const float16_t *weight_data, Matrix *trans_wei | |||
| free(matrix_gt_data_fp16); | |||
| delete matrix_g; | |||
| delete matrix_gt; | |||
| return; | |||
| return RET_ERROR; | |||
| } | |||
| for (int i = 0; i < channel_out; i++) { | |||
| int out_c_block = i / oc_block; | |||
| @@ -107,6 +118,7 @@ void WinogradFilterTransformFp16(const float16_t *weight_data, Matrix *trans_wei | |||
| free(matrix_gt_data_fp16); | |||
| delete matrix_g; | |||
| delete matrix_gt; | |||
| return RET_OK; | |||
| } | |||
| int ConvolutionWinogradFP16CPUKernel::InitWeightBias() { | |||
| @@ -132,7 +144,12 @@ int ConvolutionWinogradFP16CPUKernel::InitWeightBias() { | |||
| MS_LOG(ERROR) << "Get Execute filter failed."; | |||
| return ret; | |||
| } | |||
| WinogradFilterTransformFp16(execute_weight_, trans_weight_, kernel_unit_, input_unit_, conv_param_, oc_block); | |||
| ret = WinogradFilterTransformFp16(execute_weight_, trans_weight_, kernel_unit_, input_unit_, conv_param_, oc_block); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "winograd filter transfrom failed."; | |||
| return ret; | |||
| } | |||
| // init bias | |||
| bias_data_ = malloc(oc_block_num * oc_block * sizeof(float16_t)); | |||
| @@ -203,7 +220,6 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() { | |||
| int output_w = conv_param_->output_w_; | |||
| int oc8 = UP_DIV(channel_out, C8NUM); | |||
| /*=============================gemm_out_============================*/ | |||
| gemm_out_ = reinterpret_cast<float16_t *>( | |||
| ctx_->allocator->Malloc(thread_count_ * cal_num * input_unit_ * input_unit_ * oc8 * C8NUM * sizeof(float16_t))); | |||
| if (gemm_out_ == nullptr) { | |||
| @@ -211,7 +227,6 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() { | |||
| return RET_ERROR; | |||
| } | |||
| /*=============================tmp_out_data_============================*/ | |||
| int out_w_block = UP_DIV(output_w, output_unit_); | |||
| int out_h_block = UP_DIV(output_h, output_unit_); | |||
| tmp_out_data_ = reinterpret_cast<float16_t *>( | |||
| @@ -222,7 +237,6 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() { | |||
| return RET_ERROR; | |||
| } | |||
| /*=============================tmp_data_============================*/ | |||
| tmp_data_ = reinterpret_cast<float16_t *>( | |||
| ctx_->allocator->Malloc(thread_count_ * C8NUM * input_unit_ * input_unit_ * sizeof(float16_t))); | |||
| if (tmp_data_ == nullptr) { | |||
| @@ -279,7 +293,6 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() { | |||
| return ret; | |||
| } | |||
| FreeTmpBuffer(); | |||
| if (nhwc4_input_ != nullptr) { | |||
| free(nhwc4_input_); | |||
| nhwc4_input_ = nullptr; | |||
| @@ -302,7 +315,7 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() { | |||
| int cal_num = 16; | |||
| int channel_in = conv_param_->input_channel_; | |||
| int ic8 = UP_DIV(channel_in, C8NUM); | |||
| /*=============================nhwc4_input_============================*/ | |||
| size_t nhwc8_input_size = | |||
| ic8 * C8NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t); | |||
| nhwc4_input_ = malloc(nhwc8_input_size); | |||
| @@ -312,7 +325,6 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() { | |||
| } | |||
| memset(nhwc4_input_, 0, nhwc8_input_size); | |||
| /*=============================trans_input_============================*/ | |||
| size_t tile_buffer_size = thread_count_ * cal_num * input_unit_ * input_unit_ * ic8 * C8NUM * sizeof(float16_t); | |||
| trans_input_ = reinterpret_cast<float16_t *>(malloc(tile_buffer_size)); | |||
| if (trans_input_ == nullptr) { | |||
| @@ -84,7 +84,7 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel { | |||
| OutputTransformUnitFp16Func output_trans_func_; | |||
| TmpBufferAddressFp16 tmp_buffer_address_list_[4]; | |||
| }; | |||
| void WinogradFilterTransformFp16(const float16_t *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit, | |||
| int WinogradFilterTransformFp16(const float16_t *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit, | |||
| ConvParameter *conv_param, int oc_block); | |||
| } // namespace mindspore::kernel | |||
| @@ -54,7 +54,6 @@ int ConvolutionCPUKernel::InitWeightBias() { | |||
| // #endif | |||
| int pack_weight_size = oc_block_num * oc_block * ic4 * C4NUM * kernel_plane; | |||
| // =====================init weight==========================// | |||
| auto origin_weight = reinterpret_cast<float *>(filter_tensor->Data()); | |||
| packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float))); | |||
| if (packed_weight_ == nullptr) { | |||
| @@ -64,7 +63,6 @@ int ConvolutionCPUKernel::InitWeightBias() { | |||
| memset(packed_weight_, 0, pack_weight_size * sizeof(float)); | |||
| PackWeightFp32(origin_weight, conv_param_, packed_weight_, oc_block, oc_block_num); | |||
| // =======================init bias==========================// | |||
| bias_data_ = reinterpret_cast<float *>(malloc(oc_block_num * oc_block * sizeof(float))); | |||
| if (bias_data_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc bias failed."; | |||
| @@ -84,7 +82,6 @@ int ConvolutionCPUKernel::InitTmpBuffer() { | |||
| int out_channel = conv_param_->output_channel_; | |||
| MS_ASSERT(ctx_->allocator != nullptr); | |||
| /*=============================tmp_output_block_============================*/ | |||
| tmp_output_block_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(TILE_NUM * out_channel * sizeof(float))); | |||
| if (tmp_output_block_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc tmp output block failed."; | |||
| @@ -125,7 +122,6 @@ int ConvolutionCPUKernel::ReSize() { | |||
| return ret; | |||
| } | |||
| FreeTmpBuffer(); | |||
| if (nhwc4_input_ != nullptr) { | |||
| free(nhwc4_input_); | |||
| nhwc4_input_ = nullptr; | |||
| @@ -140,7 +136,6 @@ int ConvolutionCPUKernel::ReSize() { | |||
| return RET_ERROR; | |||
| } | |||
| /*=============================nhwc4_input_============================*/ | |||
| int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM); | |||
| size_t nhwc4_input_size = | |||
| ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float); | |||
| @@ -151,7 +146,6 @@ int ConvolutionCPUKernel::ReSize() { | |||
| } | |||
| memset(nhwc4_input_, 0, nhwc4_input_size); | |||
| /*=============================packed_input============================*/ | |||
| int output_count = conv_param_->output_h_ * conv_param_->output_w_; | |||
| int output_tile_count = UP_DIV(output_count, TILE_NUM); | |||
| int unit_size = conv_param_->kernel_h_ * conv_param_->kernel_w_ * ic4 * C4NUM; | |||
| @@ -192,7 +186,7 @@ int ConvolutionCPUKernel::Run() { | |||
| MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; | |||
| return prepare_ret; | |||
| } | |||
| // ============Init buffer using memory pool allocator=============// | |||
| auto ret = InitTmpBuffer(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init tmp buffer failed."; | |||
| @@ -264,8 +258,7 @@ kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::tensor::Ten | |||
| kernel = | |||
| new (std::nothrow) kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit); | |||
| } else if (use_sw) { | |||
| // kernel = new (std::nothrow) kernel::ConvolutionSWCPUKernel(op_parameter, inputs, outputs, ctx, primitive); | |||
| kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive); | |||
| kernel = new (std::nothrow) kernel::ConvolutionSWCPUKernel(op_parameter, inputs, outputs, ctx, primitive); | |||
| } else { | |||
| kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive); | |||
| } | |||
| @@ -98,7 +98,6 @@ int Convolution3x3CPUKernel::InitTmpBuffer() { | |||
| const int k_plane = 16; | |||
| MS_ASSERT(ctx_->allocator != nullptr); | |||
| /*=============================block_unit_buffer_============================*/ | |||
| size_t block_unit_buffer_size = thread_count_ * k_plane * C4NUM * sizeof(float); | |||
| block_unit_buffer_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(block_unit_buffer_size)); | |||
| if (block_unit_buffer_ == nullptr) { | |||
| @@ -106,7 +105,6 @@ int Convolution3x3CPUKernel::InitTmpBuffer() { | |||
| return RET_ERROR; | |||
| } | |||
| /*=============================tmp_dst_buffer_============================*/ | |||
| size_t tmp_dst_buffer_size = thread_count_ * TILE_NUM * k_plane * oC4 * C4NUM * sizeof(float); | |||
| tmp_dst_buffer_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(tmp_dst_buffer_size)); | |||
| if (tmp_dst_buffer_ == nullptr) { | |||
| @@ -114,7 +112,6 @@ int Convolution3x3CPUKernel::InitTmpBuffer() { | |||
| return RET_ERROR; | |||
| } | |||
| /*=============================nc4hw4_out_============================*/ | |||
| size_t nc4hw4_out_size = | |||
| oC4 * C4NUM * conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * sizeof(float); | |||
| nc4hw4_out_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(nc4hw4_out_size)); | |||
| @@ -160,7 +157,6 @@ int Convolution3x3CPUKernel::ReSize() { | |||
| return ret; | |||
| } | |||
| FreeTmpBuffer(); | |||
| if (nhwc4_input_ != nullptr) { | |||
| free(nhwc4_input_); | |||
| nhwc4_input_ = nullptr; | |||
| @@ -177,7 +173,6 @@ int Convolution3x3CPUKernel::ReSize() { | |||
| } | |||
| int iC4 = UP_DIV(conv_param_->input_channel_, C4NUM); | |||
| /*=============================nhwc4_input_============================*/ | |||
| size_t nhwc4_input_size = | |||
| iC4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float); | |||
| nhwc4_input_ = malloc(nhwc4_input_size); | |||
| @@ -187,7 +182,6 @@ int Convolution3x3CPUKernel::ReSize() { | |||
| } | |||
| memset(nhwc4_input_, 0, nhwc4_input_size); | |||
| /*=============================tile_buffer_============================*/ | |||
| size_t tile_buffer_size = thread_count_ * TILE_NUM * C16NUM * iC4 * C4NUM * sizeof(float); | |||
| tile_buffer_ = reinterpret_cast<float *>(malloc(tile_buffer_size)); | |||
| if (tile_buffer_ == nullptr) { | |||
| @@ -123,7 +123,11 @@ int ConvolutionDepthwiseCPUKernel::ReSize() { | |||
| ConvolutionBaseCPUKernel::Init(); | |||
| // init sliding window param | |||
| sliding_ = new SlidingWindowParam; | |||
| sliding_ = new (std::nothrow) SlidingWindowParam; | |||
| if (sliding_ == nullptr) { | |||
| MS_LOG(ERROR) << "new sliding window param failed."; | |||
| return RET_ERROR; | |||
| } | |||
| InitSlidingParamConvDw(sliding_, conv_param_, C4NUM); | |||
| auto ret = InitWeightBias(); | |||
| @@ -43,7 +43,6 @@ int ConvolutionSWCPUKernel::InitWeightBias() { | |||
| int oc_block_num = UP_DIV(output_channel, C4NUM); | |||
| int pack_weight_size = oc_block_num * oc_block * ic4 * C4NUM * kernel_plane; | |||
| // ==================================init weight======================================// | |||
| auto origin_weight = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->Data()); | |||
| packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float))); | |||
| if (packed_weight_ == nullptr) { | |||
| @@ -61,7 +60,6 @@ int ConvolutionSWCPUKernel::InitWeightBias() { | |||
| } | |||
| } | |||
| // ====================================init bias====================================== // | |||
| bias_data_ = reinterpret_cast<float *>(malloc(oc_block_num * oc_block * sizeof(float))); | |||
| if (bias_data_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc bias failed."; | |||
| @@ -82,7 +80,6 @@ int ConvolutionSWCPUKernel::InitTmpBuffer() { | |||
| int oc4 = UP_DIV(out_channel, C4NUM); | |||
| MS_ASSERT(ctx_->allocator != nullptr); | |||
| /*=============================tmp_output_block_============================*/ | |||
| tmp_output_block_ = reinterpret_cast<float *>(ctx_->allocator->Malloc( | |||
| conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * oc4 * C4NUM * sizeof(float))); | |||
| if (tmp_output_block_ == nullptr) { | |||
| @@ -119,18 +116,21 @@ int ConvolutionSWCPUKernel::ReSize() { | |||
| return ret; | |||
| } | |||
| FreeTmpBuffer(); | |||
| if (nhwc4_input_ != nullptr) { | |||
| free(nhwc4_input_); | |||
| nhwc4_input_ = nullptr; | |||
| } | |||
| if (slidingWindow_param_ != nullptr) { | |||
| delete slidingWindow_param_; | |||
| slidingWindow_param_ = nullptr; | |||
| } | |||
| ret = ConvolutionBaseCPUKernel::Init(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "ConvolutionBase init failed."; | |||
| return RET_ERROR; | |||
| } | |||
| /*=============================nhwc4_input_============================*/ | |||
| int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM); | |||
| size_t nhwc4_input_size = | |||
| ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float); | |||
| @@ -37,6 +37,10 @@ class ConvolutionSWCPUKernel : public ConvolutionBaseCPUKernel { | |||
| free(packed_weight_); | |||
| packed_weight_ = nullptr; | |||
| } | |||
| if (slidingWindow_param_ != nullptr) { | |||
| delete slidingWindow_param_; | |||
| slidingWindow_param_ = nullptr; | |||
| } | |||
| } | |||
| int Init() override; | |||
| @@ -53,10 +57,6 @@ class ConvolutionSWCPUKernel : public ConvolutionBaseCPUKernel { | |||
| ctx_->allocator->Free(tmp_output_block_); | |||
| tmp_output_block_ = nullptr; | |||
| } | |||
| if (slidingWindow_param_ != nullptr) { | |||
| delete slidingWindow_param_; | |||
| slidingWindow_param_ = nullptr; | |||
| } | |||
| } | |||
| float *packed_weight_ = nullptr; | |||
| float *tmp_output_block_ = nullptr; | |||
| @@ -28,16 +28,27 @@ using mindspore::lite::RET_OK; | |||
| using mindspore::schema::PrimitiveType_Conv2D; | |||
| namespace mindspore::kernel { | |||
| void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit, | |||
| ConvParameter *conv_param, int oc_block) { | |||
| // =============original weight format : ohwi===============// | |||
| int WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit, | |||
| ConvParameter *conv_param, int oc_block) { | |||
| // original weight format : ohwi | |||
| auto channel_in = conv_param->input_channel_; | |||
| auto channel_out = conv_param->output_channel_; | |||
| int input_unit_square = input_unit * input_unit; | |||
| // =============generate matrix_G && matrix_GT===============// | |||
| // generate matrix_G && matrix_GT | |||
| auto matrix_g = TransformMatrixGenerator(input_unit, kernel_unit); | |||
| if (matrix_g == nullptr) { | |||
| MS_LOG(ERROR) << "matrix_g is null."; | |||
| delete matrix_g; | |||
| return RET_ERROR; | |||
| } | |||
| auto matrix_gt = TransformMatrixGenerator(kernel_unit, input_unit); | |||
| if (matrix_gt == nullptr) { | |||
| MS_LOG(ERROR) << "matrix_gt is null."; | |||
| delete matrix_g; | |||
| delete matrix_gt; | |||
| return RET_ERROR; | |||
| } | |||
| ChooseMatrixG(matrix_g, matrix_gt); | |||
| auto matrix_g_data = reinterpret_cast<float *>(matrix_g->GetData()); | |||
| auto matrix_gt_data = reinterpret_cast<float *>(matrix_gt->GetData()); | |||
| @@ -59,7 +70,7 @@ void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int | |||
| free(trans_out_data); | |||
| delete matrix_g; | |||
| delete matrix_gt; | |||
| return; | |||
| return RET_ERROR; | |||
| } | |||
| for (int i = 0; i < channel_out; i++) { | |||
| int out_c_block = i / oc_block; | |||
| @@ -92,6 +103,7 @@ void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int | |||
| free(trans_out_data); | |||
| delete matrix_g; | |||
| delete matrix_gt; | |||
| return RET_OK; | |||
| } | |||
| int ConvolutionWinogradCPUKernel::InitWeightBias() { | |||
| @@ -118,7 +130,11 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() { | |||
| return RET_ERROR; | |||
| } | |||
| auto weight_data = reinterpret_cast<float *>(filter_tensor->Data()); | |||
| WinogradFilterTransform(weight_data, trans_weight_, kernel_unit_, input_unit_, conv_param_, oc_block); | |||
| ret = WinogradFilterTransform(weight_data, trans_weight_, kernel_unit_, input_unit_, conv_param_, oc_block); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "winograd filter transfrom failed."; | |||
| return ret; | |||
| } | |||
| // init bias | |||
| size_t new_bias_size = oc4 * C4NUM * sizeof(float); | |||
| @@ -182,7 +198,6 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() { | |||
| int oc4 = UP_DIV(channel_out, C4NUM); | |||
| MS_ASSERT(ctx_->allocator != nullptr); | |||
| /*=============================gemm_out_============================*/ | |||
| gemm_out_ = reinterpret_cast<float *>( | |||
| ctx_->allocator->Malloc(thread_count_ * TILE_NUM * input_unit_ * input_unit_ * oc4 * C4NUM * sizeof(float))); | |||
| if (gemm_out_ == nullptr) { | |||
| @@ -190,7 +205,6 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() { | |||
| return RET_ERROR; | |||
| } | |||
| /*=============================tmp_out_data_============================*/ | |||
| int out_w_block = UP_DIV(output_w, output_unit_); | |||
| int out_h_block = UP_DIV(output_h, output_unit_); | |||
| tmp_out_data_ = | |||
| @@ -201,7 +215,6 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() { | |||
| return RET_ERROR; | |||
| } | |||
| /*=============================tmp_data_============================*/ | |||
| tmp_data_ = reinterpret_cast<float *>( | |||
| ctx_->allocator->Malloc(thread_count_ * C4NUM * input_unit_ * input_unit_ * sizeof(float))); | |||
| if (tmp_data_ == nullptr) { | |||
| @@ -263,7 +276,6 @@ int ConvolutionWinogradCPUKernel::ReSize() { | |||
| return ret; | |||
| } | |||
| FreeTmpBuffer(); | |||
| if (nhwc4_input_ != nullptr) { | |||
| free(nhwc4_input_); | |||
| nhwc4_input_ = nullptr; | |||
| @@ -284,7 +296,6 @@ int ConvolutionWinogradCPUKernel::ReSize() { | |||
| conv_param_->input_unit_ = input_unit_; | |||
| conv_param_->output_unit_ = output_unit_; | |||
| /*=============================nhwc4_input_============================*/ | |||
| int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM); | |||
| size_t nhwc4_input_size = | |||
| ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float); | |||
| @@ -295,7 +306,6 @@ int ConvolutionWinogradCPUKernel::ReSize() { | |||
| } | |||
| memset(nhwc4_input_, 0, nhwc4_input_size); | |||
| /*=============================trans_input_============================*/ | |||
| size_t tile_buffer_size = thread_count_ * TILE_NUM * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float); | |||
| trans_input_ = reinterpret_cast<float *>(malloc(tile_buffer_size)); | |||
| if (trans_input_ == nullptr) { | |||
| @@ -80,7 +80,7 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel { | |||
| TmpBufferAddress tmp_buffer_address_list_[5]; | |||
| GEMM_FUNC_FP32 gemm_func_ = nullptr; | |||
| }; | |||
| void WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit, | |||
| ConvParameter *conv_param, int oc_block); | |||
| int WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int kernel_unit, int input_unit, | |||
| ConvParameter *conv_param, int oc_block); | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_WINOGRAD_H_ | |||
| @@ -61,6 +61,8 @@ int SoftmaxCPUKernel::ReSize() { | |||
| for (int i = axis + 1; i < n_dim; i++) { | |||
| in_plane_size *= in_shape[i]; | |||
| } | |||
| in_plane_size_ = in_plane_size; | |||
| out_plane_size_ = out_plane_size; | |||
| if (sum_data_ != nullptr) { | |||
| free(sum_data_); | |||
| } | |||
| @@ -69,7 +71,6 @@ int SoftmaxCPUKernel::ReSize() { | |||
| MS_LOG(ERROR) << "malloc data for softmax fail!"; | |||
| return RET_ERROR; | |||
| } | |||
| memset(sum_data_, 0, out_plane_size * in_plane_size * sizeof(float)); | |||
| return RET_OK; | |||
| } | |||
| @@ -79,6 +80,7 @@ int SoftmaxCPUKernel::Run() { | |||
| MS_LOG(ERROR) << "Prepare fail!ret: " << ret; | |||
| return RET_ERROR; | |||
| } | |||
| memset(sum_data_, 0, in_plane_size_ * out_plane_size_ * sizeof(float)); | |||
| auto input_ptr = reinterpret_cast<float *>(in_tensors_.at(kInputIndex)->Data()); | |||
| auto output_ptr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data()); | |||
| Softmax(input_ptr, output_ptr, sum_data_, softmax_param_); | |||
| @@ -40,6 +40,8 @@ class SoftmaxCPUKernel : public SoftmaxBaseCPUKernel { | |||
| private: | |||
| float *sum_data_ = nullptr; | |||
| int in_plane_size_; | |||
| int out_plane_size_; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -117,7 +117,6 @@ int Convolution3x3Int8CPUKernel::InitTmpBuffer() { | |||
| int output_h = conv_param_->output_h_; | |||
| MS_ASSERT(ctx_->allocator != nullptr); | |||
| /*=============================block_unit_buffer_============================*/ | |||
| size_t block_unit_buffer_size = thread_count_ * 4 * 4 * C8NUM * sizeof(int16_t); | |||
| block_unit_buffer_ = reinterpret_cast<int16_t *>(ctx_->allocator->Malloc(block_unit_buffer_size)); | |||
| if (block_unit_buffer_ == nullptr) { | |||
| @@ -125,7 +124,6 @@ int Convolution3x3Int8CPUKernel::InitTmpBuffer() { | |||
| return RET_ERROR; | |||
| } | |||
| /*=============================tmp_dst_buffer_============================*/ | |||
| size_t tmp_dst_buffer_size = thread_count_ * TILE_NUM * 16 * oc4 * C4NUM * sizeof(int32_t); | |||
| tmp_dst_buffer_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(tmp_dst_buffer_size)); | |||
| if (tmp_dst_buffer_ == nullptr) { | |||
| @@ -133,7 +131,6 @@ int Convolution3x3Int8CPUKernel::InitTmpBuffer() { | |||
| return RET_ERROR; | |||
| } | |||
| /*=============================tmp_out_============================*/ | |||
| size_t tmp_out_size = oc4 * C4NUM * output_batch * output_w * output_h * sizeof(uint8_t); | |||
| tmp_out_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(tmp_out_size)); | |||
| if (tmp_out_ == nullptr) { | |||
| @@ -174,7 +171,6 @@ int Convolution3x3Int8CPUKernel::ReSize() { | |||
| return ret; | |||
| } | |||
| FreeTmpBuffer(); | |||
| if (input_data_ != nullptr) { | |||
| free(input_data_); | |||
| input_data_ = nullptr; | |||
| @@ -190,7 +186,6 @@ int Convolution3x3Int8CPUKernel::ReSize() { | |||
| return RET_ERROR; | |||
| } | |||
| /*=============================input_data_============================*/ | |||
| int ic8 = UP_DIV(conv_param_->input_channel_, C8NUM); | |||
| size_t c8_input_size = | |||
| conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * ic8 * C8NUM * sizeof(int16_t); | |||
| @@ -201,7 +196,6 @@ int Convolution3x3Int8CPUKernel::ReSize() { | |||
| } | |||
| memset(input_data_, 0, c8_input_size); | |||
| /*=============================tile_buffer_============================*/ | |||
| size_t tile_buffer_size = thread_count_ * TILE_NUM * C16NUM * ic8 * C8NUM * sizeof(int16_t); | |||
| tile_buffer_ = reinterpret_cast<int16_t *>(malloc(tile_buffer_size)); | |||
| if (tile_buffer_ == nullptr) { | |||
| @@ -35,22 +35,25 @@ void ConvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() { | |||
| } | |||
| if (packed_weight_ != nullptr) { | |||
| delete packed_weight_; | |||
| free(packed_weight_); | |||
| packed_weight_ = nullptr; | |||
| } | |||
| if (packed_input_ != nullptr) { | |||
| delete packed_input_; | |||
| free(packed_input_); | |||
| packed_input_ = nullptr; | |||
| } | |||
| if (need_align_) { | |||
| if (packed_output_ != nullptr) { | |||
| delete packed_output_; | |||
| free(packed_output_); | |||
| packed_output_ = nullptr; | |||
| } | |||
| } | |||
| } | |||
| ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() { FreeTmpBuffer(); } | |||
| ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() { | |||
| FreeTmpBuffer(); | |||
| FreeQuantParam(); | |||
| } | |||
| int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() { | |||
| // init weight, int8 -> int16 | |||
| @@ -118,7 +121,11 @@ int ConvolutionDepthwiseInt8CPUKernel::ReSize() { | |||
| ConvolutionBaseCPUKernel::Init(); | |||
| // init sliding window param | |||
| sliding = new SlidingWindowParam; | |||
| sliding = new (std::nothrow) SlidingWindowParam; | |||
| if (sliding == nullptr) { | |||
| MS_LOG(ERROR) << "new sliding window param."; | |||
| return RET_ERROR; | |||
| } | |||
| InitSlidingParamConvDw(sliding, conv_param_, C4NUM); | |||
| // init quant param | |||
| @@ -113,25 +113,24 @@ int ConvolutionInt8CPUKernel::InitWeightBias() { | |||
| } | |||
| free(weight_sum); | |||
| /*=============================input_sum_============================*/ | |||
| size_t input_sum_size; | |||
| if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) { | |||
| input_sum_size = conv_param_->output_channel_ * tile_num_ * thread_count_ * sizeof(int32_t); | |||
| } else { | |||
| input_sum_size = tile_num_ * thread_count_ * sizeof(int32_t); | |||
| } | |||
| input_sum_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(input_sum_size)); | |||
| input_sum_ = reinterpret_cast<int32_t *>(malloc(input_sum_size)); | |||
| if (input_sum_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc input_sum_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| memset(input_sum_, 0, tile_num_ * thread_count_ * sizeof(int32_t)); | |||
| memset(input_sum_, 0, input_sum_size); | |||
| return RET_OK; | |||
| } | |||
| int ConvolutionInt8CPUKernel::InitTmpBuffer() { | |||
| MS_ASSERT(ctx_->allocator != nullptr); | |||
| /*=============================tmp_dst_============================*/ | |||
| size_t tmp_dst_size = thread_count_ * tile_num_ * conv_param_->output_channel_ * sizeof(int32_t); | |||
| tmp_dst_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(tmp_dst_size)); | |||
| if (tmp_dst_ == nullptr) { | |||
| @@ -139,7 +138,6 @@ int ConvolutionInt8CPUKernel::InitTmpBuffer() { | |||
| return RET_ERROR; | |||
| } | |||
| /*=============================tmp_out_============================*/ | |||
| tmp_out_ = | |||
| reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(thread_count_ * tile_num_ * conv_param_->output_channel_)); | |||
| if (tmp_out_ == nullptr) { | |||
| @@ -202,7 +200,6 @@ int ConvolutionInt8CPUKernel::InitWeightBiasOpt() { | |||
| } | |||
| free(weight_sum); | |||
| /*=============================input_sum_============================*/ | |||
| size_t input_sum_size; | |||
| if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) { | |||
| input_sum_size = conv_param_->output_channel_ * tile_num_ * thread_count_ * sizeof(int32_t); | |||
| @@ -214,13 +211,13 @@ int ConvolutionInt8CPUKernel::InitWeightBiasOpt() { | |||
| MS_LOG(ERROR) << "malloc input_sum_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| memset(input_sum_, 0, tile_num_ * thread_count_ * sizeof(int32_t)); | |||
| memset(input_sum_, 0, input_sum_size); | |||
| return RET_OK; | |||
| } | |||
| int ConvolutionInt8CPUKernel::InitTmpBufferOpt() { | |||
| MS_ASSERT(ctx_->allocator != nullptr); | |||
| /*=============================tmp_dst_============================*/ | |||
| size_t tmp_dst_size = thread_count_ * tile_num_ * conv_param_->output_channel_ * sizeof(int32_t); | |||
| tmp_dst_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(tmp_dst_size)); | |||
| if (tmp_dst_ == nullptr) { | |||
| @@ -228,7 +225,6 @@ int ConvolutionInt8CPUKernel::InitTmpBufferOpt() { | |||
| return RET_ERROR; | |||
| } | |||
| /*=============================tmp_out_============================*/ | |||
| tmp_out_ = | |||
| reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(thread_count_ * tile_num_ * conv_param_->output_channel_)); | |||
| if (tmp_out_ == nullptr) { | |||
| @@ -287,7 +283,6 @@ int ConvolutionInt8CPUKernel::ReSize() { | |||
| return ret; | |||
| } | |||
| FreeTmpBuffer(); | |||
| if (nhwc4_input_ != nullptr) { | |||
| free(nhwc4_input_); | |||
| nhwc4_input_ = nullptr; | |||
| @@ -312,7 +307,6 @@ int ConvolutionInt8CPUKernel::ReSize() { | |||
| } | |||
| memset(nhwc4_input_, 0, nhwc4_input_size); | |||
| /*=============================packed_input_============================*/ | |||
| int output_count = conv_param_->output_h_ * conv_param_->output_w_; | |||
| int output_tile_count = UP_DIV(output_count, tile_num_); | |||
| int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_; | |||
| @@ -28,7 +28,10 @@ using mindspore::lite::RET_OK; | |||
| using mindspore::schema::PrimitiveType_DeDepthwiseConv2D; | |||
| namespace mindspore::kernel { | |||
| DeconvolutionDepthwiseInt8CPUKernel::~DeconvolutionDepthwiseInt8CPUKernel() { FreeTmpBuffer(); } | |||
| DeconvolutionDepthwiseInt8CPUKernel::~DeconvolutionDepthwiseInt8CPUKernel() { | |||
| FreeTmpBuffer(); | |||
| FreeQuantParam(); | |||
| } | |||
| void DeconvolutionDepthwiseInt8CPUKernel::FreeTmpBuffer() { | |||
| if (sliding != nullptr) { | |||
| @@ -49,7 +49,7 @@ int PoolingInt8CPUKernel::ReSize() { | |||
| MS_LOG(ERROR) << "PoolingBase Init failed."; | |||
| return ret; | |||
| } | |||
| SetQuantParam(); | |||
| ret = SetQuantParam(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Set pooling quant param failed."; | |||
| @@ -262,6 +262,12 @@ void ConvInt8(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight, c | |||
| int kernel_plane = kernel_h * kernel_w; | |||
| int unit_size = kernel_plane * ic4 * C4NUM; | |||
| int packed_input_size = output_tile_count * tile_n * unit_size; | |||
| int input_sum_offset; | |||
| if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) { | |||
| input_sum_offset = tile_n * out_channel; | |||
| } else { | |||
| input_sum_offset = tile_n; | |||
| } | |||
| for (int b = 0; b < in_batch; b++) { | |||
| int in_batch_offset = b * ic4 * C4NUM * in_h * in_w; | |||
| @@ -270,7 +276,7 @@ void ConvInt8(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight, c | |||
| for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) { | |||
| int start_index = thread_id * tile_n; | |||
| int real_cal_num = (output_count - start_index) < tile_n ? (output_count - start_index) : tile_n; | |||
| int32_t *tmp_input_sum = input_sum + task_id * tile_n; | |||
| int32_t *tmp_input_sum = input_sum + task_id * input_sum_offset; | |||
| int8_t *gemm_input = packed_input + thread_id * unit_size * tile_n + gemm_in_batch_offset; | |||
| // clear tmp buffer before compute | |||
| memset(gemm_input, (int8_t)input_zp, unit_size * tile_n); | |||
| @@ -317,6 +323,12 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight | |||
| int kernel_plane = kernel_h * kernel_w; | |||
| int unit_size = kernel_plane * ic4 * C4NUM; | |||
| int packed_input_size = output_tile_count * tile_n * unit_size; | |||
| int input_sum_offset; | |||
| if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) { | |||
| input_sum_offset = tile_n * out_channel; | |||
| } else { | |||
| input_sum_offset = tile_n; | |||
| } | |||
| for (int b = 0; b < in_batch; b++) { | |||
| int in_batch_offset = b * ic4 * C4NUM * in_h * in_w; | |||
| @@ -325,7 +337,7 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight | |||
| for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) { | |||
| int start_index = thread_id * tile_n; | |||
| int real_cal_num = (output_count - start_index) < tile_n ? (output_count - start_index) : tile_n; | |||
| int32_t *tmp_input_sum = input_sum + task_id * tile_n; | |||
| int32_t *tmp_input_sum = input_sum + task_id * input_sum_offset; | |||
| int8_t *gemm_input = packed_input + thread_id * unit_size * tile_n + gemm_in_batch_offset; | |||
| // clear tmp buffer before compute | |||
| memset(gemm_input, (int8_t)input_zp, unit_size * tile_n); | |||