| @@ -344,7 +344,6 @@ void ConvFp16(float16_t *input_data, float16_t *packed_input, float16_t *packed_ | |||
| int channel_block = UP_DIV(in_channel, C4NUM); | |||
| int kernel_plane = kernel_h * kernel_w; | |||
| int unit_size = kernel_plane * channel_block * C4NUM; | |||
| int packed_input_size = output_tile_count * tile_n * unit_size; | |||
| // we accumulate 4 channels per time for input blocks | |||
| int ic4 = UP_DIV(in_channel, C4NUM); | |||
| @@ -355,11 +354,10 @@ void ConvFp16(float16_t *input_data, float16_t *packed_input, float16_t *packed_ | |||
| for (int b = 0; b < in_batch; b++) { | |||
| int in_batch_offset = b * ic4 * C4NUM * in_h * in_w; | |||
| int out_batch_offset = b * out_channel * out_h * out_w; | |||
| int gemm_in_batch_offset = b * packed_input_size; | |||
| for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) { | |||
| int start_index = thread_id * tile_n; | |||
| int real_cal_num = (output_count - start_index) < tile_n ? (output_count - start_index) : tile_n; | |||
| float16_t *gemm_input = (float16_t *)(packed_input + thread_id * unit_size * tile_n + gemm_in_batch_offset); | |||
| float16_t *gemm_input = (float16_t *)(packed_input + task_id * unit_size * tile_n); | |||
| Im2ColPackUnitFp16(input_data + in_batch_offset, conv_param, gemm_input, real_cal_num, start_index); | |||
| int out_offset = thread_id * tile_n * out_channel + out_batch_offset; | |||
| @@ -55,23 +55,24 @@ void Im2ColPackUnitFp16(float16_t *input_data, ConvParameter *conv_param, float1 | |||
| int in_w = conv_param->input_w_; | |||
| int out_w = conv_param->output_w_; | |||
| int ic4 = UP_DIV(in_channel, 4); | |||
| int ic4_minus = in_channel / 4; | |||
| memset(packed_input, 0, kernel_w * kernel_h * ic4 * C4NUM * 16 * sizeof(float16_t)); | |||
| for (int i = 0; i < real_cal_num; i++) { | |||
| int block_start = block_index + i; | |||
| int input_h = block_start / out_w * stride_h - pad_h; | |||
| int input_w = block_start % out_w * stride_w - pad_w; | |||
| int input_stride = input_h * in_w * ic4 * C4NUM + input_w * ic4 * C4NUM; | |||
| int input_stride = (input_h * in_w + input_w) * in_channel; | |||
| int kh_s = MSMAX(0, UP_DIV(-input_h, dilation_h)); | |||
| int kh_e = MSMIN(kernel_h, UP_DIV(in_h - input_h, dilation_h)); | |||
| int kw_s = MSMAX(0, UP_DIV(-input_w, dilation_w)); | |||
| int kw_e = MSMIN(kernel_w, UP_DIV(in_w - input_w, dilation_w)); | |||
| for (int j = kh_s; j < kh_e; j++) { | |||
| int input_y_stride = j * dilation_h * in_w * ic4 * C4NUM + input_stride; | |||
| int input_y_stride = j * dilation_h * in_w * in_channel + input_stride; | |||
| for (int n = kw_s; n < kw_e; n++) { | |||
| int input_x_stride = input_y_stride + n * dilation_w * ic4 * C4NUM; | |||
| int input_x_stride = input_y_stride + n * dilation_w * in_channel; | |||
| int input_plane_offset = (j * kernel_w + n) * 16 * C4NUM * ic4 + i * C4NUM; | |||
| for (int m = 0; m < ic4; m++) { | |||
| for (int m = 0; m < ic4_minus; m++) { | |||
| int channel_block_stride = input_x_stride + m * C4NUM; | |||
| int channel_block_offset = input_plane_offset + m * 16 * C4NUM; | |||
| #ifdef ENABLE_ARM64 | |||
| @@ -82,9 +83,15 @@ void Im2ColPackUnitFp16(float16_t *input_data, ConvParameter *conv_param, float1 | |||
| } | |||
| #endif | |||
| } // channel_block loop | |||
| } // kernel_w loop | |||
| } // kernel_h loop | |||
| } // tile num loop | |||
| int ic_res = in_channel - ic4_minus * C4NUM; | |||
| for (int l = 0; l < ic_res; ++l) { | |||
| int channel_block_stride = input_x_stride + ic4_minus * C4NUM + l; | |||
| int channel_block_offset = input_plane_offset + ic4_minus * 16 * C4NUM + l; | |||
| packed_input[channel_block_offset] = input_data[channel_block_stride]; | |||
| } | |||
| } // kernel_w loop | |||
| } // kernel_h loop | |||
| } // tile num loop | |||
| } | |||
| void PackWeightFp16(float16_t *weight_data, ConvParameter *conv_param, float16_t *packed_weight) { | |||
| @@ -334,7 +341,8 @@ void PackNHWCToNCHWFp16(const void *src, void *dst, int batches, int plane, int | |||
| "st1 {v27.8h}, [x11], %[dstStride]\n" | |||
| "st1 {v31.8h}, [x10], %[dstStride]\n" | |||
| : | |||
| : [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride) | |||
| : | |||
| [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride) | |||
| : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", | |||
| "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", | |||
| "v30", "v31"); | |||
| @@ -78,6 +78,7 @@ void PackWeightInt8(int8_t *weight_data, ConvParameter *conv_param, int8_t *pack | |||
| int plane_c4 = UP_DIV(kernel_plane, C4NUM); | |||
| int pack_weight_size = oc4 * C4NUM * ic4 * C4NUM * plane_c4 * C4NUM; | |||
| int block_size = pack_weight_size / oc4; | |||
| QuantArg *filter_args = conv_param->conv_quant_arg_.filter_quant_args_; | |||
| for (int m = 0; m < kernel_plane; m++) { | |||
| int kernel_plane_stride = m * in_channel; | |||
| @@ -101,7 +102,13 @@ void PackWeightInt8(int8_t *weight_data, ConvParameter *conv_param, int8_t *pack | |||
| int8_t *origin_data_ptr = weight_data + kernel_block_stride + k * kernel_plane * in_channel; | |||
| int8_t *packed_data_ptr = packed_weight + packed_kernel_block_size + k * C4NUM * C4NUM; | |||
| *packed_data_ptr = origin_data_ptr[0]; | |||
| weight_sum[j * C4NUM + k] += (int32_t)packed_data_ptr[0]; | |||
| int32_t f_zp; | |||
| if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) { | |||
| f_zp = filter_args[j * C4NUM + k].zp_; | |||
| } else { | |||
| f_zp = filter_args[0].zp_; | |||
| } | |||
| weight_sum[j * C4NUM + k] += (int32_t)(packed_data_ptr[0] - f_zp); | |||
| } | |||
| } // kernel block loop | |||
| } // inchannel block loop | |||
| @@ -121,6 +128,7 @@ void PackWeightInt8Opt(int8_t *weight_data, ConvParameter *conv_param, int8_t *p | |||
| int pack_weight_size = oc4 * ic4 * C4NUM * C4NUM * kernel_plane; | |||
| int unit_size = C4NUM * C4NUM; | |||
| int block_size = pack_weight_size / oc4; | |||
| QuantArg *filter_args = conv_param->conv_quant_arg_.filter_quant_args_; | |||
| for (int m = 0; m < kernel_plane; m++) { | |||
| int kernel_plane_stride = m * in_channel; | |||
| @@ -142,7 +150,13 @@ void PackWeightInt8Opt(int8_t *weight_data, ConvParameter *conv_param, int8_t *p | |||
| int8_t *origin_data_ptr = weight_data + kernel_block_stride + k * kernel_plane * in_channel; | |||
| int8_t *packed_data_ptr = packed_weight + packed_kernel_block_size + k * C4NUM; | |||
| *packed_data_ptr = origin_data_ptr[0]; | |||
| weight_sum[j * C4NUM + k] += (int32_t)(packed_data_ptr[0]); | |||
| int32_t f_zp; | |||
| if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) { | |||
| f_zp = filter_args[j * C4NUM + k].zp_; | |||
| } else { | |||
| f_zp = filter_args[0].zp_; | |||
| } | |||
| weight_sum[j * C4NUM + k] += (int32_t)(packed_data_ptr[0] - f_zp); | |||
| } | |||
| } // kernel block loop | |||
| } // inchannel block loop | |||
| @@ -400,6 +414,9 @@ void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real | |||
| packed_input[channel_block_offset] = input_data[channel_block_stride]; | |||
| input_accumulator += (packed_input + channel_block_offset)[0]; | |||
| } | |||
| for (int l = 0; l < (C4NUM - ic_res); l++) { | |||
| input_accumulator += conv_param->conv_quant_arg_.input_quant_args_[0].zp_; | |||
| } | |||
| } // kernel_w loop | |||
| } // kernel_h loop | |||
| if (!(conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC)) { | |||
| @@ -84,53 +84,29 @@ int ConvolutionFP16CPUKernel::InitWeightBias() { | |||
| } | |||
| int ConvolutionFP16CPUKernel::InitTmpBuffer() { /* Allocates the fp16 scratch buffers packed_input_, nhwc4_input_ and tmp_output_block_ through ctx_->allocator; returns RET_OK, or RET_ERROR as soon as one Malloc yields nullptr. NOTE(review): this listing shows some statements twice (packed_input_size, packed_input_ assignment) - presumably the before and after lines of a patch; confirm which set is current. */ | |||
| int in_batch = conv_param_->input_batch_; | |||
| int in_channel = conv_param_->input_channel_; | |||
| int out_channel = conv_param_->output_channel_; | |||
| int channel_block = UP_DIV(in_channel, C4NUM); /* number of C4NUM-wide channel groups; presumably UP_DIV is ceiling division - confirm */ | |||
| int cal_num = 16; /* tile width: output positions are grouped cal_num at a time (see output_tile_count) */ | |||
| int output_count = conv_param_->output_h_ * conv_param_->output_w_; | |||
| int output_tile_count = UP_DIV(output_count, cal_num); | |||
| int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_; | |||
| int unit_size = kernel_plane * channel_block * C4NUM; /* elements per output position: kernel area times the channel count padded to whole C4NUM groups */ | |||
| int packed_input_size = output_tile_count * cal_num * unit_size; /* variant sized by the number of output tiles */ | |||
| int packed_input_size = thread_count_ * cal_num * unit_size; /* variant sized by thread_count_, i.e. one tile slot per worker */ | |||
| packed_input_ = | |||
| reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(in_batch * packed_input_size * sizeof(float16_t))); /* variant that multiplies the size by in_batch */ | |||
| packed_input_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(packed_input_size * sizeof(float16_t))); /* variant without the in_batch factor */ | |||
| if (packed_input_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc packed_input_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| size_t nhwc4_input_size = | |||
| channel_block * C4NUM * in_batch * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t); /* bytes for the whole input with channels padded to whole C4NUM groups */ | |||
| nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size); | |||
| if (nhwc4_input_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc nhwc4_input_ failed."; | |||
| return RET_ERROR; /* packed_input_ is not released in this function on this path; NOTE(review): presumably freed by the caller or FreeTmpBuffer - confirm */ | |||
| } | |||
| tmp_output_block_ = | |||
| reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(thread_count_ * cal_num * out_channel * sizeof(float16_t))); /* one cal_num x out_channel block per thread */ | |||
| if (tmp_output_block_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc tmp_output_block_ failed."; | |||
| return RET_ERROR; /* earlier allocations are likewise not released in this function on this path */ | |||
| } | |||
| return RET_OK; | |||
| } | |||
| void ConvolutionFP16CPUKernel::ConfigInputOutput() { /* Looks up the layout-conversion routine from the input tensor's format to NHWC4 and stores it in convert_func_. */ | |||
| auto input_tensor = in_tensors_.at(kInputIndex); | |||
| auto input_format = input_tensor->GetFormat(); | |||
| schema::Format execute_format = schema::Format::Format_NHWC4; /* target layout is fixed to NHWC4 */ | |||
| convert_func_ = LayoutTransformFp16(input_format, execute_format); | |||
| if (convert_func_ == nullptr) { | |||
| MS_LOG(ERROR) << "layout convert func is nullptr."; | |||
| return; /* the function returns void, so a missing converter is only logged, not reported to the caller */ | |||
| } | |||
| } | |||
| int ConvolutionFP16CPUKernel::Init() { | |||
| auto ret = InitWeightBias(); | |||
| if (ret != RET_OK) { | |||
| @@ -140,7 +116,6 @@ int ConvolutionFP16CPUKernel::Init() { | |||
| if (!InferShapeDone()) { | |||
| return RET_OK; | |||
| } | |||
| ConfigInputOutput(); | |||
| return ReSize(); | |||
| } | |||
| @@ -160,8 +135,8 @@ int ConvolutionFP16CPUKernel::ReSize() { | |||
| } | |||
| int ConvolutionFP16CPUKernel::RunImpl(int task_id) { /* Runs ConvFp16 for the work assigned to task_id using the kernel's packed buffers; always returns RET_OK. NOTE(review): two ConvFp16 calls are listed - presumably the before and after lines of a patch; confirm which one is current. */ | |||
| ConvFp16(reinterpret_cast<float16_t *>(nhwc4_input_), packed_input_, packed_weight_, /* variant reading the input from nhwc4_input_ */ | |||
| reinterpret_cast<float16_t *>(bias_data_), tmp_output_block_, execute_output_, task_id, conv_param_); | |||
| ConvFp16(execute_input_, packed_input_, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), tmp_output_block_, /* variant reading the input directly from execute_input_ */ | |||
| execute_output_, task_id, conv_param_); | |||
| return RET_OK; | |||
| } | |||
| @@ -194,12 +169,6 @@ int ConvolutionFP16CPUKernel::Run() { | |||
| return RET_ERROR; | |||
| } | |||
| int in_batch = conv_param_->input_batch_; | |||
| int in_h = conv_param_->input_h_; | |||
| int in_w = conv_param_->input_w_; | |||
| int in_channel = conv_param_->input_channel_; | |||
| convert_func_(reinterpret_cast<void *>(execute_input_), nhwc4_input_, in_batch, in_h * in_w, in_channel); | |||
| int error_code = ParallelLaunch(this->context_->thread_pool_, ConvolutionFp16Impl, this, thread_count_); | |||
| if (error_code != RET_OK) { | |||
| MS_LOG(ERROR) << "conv fp16 error error_code[" << error_code << "]"; | |||
| @@ -46,14 +46,9 @@ class ConvolutionFP16CPUKernel : public ConvolutionBaseFP16CPUKernel { | |||
| int RunImpl(int task_id); | |||
| int InitWeightBias(); | |||
| int InitTmpBuffer(); | |||
| void ConfigInputOutput(); | |||
| private: | |||
| void FreeTmpBuffer() { | |||
| if (nhwc4_input_ != nullptr) { | |||
| ctx_->allocator->Free(nhwc4_input_); | |||
| nhwc4_input_ = nullptr; | |||
| } | |||
| if (packed_input_ != nullptr) { | |||
| ctx_->allocator->Free(packed_input_); | |||
| packed_input_ = nullptr; | |||
| @@ -236,12 +236,8 @@ kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::Tensor *> & | |||
| if (kernel_h == 1 && kernel_w == 1) { | |||
| kernel = new (std::nothrow) kernel::Convolution1x1CPUKernel(op_parameter, inputs, outputs, ctx, primitive); | |||
| } else if (use_winograd) { | |||
| if (kernel_h == 3 && kernel_w == 3 && out_unit == 2) { | |||
| kernel = new (std::nothrow) kernel::Convolution3x3CPUKernel(op_parameter, inputs, outputs, ctx, primitive); | |||
| } else { | |||
| kernel = new (std::nothrow) | |||
| kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit); | |||
| } | |||
| kernel = | |||
| new (std::nothrow) kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit); | |||
| } else { | |||
| kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive); | |||
| } | |||
| @@ -89,7 +89,13 @@ int ConvolutionInt8CPUKernel::InitWeightBias() { | |||
| MS_LOG(ERROR) << "malloc weight_sum failed."; | |||
| return RET_ERROR; | |||
| } | |||
| for (int i = 0; i < output_channel; i++) weight_sum[i] = 0; | |||
| for (int i = 0; i < output_channel; i++) { | |||
| if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) { | |||
| weight_sum[i] = ic4 * C4NUM * kernel_plane * filter_arg[i].zp_; | |||
| } else { | |||
| weight_sum[i] = ic4 * C4NUM * kernel_plane * filter_arg[0].zp_; | |||
| } | |||
| } | |||
| PackWeightInt8(origin_weight, conv_param_, packed_weight_, weight_sum); | |||
| // init bias | |||
| @@ -190,7 +196,13 @@ int ConvolutionInt8CPUKernel::InitWeightBiasOpt() { | |||
| MS_LOG(ERROR) << "malloc weight_sum failed."; | |||
| return RET_ERROR; | |||
| } | |||
| for (int i = 0; i < output_channel; i++) weight_sum[i] = 0; | |||
| for (int i = 0; i < output_channel; i++) { | |||
| if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) { | |||
| weight_sum[i] = ic4 * C4NUM * kernel_plane * filter_arg[i].zp_; | |||
| } else { | |||
| weight_sum[i] = ic4 * C4NUM * kernel_plane * filter_arg[0].zp_; | |||
| } | |||
| } | |||
| PackWeightInt8Opt(origin_weight, conv_param_, packed_weight_, weight_sum); | |||
| // init bias | |||
| @@ -261,14 +273,7 @@ int ConvolutionInt8CPUKernel::InitTmpBufferOpt() { | |||
| return RET_OK; | |||
| } | |||
| void ConvolutionInt8CPUKernel::ConfigInputOutput() { /* Marks the kernel's output tensor as using the NHWC format. */ | |||
| auto output_tensor = out_tensors_.at(kOutputIndex); | |||
| output_tensor->SetFormat(schema::Format::Format_NHWC); | |||
| } | |||
| int ConvolutionInt8CPUKernel::Init() { | |||
| // config input output | |||
| ConfigInputOutput(); | |||
| CheckSupportOptimize(); | |||
| auto ret = SetQuantParam(); | |||
| if (ret != RET_OK) { | |||
| @@ -51,7 +51,6 @@ class ConvolutionInt8CPUKernel : public ConvolutionBaseCPUKernel { | |||
| int InitTmpBufferOpt(); | |||
| int InitWeightBias(); | |||
| int InitTmpBuffer(); | |||
| void ConfigInputOutput(); | |||
| private: | |||
| void FreeTmpBuffer() { | |||