diff --git a/mindspore/lite/nnacl/conv_parameter.h b/mindspore/lite/nnacl/conv_parameter.h index ef01f73ab2..ae1a60982f 100644 --- a/mindspore/lite/nnacl/conv_parameter.h +++ b/mindspore/lite/nnacl/conv_parameter.h @@ -71,4 +71,57 @@ typedef struct SlidingWindowParam { int kernel_step_; } SlidingWindowParam; +#define DECONV_WINOGRAD_DEFAULT_UNIT 3 +#define DECONV_WINOGRAD_DEFAULT_TILE 8 +#define DECONV_WINOGRAD_BUFFER_COUNT 8 +typedef struct DeConvWg { + void *b_buffer_; + void *AT_; + void *BT_; + + int kh_; + int kw_; + + int k_; + int i_; + int o_; +} DeConvWg; + +typedef struct DeConvWgABuffer { + bool buf_init_; + bool trans_formed_; + void *middle_buffer_; + void *dest_buffer_; +} DeConvWgABuffer; + +typedef struct DeConvComputeUnit { + void *weight_; + void *tmp_buffer_; + int w_start_; + int h_start_; + int w_size_; + int h_size_; + bool use_winograd_; + DeConvWg winograd_; +} DeConvComputeUnit; + +typedef struct DeConvParam { + DeConvComputeUnit *compute_units_; + int compute_size_; + DeConvWgABuffer a_buffer_[DECONV_WINOGRAD_BUFFER_COUNT]; + int input_plane_; + int output_plane_; + int kernel_plane_; + int ic_div4_; + int oc_div4_; + int ic_up4_; + int oc_up4_; + int thread_num_; + int in_tile_count_; + int in_tile_h_count_; + int in_tile_w_count_; + int out_tile_h_; + int out_tile_w_; +} DeConvParam; + #endif // MINDSPORE_LITE_NNACL_CONV_PARAMETER_H_ diff --git a/mindspore/lite/nnacl/errorcode.h b/mindspore/lite/nnacl/errorcode.h index a26a095997..f80ce1bb67 100644 --- a/mindspore/lite/nnacl/errorcode.h +++ b/mindspore/lite/nnacl/errorcode.h @@ -22,7 +22,7 @@ typedef enum ErrorCodeCommonEnum { NNACL_ERR = 1, NNACL_NULL_PTR, NNACL_PARAM_INVALID, - OPLIB_COMMON_END = 9999 + NNACL_COMMON_END = 9999 } ErrorCodeCommonEnum; typedef enum ErrorCodeFp32OpEnum { @@ -34,6 +34,7 @@ typedef enum ErrorCodeFp32OpEnum { NNACL_ERRCODE_LOG_NEGATIVE_OR_ZERO, NNACL_ERRCODE_DIVISOR_ZERO, NNACL_ERRCODE_INDEX_OUT_OF_RANGE, + NNACL_ERRCODE_WINOGRAD_GENERATOR_ERROR, NNACL_ERRCODE_OP_FP32_END = 19999 } ErrorCodeFp32OpEnum; diff --git a/mindspore/lite/nnacl/fp32/common_func.c b/mindspore/lite/nnacl/fp32/common_func.c index 2ff8a4303f..fe6ed6e9b2 100644 --- a/mindspore/lite/nnacl/fp32/common_func.c +++ b/mindspore/lite/nnacl/fp32/common_func.c @@ -15,8 +15,9 @@ */ #include "nnacl/fp32/common_func.h" + void PostConvFuncComm(const float *src_ptr_, float *out_ptr, const float *bias_ptr, size_t output_channel, - size_t plane_size, size_t stride, bool is_relu, bool is_relu6, int size) { + size_t plane_size, size_t plane_stride, size_t oc_stride, bool is_relu, bool is_relu6, int size) { int oc_div = 0, oc_mod = 0; for (int oc = 0; oc < output_channel; oc++) { if (size != 0) { @@ -26,8 +27,8 @@ void PostConvFuncComm(const float *src_ptr_, float *out_ptr, const float *bias_p return; } for (int hw = 0; hw < plane_size; hw++) { - int src_index = oc_div * size * plane_size + hw * size + oc_mod; - int dst_index = hw * stride + oc; + int src_index = oc_div * size * plane_stride + hw * size + oc_mod; + int dst_index = hw * oc_stride + oc; float value = src_ptr_[src_index]; if (bias_ptr != NULL) { value = value + bias_ptr[oc]; @@ -43,7 +44,8 @@ void PostConvFuncComm(const float *src_ptr_, float *out_ptr, const float *bias_p void PostConvFuncFp32C8(const float *c8_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel, size_t plane_size, size_t stride, bool is_relu, bool is_relu6) { #ifndef ENABLE_ARM - PostConvFuncComm(c8_out_ptr, out_ptr, bias_ptr, output_channel, plane_size, stride, is_relu, is_relu6, C8NUM); + PostConvFuncComm(c8_out_ptr, out_ptr, bias_ptr, output_channel, plane_size, plane_size, stride, is_relu, is_relu6, + C8NUM); #else size_t oc8mod = output_channel % C8NUM; size_t oc8div = output_channel - oc8mod; @@ -55,6 +57,59 @@ void PostConvFuncFp32C8(const float *c8_out_ptr, float *out_ptr, const float *bi return; } +void PostConvFuncFp32C4(const float *c4_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel, + size_t plane_size, size_t plane_stride, bool is_relu, bool is_relu6) { + PostConvFuncComm(c4_out_ptr, out_ptr, bias_ptr, output_channel, plane_size, plane_stride, output_channel, is_relu, + is_relu6, C4NUM); + return; +} + +void WinogradMatrixProductLeft(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length) { + int unitStep = 4 * length; + for (int y = 0; y < h; ++y) { + float *dstY = M + y * w * unitStep; + for (int x = 0; x < w; ++x) { + float *dstX = dstY + x * unitStep; + const float *srcX = S + x * unitStep; + memset(dstX, 0, unitStep * sizeof(float)); + for (int i = 0; i < k; ++i) { + float b = B[i * h + y]; + const float *srcY = srcX + i * w * unitStep; + if (0.0f == b) { + continue; + } + for (int j = 0; j < unitStep; ++j) { + dstX[j] += srcY[j] * b; + } + } + } + } +} + +// M = S * B , M = w*h * l, S = k*h * l, B = w*k +void WinogradMatrixProductRight(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length) { + int unitStep = 4 * length; + for (int y = 0; y < h; ++y) { + float *dstY = M + y * w * unitStep; + const float *srcY = S + y * k * unitStep; + + for (int x = 0; x < w; ++x) { + float *dstX = dstY + x * unitStep; + memset(dstX, 0, unitStep * sizeof(float)); + for (int i = 0; i < k; ++i) { + const float *srcX = srcY + i * unitStep; + float b = B[i * h + x]; + if (0.0f == b) { + continue; + } + for (int j = 0; j < unitStep; ++j) { + dstX[j] += srcX[j] * b; + } + } + } + } +} + union float32_bits { unsigned int u; float f; diff --git a/mindspore/lite/nnacl/fp32/common_func.h b/mindspore/lite/nnacl/fp32/common_func.h index 4969851225..ab40622391 100644 --- a/mindspore/lite/nnacl/fp32/common_func.h +++ b/mindspore/lite/nnacl/fp32/common_func.h @@ -29,6 +29,12 @@ extern "C" { void PostConvFuncFp32C8(const float *c8_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel, size_t plane_size, size_t stride, bool is_relu, bool is_relu6); +void PostConvFuncFp32C4(const float *c4_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel, + size_t plane_size, size_t plane_stride, bool is_relu, bool is_relu6); + +void WinogradMatrixProductLeft(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length); +void WinogradMatrixProductRight(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length); + float ShortToFloat32(uint16_t src_value); uint16_t Float32ToShort(float src_value); diff --git a/mindspore/lite/nnacl/fp32/deconv.c b/mindspore/lite/nnacl/fp32/deconv.c index 61c27e12f0..a6f41bd424 100644 --- a/mindspore/lite/nnacl/fp32/deconv.c +++ b/mindspore/lite/nnacl/fp32/deconv.c @@ -33,9 +33,10 @@ void PackDeConvWeightFp32(const float *weight, float *dst, int input_channel, in return; } -int DeConvPostFp32C12x8(const float *src, float *tmp, const float *bias, float *dst, int output_channel, - ConvParameter *conv_param) { - /* row12x8-major(ih*iw x oc*kh*kw) -> row8-major(oh*ow x oc) */ +void DeConvPostFp32C8(const float *src, float *tmp, const float *bias, float *dst, int output_channel, + ConvParameter *conv_param) { + /* arm64 row12x8-major(ih*iw x oc*kh*kw) -> row8-major(oh*ow x oc) */ + /* arm32 row4x8-major(ih*iw x oc*kh*kw) -> row8-major(oh*ow x oc) */ size_t input_plane = conv_param->input_w_ * conv_param->input_h_; size_t kernel_plane = conv_param->kernel_w_ * conv_param->kernel_h_; size_t output_plane = conv_param->output_w_ * conv_param->output_h_; @@ -45,11 +46,11 @@ int DeConvPostFp32C12x8(const float *src, float *tmp, const float *bias, float * #else const int tile_num = 12; #endif - int in_plane12 = UP_ROUND(input_plane, tile_num); + int in_plane_round = UP_ROUND(input_plane, tile_num); int src_iw_stride = C8NUM; int src_ih_stride = conv_param->input_w_ * C8NUM; - int src_kw_stride = in_plane12 * C8NUM; - int src_kh_stride = in_plane12 * conv_param->kernel_w_ * C8NUM; + int src_kw_stride = in_plane_round * C8NUM; + int src_kh_stride = in_plane_round * conv_param->kernel_w_ * C8NUM; int dst_oh_stride = conv_param->output_w_ * C8NUM; int dst_ow_stride = C8NUM; int dst_kh_stride = conv_param->dilation_h_ * conv_param->output_w_ * C8NUM; @@ -57,7 +58,7 @@ int DeConvPostFp32C12x8(const float *src, float *tmp, const float *bias, float * for (int c = 0; c < oc8; c += 8) { float *dst_ptr = tmp + c * output_plane; - const float *src_ptr = src + c * in_plane12 * kernel_plane; + const float *src_ptr = src + c * in_plane_round * kernel_plane; memset(dst_ptr, 0, output_plane * C8NUM * sizeof(float)); for (int ih = 0; ih < conv_param->input_h_; ih++) { @@ -104,5 +105,5 @@ int DeConvPostFp32C12x8(const float *src, float *tmp, const float *bias, float * PostConvFuncFp32C8(tmp, dst, bias, output_channel, output_plane, conv_param->output_channel_, conv_param->act_type_ == ActType_Relu, conv_param->act_type_ == ActType_Relu6); - return NNACL_OK; + return; } diff --git a/mindspore/lite/nnacl/fp32/deconv.h b/mindspore/lite/nnacl/fp32/deconv.h index 65b9b41fbe..e3140d39ee 100644 --- a/mindspore/lite/nnacl/fp32/deconv.h +++ b/mindspore/lite/nnacl/fp32/deconv.h @@ -22,13 +22,15 @@ #include "nnacl/conv_parameter.h" #include "nnacl/errorcode.h" #include "nnacl/fp32/common_func.h" +#include "nnacl/fp32/conv.h" +#include "nnacl/minimal_filtering_generator.h" #ifdef __cplusplus extern "C" { #endif void PackDeConvWeightFp32(const float *weight, float *dst, int input_channel, int output_channel, int plane); -int DeConvPostFp32C12x8(const float *src, float *tmp_out, const float *bias, float *dst, int output_channel, - ConvParameter *conv_param); +void DeConvPostFp32C8(const float *src, float *tmp_out, const float *bias, float *dst, int output_channel, + ConvParameter *conv_param); #ifdef __cplusplus } #endif diff --git a/mindspore/lite/nnacl/fp32/deconv_winograd.c b/mindspore/lite/nnacl/fp32/deconv_winograd.c new file mode 100644 index 0000000000..24da2dc977 --- /dev/null +++ b/mindspore/lite/nnacl/fp32/deconv_winograd.c @@ -0,0 +1,337 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nnacl/fp32/deconv_winograd.h" + +int PackDeConvWgDataFp32(float *nhwc_weight, DeConvComputeUnit *unit, ConvParameter *conv_param, + DeConvParam *deconv_param) { + int tmp_kernel_plane = unit->w_size_ * unit->h_size_; + int size = conv_param->input_channel_ * conv_param->output_channel_ * tmp_kernel_plane; + float *current_unit_weight = (float *)malloc(size * sizeof(float)); + if (current_unit_weight == NULL) { + return NNACL_NULL_PTR; + } + for (int ic = 0; ic < conv_param->input_channel_; ic++) { + float *src_ic = nhwc_weight + deconv_param->kernel_plane_ * conv_param->output_channel_ * ic; + float *dst_ic = current_unit_weight + tmp_kernel_plane * conv_param->output_channel_ * ic; + for (int uhi = 0; uhi < unit->h_size_; uhi++) { + for (int uwi = 0; uwi < unit->w_size_; uwi++) { + int src_h_offset = unit->h_start_ + uhi * conv_param->stride_h_; + int src_w_offset = unit->w_start_ + uwi * conv_param->stride_w_; + float *src_hw = src_ic + (src_h_offset * conv_param->kernel_w_ + src_w_offset) * conv_param->output_channel_; + float *dst_hw = dst_ic + (uhi * unit->w_size_ + uwi) * conv_param->output_channel_; + memcpy(dst_hw, src_hw, conv_param->output_channel_ * sizeof(float)); + } + } + } + + if (unit->use_winograd_) { + /* Generate winograd */ + float matrix_g[64]; + float matrix_gt[64]; + float matrix_a[64]; + float matrix_at[64]; + float matrix_b[64]; + float matrix_bt[64]; + int ret = CookToomFilter(matrix_a, matrix_at, matrix_b, matrix_bt, matrix_g, matrix_gt, 0.5f, + DECONV_WINOGRAD_DEFAULT_UNIT, unit->h_size_); + if (ret != NNACL_OK) { + return NNACL_ERRCODE_WINOGRAD_GENERATOR_ERROR; + } + + /* winograd AT */ + unit->winograd_.AT_ = malloc(unit->winograd_.i_ * unit->winograd_.o_ * sizeof(float)); + if (unit->winograd_.AT_ == NULL) { + return NNACL_NULL_PTR; + } + memcpy(unit->winograd_.AT_, matrix_at, unit->winograd_.i_ * unit->winograd_.o_ * sizeof(float)); + + /* winograd BT */ + unit->winograd_.BT_ = malloc(unit->winograd_.o_ * unit->winograd_.o_ * sizeof(float)); + if (unit->winograd_.BT_ == NULL) { + return NNACL_NULL_PTR; + } + memcpy(unit->winograd_.BT_, matrix_bt, unit->winograd_.o_ * unit->winograd_.o_ * sizeof(float)); + + /* winograd Weight */ + size = conv_param->input_channel_ * conv_param->output_channel_ * unit->winograd_.kh_ * unit->winograd_.kw_; + float *winograd_unit_weight = (float *)malloc(size * sizeof(float)); + if (winograd_unit_weight == NULL) { + return NNACL_NULL_PTR; + } + WinogradWeightTransform(current_unit_weight, winograd_unit_weight, matrix_g, matrix_gt, C4NUM, unit->winograd_.kh_, + unit->h_size_, conv_param->output_channel_, conv_param->input_channel_, false); + + /* reset weight data & info */ + tmp_kernel_plane = unit->winograd_.kh_ * unit->winograd_.kw_; + free(current_unit_weight); + current_unit_weight = winograd_unit_weight; + winograd_unit_weight = NULL; + } + + /* trans mhwc -> hw1:k1-knc0-c4:k1-knc5-c8:hw2:k1-knc0-c4:k1 */ + float *dst_weight = (float *)unit->weight_; + size = deconv_param->ic_up4_ * deconv_param->oc_up4_ * tmp_kernel_plane; + memset(dst_weight, 0, size * sizeof(float)); + for (int ic = 0; ic < conv_param->input_channel_; ic++) { + for (int oc = 0; oc < conv_param->output_channel_; oc++) { + int oc4div = oc / C4NUM, oc4mod = oc % C4NUM; + for (int upi = 0; upi < tmp_kernel_plane; upi++) { + int src_index = ic * conv_param->output_channel_ * tmp_kernel_plane + upi * conv_param->output_channel_ + oc; + int dst_index = upi * deconv_param->oc_up4_ * deconv_param->ic_up4_ + oc4div * C4NUM * deconv_param->ic_up4_ + + ic * C4NUM + oc4mod; + dst_weight[dst_index] = current_unit_weight[src_index]; + } + } + } + + free(current_unit_weight); + return NNACL_OK; +} + +void DeConvWgInputPack(float *src_ptr, float *dst_ptr, int channel, int stride) { + int ic4div = channel / C4NUM; + int ic4mod = channel % C4NUM; + float *src = src_ptr; + float *dst = dst_ptr; + + for (int ic = 0; ic < ic4div; ic++) { + memcpy(dst, src, C4NUM * sizeof(float)); + dst += stride; + src += C4NUM; + } + + if (ic4mod != 0) { + int ic_res = 0; + for (; ic_res < ic4mod; ic_res++) { + dst[ic_res] = src[ic_res]; + } + for (; ic_res < C4NUM; ic_res++) { + dst[ic_res] = 0; + } + } + return; +} + +void MSGemmFloatCommon_4(float *dst, const float *src, const float *weight, size_t src_depth_quad, size_t dst_step, + size_t dst_depth_quad, size_t width, size_t weight_depth_offset) { + int dx, sz, dz; + int src_depth_step = 4 * width; + for (dz = 0; dz < dst_depth_quad; ++dz) { + float *dst_z = dst + dz * dst_step; + const float *weight_dz = weight + dz * (src_depth_quad * 16 + weight_depth_offset); + for (dx = 0; dx < width; ++dx) { + float *dst_x = dst_z + dx * 4; + dst_x[0] = 0.0f; + dst_x[1] = 0.0f; + dst_x[2] = 0.0f; + dst_x[3] = 0.0f; + const float *src_dx = src + 4 * dx; + for (sz = 0; sz < src_depth_quad; ++sz) { + const float *src_z = src_dx + sz * src_depth_step; + const float *weight_z = weight_dz + sz * 16; + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + dst_x[j] += src_z[i] * weight_z[4 * i + j]; + } + } + } + } + } +} + +void MSGemmFloatUnit_4(float *dstOrigin, const float *src, const float *weight, size_t src_depth_quad, size_t dst_step, + size_t dst_depth_quad, size_t weight_depth_offset) { + MSGemmFloatCommon_4(dstOrigin, src, weight, src_depth_quad, dst_step, dst_depth_quad, DECONV_WINOGRAD_DEFAULT_TILE, + weight_depth_offset); +} + +void DeConvWgMerge(const float *source, float *dest, size_t srcStride, size_t dstStride, size_t count) { + for (int i = 0; i < count; ++i) { + const float *s = source + i * srcStride; + float *d = dest + i * dstStride; + for (int j = 0; j < 4; ++j) { + d[j] += s[j]; + } + } +} + +void _deConvWinograd(float *tile_in, float *tile_out, float *weight_buf, float *tmp_buf, float *at_buf, + float *a_mid_buf, float *trans_a_buf, bool a_trans, float *bt_buf, float *b_tmp_buf, int unit_size, + int w_start, int h_start, ConvParameter *conv_param, DeConvParam *deconv_param) { + int winograd_plane = unit_size * unit_size; + if (!a_trans) { + WinogradMatrixProductLeft(tile_in, at_buf, a_mid_buf, DECONV_WINOGRAD_DEFAULT_UNIT, unit_size, + DECONV_WINOGRAD_DEFAULT_UNIT, deconv_param->ic_div4_ * DECONV_WINOGRAD_DEFAULT_TILE); + WinogradMatrixProductRight(a_mid_buf, at_buf, trans_a_buf, unit_size, unit_size, DECONV_WINOGRAD_DEFAULT_UNIT, + deconv_param->ic_div4_ * DECONV_WINOGRAD_DEFAULT_TILE); + } + + for (int index = 0; index < winograd_plane; index++) { + float *src = trans_a_buf + index * DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->ic_up4_; + float *dst = tmp_buf + index * deconv_param->oc_up4_ * DECONV_WINOGRAD_DEFAULT_TILE; + float *weight = weight_buf + index * deconv_param->ic_up4_ * deconv_param->oc_up4_; + MSGemmFloatUnit_4(dst, src, weight, deconv_param->ic_div4_, DECONV_WINOGRAD_DEFAULT_TILE * C4NUM, + deconv_param->oc_div4_, 0); + } + + WinogradMatrixProductLeft(tmp_buf, bt_buf, b_tmp_buf, unit_size, unit_size, unit_size, + deconv_param->oc_div4_ * DECONV_WINOGRAD_DEFAULT_TILE); + WinogradMatrixProductRight(b_tmp_buf, bt_buf, tmp_buf, unit_size, unit_size, unit_size, + deconv_param->oc_div4_ * DECONV_WINOGRAD_DEFAULT_TILE); + + // Add to dest + for (int uhi = 0; uhi < unit_size; uhi++) { + int h_index = uhi * conv_param->stride_h_ + h_start; + for (int uwi = 0; uwi < unit_size; uwi++) { + int w_index = uwi * conv_param->stride_w_ + w_start; + + float *dst = tile_out + w_index * DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->oc_up4_ + + h_index * deconv_param->out_tile_w_ * DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->oc_up4_; + float *src = tmp_buf + (uwi + uhi * unit_size) * DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->oc_up4_; + DeConvWgMerge(src, dst, 4, 4, DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->oc_div4_); + } + } + return; +} + +void _deConvCommon(float *tile_in, float *tile_out, float *weight, float *tmp_buf, int h_start, int w_start, int h_size, + int w_size, ConvParameter *conv_param, DeConvParam *deconv_param) { + int count = deconv_param->oc_div4_ * w_size * h_size; + int in_stride = DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->ic_up4_; + int out_stride = DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->oc_up4_; + + for (int hi = 0; hi < DECONV_WINOGRAD_DEFAULT_UNIT; hi++) { + for (int wi = 0; wi < DECONV_WINOGRAD_DEFAULT_UNIT; wi++) { + float *src_in = tile_in + (wi + hi * DECONV_WINOGRAD_DEFAULT_UNIT) * in_stride; + MSGemmFloatUnit_4(tmp_buf, src_in, weight, deconv_param->ic_div4_, DECONV_WINOGRAD_DEFAULT_TILE * 4, count, 0); + + for (int uhi = 0; uhi < h_size; uhi++) { + for (int uwi = 0; uwi < w_size; uwi++) { + int w_index = (wi + uwi) * conv_param->stride_w_ + w_start; + int h_index = (hi + uhi) * conv_param->stride_h_ + h_start; + float *dst = tile_out + h_index * out_stride * deconv_param->out_tile_w_ + w_index * out_stride; + float *src = tmp_buf + (uwi + uhi * w_size) * out_stride; + DeConvWgMerge(src, dst, 4, 4, DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->oc_div4_); + } + } + } + } + + return; +} + +void DeconvWg(float *nhwc_input_, float *tile_in, float *tile_out, int start_index, int calculate_count, + ConvParameter *conv_param, DeConvParam *deconv_param, int task_id) { + /* pack tile input */ + int tile_in_unit_stride = deconv_param->ic_up4_ * DECONV_WINOGRAD_DEFAULT_TILE; +#ifdef ENABLE_ARM + float32x4_t zero = vdupq_n_f32(0.0f); +#endif + for (int unit_index = 0; unit_index < calculate_count; unit_index++) { + int plane_index = start_index + unit_index; + int w_unit_index = plane_index % deconv_param->in_tile_w_count_; + int h_unit_index = plane_index / deconv_param->in_tile_w_count_; + int w_start = w_unit_index * DECONV_WINOGRAD_DEFAULT_UNIT; + int h_start = h_unit_index * DECONV_WINOGRAD_DEFAULT_UNIT; + + float *dst_unit = tile_in + unit_index * C4NUM; + for (int hi = 0; hi < DECONV_WINOGRAD_DEFAULT_UNIT; hi++) { + for (int wi = 0; wi < DECONV_WINOGRAD_DEFAULT_UNIT; wi++) { + float *dst = dst_unit + (wi + hi * DECONV_WINOGRAD_DEFAULT_UNIT) * tile_in_unit_stride; + int w_index = w_start + wi; + int h_index = h_start + hi; + if (w_index >= conv_param->input_w_ || h_index >= conv_param->input_h_) { + for (int ic4_index = 0; ic4_index < deconv_param->ic_div4_; ic4_index++) { +#ifdef ENABLE_ARM + vst1q_f32(dst + ic4_index * DECONV_WINOGRAD_DEFAULT_TILE * C4NUM, zero); +#else + for (int i = 0; i < 4; i++) { + dst[C4NUM * DECONV_WINOGRAD_DEFAULT_TILE * ic4_index + i] = 0; + } +#endif + } + continue; + } + + float *src = nhwc_input_ + (w_index + h_index * conv_param->input_w_) * conv_param->input_channel_; + DeConvWgInputPack(src, dst, conv_param->input_channel_, DECONV_WINOGRAD_DEFAULT_TILE * C4NUM); + } + } + } + + /* compute */ + for (int i = 0; i < deconv_param->compute_size_; i++) { + DeConvComputeUnit *unit = &deconv_param->compute_units_[i]; + if (unit->use_winograd_) { + float *tmp_buf = (float *)unit->tmp_buffer_ + task_id * unit->winograd_.kh_ * unit->winograd_.kw_ * + deconv_param->oc_div4_ * DECONV_WINOGRAD_DEFAULT_TILE * C4NUM; + + /* winograd a buffer */ + DeConvWgABuffer *wg_buf = &deconv_param->a_buffer_[unit->winograd_.kh_]; + float *wg_mid_a_buf = (float *)wg_buf->middle_buffer_ + task_id * unit->winograd_.kw_ * unit->winograd_.kh_ * + DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->ic_up4_; + float *wg_dst_a_buf = (float *)wg_buf->dest_buffer_ + task_id * unit->winograd_.kw_ * unit->winograd_.kh_ * + DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->ic_up4_; + float *tmp_b_buf = (float *)unit->winograd_.b_buffer_ + task_id * unit->winograd_.kh_ * unit->winograd_.kw_ * + deconv_param->oc_up4_ * DECONV_WINOGRAD_DEFAULT_TILE; + _deConvWinograd(tile_in, tile_out, (float *)unit->weight_, tmp_buf, unit->winograd_.AT_, wg_mid_a_buf, + wg_dst_a_buf, wg_buf->trans_formed_, unit->winograd_.BT_, tmp_b_buf, unit->winograd_.kh_, + unit->w_start_, unit->h_start_, conv_param, deconv_param); + wg_buf->trans_formed_ = true; + } else { + float *tmp_buf = (float *)unit->tmp_buffer_ + task_id * deconv_param->oc_div4_ * unit->w_size_ * unit->h_size_ * + DECONV_WINOGRAD_DEFAULT_TILE * C4NUM; + _deConvCommon(tile_in, tile_out, (float *)unit->weight_, tmp_buf, unit->h_start_, unit->w_start_, unit->h_size_, + unit->w_size_, conv_param, deconv_param); + } + } + return; +} + +void DeconvWgPost(float *tile_out, float *nc4hw4_output, ConvParameter *conv_param, DeConvParam *deconv_param, + int calculate_count, int tile_index) { + /* merge */ + int src_unit_stride = deconv_param->oc_up4_ * DECONV_WINOGRAD_DEFAULT_TILE; + + int src_stride = DECONV_WINOGRAD_DEFAULT_TILE * C4NUM; + int dst_stride = conv_param->output_w_ * conv_param->output_h_ * C4NUM; + + for (int index = 0; index < calculate_count; ++index) { + float *src_start = tile_out + index * C4NUM; + + int plane_index = tile_index * DECONV_WINOGRAD_DEFAULT_TILE + index; + int w_unit_index = plane_index % deconv_param->in_tile_w_count_; + int h_unit_index = plane_index / deconv_param->in_tile_w_count_; + int w_start = w_unit_index * DECONV_WINOGRAD_DEFAULT_UNIT * conv_param->stride_w_ - conv_param->pad_l_; + int h_start = h_unit_index * DECONV_WINOGRAD_DEFAULT_UNIT * conv_param->stride_h_ - conv_param->pad_u_; + float *dst_start = nc4hw4_output + h_start * conv_param->output_w_ * C4NUM + w_start * C4NUM; + + int merge_w_start = MSMAX(-w_start, 0); + int merge_h_start = MSMAX(-h_start, 0); + int merge_h_end = MSMIN(deconv_param->out_tile_h_, conv_param->output_h_ - h_start); + int merge_w_end = MSMIN(deconv_param->out_tile_w_, conv_param->output_w_ - w_start); + + for (int hi = merge_h_start; hi < merge_h_end; hi++) { + for (int wi = merge_w_start; wi < merge_w_end; wi++) { + float *src = src_start + (hi * deconv_param->out_tile_w_ + wi) * src_unit_stride; + float *dst = dst_start + (hi * conv_param->output_w_ + wi) * C4NUM; + DeConvWgMerge(src, dst, src_stride, dst_stride, deconv_param->oc_div4_); + } + } + } + return; +} diff --git a/mindspore/lite/nnacl/fp32/deconv_winograd.h b/mindspore/lite/nnacl/fp32/deconv_winograd.h new file mode 100644 index 0000000000..576b772d7d --- /dev/null +++ b/mindspore/lite/nnacl/fp32/deconv_winograd.h @@ -0,0 +1,42 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_NNACL_FP32_DECONV_WINOGRAD_H_ +#define MINDSPORE_LITE_NNACL_FP32_DECONV_WINOGRAD_H_ + +#include +#include "nnacl/pack.h" +#include "nnacl/op_base.h" +#include "nnacl/conv_parameter.h" +#include "nnacl/errorcode.h" +#include "nnacl/fp32/common_func.h" +#include "nnacl/minimal_filtering_generator.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int PackDeConvWgDataFp32(float *nhwc_weight, DeConvComputeUnit *unit, ConvParameter *conv_param, + DeConvParam *deconv_param); +void DeconvWg(float *nhwc_input_, float *tile_in, float *tile_out, int start_index, int calculate_count, + ConvParameter *conv_param, DeConvParam *deconv_param, int task_id); +void DeconvWgPost(float *tile_out, float *nc4hw4_output, ConvParameter *conv_param, DeConvParam *deconv_param, + int calculate_count, int tile_index); + +#ifdef __cplusplus +} +#endif + +#endif // MINDSPORE_LITE_NNACL_FP32_DECONV_WINOGRAD_H_ diff --git a/mindspore/lite/nnacl/minimal_filtering_generator.c b/mindspore/lite/nnacl/minimal_filtering_generator.c index 0d41315051..7e4b0be340 100644 --- a/mindspore/lite/nnacl/minimal_filtering_generator.c +++ b/mindspore/lite/nnacl/minimal_filtering_generator.c @@ -254,3 +254,88 @@ void MatrixMultiplyVec(const float32x4_t *matrix_a, const float32x4_t *matrix_b, } } #endif + +int WinogradWeightTransform(const float *weight_data, float *winograd_data, float *matrix_g, float *matrix_gt, + int oc_block, int input_unit, int kernel_unit, int channel, int batch, bool pack) { + // original weight format : ohwi + int oc_block_num = UP_DIV(batch, oc_block); + int block_stride = channel * oc_block; + int block_num_stride = block_stride * oc_block_num; + + // trans_filter = G*g*GT (g represents weight_data) + // separate into two steps ===> tmp = (g * GT)T ===> trans = (tmp * GT)T use same function:MatrixMultiplyWinograd + float *tmp_data = (float *)(malloc(channel * input_unit * kernel_unit * sizeof(float))); + if (tmp_data == NULL) { + return NNACL_ERR; + } + float *trans_out_data = (float *)(malloc(channel * input_unit * input_unit * sizeof(float))); + if (trans_out_data == NULL) { + free(tmp_data); + return NNACL_ERR; + } + +#ifndef ENABLE_ARM + float *tmp_data1 = (float *)(malloc(channel * input_unit * kernel_unit * sizeof(float))); + if (tmp_data1 == NULL) { + free(tmp_data); + free(trans_out_data); + return NNACL_ERR; + } + float *trans_out_data1 = (float *)(malloc(channel * input_unit * input_unit * sizeof(float))); + if (trans_out_data1 == NULL) { + free(tmp_data); + free(tmp_data1); + free(trans_out_data); + return NNACL_ERR; + } +#endif + + int input_oz_offset = kernel_unit * kernel_unit * channel; + for (int i = 0; i < batch; i++) { + int out_c_block = i / oc_block; + int out_c_res = i % oc_block; + int output_oz_offset = out_c_block * block_stride + out_c_res; + +#ifndef ENABLE_ARM + // tmp_data = g * GT + MatrixMultiplyWinograd(weight_data + i * input_oz_offset, matrix_gt, tmp_data, kernel_unit, kernel_unit, input_unit, + channel, channel * 4); + // tmp_data1 = (tmp_data)T + PackHWCToWHC(tmp_data, tmp_data1, kernel_unit, input_unit, channel); + // trans_out_data1 = tmp * GT + MatrixMultiplyWinograd(tmp_data1, matrix_gt, trans_out_data1, input_unit, kernel_unit, input_unit, channel, + channel * 4); + // trans_out_data = (trans_out_data1)T + PackHWCToWHC(trans_out_data1, trans_out_data, input_unit, input_unit, channel); +#else + // tmp = (g * GT)T + MatrixMultiplyWinograd(weight_data + i * input_oz_offset, matrix_gt, tmp_data, kernel_unit, kernel_unit, input_unit, + channel, channel * 4); + // trans = (tmp * GT)T + MatrixMultiplyWinograd(tmp_data, matrix_gt, trans_out_data, input_unit, kernel_unit, input_unit, channel, + channel * 4); +#endif + if (pack) { + int in_offset = 0; + for (int j = 0; j < input_unit; ++j) { + for (int k = 0; k < input_unit; ++k) { + for (int c = 0; c < channel; ++c) { + *(winograd_data + output_oz_offset + c * oc_block) = trans_out_data[in_offset + c]; + } + in_offset += channel; + output_oz_offset += block_num_stride; + } + } + } else { + memcpy(winograd_data + i * channel * input_unit * input_unit, trans_out_data, + channel * input_unit * input_unit * sizeof(float)); + } + } +#ifndef ENABLE_ARM + free(tmp_data1); + free(trans_out_data1); +#endif + free(tmp_data); + free(trans_out_data); + return NNACL_OK; +} diff --git a/mindspore/lite/nnacl/minimal_filtering_generator.h b/mindspore/lite/nnacl/minimal_filtering_generator.h index 2936bbedc0..53b146940a 100644 --- a/mindspore/lite/nnacl/minimal_filtering_generator.h +++ b/mindspore/lite/nnacl/minimal_filtering_generator.h @@ -20,6 +20,8 @@ #ifdef ENABLE_ARM #include #endif +#include +#include "nnacl/pack.h" #ifdef __cplusplus extern "C" { @@ -47,6 +49,9 @@ int CookToomFilter(float *matrix_a, float *matrix_at, float *matrix_b, float *ma void MatrixMultiplyWinograd(const float *matix_a, const float *matrix_b, float *matrix_c, int m, int k, int n, int in_channel, int c4_channel); +int WinogradWeightTransform(const float *weight_data, float *winograd_data, float *matrix_g, float *matrix_gt, + int oc_block, int input_unit_, int kernel_unit_, int channel, int batch, bool pack); + #ifdef ENABLE_ARM void MatrixMultiplyVec(const float32x4_t *matrix_a, const float32x4_t *matrix_b, float32x4_t *matrix_c, const float *bias, int m, int k, int n); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc index 47107834e5..60c2cbca39 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc @@ -36,89 +36,9 @@ int ConvolutionWinogradCPUKernel::WinogradFilterTransform(const float *weight_da MS_LOG(ERROR) << "Divide by zero"; return RET_ERROR; } - // original weight format : ohwi - auto channel_in = conv_param_->input_channel_; - auto channel_out = conv_param_->output_channel_; - int oc_block_num = UP_DIV(channel_out, oc_block); - int block_stride = channel_in * oc_block; - int block_num_stride = block_stride * oc_block_num; - // trans_filter = G*g*GT (g represents weight_data) - // separate into two steps ===> tmp = (g * GT)T ===> trans = (tmp * GT)T use same function:MatrixMultiplyWinograd - auto tmp_data = reinterpret_cast(malloc(channel_in * input_unit_ * kernel_unit_ * sizeof(float))); - if (tmp_data == nullptr) { - MS_LOG(ERROR) << "malloc tmp_data failed."; - return RET_MEMORY_FAILED; - } - auto trans_out_data = reinterpret_cast(malloc(channel_in * input_unit_ * input_unit_ * sizeof(float))); - if (trans_out_data == nullptr) { - free(tmp_data); - MS_LOG(ERROR) << "malloc trans_out_data failed."; - return RET_MEMORY_FAILED; - } - -#ifndef ENABLE_ARM - auto tmp_data1 = reinterpret_cast(malloc(channel_in * input_unit_ * kernel_unit_ * sizeof(float))); - if (tmp_data1 == nullptr) { - free(tmp_data); - free(trans_out_data); - MS_LOG(ERROR) << "malloc tmp_data1 failed."; - return RET_MEMORY_FAILED; - } - auto trans_out_data1 = reinterpret_cast(malloc(channel_in * input_unit_ * input_unit_ * sizeof(float))); - if (trans_out_data1 == nullptr) { - free(tmp_data); - free(tmp_data1); - free(trans_out_data); - MS_LOG(ERROR) << "malloc trans_out_data1 failed."; - return RET_MEMORY_FAILED; - } -#endif - - int input_oz_offset = kernel_unit_ * kernel_unit_ * channel_in; - for (int i = 0; i < channel_out; i++) { - int out_c_block = i / oc_block; - int out_c_res = i % oc_block; - int output_oz_offset = out_c_block * block_stride + out_c_res; - -#ifndef ENABLE_ARM - // tmp_data = g * GT - MatrixMultiplyWinograd(weight_data + i * input_oz_offset, matrix_gt, tmp_data, kernel_unit_, kernel_unit_, - input_unit_, channel_in, channel_in * 4); - // tmp_data1 = (tmp_data)T - PackHWCToWHC(tmp_data, tmp_data1, kernel_unit_, input_unit_, channel_in); - // trans_out_data1 = tmp * GT - MatrixMultiplyWinograd(tmp_data1, matrix_gt, trans_out_data1, input_unit_, kernel_unit_, input_unit_, channel_in, - channel_in * 4); - // trans_out_data = (trans_out_data1)T - PackHWCToWHC(trans_out_data1, trans_out_data, input_unit_, input_unit_, channel_in); -#else - // tmp = (g * GT)T - MatrixMultiplyWinograd(weight_data + i * input_oz_offset, matrix_gt, tmp_data, kernel_unit_, kernel_unit_, - input_unit_, channel_in, channel_in * 4); - // trans = (tmp * GT)T - MatrixMultiplyWinograd(tmp_data, matrix_gt, trans_out_data, input_unit_, kernel_unit_, input_unit_, channel_in, - channel_in * 4); -#endif - - int in_offset = 0; - for (int j = 0; j < input_unit_; ++j) { - for (int k = 0; k < input_unit_; ++k) { - for (int c = 0; c < channel_in; ++c) { - *(trans_weight_ + output_oz_offset + c * oc_block) = trans_out_data[in_offset + c]; - } - in_offset += channel_in; - output_oz_offset += block_num_stride; - } - } - } -#ifndef ENABLE_ARM - free(tmp_data1); - free(trans_out_data1); -#endif - free(tmp_data); - free(trans_out_data); - return RET_OK; + return WinogradWeightTransform(weight_data, trans_weight_, matrix_g, matrix_gt, oc_block, input_unit_, kernel_unit_, + conv_param_->input_channel_, conv_param_->output_channel_, true); } int ConvolutionWinogradCPUKernel::InitWeightBias() { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.cc index d219b44def..d3b65cab7c 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.cc @@ -15,6 +15,7 @@ */ #include "src/runtime/kernel/arm/fp32/deconvolution.h" +#include "src/runtime/kernel/arm/fp32/deconvolution_winograd.h" #include "src/runtime/runtime_api.h" using mindspore::kernel::KERNEL_ARCH::kCPU; @@ -125,9 +126,9 @@ int DeConvolutionCPUKernel::DoDeconv(int task_id) { matmul_param_->col_, OutType_C8); #endif - DeConvPostFp32C12x8(tmp_buffer, pack_output_ + task_id * thread_stride_ * C8NUM * output_plane_, - reinterpret_cast(bias_data_) + thread_stride_ * task_id * C8NUM, - output_ptr_ + task_id * thread_stride_ * C8NUM, oc_res, conv_param_); + DeConvPostFp32C8(tmp_buffer, pack_output_ + task_id * thread_stride_ * C8NUM * output_plane_, + reinterpret_cast(bias_data_) + thread_stride_ * task_id * C8NUM, + output_ptr_ + task_id * thread_stride_ * C8NUM, oc_res, conv_param_); return RET_OK; } @@ -246,7 +247,17 @@ kernel::LiteKernel *CpuDeConvFp32KernelCreator(const std::vector } weight_tensor->SetData(dequant_weight); } - auto kernel = new (std::nothrow) kernel::DeConvolutionCPUKernel(opParameter, inputs, outputs, ctx, primitive); + + kernel::LiteKernel *kernel; + auto conv_param = reinterpret_cast(opParameter); + if ((conv_param->stride_h_ != 1 || conv_param->stride_w_ != 1) && + (conv_param->dilation_w_ == 1 && conv_param->dilation_h_ == 1)) { + /* DeConvolutionWinogradCPUKernel */ + kernel = new (std::nothrow) kernel::DeConvolutionCPUKernel(opParameter, inputs, outputs, ctx, primitive); + } else { + kernel = new (std::nothrow) kernel::DeConvolutionCPUKernel(opParameter, inputs, outputs, ctx, primitive); + } + if (kernel == nullptr) { MS_LOG(ERROR) << "kernel is nullptr."; if (weight_tensor->data_type() == kNumberTypeInt8) { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_winograd.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_winograd.cc new file mode 100644 index 0000000000..da47475d34 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_winograd.cc @@ -0,0 +1,368 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/kernel/arm/fp32/deconvolution_winograd.h" +#include "src/runtime/runtime_api.h" + +using mindspore::kernel::KERNEL_ARCH::kCPU; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_NULL_PTR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_DeConv2D; +using mindspore::schema::Format::Format_NHWC; + +namespace mindspore::kernel { + +DeConvolutionWinogradCPUKernel::~DeConvolutionWinogradCPUKernel() { + FreeResizeBuf(); + FreeDeconvParam(); + return; +} + +void DeConvolutionWinogradCPUKernel::FreeResizeBuf() { + for (int i = 0; i < deconv_param_->compute_size_; i++) { + DeConvComputeUnit &unit = deconv_param_->compute_units_[i]; + if (unit.tmp_buffer_ != nullptr) { + free(unit.tmp_buffer_); + unit.tmp_buffer_ = nullptr; + } + + if (unit.use_winograd_) { + if (unit.winograd_.b_buffer_ != nullptr) { + free(unit.winograd_.b_buffer_); + unit.winograd_.b_buffer_ = nullptr; + } + } + } + + for (int i = 0; i < DECONV_WINOGRAD_BUFFER_COUNT; i++) { + DeConvWgABuffer &wg = deconv_param_->a_buffer_[i]; + if (wg.buf_init_) { + if (wg.dest_buffer_ != nullptr) { + free(wg.dest_buffer_); + wg.dest_buffer_ = nullptr; + } + if (wg.middle_buffer_ != nullptr) { + free(wg.middle_buffer_); + wg.middle_buffer_ = nullptr; + } + } + wg.buf_init_ = false; + } + + if (nc4hw4_output_ != nullptr) { + free(nc4hw4_output_); + nc4hw4_output_ = nullptr; + } + + if (tile_input_ != nullptr) { + free(tile_input_); + tile_input_ = nullptr; + } + + if (tile_output_ != nullptr) { + free(tile_output_); + tile_output_ = nullptr; + } + return; +} + +void DeConvolutionWinogradCPUKernel::FreeDeconvParam() { + for (int i = 0; i < deconv_param_->compute_size_; i++) { + DeConvComputeUnit &unit = deconv_param_->compute_units_[i]; + + if (unit.weight_ != nullptr) { + free(unit.weight_); + unit.weight_ = nullptr; + } + + if (unit.use_winograd_) { + if (unit.winograd_.AT_ != nullptr) { + free(unit.winograd_.AT_); + unit.winograd_.AT_ = nullptr; + } + if (unit.winograd_.BT_ != nullptr) { + free(unit.winograd_.BT_); + unit.winograd_.BT_ = nullptr; + } + } + } + + if (deconv_param_ != nullptr) { + delete (deconv_param_); + deconv_param_ = nullptr; + } + return; +} + +int DeConvolutionWinogradCPUKernel::InitParameter() { + deconv_param_->input_plane_ = conv_param_->input_h_ * conv_param_->input_w_; + deconv_param_->output_plane_ = conv_param_->output_h_ * conv_param_->output_w_; + + nc4hw4_output_ = + reinterpret_cast(malloc(deconv_param_->oc_up4_ * deconv_param_->output_plane_ * sizeof(float))); + + deconv_param_->in_tile_w_count_ = UP_DIV(conv_param_->input_w_, DECONV_WINOGRAD_DEFAULT_UNIT); + deconv_param_->in_tile_h_count_ = UP_DIV(conv_param_->input_h_, DECONV_WINOGRAD_DEFAULT_UNIT); + + deconv_param_->in_tile_count_ = + UP_DIV(deconv_param_->in_tile_w_count_ * deconv_param_->in_tile_h_count_, DECONV_WINOGRAD_DEFAULT_TILE); + deconv_param_->thread_num_ = MSMAX(1, op_parameter_->thread_num_); + deconv_param_->thread_num_ = MSMIN(deconv_param_->thread_num_, deconv_param_->in_tile_count_); + + thread_num_hw_ = MSMIN(op_parameter_->thread_num_, deconv_param_->output_plane_); + thread_stride_hw_ = UP_DIV(deconv_param_->output_plane_, thread_num_hw_); + + int size = deconv_param_->thread_num_ * DECONV_WINOGRAD_DEFAULT_UNIT * DECONV_WINOGRAD_DEFAULT_UNIT * + DECONV_WINOGRAD_DEFAULT_TILE * deconv_param_->ic_up4_; + tile_input_ = reinterpret_cast(malloc(size * sizeof(float))); + memset(tile_input_, 0, size * sizeof(float)); + + deconv_param_->out_tile_w_ = (DECONV_WINOGRAD_DEFAULT_UNIT - 1) * conv_param_->stride_w_ + conv_param_->kernel_w_; + deconv_param_->out_tile_h_ = (DECONV_WINOGRAD_DEFAULT_UNIT - 1) * conv_param_->stride_h_ + conv_param_->kernel_h_; + size = deconv_param_->thread_num_ * deconv_param_->out_tile_w_ * deconv_param_->out_tile_h_ * + DECONV_WINOGRAD_DEFAULT_TILE * deconv_param_->oc_up4_; + tile_output_ = reinterpret_cast(malloc(size * sizeof(float))); + + for (int i = 0; i < deconv_param_->compute_size_; i++) { + DeConvComputeUnit &unit = deconv_param_->compute_units_[i]; + if (unit.use_winograd_) { + if (deconv_param_->a_buffer_[unit.winograd_.kh_].buf_init_ == false) { + deconv_param_->a_buffer_[unit.winograd_.kh_].buf_init_ = true; + deconv_param_->a_buffer_[unit.winograd_.kh_].trans_formed_ = false; + + size = unit.winograd_.kh_ * unit.winograd_.kw_ * DECONV_WINOGRAD_DEFAULT_TILE * deconv_param_->ic_up4_; + deconv_param_->a_buffer_[unit.winograd_.kh_].middle_buffer_ = + malloc(deconv_param_->thread_num_ * size * sizeof(float)); + deconv_param_->a_buffer_[unit.winograd_.kh_].dest_buffer_ = + malloc(deconv_param_->thread_num_ * size * sizeof(float)); + } + + unit.winograd_.b_buffer_ = malloc(deconv_param_->thread_num_ * unit.winograd_.kh_ * unit.winograd_.kw_ * + deconv_param_->oc_up4_ * DECONV_WINOGRAD_DEFAULT_TILE * sizeof(float)); + unit.tmp_buffer_ = malloc(deconv_param_->thread_num_ * unit.winograd_.kh_ * unit.winograd_.kw_ * + deconv_param_->oc_div4_ * DECONV_WINOGRAD_DEFAULT_TILE * C4NUM * sizeof(float)); + + } else { + unit.tmp_buffer_ = malloc(deconv_param_->thread_num_ * deconv_param_->oc_div4_ * unit.w_size_ * unit.h_size_ * + DECONV_WINOGRAD_DEFAULT_TILE * C4NUM * sizeof(float)); + } + } + + return RET_OK; +} + +int DeConvWgFp32Run(void *cdata, int task_id) { + auto deconvWg = reinterpret_cast(cdata); + deconvWg->DoDeconv(task_id); + return RET_OK; +} + +int DeConvWgPostFp32Run(void *cdata, int task_id) { + auto deconvWg = reinterpret_cast(cdata); + deconvWg->DeDeconvPost(task_id); + return RET_OK; +} + +int DeConvolutionWinogradCPUKernel::InitComputeParam() { + auto weight_tensor = in_tensors_[1]; + + conv_param_->input_channel_ = weight_tensor->Batch(); + conv_param_->output_channel_ = weight_tensor->Channel(); + conv_param_->kernel_w_ = weight_tensor->Width(); + conv_param_->kernel_h_ = weight_tensor->Height(); + + deconv_param_->kernel_plane_ = conv_param_->kernel_w_ * conv_param_->kernel_h_; + deconv_param_->ic_div4_ = UP_DIV(conv_param_->input_channel_, C4NUM); + deconv_param_->oc_div4_ = UP_DIV(conv_param_->output_channel_, C4NUM); + deconv_param_->ic_up4_ = deconv_param_->ic_div4_ * C4NUM; + deconv_param_->oc_up4_ = deconv_param_->oc_div4_ * C4NUM; + + deconv_param_->compute_size_ = 0; + for (int si_h = 0; si_h < conv_param_->stride_h_; si_h++) { + for (int si_w = 0; si_w < conv_param_->stride_w_; si_w++) { + if (si_h < conv_param_->kernel_h_ && si_w < conv_param_->kernel_w_) { + deconv_param_->compute_size_++; + } + } + } + + int size = deconv_param_->compute_size_ * sizeof(DeConvComputeUnit); + deconv_param_->compute_units_ = reinterpret_cast(malloc(size)); + if (deconv_param_->compute_units_ == nullptr) { + return RET_NULL_PTR; + } + int cur_count = 0; + for (int si_h = 0; si_h < conv_param_->stride_h_; si_h++) { + if (si_h >= conv_param_->kernel_h_) { + continue; + } + for (int si_w = 0; si_w < conv_param_->stride_w_; si_w++) { + if (si_w >= conv_param_->kernel_w_) { + continue; + } + + int h_size = 1 + (conv_param_->kernel_h_ - si_h - 1) / conv_param_->stride_h_; + int w_size = 1 + (conv_param_->kernel_w_ - si_w - 1) / conv_param_->stride_w_; + + DeConvComputeUnit unit; + + unit.h_start_ = si_h; + unit.w_start_ = si_w; + unit.h_size_ = h_size; + unit.w_size_ = w_size; + + if (h_size == w_size) { + unit.use_winograd_ = true; + + unit.winograd_.k_ = unit.h_size_; + unit.winograd_.i_ = DECONV_WINOGRAD_DEFAULT_UNIT; + unit.winograd_.o_ = DECONV_WINOGRAD_DEFAULT_UNIT + unit.h_size_ - 1; + unit.winograd_.kh_ = unit.h_size_ + DECONV_WINOGRAD_DEFAULT_UNIT - 1; + unit.winograd_.kw_ = unit.w_size_ + DECONV_WINOGRAD_DEFAULT_UNIT - 1; + + unit.winograd_.b_buffer_ = nullptr; + unit.weight_ = malloc(unit.winograd_.kh_ * unit.winograd_.kw_ * deconv_param_->oc_up4_ * + deconv_param_->ic_up4_ * sizeof(float)); + } else { + unit.use_winograd_ = false; + unit.weight_ = malloc(h_size * w_size * deconv_param_->ic_up4_ * deconv_param_->oc_up4_ * sizeof(float)); + } + unit.tmp_buffer_ = nullptr; + deconv_param_->compute_units_[cur_count] = unit; + cur_count++; + } + } + return RET_OK; +} + +int DeConvolutionWinogradCPUKernel::InitDataParam() { + auto weight_tensor = in_tensors_.at(kWeightIndex); + float *nhwc_weight = reinterpret_cast(weight_tensor->data_c()); + + /* unit data : weight & winograd data*/ + for (int i = 0; i < deconv_param_->compute_size_; i++) { + DeConvComputeUnit *unit = &deconv_param_->compute_units_[i]; + int ret = PackDeConvWgDataFp32(nhwc_weight, unit, conv_param_, deconv_param_); + if (ret != RET_OK) { + return ret; + } + } + + /* bias */ + auto bias_tensor = in_tensors_.at(kBiasIndex); + bias_data_ = malloc(deconv_param_->oc_up4_ * sizeof(float)); + memset(bias_data_, 0, deconv_param_->oc_up4_ * sizeof(float)); + if (in_tensors_.size() == 3) { + memcpy(bias_data_, bias_tensor->data_c(), conv_param_->output_channel_ * sizeof(float)); + } + return RET_OK; +} + +int DeConvolutionWinogradCPUKernel::ReSize() { + FreeResizeBuf(); + ConvolutionBaseCPUKernel::Init(); + InitParameter(); + return RET_OK; +} + +int DeConvolutionWinogradCPUKernel::Init() { + int error_code = InitComputeParam(); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "InitComputeParam error! ret: " << error_code; + return error_code; + } + + error_code = InitDataParam(); + if (error_code != RET_OK) { + MS_LOG(ERROR) << "InitWeightBias error! ret: " << error_code; + return error_code; + } + + if (!InferShapeDone()) { + return RET_OK; + } + + return ReSize(); +} + +int DeConvolutionWinogradCPUKernel::DoDeconv(int task_id) { + for (int tile_index = task_id; tile_index < deconv_param_->in_tile_count_; tile_index += deconv_param_->thread_num_) { + float *tile_in = tile_input_ + task_id * DECONV_WINOGRAD_DEFAULT_UNIT * DECONV_WINOGRAD_DEFAULT_UNIT * + DECONV_WINOGRAD_DEFAULT_TILE * deconv_param_->ic_up4_; + int size = deconv_param_->out_tile_w_ * deconv_param_->out_tile_h_ * DECONV_WINOGRAD_DEFAULT_TILE * + deconv_param_->oc_div4_ * C4NUM; + float *tile_out = tile_output_ + task_id * size; + memset(tile_out, 0, size * sizeof(float)); + + int start_index = tile_index * DECONV_WINOGRAD_DEFAULT_TILE; + int calculate_count = MSMIN(DECONV_WINOGRAD_DEFAULT_TILE, + deconv_param_->in_tile_w_count_ * deconv_param_->in_tile_h_count_ - start_index); + + for (int i = 0; i < DECONV_WINOGRAD_BUFFER_COUNT; i++) { + deconv_param_->a_buffer_[i].trans_formed_ = false; + } + DeconvWg(nhwc_input_, tile_in, tile_out, start_index, calculate_count, conv_param_, deconv_param_, task_id); + + std::unique_lock merge_lock(lock_); + DeconvWgPost(tile_out, nc4hw4_output_, conv_param_, deconv_param_, calculate_count, tile_index); + } + return RET_OK; +} + +int DeConvolutionWinogradCPUKernel::DeDeconvPost(int task_id) { + int rest_plane = deconv_param_->output_plane_ - task_id * thread_stride_hw_; + int current_plane = MSMIN(rest_plane, thread_stride_hw_); + if (current_plane <= 0) { + return RET_OK; + } + + PostConvFuncFp32C4(nc4hw4_output_ + task_id * thread_stride_hw_ * C4NUM, + nhwc_output_ + task_id * thread_stride_hw_ * conv_param_->output_channel_, + reinterpret_cast(bias_data_), conv_param_->output_channel_, current_plane, + deconv_param_->output_plane_, conv_param_->act_type_ == ActType_Relu, + conv_param_->act_type_ == ActType_Relu6); + return RET_OK; +} + +int DeConvolutionWinogradCPUKernel::Run() { + auto prepare_ret = Prepare(); + if (prepare_ret != RET_OK) { + MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; + return prepare_ret; + } + float *src_in = reinterpret_cast(in_tensors_[0]->data_c()); + float *src_out = reinterpret_cast(out_tensors_[0]->data_c()); + + for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) { + nhwc_input_ = src_in + batch_index * deconv_param_->input_plane_ * conv_param_->input_channel_; + nhwc_output_ = src_out + batch_index * deconv_param_->output_plane_ * conv_param_->output_channel_; + + ::memset(nc4hw4_output_, 0, deconv_param_->output_plane_ * deconv_param_->oc_div4_ * C4NUM * sizeof(float)); + for (int i = 0; i < deconv_param_->thread_num_; i++) { + DoDeconv(i); + } + // ParallelLaunch(this->context_->thread_pool_, DeConvWgFp32Run, this, deconv_param_->thread_num_); + + /*post bias activate and nhwc */ + ParallelLaunch(this->context_->thread_pool_, DeConvWgPostFp32Run, this, thread_num_hw_); + } + + return RET_OK; +} + +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_winograd.h b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_winograd.h new file mode 100644 index 0000000000..17205a78f2 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_winograd.h @@ -0,0 +1,70 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_DECONVOLUTION_WINOGRAD_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_DECONVOLUTION_WINOGRAD_H_ + +#include +#include +#include "src/lite_kernel.h" +#include "src/kernel_registry.h" +#include "include/errorcode.h" +#include "schema/model_generated.h" +#include "nnacl/fp32/matmul.h" +#include "nnacl/fp32/deconv_winograd.h" +#include "src/runtime/kernel/arm/base/convolution_base.h" + +namespace mindspore::kernel { +class DeConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel { + public: + DeConvolutionWinogradCPUKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::InnerContext *ctx, + const mindspore::lite::PrimitiveC *primitive) + : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) { + deconv_param_ = new DeConvParam(); + for (auto &wg : deconv_param_->a_buffer_) { + wg.buf_init_ = false; + } + } + ~DeConvolutionWinogradCPUKernel() override; + int Init() override; + int Run() override; + int ReSize() override; + + public: + int DoDeconv(int task_id); + int DeDeconvPost(int task_id); + + private: + int InitComputeParam(); + int InitDataParam(); + int InitParameter(); + void FreeDeconvParam(); + void FreeResizeBuf(); + + private: + DeConvParam *deconv_param_; + float *nhwc_input_ = nullptr; + float *nhwc_output_ = nullptr; + float *nc4hw4_output_ = nullptr; + float *tile_input_ = nullptr; + float *tile_output_ = nullptr; + std::mutex lock_; + int thread_num_hw_; + int thread_stride_hw_; +}; +} // namespace mindspore::kernel +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_DECONVOLUTION_WINOGRAD_H_