From 8213de44347d29bcb8f38888de5def6e676bde93 Mon Sep 17 00:00:00 2001
From: liuzhongkai
Date: Tue, 13 Oct 2020 09:55:23 +0800
Subject: [PATCH] conv1x1 init time optimize

---
 mindspore/lite/nnacl/fp32/matmul.c          | 120 ++++++++++++------
 .../kernel/arm/fp16/convolution_1x1_fp16.cc |  20 +--
 .../kernel/arm/fp32/convolution_1x1.cc      |  29 +++--
 3 files changed, 104 insertions(+), 65 deletions(-)

diff --git a/mindspore/lite/nnacl/fp32/matmul.c b/mindspore/lite/nnacl/fp32/matmul.c
index ed0eb1c633..dd2a7a77ec 100644
--- a/mindspore/lite/nnacl/fp32/matmul.c
+++ b/mindspore/lite/nnacl/fp32/matmul.c
@@ -220,68 +220,104 @@ void RowMajor2Col12Major(float *src_ptr, float *dst_ptr, size_t row, size_t col)
 void RowMajor2Col8Major(float *src_ptr, float *dst_ptr, size_t row, size_t col) {
   size_t row8 = row / C8NUM * C8NUM;
-  size_t col4 = col / C4NUM * C4NUM;
+#ifdef ENABLE_ARM64
+  size_t col_skip = col / C8NUM * C8NUM;
+  int skip_size = C8NUM;
+#else
+  size_t col_skip = col / C4NUM * C4NUM;
+  int skip_size = C4NUM;
+#endif
   float *src_r = src_ptr;
   float *dst_r = dst_ptr;
 
   size_t ri = 0;
   for (; ri < row8; ri += C8NUM) {
     size_t ci = 0;
-    for (; ci < col4; ci += C4NUM) {
+    for (; ci < col_skip; ci += skip_size) {
       float *src_c = src_r + ci;
       float *dst_c = dst_r + ci * C8NUM;
-      /* 8x4 row-major to col-major */
 #ifdef ENABLE_ARM64
+      /* 8x8 row-major to col-major */
       size_t stride = col * sizeof(float);
       asm volatile(
         "mov x10, %[src_c]\n"
         "mov x11, %[dst_c]\n"
 
-        "ld1 {v0.4s}, [x10], %[stride]\n"
-        "ld1 {v1.4s}, [x10], %[stride]\n"
-        "ld1 {v2.4s}, [x10], %[stride]\n"
-        "ld1 {v3.4s}, [x10], %[stride]\n"
-
-        "zip1 v4.4s, v0.4s, v1.4s\n"
-        "zip2 v5.4s, v0.4s, v1.4s\n"
-        "zip1 v6.4s, v2.4s, v3.4s\n"
-        "zip2 v7.4s, v2.4s, v3.4s\n"
-
-        "ld1 {v8.4s}, [x10], %[stride]\n"
-        "ld1 {v9.4s}, [x10], %[stride]\n"
-        "ld1 {v10.4s}, [x10], %[stride]\n"
-        "ld1 {v11.4s}, [x10], %[stride]\n"
-
-        "trn1 v0.2d, v4.2d, v6.2d\n"
-        "trn2 v1.2d, v4.2d, v6.2d\n"
-        "trn1 v2.2d, v5.2d, v7.2d\n"
-        "trn2 v3.2d, v5.2d, v7.2d\n"
-
-        "zip1 v12.4s, v8.4s, v9.4s\n"
-        "zip2 v13.4s, v8.4s, v9.4s\n"
-        "zip1 v14.4s, v10.4s, v11.4s\n"
-        "zip2 v15.4s, v10.4s, v11.4s\n"
-
-        "trn1 v8.2d, v12.2d, v14.2d\n"
-        "trn2 v9.2d, v12.2d, v14.2d\n"
-        "trn1 v10.2d, v13.2d, v15.2d\n"
-        "trn2 v11.2d, v13.2d, v15.2d\n"
-
-        "st1 {v0.4s}, [x11], #16\n"
-        "st1 {v8.4s}, [x11], #16\n"
-        "st1 {v1.4s}, [x11], #16\n"
-        "st1 {v9.4s}, [x11], #16\n"
-        "st1 {v2.4s}, [x11],#16\n"
-        "st1 {v10.4s}, [x11], #16\n"
-        "st1 {v3.4s}, [x11],#16\n"
-        "st1 {v11.4s}, [x11], #16\n"
+        "ld1 {v0.4s, v1.4s}, [x10], %[stride]\n"
+        "ld1 {v2.4s, v3.4s}, [x10], %[stride]\n"
+        "ld1 {v4.4s, v5.4s}, [x10], %[stride]\n"
+        "ld1 {v6.4s, v7.4s}, [x10], %[stride]\n"
+
+        "zip1 v8.4s, v0.4s, v2.4s\n"
+        "zip2 v9.4s, v0.4s, v2.4s\n"
+        "zip1 v10.4s, v4.4s, v6.4s\n"
+        "zip2 v11.4s, v4.4s, v6.4s\n"
+
+        "ld1 {v16.4s, v17.4s}, [x10], %[stride]\n"
+        "ld1 {v18.4s, v19.4s}, [x10], %[stride]\n"
+        "ld1 {v20.4s, v21.4s}, [x10], %[stride]\n"
+        "ld1 {v22.4s, v23.4s}, [x10], %[stride]\n"
+
+        "zip1 v12.4s, v1.4s, v3.4s\n"
+        "zip2 v13.4s, v1.4s, v3.4s\n"
+        "zip1 v14.4s, v5.4s, v7.4s\n"
+        "zip2 v15.4s, v5.4s, v7.4s\n"
+
+        "trn1 v0.2d, v8.2d, v10.2d\n"
+        "trn2 v1.2d, v8.2d, v10.2d\n"
+        "trn1 v2.2d, v9.2d, v11.2d\n"
+        "trn2 v3.2d, v9.2d, v11.2d\n"
+
+        "zip1 v24.4s, v16.4s, v18.4s\n"
+        "zip2 v25.4s, v16.4s, v18.4s\n"
+        "zip1 v26.4s, v20.4s, v22.4s\n"
+        "zip2 v27.4s, v20.4s, v22.4s\n"
+
+        "trn1 v4.2d, v12.2d, v14.2d\n"
+        "trn2 v5.2d, v12.2d, v14.2d\n"
+        "trn1 v6.2d, v13.2d, v15.2d\n"
+        "trn2 v7.2d, v13.2d, v15.2d\n"
+
+        "zip1 v28.4s, v17.4s, v19.4s\n"
+        "zip2 v29.4s, v17.4s, v19.4s\n"
+        "zip1 v30.4s, v21.4s, v23.4s\n"
+        "zip2 v31.4s, v21.4s, v23.4s\n"
+
+        "trn1 v16.2d, v24.2d, v26.2d\n"
+        "trn2 v17.2d, v24.2d, v26.2d\n"
+        "trn1 v18.2d, v25.2d, v27.2d\n"
+        "trn2 v19.2d, v25.2d, v27.2d\n"
+
+        "trn1 v20.2d, v28.2d, v30.2d\n"
+        "trn2 v21.2d, v28.2d, v30.2d\n"
+        "trn1 v22.2d, v29.2d, v31.2d\n"
+        "trn2 v23.2d, v29.2d, v31.2d\n"
+
+        "st1 {v0.4s}, [x11], #16\n"
+        "st1 {v16.4s}, [x11], #16\n"
+        "st1 {v1.4s}, [x11], #16\n"
+        "st1 {v17.4s}, [x11], #16\n"
+        "st1 {v2.4s}, [x11], #16\n"
+        "st1 {v18.4s}, [x11], #16\n"
+        "st1 {v3.4s}, [x11], #16\n"
+        "st1 {v19.4s}, [x11], #16\n"
+        "st1 {v4.4s}, [x11], #16\n"
+        "st1 {v20.4s}, [x11], #16\n"
+        "st1 {v5.4s}, [x11], #16\n"
+        "st1 {v21.4s}, [x11], #16\n"
+        "st1 {v6.4s}, [x11], #16\n"
+        "st1 {v22.4s}, [x11], #16\n"
+        "st1 {v7.4s}, [x11], #16\n"
+        "st1 {v23.4s}, [x11], #16\n"
 
         :
         : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ stride ] "r"(stride)
         : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
-          "v15");
+          "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+          "v30", "v31");
 #elif ENABLE_ARM32
+      /* 8x4 row-major to col-major */
       size_t stride = col * sizeof(float);
       asm volatile(
         "mov r10, %[src_c]\n"
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
index 0948bc4e63..e5f62f8a46 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
@@ -85,14 +85,14 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() {
   auto input_channel = weight_tensor->Channel();
   auto output_channel = weight_tensor->Batch();
 
-  size_t size = UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);
-  bias_data_ = malloc(size);
-  if (bias_data_ == nullptr) {
-    MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!";
-    return RET_ERROR;
-  }
-  memset(bias_data_, 0, size);
   if (in_tensors_.size() == 3) {
+    size_t size = UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);
+    size_t weight_size = output_channel * sizeof(float16_t);
+    bias_data_ = malloc(size);
+    if (bias_data_ == nullptr) {
+      MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!";
+      return RET_ERROR;
+    }
     auto bias_tensor = in_tensors_.at(kBiasIndex);
     if (bias_tensor->data_type() == kNumberTypeFloat16) {
       memcpy(bias_data_, bias_tensor->MutableData(), output_channel * sizeof(float16_t));
@@ -100,15 +100,17 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() {
       Float32ToFloat16(reinterpret_cast<float *>(bias_tensor->MutableData()),
                        reinterpret_cast<float16_t *>(bias_data_), output_channel);
     }
+    memset(reinterpret_cast<char *>(bias_data_) + weight_size, 0, size - weight_size);
   }
 
-  size = input_channel * UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);
+  size_t size = input_channel * UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);
+  size_t down_size = input_channel * DOWN_DIV(output_channel, C8NUM) * C8NUM * sizeof(float16_t);
   weight_ptr_ = reinterpret_cast<float16_t *>(malloc(size));
   if (weight_ptr_ == nullptr) {
     MS_LOG(ERROR) << "Conv1x1 Malloc weight_ptr_ error!";
     return RET_ERROR;
   }
-  memset(weight_ptr_, 0, size);
+  memset(reinterpret_cast<char *>(weight_ptr_) + down_size, 0, size - down_size);
   ColMajor2Row8MajorFp16(weight_tensor->MutableData(), weight_ptr_, input_channel, output_channel,
                          weight_tensor->data_type() == kNumberTypeFloat16);
   return RET_OK;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc
index 4da5295f5a..d4478d4702 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc
@@ -71,24 +71,26 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
   auto input_channel = filter_tensor->Channel();
   auto output_channel = filter_tensor->Batch();
 
-  int size = UP_ROUND(output_channel, C8NUM) * sizeof(float);
-  bias_data_ = malloc(size);
-  if (bias_data_ == nullptr) {
-    MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!";
-    return RET_ERROR;
-  }
-  memset(bias_data_, 0, size);
   if (in_tensors_.size() == 3) {
-    memcpy(bias_data_, in_tensors_[kBiasIndex]->MutableData(), output_channel * sizeof(float));
+    int size = UP_ROUND(output_channel, C8NUM) * sizeof(float);
+    int weight_size = output_channel * sizeof(float);
+    bias_data_ = malloc(size);
+    if (bias_data_ == nullptr) {
+      MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!";
+      return RET_ERROR;
+    }
+    memcpy(bias_data_, in_tensors_[kBiasIndex]->MutableData(), weight_size);
+    memset(reinterpret_cast<char *>(bias_data_) + weight_size, 0, size - weight_size);
   }
 
-  size = input_channel * UP_ROUND(output_channel, C8NUM) * sizeof(float);
+  int size = input_channel * UP_ROUND(output_channel, C8NUM) * sizeof(float);
+  int down_size = input_channel * DOWN_DIV(output_channel, C8NUM) * C8NUM * sizeof(float);
   weight_ptr_ = reinterpret_cast<float *>(malloc(size));
   if (weight_ptr_ == nullptr) {
    MS_LOG(ERROR) << "Conv1x1 Malloc weight_ptr_ error!";
     return RET_ERROR;
   }
-  memset(weight_ptr_, 0, size);
+  memset(reinterpret_cast<char *>(weight_ptr_) + down_size, 0, size - down_size);
   RowMajor2Col8Major(reinterpret_cast<float *>(filter_tensor->MutableData()), weight_ptr_, output_channel,
                      input_channel);
   return RET_OK;
@@ -141,10 +143,10 @@ int Convolution1x1CPUKernel::DoConv1x1(int task_id) {
   if (cur_oc <= 0) {
     return RET_OK;
   }
+  auto bias = (bias_data_ == nullptr) ? nullptr : reinterpret_cast<float *>(bias_data_) + thread_stride_ * task_id;
   MatMulOpt(pack_input_, weight_ptr_ + task_id * thread_stride_ * matmul_param_->deep_,
-            output_ptr_ + task_id * thread_stride_, reinterpret_cast<float *>(bias_data_) + thread_stride_ * task_id,
-            matmul_param_->act_type_, matmul_param_->deep_, matmul_param_->row_, cur_oc, matmul_param_->col_,
-            OutType_Nhwc);
+            output_ptr_ + task_id * thread_stride_, bias, matmul_param_->act_type_, matmul_param_->deep_,
+            matmul_param_->row_, cur_oc, matmul_param_->col_, OutType_Nhwc);
   return RET_OK;
 }
 
@@ -178,7 +180,6 @@ int Convolution1x1CPUKernel::DoConv1x1Hw(int task_id) {
 
   MatMulOpt(thread_pack_input, weight_ptr_, thread_output_ptr, reinterpret_cast<float *>(bias_data_),
             matmul_param_->act_type_, matmul_param_->deep_, cur_hw_, matmul_param_->col_, matmul_param_->col_,
             OutType_Nhwc);
-
   return RET_OK;
 }
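
Both kernels get their init-time win from the same pattern: the packed weight buffer is padded up to a multiple of C8NUM output channels, and only the padding tail past down_size needs pre-zeroing, because the packing routine (RowMajor2Col8Major / ColMajor2Row8MajorFp16) overwrites the leading down_size bytes anyway. Below is a minimal standalone C sketch of that pattern, assuming UP_ROUND and DOWN_DIV carry their usual nnacl round-up/round-down definitions; AllocPackedWeight is a hypothetical helper for illustration, not part of the patch:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Assumed to mirror the nnacl macros: round up / truncate to a multiple. */
#define UP_ROUND(x, y) (((x) + (y) - 1) / (y) * (y))
#define DOWN_DIV(x, y) ((x) / (y))
#define C8NUM 8

/* Allocate a packed conv1x1 weight buffer, zeroing only the padded tail.
 * The first down_size bytes hold complete 8-channel blocks that the packing
 * step fully overwrites, so memset'ing them up front (as the old code did)
 * is wasted work; only the final partial block needs pre-zeroing. */
static float *AllocPackedWeight(size_t input_channel, size_t output_channel) {
  size_t size = input_channel * UP_ROUND(output_channel, C8NUM) * sizeof(float);
  size_t down_size = input_channel * DOWN_DIV(output_channel, C8NUM) * C8NUM * sizeof(float);
  float *weight = (float *)malloc(size);
  if (weight == NULL) {
    return NULL;
  }
  memset((char *)weight + down_size, 0, size - down_size);
  return weight;
}

int main(void) {
  size_t ic = 16, oc = 21; /* 21 output channels pad up to 24 */
  float *w = AllocPackedWeight(ic, oc);
  if (w == NULL) {
    return 1;
  }
  /* The old code zeroed ic * 24 floats; the patch zeroes only ic * (24 - 16). */
  printf("zeroed %zu of %zu floats\n", ic * (UP_ROUND(oc, C8NUM) - DOWN_DIV(oc, C8NUM) * C8NUM),
         ic * UP_ROUND(oc, C8NUM));
  free(w);
  return 0;
}

The bias change follows the same reasoning: the buffer is allocated only when a bias tensor actually exists, the live output_channel entries are copied in, and only the padding between weight_size and the rounded-up size is memset.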