Merge pull request !7249 from liuzhongkai/conv1x1_asmop tags/v1.1.0
| @@ -220,68 +220,104 @@ void RowMajor2Col12Major(float *src_ptr, float *dst_ptr, size_t row, size_t col) | |||||
| void RowMajor2Col8Major(float *src_ptr, float *dst_ptr, size_t row, size_t col) { | void RowMajor2Col8Major(float *src_ptr, float *dst_ptr, size_t row, size_t col) { | ||||
| size_t row8 = row / C8NUM * C8NUM; | size_t row8 = row / C8NUM * C8NUM; | ||||
| size_t col4 = col / C4NUM * C4NUM; | |||||
| #ifdef ENABLE_ARM64 | |||||
| size_t col_skip = col / C8NUM * C8NUM; | |||||
| int skip_size = C8NUM; | |||||
| #else | |||||
| size_t col_skip = col / C4NUM * C4NUM; | |||||
| int skip_size = C4NUM; | |||||
| #endif | |||||
| float *src_r = src_ptr; | float *src_r = src_ptr; | ||||
| float *dst_r = dst_ptr; | float *dst_r = dst_ptr; | ||||
| size_t ri = 0; | size_t ri = 0; | ||||
| for (; ri < row8; ri += C8NUM) { | for (; ri < row8; ri += C8NUM) { | ||||
| size_t ci = 0; | size_t ci = 0; | ||||
| for (; ci < col4; ci += C4NUM) { | |||||
| for (; ci < col_skip; ci += skip_size) { | |||||
| float *src_c = src_r + ci; | float *src_c = src_r + ci; | ||||
| float *dst_c = dst_r + ci * C8NUM; | float *dst_c = dst_r + ci * C8NUM; | ||||
| /* 8x4 row-major to col-major */ | |||||
| #ifdef ENABLE_ARM64 | #ifdef ENABLE_ARM64 | ||||
| /* 8x8 row-major to col-major */ | |||||
| size_t stride = col * sizeof(float); | size_t stride = col * sizeof(float); | ||||
| asm volatile( | asm volatile( | ||||
| "mov x10, %[src_c]\n" | "mov x10, %[src_c]\n" | ||||
| "mov x11, %[dst_c]\n" | "mov x11, %[dst_c]\n" | ||||
| "ld1 {v0.4s}, [x10], %[stride]\n" | |||||
| "ld1 {v1.4s}, [x10], %[stride]\n" | |||||
| "ld1 {v2.4s}, [x10], %[stride]\n" | |||||
| "ld1 {v3.4s}, [x10], %[stride]\n" | |||||
| "zip1 v4.4s, v0.4s, v1.4s\n" | |||||
| "zip2 v5.4s, v0.4s, v1.4s\n" | |||||
| "zip1 v6.4s, v2.4s, v3.4s\n" | |||||
| "zip2 v7.4s, v2.4s, v3.4s\n" | |||||
| "ld1 {v8.4s}, [x10], %[stride]\n" | |||||
| "ld1 {v9.4s}, [x10], %[stride]\n" | |||||
| "ld1 {v10.4s}, [x10], %[stride]\n" | |||||
| "ld1 {v11.4s}, [x10], %[stride]\n" | |||||
| "trn1 v0.2d, v4.2d, v6.2d\n" | |||||
| "trn2 v1.2d, v4.2d, v6.2d\n" | |||||
| "trn1 v2.2d, v5.2d, v7.2d\n" | |||||
| "trn2 v3.2d, v5.2d, v7.2d\n" | |||||
| "zip1 v12.4s, v8.4s, v9.4s\n" | |||||
| "zip2 v13.4s, v8.4s, v9.4s\n" | |||||
| "zip1 v14.4s, v10.4s, v11.4s\n" | |||||
| "zip2 v15.4s, v10.4s, v11.4s\n" | |||||
| "trn1 v8.2d, v12.2d, v14.2d\n" | |||||
| "trn2 v9.2d, v12.2d, v14.2d\n" | |||||
| "trn1 v10.2d, v13.2d, v15.2d\n" | |||||
| "trn2 v11.2d, v13.2d, v15.2d\n" | |||||
| "st1 {v0.4s}, [x11], #16\n" | |||||
| "st1 {v8.4s}, [x11], #16\n" | |||||
| "st1 {v1.4s}, [x11], #16\n" | |||||
| "st1 {v9.4s}, [x11], #16\n" | |||||
| "st1 {v2.4s}, [x11],#16\n" | |||||
| "st1 {v10.4s}, [x11], #16\n" | |||||
| "st1 {v3.4s}, [x11],#16\n" | |||||
| "st1 {v11.4s}, [x11], #16\n" | |||||
| "ld1 {v0.4s, v1.4s}, [x10], %[stride]\n" | |||||
| "ld1 {v2.4s, v3.4s}, [x10], %[stride]\n" | |||||
| "ld1 {v4.4s, v5.4s}, [x10], %[stride]\n" | |||||
| "ld1 {v6.4s, v7.4s}, [x10], %[stride]\n" | |||||
| "zip1 v8.4s, v0.4s, v2.4s\n" | |||||
| "zip2 v9.4s, v0.4s, v2.4s\n" | |||||
| "zip1 v10.4s, v4.4s, v6.4s\n" | |||||
| "zip2 v11.4s, v4.4s, v6.4s\n" | |||||
| "ld1 {v16.4s, v17.4s}, [x10], %[stride]\n" | |||||
| "ld1 {v18.4s, v19.4s}, [x10], %[stride]\n" | |||||
| "ld1 {v20.4s, v21.4s}, [x10], %[stride]\n" | |||||
| "ld1 {v22.4s, v23.4s}, [x10], %[stride]\n" | |||||
| "zip1 v12.4s, v1.4s, v3.4s\n" | |||||
| "zip2 v13.4s, v1.4s, v3.4s\n" | |||||
| "zip1 v14.4s, v5.4s, v7.4s\n" | |||||
| "zip2 v15.4s, v5.4s, v7.4s\n" | |||||
| "trn1 v0.2d, v8.2d, v10.2d\n" | |||||
| "trn2 v1.2d, v8.2d, v10.2d\n" | |||||
| "trn1 v2.2d, v9.2d, v11.2d\n" | |||||
| "trn2 v3.2d, v9.2d, v11.2d\n" | |||||
| "zip1 v24.4s, v16.4s, v18.4s\n" | |||||
| "zip2 v25.4s, v16.4s, v18.4s\n" | |||||
| "zip1 v26.4s, v20.4s, v22.4s\n" | |||||
| "zip2 v27.4s, v20.4s, v22.4s\n" | |||||
| "trn1 v4.2d, v12.2d, v14.2d\n" | |||||
| "trn2 v5.2d, v12.2d, v14.2d\n" | |||||
| "trn1 v6.2d, v13.2d, v15.2d\n" | |||||
| "trn2 v7.2d, v13.2d, v15.2d\n" | |||||
| "zip1 v28.4s, v17.4s, v19.4s\n" | |||||
| "zip2 v29.4s, v17.4s, v19.4s\n" | |||||
| "zip1 v30.4s, v21.4s, v23.4s\n" | |||||
| "zip2 v31.4s, v21.4s, v23.4s\n" | |||||
| "trn1 v16.2d, v24.2d, v26.2d\n" | |||||
| "trn2 v17.2d, v24.2d, v26.2d\n" | |||||
| "trn1 v18.2d, v25.2d, v27.2d\n" | |||||
| "trn2 v19.2d, v25.2d, v27.2d\n" | |||||
| "trn1 v20.2d, v28.2d, v30.2d\n" | |||||
| "trn2 v21.2d, v28.2d, v30.2d\n" | |||||
| "trn1 v22.2d, v29.2d, v31.2d\n" | |||||
| "trn2 v23.2d, v29.2d, v31.2d\n" | |||||
| "st1 {v0.4s}, [x11], #16\n" | |||||
| "st1 {v16.4s}, [x11], #16\n" | |||||
| "st1 {v1.4s}, [x11], #16\n" | |||||
| "st1 {v17.4s}, [x11], #16\n" | |||||
| "st1 {v2.4s}, [x11], #16\n" | |||||
| "st1 {v18.4s}, [x11], #16\n" | |||||
| "st1 {v3.4s}, [x11], #16\n" | |||||
| "st1 {v19.4s}, [x11], #16\n" | |||||
| "st1 {v4.4s}, [x11], #16\n" | |||||
| "st1 {v20.4s}, [x11], #16\n" | |||||
| "st1 {v5.4s}, [x11], #16\n" | |||||
| "st1 {v21.4s}, [x11], #16\n" | |||||
| "st1 {v6.4s}, [x11], #16\n" | |||||
| "st1 {v22.4s}, [x11], #16\n" | |||||
| "st1 {v7.4s}, [x11], #16\n" | |||||
| "st1 {v23.4s}, [x11], #16\n" | |||||
| : | : | ||||
| : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ stride ] "r"(stride) | : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ stride ] "r"(stride) | ||||
| : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", | : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", | ||||
| "v15"); | |||||
| "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", | |||||
| "v30", "v31"); | |||||
| #elif ENABLE_ARM32 | #elif ENABLE_ARM32 | ||||
| /* 8x4 row-major to col-major */ | |||||
| size_t stride = col * sizeof(float); | size_t stride = col * sizeof(float); | ||||
| asm volatile( | asm volatile( | ||||
| "mov r10, %[src_c]\n" | "mov r10, %[src_c]\n" | ||||
| @@ -85,14 +85,14 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() { | |||||
| auto input_channel = weight_tensor->Channel(); | auto input_channel = weight_tensor->Channel(); | ||||
| auto output_channel = weight_tensor->Batch(); | auto output_channel = weight_tensor->Batch(); | ||||
| size_t size = UP_ROUND(output_channel, C8NUM) * sizeof(float16_t); | |||||
| bias_data_ = malloc(size); | |||||
| if (bias_data_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| memset(bias_data_, 0, size); | |||||
| if (in_tensors_.size() == 3) { | if (in_tensors_.size() == 3) { | ||||
| size_t size = UP_ROUND(output_channel, C8NUM) * sizeof(float16_t); | |||||
| size_t weight_size = output_channel * sizeof(float16_t); | |||||
| bias_data_ = malloc(size); | |||||
| if (bias_data_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| auto bias_tensor = in_tensors_.at(kBiasIndex); | auto bias_tensor = in_tensors_.at(kBiasIndex); | ||||
| if (bias_tensor->data_type() == kNumberTypeFloat16) { | if (bias_tensor->data_type() == kNumberTypeFloat16) { | ||||
| memcpy(bias_data_, bias_tensor->MutableData(), output_channel * sizeof(float16_t)); | memcpy(bias_data_, bias_tensor->MutableData(), output_channel * sizeof(float16_t)); | ||||
| @@ -100,15 +100,17 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() { | |||||
| Float32ToFloat16(reinterpret_cast<float *>(bias_tensor->MutableData()), reinterpret_cast<float16_t *>(bias_data_), | Float32ToFloat16(reinterpret_cast<float *>(bias_tensor->MutableData()), reinterpret_cast<float16_t *>(bias_data_), | ||||
| output_channel); | output_channel); | ||||
| } | } | ||||
| memset(reinterpret_cast<char *>(bias_data_) + weight_size, 0, size - weight_size); | |||||
| } | } | ||||
| size = input_channel * UP_ROUND(output_channel, C8NUM) * sizeof(float16_t); | |||||
| size_t size = input_channel * UP_ROUND(output_channel, C8NUM) * sizeof(float16_t); | |||||
| size_t down_size = input_channel * DOWN_DIV(output_channel, C8NUM) * C8NUM * sizeof(float16_t); | |||||
| weight_ptr_ = reinterpret_cast<float16_t *>(malloc(size)); | weight_ptr_ = reinterpret_cast<float16_t *>(malloc(size)); | ||||
| if (weight_ptr_ == nullptr) { | if (weight_ptr_ == nullptr) { | ||||
| MS_LOG(ERROR) << "Conv1x1 Malloc weight_ptr_ error!"; | MS_LOG(ERROR) << "Conv1x1 Malloc weight_ptr_ error!"; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| memset(weight_ptr_, 0, size); | |||||
| memset(reinterpret_cast<char *>(weight_ptr_) + down_size, 0, size - down_size); | |||||
| ColMajor2Row8MajorFp16(weight_tensor->MutableData(), weight_ptr_, input_channel, output_channel, | ColMajor2Row8MajorFp16(weight_tensor->MutableData(), weight_ptr_, input_channel, output_channel, | ||||
| weight_tensor->data_type() == kNumberTypeFloat16); | weight_tensor->data_type() == kNumberTypeFloat16); | ||||
| return RET_OK; | return RET_OK; | ||||
| @@ -71,24 +71,26 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() { | |||||
| auto input_channel = filter_tensor->Channel(); | auto input_channel = filter_tensor->Channel(); | ||||
| auto output_channel = filter_tensor->Batch(); | auto output_channel = filter_tensor->Batch(); | ||||
| int size = UP_ROUND(output_channel, C8NUM) * sizeof(float); | |||||
| bias_data_ = malloc(size); | |||||
| if (bias_data_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| memset(bias_data_, 0, size); | |||||
| if (in_tensors_.size() == 3) { | if (in_tensors_.size() == 3) { | ||||
| memcpy(bias_data_, in_tensors_[kBiasIndex]->MutableData(), output_channel * sizeof(float)); | |||||
| int size = UP_ROUND(output_channel, C8NUM) * sizeof(float); | |||||
| int weight_size = output_channel * sizeof(float); | |||||
| bias_data_ = malloc(size); | |||||
| if (bias_data_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| memcpy(bias_data_, in_tensors_[kBiasIndex]->MutableData(), weight_size); | |||||
| memset(reinterpret_cast<char *>(bias_data_) + weight_size, 0, size - weight_size); | |||||
| } | } | ||||
| size = input_channel * UP_ROUND(output_channel, C8NUM) * sizeof(float); | |||||
| int size = input_channel * UP_ROUND(output_channel, C8NUM) * sizeof(float); | |||||
| int down_size = input_channel * DOWN_DIV(output_channel, C8NUM) * C8NUM * sizeof(float); | |||||
| weight_ptr_ = reinterpret_cast<float *>(malloc(size)); | weight_ptr_ = reinterpret_cast<float *>(malloc(size)); | ||||
| if (weight_ptr_ == nullptr) { | if (weight_ptr_ == nullptr) { | ||||
| MS_LOG(ERROR) << "Conv1x1 Malloc weight_ptr_ error!"; | MS_LOG(ERROR) << "Conv1x1 Malloc weight_ptr_ error!"; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| memset(weight_ptr_, 0, size); | |||||
| memset(reinterpret_cast<char *>(weight_ptr_) + down_size, 0, size - down_size); | |||||
| RowMajor2Col8Major(reinterpret_cast<float *>(filter_tensor->MutableData()), weight_ptr_, output_channel, | RowMajor2Col8Major(reinterpret_cast<float *>(filter_tensor->MutableData()), weight_ptr_, output_channel, | ||||
| input_channel); | input_channel); | ||||
| return RET_OK; | return RET_OK; | ||||
| @@ -141,10 +143,10 @@ int Convolution1x1CPUKernel::DoConv1x1(int task_id) { | |||||
| if (cur_oc <= 0) { | if (cur_oc <= 0) { | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| auto bias = (bias_data_ == nullptr) ? nullptr : reinterpret_cast<float *>(bias_data_) + thread_stride_ * task_id; | |||||
| MatMulOpt(pack_input_, weight_ptr_ + task_id * thread_stride_ * matmul_param_->deep_, | MatMulOpt(pack_input_, weight_ptr_ + task_id * thread_stride_ * matmul_param_->deep_, | ||||
| output_ptr_ + task_id * thread_stride_, reinterpret_cast<float *>(bias_data_) + thread_stride_ * task_id, | |||||
| matmul_param_->act_type_, matmul_param_->deep_, matmul_param_->row_, cur_oc, matmul_param_->col_, | |||||
| OutType_Nhwc); | |||||
| output_ptr_ + task_id * thread_stride_, bias, matmul_param_->act_type_, matmul_param_->deep_, | |||||
| matmul_param_->row_, cur_oc, matmul_param_->col_, OutType_Nhwc); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -178,7 +180,6 @@ int Convolution1x1CPUKernel::DoConv1x1Hw(int task_id) { | |||||
| MatMulOpt(thread_pack_input, weight_ptr_, thread_output_ptr, reinterpret_cast<float *>(bias_data_), | MatMulOpt(thread_pack_input, weight_ptr_, thread_output_ptr, reinterpret_cast<float *>(bias_data_), | ||||
| matmul_param_->act_type_, matmul_param_->deep_, cur_hw_, matmul_param_->col_, matmul_param_->col_, | matmul_param_->act_type_, matmul_param_->deep_, cur_hw_, matmul_param_->col_, matmul_param_->col_, | ||||
| OutType_Nhwc); | OutType_Nhwc); | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||