| @@ -0,0 +1,279 @@ | |||
| .text | |||
| .align 5 | |||
| //.p2align 5,,15 | |||
| .global PostFuncBiasReluC4Fp16 | |||
| #ifndef __APPLE__ | |||
| .type PostFuncBiasReluC4Fp16, %function | |||
| #endif | |||
| //void PostFuncBiasReluC4Fp16(float16_t *dst, const float16_t *src, const float16_t *bias, size_t oc4div, size_t oc4mod, | |||
| // size_t plane_size, size_t plane_stride, size_t relu_type); | |||
| // x0 dst x1 srx x2 bias | |||
| // w3 oc4div w4 oc4mod w5 plane_size | |||
| // x6 plane_stride x7 relu_type | |||
| PostFuncBiasReluC4Fp16: | |||
| // Adds per-channel bias and applies the requested activation while copying a | |||
| // C4-tiled fp16 buffer (src, x1) into an NHWC fp16 buffer (dst, x0). | |||
| // relu_type (x7): 1 -> ReLU (clamp at 0), 3 -> ReLU6 (clamp to [0,6]), | |||
| // anything else -> no activation (matches the cmp/beq chains below). | |||
| // v26 = {6.0h x4} (ReLU6 upper bound), v27 = {0.0h x4} (lower bound). | |||
| movi v26.4h, #6 | |||
| scvtf v26.4h, v26.4h | |||
| dup v27.4h, wzr | |||
| // x12 = dst row stride in bytes = (oc4div + oc4mod) * sizeof(float16_t), | |||
| // i.e. one NHWC pixel row of `oc` half-floats. | |||
| mov x10, #2 | |||
| add x12, x3, x4 | |||
| mul x12, x12, x10 | |||
| // w10 = current output-channel offset (elements), advances by 4 per slice. | |||
| mov w10, #0 | |||
| Loop_C4: | |||
| // Outer loop: one full 4-channel slice per iteration until oc4div is done. | |||
| cmp w10, w3 | |||
| beq Loop_C1 | |||
| // x15 = dst + channel_offset * sizeof(float16_t); w13 = remaining pixels. | |||
| mov x15, #2 | |||
| mul x14, x10, x15 | |||
| add x15, x0, x14 | |||
| add w10, w10, #4 | |||
| mov w13, w5 | |||
| // v16 = bias for these 4 channels (bias pointer x2 advances 8 bytes). | |||
| ld1 {v16.4h}, [x2], #8 | |||
| Loop_8x4: | |||
| // Unrolled by 8 pixels: load 8x4 halves contiguously from src. | |||
| cmp w13, #8 | |||
| blt Loop_4x4 | |||
| sub w13, w13, #8 | |||
| ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x1], #32 | |||
| ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], #32 | |||
| fadd v0.4h, v0.4h, v16.4h | |||
| fadd v1.4h, v1.4h, v16.4h | |||
| fadd v2.4h, v2.4h, v16.4h | |||
| fadd v3.4h, v3.4h, v16.4h | |||
| fadd v4.4h, v4.4h, v16.4h | |||
| fadd v5.4h, v5.4h, v16.4h | |||
| fadd v6.4h, v6.4h, v16.4h | |||
| fadd v7.4h, v7.4h, v16.4h | |||
| cmp x7, #3 | |||
| beq Relu6_8x4 | |||
| cmp x7, #1 | |||
| beq Relu_8x4 | |||
| b Write_8x4 | |||
| Relu6_8x4: | |||
| // ReLU6 = min(x, 6) then falls through to the max(x, 0) below. | |||
| fmin v0.4h, v0.4h, v26.4h | |||
| fmin v1.4h, v1.4h, v26.4h | |||
| fmin v2.4h, v2.4h, v26.4h | |||
| fmin v3.4h, v3.4h, v26.4h | |||
| fmin v4.4h, v4.4h, v26.4h | |||
| fmin v5.4h, v5.4h, v26.4h | |||
| fmin v6.4h, v6.4h, v26.4h | |||
| fmin v7.4h, v7.4h, v26.4h | |||
| Relu_8x4: | |||
| fmax v0.4h, v0.4h, v27.4h | |||
| fmax v1.4h, v1.4h, v27.4h | |||
| fmax v2.4h, v2.4h, v27.4h | |||
| fmax v3.4h, v3.4h, v27.4h | |||
| fmax v4.4h, v4.4h, v27.4h | |||
| fmax v5.4h, v5.4h, v27.4h | |||
| fmax v6.4h, v6.4h, v27.4h | |||
| fmax v7.4h, v7.4h, v27.4h | |||
| Write_8x4: | |||
| // Scatter: one 4-half group per dst pixel row (stride x12 bytes). | |||
| st1 {v0.4h}, [x15], x12 | |||
| st1 {v1.4h}, [x15], x12 | |||
| st1 {v2.4h}, [x15], x12 | |||
| st1 {v3.4h}, [x15], x12 | |||
| st1 {v4.4h}, [x15], x12 | |||
| st1 {v5.4h}, [x15], x12 | |||
| st1 {v6.4h}, [x15], x12 | |||
| st1 {v7.4h}, [x15], x12 | |||
| b Loop_8x4 | |||
| Loop_4x4: | |||
| // Same as Loop_8x4 but unrolled by 4 pixels. | |||
| cmp w13, #4 | |||
| blt Loop_1x4 | |||
| sub w13, w13, #4 | |||
| ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x1], #32 | |||
| fadd v0.4h, v0.4h, v16.4h | |||
| fadd v1.4h, v1.4h, v16.4h | |||
| fadd v2.4h, v2.4h, v16.4h | |||
| fadd v3.4h, v3.4h, v16.4h | |||
| cmp x7, #3 | |||
| beq Relu6_4x4 | |||
| cmp x7, #1 | |||
| beq Relu_4x4 | |||
| b Write_4x4 | |||
| Relu6_4x4: | |||
| fmin v0.4h, v0.4h, v26.4h | |||
| fmin v1.4h, v1.4h, v26.4h | |||
| fmin v2.4h, v2.4h, v26.4h | |||
| fmin v3.4h, v3.4h, v26.4h | |||
| Relu_4x4: | |||
| fmax v0.4h, v0.4h, v27.4h | |||
| fmax v1.4h, v1.4h, v27.4h | |||
| fmax v2.4h, v2.4h, v27.4h | |||
| fmax v3.4h, v3.4h, v27.4h | |||
| Write_4x4: | |||
| st1 {v0.4h}, [x15], x12 | |||
| st1 {v1.4h}, [x15], x12 | |||
| st1 {v2.4h}, [x15], x12 | |||
| st1 {v3.4h}, [x15], x12 | |||
| Loop_1x4: | |||
| // Pixel-at-a-time tail; the activation branch is hoisted outside the loop | |||
| // so each variant is a tight self-contained loop. | |||
| cmp x7, #3 | |||
| beq Relu6_1x4 | |||
| cmp x7, #1 | |||
| beq Relu_1x4 | |||
| b Write_1x4 | |||
| Relu6_1x4: | |||
| cmp w13, #0 | |||
| beq HW_Add | |||
| sub w13, w13, #1 | |||
| ld1 {v0.4h}, [x1], #8 | |||
| fadd v0.4h, v0.4h, v16.4h | |||
| fmin v0.4h, v0.4h, v26.4h | |||
| fmax v0.4h, v0.4h, v27.4h | |||
| st1 {v0.4h}, [x15], x12 | |||
| b Relu6_1x4 | |||
| Relu_1x4: | |||
| cmp w13, #0 | |||
| beq HW_Add | |||
| sub w13, w13, #1 | |||
| ld1 {v0.4h}, [x1], #8 | |||
| fadd v0.4h, v0.4h, v16.4h | |||
| fmax v0.4h, v0.4h, v27.4h | |||
| st1 {v0.4h}, [x15], x12 | |||
| b Relu_1x4 | |||
| Write_1x4: | |||
| cmp w13, #0 | |||
| beq HW_Add | |||
| sub w13, w13, #1 | |||
| ld1 {v0.4h}, [x1], #8 | |||
| fadd v0.4h, v0.4h, v16.4h | |||
| st1 {v0.4h}, [x15], x12 | |||
| b Write_1x4 | |||
| HW_Add: | |||
| // Skip the per-slice plane padding in src: x6 bytes | |||
| // (caller passes (plane_stride - plane) * C4NUM * sizeof(float16_t)). | |||
| add x1, x1, x6 | |||
| b Loop_C4 | |||
| Loop_C1: | |||
| // Tail channels (oc % 4, i.e. 1..3): compute on 4 lanes but store only the | |||
| // valid lanes so dst rows of neighbouring pixels are not clobbered. | |||
| cmp w4, #0 | |||
| beq End | |||
| mov w13, w5 | |||
| ld1 {v16.4h}, [x2], #8 | |||
| // x0 += oc4div * sizeof(float16_t): start of the tail channels in dst. | |||
| mov x15, #2 | |||
| mul x14, x10, x15 | |||
| add x0, x0, x14 | |||
| cmp w4, #1 | |||
| beq Loop_C1_1 | |||
| cmp w4, #2 | |||
| beq Loop_C1_2 | |||
| cmp w4, #3 | |||
| beq Loop_C1_3 | |||
| Loop_C1_1: | |||
| // One tail channel: store a single half per pixel (lane 0). | |||
| cmp x7, #3 | |||
| beq Loop_C1_1_Relu6 | |||
| cmp x7, #1 | |||
| beq Loop_C1_1_Relu | |||
| b Loop_C1_1_Write | |||
| Loop_C1_1_Relu6: | |||
| cmp w13, #0 | |||
| beq End | |||
| sub w13, w13, #1 | |||
| ld1 {v0.4h}, [x1], #8 | |||
| fadd v0.4h, v0.4h, v16.4h | |||
| fmin v0.4h, v0.4h, v26.4h | |||
| fmax v0.4h, v0.4h, v27.4h | |||
| st1 {v0.h}[0], [x0], x12 | |||
| b Loop_C1_1_Relu6 | |||
| Loop_C1_1_Relu: | |||
| cmp w13, #0 | |||
| beq End | |||
| sub w13, w13, #1 | |||
| ld1 {v0.4h}, [x1], #8 | |||
| fadd v0.4h, v0.4h, v16.4h | |||
| fmax v0.4h, v0.4h, v27.4h | |||
| st1 {v0.h}[0], [x0], x12 | |||
| b Loop_C1_1_Relu | |||
| Loop_C1_1_Write: | |||
| cmp w13, #0 | |||
| beq End | |||
| sub w13, w13, #1 | |||
| ld1 {v0.4h}, [x1], #8 | |||
| fadd v0.4h, v0.4h, v16.4h | |||
| st1 {v0.h}[0], [x0], x12 | |||
| b Loop_C1_1_Write | |||
| Loop_C1_2: | |||
| // Two tail channels: one 32-bit lane store = 2 halves. | |||
| cmp x7, #3 | |||
| beq Loop_C1_2_Relu6 | |||
| cmp x7, #1 | |||
| beq Loop_C1_2_Relu | |||
| b Loop_C1_2_Write | |||
| Loop_C1_2_Relu6: | |||
| cmp w13, #0 | |||
| beq End | |||
| sub w13, w13, #1 | |||
| ld1 {v0.4h}, [x1], #8 | |||
| fadd v0.4h, v0.4h, v16.4h | |||
| fmin v0.4h, v0.4h, v26.4h | |||
| fmax v0.4h, v0.4h, v27.4h | |||
| st1 {v0.s}[0], [x0], x12 | |||
| b Loop_C1_2_Relu6 | |||
| Loop_C1_2_Relu: | |||
| cmp w13, #0 | |||
| beq End | |||
| sub w13, w13, #1 | |||
| ld1 {v0.4h}, [x1], #8 | |||
| fadd v0.4h, v0.4h, v16.4h | |||
| fmax v0.4h, v0.4h, v27.4h | |||
| st1 {v0.s}[0], [x0], x12 | |||
| b Loop_C1_2_Relu | |||
| Loop_C1_2_Write: | |||
| cmp w13, #0 | |||
| beq End | |||
| sub w13, w13, #1 | |||
| ld1 {v0.4h}, [x1], #8 | |||
| fadd v0.4h, v0.4h, v16.4h | |||
| st1 {v0.s}[0], [x0], x12 | |||
| b Loop_C1_2_Write | |||
| Loop_C1_3: | |||
| // Three tail channels: 2 halves via the .s lane at x0, plus half #2 at | |||
| // x15 = x0 + 4 bytes; both pointers step by the same row stride. | |||
| add x15, x0, #4 | |||
| cmp x7, #3 | |||
| beq Loop_C1_3_Relu6 | |||
| cmp x7, #1 | |||
| beq Loop_C1_3_Relu | |||
| b Loop_C1_3_Write | |||
| Loop_C1_3_Relu6: | |||
| cmp w13, #0 | |||
| beq End | |||
| sub w13, w13, #1 | |||
| ld1 {v0.4h}, [x1], #8 | |||
| fadd v0.4h, v0.4h, v16.4h | |||
| fmin v0.4h, v0.4h, v26.4h | |||
| fmax v0.4h, v0.4h, v27.4h | |||
| st1 {v0.s}[0], [x0], x12 | |||
| st1 {v0.h}[2], [x15], x12 | |||
| b Loop_C1_3_Relu6 | |||
| Loop_C1_3_Relu: | |||
| cmp w13, #0 | |||
| beq End | |||
| sub w13, w13, #1 | |||
| ld1 {v0.4h}, [x1], #8 | |||
| fadd v0.4h, v0.4h, v16.4h | |||
| fmax v0.4h, v0.4h, v27.4h | |||
| st1 {v0.s}[0], [x0], x12 | |||
| st1 {v0.h}[2], [x15], x12 | |||
| b Loop_C1_3_Relu | |||
| Loop_C1_3_Write: | |||
| cmp w13, #0 | |||
| beq End | |||
| sub w13, w13, #1 | |||
| ld1 {v0.4h}, [x1], #8 | |||
| fadd v0.4h, v0.4h, v16.4h | |||
| st1 {v0.s}[0], [x0], x12 | |||
| st1 {v0.h}[2], [x15], x12 | |||
| b Loop_C1_3_Write | |||
| End: | |||
| ret | |||
| @@ -50,5 +50,9 @@ void PostConvFuncFp16C8(const float16_t *c8_out, float16_t *nhwc_out, const floa | |||
| // Post-process a C4-tiled deconv output: add bias, apply activation, and | |||
| // repack into NHWC. Thin wrapper that forwards to the aarch64 asm kernel. | |||
| // NOTE(review): as rendered, BOTH the generic C path (PostConvFuncCommFp16) | |||
| // and the asm kernel run on the same buffers, which would apply bias and | |||
| // activation twice. This file is a diff with +/- markers stripped; the | |||
| // PostConvFuncCommFp16 line is presumably the removed side — verify against | |||
| // the original patch. | |||
| void PostConvFuncFp16C4(const float16_t *c4_out, float16_t *nhwc_out, const float16_t *bias, size_t oc, size_t plane, | |||
| size_t plane_stride, ActType act_type) { | |||
| PostConvFuncCommFp16(nhwc_out, c4_out, bias, oc, plane, oc, plane_stride, act_type, C4NUM); | |||
| size_t oc4mod = oc % C4NUM; | |||
| size_t oc4div = oc - oc4mod; | |||
| // Byte gap between consecutive C4 planes in the source buffer. | |||
| size_t stride_size = (plane_stride - plane) * C4NUM * sizeof(float16_t); | |||
| PostFuncBiasReluC4Fp16(nhwc_out, c4_out, bias, oc4div, oc4mod, plane, stride_size, act_type); | |||
| return; | |||
| } | |||
| @@ -32,6 +32,8 @@ void PostFuncBiasReluC8Fp16(float16_t *dst, const float16_t *src, const float16_ | |||
| /* deconv winograd */ | |||
| void PostConvFuncFp16C4(const float16_t *c4_out, float16_t *nhwc_out, const float16_t *bias, size_t output_channel, | |||
| size_t plane_size, size_t plane_stride, ActType act_type); | |||
| void PostFuncBiasReluC4Fp16(float16_t *dst, const float16_t *src, const float16_t *bias, size_t oc4div, size_t oc4mod, | |||
| size_t plane_size, size_t plane_stride, size_t relu_type); | |||
| #ifdef __cplusplus | |||
| } | |||
| @@ -24,7 +24,7 @@ void DeConvWgInputPackFp16(float16_t *src_ptr, float16_t *dst_ptr, int channel, | |||
| float16_t *dst = dst_ptr; | |||
| for (int ic = 0; ic < ic4div; ic++) { | |||
| memcpy(dst, src, C4NUM * sizeof(float16_t)); | |||
| vst1_f16(dst, vld1_f16(src)); | |||
| dst += stride; | |||
| src += C4NUM; | |||
| } | |||
| @@ -85,23 +85,6 @@ std::string RealPath(const char *path) { | |||
| return res; | |||
| } | |||
| int WriteToBin(const std::string &file_path, void *data, size_t size) { | |||
| std::ofstream out_file; | |||
| out_file.open(file_path.c_str(), std::ios::binary); | |||
| if (!out_file.good()) { | |||
| MS_LOG(ERROR) << "file is bad"; | |||
| return -1; | |||
| } | |||
| if (!out_file.is_open()) { | |||
| MS_LOG(ERROR) << "file open failed"; | |||
| return -1; | |||
| } | |||
| out_file.write(reinterpret_cast<char *>(data), size); | |||
| return 0; | |||
| } | |||
| int CompareOutputData(float *output_data, size_t output_size, float *correct_data, size_t data_size) { | |||
| if (output_size != data_size) { | |||
| printf("compare failed, output_size %zu isn't equal to data_size %zu.\n", output_size, data_size); | |||
| @@ -48,7 +48,15 @@ void WriteToTxt(const std::string &file_path, void *data, size_t element_size) { | |||
| out_file.close(); | |||
| } | |||
| int WriteToBin(const std::string &file_path, void *data, size_t size); | |||
| // Writes `size` raw bytes from `data` to `file_path` (binary, truncating). | |||
| // Returns 0 on success, -1 if the file could not be opened. The stream is | |||
| // closed by the ofstream destructor (RAII); write errors are not reported. | |||
| inline int WriteToBin(const std::string &file_path, void *data, size_t size) { | |||
| std::ofstream out_file; | |||
| out_file.open(file_path.c_str(), std::ios::binary); | |||
| if (!out_file.good() || !out_file.is_open()) { | |||
| return -1; | |||
| } | |||
| out_file.write(reinterpret_cast<char *>(data), size); | |||
| return 0; | |||
| } | |||
| int CompareOutputData(float *output_data, size_t output_num, float *correct_data, size_t data_size); | |||
| int CompareOutput(float *output_data, size_t output_num, std::string file_path); | |||
| @@ -233,8 +233,7 @@ kernel::LiteKernel *CpuDeConvFp16KernelCreator(const std::vector<lite::Tensor *> | |||
| auto conv_param = reinterpret_cast<ConvParameter *>(opParameter); | |||
| if ((conv_param->stride_h_ != 1 || conv_param->stride_w_ != 1) && | |||
| (conv_param->dilation_w_ == 1 && conv_param->dilation_h_ == 1)) { | |||
| /* DeConvWinogradFp16CPUKernel */ | |||
| kernel = new (std::nothrow) kernel::DeConvolutionFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| kernel = new (std::nothrow) kernel::DeConvWinogradFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| } else { | |||
| kernel = new (std::nothrow) kernel::DeConvolutionFp16CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| } | |||
| @@ -266,5 +265,4 @@ kernel::LiteKernel *CpuDeConvFp16KernelCreator(const std::vector<lite::Tensor *> | |||
| return kernel; | |||
| } | |||
| REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_DeConv2D, CpuDeConvFp16KernelCreator) | |||
| } // namespace mindspore::kernel | |||
| @@ -248,7 +248,6 @@ kernel::LiteKernel *CpuDeConvFp32KernelCreator(const std::vector<lite::Tensor *> | |||
| auto conv_param = reinterpret_cast<ConvParameter *>(opParameter); | |||
| if ((conv_param->stride_h_ != 1 || conv_param->stride_w_ != 1) && | |||
| (conv_param->dilation_w_ == 1 && conv_param->dilation_h_ == 1)) { | |||
| /* DeConvolutionWinogradCPUKernel */ | |||
| kernel = new (std::nothrow) kernel::DeConvolutionWinogradCPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| } else { | |||
| kernel = new (std::nothrow) kernel::DeConvolutionCPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| @@ -18,6 +18,7 @@ | |||
| #include "src/runtime/runtime_api.h" | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_MEMORY_FAILED; | |||
| using mindspore::lite::RET_NULL_PTR; | |||
| using mindspore::lite::RET_OK; | |||
| @@ -59,20 +60,10 @@ void DeConvolutionWinogradCPUKernel::FreeResizeBuf() { | |||
| wg.buf_init_ = false; | |||
| } | |||
| if (nc4hw4_output_ != nullptr) { | |||
| free(nc4hw4_output_); | |||
| nc4hw4_output_ = nullptr; | |||
| } | |||
| if (tile_input_ != nullptr) { | |||
| free(tile_input_); | |||
| tile_input_ = nullptr; | |||
| } | |||
| if (tile_output_ != nullptr) { | |||
| free(tile_output_); | |||
| tile_output_ = nullptr; | |||
| } | |||
| return; | |||
| } | |||
| @@ -108,9 +99,6 @@ int DeConvolutionWinogradCPUKernel::InitParameter() { | |||
| deconv_param_->input_plane_ = conv_param_->input_h_ * conv_param_->input_w_; | |||
| deconv_param_->output_plane_ = conv_param_->output_h_ * conv_param_->output_w_; | |||
| nc4hw4_output_ = | |||
| reinterpret_cast<float *>(malloc(deconv_param_->oc_up4_ * deconv_param_->output_plane_ * sizeof(float))); | |||
| deconv_param_->in_tile_w_count_ = UP_DIV(conv_param_->input_w_, DECONV_WINOGRAD_DEFAULT_UNIT); | |||
| deconv_param_->in_tile_h_count_ = UP_DIV(conv_param_->input_h_, DECONV_WINOGRAD_DEFAULT_UNIT); | |||
| @@ -129,9 +117,6 @@ int DeConvolutionWinogradCPUKernel::InitParameter() { | |||
| deconv_param_->out_tile_w_ = (DECONV_WINOGRAD_DEFAULT_UNIT - 1) * conv_param_->stride_w_ + conv_param_->kernel_w_; | |||
| deconv_param_->out_tile_h_ = (DECONV_WINOGRAD_DEFAULT_UNIT - 1) * conv_param_->stride_h_ + conv_param_->kernel_h_; | |||
| size = deconv_param_->thread_num_ * deconv_param_->out_tile_w_ * deconv_param_->out_tile_h_ * | |||
| DECONV_WINOGRAD_DEFAULT_TILE * deconv_param_->oc_up4_; | |||
| tile_output_ = reinterpret_cast<float *>(malloc(size * sizeof(float))); | |||
| for (int i = 0; i < deconv_param_->compute_size_; i++) { | |||
| DeConvComputeUnit &unit = deconv_param_->compute_units_[i]; | |||
| @@ -329,7 +314,44 @@ int DeConvolutionWinogradCPUKernel::DeDeconvPost(int task_id) { | |||
| return RET_OK; | |||
| } | |||
| // Allocates the per-Run scratch buffers from the context allocator; paired | |||
| // with FreeRunBuf(). Returns RET_OK on success, RET_MEMORY_FAILED otherwise. | |||
| int DeConvolutionWinogradCPUKernel::InitRunBuf() { | |||
| // NOTE(review): `size` is int; very large shapes could overflow — confirm. | |||
| int size = deconv_param_->oc_up4_ * deconv_param_->output_plane_; | |||
| nc4hw4_output_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(size * sizeof(float))); | |||
| if (nc4hw4_output_ == nullptr) { | |||
| MS_LOG(ERROR) << "de conv wg Malloc nc4hw4_output_ error!"; | |||
| return RET_MEMORY_FAILED; | |||
| } | |||
| size = deconv_param_->thread_num_ * deconv_param_->out_tile_w_ * deconv_param_->out_tile_h_ * | |||
| DECONV_WINOGRAD_DEFAULT_TILE * deconv_param_->oc_up4_; | |||
| tile_output_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(size * sizeof(float))); | |||
| if (tile_output_ == nullptr) { | |||
| MS_LOG(ERROR) << "de conv wg Malloc tile_output_ error!"; | |||
| // Release the first buffer: Run() returns on failure without FreeRunBuf(), | |||
| // so leaving nc4hw4_output_ allocated here would leak it. | |||
| ctx_->allocator->Free(nc4hw4_output_); | |||
| nc4hw4_output_ = nullptr; | |||
| return RET_MEMORY_FAILED; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| // Returns the per-Run scratch buffers to the context allocator and nulls the | |||
| // pointers, so repeated calls (or a call after a failed InitRunBuf) are safe. | |||
| void DeConvolutionWinogradCPUKernel::FreeRunBuf() { | |||
| if (nc4hw4_output_ != nullptr) { | |||
| ctx_->allocator->Free(nc4hw4_output_); | |||
| nc4hw4_output_ = nullptr; | |||
| } | |||
| if (tile_output_ != nullptr) { | |||
| ctx_->allocator->Free(tile_output_); | |||
| tile_output_ = nullptr; | |||
| } | |||
| return; | |||
| } | |||
| int DeConvolutionWinogradCPUKernel::Run() { | |||
| auto ret = InitRunBuf(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "InitRunBuf fail!ret: " << ret; | |||
| return ret; | |||
| } | |||
| float *src_in = reinterpret_cast<float *>(in_tensors_[0]->data_c()); | |||
| float *src_out = reinterpret_cast<float *>(out_tensors_[0]->data_c()); | |||
| @@ -344,6 +366,7 @@ int DeConvolutionWinogradCPUKernel::Run() { | |||
| ParallelLaunch(this->context_->thread_pool_, DeConvWgPostFp32Run, this, thread_num_hw_); | |||
| } | |||
| FreeRunBuf(); | |||
| return RET_OK; | |||
| } | |||
| @@ -54,6 +54,8 @@ class DeConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel { | |||
| int InitParameter(); | |||
| void FreeDeconvParam(); | |||
| void FreeResizeBuf(); | |||
| int InitRunBuf(); | |||
| void FreeRunBuf(); | |||
| private: | |||
| DeConvParam *deconv_param_; | |||
| @@ -11,6 +11,7 @@ ml_face_contour | |||
| mnet | |||
| ml_face_landmark | |||
| ml_liveness_detect_landmark | |||
| deconv_test_model | |||
| # aware_training | |||
| video_infer.tflite | |||
| mobilenet_v1_1.0_224_quant.tflite | |||
| @@ -56,3 +56,4 @@ hiai_face_attr1 | |||
| detect-mbv1-shortcut-400-400_nopostprocess_simplified | |||
| detect_mbv1_640_480_nopostprocess_simplified | |||
| retinaface | |||
| deconv_test_model | |||