| @@ -39,7 +39,8 @@ int ConvolutionDepthwiseFP32Coder::InitWeightBias() { | |||||
| MS_CHECK_PTR(packed_weight_); | MS_CHECK_PTR(packed_weight_); | ||||
| MS_CHECK_RET_CODE(memset_s(packed_weight_, packed_weight_data_size, 0, packed_weight_data_size), | MS_CHECK_RET_CODE(memset_s(packed_weight_, packed_weight_data_size, 0, packed_weight_data_size), | ||||
| "memset packed weight failed!"); | "memset packed weight failed!"); | ||||
| PackNCHWToNHWCFp32(origin_weight, packed_weight_, 1, filter_tensor_->Height() * filter_tensor_->Width(), channel); | |||||
| PackNCHWToNHWCFp32(origin_weight, packed_weight_, 1, filter_tensor_->Height() * filter_tensor_->Width(), channel, 0, | |||||
| 0); | |||||
| auto channel_size = static_cast<size_t>(channel); | auto channel_size = static_cast<size_t>(channel); | ||||
| auto bias_size = static_cast<size_t>(channel_size * sizeof(float)); | auto bias_size = static_cast<size_t>(channel_size * sizeof(float)); | ||||
| @@ -127,6 +127,52 @@ void Fp16TransposeDim5(const float16_t *in_data, float16_t *out_data, int *strid | |||||
| } | } | ||||
| } | } | ||||
| void Fp16TransposeDim6(const float16_t *in_data, float16_t *out_data, int *strides, int *out_strides, int *perm, | |||||
| const int *output_shape) { | |||||
| const int stride0 = strides[perm[0]]; | |||||
| const int stride1 = strides[perm[1]]; | |||||
| const int stride2 = strides[perm[2]]; | |||||
| const int stride3 = strides[perm[3]]; | |||||
| const int stride4 = strides[perm[4]]; | |||||
| const int stride5 = strides[perm[5]]; | |||||
| const int out_stride0 = out_strides[0]; | |||||
| const int out_stride1 = out_strides[1]; | |||||
| const int out_stride2 = out_strides[2]; | |||||
| const int out_stride3 = out_strides[3]; | |||||
| const int out_stride4 = out_strides[4]; | |||||
| const int output0 = output_shape[0]; | |||||
| const int output1 = output_shape[1]; | |||||
| const int output2 = output_shape[2]; | |||||
| const int output3 = output_shape[3]; | |||||
| const int output4 = output_shape[4]; | |||||
| const int output5 = output_shape[5]; | |||||
| for (int i = 0; i < output0; ++i) { | |||||
| int out_stride0_i = i * out_stride0; | |||||
| int stride0_i = i * stride0; | |||||
| for (int j = 0; j < output1; ++j) { | |||||
| int out_stride1_j = j * out_stride1; | |||||
| int stride1_j = j * stride1; | |||||
| for (int k = 0; k < output2; ++k) { | |||||
| int out_stride2_k = k * out_stride2; | |||||
| int stride2_k = k * stride2; | |||||
| for (int m = 0; m < output3; ++m) { | |||||
| int out_stride3_m = m * out_stride3; | |||||
| int stride3_m = m * stride3; | |||||
| for (int n = 0; n < output4; ++n) { | |||||
| int out_stride4_n = n * out_stride4; | |||||
| int stride4_n = n * stride4; | |||||
| for (int g = 0; g < output5; ++g) { | |||||
| out_data[out_stride0_i + out_stride1_j + out_stride2_k + out_stride3_m + out_stride4_n + g] = | |||||
| in_data[stride0_i + stride1_j + stride2_k + stride3_m + stride4_n + g * stride5]; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| void TransposeDimsFp16(const float16_t *in_data, float16_t *out_data, const int *strides, const int *out_strides, | void TransposeDimsFp16(const float16_t *in_data, float16_t *out_data, const int *strides, const int *out_strides, | ||||
| const int *perm, const int *output_shape, int dims, int *size, int *position) { | const int *perm, const int *output_shape, int dims, int *size, int *position) { | ||||
| *(size + dims - 1) = 1; | *(size + dims - 1) = 1; | ||||
| @@ -190,6 +236,8 @@ int Fp16DoTranspose(const float16_t *in_data, float16_t *out_data, const int *ou | |||||
| Fp16TransposeDim4(in_data, out_data, strides, out_strides, perm, output_shape); | Fp16TransposeDim4(in_data, out_data, strides, out_strides, perm, output_shape); | ||||
| } else if (num_axes == 5) { | } else if (num_axes == 5) { | ||||
| Fp16TransposeDim5(in_data, out_data, strides, out_strides, perm, output_shape); | Fp16TransposeDim5(in_data, out_data, strides, out_strides, perm, output_shape); | ||||
| } else if (num_axes == 6) { | |||||
| Fp16TransposeDim6(in_data, out_data, strides, out_strides, perm, output_shape); | |||||
| } else { | } else { | ||||
| TransposeDimsFp16(in_data, out_data, strides, out_strides, perm, output_shape, num_axes, size, position); | TransposeDimsFp16(in_data, out_data, strides, out_strides, perm, output_shape, num_axes, size, position); | ||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | /** | ||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -17,7 +17,7 @@ | |||||
| #include "nnacl/fp32/pack_fp32.h" | #include "nnacl/fp32/pack_fp32.h" | ||||
// Repacks convolution weights from KHW layout to HWK layout.
// This is the same permutation as NCHW -> NHWC with a batch of 1.
void PackWeightKHWToHWKFp32(const void *src, void *dst, int plane, int channel) {
  // Single-threaded call: task_id = 0, thread_count = 0.
  PackNCHWToNHWCFp32(src, dst, 1, plane, channel, 0, 0);
}
| void PackHWCToWHC(const float *src, float *dst, int height, int width, int channel) { | void PackHWCToWHC(const float *src, float *dst, int height, int width, int channel) { | ||||
| @@ -286,166 +286,45 @@ void PackDepthwiseIndirectWeightC8Fp32(const void *src, void *dst, int height, i | |||||
| } | } | ||||
| } | } | ||||
| #ifndef ENABLE_SSE | |||||
| void PackNHWCToNCHWFp32(const void *src, void *dst, int batches, int plane, int channel) { | |||||
| int hw8 = plane / C8NUM * C8NUM; | |||||
| void PackNHWCToNCHWFp32(const void *src, void *dst, int batches, int plane, int channel, int task_id, | |||||
| int thread_count) { | |||||
| #ifdef ENABLE_ARM64 | |||||
| Transpose8X8Fp32Func Transpose8X8Fp32Func_ = Transpose8X8Fp32Arm64; | |||||
| #elif defined(ENABLE_ARM32) | |||||
| Transpose8X8Fp32Func Transpose8X8Fp32Func_ = Transpose8X8Fp32Arm32; | |||||
| #elif defined(ENABLE_AVX) | |||||
| Transpose8X8Fp32Func Transpose8X8Fp32Func_ = Transpose8X8Fp32Avx; | |||||
| #elif defined(ENABLE_SSE) && !defined(ENABLE_AVX) | |||||
| Transpose8X8Fp32Func Transpose8X8Fp32Func_ = Transpose8X8Fp32Sse; | |||||
| #endif | |||||
| int hw8 = plane / C8NUM; | |||||
| int task_start = 0; | |||||
| int task_end = plane; | |||||
| if (thread_count > 0) { | |||||
| int offset_hw = UP_DIV(hw8, thread_count) * C8NUM; | |||||
| task_start = offset_hw * task_id; | |||||
| int count = plane - task_start; | |||||
| if (count <= 0) { | |||||
| return; | |||||
| } | |||||
| task_end = (task_id + 1) == thread_count ? plane : MSMIN(plane, task_start + offset_hw); | |||||
| hw8 = task_start + (task_end - task_start) > offset_hw ? offset_hw : 0; | |||||
| } else { | |||||
| hw8 *= C8NUM; | |||||
| } | |||||
| int c8 = channel / C8NUM * C8NUM; | int c8 = channel / C8NUM * C8NUM; | ||||
| int batch = plane * channel; | int batch = plane * channel; | ||||
| for (int n = 0; n < batches; n++) { | for (int n = 0; n < batches; n++) { | ||||
| const float *src_batch = (const float *)src + n * batch; | const float *src_batch = (const float *)src + n * batch; | ||||
| float *dst_batch = (float *)dst + n * batch; | float *dst_batch = (float *)dst + n * batch; | ||||
| int hw = 0; | |||||
| int hw = task_start; | |||||
| for (; hw < hw8; hw += C8NUM) { | for (; hw < hw8; hw += C8NUM) { | ||||
| int c = 0; | int c = 0; | ||||
| for (; c < c8; c += C8NUM) { | for (; c < c8; c += C8NUM) { | ||||
| const float *src_ptr = src_batch + hw * channel + c; | const float *src_ptr = src_batch + hw * channel + c; | ||||
| float *dst_ptr = dst_batch + c * plane + hw; | float *dst_ptr = dst_batch + c * plane + hw; | ||||
| #ifdef ENABLE_ARM64 | |||||
| size_t srcStride = channel * sizeof(float); | |||||
| size_t dstStride = plane * sizeof(float); | |||||
| asm volatile( | |||||
| "mov x10, %[src_ptr]\n" | |||||
| "mov x11, %[dst_ptr]\n" | |||||
| "ld1 {v0.4s, v1.4s}, [x10], %[srcStride]\n" | |||||
| "ld1 {v2.4s, v3.4s}, [x10], %[srcStride]\n" | |||||
| "zip1 v8.4s, v0.4s, v2.4s\n" | |||||
| "zip2 v9.4s, v0.4s, v2.4s\n" | |||||
| "zip1 v12.4s, v1.4s, v3.4s\n" | |||||
| "zip2 v13.4s, v1.4s, v3.4s\n" | |||||
| "ld1 {v4.4s, v5.4s}, [x10], %[srcStride]\n" | |||||
| "ld1 {v6.4s, v7.4s}, [x10], %[srcStride]\n" | |||||
| "zip1 v10.4s, v4.4s, v6.4s\n" | |||||
| "zip2 v11.4s, v4.4s, v6.4s\n" | |||||
| "zip1 v14.4s, v5.4s, v7.4s\n" | |||||
| "zip2 v15.4s, v5.4s, v7.4s\n" | |||||
| "ld1 {v0.4s, v1.4s}, [x10], %[srcStride]\n" | |||||
| "ld1 {v2.4s, v3.4s}, [x10], %[srcStride]\n" | |||||
| "trn1 v16.2d, v8.2d, v10.2d\n" | |||||
| "trn2 v18.2d, v8.2d, v10.2d\n" | |||||
| "trn1 v20.2d, v9.2d, v11.2d\n" | |||||
| "trn2 v22.2d, v9.2d, v11.2d\n" | |||||
| "ld1 {v4.4s, v5.4s}, [x10], %[srcStride]\n" | |||||
| "ld1 {v6.4s, v7.4s}, [x10], %[srcStride]\n" | |||||
| "trn1 v24.2d, v12.2d, v14.2d\n" | |||||
| "trn2 v26.2d, v12.2d, v14.2d\n" | |||||
| "trn1 v28.2d, v13.2d, v15.2d\n" | |||||
| "trn2 v30.2d, v13.2d, v15.2d\n" | |||||
| "zip1 v8.4s, v0.4s, v2.4s\n" | |||||
| "zip2 v9.4s, v0.4s, v2.4s\n" | |||||
| "zip1 v12.4s, v1.4s, v3.4s\n" | |||||
| "zip2 v13.4s, v1.4s, v3.4s\n" | |||||
| "zip1 v10.4s, v4.4s, v6.4s\n" | |||||
| "zip2 v11.4s, v4.4s, v6.4s\n" | |||||
| "zip1 v14.4s, v5.4s, v7.4s\n" | |||||
| "zip2 v15.4s, v5.4s, v7.4s\n" | |||||
| "trn1 v17.2d, v8.2d, v10.2d\n" | |||||
| "trn2 v19.2d, v8.2d, v10.2d\n" | |||||
| "trn1 v21.2d, v9.2d, v11.2d\n" | |||||
| "trn2 v23.2d, v9.2d, v11.2d\n" | |||||
| "trn1 v25.2d, v12.2d, v14.2d\n" | |||||
| "trn2 v27.2d, v12.2d, v14.2d\n" | |||||
| "trn1 v29.2d, v13.2d, v15.2d\n" | |||||
| "trn2 v31.2d, v13.2d, v15.2d\n" | |||||
| "st1 {v16.4s, v17.4s}, [x11], %[dstStride]\n" | |||||
| "st1 {v18.4s, v19.4s}, [x11], %[dstStride]\n" | |||||
| "st1 {v20.4s, v21.4s}, [x11], %[dstStride]\n" | |||||
| "st1 {v22.4s, v23.4s}, [x11], %[dstStride]\n" | |||||
| "st1 {v24.4s, v25.4s}, [x11], %[dstStride]\n" | |||||
| "st1 {v26.4s, v27.4s}, [x11], %[dstStride]\n" | |||||
| "st1 {v28.4s, v29.4s}, [x11], %[dstStride]\n" | |||||
| "st1 {v30.4s, v31.4s}, [x11], %[dstStride]\n" | |||||
| : | |||||
| : | |||||
| [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride) | |||||
| : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", | |||||
| "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", | |||||
| "v30", "v31"); | |||||
| #elif ENABLE_ARM32 | |||||
| size_t srcStride = channel * sizeof(float); | |||||
| size_t dstStride = plane * sizeof(float); | |||||
| asm volatile( | |||||
| "mov r10, %[src_ptr]\n" | |||||
| "mov r12, %[dst_ptr]\n" | |||||
| "vld1.32 {q0, q1}, [r10], %[srcStride]\n" | |||||
| "vld1.32 {q2, q3}, [r10], %[srcStride]\n" | |||||
| "vtrn.32 d0, d4\n" | |||||
| "vtrn.32 d1, d5\n" | |||||
| "vtrn.32 d2, d6\n" | |||||
| "vtrn.32 d3, d7\n" | |||||
| "vld1.32 {q4, q5}, [r10], %[srcStride]\n" | |||||
| "vld1.32 {q6, q7}, [r10], %[srcStride]\n" | |||||
| "vtrn.32 d8, d12\n" | |||||
| "vtrn.32 d9, d13\n" | |||||
| "vtrn.32 d10, d14\n" | |||||
| "vtrn.32 d11, d15\n" | |||||
| "vld1.32 {q8, q9}, [r10], %[srcStride]\n" | |||||
| "vld1.32 {q10, q11}, [r10], %[srcStride]\n" | |||||
| "vswp d1, d8\n" | |||||
| "vswp d3, d10\n" | |||||
| "vswp d5, d12\n" | |||||
| "vswp d7, d14\n" | |||||
| "vtrn.32 d16, d20\n" | |||||
| "vtrn.32 d17, d21\n" | |||||
| "vtrn.32 d18, d22\n" | |||||
| "vtrn.32 d19, d23\n" | |||||
| "vld1.32 {q12, q13}, [r10], %[srcStride]\n" | |||||
| "vld1.32 {q14, q15}, [r10], %[srcStride]\n" | |||||
| "vtrn.32 d24, d28\n" | |||||
| "vtrn.32 d25, d29\n" | |||||
| "vtrn.32 d26, d30\n" | |||||
| "vtrn.32 d27, d31\n" | |||||
| "vswp d17, d24\n" | |||||
| "vswp d19, d26\n" | |||||
| "vswp d21, d28\n" | |||||
| "vswp d23, d30\n" | |||||
| "add r10, r12, #16\n" | |||||
| "vst1.32 {q0}, [r12], %[dstStride]\n" | |||||
| "vst1.32 {q8}, [r10], %[dstStride]\n" | |||||
| "vst1.32 {q2}, [r12], %[dstStride]\n" | |||||
| "vst1.32 {q10}, [r10], %[dstStride]\n" | |||||
| "vst1.32 {q4}, [r12], %[dstStride]\n" | |||||
| "vst1.32 {q12}, [r10], %[dstStride]\n" | |||||
| "vst1.32 {q6}, [r12], %[dstStride]\n" | |||||
| "vst1.32 {q14}, [r10], %[dstStride]\n" | |||||
| "vst1.32 {q1}, [r12], %[dstStride]\n" | |||||
| "vst1.32 {q9}, [r10], %[dstStride]\n" | |||||
| "vst1.32 {q3}, [r12], %[dstStride]\n" | |||||
| "vst1.32 {q11}, [r10], %[dstStride]\n" | |||||
| "vst1.32 {q5}, [r12], %[dstStride]\n" | |||||
| "vst1.32 {q13}, [r10], %[dstStride]\n" | |||||
| "vst1.32 {q7}, [r12], %[dstStride]\n" | |||||
| "vst1.32 {q15}, [r10], %[dstStride]\n" | |||||
| : | |||||
| : | |||||
| [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride) | |||||
| : "r10", "r12", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", | |||||
| "q15"); | |||||
| #if defined(ENABLE_ARM64) || defined(ENABLE_AVX) || defined(ENABLE_SSE) || defined(ENABLE_ARM32) | |||||
| Transpose8X8Fp32Func_(src_ptr, dst_ptr, channel, plane); | |||||
| #else | #else | ||||
| for (int tr = 0; tr < C8NUM; tr++) { | for (int tr = 0; tr < C8NUM; tr++) { | ||||
| for (int tc = 0; tc < C8NUM; tc++) { | for (int tc = 0; tc < C8NUM; tc++) { | ||||
| @@ -462,7 +341,7 @@ void PackNHWCToNCHWFp32(const void *src, void *dst, int batches, int plane, int | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| for (; hw < plane; hw++) { | |||||
| for (; hw < task_end; hw++) { | |||||
| const float *src_ptr = src_batch + hw * channel; | const float *src_ptr = src_batch + hw * channel; | ||||
| float *dst_ptr = dst_batch + hw; | float *dst_ptr = dst_batch + hw; | ||||
| for (size_t i = 0; i < channel; i++) { | for (size_t i = 0; i < channel; i++) { | ||||
| @@ -470,10 +349,286 @@ void PackNHWCToNCHWFp32(const void *src, void *dst, int batches, int plane, int | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| return; | |||||
| } | |||||
// NCHW -> NHWC is the inverse permutation of NHWC -> NCHW, so the same kernel
// is reused with the plane and channel arguments swapped.
// If not multithreaded, pass task_id = 0 and thread_count = 0.
void PackNCHWToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel, int task_id, int thread_count) {
  PackNHWCToNCHWFp32(src, dst, batch, channel, plane, task_id, thread_count);
}
#ifdef ENABLE_ARM64
// Transposes one 8x8 tile of fp32 values with AArch64 NEON inline assembly.
// src_ptr/dst_ptr: start of the tile; rows are src_stride / dst_stride
// elements apart. Strides are element counts and are converted to bytes for
// the post-indexed ld1/st1 addressing below.
inline void Transpose8X8Fp32Arm64(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride) {
  size_t srcStride = src_stride * sizeof(float);  // row pitch in bytes
  size_t dstStride = dst_stride * sizeof(float);
  asm volatile(
    "mov x10, %[src_ptr]\n"
    "mov x11, %[dst_ptr]\n"
    // Load rows pairwise; zip1/zip2 interleave 32-bit lanes of row pairs,
    // trn1/trn2 on 64-bit lanes then recombine the halves, yielding the
    // transposed rows in v16..v31.
    "ld1 {v0.4s, v1.4s}, [x10], %[srcStride]\n"
    "ld1 {v2.4s, v3.4s}, [x10], %[srcStride]\n"
    "zip1 v8.4s, v0.4s, v2.4s\n"
    "zip2 v9.4s, v0.4s, v2.4s\n"
    "zip1 v12.4s, v1.4s, v3.4s\n"
    "zip2 v13.4s, v1.4s, v3.4s\n"
    "ld1 {v4.4s, v5.4s}, [x10], %[srcStride]\n"
    "ld1 {v6.4s, v7.4s}, [x10], %[srcStride]\n"
    "zip1 v10.4s, v4.4s, v6.4s\n"
    "zip2 v11.4s, v4.4s, v6.4s\n"
    "zip1 v14.4s, v5.4s, v7.4s\n"
    "zip2 v15.4s, v5.4s, v7.4s\n"
    // Second half of the tile is loaded while the first half is combined,
    // reusing v0..v7 as scratch.
    "ld1 {v0.4s, v1.4s}, [x10], %[srcStride]\n"
    "ld1 {v2.4s, v3.4s}, [x10], %[srcStride]\n"
    "trn1 v16.2d, v8.2d, v10.2d\n"
    "trn2 v18.2d, v8.2d, v10.2d\n"
    "trn1 v20.2d, v9.2d, v11.2d\n"
    "trn2 v22.2d, v9.2d, v11.2d\n"
    "ld1 {v4.4s, v5.4s}, [x10], %[srcStride]\n"
    "ld1 {v6.4s, v7.4s}, [x10], %[srcStride]\n"
    "trn1 v24.2d, v12.2d, v14.2d\n"
    "trn2 v26.2d, v12.2d, v14.2d\n"
    "trn1 v28.2d, v13.2d, v15.2d\n"
    "trn2 v30.2d, v13.2d, v15.2d\n"
    "zip1 v8.4s, v0.4s, v2.4s\n"
    "zip2 v9.4s, v0.4s, v2.4s\n"
    "zip1 v12.4s, v1.4s, v3.4s\n"
    "zip2 v13.4s, v1.4s, v3.4s\n"
    "zip1 v10.4s, v4.4s, v6.4s\n"
    "zip2 v11.4s, v4.4s, v6.4s\n"
    "zip1 v14.4s, v5.4s, v7.4s\n"
    "zip2 v15.4s, v5.4s, v7.4s\n"
    "trn1 v17.2d, v8.2d, v10.2d\n"
    "trn2 v19.2d, v8.2d, v10.2d\n"
    "trn1 v21.2d, v9.2d, v11.2d\n"
    "trn2 v23.2d, v9.2d, v11.2d\n"
    "trn1 v25.2d, v12.2d, v14.2d\n"
    "trn2 v27.2d, v12.2d, v14.2d\n"
    "trn1 v29.2d, v13.2d, v15.2d\n"
    "trn2 v31.2d, v13.2d, v15.2d\n"
    // Store eight transposed rows, two quad-registers (8 floats) per row.
    "st1 {v16.4s, v17.4s}, [x11], %[dstStride]\n"
    "st1 {v18.4s, v19.4s}, [x11], %[dstStride]\n"
    "st1 {v20.4s, v21.4s}, [x11], %[dstStride]\n"
    "st1 {v22.4s, v23.4s}, [x11], %[dstStride]\n"
    "st1 {v24.4s, v25.4s}, [x11], %[dstStride]\n"
    "st1 {v26.4s, v27.4s}, [x11], %[dstStride]\n"
    "st1 {v28.4s, v29.4s}, [x11], %[dstStride]\n"
    "st1 {v30.4s, v31.4s}, [x11], %[dstStride]\n"
    :
    : [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride)
    : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
      "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31");
}
#endif
#ifdef ENABLE_ARM32
// Transposes one 8x8 tile of fp32 values with ARM32 NEON inline assembly.
// src_ptr/dst_ptr: start of the tile; rows are src_stride / dst_stride
// elements apart. Strides are element counts, converted to bytes for the
// post-indexed vld1/vst1 addressing below.
inline void Transpose8X8Fp32Arm32(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride) {
  size_t srcStride = src_stride * sizeof(float);  // row pitch in bytes
  size_t dstStride = dst_stride * sizeof(float);
  asm volatile(
    "mov r10, %[src_ptr]\n"
    "mov r12, %[dst_ptr]\n"
    // vtrn.32 transposes 2x2 blocks of 32-bit lanes between row pairs;
    // vswp then exchanges 64-bit halves across register pairs to finish the
    // transpose of each quadrant.
    "vld1.32 {q0, q1}, [r10], %[srcStride]\n"
    "vld1.32 {q2, q3}, [r10], %[srcStride]\n"
    "vtrn.32 d0, d4\n"
    "vtrn.32 d1, d5\n"
    "vtrn.32 d2, d6\n"
    "vtrn.32 d3, d7\n"
    "vld1.32 {q4, q5}, [r10], %[srcStride]\n"
    "vld1.32 {q6, q7}, [r10], %[srcStride]\n"
    "vtrn.32 d8, d12\n"
    "vtrn.32 d9, d13\n"
    "vtrn.32 d10, d14\n"
    "vtrn.32 d11, d15\n"
    "vld1.32 {q8, q9}, [r10], %[srcStride]\n"
    "vld1.32 {q10, q11}, [r10], %[srcStride]\n"
    "vswp d1, d8\n"
    "vswp d3, d10\n"
    "vswp d5, d12\n"
    "vswp d7, d14\n"
    "vtrn.32 d16, d20\n"
    "vtrn.32 d17, d21\n"
    "vtrn.32 d18, d22\n"
    "vtrn.32 d19, d23\n"
    "vld1.32 {q12, q13}, [r10], %[srcStride]\n"
    "vld1.32 {q14, q15}, [r10], %[srcStride]\n"
    "vtrn.32 d24, d28\n"
    "vtrn.32 d25, d29\n"
    "vtrn.32 d26, d30\n"
    "vtrn.32 d27, d31\n"
    "vswp d17, d24\n"
    "vswp d19, d26\n"
    "vswp d21, d28\n"
    "vswp d23, d30\n"
    // r12 stores the left 4 floats of each output row, r10 (= r12 + 16 bytes)
    // stores the right 4 floats.
    "add r10, r12, #16\n"
    "vst1.32 {q0}, [r12], %[dstStride]\n"
    "vst1.32 {q8}, [r10], %[dstStride]\n"
    "vst1.32 {q2}, [r12], %[dstStride]\n"
    "vst1.32 {q10}, [r10], %[dstStride]\n"
    "vst1.32 {q4}, [r12], %[dstStride]\n"
    "vst1.32 {q12}, [r10], %[dstStride]\n"
    "vst1.32 {q6}, [r12], %[dstStride]\n"
    "vst1.32 {q14}, [r10], %[dstStride]\n"
    "vst1.32 {q1}, [r12], %[dstStride]\n"
    "vst1.32 {q9}, [r10], %[dstStride]\n"
    "vst1.32 {q3}, [r12], %[dstStride]\n"
    "vst1.32 {q11}, [r10], %[dstStride]\n"
    "vst1.32 {q5}, [r12], %[dstStride]\n"
    "vst1.32 {q13}, [r10], %[dstStride]\n"
    "vst1.32 {q7}, [r12], %[dstStride]\n"
    "vst1.32 {q15}, [r10], %[dstStride]\n"
    :
    : [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride)
    : "r10", "r12", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14",
      "q15");
}
#endif
| void PackNCHWToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel) { | |||||
| return PackNHWCToNCHWFp32(src, dst, batch, channel, plane); | |||||
#ifdef ENABLE_AVX
// Transposes one 8x8 tile of fp32 values with AVX intrinsics.
// src_ptr/dst_ptr: start of the tile; rows are src_stride / dst_stride
// elements apart (element counts, not bytes).
inline void Transpose8X8Fp32Avx(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride) {
  // Loads eight rows into src1..src8 (project macro).
  LOAD256X8_F32(src, src_ptr, src_stride)
  // Stage 1: interleave 32-bit lanes of adjacent row pairs.
  __m256 r1 = _mm256_unpacklo_ps(src1, src2);
  __m256 r2 = _mm256_unpackhi_ps(src1, src2);
  __m256 r3 = _mm256_unpacklo_ps(src3, src4);
  __m256 r4 = _mm256_unpackhi_ps(src3, src4);
  __m256 r5 = _mm256_unpacklo_ps(src5, src6);
  __m256 r6 = _mm256_unpackhi_ps(src5, src6);
  __m256 r7 = _mm256_unpacklo_ps(src7, src8);
  __m256 r8 = _mm256_unpackhi_ps(src7, src8);
  // Stage 2: exchange 64-bit lane pairs; shuffle 0x4E swaps the two 64-bit
  // halves within each 128-bit lane, and the blends pick the matching halves.
  __m256 v;
  v = _mm256_shuffle_ps(r1, r3, 0x4E);
  src1 = _mm256_blend_ps(r1, v, 0xCC);
  src2 = _mm256_blend_ps(r3, v, 0x33);
  v = _mm256_shuffle_ps(r2, r4, 0x4E);
  src3 = _mm256_blend_ps(r2, v, 0xCC);
  src4 = _mm256_blend_ps(r4, v, 0x33);
  v = _mm256_shuffle_ps(r5, r7, 0x4E);
  src5 = _mm256_blend_ps(r5, v, 0xCC);
  src6 = _mm256_blend_ps(r7, v, 0x33);
  v = _mm256_shuffle_ps(r6, r8, 0x4E);
  src7 = _mm256_blend_ps(r6, v, 0xCC);
  src8 = _mm256_blend_ps(r8, v, 0x33);
  // Stage 3: swap 128-bit halves across register pairs to produce the final
  // transposed rows r1..r8.
  r1 = _mm256_permute2f128_ps(src1, src5, 0x20);
  r2 = _mm256_permute2f128_ps(src2, src6, 0x20);
  r3 = _mm256_permute2f128_ps(src3, src7, 0x20);
  r4 = _mm256_permute2f128_ps(src4, src8, 0x20);
  r5 = _mm256_permute2f128_ps(src1, src5, 0x31);
  r6 = _mm256_permute2f128_ps(src2, src6, 0x31);
  r7 = _mm256_permute2f128_ps(src3, src7, 0x31);
  r8 = _mm256_permute2f128_ps(src4, src8, 0x31);
  // Stores eight rows r1..r8 (project macro).
  STORE256X8_F32(dst_ptr, dst_stride, r);
}
#endif
#if defined(ENABLE_SSE) && !defined(ENABLE_AVX)
// Transposes one 4x4 fp32 tile: loads four rows from src, transposes them via
// unpack/move intrinsics, and stores four rows to dst. Strides are element
// counts (not bytes).
static inline void Transpose4X4Fp32Sse(const float *src, float *dst, int src_stride, int dst_stride) {
  __m128 v0_ma = _mm_loadu_ps(src);
  __m128 v1_ma = _mm_loadu_ps(src + src_stride);
  __m128 v2_ma = _mm_loadu_ps(src + 2 * src_stride);
  __m128 v3_ma = _mm_loadu_ps(src + 3 * src_stride);
  // Interleave 32-bit lanes of row pairs, then recombine 64-bit halves to
  // complete the 4x4 transpose.
  __m128 v4_ma = _mm_unpacklo_ps(v0_ma, v1_ma);
  __m128 v5_ma = _mm_unpackhi_ps(v0_ma, v1_ma);
  __m128 v6_ma = _mm_unpacklo_ps(v2_ma, v3_ma);
  __m128 v7_ma = _mm_unpackhi_ps(v2_ma, v3_ma);
  _mm_storeu_ps(dst, _mm_movelh_ps(v4_ma, v6_ma));
  _mm_storeu_ps(dst + dst_stride, _mm_movehl_ps(v6_ma, v4_ma));
  _mm_storeu_ps(dst + 2 * dst_stride, _mm_movelh_ps(v5_ma, v7_ma));
  _mm_storeu_ps(dst + 3 * dst_stride, _mm_movehl_ps(v7_ma, v5_ma));
}

// Transposes one 8x8 fp32 tile as four independent 4x4 quadrant transposes:
// the source quadrant at (row r, col c) lands in the destination quadrant at
// (row c, col r). Same load/store order as the previous unrolled version, so
// src and dst must not alias. Strides are element counts.
inline void Transpose8X8Fp32Sse(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride) {
  // top-left -> top-left
  Transpose4X4Fp32Sse(src_ptr, dst_ptr, src_stride, dst_stride);
  // top-right -> bottom-left
  Transpose4X4Fp32Sse(src_ptr + C4NUM, dst_ptr + C4NUM * dst_stride, src_stride, dst_stride);
  // bottom-left -> top-right
  Transpose4X4Fp32Sse(src_ptr + C4NUM * src_stride, dst_ptr + C4NUM, src_stride, dst_stride);
  // bottom-right -> bottom-right
  Transpose4X4Fp32Sse(src_ptr + C4NUM * src_stride + C4NUM, dst_ptr + C4NUM * dst_stride + C4NUM, src_stride,
                      dst_stride);
}
#endif
| @@ -1,5 +1,5 @@ | |||||
| /** | /** | ||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -30,8 +30,9 @@ void PackNHWCToNC4HW4Fp32(const void *src, void *dst, int batch, int plane, int | |||||
| void PackNCHWToNC4HW4Fp32(const void *src, void *dst, int batch, int plane, int channel); | void PackNCHWToNC4HW4Fp32(const void *src, void *dst, int batch, int plane, int channel); | ||||
| void PackNHWCToNHWC4Fp32(const void *src, void *dst, int batch, int plane, int channel); | void PackNHWCToNHWC4Fp32(const void *src, void *dst, int batch, int plane, int channel); | ||||
| void PackNHWCToNHWC8Fp32(const void *src, void *dst, int batch, int plane, int channel); | void PackNHWCToNHWC8Fp32(const void *src, void *dst, int batch, int plane, int channel); | ||||
| void PackNHWCToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel); | |||||
| void PackNCHWToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel); | |||||
| // Note: If not multithreaded, please set task_id = 0 and thread_count = 0; | |||||
| void PackNHWCToNCHWFp32(const void *src, void *dst, int batch, int plane, int channel, int task_id, int thread_count); | |||||
| void PackNCHWToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel, int task_id, int thread_count); | |||||
| void PackNHWC4ToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel); | void PackNHWC4ToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel); | ||||
| void PackNC4HW4ToNHWC4Fp32(const void *src, void *dst, int batch, int plane, int channel); | void PackNC4HW4ToNHWC4Fp32(const void *src, void *dst, int batch, int plane, int channel); | ||||
| void PackNC4HW4ToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel); | void PackNC4HW4ToNHWCFp32(const void *src, void *dst, int batch, int plane, int channel); | ||||
| @@ -43,6 +44,21 @@ void PackDepthwiseIndirectWeightC8Fp32(const void *src, void *dst, int height, i | |||||
| void Im2ColPackUnitFp32(const float *input_data, const ConvParameter *conv_param, float *packed_input, int real_cal_num, | void Im2ColPackUnitFp32(const float *input_data, const ConvParameter *conv_param, float *packed_input, int real_cal_num, | ||||
| int block_index); | int block_index); | ||||
| // Transpose 8X8 Fp32 block data | |||||
| typedef void (*Transpose8X8Fp32Func)(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride); | |||||
| #ifdef ENABLE_ARM64 | |||||
| void Transpose8X8Fp32Arm64(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride); | |||||
| #endif | |||||
| #ifdef ENABLE_ARM32 | |||||
| void Transpose8X8Fp32Arm32(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride); | |||||
| #endif | |||||
| #ifdef ENABLE_AVX | |||||
| void Transpose8X8Fp32Avx(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride); | |||||
| #endif | |||||
| #if defined(ENABLE_SSE) && !defined(ENABLE_AVX) | |||||
| void Transpose8X8Fp32Sse(const float *src_ptr, float *dst_ptr, int src_stride, int dst_stride); | |||||
| #endif | |||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||
| } | } | ||||
| #endif | #endif | ||||
| @@ -125,20 +125,73 @@ void TransposeDim5Fp32(const float *in_data, float *out_data, const int *strides | |||||
| } | } | ||||
| } | } | ||||
| void TransposeDimsFp32(const float *in_data, float *out_data, const int *strides, const int *out_strides, | |||||
| const int *perm, const int *output_shape, int dims, int *size, int *position) { | |||||
| *(size + dims - 1) = 1; | |||||
| for (int i = dims - 1; i > 0; --i) { | |||||
| *(size + i - 1) = *(size + i) * output_shape[i]; | |||||
// Specialized 6-D fp32 transpose.
// in_data/out_data: flat input/output buffers (must not alias).
// strides: input strides in element counts, read through perm so walking the
//          output in order gathers the matching input elements.
// out_strides: output strides; the innermost output axis is contiguous, so
//              only the first five entries are used.
// perm: permutation mapping output axes to input axes.
// output_shape: the six output dimensions.
void TransposeDim6Fp32(const float *in_data, float *out_data, const int *strides, const int *out_strides,
                       const int *perm, const int *output_shape) {
  const int stride0 = strides[perm[0]];
  const int stride1 = strides[perm[1]];
  const int stride2 = strides[perm[2]];
  const int stride3 = strides[perm[3]];
  const int stride4 = strides[perm[4]];
  const int stride5 = strides[perm[5]];
  const int out_stride0 = out_strides[0];
  const int out_stride1 = out_strides[1];
  const int out_stride2 = out_strides[2];
  const int out_stride3 = out_strides[3];
  const int out_stride4 = out_strides[4];
  const int output0 = output_shape[0];
  const int output1 = output_shape[1];
  const int output2 = output_shape[2];
  const int output3 = output_shape[3];
  const int output4 = output_shape[4];
  const int output5 = output_shape[5];
  for (int i = 0; i < output0; ++i) {
    int out_stride0_i = i * out_stride0;
    int stride0_i = i * stride0;
    for (int j = 0; j < output1; ++j) {
      int out_stride1_j = j * out_stride1;
      int stride1_j = j * stride1;
      for (int k = 0; k < output2; ++k) {
        int out_stride2_k = k * out_stride2;
        int stride2_k = k * stride2;
        for (int m = 0; m < output3; ++m) {
          int out_stride3_m = m * out_stride3;
          int stride3_m = m * stride3;
          for (int n = 0; n < output4; ++n) {
            // Renamed from *_m to *_n: these index axis 4 (loop variable n),
            // matching the naming in the fp16 counterpart.
            int out_stride4_n = n * out_stride4;
            int stride4_n = n * stride4;
            for (int g = 0; g < output5; ++g) {
              out_data[out_stride0_i + out_stride1_j + out_stride2_k + out_stride3_m + out_stride4_n + g] =
                in_data[stride0_i + stride1_j + stride2_k + stride3_m + stride4_n + g * stride5];
            }
          }
        }
      }
    }
  }
}
| for (size_t idx = 0; idx < (*size) * output_shape[0]; ++idx) { | |||||
| void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_shape, int *size, int *position, | |||||
| TransposeParameter *transpose_param, int task_id, int thread_num) { | |||||
| int *perm = transpose_param->perm_; | |||||
| int *strides = transpose_param->strides_; | |||||
| int *out_strides = transpose_param->out_strides_; | |||||
| int num_axes = transpose_param->num_axes_; | |||||
| size_t data_size = (*size) * output_shape[0]; | |||||
| size_t offset_size = UP_DIV(data_size, thread_num); | |||||
| size_t task_offset = offset_size * task_id; | |||||
| int count = data_size - task_offset; | |||||
| if (count <= 0) { | |||||
| return; | |||||
| } | |||||
| count = MSMIN(offset_size, count); | |||||
| for (size_t idx = task_offset; idx < task_offset + count; ++idx) { | |||||
| int pos = idx; | int pos = idx; | ||||
| int output_idx = 0; | int output_idx = 0; | ||||
| int input_idx = 0; | int input_idx = 0; | ||||
| for (int i = 0; i < dims; ++i) { | |||||
| for (int i = 0; i < num_axes; ++i) { | |||||
| *(position + i) = pos / *(size + i); | *(position + i) = pos / *(size + i); | ||||
| int out_stride = i < dims - 1 ? out_strides[i] : 1; | |||||
| int out_stride = i < num_axes - 1 ? out_strides[i] : 1; | |||||
| output_idx += (*(position + i) * out_stride); | output_idx += (*(position + i) * out_stride); | ||||
| input_idx += (*(position + i) * strides[perm[i]]); | input_idx += (*(position + i) * strides[perm[i]]); | ||||
| pos -= *(position + i) * (*(size + i)); | pos -= *(position + i) * (*(size + i)); | ||||
| @@ -147,8 +200,8 @@ void TransposeDimsFp32(const float *in_data, float *out_data, const int *strides | |||||
| } | } | ||||
| } | } | ||||
| int DoTransposeFp32(const float *in_data, float *out_data, const int *output_shape, TransposeParameter *transpose_param, | |||||
| int *size, int *position) { | |||||
| int DoTransposeFp32(const float *in_data, float *out_data, const int *output_shape, | |||||
| TransposeParameter *transpose_param) { | |||||
| if (in_data == NULL || out_data == NULL) { | if (in_data == NULL || out_data == NULL) { | ||||
| return NNACL_ERR; | return NNACL_ERR; | ||||
| } | } | ||||
| @@ -188,8 +241,10 @@ int DoTransposeFp32(const float *in_data, float *out_data, const int *output_sha | |||||
| TransposeDim4Fp32(in_data, out_data, strides, out_strides, perm, output_shape); | TransposeDim4Fp32(in_data, out_data, strides, out_strides, perm, output_shape); | ||||
| } else if (num_axes == 5) { | } else if (num_axes == 5) { | ||||
| TransposeDim5Fp32(in_data, out_data, strides, out_strides, perm, output_shape); | TransposeDim5Fp32(in_data, out_data, strides, out_strides, perm, output_shape); | ||||
| } else if (num_axes == 6) { | |||||
| TransposeDim6Fp32(in_data, out_data, strides, out_strides, perm, output_shape); | |||||
| } else { | } else { | ||||
| TransposeDimsFp32(in_data, out_data, strides, out_strides, perm, output_shape, num_axes, size, position); | |||||
| return NNACL_ERR; | |||||
| } | } | ||||
| return NNACL_OK; | return NNACL_OK; | ||||
| } | } | ||||
| @@ -25,9 +25,9 @@ | |||||
| extern "C" { | extern "C" { | ||||
| #endif | #endif | ||||
| int DoTransposeFp32(const float *in_data, float *out_data, const int *output_shape, TransposeParameter *transpose_param, | |||||
| int *size, int *position); | |||||
| int DoTransposeFp32(const float *in_data, float *out_data, const int *output_shape, TransposeParameter *param); | |||||
| void TransposeDimsFp32(const float *in_data, float *out_data, const int *output_shape, int *size, int *position, | |||||
| TransposeParameter *transpose_param, int task_id, int thread_num); | |||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||
| } | } | ||||
| #endif | #endif | ||||
| @@ -1,140 +0,0 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifdef ENABLE_SSE | |||||
| #include <x86intrin.h> | |||||
| #include "nnacl/pack.h" | |||||
| #include "nnacl/int8/conv_int8.h" | |||||
| void PackNHWCToNCHWFp32(const void *src, void *dst, int batches, int plane, int channel) { | |||||
| int hw8 = plane / C8NUM * C8NUM; | |||||
| int c8 = channel / C8NUM * C8NUM; | |||||
| int batch = plane * channel; | |||||
| for (int n = 0; n < batches; n++) { | |||||
| const float *src_batch = (const float *)src + n * batch; | |||||
| float *dst_batch = (float *)dst + n * batch; | |||||
| int hw = 0; | |||||
| for (; hw < hw8; hw += C8NUM) { | |||||
| int c = 0; | |||||
| for (; c < c8; c += C8NUM) { | |||||
| const float *src_ptr = src_batch + hw * channel + c; | |||||
| float *dst_ptr = dst_batch + c * plane + hw; | |||||
| // 11-14 | |||||
| __m128 v0_ma = _mm_loadu_ps(src_ptr); | |||||
| __m128 v1_ma = _mm_loadu_ps(src_ptr + channel); | |||||
| __m128 v2_ma = _mm_loadu_ps(src_ptr + 2 * channel); | |||||
| __m128 v3_ma = _mm_loadu_ps(src_ptr + 3 * channel); | |||||
| __m128 v4_ma = _mm_unpacklo_ps(v0_ma, v1_ma); | |||||
| __m128 v5_ma = _mm_unpackhi_ps(v0_ma, v1_ma); | |||||
| __m128 v6_ma = _mm_unpacklo_ps(v2_ma, v3_ma); | |||||
| __m128 v7_ma = _mm_unpackhi_ps(v2_ma, v3_ma); | |||||
| __m128 v8_ma = _mm_movelh_ps(v4_ma, v6_ma); | |||||
| __m128 v9_ma = _mm_movehl_ps(v6_ma, v4_ma); | |||||
| __m128 v10_ma = _mm_movelh_ps(v5_ma, v7_ma); | |||||
| __m128 v11_ma = _mm_movehl_ps(v7_ma, v5_ma); | |||||
| _mm_storeu_ps(dst_ptr, v8_ma); | |||||
| _mm_storeu_ps(dst_ptr + plane, v9_ma); | |||||
| _mm_storeu_ps(dst_ptr + 2 * plane, v10_ma); | |||||
| _mm_storeu_ps(dst_ptr + 3 * plane, v11_ma); | |||||
| // 15-18 | |||||
| v0_ma = _mm_loadu_ps(src_ptr + C4NUM); | |||||
| v1_ma = _mm_loadu_ps(src_ptr + channel + C4NUM); | |||||
| v2_ma = _mm_loadu_ps(src_ptr + 2 * channel + C4NUM); | |||||
| v3_ma = _mm_loadu_ps(src_ptr + 3 * channel + C4NUM); | |||||
| v4_ma = _mm_unpacklo_ps(v0_ma, v1_ma); | |||||
| v5_ma = _mm_unpackhi_ps(v0_ma, v1_ma); | |||||
| v6_ma = _mm_unpacklo_ps(v2_ma, v3_ma); | |||||
| v7_ma = _mm_unpackhi_ps(v2_ma, v3_ma); | |||||
| v8_ma = _mm_movelh_ps(v4_ma, v6_ma); | |||||
| v9_ma = _mm_movehl_ps(v6_ma, v4_ma); | |||||
| v10_ma = _mm_movelh_ps(v5_ma, v7_ma); | |||||
| v11_ma = _mm_movehl_ps(v7_ma, v5_ma); | |||||
| _mm_storeu_ps(dst_ptr + C4NUM * plane, v8_ma); | |||||
| _mm_storeu_ps(dst_ptr + (C4NUM + 1) * plane, v9_ma); | |||||
| _mm_storeu_ps(dst_ptr + (C4NUM + 2) * plane, v10_ma); | |||||
| _mm_storeu_ps(dst_ptr + (C4NUM + 3) * plane, v11_ma); | |||||
| // 21-24 | |||||
| v0_ma = _mm_loadu_ps(src_ptr + C4NUM * channel); | |||||
| v1_ma = _mm_loadu_ps(src_ptr + (C4NUM + 1) * channel); | |||||
| v2_ma = _mm_loadu_ps(src_ptr + (C4NUM + 2) * channel); | |||||
| v3_ma = _mm_loadu_ps(src_ptr + (C4NUM + 3) * channel); | |||||
| v4_ma = _mm_unpacklo_ps(v0_ma, v1_ma); | |||||
| v5_ma = _mm_unpackhi_ps(v0_ma, v1_ma); | |||||
| v6_ma = _mm_unpacklo_ps(v2_ma, v3_ma); | |||||
| v7_ma = _mm_unpackhi_ps(v2_ma, v3_ma); | |||||
| v8_ma = _mm_movelh_ps(v4_ma, v6_ma); | |||||
| v9_ma = _mm_movehl_ps(v6_ma, v4_ma); | |||||
| v10_ma = _mm_movelh_ps(v5_ma, v7_ma); | |||||
| v11_ma = _mm_movehl_ps(v7_ma, v5_ma); | |||||
| _mm_storeu_ps(dst_ptr + C4NUM, v8_ma); | |||||
| _mm_storeu_ps(dst_ptr + plane + C4NUM, v9_ma); | |||||
| _mm_storeu_ps(dst_ptr + 2 * plane + C4NUM, v10_ma); | |||||
| _mm_storeu_ps(dst_ptr + 3 * plane + C4NUM, v11_ma); | |||||
| // 25-28 | |||||
| v0_ma = _mm_loadu_ps(src_ptr + C4NUM * channel + C4NUM); | |||||
| v1_ma = _mm_loadu_ps(src_ptr + (C4NUM + 1) * channel + C4NUM); | |||||
| v2_ma = _mm_loadu_ps(src_ptr + (C4NUM + 2) * channel + C4NUM); | |||||
| v3_ma = _mm_loadu_ps(src_ptr + (C4NUM + 3) * channel + C4NUM); | |||||
| v4_ma = _mm_unpacklo_ps(v0_ma, v1_ma); | |||||
| v5_ma = _mm_unpackhi_ps(v0_ma, v1_ma); | |||||
| v6_ma = _mm_unpacklo_ps(v2_ma, v3_ma); | |||||
| v7_ma = _mm_unpackhi_ps(v2_ma, v3_ma); | |||||
| v8_ma = _mm_movelh_ps(v4_ma, v6_ma); | |||||
| v9_ma = _mm_movehl_ps(v6_ma, v4_ma); | |||||
| v10_ma = _mm_movelh_ps(v5_ma, v7_ma); | |||||
| v11_ma = _mm_movehl_ps(v7_ma, v5_ma); | |||||
| _mm_storeu_ps(dst_ptr + C4NUM * plane + C4NUM, v8_ma); | |||||
| _mm_storeu_ps(dst_ptr + (C4NUM + 1) * plane + C4NUM, v9_ma); | |||||
| _mm_storeu_ps(dst_ptr + (C4NUM + 2) * plane + C4NUM, v10_ma); | |||||
| _mm_storeu_ps(dst_ptr + (C4NUM + 3) * plane + C4NUM, v11_ma); | |||||
| } | |||||
| for (; c < channel; c++) { | |||||
| const float *src_ptr = src_batch + hw * channel + c; | |||||
| float *dst_ptr = dst_batch + c * plane + hw; | |||||
| for (size_t i = 0; i < C8NUM; i++) { | |||||
| dst_ptr[i] = src_ptr[i * channel]; | |||||
| } | |||||
| } | |||||
| } | |||||
| for (; hw < plane; hw++) { | |||||
| const float *src_ptr = src_batch + hw * channel; | |||||
| float *dst_ptr = dst_batch + hw; | |||||
| for (size_t i = 0; i < channel; i++) { | |||||
| dst_ptr[i * plane] = src_ptr[i]; | |||||
| } | |||||
| } | |||||
| } | |||||
| return; | |||||
| } | |||||
| #endif | |||||
| @@ -19,7 +19,7 @@ | |||||
| #include "nnacl/op_base.h" | #include "nnacl/op_base.h" | ||||
| #define MAX_TRANSPOSE_DIM_SIZE 5 | |||||
| #define MAX_TRANSPOSE_DIM_SIZE 6 | |||||
| typedef struct TransposeParameter { | typedef struct TransposeParameter { | ||||
| // primitive parameter | // primitive parameter | ||||
| @@ -22,6 +22,7 @@ | |||||
| using mindspore::lite::KernelRegistrar; | using mindspore::lite::KernelRegistrar; | ||||
| using mindspore::lite::RET_ERROR; | using mindspore::lite::RET_ERROR; | ||||
| using mindspore::lite::RET_NULL_PTR; | |||||
| using mindspore::lite::RET_OK; | using mindspore::lite::RET_OK; | ||||
| using mindspore::lite::RET_OP_EXECUTE_FAILURE; | using mindspore::lite::RET_OP_EXECUTE_FAILURE; | ||||
| using mindspore::schema::PrimitiveType_Transpose; | using mindspore::schema::PrimitiveType_Transpose; | ||||
| @@ -82,31 +83,46 @@ TransposeCPUKernel::~TransposeCPUKernel() { | |||||
| } | } | ||||
| } | } | ||||
| int TransposeCPUKernel::NhNcTranspose(lite::Tensor *in_tensor, lite::Tensor *out_tensor, TransposeParameter *param) { | |||||
| void TransposeCPUKernel::GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor, | |||||
| TransposeParameter *param) { | |||||
| auto out_shape = out_tensor->shape(); | auto out_shape = out_tensor->shape(); | ||||
| if (in_tensor->shape().size() == 4 && param->perm_[0] == 0 && param->perm_[1] == 2 && param->perm_[2] == 3 && | if (in_tensor->shape().size() == 4 && param->perm_[0] == 0 && param->perm_[1] == 2 && param->perm_[2] == 3 && | ||||
| param->perm_[3] == 1) { | param->perm_[3] == 1) { | ||||
| nhnc_param_[0] = out_shape[0]; | |||||
| nhnc_param_[1] = out_shape[1] * out_shape[2]; | |||||
| nhnc_param_[2] = out_shape[3]; | |||||
| if (in_tensor->data_type() == kNumberTypeFloat32) { | if (in_tensor->data_type() == kNumberTypeFloat32) { | ||||
| PackNCHWToNHWCFp32(in_tensor->MutableData(), out_tensor->MutableData(), out_shape[0], out_shape[1] * out_shape[2], | |||||
| out_shape[3]); | |||||
| } else if (in_tensor->data_type() == kNumberTypeInt8) { | |||||
| PackNCHWToNHWCInt8(in_tensor->MutableData(), out_tensor->MutableData(), out_shape[0], out_shape[1] * out_shape[2], | |||||
| out_shape[3]); | |||||
| NHNCTransposeFunc_ = PackNCHWToNHWCFp32; | |||||
| } | } | ||||
| return RET_OK; | |||||
| } | } | ||||
| if (in_tensor->shape().size() == 4 && param->perm_[0] == 0 && param->perm_[1] == 3 && param->perm_[2] == 1 && | if (in_tensor->shape().size() == 4 && param->perm_[0] == 0 && param->perm_[1] == 3 && param->perm_[2] == 1 && | ||||
| param->perm_[3] == 2) { | param->perm_[3] == 2) { | ||||
| nhnc_param_[0] = out_shape[0]; | |||||
| nhnc_param_[1] = out_shape[2] * out_shape[3]; | |||||
| nhnc_param_[2] = out_shape[1]; | |||||
| if (in_tensor->data_type() == kNumberTypeFloat32) { | if (in_tensor->data_type() == kNumberTypeFloat32) { | ||||
| PackNHWCToNCHWFp32(in_tensor->MutableData(), out_tensor->MutableData(), out_shape[0], out_shape[2] * out_shape[3], | |||||
| out_shape[1]); | |||||
| } else if (in_tensor->data_type() == kNumberTypeInt8) { | |||||
| PackNHWCToNCHWInt8(in_tensor->MutableData(), out_tensor->MutableData(), out_shape[0], out_shape[2] * out_shape[3], | |||||
| out_shape[1]); | |||||
| NHNCTransposeFunc_ = PackNHWCToNCHWFp32; | |||||
| } | } | ||||
| return RET_OK; | |||||
| } | } | ||||
| return RET_ERROR; | |||||
| } | |||||
| int TransposeCPUKernel::RunImpl(int task_id) { | |||||
| if (NHNCTransposeFunc_ != nullptr) { | |||||
| NHNCTransposeFunc_(in_data_, out_data_, nhnc_param_[0], nhnc_param_[1], nhnc_param_[2], task_id, thread_count_); | |||||
| } else { | |||||
| TransposeDimsFp32(in_data_, out_data_, out_shape_, dim_size_, position_ + dims_ * task_id, param_, task_id, | |||||
| thread_count_); | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int TransposeImpl(void *kernel, int task_id) { | |||||
| auto transpose = reinterpret_cast<TransposeCPUKernel *>(kernel); | |||||
| auto ret = transpose->RunImpl(task_id); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "TransposeImpl Run error task_id[" << task_id << "] error_code[" << ret << "]"; | |||||
| } | |||||
| return ret; | |||||
| } | } | ||||
| int TransposeCPUKernel::Run() { | int TransposeCPUKernel::Run() { | ||||
| @@ -123,8 +139,8 @@ int TransposeCPUKernel::Run() { | |||||
| MS_ASSERT(in_data_); | MS_ASSERT(in_data_); | ||||
| MS_ASSERT(out_data_); | MS_ASSERT(out_data_); | ||||
| TransposeParameter *param = reinterpret_cast<TransposeParameter *>(this->op_parameter_); | |||||
| if (in_tensor->shape().size() != static_cast<size_t>(param->num_axes_)) { | |||||
| param_ = reinterpret_cast<TransposeParameter *>(this->op_parameter_); | |||||
| if (in_tensor->shape().size() != static_cast<size_t>(param_->num_axes_)) { | |||||
| memcpy(out_data_, in_data_, in_tensor->ElementsNum() * sizeof(float)); | memcpy(out_data_, in_data_, in_tensor->ElementsNum() * sizeof(float)); | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -134,40 +150,48 @@ int TransposeCPUKernel::Run() { | |||||
| MS_ASSERT(input_perm->data_c() != nullptr); | MS_ASSERT(input_perm->data_c() != nullptr); | ||||
| int *perm_data = reinterpret_cast<int *>(input_perm->data_c()); | int *perm_data = reinterpret_cast<int *>(input_perm->data_c()); | ||||
| for (int i = 0; i < input_perm->ElementsNum(); ++i) { | for (int i = 0; i < input_perm->ElementsNum(); ++i) { | ||||
| param->perm_[i] = perm_data[i]; | |||||
| param_->perm_[i] = perm_data[i]; | |||||
| } | } | ||||
| for (int i = input_perm->ElementsNum(); i < MAX_SHAPE_SIZE; ++i) { | for (int i = input_perm->ElementsNum(); i < MAX_SHAPE_SIZE; ++i) { | ||||
| param->perm_[i] = 0; | |||||
| param_->perm_[i] = 0; | |||||
| } | } | ||||
| } | } | ||||
| auto ret = NhNcTranspose(in_tensor, out_tensor, param); | |||||
| if (ret == RET_OK) { | |||||
| thread_count_ = op_parameter_->thread_num_; | |||||
| GetNHNCTransposeFunc(in_tensor, out_tensor, param_); | |||||
| if (NHNCTransposeFunc_ != nullptr) { | |||||
| auto ret = ParallelLaunch(this->context_->thread_pool_, TransposeImpl, this, thread_count_); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "NHNCTransposeFunc_ is error!"; | |||||
| } | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| if (in_tensor->data_type() == kNumberTypeInt8) { | |||||
| MS_LOG(ERROR) << "not support now"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| int dims = out_tensor->shape().size(); | |||||
| if (dims > MAX_TRANSPOSE_DIM_SIZE) { | |||||
| dim_size_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int))); | |||||
| MS_ASSERT(out_shape_); | |||||
| dims_ = out_tensor->shape().size(); | |||||
| if (dims_ > MAX_TRANSPOSE_DIM_SIZE) { | |||||
| dim_size_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims_ * sizeof(int))); | |||||
| if (dim_size_ == nullptr) { | if (dim_size_ == nullptr) { | ||||
| MS_LOG(ERROR) << "Malloc data failed"; | MS_LOG(ERROR) << "Malloc data failed"; | ||||
| return RET_ERROR; | |||||
| return RET_NULL_PTR; | |||||
| } | } | ||||
| position_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims * sizeof(int))); | |||||
| *(dim_size_ + dims_ - 1) = 1; | |||||
| for (int i = dims_ - 1; i > 0; --i) { | |||||
| *(dim_size_ + i - 1) = *(dim_size_ + i) * out_shape_[i]; | |||||
| } | |||||
| position_ = reinterpret_cast<int *>(context_->allocator->Malloc(dims_ * sizeof(int) * thread_count_)); | |||||
| if (position_ == nullptr) { | if (position_ == nullptr) { | ||||
| MS_LOG(ERROR) << "Malloc data failed"; | |||||
| context_->allocator->Free(dim_size_); | context_->allocator->Free(dim_size_); | ||||
| dim_size_ = nullptr; | |||||
| return RET_ERROR; | |||||
| MS_LOG(ERROR) << "Malloc data failed"; | |||||
| return RET_NULL_PTR; | |||||
| } | } | ||||
| } | } | ||||
| MS_ASSERT(out_shape_); | |||||
| ret = DoTransposeFp32(in_data_, out_data_, out_shape_, param, dim_size_, position_); | |||||
| if (dims > MAX_TRANSPOSE_DIM_SIZE) { | |||||
| int ret; | |||||
| if (dims_ > MAX_TRANSPOSE_DIM_SIZE) { | |||||
| ret = ParallelLaunch(this->context_->thread_pool_, TransposeImpl, this, thread_count_); | |||||
| } else { | |||||
| ret = DoTransposeFp32(in_data_, out_data_, out_shape_, param_); | |||||
| } | |||||
| if (dims_ > MAX_TRANSPOSE_DIM_SIZE) { | |||||
| context_->allocator->Free(dim_size_); | context_->allocator->Free(dim_size_); | ||||
| context_->allocator->Free(position_); | context_->allocator->Free(position_); | ||||
| dim_size_ = nullptr; | dim_size_ = nullptr; | ||||
| @@ -175,13 +199,10 @@ int TransposeCPUKernel::Run() { | |||||
| } | } | ||||
| if (ret != RET_OK) { | if (ret != RET_OK) { | ||||
| MS_LOG(ERROR) << "Transpose run failed"; | MS_LOG(ERROR) << "Transpose run failed"; | ||||
| return RET_ERROR; | |||||
| } | } | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Transpose, LiteKernelCreator<TransposeCPUKernel>) | REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Transpose, LiteKernelCreator<TransposeCPUKernel>) | ||||
| REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_Transpose, LiteKernelCreator<TransposeCPUKernel>) | REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_Transpose, LiteKernelCreator<TransposeCPUKernel>) | ||||
| REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Transpose, LiteKernelCreator<TransposeCPUKernel>) | |||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -25,6 +25,10 @@ | |||||
| #include "src/kernel_registry.h" | #include "src/kernel_registry.h" | ||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| typedef void (*TransposeFunc)(const void *src, void *dst, int batch, int plane, int channel, int thread_num, | |||||
| int task_id); | |||||
| class TransposeCPUKernel : public LiteKernel { | class TransposeCPUKernel : public LiteKernel { | ||||
| public: | public: | ||||
| explicit TransposeCPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs, | explicit TransposeCPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs, | ||||
| @@ -35,14 +39,20 @@ class TransposeCPUKernel : public LiteKernel { | |||||
| int Init() override; | int Init() override; | ||||
| int ReSize() override; | int ReSize() override; | ||||
| int Run() override; | int Run() override; | ||||
| int NhNcTranspose(lite::Tensor *in_tensor, lite::Tensor *out_tensor, TransposeParameter *param); | |||||
| int RunImpl(int task_id); | |||||
| protected: | protected: | ||||
| void GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor, TransposeParameter *param); | |||||
| float *in_data_ = nullptr; | float *in_data_ = nullptr; | ||||
| float *out_data_ = nullptr; | float *out_data_ = nullptr; | ||||
| int *out_shape_ = nullptr; | int *out_shape_ = nullptr; | ||||
| int *dim_size_ = nullptr; | int *dim_size_ = nullptr; | ||||
| int *position_ = nullptr; | int *position_ = nullptr; | ||||
| TransposeParameter *param_ = nullptr; | |||||
| TransposeFunc NHNCTransposeFunc_ = nullptr; | |||||
| int thread_count_ = 0; | |||||
| int nhnc_param_[3]; | |||||
| int dims_ = 0; | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -141,6 +141,25 @@ int TransposeInt8CPUKernel::DoTranspose(int task_id) { | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| void TransposeInt8CPUKernel::GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor, | |||||
| TransposeParameter *param) { | |||||
| auto out_shape = out_tensor->shape(); | |||||
| if (in_tensor->shape().size() == 4 && param->perm_[0] == 0 && param->perm_[1] == 2 && param->perm_[2] == 3 && | |||||
| param->perm_[3] == 1) { | |||||
| nhnc_param_[0] = out_shape[0]; | |||||
| nhnc_param_[1] = out_shape[1] * out_shape[2]; | |||||
| nhnc_param_[2] = out_shape[3]; | |||||
| NHNCTransposeFunc_ = PackNCHWToNHWCInt8; | |||||
| } | |||||
| if (in_tensor->shape().size() == 4 && param->perm_[0] == 0 && param->perm_[1] == 3 && param->perm_[2] == 1 && | |||||
| param->perm_[3] == 2) { | |||||
| nhnc_param_[0] = out_shape[0]; | |||||
| nhnc_param_[1] = out_shape[2] * out_shape[3]; | |||||
| nhnc_param_[2] = out_shape[1]; | |||||
| NHNCTransposeFunc_ = PackNHWCToNCHWInt8; | |||||
| } | |||||
| } | |||||
| int TransposeInt8CPUKernel::Run() { | int TransposeInt8CPUKernel::Run() { | ||||
| auto in_tensor = in_tensors_.front(); | auto in_tensor = in_tensors_.front(); | ||||
| auto out_tensor = out_tensors_.front(); | auto out_tensor = out_tensors_.front(); | ||||
| @@ -150,7 +169,11 @@ int TransposeInt8CPUKernel::Run() { | |||||
| in_ptr_ = reinterpret_cast<int8_t *>(in_tensor->data_c()); | in_ptr_ = reinterpret_cast<int8_t *>(in_tensor->data_c()); | ||||
| out_ptr_ = reinterpret_cast<int8_t *>(out_tensor->data_c()); | out_ptr_ = reinterpret_cast<int8_t *>(out_tensor->data_c()); | ||||
| GetNHNCTransposeFunc(in_tensor, out_tensor, transpose_param_); | |||||
| if (NHNCTransposeFunc_ != nullptr) { | |||||
| NHNCTransposeFunc_(in_ptr_, out_ptr_, nhnc_param_[0], nhnc_param_[1], nhnc_param_[2]); | |||||
| return RET_OK; | |||||
| } | |||||
| memcpy(in_shape_, in_dims.data(), in_dims.size() * sizeof(int)); | memcpy(in_shape_, in_dims.data(), in_dims.size() * sizeof(int)); | ||||
| memcpy(out_shape_, out_dims.data(), out_dims.size() * sizeof(int)); | memcpy(out_shape_, out_dims.data(), out_dims.size() * sizeof(int)); | ||||
| @@ -17,12 +17,16 @@ | |||||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_TRANSPOSE_INT8_H_ | #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_TRANSPOSE_INT8_H_ | ||||
| #include <vector> | #include <vector> | ||||
| #include "nnacl/int8/pack_int8.h" | |||||
| #include "nnacl/int8/transpose_int8.h" | #include "nnacl/int8/transpose_int8.h" | ||||
| #include "src/kernel_registry.h" | #include "src/kernel_registry.h" | ||||
| #include "src/lite_kernel.h" | #include "src/lite_kernel.h" | ||||
| #include "include/errorcode.h" | #include "include/errorcode.h" | ||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| typedef void (*TransposeFunc)(const void *src, void *dst, int batch, int plane, int channel); | |||||
| class TransposeInt8CPUKernel : public LiteKernel { | class TransposeInt8CPUKernel : public LiteKernel { | ||||
| public: | public: | ||||
| TransposeInt8CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | TransposeInt8CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | ||||
| @@ -44,7 +48,9 @@ class TransposeInt8CPUKernel : public LiteKernel { | |||||
| void FreeTmpBuf(); | void FreeTmpBuf(); | ||||
| private: | private: | ||||
| void GetNHNCTransposeFunc(lite::Tensor *in_tensor, lite::Tensor *out_tensor, TransposeParameter *param); | |||||
| TransposeParameter *transpose_param_; | TransposeParameter *transpose_param_; | ||||
| TransposeFunc NHNCTransposeFunc_ = nullptr; | |||||
| int8_t *in_ptr_ = nullptr; | int8_t *in_ptr_ = nullptr; | ||||
| int8_t *out_ptr_ = nullptr; | int8_t *out_ptr_ = nullptr; | ||||
| int *dim_size_ = nullptr; | int *dim_size_ = nullptr; | ||||
| @@ -56,6 +62,7 @@ class TransposeInt8CPUKernel : public LiteKernel { | |||||
| int num_unit_ = 0; | int num_unit_ = 0; | ||||
| int in_shape_[8] = {0}; | int in_shape_[8] = {0}; | ||||
| int out_shape_[8] = {0}; | int out_shape_[8] = {0}; | ||||
| int nhnc_param_[3]; | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -47,7 +47,7 @@ int ConvolutionBaseNPUKernel::InitWeightConst(const std::vector<lite::Tensor *> | |||||
| MS_LOG(ERROR) << "Malloc buffer failed."; | MS_LOG(ERROR) << "Malloc buffer failed."; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| PackNHWCToNCHWFp32(nhwc_data, nchw_data, w_shape[0], w_shape[1] * w_shape[2], w_shape[3]); | |||||
| PackNHWCToNCHWFp32(nhwc_data, nchw_data, w_shape[0], w_shape[1] * w_shape[2], w_shape[3], 0, 0); | |||||
| std::shared_ptr<ge::Tensor> weight_tensor = std::shared_ptr<ge::Tensor>(new (std::nothrow) ge::Tensor()); | std::shared_ptr<ge::Tensor> weight_tensor = std::shared_ptr<ge::Tensor>(new (std::nothrow) ge::Tensor()); | ||||
| if (weight_tensor == nullptr) { | if (weight_tensor == nullptr) { | ||||
| @@ -335,7 +335,8 @@ int DeConvTestInit1(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens | |||||
| 0.2642501, 0.29840338, 0.38820496, 0.37829784, 0.105839334, 0.07713295, 0.45629853, 0.9290373, 0.56323594, | 0.2642501, 0.29840338, 0.38820496, 0.37829784, 0.105839334, 0.07713295, 0.45629853, 0.9290373, 0.56323594, | ||||
| 0.59976774, 0.48325357, 0.102543674, 0.35449505, 0.3158472, 0.02927611, 0.44739273, 0.0516185, 0.12340133, | 0.59976774, 0.48325357, 0.102543674, 0.35449505, 0.3158472, 0.02927611, 0.44739273, 0.0516185, 0.12340133, | ||||
| 0.13908496, 0.54970616, 0.74672216, 0.673308, 0.6400629, 0.26790652, 0.98673576}; /* nhwc */ | 0.13908496, 0.54970616, 0.74672216, 0.673308, 0.6400629, 0.26790652, 0.98673576}; /* nhwc */ | ||||
| PackNCHWToNHWCFp32(in_nchw, in_t->MutableData(), in_t->Batch(), in_t->Width() * in_t->Height(), in_t->Channel()); | |||||
| PackNCHWToNHWCFp32(in_nchw, in_t->MutableData(), in_t->Batch(), in_t->Width() * in_t->Height(), in_t->Channel(), 0, | |||||
| 0); | |||||
| inputs_->push_back(in_t); | inputs_->push_back(in_t); | ||||
| std::vector<int> weight_dims_nhwc = {2, 3, 3, 6}; | std::vector<int> weight_dims_nhwc = {2, 3, 3, 6}; | ||||
| @@ -358,7 +359,7 @@ int DeConvTestInit1(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens | |||||
| 0.06060236, 0.10848369, -0.4512424, 0.023834296, 0.1643943, -0.25290534, 0.066953085, -0.11685201, | 0.06060236, 0.10848369, -0.4512424, 0.023834296, 0.1643943, -0.25290534, 0.066953085, -0.11685201, | ||||
| -0.4159784, 0.37839416, -0.11141268, -0.15986018}; /* nhwc */ | -0.4159784, 0.37839416, -0.11141268, -0.15986018}; /* nhwc */ | ||||
| PackNCHWToNHWCFp32(weight_nchw, weight_t->MutableData(), weight_t->Batch(), weight_t->Width() * weight_t->Height(), | PackNCHWToNHWCFp32(weight_nchw, weight_t->MutableData(), weight_t->Batch(), weight_t->Width() * weight_t->Height(), | ||||
| weight_t->Channel()); | |||||
| weight_t->Channel(), 0, 0); | |||||
| inputs_->push_back(weight_t); | inputs_->push_back(weight_t); | ||||
| auto *bias_t = new lite::Tensor(kNumberTypeFloat, {6}, schema::Format_NHWC, lite::Tensor::Category::CONST_TENSOR); | auto *bias_t = new lite::Tensor(kNumberTypeFloat, {6}, schema::Format_NHWC, lite::Tensor::Category::CONST_TENSOR); | ||||
| @@ -463,7 +464,7 @@ int DeConvTestInit1(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens | |||||
| 0.8622399, 0.47823763, 0.8856161, 0.6762785, 0.73437214, 0.3766058, 0.764144, 0.60693324, | 0.8622399, 0.47823763, 0.8856161, 0.6762785, 0.73437214, 0.3766058, 0.764144, 0.60693324, | ||||
| 0.89371794, 0.92908806, 0.7702812, 0.79492164, 0.58807003, 0.678272, 0.4573259, 0.7444603, | 0.89371794, 0.92908806, 0.7702812, 0.79492164, 0.58807003, 0.678272, 0.4573259, 0.7444603, | ||||
| 0.49847388, 0.84439206, 0.51984715, 0.9452883, 0.7511028, 0.81281227}; | 0.49847388, 0.84439206, 0.51984715, 0.9452883, 0.7511028, 0.81281227}; | ||||
| PackNCHWToNHWCFp32(nchw_co, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel()); | |||||
| PackNCHWToNHWCFp32(nchw_co, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel(), 0, 0); | |||||
| conv_param->kernel_h_ = conv_param->kernel_w_ = 3; | conv_param->kernel_h_ = conv_param->kernel_w_ = 3; | ||||
| conv_param->stride_h_ = conv_param->stride_w_ = 2; | conv_param->stride_h_ = conv_param->stride_w_ = 2; | ||||
| @@ -531,7 +532,7 @@ int DeConvTestInit2(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens | |||||
| -11.827093, -12.340071, -2.6368382, -14.432123, -8.483799, -12.28651, 0.80561405, | -11.827093, -12.340071, -2.6368382, -14.432123, -8.483799, -12.28651, 0.80561405, | ||||
| 11.332421, -0.43688506, -3.476327, -4.587028, -1.9491882, -3.3619316, -15.831648, | 11.332421, -0.43688506, -3.476327, -4.587028, -1.9491882, -3.3619316, -15.831648, | ||||
| -10.517606, -9.204161, -0.15148449, 1.5822954, -10.122691, -4.7448387, 3.99177}; | -10.517606, -9.204161, -0.15148449, 1.5822954, -10.122691, -4.7448387, 3.99177}; | ||||
| PackNCHWToNHWCFp32(nchw_co, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel()); | |||||
| PackNCHWToNHWCFp32(nchw_co, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel(), 0, 0); | |||||
| conv_param->kernel_h_ = conv_param->kernel_w_ = 3; | conv_param->kernel_h_ = conv_param->kernel_w_ = 3; | ||||
| conv_param->stride_h_ = conv_param->stride_w_ = 2; | conv_param->stride_h_ = conv_param->stride_w_ = 2; | ||||
| @@ -571,7 +572,7 @@ int DeConvTestInit3(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens | |||||
| 0.26498786, 0.6701024, 0.9744634, 0.49075702, 0.03877404, 0.48646277, | 0.26498786, 0.6701024, 0.9744634, 0.49075702, 0.03877404, 0.48646277, | ||||
| 0.5473929, 0.32438126, 0.87553847, 0.75820315, 0.86666644, 0.4852329}; | 0.5473929, 0.32438126, 0.87553847, 0.75820315, 0.86666644, 0.4852329}; | ||||
| PackNCHWToNHWCFp32(in_nchw, reinterpret_cast<float *>(in_t->MutableData()), in_t->Batch(), | PackNCHWToNHWCFp32(in_nchw, reinterpret_cast<float *>(in_t->MutableData()), in_t->Batch(), | ||||
| in_t->Width() * in_t->Height(), in_t->Channel()); | |||||
| in_t->Width() * in_t->Height(), in_t->Channel(), 0, 0); | |||||
| inputs_->push_back(in_t); | inputs_->push_back(in_t); | ||||
| std::vector<int> w_dims_nhwc = {2, 2, 2, 2}; | std::vector<int> w_dims_nhwc = {2, 2, 2, 2}; | ||||
| @@ -582,7 +583,7 @@ int DeConvTestInit3(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens | |||||
| -0.34362152, 0.7557833, 0.16503833, 0.2418737, -0.26612744, 0.5072577, | -0.34362152, 0.7557833, 0.16503833, 0.2418737, -0.26612744, 0.5072577, | ||||
| -0.4284475, 0.2215941, 0.9273913, 0.34634787}; | -0.4284475, 0.2215941, 0.9273913, 0.34634787}; | ||||
| PackNCHWToNHWCFp32(w_nchw, weight_t->MutableData(), weight_t->Batch(), weight_t->Width() * weight_t->Height(), | PackNCHWToNHWCFp32(w_nchw, weight_t->MutableData(), weight_t->Batch(), weight_t->Width() * weight_t->Height(), | ||||
| weight_t->Channel()); | |||||
| weight_t->Channel(), 0, 0); | |||||
| inputs_->push_back(weight_t); | inputs_->push_back(weight_t); | ||||
| std::vector<int> out_dims_nhwc = {1, 9, 9, 2}; | std::vector<int> out_dims_nhwc = {1, 9, 9, 2}; | ||||
| @@ -609,7 +610,7 @@ int DeConvTestInit3(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens | |||||
| -0.026721025, 0.0, 0.24602996, 0.38258934, 0.0, 0.38933694, 0.88844025, 0.0, 0.3944222, | -0.026721025, 0.0, 0.24602996, 0.38258934, 0.0, 0.38933694, 0.88844025, 0.0, 0.3944222, | ||||
| 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, | 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, | ||||
| 0.6120955, 0.0, 0.46287543, 0.57347727, 0.0, 0.80662024, 0.11515418, 0.0, 0.90454257}; | 0.6120955, 0.0, 0.46287543, 0.57347727, 0.0, 0.80662024, 0.11515418, 0.0, 0.90454257}; | ||||
| PackNCHWToNHWCFp32(nchw_co, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel()); | |||||
| PackNCHWToNHWCFp32(nchw_co, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel(), 0, 0); | |||||
| conv_param->kernel_h_ = conv_param->kernel_w_ = 2; | conv_param->kernel_h_ = conv_param->kernel_w_ = 2; | ||||
| conv_param->stride_h_ = conv_param->stride_w_ = 3; | conv_param->stride_h_ = conv_param->stride_w_ = 3; | ||||
| @@ -658,7 +659,7 @@ int DeConvTestInit4(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens | |||||
| std::string weight_path = "./deconv/deconv_fp32_nchw_weight1.bin"; | std::string weight_path = "./deconv/deconv_fp32_nchw_weight1.bin"; | ||||
| auto weight_nchw = reinterpret_cast<float *>(mindspore::lite::ReadFile(weight_path.c_str(), &buffer_size)); | auto weight_nchw = reinterpret_cast<float *>(mindspore::lite::ReadFile(weight_path.c_str(), &buffer_size)); | ||||
| PackNCHWToNHWCFp32(weight_nchw, weight_t->MutableData(), weight_t->Batch(), weight_t->Width() * weight_t->Height(), | PackNCHWToNHWCFp32(weight_nchw, weight_t->MutableData(), weight_t->Batch(), weight_t->Width() * weight_t->Height(), | ||||
| weight_t->Channel()); | |||||
| weight_t->Channel(), 0, 0); | |||||
| inputs_->push_back(weight_t); | inputs_->push_back(weight_t); | ||||
| auto *bias_t = new lite::Tensor(kNumberTypeFloat, {40}, schema::Format_NHWC, lite::Tensor::Category::CONST_TENSOR); | auto *bias_t = new lite::Tensor(kNumberTypeFloat, {40}, schema::Format_NHWC, lite::Tensor::Category::CONST_TENSOR); | ||||
| @@ -676,7 +677,7 @@ int DeConvTestInit4(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tens | |||||
| std::string out_path = "./deconv/deconv_fp32_nchw_output1.bin"; | std::string out_path = "./deconv/deconv_fp32_nchw_output1.bin"; | ||||
| auto out_nchw = mindspore::lite::ReadFile(out_path.c_str(), &buffer_size); | auto out_nchw = mindspore::lite::ReadFile(out_path.c_str(), &buffer_size); | ||||
| *correct = reinterpret_cast<float *>(malloc(buffer_size)); | *correct = reinterpret_cast<float *>(malloc(buffer_size)); | ||||
| PackNCHWToNHWCFp32(out_nchw, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel()); | |||||
| PackNCHWToNHWCFp32(out_nchw, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel(), 0, 0); | |||||
| conv_param->kernel_h_ = conv_param->kernel_w_ = 3; | conv_param->kernel_h_ = conv_param->kernel_w_ = 3; | ||||
| conv_param->stride_h_ = conv_param->stride_w_ = 1; | conv_param->stride_h_ = conv_param->stride_w_ = 1; | ||||
| @@ -63,7 +63,7 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes4) { | |||||
| param->out_strides_[i] = out_strides[i]; | param->out_strides_[i] = out_strides[i]; | ||||
| } | } | ||||
| auto ret = DoTransposeFp32(in, out, output_shape, param, nullptr, nullptr); | |||||
| auto ret = DoTransposeFp32(in, out, output_shape, param); | |||||
| ASSERT_EQ(ret, 0); | ASSERT_EQ(ret, 0); | ||||
| delete param; | delete param; | ||||
| ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001)); | ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001)); | ||||
| @@ -102,7 +102,7 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes3) { | |||||
| param->out_strides_[i] = out_strides[i]; | param->out_strides_[i] = out_strides[i]; | ||||
| } | } | ||||
| auto ret = DoTransposeFp32(in, out, output_shape, param, nullptr, nullptr); | |||||
| auto ret = DoTransposeFp32(in, out, output_shape, param); | |||||
| ASSERT_EQ(ret, 0); | ASSERT_EQ(ret, 0); | ||||
| delete param; | delete param; | ||||
| ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001)); | ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001)); | ||||
| @@ -142,7 +142,7 @@ TEST_F(TestTransposeFp32, TransposeFp32_axes2) { | |||||
| param->out_strides_[i] = out_strides[i]; | param->out_strides_[i] = out_strides[i]; | ||||
| } | } | ||||
| auto ret = DoTransposeFp32(in, out, output_shape, param, nullptr, nullptr); | |||||
| auto ret = DoTransposeFp32(in, out, output_shape, param); | |||||
| ASSERT_EQ(ret, 0); | ASSERT_EQ(ret, 0); | ||||
| delete param; | delete param; | ||||
| ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001)); | ASSERT_EQ(0, CompareOutputData(out, correct, 24, 0.000001)); | ||||