@@ -84,136 +84,142 @@ void RowMajor2Row16Major(const float *src_ptr, float *dst_ptr, int row, int col)
  return;
}
#ifdef ENABLE_ARM64
void RowMajor2Col12Major_arm64(const float *src_c, float *dst_c, size_t col) {
  size_t stride = col * sizeof(float);
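  // Packs one 12x4 tile from row-major to col-major: twelve strided ld1 loads pull in
  // the rows, zip1/zip2 interleave 32-bit lanes and trn1/trn2 swap 64-bit halves
  // (a register-level 4x4 transpose), then three st1 stores write the 48 floats out
  // contiguously.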
  asm volatile(
    "mov x10, %[src_c]\n"
    "mov x11, %[dst_c]\n"
    "ld1 {v0.4s}, [x10], %[stride]\n"
    "ld1 {v1.4s}, [x10], %[stride]\n"
    "ld1 {v2.4s}, [x10], %[stride]\n"
    "ld1 {v3.4s}, [x10], %[stride]\n"
    "ld1 {v4.4s}, [x10], %[stride]\n"
    "ld1 {v5.4s}, [x10], %[stride]\n"
    "ld1 {v6.4s}, [x10], %[stride]\n"
    "ld1 {v7.4s}, [x10], %[stride]\n"
    "zip1 v12.4s, v0.4s, v1.4s\n"
    "zip2 v13.4s, v0.4s, v1.4s\n"
    "zip1 v14.4s, v2.4s, v3.4s\n"
    "zip2 v15.4s, v2.4s, v3.4s\n"
    "ld1 {v8.4s}, [x10], %[stride]\n"
    "ld1 {v9.4s}, [x10], %[stride]\n"
    "ld1 {v10.4s}, [x10], %[stride]\n"
    "ld1 {v11.4s}, [x10], %[stride]\n"
    "zip1 v16.4s, v4.4s, v5.4s\n"
    "zip2 v17.4s, v4.4s, v5.4s\n"
    "zip1 v18.4s, v6.4s, v7.4s\n"
    "zip2 v19.4s, v6.4s, v7.4s\n"
    "trn1 v20.2d, v12.2d, v14.2d\n"
    "trn2 v23.2d, v12.2d, v14.2d\n"
    "trn1 v26.2d, v13.2d, v15.2d\n"
    "trn2 v29.2d, v13.2d, v15.2d\n"
    "trn1 v21.2d, v16.2d, v18.2d\n"
    "trn2 v24.2d, v16.2d, v18.2d\n"
    "trn1 v27.2d, v17.2d, v19.2d\n"
    "trn2 v30.2d, v17.2d, v19.2d\n"
    "zip1 v12.4s, v8.4s, v9.4s\n"
    "zip2 v13.4s, v8.4s, v9.4s\n"
    "zip1 v14.4s, v10.4s, v11.4s\n"
    "zip2 v15.4s, v10.4s, v11.4s\n"
    "trn1 v22.2d, v12.2d, v14.2d\n"
    "trn2 v25.2d, v12.2d, v14.2d\n"
    "trn1 v28.2d, v13.2d, v15.2d\n"
    "trn2 v31.2d, v13.2d, v15.2d\n"
    "st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x11], #64\n"
    "st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x11], #64\n"
    "st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x11], #64\n"
    :
    : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ stride ] "r"(stride)
    : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
      "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31");
  return;
}
#endif
#ifdef ENABLE_ARM32
void RowMajor2Col12Major_arm32(const float *src_c, float *dst_c, size_t col) {
  size_t stride = col * sizeof(float);
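  // NEON32 variant of the same 12x4 packing: vtrn.32 transposes 2x2 lane pairs and
  // vswp exchanges d-register halves to finish each 4x4 transpose before the stores.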
  asm volatile(
    "mov r10, %[src_c]\n"
    "mov r12, %[dst_c]\n"
    "vld1.32 {q0}, [r10], %[stride]\n"
    "vld1.32 {q3}, [r10], %[stride]\n"
    "vld1.32 {q10}, [r10], %[stride]\n"
    "vld1.32 {q13}, [r10], %[stride]\n"
    "vtrn.32 d0, d6\n"
    "vtrn.32 d1, d7\n"
    "vtrn.32 d20, d26\n"
    "vtrn.32 d21, d27\n"
    "vld1.32 {q1}, [r10], %[stride]\n"
    "vld1.32 {q8}, [r10], %[stride]\n"
    "vld1.32 {q11}, [r10], %[stride]\n"
    "vld1.32 {q14}, [r10], %[stride]\n"
    "vswp d1, d20\n"
    "vswp d7, d26\n"
    "vld1.32 {q2}, [r10], %[stride]\n"
    "vld1.32 {q9}, [r10], %[stride]\n"
    "vld1.32 {q12}, [r10], %[stride]\n"
    "vld1.32 {q15}, [r10], %[stride]\n"
    "vtrn.32 d2, d16\n"
    "vtrn.32 d3, d17\n"
    "vtrn.32 d22, d28\n"
    "vtrn.32 d23, d29\n"
    "vswp d3, d22\n"
    "vswp d17, d28\n"
    "vtrn.32 d4, d18\n"
    "vtrn.32 d5, d19\n"
    "vtrn.32 d24, d30\n"
    "vtrn.32 d25, d31\n"
    "vswp d5, d24\n"
    "vswp d19, d30\n"
    "vst1.32 {q0, q1}, [r12]!\n"
    "vst1.32 {q2, q3}, [r12]!\n"
    "vst1.32 {q8, q9}, [r12]!\n"
    "vst1.32 {q10, q11}, [r12]!\n"
    "vst1.32 {q12, q13}, [r12]!\n"
    "vst1.32 {q14, q15}, [r12]!\n"
    :
    : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ stride ] "r"(stride)
    : "r10", "r12", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
  return;
}
#endif
void RowMajor2Col12Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col) {
  size_t row_up_12 = UP_ROUND(row, C12NUM);
  size_t row12 = row / C12NUM * C12NUM;
  size_t col4 = col / C4NUM * C4NUM;
  const float *src_r = src_ptr;
  float *dst_r = dst_ptr;
  size_t ri = 0;
  for (; ri < row12; ri += C12NUM) {
  for (; ri < (row / C12NUM * C12NUM); ri += C12NUM) {
    size_t ci = 0;
    for (; ci < col4; ci += C4NUM) {
    for (; ci < (col / C4NUM * C4NUM); ci += C4NUM) {
      const float *src_c = src_r + ci;
      float *dst_c = dst_r + ci * C12NUM;
      /* 12x4 row-major to col-major */
#ifdef ENABLE_ARM64
      size_t stride = col * sizeof(float);
      asm volatile(
        "mov x10, %[src_c]\n"
        "mov x11, %[dst_c]\n"
        "ld1 {v0.4s}, [x10], %[stride]\n"
        "ld1 {v1.4s}, [x10], %[stride]\n"
        "ld1 {v2.4s}, [x10], %[stride]\n"
        "ld1 {v3.4s}, [x10], %[stride]\n"
        "ld1 {v4.4s}, [x10], %[stride]\n"
        "ld1 {v5.4s}, [x10], %[stride]\n"
        "ld1 {v6.4s}, [x10], %[stride]\n"
        "ld1 {v7.4s}, [x10], %[stride]\n"
        "zip1 v12.4s, v0.4s, v1.4s\n"
        "zip2 v13.4s, v0.4s, v1.4s\n"
        "zip1 v14.4s, v2.4s, v3.4s\n"
        "zip2 v15.4s, v2.4s, v3.4s\n"
        "ld1 {v8.4s}, [x10], %[stride]\n"
        "ld1 {v9.4s}, [x10], %[stride]\n"
        "ld1 {v10.4s}, [x10], %[stride]\n"
        "ld1 {v11.4s}, [x10], %[stride]\n"
        "zip1 v16.4s, v4.4s, v5.4s\n"
        "zip2 v17.4s, v4.4s, v5.4s\n"
        "zip1 v18.4s, v6.4s, v7.4s\n"
        "zip2 v19.4s, v6.4s, v7.4s\n"
        "trn1 v20.2d, v12.2d, v14.2d\n"
        "trn2 v23.2d, v12.2d, v14.2d\n"
        "trn1 v26.2d, v13.2d, v15.2d\n"
        "trn2 v29.2d, v13.2d, v15.2d\n"
        "trn1 v21.2d, v16.2d, v18.2d\n"
        "trn2 v24.2d, v16.2d, v18.2d\n"
        "trn1 v27.2d, v17.2d, v19.2d\n"
        "trn2 v30.2d, v17.2d, v19.2d\n"
        "zip1 v12.4s, v8.4s, v9.4s\n"
        "zip2 v13.4s, v8.4s, v9.4s\n"
        "zip1 v14.4s, v10.4s, v11.4s\n"
        "zip2 v15.4s, v10.4s, v11.4s\n"
        "trn1 v22.2d, v12.2d, v14.2d\n"
        "trn2 v25.2d, v12.2d, v14.2d\n"
        "trn1 v28.2d, v13.2d, v15.2d\n"
        "trn2 v31.2d, v13.2d, v15.2d\n"
        "st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x11], #64\n"
        "st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x11], #64\n"
        "st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x11], #64\n"
        :
        : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ stride ] "r"(stride)
        : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
          "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
          "v30", "v31");
      RowMajor2Col12Major_arm64(src_c, dst_c, col);
#elif ENABLE_ARM32
      size_t stride = col * sizeof(float);
      asm volatile(
        "mov r10, %[src_c]\n"
        "mov r12, %[dst_c]\n"
        "vld1.32 {q0}, [r10], %[stride]\n"
        "vld1.32 {q3}, [r10], %[stride]\n"
        "vld1.32 {q10}, [r10], %[stride]\n"
        "vld1.32 {q13}, [r10], %[stride]\n"
        "vtrn.32 d0, d6\n"
        "vtrn.32 d1, d7\n"
        "vtrn.32 d20, d26\n"
        "vtrn.32 d21, d27\n"
        "vld1.32 {q1}, [r10], %[stride]\n"
        "vld1.32 {q8}, [r10], %[stride]\n"
        "vld1.32 {q11}, [r10], %[stride]\n"
        "vld1.32 {q14}, [r10], %[stride]\n"
        "vswp d1, d20\n"
        "vswp d7, d26\n"
        "vld1.32 {q2}, [r10], %[stride]\n"
        "vld1.32 {q9}, [r10], %[stride]\n"
        "vld1.32 {q12}, [r10], %[stride]\n"
        "vld1.32 {q15}, [r10], %[stride]\n"
        "vtrn.32 d2, d16\n"
        "vtrn.32 d3, d17\n"
        "vtrn.32 d22, d28\n"
        "vtrn.32 d23, d29\n"
        "vswp d3, d22\n"
        "vswp d17, d28\n"
        "vtrn.32 d4, d18\n"
        "vtrn.32 d5, d19\n"
        "vtrn.32 d24, d30\n"
        "vtrn.32 d25, d31\n"
        "vswp d5, d24\n"
        "vswp d19, d30\n"
        "vst1.32 {q0, q1}, [r12]!\n"
        "vst1.32 {q2, q3}, [r12]!\n"
        "vst1.32 {q8, q9}, [r12]!\n"
        "vst1.32 {q10, q11}, [r12]!\n"
        "vst1.32 {q12, q13}, [r12]!\n"
        "vst1.32 {q14, q15}, [r12]!\n"
        :
        : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ stride ] "r"(stride)
        : "r10", "r12", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
      RowMajor2Col12Major_arm32(src_c, dst_c, col);
#elif ENABLE_SSE
      __m128 src1 = _mm_loadu_ps(src_c);
      __m128 src2 = _mm_loadu_ps(src_c + col);
@@ -288,24 +294,145 @@ void RowMajor2Col12Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col)
    src_r += C12NUM * col;
    dst_r += C12NUM * col;
  }
  for (; ri < row; ri++) {
  for (; ri < row; ri++, dst_r++, src_r += col) {
    for (size_t i = 0; i < col; i++) {
      dst_r[i * C12NUM] = src_r[i];
    }
    src_r += col;
    dst_r += 1;
  }
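  // Zero-fill the remaining rows up to the next multiple of C12NUM so the packed
  // buffer always holds complete 12-row tiles.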
  for (; ri < row_up_12; ri++) {
  for (; ri < UP_ROUND(row, C12NUM); ri++, dst_r++) {
    for (size_t i = 0; i < col; i++) {
      dst_r[i * C12NUM] = 0;
    }
    dst_r += 1;
  }
  return;
}
#ifdef ENABLE_ARM64
void RowMajor2Col8Major_arm64(const float *src_c, float *dst_c, size_t col) {
  size_t stride = col * sizeof(float);
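  // 8x8 row-major to col-major: each ld1 loads one row into a register pair, zip/trn
  // rearrange lanes across the pairs, and sixteen st1 stores emit the eight columns.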
  asm volatile(
    "mov x10, %[src_c]\n"
    "mov x11, %[dst_c]\n"
    "ld1 {v0.4s, v1.4s}, [x10], %[stride]\n"
    "ld1 {v2.4s, v3.4s}, [x10], %[stride]\n"
    "ld1 {v4.4s, v5.4s}, [x10], %[stride]\n"
    "ld1 {v6.4s, v7.4s}, [x10], %[stride]\n"
    "zip1 v8.4s, v0.4s, v2.4s\n"
    "zip2 v9.4s, v0.4s, v2.4s\n"
    "zip1 v10.4s, v4.4s, v6.4s\n"
    "zip2 v11.4s, v4.4s, v6.4s\n"
    "ld1 {v16.4s, v17.4s}, [x10], %[stride]\n"
    "ld1 {v18.4s, v19.4s}, [x10], %[stride]\n"
    "ld1 {v20.4s, v21.4s}, [x10], %[stride]\n"
    "ld1 {v22.4s, v23.4s}, [x10], %[stride]\n"
    "zip1 v12.4s, v1.4s, v3.4s\n"
    "zip2 v13.4s, v1.4s, v3.4s\n"
    "zip1 v14.4s, v5.4s, v7.4s\n"
    "zip2 v15.4s, v5.4s, v7.4s\n"
    "trn1 v0.2d, v8.2d, v10.2d\n"
    "trn2 v1.2d, v8.2d, v10.2d\n"
    "trn1 v2.2d, v9.2d, v11.2d\n"
    "trn2 v3.2d, v9.2d, v11.2d\n"
    "zip1 v24.4s, v16.4s, v18.4s\n"
    "zip2 v25.4s, v16.4s, v18.4s\n"
    "zip1 v26.4s, v20.4s, v22.4s\n"
    "zip2 v27.4s, v20.4s, v22.4s\n"
    "trn1 v4.2d, v12.2d, v14.2d\n"
    "trn2 v5.2d, v12.2d, v14.2d\n"
    "trn1 v6.2d, v13.2d, v15.2d\n"
    "trn2 v7.2d, v13.2d, v15.2d\n"
    "zip1 v28.4s, v17.4s, v19.4s\n"
    "zip2 v29.4s, v17.4s, v19.4s\n"
    "zip1 v30.4s, v21.4s, v23.4s\n"
    "zip2 v31.4s, v21.4s, v23.4s\n"
    "trn1 v16.2d, v24.2d, v26.2d\n"
    "trn2 v17.2d, v24.2d, v26.2d\n"
    "trn1 v18.2d, v25.2d, v27.2d\n"
    "trn2 v19.2d, v25.2d, v27.2d\n"
    "trn1 v20.2d, v28.2d, v30.2d\n"
    "trn2 v21.2d, v28.2d, v30.2d\n"
    "trn1 v22.2d, v29.2d, v31.2d\n"
    "trn2 v23.2d, v29.2d, v31.2d\n"
    "st1 {v0.4s}, [x11], #16\n"
    "st1 {v16.4s}, [x11], #16\n"
    "st1 {v1.4s}, [x11], #16\n"
    "st1 {v17.4s}, [x11], #16\n"
    "st1 {v2.4s}, [x11], #16\n"
    "st1 {v18.4s}, [x11], #16\n"
    "st1 {v3.4s}, [x11], #16\n"
    "st1 {v19.4s}, [x11], #16\n"
    "st1 {v4.4s}, [x11], #16\n"
    "st1 {v20.4s}, [x11], #16\n"
    "st1 {v5.4s}, [x11], #16\n"
    "st1 {v21.4s}, [x11], #16\n"
    "st1 {v6.4s}, [x11], #16\n"
    "st1 {v22.4s}, [x11], #16\n"
    "st1 {v7.4s}, [x11], #16\n"
    "st1 {v23.4s}, [x11], #16\n"
    :
    : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ stride ] "r"(stride)
    : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
      "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31");
  return;
}
#endif
#ifdef ENABLE_ARM32
void RowMajor2Col8Major_arm32(const float *src_c, float *dst_c, size_t col) {
  size_t stride = col * sizeof(float);
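  // NEON32 8x4 packing: vtrn.32 plus vswp turn eight 4-float rows into four 8-float
  // columns.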
  asm volatile(
    "mov r10, %[src_c]\n"
    "mov r11, %[dst_c]\n"
    "vld1.32 {q0}, [r10], %[stride]\n"
    "vld1.32 {q2}, [r10], %[stride]\n"
    "vld1.32 {q4}, [r10], %[stride]\n"
    "vld1.32 {q6}, [r10], %[stride]\n"
    "vtrn.32 d0, d4\n"
    "vtrn.32 d1, d5\n"
    "vtrn.32 d8, d12\n"
    "vtrn.32 d9, d13\n"
    "vld1.32 {q1}, [r10], %[stride]\n"
    "vld1.32 {q3}, [r10], %[stride]\n"
    "vld1.32 {q5}, [r10], %[stride]\n"
    "vld1.32 {q7}, [r10], %[stride]\n"
    "vswp d1, d8\n"
    "vswp d5, d12\n"
    "vtrn.32 d2, d6\n"
    "vtrn.32 d3, d7\n"
    "vtrn.32 d10, d14\n"
    "vtrn.32 d11, d15\n"
    "vswp d3, d10\n"
    "vswp d7, d14\n"
    "vst1.32 {q0, q1}, [r11]!\n"
    "vst1.32 {q2, q3}, [r11]!\n"
    "vst1.32 {q4, q5}, [r11]!\n"
    "vst1.32 {q6, q7}, [r11]!\n"
    :
    : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ stride ] "r"(stride)
    : "r10", "r11", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
  return;
}
#endif
void RowMajor2Col8Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col) {
  size_t row8 = row / C8NUM * C8NUM;
#ifdef ENABLE_ARM64
@@ -326,127 +453,10 @@ void RowMajor2Col8Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col)
      float *dst_c = dst_r + ci * C8NUM;
#ifdef ENABLE_ARM64
      /* 8x8 row-major to col-major */
      size_t stride = col * sizeof(float);
      asm volatile(
        "mov x10, %[src_c]\n"
        "mov x11, %[dst_c]\n"
        "ld1 {v0.4s, v1.4s}, [x10], %[stride]\n"
        "ld1 {v2.4s, v3.4s}, [x10], %[stride]\n"
        "ld1 {v4.4s, v5.4s}, [x10], %[stride]\n"
        "ld1 {v6.4s, v7.4s}, [x10], %[stride]\n"
        "zip1 v8.4s, v0.4s, v2.4s\n"
        "zip2 v9.4s, v0.4s, v2.4s\n"
        "zip1 v10.4s, v4.4s, v6.4s\n"
        "zip2 v11.4s, v4.4s, v6.4s\n"
        "ld1 {v16.4s, v17.4s}, [x10], %[stride]\n"
        "ld1 {v18.4s, v19.4s}, [x10], %[stride]\n"
        "ld1 {v20.4s, v21.4s}, [x10], %[stride]\n"
        "ld1 {v22.4s, v23.4s}, [x10], %[stride]\n"
        "zip1 v12.4s, v1.4s, v3.4s\n"
        "zip2 v13.4s, v1.4s, v3.4s\n"
        "zip1 v14.4s, v5.4s, v7.4s\n"
        "zip2 v15.4s, v5.4s, v7.4s\n"
        "trn1 v0.2d, v8.2d, v10.2d\n"
        "trn2 v1.2d, v8.2d, v10.2d\n"
        "trn1 v2.2d, v9.2d, v11.2d\n"
        "trn2 v3.2d, v9.2d, v11.2d\n"
        "zip1 v24.4s, v16.4s, v18.4s\n"
        "zip2 v25.4s, v16.4s, v18.4s\n"
        "zip1 v26.4s, v20.4s, v22.4s\n"
        "zip2 v27.4s, v20.4s, v22.4s\n"
        "trn1 v4.2d, v12.2d, v14.2d\n"
        "trn2 v5.2d, v12.2d, v14.2d\n"
        "trn1 v6.2d, v13.2d, v15.2d\n"
        "trn2 v7.2d, v13.2d, v15.2d\n"
        "zip1 v28.4s, v17.4s, v19.4s\n"
        "zip2 v29.4s, v17.4s, v19.4s\n"
        "zip1 v30.4s, v21.4s, v23.4s\n"
        "zip2 v31.4s, v21.4s, v23.4s\n"
        "trn1 v16.2d, v24.2d, v26.2d\n"
        "trn2 v17.2d, v24.2d, v26.2d\n"
        "trn1 v18.2d, v25.2d, v27.2d\n"
        "trn2 v19.2d, v25.2d, v27.2d\n"
        "trn1 v20.2d, v28.2d, v30.2d\n"
        "trn2 v21.2d, v28.2d, v30.2d\n"
        "trn1 v22.2d, v29.2d, v31.2d\n"
        "trn2 v23.2d, v29.2d, v31.2d\n"
        "st1 {v0.4s}, [x11], #16\n"
        "st1 {v16.4s}, [x11], #16\n"
        "st1 {v1.4s}, [x11], #16\n"
        "st1 {v17.4s}, [x11], #16\n"
        "st1 {v2.4s}, [x11], #16\n"
        "st1 {v18.4s}, [x11], #16\n"
        "st1 {v3.4s}, [x11], #16\n"
        "st1 {v19.4s}, [x11], #16\n"
        "st1 {v4.4s}, [x11], #16\n"
        "st1 {v20.4s}, [x11], #16\n"
        "st1 {v5.4s}, [x11], #16\n"
        "st1 {v21.4s}, [x11], #16\n"
        "st1 {v6.4s}, [x11], #16\n"
        "st1 {v22.4s}, [x11], #16\n"
        "st1 {v7.4s}, [x11], #16\n"
        "st1 {v23.4s}, [x11], #16\n"
        :
        : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ stride ] "r"(stride)
        : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
          "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
          "v30", "v31");
      RowMajor2Col8Major_arm64(src_c, dst_c, col);
#elif ENABLE_ARM32
      /* 8x4 row-major to col-major */
      size_t stride = col * sizeof(float);
      asm volatile(
        "mov r10, %[src_c]\n"
        "mov r11, %[dst_c]\n"
        "vld1.32 {q0}, [r10], %[stride]\n"
        "vld1.32 {q2}, [r10], %[stride]\n"
        "vld1.32 {q4}, [r10], %[stride]\n"
        "vld1.32 {q6}, [r10], %[stride]\n"
        "vtrn.32 d0, d4\n"
        "vtrn.32 d1, d5\n"
        "vtrn.32 d8, d12\n"
        "vtrn.32 d9, d13\n"
        "vld1.32 {q1}, [r10], %[stride]\n"
        "vld1.32 {q3}, [r10], %[stride]\n"
        "vld1.32 {q5}, [r10], %[stride]\n"
        "vld1.32 {q7}, [r10], %[stride]\n"
        "vswp d1, d8\n"
        "vswp d5, d12\n"
        "vtrn.32 d2, d6\n"
        "vtrn.32 d3, d7\n"
        "vtrn.32 d10, d14\n"
        "vtrn.32 d11, d15\n"
        "vswp d3, d10\n"
        "vswp d7, d14\n"
        "vst1.32 {q0, q1}, [r11]!\n"
        "vst1.32 {q2, q3}, [r11]!\n"
        "vst1.32 {q4, q5}, [r11]!\n"
        "vst1.32 {q6, q7}, [r11]!\n"
        :
        : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ stride ] "r"(stride)
        : "r10", "r11", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
      RowMajor2Col8Major_arm32(src_c, dst_c, col);
#elif ENABLE_SSE
      /* 8x4 row-major to col-major */
      __m128 src1 = _mm_loadu_ps(src_c);
      __m128 src2 = _mm_loadu_ps(src_c + col);
      __m128 src3 = _mm_loadu_ps(src_c + 2 * col);
@@ -492,12 +502,16 @@ void RowMajor2Col8Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col)
    src_r += C8NUM * col;
    dst_r += C8NUM * col;
  }
  for (; ri < row; ri++) {
  for (; ri < row; ri++, src_r += col, dst_r++) {
    for (size_t i = 0; i < col; i++) {
      dst_r[i * C8NUM] = src_r[i];
    }
    src_r += col;
    dst_r += 1;
  }
  for (; ri < UP_ROUND(row, C8NUM); ri++, dst_r++) {
    for (size_t i = 0; i < col; i++) {
      dst_r[i * C8NUM] = 0;
    }
  }
  return;
}
@@ -538,6 +552,14 @@ void RowMajor2Col16Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col)
    src_r += col;
    dst_r += 1;
  }
  size_t total_row = UP_ROUND(row, C16NUM);
  for (; ri < total_row; ri++) {
    for (size_t i = 0; i < col; i++) {
      dst_r[i * C16NUM] = 0;
    }
    dst_r += 1;
  }
  return;
}
@@ -555,7 +577,6 @@ void RowMajor2Col6Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col)
      const float *src_c = src_r + ci;
      float *dst_c = dst_r + ci * C6NUM;
      /* 6x8 row-major to col-major */
#ifdef ENABLE_AVX
      __m256 src0 = _mm256_loadu_ps(src_c);
      __m256 src1 = _mm256_loadu_ps(src_c + col);
@@ -642,19 +663,19 @@ void RowMajor2Col6Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col)
}
void RowMajor2Col4Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col) {
  size_t row8 = row / C4NUM * C4NUM;
  size_t total_row = UP_ROUND(row, C4NUM);
  size_t row4 = row / C4NUM * C4NUM;
  size_t col4 = col / C4NUM * C4NUM;
  const float *src_r = src_ptr;
  float *dst_r = dst_ptr;
  size_t ri = 0;
  for (; ri < row8; ri += C4NUM) {
  for (; ri < row4; ri += C4NUM) {
    size_t ci = 0;
    for (; ci < col4; ci += C4NUM) {
      const float *src_c = src_r + ci;
      float *dst_c = dst_r + ci * C4NUM;
      /* 4x4 row-major to col-major */
#ifdef ENABLE_ARM32
      size_t stride = col * 4;
      asm volatile(
@@ -727,9 +748,31 @@ void RowMajor2Col4Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col)
    src_r += col;
    dst_r += 1;
  }
  for (; ri < total_row; ri++) {
    for (size_t i = 0; i < col; i++) {
      dst_r[i * C4NUM] = 0;
    }
    dst_r += 1;
  }
  return;
}
#ifndef ENABLE_ARM
void MatVecMulFp32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col) {
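  // Reference path for non-ARM builds: a is a 1 x depth vector and b is packed
  // col-major (row ci of b holds weight column ci), so each c[ci] is a dot product.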
  for (int ci = 0; ci < col; ci++) {
    float value = 0;
    for (int di = 0; di < depth; di++) {
      value += a[di] * b[ci * depth + di];
    }
    if (bias != NULL) value += bias[ci];
    if (act_type == ActType_Relu6) value = MSMIN(6.0f, value);
    if (act_type == ActType_Relu || act_type == ActType_Relu6) value = MSMAX(0.0f, value);
    c[ci] = value;
  }
  return;
}
#endif
void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, ActType act_type, int deep, int row,
                int col, int stride, int out_type) {
  if (out_type == OutType_Nhwc) {
@@ -744,9 +787,9 @@ void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, ActType act_type, int deep, int row,
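          // B is packed in 8-column blocks: element (d, c) lives at
          // c8div * deep * 8 + d * 8 + c8mod, so the 8 output channels of one depth
          // step are contiguous.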
          size_t bi = c8div * deep * 8 + d * 8 + c8mod;
          value = value + a[ai] * b[bi];
        }
        if (bias != NULL) value += bias[c];
        if (act_type == ActType_Relu6) value = MSMIN(6.0f, value);
        if (act_type != ActType_No) value = MSMAX(0.0f, value);
        ADD_BIAS(value, bias, c)
        DO_RELU(value, act_type)
        DO_RELU6(value, act_type)
        dst[ci] = value;
      }
    }
@@ -764,9 +807,9 @@ void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, ActType act_type, int deep, int row,
          size_t bi = c8div * deep * C8NUM + d * C8NUM + c8mod;
          value = value + a[ai] * b[bi];
        }
        if (bias != NULL) value += bias[c];
        if (act_type == ActType_Relu6) value = MSMIN(6.0f, value);
        if (act_type != ActType_No) value = MSMAX(0.0f, value);
        ADD_BIAS(value, bias, c)
        DO_RELU(value, act_type)
        DO_RELU6(value, act_type)
        dst[ci] = value;
      }
    }
@@ -783,79 +826,9 @@ void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, ActType act_type, int deep, int row,
          size_t bi = c8div * deep * 8 + d * 8 + c8mod;
          value = value + a[ai] * b[bi];
        }
        if (bias != NULL) value += bias[j];
        if (act_type == ActType_Relu6) value = MSMIN(6.0f, value);
        if (act_type != ActType_No) value = MSMAX(0.0f, value);
        dst[ci] = value;
      }
    }
  }
  return;
}
void MatMul6x16(const float *a, const float *b, float *dst, const float *bias, ActType act_type, int deep, int row,
                int col, int stride, int out_type) {
  if (out_type == OutType_Nhwc) {
    for (int r = 0; r < row; r++) {
      for (int c = 0; c < col; c++) {
        int r6div = r / C6NUM, r6mod = r % C6NUM;
        int c16div = c / C16NUM, c16mod = c % C16NUM;
        size_t ci = r * stride + c;
        float value = 0;
        for (int d = 0; d < deep; d++) {
          size_t ai = r6div * deep * C6NUM + d * C6NUM + r6mod;
          size_t bi = c16div * deep * C16NUM + d * C16NUM + c16mod;
          value = value + a[ai] * b[bi];
        }
        if (bias != NULL) value += bias[c];
        if (act_type == ActType_Relu6) value = MSMIN(6.0f, value);
        if (act_type != ActType_No) value = MSMAX(0.0f, value);
        dst[ci] = value;
      }
    }
  } else {
    for (int i = 0; i < row; ++i) {
      int dst_r_offset = i * col * stride;
      int r6div = i / C6NUM, r6mod = i % C6NUM;
      for (int j = 0; j < col; ++j) {
        int b16div = j / C16NUM, b16mod = j % C16NUM;
        int c8div = j / C8NUM, c8mod = j % C8NUM;
        size_t ci = dst_r_offset + c8div * C8NUM * stride + c8mod;
        float value = 0;
        for (int d = 0; d < deep; ++d) {
          size_t ai = r6div * deep * C6NUM + d * C6NUM + r6mod;
          size_t bi = b16div * deep * C16NUM + d * C16NUM + b16mod;
          value = value + a[ai] * b[bi];
        }
        if (bias != NULL) value += bias[j];
        if (act_type == ActType_Relu6) value = MSMIN(6.0f, value);
        if (act_type != ActType_No) value = MSMAX(0.0f, value);
        dst[ci] = value;
      }
    }
  }
  return;
}
void MatMul4x8(const float *a, const float *b, float *dst, const float *bias, ActType act_type, int deep, int row,
               int col, int stride, int out_type) {
  if (out_type == OutType_C8) {
    int col_8 = UP_ROUND(col, C8NUM);
    int row_4 = UP_ROUND(row, C4NUM);
    for (int r = 0; r < row_4; r++) {
      for (int c = 0; c < col_8; c++) {
        int r4div = r / C4NUM, r4mod = r % C4NUM;
        int c8div = c / C8NUM, c8mod = c % C8NUM;
        size_t ci = (c8div * C8NUM * row_4 + r * C8NUM + c8mod);
        float value = 0;
        for (int d = 0; d < deep; d++) {
          size_t ai = r4div * deep * C4NUM + d * C4NUM + r4mod;
          size_t bi = c8div * deep * C8NUM + d * C8NUM + c8mod;
          value = value + a[ai] * b[bi];
        }
        if (bias != NULL) value += bias[c];
        if (act_type == ActType_Relu6) value = MSMIN(6.0f, value);
        if (act_type != ActType_No) value = MSMAX(0.0f, value);
        ADD_BIAS(value, bias, j)
        DO_RELU(value, act_type)
        DO_RELU6(value, act_type)
        dst[ci] = value;
      }
    }
@@ -895,44 +868,3 @@ void MatMulOpt(const float *a, const float *b, float *c, const float *bias, ActType act_type, int deep, int row,
  MatMul12x8(a, b, c, bias, act_type, deep, row, col, stride, out_type);
#endif
}
void MatVecMul(const float *a, const float *b, float *c, const float *bias, ActType act_type, int depth, int col) {
#ifdef ENABLE_ARM
  MatVecMulFp32(a, b, c, bias, (int)act_type, depth, col);
#endif
}
#ifdef ENABLE_NNACL_INFER_SHAPE
static void SwapDims(int *dims, int index1, int index2) {
  int tmp = dims[index1];
  dims[index1] = dims[index2];
  dims[index2] = tmp;
}
int MatMulInferShape(int **in_shape, int in_num, size_t *dim_size, int *out_shape, int *in_format, int *out_format,
                     int *in_datatype, int *out_datatype, OpParameter *param) {
  *out_datatype = in_datatype[0];
  *out_format = in_format[0];
  if (dim_size[0] < 2 || dim_size[1] < 2) {
    return NNACL_PARAM_INVALID;
  }
  for (int i = 0; i < dim_size[0] - 2; ++i) {
    if (in_shape[0][i] != in_shape[1][i]) {
      return NNACL_PARAM_INVALID;
    }
  }
  MatMulParameter *matmul_param = (MatMulParameter *)param;
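  // a_transpose_/b_transpose_ mark an input as stored transposed, so undo that on
  // its last two dims before deriving the output shape.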
  if (matmul_param->a_transpose_) {
    SwapDims(in_shape[0], dim_size[0] - 1, dim_size[0] - 2);
  }
  if (matmul_param->b_transpose_) {
    SwapDims(in_shape[1], dim_size[1] - 1, dim_size[1] - 2);
  }
  for (int i = 0; i < dim_size[0] - 1; ++i) {
    out_shape[i] = in_shape[0][i];
  }
  out_shape[dim_size[0] - 1] = in_shape[1][dim_size[1] - 1];
  return NNACL_OK;
}
#endif
@@ -23,12 +23,23 @@
#include "nnacl/matmul_parameter.h"
#include "nnacl/op_base.h"
#define ADD_BIAS(value, bias, c) \
  if (bias != NULL) value = value + bias[c];
#define DO_RELU(value, act_type) \
  if (act_type == ActType_Relu) value = MSMAX(0.0f, value);
#define DO_RELU6(value, act_type)                             \
  if (act_type == ActType_Relu6) value = MSMIN(6.0f, value); \
  if (act_type == ActType_Relu6) value = MSMAX(0.0f, value);
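// Note: these macros expand to bare if statements (no do-while(0) wrapper), so they
// are only safe when used as full statements, as the MatMul kernels do.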
#ifdef __cplusplus
extern "C" {
#endif
void MatMulOpt(const float *a, const float *b, float *c, const float *bias, ActType act_type, int deep, int row,
               int col, size_t stride, int out_type);
void MatVecMul(const float *a, const float *b, float *c, const float *bias, ActType act_type, int depth, int col);
void MatVecMulFp32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col);
void RowMajor2ColMajor(const float *src_ptr, float *dst_ptr, int row, int col);
void RowMajor2Row4Major(const float *src_ptr, float *dst_ptr, int row, int col);
void RowMajor2Row6Major(const float *src_ptr, float *dst_ptr, int row, int col);
@@ -40,9 +51,7 @@ void RowMajor2Col6Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col);
void RowMajor2Col8Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col);
void RowMajor2Col12Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col);
void RowMajor2Col16Major(const float *src_ptr, float *dst_ptr, size_t row, size_t col);
#ifdef ENABLE_ARM
void MatVecMulFp32(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int col);
#endif
#ifdef ENABLE_ARM64
void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth, int row,
                       int col, size_t stride, size_t writeNhwc, size_t WriteWino);
@@ -67,10 +76,6 @@ void MatmulFloatAvxOpt(const float *a, const float *b, float *c, const float *bias,
#endif
#endif
#ifdef ENABLE_NNACL_INFER_SHAPE
int MatMulInferShape(int **in_shape, int in_num, size_t *dim_size, int *out_shape, int *in_format, int *out_format,
                     int *in_datatype, int *out_datatype, OpParameter *param);
#endif
#ifdef __cplusplus
}
#endif
@@ -44,14 +44,11 @@ typedef struct MatMulParameter {
  int col_;
  int row_4_;
  int row_6_;
  int row_8_;
  int row_12_;
  int row_16_;
  int row_align_;
  int col_2_;
  int col_4_;
  int col_8_;
  int col_16_;
  int col_align_;
  int deep_;
  int deep_4_;
@@ -61,8 +58,6 @@ typedef struct MatMulParameter {
  bool b_transpose_; /* true : col-major */
  bool a_const_;
  bool b_const_;
  bool a_init_shape_;
  bool b_init_shape_;
  ActType act_type_;
} MatMulParameter;
@@ -62,11 +62,8 @@ void Convolution1x1CPUKernel::InitConv1x1MatmulParam() {
  matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_;
  matmul_param_->col_ = conv_param_->output_channel_;
  matmul_param_->deep_ = conv_param_->input_channel_;
  matmul_param_->row_4_ = UP_ROUND(matmul_param_->row_, C4NUM);
  matmul_param_->row_6_ = UP_ROUND(matmul_param_->row_, C6NUM);
  matmul_param_->row_12_ = UP_ROUND(matmul_param_->row_, C12NUM);
  matmul_param_->row_align_ = UP_ROUND(matmul_param_->row_, row_tile_);
  matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM);
  matmul_param_->col_align_ = UP_ROUND(matmul_param_->col_, col_tile_);
  matmul_param_->act_type_ = conv_param_->act_type_;
  return;
}
@@ -76,20 +73,6 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
  auto input_channel = filter_tensor->Channel();
  auto output_channel = filter_tensor->Batch();
#ifdef ENABLE_AVX
  row_tile_ = C6NUM;
  col_tile_ = C16NUM;
#elif defined(ENABLE_SSE)
  row_tile_ = C4NUM;
  col_tile_ = C8NUM;
#elif defined(ENABLE_ARM32)
  row_tile_ = C12NUM;
  col_tile_ = C4NUM;
#else
  row_tile_ = C12NUM;
  col_tile_ = C8NUM;
#endif
  if (in_tensors_.size() == 3) {
    int size = UP_ROUND(output_channel, col_tile_) * sizeof(float);
    int weight_size = output_channel * sizeof(float);
@@ -146,6 +129,19 @@ int Convolution1x1CPUKernel::InitConv1x1Param() {
}
int Convolution1x1CPUKernel::Init() {
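  // Tile selection per ISA; these must match the packing routines used elsewhere in
  // this kernel (AVX packs 6-row / 16-col tiles, SSE 4/8, ARM32 12/4, default 12/8).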
#ifdef ENABLE_AVX
  row_tile_ = C6NUM;
  col_tile_ = C16NUM;
#elif defined(ENABLE_SSE)
  row_tile_ = C4NUM;
  col_tile_ = C8NUM;
#elif defined(ENABLE_ARM32)
  row_tile_ = C12NUM;
  col_tile_ = C4NUM;
#else
  row_tile_ = C12NUM;
  col_tile_ = C8NUM;
#endif
  matmul_param_ = new (std::nothrow) MatMulParameter;
  if (matmul_param_ == nullptr) {
    MS_LOG(ERROR) << "Memory allocation failed";
@@ -270,20 +266,6 @@ void Convolution1x1CPUKernel::PackWeight() {
  auto input_channel = filter_tensor->Channel();
  auto output_channel = filter_tensor->Batch();
#ifdef ENABLE_AVX
  row_tile_ = C6NUM;
  col_tile_ = C16NUM;
#elif defined(ENABLE_SSE)
  row_tile_ = C4NUM;
  col_tile_ = C8NUM;
#elif defined(ENABLE_ARM32)
  row_tile_ = C12NUM;
  col_tile_ = C4NUM;
#else
  row_tile_ = C12NUM;
  col_tile_ = C8NUM;
#endif
  int size = input_channel * UP_ROUND(output_channel, col_tile_) * sizeof(float);
  int down_size = input_channel * DOWN_DIV(output_channel, col_tile_) * col_tile_ * sizeof(float);
  memset(reinterpret_cast<char *>(weight_ptr_) + down_size, 0, size - down_size);
@@ -21,228 +21,56 @@
using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_INVALID_OP_ATTR;
using mindspore::lite::RET_MEMORY_FAILED;
using mindspore::lite::RET_NULL_PTR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_FullConnection;
namespace mindspore::kernel {
FullconnectionCPUKernel::~FullconnectionCPUKernel() {
  FreeBuf();
  return;
}
void FullconnectionCPUKernel::FreeBuf() {
  if (a_pack_ptr_ != nullptr) {
    free(a_pack_ptr_);
    a_pack_ptr_ = nullptr;
  }
  if (b_pack_ptr_ != nullptr) {
    free(b_pack_ptr_);
    b_pack_ptr_ = nullptr;
  }
  if (bias_ptr_ != nullptr) {
    free(bias_ptr_);
    bias_ptr_ = nullptr;
  }
}
int FullconnectionCPUKernel::ReSize() {
  FreeBuf();
  int row = 1;
  for (size_t i = 0; i < out_tensors_.at(0)->shape().size() - 1; ++i) {
    row *= (out_tensors_.at(0)->shape())[i];
  }
  fc_param_->row_ = row;
  fc_param_->col_ = out_tensors_.at(0)->shape().back();
  fc_param_->deep_ = (in_tensors_.at(1)->shape()).at(1);
#ifdef ENABLE_AVX
  int col_tile = C16NUM;
#elif defined(ENABLE_ARM32)
  int col_tile = C4NUM;
#else
  int col_tile = C8NUM;
#endif
  fc_param_->row_12_ = UP_ROUND(fc_param_->row_, C12NUM);
  fc_param_->col_align_ = UP_ROUND(fc_param_->col_, col_tile);
  fc_param_->row_6_ = UP_ROUND(fc_param_->row_, C6NUM);
  fc_param_->row_4_ = UP_ROUND(fc_param_->row_, C4NUM);
  thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(fc_param_->col_align_, col_tile));
  thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_align_, col_tile), thread_count_);
int FullconnectionCPUKernel::Init() {
  MatmulFp32BaseCPUKernel::InitParameter();
#ifdef ENABLE_ARM
  if (fc_param_->row_ == 1) {
    is_vector_input_ = true;
  } else {
    is_vector_input_ = false;
  }
#endif
  if (in_tensors_.size() == 3) {
    int col_tmp = is_vector_input_ ? fc_param_->col_ : fc_param_->col_align_;
    bias_ptr_ = reinterpret_cast<float *>(malloc(col_tmp * sizeof(float)));
    if (bias_ptr_ == nullptr) {
      MS_LOG(ERROR) << "malloc bias_ptr_ failed";
      return RET_ERROR;
    }
    memcpy(bias_ptr_, in_tensors_[2]->MutableData(), fc_param_->col_ * sizeof(float));
  if (params_->a_const_ == true) {
    auto a_shape = in_tensors_.at(0)->shape();
    params_->row_ = a_shape[0];
    params_->deep_ = a_shape[1];
  }
#ifdef ENABLE_AVX
  int row_tmp = is_vector_input_ ? 1 : fc_param_->row_6_;
#elif defined(ENABLE_SSE)
  int row_tmp = is_vector_input_ ? 1 : fc_param_->row_4_;
#else
  int row_tmp = is_vector_input_ ? 1 : fc_param_->row_12_;
#endif
  a_pack_ptr_ = reinterpret_cast<float *>(malloc(row_tmp * fc_param_->deep_ * sizeof(float)));
  if (a_pack_ptr_ == nullptr) {
    return RET_MEMORY_FAILED;
  if (params_->b_const_ == true) {
    auto b_shape = in_tensors_.at(1)->shape();
    params_->col_ = b_shape[0];
    params_->deep_ = b_shape[1];
  }
  memset(a_pack_ptr_, 0, row_tmp * fc_param_->deep_ * sizeof(float));
  int col_tmp = is_vector_input_ ? fc_param_->col_ : fc_param_->col_align_;
  b_pack_ptr_ = reinterpret_cast<float *>(malloc(col_tmp * fc_param_->deep_ * sizeof(float)));
  if (b_pack_ptr_ == nullptr) {
    FreeBuf();
    return RET_MEMORY_FAILED;
  }
  memset(b_pack_ptr_, 0, col_tmp * fc_param_->deep_ * sizeof(float));
  params_->batch = 1;
  params_->a_transpose_ = false;
  params_->b_transpose_ = true;
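  // A fully connected layer is a single-batch matmul whose weights are stored as
  // (col, deep), hence the fixed batch of 1 and b_transpose_ = true.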
  fc_param_->a_const_ = (in_tensors_.at(0)->data_c() != nullptr);
  fc_param_->b_const_ = (in_tensors_.at(1)->data_c() != nullptr);
  if (fc_param_->a_const_) {
    InitMatrixA(reinterpret_cast<float *>(in_tensors_.at(0)->MutableData()), a_pack_ptr_);
    a_ptr_ = a_pack_ptr_;
  }
  if (fc_param_->b_const_) {
    InitMatrixB(reinterpret_cast<float *>(in_tensors_.at(1)->MutableData()), b_pack_ptr_);
    b_ptr_ = b_pack_ptr_;
  auto ret = MatmulFp32BaseCPUKernel::Init();
  if (ret != RET_OK) {
    return ret;
  }
  return RET_OK;
}
int FullconnectionCPUKernel::Init() {
  if (!InferShapeDone()) {
    return RET_OK;
  }
  return ReSize();
}
void FullconnectionCPUKernel::InitMatrixA(const float *src_ptr, float *dst_ptr) {
  if (is_vector_input_) {
    memcpy(dst_ptr, src_ptr, fc_param_->deep_ * sizeof(float));
    return;
  }
#ifdef ENABLE_AVX
  RowMajor2Col6Major(src_ptr, a_pack_ptr_, fc_param_->row_, fc_param_->deep_);
#elif defined(ENABLE_SSE)
  RowMajor2Col4Major(src_ptr, a_pack_ptr_, fc_param_->row_, fc_param_->deep_);
#else
  RowMajor2Col12Major(src_ptr, a_pack_ptr_, fc_param_->row_, fc_param_->deep_);
#endif
}
void FullconnectionCPUKernel::InitMatrixB(const float *src_ptr, float *dst_ptr) {
  if (is_vector_input_) {
    memcpy(dst_ptr, src_ptr, fc_param_->col_ * fc_param_->deep_ * sizeof(float));
    return;
  }
#ifdef ENABLE_AVX
  RowMajor2Col16Major(src_ptr, dst_ptr, fc_param_->col_, fc_param_->deep_);
#elif defined(ENABLE_ARM32)
  RowMajor2Col4Major(src_ptr, dst_ptr, fc_param_->col_, fc_param_->deep_);
#else
  RowMajor2Col8Major(src_ptr, dst_ptr, fc_param_->col_, fc_param_->deep_);
#endif
}
int FcFp32MatmulRun(void *cdata, int task_id) {
  auto fc = reinterpret_cast<FullconnectionCPUKernel *>(cdata);
  auto error_code = fc->DoMatmul(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "FcFp32MatmulRun error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}
int FullconnectionCPUKernel::DoMatmul(int task_id) {
#ifdef ENABLE_AVX
  int col_tile = C16NUM;
#elif defined(ENABLE_ARM32)
  int col_tile = C4NUM;
#else
  int col_tile = C8NUM;
#endif
  int cur_oc = MSMIN(thread_stride_ * col_tile, fc_param_->col_ - task_id * thread_stride_ * col_tile);
  if (cur_oc <= 0) {
    return RET_OK;
  }
  auto b = b_ptr_ + task_id * thread_stride_ * col_tile * fc_param_->deep_;
  auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + task_id * thread_stride_ * col_tile;
  auto c = c_ptr_ + task_id * thread_stride_ * col_tile;
  if (is_vector_input_) {
    MatVecMul(a_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, cur_oc);
  } else {
    MatMulOpt(a_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_,
              OutType_Nhwc);
int FullconnectionCPUKernel::ReSize() {
  int row = 1;
  for (size_t i = 0; i < out_tensors_.at(0)->shape().size() - 1; ++i) {
    row *= (out_tensors_.at(0)->shape())[i];
  }
  params_->row_ = row;
  params_->col_ = out_tensors_.at(0)->shape().back();
  params_->deep_ = (in_tensors_.at(1)->shape()).at(1);
  return RET_OK;
  return MatmulFp32BaseCPUKernel::ReSize();
}
int FullconnectionCPUKernel::Run() {
  auto a_ptr = reinterpret_cast<float *>(in_tensors_.at(0)->data_c());
  auto b_ptr = reinterpret_cast<float *>(in_tensors_.at(1)->data_c());
  c_ptr_ = reinterpret_cast<float *>(out_tensors_.at(0)->data_c());
  if (!fc_param_->a_const_) {
    if (is_vector_input_) {
      a_ptr_ = a_ptr;
    } else {
      InitMatrixA(a_ptr, a_pack_ptr_);
      a_ptr_ = a_pack_ptr_;
    }
  }
  if (!fc_param_->b_const_) {
    if (is_vector_input_) {
      b_ptr_ = b_ptr;
    } else {
      InitMatrixB(b_ptr, b_pack_ptr_);
      b_ptr_ = b_pack_ptr_;
    }
  }
  ParallelLaunch(this->context_->thread_pool_, FcFp32MatmulRun, this, thread_count_);
  MatmulFp32BaseCPUKernel::Run();
  return RET_OK;
}
kernel::LiteKernel *CpuFullConnectionFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
                                                       const std::vector<lite::Tensor *> &outputs,
                                                       OpParameter *opParameter, const lite::InnerContext *ctx,
                                                       const kernel::KernelKey &desc,
                                                       const mindspore::lite::PrimitiveC *primitive) {
  MS_ASSERT(opParameter != nullptr);
  MS_ASSERT(desc.type == schema::PrimitiveType_FullConnection);
  auto kernel = new (std::nothrow) FullconnectionCPUKernel(opParameter, inputs, outputs, ctx, primitive);
  if (!kernel) {
    MS_LOG(ERROR) << "kernel is nullptr.";
    free(opParameter);
    return nullptr;
  }
  auto ret = kernel->Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
    delete kernel;
    return nullptr;
  }
  return kernel;
}
REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_FullConnection, CpuFullConnectionFp32KernelCreator)
REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_FullConnection, LiteKernelCreator<FullconnectionCPUKernel>)
}  // namespace mindspore::kernel
@@ -21,43 +21,19 @@
#include "include/context.h"
#include "include/errorcode.h"
#include "nnacl/fp32/matmul_fp32.h"
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/fp32/matmul_fp32_base.h"
using mindspore::lite::InnerContext;
namespace mindspore::kernel {
class FullconnectionCPUKernel : public LiteKernel {
class FullconnectionCPUKernel : public MatmulFp32BaseCPUKernel {
 public:
  FullconnectionCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                          const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
                          const std::vector<lite::Tensor *> &outputs, const mindspore::lite::InnerContext *ctx,
                          const mindspore::lite::PrimitiveC *primitive)
      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
    fc_param_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
  }
  ~FullconnectionCPUKernel() override;
      : MatmulFp32BaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
  ~FullconnectionCPUKernel() = default;
  int Init() override;
  int ReSize() override;
  int Run() override;
 public:
  int DoMatmul(int task_id);
  void FreeBuf();
 private:
  void InitMatrixA(const float *src_ptr, float *dst_ptr);
  void InitMatrixB(const float *src_ptr, float *dst_ptr);
 private:
  MatMulParameter *fc_param_ = nullptr;
  float *a_pack_ptr_ = nullptr;
  float *b_pack_ptr_ = nullptr;
  float *c_ptr_ = nullptr;
  float *bias_ptr_ = nullptr;
  float *a_ptr_ = nullptr;
  float *b_ptr_ = nullptr;
  bool is_vector_input_ = false;
  int thread_count_ = 1;
  int thread_stride_ = 0;
};
}  // namespace mindspore::kernel
#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_FULLCONNECTION_H_
@@ -17,47 +17,53 @@
#include "src/runtime/kernel/arm/fp32/matmul_fp32.h"
#include "include/errorcode.h"
#include "nnacl/fp32/matmul_fp32.h"
#include "src/runtime/runtime_api.h"
#include "src/kernel_registry.h"
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_INPUT_TENSOR_ERROR;
using mindspore::lite::RET_MEMORY_FAILED;
using mindspore::lite::RET_OK;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_MatMul;
namespace mindspore::kernel {
MatmulCPUKernel::~MatmulCPUKernel() {
  if (a_pack_ptr_ != nullptr) {
    free(a_pack_ptr_);
    a_pack_ptr_ = nullptr;
  }
  if (b_pack_ptr_ != nullptr) {
    free(b_pack_ptr_);
    b_pack_ptr_ = nullptr;
int MatmulCPUKernel::Init() {
  MatmulFp32BaseCPUKernel::InitParameter();
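  // For constant operands the shapes are already known here, so batch/row/col/deep
  // can be fixed in Init() instead of waiting for ReSize().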
| if (params_->a_const_ == true) { | |||
| auto a_shape = in_tensors_.at(0)->shape(); | |||
| int batch = 1; | |||
| for (size_t i = 0; i < a_shape.size() - 2; ++i) { | |||
| batch *= a_shape[i]; | |||
| } | |||
| params_->batch = batch; | |||
| params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2]; | |||
| params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1]; | |||
| } | |||
| if (bias_ptr_ != nullptr) { | |||
| free(bias_ptr_); | |||
| bias_ptr_ = nullptr; | |||
| if (params_->b_const_ == true) { | |||
| auto b_shape = in_tensors_.at(1)->shape(); | |||
| int batch = 1; | |||
| for (size_t i = 0; i < b_shape.size() - 2; ++i) { | |||
| batch *= b_shape[i]; | |||
| } | |||
| params_->batch = batch; | |||
| params_->col_ = params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1]; | |||
| params_->deep_ = params_->b_transpose_ ? b_shape[b_shape.size() - 1] : b_shape[b_shape.size() - 2]; | |||
| } | |||
| } | |||
| void MatmulCPUKernel::FreeTmpBuffer() { | |||
| if (a_pack_ptr_ != nullptr) { | |||
| params_->a_const_ ? free(a_pack_ptr_) : context_->allocator->Free(a_pack_ptr_); | |||
| a_pack_ptr_ = nullptr; | |||
| auto ret = MatmulFp32BaseCPUKernel::Init(); | |||
| if (ret != RET_OK) { | |||
| return ret; | |||
| } | |||
| if (b_pack_ptr_ != nullptr) { | |||
| params_->b_const_ ? free(b_pack_ptr_) : context_->allocator->Free(b_pack_ptr_); | |||
| b_pack_ptr_ = nullptr; | |||
| if (!InferShapeDone()) { | |||
| return RET_OK; | |||
| } | |||
| return ReSize(); | |||
| } | |||
| int MatmulCPUKernel::MallocMatrixABuffer() { | |||
| int MatmulCPUKernel::ReSize() { | |||
| auto a_shape = in_tensors_.at(0)->shape(); | |||
| auto b_shape = in_tensors_.at(1)->shape(); | |||
| int batch = 1; | |||
| MS_ASSERT(a_shape.size() >= 2); | |||
| for (size_t i = 0; i < a_shape.size() - 2; ++i) { | |||
| @@ -65,307 +71,34 @@ int MatmulCPUKernel::MallocMatrixABuffer() { | |||
| } | |||
| params_->batch = batch; | |||
| params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2]; | |||
| #ifdef ENABLE_ARM | |||
| if (params_->a_init_shape_ && params_->row_ == 1) { | |||
| is_vector_a_ = true; | |||
| } else { | |||
| is_vector_a_ = false; | |||
| } | |||
| #endif | |||
| params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1]; | |||
| #ifdef ENABLE_AVX | |||
| params_->row_align_ = UP_ROUND(params_->row_, C6NUM); | |||
| #elif defined(ENABLE_SSE) | |||
| params_->row_align_ = UP_ROUND(params_->row_, C4NUM); | |||
| #else | |||
| params_->row_align_ = UP_ROUND(params_->row_, C12NUM); | |||
| #endif | |||
| int row_tmp = is_vector_a_ ? 1 : params_->row_align_; | |||
| if (params_->a_const_) { | |||
| a_pack_ptr_ = reinterpret_cast<float *>(malloc(params_->batch * row_tmp * params_->deep_ * sizeof(float))); | |||
| } else { | |||
| a_pack_ptr_ = | |||
| reinterpret_cast<float *>(context_->allocator->Malloc(params_->batch * row_tmp * params_->deep_ * sizeof(float))); | |||
| } | |||
| if (a_pack_ptr_ == nullptr) { | |||
| FreeTmpBuffer(); | |||
| return RET_MEMORY_FAILED; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int MatmulCPUKernel::MallocMatrixBBuffer() { | |||
| auto b_shape = in_tensors_.at(1)->shape(); | |||
| if (b_shape.empty()) { | |||
| return RET_OK; | |||
| } | |||
| int batch = 1; | |||
| MS_ASSERT(b_shape.size() >= 2); | |||
| for (size_t i = 0; i < b_shape.size() - 2; ++i) { | |||
| batch *= b_shape[i]; | |||
| } | |||
| params_->batch = batch; | |||
| params_->col_ = params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1]; | |||
| params_->col_align_ = UP_ROUND(params_->col_, col_tile_); | |||
| params_->deep_ = params_->b_transpose_ ? b_shape[b_shape.size() - 1] : b_shape[b_shape.size() - 2]; | |||
| int col_tmp = is_vector_a_ ? params_->col_ : params_->col_align_; | |||
| if (params_->b_const_) { | |||
| b_pack_ptr_ = reinterpret_cast<float *>(malloc(params_->batch * col_tmp * params_->deep_ * sizeof(float))); | |||
| } else { | |||
| b_pack_ptr_ = | |||
| reinterpret_cast<float *>(context_->allocator->Malloc(params_->batch * col_tmp * params_->deep_ * sizeof(float))); | |||
| } | |||
| if (b_pack_ptr_ == nullptr) { | |||
| FreeTmpBuffer(); | |||
| return RET_MEMORY_FAILED; | |||
| } | |||
| thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_align_, col_tile_)); | |||
| thread_stride_ = UP_DIV(UP_DIV(params_->col_align_, col_tile_), thread_count_); | |||
| return RET_OK; | |||
| } | |||
| params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1]; | |||
| int MatmulCPUKernel::InitBias() { | |||
| auto b_shape = in_tensors_.at(1)->shape(); | |||
| auto c_shape = out_tensors_.at(0)->shape(); | |||
| params_->col_ = params_->b_const_ | |||
| ? (params_->b_transpose_ ? b_shape.at(b_shape.size() - 2) : b_shape.at(b_shape.size() - 1)) | |||
| : (c_shape.at(c_shape.size() - 1)); | |||
| params_->col_align_ = UP_ROUND(params_->col_, col_tile_); | |||
| auto col_tmp = is_vector_a_ ? params_->col_ : params_->col_align_; | |||
| if (bias_ptr_ == nullptr) { | |||
| bias_ptr_ = reinterpret_cast<float *>(malloc(col_tmp * sizeof(float))); | |||
| if (bias_ptr_ == nullptr) { | |||
| FreeTmpBuffer(); | |||
| return RET_MEMORY_FAILED; | |||
| } | |||
| } | |||
| memset(bias_ptr_, 0, col_tmp * sizeof(float)); | |||
| if (in_tensors_.size() == 3) { | |||
| memcpy(bias_ptr_, in_tensors_[2]->data_c(), in_tensors_[2]->ElementsNum() * sizeof(float)); | |||
| } | |||
| return RET_OK; | |||
| return MatmulFp32BaseCPUKernel::ReSize(); | |||
| } | |||
| int MatmulCPUKernel::ReSize() { | |||
| if (!params_->b_const_) { | |||
| free(bias_ptr_); | |||
| bias_ptr_ = nullptr; | |||
| auto ret = InitBias(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Matmul fp32 init bias failed"; | |||
| int MatmulCPUKernel::Run() { | |||
| if (IsTrain()) { | |||
| if (RET_OK != InitBufferA()) { | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| return RET_OK; | |||
| } | |||
| void MatmulCPUKernel::InitMatrixA(const float *src_ptr, float *dst_ptr) { | |||
| if (is_vector_a_) { | |||
| memcpy(dst_ptr, src_ptr, params_->batch * params_->deep_ * sizeof(float)); | |||
| return; | |||
| } | |||
| InitMatrixA(reinterpret_cast<float *>(in_tensors_.at(0)->data_c())); | |||
| for (int i = 0; i < params_->batch; i++) { | |||
| const float *src = src_ptr + i * params_->deep_ * params_->row_; | |||
| float *dst = dst_ptr + i * params_->deep_ * params_->row_align_; | |||
| #ifdef ENABLE_AVX | |||
| if (params_->a_transpose_) { | |||
| RowMajor2Row6Major(src, dst, params_->deep_, params_->row_); | |||
| } else { | |||
| RowMajor2Col6Major(src, dst, params_->row_, params_->deep_); | |||
| } | |||
| #elif defined(ENABLE_SSE) | |||
| if (params_->a_transpose_) { | |||
| RowMajor2Row4Major(src, dst, params_->deep_, params_->row_); | |||
| } else { | |||
| RowMajor2Col4Major(src, dst, params_->row_, params_->deep_); | |||
| } | |||
| #else | |||
| if (params_->a_transpose_) { | |||
| RowMajor2Row12Major(src, dst, params_->deep_, params_->row_); | |||
| } else { | |||
| RowMajor2Col12Major(src, dst, params_->row_, params_->deep_); | |||
| } | |||
| #endif | |||
| } | |||
| return; | |||
| } | |||
| void MatmulCPUKernel::InitMatrixB(const float *src_ptr, float *dst_ptr) { | |||
| if (is_vector_a_) { | |||
| if (params_->b_transpose_) { | |||
| memcpy(dst_ptr, src_ptr, params_->batch * params_->col_ * params_->deep_ * sizeof(float)); | |||
| } else { | |||
| for (int i = 0; i < params_->batch; i++) { | |||
| const float *src = src_ptr + i * params_->deep_ * params_->col_; | |||
| float *dst = dst_ptr + i * params_->deep_ * params_->col_; | |||
| RowMajor2ColMajor(src, dst, params_->deep_, params_->col_); | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| for (int i = 0; i < params_->batch; i++) { | |||
| const float *src = src_ptr + i * params_->deep_ * params_->col_; | |||
| float *dst = dst_ptr + i * params_->deep_ * params_->col_align_; | |||
| #ifdef ENABLE_AVX | |||
| if (params_->b_transpose_) { | |||
| RowMajor2Col16Major(src, dst, params_->col_, params_->deep_); | |||
| } else { | |||
| RowMajor2Row16Major(src, dst, params_->deep_, params_->col_); | |||
| } | |||
| #elif defined(ENABLE_ARM32) | |||
| if (params_->b_transpose_) { | |||
| RowMajor2Col4Major(src, dst, params_->col_, params_->deep_); | |||
| } else { | |||
| RowMajor2Row4Major(src, dst, params_->deep_, params_->col_); | |||
| } | |||
| #else | |||
| if (params_->b_transpose_) { | |||
| RowMajor2Col8Major(src, dst, params_->col_, params_->deep_); | |||
| } else { | |||
| RowMajor2Row8Major(src, dst, params_->deep_, params_->col_); | |||
| } | |||
| #endif | |||
| } | |||
| return; | |||
| } | |||
| int MatmulCPUKernel::Init() { | |||
| #ifdef ENABLE_AVX | |||
| col_tile_ = C16NUM; | |||
| #elif defined(ENABLE_ARM32) | |||
| col_tile_ = C4NUM; | |||
| #else | |||
| col_tile_ = C8NUM; | |||
| #endif | |||
| params_->a_const_ = (in_tensors_.at(0)->data_c() != nullptr); | |||
| params_->b_const_ = (in_tensors_.at(1)->data_c() != nullptr); | |||
| if (params_->a_const_) { | |||
| auto ret = MallocMatrixABuffer(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Matmul fp32 malloc matrix A buffer failed"; | |||
| return RET_ERROR; | |||
| } | |||
| InitMatrixA(reinterpret_cast<float *>(in_tensors_.at(0)->data_c()), a_pack_ptr_); | |||
| a_ptr_ = a_pack_ptr_; | |||
| } | |||
| if (params_->b_const_) { | |||
| auto ret = MallocMatrixBBuffer(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Matmul fp32 malloc matrix B buffer failed"; | |||
| return RET_ERROR; | |||
| } | |||
| InitMatrixB(reinterpret_cast<float *>(in_tensors_.at(1)->data_c()), b_pack_ptr_); | |||
| b_ptr_ = b_pack_ptr_; | |||
| // init bias | |||
| ret = InitBias(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Matmul fp32 init bias failed"; | |||
| if (RET_OK != InitBufferB()) { | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| return RET_OK; | |||
| } | |||
| InitMatrixB(reinterpret_cast<float *>(in_tensors_.at(1)->data_c())); | |||
| int MatmulCPUKernel::RunImpl(int task_id) { | |||
| int cur_oc = MSMIN(thread_stride_ * col_tile_, params_->col_ - task_id * thread_stride_ * col_tile_); | |||
| if (cur_oc <= 0) { | |||
| return RET_OK; | |||
| FreeBiasBuf(); | |||
| InitBiasData(); | |||
| } | |||
| auto b = cur_b_ptr_ + task_id * thread_stride_ * col_tile_ * params_->deep_; | |||
| auto c = cur_c_ptr_ + task_id * thread_stride_ * col_tile_; | |||
| auto bias = bias_ptr_ ? bias_ptr_ + task_id * thread_stride_ * col_tile_ : NULL; | |||
| MS_ASSERT(cur_a_ptr_); | |||
| MS_ASSERT(b); | |||
| MS_ASSERT(c); | |||
| if (is_vector_a_) { | |||
| MatVecMul(cur_a_ptr_, b, c, bias, ActType_No, params_->deep_, cur_oc); | |||
| } else { | |||
| MatMulOpt(cur_a_ptr_, b, c, bias, ActType_No, params_->deep_, params_->row_, cur_oc, params_->col_, OutType_Nhwc); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int MatmulFloatRun(void *cdata, int task_id) { | |||
| auto op = reinterpret_cast<MatmulCPUKernel *>(cdata); | |||
| auto error_code = op->RunImpl(task_id); | |||
| if (error_code != RET_OK) { | |||
| MS_LOG(ERROR) << "MatmulFp32Run error task_id[" << task_id << "] error_code[" << error_code << "]"; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int MatmulCPUKernel::Run() { | |||
| auto a_src = reinterpret_cast<float *>(in_tensors_.at(0)->data_c()); | |||
| auto b_src = reinterpret_cast<float *>(in_tensors_.at(1)->data_c()); | |||
| auto c_src = reinterpret_cast<float *>(out_tensors_.at(0)->data_c()); | |||
| if (!params_->a_const_ || IsTrain()) { | |||
| if (a_pack_ptr_ != nullptr) { | |||
| params_->a_const_ ? free(a_pack_ptr_) : context_->allocator->Free(a_pack_ptr_); | |||
| a_pack_ptr_ = nullptr; | |||
| } | |||
| auto ret = MallocMatrixABuffer(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Matmul fp32 malloc matrix a buffer failed"; | |||
| return RET_ERROR; | |||
| } | |||
| if (is_vector_a_) { | |||
| a_ptr_ = a_src; | |||
| } else { | |||
| InitMatrixA(a_src, a_pack_ptr_); | |||
| a_ptr_ = a_pack_ptr_; | |||
| } | |||
| } | |||
| if (!params_->b_const_ || IsTrain()) { | |||
| if (b_pack_ptr_ != nullptr) { | |||
| params_->b_const_ ? free(b_pack_ptr_) : context_->allocator->Free(b_pack_ptr_); | |||
| b_pack_ptr_ = nullptr; | |||
| } | |||
| auto ret = MallocMatrixBBuffer(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Matmul fp32 malloc matrix b buffer failed"; | |||
| return RET_ERROR; | |||
| } | |||
| if (is_vector_a_ && params_->b_transpose_) { | |||
| b_ptr_ = b_src; | |||
| } else { | |||
| InitMatrixB(b_src, b_pack_ptr_); | |||
| b_ptr_ = b_pack_ptr_; | |||
| } | |||
| } | |||
| if (IsTrain()) { | |||
| InitBias(); | |||
| } | |||
| for (int i = 0; i < params_->batch; ++i) { | |||
| if (is_vector_a_) { | |||
| cur_a_ptr_ = a_ptr_ + i * params_->deep_; | |||
| cur_b_ptr_ = b_ptr_ + i * params_->deep_ * params_->col_; | |||
| cur_c_ptr_ = c_src + i * params_->row_ * params_->col_; | |||
| } else { | |||
| cur_a_ptr_ = a_ptr_ + i * params_->row_align_ * params_->deep_; | |||
| cur_b_ptr_ = b_ptr_ + i * params_->deep_ * params_->col_align_; | |||
| cur_c_ptr_ = c_src + i * params_->row_ * params_->col_; | |||
| } | |||
| auto ret = ParallelLaunch(this->context_->thread_pool_, MatmulFloatRun, this, thread_count_); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Matmul fp32 run function MatmulFloatRun failed"; | |||
| FreeTmpBuffer(); | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| if (!params_->a_const_ || IsTrain()) { | |||
| params_->a_const_ ? free(a_pack_ptr_) : context_->allocator->Free(a_pack_ptr_); | |||
| a_pack_ptr_ = nullptr; | |||
| } | |||
| if (!params_->b_const_ || IsTrain()) { | |||
| params_->b_const_ ? free(b_pack_ptr_) : context_->allocator->Free(b_pack_ptr_); | |||
| b_pack_ptr_ = nullptr; | |||
| } | |||
| return RET_OK; | |||
| @@ -376,61 +109,24 @@ int MatmulCPUKernel::Eval() { | |||
| auto a_src = reinterpret_cast<float *>(in_tensors_.at(0)->data_c()); | |||
| auto b_src = reinterpret_cast<float *>(in_tensors_.at(1)->data_c()); | |||
| LiteKernel::Eval(); | |||
| if (params_->a_const_) { | |||
| if (a_pack_ptr_ == nullptr) { | |||
| auto ret = MallocMatrixABuffer(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Matmul fp32 malloc matrix a buffer failed"; | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| if (is_vector_a_) { | |||
| a_ptr_ = a_src; | |||
| } else { | |||
| InitMatrixA(a_src, a_pack_ptr_); | |||
| a_ptr_ = a_pack_ptr_; | |||
| } | |||
| } | |||
| if (params_->b_const_) { | |||
| if (b_pack_ptr_ == nullptr) { | |||
| auto ret = MallocMatrixBBuffer(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Matmul fp32 malloc matrix b buffer failed"; | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| if (is_vector_a_ && params_->b_transpose_) { | |||
| b_ptr_ = b_src; | |||
| } else { | |||
| InitMatrixB(b_src, b_pack_ptr_); | |||
| b_ptr_ = b_pack_ptr_; | |||
| } | |||
| } | |||
| InitBias(); | |||
| return RET_OK; | |||
| } | |||
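| // Note: Eval() re-packs the constant operands even though Init() already packed them, presumably | |||
| // because a training step may have updated the underlying weight tensors in place. | |||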
| kernel::LiteKernel *CpuMatmulFp32KernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | |||
| const lite::InnerContext *ctx, const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| MS_ASSERT(opParameter != nullptr); | |||
| MS_ASSERT(desc.type == schema::PrimitiveType_MatMul); | |||
| auto kernel = new (std::nothrow) MatmulCPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "kernel is nullptr."; | |||
| free(opParameter); | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: " | |||
| << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_)); | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| } | |||
| REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_MatMul, CpuMatmulFp32KernelCreator) | |||
| } // namespace mindspore::kernel | |||
| @@ -19,47 +19,20 @@ | |||
| #include <vector> | |||
| #include "nnacl/matmul_parameter.h" | |||
| #include "src/lite_kernel.h" | |||
| #include "src/runtime/kernel/arm/fp32/matmul_fp32_base.h" | |||
| namespace mindspore::kernel { | |||
| class MatmulCPUKernel : public LiteKernel { | |||
| public: | |||
| explicit MatmulCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx, | |||
| const mindspore::lite::PrimitiveC *primitive) | |||
| : LiteKernel(parameter, inputs, outputs, ctx, primitive) { | |||
| params_ = reinterpret_cast<MatMulParameter *>(op_parameter_); | |||
| } | |||
| ~MatmulCPUKernel() override; | |||
| int Init() override; | |||
| int ReSize() override; | |||
| int Run() override; | |||
| int RunImpl(int task_id); | |||
| int Eval() override; | |||
| private: | |||
| int MallocMatrixABuffer(); | |||
| int MallocMatrixBBuffer(); | |||
| int InitBias(); | |||
| void InitMatrixA(const float *src_ptr, float *dst_ptr); | |||
| void InitMatrixB(const float *src_ptr, float *dst_ptr); | |||
| void FreeTmpBuffer(); | |||
| private: | |||
| MatMulParameter *params_ = nullptr; | |||
| float *a_pack_ptr_ = nullptr; | |||
| float *b_pack_ptr_ = nullptr; | |||
| float *bias_ptr_ = nullptr; | |||
| float *a_ptr_ = nullptr; | |||
| float *b_ptr_ = nullptr; | |||
| float *cur_a_ptr_ = nullptr; | |||
| float *cur_b_ptr_ = nullptr; | |||
| float *cur_c_ptr_ = nullptr; | |||
| bool is_vector_a_ = false; | |||
| int col_tile_ = 0; | |||
| int thread_stride_ = 0; | |||
| int thread_count_ = 0; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_H_ | |||
| @@ -0,0 +1,299 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/runtime/kernel/arm/fp32/matmul_fp32_base.h" | |||
| #include "nnacl/fp32/matmul_fp32.h" | |||
| namespace mindspore::kernel { | |||
| int MatmulBaseFloatRun(void *cdata, int task_id) { | |||
| auto op = reinterpret_cast<MatmulFp32BaseCPUKernel *>(cdata); | |||
| auto error_code = op->FloatRun(task_id); | |||
| if (error_code != RET_OK) { | |||
| MS_LOG(ERROR) << "MatmulFp32Run error task_id[" << task_id << "] error_code[" << error_code << "]"; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
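| // MatmulBaseFloatRun is the C-style trampoline ParallelLaunch expects: the thread pool invokes it | |||
| // once per task_id in [0, thread_count_), and cdata carries `this` across the C interface. A sketch | |||
| // of the call site (illustrative only): | |||
| //   ParallelLaunch(pool, MatmulBaseFloatRun, kernel /* cdata */, kernel_thread_count); | |||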
| MatmulFp32BaseCPUKernel::~MatmulFp32BaseCPUKernel() { | |||
| FreeResizeBufA(); | |||
| FreeResizeBufB(); | |||
| FreeBiasBuf(); | |||
| return; | |||
| } | |||
| void MatmulFp32BaseCPUKernel::InitParameter() { | |||
| params_->a_const_ = (in_tensors_.at(0)->data_c() != nullptr); | |||
| params_->b_const_ = (in_tensors_.at(1)->data_c() != nullptr); | |||
| #ifdef ENABLE_AVX | |||
| row_tile_ = C6NUM; | |||
| col_tile_ = C16NUM; | |||
| #elif defined(ENABLE_ARM32) | |||
| row_tile_ = C12NUM; | |||
| col_tile_ = C4NUM; | |||
| #elif defined(ENABLE_SSE) | |||
| row_tile_ = C4NUM; | |||
| col_tile_ = C8NUM; | |||
| #else | |||
| row_tile_ = C12NUM; | |||
| col_tile_ = C8NUM; | |||
| #endif | |||
| return; | |||
| } | |||
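| // The (row_tile_ x col_tile_) pair (AVX 6x16, ARM32 12x4, SSE 4x8, otherwise 12x8) presumably | |||
| // mirrors the register blocking of the per-arch micro-kernel: A is padded to a multiple of | |||
| // row_tile_ rows and B to a multiple of col_tile_ columns to match it. | |||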
| void MatmulFp32BaseCPUKernel::ResizeParameter() { | |||
| // recompute the flag (rather than only setting it) so a ReSize away from row_ == 1 clears it | |||
| vec_matmul_ = (params_->row_ == 1 && !params_->b_const_); | |||
| params_->row_align_ = vec_matmul_ ? 1 : UP_ROUND(params_->row_, row_tile_); | |||
| params_->col_align_ = vec_matmul_ ? params_->col_ : UP_ROUND(params_->col_, col_tile_); | |||
| return; | |||
| } | |||
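| // UP_ROUND(x, y) in nnacl rounds x up to a multiple of y, i.e. ((x + y - 1) / y) * y. Worked | |||
| // example with hypothetical shapes: row_ = 10, row_tile_ = 12 gives row_align_ = 12; col_ = 20, | |||
| // col_tile_ = 8 gives col_align_ = 24. The row_ == 1 vector path skips the padding entirely. | |||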
| int MatmulFp32BaseCPUKernel::InitBufferA() { | |||
| if (a_pack_ptr_ != nullptr) { | |||
| return RET_OK; | |||
| } | |||
| // allocate via the context allocator: FreeResizeBufA() releases this buffer with context_->allocator->Free | |||
| a_pack_ptr_ = reinterpret_cast<float *>( | |||
| context_->allocator->Malloc(params_->batch * params_->row_align_ * params_->deep_ * sizeof(float))); | |||
| if (a_pack_ptr_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc a_pack_ptr_ failed"; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int MatmulFp32BaseCPUKernel::InitBufferB() { | |||
| if (b_pack_ptr_ != nullptr) { | |||
| return RET_OK; | |||
| } | |||
| // allocate via the context allocator: FreeResizeBufB() releases this buffer with context_->allocator->Free | |||
| b_pack_ptr_ = reinterpret_cast<float *>( | |||
| context_->allocator->Malloc(params_->batch * params_->col_align_ * params_->deep_ * sizeof(float))); | |||
| if (b_pack_ptr_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc b_pack_ptr_ failed"; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
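| // Packed-buffer sizing: A occupies batch * row_align_ * deep_ floats and B occupies | |||
| // batch * col_align_ * deep_ floats; in the vector case row_align_ is 1 and col_align_ equals | |||
| // col_, so neither side carries padding. | |||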
| int MatmulFp32BaseCPUKernel::InitBiasData() { | |||
| if (in_tensors_.size() == 3) { | |||
| auto bias_tensor = in_tensors_[2]; | |||
| int max_bias_data = UP_ROUND(bias_tensor->ElementsNum(), C16NUM); | |||
| bias_ptr_ = reinterpret_cast<float *>(malloc(max_bias_data * sizeof(float))); | |||
| if (bias_ptr_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc bias_ptr_ failed"; | |||
| return RET_ERROR; | |||
| } | |||
| // zero the UP_ROUND padding first so padded bias slots read as 0.0f rather than garbage | |||
| memset(bias_ptr_, 0, max_bias_data * sizeof(float)); | |||
| memcpy(bias_ptr_, bias_tensor->data_c(), bias_tensor->ElementsNum() * sizeof(float)); | |||
| } | |||
| return RET_OK; | |||
| } | |||
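| // The bias buffer is rounded up to C16NUM, the largest col_tile_ of any build (AVX), presumably | |||
| // so that bias_ptr_ + task_id * thread_stride_ * col_tile_ stays inside the allocation everywhere. | |||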
| int MatmulFp32BaseCPUKernel::InitMatrixA(const float *src_ptr) { | |||
| if (vec_matmul_) { | |||
| memcpy(a_pack_ptr_, src_ptr, params_->batch * params_->deep_ * sizeof(float)); | |||
| return RET_OK; | |||
| } | |||
| for (int i = 0; i < params_->batch; i++) { | |||
| const float *src = src_ptr + i * params_->deep_ * params_->row_; | |||
| float *dst = a_pack_ptr_ + i * params_->deep_ * params_->row_align_; | |||
| #ifdef ENABLE_AVX | |||
| if (params_->a_transpose_) { | |||
| RowMajor2Row6Major(src, dst, params_->deep_, params_->row_); | |||
| } else { | |||
| RowMajor2Col6Major(src, dst, params_->row_, params_->deep_); | |||
| } | |||
| #elif defined(ENABLE_SSE) | |||
| if (params_->a_transpose_) { | |||
| RowMajor2Row4Major(src, dst, params_->deep_, params_->row_); | |||
| } else { | |||
| RowMajor2Col4Major(src, dst, params_->row_, params_->deep_); | |||
| } | |||
| #else | |||
| if (params_->a_transpose_) { | |||
| RowMajor2Row12Major(src, dst, params_->deep_, params_->row_); | |||
| } else { | |||
| RowMajor2Col12Major(src, dst, params_->row_, params_->deep_); | |||
| } | |||
| #endif | |||
| } | |||
| return RET_OK; | |||
| } | |||
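| // Sketch of the packed-A layout these routines produce (assuming the usual nnacl convention): | |||
| // rows are grouped into panels of row_tile_ (12 here) and stored depth-major within a panel, | |||
| //   dst[(r / 12) * deep_ * 12 + d * 12 + (r % 12)] = a[r * deep_ + d], | |||
| // so the micro-kernel loads 12 rows of one depth step with a single contiguous read. | |||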
| int MatmulFp32BaseCPUKernel::InitMatrixB(const float *src_ptr) { | |||
| if (vec_matmul_) { | |||
| if (params_->b_transpose_) { | |||
| memcpy(b_pack_ptr_, src_ptr, params_->batch * params_->col_ * params_->deep_ * sizeof(float)); | |||
| } else { | |||
| for (int i = 0; i < params_->batch; i++) { | |||
| const float *src = src_ptr + i * params_->deep_ * params_->col_; | |||
| float *dst = b_pack_ptr_ + i * params_->deep_ * params_->col_; | |||
| RowMajor2ColMajor(src, dst, params_->deep_, params_->col_); | |||
| } | |||
| } | |||
| return RET_OK; | |||
| } | |||
| for (int i = 0; i < params_->batch; i++) { | |||
| const float *src = src_ptr + i * params_->deep_ * params_->col_; | |||
| float *dst = b_pack_ptr_ + i * params_->deep_ * params_->col_align_; | |||
| #ifdef ENABLE_AVX | |||
| if (params_->b_transpose_) { | |||
| RowMajor2Col16Major(src, dst, params_->col_, params_->deep_); | |||
| } else { | |||
| RowMajor2Row16Major(src, dst, params_->deep_, params_->col_); | |||
| } | |||
| #elif defined(ENABLE_ARM32) | |||
| if (params_->b_transpose_) { | |||
| RowMajor2Col4Major(src, dst, params_->col_, params_->deep_); | |||
| } else { | |||
| RowMajor2Row4Major(src, dst, params_->deep_, params_->col_); | |||
| } | |||
| #else | |||
| if (params_->b_transpose_) { | |||
| RowMajor2Col8Major(src, dst, params_->col_, params_->deep_); | |||
| } else { | |||
| RowMajor2Row8Major(src, dst, params_->deep_, params_->col_); | |||
| } | |||
| #endif | |||
| } | |||
| return RET_OK; | |||
| } | |||
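| // B gets the mirror-image layout with col_tile_ columns per panel (again a sketch of the assumed | |||
| // convention): dst[(c / tile) * deep_ * tile + d * tile + (c % tile)] = b[d * col_ + c]. The | |||
| // transposed and plain branches are two routes from the two source orders to that same layout. | |||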
| void MatmulFp32BaseCPUKernel::FreeBiasBuf() { | |||
| if (bias_ptr_ != nullptr) { | |||
| free(bias_ptr_); | |||
| bias_ptr_ = nullptr; | |||
| } | |||
| return; | |||
| } | |||
| void MatmulFp32BaseCPUKernel::FreeResizeBufA() { | |||
| if (a_pack_ptr_ != nullptr) { | |||
| context_->allocator->Free(a_pack_ptr_); | |||
| a_pack_ptr_ = nullptr; | |||
| } | |||
| return; | |||
| } | |||
| void MatmulFp32BaseCPUKernel::FreeResizeBufB() { | |||
| if (b_pack_ptr_ != nullptr) { | |||
| context_->allocator->Free(b_pack_ptr_); | |||
| b_pack_ptr_ = nullptr; | |||
| } | |||
| return; | |||
| } | |||
| int MatmulFp32BaseCPUKernel::FloatRun(int task_id) { | |||
| int cur_oc = MSMIN(thread_stride_ * col_tile_, params_->col_ - task_id * thread_stride_ * col_tile_); | |||
| if (cur_oc <= 0) { | |||
| return RET_OK; | |||
| } | |||
| auto b = batch_b_ptr_ + task_id * thread_stride_ * col_tile_ * params_->deep_; | |||
| auto c = batch_c_ptr_ + task_id * thread_stride_ * col_tile_; | |||
| auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + task_id * thread_stride_ * col_tile_; | |||
| if (vec_matmul_) { | |||
| MatVecMulFp32(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc); | |||
| } else { | |||
| MatMulOpt(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, params_->row_, cur_oc, params_->col_, | |||
| OutType_Nhwc); | |||
| } | |||
| return RET_OK; | |||
| } | |||
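| // Worked slice arithmetic with hypothetical numbers: col_ = 100, col_tile_ = 8 gives | |||
| // col_align_ = 104, i.e. 13 tiles. With thread_count_ = 4, thread_stride_ = UP_DIV(13, 4) = 4 | |||
| // tiles = 32 columns per task, so tasks start at columns 0/32/64/96 and task 3 computes | |||
| // cur_oc = MSMIN(32, 100 - 96) = 4; any task starting past col_ sees cur_oc <= 0 and exits early. | |||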
| int MatmulFp32BaseCPUKernel::Init() { | |||
| ResizeParameter(); | |||
| auto ret = InitBiasData(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "InitBiasData failed"; | |||
| return ret; | |||
| } | |||
| if (params_->a_const_) { | |||
| if (RET_OK != InitBufferA()) { | |||
| return RET_ERROR; | |||
| } | |||
| InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data_c())); | |||
| } | |||
| if (params_->b_const_) { | |||
| if (RET_OK != InitBufferB()) { | |||
| return RET_ERROR; | |||
| } | |||
| InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c())); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int MatmulFp32BaseCPUKernel::ReSize() { | |||
| ResizeParameter(); | |||
| thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_align_, col_tile_)); | |||
| thread_stride_ = UP_DIV(UP_DIV(params_->col_align_, col_tile_), thread_count_); | |||
| return RET_OK; | |||
| } | |||
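| // thread_count_ is capped by the number of col_tile_-wide tiles, and thread_stride_ is the tile | |||
| // count each task owns; the nested UP_DIV guarantees thread_count_ * thread_stride_ covers every | |||
| // tile even when the division is uneven. | |||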
| int MatmulFp32BaseCPUKernel::Run() { | |||
| auto a_ptr = reinterpret_cast<float *>(in_tensors_.at(0)->data_c()); | |||
| auto b_ptr = reinterpret_cast<float *>(in_tensors_.at(1)->data_c()); | |||
| c_ptr_ = reinterpret_cast<float *>(out_tensors_.at(0)->data_c()); | |||
| if (!params_->a_const_) { | |||
| if (RET_OK != InitBufferA()) { | |||
| return RET_ERROR; | |||
| } | |||
| InitMatrixA(a_ptr); | |||
| } | |||
| if (!params_->b_const_) { | |||
| if (RET_OK != InitBufferB()) { | |||
| FreeResizeBufA(); | |||
| return RET_ERROR; | |||
| } | |||
| InitMatrixB(b_ptr); | |||
| } | |||
| for (int i = 0; i < params_->batch; ++i) { | |||
| if (vec_matmul_) { | |||
| batch_a_ptr_ = a_pack_ptr_ + i * params_->deep_; | |||
| batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_; | |||
| batch_c_ptr_ = c_ptr_ + i * params_->row_ * params_->col_; | |||
| } else { | |||
| batch_a_ptr_ = a_pack_ptr_ + i * params_->row_align_ * params_->deep_; | |||
| batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_align_; | |||
| batch_c_ptr_ = c_ptr_ + i * params_->row_ * params_->col_; | |||
| } | |||
| auto ret = ParallelLaunch(this->context_->thread_pool_, MatmulBaseFloatRun, this, thread_count_); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "MatmulBaseFloatRun failed"; | |||
| return ret; | |||
| } | |||
| } | |||
| if (!params_->a_const_) { | |||
| FreeResizeBufA(); | |||
| } | |||
| if (!params_->b_const_) { | |||
| FreeResizeBufB(); | |||
| } | |||
| return RET_OK; | |||
| } | |||
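| // The per-batch strides differ between the two paths: the vector path walks the unpadded shapes | |||
| // (deep_ and deep_ * col_), while the matrix path walks the aligned strides (row_align_ * deep_ | |||
| // and deep_ * col_align_) because InitMatrixA/B packed each batch with padding. Non-const | |||
| // operands are re-packed on every Run() and their scratch buffers released afterwards. | |||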
| } // namespace mindspore::kernel | |||
| @@ -0,0 +1,77 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_FP32_BASE_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_FP32_BASE_H_ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "nnacl/matmul_parameter.h" | |||
| #include "include/errorcode.h" | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_MEMORY_FAILED; | |||
| using mindspore::lite::RET_OK; | |||
| namespace mindspore::kernel { | |||
| class MatmulFp32BaseCPUKernel : public LiteKernel { | |||
| public: | |||
| MatmulFp32BaseCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, const mindspore::lite::InnerContext *ctx, | |||
| const mindspore::lite::PrimitiveC *primitive) | |||
| : LiteKernel(parameter, inputs, outputs, ctx, primitive) { | |||
| params_ = reinterpret_cast<MatMulParameter *>(op_parameter_); | |||
| vec_matmul_ = false; | |||
| } | |||
| ~MatmulFp32BaseCPUKernel(); | |||
| int Init() override; | |||
| int ReSize() override; | |||
| int Run() override; | |||
| public: | |||
| int FloatRun(int task_id); | |||
| protected: | |||
| int InitBufferA(); | |||
| int InitBufferB(); | |||
| int InitMatrixA(const float *src_ptr); | |||
| int InitMatrixB(const float *src_ptr); | |||
| void FreeBiasBuf(); | |||
| int InitBiasData(); | |||
| void InitParameter(); | |||
| private: | |||
| void ResizeParameter(); | |||
| void FreeResizeBufA(); | |||
| void FreeResizeBufB(); | |||
| protected: | |||
| MatMulParameter *params_ = nullptr; | |||
| float *a_pack_ptr_ = nullptr; | |||
| float *b_pack_ptr_ = nullptr; | |||
| float *c_ptr_ = nullptr; | |||
| float *bias_ptr_ = nullptr; | |||
| float *batch_a_ptr_ = nullptr; | |||
| float *batch_b_ptr_ = nullptr; | |||
| float *batch_c_ptr_ = nullptr; | |||
| int col_tile_ = 0; | |||
| int row_tile_ = 0; | |||
| int thread_stride_ = 0; | |||
| int thread_count_ = 0; | |||
| bool vec_matmul_ = false; | |||
| }; | |||
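| // Hedged usage sketch (illustrative only, not part of this patch; the class name is hypothetical): | |||
| // a subclass reuses the packing and threading machinery by choosing tiles and then delegating. | |||
| //   class MyMatmulFp32Kernel : public MatmulFp32BaseCPUKernel { | |||
| //    public: | |||
| //     using MatmulFp32BaseCPUKernel::MatmulFp32BaseCPUKernel; | |||
| //     int Init() override { | |||
| //       InitParameter();                         // pick row_tile_/col_tile_ for this build | |||
| //       return MatmulFp32BaseCPUKernel::Init();  // pack whichever operands are already constant | |||
| //     } | |||
| //   }; | |||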
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_FP32_BASE_H_ | |||
| @@ -318,12 +318,7 @@ int Convolution1x1Int8CPUKernel::InitParam() { | |||
| matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_; | |||
| matmul_param_->deep_ = conv_param_->input_channel_; | |||
| matmul_param_->col_ = conv_param_->output_channel_; | |||
| matmul_param_->col_2_ = UP_ROUND(matmul_param_->col_, C2NUM); | |||
| matmul_param_->col_4_ = UP_ROUND(matmul_param_->col_, C4NUM); | |||
| matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM); | |||
| matmul_param_->col_16_ = UP_ROUND(matmul_param_->col_, C16NUM); | |||
| matmul_param_->row_4_ = UP_ROUND(matmul_param_->row_, C4NUM); | |||
| matmul_param_->row_8_ = UP_ROUND(matmul_param_->row_, C8NUM); | |||
| matmul_param_->deep_4_ = UP_ROUND(matmul_param_->deep_, C4NUM); | |||
| matmul_param_->deep_16_ = UP_ROUND(matmul_param_->deep_, C16NUM); | |||
| @@ -156,12 +156,8 @@ void FullconnectionInt8CPUKernel::InitParam() { | |||
| fc_param_->deep_ = (in_tensors_.at(1)->shape()).at(1); | |||
| fc_param_->row_4_ = UP_ROUND(fc_param_->row_, C4NUM); | |||
| fc_param_->row_8_ = UP_ROUND(fc_param_->row_, C8NUM); | |||
| fc_param_->col_2_ = UP_ROUND(fc_param_->col_, C2NUM); | |||
| fc_param_->col_4_ = UP_ROUND(fc_param_->col_, C4NUM); | |||
| fc_param_->col_8_ = UP_ROUND(fc_param_->col_, C8NUM); | |||
| fc_param_->col_16_ = UP_ROUND(fc_param_->col_, C16NUM); | |||
| fc_param_->deep_4_ = UP_ROUND(fc_param_->deep_, C4NUM); | |||
| fc_param_->deep_16_ = UP_ROUND(fc_param_->deep_, C16NUM); | |||
| thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(fc_param_->col_4_, C4NUM)); | |||