| @@ -80,25 +80,12 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| float64x2_t a##m##_k##offset_k = vld1q_dup_f64(&A_ELEMENT_K(m, offset_k)); | float64x2_t a##m##_k##offset_k = vld1q_dup_f64(&A_ELEMENT_K(m, offset_k)); | ||||
| #define LOAD_A1(m, offset_k) \ | #define LOAD_A1(m, offset_k) \ | ||||
| float64_t a##m##_k##offset_k = A_ELEMENT_K(m, offset_k); | float64_t a##m##_k##offset_k = A_ELEMENT_K(m, offset_k); | ||||
| #define VECTOR_LOAD_B_K2(n, offset_k) \ | |||||
| float64x2_t b##k##n##_k##offset_k = vld1q_f64(&B_ELEMENT_K(n, offset_k)); | |||||
| #define TRANSPOSE_B2_K2(n0, n1, offset_k0, offset_k1) \ | |||||
| float64x2_t b##n0##_k##offset_k0 = \ | |||||
| vzip1q_f64(b##k##n0##_k##offset_k0, b##k##n1##_k##offset_k0); \ | |||||
| float64x2_t b##n0##_k##offset_k1 = \ | |||||
| vzip2q_f64(b##k##n0##_k##offset_k0, b##k##n1##_k##offset_k0); | |||||
| #define SCALE_B2_K2(n0, offset_k0, offset_k1) \ | |||||
| svfloat64_t b##s##n0##_k##offset_k0 = svdup_neonq_f64(b##n0##_k##offset_k0); \ | |||||
| svfloat64_t b##s##n0##_k##offset_k1 = svdup_neonq_f64(b##n0##_k##offset_k1); | |||||
| #define GATHER_LOAD_B2(n, offset_k) \ | #define GATHER_LOAD_B2(n, offset_k) \ | ||||
| float64x2_t b##n##_k##offset_k = vdupq_n_f64(B_ELEMENT_K(n, offset_k)); \ | float64x2_t b##n##_k##offset_k = vdupq_n_f64(B_ELEMENT_K(n, offset_k)); \ | ||||
| b##n##_k##offset_k = \ | b##n##_k##offset_k = \ | ||||
| vsetq_lane_f64(B_ELEMENT_K(n + 1, offset_k), b##n##_k##offset_k, 1); | vsetq_lane_f64(B_ELEMENT_K(n + 1, offset_k), b##n##_k##offset_k, 1); | ||||
| #define VECTOR_UNPACK_B2(n, offset_k) \ | #define VECTOR_UNPACK_B2(n, offset_k) \ | ||||
| float64x2_t b##n##_k##offset_k = vld1q_f64(&PACK_ELEMENT_K(n, offset_k)); | float64x2_t b##n##_k##offset_k = vld1q_f64(&PACK_ELEMENT_K(n, offset_k)); | ||||
| #define VECTOR_PACK_B2(n, offset_k) \ | |||||
| vst1q_f64(&PACK_ELEMENT_K(n, offset_k), b##n##_k##offset_k); | |||||
| #define PACK_B0(n, offset_k) \ | #define PACK_B0(n, offset_k) \ | ||||
| PACK_ELEMENT_K(n, offset_k) = vget_lane_f64(b##n##_k##offset_k, 0); | PACK_ELEMENT_K(n, offset_k) = vget_lane_f64(b##n##_k##offset_k, 0); | ||||
| #define UPDATE_RESULT_VECTOR2(m, n, offset_k) \ | #define UPDATE_RESULT_VECTOR2(m, n, offset_k) \ | ||||
| @@ -128,9 +115,6 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| svfloat64_t b##s##n##_k##offset_k = svdup_f64(B_ELEMENT_K(n, offset_k)); | svfloat64_t b##s##n##_k##offset_k = svdup_f64(B_ELEMENT_K(n, offset_k)); | ||||
| #define VECTOR_LOAD_A(pg, m, offset_k) \ | #define VECTOR_LOAD_A(pg, m, offset_k) \ | ||||
| svfloat64_t a##s##m##_k##offset_k = svld1(pg, &A_ELEMENT_K(m, offset_k)); | svfloat64_t a##s##m##_k##offset_k = svld1(pg, &A_ELEMENT_K(m, offset_k)); | ||||
| #define QUADWORD_LOAD_B(n, offset_k) \ | |||||
| svfloat64_t b##s##n##_k##offset_k = \ | |||||
| svld1rq(pg_true, &B_ELEMENT_K(n, offset_k)); | |||||
| #define GATHER_LOAD_A(pg, m, offset_k) \ | #define GATHER_LOAD_A(pg, m, offset_k) \ | ||||
| svfloat64_t a##s##m##_k##offset_k = \ | svfloat64_t a##s##m##_k##offset_k = \ | ||||
| svld1_gather_index(pg, &A_ELEMENT_K(m, offset_k), lda_vec); | svld1_gather_index(pg, &A_ELEMENT_K(m, offset_k), lda_vec); | ||||
| @@ -226,7 +210,6 @@ CNAME(BLASLONG M, | |||||
| const BLASLONG v_m1 = M & -v_size; | const BLASLONG v_m1 = M & -v_size; | ||||
| const BLASLONG n4 = N & -4; | const BLASLONG n4 = N & -4; | ||||
| const BLASLONG n2 = N & -2; | const BLASLONG n2 = N & -2; | ||||
| const BLASLONG k2 = K & -2; | |||||
| const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | ||||
| FLOAT* packed_a = | FLOAT* packed_a = | ||||
| @@ -266,6 +249,7 @@ CNAME(BLASLONG M, | |||||
| if (LIKELY(packed_a != NULL)) { | if (LIKELY(packed_a != NULL)) { | ||||
| if (j == 0) { | if (j == 0) { | ||||
| for (; k < K; k++) { | for (; k < K; k++) { | ||||
| BROADCAST_LOAD_B(0, 0); | BROADCAST_LOAD_B(0, 0); | ||||
| GATHER_LOAD_A(pg_true, 0, 0); | GATHER_LOAD_A(pg_true, 0, 0); | ||||
| VECTOR_PACK_A(0, 0); | VECTOR_PACK_A(0, 0); | ||||
| @@ -285,6 +269,7 @@ CNAME(BLASLONG M, | |||||
| } | } | ||||
| } else { | } else { | ||||
| for (; k < K; k++) { | for (; k < K; k++) { | ||||
| BROADCAST_LOAD_B(0, 0); | BROADCAST_LOAD_B(0, 0); | ||||
| UNPACK_VECTOR_A(0, 0); | UNPACK_VECTOR_A(0, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | ||||
| @@ -345,6 +330,7 @@ CNAME(BLASLONG M, | |||||
| if (LIKELY(packed_a != NULL)) { | if (LIKELY(packed_a != NULL)) { | ||||
| for (; k < K; k++) { | for (; k < K; k++) { | ||||
| BROADCAST_LOAD_B(0, 0); | BROADCAST_LOAD_B(0, 0); | ||||
| UNPACK_VECTOR_A(0, 0); | UNPACK_VECTOR_A(0, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | ||||
| @@ -356,6 +342,7 @@ CNAME(BLASLONG M, | |||||
| } | } | ||||
| } else { | } else { | ||||
| for (; k < K; k++) { | for (; k < K; k++) { | ||||
| BROADCAST_LOAD_B(0, 0); | BROADCAST_LOAD_B(0, 0); | ||||
| GATHER_LOAD_A(pg_true, 0, 0); | GATHER_LOAD_A(pg_true, 0, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | ||||
| @@ -580,4 +567,4 @@ CNAME(BLASLONG M, | |||||
| free(packed_a); | free(packed_a); | ||||
| return 0; | return 0; | ||||
| } | |||||
| } | |||||
| @@ -237,6 +237,7 @@ CNAME(BLASLONG M, | |||||
| #endif | #endif | ||||
| { | { | ||||
| const uint64_t v_size = svcntw(); | const uint64_t v_size = svcntw(); | ||||
| const uint64_t v_size2 = v_size * 2; | |||||
| const svbool_t pg_true = svptrue_b32(); | const svbool_t pg_true = svptrue_b32(); | ||||
| const svbool_t pg_quad = svwhilelt_b32(0, 4); | const svbool_t pg_quad = svwhilelt_b32(0, 4); | ||||
| const svbool_t pg_first = svwhilelt_b32(0, 1); | const svbool_t pg_first = svwhilelt_b32(0, 1); | ||||
| @@ -245,10 +246,11 @@ CNAME(BLASLONG M, | |||||
| const svfloat32_t beta_vec = svdup_f32(beta); | const svfloat32_t beta_vec = svdup_f32(beta); | ||||
| #endif | #endif | ||||
| const BLASLONG n4 = N & -4; | const BLASLONG n4 = N & -4; | ||||
| const BLASLONG v_m2 = M & -v_size2; | |||||
| const BLASLONG v_m1 = M & -v_size; | const BLASLONG v_m1 = M & -v_size; | ||||
| const BLASLONG k4 = K & -4; | const BLASLONG k4 = K & -4; | ||||
| const int pack_b = M >= v_size && N >= 8 && K >= 8 ? 1 : 0; | |||||
| const int pack_b = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||||
| FLOAT* packed_b = | FLOAT* packed_b = | ||||
| (pack_b) ? packed_b = (FLOAT*)malloc(K * 4 * sizeof(FLOAT)) : NULL; | (pack_b) ? packed_b = (FLOAT*)malloc(K * 4 * sizeof(FLOAT)) : NULL; | ||||
| @@ -269,16 +271,21 @@ CNAME(BLASLONG M, | |||||
| CREATE_B_POINTER(3, 3); | CREATE_B_POINTER(3, 3); | ||||
| BLASLONG i = 0; | BLASLONG i = 0; | ||||
| for (; i < v_m1; i += v_size) { | |||||
| for (; i < v_m2; i += v_size2) { | |||||
| CREATE_A_POINTER(0, 0); | CREATE_A_POINTER(0, 0); | ||||
| UPDATE_A_POINTER(v_size); | |||||
| CREATE_A_POINTER(1, v_size); | |||||
| UPDATE_A_POINTER(v_size2); | |||||
| BLASLONG k = 0; | BLASLONG k = 0; | ||||
| DECLARE_RESULT_VECTOR(0, 0); | DECLARE_RESULT_VECTOR(0, 0); | ||||
| DECLARE_RESULT_VECTOR(0, 1); | DECLARE_RESULT_VECTOR(0, 1); | ||||
| DECLARE_RESULT_VECTOR(0, 2); | DECLARE_RESULT_VECTOR(0, 2); | ||||
| DECLARE_RESULT_VECTOR(0, 3); | DECLARE_RESULT_VECTOR(0, 3); | ||||
| DECLARE_RESULT_VECTOR(1, 0); | |||||
| DECLARE_RESULT_VECTOR(1, 1); | |||||
| DECLARE_RESULT_VECTOR(1, 2); | |||||
| DECLARE_RESULT_VECTOR(1, 3); | |||||
| if (LIKELY(packed_b != NULL)) { | if (LIKELY(packed_b != NULL)) { | ||||
| if (i == 0) { | if (i == 0) { | ||||
| @@ -314,6 +321,26 @@ CNAME(BLASLONG M, | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 3); | UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 3); | ||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 3); | UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 3); | ||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 3); | UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 3); | ||||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||||
| VECTOR_LOAD_A(pg_true, 1, 1); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 1); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 1); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 1); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 1); | |||||
| VECTOR_LOAD_A(pg_true, 1, 2); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 2); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 2); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 2); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 2); | |||||
| VECTOR_LOAD_A(pg_true, 1, 3); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 3); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 3); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 3); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 3); | |||||
| } | } | ||||
| for (; k < K; k++) { | for (; k < K; k++) { | ||||
| @@ -324,12 +351,17 @@ CNAME(BLASLONG M, | |||||
| BROADCAST_LOAD_B(1, 0); | BROADCAST_LOAD_B(1, 0); | ||||
| PACK_B(1, 0); | PACK_B(1, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | ||||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||||
| BROADCAST_LOAD_B(2, 0); | BROADCAST_LOAD_B(2, 0); | ||||
| PACK_B(2, 0); | PACK_B(2, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); | |||||
| BROADCAST_LOAD_B(3, 0); | BROADCAST_LOAD_B(3, 0); | ||||
| PACK_B(3, 0); | PACK_B(3, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); | |||||
| } | } | ||||
| } else { | } else { | ||||
| for (; k < K; k++) { | for (; k < K; k++) { | ||||
| @@ -340,11 +372,118 @@ CNAME(BLASLONG M, | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | ||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | ||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | ||||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||||
| } | } | ||||
| } | } | ||||
| } else { | } else { | ||||
| for (; k < k4; k += 4) { | for (; k < k4; k += 4) { | ||||
| VECTOR_LOAD_B_K4(0, 0); | |||||
| VECTOR_LOAD_B_K4(1, 0); | |||||
| VECTOR_LOAD_B_K4(2, 0); | |||||
| VECTOR_LOAD_B_K4(3, 0); | |||||
| TRANSPOSE_B4_K4(0, 1, 2, 3, 0, 1, 2, 3); | |||||
| SCALE_B4_K4(0, 0, 1, 2, 3); | |||||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||||
| VECTOR_LOAD_A(pg_true, 0, 1); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 1); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 1); | |||||
| VECTOR_LOAD_A(pg_true, 0, 2); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 2); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 2); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 2); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 2); | |||||
| VECTOR_LOAD_A(pg_true, 0, 3); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 3); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 3); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 3); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 3); | |||||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||||
| VECTOR_LOAD_A(pg_true, 1, 1); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 1); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 1); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 1); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 1); | |||||
| VECTOR_LOAD_A(pg_true, 1, 2); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 2); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 2); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 2); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 2); | |||||
| VECTOR_LOAD_A(pg_true, 1, 3); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 3); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 3); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 3); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 3); | |||||
| } | |||||
| for (; k < K; k++) { | |||||
| BROADCAST_LOAD_B(0, 0); | |||||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||||
| BROADCAST_LOAD_B(1, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||||
| BROADCAST_LOAD_B(2, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); | |||||
| BROADCAST_LOAD_B(3, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); | |||||
| } | |||||
| } | |||||
| VECTOR_STORE(pg_true, 0, 0); | |||||
| VECTOR_STORE(pg_true, 0, 1); | |||||
| VECTOR_STORE(pg_true, 0, 2); | |||||
| VECTOR_STORE(pg_true, 0, 3); | |||||
| VECTOR_STORE(pg_true, 1, 0); | |||||
| VECTOR_STORE(pg_true, 1, 1); | |||||
| VECTOR_STORE(pg_true, 1, 2); | |||||
| VECTOR_STORE(pg_true, 1, 3); | |||||
| INCR_C_POINTER(0, v_size2); | |||||
| INCR_C_POINTER(1, v_size2); | |||||
| INCR_C_POINTER(2, v_size2); | |||||
| INCR_C_POINTER(3, v_size2); | |||||
| } | |||||
| for (; i < v_m1; i += v_size) { | |||||
| CREATE_A_POINTER(0, 0); | |||||
| UPDATE_A_POINTER(v_size); | |||||
| BLASLONG k = 0; | |||||
| DECLARE_RESULT_VECTOR(0, 0); | |||||
| DECLARE_RESULT_VECTOR(0, 1); | |||||
| DECLARE_RESULT_VECTOR(0, 2); | |||||
| DECLARE_RESULT_VECTOR(0, 3); | |||||
| if (LIKELY(packed_b != NULL)) { | |||||
| for (; k < K; k++) { | |||||
| UNPACK_QUADWORD_B(0, 0); | |||||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||||
| } | |||||
| } else { | |||||
| for (; k < k4; k += 4) { | |||||
| VECTOR_LOAD_B_K4(0, 0); | VECTOR_LOAD_B_K4(0, 0); | ||||
| VECTOR_LOAD_B_K4(1, 0); | VECTOR_LOAD_B_K4(1, 0); | ||||
| VECTOR_LOAD_B_K4(2, 0); | VECTOR_LOAD_B_K4(2, 0); | ||||
| @@ -478,6 +617,28 @@ CNAME(BLASLONG M, | |||||
| CREATE_B_POINTER(0, 0); | CREATE_B_POINTER(0, 0); | ||||
| BLASLONG i = 0; | BLASLONG i = 0; | ||||
| for (; i < v_m2; i += v_size2) { | |||||
| CREATE_A_POINTER(0, 0); | |||||
| CREATE_A_POINTER(1, v_size); | |||||
| UPDATE_A_POINTER(v_size2); | |||||
| BLASLONG k = 0; | |||||
| DECLARE_RESULT_VECTOR(0, 0); | |||||
| DECLARE_RESULT_VECTOR(1, 0); | |||||
| for (; k < K; k++) { | |||||
| BROADCAST_LOAD_B(0, 0); | |||||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||||
| } | |||||
| VECTOR_STORE(pg_true, 0, 0); | |||||
| VECTOR_STORE(pg_true, 1, 0); | |||||
| INCR_C_POINTER(0, v_size2); | |||||
| } | |||||
| for (; i < v_m1; i += v_size) { | for (; i < v_m1; i += v_size) { | ||||
| CREATE_A_POINTER(0, 0); | CREATE_A_POINTER(0, 0); | ||||
| @@ -209,6 +209,7 @@ CNAME(BLASLONG M, | |||||
| #endif | #endif | ||||
| { | { | ||||
| const uint64_t v_size = svcntw(); | const uint64_t v_size = svcntw(); | ||||
| const uint64_t v_size2 = v_size * 2; | |||||
| const svbool_t pg_true = svptrue_b32(); | const svbool_t pg_true = svptrue_b32(); | ||||
| const svbool_t pg_quad = svwhilelt_b32(0, 4); | const svbool_t pg_quad = svwhilelt_b32(0, 4); | ||||
| const svbool_t pg_first = svwhilelt_b32(0, 1); | const svbool_t pg_first = svwhilelt_b32(0, 1); | ||||
| @@ -217,9 +218,10 @@ CNAME(BLASLONG M, | |||||
| const svfloat32_t beta_vec = svdup_f32(beta); | const svfloat32_t beta_vec = svdup_f32(beta); | ||||
| #endif | #endif | ||||
| const BLASLONG n4 = N & -4; | const BLASLONG n4 = N & -4; | ||||
| const BLASLONG v_m2 = M & -v_size2; | |||||
| const BLASLONG v_m1 = M & -v_size; | const BLASLONG v_m1 = M & -v_size; | ||||
| const int pack_b = M >= v_size && N >= 8 && K >= 8 ? 1 : 0; | |||||
| const int pack_b = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||||
| FLOAT* packed_b = | FLOAT* packed_b = | ||||
| (pack_b) ? packed_b = (FLOAT*)malloc(K * 4 * sizeof(FLOAT)) : NULL; | (pack_b) ? packed_b = (FLOAT*)malloc(K * 4 * sizeof(FLOAT)) : NULL; | ||||
| @@ -240,16 +242,21 @@ CNAME(BLASLONG M, | |||||
| CREATE_B_POINTER(3, 3); | CREATE_B_POINTER(3, 3); | ||||
| BLASLONG i = 0; | BLASLONG i = 0; | ||||
| for (; i < v_m1; i += v_size) { | |||||
| for (; i < v_m2; i += v_size2) { | |||||
| CREATE_A_POINTER(0, 0); | CREATE_A_POINTER(0, 0); | ||||
| UPDATE_A_POINTER(v_size); | |||||
| CREATE_A_POINTER(1, v_size); | |||||
| UPDATE_A_POINTER(v_size2); | |||||
| BLASLONG k = 0; | BLASLONG k = 0; | ||||
| DECLARE_RESULT_VECTOR(0, 0); | DECLARE_RESULT_VECTOR(0, 0); | ||||
| DECLARE_RESULT_VECTOR(0, 1); | DECLARE_RESULT_VECTOR(0, 1); | ||||
| DECLARE_RESULT_VECTOR(0, 2); | DECLARE_RESULT_VECTOR(0, 2); | ||||
| DECLARE_RESULT_VECTOR(0, 3); | DECLARE_RESULT_VECTOR(0, 3); | ||||
| DECLARE_RESULT_VECTOR(1, 0); | |||||
| DECLARE_RESULT_VECTOR(1, 1); | |||||
| DECLARE_RESULT_VECTOR(1, 2); | |||||
| DECLARE_RESULT_VECTOR(1, 3); | |||||
| if (LIKELY(packed_b != NULL)) { | if (LIKELY(packed_b != NULL)) { | ||||
| if (i == 0) { | if (i == 0) { | ||||
| @@ -262,6 +269,11 @@ CNAME(BLASLONG M, | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | ||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | ||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | ||||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||||
| } | } | ||||
| } else { | } else { | ||||
| for (; k < K; k++) { | for (; k < K; k++) { | ||||
| @@ -272,11 +284,66 @@ CNAME(BLASLONG M, | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | ||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | ||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | ||||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||||
| } | } | ||||
| } | } | ||||
| } else { | } else { | ||||
| for (; k < K; k++) { | for (; k < K; k++) { | ||||
| QUADWORD_LOAD_B(0, 0); | |||||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||||
| } | |||||
| } | |||||
| VECTOR_STORE(pg_true, 0, 0); | |||||
| VECTOR_STORE(pg_true, 0, 1); | |||||
| VECTOR_STORE(pg_true, 0, 2); | |||||
| VECTOR_STORE(pg_true, 0, 3); | |||||
| VECTOR_STORE(pg_true, 1, 0); | |||||
| VECTOR_STORE(pg_true, 1, 1); | |||||
| VECTOR_STORE(pg_true, 1, 2); | |||||
| VECTOR_STORE(pg_true, 1, 3); | |||||
| INCR_C_POINTER(0, v_size2); | |||||
| INCR_C_POINTER(1, v_size2); | |||||
| INCR_C_POINTER(2, v_size2); | |||||
| INCR_C_POINTER(3, v_size2); | |||||
| } | |||||
| for (; i < v_m1; i += v_size) { | |||||
| CREATE_A_POINTER(0, 0); | |||||
| UPDATE_A_POINTER(v_size); | |||||
| BLASLONG k = 0; | |||||
| DECLARE_RESULT_VECTOR(0, 0); | |||||
| DECLARE_RESULT_VECTOR(0, 1); | |||||
| DECLARE_RESULT_VECTOR(0, 2); | |||||
| DECLARE_RESULT_VECTOR(0, 3); | |||||
| if (LIKELY(packed_b != NULL)) { | |||||
| for (; k < K; k++) { | |||||
| UNPACK_QUADWORD_B(0, 0); | |||||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||||
| } | |||||
| } else { | |||||
| for (; k < K; k++) { | |||||
| QUADWORD_LOAD_B(0, 0); | QUADWORD_LOAD_B(0, 0); | ||||
| VECTOR_LOAD_A(pg_true, 0, 0); | VECTOR_LOAD_A(pg_true, 0, 0); | ||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | ||||
| @@ -346,6 +413,28 @@ CNAME(BLASLONG M, | |||||
| CREATE_B_POINTER(0, 0); | CREATE_B_POINTER(0, 0); | ||||
| BLASLONG i = 0; | BLASLONG i = 0; | ||||
| for (; i < v_m2; i += v_size2) { | |||||
| CREATE_A_POINTER(0, 0); | |||||
| CREATE_A_POINTER(1, v_size); | |||||
| UPDATE_A_POINTER(v_size2); | |||||
| BLASLONG k = 0; | |||||
| DECLARE_RESULT_VECTOR(0, 0); | |||||
| DECLARE_RESULT_VECTOR(1, 0); | |||||
| for (; k < K; k++) { | |||||
| BROADCAST_LOAD_B(0, 0); | |||||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||||
| } | |||||
| VECTOR_STORE(pg_true, 0, 0); | |||||
| VECTOR_STORE(pg_true, 1, 0); | |||||
| INCR_C_POINTER(0, v_size2); | |||||
| } | |||||
| for (; i < v_m1; i += v_size) { | for (; i < v_m1; i += v_size) { | ||||
| CREATE_A_POINTER(0, 0); | CREATE_A_POINTER(0, 0); | ||||
| @@ -69,7 +69,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| // #undef C_ELEMENT | // #undef C_ELEMENT | ||||
| // #define C_ELEMENT(m, n) C[(i+(m))+(j+(n))*ldc] | // #define C_ELEMENT(m, n) C[(i+(m))+(j+(n))*ldc] | ||||
| #define PACK_ELEMENT_K(m, offset_k) packed_a[(k + offset_k) * v_size + m] | |||||
| #define PACK_ELEMENT_K(m, offset_k) packed_a[(k + offset_k) * v_size2 + m] | |||||
| #define PACK_ELEMENT(m) PACK_ELEMENT_K(m, 0) | #define PACK_ELEMENT(m) PACK_ELEMENT_K(m, 0) | ||||
| // ASIMD | // ASIMD | ||||
| @@ -206,6 +206,7 @@ CNAME(BLASLONG M, | |||||
| #endif | #endif | ||||
| { | { | ||||
| const uint64_t v_size = svcntw(); | const uint64_t v_size = svcntw(); | ||||
| const uint64_t v_size2 = v_size * 2; | |||||
| const svbool_t pg_true = svptrue_b32(); | const svbool_t pg_true = svptrue_b32(); | ||||
| const svbool_t pg_quad = svwhilelt_b32(0, 4); | const svbool_t pg_quad = svwhilelt_b32(0, 4); | ||||
| const svbool_t pg_first = svwhilelt_b32(0, 1); | const svbool_t pg_first = svwhilelt_b32(0, 1); | ||||
| @@ -215,22 +216,25 @@ CNAME(BLASLONG M, | |||||
| #endif | #endif | ||||
| const svuint32_t lda_vec = svindex_u32(0LL, lda); | const svuint32_t lda_vec = svindex_u32(0LL, lda); | ||||
| const BLASLONG v_m2 = M & -v_size2; | |||||
| const BLASLONG v_m1 = M & -v_size; | const BLASLONG v_m1 = M & -v_size; | ||||
| const BLASLONG n4 = N & -4; | const BLASLONG n4 = N & -4; | ||||
| const int pack_a = M >= v_size && N >= 8 && K >= 8 ? 1 : 0; | |||||
| const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||||
| FLOAT* packed_a = | FLOAT* packed_a = | ||||
| (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size * sizeof(FLOAT)) : NULL; | |||||
| (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | |||||
| FLOAT* a_offset = A; | FLOAT* a_offset = A; | ||||
| FLOAT* b_offset = B; | FLOAT* b_offset = B; | ||||
| FLOAT* c_offset = C; | FLOAT* c_offset = C; | ||||
| BLASLONG i = 0; | BLASLONG i = 0; | ||||
| for (; i < v_m1; i += v_size) { | |||||
| for (; i < v_m2; i += v_size2) { | |||||
| CREATE_C_POINTER(0, 0); | CREATE_C_POINTER(0, 0); | ||||
| CREATE_C_POINTER(1, v_size); | |||||
| CREATE_A_POINTER(0, 0); | CREATE_A_POINTER(0, 0); | ||||
| CREATE_A_POINTER(1, v_size); | |||||
| BLASLONG j = 0; | BLASLONG j = 0; | ||||
| for (; j < n4; j += 4) { | for (; j < n4; j += 4) { | ||||
| @@ -246,6 +250,10 @@ CNAME(BLASLONG M, | |||||
| DECLARE_RESULT_VECTOR(0, 1); | DECLARE_RESULT_VECTOR(0, 1); | ||||
| DECLARE_RESULT_VECTOR(0, 2); | DECLARE_RESULT_VECTOR(0, 2); | ||||
| DECLARE_RESULT_VECTOR(0, 3); | DECLARE_RESULT_VECTOR(0, 3); | ||||
| DECLARE_RESULT_VECTOR(1, 0); | |||||
| DECLARE_RESULT_VECTOR(1, 1); | |||||
| DECLARE_RESULT_VECTOR(1, 2); | |||||
| DECLARE_RESULT_VECTOR(1, 3); | |||||
| if (LIKELY(packed_a != NULL)) { | if (LIKELY(packed_a != NULL)) { | ||||
| if (j == 0) { | if (j == 0) { | ||||
| @@ -257,10 +265,16 @@ CNAME(BLASLONG M, | |||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | ||||
| BROADCAST_LOAD_B(1, 0); | BROADCAST_LOAD_B(1, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | ||||
| GATHER_LOAD_A(pg_true, 1, 0); | |||||
| VECTOR_PACK_A(1, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||||
| BROADCAST_LOAD_B(2, 0); | BROADCAST_LOAD_B(2, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); | |||||
| BROADCAST_LOAD_B(3, 0); | BROADCAST_LOAD_B(3, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); | |||||
| } | } | ||||
| } else { | } else { | ||||
| for (; k < K; k++) { | for (; k < K; k++) { | ||||
| @@ -270,10 +284,15 @@ CNAME(BLASLONG M, | |||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | ||||
| BROADCAST_LOAD_B(1, 0); | BROADCAST_LOAD_B(1, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | ||||
| UNPACK_VECTOR_A(1, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||||
| BROADCAST_LOAD_B(2, 0); | BROADCAST_LOAD_B(2, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); | |||||
| BROADCAST_LOAD_B(3, 0); | BROADCAST_LOAD_B(3, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); | |||||
| } | } | ||||
| } | } | ||||
| } else { | } else { | ||||
| @@ -284,17 +303,27 @@ CNAME(BLASLONG M, | |||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | ||||
| BROADCAST_LOAD_B(1, 0); | BROADCAST_LOAD_B(1, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | ||||
| GATHER_LOAD_A(pg_true, 1, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||||
| BROADCAST_LOAD_B(2, 0); | BROADCAST_LOAD_B(2, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); | |||||
| BROADCAST_LOAD_B(3, 0); | BROADCAST_LOAD_B(3, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); | |||||
| } | } | ||||
| } | } | ||||
| VECTOR_STORE(pg_true, 0, 0); | VECTOR_STORE(pg_true, 0, 0); | ||||
| VECTOR_STORE(pg_true, 0, 1); | VECTOR_STORE(pg_true, 0, 1); | ||||
| VECTOR_STORE(pg_true, 0, 2); | VECTOR_STORE(pg_true, 0, 2); | ||||
| VECTOR_STORE(pg_true, 0, 3); | VECTOR_STORE(pg_true, 0, 3); | ||||
| VECTOR_STORE(pg_true, 1, 0); | |||||
| VECTOR_STORE(pg_true, 1, 1); | |||||
| VECTOR_STORE(pg_true, 1, 2); | |||||
| VECTOR_STORE(pg_true, 1, 3); | |||||
| INCR_C_POINTER(0, 4); | INCR_C_POINTER(0, 4); | ||||
| INCR_C_POINTER(1, 4); | |||||
| } | } | ||||
| for (; j < N; j++) { | for (; j < N; j++) { | ||||
| @@ -303,6 +332,7 @@ CNAME(BLASLONG M, | |||||
| BLASLONG k = 0; | BLASLONG k = 0; | ||||
| DECLARE_RESULT_VECTOR(0, 0); | DECLARE_RESULT_VECTOR(0, 0); | ||||
| DECLARE_RESULT_VECTOR(1, 0); | |||||
| if (LIKELY(packed_a != NULL)) { | if (LIKELY(packed_a != NULL)) { | ||||
| for (; k < K; k++) { | for (; k < K; k++) { | ||||
| @@ -310,6 +340,8 @@ CNAME(BLASLONG M, | |||||
| BROADCAST_LOAD_B(0, 0); | BROADCAST_LOAD_B(0, 0); | ||||
| UNPACK_VECTOR_A(0, 0); | UNPACK_VECTOR_A(0, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | ||||
| UNPACK_VECTOR_A(1, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||||
| } | } | ||||
| } else { | } else { | ||||
| for (; k < K; k++) { | for (; k < K; k++) { | ||||
| @@ -317,9 +349,73 @@ CNAME(BLASLONG M, | |||||
| BROADCAST_LOAD_B(0, 0); | BROADCAST_LOAD_B(0, 0); | ||||
| GATHER_LOAD_A(pg_true, 0, 0); | GATHER_LOAD_A(pg_true, 0, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | ||||
| GATHER_LOAD_A(pg_true, 1, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||||
| } | } | ||||
| } | } | ||||
| VECTOR_STORE(pg_true, 0, 0); | VECTOR_STORE(pg_true, 0, 0); | ||||
| VECTOR_STORE(pg_true, 1, 0); | |||||
| INCR_C_POINTER(0, 1); | |||||
| INCR_C_POINTER(1, 1); | |||||
| } | |||||
| UPDATE_A_POINTER(v_size2); | |||||
| RESET_B_POINTER(); | |||||
| UPDATE_C_POINTER(v_size2); | |||||
| } | |||||
| for (; i < v_m1; i += v_size) { | |||||
| CREATE_C_POINTER(0, 0); | |||||
| CREATE_A_POINTER(0, 0); | |||||
| BLASLONG j = 0; | |||||
| for (; j < n4; j += 4) { | |||||
| CREATE_B_POINTER(0, 0); | |||||
| CREATE_B_POINTER(1, 1); | |||||
| CREATE_B_POINTER(2, 2); | |||||
| CREATE_B_POINTER(3, 3); | |||||
| UPDATE_B_POINTER(4); | |||||
| BLASLONG k = 0; | |||||
| DECLARE_RESULT_VECTOR(0, 0); | |||||
| DECLARE_RESULT_VECTOR(0, 1); | |||||
| DECLARE_RESULT_VECTOR(0, 2); | |||||
| DECLARE_RESULT_VECTOR(0, 3); | |||||
| for (; k < K; k++) { | |||||
| BROADCAST_LOAD_B(0, 0); | |||||
| GATHER_LOAD_A(pg_true, 0, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||||
| BROADCAST_LOAD_B(1, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||||
| BROADCAST_LOAD_B(2, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||||
| BROADCAST_LOAD_B(3, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||||
| } | |||||
| VECTOR_STORE(pg_true, 0, 0); | |||||
| VECTOR_STORE(pg_true, 0, 1); | |||||
| VECTOR_STORE(pg_true, 0, 2); | |||||
| VECTOR_STORE(pg_true, 0, 3); | |||||
| INCR_C_POINTER(0, 4); | |||||
| } | |||||
| for (; j < N; j++) { | |||||
| CREATE_B_POINTER(0, 0); | |||||
| UPDATE_B_POINTER(1); | |||||
| BLASLONG k = 0; | |||||
| DECLARE_RESULT_VECTOR(0, 0); | |||||
| for (; k < K; k++) { | |||||
| BROADCAST_LOAD_B(0, 0); | |||||
| GATHER_LOAD_A(pg_true, 0, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||||
| } | |||||
| VECTOR_STORE(pg_true, 0, 0); | |||||
| INCR_C_POINTER(0, 1); | INCR_C_POINTER(0, 1); | ||||
| } | } | ||||
| @@ -69,7 +69,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| // #undef C_ELEMENT | // #undef C_ELEMENT | ||||
| // #define C_ELEMENT(m, n) C[(i+(m))+(j+(n))*ldc] | // #define C_ELEMENT(m, n) C[(i+(m))+(j+(n))*ldc] | ||||
| #define PACK_ELEMENT_K(m, offset_k) packed_a[(k + offset_k) * v_size + m] | |||||
| #define PACK_ELEMENT_K(m, offset_k) packed_a[(k + offset_k) * v_size2 + m] | |||||
| #define PACK_ELEMENT(m) PACK_ELEMENT_K(m, 0) | #define PACK_ELEMENT(m) PACK_ELEMENT_K(m, 0) | ||||
| // ASIMD | // ASIMD | ||||
| @@ -207,6 +207,7 @@ CNAME(BLASLONG M, | |||||
| #endif | #endif | ||||
| { | { | ||||
| const uint64_t v_size = svcntw(); | const uint64_t v_size = svcntw(); | ||||
| const uint64_t v_size2 = v_size * 2; | |||||
| const svbool_t pg_true = svptrue_b32(); | const svbool_t pg_true = svptrue_b32(); | ||||
| const svbool_t pg_quad = svwhilelt_b32(0, 4); | const svbool_t pg_quad = svwhilelt_b32(0, 4); | ||||
| const svbool_t pg_first = svwhilelt_b32(0, 1); | const svbool_t pg_first = svwhilelt_b32(0, 1); | ||||
| @@ -216,22 +217,25 @@ CNAME(BLASLONG M, | |||||
| #endif | #endif | ||||
| const svuint32_t lda_vec = svindex_u32(0LL, lda); | const svuint32_t lda_vec = svindex_u32(0LL, lda); | ||||
| const BLASLONG v_m2 = M & -v_size2; | |||||
| const BLASLONG v_m1 = M & -v_size; | const BLASLONG v_m1 = M & -v_size; | ||||
| const BLASLONG n4 = N & -4; | const BLASLONG n4 = N & -4; | ||||
| const int pack_a = M >= v_size && N >= 8 && K >= 8 ? 1 : 0; | |||||
| const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||||
| FLOAT* packed_a = | FLOAT* packed_a = | ||||
| (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size * sizeof(FLOAT)) : NULL; | |||||
| (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | |||||
| FLOAT* a_offset = A; | FLOAT* a_offset = A; | ||||
| FLOAT* b_offset = B; | FLOAT* b_offset = B; | ||||
| FLOAT* c_offset = C; | FLOAT* c_offset = C; | ||||
| BLASLONG i = 0; | BLASLONG i = 0; | ||||
| for (; i < v_m1; i += v_size) { | |||||
| for (; i < v_m2; i += v_size2) { | |||||
| CREATE_C_POINTER(0, 0); | CREATE_C_POINTER(0, 0); | ||||
| CREATE_C_POINTER(1, v_size); | |||||
| CREATE_A_POINTER(0, 0); | CREATE_A_POINTER(0, 0); | ||||
| CREATE_A_POINTER(1, v_size); | |||||
| BLASLONG j = 0; | BLASLONG j = 0; | ||||
| for (; j < n4; j += 4) { | for (; j < n4; j += 4) { | ||||
| @@ -247,6 +251,10 @@ CNAME(BLASLONG M, | |||||
| DECLARE_RESULT_VECTOR(0, 1); | DECLARE_RESULT_VECTOR(0, 1); | ||||
| DECLARE_RESULT_VECTOR(0, 2); | DECLARE_RESULT_VECTOR(0, 2); | ||||
| DECLARE_RESULT_VECTOR(0, 3); | DECLARE_RESULT_VECTOR(0, 3); | ||||
| DECLARE_RESULT_VECTOR(1, 0); | |||||
| DECLARE_RESULT_VECTOR(1, 1); | |||||
| DECLARE_RESULT_VECTOR(1, 2); | |||||
| DECLARE_RESULT_VECTOR(1, 3); | |||||
| if (LIKELY(packed_a != NULL)) { | if (LIKELY(packed_a != NULL)) { | ||||
| if (j == 0) { | if (j == 0) { | ||||
| @@ -259,6 +267,12 @@ CNAME(BLASLONG M, | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | ||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | ||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | ||||
| GATHER_LOAD_A(pg_true, 1, 0); | |||||
| VECTOR_PACK_A(1, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||||
| } | } | ||||
| } else { | } else { | ||||
| for (; k < K; k++) { | for (; k < K; k++) { | ||||
| @@ -269,6 +283,11 @@ CNAME(BLASLONG M, | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | ||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | ||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | ||||
| UNPACK_VECTOR_A(1, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||||
| } | } | ||||
| } | } | ||||
| } else { | } else { | ||||
| @@ -280,13 +299,23 @@ CNAME(BLASLONG M, | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | ||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | ||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | ||||
| GATHER_LOAD_A(pg_true, 1, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||||
| } | } | ||||
| } | } | ||||
| VECTOR_STORE(pg_true, 0, 0); | VECTOR_STORE(pg_true, 0, 0); | ||||
| VECTOR_STORE(pg_true, 0, 1); | VECTOR_STORE(pg_true, 0, 1); | ||||
| VECTOR_STORE(pg_true, 0, 2); | VECTOR_STORE(pg_true, 0, 2); | ||||
| VECTOR_STORE(pg_true, 0, 3); | VECTOR_STORE(pg_true, 0, 3); | ||||
| VECTOR_STORE(pg_true, 1, 0); | |||||
| VECTOR_STORE(pg_true, 1, 1); | |||||
| VECTOR_STORE(pg_true, 1, 2); | |||||
| VECTOR_STORE(pg_true, 1, 3); | |||||
| INCR_C_POINTER(0, 4); | INCR_C_POINTER(0, 4); | ||||
| INCR_C_POINTER(1, 4); | |||||
| } | } | ||||
| for (; j < N; j++) { | for (; j < N; j++) { | ||||
| @@ -295,6 +324,7 @@ CNAME(BLASLONG M, | |||||
| BLASLONG k = 0; | BLASLONG k = 0; | ||||
| DECLARE_RESULT_VECTOR(0, 0); | DECLARE_RESULT_VECTOR(0, 0); | ||||
| DECLARE_RESULT_VECTOR(1, 0); | |||||
| if (LIKELY(packed_a != NULL)) { | if (LIKELY(packed_a != NULL)) { | ||||
| for (; k < K; k++) { | for (; k < K; k++) { | ||||
| @@ -302,6 +332,8 @@ CNAME(BLASLONG M, | |||||
| BROADCAST_LOAD_B(0, 0); | BROADCAST_LOAD_B(0, 0); | ||||
| UNPACK_VECTOR_A(0, 0); | UNPACK_VECTOR_A(0, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | ||||
| UNPACK_VECTOR_A(1, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||||
| } | } | ||||
| } else { | } else { | ||||
| for (; k < K; k++) { | for (; k < K; k++) { | ||||
| @@ -309,9 +341,70 @@ CNAME(BLASLONG M, | |||||
| BROADCAST_LOAD_B(0, 0); | BROADCAST_LOAD_B(0, 0); | ||||
| GATHER_LOAD_A(pg_true, 0, 0); | GATHER_LOAD_A(pg_true, 0, 0); | ||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | ||||
| GATHER_LOAD_A(pg_true, 1, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||||
| } | } | ||||
| } | } | ||||
| VECTOR_STORE(pg_true, 0, 0); | VECTOR_STORE(pg_true, 0, 0); | ||||
| VECTOR_STORE(pg_true, 1, 0); | |||||
| INCR_C_POINTER(0, 1); | |||||
| INCR_C_POINTER(1, 1); | |||||
| } | |||||
| UPDATE_A_POINTER(v_size2); | |||||
| RESET_B_POINTER(); | |||||
| UPDATE_C_POINTER(v_size2); | |||||
| } | |||||
| for (; i < v_m1; i += v_size) { | |||||
| CREATE_C_POINTER(0, 0); | |||||
| CREATE_A_POINTER(0, 0); | |||||
| BLASLONG j = 0; | |||||
| for (; j < n4; j += 4) { | |||||
| CREATE_B_POINTER(0, 0); | |||||
| CREATE_B_POINTER(1, 1); | |||||
| CREATE_B_POINTER(2, 2); | |||||
| CREATE_B_POINTER(3, 3); | |||||
| UPDATE_B_POINTER(4); | |||||
| BLASLONG k = 0; | |||||
| DECLARE_RESULT_VECTOR(0, 0); | |||||
| DECLARE_RESULT_VECTOR(0, 1); | |||||
| DECLARE_RESULT_VECTOR(0, 2); | |||||
| DECLARE_RESULT_VECTOR(0, 3); | |||||
| for (; k < K; k++) { | |||||
| QUADWORD_LOAD_B(0, 0); | |||||
| GATHER_LOAD_A(pg_true, 0, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||||
| } | |||||
| VECTOR_STORE(pg_true, 0, 0); | |||||
| VECTOR_STORE(pg_true, 0, 1); | |||||
| VECTOR_STORE(pg_true, 0, 2); | |||||
| VECTOR_STORE(pg_true, 0, 3); | |||||
| INCR_C_POINTER(0, 4); | |||||
| } | |||||
| for (; j < N; j++) { | |||||
| CREATE_B_POINTER(0, 0); | |||||
| UPDATE_B_POINTER(1); | |||||
| BLASLONG k = 0; | |||||
| DECLARE_RESULT_VECTOR(0, 0); | |||||
| for (; k < K; k++) { | |||||
| BROADCAST_LOAD_B(0, 0); | |||||
| GATHER_LOAD_A(pg_true, 0, 0); | |||||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||||
| } | |||||
| VECTOR_STORE(pg_true, 0, 0); | |||||
| INCR_C_POINTER(0, 1); | INCR_C_POINTER(0, 1); | ||||
| } | } | ||||