| @@ -80,25 +80,12 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| float64x2_t a##m##_k##offset_k = vld1q_dup_f64(&A_ELEMENT_K(m, offset_k)); | |||
| #define LOAD_A1(m, offset_k) \ | |||
| float64_t a##m##_k##offset_k = A_ELEMENT_K(m, offset_k); | |||
| #define VECTOR_LOAD_B_K2(n, offset_k) \ | |||
| float64x2_t b##k##n##_k##offset_k = vld1q_f64(&B_ELEMENT_K(n, offset_k)); | |||
| #define TRANSPOSE_B2_K2(n0, n1, offset_k0, offset_k1) \ | |||
| float64x2_t b##n0##_k##offset_k0 = \ | |||
| vzip1q_f64(b##k##n0##_k##offset_k0, b##k##n1##_k##offset_k0); \ | |||
| float64x2_t b##n0##_k##offset_k1 = \ | |||
| vzip2q_f64(b##k##n0##_k##offset_k0, b##k##n1##_k##offset_k0); | |||
| #define SCALE_B2_K2(n0, offset_k0, offset_k1) \ | |||
| svfloat64_t b##s##n0##_k##offset_k0 = svdup_neonq_f64(b##n0##_k##offset_k0); \ | |||
| svfloat64_t b##s##n0##_k##offset_k1 = svdup_neonq_f64(b##n0##_k##offset_k1); | |||
| #define GATHER_LOAD_B2(n, offset_k) \ | |||
| float64x2_t b##n##_k##offset_k = vdupq_n_f64(B_ELEMENT_K(n, offset_k)); \ | |||
| b##n##_k##offset_k = \ | |||
| vsetq_lane_f64(B_ELEMENT_K(n + 1, offset_k), b##n##_k##offset_k, 1); | |||
| #define VECTOR_UNPACK_B2(n, offset_k) \ | |||
| float64x2_t b##n##_k##offset_k = vld1q_f64(&PACK_ELEMENT_K(n, offset_k)); | |||
| #define VECTOR_PACK_B2(n, offset_k) \ | |||
| vst1q_f64(&PACK_ELEMENT_K(n, offset_k), b##n##_k##offset_k); | |||
| #define PACK_B0(n, offset_k) \ | |||
| PACK_ELEMENT_K(n, offset_k) = vget_lane_f64(b##n##_k##offset_k, 0); | |||
| #define UPDATE_RESULT_VECTOR2(m, n, offset_k) \ | |||
| @@ -128,9 +115,6 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| svfloat64_t b##s##n##_k##offset_k = svdup_f64(B_ELEMENT_K(n, offset_k)); | |||
| #define VECTOR_LOAD_A(pg, m, offset_k) \ | |||
| svfloat64_t a##s##m##_k##offset_k = svld1(pg, &A_ELEMENT_K(m, offset_k)); | |||
| #define QUADWORD_LOAD_B(n, offset_k) \ | |||
| svfloat64_t b##s##n##_k##offset_k = \ | |||
| svld1rq(pg_true, &B_ELEMENT_K(n, offset_k)); | |||
| #define GATHER_LOAD_A(pg, m, offset_k) \ | |||
| svfloat64_t a##s##m##_k##offset_k = \ | |||
| svld1_gather_index(pg, &A_ELEMENT_K(m, offset_k), lda_vec); | |||
| @@ -226,7 +210,6 @@ CNAME(BLASLONG M, | |||
| const BLASLONG v_m1 = M & -v_size; | |||
| const BLASLONG n4 = N & -4; | |||
| const BLASLONG n2 = N & -2; | |||
| const BLASLONG k2 = K & -2; | |||
| const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||
| FLOAT* packed_a = | |||
| @@ -266,6 +249,7 @@ CNAME(BLASLONG M, | |||
| if (LIKELY(packed_a != NULL)) { | |||
| if (j == 0) { | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| VECTOR_PACK_A(0, 0); | |||
| @@ -285,6 +269,7 @@ CNAME(BLASLONG M, | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| UNPACK_VECTOR_A(0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| @@ -345,6 +330,7 @@ CNAME(BLASLONG M, | |||
| if (LIKELY(packed_a != NULL)) { | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| UNPACK_VECTOR_A(0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| @@ -356,6 +342,7 @@ CNAME(BLASLONG M, | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| @@ -580,4 +567,4 @@ CNAME(BLASLONG M, | |||
| free(packed_a); | |||
| return 0; | |||
| } | |||
| } | |||
| @@ -237,6 +237,7 @@ CNAME(BLASLONG M, | |||
| #endif | |||
| { | |||
| const uint64_t v_size = svcntw(); | |||
| const uint64_t v_size2 = v_size * 2; | |||
| const svbool_t pg_true = svptrue_b32(); | |||
| const svbool_t pg_quad = svwhilelt_b32(0, 4); | |||
| const svbool_t pg_first = svwhilelt_b32(0, 1); | |||
| @@ -245,10 +246,11 @@ CNAME(BLASLONG M, | |||
| const svfloat32_t beta_vec = svdup_f32(beta); | |||
| #endif | |||
| const BLASLONG n4 = N & -4; | |||
| const BLASLONG v_m2 = M & -v_size2; | |||
| const BLASLONG v_m1 = M & -v_size; | |||
| const BLASLONG k4 = K & -4; | |||
| const int pack_b = M >= v_size && N >= 8 && K >= 8 ? 1 : 0; | |||
| const int pack_b = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||
| FLOAT* packed_b = | |||
| (pack_b) ? packed_b = (FLOAT*)malloc(K * 4 * sizeof(FLOAT)) : NULL; | |||
| @@ -269,16 +271,21 @@ CNAME(BLASLONG M, | |||
| CREATE_B_POINTER(3, 3); | |||
| BLASLONG i = 0; | |||
| for (; i < v_m1; i += v_size) { | |||
| for (; i < v_m2; i += v_size2) { | |||
| CREATE_A_POINTER(0, 0); | |||
| UPDATE_A_POINTER(v_size); | |||
| CREATE_A_POINTER(1, v_size); | |||
| UPDATE_A_POINTER(v_size2); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| DECLARE_RESULT_VECTOR(1, 1); | |||
| DECLARE_RESULT_VECTOR(1, 2); | |||
| DECLARE_RESULT_VECTOR(1, 3); | |||
| if (LIKELY(packed_b != NULL)) { | |||
| if (i == 0) { | |||
| @@ -314,6 +321,26 @@ CNAME(BLASLONG M, | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 3); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 1); | |||
| VECTOR_LOAD_A(pg_true, 1, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 2); | |||
| VECTOR_LOAD_A(pg_true, 1, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 3); | |||
| } | |||
| for (; k < K; k++) { | |||
| @@ -324,12 +351,17 @@ CNAME(BLASLONG M, | |||
| BROADCAST_LOAD_B(1, 0); | |||
| PACK_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| PACK_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| PACK_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| @@ -340,11 +372,118 @@ CNAME(BLASLONG M, | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||
| } | |||
| } | |||
| } else { | |||
| for (; k < k4; k += 4) { | |||
| VECTOR_LOAD_B_K4(0, 0); | |||
| VECTOR_LOAD_B_K4(1, 0); | |||
| VECTOR_LOAD_B_K4(2, 0); | |||
| VECTOR_LOAD_B_K4(3, 0); | |||
| TRANSPOSE_B4_K4(0, 1, 2, 3, 0, 1, 2, 3); | |||
| SCALE_B4_K4(0, 0, 1, 2, 3); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 1); | |||
| VECTOR_LOAD_A(pg_true, 0, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 2); | |||
| VECTOR_LOAD_A(pg_true, 0, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 3); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 1); | |||
| VECTOR_LOAD_A(pg_true, 1, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 2); | |||
| VECTOR_LOAD_A(pg_true, 1, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 3); | |||
| } | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| VECTOR_STORE(pg_true, 1, 1); | |||
| VECTOR_STORE(pg_true, 1, 2); | |||
| VECTOR_STORE(pg_true, 1, 3); | |||
| INCR_C_POINTER(0, v_size2); | |||
| INCR_C_POINTER(1, v_size2); | |||
| INCR_C_POINTER(2, v_size2); | |||
| INCR_C_POINTER(3, v_size2); | |||
| } | |||
| for (; i < v_m1; i += v_size) { | |||
| CREATE_A_POINTER(0, 0); | |||
| UPDATE_A_POINTER(v_size); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| if (LIKELY(packed_b != NULL)) { | |||
| for (; k < K; k++) { | |||
| UNPACK_QUADWORD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| } | |||
| } else { | |||
| for (; k < k4; k += 4) { | |||
| VECTOR_LOAD_B_K4(0, 0); | |||
| VECTOR_LOAD_B_K4(1, 0); | |||
| VECTOR_LOAD_B_K4(2, 0); | |||
| @@ -478,6 +617,28 @@ CNAME(BLASLONG M, | |||
| CREATE_B_POINTER(0, 0); | |||
| BLASLONG i = 0; | |||
| for (; i < v_m2; i += v_size2) { | |||
| CREATE_A_POINTER(0, 0); | |||
| CREATE_A_POINTER(1, v_size); | |||
| UPDATE_A_POINTER(v_size2); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| INCR_C_POINTER(0, v_size2); | |||
| } | |||
| for (; i < v_m1; i += v_size) { | |||
| CREATE_A_POINTER(0, 0); | |||
| @@ -209,6 +209,7 @@ CNAME(BLASLONG M, | |||
| #endif | |||
| { | |||
| const uint64_t v_size = svcntw(); | |||
| const uint64_t v_size2 = v_size * 2; | |||
| const svbool_t pg_true = svptrue_b32(); | |||
| const svbool_t pg_quad = svwhilelt_b32(0, 4); | |||
| const svbool_t pg_first = svwhilelt_b32(0, 1); | |||
| @@ -217,9 +218,10 @@ CNAME(BLASLONG M, | |||
| const svfloat32_t beta_vec = svdup_f32(beta); | |||
| #endif | |||
| const BLASLONG n4 = N & -4; | |||
| const BLASLONG v_m2 = M & -v_size2; | |||
| const BLASLONG v_m1 = M & -v_size; | |||
| const int pack_b = M >= v_size && N >= 8 && K >= 8 ? 1 : 0; | |||
| const int pack_b = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||
| FLOAT* packed_b = | |||
| (pack_b) ? packed_b = (FLOAT*)malloc(K * 4 * sizeof(FLOAT)) : NULL; | |||
| @@ -240,16 +242,21 @@ CNAME(BLASLONG M, | |||
| CREATE_B_POINTER(3, 3); | |||
| BLASLONG i = 0; | |||
| for (; i < v_m1; i += v_size) { | |||
| for (; i < v_m2; i += v_size2) { | |||
| CREATE_A_POINTER(0, 0); | |||
| UPDATE_A_POINTER(v_size); | |||
| CREATE_A_POINTER(1, v_size); | |||
| UPDATE_A_POINTER(v_size2); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| DECLARE_RESULT_VECTOR(1, 1); | |||
| DECLARE_RESULT_VECTOR(1, 2); | |||
| DECLARE_RESULT_VECTOR(1, 3); | |||
| if (LIKELY(packed_b != NULL)) { | |||
| if (i == 0) { | |||
| @@ -262,6 +269,11 @@ CNAME(BLASLONG M, | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| @@ -272,11 +284,66 @@ CNAME(BLASLONG M, | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||
| } | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| VECTOR_STORE(pg_true, 1, 1); | |||
| VECTOR_STORE(pg_true, 1, 2); | |||
| VECTOR_STORE(pg_true, 1, 3); | |||
| INCR_C_POINTER(0, v_size2); | |||
| INCR_C_POINTER(1, v_size2); | |||
| INCR_C_POINTER(2, v_size2); | |||
| INCR_C_POINTER(3, v_size2); | |||
| } | |||
| for (; i < v_m1; i += v_size) { | |||
| CREATE_A_POINTER(0, 0); | |||
| UPDATE_A_POINTER(v_size); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| if (LIKELY(packed_b != NULL)) { | |||
| for (; k < K; k++) { | |||
| UNPACK_QUADWORD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| @@ -346,6 +413,28 @@ CNAME(BLASLONG M, | |||
| CREATE_B_POINTER(0, 0); | |||
| BLASLONG i = 0; | |||
| for (; i < v_m2; i += v_size2) { | |||
| CREATE_A_POINTER(0, 0); | |||
| CREATE_A_POINTER(1, v_size); | |||
| UPDATE_A_POINTER(v_size2); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| INCR_C_POINTER(0, v_size2); | |||
| } | |||
| for (; i < v_m1; i += v_size) { | |||
| CREATE_A_POINTER(0, 0); | |||
| @@ -69,7 +69,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // #undef C_ELEMENT | |||
| // #define C_ELEMENT(m, n) C[(i+(m))+(j+(n))*ldc] | |||
| #define PACK_ELEMENT_K(m, offset_k) packed_a[(k + offset_k) * v_size + m] | |||
| #define PACK_ELEMENT_K(m, offset_k) packed_a[(k + offset_k) * v_size2 + m] | |||
| #define PACK_ELEMENT(m) PACK_ELEMENT_K(m, 0) | |||
| // ASIMD | |||
| @@ -206,6 +206,7 @@ CNAME(BLASLONG M, | |||
| #endif | |||
| { | |||
| const uint64_t v_size = svcntw(); | |||
| const uint64_t v_size2 = v_size * 2; | |||
| const svbool_t pg_true = svptrue_b32(); | |||
| const svbool_t pg_quad = svwhilelt_b32(0, 4); | |||
| const svbool_t pg_first = svwhilelt_b32(0, 1); | |||
| @@ -215,22 +216,25 @@ CNAME(BLASLONG M, | |||
| #endif | |||
| const svuint32_t lda_vec = svindex_u32(0LL, lda); | |||
| const BLASLONG v_m2 = M & -v_size2; | |||
| const BLASLONG v_m1 = M & -v_size; | |||
| const BLASLONG n4 = N & -4; | |||
| const int pack_a = M >= v_size && N >= 8 && K >= 8 ? 1 : 0; | |||
| const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||
| FLOAT* packed_a = | |||
| (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size * sizeof(FLOAT)) : NULL; | |||
| (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | |||
| FLOAT* a_offset = A; | |||
| FLOAT* b_offset = B; | |||
| FLOAT* c_offset = C; | |||
| BLASLONG i = 0; | |||
| for (; i < v_m1; i += v_size) { | |||
| for (; i < v_m2; i += v_size2) { | |||
| CREATE_C_POINTER(0, 0); | |||
| CREATE_C_POINTER(1, v_size); | |||
| CREATE_A_POINTER(0, 0); | |||
| CREATE_A_POINTER(1, v_size); | |||
| BLASLONG j = 0; | |||
| for (; j < n4; j += 4) { | |||
| @@ -246,6 +250,10 @@ CNAME(BLASLONG M, | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| DECLARE_RESULT_VECTOR(1, 1); | |||
| DECLARE_RESULT_VECTOR(1, 2); | |||
| DECLARE_RESULT_VECTOR(1, 3); | |||
| if (LIKELY(packed_a != NULL)) { | |||
| if (j == 0) { | |||
| @@ -257,10 +265,16 @@ CNAME(BLASLONG M, | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| GATHER_LOAD_A(pg_true, 1, 0); | |||
| VECTOR_PACK_A(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| @@ -270,10 +284,15 @@ CNAME(BLASLONG M, | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| UNPACK_VECTOR_A(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); | |||
| } | |||
| } | |||
| } else { | |||
| @@ -284,17 +303,27 @@ CNAME(BLASLONG M, | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| GATHER_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| VECTOR_STORE(pg_true, 1, 1); | |||
| VECTOR_STORE(pg_true, 1, 2); | |||
| VECTOR_STORE(pg_true, 1, 3); | |||
| INCR_C_POINTER(0, 4); | |||
| INCR_C_POINTER(1, 4); | |||
| } | |||
| for (; j < N; j++) { | |||
| @@ -303,6 +332,7 @@ CNAME(BLASLONG M, | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| if (LIKELY(packed_a != NULL)) { | |||
| for (; k < K; k++) { | |||
| @@ -310,6 +340,8 @@ CNAME(BLASLONG M, | |||
| BROADCAST_LOAD_B(0, 0); | |||
| UNPACK_VECTOR_A(0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| UNPACK_VECTOR_A(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| @@ -317,9 +349,73 @@ CNAME(BLASLONG M, | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| GATHER_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| INCR_C_POINTER(0, 1); | |||
| INCR_C_POINTER(1, 1); | |||
| } | |||
| UPDATE_A_POINTER(v_size2); | |||
| RESET_B_POINTER(); | |||
| UPDATE_C_POINTER(v_size2); | |||
| } | |||
| for (; i < v_m1; i += v_size) { | |||
| CREATE_C_POINTER(0, 0); | |||
| CREATE_A_POINTER(0, 0); | |||
| BLASLONG j = 0; | |||
| for (; j < n4; j += 4) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| UPDATE_B_POINTER(4); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| INCR_C_POINTER(0, 4); | |||
| } | |||
| for (; j < N; j++) { | |||
| CREATE_B_POINTER(0, 0); | |||
| UPDATE_B_POINTER(1); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| INCR_C_POINTER(0, 1); | |||
| } | |||
| @@ -69,7 +69,7 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // #undef C_ELEMENT | |||
| // #define C_ELEMENT(m, n) C[(i+(m))+(j+(n))*ldc] | |||
| #define PACK_ELEMENT_K(m, offset_k) packed_a[(k + offset_k) * v_size + m] | |||
| #define PACK_ELEMENT_K(m, offset_k) packed_a[(k + offset_k) * v_size2 + m] | |||
| #define PACK_ELEMENT(m) PACK_ELEMENT_K(m, 0) | |||
| // ASIMD | |||
| @@ -207,6 +207,7 @@ CNAME(BLASLONG M, | |||
| #endif | |||
| { | |||
| const uint64_t v_size = svcntw(); | |||
| const uint64_t v_size2 = v_size * 2; | |||
| const svbool_t pg_true = svptrue_b32(); | |||
| const svbool_t pg_quad = svwhilelt_b32(0, 4); | |||
| const svbool_t pg_first = svwhilelt_b32(0, 1); | |||
| @@ -216,22 +217,25 @@ CNAME(BLASLONG M, | |||
| #endif | |||
| const svuint32_t lda_vec = svindex_u32(0LL, lda); | |||
| const BLASLONG v_m2 = M & -v_size2; | |||
| const BLASLONG v_m1 = M & -v_size; | |||
| const BLASLONG n4 = N & -4; | |||
| const int pack_a = M >= v_size && N >= 8 && K >= 8 ? 1 : 0; | |||
| const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||
| FLOAT* packed_a = | |||
| (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size * sizeof(FLOAT)) : NULL; | |||
| (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | |||
| FLOAT* a_offset = A; | |||
| FLOAT* b_offset = B; | |||
| FLOAT* c_offset = C; | |||
| BLASLONG i = 0; | |||
| for (; i < v_m1; i += v_size) { | |||
| for (; i < v_m2; i += v_size2) { | |||
| CREATE_C_POINTER(0, 0); | |||
| CREATE_C_POINTER(1, v_size); | |||
| CREATE_A_POINTER(0, 0); | |||
| CREATE_A_POINTER(1, v_size); | |||
| BLASLONG j = 0; | |||
| for (; j < n4; j += 4) { | |||
| @@ -247,6 +251,10 @@ CNAME(BLASLONG M, | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| DECLARE_RESULT_VECTOR(1, 1); | |||
| DECLARE_RESULT_VECTOR(1, 2); | |||
| DECLARE_RESULT_VECTOR(1, 3); | |||
| if (LIKELY(packed_a != NULL)) { | |||
| if (j == 0) { | |||
| @@ -259,6 +267,12 @@ CNAME(BLASLONG M, | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| GATHER_LOAD_A(pg_true, 1, 0); | |||
| VECTOR_PACK_A(1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| @@ -269,6 +283,11 @@ CNAME(BLASLONG M, | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| UNPACK_VECTOR_A(1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||
| } | |||
| } | |||
| } else { | |||
| @@ -280,13 +299,23 @@ CNAME(BLASLONG M, | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| GATHER_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| VECTOR_STORE(pg_true, 1, 1); | |||
| VECTOR_STORE(pg_true, 1, 2); | |||
| VECTOR_STORE(pg_true, 1, 3); | |||
| INCR_C_POINTER(0, 4); | |||
| INCR_C_POINTER(1, 4); | |||
| } | |||
| for (; j < N; j++) { | |||
| @@ -295,6 +324,7 @@ CNAME(BLASLONG M, | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| if (LIKELY(packed_a != NULL)) { | |||
| for (; k < K; k++) { | |||
| @@ -302,6 +332,8 @@ CNAME(BLASLONG M, | |||
| BROADCAST_LOAD_B(0, 0); | |||
| UNPACK_VECTOR_A(0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| UNPACK_VECTOR_A(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| @@ -309,9 +341,70 @@ CNAME(BLASLONG M, | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| GATHER_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| INCR_C_POINTER(0, 1); | |||
| INCR_C_POINTER(1, 1); | |||
| } | |||
| UPDATE_A_POINTER(v_size2); | |||
| RESET_B_POINTER(); | |||
| UPDATE_C_POINTER(v_size2); | |||
| } | |||
| for (; i < v_m1; i += v_size) { | |||
| CREATE_C_POINTER(0, 0); | |||
| CREATE_A_POINTER(0, 0); | |||
| BLASLONG j = 0; | |||
| for (; j < n4; j += 4) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| UPDATE_B_POINTER(4); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| INCR_C_POINTER(0, 4); | |||
| } | |||
| for (; j < N; j++) { | |||
| CREATE_B_POINTER(0, 0); | |||
| UPDATE_B_POINTER(1); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| INCR_C_POINTER(0, 1); | |||
| } | |||