diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14 index bd3a966b1..49fa28175 100644 --- a/kernel/zarch/KERNEL.Z14 +++ b/kernel/zarch/KERNEL.Z14 @@ -86,7 +86,7 @@ DGEMVTKERNEL = dgemv_t_4.c CGEMVTKERNEL = cgemv_t_4.c ZGEMVTKERNEL = zgemv_t_4.c -STRMMKERNEL = strmm8x4V.S +STRMMKERNEL = gemm_vec.c DTRMMKERNEL = trmm8x4V.S CTRMMKERNEL = ctrmm4x4V.S ZTRMMKERNEL = ztrmm4x4V.S @@ -101,8 +101,6 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - - DGEMMKERNEL = gemm8x4V.S DGEMMINCOPY = ../generic/gemm_ncopy_8.c DGEMMITCOPY = ../generic/gemm_tcopy_8.c @@ -145,7 +143,3 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - - - - diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c index e6d613c44..a9531c7a5 100644 --- a/kernel/zarch/gemm_vec.c +++ b/kernel/zarch/gemm_vec.c @@ -51,6 +51,29 @@ static const size_t unroll_m = UNROLL_M; static const size_t unroll_n = UNROLL_N; +/* Handling of triangular matrices */ +#ifdef TRMMKERNEL +static const bool trmm = true; +static const bool left = +#ifdef LEFT + true; +#else + false; +#endif + +static const bool backwards = +#if defined(LEFT) != defined(TRANSA) + true; +#else + false; +#endif + +#else +static const bool trmm = false; +static const bool left = false; +static const bool backwards = false; +#endif /* TRMMKERNEL */ + /* * Background: * @@ -111,6 +134,17 @@ static const size_t unroll_n = UNROLL_N; * vectorization for varying block sizes) * - add alpha * row block of C_aux back into C_j. * + * Note that there are additional mechanics for handling triangular matrices, + * calculating B := alpha (A * B) where either of the matrices A or B can be + * triangular. In case of A, the macro "LEFT" is defined. In addition, A can + * optionally be transposed. + * The code effectively skips an "offset" number of columns in A and rows of B + * in each block, to save unnecessary work by exploiting the triangular nature. + * To handle all cases, the code discerns (1) a "left" mode when A is triangular + * and (2) "forward" / "backwards" modes where only the first "offset" + * columns/rows of A/B are used or where the first "offset" columns/rows are + * skipped, respectively. + * * Reference: * * The summary above is based on staring at various kernel implementations and: @@ -176,7 +210,11 @@ typedef FLOAT vector_float __attribute__ ((vector_size (16))); vector_float *C_ij = \ (vector_float *)(C + i * VLEN_FLOATS + \ j * ldc); \ - *C_ij += alpha * Caux[i][j]; \ + if (trmm) { \ + *C_ij = alpha * Caux[i][j]; \ + } else { \ + *C_ij += alpha * Caux[i][j]; \ + } \ } \ } \ } @@ -209,17 +247,37 @@ VECTOR_BLOCK(2, 2) * @param[inout] C Pointer to current column block (panel) of output matrix C. * @param[in] ldc Offset between elements in adjacent columns in C. * @param[in] alpha Scalar factor. + * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices). + * @param[in] off Running offset for handling triangular matrices. */ static inline void GEBP_block(BLASLONG m, BLASLONG n, BLASLONG first_row, const FLOAT * restrict A, BLASLONG k, const FLOAT * restrict B, FLOAT *restrict C, BLASLONG ldc, - FLOAT alpha) + FLOAT alpha, + BLASLONG offset, BLASLONG off) { + if (trmm && left) + off = offset + first_row; + A += first_row * k; C += first_row; + if (trmm) { + if (backwards) { + A += off * m; + B += off * n; + k -= off; + } else { + if (left) { + k = off + m; + } else { + k = off + n; + } + } + } + #define BLOCK(bm, bn) \ if (m == bm && n == bn) { \ GEBP_block_##bm##_##bn(A, k, B, C, ldc, alpha); \ @@ -253,7 +311,11 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n, for (BLASLONG i = 0; i < m; i++) for (BLASLONG j = 0; j < n; j++) - C[i + j * ldc] += alpha * Caux[i][j]; + if (trmm) { + C[i + j * ldc] = alpha * Caux[i][j]; + } else { + C[i + j * ldc] += alpha * Caux[i][j]; + } } /** @@ -268,12 +330,15 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n, * @param[inout] C Pointer to output matrix C (note: all of it). * @param[in] ldc Offset between elements in adjacent columns in C. * @param[in] alpha Scalar factor. + * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices). */ static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col, const FLOAT *restrict A, BLASLONG bk, const FLOAT *restrict B, BLASLONG bm, FLOAT *restrict C, BLASLONG ldc, - FLOAT alpha) { + FLOAT alpha, + BLASLONG const offset) { + FLOAT *restrict C_i = C + first_col * ldc; /* * B is in column-order with n_r packed row elements, which does @@ -282,6 +347,15 @@ static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col, */ const FLOAT *restrict B_i = B + first_col * bk; + BLASLONG off = 0; + if (trmm) { + if (left) { + off = offset; + } else { + off = -offset + first_col; + } + } + /* * Calculate C_aux := A * B_j * then unpack C_i += alpha * C_aux. @@ -293,7 +367,7 @@ static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col, for (BLASLONG block_size = unroll_m; block_size > 0; block_size /= 2) for (; bm - row >= block_size; row += block_size) GEBP_block(block_size, num_cols, row, A, bk, B_i, C_i, - ldc, alpha); + ldc, alpha, offset, off); } /** @@ -301,6 +375,9 @@ static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col, * where C is an m-by-n matrix, A is m-by-k and B is k-by-n. Note that A, B, and * C are pointers to submatrices of the actual matrices. * + * For triangular matrix multiplication, calculate B := alpha (A * B) where A + * or B can be triangular (in case of A, the macro LEFT will be defined). + * * @param[in] bm Number of rows in C and A. * @param[in] bn Number of columns in C and B. * @param[in] bk Number of columns in A and rows in B. @@ -309,11 +386,16 @@ static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col, * @param[in] bb Pointer to input matrix B. * @param[inout] C Pointer to output matrix C. * @param[in] ldc Offset between elements in adjacent columns in C. + * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices). * @returns 0 on success. */ int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, FLOAT *restrict ba, FLOAT *restrict bb, - FLOAT *restrict C, BLASLONG ldc) + FLOAT *restrict C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) { if ( (bm == 0) || (bn == 0) || (bk == 0) || (alpha == ZERO)) return 0; @@ -326,6 +408,14 @@ int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, ba = __builtin_assume_aligned(ba, 16); bb = __builtin_assume_aligned(bb, 16); + /* + * Use offset and off even when compiled as SGEMMKERNEL to simplify + * function signatures and function calls. + */ +#ifndef TRMMKERNEL + BLASLONG const offset = 0; +#endif + /* * Partition B and C into blocks of n_r (unroll_n) columns, called B_i * and C_i. For each partition, calculate C_i += alpha * (A * B_j). @@ -336,7 +426,7 @@ int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, BLASLONG col = 0; for (BLASLONG block_size = unroll_n; block_size > 0; block_size /= 2) for (; bn - col >= block_size; col += block_size) - GEBP_column_block(block_size, col, ba, bk, bb, bm, C, ldc, alpha); + GEBP_column_block(block_size, col, ba, bk, bb, bm, C, ldc, alpha, offset); return 0; }