| @@ -143,7 +143,7 @@ endif | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = dgemm_kernel_sve_v1x$(DGEMM_UNROLL_N).S | |||
| DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S | |||
| DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S | |||
| DGEMMINCOPY = dgemm_ncopy_sve_v1.c | |||
| @@ -54,7 +54,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define alpha0 d10 | |||
| #define alphaZ z2.d | |||
| #define A_PRE_SIZE 2560 | |||
| #define A_PRE_SIZE 1536 | |||
| #define B_PRE_SIZE 512 | |||
| #define C_PRE_SIZE 128 | |||
| @@ -134,7 +134,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNELv1x8_I | |||
| ld1d z0.d, p1/z, [pA] | |||
| ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one | |||
| //incb pA, all, mul #2 | |||
| add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 | |||
| ld1rd z8.d, p0/z, [pB] | |||
| @@ -476,13 +475,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ptrue p0.d // create true predicate | |||
| mov pB, origPB | |||
| // Loop over N | |||
| mov counterJ, origN | |||
| asr counterJ, counterJ, #3 // J = J / 8 | |||
| cmp counterJ, #0 | |||
| ble .Ldgemm_kernel_L4_BEGIN | |||
| /******************************************************************************/ | |||
| /* Repeat this as long as there are 8 left in N */ | |||
| .align 5 | |||
| .Ldgemm_kernel_L8_BEGIN: | |||
| @@ -494,8 +494,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .Ldgemm_kernel_L8_Mv1_BEGIN: | |||
| /* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ | |||
| mov counterI, #0 | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| whilelt p1.d, counterI, origM | |||
| cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension | |||
| .align 5 | |||
| @@ -607,7 +608,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| bgt .Ldgemm_kernel_L8_BEGIN | |||
| /******************************************************************************/ | |||
| /******************************************************************************/ | |||
| /* Repeat the same thing if 4 left in N */ | |||
| .align 5 | |||
| .Ldgemm_kernel_L4_BEGIN: | |||
| @@ -692,7 +693,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add origPB, origPB, temp // B = B + K * 4 * 8 | |||
| /******************************************************************************/ | |||
| /******************************************************************************/ | |||
| /* Repeat the same thing if 2 left in N */ | |||
| .align 5 | |||
| .Ldgemm_kernel_L2_BEGIN: | |||
| @@ -773,7 +774,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 | |||
| /******************************************************************************/ | |||
| /******************************************************************************/ | |||
| /* Repeat the same thing if 1 left in N */ | |||
| .align 5 | |||
| .Ldgemm_kernel_L1_BEGIN: | |||
| @@ -25,6 +25,11 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| /* This is an SVE dgemm kernel with size 2*SVE_LEN x 8. | |||
| However, the data layout is the same as for the kernel 1*SVE_LEN x 8. | |||
| This means that we sweep two panels of packed A when iterating in a loop over K. | |||
| With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| @@ -57,7 +62,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define alpha0 d10 | |||
| #define alphaZ z7.d | |||
| #define A_PRE_SIZE 2560 | |||
| #define A_PRE_SIZE 1536 | |||
| #define B_PRE_SIZE 512 | |||
| #define C_PRE_SIZE 128 | |||
| @@ -96,8 +101,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //v00 ALPHA -> pA10_0 | |||
| //v01 pA10_1 | |||
| //v02 | |||
| //v03 | |||
| //v02 pA20_0 | |||
| //v03 pA20_1 | |||
| //v04 | |||
| //v05 | |||
| //v06 | |||
| @@ -118,6 +123,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //v21 must save C5 | |||
| //v22 must save C6 | |||
| //v23 must save C7 | |||
| //v24 must save C8 | |||
| //v25 must save C9 | |||
| //v26 must save C10 | |||
| //v27 must save C11 | |||
| //v28 must save C12 | |||
| //v29 must save C13 | |||
| //v30 must save C14 | |||
| //v31 must save C15 | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| @@ -583,7 +596,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNELv1x8_I | |||
| ld1d z0.d, p1/z, [pA1] | |||
| ld1d z1.d, p1/z, [pA1, lanes, lsl #3] // next one | |||
| //incb pA1, all, mul #2 | |||
| add pA1, pA1, lanes, lsl #4 // pA1 = pA1 + lanes * 2 * 8 | |||
| ld1rd z8.d, p0/z, [pB] | |||
| @@ -928,13 +940,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ptrue p0.d // create true predicate | |||
| mov pB, origPB | |||
| // Loop over N | |||
| mov counterJ, origN | |||
| asr counterJ, counterJ, #3 // J = J / 8 | |||
| cmp counterJ, #0 | |||
| ble .Ldgemm_kernel_L4_BEGIN | |||
| /******************************************************************************/ | |||
| /* Repeat this as long as there are 8 left in N */ | |||
| .align 5 | |||
| .Ldgemm_kernel_L8_BEGIN: | |||
| @@ -947,11 +960,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .Ldgemm_kernel_L8_Mv2_BEGIN: | |||
| mov counterI, #0 | |||
| cmp origM, vec_lenx2 | |||
| cmp origM, vec_lenx2 // Check if M < 2*SVE_LEN | |||
| blt .Ldgemm_kernel_L8_Mv1_BEGIN | |||
| mov counterI, origM | |||
| /* Until we have at least 2*SVE_LEN iters left in M, we do them with V2*8 kernel */ | |||
| mul temp, vec_len, origK // generate address of pA2 | |||
| add pA2, pA1, temp, lsl #3 // pA1 = start of A array | |||
| prfm PLDL1KEEP, [pA2] | |||
| @@ -1063,7 +1077,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp counterI, origM | |||
| beq .Ldgemm_kernel_L8_END | |||
| ////////////////////////////////// | |||
| ////////////////////////////////////////// | |||
| // We have less than 2*SVE_LEN left. We do this with V1x8 kernel. | |||
| .Ldgemm_kernel_L8_Mv1_BEGIN: | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| @@ -1178,7 +1193,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| bgt .Ldgemm_kernel_L8_BEGIN | |||
| /******************************************************************************/ | |||
| /******************************************************************************/ | |||
| /* Repeat the same thing if 4 left in N */ | |||
| .align 5 | |||
| .Ldgemm_kernel_L4_BEGIN: | |||
| @@ -1270,6 +1285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| beq .Ldgemm_kernel_L4_END | |||
| ////////////////////////////////// | |||
| // We have less than 2*SVE_LEN left. We do this with V1x4 kernel. | |||
| .Ldgemm_kernel_L4_Mv1_BEGIN: | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| @@ -1338,7 +1354,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add origPB, origPB, temp // B = B + K * 4 * 8 | |||
| /******************************************************************************/ | |||
| /******************************************************************************/ | |||
| /* Repeat the same thing if 2 left in N */ | |||
| .align 5 | |||
| .Ldgemm_kernel_L2_BEGIN: | |||
| @@ -1428,6 +1444,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ////////////////////////////////// | |||
| // We have less than 2*SVE_LEN left. We do this with V1x2 kernel. | |||
| .Ldgemm_kernel_L2_Mv1_BEGIN: | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| @@ -1493,7 +1510,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 | |||
| /******************************************************************************/ | |||
| /******************************************************************************/ | |||
| /* Repeat the same thing if 1 left in N */ | |||
| .align 5 | |||
| .Ldgemm_kernel_L1_BEGIN: | |||
| @@ -1581,6 +1598,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ////////////////////////////////// | |||
| // We have less than 2*SVE_LEN left. We do this with V1x1 kernel. | |||
| .Ldgemm_kernel_L1_Mv1_BEGIN: | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| @@ -40,40 +40,40 @@ | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| // TODO: write in assembly with proper unrolling | |||
| // TODO: write in assembly with proper unrolling of inner loop | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG j; | |||
| IFLOAT *aoffset, *aoffset1, *boffset; | |||
| BLASLONG j; | |||
| IFLOAT *aoffset, *aoffset1, *boffset; | |||
| svint64_t lda_vec = svindex_s64(0LL, lda); | |||
| uint64_t sve_size = svcntd(); | |||
| svint64_t lda_vec = svindex_s64(0LL, lda); | |||
| uint64_t sve_size = svcntd(); | |||
| aoffset = a; | |||
| boffset = b; | |||
| aoffset = a; | |||
| boffset = b; | |||
| j = 0; | |||
| svbool_t pg = svwhilelt_b64(j, n); | |||
| uint64_t active = svcntp_b64(svptrue_b64(), pg); | |||
| do { | |||
| j = 0; | |||
| svbool_t pg = svwhilelt_b64(j, n); | |||
| uint64_t active = svcntp_b64(svptrue_b64(), pg); | |||
| do { | |||
| aoffset1 = aoffset; | |||
| aoffset1 = aoffset; | |||
| uint64_t i_cnt = m; | |||
| while (i_cnt--) { | |||
| svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec); | |||
| svst1_f64(pg, (double *) boffset, a_vec); | |||
| aoffset1++; | |||
| boffset += active; | |||
| } | |||
| aoffset += sve_size * lda; | |||
| uint64_t i_cnt = m; | |||
| while (i_cnt--) { | |||
| svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec); | |||
| svst1_f64(pg, (double *) boffset, a_vec); | |||
| aoffset1++; | |||
| boffset += active; | |||
| } | |||
| aoffset += sve_size * lda; | |||
| j += svcntd(); | |||
| pg = svwhilelt_b64(j, n); | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| j += svcntd(); | |||
| pg = svwhilelt_b64(j, n); | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| } while (svptest_any(svptrue_b64(), pg)); | |||
| } while (svptest_any(svptrue_b64(), pg)); | |||
| return 0; | |||
| return 0; | |||
| } | |||
| @@ -40,38 +40,38 @@ | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| // TODO: write in assembly with proper unrolling | |||
| // TODO: write in assembly with proper unrolling of inner loop | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG j; | |||
| IFLOAT *aoffset, *aoffset1, *boffset; | |||
| BLASLONG j; | |||
| IFLOAT *aoffset, *aoffset1, *boffset; | |||
| uint64_t sve_size = svcntd(); | |||
| uint64_t sve_size = svcntd(); | |||
| aoffset = a; | |||
| boffset = b; | |||
| aoffset = a; | |||
| boffset = b; | |||
| j = 0; | |||
| svbool_t pg = svwhilelt_b64(j, n); | |||
| uint64_t active = svcntp_b64(svptrue_b64(), pg); | |||
| do { | |||
| j = 0; | |||
| svbool_t pg = svwhilelt_b64(j, n); | |||
| uint64_t active = svcntp_b64(svptrue_b64(), pg); | |||
| do { | |||
| aoffset1 = aoffset; | |||
| aoffset1 = aoffset; | |||
| uint64_t i_cnt = m; | |||
| while (i_cnt--) { | |||
| svfloat64_t a_vec = svld1(pg, (double *)aoffset1); | |||
| svst1_f64(pg, (double *) boffset, a_vec); | |||
| aoffset1 += lda; | |||
| boffset += active; | |||
| } | |||
| aoffset += sve_size; | |||
| uint64_t i_cnt = m; | |||
| while (i_cnt--) { | |||
| svfloat64_t a_vec = svld1(pg, (double *)aoffset1); | |||
| svst1_f64(pg, (double *) boffset, a_vec); | |||
| aoffset1 += lda; | |||
| boffset += active; | |||
| } | |||
| aoffset += sve_size; | |||
| j += svcntd(); | |||
| pg = svwhilelt_b64(j, n); | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| j += svcntd(); | |||
| pg = svwhilelt_b64(j, n); | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| } while (svptest_any(svptrue_b64(), pg)); | |||
| } while (svptest_any(svptrue_b64(), pg)); | |||
| return 0; | |||
| return 0; | |||
| } | |||
| @@ -58,7 +58,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define alpha0 d10 | |||
| #define alphaZ z2.d | |||
| #define A_PRE_SIZE 2560 | |||
| #define A_PRE_SIZE 1536 | |||
| #define B_PRE_SIZE 512 | |||
| #define C_PRE_SIZE 128 | |||
| @@ -138,7 +138,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNELv1x8_I | |||
| ld1d z0.d, p1/z, [pA] | |||
| ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one | |||
| //incb pA, all, mul #2 | |||
| add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 | |||
| ld1rd z8.d, p0/z, [pB] | |||
| @@ -469,13 +468,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| mov pB, origPB | |||
| // Loop over N | |||
| mov counterJ, origN | |||
| asr counterJ, counterJ, #3 // J = J / 8 | |||
| cmp counterJ, #0 | |||
| ble .Ldtrmm_kernel_L4_BEGIN | |||
| /******************************************************************************/ | |||
| /* Repeat this as long as there are 8 left in N */ | |||
| .align 5 | |||
| .Ldtrmm_kernel_L8_BEGIN: | |||
| @@ -491,9 +491,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .Ldtrmm_kernel_L8_Mv1_BEGIN: | |||
| /* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ | |||
| mov counterI, #0 | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.d | |||
| whilelt p1.d, counterI, origM | |||
| cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension | |||
| .align 5 | |||
| .Ldtrmm_kernel_L8_Mv1_20: | |||
| @@ -641,7 +642,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| bgt .Ldtrmm_kernel_L8_BEGIN | |||
| /******************************************************************************/ | |||
| /******************************************************************************/ | |||
| /* Repeat the same thing if 4 left in N */ | |||
| .align 5 | |||
| .Ldtrmm_kernel_L4_BEGIN: | |||
| @@ -757,7 +758,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| /******************************************************************************/ | |||
| /******************************************************************************/ | |||
| /* Repeat the same thing if 2 left in N */ | |||
| .align 5 | |||
| .Ldtrmm_kernel_L2_BEGIN: | |||
| @@ -873,7 +874,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| /******************************************************************************/ | |||
| /******************************************************************************/ | |||
| /* Repeat the same thing if 1 left in N */ | |||
| .align 5 | |||
| .Ldtrmm_kernel_L1_BEGIN: | |||
| @@ -47,7 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| BLASLONG i, js; | |||
| BLASLONG X; | |||
| //printf("Using trmm_ln.\n"); | |||
| int sve_len = svcntd(); | |||
| svint64_t index = svindex_s64(0LL, lda); | |||
| @@ -67,11 +66,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| } | |||
| i = 0; | |||
| /* svbool_t pm = svwhilelt_b64(i, m); */ | |||
| /* int m_active = svcntp_b64(svptrue_b64(), pm); */ | |||
| do | |||
| { | |||
| if (X > posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl | |||
| if (X > posY) { | |||
| svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); | |||
| svst1(pn, b, aj_vec); | |||
| ao ++; | |||
| @@ -85,6 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| X ++; | |||
| i ++; | |||
| } else { | |||
| /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ | |||
| #ifdef UNIT | |||
| int temp = 0; | |||
| for (int j = 0; j < n_active; j++) { | |||
| @@ -114,9 +112,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| } | |||
| } while (i < m); | |||
| //printf("\n"); | |||
| posY += n_active; | |||
| js += n_active; | |||
| pn = svwhilelt_b64(js, n); | |||
| @@ -48,8 +48,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| BLASLONG i, js; | |||
| BLASLONG X; | |||
| //printf("Using trmm_lt.\n"); | |||
| int sve_len = svcntd(); | |||
| FLOAT *ao; | |||
| @@ -67,11 +65,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| } | |||
| i = 0; | |||
| /* svbool_t pm = svwhilelt_b64(i, m); */ | |||
| /* int m_active = svcntp_b64(svptrue_b64(), pm); */ | |||
| do | |||
| { | |||
| if (X > posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl | |||
| if (X > posY) { | |||
| ao ++; | |||
| b += n_active; | |||
| X ++; | |||
| @@ -85,6 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| X ++; | |||
| i ++; | |||
| } else { | |||
| /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ | |||
| #ifdef UNIT | |||
| int temp = 0; | |||
| for (int j = 0; j < n_active; j++) { | |||
| @@ -114,8 +111,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| } | |||
| } while (i < m); | |||
| //printf("\n"); | |||
| posY += n_active; | |||
| js += n_active; | |||
| @@ -47,10 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| BLASLONG i, js; | |||
| BLASLONG X; | |||
| //printf("Using trmm_un.\n"); | |||
| //printf("Using m %ld, n %ld.\n", m, n); | |||
| //printf("Using lda %ld.\n", lda); | |||
| //printf("Using posX %ld, posY %ld.\n", posX, posY); | |||
| int sve_len = svcntd(); | |||
| svint64_t index = svindex_s64(0LL, lda); | |||
| @@ -70,11 +66,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| } | |||
| i = 0; | |||
| /* svbool_t pm = svwhilelt_b64(i, m); */ | |||
| /* int m_active = svcntp_b64(svptrue_b64(), pm); */ | |||
| do | |||
| { | |||
| if (X < posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl | |||
| if (X < posY) { | |||
| svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); | |||
| svst1(pn, b, aj_vec); | |||
| ao ++; | |||
| @@ -88,6 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| X ++; | |||
| i ++; | |||
| } else { | |||
| /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ | |||
| #ifdef UNIT | |||
| int temp = 0; | |||
| for (int j = 0; j < n_active; j++) { | |||
| @@ -117,9 +112,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| } | |||
| } while (i < m); | |||
| //printf("\n"); | |||
| posY += n_active; | |||
| js += n_active; | |||
| pn = svwhilelt_b64(js, n); | |||
| @@ -47,7 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| BLASLONG i, js; | |||
| BLASLONG X; | |||
| //printf("Using trmm_ut.\n"); | |||
| int sve_len = svcntd(); | |||
| @@ -66,11 +65,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| } | |||
| i = 0; | |||
| /* svbool_t pm = svwhilelt_b64(i, m); */ | |||
| /* int m_active = svcntp_b64(svptrue_b64(), pm); */ | |||
| do | |||
| { | |||
| if (X < posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl | |||
| if (X < posY) { | |||
| ao ++; | |||
| b += n_active; | |||
| X ++; | |||
| @@ -83,7 +80,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| b += n_active; | |||
| X ++; | |||
| i ++; | |||
| } else { | |||
| } else { | |||
| /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ | |||
| #ifdef UNIT | |||
| int temp = 0; | |||
| for (int j = 0; j < n_active; j++) { | |||
| @@ -113,9 +111,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| } | |||
| } while (i < m); | |||
| //printf("\n"); | |||
| posY += n_active; | |||
| js += n_active; | |||
| pn = svwhilelt_b64(js, n); | |||