Browse Source

fix sve dgemm kernel + sve dtrmm

tags/v0.3.19
Bine Brank 4 years ago
parent
commit
a8fbdbac34
2 changed files with 1088 additions and 59 deletions
  1. +81
    -59
      kernel/arm64/dgemm_kernel_sve_v1x8.S
  2. +1007
    -0
      kernel/arm64/dtrmm_kernel_sve_v1x8.S

+ 81
- 59
kernel/arm64/dgemm_kernel_sve_v1x8.S View File

@@ -46,16 +46,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
#define pCRow3 x15

#define lanes x15
#define pA x16
#define alpha x17

#define alpha0 d10
#define alphaZ z10.d
#define alphaV0 v10.d[0]
#define alphaZ z2.d

#define A_PRE_SIZE 2560
#define B_PRE_SIZE 448
#define B_PRE_SIZE 512
#define C_PRE_SIZE 128

// 00 origM
@@ -73,9 +73,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
// 15 pCRow3
// 15 lanes
// 16 pA
// 17
// 17
// 18 must save
// 19 must save
// 20 must save
@@ -93,20 +93,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

//v00 ALPHA -> pA0_0
//v01 pA0_1
//v02 pA0_2
//v03 pA0_3
//v04 pA0_4
//v05 pA0_5
//v06 pA0_6
//v07 pA0_7
//v02 ALPHA0
//v03
//v04
//v05
//v06
//v07
//v08 must save pB0_0
//v09 must save pB0_1
//v10 must save pB0_2 --> ALPHA0
//v10 must save pB0_2
//v11 must save pB0_3
//v12 must save pB1_0
//v13 must save pB1_1
//v14 must save pB1_2
//v15 must save pB1_3
//v12 must save pB0_4
//v13 must save pB0_5
//v14 must save pB0_6
//v15 must save pB0_7
//v16 must save C0
//v17 must save C1
//v18 must save C2
@@ -133,9 +133,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.macro KERNELv1x8_I
ld1d z0.d, p1/z, [pA]
ld1d z1.d, p1/z, [pA, x18, lsl #3] // next one
ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one
//incb pA, all, mul #2
add pA, pA, x18, lsl #4 // pA = pA + cnt_active * 2 * 8
add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8

ld1rd z8.d, p0/z, [pB]
ld1rd z9.d, p0/z, [pB, 8]
@@ -157,12 +157,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmla z19.d, p1/m, z0.d, z11.d
ld1rd z11.d, p0/z, [pB, 24]
fmla z20.d, p1/m, z0.d, z12.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
ld1rd z12.d, p0/z, [pB, 32]
fmla z21.d, p1/m, z0.d, z13.d
ld1rd z13.d, p0/z, [pB, 40]
fmla z22.d, p1/m, z0.d, z14.d
ld1rd z14.d, p0/z, [pB, 48]
fmla z23.d, p1/m, z0.d, z15.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
ld1rd z15.d, p0/z, [pB, 56]

add pB, pB, 64
@@ -170,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.macro KERNELv1x8_M1
ld1d z1.d, p1/z, [pA]
add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8

fmla z16.d, p1/m, z0.d, z8.d
ld1rd z8.d, p0/z, [pB]
@@ -181,12 +183,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmla z19.d, p1/m, z0.d, z11.d
ld1rd z11.d, p0/z, [pB, 24]
fmla z20.d, p1/m, z0.d, z12.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
ld1rd z12.d, p0/z, [pB, 32]
fmla z21.d, p1/m, z0.d, z13.d
ld1rd z13.d, p0/z, [pB, 40]
fmla z22.d, p1/m, z0.d, z14.d
ld1rd z14.d, p0/z, [pB, 48]
fmla z23.d, p1/m, z0.d, z15.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
ld1rd z15.d, p0/z, [pB, 56]

add pB, pB, 64
@@ -194,7 +198,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.macro KERNELv1x8_M2
ld1d z0.d, p1/z, [pA]
add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8

fmla z16.d, p1/m, z1.d, z8.d
ld1rd z8.d, p0/z, [pB]
@@ -206,6 +210,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1rd z11.d, p0/z, [pB, 24]
fmla z20.d, p1/m, z1.d, z12.d
ld1rd z12.d, p0/z, [pB, 32]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla z21.d, p1/m, z1.d, z13.d
ld1rd z13.d, p0/z, [pB, 40]
fmla z22.d, p1/m, z1.d, z14.d
@@ -222,6 +227,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmla z18.d, p1/m, z1.d, z10.d
fmla z19.d, p1/m, z1.d, z11.d
fmla z20.d, p1/m, z1.d, z12.d
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla z21.d, p1/m, z1.d, z13.d
fmla z22.d, p1/m, z1.d, z14.d
fmla z23.d, p1/m, z1.d, z15.d
@@ -229,7 +235,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.macro KERNELv1x8_SUB
ld1d z0.d, p1/z, [pA]
add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8

ld1rd z8.d, p0/z, [pB]
ld1rd z9.d, p0/z, [pB, 8]
@@ -245,16 +251,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmla z16.d, p1/m, z0.d, z8.d
fmla z17.d, p1/m, z0.d, z9.d
fmla z18.d, p1/m, z0.d, z10.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla z19.d, p1/m, z0.d, z11.d
fmla z20.d, p1/m, z0.d, z12.d
fmla z21.d, p1/m, z0.d, z13.d
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla z22.d, p1/m, z0.d, z14.d
fmla z23.d, p1/m, z0.d, z15.d

.endm

.macro SAVEv1x8
dup alphaZ, alpha

prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]

@@ -262,43 +269,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1d z24.d, p1/z, [pCRow0]
fmla z24.d, p1/m, z16.d, alphaZ
st1d z24.d, p1, [pCRow0]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]

add pCRow2, pCRow1, LDC
ld1d z25.d, p1/z, [pCRow1]
fmla z25.d, p1/m, z17.d, alphaZ
st1d z25.d, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]

add pCRow1, pCRow2, LDC
ld1d z26.d, p1/z, [pCRow2]
fmla z26.d, p1/m, z18.d, alphaZ
st1d z26.d, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]

add pCRow2, pCRow1, LDC
ld1d z27.d, p1/z, [pCRow1]
fmla z27.d, p1/m, z19.d, alphaZ
st1d z27.d, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]

add pCRow1, pCRow2, LDC
ld1d z28.d, p1/z, [pCRow2]
fmla z28.d, p1/m, z20.d, alphaZ
st1d z28.d, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]

add pCRow2, pCRow1, LDC
ld1d z29.d, p1/z, [pCRow1]
fmla z29.d, p1/m, z21.d, alphaZ
st1d z29.d, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]

add pCRow1, pCRow2, LDC
ld1d z30.d, p1/z, [pCRow2]
fmla z30.d, p1/m, z22.d, alphaZ
st1d z30.d, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]

add pCRow2, pCRow1, LDC
ld1d z31.d, p1/z, [pCRow1]
fmla z31.d, p1/m, z23.d, alphaZ
st1d z31.d, p1, [pCRow1]

add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8

.endm

@@ -313,7 +326,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.macro KERNELv1x4_SUB
ld1d z0.d, p1/z, [pA]
add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8

ld1rd z8.d, p0/z, [pB]
ld1rd z9.d, p0/z, [pB, 8]
@@ -324,13 +337,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

fmla z16.d, p1/m, z0.d, z8.d
fmla z17.d, p1/m, z0.d, z9.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla z18.d, p1/m, z0.d, z10.d
fmla z19.d, p1/m, z0.d, z11.d

.endm

.macro SAVEv1x4
dup alphaZ, alpha

prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]

@@ -338,23 +351,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1d z24.d, p1/z, [pCRow0]
fmla z24.d, p1/m, z16.d, alphaZ
st1d z24.d, p1, [pCRow0]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]

add pCRow2, pCRow1, LDC
ld1d z25.d, p1/z, [pCRow1]
fmla z25.d, p1/m, z17.d, alphaZ
st1d z25.d, p1, [pCRow1]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]

add pCRow1, pCRow2, LDC
ld1d z26.d, p1/z, [pCRow2]
fmla z26.d, p1/m, z18.d, alphaZ
st1d z26.d, p1, [pCRow2]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]

add pCRow2, pCRow1, LDC
ld1d z27.d, p1/z, [pCRow1]
fmla z27.d, p1/m, z19.d, alphaZ
st1d z27.d, p1, [pCRow1]

add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8

.endm

@@ -367,7 +382,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.macro KERNELv1x2_SUB
ld1d z0.d, p1/z, [pA]
add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8

ld1rd z8.d, p0/z, [pB]
ld1rd z9.d, p0/z, [pB, 8]
@@ -375,12 +390,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pB, pB, 16

fmla z16.d, p1/m, z0.d, z8.d
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla z17.d, p1/m, z0.d, z9.d

.endm

.macro SAVEv1x2
dup alphaZ, alpha

prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]

@@ -388,13 +403,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1d z24.d, p1/z, [pCRow0]
fmla z24.d, p1/m, z16.d, alphaZ
st1d z24.d, p1, [pCRow0]
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]

add pCRow2, pCRow1, LDC
ld1d z25.d, p1/z, [pCRow1]
fmla z25.d, p1/m, z17.d, alphaZ
st1d z25.d, p1, [pCRow1]

add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8

.endm

@@ -406,28 +421,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

/*
 * KERNELv1x1_SUB: one K-step of the (one SVE vector of A) x (one column of B)
 * tile.
 *
 *   z0  <- the p1-active doubles at pA
 *   z8  <- the double at pB, replicated to every lane (p0 is all-true)
 *   z16 += z0 * z8 on the p1-active lanes
 *
 * pA advances by lanes * 8 bytes and pB by 8 bytes.  pA must be advanced
 * exactly once per K-step; the advance uses `lanes` rather than x18, which
 * is the platform-reserved register under AAPCS64.
 */
.macro KERNELv1x1_SUB
	ld1d	z0.d, p1/z, [pA]
	add	pA, pA, lanes, lsl #3		// pA = pA + lanes * 8

	ld1rd	z8.d, p0/z, [pB]

	add	pB, pB, 8

	fmla	z16.d, p1/m, z0.d, z8.d
	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE]	// prefetch A ahead of the already advanced pA

.endm

/*
 * SAVEv1x1: write back the (one SVE vector) x 1 accumulator tile.
 *
 *   C[pCRow0] = C[pCRow0] + z16 * alpha   on the p1-active lanes
 *
 * Afterwards pCRow0 is moved forward by lanes * 8 bytes.  The pointer must
 * be advanced exactly once; the advance uses `lanes` rather than x18, which
 * is the platform-reserved register under AAPCS64.
 */
.macro SAVEv1x1
	dup	alphaZ, alpha			// broadcast scalar alpha to every lane of alphaZ

	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE]

	add	pCRow1, pCRow0, LDC		// pCRow1 = pCRow0 + LDC (not read again in this macro)
	ld1d	z24.d, p1/z, [pCRow0]
	fmla	z24.d, p1/m, z16.d, alphaZ	// z24 = C + accumulator * alpha
	st1d	z24.d, p1, [pCRow0]

	add	pCRow0, pCRow0, lanes, lsl #3	// pC = pC + lanes * 8

.endm

@@ -456,6 +470,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
prfm PLDL1KEEP, [origPA]

fmov alpha, d0
dup alphaZ, alpha

lsl LDC, LDC, #3 // ldc = ldc * 8
ptrue p0.d // create true predicate
@@ -473,7 +488,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.Ldgemm_kernel_L8_BEGIN:
mov pCRow0, pC

add pC, pCRow0, LDC, lsl #3 // add 8 x LDC
add pC, pC, LDC, lsl #3 // add 8 x LDC

mov pA, origPA // pA = start of A array

@@ -481,11 +496,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

mov counterI, #0
whilelt p1.d, counterI, origM //SVE instruction
cntp x18, p0, p1.d
/* mov counterI, origM */
/* asr counterI, counterI, #3 // counterI = counterI / 8 */
/* cmp counterI, #0 */
/* ble .Ldgemm_kernel_L4_M4_BEGIN */
cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension

.align 5
.Ldgemm_kernel_L8_Mv1_20:
@@ -584,7 +595,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

incd counterI
whilelt p1.d, counterI, origM //SVE instruction
cntp x18, p0, p1.d
cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension
b.any .Ldgemm_kernel_L8_Mv1_20

.Ldgemm_kernel_L8_END:
@@ -608,7 +619,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

mov pCRow0, pC

add pC, pCRow0, LDC, lsl #2 // add 4 x LDC
add pC, pC, LDC, lsl #2 // add 4 x LDC

mov pA, origPA // pA = start of A array

@@ -616,7 +627,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

mov counterI, #0
whilelt p1.d, counterI, origM //SVE instruction
cntp x18, p0, p1.d
cntp lanes, p0, p1.d

.align 5
.Ldgemm_kernel_L4_Mv1_20:
@@ -626,17 +637,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

asr counterL , origK, #3 // L = K / 8
cmp counterL , #0 // is there at least 8 to do?
blt .Ldgemm_kernel_L4_Mv1_44
ble .Ldgemm_kernel_L4_Mv1_44

.align 5
.Ldgemm_kernel_L4_Mv1_22:

prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
KERNELv1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
KERNELv1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
KERNELv1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB
KERNELv1x4_SUB

@@ -651,6 +666,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.align 5
.Ldgemm_kernel_L4_Mv1_46:

prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x4_SUB

subs counterL, counterL, #1
@@ -667,12 +683,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

incd counterI
whilelt p1.d, counterI, origM //SVE instruction
cntp x18, p0, p1.d
cntp lanes, p0, p1.d
b.any .Ldgemm_kernel_L4_Mv1_20


.Ldgemm_kernel_L4_END:
add origPB, origPB, origK, lsl #5 // B = B + K * 4 * 8
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8

/******************************************************************************/
/******************************************************************************/
@@ -686,7 +703,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

mov pCRow0, pC

add pC, pCRow0, LDC, lsl #1 // add 2 x LDC
add pC, pC, LDC, lsl #1 // add 2 x LDC

mov pA, origPA // pA = start of A array

@@ -694,7 +711,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

mov counterI, #0
whilelt p1.d, counterI, origM //SVE instruction
cntp x18, p0, p1.d
cntp lanes, p0, p1.d

.align 5
.Ldgemm_kernel_L2_Mv1_20:
@@ -704,15 +721,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

asr counterL , origK, #3 // L = K / 8
cmp counterL , #0 // is there at least 8 to do?
blt .Ldgemm_kernel_L2_Mv1_44
ble .Ldgemm_kernel_L2_Mv1_44

.align 5
.Ldgemm_kernel_L2_Mv1_22:

prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x2_SUB
KERNELv1x2_SUB
KERNELv1x2_SUB
KERNELv1x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x2_SUB
KERNELv1x2_SUB
KERNELv1x2_SUB
@@ -729,6 +748,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.align 5
.Ldgemm_kernel_L2_Mv1_46:

prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x2_SUB

subs counterL, counterL, #1
@@ -745,7 +765,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

incd counterI
whilelt p1.d, counterI, origM //SVE instruction
cntp x18, p0, p1.d
cntp lanes, p0, p1.d
b.any .Ldgemm_kernel_L2_Mv1_20


@@ -764,7 +784,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

mov pCRow0, pC

add pC, pCRow0, LDC, lsl #1 // add 2 x LDC
add pC, pC, LDC // add 1 x LDC

mov pA, origPA // pA = start of A array

@@ -772,7 +792,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

mov counterI, #0
whilelt p1.d, counterI, origM //SVE instruction
cntp x18, p0, p1.d
cntp lanes, p0, p1.d

.align 5
.Ldgemm_kernel_L1_Mv1_20:
@@ -781,12 +801,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
INITv1x1 // fill with zeros

asr counterL , origK, #3 // L = K / 8
cmp counterL , #0 // is there at least 4 to do?
blt .Ldgemm_kernel_L1_Mv1_44
cmp counterL , #0 // is there at least 8 to do?
ble .Ldgemm_kernel_L1_Mv1_44

.align 5
.Ldgemm_kernel_L1_Mv1_22:

prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x1_SUB
KERNELv1x1_SUB
KERNELv1x1_SUB
@@ -807,10 +828,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.align 5
.Ldgemm_kernel_L1_Mv1_46:

prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNELv1x1_SUB

subs counterL, counterL, #1
bne .Ldgemm_kernel_L1_Mv1_46
bgt .Ldgemm_kernel_L1_Mv1_46

.Ldgemm_kernel_L1_Mv1_100:
prfm PLDL1KEEP, [pA]
@@ -823,7 +845,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

incd counterI
whilelt p1.d, counterI, origM //SVE instruction
cntp x18, p0, p1.d
cntp lanes, p0, p1.d
b.any .Ldgemm_kernel_L1_Mv1_20




+ 1007
- 0
kernel/arm64/dtrmm_kernel_sve_v1x8.S
File diff suppressed because it is too large
View File


Loading…
Cancel
Save