#include "common.h" #include #define KERNEL8x4_I \ "addi t1, %[PB], 1*8 \n\t"\ "addi t2, %[PB], 2*8 \n\t"\ "addi t3, %[PB], 3*8 \n\t"\ "fld ft0, (%[PB]) \n\t"\ "fld ft1, (t1) \n\t"\ "fld ft2, (t2) \n\t"\ "fld ft3, (t3) \n\t"\ "vle.v v0, (%[PA]) \n\t"\ "addi t4, %[PA], 2*8 \n\t"\ "addi t5, %[PA], 4*8 \n\t"\ "vfmv.v.f v8, ft0 \n\t"\ "addi t6, %[PA], 6*8 \n\t"\ "addi %[PA], %[PA], 8*8 \n\t"\ "vle.v v1, (t4) \n\t"\ "addi t4, t4, 8*8 \n\t"\ "vfmv.v.f v9, ft1 \n\t"\ "vle.v v2, (t5) \n\t"\ "addi t5, t5, 8*8 \n\t"\ "vle.v v3, (t6) \n\t"\ "addi t6, t6, 8*8 \n\t"\ "vfmv.v.f v10, ft2 \n\t"\ "addi %[PB], %[PB], 4*8 \n\t"\ "vle.v v4, (%[PA]) \n\t"\ "addi %[PA], %[PA], 8*8 \n\t"\ "vfmv.v.f v11, ft3 \n\t"\ "vfmacc.vv v16, v8, v0 \n\t"\ "addi t1, t1, 4*8 \n\t"\ "vle.v v5, (t4) \n\t"\ "addi t4, t4, 8*8 \n\t"\ "vfmacc.vv v17, v8, v1 \n\t"\ "addi t2, t2, 4*8 \n\t"\ "vle.v v6, (t5) \n\t"\ "addi t5, t5, 8*8 \n\t"\ "vfmacc.vv v18, v8, v2 \n\t"\ "addi t3, t3, 4*8 \n\t"\ "vle.v v7, (t6) \n\t"\ "addi t6, t6, 8*8 \n\t"\ "vfmacc.vv v19, v8, v3 \n\t"\ "fld ft4, (%[PB]) \n\t"\ "vfmacc.vv v20, v9, v0 \n\t"\ "fld ft5, (t1) \n\t"\ "vfmacc.vv v21, v9, v1 \n\t"\ "fld ft6, (t2) \n\t"\ "vfmacc.vv v22, v9, v2 \n\t"\ "fld ft7, (t3) \n\t"\ "vfmacc.vv v23, v9, v3 \n\t"\ "vfmv.v.f v12, ft4 \n\t"\ "vfmacc.vv v24, v10, v0 \n\t"\ "vfmv.v.f v13, ft5 \n\t"\ "vfmacc.vv v25, v10, v1 \n\t"\ "vfmv.v.f v14, ft6 \n\t"\ "vfmacc.vv v26, v10, v2 \n\t"\ "vfmv.v.f v15, ft7 \n\t"\ "vfmacc.vv v27, v10, v3 \n\t"\ "addi %[PB], %[PB], 4*8 \n\t"\ "vfmacc.vv v28, v11, v0 \n\t"\ "addi t1, t1, 4*8 \n\t"\ "vfmacc.vv v29, v11, v1 \n\t"\ "addi t2, t2, 4*8 \n\t"\ "vfmacc.vv v30, v11, v2 \n\t"\ "addi t3, t3, 4*8 \n\t"\ "vfmacc.vv v31, v11, v3 \n\t" #define KERNEL8x4_M1 \ "vfmacc.vv v16, v8, v0 \n\t"\ "vle.v v4, (%[PA]) \n\t"\ "addi %[PA], %[PA], 8*8 \n\t"\ "vfmacc.vv v17, v8, v1 \n\t"\ "vle.v v5, (t4) \n\t"\ "addi t4, t4, 8*8 \n\t"\ "vfmacc.vv v18, v8, v2 \n\t"\ "vle.v v6, (t5) \n\t"\ "addi t5, t5, 8*8 \n\t"\ "vfmacc.vv v19, v8, v3 \n\t"\ "vle.v v7, (t6) \n\t"\ "addi t6, t6, 8*8 \n\t"\ "vfmacc.vv v20, v9, v0 \n\t"\ "fld ft4, (%[PB]) \n\t"\ "vfmacc.vv v21, v9, v1 \n\t"\ "fld ft5, (t1) \n\t"\ "vfmacc.vv v22, v9, v2 \n\t"\ "fld ft6, (t2) \n\t"\ "vfmacc.vv v23, v9, v3 \n\t"\ "fld ft7, (t3) \n\t"\ "addi %[PB], %[PB], 4*8 \n\t"\ "vfmacc.vv v24, v10, v0 \n\t"\ "addi t1, t1, 4*8 \n\t"\ "vfmacc.vv v25, v10, v1 \n\t"\ "vfmv.v.f v12, ft4 \n\t"\ "vfmacc.vv v26, v10, v2 \n\t"\ "addi t2, t2, 4*8 \n\t"\ "vfmacc.vv v27, v10, v3 \n\t"\ "vfmv.v.f v13, ft5 \n\t"\ "vfmacc.vv v28, v11, v0 \n\t"\ "addi t3, t3, 4*8 \n\t"\ "vfmacc.vv v29, v11, v1 \n\t"\ "vfmv.v.f v14, ft6 \n\t"\ "vfmacc.vv v30, v11, v2 \n\t"\ "vfmacc.vv v31, v11, v3 \n\t"\ "vfmv.v.f v15, ft7 \n\t" #define KERNEL8x4_M2 \ "vfmacc.vv v16, v12, v4 \n\t"\ "vle.v v0, (%[PA]) \n\t"\ "addi %[PA], %[PA], 8*8 \n\t"\ "vfmacc.vv v17, v12, v5 \n\t"\ "vle.v v1, (t4) \n\t"\ "addi t4, t4, 8*8 \n\t"\ "vfmacc.vv v18, v12, v6 \n\t"\ "vle.v v2, (t5) \n\t"\ "addi t5, t5, 8*8 \n\t"\ "vfmacc.vv v19, v12, v7 \n\t"\ "vle.v v3, (t6) \n\t"\ "addi t6, t6, 8*8 \n\t"\ "vfmacc.vv v20, v13, v4 \n\t"\ "fld ft0, (%[PB]) \n\t"\ "vfmacc.vv v21, v13, v5 \n\t"\ "fld ft1, (t1) \n\t"\ "vfmacc.vv v22, v13, v6 \n\t"\ "fld ft2, (t2) \n\t"\ "vfmacc.vv v23, v13, v7 \n\t"\ "fld ft3, (t3) \n\t"\ "addi %[PB], %[PB], 4*8 \n\t"\ "vfmacc.vv v24, v14, v4 \n\t"\ "addi t1, t1, 4*8 \n\t"\ "vfmacc.vv v25, v14, v5 \n\t"\ "vfmv.v.f v8, ft0 \n\t"\ "vfmacc.vv v26, v14, v6 \n\t"\ "addi t2, t2, 4*8 \n\t"\ "vfmacc.vv v27, v14, v7 \n\t"\ "vfmv.v.f v9, ft1 \n\t"\ "vfmacc.vv v28, 
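/* Together the four macros form a two-stage software pipeline: _I primes
   both operand sets, _M1/_M2 alternate so that the vector loads for one
   k step overlap the vfmacc.vv work of the other, and _E below drains the
   last prefetched set without issuing any further loads. */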
#define KERNEL8x4_E \
    "vfmacc.vv  v16, v12, v4          \n\t"\
    "vfmacc.vv  v17, v12, v5          \n\t"\
    "vfmacc.vv  v18, v12, v6          \n\t"\
    "vfmacc.vv  v19, v12, v7          \n\t"\
    "vfmacc.vv  v20, v13, v4          \n\t"\
    "vfmacc.vv  v21, v13, v5          \n\t"\
    "vfmacc.vv  v22, v13, v6          \n\t"\
    "vfmacc.vv  v23, v13, v7          \n\t"\
    "vfmacc.vv  v24, v14, v4          \n\t"\
    "vfmacc.vv  v25, v14, v5          \n\t"\
    "vfmacc.vv  v26, v14, v6          \n\t"\
    "vfmacc.vv  v27, v14, v7          \n\t"\
    "vfmacc.vv  v28, v15, v4          \n\t"\
    "vfmacc.vv  v29, v15, v5          \n\t"\
    "vfmacc.vv  v30, v15, v6          \n\t"\
    "vfmacc.vv  v31, v15, v7          \n\t"

/* C += alpha * A * B on a bm x bn block: ba and bb point at the packed A
   and B panels, C has leading dimension ldc; offset carries the diagonal
   position when built as a TRMM kernel. */
int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha,
          FLOAT *ba, FLOAT *bb, FLOAT *C, BLASLONG ldc
#ifdef TRMMKERNEL
          , BLASLONG offset
#endif
          )
{
    BLASLONG i, j, k;
    FLOAT *C0, *C1, *C2, *C3;
    FLOAT *ptrba, *ptrbb;
    FLOAT loadb0, loadb1, loadb2, loadb3;
    FLOAT load0, load1, load2, load3, load4, load5, load6, load7;
    FLOAT res0, res1, res2, res3;
    FLOAT res4, res5, res6, res7;
    FLOAT res8, res9, res10, res11;
    FLOAT res12, res13, res14, res15;

    /* Outer loop over 4-wide column panels of C (loop bound reconstructed
       from the 4-column blocking; the line is truncated in the source). */
    for (j = 0; j < bn/4; j++)
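/*
 * Hedged sketch (the loop body is truncated in this excerpt, so this is not
 * the original code): the register ping-pong fixes the only legal chaining
 * order of the macros as _I, then alternating _M2/_M1, closing with _M1 + _E,
 * which covers bk = 2*T + 4 k steps for T loop trips (even bk >= 4; the
 * remainder path is handled separately).  The operand names (ptrba, ptrbb, k)
 * and the clobber list are assumptions; v0-v31 are also clobbered, but their
 * clobber spelling is toolchain-specific, so the block is left disabled.
 */
#if 0   /* illustrative only */
    k = (bk - 4) / 2;                            /* T trips of two k steps  */
    asm volatile(
        KERNEL8x4_I                              /* k step 0: prime both    */
        KERNEL8x4_M2                             /* k step 1                */
        "1:                               \n\t"
        KERNEL8x4_M1                             /* two k steps per trip    */
        KERNEL8x4_M2
        "addi       %[K], %[K], -1        \n\t"
        "bnez       %[K], 1b              \n\t"
        KERNEL8x4_M1
        KERNEL8x4_E                              /* drain, no loads         */
        : [PA] "+r"(ptrba), [PB] "+r"(ptrbb), [K] "+r"(k)
        :
        : "cc", "memory", "t1", "t2", "t3", "t4", "t5", "t6",
          "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7");
#endif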