| @@ -228,20 +228,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .L11: | |||
| //X & Y aligned | |||
| gsLQC1(X_BASE,A2,A1,0*SIZE) | |||
| gsLQC1(X_BASE,A4,A3,2*SIZE) | |||
| gsLQC1(X_BASE,A6,A5,4*SIZE) | |||
| gsLQC1(X_BASE,A8,A7,6*SIZE) | |||
| gsLQC1(X_BASE,A10,A9,8*SIZE) | |||
| gsLQC1(X_BASE,A12,A11,10*SIZE) | |||
| gsLQC1(X_BASE,A14,A13,12*SIZE) | |||
| gsLQC1(X_BASE,A16,A15,14*SIZE) | |||
| gsLQC1(Y_BASE,B2,B1,0*SIZE) | |||
| gsLQC1(Y_BASE,B4,B3,2*SIZE) | |||
| gsLQC1(Y_BASE,B6,B5,4*SIZE) | |||
| gsLQC1(Y_BASE,B8,B7,6*SIZE) | |||
| gsLQC1(X_BASE,A2,A1,0) | |||
| gsLQC1(X_BASE,A4,A3,1) | |||
| gsLQC1(X_BASE,A6,A5,2) | |||
| gsLQC1(X_BASE,A8,A7,3) | |||
| gsLQC1(X_BASE,A10,A9,4) | |||
| gsLQC1(X_BASE,A12,A11,5) | |||
| gsLQC1(X_BASE,A14,A13,6) | |||
| gsLQC1(X_BASE,A16,A15,7) | |||
| gsLQC1(Y_BASE,B2,B1,0) | |||
| gsLQC1(Y_BASE,B4,B3,1) | |||
| gsLQC1(Y_BASE,B6,B5,2) | |||
| gsLQC1(Y_BASE,B8,B7,3) | |||
| blez I, .L13 | |||
| NOP | |||
| @@ -251,65 +251,65 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| MADD t1, b1, ALPHA, a1 | |||
| MADD t2, b2, ALPHA, a2 | |||
| gsSQC1(Y_BASE, T2, T1, 0*SIZE) | |||
| gsLQC1(Y_BASE,B2,B1,8*SIZE) | |||
| gsSQC1(Y_BASE, T2, T1, 0) | |||
| gsLQC1(Y_BASE,B2,B1,4) | |||
| MADD t3, b3, ALPHA, a3 | |||
| MADD t4, b4, ALPHA, a4 | |||
| gsSQC1(Y_BASE, T4, T3, 2*SIZE) | |||
| gsLQC1(Y_BASE,B4,B3,10*SIZE) | |||
| gsSQC1(Y_BASE, T4, T3, 1) | |||
| gsLQC1(Y_BASE,B4,B3,5) | |||
| PREFETCHD(PREFETCH_DISTANCE*SIZE(Y)) | |||
| PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(Y)) | |||
| MADD t1, b5, ALPHA, a5 | |||
| MADD t2, b6, ALPHA, a6 | |||
| gsSQC1(Y_BASE, T2, T1, 4*SIZE) | |||
| gsLQC1(Y_BASE,B6,B5,12*SIZE) | |||
| gsSQC1(Y_BASE, T2, T1, 2) | |||
| gsLQC1(Y_BASE,B6,B5,6) | |||
| MADD t3, b7, ALPHA, a7 | |||
| MADD t4, b8, ALPHA, a8 | |||
| gsSQC1(Y_BASE, T4, T3, 6*SIZE) | |||
| gsLQC1(Y_BASE,B8,B7,14*SIZE) | |||
| gsSQC1(Y_BASE, T4, T3, 3) | |||
| gsLQC1(Y_BASE,B8,B7, 7) | |||
| PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y)) | |||
| PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y)) | |||
| MADD t1, b1, ALPHA, a9 | |||
| MADD t2, b2, ALPHA, a10 | |||
| gsSQC1(Y_BASE, T2, T1, 8*SIZE) | |||
| gsLQC1(Y_BASE,B2,B1,16*SIZE) | |||
| gsSQC1(Y_BASE, T2, T1, 4) | |||
| gsLQC1(Y_BASE,B2,B1,8) | |||
| MADD t3, b3, ALPHA, a11 | |||
| MADD t4, b4, ALPHA, a12 | |||
| gsSQC1(Y_BASE, T4, T3, 10*SIZE) | |||
| gsLQC1(Y_BASE,B4,B3,18*SIZE) | |||
| gsSQC1(Y_BASE, T4, T3, 5) | |||
| gsLQC1(Y_BASE,B4,B3,9) | |||
| PREFETCHD(PREFETCH_DISTANCE*SIZE(X)) | |||
| PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X)) | |||
| MADD t1, b5, ALPHA, a13 | |||
| MADD t2, b6, ALPHA, a14 | |||
| gsSQC1(Y_BASE, T2, T1, 12*SIZE) | |||
| gsLQC1(Y_BASE,B6,B5,20*SIZE) | |||
| gsSQC1(Y_BASE, T2, T1, 6) | |||
| gsLQC1(Y_BASE,B6,B5,10) | |||
| MADD t3, b7, ALPHA, a15 | |||
| MADD t4, b8, ALPHA, a16 | |||
| gsSQC1(Y_BASE, T4, T3, 14*SIZE) | |||
| gsLQC1(Y_BASE,B8,B7,22*SIZE) | |||
| gsSQC1(Y_BASE, T4, T3, 7) | |||
| gsLQC1(Y_BASE,B8,B7,11) | |||
| PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X)) | |||
| PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X)) | |||
| gsLQC1(X_BASE,A2,A1,16*SIZE) | |||
| gsLQC1(X_BASE,A4,A3,18*SIZE) | |||
| gsLQC1(X_BASE,A6,A5,20*SIZE) | |||
| gsLQC1(X_BASE,A8,A7,22*SIZE) | |||
| gsLQC1(X_BASE,A2,A1,8) | |||
| gsLQC1(X_BASE,A4,A3,9) | |||
| gsLQC1(X_BASE,A6,A5,10) | |||
| gsLQC1(X_BASE,A8,A7,11) | |||
| gsLQC1(X_BASE,A10,A9,24*SIZE) | |||
| gsLQC1(X_BASE,A12,A11,26*SIZE) | |||
| gsLQC1(X_BASE,A14,A13,28*SIZE) | |||
| gsLQC1(X_BASE,A16,A15,30*SIZE) | |||
| gsLQC1(X_BASE,A10,A9,12) | |||
| gsLQC1(X_BASE,A12,A11,13) | |||
| gsLQC1(X_BASE,A14,A13,14) | |||
| gsLQC1(X_BASE,A16,A15,15) | |||
| daddiu I, I, -1 | |||
| @@ -324,44 +324,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| MADD t1, b1, ALPHA, a1 | |||
| MADD t2, b2, ALPHA, a2 | |||
| gsSQC1(Y_BASE, T2, T1, 0*SIZE) | |||
| gsLQC1(Y_BASE,B2,B1,8*SIZE) | |||
| gsSQC1(Y_BASE, T2, T1, 0) | |||
| gsLQC1(Y_BASE,B2,B1,4) | |||
| MADD t3, b3, ALPHA, a3 | |||
| MADD t4, b4, ALPHA, a4 | |||
| gsSQC1(Y_BASE, T4, T3, 2*SIZE) | |||
| gsLQC1(Y_BASE,B4,B3,10*SIZE) | |||
| gsSQC1(Y_BASE, T4, T3, 1) | |||
| gsLQC1(Y_BASE,B4,B3,5) | |||
| MADD t1, b5, ALPHA, a5 | |||
| MADD t2, b6, ALPHA, a6 | |||
| gsSQC1(Y_BASE, T2, T1, 4*SIZE) | |||
| gsLQC1(Y_BASE,B6,B5,12*SIZE) | |||
| gsSQC1(Y_BASE, T2, T1, 2) | |||
| gsLQC1(Y_BASE,B6,B5,6) | |||
| MADD t3, b7, ALPHA, a7 | |||
| MADD t4, b8, ALPHA, a8 | |||
| gsSQC1(Y_BASE, T4, T3, 6*SIZE) | |||
| gsLQC1(Y_BASE,B8,B7,14*SIZE) | |||
| gsSQC1(Y_BASE, T4, T3, 3) | |||
| gsLQC1(Y_BASE,B8,B7,7) | |||
| MADD t1, b1, ALPHA, a9 | |||
| MADD t2, b2, ALPHA, a10 | |||
| gsSQC1(Y_BASE, T2, T1, 8*SIZE) | |||
| gsSQC1(Y_BASE, T2, T1, 4) | |||
| MADD t3, b3, ALPHA, a11 | |||
| MADD t4, b4, ALPHA, a12 | |||
| gsSQC1(Y_BASE, T4, T3, 10*SIZE) | |||
| gsSQC1(Y_BASE, T4, T3, 5) | |||
| MADD t1, b5, ALPHA, a13 | |||
| MADD t2, b6, ALPHA, a14 | |||
| gsSQC1(Y_BASE, T2, T1, 12*SIZE) | |||
| gsSQC1(Y_BASE, T2, T1, 6) | |||
| MADD t3, b7, ALPHA, a15 | |||
| MADD t4, b8, ALPHA, a16 | |||
| gsSQC1(Y_BASE, T4, T3, 14*SIZE) | |||
| gsSQC1(Y_BASE, T4, T3, 7) | |||
| daddiu X, X, 16 * SIZE | |||
| @@ -415,97 +415,148 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //unrolled by 16 | |||
| LD a1, 0 * SIZE(X) | |||
| gsLQC1(X_BASE,A3,A2,1*SIZE) | |||
| gsLQC1(X_BASE,A5,A4,3*SIZE) | |||
| gsLQC1(X_BASE,A7,A6,5*SIZE) | |||
| gsLQC1(X_BASE,A9,A8,7*SIZE) | |||
| gsLQC1(X_BASE,A11,A10,8*SIZE) | |||
| gsLQC1(X_BASE,A13,A12,11*SIZE) | |||
| gsLQC1(X_BASE,A15,A14,13*SIZE) | |||
| LD a16, 15 * SIZE(X) | |||
| daddiu X, X, SIZE | |||
| gsLQC1(X_BASE,A3,A2,0) | |||
| gsLQC1(X_BASE,A5,A4,1) | |||
| gsLQC1(X_BASE,A7,A6,2) | |||
| gsLQC1(X_BASE,A9,A8,3) | |||
| gsLQC1(X_BASE,A11,A10,4) | |||
| gsLQC1(X_BASE,A13,A12,5) | |||
| gsLQC1(X_BASE,A15,A14,6) | |||
| LD a16, 14 * SIZE(X) | |||
| gsLQC1(Y_BASE,B2,B1,0*SIZE) | |||
| gsLQC1(Y_BASE,B4,B3,2*SIZE) | |||
| gsLQC1(Y_BASE,B6,B5,4*SIZE) | |||
| gsLQC1(Y_BASE,B8,B7,6*SIZE) | |||
| gsLQC1(Y_BASE,B2,B1,0) | |||
| gsLQC1(Y_BASE,B4,B3,1) | |||
| gsLQC1(Y_BASE,B6,B5,2) | |||
| gsLQC1(Y_BASE,B8,B7,3) | |||
| blez I, .L13 | |||
| blez I, .L32 | |||
| NOP | |||
| .align 5 | |||
| .L31: | |||
| MADD t1, b1, ALPHA, a1 | |||
| MADD t2, b2, ALPHA, a2 | |||
| gsSQC1(Y_BASE, T2, T1, 0*SIZE) | |||
| gsLQC1(Y_BASE,B2,B1,8*SIZE) | |||
| gsSQC1(Y_BASE, T2, T1, 0) | |||
| gsLQC1(Y_BASE,B2,B1,4) | |||
| MADD t3, b3, ALPHA, a3 | |||
| MADD t4, b4, ALPHA, a4 | |||
| gsSQC1(Y_BASE, T4, T3, 2*SIZE) | |||
| gsLQC1(Y_BASE,B4,B3,10*SIZE) | |||
| gsSQC1(Y_BASE, T4, T3, 1) | |||
| gsLQC1(Y_BASE,B4,B3,5) | |||
| PREFETCHD(PREFETCH_DISTANCE*SIZE(Y)) | |||
| PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(Y)) | |||
| MADD t1, b5, ALPHA, a5 | |||
| MADD t2, b6, ALPHA, a6 | |||
| gsSQC1(Y_BASE, T2, T1, 4*SIZE) | |||
| gsLQC1(Y_BASE,B6,B5,12*SIZE) | |||
| gsSQC1(Y_BASE, T2, T1, 2) | |||
| gsLQC1(Y_BASE,B6,B5,6) | |||
| MADD t3, b7, ALPHA, a7 | |||
| MADD t4, b8, ALPHA, a8 | |||
| gsSQC1(Y_BASE, T4, T3, 6*SIZE) | |||
| gsLQC1(Y_BASE,B8,B7,14*SIZE) | |||
| gsSQC1(Y_BASE, T4, T3, 3) | |||
| gsLQC1(Y_BASE,B8,B7,7) | |||
| PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y)) | |||
| PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y)) | |||
| MADD t1, b1, ALPHA, a9 | |||
| MADD t2, b2, ALPHA, a10 | |||
| gsSQC1(Y_BASE, T2, T1, 8*SIZE) | |||
| gsLQC1(Y_BASE,B2,B1,16*SIZE) | |||
| gsSQC1(Y_BASE, T2, T1, 4) | |||
| gsLQC1(Y_BASE,B2,B1,8) | |||
| MADD t3, b3, ALPHA, a11 | |||
| MADD t4, b4, ALPHA, a12 | |||
| gsSQC1(Y_BASE, T4, T3, 10*SIZE) | |||
| gsLQC1(Y_BASE,B4,B3,18*SIZE) | |||
| gsSQC1(Y_BASE, T4, T3, 5) | |||
| gsLQC1(Y_BASE,B4,B3,9) | |||
| PREFETCHD(PREFETCH_DISTANCE*SIZE(X)) | |||
| PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X)) | |||
| MADD t1, b5, ALPHA, a13 | |||
| MADD t2, b6, ALPHA, a14 | |||
| gsSQC1(Y_BASE, T2, T1, 12*SIZE) | |||
| gsLQC1(Y_BASE,B6,B5,20*SIZE) | |||
| gsSQC1(Y_BASE, T2, T1, 6) | |||
| gsLQC1(Y_BASE,B6,B5,10) | |||
| MADD t3, b7, ALPHA, a15 | |||
| MADD t4, b8, ALPHA, a16 | |||
| gsSQC1(Y_BASE, T4, T3, 14*SIZE) | |||
| gsLQC1(Y_BASE,B8,B7,22*SIZE) | |||
| gsSQC1(Y_BASE, T4, T3, 7) | |||
| gsLQC1(Y_BASE,B8,B7,11) | |||
| PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X)) | |||
| PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X)) | |||
| LD a1, 16 * SIZE(X) | |||
| gsLQC1(X_BASE,A3,A2,17*SIZE) | |||
| gsLQC1(X_BASE,A5,A4,19*SIZE) | |||
| gsLQC1(X_BASE,A7,A6,21*SIZE) | |||
| gsLQC1(X_BASE,A9,A8,23*SIZE) | |||
| LD a1, 15 * SIZE(X) | |||
| gsLQC1(X_BASE,A3,A2,8) | |||
| gsLQC1(X_BASE,A5,A4,9) | |||
| gsLQC1(X_BASE,A7,A6,10) | |||
| gsLQC1(X_BASE,A9,A8,11) | |||
| gsLQC1(X_BASE,A11,A10,25*SIZE) | |||
| gsLQC1(X_BASE,A13,A12,27*SIZE) | |||
| gsLQC1(X_BASE,A15,A14,29*SIZE) | |||
| LD a16, 31 * SIZE(X) | |||
| gsLQC1(X_BASE,A11,A10,12) | |||
| gsLQC1(X_BASE,A13,A12,13) | |||
| gsLQC1(X_BASE,A15,A14,14) | |||
| LD a16, 30 * SIZE(X) | |||
| daddiu I, I, -1 | |||
| daddiu Y, Y, 16 * SIZE | |||
| daddiu X, X, 16 * SIZE | |||
| bgtz I, .L31 | |||
| //Loop exhausted (I <= 0): branch to .L13 (defined outside this view; presumably the tail that consumes the a1..a16 / b1..b8 values preloaded above - verify). NOTE(review): this path moved X forward by SIZE before the loop (daddiu X, X, SIZE), and the sibling exit .L32 compensates by advancing X only 15 * SIZE; confirm that .L13 leaves X correct for the remainder code. Also, if .set noreorder is in effect (not visible here), confirm this branch is not sitting in the delay slot of the bgtz above. | |||
| b .L13 | |||
| .align 5 | |||
| //Loop end (.L32), entered from the "blez I, .L32" test ahead of .L31: consumes the a1..a16 and b1..b8 values that were loaded before that test, reloads b1..b8 from Y offsets 4..7 for the second half, and stores the eight result pairs to Y offsets 0..7; no further X loads or prefetches are issued. The gsLQC1/gsSQC1 offset operand appears to count units of 2*SIZE (the older form of these lines used 2*SIZE where the new form uses 1) - confirm against the macro definition. | |||
| .L32: | |||
| MADD t1, b1, ALPHA, a1 | |||
| MADD t2, b2, ALPHA, a2 | |||
| gsSQC1(Y_BASE, T2, T1, 0) | |||
| gsLQC1(Y_BASE,B2,B1,4) | |||
| MADD t3, b3, ALPHA, a3 | |||
| MADD t4, b4, ALPHA, a4 | |||
| gsSQC1(Y_BASE, T4, T3, 1) | |||
| gsLQC1(Y_BASE,B4,B3,5) | |||
| MADD t1, b5, ALPHA, a5 | |||
| MADD t2, b6, ALPHA, a6 | |||
| gsSQC1(Y_BASE, T2, T1, 2) | |||
| gsLQC1(Y_BASE,B6,B5,6) | |||
| MADD t3, b7, ALPHA, a7 | |||
| MADD t4, b8, ALPHA, a8 | |||
| gsSQC1(Y_BASE, T4, T3, 3) | |||
| gsLQC1(Y_BASE,B8,B7,7) | |||
| MADD t1, b1, ALPHA, a9 | |||
| MADD t2, b2, ALPHA, a10 | |||
| gsSQC1(Y_BASE, T2, T1, 4) | |||
| MADD t3, b3, ALPHA, a11 | |||
| MADD t4, b4, ALPHA, a12 | |||
| gsSQC1(Y_BASE, T4, T3, 5) | |||
| MADD t1, b5, ALPHA, a13 | |||
| MADD t2, b6, ALPHA, a14 | |||
| gsSQC1(Y_BASE, T2, T1, 6) | |||
| MADD t3, b7, ALPHA, a15 | |||
| MADD t4, b8, ALPHA, a16 | |||
| gsSQC1(Y_BASE, T4, T3, 7) | |||
| daddiu X, X, 15 * SIZE | |||
| daddiu Y, Y, 16 * SIZE | |||
| //X was advanced by only 15 * SIZE above because it had already been moved forward by SIZE before the preload; Y was advanced by the full 16 * SIZE. Branch to .L15 (outside this view; presumably handles the leftover elements - verify). | |||
| b .L15 | |||
| .align 5 | |||
| //INCX!=1 or INCY != 1 | |||