| @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define ALPHAI $f1 | #define ALPHAI $f1 | ||||
| #define X $r7 | #define X $r7 | ||||
| #define INCX $r8 | #define INCX $r8 | ||||
| #define DUMMY2 $r9 | |||||
| #define I $r12 | #define I $r12 | ||||
| #define TEMP $r13 | #define TEMP $r13 | ||||
| @@ -65,6 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| bge $r0, N, .L999 | bge $r0, N, .L999 | ||||
| bge $r0, INCX, .L999 | bge $r0, INCX, .L999 | ||||
| ld.d DUMMY2, $sp, 0 | |||||
| li.d TEMP, 1 | li.d TEMP, 1 | ||||
| movgr2fr.d a1, $r0 | movgr2fr.d a1, $r0 | ||||
| FFINT a1, a1 | FFINT a1, a1 | ||||
| @@ -84,24 +86,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| srai.d I, N, 2 | srai.d I, N, 2 | ||||
| bne INCX, TEMP, .L22 | bne INCX, TEMP, .L22 | ||||
| /////// INCX == 1 //////// | |||||
| .L11: | .L11: | ||||
| bge $r0, I, .L997 | |||||
| CMPEQ $fcc0, ALPHAR, a1 | CMPEQ $fcc0, ALPHAR, a1 | ||||
| CMPEQ $fcc1, ALPHAI, a1 | CMPEQ $fcc1, ALPHAI, a1 | ||||
| bceqz $fcc0, .L13 | |||||
| b .L14 | |||||
| .align 3 | |||||
| bge $r0, I, .L19 | |||||
| .L13: | |||||
| bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 | |||||
| b .L113 //alpha_r != 0.0 && alpha_i == 0.0 | |||||
| /////// INCX == 1 && N >= 4 //////// | |||||
| bnez DUMMY2, .L17 // if DUMMY2 == 1, called from c/zscal. | |||||
| .L14: | |||||
| bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 | |||||
| b .L111 //alpha_r == 0.0 && alpha_i == 0.0 | |||||
| .align 3 | |||||
| bceqz $fcc0, .L17 | |||||
| .L111: //alpha_r == 0.0 && alpha_i == 0.0 | |||||
| bceqz $fcc1, .L17 | |||||
| .L15: //alpha_r == 0.0 && alpha_i == 0.0 | |||||
| vst VXZ, X, 0 * SIZE | vst VXZ, X, 0 * SIZE | ||||
| #ifdef DOUBLE | #ifdef DOUBLE | ||||
| vst VXZ, X, 2 * SIZE | vst VXZ, X, 2 * SIZE | ||||
| @@ -112,50 +110,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| addi.d X, X, 8 * SIZE | addi.d X, X, 8 * SIZE | ||||
| addi.d I, I, -1 | addi.d I, I, -1 | ||||
| blt $r0, I, .L111 | |||||
| b .L997 | |||||
| .align 3 | |||||
| .L113: //alpha_r != 0.0 && alpha_i == 0.0 | |||||
| vld VX0, X, 0 * SIZE | |||||
| #ifdef DOUBLE | |||||
| vld VX1, X, 2 * SIZE | |||||
| vpickev.d x1, VX1, VX0 | |||||
| vpickod.d x2, VX1, VX0 | |||||
| vfmul.d x3, VXAR, x1 | |||||
| vfmul.d x4, VXAR, x2 | |||||
| vilvl.d VX2, x4 ,x3 | |||||
| vilvh.d VX3, x4, x3 | |||||
| vst VX2, X, 0 * SIZE | |||||
| vst VX3, X, 2 * SIZE | |||||
| vld VX0, X, 4 * SIZE | |||||
| vld VX1, X, 6 * SIZE | |||||
| vpickev.d x1, VX1, VX0 | |||||
| vpickod.d x2, VX1, VX0 | |||||
| vfmul.d x3, VXAR, x1 | |||||
| vfmul.d x4, VXAR, x2 | |||||
| vilvl.d VX2, x4 ,x3 | |||||
| vilvh.d VX3, x4, x3 | |||||
| vst VX2, X, 4 * SIZE | |||||
| vst VX3, X, 6 * SIZE | |||||
| #else | |||||
| vld VX1, X, 4 * SIZE | |||||
| vpickev.w x1, VX1, VX0 | |||||
| vpickod.w x2, VX1, VX0 | |||||
| vfmul.s x3, VXAR, x1 | |||||
| vfmul.s x4, VXAR, x2 | |||||
| vilvl.w VX2, x4 ,x3 | |||||
| vilvh.w VX3, x4, x3 | |||||
| vst VX2, X, 0 * SIZE | |||||
| vst VX3, X, 4 * SIZE | |||||
| #endif | |||||
| addi.d X, X, 8 * SIZE | |||||
| addi.d I, I, -1 | |||||
| blt $r0, I, .L113 | |||||
| b .L997 | |||||
| blt $r0, I, .L15 | |||||
| b .L19 | |||||
| .align 3 | .align 3 | ||||
| .L114: //alpha_r != 0.0 && alpha_i != 0.0 | |||||
| .L17: | |||||
| vld VX0, X, 0 * SIZE | vld VX0, X, 0 * SIZE | ||||
| #ifdef DOUBLE | #ifdef DOUBLE | ||||
| vld VX1, X, 2 * SIZE | vld VX1, X, 2 * SIZE | ||||
| @@ -196,29 +155,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| addi.d X, X, 8 * SIZE | addi.d X, X, 8 * SIZE | ||||
| addi.d I, I, -1 | addi.d I, I, -1 | ||||
| blt $r0, I, .L114 | |||||
| b .L997 | |||||
| blt $r0, I, .L17 | |||||
| b .L19 | |||||
| .align 3 | .align 3 | ||||
| /////// INCX == 1, tail: remaining N % 4 elements /////// | |||||
| .L19: | |||||
| andi I, N, 3 | |||||
| beqz I, .L999 | |||||
| bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal. | |||||
| bceqz $fcc0, .L998 | |||||
| bceqz $fcc1, .L998 | |||||
| b .L995 // alpha_r == 0.0 && alpha_i == 0.0 | |||||
| /////// INCX != 1 //////// | |||||
| .L22: | .L22: | ||||
| bge $r0, I, .L997 | |||||
| move XX, X | |||||
| CMPEQ $fcc0, ALPHAR, a1 | CMPEQ $fcc0, ALPHAR, a1 | ||||
| CMPEQ $fcc1, ALPHAI, a1 | CMPEQ $fcc1, ALPHAI, a1 | ||||
| bceqz $fcc0, .L23 | |||||
| b .L24 | |||||
| .align 3 | |||||
| move XX, X | |||||
| bge $r0, I, .L29 | |||||
| bnez DUMMY2, .L25 // if DUMMY2 == 1, called from c/zscal. | |||||
| .L23: | |||||
| bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 | |||||
| b .L223 //alpha_r != 0.0 && alpha_i == 0.0 | |||||
| bceqz $fcc0, .L25 | |||||
| .L24: | |||||
| bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 | |||||
| b .L221 //alpha_r == 0.0 && alpha_i == 0.0 | |||||
| .align 3 | |||||
| bceqz $fcc1, .L25 | |||||
| .L221: //alpha_r == 0.0 && alpha_i == 0.0 | |||||
| .L27: //alpha_r == 0.0 && alpha_i == 0.0 | |||||
| #ifdef DOUBLE | #ifdef DOUBLE | ||||
| vstelm.d VXZ, X, 0, 0 | vstelm.d VXZ, X, 0, 0 | ||||
| vstelm.d VXZ, X, 1 * SIZE, 0 | vstelm.d VXZ, X, 1 * SIZE, 0 | ||||
| @@ -246,92 +211,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| addi.d I, I, -1 | addi.d I, I, -1 | ||||
| blt $r0, I, .L221 | |||||
| b .L997 | |||||
| blt $r0, I, .L27 | |||||
| b .L29 | |||||
| .align 3 | .align 3 | ||||
| .L223: //alpha_r != 0.0 && alpha_i == 0.0 | |||||
| #ifdef DOUBLE | |||||
| ld.d t1, X, 0 * SIZE | |||||
| ld.d t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ld.d t3, X, 0 * SIZE | |||||
| ld.d t4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| vinsgr2vr.d x1, t1, 0 | |||||
| vinsgr2vr.d x2, t2, 0 | |||||
| vinsgr2vr.d x1, t3, 1 | |||||
| vinsgr2vr.d x2, t4, 1 | |||||
| vfmul.d x3, VXAR, x1 | |||||
| vfmul.d x4, VXAR, x2 | |||||
| vstelm.d x3, XX, 0 * SIZE, 0 | |||||
| vstelm.d x4, XX, 1 * SIZE, 0 | |||||
| add.d XX, XX, INCX | |||||
| vstelm.d x3, XX, 0 * SIZE, 1 | |||||
| vstelm.d x4, XX, 1 * SIZE, 1 | |||||
| add.d XX, XX, INCX | |||||
| ld.d t1, X, 0 * SIZE | |||||
| ld.d t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ld.d t3, X, 0 * SIZE | |||||
| ld.d t4, X, 1 * SIZE | |||||
| vinsgr2vr.d x1, t1, 0 | |||||
| vinsgr2vr.d x2, t2, 0 | |||||
| vinsgr2vr.d x1, t3, 1 | |||||
| vinsgr2vr.d x2, t4, 1 | |||||
| add.d X, X, INCX | |||||
| vfmul.d x3, VXAR, x1 | |||||
| vfmul.d x4, VXAR, x2 | |||||
| addi.d I, I, -1 | |||||
| vstelm.d x3, XX, 0 * SIZE, 0 | |||||
| vstelm.d x4, XX, 1 * SIZE, 0 | |||||
| add.d XX, XX, INCX | |||||
| vstelm.d x3, XX, 0 * SIZE, 1 | |||||
| vstelm.d x4, XX, 1 * SIZE, 1 | |||||
| #else | |||||
| ld.w t1, X, 0 * SIZE | |||||
| ld.w t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ld.w t3, X, 0 * SIZE | |||||
| ld.w t4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| vinsgr2vr.w x1, t1, 0 | |||||
| vinsgr2vr.w x2, t2, 0 | |||||
| vinsgr2vr.w x1, t3, 1 | |||||
| vinsgr2vr.w x2, t4, 1 | |||||
| ld.w t1, X, 0 * SIZE | |||||
| ld.w t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ld.w t3, X, 0 * SIZE | |||||
| ld.w t4, X, 1 * SIZE | |||||
| vinsgr2vr.w x1, t1, 2 | |||||
| vinsgr2vr.w x2, t2, 2 | |||||
| vinsgr2vr.w x1, t3, 3 | |||||
| vinsgr2vr.w x2, t4, 3 | |||||
| add.d X, X, INCX | |||||
| vfmul.s x3, VXAR, x1 | |||||
| vfmul.s x4, VXAR, x2 | |||||
| addi.d I, I, -1 | |||||
| vstelm.w x3, XX, 0 * SIZE, 0 | |||||
| vstelm.w x4, XX, 1 * SIZE, 0 | |||||
| add.d XX, XX, INCX | |||||
| vstelm.w x3, XX, 0 * SIZE, 1 | |||||
| vstelm.w x4, XX, 1 * SIZE, 1 | |||||
| add.d XX, XX, INCX | |||||
| vstelm.w x3, XX, 0 * SIZE, 2 | |||||
| vstelm.w x4, XX, 1 * SIZE, 2 | |||||
| add.d XX, XX, INCX | |||||
| vstelm.w x3, XX, 0 * SIZE, 3 | |||||
| vstelm.w x4, XX, 1 * SIZE, 3 | |||||
| #endif | |||||
| add.d XX, XX, INCX | |||||
| blt $r0, I, .L223 | |||||
| b .L997 | |||||
| .align 3 | |||||
| .L224: //alpha_r != 0.0 && alpha_i != 0.0 | |||||
| .L25: | |||||
| #ifdef DOUBLE | #ifdef DOUBLE | ||||
| ld.d t1, X, 0 * SIZE | ld.d t1, X, 0 * SIZE | ||||
| ld.d t2, X, 1 * SIZE | ld.d t2, X, 1 * SIZE | ||||
| @@ -414,15 +298,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vstelm.w x4, XX, 1 * SIZE, 3 | vstelm.w x4, XX, 1 * SIZE, 3 | ||||
| #endif | #endif | ||||
| add.d XX, XX, INCX | add.d XX, XX, INCX | ||||
| blt $r0, I, .L224 | |||||
| b .L997 | |||||
| blt $r0, I, .L25 | |||||
| b .L29 | |||||
| .align 3 | .align 3 | ||||
| .L997: | |||||
| andi I, N, 3 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| /////// INCX != 1, tail: remaining N % 4 elements /////// | |||||
| .L29: | |||||
| andi I, N, 3 | |||||
| beqz I, .L999 | |||||
| bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal. | |||||
| bceqz $fcc0, .L998 | |||||
| bceqz $fcc1, .L998 | |||||
| b .L995 // alpha_r == 0.0 && alpha_i == 0.0 | |||||
| .L995: // alpha_r == 0.0 && alpha_i == 0.0 | |||||
| ST a1, X, 0 * SIZE | |||||
| ST a1, X, 1 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L995 | |||||
| b .L999 | |||||
| .L998: | .L998: | ||||
| LD a1, X, 0 * SIZE | LD a1, X, 0 * SIZE | ||||
| LD a2, X, 1 * SIZE | LD a2, X, 1 * SIZE | ||||
| @@ -435,7 +333,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ST s2, X, 1 * SIZE | ST s2, X, 1 * SIZE | ||||
| add.d X, X, INCX | add.d X, X, INCX | ||||
| blt $r0, I, .L998 | blt $r0, I, .L998 | ||||
| .align 3 | |||||
| b .L999 | |||||
| .L999: | .L999: | ||||
| move $r4, $r12 | move $r4, $r12 | ||||