optimized dgemm and dgetrf for POWER8 (tags/v0.2.19^2)
@@ -332,6 +332,13 @@ typedef int blasint;
 #endif
 #endif
+#ifdef POWER8
+#ifndef YIELDING
+#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
+#endif
+#endif
 /*
 #ifdef PILEDRIVER
 #ifndef YIELDING
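
Note: this hunk makes POWER8 builds busy-wait with a burst of eight NOPs in place of the library's default YIELDING (defined elsewhere in terms of sched_yield()), mirroring the commented-out PILEDRIVER block just below it. A minimal sketch of where such a macro sits, assuming an OpenBLAS-style polling loop (wait_for_work and the atomic flag are illustrative names, not the library's actual ones):

    #include <stdatomic.h>

    /* same expansion as the hunk above */
    #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");

    /* Worker-side poll: each miss burns eight NOPs instead of making a
       sched_yield() system call, keeping the wait cheap on POWER8's SMT
       cores with no kernel round trip per iteration. */
    static void wait_for_work(atomic_int *ready)
    {
        while (!atomic_load_explicit(ready, memory_order_acquire))
            YIELDING;   /* brief back-off between polls */
    }
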
@@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * LAPACK-TEST : OK
 **************************************************************************************/
+#define MY_ALIGN .align 3
 	srawi. J, N, 2
 	ble LDGEMM_L4_END
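
On PowerPC ELF targets the GNU assembler reads the operand of .align as a power-of-two exponent, so MY_ALIGN pads branch targets to 2^3 = 8 bytes, where the .align 4 and .align 5 directives it replaces throughout the kernel padded to 16 and 32 bytes. A rough C-level analogue of the trade-off, for intuition only (function names are illustrative):

    /* Padding every loop label to 32 bytes inflates icache footprint;
       8 bytes keeps targets off odd word boundaries at far less cost. */
    __attribute__((aligned(8)))  void target_my_align(void) { }  /* MY_ALIGN  */
    __attribute__((aligned(32))) void target_align_5(void)  { }  /* .align 5  */
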
@@ -53,7 +54,7 @@ LDGEMM_L4_BEGIN:
 	srawi. I, M, 4
 	ble LDGEMM_L4x16_END
-	.align 4
+	MY_ALIGN
 LDGEMM_L4x16_BEGIN_FIRST:
 	li L, -128
@@ -90,7 +91,7 @@ LDGEMM_L4x16_BEGIN_FIRST:
 	cmpwi cr0, L, 1
 	ble LDGEMM_L4x16_SUB4_FIRST
-	.align 4
+	MY_ALIGN
 LDGEMM_L4x16_LOOP_START_FIRST:
 	li T2, 512
@@ -115,7 +116,7 @@ LDGEMM_L4x16_LOOP_START_FIRST:
 	ble LDGEMM_L4x16_LOOP_END_FIRST
 	mtctr L
-	.align 4
+	MY_ALIGN
 LDGEMM_L4x16_LOOP_FIRST:
@@ -132,7 +133,7 @@ LDGEMM_L4x16_LOOP_FIRST:
 	bdnz LDGEMM_L4x16_LOOP_FIRST
-	.align 4
+	MY_ALIGN
 LDGEMM_L4x16_LOOP_END_FIRST:
@@ -175,7 +176,7 @@ LDGEMM_L4x16_SUB2_FIRST:
 	addic. L, L, -1
 	bgt LDGEMM_L4x16_SUB2_FIRST
-	.align 4
+	MY_ALIGN
 LDGEMM_L4x16_SAVE_FIRST:
 	SAVE4x16
@@ -185,7 +186,8 @@ LDGEMM_L4x16_SAVE_FIRST:
 LDGEMM_L4x16_END_FIRST:
-	.align 4
+	MY_ALIGN
 LDGEMM_L4x16_BEGIN:
 	li L, -128
@@ -222,7 +224,8 @@ LDGEMM_L4x16_BEGIN:
 	cmpwi cr0, L, 1
 	ble- LDGEMM_L4x16_SUB4
-	.align 4
+	MY_ALIGN
 LDGEMM_L4x16_LOOP_START:
 	li o40, 40
@@ -239,20 +242,19 @@ LDGEMM_L4x16_LOOP_START:
 	ble- LDGEMM_L4x16_LOOP_END
 	mtctr L
-	.align 4
+	MY_ALIGN
 LDGEMM_L4x16_LOOP:
 	dcbt AO, PRE
 	KERNEL4x16_L1
 	dcbt AO, PRE
-	// addic. L, L, -1
 	KERNEL4x16_L2
 	bdnz+ LDGEMM_L4x16_LOOP
-	.align 4
+	MY_ALIGN
 LDGEMM_L4x16_LOOP_END:
@@ -261,6 +263,8 @@ LDGEMM_L4x16_LOOP_END:
 	b LDGEMM_L4x16_SUB1
+	MY_ALIGN
 LDGEMM_L4x16_SUB4:
 	KERNEL4x16_SUBI1
@@ -268,6 +272,8 @@ LDGEMM_L4x16_SUB4:
 	b LDGEMM_L4x16_SUB1
+	MY_ALIGN
 LDGEMM_L4x16_SUB0:
 	andi. L, K, 1
@@ -278,11 +284,15 @@ LDGEMM_L4x16_SUB0:
 	ble LDGEMM_L4x16_SAVE
 	b LDGEMM_L4x16_SUB2
+	MY_ALIGN
 LDGEMM_L4x16_SUB1:
 	andi. L, K, 1
 	ble LDGEMM_L4x16_SAVE
+	MY_ALIGN
 LDGEMM_L4x16_SUB2:
 	KERNEL4x16_SUB1
@@ -290,7 +300,8 @@ LDGEMM_L4x16_SUB2:
 	addic. L, L, -1
 	bgt LDGEMM_L4x16_SUB2
-	.align 4
+	MY_ALIGN
 LDGEMM_L4x16_SAVE:
 	SAVE4x16
@@ -334,7 +345,7 @@ LDGEMM_L4x8_LOOP_START:
 	addic. L, L, -2
 	ble LDGEMM_L4x8_LOOP_END
-	.align 5
+	MY_ALIGN
 LDGEMM_L4x8_LOOP:
@@ -441,7 +452,7 @@ LDGEMM_L4x4_LOOP_START:
 	addic. L, L, -2
 	ble LDGEMM_L4x4_LOOP_END
-	.align 5
+	MY_ALIGN
 LDGEMM_L4x4_LOOP:
@@ -543,7 +554,7 @@ LDGEMM_L4x2_LOOP_START:
 	addic. L, L, -2
 	ble LDGEMM_L4x2_LOOP_END
-	.align 5
+	MY_ALIGN
 LDGEMM_L4x2_LOOP:
@@ -643,7 +654,7 @@ LDGEMM_L4x1_LOOP_START:
 	addic. L, L, -2
 	ble LDGEMM_L4x1_LOOP_END
-	.align 5
+	MY_ALIGN
 LDGEMM_L4x1_LOOP:
@@ -778,7 +789,7 @@ LDGEMM_L2x16_LOOP_START:
 	addic. L, L, -2
 	ble LDGEMM_L2x16_LOOP_END
-	.align 5
+	MY_ALIGN
 LDGEMM_L2x16_LOOP:
@@ -907,7 +918,7 @@ LDGEMM_L2x8_LOOP_START:
 	addic. L, L, -2
 	ble LDGEMM_L2x8_LOOP_END
-	.align 5
+	MY_ALIGN
 LDGEMM_L2x8_LOOP:
@@ -1011,7 +1022,7 @@ LDGEMM_L2x4_LOOP_START:
 	addic. L, L, -2
 	ble LDGEMM_L2x4_LOOP_END
-	.align 5
+	MY_ALIGN
 LDGEMM_L2x4_LOOP:
@@ -1111,7 +1122,7 @@ LDGEMM_L2x2_LOOP_START:
 	addic. L, L, -2
 	ble LDGEMM_L2x2_LOOP_END
-	.align 5
+	MY_ALIGN
 LDGEMM_L2x2_LOOP:
@@ -1211,7 +1222,7 @@ LDGEMM_L2x1_LOOP_START:
 	addic. L, L, -2
 	ble LDGEMM_L2x1_LOOP_END
-	.align 5
+	MY_ALIGN
 LDGEMM_L2x1_LOOP:
@@ -1331,7 +1342,7 @@ LDGEMM_L1x16_LOOP_START:
 	addic. L, L, -2
 	ble LDGEMM_L1x16_LOOP_END
-	.align 5
+	MY_ALIGN
 LDGEMM_L1x16_LOOP:
@@ -1460,7 +1471,7 @@ LDGEMM_L1x8_LOOP_START:
 	addic. L, L, -2
 	ble LDGEMM_L1x8_LOOP_END
-	.align 5
+	MY_ALIGN
 LDGEMM_L1x8_LOOP:
@@ -1564,7 +1575,7 @@ LDGEMM_L1x4_LOOP_START:
 	addic. L, L, -2
 	ble LDGEMM_L1x4_LOOP_END
-	.align 5
+	MY_ALIGN
 LDGEMM_L1x4_LOOP:
@@ -1664,7 +1675,7 @@ LDGEMM_L1x2_LOOP_START:
 	addic. L, L, -2
 	ble LDGEMM_L1x2_LOOP_END
-	.align 5
+	MY_ALIGN
 LDGEMM_L1x2_LOOP:
@@ -1764,7 +1775,7 @@ LDGEMM_L1x1_LOOP_START:
 	addic. L, L, -2
 	ble LDGEMM_L1x1_LOOP_END
-	.align 5
+	MY_ALIGN
 LDGEMM_L1x1_LOOP:
@@ -127,6 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xxpermdi vs62, vs7, vs15, 3
 	xxpermdi vs63, vs23, vs31, 3
+	dcbt BO, PREB
 	stxvd2x vs32, o0, BO
 	stxvd2x vs33, o16, BO
@@ -138,6 +139,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvd2x vs39, o112, BO
 	addi BO, BO, 128
+	dcbt BO, PREB
 	stxvd2x vs40, o0, BO
 	stxvd2x vs41, o16, BO
 	stxvd2x vs42, o32, BO
@@ -148,6 +151,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvd2x vs47, o112, BO
 	addi BO, BO, 128
+	dcbt BO, PREB
 	stxvd2x vs48, o0, BO
 	stxvd2x vs49, o16, BO
 	stxvd2x vs50, o32, BO
@@ -158,6 +163,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvd2x vs55, o112, BO
 	addi BO, BO, 128
+	dcbt BO, PREB
 	stxvd2x vs56, o0, BO
 	stxvd2x vs57, o16, BO
 	stxvd2x vs58, o32, BO
@@ -170,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add B2, B2, B
 	add B1, B1, B
-	li PREA, 256
+	li PREA, 384
 	addi PREB, M16, 128
 	li o8, 8
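
These hunks add a dcbt touch on the store target BO one PREB stride ahead of each 128-byte store group, and raise the A-stream prefetch distance PREA from 256 to 384 bytes, i.e. from two to three 128-byte POWER8 cache lines ahead. A hedged sketch of the same pattern with GCC builtins rather than raw dcbt (constants and names are illustrative; PREB here is a placeholder, the kernel derives it from the row stride M16):

    /* Touch the read stream PREA bytes ahead and the write stream PREB
       bytes ahead, so each 128-byte line is resident when the copy
       reaches it. */
    #define PREA 384            /* raised from 256: 3 lines ahead, not 2 */
    #define PREB 384            /* placeholder value                     */

    static void copy_stream(const double *restrict a, double *restrict b, long n)
    {
        for (long i = 0; i < n; i += 16) {              /* 16 doubles = 128 B  */
            __builtin_prefetch(a + i + PREA / 8, 0, 3); /* read  (like dcbt)   */
            __builtin_prefetch(b + i + PREB / 8, 1, 3); /* write (dcbt/dcbtst) */
            for (int j = 0; j < 16; j++)
                b[i + j] = a[i + j];
        }
    }
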
@@ -52,31 +52,31 @@ DCOPYT_L4_BEGIN:
 	ble DCOPYT_L4x8_BEGIN
 	mr BO, B16
+	addi T2, M16, 384
+	mtctr J
 	.align 5
 DCOPYT_L4x16_LOOP:
-/*
-	addi T1, PREB, 128
-	addi T2, PREB, 256
-*/
+	addi T1, M16, 256
 	dcbt A0, PREA
 	dcbt A1, PREA
 	dcbt A2, PREA
 	dcbt A3, PREA
-/*
-	dcbtst BO, M16
-	dcbtst BO, PREB
-	dcbtst BO, T1
-	dcbtst BO, T2
-*/
+	dcbt BO, M16
+	dcbt BO, PREB
+	dcbt BO, T1
+	dcbt BO, T2
 	COPY_4x16
 	add BO, BO, M16
-	addic. J, J, -1
-	bgt DCOPYT_L4x16_LOOP
+	// addic. J, J, -1
+	bdnz+ DCOPYT_L4x16_LOOP
 DCOPYT_L4x8_BEGIN:
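
The transpose-copy loop now hoists the T2 prefetch offset out of the loop and computes both BO prefetch offsets from the row stride M16, replaces the commented-out dcbtst experiments with plain dcbt touches of the destination, and drives the trip count from the count register: mtctr J once up front, then bdnz+ in place of the addic./bgt pair (the + suffix hints the branch as taken). A minimal PowerPC-only sketch of that loop idiom, assuming GCC inline asm and n >= 1 as the kernel guarantees:

    /* bdnz decrements CTR and branches while it is nonzero, so the body
       needs no separate decrement-and-compare against a GPR. */
    static long sum_to_n(long n)
    {
        long s = 0;
        __asm__ __volatile__(
            "mtctr %1      \n\t"   /* CTR = n, set once outside the loop */
            "1:            \n\t"
            "add  %0,%0,%1 \n\t"   /* loop body: s += n                  */
            "addi %1,%1,-1 \n\t"
            "bdnz 1b       \n\t"   /* CTR--; branch back while CTR != 0  */
            : "+r"(s), "+r"(n)
            :
            : "ctr");
        return s;                  /* n*(n+1)/2 for the original n       */
    }
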
@@ -46,52 +46,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	lxvd2x vs35, o48, A0
 	addi A0, A0, 64
-	lxvd2x vs36, o0, A0
-	lxvd2x vs37, o16, A0
-	lxvd2x vs38, o32, A0
-	lxvd2x vs39, o48, A0
-	addi A0, A0, 64
 	lxvd2x vs40, o0, A1
 	lxvd2x vs41, o16, A1
 	lxvd2x vs42, o32, A1
 	lxvd2x vs43, o48, A1
 	addi A1, A1, 64
-	lxvd2x vs44, o0, A1
-	lxvd2x vs45, o16, A1
-	lxvd2x vs46, o32, A1
-	lxvd2x vs47, o48, A1
-	addi A1, A1, 64
 	lxvd2x vs48, o0, A2
 	lxvd2x vs49, o16, A2
 	lxvd2x vs50, o32, A2
 	lxvd2x vs51, o48, A2
 	addi A2, A2, 64
-	lxvd2x vs52, o0, A2
-	lxvd2x vs53, o16, A2
-	lxvd2x vs54, o32, A2
-	lxvd2x vs55, o48, A2
-	addi A2, A2, 64
 	lxvd2x vs56, o0, A3
 	lxvd2x vs57, o16, A3
 	lxvd2x vs58, o32, A3
 	lxvd2x vs59, o48, A3
 	addi A3, A3, 64
+	lxvd2x vs36, o0, A0
+	lxvd2x vs37, o16, A0
+	lxvd2x vs38, o32, A0
+	lxvd2x vs39, o48, A0
+	addi A0, A0, 64
+	lxvd2x vs44, o0, A1
+	lxvd2x vs45, o16, A1
+	lxvd2x vs46, o32, A1
+	lxvd2x vs47, o48, A1
+	addi A1, A1, 64
+	lxvd2x vs52, o0, A2
+	lxvd2x vs53, o16, A2
+	lxvd2x vs54, o32, A2
+	lxvd2x vs55, o48, A2
+	addi A2, A2, 64
 	lxvd2x vs60, o0, A3
 	lxvd2x vs61, o16, A3
 	lxvd2x vs62, o32, A3
 	lxvd2x vs63, o48, A3
 	addi A3, A3, 64
 	mr T1, BO
 	stxvd2x vs32, o0, T1
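
This hunk resequences the loads so that the first 64-byte group from each of the four source rows A0..A3 issues before the second group of any row: the four load streams advance in lockstep, and each row's second cache line has longer to arrive before it is consumed. A scalar sketch of the new ordering (the tile shape assumes the 4x16 block of doubles that COPY_4x16 moves):

    /* Read the first 64-byte half of every row before the second half
       of any row, keeping all four streams in flight together. */
    static void load_4x16_interleaved(const double *a0, const double *a1,
                                      const double *a2, const double *a3,
                                      double tile[4][16])
    {
        for (int half = 0; half < 2; half++)    /* 8 doubles = 64 B per half */
            for (int j = 0; j < 8; j++) {
                tile[0][half * 8 + j] = a0[half * 8 + j];
                tile[1][half * 8 + j] = a1[half * 8 + j];
                tile[2][half * 8 + j] = a2[half * 8 + j];
                tile[3][half * 8 + j] = a3[half * 8 + j];
            }
    }
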
@@ -173,10 +173,17 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
 	blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
 	if (blocking > GEMM_Q) blocking = GEMM_Q;
-	if (blocking <= GEMM_UNROLL_N * 2) {
+#ifdef POWER8
+	if (blocking <= GEMM_UNROLL_N) {
 	  info = GETF2(args, NULL, range_n, sa, sb, 0);
 	  return info;
 	}
+#else
+	if (blocking <= GEMM_UNROLL_N*2) {
+	  info = GETF2(args, NULL, range_n, sa, sb, 0);
+	  return info;
+	}
+#endif
 	sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
@@ -77,10 +77,17 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
 	blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
 	if (blocking > GEMM_Q) blocking = GEMM_Q;
+#ifdef POWER8
+	if (blocking <= GEMM_UNROLL_N) {
+	  info = GETF2(args, NULL, range_n, sa, sb, 0);
+	  return info;
+	}
+#else
 	if (blocking <= GEMM_UNROLL_N * 2) {
 	  info = GETF2(args, NULL, range_n, sa, sb, 0);
 	  return info;
 	}
+#endif
 	sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
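
Both getrf hunks apply the same POWER8-specific cutoff: with this commit's 16x4 dgemm kernel, GEMM_UNROLL_N is 4, so the fall-back to the unblocked GETF2 kernel now triggers only for panels of at most 4 columns instead of 8, keeping the blocked, GEMM-driven path in play for smaller factorizations. A condensed sketch of the dispatch, with names taken from the diff (the GEMM_Q value here is a placeholder; the real one is per-target):

    #define GEMM_UNROLL_N 4     /* matches the 16x4 POWER8 dgemm kernel */
    #define GEMM_Q 384          /* placeholder                           */

    /* Returns nonzero when the factorization should fall back to the
       unblocked GETF2 kernel instead of the blocked path. */
    static int use_unblocked_getf2(long mn)
    {
        long blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
        if (blocking > GEMM_Q) blocking = GEMM_Q;
    #ifdef POWER8
        return blocking <= GEMM_UNROLL_N;       /* cutoff now 4 columns  */
    #else
        return blocking <= GEMM_UNROLL_N * 2;   /* previous cutoff: 8    */
    #endif
    }
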