updated sgemm- and strmm-kernel for POWER8
tags/v0.2.18^2
| @@ -26,11 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | *****************************************************************************/ | ||||
| /************************************************************************************** | /************************************************************************************** | ||||
| * 2016/03/18 Werner Saar (wernsaar@googlemail.com) | |||||
| * 2016/04/02 Werner Saar (wernsaar@googlemail.com) | |||||
| * BLASTEST : OK | * BLASTEST : OK | ||||
| * CTEST : OK | * CTEST : OK | ||||
| * TEST : OK | * TEST : OK | ||||
| * LAPACK-TEST : OK | |||||
| * LAPACK-TEST : OK | |||||
| **************************************************************************************/ | **************************************************************************************/ | ||||
| /*********************************************************************/ | /*********************************************************************/ | ||||
| @@ -128,17 +128,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #define alpha_r vs30 | #define alpha_r vs30 | ||||
| #define alpha_vr vs31 | |||||
| #define o0 0 | #define o0 0 | ||||
| #define TBUFFER r14 | |||||
| #define BBUFFER r14 | |||||
| #define o4 r15 | #define o4 r15 | ||||
| #define o12 r16 | #define o12 r16 | ||||
| #define o8 r17 | #define o8 r17 | ||||
| #define L r18 | #define L r18 | ||||
| #define T1 r19 | #define T1 r19 | ||||
| #define KK r20 | #define KK r20 | ||||
| #define BB r21 | |||||
| #define BBO r21 | |||||
| #define I r22 | #define I r22 | ||||
| #define J r23 | #define J r23 | ||||
| #define AO r24 | #define AO r24 | ||||
| @@ -256,11 +257,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| cmpwi cr0, M, 0 | cmpwi cr0, M, 0 | ||||
| ble .L999_H1 | |||||
| ble L999_H1 | |||||
| cmpwi cr0, N, 0 | cmpwi cr0, N, 0 | ||||
| ble .L999_H1 | |||||
| ble L999_H1 | |||||
| cmpwi cr0, K, 0 | cmpwi cr0, K, 0 | ||||
| ble .L999_H1 | |||||
| ble L999_H1 | |||||
| li PRE, 256 | li PRE, 256 | ||||
| li o4 , 4 | li o4 , 4 | ||||
| @@ -269,18 +270,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| li o16, 16 | li o16, 16 | ||||
| li o32, 32 | li o32, 32 | ||||
| li o48, 48 | li o48, 48 | ||||
| addi TBUFFER, SP, 320 | |||||
| li T1, 256 | |||||
| slwi T1, T1, 9 // 131072 | |||||
| sub BBUFFER, A, T1 // temp buffer for B unrolled | |||||
| addi T1, SP, 300 | addi T1, SP, 300 | ||||
| stfs f1, 0(T1) | |||||
| stxsspx f1, o0 , T1 | |||||
| stxsspx f1, o4 , T1 | |||||
| stxsspx f1, o8 , T1 | |||||
| stxsspx f1, o12 , T1 | |||||
| lxsspx alpha_r, 0, T1 | |||||
| lxsspx alpha_r, o0, T1 | |||||
| lxvw4x alpha_vr, o0, T1 | |||||
| #include "sgemm_logic_16x8_power8.S" | #include "sgemm_logic_16x8_power8.S" | ||||
| .L999: | |||||
| L999: | |||||
| addi r3, 0, 0 | addi r3, 0, 0 | ||||
| lfd f14, 0(SP) | lfd f14, 0(SP) | ||||
| @@ -26,11 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | *****************************************************************************/ | ||||
| /************************************************************************************** | /************************************************************************************** | ||||
| * 2016/03/18 Werner Saar (wernsaar@googlemail.com) | |||||
| * 2016/04/02 Werner Saar (wernsaar@googlemail.com) | |||||
| * BLASTEST : OK | * BLASTEST : OK | ||||
| * CTEST : OK | * CTEST : OK | ||||
| * TEST : OK | * TEST : OK | ||||
| * LAPACK-TEST : OK | |||||
| * LAPACK-TEST : OK | |||||
| **************************************************************************************/ | **************************************************************************************/ | ||||
| /*********************************************************************/ | /*********************************************************************/ | ||||
| @@ -128,6 +128,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #define alpha_r vs30 | #define alpha_r vs30 | ||||
| #define alpha_vr vs31 | |||||
| #define o0 0 | #define o0 0 | ||||
| @@ -152,7 +153,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define PRE r30 | #define PRE r30 | ||||
| #define T2 r31 | #define T2 r31 | ||||
| #include "sgemm_macros_16x8_power8.S" | |||||
| #include "strmm_macros_16x8_power8.S" | |||||
| #ifndef NEEDPARAM | #ifndef NEEDPARAM | ||||
| @@ -264,11 +265,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| cmpwi cr0, M, 0 | cmpwi cr0, M, 0 | ||||
| ble .L999_H1 | |||||
| ble L999_H1 | |||||
| cmpwi cr0, N, 0 | cmpwi cr0, N, 0 | ||||
| ble .L999_H1 | |||||
| ble L999_H1 | |||||
| cmpwi cr0, K, 0 | cmpwi cr0, K, 0 | ||||
| ble .L999_H1 | |||||
| ble L999_H1 | |||||
| li PRE, 256 | li PRE, 256 | ||||
| li o4 , 4 | li o4 , 4 | ||||
| @@ -280,16 +281,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| addi TBUFFER, SP, 320 | addi TBUFFER, SP, 320 | ||||
| addi T1, SP, 300 | addi T1, SP, 300 | ||||
| stfs f1, 0(T1) | |||||
| lxsspx alpha_r, 0, T1 | |||||
| stxsspx f1, o0 , T1 | |||||
| stxsspx f1, o4 , T1 | |||||
| stxsspx f1, o8 , T1 | |||||
| stxsspx f1, o12 , T1 | |||||
| lxsspx alpha_r, o0, T1 | |||||
| lxvw4x alpha_vr, o0, T1 | |||||
| #include "strmm_logic_16x8_power8.S" | #include "strmm_logic_16x8_power8.S" | ||||
| .L999: | |||||
| L999: | |||||
| addi r3, 0, 0 | addi r3, 0, 0 | ||||
| lfd f14, 0(SP) | lfd f14, 0(SP) | ||||
| @@ -1964,7 +1964,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define SNUMOPT 16 | #define SNUMOPT 16 | ||||
| #define DNUMOPT 8 | #define DNUMOPT 8 | ||||
| #define GEMM_DEFAULT_OFFSET_A 384 | |||||
| #define GEMM_DEFAULT_OFFSET_A 131072 | |||||
| #define GEMM_DEFAULT_OFFSET_B 1024 | #define GEMM_DEFAULT_OFFSET_B 1024 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | #define GEMM_DEFAULT_ALIGN 0x03fffUL | ||||
| @@ -1977,17 +1977,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 8 | #define ZGEMM_DEFAULT_UNROLL_M 8 | ||||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | #define ZGEMM_DEFAULT_UNROLL_N 2 | ||||
| #define SGEMM_DEFAULT_P 480 | |||||
| #define SGEMM_DEFAULT_P 960 | |||||
| #define DGEMM_DEFAULT_P 480 | #define DGEMM_DEFAULT_P 480 | ||||
| #define CGEMM_DEFAULT_P 480 | #define CGEMM_DEFAULT_P 480 | ||||
| #define ZGEMM_DEFAULT_P 240 | #define ZGEMM_DEFAULT_P 240 | ||||
| #define SGEMM_DEFAULT_Q 1440 | |||||
| #define SGEMM_DEFAULT_Q 720 | |||||
| #define DGEMM_DEFAULT_Q 720 | #define DGEMM_DEFAULT_Q 720 | ||||
| #define CGEMM_DEFAULT_Q 720 | #define CGEMM_DEFAULT_Q 720 | ||||
| #define ZGEMM_DEFAULT_Q 360 | #define ZGEMM_DEFAULT_Q 360 | ||||
| #define SGEMM_DEFAULT_R 28800 | |||||
| #define SGEMM_DEFAULT_R 14400 | |||||
| #define DGEMM_DEFAULT_R 14400 | #define DGEMM_DEFAULT_R 14400 | ||||
| #define CGEMM_DEFAULT_R 14400 | #define CGEMM_DEFAULT_R 14400 | ||||
| #define ZGEMM_DEFAULT_R 7200 | #define ZGEMM_DEFAULT_R 7200 | ||||